diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,89161 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 12733, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 45.35453070047731, + "learning_rate": 2.617801047120419e-08, + "loss": 2.0919, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 46.13214000486192, + "learning_rate": 5.235602094240838e-08, + "loss": 2.1655, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 47.256185844438896, + "learning_rate": 7.853403141361257e-08, + "loss": 2.1959, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 4.691128378686303, + "learning_rate": 1.0471204188481677e-07, + "loss": 0.8643, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 50.55032367486194, + "learning_rate": 1.3089005235602095e-07, + "loss": 2.2682, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 47.215536200795114, + "learning_rate": 1.5706806282722514e-07, + "loss": 2.1322, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 49.32478506639732, + "learning_rate": 1.8324607329842932e-07, + "loss": 2.2113, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 48.727801154955124, + "learning_rate": 2.0942408376963353e-07, + "loss": 2.2509, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 46.213727291809505, + "learning_rate": 2.3560209424083772e-07, + "loss": 2.268, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 46.9165533638362, + "learning_rate": 2.617801047120419e-07, + "loss": 2.136, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 47.513601900674225, + "learning_rate": 2.879581151832461e-07, + "loss": 2.1864, + "step": 11 + }, + { + "epoch": 0.0, + "grad_norm": 48.15897615141773, + "learning_rate": 3.1413612565445027e-07, + "loss": 2.1267, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 44.9728150083751, + "learning_rate": 3.403141361256545e-07, + "loss": 2.175, + "step": 13 + }, + { + "epoch": 0.0, + "grad_norm": 38.31471245525897, + "learning_rate": 3.6649214659685864e-07, + "loss": 2.0458, + "step": 14 + }, + { + "epoch": 0.0, + "grad_norm": 36.517769751470475, + "learning_rate": 3.926701570680629e-07, + "loss": 2.1302, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 35.03509277918861, + "learning_rate": 4.1884816753926706e-07, + "loss": 2.1028, + "step": 16 + }, + { + "epoch": 0.0, + "grad_norm": 35.90078278812605, + "learning_rate": 4.4502617801047125e-07, + "loss": 2.0215, + "step": 17 + }, + { + "epoch": 0.0, + "grad_norm": 36.6697157725411, + "learning_rate": 4.7120418848167543e-07, + "loss": 2.0382, + "step": 18 + }, + { + "epoch": 0.0, + "grad_norm": 25.06941191683323, + "learning_rate": 4.973821989528796e-07, + "loss": 1.8211, + "step": 19 + }, + { + "epoch": 0.0, + "grad_norm": 26.23231811919509, + "learning_rate": 5.235602094240838e-07, + "loss": 1.7987, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 23.06337189948916, + "learning_rate": 5.49738219895288e-07, + "loss": 1.6787, + "step": 21 + }, + { + "epoch": 0.0, + "grad_norm": 21.361321476963756, + "learning_rate": 5.759162303664922e-07, + "loss": 1.705, + "step": 22 + }, + { + "epoch": 0.0, + "grad_norm": 21.549608835731224, + "learning_rate": 6.020942408376964e-07, + "loss": 1.7105, + "step": 23 + }, + { + "epoch": 0.0, + "grad_norm": 21.021308074943473, + "learning_rate": 6.282722513089005e-07, + "loss": 1.6903, + "step": 24 + }, + { + "epoch": 0.0, + "grad_norm": 19.64468927723245, + "learning_rate": 6.544502617801048e-07, + "loss": 1.599, + "step": 25 + }, + { + "epoch": 0.0, + "grad_norm": 13.842243155236734, + "learning_rate": 6.80628272251309e-07, + "loss": 1.3371, + "step": 26 + }, + { + "epoch": 0.0, + "grad_norm": 13.007324116297383, + "learning_rate": 7.068062827225131e-07, + "loss": 1.2012, + "step": 27 + }, + { + "epoch": 0.0, + "grad_norm": 11.699921616286352, + "learning_rate": 7.329842931937173e-07, + "loss": 1.1503, + "step": 28 + }, + { + "epoch": 0.0, + "grad_norm": 2.1694482285209635, + "learning_rate": 7.591623036649215e-07, + "loss": 0.8114, + "step": 29 + }, + { + "epoch": 0.0, + "grad_norm": 9.473830646625572, + "learning_rate": 7.853403141361258e-07, + "loss": 1.1225, + "step": 30 + }, + { + "epoch": 0.0, + "grad_norm": 2.1434185968423534, + "learning_rate": 8.115183246073299e-07, + "loss": 0.818, + "step": 31 + }, + { + "epoch": 0.0, + "grad_norm": 6.733538116772037, + "learning_rate": 8.376963350785341e-07, + "loss": 1.1269, + "step": 32 + }, + { + "epoch": 0.0, + "grad_norm": 6.034106740920896, + "learning_rate": 8.638743455497383e-07, + "loss": 1.0395, + "step": 33 + }, + { + "epoch": 0.0, + "grad_norm": 1.8819983017729784, + "learning_rate": 8.900523560209425e-07, + "loss": 0.8208, + "step": 34 + }, + { + "epoch": 0.0, + "grad_norm": 4.565638901619074, + "learning_rate": 9.162303664921466e-07, + "loss": 1.0407, + "step": 35 + }, + { + "epoch": 0.0, + "grad_norm": 3.942797329758625, + "learning_rate": 9.424083769633509e-07, + "loss": 0.9814, + "step": 36 + }, + { + "epoch": 0.0, + "grad_norm": 4.204111590726132, + "learning_rate": 9.685863874345552e-07, + "loss": 0.9773, + "step": 37 + }, + { + "epoch": 0.0, + "grad_norm": 6.489121510119625, + "learning_rate": 9.947643979057591e-07, + "loss": 1.0226, + "step": 38 + }, + { + "epoch": 0.0, + "grad_norm": 3.9381644636937665, + "learning_rate": 1.0209424083769635e-06, + "loss": 0.9893, + "step": 39 + }, + { + "epoch": 0.0, + "grad_norm": 3.7947487895876972, + "learning_rate": 1.0471204188481676e-06, + "loss": 0.9293, + "step": 40 + }, + { + "epoch": 0.0, + "grad_norm": 4.813517039648087, + "learning_rate": 1.0732984293193717e-06, + "loss": 0.9161, + "step": 41 + }, + { + "epoch": 0.0, + "grad_norm": 3.661896737719922, + "learning_rate": 1.099476439790576e-06, + "loss": 0.9508, + "step": 42 + }, + { + "epoch": 0.0, + "grad_norm": 2.7547648016055883, + "learning_rate": 1.1256544502617802e-06, + "loss": 0.844, + "step": 43 + }, + { + "epoch": 0.0, + "grad_norm": 2.7526840475439416, + "learning_rate": 1.1518324607329843e-06, + "loss": 0.8643, + "step": 44 + }, + { + "epoch": 0.0, + "grad_norm": 3.653828512334836, + "learning_rate": 1.1780104712041885e-06, + "loss": 0.8822, + "step": 45 + }, + { + "epoch": 0.0, + "grad_norm": 3.10466377297939, + "learning_rate": 1.2041884816753928e-06, + "loss": 0.9145, + "step": 46 + }, + { + "epoch": 0.0, + "grad_norm": 2.5673009092667383, + "learning_rate": 1.230366492146597e-06, + "loss": 0.8749, + "step": 47 + }, + { + "epoch": 0.0, + "grad_norm": 2.4764666653184566, + "learning_rate": 1.256544502617801e-06, + "loss": 0.8176, + "step": 48 + }, + { + "epoch": 0.0, + "grad_norm": 1.6013297216851865, + "learning_rate": 1.2827225130890052e-06, + "loss": 0.7885, + "step": 49 + }, + { + "epoch": 0.0, + "grad_norm": 2.29556227978767, + "learning_rate": 1.3089005235602096e-06, + "loss": 0.8863, + "step": 50 + }, + { + "epoch": 0.0, + "grad_norm": 1.5164595777894039, + "learning_rate": 1.3350785340314137e-06, + "loss": 0.7677, + "step": 51 + }, + { + "epoch": 0.0, + "grad_norm": 2.531990875446019, + "learning_rate": 1.361256544502618e-06, + "loss": 0.7966, + "step": 52 + }, + { + "epoch": 0.0, + "grad_norm": 2.575988250397976, + "learning_rate": 1.3874345549738222e-06, + "loss": 0.7997, + "step": 53 + }, + { + "epoch": 0.0, + "grad_norm": 1.3833393515964518, + "learning_rate": 1.4136125654450263e-06, + "loss": 0.7431, + "step": 54 + }, + { + "epoch": 0.0, + "grad_norm": 3.399781372632845, + "learning_rate": 1.4397905759162306e-06, + "loss": 0.7796, + "step": 55 + }, + { + "epoch": 0.0, + "grad_norm": 2.333718947214041, + "learning_rate": 1.4659685863874346e-06, + "loss": 0.7743, + "step": 56 + }, + { + "epoch": 0.0, + "grad_norm": 2.7429944804838637, + "learning_rate": 1.4921465968586387e-06, + "loss": 0.7777, + "step": 57 + }, + { + "epoch": 0.0, + "grad_norm": 1.2671153309569032, + "learning_rate": 1.518324607329843e-06, + "loss": 0.7201, + "step": 58 + }, + { + "epoch": 0.0, + "grad_norm": 3.0254278124287546, + "learning_rate": 1.5445026178010472e-06, + "loss": 0.7478, + "step": 59 + }, + { + "epoch": 0.0, + "grad_norm": 1.1766331642515933, + "learning_rate": 1.5706806282722515e-06, + "loss": 0.7655, + "step": 60 + }, + { + "epoch": 0.0, + "grad_norm": 3.2767456118987313, + "learning_rate": 1.5968586387434556e-06, + "loss": 0.755, + "step": 61 + }, + { + "epoch": 0.0, + "grad_norm": 1.8077649837417682, + "learning_rate": 1.6230366492146598e-06, + "loss": 0.8457, + "step": 62 + }, + { + "epoch": 0.0, + "grad_norm": 1.8535791279862603, + "learning_rate": 1.6492146596858641e-06, + "loss": 0.7367, + "step": 63 + }, + { + "epoch": 0.01, + "grad_norm": 1.8543924838381893, + "learning_rate": 1.6753926701570683e-06, + "loss": 0.8021, + "step": 64 + }, + { + "epoch": 0.01, + "grad_norm": 2.1524070641273987, + "learning_rate": 1.7015706806282726e-06, + "loss": 0.801, + "step": 65 + }, + { + "epoch": 0.01, + "grad_norm": 1.9329178296807856, + "learning_rate": 1.7277486910994765e-06, + "loss": 0.775, + "step": 66 + }, + { + "epoch": 0.01, + "grad_norm": 3.957473044576722, + "learning_rate": 1.7539267015706806e-06, + "loss": 0.7681, + "step": 67 + }, + { + "epoch": 0.01, + "grad_norm": 2.1310805128679333, + "learning_rate": 1.780104712041885e-06, + "loss": 0.7694, + "step": 68 + }, + { + "epoch": 0.01, + "grad_norm": 1.8353382725906293, + "learning_rate": 1.8062827225130891e-06, + "loss": 0.7818, + "step": 69 + }, + { + "epoch": 0.01, + "grad_norm": 2.1697942415397358, + "learning_rate": 1.8324607329842933e-06, + "loss": 0.7907, + "step": 70 + }, + { + "epoch": 0.01, + "grad_norm": 1.9731505521769248, + "learning_rate": 1.8586387434554976e-06, + "loss": 0.7442, + "step": 71 + }, + { + "epoch": 0.01, + "grad_norm": 2.1352160467709305, + "learning_rate": 1.8848167539267017e-06, + "loss": 0.7367, + "step": 72 + }, + { + "epoch": 0.01, + "grad_norm": 2.206590331680779, + "learning_rate": 1.910994764397906e-06, + "loss": 0.7124, + "step": 73 + }, + { + "epoch": 0.01, + "grad_norm": 0.8992891746324881, + "learning_rate": 1.9371727748691104e-06, + "loss": 0.722, + "step": 74 + }, + { + "epoch": 0.01, + "grad_norm": 2.118054922158826, + "learning_rate": 1.9633507853403143e-06, + "loss": 0.786, + "step": 75 + }, + { + "epoch": 0.01, + "grad_norm": 2.1194976413857725, + "learning_rate": 1.9895287958115183e-06, + "loss": 0.7665, + "step": 76 + }, + { + "epoch": 0.01, + "grad_norm": 2.2500958372978688, + "learning_rate": 2.0157068062827226e-06, + "loss": 0.7261, + "step": 77 + }, + { + "epoch": 0.01, + "grad_norm": 1.9164092845719047, + "learning_rate": 2.041884816753927e-06, + "loss": 0.7227, + "step": 78 + }, + { + "epoch": 0.01, + "grad_norm": 2.027027307166561, + "learning_rate": 2.068062827225131e-06, + "loss": 0.7995, + "step": 79 + }, + { + "epoch": 0.01, + "grad_norm": 2.6736602072120665, + "learning_rate": 2.094240837696335e-06, + "loss": 0.7129, + "step": 80 + }, + { + "epoch": 0.01, + "grad_norm": 0.9629688978851059, + "learning_rate": 2.1204188481675396e-06, + "loss": 0.7553, + "step": 81 + }, + { + "epoch": 0.01, + "grad_norm": 2.083917113258539, + "learning_rate": 2.1465968586387435e-06, + "loss": 0.7382, + "step": 82 + }, + { + "epoch": 0.01, + "grad_norm": 1.9663694967419423, + "learning_rate": 2.172774869109948e-06, + "loss": 0.7549, + "step": 83 + }, + { + "epoch": 0.01, + "grad_norm": 1.8805571070809088, + "learning_rate": 2.198952879581152e-06, + "loss": 0.7458, + "step": 84 + }, + { + "epoch": 0.01, + "grad_norm": 1.9954794692218607, + "learning_rate": 2.2251308900523565e-06, + "loss": 0.6968, + "step": 85 + }, + { + "epoch": 0.01, + "grad_norm": 1.9157681391283916, + "learning_rate": 2.2513089005235604e-06, + "loss": 0.7518, + "step": 86 + }, + { + "epoch": 0.01, + "grad_norm": 1.872223663895253, + "learning_rate": 2.2774869109947643e-06, + "loss": 0.6708, + "step": 87 + }, + { + "epoch": 0.01, + "grad_norm": 1.969999877582168, + "learning_rate": 2.3036649214659687e-06, + "loss": 0.6439, + "step": 88 + }, + { + "epoch": 0.01, + "grad_norm": 1.858958355519399, + "learning_rate": 2.329842931937173e-06, + "loss": 0.7436, + "step": 89 + }, + { + "epoch": 0.01, + "grad_norm": 2.3418565405365506, + "learning_rate": 2.356020942408377e-06, + "loss": 0.7478, + "step": 90 + }, + { + "epoch": 0.01, + "grad_norm": 1.7936193337983721, + "learning_rate": 2.3821989528795813e-06, + "loss": 0.7125, + "step": 91 + }, + { + "epoch": 0.01, + "grad_norm": 1.892150276071557, + "learning_rate": 2.4083769633507856e-06, + "loss": 0.7338, + "step": 92 + }, + { + "epoch": 0.01, + "grad_norm": 2.097308265278116, + "learning_rate": 2.43455497382199e-06, + "loss": 0.7129, + "step": 93 + }, + { + "epoch": 0.01, + "grad_norm": 2.072006523639674, + "learning_rate": 2.460732984293194e-06, + "loss": 0.671, + "step": 94 + }, + { + "epoch": 0.01, + "grad_norm": 2.0785238995907087, + "learning_rate": 2.4869109947643982e-06, + "loss": 0.7266, + "step": 95 + }, + { + "epoch": 0.01, + "grad_norm": 1.8997777764978196, + "learning_rate": 2.513089005235602e-06, + "loss": 0.6566, + "step": 96 + }, + { + "epoch": 0.01, + "grad_norm": 2.014901719738401, + "learning_rate": 2.5392670157068065e-06, + "loss": 0.7186, + "step": 97 + }, + { + "epoch": 0.01, + "grad_norm": 5.370922928969181, + "learning_rate": 2.5654450261780104e-06, + "loss": 0.658, + "step": 98 + }, + { + "epoch": 0.01, + "grad_norm": 1.7807101146304956, + "learning_rate": 2.591623036649215e-06, + "loss": 0.6483, + "step": 99 + }, + { + "epoch": 0.01, + "grad_norm": 1.8367424093743598, + "learning_rate": 2.617801047120419e-06, + "loss": 0.7254, + "step": 100 + }, + { + "epoch": 0.01, + "grad_norm": 1.926337759300584, + "learning_rate": 2.643979057591623e-06, + "loss": 0.7583, + "step": 101 + }, + { + "epoch": 0.01, + "grad_norm": 1.9956710385213385, + "learning_rate": 2.6701570680628274e-06, + "loss": 0.6249, + "step": 102 + }, + { + "epoch": 0.01, + "grad_norm": 1.791609885701657, + "learning_rate": 2.6963350785340313e-06, + "loss": 0.6928, + "step": 103 + }, + { + "epoch": 0.01, + "grad_norm": 1.9518302967935883, + "learning_rate": 2.722513089005236e-06, + "loss": 0.678, + "step": 104 + }, + { + "epoch": 0.01, + "grad_norm": 2.039064240931749, + "learning_rate": 2.74869109947644e-06, + "loss": 0.7056, + "step": 105 + }, + { + "epoch": 0.01, + "grad_norm": 2.3678913250403104, + "learning_rate": 2.7748691099476443e-06, + "loss": 0.6715, + "step": 106 + }, + { + "epoch": 0.01, + "grad_norm": 1.9267736176367727, + "learning_rate": 2.8010471204188483e-06, + "loss": 0.7046, + "step": 107 + }, + { + "epoch": 0.01, + "grad_norm": 0.8636531425355595, + "learning_rate": 2.8272251308900526e-06, + "loss": 0.701, + "step": 108 + }, + { + "epoch": 0.01, + "grad_norm": 2.0604628063700496, + "learning_rate": 2.853403141361257e-06, + "loss": 0.7115, + "step": 109 + }, + { + "epoch": 0.01, + "grad_norm": 1.7930819856262614, + "learning_rate": 2.8795811518324613e-06, + "loss": 0.6732, + "step": 110 + }, + { + "epoch": 0.01, + "grad_norm": 2.105418395736594, + "learning_rate": 2.905759162303665e-06, + "loss": 0.6431, + "step": 111 + }, + { + "epoch": 0.01, + "grad_norm": 1.8189305880342908, + "learning_rate": 2.931937172774869e-06, + "loss": 0.6296, + "step": 112 + }, + { + "epoch": 0.01, + "grad_norm": 0.8235003667751857, + "learning_rate": 2.9581151832460735e-06, + "loss": 0.7217, + "step": 113 + }, + { + "epoch": 0.01, + "grad_norm": 2.2766271969490917, + "learning_rate": 2.9842931937172774e-06, + "loss": 0.6682, + "step": 114 + }, + { + "epoch": 0.01, + "grad_norm": 1.8820752427366934, + "learning_rate": 3.010471204188482e-06, + "loss": 0.6738, + "step": 115 + }, + { + "epoch": 0.01, + "grad_norm": 2.2754682331303964, + "learning_rate": 3.036649214659686e-06, + "loss": 0.7329, + "step": 116 + }, + { + "epoch": 0.01, + "grad_norm": 1.8743072977229744, + "learning_rate": 3.0628272251308904e-06, + "loss": 0.6589, + "step": 117 + }, + { + "epoch": 0.01, + "grad_norm": 2.1694655485962713, + "learning_rate": 3.0890052356020943e-06, + "loss": 0.6579, + "step": 118 + }, + { + "epoch": 0.01, + "grad_norm": 2.0199711618861205, + "learning_rate": 3.115183246073299e-06, + "loss": 0.6807, + "step": 119 + }, + { + "epoch": 0.01, + "grad_norm": 2.1292929227860204, + "learning_rate": 3.141361256544503e-06, + "loss": 0.6724, + "step": 120 + }, + { + "epoch": 0.01, + "grad_norm": 2.1393759446029326, + "learning_rate": 3.167539267015707e-06, + "loss": 0.6673, + "step": 121 + }, + { + "epoch": 0.01, + "grad_norm": 2.3856643323755993, + "learning_rate": 3.1937172774869113e-06, + "loss": 0.6203, + "step": 122 + }, + { + "epoch": 0.01, + "grad_norm": 2.561140903379722, + "learning_rate": 3.219895287958115e-06, + "loss": 0.6751, + "step": 123 + }, + { + "epoch": 0.01, + "grad_norm": 1.7662570854834847, + "learning_rate": 3.2460732984293196e-06, + "loss": 0.6614, + "step": 124 + }, + { + "epoch": 0.01, + "grad_norm": 1.8324500732709519, + "learning_rate": 3.2722513089005235e-06, + "loss": 0.6513, + "step": 125 + }, + { + "epoch": 0.01, + "grad_norm": 1.7679113291895352, + "learning_rate": 3.2984293193717282e-06, + "loss": 0.6594, + "step": 126 + }, + { + "epoch": 0.01, + "grad_norm": 1.951173192966468, + "learning_rate": 3.324607329842932e-06, + "loss": 0.6229, + "step": 127 + }, + { + "epoch": 0.01, + "grad_norm": 2.333074081486533, + "learning_rate": 3.3507853403141365e-06, + "loss": 0.595, + "step": 128 + }, + { + "epoch": 0.01, + "grad_norm": 2.881689358329475, + "learning_rate": 3.3769633507853404e-06, + "loss": 0.6513, + "step": 129 + }, + { + "epoch": 0.01, + "grad_norm": 2.070975299362807, + "learning_rate": 3.403141361256545e-06, + "loss": 0.614, + "step": 130 + }, + { + "epoch": 0.01, + "grad_norm": 2.206312931997375, + "learning_rate": 3.429319371727749e-06, + "loss": 0.6907, + "step": 131 + }, + { + "epoch": 0.01, + "grad_norm": 1.7629747669733618, + "learning_rate": 3.455497382198953e-06, + "loss": 0.6602, + "step": 132 + }, + { + "epoch": 0.01, + "grad_norm": 1.9321217548759493, + "learning_rate": 3.4816753926701574e-06, + "loss": 0.6575, + "step": 133 + }, + { + "epoch": 0.01, + "grad_norm": 1.9808782435471375, + "learning_rate": 3.5078534031413613e-06, + "loss": 0.6389, + "step": 134 + }, + { + "epoch": 0.01, + "grad_norm": 2.131088425682013, + "learning_rate": 3.534031413612566e-06, + "loss": 0.6231, + "step": 135 + }, + { + "epoch": 0.01, + "grad_norm": 2.028134267184075, + "learning_rate": 3.56020942408377e-06, + "loss": 0.6508, + "step": 136 + }, + { + "epoch": 0.01, + "grad_norm": 2.138081828542439, + "learning_rate": 3.5863874345549743e-06, + "loss": 0.6456, + "step": 137 + }, + { + "epoch": 0.01, + "grad_norm": 1.716613124461599, + "learning_rate": 3.6125654450261782e-06, + "loss": 0.6142, + "step": 138 + }, + { + "epoch": 0.01, + "grad_norm": 2.023720585340836, + "learning_rate": 3.6387434554973826e-06, + "loss": 0.6671, + "step": 139 + }, + { + "epoch": 0.01, + "grad_norm": 10.194023941680866, + "learning_rate": 3.6649214659685865e-06, + "loss": 0.6451, + "step": 140 + }, + { + "epoch": 0.01, + "grad_norm": 1.9972966870883009, + "learning_rate": 3.6910994764397904e-06, + "loss": 0.6319, + "step": 141 + }, + { + "epoch": 0.01, + "grad_norm": 2.0212654702091317, + "learning_rate": 3.717277486910995e-06, + "loss": 0.6509, + "step": 142 + }, + { + "epoch": 0.01, + "grad_norm": 0.9433165276775327, + "learning_rate": 3.743455497382199e-06, + "loss": 0.6993, + "step": 143 + }, + { + "epoch": 0.01, + "grad_norm": 2.1309731395963945, + "learning_rate": 3.7696335078534035e-06, + "loss": 0.7071, + "step": 144 + }, + { + "epoch": 0.01, + "grad_norm": 0.8921011279008414, + "learning_rate": 3.7958115183246074e-06, + "loss": 0.709, + "step": 145 + }, + { + "epoch": 0.01, + "grad_norm": 2.6366436090625625, + "learning_rate": 3.821989528795812e-06, + "loss": 0.6248, + "step": 146 + }, + { + "epoch": 0.01, + "grad_norm": 1.857571469328871, + "learning_rate": 3.848167539267016e-06, + "loss": 0.6901, + "step": 147 + }, + { + "epoch": 0.01, + "grad_norm": 0.7736744769520083, + "learning_rate": 3.874345549738221e-06, + "loss": 0.6689, + "step": 148 + }, + { + "epoch": 0.01, + "grad_norm": 2.2598614397167878, + "learning_rate": 3.900523560209425e-06, + "loss": 0.644, + "step": 149 + }, + { + "epoch": 0.01, + "grad_norm": 1.7893197071307048, + "learning_rate": 3.926701570680629e-06, + "loss": 0.6462, + "step": 150 + }, + { + "epoch": 0.01, + "grad_norm": 1.6474949981268692, + "learning_rate": 3.952879581151833e-06, + "loss": 0.6007, + "step": 151 + }, + { + "epoch": 0.01, + "grad_norm": 1.999142759485342, + "learning_rate": 3.9790575916230365e-06, + "loss": 0.6089, + "step": 152 + }, + { + "epoch": 0.01, + "grad_norm": 2.3139743873530603, + "learning_rate": 4.005235602094241e-06, + "loss": 0.6537, + "step": 153 + }, + { + "epoch": 0.01, + "grad_norm": 1.8846663774879808, + "learning_rate": 4.031413612565445e-06, + "loss": 0.6639, + "step": 154 + }, + { + "epoch": 0.01, + "grad_norm": 1.734985030227232, + "learning_rate": 4.05759162303665e-06, + "loss": 0.6616, + "step": 155 + }, + { + "epoch": 0.01, + "grad_norm": 2.5920569033949876, + "learning_rate": 4.083769633507854e-06, + "loss": 0.6046, + "step": 156 + }, + { + "epoch": 0.01, + "grad_norm": 1.6491730373484021, + "learning_rate": 4.109947643979058e-06, + "loss": 0.5964, + "step": 157 + }, + { + "epoch": 0.01, + "grad_norm": 1.586006489836427, + "learning_rate": 4.136125654450262e-06, + "loss": 0.5764, + "step": 158 + }, + { + "epoch": 0.01, + "grad_norm": 2.10847911947796, + "learning_rate": 4.1623036649214665e-06, + "loss": 0.6442, + "step": 159 + }, + { + "epoch": 0.01, + "grad_norm": 1.7830649184861866, + "learning_rate": 4.18848167539267e-06, + "loss": 0.5902, + "step": 160 + }, + { + "epoch": 0.01, + "grad_norm": 2.5904815875916594, + "learning_rate": 4.214659685863874e-06, + "loss": 0.6195, + "step": 161 + }, + { + "epoch": 0.01, + "grad_norm": 1.6920124196984998, + "learning_rate": 4.240837696335079e-06, + "loss": 0.6312, + "step": 162 + }, + { + "epoch": 0.01, + "grad_norm": 1.8741205082859589, + "learning_rate": 4.267015706806283e-06, + "loss": 0.6279, + "step": 163 + }, + { + "epoch": 0.01, + "grad_norm": 2.4532328049185614, + "learning_rate": 4.293193717277487e-06, + "loss": 0.6587, + "step": 164 + }, + { + "epoch": 0.01, + "grad_norm": 1.7725115371560787, + "learning_rate": 4.319371727748692e-06, + "loss": 0.5951, + "step": 165 + }, + { + "epoch": 0.01, + "grad_norm": 2.6146636137372306, + "learning_rate": 4.345549738219896e-06, + "loss": 0.6257, + "step": 166 + }, + { + "epoch": 0.01, + "grad_norm": 2.2136877777522312, + "learning_rate": 4.3717277486910996e-06, + "loss": 0.6117, + "step": 167 + }, + { + "epoch": 0.01, + "grad_norm": 5.268881685131859, + "learning_rate": 4.397905759162304e-06, + "loss": 0.6686, + "step": 168 + }, + { + "epoch": 0.01, + "grad_norm": 0.696900463368965, + "learning_rate": 4.424083769633508e-06, + "loss": 0.7198, + "step": 169 + }, + { + "epoch": 0.01, + "grad_norm": 1.7059533677803898, + "learning_rate": 4.450261780104713e-06, + "loss": 0.5859, + "step": 170 + }, + { + "epoch": 0.01, + "grad_norm": 2.44624084635017, + "learning_rate": 4.476439790575917e-06, + "loss": 0.6901, + "step": 171 + }, + { + "epoch": 0.01, + "grad_norm": 2.2909153639175632, + "learning_rate": 4.502617801047121e-06, + "loss": 0.6767, + "step": 172 + }, + { + "epoch": 0.01, + "grad_norm": 0.7291815189262174, + "learning_rate": 4.528795811518325e-06, + "loss": 0.6618, + "step": 173 + }, + { + "epoch": 0.01, + "grad_norm": 1.9162523229709274, + "learning_rate": 4.554973821989529e-06, + "loss": 0.6349, + "step": 174 + }, + { + "epoch": 0.01, + "grad_norm": 2.469418735305691, + "learning_rate": 4.5811518324607335e-06, + "loss": 0.5924, + "step": 175 + }, + { + "epoch": 0.01, + "grad_norm": 0.6602749230145328, + "learning_rate": 4.607329842931937e-06, + "loss": 0.6785, + "step": 176 + }, + { + "epoch": 0.01, + "grad_norm": 2.0646182832653475, + "learning_rate": 4.633507853403142e-06, + "loss": 0.6659, + "step": 177 + }, + { + "epoch": 0.01, + "grad_norm": 1.888560880998048, + "learning_rate": 4.659685863874346e-06, + "loss": 0.6238, + "step": 178 + }, + { + "epoch": 0.01, + "grad_norm": 2.2806725046124194, + "learning_rate": 4.68586387434555e-06, + "loss": 0.7167, + "step": 179 + }, + { + "epoch": 0.01, + "grad_norm": 3.2385032657843675, + "learning_rate": 4.712041884816754e-06, + "loss": 0.6622, + "step": 180 + }, + { + "epoch": 0.01, + "grad_norm": 1.705931243861708, + "learning_rate": 4.738219895287958e-06, + "loss": 0.6014, + "step": 181 + }, + { + "epoch": 0.01, + "grad_norm": 0.741836552376092, + "learning_rate": 4.764397905759163e-06, + "loss": 0.6766, + "step": 182 + }, + { + "epoch": 0.01, + "grad_norm": 2.1663446335535017, + "learning_rate": 4.7905759162303665e-06, + "loss": 0.5637, + "step": 183 + }, + { + "epoch": 0.01, + "grad_norm": 2.2821381374598895, + "learning_rate": 4.816753926701571e-06, + "loss": 0.5777, + "step": 184 + }, + { + "epoch": 0.01, + "grad_norm": 0.6867896701604612, + "learning_rate": 4.842931937172775e-06, + "loss": 0.672, + "step": 185 + }, + { + "epoch": 0.01, + "grad_norm": 1.7084110913182926, + "learning_rate": 4.86910994764398e-06, + "loss": 0.6199, + "step": 186 + }, + { + "epoch": 0.01, + "grad_norm": 2.37314978140354, + "learning_rate": 4.895287958115184e-06, + "loss": 0.6857, + "step": 187 + }, + { + "epoch": 0.01, + "grad_norm": 1.91825279061721, + "learning_rate": 4.921465968586388e-06, + "loss": 0.5904, + "step": 188 + }, + { + "epoch": 0.01, + "grad_norm": 2.0749995402993258, + "learning_rate": 4.947643979057592e-06, + "loss": 0.6254, + "step": 189 + }, + { + "epoch": 0.01, + "grad_norm": 2.5100045738950123, + "learning_rate": 4.9738219895287965e-06, + "loss": 0.611, + "step": 190 + }, + { + "epoch": 0.02, + "grad_norm": 1.7371943140092654, + "learning_rate": 5e-06, + "loss": 0.537, + "step": 191 + }, + { + "epoch": 0.02, + "grad_norm": 2.4153437448622332, + "learning_rate": 5.026178010471204e-06, + "loss": 0.6771, + "step": 192 + }, + { + "epoch": 0.02, + "grad_norm": 1.6508481797020853, + "learning_rate": 5.052356020942408e-06, + "loss": 0.6298, + "step": 193 + }, + { + "epoch": 0.02, + "grad_norm": 1.7882164194005299, + "learning_rate": 5.078534031413613e-06, + "loss": 0.6204, + "step": 194 + }, + { + "epoch": 0.02, + "grad_norm": 1.9525946335786866, + "learning_rate": 5.104712041884817e-06, + "loss": 0.5357, + "step": 195 + }, + { + "epoch": 0.02, + "grad_norm": 1.957509212074309, + "learning_rate": 5.130890052356021e-06, + "loss": 0.6606, + "step": 196 + }, + { + "epoch": 0.02, + "grad_norm": 2.0018704189443026, + "learning_rate": 5.157068062827225e-06, + "loss": 0.613, + "step": 197 + }, + { + "epoch": 0.02, + "grad_norm": 2.14760935743133, + "learning_rate": 5.18324607329843e-06, + "loss": 0.6183, + "step": 198 + }, + { + "epoch": 0.02, + "grad_norm": 1.9544711320036594, + "learning_rate": 5.209424083769634e-06, + "loss": 0.6021, + "step": 199 + }, + { + "epoch": 0.02, + "grad_norm": 2.4912528084584915, + "learning_rate": 5.235602094240838e-06, + "loss": 0.5545, + "step": 200 + }, + { + "epoch": 0.02, + "grad_norm": 1.9315115703051489, + "learning_rate": 5.261780104712042e-06, + "loss": 0.6131, + "step": 201 + }, + { + "epoch": 0.02, + "grad_norm": 0.7251473282650897, + "learning_rate": 5.287958115183246e-06, + "loss": 0.6768, + "step": 202 + }, + { + "epoch": 0.02, + "grad_norm": 2.693954234080208, + "learning_rate": 5.314136125654451e-06, + "loss": 0.6137, + "step": 203 + }, + { + "epoch": 0.02, + "grad_norm": 1.7756818197281292, + "learning_rate": 5.340314136125655e-06, + "loss": 0.5806, + "step": 204 + }, + { + "epoch": 0.02, + "grad_norm": 13.078500588517276, + "learning_rate": 5.366492146596859e-06, + "loss": 0.6225, + "step": 205 + }, + { + "epoch": 0.02, + "grad_norm": 2.021719702575313, + "learning_rate": 5.392670157068063e-06, + "loss": 0.6309, + "step": 206 + }, + { + "epoch": 0.02, + "grad_norm": 1.7220006778453985, + "learning_rate": 5.418848167539268e-06, + "loss": 0.6425, + "step": 207 + }, + { + "epoch": 0.02, + "grad_norm": 2.8813082062793094, + "learning_rate": 5.445026178010472e-06, + "loss": 0.6765, + "step": 208 + }, + { + "epoch": 0.02, + "grad_norm": 1.7665225877764177, + "learning_rate": 5.471204188481676e-06, + "loss": 0.5653, + "step": 209 + }, + { + "epoch": 0.02, + "grad_norm": 2.2776931345002365, + "learning_rate": 5.49738219895288e-06, + "loss": 0.5794, + "step": 210 + }, + { + "epoch": 0.02, + "grad_norm": 0.8439308712146631, + "learning_rate": 5.523560209424085e-06, + "loss": 0.6702, + "step": 211 + }, + { + "epoch": 0.02, + "grad_norm": 1.695772975501985, + "learning_rate": 5.549738219895289e-06, + "loss": 0.5512, + "step": 212 + }, + { + "epoch": 0.02, + "grad_norm": 3.0913982050412083, + "learning_rate": 5.575916230366493e-06, + "loss": 0.6256, + "step": 213 + }, + { + "epoch": 0.02, + "grad_norm": 2.1507122257592375, + "learning_rate": 5.6020942408376965e-06, + "loss": 0.5894, + "step": 214 + }, + { + "epoch": 0.02, + "grad_norm": 2.45220638799841, + "learning_rate": 5.6282722513089e-06, + "loss": 0.6211, + "step": 215 + }, + { + "epoch": 0.02, + "grad_norm": 0.7144943444245992, + "learning_rate": 5.654450261780105e-06, + "loss": 0.6629, + "step": 216 + }, + { + "epoch": 0.02, + "grad_norm": 1.9247248310532166, + "learning_rate": 5.680628272251309e-06, + "loss": 0.6102, + "step": 217 + }, + { + "epoch": 0.02, + "grad_norm": 2.1066955967685352, + "learning_rate": 5.706806282722514e-06, + "loss": 0.6287, + "step": 218 + }, + { + "epoch": 0.02, + "grad_norm": 2.1029979108896297, + "learning_rate": 5.732984293193718e-06, + "loss": 0.6234, + "step": 219 + }, + { + "epoch": 0.02, + "grad_norm": 1.894030012191344, + "learning_rate": 5.7591623036649226e-06, + "loss": 0.591, + "step": 220 + }, + { + "epoch": 0.02, + "grad_norm": 2.073173112916301, + "learning_rate": 5.7853403141361265e-06, + "loss": 0.6153, + "step": 221 + }, + { + "epoch": 0.02, + "grad_norm": 0.7002274519542699, + "learning_rate": 5.81151832460733e-06, + "loss": 0.6592, + "step": 222 + }, + { + "epoch": 0.02, + "grad_norm": 1.7930017934151798, + "learning_rate": 5.837696335078534e-06, + "loss": 0.6153, + "step": 223 + }, + { + "epoch": 0.02, + "grad_norm": 2.475914951564157, + "learning_rate": 5.863874345549738e-06, + "loss": 0.5637, + "step": 224 + }, + { + "epoch": 0.02, + "grad_norm": 1.741749162713132, + "learning_rate": 5.890052356020943e-06, + "loss": 0.62, + "step": 225 + }, + { + "epoch": 0.02, + "grad_norm": 1.7759720431129993, + "learning_rate": 5.916230366492147e-06, + "loss": 0.6693, + "step": 226 + }, + { + "epoch": 0.02, + "grad_norm": 0.7343394617034111, + "learning_rate": 5.942408376963351e-06, + "loss": 0.6367, + "step": 227 + }, + { + "epoch": 0.02, + "grad_norm": 1.793166057087866, + "learning_rate": 5.968586387434555e-06, + "loss": 0.5872, + "step": 228 + }, + { + "epoch": 0.02, + "grad_norm": 2.4545737450862313, + "learning_rate": 5.99476439790576e-06, + "loss": 0.5685, + "step": 229 + }, + { + "epoch": 0.02, + "grad_norm": 3.4463504497293354, + "learning_rate": 6.020942408376964e-06, + "loss": 0.6142, + "step": 230 + }, + { + "epoch": 0.02, + "grad_norm": 2.382576115967975, + "learning_rate": 6.047120418848168e-06, + "loss": 0.6581, + "step": 231 + }, + { + "epoch": 0.02, + "grad_norm": 1.7305290249804952, + "learning_rate": 6.073298429319372e-06, + "loss": 0.6467, + "step": 232 + }, + { + "epoch": 0.02, + "grad_norm": 0.8490308149622259, + "learning_rate": 6.099476439790576e-06, + "loss": 0.6647, + "step": 233 + }, + { + "epoch": 0.02, + "grad_norm": 2.1960310676592774, + "learning_rate": 6.125654450261781e-06, + "loss": 0.6218, + "step": 234 + }, + { + "epoch": 0.02, + "grad_norm": 3.7694383713956765, + "learning_rate": 6.151832460732985e-06, + "loss": 0.5273, + "step": 235 + }, + { + "epoch": 0.02, + "grad_norm": 0.6739184255103511, + "learning_rate": 6.178010471204189e-06, + "loss": 0.6381, + "step": 236 + }, + { + "epoch": 0.02, + "grad_norm": 2.0293083758240877, + "learning_rate": 6.204188481675393e-06, + "loss": 0.5907, + "step": 237 + }, + { + "epoch": 0.02, + "grad_norm": 1.566980184009032, + "learning_rate": 6.230366492146598e-06, + "loss": 0.5775, + "step": 238 + }, + { + "epoch": 0.02, + "grad_norm": 2.419682987980466, + "learning_rate": 6.256544502617802e-06, + "loss": 0.6018, + "step": 239 + }, + { + "epoch": 0.02, + "grad_norm": 1.7971667174279409, + "learning_rate": 6.282722513089006e-06, + "loss": 0.6207, + "step": 240 + }, + { + "epoch": 0.02, + "grad_norm": 1.8222602544518638, + "learning_rate": 6.30890052356021e-06, + "loss": 0.5495, + "step": 241 + }, + { + "epoch": 0.02, + "grad_norm": 1.9655782418817094, + "learning_rate": 6.335078534031414e-06, + "loss": 0.5693, + "step": 242 + }, + { + "epoch": 0.02, + "grad_norm": 0.6491065496522279, + "learning_rate": 6.361256544502619e-06, + "loss": 0.66, + "step": 243 + }, + { + "epoch": 0.02, + "grad_norm": 3.7489167017238296, + "learning_rate": 6.3874345549738226e-06, + "loss": 0.5976, + "step": 244 + }, + { + "epoch": 0.02, + "grad_norm": 0.7879946736689729, + "learning_rate": 6.4136125654450265e-06, + "loss": 0.6461, + "step": 245 + }, + { + "epoch": 0.02, + "grad_norm": 0.8619264996832411, + "learning_rate": 6.43979057591623e-06, + "loss": 0.6521, + "step": 246 + }, + { + "epoch": 0.02, + "grad_norm": 1.9410176997917161, + "learning_rate": 6.465968586387435e-06, + "loss": 0.611, + "step": 247 + }, + { + "epoch": 0.02, + "grad_norm": 0.7008009225139062, + "learning_rate": 6.492146596858639e-06, + "loss": 0.626, + "step": 248 + }, + { + "epoch": 0.02, + "grad_norm": 3.189246698862531, + "learning_rate": 6.518324607329843e-06, + "loss": 0.5695, + "step": 249 + }, + { + "epoch": 0.02, + "grad_norm": 2.53575826182709, + "learning_rate": 6.544502617801047e-06, + "loss": 0.6037, + "step": 250 + }, + { + "epoch": 0.02, + "grad_norm": 0.7095931613050814, + "learning_rate": 6.5706806282722526e-06, + "loss": 0.6386, + "step": 251 + }, + { + "epoch": 0.02, + "grad_norm": 1.9137628661884745, + "learning_rate": 6.5968586387434565e-06, + "loss": 0.5649, + "step": 252 + }, + { + "epoch": 0.02, + "grad_norm": 1.9677194691877633, + "learning_rate": 6.62303664921466e-06, + "loss": 0.6404, + "step": 253 + }, + { + "epoch": 0.02, + "grad_norm": 1.8663511932921686, + "learning_rate": 6.649214659685864e-06, + "loss": 0.5249, + "step": 254 + }, + { + "epoch": 0.02, + "grad_norm": 2.0004844468630747, + "learning_rate": 6.675392670157068e-06, + "loss": 0.6263, + "step": 255 + }, + { + "epoch": 0.02, + "grad_norm": 2.099160308105679, + "learning_rate": 6.701570680628273e-06, + "loss": 0.6105, + "step": 256 + }, + { + "epoch": 0.02, + "grad_norm": 2.392738953440951, + "learning_rate": 6.727748691099477e-06, + "loss": 0.5954, + "step": 257 + }, + { + "epoch": 0.02, + "grad_norm": 2.0355276124794885, + "learning_rate": 6.753926701570681e-06, + "loss": 0.6671, + "step": 258 + }, + { + "epoch": 0.02, + "grad_norm": 2.03458385969778, + "learning_rate": 6.780104712041885e-06, + "loss": 0.5604, + "step": 259 + }, + { + "epoch": 0.02, + "grad_norm": 3.8883597384286643, + "learning_rate": 6.80628272251309e-06, + "loss": 0.5639, + "step": 260 + }, + { + "epoch": 0.02, + "grad_norm": 1.866747787672583, + "learning_rate": 6.832460732984294e-06, + "loss": 0.6616, + "step": 261 + }, + { + "epoch": 0.02, + "grad_norm": 1.903986982294793, + "learning_rate": 6.858638743455498e-06, + "loss": 0.5886, + "step": 262 + }, + { + "epoch": 0.02, + "grad_norm": 2.073994782101355, + "learning_rate": 6.884816753926702e-06, + "loss": 0.6262, + "step": 263 + }, + { + "epoch": 0.02, + "grad_norm": 0.753531478359543, + "learning_rate": 6.910994764397906e-06, + "loss": 0.6491, + "step": 264 + }, + { + "epoch": 0.02, + "grad_norm": 1.9269080468807167, + "learning_rate": 6.937172774869111e-06, + "loss": 0.6587, + "step": 265 + }, + { + "epoch": 0.02, + "grad_norm": 1.8591959193729306, + "learning_rate": 6.963350785340315e-06, + "loss": 0.575, + "step": 266 + }, + { + "epoch": 0.02, + "grad_norm": 1.7600350309134827, + "learning_rate": 6.989528795811519e-06, + "loss": 0.6398, + "step": 267 + }, + { + "epoch": 0.02, + "grad_norm": 1.7602001182676164, + "learning_rate": 7.015706806282723e-06, + "loss": 0.6146, + "step": 268 + }, + { + "epoch": 0.02, + "grad_norm": 0.8551764949302686, + "learning_rate": 7.041884816753927e-06, + "loss": 0.6504, + "step": 269 + }, + { + "epoch": 0.02, + "grad_norm": 3.0024281431622826, + "learning_rate": 7.068062827225132e-06, + "loss": 0.6621, + "step": 270 + }, + { + "epoch": 0.02, + "grad_norm": 2.2210609238450276, + "learning_rate": 7.094240837696336e-06, + "loss": 0.552, + "step": 271 + }, + { + "epoch": 0.02, + "grad_norm": 1.8397689913412292, + "learning_rate": 7.12041884816754e-06, + "loss": 0.6015, + "step": 272 + }, + { + "epoch": 0.02, + "grad_norm": 2.218150729643147, + "learning_rate": 7.146596858638744e-06, + "loss": 0.6113, + "step": 273 + }, + { + "epoch": 0.02, + "grad_norm": 2.1596853885393013, + "learning_rate": 7.172774869109949e-06, + "loss": 0.636, + "step": 274 + }, + { + "epoch": 0.02, + "grad_norm": 1.668983011893593, + "learning_rate": 7.1989528795811526e-06, + "loss": 0.5872, + "step": 275 + }, + { + "epoch": 0.02, + "grad_norm": 1.5893830057802592, + "learning_rate": 7.2251308900523565e-06, + "loss": 0.5862, + "step": 276 + }, + { + "epoch": 0.02, + "grad_norm": 2.5747322565354605, + "learning_rate": 7.25130890052356e-06, + "loss": 0.6201, + "step": 277 + }, + { + "epoch": 0.02, + "grad_norm": 1.8829488599479214, + "learning_rate": 7.277486910994765e-06, + "loss": 0.5457, + "step": 278 + }, + { + "epoch": 0.02, + "grad_norm": 2.336956899504294, + "learning_rate": 7.303664921465969e-06, + "loss": 0.6038, + "step": 279 + }, + { + "epoch": 0.02, + "grad_norm": 1.7291076352632406, + "learning_rate": 7.329842931937173e-06, + "loss": 0.5971, + "step": 280 + }, + { + "epoch": 0.02, + "grad_norm": 1.9847124671832421, + "learning_rate": 7.356020942408377e-06, + "loss": 0.6058, + "step": 281 + }, + { + "epoch": 0.02, + "grad_norm": 2.798704849334657, + "learning_rate": 7.382198952879581e-06, + "loss": 0.6312, + "step": 282 + }, + { + "epoch": 0.02, + "grad_norm": 0.7331834478419776, + "learning_rate": 7.4083769633507865e-06, + "loss": 0.6406, + "step": 283 + }, + { + "epoch": 0.02, + "grad_norm": 2.564801529458803, + "learning_rate": 7.43455497382199e-06, + "loss": 0.5365, + "step": 284 + }, + { + "epoch": 0.02, + "grad_norm": 1.7463328102533797, + "learning_rate": 7.460732984293194e-06, + "loss": 0.5818, + "step": 285 + }, + { + "epoch": 0.02, + "grad_norm": 2.9650635818556155, + "learning_rate": 7.486910994764398e-06, + "loss": 0.577, + "step": 286 + }, + { + "epoch": 0.02, + "grad_norm": 1.970520728459688, + "learning_rate": 7.513089005235603e-06, + "loss": 0.5818, + "step": 287 + }, + { + "epoch": 0.02, + "grad_norm": 0.8540747283645477, + "learning_rate": 7.539267015706807e-06, + "loss": 0.6753, + "step": 288 + }, + { + "epoch": 0.02, + "grad_norm": 2.1081833360174698, + "learning_rate": 7.565445026178011e-06, + "loss": 0.5891, + "step": 289 + }, + { + "epoch": 0.02, + "grad_norm": 2.0117508764811607, + "learning_rate": 7.591623036649215e-06, + "loss": 0.6295, + "step": 290 + }, + { + "epoch": 0.02, + "grad_norm": 0.6970549536269707, + "learning_rate": 7.61780104712042e-06, + "loss": 0.6755, + "step": 291 + }, + { + "epoch": 0.02, + "grad_norm": 2.0062637312225844, + "learning_rate": 7.643979057591624e-06, + "loss": 0.5686, + "step": 292 + }, + { + "epoch": 0.02, + "grad_norm": 2.0362096093387865, + "learning_rate": 7.670157068062828e-06, + "loss": 0.5673, + "step": 293 + }, + { + "epoch": 0.02, + "grad_norm": 1.8131146598326235, + "learning_rate": 7.696335078534032e-06, + "loss": 0.6225, + "step": 294 + }, + { + "epoch": 0.02, + "grad_norm": 2.40117781779116, + "learning_rate": 7.722513089005236e-06, + "loss": 0.6092, + "step": 295 + }, + { + "epoch": 0.02, + "grad_norm": 1.942941447243737, + "learning_rate": 7.748691099476442e-06, + "loss": 0.5567, + "step": 296 + }, + { + "epoch": 0.02, + "grad_norm": 2.500181078589944, + "learning_rate": 7.774869109947646e-06, + "loss": 0.591, + "step": 297 + }, + { + "epoch": 0.02, + "grad_norm": 2.2251963124773035, + "learning_rate": 7.80104712041885e-06, + "loss": 0.6031, + "step": 298 + }, + { + "epoch": 0.02, + "grad_norm": 2.0529066305570653, + "learning_rate": 7.827225130890053e-06, + "loss": 0.6128, + "step": 299 + }, + { + "epoch": 0.02, + "grad_norm": 3.090935404900086, + "learning_rate": 7.853403141361257e-06, + "loss": 0.5996, + "step": 300 + }, + { + "epoch": 0.02, + "grad_norm": 1.9555027063866162, + "learning_rate": 7.879581151832461e-06, + "loss": 0.605, + "step": 301 + }, + { + "epoch": 0.02, + "grad_norm": 1.0087905370864692, + "learning_rate": 7.905759162303665e-06, + "loss": 0.6811, + "step": 302 + }, + { + "epoch": 0.02, + "grad_norm": 0.7043236552760113, + "learning_rate": 7.931937172774869e-06, + "loss": 0.6454, + "step": 303 + }, + { + "epoch": 0.02, + "grad_norm": 1.7342801484676242, + "learning_rate": 7.958115183246073e-06, + "loss": 0.5983, + "step": 304 + }, + { + "epoch": 0.02, + "grad_norm": 2.065925420547359, + "learning_rate": 7.984293193717279e-06, + "loss": 0.6141, + "step": 305 + }, + { + "epoch": 0.02, + "grad_norm": 2.3265166806831226, + "learning_rate": 8.010471204188483e-06, + "loss": 0.575, + "step": 306 + }, + { + "epoch": 0.02, + "grad_norm": 2.1327194566548098, + "learning_rate": 8.036649214659686e-06, + "loss": 0.6017, + "step": 307 + }, + { + "epoch": 0.02, + "grad_norm": 2.70623402273943, + "learning_rate": 8.06282722513089e-06, + "loss": 0.6197, + "step": 308 + }, + { + "epoch": 0.02, + "grad_norm": 2.1456416045219573, + "learning_rate": 8.089005235602096e-06, + "loss": 0.5736, + "step": 309 + }, + { + "epoch": 0.02, + "grad_norm": 2.1804800221288447, + "learning_rate": 8.1151832460733e-06, + "loss": 0.5494, + "step": 310 + }, + { + "epoch": 0.02, + "grad_norm": 1.4974476877105807, + "learning_rate": 8.141361256544504e-06, + "loss": 0.6817, + "step": 311 + }, + { + "epoch": 0.02, + "grad_norm": 1.6592394252262372, + "learning_rate": 8.167539267015708e-06, + "loss": 0.5578, + "step": 312 + }, + { + "epoch": 0.02, + "grad_norm": 2.7337570555432884, + "learning_rate": 8.193717277486912e-06, + "loss": 0.6749, + "step": 313 + }, + { + "epoch": 0.02, + "grad_norm": 1.760211895414078, + "learning_rate": 8.219895287958116e-06, + "loss": 0.5212, + "step": 314 + }, + { + "epoch": 0.02, + "grad_norm": 0.9209381045760696, + "learning_rate": 8.24607329842932e-06, + "loss": 0.6555, + "step": 315 + }, + { + "epoch": 0.02, + "grad_norm": 1.8331572579003792, + "learning_rate": 8.272251308900523e-06, + "loss": 0.5658, + "step": 316 + }, + { + "epoch": 0.02, + "grad_norm": 2.2183719228535876, + "learning_rate": 8.298429319371727e-06, + "loss": 0.538, + "step": 317 + }, + { + "epoch": 0.02, + "grad_norm": 2.303546757300317, + "learning_rate": 8.324607329842933e-06, + "loss": 0.575, + "step": 318 + }, + { + "epoch": 0.03, + "grad_norm": 2.4077826408323855, + "learning_rate": 8.350785340314137e-06, + "loss": 0.6142, + "step": 319 + }, + { + "epoch": 0.03, + "grad_norm": 1.7345594508031987, + "learning_rate": 8.37696335078534e-06, + "loss": 0.6065, + "step": 320 + }, + { + "epoch": 0.03, + "grad_norm": 1.9304800395881956, + "learning_rate": 8.403141361256545e-06, + "loss": 0.5618, + "step": 321 + }, + { + "epoch": 0.03, + "grad_norm": 3.5968679781572175, + "learning_rate": 8.429319371727749e-06, + "loss": 0.5563, + "step": 322 + }, + { + "epoch": 0.03, + "grad_norm": 2.006032645539263, + "learning_rate": 8.455497382198954e-06, + "loss": 0.536, + "step": 323 + }, + { + "epoch": 0.03, + "grad_norm": 2.0156420776757384, + "learning_rate": 8.481675392670158e-06, + "loss": 0.5946, + "step": 324 + }, + { + "epoch": 0.03, + "grad_norm": 1.3700939632147118, + "learning_rate": 8.507853403141362e-06, + "loss": 0.6774, + "step": 325 + }, + { + "epoch": 0.03, + "grad_norm": 1.7536817452775255, + "learning_rate": 8.534031413612566e-06, + "loss": 0.5735, + "step": 326 + }, + { + "epoch": 0.03, + "grad_norm": 2.7216317155309775, + "learning_rate": 8.56020942408377e-06, + "loss": 0.601, + "step": 327 + }, + { + "epoch": 0.03, + "grad_norm": 0.8906173526863174, + "learning_rate": 8.586387434554974e-06, + "loss": 0.644, + "step": 328 + }, + { + "epoch": 0.03, + "grad_norm": 1.7774582105780772, + "learning_rate": 8.612565445026178e-06, + "loss": 0.5895, + "step": 329 + }, + { + "epoch": 0.03, + "grad_norm": 1.633526331686594, + "learning_rate": 8.638743455497383e-06, + "loss": 0.587, + "step": 330 + }, + { + "epoch": 0.03, + "grad_norm": 1.9678423695374159, + "learning_rate": 8.664921465968587e-06, + "loss": 0.5704, + "step": 331 + }, + { + "epoch": 0.03, + "grad_norm": 1.8876313906812718, + "learning_rate": 8.691099476439791e-06, + "loss": 0.6116, + "step": 332 + }, + { + "epoch": 0.03, + "grad_norm": 1.796173396005943, + "learning_rate": 8.717277486910995e-06, + "loss": 0.6238, + "step": 333 + }, + { + "epoch": 0.03, + "grad_norm": 1.836606460764124, + "learning_rate": 8.743455497382199e-06, + "loss": 0.5695, + "step": 334 + }, + { + "epoch": 0.03, + "grad_norm": 0.9758437638543772, + "learning_rate": 8.769633507853403e-06, + "loss": 0.6644, + "step": 335 + }, + { + "epoch": 0.03, + "grad_norm": 1.560046653526461, + "learning_rate": 8.795811518324609e-06, + "loss": 0.5174, + "step": 336 + }, + { + "epoch": 0.03, + "grad_norm": 0.8161507847683459, + "learning_rate": 8.821989528795813e-06, + "loss": 0.6484, + "step": 337 + }, + { + "epoch": 0.03, + "grad_norm": 4.276855610138708, + "learning_rate": 8.848167539267016e-06, + "loss": 0.5425, + "step": 338 + }, + { + "epoch": 0.03, + "grad_norm": 1.9185312861963857, + "learning_rate": 8.87434554973822e-06, + "loss": 0.5879, + "step": 339 + }, + { + "epoch": 0.03, + "grad_norm": 1.8551708136535057, + "learning_rate": 8.900523560209426e-06, + "loss": 0.5835, + "step": 340 + }, + { + "epoch": 0.03, + "grad_norm": 1.8561936713146543, + "learning_rate": 8.92670157068063e-06, + "loss": 0.5626, + "step": 341 + }, + { + "epoch": 0.03, + "grad_norm": 3.034340329326399, + "learning_rate": 8.952879581151834e-06, + "loss": 0.6255, + "step": 342 + }, + { + "epoch": 0.03, + "grad_norm": 1.9914305572347764, + "learning_rate": 8.979057591623038e-06, + "loss": 0.5705, + "step": 343 + }, + { + "epoch": 0.03, + "grad_norm": 3.6099638033601917, + "learning_rate": 9.005235602094242e-06, + "loss": 0.6908, + "step": 344 + }, + { + "epoch": 0.03, + "grad_norm": 2.0050492959636452, + "learning_rate": 9.031413612565446e-06, + "loss": 0.5644, + "step": 345 + }, + { + "epoch": 0.03, + "grad_norm": 2.1737043159832683, + "learning_rate": 9.05759162303665e-06, + "loss": 0.5373, + "step": 346 + }, + { + "epoch": 0.03, + "grad_norm": 2.0136487689349023, + "learning_rate": 9.083769633507853e-06, + "loss": 0.5364, + "step": 347 + }, + { + "epoch": 0.03, + "grad_norm": 2.002767248285591, + "learning_rate": 9.109947643979057e-06, + "loss": 0.5913, + "step": 348 + }, + { + "epoch": 0.03, + "grad_norm": 2.542235705571547, + "learning_rate": 9.136125654450263e-06, + "loss": 0.563, + "step": 349 + }, + { + "epoch": 0.03, + "grad_norm": 2.1967348149858994, + "learning_rate": 9.162303664921467e-06, + "loss": 0.5827, + "step": 350 + }, + { + "epoch": 0.03, + "grad_norm": 2.3565261729014044, + "learning_rate": 9.18848167539267e-06, + "loss": 0.6061, + "step": 351 + }, + { + "epoch": 0.03, + "grad_norm": 1.6375159468393599, + "learning_rate": 9.214659685863875e-06, + "loss": 0.5575, + "step": 352 + }, + { + "epoch": 0.03, + "grad_norm": 1.6534083574327783, + "learning_rate": 9.240837696335079e-06, + "loss": 0.585, + "step": 353 + }, + { + "epoch": 0.03, + "grad_norm": 1.331897606240619, + "learning_rate": 9.267015706806284e-06, + "loss": 0.6553, + "step": 354 + }, + { + "epoch": 0.03, + "grad_norm": 1.8143977084759415, + "learning_rate": 9.293193717277488e-06, + "loss": 0.5698, + "step": 355 + }, + { + "epoch": 0.03, + "grad_norm": 2.106723226375449, + "learning_rate": 9.319371727748692e-06, + "loss": 0.5792, + "step": 356 + }, + { + "epoch": 0.03, + "grad_norm": 0.832056961026013, + "learning_rate": 9.345549738219896e-06, + "loss": 0.6394, + "step": 357 + }, + { + "epoch": 0.03, + "grad_norm": 1.9451383298091462, + "learning_rate": 9.3717277486911e-06, + "loss": 0.634, + "step": 358 + }, + { + "epoch": 0.03, + "grad_norm": 1.9579650730860603, + "learning_rate": 9.397905759162304e-06, + "loss": 0.5958, + "step": 359 + }, + { + "epoch": 0.03, + "grad_norm": 1.7314640239068833, + "learning_rate": 9.424083769633508e-06, + "loss": 0.588, + "step": 360 + }, + { + "epoch": 0.03, + "grad_norm": 1.054137365716144, + "learning_rate": 9.450261780104712e-06, + "loss": 0.6334, + "step": 361 + }, + { + "epoch": 0.03, + "grad_norm": 1.0649400917957876, + "learning_rate": 9.476439790575916e-06, + "loss": 0.6584, + "step": 362 + }, + { + "epoch": 0.03, + "grad_norm": 2.0675860262051775, + "learning_rate": 9.502617801047121e-06, + "loss": 0.5991, + "step": 363 + }, + { + "epoch": 0.03, + "grad_norm": 1.8552529365043544, + "learning_rate": 9.528795811518325e-06, + "loss": 0.601, + "step": 364 + }, + { + "epoch": 0.03, + "grad_norm": 2.442538081278847, + "learning_rate": 9.554973821989529e-06, + "loss": 0.5867, + "step": 365 + }, + { + "epoch": 0.03, + "grad_norm": 1.773274313259584, + "learning_rate": 9.581151832460733e-06, + "loss": 0.5683, + "step": 366 + }, + { + "epoch": 0.03, + "grad_norm": 1.899424417954219, + "learning_rate": 9.607329842931939e-06, + "loss": 0.5813, + "step": 367 + }, + { + "epoch": 0.03, + "grad_norm": 1.8323019750385396, + "learning_rate": 9.633507853403143e-06, + "loss": 0.5988, + "step": 368 + }, + { + "epoch": 0.03, + "grad_norm": 2.275918024752598, + "learning_rate": 9.659685863874346e-06, + "loss": 0.5755, + "step": 369 + }, + { + "epoch": 0.03, + "grad_norm": 1.8062118446256774, + "learning_rate": 9.68586387434555e-06, + "loss": 0.531, + "step": 370 + }, + { + "epoch": 0.03, + "grad_norm": 2.5191468664125214, + "learning_rate": 9.712041884816756e-06, + "loss": 0.569, + "step": 371 + }, + { + "epoch": 0.03, + "grad_norm": 3.3224071179207275, + "learning_rate": 9.73821989528796e-06, + "loss": 0.6412, + "step": 372 + }, + { + "epoch": 0.03, + "grad_norm": 1.8788417268521862, + "learning_rate": 9.764397905759164e-06, + "loss": 0.6379, + "step": 373 + }, + { + "epoch": 0.03, + "grad_norm": 2.0602449040659305, + "learning_rate": 9.790575916230368e-06, + "loss": 0.5902, + "step": 374 + }, + { + "epoch": 0.03, + "grad_norm": 2.292427056233333, + "learning_rate": 9.816753926701572e-06, + "loss": 0.5581, + "step": 375 + }, + { + "epoch": 0.03, + "grad_norm": 2.013468798676717, + "learning_rate": 9.842931937172776e-06, + "loss": 0.5499, + "step": 376 + }, + { + "epoch": 0.03, + "grad_norm": 1.8980067534882563, + "learning_rate": 9.86910994764398e-06, + "loss": 0.5419, + "step": 377 + }, + { + "epoch": 0.03, + "grad_norm": 1.6982130201462555, + "learning_rate": 9.895287958115183e-06, + "loss": 0.5613, + "step": 378 + }, + { + "epoch": 0.03, + "grad_norm": 1.8559293426571322, + "learning_rate": 9.921465968586387e-06, + "loss": 0.6029, + "step": 379 + }, + { + "epoch": 0.03, + "grad_norm": 1.6357801187513554, + "learning_rate": 9.947643979057593e-06, + "loss": 0.552, + "step": 380 + }, + { + "epoch": 0.03, + "grad_norm": 2.153690087246258, + "learning_rate": 9.973821989528797e-06, + "loss": 0.5801, + "step": 381 + }, + { + "epoch": 0.03, + "grad_norm": 1.9724041677639783, + "learning_rate": 1e-05, + "loss": 0.5646, + "step": 382 + }, + { + "epoch": 0.03, + "grad_norm": 1.5976335935631814, + "learning_rate": 9.999999838253271e-06, + "loss": 0.5556, + "step": 383 + }, + { + "epoch": 0.03, + "grad_norm": 1.6554892049292345, + "learning_rate": 9.999999353013093e-06, + "loss": 0.5392, + "step": 384 + }, + { + "epoch": 0.03, + "grad_norm": 1.6887486379737568, + "learning_rate": 9.999998544279496e-06, + "loss": 0.641, + "step": 385 + }, + { + "epoch": 0.03, + "grad_norm": 1.7935812618030595, + "learning_rate": 9.999997412052538e-06, + "loss": 0.6678, + "step": 386 + }, + { + "epoch": 0.03, + "grad_norm": 1.872627525790363, + "learning_rate": 9.999995956332285e-06, + "loss": 0.6358, + "step": 387 + }, + { + "epoch": 0.03, + "grad_norm": 1.8964959524243654, + "learning_rate": 9.999994177118834e-06, + "loss": 0.5797, + "step": 388 + }, + { + "epoch": 0.03, + "grad_norm": 1.605241461550817, + "learning_rate": 9.9999920744123e-06, + "loss": 0.6252, + "step": 389 + }, + { + "epoch": 0.03, + "grad_norm": 2.0935016939752162, + "learning_rate": 9.999989648212823e-06, + "loss": 0.5725, + "step": 390 + }, + { + "epoch": 0.03, + "grad_norm": 1.9617880813371835, + "learning_rate": 9.999986898520556e-06, + "loss": 0.5966, + "step": 391 + }, + { + "epoch": 0.03, + "grad_norm": 1.864885226261083, + "learning_rate": 9.999983825335676e-06, + "loss": 0.5576, + "step": 392 + }, + { + "epoch": 0.03, + "grad_norm": 1.8079353702078496, + "learning_rate": 9.999980428658383e-06, + "loss": 0.5538, + "step": 393 + }, + { + "epoch": 0.03, + "grad_norm": 1.8796330905646186, + "learning_rate": 9.999976708488898e-06, + "loss": 0.5572, + "step": 394 + }, + { + "epoch": 0.03, + "grad_norm": 1.781206713302281, + "learning_rate": 9.99997266482746e-06, + "loss": 0.5743, + "step": 395 + }, + { + "epoch": 0.03, + "grad_norm": 1.7108591015121346, + "learning_rate": 9.999968297674332e-06, + "loss": 0.6134, + "step": 396 + }, + { + "epoch": 0.03, + "grad_norm": 8.211612047212949, + "learning_rate": 9.999963607029795e-06, + "loss": 0.6146, + "step": 397 + }, + { + "epoch": 0.03, + "grad_norm": 1.6435930713590208, + "learning_rate": 9.999958592894155e-06, + "loss": 0.5371, + "step": 398 + }, + { + "epoch": 0.03, + "grad_norm": 0.9552178045133713, + "learning_rate": 9.999953255267733e-06, + "loss": 0.6656, + "step": 399 + }, + { + "epoch": 0.03, + "grad_norm": 1.726505569911282, + "learning_rate": 9.999947594150877e-06, + "loss": 0.5598, + "step": 400 + }, + { + "epoch": 0.03, + "grad_norm": 2.6077916866358, + "learning_rate": 9.999941609543953e-06, + "loss": 0.5931, + "step": 401 + }, + { + "epoch": 0.03, + "grad_norm": 1.8457085357305854, + "learning_rate": 9.999935301447348e-06, + "loss": 0.5685, + "step": 402 + }, + { + "epoch": 0.03, + "grad_norm": 2.8357313863810902, + "learning_rate": 9.999928669861467e-06, + "loss": 0.593, + "step": 403 + }, + { + "epoch": 0.03, + "grad_norm": 0.6322761612349168, + "learning_rate": 9.999921714786745e-06, + "loss": 0.6372, + "step": 404 + }, + { + "epoch": 0.03, + "grad_norm": 1.7770033749615561, + "learning_rate": 9.999914436223627e-06, + "loss": 0.5455, + "step": 405 + }, + { + "epoch": 0.03, + "grad_norm": 0.7008374043823205, + "learning_rate": 9.999906834172585e-06, + "loss": 0.6531, + "step": 406 + }, + { + "epoch": 0.03, + "grad_norm": 0.6963471980218585, + "learning_rate": 9.999898908634113e-06, + "loss": 0.6485, + "step": 407 + }, + { + "epoch": 0.03, + "grad_norm": 2.153031692888861, + "learning_rate": 9.999890659608722e-06, + "loss": 0.521, + "step": 408 + }, + { + "epoch": 0.03, + "grad_norm": 1.7396940173231474, + "learning_rate": 9.999882087096945e-06, + "loss": 0.6012, + "step": 409 + }, + { + "epoch": 0.03, + "grad_norm": 2.3235988170966353, + "learning_rate": 9.99987319109934e-06, + "loss": 0.5793, + "step": 410 + }, + { + "epoch": 0.03, + "grad_norm": 0.7559347263093723, + "learning_rate": 9.999863971616479e-06, + "loss": 0.6397, + "step": 411 + }, + { + "epoch": 0.03, + "grad_norm": 2.354326281569878, + "learning_rate": 9.999854428648958e-06, + "loss": 0.5783, + "step": 412 + }, + { + "epoch": 0.03, + "grad_norm": 1.5214335964831733, + "learning_rate": 9.999844562197398e-06, + "loss": 0.5089, + "step": 413 + }, + { + "epoch": 0.03, + "grad_norm": 1.9200327170650573, + "learning_rate": 9.999834372262435e-06, + "loss": 0.5989, + "step": 414 + }, + { + "epoch": 0.03, + "grad_norm": 1.7711204625051933, + "learning_rate": 9.999823858844728e-06, + "loss": 0.6315, + "step": 415 + }, + { + "epoch": 0.03, + "grad_norm": 2.1729131572733382, + "learning_rate": 9.99981302194496e-06, + "loss": 0.567, + "step": 416 + }, + { + "epoch": 0.03, + "grad_norm": 2.390677763196411, + "learning_rate": 9.999801861563828e-06, + "loss": 0.6078, + "step": 417 + }, + { + "epoch": 0.03, + "grad_norm": 1.9439383485691675, + "learning_rate": 9.999790377702057e-06, + "loss": 0.5127, + "step": 418 + }, + { + "epoch": 0.03, + "grad_norm": 1.9192322067186274, + "learning_rate": 9.999778570360387e-06, + "loss": 0.6163, + "step": 419 + }, + { + "epoch": 0.03, + "grad_norm": 1.9973243230444233, + "learning_rate": 9.999766439539588e-06, + "loss": 0.5951, + "step": 420 + }, + { + "epoch": 0.03, + "grad_norm": 2.2213161668187524, + "learning_rate": 9.999753985240439e-06, + "loss": 0.5763, + "step": 421 + }, + { + "epoch": 0.03, + "grad_norm": 1.9057432027906143, + "learning_rate": 9.999741207463747e-06, + "loss": 0.5866, + "step": 422 + }, + { + "epoch": 0.03, + "grad_norm": 2.1312535311870664, + "learning_rate": 9.99972810621034e-06, + "loss": 0.5238, + "step": 423 + }, + { + "epoch": 0.03, + "grad_norm": 0.9074537651287446, + "learning_rate": 9.999714681481064e-06, + "loss": 0.6493, + "step": 424 + }, + { + "epoch": 0.03, + "grad_norm": 2.0890623629097855, + "learning_rate": 9.99970093327679e-06, + "loss": 0.5281, + "step": 425 + }, + { + "epoch": 0.03, + "grad_norm": 0.6919691513639264, + "learning_rate": 9.999686861598406e-06, + "loss": 0.6272, + "step": 426 + }, + { + "epoch": 0.03, + "grad_norm": 2.0637737850327458, + "learning_rate": 9.999672466446821e-06, + "loss": 0.5835, + "step": 427 + }, + { + "epoch": 0.03, + "grad_norm": 0.7076696289292165, + "learning_rate": 9.999657747822969e-06, + "loss": 0.6441, + "step": 428 + }, + { + "epoch": 0.03, + "grad_norm": 2.1453231980105363, + "learning_rate": 9.9996427057278e-06, + "loss": 0.5449, + "step": 429 + }, + { + "epoch": 0.03, + "grad_norm": 2.129405678119133, + "learning_rate": 9.99962734016229e-06, + "loss": 0.5779, + "step": 430 + }, + { + "epoch": 0.03, + "grad_norm": 1.8297615493005956, + "learning_rate": 9.99961165112743e-06, + "loss": 0.6076, + "step": 431 + }, + { + "epoch": 0.03, + "grad_norm": 1.9121379632034627, + "learning_rate": 9.999595638624236e-06, + "loss": 0.5209, + "step": 432 + }, + { + "epoch": 0.03, + "grad_norm": 1.5795173555559119, + "learning_rate": 9.999579302653746e-06, + "loss": 0.568, + "step": 433 + }, + { + "epoch": 0.03, + "grad_norm": 13.785597597336825, + "learning_rate": 9.999562643217016e-06, + "loss": 0.5585, + "step": 434 + }, + { + "epoch": 0.03, + "grad_norm": 1.6807090497651151, + "learning_rate": 9.999545660315121e-06, + "loss": 0.5426, + "step": 435 + }, + { + "epoch": 0.03, + "grad_norm": 0.7456440571424378, + "learning_rate": 9.999528353949163e-06, + "loss": 0.6478, + "step": 436 + }, + { + "epoch": 0.03, + "grad_norm": 2.603526470949391, + "learning_rate": 9.999510724120261e-06, + "loss": 0.5677, + "step": 437 + }, + { + "epoch": 0.03, + "grad_norm": 1.9361182104948327, + "learning_rate": 9.999492770829555e-06, + "loss": 0.5616, + "step": 438 + }, + { + "epoch": 0.03, + "grad_norm": 1.815414035525676, + "learning_rate": 9.999474494078208e-06, + "loss": 0.6006, + "step": 439 + }, + { + "epoch": 0.03, + "grad_norm": 2.0405395842239304, + "learning_rate": 9.9994558938674e-06, + "loss": 0.5745, + "step": 440 + }, + { + "epoch": 0.03, + "grad_norm": 1.706757137122106, + "learning_rate": 9.999436970198336e-06, + "loss": 0.5764, + "step": 441 + }, + { + "epoch": 0.03, + "grad_norm": 1.8583675570380447, + "learning_rate": 9.99941772307224e-06, + "loss": 0.5528, + "step": 442 + }, + { + "epoch": 0.03, + "grad_norm": 1.8480658688812206, + "learning_rate": 9.999398152490358e-06, + "loss": 0.5683, + "step": 443 + }, + { + "epoch": 0.03, + "grad_norm": 5.997398455386773, + "learning_rate": 9.999378258453955e-06, + "loss": 0.5687, + "step": 444 + }, + { + "epoch": 0.03, + "grad_norm": 3.9253154859269572, + "learning_rate": 9.999358040964317e-06, + "loss": 0.5228, + "step": 445 + }, + { + "epoch": 0.04, + "grad_norm": 1.6508927814790604, + "learning_rate": 9.999337500022757e-06, + "loss": 0.5457, + "step": 446 + }, + { + "epoch": 0.04, + "grad_norm": 2.1139981650045523, + "learning_rate": 9.9993166356306e-06, + "loss": 0.6168, + "step": 447 + }, + { + "epoch": 0.04, + "grad_norm": 2.245122019332452, + "learning_rate": 9.999295447789194e-06, + "loss": 0.6761, + "step": 448 + }, + { + "epoch": 0.04, + "grad_norm": 1.5726851446782861, + "learning_rate": 9.999273936499915e-06, + "loss": 0.5886, + "step": 449 + }, + { + "epoch": 0.04, + "grad_norm": 2.456112672830984, + "learning_rate": 9.999252101764152e-06, + "loss": 0.6735, + "step": 450 + }, + { + "epoch": 0.04, + "grad_norm": 3.080850576872273, + "learning_rate": 9.999229943583318e-06, + "loss": 0.5938, + "step": 451 + }, + { + "epoch": 0.04, + "grad_norm": 2.5544416740106515, + "learning_rate": 9.999207461958845e-06, + "loss": 0.6004, + "step": 452 + }, + { + "epoch": 0.04, + "grad_norm": 0.8551083229636015, + "learning_rate": 9.999184656892191e-06, + "loss": 0.6267, + "step": 453 + }, + { + "epoch": 0.04, + "grad_norm": 1.7219032684934263, + "learning_rate": 9.999161528384828e-06, + "loss": 0.5357, + "step": 454 + }, + { + "epoch": 0.04, + "grad_norm": 2.2101149334707166, + "learning_rate": 9.999138076438253e-06, + "loss": 0.5981, + "step": 455 + }, + { + "epoch": 0.04, + "grad_norm": 1.7591576753116576, + "learning_rate": 9.999114301053985e-06, + "loss": 0.5965, + "step": 456 + }, + { + "epoch": 0.04, + "grad_norm": 2.3301228266112384, + "learning_rate": 9.999090202233563e-06, + "loss": 0.583, + "step": 457 + }, + { + "epoch": 0.04, + "grad_norm": 1.558391230528593, + "learning_rate": 9.999065779978543e-06, + "loss": 0.4999, + "step": 458 + }, + { + "epoch": 0.04, + "grad_norm": 1.778056637994504, + "learning_rate": 9.999041034290507e-06, + "loss": 0.6149, + "step": 459 + }, + { + "epoch": 0.04, + "grad_norm": 2.2902358325736727, + "learning_rate": 9.999015965171055e-06, + "loss": 0.5639, + "step": 460 + }, + { + "epoch": 0.04, + "grad_norm": 1.9289983315039514, + "learning_rate": 9.998990572621809e-06, + "loss": 0.5459, + "step": 461 + }, + { + "epoch": 0.04, + "grad_norm": 1.570344736020921, + "learning_rate": 9.998964856644415e-06, + "loss": 0.5418, + "step": 462 + }, + { + "epoch": 0.04, + "grad_norm": 0.9601195646587074, + "learning_rate": 9.998938817240533e-06, + "loss": 0.6423, + "step": 463 + }, + { + "epoch": 0.04, + "grad_norm": 1.6633424508024957, + "learning_rate": 9.99891245441185e-06, + "loss": 0.499, + "step": 464 + }, + { + "epoch": 0.04, + "grad_norm": 2.8421338389685755, + "learning_rate": 9.998885768160067e-06, + "loss": 0.5657, + "step": 465 + }, + { + "epoch": 0.04, + "grad_norm": 2.321384680394112, + "learning_rate": 9.998858758486918e-06, + "loss": 0.517, + "step": 466 + }, + { + "epoch": 0.04, + "grad_norm": 1.612696332012091, + "learning_rate": 9.998831425394144e-06, + "loss": 0.5491, + "step": 467 + }, + { + "epoch": 0.04, + "grad_norm": 2.5909574982404084, + "learning_rate": 9.998803768883519e-06, + "loss": 0.5557, + "step": 468 + }, + { + "epoch": 0.04, + "grad_norm": 1.8864328103442396, + "learning_rate": 9.998775788956828e-06, + "loss": 0.5444, + "step": 469 + }, + { + "epoch": 0.04, + "grad_norm": 0.7971065471224329, + "learning_rate": 9.998747485615881e-06, + "loss": 0.6487, + "step": 470 + }, + { + "epoch": 0.04, + "grad_norm": 2.6202011135411327, + "learning_rate": 9.998718858862512e-06, + "loss": 0.5903, + "step": 471 + }, + { + "epoch": 0.04, + "grad_norm": 1.8928062540213153, + "learning_rate": 9.998689908698572e-06, + "loss": 0.5279, + "step": 472 + }, + { + "epoch": 0.04, + "grad_norm": 1.6895509093866368, + "learning_rate": 9.998660635125934e-06, + "loss": 0.6332, + "step": 473 + }, + { + "epoch": 0.04, + "grad_norm": 2.461861206664755, + "learning_rate": 9.998631038146492e-06, + "loss": 0.5533, + "step": 474 + }, + { + "epoch": 0.04, + "grad_norm": 1.983845693162468, + "learning_rate": 9.998601117762161e-06, + "loss": 0.5756, + "step": 475 + }, + { + "epoch": 0.04, + "grad_norm": 1.9070793571873468, + "learning_rate": 9.998570873974877e-06, + "loss": 0.513, + "step": 476 + }, + { + "epoch": 0.04, + "grad_norm": 7.209137369994001, + "learning_rate": 9.998540306786596e-06, + "loss": 0.4909, + "step": 477 + }, + { + "epoch": 0.04, + "grad_norm": 0.7112339911445603, + "learning_rate": 9.998509416199295e-06, + "loss": 0.6164, + "step": 478 + }, + { + "epoch": 0.04, + "grad_norm": 2.782360710129199, + "learning_rate": 9.998478202214977e-06, + "loss": 0.4776, + "step": 479 + }, + { + "epoch": 0.04, + "grad_norm": 0.6440856649670651, + "learning_rate": 9.998446664835655e-06, + "loss": 0.6212, + "step": 480 + }, + { + "epoch": 0.04, + "grad_norm": 3.118211648327851, + "learning_rate": 9.998414804063375e-06, + "loss": 0.5997, + "step": 481 + }, + { + "epoch": 0.04, + "grad_norm": 1.6327547481832834, + "learning_rate": 9.998382619900193e-06, + "loss": 0.5396, + "step": 482 + }, + { + "epoch": 0.04, + "grad_norm": 1.7440362417683903, + "learning_rate": 9.998350112348196e-06, + "loss": 0.5936, + "step": 483 + }, + { + "epoch": 0.04, + "grad_norm": 1.7363223395339917, + "learning_rate": 9.998317281409484e-06, + "loss": 0.5861, + "step": 484 + }, + { + "epoch": 0.04, + "grad_norm": 1.9837355583495033, + "learning_rate": 9.998284127086184e-06, + "loss": 0.5785, + "step": 485 + }, + { + "epoch": 0.04, + "grad_norm": 0.7799500037688217, + "learning_rate": 9.998250649380439e-06, + "loss": 0.6473, + "step": 486 + }, + { + "epoch": 0.04, + "grad_norm": 1.9169313668320596, + "learning_rate": 9.998216848294415e-06, + "loss": 0.6517, + "step": 487 + }, + { + "epoch": 0.04, + "grad_norm": 1.8009002370620928, + "learning_rate": 9.9981827238303e-06, + "loss": 0.5299, + "step": 488 + }, + { + "epoch": 0.04, + "grad_norm": 1.7200891797745994, + "learning_rate": 9.998148275990303e-06, + "loss": 0.4847, + "step": 489 + }, + { + "epoch": 0.04, + "grad_norm": 2.3169202787398895, + "learning_rate": 9.99811350477665e-06, + "loss": 0.5774, + "step": 490 + }, + { + "epoch": 0.04, + "grad_norm": 1.8033974469246268, + "learning_rate": 9.99807841019159e-06, + "loss": 0.5664, + "step": 491 + }, + { + "epoch": 0.04, + "grad_norm": 1.882752067879728, + "learning_rate": 9.998042992237396e-06, + "loss": 0.5511, + "step": 492 + }, + { + "epoch": 0.04, + "grad_norm": 1.6013244167514782, + "learning_rate": 9.998007250916357e-06, + "loss": 0.5369, + "step": 493 + }, + { + "epoch": 0.04, + "grad_norm": 0.7481318379539572, + "learning_rate": 9.99797118623079e-06, + "loss": 0.6292, + "step": 494 + }, + { + "epoch": 0.04, + "grad_norm": 0.6969545092110656, + "learning_rate": 9.997934798183025e-06, + "loss": 0.6444, + "step": 495 + }, + { + "epoch": 0.04, + "grad_norm": 1.5947288176684786, + "learning_rate": 9.997898086775414e-06, + "loss": 0.5805, + "step": 496 + }, + { + "epoch": 0.04, + "grad_norm": 1.8251212638886976, + "learning_rate": 9.997861052010338e-06, + "loss": 0.5603, + "step": 497 + }, + { + "epoch": 0.04, + "grad_norm": 1.9720668309181737, + "learning_rate": 9.997823693890187e-06, + "loss": 0.513, + "step": 498 + }, + { + "epoch": 0.04, + "grad_norm": 2.5832798899210685, + "learning_rate": 9.997786012417384e-06, + "loss": 0.5385, + "step": 499 + }, + { + "epoch": 0.04, + "grad_norm": 2.4751794966006897, + "learning_rate": 9.997748007594362e-06, + "loss": 0.5409, + "step": 500 + }, + { + "epoch": 0.04, + "grad_norm": 1.8079973951163184, + "learning_rate": 9.997709679423581e-06, + "loss": 0.5685, + "step": 501 + }, + { + "epoch": 0.04, + "grad_norm": 1.9278705521432908, + "learning_rate": 9.997671027907525e-06, + "loss": 0.5606, + "step": 502 + }, + { + "epoch": 0.04, + "grad_norm": 1.750442916986704, + "learning_rate": 9.997632053048689e-06, + "loss": 0.587, + "step": 503 + }, + { + "epoch": 0.04, + "grad_norm": 1.9769647270035005, + "learning_rate": 9.997592754849596e-06, + "loss": 0.5079, + "step": 504 + }, + { + "epoch": 0.04, + "grad_norm": 1.5269182077926, + "learning_rate": 9.997553133312791e-06, + "loss": 0.4898, + "step": 505 + }, + { + "epoch": 0.04, + "grad_norm": 1.7854812898086454, + "learning_rate": 9.997513188440835e-06, + "loss": 0.5567, + "step": 506 + }, + { + "epoch": 0.04, + "grad_norm": 1.9364317471238655, + "learning_rate": 9.997472920236313e-06, + "loss": 0.5901, + "step": 507 + }, + { + "epoch": 0.04, + "grad_norm": 1.8234122103097035, + "learning_rate": 9.997432328701833e-06, + "loss": 0.5775, + "step": 508 + }, + { + "epoch": 0.04, + "grad_norm": 2.3331251800788033, + "learning_rate": 9.997391413840016e-06, + "loss": 0.5736, + "step": 509 + }, + { + "epoch": 0.04, + "grad_norm": 1.6943457714915588, + "learning_rate": 9.997350175653513e-06, + "loss": 0.6057, + "step": 510 + }, + { + "epoch": 0.04, + "grad_norm": 1.0169004390224894, + "learning_rate": 9.997308614144991e-06, + "loss": 0.6382, + "step": 511 + }, + { + "epoch": 0.04, + "grad_norm": 1.906197079848998, + "learning_rate": 9.997266729317138e-06, + "loss": 0.5265, + "step": 512 + }, + { + "epoch": 0.04, + "grad_norm": 1.9935889874911294, + "learning_rate": 9.997224521172668e-06, + "loss": 0.5756, + "step": 513 + }, + { + "epoch": 0.04, + "grad_norm": 0.6039594678295857, + "learning_rate": 9.997181989714305e-06, + "loss": 0.6301, + "step": 514 + }, + { + "epoch": 0.04, + "grad_norm": 2.0744146872246585, + "learning_rate": 9.997139134944806e-06, + "loss": 0.6191, + "step": 515 + }, + { + "epoch": 0.04, + "grad_norm": 1.9948630254105333, + "learning_rate": 9.997095956866943e-06, + "loss": 0.5708, + "step": 516 + }, + { + "epoch": 0.04, + "grad_norm": 5.684738977016506, + "learning_rate": 9.997052455483507e-06, + "loss": 0.621, + "step": 517 + }, + { + "epoch": 0.04, + "grad_norm": 2.1592779535081883, + "learning_rate": 9.997008630797314e-06, + "loss": 0.5327, + "step": 518 + }, + { + "epoch": 0.04, + "grad_norm": 2.5927090466906564, + "learning_rate": 9.9969644828112e-06, + "loss": 0.6172, + "step": 519 + }, + { + "epoch": 0.04, + "grad_norm": 2.006732288470622, + "learning_rate": 9.996920011528022e-06, + "loss": 0.599, + "step": 520 + }, + { + "epoch": 0.04, + "grad_norm": 2.3293150369569178, + "learning_rate": 9.996875216950655e-06, + "loss": 0.5903, + "step": 521 + }, + { + "epoch": 0.04, + "grad_norm": 1.8781733322577094, + "learning_rate": 9.996830099081998e-06, + "loss": 0.5702, + "step": 522 + }, + { + "epoch": 0.04, + "grad_norm": 2.0783582531549163, + "learning_rate": 9.99678465792497e-06, + "loss": 0.535, + "step": 523 + }, + { + "epoch": 0.04, + "grad_norm": 1.7179385542204724, + "learning_rate": 9.996738893482512e-06, + "loss": 0.5615, + "step": 524 + }, + { + "epoch": 0.04, + "grad_norm": 1.881319869854783, + "learning_rate": 9.996692805757584e-06, + "loss": 0.5954, + "step": 525 + }, + { + "epoch": 0.04, + "grad_norm": 1.8457513024622156, + "learning_rate": 9.996646394753167e-06, + "loss": 0.5948, + "step": 526 + }, + { + "epoch": 0.04, + "grad_norm": 1.785525702792255, + "learning_rate": 9.996599660472266e-06, + "loss": 0.6063, + "step": 527 + }, + { + "epoch": 0.04, + "grad_norm": 3.969284203328923, + "learning_rate": 9.996552602917902e-06, + "loss": 0.5627, + "step": 528 + }, + { + "epoch": 0.04, + "grad_norm": 1.8793133100589599, + "learning_rate": 9.996505222093123e-06, + "loss": 0.5171, + "step": 529 + }, + { + "epoch": 0.04, + "grad_norm": 2.049709958133192, + "learning_rate": 9.99645751800099e-06, + "loss": 0.5886, + "step": 530 + }, + { + "epoch": 0.04, + "grad_norm": 1.7297266235629103, + "learning_rate": 9.996409490644593e-06, + "loss": 0.5558, + "step": 531 + }, + { + "epoch": 0.04, + "grad_norm": 2.060280551842997, + "learning_rate": 9.996361140027038e-06, + "loss": 0.5915, + "step": 532 + }, + { + "epoch": 0.04, + "grad_norm": 1.6787659365945304, + "learning_rate": 9.996312466151452e-06, + "loss": 0.5484, + "step": 533 + }, + { + "epoch": 0.04, + "grad_norm": 1.7646445520989786, + "learning_rate": 9.996263469020988e-06, + "loss": 0.5747, + "step": 534 + }, + { + "epoch": 0.04, + "grad_norm": 1.4909817124771623, + "learning_rate": 9.996214148638811e-06, + "loss": 0.5836, + "step": 535 + }, + { + "epoch": 0.04, + "grad_norm": 3.4306265798346596, + "learning_rate": 9.996164505008117e-06, + "loss": 0.5647, + "step": 536 + }, + { + "epoch": 0.04, + "grad_norm": 2.1263405062083787, + "learning_rate": 9.996114538132114e-06, + "loss": 0.5668, + "step": 537 + }, + { + "epoch": 0.04, + "grad_norm": 2.0345490407764717, + "learning_rate": 9.996064248014036e-06, + "loss": 0.6117, + "step": 538 + }, + { + "epoch": 0.04, + "grad_norm": 2.0367369087259948, + "learning_rate": 9.996013634657136e-06, + "loss": 0.5639, + "step": 539 + }, + { + "epoch": 0.04, + "grad_norm": 1.5990843421878174, + "learning_rate": 9.995962698064692e-06, + "loss": 0.5131, + "step": 540 + }, + { + "epoch": 0.04, + "grad_norm": 1.3732024214638143, + "learning_rate": 9.995911438239995e-06, + "loss": 0.6699, + "step": 541 + }, + { + "epoch": 0.04, + "grad_norm": 1.5947259957861935, + "learning_rate": 9.995859855186363e-06, + "loss": 0.4764, + "step": 542 + }, + { + "epoch": 0.04, + "grad_norm": 2.642717213862172, + "learning_rate": 9.995807948907134e-06, + "loss": 0.4893, + "step": 543 + }, + { + "epoch": 0.04, + "grad_norm": 1.6425898448409968, + "learning_rate": 9.995755719405667e-06, + "loss": 0.5773, + "step": 544 + }, + { + "epoch": 0.04, + "grad_norm": 2.032220639511612, + "learning_rate": 9.99570316668534e-06, + "loss": 0.5967, + "step": 545 + }, + { + "epoch": 0.04, + "grad_norm": 2.4773074771051986, + "learning_rate": 9.995650290749553e-06, + "loss": 0.6133, + "step": 546 + }, + { + "epoch": 0.04, + "grad_norm": 1.6605611505082294, + "learning_rate": 9.995597091601727e-06, + "loss": 0.5106, + "step": 547 + }, + { + "epoch": 0.04, + "grad_norm": 1.6518753356913105, + "learning_rate": 9.995543569245304e-06, + "loss": 0.637, + "step": 548 + }, + { + "epoch": 0.04, + "grad_norm": 1.4718792844526425, + "learning_rate": 9.995489723683748e-06, + "loss": 0.595, + "step": 549 + }, + { + "epoch": 0.04, + "grad_norm": 0.8623012018187929, + "learning_rate": 9.995435554920544e-06, + "loss": 0.6184, + "step": 550 + }, + { + "epoch": 0.04, + "grad_norm": 1.968637488689114, + "learning_rate": 9.99538106295919e-06, + "loss": 0.625, + "step": 551 + }, + { + "epoch": 0.04, + "grad_norm": 1.950245196131321, + "learning_rate": 9.99532624780322e-06, + "loss": 0.5535, + "step": 552 + }, + { + "epoch": 0.04, + "grad_norm": 1.9149983646849986, + "learning_rate": 9.995271109456172e-06, + "loss": 0.5666, + "step": 553 + }, + { + "epoch": 0.04, + "grad_norm": 2.274088694453281, + "learning_rate": 9.995215647921622e-06, + "loss": 0.5415, + "step": 554 + }, + { + "epoch": 0.04, + "grad_norm": 1.73987440564278, + "learning_rate": 9.995159863203152e-06, + "loss": 0.5245, + "step": 555 + }, + { + "epoch": 0.04, + "grad_norm": 0.6573855149718795, + "learning_rate": 9.995103755304373e-06, + "loss": 0.6269, + "step": 556 + }, + { + "epoch": 0.04, + "grad_norm": 1.9237948443021977, + "learning_rate": 9.995047324228917e-06, + "loss": 0.5921, + "step": 557 + }, + { + "epoch": 0.04, + "grad_norm": 1.7347863889733812, + "learning_rate": 9.994990569980433e-06, + "loss": 0.4868, + "step": 558 + }, + { + "epoch": 0.04, + "grad_norm": 1.7652694773820714, + "learning_rate": 9.994933492562593e-06, + "loss": 0.526, + "step": 559 + }, + { + "epoch": 0.04, + "grad_norm": 0.7276782294284835, + "learning_rate": 9.994876091979092e-06, + "loss": 0.6039, + "step": 560 + }, + { + "epoch": 0.04, + "grad_norm": 3.5026831704780053, + "learning_rate": 9.994818368233639e-06, + "loss": 0.5584, + "step": 561 + }, + { + "epoch": 0.04, + "grad_norm": 1.6758481212343759, + "learning_rate": 9.994760321329972e-06, + "loss": 0.582, + "step": 562 + }, + { + "epoch": 0.04, + "grad_norm": 2.851377994974264, + "learning_rate": 9.994701951271848e-06, + "loss": 0.613, + "step": 563 + }, + { + "epoch": 0.04, + "grad_norm": 1.5502575172031208, + "learning_rate": 9.99464325806304e-06, + "loss": 0.5582, + "step": 564 + }, + { + "epoch": 0.04, + "grad_norm": 2.142598862605693, + "learning_rate": 9.994584241707349e-06, + "loss": 0.5432, + "step": 565 + }, + { + "epoch": 0.04, + "grad_norm": 2.1344114937122054, + "learning_rate": 9.994524902208588e-06, + "loss": 0.5304, + "step": 566 + }, + { + "epoch": 0.04, + "grad_norm": 1.5010984184079867, + "learning_rate": 9.994465239570602e-06, + "loss": 0.554, + "step": 567 + }, + { + "epoch": 0.04, + "grad_norm": 2.824975232800367, + "learning_rate": 9.994405253797248e-06, + "loss": 0.5822, + "step": 568 + }, + { + "epoch": 0.04, + "grad_norm": 3.888750146785224, + "learning_rate": 9.994344944892409e-06, + "loss": 0.5507, + "step": 569 + }, + { + "epoch": 0.04, + "grad_norm": 1.982323767876106, + "learning_rate": 9.994284312859982e-06, + "loss": 0.5559, + "step": 570 + }, + { + "epoch": 0.04, + "grad_norm": 1.9975489872465646, + "learning_rate": 9.994223357703897e-06, + "loss": 0.6041, + "step": 571 + }, + { + "epoch": 0.04, + "grad_norm": 1.5830085624445525, + "learning_rate": 9.994162079428093e-06, + "loss": 0.5848, + "step": 572 + }, + { + "epoch": 0.05, + "grad_norm": 1.7045779016520832, + "learning_rate": 9.994100478036535e-06, + "loss": 0.6087, + "step": 573 + }, + { + "epoch": 0.05, + "grad_norm": 0.8054863232173629, + "learning_rate": 9.994038553533209e-06, + "loss": 0.627, + "step": 574 + }, + { + "epoch": 0.05, + "grad_norm": 1.7823204476550272, + "learning_rate": 9.993976305922121e-06, + "loss": 0.6026, + "step": 575 + }, + { + "epoch": 0.05, + "grad_norm": 1.9304874880786358, + "learning_rate": 9.993913735207302e-06, + "loss": 0.5707, + "step": 576 + }, + { + "epoch": 0.05, + "grad_norm": 0.6657479767116596, + "learning_rate": 9.993850841392793e-06, + "loss": 0.6016, + "step": 577 + }, + { + "epoch": 0.05, + "grad_norm": 0.7438489220590868, + "learning_rate": 9.99378762448267e-06, + "loss": 0.6276, + "step": 578 + }, + { + "epoch": 0.05, + "grad_norm": 1.7827656143735742, + "learning_rate": 9.993724084481021e-06, + "loss": 0.4939, + "step": 579 + }, + { + "epoch": 0.05, + "grad_norm": 1.6711051316659882, + "learning_rate": 9.993660221391954e-06, + "loss": 0.5347, + "step": 580 + }, + { + "epoch": 0.05, + "grad_norm": 2.5199185534698825, + "learning_rate": 9.993596035219606e-06, + "loss": 0.5315, + "step": 581 + }, + { + "epoch": 0.05, + "grad_norm": 0.7312584987888469, + "learning_rate": 9.993531525968126e-06, + "loss": 0.6152, + "step": 582 + }, + { + "epoch": 0.05, + "grad_norm": 1.5469038874697256, + "learning_rate": 9.993466693641686e-06, + "loss": 0.5269, + "step": 583 + }, + { + "epoch": 0.05, + "grad_norm": 2.03759891966535, + "learning_rate": 9.993401538244485e-06, + "loss": 0.4825, + "step": 584 + }, + { + "epoch": 0.05, + "grad_norm": 1.9040702805980985, + "learning_rate": 9.993336059780738e-06, + "loss": 0.5958, + "step": 585 + }, + { + "epoch": 0.05, + "grad_norm": 10.318347704451378, + "learning_rate": 9.99327025825468e-06, + "loss": 0.5795, + "step": 586 + }, + { + "epoch": 0.05, + "grad_norm": 0.6898040743323544, + "learning_rate": 9.993204133670566e-06, + "loss": 0.6006, + "step": 587 + }, + { + "epoch": 0.05, + "grad_norm": 1.615720637024177, + "learning_rate": 9.993137686032677e-06, + "loss": 0.5788, + "step": 588 + }, + { + "epoch": 0.05, + "grad_norm": 0.6646898782066699, + "learning_rate": 9.993070915345313e-06, + "loss": 0.6296, + "step": 589 + }, + { + "epoch": 0.05, + "grad_norm": 1.7478658287285007, + "learning_rate": 9.993003821612793e-06, + "loss": 0.5327, + "step": 590 + }, + { + "epoch": 0.05, + "grad_norm": 1.5733117889891142, + "learning_rate": 9.992936404839455e-06, + "loss": 0.5835, + "step": 591 + }, + { + "epoch": 0.05, + "grad_norm": 1.978359391145377, + "learning_rate": 9.992868665029665e-06, + "loss": 0.5571, + "step": 592 + }, + { + "epoch": 0.05, + "grad_norm": 3.0671812455877396, + "learning_rate": 9.992800602187801e-06, + "loss": 0.5279, + "step": 593 + }, + { + "epoch": 0.05, + "grad_norm": 1.6376525722911022, + "learning_rate": 9.992732216318274e-06, + "loss": 0.5348, + "step": 594 + }, + { + "epoch": 0.05, + "grad_norm": 0.654361914860124, + "learning_rate": 9.992663507425501e-06, + "loss": 0.6317, + "step": 595 + }, + { + "epoch": 0.05, + "grad_norm": 1.6811524265556017, + "learning_rate": 9.99259447551393e-06, + "loss": 0.5359, + "step": 596 + }, + { + "epoch": 0.05, + "grad_norm": 2.6561149190153057, + "learning_rate": 9.99252512058803e-06, + "loss": 0.5287, + "step": 597 + }, + { + "epoch": 0.05, + "grad_norm": 0.6110961145389977, + "learning_rate": 9.992455442652283e-06, + "loss": 0.6205, + "step": 598 + }, + { + "epoch": 0.05, + "grad_norm": 1.5951273628797897, + "learning_rate": 9.9923854417112e-06, + "loss": 0.5888, + "step": 599 + }, + { + "epoch": 0.05, + "grad_norm": 1.6104740172196699, + "learning_rate": 9.992315117769311e-06, + "loss": 0.5414, + "step": 600 + }, + { + "epoch": 0.05, + "grad_norm": 1.3726273660888617, + "learning_rate": 9.992244470831164e-06, + "loss": 0.5423, + "step": 601 + }, + { + "epoch": 0.05, + "grad_norm": 1.580580612329893, + "learning_rate": 9.992173500901333e-06, + "loss": 0.5519, + "step": 602 + }, + { + "epoch": 0.05, + "grad_norm": 1.5206276318798455, + "learning_rate": 9.992102207984404e-06, + "loss": 0.5819, + "step": 603 + }, + { + "epoch": 0.05, + "grad_norm": 0.7683649186744994, + "learning_rate": 9.992030592084994e-06, + "loss": 0.6138, + "step": 604 + }, + { + "epoch": 0.05, + "grad_norm": 2.1938631226806624, + "learning_rate": 9.991958653207733e-06, + "loss": 0.5584, + "step": 605 + }, + { + "epoch": 0.05, + "grad_norm": 1.5323436888772248, + "learning_rate": 9.99188639135728e-06, + "loss": 0.5663, + "step": 606 + }, + { + "epoch": 0.05, + "grad_norm": 1.788553160569277, + "learning_rate": 9.991813806538308e-06, + "loss": 0.5529, + "step": 607 + }, + { + "epoch": 0.05, + "grad_norm": 1.6655266890921836, + "learning_rate": 9.99174089875551e-06, + "loss": 0.5899, + "step": 608 + }, + { + "epoch": 0.05, + "grad_norm": 1.6514508990007981, + "learning_rate": 9.991667668013609e-06, + "loss": 0.5053, + "step": 609 + }, + { + "epoch": 0.05, + "grad_norm": 1.6414595295986534, + "learning_rate": 9.991594114317338e-06, + "loss": 0.5956, + "step": 610 + }, + { + "epoch": 0.05, + "grad_norm": 1.9054910132221903, + "learning_rate": 9.991520237671457e-06, + "loss": 0.5416, + "step": 611 + }, + { + "epoch": 0.05, + "grad_norm": 0.731454544089483, + "learning_rate": 9.991446038080748e-06, + "loss": 0.6269, + "step": 612 + }, + { + "epoch": 0.05, + "grad_norm": 1.6591049700940341, + "learning_rate": 9.99137151555001e-06, + "loss": 0.5599, + "step": 613 + }, + { + "epoch": 0.05, + "grad_norm": 1.6925409216542935, + "learning_rate": 9.991296670084062e-06, + "loss": 0.5663, + "step": 614 + }, + { + "epoch": 0.05, + "grad_norm": 1.7397869738967933, + "learning_rate": 9.991221501687751e-06, + "loss": 0.5605, + "step": 615 + }, + { + "epoch": 0.05, + "grad_norm": 1.485879958708546, + "learning_rate": 9.99114601036594e-06, + "loss": 0.5217, + "step": 616 + }, + { + "epoch": 0.05, + "grad_norm": 1.5878441087476907, + "learning_rate": 9.991070196123507e-06, + "loss": 0.5732, + "step": 617 + }, + { + "epoch": 0.05, + "grad_norm": 2.4155310116666375, + "learning_rate": 9.990994058965363e-06, + "loss": 0.5444, + "step": 618 + }, + { + "epoch": 0.05, + "grad_norm": 1.5860267849373817, + "learning_rate": 9.990917598896435e-06, + "loss": 0.5275, + "step": 619 + }, + { + "epoch": 0.05, + "grad_norm": 1.4901511178261713, + "learning_rate": 9.990840815921665e-06, + "loss": 0.5657, + "step": 620 + }, + { + "epoch": 0.05, + "grad_norm": 1.537497034554049, + "learning_rate": 9.990763710046024e-06, + "loss": 0.5353, + "step": 621 + }, + { + "epoch": 0.05, + "grad_norm": 1.514538939347452, + "learning_rate": 9.990686281274498e-06, + "loss": 0.5114, + "step": 622 + }, + { + "epoch": 0.05, + "grad_norm": 2.0841471116524226, + "learning_rate": 9.9906085296121e-06, + "loss": 0.5508, + "step": 623 + }, + { + "epoch": 0.05, + "grad_norm": 2.8151218457919573, + "learning_rate": 9.990530455063857e-06, + "loss": 0.6077, + "step": 624 + }, + { + "epoch": 0.05, + "grad_norm": 1.616315591965677, + "learning_rate": 9.990452057634823e-06, + "loss": 0.5962, + "step": 625 + }, + { + "epoch": 0.05, + "grad_norm": 2.0880724421909727, + "learning_rate": 9.99037333733007e-06, + "loss": 0.5463, + "step": 626 + }, + { + "epoch": 0.05, + "grad_norm": 1.7263399225229703, + "learning_rate": 9.990294294154688e-06, + "loss": 0.5858, + "step": 627 + }, + { + "epoch": 0.05, + "grad_norm": 1.5850760926265768, + "learning_rate": 9.990214928113795e-06, + "loss": 0.5781, + "step": 628 + }, + { + "epoch": 0.05, + "grad_norm": 2.582352121211005, + "learning_rate": 9.990135239212525e-06, + "loss": 0.5396, + "step": 629 + }, + { + "epoch": 0.05, + "grad_norm": 1.6401119170026337, + "learning_rate": 9.990055227456032e-06, + "loss": 0.6795, + "step": 630 + }, + { + "epoch": 0.05, + "grad_norm": 1.6893689436990103, + "learning_rate": 9.989974892849493e-06, + "loss": 0.5403, + "step": 631 + }, + { + "epoch": 0.05, + "grad_norm": 1.639760767908914, + "learning_rate": 9.989894235398106e-06, + "loss": 0.5752, + "step": 632 + }, + { + "epoch": 0.05, + "grad_norm": 1.6783589780746055, + "learning_rate": 9.98981325510709e-06, + "loss": 0.5871, + "step": 633 + }, + { + "epoch": 0.05, + "grad_norm": 1.7562068784219536, + "learning_rate": 9.989731951981685e-06, + "loss": 0.6009, + "step": 634 + }, + { + "epoch": 0.05, + "grad_norm": 1.7364692267775057, + "learning_rate": 9.989650326027149e-06, + "loss": 0.537, + "step": 635 + }, + { + "epoch": 0.05, + "grad_norm": 1.3960999324602983, + "learning_rate": 9.989568377248763e-06, + "loss": 0.5847, + "step": 636 + }, + { + "epoch": 0.05, + "grad_norm": 1.6914464114149388, + "learning_rate": 9.989486105651834e-06, + "loss": 0.5498, + "step": 637 + }, + { + "epoch": 0.05, + "grad_norm": 1.8140557813433928, + "learning_rate": 9.989403511241678e-06, + "loss": 0.5872, + "step": 638 + }, + { + "epoch": 0.05, + "grad_norm": 0.7729738346576522, + "learning_rate": 9.989320594023641e-06, + "loss": 0.6296, + "step": 639 + }, + { + "epoch": 0.05, + "grad_norm": 1.5894564205044843, + "learning_rate": 9.98923735400309e-06, + "loss": 0.5373, + "step": 640 + }, + { + "epoch": 0.05, + "grad_norm": 1.749295350456021, + "learning_rate": 9.98915379118541e-06, + "loss": 0.564, + "step": 641 + }, + { + "epoch": 0.05, + "grad_norm": 2.0152103607782212, + "learning_rate": 9.989069905576005e-06, + "loss": 0.5217, + "step": 642 + }, + { + "epoch": 0.05, + "grad_norm": 0.7465331989766465, + "learning_rate": 9.988985697180302e-06, + "loss": 0.61, + "step": 643 + }, + { + "epoch": 0.05, + "grad_norm": 2.0524545411264117, + "learning_rate": 9.988901166003754e-06, + "loss": 0.6092, + "step": 644 + }, + { + "epoch": 0.05, + "grad_norm": 0.6936581437838388, + "learning_rate": 9.988816312051826e-06, + "loss": 0.6388, + "step": 645 + }, + { + "epoch": 0.05, + "grad_norm": 1.981466425626048, + "learning_rate": 9.988731135330008e-06, + "loss": 0.5948, + "step": 646 + }, + { + "epoch": 0.05, + "grad_norm": 1.4908283554319752, + "learning_rate": 9.988645635843811e-06, + "loss": 0.4917, + "step": 647 + }, + { + "epoch": 0.05, + "grad_norm": 1.8155960162954827, + "learning_rate": 9.988559813598769e-06, + "loss": 0.5442, + "step": 648 + }, + { + "epoch": 0.05, + "grad_norm": 1.6106436421850783, + "learning_rate": 9.988473668600431e-06, + "loss": 0.5451, + "step": 649 + }, + { + "epoch": 0.05, + "grad_norm": 2.149973094569598, + "learning_rate": 9.988387200854373e-06, + "loss": 0.5675, + "step": 650 + }, + { + "epoch": 0.05, + "grad_norm": 1.6671964338293705, + "learning_rate": 9.98830041036619e-06, + "loss": 0.544, + "step": 651 + }, + { + "epoch": 0.05, + "grad_norm": 1.746848370094641, + "learning_rate": 9.988213297141495e-06, + "loss": 0.5601, + "step": 652 + }, + { + "epoch": 0.05, + "grad_norm": 1.7662086274810078, + "learning_rate": 9.988125861185924e-06, + "loss": 0.4977, + "step": 653 + }, + { + "epoch": 0.05, + "grad_norm": 1.543500942379991, + "learning_rate": 9.988038102505138e-06, + "loss": 0.5043, + "step": 654 + }, + { + "epoch": 0.05, + "grad_norm": 2.64246197690511, + "learning_rate": 9.987950021104808e-06, + "loss": 0.5572, + "step": 655 + }, + { + "epoch": 0.05, + "grad_norm": 1.9491055885273412, + "learning_rate": 9.98786161699064e-06, + "loss": 0.5596, + "step": 656 + }, + { + "epoch": 0.05, + "grad_norm": 1.7995570194812731, + "learning_rate": 9.98777289016835e-06, + "loss": 0.5701, + "step": 657 + }, + { + "epoch": 0.05, + "grad_norm": 2.9162968582522972, + "learning_rate": 9.987683840643679e-06, + "loss": 0.5333, + "step": 658 + }, + { + "epoch": 0.05, + "grad_norm": 1.4986684799096002, + "learning_rate": 9.987594468422385e-06, + "loss": 0.5326, + "step": 659 + }, + { + "epoch": 0.05, + "grad_norm": 1.7281353638113948, + "learning_rate": 9.987504773510257e-06, + "loss": 0.6053, + "step": 660 + }, + { + "epoch": 0.05, + "grad_norm": 1.6344904333642738, + "learning_rate": 9.987414755913094e-06, + "loss": 0.4935, + "step": 661 + }, + { + "epoch": 0.05, + "grad_norm": 1.933292452356171, + "learning_rate": 9.987324415636718e-06, + "loss": 0.5729, + "step": 662 + }, + { + "epoch": 0.05, + "grad_norm": 0.7914235554236044, + "learning_rate": 9.98723375268698e-06, + "loss": 0.6318, + "step": 663 + }, + { + "epoch": 0.05, + "grad_norm": 2.368714590547771, + "learning_rate": 9.98714276706974e-06, + "loss": 0.5287, + "step": 664 + }, + { + "epoch": 0.05, + "grad_norm": 1.9735632981986477, + "learning_rate": 9.987051458790889e-06, + "loss": 0.5005, + "step": 665 + }, + { + "epoch": 0.05, + "grad_norm": 1.5160112049572383, + "learning_rate": 9.98695982785633e-06, + "loss": 0.6272, + "step": 666 + }, + { + "epoch": 0.05, + "grad_norm": 1.579649141714724, + "learning_rate": 9.986867874271996e-06, + "loss": 0.5465, + "step": 667 + }, + { + "epoch": 0.05, + "grad_norm": 2.1989897198407973, + "learning_rate": 9.986775598043834e-06, + "loss": 0.5511, + "step": 668 + }, + { + "epoch": 0.05, + "grad_norm": 1.6256322233560254, + "learning_rate": 9.986682999177813e-06, + "loss": 0.5317, + "step": 669 + }, + { + "epoch": 0.05, + "grad_norm": 1.7126356655723156, + "learning_rate": 9.986590077679927e-06, + "loss": 0.5709, + "step": 670 + }, + { + "epoch": 0.05, + "grad_norm": 1.955461437916781, + "learning_rate": 9.986496833556185e-06, + "loss": 0.5155, + "step": 671 + }, + { + "epoch": 0.05, + "grad_norm": 1.4895207567443318, + "learning_rate": 9.98640326681262e-06, + "loss": 0.5894, + "step": 672 + }, + { + "epoch": 0.05, + "grad_norm": 1.7472953365230885, + "learning_rate": 9.98630937745529e-06, + "loss": 0.5876, + "step": 673 + }, + { + "epoch": 0.05, + "grad_norm": 1.6475174166949256, + "learning_rate": 9.986215165490264e-06, + "loss": 0.5266, + "step": 674 + }, + { + "epoch": 0.05, + "grad_norm": 1.7141868521510457, + "learning_rate": 9.98612063092364e-06, + "loss": 0.5519, + "step": 675 + }, + { + "epoch": 0.05, + "grad_norm": 42.853834822125805, + "learning_rate": 9.986025773761533e-06, + "loss": 0.5155, + "step": 676 + }, + { + "epoch": 0.05, + "grad_norm": 2.082537131381896, + "learning_rate": 9.985930594010083e-06, + "loss": 0.5543, + "step": 677 + }, + { + "epoch": 0.05, + "grad_norm": 1.5225198949648842, + "learning_rate": 9.985835091675444e-06, + "loss": 0.5232, + "step": 678 + }, + { + "epoch": 0.05, + "grad_norm": 1.6803135638601516, + "learning_rate": 9.985739266763797e-06, + "loss": 0.5877, + "step": 679 + }, + { + "epoch": 0.05, + "grad_norm": 1.0559300998059715, + "learning_rate": 9.985643119281344e-06, + "loss": 0.6349, + "step": 680 + }, + { + "epoch": 0.05, + "grad_norm": 1.8267682851929372, + "learning_rate": 9.9855466492343e-06, + "loss": 0.5484, + "step": 681 + }, + { + "epoch": 0.05, + "grad_norm": 0.6796824374810231, + "learning_rate": 9.98544985662891e-06, + "loss": 0.6461, + "step": 682 + }, + { + "epoch": 0.05, + "grad_norm": 0.7403251176751362, + "learning_rate": 9.985352741471439e-06, + "loss": 0.6164, + "step": 683 + }, + { + "epoch": 0.05, + "grad_norm": 1.5508084183842126, + "learning_rate": 9.985255303768164e-06, + "loss": 0.5708, + "step": 684 + }, + { + "epoch": 0.05, + "grad_norm": 1.7474671328171805, + "learning_rate": 9.985157543525394e-06, + "loss": 0.6007, + "step": 685 + }, + { + "epoch": 0.05, + "grad_norm": 2.1095791709808855, + "learning_rate": 9.985059460749453e-06, + "loss": 0.5955, + "step": 686 + }, + { + "epoch": 0.05, + "grad_norm": 1.5541538599601818, + "learning_rate": 9.984961055446685e-06, + "loss": 0.6355, + "step": 687 + }, + { + "epoch": 0.05, + "grad_norm": 1.4152587868611695, + "learning_rate": 9.984862327623458e-06, + "loss": 0.5109, + "step": 688 + }, + { + "epoch": 0.05, + "grad_norm": 1.781653951768241, + "learning_rate": 9.984763277286161e-06, + "loss": 0.5307, + "step": 689 + }, + { + "epoch": 0.05, + "grad_norm": 2.1900541088304304, + "learning_rate": 9.984663904441198e-06, + "loss": 0.5883, + "step": 690 + }, + { + "epoch": 0.05, + "grad_norm": 1.4947068369586485, + "learning_rate": 9.984564209095002e-06, + "loss": 0.6574, + "step": 691 + }, + { + "epoch": 0.05, + "grad_norm": 2.0568687830164945, + "learning_rate": 9.984464191254024e-06, + "loss": 0.5079, + "step": 692 + }, + { + "epoch": 0.05, + "grad_norm": 1.8592171098737291, + "learning_rate": 9.984363850924733e-06, + "loss": 0.5149, + "step": 693 + }, + { + "epoch": 0.05, + "grad_norm": 1.4174768356190603, + "learning_rate": 9.98426318811362e-06, + "loss": 0.4992, + "step": 694 + }, + { + "epoch": 0.05, + "grad_norm": 1.3358526555338235, + "learning_rate": 9.984162202827199e-06, + "loss": 0.5565, + "step": 695 + }, + { + "epoch": 0.05, + "grad_norm": 1.571414051814849, + "learning_rate": 9.984060895072003e-06, + "loss": 0.6, + "step": 696 + }, + { + "epoch": 0.05, + "grad_norm": 1.5627449619313605, + "learning_rate": 9.98395926485459e-06, + "loss": 0.5652, + "step": 697 + }, + { + "epoch": 0.05, + "grad_norm": 1.6945946002972168, + "learning_rate": 9.983857312181528e-06, + "loss": 0.5221, + "step": 698 + }, + { + "epoch": 0.05, + "grad_norm": 1.5645184257198779, + "learning_rate": 9.983755037059422e-06, + "loss": 0.5436, + "step": 699 + }, + { + "epoch": 0.05, + "grad_norm": 1.709418681398146, + "learning_rate": 9.983652439494882e-06, + "loss": 0.5555, + "step": 700 + }, + { + "epoch": 0.06, + "grad_norm": 2.0497617900514182, + "learning_rate": 9.983549519494549e-06, + "loss": 0.5679, + "step": 701 + }, + { + "epoch": 0.06, + "grad_norm": 1.6868186412445652, + "learning_rate": 9.983446277065083e-06, + "loss": 0.5734, + "step": 702 + }, + { + "epoch": 0.06, + "grad_norm": 1.6743149331404321, + "learning_rate": 9.98334271221316e-06, + "loss": 0.5609, + "step": 703 + }, + { + "epoch": 0.06, + "grad_norm": 1.7812128529881837, + "learning_rate": 9.983238824945483e-06, + "loss": 0.5723, + "step": 704 + }, + { + "epoch": 0.06, + "grad_norm": 1.6914989472807405, + "learning_rate": 9.983134615268774e-06, + "loss": 0.5276, + "step": 705 + }, + { + "epoch": 0.06, + "grad_norm": 0.9038375159668738, + "learning_rate": 9.983030083189773e-06, + "loss": 0.6145, + "step": 706 + }, + { + "epoch": 0.06, + "grad_norm": 1.6030771139020674, + "learning_rate": 9.982925228715244e-06, + "loss": 0.6108, + "step": 707 + }, + { + "epoch": 0.06, + "grad_norm": 1.6154291239510121, + "learning_rate": 9.982820051851972e-06, + "loss": 0.5479, + "step": 708 + }, + { + "epoch": 0.06, + "grad_norm": 1.658916379870633, + "learning_rate": 9.98271455260676e-06, + "loss": 0.499, + "step": 709 + }, + { + "epoch": 0.06, + "grad_norm": 2.0560607094342087, + "learning_rate": 9.982608730986437e-06, + "loss": 0.5828, + "step": 710 + }, + { + "epoch": 0.06, + "grad_norm": 1.92355124818166, + "learning_rate": 9.982502586997846e-06, + "loss": 0.5297, + "step": 711 + }, + { + "epoch": 0.06, + "grad_norm": 1.7581998270043666, + "learning_rate": 9.982396120647855e-06, + "loss": 0.539, + "step": 712 + }, + { + "epoch": 0.06, + "grad_norm": 0.8064850218074654, + "learning_rate": 9.982289331943353e-06, + "loss": 0.6105, + "step": 713 + }, + { + "epoch": 0.06, + "grad_norm": 1.9950779674889938, + "learning_rate": 9.982182220891247e-06, + "loss": 0.5653, + "step": 714 + }, + { + "epoch": 0.06, + "grad_norm": 1.9160067653399984, + "learning_rate": 9.982074787498472e-06, + "loss": 0.5355, + "step": 715 + }, + { + "epoch": 0.06, + "grad_norm": 1.9059522111894736, + "learning_rate": 9.981967031771974e-06, + "loss": 0.5741, + "step": 716 + }, + { + "epoch": 0.06, + "grad_norm": 1.801979600586274, + "learning_rate": 9.981858953718728e-06, + "loss": 0.5713, + "step": 717 + }, + { + "epoch": 0.06, + "grad_norm": 2.1209981091794274, + "learning_rate": 9.981750553345721e-06, + "loss": 0.515, + "step": 718 + }, + { + "epoch": 0.06, + "grad_norm": 2.0448020072984674, + "learning_rate": 9.981641830659973e-06, + "loss": 0.5355, + "step": 719 + }, + { + "epoch": 0.06, + "grad_norm": 1.7455436440863386, + "learning_rate": 9.981532785668516e-06, + "loss": 0.5064, + "step": 720 + }, + { + "epoch": 0.06, + "grad_norm": 0.6925400323926265, + "learning_rate": 9.981423418378403e-06, + "loss": 0.6155, + "step": 721 + }, + { + "epoch": 0.06, + "grad_norm": 0.701973946700673, + "learning_rate": 9.98131372879671e-06, + "loss": 0.606, + "step": 722 + }, + { + "epoch": 0.06, + "grad_norm": 1.5844382716238123, + "learning_rate": 9.981203716930538e-06, + "loss": 0.5409, + "step": 723 + }, + { + "epoch": 0.06, + "grad_norm": 1.7799743786390323, + "learning_rate": 9.981093382787002e-06, + "loss": 0.5154, + "step": 724 + }, + { + "epoch": 0.06, + "grad_norm": 2.1690764214661664, + "learning_rate": 9.980982726373238e-06, + "loss": 0.6018, + "step": 725 + }, + { + "epoch": 0.06, + "grad_norm": 2.053480245351553, + "learning_rate": 9.980871747696408e-06, + "loss": 0.5493, + "step": 726 + }, + { + "epoch": 0.06, + "grad_norm": 1.954289388156097, + "learning_rate": 9.980760446763693e-06, + "loss": 0.5959, + "step": 727 + }, + { + "epoch": 0.06, + "grad_norm": 1.9904545251670165, + "learning_rate": 9.980648823582291e-06, + "loss": 0.5441, + "step": 728 + }, + { + "epoch": 0.06, + "grad_norm": 1.5105164865614267, + "learning_rate": 9.980536878159427e-06, + "loss": 0.5475, + "step": 729 + }, + { + "epoch": 0.06, + "grad_norm": 1.7845429346033919, + "learning_rate": 9.980424610502342e-06, + "loss": 0.5404, + "step": 730 + }, + { + "epoch": 0.06, + "grad_norm": 1.835663999661278, + "learning_rate": 9.9803120206183e-06, + "loss": 0.65, + "step": 731 + }, + { + "epoch": 0.06, + "grad_norm": 2.1707415611262446, + "learning_rate": 9.980199108514584e-06, + "loss": 0.5608, + "step": 732 + }, + { + "epoch": 0.06, + "grad_norm": 0.7935927538206793, + "learning_rate": 9.980085874198502e-06, + "loss": 0.6181, + "step": 733 + }, + { + "epoch": 0.06, + "grad_norm": 1.561644773014807, + "learning_rate": 9.979972317677377e-06, + "loss": 0.5715, + "step": 734 + }, + { + "epoch": 0.06, + "grad_norm": 2.0055433769015885, + "learning_rate": 9.97985843895856e-06, + "loss": 0.478, + "step": 735 + }, + { + "epoch": 0.06, + "grad_norm": 1.6267540376363032, + "learning_rate": 9.979744238049415e-06, + "loss": 0.5291, + "step": 736 + }, + { + "epoch": 0.06, + "grad_norm": 1.951456473765339, + "learning_rate": 9.979629714957334e-06, + "loss": 0.6164, + "step": 737 + }, + { + "epoch": 0.06, + "grad_norm": 1.8752697401666278, + "learning_rate": 9.979514869689722e-06, + "loss": 0.5404, + "step": 738 + }, + { + "epoch": 0.06, + "grad_norm": 1.6245028973500744, + "learning_rate": 9.979399702254014e-06, + "loss": 0.5654, + "step": 739 + }, + { + "epoch": 0.06, + "grad_norm": 1.5736047017997292, + "learning_rate": 9.979284212657658e-06, + "loss": 0.5437, + "step": 740 + }, + { + "epoch": 0.06, + "grad_norm": 3.6196004369659276, + "learning_rate": 9.979168400908126e-06, + "loss": 0.5429, + "step": 741 + }, + { + "epoch": 0.06, + "grad_norm": 2.1417918247001153, + "learning_rate": 9.979052267012914e-06, + "loss": 0.5479, + "step": 742 + }, + { + "epoch": 0.06, + "grad_norm": 1.4178030775106332, + "learning_rate": 9.978935810979533e-06, + "loss": 0.5579, + "step": 743 + }, + { + "epoch": 0.06, + "grad_norm": 1.5953066823336068, + "learning_rate": 9.978819032815519e-06, + "loss": 0.5785, + "step": 744 + }, + { + "epoch": 0.06, + "grad_norm": 1.8650087318715263, + "learning_rate": 9.978701932528425e-06, + "loss": 0.5388, + "step": 745 + }, + { + "epoch": 0.06, + "grad_norm": 1.5486865519521378, + "learning_rate": 9.97858451012583e-06, + "loss": 0.5812, + "step": 746 + }, + { + "epoch": 0.06, + "grad_norm": 2.531871293831441, + "learning_rate": 9.97846676561533e-06, + "loss": 0.5458, + "step": 747 + }, + { + "epoch": 0.06, + "grad_norm": 1.5201953954497902, + "learning_rate": 9.978348699004545e-06, + "loss": 0.5092, + "step": 748 + }, + { + "epoch": 0.06, + "grad_norm": 1.693273511506159, + "learning_rate": 9.978230310301109e-06, + "loss": 0.5574, + "step": 749 + }, + { + "epoch": 0.06, + "grad_norm": 1.5816583900922208, + "learning_rate": 9.978111599512685e-06, + "loss": 0.5782, + "step": 750 + }, + { + "epoch": 0.06, + "grad_norm": 1.4556411247802536, + "learning_rate": 9.977992566646952e-06, + "loss": 0.5797, + "step": 751 + }, + { + "epoch": 0.06, + "grad_norm": 2.1673046067611432, + "learning_rate": 9.977873211711612e-06, + "loss": 0.5325, + "step": 752 + }, + { + "epoch": 0.06, + "grad_norm": 0.9299906705132726, + "learning_rate": 9.977753534714389e-06, + "loss": 0.6113, + "step": 753 + }, + { + "epoch": 0.06, + "grad_norm": 1.848851679145008, + "learning_rate": 9.977633535663022e-06, + "loss": 0.5334, + "step": 754 + }, + { + "epoch": 0.06, + "grad_norm": 2.109332384398877, + "learning_rate": 9.977513214565278e-06, + "loss": 0.5236, + "step": 755 + }, + { + "epoch": 0.06, + "grad_norm": 1.684981400874114, + "learning_rate": 9.977392571428942e-06, + "loss": 0.5614, + "step": 756 + }, + { + "epoch": 0.06, + "grad_norm": 0.7803211661689158, + "learning_rate": 9.977271606261814e-06, + "loss": 0.6041, + "step": 757 + }, + { + "epoch": 0.06, + "grad_norm": 1.8787862460695115, + "learning_rate": 9.977150319071727e-06, + "loss": 0.5675, + "step": 758 + }, + { + "epoch": 0.06, + "grad_norm": 1.6195226252697599, + "learning_rate": 9.977028709866523e-06, + "loss": 0.5208, + "step": 759 + }, + { + "epoch": 0.06, + "grad_norm": 2.0772050145709864, + "learning_rate": 9.976906778654074e-06, + "loss": 0.5649, + "step": 760 + }, + { + "epoch": 0.06, + "grad_norm": 1.4158696211470787, + "learning_rate": 9.976784525442268e-06, + "loss": 0.5593, + "step": 761 + }, + { + "epoch": 0.06, + "grad_norm": 0.7502360519045748, + "learning_rate": 9.97666195023901e-06, + "loss": 0.6044, + "step": 762 + }, + { + "epoch": 0.06, + "grad_norm": 1.4283400601973875, + "learning_rate": 9.976539053052237e-06, + "loss": 0.5229, + "step": 763 + }, + { + "epoch": 0.06, + "grad_norm": 1.4764638956236051, + "learning_rate": 9.976415833889897e-06, + "loss": 0.5334, + "step": 764 + }, + { + "epoch": 0.06, + "grad_norm": 2.0859069582570218, + "learning_rate": 9.976292292759963e-06, + "loss": 0.5395, + "step": 765 + }, + { + "epoch": 0.06, + "grad_norm": 1.5203338827933996, + "learning_rate": 9.976168429670426e-06, + "loss": 0.5849, + "step": 766 + }, + { + "epoch": 0.06, + "grad_norm": 1.6775133181138706, + "learning_rate": 9.976044244629301e-06, + "loss": 0.5404, + "step": 767 + }, + { + "epoch": 0.06, + "grad_norm": 1.5952308103512223, + "learning_rate": 9.975919737644625e-06, + "loss": 0.5697, + "step": 768 + }, + { + "epoch": 0.06, + "grad_norm": 1.5129362326307954, + "learning_rate": 9.97579490872445e-06, + "loss": 0.5385, + "step": 769 + }, + { + "epoch": 0.06, + "grad_norm": 1.6688181805939526, + "learning_rate": 9.975669757876853e-06, + "loss": 0.5229, + "step": 770 + }, + { + "epoch": 0.06, + "grad_norm": 1.3711266094196919, + "learning_rate": 9.975544285109933e-06, + "loss": 0.5373, + "step": 771 + }, + { + "epoch": 0.06, + "grad_norm": 1.8880072358254525, + "learning_rate": 9.975418490431806e-06, + "loss": 0.5803, + "step": 772 + }, + { + "epoch": 0.06, + "grad_norm": 2.0819201866747257, + "learning_rate": 9.975292373850611e-06, + "loss": 0.5256, + "step": 773 + }, + { + "epoch": 0.06, + "grad_norm": 1.6854259455741785, + "learning_rate": 9.97516593537451e-06, + "loss": 0.5829, + "step": 774 + }, + { + "epoch": 0.06, + "grad_norm": 0.821712367167311, + "learning_rate": 9.975039175011678e-06, + "loss": 0.6448, + "step": 775 + }, + { + "epoch": 0.06, + "grad_norm": 3.034348543153588, + "learning_rate": 9.97491209277032e-06, + "loss": 0.544, + "step": 776 + }, + { + "epoch": 0.06, + "grad_norm": 1.4021850039764745, + "learning_rate": 9.974784688658661e-06, + "loss": 0.5352, + "step": 777 + }, + { + "epoch": 0.06, + "grad_norm": 1.6813570671633562, + "learning_rate": 9.974656962684936e-06, + "loss": 0.5999, + "step": 778 + }, + { + "epoch": 0.06, + "grad_norm": 1.4561371347973227, + "learning_rate": 9.974528914857417e-06, + "loss": 0.5059, + "step": 779 + }, + { + "epoch": 0.06, + "grad_norm": 1.5416131533424684, + "learning_rate": 9.974400545184382e-06, + "loss": 0.5174, + "step": 780 + }, + { + "epoch": 0.06, + "grad_norm": 0.6876817090353196, + "learning_rate": 9.97427185367414e-06, + "loss": 0.5978, + "step": 781 + }, + { + "epoch": 0.06, + "grad_norm": 2.0869340382487938, + "learning_rate": 9.974142840335017e-06, + "loss": 0.5731, + "step": 782 + }, + { + "epoch": 0.06, + "grad_norm": 1.6547970253362836, + "learning_rate": 9.974013505175358e-06, + "loss": 0.5751, + "step": 783 + }, + { + "epoch": 0.06, + "grad_norm": 1.6394056968397406, + "learning_rate": 9.973883848203535e-06, + "loss": 0.5344, + "step": 784 + }, + { + "epoch": 0.06, + "grad_norm": 1.6531189421145551, + "learning_rate": 9.973753869427928e-06, + "loss": 0.5443, + "step": 785 + }, + { + "epoch": 0.06, + "grad_norm": 1.7556512131704725, + "learning_rate": 9.973623568856956e-06, + "loss": 0.5148, + "step": 786 + }, + { + "epoch": 0.06, + "grad_norm": 1.9169417621087852, + "learning_rate": 9.973492946499045e-06, + "loss": 0.5918, + "step": 787 + }, + { + "epoch": 0.06, + "grad_norm": 1.4570709613454904, + "learning_rate": 9.973362002362647e-06, + "loss": 0.5373, + "step": 788 + }, + { + "epoch": 0.06, + "grad_norm": 1.5311428640142923, + "learning_rate": 9.973230736456232e-06, + "loss": 0.5683, + "step": 789 + }, + { + "epoch": 0.06, + "grad_norm": 1.398557127650882, + "learning_rate": 9.973099148788295e-06, + "loss": 0.4895, + "step": 790 + }, + { + "epoch": 0.06, + "grad_norm": 0.6256153234822469, + "learning_rate": 9.97296723936735e-06, + "loss": 0.6335, + "step": 791 + }, + { + "epoch": 0.06, + "grad_norm": 1.7729082398853595, + "learning_rate": 9.972835008201928e-06, + "loss": 0.5294, + "step": 792 + }, + { + "epoch": 0.06, + "grad_norm": 1.6396473568274677, + "learning_rate": 9.972702455300586e-06, + "loss": 0.5396, + "step": 793 + }, + { + "epoch": 0.06, + "grad_norm": 1.5249789217901535, + "learning_rate": 9.972569580671902e-06, + "loss": 0.5774, + "step": 794 + }, + { + "epoch": 0.06, + "grad_norm": 1.594759032543909, + "learning_rate": 9.97243638432447e-06, + "loss": 0.5309, + "step": 795 + }, + { + "epoch": 0.06, + "grad_norm": 1.461584294536784, + "learning_rate": 9.97230286626691e-06, + "loss": 0.5632, + "step": 796 + }, + { + "epoch": 0.06, + "grad_norm": 1.4955351507068555, + "learning_rate": 9.972169026507858e-06, + "loss": 0.5515, + "step": 797 + }, + { + "epoch": 0.06, + "grad_norm": 1.5529748120600473, + "learning_rate": 9.972034865055974e-06, + "loss": 0.5405, + "step": 798 + }, + { + "epoch": 0.06, + "grad_norm": 2.4473463039722567, + "learning_rate": 9.97190038191994e-06, + "loss": 0.5749, + "step": 799 + }, + { + "epoch": 0.06, + "grad_norm": 1.457597800019409, + "learning_rate": 9.971765577108452e-06, + "loss": 0.5262, + "step": 800 + }, + { + "epoch": 0.06, + "grad_norm": 0.6492051417181997, + "learning_rate": 9.971630450630239e-06, + "loss": 0.6388, + "step": 801 + }, + { + "epoch": 0.06, + "grad_norm": 1.7491249427264517, + "learning_rate": 9.971495002494038e-06, + "loss": 0.5415, + "step": 802 + }, + { + "epoch": 0.06, + "grad_norm": 1.6221351057051563, + "learning_rate": 9.971359232708613e-06, + "loss": 0.579, + "step": 803 + }, + { + "epoch": 0.06, + "grad_norm": 1.76924559842447, + "learning_rate": 9.971223141282751e-06, + "loss": 0.5339, + "step": 804 + }, + { + "epoch": 0.06, + "grad_norm": 2.222345346043504, + "learning_rate": 9.971086728225252e-06, + "loss": 0.5591, + "step": 805 + }, + { + "epoch": 0.06, + "grad_norm": 1.4504455435367212, + "learning_rate": 9.970949993544946e-06, + "loss": 0.5064, + "step": 806 + }, + { + "epoch": 0.06, + "grad_norm": 1.7439488363520979, + "learning_rate": 9.97081293725068e-06, + "loss": 0.5294, + "step": 807 + }, + { + "epoch": 0.06, + "grad_norm": 1.9886027020808867, + "learning_rate": 9.970675559351318e-06, + "loss": 0.5344, + "step": 808 + }, + { + "epoch": 0.06, + "grad_norm": 1.6060333858180997, + "learning_rate": 9.97053785985575e-06, + "loss": 0.5378, + "step": 809 + }, + { + "epoch": 0.06, + "grad_norm": 1.9118784895312244, + "learning_rate": 9.970399838772884e-06, + "loss": 0.5349, + "step": 810 + }, + { + "epoch": 0.06, + "grad_norm": 0.658800341863543, + "learning_rate": 9.970261496111653e-06, + "loss": 0.5952, + "step": 811 + }, + { + "epoch": 0.06, + "grad_norm": 1.6027283855454784, + "learning_rate": 9.970122831881003e-06, + "loss": 0.5334, + "step": 812 + }, + { + "epoch": 0.06, + "grad_norm": 1.8124737776309374, + "learning_rate": 9.969983846089908e-06, + "loss": 0.5393, + "step": 813 + }, + { + "epoch": 0.06, + "grad_norm": 3.566665964156739, + "learning_rate": 9.96984453874736e-06, + "loss": 0.5302, + "step": 814 + }, + { + "epoch": 0.06, + "grad_norm": 0.6052932456772644, + "learning_rate": 9.969704909862371e-06, + "loss": 0.59, + "step": 815 + }, + { + "epoch": 0.06, + "grad_norm": 1.8317701541844458, + "learning_rate": 9.969564959443976e-06, + "loss": 0.5675, + "step": 816 + }, + { + "epoch": 0.06, + "grad_norm": 1.8985976469269226, + "learning_rate": 9.96942468750123e-06, + "loss": 0.576, + "step": 817 + }, + { + "epoch": 0.06, + "grad_norm": 0.6173788735255878, + "learning_rate": 9.969284094043206e-06, + "loss": 0.6021, + "step": 818 + }, + { + "epoch": 0.06, + "grad_norm": 1.769458974596789, + "learning_rate": 9.969143179079001e-06, + "loss": 0.518, + "step": 819 + }, + { + "epoch": 0.06, + "grad_norm": 0.6675758149395968, + "learning_rate": 9.969001942617735e-06, + "loss": 0.5989, + "step": 820 + }, + { + "epoch": 0.06, + "grad_norm": 2.319624859889963, + "learning_rate": 9.968860384668543e-06, + "loss": 0.5086, + "step": 821 + }, + { + "epoch": 0.06, + "grad_norm": 0.702108816104618, + "learning_rate": 9.968718505240583e-06, + "loss": 0.6366, + "step": 822 + }, + { + "epoch": 0.06, + "grad_norm": 1.6139169083984537, + "learning_rate": 9.968576304343037e-06, + "loss": 0.4558, + "step": 823 + }, + { + "epoch": 0.06, + "grad_norm": 1.876970871841841, + "learning_rate": 9.968433781985102e-06, + "loss": 0.5122, + "step": 824 + }, + { + "epoch": 0.06, + "grad_norm": 0.6221913466613788, + "learning_rate": 9.968290938176005e-06, + "loss": 0.619, + "step": 825 + }, + { + "epoch": 0.06, + "grad_norm": 1.6744159877239608, + "learning_rate": 9.968147772924978e-06, + "loss": 0.5227, + "step": 826 + }, + { + "epoch": 0.06, + "grad_norm": 0.648872827924901, + "learning_rate": 9.968004286241292e-06, + "loss": 0.5928, + "step": 827 + }, + { + "epoch": 0.07, + "grad_norm": 1.8105188149966776, + "learning_rate": 9.96786047813423e-06, + "loss": 0.5876, + "step": 828 + }, + { + "epoch": 0.07, + "grad_norm": 0.6192993763887075, + "learning_rate": 9.96771634861309e-06, + "loss": 0.6072, + "step": 829 + }, + { + "epoch": 0.07, + "grad_norm": 1.8377241392408228, + "learning_rate": 9.967571897687203e-06, + "loss": 0.5574, + "step": 830 + }, + { + "epoch": 0.07, + "grad_norm": 1.5074218455849433, + "learning_rate": 9.967427125365912e-06, + "loss": 0.5624, + "step": 831 + }, + { + "epoch": 0.07, + "grad_norm": 2.005189790639872, + "learning_rate": 9.967282031658584e-06, + "loss": 0.5323, + "step": 832 + }, + { + "epoch": 0.07, + "grad_norm": 2.3825511337688314, + "learning_rate": 9.967136616574606e-06, + "loss": 0.6102, + "step": 833 + }, + { + "epoch": 0.07, + "grad_norm": 1.592670773526046, + "learning_rate": 9.966990880123388e-06, + "loss": 0.5577, + "step": 834 + }, + { + "epoch": 0.07, + "grad_norm": 1.931299454364683, + "learning_rate": 9.966844822314357e-06, + "loss": 0.5836, + "step": 835 + }, + { + "epoch": 0.07, + "grad_norm": 1.793291077425198, + "learning_rate": 9.966698443156964e-06, + "loss": 0.5411, + "step": 836 + }, + { + "epoch": 0.07, + "grad_norm": 1.6495658389731604, + "learning_rate": 9.96655174266068e-06, + "loss": 0.4965, + "step": 837 + }, + { + "epoch": 0.07, + "grad_norm": 1.5146596647644432, + "learning_rate": 9.966404720834992e-06, + "loss": 0.5759, + "step": 838 + }, + { + "epoch": 0.07, + "grad_norm": 0.7260735505923195, + "learning_rate": 9.966257377689418e-06, + "loss": 0.5808, + "step": 839 + }, + { + "epoch": 0.07, + "grad_norm": 1.5124861393632685, + "learning_rate": 9.966109713233487e-06, + "loss": 0.537, + "step": 840 + }, + { + "epoch": 0.07, + "grad_norm": 1.9507934332500567, + "learning_rate": 9.965961727476756e-06, + "loss": 0.5306, + "step": 841 + }, + { + "epoch": 0.07, + "grad_norm": 0.6767990291840955, + "learning_rate": 9.965813420428796e-06, + "loss": 0.6145, + "step": 842 + }, + { + "epoch": 0.07, + "grad_norm": 1.8805452824431472, + "learning_rate": 9.965664792099203e-06, + "loss": 0.5342, + "step": 843 + }, + { + "epoch": 0.07, + "grad_norm": 2.596668191985389, + "learning_rate": 9.965515842497596e-06, + "loss": 0.5203, + "step": 844 + }, + { + "epoch": 0.07, + "grad_norm": 1.8156872051069486, + "learning_rate": 9.965366571633608e-06, + "loss": 0.5711, + "step": 845 + }, + { + "epoch": 0.07, + "grad_norm": 1.5583898042883626, + "learning_rate": 9.965216979516899e-06, + "loss": 0.519, + "step": 846 + }, + { + "epoch": 0.07, + "grad_norm": 1.6736508843975682, + "learning_rate": 9.965067066157148e-06, + "loss": 0.5449, + "step": 847 + }, + { + "epoch": 0.07, + "grad_norm": 1.8870776883869451, + "learning_rate": 9.964916831564051e-06, + "loss": 0.5474, + "step": 848 + }, + { + "epoch": 0.07, + "grad_norm": 1.7116516860532125, + "learning_rate": 9.964766275747331e-06, + "loss": 0.5496, + "step": 849 + }, + { + "epoch": 0.07, + "grad_norm": 2.3252707831884516, + "learning_rate": 9.964615398716727e-06, + "loss": 0.585, + "step": 850 + }, + { + "epoch": 0.07, + "grad_norm": 0.7580207760405641, + "learning_rate": 9.964464200482001e-06, + "loss": 0.5908, + "step": 851 + }, + { + "epoch": 0.07, + "grad_norm": 6.180810716710363, + "learning_rate": 9.964312681052936e-06, + "loss": 0.5385, + "step": 852 + }, + { + "epoch": 0.07, + "grad_norm": 1.5551309645028908, + "learning_rate": 9.964160840439335e-06, + "loss": 0.5864, + "step": 853 + }, + { + "epoch": 0.07, + "grad_norm": 1.4278084825241908, + "learning_rate": 9.964008678651022e-06, + "loss": 0.468, + "step": 854 + }, + { + "epoch": 0.07, + "grad_norm": 1.7554944044530334, + "learning_rate": 9.96385619569784e-06, + "loss": 0.5185, + "step": 855 + }, + { + "epoch": 0.07, + "grad_norm": 0.6771038462422185, + "learning_rate": 9.963703391589656e-06, + "loss": 0.6201, + "step": 856 + }, + { + "epoch": 0.07, + "grad_norm": 16.006908628016113, + "learning_rate": 9.963550266336356e-06, + "loss": 0.558, + "step": 857 + }, + { + "epoch": 0.07, + "grad_norm": 1.4464863924809186, + "learning_rate": 9.963396819947848e-06, + "loss": 0.5225, + "step": 858 + }, + { + "epoch": 0.07, + "grad_norm": 0.6597158664820468, + "learning_rate": 9.963243052434057e-06, + "loss": 0.5871, + "step": 859 + }, + { + "epoch": 0.07, + "grad_norm": 0.6316903864196992, + "learning_rate": 9.963088963804935e-06, + "loss": 0.5929, + "step": 860 + }, + { + "epoch": 0.07, + "grad_norm": 0.5871133559724693, + "learning_rate": 9.962934554070448e-06, + "loss": 0.5901, + "step": 861 + }, + { + "epoch": 0.07, + "grad_norm": 1.9400071273088348, + "learning_rate": 9.962779823240588e-06, + "loss": 0.5702, + "step": 862 + }, + { + "epoch": 0.07, + "grad_norm": 2.1042290678101203, + "learning_rate": 9.962624771325367e-06, + "loss": 0.5027, + "step": 863 + }, + { + "epoch": 0.07, + "grad_norm": 1.7921002432875013, + "learning_rate": 9.962469398334813e-06, + "loss": 0.5356, + "step": 864 + }, + { + "epoch": 0.07, + "grad_norm": 1.581178866886812, + "learning_rate": 9.962313704278981e-06, + "loss": 0.5682, + "step": 865 + }, + { + "epoch": 0.07, + "grad_norm": 1.4627098442648818, + "learning_rate": 9.962157689167946e-06, + "loss": 0.519, + "step": 866 + }, + { + "epoch": 0.07, + "grad_norm": 2.329483671930689, + "learning_rate": 9.962001353011797e-06, + "loss": 0.5804, + "step": 867 + }, + { + "epoch": 0.07, + "grad_norm": 1.9277917768002495, + "learning_rate": 9.961844695820653e-06, + "loss": 0.5498, + "step": 868 + }, + { + "epoch": 0.07, + "grad_norm": 1.5793188293390557, + "learning_rate": 9.961687717604649e-06, + "loss": 0.5078, + "step": 869 + }, + { + "epoch": 0.07, + "grad_norm": 1.5227241596239927, + "learning_rate": 9.96153041837394e-06, + "loss": 0.5561, + "step": 870 + }, + { + "epoch": 0.07, + "grad_norm": 2.139967604556099, + "learning_rate": 9.961372798138701e-06, + "loss": 0.571, + "step": 871 + }, + { + "epoch": 0.07, + "grad_norm": 2.0375554771390405, + "learning_rate": 9.961214856909135e-06, + "loss": 0.5316, + "step": 872 + }, + { + "epoch": 0.07, + "grad_norm": 1.776611150892853, + "learning_rate": 9.961056594695457e-06, + "loss": 0.5406, + "step": 873 + }, + { + "epoch": 0.07, + "grad_norm": 1.8643510783800241, + "learning_rate": 9.960898011507908e-06, + "loss": 0.4994, + "step": 874 + }, + { + "epoch": 0.07, + "grad_norm": 1.4742256604281796, + "learning_rate": 9.960739107356745e-06, + "loss": 0.5004, + "step": 875 + }, + { + "epoch": 0.07, + "grad_norm": 1.8596194888817468, + "learning_rate": 9.960579882252252e-06, + "loss": 0.5481, + "step": 876 + }, + { + "epoch": 0.07, + "grad_norm": 1.0318071537157785, + "learning_rate": 9.960420336204733e-06, + "loss": 0.6024, + "step": 877 + }, + { + "epoch": 0.07, + "grad_norm": 1.615335217042538, + "learning_rate": 9.960260469224503e-06, + "loss": 0.5532, + "step": 878 + }, + { + "epoch": 0.07, + "grad_norm": 1.8856124442468722, + "learning_rate": 9.960100281321912e-06, + "loss": 0.5571, + "step": 879 + }, + { + "epoch": 0.07, + "grad_norm": 2.0684094625935234, + "learning_rate": 9.95993977250732e-06, + "loss": 0.6024, + "step": 880 + }, + { + "epoch": 0.07, + "grad_norm": 0.6864868336485753, + "learning_rate": 9.959778942791113e-06, + "loss": 0.6173, + "step": 881 + }, + { + "epoch": 0.07, + "grad_norm": 1.624615411700884, + "learning_rate": 9.959617792183698e-06, + "loss": 0.5256, + "step": 882 + }, + { + "epoch": 0.07, + "grad_norm": 2.661687689859257, + "learning_rate": 9.9594563206955e-06, + "loss": 0.53, + "step": 883 + }, + { + "epoch": 0.07, + "grad_norm": 2.6005222420008236, + "learning_rate": 9.959294528336965e-06, + "loss": 0.5358, + "step": 884 + }, + { + "epoch": 0.07, + "grad_norm": 0.7525159286083739, + "learning_rate": 9.959132415118562e-06, + "loss": 0.5955, + "step": 885 + }, + { + "epoch": 0.07, + "grad_norm": 0.7306692929271674, + "learning_rate": 9.958969981050779e-06, + "loss": 0.5785, + "step": 886 + }, + { + "epoch": 0.07, + "grad_norm": 1.6232752862988116, + "learning_rate": 9.958807226144125e-06, + "loss": 0.6083, + "step": 887 + }, + { + "epoch": 0.07, + "grad_norm": 1.5767741495613001, + "learning_rate": 9.958644150409131e-06, + "loss": 0.4971, + "step": 888 + }, + { + "epoch": 0.07, + "grad_norm": 1.8666960383117122, + "learning_rate": 9.958480753856348e-06, + "loss": 0.5328, + "step": 889 + }, + { + "epoch": 0.07, + "grad_norm": 0.7144573238298764, + "learning_rate": 9.958317036496345e-06, + "loss": 0.6039, + "step": 890 + }, + { + "epoch": 0.07, + "grad_norm": 0.6783983513904499, + "learning_rate": 9.958152998339716e-06, + "loss": 0.5868, + "step": 891 + }, + { + "epoch": 0.07, + "grad_norm": 0.6489078723090188, + "learning_rate": 9.957988639397075e-06, + "loss": 0.6145, + "step": 892 + }, + { + "epoch": 0.07, + "grad_norm": 1.8975812443867086, + "learning_rate": 9.957823959679054e-06, + "loss": 0.5699, + "step": 893 + }, + { + "epoch": 0.07, + "grad_norm": 2.611594307359781, + "learning_rate": 9.95765895919631e-06, + "loss": 0.5479, + "step": 894 + }, + { + "epoch": 0.07, + "grad_norm": 0.6522817354250684, + "learning_rate": 9.957493637959515e-06, + "loss": 0.5958, + "step": 895 + }, + { + "epoch": 0.07, + "grad_norm": 1.8905114408309076, + "learning_rate": 9.957327995979369e-06, + "loss": 0.5345, + "step": 896 + }, + { + "epoch": 0.07, + "grad_norm": 2.467240298569885, + "learning_rate": 9.957162033266585e-06, + "loss": 0.5515, + "step": 897 + }, + { + "epoch": 0.07, + "grad_norm": 1.6186076252613966, + "learning_rate": 9.956995749831901e-06, + "loss": 0.5259, + "step": 898 + }, + { + "epoch": 0.07, + "grad_norm": 2.0446014227661657, + "learning_rate": 9.95682914568608e-06, + "loss": 0.5454, + "step": 899 + }, + { + "epoch": 0.07, + "grad_norm": 1.6688268405948952, + "learning_rate": 9.956662220839894e-06, + "loss": 0.5535, + "step": 900 + }, + { + "epoch": 0.07, + "grad_norm": 0.7128713805204633, + "learning_rate": 9.95649497530415e-06, + "loss": 0.6102, + "step": 901 + }, + { + "epoch": 0.07, + "grad_norm": 1.965374807342687, + "learning_rate": 9.956327409089661e-06, + "loss": 0.5953, + "step": 902 + }, + { + "epoch": 0.07, + "grad_norm": 1.831792007341404, + "learning_rate": 9.956159522207277e-06, + "loss": 0.5614, + "step": 903 + }, + { + "epoch": 0.07, + "grad_norm": 1.5565606664474412, + "learning_rate": 9.955991314667852e-06, + "loss": 0.5447, + "step": 904 + }, + { + "epoch": 0.07, + "grad_norm": 0.6026980401707079, + "learning_rate": 9.955822786482273e-06, + "loss": 0.5993, + "step": 905 + }, + { + "epoch": 0.07, + "grad_norm": 2.1705195564671724, + "learning_rate": 9.955653937661442e-06, + "loss": 0.5342, + "step": 906 + }, + { + "epoch": 0.07, + "grad_norm": 1.4770746287985959, + "learning_rate": 9.955484768216285e-06, + "loss": 0.5131, + "step": 907 + }, + { + "epoch": 0.07, + "grad_norm": 1.81230631547397, + "learning_rate": 9.955315278157746e-06, + "loss": 0.542, + "step": 908 + }, + { + "epoch": 0.07, + "grad_norm": 2.1367991332148866, + "learning_rate": 9.955145467496791e-06, + "loss": 0.5119, + "step": 909 + }, + { + "epoch": 0.07, + "grad_norm": 1.8304261390708023, + "learning_rate": 9.954975336244406e-06, + "loss": 0.568, + "step": 910 + }, + { + "epoch": 0.07, + "grad_norm": 1.9517494614890825, + "learning_rate": 9.954804884411599e-06, + "loss": 0.5645, + "step": 911 + }, + { + "epoch": 0.07, + "grad_norm": 2.1276485398244995, + "learning_rate": 9.954634112009398e-06, + "loss": 0.5643, + "step": 912 + }, + { + "epoch": 0.07, + "grad_norm": 0.7617142690377036, + "learning_rate": 9.954463019048851e-06, + "loss": 0.6276, + "step": 913 + }, + { + "epoch": 0.07, + "grad_norm": 0.7217248337831912, + "learning_rate": 9.954291605541026e-06, + "loss": 0.6072, + "step": 914 + }, + { + "epoch": 0.07, + "grad_norm": 1.501394892380727, + "learning_rate": 9.95411987149702e-06, + "loss": 0.4689, + "step": 915 + }, + { + "epoch": 0.07, + "grad_norm": 1.7729234428555742, + "learning_rate": 9.953947816927934e-06, + "loss": 0.5204, + "step": 916 + }, + { + "epoch": 0.07, + "grad_norm": 2.185362223885419, + "learning_rate": 9.953775441844909e-06, + "loss": 0.521, + "step": 917 + }, + { + "epoch": 0.07, + "grad_norm": 1.6079038028760768, + "learning_rate": 9.95360274625909e-06, + "loss": 0.564, + "step": 918 + }, + { + "epoch": 0.07, + "grad_norm": 1.7400941128552325, + "learning_rate": 9.953429730181653e-06, + "loss": 0.5973, + "step": 919 + }, + { + "epoch": 0.07, + "grad_norm": 2.6726644201146255, + "learning_rate": 9.953256393623796e-06, + "loss": 0.5313, + "step": 920 + }, + { + "epoch": 0.07, + "grad_norm": 1.8589076341892388, + "learning_rate": 9.953082736596728e-06, + "loss": 0.5492, + "step": 921 + }, + { + "epoch": 0.07, + "grad_norm": 2.259483192284386, + "learning_rate": 9.952908759111686e-06, + "loss": 0.5289, + "step": 922 + }, + { + "epoch": 0.07, + "grad_norm": 1.6767203705815568, + "learning_rate": 9.952734461179928e-06, + "loss": 0.5795, + "step": 923 + }, + { + "epoch": 0.07, + "grad_norm": 1.0406854436985276, + "learning_rate": 9.952559842812727e-06, + "loss": 0.6299, + "step": 924 + }, + { + "epoch": 0.07, + "grad_norm": 1.6748003861195493, + "learning_rate": 9.952384904021384e-06, + "loss": 0.5819, + "step": 925 + }, + { + "epoch": 0.07, + "grad_norm": 1.4822729001771697, + "learning_rate": 9.952209644817217e-06, + "loss": 0.5239, + "step": 926 + }, + { + "epoch": 0.07, + "grad_norm": 1.5286538811513033, + "learning_rate": 9.952034065211564e-06, + "loss": 0.5427, + "step": 927 + }, + { + "epoch": 0.07, + "grad_norm": 1.933443222515339, + "learning_rate": 9.951858165215784e-06, + "loss": 0.5397, + "step": 928 + }, + { + "epoch": 0.07, + "grad_norm": 2.8541424133754627, + "learning_rate": 9.951681944841261e-06, + "loss": 0.6041, + "step": 929 + }, + { + "epoch": 0.07, + "grad_norm": 1.6823530273326184, + "learning_rate": 9.95150540409939e-06, + "loss": 0.5614, + "step": 930 + }, + { + "epoch": 0.07, + "grad_norm": 0.9767037463874199, + "learning_rate": 9.9513285430016e-06, + "loss": 0.6268, + "step": 931 + }, + { + "epoch": 0.07, + "grad_norm": 1.7526610700649616, + "learning_rate": 9.95115136155933e-06, + "loss": 0.5317, + "step": 932 + }, + { + "epoch": 0.07, + "grad_norm": 0.6740726398040634, + "learning_rate": 9.950973859784044e-06, + "loss": 0.6091, + "step": 933 + }, + { + "epoch": 0.07, + "grad_norm": 1.729628226005561, + "learning_rate": 9.950796037687224e-06, + "loss": 0.494, + "step": 934 + }, + { + "epoch": 0.07, + "grad_norm": 2.498242238639019, + "learning_rate": 9.950617895280378e-06, + "loss": 0.5191, + "step": 935 + }, + { + "epoch": 0.07, + "grad_norm": 0.8158851160510832, + "learning_rate": 9.950439432575029e-06, + "loss": 0.593, + "step": 936 + }, + { + "epoch": 0.07, + "grad_norm": 1.713610419521136, + "learning_rate": 9.950260649582727e-06, + "loss": 0.6038, + "step": 937 + }, + { + "epoch": 0.07, + "grad_norm": 1.6149588645914572, + "learning_rate": 9.950081546315036e-06, + "loss": 0.4825, + "step": 938 + }, + { + "epoch": 0.07, + "grad_norm": 1.9144448777184389, + "learning_rate": 9.949902122783543e-06, + "loss": 0.4915, + "step": 939 + }, + { + "epoch": 0.07, + "grad_norm": 2.4699324692377584, + "learning_rate": 9.94972237899986e-06, + "loss": 0.5352, + "step": 940 + }, + { + "epoch": 0.07, + "grad_norm": 1.7020448788853424, + "learning_rate": 9.949542314975614e-06, + "loss": 0.5009, + "step": 941 + }, + { + "epoch": 0.07, + "grad_norm": 2.2273994812048663, + "learning_rate": 9.949361930722454e-06, + "loss": 0.5665, + "step": 942 + }, + { + "epoch": 0.07, + "grad_norm": 1.9062644487692846, + "learning_rate": 9.949181226252052e-06, + "loss": 0.5697, + "step": 943 + }, + { + "epoch": 0.07, + "grad_norm": 2.397948146906995, + "learning_rate": 9.949000201576099e-06, + "loss": 0.5531, + "step": 944 + }, + { + "epoch": 0.07, + "grad_norm": 0.7691874044800316, + "learning_rate": 9.948818856706307e-06, + "loss": 0.5847, + "step": 945 + }, + { + "epoch": 0.07, + "grad_norm": 1.6817228788006264, + "learning_rate": 9.948637191654409e-06, + "loss": 0.5453, + "step": 946 + }, + { + "epoch": 0.07, + "grad_norm": 0.7114398697803828, + "learning_rate": 9.948455206432158e-06, + "loss": 0.6103, + "step": 947 + }, + { + "epoch": 0.07, + "grad_norm": 1.9617675799843879, + "learning_rate": 9.94827290105133e-06, + "loss": 0.5433, + "step": 948 + }, + { + "epoch": 0.07, + "grad_norm": 0.6086781572250257, + "learning_rate": 9.948090275523715e-06, + "loss": 0.5977, + "step": 949 + }, + { + "epoch": 0.07, + "grad_norm": 2.0295545831625303, + "learning_rate": 9.947907329861134e-06, + "loss": 0.5381, + "step": 950 + }, + { + "epoch": 0.07, + "grad_norm": 2.0137677234649014, + "learning_rate": 9.947724064075424e-06, + "loss": 0.5281, + "step": 951 + }, + { + "epoch": 0.07, + "grad_norm": 1.8715057144270584, + "learning_rate": 9.947540478178437e-06, + "loss": 0.5564, + "step": 952 + }, + { + "epoch": 0.07, + "grad_norm": 1.6468170055125908, + "learning_rate": 9.947356572182054e-06, + "loss": 0.5265, + "step": 953 + }, + { + "epoch": 0.07, + "grad_norm": 1.5432641442620518, + "learning_rate": 9.94717234609817e-06, + "loss": 0.5669, + "step": 954 + }, + { + "epoch": 0.08, + "grad_norm": 1.490222437757155, + "learning_rate": 9.946987799938709e-06, + "loss": 0.5394, + "step": 955 + }, + { + "epoch": 0.08, + "grad_norm": 0.8879803402680078, + "learning_rate": 9.94680293371561e-06, + "loss": 0.6243, + "step": 956 + }, + { + "epoch": 0.08, + "grad_norm": 1.7762821660739267, + "learning_rate": 9.946617747440831e-06, + "loss": 0.5436, + "step": 957 + }, + { + "epoch": 0.08, + "grad_norm": 1.8238243745218685, + "learning_rate": 9.946432241126356e-06, + "loss": 0.4932, + "step": 958 + }, + { + "epoch": 0.08, + "grad_norm": 0.7498322789663598, + "learning_rate": 9.946246414784185e-06, + "loss": 0.6133, + "step": 959 + }, + { + "epoch": 0.08, + "grad_norm": 2.694567505767062, + "learning_rate": 9.94606026842634e-06, + "loss": 0.5302, + "step": 960 + }, + { + "epoch": 0.08, + "grad_norm": 2.3462244393563485, + "learning_rate": 9.945873802064868e-06, + "loss": 0.5256, + "step": 961 + }, + { + "epoch": 0.08, + "grad_norm": 0.7490947370116301, + "learning_rate": 9.94568701571183e-06, + "loss": 0.595, + "step": 962 + }, + { + "epoch": 0.08, + "grad_norm": 1.7079264990411434, + "learning_rate": 9.945499909379313e-06, + "loss": 0.4981, + "step": 963 + }, + { + "epoch": 0.08, + "grad_norm": 1.9685712106007467, + "learning_rate": 9.94531248307942e-06, + "loss": 0.5312, + "step": 964 + }, + { + "epoch": 0.08, + "grad_norm": 4.1674323740287, + "learning_rate": 9.945124736824279e-06, + "loss": 0.5468, + "step": 965 + }, + { + "epoch": 0.08, + "grad_norm": 0.754151031059945, + "learning_rate": 9.944936670626037e-06, + "loss": 0.5988, + "step": 966 + }, + { + "epoch": 0.08, + "grad_norm": 1.9532201134233702, + "learning_rate": 9.944748284496862e-06, + "loss": 0.5391, + "step": 967 + }, + { + "epoch": 0.08, + "grad_norm": 1.4879688469545818, + "learning_rate": 9.94455957844894e-06, + "loss": 0.5002, + "step": 968 + }, + { + "epoch": 0.08, + "grad_norm": 2.363012334286684, + "learning_rate": 9.944370552494483e-06, + "loss": 0.5906, + "step": 969 + }, + { + "epoch": 0.08, + "grad_norm": 2.0825769972367367, + "learning_rate": 9.944181206645717e-06, + "loss": 0.5484, + "step": 970 + }, + { + "epoch": 0.08, + "grad_norm": 2.2140134698920337, + "learning_rate": 9.943991540914895e-06, + "loss": 0.5803, + "step": 971 + }, + { + "epoch": 0.08, + "grad_norm": 1.518754069989875, + "learning_rate": 9.94380155531429e-06, + "loss": 0.4938, + "step": 972 + }, + { + "epoch": 0.08, + "grad_norm": 1.8423085440283506, + "learning_rate": 9.94361124985619e-06, + "loss": 0.5254, + "step": 973 + }, + { + "epoch": 0.08, + "grad_norm": 1.8612029622640518, + "learning_rate": 9.943420624552912e-06, + "loss": 0.5604, + "step": 974 + }, + { + "epoch": 0.08, + "grad_norm": 2.1594880861182837, + "learning_rate": 9.943229679416781e-06, + "loss": 0.5699, + "step": 975 + }, + { + "epoch": 0.08, + "grad_norm": 1.5165155064905773, + "learning_rate": 9.94303841446016e-06, + "loss": 0.5495, + "step": 976 + }, + { + "epoch": 0.08, + "grad_norm": 1.770356362868865, + "learning_rate": 9.942846829695421e-06, + "loss": 0.5334, + "step": 977 + }, + { + "epoch": 0.08, + "grad_norm": 0.7861187482712447, + "learning_rate": 9.942654925134956e-06, + "loss": 0.6102, + "step": 978 + }, + { + "epoch": 0.08, + "grad_norm": 1.5461562958293462, + "learning_rate": 9.942462700791184e-06, + "loss": 0.5183, + "step": 979 + }, + { + "epoch": 0.08, + "grad_norm": 1.5126002590025025, + "learning_rate": 9.94227015667654e-06, + "loss": 0.5805, + "step": 980 + }, + { + "epoch": 0.08, + "grad_norm": 1.5423802641756543, + "learning_rate": 9.942077292803482e-06, + "loss": 0.4713, + "step": 981 + }, + { + "epoch": 0.08, + "grad_norm": 2.4954707255336817, + "learning_rate": 9.94188410918449e-06, + "loss": 0.5057, + "step": 982 + }, + { + "epoch": 0.08, + "grad_norm": 0.6072627022730294, + "learning_rate": 9.94169060583206e-06, + "loss": 0.5806, + "step": 983 + }, + { + "epoch": 0.08, + "grad_norm": 1.516686615141718, + "learning_rate": 9.94149678275871e-06, + "loss": 0.503, + "step": 984 + }, + { + "epoch": 0.08, + "grad_norm": 1.488974357923619, + "learning_rate": 9.941302639976986e-06, + "loss": 0.5455, + "step": 985 + }, + { + "epoch": 0.08, + "grad_norm": 1.7606502347050086, + "learning_rate": 9.941108177499443e-06, + "loss": 0.59, + "step": 986 + }, + { + "epoch": 0.08, + "grad_norm": 1.7371916913477436, + "learning_rate": 9.940913395338666e-06, + "loss": 0.5393, + "step": 987 + }, + { + "epoch": 0.08, + "grad_norm": 0.7164326666715825, + "learning_rate": 9.940718293507256e-06, + "loss": 0.585, + "step": 988 + }, + { + "epoch": 0.08, + "grad_norm": 1.706301106054119, + "learning_rate": 9.940522872017835e-06, + "loss": 0.5801, + "step": 989 + }, + { + "epoch": 0.08, + "grad_norm": 1.5350077699723788, + "learning_rate": 9.940327130883047e-06, + "loss": 0.5097, + "step": 990 + }, + { + "epoch": 0.08, + "grad_norm": 1.8882076258928164, + "learning_rate": 9.940131070115556e-06, + "loss": 0.5071, + "step": 991 + }, + { + "epoch": 0.08, + "grad_norm": 1.5351914381895546, + "learning_rate": 9.939934689728049e-06, + "loss": 0.5284, + "step": 992 + }, + { + "epoch": 0.08, + "grad_norm": 1.5247635231434422, + "learning_rate": 9.939737989733229e-06, + "loss": 0.5585, + "step": 993 + }, + { + "epoch": 0.08, + "grad_norm": 1.798598776582196, + "learning_rate": 9.939540970143822e-06, + "loss": 0.5123, + "step": 994 + }, + { + "epoch": 0.08, + "grad_norm": 0.6452957848803892, + "learning_rate": 9.939343630972577e-06, + "loss": 0.5933, + "step": 995 + }, + { + "epoch": 0.08, + "grad_norm": 1.8192408615607587, + "learning_rate": 9.93914597223226e-06, + "loss": 0.5794, + "step": 996 + }, + { + "epoch": 0.08, + "grad_norm": 1.7808330491704183, + "learning_rate": 9.93894799393566e-06, + "loss": 0.5296, + "step": 997 + }, + { + "epoch": 0.08, + "grad_norm": 1.9023417591585585, + "learning_rate": 9.938749696095587e-06, + "loss": 0.524, + "step": 998 + }, + { + "epoch": 0.08, + "grad_norm": 1.8152193989706902, + "learning_rate": 9.938551078724868e-06, + "loss": 0.5667, + "step": 999 + }, + { + "epoch": 0.08, + "grad_norm": 13.572818665937177, + "learning_rate": 9.938352141836353e-06, + "loss": 0.6009, + "step": 1000 + }, + { + "epoch": 0.08, + "grad_norm": 1.5814671479256166, + "learning_rate": 9.938152885442918e-06, + "loss": 0.5659, + "step": 1001 + }, + { + "epoch": 0.08, + "grad_norm": 2.333974539644457, + "learning_rate": 9.937953309557449e-06, + "loss": 0.503, + "step": 1002 + }, + { + "epoch": 0.08, + "grad_norm": 1.4972653038883414, + "learning_rate": 9.937753414192862e-06, + "loss": 0.4959, + "step": 1003 + }, + { + "epoch": 0.08, + "grad_norm": 0.6770736242011542, + "learning_rate": 9.937553199362087e-06, + "loss": 0.6006, + "step": 1004 + }, + { + "epoch": 0.08, + "grad_norm": 1.8935001269940408, + "learning_rate": 9.937352665078079e-06, + "loss": 0.5692, + "step": 1005 + }, + { + "epoch": 0.08, + "grad_norm": 1.7698208612485533, + "learning_rate": 9.937151811353813e-06, + "loss": 0.4887, + "step": 1006 + }, + { + "epoch": 0.08, + "grad_norm": 0.6250383997315418, + "learning_rate": 9.936950638202285e-06, + "loss": 0.5898, + "step": 1007 + }, + { + "epoch": 0.08, + "grad_norm": 1.705044983447187, + "learning_rate": 9.936749145636507e-06, + "loss": 0.5411, + "step": 1008 + }, + { + "epoch": 0.08, + "grad_norm": 1.7714644906235089, + "learning_rate": 9.936547333669518e-06, + "loss": 0.5398, + "step": 1009 + }, + { + "epoch": 0.08, + "grad_norm": 0.6274548418574036, + "learning_rate": 9.936345202314375e-06, + "loss": 0.5914, + "step": 1010 + }, + { + "epoch": 0.08, + "grad_norm": 1.6620052754199686, + "learning_rate": 9.936142751584155e-06, + "loss": 0.5855, + "step": 1011 + }, + { + "epoch": 0.08, + "grad_norm": 1.5046377889573184, + "learning_rate": 9.935939981491956e-06, + "loss": 0.5198, + "step": 1012 + }, + { + "epoch": 0.08, + "grad_norm": 1.748746586721077, + "learning_rate": 9.935736892050896e-06, + "loss": 0.5316, + "step": 1013 + }, + { + "epoch": 0.08, + "grad_norm": 1.730735977778147, + "learning_rate": 9.935533483274116e-06, + "loss": 0.523, + "step": 1014 + }, + { + "epoch": 0.08, + "grad_norm": 0.6197527040727683, + "learning_rate": 9.935329755174778e-06, + "loss": 0.5767, + "step": 1015 + }, + { + "epoch": 0.08, + "grad_norm": 1.5736824816812927, + "learning_rate": 9.93512570776606e-06, + "loss": 0.5403, + "step": 1016 + }, + { + "epoch": 0.08, + "grad_norm": 1.7339996444043406, + "learning_rate": 9.934921341061163e-06, + "loss": 0.5459, + "step": 1017 + }, + { + "epoch": 0.08, + "grad_norm": 1.5993971027053606, + "learning_rate": 9.934716655073313e-06, + "loss": 0.5483, + "step": 1018 + }, + { + "epoch": 0.08, + "grad_norm": 0.6188171864925958, + "learning_rate": 9.93451164981575e-06, + "loss": 0.5725, + "step": 1019 + }, + { + "epoch": 0.08, + "grad_norm": 0.6627933713150337, + "learning_rate": 9.934306325301738e-06, + "loss": 0.5895, + "step": 1020 + }, + { + "epoch": 0.08, + "grad_norm": 1.3699299823882745, + "learning_rate": 9.934100681544565e-06, + "loss": 0.543, + "step": 1021 + }, + { + "epoch": 0.08, + "grad_norm": 0.6120803371109612, + "learning_rate": 9.93389471855753e-06, + "loss": 0.5759, + "step": 1022 + }, + { + "epoch": 0.08, + "grad_norm": 1.6643023868542766, + "learning_rate": 9.93368843635396e-06, + "loss": 0.5167, + "step": 1023 + }, + { + "epoch": 0.08, + "grad_norm": 1.9480069936880335, + "learning_rate": 9.933481834947202e-06, + "loss": 0.5837, + "step": 1024 + }, + { + "epoch": 0.08, + "grad_norm": 0.6510826524848268, + "learning_rate": 9.933274914350626e-06, + "loss": 0.5808, + "step": 1025 + }, + { + "epoch": 0.08, + "grad_norm": 0.6457080260430015, + "learning_rate": 9.933067674577615e-06, + "loss": 0.5836, + "step": 1026 + }, + { + "epoch": 0.08, + "grad_norm": 1.7591246229301212, + "learning_rate": 9.932860115641578e-06, + "loss": 0.5797, + "step": 1027 + }, + { + "epoch": 0.08, + "grad_norm": 1.5784890758539505, + "learning_rate": 9.932652237555944e-06, + "loss": 0.5733, + "step": 1028 + }, + { + "epoch": 0.08, + "grad_norm": 1.564394300783467, + "learning_rate": 9.932444040334164e-06, + "loss": 0.4743, + "step": 1029 + }, + { + "epoch": 0.08, + "grad_norm": 1.730185369519864, + "learning_rate": 9.932235523989708e-06, + "loss": 0.5221, + "step": 1030 + }, + { + "epoch": 0.08, + "grad_norm": 1.565744584361003, + "learning_rate": 9.932026688536064e-06, + "loss": 0.5422, + "step": 1031 + }, + { + "epoch": 0.08, + "grad_norm": 1.4576745608130557, + "learning_rate": 9.931817533986746e-06, + "loss": 0.5837, + "step": 1032 + }, + { + "epoch": 0.08, + "grad_norm": 2.3256309374259265, + "learning_rate": 9.931608060355285e-06, + "loss": 0.5568, + "step": 1033 + }, + { + "epoch": 0.08, + "grad_norm": 0.6115044490699204, + "learning_rate": 9.931398267655234e-06, + "loss": 0.5436, + "step": 1034 + }, + { + "epoch": 0.08, + "grad_norm": 1.615101172123175, + "learning_rate": 9.931188155900166e-06, + "loss": 0.5512, + "step": 1035 + }, + { + "epoch": 0.08, + "grad_norm": 3.264488167745833, + "learning_rate": 9.930977725103677e-06, + "loss": 0.5564, + "step": 1036 + }, + { + "epoch": 0.08, + "grad_norm": 1.9128849680919735, + "learning_rate": 9.930766975279377e-06, + "loss": 0.5419, + "step": 1037 + }, + { + "epoch": 0.08, + "grad_norm": 1.63106770573509, + "learning_rate": 9.930555906440904e-06, + "loss": 0.5486, + "step": 1038 + }, + { + "epoch": 0.08, + "grad_norm": 1.5462764376684095, + "learning_rate": 9.930344518601915e-06, + "loss": 0.5502, + "step": 1039 + }, + { + "epoch": 0.08, + "grad_norm": 2.0242733585929527, + "learning_rate": 9.930132811776086e-06, + "loss": 0.5811, + "step": 1040 + }, + { + "epoch": 0.08, + "grad_norm": 3.5013362333595275, + "learning_rate": 9.929920785977113e-06, + "loss": 0.542, + "step": 1041 + }, + { + "epoch": 0.08, + "grad_norm": 2.3139349306950163, + "learning_rate": 9.929708441218713e-06, + "loss": 0.5228, + "step": 1042 + }, + { + "epoch": 0.08, + "grad_norm": 1.666985306124112, + "learning_rate": 9.929495777514627e-06, + "loss": 0.5512, + "step": 1043 + }, + { + "epoch": 0.08, + "grad_norm": 1.7912828430283418, + "learning_rate": 9.929282794878612e-06, + "loss": 0.5435, + "step": 1044 + }, + { + "epoch": 0.08, + "grad_norm": 1.410374826009012, + "learning_rate": 9.92906949332445e-06, + "loss": 0.5212, + "step": 1045 + }, + { + "epoch": 0.08, + "grad_norm": 1.929853020598953, + "learning_rate": 9.928855872865939e-06, + "loss": 0.5749, + "step": 1046 + }, + { + "epoch": 0.08, + "grad_norm": 1.9628600701257146, + "learning_rate": 9.9286419335169e-06, + "loss": 0.6122, + "step": 1047 + }, + { + "epoch": 0.08, + "grad_norm": 1.6109600879356432, + "learning_rate": 9.928427675291176e-06, + "loss": 0.59, + "step": 1048 + }, + { + "epoch": 0.08, + "grad_norm": 1.5068988521060063, + "learning_rate": 9.928213098202628e-06, + "loss": 0.5009, + "step": 1049 + }, + { + "epoch": 0.08, + "grad_norm": 1.46949249489788, + "learning_rate": 9.92799820226514e-06, + "loss": 0.5564, + "step": 1050 + }, + { + "epoch": 0.08, + "grad_norm": 1.6146957557987986, + "learning_rate": 9.927782987492615e-06, + "loss": 0.5256, + "step": 1051 + }, + { + "epoch": 0.08, + "grad_norm": 1.5439560808845414, + "learning_rate": 9.927567453898976e-06, + "loss": 0.5295, + "step": 1052 + }, + { + "epoch": 0.08, + "grad_norm": 3.1587152677864214, + "learning_rate": 9.92735160149817e-06, + "loss": 0.5235, + "step": 1053 + }, + { + "epoch": 0.08, + "grad_norm": 2.058990840543154, + "learning_rate": 9.927135430304163e-06, + "loss": 0.514, + "step": 1054 + }, + { + "epoch": 0.08, + "grad_norm": 1.6311768782813831, + "learning_rate": 9.926918940330937e-06, + "loss": 0.514, + "step": 1055 + }, + { + "epoch": 0.08, + "grad_norm": 1.510142259114802, + "learning_rate": 9.926702131592499e-06, + "loss": 0.5425, + "step": 1056 + }, + { + "epoch": 0.08, + "grad_norm": 1.8900908226528865, + "learning_rate": 9.92648500410288e-06, + "loss": 0.4955, + "step": 1057 + }, + { + "epoch": 0.08, + "grad_norm": 1.9029963982653995, + "learning_rate": 9.926267557876126e-06, + "loss": 0.5404, + "step": 1058 + }, + { + "epoch": 0.08, + "grad_norm": 0.7082840035148038, + "learning_rate": 9.926049792926306e-06, + "loss": 0.582, + "step": 1059 + }, + { + "epoch": 0.08, + "grad_norm": 0.63287933963858, + "learning_rate": 9.925831709267508e-06, + "loss": 0.6011, + "step": 1060 + }, + { + "epoch": 0.08, + "grad_norm": 0.6044060767130703, + "learning_rate": 9.925613306913841e-06, + "loss": 0.5781, + "step": 1061 + }, + { + "epoch": 0.08, + "grad_norm": 1.9724293579053236, + "learning_rate": 9.925394585879437e-06, + "loss": 0.623, + "step": 1062 + }, + { + "epoch": 0.08, + "grad_norm": 2.6565961273592924, + "learning_rate": 9.925175546178446e-06, + "loss": 0.5364, + "step": 1063 + }, + { + "epoch": 0.08, + "grad_norm": 2.291512772489767, + "learning_rate": 9.924956187825042e-06, + "loss": 0.6025, + "step": 1064 + }, + { + "epoch": 0.08, + "grad_norm": 4.161368285830542, + "learning_rate": 9.924736510833414e-06, + "loss": 0.5129, + "step": 1065 + }, + { + "epoch": 0.08, + "grad_norm": 2.12218620517083, + "learning_rate": 9.924516515217777e-06, + "loss": 0.5816, + "step": 1066 + }, + { + "epoch": 0.08, + "grad_norm": 1.6374160170409087, + "learning_rate": 9.924296200992363e-06, + "loss": 0.5581, + "step": 1067 + }, + { + "epoch": 0.08, + "grad_norm": 1.6925732716441213, + "learning_rate": 9.924075568171426e-06, + "loss": 0.518, + "step": 1068 + }, + { + "epoch": 0.08, + "grad_norm": 1.46094466049942, + "learning_rate": 9.923854616769242e-06, + "loss": 0.5098, + "step": 1069 + }, + { + "epoch": 0.08, + "grad_norm": 2.0717882703457695, + "learning_rate": 9.923633346800106e-06, + "loss": 0.5407, + "step": 1070 + }, + { + "epoch": 0.08, + "grad_norm": 1.8481453871417897, + "learning_rate": 9.923411758278332e-06, + "loss": 0.52, + "step": 1071 + }, + { + "epoch": 0.08, + "grad_norm": 1.2527526706634229, + "learning_rate": 9.923189851218259e-06, + "loss": 0.4865, + "step": 1072 + }, + { + "epoch": 0.08, + "grad_norm": 1.8806352326109435, + "learning_rate": 9.922967625634242e-06, + "loss": 0.5239, + "step": 1073 + }, + { + "epoch": 0.08, + "grad_norm": 2.115634407199955, + "learning_rate": 9.92274508154066e-06, + "loss": 0.4786, + "step": 1074 + }, + { + "epoch": 0.08, + "grad_norm": 1.8050603884343028, + "learning_rate": 9.92252221895191e-06, + "loss": 0.5277, + "step": 1075 + }, + { + "epoch": 0.08, + "grad_norm": 1.224550206429174, + "learning_rate": 9.922299037882414e-06, + "loss": 0.606, + "step": 1076 + }, + { + "epoch": 0.08, + "grad_norm": 1.8670289813816698, + "learning_rate": 9.922075538346609e-06, + "loss": 0.5474, + "step": 1077 + }, + { + "epoch": 0.08, + "grad_norm": 1.8822859173716924, + "learning_rate": 9.921851720358955e-06, + "loss": 0.562, + "step": 1078 + }, + { + "epoch": 0.08, + "grad_norm": 1.5965102812637362, + "learning_rate": 9.921627583933933e-06, + "loss": 0.5324, + "step": 1079 + }, + { + "epoch": 0.08, + "grad_norm": 1.6361916028300054, + "learning_rate": 9.921403129086045e-06, + "loss": 0.5393, + "step": 1080 + }, + { + "epoch": 0.08, + "grad_norm": 1.7839271079939816, + "learning_rate": 9.921178355829814e-06, + "loss": 0.5129, + "step": 1081 + }, + { + "epoch": 0.08, + "grad_norm": 1.417877062497937, + "learning_rate": 9.92095326417978e-06, + "loss": 0.5705, + "step": 1082 + }, + { + "epoch": 0.09, + "grad_norm": 1.4910706289462825, + "learning_rate": 9.920727854150506e-06, + "loss": 0.5297, + "step": 1083 + }, + { + "epoch": 0.09, + "grad_norm": 1.8269739747028728, + "learning_rate": 9.920502125756578e-06, + "loss": 0.5774, + "step": 1084 + }, + { + "epoch": 0.09, + "grad_norm": 1.7114254833253686, + "learning_rate": 9.920276079012599e-06, + "loss": 0.5932, + "step": 1085 + }, + { + "epoch": 0.09, + "grad_norm": 1.6518328394775394, + "learning_rate": 9.920049713933194e-06, + "loss": 0.5203, + "step": 1086 + }, + { + "epoch": 0.09, + "grad_norm": 1.6603079275322299, + "learning_rate": 9.91982303053301e-06, + "loss": 0.5622, + "step": 1087 + }, + { + "epoch": 0.09, + "grad_norm": 2.015170251934802, + "learning_rate": 9.91959602882671e-06, + "loss": 0.5836, + "step": 1088 + }, + { + "epoch": 0.09, + "grad_norm": 1.0624073194477806, + "learning_rate": 9.919368708828984e-06, + "loss": 0.6064, + "step": 1089 + }, + { + "epoch": 0.09, + "grad_norm": 1.8912433537277038, + "learning_rate": 9.919141070554536e-06, + "loss": 0.5725, + "step": 1090 + }, + { + "epoch": 0.09, + "grad_norm": 1.6990275821998273, + "learning_rate": 9.918913114018098e-06, + "loss": 0.5509, + "step": 1091 + }, + { + "epoch": 0.09, + "grad_norm": 1.5317338871178732, + "learning_rate": 9.918684839234417e-06, + "loss": 0.5311, + "step": 1092 + }, + { + "epoch": 0.09, + "grad_norm": 1.75601592082343, + "learning_rate": 9.918456246218257e-06, + "loss": 0.553, + "step": 1093 + }, + { + "epoch": 0.09, + "grad_norm": 1.41188752375783, + "learning_rate": 9.918227334984415e-06, + "loss": 0.5342, + "step": 1094 + }, + { + "epoch": 0.09, + "grad_norm": 1.6735406371518926, + "learning_rate": 9.9179981055477e-06, + "loss": 0.5491, + "step": 1095 + }, + { + "epoch": 0.09, + "grad_norm": 0.6717981258329313, + "learning_rate": 9.917768557922938e-06, + "loss": 0.5913, + "step": 1096 + }, + { + "epoch": 0.09, + "grad_norm": 0.6377027054064629, + "learning_rate": 9.917538692124985e-06, + "loss": 0.6031, + "step": 1097 + }, + { + "epoch": 0.09, + "grad_norm": 1.415486373513553, + "learning_rate": 9.917308508168712e-06, + "loss": 0.5209, + "step": 1098 + }, + { + "epoch": 0.09, + "grad_norm": 1.4596525792172892, + "learning_rate": 9.91707800606901e-06, + "loss": 0.5589, + "step": 1099 + }, + { + "epoch": 0.09, + "grad_norm": 1.4543368451562873, + "learning_rate": 9.916847185840794e-06, + "loss": 0.5676, + "step": 1100 + }, + { + "epoch": 0.09, + "grad_norm": 1.5111023543429314, + "learning_rate": 9.916616047498997e-06, + "loss": 0.5702, + "step": 1101 + }, + { + "epoch": 0.09, + "grad_norm": 2.1472788063836186, + "learning_rate": 9.916384591058572e-06, + "loss": 0.5149, + "step": 1102 + }, + { + "epoch": 0.09, + "grad_norm": 1.5943060132923663, + "learning_rate": 9.916152816534498e-06, + "loss": 0.5309, + "step": 1103 + }, + { + "epoch": 0.09, + "grad_norm": 1.819489143091455, + "learning_rate": 9.915920723941766e-06, + "loss": 0.5937, + "step": 1104 + }, + { + "epoch": 0.09, + "grad_norm": 0.9050453179069587, + "learning_rate": 9.915688313295394e-06, + "loss": 0.6036, + "step": 1105 + }, + { + "epoch": 0.09, + "grad_norm": 1.8463856618307843, + "learning_rate": 9.915455584610421e-06, + "loss": 0.5552, + "step": 1106 + }, + { + "epoch": 0.09, + "grad_norm": 0.7220375909189171, + "learning_rate": 9.915222537901901e-06, + "loss": 0.5955, + "step": 1107 + }, + { + "epoch": 0.09, + "grad_norm": 0.6414912405376276, + "learning_rate": 9.91498917318491e-06, + "loss": 0.5723, + "step": 1108 + }, + { + "epoch": 0.09, + "grad_norm": 1.5847141620042753, + "learning_rate": 9.914755490474552e-06, + "loss": 0.4816, + "step": 1109 + }, + { + "epoch": 0.09, + "grad_norm": 1.5016225084197317, + "learning_rate": 9.914521489785942e-06, + "loss": 0.5772, + "step": 1110 + }, + { + "epoch": 0.09, + "grad_norm": 2.420942215296239, + "learning_rate": 9.91428717113422e-06, + "loss": 0.5954, + "step": 1111 + }, + { + "epoch": 0.09, + "grad_norm": 1.8200109828718756, + "learning_rate": 9.914052534534549e-06, + "loss": 0.5476, + "step": 1112 + }, + { + "epoch": 0.09, + "grad_norm": 1.707095152757118, + "learning_rate": 9.913817580002105e-06, + "loss": 0.5576, + "step": 1113 + }, + { + "epoch": 0.09, + "grad_norm": 1.6163665457304273, + "learning_rate": 9.913582307552093e-06, + "loss": 0.5664, + "step": 1114 + }, + { + "epoch": 0.09, + "grad_norm": 1.9017239011073102, + "learning_rate": 9.913346717199732e-06, + "loss": 0.552, + "step": 1115 + }, + { + "epoch": 0.09, + "grad_norm": 1.644169730197281, + "learning_rate": 9.913110808960267e-06, + "loss": 0.5806, + "step": 1116 + }, + { + "epoch": 0.09, + "grad_norm": 2.3921308909827137, + "learning_rate": 9.912874582848962e-06, + "loss": 0.5274, + "step": 1117 + }, + { + "epoch": 0.09, + "grad_norm": 3.2225075993943535, + "learning_rate": 9.912638038881095e-06, + "loss": 0.5494, + "step": 1118 + }, + { + "epoch": 0.09, + "grad_norm": 1.778695859631835, + "learning_rate": 9.912401177071975e-06, + "loss": 0.574, + "step": 1119 + }, + { + "epoch": 0.09, + "grad_norm": 1.4082362118868081, + "learning_rate": 9.912163997436924e-06, + "loss": 0.5081, + "step": 1120 + }, + { + "epoch": 0.09, + "grad_norm": 1.570397398568426, + "learning_rate": 9.911926499991289e-06, + "loss": 0.4997, + "step": 1121 + }, + { + "epoch": 0.09, + "grad_norm": 1.6477497021227163, + "learning_rate": 9.911688684750434e-06, + "loss": 0.491, + "step": 1122 + }, + { + "epoch": 0.09, + "grad_norm": 2.084614691566177, + "learning_rate": 9.911450551729748e-06, + "loss": 0.5064, + "step": 1123 + }, + { + "epoch": 0.09, + "grad_norm": 1.7261631412640468, + "learning_rate": 9.911212100944635e-06, + "loss": 0.5378, + "step": 1124 + }, + { + "epoch": 0.09, + "grad_norm": 1.7702709946863495, + "learning_rate": 9.910973332410525e-06, + "loss": 0.5366, + "step": 1125 + }, + { + "epoch": 0.09, + "grad_norm": 1.425931522713767, + "learning_rate": 9.910734246142863e-06, + "loss": 0.6441, + "step": 1126 + }, + { + "epoch": 0.09, + "grad_norm": 1.1138034968221866, + "learning_rate": 9.910494842157122e-06, + "loss": 0.6148, + "step": 1127 + }, + { + "epoch": 0.09, + "grad_norm": 0.7728583640950348, + "learning_rate": 9.910255120468786e-06, + "loss": 0.579, + "step": 1128 + }, + { + "epoch": 0.09, + "grad_norm": 1.9569603924074712, + "learning_rate": 9.910015081093368e-06, + "loss": 0.5742, + "step": 1129 + }, + { + "epoch": 0.09, + "grad_norm": 1.977089611745831, + "learning_rate": 9.909774724046398e-06, + "loss": 0.5104, + "step": 1130 + }, + { + "epoch": 0.09, + "grad_norm": 2.018412895390878, + "learning_rate": 9.909534049343425e-06, + "loss": 0.549, + "step": 1131 + }, + { + "epoch": 0.09, + "grad_norm": 1.681252942241913, + "learning_rate": 9.909293057000023e-06, + "loss": 0.5117, + "step": 1132 + }, + { + "epoch": 0.09, + "grad_norm": 1.779568602619412, + "learning_rate": 9.909051747031779e-06, + "loss": 0.5848, + "step": 1133 + }, + { + "epoch": 0.09, + "grad_norm": 1.8169564653913255, + "learning_rate": 9.908810119454314e-06, + "loss": 0.6466, + "step": 1134 + }, + { + "epoch": 0.09, + "grad_norm": 1.9603714693645404, + "learning_rate": 9.90856817428325e-06, + "loss": 0.534, + "step": 1135 + }, + { + "epoch": 0.09, + "grad_norm": 1.7850962623196969, + "learning_rate": 9.908325911534252e-06, + "loss": 0.5557, + "step": 1136 + }, + { + "epoch": 0.09, + "grad_norm": 2.0202431046437184, + "learning_rate": 9.908083331222984e-06, + "loss": 0.5159, + "step": 1137 + }, + { + "epoch": 0.09, + "grad_norm": 1.3433708111638265, + "learning_rate": 9.907840433365147e-06, + "loss": 0.6575, + "step": 1138 + }, + { + "epoch": 0.09, + "grad_norm": 1.8939310334819288, + "learning_rate": 9.907597217976455e-06, + "loss": 0.5019, + "step": 1139 + }, + { + "epoch": 0.09, + "grad_norm": 1.5784776243039382, + "learning_rate": 9.90735368507264e-06, + "loss": 0.4981, + "step": 1140 + }, + { + "epoch": 0.09, + "grad_norm": 2.124867962676086, + "learning_rate": 9.907109834669465e-06, + "loss": 0.4753, + "step": 1141 + }, + { + "epoch": 0.09, + "grad_norm": 1.8039609130892484, + "learning_rate": 9.9068656667827e-06, + "loss": 0.5625, + "step": 1142 + }, + { + "epoch": 0.09, + "grad_norm": 1.5044615506449317, + "learning_rate": 9.906621181428147e-06, + "loss": 0.4979, + "step": 1143 + }, + { + "epoch": 0.09, + "grad_norm": 1.8432820805843526, + "learning_rate": 9.906376378621622e-06, + "loss": 0.5249, + "step": 1144 + }, + { + "epoch": 0.09, + "grad_norm": 2.0730345028416575, + "learning_rate": 9.906131258378962e-06, + "loss": 0.5799, + "step": 1145 + }, + { + "epoch": 0.09, + "grad_norm": 1.4493145163681083, + "learning_rate": 9.90588582071603e-06, + "loss": 0.6159, + "step": 1146 + }, + { + "epoch": 0.09, + "grad_norm": 1.409714049828381, + "learning_rate": 9.905640065648702e-06, + "loss": 0.4979, + "step": 1147 + }, + { + "epoch": 0.09, + "grad_norm": 2.5676881416873902, + "learning_rate": 9.90539399319288e-06, + "loss": 0.5713, + "step": 1148 + }, + { + "epoch": 0.09, + "grad_norm": 0.7848371650749066, + "learning_rate": 9.905147603364482e-06, + "loss": 0.5889, + "step": 1149 + }, + { + "epoch": 0.09, + "grad_norm": 1.8876131380260615, + "learning_rate": 9.904900896179453e-06, + "loss": 0.605, + "step": 1150 + }, + { + "epoch": 0.09, + "grad_norm": 1.829473423719891, + "learning_rate": 9.90465387165375e-06, + "loss": 0.457, + "step": 1151 + }, + { + "epoch": 0.09, + "grad_norm": 1.8076063906281903, + "learning_rate": 9.904406529803358e-06, + "loss": 0.5512, + "step": 1152 + }, + { + "epoch": 0.09, + "grad_norm": 1.7666782174284146, + "learning_rate": 9.90415887064428e-06, + "loss": 0.5702, + "step": 1153 + }, + { + "epoch": 0.09, + "grad_norm": 1.7877188920761047, + "learning_rate": 9.903910894192539e-06, + "loss": 0.5167, + "step": 1154 + }, + { + "epoch": 0.09, + "grad_norm": 1.742603751782403, + "learning_rate": 9.903662600464178e-06, + "loss": 0.5379, + "step": 1155 + }, + { + "epoch": 0.09, + "grad_norm": 1.2913534156314952, + "learning_rate": 9.90341398947526e-06, + "loss": 0.6054, + "step": 1156 + }, + { + "epoch": 0.09, + "grad_norm": 2.2253206110079993, + "learning_rate": 9.903165061241874e-06, + "loss": 0.5382, + "step": 1157 + }, + { + "epoch": 0.09, + "grad_norm": 1.6770581552745216, + "learning_rate": 9.90291581578012e-06, + "loss": 0.5075, + "step": 1158 + }, + { + "epoch": 0.09, + "grad_norm": 1.9240297676423572, + "learning_rate": 9.90266625310613e-06, + "loss": 0.5482, + "step": 1159 + }, + { + "epoch": 0.09, + "grad_norm": 0.7813365269240671, + "learning_rate": 9.902416373236045e-06, + "loss": 0.5696, + "step": 1160 + }, + { + "epoch": 0.09, + "grad_norm": 1.4683662296453355, + "learning_rate": 9.902166176186033e-06, + "loss": 0.506, + "step": 1161 + }, + { + "epoch": 0.09, + "grad_norm": 1.676338790144313, + "learning_rate": 9.901915661972283e-06, + "loss": 0.5263, + "step": 1162 + }, + { + "epoch": 0.09, + "grad_norm": 1.8655221044721126, + "learning_rate": 9.901664830611001e-06, + "loss": 0.5655, + "step": 1163 + }, + { + "epoch": 0.09, + "grad_norm": 2.2925530866968598, + "learning_rate": 9.90141368211842e-06, + "loss": 0.5282, + "step": 1164 + }, + { + "epoch": 0.09, + "grad_norm": 1.7252102077236804, + "learning_rate": 9.901162216510782e-06, + "loss": 0.516, + "step": 1165 + }, + { + "epoch": 0.09, + "grad_norm": 1.5643643831272696, + "learning_rate": 9.900910433804362e-06, + "loss": 0.5535, + "step": 1166 + }, + { + "epoch": 0.09, + "grad_norm": 1.8168170881362014, + "learning_rate": 9.900658334015447e-06, + "loss": 0.54, + "step": 1167 + }, + { + "epoch": 0.09, + "grad_norm": 1.476310146007673, + "learning_rate": 9.90040591716035e-06, + "loss": 0.5048, + "step": 1168 + }, + { + "epoch": 0.09, + "grad_norm": 1.921811761595845, + "learning_rate": 9.900153183255399e-06, + "loss": 0.5389, + "step": 1169 + }, + { + "epoch": 0.09, + "grad_norm": 1.4756422899686497, + "learning_rate": 9.899900132316948e-06, + "loss": 0.5052, + "step": 1170 + }, + { + "epoch": 0.09, + "grad_norm": 1.802787367686697, + "learning_rate": 9.89964676436137e-06, + "loss": 0.5695, + "step": 1171 + }, + { + "epoch": 0.09, + "grad_norm": 1.6473922553722975, + "learning_rate": 9.899393079405052e-06, + "loss": 0.509, + "step": 1172 + }, + { + "epoch": 0.09, + "grad_norm": 2.5675827943889926, + "learning_rate": 9.899139077464415e-06, + "loss": 0.5399, + "step": 1173 + }, + { + "epoch": 0.09, + "grad_norm": 1.7172682241451236, + "learning_rate": 9.898884758555886e-06, + "loss": 0.5703, + "step": 1174 + }, + { + "epoch": 0.09, + "grad_norm": 1.0029849527498562, + "learning_rate": 9.898630122695923e-06, + "loss": 0.5917, + "step": 1175 + }, + { + "epoch": 0.09, + "grad_norm": 0.6910533251645322, + "learning_rate": 9.898375169901e-06, + "loss": 0.5799, + "step": 1176 + }, + { + "epoch": 0.09, + "grad_norm": 2.194931525722795, + "learning_rate": 9.89811990018761e-06, + "loss": 0.5571, + "step": 1177 + }, + { + "epoch": 0.09, + "grad_norm": 1.3894673980832237, + "learning_rate": 9.89786431357227e-06, + "loss": 0.5292, + "step": 1178 + }, + { + "epoch": 0.09, + "grad_norm": 1.8268981789954277, + "learning_rate": 9.897608410071516e-06, + "loss": 0.5117, + "step": 1179 + }, + { + "epoch": 0.09, + "grad_norm": 1.4192589651751089, + "learning_rate": 9.897352189701905e-06, + "loss": 0.496, + "step": 1180 + }, + { + "epoch": 0.09, + "grad_norm": 1.6878047301647991, + "learning_rate": 9.897095652480015e-06, + "loss": 0.5488, + "step": 1181 + }, + { + "epoch": 0.09, + "grad_norm": 1.6771007886010452, + "learning_rate": 9.896838798422442e-06, + "loss": 0.4723, + "step": 1182 + }, + { + "epoch": 0.09, + "grad_norm": 1.769328655821698, + "learning_rate": 9.896581627545803e-06, + "loss": 0.507, + "step": 1183 + }, + { + "epoch": 0.09, + "grad_norm": 1.3991607416495504, + "learning_rate": 9.89632413986674e-06, + "loss": 0.4844, + "step": 1184 + }, + { + "epoch": 0.09, + "grad_norm": 1.7840591629159244, + "learning_rate": 9.89606633540191e-06, + "loss": 0.5406, + "step": 1185 + }, + { + "epoch": 0.09, + "grad_norm": 1.6645683450276998, + "learning_rate": 9.895808214167993e-06, + "loss": 0.6108, + "step": 1186 + }, + { + "epoch": 0.09, + "grad_norm": 1.468709595267863, + "learning_rate": 9.895549776181688e-06, + "loss": 0.5086, + "step": 1187 + }, + { + "epoch": 0.09, + "grad_norm": 1.5995715842468021, + "learning_rate": 9.895291021459717e-06, + "loss": 0.5424, + "step": 1188 + }, + { + "epoch": 0.09, + "grad_norm": 1.942887596448376, + "learning_rate": 9.89503195001882e-06, + "loss": 0.5499, + "step": 1189 + }, + { + "epoch": 0.09, + "grad_norm": 2.0231796313865105, + "learning_rate": 9.894772561875762e-06, + "loss": 0.5459, + "step": 1190 + }, + { + "epoch": 0.09, + "grad_norm": 1.3985537761117157, + "learning_rate": 9.89451285704732e-06, + "loss": 0.5304, + "step": 1191 + }, + { + "epoch": 0.09, + "grad_norm": 1.2671654084436923, + "learning_rate": 9.8942528355503e-06, + "loss": 0.6083, + "step": 1192 + }, + { + "epoch": 0.09, + "grad_norm": 1.4890698610583495, + "learning_rate": 9.893992497401525e-06, + "loss": 0.4915, + "step": 1193 + }, + { + "epoch": 0.09, + "grad_norm": 2.179985887695711, + "learning_rate": 9.893731842617837e-06, + "loss": 0.5406, + "step": 1194 + }, + { + "epoch": 0.09, + "grad_norm": 1.7280838176852462, + "learning_rate": 9.8934708712161e-06, + "loss": 0.5482, + "step": 1195 + }, + { + "epoch": 0.09, + "grad_norm": 2.009635792352556, + "learning_rate": 9.8932095832132e-06, + "loss": 0.5418, + "step": 1196 + }, + { + "epoch": 0.09, + "grad_norm": 1.8085803562648344, + "learning_rate": 9.89294797862604e-06, + "loss": 0.5237, + "step": 1197 + }, + { + "epoch": 0.09, + "grad_norm": 1.9624385187839906, + "learning_rate": 9.892686057471546e-06, + "loss": 0.5413, + "step": 1198 + }, + { + "epoch": 0.09, + "grad_norm": 1.3496477164764153, + "learning_rate": 9.892423819766665e-06, + "loss": 0.5749, + "step": 1199 + }, + { + "epoch": 0.09, + "grad_norm": 1.5307520083689794, + "learning_rate": 9.892161265528364e-06, + "loss": 0.5132, + "step": 1200 + }, + { + "epoch": 0.09, + "grad_norm": 3.0645863515855494, + "learning_rate": 9.891898394773627e-06, + "loss": 0.5128, + "step": 1201 + }, + { + "epoch": 0.09, + "grad_norm": 0.8214767289810571, + "learning_rate": 9.891635207519465e-06, + "loss": 0.5872, + "step": 1202 + }, + { + "epoch": 0.09, + "grad_norm": 1.7236140506117072, + "learning_rate": 9.891371703782903e-06, + "loss": 0.487, + "step": 1203 + }, + { + "epoch": 0.09, + "grad_norm": 1.3945748066681585, + "learning_rate": 9.891107883580991e-06, + "loss": 0.4747, + "step": 1204 + }, + { + "epoch": 0.09, + "grad_norm": 1.797244860512192, + "learning_rate": 9.890843746930798e-06, + "loss": 0.5223, + "step": 1205 + }, + { + "epoch": 0.09, + "grad_norm": 1.7982323161894755, + "learning_rate": 9.890579293849411e-06, + "loss": 0.6009, + "step": 1206 + }, + { + "epoch": 0.09, + "grad_norm": 1.7127512640065596, + "learning_rate": 9.890314524353943e-06, + "loss": 0.5318, + "step": 1207 + }, + { + "epoch": 0.09, + "grad_norm": 0.6932316447311612, + "learning_rate": 9.890049438461522e-06, + "loss": 0.5865, + "step": 1208 + }, + { + "epoch": 0.09, + "grad_norm": 1.6083723596524213, + "learning_rate": 9.889784036189299e-06, + "loss": 0.5334, + "step": 1209 + }, + { + "epoch": 0.1, + "grad_norm": 1.4163273722969887, + "learning_rate": 9.889518317554446e-06, + "loss": 0.4844, + "step": 1210 + }, + { + "epoch": 0.1, + "grad_norm": 0.6544059924690359, + "learning_rate": 9.889252282574154e-06, + "loss": 0.586, + "step": 1211 + }, + { + "epoch": 0.1, + "grad_norm": 1.8249012799504838, + "learning_rate": 9.888985931265636e-06, + "loss": 0.5191, + "step": 1212 + }, + { + "epoch": 0.1, + "grad_norm": 1.437528382478115, + "learning_rate": 9.888719263646123e-06, + "loss": 0.5303, + "step": 1213 + }, + { + "epoch": 0.1, + "grad_norm": 1.7918008831203407, + "learning_rate": 9.888452279732869e-06, + "loss": 0.5279, + "step": 1214 + }, + { + "epoch": 0.1, + "grad_norm": 1.8608842217306174, + "learning_rate": 9.888184979543147e-06, + "loss": 0.5879, + "step": 1215 + }, + { + "epoch": 0.1, + "grad_norm": 0.6838552323971444, + "learning_rate": 9.887917363094251e-06, + "loss": 0.5707, + "step": 1216 + }, + { + "epoch": 0.1, + "grad_norm": 1.7540621116587611, + "learning_rate": 9.887649430403496e-06, + "loss": 0.5425, + "step": 1217 + }, + { + "epoch": 0.1, + "grad_norm": 1.9588091992275638, + "learning_rate": 9.887381181488218e-06, + "loss": 0.4948, + "step": 1218 + }, + { + "epoch": 0.1, + "grad_norm": 1.6194388994458946, + "learning_rate": 9.88711261636577e-06, + "loss": 0.556, + "step": 1219 + }, + { + "epoch": 0.1, + "grad_norm": 1.756001502283283, + "learning_rate": 9.886843735053529e-06, + "loss": 0.5522, + "step": 1220 + }, + { + "epoch": 0.1, + "grad_norm": 0.6867062350229141, + "learning_rate": 9.88657453756889e-06, + "loss": 0.5829, + "step": 1221 + }, + { + "epoch": 0.1, + "grad_norm": 2.0274529200836353, + "learning_rate": 9.886305023929275e-06, + "loss": 0.5188, + "step": 1222 + }, + { + "epoch": 0.1, + "grad_norm": 2.008126032463789, + "learning_rate": 9.886035194152112e-06, + "loss": 0.5163, + "step": 1223 + }, + { + "epoch": 0.1, + "grad_norm": 1.471211758607614, + "learning_rate": 9.885765048254868e-06, + "loss": 0.5069, + "step": 1224 + }, + { + "epoch": 0.1, + "grad_norm": 0.6639189530910646, + "learning_rate": 9.885494586255014e-06, + "loss": 0.6089, + "step": 1225 + }, + { + "epoch": 0.1, + "grad_norm": 1.4257934648675714, + "learning_rate": 9.885223808170052e-06, + "loss": 0.5742, + "step": 1226 + }, + { + "epoch": 0.1, + "grad_norm": 1.5721368863947638, + "learning_rate": 9.8849527140175e-06, + "loss": 0.4974, + "step": 1227 + }, + { + "epoch": 0.1, + "grad_norm": 1.5520222587908474, + "learning_rate": 9.884681303814897e-06, + "loss": 0.5552, + "step": 1228 + }, + { + "epoch": 0.1, + "grad_norm": 1.7262446028307725, + "learning_rate": 9.884409577579806e-06, + "loss": 0.4991, + "step": 1229 + }, + { + "epoch": 0.1, + "grad_norm": 0.5708671441469854, + "learning_rate": 9.884137535329804e-06, + "loss": 0.5775, + "step": 1230 + }, + { + "epoch": 0.1, + "grad_norm": 2.952333054597408, + "learning_rate": 9.88386517708249e-06, + "loss": 0.5548, + "step": 1231 + }, + { + "epoch": 0.1, + "grad_norm": 0.6833378299804076, + "learning_rate": 9.88359250285549e-06, + "loss": 0.5882, + "step": 1232 + }, + { + "epoch": 0.1, + "grad_norm": 2.520028970153725, + "learning_rate": 9.883319512666445e-06, + "loss": 0.518, + "step": 1233 + }, + { + "epoch": 0.1, + "grad_norm": 1.6256232805447575, + "learning_rate": 9.883046206533013e-06, + "loss": 0.5202, + "step": 1234 + }, + { + "epoch": 0.1, + "grad_norm": 1.3382413282029975, + "learning_rate": 9.88277258447288e-06, + "loss": 0.5141, + "step": 1235 + }, + { + "epoch": 0.1, + "grad_norm": 1.7050831348380444, + "learning_rate": 9.88249864650375e-06, + "loss": 0.564, + "step": 1236 + }, + { + "epoch": 0.1, + "grad_norm": 1.615472673312182, + "learning_rate": 9.882224392643345e-06, + "loss": 0.494, + "step": 1237 + }, + { + "epoch": 0.1, + "grad_norm": 2.0195638334355883, + "learning_rate": 9.881949822909407e-06, + "loss": 0.5014, + "step": 1238 + }, + { + "epoch": 0.1, + "grad_norm": 1.4666438591452124, + "learning_rate": 9.881674937319701e-06, + "loss": 0.5253, + "step": 1239 + }, + { + "epoch": 0.1, + "grad_norm": 1.72783375265924, + "learning_rate": 9.881399735892015e-06, + "loss": 0.5927, + "step": 1240 + }, + { + "epoch": 0.1, + "grad_norm": 1.7029540999984278, + "learning_rate": 9.881124218644149e-06, + "loss": 0.5219, + "step": 1241 + }, + { + "epoch": 0.1, + "grad_norm": 1.612080122698668, + "learning_rate": 9.880848385593933e-06, + "loss": 0.4798, + "step": 1242 + }, + { + "epoch": 0.1, + "grad_norm": 3.0919602926902234, + "learning_rate": 9.880572236759212e-06, + "loss": 0.4861, + "step": 1243 + }, + { + "epoch": 0.1, + "grad_norm": 1.6938457806945626, + "learning_rate": 9.880295772157851e-06, + "loss": 0.5135, + "step": 1244 + }, + { + "epoch": 0.1, + "grad_norm": 1.5911468138287221, + "learning_rate": 9.880018991807738e-06, + "loss": 0.5113, + "step": 1245 + }, + { + "epoch": 0.1, + "grad_norm": 1.661098829903792, + "learning_rate": 9.87974189572678e-06, + "loss": 0.5255, + "step": 1246 + }, + { + "epoch": 0.1, + "grad_norm": 0.6772855822065804, + "learning_rate": 9.879464483932903e-06, + "loss": 0.605, + "step": 1247 + }, + { + "epoch": 0.1, + "grad_norm": 0.6571045413883058, + "learning_rate": 9.87918675644406e-06, + "loss": 0.5996, + "step": 1248 + }, + { + "epoch": 0.1, + "grad_norm": 0.5724475114954535, + "learning_rate": 9.878908713278215e-06, + "loss": 0.5825, + "step": 1249 + }, + { + "epoch": 0.1, + "grad_norm": 1.6097375050400653, + "learning_rate": 9.87863035445336e-06, + "loss": 0.5275, + "step": 1250 + }, + { + "epoch": 0.1, + "grad_norm": 1.9651590123796776, + "learning_rate": 9.8783516799875e-06, + "loss": 0.5599, + "step": 1251 + }, + { + "epoch": 0.1, + "grad_norm": 1.708592695708444, + "learning_rate": 9.878072689898672e-06, + "loss": 0.5862, + "step": 1252 + }, + { + "epoch": 0.1, + "grad_norm": 1.889189301901958, + "learning_rate": 9.87779338420492e-06, + "loss": 0.5906, + "step": 1253 + }, + { + "epoch": 0.1, + "grad_norm": 1.9150809881336093, + "learning_rate": 9.877513762924318e-06, + "loss": 0.5375, + "step": 1254 + }, + { + "epoch": 0.1, + "grad_norm": 1.563942624315454, + "learning_rate": 9.877233826074956e-06, + "loss": 0.5229, + "step": 1255 + }, + { + "epoch": 0.1, + "grad_norm": 1.885316798708101, + "learning_rate": 9.876953573674946e-06, + "loss": 0.535, + "step": 1256 + }, + { + "epoch": 0.1, + "grad_norm": 1.6721612372487031, + "learning_rate": 9.876673005742417e-06, + "loss": 0.5169, + "step": 1257 + }, + { + "epoch": 0.1, + "grad_norm": 1.7369683493765, + "learning_rate": 9.876392122295526e-06, + "loss": 0.5679, + "step": 1258 + }, + { + "epoch": 0.1, + "grad_norm": 1.6703920825731133, + "learning_rate": 9.876110923352445e-06, + "loss": 0.5365, + "step": 1259 + }, + { + "epoch": 0.1, + "grad_norm": 0.793162907991587, + "learning_rate": 9.875829408931363e-06, + "loss": 0.5848, + "step": 1260 + }, + { + "epoch": 0.1, + "grad_norm": 1.829510897559637, + "learning_rate": 9.875547579050499e-06, + "loss": 0.5124, + "step": 1261 + }, + { + "epoch": 0.1, + "grad_norm": 1.4748272604271628, + "learning_rate": 9.875265433728085e-06, + "loss": 0.5415, + "step": 1262 + }, + { + "epoch": 0.1, + "grad_norm": 2.0439648587007704, + "learning_rate": 9.874982972982374e-06, + "loss": 0.512, + "step": 1263 + }, + { + "epoch": 0.1, + "grad_norm": 1.6352923371075194, + "learning_rate": 9.874700196831641e-06, + "loss": 0.4964, + "step": 1264 + }, + { + "epoch": 0.1, + "grad_norm": 1.736061006334386, + "learning_rate": 9.874417105294184e-06, + "loss": 0.5157, + "step": 1265 + }, + { + "epoch": 0.1, + "grad_norm": 0.5881587037058288, + "learning_rate": 9.874133698388316e-06, + "loss": 0.5706, + "step": 1266 + }, + { + "epoch": 0.1, + "grad_norm": 1.8358642890634616, + "learning_rate": 9.873849976132375e-06, + "loss": 0.5549, + "step": 1267 + }, + { + "epoch": 0.1, + "grad_norm": 1.57107991887304, + "learning_rate": 9.873565938544714e-06, + "loss": 0.5568, + "step": 1268 + }, + { + "epoch": 0.1, + "grad_norm": 0.6572499251655388, + "learning_rate": 9.873281585643715e-06, + "loss": 0.6017, + "step": 1269 + }, + { + "epoch": 0.1, + "grad_norm": 1.826058026132881, + "learning_rate": 9.87299691744777e-06, + "loss": 0.5527, + "step": 1270 + }, + { + "epoch": 0.1, + "grad_norm": 1.763590780015478, + "learning_rate": 9.8727119339753e-06, + "loss": 0.528, + "step": 1271 + }, + { + "epoch": 0.1, + "grad_norm": 1.812536053303485, + "learning_rate": 9.872426635244742e-06, + "loss": 0.5591, + "step": 1272 + }, + { + "epoch": 0.1, + "grad_norm": 0.6504726498169892, + "learning_rate": 9.872141021274554e-06, + "loss": 0.592, + "step": 1273 + }, + { + "epoch": 0.1, + "grad_norm": 8.379879107627136, + "learning_rate": 9.871855092083218e-06, + "loss": 0.568, + "step": 1274 + }, + { + "epoch": 0.1, + "grad_norm": 1.7727368237544894, + "learning_rate": 9.871568847689228e-06, + "loss": 0.4708, + "step": 1275 + }, + { + "epoch": 0.1, + "grad_norm": 3.0473632667752346, + "learning_rate": 9.871282288111109e-06, + "loss": 0.5511, + "step": 1276 + }, + { + "epoch": 0.1, + "grad_norm": 1.6548522612466148, + "learning_rate": 9.870995413367397e-06, + "loss": 0.5473, + "step": 1277 + }, + { + "epoch": 0.1, + "grad_norm": 1.694968127640105, + "learning_rate": 9.870708223476653e-06, + "loss": 0.486, + "step": 1278 + }, + { + "epoch": 0.1, + "grad_norm": 2.5146314248321153, + "learning_rate": 9.870420718457458e-06, + "loss": 0.5625, + "step": 1279 + }, + { + "epoch": 0.1, + "grad_norm": 1.6934324925723094, + "learning_rate": 9.870132898328417e-06, + "loss": 0.528, + "step": 1280 + }, + { + "epoch": 0.1, + "grad_norm": 1.6060753301742932, + "learning_rate": 9.869844763108146e-06, + "loss": 0.5657, + "step": 1281 + }, + { + "epoch": 0.1, + "grad_norm": 1.9161951237878325, + "learning_rate": 9.869556312815289e-06, + "loss": 0.515, + "step": 1282 + }, + { + "epoch": 0.1, + "grad_norm": 1.5818907560780635, + "learning_rate": 9.86926754746851e-06, + "loss": 0.5839, + "step": 1283 + }, + { + "epoch": 0.1, + "grad_norm": 1.314610828847146, + "learning_rate": 9.868978467086491e-06, + "loss": 0.5163, + "step": 1284 + }, + { + "epoch": 0.1, + "grad_norm": 2.0328770026992995, + "learning_rate": 9.868689071687933e-06, + "loss": 0.4814, + "step": 1285 + }, + { + "epoch": 0.1, + "grad_norm": 0.6747212853550181, + "learning_rate": 9.868399361291562e-06, + "loss": 0.5957, + "step": 1286 + }, + { + "epoch": 0.1, + "grad_norm": 2.1030080651097802, + "learning_rate": 9.86810933591612e-06, + "loss": 0.569, + "step": 1287 + }, + { + "epoch": 0.1, + "grad_norm": 1.6495845441356625, + "learning_rate": 9.867818995580374e-06, + "loss": 0.5093, + "step": 1288 + }, + { + "epoch": 0.1, + "grad_norm": 1.584903648441113, + "learning_rate": 9.867528340303106e-06, + "loss": 0.4968, + "step": 1289 + }, + { + "epoch": 0.1, + "grad_norm": 1.4761169676700796, + "learning_rate": 9.867237370103122e-06, + "loss": 0.5339, + "step": 1290 + }, + { + "epoch": 0.1, + "grad_norm": 1.5423299442713723, + "learning_rate": 9.866946084999248e-06, + "loss": 0.5407, + "step": 1291 + }, + { + "epoch": 0.1, + "grad_norm": 1.652278681218033, + "learning_rate": 9.866654485010328e-06, + "loss": 0.5696, + "step": 1292 + }, + { + "epoch": 0.1, + "grad_norm": 1.5666281947299256, + "learning_rate": 9.86636257015523e-06, + "loss": 0.5754, + "step": 1293 + }, + { + "epoch": 0.1, + "grad_norm": 1.700553659731398, + "learning_rate": 9.866070340452838e-06, + "loss": 0.5546, + "step": 1294 + }, + { + "epoch": 0.1, + "grad_norm": 1.7590005795153083, + "learning_rate": 9.865777795922063e-06, + "loss": 0.526, + "step": 1295 + }, + { + "epoch": 0.1, + "grad_norm": 1.7584781049415148, + "learning_rate": 9.865484936581828e-06, + "loss": 0.5194, + "step": 1296 + }, + { + "epoch": 0.1, + "grad_norm": 1.6470975460098836, + "learning_rate": 9.865191762451084e-06, + "loss": 0.5433, + "step": 1297 + }, + { + "epoch": 0.1, + "grad_norm": 0.6680342145032583, + "learning_rate": 9.864898273548795e-06, + "loss": 0.5717, + "step": 1298 + }, + { + "epoch": 0.1, + "grad_norm": 2.768194547229024, + "learning_rate": 9.864604469893955e-06, + "loss": 0.4997, + "step": 1299 + }, + { + "epoch": 0.1, + "grad_norm": 2.2541545361701116, + "learning_rate": 9.864310351505567e-06, + "loss": 0.5424, + "step": 1300 + }, + { + "epoch": 0.1, + "grad_norm": 1.9917846693599401, + "learning_rate": 9.864015918402663e-06, + "loss": 0.5349, + "step": 1301 + }, + { + "epoch": 0.1, + "grad_norm": 2.074978244726442, + "learning_rate": 9.863721170604292e-06, + "loss": 0.5732, + "step": 1302 + }, + { + "epoch": 0.1, + "grad_norm": 1.9285982627310976, + "learning_rate": 9.863426108129526e-06, + "loss": 0.4947, + "step": 1303 + }, + { + "epoch": 0.1, + "grad_norm": 0.6210590885819158, + "learning_rate": 9.86313073099745e-06, + "loss": 0.5657, + "step": 1304 + }, + { + "epoch": 0.1, + "grad_norm": 2.757925430682471, + "learning_rate": 9.862835039227179e-06, + "loss": 0.5, + "step": 1305 + }, + { + "epoch": 0.1, + "grad_norm": 1.737712200477531, + "learning_rate": 9.862539032837842e-06, + "loss": 0.5606, + "step": 1306 + }, + { + "epoch": 0.1, + "grad_norm": 0.6431584145872844, + "learning_rate": 9.862242711848591e-06, + "loss": 0.584, + "step": 1307 + }, + { + "epoch": 0.1, + "grad_norm": 1.5422463729744336, + "learning_rate": 9.861946076278597e-06, + "loss": 0.5563, + "step": 1308 + }, + { + "epoch": 0.1, + "grad_norm": 2.127514497102055, + "learning_rate": 9.861649126147051e-06, + "loss": 0.5697, + "step": 1309 + }, + { + "epoch": 0.1, + "grad_norm": 0.6040292275629819, + "learning_rate": 9.861351861473168e-06, + "loss": 0.5708, + "step": 1310 + }, + { + "epoch": 0.1, + "grad_norm": 37.84452125351232, + "learning_rate": 9.861054282276176e-06, + "loss": 0.5082, + "step": 1311 + }, + { + "epoch": 0.1, + "grad_norm": 1.533607621046888, + "learning_rate": 9.860756388575335e-06, + "loss": 0.5099, + "step": 1312 + }, + { + "epoch": 0.1, + "grad_norm": 1.4770269210060845, + "learning_rate": 9.860458180389913e-06, + "loss": 0.524, + "step": 1313 + }, + { + "epoch": 0.1, + "grad_norm": 1.8705863343213631, + "learning_rate": 9.860159657739204e-06, + "loss": 0.5324, + "step": 1314 + }, + { + "epoch": 0.1, + "grad_norm": 1.7295674462802433, + "learning_rate": 9.859860820642524e-06, + "loss": 0.5313, + "step": 1315 + }, + { + "epoch": 0.1, + "grad_norm": 1.6626274426325387, + "learning_rate": 9.859561669119206e-06, + "loss": 0.5068, + "step": 1316 + }, + { + "epoch": 0.1, + "grad_norm": 2.180259681304325, + "learning_rate": 9.859262203188605e-06, + "loss": 0.5379, + "step": 1317 + }, + { + "epoch": 0.1, + "grad_norm": 1.838881916381452, + "learning_rate": 9.858962422870095e-06, + "loss": 0.5616, + "step": 1318 + }, + { + "epoch": 0.1, + "grad_norm": 3.0229298248697094, + "learning_rate": 9.858662328183075e-06, + "loss": 0.5023, + "step": 1319 + }, + { + "epoch": 0.1, + "grad_norm": 0.6338025867418218, + "learning_rate": 9.858361919146958e-06, + "loss": 0.571, + "step": 1320 + }, + { + "epoch": 0.1, + "grad_norm": 1.837196256955751, + "learning_rate": 9.85806119578118e-06, + "loss": 0.5511, + "step": 1321 + }, + { + "epoch": 0.1, + "grad_norm": 1.3725510214590027, + "learning_rate": 9.857760158105196e-06, + "loss": 0.4682, + "step": 1322 + }, + { + "epoch": 0.1, + "grad_norm": 1.9023426772230936, + "learning_rate": 9.857458806138486e-06, + "loss": 0.4947, + "step": 1323 + }, + { + "epoch": 0.1, + "grad_norm": 2.8491900263042647, + "learning_rate": 9.857157139900546e-06, + "loss": 0.489, + "step": 1324 + }, + { + "epoch": 0.1, + "grad_norm": 0.565264169737493, + "learning_rate": 9.856855159410892e-06, + "loss": 0.5655, + "step": 1325 + }, + { + "epoch": 0.1, + "grad_norm": 7.1432198868615, + "learning_rate": 9.856552864689061e-06, + "loss": 0.5175, + "step": 1326 + }, + { + "epoch": 0.1, + "grad_norm": 0.6060281783253083, + "learning_rate": 9.856250255754616e-06, + "loss": 0.5691, + "step": 1327 + }, + { + "epoch": 0.1, + "grad_norm": 2.117209006777162, + "learning_rate": 9.855947332627131e-06, + "loss": 0.5733, + "step": 1328 + }, + { + "epoch": 0.1, + "grad_norm": 1.9329718723564508, + "learning_rate": 9.855644095326207e-06, + "loss": 0.527, + "step": 1329 + }, + { + "epoch": 0.1, + "grad_norm": 1.8594030365564438, + "learning_rate": 9.855340543871461e-06, + "loss": 0.5384, + "step": 1330 + }, + { + "epoch": 0.1, + "grad_norm": 1.7186728683031587, + "learning_rate": 9.855036678282534e-06, + "loss": 0.5536, + "step": 1331 + }, + { + "epoch": 0.1, + "grad_norm": 2.599736993199715, + "learning_rate": 9.854732498579085e-06, + "loss": 0.4908, + "step": 1332 + }, + { + "epoch": 0.1, + "grad_norm": 1.505437838624874, + "learning_rate": 9.854428004780795e-06, + "loss": 0.5173, + "step": 1333 + }, + { + "epoch": 0.1, + "grad_norm": 2.2822588775643706, + "learning_rate": 9.85412319690736e-06, + "loss": 0.503, + "step": 1334 + }, + { + "epoch": 0.1, + "grad_norm": 1.8235519936042275, + "learning_rate": 9.853818074978507e-06, + "loss": 0.5061, + "step": 1335 + }, + { + "epoch": 0.1, + "grad_norm": 1.9654980749160764, + "learning_rate": 9.853512639013974e-06, + "loss": 0.5024, + "step": 1336 + }, + { + "epoch": 0.11, + "grad_norm": 1.7867603827032181, + "learning_rate": 9.853206889033522e-06, + "loss": 0.5159, + "step": 1337 + }, + { + "epoch": 0.11, + "grad_norm": 2.230542976966494, + "learning_rate": 9.852900825056932e-06, + "loss": 0.5201, + "step": 1338 + }, + { + "epoch": 0.11, + "grad_norm": 1.388001971391122, + "learning_rate": 9.85259444710401e-06, + "loss": 0.5214, + "step": 1339 + }, + { + "epoch": 0.11, + "grad_norm": 0.6091559680675073, + "learning_rate": 9.852287755194572e-06, + "loss": 0.5839, + "step": 1340 + }, + { + "epoch": 0.11, + "grad_norm": 0.6666323337098522, + "learning_rate": 9.851980749348466e-06, + "loss": 0.5824, + "step": 1341 + }, + { + "epoch": 0.11, + "grad_norm": 1.6965746567540188, + "learning_rate": 9.851673429585551e-06, + "loss": 0.5384, + "step": 1342 + }, + { + "epoch": 0.11, + "grad_norm": 0.6053502340558741, + "learning_rate": 9.851365795925713e-06, + "loss": 0.5696, + "step": 1343 + }, + { + "epoch": 0.11, + "grad_norm": 1.459681709532209, + "learning_rate": 9.851057848388854e-06, + "loss": 0.5484, + "step": 1344 + }, + { + "epoch": 0.11, + "grad_norm": 1.83036064579155, + "learning_rate": 9.850749586994899e-06, + "loss": 0.5722, + "step": 1345 + }, + { + "epoch": 0.11, + "grad_norm": 1.5877544683178362, + "learning_rate": 9.85044101176379e-06, + "loss": 0.4946, + "step": 1346 + }, + { + "epoch": 0.11, + "grad_norm": 1.7115011474218138, + "learning_rate": 9.850132122715494e-06, + "loss": 0.5395, + "step": 1347 + }, + { + "epoch": 0.11, + "grad_norm": 4.23868157720752, + "learning_rate": 9.849822919869993e-06, + "loss": 0.4991, + "step": 1348 + }, + { + "epoch": 0.11, + "grad_norm": 1.4160330432570516, + "learning_rate": 9.849513403247295e-06, + "loss": 0.5295, + "step": 1349 + }, + { + "epoch": 0.11, + "grad_norm": 2.2615378012818037, + "learning_rate": 9.849203572867422e-06, + "loss": 0.4873, + "step": 1350 + }, + { + "epoch": 0.11, + "grad_norm": 1.6556965043702019, + "learning_rate": 9.848893428750423e-06, + "loss": 0.5227, + "step": 1351 + }, + { + "epoch": 0.11, + "grad_norm": 2.28201500431767, + "learning_rate": 9.848582970916362e-06, + "loss": 0.5869, + "step": 1352 + }, + { + "epoch": 0.11, + "grad_norm": 1.982023100234855, + "learning_rate": 9.848272199385325e-06, + "loss": 0.5402, + "step": 1353 + }, + { + "epoch": 0.11, + "grad_norm": 1.6239353227185154, + "learning_rate": 9.84796111417742e-06, + "loss": 0.5482, + "step": 1354 + }, + { + "epoch": 0.11, + "grad_norm": 0.7995960136382251, + "learning_rate": 9.847649715312772e-06, + "loss": 0.5769, + "step": 1355 + }, + { + "epoch": 0.11, + "grad_norm": 1.6412381285309934, + "learning_rate": 9.847338002811528e-06, + "loss": 0.5374, + "step": 1356 + }, + { + "epoch": 0.11, + "grad_norm": 0.6558805936658236, + "learning_rate": 9.847025976693858e-06, + "loss": 0.584, + "step": 1357 + }, + { + "epoch": 0.11, + "grad_norm": 1.4622303712083156, + "learning_rate": 9.846713636979948e-06, + "loss": 0.5423, + "step": 1358 + }, + { + "epoch": 0.11, + "grad_norm": 0.5979879056073257, + "learning_rate": 9.846400983690005e-06, + "loss": 0.5787, + "step": 1359 + }, + { + "epoch": 0.11, + "grad_norm": 1.5215494068616249, + "learning_rate": 9.846088016844259e-06, + "loss": 0.5392, + "step": 1360 + }, + { + "epoch": 0.11, + "grad_norm": 1.8862063087748073, + "learning_rate": 9.845774736462957e-06, + "loss": 0.5773, + "step": 1361 + }, + { + "epoch": 0.11, + "grad_norm": 1.8672282340012472, + "learning_rate": 9.84546114256637e-06, + "loss": 0.5243, + "step": 1362 + }, + { + "epoch": 0.11, + "grad_norm": 1.9708495825706198, + "learning_rate": 9.845147235174783e-06, + "loss": 0.4748, + "step": 1363 + }, + { + "epoch": 0.11, + "grad_norm": 2.095536283810096, + "learning_rate": 9.844833014308512e-06, + "loss": 0.544, + "step": 1364 + }, + { + "epoch": 0.11, + "grad_norm": 1.4483113251656274, + "learning_rate": 9.84451847998788e-06, + "loss": 0.5238, + "step": 1365 + }, + { + "epoch": 0.11, + "grad_norm": 1.6255499065890018, + "learning_rate": 9.84420363223324e-06, + "loss": 0.5313, + "step": 1366 + }, + { + "epoch": 0.11, + "grad_norm": 1.832733133591127, + "learning_rate": 9.843888471064962e-06, + "loss": 0.5125, + "step": 1367 + }, + { + "epoch": 0.11, + "grad_norm": 1.6817054967578888, + "learning_rate": 9.843572996503439e-06, + "loss": 0.4931, + "step": 1368 + }, + { + "epoch": 0.11, + "grad_norm": 2.669300786995267, + "learning_rate": 9.843257208569077e-06, + "loss": 0.5306, + "step": 1369 + }, + { + "epoch": 0.11, + "grad_norm": 1.5601871404054013, + "learning_rate": 9.842941107282309e-06, + "loss": 0.5733, + "step": 1370 + }, + { + "epoch": 0.11, + "grad_norm": 1.878293945492033, + "learning_rate": 9.842624692663587e-06, + "loss": 0.5434, + "step": 1371 + }, + { + "epoch": 0.11, + "grad_norm": 3.691183331546977, + "learning_rate": 9.842307964733384e-06, + "loss": 0.523, + "step": 1372 + }, + { + "epoch": 0.11, + "grad_norm": 1.7804431247497463, + "learning_rate": 9.841990923512188e-06, + "loss": 0.5588, + "step": 1373 + }, + { + "epoch": 0.11, + "grad_norm": 5.5651239457282236, + "learning_rate": 9.841673569020515e-06, + "loss": 0.5561, + "step": 1374 + }, + { + "epoch": 0.11, + "grad_norm": 1.4980770357622069, + "learning_rate": 9.841355901278894e-06, + "loss": 0.5309, + "step": 1375 + }, + { + "epoch": 0.11, + "grad_norm": 1.7779967076940275, + "learning_rate": 9.841037920307883e-06, + "loss": 0.498, + "step": 1376 + }, + { + "epoch": 0.11, + "grad_norm": 1.4770660297817095, + "learning_rate": 9.840719626128048e-06, + "loss": 0.512, + "step": 1377 + }, + { + "epoch": 0.11, + "grad_norm": 1.851319568812458, + "learning_rate": 9.840401018759987e-06, + "loss": 0.4876, + "step": 1378 + }, + { + "epoch": 0.11, + "grad_norm": 1.8227859046318933, + "learning_rate": 9.840082098224312e-06, + "loss": 0.4991, + "step": 1379 + }, + { + "epoch": 0.11, + "grad_norm": 1.770611266843305, + "learning_rate": 9.839762864541657e-06, + "loss": 0.5275, + "step": 1380 + }, + { + "epoch": 0.11, + "grad_norm": 0.822972233353695, + "learning_rate": 9.839443317732676e-06, + "loss": 0.5862, + "step": 1381 + }, + { + "epoch": 0.11, + "grad_norm": 1.8583608706680923, + "learning_rate": 9.839123457818042e-06, + "loss": 0.5376, + "step": 1382 + }, + { + "epoch": 0.11, + "grad_norm": 3.005421427183514, + "learning_rate": 9.838803284818452e-06, + "loss": 0.5685, + "step": 1383 + }, + { + "epoch": 0.11, + "grad_norm": 0.6530182181391837, + "learning_rate": 9.838482798754619e-06, + "loss": 0.589, + "step": 1384 + }, + { + "epoch": 0.11, + "grad_norm": 1.6388839999237954, + "learning_rate": 9.838161999647278e-06, + "loss": 0.5163, + "step": 1385 + }, + { + "epoch": 0.11, + "grad_norm": 1.7269629148847347, + "learning_rate": 9.837840887517185e-06, + "loss": 0.5362, + "step": 1386 + }, + { + "epoch": 0.11, + "grad_norm": 0.6699267780384877, + "learning_rate": 9.837519462385115e-06, + "loss": 0.5688, + "step": 1387 + }, + { + "epoch": 0.11, + "grad_norm": 1.4802324891003922, + "learning_rate": 9.837197724271864e-06, + "loss": 0.502, + "step": 1388 + }, + { + "epoch": 0.11, + "grad_norm": 1.906861470965291, + "learning_rate": 9.836875673198248e-06, + "loss": 0.5198, + "step": 1389 + }, + { + "epoch": 0.11, + "grad_norm": 2.776398276648744, + "learning_rate": 9.836553309185105e-06, + "loss": 0.508, + "step": 1390 + }, + { + "epoch": 0.11, + "grad_norm": 1.6020112677575478, + "learning_rate": 9.836230632253288e-06, + "loss": 0.5536, + "step": 1391 + }, + { + "epoch": 0.11, + "grad_norm": 2.0532990833310403, + "learning_rate": 9.835907642423676e-06, + "loss": 0.5457, + "step": 1392 + }, + { + "epoch": 0.11, + "grad_norm": 1.6988009258101529, + "learning_rate": 9.835584339717165e-06, + "loss": 0.5247, + "step": 1393 + }, + { + "epoch": 0.11, + "grad_norm": 1.6355331440428171, + "learning_rate": 9.835260724154677e-06, + "loss": 0.5169, + "step": 1394 + }, + { + "epoch": 0.11, + "grad_norm": 2.2626647379952245, + "learning_rate": 9.83493679575714e-06, + "loss": 0.5228, + "step": 1395 + }, + { + "epoch": 0.11, + "grad_norm": 1.6725802558174745, + "learning_rate": 9.83461255454552e-06, + "loss": 0.5744, + "step": 1396 + }, + { + "epoch": 0.11, + "grad_norm": 1.922426019633825, + "learning_rate": 9.834288000540792e-06, + "loss": 0.5862, + "step": 1397 + }, + { + "epoch": 0.11, + "grad_norm": 0.7868788910892452, + "learning_rate": 9.833963133763954e-06, + "loss": 0.5953, + "step": 1398 + }, + { + "epoch": 0.11, + "grad_norm": 1.940312551674799, + "learning_rate": 9.833637954236025e-06, + "loss": 0.5533, + "step": 1399 + }, + { + "epoch": 0.11, + "grad_norm": 2.0341150882818635, + "learning_rate": 9.833312461978043e-06, + "loss": 0.5292, + "step": 1400 + }, + { + "epoch": 0.11, + "grad_norm": 1.523013759060074, + "learning_rate": 9.83298665701107e-06, + "loss": 0.5686, + "step": 1401 + }, + { + "epoch": 0.11, + "grad_norm": 2.4448239388296553, + "learning_rate": 9.83266053935618e-06, + "loss": 0.4888, + "step": 1402 + }, + { + "epoch": 0.11, + "grad_norm": 1.7905533028888934, + "learning_rate": 9.832334109034476e-06, + "loss": 0.4767, + "step": 1403 + }, + { + "epoch": 0.11, + "grad_norm": 1.5698400468407063, + "learning_rate": 9.832007366067078e-06, + "loss": 0.5302, + "step": 1404 + }, + { + "epoch": 0.11, + "grad_norm": 1.707683195713622, + "learning_rate": 9.831680310475122e-06, + "loss": 0.5215, + "step": 1405 + }, + { + "epoch": 0.11, + "grad_norm": 2.267256289365699, + "learning_rate": 9.831352942279772e-06, + "loss": 0.5186, + "step": 1406 + }, + { + "epoch": 0.11, + "grad_norm": 1.4546914619928246, + "learning_rate": 9.831025261502206e-06, + "loss": 0.5275, + "step": 1407 + }, + { + "epoch": 0.11, + "grad_norm": 0.679855944284893, + "learning_rate": 9.830697268163625e-06, + "loss": 0.5655, + "step": 1408 + }, + { + "epoch": 0.11, + "grad_norm": 1.8752637294371775, + "learning_rate": 9.830368962285252e-06, + "loss": 0.544, + "step": 1409 + }, + { + "epoch": 0.11, + "grad_norm": 1.6730391483689642, + "learning_rate": 9.830040343888324e-06, + "loss": 0.5096, + "step": 1410 + }, + { + "epoch": 0.11, + "grad_norm": 1.6602892914724103, + "learning_rate": 9.829711412994105e-06, + "loss": 0.5402, + "step": 1411 + }, + { + "epoch": 0.11, + "grad_norm": 1.7564187138079868, + "learning_rate": 9.829382169623876e-06, + "loss": 0.4417, + "step": 1412 + }, + { + "epoch": 0.11, + "grad_norm": 0.6734626202539157, + "learning_rate": 9.829052613798938e-06, + "loss": 0.5882, + "step": 1413 + }, + { + "epoch": 0.11, + "grad_norm": 1.4511343352318309, + "learning_rate": 9.828722745540614e-06, + "loss": 0.5274, + "step": 1414 + }, + { + "epoch": 0.11, + "grad_norm": 1.6923609140431217, + "learning_rate": 9.828392564870244e-06, + "loss": 0.5572, + "step": 1415 + }, + { + "epoch": 0.11, + "grad_norm": 2.44571988217081, + "learning_rate": 9.82806207180919e-06, + "loss": 0.5008, + "step": 1416 + }, + { + "epoch": 0.11, + "grad_norm": 1.941799350571069, + "learning_rate": 9.827731266378839e-06, + "loss": 0.5574, + "step": 1417 + }, + { + "epoch": 0.11, + "grad_norm": 2.080911393359229, + "learning_rate": 9.827400148600587e-06, + "loss": 0.5531, + "step": 1418 + }, + { + "epoch": 0.11, + "grad_norm": 1.6235865438465549, + "learning_rate": 9.827068718495864e-06, + "loss": 0.5209, + "step": 1419 + }, + { + "epoch": 0.11, + "grad_norm": 0.6557147659369517, + "learning_rate": 9.826736976086108e-06, + "loss": 0.5548, + "step": 1420 + }, + { + "epoch": 0.11, + "grad_norm": 1.8011044262229758, + "learning_rate": 9.826404921392783e-06, + "loss": 0.5512, + "step": 1421 + }, + { + "epoch": 0.11, + "grad_norm": 2.1399277896554283, + "learning_rate": 9.826072554437373e-06, + "loss": 0.5159, + "step": 1422 + }, + { + "epoch": 0.11, + "grad_norm": 1.609804852868493, + "learning_rate": 9.825739875241385e-06, + "loss": 0.4594, + "step": 1423 + }, + { + "epoch": 0.11, + "grad_norm": 0.6221689631727851, + "learning_rate": 9.825406883826338e-06, + "loss": 0.5925, + "step": 1424 + }, + { + "epoch": 0.11, + "grad_norm": 3.5946377852738878, + "learning_rate": 9.825073580213777e-06, + "loss": 0.5129, + "step": 1425 + }, + { + "epoch": 0.11, + "grad_norm": 2.017252465409614, + "learning_rate": 9.824739964425269e-06, + "loss": 0.533, + "step": 1426 + }, + { + "epoch": 0.11, + "grad_norm": 1.9745787734988633, + "learning_rate": 9.824406036482395e-06, + "loss": 0.549, + "step": 1427 + }, + { + "epoch": 0.11, + "grad_norm": 2.2246304226320923, + "learning_rate": 9.824071796406764e-06, + "loss": 0.5745, + "step": 1428 + }, + { + "epoch": 0.11, + "grad_norm": 1.6496788207316133, + "learning_rate": 9.823737244219997e-06, + "loss": 0.5107, + "step": 1429 + }, + { + "epoch": 0.11, + "grad_norm": 1.4978005909024623, + "learning_rate": 9.82340237994374e-06, + "loss": 0.5626, + "step": 1430 + }, + { + "epoch": 0.11, + "grad_norm": 1.5886551158230016, + "learning_rate": 9.82306720359966e-06, + "loss": 0.4978, + "step": 1431 + }, + { + "epoch": 0.11, + "grad_norm": 1.4178806615443416, + "learning_rate": 9.822731715209442e-06, + "loss": 0.4672, + "step": 1432 + }, + { + "epoch": 0.11, + "grad_norm": 1.9708377515536957, + "learning_rate": 9.82239591479479e-06, + "loss": 0.5258, + "step": 1433 + }, + { + "epoch": 0.11, + "grad_norm": 0.6477282936181471, + "learning_rate": 9.82205980237743e-06, + "loss": 0.5831, + "step": 1434 + }, + { + "epoch": 0.11, + "grad_norm": 1.4094772562134705, + "learning_rate": 9.821723377979109e-06, + "loss": 0.4855, + "step": 1435 + }, + { + "epoch": 0.11, + "grad_norm": 1.93180016626713, + "learning_rate": 9.821386641621593e-06, + "loss": 0.5367, + "step": 1436 + }, + { + "epoch": 0.11, + "grad_norm": 1.6316281356435898, + "learning_rate": 9.82104959332667e-06, + "loss": 0.5313, + "step": 1437 + }, + { + "epoch": 0.11, + "grad_norm": 1.5754686944405991, + "learning_rate": 9.820712233116142e-06, + "loss": 0.5328, + "step": 1438 + }, + { + "epoch": 0.11, + "grad_norm": 1.856940447247618, + "learning_rate": 9.820374561011843e-06, + "loss": 0.4817, + "step": 1439 + }, + { + "epoch": 0.11, + "grad_norm": 1.5849232468849792, + "learning_rate": 9.820036577035613e-06, + "loss": 0.5035, + "step": 1440 + }, + { + "epoch": 0.11, + "grad_norm": 2.256538708948276, + "learning_rate": 9.819698281209322e-06, + "loss": 0.5361, + "step": 1441 + }, + { + "epoch": 0.11, + "grad_norm": 0.5878994442457026, + "learning_rate": 9.819359673554858e-06, + "loss": 0.5591, + "step": 1442 + }, + { + "epoch": 0.11, + "grad_norm": 1.6536165730068886, + "learning_rate": 9.819020754094127e-06, + "loss": 0.5494, + "step": 1443 + }, + { + "epoch": 0.11, + "grad_norm": 1.369498155797225, + "learning_rate": 9.818681522849058e-06, + "loss": 0.5282, + "step": 1444 + }, + { + "epoch": 0.11, + "grad_norm": 1.5715907393419204, + "learning_rate": 9.818341979841597e-06, + "loss": 0.5188, + "step": 1445 + }, + { + "epoch": 0.11, + "grad_norm": 1.973900466108539, + "learning_rate": 9.818002125093714e-06, + "loss": 0.5452, + "step": 1446 + }, + { + "epoch": 0.11, + "grad_norm": 1.5295581024970804, + "learning_rate": 9.817661958627396e-06, + "loss": 0.4874, + "step": 1447 + }, + { + "epoch": 0.11, + "grad_norm": 0.6108620827341058, + "learning_rate": 9.817321480464653e-06, + "loss": 0.599, + "step": 1448 + }, + { + "epoch": 0.11, + "grad_norm": 2.4224038972422504, + "learning_rate": 9.816980690627509e-06, + "loss": 0.5385, + "step": 1449 + }, + { + "epoch": 0.11, + "grad_norm": 0.6470157292810779, + "learning_rate": 9.816639589138017e-06, + "loss": 0.5633, + "step": 1450 + }, + { + "epoch": 0.11, + "grad_norm": 1.4256876525283169, + "learning_rate": 9.816298176018245e-06, + "loss": 0.5498, + "step": 1451 + }, + { + "epoch": 0.11, + "grad_norm": 1.6546136677673913, + "learning_rate": 9.815956451290281e-06, + "loss": 0.5021, + "step": 1452 + }, + { + "epoch": 0.11, + "grad_norm": 0.6340100529066786, + "learning_rate": 9.815614414976235e-06, + "loss": 0.575, + "step": 1453 + }, + { + "epoch": 0.11, + "grad_norm": 2.157416781368045, + "learning_rate": 9.815272067098236e-06, + "loss": 0.5307, + "step": 1454 + }, + { + "epoch": 0.11, + "grad_norm": 1.9071490390100114, + "learning_rate": 9.814929407678434e-06, + "loss": 0.5218, + "step": 1455 + }, + { + "epoch": 0.11, + "grad_norm": 1.6957023678573544, + "learning_rate": 9.814586436738998e-06, + "loss": 0.5384, + "step": 1456 + }, + { + "epoch": 0.11, + "grad_norm": 1.4580621009037396, + "learning_rate": 9.814243154302116e-06, + "loss": 0.5077, + "step": 1457 + }, + { + "epoch": 0.11, + "grad_norm": 0.597096166441479, + "learning_rate": 9.813899560390001e-06, + "loss": 0.5819, + "step": 1458 + }, + { + "epoch": 0.11, + "grad_norm": 1.7378799313400277, + "learning_rate": 9.813555655024882e-06, + "loss": 0.5448, + "step": 1459 + }, + { + "epoch": 0.11, + "grad_norm": 1.7045672054254442, + "learning_rate": 9.813211438229009e-06, + "loss": 0.5219, + "step": 1460 + }, + { + "epoch": 0.11, + "grad_norm": 1.4727926327197707, + "learning_rate": 9.812866910024652e-06, + "loss": 0.537, + "step": 1461 + }, + { + "epoch": 0.11, + "grad_norm": 1.466531885303182, + "learning_rate": 9.812522070434103e-06, + "loss": 0.4731, + "step": 1462 + }, + { + "epoch": 0.11, + "grad_norm": 0.6654557048017115, + "learning_rate": 9.81217691947967e-06, + "loss": 0.5876, + "step": 1463 + }, + { + "epoch": 0.11, + "grad_norm": 1.397770804910777, + "learning_rate": 9.811831457183687e-06, + "loss": 0.5189, + "step": 1464 + }, + { + "epoch": 0.12, + "grad_norm": 2.803115288219963, + "learning_rate": 9.811485683568502e-06, + "loss": 0.5279, + "step": 1465 + }, + { + "epoch": 0.12, + "grad_norm": 1.4601816883687113, + "learning_rate": 9.81113959865649e-06, + "loss": 0.5603, + "step": 1466 + }, + { + "epoch": 0.12, + "grad_norm": 1.8989026487522842, + "learning_rate": 9.810793202470036e-06, + "loss": 0.5531, + "step": 1467 + }, + { + "epoch": 0.12, + "grad_norm": 0.6289229039008521, + "learning_rate": 9.810446495031558e-06, + "loss": 0.5558, + "step": 1468 + }, + { + "epoch": 0.12, + "grad_norm": 2.067336153759269, + "learning_rate": 9.810099476363484e-06, + "loss": 0.5586, + "step": 1469 + }, + { + "epoch": 0.12, + "grad_norm": 2.056891487080746, + "learning_rate": 9.809752146488266e-06, + "loss": 0.4961, + "step": 1470 + }, + { + "epoch": 0.12, + "grad_norm": 0.6564264985537602, + "learning_rate": 9.809404505428375e-06, + "loss": 0.5802, + "step": 1471 + }, + { + "epoch": 0.12, + "grad_norm": 1.5638423977694067, + "learning_rate": 9.809056553206303e-06, + "loss": 0.5657, + "step": 1472 + }, + { + "epoch": 0.12, + "grad_norm": 6.224399909497267, + "learning_rate": 9.808708289844566e-06, + "loss": 0.532, + "step": 1473 + }, + { + "epoch": 0.12, + "grad_norm": 1.6126993278595447, + "learning_rate": 9.808359715365693e-06, + "loss": 0.4909, + "step": 1474 + }, + { + "epoch": 0.12, + "grad_norm": 1.9035483311416836, + "learning_rate": 9.808010829792236e-06, + "loss": 0.5504, + "step": 1475 + }, + { + "epoch": 0.12, + "grad_norm": 1.8259158149478862, + "learning_rate": 9.807661633146768e-06, + "loss": 0.5872, + "step": 1476 + }, + { + "epoch": 0.12, + "grad_norm": 1.6213409351363248, + "learning_rate": 9.807312125451881e-06, + "loss": 0.5873, + "step": 1477 + }, + { + "epoch": 0.12, + "grad_norm": 1.542885500047365, + "learning_rate": 9.806962306730187e-06, + "loss": 0.5357, + "step": 1478 + }, + { + "epoch": 0.12, + "grad_norm": 1.7538979240062165, + "learning_rate": 9.806612177004324e-06, + "loss": 0.5147, + "step": 1479 + }, + { + "epoch": 0.12, + "grad_norm": 1.57243733898596, + "learning_rate": 9.806261736296938e-06, + "loss": 0.4887, + "step": 1480 + }, + { + "epoch": 0.12, + "grad_norm": 1.3839805352205314, + "learning_rate": 9.805910984630705e-06, + "loss": 0.5363, + "step": 1481 + }, + { + "epoch": 0.12, + "grad_norm": 1.6680489543720407, + "learning_rate": 9.805559922028319e-06, + "loss": 0.532, + "step": 1482 + }, + { + "epoch": 0.12, + "grad_norm": 1.614143424048914, + "learning_rate": 9.805208548512493e-06, + "loss": 0.495, + "step": 1483 + }, + { + "epoch": 0.12, + "grad_norm": 2.086016035569516, + "learning_rate": 9.804856864105958e-06, + "loss": 0.4799, + "step": 1484 + }, + { + "epoch": 0.12, + "grad_norm": 1.6966086411337373, + "learning_rate": 9.80450486883147e-06, + "loss": 0.5701, + "step": 1485 + }, + { + "epoch": 0.12, + "grad_norm": 1.3565860804155265, + "learning_rate": 9.804152562711804e-06, + "loss": 0.5562, + "step": 1486 + }, + { + "epoch": 0.12, + "grad_norm": 0.6776347346331966, + "learning_rate": 9.80379994576975e-06, + "loss": 0.5594, + "step": 1487 + }, + { + "epoch": 0.12, + "grad_norm": 1.9966643843278638, + "learning_rate": 9.803447018028124e-06, + "loss": 0.5533, + "step": 1488 + }, + { + "epoch": 0.12, + "grad_norm": 3.163997501723208, + "learning_rate": 9.80309377950976e-06, + "loss": 0.52, + "step": 1489 + }, + { + "epoch": 0.12, + "grad_norm": 1.3915427807023326, + "learning_rate": 9.802740230237512e-06, + "loss": 0.5062, + "step": 1490 + }, + { + "epoch": 0.12, + "grad_norm": 1.6175670277563174, + "learning_rate": 9.802386370234254e-06, + "loss": 0.5082, + "step": 1491 + }, + { + "epoch": 0.12, + "grad_norm": 1.9827328281711285, + "learning_rate": 9.80203219952288e-06, + "loss": 0.5425, + "step": 1492 + }, + { + "epoch": 0.12, + "grad_norm": 1.5512848654967046, + "learning_rate": 9.801677718126303e-06, + "loss": 0.4952, + "step": 1493 + }, + { + "epoch": 0.12, + "grad_norm": 1.6499644823428672, + "learning_rate": 9.801322926067461e-06, + "loss": 0.5491, + "step": 1494 + }, + { + "epoch": 0.12, + "grad_norm": 1.4826993428053694, + "learning_rate": 9.800967823369304e-06, + "loss": 0.5007, + "step": 1495 + }, + { + "epoch": 0.12, + "grad_norm": 1.5673435981039556, + "learning_rate": 9.800612410054813e-06, + "loss": 0.4842, + "step": 1496 + }, + { + "epoch": 0.12, + "grad_norm": 1.4156769917693852, + "learning_rate": 9.800256686146977e-06, + "loss": 0.4813, + "step": 1497 + }, + { + "epoch": 0.12, + "grad_norm": 1.447942557281987, + "learning_rate": 9.799900651668812e-06, + "loss": 0.5137, + "step": 1498 + }, + { + "epoch": 0.12, + "grad_norm": 0.7012942444692674, + "learning_rate": 9.799544306643356e-06, + "loss": 0.569, + "step": 1499 + }, + { + "epoch": 0.12, + "grad_norm": 1.520176049387108, + "learning_rate": 9.79918765109366e-06, + "loss": 0.5114, + "step": 1500 + }, + { + "epoch": 0.12, + "grad_norm": 1.6843540325782766, + "learning_rate": 9.798830685042801e-06, + "loss": 0.5352, + "step": 1501 + }, + { + "epoch": 0.12, + "grad_norm": 2.0591510550059278, + "learning_rate": 9.798473408513876e-06, + "loss": 0.5064, + "step": 1502 + }, + { + "epoch": 0.12, + "grad_norm": 1.420343296350935, + "learning_rate": 9.79811582153e-06, + "loss": 0.5234, + "step": 1503 + }, + { + "epoch": 0.12, + "grad_norm": 1.48144342230223, + "learning_rate": 9.797757924114304e-06, + "loss": 0.5221, + "step": 1504 + }, + { + "epoch": 0.12, + "grad_norm": 2.2374424438602327, + "learning_rate": 9.797399716289947e-06, + "loss": 0.5163, + "step": 1505 + }, + { + "epoch": 0.12, + "grad_norm": 1.611288380411681, + "learning_rate": 9.797041198080105e-06, + "loss": 0.5551, + "step": 1506 + }, + { + "epoch": 0.12, + "grad_norm": 0.7291779697126157, + "learning_rate": 9.796682369507972e-06, + "loss": 0.5988, + "step": 1507 + }, + { + "epoch": 0.12, + "grad_norm": 0.6869981498164651, + "learning_rate": 9.796323230596766e-06, + "loss": 0.582, + "step": 1508 + }, + { + "epoch": 0.12, + "grad_norm": 1.5072725421642132, + "learning_rate": 9.79596378136972e-06, + "loss": 0.5291, + "step": 1509 + }, + { + "epoch": 0.12, + "grad_norm": 1.69148184300472, + "learning_rate": 9.795604021850092e-06, + "loss": 0.4704, + "step": 1510 + }, + { + "epoch": 0.12, + "grad_norm": 1.7123627989850265, + "learning_rate": 9.795243952061158e-06, + "loss": 0.5889, + "step": 1511 + }, + { + "epoch": 0.12, + "grad_norm": 1.890178310026196, + "learning_rate": 9.79488357202621e-06, + "loss": 0.5609, + "step": 1512 + }, + { + "epoch": 0.12, + "grad_norm": 2.3162541572302286, + "learning_rate": 9.794522881768572e-06, + "loss": 0.4732, + "step": 1513 + }, + { + "epoch": 0.12, + "grad_norm": 1.702095102320251, + "learning_rate": 9.794161881311573e-06, + "loss": 0.526, + "step": 1514 + }, + { + "epoch": 0.12, + "grad_norm": 1.3811814105546294, + "learning_rate": 9.793800570678571e-06, + "loss": 0.4861, + "step": 1515 + }, + { + "epoch": 0.12, + "grad_norm": 1.590785241525209, + "learning_rate": 9.793438949892945e-06, + "loss": 0.5666, + "step": 1516 + }, + { + "epoch": 0.12, + "grad_norm": 1.8844246494596113, + "learning_rate": 9.793077018978088e-06, + "loss": 0.5628, + "step": 1517 + }, + { + "epoch": 0.12, + "grad_norm": 1.5263312497308046, + "learning_rate": 9.792714777957419e-06, + "loss": 0.5582, + "step": 1518 + }, + { + "epoch": 0.12, + "grad_norm": 1.4830971862725775, + "learning_rate": 9.792352226854373e-06, + "loss": 0.5092, + "step": 1519 + }, + { + "epoch": 0.12, + "grad_norm": 1.2238405662846028, + "learning_rate": 9.791989365692408e-06, + "loss": 0.4651, + "step": 1520 + }, + { + "epoch": 0.12, + "grad_norm": 1.4932476673251032, + "learning_rate": 9.791626194494998e-06, + "loss": 0.5224, + "step": 1521 + }, + { + "epoch": 0.12, + "grad_norm": 1.598361710120726, + "learning_rate": 9.791262713285644e-06, + "loss": 0.5366, + "step": 1522 + }, + { + "epoch": 0.12, + "grad_norm": 1.690852240103195, + "learning_rate": 9.790898922087857e-06, + "loss": 0.5084, + "step": 1523 + }, + { + "epoch": 0.12, + "grad_norm": 1.4988239699623018, + "learning_rate": 9.79053482092518e-06, + "loss": 0.467, + "step": 1524 + }, + { + "epoch": 0.12, + "grad_norm": 1.494002245915824, + "learning_rate": 9.790170409821165e-06, + "loss": 0.485, + "step": 1525 + }, + { + "epoch": 0.12, + "grad_norm": 1.2676960726270852, + "learning_rate": 9.789805688799393e-06, + "loss": 0.6222, + "step": 1526 + }, + { + "epoch": 0.12, + "grad_norm": 0.9210009322427357, + "learning_rate": 9.789440657883456e-06, + "loss": 0.553, + "step": 1527 + }, + { + "epoch": 0.12, + "grad_norm": 2.894456412323396, + "learning_rate": 9.789075317096974e-06, + "loss": 0.5112, + "step": 1528 + }, + { + "epoch": 0.12, + "grad_norm": 0.70628108519553, + "learning_rate": 9.788709666463585e-06, + "loss": 0.5754, + "step": 1529 + }, + { + "epoch": 0.12, + "grad_norm": 1.7528622705318033, + "learning_rate": 9.788343706006946e-06, + "loss": 0.5289, + "step": 1530 + }, + { + "epoch": 0.12, + "grad_norm": 1.4592595251949918, + "learning_rate": 9.787977435750732e-06, + "loss": 0.5532, + "step": 1531 + }, + { + "epoch": 0.12, + "grad_norm": 1.3513536932020114, + "learning_rate": 9.787610855718642e-06, + "loss": 0.5162, + "step": 1532 + }, + { + "epoch": 0.12, + "grad_norm": 1.5857925588106039, + "learning_rate": 9.787243965934392e-06, + "loss": 0.5595, + "step": 1533 + }, + { + "epoch": 0.12, + "grad_norm": 1.484735905113311, + "learning_rate": 9.78687676642172e-06, + "loss": 0.6419, + "step": 1534 + }, + { + "epoch": 0.12, + "grad_norm": 1.258203678294199, + "learning_rate": 9.786509257204383e-06, + "loss": 0.6182, + "step": 1535 + }, + { + "epoch": 0.12, + "grad_norm": 1.6780516009544602, + "learning_rate": 9.786141438306158e-06, + "loss": 0.5504, + "step": 1536 + }, + { + "epoch": 0.12, + "grad_norm": 2.0999507819127587, + "learning_rate": 9.785773309750845e-06, + "loss": 0.5363, + "step": 1537 + }, + { + "epoch": 0.12, + "grad_norm": 2.1956560557985756, + "learning_rate": 9.785404871562258e-06, + "loss": 0.5049, + "step": 1538 + }, + { + "epoch": 0.12, + "grad_norm": 1.7152970081106214, + "learning_rate": 9.785036123764235e-06, + "loss": 0.5392, + "step": 1539 + }, + { + "epoch": 0.12, + "grad_norm": 1.891569954455362, + "learning_rate": 9.784667066380637e-06, + "loss": 0.5582, + "step": 1540 + }, + { + "epoch": 0.12, + "grad_norm": 1.5335814055798747, + "learning_rate": 9.784297699435336e-06, + "loss": 0.5322, + "step": 1541 + }, + { + "epoch": 0.12, + "grad_norm": 1.6996267991293694, + "learning_rate": 9.783928022952236e-06, + "loss": 0.5292, + "step": 1542 + }, + { + "epoch": 0.12, + "grad_norm": 1.6319139637576137, + "learning_rate": 9.783558036955249e-06, + "loss": 0.5297, + "step": 1543 + }, + { + "epoch": 0.12, + "grad_norm": 1.6416554708943427, + "learning_rate": 9.783187741468314e-06, + "loss": 0.5417, + "step": 1544 + }, + { + "epoch": 0.12, + "grad_norm": 1.862631740632092, + "learning_rate": 9.78281713651539e-06, + "loss": 0.5714, + "step": 1545 + }, + { + "epoch": 0.12, + "grad_norm": 1.5145195376960159, + "learning_rate": 9.782446222120454e-06, + "loss": 0.5793, + "step": 1546 + }, + { + "epoch": 0.12, + "grad_norm": 1.7042888159825762, + "learning_rate": 9.782074998307505e-06, + "loss": 0.5346, + "step": 1547 + }, + { + "epoch": 0.12, + "grad_norm": 1.6858550078197532, + "learning_rate": 9.781703465100556e-06, + "loss": 0.5321, + "step": 1548 + }, + { + "epoch": 0.12, + "grad_norm": 1.3126190482958917, + "learning_rate": 9.781331622523651e-06, + "loss": 0.4458, + "step": 1549 + }, + { + "epoch": 0.12, + "grad_norm": 1.3359489271363565, + "learning_rate": 9.780959470600844e-06, + "loss": 0.6358, + "step": 1550 + }, + { + "epoch": 0.12, + "grad_norm": 1.6392377825953799, + "learning_rate": 9.780587009356214e-06, + "loss": 0.5296, + "step": 1551 + }, + { + "epoch": 0.12, + "grad_norm": 4.782425389193383, + "learning_rate": 9.780214238813858e-06, + "loss": 0.5062, + "step": 1552 + }, + { + "epoch": 0.12, + "grad_norm": 0.8528416557986036, + "learning_rate": 9.779841158997894e-06, + "loss": 0.5809, + "step": 1553 + }, + { + "epoch": 0.12, + "grad_norm": 1.6752901568743175, + "learning_rate": 9.77946776993246e-06, + "loss": 0.5431, + "step": 1554 + }, + { + "epoch": 0.12, + "grad_norm": 1.5932064850137686, + "learning_rate": 9.779094071641712e-06, + "loss": 0.546, + "step": 1555 + }, + { + "epoch": 0.12, + "grad_norm": 1.570049069081433, + "learning_rate": 9.778720064149832e-06, + "loss": 0.4393, + "step": 1556 + }, + { + "epoch": 0.12, + "grad_norm": 0.7192342294319541, + "learning_rate": 9.778345747481013e-06, + "loss": 0.5768, + "step": 1557 + }, + { + "epoch": 0.12, + "grad_norm": 1.7340970681645025, + "learning_rate": 9.777971121659477e-06, + "loss": 0.4999, + "step": 1558 + }, + { + "epoch": 0.12, + "grad_norm": 1.604312995989312, + "learning_rate": 9.777596186709457e-06, + "loss": 0.4728, + "step": 1559 + }, + { + "epoch": 0.12, + "grad_norm": 1.464654577981638, + "learning_rate": 9.777220942655217e-06, + "loss": 0.5394, + "step": 1560 + }, + { + "epoch": 0.12, + "grad_norm": 2.3416511751395794, + "learning_rate": 9.776845389521029e-06, + "loss": 0.534, + "step": 1561 + }, + { + "epoch": 0.12, + "grad_norm": 0.72288666086215, + "learning_rate": 9.776469527331194e-06, + "loss": 0.5788, + "step": 1562 + }, + { + "epoch": 0.12, + "grad_norm": 1.3807501628675511, + "learning_rate": 9.776093356110027e-06, + "loss": 0.5289, + "step": 1563 + }, + { + "epoch": 0.12, + "grad_norm": 0.7069391500052442, + "learning_rate": 9.77571687588187e-06, + "loss": 0.5787, + "step": 1564 + }, + { + "epoch": 0.12, + "grad_norm": 1.6171416699039798, + "learning_rate": 9.77534008667108e-06, + "loss": 0.5065, + "step": 1565 + }, + { + "epoch": 0.12, + "grad_norm": 1.2639808361605234, + "learning_rate": 9.774962988502032e-06, + "loss": 0.4682, + "step": 1566 + }, + { + "epoch": 0.12, + "grad_norm": 1.7201672652512507, + "learning_rate": 9.774585581399122e-06, + "loss": 0.53, + "step": 1567 + }, + { + "epoch": 0.12, + "grad_norm": 1.451863257985592, + "learning_rate": 9.774207865386775e-06, + "loss": 0.5423, + "step": 1568 + }, + { + "epoch": 0.12, + "grad_norm": 1.7379154152765588, + "learning_rate": 9.773829840489423e-06, + "loss": 0.5157, + "step": 1569 + }, + { + "epoch": 0.12, + "grad_norm": 1.6664559563873693, + "learning_rate": 9.773451506731526e-06, + "loss": 0.5696, + "step": 1570 + }, + { + "epoch": 0.12, + "grad_norm": 0.7447898060919843, + "learning_rate": 9.773072864137561e-06, + "loss": 0.5707, + "step": 1571 + }, + { + "epoch": 0.12, + "grad_norm": 1.5378793325476334, + "learning_rate": 9.772693912732022e-06, + "loss": 0.5003, + "step": 1572 + }, + { + "epoch": 0.12, + "grad_norm": 4.0609274320201045, + "learning_rate": 9.772314652539434e-06, + "loss": 0.5596, + "step": 1573 + }, + { + "epoch": 0.12, + "grad_norm": 1.800248890787103, + "learning_rate": 9.771935083584331e-06, + "loss": 0.5566, + "step": 1574 + }, + { + "epoch": 0.12, + "grad_norm": 1.7318886983275132, + "learning_rate": 9.77155520589127e-06, + "loss": 0.5524, + "step": 1575 + }, + { + "epoch": 0.12, + "grad_norm": 1.3584178721111977, + "learning_rate": 9.771175019484828e-06, + "loss": 0.4872, + "step": 1576 + }, + { + "epoch": 0.12, + "grad_norm": 2.4705481906399265, + "learning_rate": 9.770794524389605e-06, + "loss": 0.4981, + "step": 1577 + }, + { + "epoch": 0.12, + "grad_norm": 2.7649880341716853, + "learning_rate": 9.770413720630218e-06, + "loss": 0.5223, + "step": 1578 + }, + { + "epoch": 0.12, + "grad_norm": 2.053151650178062, + "learning_rate": 9.770032608231302e-06, + "loss": 0.4928, + "step": 1579 + }, + { + "epoch": 0.12, + "grad_norm": 3.1040535860312617, + "learning_rate": 9.769651187217517e-06, + "loss": 0.514, + "step": 1580 + }, + { + "epoch": 0.12, + "grad_norm": 1.5991185596135733, + "learning_rate": 9.76926945761354e-06, + "loss": 0.5427, + "step": 1581 + }, + { + "epoch": 0.12, + "grad_norm": 1.8899517653224276, + "learning_rate": 9.768887419444066e-06, + "loss": 0.5116, + "step": 1582 + }, + { + "epoch": 0.12, + "grad_norm": 1.5359368569470062, + "learning_rate": 9.768505072733816e-06, + "loss": 0.5249, + "step": 1583 + }, + { + "epoch": 0.12, + "grad_norm": 1.5176740708586691, + "learning_rate": 9.768122417507526e-06, + "loss": 0.5637, + "step": 1584 + }, + { + "epoch": 0.12, + "grad_norm": 0.6580980378745395, + "learning_rate": 9.767739453789954e-06, + "loss": 0.5516, + "step": 1585 + }, + { + "epoch": 0.12, + "grad_norm": 2.4070421840452294, + "learning_rate": 9.767356181605877e-06, + "loss": 0.5389, + "step": 1586 + }, + { + "epoch": 0.12, + "grad_norm": 1.5297105888816123, + "learning_rate": 9.766972600980088e-06, + "loss": 0.52, + "step": 1587 + }, + { + "epoch": 0.12, + "grad_norm": 0.6301945130075601, + "learning_rate": 9.76658871193741e-06, + "loss": 0.5546, + "step": 1588 + }, + { + "epoch": 0.12, + "grad_norm": 4.068753370664915, + "learning_rate": 9.766204514502677e-06, + "loss": 0.5058, + "step": 1589 + }, + { + "epoch": 0.12, + "grad_norm": 1.55996614446939, + "learning_rate": 9.765820008700746e-06, + "loss": 0.5777, + "step": 1590 + }, + { + "epoch": 0.12, + "grad_norm": 1.5117961121032426, + "learning_rate": 9.765435194556497e-06, + "loss": 0.5309, + "step": 1591 + }, + { + "epoch": 0.13, + "grad_norm": 1.5515926901217598, + "learning_rate": 9.765050072094824e-06, + "loss": 0.4924, + "step": 1592 + }, + { + "epoch": 0.13, + "grad_norm": 0.6230129898930237, + "learning_rate": 9.764664641340645e-06, + "loss": 0.5591, + "step": 1593 + }, + { + "epoch": 0.13, + "grad_norm": 1.3891450348864518, + "learning_rate": 9.764278902318897e-06, + "loss": 0.5327, + "step": 1594 + }, + { + "epoch": 0.13, + "grad_norm": 1.6914096374222076, + "learning_rate": 9.763892855054535e-06, + "loss": 0.5191, + "step": 1595 + }, + { + "epoch": 0.13, + "grad_norm": 1.5977521762519895, + "learning_rate": 9.76350649957254e-06, + "loss": 0.4915, + "step": 1596 + }, + { + "epoch": 0.13, + "grad_norm": 1.551567802635987, + "learning_rate": 9.763119835897903e-06, + "loss": 0.4902, + "step": 1597 + }, + { + "epoch": 0.13, + "grad_norm": 1.4238094491514561, + "learning_rate": 9.762732864055644e-06, + "loss": 0.5109, + "step": 1598 + }, + { + "epoch": 0.13, + "grad_norm": 0.583506751194209, + "learning_rate": 9.7623455840708e-06, + "loss": 0.5598, + "step": 1599 + }, + { + "epoch": 0.13, + "grad_norm": 1.7185491932134798, + "learning_rate": 9.761957995968427e-06, + "loss": 0.5086, + "step": 1600 + }, + { + "epoch": 0.13, + "grad_norm": 1.531833488777546, + "learning_rate": 9.761570099773601e-06, + "loss": 0.5028, + "step": 1601 + }, + { + "epoch": 0.13, + "grad_norm": 0.5789329340551965, + "learning_rate": 9.761181895511418e-06, + "loss": 0.5805, + "step": 1602 + }, + { + "epoch": 0.13, + "grad_norm": 1.5464500569078543, + "learning_rate": 9.760793383206995e-06, + "loss": 0.5178, + "step": 1603 + }, + { + "epoch": 0.13, + "grad_norm": 1.865437475135024, + "learning_rate": 9.760404562885467e-06, + "loss": 0.5655, + "step": 1604 + }, + { + "epoch": 0.13, + "grad_norm": 1.479885642920405, + "learning_rate": 9.760015434571992e-06, + "loss": 0.4512, + "step": 1605 + }, + { + "epoch": 0.13, + "grad_norm": 1.589882726282577, + "learning_rate": 9.759625998291745e-06, + "loss": 0.5767, + "step": 1606 + }, + { + "epoch": 0.13, + "grad_norm": 1.649224239675404, + "learning_rate": 9.759236254069923e-06, + "loss": 0.5771, + "step": 1607 + }, + { + "epoch": 0.13, + "grad_norm": 1.3534319033551103, + "learning_rate": 9.75884620193174e-06, + "loss": 0.5043, + "step": 1608 + }, + { + "epoch": 0.13, + "grad_norm": 0.6492649823796712, + "learning_rate": 9.758455841902435e-06, + "loss": 0.5968, + "step": 1609 + }, + { + "epoch": 0.13, + "grad_norm": 2.050652369526952, + "learning_rate": 9.75806517400726e-06, + "loss": 0.4906, + "step": 1610 + }, + { + "epoch": 0.13, + "grad_norm": 1.6898348586662564, + "learning_rate": 9.757674198271494e-06, + "loss": 0.5393, + "step": 1611 + }, + { + "epoch": 0.13, + "grad_norm": 10.950546050284869, + "learning_rate": 9.757282914720429e-06, + "loss": 0.5186, + "step": 1612 + }, + { + "epoch": 0.13, + "grad_norm": 0.658259001884661, + "learning_rate": 9.756891323379385e-06, + "loss": 0.5675, + "step": 1613 + }, + { + "epoch": 0.13, + "grad_norm": 1.4683383249373028, + "learning_rate": 9.756499424273695e-06, + "loss": 0.5672, + "step": 1614 + }, + { + "epoch": 0.13, + "grad_norm": 1.7569992686688976, + "learning_rate": 9.756107217428713e-06, + "loss": 0.5053, + "step": 1615 + }, + { + "epoch": 0.13, + "grad_norm": 1.5152816350257374, + "learning_rate": 9.755714702869817e-06, + "loss": 0.5586, + "step": 1616 + }, + { + "epoch": 0.13, + "grad_norm": 1.7800974300880035, + "learning_rate": 9.7553218806224e-06, + "loss": 0.5472, + "step": 1617 + }, + { + "epoch": 0.13, + "grad_norm": 2.3777375272487453, + "learning_rate": 9.75492875071188e-06, + "loss": 0.5608, + "step": 1618 + }, + { + "epoch": 0.13, + "grad_norm": 1.871321126321876, + "learning_rate": 9.754535313163688e-06, + "loss": 0.538, + "step": 1619 + }, + { + "epoch": 0.13, + "grad_norm": 1.7459032930878007, + "learning_rate": 9.754141568003282e-06, + "loss": 0.5106, + "step": 1620 + }, + { + "epoch": 0.13, + "grad_norm": 0.7127039115769913, + "learning_rate": 9.753747515256135e-06, + "loss": 0.5688, + "step": 1621 + }, + { + "epoch": 0.13, + "grad_norm": 1.6860423589996247, + "learning_rate": 9.753353154947744e-06, + "loss": 0.5598, + "step": 1622 + }, + { + "epoch": 0.13, + "grad_norm": 1.340843642522983, + "learning_rate": 9.752958487103621e-06, + "loss": 0.5754, + "step": 1623 + }, + { + "epoch": 0.13, + "grad_norm": 1.3617610359453556, + "learning_rate": 9.752563511749301e-06, + "loss": 0.4916, + "step": 1624 + }, + { + "epoch": 0.13, + "grad_norm": 1.4916774479155517, + "learning_rate": 9.75216822891034e-06, + "loss": 0.5035, + "step": 1625 + }, + { + "epoch": 0.13, + "grad_norm": 1.7754750251274392, + "learning_rate": 9.75177263861231e-06, + "loss": 0.5415, + "step": 1626 + }, + { + "epoch": 0.13, + "grad_norm": 1.4215838028483774, + "learning_rate": 9.751376740880807e-06, + "loss": 0.5383, + "step": 1627 + }, + { + "epoch": 0.13, + "grad_norm": 0.647273687588268, + "learning_rate": 9.750980535741445e-06, + "loss": 0.5558, + "step": 1628 + }, + { + "epoch": 0.13, + "grad_norm": 1.5184696858048425, + "learning_rate": 9.750584023219857e-06, + "loss": 0.4795, + "step": 1629 + }, + { + "epoch": 0.13, + "grad_norm": 1.4657025657978797, + "learning_rate": 9.750187203341697e-06, + "loss": 0.4889, + "step": 1630 + }, + { + "epoch": 0.13, + "grad_norm": 1.4033560792850248, + "learning_rate": 9.74979007613264e-06, + "loss": 0.4899, + "step": 1631 + }, + { + "epoch": 0.13, + "grad_norm": 0.5971544079302799, + "learning_rate": 9.749392641618376e-06, + "loss": 0.5529, + "step": 1632 + }, + { + "epoch": 0.13, + "grad_norm": 2.7817058638774235, + "learning_rate": 9.748994899824623e-06, + "loss": 0.4927, + "step": 1633 + }, + { + "epoch": 0.13, + "grad_norm": 1.70157270407114, + "learning_rate": 9.748596850777112e-06, + "loss": 0.4958, + "step": 1634 + }, + { + "epoch": 0.13, + "grad_norm": 0.623236364196928, + "learning_rate": 9.748198494501598e-06, + "loss": 0.5723, + "step": 1635 + }, + { + "epoch": 0.13, + "grad_norm": 1.4282010727427261, + "learning_rate": 9.74779983102385e-06, + "loss": 0.4841, + "step": 1636 + }, + { + "epoch": 0.13, + "grad_norm": 1.6383678381789328, + "learning_rate": 9.747400860369667e-06, + "loss": 0.4932, + "step": 1637 + }, + { + "epoch": 0.13, + "grad_norm": 5.733737644793054, + "learning_rate": 9.747001582564858e-06, + "loss": 0.5181, + "step": 1638 + }, + { + "epoch": 0.13, + "grad_norm": 1.6618091467452387, + "learning_rate": 9.746601997635255e-06, + "loss": 0.5042, + "step": 1639 + }, + { + "epoch": 0.13, + "grad_norm": 2.215757011602407, + "learning_rate": 9.746202105606713e-06, + "loss": 0.4993, + "step": 1640 + }, + { + "epoch": 0.13, + "grad_norm": 0.683547464034928, + "learning_rate": 9.745801906505104e-06, + "loss": 0.5589, + "step": 1641 + }, + { + "epoch": 0.13, + "grad_norm": 2.5425950127757986, + "learning_rate": 9.745401400356319e-06, + "loss": 0.5371, + "step": 1642 + }, + { + "epoch": 0.13, + "grad_norm": 1.7426373165516795, + "learning_rate": 9.74500058718627e-06, + "loss": 0.5945, + "step": 1643 + }, + { + "epoch": 0.13, + "grad_norm": 1.8710461669524259, + "learning_rate": 9.744599467020893e-06, + "loss": 0.5184, + "step": 1644 + }, + { + "epoch": 0.13, + "grad_norm": 1.9968527080851093, + "learning_rate": 9.744198039886136e-06, + "loss": 0.534, + "step": 1645 + }, + { + "epoch": 0.13, + "grad_norm": 0.6916076943544632, + "learning_rate": 9.743796305807971e-06, + "loss": 0.5661, + "step": 1646 + }, + { + "epoch": 0.13, + "grad_norm": 1.993616041570552, + "learning_rate": 9.743394264812392e-06, + "loss": 0.5291, + "step": 1647 + }, + { + "epoch": 0.13, + "grad_norm": 0.6813934367960476, + "learning_rate": 9.742991916925409e-06, + "loss": 0.5734, + "step": 1648 + }, + { + "epoch": 0.13, + "grad_norm": 2.1973323136025145, + "learning_rate": 9.742589262173054e-06, + "loss": 0.5313, + "step": 1649 + }, + { + "epoch": 0.13, + "grad_norm": 1.428243363523825, + "learning_rate": 9.742186300581378e-06, + "loss": 0.4944, + "step": 1650 + }, + { + "epoch": 0.13, + "grad_norm": 1.8069885197915512, + "learning_rate": 9.741783032176451e-06, + "loss": 0.5313, + "step": 1651 + }, + { + "epoch": 0.13, + "grad_norm": 1.628547012963086, + "learning_rate": 9.741379456984364e-06, + "loss": 0.5044, + "step": 1652 + }, + { + "epoch": 0.13, + "grad_norm": 1.559235470937039, + "learning_rate": 9.740975575031229e-06, + "loss": 0.5404, + "step": 1653 + }, + { + "epoch": 0.13, + "grad_norm": 1.2910206029676095, + "learning_rate": 9.740571386343178e-06, + "loss": 0.5437, + "step": 1654 + }, + { + "epoch": 0.13, + "grad_norm": 0.6307642700341504, + "learning_rate": 9.740166890946358e-06, + "loss": 0.5584, + "step": 1655 + }, + { + "epoch": 0.13, + "grad_norm": 1.8314192314848383, + "learning_rate": 9.739762088866942e-06, + "loss": 0.5256, + "step": 1656 + }, + { + "epoch": 0.13, + "grad_norm": 1.7527729257928064, + "learning_rate": 9.739356980131119e-06, + "loss": 0.6005, + "step": 1657 + }, + { + "epoch": 0.13, + "grad_norm": 1.6251643308377584, + "learning_rate": 9.7389515647651e-06, + "loss": 0.5172, + "step": 1658 + }, + { + "epoch": 0.13, + "grad_norm": 0.6195906157000807, + "learning_rate": 9.738545842795112e-06, + "loss": 0.5753, + "step": 1659 + }, + { + "epoch": 0.13, + "grad_norm": 1.3892737766682495, + "learning_rate": 9.73813981424741e-06, + "loss": 0.5002, + "step": 1660 + }, + { + "epoch": 0.13, + "grad_norm": 1.5042822326110639, + "learning_rate": 9.737733479148256e-06, + "loss": 0.5298, + "step": 1661 + }, + { + "epoch": 0.13, + "grad_norm": 1.5228980471688347, + "learning_rate": 9.737326837523949e-06, + "loss": 0.5113, + "step": 1662 + }, + { + "epoch": 0.13, + "grad_norm": 1.443048134714524, + "learning_rate": 9.736919889400788e-06, + "loss": 0.5425, + "step": 1663 + }, + { + "epoch": 0.13, + "grad_norm": 1.8434939849135352, + "learning_rate": 9.736512634805109e-06, + "loss": 0.4786, + "step": 1664 + }, + { + "epoch": 0.13, + "grad_norm": 1.9788812456309413, + "learning_rate": 9.736105073763258e-06, + "loss": 0.52, + "step": 1665 + }, + { + "epoch": 0.13, + "grad_norm": 1.5371055258407116, + "learning_rate": 9.735697206301604e-06, + "loss": 0.514, + "step": 1666 + }, + { + "epoch": 0.13, + "grad_norm": 1.6480894276309557, + "learning_rate": 9.735289032446537e-06, + "loss": 0.5054, + "step": 1667 + }, + { + "epoch": 0.13, + "grad_norm": 2.043350919660803, + "learning_rate": 9.734880552224464e-06, + "loss": 0.5175, + "step": 1668 + }, + { + "epoch": 0.13, + "grad_norm": 1.4658362287119135, + "learning_rate": 9.734471765661813e-06, + "loss": 0.4716, + "step": 1669 + }, + { + "epoch": 0.13, + "grad_norm": 1.444677822537247, + "learning_rate": 9.734062672785032e-06, + "loss": 0.5784, + "step": 1670 + }, + { + "epoch": 0.13, + "grad_norm": 1.6467824819419734, + "learning_rate": 9.73365327362059e-06, + "loss": 0.5454, + "step": 1671 + }, + { + "epoch": 0.13, + "grad_norm": 1.4507805809126435, + "learning_rate": 9.733243568194972e-06, + "loss": 0.4862, + "step": 1672 + }, + { + "epoch": 0.13, + "grad_norm": 1.6197653597491384, + "learning_rate": 9.732833556534688e-06, + "loss": 0.5086, + "step": 1673 + }, + { + "epoch": 0.13, + "grad_norm": 1.6182002536547113, + "learning_rate": 9.732423238666264e-06, + "loss": 0.4992, + "step": 1674 + }, + { + "epoch": 0.13, + "grad_norm": 1.6541744262119211, + "learning_rate": 9.732012614616247e-06, + "loss": 0.5102, + "step": 1675 + }, + { + "epoch": 0.13, + "grad_norm": 2.17576379338614, + "learning_rate": 9.731601684411205e-06, + "loss": 0.5393, + "step": 1676 + }, + { + "epoch": 0.13, + "grad_norm": 1.8763999630970931, + "learning_rate": 9.731190448077722e-06, + "loss": 0.5179, + "step": 1677 + }, + { + "epoch": 0.13, + "grad_norm": 2.1088263287279316, + "learning_rate": 9.730778905642408e-06, + "loss": 0.5521, + "step": 1678 + }, + { + "epoch": 0.13, + "grad_norm": 1.6366864440725426, + "learning_rate": 9.730367057131887e-06, + "loss": 0.547, + "step": 1679 + }, + { + "epoch": 0.13, + "grad_norm": 1.8517200118684531, + "learning_rate": 9.729954902572805e-06, + "loss": 0.5616, + "step": 1680 + }, + { + "epoch": 0.13, + "grad_norm": 1.4409162741326125, + "learning_rate": 9.729542441991828e-06, + "loss": 0.528, + "step": 1681 + }, + { + "epoch": 0.13, + "grad_norm": 1.7691234026832334, + "learning_rate": 9.729129675415643e-06, + "loss": 0.5191, + "step": 1682 + }, + { + "epoch": 0.13, + "grad_norm": 1.4350535279287002, + "learning_rate": 9.728716602870953e-06, + "loss": 0.477, + "step": 1683 + }, + { + "epoch": 0.13, + "grad_norm": 1.6638235108855273, + "learning_rate": 9.728303224384486e-06, + "loss": 0.5295, + "step": 1684 + }, + { + "epoch": 0.13, + "grad_norm": 1.8198356234309545, + "learning_rate": 9.727889539982986e-06, + "loss": 0.5254, + "step": 1685 + }, + { + "epoch": 0.13, + "grad_norm": 0.7229016790969366, + "learning_rate": 9.727475549693217e-06, + "loss": 0.5742, + "step": 1686 + }, + { + "epoch": 0.13, + "grad_norm": 0.6968211014118626, + "learning_rate": 9.727061253541964e-06, + "loss": 0.5818, + "step": 1687 + }, + { + "epoch": 0.13, + "grad_norm": 1.504046662667056, + "learning_rate": 9.726646651556032e-06, + "loss": 0.4868, + "step": 1688 + }, + { + "epoch": 0.13, + "grad_norm": 1.4992271466730613, + "learning_rate": 9.726231743762245e-06, + "loss": 0.5544, + "step": 1689 + }, + { + "epoch": 0.13, + "grad_norm": 1.5621181356574096, + "learning_rate": 9.725816530187446e-06, + "loss": 0.4675, + "step": 1690 + }, + { + "epoch": 0.13, + "grad_norm": 2.144746641944118, + "learning_rate": 9.725401010858501e-06, + "loss": 0.5404, + "step": 1691 + }, + { + "epoch": 0.13, + "grad_norm": 1.2655213078327483, + "learning_rate": 9.724985185802291e-06, + "loss": 0.51, + "step": 1692 + }, + { + "epoch": 0.13, + "grad_norm": 0.7092326385085878, + "learning_rate": 9.724569055045722e-06, + "loss": 0.576, + "step": 1693 + }, + { + "epoch": 0.13, + "grad_norm": 1.5784509140903653, + "learning_rate": 9.724152618615715e-06, + "loss": 0.4963, + "step": 1694 + }, + { + "epoch": 0.13, + "grad_norm": 1.3711484540038723, + "learning_rate": 9.723735876539213e-06, + "loss": 0.4967, + "step": 1695 + }, + { + "epoch": 0.13, + "grad_norm": 1.4070060480961435, + "learning_rate": 9.72331882884318e-06, + "loss": 0.5051, + "step": 1696 + }, + { + "epoch": 0.13, + "grad_norm": 1.486967023482276, + "learning_rate": 9.722901475554597e-06, + "loss": 0.5691, + "step": 1697 + }, + { + "epoch": 0.13, + "grad_norm": 2.1970759035084924, + "learning_rate": 9.722483816700468e-06, + "loss": 0.5674, + "step": 1698 + }, + { + "epoch": 0.13, + "grad_norm": 1.495059754616997, + "learning_rate": 9.722065852307812e-06, + "loss": 0.521, + "step": 1699 + }, + { + "epoch": 0.13, + "grad_norm": 1.7021353370053836, + "learning_rate": 9.721647582403673e-06, + "loss": 0.479, + "step": 1700 + }, + { + "epoch": 0.13, + "grad_norm": 1.8126720414937005, + "learning_rate": 9.721229007015114e-06, + "loss": 0.4991, + "step": 1701 + }, + { + "epoch": 0.13, + "grad_norm": 1.289951646733298, + "learning_rate": 9.720810126169212e-06, + "loss": 0.4633, + "step": 1702 + }, + { + "epoch": 0.13, + "grad_norm": 2.018439649116375, + "learning_rate": 9.720390939893071e-06, + "loss": 0.5013, + "step": 1703 + }, + { + "epoch": 0.13, + "grad_norm": 1.7761033324332052, + "learning_rate": 9.719971448213812e-06, + "loss": 0.5696, + "step": 1704 + }, + { + "epoch": 0.13, + "grad_norm": 1.6628740905106985, + "learning_rate": 9.719551651158576e-06, + "loss": 0.5112, + "step": 1705 + }, + { + "epoch": 0.13, + "grad_norm": 1.3501049593861383, + "learning_rate": 9.71913154875452e-06, + "loss": 0.4853, + "step": 1706 + }, + { + "epoch": 0.13, + "grad_norm": 1.6259209015622624, + "learning_rate": 9.718711141028827e-06, + "loss": 0.4906, + "step": 1707 + }, + { + "epoch": 0.13, + "grad_norm": 0.7160406777842405, + "learning_rate": 9.718290428008695e-06, + "loss": 0.571, + "step": 1708 + }, + { + "epoch": 0.13, + "grad_norm": 0.6560760909528205, + "learning_rate": 9.717869409721345e-06, + "loss": 0.5737, + "step": 1709 + }, + { + "epoch": 0.13, + "grad_norm": 2.6455879426816984, + "learning_rate": 9.717448086194018e-06, + "loss": 0.5468, + "step": 1710 + }, + { + "epoch": 0.13, + "grad_norm": 1.485525117055862, + "learning_rate": 9.717026457453968e-06, + "loss": 0.4967, + "step": 1711 + }, + { + "epoch": 0.13, + "grad_norm": 1.9130633761540319, + "learning_rate": 9.716604523528478e-06, + "loss": 0.5555, + "step": 1712 + }, + { + "epoch": 0.13, + "grad_norm": 1.607519234331676, + "learning_rate": 9.716182284444846e-06, + "loss": 0.5433, + "step": 1713 + }, + { + "epoch": 0.13, + "grad_norm": 1.509645606515361, + "learning_rate": 9.71575974023039e-06, + "loss": 0.4744, + "step": 1714 + }, + { + "epoch": 0.13, + "grad_norm": 0.8539093451507088, + "learning_rate": 9.715336890912447e-06, + "loss": 0.5959, + "step": 1715 + }, + { + "epoch": 0.13, + "grad_norm": 1.6230768787145395, + "learning_rate": 9.714913736518376e-06, + "loss": 0.5584, + "step": 1716 + }, + { + "epoch": 0.13, + "grad_norm": 1.814448100723574, + "learning_rate": 9.714490277075554e-06, + "loss": 0.5398, + "step": 1717 + }, + { + "epoch": 0.13, + "grad_norm": 2.2400617170260593, + "learning_rate": 9.714066512611378e-06, + "loss": 0.5521, + "step": 1718 + }, + { + "epoch": 0.14, + "grad_norm": 1.7399657565531563, + "learning_rate": 9.713642443153267e-06, + "loss": 0.5473, + "step": 1719 + }, + { + "epoch": 0.14, + "grad_norm": 2.295790179389231, + "learning_rate": 9.713218068728655e-06, + "loss": 0.544, + "step": 1720 + }, + { + "epoch": 0.14, + "grad_norm": 1.6719966236345472, + "learning_rate": 9.712793389364998e-06, + "loss": 0.5621, + "step": 1721 + }, + { + "epoch": 0.14, + "grad_norm": 1.7187655563687367, + "learning_rate": 9.712368405089776e-06, + "loss": 0.5141, + "step": 1722 + }, + { + "epoch": 0.14, + "grad_norm": 1.591317768009486, + "learning_rate": 9.711943115930483e-06, + "loss": 0.5609, + "step": 1723 + }, + { + "epoch": 0.14, + "grad_norm": 1.5034687695364906, + "learning_rate": 9.711517521914633e-06, + "loss": 0.5263, + "step": 1724 + }, + { + "epoch": 0.14, + "grad_norm": 0.7542595062014372, + "learning_rate": 9.711091623069763e-06, + "loss": 0.5567, + "step": 1725 + }, + { + "epoch": 0.14, + "grad_norm": 1.3868572193886552, + "learning_rate": 9.710665419423428e-06, + "loss": 0.5032, + "step": 1726 + }, + { + "epoch": 0.14, + "grad_norm": 1.517994675967119, + "learning_rate": 9.710238911003203e-06, + "loss": 0.5306, + "step": 1727 + }, + { + "epoch": 0.14, + "grad_norm": 0.6439478699038876, + "learning_rate": 9.709812097836682e-06, + "loss": 0.5595, + "step": 1728 + }, + { + "epoch": 0.14, + "grad_norm": 1.5608526248234098, + "learning_rate": 9.70938497995148e-06, + "loss": 0.603, + "step": 1729 + }, + { + "epoch": 0.14, + "grad_norm": 1.5234860858168624, + "learning_rate": 9.708957557375229e-06, + "loss": 0.5461, + "step": 1730 + }, + { + "epoch": 0.14, + "grad_norm": 1.5251944177207828, + "learning_rate": 9.708529830135585e-06, + "loss": 0.5419, + "step": 1731 + }, + { + "epoch": 0.14, + "grad_norm": 1.7510216372451706, + "learning_rate": 9.708101798260221e-06, + "loss": 0.4901, + "step": 1732 + }, + { + "epoch": 0.14, + "grad_norm": 1.4545860711076086, + "learning_rate": 9.70767346177683e-06, + "loss": 0.5765, + "step": 1733 + }, + { + "epoch": 0.14, + "grad_norm": 2.103546008279845, + "learning_rate": 9.707244820713122e-06, + "loss": 0.467, + "step": 1734 + }, + { + "epoch": 0.14, + "grad_norm": 2.097954618516181, + "learning_rate": 9.706815875096834e-06, + "loss": 0.4974, + "step": 1735 + }, + { + "epoch": 0.14, + "grad_norm": 1.6078551717133867, + "learning_rate": 9.706386624955715e-06, + "loss": 0.5655, + "step": 1736 + }, + { + "epoch": 0.14, + "grad_norm": 1.6116546954667679, + "learning_rate": 9.70595707031754e-06, + "loss": 0.5532, + "step": 1737 + }, + { + "epoch": 0.14, + "grad_norm": 1.7544291143770019, + "learning_rate": 9.705527211210097e-06, + "loss": 0.5078, + "step": 1738 + }, + { + "epoch": 0.14, + "grad_norm": 1.6343169675210163, + "learning_rate": 9.7050970476612e-06, + "loss": 0.53, + "step": 1739 + }, + { + "epoch": 0.14, + "grad_norm": 1.7707060006987545, + "learning_rate": 9.704666579698679e-06, + "loss": 0.5993, + "step": 1740 + }, + { + "epoch": 0.14, + "grad_norm": 2.221264562357025, + "learning_rate": 9.704235807350384e-06, + "loss": 0.5403, + "step": 1741 + }, + { + "epoch": 0.14, + "grad_norm": 1.7023598032292442, + "learning_rate": 9.703804730644186e-06, + "loss": 0.5215, + "step": 1742 + }, + { + "epoch": 0.14, + "grad_norm": 1.5284298066779207, + "learning_rate": 9.703373349607976e-06, + "loss": 0.526, + "step": 1743 + }, + { + "epoch": 0.14, + "grad_norm": 0.7815986493787258, + "learning_rate": 9.702941664269663e-06, + "loss": 0.5673, + "step": 1744 + }, + { + "epoch": 0.14, + "grad_norm": 1.6962268846826394, + "learning_rate": 9.702509674657177e-06, + "loss": 0.5439, + "step": 1745 + }, + { + "epoch": 0.14, + "grad_norm": 1.649285391945932, + "learning_rate": 9.702077380798466e-06, + "loss": 0.5313, + "step": 1746 + }, + { + "epoch": 0.14, + "grad_norm": 1.6952489953791672, + "learning_rate": 9.7016447827215e-06, + "loss": 0.5146, + "step": 1747 + }, + { + "epoch": 0.14, + "grad_norm": 1.5842574928253417, + "learning_rate": 9.701211880454267e-06, + "loss": 0.5463, + "step": 1748 + }, + { + "epoch": 0.14, + "grad_norm": 0.6485267459369294, + "learning_rate": 9.700778674024776e-06, + "loss": 0.5816, + "step": 1749 + }, + { + "epoch": 0.14, + "grad_norm": 1.4417921738354134, + "learning_rate": 9.700345163461054e-06, + "loss": 0.5162, + "step": 1750 + }, + { + "epoch": 0.14, + "grad_norm": 1.6974699900256511, + "learning_rate": 9.699911348791146e-06, + "loss": 0.5892, + "step": 1751 + }, + { + "epoch": 0.14, + "grad_norm": 2.568076211135904, + "learning_rate": 9.699477230043125e-06, + "loss": 0.4811, + "step": 1752 + }, + { + "epoch": 0.14, + "grad_norm": 1.9146312233358298, + "learning_rate": 9.699042807245073e-06, + "loss": 0.5082, + "step": 1753 + }, + { + "epoch": 0.14, + "grad_norm": 1.5387474057364796, + "learning_rate": 9.6986080804251e-06, + "loss": 0.549, + "step": 1754 + }, + { + "epoch": 0.14, + "grad_norm": 1.506488077391304, + "learning_rate": 9.69817304961133e-06, + "loss": 0.5125, + "step": 1755 + }, + { + "epoch": 0.14, + "grad_norm": 1.474289422948104, + "learning_rate": 9.69773771483191e-06, + "loss": 0.5744, + "step": 1756 + }, + { + "epoch": 0.14, + "grad_norm": 0.6083775393796964, + "learning_rate": 9.697302076115006e-06, + "loss": 0.5818, + "step": 1757 + }, + { + "epoch": 0.14, + "grad_norm": 2.445486177874537, + "learning_rate": 9.696866133488802e-06, + "loss": 0.5365, + "step": 1758 + }, + { + "epoch": 0.14, + "grad_norm": 1.8135264462638443, + "learning_rate": 9.696429886981501e-06, + "loss": 0.532, + "step": 1759 + }, + { + "epoch": 0.14, + "grad_norm": 1.506348090128629, + "learning_rate": 9.695993336621335e-06, + "loss": 0.5205, + "step": 1760 + }, + { + "epoch": 0.14, + "grad_norm": 0.5890776274123277, + "learning_rate": 9.695556482436538e-06, + "loss": 0.5526, + "step": 1761 + }, + { + "epoch": 0.14, + "grad_norm": 1.3364539061876732, + "learning_rate": 9.695119324455383e-06, + "loss": 0.4913, + "step": 1762 + }, + { + "epoch": 0.14, + "grad_norm": 1.6228452283152321, + "learning_rate": 9.694681862706149e-06, + "loss": 0.5115, + "step": 1763 + }, + { + "epoch": 0.14, + "grad_norm": 1.4209032406888533, + "learning_rate": 9.694244097217137e-06, + "loss": 0.4981, + "step": 1764 + }, + { + "epoch": 0.14, + "grad_norm": 3.218839443350974, + "learning_rate": 9.693806028016675e-06, + "loss": 0.5174, + "step": 1765 + }, + { + "epoch": 0.14, + "grad_norm": 1.4384728816344388, + "learning_rate": 9.693367655133104e-06, + "loss": 0.5554, + "step": 1766 + }, + { + "epoch": 0.14, + "grad_norm": 0.6574116358365748, + "learning_rate": 9.692928978594784e-06, + "loss": 0.5946, + "step": 1767 + }, + { + "epoch": 0.14, + "grad_norm": 1.6928596953735693, + "learning_rate": 9.692489998430099e-06, + "loss": 0.52, + "step": 1768 + }, + { + "epoch": 0.14, + "grad_norm": 1.5451116674639809, + "learning_rate": 9.692050714667449e-06, + "loss": 0.5271, + "step": 1769 + }, + { + "epoch": 0.14, + "grad_norm": 1.413142128196924, + "learning_rate": 9.691611127335256e-06, + "loss": 0.5062, + "step": 1770 + }, + { + "epoch": 0.14, + "grad_norm": 2.2580287383446627, + "learning_rate": 9.691171236461962e-06, + "loss": 0.4976, + "step": 1771 + }, + { + "epoch": 0.14, + "grad_norm": 1.7451358815987252, + "learning_rate": 9.690731042076023e-06, + "loss": 0.4973, + "step": 1772 + }, + { + "epoch": 0.14, + "grad_norm": 1.402984116910305, + "learning_rate": 9.690290544205925e-06, + "loss": 0.5332, + "step": 1773 + }, + { + "epoch": 0.14, + "grad_norm": 1.6379001757246756, + "learning_rate": 9.689849742880162e-06, + "loss": 0.5174, + "step": 1774 + }, + { + "epoch": 0.14, + "grad_norm": 1.8315793849617958, + "learning_rate": 9.689408638127256e-06, + "loss": 0.5736, + "step": 1775 + }, + { + "epoch": 0.14, + "grad_norm": 1.6815743445542826, + "learning_rate": 9.688967229975747e-06, + "loss": 0.5126, + "step": 1776 + }, + { + "epoch": 0.14, + "grad_norm": 1.3325690608260463, + "learning_rate": 9.68852551845419e-06, + "loss": 0.4944, + "step": 1777 + }, + { + "epoch": 0.14, + "grad_norm": 1.6743833026042148, + "learning_rate": 9.688083503591167e-06, + "loss": 0.5852, + "step": 1778 + }, + { + "epoch": 0.14, + "grad_norm": 1.3520939543412167, + "learning_rate": 9.687641185415272e-06, + "loss": 0.553, + "step": 1779 + }, + { + "epoch": 0.14, + "grad_norm": 1.904512495150474, + "learning_rate": 9.687198563955128e-06, + "loss": 0.5639, + "step": 1780 + }, + { + "epoch": 0.14, + "grad_norm": 1.419072885115324, + "learning_rate": 9.686755639239367e-06, + "loss": 0.488, + "step": 1781 + }, + { + "epoch": 0.14, + "grad_norm": 1.507648038097745, + "learning_rate": 9.686312411296646e-06, + "loss": 0.5399, + "step": 1782 + }, + { + "epoch": 0.14, + "grad_norm": 1.808924917015108, + "learning_rate": 9.685868880155644e-06, + "loss": 0.4984, + "step": 1783 + }, + { + "epoch": 0.14, + "grad_norm": 1.6341159395632192, + "learning_rate": 9.685425045845054e-06, + "loss": 0.523, + "step": 1784 + }, + { + "epoch": 0.14, + "grad_norm": 1.6377263366117027, + "learning_rate": 9.684980908393593e-06, + "loss": 0.5174, + "step": 1785 + }, + { + "epoch": 0.14, + "grad_norm": 1.5521653709967353, + "learning_rate": 9.684536467829997e-06, + "loss": 0.5508, + "step": 1786 + }, + { + "epoch": 0.14, + "grad_norm": 1.353641114514894, + "learning_rate": 9.68409172418302e-06, + "loss": 0.5158, + "step": 1787 + }, + { + "epoch": 0.14, + "grad_norm": 1.5174633587328563, + "learning_rate": 9.683646677481435e-06, + "loss": 0.4577, + "step": 1788 + }, + { + "epoch": 0.14, + "grad_norm": 1.783497306524081, + "learning_rate": 9.683201327754037e-06, + "loss": 0.5673, + "step": 1789 + }, + { + "epoch": 0.14, + "grad_norm": 0.6455861903110509, + "learning_rate": 9.68275567502964e-06, + "loss": 0.5819, + "step": 1790 + }, + { + "epoch": 0.14, + "grad_norm": 0.602629966736427, + "learning_rate": 9.682309719337076e-06, + "loss": 0.5809, + "step": 1791 + }, + { + "epoch": 0.14, + "grad_norm": 2.107392470822457, + "learning_rate": 9.6818634607052e-06, + "loss": 0.5502, + "step": 1792 + }, + { + "epoch": 0.14, + "grad_norm": 1.5439475114638852, + "learning_rate": 9.681416899162879e-06, + "loss": 0.485, + "step": 1793 + }, + { + "epoch": 0.14, + "grad_norm": 1.5476511995777875, + "learning_rate": 9.680970034739012e-06, + "loss": 0.4377, + "step": 1794 + }, + { + "epoch": 0.14, + "grad_norm": 1.4375572363615428, + "learning_rate": 9.680522867462506e-06, + "loss": 0.5175, + "step": 1795 + }, + { + "epoch": 0.14, + "grad_norm": 1.722444077610072, + "learning_rate": 9.680075397362294e-06, + "loss": 0.5436, + "step": 1796 + }, + { + "epoch": 0.14, + "grad_norm": 1.4942882073227948, + "learning_rate": 9.679627624467324e-06, + "loss": 0.4859, + "step": 1797 + }, + { + "epoch": 0.14, + "grad_norm": 2.873996190761962, + "learning_rate": 9.679179548806571e-06, + "loss": 0.4955, + "step": 1798 + }, + { + "epoch": 0.14, + "grad_norm": 1.3917977480194774, + "learning_rate": 9.678731170409022e-06, + "loss": 0.5276, + "step": 1799 + }, + { + "epoch": 0.14, + "grad_norm": 1.4523396733310785, + "learning_rate": 9.678282489303687e-06, + "loss": 0.4823, + "step": 1800 + }, + { + "epoch": 0.14, + "grad_norm": 1.9098835177132896, + "learning_rate": 9.677833505519594e-06, + "loss": 0.4891, + "step": 1801 + }, + { + "epoch": 0.14, + "grad_norm": 0.7755070604298627, + "learning_rate": 9.677384219085793e-06, + "loss": 0.574, + "step": 1802 + }, + { + "epoch": 0.14, + "grad_norm": 1.8463259455598227, + "learning_rate": 9.676934630031351e-06, + "loss": 0.4954, + "step": 1803 + }, + { + "epoch": 0.14, + "grad_norm": 1.4463426929136307, + "learning_rate": 9.676484738385357e-06, + "loss": 0.5914, + "step": 1804 + }, + { + "epoch": 0.14, + "grad_norm": 1.403698398562429, + "learning_rate": 9.676034544176918e-06, + "loss": 0.5039, + "step": 1805 + }, + { + "epoch": 0.14, + "grad_norm": 1.8265208015992391, + "learning_rate": 9.675584047435162e-06, + "loss": 0.5021, + "step": 1806 + }, + { + "epoch": 0.14, + "grad_norm": 0.6203410613003286, + "learning_rate": 9.675133248189234e-06, + "loss": 0.5592, + "step": 1807 + }, + { + "epoch": 0.14, + "grad_norm": 2.781399633281742, + "learning_rate": 9.6746821464683e-06, + "loss": 0.5121, + "step": 1808 + }, + { + "epoch": 0.14, + "grad_norm": 1.7643649387698164, + "learning_rate": 9.674230742301547e-06, + "loss": 0.5087, + "step": 1809 + }, + { + "epoch": 0.14, + "grad_norm": 0.6596655233887727, + "learning_rate": 9.67377903571818e-06, + "loss": 0.5707, + "step": 1810 + }, + { + "epoch": 0.14, + "grad_norm": 4.870373659473655, + "learning_rate": 9.673327026747423e-06, + "loss": 0.5315, + "step": 1811 + }, + { + "epoch": 0.14, + "grad_norm": 1.5966611860805129, + "learning_rate": 9.67287471541852e-06, + "loss": 0.5124, + "step": 1812 + }, + { + "epoch": 0.14, + "grad_norm": 2.196601475411892, + "learning_rate": 9.672422101760737e-06, + "loss": 0.5309, + "step": 1813 + }, + { + "epoch": 0.14, + "grad_norm": 1.564087241551201, + "learning_rate": 9.671969185803357e-06, + "loss": 0.5736, + "step": 1814 + }, + { + "epoch": 0.14, + "grad_norm": 1.4698674510702414, + "learning_rate": 9.671515967575681e-06, + "loss": 0.5621, + "step": 1815 + }, + { + "epoch": 0.14, + "grad_norm": 1.617615539542415, + "learning_rate": 9.671062447107033e-06, + "loss": 0.5295, + "step": 1816 + }, + { + "epoch": 0.14, + "grad_norm": 1.6045583464834527, + "learning_rate": 9.670608624426757e-06, + "loss": 0.5006, + "step": 1817 + }, + { + "epoch": 0.14, + "grad_norm": 1.8979323463742641, + "learning_rate": 9.67015449956421e-06, + "loss": 0.5301, + "step": 1818 + }, + { + "epoch": 0.14, + "grad_norm": 1.4508348847270582, + "learning_rate": 9.669700072548778e-06, + "loss": 0.5333, + "step": 1819 + }, + { + "epoch": 0.14, + "grad_norm": 1.6346115860202617, + "learning_rate": 9.66924534340986e-06, + "loss": 0.5097, + "step": 1820 + }, + { + "epoch": 0.14, + "grad_norm": 1.4982474966609727, + "learning_rate": 9.668790312176875e-06, + "loss": 0.5244, + "step": 1821 + }, + { + "epoch": 0.14, + "grad_norm": 1.8411165317620186, + "learning_rate": 9.668334978879265e-06, + "loss": 0.5455, + "step": 1822 + }, + { + "epoch": 0.14, + "grad_norm": 1.4292594581633755, + "learning_rate": 9.66787934354649e-06, + "loss": 0.486, + "step": 1823 + }, + { + "epoch": 0.14, + "grad_norm": 1.3065997663129374, + "learning_rate": 9.667423406208027e-06, + "loss": 0.5443, + "step": 1824 + }, + { + "epoch": 0.14, + "grad_norm": 1.579094046218995, + "learning_rate": 9.666967166893375e-06, + "loss": 0.4972, + "step": 1825 + }, + { + "epoch": 0.14, + "grad_norm": 0.6574470320859422, + "learning_rate": 9.666510625632054e-06, + "loss": 0.5723, + "step": 1826 + }, + { + "epoch": 0.14, + "grad_norm": 2.296926797692981, + "learning_rate": 9.666053782453599e-06, + "loss": 0.5368, + "step": 1827 + }, + { + "epoch": 0.14, + "grad_norm": 1.5627458943990271, + "learning_rate": 9.665596637387568e-06, + "loss": 0.4995, + "step": 1828 + }, + { + "epoch": 0.14, + "grad_norm": 1.4805219479583542, + "learning_rate": 9.665139190463539e-06, + "loss": 0.5388, + "step": 1829 + }, + { + "epoch": 0.14, + "grad_norm": 1.570146199347442, + "learning_rate": 9.664681441711106e-06, + "loss": 0.5328, + "step": 1830 + }, + { + "epoch": 0.14, + "grad_norm": 2.202864327517874, + "learning_rate": 9.664223391159885e-06, + "loss": 0.5047, + "step": 1831 + }, + { + "epoch": 0.14, + "grad_norm": 2.019827046463568, + "learning_rate": 9.663765038839515e-06, + "loss": 0.4973, + "step": 1832 + }, + { + "epoch": 0.14, + "grad_norm": 1.5224290163554257, + "learning_rate": 9.663306384779647e-06, + "loss": 0.5057, + "step": 1833 + }, + { + "epoch": 0.14, + "grad_norm": 1.7678254113514829, + "learning_rate": 9.662847429009955e-06, + "loss": 0.5446, + "step": 1834 + }, + { + "epoch": 0.14, + "grad_norm": 0.6582509691217369, + "learning_rate": 9.662388171560135e-06, + "loss": 0.5609, + "step": 1835 + }, + { + "epoch": 0.14, + "grad_norm": 1.5087165859730982, + "learning_rate": 9.661928612459898e-06, + "loss": 0.5605, + "step": 1836 + }, + { + "epoch": 0.14, + "grad_norm": 1.5580049614146727, + "learning_rate": 9.66146875173898e-06, + "loss": 0.5353, + "step": 1837 + }, + { + "epoch": 0.14, + "grad_norm": 0.5455597557591704, + "learning_rate": 9.661008589427133e-06, + "loss": 0.5554, + "step": 1838 + }, + { + "epoch": 0.14, + "grad_norm": 1.9736831663080014, + "learning_rate": 9.660548125554125e-06, + "loss": 0.5289, + "step": 1839 + }, + { + "epoch": 0.14, + "grad_norm": 1.5548122729123355, + "learning_rate": 9.66008736014975e-06, + "loss": 0.5676, + "step": 1840 + }, + { + "epoch": 0.14, + "grad_norm": 2.106114288539542, + "learning_rate": 9.65962629324382e-06, + "loss": 0.4542, + "step": 1841 + }, + { + "epoch": 0.14, + "grad_norm": 2.2494402280394934, + "learning_rate": 9.659164924866165e-06, + "loss": 0.5294, + "step": 1842 + }, + { + "epoch": 0.14, + "grad_norm": 10.906144039147925, + "learning_rate": 9.658703255046634e-06, + "loss": 0.5444, + "step": 1843 + }, + { + "epoch": 0.14, + "grad_norm": 2.3440469796227137, + "learning_rate": 9.658241283815096e-06, + "loss": 0.5518, + "step": 1844 + }, + { + "epoch": 0.14, + "grad_norm": 1.8820609313495038, + "learning_rate": 9.657779011201442e-06, + "loss": 0.5177, + "step": 1845 + }, + { + "epoch": 0.14, + "grad_norm": 1.4163341712307647, + "learning_rate": 9.657316437235577e-06, + "loss": 0.4951, + "step": 1846 + }, + { + "epoch": 0.15, + "grad_norm": 0.6997179901426247, + "learning_rate": 9.656853561947433e-06, + "loss": 0.5618, + "step": 1847 + }, + { + "epoch": 0.15, + "grad_norm": 0.6447091796466468, + "learning_rate": 9.656390385366956e-06, + "loss": 0.5594, + "step": 1848 + }, + { + "epoch": 0.15, + "grad_norm": 1.8346317439532254, + "learning_rate": 9.65592690752411e-06, + "loss": 0.5087, + "step": 1849 + }, + { + "epoch": 0.15, + "grad_norm": 2.9181109510977046, + "learning_rate": 9.655463128448885e-06, + "loss": 0.547, + "step": 1850 + }, + { + "epoch": 0.15, + "grad_norm": 1.8643152815418083, + "learning_rate": 9.654999048171286e-06, + "loss": 0.5484, + "step": 1851 + }, + { + "epoch": 0.15, + "grad_norm": 1.9999572742707186, + "learning_rate": 9.654534666721337e-06, + "loss": 0.5462, + "step": 1852 + }, + { + "epoch": 0.15, + "grad_norm": 1.4143696273948014, + "learning_rate": 9.654069984129084e-06, + "loss": 0.4896, + "step": 1853 + }, + { + "epoch": 0.15, + "grad_norm": 1.4719939420212298, + "learning_rate": 9.653605000424593e-06, + "loss": 0.556, + "step": 1854 + }, + { + "epoch": 0.15, + "grad_norm": 0.7752739389840464, + "learning_rate": 9.653139715637945e-06, + "loss": 0.5599, + "step": 1855 + }, + { + "epoch": 0.15, + "grad_norm": 1.8079952365426812, + "learning_rate": 9.652674129799245e-06, + "loss": 0.478, + "step": 1856 + }, + { + "epoch": 0.15, + "grad_norm": 0.6783707506550835, + "learning_rate": 9.652208242938615e-06, + "loss": 0.56, + "step": 1857 + }, + { + "epoch": 0.15, + "grad_norm": 1.4963456869203255, + "learning_rate": 9.651742055086198e-06, + "loss": 0.5311, + "step": 1858 + }, + { + "epoch": 0.15, + "grad_norm": 1.4906937955192159, + "learning_rate": 9.651275566272154e-06, + "loss": 0.4762, + "step": 1859 + }, + { + "epoch": 0.15, + "grad_norm": 1.7476428707420315, + "learning_rate": 9.650808776526666e-06, + "loss": 0.5148, + "step": 1860 + }, + { + "epoch": 0.15, + "grad_norm": 1.4726969791063416, + "learning_rate": 9.650341685879933e-06, + "loss": 0.498, + "step": 1861 + }, + { + "epoch": 0.15, + "grad_norm": 1.58376727646574, + "learning_rate": 9.649874294362178e-06, + "loss": 0.5365, + "step": 1862 + }, + { + "epoch": 0.15, + "grad_norm": 1.8777151059392998, + "learning_rate": 9.64940660200364e-06, + "loss": 0.4922, + "step": 1863 + }, + { + "epoch": 0.15, + "grad_norm": 1.4920573924232605, + "learning_rate": 9.648938608834574e-06, + "loss": 0.4857, + "step": 1864 + }, + { + "epoch": 0.15, + "grad_norm": 0.9082385730671085, + "learning_rate": 9.648470314885264e-06, + "loss": 0.5907, + "step": 1865 + }, + { + "epoch": 0.15, + "grad_norm": 1.3755917367638923, + "learning_rate": 9.648001720186005e-06, + "loss": 0.5187, + "step": 1866 + }, + { + "epoch": 0.15, + "grad_norm": 1.4400922067591324, + "learning_rate": 9.647532824767115e-06, + "loss": 0.5144, + "step": 1867 + }, + { + "epoch": 0.15, + "grad_norm": 1.506349325690889, + "learning_rate": 9.64706362865893e-06, + "loss": 0.567, + "step": 1868 + }, + { + "epoch": 0.15, + "grad_norm": 2.2426292166831137, + "learning_rate": 9.646594131891809e-06, + "loss": 0.5444, + "step": 1869 + }, + { + "epoch": 0.15, + "grad_norm": 2.127422761446468, + "learning_rate": 9.646124334496126e-06, + "loss": 0.5225, + "step": 1870 + }, + { + "epoch": 0.15, + "grad_norm": 3.996991372549424, + "learning_rate": 9.645654236502276e-06, + "loss": 0.5146, + "step": 1871 + }, + { + "epoch": 0.15, + "grad_norm": 1.7446835832740144, + "learning_rate": 9.645183837940674e-06, + "loss": 0.5436, + "step": 1872 + }, + { + "epoch": 0.15, + "grad_norm": 1.6248085400054184, + "learning_rate": 9.644713138841754e-06, + "loss": 0.5246, + "step": 1873 + }, + { + "epoch": 0.15, + "grad_norm": 2.143086252612302, + "learning_rate": 9.64424213923597e-06, + "loss": 0.554, + "step": 1874 + }, + { + "epoch": 0.15, + "grad_norm": 2.0521624980551545, + "learning_rate": 9.643770839153796e-06, + "loss": 0.4867, + "step": 1875 + }, + { + "epoch": 0.15, + "grad_norm": 0.7585777300523453, + "learning_rate": 9.643299238625722e-06, + "loss": 0.5434, + "step": 1876 + }, + { + "epoch": 0.15, + "grad_norm": 1.572531173966726, + "learning_rate": 9.642827337682264e-06, + "loss": 0.4877, + "step": 1877 + }, + { + "epoch": 0.15, + "grad_norm": 1.9706030696387773, + "learning_rate": 9.642355136353949e-06, + "loss": 0.5629, + "step": 1878 + }, + { + "epoch": 0.15, + "grad_norm": 0.6352121891277558, + "learning_rate": 9.641882634671329e-06, + "loss": 0.582, + "step": 1879 + }, + { + "epoch": 0.15, + "grad_norm": 11.31733320602329, + "learning_rate": 9.641409832664977e-06, + "loss": 0.5134, + "step": 1880 + }, + { + "epoch": 0.15, + "grad_norm": 2.289160771882736, + "learning_rate": 9.640936730365478e-06, + "loss": 0.5148, + "step": 1881 + }, + { + "epoch": 0.15, + "grad_norm": 1.372506564675461, + "learning_rate": 9.640463327803445e-06, + "loss": 0.4932, + "step": 1882 + }, + { + "epoch": 0.15, + "grad_norm": 0.6655094649274812, + "learning_rate": 9.639989625009505e-06, + "loss": 0.5715, + "step": 1883 + }, + { + "epoch": 0.15, + "grad_norm": 1.5348168732024798, + "learning_rate": 9.639515622014305e-06, + "loss": 0.5457, + "step": 1884 + }, + { + "epoch": 0.15, + "grad_norm": 1.5944701019559275, + "learning_rate": 9.639041318848517e-06, + "loss": 0.5453, + "step": 1885 + }, + { + "epoch": 0.15, + "grad_norm": 1.6506860711249078, + "learning_rate": 9.63856671554282e-06, + "loss": 0.5561, + "step": 1886 + }, + { + "epoch": 0.15, + "grad_norm": 3.3484934661566084, + "learning_rate": 9.638091812127928e-06, + "loss": 0.5228, + "step": 1887 + }, + { + "epoch": 0.15, + "grad_norm": 1.5517653798172206, + "learning_rate": 9.637616608634561e-06, + "loss": 0.528, + "step": 1888 + }, + { + "epoch": 0.15, + "grad_norm": 1.8089568202222732, + "learning_rate": 9.637141105093468e-06, + "loss": 0.5303, + "step": 1889 + }, + { + "epoch": 0.15, + "grad_norm": 0.6377038927365635, + "learning_rate": 9.636665301535411e-06, + "loss": 0.5534, + "step": 1890 + }, + { + "epoch": 0.15, + "grad_norm": 1.6489053543987315, + "learning_rate": 9.636189197991172e-06, + "loss": 0.5283, + "step": 1891 + }, + { + "epoch": 0.15, + "grad_norm": 1.5429584733055357, + "learning_rate": 9.63571279449156e-06, + "loss": 0.4832, + "step": 1892 + }, + { + "epoch": 0.15, + "grad_norm": 0.6363844468220257, + "learning_rate": 9.635236091067394e-06, + "loss": 0.5376, + "step": 1893 + }, + { + "epoch": 0.15, + "grad_norm": 1.3671053455566085, + "learning_rate": 9.634759087749514e-06, + "loss": 0.5744, + "step": 1894 + }, + { + "epoch": 0.15, + "grad_norm": 1.5160486796831134, + "learning_rate": 9.634281784568787e-06, + "loss": 0.5327, + "step": 1895 + }, + { + "epoch": 0.15, + "grad_norm": 1.5884070971612532, + "learning_rate": 9.63380418155609e-06, + "loss": 0.5062, + "step": 1896 + }, + { + "epoch": 0.15, + "grad_norm": 1.4566344502319915, + "learning_rate": 9.633326278742325e-06, + "loss": 0.5759, + "step": 1897 + }, + { + "epoch": 0.15, + "grad_norm": 1.8860146630487553, + "learning_rate": 9.63284807615841e-06, + "loss": 0.4751, + "step": 1898 + }, + { + "epoch": 0.15, + "grad_norm": 1.6132877472106457, + "learning_rate": 9.632369573835284e-06, + "loss": 0.5391, + "step": 1899 + }, + { + "epoch": 0.15, + "grad_norm": 1.6238791780200794, + "learning_rate": 9.631890771803909e-06, + "loss": 0.4599, + "step": 1900 + }, + { + "epoch": 0.15, + "grad_norm": 1.4486617776656527, + "learning_rate": 9.631411670095258e-06, + "loss": 0.4979, + "step": 1901 + }, + { + "epoch": 0.15, + "grad_norm": 1.4058252331531722, + "learning_rate": 9.630932268740332e-06, + "loss": 0.5338, + "step": 1902 + }, + { + "epoch": 0.15, + "grad_norm": 2.04346017356555, + "learning_rate": 9.630452567770144e-06, + "loss": 0.5607, + "step": 1903 + }, + { + "epoch": 0.15, + "grad_norm": 1.4687463979665778, + "learning_rate": 9.629972567215734e-06, + "loss": 0.4801, + "step": 1904 + }, + { + "epoch": 0.15, + "grad_norm": 1.9388915894152863, + "learning_rate": 9.629492267108157e-06, + "loss": 0.5682, + "step": 1905 + }, + { + "epoch": 0.15, + "grad_norm": 0.7339000176460159, + "learning_rate": 9.629011667478484e-06, + "loss": 0.575, + "step": 1906 + }, + { + "epoch": 0.15, + "grad_norm": 1.8281390656291485, + "learning_rate": 9.628530768357813e-06, + "loss": 0.4948, + "step": 1907 + }, + { + "epoch": 0.15, + "grad_norm": 1.4653270075321028, + "learning_rate": 9.628049569777257e-06, + "loss": 0.516, + "step": 1908 + }, + { + "epoch": 0.15, + "grad_norm": 0.6224407398210768, + "learning_rate": 9.627568071767946e-06, + "loss": 0.5551, + "step": 1909 + }, + { + "epoch": 0.15, + "grad_norm": 0.6673207861708482, + "learning_rate": 9.627086274361036e-06, + "loss": 0.5694, + "step": 1910 + }, + { + "epoch": 0.15, + "grad_norm": 1.7113701655987905, + "learning_rate": 9.626604177587696e-06, + "loss": 0.5301, + "step": 1911 + }, + { + "epoch": 0.15, + "grad_norm": 2.0581051329711286, + "learning_rate": 9.626121781479118e-06, + "loss": 0.5805, + "step": 1912 + }, + { + "epoch": 0.15, + "grad_norm": 3.605840579928456, + "learning_rate": 9.625639086066511e-06, + "loss": 0.5098, + "step": 1913 + }, + { + "epoch": 0.15, + "grad_norm": 1.6848669735443895, + "learning_rate": 9.62515609138111e-06, + "loss": 0.511, + "step": 1914 + }, + { + "epoch": 0.15, + "grad_norm": 1.9044271557992667, + "learning_rate": 9.624672797454157e-06, + "loss": 0.5195, + "step": 1915 + }, + { + "epoch": 0.15, + "grad_norm": 2.1602164265870334, + "learning_rate": 9.624189204316925e-06, + "loss": 0.5972, + "step": 1916 + }, + { + "epoch": 0.15, + "grad_norm": 0.7654281950420349, + "learning_rate": 9.623705312000701e-06, + "loss": 0.5526, + "step": 1917 + }, + { + "epoch": 0.15, + "grad_norm": 1.8515363200676294, + "learning_rate": 9.623221120536792e-06, + "loss": 0.5297, + "step": 1918 + }, + { + "epoch": 0.15, + "grad_norm": 0.6872852435569502, + "learning_rate": 9.622736629956525e-06, + "loss": 0.5608, + "step": 1919 + }, + { + "epoch": 0.15, + "grad_norm": 1.6258595650686778, + "learning_rate": 9.622251840291245e-06, + "loss": 0.5096, + "step": 1920 + }, + { + "epoch": 0.15, + "grad_norm": 1.9314525783056422, + "learning_rate": 9.621766751572317e-06, + "loss": 0.5598, + "step": 1921 + }, + { + "epoch": 0.15, + "grad_norm": 1.5698478227833776, + "learning_rate": 9.621281363831126e-06, + "loss": 0.4863, + "step": 1922 + }, + { + "epoch": 0.15, + "grad_norm": 2.4269589832401026, + "learning_rate": 9.620795677099077e-06, + "loss": 0.5225, + "step": 1923 + }, + { + "epoch": 0.15, + "grad_norm": 1.6314829794840122, + "learning_rate": 9.620309691407592e-06, + "loss": 0.5318, + "step": 1924 + }, + { + "epoch": 0.15, + "grad_norm": 1.5194310597065492, + "learning_rate": 9.619823406788114e-06, + "loss": 0.4951, + "step": 1925 + }, + { + "epoch": 0.15, + "grad_norm": 0.7614795016610035, + "learning_rate": 9.619336823272106e-06, + "loss": 0.5522, + "step": 1926 + }, + { + "epoch": 0.15, + "grad_norm": 1.805454404965748, + "learning_rate": 9.618849940891046e-06, + "loss": 0.5686, + "step": 1927 + }, + { + "epoch": 0.15, + "grad_norm": 7.453542619324918, + "learning_rate": 9.618362759676439e-06, + "loss": 0.532, + "step": 1928 + }, + { + "epoch": 0.15, + "grad_norm": 0.6613239910171699, + "learning_rate": 9.617875279659803e-06, + "loss": 0.5931, + "step": 1929 + }, + { + "epoch": 0.15, + "grad_norm": 2.0617400527585406, + "learning_rate": 9.617387500872677e-06, + "loss": 0.4785, + "step": 1930 + }, + { + "epoch": 0.15, + "grad_norm": 1.5004758332995602, + "learning_rate": 9.61689942334662e-06, + "loss": 0.4767, + "step": 1931 + }, + { + "epoch": 0.15, + "grad_norm": 0.593341760170481, + "learning_rate": 9.61641104711321e-06, + "loss": 0.5452, + "step": 1932 + }, + { + "epoch": 0.15, + "grad_norm": 1.7383773839174395, + "learning_rate": 9.615922372204043e-06, + "loss": 0.4588, + "step": 1933 + }, + { + "epoch": 0.15, + "grad_norm": 4.182898045049694, + "learning_rate": 9.61543339865074e-06, + "loss": 0.5215, + "step": 1934 + }, + { + "epoch": 0.15, + "grad_norm": 1.8015117616952703, + "learning_rate": 9.61494412648493e-06, + "loss": 0.5234, + "step": 1935 + }, + { + "epoch": 0.15, + "grad_norm": 2.044893856415567, + "learning_rate": 9.614454555738274e-06, + "loss": 0.4816, + "step": 1936 + }, + { + "epoch": 0.15, + "grad_norm": 1.714073234740042, + "learning_rate": 9.613964686442446e-06, + "loss": 0.4554, + "step": 1937 + }, + { + "epoch": 0.15, + "grad_norm": 1.9920259924299915, + "learning_rate": 9.613474518629138e-06, + "loss": 0.4789, + "step": 1938 + }, + { + "epoch": 0.15, + "grad_norm": 1.574747097966089, + "learning_rate": 9.612984052330064e-06, + "loss": 0.5208, + "step": 1939 + }, + { + "epoch": 0.15, + "grad_norm": 1.7855558288129156, + "learning_rate": 9.612493287576955e-06, + "loss": 0.4755, + "step": 1940 + }, + { + "epoch": 0.15, + "grad_norm": 3.3198880221077043, + "learning_rate": 9.612002224401565e-06, + "loss": 0.5327, + "step": 1941 + }, + { + "epoch": 0.15, + "grad_norm": 1.6722870389170643, + "learning_rate": 9.611510862835663e-06, + "loss": 0.5601, + "step": 1942 + }, + { + "epoch": 0.15, + "grad_norm": 0.8104386319619111, + "learning_rate": 9.611019202911042e-06, + "loss": 0.5795, + "step": 1943 + }, + { + "epoch": 0.15, + "grad_norm": 0.7094562742464343, + "learning_rate": 9.610527244659512e-06, + "loss": 0.5423, + "step": 1944 + }, + { + "epoch": 0.15, + "grad_norm": 1.7855967811002127, + "learning_rate": 9.610034988112897e-06, + "loss": 0.5371, + "step": 1945 + }, + { + "epoch": 0.15, + "grad_norm": 2.1226150119770044, + "learning_rate": 9.609542433303052e-06, + "loss": 0.5262, + "step": 1946 + }, + { + "epoch": 0.15, + "grad_norm": 1.4103001112012876, + "learning_rate": 9.60904958026184e-06, + "loss": 0.4759, + "step": 1947 + }, + { + "epoch": 0.15, + "grad_norm": 2.394735922551564, + "learning_rate": 9.60855642902115e-06, + "loss": 0.5074, + "step": 1948 + }, + { + "epoch": 0.15, + "grad_norm": 2.499935072642003, + "learning_rate": 9.608062979612889e-06, + "loss": 0.535, + "step": 1949 + }, + { + "epoch": 0.15, + "grad_norm": 2.2507857701161162, + "learning_rate": 9.60756923206898e-06, + "loss": 0.4995, + "step": 1950 + }, + { + "epoch": 0.15, + "grad_norm": 1.331283292668649, + "learning_rate": 9.60707518642137e-06, + "loss": 0.5007, + "step": 1951 + }, + { + "epoch": 0.15, + "grad_norm": 1.0591413706909243, + "learning_rate": 9.606580842702021e-06, + "loss": 0.566, + "step": 1952 + }, + { + "epoch": 0.15, + "grad_norm": 1.6593037543731923, + "learning_rate": 9.60608620094292e-06, + "loss": 0.5879, + "step": 1953 + }, + { + "epoch": 0.15, + "grad_norm": 1.3845228695305747, + "learning_rate": 9.605591261176064e-06, + "loss": 0.5514, + "step": 1954 + }, + { + "epoch": 0.15, + "grad_norm": 2.0294834405918096, + "learning_rate": 9.60509602343348e-06, + "loss": 0.5637, + "step": 1955 + }, + { + "epoch": 0.15, + "grad_norm": 1.9209999454199727, + "learning_rate": 9.604600487747207e-06, + "loss": 0.4848, + "step": 1956 + }, + { + "epoch": 0.15, + "grad_norm": 1.9446447548083243, + "learning_rate": 9.604104654149306e-06, + "loss": 0.4826, + "step": 1957 + }, + { + "epoch": 0.15, + "grad_norm": 2.6369386012313103, + "learning_rate": 9.603608522671859e-06, + "loss": 0.4992, + "step": 1958 + }, + { + "epoch": 0.15, + "grad_norm": 1.6522098268516108, + "learning_rate": 9.60311209334696e-06, + "loss": 0.518, + "step": 1959 + }, + { + "epoch": 0.15, + "grad_norm": 1.546673319598247, + "learning_rate": 9.60261536620673e-06, + "loss": 0.5042, + "step": 1960 + }, + { + "epoch": 0.15, + "grad_norm": 0.5901752631891816, + "learning_rate": 9.60211834128331e-06, + "loss": 0.5301, + "step": 1961 + }, + { + "epoch": 0.15, + "grad_norm": 3.25079203756869, + "learning_rate": 9.60162101860885e-06, + "loss": 0.564, + "step": 1962 + }, + { + "epoch": 0.15, + "grad_norm": 2.6233735065992376, + "learning_rate": 9.601123398215533e-06, + "loss": 0.5147, + "step": 1963 + }, + { + "epoch": 0.15, + "grad_norm": 1.729986969012851, + "learning_rate": 9.60062548013555e-06, + "loss": 0.5146, + "step": 1964 + }, + { + "epoch": 0.15, + "grad_norm": 1.939563466461983, + "learning_rate": 9.600127264401116e-06, + "loss": 0.4707, + "step": 1965 + }, + { + "epoch": 0.15, + "grad_norm": 1.5696199034258749, + "learning_rate": 9.599628751044467e-06, + "loss": 0.4531, + "step": 1966 + }, + { + "epoch": 0.15, + "grad_norm": 0.6266967564033181, + "learning_rate": 9.599129940097853e-06, + "loss": 0.5678, + "step": 1967 + }, + { + "epoch": 0.15, + "grad_norm": 1.4883283697831413, + "learning_rate": 9.59863083159355e-06, + "loss": 0.4779, + "step": 1968 + }, + { + "epoch": 0.15, + "grad_norm": 1.7246248807352447, + "learning_rate": 9.598131425563847e-06, + "loss": 0.5043, + "step": 1969 + }, + { + "epoch": 0.15, + "grad_norm": 1.9943019252348066, + "learning_rate": 9.597631722041056e-06, + "loss": 0.5569, + "step": 1970 + }, + { + "epoch": 0.15, + "grad_norm": 5.3855774758865245, + "learning_rate": 9.597131721057508e-06, + "loss": 0.4608, + "step": 1971 + }, + { + "epoch": 0.15, + "grad_norm": 1.7884628353667116, + "learning_rate": 9.59663142264555e-06, + "loss": 0.5328, + "step": 1972 + }, + { + "epoch": 0.15, + "grad_norm": 2.3979294782491385, + "learning_rate": 9.596130826837552e-06, + "loss": 0.4577, + "step": 1973 + }, + { + "epoch": 0.16, + "grad_norm": 0.5994632476954805, + "learning_rate": 9.595629933665904e-06, + "loss": 0.5735, + "step": 1974 + }, + { + "epoch": 0.16, + "grad_norm": 2.006160878390288, + "learning_rate": 9.595128743163009e-06, + "loss": 0.504, + "step": 1975 + }, + { + "epoch": 0.16, + "grad_norm": 1.6419994751213296, + "learning_rate": 9.594627255361298e-06, + "loss": 0.5468, + "step": 1976 + }, + { + "epoch": 0.16, + "grad_norm": 0.6296557913145661, + "learning_rate": 9.594125470293211e-06, + "loss": 0.5525, + "step": 1977 + }, + { + "epoch": 0.16, + "grad_norm": 2.5450590996954117, + "learning_rate": 9.593623387991218e-06, + "loss": 0.5273, + "step": 1978 + }, + { + "epoch": 0.16, + "grad_norm": 1.6651739325971893, + "learning_rate": 9.593121008487801e-06, + "loss": 0.5585, + "step": 1979 + }, + { + "epoch": 0.16, + "grad_norm": 1.6740180466062824, + "learning_rate": 9.592618331815461e-06, + "loss": 0.4974, + "step": 1980 + }, + { + "epoch": 0.16, + "grad_norm": 1.9609641355773166, + "learning_rate": 9.592115358006725e-06, + "loss": 0.5442, + "step": 1981 + }, + { + "epoch": 0.16, + "grad_norm": 1.9008689839362534, + "learning_rate": 9.591612087094133e-06, + "loss": 0.501, + "step": 1982 + }, + { + "epoch": 0.16, + "grad_norm": 4.056617713083659, + "learning_rate": 9.591108519110246e-06, + "loss": 0.5661, + "step": 1983 + }, + { + "epoch": 0.16, + "grad_norm": 1.5508514459532974, + "learning_rate": 9.590604654087641e-06, + "loss": 0.5264, + "step": 1984 + }, + { + "epoch": 0.16, + "grad_norm": 2.028596966399857, + "learning_rate": 9.590100492058922e-06, + "loss": 0.4966, + "step": 1985 + }, + { + "epoch": 0.16, + "grad_norm": 1.4063992116273172, + "learning_rate": 9.589596033056705e-06, + "loss": 0.4863, + "step": 1986 + }, + { + "epoch": 0.16, + "grad_norm": 1.8797127213032618, + "learning_rate": 9.589091277113628e-06, + "loss": 0.4624, + "step": 1987 + }, + { + "epoch": 0.16, + "grad_norm": 1.837964870874461, + "learning_rate": 9.58858622426235e-06, + "loss": 0.5578, + "step": 1988 + }, + { + "epoch": 0.16, + "grad_norm": 1.879246426021428, + "learning_rate": 9.588080874535547e-06, + "loss": 0.5101, + "step": 1989 + }, + { + "epoch": 0.16, + "grad_norm": 2.394393583494497, + "learning_rate": 9.587575227965912e-06, + "loss": 0.4926, + "step": 1990 + }, + { + "epoch": 0.16, + "grad_norm": 2.076549783347219, + "learning_rate": 9.587069284586162e-06, + "loss": 0.4853, + "step": 1991 + }, + { + "epoch": 0.16, + "grad_norm": 2.6326883649225374, + "learning_rate": 9.586563044429027e-06, + "loss": 0.4353, + "step": 1992 + }, + { + "epoch": 0.16, + "grad_norm": 1.6078603902071085, + "learning_rate": 9.586056507527266e-06, + "loss": 0.5476, + "step": 1993 + }, + { + "epoch": 0.16, + "grad_norm": 1.9144519827767175, + "learning_rate": 9.585549673913647e-06, + "loss": 0.5289, + "step": 1994 + }, + { + "epoch": 0.16, + "grad_norm": 1.7868042808192288, + "learning_rate": 9.585042543620964e-06, + "loss": 0.4653, + "step": 1995 + }, + { + "epoch": 0.16, + "grad_norm": 1.6101988739600026, + "learning_rate": 9.584535116682024e-06, + "loss": 0.525, + "step": 1996 + }, + { + "epoch": 0.16, + "grad_norm": 1.549959156834061, + "learning_rate": 9.584027393129662e-06, + "loss": 0.533, + "step": 1997 + }, + { + "epoch": 0.16, + "grad_norm": 1.7298063326063522, + "learning_rate": 9.583519372996723e-06, + "loss": 0.5169, + "step": 1998 + }, + { + "epoch": 0.16, + "grad_norm": 1.960533985153442, + "learning_rate": 9.583011056316076e-06, + "loss": 0.5052, + "step": 1999 + }, + { + "epoch": 0.16, + "grad_norm": 2.042457507059685, + "learning_rate": 9.58250244312061e-06, + "loss": 0.5087, + "step": 2000 + }, + { + "epoch": 0.16, + "grad_norm": 1.5805061347807925, + "learning_rate": 9.58199353344323e-06, + "loss": 0.4964, + "step": 2001 + }, + { + "epoch": 0.16, + "grad_norm": 0.6816361660868795, + "learning_rate": 9.581484327316862e-06, + "loss": 0.5632, + "step": 2002 + }, + { + "epoch": 0.16, + "grad_norm": 2.515994614064876, + "learning_rate": 9.580974824774452e-06, + "loss": 0.4676, + "step": 2003 + }, + { + "epoch": 0.16, + "grad_norm": 1.4702558803539987, + "learning_rate": 9.580465025848964e-06, + "loss": 0.4679, + "step": 2004 + }, + { + "epoch": 0.16, + "grad_norm": 1.8866135071526426, + "learning_rate": 9.57995493057338e-06, + "loss": 0.4649, + "step": 2005 + }, + { + "epoch": 0.16, + "grad_norm": 1.919991825031542, + "learning_rate": 9.579444538980704e-06, + "loss": 0.5436, + "step": 2006 + }, + { + "epoch": 0.16, + "grad_norm": 1.460924148102141, + "learning_rate": 9.578933851103955e-06, + "loss": 0.5057, + "step": 2007 + }, + { + "epoch": 0.16, + "grad_norm": 1.7917148710566269, + "learning_rate": 9.578422866976179e-06, + "loss": 0.5454, + "step": 2008 + }, + { + "epoch": 0.16, + "grad_norm": 4.681393069998754, + "learning_rate": 9.57791158663043e-06, + "loss": 0.5182, + "step": 2009 + }, + { + "epoch": 0.16, + "grad_norm": 0.6882539508153274, + "learning_rate": 9.577400010099791e-06, + "loss": 0.5774, + "step": 2010 + }, + { + "epoch": 0.16, + "grad_norm": 1.3935347021610167, + "learning_rate": 9.576888137417359e-06, + "loss": 0.5082, + "step": 2011 + }, + { + "epoch": 0.16, + "grad_norm": 1.4379004557713304, + "learning_rate": 9.576375968616253e-06, + "loss": 0.4561, + "step": 2012 + }, + { + "epoch": 0.16, + "grad_norm": 2.218044137839161, + "learning_rate": 9.575863503729607e-06, + "loss": 0.456, + "step": 2013 + }, + { + "epoch": 0.16, + "grad_norm": 1.624595576545622, + "learning_rate": 9.57535074279058e-06, + "loss": 0.5016, + "step": 2014 + }, + { + "epoch": 0.16, + "grad_norm": 1.734987647831434, + "learning_rate": 9.574837685832345e-06, + "loss": 0.5444, + "step": 2015 + }, + { + "epoch": 0.16, + "grad_norm": 1.7904434658644846, + "learning_rate": 9.574324332888097e-06, + "loss": 0.5167, + "step": 2016 + }, + { + "epoch": 0.16, + "grad_norm": 1.7145130938309323, + "learning_rate": 9.573810683991047e-06, + "loss": 0.5051, + "step": 2017 + }, + { + "epoch": 0.16, + "grad_norm": 1.8812049928002021, + "learning_rate": 9.573296739174429e-06, + "loss": 0.4792, + "step": 2018 + }, + { + "epoch": 0.16, + "grad_norm": 1.9692378478446724, + "learning_rate": 9.572782498471495e-06, + "loss": 0.5338, + "step": 2019 + }, + { + "epoch": 0.16, + "grad_norm": 1.4889322853218172, + "learning_rate": 9.572267961915516e-06, + "loss": 0.4761, + "step": 2020 + }, + { + "epoch": 0.16, + "grad_norm": 1.4912677996970003, + "learning_rate": 9.57175312953978e-06, + "loss": 0.5376, + "step": 2021 + }, + { + "epoch": 0.16, + "grad_norm": 1.5246908260779142, + "learning_rate": 9.571238001377597e-06, + "loss": 0.5382, + "step": 2022 + }, + { + "epoch": 0.16, + "grad_norm": 1.6524357683208102, + "learning_rate": 9.570722577462298e-06, + "loss": 0.5437, + "step": 2023 + }, + { + "epoch": 0.16, + "grad_norm": 1.7788487919012534, + "learning_rate": 9.570206857827223e-06, + "loss": 0.5225, + "step": 2024 + }, + { + "epoch": 0.16, + "grad_norm": 1.5108515525735133, + "learning_rate": 9.569690842505746e-06, + "loss": 0.5489, + "step": 2025 + }, + { + "epoch": 0.16, + "grad_norm": 1.377732465657477, + "learning_rate": 9.569174531531249e-06, + "loss": 0.4267, + "step": 2026 + }, + { + "epoch": 0.16, + "grad_norm": 1.7378627558321154, + "learning_rate": 9.568657924937137e-06, + "loss": 0.5348, + "step": 2027 + }, + { + "epoch": 0.16, + "grad_norm": 1.6228995537471274, + "learning_rate": 9.568141022756832e-06, + "loss": 0.5384, + "step": 2028 + }, + { + "epoch": 0.16, + "grad_norm": 1.576654087130885, + "learning_rate": 9.567623825023781e-06, + "loss": 0.536, + "step": 2029 + }, + { + "epoch": 0.16, + "grad_norm": 0.629275234919402, + "learning_rate": 9.567106331771443e-06, + "loss": 0.5845, + "step": 2030 + }, + { + "epoch": 0.16, + "grad_norm": 1.4489677124859823, + "learning_rate": 9.5665885430333e-06, + "loss": 0.5789, + "step": 2031 + }, + { + "epoch": 0.16, + "grad_norm": 0.5999177822298869, + "learning_rate": 9.566070458842851e-06, + "loss": 0.5491, + "step": 2032 + }, + { + "epoch": 0.16, + "grad_norm": 1.6790315655769896, + "learning_rate": 9.565552079233617e-06, + "loss": 0.5351, + "step": 2033 + }, + { + "epoch": 0.16, + "grad_norm": 0.6190922991049055, + "learning_rate": 9.565033404239136e-06, + "loss": 0.5708, + "step": 2034 + }, + { + "epoch": 0.16, + "grad_norm": 1.5129711452020425, + "learning_rate": 9.564514433892967e-06, + "loss": 0.4599, + "step": 2035 + }, + { + "epoch": 0.16, + "grad_norm": 0.6428463009973222, + "learning_rate": 9.563995168228685e-06, + "loss": 0.5761, + "step": 2036 + }, + { + "epoch": 0.16, + "grad_norm": 3.284363873913847, + "learning_rate": 9.563475607279884e-06, + "loss": 0.5158, + "step": 2037 + }, + { + "epoch": 0.16, + "grad_norm": 5.059535215244837, + "learning_rate": 9.562955751080183e-06, + "loss": 0.5723, + "step": 2038 + }, + { + "epoch": 0.16, + "grad_norm": 1.3301787019320954, + "learning_rate": 9.562435599663213e-06, + "loss": 0.554, + "step": 2039 + }, + { + "epoch": 0.16, + "grad_norm": 3.2049616738862214, + "learning_rate": 9.561915153062628e-06, + "loss": 0.5496, + "step": 2040 + }, + { + "epoch": 0.16, + "grad_norm": 2.574171426614344, + "learning_rate": 9.561394411312101e-06, + "loss": 0.497, + "step": 2041 + }, + { + "epoch": 0.16, + "grad_norm": 1.4155571045947233, + "learning_rate": 9.560873374445324e-06, + "loss": 0.5016, + "step": 2042 + }, + { + "epoch": 0.16, + "grad_norm": 1.600973038158765, + "learning_rate": 9.560352042496003e-06, + "loss": 0.5106, + "step": 2043 + }, + { + "epoch": 0.16, + "grad_norm": 1.5482967993591092, + "learning_rate": 9.559830415497874e-06, + "loss": 0.4883, + "step": 2044 + }, + { + "epoch": 0.16, + "grad_norm": 1.7819318872980026, + "learning_rate": 9.559308493484678e-06, + "loss": 0.5706, + "step": 2045 + }, + { + "epoch": 0.16, + "grad_norm": 0.611778446364573, + "learning_rate": 9.558786276490188e-06, + "loss": 0.5555, + "step": 2046 + }, + { + "epoch": 0.16, + "grad_norm": 1.7421506099276582, + "learning_rate": 9.558263764548191e-06, + "loss": 0.5503, + "step": 2047 + }, + { + "epoch": 0.16, + "grad_norm": 1.7444687343288983, + "learning_rate": 9.55774095769249e-06, + "loss": 0.5206, + "step": 2048 + }, + { + "epoch": 0.16, + "grad_norm": 1.70686489024873, + "learning_rate": 9.557217855956912e-06, + "loss": 0.5403, + "step": 2049 + }, + { + "epoch": 0.16, + "grad_norm": 1.4782358401591296, + "learning_rate": 9.556694459375301e-06, + "loss": 0.5485, + "step": 2050 + }, + { + "epoch": 0.16, + "grad_norm": 1.6466517853359657, + "learning_rate": 9.55617076798152e-06, + "loss": 0.5234, + "step": 2051 + }, + { + "epoch": 0.16, + "grad_norm": 1.9353923381414608, + "learning_rate": 9.555646781809448e-06, + "loss": 0.57, + "step": 2052 + }, + { + "epoch": 0.16, + "grad_norm": 1.5197688339756326, + "learning_rate": 9.55512250089299e-06, + "loss": 0.5045, + "step": 2053 + }, + { + "epoch": 0.16, + "grad_norm": 1.5903060322717537, + "learning_rate": 9.554597925266066e-06, + "loss": 0.5285, + "step": 2054 + }, + { + "epoch": 0.16, + "grad_norm": 1.4782117646404644, + "learning_rate": 9.554073054962614e-06, + "loss": 0.4806, + "step": 2055 + }, + { + "epoch": 0.16, + "grad_norm": 1.7885308177218775, + "learning_rate": 9.553547890016592e-06, + "loss": 0.5314, + "step": 2056 + }, + { + "epoch": 0.16, + "grad_norm": 3.938781855049388, + "learning_rate": 9.553022430461978e-06, + "loss": 0.5321, + "step": 2057 + }, + { + "epoch": 0.16, + "grad_norm": 0.6054817513725239, + "learning_rate": 9.55249667633277e-06, + "loss": 0.5586, + "step": 2058 + }, + { + "epoch": 0.16, + "grad_norm": 1.5267750804127047, + "learning_rate": 9.551970627662982e-06, + "loss": 0.5115, + "step": 2059 + }, + { + "epoch": 0.16, + "grad_norm": 2.9415949031960835, + "learning_rate": 9.551444284486649e-06, + "loss": 0.5267, + "step": 2060 + }, + { + "epoch": 0.16, + "grad_norm": 0.5682888938008787, + "learning_rate": 9.550917646837825e-06, + "loss": 0.5515, + "step": 2061 + }, + { + "epoch": 0.16, + "grad_norm": 0.570009752796976, + "learning_rate": 9.550390714750582e-06, + "loss": 0.5627, + "step": 2062 + }, + { + "epoch": 0.16, + "grad_norm": 1.9566511229347934, + "learning_rate": 9.549863488259012e-06, + "loss": 0.5641, + "step": 2063 + }, + { + "epoch": 0.16, + "grad_norm": 2.0482280967175064, + "learning_rate": 9.549335967397228e-06, + "loss": 0.4968, + "step": 2064 + }, + { + "epoch": 0.16, + "grad_norm": 1.9914595867227152, + "learning_rate": 9.548808152199358e-06, + "loss": 0.5124, + "step": 2065 + }, + { + "epoch": 0.16, + "grad_norm": 1.9232968396869008, + "learning_rate": 9.548280042699551e-06, + "loss": 0.5378, + "step": 2066 + }, + { + "epoch": 0.16, + "grad_norm": 1.527159329196347, + "learning_rate": 9.547751638931975e-06, + "loss": 0.5189, + "step": 2067 + }, + { + "epoch": 0.16, + "grad_norm": 1.7525773965788447, + "learning_rate": 9.547222940930816e-06, + "loss": 0.5453, + "step": 2068 + }, + { + "epoch": 0.16, + "grad_norm": 0.6609644855259922, + "learning_rate": 9.546693948730281e-06, + "loss": 0.5638, + "step": 2069 + }, + { + "epoch": 0.16, + "grad_norm": 1.8268757786279337, + "learning_rate": 9.546164662364596e-06, + "loss": 0.5305, + "step": 2070 + }, + { + "epoch": 0.16, + "grad_norm": 0.6033676829929758, + "learning_rate": 9.545635081868006e-06, + "loss": 0.5666, + "step": 2071 + }, + { + "epoch": 0.16, + "grad_norm": 1.7333667889017632, + "learning_rate": 9.545105207274773e-06, + "loss": 0.5519, + "step": 2072 + }, + { + "epoch": 0.16, + "grad_norm": 1.5616028321094193, + "learning_rate": 9.544575038619176e-06, + "loss": 0.5437, + "step": 2073 + }, + { + "epoch": 0.16, + "grad_norm": 0.6059526506702847, + "learning_rate": 9.54404457593552e-06, + "loss": 0.5433, + "step": 2074 + }, + { + "epoch": 0.16, + "grad_norm": 1.7962628720685776, + "learning_rate": 9.543513819258125e-06, + "loss": 0.4803, + "step": 2075 + }, + { + "epoch": 0.16, + "grad_norm": 1.7038094571996882, + "learning_rate": 9.54298276862133e-06, + "loss": 0.502, + "step": 2076 + }, + { + "epoch": 0.16, + "grad_norm": 1.400098088362526, + "learning_rate": 9.542451424059491e-06, + "loss": 0.4995, + "step": 2077 + }, + { + "epoch": 0.16, + "grad_norm": 1.4428748234460604, + "learning_rate": 9.541919785606988e-06, + "loss": 0.4739, + "step": 2078 + }, + { + "epoch": 0.16, + "grad_norm": 0.6203634553538203, + "learning_rate": 9.541387853298217e-06, + "loss": 0.5493, + "step": 2079 + }, + { + "epoch": 0.16, + "grad_norm": 0.6044456136186844, + "learning_rate": 9.540855627167593e-06, + "loss": 0.5641, + "step": 2080 + }, + { + "epoch": 0.16, + "grad_norm": 2.8488272095627583, + "learning_rate": 9.540323107249549e-06, + "loss": 0.4509, + "step": 2081 + }, + { + "epoch": 0.16, + "grad_norm": 1.8092851040192266, + "learning_rate": 9.53979029357854e-06, + "loss": 0.5494, + "step": 2082 + }, + { + "epoch": 0.16, + "grad_norm": 1.483299768318366, + "learning_rate": 9.539257186189039e-06, + "loss": 0.4873, + "step": 2083 + }, + { + "epoch": 0.16, + "grad_norm": 1.806621437048565, + "learning_rate": 9.538723785115534e-06, + "loss": 0.5327, + "step": 2084 + }, + { + "epoch": 0.16, + "grad_norm": 1.9642216730941904, + "learning_rate": 9.538190090392538e-06, + "loss": 0.5058, + "step": 2085 + }, + { + "epoch": 0.16, + "grad_norm": 1.8153278047222474, + "learning_rate": 9.537656102054582e-06, + "loss": 0.5233, + "step": 2086 + }, + { + "epoch": 0.16, + "grad_norm": 1.415882449847442, + "learning_rate": 9.537121820136208e-06, + "loss": 0.5148, + "step": 2087 + }, + { + "epoch": 0.16, + "grad_norm": 0.6481448236743124, + "learning_rate": 9.536587244671991e-06, + "loss": 0.5725, + "step": 2088 + }, + { + "epoch": 0.16, + "grad_norm": 1.912606776206628, + "learning_rate": 9.536052375696514e-06, + "loss": 0.5396, + "step": 2089 + }, + { + "epoch": 0.16, + "grad_norm": 0.6407457116264279, + "learning_rate": 9.53551721324438e-06, + "loss": 0.5593, + "step": 2090 + }, + { + "epoch": 0.16, + "grad_norm": 1.6193963001497935, + "learning_rate": 9.534981757350218e-06, + "loss": 0.517, + "step": 2091 + }, + { + "epoch": 0.16, + "grad_norm": 1.7685896087585327, + "learning_rate": 9.534446008048667e-06, + "loss": 0.4717, + "step": 2092 + }, + { + "epoch": 0.16, + "grad_norm": 1.4447835250096623, + "learning_rate": 9.53390996537439e-06, + "loss": 0.5443, + "step": 2093 + }, + { + "epoch": 0.16, + "grad_norm": 1.8330164234658446, + "learning_rate": 9.533373629362072e-06, + "loss": 0.5221, + "step": 2094 + }, + { + "epoch": 0.16, + "grad_norm": 1.5040478392684282, + "learning_rate": 9.532837000046408e-06, + "loss": 0.5363, + "step": 2095 + }, + { + "epoch": 0.16, + "grad_norm": 1.931012572435919, + "learning_rate": 9.532300077462122e-06, + "loss": 0.5781, + "step": 2096 + }, + { + "epoch": 0.16, + "grad_norm": 1.5215064601989499, + "learning_rate": 9.531762861643949e-06, + "loss": 0.53, + "step": 2097 + }, + { + "epoch": 0.16, + "grad_norm": 1.5904653867858773, + "learning_rate": 9.531225352626648e-06, + "loss": 0.5171, + "step": 2098 + }, + { + "epoch": 0.16, + "grad_norm": 2.0280920271209886, + "learning_rate": 9.530687550444994e-06, + "loss": 0.5807, + "step": 2099 + }, + { + "epoch": 0.16, + "grad_norm": 1.8054079400137233, + "learning_rate": 9.530149455133782e-06, + "loss": 0.5169, + "step": 2100 + }, + { + "epoch": 0.17, + "grad_norm": 1.6801987212415377, + "learning_rate": 9.529611066727827e-06, + "loss": 0.4622, + "step": 2101 + }, + { + "epoch": 0.17, + "grad_norm": 1.7607987420941, + "learning_rate": 9.529072385261959e-06, + "loss": 0.488, + "step": 2102 + }, + { + "epoch": 0.17, + "grad_norm": 1.7983203641623786, + "learning_rate": 9.528533410771034e-06, + "loss": 0.5122, + "step": 2103 + }, + { + "epoch": 0.17, + "grad_norm": 1.701013361806068, + "learning_rate": 9.527994143289922e-06, + "loss": 0.4882, + "step": 2104 + }, + { + "epoch": 0.17, + "grad_norm": 1.8116597531730263, + "learning_rate": 9.527454582853512e-06, + "loss": 0.471, + "step": 2105 + }, + { + "epoch": 0.17, + "grad_norm": 1.5779240881862402, + "learning_rate": 9.526914729496714e-06, + "loss": 0.5004, + "step": 2106 + }, + { + "epoch": 0.17, + "grad_norm": 1.5187493734507571, + "learning_rate": 9.526374583254454e-06, + "loss": 0.4675, + "step": 2107 + }, + { + "epoch": 0.17, + "grad_norm": 1.3876179761631933, + "learning_rate": 9.525834144161681e-06, + "loss": 0.5093, + "step": 2108 + }, + { + "epoch": 0.17, + "grad_norm": 2.007192540896147, + "learning_rate": 9.525293412253357e-06, + "loss": 0.4598, + "step": 2109 + }, + { + "epoch": 0.17, + "grad_norm": 1.9486770376135167, + "learning_rate": 9.524752387564471e-06, + "loss": 0.5033, + "step": 2110 + }, + { + "epoch": 0.17, + "grad_norm": 0.7501211150585835, + "learning_rate": 9.524211070130023e-06, + "loss": 0.5667, + "step": 2111 + }, + { + "epoch": 0.17, + "grad_norm": 1.9392259572779216, + "learning_rate": 9.523669459985039e-06, + "loss": 0.5638, + "step": 2112 + }, + { + "epoch": 0.17, + "grad_norm": 1.4828847468376445, + "learning_rate": 9.523127557164558e-06, + "loss": 0.5033, + "step": 2113 + }, + { + "epoch": 0.17, + "grad_norm": 0.6065252681880944, + "learning_rate": 9.52258536170364e-06, + "loss": 0.5779, + "step": 2114 + }, + { + "epoch": 0.17, + "grad_norm": 1.6630596262414732, + "learning_rate": 9.522042873637366e-06, + "loss": 0.5046, + "step": 2115 + }, + { + "epoch": 0.17, + "grad_norm": 1.668916647465811, + "learning_rate": 9.521500093000833e-06, + "loss": 0.5286, + "step": 2116 + }, + { + "epoch": 0.17, + "grad_norm": 2.2125270477711876, + "learning_rate": 9.52095701982916e-06, + "loss": 0.4843, + "step": 2117 + }, + { + "epoch": 0.17, + "grad_norm": 2.254204404113744, + "learning_rate": 9.520413654157482e-06, + "loss": 0.5155, + "step": 2118 + }, + { + "epoch": 0.17, + "grad_norm": 1.694252567211013, + "learning_rate": 9.519869996020954e-06, + "loss": 0.5321, + "step": 2119 + }, + { + "epoch": 0.17, + "grad_norm": 2.832320394486869, + "learning_rate": 9.519326045454749e-06, + "loss": 0.4912, + "step": 2120 + }, + { + "epoch": 0.17, + "grad_norm": 1.9471570857269564, + "learning_rate": 9.518781802494062e-06, + "loss": 0.5609, + "step": 2121 + }, + { + "epoch": 0.17, + "grad_norm": 1.539677382065425, + "learning_rate": 9.518237267174103e-06, + "loss": 0.5267, + "step": 2122 + }, + { + "epoch": 0.17, + "grad_norm": 1.4490923506115598, + "learning_rate": 9.517692439530104e-06, + "loss": 0.476, + "step": 2123 + }, + { + "epoch": 0.17, + "grad_norm": 2.629027182551497, + "learning_rate": 9.517147319597313e-06, + "loss": 0.5571, + "step": 2124 + }, + { + "epoch": 0.17, + "grad_norm": 1.7154338341355222, + "learning_rate": 9.516601907411e-06, + "loss": 0.5832, + "step": 2125 + }, + { + "epoch": 0.17, + "grad_norm": 1.729014737154734, + "learning_rate": 9.516056203006453e-06, + "loss": 0.5147, + "step": 2126 + }, + { + "epoch": 0.17, + "grad_norm": 1.3980640712067298, + "learning_rate": 9.515510206418976e-06, + "loss": 0.4696, + "step": 2127 + }, + { + "epoch": 0.17, + "grad_norm": 1.7317943594479357, + "learning_rate": 9.514963917683898e-06, + "loss": 0.476, + "step": 2128 + }, + { + "epoch": 0.17, + "grad_norm": 1.4403536605349305, + "learning_rate": 9.51441733683656e-06, + "loss": 0.5104, + "step": 2129 + }, + { + "epoch": 0.17, + "grad_norm": 1.3785960974255924, + "learning_rate": 9.513870463912324e-06, + "loss": 0.4891, + "step": 2130 + }, + { + "epoch": 0.17, + "grad_norm": 0.9198488849643011, + "learning_rate": 9.513323298946576e-06, + "loss": 0.5551, + "step": 2131 + }, + { + "epoch": 0.17, + "grad_norm": 1.7859998219101592, + "learning_rate": 9.512775841974714e-06, + "loss": 0.5344, + "step": 2132 + }, + { + "epoch": 0.17, + "grad_norm": 0.63740456365635, + "learning_rate": 9.512228093032156e-06, + "loss": 0.5761, + "step": 2133 + }, + { + "epoch": 0.17, + "grad_norm": 2.979243443044541, + "learning_rate": 9.511680052154345e-06, + "loss": 0.5093, + "step": 2134 + }, + { + "epoch": 0.17, + "grad_norm": 2.3713806588894197, + "learning_rate": 9.511131719376738e-06, + "loss": 0.5428, + "step": 2135 + }, + { + "epoch": 0.17, + "grad_norm": 1.9608286744608714, + "learning_rate": 9.51058309473481e-06, + "loss": 0.4815, + "step": 2136 + }, + { + "epoch": 0.17, + "grad_norm": 1.756086181219677, + "learning_rate": 9.510034178264054e-06, + "loss": 0.5526, + "step": 2137 + }, + { + "epoch": 0.17, + "grad_norm": 0.8848825413799138, + "learning_rate": 9.509484969999989e-06, + "loss": 0.5635, + "step": 2138 + }, + { + "epoch": 0.17, + "grad_norm": 1.4996951542792805, + "learning_rate": 9.508935469978144e-06, + "loss": 0.4685, + "step": 2139 + }, + { + "epoch": 0.17, + "grad_norm": 2.9139132477413123, + "learning_rate": 9.508385678234073e-06, + "loss": 0.5109, + "step": 2140 + }, + { + "epoch": 0.17, + "grad_norm": 0.7231226992964058, + "learning_rate": 9.507835594803346e-06, + "loss": 0.5697, + "step": 2141 + }, + { + "epoch": 0.17, + "grad_norm": 0.6828284685680502, + "learning_rate": 9.507285219721553e-06, + "loss": 0.5662, + "step": 2142 + }, + { + "epoch": 0.17, + "grad_norm": 1.955667760172087, + "learning_rate": 9.506734553024304e-06, + "loss": 0.5404, + "step": 2143 + }, + { + "epoch": 0.17, + "grad_norm": 0.6149749604204677, + "learning_rate": 9.506183594747222e-06, + "loss": 0.5597, + "step": 2144 + }, + { + "epoch": 0.17, + "grad_norm": 2.371312231628323, + "learning_rate": 9.505632344925958e-06, + "loss": 0.5172, + "step": 2145 + }, + { + "epoch": 0.17, + "grad_norm": 1.971079035340859, + "learning_rate": 9.505080803596176e-06, + "loss": 0.4778, + "step": 2146 + }, + { + "epoch": 0.17, + "grad_norm": 1.542722165045504, + "learning_rate": 9.50452897079356e-06, + "loss": 0.5361, + "step": 2147 + }, + { + "epoch": 0.17, + "grad_norm": 5.291693875618912, + "learning_rate": 9.503976846553811e-06, + "loss": 0.5631, + "step": 2148 + }, + { + "epoch": 0.17, + "grad_norm": 1.9558789705837276, + "learning_rate": 9.503424430912652e-06, + "loss": 0.5266, + "step": 2149 + }, + { + "epoch": 0.17, + "grad_norm": 1.025466277741234, + "learning_rate": 9.502871723905825e-06, + "loss": 0.5999, + "step": 2150 + }, + { + "epoch": 0.17, + "grad_norm": 1.9666408709868786, + "learning_rate": 9.502318725569085e-06, + "loss": 0.5126, + "step": 2151 + }, + { + "epoch": 0.17, + "grad_norm": 1.666263990859537, + "learning_rate": 9.501765435938216e-06, + "loss": 0.5128, + "step": 2152 + }, + { + "epoch": 0.17, + "grad_norm": 0.7368694923333615, + "learning_rate": 9.501211855049011e-06, + "loss": 0.5582, + "step": 2153 + }, + { + "epoch": 0.17, + "grad_norm": 1.8082447152472192, + "learning_rate": 9.500657982937287e-06, + "loss": 0.4883, + "step": 2154 + }, + { + "epoch": 0.17, + "grad_norm": 1.8936319297779098, + "learning_rate": 9.500103819638879e-06, + "loss": 0.5669, + "step": 2155 + }, + { + "epoch": 0.17, + "grad_norm": 1.9644663181200546, + "learning_rate": 9.499549365189643e-06, + "loss": 0.5797, + "step": 2156 + }, + { + "epoch": 0.17, + "grad_norm": 1.6765265508455378, + "learning_rate": 9.498994619625447e-06, + "loss": 0.545, + "step": 2157 + }, + { + "epoch": 0.17, + "grad_norm": 1.796049908870172, + "learning_rate": 9.498439582982185e-06, + "loss": 0.4534, + "step": 2158 + }, + { + "epoch": 0.17, + "grad_norm": 1.6460298601515715, + "learning_rate": 9.497884255295766e-06, + "loss": 0.5469, + "step": 2159 + }, + { + "epoch": 0.17, + "grad_norm": 1.610860572271665, + "learning_rate": 9.49732863660212e-06, + "loss": 0.4775, + "step": 2160 + }, + { + "epoch": 0.17, + "grad_norm": 0.8344738717138687, + "learning_rate": 9.496772726937196e-06, + "loss": 0.5538, + "step": 2161 + }, + { + "epoch": 0.17, + "grad_norm": 1.439132973223709, + "learning_rate": 9.496216526336957e-06, + "loss": 0.4746, + "step": 2162 + }, + { + "epoch": 0.17, + "grad_norm": 0.7786435972033484, + "learning_rate": 9.495660034837393e-06, + "loss": 0.5623, + "step": 2163 + }, + { + "epoch": 0.17, + "grad_norm": 1.5698008427662702, + "learning_rate": 9.495103252474504e-06, + "loss": 0.5073, + "step": 2164 + }, + { + "epoch": 0.17, + "grad_norm": 1.4909887886223667, + "learning_rate": 9.494546179284314e-06, + "loss": 0.4869, + "step": 2165 + }, + { + "epoch": 0.17, + "grad_norm": 0.5667567774250105, + "learning_rate": 9.493988815302867e-06, + "loss": 0.5447, + "step": 2166 + }, + { + "epoch": 0.17, + "grad_norm": 1.6436980784916715, + "learning_rate": 9.493431160566225e-06, + "loss": 0.4821, + "step": 2167 + }, + { + "epoch": 0.17, + "grad_norm": 1.590159498895422, + "learning_rate": 9.492873215110461e-06, + "loss": 0.5056, + "step": 2168 + }, + { + "epoch": 0.17, + "grad_norm": 1.6764983946149048, + "learning_rate": 9.49231497897168e-06, + "loss": 0.5068, + "step": 2169 + }, + { + "epoch": 0.17, + "grad_norm": 1.628110584941614, + "learning_rate": 9.491756452185995e-06, + "loss": 0.5524, + "step": 2170 + }, + { + "epoch": 0.17, + "grad_norm": 1.7202964215007375, + "learning_rate": 9.491197634789544e-06, + "loss": 0.4811, + "step": 2171 + }, + { + "epoch": 0.17, + "grad_norm": 1.7719591304936537, + "learning_rate": 9.490638526818482e-06, + "loss": 0.5336, + "step": 2172 + }, + { + "epoch": 0.17, + "grad_norm": 1.6335562008335842, + "learning_rate": 9.49007912830898e-06, + "loss": 0.5013, + "step": 2173 + }, + { + "epoch": 0.17, + "grad_norm": 3.4384627257461466, + "learning_rate": 9.489519439297234e-06, + "loss": 0.5701, + "step": 2174 + }, + { + "epoch": 0.17, + "grad_norm": 1.5538572384087763, + "learning_rate": 9.488959459819452e-06, + "loss": 0.4505, + "step": 2175 + }, + { + "epoch": 0.17, + "grad_norm": 1.013405254450756, + "learning_rate": 9.488399189911866e-06, + "loss": 0.5855, + "step": 2176 + }, + { + "epoch": 0.17, + "grad_norm": 2.539930182953416, + "learning_rate": 9.487838629610725e-06, + "loss": 0.5002, + "step": 2177 + }, + { + "epoch": 0.17, + "grad_norm": 1.8617361968328119, + "learning_rate": 9.487277778952293e-06, + "loss": 0.486, + "step": 2178 + }, + { + "epoch": 0.17, + "grad_norm": 4.132060974960517, + "learning_rate": 9.486716637972862e-06, + "loss": 0.5837, + "step": 2179 + }, + { + "epoch": 0.17, + "grad_norm": 0.7430838648860986, + "learning_rate": 9.486155206708732e-06, + "loss": 0.5591, + "step": 2180 + }, + { + "epoch": 0.17, + "grad_norm": 1.715512990093565, + "learning_rate": 9.485593485196227e-06, + "loss": 0.5042, + "step": 2181 + }, + { + "epoch": 0.17, + "grad_norm": 1.5721527506048059, + "learning_rate": 9.485031473471693e-06, + "loss": 0.4908, + "step": 2182 + }, + { + "epoch": 0.17, + "grad_norm": 3.6696885337679284, + "learning_rate": 9.484469171571491e-06, + "loss": 0.5093, + "step": 2183 + }, + { + "epoch": 0.17, + "grad_norm": 3.250448576673862, + "learning_rate": 9.483906579531998e-06, + "loss": 0.5087, + "step": 2184 + }, + { + "epoch": 0.17, + "grad_norm": 0.6823222839716344, + "learning_rate": 9.483343697389617e-06, + "loss": 0.5696, + "step": 2185 + }, + { + "epoch": 0.17, + "grad_norm": 0.685245524775517, + "learning_rate": 9.482780525180763e-06, + "loss": 0.537, + "step": 2186 + }, + { + "epoch": 0.17, + "grad_norm": 2.207196749810209, + "learning_rate": 9.482217062941872e-06, + "loss": 0.5517, + "step": 2187 + }, + { + "epoch": 0.17, + "grad_norm": 1.545300585877083, + "learning_rate": 9.481653310709402e-06, + "loss": 0.5324, + "step": 2188 + }, + { + "epoch": 0.17, + "grad_norm": 1.535925527956117, + "learning_rate": 9.481089268519825e-06, + "loss": 0.4974, + "step": 2189 + }, + { + "epoch": 0.17, + "grad_norm": 1.6625140261892406, + "learning_rate": 9.480524936409634e-06, + "loss": 0.5228, + "step": 2190 + }, + { + "epoch": 0.17, + "grad_norm": 1.9596707494165937, + "learning_rate": 9.479960314415341e-06, + "loss": 0.5392, + "step": 2191 + }, + { + "epoch": 0.17, + "grad_norm": 0.6415917752005628, + "learning_rate": 9.479395402573476e-06, + "loss": 0.5564, + "step": 2192 + }, + { + "epoch": 0.17, + "grad_norm": 1.209430408861574, + "learning_rate": 9.47883020092059e-06, + "loss": 0.4596, + "step": 2193 + }, + { + "epoch": 0.17, + "grad_norm": 1.4880507942805108, + "learning_rate": 9.478264709493249e-06, + "loss": 0.5139, + "step": 2194 + }, + { + "epoch": 0.17, + "grad_norm": 1.5931903712120157, + "learning_rate": 9.477698928328036e-06, + "loss": 0.5265, + "step": 2195 + }, + { + "epoch": 0.17, + "grad_norm": 2.0880721146531847, + "learning_rate": 9.477132857461563e-06, + "loss": 0.5657, + "step": 2196 + }, + { + "epoch": 0.17, + "grad_norm": 1.8169151361905544, + "learning_rate": 9.476566496930451e-06, + "loss": 0.5304, + "step": 2197 + }, + { + "epoch": 0.17, + "grad_norm": 2.3868525052029015, + "learning_rate": 9.47599984677134e-06, + "loss": 0.5653, + "step": 2198 + }, + { + "epoch": 0.17, + "grad_norm": 2.3511931780378257, + "learning_rate": 9.475432907020896e-06, + "loss": 0.5451, + "step": 2199 + }, + { + "epoch": 0.17, + "grad_norm": 1.4832665092620316, + "learning_rate": 9.474865677715798e-06, + "loss": 0.4904, + "step": 2200 + }, + { + "epoch": 0.17, + "grad_norm": 1.5152439314768429, + "learning_rate": 9.474298158892745e-06, + "loss": 0.4939, + "step": 2201 + }, + { + "epoch": 0.17, + "grad_norm": 1.6793842839059967, + "learning_rate": 9.473730350588452e-06, + "loss": 0.5504, + "step": 2202 + }, + { + "epoch": 0.17, + "grad_norm": 1.755852820715602, + "learning_rate": 9.47316225283966e-06, + "loss": 0.5135, + "step": 2203 + }, + { + "epoch": 0.17, + "grad_norm": 1.5133950253550443, + "learning_rate": 9.472593865683123e-06, + "loss": 0.535, + "step": 2204 + }, + { + "epoch": 0.17, + "grad_norm": 1.7667686447617437, + "learning_rate": 9.47202518915561e-06, + "loss": 0.5206, + "step": 2205 + }, + { + "epoch": 0.17, + "grad_norm": 2.1064077042558353, + "learning_rate": 9.47145622329392e-06, + "loss": 0.4994, + "step": 2206 + }, + { + "epoch": 0.17, + "grad_norm": 1.689717774744424, + "learning_rate": 9.47088696813486e-06, + "loss": 0.5019, + "step": 2207 + }, + { + "epoch": 0.17, + "grad_norm": 0.6981276524475272, + "learning_rate": 9.470317423715263e-06, + "loss": 0.5795, + "step": 2208 + }, + { + "epoch": 0.17, + "grad_norm": 2.2714049819003086, + "learning_rate": 9.469747590071976e-06, + "loss": 0.5373, + "step": 2209 + }, + { + "epoch": 0.17, + "grad_norm": 2.076811823304265, + "learning_rate": 9.469177467241867e-06, + "loss": 0.5172, + "step": 2210 + }, + { + "epoch": 0.17, + "grad_norm": 1.8150532772877375, + "learning_rate": 9.468607055261822e-06, + "loss": 0.4869, + "step": 2211 + }, + { + "epoch": 0.17, + "grad_norm": 2.0898307624149535, + "learning_rate": 9.468036354168748e-06, + "loss": 0.5526, + "step": 2212 + }, + { + "epoch": 0.17, + "grad_norm": 1.886619357714772, + "learning_rate": 9.467465363999566e-06, + "loss": 0.4988, + "step": 2213 + }, + { + "epoch": 0.17, + "grad_norm": 2.4782408804663034, + "learning_rate": 9.466894084791218e-06, + "loss": 0.5105, + "step": 2214 + }, + { + "epoch": 0.17, + "grad_norm": 1.7335450606919611, + "learning_rate": 9.466322516580668e-06, + "loss": 0.5366, + "step": 2215 + }, + { + "epoch": 0.17, + "grad_norm": 1.753197706197562, + "learning_rate": 9.465750659404894e-06, + "loss": 0.5353, + "step": 2216 + }, + { + "epoch": 0.17, + "grad_norm": 1.7451030108602756, + "learning_rate": 9.465178513300892e-06, + "loss": 0.501, + "step": 2217 + }, + { + "epoch": 0.17, + "grad_norm": 2.0408632587346482, + "learning_rate": 9.464606078305683e-06, + "loss": 0.5093, + "step": 2218 + }, + { + "epoch": 0.17, + "grad_norm": 1.8795835710619089, + "learning_rate": 9.4640333544563e-06, + "loss": 0.5329, + "step": 2219 + }, + { + "epoch": 0.17, + "grad_norm": 1.7439632386830464, + "learning_rate": 9.463460341789799e-06, + "loss": 0.4925, + "step": 2220 + }, + { + "epoch": 0.17, + "grad_norm": 0.6178824467668695, + "learning_rate": 9.462887040343254e-06, + "loss": 0.5473, + "step": 2221 + }, + { + "epoch": 0.17, + "grad_norm": 1.5467664061824875, + "learning_rate": 9.462313450153754e-06, + "loss": 0.507, + "step": 2222 + }, + { + "epoch": 0.17, + "grad_norm": 2.8949400220430728, + "learning_rate": 9.461739571258413e-06, + "loss": 0.5455, + "step": 2223 + }, + { + "epoch": 0.17, + "grad_norm": 1.5390597003227833, + "learning_rate": 9.461165403694357e-06, + "loss": 0.5307, + "step": 2224 + }, + { + "epoch": 0.17, + "grad_norm": 2.0636367595569562, + "learning_rate": 9.460590947498736e-06, + "loss": 0.5532, + "step": 2225 + }, + { + "epoch": 0.17, + "grad_norm": 1.4664087207213812, + "learning_rate": 9.460016202708714e-06, + "loss": 0.5334, + "step": 2226 + }, + { + "epoch": 0.17, + "grad_norm": 1.8014685693556765, + "learning_rate": 9.45944116936148e-06, + "loss": 0.4824, + "step": 2227 + }, + { + "epoch": 0.17, + "grad_norm": 1.5358028986672532, + "learning_rate": 9.458865847494236e-06, + "loss": 0.5024, + "step": 2228 + }, + { + "epoch": 0.18, + "grad_norm": 0.6573628026759983, + "learning_rate": 9.458290237144205e-06, + "loss": 0.5842, + "step": 2229 + }, + { + "epoch": 0.18, + "grad_norm": 1.6602263633637209, + "learning_rate": 9.457714338348626e-06, + "loss": 0.5159, + "step": 2230 + }, + { + "epoch": 0.18, + "grad_norm": 0.6139024681438195, + "learning_rate": 9.457138151144764e-06, + "loss": 0.5792, + "step": 2231 + }, + { + "epoch": 0.18, + "grad_norm": 0.5945183555884275, + "learning_rate": 9.456561675569892e-06, + "loss": 0.5278, + "step": 2232 + }, + { + "epoch": 0.18, + "grad_norm": 0.6033556554984202, + "learning_rate": 9.455984911661309e-06, + "loss": 0.5485, + "step": 2233 + }, + { + "epoch": 0.18, + "grad_norm": 0.5674993633382198, + "learning_rate": 9.455407859456332e-06, + "loss": 0.5783, + "step": 2234 + }, + { + "epoch": 0.18, + "grad_norm": 2.11408179360972, + "learning_rate": 9.454830518992294e-06, + "loss": 0.5537, + "step": 2235 + }, + { + "epoch": 0.18, + "grad_norm": 1.4763932610312303, + "learning_rate": 9.454252890306552e-06, + "loss": 0.5417, + "step": 2236 + }, + { + "epoch": 0.18, + "grad_norm": 1.6117372283316223, + "learning_rate": 9.453674973436473e-06, + "loss": 0.5166, + "step": 2237 + }, + { + "epoch": 0.18, + "grad_norm": 1.678477108921375, + "learning_rate": 9.45309676841945e-06, + "loss": 0.4941, + "step": 2238 + }, + { + "epoch": 0.18, + "grad_norm": 2.4799043555394564, + "learning_rate": 9.45251827529289e-06, + "loss": 0.4846, + "step": 2239 + }, + { + "epoch": 0.18, + "grad_norm": 0.6135290399646561, + "learning_rate": 9.451939494094224e-06, + "loss": 0.5539, + "step": 2240 + }, + { + "epoch": 0.18, + "grad_norm": 1.7848469738025845, + "learning_rate": 9.451360424860894e-06, + "loss": 0.4955, + "step": 2241 + }, + { + "epoch": 0.18, + "grad_norm": 0.5525750642375514, + "learning_rate": 9.45078106763037e-06, + "loss": 0.5581, + "step": 2242 + }, + { + "epoch": 0.18, + "grad_norm": 1.5881333343918638, + "learning_rate": 9.450201422440133e-06, + "loss": 0.4898, + "step": 2243 + }, + { + "epoch": 0.18, + "grad_norm": 2.0301774323972652, + "learning_rate": 9.449621489327685e-06, + "loss": 0.4648, + "step": 2244 + }, + { + "epoch": 0.18, + "grad_norm": 1.8841544844523594, + "learning_rate": 9.449041268330549e-06, + "loss": 0.5193, + "step": 2245 + }, + { + "epoch": 0.18, + "grad_norm": 1.4764900810287394, + "learning_rate": 9.44846075948626e-06, + "loss": 0.5027, + "step": 2246 + }, + { + "epoch": 0.18, + "grad_norm": 0.5954800081112893, + "learning_rate": 9.44787996283238e-06, + "loss": 0.5478, + "step": 2247 + }, + { + "epoch": 0.18, + "grad_norm": 1.531125130556519, + "learning_rate": 9.447298878406485e-06, + "loss": 0.5076, + "step": 2248 + }, + { + "epoch": 0.18, + "grad_norm": 1.5958142527672674, + "learning_rate": 9.446717506246172e-06, + "loss": 0.5129, + "step": 2249 + }, + { + "epoch": 0.18, + "grad_norm": 1.8909909750083016, + "learning_rate": 9.446135846389053e-06, + "loss": 0.5618, + "step": 2250 + }, + { + "epoch": 0.18, + "grad_norm": 2.0486469940678385, + "learning_rate": 9.44555389887276e-06, + "loss": 0.5214, + "step": 2251 + }, + { + "epoch": 0.18, + "grad_norm": 1.6477507168927046, + "learning_rate": 9.444971663734947e-06, + "loss": 0.5282, + "step": 2252 + }, + { + "epoch": 0.18, + "grad_norm": 1.4901228179774741, + "learning_rate": 9.44438914101328e-06, + "loss": 0.4987, + "step": 2253 + }, + { + "epoch": 0.18, + "grad_norm": 1.7435292091271193, + "learning_rate": 9.443806330745452e-06, + "loss": 0.5298, + "step": 2254 + }, + { + "epoch": 0.18, + "grad_norm": 1.6021033278007386, + "learning_rate": 9.443223232969166e-06, + "loss": 0.483, + "step": 2255 + }, + { + "epoch": 0.18, + "grad_norm": 1.731336969626263, + "learning_rate": 9.442639847722148e-06, + "loss": 0.5087, + "step": 2256 + }, + { + "epoch": 0.18, + "grad_norm": 1.5747625880693306, + "learning_rate": 9.442056175042144e-06, + "loss": 0.4914, + "step": 2257 + }, + { + "epoch": 0.18, + "grad_norm": 1.782548503408499, + "learning_rate": 9.441472214966917e-06, + "loss": 0.5115, + "step": 2258 + }, + { + "epoch": 0.18, + "grad_norm": 1.520027621733375, + "learning_rate": 9.440887967534249e-06, + "loss": 0.4215, + "step": 2259 + }, + { + "epoch": 0.18, + "grad_norm": 1.4367831231971078, + "learning_rate": 9.440303432781938e-06, + "loss": 0.5158, + "step": 2260 + }, + { + "epoch": 0.18, + "grad_norm": 1.8834995644565726, + "learning_rate": 9.439718610747804e-06, + "loss": 0.5263, + "step": 2261 + }, + { + "epoch": 0.18, + "grad_norm": 1.6061854911063451, + "learning_rate": 9.439133501469684e-06, + "loss": 0.5204, + "step": 2262 + }, + { + "epoch": 0.18, + "grad_norm": 1.459743685437855, + "learning_rate": 9.438548104985433e-06, + "loss": 0.4606, + "step": 2263 + }, + { + "epoch": 0.18, + "grad_norm": 2.0991237285913638, + "learning_rate": 9.437962421332926e-06, + "loss": 0.5086, + "step": 2264 + }, + { + "epoch": 0.18, + "grad_norm": 1.7677020064549238, + "learning_rate": 9.437376450550055e-06, + "loss": 0.5572, + "step": 2265 + }, + { + "epoch": 0.18, + "grad_norm": 2.033569799019311, + "learning_rate": 9.436790192674734e-06, + "loss": 0.5466, + "step": 2266 + }, + { + "epoch": 0.18, + "grad_norm": 2.12638574212309, + "learning_rate": 9.43620364774489e-06, + "loss": 0.5222, + "step": 2267 + }, + { + "epoch": 0.18, + "grad_norm": 0.6705963302741609, + "learning_rate": 9.435616815798476e-06, + "loss": 0.5523, + "step": 2268 + }, + { + "epoch": 0.18, + "grad_norm": 2.2791309706239464, + "learning_rate": 9.435029696873455e-06, + "loss": 0.5978, + "step": 2269 + }, + { + "epoch": 0.18, + "grad_norm": 1.634397078943637, + "learning_rate": 9.434442291007812e-06, + "loss": 0.5036, + "step": 2270 + }, + { + "epoch": 0.18, + "grad_norm": 0.5920390987773815, + "learning_rate": 9.433854598239557e-06, + "loss": 0.5565, + "step": 2271 + }, + { + "epoch": 0.18, + "grad_norm": 2.4070029869294918, + "learning_rate": 9.433266618606708e-06, + "loss": 0.4633, + "step": 2272 + }, + { + "epoch": 0.18, + "grad_norm": 2.41621220579687, + "learning_rate": 9.432678352147309e-06, + "loss": 0.5047, + "step": 2273 + }, + { + "epoch": 0.18, + "grad_norm": 1.4620047371608027, + "learning_rate": 9.43208979889942e-06, + "loss": 0.5282, + "step": 2274 + }, + { + "epoch": 0.18, + "grad_norm": 1.812494745292715, + "learning_rate": 9.431500958901118e-06, + "loss": 0.5393, + "step": 2275 + }, + { + "epoch": 0.18, + "grad_norm": 1.978913246603885, + "learning_rate": 9.4309118321905e-06, + "loss": 0.529, + "step": 2276 + }, + { + "epoch": 0.18, + "grad_norm": 1.4983438275082541, + "learning_rate": 9.430322418805683e-06, + "loss": 0.492, + "step": 2277 + }, + { + "epoch": 0.18, + "grad_norm": 1.4951832867201973, + "learning_rate": 9.429732718784803e-06, + "loss": 0.5144, + "step": 2278 + }, + { + "epoch": 0.18, + "grad_norm": 1.7883941178800622, + "learning_rate": 9.42914273216601e-06, + "loss": 0.5328, + "step": 2279 + }, + { + "epoch": 0.18, + "grad_norm": 0.647159648946074, + "learning_rate": 9.428552458987476e-06, + "loss": 0.5458, + "step": 2280 + }, + { + "epoch": 0.18, + "grad_norm": 1.7109976036822474, + "learning_rate": 9.427961899287393e-06, + "loss": 0.475, + "step": 2281 + }, + { + "epoch": 0.18, + "grad_norm": 1.421955507918678, + "learning_rate": 9.427371053103964e-06, + "loss": 0.477, + "step": 2282 + }, + { + "epoch": 0.18, + "grad_norm": 1.733695639419623, + "learning_rate": 9.426779920475424e-06, + "loss": 0.5275, + "step": 2283 + }, + { + "epoch": 0.18, + "grad_norm": 1.5560713998579256, + "learning_rate": 9.42618850144001e-06, + "loss": 0.499, + "step": 2284 + }, + { + "epoch": 0.18, + "grad_norm": 2.712873691592354, + "learning_rate": 9.425596796035991e-06, + "loss": 0.5177, + "step": 2285 + }, + { + "epoch": 0.18, + "grad_norm": 1.5795401756221055, + "learning_rate": 9.425004804301651e-06, + "loss": 0.5025, + "step": 2286 + }, + { + "epoch": 0.18, + "grad_norm": 0.6292684447911882, + "learning_rate": 9.424412526275287e-06, + "loss": 0.5459, + "step": 2287 + }, + { + "epoch": 0.18, + "grad_norm": 1.4913136948225565, + "learning_rate": 9.423819961995222e-06, + "loss": 0.5503, + "step": 2288 + }, + { + "epoch": 0.18, + "grad_norm": 1.544311050129815, + "learning_rate": 9.42322711149979e-06, + "loss": 0.5331, + "step": 2289 + }, + { + "epoch": 0.18, + "grad_norm": 1.5922629329754747, + "learning_rate": 9.422633974827351e-06, + "loss": 0.4649, + "step": 2290 + }, + { + "epoch": 0.18, + "grad_norm": 1.2765294820326885, + "learning_rate": 9.422040552016281e-06, + "loss": 0.5001, + "step": 2291 + }, + { + "epoch": 0.18, + "grad_norm": 1.5271345926361055, + "learning_rate": 9.421446843104971e-06, + "loss": 0.525, + "step": 2292 + }, + { + "epoch": 0.18, + "grad_norm": 2.075259498546482, + "learning_rate": 9.420852848131834e-06, + "loss": 0.5178, + "step": 2293 + }, + { + "epoch": 0.18, + "grad_norm": 1.9203221083847481, + "learning_rate": 9.420258567135301e-06, + "loss": 0.4975, + "step": 2294 + }, + { + "epoch": 0.18, + "grad_norm": 0.5971047418818387, + "learning_rate": 9.419664000153822e-06, + "loss": 0.5732, + "step": 2295 + }, + { + "epoch": 0.18, + "grad_norm": 0.5730136796337478, + "learning_rate": 9.41906914722586e-06, + "loss": 0.5372, + "step": 2296 + }, + { + "epoch": 0.18, + "grad_norm": 2.103266285482752, + "learning_rate": 9.418474008389909e-06, + "loss": 0.5174, + "step": 2297 + }, + { + "epoch": 0.18, + "grad_norm": 0.6108121294380026, + "learning_rate": 9.41787858368447e-06, + "loss": 0.56, + "step": 2298 + }, + { + "epoch": 0.18, + "grad_norm": 1.4329732799630057, + "learning_rate": 9.417282873148064e-06, + "loss": 0.4745, + "step": 2299 + }, + { + "epoch": 0.18, + "grad_norm": 1.8874811943930814, + "learning_rate": 9.416686876819236e-06, + "loss": 0.5307, + "step": 2300 + }, + { + "epoch": 0.18, + "grad_norm": 1.4378028787099404, + "learning_rate": 9.416090594736544e-06, + "loss": 0.475, + "step": 2301 + }, + { + "epoch": 0.18, + "grad_norm": 2.3691513629188687, + "learning_rate": 9.415494026938567e-06, + "loss": 0.5209, + "step": 2302 + }, + { + "epoch": 0.18, + "grad_norm": 1.7221289050405197, + "learning_rate": 9.414897173463903e-06, + "loss": 0.5221, + "step": 2303 + }, + { + "epoch": 0.18, + "grad_norm": 1.5929202454879234, + "learning_rate": 9.414300034351168e-06, + "loss": 0.4972, + "step": 2304 + }, + { + "epoch": 0.18, + "grad_norm": 0.6109479125422732, + "learning_rate": 9.413702609638996e-06, + "loss": 0.5677, + "step": 2305 + }, + { + "epoch": 0.18, + "grad_norm": 0.6363986099004775, + "learning_rate": 9.413104899366037e-06, + "loss": 0.5724, + "step": 2306 + }, + { + "epoch": 0.18, + "grad_norm": 2.391726569386818, + "learning_rate": 9.412506903570967e-06, + "loss": 0.5378, + "step": 2307 + }, + { + "epoch": 0.18, + "grad_norm": 1.989213820339274, + "learning_rate": 9.41190862229247e-06, + "loss": 0.5336, + "step": 2308 + }, + { + "epoch": 0.18, + "grad_norm": 1.6785683709464219, + "learning_rate": 9.411310055569256e-06, + "loss": 0.5, + "step": 2309 + }, + { + "epoch": 0.18, + "grad_norm": 0.6382645087715891, + "learning_rate": 9.410711203440054e-06, + "loss": 0.5829, + "step": 2310 + }, + { + "epoch": 0.18, + "grad_norm": 1.6535352912465338, + "learning_rate": 9.410112065943607e-06, + "loss": 0.5324, + "step": 2311 + }, + { + "epoch": 0.18, + "grad_norm": 2.080201920354252, + "learning_rate": 9.409512643118677e-06, + "loss": 0.5298, + "step": 2312 + }, + { + "epoch": 0.18, + "grad_norm": 2.227892563087289, + "learning_rate": 9.40891293500405e-06, + "loss": 0.5825, + "step": 2313 + }, + { + "epoch": 0.18, + "grad_norm": 1.7525007702316326, + "learning_rate": 9.408312941638522e-06, + "loss": 0.4959, + "step": 2314 + }, + { + "epoch": 0.18, + "grad_norm": 1.7172000295472807, + "learning_rate": 9.407712663060912e-06, + "loss": 0.5381, + "step": 2315 + }, + { + "epoch": 0.18, + "grad_norm": 1.3769873144991938, + "learning_rate": 9.40711209931006e-06, + "loss": 0.4477, + "step": 2316 + }, + { + "epoch": 0.18, + "grad_norm": 1.9541325857621255, + "learning_rate": 9.40651125042482e-06, + "loss": 0.472, + "step": 2317 + }, + { + "epoch": 0.18, + "grad_norm": 1.5255693045981917, + "learning_rate": 9.405910116444069e-06, + "loss": 0.5291, + "step": 2318 + }, + { + "epoch": 0.18, + "grad_norm": 1.4968811711312802, + "learning_rate": 9.405308697406696e-06, + "loss": 0.5866, + "step": 2319 + }, + { + "epoch": 0.18, + "grad_norm": 1.5268983974770414, + "learning_rate": 9.404706993351612e-06, + "loss": 0.5451, + "step": 2320 + }, + { + "epoch": 0.18, + "grad_norm": 1.948464827515323, + "learning_rate": 9.404105004317748e-06, + "loss": 0.5353, + "step": 2321 + }, + { + "epoch": 0.18, + "grad_norm": 2.0058347411167414, + "learning_rate": 9.403502730344051e-06, + "loss": 0.5327, + "step": 2322 + }, + { + "epoch": 0.18, + "grad_norm": 1.614822854056582, + "learning_rate": 9.402900171469488e-06, + "loss": 0.5125, + "step": 2323 + }, + { + "epoch": 0.18, + "grad_norm": 1.4282983644306353, + "learning_rate": 9.402297327733046e-06, + "loss": 0.5486, + "step": 2324 + }, + { + "epoch": 0.18, + "grad_norm": 0.663568770889312, + "learning_rate": 9.401694199173723e-06, + "loss": 0.5623, + "step": 2325 + }, + { + "epoch": 0.18, + "grad_norm": 2.001983857730837, + "learning_rate": 9.401090785830544e-06, + "loss": 0.579, + "step": 2326 + }, + { + "epoch": 0.18, + "grad_norm": 1.6503426383795254, + "learning_rate": 9.40048708774255e-06, + "loss": 0.4753, + "step": 2327 + }, + { + "epoch": 0.18, + "grad_norm": 1.6427289067911675, + "learning_rate": 9.399883104948796e-06, + "loss": 0.4748, + "step": 2328 + }, + { + "epoch": 0.18, + "grad_norm": 5.804868454992703, + "learning_rate": 9.399278837488361e-06, + "loss": 0.5749, + "step": 2329 + }, + { + "epoch": 0.18, + "grad_norm": 1.613403289304872, + "learning_rate": 9.398674285400343e-06, + "loss": 0.5042, + "step": 2330 + }, + { + "epoch": 0.18, + "grad_norm": 0.6462151024874334, + "learning_rate": 9.39806944872385e-06, + "loss": 0.5336, + "step": 2331 + }, + { + "epoch": 0.18, + "grad_norm": 1.4372057471315656, + "learning_rate": 9.397464327498017e-06, + "loss": 0.4723, + "step": 2332 + }, + { + "epoch": 0.18, + "grad_norm": 2.0397146824410655, + "learning_rate": 9.396858921761997e-06, + "loss": 0.5209, + "step": 2333 + }, + { + "epoch": 0.18, + "grad_norm": 2.163052873395621, + "learning_rate": 9.396253231554955e-06, + "loss": 0.4878, + "step": 2334 + }, + { + "epoch": 0.18, + "grad_norm": 2.273409355808137, + "learning_rate": 9.39564725691608e-06, + "loss": 0.4835, + "step": 2335 + }, + { + "epoch": 0.18, + "grad_norm": 1.8436663511430758, + "learning_rate": 9.395040997884577e-06, + "loss": 0.4967, + "step": 2336 + }, + { + "epoch": 0.18, + "grad_norm": 1.9421779480408548, + "learning_rate": 9.394434454499672e-06, + "loss": 0.5519, + "step": 2337 + }, + { + "epoch": 0.18, + "grad_norm": 1.7358446237705316, + "learning_rate": 9.393827626800607e-06, + "loss": 0.5457, + "step": 2338 + }, + { + "epoch": 0.18, + "grad_norm": 1.6700770126555389, + "learning_rate": 9.39322051482664e-06, + "loss": 0.4648, + "step": 2339 + }, + { + "epoch": 0.18, + "grad_norm": 1.4697413164756665, + "learning_rate": 9.392613118617055e-06, + "loss": 0.5048, + "step": 2340 + }, + { + "epoch": 0.18, + "grad_norm": 1.6671480675763035, + "learning_rate": 9.392005438211146e-06, + "loss": 0.5176, + "step": 2341 + }, + { + "epoch": 0.18, + "grad_norm": 0.5875622720488974, + "learning_rate": 9.391397473648232e-06, + "loss": 0.5256, + "step": 2342 + }, + { + "epoch": 0.18, + "grad_norm": 1.7115004482474931, + "learning_rate": 9.390789224967646e-06, + "loss": 0.5122, + "step": 2343 + }, + { + "epoch": 0.18, + "grad_norm": 1.7742578424242894, + "learning_rate": 9.39018069220874e-06, + "loss": 0.5419, + "step": 2344 + }, + { + "epoch": 0.18, + "grad_norm": 3.152852308694212, + "learning_rate": 9.389571875410887e-06, + "loss": 0.5359, + "step": 2345 + }, + { + "epoch": 0.18, + "grad_norm": 1.5067936452570176, + "learning_rate": 9.388962774613476e-06, + "loss": 0.4673, + "step": 2346 + }, + { + "epoch": 0.18, + "grad_norm": 1.9685809243174113, + "learning_rate": 9.388353389855914e-06, + "loss": 0.4706, + "step": 2347 + }, + { + "epoch": 0.18, + "grad_norm": 1.6114036773698448, + "learning_rate": 9.387743721177628e-06, + "loss": 0.5156, + "step": 2348 + }, + { + "epoch": 0.18, + "grad_norm": 1.9090615151170502, + "learning_rate": 9.387133768618065e-06, + "loss": 0.489, + "step": 2349 + }, + { + "epoch": 0.18, + "grad_norm": 2.2684801172775457, + "learning_rate": 9.386523532216686e-06, + "loss": 0.556, + "step": 2350 + }, + { + "epoch": 0.18, + "grad_norm": 1.7651268602140282, + "learning_rate": 9.385913012012972e-06, + "loss": 0.5176, + "step": 2351 + }, + { + "epoch": 0.18, + "grad_norm": 1.3892053106044289, + "learning_rate": 9.385302208046423e-06, + "loss": 0.4673, + "step": 2352 + }, + { + "epoch": 0.18, + "grad_norm": 2.5423652308794584, + "learning_rate": 9.384691120356559e-06, + "loss": 0.4989, + "step": 2353 + }, + { + "epoch": 0.18, + "grad_norm": 2.783193599489322, + "learning_rate": 9.384079748982915e-06, + "loss": 0.5681, + "step": 2354 + }, + { + "epoch": 0.18, + "grad_norm": 1.4606282353341635, + "learning_rate": 9.383468093965046e-06, + "loss": 0.4939, + "step": 2355 + }, + { + "epoch": 0.19, + "grad_norm": 2.3812283129012592, + "learning_rate": 9.382856155342528e-06, + "loss": 0.537, + "step": 2356 + }, + { + "epoch": 0.19, + "grad_norm": 1.4507625015503391, + "learning_rate": 9.382243933154947e-06, + "loss": 0.5429, + "step": 2357 + }, + { + "epoch": 0.19, + "grad_norm": 1.938593077995465, + "learning_rate": 9.381631427441917e-06, + "loss": 0.5186, + "step": 2358 + }, + { + "epoch": 0.19, + "grad_norm": 2.4418782447928233, + "learning_rate": 9.381018638243066e-06, + "loss": 0.5648, + "step": 2359 + }, + { + "epoch": 0.19, + "grad_norm": 1.746298084827724, + "learning_rate": 9.38040556559804e-06, + "loss": 0.5089, + "step": 2360 + }, + { + "epoch": 0.19, + "grad_norm": 1.7254748018946469, + "learning_rate": 9.379792209546506e-06, + "loss": 0.4985, + "step": 2361 + }, + { + "epoch": 0.19, + "grad_norm": 1.5121379859610484, + "learning_rate": 9.379178570128144e-06, + "loss": 0.4862, + "step": 2362 + }, + { + "epoch": 0.19, + "grad_norm": 1.5273655850392251, + "learning_rate": 9.378564647382657e-06, + "loss": 0.4773, + "step": 2363 + }, + { + "epoch": 0.19, + "grad_norm": 1.4928443462884877, + "learning_rate": 9.377950441349765e-06, + "loss": 0.5315, + "step": 2364 + }, + { + "epoch": 0.19, + "grad_norm": 0.6954159335832535, + "learning_rate": 9.377335952069207e-06, + "loss": 0.5536, + "step": 2365 + }, + { + "epoch": 0.19, + "grad_norm": 1.753455783569322, + "learning_rate": 9.37672117958074e-06, + "loss": 0.4251, + "step": 2366 + }, + { + "epoch": 0.19, + "grad_norm": 1.3533838293796452, + "learning_rate": 9.376106123924135e-06, + "loss": 0.5111, + "step": 2367 + }, + { + "epoch": 0.19, + "grad_norm": 0.5963480316635105, + "learning_rate": 9.375490785139192e-06, + "loss": 0.5606, + "step": 2368 + }, + { + "epoch": 0.19, + "grad_norm": 0.5838545600315282, + "learning_rate": 9.374875163265718e-06, + "loss": 0.5421, + "step": 2369 + }, + { + "epoch": 0.19, + "grad_norm": 2.3471547695430006, + "learning_rate": 9.374259258343543e-06, + "loss": 0.498, + "step": 2370 + }, + { + "epoch": 0.19, + "grad_norm": 2.120084112071938, + "learning_rate": 9.373643070412516e-06, + "loss": 0.4499, + "step": 2371 + }, + { + "epoch": 0.19, + "grad_norm": 1.6153495042905446, + "learning_rate": 9.373026599512504e-06, + "loss": 0.4955, + "step": 2372 + }, + { + "epoch": 0.19, + "grad_norm": 1.5706488942444603, + "learning_rate": 9.372409845683395e-06, + "loss": 0.4992, + "step": 2373 + }, + { + "epoch": 0.19, + "grad_norm": 1.3773229266923068, + "learning_rate": 9.371792808965086e-06, + "loss": 0.473, + "step": 2374 + }, + { + "epoch": 0.19, + "grad_norm": 1.5106853887897262, + "learning_rate": 9.371175489397501e-06, + "loss": 0.5012, + "step": 2375 + }, + { + "epoch": 0.19, + "grad_norm": 14.224879493818861, + "learning_rate": 9.370557887020579e-06, + "loss": 0.5281, + "step": 2376 + }, + { + "epoch": 0.19, + "grad_norm": 0.7067366198261651, + "learning_rate": 9.369940001874282e-06, + "loss": 0.5714, + "step": 2377 + }, + { + "epoch": 0.19, + "grad_norm": 1.4186155039940087, + "learning_rate": 9.369321833998582e-06, + "loss": 0.4996, + "step": 2378 + }, + { + "epoch": 0.19, + "grad_norm": 1.6582392131451884, + "learning_rate": 9.368703383433476e-06, + "loss": 0.5299, + "step": 2379 + }, + { + "epoch": 0.19, + "grad_norm": 1.4304662679294322, + "learning_rate": 9.368084650218974e-06, + "loss": 0.4917, + "step": 2380 + }, + { + "epoch": 0.19, + "grad_norm": 1.8664515510210562, + "learning_rate": 9.367465634395111e-06, + "loss": 0.576, + "step": 2381 + }, + { + "epoch": 0.19, + "grad_norm": 1.897548791650952, + "learning_rate": 9.366846336001935e-06, + "loss": 0.4985, + "step": 2382 + }, + { + "epoch": 0.19, + "grad_norm": 1.4597056332333282, + "learning_rate": 9.366226755079513e-06, + "loss": 0.5351, + "step": 2383 + }, + { + "epoch": 0.19, + "grad_norm": 1.5183534120009308, + "learning_rate": 9.365606891667932e-06, + "loss": 0.4534, + "step": 2384 + }, + { + "epoch": 0.19, + "grad_norm": 2.004775628570737, + "learning_rate": 9.364986745807295e-06, + "loss": 0.5288, + "step": 2385 + }, + { + "epoch": 0.19, + "grad_norm": 1.5505787120960475, + "learning_rate": 9.364366317537727e-06, + "loss": 0.5588, + "step": 2386 + }, + { + "epoch": 0.19, + "grad_norm": 1.5966758135598842, + "learning_rate": 9.363745606899365e-06, + "loss": 0.535, + "step": 2387 + }, + { + "epoch": 0.19, + "grad_norm": 1.9483306329129215, + "learning_rate": 9.363124613932374e-06, + "loss": 0.5171, + "step": 2388 + }, + { + "epoch": 0.19, + "grad_norm": 2.3932276354836652, + "learning_rate": 9.362503338676927e-06, + "loss": 0.4992, + "step": 2389 + }, + { + "epoch": 0.19, + "grad_norm": 1.8327744324453519, + "learning_rate": 9.36188178117322e-06, + "loss": 0.4908, + "step": 2390 + }, + { + "epoch": 0.19, + "grad_norm": 1.7902601310897783, + "learning_rate": 9.361259941461469e-06, + "loss": 0.4967, + "step": 2391 + }, + { + "epoch": 0.19, + "grad_norm": 1.6598261179331797, + "learning_rate": 9.360637819581906e-06, + "loss": 0.4928, + "step": 2392 + }, + { + "epoch": 0.19, + "grad_norm": 1.638804222146243, + "learning_rate": 9.360015415574779e-06, + "loss": 0.4621, + "step": 2393 + }, + { + "epoch": 0.19, + "grad_norm": 1.6848555409083126, + "learning_rate": 9.359392729480358e-06, + "loss": 0.533, + "step": 2394 + }, + { + "epoch": 0.19, + "grad_norm": 2.3704329017697905, + "learning_rate": 9.35876976133893e-06, + "loss": 0.5313, + "step": 2395 + }, + { + "epoch": 0.19, + "grad_norm": 2.3399479966682195, + "learning_rate": 9.358146511190803e-06, + "loss": 0.5031, + "step": 2396 + }, + { + "epoch": 0.19, + "grad_norm": 1.729187889222948, + "learning_rate": 9.357522979076295e-06, + "loss": 0.5071, + "step": 2397 + }, + { + "epoch": 0.19, + "grad_norm": 1.6830775472357482, + "learning_rate": 9.356899165035751e-06, + "loss": 0.5285, + "step": 2398 + }, + { + "epoch": 0.19, + "grad_norm": 1.7711701959265302, + "learning_rate": 9.35627506910953e-06, + "loss": 0.5078, + "step": 2399 + }, + { + "epoch": 0.19, + "grad_norm": 1.4198755177166493, + "learning_rate": 9.355650691338013e-06, + "loss": 0.4736, + "step": 2400 + }, + { + "epoch": 0.19, + "grad_norm": 1.6243408343941619, + "learning_rate": 9.355026031761593e-06, + "loss": 0.5317, + "step": 2401 + }, + { + "epoch": 0.19, + "grad_norm": 1.8431603246106645, + "learning_rate": 9.354401090420687e-06, + "loss": 0.4836, + "step": 2402 + }, + { + "epoch": 0.19, + "grad_norm": 2.440158501619492, + "learning_rate": 9.353775867355724e-06, + "loss": 0.5762, + "step": 2403 + }, + { + "epoch": 0.19, + "grad_norm": 1.9309948547716995, + "learning_rate": 9.35315036260716e-06, + "loss": 0.4504, + "step": 2404 + }, + { + "epoch": 0.19, + "grad_norm": 3.684308469533794, + "learning_rate": 9.35252457621546e-06, + "loss": 0.5281, + "step": 2405 + }, + { + "epoch": 0.19, + "grad_norm": 2.3030591476308606, + "learning_rate": 9.351898508221116e-06, + "loss": 0.5302, + "step": 2406 + }, + { + "epoch": 0.19, + "grad_norm": 1.822537922753776, + "learning_rate": 9.351272158664631e-06, + "loss": 0.5299, + "step": 2407 + }, + { + "epoch": 0.19, + "grad_norm": 1.5898885623482772, + "learning_rate": 9.35064552758653e-06, + "loss": 0.5011, + "step": 2408 + }, + { + "epoch": 0.19, + "grad_norm": 1.9840236663253659, + "learning_rate": 9.350018615027353e-06, + "loss": 0.5354, + "step": 2409 + }, + { + "epoch": 0.19, + "grad_norm": 2.0493243032414052, + "learning_rate": 9.349391421027663e-06, + "loss": 0.5387, + "step": 2410 + }, + { + "epoch": 0.19, + "grad_norm": 2.2296397282355986, + "learning_rate": 9.34876394562804e-06, + "loss": 0.5384, + "step": 2411 + }, + { + "epoch": 0.19, + "grad_norm": 2.136592604878245, + "learning_rate": 9.348136188869075e-06, + "loss": 0.4543, + "step": 2412 + }, + { + "epoch": 0.19, + "grad_norm": 2.148440053748222, + "learning_rate": 9.347508150791389e-06, + "loss": 0.5459, + "step": 2413 + }, + { + "epoch": 0.19, + "grad_norm": 2.4285905857377035, + "learning_rate": 9.346879831435611e-06, + "loss": 0.4948, + "step": 2414 + }, + { + "epoch": 0.19, + "grad_norm": 1.6384862729910865, + "learning_rate": 9.346251230842396e-06, + "loss": 0.5301, + "step": 2415 + }, + { + "epoch": 0.19, + "grad_norm": 1.6422996790529345, + "learning_rate": 9.345622349052412e-06, + "loss": 0.5193, + "step": 2416 + }, + { + "epoch": 0.19, + "grad_norm": 1.4651279940690312, + "learning_rate": 9.344993186106346e-06, + "loss": 0.4933, + "step": 2417 + }, + { + "epoch": 0.19, + "grad_norm": 1.5328998184140752, + "learning_rate": 9.344363742044904e-06, + "loss": 0.5384, + "step": 2418 + }, + { + "epoch": 0.19, + "grad_norm": 1.4079090765844, + "learning_rate": 9.343734016908814e-06, + "loss": 0.464, + "step": 2419 + }, + { + "epoch": 0.19, + "grad_norm": 1.7404645029611585, + "learning_rate": 9.343104010738812e-06, + "loss": 0.5273, + "step": 2420 + }, + { + "epoch": 0.19, + "grad_norm": 2.5842554817997296, + "learning_rate": 9.342473723575664e-06, + "loss": 0.5023, + "step": 2421 + }, + { + "epoch": 0.19, + "grad_norm": 1.6554813354428914, + "learning_rate": 9.341843155460146e-06, + "loss": 0.5433, + "step": 2422 + }, + { + "epoch": 0.19, + "grad_norm": 2.1110350884669673, + "learning_rate": 9.341212306433057e-06, + "loss": 0.5458, + "step": 2423 + }, + { + "epoch": 0.19, + "grad_norm": 0.6454758135191926, + "learning_rate": 9.34058117653521e-06, + "loss": 0.5535, + "step": 2424 + }, + { + "epoch": 0.19, + "grad_norm": 1.6687962762838733, + "learning_rate": 9.339949765807439e-06, + "loss": 0.508, + "step": 2425 + }, + { + "epoch": 0.19, + "grad_norm": 0.5668257223259372, + "learning_rate": 9.339318074290595e-06, + "loss": 0.5447, + "step": 2426 + }, + { + "epoch": 0.19, + "grad_norm": 1.394214298203269, + "learning_rate": 9.338686102025548e-06, + "loss": 0.5093, + "step": 2427 + }, + { + "epoch": 0.19, + "grad_norm": 1.767121032019649, + "learning_rate": 9.338053849053186e-06, + "loss": 0.4809, + "step": 2428 + }, + { + "epoch": 0.19, + "grad_norm": 1.4683517615102588, + "learning_rate": 9.337421315414416e-06, + "loss": 0.5146, + "step": 2429 + }, + { + "epoch": 0.19, + "grad_norm": 0.6643181930892917, + "learning_rate": 9.336788501150161e-06, + "loss": 0.5625, + "step": 2430 + }, + { + "epoch": 0.19, + "grad_norm": 1.5348951349170583, + "learning_rate": 9.336155406301361e-06, + "loss": 0.5173, + "step": 2431 + }, + { + "epoch": 0.19, + "grad_norm": 1.7708267284197963, + "learning_rate": 9.335522030908982e-06, + "loss": 0.513, + "step": 2432 + }, + { + "epoch": 0.19, + "grad_norm": 0.6124539344850676, + "learning_rate": 9.334888375013995e-06, + "loss": 0.5835, + "step": 2433 + }, + { + "epoch": 0.19, + "grad_norm": 2.4186792900560805, + "learning_rate": 9.334254438657403e-06, + "loss": 0.5055, + "step": 2434 + }, + { + "epoch": 0.19, + "grad_norm": 1.671986033143908, + "learning_rate": 9.333620221880219e-06, + "loss": 0.5439, + "step": 2435 + }, + { + "epoch": 0.19, + "grad_norm": 1.5517674192339705, + "learning_rate": 9.332985724723476e-06, + "loss": 0.4861, + "step": 2436 + }, + { + "epoch": 0.19, + "grad_norm": 5.227719009303436, + "learning_rate": 9.332350947228224e-06, + "loss": 0.5209, + "step": 2437 + }, + { + "epoch": 0.19, + "grad_norm": 1.6886748770591715, + "learning_rate": 9.331715889435534e-06, + "loss": 0.4749, + "step": 2438 + }, + { + "epoch": 0.19, + "grad_norm": 0.6185730144995171, + "learning_rate": 9.331080551386493e-06, + "loss": 0.5313, + "step": 2439 + }, + { + "epoch": 0.19, + "grad_norm": 1.7354136140177496, + "learning_rate": 9.330444933122205e-06, + "loss": 0.5454, + "step": 2440 + }, + { + "epoch": 0.19, + "grad_norm": 0.6391524412056271, + "learning_rate": 9.329809034683795e-06, + "loss": 0.5735, + "step": 2441 + }, + { + "epoch": 0.19, + "grad_norm": 2.190755305730543, + "learning_rate": 9.329172856112406e-06, + "loss": 0.4783, + "step": 2442 + }, + { + "epoch": 0.19, + "grad_norm": 2.656855985917903, + "learning_rate": 9.328536397449193e-06, + "loss": 0.5249, + "step": 2443 + }, + { + "epoch": 0.19, + "grad_norm": 1.550455312991144, + "learning_rate": 9.32789965873534e-06, + "loss": 0.492, + "step": 2444 + }, + { + "epoch": 0.19, + "grad_norm": 0.5902494872542919, + "learning_rate": 9.327262640012041e-06, + "loss": 0.5591, + "step": 2445 + }, + { + "epoch": 0.19, + "grad_norm": 1.7895144006906403, + "learning_rate": 9.326625341320508e-06, + "loss": 0.5025, + "step": 2446 + }, + { + "epoch": 0.19, + "grad_norm": 1.570714191295494, + "learning_rate": 9.325987762701977e-06, + "loss": 0.5225, + "step": 2447 + }, + { + "epoch": 0.19, + "grad_norm": 0.5798050223001514, + "learning_rate": 9.325349904197696e-06, + "loss": 0.5525, + "step": 2448 + }, + { + "epoch": 0.19, + "grad_norm": 1.499626433734733, + "learning_rate": 9.324711765848935e-06, + "loss": 0.5539, + "step": 2449 + }, + { + "epoch": 0.19, + "grad_norm": 1.5712541147592944, + "learning_rate": 9.32407334769698e-06, + "loss": 0.5359, + "step": 2450 + }, + { + "epoch": 0.19, + "grad_norm": 1.630495720904165, + "learning_rate": 9.323434649783136e-06, + "loss": 0.5351, + "step": 2451 + }, + { + "epoch": 0.19, + "grad_norm": 1.757508956740128, + "learning_rate": 9.322795672148726e-06, + "loss": 0.545, + "step": 2452 + }, + { + "epoch": 0.19, + "grad_norm": 1.5130141942761337, + "learning_rate": 9.32215641483509e-06, + "loss": 0.5308, + "step": 2453 + }, + { + "epoch": 0.19, + "grad_norm": 1.3705792829527919, + "learning_rate": 9.321516877883589e-06, + "loss": 0.5457, + "step": 2454 + }, + { + "epoch": 0.19, + "grad_norm": 1.5307834166203997, + "learning_rate": 9.3208770613356e-06, + "loss": 0.5481, + "step": 2455 + }, + { + "epoch": 0.19, + "grad_norm": 1.468553393915982, + "learning_rate": 9.320236965232517e-06, + "loss": 0.4389, + "step": 2456 + }, + { + "epoch": 0.19, + "grad_norm": 1.8949551194414926, + "learning_rate": 9.319596589615754e-06, + "loss": 0.5405, + "step": 2457 + }, + { + "epoch": 0.19, + "grad_norm": 1.6320690158287077, + "learning_rate": 9.318955934526741e-06, + "loss": 0.5178, + "step": 2458 + }, + { + "epoch": 0.19, + "grad_norm": 1.6657187442572023, + "learning_rate": 9.318315000006932e-06, + "loss": 0.4835, + "step": 2459 + }, + { + "epoch": 0.19, + "grad_norm": 2.540101786281009, + "learning_rate": 9.31767378609779e-06, + "loss": 0.4919, + "step": 2460 + }, + { + "epoch": 0.19, + "grad_norm": 1.5558528654271906, + "learning_rate": 9.317032292840801e-06, + "loss": 0.4456, + "step": 2461 + }, + { + "epoch": 0.19, + "grad_norm": 0.6525624945164196, + "learning_rate": 9.31639052027747e-06, + "loss": 0.5403, + "step": 2462 + }, + { + "epoch": 0.19, + "grad_norm": 1.6007247902437685, + "learning_rate": 9.315748468449321e-06, + "loss": 0.493, + "step": 2463 + }, + { + "epoch": 0.19, + "grad_norm": 1.7631675309675205, + "learning_rate": 9.31510613739789e-06, + "loss": 0.5036, + "step": 2464 + }, + { + "epoch": 0.19, + "grad_norm": 2.11609819270451, + "learning_rate": 9.314463527164738e-06, + "loss": 0.5287, + "step": 2465 + }, + { + "epoch": 0.19, + "grad_norm": 1.4240439363926427, + "learning_rate": 9.313820637791439e-06, + "loss": 0.4841, + "step": 2466 + }, + { + "epoch": 0.19, + "grad_norm": 1.7016774256736709, + "learning_rate": 9.313177469319587e-06, + "loss": 0.5141, + "step": 2467 + }, + { + "epoch": 0.19, + "grad_norm": 2.210374064022001, + "learning_rate": 9.312534021790797e-06, + "loss": 0.509, + "step": 2468 + }, + { + "epoch": 0.19, + "grad_norm": 0.6771910072252202, + "learning_rate": 9.311890295246697e-06, + "loss": 0.5219, + "step": 2469 + }, + { + "epoch": 0.19, + "grad_norm": 1.9800599048509315, + "learning_rate": 9.311246289728935e-06, + "loss": 0.5337, + "step": 2470 + }, + { + "epoch": 0.19, + "grad_norm": 0.6303729007875951, + "learning_rate": 9.310602005279178e-06, + "loss": 0.5563, + "step": 2471 + }, + { + "epoch": 0.19, + "grad_norm": 1.626216340514345, + "learning_rate": 9.30995744193911e-06, + "loss": 0.5549, + "step": 2472 + }, + { + "epoch": 0.19, + "grad_norm": 1.6889596614687363, + "learning_rate": 9.309312599750434e-06, + "loss": 0.477, + "step": 2473 + }, + { + "epoch": 0.19, + "grad_norm": 1.5392073025154502, + "learning_rate": 9.30866747875487e-06, + "loss": 0.5628, + "step": 2474 + }, + { + "epoch": 0.19, + "grad_norm": 1.8983043942678033, + "learning_rate": 9.308022078994157e-06, + "loss": 0.547, + "step": 2475 + }, + { + "epoch": 0.19, + "grad_norm": 1.6595414968974405, + "learning_rate": 9.30737640051005e-06, + "loss": 0.4459, + "step": 2476 + }, + { + "epoch": 0.19, + "grad_norm": 1.6606609265892773, + "learning_rate": 9.306730443344325e-06, + "loss": 0.4734, + "step": 2477 + }, + { + "epoch": 0.19, + "grad_norm": 6.23533866193931, + "learning_rate": 9.306084207538774e-06, + "loss": 0.4923, + "step": 2478 + }, + { + "epoch": 0.19, + "grad_norm": 0.5969663660976549, + "learning_rate": 9.30543769313521e-06, + "loss": 0.5523, + "step": 2479 + }, + { + "epoch": 0.19, + "grad_norm": 1.3910886054155382, + "learning_rate": 9.304790900175456e-06, + "loss": 0.4919, + "step": 2480 + }, + { + "epoch": 0.19, + "grad_norm": 2.9029888699624555, + "learning_rate": 9.304143828701364e-06, + "loss": 0.4977, + "step": 2481 + }, + { + "epoch": 0.19, + "grad_norm": 1.5810444568903859, + "learning_rate": 9.303496478754796e-06, + "loss": 0.4797, + "step": 2482 + }, + { + "epoch": 0.2, + "grad_norm": 1.654818180036754, + "learning_rate": 9.302848850377638e-06, + "loss": 0.5063, + "step": 2483 + }, + { + "epoch": 0.2, + "grad_norm": 1.6797601482034494, + "learning_rate": 9.302200943611785e-06, + "loss": 0.5028, + "step": 2484 + }, + { + "epoch": 0.2, + "grad_norm": 1.6007829122669868, + "learning_rate": 9.30155275849916e-06, + "loss": 0.4663, + "step": 2485 + }, + { + "epoch": 0.2, + "grad_norm": 7.267745616077354, + "learning_rate": 9.3009042950817e-06, + "loss": 0.4726, + "step": 2486 + }, + { + "epoch": 0.2, + "grad_norm": 2.0530444658076292, + "learning_rate": 9.300255553401356e-06, + "loss": 0.5429, + "step": 2487 + }, + { + "epoch": 0.2, + "grad_norm": 0.6253270479975034, + "learning_rate": 9.299606533500105e-06, + "loss": 0.528, + "step": 2488 + }, + { + "epoch": 0.2, + "grad_norm": 1.6176674114601628, + "learning_rate": 9.298957235419937e-06, + "loss": 0.4399, + "step": 2489 + }, + { + "epoch": 0.2, + "grad_norm": 3.406172440790163, + "learning_rate": 9.298307659202857e-06, + "loss": 0.4955, + "step": 2490 + }, + { + "epoch": 0.2, + "grad_norm": 1.614710522080688, + "learning_rate": 9.297657804890896e-06, + "loss": 0.5173, + "step": 2491 + }, + { + "epoch": 0.2, + "grad_norm": 1.6062436197129677, + "learning_rate": 9.297007672526096e-06, + "loss": 0.5325, + "step": 2492 + }, + { + "epoch": 0.2, + "grad_norm": 1.6055856823890806, + "learning_rate": 9.296357262150521e-06, + "loss": 0.5292, + "step": 2493 + }, + { + "epoch": 0.2, + "grad_norm": 0.6161825695671473, + "learning_rate": 9.29570657380625e-06, + "loss": 0.5499, + "step": 2494 + }, + { + "epoch": 0.2, + "grad_norm": 1.4657040336651872, + "learning_rate": 9.295055607535386e-06, + "loss": 0.5325, + "step": 2495 + }, + { + "epoch": 0.2, + "grad_norm": 1.7486182267440138, + "learning_rate": 9.294404363380043e-06, + "loss": 0.4927, + "step": 2496 + }, + { + "epoch": 0.2, + "grad_norm": 1.9540755169643915, + "learning_rate": 9.293752841382353e-06, + "loss": 0.5486, + "step": 2497 + }, + { + "epoch": 0.2, + "grad_norm": 2.32659223365593, + "learning_rate": 9.293101041584473e-06, + "loss": 0.4909, + "step": 2498 + }, + { + "epoch": 0.2, + "grad_norm": 1.7841093403961563, + "learning_rate": 9.29244896402857e-06, + "loss": 0.4878, + "step": 2499 + }, + { + "epoch": 0.2, + "grad_norm": 1.7737783868126311, + "learning_rate": 9.291796608756835e-06, + "loss": 0.5513, + "step": 2500 + }, + { + "epoch": 0.2, + "grad_norm": 1.8393602016369437, + "learning_rate": 9.291143975811475e-06, + "loss": 0.4754, + "step": 2501 + }, + { + "epoch": 0.2, + "grad_norm": 9.048277555393728, + "learning_rate": 9.290491065234712e-06, + "loss": 0.5592, + "step": 2502 + }, + { + "epoch": 0.2, + "grad_norm": 1.9606004419976215, + "learning_rate": 9.28983787706879e-06, + "loss": 0.4742, + "step": 2503 + }, + { + "epoch": 0.2, + "grad_norm": 1.8231670429851545, + "learning_rate": 9.28918441135597e-06, + "loss": 0.474, + "step": 2504 + }, + { + "epoch": 0.2, + "grad_norm": 1.4205568905836603, + "learning_rate": 9.28853066813853e-06, + "loss": 0.5187, + "step": 2505 + }, + { + "epoch": 0.2, + "grad_norm": 1.4942019717814887, + "learning_rate": 9.287876647458762e-06, + "loss": 0.4089, + "step": 2506 + }, + { + "epoch": 0.2, + "grad_norm": 0.5951477055645675, + "learning_rate": 9.287222349358988e-06, + "loss": 0.5443, + "step": 2507 + }, + { + "epoch": 0.2, + "grad_norm": 1.3752145727834633, + "learning_rate": 9.286567773881535e-06, + "loss": 0.4949, + "step": 2508 + }, + { + "epoch": 0.2, + "grad_norm": 0.5879767423925902, + "learning_rate": 9.285912921068755e-06, + "loss": 0.5701, + "step": 2509 + }, + { + "epoch": 0.2, + "grad_norm": 0.5933361261628092, + "learning_rate": 9.285257790963015e-06, + "loss": 0.5917, + "step": 2510 + }, + { + "epoch": 0.2, + "grad_norm": 0.5649841477380809, + "learning_rate": 9.284602383606702e-06, + "loss": 0.5433, + "step": 2511 + }, + { + "epoch": 0.2, + "grad_norm": 2.8203588827743857, + "learning_rate": 9.28394669904222e-06, + "loss": 0.5236, + "step": 2512 + }, + { + "epoch": 0.2, + "grad_norm": 3.340695603287489, + "learning_rate": 9.283290737311991e-06, + "loss": 0.5305, + "step": 2513 + }, + { + "epoch": 0.2, + "grad_norm": 1.588381817738051, + "learning_rate": 9.282634498458453e-06, + "loss": 0.4739, + "step": 2514 + }, + { + "epoch": 0.2, + "grad_norm": 2.8698348215943588, + "learning_rate": 9.281977982524067e-06, + "loss": 0.4776, + "step": 2515 + }, + { + "epoch": 0.2, + "grad_norm": 1.679640183032855, + "learning_rate": 9.281321189551306e-06, + "loss": 0.511, + "step": 2516 + }, + { + "epoch": 0.2, + "grad_norm": 0.6468875542388393, + "learning_rate": 9.280664119582665e-06, + "loss": 0.5393, + "step": 2517 + }, + { + "epoch": 0.2, + "grad_norm": 1.4475882392335921, + "learning_rate": 9.280006772660657e-06, + "loss": 0.4942, + "step": 2518 + }, + { + "epoch": 0.2, + "grad_norm": 1.9339275119658914, + "learning_rate": 9.279349148827807e-06, + "loss": 0.5213, + "step": 2519 + }, + { + "epoch": 0.2, + "grad_norm": 1.5335389226721083, + "learning_rate": 9.278691248126667e-06, + "loss": 0.5293, + "step": 2520 + }, + { + "epoch": 0.2, + "grad_norm": 2.092041907356189, + "learning_rate": 9.2780330705998e-06, + "loss": 0.4949, + "step": 2521 + }, + { + "epoch": 0.2, + "grad_norm": 1.3842815744965236, + "learning_rate": 9.27737461628979e-06, + "loss": 0.4974, + "step": 2522 + }, + { + "epoch": 0.2, + "grad_norm": 1.6121262746450775, + "learning_rate": 9.276715885239237e-06, + "loss": 0.5873, + "step": 2523 + }, + { + "epoch": 0.2, + "grad_norm": 1.803224198328928, + "learning_rate": 9.276056877490762e-06, + "loss": 0.4998, + "step": 2524 + }, + { + "epoch": 0.2, + "grad_norm": 2.4863087609035737, + "learning_rate": 9.275397593087e-06, + "loss": 0.4883, + "step": 2525 + }, + { + "epoch": 0.2, + "grad_norm": 1.6783808296900016, + "learning_rate": 9.274738032070607e-06, + "loss": 0.5123, + "step": 2526 + }, + { + "epoch": 0.2, + "grad_norm": 0.6223974418873989, + "learning_rate": 9.274078194484255e-06, + "loss": 0.5464, + "step": 2527 + }, + { + "epoch": 0.2, + "grad_norm": 1.9510064564816119, + "learning_rate": 9.273418080370636e-06, + "loss": 0.4932, + "step": 2528 + }, + { + "epoch": 0.2, + "grad_norm": 2.0600775317553364, + "learning_rate": 9.272757689772456e-06, + "loss": 0.5406, + "step": 2529 + }, + { + "epoch": 0.2, + "grad_norm": 1.7128511944441709, + "learning_rate": 9.272097022732444e-06, + "loss": 0.5235, + "step": 2530 + }, + { + "epoch": 0.2, + "grad_norm": 0.5460169837490706, + "learning_rate": 9.271436079293344e-06, + "loss": 0.5459, + "step": 2531 + }, + { + "epoch": 0.2, + "grad_norm": 0.6183378288200972, + "learning_rate": 9.270774859497915e-06, + "loss": 0.5433, + "step": 2532 + }, + { + "epoch": 0.2, + "grad_norm": 0.5872751369969683, + "learning_rate": 9.27011336338894e-06, + "loss": 0.5626, + "step": 2533 + }, + { + "epoch": 0.2, + "grad_norm": 1.5679339505670329, + "learning_rate": 9.269451591009217e-06, + "loss": 0.4554, + "step": 2534 + }, + { + "epoch": 0.2, + "grad_norm": 1.991732840440832, + "learning_rate": 9.268789542401561e-06, + "loss": 0.5408, + "step": 2535 + }, + { + "epoch": 0.2, + "grad_norm": 1.73457195070717, + "learning_rate": 9.268127217608808e-06, + "loss": 0.4462, + "step": 2536 + }, + { + "epoch": 0.2, + "grad_norm": 1.8327039989671163, + "learning_rate": 9.267464616673805e-06, + "loss": 0.4865, + "step": 2537 + }, + { + "epoch": 0.2, + "grad_norm": 1.985879379158774, + "learning_rate": 9.266801739639424e-06, + "loss": 0.5724, + "step": 2538 + }, + { + "epoch": 0.2, + "grad_norm": 1.7102053522381389, + "learning_rate": 9.266138586548554e-06, + "loss": 0.5787, + "step": 2539 + }, + { + "epoch": 0.2, + "grad_norm": 1.4799858646820447, + "learning_rate": 9.265475157444097e-06, + "loss": 0.5164, + "step": 2540 + }, + { + "epoch": 0.2, + "grad_norm": 1.3337443628689458, + "learning_rate": 9.264811452368975e-06, + "loss": 0.5302, + "step": 2541 + }, + { + "epoch": 0.2, + "grad_norm": 1.6258498233847551, + "learning_rate": 9.264147471366136e-06, + "loss": 0.4423, + "step": 2542 + }, + { + "epoch": 0.2, + "grad_norm": 2.6790662259144122, + "learning_rate": 9.263483214478531e-06, + "loss": 0.542, + "step": 2543 + }, + { + "epoch": 0.2, + "grad_norm": 1.865466279276663, + "learning_rate": 9.262818681749138e-06, + "loss": 0.5146, + "step": 2544 + }, + { + "epoch": 0.2, + "grad_norm": 1.480442880007, + "learning_rate": 9.262153873220955e-06, + "loss": 0.4526, + "step": 2545 + }, + { + "epoch": 0.2, + "grad_norm": 1.5326639430014113, + "learning_rate": 9.26148878893699e-06, + "loss": 0.538, + "step": 2546 + }, + { + "epoch": 0.2, + "grad_norm": 0.6816182543409122, + "learning_rate": 9.260823428940277e-06, + "loss": 0.5573, + "step": 2547 + }, + { + "epoch": 0.2, + "grad_norm": 1.8268978002230043, + "learning_rate": 9.260157793273862e-06, + "loss": 0.5385, + "step": 2548 + }, + { + "epoch": 0.2, + "grad_norm": 1.6602251701839639, + "learning_rate": 9.259491881980809e-06, + "loss": 0.5556, + "step": 2549 + }, + { + "epoch": 0.2, + "grad_norm": 0.5987614770527108, + "learning_rate": 9.258825695104205e-06, + "loss": 0.5598, + "step": 2550 + }, + { + "epoch": 0.2, + "grad_norm": 1.4379072168870373, + "learning_rate": 9.258159232687149e-06, + "loss": 0.4772, + "step": 2551 + }, + { + "epoch": 0.2, + "grad_norm": 0.57991378450244, + "learning_rate": 9.257492494772762e-06, + "loss": 0.524, + "step": 2552 + }, + { + "epoch": 0.2, + "grad_norm": 1.7976902036337339, + "learning_rate": 9.256825481404178e-06, + "loss": 0.4985, + "step": 2553 + }, + { + "epoch": 0.2, + "grad_norm": 1.6121422433939019, + "learning_rate": 9.256158192624555e-06, + "loss": 0.4677, + "step": 2554 + }, + { + "epoch": 0.2, + "grad_norm": 2.4926517261373795, + "learning_rate": 9.255490628477067e-06, + "loss": 0.5255, + "step": 2555 + }, + { + "epoch": 0.2, + "grad_norm": 2.1801020978369503, + "learning_rate": 9.254822789004899e-06, + "loss": 0.5262, + "step": 2556 + }, + { + "epoch": 0.2, + "grad_norm": 3.097957425772567, + "learning_rate": 9.254154674251263e-06, + "loss": 0.5258, + "step": 2557 + }, + { + "epoch": 0.2, + "grad_norm": 2.068429361392017, + "learning_rate": 9.253486284259387e-06, + "loss": 0.4973, + "step": 2558 + }, + { + "epoch": 0.2, + "grad_norm": 1.8888508398945307, + "learning_rate": 9.25281761907251e-06, + "loss": 0.5152, + "step": 2559 + }, + { + "epoch": 0.2, + "grad_norm": 2.4118273213892327, + "learning_rate": 9.2521486787339e-06, + "loss": 0.5072, + "step": 2560 + }, + { + "epoch": 0.2, + "grad_norm": 1.7943462190316661, + "learning_rate": 9.251479463286829e-06, + "loss": 0.5075, + "step": 2561 + }, + { + "epoch": 0.2, + "grad_norm": 1.6498719599748597, + "learning_rate": 9.2508099727746e-06, + "loss": 0.5113, + "step": 2562 + }, + { + "epoch": 0.2, + "grad_norm": 1.7305537559497464, + "learning_rate": 9.250140207240525e-06, + "loss": 0.4727, + "step": 2563 + }, + { + "epoch": 0.2, + "grad_norm": 1.6276739073958881, + "learning_rate": 9.24947016672794e-06, + "loss": 0.5056, + "step": 2564 + }, + { + "epoch": 0.2, + "grad_norm": 1.5310894309638283, + "learning_rate": 9.248799851280195e-06, + "loss": 0.5033, + "step": 2565 + }, + { + "epoch": 0.2, + "grad_norm": 1.5276382086348612, + "learning_rate": 9.248129260940657e-06, + "loss": 0.5027, + "step": 2566 + }, + { + "epoch": 0.2, + "grad_norm": 1.4954389820212917, + "learning_rate": 9.247458395752713e-06, + "loss": 0.4618, + "step": 2567 + }, + { + "epoch": 0.2, + "grad_norm": 1.642816702718338, + "learning_rate": 9.246787255759768e-06, + "loss": 0.4734, + "step": 2568 + }, + { + "epoch": 0.2, + "grad_norm": 3.398967441979739, + "learning_rate": 9.246115841005241e-06, + "loss": 0.5238, + "step": 2569 + }, + { + "epoch": 0.2, + "grad_norm": 1.5380909002930145, + "learning_rate": 9.245444151532575e-06, + "loss": 0.5081, + "step": 2570 + }, + { + "epoch": 0.2, + "grad_norm": 1.5555556811824625, + "learning_rate": 9.244772187385226e-06, + "loss": 0.4619, + "step": 2571 + }, + { + "epoch": 0.2, + "grad_norm": 0.6978667120079489, + "learning_rate": 9.244099948606669e-06, + "loss": 0.555, + "step": 2572 + }, + { + "epoch": 0.2, + "grad_norm": 2.0710441404831075, + "learning_rate": 9.243427435240398e-06, + "loss": 0.4772, + "step": 2573 + }, + { + "epoch": 0.2, + "grad_norm": 2.3824172940693416, + "learning_rate": 9.242754647329924e-06, + "loss": 0.5205, + "step": 2574 + }, + { + "epoch": 0.2, + "grad_norm": 1.7750099356874, + "learning_rate": 9.242081584918772e-06, + "loss": 0.5172, + "step": 2575 + }, + { + "epoch": 0.2, + "grad_norm": 2.3630676982965495, + "learning_rate": 9.241408248050491e-06, + "loss": 0.4725, + "step": 2576 + }, + { + "epoch": 0.2, + "grad_norm": 0.573708674396772, + "learning_rate": 9.240734636768647e-06, + "loss": 0.5406, + "step": 2577 + }, + { + "epoch": 0.2, + "grad_norm": 1.6110943153048605, + "learning_rate": 9.240060751116819e-06, + "loss": 0.5264, + "step": 2578 + }, + { + "epoch": 0.2, + "grad_norm": 2.4390467854313878, + "learning_rate": 9.239386591138605e-06, + "loss": 0.4962, + "step": 2579 + }, + { + "epoch": 0.2, + "grad_norm": 1.7378517985498094, + "learning_rate": 9.238712156877627e-06, + "loss": 0.4845, + "step": 2580 + }, + { + "epoch": 0.2, + "grad_norm": 1.8812462856411312, + "learning_rate": 9.238037448377517e-06, + "loss": 0.5486, + "step": 2581 + }, + { + "epoch": 0.2, + "grad_norm": 1.660538384616219, + "learning_rate": 9.237362465681928e-06, + "loss": 0.4713, + "step": 2582 + }, + { + "epoch": 0.2, + "grad_norm": 0.5949164790812576, + "learning_rate": 9.23668720883453e-06, + "loss": 0.5433, + "step": 2583 + }, + { + "epoch": 0.2, + "grad_norm": 1.5859573460698153, + "learning_rate": 9.23601167787901e-06, + "loss": 0.5117, + "step": 2584 + }, + { + "epoch": 0.2, + "grad_norm": 1.8402757180454896, + "learning_rate": 9.235335872859079e-06, + "loss": 0.4873, + "step": 2585 + }, + { + "epoch": 0.2, + "grad_norm": 0.5518089395718657, + "learning_rate": 9.234659793818456e-06, + "loss": 0.5378, + "step": 2586 + }, + { + "epoch": 0.2, + "grad_norm": 0.5813244685276074, + "learning_rate": 9.233983440800884e-06, + "loss": 0.5483, + "step": 2587 + }, + { + "epoch": 0.2, + "grad_norm": 1.5221433601624796, + "learning_rate": 9.233306813850123e-06, + "loss": 0.4393, + "step": 2588 + }, + { + "epoch": 0.2, + "grad_norm": 1.93521833768943, + "learning_rate": 9.232629913009947e-06, + "loss": 0.5471, + "step": 2589 + }, + { + "epoch": 0.2, + "grad_norm": 0.6199135857665948, + "learning_rate": 9.231952738324155e-06, + "loss": 0.5729, + "step": 2590 + }, + { + "epoch": 0.2, + "grad_norm": 1.8326261686383545, + "learning_rate": 9.231275289836556e-06, + "loss": 0.4834, + "step": 2591 + }, + { + "epoch": 0.2, + "grad_norm": 1.7260511513849952, + "learning_rate": 9.23059756759098e-06, + "loss": 0.6006, + "step": 2592 + }, + { + "epoch": 0.2, + "grad_norm": 2.4009975147116807, + "learning_rate": 9.229919571631277e-06, + "loss": 0.4439, + "step": 2593 + }, + { + "epoch": 0.2, + "grad_norm": 1.4694791811679881, + "learning_rate": 9.22924130200131e-06, + "loss": 0.484, + "step": 2594 + }, + { + "epoch": 0.2, + "grad_norm": 0.6585233955609227, + "learning_rate": 9.228562758744966e-06, + "loss": 0.5497, + "step": 2595 + }, + { + "epoch": 0.2, + "grad_norm": 2.1561782682471633, + "learning_rate": 9.22788394190614e-06, + "loss": 0.5229, + "step": 2596 + }, + { + "epoch": 0.2, + "grad_norm": 2.065327285837434, + "learning_rate": 9.227204851528756e-06, + "loss": 0.5119, + "step": 2597 + }, + { + "epoch": 0.2, + "grad_norm": 4.011982820206326, + "learning_rate": 9.226525487656745e-06, + "loss": 0.5296, + "step": 2598 + }, + { + "epoch": 0.2, + "grad_norm": 3.220457601911713, + "learning_rate": 9.225845850334067e-06, + "loss": 0.4894, + "step": 2599 + }, + { + "epoch": 0.2, + "grad_norm": 2.8276780151470424, + "learning_rate": 9.225165939604689e-06, + "loss": 0.4883, + "step": 2600 + }, + { + "epoch": 0.2, + "grad_norm": 1.8629313945801276, + "learning_rate": 9.224485755512603e-06, + "loss": 0.4745, + "step": 2601 + }, + { + "epoch": 0.2, + "grad_norm": 1.4712248178272749, + "learning_rate": 9.223805298101813e-06, + "loss": 0.5082, + "step": 2602 + }, + { + "epoch": 0.2, + "grad_norm": 1.917503838721372, + "learning_rate": 9.223124567416349e-06, + "loss": 0.5395, + "step": 2603 + }, + { + "epoch": 0.2, + "grad_norm": 0.6606170470467999, + "learning_rate": 9.222443563500248e-06, + "loss": 0.5668, + "step": 2604 + }, + { + "epoch": 0.2, + "grad_norm": 1.8118028218894473, + "learning_rate": 9.22176228639757e-06, + "loss": 0.4796, + "step": 2605 + }, + { + "epoch": 0.2, + "grad_norm": 1.8756072147160745, + "learning_rate": 9.221080736152397e-06, + "loss": 0.4822, + "step": 2606 + }, + { + "epoch": 0.2, + "grad_norm": 2.012381992087522, + "learning_rate": 9.220398912808823e-06, + "loss": 0.5071, + "step": 2607 + }, + { + "epoch": 0.2, + "grad_norm": 1.4838835123750598, + "learning_rate": 9.21971681641096e-06, + "loss": 0.5107, + "step": 2608 + }, + { + "epoch": 0.2, + "grad_norm": 1.8666858660166257, + "learning_rate": 9.219034447002938e-06, + "loss": 0.5052, + "step": 2609 + }, + { + "epoch": 0.2, + "grad_norm": 0.5885004897932643, + "learning_rate": 9.218351804628906e-06, + "loss": 0.5622, + "step": 2610 + }, + { + "epoch": 0.21, + "grad_norm": 1.7403238918840045, + "learning_rate": 9.217668889333033e-06, + "loss": 0.495, + "step": 2611 + }, + { + "epoch": 0.21, + "grad_norm": 3.5392095736681655, + "learning_rate": 9.216985701159497e-06, + "loss": 0.5121, + "step": 2612 + }, + { + "epoch": 0.21, + "grad_norm": 2.31300080116189, + "learning_rate": 9.216302240152506e-06, + "loss": 0.5483, + "step": 2613 + }, + { + "epoch": 0.21, + "grad_norm": 1.9237314492746174, + "learning_rate": 9.215618506356273e-06, + "loss": 0.4837, + "step": 2614 + }, + { + "epoch": 0.21, + "grad_norm": 1.7398613529320874, + "learning_rate": 9.214934499815038e-06, + "loss": 0.5132, + "step": 2615 + }, + { + "epoch": 0.21, + "grad_norm": 1.8622030621325105, + "learning_rate": 9.214250220573057e-06, + "loss": 0.5502, + "step": 2616 + }, + { + "epoch": 0.21, + "grad_norm": 2.7818598423759586, + "learning_rate": 9.213565668674597e-06, + "loss": 0.5339, + "step": 2617 + }, + { + "epoch": 0.21, + "grad_norm": 2.030518708224634, + "learning_rate": 9.212880844163952e-06, + "loss": 0.5462, + "step": 2618 + }, + { + "epoch": 0.21, + "grad_norm": 0.6570427018992091, + "learning_rate": 9.212195747085425e-06, + "loss": 0.5346, + "step": 2619 + }, + { + "epoch": 0.21, + "grad_norm": 0.5854610627979908, + "learning_rate": 9.211510377483345e-06, + "loss": 0.5346, + "step": 2620 + }, + { + "epoch": 0.21, + "grad_norm": 1.6011879553156652, + "learning_rate": 9.210824735402052e-06, + "loss": 0.5276, + "step": 2621 + }, + { + "epoch": 0.21, + "grad_norm": 1.8500449147535236, + "learning_rate": 9.21013882088591e-06, + "loss": 0.5922, + "step": 2622 + }, + { + "epoch": 0.21, + "grad_norm": 2.321270718208347, + "learning_rate": 9.209452633979293e-06, + "loss": 0.5002, + "step": 2623 + }, + { + "epoch": 0.21, + "grad_norm": 0.6083260414587445, + "learning_rate": 9.208766174726594e-06, + "loss": 0.5464, + "step": 2624 + }, + { + "epoch": 0.21, + "grad_norm": 2.3858755828936427, + "learning_rate": 9.208079443172232e-06, + "loss": 0.5084, + "step": 2625 + }, + { + "epoch": 0.21, + "grad_norm": 1.775026930353286, + "learning_rate": 9.207392439360634e-06, + "loss": 0.5032, + "step": 2626 + }, + { + "epoch": 0.21, + "grad_norm": 1.5694391932366076, + "learning_rate": 9.20670516333625e-06, + "loss": 0.5058, + "step": 2627 + }, + { + "epoch": 0.21, + "grad_norm": 1.6264255863149715, + "learning_rate": 9.206017615143544e-06, + "loss": 0.5751, + "step": 2628 + }, + { + "epoch": 0.21, + "grad_norm": 1.6552076236781883, + "learning_rate": 9.205329794827e-06, + "loss": 0.5209, + "step": 2629 + }, + { + "epoch": 0.21, + "grad_norm": 2.725858608673693, + "learning_rate": 9.20464170243112e-06, + "loss": 0.4889, + "step": 2630 + }, + { + "epoch": 0.21, + "grad_norm": 1.8118171306412376, + "learning_rate": 9.203953338000424e-06, + "loss": 0.5131, + "step": 2631 + }, + { + "epoch": 0.21, + "grad_norm": 2.297217581535255, + "learning_rate": 9.203264701579444e-06, + "loss": 0.5254, + "step": 2632 + }, + { + "epoch": 0.21, + "grad_norm": 0.6282178712020734, + "learning_rate": 9.202575793212739e-06, + "loss": 0.5685, + "step": 2633 + }, + { + "epoch": 0.21, + "grad_norm": 1.6048953725096347, + "learning_rate": 9.201886612944875e-06, + "loss": 0.484, + "step": 2634 + }, + { + "epoch": 0.21, + "grad_norm": 1.8829288553880361, + "learning_rate": 9.201197160820445e-06, + "loss": 0.572, + "step": 2635 + }, + { + "epoch": 0.21, + "grad_norm": 1.8990023761283759, + "learning_rate": 9.200507436884055e-06, + "loss": 0.5204, + "step": 2636 + }, + { + "epoch": 0.21, + "grad_norm": 1.835438830770884, + "learning_rate": 9.199817441180329e-06, + "loss": 0.5297, + "step": 2637 + }, + { + "epoch": 0.21, + "grad_norm": 2.796807368142079, + "learning_rate": 9.199127173753908e-06, + "loss": 0.4793, + "step": 2638 + }, + { + "epoch": 0.21, + "grad_norm": 1.8520063494133083, + "learning_rate": 9.198436634649453e-06, + "loss": 0.5273, + "step": 2639 + }, + { + "epoch": 0.21, + "grad_norm": 0.7000947854569692, + "learning_rate": 9.197745823911638e-06, + "loss": 0.5507, + "step": 2640 + }, + { + "epoch": 0.21, + "grad_norm": 2.278051545801207, + "learning_rate": 9.197054741585161e-06, + "loss": 0.4909, + "step": 2641 + }, + { + "epoch": 0.21, + "grad_norm": 2.474755397950277, + "learning_rate": 9.196363387714733e-06, + "loss": 0.5151, + "step": 2642 + }, + { + "epoch": 0.21, + "grad_norm": 1.8360007140114516, + "learning_rate": 9.195671762345082e-06, + "loss": 0.4631, + "step": 2643 + }, + { + "epoch": 0.21, + "grad_norm": 1.5864162456388085, + "learning_rate": 9.194979865520956e-06, + "loss": 0.5235, + "step": 2644 + }, + { + "epoch": 0.21, + "grad_norm": 0.5853259569073647, + "learning_rate": 9.194287697287123e-06, + "loss": 0.5113, + "step": 2645 + }, + { + "epoch": 0.21, + "grad_norm": 1.7747919258325278, + "learning_rate": 9.193595257688362e-06, + "loss": 0.5335, + "step": 2646 + }, + { + "epoch": 0.21, + "grad_norm": 2.3209317294533025, + "learning_rate": 9.192902546769473e-06, + "loss": 0.517, + "step": 2647 + }, + { + "epoch": 0.21, + "grad_norm": 1.9839506573733687, + "learning_rate": 9.192209564575274e-06, + "loss": 0.5106, + "step": 2648 + }, + { + "epoch": 0.21, + "grad_norm": 3.4361810476651993, + "learning_rate": 9.191516311150601e-06, + "loss": 0.5082, + "step": 2649 + }, + { + "epoch": 0.21, + "grad_norm": 2.0726254547801757, + "learning_rate": 9.190822786540306e-06, + "loss": 0.5065, + "step": 2650 + }, + { + "epoch": 0.21, + "grad_norm": 0.6077059152148249, + "learning_rate": 9.190128990789258e-06, + "loss": 0.5422, + "step": 2651 + }, + { + "epoch": 0.21, + "grad_norm": 1.8114339182981618, + "learning_rate": 9.189434923942346e-06, + "loss": 0.5075, + "step": 2652 + }, + { + "epoch": 0.21, + "grad_norm": 1.8784217987715583, + "learning_rate": 9.188740586044476e-06, + "loss": 0.4795, + "step": 2653 + }, + { + "epoch": 0.21, + "grad_norm": 3.1459750111937637, + "learning_rate": 9.188045977140568e-06, + "loss": 0.5061, + "step": 2654 + }, + { + "epoch": 0.21, + "grad_norm": 0.6000653411872513, + "learning_rate": 9.187351097275565e-06, + "loss": 0.5509, + "step": 2655 + }, + { + "epoch": 0.21, + "grad_norm": 1.4373767981387768, + "learning_rate": 9.186655946494422e-06, + "loss": 0.4865, + "step": 2656 + }, + { + "epoch": 0.21, + "grad_norm": 1.642961230293047, + "learning_rate": 9.18596052484212e-06, + "loss": 0.5375, + "step": 2657 + }, + { + "epoch": 0.21, + "grad_norm": 2.040502167874546, + "learning_rate": 9.185264832363644e-06, + "loss": 0.5377, + "step": 2658 + }, + { + "epoch": 0.21, + "grad_norm": 2.8655359507918545, + "learning_rate": 9.18456886910401e-06, + "loss": 0.4938, + "step": 2659 + }, + { + "epoch": 0.21, + "grad_norm": 1.6731186915516718, + "learning_rate": 9.183872635108243e-06, + "loss": 0.4885, + "step": 2660 + }, + { + "epoch": 0.21, + "grad_norm": 1.5839653290679983, + "learning_rate": 9.183176130421391e-06, + "loss": 0.5122, + "step": 2661 + }, + { + "epoch": 0.21, + "grad_norm": 1.8613414786445373, + "learning_rate": 9.182479355088515e-06, + "loss": 0.4835, + "step": 2662 + }, + { + "epoch": 0.21, + "grad_norm": 0.5878412410589576, + "learning_rate": 9.181782309154698e-06, + "loss": 0.5664, + "step": 2663 + }, + { + "epoch": 0.21, + "grad_norm": 0.6270804420032785, + "learning_rate": 9.181084992665035e-06, + "loss": 0.5568, + "step": 2664 + }, + { + "epoch": 0.21, + "grad_norm": 1.7128413011190065, + "learning_rate": 9.180387405664643e-06, + "loss": 0.5648, + "step": 2665 + }, + { + "epoch": 0.21, + "grad_norm": 1.7788814464471132, + "learning_rate": 9.179689548198654e-06, + "loss": 0.4359, + "step": 2666 + }, + { + "epoch": 0.21, + "grad_norm": 3.3713750878665585, + "learning_rate": 9.178991420312218e-06, + "loss": 0.5151, + "step": 2667 + }, + { + "epoch": 0.21, + "grad_norm": 1.834602217949142, + "learning_rate": 9.178293022050505e-06, + "loss": 0.5209, + "step": 2668 + }, + { + "epoch": 0.21, + "grad_norm": 1.9152390144370137, + "learning_rate": 9.177594353458699e-06, + "loss": 0.5273, + "step": 2669 + }, + { + "epoch": 0.21, + "grad_norm": 1.5338652214961708, + "learning_rate": 9.176895414582002e-06, + "loss": 0.5365, + "step": 2670 + }, + { + "epoch": 0.21, + "grad_norm": 16.084834506187832, + "learning_rate": 9.176196205465637e-06, + "loss": 0.4901, + "step": 2671 + }, + { + "epoch": 0.21, + "grad_norm": 2.7973729427597513, + "learning_rate": 9.175496726154842e-06, + "loss": 0.5234, + "step": 2672 + }, + { + "epoch": 0.21, + "grad_norm": 2.1752119354252497, + "learning_rate": 9.174796976694868e-06, + "loss": 0.5064, + "step": 2673 + }, + { + "epoch": 0.21, + "grad_norm": 1.7698503409138169, + "learning_rate": 9.174096957130993e-06, + "loss": 0.4477, + "step": 2674 + }, + { + "epoch": 0.21, + "grad_norm": 1.6802586053719029, + "learning_rate": 9.173396667508505e-06, + "loss": 0.5067, + "step": 2675 + }, + { + "epoch": 0.21, + "grad_norm": 2.053429485940372, + "learning_rate": 9.172696107872712e-06, + "loss": 0.5106, + "step": 2676 + }, + { + "epoch": 0.21, + "grad_norm": 0.6502500640872055, + "learning_rate": 9.171995278268939e-06, + "loss": 0.5369, + "step": 2677 + }, + { + "epoch": 0.21, + "grad_norm": 1.543070132249157, + "learning_rate": 9.17129417874253e-06, + "loss": 0.4535, + "step": 2678 + }, + { + "epoch": 0.21, + "grad_norm": 1.6957905248512428, + "learning_rate": 9.170592809338844e-06, + "loss": 0.5138, + "step": 2679 + }, + { + "epoch": 0.21, + "grad_norm": 1.5711582636892383, + "learning_rate": 9.16989117010326e-06, + "loss": 0.4802, + "step": 2680 + }, + { + "epoch": 0.21, + "grad_norm": 1.4962519854554293, + "learning_rate": 9.16918926108117e-06, + "loss": 0.4666, + "step": 2681 + }, + { + "epoch": 0.21, + "grad_norm": 1.7085743621251734, + "learning_rate": 9.168487082317989e-06, + "loss": 0.529, + "step": 2682 + }, + { + "epoch": 0.21, + "grad_norm": 1.8192377952965582, + "learning_rate": 9.167784633859149e-06, + "loss": 0.5216, + "step": 2683 + }, + { + "epoch": 0.21, + "grad_norm": 1.5419651943714792, + "learning_rate": 9.167081915750093e-06, + "loss": 0.4485, + "step": 2684 + }, + { + "epoch": 0.21, + "grad_norm": 1.7550200479246778, + "learning_rate": 9.166378928036291e-06, + "loss": 0.4828, + "step": 2685 + }, + { + "epoch": 0.21, + "grad_norm": 1.8834211718544254, + "learning_rate": 9.165675670763222e-06, + "loss": 0.559, + "step": 2686 + }, + { + "epoch": 0.21, + "grad_norm": 1.6507865163112143, + "learning_rate": 9.164972143976383e-06, + "loss": 0.5897, + "step": 2687 + }, + { + "epoch": 0.21, + "grad_norm": 0.7538085201763504, + "learning_rate": 9.1642683477213e-06, + "loss": 0.5441, + "step": 2688 + }, + { + "epoch": 0.21, + "grad_norm": 2.2846535308997873, + "learning_rate": 9.163564282043497e-06, + "loss": 0.5461, + "step": 2689 + }, + { + "epoch": 0.21, + "grad_norm": 1.6724783933260177, + "learning_rate": 9.162859946988537e-06, + "loss": 0.4819, + "step": 2690 + }, + { + "epoch": 0.21, + "grad_norm": 1.554488112960826, + "learning_rate": 9.16215534260198e-06, + "loss": 0.5051, + "step": 2691 + }, + { + "epoch": 0.21, + "grad_norm": 1.4943071276313928, + "learning_rate": 9.161450468929419e-06, + "loss": 0.5257, + "step": 2692 + }, + { + "epoch": 0.21, + "grad_norm": 2.003264144237427, + "learning_rate": 9.160745326016456e-06, + "loss": 0.5201, + "step": 2693 + }, + { + "epoch": 0.21, + "grad_norm": 1.7296444154409523, + "learning_rate": 9.160039913908715e-06, + "loss": 0.5197, + "step": 2694 + }, + { + "epoch": 0.21, + "grad_norm": 1.7439491132642566, + "learning_rate": 9.159334232651831e-06, + "loss": 0.5256, + "step": 2695 + }, + { + "epoch": 0.21, + "grad_norm": 0.6920103158440429, + "learning_rate": 9.158628282291464e-06, + "loss": 0.5406, + "step": 2696 + }, + { + "epoch": 0.21, + "grad_norm": 1.8889711448984638, + "learning_rate": 9.157922062873288e-06, + "loss": 0.5084, + "step": 2697 + }, + { + "epoch": 0.21, + "grad_norm": 1.5719399611565419, + "learning_rate": 9.157215574442993e-06, + "loss": 0.5326, + "step": 2698 + }, + { + "epoch": 0.21, + "grad_norm": 1.3754722344938513, + "learning_rate": 9.156508817046288e-06, + "loss": 0.4966, + "step": 2699 + }, + { + "epoch": 0.21, + "grad_norm": 1.8140733422251627, + "learning_rate": 9.155801790728903e-06, + "loss": 0.4954, + "step": 2700 + }, + { + "epoch": 0.21, + "grad_norm": 1.5750676488263426, + "learning_rate": 9.155094495536575e-06, + "loss": 0.5267, + "step": 2701 + }, + { + "epoch": 0.21, + "grad_norm": 2.023837455811502, + "learning_rate": 9.154386931515072e-06, + "loss": 0.4711, + "step": 2702 + }, + { + "epoch": 0.21, + "grad_norm": 0.6268180113303401, + "learning_rate": 9.153679098710166e-06, + "loss": 0.5516, + "step": 2703 + }, + { + "epoch": 0.21, + "grad_norm": 1.8020853976196107, + "learning_rate": 9.152970997167657e-06, + "loss": 0.5012, + "step": 2704 + }, + { + "epoch": 0.21, + "grad_norm": 1.6754609690048694, + "learning_rate": 9.152262626933358e-06, + "loss": 0.5154, + "step": 2705 + }, + { + "epoch": 0.21, + "grad_norm": 1.8602159651867618, + "learning_rate": 9.1515539880531e-06, + "loss": 0.5412, + "step": 2706 + }, + { + "epoch": 0.21, + "grad_norm": 0.6309716429159686, + "learning_rate": 9.150845080572727e-06, + "loss": 0.5477, + "step": 2707 + }, + { + "epoch": 0.21, + "grad_norm": 1.7858605343605, + "learning_rate": 9.150135904538109e-06, + "loss": 0.5222, + "step": 2708 + }, + { + "epoch": 0.21, + "grad_norm": 1.8487718808929, + "learning_rate": 9.149426459995127e-06, + "loss": 0.5122, + "step": 2709 + }, + { + "epoch": 0.21, + "grad_norm": 1.5288921335413317, + "learning_rate": 9.14871674698968e-06, + "loss": 0.5025, + "step": 2710 + }, + { + "epoch": 0.21, + "grad_norm": 1.581631766301348, + "learning_rate": 9.148006765567688e-06, + "loss": 0.4923, + "step": 2711 + }, + { + "epoch": 0.21, + "grad_norm": 0.6107427639309868, + "learning_rate": 9.147296515775084e-06, + "loss": 0.539, + "step": 2712 + }, + { + "epoch": 0.21, + "grad_norm": 1.4914724967806177, + "learning_rate": 9.146585997657822e-06, + "loss": 0.4528, + "step": 2713 + }, + { + "epoch": 0.21, + "grad_norm": 0.5971958208802981, + "learning_rate": 9.145875211261867e-06, + "loss": 0.5436, + "step": 2714 + }, + { + "epoch": 0.21, + "grad_norm": 1.4165825097750098, + "learning_rate": 9.145164156633212e-06, + "loss": 0.5173, + "step": 2715 + }, + { + "epoch": 0.21, + "grad_norm": 1.3428359361705824, + "learning_rate": 9.14445283381786e-06, + "loss": 0.5029, + "step": 2716 + }, + { + "epoch": 0.21, + "grad_norm": 0.553750673348457, + "learning_rate": 9.14374124286183e-06, + "loss": 0.5324, + "step": 2717 + }, + { + "epoch": 0.21, + "grad_norm": 0.6022061754528902, + "learning_rate": 9.14302938381116e-06, + "loss": 0.5303, + "step": 2718 + }, + { + "epoch": 0.21, + "grad_norm": 7.525054985575711, + "learning_rate": 9.14231725671191e-06, + "loss": 0.5363, + "step": 2719 + }, + { + "epoch": 0.21, + "grad_norm": 1.6709165060391473, + "learning_rate": 9.141604861610154e-06, + "loss": 0.5282, + "step": 2720 + }, + { + "epoch": 0.21, + "grad_norm": 1.7081744244554102, + "learning_rate": 9.14089219855198e-06, + "loss": 0.5171, + "step": 2721 + }, + { + "epoch": 0.21, + "grad_norm": 1.5867454105420833, + "learning_rate": 9.140179267583497e-06, + "loss": 0.5073, + "step": 2722 + }, + { + "epoch": 0.21, + "grad_norm": 2.12732315255267, + "learning_rate": 9.139466068750833e-06, + "loss": 0.4696, + "step": 2723 + }, + { + "epoch": 0.21, + "grad_norm": 1.4861968279556428, + "learning_rate": 9.138752602100128e-06, + "loss": 0.4583, + "step": 2724 + }, + { + "epoch": 0.21, + "grad_norm": 1.6114100694847304, + "learning_rate": 9.138038867677546e-06, + "loss": 0.5207, + "step": 2725 + }, + { + "epoch": 0.21, + "grad_norm": 0.6601654992669101, + "learning_rate": 9.13732486552926e-06, + "loss": 0.5555, + "step": 2726 + }, + { + "epoch": 0.21, + "grad_norm": 2.245988943235142, + "learning_rate": 9.136610595701469e-06, + "loss": 0.4761, + "step": 2727 + }, + { + "epoch": 0.21, + "grad_norm": 1.6916528815085077, + "learning_rate": 9.135896058240384e-06, + "loss": 0.4832, + "step": 2728 + }, + { + "epoch": 0.21, + "grad_norm": 2.207989604914988, + "learning_rate": 9.135181253192234e-06, + "loss": 0.5656, + "step": 2729 + }, + { + "epoch": 0.21, + "grad_norm": 1.65048945075191, + "learning_rate": 9.134466180603265e-06, + "loss": 0.584, + "step": 2730 + }, + { + "epoch": 0.21, + "grad_norm": 1.527197700457935, + "learning_rate": 9.133750840519744e-06, + "loss": 0.5185, + "step": 2731 + }, + { + "epoch": 0.21, + "grad_norm": 1.7389115375135906, + "learning_rate": 9.133035232987952e-06, + "loss": 0.486, + "step": 2732 + }, + { + "epoch": 0.21, + "grad_norm": 1.57240820413734, + "learning_rate": 9.132319358054185e-06, + "loss": 0.4845, + "step": 2733 + }, + { + "epoch": 0.21, + "grad_norm": 1.3285407603205157, + "learning_rate": 9.131603215764764e-06, + "loss": 0.5002, + "step": 2734 + }, + { + "epoch": 0.21, + "grad_norm": 1.5468452930722145, + "learning_rate": 9.130886806166018e-06, + "loss": 0.5214, + "step": 2735 + }, + { + "epoch": 0.21, + "grad_norm": 1.66274584206512, + "learning_rate": 9.130170129304298e-06, + "loss": 0.5273, + "step": 2736 + }, + { + "epoch": 0.21, + "grad_norm": 1.5466066573961323, + "learning_rate": 9.129453185225976e-06, + "loss": 0.5325, + "step": 2737 + }, + { + "epoch": 0.22, + "grad_norm": 1.9020722524579603, + "learning_rate": 9.128735973977433e-06, + "loss": 0.5198, + "step": 2738 + }, + { + "epoch": 0.22, + "grad_norm": 1.6679319670437078, + "learning_rate": 9.128018495605077e-06, + "loss": 0.5216, + "step": 2739 + }, + { + "epoch": 0.22, + "grad_norm": 1.4968362996665217, + "learning_rate": 9.12730075015532e-06, + "loss": 0.466, + "step": 2740 + }, + { + "epoch": 0.22, + "grad_norm": 1.832538910354158, + "learning_rate": 9.126582737674608e-06, + "loss": 0.5265, + "step": 2741 + }, + { + "epoch": 0.22, + "grad_norm": 0.7385854139924435, + "learning_rate": 9.125864458209388e-06, + "loss": 0.557, + "step": 2742 + }, + { + "epoch": 0.22, + "grad_norm": 1.3899856196366254, + "learning_rate": 9.125145911806138e-06, + "loss": 0.4677, + "step": 2743 + }, + { + "epoch": 0.22, + "grad_norm": 2.6927475904140197, + "learning_rate": 9.12442709851134e-06, + "loss": 0.5095, + "step": 2744 + }, + { + "epoch": 0.22, + "grad_norm": 1.4477363947627442, + "learning_rate": 9.123708018371507e-06, + "loss": 0.5195, + "step": 2745 + }, + { + "epoch": 0.22, + "grad_norm": 1.4310694276339349, + "learning_rate": 9.122988671433159e-06, + "loss": 0.5118, + "step": 2746 + }, + { + "epoch": 0.22, + "grad_norm": 2.3569402909371706, + "learning_rate": 9.122269057742837e-06, + "loss": 0.5268, + "step": 2747 + }, + { + "epoch": 0.22, + "grad_norm": 2.2417188825959897, + "learning_rate": 9.1215491773471e-06, + "loss": 0.4802, + "step": 2748 + }, + { + "epoch": 0.22, + "grad_norm": 0.7207911456566134, + "learning_rate": 9.120829030292522e-06, + "loss": 0.5579, + "step": 2749 + }, + { + "epoch": 0.22, + "grad_norm": 2.475787218501387, + "learning_rate": 9.120108616625697e-06, + "loss": 0.5156, + "step": 2750 + }, + { + "epoch": 0.22, + "grad_norm": 2.3375673133641435, + "learning_rate": 9.119387936393235e-06, + "loss": 0.5218, + "step": 2751 + }, + { + "epoch": 0.22, + "grad_norm": 2.7538318315433967, + "learning_rate": 9.11866698964176e-06, + "loss": 0.5254, + "step": 2752 + }, + { + "epoch": 0.22, + "grad_norm": 1.6137294144374963, + "learning_rate": 9.11794577641792e-06, + "loss": 0.4819, + "step": 2753 + }, + { + "epoch": 0.22, + "grad_norm": 1.433009280461519, + "learning_rate": 9.117224296768376e-06, + "loss": 0.4931, + "step": 2754 + }, + { + "epoch": 0.22, + "grad_norm": 1.45518271958093, + "learning_rate": 9.116502550739808e-06, + "loss": 0.5259, + "step": 2755 + }, + { + "epoch": 0.22, + "grad_norm": 1.5384352163901207, + "learning_rate": 9.115780538378907e-06, + "loss": 0.4537, + "step": 2756 + }, + { + "epoch": 0.22, + "grad_norm": 1.6472177529971626, + "learning_rate": 9.11505825973239e-06, + "loss": 0.4885, + "step": 2757 + }, + { + "epoch": 0.22, + "grad_norm": 1.5080377918369028, + "learning_rate": 9.114335714846987e-06, + "loss": 0.5816, + "step": 2758 + }, + { + "epoch": 0.22, + "grad_norm": 1.6143389665606054, + "learning_rate": 9.113612903769445e-06, + "loss": 0.5021, + "step": 2759 + }, + { + "epoch": 0.22, + "grad_norm": 1.8551384046735047, + "learning_rate": 9.112889826546529e-06, + "loss": 0.5173, + "step": 2760 + }, + { + "epoch": 0.22, + "grad_norm": 2.2720958246592944, + "learning_rate": 9.112166483225023e-06, + "loss": 0.4857, + "step": 2761 + }, + { + "epoch": 0.22, + "grad_norm": 1.5713366834254272, + "learning_rate": 9.111442873851724e-06, + "loss": 0.485, + "step": 2762 + }, + { + "epoch": 0.22, + "grad_norm": 3.681878826862694, + "learning_rate": 9.11071899847345e-06, + "loss": 0.4714, + "step": 2763 + }, + { + "epoch": 0.22, + "grad_norm": 2.2072634415872, + "learning_rate": 9.109994857137032e-06, + "loss": 0.5744, + "step": 2764 + }, + { + "epoch": 0.22, + "grad_norm": 1.5866836267675368, + "learning_rate": 9.109270449889326e-06, + "loss": 0.5113, + "step": 2765 + }, + { + "epoch": 0.22, + "grad_norm": 1.6979777898958301, + "learning_rate": 9.108545776777196e-06, + "loss": 0.5246, + "step": 2766 + }, + { + "epoch": 0.22, + "grad_norm": 1.306011405924336, + "learning_rate": 9.10782083784753e-06, + "loss": 0.5007, + "step": 2767 + }, + { + "epoch": 0.22, + "grad_norm": 0.7426384002770077, + "learning_rate": 9.107095633147229e-06, + "loss": 0.5251, + "step": 2768 + }, + { + "epoch": 0.22, + "grad_norm": 1.6225278803895051, + "learning_rate": 9.106370162723214e-06, + "loss": 0.5537, + "step": 2769 + }, + { + "epoch": 0.22, + "grad_norm": 0.6099372723394942, + "learning_rate": 9.10564442662242e-06, + "loss": 0.5433, + "step": 2770 + }, + { + "epoch": 0.22, + "grad_norm": 3.3911305350894354, + "learning_rate": 9.104918424891803e-06, + "loss": 0.5599, + "step": 2771 + }, + { + "epoch": 0.22, + "grad_norm": 2.6749314632751924, + "learning_rate": 9.104192157578335e-06, + "loss": 0.5221, + "step": 2772 + }, + { + "epoch": 0.22, + "grad_norm": 1.6866236974367717, + "learning_rate": 9.103465624729002e-06, + "loss": 0.4573, + "step": 2773 + }, + { + "epoch": 0.22, + "grad_norm": 1.1723896324424787, + "learning_rate": 9.102738826390811e-06, + "loss": 0.4878, + "step": 2774 + }, + { + "epoch": 0.22, + "grad_norm": 0.9114811437032293, + "learning_rate": 9.102011762610785e-06, + "loss": 0.5717, + "step": 2775 + }, + { + "epoch": 0.22, + "grad_norm": 1.4113126482763392, + "learning_rate": 9.101284433435965e-06, + "loss": 0.5227, + "step": 2776 + }, + { + "epoch": 0.22, + "grad_norm": 1.7202827793717557, + "learning_rate": 9.100556838913407e-06, + "loss": 0.5451, + "step": 2777 + }, + { + "epoch": 0.22, + "grad_norm": 1.312020549281303, + "learning_rate": 9.099828979090183e-06, + "loss": 0.504, + "step": 2778 + }, + { + "epoch": 0.22, + "grad_norm": 1.382271834865188, + "learning_rate": 9.099100854013392e-06, + "loss": 0.4568, + "step": 2779 + }, + { + "epoch": 0.22, + "grad_norm": 1.6484539465120087, + "learning_rate": 9.098372463730135e-06, + "loss": 0.5143, + "step": 2780 + }, + { + "epoch": 0.22, + "grad_norm": 1.4628791695061754, + "learning_rate": 9.097643808287541e-06, + "loss": 0.4723, + "step": 2781 + }, + { + "epoch": 0.22, + "grad_norm": 1.8010748515146753, + "learning_rate": 9.096914887732756e-06, + "loss": 0.5093, + "step": 2782 + }, + { + "epoch": 0.22, + "grad_norm": 0.6832296187403024, + "learning_rate": 9.096185702112933e-06, + "loss": 0.5413, + "step": 2783 + }, + { + "epoch": 0.22, + "grad_norm": 1.5124258537433442, + "learning_rate": 9.095456251475257e-06, + "loss": 0.5428, + "step": 2784 + }, + { + "epoch": 0.22, + "grad_norm": 1.4823417602332192, + "learning_rate": 9.094726535866918e-06, + "loss": 0.4974, + "step": 2785 + }, + { + "epoch": 0.22, + "grad_norm": 1.6253719640746063, + "learning_rate": 9.093996555335128e-06, + "loss": 0.5395, + "step": 2786 + }, + { + "epoch": 0.22, + "grad_norm": 1.6370834986373386, + "learning_rate": 9.093266309927116e-06, + "loss": 0.4977, + "step": 2787 + }, + { + "epoch": 0.22, + "grad_norm": 2.3536790357965214, + "learning_rate": 9.092535799690128e-06, + "loss": 0.4736, + "step": 2788 + }, + { + "epoch": 0.22, + "grad_norm": 1.6419275994965008, + "learning_rate": 9.091805024671429e-06, + "loss": 0.5132, + "step": 2789 + }, + { + "epoch": 0.22, + "grad_norm": 1.6530221631295237, + "learning_rate": 9.091073984918298e-06, + "loss": 0.5344, + "step": 2790 + }, + { + "epoch": 0.22, + "grad_norm": 1.3600676332693231, + "learning_rate": 9.090342680478031e-06, + "loss": 0.4836, + "step": 2791 + }, + { + "epoch": 0.22, + "grad_norm": 0.6018024125547977, + "learning_rate": 9.089611111397943e-06, + "loss": 0.5492, + "step": 2792 + }, + { + "epoch": 0.22, + "grad_norm": 0.5414973589670579, + "learning_rate": 9.088879277725367e-06, + "loss": 0.5501, + "step": 2793 + }, + { + "epoch": 0.22, + "grad_norm": 1.762059875585048, + "learning_rate": 9.088147179507651e-06, + "loss": 0.5045, + "step": 2794 + }, + { + "epoch": 0.22, + "grad_norm": 1.3776620741200691, + "learning_rate": 9.087414816792159e-06, + "loss": 0.4508, + "step": 2795 + }, + { + "epoch": 0.22, + "grad_norm": 0.5834146098261641, + "learning_rate": 9.086682189626277e-06, + "loss": 0.5243, + "step": 2796 + }, + { + "epoch": 0.22, + "grad_norm": 1.5293276498184103, + "learning_rate": 9.085949298057402e-06, + "loss": 0.5135, + "step": 2797 + }, + { + "epoch": 0.22, + "grad_norm": 1.5794505395275915, + "learning_rate": 9.085216142132953e-06, + "loss": 0.4728, + "step": 2798 + }, + { + "epoch": 0.22, + "grad_norm": 0.6522462735443423, + "learning_rate": 9.084482721900363e-06, + "loss": 0.5408, + "step": 2799 + }, + { + "epoch": 0.22, + "grad_norm": 1.440492453833728, + "learning_rate": 9.083749037407086e-06, + "loss": 0.49, + "step": 2800 + }, + { + "epoch": 0.22, + "grad_norm": 0.5893867109776132, + "learning_rate": 9.083015088700588e-06, + "loss": 0.5277, + "step": 2801 + }, + { + "epoch": 0.22, + "grad_norm": 1.666155076999855, + "learning_rate": 9.082280875828354e-06, + "loss": 0.534, + "step": 2802 + }, + { + "epoch": 0.22, + "grad_norm": 0.6406019427495705, + "learning_rate": 9.081546398837888e-06, + "loss": 0.5693, + "step": 2803 + }, + { + "epoch": 0.22, + "grad_norm": 1.6976584664007963, + "learning_rate": 9.08081165777671e-06, + "loss": 0.5563, + "step": 2804 + }, + { + "epoch": 0.22, + "grad_norm": 1.6429484806417651, + "learning_rate": 9.080076652692355e-06, + "loss": 0.5181, + "step": 2805 + }, + { + "epoch": 0.22, + "grad_norm": 1.4863492027077427, + "learning_rate": 9.079341383632379e-06, + "loss": 0.5061, + "step": 2806 + }, + { + "epoch": 0.22, + "grad_norm": 1.6626208028678764, + "learning_rate": 9.07860585064435e-06, + "loss": 0.5028, + "step": 2807 + }, + { + "epoch": 0.22, + "grad_norm": 1.4515077155567888, + "learning_rate": 9.07787005377586e-06, + "loss": 0.5271, + "step": 2808 + }, + { + "epoch": 0.22, + "grad_norm": 1.8112282684826926, + "learning_rate": 9.07713399307451e-06, + "loss": 0.5738, + "step": 2809 + }, + { + "epoch": 0.22, + "grad_norm": 0.5821108254790223, + "learning_rate": 9.076397668587927e-06, + "loss": 0.5444, + "step": 2810 + }, + { + "epoch": 0.22, + "grad_norm": 1.4284033581599325, + "learning_rate": 9.075661080363745e-06, + "loss": 0.4605, + "step": 2811 + }, + { + "epoch": 0.22, + "grad_norm": 1.6832464233956461, + "learning_rate": 9.074924228449625e-06, + "loss": 0.4385, + "step": 2812 + }, + { + "epoch": 0.22, + "grad_norm": 1.9701456139963462, + "learning_rate": 9.074187112893235e-06, + "loss": 0.5065, + "step": 2813 + }, + { + "epoch": 0.22, + "grad_norm": 1.7096055600463889, + "learning_rate": 9.073449733742271e-06, + "loss": 0.5171, + "step": 2814 + }, + { + "epoch": 0.22, + "grad_norm": 1.4879151210266681, + "learning_rate": 9.072712091044437e-06, + "loss": 0.3914, + "step": 2815 + }, + { + "epoch": 0.22, + "grad_norm": 1.7243821409432285, + "learning_rate": 9.071974184847459e-06, + "loss": 0.5233, + "step": 2816 + }, + { + "epoch": 0.22, + "grad_norm": 1.7309908316396845, + "learning_rate": 9.071236015199077e-06, + "loss": 0.5418, + "step": 2817 + }, + { + "epoch": 0.22, + "grad_norm": 1.3725950230404538, + "learning_rate": 9.070497582147051e-06, + "loss": 0.5371, + "step": 2818 + }, + { + "epoch": 0.22, + "grad_norm": 1.6086772680201136, + "learning_rate": 9.069758885739157e-06, + "loss": 0.4876, + "step": 2819 + }, + { + "epoch": 0.22, + "grad_norm": 0.5945002385263806, + "learning_rate": 9.069019926023189e-06, + "loss": 0.5384, + "step": 2820 + }, + { + "epoch": 0.22, + "grad_norm": 1.4179480905817499, + "learning_rate": 9.068280703046951e-06, + "loss": 0.4793, + "step": 2821 + }, + { + "epoch": 0.22, + "grad_norm": 1.6994833367032034, + "learning_rate": 9.067541216858276e-06, + "loss": 0.507, + "step": 2822 + }, + { + "epoch": 0.22, + "grad_norm": 1.4678100262733464, + "learning_rate": 9.066801467505004e-06, + "loss": 0.4944, + "step": 2823 + }, + { + "epoch": 0.22, + "grad_norm": 1.7350869892905907, + "learning_rate": 9.066061455034996e-06, + "loss": 0.5141, + "step": 2824 + }, + { + "epoch": 0.22, + "grad_norm": 1.5420637972086488, + "learning_rate": 9.065321179496134e-06, + "loss": 0.4947, + "step": 2825 + }, + { + "epoch": 0.22, + "grad_norm": 1.9724487521108094, + "learning_rate": 9.064580640936307e-06, + "loss": 0.4753, + "step": 2826 + }, + { + "epoch": 0.22, + "grad_norm": 1.7642551680786225, + "learning_rate": 9.063839839403431e-06, + "loss": 0.5185, + "step": 2827 + }, + { + "epoch": 0.22, + "grad_norm": 1.5441391355314873, + "learning_rate": 9.063098774945433e-06, + "loss": 0.4697, + "step": 2828 + }, + { + "epoch": 0.22, + "grad_norm": 2.681699786924045, + "learning_rate": 9.06235744761026e-06, + "loss": 0.493, + "step": 2829 + }, + { + "epoch": 0.22, + "grad_norm": 1.3830097143050537, + "learning_rate": 9.061615857445875e-06, + "loss": 0.5076, + "step": 2830 + }, + { + "epoch": 0.22, + "grad_norm": 1.5945140648278653, + "learning_rate": 9.060874004500256e-06, + "loss": 0.4948, + "step": 2831 + }, + { + "epoch": 0.22, + "grad_norm": 1.4693440368477615, + "learning_rate": 9.060131888821402e-06, + "loss": 0.4809, + "step": 2832 + }, + { + "epoch": 0.22, + "grad_norm": 2.4632975431155053, + "learning_rate": 9.059389510457326e-06, + "loss": 0.4838, + "step": 2833 + }, + { + "epoch": 0.22, + "grad_norm": 1.5982332725747161, + "learning_rate": 9.058646869456058e-06, + "loss": 0.5429, + "step": 2834 + }, + { + "epoch": 0.22, + "grad_norm": 1.3919668128167277, + "learning_rate": 9.057903965865649e-06, + "loss": 0.5183, + "step": 2835 + }, + { + "epoch": 0.22, + "grad_norm": 1.5115741831961291, + "learning_rate": 9.057160799734159e-06, + "loss": 0.5021, + "step": 2836 + }, + { + "epoch": 0.22, + "grad_norm": 0.6264061339354404, + "learning_rate": 9.056417371109674e-06, + "loss": 0.5642, + "step": 2837 + }, + { + "epoch": 0.22, + "grad_norm": 2.1285779893030767, + "learning_rate": 9.055673680040293e-06, + "loss": 0.5562, + "step": 2838 + }, + { + "epoch": 0.22, + "grad_norm": 1.3036097615341762, + "learning_rate": 9.054929726574128e-06, + "loss": 0.5069, + "step": 2839 + }, + { + "epoch": 0.22, + "grad_norm": 2.089050739402883, + "learning_rate": 9.054185510759317e-06, + "loss": 0.493, + "step": 2840 + }, + { + "epoch": 0.22, + "grad_norm": 1.4317091534646331, + "learning_rate": 9.053441032644005e-06, + "loss": 0.5251, + "step": 2841 + }, + { + "epoch": 0.22, + "grad_norm": 2.7529296572257094, + "learning_rate": 9.052696292276362e-06, + "loss": 0.4284, + "step": 2842 + }, + { + "epoch": 0.22, + "grad_norm": 1.3521139688492267, + "learning_rate": 9.051951289704568e-06, + "loss": 0.4772, + "step": 2843 + }, + { + "epoch": 0.22, + "grad_norm": 1.7215488872856497, + "learning_rate": 9.051206024976829e-06, + "loss": 0.5369, + "step": 2844 + }, + { + "epoch": 0.22, + "grad_norm": 1.5208064307109832, + "learning_rate": 9.050460498141358e-06, + "loss": 0.5162, + "step": 2845 + }, + { + "epoch": 0.22, + "grad_norm": 0.59466320606524, + "learning_rate": 9.049714709246392e-06, + "loss": 0.5307, + "step": 2846 + }, + { + "epoch": 0.22, + "grad_norm": 1.3903674029947377, + "learning_rate": 9.048968658340183e-06, + "loss": 0.461, + "step": 2847 + }, + { + "epoch": 0.22, + "grad_norm": 1.704528050067276, + "learning_rate": 9.048222345470996e-06, + "loss": 0.4746, + "step": 2848 + }, + { + "epoch": 0.22, + "grad_norm": 2.2776992690118854, + "learning_rate": 9.047475770687123e-06, + "loss": 0.5108, + "step": 2849 + }, + { + "epoch": 0.22, + "grad_norm": 0.6687377615283843, + "learning_rate": 9.046728934036857e-06, + "loss": 0.5632, + "step": 2850 + }, + { + "epoch": 0.22, + "grad_norm": 1.7149521824080989, + "learning_rate": 9.045981835568527e-06, + "loss": 0.5414, + "step": 2851 + }, + { + "epoch": 0.22, + "grad_norm": 3.2744692611388375, + "learning_rate": 9.045234475330464e-06, + "loss": 0.5032, + "step": 2852 + }, + { + "epoch": 0.22, + "grad_norm": 0.597800986924832, + "learning_rate": 9.044486853371022e-06, + "loss": 0.5342, + "step": 2853 + }, + { + "epoch": 0.22, + "grad_norm": 1.3459891160538193, + "learning_rate": 9.043738969738572e-06, + "loss": 0.5375, + "step": 2854 + }, + { + "epoch": 0.22, + "grad_norm": 1.545951589508904, + "learning_rate": 9.042990824481499e-06, + "loss": 0.4352, + "step": 2855 + }, + { + "epoch": 0.22, + "grad_norm": 1.4034211739231588, + "learning_rate": 9.04224241764821e-06, + "loss": 0.5653, + "step": 2856 + }, + { + "epoch": 0.22, + "grad_norm": 1.5685069722431875, + "learning_rate": 9.041493749287125e-06, + "loss": 0.5095, + "step": 2857 + }, + { + "epoch": 0.22, + "grad_norm": 2.11569744710073, + "learning_rate": 9.04074481944668e-06, + "loss": 0.476, + "step": 2858 + }, + { + "epoch": 0.22, + "grad_norm": 2.93506020171291, + "learning_rate": 9.03999562817533e-06, + "loss": 0.5252, + "step": 2859 + }, + { + "epoch": 0.22, + "grad_norm": 1.220709694908683, + "learning_rate": 9.03924617552155e-06, + "loss": 0.4888, + "step": 2860 + }, + { + "epoch": 0.22, + "grad_norm": 0.6238570606132527, + "learning_rate": 9.038496461533825e-06, + "loss": 0.5296, + "step": 2861 + }, + { + "epoch": 0.22, + "grad_norm": 0.641906368429085, + "learning_rate": 9.037746486260664e-06, + "loss": 0.5381, + "step": 2862 + }, + { + "epoch": 0.22, + "grad_norm": 1.8105920847981534, + "learning_rate": 9.036996249750588e-06, + "loss": 0.5279, + "step": 2863 + }, + { + "epoch": 0.22, + "grad_norm": 1.6864143595600605, + "learning_rate": 9.036245752052132e-06, + "loss": 0.4535, + "step": 2864 + }, + { + "epoch": 0.23, + "grad_norm": 1.6539356115962756, + "learning_rate": 9.035494993213858e-06, + "loss": 0.5294, + "step": 2865 + }, + { + "epoch": 0.23, + "grad_norm": 1.4343781648388971, + "learning_rate": 9.034743973284337e-06, + "loss": 0.5181, + "step": 2866 + }, + { + "epoch": 0.23, + "grad_norm": 1.4776321455317365, + "learning_rate": 9.03399269231216e-06, + "loss": 0.4911, + "step": 2867 + }, + { + "epoch": 0.23, + "grad_norm": 1.4289199939080706, + "learning_rate": 9.03324115034593e-06, + "loss": 0.5033, + "step": 2868 + }, + { + "epoch": 0.23, + "grad_norm": 1.7724262912160629, + "learning_rate": 9.032489347434277e-06, + "loss": 0.5386, + "step": 2869 + }, + { + "epoch": 0.23, + "grad_norm": 1.5657577330742882, + "learning_rate": 9.031737283625836e-06, + "loss": 0.5464, + "step": 2870 + }, + { + "epoch": 0.23, + "grad_norm": 0.7436895582300712, + "learning_rate": 9.030984958969268e-06, + "loss": 0.5449, + "step": 2871 + }, + { + "epoch": 0.23, + "grad_norm": 2.124191405709207, + "learning_rate": 9.030232373513245e-06, + "loss": 0.5112, + "step": 2872 + }, + { + "epoch": 0.23, + "grad_norm": 0.6282859595040777, + "learning_rate": 9.029479527306461e-06, + "loss": 0.5483, + "step": 2873 + }, + { + "epoch": 0.23, + "grad_norm": 1.454302204484329, + "learning_rate": 9.028726420397624e-06, + "loss": 0.5421, + "step": 2874 + }, + { + "epoch": 0.23, + "grad_norm": 1.5097872185938759, + "learning_rate": 9.027973052835456e-06, + "loss": 0.4727, + "step": 2875 + }, + { + "epoch": 0.23, + "grad_norm": 1.5660091707479298, + "learning_rate": 9.0272194246687e-06, + "loss": 0.4165, + "step": 2876 + }, + { + "epoch": 0.23, + "grad_norm": 1.541005863309527, + "learning_rate": 9.026465535946118e-06, + "loss": 0.5534, + "step": 2877 + }, + { + "epoch": 0.23, + "grad_norm": 2.142136977845894, + "learning_rate": 9.025711386716481e-06, + "loss": 0.4883, + "step": 2878 + }, + { + "epoch": 0.23, + "grad_norm": 1.6658377872974515, + "learning_rate": 9.024956977028585e-06, + "loss": 0.4715, + "step": 2879 + }, + { + "epoch": 0.23, + "grad_norm": 1.4720771029130917, + "learning_rate": 9.024202306931236e-06, + "loss": 0.5362, + "step": 2880 + }, + { + "epoch": 0.23, + "grad_norm": 1.5792433885212707, + "learning_rate": 9.023447376473264e-06, + "loss": 0.4849, + "step": 2881 + }, + { + "epoch": 0.23, + "grad_norm": 1.3899423807451092, + "learning_rate": 9.022692185703509e-06, + "loss": 0.5003, + "step": 2882 + }, + { + "epoch": 0.23, + "grad_norm": 0.7607924823257483, + "learning_rate": 9.021936734670833e-06, + "loss": 0.5295, + "step": 2883 + }, + { + "epoch": 0.23, + "grad_norm": 1.4556657198388738, + "learning_rate": 9.02118102342411e-06, + "loss": 0.5483, + "step": 2884 + }, + { + "epoch": 0.23, + "grad_norm": 1.4909160131099273, + "learning_rate": 9.020425052012237e-06, + "loss": 0.5419, + "step": 2885 + }, + { + "epoch": 0.23, + "grad_norm": 2.0453172855147144, + "learning_rate": 9.019668820484123e-06, + "loss": 0.508, + "step": 2886 + }, + { + "epoch": 0.23, + "grad_norm": 1.6576818076751387, + "learning_rate": 9.018912328888691e-06, + "loss": 0.5205, + "step": 2887 + }, + { + "epoch": 0.23, + "grad_norm": 1.632333043865491, + "learning_rate": 9.018155577274891e-06, + "loss": 0.52, + "step": 2888 + }, + { + "epoch": 0.23, + "grad_norm": 1.4295342656778467, + "learning_rate": 9.017398565691681e-06, + "loss": 0.4993, + "step": 2889 + }, + { + "epoch": 0.23, + "grad_norm": 1.7491171432443455, + "learning_rate": 9.01664129418804e-06, + "loss": 0.5264, + "step": 2890 + }, + { + "epoch": 0.23, + "grad_norm": 1.5224450032887107, + "learning_rate": 9.015883762812962e-06, + "loss": 0.5241, + "step": 2891 + }, + { + "epoch": 0.23, + "grad_norm": 1.5918272408452991, + "learning_rate": 9.015125971615459e-06, + "loss": 0.5332, + "step": 2892 + }, + { + "epoch": 0.23, + "grad_norm": 2.1659958412664304, + "learning_rate": 9.014367920644555e-06, + "loss": 0.5321, + "step": 2893 + }, + { + "epoch": 0.23, + "grad_norm": 1.7440380719766047, + "learning_rate": 9.0136096099493e-06, + "loss": 0.5025, + "step": 2894 + }, + { + "epoch": 0.23, + "grad_norm": 1.6324977814773578, + "learning_rate": 9.012851039578754e-06, + "loss": 0.5203, + "step": 2895 + }, + { + "epoch": 0.23, + "grad_norm": 1.3312526164378193, + "learning_rate": 9.012092209581993e-06, + "loss": 0.4961, + "step": 2896 + }, + { + "epoch": 0.23, + "grad_norm": 1.8789883484291892, + "learning_rate": 9.011333120008117e-06, + "loss": 0.4962, + "step": 2897 + }, + { + "epoch": 0.23, + "grad_norm": 1.9555911357403164, + "learning_rate": 9.010573770906235e-06, + "loss": 0.504, + "step": 2898 + }, + { + "epoch": 0.23, + "grad_norm": 1.507846986930788, + "learning_rate": 9.009814162325475e-06, + "loss": 0.493, + "step": 2899 + }, + { + "epoch": 0.23, + "grad_norm": 1.6539060493772393, + "learning_rate": 9.009054294314985e-06, + "loss": 0.4869, + "step": 2900 + }, + { + "epoch": 0.23, + "grad_norm": 1.5556406957322377, + "learning_rate": 9.008294166923927e-06, + "loss": 0.4958, + "step": 2901 + }, + { + "epoch": 0.23, + "grad_norm": 1.5758554372564502, + "learning_rate": 9.00753378020148e-06, + "loss": 0.4741, + "step": 2902 + }, + { + "epoch": 0.23, + "grad_norm": 0.9010188327482946, + "learning_rate": 9.00677313419684e-06, + "loss": 0.5748, + "step": 2903 + }, + { + "epoch": 0.23, + "grad_norm": 1.8777674113446061, + "learning_rate": 9.00601222895922e-06, + "loss": 0.5207, + "step": 2904 + }, + { + "epoch": 0.23, + "grad_norm": 1.7647453387347591, + "learning_rate": 9.005251064537848e-06, + "loss": 0.5057, + "step": 2905 + }, + { + "epoch": 0.23, + "grad_norm": 1.6119672434298182, + "learning_rate": 9.004489640981973e-06, + "loss": 0.4968, + "step": 2906 + }, + { + "epoch": 0.23, + "grad_norm": 1.502966750421292, + "learning_rate": 9.003727958340856e-06, + "loss": 0.5159, + "step": 2907 + }, + { + "epoch": 0.23, + "grad_norm": 1.408235278989794, + "learning_rate": 9.00296601666378e-06, + "loss": 0.4842, + "step": 2908 + }, + { + "epoch": 0.23, + "grad_norm": 1.4346957404124154, + "learning_rate": 9.002203816000035e-06, + "loss": 0.4848, + "step": 2909 + }, + { + "epoch": 0.23, + "grad_norm": 1.2620684598175245, + "learning_rate": 9.001441356398942e-06, + "loss": 0.5583, + "step": 2910 + }, + { + "epoch": 0.23, + "grad_norm": 1.2756562163083214, + "learning_rate": 9.000678637909825e-06, + "loss": 0.4583, + "step": 2911 + }, + { + "epoch": 0.23, + "grad_norm": 2.4008095359476687, + "learning_rate": 8.999915660582037e-06, + "loss": 0.5168, + "step": 2912 + }, + { + "epoch": 0.23, + "grad_norm": 1.9789433094176034, + "learning_rate": 8.999152424464936e-06, + "loss": 0.536, + "step": 2913 + }, + { + "epoch": 0.23, + "grad_norm": 1.8715275234546433, + "learning_rate": 8.998388929607905e-06, + "loss": 0.518, + "step": 2914 + }, + { + "epoch": 0.23, + "grad_norm": 1.367149063709105, + "learning_rate": 8.99762517606034e-06, + "loss": 0.4899, + "step": 2915 + }, + { + "epoch": 0.23, + "grad_norm": 0.9146217653856598, + "learning_rate": 8.996861163871658e-06, + "loss": 0.5517, + "step": 2916 + }, + { + "epoch": 0.23, + "grad_norm": 1.6458253638372093, + "learning_rate": 8.996096893091285e-06, + "loss": 0.5124, + "step": 2917 + }, + { + "epoch": 0.23, + "grad_norm": 1.7997800559279415, + "learning_rate": 8.995332363768671e-06, + "loss": 0.4993, + "step": 2918 + }, + { + "epoch": 0.23, + "grad_norm": 1.5517983308219325, + "learning_rate": 8.994567575953281e-06, + "loss": 0.4857, + "step": 2919 + }, + { + "epoch": 0.23, + "grad_norm": 1.7227828254481616, + "learning_rate": 8.993802529694594e-06, + "loss": 0.4875, + "step": 2920 + }, + { + "epoch": 0.23, + "grad_norm": 1.9496139081332147, + "learning_rate": 8.993037225042107e-06, + "loss": 0.4711, + "step": 2921 + }, + { + "epoch": 0.23, + "grad_norm": 0.7083956237898862, + "learning_rate": 8.992271662045335e-06, + "loss": 0.55, + "step": 2922 + }, + { + "epoch": 0.23, + "grad_norm": 2.062223356295321, + "learning_rate": 8.99150584075381e-06, + "loss": 0.4845, + "step": 2923 + }, + { + "epoch": 0.23, + "grad_norm": 2.215008892440812, + "learning_rate": 8.990739761217078e-06, + "loss": 0.5112, + "step": 2924 + }, + { + "epoch": 0.23, + "grad_norm": 1.496851018519653, + "learning_rate": 8.989973423484703e-06, + "loss": 0.5623, + "step": 2925 + }, + { + "epoch": 0.23, + "grad_norm": 1.4126782867075052, + "learning_rate": 8.989206827606269e-06, + "loss": 0.4927, + "step": 2926 + }, + { + "epoch": 0.23, + "grad_norm": 1.7585448414127276, + "learning_rate": 8.988439973631371e-06, + "loss": 0.4981, + "step": 2927 + }, + { + "epoch": 0.23, + "grad_norm": 1.4429615325228062, + "learning_rate": 8.987672861609624e-06, + "loss": 0.4725, + "step": 2928 + }, + { + "epoch": 0.23, + "grad_norm": 1.6524667605260932, + "learning_rate": 8.986905491590659e-06, + "loss": 0.4503, + "step": 2929 + }, + { + "epoch": 0.23, + "grad_norm": 2.026708868261442, + "learning_rate": 8.986137863624125e-06, + "loss": 0.5129, + "step": 2930 + }, + { + "epoch": 0.23, + "grad_norm": 1.566714723275509, + "learning_rate": 8.985369977759686e-06, + "loss": 0.5309, + "step": 2931 + }, + { + "epoch": 0.23, + "grad_norm": 1.339343421436997, + "learning_rate": 8.984601834047022e-06, + "loss": 0.4528, + "step": 2932 + }, + { + "epoch": 0.23, + "grad_norm": 1.319215389446425, + "learning_rate": 8.983833432535833e-06, + "loss": 0.4408, + "step": 2933 + }, + { + "epoch": 0.23, + "grad_norm": 1.4016226891541588, + "learning_rate": 8.98306477327583e-06, + "loss": 0.4836, + "step": 2934 + }, + { + "epoch": 0.23, + "grad_norm": 4.458482075773622, + "learning_rate": 8.98229585631675e-06, + "loss": 0.4834, + "step": 2935 + }, + { + "epoch": 0.23, + "grad_norm": 2.432574781078674, + "learning_rate": 8.981526681708336e-06, + "loss": 0.4929, + "step": 2936 + }, + { + "epoch": 0.23, + "grad_norm": 1.4965938792972264, + "learning_rate": 8.980757249500354e-06, + "loss": 0.5196, + "step": 2937 + }, + { + "epoch": 0.23, + "grad_norm": 2.1744069059084414, + "learning_rate": 8.979987559742587e-06, + "loss": 0.5396, + "step": 2938 + }, + { + "epoch": 0.23, + "grad_norm": 1.4983700945529155, + "learning_rate": 8.97921761248483e-06, + "loss": 0.5163, + "step": 2939 + }, + { + "epoch": 0.23, + "grad_norm": 2.0021631525644827, + "learning_rate": 8.978447407776898e-06, + "loss": 0.4277, + "step": 2940 + }, + { + "epoch": 0.23, + "grad_norm": 1.6299889987173506, + "learning_rate": 8.977676945668626e-06, + "loss": 0.5363, + "step": 2941 + }, + { + "epoch": 0.23, + "grad_norm": 8.714687486946245, + "learning_rate": 8.976906226209856e-06, + "loss": 0.5304, + "step": 2942 + }, + { + "epoch": 0.23, + "grad_norm": 1.9899206843886754, + "learning_rate": 8.976135249450457e-06, + "loss": 0.5399, + "step": 2943 + }, + { + "epoch": 0.23, + "grad_norm": 1.3626477637139076, + "learning_rate": 8.975364015440308e-06, + "loss": 0.5037, + "step": 2944 + }, + { + "epoch": 0.23, + "grad_norm": 0.7064942261108158, + "learning_rate": 8.974592524229308e-06, + "loss": 0.5659, + "step": 2945 + }, + { + "epoch": 0.23, + "grad_norm": 3.182204965088374, + "learning_rate": 8.973820775867372e-06, + "loss": 0.487, + "step": 2946 + }, + { + "epoch": 0.23, + "grad_norm": 0.6143727409805179, + "learning_rate": 8.97304877040443e-06, + "loss": 0.5637, + "step": 2947 + }, + { + "epoch": 0.23, + "grad_norm": 2.3788805637516175, + "learning_rate": 8.97227650789043e-06, + "loss": 0.4783, + "step": 2948 + }, + { + "epoch": 0.23, + "grad_norm": 0.5494322091186193, + "learning_rate": 8.971503988375335e-06, + "loss": 0.5201, + "step": 2949 + }, + { + "epoch": 0.23, + "grad_norm": 1.444057326039935, + "learning_rate": 8.970731211909129e-06, + "loss": 0.4852, + "step": 2950 + }, + { + "epoch": 0.23, + "grad_norm": 1.911288284899553, + "learning_rate": 8.969958178541807e-06, + "loss": 0.5258, + "step": 2951 + }, + { + "epoch": 0.23, + "grad_norm": 1.5100572561690182, + "learning_rate": 8.969184888323383e-06, + "loss": 0.4999, + "step": 2952 + }, + { + "epoch": 0.23, + "grad_norm": 1.3126743767315752, + "learning_rate": 8.968411341303892e-06, + "loss": 0.526, + "step": 2953 + }, + { + "epoch": 0.23, + "grad_norm": 1.525898642698643, + "learning_rate": 8.967637537533376e-06, + "loss": 0.495, + "step": 2954 + }, + { + "epoch": 0.23, + "grad_norm": 1.4456612731284988, + "learning_rate": 8.966863477061903e-06, + "loss": 0.496, + "step": 2955 + }, + { + "epoch": 0.23, + "grad_norm": 1.596646481271798, + "learning_rate": 8.966089159939552e-06, + "loss": 0.531, + "step": 2956 + }, + { + "epoch": 0.23, + "grad_norm": 1.4436116533246504, + "learning_rate": 8.965314586216421e-06, + "loss": 0.4832, + "step": 2957 + }, + { + "epoch": 0.23, + "grad_norm": 1.692619191648268, + "learning_rate": 8.964539755942623e-06, + "loss": 0.4841, + "step": 2958 + }, + { + "epoch": 0.23, + "grad_norm": 2.254709168430868, + "learning_rate": 8.963764669168289e-06, + "loss": 0.5293, + "step": 2959 + }, + { + "epoch": 0.23, + "grad_norm": 1.9243015500359633, + "learning_rate": 8.962989325943566e-06, + "loss": 0.4826, + "step": 2960 + }, + { + "epoch": 0.23, + "grad_norm": 0.7831459984966213, + "learning_rate": 8.962213726318619e-06, + "loss": 0.5548, + "step": 2961 + }, + { + "epoch": 0.23, + "grad_norm": 1.6708816241947577, + "learning_rate": 8.961437870343626e-06, + "loss": 0.5295, + "step": 2962 + }, + { + "epoch": 0.23, + "grad_norm": 1.7050027458258565, + "learning_rate": 8.960661758068784e-06, + "loss": 0.5147, + "step": 2963 + }, + { + "epoch": 0.23, + "grad_norm": 1.9510175835763244, + "learning_rate": 8.959885389544309e-06, + "loss": 0.4719, + "step": 2964 + }, + { + "epoch": 0.23, + "grad_norm": 1.3616403640203567, + "learning_rate": 8.95910876482043e-06, + "loss": 0.4837, + "step": 2965 + }, + { + "epoch": 0.23, + "grad_norm": 2.1973680633989634, + "learning_rate": 8.958331883947394e-06, + "loss": 0.4772, + "step": 2966 + }, + { + "epoch": 0.23, + "grad_norm": 1.90835177558252, + "learning_rate": 8.95755474697546e-06, + "loss": 0.4957, + "step": 2967 + }, + { + "epoch": 0.23, + "grad_norm": 2.1293848871189254, + "learning_rate": 8.956777353954913e-06, + "loss": 0.5165, + "step": 2968 + }, + { + "epoch": 0.23, + "grad_norm": 0.6288944396527829, + "learning_rate": 8.955999704936048e-06, + "loss": 0.5527, + "step": 2969 + }, + { + "epoch": 0.23, + "grad_norm": 2.7808352663163705, + "learning_rate": 8.955221799969175e-06, + "loss": 0.5215, + "step": 2970 + }, + { + "epoch": 0.23, + "grad_norm": 1.7316313795696343, + "learning_rate": 8.954443639104627e-06, + "loss": 0.4577, + "step": 2971 + }, + { + "epoch": 0.23, + "grad_norm": 1.78196437812799, + "learning_rate": 8.953665222392749e-06, + "loss": 0.5178, + "step": 2972 + }, + { + "epoch": 0.23, + "grad_norm": 2.1785705724047957, + "learning_rate": 8.952886549883903e-06, + "loss": 0.4842, + "step": 2973 + }, + { + "epoch": 0.23, + "grad_norm": 0.6199279242639198, + "learning_rate": 8.952107621628467e-06, + "loss": 0.5553, + "step": 2974 + }, + { + "epoch": 0.23, + "grad_norm": 1.4784759421501155, + "learning_rate": 8.951328437676838e-06, + "loss": 0.5147, + "step": 2975 + }, + { + "epoch": 0.23, + "grad_norm": 2.6102532278792783, + "learning_rate": 8.95054899807943e-06, + "loss": 0.5012, + "step": 2976 + }, + { + "epoch": 0.23, + "grad_norm": 1.5174904290569684, + "learning_rate": 8.949769302886668e-06, + "loss": 0.5092, + "step": 2977 + }, + { + "epoch": 0.23, + "grad_norm": 1.7058770722273573, + "learning_rate": 8.948989352149e-06, + "loss": 0.4989, + "step": 2978 + }, + { + "epoch": 0.23, + "grad_norm": 0.6347562946065516, + "learning_rate": 8.948209145916887e-06, + "loss": 0.5355, + "step": 2979 + }, + { + "epoch": 0.23, + "grad_norm": 2.163904549459621, + "learning_rate": 8.947428684240806e-06, + "loss": 0.5093, + "step": 2980 + }, + { + "epoch": 0.23, + "grad_norm": 1.7219955383657253, + "learning_rate": 8.946647967171254e-06, + "loss": 0.5675, + "step": 2981 + }, + { + "epoch": 0.23, + "grad_norm": 1.6072410792437541, + "learning_rate": 8.945866994758741e-06, + "loss": 0.5143, + "step": 2982 + }, + { + "epoch": 0.23, + "grad_norm": 1.4144966452187229, + "learning_rate": 8.945085767053795e-06, + "loss": 0.5257, + "step": 2983 + }, + { + "epoch": 0.23, + "grad_norm": 1.2775274421513896, + "learning_rate": 8.944304284106962e-06, + "loss": 0.544, + "step": 2984 + }, + { + "epoch": 0.23, + "grad_norm": 0.5597746369937876, + "learning_rate": 8.9435225459688e-06, + "loss": 0.543, + "step": 2985 + }, + { + "epoch": 0.23, + "grad_norm": 1.8733101885218788, + "learning_rate": 8.942740552689889e-06, + "loss": 0.5263, + "step": 2986 + }, + { + "epoch": 0.23, + "grad_norm": 15.492842926976435, + "learning_rate": 8.941958304320822e-06, + "loss": 0.497, + "step": 2987 + }, + { + "epoch": 0.23, + "grad_norm": 1.6270224114243583, + "learning_rate": 8.941175800912208e-06, + "loss": 0.5283, + "step": 2988 + }, + { + "epoch": 0.23, + "grad_norm": 1.463116484662335, + "learning_rate": 8.940393042514677e-06, + "loss": 0.4839, + "step": 2989 + }, + { + "epoch": 0.23, + "grad_norm": 2.076617583798195, + "learning_rate": 8.93961002917887e-06, + "loss": 0.4493, + "step": 2990 + }, + { + "epoch": 0.23, + "grad_norm": 1.9022759452459503, + "learning_rate": 8.938826760955448e-06, + "loss": 0.5055, + "step": 2991 + }, + { + "epoch": 0.23, + "grad_norm": 1.4830167986237655, + "learning_rate": 8.938043237895088e-06, + "loss": 0.4793, + "step": 2992 + }, + { + "epoch": 0.24, + "grad_norm": 1.2882167507977618, + "learning_rate": 8.93725946004848e-06, + "loss": 0.5053, + "step": 2993 + }, + { + "epoch": 0.24, + "grad_norm": 2.0489027367616477, + "learning_rate": 8.936475427466337e-06, + "loss": 0.475, + "step": 2994 + }, + { + "epoch": 0.24, + "grad_norm": 1.7106351183014727, + "learning_rate": 8.935691140199384e-06, + "loss": 0.4982, + "step": 2995 + }, + { + "epoch": 0.24, + "grad_norm": 1.5194274829594947, + "learning_rate": 8.934906598298362e-06, + "loss": 0.519, + "step": 2996 + }, + { + "epoch": 0.24, + "grad_norm": 2.3120612924456894, + "learning_rate": 8.934121801814031e-06, + "loss": 0.4814, + "step": 2997 + }, + { + "epoch": 0.24, + "grad_norm": 0.5880876238398923, + "learning_rate": 8.933336750797167e-06, + "loss": 0.5339, + "step": 2998 + }, + { + "epoch": 0.24, + "grad_norm": 2.4282950857027745, + "learning_rate": 8.932551445298557e-06, + "loss": 0.4914, + "step": 2999 + }, + { + "epoch": 0.24, + "grad_norm": 1.3462086628833854, + "learning_rate": 8.931765885369015e-06, + "loss": 0.4673, + "step": 3000 + }, + { + "epoch": 0.24, + "grad_norm": 1.586298512310245, + "learning_rate": 8.930980071059364e-06, + "loss": 0.5219, + "step": 3001 + }, + { + "epoch": 0.24, + "grad_norm": 1.7248005822900319, + "learning_rate": 8.930194002420444e-06, + "loss": 0.5072, + "step": 3002 + }, + { + "epoch": 0.24, + "grad_norm": 1.8035215228246972, + "learning_rate": 8.929407679503116e-06, + "loss": 0.5165, + "step": 3003 + }, + { + "epoch": 0.24, + "grad_norm": 1.4122584097906916, + "learning_rate": 8.928621102358248e-06, + "loss": 0.4661, + "step": 3004 + }, + { + "epoch": 0.24, + "grad_norm": 1.6308846017127305, + "learning_rate": 8.927834271036736e-06, + "loss": 0.5113, + "step": 3005 + }, + { + "epoch": 0.24, + "grad_norm": 1.473294475759395, + "learning_rate": 8.927047185589484e-06, + "loss": 0.5185, + "step": 3006 + }, + { + "epoch": 0.24, + "grad_norm": 2.0268388645750672, + "learning_rate": 8.926259846067417e-06, + "loss": 0.5465, + "step": 3007 + }, + { + "epoch": 0.24, + "grad_norm": 1.8867958940032006, + "learning_rate": 8.925472252521473e-06, + "loss": 0.5201, + "step": 3008 + }, + { + "epoch": 0.24, + "grad_norm": 0.6498501779556012, + "learning_rate": 8.924684405002611e-06, + "loss": 0.5606, + "step": 3009 + }, + { + "epoch": 0.24, + "grad_norm": 1.7589893741212537, + "learning_rate": 8.9238963035618e-06, + "loss": 0.559, + "step": 3010 + }, + { + "epoch": 0.24, + "grad_norm": 0.5745117020075853, + "learning_rate": 8.923107948250034e-06, + "loss": 0.5285, + "step": 3011 + }, + { + "epoch": 0.24, + "grad_norm": 0.5677130676196426, + "learning_rate": 8.922319339118314e-06, + "loss": 0.5534, + "step": 3012 + }, + { + "epoch": 0.24, + "grad_norm": 0.6395081549772318, + "learning_rate": 8.921530476217664e-06, + "loss": 0.5548, + "step": 3013 + }, + { + "epoch": 0.24, + "grad_norm": 1.5797526429901023, + "learning_rate": 8.920741359599121e-06, + "loss": 0.4914, + "step": 3014 + }, + { + "epoch": 0.24, + "grad_norm": 1.573935855623021, + "learning_rate": 8.919951989313744e-06, + "loss": 0.4837, + "step": 3015 + }, + { + "epoch": 0.24, + "grad_norm": 2.7279511052330405, + "learning_rate": 8.919162365412599e-06, + "loss": 0.5043, + "step": 3016 + }, + { + "epoch": 0.24, + "grad_norm": 1.5770046852717214, + "learning_rate": 8.918372487946778e-06, + "loss": 0.526, + "step": 3017 + }, + { + "epoch": 0.24, + "grad_norm": 0.631091561852185, + "learning_rate": 8.91758235696738e-06, + "loss": 0.558, + "step": 3018 + }, + { + "epoch": 0.24, + "grad_norm": 1.3805439043420007, + "learning_rate": 8.91679197252553e-06, + "loss": 0.5068, + "step": 3019 + }, + { + "epoch": 0.24, + "grad_norm": 1.4903873138654185, + "learning_rate": 8.916001334672364e-06, + "loss": 0.5263, + "step": 3020 + }, + { + "epoch": 0.24, + "grad_norm": 1.3237904314031874, + "learning_rate": 8.915210443459032e-06, + "loss": 0.4857, + "step": 3021 + }, + { + "epoch": 0.24, + "grad_norm": 1.5360733782311171, + "learning_rate": 8.914419298936707e-06, + "loss": 0.409, + "step": 3022 + }, + { + "epoch": 0.24, + "grad_norm": 1.9466591946127414, + "learning_rate": 8.913627901156575e-06, + "loss": 0.5492, + "step": 3023 + }, + { + "epoch": 0.24, + "grad_norm": 1.7015465361947841, + "learning_rate": 8.912836250169836e-06, + "loss": 0.5398, + "step": 3024 + }, + { + "epoch": 0.24, + "grad_norm": 2.027560627948112, + "learning_rate": 8.912044346027713e-06, + "loss": 0.5216, + "step": 3025 + }, + { + "epoch": 0.24, + "grad_norm": 1.563701008560731, + "learning_rate": 8.911252188781436e-06, + "loss": 0.4956, + "step": 3026 + }, + { + "epoch": 0.24, + "grad_norm": 1.3813243487670344, + "learning_rate": 8.910459778482259e-06, + "loss": 0.443, + "step": 3027 + }, + { + "epoch": 0.24, + "grad_norm": 1.8919595423101965, + "learning_rate": 8.909667115181451e-06, + "loss": 0.4951, + "step": 3028 + }, + { + "epoch": 0.24, + "grad_norm": 1.3593557255606197, + "learning_rate": 8.908874198930295e-06, + "loss": 0.5014, + "step": 3029 + }, + { + "epoch": 0.24, + "grad_norm": 1.4126988804293257, + "learning_rate": 8.90808102978009e-06, + "loss": 0.4787, + "step": 3030 + }, + { + "epoch": 0.24, + "grad_norm": 4.294230115784155, + "learning_rate": 8.907287607782155e-06, + "loss": 0.4903, + "step": 3031 + }, + { + "epoch": 0.24, + "grad_norm": 2.18623492178922, + "learning_rate": 8.906493932987826e-06, + "loss": 0.4897, + "step": 3032 + }, + { + "epoch": 0.24, + "grad_norm": 0.611104192218659, + "learning_rate": 8.905700005448448e-06, + "loss": 0.5437, + "step": 3033 + }, + { + "epoch": 0.24, + "grad_norm": 1.7810415694281176, + "learning_rate": 8.904905825215388e-06, + "loss": 0.5327, + "step": 3034 + }, + { + "epoch": 0.24, + "grad_norm": 1.558021654524866, + "learning_rate": 8.904111392340032e-06, + "loss": 0.4348, + "step": 3035 + }, + { + "epoch": 0.24, + "grad_norm": 0.6221079372879845, + "learning_rate": 8.903316706873774e-06, + "loss": 0.5292, + "step": 3036 + }, + { + "epoch": 0.24, + "grad_norm": 1.4770482458714866, + "learning_rate": 8.902521768868031e-06, + "loss": 0.5241, + "step": 3037 + }, + { + "epoch": 0.24, + "grad_norm": 1.943158151224759, + "learning_rate": 8.901726578374236e-06, + "loss": 0.4686, + "step": 3038 + }, + { + "epoch": 0.24, + "grad_norm": 0.5914701802293288, + "learning_rate": 8.900931135443836e-06, + "loss": 0.5206, + "step": 3039 + }, + { + "epoch": 0.24, + "grad_norm": 1.7305636577727892, + "learning_rate": 8.900135440128293e-06, + "loss": 0.4806, + "step": 3040 + }, + { + "epoch": 0.24, + "grad_norm": 2.1038927682171797, + "learning_rate": 8.899339492479089e-06, + "loss": 0.564, + "step": 3041 + }, + { + "epoch": 0.24, + "grad_norm": 1.5551081292089663, + "learning_rate": 8.898543292547722e-06, + "loss": 0.566, + "step": 3042 + }, + { + "epoch": 0.24, + "grad_norm": 0.5618309861926594, + "learning_rate": 8.897746840385702e-06, + "loss": 0.521, + "step": 3043 + }, + { + "epoch": 0.24, + "grad_norm": 1.717537312785067, + "learning_rate": 8.896950136044562e-06, + "loss": 0.5539, + "step": 3044 + }, + { + "epoch": 0.24, + "grad_norm": 1.469427370536025, + "learning_rate": 8.896153179575846e-06, + "loss": 0.5036, + "step": 3045 + }, + { + "epoch": 0.24, + "grad_norm": 1.7534648726901858, + "learning_rate": 8.895355971031115e-06, + "loss": 0.5236, + "step": 3046 + }, + { + "epoch": 0.24, + "grad_norm": 1.9419696136579172, + "learning_rate": 8.89455851046195e-06, + "loss": 0.4723, + "step": 3047 + }, + { + "epoch": 0.24, + "grad_norm": 1.4934485368523365, + "learning_rate": 8.893760797919944e-06, + "loss": 0.478, + "step": 3048 + }, + { + "epoch": 0.24, + "grad_norm": 2.1092954186614357, + "learning_rate": 8.892962833456707e-06, + "loss": 0.5428, + "step": 3049 + }, + { + "epoch": 0.24, + "grad_norm": 1.4721703253080265, + "learning_rate": 8.892164617123868e-06, + "loss": 0.4657, + "step": 3050 + }, + { + "epoch": 0.24, + "grad_norm": 1.6062260899251004, + "learning_rate": 8.891366148973068e-06, + "loss": 0.4943, + "step": 3051 + }, + { + "epoch": 0.24, + "grad_norm": 1.6322899350853253, + "learning_rate": 8.890567429055971e-06, + "loss": 0.5577, + "step": 3052 + }, + { + "epoch": 0.24, + "grad_norm": 1.4255870652846119, + "learning_rate": 8.889768457424251e-06, + "loss": 0.5044, + "step": 3053 + }, + { + "epoch": 0.24, + "grad_norm": 1.9267448471780224, + "learning_rate": 8.8889692341296e-06, + "loss": 0.4607, + "step": 3054 + }, + { + "epoch": 0.24, + "grad_norm": 2.1795041577052054, + "learning_rate": 8.888169759223724e-06, + "loss": 0.4737, + "step": 3055 + }, + { + "epoch": 0.24, + "grad_norm": 1.27235228077261, + "learning_rate": 8.887370032758354e-06, + "loss": 0.4158, + "step": 3056 + }, + { + "epoch": 0.24, + "grad_norm": 1.293772124217995, + "learning_rate": 8.886570054785229e-06, + "loss": 0.4545, + "step": 3057 + }, + { + "epoch": 0.24, + "grad_norm": 1.7006109566993444, + "learning_rate": 8.885769825356103e-06, + "loss": 0.5323, + "step": 3058 + }, + { + "epoch": 0.24, + "grad_norm": 1.552584860623139, + "learning_rate": 8.884969344522754e-06, + "loss": 0.5008, + "step": 3059 + }, + { + "epoch": 0.24, + "grad_norm": 1.6032096740093893, + "learning_rate": 8.88416861233697e-06, + "loss": 0.5167, + "step": 3060 + }, + { + "epoch": 0.24, + "grad_norm": 1.4262951209898767, + "learning_rate": 8.883367628850559e-06, + "loss": 0.4652, + "step": 3061 + }, + { + "epoch": 0.24, + "grad_norm": 1.5277024225253921, + "learning_rate": 8.882566394115342e-06, + "loss": 0.4611, + "step": 3062 + }, + { + "epoch": 0.24, + "grad_norm": 1.8187640992830203, + "learning_rate": 8.881764908183158e-06, + "loss": 0.4959, + "step": 3063 + }, + { + "epoch": 0.24, + "grad_norm": 1.7174432185726323, + "learning_rate": 8.88096317110586e-06, + "loss": 0.4993, + "step": 3064 + }, + { + "epoch": 0.24, + "grad_norm": 1.5487884835373051, + "learning_rate": 8.880161182935325e-06, + "loss": 0.4942, + "step": 3065 + }, + { + "epoch": 0.24, + "grad_norm": 1.3239023171077364, + "learning_rate": 8.879358943723437e-06, + "loss": 0.5076, + "step": 3066 + }, + { + "epoch": 0.24, + "grad_norm": 1.5179132662349666, + "learning_rate": 8.8785564535221e-06, + "loss": 0.4897, + "step": 3067 + }, + { + "epoch": 0.24, + "grad_norm": 0.7359474439774842, + "learning_rate": 8.877753712383233e-06, + "loss": 0.5694, + "step": 3068 + }, + { + "epoch": 0.24, + "grad_norm": 1.512267998587302, + "learning_rate": 8.876950720358775e-06, + "loss": 0.5669, + "step": 3069 + }, + { + "epoch": 0.24, + "grad_norm": 1.5229756802237184, + "learning_rate": 8.876147477500677e-06, + "loss": 0.4775, + "step": 3070 + }, + { + "epoch": 0.24, + "grad_norm": 1.5655739320285087, + "learning_rate": 8.875343983860909e-06, + "loss": 0.492, + "step": 3071 + }, + { + "epoch": 0.24, + "grad_norm": 1.4279273924498868, + "learning_rate": 8.874540239491451e-06, + "loss": 0.5207, + "step": 3072 + }, + { + "epoch": 0.24, + "grad_norm": 1.4554563971275536, + "learning_rate": 8.873736244444311e-06, + "loss": 0.5252, + "step": 3073 + }, + { + "epoch": 0.24, + "grad_norm": 3.1976722123173635, + "learning_rate": 8.872931998771503e-06, + "loss": 0.5435, + "step": 3074 + }, + { + "epoch": 0.24, + "grad_norm": 4.829944928659648, + "learning_rate": 8.87212750252506e-06, + "loss": 0.4896, + "step": 3075 + }, + { + "epoch": 0.24, + "grad_norm": 1.4530745990655476, + "learning_rate": 8.871322755757035e-06, + "loss": 0.5021, + "step": 3076 + }, + { + "epoch": 0.24, + "grad_norm": 0.6407994643413658, + "learning_rate": 8.87051775851949e-06, + "loss": 0.5332, + "step": 3077 + }, + { + "epoch": 0.24, + "grad_norm": 1.752968774268347, + "learning_rate": 8.86971251086451e-06, + "loss": 0.5502, + "step": 3078 + }, + { + "epoch": 0.24, + "grad_norm": 1.9166724487872593, + "learning_rate": 8.868907012844194e-06, + "loss": 0.5259, + "step": 3079 + }, + { + "epoch": 0.24, + "grad_norm": 1.3852884581115712, + "learning_rate": 8.868101264510654e-06, + "loss": 0.5006, + "step": 3080 + }, + { + "epoch": 0.24, + "grad_norm": 1.5369636900006798, + "learning_rate": 8.867295265916023e-06, + "loss": 0.5076, + "step": 3081 + }, + { + "epoch": 0.24, + "grad_norm": 2.051539483656026, + "learning_rate": 8.866489017112448e-06, + "loss": 0.5207, + "step": 3082 + }, + { + "epoch": 0.24, + "grad_norm": 1.5526077143090808, + "learning_rate": 8.86568251815209e-06, + "loss": 0.5364, + "step": 3083 + }, + { + "epoch": 0.24, + "grad_norm": 1.5285930571826203, + "learning_rate": 8.864875769087131e-06, + "loss": 0.4797, + "step": 3084 + }, + { + "epoch": 0.24, + "grad_norm": 1.416139148730127, + "learning_rate": 8.864068769969766e-06, + "loss": 0.5016, + "step": 3085 + }, + { + "epoch": 0.24, + "grad_norm": 1.5366863759112341, + "learning_rate": 8.863261520852205e-06, + "loss": 0.5016, + "step": 3086 + }, + { + "epoch": 0.24, + "grad_norm": 0.6097807415807062, + "learning_rate": 8.862454021786678e-06, + "loss": 0.5412, + "step": 3087 + }, + { + "epoch": 0.24, + "grad_norm": 1.3168312456560078, + "learning_rate": 8.861646272825429e-06, + "loss": 0.4689, + "step": 3088 + }, + { + "epoch": 0.24, + "grad_norm": 1.4699890118151353, + "learning_rate": 8.860838274020717e-06, + "loss": 0.5165, + "step": 3089 + }, + { + "epoch": 0.24, + "grad_norm": 1.514821563746873, + "learning_rate": 8.860030025424819e-06, + "loss": 0.4826, + "step": 3090 + }, + { + "epoch": 0.24, + "grad_norm": 1.5031885803853393, + "learning_rate": 8.85922152709003e-06, + "loss": 0.4759, + "step": 3091 + }, + { + "epoch": 0.24, + "grad_norm": 1.4441384732414912, + "learning_rate": 8.858412779068654e-06, + "loss": 0.4427, + "step": 3092 + }, + { + "epoch": 0.24, + "grad_norm": 1.774647452200091, + "learning_rate": 8.857603781413021e-06, + "loss": 0.5301, + "step": 3093 + }, + { + "epoch": 0.24, + "grad_norm": 0.5542537272717182, + "learning_rate": 8.856794534175468e-06, + "loss": 0.5343, + "step": 3094 + }, + { + "epoch": 0.24, + "grad_norm": 1.4553466995449236, + "learning_rate": 8.855985037408355e-06, + "loss": 0.4763, + "step": 3095 + }, + { + "epoch": 0.24, + "grad_norm": 1.7004284650344321, + "learning_rate": 8.855175291164055e-06, + "loss": 0.4687, + "step": 3096 + }, + { + "epoch": 0.24, + "grad_norm": 1.512420646989729, + "learning_rate": 8.854365295494956e-06, + "loss": 0.5527, + "step": 3097 + }, + { + "epoch": 0.24, + "grad_norm": 1.448911627850262, + "learning_rate": 8.853555050453465e-06, + "loss": 0.4444, + "step": 3098 + }, + { + "epoch": 0.24, + "grad_norm": 1.4076264983860287, + "learning_rate": 8.852744556092002e-06, + "loss": 0.509, + "step": 3099 + }, + { + "epoch": 0.24, + "grad_norm": 2.268669756593638, + "learning_rate": 8.851933812463008e-06, + "loss": 0.4879, + "step": 3100 + }, + { + "epoch": 0.24, + "grad_norm": 1.349123948896521, + "learning_rate": 8.851122819618933e-06, + "loss": 0.502, + "step": 3101 + }, + { + "epoch": 0.24, + "grad_norm": 1.6626940076930226, + "learning_rate": 8.85031157761225e-06, + "loss": 0.5764, + "step": 3102 + }, + { + "epoch": 0.24, + "grad_norm": 1.3723081163137278, + "learning_rate": 8.849500086495446e-06, + "loss": 0.4865, + "step": 3103 + }, + { + "epoch": 0.24, + "grad_norm": 1.5138858463693807, + "learning_rate": 8.84868834632102e-06, + "loss": 0.5187, + "step": 3104 + }, + { + "epoch": 0.24, + "grad_norm": 2.0433399833446817, + "learning_rate": 8.847876357141496e-06, + "loss": 0.5008, + "step": 3105 + }, + { + "epoch": 0.24, + "grad_norm": 1.8331221720906892, + "learning_rate": 8.847064119009405e-06, + "loss": 0.5745, + "step": 3106 + }, + { + "epoch": 0.24, + "grad_norm": 1.7511896801048878, + "learning_rate": 8.846251631977295e-06, + "loss": 0.5266, + "step": 3107 + }, + { + "epoch": 0.24, + "grad_norm": 1.4466839052672444, + "learning_rate": 8.845438896097738e-06, + "loss": 0.4793, + "step": 3108 + }, + { + "epoch": 0.24, + "grad_norm": 3.2681686985736493, + "learning_rate": 8.844625911423315e-06, + "loss": 0.5385, + "step": 3109 + }, + { + "epoch": 0.24, + "grad_norm": 1.7535888439970788, + "learning_rate": 8.843812678006624e-06, + "loss": 0.4582, + "step": 3110 + }, + { + "epoch": 0.24, + "grad_norm": 1.6580119491188257, + "learning_rate": 8.842999195900283e-06, + "loss": 0.5354, + "step": 3111 + }, + { + "epoch": 0.24, + "grad_norm": 2.339098954009609, + "learning_rate": 8.842185465156919e-06, + "loss": 0.4935, + "step": 3112 + }, + { + "epoch": 0.24, + "grad_norm": 1.7020675605517868, + "learning_rate": 8.841371485829183e-06, + "loss": 0.5185, + "step": 3113 + }, + { + "epoch": 0.24, + "grad_norm": 0.6753045389856148, + "learning_rate": 8.840557257969736e-06, + "loss": 0.5386, + "step": 3114 + }, + { + "epoch": 0.24, + "grad_norm": 1.749086134653893, + "learning_rate": 8.83974278163126e-06, + "loss": 0.5324, + "step": 3115 + }, + { + "epoch": 0.24, + "grad_norm": 2.365155516834661, + "learning_rate": 8.838928056866447e-06, + "loss": 0.5456, + "step": 3116 + }, + { + "epoch": 0.24, + "grad_norm": 1.4510773906155148, + "learning_rate": 8.838113083728012e-06, + "loss": 0.563, + "step": 3117 + }, + { + "epoch": 0.24, + "grad_norm": 2.290808827295803, + "learning_rate": 8.837297862268682e-06, + "loss": 0.4781, + "step": 3118 + }, + { + "epoch": 0.24, + "grad_norm": 1.626076260297186, + "learning_rate": 8.836482392541199e-06, + "loss": 0.4993, + "step": 3119 + }, + { + "epoch": 0.25, + "grad_norm": 1.8673669963616522, + "learning_rate": 8.835666674598325e-06, + "loss": 0.4477, + "step": 3120 + }, + { + "epoch": 0.25, + "grad_norm": 1.7910083470834832, + "learning_rate": 8.834850708492834e-06, + "loss": 0.4685, + "step": 3121 + }, + { + "epoch": 0.25, + "grad_norm": 1.8399016600547733, + "learning_rate": 8.83403449427752e-06, + "loss": 0.549, + "step": 3122 + }, + { + "epoch": 0.25, + "grad_norm": 1.6847530645609963, + "learning_rate": 8.833218032005187e-06, + "loss": 0.4842, + "step": 3123 + }, + { + "epoch": 0.25, + "grad_norm": 1.8826394963854247, + "learning_rate": 8.832401321728663e-06, + "loss": 0.4647, + "step": 3124 + }, + { + "epoch": 0.25, + "grad_norm": 1.7610924767354525, + "learning_rate": 8.831584363500787e-06, + "loss": 0.5497, + "step": 3125 + }, + { + "epoch": 0.25, + "grad_norm": 2.0676635122026816, + "learning_rate": 8.830767157374415e-06, + "loss": 0.4936, + "step": 3126 + }, + { + "epoch": 0.25, + "grad_norm": 1.9387568015925745, + "learning_rate": 8.829949703402421e-06, + "loss": 0.4648, + "step": 3127 + }, + { + "epoch": 0.25, + "grad_norm": 1.7376162664313684, + "learning_rate": 8.82913200163769e-06, + "loss": 0.4809, + "step": 3128 + }, + { + "epoch": 0.25, + "grad_norm": 1.6642056388818773, + "learning_rate": 8.828314052133126e-06, + "loss": 0.4788, + "step": 3129 + }, + { + "epoch": 0.25, + "grad_norm": 1.5203849793825879, + "learning_rate": 8.827495854941654e-06, + "loss": 0.5479, + "step": 3130 + }, + { + "epoch": 0.25, + "grad_norm": 1.8140951860702474, + "learning_rate": 8.826677410116206e-06, + "loss": 0.5058, + "step": 3131 + }, + { + "epoch": 0.25, + "grad_norm": 1.7610510568990287, + "learning_rate": 8.825858717709734e-06, + "loss": 0.4938, + "step": 3132 + }, + { + "epoch": 0.25, + "grad_norm": 1.3969329516371154, + "learning_rate": 8.82503977777521e-06, + "loss": 0.4832, + "step": 3133 + }, + { + "epoch": 0.25, + "grad_norm": 0.7659137507976672, + "learning_rate": 8.824220590365616e-06, + "loss": 0.5557, + "step": 3134 + }, + { + "epoch": 0.25, + "grad_norm": 1.4229605554275584, + "learning_rate": 8.82340115553395e-06, + "loss": 0.5249, + "step": 3135 + }, + { + "epoch": 0.25, + "grad_norm": 1.533844320104774, + "learning_rate": 8.822581473333233e-06, + "loss": 0.4495, + "step": 3136 + }, + { + "epoch": 0.25, + "grad_norm": 1.902563259383878, + "learning_rate": 8.821761543816493e-06, + "loss": 0.5091, + "step": 3137 + }, + { + "epoch": 0.25, + "grad_norm": 1.5093162421573296, + "learning_rate": 8.820941367036784e-06, + "loss": 0.465, + "step": 3138 + }, + { + "epoch": 0.25, + "grad_norm": 1.9304759091326718, + "learning_rate": 8.820120943047166e-06, + "loss": 0.5007, + "step": 3139 + }, + { + "epoch": 0.25, + "grad_norm": 1.2361609085672767, + "learning_rate": 8.819300271900719e-06, + "loss": 0.4687, + "step": 3140 + }, + { + "epoch": 0.25, + "grad_norm": 1.8083382332595386, + "learning_rate": 8.818479353650543e-06, + "loss": 0.5002, + "step": 3141 + }, + { + "epoch": 0.25, + "grad_norm": 2.0860660948885523, + "learning_rate": 8.817658188349745e-06, + "loss": 0.5135, + "step": 3142 + }, + { + "epoch": 0.25, + "grad_norm": 1.7132751180378571, + "learning_rate": 8.816836776051458e-06, + "loss": 0.4742, + "step": 3143 + }, + { + "epoch": 0.25, + "grad_norm": 1.8744501727453178, + "learning_rate": 8.816015116808824e-06, + "loss": 0.5578, + "step": 3144 + }, + { + "epoch": 0.25, + "grad_norm": 1.5570388588450488, + "learning_rate": 8.815193210675004e-06, + "loss": 0.521, + "step": 3145 + }, + { + "epoch": 0.25, + "grad_norm": 1.7695743333176919, + "learning_rate": 8.814371057703175e-06, + "loss": 0.497, + "step": 3146 + }, + { + "epoch": 0.25, + "grad_norm": 0.7321807668042545, + "learning_rate": 8.813548657946527e-06, + "loss": 0.5667, + "step": 3147 + }, + { + "epoch": 0.25, + "grad_norm": 0.5875410576121596, + "learning_rate": 8.812726011458271e-06, + "loss": 0.5387, + "step": 3148 + }, + { + "epoch": 0.25, + "grad_norm": 1.4771052746617401, + "learning_rate": 8.81190311829163e-06, + "loss": 0.4709, + "step": 3149 + }, + { + "epoch": 0.25, + "grad_norm": 1.8513356688037073, + "learning_rate": 8.811079978499842e-06, + "loss": 0.4647, + "step": 3150 + }, + { + "epoch": 0.25, + "grad_norm": 0.6843832918379211, + "learning_rate": 8.810256592136167e-06, + "loss": 0.5568, + "step": 3151 + }, + { + "epoch": 0.25, + "grad_norm": 2.8728502062768015, + "learning_rate": 8.809432959253872e-06, + "loss": 0.5063, + "step": 3152 + }, + { + "epoch": 0.25, + "grad_norm": 1.3963022169318688, + "learning_rate": 8.80860907990625e-06, + "loss": 0.4604, + "step": 3153 + }, + { + "epoch": 0.25, + "grad_norm": 1.7736925305679498, + "learning_rate": 8.807784954146603e-06, + "loss": 0.4754, + "step": 3154 + }, + { + "epoch": 0.25, + "grad_norm": 2.0135890232706406, + "learning_rate": 8.80696058202825e-06, + "loss": 0.5439, + "step": 3155 + }, + { + "epoch": 0.25, + "grad_norm": 1.9332997922609096, + "learning_rate": 8.806135963604528e-06, + "loss": 0.4876, + "step": 3156 + }, + { + "epoch": 0.25, + "grad_norm": 1.7667936677673874, + "learning_rate": 8.805311098928786e-06, + "loss": 0.5294, + "step": 3157 + }, + { + "epoch": 0.25, + "grad_norm": 2.0882298186371524, + "learning_rate": 8.804485988054396e-06, + "loss": 0.5645, + "step": 3158 + }, + { + "epoch": 0.25, + "grad_norm": 1.5148688458297657, + "learning_rate": 8.80366063103474e-06, + "loss": 0.5245, + "step": 3159 + }, + { + "epoch": 0.25, + "grad_norm": 1.9781505062163962, + "learning_rate": 8.802835027923216e-06, + "loss": 0.5607, + "step": 3160 + }, + { + "epoch": 0.25, + "grad_norm": 1.9892915221923826, + "learning_rate": 8.80200917877324e-06, + "loss": 0.473, + "step": 3161 + }, + { + "epoch": 0.25, + "grad_norm": 7.830992108623623, + "learning_rate": 8.801183083638246e-06, + "loss": 0.5277, + "step": 3162 + }, + { + "epoch": 0.25, + "grad_norm": 1.5575430448215832, + "learning_rate": 8.800356742571677e-06, + "loss": 0.4697, + "step": 3163 + }, + { + "epoch": 0.25, + "grad_norm": 1.5257775414500692, + "learning_rate": 8.799530155626998e-06, + "loss": 0.5312, + "step": 3164 + }, + { + "epoch": 0.25, + "grad_norm": 1.5054808946340248, + "learning_rate": 8.79870332285769e-06, + "loss": 0.4893, + "step": 3165 + }, + { + "epoch": 0.25, + "grad_norm": 0.7468684242112201, + "learning_rate": 8.797876244317245e-06, + "loss": 0.5553, + "step": 3166 + }, + { + "epoch": 0.25, + "grad_norm": 1.5333496316874167, + "learning_rate": 8.797048920059176e-06, + "loss": 0.5021, + "step": 3167 + }, + { + "epoch": 0.25, + "grad_norm": 0.5846806974005915, + "learning_rate": 8.796221350137008e-06, + "loss": 0.5398, + "step": 3168 + }, + { + "epoch": 0.25, + "grad_norm": 0.5575635790752691, + "learning_rate": 8.795393534604287e-06, + "loss": 0.5495, + "step": 3169 + }, + { + "epoch": 0.25, + "grad_norm": 1.7613725761019852, + "learning_rate": 8.794565473514567e-06, + "loss": 0.4925, + "step": 3170 + }, + { + "epoch": 0.25, + "grad_norm": 1.804247965213342, + "learning_rate": 8.793737166921425e-06, + "loss": 0.503, + "step": 3171 + }, + { + "epoch": 0.25, + "grad_norm": 0.6884671644973206, + "learning_rate": 8.792908614878452e-06, + "loss": 0.5569, + "step": 3172 + }, + { + "epoch": 0.25, + "grad_norm": 1.8276788543739844, + "learning_rate": 8.792079817439254e-06, + "loss": 0.539, + "step": 3173 + }, + { + "epoch": 0.25, + "grad_norm": 1.508319866248653, + "learning_rate": 8.791250774657451e-06, + "loss": 0.5033, + "step": 3174 + }, + { + "epoch": 0.25, + "grad_norm": 1.5903042871739241, + "learning_rate": 8.790421486586683e-06, + "loss": 0.5093, + "step": 3175 + }, + { + "epoch": 0.25, + "grad_norm": 2.0328027879885724, + "learning_rate": 8.789591953280603e-06, + "loss": 0.5077, + "step": 3176 + }, + { + "epoch": 0.25, + "grad_norm": 1.411172637050618, + "learning_rate": 8.788762174792881e-06, + "loss": 0.4961, + "step": 3177 + }, + { + "epoch": 0.25, + "grad_norm": 0.6755275725927786, + "learning_rate": 8.787932151177202e-06, + "loss": 0.5175, + "step": 3178 + }, + { + "epoch": 0.25, + "grad_norm": 2.267851735282927, + "learning_rate": 8.78710188248727e-06, + "loss": 0.5038, + "step": 3179 + }, + { + "epoch": 0.25, + "grad_norm": 0.6633436385046281, + "learning_rate": 8.7862713687768e-06, + "loss": 0.5494, + "step": 3180 + }, + { + "epoch": 0.25, + "grad_norm": 1.580064818721464, + "learning_rate": 8.785440610099524e-06, + "loss": 0.5117, + "step": 3181 + }, + { + "epoch": 0.25, + "grad_norm": 1.6871990608212786, + "learning_rate": 8.784609606509194e-06, + "loss": 0.5067, + "step": 3182 + }, + { + "epoch": 0.25, + "grad_norm": 1.5325275845605506, + "learning_rate": 8.783778358059572e-06, + "loss": 0.5218, + "step": 3183 + }, + { + "epoch": 0.25, + "grad_norm": 2.500977311959191, + "learning_rate": 8.78294686480444e-06, + "loss": 0.5051, + "step": 3184 + }, + { + "epoch": 0.25, + "grad_norm": 1.8644052704952736, + "learning_rate": 8.782115126797596e-06, + "loss": 0.5185, + "step": 3185 + }, + { + "epoch": 0.25, + "grad_norm": 1.7437842477055276, + "learning_rate": 8.78128314409285e-06, + "loss": 0.4834, + "step": 3186 + }, + { + "epoch": 0.25, + "grad_norm": 1.7712264902130028, + "learning_rate": 8.780450916744031e-06, + "loss": 0.4954, + "step": 3187 + }, + { + "epoch": 0.25, + "grad_norm": 3.9195142499829734, + "learning_rate": 8.779618444804982e-06, + "loss": 0.4514, + "step": 3188 + }, + { + "epoch": 0.25, + "grad_norm": 0.7447346492184025, + "learning_rate": 8.778785728329566e-06, + "loss": 0.5433, + "step": 3189 + }, + { + "epoch": 0.25, + "grad_norm": 0.6649538228897592, + "learning_rate": 8.777952767371657e-06, + "loss": 0.5583, + "step": 3190 + }, + { + "epoch": 0.25, + "grad_norm": 3.3016235933557017, + "learning_rate": 8.777119561985145e-06, + "loss": 0.4531, + "step": 3191 + }, + { + "epoch": 0.25, + "grad_norm": 1.7996035092874412, + "learning_rate": 8.77628611222394e-06, + "loss": 0.5191, + "step": 3192 + }, + { + "epoch": 0.25, + "grad_norm": 1.4926075695635426, + "learning_rate": 8.775452418141961e-06, + "loss": 0.4726, + "step": 3193 + }, + { + "epoch": 0.25, + "grad_norm": 0.6793602704633813, + "learning_rate": 8.774618479793151e-06, + "loss": 0.5407, + "step": 3194 + }, + { + "epoch": 0.25, + "grad_norm": 2.0835045075478518, + "learning_rate": 8.773784297231463e-06, + "loss": 0.4688, + "step": 3195 + }, + { + "epoch": 0.25, + "grad_norm": 1.424885529519649, + "learning_rate": 8.772949870510867e-06, + "loss": 0.5112, + "step": 3196 + }, + { + "epoch": 0.25, + "grad_norm": 3.361354891643077, + "learning_rate": 8.772115199685352e-06, + "loss": 0.4576, + "step": 3197 + }, + { + "epoch": 0.25, + "grad_norm": 1.724132644475796, + "learning_rate": 8.771280284808917e-06, + "loss": 0.5238, + "step": 3198 + }, + { + "epoch": 0.25, + "grad_norm": 0.7404022362769855, + "learning_rate": 8.770445125935578e-06, + "loss": 0.5397, + "step": 3199 + }, + { + "epoch": 0.25, + "grad_norm": 0.6447434530001412, + "learning_rate": 8.769609723119375e-06, + "loss": 0.548, + "step": 3200 + }, + { + "epoch": 0.25, + "grad_norm": 1.4599295704522952, + "learning_rate": 8.768774076414354e-06, + "loss": 0.4834, + "step": 3201 + }, + { + "epoch": 0.25, + "grad_norm": 1.6013134169360055, + "learning_rate": 8.76793818587458e-06, + "loss": 0.4621, + "step": 3202 + }, + { + "epoch": 0.25, + "grad_norm": 1.8481091119131194, + "learning_rate": 8.767102051554135e-06, + "loss": 0.5225, + "step": 3203 + }, + { + "epoch": 0.25, + "grad_norm": 0.6465609485685071, + "learning_rate": 8.766265673507115e-06, + "loss": 0.519, + "step": 3204 + }, + { + "epoch": 0.25, + "grad_norm": 1.413333271809522, + "learning_rate": 8.765429051787632e-06, + "loss": 0.4871, + "step": 3205 + }, + { + "epoch": 0.25, + "grad_norm": 1.77399736349863, + "learning_rate": 8.764592186449816e-06, + "loss": 0.4449, + "step": 3206 + }, + { + "epoch": 0.25, + "grad_norm": 0.645696450137369, + "learning_rate": 8.76375507754781e-06, + "loss": 0.5292, + "step": 3207 + }, + { + "epoch": 0.25, + "grad_norm": 2.076529834975819, + "learning_rate": 8.762917725135774e-06, + "loss": 0.5211, + "step": 3208 + }, + { + "epoch": 0.25, + "grad_norm": 1.698390638238923, + "learning_rate": 8.762080129267884e-06, + "loss": 0.5324, + "step": 3209 + }, + { + "epoch": 0.25, + "grad_norm": 1.5026611251242934, + "learning_rate": 8.761242289998331e-06, + "loss": 0.4814, + "step": 3210 + }, + { + "epoch": 0.25, + "grad_norm": 2.7292116106622832, + "learning_rate": 8.76040420738132e-06, + "loss": 0.4996, + "step": 3211 + }, + { + "epoch": 0.25, + "grad_norm": 1.6149842253112265, + "learning_rate": 8.75956588147108e-06, + "loss": 0.4268, + "step": 3212 + }, + { + "epoch": 0.25, + "grad_norm": 2.358671104247102, + "learning_rate": 8.758727312321843e-06, + "loss": 0.5247, + "step": 3213 + }, + { + "epoch": 0.25, + "grad_norm": 0.6200749324635986, + "learning_rate": 8.757888499987867e-06, + "loss": 0.5221, + "step": 3214 + }, + { + "epoch": 0.25, + "grad_norm": 1.7562451300752853, + "learning_rate": 8.75704944452342e-06, + "loss": 0.5353, + "step": 3215 + }, + { + "epoch": 0.25, + "grad_norm": 1.523703295661214, + "learning_rate": 8.75621014598279e-06, + "loss": 0.4522, + "step": 3216 + }, + { + "epoch": 0.25, + "grad_norm": 2.6403191588425283, + "learning_rate": 8.755370604420275e-06, + "loss": 0.5001, + "step": 3217 + }, + { + "epoch": 0.25, + "grad_norm": 3.8222004723604748, + "learning_rate": 8.754530819890198e-06, + "loss": 0.4893, + "step": 3218 + }, + { + "epoch": 0.25, + "grad_norm": 1.5640569929709862, + "learning_rate": 8.753690792446885e-06, + "loss": 0.4924, + "step": 3219 + }, + { + "epoch": 0.25, + "grad_norm": 1.386318405315565, + "learning_rate": 8.752850522144689e-06, + "loss": 0.4846, + "step": 3220 + }, + { + "epoch": 0.25, + "grad_norm": 1.7216518695006389, + "learning_rate": 8.752010009037975e-06, + "loss": 0.4654, + "step": 3221 + }, + { + "epoch": 0.25, + "grad_norm": 1.5999725224290333, + "learning_rate": 8.751169253181121e-06, + "loss": 0.4966, + "step": 3222 + }, + { + "epoch": 0.25, + "grad_norm": 1.8484397076851256, + "learning_rate": 8.750328254628524e-06, + "loss": 0.4631, + "step": 3223 + }, + { + "epoch": 0.25, + "grad_norm": 0.6382728476558481, + "learning_rate": 8.749487013434594e-06, + "loss": 0.5595, + "step": 3224 + }, + { + "epoch": 0.25, + "grad_norm": 1.6363968018135961, + "learning_rate": 8.74864552965376e-06, + "loss": 0.4936, + "step": 3225 + }, + { + "epoch": 0.25, + "grad_norm": 1.6285678400003176, + "learning_rate": 8.747803803340463e-06, + "loss": 0.5568, + "step": 3226 + }, + { + "epoch": 0.25, + "grad_norm": 2.251318348222844, + "learning_rate": 8.746961834549163e-06, + "loss": 0.5065, + "step": 3227 + }, + { + "epoch": 0.25, + "grad_norm": 0.644789333543007, + "learning_rate": 8.746119623334335e-06, + "loss": 0.5445, + "step": 3228 + }, + { + "epoch": 0.25, + "grad_norm": 1.3160445098555669, + "learning_rate": 8.745277169750467e-06, + "loss": 0.443, + "step": 3229 + }, + { + "epoch": 0.25, + "grad_norm": 1.5362129657004773, + "learning_rate": 8.744434473852066e-06, + "loss": 0.5371, + "step": 3230 + }, + { + "epoch": 0.25, + "grad_norm": 1.669278323870098, + "learning_rate": 8.743591535693652e-06, + "loss": 0.5061, + "step": 3231 + }, + { + "epoch": 0.25, + "grad_norm": 1.6953628617773973, + "learning_rate": 8.742748355329764e-06, + "loss": 0.4661, + "step": 3232 + }, + { + "epoch": 0.25, + "grad_norm": 1.3445470792409033, + "learning_rate": 8.741904932814953e-06, + "loss": 0.4895, + "step": 3233 + }, + { + "epoch": 0.25, + "grad_norm": 2.003000671299978, + "learning_rate": 8.741061268203787e-06, + "loss": 0.5377, + "step": 3234 + }, + { + "epoch": 0.25, + "grad_norm": 1.4669265619273744, + "learning_rate": 8.740217361550853e-06, + "loss": 0.4834, + "step": 3235 + }, + { + "epoch": 0.25, + "grad_norm": 1.4673369310699333, + "learning_rate": 8.739373212910746e-06, + "loss": 0.5194, + "step": 3236 + }, + { + "epoch": 0.25, + "grad_norm": 3.0255072270781818, + "learning_rate": 8.738528822338086e-06, + "loss": 0.5364, + "step": 3237 + }, + { + "epoch": 0.25, + "grad_norm": 0.6034722141597749, + "learning_rate": 8.737684189887501e-06, + "loss": 0.5431, + "step": 3238 + }, + { + "epoch": 0.25, + "grad_norm": 1.7187732665533946, + "learning_rate": 8.736839315613638e-06, + "loss": 0.5219, + "step": 3239 + }, + { + "epoch": 0.25, + "grad_norm": 2.194592268510745, + "learning_rate": 8.73599419957116e-06, + "loss": 0.4634, + "step": 3240 + }, + { + "epoch": 0.25, + "grad_norm": 1.3821733391194253, + "learning_rate": 8.735148841814745e-06, + "loss": 0.5176, + "step": 3241 + }, + { + "epoch": 0.25, + "grad_norm": 2.067743053267743, + "learning_rate": 8.734303242399086e-06, + "loss": 0.4699, + "step": 3242 + }, + { + "epoch": 0.25, + "grad_norm": 1.653625425643597, + "learning_rate": 8.733457401378893e-06, + "loss": 0.4582, + "step": 3243 + }, + { + "epoch": 0.25, + "grad_norm": 1.850761426068509, + "learning_rate": 8.732611318808888e-06, + "loss": 0.4891, + "step": 3244 + }, + { + "epoch": 0.25, + "grad_norm": 1.5457580884012472, + "learning_rate": 8.731764994743814e-06, + "loss": 0.5367, + "step": 3245 + }, + { + "epoch": 0.25, + "grad_norm": 1.6879385865555376, + "learning_rate": 8.730918429238429e-06, + "loss": 0.4985, + "step": 3246 + }, + { + "epoch": 0.26, + "grad_norm": 1.629580502344068, + "learning_rate": 8.7300716223475e-06, + "loss": 0.5087, + "step": 3247 + }, + { + "epoch": 0.26, + "grad_norm": 1.5559450640212475, + "learning_rate": 8.729224574125818e-06, + "loss": 0.4897, + "step": 3248 + }, + { + "epoch": 0.26, + "grad_norm": 1.5759081461595328, + "learning_rate": 8.728377284628183e-06, + "loss": 0.4913, + "step": 3249 + }, + { + "epoch": 0.26, + "grad_norm": 1.4414065206820996, + "learning_rate": 8.727529753909417e-06, + "loss": 0.4893, + "step": 3250 + }, + { + "epoch": 0.26, + "grad_norm": 1.5000103084689527, + "learning_rate": 8.72668198202435e-06, + "loss": 0.4987, + "step": 3251 + }, + { + "epoch": 0.26, + "grad_norm": 0.6514408429427263, + "learning_rate": 8.725833969027835e-06, + "loss": 0.5395, + "step": 3252 + }, + { + "epoch": 0.26, + "grad_norm": 1.7633915136207532, + "learning_rate": 8.724985714974735e-06, + "loss": 0.489, + "step": 3253 + }, + { + "epoch": 0.26, + "grad_norm": 1.6115957997969865, + "learning_rate": 8.724137219919932e-06, + "loss": 0.51, + "step": 3254 + }, + { + "epoch": 0.26, + "grad_norm": 1.6810536140975803, + "learning_rate": 8.723288483918324e-06, + "loss": 0.5143, + "step": 3255 + }, + { + "epoch": 0.26, + "grad_norm": 1.4177634087851634, + "learning_rate": 8.722439507024823e-06, + "loss": 0.5046, + "step": 3256 + }, + { + "epoch": 0.26, + "grad_norm": 1.3166413699116888, + "learning_rate": 8.721590289294353e-06, + "loss": 0.5099, + "step": 3257 + }, + { + "epoch": 0.26, + "grad_norm": 3.955926209630783, + "learning_rate": 8.720740830781862e-06, + "loss": 0.5019, + "step": 3258 + }, + { + "epoch": 0.26, + "grad_norm": 1.5304960229849456, + "learning_rate": 8.719891131542304e-06, + "loss": 0.5122, + "step": 3259 + }, + { + "epoch": 0.26, + "grad_norm": 0.6329933326511215, + "learning_rate": 8.719041191630657e-06, + "loss": 0.5359, + "step": 3260 + }, + { + "epoch": 0.26, + "grad_norm": 1.9471169594043491, + "learning_rate": 8.718191011101911e-06, + "loss": 0.5373, + "step": 3261 + }, + { + "epoch": 0.26, + "grad_norm": 1.8252809784576949, + "learning_rate": 8.71734059001107e-06, + "loss": 0.5353, + "step": 3262 + }, + { + "epoch": 0.26, + "grad_norm": 1.453997269518261, + "learning_rate": 8.716489928413154e-06, + "loss": 0.4698, + "step": 3263 + }, + { + "epoch": 0.26, + "grad_norm": 1.2199214494286372, + "learning_rate": 8.715639026363204e-06, + "loss": 0.4867, + "step": 3264 + }, + { + "epoch": 0.26, + "grad_norm": 1.576161409055691, + "learning_rate": 8.71478788391627e-06, + "loss": 0.523, + "step": 3265 + }, + { + "epoch": 0.26, + "grad_norm": 1.850861240333354, + "learning_rate": 8.713936501127417e-06, + "loss": 0.5402, + "step": 3266 + }, + { + "epoch": 0.26, + "grad_norm": 1.8810382680260067, + "learning_rate": 8.713084878051732e-06, + "loss": 0.5424, + "step": 3267 + }, + { + "epoch": 0.26, + "grad_norm": 2.05436447099998, + "learning_rate": 8.712233014744312e-06, + "loss": 0.4559, + "step": 3268 + }, + { + "epoch": 0.26, + "grad_norm": 1.605676107103658, + "learning_rate": 8.711380911260274e-06, + "loss": 0.4992, + "step": 3269 + }, + { + "epoch": 0.26, + "grad_norm": 1.590019514033087, + "learning_rate": 8.710528567654743e-06, + "loss": 0.4857, + "step": 3270 + }, + { + "epoch": 0.26, + "grad_norm": 1.6607119507240535, + "learning_rate": 8.709675983982871e-06, + "loss": 0.5601, + "step": 3271 + }, + { + "epoch": 0.26, + "grad_norm": 1.29371474544447, + "learning_rate": 8.708823160299815e-06, + "loss": 0.5174, + "step": 3272 + }, + { + "epoch": 0.26, + "grad_norm": 1.8046736155722014, + "learning_rate": 8.70797009666075e-06, + "loss": 0.525, + "step": 3273 + }, + { + "epoch": 0.26, + "grad_norm": 1.474239005668495, + "learning_rate": 8.707116793120873e-06, + "loss": 0.4464, + "step": 3274 + }, + { + "epoch": 0.26, + "grad_norm": 1.3882866349497234, + "learning_rate": 8.706263249735386e-06, + "loss": 0.5017, + "step": 3275 + }, + { + "epoch": 0.26, + "grad_norm": 0.6740474734258126, + "learning_rate": 8.70540946655952e-06, + "loss": 0.5239, + "step": 3276 + }, + { + "epoch": 0.26, + "grad_norm": 0.596804058037078, + "learning_rate": 8.704555443648505e-06, + "loss": 0.5289, + "step": 3277 + }, + { + "epoch": 0.26, + "grad_norm": 1.5539151439924102, + "learning_rate": 8.7037011810576e-06, + "loss": 0.5074, + "step": 3278 + }, + { + "epoch": 0.26, + "grad_norm": 1.7434101723847306, + "learning_rate": 8.702846678842074e-06, + "loss": 0.5344, + "step": 3279 + }, + { + "epoch": 0.26, + "grad_norm": 3.1180408446355363, + "learning_rate": 8.701991937057211e-06, + "loss": 0.5061, + "step": 3280 + }, + { + "epoch": 0.26, + "grad_norm": 1.8048402485957895, + "learning_rate": 8.701136955758312e-06, + "loss": 0.5147, + "step": 3281 + }, + { + "epoch": 0.26, + "grad_norm": 4.171740514451025, + "learning_rate": 8.700281735000695e-06, + "loss": 0.4934, + "step": 3282 + }, + { + "epoch": 0.26, + "grad_norm": 1.6132874747052792, + "learning_rate": 8.69942627483969e-06, + "loss": 0.522, + "step": 3283 + }, + { + "epoch": 0.26, + "grad_norm": 1.502744531882189, + "learning_rate": 8.698570575330644e-06, + "loss": 0.4656, + "step": 3284 + }, + { + "epoch": 0.26, + "grad_norm": 0.9950113678664435, + "learning_rate": 8.69771463652892e-06, + "loss": 0.5367, + "step": 3285 + }, + { + "epoch": 0.26, + "grad_norm": 1.2878335049985021, + "learning_rate": 8.696858458489898e-06, + "loss": 0.4522, + "step": 3286 + }, + { + "epoch": 0.26, + "grad_norm": 0.5912814947163051, + "learning_rate": 8.696002041268966e-06, + "loss": 0.55, + "step": 3287 + }, + { + "epoch": 0.26, + "grad_norm": 0.6716647123639256, + "learning_rate": 8.695145384921542e-06, + "loss": 0.5471, + "step": 3288 + }, + { + "epoch": 0.26, + "grad_norm": 1.3367109825298962, + "learning_rate": 8.694288489503042e-06, + "loss": 0.4993, + "step": 3289 + }, + { + "epoch": 0.26, + "grad_norm": 1.6054248328073868, + "learning_rate": 8.69343135506891e-06, + "loss": 0.5828, + "step": 3290 + }, + { + "epoch": 0.26, + "grad_norm": 1.5257874945760699, + "learning_rate": 8.6925739816746e-06, + "loss": 0.5205, + "step": 3291 + }, + { + "epoch": 0.26, + "grad_norm": 1.361738914723251, + "learning_rate": 8.691716369375587e-06, + "loss": 0.4405, + "step": 3292 + }, + { + "epoch": 0.26, + "grad_norm": 1.026740763953979, + "learning_rate": 8.690858518227353e-06, + "loss": 0.5302, + "step": 3293 + }, + { + "epoch": 0.26, + "grad_norm": 1.4678756295040023, + "learning_rate": 8.6900004282854e-06, + "loss": 0.4763, + "step": 3294 + }, + { + "epoch": 0.26, + "grad_norm": 1.725647756449732, + "learning_rate": 8.689142099605245e-06, + "loss": 0.5136, + "step": 3295 + }, + { + "epoch": 0.26, + "grad_norm": 1.5277729043077464, + "learning_rate": 8.688283532242425e-06, + "loss": 0.3995, + "step": 3296 + }, + { + "epoch": 0.26, + "grad_norm": 1.5927902338038629, + "learning_rate": 8.687424726252485e-06, + "loss": 0.5528, + "step": 3297 + }, + { + "epoch": 0.26, + "grad_norm": 1.8701808459539466, + "learning_rate": 8.686565681690988e-06, + "loss": 0.531, + "step": 3298 + }, + { + "epoch": 0.26, + "grad_norm": 1.6743002856732259, + "learning_rate": 8.685706398613514e-06, + "loss": 0.5056, + "step": 3299 + }, + { + "epoch": 0.26, + "grad_norm": 1.5225811032038654, + "learning_rate": 8.684846877075659e-06, + "loss": 0.4669, + "step": 3300 + }, + { + "epoch": 0.26, + "grad_norm": 0.711775799003001, + "learning_rate": 8.68398711713303e-06, + "loss": 0.555, + "step": 3301 + }, + { + "epoch": 0.26, + "grad_norm": 1.6350364104433937, + "learning_rate": 8.683127118841254e-06, + "loss": 0.4619, + "step": 3302 + }, + { + "epoch": 0.26, + "grad_norm": 2.156567580785746, + "learning_rate": 8.682266882255972e-06, + "loss": 0.4562, + "step": 3303 + }, + { + "epoch": 0.26, + "grad_norm": 0.620475054651976, + "learning_rate": 8.68140640743284e-06, + "loss": 0.5479, + "step": 3304 + }, + { + "epoch": 0.26, + "grad_norm": 1.7374599886474034, + "learning_rate": 8.680545694427528e-06, + "loss": 0.4963, + "step": 3305 + }, + { + "epoch": 0.26, + "grad_norm": 0.606717356991121, + "learning_rate": 8.679684743295725e-06, + "loss": 0.5434, + "step": 3306 + }, + { + "epoch": 0.26, + "grad_norm": 1.4544686314428859, + "learning_rate": 8.678823554093132e-06, + "loss": 0.4459, + "step": 3307 + }, + { + "epoch": 0.26, + "grad_norm": 1.3149254028289086, + "learning_rate": 8.67796212687547e-06, + "loss": 0.465, + "step": 3308 + }, + { + "epoch": 0.26, + "grad_norm": 1.843479598401039, + "learning_rate": 8.677100461698466e-06, + "loss": 0.5351, + "step": 3309 + }, + { + "epoch": 0.26, + "grad_norm": 2.3859516061523927, + "learning_rate": 8.676238558617875e-06, + "loss": 0.4493, + "step": 3310 + }, + { + "epoch": 0.26, + "grad_norm": 0.6351923970050173, + "learning_rate": 8.675376417689459e-06, + "loss": 0.5427, + "step": 3311 + }, + { + "epoch": 0.26, + "grad_norm": 1.6128089524835199, + "learning_rate": 8.674514038968996e-06, + "loss": 0.4923, + "step": 3312 + }, + { + "epoch": 0.26, + "grad_norm": 2.310511921178792, + "learning_rate": 8.673651422512281e-06, + "loss": 0.5573, + "step": 3313 + }, + { + "epoch": 0.26, + "grad_norm": 1.4606585598881572, + "learning_rate": 8.672788568375125e-06, + "loss": 0.5078, + "step": 3314 + }, + { + "epoch": 0.26, + "grad_norm": 1.54212534972613, + "learning_rate": 8.671925476613353e-06, + "loss": 0.5029, + "step": 3315 + }, + { + "epoch": 0.26, + "grad_norm": 1.9622402443042726, + "learning_rate": 8.671062147282807e-06, + "loss": 0.4937, + "step": 3316 + }, + { + "epoch": 0.26, + "grad_norm": 0.5698044735318659, + "learning_rate": 8.670198580439342e-06, + "loss": 0.5179, + "step": 3317 + }, + { + "epoch": 0.26, + "grad_norm": 1.5463224803747988, + "learning_rate": 8.669334776138829e-06, + "loss": 0.4754, + "step": 3318 + }, + { + "epoch": 0.26, + "grad_norm": 1.3254102150144311, + "learning_rate": 8.668470734437157e-06, + "loss": 0.4855, + "step": 3319 + }, + { + "epoch": 0.26, + "grad_norm": 0.559140754757652, + "learning_rate": 8.667606455390226e-06, + "loss": 0.5137, + "step": 3320 + }, + { + "epoch": 0.26, + "grad_norm": 3.3147781211969822, + "learning_rate": 8.666741939053958e-06, + "loss": 0.4807, + "step": 3321 + }, + { + "epoch": 0.26, + "grad_norm": 1.572447897466725, + "learning_rate": 8.665877185484281e-06, + "loss": 0.5015, + "step": 3322 + }, + { + "epoch": 0.26, + "grad_norm": 1.4744094706933022, + "learning_rate": 8.665012194737147e-06, + "loss": 0.468, + "step": 3323 + }, + { + "epoch": 0.26, + "grad_norm": 0.5737793676016821, + "learning_rate": 8.664146966868519e-06, + "loss": 0.5422, + "step": 3324 + }, + { + "epoch": 0.26, + "grad_norm": 1.5925443018244743, + "learning_rate": 8.663281501934372e-06, + "loss": 0.4818, + "step": 3325 + }, + { + "epoch": 0.26, + "grad_norm": 1.4223252091858785, + "learning_rate": 8.662415799990707e-06, + "loss": 0.4704, + "step": 3326 + }, + { + "epoch": 0.26, + "grad_norm": 0.5623611588489181, + "learning_rate": 8.66154986109353e-06, + "loss": 0.5465, + "step": 3327 + }, + { + "epoch": 0.26, + "grad_norm": 0.6089261590280686, + "learning_rate": 8.660683685298867e-06, + "loss": 0.5636, + "step": 3328 + }, + { + "epoch": 0.26, + "grad_norm": 1.4209618999564035, + "learning_rate": 8.65981727266276e-06, + "loss": 0.5333, + "step": 3329 + }, + { + "epoch": 0.26, + "grad_norm": 0.5574860279318512, + "learning_rate": 8.658950623241262e-06, + "loss": 0.5413, + "step": 3330 + }, + { + "epoch": 0.26, + "grad_norm": 1.5558097303755274, + "learning_rate": 8.658083737090444e-06, + "loss": 0.4755, + "step": 3331 + }, + { + "epoch": 0.26, + "grad_norm": 1.3059030384001609, + "learning_rate": 8.657216614266394e-06, + "loss": 0.5341, + "step": 3332 + }, + { + "epoch": 0.26, + "grad_norm": 1.6125236336984035, + "learning_rate": 8.656349254825213e-06, + "loss": 0.4849, + "step": 3333 + }, + { + "epoch": 0.26, + "grad_norm": 0.5683101046807325, + "learning_rate": 8.65548165882302e-06, + "loss": 0.5498, + "step": 3334 + }, + { + "epoch": 0.26, + "grad_norm": 1.3988168744481584, + "learning_rate": 8.654613826315943e-06, + "loss": 0.4569, + "step": 3335 + }, + { + "epoch": 0.26, + "grad_norm": 1.2314396177863982, + "learning_rate": 8.653745757360135e-06, + "loss": 0.5176, + "step": 3336 + }, + { + "epoch": 0.26, + "grad_norm": 2.56285090287388, + "learning_rate": 8.652877452011755e-06, + "loss": 0.5162, + "step": 3337 + }, + { + "epoch": 0.26, + "grad_norm": 1.457067885340824, + "learning_rate": 8.652008910326983e-06, + "loss": 0.5274, + "step": 3338 + }, + { + "epoch": 0.26, + "grad_norm": 1.9775372910758946, + "learning_rate": 8.65114013236201e-06, + "loss": 0.4748, + "step": 3339 + }, + { + "epoch": 0.26, + "grad_norm": 0.5899952973360124, + "learning_rate": 8.650271118173048e-06, + "loss": 0.5172, + "step": 3340 + }, + { + "epoch": 0.26, + "grad_norm": 1.4831999555815314, + "learning_rate": 8.64940186781632e-06, + "loss": 0.5177, + "step": 3341 + }, + { + "epoch": 0.26, + "grad_norm": 2.0813248369817536, + "learning_rate": 8.648532381348066e-06, + "loss": 0.4394, + "step": 3342 + }, + { + "epoch": 0.26, + "grad_norm": 1.8501094633745563, + "learning_rate": 8.647662658824538e-06, + "loss": 0.5609, + "step": 3343 + }, + { + "epoch": 0.26, + "grad_norm": 2.9964853560539324, + "learning_rate": 8.646792700302009e-06, + "loss": 0.5001, + "step": 3344 + }, + { + "epoch": 0.26, + "grad_norm": 2.3075778928664605, + "learning_rate": 8.645922505836763e-06, + "loss": 0.4586, + "step": 3345 + }, + { + "epoch": 0.26, + "grad_norm": 1.9759538492898014, + "learning_rate": 8.6450520754851e-06, + "loss": 0.4753, + "step": 3346 + }, + { + "epoch": 0.26, + "grad_norm": 2.8609803044445266, + "learning_rate": 8.644181409303336e-06, + "loss": 0.4754, + "step": 3347 + }, + { + "epoch": 0.26, + "grad_norm": 1.5824436515225349, + "learning_rate": 8.643310507347802e-06, + "loss": 0.4629, + "step": 3348 + }, + { + "epoch": 0.26, + "grad_norm": 1.4148299513700913, + "learning_rate": 8.642439369674845e-06, + "loss": 0.4575, + "step": 3349 + }, + { + "epoch": 0.26, + "grad_norm": 1.8928965743500823, + "learning_rate": 8.641567996340824e-06, + "loss": 0.503, + "step": 3350 + }, + { + "epoch": 0.26, + "grad_norm": 1.5631734093884255, + "learning_rate": 8.640696387402119e-06, + "loss": 0.494, + "step": 3351 + }, + { + "epoch": 0.26, + "grad_norm": 2.360224022047198, + "learning_rate": 8.639824542915118e-06, + "loss": 0.4929, + "step": 3352 + }, + { + "epoch": 0.26, + "grad_norm": 1.5762223259071542, + "learning_rate": 8.638952462936232e-06, + "loss": 0.4965, + "step": 3353 + }, + { + "epoch": 0.26, + "grad_norm": 1.6690362841576996, + "learning_rate": 8.638080147521884e-06, + "loss": 0.5716, + "step": 3354 + }, + { + "epoch": 0.26, + "grad_norm": 1.7903314196851994, + "learning_rate": 8.637207596728508e-06, + "loss": 0.4674, + "step": 3355 + }, + { + "epoch": 0.26, + "grad_norm": 3.088566694134135, + "learning_rate": 8.636334810612557e-06, + "loss": 0.5136, + "step": 3356 + }, + { + "epoch": 0.26, + "grad_norm": 2.0379502343558458, + "learning_rate": 8.635461789230501e-06, + "loss": 0.4708, + "step": 3357 + }, + { + "epoch": 0.26, + "grad_norm": 1.7487518801585136, + "learning_rate": 8.634588532638826e-06, + "loss": 0.4998, + "step": 3358 + }, + { + "epoch": 0.26, + "grad_norm": 1.6634527485066162, + "learning_rate": 8.633715040894023e-06, + "loss": 0.4617, + "step": 3359 + }, + { + "epoch": 0.26, + "grad_norm": 1.4499574126399932, + "learning_rate": 8.632841314052614e-06, + "loss": 0.5024, + "step": 3360 + }, + { + "epoch": 0.26, + "grad_norm": 1.5038158478733046, + "learning_rate": 8.631967352171125e-06, + "loss": 0.5279, + "step": 3361 + }, + { + "epoch": 0.26, + "grad_norm": 1.6920843307996423, + "learning_rate": 8.631093155306097e-06, + "loss": 0.5303, + "step": 3362 + }, + { + "epoch": 0.26, + "grad_norm": 1.2854719067976281, + "learning_rate": 8.630218723514092e-06, + "loss": 0.4987, + "step": 3363 + }, + { + "epoch": 0.26, + "grad_norm": 1.632168597003605, + "learning_rate": 8.629344056851687e-06, + "loss": 0.4821, + "step": 3364 + }, + { + "epoch": 0.26, + "grad_norm": 1.9018018946605562, + "learning_rate": 8.628469155375467e-06, + "loss": 0.45, + "step": 3365 + }, + { + "epoch": 0.26, + "grad_norm": 2.040628688809857, + "learning_rate": 8.627594019142039e-06, + "loss": 0.5008, + "step": 3366 + }, + { + "epoch": 0.26, + "grad_norm": 1.853851364784639, + "learning_rate": 8.626718648208025e-06, + "loss": 0.5249, + "step": 3367 + }, + { + "epoch": 0.26, + "grad_norm": 1.4682428625440593, + "learning_rate": 8.625843042630058e-06, + "loss": 0.4879, + "step": 3368 + }, + { + "epoch": 0.26, + "grad_norm": 1.4162731744946866, + "learning_rate": 8.624967202464789e-06, + "loss": 0.4614, + "step": 3369 + }, + { + "epoch": 0.26, + "grad_norm": 1.7673872685161327, + "learning_rate": 8.624091127768884e-06, + "loss": 0.4706, + "step": 3370 + }, + { + "epoch": 0.26, + "grad_norm": 2.0122375189824626, + "learning_rate": 8.623214818599024e-06, + "loss": 0.4831, + "step": 3371 + }, + { + "epoch": 0.26, + "grad_norm": 1.5586610834189096, + "learning_rate": 8.622338275011907e-06, + "loss": 0.526, + "step": 3372 + }, + { + "epoch": 0.26, + "grad_norm": 1.637162716249673, + "learning_rate": 8.621461497064241e-06, + "loss": 0.5126, + "step": 3373 + }, + { + "epoch": 0.26, + "grad_norm": 1.7119784291758082, + "learning_rate": 8.620584484812753e-06, + "loss": 0.4291, + "step": 3374 + }, + { + "epoch": 0.27, + "grad_norm": 1.3360263829998105, + "learning_rate": 8.619707238314186e-06, + "loss": 0.4423, + "step": 3375 + }, + { + "epoch": 0.27, + "grad_norm": 1.4398874554887273, + "learning_rate": 8.618829757625295e-06, + "loss": 0.4524, + "step": 3376 + }, + { + "epoch": 0.27, + "grad_norm": 1.6591620093181925, + "learning_rate": 8.617952042802853e-06, + "loss": 0.466, + "step": 3377 + }, + { + "epoch": 0.27, + "grad_norm": 0.5802421722380953, + "learning_rate": 8.617074093903646e-06, + "loss": 0.5542, + "step": 3378 + }, + { + "epoch": 0.27, + "grad_norm": 1.4277828447085816, + "learning_rate": 8.616195910984477e-06, + "loss": 0.5168, + "step": 3379 + }, + { + "epoch": 0.27, + "grad_norm": 1.5708225290722235, + "learning_rate": 8.615317494102164e-06, + "loss": 0.4659, + "step": 3380 + }, + { + "epoch": 0.27, + "grad_norm": 1.7368543749934282, + "learning_rate": 8.614438843313538e-06, + "loss": 0.5221, + "step": 3381 + }, + { + "epoch": 0.27, + "grad_norm": 1.6170659242108831, + "learning_rate": 8.613559958675448e-06, + "loss": 0.5126, + "step": 3382 + }, + { + "epoch": 0.27, + "grad_norm": 2.0461309388225177, + "learning_rate": 8.612680840244756e-06, + "loss": 0.4733, + "step": 3383 + }, + { + "epoch": 0.27, + "grad_norm": 1.5025158314178289, + "learning_rate": 8.611801488078337e-06, + "loss": 0.548, + "step": 3384 + }, + { + "epoch": 0.27, + "grad_norm": 1.4660126561541762, + "learning_rate": 8.61092190223309e-06, + "loss": 0.4597, + "step": 3385 + }, + { + "epoch": 0.27, + "grad_norm": 1.3272809697306307, + "learning_rate": 8.610042082765917e-06, + "loss": 0.5066, + "step": 3386 + }, + { + "epoch": 0.27, + "grad_norm": 2.1500451240413545, + "learning_rate": 8.609162029733745e-06, + "loss": 0.5163, + "step": 3387 + }, + { + "epoch": 0.27, + "grad_norm": 0.551537554343829, + "learning_rate": 8.608281743193511e-06, + "loss": 0.5176, + "step": 3388 + }, + { + "epoch": 0.27, + "grad_norm": 2.214506125127254, + "learning_rate": 8.607401223202168e-06, + "loss": 0.5297, + "step": 3389 + }, + { + "epoch": 0.27, + "grad_norm": 1.3729506952670276, + "learning_rate": 8.606520469816686e-06, + "loss": 0.4776, + "step": 3390 + }, + { + "epoch": 0.27, + "grad_norm": 1.6672889002072822, + "learning_rate": 8.605639483094046e-06, + "loss": 0.5324, + "step": 3391 + }, + { + "epoch": 0.27, + "grad_norm": 1.428277935905827, + "learning_rate": 8.604758263091248e-06, + "loss": 0.4697, + "step": 3392 + }, + { + "epoch": 0.27, + "grad_norm": 0.5867874160773533, + "learning_rate": 8.603876809865308e-06, + "loss": 0.5192, + "step": 3393 + }, + { + "epoch": 0.27, + "grad_norm": 1.9519093028723848, + "learning_rate": 8.602995123473252e-06, + "loss": 0.4651, + "step": 3394 + }, + { + "epoch": 0.27, + "grad_norm": 1.7290439037133947, + "learning_rate": 8.602113203972124e-06, + "loss": 0.5088, + "step": 3395 + }, + { + "epoch": 0.27, + "grad_norm": 1.6604500598235477, + "learning_rate": 8.601231051418984e-06, + "loss": 0.4979, + "step": 3396 + }, + { + "epoch": 0.27, + "grad_norm": 1.902113332736501, + "learning_rate": 8.600348665870907e-06, + "loss": 0.4779, + "step": 3397 + }, + { + "epoch": 0.27, + "grad_norm": 1.8990372612130764, + "learning_rate": 8.599466047384981e-06, + "loss": 0.5493, + "step": 3398 + }, + { + "epoch": 0.27, + "grad_norm": 0.6093362917532382, + "learning_rate": 8.59858319601831e-06, + "loss": 0.5285, + "step": 3399 + }, + { + "epoch": 0.27, + "grad_norm": 2.5925181383728004, + "learning_rate": 8.597700111828013e-06, + "loss": 0.5368, + "step": 3400 + }, + { + "epoch": 0.27, + "grad_norm": 2.0464333240802937, + "learning_rate": 8.596816794871226e-06, + "loss": 0.4615, + "step": 3401 + }, + { + "epoch": 0.27, + "grad_norm": 1.496038259774819, + "learning_rate": 8.595933245205097e-06, + "loss": 0.5162, + "step": 3402 + }, + { + "epoch": 0.27, + "grad_norm": 1.6301056904045803, + "learning_rate": 8.595049462886793e-06, + "loss": 0.5421, + "step": 3403 + }, + { + "epoch": 0.27, + "grad_norm": 2.3537039073986628, + "learning_rate": 8.594165447973489e-06, + "loss": 0.4274, + "step": 3404 + }, + { + "epoch": 0.27, + "grad_norm": 1.4196602530127012, + "learning_rate": 8.593281200522383e-06, + "loss": 0.4199, + "step": 3405 + }, + { + "epoch": 0.27, + "grad_norm": 3.338662609894142, + "learning_rate": 8.592396720590686e-06, + "loss": 0.5216, + "step": 3406 + }, + { + "epoch": 0.27, + "grad_norm": 1.8143335477961813, + "learning_rate": 8.591512008235618e-06, + "loss": 0.4796, + "step": 3407 + }, + { + "epoch": 0.27, + "grad_norm": 1.5739591118269822, + "learning_rate": 8.590627063514423e-06, + "loss": 0.4716, + "step": 3408 + }, + { + "epoch": 0.27, + "grad_norm": 0.6092519362794887, + "learning_rate": 8.589741886484353e-06, + "loss": 0.5341, + "step": 3409 + }, + { + "epoch": 0.27, + "grad_norm": 1.79803200226372, + "learning_rate": 8.58885647720268e-06, + "loss": 0.5172, + "step": 3410 + }, + { + "epoch": 0.27, + "grad_norm": 1.537787121283042, + "learning_rate": 8.587970835726686e-06, + "loss": 0.5115, + "step": 3411 + }, + { + "epoch": 0.27, + "grad_norm": 1.8829377719950688, + "learning_rate": 8.587084962113675e-06, + "loss": 0.4658, + "step": 3412 + }, + { + "epoch": 0.27, + "grad_norm": 0.5864625990007081, + "learning_rate": 8.58619885642096e-06, + "loss": 0.5287, + "step": 3413 + }, + { + "epoch": 0.27, + "grad_norm": 1.503080233308453, + "learning_rate": 8.585312518705867e-06, + "loss": 0.4819, + "step": 3414 + }, + { + "epoch": 0.27, + "grad_norm": 1.6320269067837785, + "learning_rate": 8.584425949025745e-06, + "loss": 0.5016, + "step": 3415 + }, + { + "epoch": 0.27, + "grad_norm": 1.6228090259418075, + "learning_rate": 8.583539147437955e-06, + "loss": 0.4552, + "step": 3416 + }, + { + "epoch": 0.27, + "grad_norm": 1.522898879856291, + "learning_rate": 8.58265211399987e-06, + "loss": 0.5469, + "step": 3417 + }, + { + "epoch": 0.27, + "grad_norm": 1.7537577765786239, + "learning_rate": 8.581764848768878e-06, + "loss": 0.585, + "step": 3418 + }, + { + "epoch": 0.27, + "grad_norm": 1.7892521641738275, + "learning_rate": 8.580877351802385e-06, + "loss": 0.4875, + "step": 3419 + }, + { + "epoch": 0.27, + "grad_norm": 1.6481390121889066, + "learning_rate": 8.579989623157813e-06, + "loss": 0.4471, + "step": 3420 + }, + { + "epoch": 0.27, + "grad_norm": 0.6274441155600545, + "learning_rate": 8.579101662892597e-06, + "loss": 0.5045, + "step": 3421 + }, + { + "epoch": 0.27, + "grad_norm": 1.8432354931210932, + "learning_rate": 8.578213471064184e-06, + "loss": 0.4739, + "step": 3422 + }, + { + "epoch": 0.27, + "grad_norm": 1.3120040824502734, + "learning_rate": 8.577325047730042e-06, + "loss": 0.4858, + "step": 3423 + }, + { + "epoch": 0.27, + "grad_norm": 0.5782400480818928, + "learning_rate": 8.576436392947647e-06, + "loss": 0.5321, + "step": 3424 + }, + { + "epoch": 0.27, + "grad_norm": 1.7700043481258967, + "learning_rate": 8.575547506774498e-06, + "loss": 0.4671, + "step": 3425 + }, + { + "epoch": 0.27, + "grad_norm": 1.3612692648329476, + "learning_rate": 8.574658389268102e-06, + "loss": 0.5095, + "step": 3426 + }, + { + "epoch": 0.27, + "grad_norm": 1.6673306555114653, + "learning_rate": 8.573769040485984e-06, + "loss": 0.5422, + "step": 3427 + }, + { + "epoch": 0.27, + "grad_norm": 3.02520097025789, + "learning_rate": 8.572879460485684e-06, + "loss": 0.4889, + "step": 3428 + }, + { + "epoch": 0.27, + "grad_norm": 1.7105565641077909, + "learning_rate": 8.571989649324756e-06, + "loss": 0.454, + "step": 3429 + }, + { + "epoch": 0.27, + "grad_norm": 1.692577742408476, + "learning_rate": 8.571099607060772e-06, + "loss": 0.5264, + "step": 3430 + }, + { + "epoch": 0.27, + "grad_norm": 1.4097702946714268, + "learning_rate": 8.570209333751314e-06, + "loss": 0.5346, + "step": 3431 + }, + { + "epoch": 0.27, + "grad_norm": 1.8243493143063017, + "learning_rate": 8.569318829453983e-06, + "loss": 0.5361, + "step": 3432 + }, + { + "epoch": 0.27, + "grad_norm": 1.5749127129389127, + "learning_rate": 8.568428094226394e-06, + "loss": 0.549, + "step": 3433 + }, + { + "epoch": 0.27, + "grad_norm": 2.7597873761827763, + "learning_rate": 8.567537128126173e-06, + "loss": 0.5259, + "step": 3434 + }, + { + "epoch": 0.27, + "grad_norm": 1.5294892748791478, + "learning_rate": 8.566645931210968e-06, + "loss": 0.5227, + "step": 3435 + }, + { + "epoch": 0.27, + "grad_norm": 2.1399663766329424, + "learning_rate": 8.565754503538439e-06, + "loss": 0.4902, + "step": 3436 + }, + { + "epoch": 0.27, + "grad_norm": 1.4966191786860732, + "learning_rate": 8.564862845166255e-06, + "loss": 0.4682, + "step": 3437 + }, + { + "epoch": 0.27, + "grad_norm": 1.3491070142745738, + "learning_rate": 8.56397095615211e-06, + "loss": 0.4548, + "step": 3438 + }, + { + "epoch": 0.27, + "grad_norm": 2.0830471332374536, + "learning_rate": 8.563078836553707e-06, + "loss": 0.4876, + "step": 3439 + }, + { + "epoch": 0.27, + "grad_norm": 0.7089210568201886, + "learning_rate": 8.562186486428762e-06, + "loss": 0.5138, + "step": 3440 + }, + { + "epoch": 0.27, + "grad_norm": 1.4759646961717325, + "learning_rate": 8.561293905835013e-06, + "loss": 0.4832, + "step": 3441 + }, + { + "epoch": 0.27, + "grad_norm": 1.3650574549663443, + "learning_rate": 8.560401094830207e-06, + "loss": 0.5138, + "step": 3442 + }, + { + "epoch": 0.27, + "grad_norm": 1.5442107272449699, + "learning_rate": 8.559508053472108e-06, + "loss": 0.5, + "step": 3443 + }, + { + "epoch": 0.27, + "grad_norm": 1.703855504966165, + "learning_rate": 8.558614781818493e-06, + "loss": 0.4669, + "step": 3444 + }, + { + "epoch": 0.27, + "grad_norm": 1.648494388774955, + "learning_rate": 8.557721279927157e-06, + "loss": 0.5152, + "step": 3445 + }, + { + "epoch": 0.27, + "grad_norm": 1.481186336780409, + "learning_rate": 8.55682754785591e-06, + "loss": 0.4628, + "step": 3446 + }, + { + "epoch": 0.27, + "grad_norm": 1.7798652409788283, + "learning_rate": 8.555933585662572e-06, + "loss": 0.4172, + "step": 3447 + }, + { + "epoch": 0.27, + "grad_norm": 0.6996518725215578, + "learning_rate": 8.555039393404985e-06, + "loss": 0.5579, + "step": 3448 + }, + { + "epoch": 0.27, + "grad_norm": 1.4615321263119314, + "learning_rate": 8.554144971140998e-06, + "loss": 0.4335, + "step": 3449 + }, + { + "epoch": 0.27, + "grad_norm": 2.251720493892175, + "learning_rate": 8.55325031892848e-06, + "loss": 0.4778, + "step": 3450 + }, + { + "epoch": 0.27, + "grad_norm": 0.5947884910918764, + "learning_rate": 8.552355436825316e-06, + "loss": 0.5415, + "step": 3451 + }, + { + "epoch": 0.27, + "grad_norm": 1.8274859239957015, + "learning_rate": 8.551460324889402e-06, + "loss": 0.4991, + "step": 3452 + }, + { + "epoch": 0.27, + "grad_norm": 3.611402691481396, + "learning_rate": 8.55056498317865e-06, + "loss": 0.4536, + "step": 3453 + }, + { + "epoch": 0.27, + "grad_norm": 2.564327029483822, + "learning_rate": 8.549669411750988e-06, + "loss": 0.4617, + "step": 3454 + }, + { + "epoch": 0.27, + "grad_norm": 1.5594684701759445, + "learning_rate": 8.548773610664361e-06, + "loss": 0.491, + "step": 3455 + }, + { + "epoch": 0.27, + "grad_norm": 1.5490750688444723, + "learning_rate": 8.547877579976722e-06, + "loss": 0.52, + "step": 3456 + }, + { + "epoch": 0.27, + "grad_norm": 0.6966686510651647, + "learning_rate": 8.546981319746046e-06, + "loss": 0.5338, + "step": 3457 + }, + { + "epoch": 0.27, + "grad_norm": 1.713725992603186, + "learning_rate": 8.546084830030317e-06, + "loss": 0.4598, + "step": 3458 + }, + { + "epoch": 0.27, + "grad_norm": 0.6150268713324031, + "learning_rate": 8.545188110887539e-06, + "loss": 0.5053, + "step": 3459 + }, + { + "epoch": 0.27, + "grad_norm": 1.5345439453332597, + "learning_rate": 8.544291162375729e-06, + "loss": 0.5192, + "step": 3460 + }, + { + "epoch": 0.27, + "grad_norm": 1.354579180065015, + "learning_rate": 8.543393984552916e-06, + "loss": 0.4442, + "step": 3461 + }, + { + "epoch": 0.27, + "grad_norm": 1.23243592720789, + "learning_rate": 8.542496577477149e-06, + "loss": 0.4598, + "step": 3462 + }, + { + "epoch": 0.27, + "grad_norm": 1.5274012221906568, + "learning_rate": 8.541598941206486e-06, + "loss": 0.5108, + "step": 3463 + }, + { + "epoch": 0.27, + "grad_norm": 1.7360261842061206, + "learning_rate": 8.540701075799005e-06, + "loss": 0.4803, + "step": 3464 + }, + { + "epoch": 0.27, + "grad_norm": 1.6934249516311686, + "learning_rate": 8.539802981312795e-06, + "loss": 0.5458, + "step": 3465 + }, + { + "epoch": 0.27, + "grad_norm": 2.3574359297032537, + "learning_rate": 8.538904657805965e-06, + "loss": 0.509, + "step": 3466 + }, + { + "epoch": 0.27, + "grad_norm": 2.100629773586588, + "learning_rate": 8.538006105336631e-06, + "loss": 0.5159, + "step": 3467 + }, + { + "epoch": 0.27, + "grad_norm": 1.4285321684010133, + "learning_rate": 8.537107323962932e-06, + "loss": 0.4773, + "step": 3468 + }, + { + "epoch": 0.27, + "grad_norm": 1.4802576286819313, + "learning_rate": 8.536208313743016e-06, + "loss": 0.5039, + "step": 3469 + }, + { + "epoch": 0.27, + "grad_norm": 2.081440882178319, + "learning_rate": 8.535309074735047e-06, + "loss": 0.4882, + "step": 3470 + }, + { + "epoch": 0.27, + "grad_norm": 1.6176467017207867, + "learning_rate": 8.534409606997207e-06, + "loss": 0.4805, + "step": 3471 + }, + { + "epoch": 0.27, + "grad_norm": 1.7660708097358375, + "learning_rate": 8.53350991058769e-06, + "loss": 0.4956, + "step": 3472 + }, + { + "epoch": 0.27, + "grad_norm": 1.2831561890726368, + "learning_rate": 8.532609985564701e-06, + "loss": 0.4917, + "step": 3473 + }, + { + "epoch": 0.27, + "grad_norm": 1.5781511020199106, + "learning_rate": 8.531709831986469e-06, + "loss": 0.4811, + "step": 3474 + }, + { + "epoch": 0.27, + "grad_norm": 1.4593111363582796, + "learning_rate": 8.53080944991123e-06, + "loss": 0.4479, + "step": 3475 + }, + { + "epoch": 0.27, + "grad_norm": 2.4232914925711544, + "learning_rate": 8.52990883939724e-06, + "loss": 0.458, + "step": 3476 + }, + { + "epoch": 0.27, + "grad_norm": 0.9057724807387423, + "learning_rate": 8.529008000502766e-06, + "loss": 0.5447, + "step": 3477 + }, + { + "epoch": 0.27, + "grad_norm": 0.743136439665139, + "learning_rate": 8.528106933286089e-06, + "loss": 0.5301, + "step": 3478 + }, + { + "epoch": 0.27, + "grad_norm": 1.537862652631991, + "learning_rate": 8.527205637805508e-06, + "loss": 0.5601, + "step": 3479 + }, + { + "epoch": 0.27, + "grad_norm": 2.4148624643922876, + "learning_rate": 8.526304114119339e-06, + "loss": 0.5009, + "step": 3480 + }, + { + "epoch": 0.27, + "grad_norm": 1.6608835214722222, + "learning_rate": 8.525402362285905e-06, + "loss": 0.5365, + "step": 3481 + }, + { + "epoch": 0.27, + "grad_norm": 1.5078079039446, + "learning_rate": 8.524500382363552e-06, + "loss": 0.4686, + "step": 3482 + }, + { + "epoch": 0.27, + "grad_norm": 1.6206931386970211, + "learning_rate": 8.523598174410633e-06, + "loss": 0.5012, + "step": 3483 + }, + { + "epoch": 0.27, + "grad_norm": 1.465400663347099, + "learning_rate": 8.522695738485523e-06, + "loss": 0.5581, + "step": 3484 + }, + { + "epoch": 0.27, + "grad_norm": 1.3296633385338978, + "learning_rate": 8.521793074646604e-06, + "loss": 0.4785, + "step": 3485 + }, + { + "epoch": 0.27, + "grad_norm": 1.0689859196842466, + "learning_rate": 8.520890182952284e-06, + "loss": 0.5494, + "step": 3486 + }, + { + "epoch": 0.27, + "grad_norm": 1.60204598116505, + "learning_rate": 8.519987063460973e-06, + "loss": 0.5232, + "step": 3487 + }, + { + "epoch": 0.27, + "grad_norm": 2.258823055899541, + "learning_rate": 8.519083716231105e-06, + "loss": 0.4628, + "step": 3488 + }, + { + "epoch": 0.27, + "grad_norm": 3.1386437106876524, + "learning_rate": 8.518180141321121e-06, + "loss": 0.5062, + "step": 3489 + }, + { + "epoch": 0.27, + "grad_norm": 0.8585312654123055, + "learning_rate": 8.517276338789489e-06, + "loss": 0.5448, + "step": 3490 + }, + { + "epoch": 0.27, + "grad_norm": 1.586787706011846, + "learning_rate": 8.516372308694677e-06, + "loss": 0.495, + "step": 3491 + }, + { + "epoch": 0.27, + "grad_norm": 0.8550070518681759, + "learning_rate": 8.515468051095175e-06, + "loss": 0.5316, + "step": 3492 + }, + { + "epoch": 0.27, + "grad_norm": 1.7538440038665466, + "learning_rate": 8.51456356604949e-06, + "loss": 0.5226, + "step": 3493 + }, + { + "epoch": 0.27, + "grad_norm": 1.7535593369925833, + "learning_rate": 8.51365885361614e-06, + "loss": 0.5137, + "step": 3494 + }, + { + "epoch": 0.27, + "grad_norm": 0.6673528282688843, + "learning_rate": 8.512753913853658e-06, + "loss": 0.5373, + "step": 3495 + }, + { + "epoch": 0.27, + "grad_norm": 0.6757778939099565, + "learning_rate": 8.511848746820593e-06, + "loss": 0.5291, + "step": 3496 + }, + { + "epoch": 0.27, + "grad_norm": 1.878406014535827, + "learning_rate": 8.510943352575509e-06, + "loss": 0.5273, + "step": 3497 + }, + { + "epoch": 0.27, + "grad_norm": 1.6404802688089888, + "learning_rate": 8.510037731176983e-06, + "loss": 0.5027, + "step": 3498 + }, + { + "epoch": 0.27, + "grad_norm": 1.6603182051462595, + "learning_rate": 8.509131882683606e-06, + "loss": 0.536, + "step": 3499 + }, + { + "epoch": 0.27, + "grad_norm": 0.8001080671366464, + "learning_rate": 8.50822580715399e-06, + "loss": 0.5278, + "step": 3500 + }, + { + "epoch": 0.27, + "grad_norm": 1.6856738625758936, + "learning_rate": 8.50731950464675e-06, + "loss": 0.4439, + "step": 3501 + }, + { + "epoch": 0.28, + "grad_norm": 1.7091377257495401, + "learning_rate": 8.506412975220528e-06, + "loss": 0.4755, + "step": 3502 + }, + { + "epoch": 0.28, + "grad_norm": 1.7821481878106913, + "learning_rate": 8.505506218933975e-06, + "loss": 0.5099, + "step": 3503 + }, + { + "epoch": 0.28, + "grad_norm": 1.4891233117372127, + "learning_rate": 8.504599235845752e-06, + "loss": 0.4575, + "step": 3504 + }, + { + "epoch": 0.28, + "grad_norm": 1.868290029113564, + "learning_rate": 8.503692026014544e-06, + "loss": 0.5007, + "step": 3505 + }, + { + "epoch": 0.28, + "grad_norm": 1.6012568877311155, + "learning_rate": 8.502784589499046e-06, + "loss": 0.5204, + "step": 3506 + }, + { + "epoch": 0.28, + "grad_norm": 0.7552268232766387, + "learning_rate": 8.501876926357967e-06, + "loss": 0.5059, + "step": 3507 + }, + { + "epoch": 0.28, + "grad_norm": 1.6799784752327345, + "learning_rate": 8.500969036650034e-06, + "loss": 0.5267, + "step": 3508 + }, + { + "epoch": 0.28, + "grad_norm": 1.295192757248136, + "learning_rate": 8.50006092043398e-06, + "loss": 0.4716, + "step": 3509 + }, + { + "epoch": 0.28, + "grad_norm": 1.6552627331885357, + "learning_rate": 8.499152577768566e-06, + "loss": 0.4958, + "step": 3510 + }, + { + "epoch": 0.28, + "grad_norm": 1.5350352822429176, + "learning_rate": 8.498244008712557e-06, + "loss": 0.46, + "step": 3511 + }, + { + "epoch": 0.28, + "grad_norm": 1.821835902574183, + "learning_rate": 8.497335213324738e-06, + "loss": 0.5048, + "step": 3512 + }, + { + "epoch": 0.28, + "grad_norm": 1.5317020506007402, + "learning_rate": 8.496426191663905e-06, + "loss": 0.4856, + "step": 3513 + }, + { + "epoch": 0.28, + "grad_norm": 1.4918654086961218, + "learning_rate": 8.495516943788871e-06, + "loss": 0.4673, + "step": 3514 + }, + { + "epoch": 0.28, + "grad_norm": 2.4274132032077538, + "learning_rate": 8.494607469758463e-06, + "loss": 0.528, + "step": 3515 + }, + { + "epoch": 0.28, + "grad_norm": 1.4739577834237916, + "learning_rate": 8.493697769631524e-06, + "loss": 0.5008, + "step": 3516 + }, + { + "epoch": 0.28, + "grad_norm": 1.5457108714457204, + "learning_rate": 8.49278784346691e-06, + "loss": 0.5497, + "step": 3517 + }, + { + "epoch": 0.28, + "grad_norm": 1.3456723386650964, + "learning_rate": 8.49187769132349e-06, + "loss": 0.449, + "step": 3518 + }, + { + "epoch": 0.28, + "grad_norm": 1.3806135820535863, + "learning_rate": 8.490967313260152e-06, + "loss": 0.4417, + "step": 3519 + }, + { + "epoch": 0.28, + "grad_norm": 1.68446419143438, + "learning_rate": 8.490056709335797e-06, + "loss": 0.48, + "step": 3520 + }, + { + "epoch": 0.28, + "grad_norm": 3.1048098672420754, + "learning_rate": 8.489145879609338e-06, + "loss": 0.503, + "step": 3521 + }, + { + "epoch": 0.28, + "grad_norm": 1.5995198924446654, + "learning_rate": 8.488234824139702e-06, + "loss": 0.4826, + "step": 3522 + }, + { + "epoch": 0.28, + "grad_norm": 2.435998340285681, + "learning_rate": 8.48732354298584e-06, + "loss": 0.4914, + "step": 3523 + }, + { + "epoch": 0.28, + "grad_norm": 1.4155928686510237, + "learning_rate": 8.486412036206706e-06, + "loss": 0.4645, + "step": 3524 + }, + { + "epoch": 0.28, + "grad_norm": 1.4267204486689717, + "learning_rate": 8.485500303861273e-06, + "loss": 0.4812, + "step": 3525 + }, + { + "epoch": 0.28, + "grad_norm": 1.443856573254523, + "learning_rate": 8.48458834600853e-06, + "loss": 0.493, + "step": 3526 + }, + { + "epoch": 0.28, + "grad_norm": 1.777438449343234, + "learning_rate": 8.48367616270748e-06, + "loss": 0.5108, + "step": 3527 + }, + { + "epoch": 0.28, + "grad_norm": 1.6475435757119186, + "learning_rate": 8.482763754017139e-06, + "loss": 0.53, + "step": 3528 + }, + { + "epoch": 0.28, + "grad_norm": 1.351917881907467, + "learning_rate": 8.481851119996539e-06, + "loss": 0.4909, + "step": 3529 + }, + { + "epoch": 0.28, + "grad_norm": 1.5278940188659935, + "learning_rate": 8.480938260704726e-06, + "loss": 0.4982, + "step": 3530 + }, + { + "epoch": 0.28, + "grad_norm": 4.073438423814597, + "learning_rate": 8.480025176200763e-06, + "loss": 0.4855, + "step": 3531 + }, + { + "epoch": 0.28, + "grad_norm": 1.7445848600246636, + "learning_rate": 8.479111866543721e-06, + "loss": 0.5028, + "step": 3532 + }, + { + "epoch": 0.28, + "grad_norm": 1.8181362255757585, + "learning_rate": 8.478198331792694e-06, + "loss": 0.558, + "step": 3533 + }, + { + "epoch": 0.28, + "grad_norm": 1.4535009695487249, + "learning_rate": 8.477284572006786e-06, + "loss": 0.4959, + "step": 3534 + }, + { + "epoch": 0.28, + "grad_norm": 1.4926737882913486, + "learning_rate": 8.476370587245114e-06, + "loss": 0.4862, + "step": 3535 + }, + { + "epoch": 0.28, + "grad_norm": 0.6684025680871163, + "learning_rate": 8.475456377566813e-06, + "loss": 0.5432, + "step": 3536 + }, + { + "epoch": 0.28, + "grad_norm": 1.5798026142982897, + "learning_rate": 8.474541943031029e-06, + "loss": 0.5199, + "step": 3537 + }, + { + "epoch": 0.28, + "grad_norm": 1.3192915533053526, + "learning_rate": 8.47362728369693e-06, + "loss": 0.4334, + "step": 3538 + }, + { + "epoch": 0.28, + "grad_norm": 6.3255908843484505, + "learning_rate": 8.472712399623686e-06, + "loss": 0.4786, + "step": 3539 + }, + { + "epoch": 0.28, + "grad_norm": 1.4751602488406883, + "learning_rate": 8.471797290870497e-06, + "loss": 0.5017, + "step": 3540 + }, + { + "epoch": 0.28, + "grad_norm": 2.7480428416570915, + "learning_rate": 8.470881957496559e-06, + "loss": 0.4833, + "step": 3541 + }, + { + "epoch": 0.28, + "grad_norm": 6.056886866342889, + "learning_rate": 8.469966399561102e-06, + "loss": 0.5036, + "step": 3542 + }, + { + "epoch": 0.28, + "grad_norm": 1.7825901255222387, + "learning_rate": 8.46905061712336e-06, + "loss": 0.5584, + "step": 3543 + }, + { + "epoch": 0.28, + "grad_norm": 1.859025861444017, + "learning_rate": 8.46813461024258e-06, + "loss": 0.4797, + "step": 3544 + }, + { + "epoch": 0.28, + "grad_norm": 1.6106268913973407, + "learning_rate": 8.467218378978028e-06, + "loss": 0.4446, + "step": 3545 + }, + { + "epoch": 0.28, + "grad_norm": 1.8228460113244127, + "learning_rate": 8.466301923388981e-06, + "loss": 0.4717, + "step": 3546 + }, + { + "epoch": 0.28, + "grad_norm": 1.735035631212725, + "learning_rate": 8.465385243534735e-06, + "loss": 0.4518, + "step": 3547 + }, + { + "epoch": 0.28, + "grad_norm": 1.5826214244177539, + "learning_rate": 8.464468339474596e-06, + "loss": 0.5217, + "step": 3548 + }, + { + "epoch": 0.28, + "grad_norm": 1.3887343216738572, + "learning_rate": 8.463551211267888e-06, + "loss": 0.5174, + "step": 3549 + }, + { + "epoch": 0.28, + "grad_norm": 1.7914155208098235, + "learning_rate": 8.462633858973948e-06, + "loss": 0.473, + "step": 3550 + }, + { + "epoch": 0.28, + "grad_norm": 1.3641775461124308, + "learning_rate": 8.461716282652127e-06, + "loss": 0.4774, + "step": 3551 + }, + { + "epoch": 0.28, + "grad_norm": 1.6515559426233766, + "learning_rate": 8.460798482361792e-06, + "loss": 0.5333, + "step": 3552 + }, + { + "epoch": 0.28, + "grad_norm": 1.4193496124092668, + "learning_rate": 8.459880458162322e-06, + "loss": 0.5079, + "step": 3553 + }, + { + "epoch": 0.28, + "grad_norm": 1.5314299743510118, + "learning_rate": 8.45896221011311e-06, + "loss": 0.5175, + "step": 3554 + }, + { + "epoch": 0.28, + "grad_norm": 1.7408926352892873, + "learning_rate": 8.458043738273572e-06, + "loss": 0.5003, + "step": 3555 + }, + { + "epoch": 0.28, + "grad_norm": 1.3346030736207017, + "learning_rate": 8.457125042703124e-06, + "loss": 0.4664, + "step": 3556 + }, + { + "epoch": 0.28, + "grad_norm": 1.678751819972822, + "learning_rate": 8.45620612346121e-06, + "loss": 0.5013, + "step": 3557 + }, + { + "epoch": 0.28, + "grad_norm": 1.6570189391176575, + "learning_rate": 8.455286980607282e-06, + "loss": 0.4734, + "step": 3558 + }, + { + "epoch": 0.28, + "grad_norm": 1.3768948006700044, + "learning_rate": 8.454367614200805e-06, + "loss": 0.4619, + "step": 3559 + }, + { + "epoch": 0.28, + "grad_norm": 0.6859896118371359, + "learning_rate": 8.453448024301262e-06, + "loss": 0.5476, + "step": 3560 + }, + { + "epoch": 0.28, + "grad_norm": 0.6494732187888806, + "learning_rate": 8.45252821096815e-06, + "loss": 0.5421, + "step": 3561 + }, + { + "epoch": 0.28, + "grad_norm": 0.6054544767721078, + "learning_rate": 8.45160817426098e-06, + "loss": 0.5314, + "step": 3562 + }, + { + "epoch": 0.28, + "grad_norm": 1.6439177271886503, + "learning_rate": 8.450687914239275e-06, + "loss": 0.521, + "step": 3563 + }, + { + "epoch": 0.28, + "grad_norm": 0.643460096864342, + "learning_rate": 8.449767430962577e-06, + "loss": 0.5235, + "step": 3564 + }, + { + "epoch": 0.28, + "grad_norm": 1.7664559954007528, + "learning_rate": 8.448846724490438e-06, + "loss": 0.5396, + "step": 3565 + }, + { + "epoch": 0.28, + "grad_norm": 0.646243594182818, + "learning_rate": 8.44792579488243e-06, + "loss": 0.5239, + "step": 3566 + }, + { + "epoch": 0.28, + "grad_norm": 1.7128601074280965, + "learning_rate": 8.44700464219813e-06, + "loss": 0.4825, + "step": 3567 + }, + { + "epoch": 0.28, + "grad_norm": 1.438059198246418, + "learning_rate": 8.446083266497142e-06, + "loss": 0.4409, + "step": 3568 + }, + { + "epoch": 0.28, + "grad_norm": 1.752887591465036, + "learning_rate": 8.445161667839075e-06, + "loss": 0.4985, + "step": 3569 + }, + { + "epoch": 0.28, + "grad_norm": 1.411418947943841, + "learning_rate": 8.444239846283553e-06, + "loss": 0.5236, + "step": 3570 + }, + { + "epoch": 0.28, + "grad_norm": 1.616626438960821, + "learning_rate": 8.443317801890219e-06, + "loss": 0.4858, + "step": 3571 + }, + { + "epoch": 0.28, + "grad_norm": 1.491130309075099, + "learning_rate": 8.442395534718729e-06, + "loss": 0.4665, + "step": 3572 + }, + { + "epoch": 0.28, + "grad_norm": 2.106509568799857, + "learning_rate": 8.44147304482875e-06, + "loss": 0.4756, + "step": 3573 + }, + { + "epoch": 0.28, + "grad_norm": 1.4264216274682489, + "learning_rate": 8.440550332279965e-06, + "loss": 0.4429, + "step": 3574 + }, + { + "epoch": 0.28, + "grad_norm": 1.3626845341637992, + "learning_rate": 8.439627397132075e-06, + "loss": 0.5021, + "step": 3575 + }, + { + "epoch": 0.28, + "grad_norm": 1.5767618520775728, + "learning_rate": 8.438704239444795e-06, + "loss": 0.4652, + "step": 3576 + }, + { + "epoch": 0.28, + "grad_norm": 1.4846372811702195, + "learning_rate": 8.437780859277846e-06, + "loss": 0.4692, + "step": 3577 + }, + { + "epoch": 0.28, + "grad_norm": 1.452022770824617, + "learning_rate": 8.436857256690974e-06, + "loss": 0.5224, + "step": 3578 + }, + { + "epoch": 0.28, + "grad_norm": 1.6081258294461187, + "learning_rate": 8.435933431743935e-06, + "loss": 0.5447, + "step": 3579 + }, + { + "epoch": 0.28, + "grad_norm": 0.6767102130019608, + "learning_rate": 8.435009384496496e-06, + "loss": 0.5245, + "step": 3580 + }, + { + "epoch": 0.28, + "grad_norm": 1.9187670411482056, + "learning_rate": 8.434085115008443e-06, + "loss": 0.4945, + "step": 3581 + }, + { + "epoch": 0.28, + "grad_norm": 1.4444595754548155, + "learning_rate": 8.433160623339575e-06, + "loss": 0.4912, + "step": 3582 + }, + { + "epoch": 0.28, + "grad_norm": 9.650099163614648, + "learning_rate": 8.432235909549706e-06, + "loss": 0.5117, + "step": 3583 + }, + { + "epoch": 0.28, + "grad_norm": 1.2664314748880683, + "learning_rate": 8.431310973698665e-06, + "loss": 0.502, + "step": 3584 + }, + { + "epoch": 0.28, + "grad_norm": 0.6266398943723601, + "learning_rate": 8.43038581584629e-06, + "loss": 0.523, + "step": 3585 + }, + { + "epoch": 0.28, + "grad_norm": 1.325519955965469, + "learning_rate": 8.429460436052443e-06, + "loss": 0.4827, + "step": 3586 + }, + { + "epoch": 0.28, + "grad_norm": 1.6798428840807524, + "learning_rate": 8.428534834376993e-06, + "loss": 0.5074, + "step": 3587 + }, + { + "epoch": 0.28, + "grad_norm": 1.354008458249079, + "learning_rate": 8.427609010879823e-06, + "loss": 0.4918, + "step": 3588 + }, + { + "epoch": 0.28, + "grad_norm": 0.5927332098688273, + "learning_rate": 8.426682965620832e-06, + "loss": 0.5471, + "step": 3589 + }, + { + "epoch": 0.28, + "grad_norm": 1.3996701652895742, + "learning_rate": 8.425756698659937e-06, + "loss": 0.5002, + "step": 3590 + }, + { + "epoch": 0.28, + "grad_norm": 1.6368440826370525, + "learning_rate": 8.424830210057066e-06, + "loss": 0.5113, + "step": 3591 + }, + { + "epoch": 0.28, + "grad_norm": 1.4847076839752928, + "learning_rate": 8.42390349987216e-06, + "loss": 0.4728, + "step": 3592 + }, + { + "epoch": 0.28, + "grad_norm": 1.6278271222290683, + "learning_rate": 8.422976568165176e-06, + "loss": 0.4866, + "step": 3593 + }, + { + "epoch": 0.28, + "grad_norm": 1.7368030981109381, + "learning_rate": 8.422049414996087e-06, + "loss": 0.4791, + "step": 3594 + }, + { + "epoch": 0.28, + "grad_norm": 8.255774396991459, + "learning_rate": 8.421122040424876e-06, + "loss": 0.4982, + "step": 3595 + }, + { + "epoch": 0.28, + "grad_norm": 1.5335374335815604, + "learning_rate": 8.420194444511545e-06, + "loss": 0.495, + "step": 3596 + }, + { + "epoch": 0.28, + "grad_norm": 1.5998144187920822, + "learning_rate": 8.419266627316109e-06, + "loss": 0.5537, + "step": 3597 + }, + { + "epoch": 0.28, + "grad_norm": 1.4180678312098747, + "learning_rate": 8.418338588898594e-06, + "loss": 0.4827, + "step": 3598 + }, + { + "epoch": 0.28, + "grad_norm": 1.4130962905531788, + "learning_rate": 8.417410329319043e-06, + "loss": 0.5016, + "step": 3599 + }, + { + "epoch": 0.28, + "grad_norm": 1.5608017635613922, + "learning_rate": 8.416481848637515e-06, + "loss": 0.5209, + "step": 3600 + }, + { + "epoch": 0.28, + "grad_norm": 1.5568996716218513, + "learning_rate": 8.415553146914081e-06, + "loss": 0.5327, + "step": 3601 + }, + { + "epoch": 0.28, + "grad_norm": 1.455557141402252, + "learning_rate": 8.414624224208828e-06, + "loss": 0.4659, + "step": 3602 + }, + { + "epoch": 0.28, + "grad_norm": 1.5745702764272025, + "learning_rate": 8.413695080581851e-06, + "loss": 0.4604, + "step": 3603 + }, + { + "epoch": 0.28, + "grad_norm": 0.668732852064739, + "learning_rate": 8.412765716093273e-06, + "loss": 0.5274, + "step": 3604 + }, + { + "epoch": 0.28, + "grad_norm": 1.3305580848365866, + "learning_rate": 8.411836130803212e-06, + "loss": 0.4715, + "step": 3605 + }, + { + "epoch": 0.28, + "grad_norm": 0.5926423959064373, + "learning_rate": 8.41090632477182e-06, + "loss": 0.5242, + "step": 3606 + }, + { + "epoch": 0.28, + "grad_norm": 1.3083595333188167, + "learning_rate": 8.409976298059251e-06, + "loss": 0.4807, + "step": 3607 + }, + { + "epoch": 0.28, + "grad_norm": 1.3071979716957935, + "learning_rate": 8.409046050725675e-06, + "loss": 0.455, + "step": 3608 + }, + { + "epoch": 0.28, + "grad_norm": 1.4367513481825063, + "learning_rate": 8.408115582831278e-06, + "loss": 0.4911, + "step": 3609 + }, + { + "epoch": 0.28, + "grad_norm": 0.6271851818348064, + "learning_rate": 8.407184894436263e-06, + "loss": 0.537, + "step": 3610 + }, + { + "epoch": 0.28, + "grad_norm": 0.5840238330626331, + "learning_rate": 8.406253985600843e-06, + "loss": 0.5285, + "step": 3611 + }, + { + "epoch": 0.28, + "grad_norm": 1.5009676087361101, + "learning_rate": 8.405322856385246e-06, + "loss": 0.4817, + "step": 3612 + }, + { + "epoch": 0.28, + "grad_norm": 1.5135178153963298, + "learning_rate": 8.404391506849714e-06, + "loss": 0.4865, + "step": 3613 + }, + { + "epoch": 0.28, + "grad_norm": 1.378625146542494, + "learning_rate": 8.403459937054504e-06, + "loss": 0.5105, + "step": 3614 + }, + { + "epoch": 0.28, + "grad_norm": 1.3172797320275331, + "learning_rate": 8.40252814705989e-06, + "loss": 0.5182, + "step": 3615 + }, + { + "epoch": 0.28, + "grad_norm": 1.5075756972089034, + "learning_rate": 8.401596136926156e-06, + "loss": 0.4732, + "step": 3616 + }, + { + "epoch": 0.28, + "grad_norm": 1.4831869707440288, + "learning_rate": 8.400663906713599e-06, + "loss": 0.4662, + "step": 3617 + }, + { + "epoch": 0.28, + "grad_norm": 1.5312659921644027, + "learning_rate": 8.39973145648254e-06, + "loss": 0.5196, + "step": 3618 + }, + { + "epoch": 0.28, + "grad_norm": 1.4662583621163991, + "learning_rate": 8.3987987862933e-06, + "loss": 0.5217, + "step": 3619 + }, + { + "epoch": 0.28, + "grad_norm": 1.3412122834367557, + "learning_rate": 8.397865896206226e-06, + "loss": 0.5044, + "step": 3620 + }, + { + "epoch": 0.28, + "grad_norm": 2.356033907806469, + "learning_rate": 8.396932786281674e-06, + "loss": 0.5699, + "step": 3621 + }, + { + "epoch": 0.28, + "grad_norm": 1.3912361012916707, + "learning_rate": 8.395999456580015e-06, + "loss": 0.5185, + "step": 3622 + }, + { + "epoch": 0.28, + "grad_norm": 1.2778690381500915, + "learning_rate": 8.395065907161632e-06, + "loss": 0.4698, + "step": 3623 + }, + { + "epoch": 0.28, + "grad_norm": 0.749647622379653, + "learning_rate": 8.394132138086925e-06, + "loss": 0.5237, + "step": 3624 + }, + { + "epoch": 0.28, + "grad_norm": 0.6833563563685068, + "learning_rate": 8.393198149416311e-06, + "loss": 0.5363, + "step": 3625 + }, + { + "epoch": 0.28, + "grad_norm": 1.425835200719499, + "learning_rate": 8.392263941210217e-06, + "loss": 0.5174, + "step": 3626 + }, + { + "epoch": 0.28, + "grad_norm": 0.6020891866968889, + "learning_rate": 8.39132951352908e-06, + "loss": 0.5236, + "step": 3627 + }, + { + "epoch": 0.28, + "grad_norm": 1.6096526898170944, + "learning_rate": 8.390394866433362e-06, + "loss": 0.4736, + "step": 3628 + }, + { + "epoch": 0.29, + "grad_norm": 1.339035364894106, + "learning_rate": 8.38945999998353e-06, + "loss": 0.5202, + "step": 3629 + }, + { + "epoch": 0.29, + "grad_norm": 1.3730360375920494, + "learning_rate": 8.388524914240073e-06, + "loss": 0.4885, + "step": 3630 + }, + { + "epoch": 0.29, + "grad_norm": 2.0538202914352133, + "learning_rate": 8.387589609263483e-06, + "loss": 0.4812, + "step": 3631 + }, + { + "epoch": 0.29, + "grad_norm": 0.9050672358915803, + "learning_rate": 8.386654085114278e-06, + "loss": 0.5672, + "step": 3632 + }, + { + "epoch": 0.29, + "grad_norm": 1.4460819114537384, + "learning_rate": 8.385718341852985e-06, + "loss": 0.5277, + "step": 3633 + }, + { + "epoch": 0.29, + "grad_norm": 1.5245473951355866, + "learning_rate": 8.384782379540145e-06, + "loss": 0.5122, + "step": 3634 + }, + { + "epoch": 0.29, + "grad_norm": 1.4189005824995047, + "learning_rate": 8.383846198236311e-06, + "loss": 0.4784, + "step": 3635 + }, + { + "epoch": 0.29, + "grad_norm": 1.4181678193995244, + "learning_rate": 8.382909798002056e-06, + "loss": 0.4887, + "step": 3636 + }, + { + "epoch": 0.29, + "grad_norm": 1.486439996632292, + "learning_rate": 8.381973178897961e-06, + "loss": 0.5246, + "step": 3637 + }, + { + "epoch": 0.29, + "grad_norm": 1.695807785165534, + "learning_rate": 8.381036340984628e-06, + "loss": 0.5181, + "step": 3638 + }, + { + "epoch": 0.29, + "grad_norm": 1.1976558446208188, + "learning_rate": 8.380099284322665e-06, + "loss": 0.4543, + "step": 3639 + }, + { + "epoch": 0.29, + "grad_norm": 1.4127796115145583, + "learning_rate": 8.379162008972698e-06, + "loss": 0.4863, + "step": 3640 + }, + { + "epoch": 0.29, + "grad_norm": 1.4012040675919155, + "learning_rate": 8.378224514995372e-06, + "loss": 0.5236, + "step": 3641 + }, + { + "epoch": 0.29, + "grad_norm": 0.8020871533457317, + "learning_rate": 8.37728680245134e-06, + "loss": 0.5477, + "step": 3642 + }, + { + "epoch": 0.29, + "grad_norm": 1.2999157109260366, + "learning_rate": 8.376348871401268e-06, + "loss": 0.5062, + "step": 3643 + }, + { + "epoch": 0.29, + "grad_norm": 1.664901543565906, + "learning_rate": 8.37541072190584e-06, + "loss": 0.4927, + "step": 3644 + }, + { + "epoch": 0.29, + "grad_norm": 1.4800927882483752, + "learning_rate": 8.374472354025756e-06, + "loss": 0.5381, + "step": 3645 + }, + { + "epoch": 0.29, + "grad_norm": 2.0990127583477545, + "learning_rate": 8.373533767821725e-06, + "loss": 0.5116, + "step": 3646 + }, + { + "epoch": 0.29, + "grad_norm": 1.7631627674661245, + "learning_rate": 8.372594963354473e-06, + "loss": 0.4683, + "step": 3647 + }, + { + "epoch": 0.29, + "grad_norm": 1.4616009640933676, + "learning_rate": 8.371655940684737e-06, + "loss": 0.4995, + "step": 3648 + }, + { + "epoch": 0.29, + "grad_norm": 1.509840774863518, + "learning_rate": 8.370716699873273e-06, + "loss": 0.4553, + "step": 3649 + }, + { + "epoch": 0.29, + "grad_norm": 1.55416593727066, + "learning_rate": 8.369777240980848e-06, + "loss": 0.4944, + "step": 3650 + }, + { + "epoch": 0.29, + "grad_norm": 1.6333501333635299, + "learning_rate": 8.368837564068243e-06, + "loss": 0.5128, + "step": 3651 + }, + { + "epoch": 0.29, + "grad_norm": 1.4277752004247202, + "learning_rate": 8.367897669196256e-06, + "loss": 0.5478, + "step": 3652 + }, + { + "epoch": 0.29, + "grad_norm": 1.560428823301475, + "learning_rate": 8.366957556425694e-06, + "loss": 0.4718, + "step": 3653 + }, + { + "epoch": 0.29, + "grad_norm": 0.6383039856130129, + "learning_rate": 8.366017225817382e-06, + "loss": 0.5447, + "step": 3654 + }, + { + "epoch": 0.29, + "grad_norm": 0.6394021376648567, + "learning_rate": 8.365076677432162e-06, + "loss": 0.559, + "step": 3655 + }, + { + "epoch": 0.29, + "grad_norm": 0.5771203653166764, + "learning_rate": 8.36413591133088e-06, + "loss": 0.5407, + "step": 3656 + }, + { + "epoch": 0.29, + "grad_norm": 1.6686323733832875, + "learning_rate": 8.363194927574407e-06, + "loss": 0.5108, + "step": 3657 + }, + { + "epoch": 0.29, + "grad_norm": 1.435826133856278, + "learning_rate": 8.362253726223622e-06, + "loss": 0.4819, + "step": 3658 + }, + { + "epoch": 0.29, + "grad_norm": 1.3510474858697588, + "learning_rate": 8.361312307339419e-06, + "loss": 0.5396, + "step": 3659 + }, + { + "epoch": 0.29, + "grad_norm": 1.4687808067060752, + "learning_rate": 8.360370670982706e-06, + "loss": 0.4597, + "step": 3660 + }, + { + "epoch": 0.29, + "grad_norm": 1.359503248306933, + "learning_rate": 8.359428817214408e-06, + "loss": 0.4444, + "step": 3661 + }, + { + "epoch": 0.29, + "grad_norm": 1.5341989240544043, + "learning_rate": 8.358486746095458e-06, + "loss": 0.4941, + "step": 3662 + }, + { + "epoch": 0.29, + "grad_norm": 1.5049758966319133, + "learning_rate": 8.35754445768681e-06, + "loss": 0.5512, + "step": 3663 + }, + { + "epoch": 0.29, + "grad_norm": 1.5272427359393728, + "learning_rate": 8.356601952049429e-06, + "loss": 0.4939, + "step": 3664 + }, + { + "epoch": 0.29, + "grad_norm": 6.213701047324028, + "learning_rate": 8.35565922924429e-06, + "loss": 0.4178, + "step": 3665 + }, + { + "epoch": 0.29, + "grad_norm": 1.5123748759675288, + "learning_rate": 8.354716289332393e-06, + "loss": 0.4761, + "step": 3666 + }, + { + "epoch": 0.29, + "grad_norm": 2.9578774684331033, + "learning_rate": 8.353773132374737e-06, + "loss": 0.4775, + "step": 3667 + }, + { + "epoch": 0.29, + "grad_norm": 1.6699576718790616, + "learning_rate": 8.352829758432347e-06, + "loss": 0.5018, + "step": 3668 + }, + { + "epoch": 0.29, + "grad_norm": 1.6568876379253346, + "learning_rate": 8.351886167566258e-06, + "loss": 0.5358, + "step": 3669 + }, + { + "epoch": 0.29, + "grad_norm": 1.373874163611425, + "learning_rate": 8.350942359837519e-06, + "loss": 0.5139, + "step": 3670 + }, + { + "epoch": 0.29, + "grad_norm": 1.574280488067774, + "learning_rate": 8.349998335307194e-06, + "loss": 0.514, + "step": 3671 + }, + { + "epoch": 0.29, + "grad_norm": 1.8896079019950032, + "learning_rate": 8.349054094036358e-06, + "loss": 0.5124, + "step": 3672 + }, + { + "epoch": 0.29, + "grad_norm": 1.7579451809697362, + "learning_rate": 8.348109636086104e-06, + "loss": 0.4579, + "step": 3673 + }, + { + "epoch": 0.29, + "grad_norm": 1.3339654044984615, + "learning_rate": 8.347164961517536e-06, + "loss": 0.4612, + "step": 3674 + }, + { + "epoch": 0.29, + "grad_norm": 2.234452034880042, + "learning_rate": 8.346220070391773e-06, + "loss": 0.5284, + "step": 3675 + }, + { + "epoch": 0.29, + "grad_norm": 1.0521907168035736, + "learning_rate": 8.345274962769947e-06, + "loss": 0.5818, + "step": 3676 + }, + { + "epoch": 0.29, + "grad_norm": 1.2870402296543073, + "learning_rate": 8.34432963871321e-06, + "loss": 0.4791, + "step": 3677 + }, + { + "epoch": 0.29, + "grad_norm": 1.600590234802461, + "learning_rate": 8.34338409828272e-06, + "loss": 0.5055, + "step": 3678 + }, + { + "epoch": 0.29, + "grad_norm": 1.7591572777409372, + "learning_rate": 8.342438341539651e-06, + "loss": 0.4582, + "step": 3679 + }, + { + "epoch": 0.29, + "grad_norm": 1.6863892118210946, + "learning_rate": 8.341492368545195e-06, + "loss": 0.4537, + "step": 3680 + }, + { + "epoch": 0.29, + "grad_norm": 2.5182792774301714, + "learning_rate": 8.340546179360555e-06, + "loss": 0.4878, + "step": 3681 + }, + { + "epoch": 0.29, + "grad_norm": 1.3898958747068006, + "learning_rate": 8.339599774046945e-06, + "loss": 0.4807, + "step": 3682 + }, + { + "epoch": 0.29, + "grad_norm": 2.158201053523248, + "learning_rate": 8.3386531526656e-06, + "loss": 0.474, + "step": 3683 + }, + { + "epoch": 0.29, + "grad_norm": 1.2782419474140292, + "learning_rate": 8.337706315277763e-06, + "loss": 0.5269, + "step": 3684 + }, + { + "epoch": 0.29, + "grad_norm": 1.8343384399913252, + "learning_rate": 8.336759261944694e-06, + "loss": 0.4806, + "step": 3685 + }, + { + "epoch": 0.29, + "grad_norm": 0.8976073722291327, + "learning_rate": 8.335811992727665e-06, + "loss": 0.5382, + "step": 3686 + }, + { + "epoch": 0.29, + "grad_norm": 1.745789822859232, + "learning_rate": 8.334864507687964e-06, + "loss": 0.5362, + "step": 3687 + }, + { + "epoch": 0.29, + "grad_norm": 2.095025528556599, + "learning_rate": 8.333916806886893e-06, + "loss": 0.4743, + "step": 3688 + }, + { + "epoch": 0.29, + "grad_norm": 1.9150248256766138, + "learning_rate": 8.332968890385764e-06, + "loss": 0.4742, + "step": 3689 + }, + { + "epoch": 0.29, + "grad_norm": 1.6154573575864144, + "learning_rate": 8.332020758245909e-06, + "loss": 0.4682, + "step": 3690 + }, + { + "epoch": 0.29, + "grad_norm": 0.5999771527864105, + "learning_rate": 8.331072410528669e-06, + "loss": 0.5291, + "step": 3691 + }, + { + "epoch": 0.29, + "grad_norm": 0.6168834821015051, + "learning_rate": 8.330123847295403e-06, + "loss": 0.5064, + "step": 3692 + }, + { + "epoch": 0.29, + "grad_norm": 1.6991481769379488, + "learning_rate": 8.32917506860748e-06, + "loss": 0.4843, + "step": 3693 + }, + { + "epoch": 0.29, + "grad_norm": 1.5929590193488814, + "learning_rate": 8.328226074526284e-06, + "loss": 0.4981, + "step": 3694 + }, + { + "epoch": 0.29, + "grad_norm": 1.76714040876322, + "learning_rate": 8.327276865113216e-06, + "loss": 0.5124, + "step": 3695 + }, + { + "epoch": 0.29, + "grad_norm": 1.5525842591015449, + "learning_rate": 8.326327440429688e-06, + "loss": 0.4611, + "step": 3696 + }, + { + "epoch": 0.29, + "grad_norm": 1.7311441835789767, + "learning_rate": 8.325377800537124e-06, + "loss": 0.482, + "step": 3697 + }, + { + "epoch": 0.29, + "grad_norm": 1.4681202042929036, + "learning_rate": 8.32442794549697e-06, + "loss": 0.4654, + "step": 3698 + }, + { + "epoch": 0.29, + "grad_norm": 6.03116144345462, + "learning_rate": 8.323477875370673e-06, + "loss": 0.4903, + "step": 3699 + }, + { + "epoch": 0.29, + "grad_norm": 1.3568805539139934, + "learning_rate": 8.322527590219708e-06, + "loss": 0.4957, + "step": 3700 + }, + { + "epoch": 0.29, + "grad_norm": 1.239083802536056, + "learning_rate": 8.321577090105551e-06, + "loss": 0.517, + "step": 3701 + }, + { + "epoch": 0.29, + "grad_norm": 1.508132178364029, + "learning_rate": 8.320626375089705e-06, + "loss": 0.5023, + "step": 3702 + }, + { + "epoch": 0.29, + "grad_norm": 1.6680491274277014, + "learning_rate": 8.319675445233673e-06, + "loss": 0.4861, + "step": 3703 + }, + { + "epoch": 0.29, + "grad_norm": 1.8665834755448074, + "learning_rate": 8.318724300598984e-06, + "loss": 0.4787, + "step": 3704 + }, + { + "epoch": 0.29, + "grad_norm": 1.4859202594869967, + "learning_rate": 8.317772941247175e-06, + "loss": 0.4481, + "step": 3705 + }, + { + "epoch": 0.29, + "grad_norm": 1.8304996951403574, + "learning_rate": 8.316821367239797e-06, + "loss": 0.446, + "step": 3706 + }, + { + "epoch": 0.29, + "grad_norm": 0.8880781423600487, + "learning_rate": 8.315869578638413e-06, + "loss": 0.5209, + "step": 3707 + }, + { + "epoch": 0.29, + "grad_norm": 1.2970785410497607, + "learning_rate": 8.314917575504607e-06, + "loss": 0.4917, + "step": 3708 + }, + { + "epoch": 0.29, + "grad_norm": 1.7830153900326597, + "learning_rate": 8.313965357899972e-06, + "loss": 0.5072, + "step": 3709 + }, + { + "epoch": 0.29, + "grad_norm": 2.140628315428497, + "learning_rate": 8.31301292588611e-06, + "loss": 0.5014, + "step": 3710 + }, + { + "epoch": 0.29, + "grad_norm": 1.9698084138189735, + "learning_rate": 8.312060279524647e-06, + "loss": 0.5245, + "step": 3711 + }, + { + "epoch": 0.29, + "grad_norm": 0.5618078101630656, + "learning_rate": 8.311107418877216e-06, + "loss": 0.5316, + "step": 3712 + }, + { + "epoch": 0.29, + "grad_norm": 1.8519049290637395, + "learning_rate": 8.310154344005468e-06, + "loss": 0.4636, + "step": 3713 + }, + { + "epoch": 0.29, + "grad_norm": 1.8962938675342438, + "learning_rate": 8.309201054971064e-06, + "loss": 0.4429, + "step": 3714 + }, + { + "epoch": 0.29, + "grad_norm": 1.6473207862588837, + "learning_rate": 8.30824755183568e-06, + "loss": 0.5175, + "step": 3715 + }, + { + "epoch": 0.29, + "grad_norm": 1.4466294537403426, + "learning_rate": 8.307293834661008e-06, + "loss": 0.468, + "step": 3716 + }, + { + "epoch": 0.29, + "grad_norm": 1.4365003490148947, + "learning_rate": 8.30633990350875e-06, + "loss": 0.4991, + "step": 3717 + }, + { + "epoch": 0.29, + "grad_norm": 1.786382553864655, + "learning_rate": 8.305385758440627e-06, + "loss": 0.4943, + "step": 3718 + }, + { + "epoch": 0.29, + "grad_norm": 1.3940101036012826, + "learning_rate": 8.30443139951837e-06, + "loss": 0.5393, + "step": 3719 + }, + { + "epoch": 0.29, + "grad_norm": 1.8638661185479024, + "learning_rate": 8.30347682680372e-06, + "loss": 0.5263, + "step": 3720 + }, + { + "epoch": 0.29, + "grad_norm": 2.06328229021623, + "learning_rate": 8.302522040358446e-06, + "loss": 0.488, + "step": 3721 + }, + { + "epoch": 0.29, + "grad_norm": 1.2047078109725002, + "learning_rate": 8.301567040244312e-06, + "loss": 0.4732, + "step": 3722 + }, + { + "epoch": 0.29, + "grad_norm": 1.8093265826488936, + "learning_rate": 8.30061182652311e-06, + "loss": 0.5469, + "step": 3723 + }, + { + "epoch": 0.29, + "grad_norm": 1.5135087958002031, + "learning_rate": 8.299656399256644e-06, + "loss": 0.496, + "step": 3724 + }, + { + "epoch": 0.29, + "grad_norm": 1.5689711344266022, + "learning_rate": 8.298700758506722e-06, + "loss": 0.4415, + "step": 3725 + }, + { + "epoch": 0.29, + "grad_norm": 1.6836482587843618, + "learning_rate": 8.297744904335179e-06, + "loss": 0.4973, + "step": 3726 + }, + { + "epoch": 0.29, + "grad_norm": 0.770427420309001, + "learning_rate": 8.296788836803853e-06, + "loss": 0.5624, + "step": 3727 + }, + { + "epoch": 0.29, + "grad_norm": 1.4087254943549288, + "learning_rate": 8.295832555974602e-06, + "loss": 0.4677, + "step": 3728 + }, + { + "epoch": 0.29, + "grad_norm": 1.532908804421045, + "learning_rate": 8.294876061909296e-06, + "loss": 0.4526, + "step": 3729 + }, + { + "epoch": 0.29, + "grad_norm": 0.6111580101885888, + "learning_rate": 8.29391935466982e-06, + "loss": 0.5354, + "step": 3730 + }, + { + "epoch": 0.29, + "grad_norm": 4.279204277074283, + "learning_rate": 8.292962434318071e-06, + "loss": 0.5904, + "step": 3731 + }, + { + "epoch": 0.29, + "grad_norm": 0.6063369680500129, + "learning_rate": 8.292005300915957e-06, + "loss": 0.5449, + "step": 3732 + }, + { + "epoch": 0.29, + "grad_norm": 1.3822382912939843, + "learning_rate": 8.29104795452541e-06, + "loss": 0.4256, + "step": 3733 + }, + { + "epoch": 0.29, + "grad_norm": 1.3601984443549668, + "learning_rate": 8.290090395208363e-06, + "loss": 0.4985, + "step": 3734 + }, + { + "epoch": 0.29, + "grad_norm": 1.9526006432895975, + "learning_rate": 8.289132623026774e-06, + "loss": 0.5362, + "step": 3735 + }, + { + "epoch": 0.29, + "grad_norm": 1.4957540809941587, + "learning_rate": 8.288174638042606e-06, + "loss": 0.4975, + "step": 3736 + }, + { + "epoch": 0.29, + "grad_norm": 1.2348007329995847, + "learning_rate": 8.28721644031784e-06, + "loss": 0.4282, + "step": 3737 + }, + { + "epoch": 0.29, + "grad_norm": 0.7233954770576565, + "learning_rate": 8.286258029914472e-06, + "loss": 0.5387, + "step": 3738 + }, + { + "epoch": 0.29, + "grad_norm": 1.4780060156273445, + "learning_rate": 8.285299406894506e-06, + "loss": 0.5248, + "step": 3739 + }, + { + "epoch": 0.29, + "grad_norm": 1.2489293535804054, + "learning_rate": 8.284340571319968e-06, + "loss": 0.4692, + "step": 3740 + }, + { + "epoch": 0.29, + "grad_norm": 1.4638442473295616, + "learning_rate": 8.28338152325289e-06, + "loss": 0.5394, + "step": 3741 + }, + { + "epoch": 0.29, + "grad_norm": 1.510621021386326, + "learning_rate": 8.282422262755323e-06, + "loss": 0.4906, + "step": 3742 + }, + { + "epoch": 0.29, + "grad_norm": 1.3725085462111033, + "learning_rate": 8.281462789889331e-06, + "loss": 0.4916, + "step": 3743 + }, + { + "epoch": 0.29, + "grad_norm": 1.4767098502512201, + "learning_rate": 8.280503104716989e-06, + "loss": 0.5191, + "step": 3744 + }, + { + "epoch": 0.29, + "grad_norm": 0.6113840721718989, + "learning_rate": 8.279543207300384e-06, + "loss": 0.5396, + "step": 3745 + }, + { + "epoch": 0.29, + "grad_norm": 1.5724818544146906, + "learning_rate": 8.278583097701626e-06, + "loss": 0.5254, + "step": 3746 + }, + { + "epoch": 0.29, + "grad_norm": 1.3047635487215181, + "learning_rate": 8.277622775982831e-06, + "loss": 0.4541, + "step": 3747 + }, + { + "epoch": 0.29, + "grad_norm": 1.249100614443897, + "learning_rate": 8.276662242206131e-06, + "loss": 0.4665, + "step": 3748 + }, + { + "epoch": 0.29, + "grad_norm": 0.5966880575870631, + "learning_rate": 8.27570149643367e-06, + "loss": 0.5176, + "step": 3749 + }, + { + "epoch": 0.29, + "grad_norm": 1.6579532880021566, + "learning_rate": 8.274740538727606e-06, + "loss": 0.5118, + "step": 3750 + }, + { + "epoch": 0.29, + "grad_norm": 1.6090880241849397, + "learning_rate": 8.273779369150114e-06, + "loss": 0.5142, + "step": 3751 + }, + { + "epoch": 0.29, + "grad_norm": 1.3907504051977013, + "learning_rate": 8.27281798776338e-06, + "loss": 0.457, + "step": 3752 + }, + { + "epoch": 0.29, + "grad_norm": 1.988991808112168, + "learning_rate": 8.271856394629603e-06, + "loss": 0.4843, + "step": 3753 + }, + { + "epoch": 0.29, + "grad_norm": 1.3476339810405968, + "learning_rate": 8.270894589810997e-06, + "loss": 0.4554, + "step": 3754 + }, + { + "epoch": 0.29, + "grad_norm": 1.556395210254575, + "learning_rate": 8.26993257336979e-06, + "loss": 0.5011, + "step": 3755 + }, + { + "epoch": 0.29, + "grad_norm": 1.5095410980026644, + "learning_rate": 8.268970345368222e-06, + "loss": 0.5031, + "step": 3756 + }, + { + "epoch": 0.3, + "grad_norm": 1.3991110806372482, + "learning_rate": 8.26800790586855e-06, + "loss": 0.4787, + "step": 3757 + }, + { + "epoch": 0.3, + "grad_norm": 1.4213444400208373, + "learning_rate": 8.267045254933043e-06, + "loss": 0.5181, + "step": 3758 + }, + { + "epoch": 0.3, + "grad_norm": 1.5847043684014035, + "learning_rate": 8.26608239262398e-06, + "loss": 0.5145, + "step": 3759 + }, + { + "epoch": 0.3, + "grad_norm": 0.6460130119621552, + "learning_rate": 8.26511931900366e-06, + "loss": 0.5166, + "step": 3760 + }, + { + "epoch": 0.3, + "grad_norm": 1.9161026640669243, + "learning_rate": 8.26415603413439e-06, + "loss": 0.4704, + "step": 3761 + }, + { + "epoch": 0.3, + "grad_norm": 2.181486940120883, + "learning_rate": 8.263192538078495e-06, + "loss": 0.5099, + "step": 3762 + }, + { + "epoch": 0.3, + "grad_norm": 1.5252421694424716, + "learning_rate": 8.262228830898313e-06, + "loss": 0.4897, + "step": 3763 + }, + { + "epoch": 0.3, + "grad_norm": 1.3924525306911006, + "learning_rate": 8.261264912656191e-06, + "loss": 0.4848, + "step": 3764 + }, + { + "epoch": 0.3, + "grad_norm": 1.3705173756501405, + "learning_rate": 8.260300783414498e-06, + "loss": 0.4635, + "step": 3765 + }, + { + "epoch": 0.3, + "grad_norm": 0.6007149022555054, + "learning_rate": 8.259336443235608e-06, + "loss": 0.5372, + "step": 3766 + }, + { + "epoch": 0.3, + "grad_norm": 1.8816265028123444, + "learning_rate": 8.258371892181912e-06, + "loss": 0.4786, + "step": 3767 + }, + { + "epoch": 0.3, + "grad_norm": 1.4464214161199063, + "learning_rate": 8.25740713031582e-06, + "loss": 0.4723, + "step": 3768 + }, + { + "epoch": 0.3, + "grad_norm": 2.111059252394764, + "learning_rate": 8.256442157699746e-06, + "loss": 0.4803, + "step": 3769 + }, + { + "epoch": 0.3, + "grad_norm": 1.5886211899008102, + "learning_rate": 8.255476974396126e-06, + "loss": 0.5027, + "step": 3770 + }, + { + "epoch": 0.3, + "grad_norm": 1.3881769996000595, + "learning_rate": 8.254511580467403e-06, + "loss": 0.4448, + "step": 3771 + }, + { + "epoch": 0.3, + "grad_norm": 1.9474478219553628, + "learning_rate": 8.253545975976039e-06, + "loss": 0.5684, + "step": 3772 + }, + { + "epoch": 0.3, + "grad_norm": 0.5616487683665813, + "learning_rate": 8.252580160984505e-06, + "loss": 0.5032, + "step": 3773 + }, + { + "epoch": 0.3, + "grad_norm": 1.3924499096867569, + "learning_rate": 8.251614135555291e-06, + "loss": 0.5118, + "step": 3774 + }, + { + "epoch": 0.3, + "grad_norm": 1.5941882152869444, + "learning_rate": 8.250647899750896e-06, + "loss": 0.5288, + "step": 3775 + }, + { + "epoch": 0.3, + "grad_norm": 1.4327270530742207, + "learning_rate": 8.249681453633834e-06, + "loss": 0.4992, + "step": 3776 + }, + { + "epoch": 0.3, + "grad_norm": 1.59535382951488, + "learning_rate": 8.248714797266632e-06, + "loss": 0.4722, + "step": 3777 + }, + { + "epoch": 0.3, + "grad_norm": 1.7006317203430288, + "learning_rate": 8.247747930711834e-06, + "loss": 0.5019, + "step": 3778 + }, + { + "epoch": 0.3, + "grad_norm": 1.320923812529501, + "learning_rate": 8.246780854031993e-06, + "loss": 0.5386, + "step": 3779 + }, + { + "epoch": 0.3, + "grad_norm": 2.708374086807389, + "learning_rate": 8.245813567289678e-06, + "loss": 0.508, + "step": 3780 + }, + { + "epoch": 0.3, + "grad_norm": 1.2748509242495234, + "learning_rate": 8.244846070547472e-06, + "loss": 0.4967, + "step": 3781 + }, + { + "epoch": 0.3, + "grad_norm": 1.4124715004601487, + "learning_rate": 8.24387836386797e-06, + "loss": 0.5045, + "step": 3782 + }, + { + "epoch": 0.3, + "grad_norm": 0.6094962518506664, + "learning_rate": 8.24291044731378e-06, + "loss": 0.5258, + "step": 3783 + }, + { + "epoch": 0.3, + "grad_norm": 1.268824318363105, + "learning_rate": 8.241942320947527e-06, + "loss": 0.5042, + "step": 3784 + }, + { + "epoch": 0.3, + "grad_norm": 1.6998645074533534, + "learning_rate": 8.240973984831847e-06, + "loss": 0.5022, + "step": 3785 + }, + { + "epoch": 0.3, + "grad_norm": 1.3304236613377396, + "learning_rate": 8.24000543902939e-06, + "loss": 0.4572, + "step": 3786 + }, + { + "epoch": 0.3, + "grad_norm": 1.655992244791374, + "learning_rate": 8.23903668360282e-06, + "loss": 0.5231, + "step": 3787 + }, + { + "epoch": 0.3, + "grad_norm": 1.5843155878717685, + "learning_rate": 8.238067718614812e-06, + "loss": 0.4962, + "step": 3788 + }, + { + "epoch": 0.3, + "grad_norm": 1.3791953889625859, + "learning_rate": 8.237098544128059e-06, + "loss": 0.4686, + "step": 3789 + }, + { + "epoch": 0.3, + "grad_norm": 1.4605822000879984, + "learning_rate": 8.236129160205265e-06, + "loss": 0.5382, + "step": 3790 + }, + { + "epoch": 0.3, + "grad_norm": 1.3555206016776558, + "learning_rate": 8.23515956690915e-06, + "loss": 0.5165, + "step": 3791 + }, + { + "epoch": 0.3, + "grad_norm": 1.5246824662747311, + "learning_rate": 8.234189764302441e-06, + "loss": 0.4814, + "step": 3792 + }, + { + "epoch": 0.3, + "grad_norm": 1.5885871312530404, + "learning_rate": 8.233219752447886e-06, + "loss": 0.4791, + "step": 3793 + }, + { + "epoch": 0.3, + "grad_norm": 1.4607450874107129, + "learning_rate": 8.232249531408244e-06, + "loss": 0.4835, + "step": 3794 + }, + { + "epoch": 0.3, + "grad_norm": 1.4320910540166778, + "learning_rate": 8.231279101246283e-06, + "loss": 0.4812, + "step": 3795 + }, + { + "epoch": 0.3, + "grad_norm": 1.3632417252689426, + "learning_rate": 8.230308462024792e-06, + "loss": 0.4878, + "step": 3796 + }, + { + "epoch": 0.3, + "grad_norm": 2.0369757419737966, + "learning_rate": 8.229337613806568e-06, + "loss": 0.5293, + "step": 3797 + }, + { + "epoch": 0.3, + "grad_norm": 1.4924730456657083, + "learning_rate": 8.228366556654426e-06, + "loss": 0.5249, + "step": 3798 + }, + { + "epoch": 0.3, + "grad_norm": 0.672692632030733, + "learning_rate": 8.227395290631192e-06, + "loss": 0.5533, + "step": 3799 + }, + { + "epoch": 0.3, + "grad_norm": 1.526336730335463, + "learning_rate": 8.226423815799704e-06, + "loss": 0.503, + "step": 3800 + }, + { + "epoch": 0.3, + "grad_norm": 9.41381302333368, + "learning_rate": 8.225452132222815e-06, + "loss": 0.4421, + "step": 3801 + }, + { + "epoch": 0.3, + "grad_norm": 1.6025387503640498, + "learning_rate": 8.224480239963393e-06, + "loss": 0.4629, + "step": 3802 + }, + { + "epoch": 0.3, + "grad_norm": 1.509576899710408, + "learning_rate": 8.223508139084318e-06, + "loss": 0.4644, + "step": 3803 + }, + { + "epoch": 0.3, + "grad_norm": 1.4973308307578845, + "learning_rate": 8.222535829648482e-06, + "loss": 0.4966, + "step": 3804 + }, + { + "epoch": 0.3, + "grad_norm": 1.388516501116106, + "learning_rate": 8.221563311718794e-06, + "loss": 0.4445, + "step": 3805 + }, + { + "epoch": 0.3, + "grad_norm": 1.5934194223292506, + "learning_rate": 8.220590585358176e-06, + "loss": 0.4744, + "step": 3806 + }, + { + "epoch": 0.3, + "grad_norm": 0.5622187147702429, + "learning_rate": 8.21961765062956e-06, + "loss": 0.5317, + "step": 3807 + }, + { + "epoch": 0.3, + "grad_norm": 1.51210706115019, + "learning_rate": 8.218644507595891e-06, + "loss": 0.5006, + "step": 3808 + }, + { + "epoch": 0.3, + "grad_norm": 0.6351103976185537, + "learning_rate": 8.217671156320134e-06, + "loss": 0.5509, + "step": 3809 + }, + { + "epoch": 0.3, + "grad_norm": 0.5668474153828773, + "learning_rate": 8.216697596865263e-06, + "loss": 0.5321, + "step": 3810 + }, + { + "epoch": 0.3, + "grad_norm": 1.3913310698362227, + "learning_rate": 8.215723829294264e-06, + "loss": 0.5438, + "step": 3811 + }, + { + "epoch": 0.3, + "grad_norm": 1.3266997389244424, + "learning_rate": 8.214749853670142e-06, + "loss": 0.4606, + "step": 3812 + }, + { + "epoch": 0.3, + "grad_norm": 1.5666684610189263, + "learning_rate": 8.213775670055908e-06, + "loss": 0.5304, + "step": 3813 + }, + { + "epoch": 0.3, + "grad_norm": 1.8080086422815622, + "learning_rate": 8.212801278514596e-06, + "loss": 0.5083, + "step": 3814 + }, + { + "epoch": 0.3, + "grad_norm": 0.6893975463257926, + "learning_rate": 8.21182667910924e-06, + "loss": 0.5402, + "step": 3815 + }, + { + "epoch": 0.3, + "grad_norm": 1.480307303637885, + "learning_rate": 8.2108518719029e-06, + "loss": 0.498, + "step": 3816 + }, + { + "epoch": 0.3, + "grad_norm": 1.6052594748805153, + "learning_rate": 8.209876856958645e-06, + "loss": 0.4719, + "step": 3817 + }, + { + "epoch": 0.3, + "grad_norm": 1.368660774550835, + "learning_rate": 8.208901634339557e-06, + "loss": 0.5061, + "step": 3818 + }, + { + "epoch": 0.3, + "grad_norm": 2.437724570073628, + "learning_rate": 8.207926204108732e-06, + "loss": 0.4285, + "step": 3819 + }, + { + "epoch": 0.3, + "grad_norm": 2.1267028157725814, + "learning_rate": 8.206950566329276e-06, + "loss": 0.4836, + "step": 3820 + }, + { + "epoch": 0.3, + "grad_norm": 1.76688823916607, + "learning_rate": 8.205974721064314e-06, + "loss": 0.4896, + "step": 3821 + }, + { + "epoch": 0.3, + "grad_norm": 0.5985212805077254, + "learning_rate": 8.204998668376983e-06, + "loss": 0.521, + "step": 3822 + }, + { + "epoch": 0.3, + "grad_norm": 1.7416833290392446, + "learning_rate": 8.204022408330428e-06, + "loss": 0.5015, + "step": 3823 + }, + { + "epoch": 0.3, + "grad_norm": 1.6174538522905797, + "learning_rate": 8.203045940987816e-06, + "loss": 0.4931, + "step": 3824 + }, + { + "epoch": 0.3, + "grad_norm": 1.7357455524837244, + "learning_rate": 8.202069266412322e-06, + "loss": 0.4829, + "step": 3825 + }, + { + "epoch": 0.3, + "grad_norm": 1.9217659697859035, + "learning_rate": 8.201092384667135e-06, + "loss": 0.504, + "step": 3826 + }, + { + "epoch": 0.3, + "grad_norm": 1.691841591153293, + "learning_rate": 8.200115295815458e-06, + "loss": 0.525, + "step": 3827 + }, + { + "epoch": 0.3, + "grad_norm": 1.7537367368459895, + "learning_rate": 8.199137999920507e-06, + "loss": 0.4781, + "step": 3828 + }, + { + "epoch": 0.3, + "grad_norm": 1.933735173893854, + "learning_rate": 8.198160497045516e-06, + "loss": 0.4499, + "step": 3829 + }, + { + "epoch": 0.3, + "grad_norm": 1.5030260581820696, + "learning_rate": 8.197182787253721e-06, + "loss": 0.5263, + "step": 3830 + }, + { + "epoch": 0.3, + "grad_norm": 1.4592796583320504, + "learning_rate": 8.196204870608384e-06, + "loss": 0.5226, + "step": 3831 + }, + { + "epoch": 0.3, + "grad_norm": 1.4147993090273425, + "learning_rate": 8.19522674717277e-06, + "loss": 0.509, + "step": 3832 + }, + { + "epoch": 0.3, + "grad_norm": 1.2148906445425809, + "learning_rate": 8.19424841701017e-06, + "loss": 0.5278, + "step": 3833 + }, + { + "epoch": 0.3, + "grad_norm": 1.6730047014014413, + "learning_rate": 8.193269880183872e-06, + "loss": 0.5004, + "step": 3834 + }, + { + "epoch": 0.3, + "grad_norm": 1.4811486396969538, + "learning_rate": 8.192291136757191e-06, + "loss": 0.4622, + "step": 3835 + }, + { + "epoch": 0.3, + "grad_norm": 1.4793967518297202, + "learning_rate": 8.19131218679345e-06, + "loss": 0.5004, + "step": 3836 + }, + { + "epoch": 0.3, + "grad_norm": 1.711183296022157, + "learning_rate": 8.190333030355986e-06, + "loss": 0.45, + "step": 3837 + }, + { + "epoch": 0.3, + "grad_norm": 0.5955900295373451, + "learning_rate": 8.189353667508149e-06, + "loss": 0.5226, + "step": 3838 + }, + { + "epoch": 0.3, + "grad_norm": 1.710295995116325, + "learning_rate": 8.1883740983133e-06, + "loss": 0.4943, + "step": 3839 + }, + { + "epoch": 0.3, + "grad_norm": 1.7315456057696605, + "learning_rate": 8.187394322834818e-06, + "loss": 0.4833, + "step": 3840 + }, + { + "epoch": 0.3, + "grad_norm": 1.8113645514012437, + "learning_rate": 8.186414341136094e-06, + "loss": 0.5675, + "step": 3841 + }, + { + "epoch": 0.3, + "grad_norm": 1.3776792620905256, + "learning_rate": 8.18543415328053e-06, + "loss": 0.4719, + "step": 3842 + }, + { + "epoch": 0.3, + "grad_norm": 1.3928472204088052, + "learning_rate": 8.184453759331543e-06, + "loss": 0.4914, + "step": 3843 + }, + { + "epoch": 0.3, + "grad_norm": 1.3411310025732577, + "learning_rate": 8.183473159352564e-06, + "loss": 0.4991, + "step": 3844 + }, + { + "epoch": 0.3, + "grad_norm": 1.9428183978502724, + "learning_rate": 8.182492353407036e-06, + "loss": 0.4858, + "step": 3845 + }, + { + "epoch": 0.3, + "grad_norm": 1.5338599860376683, + "learning_rate": 8.181511341558414e-06, + "loss": 0.5038, + "step": 3846 + }, + { + "epoch": 0.3, + "grad_norm": 1.4223100571598641, + "learning_rate": 8.180530123870174e-06, + "loss": 0.4891, + "step": 3847 + }, + { + "epoch": 0.3, + "grad_norm": 1.4456168866449077, + "learning_rate": 8.179548700405793e-06, + "loss": 0.4408, + "step": 3848 + }, + { + "epoch": 0.3, + "grad_norm": 2.2636840006506356, + "learning_rate": 8.17856707122877e-06, + "loss": 0.4738, + "step": 3849 + }, + { + "epoch": 0.3, + "grad_norm": 1.436312251953886, + "learning_rate": 8.177585236402618e-06, + "loss": 0.4989, + "step": 3850 + }, + { + "epoch": 0.3, + "grad_norm": 1.618164094702493, + "learning_rate": 8.176603195990856e-06, + "loss": 0.4904, + "step": 3851 + }, + { + "epoch": 0.3, + "grad_norm": 1.4549503582199514, + "learning_rate": 8.175620950057024e-06, + "loss": 0.4618, + "step": 3852 + }, + { + "epoch": 0.3, + "grad_norm": 2.8421113703807848, + "learning_rate": 8.17463849866467e-06, + "loss": 0.4987, + "step": 3853 + }, + { + "epoch": 0.3, + "grad_norm": 1.349902418450739, + "learning_rate": 8.173655841877359e-06, + "loss": 0.4445, + "step": 3854 + }, + { + "epoch": 0.3, + "grad_norm": 2.103151088494613, + "learning_rate": 8.172672979758665e-06, + "loss": 0.4626, + "step": 3855 + }, + { + "epoch": 0.3, + "grad_norm": 6.3404286271211525, + "learning_rate": 8.171689912372181e-06, + "loss": 0.4905, + "step": 3856 + }, + { + "epoch": 0.3, + "grad_norm": 1.4263196803529008, + "learning_rate": 8.170706639781508e-06, + "loss": 0.4325, + "step": 3857 + }, + { + "epoch": 0.3, + "grad_norm": 1.6711489240848947, + "learning_rate": 8.169723162050263e-06, + "loss": 0.5194, + "step": 3858 + }, + { + "epoch": 0.3, + "grad_norm": 1.7975298763337655, + "learning_rate": 8.168739479242075e-06, + "loss": 0.5462, + "step": 3859 + }, + { + "epoch": 0.3, + "grad_norm": 0.6694766027780488, + "learning_rate": 8.167755591420591e-06, + "loss": 0.5319, + "step": 3860 + }, + { + "epoch": 0.3, + "grad_norm": 2.2017746683977855, + "learning_rate": 8.166771498649463e-06, + "loss": 0.5188, + "step": 3861 + }, + { + "epoch": 0.3, + "grad_norm": 1.5676744889538627, + "learning_rate": 8.165787200992362e-06, + "loss": 0.4923, + "step": 3862 + }, + { + "epoch": 0.3, + "grad_norm": 0.5626233866235272, + "learning_rate": 8.164802698512968e-06, + "loss": 0.527, + "step": 3863 + }, + { + "epoch": 0.3, + "grad_norm": 1.5711819628586476, + "learning_rate": 8.163817991274982e-06, + "loss": 0.5375, + "step": 3864 + }, + { + "epoch": 0.3, + "grad_norm": 2.2545146168243377, + "learning_rate": 8.16283307934211e-06, + "loss": 0.5475, + "step": 3865 + }, + { + "epoch": 0.3, + "grad_norm": 1.5406114236179673, + "learning_rate": 8.161847962778076e-06, + "loss": 0.4542, + "step": 3866 + }, + { + "epoch": 0.3, + "grad_norm": 0.5718046365712872, + "learning_rate": 8.160862641646613e-06, + "loss": 0.5388, + "step": 3867 + }, + { + "epoch": 0.3, + "grad_norm": 2.2814309788380807, + "learning_rate": 8.159877116011474e-06, + "loss": 0.4825, + "step": 3868 + }, + { + "epoch": 0.3, + "grad_norm": 0.5820226386026355, + "learning_rate": 8.158891385936418e-06, + "loss": 0.5391, + "step": 3869 + }, + { + "epoch": 0.3, + "grad_norm": 0.5797904228126589, + "learning_rate": 8.157905451485223e-06, + "loss": 0.5475, + "step": 3870 + }, + { + "epoch": 0.3, + "grad_norm": 1.6107546435355637, + "learning_rate": 8.156919312721676e-06, + "loss": 0.4342, + "step": 3871 + }, + { + "epoch": 0.3, + "grad_norm": 1.4299452585662809, + "learning_rate": 8.155932969709578e-06, + "loss": 0.4725, + "step": 3872 + }, + { + "epoch": 0.3, + "grad_norm": 1.343049223885057, + "learning_rate": 8.154946422512746e-06, + "loss": 0.4741, + "step": 3873 + }, + { + "epoch": 0.3, + "grad_norm": 0.5863282563517187, + "learning_rate": 8.153959671195007e-06, + "loss": 0.5352, + "step": 3874 + }, + { + "epoch": 0.3, + "grad_norm": 1.4295445943371536, + "learning_rate": 8.152972715820205e-06, + "loss": 0.455, + "step": 3875 + }, + { + "epoch": 0.3, + "grad_norm": 1.548616259170812, + "learning_rate": 8.15198555645219e-06, + "loss": 0.5071, + "step": 3876 + }, + { + "epoch": 0.3, + "grad_norm": 1.9959273704980987, + "learning_rate": 8.150998193154837e-06, + "loss": 0.4812, + "step": 3877 + }, + { + "epoch": 0.3, + "grad_norm": 1.5623507836521062, + "learning_rate": 8.15001062599202e-06, + "loss": 0.5118, + "step": 3878 + }, + { + "epoch": 0.3, + "grad_norm": 1.3910780237392357, + "learning_rate": 8.149022855027637e-06, + "loss": 0.4703, + "step": 3879 + }, + { + "epoch": 0.3, + "grad_norm": 2.0730751751257364, + "learning_rate": 8.148034880325594e-06, + "loss": 0.522, + "step": 3880 + }, + { + "epoch": 0.3, + "grad_norm": 1.4561573403086012, + "learning_rate": 8.147046701949813e-06, + "loss": 0.4857, + "step": 3881 + }, + { + "epoch": 0.3, + "grad_norm": 1.519844974674417, + "learning_rate": 8.146058319964227e-06, + "loss": 0.5171, + "step": 3882 + }, + { + "epoch": 0.3, + "grad_norm": 1.804916963985851, + "learning_rate": 8.145069734432783e-06, + "loss": 0.5151, + "step": 3883 + }, + { + "epoch": 0.31, + "grad_norm": 1.251861913686813, + "learning_rate": 8.144080945419442e-06, + "loss": 0.4981, + "step": 3884 + }, + { + "epoch": 0.31, + "grad_norm": 1.4806879075437955, + "learning_rate": 8.143091952988177e-06, + "loss": 0.4983, + "step": 3885 + }, + { + "epoch": 0.31, + "grad_norm": 1.5013385453517676, + "learning_rate": 8.142102757202974e-06, + "loss": 0.5358, + "step": 3886 + }, + { + "epoch": 0.31, + "grad_norm": 1.8227615191915136, + "learning_rate": 8.141113358127832e-06, + "loss": 0.466, + "step": 3887 + }, + { + "epoch": 0.31, + "grad_norm": 1.3493649961755132, + "learning_rate": 8.140123755826767e-06, + "loss": 0.5247, + "step": 3888 + }, + { + "epoch": 0.31, + "grad_norm": 1.554141936756321, + "learning_rate": 8.139133950363801e-06, + "loss": 0.4931, + "step": 3889 + }, + { + "epoch": 0.31, + "grad_norm": 1.4506192565023577, + "learning_rate": 8.138143941802976e-06, + "loss": 0.4022, + "step": 3890 + }, + { + "epoch": 0.31, + "grad_norm": 2.1867546974283334, + "learning_rate": 8.137153730208342e-06, + "loss": 0.4707, + "step": 3891 + }, + { + "epoch": 0.31, + "grad_norm": 0.6824446867247185, + "learning_rate": 8.136163315643967e-06, + "loss": 0.552, + "step": 3892 + }, + { + "epoch": 0.31, + "grad_norm": 1.3110787561600197, + "learning_rate": 8.135172698173927e-06, + "loss": 0.4846, + "step": 3893 + }, + { + "epoch": 0.31, + "grad_norm": 1.2247973196013973, + "learning_rate": 8.134181877862314e-06, + "loss": 0.4551, + "step": 3894 + }, + { + "epoch": 0.31, + "grad_norm": 2.303755765503516, + "learning_rate": 8.133190854773236e-06, + "loss": 0.5073, + "step": 3895 + }, + { + "epoch": 0.31, + "grad_norm": 1.8036928722802792, + "learning_rate": 8.132199628970807e-06, + "loss": 0.503, + "step": 3896 + }, + { + "epoch": 0.31, + "grad_norm": 1.3264475732552545, + "learning_rate": 8.13120820051916e-06, + "loss": 0.4831, + "step": 3897 + }, + { + "epoch": 0.31, + "grad_norm": 0.5463707597181916, + "learning_rate": 8.130216569482437e-06, + "loss": 0.5166, + "step": 3898 + }, + { + "epoch": 0.31, + "grad_norm": 1.893728645837672, + "learning_rate": 8.129224735924799e-06, + "loss": 0.451, + "step": 3899 + }, + { + "epoch": 0.31, + "grad_norm": 0.5684010132426125, + "learning_rate": 8.128232699910413e-06, + "loss": 0.5292, + "step": 3900 + }, + { + "epoch": 0.31, + "grad_norm": 1.778562287876687, + "learning_rate": 8.127240461503462e-06, + "loss": 0.4617, + "step": 3901 + }, + { + "epoch": 0.31, + "grad_norm": 1.639141589732679, + "learning_rate": 8.126248020768147e-06, + "loss": 0.4938, + "step": 3902 + }, + { + "epoch": 0.31, + "grad_norm": 1.7457781284647416, + "learning_rate": 8.125255377768673e-06, + "loss": 0.5333, + "step": 3903 + }, + { + "epoch": 0.31, + "grad_norm": 1.3348458187743864, + "learning_rate": 8.124262532569264e-06, + "loss": 0.5035, + "step": 3904 + }, + { + "epoch": 0.31, + "grad_norm": 1.408955329674299, + "learning_rate": 8.123269485234158e-06, + "loss": 0.4953, + "step": 3905 + }, + { + "epoch": 0.31, + "grad_norm": 1.711485312422972, + "learning_rate": 8.122276235827599e-06, + "loss": 0.4783, + "step": 3906 + }, + { + "epoch": 0.31, + "grad_norm": 1.4131855719125865, + "learning_rate": 8.121282784413855e-06, + "loss": 0.4645, + "step": 3907 + }, + { + "epoch": 0.31, + "grad_norm": 1.7676569957637467, + "learning_rate": 8.120289131057197e-06, + "loss": 0.5185, + "step": 3908 + }, + { + "epoch": 0.31, + "grad_norm": 1.769750240344266, + "learning_rate": 8.119295275821915e-06, + "loss": 0.4797, + "step": 3909 + }, + { + "epoch": 0.31, + "grad_norm": 1.48676382256224, + "learning_rate": 8.118301218772308e-06, + "loss": 0.5312, + "step": 3910 + }, + { + "epoch": 0.31, + "grad_norm": 1.3260857886124575, + "learning_rate": 8.117306959972693e-06, + "loss": 0.4967, + "step": 3911 + }, + { + "epoch": 0.31, + "grad_norm": 1.4472440142556946, + "learning_rate": 8.116312499487394e-06, + "loss": 0.506, + "step": 3912 + }, + { + "epoch": 0.31, + "grad_norm": 1.6406276101954045, + "learning_rate": 8.115317837380753e-06, + "loss": 0.5193, + "step": 3913 + }, + { + "epoch": 0.31, + "grad_norm": 0.6378677841241669, + "learning_rate": 8.11432297371712e-06, + "loss": 0.542, + "step": 3914 + }, + { + "epoch": 0.31, + "grad_norm": 1.4963393824989704, + "learning_rate": 8.113327908560871e-06, + "loss": 0.5089, + "step": 3915 + }, + { + "epoch": 0.31, + "grad_norm": 2.3355187570639697, + "learning_rate": 8.112332641976375e-06, + "loss": 0.4929, + "step": 3916 + }, + { + "epoch": 0.31, + "grad_norm": 2.595812373858805, + "learning_rate": 8.11133717402803e-06, + "loss": 0.4852, + "step": 3917 + }, + { + "epoch": 0.31, + "grad_norm": 1.4333971838555157, + "learning_rate": 8.110341504780238e-06, + "loss": 0.5082, + "step": 3918 + }, + { + "epoch": 0.31, + "grad_norm": 0.5660988748728288, + "learning_rate": 8.109345634297423e-06, + "loss": 0.5408, + "step": 3919 + }, + { + "epoch": 0.31, + "grad_norm": 2.051466671740846, + "learning_rate": 8.108349562644011e-06, + "loss": 0.4764, + "step": 3920 + }, + { + "epoch": 0.31, + "grad_norm": 1.419459554688049, + "learning_rate": 8.10735328988445e-06, + "loss": 0.517, + "step": 3921 + }, + { + "epoch": 0.31, + "grad_norm": 1.456020850581799, + "learning_rate": 8.106356816083194e-06, + "loss": 0.4489, + "step": 3922 + }, + { + "epoch": 0.31, + "grad_norm": 1.8954103689059125, + "learning_rate": 8.105360141304717e-06, + "loss": 0.5308, + "step": 3923 + }, + { + "epoch": 0.31, + "grad_norm": 0.5403968314874155, + "learning_rate": 8.1043632656135e-06, + "loss": 0.5357, + "step": 3924 + }, + { + "epoch": 0.31, + "grad_norm": 1.5275513451836822, + "learning_rate": 8.10336618907404e-06, + "loss": 0.4441, + "step": 3925 + }, + { + "epoch": 0.31, + "grad_norm": 3.340830977691936, + "learning_rate": 8.102368911750848e-06, + "loss": 0.478, + "step": 3926 + }, + { + "epoch": 0.31, + "grad_norm": 2.818413522030533, + "learning_rate": 8.101371433708447e-06, + "loss": 0.5276, + "step": 3927 + }, + { + "epoch": 0.31, + "grad_norm": 3.063325256157453, + "learning_rate": 8.10037375501137e-06, + "loss": 0.4958, + "step": 3928 + }, + { + "epoch": 0.31, + "grad_norm": 1.461162916087229, + "learning_rate": 8.09937587572417e-06, + "loss": 0.5142, + "step": 3929 + }, + { + "epoch": 0.31, + "grad_norm": 1.6089368528920278, + "learning_rate": 8.098377795911403e-06, + "loss": 0.4468, + "step": 3930 + }, + { + "epoch": 0.31, + "grad_norm": 1.851637291639948, + "learning_rate": 8.097379515637645e-06, + "loss": 0.4737, + "step": 3931 + }, + { + "epoch": 0.31, + "grad_norm": 1.660925011614903, + "learning_rate": 8.096381034967484e-06, + "loss": 0.4808, + "step": 3932 + }, + { + "epoch": 0.31, + "grad_norm": 2.137395886194378, + "learning_rate": 8.095382353965522e-06, + "loss": 0.4199, + "step": 3933 + }, + { + "epoch": 0.31, + "grad_norm": 1.656377532746799, + "learning_rate": 8.09438347269637e-06, + "loss": 0.4435, + "step": 3934 + }, + { + "epoch": 0.31, + "grad_norm": 1.6545612999235908, + "learning_rate": 8.093384391224656e-06, + "loss": 0.4926, + "step": 3935 + }, + { + "epoch": 0.31, + "grad_norm": 1.8533518144415526, + "learning_rate": 8.092385109615018e-06, + "loss": 0.4658, + "step": 3936 + }, + { + "epoch": 0.31, + "grad_norm": 1.533442340124059, + "learning_rate": 8.09138562793211e-06, + "loss": 0.4684, + "step": 3937 + }, + { + "epoch": 0.31, + "grad_norm": 1.5272413559902536, + "learning_rate": 8.090385946240596e-06, + "loss": 0.5061, + "step": 3938 + }, + { + "epoch": 0.31, + "grad_norm": 1.3728065026439058, + "learning_rate": 8.089386064605152e-06, + "loss": 0.4373, + "step": 3939 + }, + { + "epoch": 0.31, + "grad_norm": 1.5121683472675844, + "learning_rate": 8.088385983090472e-06, + "loss": 0.5122, + "step": 3940 + }, + { + "epoch": 0.31, + "grad_norm": 1.823147283797327, + "learning_rate": 8.08738570176126e-06, + "loss": 0.4732, + "step": 3941 + }, + { + "epoch": 0.31, + "grad_norm": 1.5196865070597456, + "learning_rate": 8.08638522068223e-06, + "loss": 0.5343, + "step": 3942 + }, + { + "epoch": 0.31, + "grad_norm": 2.515294698986381, + "learning_rate": 8.085384539918115e-06, + "loss": 0.487, + "step": 3943 + }, + { + "epoch": 0.31, + "grad_norm": 0.5931969491253701, + "learning_rate": 8.084383659533656e-06, + "loss": 0.5443, + "step": 3944 + }, + { + "epoch": 0.31, + "grad_norm": 1.489658439824186, + "learning_rate": 8.083382579593609e-06, + "loss": 0.4562, + "step": 3945 + }, + { + "epoch": 0.31, + "grad_norm": 0.5641128499602256, + "learning_rate": 8.082381300162742e-06, + "loss": 0.515, + "step": 3946 + }, + { + "epoch": 0.31, + "grad_norm": 1.3511330034842506, + "learning_rate": 8.081379821305839e-06, + "loss": 0.4535, + "step": 3947 + }, + { + "epoch": 0.31, + "grad_norm": 1.4195059085133939, + "learning_rate": 8.080378143087691e-06, + "loss": 0.4205, + "step": 3948 + }, + { + "epoch": 0.31, + "grad_norm": 1.4411741243397362, + "learning_rate": 8.079376265573108e-06, + "loss": 0.5073, + "step": 3949 + }, + { + "epoch": 0.31, + "grad_norm": 1.771786722237853, + "learning_rate": 8.078374188826908e-06, + "loss": 0.5236, + "step": 3950 + }, + { + "epoch": 0.31, + "grad_norm": 1.4100060925431146, + "learning_rate": 8.077371912913925e-06, + "loss": 0.4942, + "step": 3951 + }, + { + "epoch": 0.31, + "grad_norm": 6.005872019808927, + "learning_rate": 8.076369437899005e-06, + "loss": 0.5223, + "step": 3952 + }, + { + "epoch": 0.31, + "grad_norm": 0.5934552333323317, + "learning_rate": 8.075366763847008e-06, + "loss": 0.5223, + "step": 3953 + }, + { + "epoch": 0.31, + "grad_norm": 0.5727084035372247, + "learning_rate": 8.074363890822805e-06, + "loss": 0.5332, + "step": 3954 + }, + { + "epoch": 0.31, + "grad_norm": 1.4440783099165317, + "learning_rate": 8.073360818891277e-06, + "loss": 0.4684, + "step": 3955 + }, + { + "epoch": 0.31, + "grad_norm": 3.617780973445932, + "learning_rate": 8.072357548117327e-06, + "loss": 0.5132, + "step": 3956 + }, + { + "epoch": 0.31, + "grad_norm": 1.778971819131723, + "learning_rate": 8.071354078565861e-06, + "loss": 0.4564, + "step": 3957 + }, + { + "epoch": 0.31, + "grad_norm": 2.2262988968925628, + "learning_rate": 8.070350410301806e-06, + "loss": 0.4675, + "step": 3958 + }, + { + "epoch": 0.31, + "grad_norm": 1.336479118949998, + "learning_rate": 8.069346543390092e-06, + "loss": 0.4853, + "step": 3959 + }, + { + "epoch": 0.31, + "grad_norm": 1.4500891391267445, + "learning_rate": 8.068342477895676e-06, + "loss": 0.575, + "step": 3960 + }, + { + "epoch": 0.31, + "grad_norm": 1.4272318250308782, + "learning_rate": 8.067338213883514e-06, + "loss": 0.4591, + "step": 3961 + }, + { + "epoch": 0.31, + "grad_norm": 1.4704389775380629, + "learning_rate": 8.066333751418582e-06, + "loss": 0.4315, + "step": 3962 + }, + { + "epoch": 0.31, + "grad_norm": 1.563831687537641, + "learning_rate": 8.065329090565867e-06, + "loss": 0.4702, + "step": 3963 + }, + { + "epoch": 0.31, + "grad_norm": 1.446301365462807, + "learning_rate": 8.06432423139037e-06, + "loss": 0.506, + "step": 3964 + }, + { + "epoch": 0.31, + "grad_norm": 1.9195032468143451, + "learning_rate": 8.063319173957106e-06, + "loss": 0.5254, + "step": 3965 + }, + { + "epoch": 0.31, + "grad_norm": 1.2617077807602677, + "learning_rate": 8.062313918331096e-06, + "loss": 0.4814, + "step": 3966 + }, + { + "epoch": 0.31, + "grad_norm": 1.752359238963064, + "learning_rate": 8.061308464577384e-06, + "loss": 0.5358, + "step": 3967 + }, + { + "epoch": 0.31, + "grad_norm": 1.3556315228546831, + "learning_rate": 8.060302812761019e-06, + "loss": 0.4699, + "step": 3968 + }, + { + "epoch": 0.31, + "grad_norm": 1.4702491916225844, + "learning_rate": 8.059296962947063e-06, + "loss": 0.5372, + "step": 3969 + }, + { + "epoch": 0.31, + "grad_norm": 1.4197051836723409, + "learning_rate": 8.058290915200597e-06, + "loss": 0.4997, + "step": 3970 + }, + { + "epoch": 0.31, + "grad_norm": 1.5095470020502209, + "learning_rate": 8.057284669586708e-06, + "loss": 0.5256, + "step": 3971 + }, + { + "epoch": 0.31, + "grad_norm": 1.346212819489792, + "learning_rate": 8.056278226170502e-06, + "loss": 0.4624, + "step": 3972 + }, + { + "epoch": 0.31, + "grad_norm": 1.7351039199927025, + "learning_rate": 8.055271585017093e-06, + "loss": 0.4542, + "step": 3973 + }, + { + "epoch": 0.31, + "grad_norm": 4.4795864308101665, + "learning_rate": 8.05426474619161e-06, + "loss": 0.4662, + "step": 3974 + }, + { + "epoch": 0.31, + "grad_norm": 1.415041978384465, + "learning_rate": 8.053257709759192e-06, + "loss": 0.4914, + "step": 3975 + }, + { + "epoch": 0.31, + "grad_norm": 1.5959136386357695, + "learning_rate": 8.052250475784994e-06, + "loss": 0.4959, + "step": 3976 + }, + { + "epoch": 0.31, + "grad_norm": 5.830099206214093, + "learning_rate": 8.051243044334183e-06, + "loss": 0.5271, + "step": 3977 + }, + { + "epoch": 0.31, + "grad_norm": 1.3714002596255979, + "learning_rate": 8.05023541547194e-06, + "loss": 0.4819, + "step": 3978 + }, + { + "epoch": 0.31, + "grad_norm": 1.9527696763000346, + "learning_rate": 8.049227589263455e-06, + "loss": 0.4953, + "step": 3979 + }, + { + "epoch": 0.31, + "grad_norm": 2.2653000413254794, + "learning_rate": 8.048219565773933e-06, + "loss": 0.4589, + "step": 3980 + }, + { + "epoch": 0.31, + "grad_norm": 1.445510884249527, + "learning_rate": 8.047211345068593e-06, + "loss": 0.4567, + "step": 3981 + }, + { + "epoch": 0.31, + "grad_norm": 1.6162513895404875, + "learning_rate": 8.046202927212666e-06, + "loss": 0.4997, + "step": 3982 + }, + { + "epoch": 0.31, + "grad_norm": 1.924063816676241, + "learning_rate": 8.045194312271394e-06, + "loss": 0.4363, + "step": 3983 + }, + { + "epoch": 0.31, + "grad_norm": 2.5403296061144753, + "learning_rate": 8.044185500310035e-06, + "loss": 0.4993, + "step": 3984 + }, + { + "epoch": 0.31, + "grad_norm": 1.3187906774747356, + "learning_rate": 8.043176491393854e-06, + "loss": 0.4782, + "step": 3985 + }, + { + "epoch": 0.31, + "grad_norm": 1.6352581509118242, + "learning_rate": 8.042167285588138e-06, + "loss": 0.4893, + "step": 3986 + }, + { + "epoch": 0.31, + "grad_norm": 1.3762239664975533, + "learning_rate": 8.041157882958175e-06, + "loss": 0.4749, + "step": 3987 + }, + { + "epoch": 0.31, + "grad_norm": 0.7913327897967334, + "learning_rate": 8.040148283569278e-06, + "loss": 0.4996, + "step": 3988 + }, + { + "epoch": 0.31, + "grad_norm": 1.4140470054494674, + "learning_rate": 8.039138487486763e-06, + "loss": 0.5091, + "step": 3989 + }, + { + "epoch": 0.31, + "grad_norm": 0.6089841660217014, + "learning_rate": 8.038128494775963e-06, + "loss": 0.5393, + "step": 3990 + }, + { + "epoch": 0.31, + "grad_norm": 1.4196991359435396, + "learning_rate": 8.037118305502225e-06, + "loss": 0.4825, + "step": 3991 + }, + { + "epoch": 0.31, + "grad_norm": 2.3621940982344363, + "learning_rate": 8.036107919730905e-06, + "loss": 0.5339, + "step": 3992 + }, + { + "epoch": 0.31, + "grad_norm": 1.6826065198307762, + "learning_rate": 8.035097337527373e-06, + "loss": 0.4805, + "step": 3993 + }, + { + "epoch": 0.31, + "grad_norm": 1.7624362305606989, + "learning_rate": 8.034086558957015e-06, + "loss": 0.4695, + "step": 3994 + }, + { + "epoch": 0.31, + "grad_norm": 1.4831963561039059, + "learning_rate": 8.033075584085226e-06, + "loss": 0.4495, + "step": 3995 + }, + { + "epoch": 0.31, + "grad_norm": 1.1797254923129734, + "learning_rate": 8.032064412977414e-06, + "loss": 0.4771, + "step": 3996 + }, + { + "epoch": 0.31, + "grad_norm": 1.4435024241433796, + "learning_rate": 8.031053045699001e-06, + "loss": 0.4446, + "step": 3997 + }, + { + "epoch": 0.31, + "grad_norm": 0.8575762281620587, + "learning_rate": 8.03004148231542e-06, + "loss": 0.5181, + "step": 3998 + }, + { + "epoch": 0.31, + "grad_norm": 1.5553558599418429, + "learning_rate": 8.02902972289212e-06, + "loss": 0.4651, + "step": 3999 + }, + { + "epoch": 0.31, + "grad_norm": 1.7654567060684982, + "learning_rate": 8.02801776749456e-06, + "loss": 0.5432, + "step": 4000 + }, + { + "epoch": 0.31, + "grad_norm": 1.8135324410278741, + "learning_rate": 8.02700561618821e-06, + "loss": 0.438, + "step": 4001 + }, + { + "epoch": 0.31, + "grad_norm": 1.4933977045571443, + "learning_rate": 8.025993269038559e-06, + "loss": 0.4316, + "step": 4002 + }, + { + "epoch": 0.31, + "grad_norm": 0.5556543705160437, + "learning_rate": 8.0249807261111e-06, + "loss": 0.5079, + "step": 4003 + }, + { + "epoch": 0.31, + "grad_norm": 1.3212649926277829, + "learning_rate": 8.023967987471345e-06, + "loss": 0.4764, + "step": 4004 + }, + { + "epoch": 0.31, + "grad_norm": 0.6185360599909464, + "learning_rate": 8.022955053184817e-06, + "loss": 0.5178, + "step": 4005 + }, + { + "epoch": 0.31, + "grad_norm": 1.2874683793707153, + "learning_rate": 8.021941923317052e-06, + "loss": 0.4329, + "step": 4006 + }, + { + "epoch": 0.31, + "grad_norm": 0.5825161187806117, + "learning_rate": 8.0209285979336e-06, + "loss": 0.5173, + "step": 4007 + }, + { + "epoch": 0.31, + "grad_norm": 2.2240869329810695, + "learning_rate": 8.019915077100017e-06, + "loss": 0.5074, + "step": 4008 + }, + { + "epoch": 0.31, + "grad_norm": 1.3557927568261527, + "learning_rate": 8.018901360881878e-06, + "loss": 0.5423, + "step": 4009 + }, + { + "epoch": 0.31, + "grad_norm": 0.5797679297783245, + "learning_rate": 8.017887449344773e-06, + "loss": 0.5255, + "step": 4010 + }, + { + "epoch": 0.32, + "grad_norm": 1.4341721140664487, + "learning_rate": 8.016873342554297e-06, + "loss": 0.5024, + "step": 4011 + }, + { + "epoch": 0.32, + "grad_norm": 1.3476614938032836, + "learning_rate": 8.015859040576061e-06, + "loss": 0.4787, + "step": 4012 + }, + { + "epoch": 0.32, + "grad_norm": 1.6856451763070877, + "learning_rate": 8.014844543475692e-06, + "loss": 0.4893, + "step": 4013 + }, + { + "epoch": 0.32, + "grad_norm": 1.4816445109298362, + "learning_rate": 8.013829851318824e-06, + "loss": 0.4493, + "step": 4014 + }, + { + "epoch": 0.32, + "grad_norm": 1.4645156622712026, + "learning_rate": 8.012814964171108e-06, + "loss": 0.543, + "step": 4015 + }, + { + "epoch": 0.32, + "grad_norm": 1.3968633448675236, + "learning_rate": 8.011799882098203e-06, + "loss": 0.4882, + "step": 4016 + }, + { + "epoch": 0.32, + "grad_norm": 1.4496473218442163, + "learning_rate": 8.010784605165788e-06, + "loss": 0.4956, + "step": 4017 + }, + { + "epoch": 0.32, + "grad_norm": 1.677937874553818, + "learning_rate": 8.009769133439547e-06, + "loss": 0.4907, + "step": 4018 + }, + { + "epoch": 0.32, + "grad_norm": 1.6108335982554403, + "learning_rate": 8.00875346698518e-06, + "loss": 0.5453, + "step": 4019 + }, + { + "epoch": 0.32, + "grad_norm": 2.744973975194409, + "learning_rate": 8.0077376058684e-06, + "loss": 0.4503, + "step": 4020 + }, + { + "epoch": 0.32, + "grad_norm": 1.3769044324550819, + "learning_rate": 8.006721550154933e-06, + "loss": 0.435, + "step": 4021 + }, + { + "epoch": 0.32, + "grad_norm": 1.4299208038463596, + "learning_rate": 8.005705299910511e-06, + "loss": 0.4808, + "step": 4022 + }, + { + "epoch": 0.32, + "grad_norm": 0.6779006591836431, + "learning_rate": 8.004688855200891e-06, + "loss": 0.5359, + "step": 4023 + }, + { + "epoch": 0.32, + "grad_norm": 2.298146039148745, + "learning_rate": 8.003672216091833e-06, + "loss": 0.4982, + "step": 4024 + }, + { + "epoch": 0.32, + "grad_norm": 1.3973684522359757, + "learning_rate": 8.00265538264911e-06, + "loss": 0.4803, + "step": 4025 + }, + { + "epoch": 0.32, + "grad_norm": 1.3337699701269947, + "learning_rate": 8.001638354938513e-06, + "loss": 0.4555, + "step": 4026 + }, + { + "epoch": 0.32, + "grad_norm": 1.4889425417126576, + "learning_rate": 8.00062113302584e-06, + "loss": 0.5169, + "step": 4027 + }, + { + "epoch": 0.32, + "grad_norm": 1.3237428997160223, + "learning_rate": 7.999603716976905e-06, + "loss": 0.4153, + "step": 4028 + }, + { + "epoch": 0.32, + "grad_norm": 1.4300675307179977, + "learning_rate": 7.998586106857535e-06, + "loss": 0.4315, + "step": 4029 + }, + { + "epoch": 0.32, + "grad_norm": 0.5903321934068957, + "learning_rate": 7.997568302733565e-06, + "loss": 0.5379, + "step": 4030 + }, + { + "epoch": 0.32, + "grad_norm": 1.1823573858325938, + "learning_rate": 7.99655030467085e-06, + "loss": 0.4919, + "step": 4031 + }, + { + "epoch": 0.32, + "grad_norm": 1.545849113162527, + "learning_rate": 7.995532112735246e-06, + "loss": 0.4596, + "step": 4032 + }, + { + "epoch": 0.32, + "grad_norm": 0.5379241688724287, + "learning_rate": 7.994513726992636e-06, + "loss": 0.5337, + "step": 4033 + }, + { + "epoch": 0.32, + "grad_norm": 0.5399306736940116, + "learning_rate": 7.993495147508903e-06, + "loss": 0.5105, + "step": 4034 + }, + { + "epoch": 0.32, + "grad_norm": 1.6066289331093364, + "learning_rate": 7.992476374349951e-06, + "loss": 0.4616, + "step": 4035 + }, + { + "epoch": 0.32, + "grad_norm": 1.4813171993532492, + "learning_rate": 7.991457407581694e-06, + "loss": 0.4653, + "step": 4036 + }, + { + "epoch": 0.32, + "grad_norm": 2.0194807013814557, + "learning_rate": 7.990438247270054e-06, + "loss": 0.517, + "step": 4037 + }, + { + "epoch": 0.32, + "grad_norm": 1.787015014738175, + "learning_rate": 7.98941889348097e-06, + "loss": 0.4636, + "step": 4038 + }, + { + "epoch": 0.32, + "grad_norm": 1.6758355803821703, + "learning_rate": 7.988399346280398e-06, + "loss": 0.5247, + "step": 4039 + }, + { + "epoch": 0.32, + "grad_norm": 1.4218295312392741, + "learning_rate": 7.987379605734296e-06, + "loss": 0.5055, + "step": 4040 + }, + { + "epoch": 0.32, + "grad_norm": 1.5629268249757284, + "learning_rate": 7.98635967190864e-06, + "loss": 0.4817, + "step": 4041 + }, + { + "epoch": 0.32, + "grad_norm": 0.5844282940754906, + "learning_rate": 7.985339544869422e-06, + "loss": 0.5095, + "step": 4042 + }, + { + "epoch": 0.32, + "grad_norm": 1.6160681149547396, + "learning_rate": 7.98431922468264e-06, + "loss": 0.5357, + "step": 4043 + }, + { + "epoch": 0.32, + "grad_norm": 1.2086289661480742, + "learning_rate": 7.983298711414307e-06, + "loss": 0.5146, + "step": 4044 + }, + { + "epoch": 0.32, + "grad_norm": 1.7007609338670977, + "learning_rate": 7.982278005130451e-06, + "loss": 0.4541, + "step": 4045 + }, + { + "epoch": 0.32, + "grad_norm": 1.5204710622253985, + "learning_rate": 7.98125710589711e-06, + "loss": 0.4865, + "step": 4046 + }, + { + "epoch": 0.32, + "grad_norm": 1.6962565038551431, + "learning_rate": 7.980236013780334e-06, + "loss": 0.5283, + "step": 4047 + }, + { + "epoch": 0.32, + "grad_norm": 0.5729092323143142, + "learning_rate": 7.979214728846186e-06, + "loss": 0.5215, + "step": 4048 + }, + { + "epoch": 0.32, + "grad_norm": 1.4092739003400256, + "learning_rate": 7.978193251160743e-06, + "loss": 0.4749, + "step": 4049 + }, + { + "epoch": 0.32, + "grad_norm": 3.135822221198174, + "learning_rate": 7.977171580790091e-06, + "loss": 0.5167, + "step": 4050 + }, + { + "epoch": 0.32, + "grad_norm": 2.102199759484227, + "learning_rate": 7.976149717800331e-06, + "loss": 0.4938, + "step": 4051 + }, + { + "epoch": 0.32, + "grad_norm": 1.543188176397457, + "learning_rate": 7.975127662257582e-06, + "loss": 0.5097, + "step": 4052 + }, + { + "epoch": 0.32, + "grad_norm": 1.3828036901278216, + "learning_rate": 7.974105414227964e-06, + "loss": 0.4502, + "step": 4053 + }, + { + "epoch": 0.32, + "grad_norm": 1.6809879685523046, + "learning_rate": 7.973082973777615e-06, + "loss": 0.4593, + "step": 4054 + }, + { + "epoch": 0.32, + "grad_norm": 1.3343258451484006, + "learning_rate": 7.972060340972688e-06, + "loss": 0.4858, + "step": 4055 + }, + { + "epoch": 0.32, + "grad_norm": 0.5497732681412701, + "learning_rate": 7.971037515879343e-06, + "loss": 0.499, + "step": 4056 + }, + { + "epoch": 0.32, + "grad_norm": 1.6176892427428313, + "learning_rate": 7.97001449856376e-06, + "loss": 0.5055, + "step": 4057 + }, + { + "epoch": 0.32, + "grad_norm": 1.5677191456024067, + "learning_rate": 7.968991289092123e-06, + "loss": 0.4231, + "step": 4058 + }, + { + "epoch": 0.32, + "grad_norm": 1.212052602837464, + "learning_rate": 7.967967887530631e-06, + "loss": 0.4456, + "step": 4059 + }, + { + "epoch": 0.32, + "grad_norm": 1.6576546195564046, + "learning_rate": 7.966944293945503e-06, + "loss": 0.4715, + "step": 4060 + }, + { + "epoch": 0.32, + "grad_norm": 1.69966167920983, + "learning_rate": 7.965920508402959e-06, + "loss": 0.5033, + "step": 4061 + }, + { + "epoch": 0.32, + "grad_norm": 1.3995408532356108, + "learning_rate": 7.964896530969237e-06, + "loss": 0.5359, + "step": 4062 + }, + { + "epoch": 0.32, + "grad_norm": 1.6905069407766817, + "learning_rate": 7.963872361710589e-06, + "loss": 0.5157, + "step": 4063 + }, + { + "epoch": 0.32, + "grad_norm": 2.3803825133647822, + "learning_rate": 7.962848000693277e-06, + "loss": 0.4781, + "step": 4064 + }, + { + "epoch": 0.32, + "grad_norm": 1.4286237377798023, + "learning_rate": 7.961823447983576e-06, + "loss": 0.4873, + "step": 4065 + }, + { + "epoch": 0.32, + "grad_norm": 1.8902129789726683, + "learning_rate": 7.96079870364777e-06, + "loss": 0.519, + "step": 4066 + }, + { + "epoch": 0.32, + "grad_norm": 1.504021132366671, + "learning_rate": 7.959773767752163e-06, + "loss": 0.5092, + "step": 4067 + }, + { + "epoch": 0.32, + "grad_norm": 1.5171075836029009, + "learning_rate": 7.958748640363065e-06, + "loss": 0.436, + "step": 4068 + }, + { + "epoch": 0.32, + "grad_norm": 1.8020376946163292, + "learning_rate": 7.9577233215468e-06, + "loss": 0.5203, + "step": 4069 + }, + { + "epoch": 0.32, + "grad_norm": 1.3814921560417932, + "learning_rate": 7.956697811369704e-06, + "loss": 0.4326, + "step": 4070 + }, + { + "epoch": 0.32, + "grad_norm": 1.629158151940029, + "learning_rate": 7.95567210989813e-06, + "loss": 0.4273, + "step": 4071 + }, + { + "epoch": 0.32, + "grad_norm": 2.2891253925793804, + "learning_rate": 7.954646217198434e-06, + "loss": 0.5013, + "step": 4072 + }, + { + "epoch": 0.32, + "grad_norm": 1.6044810417299975, + "learning_rate": 7.953620133336995e-06, + "loss": 0.4921, + "step": 4073 + }, + { + "epoch": 0.32, + "grad_norm": 1.5442885447453165, + "learning_rate": 7.952593858380197e-06, + "loss": 0.5251, + "step": 4074 + }, + { + "epoch": 0.32, + "grad_norm": 1.6676853794346922, + "learning_rate": 7.951567392394438e-06, + "loss": 0.4753, + "step": 4075 + }, + { + "epoch": 0.32, + "grad_norm": 0.5889792752662136, + "learning_rate": 7.95054073544613e-06, + "loss": 0.5237, + "step": 4076 + }, + { + "epoch": 0.32, + "grad_norm": 1.4807186295379637, + "learning_rate": 7.949513887601698e-06, + "loss": 0.4915, + "step": 4077 + }, + { + "epoch": 0.32, + "grad_norm": 1.3959521919083293, + "learning_rate": 7.948486848927574e-06, + "loss": 0.483, + "step": 4078 + }, + { + "epoch": 0.32, + "grad_norm": 1.5266533859434668, + "learning_rate": 7.947459619490208e-06, + "loss": 0.4874, + "step": 4079 + }, + { + "epoch": 0.32, + "grad_norm": 1.6435249261332197, + "learning_rate": 7.946432199356062e-06, + "loss": 0.4804, + "step": 4080 + }, + { + "epoch": 0.32, + "grad_norm": 1.4526072170466304, + "learning_rate": 7.945404588591605e-06, + "loss": 0.4896, + "step": 4081 + }, + { + "epoch": 0.32, + "grad_norm": 1.2328381719751407, + "learning_rate": 7.944376787263327e-06, + "loss": 0.503, + "step": 4082 + }, + { + "epoch": 0.32, + "grad_norm": 2.229904715614715, + "learning_rate": 7.94334879543772e-06, + "loss": 0.4746, + "step": 4083 + }, + { + "epoch": 0.32, + "grad_norm": 1.6103177866547165, + "learning_rate": 7.942320613181296e-06, + "loss": 0.5004, + "step": 4084 + }, + { + "epoch": 0.32, + "grad_norm": 1.5857711224671605, + "learning_rate": 7.941292240560579e-06, + "loss": 0.4634, + "step": 4085 + }, + { + "epoch": 0.32, + "grad_norm": 1.602692541470008, + "learning_rate": 7.940263677642102e-06, + "loss": 0.4779, + "step": 4086 + }, + { + "epoch": 0.32, + "grad_norm": 2.0757221030325366, + "learning_rate": 7.93923492449241e-06, + "loss": 0.4905, + "step": 4087 + }, + { + "epoch": 0.32, + "grad_norm": 1.3615527604840532, + "learning_rate": 7.938205981178065e-06, + "loss": 0.4486, + "step": 4088 + }, + { + "epoch": 0.32, + "grad_norm": 1.6727827033567302, + "learning_rate": 7.937176847765636e-06, + "loss": 0.5394, + "step": 4089 + }, + { + "epoch": 0.32, + "grad_norm": 1.2473445348295324, + "learning_rate": 7.936147524321708e-06, + "loss": 0.466, + "step": 4090 + }, + { + "epoch": 0.32, + "grad_norm": 1.2816778775383308, + "learning_rate": 7.935118010912873e-06, + "loss": 0.4993, + "step": 4091 + }, + { + "epoch": 0.32, + "grad_norm": 1.440629597104561, + "learning_rate": 7.934088307605745e-06, + "loss": 0.512, + "step": 4092 + }, + { + "epoch": 0.32, + "grad_norm": 1.568559114025573, + "learning_rate": 7.93305841446694e-06, + "loss": 0.539, + "step": 4093 + }, + { + "epoch": 0.32, + "grad_norm": 1.9476506654081815, + "learning_rate": 7.932028331563095e-06, + "loss": 0.4789, + "step": 4094 + }, + { + "epoch": 0.32, + "grad_norm": 2.243029946142988, + "learning_rate": 7.93099805896085e-06, + "loss": 0.5198, + "step": 4095 + }, + { + "epoch": 0.32, + "grad_norm": 11.364859297254679, + "learning_rate": 7.929967596726866e-06, + "loss": 0.5008, + "step": 4096 + }, + { + "epoch": 0.32, + "grad_norm": 1.7778495184017291, + "learning_rate": 7.928936944927813e-06, + "loss": 0.499, + "step": 4097 + }, + { + "epoch": 0.32, + "grad_norm": 1.5856929359383394, + "learning_rate": 7.927906103630368e-06, + "loss": 0.4702, + "step": 4098 + }, + { + "epoch": 0.32, + "grad_norm": 1.7069058079254769, + "learning_rate": 7.92687507290123e-06, + "loss": 0.5184, + "step": 4099 + }, + { + "epoch": 0.32, + "grad_norm": 1.7710637362504889, + "learning_rate": 7.925843852807105e-06, + "loss": 0.4935, + "step": 4100 + }, + { + "epoch": 0.32, + "grad_norm": 1.4560907142158446, + "learning_rate": 7.924812443414708e-06, + "loss": 0.5715, + "step": 4101 + }, + { + "epoch": 0.32, + "grad_norm": 1.3669236718316833, + "learning_rate": 7.923780844790772e-06, + "loss": 0.4468, + "step": 4102 + }, + { + "epoch": 0.32, + "grad_norm": 1.367754917310993, + "learning_rate": 7.922749057002041e-06, + "loss": 0.4667, + "step": 4103 + }, + { + "epoch": 0.32, + "grad_norm": 2.403031543273243, + "learning_rate": 7.92171708011527e-06, + "loss": 0.501, + "step": 4104 + }, + { + "epoch": 0.32, + "grad_norm": 1.2707390845414877, + "learning_rate": 7.920684914197225e-06, + "loss": 0.467, + "step": 4105 + }, + { + "epoch": 0.32, + "grad_norm": 1.5582544157719762, + "learning_rate": 7.919652559314686e-06, + "loss": 0.5217, + "step": 4106 + }, + { + "epoch": 0.32, + "grad_norm": 1.4107535433494207, + "learning_rate": 7.91862001553445e-06, + "loss": 0.5294, + "step": 4107 + }, + { + "epoch": 0.32, + "grad_norm": 1.8293405497207536, + "learning_rate": 7.917587282923312e-06, + "loss": 0.518, + "step": 4108 + }, + { + "epoch": 0.32, + "grad_norm": 1.4202769630295764, + "learning_rate": 7.916554361548094e-06, + "loss": 0.491, + "step": 4109 + }, + { + "epoch": 0.32, + "grad_norm": 1.4502692037290117, + "learning_rate": 7.915521251475627e-06, + "loss": 0.4469, + "step": 4110 + }, + { + "epoch": 0.32, + "grad_norm": 1.2694529812182196, + "learning_rate": 7.914487952772747e-06, + "loss": 0.4815, + "step": 4111 + }, + { + "epoch": 0.32, + "grad_norm": 1.866413259977874, + "learning_rate": 7.91345446550631e-06, + "loss": 0.4752, + "step": 4112 + }, + { + "epoch": 0.32, + "grad_norm": 1.6136470799592084, + "learning_rate": 7.91242078974318e-06, + "loss": 0.506, + "step": 4113 + }, + { + "epoch": 0.32, + "grad_norm": 1.436527970906703, + "learning_rate": 7.911386925550235e-06, + "loss": 0.4589, + "step": 4114 + }, + { + "epoch": 0.32, + "grad_norm": 1.3313482782794848, + "learning_rate": 7.910352872994365e-06, + "loss": 0.4992, + "step": 4115 + }, + { + "epoch": 0.32, + "grad_norm": 1.3311467914392403, + "learning_rate": 7.90931863214247e-06, + "loss": 0.4249, + "step": 4116 + }, + { + "epoch": 0.32, + "grad_norm": 0.5781367219714921, + "learning_rate": 7.908284203061466e-06, + "loss": 0.5454, + "step": 4117 + }, + { + "epoch": 0.32, + "grad_norm": 1.6552780352709309, + "learning_rate": 7.907249585818278e-06, + "loss": 0.5042, + "step": 4118 + }, + { + "epoch": 0.32, + "grad_norm": 2.4229778448762045, + "learning_rate": 7.906214780479846e-06, + "loss": 0.4979, + "step": 4119 + }, + { + "epoch": 0.32, + "grad_norm": 0.5765427392624005, + "learning_rate": 7.90517978711312e-06, + "loss": 0.5154, + "step": 4120 + }, + { + "epoch": 0.32, + "grad_norm": 0.6009325451381912, + "learning_rate": 7.90414460578506e-06, + "loss": 0.5371, + "step": 4121 + }, + { + "epoch": 0.32, + "grad_norm": 1.8050804278266164, + "learning_rate": 7.903109236562645e-06, + "loss": 0.4676, + "step": 4122 + }, + { + "epoch": 0.32, + "grad_norm": 1.813375924674175, + "learning_rate": 7.902073679512859e-06, + "loss": 0.4897, + "step": 4123 + }, + { + "epoch": 0.32, + "grad_norm": 2.046627447310638, + "learning_rate": 7.901037934702703e-06, + "loss": 0.4893, + "step": 4124 + }, + { + "epoch": 0.32, + "grad_norm": 1.6664566752430592, + "learning_rate": 7.900002002199188e-06, + "loss": 0.5201, + "step": 4125 + }, + { + "epoch": 0.32, + "grad_norm": 1.9219164849935226, + "learning_rate": 7.898965882069336e-06, + "loss": 0.4968, + "step": 4126 + }, + { + "epoch": 0.32, + "grad_norm": 2.125292604415674, + "learning_rate": 7.897929574380186e-06, + "loss": 0.4609, + "step": 4127 + }, + { + "epoch": 0.32, + "grad_norm": 6.824812098885272, + "learning_rate": 7.89689307919878e-06, + "loss": 0.4861, + "step": 4128 + }, + { + "epoch": 0.32, + "grad_norm": 1.6624107478692396, + "learning_rate": 7.895856396592183e-06, + "loss": 0.4713, + "step": 4129 + }, + { + "epoch": 0.32, + "grad_norm": 2.2590290735866265, + "learning_rate": 7.894819526627466e-06, + "loss": 0.4662, + "step": 4130 + }, + { + "epoch": 0.32, + "grad_norm": 1.7029828802828137, + "learning_rate": 7.893782469371713e-06, + "loss": 0.4792, + "step": 4131 + }, + { + "epoch": 0.32, + "grad_norm": 1.4832039434813225, + "learning_rate": 7.89274522489202e-06, + "loss": 0.5066, + "step": 4132 + }, + { + "epoch": 0.32, + "grad_norm": 1.7773272716599027, + "learning_rate": 7.891707793255493e-06, + "loss": 0.5121, + "step": 4133 + }, + { + "epoch": 0.32, + "grad_norm": 1.3660085678181846, + "learning_rate": 7.890670174529255e-06, + "loss": 0.4616, + "step": 4134 + }, + { + "epoch": 0.32, + "grad_norm": 2.501142812654778, + "learning_rate": 7.88963236878044e-06, + "loss": 0.4622, + "step": 4135 + }, + { + "epoch": 0.32, + "grad_norm": 1.5636206448109682, + "learning_rate": 7.88859437607619e-06, + "loss": 0.5202, + "step": 4136 + }, + { + "epoch": 0.32, + "grad_norm": 1.6692075127352977, + "learning_rate": 7.887556196483663e-06, + "loss": 0.4955, + "step": 4137 + }, + { + "epoch": 0.32, + "grad_norm": 1.6988705188011985, + "learning_rate": 7.886517830070026e-06, + "loss": 0.4837, + "step": 4138 + }, + { + "epoch": 0.33, + "grad_norm": 2.0673187366579593, + "learning_rate": 7.885479276902464e-06, + "loss": 0.4766, + "step": 4139 + }, + { + "epoch": 0.33, + "grad_norm": 1.4098061237902029, + "learning_rate": 7.884440537048163e-06, + "loss": 0.4985, + "step": 4140 + }, + { + "epoch": 0.33, + "grad_norm": 1.4924026645001316, + "learning_rate": 7.883401610574338e-06, + "loss": 0.489, + "step": 4141 + }, + { + "epoch": 0.33, + "grad_norm": 1.4573804922750397, + "learning_rate": 7.882362497548197e-06, + "loss": 0.4869, + "step": 4142 + }, + { + "epoch": 0.33, + "grad_norm": 1.4517361199423946, + "learning_rate": 7.881323198036974e-06, + "loss": 0.4436, + "step": 4143 + }, + { + "epoch": 0.33, + "grad_norm": 0.6590676405695403, + "learning_rate": 7.88028371210791e-06, + "loss": 0.5504, + "step": 4144 + }, + { + "epoch": 0.33, + "grad_norm": 1.8019173328002358, + "learning_rate": 7.879244039828256e-06, + "loss": 0.531, + "step": 4145 + }, + { + "epoch": 0.33, + "grad_norm": 1.4937530135517516, + "learning_rate": 7.87820418126528e-06, + "loss": 0.5183, + "step": 4146 + }, + { + "epoch": 0.33, + "grad_norm": 1.5752403055221513, + "learning_rate": 7.877164136486259e-06, + "loss": 0.4899, + "step": 4147 + }, + { + "epoch": 0.33, + "grad_norm": 2.6243703146721367, + "learning_rate": 7.876123905558484e-06, + "loss": 0.4902, + "step": 4148 + }, + { + "epoch": 0.33, + "grad_norm": 6.597020683694799, + "learning_rate": 7.875083488549253e-06, + "loss": 0.4661, + "step": 4149 + }, + { + "epoch": 0.33, + "grad_norm": 1.502917546029407, + "learning_rate": 7.874042885525881e-06, + "loss": 0.5115, + "step": 4150 + }, + { + "epoch": 0.33, + "grad_norm": 7.689924761462897, + "learning_rate": 7.873002096555692e-06, + "loss": 0.4922, + "step": 4151 + }, + { + "epoch": 0.33, + "grad_norm": 1.695146824475652, + "learning_rate": 7.871961121706029e-06, + "loss": 0.4588, + "step": 4152 + }, + { + "epoch": 0.33, + "grad_norm": 1.70957677741513, + "learning_rate": 7.870919961044237e-06, + "loss": 0.4678, + "step": 4153 + }, + { + "epoch": 0.33, + "grad_norm": 2.6214926425123655, + "learning_rate": 7.86987861463768e-06, + "loss": 0.4847, + "step": 4154 + }, + { + "epoch": 0.33, + "grad_norm": 1.995431595030365, + "learning_rate": 7.868837082553731e-06, + "loss": 0.4929, + "step": 4155 + }, + { + "epoch": 0.33, + "grad_norm": 2.0198959432674877, + "learning_rate": 7.867795364859775e-06, + "loss": 0.5065, + "step": 4156 + }, + { + "epoch": 0.33, + "grad_norm": 1.4971321604119767, + "learning_rate": 7.86675346162321e-06, + "loss": 0.4795, + "step": 4157 + }, + { + "epoch": 0.33, + "grad_norm": 2.2395556283901006, + "learning_rate": 7.865711372911447e-06, + "loss": 0.5029, + "step": 4158 + }, + { + "epoch": 0.33, + "grad_norm": 1.6957484652427492, + "learning_rate": 7.864669098791909e-06, + "loss": 0.46, + "step": 4159 + }, + { + "epoch": 0.33, + "grad_norm": 1.4358837636127295, + "learning_rate": 7.863626639332025e-06, + "loss": 0.5337, + "step": 4160 + }, + { + "epoch": 0.33, + "grad_norm": 1.6916769749781666, + "learning_rate": 7.862583994599243e-06, + "loss": 0.5001, + "step": 4161 + }, + { + "epoch": 0.33, + "grad_norm": 1.9731599251986733, + "learning_rate": 7.861541164661021e-06, + "loss": 0.5139, + "step": 4162 + }, + { + "epoch": 0.33, + "grad_norm": 1.4999651072760727, + "learning_rate": 7.860498149584833e-06, + "loss": 0.4858, + "step": 4163 + }, + { + "epoch": 0.33, + "grad_norm": 1.387618659334892, + "learning_rate": 7.859454949438152e-06, + "loss": 0.4541, + "step": 4164 + }, + { + "epoch": 0.33, + "grad_norm": 1.564276830578624, + "learning_rate": 7.85841156428848e-06, + "loss": 0.5416, + "step": 4165 + }, + { + "epoch": 0.33, + "grad_norm": 1.4098974690192476, + "learning_rate": 7.857367994203318e-06, + "loss": 0.4957, + "step": 4166 + }, + { + "epoch": 0.33, + "grad_norm": 1.4561781067742248, + "learning_rate": 7.856324239250184e-06, + "loss": 0.4762, + "step": 4167 + }, + { + "epoch": 0.33, + "grad_norm": 2.719403539111808, + "learning_rate": 7.855280299496608e-06, + "loss": 0.5103, + "step": 4168 + }, + { + "epoch": 0.33, + "grad_norm": 1.6596030629035754, + "learning_rate": 7.854236175010133e-06, + "loss": 0.4604, + "step": 4169 + }, + { + "epoch": 0.33, + "grad_norm": 1.4979912557762274, + "learning_rate": 7.853191865858309e-06, + "loss": 0.5129, + "step": 4170 + }, + { + "epoch": 0.33, + "grad_norm": 0.7186507377502518, + "learning_rate": 7.852147372108707e-06, + "loss": 0.526, + "step": 4171 + }, + { + "epoch": 0.33, + "grad_norm": 1.7678543981418764, + "learning_rate": 7.8511026938289e-06, + "loss": 0.497, + "step": 4172 + }, + { + "epoch": 0.33, + "grad_norm": 0.549440712275477, + "learning_rate": 7.850057831086477e-06, + "loss": 0.5189, + "step": 4173 + }, + { + "epoch": 0.33, + "grad_norm": 2.350207249731609, + "learning_rate": 7.849012783949042e-06, + "loss": 0.4733, + "step": 4174 + }, + { + "epoch": 0.33, + "grad_norm": 1.641963853619446, + "learning_rate": 7.847967552484206e-06, + "loss": 0.5138, + "step": 4175 + }, + { + "epoch": 0.33, + "grad_norm": 0.6361502759589202, + "learning_rate": 7.846922136759595e-06, + "loss": 0.548, + "step": 4176 + }, + { + "epoch": 0.33, + "grad_norm": 1.3954468879599369, + "learning_rate": 7.845876536842846e-06, + "loss": 0.4565, + "step": 4177 + }, + { + "epoch": 0.33, + "grad_norm": 1.4221757639390908, + "learning_rate": 7.84483075280161e-06, + "loss": 0.4928, + "step": 4178 + }, + { + "epoch": 0.33, + "grad_norm": 1.6486200275215523, + "learning_rate": 7.843784784703544e-06, + "loss": 0.489, + "step": 4179 + }, + { + "epoch": 0.33, + "grad_norm": 0.6504466518262612, + "learning_rate": 7.842738632616322e-06, + "loss": 0.512, + "step": 4180 + }, + { + "epoch": 0.33, + "grad_norm": 1.3234827743163595, + "learning_rate": 7.841692296607629e-06, + "loss": 0.4617, + "step": 4181 + }, + { + "epoch": 0.33, + "grad_norm": 1.4811971666373964, + "learning_rate": 7.840645776745165e-06, + "loss": 0.4304, + "step": 4182 + }, + { + "epoch": 0.33, + "grad_norm": 1.5150946132739578, + "learning_rate": 7.83959907309663e-06, + "loss": 0.5101, + "step": 4183 + }, + { + "epoch": 0.33, + "grad_norm": 2.2992113338676683, + "learning_rate": 7.838552185729755e-06, + "loss": 0.4882, + "step": 4184 + }, + { + "epoch": 0.33, + "grad_norm": 1.4131840329164382, + "learning_rate": 7.837505114712262e-06, + "loss": 0.5193, + "step": 4185 + }, + { + "epoch": 0.33, + "grad_norm": 1.8245638392446732, + "learning_rate": 7.836457860111903e-06, + "loss": 0.4864, + "step": 4186 + }, + { + "epoch": 0.33, + "grad_norm": 1.3301221038911126, + "learning_rate": 7.835410421996431e-06, + "loss": 0.487, + "step": 4187 + }, + { + "epoch": 0.33, + "grad_norm": 1.5461999576983239, + "learning_rate": 7.834362800433614e-06, + "loss": 0.5376, + "step": 4188 + }, + { + "epoch": 0.33, + "grad_norm": 1.7004356653527166, + "learning_rate": 7.83331499549123e-06, + "loss": 0.4547, + "step": 4189 + }, + { + "epoch": 0.33, + "grad_norm": 0.6555015208296363, + "learning_rate": 7.832267007237072e-06, + "loss": 0.5302, + "step": 4190 + }, + { + "epoch": 0.33, + "grad_norm": 0.6593330346118563, + "learning_rate": 7.831218835738947e-06, + "loss": 0.5242, + "step": 4191 + }, + { + "epoch": 0.33, + "grad_norm": 1.582080882643477, + "learning_rate": 7.830170481064666e-06, + "loss": 0.5394, + "step": 4192 + }, + { + "epoch": 0.33, + "grad_norm": 1.6508544385059107, + "learning_rate": 7.829121943282055e-06, + "loss": 0.5523, + "step": 4193 + }, + { + "epoch": 0.33, + "grad_norm": 1.6742404909920339, + "learning_rate": 7.828073222458956e-06, + "loss": 0.5545, + "step": 4194 + }, + { + "epoch": 0.33, + "grad_norm": 1.8422896479447675, + "learning_rate": 7.827024318663221e-06, + "loss": 0.4596, + "step": 4195 + }, + { + "epoch": 0.33, + "grad_norm": 1.5765984288053445, + "learning_rate": 7.825975231962708e-06, + "loss": 0.4949, + "step": 4196 + }, + { + "epoch": 0.33, + "grad_norm": 0.6570742754139188, + "learning_rate": 7.824925962425296e-06, + "loss": 0.522, + "step": 4197 + }, + { + "epoch": 0.33, + "grad_norm": 1.7059131431996593, + "learning_rate": 7.823876510118868e-06, + "loss": 0.4893, + "step": 4198 + }, + { + "epoch": 0.33, + "grad_norm": 1.9774316073254092, + "learning_rate": 7.822826875111327e-06, + "loss": 0.4876, + "step": 4199 + }, + { + "epoch": 0.33, + "grad_norm": 2.304397361202217, + "learning_rate": 7.821777057470578e-06, + "loss": 0.4807, + "step": 4200 + }, + { + "epoch": 0.33, + "grad_norm": 0.6353878235953156, + "learning_rate": 7.820727057264545e-06, + "loss": 0.5136, + "step": 4201 + }, + { + "epoch": 0.33, + "grad_norm": 0.5906650848994638, + "learning_rate": 7.81967687456116e-06, + "loss": 0.5059, + "step": 4202 + }, + { + "epoch": 0.33, + "grad_norm": 1.3077828727225753, + "learning_rate": 7.818626509428372e-06, + "loss": 0.425, + "step": 4203 + }, + { + "epoch": 0.33, + "grad_norm": 1.4211527533835069, + "learning_rate": 7.817575961934135e-06, + "loss": 0.4787, + "step": 4204 + }, + { + "epoch": 0.33, + "grad_norm": 1.450364846582341, + "learning_rate": 7.816525232146419e-06, + "loss": 0.4906, + "step": 4205 + }, + { + "epoch": 0.33, + "grad_norm": 0.6525828027271319, + "learning_rate": 7.815474320133204e-06, + "loss": 0.5293, + "step": 4206 + }, + { + "epoch": 0.33, + "grad_norm": 0.5980006335119961, + "learning_rate": 7.814423225962487e-06, + "loss": 0.5143, + "step": 4207 + }, + { + "epoch": 0.33, + "grad_norm": 2.354456419242298, + "learning_rate": 7.813371949702266e-06, + "loss": 0.4723, + "step": 4208 + }, + { + "epoch": 0.33, + "grad_norm": 1.4179018384539985, + "learning_rate": 7.812320491420562e-06, + "loss": 0.5521, + "step": 4209 + }, + { + "epoch": 0.33, + "grad_norm": 0.6035206885314006, + "learning_rate": 7.8112688511854e-06, + "loss": 0.5421, + "step": 4210 + }, + { + "epoch": 0.33, + "grad_norm": 1.409198393286427, + "learning_rate": 7.81021702906482e-06, + "loss": 0.4748, + "step": 4211 + }, + { + "epoch": 0.33, + "grad_norm": 1.3321669454118439, + "learning_rate": 7.809165025126876e-06, + "loss": 0.4904, + "step": 4212 + }, + { + "epoch": 0.33, + "grad_norm": 2.0578162721387177, + "learning_rate": 7.80811283943963e-06, + "loss": 0.4991, + "step": 4213 + }, + { + "epoch": 0.33, + "grad_norm": 1.4907443018659574, + "learning_rate": 7.807060472071156e-06, + "loss": 0.5328, + "step": 4214 + }, + { + "epoch": 0.33, + "grad_norm": 1.6605616861814372, + "learning_rate": 7.80600792308954e-06, + "loss": 0.5112, + "step": 4215 + }, + { + "epoch": 0.33, + "grad_norm": 1.365863358456906, + "learning_rate": 7.804955192562884e-06, + "loss": 0.5211, + "step": 4216 + }, + { + "epoch": 0.33, + "grad_norm": 2.1184132891837684, + "learning_rate": 7.803902280559296e-06, + "loss": 0.4855, + "step": 4217 + }, + { + "epoch": 0.33, + "grad_norm": 1.6669567306168196, + "learning_rate": 7.8028491871469e-06, + "loss": 0.4572, + "step": 4218 + }, + { + "epoch": 0.33, + "grad_norm": 1.9633928821733793, + "learning_rate": 7.801795912393826e-06, + "loss": 0.5276, + "step": 4219 + }, + { + "epoch": 0.33, + "grad_norm": 1.6226621724871384, + "learning_rate": 7.800742456368223e-06, + "loss": 0.5058, + "step": 4220 + }, + { + "epoch": 0.33, + "grad_norm": 0.7335203607416932, + "learning_rate": 7.799688819138244e-06, + "loss": 0.5392, + "step": 4221 + }, + { + "epoch": 0.33, + "grad_norm": 1.3667034946973886, + "learning_rate": 7.798635000772063e-06, + "loss": 0.4512, + "step": 4222 + }, + { + "epoch": 0.33, + "grad_norm": 1.9457655890353047, + "learning_rate": 7.797581001337859e-06, + "loss": 0.5087, + "step": 4223 + }, + { + "epoch": 0.33, + "grad_norm": 1.603192431856554, + "learning_rate": 7.796526820903824e-06, + "loss": 0.482, + "step": 4224 + }, + { + "epoch": 0.33, + "grad_norm": 1.5857141457337143, + "learning_rate": 7.795472459538163e-06, + "loss": 0.4743, + "step": 4225 + }, + { + "epoch": 0.33, + "grad_norm": 2.235978057919025, + "learning_rate": 7.794417917309088e-06, + "loss": 0.5077, + "step": 4226 + }, + { + "epoch": 0.33, + "grad_norm": 1.498696315947516, + "learning_rate": 7.79336319428483e-06, + "loss": 0.4989, + "step": 4227 + }, + { + "epoch": 0.33, + "grad_norm": 2.3561215025578375, + "learning_rate": 7.79230829053363e-06, + "loss": 0.491, + "step": 4228 + }, + { + "epoch": 0.33, + "grad_norm": 1.5315660631165804, + "learning_rate": 7.791253206123734e-06, + "loss": 0.4517, + "step": 4229 + }, + { + "epoch": 0.33, + "grad_norm": 1.4403277509595056, + "learning_rate": 7.790197941123407e-06, + "loss": 0.4144, + "step": 4230 + }, + { + "epoch": 0.33, + "grad_norm": 1.4247318531650885, + "learning_rate": 7.789142495600923e-06, + "loss": 0.485, + "step": 4231 + }, + { + "epoch": 0.33, + "grad_norm": 0.6847963478019263, + "learning_rate": 7.788086869624569e-06, + "loss": 0.5396, + "step": 4232 + }, + { + "epoch": 0.33, + "grad_norm": 1.4293670010167894, + "learning_rate": 7.787031063262643e-06, + "loss": 0.5038, + "step": 4233 + }, + { + "epoch": 0.33, + "grad_norm": 1.495505904343361, + "learning_rate": 7.785975076583452e-06, + "loss": 0.4783, + "step": 4234 + }, + { + "epoch": 0.33, + "grad_norm": 0.5435137052644385, + "learning_rate": 7.784918909655316e-06, + "loss": 0.5084, + "step": 4235 + }, + { + "epoch": 0.33, + "grad_norm": 1.7006228240395367, + "learning_rate": 7.783862562546574e-06, + "loss": 0.5017, + "step": 4236 + }, + { + "epoch": 0.33, + "grad_norm": 2.359351762023234, + "learning_rate": 7.782806035325562e-06, + "loss": 0.4868, + "step": 4237 + }, + { + "epoch": 0.33, + "grad_norm": 1.6454585815392235, + "learning_rate": 7.781749328060642e-06, + "loss": 0.5227, + "step": 4238 + }, + { + "epoch": 0.33, + "grad_norm": 1.4938601073418323, + "learning_rate": 7.780692440820179e-06, + "loss": 0.4969, + "step": 4239 + }, + { + "epoch": 0.33, + "grad_norm": 2.191547574108064, + "learning_rate": 7.779635373672554e-06, + "loss": 0.4857, + "step": 4240 + }, + { + "epoch": 0.33, + "grad_norm": 1.925197255205875, + "learning_rate": 7.778578126686154e-06, + "loss": 0.4996, + "step": 4241 + }, + { + "epoch": 0.33, + "grad_norm": 1.3058125515773447, + "learning_rate": 7.777520699929383e-06, + "loss": 0.473, + "step": 4242 + }, + { + "epoch": 0.33, + "grad_norm": 1.3567887041266828, + "learning_rate": 7.776463093470661e-06, + "loss": 0.4638, + "step": 4243 + }, + { + "epoch": 0.33, + "grad_norm": 0.7039260955090508, + "learning_rate": 7.775405307378406e-06, + "loss": 0.5267, + "step": 4244 + }, + { + "epoch": 0.33, + "grad_norm": 1.8912792826922173, + "learning_rate": 7.774347341721058e-06, + "loss": 0.4672, + "step": 4245 + }, + { + "epoch": 0.33, + "grad_norm": 0.5600752712026751, + "learning_rate": 7.773289196567066e-06, + "loss": 0.5014, + "step": 4246 + }, + { + "epoch": 0.33, + "grad_norm": 1.6699735908437303, + "learning_rate": 7.772230871984893e-06, + "loss": 0.4986, + "step": 4247 + }, + { + "epoch": 0.33, + "grad_norm": 1.8646478285312775, + "learning_rate": 7.771172368043008e-06, + "loss": 0.4956, + "step": 4248 + }, + { + "epoch": 0.33, + "grad_norm": 1.3423309797242764, + "learning_rate": 7.770113684809896e-06, + "loss": 0.5157, + "step": 4249 + }, + { + "epoch": 0.33, + "grad_norm": 1.4599978940784153, + "learning_rate": 7.769054822354052e-06, + "loss": 0.5383, + "step": 4250 + }, + { + "epoch": 0.33, + "grad_norm": 1.28204164325328, + "learning_rate": 7.767995780743985e-06, + "loss": 0.4631, + "step": 4251 + }, + { + "epoch": 0.33, + "grad_norm": 1.8765615277182686, + "learning_rate": 7.766936560048209e-06, + "loss": 0.4678, + "step": 4252 + }, + { + "epoch": 0.33, + "grad_norm": 2.058919150482084, + "learning_rate": 7.765877160335258e-06, + "loss": 0.4909, + "step": 4253 + }, + { + "epoch": 0.33, + "grad_norm": 2.6900241968797984, + "learning_rate": 7.764817581673673e-06, + "loss": 0.4773, + "step": 4254 + }, + { + "epoch": 0.33, + "grad_norm": 0.8093000996695546, + "learning_rate": 7.76375782413201e-06, + "loss": 0.5018, + "step": 4255 + }, + { + "epoch": 0.33, + "grad_norm": 1.575340846990629, + "learning_rate": 7.762697887778828e-06, + "loss": 0.4902, + "step": 4256 + }, + { + "epoch": 0.33, + "grad_norm": 2.3513216223042015, + "learning_rate": 7.761637772682709e-06, + "loss": 0.4858, + "step": 4257 + }, + { + "epoch": 0.33, + "grad_norm": 1.351264589488454, + "learning_rate": 7.760577478912237e-06, + "loss": 0.4736, + "step": 4258 + }, + { + "epoch": 0.33, + "grad_norm": 1.4763792852312911, + "learning_rate": 7.759517006536015e-06, + "loss": 0.5045, + "step": 4259 + }, + { + "epoch": 0.33, + "grad_norm": 2.2133795120987636, + "learning_rate": 7.758456355622651e-06, + "loss": 0.5099, + "step": 4260 + }, + { + "epoch": 0.33, + "grad_norm": 1.5133623030142365, + "learning_rate": 7.75739552624077e-06, + "loss": 0.481, + "step": 4261 + }, + { + "epoch": 0.33, + "grad_norm": 1.35383302772019, + "learning_rate": 7.756334518459006e-06, + "loss": 0.4511, + "step": 4262 + }, + { + "epoch": 0.33, + "grad_norm": 1.4520839797217628, + "learning_rate": 7.755273332346004e-06, + "loss": 0.4753, + "step": 4263 + }, + { + "epoch": 0.33, + "grad_norm": 0.6741063390538984, + "learning_rate": 7.75421196797042e-06, + "loss": 0.546, + "step": 4264 + }, + { + "epoch": 0.33, + "grad_norm": 2.7485147859764902, + "learning_rate": 7.753150425400928e-06, + "loss": 0.4776, + "step": 4265 + }, + { + "epoch": 0.34, + "grad_norm": 1.4267801249566783, + "learning_rate": 7.752088704706201e-06, + "loss": 0.4402, + "step": 4266 + }, + { + "epoch": 0.34, + "grad_norm": 1.6923576111351786, + "learning_rate": 7.75102680595494e-06, + "loss": 0.5117, + "step": 4267 + }, + { + "epoch": 0.34, + "grad_norm": 2.153602201020611, + "learning_rate": 7.74996472921584e-06, + "loss": 0.5003, + "step": 4268 + }, + { + "epoch": 0.34, + "grad_norm": 1.7486212430766905, + "learning_rate": 7.74890247455762e-06, + "loss": 0.4964, + "step": 4269 + }, + { + "epoch": 0.34, + "grad_norm": 1.481852831581686, + "learning_rate": 7.747840042049006e-06, + "loss": 0.4153, + "step": 4270 + }, + { + "epoch": 0.34, + "grad_norm": 0.602232273034823, + "learning_rate": 7.746777431758737e-06, + "loss": 0.5101, + "step": 4271 + }, + { + "epoch": 0.34, + "grad_norm": 1.650932206242909, + "learning_rate": 7.74571464375556e-06, + "loss": 0.5216, + "step": 4272 + }, + { + "epoch": 0.34, + "grad_norm": 3.35510304234774, + "learning_rate": 7.744651678108238e-06, + "loss": 0.4667, + "step": 4273 + }, + { + "epoch": 0.34, + "grad_norm": 2.973813397319983, + "learning_rate": 7.743588534885543e-06, + "loss": 0.4249, + "step": 4274 + }, + { + "epoch": 0.34, + "grad_norm": 1.1722369170509006, + "learning_rate": 7.742525214156257e-06, + "loss": 0.4618, + "step": 4275 + }, + { + "epoch": 0.34, + "grad_norm": 1.3638551695917065, + "learning_rate": 7.74146171598918e-06, + "loss": 0.4657, + "step": 4276 + }, + { + "epoch": 0.34, + "grad_norm": 0.5544442393026597, + "learning_rate": 7.740398040453115e-06, + "loss": 0.531, + "step": 4277 + }, + { + "epoch": 0.34, + "grad_norm": 2.174774090183045, + "learning_rate": 7.739334187616883e-06, + "loss": 0.4782, + "step": 4278 + }, + { + "epoch": 0.34, + "grad_norm": 1.4825862363176272, + "learning_rate": 7.73827015754931e-06, + "loss": 0.478, + "step": 4279 + }, + { + "epoch": 0.34, + "grad_norm": 1.7629979481365252, + "learning_rate": 7.737205950319242e-06, + "loss": 0.4932, + "step": 4280 + }, + { + "epoch": 0.34, + "grad_norm": 0.5951301622477199, + "learning_rate": 7.736141565995529e-06, + "loss": 0.5347, + "step": 4281 + }, + { + "epoch": 0.34, + "grad_norm": 1.7866128860923791, + "learning_rate": 7.735077004647036e-06, + "loss": 0.4741, + "step": 4282 + }, + { + "epoch": 0.34, + "grad_norm": 1.324560280427871, + "learning_rate": 7.73401226634264e-06, + "loss": 0.4114, + "step": 4283 + }, + { + "epoch": 0.34, + "grad_norm": 1.7554236939088093, + "learning_rate": 7.732947351151225e-06, + "loss": 0.4902, + "step": 4284 + }, + { + "epoch": 0.34, + "grad_norm": 1.2934189087565242, + "learning_rate": 7.731882259141692e-06, + "loss": 0.4598, + "step": 4285 + }, + { + "epoch": 0.34, + "grad_norm": 1.7928806488371776, + "learning_rate": 7.730816990382951e-06, + "loss": 0.5202, + "step": 4286 + }, + { + "epoch": 0.34, + "grad_norm": 1.4284621759652492, + "learning_rate": 7.729751544943921e-06, + "loss": 0.467, + "step": 4287 + }, + { + "epoch": 0.34, + "grad_norm": 1.5552575056622058, + "learning_rate": 7.72868592289354e-06, + "loss": 0.4893, + "step": 4288 + }, + { + "epoch": 0.34, + "grad_norm": 2.6302013283580403, + "learning_rate": 7.727620124300748e-06, + "loss": 0.5084, + "step": 4289 + }, + { + "epoch": 0.34, + "grad_norm": 1.4334367953649194, + "learning_rate": 7.726554149234504e-06, + "loss": 0.4998, + "step": 4290 + }, + { + "epoch": 0.34, + "grad_norm": 1.3376296537783832, + "learning_rate": 7.725487997763772e-06, + "loss": 0.4847, + "step": 4291 + }, + { + "epoch": 0.34, + "grad_norm": 1.4780406845783058, + "learning_rate": 7.72442166995753e-06, + "loss": 0.5186, + "step": 4292 + }, + { + "epoch": 0.34, + "grad_norm": 1.3813080537532703, + "learning_rate": 7.723355165884772e-06, + "loss": 0.5, + "step": 4293 + }, + { + "epoch": 0.34, + "grad_norm": 1.4369200367525592, + "learning_rate": 7.722288485614498e-06, + "loss": 0.5019, + "step": 4294 + }, + { + "epoch": 0.34, + "grad_norm": 1.375635666083701, + "learning_rate": 7.721221629215718e-06, + "loss": 0.4748, + "step": 4295 + }, + { + "epoch": 0.34, + "grad_norm": 1.5525079445059042, + "learning_rate": 7.72015459675746e-06, + "loss": 0.5301, + "step": 4296 + }, + { + "epoch": 0.34, + "grad_norm": 0.6051327980844967, + "learning_rate": 7.719087388308756e-06, + "loss": 0.5236, + "step": 4297 + }, + { + "epoch": 0.34, + "grad_norm": 1.9258708994796352, + "learning_rate": 7.718020003938658e-06, + "loss": 0.4618, + "step": 4298 + }, + { + "epoch": 0.34, + "grad_norm": 2.0605787144660113, + "learning_rate": 7.716952443716218e-06, + "loss": 0.4406, + "step": 4299 + }, + { + "epoch": 0.34, + "grad_norm": 1.891532882614519, + "learning_rate": 7.715884707710513e-06, + "loss": 0.4702, + "step": 4300 + }, + { + "epoch": 0.34, + "grad_norm": 1.723559293544447, + "learning_rate": 7.714816795990618e-06, + "loss": 0.4704, + "step": 4301 + }, + { + "epoch": 0.34, + "grad_norm": 1.8147073341472029, + "learning_rate": 7.713748708625627e-06, + "loss": 0.4751, + "step": 4302 + }, + { + "epoch": 0.34, + "grad_norm": 1.577078186747778, + "learning_rate": 7.712680445684646e-06, + "loss": 0.4652, + "step": 4303 + }, + { + "epoch": 0.34, + "grad_norm": 1.8640570688823983, + "learning_rate": 7.711612007236789e-06, + "loss": 0.4881, + "step": 4304 + }, + { + "epoch": 0.34, + "grad_norm": 1.4361075905430707, + "learning_rate": 7.710543393351182e-06, + "loss": 0.4746, + "step": 4305 + }, + { + "epoch": 0.34, + "grad_norm": 1.5909832680837996, + "learning_rate": 7.709474604096963e-06, + "loss": 0.495, + "step": 4306 + }, + { + "epoch": 0.34, + "grad_norm": 0.6292558805088015, + "learning_rate": 7.708405639543281e-06, + "loss": 0.5228, + "step": 4307 + }, + { + "epoch": 0.34, + "grad_norm": 1.9821311711865293, + "learning_rate": 7.707336499759298e-06, + "loss": 0.4601, + "step": 4308 + }, + { + "epoch": 0.34, + "grad_norm": 1.7184752520929847, + "learning_rate": 7.706267184814184e-06, + "loss": 0.4763, + "step": 4309 + }, + { + "epoch": 0.34, + "grad_norm": 1.5242265105441464, + "learning_rate": 7.705197694777124e-06, + "loss": 0.4218, + "step": 4310 + }, + { + "epoch": 0.34, + "grad_norm": 1.587975544495384, + "learning_rate": 7.704128029717312e-06, + "loss": 0.5029, + "step": 4311 + }, + { + "epoch": 0.34, + "grad_norm": 1.5997077328610512, + "learning_rate": 7.703058189703955e-06, + "loss": 0.5065, + "step": 4312 + }, + { + "epoch": 0.34, + "grad_norm": 1.5145095031762683, + "learning_rate": 7.701988174806268e-06, + "loss": 0.4859, + "step": 4313 + }, + { + "epoch": 0.34, + "grad_norm": 1.4428099085336208, + "learning_rate": 7.70091798509348e-06, + "loss": 0.4434, + "step": 4314 + }, + { + "epoch": 0.34, + "grad_norm": 1.6745222330239091, + "learning_rate": 7.699847620634834e-06, + "loss": 0.5171, + "step": 4315 + }, + { + "epoch": 0.34, + "grad_norm": 1.739370528880135, + "learning_rate": 7.698777081499578e-06, + "loss": 0.4832, + "step": 4316 + }, + { + "epoch": 0.34, + "grad_norm": 2.964668465668289, + "learning_rate": 7.697706367756975e-06, + "loss": 0.4848, + "step": 4317 + }, + { + "epoch": 0.34, + "grad_norm": 0.718421245827606, + "learning_rate": 7.696635479476297e-06, + "loss": 0.532, + "step": 4318 + }, + { + "epoch": 0.34, + "grad_norm": 4.375451919173206, + "learning_rate": 7.695564416726833e-06, + "loss": 0.4851, + "step": 4319 + }, + { + "epoch": 0.34, + "grad_norm": 0.613946048655244, + "learning_rate": 7.69449317957788e-06, + "loss": 0.533, + "step": 4320 + }, + { + "epoch": 0.34, + "grad_norm": 0.6089080656732121, + "learning_rate": 7.693421768098737e-06, + "loss": 0.5141, + "step": 4321 + }, + { + "epoch": 0.34, + "grad_norm": 0.574601259765763, + "learning_rate": 7.692350182358734e-06, + "loss": 0.5521, + "step": 4322 + }, + { + "epoch": 0.34, + "grad_norm": 2.341151563807748, + "learning_rate": 7.691278422427195e-06, + "loss": 0.5249, + "step": 4323 + }, + { + "epoch": 0.34, + "grad_norm": 3.2117090562204527, + "learning_rate": 7.690206488373462e-06, + "loss": 0.4793, + "step": 4324 + }, + { + "epoch": 0.34, + "grad_norm": 1.5602181295175142, + "learning_rate": 7.689134380266889e-06, + "loss": 0.5237, + "step": 4325 + }, + { + "epoch": 0.34, + "grad_norm": 2.0577191995624338, + "learning_rate": 7.688062098176839e-06, + "loss": 0.4836, + "step": 4326 + }, + { + "epoch": 0.34, + "grad_norm": 1.6869708829235728, + "learning_rate": 7.686989642172689e-06, + "loss": 0.4894, + "step": 4327 + }, + { + "epoch": 0.34, + "grad_norm": 1.7977093888484492, + "learning_rate": 7.685917012323823e-06, + "loss": 0.5456, + "step": 4328 + }, + { + "epoch": 0.34, + "grad_norm": 1.8470741965190391, + "learning_rate": 7.68484420869964e-06, + "loss": 0.5004, + "step": 4329 + }, + { + "epoch": 0.34, + "grad_norm": 1.3816486970602246, + "learning_rate": 7.683771231369548e-06, + "loss": 0.4755, + "step": 4330 + }, + { + "epoch": 0.34, + "grad_norm": 1.8709481793134477, + "learning_rate": 7.68269808040297e-06, + "loss": 0.5383, + "step": 4331 + }, + { + "epoch": 0.34, + "grad_norm": 1.7425303780839247, + "learning_rate": 7.681624755869334e-06, + "loss": 0.506, + "step": 4332 + }, + { + "epoch": 0.34, + "grad_norm": 1.5565652519556754, + "learning_rate": 7.680551257838084e-06, + "loss": 0.51, + "step": 4333 + }, + { + "epoch": 0.34, + "grad_norm": 1.351851686327484, + "learning_rate": 7.679477586378677e-06, + "loss": 0.4737, + "step": 4334 + }, + { + "epoch": 0.34, + "grad_norm": 1.5454763805076641, + "learning_rate": 7.678403741560573e-06, + "loss": 0.4436, + "step": 4335 + }, + { + "epoch": 0.34, + "grad_norm": 2.2641535531679686, + "learning_rate": 7.677329723453252e-06, + "loss": 0.4917, + "step": 4336 + }, + { + "epoch": 0.34, + "grad_norm": 1.7143770970358974, + "learning_rate": 7.6762555321262e-06, + "loss": 0.5201, + "step": 4337 + }, + { + "epoch": 0.34, + "grad_norm": 1.5615910483231392, + "learning_rate": 7.675181167648915e-06, + "loss": 0.4738, + "step": 4338 + }, + { + "epoch": 0.34, + "grad_norm": 1.2275472229208917, + "learning_rate": 7.674106630090909e-06, + "loss": 0.4637, + "step": 4339 + }, + { + "epoch": 0.34, + "grad_norm": 1.5159703107992575, + "learning_rate": 7.673031919521702e-06, + "loss": 0.4676, + "step": 4340 + }, + { + "epoch": 0.34, + "grad_norm": 1.3755483131325623, + "learning_rate": 7.671957036010826e-06, + "loss": 0.4324, + "step": 4341 + }, + { + "epoch": 0.34, + "grad_norm": 1.6210919746364294, + "learning_rate": 7.670881979627826e-06, + "loss": 0.4892, + "step": 4342 + }, + { + "epoch": 0.34, + "grad_norm": 0.7309107173625531, + "learning_rate": 7.669806750442255e-06, + "loss": 0.5331, + "step": 4343 + }, + { + "epoch": 0.34, + "grad_norm": 1.5433164307483809, + "learning_rate": 7.66873134852368e-06, + "loss": 0.4964, + "step": 4344 + }, + { + "epoch": 0.34, + "grad_norm": 1.6817753380601144, + "learning_rate": 7.667655773941678e-06, + "loss": 0.4532, + "step": 4345 + }, + { + "epoch": 0.34, + "grad_norm": 1.5650110346154513, + "learning_rate": 7.666580026765837e-06, + "loss": 0.4849, + "step": 4346 + }, + { + "epoch": 0.34, + "grad_norm": 0.5494881443328056, + "learning_rate": 7.665504107065758e-06, + "loss": 0.5334, + "step": 4347 + }, + { + "epoch": 0.34, + "grad_norm": 2.3017144043290494, + "learning_rate": 7.66442801491105e-06, + "loss": 0.4896, + "step": 4348 + }, + { + "epoch": 0.34, + "grad_norm": 1.5904229633114444, + "learning_rate": 7.663351750371332e-06, + "loss": 0.4897, + "step": 4349 + }, + { + "epoch": 0.34, + "grad_norm": 0.6485757024964992, + "learning_rate": 7.662275313516243e-06, + "loss": 0.5211, + "step": 4350 + }, + { + "epoch": 0.34, + "grad_norm": 1.5231119531252881, + "learning_rate": 7.661198704415423e-06, + "loss": 0.4615, + "step": 4351 + }, + { + "epoch": 0.34, + "grad_norm": 2.265497318500195, + "learning_rate": 7.660121923138528e-06, + "loss": 0.4815, + "step": 4352 + }, + { + "epoch": 0.34, + "grad_norm": 1.4626762056154226, + "learning_rate": 7.659044969755225e-06, + "loss": 0.5032, + "step": 4353 + }, + { + "epoch": 0.34, + "grad_norm": 2.0540489265860185, + "learning_rate": 7.65796784433519e-06, + "loss": 0.5033, + "step": 4354 + }, + { + "epoch": 0.34, + "grad_norm": 1.4258538114224182, + "learning_rate": 7.656890546948112e-06, + "loss": 0.449, + "step": 4355 + }, + { + "epoch": 0.34, + "grad_norm": 1.4542109733978075, + "learning_rate": 7.655813077663691e-06, + "loss": 0.4649, + "step": 4356 + }, + { + "epoch": 0.34, + "grad_norm": 1.3480076472239677, + "learning_rate": 7.65473543655164e-06, + "loss": 0.5194, + "step": 4357 + }, + { + "epoch": 0.34, + "grad_norm": 1.3010966863527664, + "learning_rate": 7.653657623681679e-06, + "loss": 0.4509, + "step": 4358 + }, + { + "epoch": 0.34, + "grad_norm": 1.3394330729443062, + "learning_rate": 7.652579639123541e-06, + "loss": 0.4503, + "step": 4359 + }, + { + "epoch": 0.34, + "grad_norm": 1.5489017734953014, + "learning_rate": 7.651501482946969e-06, + "loss": 0.4735, + "step": 4360 + }, + { + "epoch": 0.34, + "grad_norm": 0.6590780554686906, + "learning_rate": 7.65042315522172e-06, + "loss": 0.5075, + "step": 4361 + }, + { + "epoch": 0.34, + "grad_norm": 0.6159152878545825, + "learning_rate": 7.649344656017562e-06, + "loss": 0.5392, + "step": 4362 + }, + { + "epoch": 0.34, + "grad_norm": 1.6473455556863108, + "learning_rate": 7.648265985404268e-06, + "loss": 0.4576, + "step": 4363 + }, + { + "epoch": 0.34, + "grad_norm": 0.5405783571301032, + "learning_rate": 7.647187143451631e-06, + "loss": 0.5133, + "step": 4364 + }, + { + "epoch": 0.34, + "grad_norm": 1.3620234173421468, + "learning_rate": 7.646108130229449e-06, + "loss": 0.4734, + "step": 4365 + }, + { + "epoch": 0.34, + "grad_norm": 1.779954695652949, + "learning_rate": 7.645028945807528e-06, + "loss": 0.4713, + "step": 4366 + }, + { + "epoch": 0.34, + "grad_norm": 1.2990426732745304, + "learning_rate": 7.6439495902557e-06, + "loss": 0.4447, + "step": 4367 + }, + { + "epoch": 0.34, + "grad_norm": 1.996841344058395, + "learning_rate": 7.64287006364379e-06, + "loss": 0.4962, + "step": 4368 + }, + { + "epoch": 0.34, + "grad_norm": 0.614699377740711, + "learning_rate": 7.641790366041644e-06, + "loss": 0.5171, + "step": 4369 + }, + { + "epoch": 0.34, + "grad_norm": 0.632958533183693, + "learning_rate": 7.640710497519117e-06, + "loss": 0.5081, + "step": 4370 + }, + { + "epoch": 0.34, + "grad_norm": 2.6109065886675396, + "learning_rate": 7.639630458146077e-06, + "loss": 0.4367, + "step": 4371 + }, + { + "epoch": 0.34, + "grad_norm": 1.319798317176399, + "learning_rate": 7.638550247992397e-06, + "loss": 0.4701, + "step": 4372 + }, + { + "epoch": 0.34, + "grad_norm": 1.2971717328149732, + "learning_rate": 7.637469867127968e-06, + "loss": 0.446, + "step": 4373 + }, + { + "epoch": 0.34, + "grad_norm": 1.5393790354485144, + "learning_rate": 7.63638931562269e-06, + "loss": 0.5499, + "step": 4374 + }, + { + "epoch": 0.34, + "grad_norm": 1.5021270914245253, + "learning_rate": 7.635308593546474e-06, + "loss": 0.4658, + "step": 4375 + }, + { + "epoch": 0.34, + "grad_norm": 1.8145823166428965, + "learning_rate": 7.634227700969235e-06, + "loss": 0.5014, + "step": 4376 + }, + { + "epoch": 0.34, + "grad_norm": 1.372617629241495, + "learning_rate": 7.633146637960912e-06, + "loss": 0.4764, + "step": 4377 + }, + { + "epoch": 0.34, + "grad_norm": 1.261945646447017, + "learning_rate": 7.632065404591445e-06, + "loss": 0.5023, + "step": 4378 + }, + { + "epoch": 0.34, + "grad_norm": 1.699046807640755, + "learning_rate": 7.630984000930792e-06, + "loss": 0.4593, + "step": 4379 + }, + { + "epoch": 0.34, + "grad_norm": 1.509644255899814, + "learning_rate": 7.629902427048914e-06, + "loss": 0.5292, + "step": 4380 + }, + { + "epoch": 0.34, + "grad_norm": 0.6937798388492128, + "learning_rate": 7.62882068301579e-06, + "loss": 0.5485, + "step": 4381 + }, + { + "epoch": 0.34, + "grad_norm": 1.8111486310872247, + "learning_rate": 7.627738768901406e-06, + "loss": 0.4728, + "step": 4382 + }, + { + "epoch": 0.34, + "grad_norm": 2.781695639199614, + "learning_rate": 7.626656684775762e-06, + "loss": 0.426, + "step": 4383 + }, + { + "epoch": 0.34, + "grad_norm": 2.205270962038038, + "learning_rate": 7.625574430708867e-06, + "loss": 0.4895, + "step": 4384 + }, + { + "epoch": 0.34, + "grad_norm": 1.8600944675609776, + "learning_rate": 7.624492006770739e-06, + "loss": 0.4449, + "step": 4385 + }, + { + "epoch": 0.34, + "grad_norm": 1.960252924934926, + "learning_rate": 7.623409413031413e-06, + "loss": 0.4903, + "step": 4386 + }, + { + "epoch": 0.34, + "grad_norm": 1.5070960344259208, + "learning_rate": 7.622326649560929e-06, + "loss": 0.4497, + "step": 4387 + }, + { + "epoch": 0.34, + "grad_norm": 1.2674293616770762, + "learning_rate": 7.62124371642934e-06, + "loss": 0.5188, + "step": 4388 + }, + { + "epoch": 0.34, + "grad_norm": 1.531908117905589, + "learning_rate": 7.620160613706715e-06, + "loss": 0.5434, + "step": 4389 + }, + { + "epoch": 0.34, + "grad_norm": 1.6009428398160452, + "learning_rate": 7.619077341463123e-06, + "loss": 0.4629, + "step": 4390 + }, + { + "epoch": 0.34, + "grad_norm": 2.1354423427655678, + "learning_rate": 7.617993899768657e-06, + "loss": 0.4963, + "step": 4391 + }, + { + "epoch": 0.34, + "grad_norm": 1.1998541801763876, + "learning_rate": 7.6169102886934065e-06, + "loss": 0.4641, + "step": 4392 + }, + { + "epoch": 0.35, + "grad_norm": 0.6246609202243575, + "learning_rate": 7.615826508307485e-06, + "loss": 0.5376, + "step": 4393 + }, + { + "epoch": 0.35, + "grad_norm": 1.4523258671770887, + "learning_rate": 7.61474255868101e-06, + "loss": 0.4746, + "step": 4394 + }, + { + "epoch": 0.35, + "grad_norm": 1.5509905049147423, + "learning_rate": 7.613658439884113e-06, + "loss": 0.4367, + "step": 4395 + }, + { + "epoch": 0.35, + "grad_norm": 1.6205160095055837, + "learning_rate": 7.612574151986934e-06, + "loss": 0.4435, + "step": 4396 + }, + { + "epoch": 0.35, + "grad_norm": 1.5473912199290134, + "learning_rate": 7.611489695059623e-06, + "loss": 0.4881, + "step": 4397 + }, + { + "epoch": 0.35, + "grad_norm": 1.588197470952627, + "learning_rate": 7.610405069172346e-06, + "loss": 0.5022, + "step": 4398 + }, + { + "epoch": 0.35, + "grad_norm": 2.4717862304020497, + "learning_rate": 7.609320274395276e-06, + "loss": 0.419, + "step": 4399 + }, + { + "epoch": 0.35, + "grad_norm": 0.5975036295577221, + "learning_rate": 7.608235310798599e-06, + "loss": 0.5455, + "step": 4400 + }, + { + "epoch": 0.35, + "grad_norm": 1.506532787527015, + "learning_rate": 7.607150178452507e-06, + "loss": 0.5351, + "step": 4401 + }, + { + "epoch": 0.35, + "grad_norm": 2.034853599576947, + "learning_rate": 7.606064877427211e-06, + "loss": 0.5134, + "step": 4402 + }, + { + "epoch": 0.35, + "grad_norm": 1.5398705020383172, + "learning_rate": 7.604979407792925e-06, + "loss": 0.4745, + "step": 4403 + }, + { + "epoch": 0.35, + "grad_norm": 1.7659273627551266, + "learning_rate": 7.6038937696198815e-06, + "loss": 0.4972, + "step": 4404 + }, + { + "epoch": 0.35, + "grad_norm": 1.6067845163178638, + "learning_rate": 7.602807962978316e-06, + "loss": 0.4928, + "step": 4405 + }, + { + "epoch": 0.35, + "grad_norm": 1.6939930679058588, + "learning_rate": 7.601721987938479e-06, + "loss": 0.5049, + "step": 4406 + }, + { + "epoch": 0.35, + "grad_norm": 1.592387113127101, + "learning_rate": 7.600635844570634e-06, + "loss": 0.4848, + "step": 4407 + }, + { + "epoch": 0.35, + "grad_norm": 1.5468742431161744, + "learning_rate": 7.599549532945052e-06, + "loss": 0.5312, + "step": 4408 + }, + { + "epoch": 0.35, + "grad_norm": 0.6067866597447613, + "learning_rate": 7.598463053132016e-06, + "loss": 0.5409, + "step": 4409 + }, + { + "epoch": 0.35, + "grad_norm": 1.5106847560416075, + "learning_rate": 7.597376405201819e-06, + "loss": 0.4799, + "step": 4410 + }, + { + "epoch": 0.35, + "grad_norm": 1.3059740672464717, + "learning_rate": 7.596289589224766e-06, + "loss": 0.3845, + "step": 4411 + }, + { + "epoch": 0.35, + "grad_norm": 1.6144569511324067, + "learning_rate": 7.595202605271175e-06, + "loss": 0.4878, + "step": 4412 + }, + { + "epoch": 0.35, + "grad_norm": 2.182215333323812, + "learning_rate": 7.594115453411368e-06, + "loss": 0.4476, + "step": 4413 + }, + { + "epoch": 0.35, + "grad_norm": 1.6143739893897535, + "learning_rate": 7.593028133715686e-06, + "loss": 0.5359, + "step": 4414 + }, + { + "epoch": 0.35, + "grad_norm": 1.707523466158357, + "learning_rate": 7.591940646254476e-06, + "loss": 0.4703, + "step": 4415 + }, + { + "epoch": 0.35, + "grad_norm": 0.6407080817071937, + "learning_rate": 7.5908529910980964e-06, + "loss": 0.509, + "step": 4416 + }, + { + "epoch": 0.35, + "grad_norm": 1.75168225955281, + "learning_rate": 7.589765168316918e-06, + "loss": 0.4827, + "step": 4417 + }, + { + "epoch": 0.35, + "grad_norm": 1.597709830879285, + "learning_rate": 7.58867717798132e-06, + "loss": 0.4636, + "step": 4418 + }, + { + "epoch": 0.35, + "grad_norm": 1.6259871893353737, + "learning_rate": 7.5875890201616964e-06, + "loss": 0.4817, + "step": 4419 + }, + { + "epoch": 0.35, + "grad_norm": 1.9950193784671602, + "learning_rate": 7.586500694928447e-06, + "loss": 0.4551, + "step": 4420 + }, + { + "epoch": 0.35, + "grad_norm": 1.516211618015143, + "learning_rate": 7.585412202351987e-06, + "loss": 0.479, + "step": 4421 + }, + { + "epoch": 0.35, + "grad_norm": 2.1026612318751723, + "learning_rate": 7.58432354250274e-06, + "loss": 0.5381, + "step": 4422 + }, + { + "epoch": 0.35, + "grad_norm": 1.9350742312526639, + "learning_rate": 7.5832347154511386e-06, + "loss": 0.5313, + "step": 4423 + }, + { + "epoch": 0.35, + "grad_norm": 1.4000866419093525, + "learning_rate": 7.582145721267633e-06, + "loss": 0.4565, + "step": 4424 + }, + { + "epoch": 0.35, + "grad_norm": 0.5655442207733838, + "learning_rate": 7.581056560022675e-06, + "loss": 0.5092, + "step": 4425 + }, + { + "epoch": 0.35, + "grad_norm": 1.515006977205771, + "learning_rate": 7.579967231786736e-06, + "loss": 0.4437, + "step": 4426 + }, + { + "epoch": 0.35, + "grad_norm": 1.6110213674022404, + "learning_rate": 7.57887773663029e-06, + "loss": 0.541, + "step": 4427 + }, + { + "epoch": 0.35, + "grad_norm": 1.599276691508581, + "learning_rate": 7.577788074623829e-06, + "loss": 0.4939, + "step": 4428 + }, + { + "epoch": 0.35, + "grad_norm": 1.4529793534262705, + "learning_rate": 7.576698245837852e-06, + "loss": 0.4514, + "step": 4429 + }, + { + "epoch": 0.35, + "grad_norm": 1.6750135965516877, + "learning_rate": 7.575608250342869e-06, + "loss": 0.4679, + "step": 4430 + }, + { + "epoch": 0.35, + "grad_norm": 1.3763606760101372, + "learning_rate": 7.574518088209401e-06, + "loss": 0.4602, + "step": 4431 + }, + { + "epoch": 0.35, + "grad_norm": 1.6092617787608126, + "learning_rate": 7.573427759507981e-06, + "loss": 0.5321, + "step": 4432 + }, + { + "epoch": 0.35, + "grad_norm": 1.3459612186619383, + "learning_rate": 7.572337264309152e-06, + "loss": 0.4786, + "step": 4433 + }, + { + "epoch": 0.35, + "grad_norm": 2.2291549546809786, + "learning_rate": 7.571246602683465e-06, + "loss": 0.4905, + "step": 4434 + }, + { + "epoch": 0.35, + "grad_norm": 1.5159017242696466, + "learning_rate": 7.570155774701489e-06, + "loss": 0.4809, + "step": 4435 + }, + { + "epoch": 0.35, + "grad_norm": 1.3485380385311292, + "learning_rate": 7.569064780433795e-06, + "loss": 0.4248, + "step": 4436 + }, + { + "epoch": 0.35, + "grad_norm": 2.5148950725374664, + "learning_rate": 7.567973619950971e-06, + "loss": 0.571, + "step": 4437 + }, + { + "epoch": 0.35, + "grad_norm": 3.845712594157829, + "learning_rate": 7.566882293323613e-06, + "loss": 0.4992, + "step": 4438 + }, + { + "epoch": 0.35, + "grad_norm": 0.5455301234944434, + "learning_rate": 7.565790800622329e-06, + "loss": 0.4865, + "step": 4439 + }, + { + "epoch": 0.35, + "grad_norm": 1.8154327258099632, + "learning_rate": 7.564699141917736e-06, + "loss": 0.5074, + "step": 4440 + }, + { + "epoch": 0.35, + "grad_norm": 1.7065437889458257, + "learning_rate": 7.5636073172804645e-06, + "loss": 0.5275, + "step": 4441 + }, + { + "epoch": 0.35, + "grad_norm": 0.5511083564884024, + "learning_rate": 7.562515326781152e-06, + "loss": 0.5138, + "step": 4442 + }, + { + "epoch": 0.35, + "grad_norm": 1.3618501667780516, + "learning_rate": 7.561423170490452e-06, + "loss": 0.4971, + "step": 4443 + }, + { + "epoch": 0.35, + "grad_norm": 0.5426749343066507, + "learning_rate": 7.56033084847902e-06, + "loss": 0.5052, + "step": 4444 + }, + { + "epoch": 0.35, + "grad_norm": 1.487224584720076, + "learning_rate": 7.559238360817535e-06, + "loss": 0.5055, + "step": 4445 + }, + { + "epoch": 0.35, + "grad_norm": 1.4165232719981535, + "learning_rate": 7.558145707576674e-06, + "loss": 0.4588, + "step": 4446 + }, + { + "epoch": 0.35, + "grad_norm": 1.5801604714832982, + "learning_rate": 7.557052888827133e-06, + "loss": 0.5193, + "step": 4447 + }, + { + "epoch": 0.35, + "grad_norm": 1.4379617927182353, + "learning_rate": 7.5559599046396145e-06, + "loss": 0.4823, + "step": 4448 + }, + { + "epoch": 0.35, + "grad_norm": 5.38571091919479, + "learning_rate": 7.5548667550848355e-06, + "loss": 0.5037, + "step": 4449 + }, + { + "epoch": 0.35, + "grad_norm": 0.6177756897110601, + "learning_rate": 7.5537734402335175e-06, + "loss": 0.5409, + "step": 4450 + }, + { + "epoch": 0.35, + "grad_norm": 0.6108201666330376, + "learning_rate": 7.552679960156399e-06, + "loss": 0.5335, + "step": 4451 + }, + { + "epoch": 0.35, + "grad_norm": 3.003773633491461, + "learning_rate": 7.551586314924227e-06, + "loss": 0.5074, + "step": 4452 + }, + { + "epoch": 0.35, + "grad_norm": 1.5208651064661973, + "learning_rate": 7.5504925046077596e-06, + "loss": 0.4446, + "step": 4453 + }, + { + "epoch": 0.35, + "grad_norm": 1.356252304142289, + "learning_rate": 7.549398529277762e-06, + "loss": 0.453, + "step": 4454 + }, + { + "epoch": 0.35, + "grad_norm": 1.6821756659298526, + "learning_rate": 7.548304389005014e-06, + "loss": 0.4774, + "step": 4455 + }, + { + "epoch": 0.35, + "grad_norm": 2.044114879586208, + "learning_rate": 7.547210083860306e-06, + "loss": 0.4785, + "step": 4456 + }, + { + "epoch": 0.35, + "grad_norm": 0.6554998870232182, + "learning_rate": 7.54611561391444e-06, + "loss": 0.5165, + "step": 4457 + }, + { + "epoch": 0.35, + "grad_norm": 1.5020775187145792, + "learning_rate": 7.545020979238223e-06, + "loss": 0.4381, + "step": 4458 + }, + { + "epoch": 0.35, + "grad_norm": 1.8668931551165677, + "learning_rate": 7.5439261799024764e-06, + "loss": 0.4518, + "step": 4459 + }, + { + "epoch": 0.35, + "grad_norm": 1.8737775041547375, + "learning_rate": 7.5428312159780345e-06, + "loss": 0.4469, + "step": 4460 + }, + { + "epoch": 0.35, + "grad_norm": 2.2292716486198088, + "learning_rate": 7.541736087535742e-06, + "loss": 0.4918, + "step": 4461 + }, + { + "epoch": 0.35, + "grad_norm": 1.8428146257859725, + "learning_rate": 7.540640794646447e-06, + "loss": 0.4656, + "step": 4462 + }, + { + "epoch": 0.35, + "grad_norm": 0.564407579149257, + "learning_rate": 7.539545337381016e-06, + "loss": 0.5286, + "step": 4463 + }, + { + "epoch": 0.35, + "grad_norm": 1.8797101320893963, + "learning_rate": 7.538449715810326e-06, + "loss": 0.4974, + "step": 4464 + }, + { + "epoch": 0.35, + "grad_norm": 1.272480384464611, + "learning_rate": 7.537353930005258e-06, + "loss": 0.5325, + "step": 4465 + }, + { + "epoch": 0.35, + "grad_norm": 1.354903940702524, + "learning_rate": 7.536257980036711e-06, + "loss": 0.5238, + "step": 4466 + }, + { + "epoch": 0.35, + "grad_norm": 0.5462948130690461, + "learning_rate": 7.53516186597559e-06, + "loss": 0.5064, + "step": 4467 + }, + { + "epoch": 0.35, + "grad_norm": 1.3658738820538754, + "learning_rate": 7.534065587892813e-06, + "loss": 0.5096, + "step": 4468 + }, + { + "epoch": 0.35, + "grad_norm": 2.4651775932863003, + "learning_rate": 7.5329691458593055e-06, + "loss": 0.4812, + "step": 4469 + }, + { + "epoch": 0.35, + "grad_norm": 0.6032323310735264, + "learning_rate": 7.5318725399460116e-06, + "loss": 0.5257, + "step": 4470 + }, + { + "epoch": 0.35, + "grad_norm": 1.4617539525571723, + "learning_rate": 7.530775770223874e-06, + "loss": 0.4867, + "step": 4471 + }, + { + "epoch": 0.35, + "grad_norm": 1.6629355393212113, + "learning_rate": 7.529678836763856e-06, + "loss": 0.4741, + "step": 4472 + }, + { + "epoch": 0.35, + "grad_norm": 1.822646317040909, + "learning_rate": 7.528581739636924e-06, + "loss": 0.5128, + "step": 4473 + }, + { + "epoch": 0.35, + "grad_norm": 1.476838310406188, + "learning_rate": 7.527484478914065e-06, + "loss": 0.4526, + "step": 4474 + }, + { + "epoch": 0.35, + "grad_norm": 1.3590454924793507, + "learning_rate": 7.5263870546662646e-06, + "loss": 0.4388, + "step": 4475 + }, + { + "epoch": 0.35, + "grad_norm": 1.5897290500388133, + "learning_rate": 7.525289466964527e-06, + "loss": 0.4926, + "step": 4476 + }, + { + "epoch": 0.35, + "grad_norm": 1.6208388993958498, + "learning_rate": 7.524191715879863e-06, + "loss": 0.5002, + "step": 4477 + }, + { + "epoch": 0.35, + "grad_norm": 1.5680087745681166, + "learning_rate": 7.523093801483299e-06, + "loss": 0.4851, + "step": 4478 + }, + { + "epoch": 0.35, + "grad_norm": 1.7373850773757729, + "learning_rate": 7.521995723845865e-06, + "loss": 0.5059, + "step": 4479 + }, + { + "epoch": 0.35, + "grad_norm": 1.8080957698258109, + "learning_rate": 7.5208974830386075e-06, + "loss": 0.505, + "step": 4480 + }, + { + "epoch": 0.35, + "grad_norm": 1.7411347941280448, + "learning_rate": 7.51979907913258e-06, + "loss": 0.5149, + "step": 4481 + }, + { + "epoch": 0.35, + "grad_norm": 1.8654424290737228, + "learning_rate": 7.518700512198851e-06, + "loss": 0.5057, + "step": 4482 + }, + { + "epoch": 0.35, + "grad_norm": 1.8711471494456113, + "learning_rate": 7.51760178230849e-06, + "loss": 0.4655, + "step": 4483 + }, + { + "epoch": 0.35, + "grad_norm": 1.7930850778047611, + "learning_rate": 7.516502889532591e-06, + "loss": 0.4404, + "step": 4484 + }, + { + "epoch": 0.35, + "grad_norm": 2.232414067915649, + "learning_rate": 7.5154038339422444e-06, + "loss": 0.5064, + "step": 4485 + }, + { + "epoch": 0.35, + "grad_norm": 1.5135781376359325, + "learning_rate": 7.514304615608561e-06, + "loss": 0.4993, + "step": 4486 + }, + { + "epoch": 0.35, + "grad_norm": 0.6053257003787963, + "learning_rate": 7.5132052346026585e-06, + "loss": 0.5224, + "step": 4487 + }, + { + "epoch": 0.35, + "grad_norm": 2.877287040598228, + "learning_rate": 7.512105690995663e-06, + "loss": 0.5164, + "step": 4488 + }, + { + "epoch": 0.35, + "grad_norm": 1.584063673646794, + "learning_rate": 7.511005984858718e-06, + "loss": 0.4925, + "step": 4489 + }, + { + "epoch": 0.35, + "grad_norm": 1.5118364063314114, + "learning_rate": 7.5099061162629695e-06, + "loss": 0.4399, + "step": 4490 + }, + { + "epoch": 0.35, + "grad_norm": 2.286386873523664, + "learning_rate": 7.508806085279578e-06, + "loss": 0.4702, + "step": 4491 + }, + { + "epoch": 0.35, + "grad_norm": 1.5322263644398175, + "learning_rate": 7.507705891979716e-06, + "loss": 0.4479, + "step": 4492 + }, + { + "epoch": 0.35, + "grad_norm": 0.5884567306105825, + "learning_rate": 7.5066055364345635e-06, + "loss": 0.5287, + "step": 4493 + }, + { + "epoch": 0.35, + "grad_norm": 2.136812797393852, + "learning_rate": 7.5055050187153114e-06, + "loss": 0.4443, + "step": 4494 + }, + { + "epoch": 0.35, + "grad_norm": 1.6531259456452758, + "learning_rate": 7.504404338893161e-06, + "loss": 0.4855, + "step": 4495 + }, + { + "epoch": 0.35, + "grad_norm": 0.5765068310108791, + "learning_rate": 7.503303497039328e-06, + "loss": 0.5363, + "step": 4496 + }, + { + "epoch": 0.35, + "grad_norm": 2.0215275519041325, + "learning_rate": 7.5022024932250325e-06, + "loss": 0.5126, + "step": 4497 + }, + { + "epoch": 0.35, + "grad_norm": 1.4193773370525802, + "learning_rate": 7.501101327521508e-06, + "loss": 0.4856, + "step": 4498 + }, + { + "epoch": 0.35, + "grad_norm": 1.8293186785982878, + "learning_rate": 7.500000000000001e-06, + "loss": 0.4059, + "step": 4499 + }, + { + "epoch": 0.35, + "grad_norm": 1.7869349709844538, + "learning_rate": 7.498898510731764e-06, + "loss": 0.4684, + "step": 4500 + }, + { + "epoch": 0.35, + "grad_norm": 1.7956066470240517, + "learning_rate": 7.497796859788061e-06, + "loss": 0.431, + "step": 4501 + }, + { + "epoch": 0.35, + "grad_norm": 1.3924377298140087, + "learning_rate": 7.496695047240171e-06, + "loss": 0.5482, + "step": 4502 + }, + { + "epoch": 0.35, + "grad_norm": 2.414103993748557, + "learning_rate": 7.495593073159375e-06, + "loss": 0.4566, + "step": 4503 + }, + { + "epoch": 0.35, + "grad_norm": 1.6808589040239943, + "learning_rate": 7.494490937616974e-06, + "loss": 0.5161, + "step": 4504 + }, + { + "epoch": 0.35, + "grad_norm": 0.5916759813193967, + "learning_rate": 7.493388640684272e-06, + "loss": 0.5371, + "step": 4505 + }, + { + "epoch": 0.35, + "grad_norm": 0.5847902985451419, + "learning_rate": 7.492286182432587e-06, + "loss": 0.5257, + "step": 4506 + }, + { + "epoch": 0.35, + "grad_norm": 1.3919912753264105, + "learning_rate": 7.4911835629332455e-06, + "loss": 0.4415, + "step": 4507 + }, + { + "epoch": 0.35, + "grad_norm": 1.6785266860233201, + "learning_rate": 7.490080782257586e-06, + "loss": 0.4747, + "step": 4508 + }, + { + "epoch": 0.35, + "grad_norm": 1.6931148408089607, + "learning_rate": 7.488977840476959e-06, + "loss": 0.4664, + "step": 4509 + }, + { + "epoch": 0.35, + "grad_norm": 2.0837994284370907, + "learning_rate": 7.48787473766272e-06, + "loss": 0.4285, + "step": 4510 + }, + { + "epoch": 0.35, + "grad_norm": 2.2407324632378507, + "learning_rate": 7.486771473886241e-06, + "loss": 0.4773, + "step": 4511 + }, + { + "epoch": 0.35, + "grad_norm": 1.7196108707576485, + "learning_rate": 7.4856680492189e-06, + "loss": 0.5144, + "step": 4512 + }, + { + "epoch": 0.35, + "grad_norm": 1.699971127645639, + "learning_rate": 7.484564463732088e-06, + "loss": 0.5035, + "step": 4513 + }, + { + "epoch": 0.35, + "grad_norm": 1.3734646529451033, + "learning_rate": 7.483460717497206e-06, + "loss": 0.3997, + "step": 4514 + }, + { + "epoch": 0.35, + "grad_norm": 1.388657959643578, + "learning_rate": 7.482356810585664e-06, + "loss": 0.464, + "step": 4515 + }, + { + "epoch": 0.35, + "grad_norm": 1.5637001452356531, + "learning_rate": 7.481252743068883e-06, + "loss": 0.484, + "step": 4516 + }, + { + "epoch": 0.35, + "grad_norm": 1.515987573506066, + "learning_rate": 7.480148515018297e-06, + "loss": 0.5088, + "step": 4517 + }, + { + "epoch": 0.35, + "grad_norm": 1.568823368420787, + "learning_rate": 7.479044126505346e-06, + "loss": 0.4498, + "step": 4518 + }, + { + "epoch": 0.35, + "grad_norm": 1.8388535713065177, + "learning_rate": 7.477939577601483e-06, + "loss": 0.4709, + "step": 4519 + }, + { + "epoch": 0.35, + "grad_norm": 2.4671587391302863, + "learning_rate": 7.47683486837817e-06, + "loss": 0.4858, + "step": 4520 + }, + { + "epoch": 0.36, + "grad_norm": 1.254887129395308, + "learning_rate": 7.475729998906882e-06, + "loss": 0.4248, + "step": 4521 + }, + { + "epoch": 0.36, + "grad_norm": 1.7439106225334517, + "learning_rate": 7.474624969259101e-06, + "loss": 0.504, + "step": 4522 + }, + { + "epoch": 0.36, + "grad_norm": 1.4869998372794844, + "learning_rate": 7.473519779506323e-06, + "loss": 0.4224, + "step": 4523 + }, + { + "epoch": 0.36, + "grad_norm": 2.0931860449575193, + "learning_rate": 7.4724144297200495e-06, + "loss": 0.5338, + "step": 4524 + }, + { + "epoch": 0.36, + "grad_norm": 1.8430415320796543, + "learning_rate": 7.471308919971798e-06, + "loss": 0.5348, + "step": 4525 + }, + { + "epoch": 0.36, + "grad_norm": 1.822049588452768, + "learning_rate": 7.470203250333091e-06, + "loss": 0.4921, + "step": 4526 + }, + { + "epoch": 0.36, + "grad_norm": 0.7237492026059309, + "learning_rate": 7.469097420875466e-06, + "loss": 0.5263, + "step": 4527 + }, + { + "epoch": 0.36, + "grad_norm": 1.6679548463020846, + "learning_rate": 7.4679914316704675e-06, + "loss": 0.4919, + "step": 4528 + }, + { + "epoch": 0.36, + "grad_norm": 2.000580324355168, + "learning_rate": 7.4668852827896535e-06, + "loss": 0.4792, + "step": 4529 + }, + { + "epoch": 0.36, + "grad_norm": 1.8803825699792125, + "learning_rate": 7.465778974304586e-06, + "loss": 0.4675, + "step": 4530 + }, + { + "epoch": 0.36, + "grad_norm": 2.0272436272128336, + "learning_rate": 7.464672506286847e-06, + "loss": 0.4612, + "step": 4531 + }, + { + "epoch": 0.36, + "grad_norm": 1.6209735963804022, + "learning_rate": 7.463565878808019e-06, + "loss": 0.4874, + "step": 4532 + }, + { + "epoch": 0.36, + "grad_norm": 2.1369553720059353, + "learning_rate": 7.462459091939702e-06, + "loss": 0.4648, + "step": 4533 + }, + { + "epoch": 0.36, + "grad_norm": 1.4995356013641483, + "learning_rate": 7.461352145753504e-06, + "loss": 0.5053, + "step": 4534 + }, + { + "epoch": 0.36, + "grad_norm": 1.529006868246565, + "learning_rate": 7.460245040321042e-06, + "loss": 0.4226, + "step": 4535 + }, + { + "epoch": 0.36, + "grad_norm": 1.3331856299336633, + "learning_rate": 7.459137775713943e-06, + "loss": 0.4331, + "step": 4536 + }, + { + "epoch": 0.36, + "grad_norm": 2.106012461019877, + "learning_rate": 7.458030352003848e-06, + "loss": 0.4921, + "step": 4537 + }, + { + "epoch": 0.36, + "grad_norm": 1.615461793405107, + "learning_rate": 7.456922769262404e-06, + "loss": 0.4952, + "step": 4538 + }, + { + "epoch": 0.36, + "grad_norm": 2.2671569042011437, + "learning_rate": 7.4558150275612705e-06, + "loss": 0.447, + "step": 4539 + }, + { + "epoch": 0.36, + "grad_norm": 3.7794679886603824, + "learning_rate": 7.454707126972118e-06, + "loss": 0.5211, + "step": 4540 + }, + { + "epoch": 0.36, + "grad_norm": 1.4264673620250976, + "learning_rate": 7.453599067566626e-06, + "loss": 0.4504, + "step": 4541 + }, + { + "epoch": 0.36, + "grad_norm": 2.3064355993596606, + "learning_rate": 7.452490849416484e-06, + "loss": 0.5299, + "step": 4542 + }, + { + "epoch": 0.36, + "grad_norm": 1.6467967382094293, + "learning_rate": 7.451382472593392e-06, + "loss": 0.4448, + "step": 4543 + }, + { + "epoch": 0.36, + "grad_norm": 1.4304162253251227, + "learning_rate": 7.4502739371690604e-06, + "loss": 0.4742, + "step": 4544 + }, + { + "epoch": 0.36, + "grad_norm": 1.3720667666551278, + "learning_rate": 7.449165243215211e-06, + "loss": 0.4603, + "step": 4545 + }, + { + "epoch": 0.36, + "grad_norm": 1.7148512221836614, + "learning_rate": 7.4480563908035745e-06, + "loss": 0.5077, + "step": 4546 + }, + { + "epoch": 0.36, + "grad_norm": 1.5693859683538844, + "learning_rate": 7.446947380005889e-06, + "loss": 0.4631, + "step": 4547 + }, + { + "epoch": 0.36, + "grad_norm": 1.2663175828028757, + "learning_rate": 7.445838210893912e-06, + "loss": 0.4664, + "step": 4548 + }, + { + "epoch": 0.36, + "grad_norm": 1.668593563555438, + "learning_rate": 7.444728883539402e-06, + "loss": 0.523, + "step": 4549 + }, + { + "epoch": 0.36, + "grad_norm": 1.5325568136069472, + "learning_rate": 7.443619398014132e-06, + "loss": 0.4953, + "step": 4550 + }, + { + "epoch": 0.36, + "grad_norm": 1.916443328343889, + "learning_rate": 7.4425097543898816e-06, + "loss": 0.5498, + "step": 4551 + }, + { + "epoch": 0.36, + "grad_norm": 1.7861666358727153, + "learning_rate": 7.441399952738449e-06, + "loss": 0.4702, + "step": 4552 + }, + { + "epoch": 0.36, + "grad_norm": 1.359622573070845, + "learning_rate": 7.440289993131628e-06, + "loss": 0.4928, + "step": 4553 + }, + { + "epoch": 0.36, + "grad_norm": 1.5954992438531257, + "learning_rate": 7.439179875641242e-06, + "loss": 0.4596, + "step": 4554 + }, + { + "epoch": 0.36, + "grad_norm": 1.3721828389198607, + "learning_rate": 7.438069600339105e-06, + "loss": 0.4964, + "step": 4555 + }, + { + "epoch": 0.36, + "grad_norm": 2.582390961912261, + "learning_rate": 7.436959167297056e-06, + "loss": 0.4587, + "step": 4556 + }, + { + "epoch": 0.36, + "grad_norm": 1.3587658636479258, + "learning_rate": 7.435848576586936e-06, + "loss": 0.4988, + "step": 4557 + }, + { + "epoch": 0.36, + "grad_norm": 1.9264581462937433, + "learning_rate": 7.4347378282806e-06, + "loss": 0.5069, + "step": 4558 + }, + { + "epoch": 0.36, + "grad_norm": 1.5512499315893842, + "learning_rate": 7.43362692244991e-06, + "loss": 0.5034, + "step": 4559 + }, + { + "epoch": 0.36, + "grad_norm": 1.3893578835382685, + "learning_rate": 7.432515859166745e-06, + "loss": 0.5343, + "step": 4560 + }, + { + "epoch": 0.36, + "grad_norm": 2.4991956341952286, + "learning_rate": 7.431404638502984e-06, + "loss": 0.4385, + "step": 4561 + }, + { + "epoch": 0.36, + "grad_norm": 1.4268899337946406, + "learning_rate": 7.430293260530523e-06, + "loss": 0.4869, + "step": 4562 + }, + { + "epoch": 0.36, + "grad_norm": 0.6801477901501437, + "learning_rate": 7.429181725321268e-06, + "loss": 0.5152, + "step": 4563 + }, + { + "epoch": 0.36, + "grad_norm": 1.4539248608675672, + "learning_rate": 7.428070032947134e-06, + "loss": 0.4718, + "step": 4564 + }, + { + "epoch": 0.36, + "grad_norm": 1.421164254144659, + "learning_rate": 7.426958183480044e-06, + "loss": 0.4951, + "step": 4565 + }, + { + "epoch": 0.36, + "grad_norm": 1.7075140960498736, + "learning_rate": 7.425846176991936e-06, + "loss": 0.477, + "step": 4566 + }, + { + "epoch": 0.36, + "grad_norm": 1.5629106205121988, + "learning_rate": 7.424734013554752e-06, + "loss": 0.4876, + "step": 4567 + }, + { + "epoch": 0.36, + "grad_norm": 0.6644935581440224, + "learning_rate": 7.423621693240449e-06, + "loss": 0.5274, + "step": 4568 + }, + { + "epoch": 0.36, + "grad_norm": 0.5890211013645678, + "learning_rate": 7.4225092161209945e-06, + "loss": 0.541, + "step": 4569 + }, + { + "epoch": 0.36, + "grad_norm": 1.5431425665039833, + "learning_rate": 7.421396582268362e-06, + "loss": 0.4757, + "step": 4570 + }, + { + "epoch": 0.36, + "grad_norm": 1.5584282424103824, + "learning_rate": 7.420283791754538e-06, + "loss": 0.4672, + "step": 4571 + }, + { + "epoch": 0.36, + "grad_norm": 2.872359343935138, + "learning_rate": 7.41917084465152e-06, + "loss": 0.49, + "step": 4572 + }, + { + "epoch": 0.36, + "grad_norm": 1.7835210803524322, + "learning_rate": 7.4180577410313115e-06, + "loss": 0.486, + "step": 4573 + }, + { + "epoch": 0.36, + "grad_norm": 1.436099716267143, + "learning_rate": 7.416944480965931e-06, + "loss": 0.4529, + "step": 4574 + }, + { + "epoch": 0.36, + "grad_norm": 1.7369250304650892, + "learning_rate": 7.4158310645274045e-06, + "loss": 0.4541, + "step": 4575 + }, + { + "epoch": 0.36, + "grad_norm": 2.0138837328984263, + "learning_rate": 7.414717491787767e-06, + "loss": 0.4955, + "step": 4576 + }, + { + "epoch": 0.36, + "grad_norm": 3.2880692185521965, + "learning_rate": 7.413603762819069e-06, + "loss": 0.4532, + "step": 4577 + }, + { + "epoch": 0.36, + "grad_norm": 1.9742190975613736, + "learning_rate": 7.412489877693363e-06, + "loss": 0.4886, + "step": 4578 + }, + { + "epoch": 0.36, + "grad_norm": 1.4206213644313295, + "learning_rate": 7.411375836482719e-06, + "loss": 0.4629, + "step": 4579 + }, + { + "epoch": 0.36, + "grad_norm": 1.2749168915307603, + "learning_rate": 7.410261639259213e-06, + "loss": 0.4742, + "step": 4580 + }, + { + "epoch": 0.36, + "grad_norm": 1.4701068249232978, + "learning_rate": 7.409147286094931e-06, + "loss": 0.4686, + "step": 4581 + }, + { + "epoch": 0.36, + "grad_norm": 0.6252098036360981, + "learning_rate": 7.4080327770619705e-06, + "loss": 0.5067, + "step": 4582 + }, + { + "epoch": 0.36, + "grad_norm": 1.7554462430777722, + "learning_rate": 7.406918112232442e-06, + "loss": 0.5034, + "step": 4583 + }, + { + "epoch": 0.36, + "grad_norm": 1.6617602181855406, + "learning_rate": 7.405803291678459e-06, + "loss": 0.5188, + "step": 4584 + }, + { + "epoch": 0.36, + "grad_norm": 1.4370230940503752, + "learning_rate": 7.40468831547215e-06, + "loss": 0.4698, + "step": 4585 + }, + { + "epoch": 0.36, + "grad_norm": 1.5064712328118048, + "learning_rate": 7.403573183685653e-06, + "loss": 0.5004, + "step": 4586 + }, + { + "epoch": 0.36, + "grad_norm": 1.5370013358266668, + "learning_rate": 7.402457896391115e-06, + "loss": 0.5108, + "step": 4587 + }, + { + "epoch": 0.36, + "grad_norm": 0.5637713562169963, + "learning_rate": 7.401342453660694e-06, + "loss": 0.5173, + "step": 4588 + }, + { + "epoch": 0.36, + "grad_norm": 1.8776941740260504, + "learning_rate": 7.400226855566557e-06, + "loss": 0.4815, + "step": 4589 + }, + { + "epoch": 0.36, + "grad_norm": 1.9062465987060218, + "learning_rate": 7.399111102180883e-06, + "loss": 0.5013, + "step": 4590 + }, + { + "epoch": 0.36, + "grad_norm": 0.6104502836362812, + "learning_rate": 7.3979951935758596e-06, + "loss": 0.5097, + "step": 4591 + }, + { + "epoch": 0.36, + "grad_norm": 1.346453409697688, + "learning_rate": 7.396879129823682e-06, + "loss": 0.4811, + "step": 4592 + }, + { + "epoch": 0.36, + "grad_norm": 2.3477548632362835, + "learning_rate": 7.395762910996562e-06, + "loss": 0.4601, + "step": 4593 + }, + { + "epoch": 0.36, + "grad_norm": 1.8769133626027674, + "learning_rate": 7.394646537166716e-06, + "loss": 0.4928, + "step": 4594 + }, + { + "epoch": 0.36, + "grad_norm": 3.1914585684698458, + "learning_rate": 7.393530008406371e-06, + "loss": 0.5043, + "step": 4595 + }, + { + "epoch": 0.36, + "grad_norm": 1.740525002299161, + "learning_rate": 7.392413324787766e-06, + "loss": 0.4432, + "step": 4596 + }, + { + "epoch": 0.36, + "grad_norm": 1.9644205910060666, + "learning_rate": 7.391296486383149e-06, + "loss": 0.5369, + "step": 4597 + }, + { + "epoch": 0.36, + "grad_norm": 0.6060052506183139, + "learning_rate": 7.390179493264775e-06, + "loss": 0.5191, + "step": 4598 + }, + { + "epoch": 0.36, + "grad_norm": 1.8419163434096446, + "learning_rate": 7.389062345504918e-06, + "loss": 0.4727, + "step": 4599 + }, + { + "epoch": 0.36, + "grad_norm": 2.3730140966512354, + "learning_rate": 7.38794504317585e-06, + "loss": 0.5147, + "step": 4600 + }, + { + "epoch": 0.36, + "grad_norm": 0.5917576195321994, + "learning_rate": 7.386827586349863e-06, + "loss": 0.5509, + "step": 4601 + }, + { + "epoch": 0.36, + "grad_norm": 1.8463978005293353, + "learning_rate": 7.385709975099253e-06, + "loss": 0.4696, + "step": 4602 + }, + { + "epoch": 0.36, + "grad_norm": 1.6666029828454636, + "learning_rate": 7.384592209496327e-06, + "loss": 0.4787, + "step": 4603 + }, + { + "epoch": 0.36, + "grad_norm": 1.7721465857892111, + "learning_rate": 7.383474289613406e-06, + "loss": 0.5318, + "step": 4604 + }, + { + "epoch": 0.36, + "grad_norm": 1.5962959036224518, + "learning_rate": 7.382356215522817e-06, + "loss": 0.4936, + "step": 4605 + }, + { + "epoch": 0.36, + "grad_norm": 2.8137160120811777, + "learning_rate": 7.3812379872968965e-06, + "loss": 0.5294, + "step": 4606 + }, + { + "epoch": 0.36, + "grad_norm": 1.617643832532217, + "learning_rate": 7.380119605007994e-06, + "loss": 0.4883, + "step": 4607 + }, + { + "epoch": 0.36, + "grad_norm": 1.9790178111365713, + "learning_rate": 7.379001068728466e-06, + "loss": 0.5106, + "step": 4608 + }, + { + "epoch": 0.36, + "grad_norm": 4.312277878219389, + "learning_rate": 7.377882378530682e-06, + "loss": 0.5312, + "step": 4609 + }, + { + "epoch": 0.36, + "grad_norm": 2.193144227060164, + "learning_rate": 7.376763534487017e-06, + "loss": 0.4676, + "step": 4610 + }, + { + "epoch": 0.36, + "grad_norm": 0.5936775917283627, + "learning_rate": 7.3756445366698615e-06, + "loss": 0.5072, + "step": 4611 + }, + { + "epoch": 0.36, + "grad_norm": 1.4528012406837663, + "learning_rate": 7.374525385151612e-06, + "loss": 0.5289, + "step": 4612 + }, + { + "epoch": 0.36, + "grad_norm": 2.0759791944613757, + "learning_rate": 7.373406080004677e-06, + "loss": 0.5165, + "step": 4613 + }, + { + "epoch": 0.36, + "grad_norm": 1.5036839731083245, + "learning_rate": 7.372286621301472e-06, + "loss": 0.4996, + "step": 4614 + }, + { + "epoch": 0.36, + "grad_norm": 1.6647095264121488, + "learning_rate": 7.371167009114427e-06, + "loss": 0.4789, + "step": 4615 + }, + { + "epoch": 0.36, + "grad_norm": 1.642137836976585, + "learning_rate": 7.370047243515979e-06, + "loss": 0.4886, + "step": 4616 + }, + { + "epoch": 0.36, + "grad_norm": 0.6057604724488086, + "learning_rate": 7.368927324578573e-06, + "loss": 0.5262, + "step": 4617 + }, + { + "epoch": 0.36, + "grad_norm": 1.5737105266684055, + "learning_rate": 7.367807252374669e-06, + "loss": 0.4702, + "step": 4618 + }, + { + "epoch": 0.36, + "grad_norm": 1.6620256473967938, + "learning_rate": 7.3666870269767335e-06, + "loss": 0.5102, + "step": 4619 + }, + { + "epoch": 0.36, + "grad_norm": 1.3210808457947225, + "learning_rate": 7.365566648457243e-06, + "loss": 0.4883, + "step": 4620 + }, + { + "epoch": 0.36, + "grad_norm": 2.932596130903649, + "learning_rate": 7.364446116888685e-06, + "loss": 0.5035, + "step": 4621 + }, + { + "epoch": 0.36, + "grad_norm": 1.546071334438265, + "learning_rate": 7.363325432343558e-06, + "loss": 0.4405, + "step": 4622 + }, + { + "epoch": 0.36, + "grad_norm": 1.7904804890626385, + "learning_rate": 7.362204594894364e-06, + "loss": 0.4818, + "step": 4623 + }, + { + "epoch": 0.36, + "grad_norm": 0.5672802210464944, + "learning_rate": 7.3610836046136245e-06, + "loss": 0.5096, + "step": 4624 + }, + { + "epoch": 0.36, + "grad_norm": 1.6788645750950209, + "learning_rate": 7.359962461573864e-06, + "loss": 0.467, + "step": 4625 + }, + { + "epoch": 0.36, + "grad_norm": 0.5815507626312333, + "learning_rate": 7.35884116584762e-06, + "loss": 0.5186, + "step": 4626 + }, + { + "epoch": 0.36, + "grad_norm": 1.5391839136161658, + "learning_rate": 7.357719717507438e-06, + "loss": 0.5112, + "step": 4627 + }, + { + "epoch": 0.36, + "grad_norm": 1.6568881606067514, + "learning_rate": 7.356598116625875e-06, + "loss": 0.5229, + "step": 4628 + }, + { + "epoch": 0.36, + "grad_norm": 1.7157083781686024, + "learning_rate": 7.355476363275496e-06, + "loss": 0.4403, + "step": 4629 + }, + { + "epoch": 0.36, + "grad_norm": 1.7045322490959582, + "learning_rate": 7.354354457528879e-06, + "loss": 0.4883, + "step": 4630 + }, + { + "epoch": 0.36, + "grad_norm": 1.535848079552201, + "learning_rate": 7.353232399458606e-06, + "loss": 0.4487, + "step": 4631 + }, + { + "epoch": 0.36, + "grad_norm": 1.5187199415491002, + "learning_rate": 7.3521101891372785e-06, + "loss": 0.4217, + "step": 4632 + }, + { + "epoch": 0.36, + "grad_norm": 0.5473522073622727, + "learning_rate": 7.350987826637496e-06, + "loss": 0.5067, + "step": 4633 + }, + { + "epoch": 0.36, + "grad_norm": 1.8035987685377435, + "learning_rate": 7.349865312031877e-06, + "loss": 0.4822, + "step": 4634 + }, + { + "epoch": 0.36, + "grad_norm": 1.8254647215719237, + "learning_rate": 7.348742645393048e-06, + "loss": 0.5218, + "step": 4635 + }, + { + "epoch": 0.36, + "grad_norm": 1.6523730887136596, + "learning_rate": 7.347619826793641e-06, + "loss": 0.4448, + "step": 4636 + }, + { + "epoch": 0.36, + "grad_norm": 1.5744395819001273, + "learning_rate": 7.346496856306301e-06, + "loss": 0.46, + "step": 4637 + }, + { + "epoch": 0.36, + "grad_norm": 1.4661406327294033, + "learning_rate": 7.345373734003686e-06, + "loss": 0.4935, + "step": 4638 + }, + { + "epoch": 0.36, + "grad_norm": 1.7203413754040668, + "learning_rate": 7.344250459958458e-06, + "loss": 0.5183, + "step": 4639 + }, + { + "epoch": 0.36, + "grad_norm": 1.740530995780852, + "learning_rate": 7.343127034243291e-06, + "loss": 0.4482, + "step": 4640 + }, + { + "epoch": 0.36, + "grad_norm": 2.314592721387347, + "learning_rate": 7.3420034569308714e-06, + "loss": 0.4626, + "step": 4641 + }, + { + "epoch": 0.36, + "grad_norm": 0.5976847634236406, + "learning_rate": 7.340879728093892e-06, + "loss": 0.5143, + "step": 4642 + }, + { + "epoch": 0.36, + "grad_norm": 2.592952838505204, + "learning_rate": 7.339755847805056e-06, + "loss": 0.5231, + "step": 4643 + }, + { + "epoch": 0.36, + "grad_norm": 1.5068808662320112, + "learning_rate": 7.338631816137078e-06, + "loss": 0.5138, + "step": 4644 + }, + { + "epoch": 0.36, + "grad_norm": 2.094282205836922, + "learning_rate": 7.337507633162679e-06, + "loss": 0.4593, + "step": 4645 + }, + { + "epoch": 0.36, + "grad_norm": 1.367526506725231, + "learning_rate": 7.336383298954595e-06, + "loss": 0.4576, + "step": 4646 + }, + { + "epoch": 0.36, + "grad_norm": 2.1691608634472472, + "learning_rate": 7.3352588135855686e-06, + "loss": 0.4926, + "step": 4647 + }, + { + "epoch": 0.37, + "grad_norm": 1.2112541387765567, + "learning_rate": 7.334134177128351e-06, + "loss": 0.4329, + "step": 4648 + }, + { + "epoch": 0.37, + "grad_norm": 1.7094359754251969, + "learning_rate": 7.333009389655705e-06, + "loss": 0.4716, + "step": 4649 + }, + { + "epoch": 0.37, + "grad_norm": 1.747094608404503, + "learning_rate": 7.331884451240405e-06, + "loss": 0.3926, + "step": 4650 + }, + { + "epoch": 0.37, + "grad_norm": 1.693301252004503, + "learning_rate": 7.330759361955231e-06, + "loss": 0.5551, + "step": 4651 + }, + { + "epoch": 0.37, + "grad_norm": 1.2724635049790383, + "learning_rate": 7.329634121872975e-06, + "loss": 0.4346, + "step": 4652 + }, + { + "epoch": 0.37, + "grad_norm": 0.6357116178168966, + "learning_rate": 7.328508731066439e-06, + "loss": 0.5364, + "step": 4653 + }, + { + "epoch": 0.37, + "grad_norm": 3.46269166390621, + "learning_rate": 7.327383189608433e-06, + "loss": 0.4901, + "step": 4654 + }, + { + "epoch": 0.37, + "grad_norm": 4.588153398959974, + "learning_rate": 7.326257497571782e-06, + "loss": 0.5033, + "step": 4655 + }, + { + "epoch": 0.37, + "grad_norm": 1.6697534209725695, + "learning_rate": 7.325131655029311e-06, + "loss": 0.4991, + "step": 4656 + }, + { + "epoch": 0.37, + "grad_norm": 1.789051368719857, + "learning_rate": 7.3240056620538655e-06, + "loss": 0.5012, + "step": 4657 + }, + { + "epoch": 0.37, + "grad_norm": 0.5811753435834477, + "learning_rate": 7.3228795187182935e-06, + "loss": 0.5037, + "step": 4658 + }, + { + "epoch": 0.37, + "grad_norm": 0.6111014758078617, + "learning_rate": 7.321753225095456e-06, + "loss": 0.5283, + "step": 4659 + }, + { + "epoch": 0.37, + "grad_norm": 1.5626291853319376, + "learning_rate": 7.32062678125822e-06, + "loss": 0.5122, + "step": 4660 + }, + { + "epoch": 0.37, + "grad_norm": 1.8932933475980906, + "learning_rate": 7.319500187279469e-06, + "loss": 0.4487, + "step": 4661 + }, + { + "epoch": 0.37, + "grad_norm": 0.6127923986626242, + "learning_rate": 7.318373443232088e-06, + "loss": 0.5239, + "step": 4662 + }, + { + "epoch": 0.37, + "grad_norm": 0.5705045767094448, + "learning_rate": 7.317246549188981e-06, + "loss": 0.5302, + "step": 4663 + }, + { + "epoch": 0.37, + "grad_norm": 0.5510543508427849, + "learning_rate": 7.316119505223052e-06, + "loss": 0.4808, + "step": 4664 + }, + { + "epoch": 0.37, + "grad_norm": 1.4231294492913333, + "learning_rate": 7.31499231140722e-06, + "loss": 0.5279, + "step": 4665 + }, + { + "epoch": 0.37, + "grad_norm": 1.7582306321020094, + "learning_rate": 7.3138649678144155e-06, + "loss": 0.5255, + "step": 4666 + }, + { + "epoch": 0.37, + "grad_norm": 3.242684585752455, + "learning_rate": 7.312737474517575e-06, + "loss": 0.476, + "step": 4667 + }, + { + "epoch": 0.37, + "grad_norm": 1.6885928532551966, + "learning_rate": 7.3116098315896436e-06, + "loss": 0.4639, + "step": 4668 + }, + { + "epoch": 0.37, + "grad_norm": 1.2456853204804592, + "learning_rate": 7.3104820391035814e-06, + "loss": 0.4465, + "step": 4669 + }, + { + "epoch": 0.37, + "grad_norm": 2.1462988778320344, + "learning_rate": 7.309354097132352e-06, + "loss": 0.5398, + "step": 4670 + }, + { + "epoch": 0.37, + "grad_norm": 1.8372925624109482, + "learning_rate": 7.308226005748934e-06, + "loss": 0.4976, + "step": 4671 + }, + { + "epoch": 0.37, + "grad_norm": 1.4448518533989736, + "learning_rate": 7.307097765026313e-06, + "loss": 0.5301, + "step": 4672 + }, + { + "epoch": 0.37, + "grad_norm": 2.233076634453553, + "learning_rate": 7.305969375037486e-06, + "loss": 0.4808, + "step": 4673 + }, + { + "epoch": 0.37, + "grad_norm": 1.3905309010745217, + "learning_rate": 7.304840835855456e-06, + "loss": 0.4619, + "step": 4674 + }, + { + "epoch": 0.37, + "grad_norm": 1.5008392564287607, + "learning_rate": 7.303712147553239e-06, + "loss": 0.475, + "step": 4675 + }, + { + "epoch": 0.37, + "grad_norm": 0.580663619157923, + "learning_rate": 7.302583310203859e-06, + "loss": 0.4783, + "step": 4676 + }, + { + "epoch": 0.37, + "grad_norm": 0.60986132013244, + "learning_rate": 7.301454323880353e-06, + "loss": 0.5251, + "step": 4677 + }, + { + "epoch": 0.37, + "grad_norm": 0.5766232508955631, + "learning_rate": 7.300325188655762e-06, + "loss": 0.5311, + "step": 4678 + }, + { + "epoch": 0.37, + "grad_norm": 1.63248922056906, + "learning_rate": 7.29919590460314e-06, + "loss": 0.4626, + "step": 4679 + }, + { + "epoch": 0.37, + "grad_norm": 2.365804347659826, + "learning_rate": 7.298066471795551e-06, + "loss": 0.5242, + "step": 4680 + }, + { + "epoch": 0.37, + "grad_norm": 3.1927090017639683, + "learning_rate": 7.296936890306067e-06, + "loss": 0.5009, + "step": 4681 + }, + { + "epoch": 0.37, + "grad_norm": 1.37182540015722, + "learning_rate": 7.295807160207771e-06, + "loss": 0.4748, + "step": 4682 + }, + { + "epoch": 0.37, + "grad_norm": 1.914842620531982, + "learning_rate": 7.294677281573756e-06, + "loss": 0.4925, + "step": 4683 + }, + { + "epoch": 0.37, + "grad_norm": 1.7286300865010455, + "learning_rate": 7.293547254477122e-06, + "loss": 0.5042, + "step": 4684 + }, + { + "epoch": 0.37, + "grad_norm": 1.4144138954930967, + "learning_rate": 7.292417078990982e-06, + "loss": 0.4617, + "step": 4685 + }, + { + "epoch": 0.37, + "grad_norm": 0.7640260040014577, + "learning_rate": 7.291286755188453e-06, + "loss": 0.5248, + "step": 4686 + }, + { + "epoch": 0.37, + "grad_norm": 0.705506245912563, + "learning_rate": 7.290156283142671e-06, + "loss": 0.5018, + "step": 4687 + }, + { + "epoch": 0.37, + "grad_norm": 1.9900153301927477, + "learning_rate": 7.2890256629267745e-06, + "loss": 0.4636, + "step": 4688 + }, + { + "epoch": 0.37, + "grad_norm": 0.6204244068765212, + "learning_rate": 7.28789489461391e-06, + "loss": 0.4994, + "step": 4689 + }, + { + "epoch": 0.37, + "grad_norm": 1.6324342268063203, + "learning_rate": 7.286763978277238e-06, + "loss": 0.5194, + "step": 4690 + }, + { + "epoch": 0.37, + "grad_norm": 0.6536256435896809, + "learning_rate": 7.2856329139899296e-06, + "loss": 0.511, + "step": 4691 + }, + { + "epoch": 0.37, + "grad_norm": 1.9015996167138063, + "learning_rate": 7.284501701825162e-06, + "loss": 0.497, + "step": 4692 + }, + { + "epoch": 0.37, + "grad_norm": 1.4947597248959605, + "learning_rate": 7.283370341856119e-06, + "loss": 0.4768, + "step": 4693 + }, + { + "epoch": 0.37, + "grad_norm": 1.4865452842734845, + "learning_rate": 7.282238834156006e-06, + "loss": 0.4808, + "step": 4694 + }, + { + "epoch": 0.37, + "grad_norm": 1.5054378633856997, + "learning_rate": 7.2811071787980245e-06, + "loss": 0.4769, + "step": 4695 + }, + { + "epoch": 0.37, + "grad_norm": 1.6115051490176622, + "learning_rate": 7.279975375855394e-06, + "loss": 0.4704, + "step": 4696 + }, + { + "epoch": 0.37, + "grad_norm": 5.8695037901160205, + "learning_rate": 7.278843425401338e-06, + "loss": 0.4946, + "step": 4697 + }, + { + "epoch": 0.37, + "grad_norm": 1.9375358684464783, + "learning_rate": 7.277711327509094e-06, + "loss": 0.462, + "step": 4698 + }, + { + "epoch": 0.37, + "grad_norm": 1.8676380094342842, + "learning_rate": 7.276579082251906e-06, + "loss": 0.4957, + "step": 4699 + }, + { + "epoch": 0.37, + "grad_norm": 1.7942361489796048, + "learning_rate": 7.27544668970303e-06, + "loss": 0.458, + "step": 4700 + }, + { + "epoch": 0.37, + "grad_norm": 0.7067408963624273, + "learning_rate": 7.2743141499357295e-06, + "loss": 0.515, + "step": 4701 + }, + { + "epoch": 0.37, + "grad_norm": 1.3738137346572823, + "learning_rate": 7.2731814630232785e-06, + "loss": 0.4714, + "step": 4702 + }, + { + "epoch": 0.37, + "grad_norm": 1.7624810386918337, + "learning_rate": 7.272048629038961e-06, + "loss": 0.4439, + "step": 4703 + }, + { + "epoch": 0.37, + "grad_norm": 1.7887503097455553, + "learning_rate": 7.27091564805607e-06, + "loss": 0.4486, + "step": 4704 + }, + { + "epoch": 0.37, + "grad_norm": 1.7563305980337875, + "learning_rate": 7.269782520147906e-06, + "loss": 0.4682, + "step": 4705 + }, + { + "epoch": 0.37, + "grad_norm": 1.9578830878581797, + "learning_rate": 7.2686492453877835e-06, + "loss": 0.4891, + "step": 4706 + }, + { + "epoch": 0.37, + "grad_norm": 1.4306246809932381, + "learning_rate": 7.267515823849021e-06, + "loss": 0.4798, + "step": 4707 + }, + { + "epoch": 0.37, + "grad_norm": 1.3130142770195665, + "learning_rate": 7.266382255604953e-06, + "loss": 0.469, + "step": 4708 + }, + { + "epoch": 0.37, + "grad_norm": 2.0330209398303527, + "learning_rate": 7.265248540728915e-06, + "loss": 0.4711, + "step": 4709 + }, + { + "epoch": 0.37, + "grad_norm": 1.4413723403377363, + "learning_rate": 7.264114679294263e-06, + "loss": 0.468, + "step": 4710 + }, + { + "epoch": 0.37, + "grad_norm": 0.633230928496973, + "learning_rate": 7.262980671374351e-06, + "loss": 0.5359, + "step": 4711 + }, + { + "epoch": 0.37, + "grad_norm": 1.3920293959348922, + "learning_rate": 7.261846517042551e-06, + "loss": 0.4707, + "step": 4712 + }, + { + "epoch": 0.37, + "grad_norm": 1.814928577636905, + "learning_rate": 7.26071221637224e-06, + "loss": 0.5136, + "step": 4713 + }, + { + "epoch": 0.37, + "grad_norm": 1.4917108149138367, + "learning_rate": 7.259577769436806e-06, + "loss": 0.5695, + "step": 4714 + }, + { + "epoch": 0.37, + "grad_norm": 1.773866921124995, + "learning_rate": 7.258443176309645e-06, + "loss": 0.5175, + "step": 4715 + }, + { + "epoch": 0.37, + "grad_norm": 0.6080369892505022, + "learning_rate": 7.257308437064165e-06, + "loss": 0.5192, + "step": 4716 + }, + { + "epoch": 0.37, + "grad_norm": 1.9840263076349693, + "learning_rate": 7.256173551773783e-06, + "loss": 0.4563, + "step": 4717 + }, + { + "epoch": 0.37, + "grad_norm": 1.496153075933112, + "learning_rate": 7.255038520511925e-06, + "loss": 0.5158, + "step": 4718 + }, + { + "epoch": 0.37, + "grad_norm": 0.5832219781339507, + "learning_rate": 7.253903343352022e-06, + "loss": 0.4994, + "step": 4719 + }, + { + "epoch": 0.37, + "grad_norm": 3.672780103138904, + "learning_rate": 7.252768020367523e-06, + "loss": 0.4885, + "step": 4720 + }, + { + "epoch": 0.37, + "grad_norm": 1.6815980147872833, + "learning_rate": 7.25163255163188e-06, + "loss": 0.503, + "step": 4721 + }, + { + "epoch": 0.37, + "grad_norm": 1.8585282963177416, + "learning_rate": 7.2504969372185545e-06, + "loss": 0.4472, + "step": 4722 + }, + { + "epoch": 0.37, + "grad_norm": 1.5180971806900907, + "learning_rate": 7.2493611772010235e-06, + "loss": 0.4868, + "step": 4723 + }, + { + "epoch": 0.37, + "grad_norm": 2.3785406880426674, + "learning_rate": 7.2482252716527645e-06, + "loss": 0.4713, + "step": 4724 + }, + { + "epoch": 0.37, + "grad_norm": 1.5818437667628589, + "learning_rate": 7.2470892206472745e-06, + "loss": 0.4527, + "step": 4725 + }, + { + "epoch": 0.37, + "grad_norm": 1.7565609903149626, + "learning_rate": 7.245953024258049e-06, + "loss": 0.4973, + "step": 4726 + }, + { + "epoch": 0.37, + "grad_norm": 1.9254773802857275, + "learning_rate": 7.244816682558602e-06, + "loss": 0.5098, + "step": 4727 + }, + { + "epoch": 0.37, + "grad_norm": 0.5663062738471344, + "learning_rate": 7.2436801956224515e-06, + "loss": 0.5047, + "step": 4728 + }, + { + "epoch": 0.37, + "grad_norm": 1.8217040530149746, + "learning_rate": 7.242543563523128e-06, + "loss": 0.469, + "step": 4729 + }, + { + "epoch": 0.37, + "grad_norm": 1.933370273578642, + "learning_rate": 7.241406786334169e-06, + "loss": 0.5443, + "step": 4730 + }, + { + "epoch": 0.37, + "grad_norm": 1.7260818648573564, + "learning_rate": 7.240269864129125e-06, + "loss": 0.4546, + "step": 4731 + }, + { + "epoch": 0.37, + "grad_norm": 1.4425053260266039, + "learning_rate": 7.239132796981549e-06, + "loss": 0.4787, + "step": 4732 + }, + { + "epoch": 0.37, + "grad_norm": 1.2335369793194162, + "learning_rate": 7.237995584965012e-06, + "loss": 0.4843, + "step": 4733 + }, + { + "epoch": 0.37, + "grad_norm": 1.518058571393306, + "learning_rate": 7.2368582281530874e-06, + "loss": 0.4405, + "step": 4734 + }, + { + "epoch": 0.37, + "grad_norm": 1.5735924443111182, + "learning_rate": 7.2357207266193615e-06, + "loss": 0.5221, + "step": 4735 + }, + { + "epoch": 0.37, + "grad_norm": 1.5775445278768783, + "learning_rate": 7.23458308043743e-06, + "loss": 0.4745, + "step": 4736 + }, + { + "epoch": 0.37, + "grad_norm": 1.4959229900486775, + "learning_rate": 7.233445289680896e-06, + "loss": 0.4756, + "step": 4737 + }, + { + "epoch": 0.37, + "grad_norm": 1.445940569074372, + "learning_rate": 7.232307354423374e-06, + "loss": 0.4823, + "step": 4738 + }, + { + "epoch": 0.37, + "grad_norm": 1.7181549843338482, + "learning_rate": 7.2311692747384844e-06, + "loss": 0.4789, + "step": 4739 + }, + { + "epoch": 0.37, + "grad_norm": 1.5485213632807011, + "learning_rate": 7.230031050699862e-06, + "loss": 0.4541, + "step": 4740 + }, + { + "epoch": 0.37, + "grad_norm": 1.4601689880243836, + "learning_rate": 7.22889268238115e-06, + "loss": 0.467, + "step": 4741 + }, + { + "epoch": 0.37, + "grad_norm": 2.6154505582136087, + "learning_rate": 7.227754169855995e-06, + "loss": 0.5012, + "step": 4742 + }, + { + "epoch": 0.37, + "grad_norm": 2.0642110931352153, + "learning_rate": 7.226615513198061e-06, + "loss": 0.4907, + "step": 4743 + }, + { + "epoch": 0.37, + "grad_norm": 1.6947524051799303, + "learning_rate": 7.225476712481015e-06, + "loss": 0.4827, + "step": 4744 + }, + { + "epoch": 0.37, + "grad_norm": 1.6002192885577924, + "learning_rate": 7.2243377677785375e-06, + "loss": 0.4566, + "step": 4745 + }, + { + "epoch": 0.37, + "grad_norm": 2.068949852886102, + "learning_rate": 7.2231986791643165e-06, + "loss": 0.5054, + "step": 4746 + }, + { + "epoch": 0.37, + "grad_norm": 1.96567998440374, + "learning_rate": 7.22205944671205e-06, + "loss": 0.5018, + "step": 4747 + }, + { + "epoch": 0.37, + "grad_norm": 2.2318786998201268, + "learning_rate": 7.220920070495442e-06, + "loss": 0.4569, + "step": 4748 + }, + { + "epoch": 0.37, + "grad_norm": 3.0397195478713117, + "learning_rate": 7.2197805505882126e-06, + "loss": 0.4636, + "step": 4749 + }, + { + "epoch": 0.37, + "grad_norm": 1.6227063649748168, + "learning_rate": 7.2186408870640835e-06, + "loss": 0.5282, + "step": 4750 + }, + { + "epoch": 0.37, + "grad_norm": 0.6174938576125719, + "learning_rate": 7.217501079996793e-06, + "loss": 0.5272, + "step": 4751 + }, + { + "epoch": 0.37, + "grad_norm": 2.1921648849125006, + "learning_rate": 7.216361129460082e-06, + "loss": 0.4553, + "step": 4752 + }, + { + "epoch": 0.37, + "grad_norm": 2.247493481621767, + "learning_rate": 7.215221035527707e-06, + "loss": 0.5273, + "step": 4753 + }, + { + "epoch": 0.37, + "grad_norm": 2.4300858176876883, + "learning_rate": 7.2140807982734274e-06, + "loss": 0.5056, + "step": 4754 + }, + { + "epoch": 0.37, + "grad_norm": 1.58459640088003, + "learning_rate": 7.212940417771018e-06, + "loss": 0.5025, + "step": 4755 + }, + { + "epoch": 0.37, + "grad_norm": 0.5952044885058977, + "learning_rate": 7.211799894094258e-06, + "loss": 0.5272, + "step": 4756 + }, + { + "epoch": 0.37, + "grad_norm": 1.7611399398225924, + "learning_rate": 7.2106592273169376e-06, + "loss": 0.4701, + "step": 4757 + }, + { + "epoch": 0.37, + "grad_norm": 1.4425377151734717, + "learning_rate": 7.209518417512858e-06, + "loss": 0.5023, + "step": 4758 + }, + { + "epoch": 0.37, + "grad_norm": 2.0209301283751606, + "learning_rate": 7.208377464755826e-06, + "loss": 0.4232, + "step": 4759 + }, + { + "epoch": 0.37, + "grad_norm": 1.7073352395793535, + "learning_rate": 7.207236369119662e-06, + "loss": 0.5717, + "step": 4760 + }, + { + "epoch": 0.37, + "grad_norm": 0.5350499596440036, + "learning_rate": 7.206095130678192e-06, + "loss": 0.5106, + "step": 4761 + }, + { + "epoch": 0.37, + "grad_norm": 9.542541609458176, + "learning_rate": 7.204953749505252e-06, + "loss": 0.4505, + "step": 4762 + }, + { + "epoch": 0.37, + "grad_norm": 2.0885494173191135, + "learning_rate": 7.2038122256746915e-06, + "loss": 0.458, + "step": 4763 + }, + { + "epoch": 0.37, + "grad_norm": 1.893775557408428, + "learning_rate": 7.202670559260359e-06, + "loss": 0.4761, + "step": 4764 + }, + { + "epoch": 0.37, + "grad_norm": 1.6490055126750536, + "learning_rate": 7.2015287503361254e-06, + "loss": 0.4904, + "step": 4765 + }, + { + "epoch": 0.37, + "grad_norm": 1.837218531711044, + "learning_rate": 7.200386798975863e-06, + "loss": 0.4651, + "step": 4766 + }, + { + "epoch": 0.37, + "grad_norm": 1.6892997761287047, + "learning_rate": 7.1992447052534495e-06, + "loss": 0.5075, + "step": 4767 + }, + { + "epoch": 0.37, + "grad_norm": 1.4618739814609234, + "learning_rate": 7.1981024692427835e-06, + "loss": 0.481, + "step": 4768 + }, + { + "epoch": 0.37, + "grad_norm": 1.6181535962951814, + "learning_rate": 7.1969600910177615e-06, + "loss": 0.4799, + "step": 4769 + }, + { + "epoch": 0.37, + "grad_norm": 1.382438497684814, + "learning_rate": 7.195817570652297e-06, + "loss": 0.4832, + "step": 4770 + }, + { + "epoch": 0.37, + "grad_norm": 0.5775223241490135, + "learning_rate": 7.194674908220307e-06, + "loss": 0.5244, + "step": 4771 + }, + { + "epoch": 0.37, + "grad_norm": 1.5293407628719298, + "learning_rate": 7.193532103795723e-06, + "loss": 0.4874, + "step": 4772 + }, + { + "epoch": 0.37, + "grad_norm": 2.0347785855289833, + "learning_rate": 7.1923891574524785e-06, + "loss": 0.4983, + "step": 4773 + }, + { + "epoch": 0.37, + "grad_norm": 4.871040264777708, + "learning_rate": 7.191246069264526e-06, + "loss": 0.4616, + "step": 4774 + }, + { + "epoch": 0.38, + "grad_norm": 1.7134532490523338, + "learning_rate": 7.1901028393058185e-06, + "loss": 0.4306, + "step": 4775 + }, + { + "epoch": 0.38, + "grad_norm": 1.8216939814528377, + "learning_rate": 7.188959467650323e-06, + "loss": 0.5158, + "step": 4776 + }, + { + "epoch": 0.38, + "grad_norm": 1.934450551878168, + "learning_rate": 7.187815954372012e-06, + "loss": 0.5187, + "step": 4777 + }, + { + "epoch": 0.38, + "grad_norm": 3.3253752943445383, + "learning_rate": 7.186672299544872e-06, + "loss": 0.5121, + "step": 4778 + }, + { + "epoch": 0.38, + "grad_norm": 1.541476990480005, + "learning_rate": 7.185528503242894e-06, + "loss": 0.5399, + "step": 4779 + }, + { + "epoch": 0.38, + "grad_norm": 1.4478442249182373, + "learning_rate": 7.184384565540083e-06, + "loss": 0.4331, + "step": 4780 + }, + { + "epoch": 0.38, + "grad_norm": 1.4471939318118097, + "learning_rate": 7.1832404865104456e-06, + "loss": 0.5462, + "step": 4781 + }, + { + "epoch": 0.38, + "grad_norm": 1.9997887146301945, + "learning_rate": 7.182096266228006e-06, + "loss": 0.5082, + "step": 4782 + }, + { + "epoch": 0.38, + "grad_norm": 1.6419728978849903, + "learning_rate": 7.1809519047667905e-06, + "loss": 0.4882, + "step": 4783 + }, + { + "epoch": 0.38, + "grad_norm": 1.8007896785235906, + "learning_rate": 7.179807402200842e-06, + "loss": 0.5136, + "step": 4784 + }, + { + "epoch": 0.38, + "grad_norm": 1.8438502208463854, + "learning_rate": 7.178662758604205e-06, + "loss": 0.4861, + "step": 4785 + }, + { + "epoch": 0.38, + "grad_norm": 0.5627457088174193, + "learning_rate": 7.1775179740509385e-06, + "loss": 0.5245, + "step": 4786 + }, + { + "epoch": 0.38, + "grad_norm": 1.5474543971572006, + "learning_rate": 7.176373048615106e-06, + "loss": 0.4682, + "step": 4787 + }, + { + "epoch": 0.38, + "grad_norm": 18.04727414902749, + "learning_rate": 7.175227982370787e-06, + "loss": 0.4773, + "step": 4788 + }, + { + "epoch": 0.38, + "grad_norm": 1.8257081865284526, + "learning_rate": 7.174082775392061e-06, + "loss": 0.4659, + "step": 4789 + }, + { + "epoch": 0.38, + "grad_norm": 0.5411190225862744, + "learning_rate": 7.172937427753025e-06, + "loss": 0.5073, + "step": 4790 + }, + { + "epoch": 0.38, + "grad_norm": 0.5990952625617741, + "learning_rate": 7.171791939527779e-06, + "loss": 0.4928, + "step": 4791 + }, + { + "epoch": 0.38, + "grad_norm": 1.2835301696124328, + "learning_rate": 7.170646310790434e-06, + "loss": 0.5053, + "step": 4792 + }, + { + "epoch": 0.38, + "grad_norm": 1.680081901257542, + "learning_rate": 7.169500541615115e-06, + "loss": 0.477, + "step": 4793 + }, + { + "epoch": 0.38, + "grad_norm": 1.6251597784704082, + "learning_rate": 7.168354632075948e-06, + "loss": 0.4608, + "step": 4794 + }, + { + "epoch": 0.38, + "grad_norm": 1.5562489837784452, + "learning_rate": 7.167208582247072e-06, + "loss": 0.4876, + "step": 4795 + }, + { + "epoch": 0.38, + "grad_norm": 1.9502386103121507, + "learning_rate": 7.166062392202637e-06, + "loss": 0.5092, + "step": 4796 + }, + { + "epoch": 0.38, + "grad_norm": 2.010187287820319, + "learning_rate": 7.164916062016798e-06, + "loss": 0.4915, + "step": 4797 + }, + { + "epoch": 0.38, + "grad_norm": 1.6046809935951256, + "learning_rate": 7.163769591763723e-06, + "loss": 0.4462, + "step": 4798 + }, + { + "epoch": 0.38, + "grad_norm": 1.3832799618322587, + "learning_rate": 7.1626229815175855e-06, + "loss": 0.3883, + "step": 4799 + }, + { + "epoch": 0.38, + "grad_norm": 2.333501992921133, + "learning_rate": 7.161476231352569e-06, + "loss": 0.4495, + "step": 4800 + }, + { + "epoch": 0.38, + "grad_norm": 1.6268845436130952, + "learning_rate": 7.16032934134287e-06, + "loss": 0.4943, + "step": 4801 + }, + { + "epoch": 0.38, + "grad_norm": 1.650064641777069, + "learning_rate": 7.159182311562686e-06, + "loss": 0.4542, + "step": 4802 + }, + { + "epoch": 0.38, + "grad_norm": 1.6846314137780052, + "learning_rate": 7.158035142086234e-06, + "loss": 0.4681, + "step": 4803 + }, + { + "epoch": 0.38, + "grad_norm": 2.9005179214586914, + "learning_rate": 7.156887832987729e-06, + "loss": 0.5696, + "step": 4804 + }, + { + "epoch": 0.38, + "grad_norm": 1.4852843659572135, + "learning_rate": 7.155740384341404e-06, + "loss": 0.4521, + "step": 4805 + }, + { + "epoch": 0.38, + "grad_norm": 0.5832144808504138, + "learning_rate": 7.154592796221495e-06, + "loss": 0.4929, + "step": 4806 + }, + { + "epoch": 0.38, + "grad_norm": 0.5633446584448708, + "learning_rate": 7.153445068702252e-06, + "loss": 0.5014, + "step": 4807 + }, + { + "epoch": 0.38, + "grad_norm": 1.563757745098572, + "learning_rate": 7.15229720185793e-06, + "loss": 0.4516, + "step": 4808 + }, + { + "epoch": 0.38, + "grad_norm": 1.621305369537592, + "learning_rate": 7.1511491957627945e-06, + "loss": 0.497, + "step": 4809 + }, + { + "epoch": 0.38, + "grad_norm": 1.526975849112527, + "learning_rate": 7.1500010504911185e-06, + "loss": 0.4055, + "step": 4810 + }, + { + "epoch": 0.38, + "grad_norm": 3.24747164533128, + "learning_rate": 7.148852766117189e-06, + "loss": 0.4703, + "step": 4811 + }, + { + "epoch": 0.38, + "grad_norm": 1.4960270685763388, + "learning_rate": 7.147704342715296e-06, + "loss": 0.4623, + "step": 4812 + }, + { + "epoch": 0.38, + "grad_norm": 1.5603263484043122, + "learning_rate": 7.146555780359742e-06, + "loss": 0.5128, + "step": 4813 + }, + { + "epoch": 0.38, + "grad_norm": 1.6334007579191232, + "learning_rate": 7.145407079124835e-06, + "loss": 0.5088, + "step": 4814 + }, + { + "epoch": 0.38, + "grad_norm": 1.7452290072680166, + "learning_rate": 7.144258239084899e-06, + "loss": 0.495, + "step": 4815 + }, + { + "epoch": 0.38, + "grad_norm": 1.4789901181434115, + "learning_rate": 7.143109260314259e-06, + "loss": 0.4481, + "step": 4816 + }, + { + "epoch": 0.38, + "grad_norm": 1.940294440961406, + "learning_rate": 7.141960142887254e-06, + "loss": 0.4561, + "step": 4817 + }, + { + "epoch": 0.38, + "grad_norm": 2.5760601326236863, + "learning_rate": 7.140810886878228e-06, + "loss": 0.4819, + "step": 4818 + }, + { + "epoch": 0.38, + "grad_norm": 1.8740071460069843, + "learning_rate": 7.13966149236154e-06, + "loss": 0.4999, + "step": 4819 + }, + { + "epoch": 0.38, + "grad_norm": 1.2271038742195985, + "learning_rate": 7.1385119594115515e-06, + "loss": 0.4205, + "step": 4820 + }, + { + "epoch": 0.38, + "grad_norm": 1.8866342324473262, + "learning_rate": 7.137362288102638e-06, + "loss": 0.4407, + "step": 4821 + }, + { + "epoch": 0.38, + "grad_norm": 1.8296823921926835, + "learning_rate": 7.136212478509179e-06, + "loss": 0.5038, + "step": 4822 + }, + { + "epoch": 0.38, + "grad_norm": 2.7668322422326637, + "learning_rate": 7.135062530705569e-06, + "loss": 0.4907, + "step": 4823 + }, + { + "epoch": 0.38, + "grad_norm": 1.913982469947087, + "learning_rate": 7.133912444766204e-06, + "loss": 0.4589, + "step": 4824 + }, + { + "epoch": 0.38, + "grad_norm": 2.3597901269047026, + "learning_rate": 7.132762220765497e-06, + "loss": 0.5171, + "step": 4825 + }, + { + "epoch": 0.38, + "grad_norm": 1.3795548796539945, + "learning_rate": 7.131611858777863e-06, + "loss": 0.4956, + "step": 4826 + }, + { + "epoch": 0.38, + "grad_norm": 1.3193161200408823, + "learning_rate": 7.1304613588777315e-06, + "loss": 0.4495, + "step": 4827 + }, + { + "epoch": 0.38, + "grad_norm": 0.7026252335249872, + "learning_rate": 7.129310721139536e-06, + "loss": 0.5246, + "step": 4828 + }, + { + "epoch": 0.38, + "grad_norm": 1.6901845863727984, + "learning_rate": 7.128159945637722e-06, + "loss": 0.4193, + "step": 4829 + }, + { + "epoch": 0.38, + "grad_norm": 0.5827177853608418, + "learning_rate": 7.127009032446744e-06, + "loss": 0.5303, + "step": 4830 + }, + { + "epoch": 0.38, + "grad_norm": 2.2866480548892696, + "learning_rate": 7.125857981641066e-06, + "loss": 0.4727, + "step": 4831 + }, + { + "epoch": 0.38, + "grad_norm": 1.7011752144543928, + "learning_rate": 7.1247067932951555e-06, + "loss": 0.4251, + "step": 4832 + }, + { + "epoch": 0.38, + "grad_norm": 0.6404625529047874, + "learning_rate": 7.123555467483496e-06, + "loss": 0.524, + "step": 4833 + }, + { + "epoch": 0.38, + "grad_norm": 1.4976964844189433, + "learning_rate": 7.122404004280574e-06, + "loss": 0.4778, + "step": 4834 + }, + { + "epoch": 0.38, + "grad_norm": 0.5972232670139491, + "learning_rate": 7.121252403760891e-06, + "loss": 0.4988, + "step": 4835 + }, + { + "epoch": 0.38, + "grad_norm": 1.2328023132162154, + "learning_rate": 7.1201006659989525e-06, + "loss": 0.4545, + "step": 4836 + }, + { + "epoch": 0.38, + "grad_norm": 0.6016607389926304, + "learning_rate": 7.118948791069273e-06, + "loss": 0.5055, + "step": 4837 + }, + { + "epoch": 0.38, + "grad_norm": 3.826720454821463, + "learning_rate": 7.117796779046379e-06, + "loss": 0.4753, + "step": 4838 + }, + { + "epoch": 0.38, + "grad_norm": 1.6041532012294353, + "learning_rate": 7.116644630004805e-06, + "loss": 0.4405, + "step": 4839 + }, + { + "epoch": 0.38, + "grad_norm": 0.5498237787234056, + "learning_rate": 7.115492344019091e-06, + "loss": 0.5176, + "step": 4840 + }, + { + "epoch": 0.38, + "grad_norm": 1.7361073300515835, + "learning_rate": 7.11433992116379e-06, + "loss": 0.5187, + "step": 4841 + }, + { + "epoch": 0.38, + "grad_norm": 2.188239930130391, + "learning_rate": 7.113187361513463e-06, + "loss": 0.4875, + "step": 4842 + }, + { + "epoch": 0.38, + "grad_norm": 1.506591375325928, + "learning_rate": 7.112034665142677e-06, + "loss": 0.44, + "step": 4843 + }, + { + "epoch": 0.38, + "grad_norm": 1.753477473173662, + "learning_rate": 7.110881832126012e-06, + "loss": 0.4612, + "step": 4844 + }, + { + "epoch": 0.38, + "grad_norm": 1.3331318780599462, + "learning_rate": 7.109728862538054e-06, + "loss": 0.4751, + "step": 4845 + }, + { + "epoch": 0.38, + "grad_norm": 0.6212944176183048, + "learning_rate": 7.108575756453398e-06, + "loss": 0.5153, + "step": 4846 + }, + { + "epoch": 0.38, + "grad_norm": 2.967168623577125, + "learning_rate": 7.107422513946648e-06, + "loss": 0.4983, + "step": 4847 + }, + { + "epoch": 0.38, + "grad_norm": 2.505243696895769, + "learning_rate": 7.106269135092419e-06, + "loss": 0.4957, + "step": 4848 + }, + { + "epoch": 0.38, + "grad_norm": 1.8629822485856886, + "learning_rate": 7.105115619965333e-06, + "loss": 0.4579, + "step": 4849 + }, + { + "epoch": 0.38, + "grad_norm": 1.5040150399249184, + "learning_rate": 7.10396196864002e-06, + "loss": 0.4947, + "step": 4850 + }, + { + "epoch": 0.38, + "grad_norm": 1.8269030564050288, + "learning_rate": 7.1028081811911185e-06, + "loss": 0.4727, + "step": 4851 + }, + { + "epoch": 0.38, + "grad_norm": 2.573173475927753, + "learning_rate": 7.101654257693279e-06, + "loss": 0.5252, + "step": 4852 + }, + { + "epoch": 0.38, + "grad_norm": 1.912857846292556, + "learning_rate": 7.100500198221159e-06, + "loss": 0.4342, + "step": 4853 + }, + { + "epoch": 0.38, + "grad_norm": 1.8920831598127514, + "learning_rate": 7.099346002849425e-06, + "loss": 0.4903, + "step": 4854 + }, + { + "epoch": 0.38, + "grad_norm": 1.4233376215011992, + "learning_rate": 7.098191671652747e-06, + "loss": 0.5329, + "step": 4855 + }, + { + "epoch": 0.38, + "grad_norm": 1.6720778666982499, + "learning_rate": 7.097037204705816e-06, + "loss": 0.4724, + "step": 4856 + }, + { + "epoch": 0.38, + "grad_norm": 1.6913831463011904, + "learning_rate": 7.095882602083321e-06, + "loss": 0.445, + "step": 4857 + }, + { + "epoch": 0.38, + "grad_norm": 1.419008351806511, + "learning_rate": 7.094727863859964e-06, + "loss": 0.4933, + "step": 4858 + }, + { + "epoch": 0.38, + "grad_norm": 3.041095044715757, + "learning_rate": 7.093572990110452e-06, + "loss": 0.4359, + "step": 4859 + }, + { + "epoch": 0.38, + "grad_norm": 1.7466902650670626, + "learning_rate": 7.092417980909508e-06, + "loss": 0.4817, + "step": 4860 + }, + { + "epoch": 0.38, + "grad_norm": 1.4961182304604086, + "learning_rate": 7.091262836331858e-06, + "loss": 0.4722, + "step": 4861 + }, + { + "epoch": 0.38, + "grad_norm": 1.7217282512976573, + "learning_rate": 7.0901075564522385e-06, + "loss": 0.5035, + "step": 4862 + }, + { + "epoch": 0.38, + "grad_norm": 1.6091739334279902, + "learning_rate": 7.088952141345391e-06, + "loss": 0.4389, + "step": 4863 + }, + { + "epoch": 0.38, + "grad_norm": 1.2911554600974566, + "learning_rate": 7.087796591086076e-06, + "loss": 0.512, + "step": 4864 + }, + { + "epoch": 0.38, + "grad_norm": 7.989701203155342, + "learning_rate": 7.086640905749051e-06, + "loss": 0.4599, + "step": 4865 + }, + { + "epoch": 0.38, + "grad_norm": 1.3138613702983708, + "learning_rate": 7.085485085409091e-06, + "loss": 0.4569, + "step": 4866 + }, + { + "epoch": 0.38, + "grad_norm": 0.627699335834174, + "learning_rate": 7.084329130140972e-06, + "loss": 0.5243, + "step": 4867 + }, + { + "epoch": 0.38, + "grad_norm": 1.558303100660825, + "learning_rate": 7.083173040019487e-06, + "loss": 0.4228, + "step": 4868 + }, + { + "epoch": 0.38, + "grad_norm": 2.1195226362837665, + "learning_rate": 7.08201681511943e-06, + "loss": 0.4176, + "step": 4869 + }, + { + "epoch": 0.38, + "grad_norm": 1.4413023556461042, + "learning_rate": 7.080860455515609e-06, + "loss": 0.5008, + "step": 4870 + }, + { + "epoch": 0.38, + "grad_norm": 1.4560143595922155, + "learning_rate": 7.079703961282839e-06, + "loss": 0.4527, + "step": 4871 + }, + { + "epoch": 0.38, + "grad_norm": 1.79554529709397, + "learning_rate": 7.078547332495942e-06, + "loss": 0.4664, + "step": 4872 + }, + { + "epoch": 0.38, + "grad_norm": 2.0234189673597034, + "learning_rate": 7.077390569229754e-06, + "loss": 0.5033, + "step": 4873 + }, + { + "epoch": 0.38, + "grad_norm": 1.7639561015200624, + "learning_rate": 7.076233671559112e-06, + "loss": 0.4258, + "step": 4874 + }, + { + "epoch": 0.38, + "grad_norm": 5.1180125315126075, + "learning_rate": 7.075076639558868e-06, + "loss": 0.4625, + "step": 4875 + }, + { + "epoch": 0.38, + "grad_norm": 1.7016107930021975, + "learning_rate": 7.073919473303878e-06, + "loss": 0.4941, + "step": 4876 + }, + { + "epoch": 0.38, + "grad_norm": 0.5795643939678147, + "learning_rate": 7.072762172869014e-06, + "loss": 0.5151, + "step": 4877 + }, + { + "epoch": 0.38, + "grad_norm": 2.079003508604935, + "learning_rate": 7.071604738329148e-06, + "loss": 0.4929, + "step": 4878 + }, + { + "epoch": 0.38, + "grad_norm": 1.65421551332718, + "learning_rate": 7.0704471697591656e-06, + "loss": 0.4375, + "step": 4879 + }, + { + "epoch": 0.38, + "grad_norm": 1.6642350307206315, + "learning_rate": 7.069289467233959e-06, + "loss": 0.51, + "step": 4880 + }, + { + "epoch": 0.38, + "grad_norm": 1.4385713241808482, + "learning_rate": 7.068131630828432e-06, + "loss": 0.5144, + "step": 4881 + }, + { + "epoch": 0.38, + "grad_norm": 1.6364476992729464, + "learning_rate": 7.066973660617493e-06, + "loss": 0.4954, + "step": 4882 + }, + { + "epoch": 0.38, + "grad_norm": 1.647391339356725, + "learning_rate": 7.065815556676063e-06, + "loss": 0.4559, + "step": 4883 + }, + { + "epoch": 0.38, + "grad_norm": 0.5953822832925623, + "learning_rate": 7.064657319079068e-06, + "loss": 0.5289, + "step": 4884 + }, + { + "epoch": 0.38, + "grad_norm": 1.821152770737596, + "learning_rate": 7.063498947901446e-06, + "loss": 0.4748, + "step": 4885 + }, + { + "epoch": 0.38, + "grad_norm": 1.4681082132647514, + "learning_rate": 7.062340443218141e-06, + "loss": 0.4291, + "step": 4886 + }, + { + "epoch": 0.38, + "grad_norm": 1.7692051834346585, + "learning_rate": 7.061181805104107e-06, + "loss": 0.4304, + "step": 4887 + }, + { + "epoch": 0.38, + "grad_norm": 1.8359531067981039, + "learning_rate": 7.060023033634307e-06, + "loss": 0.4391, + "step": 4888 + }, + { + "epoch": 0.38, + "grad_norm": 1.8550530975554878, + "learning_rate": 7.058864128883711e-06, + "loss": 0.5392, + "step": 4889 + }, + { + "epoch": 0.38, + "grad_norm": 1.6123365793599493, + "learning_rate": 7.0577050909273e-06, + "loss": 0.5095, + "step": 4890 + }, + { + "epoch": 0.38, + "grad_norm": 0.593725759136565, + "learning_rate": 7.056545919840062e-06, + "loss": 0.5162, + "step": 4891 + }, + { + "epoch": 0.38, + "grad_norm": 2.2872937709115053, + "learning_rate": 7.055386615696992e-06, + "loss": 0.4805, + "step": 4892 + }, + { + "epoch": 0.38, + "grad_norm": 1.5970948584744433, + "learning_rate": 7.054227178573098e-06, + "loss": 0.4979, + "step": 4893 + }, + { + "epoch": 0.38, + "grad_norm": 2.074254067895052, + "learning_rate": 7.053067608543392e-06, + "loss": 0.458, + "step": 4894 + }, + { + "epoch": 0.38, + "grad_norm": 1.910352809477886, + "learning_rate": 7.051907905682898e-06, + "loss": 0.4705, + "step": 4895 + }, + { + "epoch": 0.38, + "grad_norm": 2.0055665614568183, + "learning_rate": 7.050748070066646e-06, + "loss": 0.5051, + "step": 4896 + }, + { + "epoch": 0.38, + "grad_norm": 2.393202642993326, + "learning_rate": 7.049588101769675e-06, + "loss": 0.5023, + "step": 4897 + }, + { + "epoch": 0.38, + "grad_norm": 4.685604545653513, + "learning_rate": 7.0484280008670365e-06, + "loss": 0.5316, + "step": 4898 + }, + { + "epoch": 0.38, + "grad_norm": 3.2080350043713257, + "learning_rate": 7.0472677674337875e-06, + "loss": 0.5068, + "step": 4899 + }, + { + "epoch": 0.38, + "grad_norm": 0.6182600467843604, + "learning_rate": 7.0461074015449906e-06, + "loss": 0.4887, + "step": 4900 + }, + { + "epoch": 0.38, + "grad_norm": 1.6754688834906362, + "learning_rate": 7.0449469032757224e-06, + "loss": 0.4408, + "step": 4901 + }, + { + "epoch": 0.38, + "grad_norm": 2.173361702003934, + "learning_rate": 7.043786272701063e-06, + "loss": 0.4739, + "step": 4902 + }, + { + "epoch": 0.39, + "grad_norm": 0.5653293967148161, + "learning_rate": 7.042625509896107e-06, + "loss": 0.5038, + "step": 4903 + }, + { + "epoch": 0.39, + "grad_norm": 1.812792310309564, + "learning_rate": 7.041464614935952e-06, + "loss": 0.4793, + "step": 4904 + }, + { + "epoch": 0.39, + "grad_norm": 1.769552483773579, + "learning_rate": 7.0403035878957074e-06, + "loss": 0.4916, + "step": 4905 + }, + { + "epoch": 0.39, + "grad_norm": 1.5032349139410226, + "learning_rate": 7.039142428850489e-06, + "loss": 0.4444, + "step": 4906 + }, + { + "epoch": 0.39, + "grad_norm": 1.5519574338753228, + "learning_rate": 7.037981137875423e-06, + "loss": 0.468, + "step": 4907 + }, + { + "epoch": 0.39, + "grad_norm": 1.5376187289901255, + "learning_rate": 7.036819715045644e-06, + "loss": 0.518, + "step": 4908 + }, + { + "epoch": 0.39, + "grad_norm": 1.6300988502219944, + "learning_rate": 7.035658160436294e-06, + "loss": 0.4912, + "step": 4909 + }, + { + "epoch": 0.39, + "grad_norm": 1.842975930740792, + "learning_rate": 7.034496474122523e-06, + "loss": 0.4704, + "step": 4910 + }, + { + "epoch": 0.39, + "grad_norm": 0.5617282710368641, + "learning_rate": 7.033334656179491e-06, + "loss": 0.5221, + "step": 4911 + }, + { + "epoch": 0.39, + "grad_norm": 2.1337672393430926, + "learning_rate": 7.03217270668237e-06, + "loss": 0.4271, + "step": 4912 + }, + { + "epoch": 0.39, + "grad_norm": 0.5988202337441175, + "learning_rate": 7.031010625706331e-06, + "loss": 0.526, + "step": 4913 + }, + { + "epoch": 0.39, + "grad_norm": 2.8102455719397725, + "learning_rate": 7.029848413326561e-06, + "loss": 0.4046, + "step": 4914 + }, + { + "epoch": 0.39, + "grad_norm": 1.569494803861767, + "learning_rate": 7.028686069618255e-06, + "loss": 0.4939, + "step": 4915 + }, + { + "epoch": 0.39, + "grad_norm": 1.6633172827331617, + "learning_rate": 7.027523594656615e-06, + "loss": 0.4554, + "step": 4916 + }, + { + "epoch": 0.39, + "grad_norm": 0.5642926104787376, + "learning_rate": 7.026360988516848e-06, + "loss": 0.5092, + "step": 4917 + }, + { + "epoch": 0.39, + "grad_norm": 1.5261450670091496, + "learning_rate": 7.025198251274179e-06, + "loss": 0.4174, + "step": 4918 + }, + { + "epoch": 0.39, + "grad_norm": 1.448012199015533, + "learning_rate": 7.024035383003829e-06, + "loss": 0.487, + "step": 4919 + }, + { + "epoch": 0.39, + "grad_norm": 1.537271207249816, + "learning_rate": 7.022872383781039e-06, + "loss": 0.5191, + "step": 4920 + }, + { + "epoch": 0.39, + "grad_norm": 1.7234610261669818, + "learning_rate": 7.021709253681053e-06, + "loss": 0.466, + "step": 4921 + }, + { + "epoch": 0.39, + "grad_norm": 1.7796357986476352, + "learning_rate": 7.020545992779122e-06, + "loss": 0.4971, + "step": 4922 + }, + { + "epoch": 0.39, + "grad_norm": 1.5243852343785267, + "learning_rate": 7.019382601150509e-06, + "loss": 0.5014, + "step": 4923 + }, + { + "epoch": 0.39, + "grad_norm": 2.04601509562885, + "learning_rate": 7.018219078870484e-06, + "loss": 0.5305, + "step": 4924 + }, + { + "epoch": 0.39, + "grad_norm": 1.7641941214772858, + "learning_rate": 7.017055426014323e-06, + "loss": 0.4978, + "step": 4925 + }, + { + "epoch": 0.39, + "grad_norm": 1.294434781619565, + "learning_rate": 7.015891642657316e-06, + "loss": 0.4559, + "step": 4926 + }, + { + "epoch": 0.39, + "grad_norm": 1.5266179607523802, + "learning_rate": 7.014727728874757e-06, + "loss": 0.5016, + "step": 4927 + }, + { + "epoch": 0.39, + "grad_norm": 2.0349742682609397, + "learning_rate": 7.01356368474195e-06, + "loss": 0.4782, + "step": 4928 + }, + { + "epoch": 0.39, + "grad_norm": 0.5720871276512539, + "learning_rate": 7.012399510334205e-06, + "loss": 0.5057, + "step": 4929 + }, + { + "epoch": 0.39, + "grad_norm": 0.5692220507313834, + "learning_rate": 7.011235205726845e-06, + "loss": 0.5001, + "step": 4930 + }, + { + "epoch": 0.39, + "grad_norm": 1.4774321904303722, + "learning_rate": 7.0100707709951965e-06, + "loss": 0.4917, + "step": 4931 + }, + { + "epoch": 0.39, + "grad_norm": 2.6472638458785145, + "learning_rate": 7.008906206214601e-06, + "loss": 0.5374, + "step": 4932 + }, + { + "epoch": 0.39, + "grad_norm": 1.535329647149385, + "learning_rate": 7.007741511460401e-06, + "loss": 0.4825, + "step": 4933 + }, + { + "epoch": 0.39, + "grad_norm": 0.5628796487758504, + "learning_rate": 7.006576686807952e-06, + "loss": 0.5128, + "step": 4934 + }, + { + "epoch": 0.39, + "grad_norm": 0.5726165167765285, + "learning_rate": 7.005411732332615e-06, + "loss": 0.5292, + "step": 4935 + }, + { + "epoch": 0.39, + "grad_norm": 2.216874948484997, + "learning_rate": 7.004246648109765e-06, + "loss": 0.4737, + "step": 4936 + }, + { + "epoch": 0.39, + "grad_norm": 1.6639112552904567, + "learning_rate": 7.003081434214777e-06, + "loss": 0.4964, + "step": 4937 + }, + { + "epoch": 0.39, + "grad_norm": 2.2008482389993125, + "learning_rate": 7.001916090723041e-06, + "loss": 0.4922, + "step": 4938 + }, + { + "epoch": 0.39, + "grad_norm": 0.5748130365995252, + "learning_rate": 7.0007506177099515e-06, + "loss": 0.5036, + "step": 4939 + }, + { + "epoch": 0.39, + "grad_norm": 3.508490163958507, + "learning_rate": 6.999585015250916e-06, + "loss": 0.4888, + "step": 4940 + }, + { + "epoch": 0.39, + "grad_norm": 1.6589962958292217, + "learning_rate": 6.998419283421345e-06, + "loss": 0.4836, + "step": 4941 + }, + { + "epoch": 0.39, + "grad_norm": 2.4820722143023364, + "learning_rate": 6.99725342229666e-06, + "loss": 0.5056, + "step": 4942 + }, + { + "epoch": 0.39, + "grad_norm": 1.6344594633301817, + "learning_rate": 6.996087431952292e-06, + "loss": 0.4868, + "step": 4943 + }, + { + "epoch": 0.39, + "grad_norm": 1.6258585771978618, + "learning_rate": 6.994921312463679e-06, + "loss": 0.4148, + "step": 4944 + }, + { + "epoch": 0.39, + "grad_norm": 1.7104363721701417, + "learning_rate": 6.993755063906266e-06, + "loss": 0.5313, + "step": 4945 + }, + { + "epoch": 0.39, + "grad_norm": 1.6107469547149282, + "learning_rate": 6.992588686355508e-06, + "loss": 0.4722, + "step": 4946 + }, + { + "epoch": 0.39, + "grad_norm": 1.5826607330140783, + "learning_rate": 6.991422179886871e-06, + "loss": 0.5049, + "step": 4947 + }, + { + "epoch": 0.39, + "grad_norm": 1.7770780002171889, + "learning_rate": 6.990255544575821e-06, + "loss": 0.5097, + "step": 4948 + }, + { + "epoch": 0.39, + "grad_norm": 1.6366698579730952, + "learning_rate": 6.989088780497844e-06, + "loss": 0.4129, + "step": 4949 + }, + { + "epoch": 0.39, + "grad_norm": 3.089682866935191, + "learning_rate": 6.987921887728422e-06, + "loss": 0.4679, + "step": 4950 + }, + { + "epoch": 0.39, + "grad_norm": 1.6625856514565507, + "learning_rate": 6.986754866343056e-06, + "loss": 0.4566, + "step": 4951 + }, + { + "epoch": 0.39, + "grad_norm": 1.751596928925858, + "learning_rate": 6.985587716417248e-06, + "loss": 0.5104, + "step": 4952 + }, + { + "epoch": 0.39, + "grad_norm": 2.7713322914392933, + "learning_rate": 6.984420438026513e-06, + "loss": 0.4836, + "step": 4953 + }, + { + "epoch": 0.39, + "grad_norm": 1.5640731504269751, + "learning_rate": 6.983253031246371e-06, + "loss": 0.463, + "step": 4954 + }, + { + "epoch": 0.39, + "grad_norm": 1.4377670435686263, + "learning_rate": 6.9820854961523545e-06, + "loss": 0.4907, + "step": 4955 + }, + { + "epoch": 0.39, + "grad_norm": 1.5066571696954416, + "learning_rate": 6.980917832819996e-06, + "loss": 0.4774, + "step": 4956 + }, + { + "epoch": 0.39, + "grad_norm": 1.527231053639831, + "learning_rate": 6.979750041324849e-06, + "loss": 0.4428, + "step": 4957 + }, + { + "epoch": 0.39, + "grad_norm": 2.3284021718067764, + "learning_rate": 6.978582121742461e-06, + "loss": 0.4766, + "step": 4958 + }, + { + "epoch": 0.39, + "grad_norm": 1.4797033727102322, + "learning_rate": 6.9774140741484e-06, + "loss": 0.5026, + "step": 4959 + }, + { + "epoch": 0.39, + "grad_norm": 0.6391504308219952, + "learning_rate": 6.976245898618234e-06, + "loss": 0.5205, + "step": 4960 + }, + { + "epoch": 0.39, + "grad_norm": 1.5681791873815025, + "learning_rate": 6.975077595227544e-06, + "loss": 0.5157, + "step": 4961 + }, + { + "epoch": 0.39, + "grad_norm": 1.6484627628645279, + "learning_rate": 6.973909164051916e-06, + "loss": 0.4663, + "step": 4962 + }, + { + "epoch": 0.39, + "grad_norm": 0.6241573606542122, + "learning_rate": 6.9727406051669485e-06, + "loss": 0.5233, + "step": 4963 + }, + { + "epoch": 0.39, + "grad_norm": 1.6862209364813125, + "learning_rate": 6.971571918648245e-06, + "loss": 0.4798, + "step": 4964 + }, + { + "epoch": 0.39, + "grad_norm": 1.6219436265504572, + "learning_rate": 6.970403104571416e-06, + "loss": 0.5214, + "step": 4965 + }, + { + "epoch": 0.39, + "grad_norm": 1.6918789607352824, + "learning_rate": 6.969234163012084e-06, + "loss": 0.5242, + "step": 4966 + }, + { + "epoch": 0.39, + "grad_norm": 0.6308665631004468, + "learning_rate": 6.96806509404588e-06, + "loss": 0.5148, + "step": 4967 + }, + { + "epoch": 0.39, + "grad_norm": 1.7037384578755337, + "learning_rate": 6.966895897748436e-06, + "loss": 0.4893, + "step": 4968 + }, + { + "epoch": 0.39, + "grad_norm": 1.761066223794488, + "learning_rate": 6.965726574195403e-06, + "loss": 0.5421, + "step": 4969 + }, + { + "epoch": 0.39, + "grad_norm": 1.3874299849755027, + "learning_rate": 6.96455712346243e-06, + "loss": 0.4649, + "step": 4970 + }, + { + "epoch": 0.39, + "grad_norm": 1.9813518125537426, + "learning_rate": 6.963387545625183e-06, + "loss": 0.4781, + "step": 4971 + }, + { + "epoch": 0.39, + "grad_norm": 2.4496943273616765, + "learning_rate": 6.962217840759329e-06, + "loss": 0.5009, + "step": 4972 + }, + { + "epoch": 0.39, + "grad_norm": 1.360374197642996, + "learning_rate": 6.961048008940548e-06, + "loss": 0.4703, + "step": 4973 + }, + { + "epoch": 0.39, + "grad_norm": 0.6033211934566961, + "learning_rate": 6.959878050244526e-06, + "loss": 0.5002, + "step": 4974 + }, + { + "epoch": 0.39, + "grad_norm": 1.3932231859101576, + "learning_rate": 6.958707964746958e-06, + "loss": 0.4187, + "step": 4975 + }, + { + "epoch": 0.39, + "grad_norm": 1.4570024064795173, + "learning_rate": 6.9575377525235464e-06, + "loss": 0.4788, + "step": 4976 + }, + { + "epoch": 0.39, + "grad_norm": 2.028896346364392, + "learning_rate": 6.956367413650004e-06, + "loss": 0.4239, + "step": 4977 + }, + { + "epoch": 0.39, + "grad_norm": 2.0195495783894293, + "learning_rate": 6.955196948202047e-06, + "loss": 0.4708, + "step": 4978 + }, + { + "epoch": 0.39, + "grad_norm": 1.615152426391804, + "learning_rate": 6.9540263562554085e-06, + "loss": 0.4277, + "step": 4979 + }, + { + "epoch": 0.39, + "grad_norm": 1.645904998245222, + "learning_rate": 6.952855637885819e-06, + "loss": 0.4369, + "step": 4980 + }, + { + "epoch": 0.39, + "grad_norm": 1.7836850273665843, + "learning_rate": 6.9516847931690255e-06, + "loss": 0.489, + "step": 4981 + }, + { + "epoch": 0.39, + "grad_norm": 1.776807399496063, + "learning_rate": 6.950513822180778e-06, + "loss": 0.5456, + "step": 4982 + }, + { + "epoch": 0.39, + "grad_norm": 1.4668380884900674, + "learning_rate": 6.9493427249968384e-06, + "loss": 0.4615, + "step": 4983 + }, + { + "epoch": 0.39, + "grad_norm": 1.5266440858876407, + "learning_rate": 6.948171501692974e-06, + "loss": 0.4847, + "step": 4984 + }, + { + "epoch": 0.39, + "grad_norm": 1.6462517224193731, + "learning_rate": 6.947000152344963e-06, + "loss": 0.5371, + "step": 4985 + }, + { + "epoch": 0.39, + "grad_norm": 1.7054615723588076, + "learning_rate": 6.945828677028588e-06, + "loss": 0.4966, + "step": 4986 + }, + { + "epoch": 0.39, + "grad_norm": 0.5794148374695198, + "learning_rate": 6.944657075819643e-06, + "loss": 0.5106, + "step": 4987 + }, + { + "epoch": 0.39, + "grad_norm": 1.773895273300592, + "learning_rate": 6.943485348793929e-06, + "loss": 0.5147, + "step": 4988 + }, + { + "epoch": 0.39, + "grad_norm": 1.6535908386708804, + "learning_rate": 6.942313496027255e-06, + "loss": 0.482, + "step": 4989 + }, + { + "epoch": 0.39, + "grad_norm": 2.971117171558713, + "learning_rate": 6.941141517595441e-06, + "loss": 0.4939, + "step": 4990 + }, + { + "epoch": 0.39, + "grad_norm": 1.7068980782744823, + "learning_rate": 6.939969413574308e-06, + "loss": 0.453, + "step": 4991 + }, + { + "epoch": 0.39, + "grad_norm": 1.649412592339443, + "learning_rate": 6.938797184039694e-06, + "loss": 0.4434, + "step": 4992 + }, + { + "epoch": 0.39, + "grad_norm": 1.7180376482864714, + "learning_rate": 6.937624829067435e-06, + "loss": 0.4605, + "step": 4993 + }, + { + "epoch": 0.39, + "grad_norm": 1.5907463790788607, + "learning_rate": 6.936452348733388e-06, + "loss": 0.4487, + "step": 4994 + }, + { + "epoch": 0.39, + "grad_norm": 1.6179302612445647, + "learning_rate": 6.935279743113404e-06, + "loss": 0.4919, + "step": 4995 + }, + { + "epoch": 0.39, + "grad_norm": 2.3374607579106397, + "learning_rate": 6.934107012283355e-06, + "loss": 0.4261, + "step": 4996 + }, + { + "epoch": 0.39, + "grad_norm": 2.0150125914072996, + "learning_rate": 6.932934156319111e-06, + "loss": 0.4135, + "step": 4997 + }, + { + "epoch": 0.39, + "grad_norm": 1.6095539144631388, + "learning_rate": 6.931761175296556e-06, + "loss": 0.5184, + "step": 4998 + }, + { + "epoch": 0.39, + "grad_norm": 1.9333089318539403, + "learning_rate": 6.930588069291578e-06, + "loss": 0.4979, + "step": 4999 + }, + { + "epoch": 0.39, + "grad_norm": 1.5852473633739683, + "learning_rate": 6.9294148383800805e-06, + "loss": 0.4437, + "step": 5000 + }, + { + "epoch": 0.39, + "grad_norm": 0.5861917108381923, + "learning_rate": 6.928241482637965e-06, + "loss": 0.5119, + "step": 5001 + }, + { + "epoch": 0.39, + "grad_norm": 1.592933565810781, + "learning_rate": 6.9270680021411495e-06, + "loss": 0.4842, + "step": 5002 + }, + { + "epoch": 0.39, + "grad_norm": 1.495995364739113, + "learning_rate": 6.9258943969655545e-06, + "loss": 0.515, + "step": 5003 + }, + { + "epoch": 0.39, + "grad_norm": 1.8575168537024334, + "learning_rate": 6.924720667187112e-06, + "loss": 0.4812, + "step": 5004 + }, + { + "epoch": 0.39, + "grad_norm": 4.411904756499402, + "learning_rate": 6.923546812881759e-06, + "loss": 0.42, + "step": 5005 + }, + { + "epoch": 0.39, + "grad_norm": 1.5382047958678762, + "learning_rate": 6.922372834125445e-06, + "loss": 0.5215, + "step": 5006 + }, + { + "epoch": 0.39, + "grad_norm": 1.6800668625975574, + "learning_rate": 6.921198730994121e-06, + "loss": 0.4944, + "step": 5007 + }, + { + "epoch": 0.39, + "grad_norm": 1.4541323242411728, + "learning_rate": 6.920024503563755e-06, + "loss": 0.4905, + "step": 5008 + }, + { + "epoch": 0.39, + "grad_norm": 1.6992816469813739, + "learning_rate": 6.918850151910313e-06, + "loss": 0.472, + "step": 5009 + }, + { + "epoch": 0.39, + "grad_norm": 0.5991591128247871, + "learning_rate": 6.917675676109777e-06, + "loss": 0.5279, + "step": 5010 + }, + { + "epoch": 0.39, + "grad_norm": 1.6515537713584012, + "learning_rate": 6.916501076238135e-06, + "loss": 0.5286, + "step": 5011 + }, + { + "epoch": 0.39, + "grad_norm": 0.5982021288327655, + "learning_rate": 6.915326352371379e-06, + "loss": 0.5203, + "step": 5012 + }, + { + "epoch": 0.39, + "grad_norm": 2.46926604251752, + "learning_rate": 6.914151504585513e-06, + "loss": 0.5023, + "step": 5013 + }, + { + "epoch": 0.39, + "grad_norm": 1.6968138476179577, + "learning_rate": 6.9129765329565515e-06, + "loss": 0.4554, + "step": 5014 + }, + { + "epoch": 0.39, + "grad_norm": 1.6001106094899922, + "learning_rate": 6.9118014375605085e-06, + "loss": 0.5043, + "step": 5015 + }, + { + "epoch": 0.39, + "grad_norm": 1.7664668180716354, + "learning_rate": 6.910626218473414e-06, + "loss": 0.4929, + "step": 5016 + }, + { + "epoch": 0.39, + "grad_norm": 1.4259177056827834, + "learning_rate": 6.909450875771302e-06, + "loss": 0.4407, + "step": 5017 + }, + { + "epoch": 0.39, + "grad_norm": 2.0595849998000553, + "learning_rate": 6.908275409530216e-06, + "loss": 0.4982, + "step": 5018 + }, + { + "epoch": 0.39, + "grad_norm": 1.9938437859447415, + "learning_rate": 6.907099819826208e-06, + "loss": 0.5125, + "step": 5019 + }, + { + "epoch": 0.39, + "grad_norm": 0.6297257268074141, + "learning_rate": 6.905924106735337e-06, + "loss": 0.5149, + "step": 5020 + }, + { + "epoch": 0.39, + "grad_norm": 1.3384022944921539, + "learning_rate": 6.9047482703336676e-06, + "loss": 0.4811, + "step": 5021 + }, + { + "epoch": 0.39, + "grad_norm": 0.5586108762331874, + "learning_rate": 6.903572310697279e-06, + "loss": 0.5208, + "step": 5022 + }, + { + "epoch": 0.39, + "grad_norm": 0.5892949338910995, + "learning_rate": 6.902396227902252e-06, + "loss": 0.5131, + "step": 5023 + }, + { + "epoch": 0.39, + "grad_norm": 0.6068212091860247, + "learning_rate": 6.901220022024676e-06, + "loss": 0.5122, + "step": 5024 + }, + { + "epoch": 0.39, + "grad_norm": 1.4471972566121922, + "learning_rate": 6.900043693140653e-06, + "loss": 0.4644, + "step": 5025 + }, + { + "epoch": 0.39, + "grad_norm": 1.6808294318402828, + "learning_rate": 6.898867241326288e-06, + "loss": 0.5291, + "step": 5026 + }, + { + "epoch": 0.39, + "grad_norm": 0.6672123741262052, + "learning_rate": 6.897690666657697e-06, + "loss": 0.5338, + "step": 5027 + }, + { + "epoch": 0.39, + "grad_norm": 1.740739940005463, + "learning_rate": 6.896513969211003e-06, + "loss": 0.467, + "step": 5028 + }, + { + "epoch": 0.39, + "grad_norm": 0.5457317698055332, + "learning_rate": 6.8953371490623355e-06, + "loss": 0.5082, + "step": 5029 + }, + { + "epoch": 0.4, + "grad_norm": 1.5948797384957327, + "learning_rate": 6.8941602062878335e-06, + "loss": 0.4922, + "step": 5030 + }, + { + "epoch": 0.4, + "grad_norm": 1.6854107091560475, + "learning_rate": 6.892983140963645e-06, + "loss": 0.4846, + "step": 5031 + }, + { + "epoch": 0.4, + "grad_norm": 1.9888312521075242, + "learning_rate": 6.891805953165921e-06, + "loss": 0.4567, + "step": 5032 + }, + { + "epoch": 0.4, + "grad_norm": 2.1897044304057816, + "learning_rate": 6.8906286429708294e-06, + "loss": 0.4709, + "step": 5033 + }, + { + "epoch": 0.4, + "grad_norm": 1.899102860886273, + "learning_rate": 6.889451210454536e-06, + "loss": 0.4355, + "step": 5034 + }, + { + "epoch": 0.4, + "grad_norm": 1.5769336075461506, + "learning_rate": 6.888273655693223e-06, + "loss": 0.4765, + "step": 5035 + }, + { + "epoch": 0.4, + "grad_norm": 1.5473246640734752, + "learning_rate": 6.887095978763072e-06, + "loss": 0.4925, + "step": 5036 + }, + { + "epoch": 0.4, + "grad_norm": 2.358660447223554, + "learning_rate": 6.885918179740283e-06, + "loss": 0.4444, + "step": 5037 + }, + { + "epoch": 0.4, + "grad_norm": 1.5457213355824386, + "learning_rate": 6.884740258701052e-06, + "loss": 0.4691, + "step": 5038 + }, + { + "epoch": 0.4, + "grad_norm": 1.6208181650465323, + "learning_rate": 6.8835622157215944e-06, + "loss": 0.4739, + "step": 5039 + }, + { + "epoch": 0.4, + "grad_norm": 1.9452381061445998, + "learning_rate": 6.882384050878124e-06, + "loss": 0.4891, + "step": 5040 + }, + { + "epoch": 0.4, + "grad_norm": 1.4196534991024101, + "learning_rate": 6.881205764246867e-06, + "loss": 0.4824, + "step": 5041 + }, + { + "epoch": 0.4, + "grad_norm": 1.9702615180159835, + "learning_rate": 6.88002735590406e-06, + "loss": 0.5067, + "step": 5042 + }, + { + "epoch": 0.4, + "grad_norm": 0.7120684880079208, + "learning_rate": 6.878848825925941e-06, + "loss": 0.5151, + "step": 5043 + }, + { + "epoch": 0.4, + "grad_norm": 1.9020921234282075, + "learning_rate": 6.877670174388761e-06, + "loss": 0.4351, + "step": 5044 + }, + { + "epoch": 0.4, + "grad_norm": 4.62164328954833, + "learning_rate": 6.876491401368778e-06, + "loss": 0.5042, + "step": 5045 + }, + { + "epoch": 0.4, + "grad_norm": 0.5958142556383952, + "learning_rate": 6.875312506942254e-06, + "loss": 0.504, + "step": 5046 + }, + { + "epoch": 0.4, + "grad_norm": 1.6822996627046238, + "learning_rate": 6.874133491185466e-06, + "loss": 0.501, + "step": 5047 + }, + { + "epoch": 0.4, + "grad_norm": 1.484340339136958, + "learning_rate": 6.872954354174692e-06, + "loss": 0.4374, + "step": 5048 + }, + { + "epoch": 0.4, + "grad_norm": 1.6242036702779448, + "learning_rate": 6.8717750959862225e-06, + "loss": 0.4212, + "step": 5049 + }, + { + "epoch": 0.4, + "grad_norm": 1.4767837839883453, + "learning_rate": 6.870595716696352e-06, + "loss": 0.4539, + "step": 5050 + }, + { + "epoch": 0.4, + "grad_norm": 0.6538931289029237, + "learning_rate": 6.869416216381386e-06, + "loss": 0.507, + "step": 5051 + }, + { + "epoch": 0.4, + "grad_norm": 0.6434749681529897, + "learning_rate": 6.8682365951176355e-06, + "loss": 0.5023, + "step": 5052 + }, + { + "epoch": 0.4, + "grad_norm": 1.8125848315759476, + "learning_rate": 6.867056852981422e-06, + "loss": 0.4448, + "step": 5053 + }, + { + "epoch": 0.4, + "grad_norm": 1.567883881181303, + "learning_rate": 6.865876990049073e-06, + "loss": 0.4841, + "step": 5054 + }, + { + "epoch": 0.4, + "grad_norm": 2.074260794422984, + "learning_rate": 6.864697006396922e-06, + "loss": 0.4689, + "step": 5055 + }, + { + "epoch": 0.4, + "grad_norm": 2.0681580972378173, + "learning_rate": 6.863516902101315e-06, + "loss": 0.4922, + "step": 5056 + }, + { + "epoch": 0.4, + "grad_norm": 1.7263712909998254, + "learning_rate": 6.862336677238603e-06, + "loss": 0.5126, + "step": 5057 + }, + { + "epoch": 0.4, + "grad_norm": 2.1086741469790637, + "learning_rate": 6.861156331885142e-06, + "loss": 0.4542, + "step": 5058 + }, + { + "epoch": 0.4, + "grad_norm": 1.990019722348448, + "learning_rate": 6.859975866117302e-06, + "loss": 0.489, + "step": 5059 + }, + { + "epoch": 0.4, + "grad_norm": 1.835797742563605, + "learning_rate": 6.858795280011458e-06, + "loss": 0.4146, + "step": 5060 + }, + { + "epoch": 0.4, + "grad_norm": 2.013695076338349, + "learning_rate": 6.8576145736439894e-06, + "loss": 0.492, + "step": 5061 + }, + { + "epoch": 0.4, + "grad_norm": 1.5825735448989022, + "learning_rate": 6.856433747091289e-06, + "loss": 0.4669, + "step": 5062 + }, + { + "epoch": 0.4, + "grad_norm": 2.041998234240429, + "learning_rate": 6.8552528004297525e-06, + "loss": 0.4556, + "step": 5063 + }, + { + "epoch": 0.4, + "grad_norm": 2.0204251168558325, + "learning_rate": 6.854071733735789e-06, + "loss": 0.4285, + "step": 5064 + }, + { + "epoch": 0.4, + "grad_norm": 1.3358851395831104, + "learning_rate": 6.852890547085808e-06, + "loss": 0.5116, + "step": 5065 + }, + { + "epoch": 0.4, + "grad_norm": 1.454788710172959, + "learning_rate": 6.851709240556233e-06, + "loss": 0.4605, + "step": 5066 + }, + { + "epoch": 0.4, + "grad_norm": 1.3909764108674103, + "learning_rate": 6.850527814223491e-06, + "loss": 0.4652, + "step": 5067 + }, + { + "epoch": 0.4, + "grad_norm": 1.6852454397915315, + "learning_rate": 6.849346268164022e-06, + "loss": 0.4992, + "step": 5068 + }, + { + "epoch": 0.4, + "grad_norm": 1.7197517532667954, + "learning_rate": 6.848164602454268e-06, + "loss": 0.479, + "step": 5069 + }, + { + "epoch": 0.4, + "grad_norm": 0.8338719435993904, + "learning_rate": 6.846982817170682e-06, + "loss": 0.5224, + "step": 5070 + }, + { + "epoch": 0.4, + "grad_norm": 2.042581600498176, + "learning_rate": 6.845800912389724e-06, + "loss": 0.48, + "step": 5071 + }, + { + "epoch": 0.4, + "grad_norm": 2.2276511238894865, + "learning_rate": 6.844618888187861e-06, + "loss": 0.4976, + "step": 5072 + }, + { + "epoch": 0.4, + "grad_norm": 1.8194930783028291, + "learning_rate": 6.8434367446415706e-06, + "loss": 0.5325, + "step": 5073 + }, + { + "epoch": 0.4, + "grad_norm": 1.7222285396557782, + "learning_rate": 6.8422544818273336e-06, + "loss": 0.4397, + "step": 5074 + }, + { + "epoch": 0.4, + "grad_norm": 1.514281054707239, + "learning_rate": 6.841072099821641e-06, + "loss": 0.514, + "step": 5075 + }, + { + "epoch": 0.4, + "grad_norm": 0.5709793971631403, + "learning_rate": 6.839889598700993e-06, + "loss": 0.4978, + "step": 5076 + }, + { + "epoch": 0.4, + "grad_norm": 0.6375026708296558, + "learning_rate": 6.8387069785418935e-06, + "loss": 0.539, + "step": 5077 + }, + { + "epoch": 0.4, + "grad_norm": 0.5368866569174415, + "learning_rate": 6.837524239420858e-06, + "loss": 0.514, + "step": 5078 + }, + { + "epoch": 0.4, + "grad_norm": 2.055758316298568, + "learning_rate": 6.8363413814144085e-06, + "loss": 0.4928, + "step": 5079 + }, + { + "epoch": 0.4, + "grad_norm": 1.6996253777790984, + "learning_rate": 6.835158404599074e-06, + "loss": 0.4372, + "step": 5080 + }, + { + "epoch": 0.4, + "grad_norm": 1.4522076917106026, + "learning_rate": 6.833975309051391e-06, + "loss": 0.4089, + "step": 5081 + }, + { + "epoch": 0.4, + "grad_norm": 1.4830301659964606, + "learning_rate": 6.832792094847906e-06, + "loss": 0.4518, + "step": 5082 + }, + { + "epoch": 0.4, + "grad_norm": 0.6691377579756101, + "learning_rate": 6.8316087620651675e-06, + "loss": 0.5209, + "step": 5083 + }, + { + "epoch": 0.4, + "grad_norm": 0.7214189735591864, + "learning_rate": 6.83042531077974e-06, + "loss": 0.5389, + "step": 5084 + }, + { + "epoch": 0.4, + "grad_norm": 2.53216226105628, + "learning_rate": 6.829241741068189e-06, + "loss": 0.4835, + "step": 5085 + }, + { + "epoch": 0.4, + "grad_norm": 1.7268708969373763, + "learning_rate": 6.828058053007091e-06, + "loss": 0.4778, + "step": 5086 + }, + { + "epoch": 0.4, + "grad_norm": 1.5277391432617244, + "learning_rate": 6.826874246673027e-06, + "loss": 0.4889, + "step": 5087 + }, + { + "epoch": 0.4, + "grad_norm": 1.9623051911608813, + "learning_rate": 6.825690322142589e-06, + "loss": 0.4962, + "step": 5088 + }, + { + "epoch": 0.4, + "grad_norm": 1.597356028652685, + "learning_rate": 6.824506279492375e-06, + "loss": 0.5142, + "step": 5089 + }, + { + "epoch": 0.4, + "grad_norm": 1.8832376881614035, + "learning_rate": 6.823322118798994e-06, + "loss": 0.4629, + "step": 5090 + }, + { + "epoch": 0.4, + "grad_norm": 2.4613390879183994, + "learning_rate": 6.822137840139056e-06, + "loss": 0.4577, + "step": 5091 + }, + { + "epoch": 0.4, + "grad_norm": 1.757873435658759, + "learning_rate": 6.820953443589184e-06, + "loss": 0.4874, + "step": 5092 + }, + { + "epoch": 0.4, + "grad_norm": 2.5930120215922154, + "learning_rate": 6.819768929226004e-06, + "loss": 0.5042, + "step": 5093 + }, + { + "epoch": 0.4, + "grad_norm": 1.4284865955310406, + "learning_rate": 6.818584297126157e-06, + "loss": 0.4927, + "step": 5094 + }, + { + "epoch": 0.4, + "grad_norm": 0.8485465888131762, + "learning_rate": 6.817399547366284e-06, + "loss": 0.514, + "step": 5095 + }, + { + "epoch": 0.4, + "grad_norm": 1.9476553190637653, + "learning_rate": 6.816214680023037e-06, + "loss": 0.475, + "step": 5096 + }, + { + "epoch": 0.4, + "grad_norm": 3.184890202468165, + "learning_rate": 6.815029695173079e-06, + "loss": 0.482, + "step": 5097 + }, + { + "epoch": 0.4, + "grad_norm": 0.6043139318427282, + "learning_rate": 6.813844592893071e-06, + "loss": 0.5152, + "step": 5098 + }, + { + "epoch": 0.4, + "grad_norm": 1.472579417349049, + "learning_rate": 6.812659373259693e-06, + "loss": 0.4806, + "step": 5099 + }, + { + "epoch": 0.4, + "grad_norm": 1.67112780929482, + "learning_rate": 6.811474036349622e-06, + "loss": 0.4964, + "step": 5100 + }, + { + "epoch": 0.4, + "grad_norm": 1.9374411363238244, + "learning_rate": 6.810288582239553e-06, + "loss": 0.4313, + "step": 5101 + }, + { + "epoch": 0.4, + "grad_norm": 1.9371917516419939, + "learning_rate": 6.80910301100618e-06, + "loss": 0.463, + "step": 5102 + }, + { + "epoch": 0.4, + "grad_norm": 3.012006334476266, + "learning_rate": 6.807917322726208e-06, + "loss": 0.4478, + "step": 5103 + }, + { + "epoch": 0.4, + "grad_norm": 0.7502839959317843, + "learning_rate": 6.806731517476351e-06, + "loss": 0.5153, + "step": 5104 + }, + { + "epoch": 0.4, + "grad_norm": 1.9161902169051828, + "learning_rate": 6.805545595333329e-06, + "loss": 0.4599, + "step": 5105 + }, + { + "epoch": 0.4, + "grad_norm": 1.6416697170671828, + "learning_rate": 6.804359556373868e-06, + "loss": 0.4587, + "step": 5106 + }, + { + "epoch": 0.4, + "grad_norm": 1.4945869418050486, + "learning_rate": 6.803173400674705e-06, + "loss": 0.4337, + "step": 5107 + }, + { + "epoch": 0.4, + "grad_norm": 1.8390498381536393, + "learning_rate": 6.8019871283125816e-06, + "loss": 0.4392, + "step": 5108 + }, + { + "epoch": 0.4, + "grad_norm": 1.788199804482733, + "learning_rate": 6.800800739364248e-06, + "loss": 0.4111, + "step": 5109 + }, + { + "epoch": 0.4, + "grad_norm": 2.4797201670456888, + "learning_rate": 6.799614233906462e-06, + "loss": 0.5213, + "step": 5110 + }, + { + "epoch": 0.4, + "grad_norm": 1.7597324596539268, + "learning_rate": 6.798427612015991e-06, + "loss": 0.4614, + "step": 5111 + }, + { + "epoch": 0.4, + "grad_norm": 0.5580322285437369, + "learning_rate": 6.7972408737696025e-06, + "loss": 0.4956, + "step": 5112 + }, + { + "epoch": 0.4, + "grad_norm": 1.6019729206448499, + "learning_rate": 6.796054019244084e-06, + "loss": 0.4699, + "step": 5113 + }, + { + "epoch": 0.4, + "grad_norm": 0.6152465694304937, + "learning_rate": 6.794867048516218e-06, + "loss": 0.4956, + "step": 5114 + }, + { + "epoch": 0.4, + "grad_norm": 1.877289048317339, + "learning_rate": 6.793679961662804e-06, + "loss": 0.4993, + "step": 5115 + }, + { + "epoch": 0.4, + "grad_norm": 1.2614526411451534, + "learning_rate": 6.792492758760641e-06, + "loss": 0.4604, + "step": 5116 + }, + { + "epoch": 0.4, + "grad_norm": 1.5814892556842117, + "learning_rate": 6.7913054398865416e-06, + "loss": 0.4687, + "step": 5117 + }, + { + "epoch": 0.4, + "grad_norm": 1.6732843162986006, + "learning_rate": 6.790118005117325e-06, + "loss": 0.4936, + "step": 5118 + }, + { + "epoch": 0.4, + "grad_norm": 1.9690469963373776, + "learning_rate": 6.788930454529814e-06, + "loss": 0.4711, + "step": 5119 + }, + { + "epoch": 0.4, + "grad_norm": 0.585883217429109, + "learning_rate": 6.7877427882008426e-06, + "loss": 0.527, + "step": 5120 + }, + { + "epoch": 0.4, + "grad_norm": 1.944685675750166, + "learning_rate": 6.786555006207253e-06, + "loss": 0.4816, + "step": 5121 + }, + { + "epoch": 0.4, + "grad_norm": 2.7840383704541263, + "learning_rate": 6.7853671086258896e-06, + "loss": 0.4433, + "step": 5122 + }, + { + "epoch": 0.4, + "grad_norm": 1.8875140855495454, + "learning_rate": 6.78417909553361e-06, + "loss": 0.4919, + "step": 5123 + }, + { + "epoch": 0.4, + "grad_norm": 1.816861391423662, + "learning_rate": 6.782990967007277e-06, + "loss": 0.4539, + "step": 5124 + }, + { + "epoch": 0.4, + "grad_norm": 1.6823313936628466, + "learning_rate": 6.781802723123762e-06, + "loss": 0.4877, + "step": 5125 + }, + { + "epoch": 0.4, + "grad_norm": 0.5426109533817866, + "learning_rate": 6.780614363959941e-06, + "loss": 0.519, + "step": 5126 + }, + { + "epoch": 0.4, + "grad_norm": 1.930082166938152, + "learning_rate": 6.779425889592701e-06, + "loss": 0.4974, + "step": 5127 + }, + { + "epoch": 0.4, + "grad_norm": 4.763846432451691, + "learning_rate": 6.778237300098933e-06, + "loss": 0.4655, + "step": 5128 + }, + { + "epoch": 0.4, + "grad_norm": 1.372060450554244, + "learning_rate": 6.7770485955555374e-06, + "loss": 0.4505, + "step": 5129 + }, + { + "epoch": 0.4, + "grad_norm": 1.8200617782366366, + "learning_rate": 6.775859776039423e-06, + "loss": 0.4604, + "step": 5130 + }, + { + "epoch": 0.4, + "grad_norm": 1.835488610331581, + "learning_rate": 6.774670841627504e-06, + "loss": 0.4099, + "step": 5131 + }, + { + "epoch": 0.4, + "grad_norm": 1.6428221814049495, + "learning_rate": 6.773481792396703e-06, + "loss": 0.4792, + "step": 5132 + }, + { + "epoch": 0.4, + "grad_norm": 1.886474408203313, + "learning_rate": 6.77229262842395e-06, + "loss": 0.4879, + "step": 5133 + }, + { + "epoch": 0.4, + "grad_norm": 2.372255899616415, + "learning_rate": 6.7711033497861826e-06, + "loss": 0.4588, + "step": 5134 + }, + { + "epoch": 0.4, + "grad_norm": 2.8092594708960505, + "learning_rate": 6.769913956560346e-06, + "loss": 0.4789, + "step": 5135 + }, + { + "epoch": 0.4, + "grad_norm": 1.3133346123135095, + "learning_rate": 6.7687244488233896e-06, + "loss": 0.4404, + "step": 5136 + }, + { + "epoch": 0.4, + "grad_norm": 1.3790825850808666, + "learning_rate": 6.767534826652276e-06, + "loss": 0.4431, + "step": 5137 + }, + { + "epoch": 0.4, + "grad_norm": 0.5633307773545179, + "learning_rate": 6.766345090123973e-06, + "loss": 0.4982, + "step": 5138 + }, + { + "epoch": 0.4, + "grad_norm": 1.8218850600908645, + "learning_rate": 6.765155239315452e-06, + "loss": 0.4782, + "step": 5139 + }, + { + "epoch": 0.4, + "grad_norm": 1.540806276947367, + "learning_rate": 6.763965274303697e-06, + "loss": 0.4978, + "step": 5140 + }, + { + "epoch": 0.4, + "grad_norm": 1.6382920812802584, + "learning_rate": 6.762775195165695e-06, + "loss": 0.5041, + "step": 5141 + }, + { + "epoch": 0.4, + "grad_norm": 1.4138855922406606, + "learning_rate": 6.761585001978446e-06, + "loss": 0.468, + "step": 5142 + }, + { + "epoch": 0.4, + "grad_norm": 2.5734936592979416, + "learning_rate": 6.760394694818949e-06, + "loss": 0.47, + "step": 5143 + }, + { + "epoch": 0.4, + "grad_norm": 1.5689093762651634, + "learning_rate": 6.75920427376422e-06, + "loss": 0.4637, + "step": 5144 + }, + { + "epoch": 0.4, + "grad_norm": 0.5358992206275358, + "learning_rate": 6.758013738891275e-06, + "loss": 0.5323, + "step": 5145 + }, + { + "epoch": 0.4, + "grad_norm": 10.6660831689086, + "learning_rate": 6.7568230902771415e-06, + "loss": 0.4395, + "step": 5146 + }, + { + "epoch": 0.4, + "grad_norm": 10.318270577016534, + "learning_rate": 6.755632327998851e-06, + "loss": 0.4843, + "step": 5147 + }, + { + "epoch": 0.4, + "grad_norm": 1.5369927891902775, + "learning_rate": 6.754441452133447e-06, + "loss": 0.4713, + "step": 5148 + }, + { + "epoch": 0.4, + "grad_norm": 1.6735658159841027, + "learning_rate": 6.753250462757975e-06, + "loss": 0.4987, + "step": 5149 + }, + { + "epoch": 0.4, + "grad_norm": 0.5844973583263949, + "learning_rate": 6.752059359949493e-06, + "loss": 0.5098, + "step": 5150 + }, + { + "epoch": 0.4, + "grad_norm": 3.256569937367476, + "learning_rate": 6.750868143785062e-06, + "loss": 0.4581, + "step": 5151 + }, + { + "epoch": 0.4, + "grad_norm": 3.6939315760303884, + "learning_rate": 6.749676814341752e-06, + "loss": 0.4705, + "step": 5152 + }, + { + "epoch": 0.4, + "grad_norm": 1.396676587127883, + "learning_rate": 6.748485371696642e-06, + "loss": 0.4804, + "step": 5153 + }, + { + "epoch": 0.4, + "grad_norm": 1.408672423474548, + "learning_rate": 6.747293815926814e-06, + "loss": 0.4633, + "step": 5154 + }, + { + "epoch": 0.4, + "grad_norm": 0.5473062500630678, + "learning_rate": 6.746102147109364e-06, + "loss": 0.5104, + "step": 5155 + }, + { + "epoch": 0.4, + "grad_norm": 0.5383310082158282, + "learning_rate": 6.744910365321388e-06, + "loss": 0.5001, + "step": 5156 + }, + { + "epoch": 0.41, + "grad_norm": 2.0022295449406244, + "learning_rate": 6.7437184706399925e-06, + "loss": 0.4638, + "step": 5157 + }, + { + "epoch": 0.41, + "grad_norm": 1.7379963450277462, + "learning_rate": 6.742526463142295e-06, + "loss": 0.5033, + "step": 5158 + }, + { + "epoch": 0.41, + "grad_norm": 1.791391071002403, + "learning_rate": 6.7413343429054134e-06, + "loss": 0.5117, + "step": 5159 + }, + { + "epoch": 0.41, + "grad_norm": 1.9809029555015993, + "learning_rate": 6.74014211000648e-06, + "loss": 0.4866, + "step": 5160 + }, + { + "epoch": 0.41, + "grad_norm": 1.5181030213362454, + "learning_rate": 6.738949764522627e-06, + "loss": 0.479, + "step": 5161 + }, + { + "epoch": 0.41, + "grad_norm": 1.683352851769196, + "learning_rate": 6.737757306531e-06, + "loss": 0.4369, + "step": 5162 + }, + { + "epoch": 0.41, + "grad_norm": 1.5075952887977475, + "learning_rate": 6.736564736108747e-06, + "loss": 0.4776, + "step": 5163 + }, + { + "epoch": 0.41, + "grad_norm": 1.728214021905995, + "learning_rate": 6.735372053333028e-06, + "loss": 0.4914, + "step": 5164 + }, + { + "epoch": 0.41, + "grad_norm": 1.2749612902151166, + "learning_rate": 6.734179258281007e-06, + "loss": 0.4366, + "step": 5165 + }, + { + "epoch": 0.41, + "grad_norm": 0.5759132156446712, + "learning_rate": 6.732986351029858e-06, + "loss": 0.5033, + "step": 5166 + }, + { + "epoch": 0.41, + "grad_norm": 1.4825462206876123, + "learning_rate": 6.731793331656757e-06, + "loss": 0.4842, + "step": 5167 + }, + { + "epoch": 0.41, + "grad_norm": 1.5870800851234808, + "learning_rate": 6.730600200238895e-06, + "loss": 0.4562, + "step": 5168 + }, + { + "epoch": 0.41, + "grad_norm": 0.5816448653373989, + "learning_rate": 6.729406956853462e-06, + "loss": 0.496, + "step": 5169 + }, + { + "epoch": 0.41, + "grad_norm": 1.3055098524808837, + "learning_rate": 6.728213601577664e-06, + "loss": 0.4419, + "step": 5170 + }, + { + "epoch": 0.41, + "grad_norm": 1.4489027760800093, + "learning_rate": 6.727020134488703e-06, + "loss": 0.4981, + "step": 5171 + }, + { + "epoch": 0.41, + "grad_norm": 1.4717592959351993, + "learning_rate": 6.7258265556638e-06, + "loss": 0.4994, + "step": 5172 + }, + { + "epoch": 0.41, + "grad_norm": 1.6877154486116341, + "learning_rate": 6.724632865180178e-06, + "loss": 0.5099, + "step": 5173 + }, + { + "epoch": 0.41, + "grad_norm": 1.524438121444279, + "learning_rate": 6.723439063115065e-06, + "loss": 0.4928, + "step": 5174 + }, + { + "epoch": 0.41, + "grad_norm": 2.0281202673479983, + "learning_rate": 6.722245149545698e-06, + "loss": 0.4755, + "step": 5175 + }, + { + "epoch": 0.41, + "grad_norm": 2.122354332438997, + "learning_rate": 6.721051124549325e-06, + "loss": 0.437, + "step": 5176 + }, + { + "epoch": 0.41, + "grad_norm": 0.5876003183112888, + "learning_rate": 6.719856988203195e-06, + "loss": 0.5222, + "step": 5177 + }, + { + "epoch": 0.41, + "grad_norm": 1.656973280247856, + "learning_rate": 6.718662740584566e-06, + "loss": 0.5309, + "step": 5178 + }, + { + "epoch": 0.41, + "grad_norm": 1.5107660477620135, + "learning_rate": 6.717468381770707e-06, + "loss": 0.4771, + "step": 5179 + }, + { + "epoch": 0.41, + "grad_norm": 1.3422963641132817, + "learning_rate": 6.71627391183889e-06, + "loss": 0.5075, + "step": 5180 + }, + { + "epoch": 0.41, + "grad_norm": 1.8561612201313555, + "learning_rate": 6.715079330866397e-06, + "loss": 0.5016, + "step": 5181 + }, + { + "epoch": 0.41, + "grad_norm": 1.8582564708903415, + "learning_rate": 6.7138846389305146e-06, + "loss": 0.4801, + "step": 5182 + }, + { + "epoch": 0.41, + "grad_norm": 1.8099783992796339, + "learning_rate": 6.712689836108538e-06, + "loss": 0.4694, + "step": 5183 + }, + { + "epoch": 0.41, + "grad_norm": 0.6001959069424734, + "learning_rate": 6.711494922477769e-06, + "loss": 0.498, + "step": 5184 + }, + { + "epoch": 0.41, + "grad_norm": 2.2461196404636494, + "learning_rate": 6.7102998981155186e-06, + "loss": 0.4694, + "step": 5185 + }, + { + "epoch": 0.41, + "grad_norm": 1.6101795973735369, + "learning_rate": 6.7091047630991015e-06, + "loss": 0.4945, + "step": 5186 + }, + { + "epoch": 0.41, + "grad_norm": 2.013471680506151, + "learning_rate": 6.707909517505842e-06, + "loss": 0.5189, + "step": 5187 + }, + { + "epoch": 0.41, + "grad_norm": 2.130140031057444, + "learning_rate": 6.7067141614130706e-06, + "loss": 0.4961, + "step": 5188 + }, + { + "epoch": 0.41, + "grad_norm": 1.7430828994153171, + "learning_rate": 6.705518694898127e-06, + "loss": 0.4723, + "step": 5189 + }, + { + "epoch": 0.41, + "grad_norm": 1.6180855723492615, + "learning_rate": 6.7043231180383525e-06, + "loss": 0.4925, + "step": 5190 + }, + { + "epoch": 0.41, + "grad_norm": 1.5532595581235469, + "learning_rate": 6.7031274309111025e-06, + "loss": 0.4605, + "step": 5191 + }, + { + "epoch": 0.41, + "grad_norm": 0.6365111908081824, + "learning_rate": 6.701931633593737e-06, + "loss": 0.5351, + "step": 5192 + }, + { + "epoch": 0.41, + "grad_norm": 1.8057489027771978, + "learning_rate": 6.700735726163621e-06, + "loss": 0.4959, + "step": 5193 + }, + { + "epoch": 0.41, + "grad_norm": 1.5028535403761758, + "learning_rate": 6.6995397086981275e-06, + "loss": 0.464, + "step": 5194 + }, + { + "epoch": 0.41, + "grad_norm": 1.7377697380807429, + "learning_rate": 6.698343581274639e-06, + "loss": 0.4633, + "step": 5195 + }, + { + "epoch": 0.41, + "grad_norm": 2.909377235500752, + "learning_rate": 6.6971473439705415e-06, + "loss": 0.5111, + "step": 5196 + }, + { + "epoch": 0.41, + "grad_norm": 1.7022296834058048, + "learning_rate": 6.695950996863232e-06, + "loss": 0.459, + "step": 5197 + }, + { + "epoch": 0.41, + "grad_norm": 1.4881022404527027, + "learning_rate": 6.694754540030111e-06, + "loss": 0.4255, + "step": 5198 + }, + { + "epoch": 0.41, + "grad_norm": 1.4060241611870998, + "learning_rate": 6.693557973548589e-06, + "loss": 0.4563, + "step": 5199 + }, + { + "epoch": 0.41, + "grad_norm": 1.5463512279335623, + "learning_rate": 6.69236129749608e-06, + "loss": 0.4563, + "step": 5200 + }, + { + "epoch": 0.41, + "grad_norm": 1.9699699615731752, + "learning_rate": 6.69116451195001e-06, + "loss": 0.4764, + "step": 5201 + }, + { + "epoch": 0.41, + "grad_norm": 0.597761175524961, + "learning_rate": 6.689967616987808e-06, + "loss": 0.5245, + "step": 5202 + }, + { + "epoch": 0.41, + "grad_norm": 1.641397434075564, + "learning_rate": 6.688770612686912e-06, + "loss": 0.4316, + "step": 5203 + }, + { + "epoch": 0.41, + "grad_norm": 1.7488819032200948, + "learning_rate": 6.687573499124766e-06, + "loss": 0.4898, + "step": 5204 + }, + { + "epoch": 0.41, + "grad_norm": 1.4509353322692586, + "learning_rate": 6.686376276378825e-06, + "loss": 0.414, + "step": 5205 + }, + { + "epoch": 0.41, + "grad_norm": 2.963802777426502, + "learning_rate": 6.685178944526543e-06, + "loss": 0.4406, + "step": 5206 + }, + { + "epoch": 0.41, + "grad_norm": 1.3961606867075198, + "learning_rate": 6.683981503645387e-06, + "loss": 0.4023, + "step": 5207 + }, + { + "epoch": 0.41, + "grad_norm": 2.0514652783941534, + "learning_rate": 6.682783953812832e-06, + "loss": 0.4219, + "step": 5208 + }, + { + "epoch": 0.41, + "grad_norm": 2.2103422495308553, + "learning_rate": 6.681586295106355e-06, + "loss": 0.4817, + "step": 5209 + }, + { + "epoch": 0.41, + "grad_norm": 0.6063825393558706, + "learning_rate": 6.680388527603447e-06, + "loss": 0.5349, + "step": 5210 + }, + { + "epoch": 0.41, + "grad_norm": 1.9857657937013642, + "learning_rate": 6.679190651381597e-06, + "loss": 0.4543, + "step": 5211 + }, + { + "epoch": 0.41, + "grad_norm": 1.761141163030882, + "learning_rate": 6.67799266651831e-06, + "loss": 0.4482, + "step": 5212 + }, + { + "epoch": 0.41, + "grad_norm": 1.4166420162947582, + "learning_rate": 6.67679457309109e-06, + "loss": 0.4591, + "step": 5213 + }, + { + "epoch": 0.41, + "grad_norm": 1.5497659793992562, + "learning_rate": 6.675596371177457e-06, + "loss": 0.5036, + "step": 5214 + }, + { + "epoch": 0.41, + "grad_norm": 1.6662252562868136, + "learning_rate": 6.674398060854931e-06, + "loss": 0.4409, + "step": 5215 + }, + { + "epoch": 0.41, + "grad_norm": 0.5831615225596714, + "learning_rate": 6.67319964220104e-06, + "loss": 0.5103, + "step": 5216 + }, + { + "epoch": 0.41, + "grad_norm": 1.391021813783733, + "learning_rate": 6.672001115293321e-06, + "loss": 0.438, + "step": 5217 + }, + { + "epoch": 0.41, + "grad_norm": 0.526564763541861, + "learning_rate": 6.670802480209318e-06, + "loss": 0.514, + "step": 5218 + }, + { + "epoch": 0.41, + "grad_norm": 0.5586349988447058, + "learning_rate": 6.66960373702658e-06, + "loss": 0.4968, + "step": 5219 + }, + { + "epoch": 0.41, + "grad_norm": 1.9941590817687156, + "learning_rate": 6.668404885822663e-06, + "loss": 0.4827, + "step": 5220 + }, + { + "epoch": 0.41, + "grad_norm": 1.5054471459826764, + "learning_rate": 6.667205926675134e-06, + "loss": 0.5138, + "step": 5221 + }, + { + "epoch": 0.41, + "grad_norm": 2.755948983894883, + "learning_rate": 6.666006859661562e-06, + "loss": 0.4665, + "step": 5222 + }, + { + "epoch": 0.41, + "grad_norm": 1.699843234843785, + "learning_rate": 6.6648076848595255e-06, + "loss": 0.4854, + "step": 5223 + }, + { + "epoch": 0.41, + "grad_norm": 0.546433115047816, + "learning_rate": 6.663608402346611e-06, + "loss": 0.5128, + "step": 5224 + }, + { + "epoch": 0.41, + "grad_norm": 3.960047049284606, + "learning_rate": 6.662409012200406e-06, + "loss": 0.4988, + "step": 5225 + }, + { + "epoch": 0.41, + "grad_norm": 1.8118074402567652, + "learning_rate": 6.661209514498515e-06, + "loss": 0.4745, + "step": 5226 + }, + { + "epoch": 0.41, + "grad_norm": 3.479324669206233, + "learning_rate": 6.66000990931854e-06, + "loss": 0.4756, + "step": 5227 + }, + { + "epoch": 0.41, + "grad_norm": 2.200758377931382, + "learning_rate": 6.658810196738097e-06, + "loss": 0.4749, + "step": 5228 + }, + { + "epoch": 0.41, + "grad_norm": 0.5939484553061328, + "learning_rate": 6.657610376834802e-06, + "loss": 0.5059, + "step": 5229 + }, + { + "epoch": 0.41, + "grad_norm": 1.736398109432895, + "learning_rate": 6.656410449686287e-06, + "loss": 0.4595, + "step": 5230 + }, + { + "epoch": 0.41, + "grad_norm": 1.7543401167212418, + "learning_rate": 6.655210415370181e-06, + "loss": 0.4557, + "step": 5231 + }, + { + "epoch": 0.41, + "grad_norm": 0.5575651431699798, + "learning_rate": 6.654010273964128e-06, + "loss": 0.4973, + "step": 5232 + }, + { + "epoch": 0.41, + "grad_norm": 1.8723826771564962, + "learning_rate": 6.652810025545772e-06, + "loss": 0.4687, + "step": 5233 + }, + { + "epoch": 0.41, + "grad_norm": 1.3513624426297146, + "learning_rate": 6.65160967019277e-06, + "loss": 0.4775, + "step": 5234 + }, + { + "epoch": 0.41, + "grad_norm": 1.3761604741248672, + "learning_rate": 6.6504092079827835e-06, + "loss": 0.4067, + "step": 5235 + }, + { + "epoch": 0.41, + "grad_norm": 1.6210921615464615, + "learning_rate": 6.64920863899348e-06, + "loss": 0.429, + "step": 5236 + }, + { + "epoch": 0.41, + "grad_norm": 0.5851166777395179, + "learning_rate": 6.648007963302534e-06, + "loss": 0.4991, + "step": 5237 + }, + { + "epoch": 0.41, + "grad_norm": 0.5427329433971466, + "learning_rate": 6.64680718098763e-06, + "loss": 0.4966, + "step": 5238 + }, + { + "epoch": 0.41, + "grad_norm": 1.2799983931939845, + "learning_rate": 6.645606292126455e-06, + "loss": 0.4487, + "step": 5239 + }, + { + "epoch": 0.41, + "grad_norm": 2.0208444968772286, + "learning_rate": 6.6444052967967065e-06, + "loss": 0.4839, + "step": 5240 + }, + { + "epoch": 0.41, + "grad_norm": 0.5992421661935375, + "learning_rate": 6.643204195076085e-06, + "loss": 0.5105, + "step": 5241 + }, + { + "epoch": 0.41, + "grad_norm": 6.017277193376594, + "learning_rate": 6.642002987042302e-06, + "loss": 0.4654, + "step": 5242 + }, + { + "epoch": 0.41, + "grad_norm": 0.5396442439280308, + "learning_rate": 6.640801672773075e-06, + "loss": 0.501, + "step": 5243 + }, + { + "epoch": 0.41, + "grad_norm": 2.740682832848836, + "learning_rate": 6.639600252346125e-06, + "loss": 0.5142, + "step": 5244 + }, + { + "epoch": 0.41, + "grad_norm": 1.7307889081335663, + "learning_rate": 6.6383987258391845e-06, + "loss": 0.4415, + "step": 5245 + }, + { + "epoch": 0.41, + "grad_norm": 0.5671936834847462, + "learning_rate": 6.637197093329989e-06, + "loss": 0.5231, + "step": 5246 + }, + { + "epoch": 0.41, + "grad_norm": 1.922200118862228, + "learning_rate": 6.635995354896283e-06, + "loss": 0.4734, + "step": 5247 + }, + { + "epoch": 0.41, + "grad_norm": 1.7372505467485393, + "learning_rate": 6.634793510615818e-06, + "loss": 0.4884, + "step": 5248 + }, + { + "epoch": 0.41, + "grad_norm": 0.587409568288592, + "learning_rate": 6.633591560566353e-06, + "loss": 0.5274, + "step": 5249 + }, + { + "epoch": 0.41, + "grad_norm": 2.063742516692561, + "learning_rate": 6.632389504825648e-06, + "loss": 0.4967, + "step": 5250 + }, + { + "epoch": 0.41, + "grad_norm": 0.5391491113973162, + "learning_rate": 6.63118734347148e-06, + "loss": 0.4887, + "step": 5251 + }, + { + "epoch": 0.41, + "grad_norm": 0.5779026210640987, + "learning_rate": 6.629985076581624e-06, + "loss": 0.5211, + "step": 5252 + }, + { + "epoch": 0.41, + "grad_norm": 2.3347179266058666, + "learning_rate": 6.628782704233866e-06, + "loss": 0.463, + "step": 5253 + }, + { + "epoch": 0.41, + "grad_norm": 2.2768839522521973, + "learning_rate": 6.627580226505996e-06, + "loss": 0.4911, + "step": 5254 + }, + { + "epoch": 0.41, + "grad_norm": 1.5584399625989114, + "learning_rate": 6.6263776434758175e-06, + "loss": 0.4097, + "step": 5255 + }, + { + "epoch": 0.41, + "grad_norm": 1.411129221149787, + "learning_rate": 6.625174955221131e-06, + "loss": 0.4974, + "step": 5256 + }, + { + "epoch": 0.41, + "grad_norm": 2.4136155528964847, + "learning_rate": 6.6239721618197514e-06, + "loss": 0.5212, + "step": 5257 + }, + { + "epoch": 0.41, + "grad_norm": 1.4301267165640956, + "learning_rate": 6.622769263349496e-06, + "loss": 0.5012, + "step": 5258 + }, + { + "epoch": 0.41, + "grad_norm": 2.6688177322483075, + "learning_rate": 6.6215662598881945e-06, + "loss": 0.4833, + "step": 5259 + }, + { + "epoch": 0.41, + "grad_norm": 1.7555760424823563, + "learning_rate": 6.6203631515136755e-06, + "loss": 0.4939, + "step": 5260 + }, + { + "epoch": 0.41, + "grad_norm": 1.8369715850367871, + "learning_rate": 6.619159938303782e-06, + "loss": 0.5148, + "step": 5261 + }, + { + "epoch": 0.41, + "grad_norm": 1.9098858334756308, + "learning_rate": 6.6179566203363565e-06, + "loss": 0.4726, + "step": 5262 + }, + { + "epoch": 0.41, + "grad_norm": 1.888864449626197, + "learning_rate": 6.616753197689256e-06, + "loss": 0.4321, + "step": 5263 + }, + { + "epoch": 0.41, + "grad_norm": 1.4541339572193697, + "learning_rate": 6.615549670440337e-06, + "loss": 0.476, + "step": 5264 + }, + { + "epoch": 0.41, + "grad_norm": 1.411810043478361, + "learning_rate": 6.614346038667471e-06, + "loss": 0.4726, + "step": 5265 + }, + { + "epoch": 0.41, + "grad_norm": 1.5780077938374408, + "learning_rate": 6.613142302448525e-06, + "loss": 0.4777, + "step": 5266 + }, + { + "epoch": 0.41, + "grad_norm": 1.5359915189862599, + "learning_rate": 6.611938461861383e-06, + "loss": 0.4676, + "step": 5267 + }, + { + "epoch": 0.41, + "grad_norm": 2.3098969401085845, + "learning_rate": 6.610734516983931e-06, + "loss": 0.4704, + "step": 5268 + }, + { + "epoch": 0.41, + "grad_norm": 1.9388946370652589, + "learning_rate": 6.609530467894064e-06, + "loss": 0.5171, + "step": 5269 + }, + { + "epoch": 0.41, + "grad_norm": 1.4387926611369306, + "learning_rate": 6.60832631466968e-06, + "loss": 0.4756, + "step": 5270 + }, + { + "epoch": 0.41, + "grad_norm": 1.5937910201281864, + "learning_rate": 6.607122057388687e-06, + "loss": 0.4757, + "step": 5271 + }, + { + "epoch": 0.41, + "grad_norm": 1.5187717467205835, + "learning_rate": 6.605917696129001e-06, + "loss": 0.4349, + "step": 5272 + }, + { + "epoch": 0.41, + "grad_norm": 1.8199663979216534, + "learning_rate": 6.60471323096854e-06, + "loss": 0.451, + "step": 5273 + }, + { + "epoch": 0.41, + "grad_norm": 2.2766743801891653, + "learning_rate": 6.603508661985232e-06, + "loss": 0.4624, + "step": 5274 + }, + { + "epoch": 0.41, + "grad_norm": 1.2386623156256038, + "learning_rate": 6.60230398925701e-06, + "loss": 0.4799, + "step": 5275 + }, + { + "epoch": 0.41, + "grad_norm": 2.0009454311597774, + "learning_rate": 6.601099212861818e-06, + "loss": 0.4858, + "step": 5276 + }, + { + "epoch": 0.41, + "grad_norm": 1.463096070489778, + "learning_rate": 6.5998943328776e-06, + "loss": 0.5172, + "step": 5277 + }, + { + "epoch": 0.41, + "grad_norm": 1.4287524305650479, + "learning_rate": 6.598689349382314e-06, + "loss": 0.4835, + "step": 5278 + }, + { + "epoch": 0.41, + "grad_norm": 1.6308057430548781, + "learning_rate": 6.597484262453916e-06, + "loss": 0.5191, + "step": 5279 + }, + { + "epoch": 0.41, + "grad_norm": 1.9207606236648558, + "learning_rate": 6.5962790721703775e-06, + "loss": 0.5359, + "step": 5280 + }, + { + "epoch": 0.41, + "grad_norm": 2.425728682358341, + "learning_rate": 6.59507377860967e-06, + "loss": 0.5206, + "step": 5281 + }, + { + "epoch": 0.41, + "grad_norm": 1.771678815162885, + "learning_rate": 6.593868381849775e-06, + "loss": 0.4898, + "step": 5282 + }, + { + "epoch": 0.41, + "grad_norm": 1.933414330745915, + "learning_rate": 6.592662881968681e-06, + "loss": 0.5507, + "step": 5283 + }, + { + "epoch": 0.41, + "grad_norm": 1.9457655182237008, + "learning_rate": 6.591457279044385e-06, + "loss": 0.4271, + "step": 5284 + }, + { + "epoch": 0.42, + "grad_norm": 2.1620689322387205, + "learning_rate": 6.5902515731548815e-06, + "loss": 0.4714, + "step": 5285 + }, + { + "epoch": 0.42, + "grad_norm": 0.6473865175117831, + "learning_rate": 6.589045764378184e-06, + "loss": 0.5167, + "step": 5286 + }, + { + "epoch": 0.42, + "grad_norm": 1.9778585632012742, + "learning_rate": 6.587839852792302e-06, + "loss": 0.5053, + "step": 5287 + }, + { + "epoch": 0.42, + "grad_norm": 0.5641387882618306, + "learning_rate": 6.586633838475261e-06, + "loss": 0.5169, + "step": 5288 + }, + { + "epoch": 0.42, + "grad_norm": 1.7548141676848115, + "learning_rate": 6.585427721505085e-06, + "loss": 0.4599, + "step": 5289 + }, + { + "epoch": 0.42, + "grad_norm": 2.2719856833380287, + "learning_rate": 6.584221501959809e-06, + "loss": 0.4932, + "step": 5290 + }, + { + "epoch": 0.42, + "grad_norm": 0.5727632796900416, + "learning_rate": 6.583015179917474e-06, + "loss": 0.5041, + "step": 5291 + }, + { + "epoch": 0.42, + "grad_norm": 1.65048949493738, + "learning_rate": 6.581808755456128e-06, + "loss": 0.5053, + "step": 5292 + }, + { + "epoch": 0.42, + "grad_norm": 2.1164233999770112, + "learning_rate": 6.580602228653825e-06, + "loss": 0.4655, + "step": 5293 + }, + { + "epoch": 0.42, + "grad_norm": 1.5841200522136643, + "learning_rate": 6.579395599588626e-06, + "loss": 0.4484, + "step": 5294 + }, + { + "epoch": 0.42, + "grad_norm": 2.3568511714815754, + "learning_rate": 6.578188868338598e-06, + "loss": 0.5029, + "step": 5295 + }, + { + "epoch": 0.42, + "grad_norm": 1.437090413765763, + "learning_rate": 6.576982034981813e-06, + "loss": 0.4519, + "step": 5296 + }, + { + "epoch": 0.42, + "grad_norm": 1.5233526880491428, + "learning_rate": 6.5757750995963554e-06, + "loss": 0.4801, + "step": 5297 + }, + { + "epoch": 0.42, + "grad_norm": 1.875895917317297, + "learning_rate": 6.574568062260309e-06, + "loss": 0.5171, + "step": 5298 + }, + { + "epoch": 0.42, + "grad_norm": 1.5760694706835727, + "learning_rate": 6.573360923051769e-06, + "loss": 0.4862, + "step": 5299 + }, + { + "epoch": 0.42, + "grad_norm": 1.7687858629149067, + "learning_rate": 6.572153682048836e-06, + "loss": 0.4831, + "step": 5300 + }, + { + "epoch": 0.42, + "grad_norm": 2.1569142342910905, + "learning_rate": 6.570946339329616e-06, + "loss": 0.4591, + "step": 5301 + }, + { + "epoch": 0.42, + "grad_norm": 1.482786752980439, + "learning_rate": 6.569738894972224e-06, + "loss": 0.405, + "step": 5302 + }, + { + "epoch": 0.42, + "grad_norm": 2.1344542815890306, + "learning_rate": 6.568531349054778e-06, + "loss": 0.4795, + "step": 5303 + }, + { + "epoch": 0.42, + "grad_norm": 1.6999725417085605, + "learning_rate": 6.567323701655404e-06, + "loss": 0.5025, + "step": 5304 + }, + { + "epoch": 0.42, + "grad_norm": 1.6614747082405472, + "learning_rate": 6.566115952852238e-06, + "loss": 0.4711, + "step": 5305 + }, + { + "epoch": 0.42, + "grad_norm": 1.3139225507327146, + "learning_rate": 6.56490810272342e-06, + "loss": 0.4442, + "step": 5306 + }, + { + "epoch": 0.42, + "grad_norm": 1.5445321222576824, + "learning_rate": 6.5637001513470935e-06, + "loss": 0.493, + "step": 5307 + }, + { + "epoch": 0.42, + "grad_norm": 1.8299974680926638, + "learning_rate": 6.562492098801413e-06, + "loss": 0.4764, + "step": 5308 + }, + { + "epoch": 0.42, + "grad_norm": 1.9680165944513737, + "learning_rate": 6.561283945164537e-06, + "loss": 0.4323, + "step": 5309 + }, + { + "epoch": 0.42, + "grad_norm": 1.6462097817510635, + "learning_rate": 6.560075690514633e-06, + "loss": 0.4752, + "step": 5310 + }, + { + "epoch": 0.42, + "grad_norm": 1.1826481540266487, + "learning_rate": 6.558867334929872e-06, + "loss": 0.4478, + "step": 5311 + }, + { + "epoch": 0.42, + "grad_norm": 1.6852416771914756, + "learning_rate": 6.557658878488436e-06, + "loss": 0.4224, + "step": 5312 + }, + { + "epoch": 0.42, + "grad_norm": 0.6299310770541685, + "learning_rate": 6.556450321268506e-06, + "loss": 0.5314, + "step": 5313 + }, + { + "epoch": 0.42, + "grad_norm": 2.036400608746328, + "learning_rate": 6.5552416633482775e-06, + "loss": 0.4992, + "step": 5314 + }, + { + "epoch": 0.42, + "grad_norm": 2.1143118095249744, + "learning_rate": 6.554032904805946e-06, + "loss": 0.4717, + "step": 5315 + }, + { + "epoch": 0.42, + "grad_norm": 1.8518101066286656, + "learning_rate": 6.55282404571972e-06, + "loss": 0.4588, + "step": 5316 + }, + { + "epoch": 0.42, + "grad_norm": 1.6313454848609727, + "learning_rate": 6.55161508616781e-06, + "loss": 0.5028, + "step": 5317 + }, + { + "epoch": 0.42, + "grad_norm": 1.515708328733282, + "learning_rate": 6.550406026228432e-06, + "loss": 0.4457, + "step": 5318 + }, + { + "epoch": 0.42, + "grad_norm": 2.263813345051239, + "learning_rate": 6.5491968659798145e-06, + "loss": 0.5072, + "step": 5319 + }, + { + "epoch": 0.42, + "grad_norm": 1.876819127007386, + "learning_rate": 6.547987605500184e-06, + "loss": 0.4766, + "step": 5320 + }, + { + "epoch": 0.42, + "grad_norm": 1.7770001998958938, + "learning_rate": 6.546778244867782e-06, + "loss": 0.5351, + "step": 5321 + }, + { + "epoch": 0.42, + "grad_norm": 0.6175541167199458, + "learning_rate": 6.545568784160851e-06, + "loss": 0.5217, + "step": 5322 + }, + { + "epoch": 0.42, + "grad_norm": 2.2617203316927417, + "learning_rate": 6.544359223457641e-06, + "loss": 0.4491, + "step": 5323 + }, + { + "epoch": 0.42, + "grad_norm": 0.5917743874195696, + "learning_rate": 6.5431495628364095e-06, + "loss": 0.5413, + "step": 5324 + }, + { + "epoch": 0.42, + "grad_norm": 1.837365120519526, + "learning_rate": 6.541939802375421e-06, + "loss": 0.5263, + "step": 5325 + }, + { + "epoch": 0.42, + "grad_norm": 1.5039961241545716, + "learning_rate": 6.5407299421529414e-06, + "loss": 0.4637, + "step": 5326 + }, + { + "epoch": 0.42, + "grad_norm": 1.7788505944592925, + "learning_rate": 6.539519982247254e-06, + "loss": 0.4729, + "step": 5327 + }, + { + "epoch": 0.42, + "grad_norm": 1.5125228470553895, + "learning_rate": 6.538309922736634e-06, + "loss": 0.5053, + "step": 5328 + }, + { + "epoch": 0.42, + "grad_norm": 1.5942237860402042, + "learning_rate": 6.537099763699377e-06, + "loss": 0.4279, + "step": 5329 + }, + { + "epoch": 0.42, + "grad_norm": 2.178336226953759, + "learning_rate": 6.535889505213775e-06, + "loss": 0.4654, + "step": 5330 + }, + { + "epoch": 0.42, + "grad_norm": 1.9946561647837753, + "learning_rate": 6.5346791473581325e-06, + "loss": 0.4871, + "step": 5331 + }, + { + "epoch": 0.42, + "grad_norm": 1.7384181577954418, + "learning_rate": 6.533468690210756e-06, + "loss": 0.4669, + "step": 5332 + }, + { + "epoch": 0.42, + "grad_norm": 1.5288381330184222, + "learning_rate": 6.532258133849961e-06, + "loss": 0.4491, + "step": 5333 + }, + { + "epoch": 0.42, + "grad_norm": 2.2078506868842944, + "learning_rate": 6.5310474783540685e-06, + "loss": 0.4167, + "step": 5334 + }, + { + "epoch": 0.42, + "grad_norm": 1.8690360608538297, + "learning_rate": 6.529836723801408e-06, + "loss": 0.513, + "step": 5335 + }, + { + "epoch": 0.42, + "grad_norm": 1.9867303981920907, + "learning_rate": 6.528625870270313e-06, + "loss": 0.5519, + "step": 5336 + }, + { + "epoch": 0.42, + "grad_norm": 0.6912094305634301, + "learning_rate": 6.527414917839124e-06, + "loss": 0.5289, + "step": 5337 + }, + { + "epoch": 0.42, + "grad_norm": 2.2654142858698214, + "learning_rate": 6.526203866586186e-06, + "loss": 0.4911, + "step": 5338 + }, + { + "epoch": 0.42, + "grad_norm": 2.000422958227543, + "learning_rate": 6.524992716589856e-06, + "loss": 0.4646, + "step": 5339 + }, + { + "epoch": 0.42, + "grad_norm": 1.8090904216664347, + "learning_rate": 6.523781467928492e-06, + "loss": 0.4477, + "step": 5340 + }, + { + "epoch": 0.42, + "grad_norm": 1.7121601333964902, + "learning_rate": 6.522570120680461e-06, + "loss": 0.4399, + "step": 5341 + }, + { + "epoch": 0.42, + "grad_norm": 2.246936662024646, + "learning_rate": 6.521358674924133e-06, + "loss": 0.5028, + "step": 5342 + }, + { + "epoch": 0.42, + "grad_norm": 2.1021683176989185, + "learning_rate": 6.52014713073789e-06, + "loss": 0.4877, + "step": 5343 + }, + { + "epoch": 0.42, + "grad_norm": 1.5432548823354566, + "learning_rate": 6.518935488200118e-06, + "loss": 0.5281, + "step": 5344 + }, + { + "epoch": 0.42, + "grad_norm": 1.529651949752004, + "learning_rate": 6.517723747389205e-06, + "loss": 0.511, + "step": 5345 + }, + { + "epoch": 0.42, + "grad_norm": 1.4498331979024621, + "learning_rate": 6.5165119083835515e-06, + "loss": 0.4922, + "step": 5346 + }, + { + "epoch": 0.42, + "grad_norm": 1.6075341352276735, + "learning_rate": 6.51529997126156e-06, + "loss": 0.4703, + "step": 5347 + }, + { + "epoch": 0.42, + "grad_norm": 0.6559053541873918, + "learning_rate": 6.514087936101645e-06, + "loss": 0.5036, + "step": 5348 + }, + { + "epoch": 0.42, + "grad_norm": 2.070039876123479, + "learning_rate": 6.5128758029822194e-06, + "loss": 0.4681, + "step": 5349 + }, + { + "epoch": 0.42, + "grad_norm": 1.6360454928297283, + "learning_rate": 6.511663571981708e-06, + "loss": 0.499, + "step": 5350 + }, + { + "epoch": 0.42, + "grad_norm": 0.6297642183020937, + "learning_rate": 6.510451243178543e-06, + "loss": 0.5109, + "step": 5351 + }, + { + "epoch": 0.42, + "grad_norm": 2.779766592004158, + "learning_rate": 6.509238816651158e-06, + "loss": 0.4191, + "step": 5352 + }, + { + "epoch": 0.42, + "grad_norm": 1.6726633523759864, + "learning_rate": 6.508026292477995e-06, + "loss": 0.5079, + "step": 5353 + }, + { + "epoch": 0.42, + "grad_norm": 2.391964634391703, + "learning_rate": 6.506813670737504e-06, + "loss": 0.473, + "step": 5354 + }, + { + "epoch": 0.42, + "grad_norm": 1.6122790936392446, + "learning_rate": 6.505600951508138e-06, + "loss": 0.4965, + "step": 5355 + }, + { + "epoch": 0.42, + "grad_norm": 1.2977964913574578, + "learning_rate": 6.504388134868363e-06, + "loss": 0.4407, + "step": 5356 + }, + { + "epoch": 0.42, + "grad_norm": 1.6650193588750146, + "learning_rate": 6.503175220896642e-06, + "loss": 0.4293, + "step": 5357 + }, + { + "epoch": 0.42, + "grad_norm": 1.5132659586560648, + "learning_rate": 6.501962209671452e-06, + "loss": 0.4958, + "step": 5358 + }, + { + "epoch": 0.42, + "grad_norm": 1.6411323926370633, + "learning_rate": 6.500749101271271e-06, + "loss": 0.4787, + "step": 5359 + }, + { + "epoch": 0.42, + "grad_norm": 1.8139084435376154, + "learning_rate": 6.499535895774585e-06, + "loss": 0.481, + "step": 5360 + }, + { + "epoch": 0.42, + "grad_norm": 2.7927433607015058, + "learning_rate": 6.498322593259889e-06, + "loss": 0.4686, + "step": 5361 + }, + { + "epoch": 0.42, + "grad_norm": 1.594899966260172, + "learning_rate": 6.4971091938056814e-06, + "loss": 0.4523, + "step": 5362 + }, + { + "epoch": 0.42, + "grad_norm": 1.6789353114558476, + "learning_rate": 6.495895697490468e-06, + "loss": 0.5064, + "step": 5363 + }, + { + "epoch": 0.42, + "grad_norm": 1.8416821021905123, + "learning_rate": 6.49468210439276e-06, + "loss": 0.4841, + "step": 5364 + }, + { + "epoch": 0.42, + "grad_norm": 1.3455532845026175, + "learning_rate": 6.4934684145910746e-06, + "loss": 0.4255, + "step": 5365 + }, + { + "epoch": 0.42, + "grad_norm": 1.6586766409149718, + "learning_rate": 6.492254628163936e-06, + "loss": 0.473, + "step": 5366 + }, + { + "epoch": 0.42, + "grad_norm": 1.8194903298709044, + "learning_rate": 6.491040745189876e-06, + "loss": 0.5127, + "step": 5367 + }, + { + "epoch": 0.42, + "grad_norm": 1.710576635559154, + "learning_rate": 6.489826765747431e-06, + "loss": 0.4663, + "step": 5368 + }, + { + "epoch": 0.42, + "grad_norm": 1.7655998802542583, + "learning_rate": 6.488612689915142e-06, + "loss": 0.4901, + "step": 5369 + }, + { + "epoch": 0.42, + "grad_norm": 1.9827478962400888, + "learning_rate": 6.487398517771559e-06, + "loss": 0.4103, + "step": 5370 + }, + { + "epoch": 0.42, + "grad_norm": 1.564186866570126, + "learning_rate": 6.486184249395237e-06, + "loss": 0.4719, + "step": 5371 + }, + { + "epoch": 0.42, + "grad_norm": 2.463652212551592, + "learning_rate": 6.48496988486474e-06, + "loss": 0.4733, + "step": 5372 + }, + { + "epoch": 0.42, + "grad_norm": 0.7569371301315814, + "learning_rate": 6.483755424258633e-06, + "loss": 0.5145, + "step": 5373 + }, + { + "epoch": 0.42, + "grad_norm": 0.6281449689526085, + "learning_rate": 6.482540867655492e-06, + "loss": 0.4906, + "step": 5374 + }, + { + "epoch": 0.42, + "grad_norm": 2.180277332686683, + "learning_rate": 6.481326215133895e-06, + "loss": 0.4634, + "step": 5375 + }, + { + "epoch": 0.42, + "grad_norm": 2.64258677886941, + "learning_rate": 6.480111466772432e-06, + "loss": 0.4356, + "step": 5376 + }, + { + "epoch": 0.42, + "grad_norm": 1.5870049005854123, + "learning_rate": 6.478896622649691e-06, + "loss": 0.4736, + "step": 5377 + }, + { + "epoch": 0.42, + "grad_norm": 2.3835406955797693, + "learning_rate": 6.477681682844274e-06, + "loss": 0.5535, + "step": 5378 + }, + { + "epoch": 0.42, + "grad_norm": 2.3742857725644213, + "learning_rate": 6.476466647434785e-06, + "loss": 0.4915, + "step": 5379 + }, + { + "epoch": 0.42, + "grad_norm": 1.7701432064110176, + "learning_rate": 6.475251516499836e-06, + "loss": 0.4429, + "step": 5380 + }, + { + "epoch": 0.42, + "grad_norm": 2.482609484275228, + "learning_rate": 6.474036290118042e-06, + "loss": 0.4826, + "step": 5381 + }, + { + "epoch": 0.42, + "grad_norm": 2.613001036561738, + "learning_rate": 6.47282096836803e-06, + "loss": 0.4615, + "step": 5382 + }, + { + "epoch": 0.42, + "grad_norm": 1.482729642100069, + "learning_rate": 6.471605551328427e-06, + "loss": 0.4897, + "step": 5383 + }, + { + "epoch": 0.42, + "grad_norm": 1.8902905676002197, + "learning_rate": 6.470390039077871e-06, + "loss": 0.4966, + "step": 5384 + }, + { + "epoch": 0.42, + "grad_norm": 1.5618013963087183, + "learning_rate": 6.469174431695002e-06, + "loss": 0.4828, + "step": 5385 + }, + { + "epoch": 0.42, + "grad_norm": 1.3606008069397435, + "learning_rate": 6.46795872925847e-06, + "loss": 0.4494, + "step": 5386 + }, + { + "epoch": 0.42, + "grad_norm": 1.4140319006124789, + "learning_rate": 6.466742931846927e-06, + "loss": 0.4279, + "step": 5387 + }, + { + "epoch": 0.42, + "grad_norm": 1.887975435962841, + "learning_rate": 6.465527039539038e-06, + "loss": 0.4491, + "step": 5388 + }, + { + "epoch": 0.42, + "grad_norm": 1.3690906698575989, + "learning_rate": 6.464311052413465e-06, + "loss": 0.5372, + "step": 5389 + }, + { + "epoch": 0.42, + "grad_norm": 2.516485037980026, + "learning_rate": 6.463094970548881e-06, + "loss": 0.4696, + "step": 5390 + }, + { + "epoch": 0.42, + "grad_norm": 5.94572036360271, + "learning_rate": 6.461878794023968e-06, + "loss": 0.4867, + "step": 5391 + }, + { + "epoch": 0.42, + "grad_norm": 1.570434607802137, + "learning_rate": 6.4606625229174096e-06, + "loss": 0.453, + "step": 5392 + }, + { + "epoch": 0.42, + "grad_norm": 1.9152710174430152, + "learning_rate": 6.459446157307896e-06, + "loss": 0.4733, + "step": 5393 + }, + { + "epoch": 0.42, + "grad_norm": 1.607545940308983, + "learning_rate": 6.458229697274125e-06, + "loss": 0.4425, + "step": 5394 + }, + { + "epoch": 0.42, + "grad_norm": 0.6382126754714637, + "learning_rate": 6.457013142894801e-06, + "loss": 0.5102, + "step": 5395 + }, + { + "epoch": 0.42, + "grad_norm": 2.889116308700257, + "learning_rate": 6.4557964942486316e-06, + "loss": 0.4291, + "step": 5396 + }, + { + "epoch": 0.42, + "grad_norm": 1.9242515833245408, + "learning_rate": 6.4545797514143346e-06, + "loss": 0.4874, + "step": 5397 + }, + { + "epoch": 0.42, + "grad_norm": 1.759806061622904, + "learning_rate": 6.45336291447063e-06, + "loss": 0.5048, + "step": 5398 + }, + { + "epoch": 0.42, + "grad_norm": 1.9351823058640119, + "learning_rate": 6.452145983496247e-06, + "loss": 0.4016, + "step": 5399 + }, + { + "epoch": 0.42, + "grad_norm": 7.693345991693742, + "learning_rate": 6.450928958569917e-06, + "loss": 0.4443, + "step": 5400 + }, + { + "epoch": 0.42, + "grad_norm": 14.334153433934874, + "learning_rate": 6.449711839770383e-06, + "loss": 0.4628, + "step": 5401 + }, + { + "epoch": 0.42, + "grad_norm": 1.9770433544452197, + "learning_rate": 6.448494627176388e-06, + "loss": 0.447, + "step": 5402 + }, + { + "epoch": 0.42, + "grad_norm": 1.9372612270561234, + "learning_rate": 6.4472773208666875e-06, + "loss": 0.4052, + "step": 5403 + }, + { + "epoch": 0.42, + "grad_norm": 1.7769968364022821, + "learning_rate": 6.446059920920036e-06, + "loss": 0.441, + "step": 5404 + }, + { + "epoch": 0.42, + "grad_norm": 1.7900742652634223, + "learning_rate": 6.444842427415199e-06, + "loss": 0.4759, + "step": 5405 + }, + { + "epoch": 0.42, + "grad_norm": 1.627434109668974, + "learning_rate": 6.4436248404309466e-06, + "loss": 0.4658, + "step": 5406 + }, + { + "epoch": 0.42, + "grad_norm": 1.7354460892039711, + "learning_rate": 6.442407160046057e-06, + "loss": 0.5239, + "step": 5407 + }, + { + "epoch": 0.42, + "grad_norm": 1.4531372123604291, + "learning_rate": 6.44118938633931e-06, + "loss": 0.5075, + "step": 5408 + }, + { + "epoch": 0.42, + "grad_norm": 1.794227252127523, + "learning_rate": 6.439971519389496e-06, + "loss": 0.4784, + "step": 5409 + }, + { + "epoch": 0.42, + "grad_norm": 2.0431258220341673, + "learning_rate": 6.438753559275408e-06, + "loss": 0.446, + "step": 5410 + }, + { + "epoch": 0.42, + "grad_norm": 1.6328198358822974, + "learning_rate": 6.437535506075847e-06, + "loss": 0.4754, + "step": 5411 + }, + { + "epoch": 0.43, + "grad_norm": 1.3296987772966862, + "learning_rate": 6.43631735986962e-06, + "loss": 0.4424, + "step": 5412 + }, + { + "epoch": 0.43, + "grad_norm": 1.6275245300364432, + "learning_rate": 6.435099120735537e-06, + "loss": 0.4711, + "step": 5413 + }, + { + "epoch": 0.43, + "grad_norm": 1.5364878368741093, + "learning_rate": 6.433880788752419e-06, + "loss": 0.4527, + "step": 5414 + }, + { + "epoch": 0.43, + "grad_norm": 2.5503886791229595, + "learning_rate": 6.43266236399909e-06, + "loss": 0.4588, + "step": 5415 + }, + { + "epoch": 0.43, + "grad_norm": 1.9296968599938478, + "learning_rate": 6.43144384655438e-06, + "loss": 0.5451, + "step": 5416 + }, + { + "epoch": 0.43, + "grad_norm": 1.5525208922518552, + "learning_rate": 6.430225236497125e-06, + "loss": 0.4384, + "step": 5417 + }, + { + "epoch": 0.43, + "grad_norm": 1.5080603647950124, + "learning_rate": 6.429006533906168e-06, + "loss": 0.4749, + "step": 5418 + }, + { + "epoch": 0.43, + "grad_norm": 0.9036898854974603, + "learning_rate": 6.42778773886036e-06, + "loss": 0.5042, + "step": 5419 + }, + { + "epoch": 0.43, + "grad_norm": 1.8785341207331518, + "learning_rate": 6.426568851438551e-06, + "loss": 0.4628, + "step": 5420 + }, + { + "epoch": 0.43, + "grad_norm": 8.365271394342399, + "learning_rate": 6.425349871719604e-06, + "loss": 0.4257, + "step": 5421 + }, + { + "epoch": 0.43, + "grad_norm": 2.195017687174971, + "learning_rate": 6.424130799782385e-06, + "loss": 0.4394, + "step": 5422 + }, + { + "epoch": 0.43, + "grad_norm": 1.7302862450406908, + "learning_rate": 6.422911635705766e-06, + "loss": 0.4927, + "step": 5423 + }, + { + "epoch": 0.43, + "grad_norm": 2.498544750431292, + "learning_rate": 6.421692379568626e-06, + "loss": 0.4912, + "step": 5424 + }, + { + "epoch": 0.43, + "grad_norm": 1.4687744039618424, + "learning_rate": 6.420473031449849e-06, + "loss": 0.4344, + "step": 5425 + }, + { + "epoch": 0.43, + "grad_norm": 2.3812623883392923, + "learning_rate": 6.419253591428325e-06, + "loss": 0.4757, + "step": 5426 + }, + { + "epoch": 0.43, + "grad_norm": 2.5345256421823, + "learning_rate": 6.4180340595829495e-06, + "loss": 0.4643, + "step": 5427 + }, + { + "epoch": 0.43, + "grad_norm": 1.6472825213474096, + "learning_rate": 6.416814435992625e-06, + "loss": 0.4671, + "step": 5428 + }, + { + "epoch": 0.43, + "grad_norm": 1.598756856317711, + "learning_rate": 6.415594720736261e-06, + "loss": 0.4686, + "step": 5429 + }, + { + "epoch": 0.43, + "grad_norm": 1.5361462315551706, + "learning_rate": 6.41437491389277e-06, + "loss": 0.4527, + "step": 5430 + }, + { + "epoch": 0.43, + "grad_norm": 2.151113449173148, + "learning_rate": 6.413155015541073e-06, + "loss": 0.478, + "step": 5431 + }, + { + "epoch": 0.43, + "grad_norm": 1.4327168674712347, + "learning_rate": 6.411935025760094e-06, + "loss": 0.4785, + "step": 5432 + }, + { + "epoch": 0.43, + "grad_norm": 2.5969422191599394, + "learning_rate": 6.410714944628766e-06, + "loss": 0.4838, + "step": 5433 + }, + { + "epoch": 0.43, + "grad_norm": 1.909595615074427, + "learning_rate": 6.4094947722260285e-06, + "loss": 0.4704, + "step": 5434 + }, + { + "epoch": 0.43, + "grad_norm": 1.4347264352479394, + "learning_rate": 6.4082745086308206e-06, + "loss": 0.4811, + "step": 5435 + }, + { + "epoch": 0.43, + "grad_norm": 2.0839511447097134, + "learning_rate": 6.407054153922096e-06, + "loss": 0.49, + "step": 5436 + }, + { + "epoch": 0.43, + "grad_norm": 0.721522911306308, + "learning_rate": 6.4058337081788066e-06, + "loss": 0.5112, + "step": 5437 + }, + { + "epoch": 0.43, + "grad_norm": 0.6242778700245335, + "learning_rate": 6.404613171479918e-06, + "loss": 0.5322, + "step": 5438 + }, + { + "epoch": 0.43, + "grad_norm": 1.9257484783815182, + "learning_rate": 6.403392543904391e-06, + "loss": 0.5217, + "step": 5439 + }, + { + "epoch": 0.43, + "grad_norm": 1.7644594287895383, + "learning_rate": 6.402171825531205e-06, + "loss": 0.4783, + "step": 5440 + }, + { + "epoch": 0.43, + "grad_norm": 0.5909360990160306, + "learning_rate": 6.400951016439334e-06, + "loss": 0.505, + "step": 5441 + }, + { + "epoch": 0.43, + "grad_norm": 1.6006419925270396, + "learning_rate": 6.3997301167077675e-06, + "loss": 0.4747, + "step": 5442 + }, + { + "epoch": 0.43, + "grad_norm": 1.991536195781363, + "learning_rate": 6.398509126415492e-06, + "loss": 0.4946, + "step": 5443 + }, + { + "epoch": 0.43, + "grad_norm": 0.631721951424862, + "learning_rate": 6.397288045641507e-06, + "loss": 0.4884, + "step": 5444 + }, + { + "epoch": 0.43, + "grad_norm": 2.2515637751974937, + "learning_rate": 6.396066874464811e-06, + "loss": 0.4945, + "step": 5445 + }, + { + "epoch": 0.43, + "grad_norm": 1.3422218229853164, + "learning_rate": 6.3948456129644165e-06, + "loss": 0.441, + "step": 5446 + }, + { + "epoch": 0.43, + "grad_norm": 1.668999912114585, + "learning_rate": 6.393624261219335e-06, + "loss": 0.4693, + "step": 5447 + }, + { + "epoch": 0.43, + "grad_norm": 1.6932412682056792, + "learning_rate": 6.392402819308586e-06, + "loss": 0.4271, + "step": 5448 + }, + { + "epoch": 0.43, + "grad_norm": 1.5900655826438592, + "learning_rate": 6.391181287311197e-06, + "loss": 0.4563, + "step": 5449 + }, + { + "epoch": 0.43, + "grad_norm": 0.6242586916004049, + "learning_rate": 6.389959665306198e-06, + "loss": 0.5138, + "step": 5450 + }, + { + "epoch": 0.43, + "grad_norm": 0.5915857087937059, + "learning_rate": 6.388737953372625e-06, + "loss": 0.4899, + "step": 5451 + }, + { + "epoch": 0.43, + "grad_norm": 1.506342712362207, + "learning_rate": 6.3875161515895265e-06, + "loss": 0.5074, + "step": 5452 + }, + { + "epoch": 0.43, + "grad_norm": 0.5708623882760382, + "learning_rate": 6.3862942600359465e-06, + "loss": 0.5133, + "step": 5453 + }, + { + "epoch": 0.43, + "grad_norm": 1.6527977500417101, + "learning_rate": 6.385072278790942e-06, + "loss": 0.4535, + "step": 5454 + }, + { + "epoch": 0.43, + "grad_norm": 2.1491974926709103, + "learning_rate": 6.383850207933573e-06, + "loss": 0.4822, + "step": 5455 + }, + { + "epoch": 0.43, + "grad_norm": 1.792414462685571, + "learning_rate": 6.382628047542907e-06, + "loss": 0.4792, + "step": 5456 + }, + { + "epoch": 0.43, + "grad_norm": 1.4688810450103786, + "learning_rate": 6.381405797698013e-06, + "loss": 0.4288, + "step": 5457 + }, + { + "epoch": 0.43, + "grad_norm": 1.8602533658437548, + "learning_rate": 6.380183458477972e-06, + "loss": 0.4962, + "step": 5458 + }, + { + "epoch": 0.43, + "grad_norm": 1.7216556573498454, + "learning_rate": 6.3789610299618654e-06, + "loss": 0.4636, + "step": 5459 + }, + { + "epoch": 0.43, + "grad_norm": 1.26115220474863, + "learning_rate": 6.377738512228785e-06, + "loss": 0.4467, + "step": 5460 + }, + { + "epoch": 0.43, + "grad_norm": 1.6294761832062299, + "learning_rate": 6.376515905357825e-06, + "loss": 0.5066, + "step": 5461 + }, + { + "epoch": 0.43, + "grad_norm": 1.5664688453998006, + "learning_rate": 6.375293209428087e-06, + "loss": 0.4295, + "step": 5462 + }, + { + "epoch": 0.43, + "grad_norm": 1.7730610928029351, + "learning_rate": 6.374070424518677e-06, + "loss": 0.4521, + "step": 5463 + }, + { + "epoch": 0.43, + "grad_norm": 0.6381401658153492, + "learning_rate": 6.372847550708709e-06, + "loss": 0.523, + "step": 5464 + }, + { + "epoch": 0.43, + "grad_norm": 2.169142746871996, + "learning_rate": 6.3716245880772985e-06, + "loss": 0.4387, + "step": 5465 + }, + { + "epoch": 0.43, + "grad_norm": 1.8986764257053186, + "learning_rate": 6.370401536703573e-06, + "loss": 0.4232, + "step": 5466 + }, + { + "epoch": 0.43, + "grad_norm": 2.3144938015891303, + "learning_rate": 6.369178396666661e-06, + "loss": 0.4498, + "step": 5467 + }, + { + "epoch": 0.43, + "grad_norm": 1.9488229418167975, + "learning_rate": 6.3679551680456964e-06, + "loss": 0.4498, + "step": 5468 + }, + { + "epoch": 0.43, + "grad_norm": 1.5634560214295914, + "learning_rate": 6.366731850919824e-06, + "loss": 0.465, + "step": 5469 + }, + { + "epoch": 0.43, + "grad_norm": 0.5923967710324288, + "learning_rate": 6.365508445368187e-06, + "loss": 0.4976, + "step": 5470 + }, + { + "epoch": 0.43, + "grad_norm": 1.6026706114852998, + "learning_rate": 6.364284951469941e-06, + "loss": 0.4391, + "step": 5471 + }, + { + "epoch": 0.43, + "grad_norm": 2.7901685820410713, + "learning_rate": 6.363061369304243e-06, + "loss": 0.4333, + "step": 5472 + }, + { + "epoch": 0.43, + "grad_norm": 1.8694935282780765, + "learning_rate": 6.3618376989502574e-06, + "loss": 0.4636, + "step": 5473 + }, + { + "epoch": 0.43, + "grad_norm": 1.3724218773036148, + "learning_rate": 6.360613940487152e-06, + "loss": 0.4866, + "step": 5474 + }, + { + "epoch": 0.43, + "grad_norm": 2.8005088425579485, + "learning_rate": 6.359390093994109e-06, + "loss": 0.4944, + "step": 5475 + }, + { + "epoch": 0.43, + "grad_norm": 1.6907881020604005, + "learning_rate": 6.358166159550302e-06, + "loss": 0.4613, + "step": 5476 + }, + { + "epoch": 0.43, + "grad_norm": 2.1296381345708353, + "learning_rate": 6.356942137234923e-06, + "loss": 0.4229, + "step": 5477 + }, + { + "epoch": 0.43, + "grad_norm": 1.8341426144465043, + "learning_rate": 6.355718027127161e-06, + "loss": 0.4658, + "step": 5478 + }, + { + "epoch": 0.43, + "grad_norm": 1.6540876514616087, + "learning_rate": 6.354493829306217e-06, + "loss": 0.4672, + "step": 5479 + }, + { + "epoch": 0.43, + "grad_norm": 1.5912348036339827, + "learning_rate": 6.353269543851295e-06, + "loss": 0.46, + "step": 5480 + }, + { + "epoch": 0.43, + "grad_norm": 1.8520615174937565, + "learning_rate": 6.352045170841603e-06, + "loss": 0.4572, + "step": 5481 + }, + { + "epoch": 0.43, + "grad_norm": 2.5714615522815722, + "learning_rate": 6.350820710356357e-06, + "loss": 0.4833, + "step": 5482 + }, + { + "epoch": 0.43, + "grad_norm": 1.9609453444653846, + "learning_rate": 6.349596162474779e-06, + "loss": 0.5097, + "step": 5483 + }, + { + "epoch": 0.43, + "grad_norm": 2.131953226081147, + "learning_rate": 6.348371527276093e-06, + "loss": 0.4682, + "step": 5484 + }, + { + "epoch": 0.43, + "grad_norm": 1.5932576661208442, + "learning_rate": 6.347146804839535e-06, + "loss": 0.4331, + "step": 5485 + }, + { + "epoch": 0.43, + "grad_norm": 0.6130199231462509, + "learning_rate": 6.345921995244339e-06, + "loss": 0.5061, + "step": 5486 + }, + { + "epoch": 0.43, + "grad_norm": 1.5737971277902503, + "learning_rate": 6.344697098569752e-06, + "loss": 0.4519, + "step": 5487 + }, + { + "epoch": 0.43, + "grad_norm": 1.7708496825277467, + "learning_rate": 6.343472114895022e-06, + "loss": 0.4887, + "step": 5488 + }, + { + "epoch": 0.43, + "grad_norm": 1.5435230243512081, + "learning_rate": 6.342247044299403e-06, + "loss": 0.4187, + "step": 5489 + }, + { + "epoch": 0.43, + "grad_norm": 1.3606083222923795, + "learning_rate": 6.341021886862157e-06, + "loss": 0.4563, + "step": 5490 + }, + { + "epoch": 0.43, + "grad_norm": 1.9412986379604802, + "learning_rate": 6.339796642662549e-06, + "loss": 0.457, + "step": 5491 + }, + { + "epoch": 0.43, + "grad_norm": 2.4632746095147215, + "learning_rate": 6.33857131177985e-06, + "loss": 0.5036, + "step": 5492 + }, + { + "epoch": 0.43, + "grad_norm": 2.0265371688800413, + "learning_rate": 6.337345894293339e-06, + "loss": 0.5231, + "step": 5493 + }, + { + "epoch": 0.43, + "grad_norm": 1.52936986379853, + "learning_rate": 6.3361203902822964e-06, + "loss": 0.4755, + "step": 5494 + }, + { + "epoch": 0.43, + "grad_norm": 1.5874212538591792, + "learning_rate": 6.334894799826014e-06, + "loss": 0.471, + "step": 5495 + }, + { + "epoch": 0.43, + "grad_norm": 1.86485169696741, + "learning_rate": 6.333669123003784e-06, + "loss": 0.5112, + "step": 5496 + }, + { + "epoch": 0.43, + "grad_norm": 1.4470090274878336, + "learning_rate": 6.332443359894906e-06, + "loss": 0.5146, + "step": 5497 + }, + { + "epoch": 0.43, + "grad_norm": 1.567655033098356, + "learning_rate": 6.331217510578687e-06, + "loss": 0.478, + "step": 5498 + }, + { + "epoch": 0.43, + "grad_norm": 1.4912396331544506, + "learning_rate": 6.329991575134435e-06, + "loss": 0.4509, + "step": 5499 + }, + { + "epoch": 0.43, + "grad_norm": 2.1274600665978647, + "learning_rate": 6.328765553641469e-06, + "loss": 0.5221, + "step": 5500 + }, + { + "epoch": 0.43, + "grad_norm": 1.454579301798375, + "learning_rate": 6.3275394461791095e-06, + "loss": 0.4861, + "step": 5501 + }, + { + "epoch": 0.43, + "grad_norm": 0.597734300081622, + "learning_rate": 6.326313252826685e-06, + "loss": 0.5107, + "step": 5502 + }, + { + "epoch": 0.43, + "grad_norm": 0.5312056407054541, + "learning_rate": 6.3250869736635265e-06, + "loss": 0.5098, + "step": 5503 + }, + { + "epoch": 0.43, + "grad_norm": 1.7702597449802944, + "learning_rate": 6.323860608768977e-06, + "loss": 0.4367, + "step": 5504 + }, + { + "epoch": 0.43, + "grad_norm": 1.752765665329628, + "learning_rate": 6.322634158222375e-06, + "loss": 0.4882, + "step": 5505 + }, + { + "epoch": 0.43, + "grad_norm": 1.693037343820609, + "learning_rate": 6.3214076221030755e-06, + "loss": 0.478, + "step": 5506 + }, + { + "epoch": 0.43, + "grad_norm": 0.599966286276832, + "learning_rate": 6.320181000490429e-06, + "loss": 0.5129, + "step": 5507 + }, + { + "epoch": 0.43, + "grad_norm": 2.090312987214145, + "learning_rate": 6.318954293463801e-06, + "loss": 0.4553, + "step": 5508 + }, + { + "epoch": 0.43, + "grad_norm": 1.7979868610930083, + "learning_rate": 6.317727501102554e-06, + "loss": 0.44, + "step": 5509 + }, + { + "epoch": 0.43, + "grad_norm": 1.569743529972744, + "learning_rate": 6.316500623486063e-06, + "loss": 0.4455, + "step": 5510 + }, + { + "epoch": 0.43, + "grad_norm": 2.091556601244058, + "learning_rate": 6.315273660693702e-06, + "loss": 0.4866, + "step": 5511 + }, + { + "epoch": 0.43, + "grad_norm": 1.5859257821189616, + "learning_rate": 6.3140466128048585e-06, + "loss": 0.4985, + "step": 5512 + }, + { + "epoch": 0.43, + "grad_norm": 1.5198534750191977, + "learning_rate": 6.312819479898915e-06, + "loss": 0.4606, + "step": 5513 + }, + { + "epoch": 0.43, + "grad_norm": 1.5835576877026925, + "learning_rate": 6.311592262055271e-06, + "loss": 0.5216, + "step": 5514 + }, + { + "epoch": 0.43, + "grad_norm": 1.411832292277702, + "learning_rate": 6.310364959353322e-06, + "loss": 0.5213, + "step": 5515 + }, + { + "epoch": 0.43, + "grad_norm": 1.7916729425640863, + "learning_rate": 6.309137571872476e-06, + "loss": 0.4372, + "step": 5516 + }, + { + "epoch": 0.43, + "grad_norm": 1.4797800244039416, + "learning_rate": 6.307910099692141e-06, + "loss": 0.4499, + "step": 5517 + }, + { + "epoch": 0.43, + "grad_norm": 0.6047367513266667, + "learning_rate": 6.306682542891731e-06, + "loss": 0.4963, + "step": 5518 + }, + { + "epoch": 0.43, + "grad_norm": 0.6202889265059824, + "learning_rate": 6.305454901550672e-06, + "loss": 0.4943, + "step": 5519 + }, + { + "epoch": 0.43, + "grad_norm": 2.730061000161206, + "learning_rate": 6.304227175748389e-06, + "loss": 0.4932, + "step": 5520 + }, + { + "epoch": 0.43, + "grad_norm": 1.601056835929249, + "learning_rate": 6.302999365564313e-06, + "loss": 0.4446, + "step": 5521 + }, + { + "epoch": 0.43, + "grad_norm": 3.32998846036508, + "learning_rate": 6.301771471077883e-06, + "loss": 0.485, + "step": 5522 + }, + { + "epoch": 0.43, + "grad_norm": 1.4177640392760884, + "learning_rate": 6.300543492368541e-06, + "loss": 0.4429, + "step": 5523 + }, + { + "epoch": 0.43, + "grad_norm": 6.0823578752631535, + "learning_rate": 6.299315429515738e-06, + "loss": 0.4816, + "step": 5524 + }, + { + "epoch": 0.43, + "grad_norm": 1.7002113252905469, + "learning_rate": 6.298087282598924e-06, + "loss": 0.5585, + "step": 5525 + }, + { + "epoch": 0.43, + "grad_norm": 0.6217623061662915, + "learning_rate": 6.296859051697563e-06, + "loss": 0.5216, + "step": 5526 + }, + { + "epoch": 0.43, + "grad_norm": 2.0148967264842996, + "learning_rate": 6.2956307368911185e-06, + "loss": 0.436, + "step": 5527 + }, + { + "epoch": 0.43, + "grad_norm": 1.2032137516498582, + "learning_rate": 6.294402338259059e-06, + "loss": 0.431, + "step": 5528 + }, + { + "epoch": 0.43, + "grad_norm": 1.5681886673243572, + "learning_rate": 6.293173855880861e-06, + "loss": 0.4569, + "step": 5529 + }, + { + "epoch": 0.43, + "grad_norm": 1.69073980005492, + "learning_rate": 6.2919452898360056e-06, + "loss": 0.4711, + "step": 5530 + }, + { + "epoch": 0.43, + "grad_norm": 0.5698874714475253, + "learning_rate": 6.290716640203981e-06, + "loss": 0.5097, + "step": 5531 + }, + { + "epoch": 0.43, + "grad_norm": 1.464968671036617, + "learning_rate": 6.289487907064279e-06, + "loss": 0.427, + "step": 5532 + }, + { + "epoch": 0.43, + "grad_norm": 2.133831155646331, + "learning_rate": 6.288259090496396e-06, + "loss": 0.5053, + "step": 5533 + }, + { + "epoch": 0.43, + "grad_norm": 0.5687086950746946, + "learning_rate": 6.2870301905798344e-06, + "loss": 0.5137, + "step": 5534 + }, + { + "epoch": 0.43, + "grad_norm": 1.4600551344170163, + "learning_rate": 6.285801207394103e-06, + "loss": 0.4742, + "step": 5535 + }, + { + "epoch": 0.43, + "grad_norm": 1.5395037690645166, + "learning_rate": 6.284572141018716e-06, + "loss": 0.4886, + "step": 5536 + }, + { + "epoch": 0.43, + "grad_norm": 1.7626111187571571, + "learning_rate": 6.283342991533192e-06, + "loss": 0.4895, + "step": 5537 + }, + { + "epoch": 0.43, + "grad_norm": 0.6659088375724659, + "learning_rate": 6.282113759017054e-06, + "loss": 0.5237, + "step": 5538 + }, + { + "epoch": 0.44, + "grad_norm": 1.5314847146966124, + "learning_rate": 6.280884443549836e-06, + "loss": 0.5128, + "step": 5539 + }, + { + "epoch": 0.44, + "grad_norm": 1.8011167226425937, + "learning_rate": 6.279655045211068e-06, + "loss": 0.4763, + "step": 5540 + }, + { + "epoch": 0.44, + "grad_norm": 2.087045350487504, + "learning_rate": 6.278425564080292e-06, + "loss": 0.4836, + "step": 5541 + }, + { + "epoch": 0.44, + "grad_norm": 0.5449633482197765, + "learning_rate": 6.277196000237055e-06, + "loss": 0.5048, + "step": 5542 + }, + { + "epoch": 0.44, + "grad_norm": 1.3830724000008425, + "learning_rate": 6.275966353760908e-06, + "loss": 0.4643, + "step": 5543 + }, + { + "epoch": 0.44, + "grad_norm": 1.7798968396591999, + "learning_rate": 6.274736624731407e-06, + "loss": 0.4834, + "step": 5544 + }, + { + "epoch": 0.44, + "grad_norm": 0.5574783065853035, + "learning_rate": 6.273506813228114e-06, + "loss": 0.5101, + "step": 5545 + }, + { + "epoch": 0.44, + "grad_norm": 1.493871500437576, + "learning_rate": 6.272276919330595e-06, + "loss": 0.4411, + "step": 5546 + }, + { + "epoch": 0.44, + "grad_norm": 0.5955228546102452, + "learning_rate": 6.2710469431184265e-06, + "loss": 0.4989, + "step": 5547 + }, + { + "epoch": 0.44, + "grad_norm": 0.5654304024634748, + "learning_rate": 6.269816884671181e-06, + "loss": 0.4941, + "step": 5548 + }, + { + "epoch": 0.44, + "grad_norm": 1.8002561004552389, + "learning_rate": 6.268586744068446e-06, + "loss": 0.4729, + "step": 5549 + }, + { + "epoch": 0.44, + "grad_norm": 1.3558947544683584, + "learning_rate": 6.267356521389806e-06, + "loss": 0.422, + "step": 5550 + }, + { + "epoch": 0.44, + "grad_norm": 1.4591873467014147, + "learning_rate": 6.266126216714859e-06, + "loss": 0.4693, + "step": 5551 + }, + { + "epoch": 0.44, + "grad_norm": 0.5598105293728107, + "learning_rate": 6.264895830123199e-06, + "loss": 0.5102, + "step": 5552 + }, + { + "epoch": 0.44, + "grad_norm": 1.3436474877353506, + "learning_rate": 6.263665361694436e-06, + "loss": 0.4909, + "step": 5553 + }, + { + "epoch": 0.44, + "grad_norm": 1.5961351167019608, + "learning_rate": 6.2624348115081754e-06, + "loss": 0.4741, + "step": 5554 + }, + { + "epoch": 0.44, + "grad_norm": 1.3383688868718662, + "learning_rate": 6.261204179644037e-06, + "loss": 0.4803, + "step": 5555 + }, + { + "epoch": 0.44, + "grad_norm": 1.8971574240255207, + "learning_rate": 6.2599734661816355e-06, + "loss": 0.4551, + "step": 5556 + }, + { + "epoch": 0.44, + "grad_norm": 2.4255874149756704, + "learning_rate": 6.2587426712006005e-06, + "loss": 0.5145, + "step": 5557 + }, + { + "epoch": 0.44, + "grad_norm": 3.0256001218190436, + "learning_rate": 6.2575117947805595e-06, + "loss": 0.4774, + "step": 5558 + }, + { + "epoch": 0.44, + "grad_norm": 1.6595851502776786, + "learning_rate": 6.256280837001153e-06, + "loss": 0.465, + "step": 5559 + }, + { + "epoch": 0.44, + "grad_norm": 1.5069803152235348, + "learning_rate": 6.255049797942018e-06, + "loss": 0.4509, + "step": 5560 + }, + { + "epoch": 0.44, + "grad_norm": 2.9607933876433523, + "learning_rate": 6.2538186776828035e-06, + "loss": 0.4384, + "step": 5561 + }, + { + "epoch": 0.44, + "grad_norm": 1.990783689800869, + "learning_rate": 6.25258747630316e-06, + "loss": 0.5282, + "step": 5562 + }, + { + "epoch": 0.44, + "grad_norm": 1.435659153315895, + "learning_rate": 6.251356193882747e-06, + "loss": 0.4459, + "step": 5563 + }, + { + "epoch": 0.44, + "grad_norm": 3.0793880098222886, + "learning_rate": 6.250124830501222e-06, + "loss": 0.482, + "step": 5564 + }, + { + "epoch": 0.44, + "grad_norm": 1.6722413374355083, + "learning_rate": 6.2488933862382585e-06, + "loss": 0.4549, + "step": 5565 + }, + { + "epoch": 0.44, + "grad_norm": 1.6086333705169709, + "learning_rate": 6.2476618611735265e-06, + "loss": 0.4663, + "step": 5566 + }, + { + "epoch": 0.44, + "grad_norm": 0.5643079363398351, + "learning_rate": 6.246430255386704e-06, + "loss": 0.5067, + "step": 5567 + }, + { + "epoch": 0.44, + "grad_norm": 3.1312258768988266, + "learning_rate": 6.245198568957473e-06, + "loss": 0.4543, + "step": 5568 + }, + { + "epoch": 0.44, + "grad_norm": 0.6054135273620439, + "learning_rate": 6.243966801965526e-06, + "loss": 0.4863, + "step": 5569 + }, + { + "epoch": 0.44, + "grad_norm": 0.5724986623511228, + "learning_rate": 6.242734954490552e-06, + "loss": 0.5181, + "step": 5570 + }, + { + "epoch": 0.44, + "grad_norm": 1.6538309030428566, + "learning_rate": 6.241503026612254e-06, + "loss": 0.4581, + "step": 5571 + }, + { + "epoch": 0.44, + "grad_norm": 1.6742973488910453, + "learning_rate": 6.240271018410333e-06, + "loss": 0.4894, + "step": 5572 + }, + { + "epoch": 0.44, + "grad_norm": 3.1486432112241935, + "learning_rate": 6.2390389299645e-06, + "loss": 0.4968, + "step": 5573 + }, + { + "epoch": 0.44, + "grad_norm": 2.6330177418561878, + "learning_rate": 6.23780676135447e-06, + "loss": 0.4936, + "step": 5574 + }, + { + "epoch": 0.44, + "grad_norm": 1.598287161677895, + "learning_rate": 6.236574512659961e-06, + "loss": 0.5251, + "step": 5575 + }, + { + "epoch": 0.44, + "grad_norm": 2.024696637329994, + "learning_rate": 6.235342183960698e-06, + "loss": 0.4368, + "step": 5576 + }, + { + "epoch": 0.44, + "grad_norm": 0.612976835516847, + "learning_rate": 6.234109775336412e-06, + "loss": 0.5062, + "step": 5577 + }, + { + "epoch": 0.44, + "grad_norm": 1.7719340937396224, + "learning_rate": 6.232877286866838e-06, + "loss": 0.4915, + "step": 5578 + }, + { + "epoch": 0.44, + "grad_norm": 1.4505230269332725, + "learning_rate": 6.231644718631717e-06, + "loss": 0.4767, + "step": 5579 + }, + { + "epoch": 0.44, + "grad_norm": 1.59935888299376, + "learning_rate": 6.230412070710794e-06, + "loss": 0.4741, + "step": 5580 + }, + { + "epoch": 0.44, + "grad_norm": 2.0965352194741755, + "learning_rate": 6.2291793431838186e-06, + "loss": 0.4974, + "step": 5581 + }, + { + "epoch": 0.44, + "grad_norm": 1.9694881442051635, + "learning_rate": 6.227946536130549e-06, + "loss": 0.5031, + "step": 5582 + }, + { + "epoch": 0.44, + "grad_norm": 2.4075911070721436, + "learning_rate": 6.226713649630744e-06, + "loss": 0.4671, + "step": 5583 + }, + { + "epoch": 0.44, + "grad_norm": 0.5846784067395779, + "learning_rate": 6.225480683764171e-06, + "loss": 0.5013, + "step": 5584 + }, + { + "epoch": 0.44, + "grad_norm": 2.427659418647081, + "learning_rate": 6.2242476386106e-06, + "loss": 0.477, + "step": 5585 + }, + { + "epoch": 0.44, + "grad_norm": 1.4426267208821388, + "learning_rate": 6.223014514249809e-06, + "loss": 0.4643, + "step": 5586 + }, + { + "epoch": 0.44, + "grad_norm": 2.042579700255808, + "learning_rate": 6.221781310761578e-06, + "loss": 0.5068, + "step": 5587 + }, + { + "epoch": 0.44, + "grad_norm": 1.5624296818871508, + "learning_rate": 6.2205480282256955e-06, + "loss": 0.453, + "step": 5588 + }, + { + "epoch": 0.44, + "grad_norm": 1.7295122611340383, + "learning_rate": 6.219314666721952e-06, + "loss": 0.5127, + "step": 5589 + }, + { + "epoch": 0.44, + "grad_norm": 2.7765466188951584, + "learning_rate": 6.218081226330145e-06, + "loss": 0.4667, + "step": 5590 + }, + { + "epoch": 0.44, + "grad_norm": 5.627859906736519, + "learning_rate": 6.2168477071300745e-06, + "loss": 0.4925, + "step": 5591 + }, + { + "epoch": 0.44, + "grad_norm": 1.824884442672306, + "learning_rate": 6.215614109201551e-06, + "loss": 0.4557, + "step": 5592 + }, + { + "epoch": 0.44, + "grad_norm": 1.4211754251162967, + "learning_rate": 6.214380432624384e-06, + "loss": 0.4919, + "step": 5593 + }, + { + "epoch": 0.44, + "grad_norm": 0.5818456299420639, + "learning_rate": 6.213146677478392e-06, + "loss": 0.5255, + "step": 5594 + }, + { + "epoch": 0.44, + "grad_norm": 7.900621366762852, + "learning_rate": 6.211912843843397e-06, + "loss": 0.458, + "step": 5595 + }, + { + "epoch": 0.44, + "grad_norm": 2.1801447797140567, + "learning_rate": 6.210678931799227e-06, + "loss": 0.4276, + "step": 5596 + }, + { + "epoch": 0.44, + "grad_norm": 1.841733810437032, + "learning_rate": 6.209444941425712e-06, + "loss": 0.5038, + "step": 5597 + }, + { + "epoch": 0.44, + "grad_norm": 1.450340958504094, + "learning_rate": 6.208210872802692e-06, + "loss": 0.5164, + "step": 5598 + }, + { + "epoch": 0.44, + "grad_norm": 1.5527146309995212, + "learning_rate": 6.20697672601001e-06, + "loss": 0.4562, + "step": 5599 + }, + { + "epoch": 0.44, + "grad_norm": 1.5244433387615075, + "learning_rate": 6.205742501127513e-06, + "loss": 0.5095, + "step": 5600 + }, + { + "epoch": 0.44, + "grad_norm": 1.554095705199923, + "learning_rate": 6.204508198235052e-06, + "loss": 0.4919, + "step": 5601 + }, + { + "epoch": 0.44, + "grad_norm": 1.9914185380468825, + "learning_rate": 6.203273817412487e-06, + "loss": 0.4536, + "step": 5602 + }, + { + "epoch": 0.44, + "grad_norm": 2.075358483836895, + "learning_rate": 6.202039358739681e-06, + "loss": 0.483, + "step": 5603 + }, + { + "epoch": 0.44, + "grad_norm": 0.5518661671618242, + "learning_rate": 6.2008048222965e-06, + "loss": 0.5027, + "step": 5604 + }, + { + "epoch": 0.44, + "grad_norm": 1.7050671177585839, + "learning_rate": 6.199570208162819e-06, + "loss": 0.4524, + "step": 5605 + }, + { + "epoch": 0.44, + "grad_norm": 1.768982091393003, + "learning_rate": 6.198335516418513e-06, + "loss": 0.4587, + "step": 5606 + }, + { + "epoch": 0.44, + "grad_norm": 1.605151862038958, + "learning_rate": 6.197100747143468e-06, + "loss": 0.4925, + "step": 5607 + }, + { + "epoch": 0.44, + "grad_norm": 2.0012102797441025, + "learning_rate": 6.195865900417569e-06, + "loss": 0.4718, + "step": 5608 + }, + { + "epoch": 0.44, + "grad_norm": 1.818867818488926, + "learning_rate": 6.194630976320711e-06, + "loss": 0.4518, + "step": 5609 + }, + { + "epoch": 0.44, + "grad_norm": 2.557986108279998, + "learning_rate": 6.193395974932793e-06, + "loss": 0.4858, + "step": 5610 + }, + { + "epoch": 0.44, + "grad_norm": 2.0884827335975755, + "learning_rate": 6.192160896333717e-06, + "loss": 0.5098, + "step": 5611 + }, + { + "epoch": 0.44, + "grad_norm": 1.7627101388728135, + "learning_rate": 6.190925740603388e-06, + "loss": 0.4615, + "step": 5612 + }, + { + "epoch": 0.44, + "grad_norm": 2.803152075752051, + "learning_rate": 6.189690507821724e-06, + "loss": 0.5077, + "step": 5613 + }, + { + "epoch": 0.44, + "grad_norm": 1.4778921436164885, + "learning_rate": 6.188455198068639e-06, + "loss": 0.4873, + "step": 5614 + }, + { + "epoch": 0.44, + "grad_norm": 1.802185555367496, + "learning_rate": 6.1872198114240575e-06, + "loss": 0.4451, + "step": 5615 + }, + { + "epoch": 0.44, + "grad_norm": 1.5697676469216717, + "learning_rate": 6.185984347967909e-06, + "loss": 0.4551, + "step": 5616 + }, + { + "epoch": 0.44, + "grad_norm": 0.58149791104693, + "learning_rate": 6.184748807780123e-06, + "loss": 0.5022, + "step": 5617 + }, + { + "epoch": 0.44, + "grad_norm": 1.4271086567635025, + "learning_rate": 6.1835131909406385e-06, + "loss": 0.4806, + "step": 5618 + }, + { + "epoch": 0.44, + "grad_norm": 1.7086561351103302, + "learning_rate": 6.1822774975294e-06, + "loss": 0.4515, + "step": 5619 + }, + { + "epoch": 0.44, + "grad_norm": 0.5843319246388143, + "learning_rate": 6.181041727626352e-06, + "loss": 0.512, + "step": 5620 + }, + { + "epoch": 0.44, + "grad_norm": 1.9020399473785523, + "learning_rate": 6.179805881311452e-06, + "loss": 0.4835, + "step": 5621 + }, + { + "epoch": 0.44, + "grad_norm": 1.9639058450004239, + "learning_rate": 6.178569958664653e-06, + "loss": 0.4693, + "step": 5622 + }, + { + "epoch": 0.44, + "grad_norm": 1.8518338237815346, + "learning_rate": 6.177333959765921e-06, + "loss": 0.498, + "step": 5623 + }, + { + "epoch": 0.44, + "grad_norm": 1.8055298980715515, + "learning_rate": 6.1760978846952205e-06, + "loss": 0.4584, + "step": 5624 + }, + { + "epoch": 0.44, + "grad_norm": 2.0207312189718394, + "learning_rate": 6.174861733532527e-06, + "loss": 0.4628, + "step": 5625 + }, + { + "epoch": 0.44, + "grad_norm": 0.6159370322247352, + "learning_rate": 6.173625506357814e-06, + "loss": 0.5087, + "step": 5626 + }, + { + "epoch": 0.44, + "grad_norm": 2.301521068387596, + "learning_rate": 6.172389203251068e-06, + "loss": 0.5003, + "step": 5627 + }, + { + "epoch": 0.44, + "grad_norm": 0.5679047013106664, + "learning_rate": 6.171152824292272e-06, + "loss": 0.5059, + "step": 5628 + }, + { + "epoch": 0.44, + "grad_norm": 3.099372170782018, + "learning_rate": 6.169916369561423e-06, + "loss": 0.491, + "step": 5629 + }, + { + "epoch": 0.44, + "grad_norm": 1.857518656450942, + "learning_rate": 6.168679839138514e-06, + "loss": 0.4708, + "step": 5630 + }, + { + "epoch": 0.44, + "grad_norm": 1.7521114629022725, + "learning_rate": 6.167443233103547e-06, + "loss": 0.4697, + "step": 5631 + }, + { + "epoch": 0.44, + "grad_norm": 3.4251136298316553, + "learning_rate": 6.166206551536531e-06, + "loss": 0.5249, + "step": 5632 + }, + { + "epoch": 0.44, + "grad_norm": 0.6119084010558695, + "learning_rate": 6.1649697945174784e-06, + "loss": 0.5244, + "step": 5633 + }, + { + "epoch": 0.44, + "grad_norm": 1.380729841004696, + "learning_rate": 6.163732962126402e-06, + "loss": 0.4228, + "step": 5634 + }, + { + "epoch": 0.44, + "grad_norm": 2.2870631846189693, + "learning_rate": 6.162496054443326e-06, + "loss": 0.4822, + "step": 5635 + }, + { + "epoch": 0.44, + "grad_norm": 1.8086724579027718, + "learning_rate": 6.161259071548277e-06, + "loss": 0.4543, + "step": 5636 + }, + { + "epoch": 0.44, + "grad_norm": 0.5702067965436302, + "learning_rate": 6.160022013521284e-06, + "loss": 0.5031, + "step": 5637 + }, + { + "epoch": 0.44, + "grad_norm": 2.208237457511597, + "learning_rate": 6.1587848804423846e-06, + "loss": 0.4439, + "step": 5638 + }, + { + "epoch": 0.44, + "grad_norm": 1.7630773670009363, + "learning_rate": 6.15754767239162e-06, + "loss": 0.5108, + "step": 5639 + }, + { + "epoch": 0.44, + "grad_norm": 0.5461083414637489, + "learning_rate": 6.156310389449034e-06, + "loss": 0.5062, + "step": 5640 + }, + { + "epoch": 0.44, + "grad_norm": 2.0870959559547284, + "learning_rate": 6.155073031694679e-06, + "loss": 0.4772, + "step": 5641 + }, + { + "epoch": 0.44, + "grad_norm": 1.9656273991534357, + "learning_rate": 6.153835599208609e-06, + "loss": 0.4756, + "step": 5642 + }, + { + "epoch": 0.44, + "grad_norm": 0.548023070604085, + "learning_rate": 6.152598092070885e-06, + "loss": 0.495, + "step": 5643 + }, + { + "epoch": 0.44, + "grad_norm": 1.4233484897278479, + "learning_rate": 6.151360510361574e-06, + "loss": 0.4642, + "step": 5644 + }, + { + "epoch": 0.44, + "grad_norm": 1.835031001189982, + "learning_rate": 6.150122854160743e-06, + "loss": 0.4794, + "step": 5645 + }, + { + "epoch": 0.44, + "grad_norm": 1.832944915775355, + "learning_rate": 6.148885123548468e-06, + "loss": 0.5366, + "step": 5646 + }, + { + "epoch": 0.44, + "grad_norm": 1.6412392321924147, + "learning_rate": 6.147647318604829e-06, + "loss": 0.4433, + "step": 5647 + }, + { + "epoch": 0.44, + "grad_norm": 2.9409156217546824, + "learning_rate": 6.146409439409909e-06, + "loss": 0.4454, + "step": 5648 + }, + { + "epoch": 0.44, + "grad_norm": 1.5528330949253917, + "learning_rate": 6.1451714860437985e-06, + "loss": 0.4994, + "step": 5649 + }, + { + "epoch": 0.44, + "grad_norm": 1.649855174091046, + "learning_rate": 6.14393345858659e-06, + "loss": 0.5155, + "step": 5650 + }, + { + "epoch": 0.44, + "grad_norm": 2.2505425072365757, + "learning_rate": 6.142695357118384e-06, + "loss": 0.5209, + "step": 5651 + }, + { + "epoch": 0.44, + "grad_norm": 1.9213216928115613, + "learning_rate": 6.141457181719283e-06, + "loss": 0.4945, + "step": 5652 + }, + { + "epoch": 0.44, + "grad_norm": 1.8144237281481375, + "learning_rate": 6.140218932469396e-06, + "loss": 0.4912, + "step": 5653 + }, + { + "epoch": 0.44, + "grad_norm": 3.848752946919791, + "learning_rate": 6.138980609448835e-06, + "loss": 0.4919, + "step": 5654 + }, + { + "epoch": 0.44, + "grad_norm": 2.7193268099339667, + "learning_rate": 6.137742212737718e-06, + "loss": 0.5184, + "step": 5655 + }, + { + "epoch": 0.44, + "grad_norm": 1.9129333006414508, + "learning_rate": 6.136503742416171e-06, + "loss": 0.4543, + "step": 5656 + }, + { + "epoch": 0.44, + "grad_norm": 0.5631829422991903, + "learning_rate": 6.135265198564317e-06, + "loss": 0.5286, + "step": 5657 + }, + { + "epoch": 0.44, + "grad_norm": 2.2859951386046693, + "learning_rate": 6.134026581262289e-06, + "loss": 0.4155, + "step": 5658 + }, + { + "epoch": 0.44, + "grad_norm": 4.728400373449617, + "learning_rate": 6.132787890590225e-06, + "loss": 0.4826, + "step": 5659 + }, + { + "epoch": 0.44, + "grad_norm": 2.0893314374311913, + "learning_rate": 6.1315491266282666e-06, + "loss": 0.4444, + "step": 5660 + }, + { + "epoch": 0.44, + "grad_norm": 2.3827723442042017, + "learning_rate": 6.13031028945656e-06, + "loss": 0.5225, + "step": 5661 + }, + { + "epoch": 0.44, + "grad_norm": 2.2960147483766415, + "learning_rate": 6.129071379155257e-06, + "loss": 0.4617, + "step": 5662 + }, + { + "epoch": 0.44, + "grad_norm": 0.5878010023473703, + "learning_rate": 6.127832395804512e-06, + "loss": 0.5166, + "step": 5663 + }, + { + "epoch": 0.44, + "grad_norm": 5.678098039696629, + "learning_rate": 6.1265933394844855e-06, + "loss": 0.4694, + "step": 5664 + }, + { + "epoch": 0.44, + "grad_norm": 1.7011261329044907, + "learning_rate": 6.125354210275344e-06, + "loss": 0.458, + "step": 5665 + }, + { + "epoch": 0.44, + "grad_norm": 1.7514111657476035, + "learning_rate": 6.124115008257259e-06, + "loss": 0.4932, + "step": 5666 + }, + { + "epoch": 0.45, + "grad_norm": 1.7555776623936834, + "learning_rate": 6.1228757335104e-06, + "loss": 0.4582, + "step": 5667 + }, + { + "epoch": 0.45, + "grad_norm": 2.310613942912549, + "learning_rate": 6.121636386114954e-06, + "loss": 0.4943, + "step": 5668 + }, + { + "epoch": 0.45, + "grad_norm": 1.9377045336420782, + "learning_rate": 6.120396966151099e-06, + "loss": 0.4797, + "step": 5669 + }, + { + "epoch": 0.45, + "grad_norm": 1.637232097811697, + "learning_rate": 6.119157473699027e-06, + "loss": 0.4896, + "step": 5670 + }, + { + "epoch": 0.45, + "grad_norm": 4.801566334489921, + "learning_rate": 6.11791790883893e-06, + "loss": 0.5003, + "step": 5671 + }, + { + "epoch": 0.45, + "grad_norm": 0.5908583374350184, + "learning_rate": 6.1166782716510065e-06, + "loss": 0.5086, + "step": 5672 + }, + { + "epoch": 0.45, + "grad_norm": 2.468821909172175, + "learning_rate": 6.115438562215459e-06, + "loss": 0.4578, + "step": 5673 + }, + { + "epoch": 0.45, + "grad_norm": 1.6135798735612559, + "learning_rate": 6.114198780612497e-06, + "loss": 0.4969, + "step": 5674 + }, + { + "epoch": 0.45, + "grad_norm": 2.2805661945768776, + "learning_rate": 6.11295892692233e-06, + "loss": 0.4371, + "step": 5675 + }, + { + "epoch": 0.45, + "grad_norm": 1.7102417132814736, + "learning_rate": 6.111719001225178e-06, + "loss": 0.4804, + "step": 5676 + }, + { + "epoch": 0.45, + "grad_norm": 0.5804944108793355, + "learning_rate": 6.110479003601258e-06, + "loss": 0.5018, + "step": 5677 + }, + { + "epoch": 0.45, + "grad_norm": 0.5813935176956744, + "learning_rate": 6.109238934130802e-06, + "loss": 0.5059, + "step": 5678 + }, + { + "epoch": 0.45, + "grad_norm": 0.5740389190923919, + "learning_rate": 6.107998792894038e-06, + "loss": 0.5099, + "step": 5679 + }, + { + "epoch": 0.45, + "grad_norm": 1.7385142746491293, + "learning_rate": 6.106758579971201e-06, + "loss": 0.4741, + "step": 5680 + }, + { + "epoch": 0.45, + "grad_norm": 1.6486368805632574, + "learning_rate": 6.105518295442531e-06, + "loss": 0.4442, + "step": 5681 + }, + { + "epoch": 0.45, + "grad_norm": 0.5623861420798012, + "learning_rate": 6.104277939388275e-06, + "loss": 0.5035, + "step": 5682 + }, + { + "epoch": 0.45, + "grad_norm": 1.637097862003304, + "learning_rate": 6.103037511888678e-06, + "loss": 0.4909, + "step": 5683 + }, + { + "epoch": 0.45, + "grad_norm": 1.987806739224232, + "learning_rate": 6.101797013023999e-06, + "loss": 0.4649, + "step": 5684 + }, + { + "epoch": 0.45, + "grad_norm": 0.548736000432576, + "learning_rate": 6.100556442874495e-06, + "loss": 0.4993, + "step": 5685 + }, + { + "epoch": 0.45, + "grad_norm": 0.5895862476917514, + "learning_rate": 6.099315801520428e-06, + "loss": 0.514, + "step": 5686 + }, + { + "epoch": 0.45, + "grad_norm": 1.6140040396279058, + "learning_rate": 6.098075089042068e-06, + "loss": 0.3912, + "step": 5687 + }, + { + "epoch": 0.45, + "grad_norm": 1.9076130720577347, + "learning_rate": 6.096834305519684e-06, + "loss": 0.4873, + "step": 5688 + }, + { + "epoch": 0.45, + "grad_norm": 1.869972422600285, + "learning_rate": 6.095593451033557e-06, + "loss": 0.5, + "step": 5689 + }, + { + "epoch": 0.45, + "grad_norm": 1.794705902778675, + "learning_rate": 6.094352525663967e-06, + "loss": 0.436, + "step": 5690 + }, + { + "epoch": 0.45, + "grad_norm": 2.3685303822320125, + "learning_rate": 6.0931115294911994e-06, + "loss": 0.4863, + "step": 5691 + }, + { + "epoch": 0.45, + "grad_norm": 1.6671922666354537, + "learning_rate": 6.091870462595545e-06, + "loss": 0.4458, + "step": 5692 + }, + { + "epoch": 0.45, + "grad_norm": 0.5714931401105673, + "learning_rate": 6.090629325057302e-06, + "loss": 0.5101, + "step": 5693 + }, + { + "epoch": 0.45, + "grad_norm": 1.5413340140346248, + "learning_rate": 6.089388116956767e-06, + "loss": 0.5047, + "step": 5694 + }, + { + "epoch": 0.45, + "grad_norm": 4.690536722509592, + "learning_rate": 6.088146838374247e-06, + "loss": 0.532, + "step": 5695 + }, + { + "epoch": 0.45, + "grad_norm": 3.4012375008138656, + "learning_rate": 6.0869054893900485e-06, + "loss": 0.4745, + "step": 5696 + }, + { + "epoch": 0.45, + "grad_norm": 1.8125321997025927, + "learning_rate": 6.0856640700844885e-06, + "loss": 0.4847, + "step": 5697 + }, + { + "epoch": 0.45, + "grad_norm": 0.5614235174931688, + "learning_rate": 6.084422580537882e-06, + "loss": 0.5125, + "step": 5698 + }, + { + "epoch": 0.45, + "grad_norm": 2.492267651310338, + "learning_rate": 6.083181020830553e-06, + "loss": 0.5031, + "step": 5699 + }, + { + "epoch": 0.45, + "grad_norm": 0.5706014790783737, + "learning_rate": 6.081939391042829e-06, + "loss": 0.5213, + "step": 5700 + }, + { + "epoch": 0.45, + "grad_norm": 2.4679643714721506, + "learning_rate": 6.080697691255043e-06, + "loss": 0.5183, + "step": 5701 + }, + { + "epoch": 0.45, + "grad_norm": 2.1385504928518815, + "learning_rate": 6.079455921547529e-06, + "loss": 0.4805, + "step": 5702 + }, + { + "epoch": 0.45, + "grad_norm": 0.5892377076051167, + "learning_rate": 6.07821408200063e-06, + "loss": 0.5081, + "step": 5703 + }, + { + "epoch": 0.45, + "grad_norm": 2.4952763533720472, + "learning_rate": 6.07697217269469e-06, + "loss": 0.421, + "step": 5704 + }, + { + "epoch": 0.45, + "grad_norm": 1.4896629001385853, + "learning_rate": 6.075730193710059e-06, + "loss": 0.4617, + "step": 5705 + }, + { + "epoch": 0.45, + "grad_norm": 1.685119616543394, + "learning_rate": 6.074488145127091e-06, + "loss": 0.4695, + "step": 5706 + }, + { + "epoch": 0.45, + "grad_norm": 0.5276080265530291, + "learning_rate": 6.073246027026147e-06, + "loss": 0.504, + "step": 5707 + }, + { + "epoch": 0.45, + "grad_norm": 1.8413784587994093, + "learning_rate": 6.072003839487588e-06, + "loss": 0.4676, + "step": 5708 + }, + { + "epoch": 0.45, + "grad_norm": 2.6453849094167343, + "learning_rate": 6.070761582591784e-06, + "loss": 0.4066, + "step": 5709 + }, + { + "epoch": 0.45, + "grad_norm": 1.8220154598935012, + "learning_rate": 6.069519256419104e-06, + "loss": 0.4609, + "step": 5710 + }, + { + "epoch": 0.45, + "grad_norm": 1.9352526671568626, + "learning_rate": 6.06827686104993e-06, + "loss": 0.4962, + "step": 5711 + }, + { + "epoch": 0.45, + "grad_norm": 2.6158187515564246, + "learning_rate": 6.0670343965646385e-06, + "loss": 0.4704, + "step": 5712 + }, + { + "epoch": 0.45, + "grad_norm": 1.7937338194347454, + "learning_rate": 6.06579186304362e-06, + "loss": 0.4004, + "step": 5713 + }, + { + "epoch": 0.45, + "grad_norm": 0.5891895608496196, + "learning_rate": 6.064549260567259e-06, + "loss": 0.5138, + "step": 5714 + }, + { + "epoch": 0.45, + "grad_norm": 1.7311893032605346, + "learning_rate": 6.063306589215956e-06, + "loss": 0.5058, + "step": 5715 + }, + { + "epoch": 0.45, + "grad_norm": 1.8248935108612583, + "learning_rate": 6.062063849070106e-06, + "loss": 0.3959, + "step": 5716 + }, + { + "epoch": 0.45, + "grad_norm": 0.6331680136460127, + "learning_rate": 6.0608210402101156e-06, + "loss": 0.495, + "step": 5717 + }, + { + "epoch": 0.45, + "grad_norm": 4.302594367416118, + "learning_rate": 6.0595781627163906e-06, + "loss": 0.4702, + "step": 5718 + }, + { + "epoch": 0.45, + "grad_norm": 1.8406587877177791, + "learning_rate": 6.058335216669344e-06, + "loss": 0.4401, + "step": 5719 + }, + { + "epoch": 0.45, + "grad_norm": 2.0085716063385486, + "learning_rate": 6.057092202149395e-06, + "loss": 0.42, + "step": 5720 + }, + { + "epoch": 0.45, + "grad_norm": 2.7448568654844796, + "learning_rate": 6.055849119236963e-06, + "loss": 0.4825, + "step": 5721 + }, + { + "epoch": 0.45, + "grad_norm": 7.1349576979723, + "learning_rate": 6.0546059680124735e-06, + "loss": 0.5271, + "step": 5722 + }, + { + "epoch": 0.45, + "grad_norm": 0.5914155662299729, + "learning_rate": 6.053362748556359e-06, + "loss": 0.5257, + "step": 5723 + }, + { + "epoch": 0.45, + "grad_norm": 0.605385006408059, + "learning_rate": 6.052119460949051e-06, + "loss": 0.5052, + "step": 5724 + }, + { + "epoch": 0.45, + "grad_norm": 1.8822394185843716, + "learning_rate": 6.05087610527099e-06, + "loss": 0.5202, + "step": 5725 + }, + { + "epoch": 0.45, + "grad_norm": 2.0223771521952125, + "learning_rate": 6.049632681602621e-06, + "loss": 0.4719, + "step": 5726 + }, + { + "epoch": 0.45, + "grad_norm": 0.5583886230057684, + "learning_rate": 6.04838919002439e-06, + "loss": 0.4862, + "step": 5727 + }, + { + "epoch": 0.45, + "grad_norm": 1.948650585882006, + "learning_rate": 6.04714563061675e-06, + "loss": 0.4927, + "step": 5728 + }, + { + "epoch": 0.45, + "grad_norm": 1.7708249425051292, + "learning_rate": 6.045902003460157e-06, + "loss": 0.4573, + "step": 5729 + }, + { + "epoch": 0.45, + "grad_norm": 3.1458237845597736, + "learning_rate": 6.044658308635074e-06, + "loss": 0.4828, + "step": 5730 + }, + { + "epoch": 0.45, + "grad_norm": 1.6810915776146784, + "learning_rate": 6.043414546221963e-06, + "loss": 0.485, + "step": 5731 + }, + { + "epoch": 0.45, + "grad_norm": 1.4352756727957299, + "learning_rate": 6.042170716301298e-06, + "loss": 0.4423, + "step": 5732 + }, + { + "epoch": 0.45, + "grad_norm": 2.971001893509812, + "learning_rate": 6.040926818953548e-06, + "loss": 0.4902, + "step": 5733 + }, + { + "epoch": 0.45, + "grad_norm": 1.7169653586652869, + "learning_rate": 6.039682854259197e-06, + "loss": 0.4191, + "step": 5734 + }, + { + "epoch": 0.45, + "grad_norm": 1.9488672408935992, + "learning_rate": 6.038438822298725e-06, + "loss": 0.5054, + "step": 5735 + }, + { + "epoch": 0.45, + "grad_norm": 0.5875973062325169, + "learning_rate": 6.037194723152619e-06, + "loss": 0.5015, + "step": 5736 + }, + { + "epoch": 0.45, + "grad_norm": 3.164112220157697, + "learning_rate": 6.035950556901371e-06, + "loss": 0.4618, + "step": 5737 + }, + { + "epoch": 0.45, + "grad_norm": 2.682455784345946, + "learning_rate": 6.034706323625479e-06, + "loss": 0.4737, + "step": 5738 + }, + { + "epoch": 0.45, + "grad_norm": 2.1209283612283025, + "learning_rate": 6.03346202340544e-06, + "loss": 0.4622, + "step": 5739 + }, + { + "epoch": 0.45, + "grad_norm": 2.1890772648731014, + "learning_rate": 6.032217656321761e-06, + "loss": 0.4766, + "step": 5740 + }, + { + "epoch": 0.45, + "grad_norm": 2.1284504151421917, + "learning_rate": 6.030973222454949e-06, + "loss": 0.4703, + "step": 5741 + }, + { + "epoch": 0.45, + "grad_norm": 2.343652778061655, + "learning_rate": 6.029728721885518e-06, + "loss": 0.4845, + "step": 5742 + }, + { + "epoch": 0.45, + "grad_norm": 2.47550296793369, + "learning_rate": 6.028484154693987e-06, + "loss": 0.4711, + "step": 5743 + }, + { + "epoch": 0.45, + "grad_norm": 1.627674144719648, + "learning_rate": 6.027239520960875e-06, + "loss": 0.4157, + "step": 5744 + }, + { + "epoch": 0.45, + "grad_norm": 1.6220238193866643, + "learning_rate": 6.0259948207667095e-06, + "loss": 0.4749, + "step": 5745 + }, + { + "epoch": 0.45, + "grad_norm": 0.5849949400029555, + "learning_rate": 6.024750054192023e-06, + "loss": 0.5117, + "step": 5746 + }, + { + "epoch": 0.45, + "grad_norm": 1.7299428157403334, + "learning_rate": 6.023505221317347e-06, + "loss": 0.4246, + "step": 5747 + }, + { + "epoch": 0.45, + "grad_norm": 1.6610500413073104, + "learning_rate": 6.022260322223224e-06, + "loss": 0.4383, + "step": 5748 + }, + { + "epoch": 0.45, + "grad_norm": 3.7270604454516976, + "learning_rate": 6.021015356990194e-06, + "loss": 0.4499, + "step": 5749 + }, + { + "epoch": 0.45, + "grad_norm": 1.6603085366683752, + "learning_rate": 6.0197703256988075e-06, + "loss": 0.4452, + "step": 5750 + }, + { + "epoch": 0.45, + "grad_norm": 2.151815004081429, + "learning_rate": 6.018525228429614e-06, + "loss": 0.5395, + "step": 5751 + }, + { + "epoch": 0.45, + "grad_norm": 2.2779753199704955, + "learning_rate": 6.0172800652631706e-06, + "loss": 0.5252, + "step": 5752 + }, + { + "epoch": 0.45, + "grad_norm": 2.0007795465214526, + "learning_rate": 6.016034836280037e-06, + "loss": 0.4379, + "step": 5753 + }, + { + "epoch": 0.45, + "grad_norm": 1.7615038788753057, + "learning_rate": 6.0147895415607795e-06, + "loss": 0.4453, + "step": 5754 + }, + { + "epoch": 0.45, + "grad_norm": 2.0738005062453295, + "learning_rate": 6.013544181185966e-06, + "loss": 0.4951, + "step": 5755 + }, + { + "epoch": 0.45, + "grad_norm": 2.0074224481328478, + "learning_rate": 6.012298755236169e-06, + "loss": 0.4618, + "step": 5756 + }, + { + "epoch": 0.45, + "grad_norm": 1.7868557758287615, + "learning_rate": 6.011053263791967e-06, + "loss": 0.4597, + "step": 5757 + }, + { + "epoch": 0.45, + "grad_norm": 1.6081913380449693, + "learning_rate": 6.009807706933943e-06, + "loss": 0.4333, + "step": 5758 + }, + { + "epoch": 0.45, + "grad_norm": 2.3109878533614023, + "learning_rate": 6.008562084742681e-06, + "loss": 0.4344, + "step": 5759 + }, + { + "epoch": 0.45, + "grad_norm": 1.5735955320905453, + "learning_rate": 6.00731639729877e-06, + "loss": 0.4512, + "step": 5760 + }, + { + "epoch": 0.45, + "grad_norm": 5.117886234101295, + "learning_rate": 6.006070644682807e-06, + "loss": 0.5504, + "step": 5761 + }, + { + "epoch": 0.45, + "grad_norm": 1.4678537603924338, + "learning_rate": 6.004824826975389e-06, + "loss": 0.4911, + "step": 5762 + }, + { + "epoch": 0.45, + "grad_norm": 2.0710004920839196, + "learning_rate": 6.0035789442571205e-06, + "loss": 0.4465, + "step": 5763 + }, + { + "epoch": 0.45, + "grad_norm": 1.6598112253798396, + "learning_rate": 6.002332996608605e-06, + "loss": 0.4876, + "step": 5764 + }, + { + "epoch": 0.45, + "grad_norm": 2.325810267014837, + "learning_rate": 6.001086984110457e-06, + "loss": 0.4605, + "step": 5765 + }, + { + "epoch": 0.45, + "grad_norm": 1.8099724796625862, + "learning_rate": 5.99984090684329e-06, + "loss": 0.4487, + "step": 5766 + }, + { + "epoch": 0.45, + "grad_norm": 2.584535945312912, + "learning_rate": 5.998594764887725e-06, + "loss": 0.4638, + "step": 5767 + }, + { + "epoch": 0.45, + "grad_norm": 1.7647056725360064, + "learning_rate": 5.997348558324384e-06, + "loss": 0.4248, + "step": 5768 + }, + { + "epoch": 0.45, + "grad_norm": 3.0337671979325114, + "learning_rate": 5.996102287233898e-06, + "loss": 0.4502, + "step": 5769 + }, + { + "epoch": 0.45, + "grad_norm": 2.4895841480429866, + "learning_rate": 5.9948559516968955e-06, + "loss": 0.5093, + "step": 5770 + }, + { + "epoch": 0.45, + "grad_norm": 1.493398074058847, + "learning_rate": 5.993609551794016e-06, + "loss": 0.4554, + "step": 5771 + }, + { + "epoch": 0.45, + "grad_norm": 2.046162329848687, + "learning_rate": 5.992363087605895e-06, + "loss": 0.5149, + "step": 5772 + }, + { + "epoch": 0.45, + "grad_norm": 0.5810174083949727, + "learning_rate": 5.9911165592131835e-06, + "loss": 0.5055, + "step": 5773 + }, + { + "epoch": 0.45, + "grad_norm": 1.8208328462982921, + "learning_rate": 5.989869966696525e-06, + "loss": 0.4898, + "step": 5774 + }, + { + "epoch": 0.45, + "grad_norm": 0.5252039598590208, + "learning_rate": 5.988623310136578e-06, + "loss": 0.5007, + "step": 5775 + }, + { + "epoch": 0.45, + "grad_norm": 0.5644887687974547, + "learning_rate": 5.987376589613992e-06, + "loss": 0.5099, + "step": 5776 + }, + { + "epoch": 0.45, + "grad_norm": 0.5611758382408979, + "learning_rate": 5.9861298052094354e-06, + "loss": 0.5199, + "step": 5777 + }, + { + "epoch": 0.45, + "grad_norm": 1.858815646612022, + "learning_rate": 5.984882957003567e-06, + "loss": 0.448, + "step": 5778 + }, + { + "epoch": 0.45, + "grad_norm": 1.5910068157214212, + "learning_rate": 5.983636045077062e-06, + "loss": 0.4759, + "step": 5779 + }, + { + "epoch": 0.45, + "grad_norm": 1.4403803114139198, + "learning_rate": 5.982389069510592e-06, + "loss": 0.4608, + "step": 5780 + }, + { + "epoch": 0.45, + "grad_norm": 2.5176631914239365, + "learning_rate": 5.981142030384835e-06, + "loss": 0.4919, + "step": 5781 + }, + { + "epoch": 0.45, + "grad_norm": 1.6531403102839233, + "learning_rate": 5.97989492778047e-06, + "loss": 0.4574, + "step": 5782 + }, + { + "epoch": 0.45, + "grad_norm": 1.701885285227435, + "learning_rate": 5.978647761778187e-06, + "loss": 0.4707, + "step": 5783 + }, + { + "epoch": 0.45, + "grad_norm": 5.784737868507821, + "learning_rate": 5.977400532458673e-06, + "loss": 0.5032, + "step": 5784 + }, + { + "epoch": 0.45, + "grad_norm": 2.25873055910197, + "learning_rate": 5.976153239902622e-06, + "loss": 0.5111, + "step": 5785 + }, + { + "epoch": 0.45, + "grad_norm": 2.23372903442882, + "learning_rate": 5.974905884190735e-06, + "loss": 0.4403, + "step": 5786 + }, + { + "epoch": 0.45, + "grad_norm": 2.658771267047421, + "learning_rate": 5.9736584654037125e-06, + "loss": 0.5111, + "step": 5787 + }, + { + "epoch": 0.45, + "grad_norm": 2.014120464521342, + "learning_rate": 5.9724109836222595e-06, + "loss": 0.509, + "step": 5788 + }, + { + "epoch": 0.45, + "grad_norm": 1.8811009732055821, + "learning_rate": 5.97116343892709e-06, + "loss": 0.4625, + "step": 5789 + }, + { + "epoch": 0.45, + "grad_norm": 2.301368708006228, + "learning_rate": 5.969915831398915e-06, + "loss": 0.407, + "step": 5790 + }, + { + "epoch": 0.45, + "grad_norm": 1.957903136763112, + "learning_rate": 5.968668161118455e-06, + "loss": 0.4556, + "step": 5791 + }, + { + "epoch": 0.45, + "grad_norm": 0.6256141682239557, + "learning_rate": 5.967420428166432e-06, + "loss": 0.5057, + "step": 5792 + }, + { + "epoch": 0.45, + "grad_norm": 0.5909718018368074, + "learning_rate": 5.966172632623573e-06, + "loss": 0.5086, + "step": 5793 + }, + { + "epoch": 0.46, + "grad_norm": 3.248574451857549, + "learning_rate": 5.9649247745706085e-06, + "loss": 0.4518, + "step": 5794 + }, + { + "epoch": 0.46, + "grad_norm": 2.2609969695639482, + "learning_rate": 5.963676854088274e-06, + "loss": 0.4651, + "step": 5795 + }, + { + "epoch": 0.46, + "grad_norm": 0.5887988158852362, + "learning_rate": 5.962428871257307e-06, + "loss": 0.4871, + "step": 5796 + }, + { + "epoch": 0.46, + "grad_norm": 2.4628003113589134, + "learning_rate": 5.9611808261584505e-06, + "loss": 0.4872, + "step": 5797 + }, + { + "epoch": 0.46, + "grad_norm": 1.649759791011889, + "learning_rate": 5.959932718872453e-06, + "loss": 0.4902, + "step": 5798 + }, + { + "epoch": 0.46, + "grad_norm": 2.774957857779574, + "learning_rate": 5.9586845494800635e-06, + "loss": 0.4779, + "step": 5799 + }, + { + "epoch": 0.46, + "grad_norm": 1.8795058370938917, + "learning_rate": 5.957436318062039e-06, + "loss": 0.5101, + "step": 5800 + }, + { + "epoch": 0.46, + "grad_norm": 0.6475431171666449, + "learning_rate": 5.956188024699136e-06, + "loss": 0.482, + "step": 5801 + }, + { + "epoch": 0.46, + "grad_norm": 1.9402653298123897, + "learning_rate": 5.9549396694721185e-06, + "loss": 0.4635, + "step": 5802 + }, + { + "epoch": 0.46, + "grad_norm": 2.077613746180248, + "learning_rate": 5.953691252461754e-06, + "loss": 0.4788, + "step": 5803 + }, + { + "epoch": 0.46, + "grad_norm": 2.4645388326324116, + "learning_rate": 5.952442773748814e-06, + "loss": 0.4342, + "step": 5804 + }, + { + "epoch": 0.46, + "grad_norm": 1.7827814670709088, + "learning_rate": 5.951194233414072e-06, + "loss": 0.4392, + "step": 5805 + }, + { + "epoch": 0.46, + "grad_norm": 11.599889963554114, + "learning_rate": 5.949945631538308e-06, + "loss": 0.4467, + "step": 5806 + }, + { + "epoch": 0.46, + "grad_norm": 2.0564774717996097, + "learning_rate": 5.948696968202302e-06, + "loss": 0.4684, + "step": 5807 + }, + { + "epoch": 0.46, + "grad_norm": 1.8378023351677375, + "learning_rate": 5.9474482434868455e-06, + "loss": 0.4575, + "step": 5808 + }, + { + "epoch": 0.46, + "grad_norm": 2.3015476486609856, + "learning_rate": 5.946199457472726e-06, + "loss": 0.4994, + "step": 5809 + }, + { + "epoch": 0.46, + "grad_norm": 1.9291286241249834, + "learning_rate": 5.9449506102407395e-06, + "loss": 0.4256, + "step": 5810 + }, + { + "epoch": 0.46, + "grad_norm": 1.7290201644502774, + "learning_rate": 5.943701701871685e-06, + "loss": 0.4913, + "step": 5811 + }, + { + "epoch": 0.46, + "grad_norm": 1.7813529206888297, + "learning_rate": 5.942452732446365e-06, + "loss": 0.4252, + "step": 5812 + }, + { + "epoch": 0.46, + "grad_norm": 2.508227137901349, + "learning_rate": 5.941203702045584e-06, + "loss": 0.5078, + "step": 5813 + }, + { + "epoch": 0.46, + "grad_norm": 1.8953557197110757, + "learning_rate": 5.939954610750157e-06, + "loss": 0.4688, + "step": 5814 + }, + { + "epoch": 0.46, + "grad_norm": 2.188489622863546, + "learning_rate": 5.938705458640895e-06, + "loss": 0.4887, + "step": 5815 + }, + { + "epoch": 0.46, + "grad_norm": 2.8806101190693325, + "learning_rate": 5.93745624579862e-06, + "loss": 0.4637, + "step": 5816 + }, + { + "epoch": 0.46, + "grad_norm": 2.135551854779081, + "learning_rate": 5.93620697230415e-06, + "loss": 0.4196, + "step": 5817 + }, + { + "epoch": 0.46, + "grad_norm": 0.5877604130417955, + "learning_rate": 5.934957638238316e-06, + "loss": 0.5062, + "step": 5818 + }, + { + "epoch": 0.46, + "grad_norm": 2.6831847265126774, + "learning_rate": 5.933708243681944e-06, + "loss": 0.5115, + "step": 5819 + }, + { + "epoch": 0.46, + "grad_norm": 2.4060520112272163, + "learning_rate": 5.93245878871587e-06, + "loss": 0.4588, + "step": 5820 + }, + { + "epoch": 0.46, + "grad_norm": 3.305150963124646, + "learning_rate": 5.931209273420932e-06, + "loss": 0.4679, + "step": 5821 + }, + { + "epoch": 0.46, + "grad_norm": 1.8464200274340548, + "learning_rate": 5.929959697877974e-06, + "loss": 0.4407, + "step": 5822 + }, + { + "epoch": 0.46, + "grad_norm": 1.475215356706476, + "learning_rate": 5.928710062167837e-06, + "loss": 0.5106, + "step": 5823 + }, + { + "epoch": 0.46, + "grad_norm": 3.105605225564653, + "learning_rate": 5.927460366371377e-06, + "loss": 0.4918, + "step": 5824 + }, + { + "epoch": 0.46, + "grad_norm": 2.0596178826789147, + "learning_rate": 5.926210610569443e-06, + "loss": 0.4174, + "step": 5825 + }, + { + "epoch": 0.46, + "grad_norm": 2.0792331550889926, + "learning_rate": 5.924960794842894e-06, + "loss": 0.4528, + "step": 5826 + }, + { + "epoch": 0.46, + "grad_norm": 3.744924490311416, + "learning_rate": 5.923710919272593e-06, + "loss": 0.4131, + "step": 5827 + }, + { + "epoch": 0.46, + "grad_norm": 4.871907315539418, + "learning_rate": 5.922460983939403e-06, + "loss": 0.5108, + "step": 5828 + }, + { + "epoch": 0.46, + "grad_norm": 1.5535667202844723, + "learning_rate": 5.921210988924193e-06, + "loss": 0.5034, + "step": 5829 + }, + { + "epoch": 0.46, + "grad_norm": 1.8944184619568136, + "learning_rate": 5.919960934307839e-06, + "loss": 0.4306, + "step": 5830 + }, + { + "epoch": 0.46, + "grad_norm": 1.7615248959019227, + "learning_rate": 5.9187108201712155e-06, + "loss": 0.4884, + "step": 5831 + }, + { + "epoch": 0.46, + "grad_norm": 2.4685038738545786, + "learning_rate": 5.917460646595202e-06, + "loss": 0.4474, + "step": 5832 + }, + { + "epoch": 0.46, + "grad_norm": 3.0704001362419184, + "learning_rate": 5.916210413660687e-06, + "loss": 0.517, + "step": 5833 + }, + { + "epoch": 0.46, + "grad_norm": 2.0819600663793154, + "learning_rate": 5.914960121448556e-06, + "loss": 0.5301, + "step": 5834 + }, + { + "epoch": 0.46, + "grad_norm": 2.3578031514753697, + "learning_rate": 5.913709770039702e-06, + "loss": 0.4648, + "step": 5835 + }, + { + "epoch": 0.46, + "grad_norm": 1.8342873452017652, + "learning_rate": 5.912459359515022e-06, + "loss": 0.4688, + "step": 5836 + }, + { + "epoch": 0.46, + "grad_norm": 2.5883821569696783, + "learning_rate": 5.911208889955413e-06, + "loss": 0.4548, + "step": 5837 + }, + { + "epoch": 0.46, + "grad_norm": 0.6189352671434525, + "learning_rate": 5.909958361441782e-06, + "loss": 0.4862, + "step": 5838 + }, + { + "epoch": 0.46, + "grad_norm": 2.3753168564529217, + "learning_rate": 5.9087077740550354e-06, + "loss": 0.5519, + "step": 5839 + }, + { + "epoch": 0.46, + "grad_norm": 0.560424545639952, + "learning_rate": 5.907457127876085e-06, + "loss": 0.5098, + "step": 5840 + }, + { + "epoch": 0.46, + "grad_norm": 0.5161552311707477, + "learning_rate": 5.9062064229858465e-06, + "loss": 0.4769, + "step": 5841 + }, + { + "epoch": 0.46, + "grad_norm": 3.0366954379294806, + "learning_rate": 5.904955659465236e-06, + "loss": 0.4596, + "step": 5842 + }, + { + "epoch": 0.46, + "grad_norm": 0.5889292786206042, + "learning_rate": 5.9037048373951785e-06, + "loss": 0.5273, + "step": 5843 + }, + { + "epoch": 0.46, + "grad_norm": 1.4652182913938556, + "learning_rate": 5.902453956856601e-06, + "loss": 0.4592, + "step": 5844 + }, + { + "epoch": 0.46, + "grad_norm": 0.5794530765775763, + "learning_rate": 5.901203017930432e-06, + "loss": 0.4952, + "step": 5845 + }, + { + "epoch": 0.46, + "grad_norm": 2.5782303829548447, + "learning_rate": 5.8999520206976065e-06, + "loss": 0.454, + "step": 5846 + }, + { + "epoch": 0.46, + "grad_norm": 0.5941284893375037, + "learning_rate": 5.898700965239064e-06, + "loss": 0.5105, + "step": 5847 + }, + { + "epoch": 0.46, + "grad_norm": 2.0284841697966636, + "learning_rate": 5.897449851635743e-06, + "loss": 0.4768, + "step": 5848 + }, + { + "epoch": 0.46, + "grad_norm": 1.7926180153985334, + "learning_rate": 5.896198679968592e-06, + "loss": 0.4427, + "step": 5849 + }, + { + "epoch": 0.46, + "grad_norm": 2.3373848382284095, + "learning_rate": 5.894947450318559e-06, + "loss": 0.5071, + "step": 5850 + }, + { + "epoch": 0.46, + "grad_norm": 1.701287282429645, + "learning_rate": 5.8936961627665964e-06, + "loss": 0.4958, + "step": 5851 + }, + { + "epoch": 0.46, + "grad_norm": 1.6441484297602644, + "learning_rate": 5.89244481739366e-06, + "loss": 0.4595, + "step": 5852 + }, + { + "epoch": 0.46, + "grad_norm": 1.7045652437646563, + "learning_rate": 5.891193414280714e-06, + "loss": 0.4775, + "step": 5853 + }, + { + "epoch": 0.46, + "grad_norm": 1.8301237565529862, + "learning_rate": 5.889941953508716e-06, + "loss": 0.4946, + "step": 5854 + }, + { + "epoch": 0.46, + "grad_norm": 1.908083974841471, + "learning_rate": 5.888690435158642e-06, + "loss": 0.5058, + "step": 5855 + }, + { + "epoch": 0.46, + "grad_norm": 0.7230945609893141, + "learning_rate": 5.887438859311456e-06, + "loss": 0.5011, + "step": 5856 + }, + { + "epoch": 0.46, + "grad_norm": 0.6005786464384618, + "learning_rate": 5.886187226048138e-06, + "loss": 0.4746, + "step": 5857 + }, + { + "epoch": 0.46, + "grad_norm": 1.5545303461926905, + "learning_rate": 5.884935535449665e-06, + "loss": 0.4378, + "step": 5858 + }, + { + "epoch": 0.46, + "grad_norm": 6.338666886282173, + "learning_rate": 5.883683787597022e-06, + "loss": 0.4155, + "step": 5859 + }, + { + "epoch": 0.46, + "grad_norm": 1.9903876810844285, + "learning_rate": 5.882431982571193e-06, + "loss": 0.5025, + "step": 5860 + }, + { + "epoch": 0.46, + "grad_norm": 2.0290183277474707, + "learning_rate": 5.881180120453171e-06, + "loss": 0.4494, + "step": 5861 + }, + { + "epoch": 0.46, + "grad_norm": 0.6745766293342752, + "learning_rate": 5.8799282013239456e-06, + "loss": 0.5016, + "step": 5862 + }, + { + "epoch": 0.46, + "grad_norm": 2.27649687448856, + "learning_rate": 5.878676225264517e-06, + "loss": 0.5101, + "step": 5863 + }, + { + "epoch": 0.46, + "grad_norm": 3.188612340110292, + "learning_rate": 5.877424192355886e-06, + "loss": 0.4611, + "step": 5864 + }, + { + "epoch": 0.46, + "grad_norm": 2.719474048108263, + "learning_rate": 5.876172102679058e-06, + "loss": 0.453, + "step": 5865 + }, + { + "epoch": 0.46, + "grad_norm": 1.5800762730200366, + "learning_rate": 5.8749199563150415e-06, + "loss": 0.4837, + "step": 5866 + }, + { + "epoch": 0.46, + "grad_norm": 1.9540804621446486, + "learning_rate": 5.873667753344847e-06, + "loss": 0.4548, + "step": 5867 + }, + { + "epoch": 0.46, + "grad_norm": 1.7298447287171492, + "learning_rate": 5.872415493849494e-06, + "loss": 0.4989, + "step": 5868 + }, + { + "epoch": 0.46, + "grad_norm": 6.5251443249977985, + "learning_rate": 5.871163177909998e-06, + "loss": 0.4316, + "step": 5869 + }, + { + "epoch": 0.46, + "grad_norm": 0.6477194391251179, + "learning_rate": 5.869910805607384e-06, + "loss": 0.5328, + "step": 5870 + }, + { + "epoch": 0.46, + "grad_norm": 2.1100128686417925, + "learning_rate": 5.8686583770226805e-06, + "loss": 0.5086, + "step": 5871 + }, + { + "epoch": 0.46, + "grad_norm": 2.1297005354225123, + "learning_rate": 5.867405892236915e-06, + "loss": 0.4539, + "step": 5872 + }, + { + "epoch": 0.46, + "grad_norm": 1.6482849869182417, + "learning_rate": 5.866153351331123e-06, + "loss": 0.4884, + "step": 5873 + }, + { + "epoch": 0.46, + "grad_norm": 1.8791448557362458, + "learning_rate": 5.864900754386342e-06, + "loss": 0.5082, + "step": 5874 + }, + { + "epoch": 0.46, + "grad_norm": 0.5666265538926212, + "learning_rate": 5.863648101483614e-06, + "loss": 0.5033, + "step": 5875 + }, + { + "epoch": 0.46, + "grad_norm": 2.3887828814784986, + "learning_rate": 5.8623953927039845e-06, + "loss": 0.4425, + "step": 5876 + }, + { + "epoch": 0.46, + "grad_norm": 0.6129129590424698, + "learning_rate": 5.8611426281285e-06, + "loss": 0.5069, + "step": 5877 + }, + { + "epoch": 0.46, + "grad_norm": 0.5861507619302027, + "learning_rate": 5.859889807838216e-06, + "loss": 0.5193, + "step": 5878 + }, + { + "epoch": 0.46, + "grad_norm": 1.580961921002182, + "learning_rate": 5.858636931914184e-06, + "loss": 0.4114, + "step": 5879 + }, + { + "epoch": 0.46, + "grad_norm": 1.4795720861590165, + "learning_rate": 5.857384000437466e-06, + "loss": 0.4615, + "step": 5880 + }, + { + "epoch": 0.46, + "grad_norm": 1.6182215967166627, + "learning_rate": 5.8561310134891246e-06, + "loss": 0.4737, + "step": 5881 + }, + { + "epoch": 0.46, + "grad_norm": 2.003404284900797, + "learning_rate": 5.8548779711502275e-06, + "loss": 0.4866, + "step": 5882 + }, + { + "epoch": 0.46, + "grad_norm": 0.6406674874483526, + "learning_rate": 5.853624873501844e-06, + "loss": 0.5045, + "step": 5883 + }, + { + "epoch": 0.46, + "grad_norm": 1.6747791959148048, + "learning_rate": 5.8523717206250485e-06, + "loss": 0.4609, + "step": 5884 + }, + { + "epoch": 0.46, + "grad_norm": 1.48182764399353, + "learning_rate": 5.8511185126009165e-06, + "loss": 0.4513, + "step": 5885 + }, + { + "epoch": 0.46, + "grad_norm": 2.3013474834564467, + "learning_rate": 5.8498652495105315e-06, + "loss": 0.4803, + "step": 5886 + }, + { + "epoch": 0.46, + "grad_norm": 3.9761266099442807, + "learning_rate": 5.848611931434975e-06, + "loss": 0.4795, + "step": 5887 + }, + { + "epoch": 0.46, + "grad_norm": 2.265592159894518, + "learning_rate": 5.847358558455339e-06, + "loss": 0.5079, + "step": 5888 + }, + { + "epoch": 0.46, + "grad_norm": 1.6691885384520395, + "learning_rate": 5.84610513065271e-06, + "loss": 0.4334, + "step": 5889 + }, + { + "epoch": 0.46, + "grad_norm": 0.5669935443867741, + "learning_rate": 5.844851648108188e-06, + "loss": 0.4757, + "step": 5890 + }, + { + "epoch": 0.46, + "grad_norm": 1.7369213667952494, + "learning_rate": 5.843598110902866e-06, + "loss": 0.4559, + "step": 5891 + }, + { + "epoch": 0.46, + "grad_norm": 0.5834001184163092, + "learning_rate": 5.842344519117853e-06, + "loss": 0.5202, + "step": 5892 + }, + { + "epoch": 0.46, + "grad_norm": 0.5718210239542848, + "learning_rate": 5.841090872834249e-06, + "loss": 0.5046, + "step": 5893 + }, + { + "epoch": 0.46, + "grad_norm": 1.6549160364089908, + "learning_rate": 5.839837172133169e-06, + "loss": 0.4875, + "step": 5894 + }, + { + "epoch": 0.46, + "grad_norm": 0.5492828206515015, + "learning_rate": 5.838583417095721e-06, + "loss": 0.4891, + "step": 5895 + }, + { + "epoch": 0.46, + "grad_norm": 1.8771072769125519, + "learning_rate": 5.837329607803024e-06, + "loss": 0.4361, + "step": 5896 + }, + { + "epoch": 0.46, + "grad_norm": 2.086071737548597, + "learning_rate": 5.836075744336196e-06, + "loss": 0.4442, + "step": 5897 + }, + { + "epoch": 0.46, + "grad_norm": 0.6688669223306436, + "learning_rate": 5.834821826776361e-06, + "loss": 0.5062, + "step": 5898 + }, + { + "epoch": 0.46, + "grad_norm": 0.5505965613061234, + "learning_rate": 5.833567855204646e-06, + "loss": 0.5031, + "step": 5899 + }, + { + "epoch": 0.46, + "grad_norm": 2.4575038642462506, + "learning_rate": 5.832313829702181e-06, + "loss": 0.4732, + "step": 5900 + }, + { + "epoch": 0.46, + "grad_norm": 0.5453788505134876, + "learning_rate": 5.8310597503501e-06, + "loss": 0.5071, + "step": 5901 + }, + { + "epoch": 0.46, + "grad_norm": 2.079281880434146, + "learning_rate": 5.82980561722954e-06, + "loss": 0.4382, + "step": 5902 + }, + { + "epoch": 0.46, + "grad_norm": 1.8653142660971853, + "learning_rate": 5.828551430421642e-06, + "loss": 0.4877, + "step": 5903 + }, + { + "epoch": 0.46, + "grad_norm": 2.0731564170315764, + "learning_rate": 5.8272971900075516e-06, + "loss": 0.4532, + "step": 5904 + }, + { + "epoch": 0.46, + "grad_norm": 5.356321262044365, + "learning_rate": 5.826042896068414e-06, + "loss": 0.4857, + "step": 5905 + }, + { + "epoch": 0.46, + "grad_norm": 1.9722680919458162, + "learning_rate": 5.824788548685383e-06, + "loss": 0.499, + "step": 5906 + }, + { + "epoch": 0.46, + "grad_norm": 1.5622958669665656, + "learning_rate": 5.82353414793961e-06, + "loss": 0.4293, + "step": 5907 + }, + { + "epoch": 0.46, + "grad_norm": 1.684607302907042, + "learning_rate": 5.822279693912257e-06, + "loss": 0.45, + "step": 5908 + }, + { + "epoch": 0.46, + "grad_norm": 2.1279044288798445, + "learning_rate": 5.821025186684482e-06, + "loss": 0.4696, + "step": 5909 + }, + { + "epoch": 0.46, + "grad_norm": 1.439485712706476, + "learning_rate": 5.819770626337451e-06, + "loss": 0.4364, + "step": 5910 + }, + { + "epoch": 0.46, + "grad_norm": 2.46258574072006, + "learning_rate": 5.818516012952335e-06, + "loss": 0.4123, + "step": 5911 + }, + { + "epoch": 0.46, + "grad_norm": 1.6629937964388954, + "learning_rate": 5.817261346610301e-06, + "loss": 0.4902, + "step": 5912 + }, + { + "epoch": 0.46, + "grad_norm": 1.6959620808866966, + "learning_rate": 5.81600662739253e-06, + "loss": 0.4865, + "step": 5913 + }, + { + "epoch": 0.46, + "grad_norm": 1.6255481884661898, + "learning_rate": 5.814751855380195e-06, + "loss": 0.4597, + "step": 5914 + }, + { + "epoch": 0.46, + "grad_norm": 3.4048018076923885, + "learning_rate": 5.813497030654483e-06, + "loss": 0.4697, + "step": 5915 + }, + { + "epoch": 0.46, + "grad_norm": 6.775999371655901, + "learning_rate": 5.812242153296574e-06, + "loss": 0.4516, + "step": 5916 + }, + { + "epoch": 0.46, + "grad_norm": 0.6307908014036668, + "learning_rate": 5.810987223387664e-06, + "loss": 0.516, + "step": 5917 + }, + { + "epoch": 0.46, + "grad_norm": 1.6439773845493402, + "learning_rate": 5.80973224100894e-06, + "loss": 0.4519, + "step": 5918 + }, + { + "epoch": 0.46, + "grad_norm": 2.7136551366272617, + "learning_rate": 5.808477206241599e-06, + "loss": 0.4104, + "step": 5919 + }, + { + "epoch": 0.46, + "grad_norm": 2.213209614489724, + "learning_rate": 5.80722211916684e-06, + "loss": 0.4363, + "step": 5920 + }, + { + "epoch": 0.47, + "grad_norm": 0.5592161993665855, + "learning_rate": 5.805966979865868e-06, + "loss": 0.4957, + "step": 5921 + }, + { + "epoch": 0.47, + "grad_norm": 2.0948468695595435, + "learning_rate": 5.804711788419885e-06, + "loss": 0.5055, + "step": 5922 + }, + { + "epoch": 0.47, + "grad_norm": 1.531894528417516, + "learning_rate": 5.8034565449101025e-06, + "loss": 0.4574, + "step": 5923 + }, + { + "epoch": 0.47, + "grad_norm": 1.7642893769557353, + "learning_rate": 5.802201249417732e-06, + "loss": 0.4951, + "step": 5924 + }, + { + "epoch": 0.47, + "grad_norm": 2.61287748665114, + "learning_rate": 5.8009459020239885e-06, + "loss": 0.4701, + "step": 5925 + }, + { + "epoch": 0.47, + "grad_norm": 7.672349005699024, + "learning_rate": 5.799690502810096e-06, + "loss": 0.4849, + "step": 5926 + }, + { + "epoch": 0.47, + "grad_norm": 1.9783980681204234, + "learning_rate": 5.798435051857273e-06, + "loss": 0.4968, + "step": 5927 + }, + { + "epoch": 0.47, + "grad_norm": 1.5287259404572018, + "learning_rate": 5.797179549246746e-06, + "loss": 0.4846, + "step": 5928 + }, + { + "epoch": 0.47, + "grad_norm": 1.5303634349547306, + "learning_rate": 5.795923995059747e-06, + "loss": 0.4593, + "step": 5929 + }, + { + "epoch": 0.47, + "grad_norm": 3.494214973666261, + "learning_rate": 5.794668389377503e-06, + "loss": 0.5137, + "step": 5930 + }, + { + "epoch": 0.47, + "grad_norm": 1.8227509442146623, + "learning_rate": 5.793412732281258e-06, + "loss": 0.4355, + "step": 5931 + }, + { + "epoch": 0.47, + "grad_norm": 1.8650901351854052, + "learning_rate": 5.792157023852244e-06, + "loss": 0.4687, + "step": 5932 + }, + { + "epoch": 0.47, + "grad_norm": 3.2814056773803286, + "learning_rate": 5.790901264171709e-06, + "loss": 0.5157, + "step": 5933 + }, + { + "epoch": 0.47, + "grad_norm": 2.230403025635125, + "learning_rate": 5.789645453320896e-06, + "loss": 0.4747, + "step": 5934 + }, + { + "epoch": 0.47, + "grad_norm": 1.7285978324665887, + "learning_rate": 5.788389591381055e-06, + "loss": 0.4313, + "step": 5935 + }, + { + "epoch": 0.47, + "grad_norm": 1.8364205903394037, + "learning_rate": 5.787133678433437e-06, + "loss": 0.47, + "step": 5936 + }, + { + "epoch": 0.47, + "grad_norm": 1.6299063429864755, + "learning_rate": 5.785877714559303e-06, + "loss": 0.4333, + "step": 5937 + }, + { + "epoch": 0.47, + "grad_norm": 10.165181644438015, + "learning_rate": 5.7846216998399065e-06, + "loss": 0.4852, + "step": 5938 + }, + { + "epoch": 0.47, + "grad_norm": 1.5364922797222298, + "learning_rate": 5.783365634356514e-06, + "loss": 0.4798, + "step": 5939 + }, + { + "epoch": 0.47, + "grad_norm": 1.8760757305868798, + "learning_rate": 5.7821095181903885e-06, + "loss": 0.5114, + "step": 5940 + }, + { + "epoch": 0.47, + "grad_norm": 2.07369167041546, + "learning_rate": 5.7808533514228014e-06, + "loss": 0.4416, + "step": 5941 + }, + { + "epoch": 0.47, + "grad_norm": 0.6292196711580126, + "learning_rate": 5.7795971341350235e-06, + "loss": 0.4933, + "step": 5942 + }, + { + "epoch": 0.47, + "grad_norm": 1.3910067975328642, + "learning_rate": 5.778340866408331e-06, + "loss": 0.4906, + "step": 5943 + }, + { + "epoch": 0.47, + "grad_norm": 2.3985509493240404, + "learning_rate": 5.7770845483240015e-06, + "loss": 0.4674, + "step": 5944 + }, + { + "epoch": 0.47, + "grad_norm": 1.942834929969426, + "learning_rate": 5.775828179963318e-06, + "loss": 0.4643, + "step": 5945 + }, + { + "epoch": 0.47, + "grad_norm": 1.772759246252017, + "learning_rate": 5.7745717614075695e-06, + "loss": 0.5548, + "step": 5946 + }, + { + "epoch": 0.47, + "grad_norm": 3.1086487161254115, + "learning_rate": 5.773315292738038e-06, + "loss": 0.4232, + "step": 5947 + }, + { + "epoch": 0.47, + "grad_norm": 1.6937888880106402, + "learning_rate": 5.77205877403602e-06, + "loss": 0.4371, + "step": 5948 + }, + { + "epoch": 0.47, + "grad_norm": 2.227470297728218, + "learning_rate": 5.770802205382809e-06, + "loss": 0.5161, + "step": 5949 + }, + { + "epoch": 0.47, + "grad_norm": 2.3958380194699096, + "learning_rate": 5.769545586859704e-06, + "loss": 0.4857, + "step": 5950 + }, + { + "epoch": 0.47, + "grad_norm": 1.513528106049459, + "learning_rate": 5.768288918548005e-06, + "loss": 0.4596, + "step": 5951 + }, + { + "epoch": 0.47, + "grad_norm": 4.580650690269417, + "learning_rate": 5.76703220052902e-06, + "loss": 0.4364, + "step": 5952 + }, + { + "epoch": 0.47, + "grad_norm": 1.6395933368030706, + "learning_rate": 5.765775432884053e-06, + "loss": 0.4253, + "step": 5953 + }, + { + "epoch": 0.47, + "grad_norm": 0.643788452978433, + "learning_rate": 5.764518615694419e-06, + "loss": 0.5019, + "step": 5954 + }, + { + "epoch": 0.47, + "grad_norm": 1.8066383558020616, + "learning_rate": 5.76326174904143e-06, + "loss": 0.5148, + "step": 5955 + }, + { + "epoch": 0.47, + "grad_norm": 0.5725243410183003, + "learning_rate": 5.7620048330064045e-06, + "loss": 0.5071, + "step": 5956 + }, + { + "epoch": 0.47, + "grad_norm": 3.564548069219352, + "learning_rate": 5.760747867670663e-06, + "loss": 0.4497, + "step": 5957 + }, + { + "epoch": 0.47, + "grad_norm": 1.5178917865002612, + "learning_rate": 5.7594908531155305e-06, + "loss": 0.4558, + "step": 5958 + }, + { + "epoch": 0.47, + "grad_norm": 12.702301388318126, + "learning_rate": 5.7582337894223305e-06, + "loss": 0.5076, + "step": 5959 + }, + { + "epoch": 0.47, + "grad_norm": 0.597597928948306, + "learning_rate": 5.756976676672399e-06, + "loss": 0.5043, + "step": 5960 + }, + { + "epoch": 0.47, + "grad_norm": 0.5859302154306599, + "learning_rate": 5.755719514947064e-06, + "loss": 0.503, + "step": 5961 + }, + { + "epoch": 0.47, + "grad_norm": 2.713056976452306, + "learning_rate": 5.754462304327668e-06, + "loss": 0.4617, + "step": 5962 + }, + { + "epoch": 0.47, + "grad_norm": 0.571608808028225, + "learning_rate": 5.753205044895547e-06, + "loss": 0.4996, + "step": 5963 + }, + { + "epoch": 0.47, + "grad_norm": 1.6667582144887978, + "learning_rate": 5.751947736732045e-06, + "loss": 0.4845, + "step": 5964 + }, + { + "epoch": 0.47, + "grad_norm": 1.8558307893142236, + "learning_rate": 5.750690379918508e-06, + "loss": 0.5526, + "step": 5965 + }, + { + "epoch": 0.47, + "grad_norm": 2.0765797703396154, + "learning_rate": 5.749432974536285e-06, + "loss": 0.4765, + "step": 5966 + }, + { + "epoch": 0.47, + "grad_norm": 2.34182498614451, + "learning_rate": 5.748175520666729e-06, + "loss": 0.4294, + "step": 5967 + }, + { + "epoch": 0.47, + "grad_norm": 1.7588767464239168, + "learning_rate": 5.746918018391196e-06, + "loss": 0.4777, + "step": 5968 + }, + { + "epoch": 0.47, + "grad_norm": 2.1987124651469236, + "learning_rate": 5.745660467791045e-06, + "loss": 0.4398, + "step": 5969 + }, + { + "epoch": 0.47, + "grad_norm": 0.6259079926310644, + "learning_rate": 5.744402868947635e-06, + "loss": 0.5044, + "step": 5970 + }, + { + "epoch": 0.47, + "grad_norm": 2.255531572357372, + "learning_rate": 5.743145221942333e-06, + "loss": 0.4325, + "step": 5971 + }, + { + "epoch": 0.47, + "grad_norm": 2.318388191146558, + "learning_rate": 5.74188752685651e-06, + "loss": 0.5105, + "step": 5972 + }, + { + "epoch": 0.47, + "grad_norm": 1.5472737298567707, + "learning_rate": 5.740629783771533e-06, + "loss": 0.4963, + "step": 5973 + }, + { + "epoch": 0.47, + "grad_norm": 1.511725091491056, + "learning_rate": 5.739371992768779e-06, + "loss": 0.5015, + "step": 5974 + }, + { + "epoch": 0.47, + "grad_norm": 2.255027601315444, + "learning_rate": 5.738114153929622e-06, + "loss": 0.4625, + "step": 5975 + }, + { + "epoch": 0.47, + "grad_norm": 1.606208971416444, + "learning_rate": 5.736856267335447e-06, + "loss": 0.5026, + "step": 5976 + }, + { + "epoch": 0.47, + "grad_norm": 0.5677110898383697, + "learning_rate": 5.735598333067634e-06, + "loss": 0.493, + "step": 5977 + }, + { + "epoch": 0.47, + "grad_norm": 1.5046261711409938, + "learning_rate": 5.734340351207572e-06, + "loss": 0.449, + "step": 5978 + }, + { + "epoch": 0.47, + "grad_norm": 2.634642446531102, + "learning_rate": 5.733082321836649e-06, + "loss": 0.4347, + "step": 5979 + }, + { + "epoch": 0.47, + "grad_norm": 1.5352523302536238, + "learning_rate": 5.7318242450362594e-06, + "loss": 0.4791, + "step": 5980 + }, + { + "epoch": 0.47, + "grad_norm": 1.6656548186386357, + "learning_rate": 5.7305661208877995e-06, + "loss": 0.402, + "step": 5981 + }, + { + "epoch": 0.47, + "grad_norm": 1.5973999745947913, + "learning_rate": 5.729307949472666e-06, + "loss": 0.4559, + "step": 5982 + }, + { + "epoch": 0.47, + "grad_norm": 2.0682700701942016, + "learning_rate": 5.728049730872262e-06, + "loss": 0.489, + "step": 5983 + }, + { + "epoch": 0.47, + "grad_norm": 4.3387261942253, + "learning_rate": 5.7267914651679935e-06, + "loss": 0.4133, + "step": 5984 + }, + { + "epoch": 0.47, + "grad_norm": 1.884499302203894, + "learning_rate": 5.725533152441267e-06, + "loss": 0.4843, + "step": 5985 + }, + { + "epoch": 0.47, + "grad_norm": 4.0349422675370645, + "learning_rate": 5.724274792773496e-06, + "loss": 0.4691, + "step": 5986 + }, + { + "epoch": 0.47, + "grad_norm": 1.8249198288310222, + "learning_rate": 5.723016386246093e-06, + "loss": 0.4366, + "step": 5987 + }, + { + "epoch": 0.47, + "grad_norm": 1.786384309504527, + "learning_rate": 5.7217579329404745e-06, + "loss": 0.4804, + "step": 5988 + }, + { + "epoch": 0.47, + "grad_norm": 1.3222440296848124, + "learning_rate": 5.720499432938065e-06, + "loss": 0.5139, + "step": 5989 + }, + { + "epoch": 0.47, + "grad_norm": 1.9655359874409943, + "learning_rate": 5.719240886320281e-06, + "loss": 0.499, + "step": 5990 + }, + { + "epoch": 0.47, + "grad_norm": 1.7512345875837911, + "learning_rate": 5.717982293168555e-06, + "loss": 0.5165, + "step": 5991 + }, + { + "epoch": 0.47, + "grad_norm": 1.6144634047367286, + "learning_rate": 5.716723653564312e-06, + "loss": 0.4858, + "step": 5992 + }, + { + "epoch": 0.47, + "grad_norm": 1.5081456494765408, + "learning_rate": 5.715464967588987e-06, + "loss": 0.473, + "step": 5993 + }, + { + "epoch": 0.47, + "grad_norm": 1.9960947497187504, + "learning_rate": 5.714206235324015e-06, + "loss": 0.4872, + "step": 5994 + }, + { + "epoch": 0.47, + "grad_norm": 0.5849383844809959, + "learning_rate": 5.712947456850834e-06, + "loss": 0.5018, + "step": 5995 + }, + { + "epoch": 0.47, + "grad_norm": 1.8764390107526074, + "learning_rate": 5.7116886322508845e-06, + "loss": 0.4967, + "step": 5996 + }, + { + "epoch": 0.47, + "grad_norm": 2.1808623988109983, + "learning_rate": 5.710429761605613e-06, + "loss": 0.431, + "step": 5997 + }, + { + "epoch": 0.47, + "grad_norm": 1.7510816402407445, + "learning_rate": 5.709170844996463e-06, + "loss": 0.5047, + "step": 5998 + }, + { + "epoch": 0.47, + "grad_norm": 1.5818425859911756, + "learning_rate": 5.707911882504888e-06, + "loss": 0.4847, + "step": 5999 + }, + { + "epoch": 0.47, + "grad_norm": 0.5890593975580446, + "learning_rate": 5.706652874212341e-06, + "loss": 0.515, + "step": 6000 + }, + { + "epoch": 0.47, + "grad_norm": 1.5295822772699232, + "learning_rate": 5.705393820200276e-06, + "loss": 0.4489, + "step": 6001 + }, + { + "epoch": 0.47, + "grad_norm": 2.9210435848516876, + "learning_rate": 5.704134720550155e-06, + "loss": 0.4681, + "step": 6002 + }, + { + "epoch": 0.47, + "grad_norm": 1.594082442984845, + "learning_rate": 5.702875575343438e-06, + "loss": 0.4982, + "step": 6003 + }, + { + "epoch": 0.47, + "grad_norm": 2.251270663811119, + "learning_rate": 5.70161638466159e-06, + "loss": 0.4951, + "step": 6004 + }, + { + "epoch": 0.47, + "grad_norm": 1.6134939082862363, + "learning_rate": 5.700357148586082e-06, + "loss": 0.4499, + "step": 6005 + }, + { + "epoch": 0.47, + "grad_norm": 3.00655544046567, + "learning_rate": 5.6990978671983795e-06, + "loss": 0.4481, + "step": 6006 + }, + { + "epoch": 0.47, + "grad_norm": 1.6057074508771052, + "learning_rate": 5.697838540579963e-06, + "loss": 0.4855, + "step": 6007 + }, + { + "epoch": 0.47, + "grad_norm": 1.670485100193163, + "learning_rate": 5.696579168812304e-06, + "loss": 0.4607, + "step": 6008 + }, + { + "epoch": 0.47, + "grad_norm": 1.6554599349703256, + "learning_rate": 5.6953197519768845e-06, + "loss": 0.4721, + "step": 6009 + }, + { + "epoch": 0.47, + "grad_norm": 1.5312037659960067, + "learning_rate": 5.694060290155186e-06, + "loss": 0.4452, + "step": 6010 + }, + { + "epoch": 0.47, + "grad_norm": 1.7912825747360135, + "learning_rate": 5.692800783428696e-06, + "loss": 0.4743, + "step": 6011 + }, + { + "epoch": 0.47, + "grad_norm": 5.044010005804708, + "learning_rate": 5.6915412318789e-06, + "loss": 0.485, + "step": 6012 + }, + { + "epoch": 0.47, + "grad_norm": 2.4431739806162485, + "learning_rate": 5.690281635587291e-06, + "loss": 0.5, + "step": 6013 + }, + { + "epoch": 0.47, + "grad_norm": 2.4282775356192032, + "learning_rate": 5.6890219946353645e-06, + "loss": 0.4992, + "step": 6014 + }, + { + "epoch": 0.47, + "grad_norm": 2.7417476308872604, + "learning_rate": 5.687762309104615e-06, + "loss": 0.4792, + "step": 6015 + }, + { + "epoch": 0.47, + "grad_norm": 2.1528903956823586, + "learning_rate": 5.686502579076544e-06, + "loss": 0.442, + "step": 6016 + }, + { + "epoch": 0.47, + "grad_norm": 1.4983715341979529, + "learning_rate": 5.685242804632655e-06, + "loss": 0.4506, + "step": 6017 + }, + { + "epoch": 0.47, + "grad_norm": 2.2836038021960925, + "learning_rate": 5.683982985854452e-06, + "loss": 0.5072, + "step": 6018 + }, + { + "epoch": 0.47, + "grad_norm": 1.510402468002663, + "learning_rate": 5.682723122823446e-06, + "loss": 0.482, + "step": 6019 + }, + { + "epoch": 0.47, + "grad_norm": 1.85107953551789, + "learning_rate": 5.681463215621146e-06, + "loss": 0.4981, + "step": 6020 + }, + { + "epoch": 0.47, + "grad_norm": 2.162470487262685, + "learning_rate": 5.680203264329066e-06, + "loss": 0.5077, + "step": 6021 + }, + { + "epoch": 0.47, + "grad_norm": 2.2268190513228987, + "learning_rate": 5.678943269028727e-06, + "loss": 0.4319, + "step": 6022 + }, + { + "epoch": 0.47, + "grad_norm": 2.5675505839119994, + "learning_rate": 5.677683229801646e-06, + "loss": 0.5217, + "step": 6023 + }, + { + "epoch": 0.47, + "grad_norm": 1.7178357654853753, + "learning_rate": 5.6764231467293465e-06, + "loss": 0.4491, + "step": 6024 + }, + { + "epoch": 0.47, + "grad_norm": 1.6046995496007133, + "learning_rate": 5.675163019893354e-06, + "loss": 0.4769, + "step": 6025 + }, + { + "epoch": 0.47, + "grad_norm": 4.194152488506448, + "learning_rate": 5.673902849375198e-06, + "loss": 0.4473, + "step": 6026 + }, + { + "epoch": 0.47, + "grad_norm": 1.8001871807266456, + "learning_rate": 5.6726426352564065e-06, + "loss": 0.5182, + "step": 6027 + }, + { + "epoch": 0.47, + "grad_norm": 1.9959096874324542, + "learning_rate": 5.67138237761852e-06, + "loss": 0.4728, + "step": 6028 + }, + { + "epoch": 0.47, + "grad_norm": 0.5764329495487379, + "learning_rate": 5.6701220765430694e-06, + "loss": 0.4846, + "step": 6029 + }, + { + "epoch": 0.47, + "grad_norm": 2.1121280222004617, + "learning_rate": 5.668861732111599e-06, + "loss": 0.4644, + "step": 6030 + }, + { + "epoch": 0.47, + "grad_norm": 6.604238939529686, + "learning_rate": 5.667601344405649e-06, + "loss": 0.4449, + "step": 6031 + }, + { + "epoch": 0.47, + "grad_norm": 1.741670168649439, + "learning_rate": 5.666340913506766e-06, + "loss": 0.4586, + "step": 6032 + }, + { + "epoch": 0.47, + "grad_norm": 0.5380322233169764, + "learning_rate": 5.665080439496495e-06, + "loss": 0.483, + "step": 6033 + }, + { + "epoch": 0.47, + "grad_norm": 2.9073685314972835, + "learning_rate": 5.663819922456393e-06, + "loss": 0.4518, + "step": 6034 + }, + { + "epoch": 0.47, + "grad_norm": 1.8744899268265252, + "learning_rate": 5.662559362468007e-06, + "loss": 0.4964, + "step": 6035 + }, + { + "epoch": 0.47, + "grad_norm": 1.9140057755041548, + "learning_rate": 5.661298759612901e-06, + "loss": 0.4472, + "step": 6036 + }, + { + "epoch": 0.47, + "grad_norm": 1.394471346075338, + "learning_rate": 5.6600381139726264e-06, + "loss": 0.4904, + "step": 6037 + }, + { + "epoch": 0.47, + "grad_norm": 2.83579527797309, + "learning_rate": 5.6587774256287505e-06, + "loss": 0.4636, + "step": 6038 + }, + { + "epoch": 0.47, + "grad_norm": 1.8847597530243525, + "learning_rate": 5.657516694662838e-06, + "loss": 0.473, + "step": 6039 + }, + { + "epoch": 0.47, + "grad_norm": 0.5636579285514544, + "learning_rate": 5.656255921156455e-06, + "loss": 0.4866, + "step": 6040 + }, + { + "epoch": 0.47, + "grad_norm": 2.0074486311426494, + "learning_rate": 5.654995105191172e-06, + "loss": 0.459, + "step": 6041 + }, + { + "epoch": 0.47, + "grad_norm": 0.5902853643738514, + "learning_rate": 5.653734246848563e-06, + "loss": 0.5036, + "step": 6042 + }, + { + "epoch": 0.47, + "grad_norm": 2.108414795695215, + "learning_rate": 5.652473346210203e-06, + "loss": 0.4482, + "step": 6043 + }, + { + "epoch": 0.47, + "grad_norm": 1.4608609173084268, + "learning_rate": 5.651212403357672e-06, + "loss": 0.477, + "step": 6044 + }, + { + "epoch": 0.47, + "grad_norm": 3.2100752636639958, + "learning_rate": 5.649951418372549e-06, + "loss": 0.4747, + "step": 6045 + }, + { + "epoch": 0.47, + "grad_norm": 1.9105591251298744, + "learning_rate": 5.648690391336421e-06, + "loss": 0.4484, + "step": 6046 + }, + { + "epoch": 0.47, + "grad_norm": 3.0697762911744095, + "learning_rate": 5.647429322330872e-06, + "loss": 0.4672, + "step": 6047 + }, + { + "epoch": 0.47, + "grad_norm": 0.5719096662073186, + "learning_rate": 5.646168211437494e-06, + "loss": 0.5084, + "step": 6048 + }, + { + "epoch": 0.48, + "grad_norm": 1.9114564898503823, + "learning_rate": 5.644907058737875e-06, + "loss": 0.4351, + "step": 6049 + }, + { + "epoch": 0.48, + "grad_norm": 1.9712835952829533, + "learning_rate": 5.643645864313616e-06, + "loss": 0.4727, + "step": 6050 + }, + { + "epoch": 0.48, + "grad_norm": 1.8842755280955312, + "learning_rate": 5.6423846282463105e-06, + "loss": 0.4952, + "step": 6051 + }, + { + "epoch": 0.48, + "grad_norm": 2.025008643029227, + "learning_rate": 5.64112335061756e-06, + "loss": 0.4786, + "step": 6052 + }, + { + "epoch": 0.48, + "grad_norm": 1.7560700105767266, + "learning_rate": 5.639862031508967e-06, + "loss": 0.499, + "step": 6053 + }, + { + "epoch": 0.48, + "grad_norm": 2.467559970839058, + "learning_rate": 5.638600671002139e-06, + "loss": 0.4693, + "step": 6054 + }, + { + "epoch": 0.48, + "grad_norm": 1.932307086996423, + "learning_rate": 5.637339269178682e-06, + "loss": 0.4792, + "step": 6055 + }, + { + "epoch": 0.48, + "grad_norm": 2.1177779985614262, + "learning_rate": 5.6360778261202075e-06, + "loss": 0.4893, + "step": 6056 + }, + { + "epoch": 0.48, + "grad_norm": 2.2156268966375663, + "learning_rate": 5.6348163419083316e-06, + "loss": 0.48, + "step": 6057 + }, + { + "epoch": 0.48, + "grad_norm": 0.5994514785412715, + "learning_rate": 5.633554816624667e-06, + "loss": 0.5243, + "step": 6058 + }, + { + "epoch": 0.48, + "grad_norm": 1.9005100426541817, + "learning_rate": 5.632293250350835e-06, + "loss": 0.4703, + "step": 6059 + }, + { + "epoch": 0.48, + "grad_norm": 0.5890460914355664, + "learning_rate": 5.631031643168457e-06, + "loss": 0.4993, + "step": 6060 + }, + { + "epoch": 0.48, + "grad_norm": 1.7838201035293735, + "learning_rate": 5.629769995159157e-06, + "loss": 0.4472, + "step": 6061 + }, + { + "epoch": 0.48, + "grad_norm": 3.437953889305717, + "learning_rate": 5.628508306404563e-06, + "loss": 0.4935, + "step": 6062 + }, + { + "epoch": 0.48, + "grad_norm": 1.5280163401419353, + "learning_rate": 5.627246576986303e-06, + "loss": 0.4689, + "step": 6063 + }, + { + "epoch": 0.48, + "grad_norm": 1.6750418086010357, + "learning_rate": 5.625984806986009e-06, + "loss": 0.4483, + "step": 6064 + }, + { + "epoch": 0.48, + "grad_norm": 2.7533496898342817, + "learning_rate": 5.624722996485319e-06, + "loss": 0.4326, + "step": 6065 + }, + { + "epoch": 0.48, + "grad_norm": 2.065431855140561, + "learning_rate": 5.623461145565866e-06, + "loss": 0.4579, + "step": 6066 + }, + { + "epoch": 0.48, + "grad_norm": 2.1974513610954634, + "learning_rate": 5.622199254309295e-06, + "loss": 0.4415, + "step": 6067 + }, + { + "epoch": 0.48, + "grad_norm": 0.6402534559665537, + "learning_rate": 5.620937322797244e-06, + "loss": 0.5222, + "step": 6068 + }, + { + "epoch": 0.48, + "grad_norm": 5.124427374116349, + "learning_rate": 5.619675351111363e-06, + "loss": 0.5299, + "step": 6069 + }, + { + "epoch": 0.48, + "grad_norm": 1.8035771347864251, + "learning_rate": 5.618413339333295e-06, + "loss": 0.5137, + "step": 6070 + }, + { + "epoch": 0.48, + "grad_norm": 1.7843059960627126, + "learning_rate": 5.617151287544694e-06, + "loss": 0.5129, + "step": 6071 + }, + { + "epoch": 0.48, + "grad_norm": 1.5024451675296469, + "learning_rate": 5.615889195827211e-06, + "loss": 0.4323, + "step": 6072 + }, + { + "epoch": 0.48, + "grad_norm": 1.5713383423867264, + "learning_rate": 5.614627064262504e-06, + "loss": 0.4951, + "step": 6073 + }, + { + "epoch": 0.48, + "grad_norm": 1.7226451461293641, + "learning_rate": 5.61336489293223e-06, + "loss": 0.4621, + "step": 6074 + }, + { + "epoch": 0.48, + "grad_norm": 1.8740923717743874, + "learning_rate": 5.612102681918049e-06, + "loss": 0.4691, + "step": 6075 + }, + { + "epoch": 0.48, + "grad_norm": 2.053564382222536, + "learning_rate": 5.610840431301625e-06, + "loss": 0.4786, + "step": 6076 + }, + { + "epoch": 0.48, + "grad_norm": 1.8827955650804251, + "learning_rate": 5.609578141164627e-06, + "loss": 0.5018, + "step": 6077 + }, + { + "epoch": 0.48, + "grad_norm": 1.5286405292282421, + "learning_rate": 5.608315811588717e-06, + "loss": 0.4475, + "step": 6078 + }, + { + "epoch": 0.48, + "grad_norm": 0.57264488110248, + "learning_rate": 5.6070534426555725e-06, + "loss": 0.5025, + "step": 6079 + }, + { + "epoch": 0.48, + "grad_norm": 1.3788487483075431, + "learning_rate": 5.6057910344468625e-06, + "loss": 0.4813, + "step": 6080 + }, + { + "epoch": 0.48, + "grad_norm": 1.6957515654254192, + "learning_rate": 5.604528587044266e-06, + "loss": 0.4587, + "step": 6081 + }, + { + "epoch": 0.48, + "grad_norm": 1.813128581815455, + "learning_rate": 5.60326610052946e-06, + "loss": 0.5202, + "step": 6082 + }, + { + "epoch": 0.48, + "grad_norm": 2.1432462517915516, + "learning_rate": 5.6020035749841264e-06, + "loss": 0.4449, + "step": 6083 + }, + { + "epoch": 0.48, + "grad_norm": 1.5484477788718938, + "learning_rate": 5.600741010489949e-06, + "loss": 0.4968, + "step": 6084 + }, + { + "epoch": 0.48, + "grad_norm": 4.259686135117147, + "learning_rate": 5.599478407128616e-06, + "loss": 0.5136, + "step": 6085 + }, + { + "epoch": 0.48, + "grad_norm": 1.7588980481346306, + "learning_rate": 5.598215764981811e-06, + "loss": 0.553, + "step": 6086 + }, + { + "epoch": 0.48, + "grad_norm": 1.8376235175454934, + "learning_rate": 5.596953084131231e-06, + "loss": 0.472, + "step": 6087 + }, + { + "epoch": 0.48, + "grad_norm": 2.05214934776534, + "learning_rate": 5.595690364658567e-06, + "loss": 0.4933, + "step": 6088 + }, + { + "epoch": 0.48, + "grad_norm": 1.6898989348272901, + "learning_rate": 5.594427606645516e-06, + "loss": 0.5188, + "step": 6089 + }, + { + "epoch": 0.48, + "grad_norm": 1.7363819043376463, + "learning_rate": 5.593164810173776e-06, + "loss": 0.5444, + "step": 6090 + }, + { + "epoch": 0.48, + "grad_norm": 2.309081460060708, + "learning_rate": 5.591901975325048e-06, + "loss": 0.5077, + "step": 6091 + }, + { + "epoch": 0.48, + "grad_norm": 2.005564923590532, + "learning_rate": 5.590639102181038e-06, + "loss": 0.4982, + "step": 6092 + }, + { + "epoch": 0.48, + "grad_norm": 2.088026813605811, + "learning_rate": 5.589376190823449e-06, + "loss": 0.4466, + "step": 6093 + }, + { + "epoch": 0.48, + "grad_norm": 1.685541596057917, + "learning_rate": 5.5881132413339935e-06, + "loss": 0.4526, + "step": 6094 + }, + { + "epoch": 0.48, + "grad_norm": 3.3696106934431707, + "learning_rate": 5.586850253794379e-06, + "loss": 0.5063, + "step": 6095 + }, + { + "epoch": 0.48, + "grad_norm": 1.5409494168011588, + "learning_rate": 5.585587228286321e-06, + "loss": 0.5048, + "step": 6096 + }, + { + "epoch": 0.48, + "grad_norm": 1.535950936429176, + "learning_rate": 5.584324164891536e-06, + "loss": 0.4939, + "step": 6097 + }, + { + "epoch": 0.48, + "grad_norm": 1.5129600486235135, + "learning_rate": 5.583061063691741e-06, + "loss": 0.4819, + "step": 6098 + }, + { + "epoch": 0.48, + "grad_norm": 1.8251593039689615, + "learning_rate": 5.581797924768658e-06, + "loss": 0.5092, + "step": 6099 + }, + { + "epoch": 0.48, + "grad_norm": 1.8343440800905695, + "learning_rate": 5.580534748204012e-06, + "loss": 0.5272, + "step": 6100 + }, + { + "epoch": 0.48, + "grad_norm": 1.7294355741873155, + "learning_rate": 5.579271534079526e-06, + "loss": 0.4496, + "step": 6101 + }, + { + "epoch": 0.48, + "grad_norm": 0.571433664574153, + "learning_rate": 5.57800828247693e-06, + "loss": 0.5219, + "step": 6102 + }, + { + "epoch": 0.48, + "grad_norm": 1.496009577053304, + "learning_rate": 5.576744993477953e-06, + "loss": 0.5009, + "step": 6103 + }, + { + "epoch": 0.48, + "grad_norm": 0.563879222886923, + "learning_rate": 5.575481667164331e-06, + "loss": 0.5311, + "step": 6104 + }, + { + "epoch": 0.48, + "grad_norm": 6.743211654560536, + "learning_rate": 5.574218303617797e-06, + "loss": 0.4215, + "step": 6105 + }, + { + "epoch": 0.48, + "grad_norm": 1.7257727709458222, + "learning_rate": 5.5729549029200895e-06, + "loss": 0.4484, + "step": 6106 + }, + { + "epoch": 0.48, + "grad_norm": 1.5658341091370458, + "learning_rate": 5.57169146515295e-06, + "loss": 0.46, + "step": 6107 + }, + { + "epoch": 0.48, + "grad_norm": 1.6083262941974528, + "learning_rate": 5.570427990398121e-06, + "loss": 0.4823, + "step": 6108 + }, + { + "epoch": 0.48, + "grad_norm": 1.9879218907773908, + "learning_rate": 5.569164478737346e-06, + "loss": 0.4875, + "step": 6109 + }, + { + "epoch": 0.48, + "grad_norm": 1.792675009023016, + "learning_rate": 5.567900930252375e-06, + "loss": 0.4501, + "step": 6110 + }, + { + "epoch": 0.48, + "grad_norm": 3.175247302290029, + "learning_rate": 5.566637345024956e-06, + "loss": 0.5009, + "step": 6111 + }, + { + "epoch": 0.48, + "grad_norm": 0.6160948736418187, + "learning_rate": 5.565373723136843e-06, + "loss": 0.4995, + "step": 6112 + }, + { + "epoch": 0.48, + "grad_norm": 1.50250538485351, + "learning_rate": 5.564110064669789e-06, + "loss": 0.4394, + "step": 6113 + }, + { + "epoch": 0.48, + "grad_norm": 1.9328263524409663, + "learning_rate": 5.562846369705552e-06, + "loss": 0.4723, + "step": 6114 + }, + { + "epoch": 0.48, + "grad_norm": 1.7346407448550933, + "learning_rate": 5.56158263832589e-06, + "loss": 0.4732, + "step": 6115 + }, + { + "epoch": 0.48, + "grad_norm": 1.6507434359242423, + "learning_rate": 5.5603188706125674e-06, + "loss": 0.4427, + "step": 6116 + }, + { + "epoch": 0.48, + "grad_norm": 2.0541310652783973, + "learning_rate": 5.559055066647345e-06, + "loss": 0.4928, + "step": 6117 + }, + { + "epoch": 0.48, + "grad_norm": 1.5838670408765896, + "learning_rate": 5.557791226511994e-06, + "loss": 0.4855, + "step": 6118 + }, + { + "epoch": 0.48, + "grad_norm": 1.7481184419553404, + "learning_rate": 5.556527350288278e-06, + "loss": 0.4409, + "step": 6119 + }, + { + "epoch": 0.48, + "grad_norm": 1.9045567514092938, + "learning_rate": 5.5552634380579715e-06, + "loss": 0.4393, + "step": 6120 + }, + { + "epoch": 0.48, + "grad_norm": 1.3596702709309434, + "learning_rate": 5.553999489902846e-06, + "loss": 0.4201, + "step": 6121 + }, + { + "epoch": 0.48, + "grad_norm": 1.4976452060296859, + "learning_rate": 5.552735505904679e-06, + "loss": 0.4245, + "step": 6122 + }, + { + "epoch": 0.48, + "grad_norm": 1.5989443701499597, + "learning_rate": 5.551471486145248e-06, + "loss": 0.3932, + "step": 6123 + }, + { + "epoch": 0.48, + "grad_norm": 2.225451591659345, + "learning_rate": 5.550207430706334e-06, + "loss": 0.4847, + "step": 6124 + }, + { + "epoch": 0.48, + "grad_norm": 1.479476377488131, + "learning_rate": 5.548943339669718e-06, + "loss": 0.461, + "step": 6125 + }, + { + "epoch": 0.48, + "grad_norm": 2.3550681695994307, + "learning_rate": 5.5476792131171855e-06, + "loss": 0.4742, + "step": 6126 + }, + { + "epoch": 0.48, + "grad_norm": 2.1845240627455693, + "learning_rate": 5.546415051130525e-06, + "loss": 0.4939, + "step": 6127 + }, + { + "epoch": 0.48, + "grad_norm": 1.5775564432611615, + "learning_rate": 5.545150853791525e-06, + "loss": 0.4638, + "step": 6128 + }, + { + "epoch": 0.48, + "grad_norm": 3.064816875075952, + "learning_rate": 5.5438866211819775e-06, + "loss": 0.4562, + "step": 6129 + }, + { + "epoch": 0.48, + "grad_norm": 1.9714440953357892, + "learning_rate": 5.542622353383679e-06, + "loss": 0.4995, + "step": 6130 + }, + { + "epoch": 0.48, + "grad_norm": 5.793594601267657, + "learning_rate": 5.541358050478423e-06, + "loss": 0.4376, + "step": 6131 + }, + { + "epoch": 0.48, + "grad_norm": 1.6265429327945393, + "learning_rate": 5.540093712548009e-06, + "loss": 0.4362, + "step": 6132 + }, + { + "epoch": 0.48, + "grad_norm": 1.795494843978201, + "learning_rate": 5.538829339674239e-06, + "loss": 0.4356, + "step": 6133 + }, + { + "epoch": 0.48, + "grad_norm": 2.0128315784153163, + "learning_rate": 5.537564931938915e-06, + "loss": 0.4721, + "step": 6134 + }, + { + "epoch": 0.48, + "grad_norm": 2.3391124824133613, + "learning_rate": 5.536300489423844e-06, + "loss": 0.4653, + "step": 6135 + }, + { + "epoch": 0.48, + "grad_norm": 2.3872751057068453, + "learning_rate": 5.535036012210832e-06, + "loss": 0.4421, + "step": 6136 + }, + { + "epoch": 0.48, + "grad_norm": 2.133309984524502, + "learning_rate": 5.533771500381691e-06, + "loss": 0.4734, + "step": 6137 + }, + { + "epoch": 0.48, + "grad_norm": 0.5810908249839427, + "learning_rate": 5.532506954018229e-06, + "loss": 0.5068, + "step": 6138 + }, + { + "epoch": 0.48, + "grad_norm": 1.3878516178800802, + "learning_rate": 5.531242373202268e-06, + "loss": 0.4205, + "step": 6139 + }, + { + "epoch": 0.48, + "grad_norm": 2.359285839597133, + "learning_rate": 5.529977758015616e-06, + "loss": 0.408, + "step": 6140 + }, + { + "epoch": 0.48, + "grad_norm": 3.005221373044829, + "learning_rate": 5.5287131085401e-06, + "loss": 0.4043, + "step": 6141 + }, + { + "epoch": 0.48, + "grad_norm": 6.241111755241205, + "learning_rate": 5.527448424857536e-06, + "loss": 0.4846, + "step": 6142 + }, + { + "epoch": 0.48, + "grad_norm": 25.881776510110253, + "learning_rate": 5.5261837070497505e-06, + "loss": 0.4755, + "step": 6143 + }, + { + "epoch": 0.48, + "grad_norm": 1.7995752265012297, + "learning_rate": 5.5249189551985645e-06, + "loss": 0.4632, + "step": 6144 + }, + { + "epoch": 0.48, + "grad_norm": 0.5784892367522048, + "learning_rate": 5.523654169385813e-06, + "loss": 0.4973, + "step": 6145 + }, + { + "epoch": 0.48, + "grad_norm": 0.5810176195797127, + "learning_rate": 5.522389349693318e-06, + "loss": 0.4935, + "step": 6146 + }, + { + "epoch": 0.48, + "grad_norm": 3.443109551053779, + "learning_rate": 5.521124496202917e-06, + "loss": 0.4514, + "step": 6147 + }, + { + "epoch": 0.48, + "grad_norm": 1.7210014668254479, + "learning_rate": 5.519859608996443e-06, + "loss": 0.4443, + "step": 6148 + }, + { + "epoch": 0.48, + "grad_norm": 2.620076114313818, + "learning_rate": 5.5185946881557314e-06, + "loss": 0.4657, + "step": 6149 + }, + { + "epoch": 0.48, + "grad_norm": 1.4573540045081261, + "learning_rate": 5.517329733762622e-06, + "loss": 0.4906, + "step": 6150 + }, + { + "epoch": 0.48, + "grad_norm": 3.3256329187963027, + "learning_rate": 5.516064745898956e-06, + "loss": 0.4589, + "step": 6151 + }, + { + "epoch": 0.48, + "grad_norm": 0.5863399679610443, + "learning_rate": 5.514799724646575e-06, + "loss": 0.4983, + "step": 6152 + }, + { + "epoch": 0.48, + "grad_norm": 2.7494621658080103, + "learning_rate": 5.513534670087327e-06, + "loss": 0.5322, + "step": 6153 + }, + { + "epoch": 0.48, + "grad_norm": 2.2467196423066977, + "learning_rate": 5.512269582303057e-06, + "loss": 0.5135, + "step": 6154 + }, + { + "epoch": 0.48, + "grad_norm": 1.7220730653502976, + "learning_rate": 5.511004461375615e-06, + "loss": 0.4497, + "step": 6155 + }, + { + "epoch": 0.48, + "grad_norm": 1.8254372869025794, + "learning_rate": 5.509739307386853e-06, + "loss": 0.5073, + "step": 6156 + }, + { + "epoch": 0.48, + "grad_norm": 1.6854797003354796, + "learning_rate": 5.508474120418626e-06, + "loss": 0.4431, + "step": 6157 + }, + { + "epoch": 0.48, + "grad_norm": 3.361320636776241, + "learning_rate": 5.5072089005527865e-06, + "loss": 0.4793, + "step": 6158 + }, + { + "epoch": 0.48, + "grad_norm": 1.751052335608454, + "learning_rate": 5.505943647871195e-06, + "loss": 0.4291, + "step": 6159 + }, + { + "epoch": 0.48, + "grad_norm": 4.684360599977664, + "learning_rate": 5.5046783624557106e-06, + "loss": 0.4584, + "step": 6160 + }, + { + "epoch": 0.48, + "grad_norm": 1.4034955872531034, + "learning_rate": 5.503413044388199e-06, + "loss": 0.4647, + "step": 6161 + }, + { + "epoch": 0.48, + "grad_norm": 2.4452556883215584, + "learning_rate": 5.502147693750518e-06, + "loss": 0.4727, + "step": 6162 + }, + { + "epoch": 0.48, + "grad_norm": 1.7287708993354856, + "learning_rate": 5.500882310624542e-06, + "loss": 0.4422, + "step": 6163 + }, + { + "epoch": 0.48, + "grad_norm": 1.6317900914395849, + "learning_rate": 5.499616895092133e-06, + "loss": 0.4519, + "step": 6164 + }, + { + "epoch": 0.48, + "grad_norm": 4.086964907026692, + "learning_rate": 5.498351447235166e-06, + "loss": 0.4761, + "step": 6165 + }, + { + "epoch": 0.48, + "grad_norm": 3.9960703282046515, + "learning_rate": 5.497085967135512e-06, + "loss": 0.41, + "step": 6166 + }, + { + "epoch": 0.48, + "grad_norm": 1.6876454916225092, + "learning_rate": 5.495820454875047e-06, + "loss": 0.4482, + "step": 6167 + }, + { + "epoch": 0.48, + "grad_norm": 2.4040479888130415, + "learning_rate": 5.494554910535646e-06, + "loss": 0.4601, + "step": 6168 + }, + { + "epoch": 0.48, + "grad_norm": 1.8704724298215512, + "learning_rate": 5.493289334199189e-06, + "loss": 0.4651, + "step": 6169 + }, + { + "epoch": 0.48, + "grad_norm": 1.5200863389315538, + "learning_rate": 5.4920237259475585e-06, + "loss": 0.4362, + "step": 6170 + }, + { + "epoch": 0.48, + "grad_norm": 2.4686944963393813, + "learning_rate": 5.490758085862636e-06, + "loss": 0.4859, + "step": 6171 + }, + { + "epoch": 0.48, + "grad_norm": 3.1915766630265945, + "learning_rate": 5.4894924140263075e-06, + "loss": 0.4972, + "step": 6172 + }, + { + "epoch": 0.48, + "grad_norm": 1.8589595820590077, + "learning_rate": 5.488226710520459e-06, + "loss": 0.468, + "step": 6173 + }, + { + "epoch": 0.48, + "grad_norm": 1.544235413154769, + "learning_rate": 5.486960975426981e-06, + "loss": 0.4493, + "step": 6174 + }, + { + "epoch": 0.48, + "grad_norm": 2.218455036295482, + "learning_rate": 5.485695208827766e-06, + "loss": 0.4671, + "step": 6175 + }, + { + "epoch": 0.49, + "grad_norm": 0.5846464270519955, + "learning_rate": 5.484429410804707e-06, + "loss": 0.4924, + "step": 6176 + }, + { + "epoch": 0.49, + "grad_norm": 0.5468635257090844, + "learning_rate": 5.4831635814396976e-06, + "loss": 0.4952, + "step": 6177 + }, + { + "epoch": 0.49, + "grad_norm": 1.6223208724422544, + "learning_rate": 5.4818977208146375e-06, + "loss": 0.4116, + "step": 6178 + }, + { + "epoch": 0.49, + "grad_norm": 2.4215539529799437, + "learning_rate": 5.480631829011425e-06, + "loss": 0.4358, + "step": 6179 + }, + { + "epoch": 0.49, + "grad_norm": 1.9133772126380435, + "learning_rate": 5.479365906111962e-06, + "loss": 0.4684, + "step": 6180 + }, + { + "epoch": 0.49, + "grad_norm": 0.5592268299390609, + "learning_rate": 5.4780999521981514e-06, + "loss": 0.4923, + "step": 6181 + }, + { + "epoch": 0.49, + "grad_norm": 0.5636515448443473, + "learning_rate": 5.4768339673519e-06, + "loss": 0.5101, + "step": 6182 + }, + { + "epoch": 0.49, + "grad_norm": 2.4302242402018415, + "learning_rate": 5.475567951655114e-06, + "loss": 0.4302, + "step": 6183 + }, + { + "epoch": 0.49, + "grad_norm": 1.8958875074886061, + "learning_rate": 5.474301905189705e-06, + "loss": 0.5386, + "step": 6184 + }, + { + "epoch": 0.49, + "grad_norm": 9.433617858864352, + "learning_rate": 5.47303582803758e-06, + "loss": 0.4532, + "step": 6185 + }, + { + "epoch": 0.49, + "grad_norm": 1.503411939861271, + "learning_rate": 5.4717697202806586e-06, + "loss": 0.4321, + "step": 6186 + }, + { + "epoch": 0.49, + "grad_norm": 1.7729473749048317, + "learning_rate": 5.470503582000852e-06, + "loss": 0.4557, + "step": 6187 + }, + { + "epoch": 0.49, + "grad_norm": 1.7918641458400406, + "learning_rate": 5.469237413280081e-06, + "loss": 0.4438, + "step": 6188 + }, + { + "epoch": 0.49, + "grad_norm": 2.210129788868613, + "learning_rate": 5.4679712142002614e-06, + "loss": 0.5327, + "step": 6189 + }, + { + "epoch": 0.49, + "grad_norm": 1.47759080792351, + "learning_rate": 5.466704984843317e-06, + "loss": 0.4231, + "step": 6190 + }, + { + "epoch": 0.49, + "grad_norm": 2.219900145400828, + "learning_rate": 5.465438725291171e-06, + "loss": 0.4879, + "step": 6191 + }, + { + "epoch": 0.49, + "grad_norm": 2.2502232064724494, + "learning_rate": 5.464172435625748e-06, + "loss": 0.4084, + "step": 6192 + }, + { + "epoch": 0.49, + "grad_norm": 2.00333794815659, + "learning_rate": 5.462906115928977e-06, + "loss": 0.5014, + "step": 6193 + }, + { + "epoch": 0.49, + "grad_norm": 1.701012434027411, + "learning_rate": 5.461639766282784e-06, + "loss": 0.4549, + "step": 6194 + }, + { + "epoch": 0.49, + "grad_norm": 1.461459675759506, + "learning_rate": 5.460373386769103e-06, + "loss": 0.3925, + "step": 6195 + }, + { + "epoch": 0.49, + "grad_norm": 1.644938329626762, + "learning_rate": 5.459106977469865e-06, + "loss": 0.4268, + "step": 6196 + }, + { + "epoch": 0.49, + "grad_norm": 1.6098509632255238, + "learning_rate": 5.457840538467006e-06, + "loss": 0.5228, + "step": 6197 + }, + { + "epoch": 0.49, + "grad_norm": 1.7867680854779369, + "learning_rate": 5.456574069842464e-06, + "loss": 0.4835, + "step": 6198 + }, + { + "epoch": 0.49, + "grad_norm": 1.89454600874104, + "learning_rate": 5.455307571678177e-06, + "loss": 0.4513, + "step": 6199 + }, + { + "epoch": 0.49, + "grad_norm": 1.7775040629700147, + "learning_rate": 5.454041044056086e-06, + "loss": 0.496, + "step": 6200 + }, + { + "epoch": 0.49, + "grad_norm": 1.407584426142428, + "learning_rate": 5.452774487058132e-06, + "loss": 0.4745, + "step": 6201 + }, + { + "epoch": 0.49, + "grad_norm": 1.688178599007344, + "learning_rate": 5.451507900766263e-06, + "loss": 0.433, + "step": 6202 + }, + { + "epoch": 0.49, + "grad_norm": 1.501646445047139, + "learning_rate": 5.450241285262422e-06, + "loss": 0.4889, + "step": 6203 + }, + { + "epoch": 0.49, + "grad_norm": 1.995484681314486, + "learning_rate": 5.44897464062856e-06, + "loss": 0.4942, + "step": 6204 + }, + { + "epoch": 0.49, + "grad_norm": 0.6328232711586017, + "learning_rate": 5.447707966946625e-06, + "loss": 0.4982, + "step": 6205 + }, + { + "epoch": 0.49, + "grad_norm": 1.4715274991820533, + "learning_rate": 5.44644126429857e-06, + "loss": 0.4554, + "step": 6206 + }, + { + "epoch": 0.49, + "grad_norm": 1.6494235873816463, + "learning_rate": 5.445174532766351e-06, + "loss": 0.516, + "step": 6207 + }, + { + "epoch": 0.49, + "grad_norm": 2.7747413492087127, + "learning_rate": 5.44390777243192e-06, + "loss": 0.5239, + "step": 6208 + }, + { + "epoch": 0.49, + "grad_norm": 1.7380109969520612, + "learning_rate": 5.4426409833772375e-06, + "loss": 0.4731, + "step": 6209 + }, + { + "epoch": 0.49, + "grad_norm": 1.7542706140547215, + "learning_rate": 5.441374165684262e-06, + "loss": 0.4313, + "step": 6210 + }, + { + "epoch": 0.49, + "grad_norm": 1.3785219438679248, + "learning_rate": 5.440107319434956e-06, + "loss": 0.4372, + "step": 6211 + }, + { + "epoch": 0.49, + "grad_norm": 1.592475919971585, + "learning_rate": 5.438840444711282e-06, + "loss": 0.4498, + "step": 6212 + }, + { + "epoch": 0.49, + "grad_norm": 0.6069665124641069, + "learning_rate": 5.4375735415952065e-06, + "loss": 0.5028, + "step": 6213 + }, + { + "epoch": 0.49, + "grad_norm": 0.5355741685479622, + "learning_rate": 5.436306610168693e-06, + "loss": 0.4859, + "step": 6214 + }, + { + "epoch": 0.49, + "grad_norm": 1.4853113531320452, + "learning_rate": 5.4350396505137146e-06, + "loss": 0.4278, + "step": 6215 + }, + { + "epoch": 0.49, + "grad_norm": 2.1841063441180957, + "learning_rate": 5.43377266271224e-06, + "loss": 0.4699, + "step": 6216 + }, + { + "epoch": 0.49, + "grad_norm": 2.795802384124965, + "learning_rate": 5.432505646846241e-06, + "loss": 0.4213, + "step": 6217 + }, + { + "epoch": 0.49, + "grad_norm": 1.500778920689842, + "learning_rate": 5.431238602997692e-06, + "loss": 0.4968, + "step": 6218 + }, + { + "epoch": 0.49, + "grad_norm": 1.7621174193420872, + "learning_rate": 5.429971531248569e-06, + "loss": 0.4549, + "step": 6219 + }, + { + "epoch": 0.49, + "grad_norm": 1.7075290169293769, + "learning_rate": 5.42870443168085e-06, + "loss": 0.4718, + "step": 6220 + }, + { + "epoch": 0.49, + "grad_norm": 1.6382980858710587, + "learning_rate": 5.427437304376517e-06, + "loss": 0.4944, + "step": 6221 + }, + { + "epoch": 0.49, + "grad_norm": 2.2399491981815913, + "learning_rate": 5.426170149417549e-06, + "loss": 0.466, + "step": 6222 + }, + { + "epoch": 0.49, + "grad_norm": 2.284609725737989, + "learning_rate": 5.42490296688593e-06, + "loss": 0.4904, + "step": 6223 + }, + { + "epoch": 0.49, + "grad_norm": 1.6676191641008151, + "learning_rate": 5.423635756863643e-06, + "loss": 0.5118, + "step": 6224 + }, + { + "epoch": 0.49, + "grad_norm": 2.9806706002057375, + "learning_rate": 5.4223685194326795e-06, + "loss": 0.4467, + "step": 6225 + }, + { + "epoch": 0.49, + "grad_norm": 1.8270812006639983, + "learning_rate": 5.4211012546750244e-06, + "loss": 0.4568, + "step": 6226 + }, + { + "epoch": 0.49, + "grad_norm": 2.6956974328598244, + "learning_rate": 5.419833962672668e-06, + "loss": 0.4927, + "step": 6227 + }, + { + "epoch": 0.49, + "grad_norm": 1.4145221578547051, + "learning_rate": 5.418566643507604e-06, + "loss": 0.4089, + "step": 6228 + }, + { + "epoch": 0.49, + "grad_norm": 1.7555747064717742, + "learning_rate": 5.4172992972618264e-06, + "loss": 0.4957, + "step": 6229 + }, + { + "epoch": 0.49, + "grad_norm": 1.6877813514756754, + "learning_rate": 5.416031924017327e-06, + "loss": 0.4332, + "step": 6230 + }, + { + "epoch": 0.49, + "grad_norm": 1.8889325595611621, + "learning_rate": 5.414764523856111e-06, + "loss": 0.4602, + "step": 6231 + }, + { + "epoch": 0.49, + "grad_norm": 1.846671995252927, + "learning_rate": 5.41349709686017e-06, + "loss": 0.44, + "step": 6232 + }, + { + "epoch": 0.49, + "grad_norm": 1.8721659506671033, + "learning_rate": 5.412229643111508e-06, + "loss": 0.5248, + "step": 6233 + }, + { + "epoch": 0.49, + "grad_norm": 9.040787732532449, + "learning_rate": 5.4109621626921285e-06, + "loss": 0.4562, + "step": 6234 + }, + { + "epoch": 0.49, + "grad_norm": 0.6468062117615846, + "learning_rate": 5.409694655684035e-06, + "loss": 0.4919, + "step": 6235 + }, + { + "epoch": 0.49, + "grad_norm": 1.6358340320674607, + "learning_rate": 5.408427122169231e-06, + "loss": 0.4942, + "step": 6236 + }, + { + "epoch": 0.49, + "grad_norm": 1.6479255781007034, + "learning_rate": 5.407159562229729e-06, + "loss": 0.516, + "step": 6237 + }, + { + "epoch": 0.49, + "grad_norm": 1.5313420568833969, + "learning_rate": 5.405891975947534e-06, + "loss": 0.5059, + "step": 6238 + }, + { + "epoch": 0.49, + "grad_norm": 1.7029536844106739, + "learning_rate": 5.404624363404659e-06, + "loss": 0.4715, + "step": 6239 + }, + { + "epoch": 0.49, + "grad_norm": 2.1127584456880144, + "learning_rate": 5.4033567246831185e-06, + "loss": 0.5134, + "step": 6240 + }, + { + "epoch": 0.49, + "grad_norm": 1.8955819473260356, + "learning_rate": 5.4020890598649245e-06, + "loss": 0.416, + "step": 6241 + }, + { + "epoch": 0.49, + "grad_norm": 1.9736188166879942, + "learning_rate": 5.400821369032094e-06, + "loss": 0.4535, + "step": 6242 + }, + { + "epoch": 0.49, + "grad_norm": 1.8228085773763956, + "learning_rate": 5.399553652266647e-06, + "loss": 0.476, + "step": 6243 + }, + { + "epoch": 0.49, + "grad_norm": 1.7750162324333085, + "learning_rate": 5.3982859096506e-06, + "loss": 0.4963, + "step": 6244 + }, + { + "epoch": 0.49, + "grad_norm": 1.3104342699945752, + "learning_rate": 5.397018141265975e-06, + "loss": 0.4923, + "step": 6245 + }, + { + "epoch": 0.49, + "grad_norm": 0.6243127274388197, + "learning_rate": 5.395750347194798e-06, + "loss": 0.5274, + "step": 6246 + }, + { + "epoch": 0.49, + "grad_norm": 1.7970697285433763, + "learning_rate": 5.394482527519089e-06, + "loss": 0.4669, + "step": 6247 + }, + { + "epoch": 0.49, + "grad_norm": 1.9866467236234258, + "learning_rate": 5.393214682320879e-06, + "loss": 0.5113, + "step": 6248 + }, + { + "epoch": 0.49, + "grad_norm": 2.0921703989404383, + "learning_rate": 5.391946811682191e-06, + "loss": 0.4695, + "step": 6249 + }, + { + "epoch": 0.49, + "grad_norm": 2.4994044141998235, + "learning_rate": 5.390678915685058e-06, + "loss": 0.4072, + "step": 6250 + }, + { + "epoch": 0.49, + "grad_norm": 3.045303549511949, + "learning_rate": 5.38941099441151e-06, + "loss": 0.4973, + "step": 6251 + }, + { + "epoch": 0.49, + "grad_norm": 0.5681921831982807, + "learning_rate": 5.38814304794358e-06, + "loss": 0.5007, + "step": 6252 + }, + { + "epoch": 0.49, + "grad_norm": 0.5916835522620377, + "learning_rate": 5.386875076363302e-06, + "loss": 0.5165, + "step": 6253 + }, + { + "epoch": 0.49, + "grad_norm": 1.4773206347372387, + "learning_rate": 5.385607079752714e-06, + "loss": 0.4473, + "step": 6254 + }, + { + "epoch": 0.49, + "grad_norm": 1.9697999876331795, + "learning_rate": 5.384339058193851e-06, + "loss": 0.5019, + "step": 6255 + }, + { + "epoch": 0.49, + "grad_norm": 0.5456207097539333, + "learning_rate": 5.383071011768755e-06, + "loss": 0.5007, + "step": 6256 + }, + { + "epoch": 0.49, + "grad_norm": 5.635064397127404, + "learning_rate": 5.381802940559465e-06, + "loss": 0.4462, + "step": 6257 + }, + { + "epoch": 0.49, + "grad_norm": 0.5850970705855765, + "learning_rate": 5.380534844648025e-06, + "loss": 0.4963, + "step": 6258 + }, + { + "epoch": 0.49, + "grad_norm": 0.5522828454025475, + "learning_rate": 5.3792667241164775e-06, + "loss": 0.5121, + "step": 6259 + }, + { + "epoch": 0.49, + "grad_norm": 1.6494216645929471, + "learning_rate": 5.377998579046871e-06, + "loss": 0.4607, + "step": 6260 + }, + { + "epoch": 0.49, + "grad_norm": 1.4914620541664583, + "learning_rate": 5.376730409521248e-06, + "loss": 0.376, + "step": 6261 + }, + { + "epoch": 0.49, + "grad_norm": 1.4667366376594306, + "learning_rate": 5.3754622156216615e-06, + "loss": 0.4404, + "step": 6262 + }, + { + "epoch": 0.49, + "grad_norm": 1.6930042461225863, + "learning_rate": 5.37419399743016e-06, + "loss": 0.5298, + "step": 6263 + }, + { + "epoch": 0.49, + "grad_norm": 3.64698979429338, + "learning_rate": 5.372925755028798e-06, + "loss": 0.482, + "step": 6264 + }, + { + "epoch": 0.49, + "grad_norm": 0.5874370874880012, + "learning_rate": 5.3716574884996255e-06, + "loss": 0.5025, + "step": 6265 + }, + { + "epoch": 0.49, + "grad_norm": 0.5929176049622847, + "learning_rate": 5.370389197924702e-06, + "loss": 0.5022, + "step": 6266 + }, + { + "epoch": 0.49, + "grad_norm": 0.5533408231108035, + "learning_rate": 5.36912088338608e-06, + "loss": 0.4987, + "step": 6267 + }, + { + "epoch": 0.49, + "grad_norm": 1.7710911789970258, + "learning_rate": 5.367852544965821e-06, + "loss": 0.5247, + "step": 6268 + }, + { + "epoch": 0.49, + "grad_norm": 1.613616520710951, + "learning_rate": 5.366584182745983e-06, + "loss": 0.5103, + "step": 6269 + }, + { + "epoch": 0.49, + "grad_norm": 2.1463739703508047, + "learning_rate": 5.365315796808629e-06, + "loss": 0.4596, + "step": 6270 + }, + { + "epoch": 0.49, + "grad_norm": 1.7465574581119085, + "learning_rate": 5.364047387235819e-06, + "loss": 0.4185, + "step": 6271 + }, + { + "epoch": 0.49, + "grad_norm": 1.8401806458580008, + "learning_rate": 5.362778954109621e-06, + "loss": 0.4429, + "step": 6272 + }, + { + "epoch": 0.49, + "grad_norm": 2.4811042826493708, + "learning_rate": 5.361510497512098e-06, + "loss": 0.4688, + "step": 6273 + }, + { + "epoch": 0.49, + "grad_norm": 1.8947855049732982, + "learning_rate": 5.360242017525319e-06, + "loss": 0.4724, + "step": 6274 + }, + { + "epoch": 0.49, + "grad_norm": 1.7403705638165767, + "learning_rate": 5.358973514231351e-06, + "loss": 0.4532, + "step": 6275 + }, + { + "epoch": 0.49, + "grad_norm": 2.947961443333871, + "learning_rate": 5.35770498771227e-06, + "loss": 0.4872, + "step": 6276 + }, + { + "epoch": 0.49, + "grad_norm": 1.5230329892488463, + "learning_rate": 5.356436438050141e-06, + "loss": 0.4742, + "step": 6277 + }, + { + "epoch": 0.49, + "grad_norm": 1.9063813932018792, + "learning_rate": 5.3551678653270434e-06, + "loss": 0.4883, + "step": 6278 + }, + { + "epoch": 0.49, + "grad_norm": 2.6972448311097112, + "learning_rate": 5.353899269625047e-06, + "loss": 0.5001, + "step": 6279 + }, + { + "epoch": 0.49, + "grad_norm": 1.686545163769459, + "learning_rate": 5.352630651026232e-06, + "loss": 0.4903, + "step": 6280 + }, + { + "epoch": 0.49, + "grad_norm": 3.1578877966916434, + "learning_rate": 5.3513620096126766e-06, + "loss": 0.4307, + "step": 6281 + }, + { + "epoch": 0.49, + "grad_norm": 1.5824247503932578, + "learning_rate": 5.350093345466456e-06, + "loss": 0.4784, + "step": 6282 + }, + { + "epoch": 0.49, + "grad_norm": 0.6204861532095239, + "learning_rate": 5.348824658669656e-06, + "loss": 0.4858, + "step": 6283 + }, + { + "epoch": 0.49, + "grad_norm": 2.3565999817389724, + "learning_rate": 5.347555949304356e-06, + "loss": 0.5097, + "step": 6284 + }, + { + "epoch": 0.49, + "grad_norm": 1.4984482317678507, + "learning_rate": 5.346287217452641e-06, + "loss": 0.4745, + "step": 6285 + }, + { + "epoch": 0.49, + "grad_norm": 1.784131871401627, + "learning_rate": 5.345018463196597e-06, + "loss": 0.4749, + "step": 6286 + }, + { + "epoch": 0.49, + "grad_norm": 2.06381833745769, + "learning_rate": 5.343749686618307e-06, + "loss": 0.4967, + "step": 6287 + }, + { + "epoch": 0.49, + "grad_norm": 1.6244322576247214, + "learning_rate": 5.342480887799863e-06, + "loss": 0.4476, + "step": 6288 + }, + { + "epoch": 0.49, + "grad_norm": 1.5913931340671637, + "learning_rate": 5.341212066823356e-06, + "loss": 0.5262, + "step": 6289 + }, + { + "epoch": 0.49, + "grad_norm": 1.8231656938250962, + "learning_rate": 5.339943223770871e-06, + "loss": 0.4963, + "step": 6290 + }, + { + "epoch": 0.49, + "grad_norm": 3.609619946583199, + "learning_rate": 5.338674358724507e-06, + "loss": 0.4419, + "step": 6291 + }, + { + "epoch": 0.49, + "grad_norm": 1.5044160385517398, + "learning_rate": 5.337405471766355e-06, + "loss": 0.4284, + "step": 6292 + }, + { + "epoch": 0.49, + "grad_norm": 18.95601643434386, + "learning_rate": 5.336136562978509e-06, + "loss": 0.4412, + "step": 6293 + }, + { + "epoch": 0.49, + "grad_norm": 2.7291491239111116, + "learning_rate": 5.334867632443067e-06, + "loss": 0.5082, + "step": 6294 + }, + { + "epoch": 0.49, + "grad_norm": 1.9305790703875354, + "learning_rate": 5.333598680242129e-06, + "loss": 0.5272, + "step": 6295 + }, + { + "epoch": 0.49, + "grad_norm": 3.3656206473115815, + "learning_rate": 5.3323297064577905e-06, + "loss": 0.4488, + "step": 6296 + }, + { + "epoch": 0.49, + "grad_norm": 1.782919079767497, + "learning_rate": 5.331060711172157e-06, + "loss": 0.5074, + "step": 6297 + }, + { + "epoch": 0.49, + "grad_norm": 1.730326236442161, + "learning_rate": 5.329791694467326e-06, + "loss": 0.4892, + "step": 6298 + }, + { + "epoch": 0.49, + "grad_norm": 2.186419338027681, + "learning_rate": 5.328522656425405e-06, + "loss": 0.4732, + "step": 6299 + }, + { + "epoch": 0.49, + "grad_norm": 0.6223029868635872, + "learning_rate": 5.327253597128497e-06, + "loss": 0.4817, + "step": 6300 + }, + { + "epoch": 0.49, + "grad_norm": 0.5917530158958798, + "learning_rate": 5.325984516658712e-06, + "loss": 0.4996, + "step": 6301 + }, + { + "epoch": 0.49, + "grad_norm": 1.3771180427556626, + "learning_rate": 5.324715415098154e-06, + "loss": 0.3998, + "step": 6302 + }, + { + "epoch": 0.5, + "grad_norm": 1.6652210903019669, + "learning_rate": 5.3234462925289335e-06, + "loss": 0.4966, + "step": 6303 + }, + { + "epoch": 0.5, + "grad_norm": 4.173669520105727, + "learning_rate": 5.32217714903316e-06, + "loss": 0.4644, + "step": 6304 + }, + { + "epoch": 0.5, + "grad_norm": 1.3703608074109015, + "learning_rate": 5.320907984692948e-06, + "loss": 0.4739, + "step": 6305 + }, + { + "epoch": 0.5, + "grad_norm": 1.8522374206600283, + "learning_rate": 5.319638799590408e-06, + "loss": 0.5635, + "step": 6306 + }, + { + "epoch": 0.5, + "grad_norm": 1.4912772197417128, + "learning_rate": 5.318369593807655e-06, + "loss": 0.4539, + "step": 6307 + }, + { + "epoch": 0.5, + "grad_norm": 1.8490052938874524, + "learning_rate": 5.317100367426808e-06, + "loss": 0.4831, + "step": 6308 + }, + { + "epoch": 0.5, + "grad_norm": 1.6436177452084497, + "learning_rate": 5.3158311205299794e-06, + "loss": 0.4687, + "step": 6309 + }, + { + "epoch": 0.5, + "grad_norm": 1.5753425729614179, + "learning_rate": 5.314561853199292e-06, + "loss": 0.5186, + "step": 6310 + }, + { + "epoch": 0.5, + "grad_norm": 2.0235181584990505, + "learning_rate": 5.313292565516864e-06, + "loss": 0.4558, + "step": 6311 + }, + { + "epoch": 0.5, + "grad_norm": 8.597075337151079, + "learning_rate": 5.312023257564817e-06, + "loss": 0.4185, + "step": 6312 + }, + { + "epoch": 0.5, + "grad_norm": 1.9328857002588347, + "learning_rate": 5.310753929425273e-06, + "loss": 0.4982, + "step": 6313 + }, + { + "epoch": 0.5, + "grad_norm": 2.4816405831123, + "learning_rate": 5.309484581180357e-06, + "loss": 0.4364, + "step": 6314 + }, + { + "epoch": 0.5, + "grad_norm": 1.7183106406122068, + "learning_rate": 5.3082152129121935e-06, + "loss": 0.4774, + "step": 6315 + }, + { + "epoch": 0.5, + "grad_norm": 2.3234240573791825, + "learning_rate": 5.306945824702908e-06, + "loss": 0.4222, + "step": 6316 + }, + { + "epoch": 0.5, + "grad_norm": 2.4834144575483674, + "learning_rate": 5.305676416634628e-06, + "loss": 0.4996, + "step": 6317 + }, + { + "epoch": 0.5, + "grad_norm": 1.6884736366387614, + "learning_rate": 5.304406988789486e-06, + "loss": 0.4818, + "step": 6318 + }, + { + "epoch": 0.5, + "grad_norm": 1.775642170972731, + "learning_rate": 5.30313754124961e-06, + "loss": 0.4388, + "step": 6319 + }, + { + "epoch": 0.5, + "grad_norm": 1.6149749825117283, + "learning_rate": 5.301868074097132e-06, + "loss": 0.4242, + "step": 6320 + }, + { + "epoch": 0.5, + "grad_norm": 3.4294468223332797, + "learning_rate": 5.300598587414183e-06, + "loss": 0.5201, + "step": 6321 + }, + { + "epoch": 0.5, + "grad_norm": 2.041359915283597, + "learning_rate": 5.299329081282898e-06, + "loss": 0.5076, + "step": 6322 + }, + { + "epoch": 0.5, + "grad_norm": 3.247802370545581, + "learning_rate": 5.298059555785414e-06, + "loss": 0.4571, + "step": 6323 + }, + { + "epoch": 0.5, + "grad_norm": 1.9882339879234205, + "learning_rate": 5.296790011003867e-06, + "loss": 0.4774, + "step": 6324 + }, + { + "epoch": 0.5, + "grad_norm": 1.8519505353548662, + "learning_rate": 5.2955204470203945e-06, + "loss": 0.4039, + "step": 6325 + }, + { + "epoch": 0.5, + "grad_norm": 1.9131839228307106, + "learning_rate": 5.294250863917136e-06, + "loss": 0.5026, + "step": 6326 + }, + { + "epoch": 0.5, + "grad_norm": 5.1052343546269014, + "learning_rate": 5.292981261776231e-06, + "loss": 0.4592, + "step": 6327 + }, + { + "epoch": 0.5, + "grad_norm": 1.6043179629388822, + "learning_rate": 5.291711640679822e-06, + "loss": 0.5156, + "step": 6328 + }, + { + "epoch": 0.5, + "grad_norm": 1.7533078625257088, + "learning_rate": 5.290442000710051e-06, + "loss": 0.4837, + "step": 6329 + }, + { + "epoch": 0.5, + "grad_norm": 1.8504278274861121, + "learning_rate": 5.289172341949063e-06, + "loss": 0.4767, + "step": 6330 + }, + { + "epoch": 0.5, + "grad_norm": 1.510640757751848, + "learning_rate": 5.287902664479002e-06, + "loss": 0.4312, + "step": 6331 + }, + { + "epoch": 0.5, + "grad_norm": 1.5991450427357143, + "learning_rate": 5.286632968382015e-06, + "loss": 0.4717, + "step": 6332 + }, + { + "epoch": 0.5, + "grad_norm": 1.9920646198379657, + "learning_rate": 5.28536325374025e-06, + "loss": 0.4627, + "step": 6333 + }, + { + "epoch": 0.5, + "grad_norm": 1.6081928394626317, + "learning_rate": 5.284093520635857e-06, + "loss": 0.484, + "step": 6334 + }, + { + "epoch": 0.5, + "grad_norm": 1.6056460923747577, + "learning_rate": 5.282823769150984e-06, + "loss": 0.4704, + "step": 6335 + }, + { + "epoch": 0.5, + "grad_norm": 0.8737402831553114, + "learning_rate": 5.2815539993677835e-06, + "loss": 0.5162, + "step": 6336 + }, + { + "epoch": 0.5, + "grad_norm": 1.7177839424240293, + "learning_rate": 5.280284211368407e-06, + "loss": 0.4723, + "step": 6337 + }, + { + "epoch": 0.5, + "grad_norm": 1.6226966745235356, + "learning_rate": 5.27901440523501e-06, + "loss": 0.4923, + "step": 6338 + }, + { + "epoch": 0.5, + "grad_norm": 1.6667447088802565, + "learning_rate": 5.277744581049744e-06, + "loss": 0.462, + "step": 6339 + }, + { + "epoch": 0.5, + "grad_norm": 1.6313250942295954, + "learning_rate": 5.276474738894769e-06, + "loss": 0.4908, + "step": 6340 + }, + { + "epoch": 0.5, + "grad_norm": 1.3304742436623336, + "learning_rate": 5.275204878852239e-06, + "loss": 0.4416, + "step": 6341 + }, + { + "epoch": 0.5, + "grad_norm": 2.6251352719445262, + "learning_rate": 5.273935001004313e-06, + "loss": 0.4498, + "step": 6342 + }, + { + "epoch": 0.5, + "grad_norm": 1.71600589789104, + "learning_rate": 5.27266510543315e-06, + "loss": 0.4649, + "step": 6343 + }, + { + "epoch": 0.5, + "grad_norm": 1.7779323429192808, + "learning_rate": 5.271395192220914e-06, + "loss": 0.4627, + "step": 6344 + }, + { + "epoch": 0.5, + "grad_norm": 1.3096970466278597, + "learning_rate": 5.270125261449761e-06, + "loss": 0.4637, + "step": 6345 + }, + { + "epoch": 0.5, + "grad_norm": 1.5935973937058499, + "learning_rate": 5.2688553132018595e-06, + "loss": 0.4259, + "step": 6346 + }, + { + "epoch": 0.5, + "grad_norm": 2.0346619803263954, + "learning_rate": 5.26758534755937e-06, + "loss": 0.4734, + "step": 6347 + }, + { + "epoch": 0.5, + "grad_norm": 1.8650269279941372, + "learning_rate": 5.26631536460446e-06, + "loss": 0.4646, + "step": 6348 + }, + { + "epoch": 0.5, + "grad_norm": 0.7484319902745529, + "learning_rate": 5.265045364419291e-06, + "loss": 0.5258, + "step": 6349 + }, + { + "epoch": 0.5, + "grad_norm": 1.7993804969421627, + "learning_rate": 5.2637753470860365e-06, + "loss": 0.4392, + "step": 6350 + }, + { + "epoch": 0.5, + "grad_norm": 1.706865561397999, + "learning_rate": 5.2625053126868605e-06, + "loss": 0.4516, + "step": 6351 + }, + { + "epoch": 0.5, + "grad_norm": 1.9402787315568721, + "learning_rate": 5.261235261303935e-06, + "loss": 0.4987, + "step": 6352 + }, + { + "epoch": 0.5, + "grad_norm": 1.6446848818671795, + "learning_rate": 5.259965193019431e-06, + "loss": 0.4395, + "step": 6353 + }, + { + "epoch": 0.5, + "grad_norm": 1.454533862690701, + "learning_rate": 5.258695107915517e-06, + "loss": 0.448, + "step": 6354 + }, + { + "epoch": 0.5, + "grad_norm": 2.1472889889396236, + "learning_rate": 5.257425006074368e-06, + "loss": 0.4815, + "step": 6355 + }, + { + "epoch": 0.5, + "grad_norm": 1.5143769595568661, + "learning_rate": 5.2561548875781585e-06, + "loss": 0.4208, + "step": 6356 + }, + { + "epoch": 0.5, + "grad_norm": 0.5953522912583749, + "learning_rate": 5.254884752509063e-06, + "loss": 0.5049, + "step": 6357 + }, + { + "epoch": 0.5, + "grad_norm": 1.9924628357185903, + "learning_rate": 5.253614600949256e-06, + "loss": 0.4991, + "step": 6358 + }, + { + "epoch": 0.5, + "grad_norm": 7.732888148664725, + "learning_rate": 5.252344432980918e-06, + "loss": 0.4298, + "step": 6359 + }, + { + "epoch": 0.5, + "grad_norm": 2.1156378815388024, + "learning_rate": 5.251074248686226e-06, + "loss": 0.4139, + "step": 6360 + }, + { + "epoch": 0.5, + "grad_norm": 2.035207180116052, + "learning_rate": 5.249804048147357e-06, + "loss": 0.4529, + "step": 6361 + }, + { + "epoch": 0.5, + "grad_norm": 2.1611716300246155, + "learning_rate": 5.2485338314464925e-06, + "loss": 0.4468, + "step": 6362 + }, + { + "epoch": 0.5, + "grad_norm": 1.3453832028855723, + "learning_rate": 5.2472635986658145e-06, + "loss": 0.4204, + "step": 6363 + }, + { + "epoch": 0.5, + "grad_norm": 1.4675127214739616, + "learning_rate": 5.245993349887506e-06, + "loss": 0.4849, + "step": 6364 + }, + { + "epoch": 0.5, + "grad_norm": 0.5614434986602911, + "learning_rate": 5.244723085193749e-06, + "loss": 0.5009, + "step": 6365 + }, + { + "epoch": 0.5, + "grad_norm": 1.5588169799401679, + "learning_rate": 5.243452804666728e-06, + "loss": 0.4658, + "step": 6366 + }, + { + "epoch": 0.5, + "grad_norm": 2.302343974650844, + "learning_rate": 5.24218250838863e-06, + "loss": 0.4615, + "step": 6367 + }, + { + "epoch": 0.5, + "grad_norm": 1.4949704828194643, + "learning_rate": 5.2409121964416394e-06, + "loss": 0.4511, + "step": 6368 + }, + { + "epoch": 0.5, + "grad_norm": 1.489486550335994, + "learning_rate": 5.239641868907946e-06, + "loss": 0.4105, + "step": 6369 + }, + { + "epoch": 0.5, + "grad_norm": 1.762040835435942, + "learning_rate": 5.2383715258697364e-06, + "loss": 0.463, + "step": 6370 + }, + { + "epoch": 0.5, + "grad_norm": 1.6765433871233333, + "learning_rate": 5.237101167409202e-06, + "loss": 0.442, + "step": 6371 + }, + { + "epoch": 0.5, + "grad_norm": 1.7565350949707235, + "learning_rate": 5.2358307936085315e-06, + "loss": 0.4858, + "step": 6372 + }, + { + "epoch": 0.5, + "grad_norm": 2.0581589145637462, + "learning_rate": 5.234560404549917e-06, + "loss": 0.5044, + "step": 6373 + }, + { + "epoch": 0.5, + "grad_norm": 1.883796101782298, + "learning_rate": 5.233290000315552e-06, + "loss": 0.4994, + "step": 6374 + }, + { + "epoch": 0.5, + "grad_norm": 1.9070147395372747, + "learning_rate": 5.232019580987628e-06, + "loss": 0.5017, + "step": 6375 + }, + { + "epoch": 0.5, + "grad_norm": 1.6127254047567787, + "learning_rate": 5.230749146648341e-06, + "loss": 0.4878, + "step": 6376 + }, + { + "epoch": 0.5, + "grad_norm": 0.5748707675156586, + "learning_rate": 5.2294786973798864e-06, + "loss": 0.5051, + "step": 6377 + }, + { + "epoch": 0.5, + "grad_norm": 1.904655159414454, + "learning_rate": 5.228208233264459e-06, + "loss": 0.4592, + "step": 6378 + }, + { + "epoch": 0.5, + "grad_norm": 2.1707593129342198, + "learning_rate": 5.226937754384259e-06, + "loss": 0.499, + "step": 6379 + }, + { + "epoch": 0.5, + "grad_norm": 0.5635030740996013, + "learning_rate": 5.225667260821482e-06, + "loss": 0.4987, + "step": 6380 + }, + { + "epoch": 0.5, + "grad_norm": 2.2442132335278333, + "learning_rate": 5.22439675265833e-06, + "loss": 0.4792, + "step": 6381 + }, + { + "epoch": 0.5, + "grad_norm": 1.5621225451139222, + "learning_rate": 5.223126229977e-06, + "loss": 0.5142, + "step": 6382 + }, + { + "epoch": 0.5, + "grad_norm": 1.5431950151514104, + "learning_rate": 5.221855692859697e-06, + "loss": 0.4895, + "step": 6383 + }, + { + "epoch": 0.5, + "grad_norm": 1.5365914402395804, + "learning_rate": 5.2205851413886185e-06, + "loss": 0.4081, + "step": 6384 + }, + { + "epoch": 0.5, + "grad_norm": 0.569853713583796, + "learning_rate": 5.219314575645972e-06, + "loss": 0.512, + "step": 6385 + }, + { + "epoch": 0.5, + "grad_norm": 1.6362935225760142, + "learning_rate": 5.218043995713958e-06, + "loss": 0.5122, + "step": 6386 + }, + { + "epoch": 0.5, + "grad_norm": 2.108327464549863, + "learning_rate": 5.216773401674781e-06, + "loss": 0.4605, + "step": 6387 + }, + { + "epoch": 0.5, + "grad_norm": 1.451207394963656, + "learning_rate": 5.21550279361065e-06, + "loss": 0.4897, + "step": 6388 + }, + { + "epoch": 0.5, + "grad_norm": 3.5920953375807123, + "learning_rate": 5.214232171603772e-06, + "loss": 0.4331, + "step": 6389 + }, + { + "epoch": 0.5, + "grad_norm": 1.6131797343925638, + "learning_rate": 5.212961535736351e-06, + "loss": 0.4819, + "step": 6390 + }, + { + "epoch": 0.5, + "grad_norm": 1.5832889198961604, + "learning_rate": 5.2116908860905976e-06, + "loss": 0.4897, + "step": 6391 + }, + { + "epoch": 0.5, + "grad_norm": 1.7736041899391994, + "learning_rate": 5.2104202227487195e-06, + "loss": 0.4706, + "step": 6392 + }, + { + "epoch": 0.5, + "grad_norm": 2.698483160247006, + "learning_rate": 5.20914954579293e-06, + "loss": 0.467, + "step": 6393 + }, + { + "epoch": 0.5, + "grad_norm": 1.8420120044989876, + "learning_rate": 5.207878855305438e-06, + "loss": 0.471, + "step": 6394 + }, + { + "epoch": 0.5, + "grad_norm": 1.5514768605152538, + "learning_rate": 5.206608151368457e-06, + "loss": 0.4153, + "step": 6395 + }, + { + "epoch": 0.5, + "grad_norm": 1.4756262883700841, + "learning_rate": 5.205337434064198e-06, + "loss": 0.5061, + "step": 6396 + }, + { + "epoch": 0.5, + "grad_norm": 1.6130012676722223, + "learning_rate": 5.2040667034748745e-06, + "loss": 0.5435, + "step": 6397 + }, + { + "epoch": 0.5, + "grad_norm": 1.4755491510645904, + "learning_rate": 5.202795959682704e-06, + "loss": 0.4611, + "step": 6398 + }, + { + "epoch": 0.5, + "grad_norm": 1.5353269227832131, + "learning_rate": 5.201525202769899e-06, + "loss": 0.4647, + "step": 6399 + }, + { + "epoch": 0.5, + "grad_norm": 1.7137190570570136, + "learning_rate": 5.200254432818677e-06, + "loss": 0.5004, + "step": 6400 + }, + { + "epoch": 0.5, + "grad_norm": 1.5689974389314436, + "learning_rate": 5.198983649911256e-06, + "loss": 0.4773, + "step": 6401 + }, + { + "epoch": 0.5, + "grad_norm": 1.3641785926603178, + "learning_rate": 5.197712854129852e-06, + "loss": 0.4132, + "step": 6402 + }, + { + "epoch": 0.5, + "grad_norm": 2.346223958507651, + "learning_rate": 5.196442045556685e-06, + "loss": 0.4272, + "step": 6403 + }, + { + "epoch": 0.5, + "grad_norm": 1.9931405557139343, + "learning_rate": 5.1951712242739775e-06, + "loss": 0.4454, + "step": 6404 + }, + { + "epoch": 0.5, + "grad_norm": 0.5612073321204236, + "learning_rate": 5.193900390363944e-06, + "loss": 0.4926, + "step": 6405 + }, + { + "epoch": 0.5, + "grad_norm": 1.7408529228865668, + "learning_rate": 5.192629543908811e-06, + "loss": 0.4892, + "step": 6406 + }, + { + "epoch": 0.5, + "grad_norm": 0.539621728364625, + "learning_rate": 5.191358684990796e-06, + "loss": 0.4655, + "step": 6407 + }, + { + "epoch": 0.5, + "grad_norm": 1.4159810140498088, + "learning_rate": 5.190087813692127e-06, + "loss": 0.5028, + "step": 6408 + }, + { + "epoch": 0.5, + "grad_norm": 1.7191056669152742, + "learning_rate": 5.188816930095023e-06, + "loss": 0.492, + "step": 6409 + }, + { + "epoch": 0.5, + "grad_norm": 1.7449955515947362, + "learning_rate": 5.187546034281712e-06, + "loss": 0.4602, + "step": 6410 + }, + { + "epoch": 0.5, + "grad_norm": 2.149520888460154, + "learning_rate": 5.186275126334417e-06, + "loss": 0.4339, + "step": 6411 + }, + { + "epoch": 0.5, + "grad_norm": 1.6984038003079411, + "learning_rate": 5.1850042063353655e-06, + "loss": 0.5049, + "step": 6412 + }, + { + "epoch": 0.5, + "grad_norm": 1.4921197344135262, + "learning_rate": 5.183733274366783e-06, + "loss": 0.447, + "step": 6413 + }, + { + "epoch": 0.5, + "grad_norm": 1.4397144311628005, + "learning_rate": 5.1824623305109e-06, + "loss": 0.4438, + "step": 6414 + }, + { + "epoch": 0.5, + "grad_norm": 1.548265490614242, + "learning_rate": 5.181191374849941e-06, + "loss": 0.4441, + "step": 6415 + }, + { + "epoch": 0.5, + "grad_norm": 1.5431966626586984, + "learning_rate": 5.179920407466138e-06, + "loss": 0.5002, + "step": 6416 + }, + { + "epoch": 0.5, + "grad_norm": 0.6281042938534527, + "learning_rate": 5.17864942844172e-06, + "loss": 0.5171, + "step": 6417 + }, + { + "epoch": 0.5, + "grad_norm": 1.5831227752072574, + "learning_rate": 5.177378437858918e-06, + "loss": 0.4534, + "step": 6418 + }, + { + "epoch": 0.5, + "grad_norm": 1.974205123163909, + "learning_rate": 5.176107435799962e-06, + "loss": 0.4441, + "step": 6419 + }, + { + "epoch": 0.5, + "grad_norm": 1.3700131601400525, + "learning_rate": 5.174836422347087e-06, + "loss": 0.4296, + "step": 6420 + }, + { + "epoch": 0.5, + "grad_norm": 1.9962898303726901, + "learning_rate": 5.173565397582522e-06, + "loss": 0.5228, + "step": 6421 + }, + { + "epoch": 0.5, + "grad_norm": 1.7938213727438832, + "learning_rate": 5.172294361588504e-06, + "loss": 0.4559, + "step": 6422 + }, + { + "epoch": 0.5, + "grad_norm": 1.8380930903094914, + "learning_rate": 5.171023314447265e-06, + "loss": 0.4561, + "step": 6423 + }, + { + "epoch": 0.5, + "grad_norm": 1.5332822122625431, + "learning_rate": 5.169752256241043e-06, + "loss": 0.4389, + "step": 6424 + }, + { + "epoch": 0.5, + "grad_norm": 2.4603053774310712, + "learning_rate": 5.1684811870520715e-06, + "loss": 0.4866, + "step": 6425 + }, + { + "epoch": 0.5, + "grad_norm": 1.68945279421647, + "learning_rate": 5.167210106962588e-06, + "loss": 0.4319, + "step": 6426 + }, + { + "epoch": 0.5, + "grad_norm": 1.976411496691542, + "learning_rate": 5.1659390160548285e-06, + "loss": 0.4824, + "step": 6427 + }, + { + "epoch": 0.5, + "grad_norm": 0.5872765562640447, + "learning_rate": 5.164667914411031e-06, + "loss": 0.501, + "step": 6428 + }, + { + "epoch": 0.5, + "grad_norm": 0.5469721580300385, + "learning_rate": 5.163396802113438e-06, + "loss": 0.5099, + "step": 6429 + }, + { + "epoch": 0.5, + "grad_norm": 1.4528028118496923, + "learning_rate": 5.162125679244282e-06, + "loss": 0.4447, + "step": 6430 + }, + { + "epoch": 0.51, + "grad_norm": 1.7951980591678836, + "learning_rate": 5.16085454588581e-06, + "loss": 0.4239, + "step": 6431 + }, + { + "epoch": 0.51, + "grad_norm": 2.33610109693331, + "learning_rate": 5.159583402120256e-06, + "loss": 0.5184, + "step": 6432 + }, + { + "epoch": 0.51, + "grad_norm": 1.7105316915530604, + "learning_rate": 5.158312248029868e-06, + "loss": 0.4635, + "step": 6433 + }, + { + "epoch": 0.51, + "grad_norm": 1.8769433935601085, + "learning_rate": 5.157041083696882e-06, + "loss": 0.5001, + "step": 6434 + }, + { + "epoch": 0.51, + "grad_norm": 1.618259329617298, + "learning_rate": 5.155769909203543e-06, + "loss": 0.4937, + "step": 6435 + }, + { + "epoch": 0.51, + "grad_norm": 1.6962936272300568, + "learning_rate": 5.154498724632095e-06, + "loss": 0.4814, + "step": 6436 + }, + { + "epoch": 0.51, + "grad_norm": 1.4276255167353245, + "learning_rate": 5.153227530064784e-06, + "loss": 0.4235, + "step": 6437 + }, + { + "epoch": 0.51, + "grad_norm": 1.5446258708579352, + "learning_rate": 5.15195632558385e-06, + "loss": 0.5327, + "step": 6438 + }, + { + "epoch": 0.51, + "grad_norm": 2.023823947320032, + "learning_rate": 5.150685111271541e-06, + "loss": 0.5068, + "step": 6439 + }, + { + "epoch": 0.51, + "grad_norm": 1.913079326857015, + "learning_rate": 5.1494138872101026e-06, + "loss": 0.5524, + "step": 6440 + }, + { + "epoch": 0.51, + "grad_norm": 1.57045341359687, + "learning_rate": 5.1481426534817825e-06, + "loss": 0.4509, + "step": 6441 + }, + { + "epoch": 0.51, + "grad_norm": 1.582410928448665, + "learning_rate": 5.146871410168825e-06, + "loss": 0.5124, + "step": 6442 + }, + { + "epoch": 0.51, + "grad_norm": 1.8114976265871985, + "learning_rate": 5.1456001573534795e-06, + "loss": 0.4625, + "step": 6443 + }, + { + "epoch": 0.51, + "grad_norm": 2.1948723183567, + "learning_rate": 5.1443288951179946e-06, + "loss": 0.4562, + "step": 6444 + }, + { + "epoch": 0.51, + "grad_norm": 2.2525103819699286, + "learning_rate": 5.1430576235446185e-06, + "loss": 0.4588, + "step": 6445 + }, + { + "epoch": 0.51, + "grad_norm": 2.413732211424999, + "learning_rate": 5.141786342715601e-06, + "loss": 0.4456, + "step": 6446 + }, + { + "epoch": 0.51, + "grad_norm": 1.41581810389455, + "learning_rate": 5.140515052713193e-06, + "loss": 0.4358, + "step": 6447 + }, + { + "epoch": 0.51, + "grad_norm": 1.915289319786108, + "learning_rate": 5.139243753619645e-06, + "loss": 0.4766, + "step": 6448 + }, + { + "epoch": 0.51, + "grad_norm": 1.2422567335429795, + "learning_rate": 5.13797244551721e-06, + "loss": 0.4613, + "step": 6449 + }, + { + "epoch": 0.51, + "grad_norm": 1.5608426420138577, + "learning_rate": 5.136701128488135e-06, + "loss": 0.4621, + "step": 6450 + }, + { + "epoch": 0.51, + "grad_norm": 1.6998418080600932, + "learning_rate": 5.1354298026146786e-06, + "loss": 0.5164, + "step": 6451 + }, + { + "epoch": 0.51, + "grad_norm": 8.612123966141786, + "learning_rate": 5.134158467979089e-06, + "loss": 0.4875, + "step": 6452 + }, + { + "epoch": 0.51, + "grad_norm": 1.6548387587265756, + "learning_rate": 5.132887124663625e-06, + "loss": 0.4522, + "step": 6453 + }, + { + "epoch": 0.51, + "grad_norm": 1.33475632525976, + "learning_rate": 5.131615772750534e-06, + "loss": 0.4929, + "step": 6454 + }, + { + "epoch": 0.51, + "grad_norm": 1.900944760148288, + "learning_rate": 5.130344412322078e-06, + "loss": 0.483, + "step": 6455 + }, + { + "epoch": 0.51, + "grad_norm": 1.5973543610583176, + "learning_rate": 5.129073043460506e-06, + "loss": 0.4461, + "step": 6456 + }, + { + "epoch": 0.51, + "grad_norm": 2.8637051433307064, + "learning_rate": 5.127801666248079e-06, + "loss": 0.4685, + "step": 6457 + }, + { + "epoch": 0.51, + "grad_norm": 1.9001190043422522, + "learning_rate": 5.126530280767051e-06, + "loss": 0.4954, + "step": 6458 + }, + { + "epoch": 0.51, + "grad_norm": 2.0386995129008207, + "learning_rate": 5.12525888709968e-06, + "loss": 0.4712, + "step": 6459 + }, + { + "epoch": 0.51, + "grad_norm": 1.7012475507478124, + "learning_rate": 5.123987485328221e-06, + "loss": 0.4997, + "step": 6460 + }, + { + "epoch": 0.51, + "grad_norm": 2.273908563563598, + "learning_rate": 5.122716075534936e-06, + "loss": 0.4852, + "step": 6461 + }, + { + "epoch": 0.51, + "grad_norm": 2.7741579037143547, + "learning_rate": 5.12144465780208e-06, + "loss": 0.4961, + "step": 6462 + }, + { + "epoch": 0.51, + "grad_norm": 0.6428764479979168, + "learning_rate": 5.120173232211913e-06, + "loss": 0.4864, + "step": 6463 + }, + { + "epoch": 0.51, + "grad_norm": 1.6119801629601755, + "learning_rate": 5.118901798846697e-06, + "loss": 0.4664, + "step": 6464 + }, + { + "epoch": 0.51, + "grad_norm": 1.2764344647502386, + "learning_rate": 5.117630357788689e-06, + "loss": 0.3777, + "step": 6465 + }, + { + "epoch": 0.51, + "grad_norm": 1.6930540901717557, + "learning_rate": 5.116358909120151e-06, + "loss": 0.4381, + "step": 6466 + }, + { + "epoch": 0.51, + "grad_norm": 1.6095589962193397, + "learning_rate": 5.115087452923344e-06, + "loss": 0.4421, + "step": 6467 + }, + { + "epoch": 0.51, + "grad_norm": 1.460767995766159, + "learning_rate": 5.113815989280528e-06, + "loss": 0.4536, + "step": 6468 + }, + { + "epoch": 0.51, + "grad_norm": 0.5636398879812817, + "learning_rate": 5.112544518273968e-06, + "loss": 0.4839, + "step": 6469 + }, + { + "epoch": 0.51, + "grad_norm": 1.4104955803019765, + "learning_rate": 5.111273039985924e-06, + "loss": 0.4683, + "step": 6470 + }, + { + "epoch": 0.51, + "grad_norm": 2.702160225164393, + "learning_rate": 5.110001554498661e-06, + "loss": 0.4799, + "step": 6471 + }, + { + "epoch": 0.51, + "grad_norm": 1.8112330186615966, + "learning_rate": 5.108730061894441e-06, + "loss": 0.4455, + "step": 6472 + }, + { + "epoch": 0.51, + "grad_norm": 1.5962363683207306, + "learning_rate": 5.107458562255527e-06, + "loss": 0.4438, + "step": 6473 + }, + { + "epoch": 0.51, + "grad_norm": 1.463144416939368, + "learning_rate": 5.106187055664187e-06, + "loss": 0.5069, + "step": 6474 + }, + { + "epoch": 0.51, + "grad_norm": 2.677187590392192, + "learning_rate": 5.1049155422026805e-06, + "loss": 0.5011, + "step": 6475 + }, + { + "epoch": 0.51, + "grad_norm": 2.494190595061933, + "learning_rate": 5.103644021953278e-06, + "loss": 0.4931, + "step": 6476 + }, + { + "epoch": 0.51, + "grad_norm": 1.5657685831593735, + "learning_rate": 5.102372494998241e-06, + "loss": 0.4828, + "step": 6477 + }, + { + "epoch": 0.51, + "grad_norm": 1.3397049285586455, + "learning_rate": 5.101100961419839e-06, + "loss": 0.4284, + "step": 6478 + }, + { + "epoch": 0.51, + "grad_norm": 0.562365286627066, + "learning_rate": 5.099829421300336e-06, + "loss": 0.4782, + "step": 6479 + }, + { + "epoch": 0.51, + "grad_norm": 2.1462171757522883, + "learning_rate": 5.098557874722e-06, + "loss": 0.4822, + "step": 6480 + }, + { + "epoch": 0.51, + "grad_norm": 0.5686820126035461, + "learning_rate": 5.097286321767099e-06, + "loss": 0.5108, + "step": 6481 + }, + { + "epoch": 0.51, + "grad_norm": 1.5991714741622418, + "learning_rate": 5.0960147625179005e-06, + "loss": 0.4967, + "step": 6482 + }, + { + "epoch": 0.51, + "grad_norm": 0.5199891720756461, + "learning_rate": 5.094743197056672e-06, + "loss": 0.4931, + "step": 6483 + }, + { + "epoch": 0.51, + "grad_norm": 0.5953076041477736, + "learning_rate": 5.093471625465682e-06, + "loss": 0.4983, + "step": 6484 + }, + { + "epoch": 0.51, + "grad_norm": 1.7694251045227662, + "learning_rate": 5.092200047827201e-06, + "loss": 0.4731, + "step": 6485 + }, + { + "epoch": 0.51, + "grad_norm": 1.2937997136246253, + "learning_rate": 5.090928464223498e-06, + "loss": 0.4485, + "step": 6486 + }, + { + "epoch": 0.51, + "grad_norm": 1.403455163043738, + "learning_rate": 5.089656874736841e-06, + "loss": 0.4763, + "step": 6487 + }, + { + "epoch": 0.51, + "grad_norm": 1.6380145590804795, + "learning_rate": 5.088385279449503e-06, + "loss": 0.4553, + "step": 6488 + }, + { + "epoch": 0.51, + "grad_norm": 1.6724149728007274, + "learning_rate": 5.087113678443751e-06, + "loss": 0.462, + "step": 6489 + }, + { + "epoch": 0.51, + "grad_norm": 1.9532041712429067, + "learning_rate": 5.085842071801859e-06, + "loss": 0.4554, + "step": 6490 + }, + { + "epoch": 0.51, + "grad_norm": 0.6167674487931547, + "learning_rate": 5.084570459606097e-06, + "loss": 0.4623, + "step": 6491 + }, + { + "epoch": 0.51, + "grad_norm": 1.8669605234622269, + "learning_rate": 5.083298841938738e-06, + "loss": 0.4838, + "step": 6492 + }, + { + "epoch": 0.51, + "grad_norm": 0.5640856060635908, + "learning_rate": 5.082027218882052e-06, + "loss": 0.5006, + "step": 6493 + }, + { + "epoch": 0.51, + "grad_norm": 0.5828311424051065, + "learning_rate": 5.080755590518314e-06, + "loss": 0.502, + "step": 6494 + }, + { + "epoch": 0.51, + "grad_norm": 1.5314133777730905, + "learning_rate": 5.0794839569297915e-06, + "loss": 0.4457, + "step": 6495 + }, + { + "epoch": 0.51, + "grad_norm": 2.479749530438239, + "learning_rate": 5.078212318198764e-06, + "loss": 0.3963, + "step": 6496 + }, + { + "epoch": 0.51, + "grad_norm": 1.6480121647898287, + "learning_rate": 5.0769406744075e-06, + "loss": 0.4495, + "step": 6497 + }, + { + "epoch": 0.51, + "grad_norm": 3.2992146818452595, + "learning_rate": 5.075669025638277e-06, + "loss": 0.4692, + "step": 6498 + }, + { + "epoch": 0.51, + "grad_norm": 1.4481826041154438, + "learning_rate": 5.074397371973365e-06, + "loss": 0.484, + "step": 6499 + }, + { + "epoch": 0.51, + "grad_norm": 0.6340666341137968, + "learning_rate": 5.073125713495041e-06, + "loss": 0.4874, + "step": 6500 + }, + { + "epoch": 0.51, + "grad_norm": 1.4830123884863922, + "learning_rate": 5.0718540502855785e-06, + "loss": 0.448, + "step": 6501 + }, + { + "epoch": 0.51, + "grad_norm": 1.52551552511447, + "learning_rate": 5.070582382427254e-06, + "loss": 0.443, + "step": 6502 + }, + { + "epoch": 0.51, + "grad_norm": 1.8181868208968002, + "learning_rate": 5.0693107100023395e-06, + "loss": 0.389, + "step": 6503 + }, + { + "epoch": 0.51, + "grad_norm": 1.49161500487639, + "learning_rate": 5.068039033093116e-06, + "loss": 0.51, + "step": 6504 + }, + { + "epoch": 0.51, + "grad_norm": 0.5763382188101537, + "learning_rate": 5.066767351781854e-06, + "loss": 0.487, + "step": 6505 + }, + { + "epoch": 0.51, + "grad_norm": 3.04663930982442, + "learning_rate": 5.065495666150831e-06, + "loss": 0.4222, + "step": 6506 + }, + { + "epoch": 0.51, + "grad_norm": 0.5393219359044084, + "learning_rate": 5.0642239762823265e-06, + "loss": 0.4766, + "step": 6507 + }, + { + "epoch": 0.51, + "grad_norm": 1.6516923896407305, + "learning_rate": 5.062952282258613e-06, + "loss": 0.4926, + "step": 6508 + }, + { + "epoch": 0.51, + "grad_norm": 2.025037515728459, + "learning_rate": 5.06168058416197e-06, + "loss": 0.4818, + "step": 6509 + }, + { + "epoch": 0.51, + "grad_norm": 1.4653430704681132, + "learning_rate": 5.060408882074673e-06, + "loss": 0.4469, + "step": 6510 + }, + { + "epoch": 0.51, + "grad_norm": 1.8705353443065302, + "learning_rate": 5.059137176079002e-06, + "loss": 0.5067, + "step": 6511 + }, + { + "epoch": 0.51, + "grad_norm": 1.7839725964332016, + "learning_rate": 5.057865466257231e-06, + "loss": 0.4404, + "step": 6512 + }, + { + "epoch": 0.51, + "grad_norm": 1.6739655116644743, + "learning_rate": 5.056593752691641e-06, + "loss": 0.4476, + "step": 6513 + }, + { + "epoch": 0.51, + "grad_norm": 1.3650251966476692, + "learning_rate": 5.055322035464508e-06, + "loss": 0.455, + "step": 6514 + }, + { + "epoch": 0.51, + "grad_norm": 1.7253294831946853, + "learning_rate": 5.0540503146581145e-06, + "loss": 0.4202, + "step": 6515 + }, + { + "epoch": 0.51, + "grad_norm": 1.5242466372334045, + "learning_rate": 5.052778590354735e-06, + "loss": 0.4904, + "step": 6516 + }, + { + "epoch": 0.51, + "grad_norm": 0.5871181548313401, + "learning_rate": 5.051506862636649e-06, + "loss": 0.4845, + "step": 6517 + }, + { + "epoch": 0.51, + "grad_norm": 0.6697088613735322, + "learning_rate": 5.050235131586137e-06, + "loss": 0.4871, + "step": 6518 + }, + { + "epoch": 0.51, + "grad_norm": 2.0651968957257587, + "learning_rate": 5.048963397285478e-06, + "loss": 0.444, + "step": 6519 + }, + { + "epoch": 0.51, + "grad_norm": 1.5361936215777474, + "learning_rate": 5.04769165981695e-06, + "loss": 0.4847, + "step": 6520 + }, + { + "epoch": 0.51, + "grad_norm": 0.5879318638331181, + "learning_rate": 5.046419919262836e-06, + "loss": 0.4852, + "step": 6521 + }, + { + "epoch": 0.51, + "grad_norm": 1.2785141861422618, + "learning_rate": 5.0451481757054126e-06, + "loss": 0.4385, + "step": 6522 + }, + { + "epoch": 0.51, + "grad_norm": 1.7864790265833952, + "learning_rate": 5.043876429226962e-06, + "loss": 0.4438, + "step": 6523 + }, + { + "epoch": 0.51, + "grad_norm": 2.1180513866463517, + "learning_rate": 5.042604679909762e-06, + "loss": 0.3892, + "step": 6524 + }, + { + "epoch": 0.51, + "grad_norm": 1.735681639974712, + "learning_rate": 5.041332927836097e-06, + "loss": 0.5056, + "step": 6525 + }, + { + "epoch": 0.51, + "grad_norm": 1.7360241007882895, + "learning_rate": 5.040061173088245e-06, + "loss": 0.4585, + "step": 6526 + }, + { + "epoch": 0.51, + "grad_norm": 1.5672631437033615, + "learning_rate": 5.038789415748488e-06, + "loss": 0.4153, + "step": 6527 + }, + { + "epoch": 0.51, + "grad_norm": 1.4727150012690737, + "learning_rate": 5.037517655899105e-06, + "loss": 0.4449, + "step": 6528 + }, + { + "epoch": 0.51, + "grad_norm": 1.3860435287685937, + "learning_rate": 5.03624589362238e-06, + "loss": 0.4623, + "step": 6529 + }, + { + "epoch": 0.51, + "grad_norm": 1.6981328538886096, + "learning_rate": 5.034974129000592e-06, + "loss": 0.4377, + "step": 6530 + }, + { + "epoch": 0.51, + "grad_norm": 2.1816491065009456, + "learning_rate": 5.033702362116025e-06, + "loss": 0.505, + "step": 6531 + }, + { + "epoch": 0.51, + "grad_norm": 1.7118607001755564, + "learning_rate": 5.032430593050959e-06, + "loss": 0.4419, + "step": 6532 + }, + { + "epoch": 0.51, + "grad_norm": 1.618560117825182, + "learning_rate": 5.031158821887676e-06, + "loss": 0.4744, + "step": 6533 + }, + { + "epoch": 0.51, + "grad_norm": 0.6799788168967031, + "learning_rate": 5.029887048708457e-06, + "loss": 0.4985, + "step": 6534 + }, + { + "epoch": 0.51, + "grad_norm": 4.38271717859473, + "learning_rate": 5.028615273595585e-06, + "loss": 0.4783, + "step": 6535 + }, + { + "epoch": 0.51, + "grad_norm": 0.6355921898547582, + "learning_rate": 5.027343496631343e-06, + "loss": 0.5105, + "step": 6536 + }, + { + "epoch": 0.51, + "grad_norm": 0.5477794782112593, + "learning_rate": 5.026071717898012e-06, + "loss": 0.4838, + "step": 6537 + }, + { + "epoch": 0.51, + "grad_norm": 1.5834892467927764, + "learning_rate": 5.0247999374778755e-06, + "loss": 0.4921, + "step": 6538 + }, + { + "epoch": 0.51, + "grad_norm": 1.9048369715373197, + "learning_rate": 5.023528155453217e-06, + "loss": 0.4616, + "step": 6539 + }, + { + "epoch": 0.51, + "grad_norm": 1.4381894090616965, + "learning_rate": 5.0222563719063155e-06, + "loss": 0.4911, + "step": 6540 + }, + { + "epoch": 0.51, + "grad_norm": 1.6092934041167208, + "learning_rate": 5.020984586919455e-06, + "loss": 0.4746, + "step": 6541 + }, + { + "epoch": 0.51, + "grad_norm": 1.527099662127723, + "learning_rate": 5.019712800574922e-06, + "loss": 0.4425, + "step": 6542 + }, + { + "epoch": 0.51, + "grad_norm": 1.6155996733860407, + "learning_rate": 5.018441012954994e-06, + "loss": 0.4852, + "step": 6543 + }, + { + "epoch": 0.51, + "grad_norm": 2.064781630285737, + "learning_rate": 5.017169224141959e-06, + "loss": 0.4702, + "step": 6544 + }, + { + "epoch": 0.51, + "grad_norm": 1.4796614812995288, + "learning_rate": 5.015897434218095e-06, + "loss": 0.449, + "step": 6545 + }, + { + "epoch": 0.51, + "grad_norm": 1.468740237532509, + "learning_rate": 5.014625643265691e-06, + "loss": 0.4589, + "step": 6546 + }, + { + "epoch": 0.51, + "grad_norm": 0.7914998376635575, + "learning_rate": 5.013353851367023e-06, + "loss": 0.5011, + "step": 6547 + }, + { + "epoch": 0.51, + "grad_norm": 1.6147302324372743, + "learning_rate": 5.0120820586043815e-06, + "loss": 0.4616, + "step": 6548 + }, + { + "epoch": 0.51, + "grad_norm": 1.4625195074328725, + "learning_rate": 5.010810265060045e-06, + "loss": 0.4513, + "step": 6549 + }, + { + "epoch": 0.51, + "grad_norm": 1.7003502354707902, + "learning_rate": 5.0095384708163e-06, + "loss": 0.4959, + "step": 6550 + }, + { + "epoch": 0.51, + "grad_norm": 1.7524370562843992, + "learning_rate": 5.008266675955428e-06, + "loss": 0.4133, + "step": 6551 + }, + { + "epoch": 0.51, + "grad_norm": 2.688561539101369, + "learning_rate": 5.006994880559712e-06, + "loss": 0.5215, + "step": 6552 + }, + { + "epoch": 0.51, + "grad_norm": 1.6106223449506112, + "learning_rate": 5.005723084711437e-06, + "loss": 0.4746, + "step": 6553 + }, + { + "epoch": 0.51, + "grad_norm": 1.4820154619908792, + "learning_rate": 5.004451288492886e-06, + "loss": 0.4675, + "step": 6554 + }, + { + "epoch": 0.51, + "grad_norm": 2.246026328559546, + "learning_rate": 5.003179491986342e-06, + "loss": 0.4857, + "step": 6555 + }, + { + "epoch": 0.51, + "grad_norm": 1.7613762369657864, + "learning_rate": 5.00190769527409e-06, + "loss": 0.4103, + "step": 6556 + }, + { + "epoch": 0.51, + "grad_norm": 2.140556331653775, + "learning_rate": 5.00063589843841e-06, + "loss": 0.4421, + "step": 6557 + }, + { + "epoch": 0.52, + "grad_norm": 1.746998395976713, + "learning_rate": 4.999364101561592e-06, + "loss": 0.5259, + "step": 6558 + }, + { + "epoch": 0.52, + "grad_norm": 1.7602707426218345, + "learning_rate": 4.998092304725912e-06, + "loss": 0.5022, + "step": 6559 + }, + { + "epoch": 0.52, + "grad_norm": 1.3814016841487906, + "learning_rate": 4.9968205080136605e-06, + "loss": 0.429, + "step": 6560 + }, + { + "epoch": 0.52, + "grad_norm": 1.6379138858433755, + "learning_rate": 4.995548711507115e-06, + "loss": 0.4855, + "step": 6561 + }, + { + "epoch": 0.52, + "grad_norm": 2.7931690434332954, + "learning_rate": 4.9942769152885655e-06, + "loss": 0.4413, + "step": 6562 + }, + { + "epoch": 0.52, + "grad_norm": 1.5984598509535575, + "learning_rate": 4.993005119440289e-06, + "loss": 0.5105, + "step": 6563 + }, + { + "epoch": 0.52, + "grad_norm": 1.4625184749589843, + "learning_rate": 4.991733324044573e-06, + "loss": 0.4605, + "step": 6564 + }, + { + "epoch": 0.52, + "grad_norm": 1.667020001239487, + "learning_rate": 4.990461529183701e-06, + "loss": 0.4927, + "step": 6565 + }, + { + "epoch": 0.52, + "grad_norm": 1.672614899182484, + "learning_rate": 4.989189734939955e-06, + "loss": 0.4448, + "step": 6566 + }, + { + "epoch": 0.52, + "grad_norm": 0.6386465885427992, + "learning_rate": 4.987917941395619e-06, + "loss": 0.5107, + "step": 6567 + }, + { + "epoch": 0.52, + "grad_norm": 1.7901524897071457, + "learning_rate": 4.986646148632977e-06, + "loss": 0.4446, + "step": 6568 + }, + { + "epoch": 0.52, + "grad_norm": 2.0106781750327225, + "learning_rate": 4.985374356734312e-06, + "loss": 0.5412, + "step": 6569 + }, + { + "epoch": 0.52, + "grad_norm": 0.5552134575702795, + "learning_rate": 4.984102565781906e-06, + "loss": 0.4765, + "step": 6570 + }, + { + "epoch": 0.52, + "grad_norm": 1.7707064918798165, + "learning_rate": 4.982830775858044e-06, + "loss": 0.4821, + "step": 6571 + }, + { + "epoch": 0.52, + "grad_norm": 2.0863466909489294, + "learning_rate": 4.981558987045007e-06, + "loss": 0.4865, + "step": 6572 + }, + { + "epoch": 0.52, + "grad_norm": 1.5914179011320913, + "learning_rate": 4.98028719942508e-06, + "loss": 0.4756, + "step": 6573 + }, + { + "epoch": 0.52, + "grad_norm": 1.8904943073991147, + "learning_rate": 4.979015413080546e-06, + "loss": 0.4862, + "step": 6574 + }, + { + "epoch": 0.52, + "grad_norm": 1.697027483572185, + "learning_rate": 4.977743628093685e-06, + "loss": 0.4622, + "step": 6575 + }, + { + "epoch": 0.52, + "grad_norm": 1.7749033701443462, + "learning_rate": 4.976471844546785e-06, + "loss": 0.4279, + "step": 6576 + }, + { + "epoch": 0.52, + "grad_norm": 1.6074300144830493, + "learning_rate": 4.9752000625221245e-06, + "loss": 0.4585, + "step": 6577 + }, + { + "epoch": 0.52, + "grad_norm": 1.403553716427613, + "learning_rate": 4.9739282821019885e-06, + "loss": 0.4544, + "step": 6578 + }, + { + "epoch": 0.52, + "grad_norm": 0.6483493864203449, + "learning_rate": 4.972656503368658e-06, + "loss": 0.4975, + "step": 6579 + }, + { + "epoch": 0.52, + "grad_norm": 1.6865729887753296, + "learning_rate": 4.971384726404416e-06, + "loss": 0.4474, + "step": 6580 + }, + { + "epoch": 0.52, + "grad_norm": 1.5528975609979347, + "learning_rate": 4.970112951291545e-06, + "loss": 0.4315, + "step": 6581 + }, + { + "epoch": 0.52, + "grad_norm": 1.585093527575743, + "learning_rate": 4.9688411781123266e-06, + "loss": 0.4623, + "step": 6582 + }, + { + "epoch": 0.52, + "grad_norm": 1.3373019867343807, + "learning_rate": 4.9675694069490435e-06, + "loss": 0.4481, + "step": 6583 + }, + { + "epoch": 0.52, + "grad_norm": 0.5914545680857263, + "learning_rate": 4.966297637883977e-06, + "loss": 0.5169, + "step": 6584 + }, + { + "epoch": 0.52, + "grad_norm": 0.5629030584735683, + "learning_rate": 4.965025870999409e-06, + "loss": 0.4918, + "step": 6585 + }, + { + "epoch": 0.52, + "grad_norm": 1.724908788467732, + "learning_rate": 4.963754106377622e-06, + "loss": 0.4773, + "step": 6586 + }, + { + "epoch": 0.52, + "grad_norm": 1.6909992653112695, + "learning_rate": 4.962482344100897e-06, + "loss": 0.4685, + "step": 6587 + }, + { + "epoch": 0.52, + "grad_norm": 1.9737451479676558, + "learning_rate": 4.961210584251515e-06, + "loss": 0.4931, + "step": 6588 + }, + { + "epoch": 0.52, + "grad_norm": 0.5825480971202149, + "learning_rate": 4.959938826911758e-06, + "loss": 0.4956, + "step": 6589 + }, + { + "epoch": 0.52, + "grad_norm": 1.7931334949447801, + "learning_rate": 4.958667072163904e-06, + "loss": 0.5093, + "step": 6590 + }, + { + "epoch": 0.52, + "grad_norm": 1.5849061329745877, + "learning_rate": 4.95739532009024e-06, + "loss": 0.483, + "step": 6591 + }, + { + "epoch": 0.52, + "grad_norm": 1.9534836672895128, + "learning_rate": 4.95612357077304e-06, + "loss": 0.4713, + "step": 6592 + }, + { + "epoch": 0.52, + "grad_norm": 1.6364517814415565, + "learning_rate": 4.95485182429459e-06, + "loss": 0.4564, + "step": 6593 + }, + { + "epoch": 0.52, + "grad_norm": 1.381835383830577, + "learning_rate": 4.953580080737165e-06, + "loss": 0.4302, + "step": 6594 + }, + { + "epoch": 0.52, + "grad_norm": 1.522068476764517, + "learning_rate": 4.9523083401830525e-06, + "loss": 0.4773, + "step": 6595 + }, + { + "epoch": 0.52, + "grad_norm": 0.5738696836903748, + "learning_rate": 4.951036602714523e-06, + "loss": 0.51, + "step": 6596 + }, + { + "epoch": 0.52, + "grad_norm": 1.8494017245688474, + "learning_rate": 4.949764868413866e-06, + "loss": 0.4589, + "step": 6597 + }, + { + "epoch": 0.52, + "grad_norm": 1.4704047226354522, + "learning_rate": 4.948493137363352e-06, + "loss": 0.4665, + "step": 6598 + }, + { + "epoch": 0.52, + "grad_norm": 1.754697408068161, + "learning_rate": 4.947221409645266e-06, + "loss": 0.4425, + "step": 6599 + }, + { + "epoch": 0.52, + "grad_norm": 2.0926232812347916, + "learning_rate": 4.945949685341887e-06, + "loss": 0.4899, + "step": 6600 + }, + { + "epoch": 0.52, + "grad_norm": 1.7522370551380122, + "learning_rate": 4.944677964535491e-06, + "loss": 0.4871, + "step": 6601 + }, + { + "epoch": 0.52, + "grad_norm": 2.759606510750112, + "learning_rate": 4.94340624730836e-06, + "loss": 0.4566, + "step": 6602 + }, + { + "epoch": 0.52, + "grad_norm": 2.517509056644059, + "learning_rate": 4.94213453374277e-06, + "loss": 0.4742, + "step": 6603 + }, + { + "epoch": 0.52, + "grad_norm": 0.5700796353976145, + "learning_rate": 4.940862823921001e-06, + "loss": 0.4982, + "step": 6604 + }, + { + "epoch": 0.52, + "grad_norm": 1.396792520871284, + "learning_rate": 4.939591117925328e-06, + "loss": 0.4608, + "step": 6605 + }, + { + "epoch": 0.52, + "grad_norm": 1.5312569656974562, + "learning_rate": 4.938319415838032e-06, + "loss": 0.4473, + "step": 6606 + }, + { + "epoch": 0.52, + "grad_norm": 2.716813521772426, + "learning_rate": 4.937047717741388e-06, + "loss": 0.4179, + "step": 6607 + }, + { + "epoch": 0.52, + "grad_norm": 2.4851415029821977, + "learning_rate": 4.935776023717676e-06, + "loss": 0.4562, + "step": 6608 + }, + { + "epoch": 0.52, + "grad_norm": 2.3049576042663205, + "learning_rate": 4.93450433384917e-06, + "loss": 0.4146, + "step": 6609 + }, + { + "epoch": 0.52, + "grad_norm": 1.8107778157177974, + "learning_rate": 4.933232648218146e-06, + "loss": 0.4324, + "step": 6610 + }, + { + "epoch": 0.52, + "grad_norm": 3.155892743859018, + "learning_rate": 4.931960966906886e-06, + "loss": 0.4465, + "step": 6611 + }, + { + "epoch": 0.52, + "grad_norm": 2.084578534647461, + "learning_rate": 4.93068928999766e-06, + "loss": 0.4579, + "step": 6612 + }, + { + "epoch": 0.52, + "grad_norm": 0.5679523033413384, + "learning_rate": 4.929417617572748e-06, + "loss": 0.4872, + "step": 6613 + }, + { + "epoch": 0.52, + "grad_norm": 1.9941322909236727, + "learning_rate": 4.928145949714422e-06, + "loss": 0.4781, + "step": 6614 + }, + { + "epoch": 0.52, + "grad_norm": 1.7339470559673689, + "learning_rate": 4.9268742865049616e-06, + "loss": 0.4949, + "step": 6615 + }, + { + "epoch": 0.52, + "grad_norm": 1.7856585746286044, + "learning_rate": 4.925602628026636e-06, + "loss": 0.4814, + "step": 6616 + }, + { + "epoch": 0.52, + "grad_norm": 1.5056608638852746, + "learning_rate": 4.924330974361727e-06, + "loss": 0.5297, + "step": 6617 + }, + { + "epoch": 0.52, + "grad_norm": 1.8359333336880963, + "learning_rate": 4.923059325592501e-06, + "loss": 0.4741, + "step": 6618 + }, + { + "epoch": 0.52, + "grad_norm": 1.5503684548913055, + "learning_rate": 4.921787681801239e-06, + "loss": 0.504, + "step": 6619 + }, + { + "epoch": 0.52, + "grad_norm": 1.651162966840047, + "learning_rate": 4.920516043070209e-06, + "loss": 0.4996, + "step": 6620 + }, + { + "epoch": 0.52, + "grad_norm": 2.4520006085159736, + "learning_rate": 4.919244409481688e-06, + "loss": 0.4965, + "step": 6621 + }, + { + "epoch": 0.52, + "grad_norm": 1.507379351000236, + "learning_rate": 4.91797278111795e-06, + "loss": 0.4713, + "step": 6622 + }, + { + "epoch": 0.52, + "grad_norm": 1.5232658273501551, + "learning_rate": 4.9167011580612626e-06, + "loss": 0.4622, + "step": 6623 + }, + { + "epoch": 0.52, + "grad_norm": 2.2678727128390275, + "learning_rate": 4.915429540393904e-06, + "loss": 0.4494, + "step": 6624 + }, + { + "epoch": 0.52, + "grad_norm": 0.595051580143376, + "learning_rate": 4.914157928198141e-06, + "loss": 0.4981, + "step": 6625 + }, + { + "epoch": 0.52, + "grad_norm": 1.8361111889880855, + "learning_rate": 4.912886321556251e-06, + "loss": 0.4302, + "step": 6626 + }, + { + "epoch": 0.52, + "grad_norm": 2.1494978728749725, + "learning_rate": 4.911614720550498e-06, + "loss": 0.5162, + "step": 6627 + }, + { + "epoch": 0.52, + "grad_norm": 2.338526152909621, + "learning_rate": 4.910343125263161e-06, + "loss": 0.4824, + "step": 6628 + }, + { + "epoch": 0.52, + "grad_norm": 0.5747473181858885, + "learning_rate": 4.909071535776504e-06, + "loss": 0.4764, + "step": 6629 + }, + { + "epoch": 0.52, + "grad_norm": 1.8721175945294664, + "learning_rate": 4.907799952172801e-06, + "loss": 0.4988, + "step": 6630 + }, + { + "epoch": 0.52, + "grad_norm": 1.6734475179245696, + "learning_rate": 4.906528374534319e-06, + "loss": 0.4748, + "step": 6631 + }, + { + "epoch": 0.52, + "grad_norm": 1.75169473167023, + "learning_rate": 4.905256802943329e-06, + "loss": 0.4632, + "step": 6632 + }, + { + "epoch": 0.52, + "grad_norm": 0.5928872313793587, + "learning_rate": 4.903985237482101e-06, + "loss": 0.4833, + "step": 6633 + }, + { + "epoch": 0.52, + "grad_norm": 2.3230413265437955, + "learning_rate": 4.902713678232901e-06, + "loss": 0.4837, + "step": 6634 + }, + { + "epoch": 0.52, + "grad_norm": 2.6438141443289713, + "learning_rate": 4.901442125278002e-06, + "loss": 0.5444, + "step": 6635 + }, + { + "epoch": 0.52, + "grad_norm": 1.703821987295174, + "learning_rate": 4.900170578699665e-06, + "loss": 0.491, + "step": 6636 + }, + { + "epoch": 0.52, + "grad_norm": 0.5532731761245311, + "learning_rate": 4.898899038580163e-06, + "loss": 0.5038, + "step": 6637 + }, + { + "epoch": 0.52, + "grad_norm": 1.6137113605692694, + "learning_rate": 4.897627505001761e-06, + "loss": 0.4789, + "step": 6638 + }, + { + "epoch": 0.52, + "grad_norm": 2.083458359436787, + "learning_rate": 4.8963559780467245e-06, + "loss": 0.4427, + "step": 6639 + }, + { + "epoch": 0.52, + "grad_norm": 8.735699937683806, + "learning_rate": 4.89508445779732e-06, + "loss": 0.5132, + "step": 6640 + }, + { + "epoch": 0.52, + "grad_norm": 3.8038439633313814, + "learning_rate": 4.893812944335816e-06, + "loss": 0.4564, + "step": 6641 + }, + { + "epoch": 0.52, + "grad_norm": 0.599346121861896, + "learning_rate": 4.892541437744474e-06, + "loss": 0.4956, + "step": 6642 + }, + { + "epoch": 0.52, + "grad_norm": 1.6319668672827046, + "learning_rate": 4.89126993810556e-06, + "loss": 0.4473, + "step": 6643 + }, + { + "epoch": 0.52, + "grad_norm": 1.3807712357452564, + "learning_rate": 4.889998445501341e-06, + "loss": 0.4615, + "step": 6644 + }, + { + "epoch": 0.52, + "grad_norm": 1.8089269249207602, + "learning_rate": 4.888726960014076e-06, + "loss": 0.4839, + "step": 6645 + }, + { + "epoch": 0.52, + "grad_norm": 1.6250446315475635, + "learning_rate": 4.8874554817260326e-06, + "loss": 0.4799, + "step": 6646 + }, + { + "epoch": 0.52, + "grad_norm": 2.4841089594865604, + "learning_rate": 4.886184010719472e-06, + "loss": 0.4558, + "step": 6647 + }, + { + "epoch": 0.52, + "grad_norm": 3.043329935266655, + "learning_rate": 4.884912547076658e-06, + "loss": 0.4527, + "step": 6648 + }, + { + "epoch": 0.52, + "grad_norm": 1.7329224835715065, + "learning_rate": 4.8836410908798494e-06, + "loss": 0.4962, + "step": 6649 + }, + { + "epoch": 0.52, + "grad_norm": 1.5786666192420358, + "learning_rate": 4.882369642211312e-06, + "loss": 0.4526, + "step": 6650 + }, + { + "epoch": 0.52, + "grad_norm": 2.0449134902508295, + "learning_rate": 4.881098201153304e-06, + "loss": 0.4538, + "step": 6651 + }, + { + "epoch": 0.52, + "grad_norm": 1.4191880418898544, + "learning_rate": 4.8798267677880876e-06, + "loss": 0.4679, + "step": 6652 + }, + { + "epoch": 0.52, + "grad_norm": 1.7900797804954813, + "learning_rate": 4.878555342197921e-06, + "loss": 0.4659, + "step": 6653 + }, + { + "epoch": 0.52, + "grad_norm": 1.4665200419805204, + "learning_rate": 4.8772839244650656e-06, + "loss": 0.4176, + "step": 6654 + }, + { + "epoch": 0.52, + "grad_norm": 1.6194357665050376, + "learning_rate": 4.87601251467178e-06, + "loss": 0.4999, + "step": 6655 + }, + { + "epoch": 0.52, + "grad_norm": 2.126487135138284, + "learning_rate": 4.874741112900322e-06, + "loss": 0.4715, + "step": 6656 + }, + { + "epoch": 0.52, + "grad_norm": 0.5486977297393273, + "learning_rate": 4.87346971923295e-06, + "loss": 0.5084, + "step": 6657 + }, + { + "epoch": 0.52, + "grad_norm": 1.8426484825007232, + "learning_rate": 4.8721983337519225e-06, + "loss": 0.4898, + "step": 6658 + }, + { + "epoch": 0.52, + "grad_norm": 1.6571438035086183, + "learning_rate": 4.870926956539496e-06, + "loss": 0.4679, + "step": 6659 + }, + { + "epoch": 0.52, + "grad_norm": 1.9865248860182858, + "learning_rate": 4.869655587677924e-06, + "loss": 0.4848, + "step": 6660 + }, + { + "epoch": 0.52, + "grad_norm": 1.5541856066603268, + "learning_rate": 4.868384227249468e-06, + "loss": 0.5084, + "step": 6661 + }, + { + "epoch": 0.52, + "grad_norm": 0.5767249659101897, + "learning_rate": 4.867112875336377e-06, + "loss": 0.5067, + "step": 6662 + }, + { + "epoch": 0.52, + "grad_norm": 1.5353966167917583, + "learning_rate": 4.865841532020913e-06, + "loss": 0.4977, + "step": 6663 + }, + { + "epoch": 0.52, + "grad_norm": 0.5283131099066546, + "learning_rate": 4.864570197385322e-06, + "loss": 0.4931, + "step": 6664 + }, + { + "epoch": 0.52, + "grad_norm": 1.603006967679426, + "learning_rate": 4.863298871511865e-06, + "loss": 0.4396, + "step": 6665 + }, + { + "epoch": 0.52, + "grad_norm": 2.137228529548092, + "learning_rate": 4.862027554482792e-06, + "loss": 0.492, + "step": 6666 + }, + { + "epoch": 0.52, + "grad_norm": 1.5576536805604777, + "learning_rate": 4.860756246380355e-06, + "loss": 0.4479, + "step": 6667 + }, + { + "epoch": 0.52, + "grad_norm": 1.5608197759602769, + "learning_rate": 4.859484947286807e-06, + "loss": 0.4605, + "step": 6668 + }, + { + "epoch": 0.52, + "grad_norm": 2.046512708853382, + "learning_rate": 4.8582136572844e-06, + "loss": 0.492, + "step": 6669 + }, + { + "epoch": 0.52, + "grad_norm": 2.9007808083191864, + "learning_rate": 4.856942376455384e-06, + "loss": 0.4541, + "step": 6670 + }, + { + "epoch": 0.52, + "grad_norm": 1.7703421058908309, + "learning_rate": 4.855671104882007e-06, + "loss": 0.4191, + "step": 6671 + }, + { + "epoch": 0.52, + "grad_norm": 1.6791083951423138, + "learning_rate": 4.854399842646523e-06, + "loss": 0.4354, + "step": 6672 + }, + { + "epoch": 0.52, + "grad_norm": 1.816002673511426, + "learning_rate": 4.853128589831177e-06, + "loss": 0.4767, + "step": 6673 + }, + { + "epoch": 0.52, + "grad_norm": 1.6977270770102055, + "learning_rate": 4.85185734651822e-06, + "loss": 0.4247, + "step": 6674 + }, + { + "epoch": 0.52, + "grad_norm": 2.3852248613693625, + "learning_rate": 4.850586112789898e-06, + "loss": 0.4939, + "step": 6675 + }, + { + "epoch": 0.52, + "grad_norm": 1.7050024404876745, + "learning_rate": 4.849314888728461e-06, + "loss": 0.4043, + "step": 6676 + }, + { + "epoch": 0.52, + "grad_norm": 2.250295026271321, + "learning_rate": 4.848043674416151e-06, + "loss": 0.4919, + "step": 6677 + }, + { + "epoch": 0.52, + "grad_norm": 2.1360740699064604, + "learning_rate": 4.846772469935217e-06, + "loss": 0.4817, + "step": 6678 + }, + { + "epoch": 0.52, + "grad_norm": 1.6129578544997196, + "learning_rate": 4.8455012753679055e-06, + "loss": 0.483, + "step": 6679 + }, + { + "epoch": 0.52, + "grad_norm": 0.6085352296148729, + "learning_rate": 4.844230090796456e-06, + "loss": 0.5079, + "step": 6680 + }, + { + "epoch": 0.52, + "grad_norm": 0.5658977497232243, + "learning_rate": 4.84295891630312e-06, + "loss": 0.4946, + "step": 6681 + }, + { + "epoch": 0.52, + "grad_norm": 1.768054008013359, + "learning_rate": 4.841687751970135e-06, + "loss": 0.4782, + "step": 6682 + }, + { + "epoch": 0.52, + "grad_norm": 1.9134607265158077, + "learning_rate": 4.840416597879745e-06, + "loss": 0.5162, + "step": 6683 + }, + { + "epoch": 0.52, + "grad_norm": 1.6921107203446453, + "learning_rate": 4.839145454114192e-06, + "loss": 0.4985, + "step": 6684 + }, + { + "epoch": 0.53, + "grad_norm": 1.9679053500275927, + "learning_rate": 4.8378743207557185e-06, + "loss": 0.4788, + "step": 6685 + }, + { + "epoch": 0.53, + "grad_norm": 1.6901348036170842, + "learning_rate": 4.836603197886564e-06, + "loss": 0.47, + "step": 6686 + }, + { + "epoch": 0.53, + "grad_norm": 2.13183847023187, + "learning_rate": 4.8353320855889695e-06, + "loss": 0.4411, + "step": 6687 + }, + { + "epoch": 0.53, + "grad_norm": 1.5961399061141976, + "learning_rate": 4.834060983945173e-06, + "loss": 0.4416, + "step": 6688 + }, + { + "epoch": 0.53, + "grad_norm": 1.8722968907181048, + "learning_rate": 4.832789893037414e-06, + "loss": 0.5138, + "step": 6689 + }, + { + "epoch": 0.53, + "grad_norm": 2.0640403752998626, + "learning_rate": 4.83151881294793e-06, + "loss": 0.452, + "step": 6690 + }, + { + "epoch": 0.53, + "grad_norm": 1.5473051722122786, + "learning_rate": 4.830247743758958e-06, + "loss": 0.4393, + "step": 6691 + }, + { + "epoch": 0.53, + "grad_norm": 1.777382698419964, + "learning_rate": 4.828976685552736e-06, + "loss": 0.4595, + "step": 6692 + }, + { + "epoch": 0.53, + "grad_norm": 1.8362928620077252, + "learning_rate": 4.827705638411498e-06, + "loss": 0.46, + "step": 6693 + }, + { + "epoch": 0.53, + "grad_norm": 1.6333266890757783, + "learning_rate": 4.8264346024174805e-06, + "loss": 0.5163, + "step": 6694 + }, + { + "epoch": 0.53, + "grad_norm": 2.357337029961531, + "learning_rate": 4.8251635776529145e-06, + "loss": 0.4597, + "step": 6695 + }, + { + "epoch": 0.53, + "grad_norm": 1.7135765788443145, + "learning_rate": 4.823892564200041e-06, + "loss": 0.4812, + "step": 6696 + }, + { + "epoch": 0.53, + "grad_norm": 0.6477493336238613, + "learning_rate": 4.822621562141083e-06, + "loss": 0.5354, + "step": 6697 + }, + { + "epoch": 0.53, + "grad_norm": 1.4076798938440025, + "learning_rate": 4.8213505715582825e-06, + "loss": 0.432, + "step": 6698 + }, + { + "epoch": 0.53, + "grad_norm": 1.872817909294625, + "learning_rate": 4.820079592533863e-06, + "loss": 0.4669, + "step": 6699 + }, + { + "epoch": 0.53, + "grad_norm": 1.7045707242200616, + "learning_rate": 4.818808625150059e-06, + "loss": 0.4077, + "step": 6700 + }, + { + "epoch": 0.53, + "grad_norm": 2.3066747084186323, + "learning_rate": 4.817537669489102e-06, + "loss": 0.4998, + "step": 6701 + }, + { + "epoch": 0.53, + "grad_norm": 0.5719337371379443, + "learning_rate": 4.816266725633217e-06, + "loss": 0.491, + "step": 6702 + }, + { + "epoch": 0.53, + "grad_norm": 0.5471063264534128, + "learning_rate": 4.814995793664635e-06, + "loss": 0.4915, + "step": 6703 + }, + { + "epoch": 0.53, + "grad_norm": 1.863722558240749, + "learning_rate": 4.813724873665584e-06, + "loss": 0.5057, + "step": 6704 + }, + { + "epoch": 0.53, + "grad_norm": 1.401620419186506, + "learning_rate": 4.81245396571829e-06, + "loss": 0.4995, + "step": 6705 + }, + { + "epoch": 0.53, + "grad_norm": 2.1341052732329504, + "learning_rate": 4.8111830699049786e-06, + "loss": 0.4419, + "step": 6706 + }, + { + "epoch": 0.53, + "grad_norm": 4.350273591991399, + "learning_rate": 4.8099121863078756e-06, + "loss": 0.4737, + "step": 6707 + }, + { + "epoch": 0.53, + "grad_norm": 4.626716249700459, + "learning_rate": 4.808641315009205e-06, + "loss": 0.4456, + "step": 6708 + }, + { + "epoch": 0.53, + "grad_norm": 1.6789296859407992, + "learning_rate": 4.807370456091192e-06, + "loss": 0.4653, + "step": 6709 + }, + { + "epoch": 0.53, + "grad_norm": 1.568284743686291, + "learning_rate": 4.8060996096360576e-06, + "loss": 0.4059, + "step": 6710 + }, + { + "epoch": 0.53, + "grad_norm": 1.6857332866413042, + "learning_rate": 4.804828775726023e-06, + "loss": 0.4226, + "step": 6711 + }, + { + "epoch": 0.53, + "grad_norm": 0.5841408113229113, + "learning_rate": 4.803557954443316e-06, + "loss": 0.4846, + "step": 6712 + }, + { + "epoch": 0.53, + "grad_norm": 1.381725494190368, + "learning_rate": 4.802287145870148e-06, + "loss": 0.4608, + "step": 6713 + }, + { + "epoch": 0.53, + "grad_norm": 2.349057462129503, + "learning_rate": 4.801016350088747e-06, + "loss": 0.4604, + "step": 6714 + }, + { + "epoch": 0.53, + "grad_norm": 1.5637023360271405, + "learning_rate": 4.799745567181323e-06, + "loss": 0.5196, + "step": 6715 + }, + { + "epoch": 0.53, + "grad_norm": 1.7164546471348563, + "learning_rate": 4.798474797230103e-06, + "loss": 0.4291, + "step": 6716 + }, + { + "epoch": 0.53, + "grad_norm": 1.7041170713192688, + "learning_rate": 4.797204040317297e-06, + "loss": 0.4628, + "step": 6717 + }, + { + "epoch": 0.53, + "grad_norm": 2.1716075974897664, + "learning_rate": 4.795933296525126e-06, + "loss": 0.4576, + "step": 6718 + }, + { + "epoch": 0.53, + "grad_norm": 2.005365912475376, + "learning_rate": 4.794662565935803e-06, + "loss": 0.4331, + "step": 6719 + }, + { + "epoch": 0.53, + "grad_norm": 1.7747751146802806, + "learning_rate": 4.793391848631545e-06, + "loss": 0.4383, + "step": 6720 + }, + { + "epoch": 0.53, + "grad_norm": 1.570733955633041, + "learning_rate": 4.792121144694563e-06, + "loss": 0.4789, + "step": 6721 + }, + { + "epoch": 0.53, + "grad_norm": 2.1695540170070995, + "learning_rate": 4.79085045420707e-06, + "loss": 0.4865, + "step": 6722 + }, + { + "epoch": 0.53, + "grad_norm": 2.727169963089276, + "learning_rate": 4.789579777251281e-06, + "loss": 0.4761, + "step": 6723 + }, + { + "epoch": 0.53, + "grad_norm": 1.7291508018262458, + "learning_rate": 4.788309113909403e-06, + "loss": 0.4361, + "step": 6724 + }, + { + "epoch": 0.53, + "grad_norm": 1.705861371318313, + "learning_rate": 4.787038464263651e-06, + "loss": 0.4223, + "step": 6725 + }, + { + "epoch": 0.53, + "grad_norm": 1.5672587456306586, + "learning_rate": 4.785767828396229e-06, + "loss": 0.4858, + "step": 6726 + }, + { + "epoch": 0.53, + "grad_norm": 2.16512750088143, + "learning_rate": 4.7844972063893505e-06, + "loss": 0.4887, + "step": 6727 + }, + { + "epoch": 0.53, + "grad_norm": 6.97064518061688, + "learning_rate": 4.7832265983252195e-06, + "loss": 0.4877, + "step": 6728 + }, + { + "epoch": 0.53, + "grad_norm": 1.4704118534320378, + "learning_rate": 4.781956004286045e-06, + "loss": 0.4352, + "step": 6729 + }, + { + "epoch": 0.53, + "grad_norm": 1.9015981566658091, + "learning_rate": 4.7806854243540304e-06, + "loss": 0.4831, + "step": 6730 + }, + { + "epoch": 0.53, + "grad_norm": 1.7567716512426574, + "learning_rate": 4.779414858611384e-06, + "loss": 0.4414, + "step": 6731 + }, + { + "epoch": 0.53, + "grad_norm": 1.8134928907962284, + "learning_rate": 4.778144307140305e-06, + "loss": 0.4379, + "step": 6732 + }, + { + "epoch": 0.53, + "grad_norm": 1.8554622467275732, + "learning_rate": 4.776873770023e-06, + "loss": 0.4676, + "step": 6733 + }, + { + "epoch": 0.53, + "grad_norm": 1.5533367181369777, + "learning_rate": 4.775603247341671e-06, + "loss": 0.465, + "step": 6734 + }, + { + "epoch": 0.53, + "grad_norm": 0.5739806818878227, + "learning_rate": 4.774332739178519e-06, + "loss": 0.4827, + "step": 6735 + }, + { + "epoch": 0.53, + "grad_norm": 1.4932931040628383, + "learning_rate": 4.773062245615742e-06, + "loss": 0.4566, + "step": 6736 + }, + { + "epoch": 0.53, + "grad_norm": 1.4523757845468528, + "learning_rate": 4.771791766735541e-06, + "loss": 0.438, + "step": 6737 + }, + { + "epoch": 0.53, + "grad_norm": 1.6041285269136627, + "learning_rate": 4.770521302620116e-06, + "loss": 0.5082, + "step": 6738 + }, + { + "epoch": 0.53, + "grad_norm": 1.5909074500097466, + "learning_rate": 4.76925085335166e-06, + "loss": 0.4633, + "step": 6739 + }, + { + "epoch": 0.53, + "grad_norm": 2.765590920070977, + "learning_rate": 4.7679804190123745e-06, + "loss": 0.4354, + "step": 6740 + }, + { + "epoch": 0.53, + "grad_norm": 1.6688630773308473, + "learning_rate": 4.76670999968445e-06, + "loss": 0.5043, + "step": 6741 + }, + { + "epoch": 0.53, + "grad_norm": 1.862423802875408, + "learning_rate": 4.765439595450085e-06, + "loss": 0.4966, + "step": 6742 + }, + { + "epoch": 0.53, + "grad_norm": 1.497814853240861, + "learning_rate": 4.76416920639147e-06, + "loss": 0.4505, + "step": 6743 + }, + { + "epoch": 0.53, + "grad_norm": 2.4166943086215094, + "learning_rate": 4.7628988325907985e-06, + "loss": 0.4593, + "step": 6744 + }, + { + "epoch": 0.53, + "grad_norm": 1.5090953076732723, + "learning_rate": 4.761628474130265e-06, + "loss": 0.5156, + "step": 6745 + }, + { + "epoch": 0.53, + "grad_norm": 1.8345607229160001, + "learning_rate": 4.760358131092054e-06, + "loss": 0.4348, + "step": 6746 + }, + { + "epoch": 0.53, + "grad_norm": 1.6835827591594243, + "learning_rate": 4.759087803558362e-06, + "loss": 0.4743, + "step": 6747 + }, + { + "epoch": 0.53, + "grad_norm": 1.6058234461878689, + "learning_rate": 4.757817491611371e-06, + "loss": 0.4258, + "step": 6748 + }, + { + "epoch": 0.53, + "grad_norm": 3.7275344434298945, + "learning_rate": 4.756547195333274e-06, + "loss": 0.485, + "step": 6749 + }, + { + "epoch": 0.53, + "grad_norm": 1.7207539024518654, + "learning_rate": 4.755276914806252e-06, + "loss": 0.4502, + "step": 6750 + }, + { + "epoch": 0.53, + "grad_norm": 1.5110840116608724, + "learning_rate": 4.754006650112497e-06, + "loss": 0.43, + "step": 6751 + }, + { + "epoch": 0.53, + "grad_norm": 2.191974552051519, + "learning_rate": 4.752736401334186e-06, + "loss": 0.455, + "step": 6752 + }, + { + "epoch": 0.53, + "grad_norm": 0.5466207458158461, + "learning_rate": 4.751466168553509e-06, + "loss": 0.4925, + "step": 6753 + }, + { + "epoch": 0.53, + "grad_norm": 0.544468075265627, + "learning_rate": 4.750195951852645e-06, + "loss": 0.5073, + "step": 6754 + }, + { + "epoch": 0.53, + "grad_norm": 1.567142676446206, + "learning_rate": 4.748925751313777e-06, + "loss": 0.4525, + "step": 6755 + }, + { + "epoch": 0.53, + "grad_norm": 1.7578513505667428, + "learning_rate": 4.7476555670190825e-06, + "loss": 0.5002, + "step": 6756 + }, + { + "epoch": 0.53, + "grad_norm": 1.9734864353872705, + "learning_rate": 4.746385399050743e-06, + "loss": 0.4029, + "step": 6757 + }, + { + "epoch": 0.53, + "grad_norm": 2.100411063616878, + "learning_rate": 4.745115247490939e-06, + "loss": 0.4878, + "step": 6758 + }, + { + "epoch": 0.53, + "grad_norm": 2.031113820668726, + "learning_rate": 4.743845112421842e-06, + "loss": 0.4416, + "step": 6759 + }, + { + "epoch": 0.53, + "grad_norm": 2.9038619361443163, + "learning_rate": 4.742574993925634e-06, + "loss": 0.5338, + "step": 6760 + }, + { + "epoch": 0.53, + "grad_norm": 2.1726066154631396, + "learning_rate": 4.741304892084485e-06, + "loss": 0.4818, + "step": 6761 + }, + { + "epoch": 0.53, + "grad_norm": 1.5338824217251135, + "learning_rate": 4.7400348069805725e-06, + "loss": 0.4042, + "step": 6762 + }, + { + "epoch": 0.53, + "grad_norm": 0.5948232395826915, + "learning_rate": 4.738764738696066e-06, + "loss": 0.4669, + "step": 6763 + }, + { + "epoch": 0.53, + "grad_norm": 1.4675493739613286, + "learning_rate": 4.737494687313142e-06, + "loss": 0.4334, + "step": 6764 + }, + { + "epoch": 0.53, + "grad_norm": 1.8959981046707237, + "learning_rate": 4.736224652913964e-06, + "loss": 0.4581, + "step": 6765 + }, + { + "epoch": 0.53, + "grad_norm": 1.7293635276879877, + "learning_rate": 4.734954635580711e-06, + "loss": 0.4464, + "step": 6766 + }, + { + "epoch": 0.53, + "grad_norm": 1.6113007672339685, + "learning_rate": 4.733684635395543e-06, + "loss": 0.5038, + "step": 6767 + }, + { + "epoch": 0.53, + "grad_norm": 1.6181690078500528, + "learning_rate": 4.732414652440631e-06, + "loss": 0.4889, + "step": 6768 + }, + { + "epoch": 0.53, + "grad_norm": 1.8943666291587404, + "learning_rate": 4.731144686798141e-06, + "loss": 0.4331, + "step": 6769 + }, + { + "epoch": 0.53, + "grad_norm": 1.7339765916656622, + "learning_rate": 4.729874738550238e-06, + "loss": 0.471, + "step": 6770 + }, + { + "epoch": 0.53, + "grad_norm": 1.9791596615487284, + "learning_rate": 4.728604807779088e-06, + "loss": 0.5035, + "step": 6771 + }, + { + "epoch": 0.53, + "grad_norm": 1.6588193877814434, + "learning_rate": 4.72733489456685e-06, + "loss": 0.47, + "step": 6772 + }, + { + "epoch": 0.53, + "grad_norm": 2.0322253226925002, + "learning_rate": 4.726064998995689e-06, + "loss": 0.4633, + "step": 6773 + }, + { + "epoch": 0.53, + "grad_norm": 1.5420673408314831, + "learning_rate": 4.724795121147763e-06, + "loss": 0.4861, + "step": 6774 + }, + { + "epoch": 0.53, + "grad_norm": 1.2998038760275714, + "learning_rate": 4.7235252611052335e-06, + "loss": 0.4855, + "step": 6775 + }, + { + "epoch": 0.53, + "grad_norm": 1.680774961926386, + "learning_rate": 4.722255418950257e-06, + "loss": 0.4353, + "step": 6776 + }, + { + "epoch": 0.53, + "grad_norm": 1.8501091936577974, + "learning_rate": 4.720985594764992e-06, + "loss": 0.4797, + "step": 6777 + }, + { + "epoch": 0.53, + "grad_norm": 5.12795171299923, + "learning_rate": 4.719715788631594e-06, + "loss": 0.5094, + "step": 6778 + }, + { + "epoch": 0.53, + "grad_norm": 1.5327401718329199, + "learning_rate": 4.7184460006322165e-06, + "loss": 0.4312, + "step": 6779 + }, + { + "epoch": 0.53, + "grad_norm": 1.6151446974365018, + "learning_rate": 4.717176230849017e-06, + "loss": 0.4769, + "step": 6780 + }, + { + "epoch": 0.53, + "grad_norm": 1.6175670787972947, + "learning_rate": 4.715906479364143e-06, + "loss": 0.4481, + "step": 6781 + }, + { + "epoch": 0.53, + "grad_norm": 1.3977895363788317, + "learning_rate": 4.714636746259752e-06, + "loss": 0.3851, + "step": 6782 + }, + { + "epoch": 0.53, + "grad_norm": 1.6031768326040865, + "learning_rate": 4.713367031617985e-06, + "loss": 0.4549, + "step": 6783 + }, + { + "epoch": 0.53, + "grad_norm": 1.394703480351404, + "learning_rate": 4.712097335521001e-06, + "loss": 0.4428, + "step": 6784 + }, + { + "epoch": 0.53, + "grad_norm": 1.8475387504517746, + "learning_rate": 4.710827658050938e-06, + "loss": 0.4156, + "step": 6785 + }, + { + "epoch": 0.53, + "grad_norm": 1.5977737305318904, + "learning_rate": 4.709557999289952e-06, + "loss": 0.4891, + "step": 6786 + }, + { + "epoch": 0.53, + "grad_norm": 1.6703112933762194, + "learning_rate": 4.708288359320179e-06, + "loss": 0.4653, + "step": 6787 + }, + { + "epoch": 0.53, + "grad_norm": 0.6043587101097239, + "learning_rate": 4.707018738223771e-06, + "loss": 0.4827, + "step": 6788 + }, + { + "epoch": 0.53, + "grad_norm": 1.7868653660431113, + "learning_rate": 4.705749136082865e-06, + "loss": 0.4292, + "step": 6789 + }, + { + "epoch": 0.53, + "grad_norm": 2.309115828233961, + "learning_rate": 4.704479552979606e-06, + "loss": 0.4794, + "step": 6790 + }, + { + "epoch": 0.53, + "grad_norm": 1.5025472626538847, + "learning_rate": 4.703209988996134e-06, + "loss": 0.4629, + "step": 6791 + }, + { + "epoch": 0.53, + "grad_norm": 2.342067085574972, + "learning_rate": 4.701940444214587e-06, + "loss": 0.4643, + "step": 6792 + }, + { + "epoch": 0.53, + "grad_norm": 2.316857941221925, + "learning_rate": 4.700670918717103e-06, + "loss": 0.4626, + "step": 6793 + }, + { + "epoch": 0.53, + "grad_norm": 1.8294907434680994, + "learning_rate": 4.699401412585819e-06, + "loss": 0.4342, + "step": 6794 + }, + { + "epoch": 0.53, + "grad_norm": 1.9137935536850157, + "learning_rate": 4.6981319259028715e-06, + "loss": 0.4628, + "step": 6795 + }, + { + "epoch": 0.53, + "grad_norm": 0.602900838490069, + "learning_rate": 4.696862458750391e-06, + "loss": 0.4987, + "step": 6796 + }, + { + "epoch": 0.53, + "grad_norm": 1.4894904126997455, + "learning_rate": 4.695593011210516e-06, + "loss": 0.4076, + "step": 6797 + }, + { + "epoch": 0.53, + "grad_norm": 1.7227219366705002, + "learning_rate": 4.6943235833653725e-06, + "loss": 0.4413, + "step": 6798 + }, + { + "epoch": 0.53, + "grad_norm": 1.8355061977723959, + "learning_rate": 4.693054175297095e-06, + "loss": 0.4056, + "step": 6799 + }, + { + "epoch": 0.53, + "grad_norm": 0.5692098286245784, + "learning_rate": 4.691784787087808e-06, + "loss": 0.5025, + "step": 6800 + }, + { + "epoch": 0.53, + "grad_norm": 1.8870868140879573, + "learning_rate": 4.690515418819644e-06, + "loss": 0.4681, + "step": 6801 + }, + { + "epoch": 0.53, + "grad_norm": 1.6186380036414465, + "learning_rate": 4.6892460705747275e-06, + "loss": 0.4595, + "step": 6802 + }, + { + "epoch": 0.53, + "grad_norm": 1.9295462236825514, + "learning_rate": 4.687976742435184e-06, + "loss": 0.4471, + "step": 6803 + }, + { + "epoch": 0.53, + "grad_norm": 2.357204122165277, + "learning_rate": 4.6867074344831375e-06, + "loss": 0.4366, + "step": 6804 + }, + { + "epoch": 0.53, + "grad_norm": 1.7172905622659762, + "learning_rate": 4.685438146800709e-06, + "loss": 0.4685, + "step": 6805 + }, + { + "epoch": 0.53, + "grad_norm": 1.4973659732329059, + "learning_rate": 4.684168879470022e-06, + "loss": 0.438, + "step": 6806 + }, + { + "epoch": 0.53, + "grad_norm": 3.45824204397553, + "learning_rate": 4.682899632573195e-06, + "loss": 0.4645, + "step": 6807 + }, + { + "epoch": 0.53, + "grad_norm": 2.654579492802134, + "learning_rate": 4.6816304061923465e-06, + "loss": 0.482, + "step": 6808 + }, + { + "epoch": 0.53, + "grad_norm": 1.799216005391002, + "learning_rate": 4.680361200409595e-06, + "loss": 0.4492, + "step": 6809 + }, + { + "epoch": 0.53, + "grad_norm": 2.6795940314421296, + "learning_rate": 4.6790920153070555e-06, + "loss": 0.4651, + "step": 6810 + }, + { + "epoch": 0.53, + "grad_norm": 1.5771738285772394, + "learning_rate": 4.6778228509668415e-06, + "loss": 0.4105, + "step": 6811 + }, + { + "epoch": 0.53, + "grad_norm": 0.5824114305376981, + "learning_rate": 4.676553707471068e-06, + "loss": 0.5083, + "step": 6812 + }, + { + "epoch": 0.54, + "grad_norm": 2.431769615061043, + "learning_rate": 4.675284584901848e-06, + "loss": 0.3939, + "step": 6813 + }, + { + "epoch": 0.54, + "grad_norm": 1.4908612055598287, + "learning_rate": 4.674015483341288e-06, + "loss": 0.4503, + "step": 6814 + }, + { + "epoch": 0.54, + "grad_norm": 1.8554257559873495, + "learning_rate": 4.6727464028715035e-06, + "loss": 0.4719, + "step": 6815 + }, + { + "epoch": 0.54, + "grad_norm": 1.659621956576652, + "learning_rate": 4.671477343574595e-06, + "loss": 0.4617, + "step": 6816 + }, + { + "epoch": 0.54, + "grad_norm": 1.666955430664887, + "learning_rate": 4.670208305532676e-06, + "loss": 0.4894, + "step": 6817 + }, + { + "epoch": 0.54, + "grad_norm": 1.614622044908237, + "learning_rate": 4.668939288827845e-06, + "loss": 0.4608, + "step": 6818 + }, + { + "epoch": 0.54, + "grad_norm": 1.1913143787134808, + "learning_rate": 4.667670293542211e-06, + "loss": 0.3918, + "step": 6819 + }, + { + "epoch": 0.54, + "grad_norm": 3.887649836365518, + "learning_rate": 4.666401319757873e-06, + "loss": 0.4105, + "step": 6820 + }, + { + "epoch": 0.54, + "grad_norm": 1.673264152790568, + "learning_rate": 4.665132367556935e-06, + "loss": 0.4874, + "step": 6821 + }, + { + "epoch": 0.54, + "grad_norm": 1.972038032477824, + "learning_rate": 4.6638634370214915e-06, + "loss": 0.4641, + "step": 6822 + }, + { + "epoch": 0.54, + "grad_norm": 3.079730238263787, + "learning_rate": 4.662594528233647e-06, + "loss": 0.4883, + "step": 6823 + }, + { + "epoch": 0.54, + "grad_norm": 1.9948773702354168, + "learning_rate": 4.661325641275494e-06, + "loss": 0.4874, + "step": 6824 + }, + { + "epoch": 0.54, + "grad_norm": 1.6447950679966556, + "learning_rate": 4.660056776229128e-06, + "loss": 0.4328, + "step": 6825 + }, + { + "epoch": 0.54, + "grad_norm": 1.7685107666624231, + "learning_rate": 4.6587879331766465e-06, + "loss": 0.4338, + "step": 6826 + }, + { + "epoch": 0.54, + "grad_norm": 0.5384911618441113, + "learning_rate": 4.657519112200137e-06, + "loss": 0.4633, + "step": 6827 + }, + { + "epoch": 0.54, + "grad_norm": 1.4016491360099923, + "learning_rate": 4.656250313381694e-06, + "loss": 0.4292, + "step": 6828 + }, + { + "epoch": 0.54, + "grad_norm": 1.5500420936321144, + "learning_rate": 4.654981536803406e-06, + "loss": 0.4382, + "step": 6829 + }, + { + "epoch": 0.54, + "grad_norm": 1.698424092041151, + "learning_rate": 4.653712782547361e-06, + "loss": 0.4864, + "step": 6830 + }, + { + "epoch": 0.54, + "grad_norm": 1.6562433675665198, + "learning_rate": 4.652444050695646e-06, + "loss": 0.4612, + "step": 6831 + }, + { + "epoch": 0.54, + "grad_norm": 2.257581930569672, + "learning_rate": 4.651175341330346e-06, + "loss": 0.4721, + "step": 6832 + }, + { + "epoch": 0.54, + "grad_norm": 1.9767963757533966, + "learning_rate": 4.649906654533545e-06, + "loss": 0.4673, + "step": 6833 + }, + { + "epoch": 0.54, + "grad_norm": 1.9312445662347821, + "learning_rate": 4.648637990387327e-06, + "loss": 0.4224, + "step": 6834 + }, + { + "epoch": 0.54, + "grad_norm": 2.1774070990831613, + "learning_rate": 4.6473693489737685e-06, + "loss": 0.4589, + "step": 6835 + }, + { + "epoch": 0.54, + "grad_norm": 2.5940896626242633, + "learning_rate": 4.646100730374953e-06, + "loss": 0.4694, + "step": 6836 + }, + { + "epoch": 0.54, + "grad_norm": 0.5921657602076107, + "learning_rate": 4.644832134672958e-06, + "loss": 0.498, + "step": 6837 + }, + { + "epoch": 0.54, + "grad_norm": 0.5322538272231689, + "learning_rate": 4.643563561949859e-06, + "loss": 0.4715, + "step": 6838 + }, + { + "epoch": 0.54, + "grad_norm": 2.1075641352261476, + "learning_rate": 4.642295012287732e-06, + "loss": 0.4297, + "step": 6839 + }, + { + "epoch": 0.54, + "grad_norm": 2.2623400071224085, + "learning_rate": 4.641026485768648e-06, + "loss": 0.5033, + "step": 6840 + }, + { + "epoch": 0.54, + "grad_norm": 1.987621659092553, + "learning_rate": 4.639757982474683e-06, + "loss": 0.4647, + "step": 6841 + }, + { + "epoch": 0.54, + "grad_norm": 1.9827787146373967, + "learning_rate": 4.638489502487904e-06, + "loss": 0.4462, + "step": 6842 + }, + { + "epoch": 0.54, + "grad_norm": 0.5497318689175171, + "learning_rate": 4.637221045890382e-06, + "loss": 0.4816, + "step": 6843 + }, + { + "epoch": 0.54, + "grad_norm": 1.3095292864681283, + "learning_rate": 4.635952612764183e-06, + "loss": 0.4927, + "step": 6844 + }, + { + "epoch": 0.54, + "grad_norm": 1.810359376555525, + "learning_rate": 4.634684203191374e-06, + "loss": 0.4425, + "step": 6845 + }, + { + "epoch": 0.54, + "grad_norm": 1.4270532254772705, + "learning_rate": 4.633415817254018e-06, + "loss": 0.4729, + "step": 6846 + }, + { + "epoch": 0.54, + "grad_norm": 0.5357933835814878, + "learning_rate": 4.63214745503418e-06, + "loss": 0.4923, + "step": 6847 + }, + { + "epoch": 0.54, + "grad_norm": 1.617113838999308, + "learning_rate": 4.630879116613921e-06, + "loss": 0.528, + "step": 6848 + }, + { + "epoch": 0.54, + "grad_norm": 1.49302208154492, + "learning_rate": 4.629610802075298e-06, + "loss": 0.4406, + "step": 6849 + }, + { + "epoch": 0.54, + "grad_norm": 1.5096216863067633, + "learning_rate": 4.628342511500375e-06, + "loss": 0.4938, + "step": 6850 + }, + { + "epoch": 0.54, + "grad_norm": 1.7557354626260413, + "learning_rate": 4.627074244971203e-06, + "loss": 0.4386, + "step": 6851 + }, + { + "epoch": 0.54, + "grad_norm": 1.5230727005337314, + "learning_rate": 4.625806002569842e-06, + "loss": 0.4585, + "step": 6852 + }, + { + "epoch": 0.54, + "grad_norm": 0.5813164470794332, + "learning_rate": 4.624537784378339e-06, + "loss": 0.4757, + "step": 6853 + }, + { + "epoch": 0.54, + "grad_norm": 2.1158389442094396, + "learning_rate": 4.623269590478755e-06, + "loss": 0.4528, + "step": 6854 + }, + { + "epoch": 0.54, + "grad_norm": 1.7270364714444189, + "learning_rate": 4.622001420953132e-06, + "loss": 0.4439, + "step": 6855 + }, + { + "epoch": 0.54, + "grad_norm": 1.630001069879384, + "learning_rate": 4.620733275883525e-06, + "loss": 0.4528, + "step": 6856 + }, + { + "epoch": 0.54, + "grad_norm": 2.1693036318804464, + "learning_rate": 4.619465155351976e-06, + "loss": 0.4324, + "step": 6857 + }, + { + "epoch": 0.54, + "grad_norm": 4.946226020237933, + "learning_rate": 4.618197059440535e-06, + "loss": 0.481, + "step": 6858 + }, + { + "epoch": 0.54, + "grad_norm": 1.4822099190238134, + "learning_rate": 4.616928988231246e-06, + "loss": 0.4689, + "step": 6859 + }, + { + "epoch": 0.54, + "grad_norm": 2.042278470716708, + "learning_rate": 4.6156609418061486e-06, + "loss": 0.4639, + "step": 6860 + }, + { + "epoch": 0.54, + "grad_norm": 1.6049835920323006, + "learning_rate": 4.614392920247288e-06, + "loss": 0.4782, + "step": 6861 + }, + { + "epoch": 0.54, + "grad_norm": 2.649465280365482, + "learning_rate": 4.613124923636698e-06, + "loss": 0.5041, + "step": 6862 + }, + { + "epoch": 0.54, + "grad_norm": 1.264740079993466, + "learning_rate": 4.611856952056421e-06, + "loss": 0.4572, + "step": 6863 + }, + { + "epoch": 0.54, + "grad_norm": 1.4065453297159005, + "learning_rate": 4.610589005588492e-06, + "loss": 0.4402, + "step": 6864 + }, + { + "epoch": 0.54, + "grad_norm": 1.7222743484095515, + "learning_rate": 4.6093210843149445e-06, + "loss": 0.4308, + "step": 6865 + }, + { + "epoch": 0.54, + "grad_norm": 1.5300802907627304, + "learning_rate": 4.60805318831781e-06, + "loss": 0.48, + "step": 6866 + }, + { + "epoch": 0.54, + "grad_norm": 1.7852265672717624, + "learning_rate": 4.606785317679124e-06, + "loss": 0.4936, + "step": 6867 + }, + { + "epoch": 0.54, + "grad_norm": 1.5006212431287191, + "learning_rate": 4.605517472480912e-06, + "loss": 0.4171, + "step": 6868 + }, + { + "epoch": 0.54, + "grad_norm": 1.5679573502048902, + "learning_rate": 4.604249652805203e-06, + "loss": 0.4791, + "step": 6869 + }, + { + "epoch": 0.54, + "grad_norm": 1.3510129374227096, + "learning_rate": 4.602981858734026e-06, + "loss": 0.4317, + "step": 6870 + }, + { + "epoch": 0.54, + "grad_norm": 2.008210078038119, + "learning_rate": 4.601714090349401e-06, + "loss": 0.4798, + "step": 6871 + }, + { + "epoch": 0.54, + "grad_norm": 1.5532850807678098, + "learning_rate": 4.600446347733354e-06, + "loss": 0.4901, + "step": 6872 + }, + { + "epoch": 0.54, + "grad_norm": 1.732794627759301, + "learning_rate": 4.599178630967906e-06, + "loss": 0.4949, + "step": 6873 + }, + { + "epoch": 0.54, + "grad_norm": 2.546378165576194, + "learning_rate": 4.597910940135077e-06, + "loss": 0.44, + "step": 6874 + }, + { + "epoch": 0.54, + "grad_norm": 2.167401987543923, + "learning_rate": 4.596643275316882e-06, + "loss": 0.4307, + "step": 6875 + }, + { + "epoch": 0.54, + "grad_norm": 1.556657014000904, + "learning_rate": 4.595375636595342e-06, + "loss": 0.4795, + "step": 6876 + }, + { + "epoch": 0.54, + "grad_norm": 0.5701375459132573, + "learning_rate": 4.594108024052468e-06, + "loss": 0.5108, + "step": 6877 + }, + { + "epoch": 0.54, + "grad_norm": 0.5439582717198745, + "learning_rate": 4.592840437770274e-06, + "loss": 0.5143, + "step": 6878 + }, + { + "epoch": 0.54, + "grad_norm": 2.0204214102469957, + "learning_rate": 4.59157287783077e-06, + "loss": 0.4706, + "step": 6879 + }, + { + "epoch": 0.54, + "grad_norm": 2.432910553825949, + "learning_rate": 4.590305344315967e-06, + "loss": 0.4492, + "step": 6880 + }, + { + "epoch": 0.54, + "grad_norm": 1.9681775975346065, + "learning_rate": 4.589037837307873e-06, + "loss": 0.4485, + "step": 6881 + }, + { + "epoch": 0.54, + "grad_norm": 0.5555539969433061, + "learning_rate": 4.587770356888493e-06, + "loss": 0.495, + "step": 6882 + }, + { + "epoch": 0.54, + "grad_norm": 5.711790421635941, + "learning_rate": 4.586502903139832e-06, + "loss": 0.4337, + "step": 6883 + }, + { + "epoch": 0.54, + "grad_norm": 1.8138988737524109, + "learning_rate": 4.58523547614389e-06, + "loss": 0.5258, + "step": 6884 + }, + { + "epoch": 0.54, + "grad_norm": 1.7587329648056718, + "learning_rate": 4.583968075982673e-06, + "loss": 0.4747, + "step": 6885 + }, + { + "epoch": 0.54, + "grad_norm": 1.3877595104767442, + "learning_rate": 4.582700702738175e-06, + "loss": 0.4658, + "step": 6886 + }, + { + "epoch": 0.54, + "grad_norm": 1.9825157930378736, + "learning_rate": 4.581433356492398e-06, + "loss": 0.4557, + "step": 6887 + }, + { + "epoch": 0.54, + "grad_norm": 0.5748570059722728, + "learning_rate": 4.580166037327333e-06, + "loss": 0.4792, + "step": 6888 + }, + { + "epoch": 0.54, + "grad_norm": 1.414922906016234, + "learning_rate": 4.578898745324979e-06, + "loss": 0.4497, + "step": 6889 + }, + { + "epoch": 0.54, + "grad_norm": 1.744678109463946, + "learning_rate": 4.577631480567321e-06, + "loss": 0.4797, + "step": 6890 + }, + { + "epoch": 0.54, + "grad_norm": 1.5570940624037628, + "learning_rate": 4.576364243136356e-06, + "loss": 0.5089, + "step": 6891 + }, + { + "epoch": 0.54, + "grad_norm": 0.555639996945321, + "learning_rate": 4.575097033114072e-06, + "loss": 0.4748, + "step": 6892 + }, + { + "epoch": 0.54, + "grad_norm": 1.8279227444374277, + "learning_rate": 4.573829850582452e-06, + "loss": 0.5094, + "step": 6893 + }, + { + "epoch": 0.54, + "grad_norm": 1.39196254048585, + "learning_rate": 4.572562695623484e-06, + "loss": 0.4737, + "step": 6894 + }, + { + "epoch": 0.54, + "grad_norm": 2.136720802809768, + "learning_rate": 4.57129556831915e-06, + "loss": 0.5107, + "step": 6895 + }, + { + "epoch": 0.54, + "grad_norm": 2.046214988195398, + "learning_rate": 4.570028468751433e-06, + "loss": 0.4564, + "step": 6896 + }, + { + "epoch": 0.54, + "grad_norm": 2.3321259841230355, + "learning_rate": 4.56876139700231e-06, + "loss": 0.4316, + "step": 6897 + }, + { + "epoch": 0.54, + "grad_norm": 1.4726934909524818, + "learning_rate": 4.567494353153762e-06, + "loss": 0.4212, + "step": 6898 + }, + { + "epoch": 0.54, + "grad_norm": 1.586878323423012, + "learning_rate": 4.566227337287762e-06, + "loss": 0.4773, + "step": 6899 + }, + { + "epoch": 0.54, + "grad_norm": 1.785230054256795, + "learning_rate": 4.564960349486287e-06, + "loss": 0.4626, + "step": 6900 + }, + { + "epoch": 0.54, + "grad_norm": 1.8403708151543154, + "learning_rate": 4.5636933898313074e-06, + "loss": 0.5085, + "step": 6901 + }, + { + "epoch": 0.54, + "grad_norm": 1.6771924354772159, + "learning_rate": 4.562426458404796e-06, + "loss": 0.4834, + "step": 6902 + }, + { + "epoch": 0.54, + "grad_norm": 1.924181589842709, + "learning_rate": 4.561159555288719e-06, + "loss": 0.5192, + "step": 6903 + }, + { + "epoch": 0.54, + "grad_norm": 1.4954968456682984, + "learning_rate": 4.559892680565044e-06, + "loss": 0.5159, + "step": 6904 + }, + { + "epoch": 0.54, + "grad_norm": 0.6186225089464956, + "learning_rate": 4.558625834315739e-06, + "loss": 0.4858, + "step": 6905 + }, + { + "epoch": 0.54, + "grad_norm": 1.4914130666393217, + "learning_rate": 4.557359016622763e-06, + "loss": 0.4077, + "step": 6906 + }, + { + "epoch": 0.54, + "grad_norm": 1.4530549492009845, + "learning_rate": 4.5560922275680814e-06, + "loss": 0.4763, + "step": 6907 + }, + { + "epoch": 0.54, + "grad_norm": 0.617721556564137, + "learning_rate": 4.554825467233651e-06, + "loss": 0.4933, + "step": 6908 + }, + { + "epoch": 0.54, + "grad_norm": 1.681113496045325, + "learning_rate": 4.553558735701431e-06, + "loss": 0.4465, + "step": 6909 + }, + { + "epoch": 0.54, + "grad_norm": 1.666711126781402, + "learning_rate": 4.5522920330533764e-06, + "loss": 0.4675, + "step": 6910 + }, + { + "epoch": 0.54, + "grad_norm": 2.7042054083469744, + "learning_rate": 4.551025359371443e-06, + "loss": 0.439, + "step": 6911 + }, + { + "epoch": 0.54, + "grad_norm": 1.51054173942164, + "learning_rate": 4.549758714737579e-06, + "loss": 0.4941, + "step": 6912 + }, + { + "epoch": 0.54, + "grad_norm": 2.2054036541484865, + "learning_rate": 4.5484920992337396e-06, + "loss": 0.494, + "step": 6913 + }, + { + "epoch": 0.54, + "grad_norm": 0.5786589308505222, + "learning_rate": 4.547225512941869e-06, + "loss": 0.4735, + "step": 6914 + }, + { + "epoch": 0.54, + "grad_norm": 1.6938909617642963, + "learning_rate": 4.545958955943915e-06, + "loss": 0.479, + "step": 6915 + }, + { + "epoch": 0.54, + "grad_norm": 1.5828264050227292, + "learning_rate": 4.544692428321825e-06, + "loss": 0.4711, + "step": 6916 + }, + { + "epoch": 0.54, + "grad_norm": 1.5881486421574957, + "learning_rate": 4.543425930157537e-06, + "loss": 0.4173, + "step": 6917 + }, + { + "epoch": 0.54, + "grad_norm": 1.3934223903811027, + "learning_rate": 4.542159461532995e-06, + "loss": 0.5163, + "step": 6918 + }, + { + "epoch": 0.54, + "grad_norm": 2.281474288352167, + "learning_rate": 4.540893022530136e-06, + "loss": 0.4359, + "step": 6919 + }, + { + "epoch": 0.54, + "grad_norm": 1.8234038311110423, + "learning_rate": 4.5396266132309e-06, + "loss": 0.4916, + "step": 6920 + }, + { + "epoch": 0.54, + "grad_norm": 1.8191465599331846, + "learning_rate": 4.538360233717217e-06, + "loss": 0.4679, + "step": 6921 + }, + { + "epoch": 0.54, + "grad_norm": 1.2887056839190372, + "learning_rate": 4.537093884071027e-06, + "loss": 0.4704, + "step": 6922 + }, + { + "epoch": 0.54, + "grad_norm": 2.0184185518797135, + "learning_rate": 4.5358275643742525e-06, + "loss": 0.451, + "step": 6923 + }, + { + "epoch": 0.54, + "grad_norm": 1.770487264005411, + "learning_rate": 4.5345612747088316e-06, + "loss": 0.4341, + "step": 6924 + }, + { + "epoch": 0.54, + "grad_norm": 3.4927178602069326, + "learning_rate": 4.533295015156684e-06, + "loss": 0.4655, + "step": 6925 + }, + { + "epoch": 0.54, + "grad_norm": 1.6847378809485796, + "learning_rate": 4.532028785799739e-06, + "loss": 0.4896, + "step": 6926 + }, + { + "epoch": 0.54, + "grad_norm": 1.4750236358136959, + "learning_rate": 4.530762586719921e-06, + "loss": 0.4463, + "step": 6927 + }, + { + "epoch": 0.54, + "grad_norm": 1.556879108868717, + "learning_rate": 4.529496417999148e-06, + "loss": 0.4485, + "step": 6928 + }, + { + "epoch": 0.54, + "grad_norm": 0.5752369288621986, + "learning_rate": 4.528230279719343e-06, + "loss": 0.4911, + "step": 6929 + }, + { + "epoch": 0.54, + "grad_norm": 1.372174342060887, + "learning_rate": 4.526964171962421e-06, + "loss": 0.425, + "step": 6930 + }, + { + "epoch": 0.54, + "grad_norm": 1.58420486517682, + "learning_rate": 4.525698094810298e-06, + "loss": 0.4566, + "step": 6931 + }, + { + "epoch": 0.54, + "grad_norm": 1.6198401184184732, + "learning_rate": 4.524432048344888e-06, + "loss": 0.4911, + "step": 6932 + }, + { + "epoch": 0.54, + "grad_norm": 2.618657202365816, + "learning_rate": 4.523166032648103e-06, + "loss": 0.4361, + "step": 6933 + }, + { + "epoch": 0.54, + "grad_norm": 1.8440512826291275, + "learning_rate": 4.52190004780185e-06, + "loss": 0.4385, + "step": 6934 + }, + { + "epoch": 0.54, + "grad_norm": 0.5382671248848278, + "learning_rate": 4.520634093888041e-06, + "loss": 0.4789, + "step": 6935 + }, + { + "epoch": 0.54, + "grad_norm": 1.6616880070028293, + "learning_rate": 4.519368170988577e-06, + "loss": 0.4841, + "step": 6936 + }, + { + "epoch": 0.54, + "grad_norm": 1.6681698573778345, + "learning_rate": 4.5181022791853625e-06, + "loss": 0.4499, + "step": 6937 + }, + { + "epoch": 0.54, + "grad_norm": 1.6266530546604019, + "learning_rate": 4.516836418560303e-06, + "loss": 0.4645, + "step": 6938 + }, + { + "epoch": 0.54, + "grad_norm": 0.5440259675687322, + "learning_rate": 4.515570589195293e-06, + "loss": 0.5027, + "step": 6939 + }, + { + "epoch": 0.55, + "grad_norm": 1.538233418974356, + "learning_rate": 4.514304791172235e-06, + "loss": 0.4601, + "step": 6940 + }, + { + "epoch": 0.55, + "grad_norm": 1.9546840067155016, + "learning_rate": 4.5130390245730186e-06, + "loss": 0.5228, + "step": 6941 + }, + { + "epoch": 0.55, + "grad_norm": 1.691768844924657, + "learning_rate": 4.511773289479542e-06, + "loss": 0.4666, + "step": 6942 + }, + { + "epoch": 0.55, + "grad_norm": 1.5209016847795411, + "learning_rate": 4.510507585973694e-06, + "loss": 0.5041, + "step": 6943 + }, + { + "epoch": 0.55, + "grad_norm": 4.005861846814587, + "learning_rate": 4.509241914137366e-06, + "loss": 0.4226, + "step": 6944 + }, + { + "epoch": 0.55, + "grad_norm": 0.5464427142356353, + "learning_rate": 4.507976274052443e-06, + "loss": 0.4855, + "step": 6945 + }, + { + "epoch": 0.55, + "grad_norm": 1.8246453506367573, + "learning_rate": 4.506710665800813e-06, + "loss": 0.4982, + "step": 6946 + }, + { + "epoch": 0.55, + "grad_norm": 1.5243098931004129, + "learning_rate": 4.505445089464356e-06, + "loss": 0.4512, + "step": 6947 + }, + { + "epoch": 0.55, + "grad_norm": 1.8258556937394714, + "learning_rate": 4.504179545124955e-06, + "loss": 0.5055, + "step": 6948 + }, + { + "epoch": 0.55, + "grad_norm": 1.462991577680172, + "learning_rate": 4.502914032864489e-06, + "loss": 0.4371, + "step": 6949 + }, + { + "epoch": 0.55, + "grad_norm": 7.620715617054399, + "learning_rate": 4.501648552764835e-06, + "loss": 0.4615, + "step": 6950 + }, + { + "epoch": 0.55, + "grad_norm": 0.587625363549638, + "learning_rate": 4.500383104907868e-06, + "loss": 0.4848, + "step": 6951 + }, + { + "epoch": 0.55, + "grad_norm": 2.1367492692551684, + "learning_rate": 4.49911768937546e-06, + "loss": 0.4616, + "step": 6952 + }, + { + "epoch": 0.55, + "grad_norm": 1.946007805748415, + "learning_rate": 4.497852306249483e-06, + "loss": 0.4956, + "step": 6953 + }, + { + "epoch": 0.55, + "grad_norm": 1.7293702983786041, + "learning_rate": 4.496586955611802e-06, + "loss": 0.4076, + "step": 6954 + }, + { + "epoch": 0.55, + "grad_norm": 0.5696280959653361, + "learning_rate": 4.495321637544291e-06, + "loss": 0.4974, + "step": 6955 + }, + { + "epoch": 0.55, + "grad_norm": 16.306800375051598, + "learning_rate": 4.494056352128806e-06, + "loss": 0.4712, + "step": 6956 + }, + { + "epoch": 0.55, + "grad_norm": 1.3982803426114867, + "learning_rate": 4.492791099447217e-06, + "loss": 0.4713, + "step": 6957 + }, + { + "epoch": 0.55, + "grad_norm": 2.004766336615196, + "learning_rate": 4.491525879581377e-06, + "loss": 0.506, + "step": 6958 + }, + { + "epoch": 0.55, + "grad_norm": 1.789386428490523, + "learning_rate": 4.490260692613148e-06, + "loss": 0.4176, + "step": 6959 + }, + { + "epoch": 0.55, + "grad_norm": 2.2389206006700233, + "learning_rate": 4.488995538624386e-06, + "loss": 0.4833, + "step": 6960 + }, + { + "epoch": 0.55, + "grad_norm": 1.9403510699497735, + "learning_rate": 4.487730417696943e-06, + "loss": 0.4669, + "step": 6961 + }, + { + "epoch": 0.55, + "grad_norm": 0.6295469606366729, + "learning_rate": 4.486465329912674e-06, + "loss": 0.4937, + "step": 6962 + }, + { + "epoch": 0.55, + "grad_norm": 1.9905055158988696, + "learning_rate": 4.485200275353425e-06, + "loss": 0.5231, + "step": 6963 + }, + { + "epoch": 0.55, + "grad_norm": 1.7255830095970441, + "learning_rate": 4.483935254101046e-06, + "loss": 0.4271, + "step": 6964 + }, + { + "epoch": 0.55, + "grad_norm": 1.5928382705433373, + "learning_rate": 4.482670266237379e-06, + "loss": 0.4343, + "step": 6965 + }, + { + "epoch": 0.55, + "grad_norm": 2.235279830406769, + "learning_rate": 4.481405311844271e-06, + "loss": 0.4133, + "step": 6966 + }, + { + "epoch": 0.55, + "grad_norm": 1.508964086715466, + "learning_rate": 4.480140391003559e-06, + "loss": 0.445, + "step": 6967 + }, + { + "epoch": 0.55, + "grad_norm": 1.9155834441610042, + "learning_rate": 4.478875503797085e-06, + "loss": 0.4114, + "step": 6968 + }, + { + "epoch": 0.55, + "grad_norm": 1.4269378564844795, + "learning_rate": 4.477610650306683e-06, + "loss": 0.475, + "step": 6969 + }, + { + "epoch": 0.55, + "grad_norm": 1.939330205162762, + "learning_rate": 4.476345830614188e-06, + "loss": 0.4399, + "step": 6970 + }, + { + "epoch": 0.55, + "grad_norm": 1.488488174458895, + "learning_rate": 4.475081044801436e-06, + "loss": 0.4543, + "step": 6971 + }, + { + "epoch": 0.55, + "grad_norm": 0.5662209261700896, + "learning_rate": 4.47381629295025e-06, + "loss": 0.505, + "step": 6972 + }, + { + "epoch": 0.55, + "grad_norm": 0.558954580909388, + "learning_rate": 4.472551575142465e-06, + "loss": 0.5051, + "step": 6973 + }, + { + "epoch": 0.55, + "grad_norm": 2.3189435923958883, + "learning_rate": 4.4712868914599e-06, + "loss": 0.4557, + "step": 6974 + }, + { + "epoch": 0.55, + "grad_norm": 1.819227641448975, + "learning_rate": 4.4700222419843845e-06, + "loss": 0.4536, + "step": 6975 + }, + { + "epoch": 0.55, + "grad_norm": 3.381196353971081, + "learning_rate": 4.468757626797734e-06, + "loss": 0.4577, + "step": 6976 + }, + { + "epoch": 0.55, + "grad_norm": 2.062259964954439, + "learning_rate": 4.467493045981771e-06, + "loss": 0.5397, + "step": 6977 + }, + { + "epoch": 0.55, + "grad_norm": 1.7104861853068443, + "learning_rate": 4.466228499618311e-06, + "loss": 0.4395, + "step": 6978 + }, + { + "epoch": 0.55, + "grad_norm": 1.4806804549512607, + "learning_rate": 4.46496398778917e-06, + "loss": 0.434, + "step": 6979 + }, + { + "epoch": 0.55, + "grad_norm": 1.7863206435797854, + "learning_rate": 4.463699510576158e-06, + "loss": 0.5026, + "step": 6980 + }, + { + "epoch": 0.55, + "grad_norm": 2.6334116227778526, + "learning_rate": 4.462435068061087e-06, + "loss": 0.4278, + "step": 6981 + }, + { + "epoch": 0.55, + "grad_norm": 1.8767313198898707, + "learning_rate": 4.4611706603257626e-06, + "loss": 0.4088, + "step": 6982 + }, + { + "epoch": 0.55, + "grad_norm": 2.014384796636381, + "learning_rate": 4.459906287451992e-06, + "loss": 0.4658, + "step": 6983 + }, + { + "epoch": 0.55, + "grad_norm": 1.7769979380813947, + "learning_rate": 4.458641949521579e-06, + "loss": 0.4076, + "step": 6984 + }, + { + "epoch": 0.55, + "grad_norm": 1.5930019757817278, + "learning_rate": 4.457377646616322e-06, + "loss": 0.4406, + "step": 6985 + }, + { + "epoch": 0.55, + "grad_norm": 1.4434108320394223, + "learning_rate": 4.456113378818023e-06, + "loss": 0.4821, + "step": 6986 + }, + { + "epoch": 0.55, + "grad_norm": 0.5448205135034024, + "learning_rate": 4.454849146208476e-06, + "loss": 0.4805, + "step": 6987 + }, + { + "epoch": 0.55, + "grad_norm": 1.658915191855445, + "learning_rate": 4.453584948869477e-06, + "loss": 0.4704, + "step": 6988 + }, + { + "epoch": 0.55, + "grad_norm": 1.1978233164686958, + "learning_rate": 4.452320786882817e-06, + "loss": 0.4078, + "step": 6989 + }, + { + "epoch": 0.55, + "grad_norm": 1.5753607851255316, + "learning_rate": 4.451056660330285e-06, + "loss": 0.4612, + "step": 6990 + }, + { + "epoch": 0.55, + "grad_norm": 0.5527879649433624, + "learning_rate": 4.449792569293668e-06, + "loss": 0.5073, + "step": 6991 + }, + { + "epoch": 0.55, + "grad_norm": 1.7596119004707476, + "learning_rate": 4.448528513854754e-06, + "loss": 0.4597, + "step": 6992 + }, + { + "epoch": 0.55, + "grad_norm": 1.5887849738727466, + "learning_rate": 4.447264494095322e-06, + "loss": 0.4883, + "step": 6993 + }, + { + "epoch": 0.55, + "grad_norm": 0.5310426503157457, + "learning_rate": 4.446000510097154e-06, + "loss": 0.4826, + "step": 6994 + }, + { + "epoch": 0.55, + "grad_norm": 1.669443625810114, + "learning_rate": 4.444736561942029e-06, + "loss": 0.4431, + "step": 6995 + }, + { + "epoch": 0.55, + "grad_norm": 0.5326041118385999, + "learning_rate": 4.443472649711723e-06, + "loss": 0.4654, + "step": 6996 + }, + { + "epoch": 0.55, + "grad_norm": 2.2613484326602404, + "learning_rate": 4.442208773488008e-06, + "loss": 0.4193, + "step": 6997 + }, + { + "epoch": 0.55, + "grad_norm": 1.7478284875690078, + "learning_rate": 4.440944933352655e-06, + "loss": 0.4683, + "step": 6998 + }, + { + "epoch": 0.55, + "grad_norm": 0.5501222089889063, + "learning_rate": 4.439681129387435e-06, + "loss": 0.4901, + "step": 6999 + }, + { + "epoch": 0.55, + "grad_norm": 1.9580923340483176, + "learning_rate": 4.438417361674111e-06, + "loss": 0.5518, + "step": 7000 + }, + { + "epoch": 0.55, + "grad_norm": 1.8072670405626892, + "learning_rate": 4.437153630294451e-06, + "loss": 0.4217, + "step": 7001 + }, + { + "epoch": 0.55, + "grad_norm": 0.5984986696400904, + "learning_rate": 4.435889935330213e-06, + "loss": 0.4898, + "step": 7002 + }, + { + "epoch": 0.55, + "grad_norm": 0.5214374782539432, + "learning_rate": 4.4346262768631595e-06, + "loss": 0.4878, + "step": 7003 + }, + { + "epoch": 0.55, + "grad_norm": 1.3691561953284572, + "learning_rate": 4.433362654975046e-06, + "loss": 0.4028, + "step": 7004 + }, + { + "epoch": 0.55, + "grad_norm": 2.9227485148097987, + "learning_rate": 4.432099069747625e-06, + "loss": 0.4632, + "step": 7005 + }, + { + "epoch": 0.55, + "grad_norm": 1.6774087672073437, + "learning_rate": 4.430835521262656e-06, + "loss": 0.4643, + "step": 7006 + }, + { + "epoch": 0.55, + "grad_norm": 1.8440707679471362, + "learning_rate": 4.42957200960188e-06, + "loss": 0.4422, + "step": 7007 + }, + { + "epoch": 0.55, + "grad_norm": 1.8272773348456313, + "learning_rate": 4.428308534847052e-06, + "loss": 0.4797, + "step": 7008 + }, + { + "epoch": 0.55, + "grad_norm": 1.4642179347199777, + "learning_rate": 4.427045097079911e-06, + "loss": 0.4412, + "step": 7009 + }, + { + "epoch": 0.55, + "grad_norm": 1.4985959737363224, + "learning_rate": 4.4257816963822055e-06, + "loss": 0.4294, + "step": 7010 + }, + { + "epoch": 0.55, + "grad_norm": 1.9232557455029031, + "learning_rate": 4.424518332835671e-06, + "loss": 0.4512, + "step": 7011 + }, + { + "epoch": 0.55, + "grad_norm": 1.842755491135707, + "learning_rate": 4.4232550065220485e-06, + "loss": 0.457, + "step": 7012 + }, + { + "epoch": 0.55, + "grad_norm": 1.6432091979194465, + "learning_rate": 4.421991717523072e-06, + "loss": 0.3988, + "step": 7013 + }, + { + "epoch": 0.55, + "grad_norm": 1.659266061750174, + "learning_rate": 4.420728465920477e-06, + "loss": 0.4617, + "step": 7014 + }, + { + "epoch": 0.55, + "grad_norm": 2.008098713971611, + "learning_rate": 4.41946525179599e-06, + "loss": 0.5058, + "step": 7015 + }, + { + "epoch": 0.55, + "grad_norm": 1.9006370172418146, + "learning_rate": 4.418202075231342e-06, + "loss": 0.4986, + "step": 7016 + }, + { + "epoch": 0.55, + "grad_norm": 2.2162076712388235, + "learning_rate": 4.4169389363082605e-06, + "loss": 0.467, + "step": 7017 + }, + { + "epoch": 0.55, + "grad_norm": 0.5622565699910569, + "learning_rate": 4.4156758351084656e-06, + "loss": 0.4976, + "step": 7018 + }, + { + "epoch": 0.55, + "grad_norm": 1.3861916469046607, + "learning_rate": 4.414412771713681e-06, + "loss": 0.4211, + "step": 7019 + }, + { + "epoch": 0.55, + "grad_norm": 1.913298354307848, + "learning_rate": 4.413149746205623e-06, + "loss": 0.4593, + "step": 7020 + }, + { + "epoch": 0.55, + "grad_norm": 1.5785481273394253, + "learning_rate": 4.411886758666009e-06, + "loss": 0.4766, + "step": 7021 + }, + { + "epoch": 0.55, + "grad_norm": 1.5036550785119829, + "learning_rate": 4.410623809176552e-06, + "loss": 0.4661, + "step": 7022 + }, + { + "epoch": 0.55, + "grad_norm": 1.8089720384128842, + "learning_rate": 4.409360897818964e-06, + "loss": 0.477, + "step": 7023 + }, + { + "epoch": 0.55, + "grad_norm": 1.4807022139068908, + "learning_rate": 4.408098024674953e-06, + "loss": 0.4796, + "step": 7024 + }, + { + "epoch": 0.55, + "grad_norm": 0.5555595705544788, + "learning_rate": 4.406835189826227e-06, + "loss": 0.495, + "step": 7025 + }, + { + "epoch": 0.55, + "grad_norm": 2.0281205678241614, + "learning_rate": 4.405572393354485e-06, + "loss": 0.4244, + "step": 7026 + }, + { + "epoch": 0.55, + "grad_norm": 2.01926758861407, + "learning_rate": 4.404309635341434e-06, + "loss": 0.4594, + "step": 7027 + }, + { + "epoch": 0.55, + "grad_norm": 1.6257582121771132, + "learning_rate": 4.4030469158687696e-06, + "loss": 0.4162, + "step": 7028 + }, + { + "epoch": 0.55, + "grad_norm": 0.5818089571998044, + "learning_rate": 4.401784235018188e-06, + "loss": 0.4838, + "step": 7029 + }, + { + "epoch": 0.55, + "grad_norm": 1.296009331881997, + "learning_rate": 4.400521592871386e-06, + "loss": 0.4484, + "step": 7030 + }, + { + "epoch": 0.55, + "grad_norm": 1.6467673236672709, + "learning_rate": 4.399258989510052e-06, + "loss": 0.4588, + "step": 7031 + }, + { + "epoch": 0.55, + "grad_norm": 3.0995597367065977, + "learning_rate": 4.397996425015875e-06, + "loss": 0.4823, + "step": 7032 + }, + { + "epoch": 0.55, + "grad_norm": 1.920309716629777, + "learning_rate": 4.396733899470541e-06, + "loss": 0.4552, + "step": 7033 + }, + { + "epoch": 0.55, + "grad_norm": 2.088402400197526, + "learning_rate": 4.395471412955737e-06, + "loss": 0.4673, + "step": 7034 + }, + { + "epoch": 0.55, + "grad_norm": 1.4363641129344256, + "learning_rate": 4.394208965553139e-06, + "loss": 0.4446, + "step": 7035 + }, + { + "epoch": 0.55, + "grad_norm": 1.3770192617406876, + "learning_rate": 4.392946557344431e-06, + "loss": 0.4505, + "step": 7036 + }, + { + "epoch": 0.55, + "grad_norm": 1.606612142661513, + "learning_rate": 4.391684188411284e-06, + "loss": 0.5244, + "step": 7037 + }, + { + "epoch": 0.55, + "grad_norm": 1.6183551326985346, + "learning_rate": 4.390421858835374e-06, + "loss": 0.4537, + "step": 7038 + }, + { + "epoch": 0.55, + "grad_norm": 1.573086646102559, + "learning_rate": 4.389159568698375e-06, + "loss": 0.4092, + "step": 7039 + }, + { + "epoch": 0.55, + "grad_norm": 0.6477005313384241, + "learning_rate": 4.387897318081951e-06, + "loss": 0.4923, + "step": 7040 + }, + { + "epoch": 0.55, + "grad_norm": 3.236368083172161, + "learning_rate": 4.386635107067772e-06, + "loss": 0.5005, + "step": 7041 + }, + { + "epoch": 0.55, + "grad_norm": 1.5121322760621845, + "learning_rate": 4.385372935737496e-06, + "loss": 0.4157, + "step": 7042 + }, + { + "epoch": 0.55, + "grad_norm": 1.6833511404846513, + "learning_rate": 4.38411080417279e-06, + "loss": 0.4636, + "step": 7043 + }, + { + "epoch": 0.55, + "grad_norm": 1.8877525530703354, + "learning_rate": 4.382848712455307e-06, + "loss": 0.4625, + "step": 7044 + }, + { + "epoch": 0.55, + "grad_norm": 1.3569894382538616, + "learning_rate": 4.381586660666707e-06, + "loss": 0.4516, + "step": 7045 + }, + { + "epoch": 0.55, + "grad_norm": 1.4125657630797133, + "learning_rate": 4.380324648888638e-06, + "loss": 0.4491, + "step": 7046 + }, + { + "epoch": 0.55, + "grad_norm": 1.673965314766708, + "learning_rate": 4.379062677202757e-06, + "loss": 0.4686, + "step": 7047 + }, + { + "epoch": 0.55, + "grad_norm": 2.7372410945554004, + "learning_rate": 4.3778007456907065e-06, + "loss": 0.5016, + "step": 7048 + }, + { + "epoch": 0.55, + "grad_norm": 1.9089767498205799, + "learning_rate": 4.376538854434135e-06, + "loss": 0.452, + "step": 7049 + }, + { + "epoch": 0.55, + "grad_norm": 1.4544558627795818, + "learning_rate": 4.375277003514683e-06, + "loss": 0.4747, + "step": 7050 + }, + { + "epoch": 0.55, + "grad_norm": 2.2327357536054557, + "learning_rate": 4.374015193013992e-06, + "loss": 0.4698, + "step": 7051 + }, + { + "epoch": 0.55, + "grad_norm": 2.3678180762884193, + "learning_rate": 4.372753423013699e-06, + "loss": 0.4496, + "step": 7052 + }, + { + "epoch": 0.55, + "grad_norm": 1.810492063544269, + "learning_rate": 4.3714916935954386e-06, + "loss": 0.5069, + "step": 7053 + }, + { + "epoch": 0.55, + "grad_norm": 1.5625807655030113, + "learning_rate": 4.370230004840845e-06, + "loss": 0.4635, + "step": 7054 + }, + { + "epoch": 0.55, + "grad_norm": 2.3858176260745867, + "learning_rate": 4.368968356831544e-06, + "loss": 0.502, + "step": 7055 + }, + { + "epoch": 0.55, + "grad_norm": 1.877182342981261, + "learning_rate": 4.367706749649167e-06, + "loss": 0.4614, + "step": 7056 + }, + { + "epoch": 0.55, + "grad_norm": 2.481665538232791, + "learning_rate": 4.366445183375335e-06, + "loss": 0.5198, + "step": 7057 + }, + { + "epoch": 0.55, + "grad_norm": 2.9267904034320824, + "learning_rate": 4.365183658091672e-06, + "loss": 0.443, + "step": 7058 + }, + { + "epoch": 0.55, + "grad_norm": 1.8793435474565665, + "learning_rate": 4.363922173879794e-06, + "loss": 0.4244, + "step": 7059 + }, + { + "epoch": 0.55, + "grad_norm": 1.469176425870163, + "learning_rate": 4.362660730821321e-06, + "loss": 0.4528, + "step": 7060 + }, + { + "epoch": 0.55, + "grad_norm": 2.355820290937261, + "learning_rate": 4.361399328997862e-06, + "loss": 0.4565, + "step": 7061 + }, + { + "epoch": 0.55, + "grad_norm": 1.5947954498565637, + "learning_rate": 4.360137968491033e-06, + "loss": 0.4273, + "step": 7062 + }, + { + "epoch": 0.55, + "grad_norm": 1.7571192028754195, + "learning_rate": 4.358876649382441e-06, + "loss": 0.4377, + "step": 7063 + }, + { + "epoch": 0.55, + "grad_norm": 1.5571334120862022, + "learning_rate": 4.35761537175369e-06, + "loss": 0.47, + "step": 7064 + }, + { + "epoch": 0.55, + "grad_norm": 1.9585484554976884, + "learning_rate": 4.356354135686385e-06, + "loss": 0.4779, + "step": 7065 + }, + { + "epoch": 0.55, + "grad_norm": 1.9224194520241955, + "learning_rate": 4.355092941262125e-06, + "loss": 0.4399, + "step": 7066 + }, + { + "epoch": 0.56, + "grad_norm": 1.3658057771066292, + "learning_rate": 4.353831788562509e-06, + "loss": 0.4911, + "step": 7067 + }, + { + "epoch": 0.56, + "grad_norm": 0.5592145227770972, + "learning_rate": 4.35257067766913e-06, + "loss": 0.4757, + "step": 7068 + }, + { + "epoch": 0.56, + "grad_norm": 0.5419519037460963, + "learning_rate": 4.351309608663582e-06, + "loss": 0.4881, + "step": 7069 + }, + { + "epoch": 0.56, + "grad_norm": 1.6856154431224528, + "learning_rate": 4.350048581627452e-06, + "loss": 0.4955, + "step": 7070 + }, + { + "epoch": 0.56, + "grad_norm": 1.2851496477520994, + "learning_rate": 4.348787596642331e-06, + "loss": 0.4363, + "step": 7071 + }, + { + "epoch": 0.56, + "grad_norm": 1.8153696492324172, + "learning_rate": 4.347526653789798e-06, + "loss": 0.4189, + "step": 7072 + }, + { + "epoch": 0.56, + "grad_norm": 2.3326952606353584, + "learning_rate": 4.346265753151438e-06, + "loss": 0.4503, + "step": 7073 + }, + { + "epoch": 0.56, + "grad_norm": 1.459275596930889, + "learning_rate": 4.345004894808829e-06, + "loss": 0.4884, + "step": 7074 + }, + { + "epoch": 0.56, + "grad_norm": 3.040050883432156, + "learning_rate": 4.343744078843545e-06, + "loss": 0.5096, + "step": 7075 + }, + { + "epoch": 0.56, + "grad_norm": 1.4599004057223703, + "learning_rate": 4.342483305337164e-06, + "loss": 0.4917, + "step": 7076 + }, + { + "epoch": 0.56, + "grad_norm": 2.098386629632137, + "learning_rate": 4.3412225743712494e-06, + "loss": 0.4983, + "step": 7077 + }, + { + "epoch": 0.56, + "grad_norm": 1.810049827702473, + "learning_rate": 4.339961886027376e-06, + "loss": 0.4596, + "step": 7078 + }, + { + "epoch": 0.56, + "grad_norm": 1.448124245646561, + "learning_rate": 4.338701240387101e-06, + "loss": 0.425, + "step": 7079 + }, + { + "epoch": 0.56, + "grad_norm": 2.0835026753430106, + "learning_rate": 4.337440637531994e-06, + "loss": 0.4683, + "step": 7080 + }, + { + "epoch": 0.56, + "grad_norm": 1.519222800713956, + "learning_rate": 4.3361800775436085e-06, + "loss": 0.4533, + "step": 7081 + }, + { + "epoch": 0.56, + "grad_norm": 1.4959999756829805, + "learning_rate": 4.334919560503506e-06, + "loss": 0.4889, + "step": 7082 + }, + { + "epoch": 0.56, + "grad_norm": 1.4383030012139646, + "learning_rate": 4.333659086493236e-06, + "loss": 0.483, + "step": 7083 + }, + { + "epoch": 0.56, + "grad_norm": 0.6084560745424255, + "learning_rate": 4.332398655594351e-06, + "loss": 0.4993, + "step": 7084 + }, + { + "epoch": 0.56, + "grad_norm": 1.7718959094126387, + "learning_rate": 4.331138267888402e-06, + "loss": 0.4761, + "step": 7085 + }, + { + "epoch": 0.56, + "grad_norm": 1.2973554425799056, + "learning_rate": 4.3298779234569305e-06, + "loss": 0.5093, + "step": 7086 + }, + { + "epoch": 0.56, + "grad_norm": 1.636737030073066, + "learning_rate": 4.328617622381481e-06, + "loss": 0.4771, + "step": 7087 + }, + { + "epoch": 0.56, + "grad_norm": 1.6804304037646827, + "learning_rate": 4.327357364743593e-06, + "loss": 0.5048, + "step": 7088 + }, + { + "epoch": 0.56, + "grad_norm": 0.5669294703378361, + "learning_rate": 4.3260971506248045e-06, + "loss": 0.4847, + "step": 7089 + }, + { + "epoch": 0.56, + "grad_norm": 1.524866791357876, + "learning_rate": 4.324836980106648e-06, + "loss": 0.4763, + "step": 7090 + }, + { + "epoch": 0.56, + "grad_norm": 2.027185439879834, + "learning_rate": 4.323576853270656e-06, + "loss": 0.4378, + "step": 7091 + }, + { + "epoch": 0.56, + "grad_norm": 1.6335679443035642, + "learning_rate": 4.322316770198356e-06, + "loss": 0.4903, + "step": 7092 + }, + { + "epoch": 0.56, + "grad_norm": 0.5903735576675163, + "learning_rate": 4.321056730971275e-06, + "loss": 0.4761, + "step": 7093 + }, + { + "epoch": 0.56, + "grad_norm": 3.646851041760132, + "learning_rate": 4.319796735670935e-06, + "loss": 0.456, + "step": 7094 + }, + { + "epoch": 0.56, + "grad_norm": 1.7000459523526177, + "learning_rate": 4.318536784378855e-06, + "loss": 0.4728, + "step": 7095 + }, + { + "epoch": 0.56, + "grad_norm": 1.5931789981034563, + "learning_rate": 4.317276877176556e-06, + "loss": 0.4354, + "step": 7096 + }, + { + "epoch": 0.56, + "grad_norm": 1.973054816736263, + "learning_rate": 4.316017014145548e-06, + "loss": 0.4568, + "step": 7097 + }, + { + "epoch": 0.56, + "grad_norm": 2.719179731251411, + "learning_rate": 4.314757195367347e-06, + "loss": 0.4776, + "step": 7098 + }, + { + "epoch": 0.56, + "grad_norm": 1.721910820249672, + "learning_rate": 4.313497420923456e-06, + "loss": 0.4755, + "step": 7099 + }, + { + "epoch": 0.56, + "grad_norm": 0.5372784712846987, + "learning_rate": 4.312237690895387e-06, + "loss": 0.4872, + "step": 7100 + }, + { + "epoch": 0.56, + "grad_norm": 1.7039023374321425, + "learning_rate": 4.310978005364637e-06, + "loss": 0.4487, + "step": 7101 + }, + { + "epoch": 0.56, + "grad_norm": 1.4958597937073772, + "learning_rate": 4.309718364412711e-06, + "loss": 0.4449, + "step": 7102 + }, + { + "epoch": 0.56, + "grad_norm": 1.3645944456143195, + "learning_rate": 4.308458768121102e-06, + "loss": 0.4333, + "step": 7103 + }, + { + "epoch": 0.56, + "grad_norm": 1.6157152481916293, + "learning_rate": 4.307199216571307e-06, + "loss": 0.4271, + "step": 7104 + }, + { + "epoch": 0.56, + "grad_norm": 1.320880319183414, + "learning_rate": 4.305939709844815e-06, + "loss": 0.4651, + "step": 7105 + }, + { + "epoch": 0.56, + "grad_norm": 1.7932893912409602, + "learning_rate": 4.304680248023117e-06, + "loss": 0.4396, + "step": 7106 + }, + { + "epoch": 0.56, + "grad_norm": 1.6533606693322573, + "learning_rate": 4.303420831187698e-06, + "loss": 0.4529, + "step": 7107 + }, + { + "epoch": 0.56, + "grad_norm": 0.538407189673168, + "learning_rate": 4.302161459420037e-06, + "loss": 0.4801, + "step": 7108 + }, + { + "epoch": 0.56, + "grad_norm": 1.6166162681584009, + "learning_rate": 4.300902132801621e-06, + "loss": 0.3951, + "step": 7109 + }, + { + "epoch": 0.56, + "grad_norm": 2.9283161285875248, + "learning_rate": 4.299642851413919e-06, + "loss": 0.4366, + "step": 7110 + }, + { + "epoch": 0.56, + "grad_norm": 2.814997773410602, + "learning_rate": 4.2983836153384114e-06, + "loss": 0.4028, + "step": 7111 + }, + { + "epoch": 0.56, + "grad_norm": 1.8732772018217532, + "learning_rate": 4.297124424656563e-06, + "loss": 0.4106, + "step": 7112 + }, + { + "epoch": 0.56, + "grad_norm": 1.8022789492333757, + "learning_rate": 4.295865279449847e-06, + "loss": 0.3994, + "step": 7113 + }, + { + "epoch": 0.56, + "grad_norm": 1.555915217888963, + "learning_rate": 4.2946061797997245e-06, + "loss": 0.4437, + "step": 7114 + }, + { + "epoch": 0.56, + "grad_norm": 0.6482567681059899, + "learning_rate": 4.293347125787662e-06, + "loss": 0.5175, + "step": 7115 + }, + { + "epoch": 0.56, + "grad_norm": 2.0707033523533798, + "learning_rate": 4.292088117495113e-06, + "loss": 0.4841, + "step": 7116 + }, + { + "epoch": 0.56, + "grad_norm": 1.6075329077273628, + "learning_rate": 4.290829155003538e-06, + "loss": 0.4134, + "step": 7117 + }, + { + "epoch": 0.56, + "grad_norm": 2.3139845191046, + "learning_rate": 4.28957023839439e-06, + "loss": 0.4303, + "step": 7118 + }, + { + "epoch": 0.56, + "grad_norm": 1.426227007206049, + "learning_rate": 4.288311367749116e-06, + "loss": 0.4288, + "step": 7119 + }, + { + "epoch": 0.56, + "grad_norm": 2.493381934248523, + "learning_rate": 4.287052543149167e-06, + "loss": 0.4484, + "step": 7120 + }, + { + "epoch": 0.56, + "grad_norm": 1.6787731203515182, + "learning_rate": 4.285793764675986e-06, + "loss": 0.4967, + "step": 7121 + }, + { + "epoch": 0.56, + "grad_norm": 2.234573890119122, + "learning_rate": 4.284535032411014e-06, + "loss": 0.4838, + "step": 7122 + }, + { + "epoch": 0.56, + "grad_norm": 0.5796205847564913, + "learning_rate": 4.283276346435689e-06, + "loss": 0.4856, + "step": 7123 + }, + { + "epoch": 0.56, + "grad_norm": 1.6946004249080573, + "learning_rate": 4.282017706831447e-06, + "loss": 0.5004, + "step": 7124 + }, + { + "epoch": 0.56, + "grad_norm": 3.111828690234118, + "learning_rate": 4.28075911367972e-06, + "loss": 0.3983, + "step": 7125 + }, + { + "epoch": 0.56, + "grad_norm": 2.426555629132823, + "learning_rate": 4.279500567061938e-06, + "loss": 0.495, + "step": 7126 + }, + { + "epoch": 0.56, + "grad_norm": 3.4244794007682464, + "learning_rate": 4.278242067059526e-06, + "loss": 0.4493, + "step": 7127 + }, + { + "epoch": 0.56, + "grad_norm": 2.1633449489178704, + "learning_rate": 4.27698361375391e-06, + "loss": 0.454, + "step": 7128 + }, + { + "epoch": 0.56, + "grad_norm": 0.5962354948533055, + "learning_rate": 4.275725207226505e-06, + "loss": 0.5079, + "step": 7129 + }, + { + "epoch": 0.56, + "grad_norm": 1.837982846000056, + "learning_rate": 4.274466847558733e-06, + "loss": 0.4961, + "step": 7130 + }, + { + "epoch": 0.56, + "grad_norm": 1.527886933556026, + "learning_rate": 4.273208534832007e-06, + "loss": 0.4819, + "step": 7131 + }, + { + "epoch": 0.56, + "grad_norm": 2.5239023235675115, + "learning_rate": 4.271950269127738e-06, + "loss": 0.5115, + "step": 7132 + }, + { + "epoch": 0.56, + "grad_norm": 0.5724784306377685, + "learning_rate": 4.270692050527335e-06, + "loss": 0.4704, + "step": 7133 + }, + { + "epoch": 0.56, + "grad_norm": 2.949473922851969, + "learning_rate": 4.269433879112202e-06, + "loss": 0.4463, + "step": 7134 + }, + { + "epoch": 0.56, + "grad_norm": 3.50625765584057, + "learning_rate": 4.268175754963741e-06, + "loss": 0.4303, + "step": 7135 + }, + { + "epoch": 0.56, + "grad_norm": 1.8915366186292353, + "learning_rate": 4.266917678163352e-06, + "loss": 0.4944, + "step": 7136 + }, + { + "epoch": 0.56, + "grad_norm": 1.7606990101713447, + "learning_rate": 4.26565964879243e-06, + "loss": 0.494, + "step": 7137 + }, + { + "epoch": 0.56, + "grad_norm": 2.32874710581035, + "learning_rate": 4.264401666932367e-06, + "loss": 0.4527, + "step": 7138 + }, + { + "epoch": 0.56, + "grad_norm": 1.8136147198991983, + "learning_rate": 4.2631437326645555e-06, + "loss": 0.4568, + "step": 7139 + }, + { + "epoch": 0.56, + "grad_norm": 2.2163663109038967, + "learning_rate": 4.261885846070379e-06, + "loss": 0.4927, + "step": 7140 + }, + { + "epoch": 0.56, + "grad_norm": 1.5366903757711854, + "learning_rate": 4.260628007231223e-06, + "loss": 0.4439, + "step": 7141 + }, + { + "epoch": 0.56, + "grad_norm": 1.6128422517537353, + "learning_rate": 4.259370216228468e-06, + "loss": 0.44, + "step": 7142 + }, + { + "epoch": 0.56, + "grad_norm": 0.5422867976860997, + "learning_rate": 4.258112473143491e-06, + "loss": 0.4931, + "step": 7143 + }, + { + "epoch": 0.56, + "grad_norm": 1.732283222605643, + "learning_rate": 4.256854778057668e-06, + "loss": 0.5267, + "step": 7144 + }, + { + "epoch": 0.56, + "grad_norm": 1.598296699258677, + "learning_rate": 4.2555971310523655e-06, + "loss": 0.4724, + "step": 7145 + }, + { + "epoch": 0.56, + "grad_norm": 0.5270406538508988, + "learning_rate": 4.254339532208959e-06, + "loss": 0.4824, + "step": 7146 + }, + { + "epoch": 0.56, + "grad_norm": 0.570551625454146, + "learning_rate": 4.253081981608805e-06, + "loss": 0.4756, + "step": 7147 + }, + { + "epoch": 0.56, + "grad_norm": 1.3306062359476223, + "learning_rate": 4.251824479333273e-06, + "loss": 0.4501, + "step": 7148 + }, + { + "epoch": 0.56, + "grad_norm": 1.7208407446721028, + "learning_rate": 4.250567025463716e-06, + "loss": 0.5568, + "step": 7149 + }, + { + "epoch": 0.56, + "grad_norm": 1.5618051072099315, + "learning_rate": 4.249309620081495e-06, + "loss": 0.4916, + "step": 7150 + }, + { + "epoch": 0.56, + "grad_norm": 2.041586722877766, + "learning_rate": 4.248052263267956e-06, + "loss": 0.5151, + "step": 7151 + }, + { + "epoch": 0.56, + "grad_norm": 1.7005550799666431, + "learning_rate": 4.246794955104453e-06, + "loss": 0.4633, + "step": 7152 + }, + { + "epoch": 0.56, + "grad_norm": 1.3551752685434306, + "learning_rate": 4.245537695672333e-06, + "loss": 0.4632, + "step": 7153 + }, + { + "epoch": 0.56, + "grad_norm": 0.5549663044760179, + "learning_rate": 4.244280485052935e-06, + "loss": 0.4941, + "step": 7154 + }, + { + "epoch": 0.56, + "grad_norm": 1.8302813717164201, + "learning_rate": 4.2430233233276026e-06, + "loss": 0.4849, + "step": 7155 + }, + { + "epoch": 0.56, + "grad_norm": 1.6175272060955515, + "learning_rate": 4.2417662105776695e-06, + "loss": 0.4571, + "step": 7156 + }, + { + "epoch": 0.56, + "grad_norm": 1.3630483389353352, + "learning_rate": 4.240509146884472e-06, + "loss": 0.4231, + "step": 7157 + }, + { + "epoch": 0.56, + "grad_norm": 1.341332577101578, + "learning_rate": 4.239252132329338e-06, + "loss": 0.4687, + "step": 7158 + }, + { + "epoch": 0.56, + "grad_norm": 1.8412649990272256, + "learning_rate": 4.237995166993598e-06, + "loss": 0.4495, + "step": 7159 + }, + { + "epoch": 0.56, + "grad_norm": 0.5902555942383094, + "learning_rate": 4.236738250958571e-06, + "loss": 0.51, + "step": 7160 + }, + { + "epoch": 0.56, + "grad_norm": 1.53911026827657, + "learning_rate": 4.235481384305583e-06, + "loss": 0.4452, + "step": 7161 + }, + { + "epoch": 0.56, + "grad_norm": 1.9798016316404117, + "learning_rate": 4.234224567115948e-06, + "loss": 0.4536, + "step": 7162 + }, + { + "epoch": 0.56, + "grad_norm": 2.2786078273549752, + "learning_rate": 4.2329677994709805e-06, + "loss": 0.4736, + "step": 7163 + }, + { + "epoch": 0.56, + "grad_norm": 1.61142115541771, + "learning_rate": 4.231711081451997e-06, + "loss": 0.451, + "step": 7164 + }, + { + "epoch": 0.56, + "grad_norm": 1.629607635428878, + "learning_rate": 4.230454413140297e-06, + "loss": 0.4419, + "step": 7165 + }, + { + "epoch": 0.56, + "grad_norm": 1.6069100954455453, + "learning_rate": 4.229197794617192e-06, + "loss": 0.5143, + "step": 7166 + }, + { + "epoch": 0.56, + "grad_norm": 1.9731422168325872, + "learning_rate": 4.227941225963981e-06, + "loss": 0.467, + "step": 7167 + }, + { + "epoch": 0.56, + "grad_norm": 1.440688698761986, + "learning_rate": 4.226684707261963e-06, + "loss": 0.4393, + "step": 7168 + }, + { + "epoch": 0.56, + "grad_norm": 0.5777641425416326, + "learning_rate": 4.225428238592433e-06, + "loss": 0.4814, + "step": 7169 + }, + { + "epoch": 0.56, + "grad_norm": 0.5786916309508443, + "learning_rate": 4.2241718200366824e-06, + "loss": 0.4889, + "step": 7170 + }, + { + "epoch": 0.56, + "grad_norm": 1.6835076191670824, + "learning_rate": 4.222915451675999e-06, + "loss": 0.4227, + "step": 7171 + }, + { + "epoch": 0.56, + "grad_norm": 1.7560958312115467, + "learning_rate": 4.221659133591672e-06, + "loss": 0.4367, + "step": 7172 + }, + { + "epoch": 0.56, + "grad_norm": 2.7899144033972494, + "learning_rate": 4.220402865864979e-06, + "loss": 0.5037, + "step": 7173 + }, + { + "epoch": 0.56, + "grad_norm": 1.4668440822962623, + "learning_rate": 4.2191466485772e-06, + "loss": 0.528, + "step": 7174 + }, + { + "epoch": 0.56, + "grad_norm": 1.7694986247838935, + "learning_rate": 4.217890481809613e-06, + "loss": 0.4321, + "step": 7175 + }, + { + "epoch": 0.56, + "grad_norm": 1.6802665156694991, + "learning_rate": 4.216634365643488e-06, + "loss": 0.4337, + "step": 7176 + }, + { + "epoch": 0.56, + "grad_norm": 2.290977307441879, + "learning_rate": 4.215378300160095e-06, + "loss": 0.406, + "step": 7177 + }, + { + "epoch": 0.56, + "grad_norm": 0.6057793011525513, + "learning_rate": 4.2141222854407e-06, + "loss": 0.4897, + "step": 7178 + }, + { + "epoch": 0.56, + "grad_norm": 1.3561512078015183, + "learning_rate": 4.212866321566564e-06, + "loss": 0.4598, + "step": 7179 + }, + { + "epoch": 0.56, + "grad_norm": 0.5737466134148299, + "learning_rate": 4.2116104086189466e-06, + "loss": 0.5004, + "step": 7180 + }, + { + "epoch": 0.56, + "grad_norm": 1.6331235011133518, + "learning_rate": 4.2103545466791074e-06, + "loss": 0.4741, + "step": 7181 + }, + { + "epoch": 0.56, + "grad_norm": 4.412003853584198, + "learning_rate": 4.209098735828293e-06, + "loss": 0.4528, + "step": 7182 + }, + { + "epoch": 0.56, + "grad_norm": 1.7275176501531586, + "learning_rate": 4.207842976147758e-06, + "loss": 0.4694, + "step": 7183 + }, + { + "epoch": 0.56, + "grad_norm": 1.6301747302058787, + "learning_rate": 4.206587267718743e-06, + "loss": 0.4834, + "step": 7184 + }, + { + "epoch": 0.56, + "grad_norm": 1.6640047079820801, + "learning_rate": 4.205331610622496e-06, + "loss": 0.4692, + "step": 7185 + }, + { + "epoch": 0.56, + "grad_norm": 2.286084659455096, + "learning_rate": 4.204076004940255e-06, + "loss": 0.4951, + "step": 7186 + }, + { + "epoch": 0.56, + "grad_norm": 1.546893798102047, + "learning_rate": 4.202820450753255e-06, + "loss": 0.4712, + "step": 7187 + }, + { + "epoch": 0.56, + "grad_norm": 2.4878529962976126, + "learning_rate": 4.2015649481427285e-06, + "loss": 0.4408, + "step": 7188 + }, + { + "epoch": 0.56, + "grad_norm": 2.164570868973618, + "learning_rate": 4.200309497189905e-06, + "loss": 0.4744, + "step": 7189 + }, + { + "epoch": 0.56, + "grad_norm": 1.5107459330460842, + "learning_rate": 4.199054097976012e-06, + "loss": 0.4452, + "step": 7190 + }, + { + "epoch": 0.56, + "grad_norm": 1.4913902984697256, + "learning_rate": 4.19779875058227e-06, + "loss": 0.441, + "step": 7191 + }, + { + "epoch": 0.56, + "grad_norm": 1.7601764914183795, + "learning_rate": 4.1965434550899e-06, + "loss": 0.4825, + "step": 7192 + }, + { + "epoch": 0.56, + "grad_norm": 0.5750663937481285, + "learning_rate": 4.195288211580118e-06, + "loss": 0.4873, + "step": 7193 + }, + { + "epoch": 0.56, + "grad_norm": 1.9728166575549422, + "learning_rate": 4.194033020134135e-06, + "loss": 0.4849, + "step": 7194 + }, + { + "epoch": 0.57, + "grad_norm": 1.4359490813360012, + "learning_rate": 4.192777880833161e-06, + "loss": 0.485, + "step": 7195 + }, + { + "epoch": 0.57, + "grad_norm": 2.367135646028408, + "learning_rate": 4.1915227937584015e-06, + "loss": 0.431, + "step": 7196 + }, + { + "epoch": 0.57, + "grad_norm": 2.2703325081079995, + "learning_rate": 4.190267758991062e-06, + "loss": 0.4773, + "step": 7197 + }, + { + "epoch": 0.57, + "grad_norm": 2.491102761889611, + "learning_rate": 4.189012776612337e-06, + "loss": 0.4861, + "step": 7198 + }, + { + "epoch": 0.57, + "grad_norm": 1.3692832381163065, + "learning_rate": 4.187757846703427e-06, + "loss": 0.4279, + "step": 7199 + }, + { + "epoch": 0.57, + "grad_norm": 1.7482550726188164, + "learning_rate": 4.186502969345518e-06, + "loss": 0.3971, + "step": 7200 + }, + { + "epoch": 0.57, + "grad_norm": 2.227964683480586, + "learning_rate": 4.185248144619806e-06, + "loss": 0.4646, + "step": 7201 + }, + { + "epoch": 0.57, + "grad_norm": 1.6120483074020207, + "learning_rate": 4.183993372607472e-06, + "loss": 0.4614, + "step": 7202 + }, + { + "epoch": 0.57, + "grad_norm": 1.6203608377282328, + "learning_rate": 4.1827386533897e-06, + "loss": 0.481, + "step": 7203 + }, + { + "epoch": 0.57, + "grad_norm": 0.5741994293492492, + "learning_rate": 4.181483987047666e-06, + "loss": 0.4657, + "step": 7204 + }, + { + "epoch": 0.57, + "grad_norm": 1.5819616942552157, + "learning_rate": 4.18022937366255e-06, + "loss": 0.4305, + "step": 7205 + }, + { + "epoch": 0.57, + "grad_norm": 1.9052278979378736, + "learning_rate": 4.178974813315519e-06, + "loss": 0.4474, + "step": 7206 + }, + { + "epoch": 0.57, + "grad_norm": 1.3999564200050365, + "learning_rate": 4.177720306087745e-06, + "loss": 0.4733, + "step": 7207 + }, + { + "epoch": 0.57, + "grad_norm": 1.4349190492553616, + "learning_rate": 4.176465852060391e-06, + "loss": 0.4726, + "step": 7208 + }, + { + "epoch": 0.57, + "grad_norm": 1.5976258482918875, + "learning_rate": 4.175211451314619e-06, + "loss": 0.4322, + "step": 7209 + }, + { + "epoch": 0.57, + "grad_norm": 1.4175416498897417, + "learning_rate": 4.173957103931587e-06, + "loss": 0.4666, + "step": 7210 + }, + { + "epoch": 0.57, + "grad_norm": 2.2402160660989288, + "learning_rate": 4.17270280999245e-06, + "loss": 0.5288, + "step": 7211 + }, + { + "epoch": 0.57, + "grad_norm": 2.0565465602893727, + "learning_rate": 4.171448569578359e-06, + "loss": 0.4578, + "step": 7212 + }, + { + "epoch": 0.57, + "grad_norm": 1.2889545798991457, + "learning_rate": 4.170194382770462e-06, + "loss": 0.3994, + "step": 7213 + }, + { + "epoch": 0.57, + "grad_norm": 0.5417510364901649, + "learning_rate": 4.168940249649902e-06, + "loss": 0.4914, + "step": 7214 + }, + { + "epoch": 0.57, + "grad_norm": 0.6067848747787655, + "learning_rate": 4.16768617029782e-06, + "loss": 0.483, + "step": 7215 + }, + { + "epoch": 0.57, + "grad_norm": 1.558924458905801, + "learning_rate": 4.166432144795357e-06, + "loss": 0.4517, + "step": 7216 + }, + { + "epoch": 0.57, + "grad_norm": 1.3056153487433872, + "learning_rate": 4.16517817322364e-06, + "loss": 0.4244, + "step": 7217 + }, + { + "epoch": 0.57, + "grad_norm": 1.6203790482880163, + "learning_rate": 4.1639242556638075e-06, + "loss": 0.4614, + "step": 7218 + }, + { + "epoch": 0.57, + "grad_norm": 1.5866996507146631, + "learning_rate": 4.162670392196978e-06, + "loss": 0.4761, + "step": 7219 + }, + { + "epoch": 0.57, + "grad_norm": 0.5944572471266427, + "learning_rate": 4.161416582904279e-06, + "loss": 0.5057, + "step": 7220 + }, + { + "epoch": 0.57, + "grad_norm": 1.8848243412834218, + "learning_rate": 4.160162827866832e-06, + "loss": 0.5233, + "step": 7221 + }, + { + "epoch": 0.57, + "grad_norm": 0.5436304846627225, + "learning_rate": 4.15890912716575e-06, + "loss": 0.4648, + "step": 7222 + }, + { + "epoch": 0.57, + "grad_norm": 1.8487228727618306, + "learning_rate": 4.157655480882149e-06, + "loss": 0.514, + "step": 7223 + }, + { + "epoch": 0.57, + "grad_norm": 1.62845003223565, + "learning_rate": 4.1564018890971345e-06, + "loss": 0.4783, + "step": 7224 + }, + { + "epoch": 0.57, + "grad_norm": 1.326197393556375, + "learning_rate": 4.1551483518918156e-06, + "loss": 0.3681, + "step": 7225 + }, + { + "epoch": 0.57, + "grad_norm": 1.7320646277997784, + "learning_rate": 4.153894869347292e-06, + "loss": 0.5074, + "step": 7226 + }, + { + "epoch": 0.57, + "grad_norm": 1.920768701154002, + "learning_rate": 4.152641441544665e-06, + "loss": 0.4316, + "step": 7227 + }, + { + "epoch": 0.57, + "grad_norm": 1.6106112115824267, + "learning_rate": 4.151388068565026e-06, + "loss": 0.4662, + "step": 7228 + }, + { + "epoch": 0.57, + "grad_norm": 1.5536814263242582, + "learning_rate": 4.150134750489471e-06, + "loss": 0.432, + "step": 7229 + }, + { + "epoch": 0.57, + "grad_norm": 2.267552671542952, + "learning_rate": 4.148881487399085e-06, + "loss": 0.4673, + "step": 7230 + }, + { + "epoch": 0.57, + "grad_norm": 1.5251450140465543, + "learning_rate": 4.1476282793749515e-06, + "loss": 0.5443, + "step": 7231 + }, + { + "epoch": 0.57, + "grad_norm": 1.570071021459001, + "learning_rate": 4.1463751264981574e-06, + "loss": 0.4859, + "step": 7232 + }, + { + "epoch": 0.57, + "grad_norm": 1.9499691663920258, + "learning_rate": 4.145122028849772e-06, + "loss": 0.4509, + "step": 7233 + }, + { + "epoch": 0.57, + "grad_norm": 1.5637653900421415, + "learning_rate": 4.143868986510876e-06, + "loss": 0.4681, + "step": 7234 + }, + { + "epoch": 0.57, + "grad_norm": 1.3360038123002915, + "learning_rate": 4.142615999562534e-06, + "loss": 0.4285, + "step": 7235 + }, + { + "epoch": 0.57, + "grad_norm": 0.5607455807744609, + "learning_rate": 4.141363068085817e-06, + "loss": 0.4696, + "step": 7236 + }, + { + "epoch": 0.57, + "grad_norm": 2.1869152469017368, + "learning_rate": 4.140110192161786e-06, + "loss": 0.5326, + "step": 7237 + }, + { + "epoch": 0.57, + "grad_norm": 4.193590935499073, + "learning_rate": 4.138857371871501e-06, + "loss": 0.4332, + "step": 7238 + }, + { + "epoch": 0.57, + "grad_norm": 2.079653613191426, + "learning_rate": 4.137604607296016e-06, + "loss": 0.4867, + "step": 7239 + }, + { + "epoch": 0.57, + "grad_norm": 1.6934470574676732, + "learning_rate": 4.1363518985163865e-06, + "loss": 0.5073, + "step": 7240 + }, + { + "epoch": 0.57, + "grad_norm": 1.82202470824787, + "learning_rate": 4.1350992456136586e-06, + "loss": 0.5266, + "step": 7241 + }, + { + "epoch": 0.57, + "grad_norm": 1.7153789490952156, + "learning_rate": 4.133846648668878e-06, + "loss": 0.4607, + "step": 7242 + }, + { + "epoch": 0.57, + "grad_norm": 0.5595554969703158, + "learning_rate": 4.132594107763087e-06, + "loss": 0.4944, + "step": 7243 + }, + { + "epoch": 0.57, + "grad_norm": 3.185435782620074, + "learning_rate": 4.13134162297732e-06, + "loss": 0.4775, + "step": 7244 + }, + { + "epoch": 0.57, + "grad_norm": 0.5411136030467448, + "learning_rate": 4.130089194392617e-06, + "loss": 0.4598, + "step": 7245 + }, + { + "epoch": 0.57, + "grad_norm": 1.9342805837479653, + "learning_rate": 4.128836822090003e-06, + "loss": 0.4144, + "step": 7246 + }, + { + "epoch": 0.57, + "grad_norm": 2.3018552170539945, + "learning_rate": 4.127584506150508e-06, + "loss": 0.3836, + "step": 7247 + }, + { + "epoch": 0.57, + "grad_norm": 0.5652795675424047, + "learning_rate": 4.126332246655154e-06, + "loss": 0.4889, + "step": 7248 + }, + { + "epoch": 0.57, + "grad_norm": 2.0155641280240064, + "learning_rate": 4.125080043684961e-06, + "loss": 0.4723, + "step": 7249 + }, + { + "epoch": 0.57, + "grad_norm": 1.632886268271774, + "learning_rate": 4.1238278973209426e-06, + "loss": 0.4377, + "step": 7250 + }, + { + "epoch": 0.57, + "grad_norm": 0.564861750390497, + "learning_rate": 4.122575807644116e-06, + "loss": 0.5127, + "step": 7251 + }, + { + "epoch": 0.57, + "grad_norm": 1.9712034567633971, + "learning_rate": 4.121323774735484e-06, + "loss": 0.4741, + "step": 7252 + }, + { + "epoch": 0.57, + "grad_norm": 1.5387590534814723, + "learning_rate": 4.120071798676055e-06, + "loss": 0.4635, + "step": 7253 + }, + { + "epoch": 0.57, + "grad_norm": 0.566008860908701, + "learning_rate": 4.118819879546832e-06, + "loss": 0.4623, + "step": 7254 + }, + { + "epoch": 0.57, + "grad_norm": 0.5599071387118866, + "learning_rate": 4.117568017428807e-06, + "loss": 0.4865, + "step": 7255 + }, + { + "epoch": 0.57, + "grad_norm": 2.101934518954288, + "learning_rate": 4.116316212402979e-06, + "loss": 0.427, + "step": 7256 + }, + { + "epoch": 0.57, + "grad_norm": 1.7828228437772888, + "learning_rate": 4.115064464550335e-06, + "loss": 0.4653, + "step": 7257 + }, + { + "epoch": 0.57, + "grad_norm": 1.7394057271031385, + "learning_rate": 4.113812773951863e-06, + "loss": 0.5187, + "step": 7258 + }, + { + "epoch": 0.57, + "grad_norm": 1.8281299954911372, + "learning_rate": 4.112561140688545e-06, + "loss": 0.4617, + "step": 7259 + }, + { + "epoch": 0.57, + "grad_norm": 1.7795973261585343, + "learning_rate": 4.111309564841361e-06, + "loss": 0.5197, + "step": 7260 + }, + { + "epoch": 0.57, + "grad_norm": 1.7591658904923522, + "learning_rate": 4.1100580464912845e-06, + "loss": 0.4864, + "step": 7261 + }, + { + "epoch": 0.57, + "grad_norm": 1.5556895708101333, + "learning_rate": 4.108806585719289e-06, + "loss": 0.4281, + "step": 7262 + }, + { + "epoch": 0.57, + "grad_norm": 1.5576692292947365, + "learning_rate": 4.107555182606341e-06, + "loss": 0.4414, + "step": 7263 + }, + { + "epoch": 0.57, + "grad_norm": 1.3711751625582287, + "learning_rate": 4.106303837233404e-06, + "loss": 0.462, + "step": 7264 + }, + { + "epoch": 0.57, + "grad_norm": 1.9369489098015318, + "learning_rate": 4.105052549681443e-06, + "loss": 0.4331, + "step": 7265 + }, + { + "epoch": 0.57, + "grad_norm": 2.14714813316451, + "learning_rate": 4.103801320031407e-06, + "loss": 0.4718, + "step": 7266 + }, + { + "epoch": 0.57, + "grad_norm": 2.7741689964401735, + "learning_rate": 4.1025501483642575e-06, + "loss": 0.4145, + "step": 7267 + }, + { + "epoch": 0.57, + "grad_norm": 1.7222433273465352, + "learning_rate": 4.101299034760936e-06, + "loss": 0.4111, + "step": 7268 + }, + { + "epoch": 0.57, + "grad_norm": 1.8236979486369775, + "learning_rate": 4.100047979302395e-06, + "loss": 0.4717, + "step": 7269 + }, + { + "epoch": 0.57, + "grad_norm": 1.6095616761489393, + "learning_rate": 4.098796982069568e-06, + "loss": 0.4731, + "step": 7270 + }, + { + "epoch": 0.57, + "grad_norm": 1.4960794121352596, + "learning_rate": 4.0975460431434006e-06, + "loss": 0.4733, + "step": 7271 + }, + { + "epoch": 0.57, + "grad_norm": 1.3625963282404576, + "learning_rate": 4.096295162604822e-06, + "loss": 0.4282, + "step": 7272 + }, + { + "epoch": 0.57, + "grad_norm": 1.632710106823323, + "learning_rate": 4.095044340534766e-06, + "loss": 0.5286, + "step": 7273 + }, + { + "epoch": 0.57, + "grad_norm": 1.5714894172128924, + "learning_rate": 4.093793577014156e-06, + "loss": 0.4437, + "step": 7274 + }, + { + "epoch": 0.57, + "grad_norm": 2.2254748193516987, + "learning_rate": 4.0925428721239166e-06, + "loss": 0.4749, + "step": 7275 + }, + { + "epoch": 0.57, + "grad_norm": 1.5507882722707433, + "learning_rate": 4.091292225944965e-06, + "loss": 0.4624, + "step": 7276 + }, + { + "epoch": 0.57, + "grad_norm": 0.5911538084966974, + "learning_rate": 4.090041638558218e-06, + "loss": 0.5021, + "step": 7277 + }, + { + "epoch": 0.57, + "grad_norm": 1.5518925948511184, + "learning_rate": 4.0887911100445885e-06, + "loss": 0.4474, + "step": 7278 + }, + { + "epoch": 0.57, + "grad_norm": 3.164721432401165, + "learning_rate": 4.08754064048498e-06, + "loss": 0.4958, + "step": 7279 + }, + { + "epoch": 0.57, + "grad_norm": 1.568996595220086, + "learning_rate": 4.0862902299603e-06, + "loss": 0.4322, + "step": 7280 + }, + { + "epoch": 0.57, + "grad_norm": 2.41196396552437, + "learning_rate": 4.0850398785514454e-06, + "loss": 0.5127, + "step": 7281 + }, + { + "epoch": 0.57, + "grad_norm": 0.5456935298686468, + "learning_rate": 4.083789586339315e-06, + "loss": 0.4994, + "step": 7282 + }, + { + "epoch": 0.57, + "grad_norm": 1.6836367250278734, + "learning_rate": 4.082539353404799e-06, + "loss": 0.4527, + "step": 7283 + }, + { + "epoch": 0.57, + "grad_norm": 1.683483572937276, + "learning_rate": 4.081289179828787e-06, + "loss": 0.4481, + "step": 7284 + }, + { + "epoch": 0.57, + "grad_norm": 1.5610040995362815, + "learning_rate": 4.080039065692162e-06, + "loss": 0.4865, + "step": 7285 + }, + { + "epoch": 0.57, + "grad_norm": 2.215831774679217, + "learning_rate": 4.0787890110758085e-06, + "loss": 0.4678, + "step": 7286 + }, + { + "epoch": 0.57, + "grad_norm": 1.9827336352012541, + "learning_rate": 4.077539016060598e-06, + "loss": 0.4529, + "step": 7287 + }, + { + "epoch": 0.57, + "grad_norm": 1.725589906867415, + "learning_rate": 4.076289080727408e-06, + "loss": 0.4962, + "step": 7288 + }, + { + "epoch": 0.57, + "grad_norm": 2.996218397553376, + "learning_rate": 4.075039205157107e-06, + "loss": 0.5031, + "step": 7289 + }, + { + "epoch": 0.57, + "grad_norm": 1.886631401910825, + "learning_rate": 4.073789389430557e-06, + "loss": 0.4944, + "step": 7290 + }, + { + "epoch": 0.57, + "grad_norm": 0.5410462303540111, + "learning_rate": 4.072539633628624e-06, + "loss": 0.504, + "step": 7291 + }, + { + "epoch": 0.57, + "grad_norm": 1.5161100302776698, + "learning_rate": 4.071289937832163e-06, + "loss": 0.3853, + "step": 7292 + }, + { + "epoch": 0.57, + "grad_norm": 1.5175882798239, + "learning_rate": 4.0700403021220285e-06, + "loss": 0.4794, + "step": 7293 + }, + { + "epoch": 0.57, + "grad_norm": 1.5602935665775335, + "learning_rate": 4.0687907265790685e-06, + "loss": 0.4728, + "step": 7294 + }, + { + "epoch": 0.57, + "grad_norm": 0.5668635839465165, + "learning_rate": 4.067541211284131e-06, + "loss": 0.469, + "step": 7295 + }, + { + "epoch": 0.57, + "grad_norm": 1.552276164470864, + "learning_rate": 4.066291756318058e-06, + "loss": 0.4492, + "step": 7296 + }, + { + "epoch": 0.57, + "grad_norm": 1.9416798460456859, + "learning_rate": 4.065042361761687e-06, + "loss": 0.4423, + "step": 7297 + }, + { + "epoch": 0.57, + "grad_norm": 1.8885804351222533, + "learning_rate": 4.063793027695851e-06, + "loss": 0.4691, + "step": 7298 + }, + { + "epoch": 0.57, + "grad_norm": 0.562851289644021, + "learning_rate": 4.06254375420138e-06, + "loss": 0.4952, + "step": 7299 + }, + { + "epoch": 0.57, + "grad_norm": 1.5749357483899877, + "learning_rate": 4.0612945413591055e-06, + "loss": 0.4379, + "step": 7300 + }, + { + "epoch": 0.57, + "grad_norm": 2.4771389656381997, + "learning_rate": 4.060045389249843e-06, + "loss": 0.4289, + "step": 7301 + }, + { + "epoch": 0.57, + "grad_norm": 1.6126208812506262, + "learning_rate": 4.058796297954417e-06, + "loss": 0.44, + "step": 7302 + }, + { + "epoch": 0.57, + "grad_norm": 2.0144264387376807, + "learning_rate": 4.057547267553636e-06, + "loss": 0.4565, + "step": 7303 + }, + { + "epoch": 0.57, + "grad_norm": 1.8842739185812187, + "learning_rate": 4.056298298128318e-06, + "loss": 0.4255, + "step": 7304 + }, + { + "epoch": 0.57, + "grad_norm": 0.5603887924998544, + "learning_rate": 4.055049389759261e-06, + "loss": 0.5003, + "step": 7305 + }, + { + "epoch": 0.57, + "grad_norm": 1.3477901475875183, + "learning_rate": 4.053800542527277e-06, + "loss": 0.4605, + "step": 7306 + }, + { + "epoch": 0.57, + "grad_norm": 2.6678273227006093, + "learning_rate": 4.052551756513156e-06, + "loss": 0.4717, + "step": 7307 + }, + { + "epoch": 0.57, + "grad_norm": 1.7958078206603296, + "learning_rate": 4.051303031797699e-06, + "loss": 0.4259, + "step": 7308 + }, + { + "epoch": 0.57, + "grad_norm": 1.9493410891755092, + "learning_rate": 4.050054368461695e-06, + "loss": 0.483, + "step": 7309 + }, + { + "epoch": 0.57, + "grad_norm": 1.59653465259678, + "learning_rate": 4.048805766585929e-06, + "loss": 0.4676, + "step": 7310 + }, + { + "epoch": 0.57, + "grad_norm": 1.6114293408826292, + "learning_rate": 4.047557226251187e-06, + "loss": 0.4959, + "step": 7311 + }, + { + "epoch": 0.57, + "grad_norm": 2.8201546935081434, + "learning_rate": 4.0463087475382464e-06, + "loss": 0.4607, + "step": 7312 + }, + { + "epoch": 0.57, + "grad_norm": 2.173987259415849, + "learning_rate": 4.045060330527882e-06, + "loss": 0.4955, + "step": 7313 + }, + { + "epoch": 0.57, + "grad_norm": 0.5549504089358277, + "learning_rate": 4.043811975300865e-06, + "loss": 0.4701, + "step": 7314 + }, + { + "epoch": 0.57, + "grad_norm": 1.8694713696144272, + "learning_rate": 4.0425636819379635e-06, + "loss": 0.4317, + "step": 7315 + }, + { + "epoch": 0.57, + "grad_norm": 2.2687829176791734, + "learning_rate": 4.041315450519937e-06, + "loss": 0.4527, + "step": 7316 + }, + { + "epoch": 0.57, + "grad_norm": 0.5552242515860327, + "learning_rate": 4.0400672811275495e-06, + "loss": 0.5012, + "step": 7317 + }, + { + "epoch": 0.57, + "grad_norm": 1.4604753482531572, + "learning_rate": 4.038819173841551e-06, + "loss": 0.4958, + "step": 7318 + }, + { + "epoch": 0.57, + "grad_norm": 1.6604820135143221, + "learning_rate": 4.037571128742696e-06, + "loss": 0.4358, + "step": 7319 + }, + { + "epoch": 0.57, + "grad_norm": 3.7078393657615045, + "learning_rate": 4.036323145911728e-06, + "loss": 0.4584, + "step": 7320 + }, + { + "epoch": 0.57, + "grad_norm": 1.4537142441522966, + "learning_rate": 4.0350752254293914e-06, + "loss": 0.4108, + "step": 7321 + }, + { + "epoch": 0.58, + "grad_norm": 2.1624037004743024, + "learning_rate": 4.033827367376428e-06, + "loss": 0.516, + "step": 7322 + }, + { + "epoch": 0.58, + "grad_norm": 1.6426444613575368, + "learning_rate": 4.032579571833568e-06, + "loss": 0.4209, + "step": 7323 + }, + { + "epoch": 0.58, + "grad_norm": 1.7029471153652824, + "learning_rate": 4.0313318388815465e-06, + "loss": 0.5057, + "step": 7324 + }, + { + "epoch": 0.58, + "grad_norm": 1.991998869316725, + "learning_rate": 4.030084168601086e-06, + "loss": 0.485, + "step": 7325 + }, + { + "epoch": 0.58, + "grad_norm": 2.7997079900013677, + "learning_rate": 4.028836561072912e-06, + "loss": 0.4812, + "step": 7326 + }, + { + "epoch": 0.58, + "grad_norm": 2.306296173867573, + "learning_rate": 4.027589016377741e-06, + "loss": 0.4613, + "step": 7327 + }, + { + "epoch": 0.58, + "grad_norm": 1.9150623181305038, + "learning_rate": 4.02634153459629e-06, + "loss": 0.4577, + "step": 7328 + }, + { + "epoch": 0.58, + "grad_norm": 2.1240719616016066, + "learning_rate": 4.025094115809267e-06, + "loss": 0.447, + "step": 7329 + }, + { + "epoch": 0.58, + "grad_norm": 1.3433655595156815, + "learning_rate": 4.02384676009738e-06, + "loss": 0.4225, + "step": 7330 + }, + { + "epoch": 0.58, + "grad_norm": 2.1588471871939694, + "learning_rate": 4.0225994675413296e-06, + "loss": 0.5034, + "step": 7331 + }, + { + "epoch": 0.58, + "grad_norm": 2.199053713949702, + "learning_rate": 4.021352238221815e-06, + "loss": 0.477, + "step": 7332 + }, + { + "epoch": 0.58, + "grad_norm": 1.3233652467101629, + "learning_rate": 4.020105072219532e-06, + "loss": 0.4709, + "step": 7333 + }, + { + "epoch": 0.58, + "grad_norm": 1.4812575063998095, + "learning_rate": 4.018857969615166e-06, + "loss": 0.4569, + "step": 7334 + }, + { + "epoch": 0.58, + "grad_norm": 1.707740175299555, + "learning_rate": 4.01761093048941e-06, + "loss": 0.4825, + "step": 7335 + }, + { + "epoch": 0.58, + "grad_norm": 1.7652809021015952, + "learning_rate": 4.016363954922937e-06, + "loss": 0.4721, + "step": 7336 + }, + { + "epoch": 0.58, + "grad_norm": 0.5401002420630759, + "learning_rate": 4.015117042996434e-06, + "loss": 0.4763, + "step": 7337 + }, + { + "epoch": 0.58, + "grad_norm": 1.6298520674197519, + "learning_rate": 4.013870194790566e-06, + "loss": 0.4381, + "step": 7338 + }, + { + "epoch": 0.58, + "grad_norm": 1.394556113438474, + "learning_rate": 4.01262341038601e-06, + "loss": 0.4272, + "step": 7339 + }, + { + "epoch": 0.58, + "grad_norm": 1.8679451800133462, + "learning_rate": 4.011376689863425e-06, + "loss": 0.4894, + "step": 7340 + }, + { + "epoch": 0.58, + "grad_norm": 0.5649150563107403, + "learning_rate": 4.010130033303477e-06, + "loss": 0.508, + "step": 7341 + }, + { + "epoch": 0.58, + "grad_norm": 1.6952363652683937, + "learning_rate": 4.008883440786817e-06, + "loss": 0.453, + "step": 7342 + }, + { + "epoch": 0.58, + "grad_norm": 1.806630162561481, + "learning_rate": 4.007636912394105e-06, + "loss": 0.427, + "step": 7343 + }, + { + "epoch": 0.58, + "grad_norm": 1.5493566111932915, + "learning_rate": 4.006390448205987e-06, + "loss": 0.4371, + "step": 7344 + }, + { + "epoch": 0.58, + "grad_norm": 1.534355913169727, + "learning_rate": 4.005144048303105e-06, + "loss": 0.477, + "step": 7345 + }, + { + "epoch": 0.58, + "grad_norm": 1.5141934498753955, + "learning_rate": 4.003897712766104e-06, + "loss": 0.4321, + "step": 7346 + }, + { + "epoch": 0.58, + "grad_norm": 2.1092214984871536, + "learning_rate": 4.0026514416756165e-06, + "loss": 0.4149, + "step": 7347 + }, + { + "epoch": 0.58, + "grad_norm": 1.5744244289322404, + "learning_rate": 4.001405235112277e-06, + "loss": 0.4496, + "step": 7348 + }, + { + "epoch": 0.58, + "grad_norm": 1.7818362511447288, + "learning_rate": 4.000159093156711e-06, + "loss": 0.437, + "step": 7349 + }, + { + "epoch": 0.58, + "grad_norm": 1.4416057643849776, + "learning_rate": 3.998913015889545e-06, + "loss": 0.4822, + "step": 7350 + }, + { + "epoch": 0.58, + "grad_norm": 2.06404069743296, + "learning_rate": 3.997667003391397e-06, + "loss": 0.4986, + "step": 7351 + }, + { + "epoch": 0.58, + "grad_norm": 2.3856399361748397, + "learning_rate": 3.996421055742883e-06, + "loss": 0.4414, + "step": 7352 + }, + { + "epoch": 0.58, + "grad_norm": 1.9298855227941383, + "learning_rate": 3.995175173024612e-06, + "loss": 0.3937, + "step": 7353 + }, + { + "epoch": 0.58, + "grad_norm": 1.6264945281713359, + "learning_rate": 3.993929355317195e-06, + "loss": 0.4737, + "step": 7354 + }, + { + "epoch": 0.58, + "grad_norm": 1.5806961659514345, + "learning_rate": 3.992683602701231e-06, + "loss": 0.523, + "step": 7355 + }, + { + "epoch": 0.58, + "grad_norm": 1.4466296892700827, + "learning_rate": 3.99143791525732e-06, + "loss": 0.4494, + "step": 7356 + }, + { + "epoch": 0.58, + "grad_norm": 1.745918551511774, + "learning_rate": 3.990192293066058e-06, + "loss": 0.4591, + "step": 7357 + }, + { + "epoch": 0.58, + "grad_norm": 2.0836660489384053, + "learning_rate": 3.988946736208032e-06, + "loss": 0.4703, + "step": 7358 + }, + { + "epoch": 0.58, + "grad_norm": 1.736680521513632, + "learning_rate": 3.987701244763832e-06, + "loss": 0.4602, + "step": 7359 + }, + { + "epoch": 0.58, + "grad_norm": 0.5655580771775796, + "learning_rate": 3.986455818814035e-06, + "loss": 0.5003, + "step": 7360 + }, + { + "epoch": 0.58, + "grad_norm": 1.9039211818376327, + "learning_rate": 3.985210458439223e-06, + "loss": 0.4334, + "step": 7361 + }, + { + "epoch": 0.58, + "grad_norm": 2.3885942210133853, + "learning_rate": 3.983965163719965e-06, + "loss": 0.4598, + "step": 7362 + }, + { + "epoch": 0.58, + "grad_norm": 1.9334385763128896, + "learning_rate": 3.982719934736832e-06, + "loss": 0.4344, + "step": 7363 + }, + { + "epoch": 0.58, + "grad_norm": 2.476164058384664, + "learning_rate": 3.981474771570389e-06, + "loss": 0.4474, + "step": 7364 + }, + { + "epoch": 0.58, + "grad_norm": 0.5551385182157621, + "learning_rate": 3.980229674301195e-06, + "loss": 0.4983, + "step": 7365 + }, + { + "epoch": 0.58, + "grad_norm": 1.3820578936824226, + "learning_rate": 3.978984643009808e-06, + "loss": 0.409, + "step": 7366 + }, + { + "epoch": 0.58, + "grad_norm": 2.07386479517845, + "learning_rate": 3.977739677776777e-06, + "loss": 0.4401, + "step": 7367 + }, + { + "epoch": 0.58, + "grad_norm": 1.6337458859869038, + "learning_rate": 3.976494778682654e-06, + "loss": 0.4322, + "step": 7368 + }, + { + "epoch": 0.58, + "grad_norm": 1.6588109571780973, + "learning_rate": 3.975249945807978e-06, + "loss": 0.4139, + "step": 7369 + }, + { + "epoch": 0.58, + "grad_norm": 1.8717544611136825, + "learning_rate": 3.974005179233292e-06, + "loss": 0.453, + "step": 7370 + }, + { + "epoch": 0.58, + "grad_norm": 1.92536532668072, + "learning_rate": 3.972760479039126e-06, + "loss": 0.4912, + "step": 7371 + }, + { + "epoch": 0.58, + "grad_norm": 1.680482447478772, + "learning_rate": 3.971515845306017e-06, + "loss": 0.4624, + "step": 7372 + }, + { + "epoch": 0.58, + "grad_norm": 0.5547983591456187, + "learning_rate": 3.9702712781144826e-06, + "loss": 0.4744, + "step": 7373 + }, + { + "epoch": 0.58, + "grad_norm": 1.99321163940407, + "learning_rate": 3.969026777545054e-06, + "loss": 0.4546, + "step": 7374 + }, + { + "epoch": 0.58, + "grad_norm": 0.5559816123711208, + "learning_rate": 3.96778234367824e-06, + "loss": 0.481, + "step": 7375 + }, + { + "epoch": 0.58, + "grad_norm": 1.3959258871192257, + "learning_rate": 3.966537976594563e-06, + "loss": 0.4817, + "step": 7376 + }, + { + "epoch": 0.58, + "grad_norm": 1.5223533166703613, + "learning_rate": 3.965293676374523e-06, + "loss": 0.4529, + "step": 7377 + }, + { + "epoch": 0.58, + "grad_norm": 2.094086026677986, + "learning_rate": 3.964049443098629e-06, + "loss": 0.4648, + "step": 7378 + }, + { + "epoch": 0.58, + "grad_norm": 0.5606546078042821, + "learning_rate": 3.962805276847382e-06, + "loss": 0.4859, + "step": 7379 + }, + { + "epoch": 0.58, + "grad_norm": 0.5491341187675134, + "learning_rate": 3.961561177701276e-06, + "loss": 0.4922, + "step": 7380 + }, + { + "epoch": 0.58, + "grad_norm": 1.8849146384387017, + "learning_rate": 3.960317145740805e-06, + "loss": 0.5117, + "step": 7381 + }, + { + "epoch": 0.58, + "grad_norm": 1.6206771971854343, + "learning_rate": 3.959073181046452e-06, + "loss": 0.4358, + "step": 7382 + }, + { + "epoch": 0.58, + "grad_norm": 1.686896264527124, + "learning_rate": 3.957829283698705e-06, + "loss": 0.437, + "step": 7383 + }, + { + "epoch": 0.58, + "grad_norm": 1.5263655166412793, + "learning_rate": 3.956585453778038e-06, + "loss": 0.4709, + "step": 7384 + }, + { + "epoch": 0.58, + "grad_norm": 1.7984144835085387, + "learning_rate": 3.955341691364929e-06, + "loss": 0.4568, + "step": 7385 + }, + { + "epoch": 0.58, + "grad_norm": 1.9101647841574672, + "learning_rate": 3.9540979965398445e-06, + "loss": 0.4686, + "step": 7386 + }, + { + "epoch": 0.58, + "grad_norm": 1.7023352465794623, + "learning_rate": 3.952854369383253e-06, + "loss": 0.464, + "step": 7387 + }, + { + "epoch": 0.58, + "grad_norm": 1.5672244174543317, + "learning_rate": 3.951610809975613e-06, + "loss": 0.4684, + "step": 7388 + }, + { + "epoch": 0.58, + "grad_norm": 1.516838826504019, + "learning_rate": 3.950367318397379e-06, + "loss": 0.4615, + "step": 7389 + }, + { + "epoch": 0.58, + "grad_norm": 5.355924837053419, + "learning_rate": 3.9491238947290104e-06, + "loss": 0.4884, + "step": 7390 + }, + { + "epoch": 0.58, + "grad_norm": 1.8338954509615495, + "learning_rate": 3.94788053905095e-06, + "loss": 0.4822, + "step": 7391 + }, + { + "epoch": 0.58, + "grad_norm": 2.0904145606678006, + "learning_rate": 3.946637251443643e-06, + "loss": 0.4648, + "step": 7392 + }, + { + "epoch": 0.58, + "grad_norm": 2.0547077805103315, + "learning_rate": 3.945394031987527e-06, + "loss": 0.4547, + "step": 7393 + }, + { + "epoch": 0.58, + "grad_norm": 1.6762631939562462, + "learning_rate": 3.944150880763039e-06, + "loss": 0.472, + "step": 7394 + }, + { + "epoch": 0.58, + "grad_norm": 0.5963394429899332, + "learning_rate": 3.942907797850606e-06, + "loss": 0.4875, + "step": 7395 + }, + { + "epoch": 0.58, + "grad_norm": 1.7565173718106364, + "learning_rate": 3.941664783330657e-06, + "loss": 0.4729, + "step": 7396 + }, + { + "epoch": 0.58, + "grad_norm": 1.6259672795588251, + "learning_rate": 3.94042183728361e-06, + "loss": 0.4935, + "step": 7397 + }, + { + "epoch": 0.58, + "grad_norm": 1.5656003893098618, + "learning_rate": 3.939178959789887e-06, + "loss": 0.4199, + "step": 7398 + }, + { + "epoch": 0.58, + "grad_norm": 1.407504884006315, + "learning_rate": 3.937936150929895e-06, + "loss": 0.4535, + "step": 7399 + }, + { + "epoch": 0.58, + "grad_norm": 1.4894989002334302, + "learning_rate": 3.936693410784045e-06, + "loss": 0.4515, + "step": 7400 + }, + { + "epoch": 0.58, + "grad_norm": 1.8397592191244374, + "learning_rate": 3.935450739432742e-06, + "loss": 0.4618, + "step": 7401 + }, + { + "epoch": 0.58, + "grad_norm": 0.5360415190509557, + "learning_rate": 3.934208136956383e-06, + "loss": 0.4837, + "step": 7402 + }, + { + "epoch": 0.58, + "grad_norm": 0.5440287464466432, + "learning_rate": 3.932965603435362e-06, + "loss": 0.4779, + "step": 7403 + }, + { + "epoch": 0.58, + "grad_norm": 1.705848416704902, + "learning_rate": 3.93172313895007e-06, + "loss": 0.4531, + "step": 7404 + }, + { + "epoch": 0.58, + "grad_norm": 1.530021351918056, + "learning_rate": 3.930480743580897e-06, + "loss": 0.5052, + "step": 7405 + }, + { + "epoch": 0.58, + "grad_norm": 1.759331779180002, + "learning_rate": 3.929238417408218e-06, + "loss": 0.5238, + "step": 7406 + }, + { + "epoch": 0.58, + "grad_norm": 1.8854550609759009, + "learning_rate": 3.927996160512414e-06, + "loss": 0.4621, + "step": 7407 + }, + { + "epoch": 0.58, + "grad_norm": 1.5450834459604268, + "learning_rate": 3.926753972973854e-06, + "loss": 0.4596, + "step": 7408 + }, + { + "epoch": 0.58, + "grad_norm": 0.542596227896359, + "learning_rate": 3.925511854872911e-06, + "loss": 0.4796, + "step": 7409 + }, + { + "epoch": 0.58, + "grad_norm": 1.7769510210967858, + "learning_rate": 3.9242698062899426e-06, + "loss": 0.4768, + "step": 7410 + }, + { + "epoch": 0.58, + "grad_norm": 2.2841289599112335, + "learning_rate": 3.923027827305311e-06, + "loss": 0.4545, + "step": 7411 + }, + { + "epoch": 0.58, + "grad_norm": 2.476618576837062, + "learning_rate": 3.921785917999372e-06, + "loss": 0.4665, + "step": 7412 + }, + { + "epoch": 0.58, + "grad_norm": 1.4013387474006558, + "learning_rate": 3.9205440784524715e-06, + "loss": 0.4267, + "step": 7413 + }, + { + "epoch": 0.58, + "grad_norm": 0.5322642253464693, + "learning_rate": 3.919302308744958e-06, + "loss": 0.4807, + "step": 7414 + }, + { + "epoch": 0.58, + "grad_norm": 1.845680264830862, + "learning_rate": 3.918060608957172e-06, + "loss": 0.46, + "step": 7415 + }, + { + "epoch": 0.58, + "grad_norm": 2.0994706228664586, + "learning_rate": 3.916818979169448e-06, + "loss": 0.4633, + "step": 7416 + }, + { + "epoch": 0.58, + "grad_norm": 0.5313591808671198, + "learning_rate": 3.91557741946212e-06, + "loss": 0.4793, + "step": 7417 + }, + { + "epoch": 0.58, + "grad_norm": 0.5704990884260895, + "learning_rate": 3.914335929915514e-06, + "loss": 0.4725, + "step": 7418 + }, + { + "epoch": 0.58, + "grad_norm": 4.378855037092292, + "learning_rate": 3.913094510609952e-06, + "loss": 0.4749, + "step": 7419 + }, + { + "epoch": 0.58, + "grad_norm": 2.708631827529763, + "learning_rate": 3.911853161625756e-06, + "loss": 0.4787, + "step": 7420 + }, + { + "epoch": 0.58, + "grad_norm": 2.340800081032865, + "learning_rate": 3.9106118830432346e-06, + "loss": 0.4954, + "step": 7421 + }, + { + "epoch": 0.58, + "grad_norm": 1.7729519923980588, + "learning_rate": 3.909370674942699e-06, + "loss": 0.5125, + "step": 7422 + }, + { + "epoch": 0.58, + "grad_norm": 6.615303762388371, + "learning_rate": 3.908129537404456e-06, + "loss": 0.4645, + "step": 7423 + }, + { + "epoch": 0.58, + "grad_norm": 1.9404998545379093, + "learning_rate": 3.906888470508801e-06, + "loss": 0.4911, + "step": 7424 + }, + { + "epoch": 0.58, + "grad_norm": 1.634515374564921, + "learning_rate": 3.9056474743360345e-06, + "loss": 0.5132, + "step": 7425 + }, + { + "epoch": 0.58, + "grad_norm": 1.596996189807429, + "learning_rate": 3.904406548966443e-06, + "loss": 0.4815, + "step": 7426 + }, + { + "epoch": 0.58, + "grad_norm": 1.688113542860332, + "learning_rate": 3.903165694480316e-06, + "loss": 0.443, + "step": 7427 + }, + { + "epoch": 0.58, + "grad_norm": 1.5774763756783123, + "learning_rate": 3.901924910957934e-06, + "loss": 0.4195, + "step": 7428 + }, + { + "epoch": 0.58, + "grad_norm": 0.5793112422412106, + "learning_rate": 3.9006841984795725e-06, + "loss": 0.4919, + "step": 7429 + }, + { + "epoch": 0.58, + "grad_norm": 2.031415912084839, + "learning_rate": 3.899443557125506e-06, + "loss": 0.4767, + "step": 7430 + }, + { + "epoch": 0.58, + "grad_norm": 1.5501852732548653, + "learning_rate": 3.898202986976002e-06, + "loss": 0.444, + "step": 7431 + }, + { + "epoch": 0.58, + "grad_norm": 1.5932210997781622, + "learning_rate": 3.896962488111323e-06, + "loss": 0.5189, + "step": 7432 + }, + { + "epoch": 0.58, + "grad_norm": 1.5147729157791983, + "learning_rate": 3.895722060611728e-06, + "loss": 0.4992, + "step": 7433 + }, + { + "epoch": 0.58, + "grad_norm": 1.5939284624807348, + "learning_rate": 3.894481704557471e-06, + "loss": 0.4616, + "step": 7434 + }, + { + "epoch": 0.58, + "grad_norm": 2.2330129640225005, + "learning_rate": 3.8932414200288e-06, + "loss": 0.4624, + "step": 7435 + }, + { + "epoch": 0.58, + "grad_norm": 1.6348468329315848, + "learning_rate": 3.892001207105965e-06, + "loss": 0.4238, + "step": 7436 + }, + { + "epoch": 0.58, + "grad_norm": 1.8384944408194006, + "learning_rate": 3.890761065869199e-06, + "loss": 0.4268, + "step": 7437 + }, + { + "epoch": 0.58, + "grad_norm": 1.7689207505901194, + "learning_rate": 3.889520996398743e-06, + "loss": 0.4561, + "step": 7438 + }, + { + "epoch": 0.58, + "grad_norm": 1.9721861800982003, + "learning_rate": 3.888280998774823e-06, + "loss": 0.4631, + "step": 7439 + }, + { + "epoch": 0.58, + "grad_norm": 2.351536012948005, + "learning_rate": 3.887041073077672e-06, + "loss": 0.4422, + "step": 7440 + }, + { + "epoch": 0.58, + "grad_norm": 2.983599362758474, + "learning_rate": 3.8858012193875044e-06, + "loss": 0.441, + "step": 7441 + }, + { + "epoch": 0.58, + "grad_norm": 2.0298333927005463, + "learning_rate": 3.884561437784544e-06, + "loss": 0.441, + "step": 7442 + }, + { + "epoch": 0.58, + "grad_norm": 1.878374278875591, + "learning_rate": 3.883321728348995e-06, + "loss": 0.4953, + "step": 7443 + }, + { + "epoch": 0.58, + "grad_norm": 1.405790356535479, + "learning_rate": 3.8820820911610735e-06, + "loss": 0.4719, + "step": 7444 + }, + { + "epoch": 0.58, + "grad_norm": 0.6118297026644891, + "learning_rate": 3.880842526300975e-06, + "loss": 0.4844, + "step": 7445 + }, + { + "epoch": 0.58, + "grad_norm": 3.142676041920264, + "learning_rate": 3.879603033848902e-06, + "loss": 0.4014, + "step": 7446 + }, + { + "epoch": 0.58, + "grad_norm": 1.5857458353346452, + "learning_rate": 3.878363613885048e-06, + "loss": 0.435, + "step": 7447 + }, + { + "epoch": 0.58, + "grad_norm": 0.552226407449032, + "learning_rate": 3.877124266489599e-06, + "loss": 0.4859, + "step": 7448 + }, + { + "epoch": 0.59, + "grad_norm": 1.8333780580199772, + "learning_rate": 3.875884991742744e-06, + "loss": 0.4587, + "step": 7449 + }, + { + "epoch": 0.59, + "grad_norm": 2.32552714330887, + "learning_rate": 3.874645789724656e-06, + "loss": 0.4819, + "step": 7450 + }, + { + "epoch": 0.59, + "grad_norm": 1.3375572939013214, + "learning_rate": 3.873406660515516e-06, + "loss": 0.4838, + "step": 7451 + }, + { + "epoch": 0.59, + "grad_norm": 1.7782175147692776, + "learning_rate": 3.872167604195491e-06, + "loss": 0.4877, + "step": 7452 + }, + { + "epoch": 0.59, + "grad_norm": 3.3954434491440453, + "learning_rate": 3.870928620844746e-06, + "loss": 0.5154, + "step": 7453 + }, + { + "epoch": 0.59, + "grad_norm": 1.715273234288327, + "learning_rate": 3.869689710543442e-06, + "loss": 0.4448, + "step": 7454 + }, + { + "epoch": 0.59, + "grad_norm": 0.556371677674079, + "learning_rate": 3.868450873371735e-06, + "loss": 0.4845, + "step": 7455 + }, + { + "epoch": 0.59, + "grad_norm": 1.7847706774020777, + "learning_rate": 3.8672121094097765e-06, + "loss": 0.4572, + "step": 7456 + }, + { + "epoch": 0.59, + "grad_norm": 1.9577540450012663, + "learning_rate": 3.865973418737711e-06, + "loss": 0.513, + "step": 7457 + }, + { + "epoch": 0.59, + "grad_norm": 1.7242423797310757, + "learning_rate": 3.864734801435686e-06, + "loss": 0.4845, + "step": 7458 + }, + { + "epoch": 0.59, + "grad_norm": 1.7153143557195472, + "learning_rate": 3.86349625758383e-06, + "loss": 0.4653, + "step": 7459 + }, + { + "epoch": 0.59, + "grad_norm": 1.8154174526227398, + "learning_rate": 3.862257787262283e-06, + "loss": 0.3947, + "step": 7460 + }, + { + "epoch": 0.59, + "grad_norm": 1.6110320094880846, + "learning_rate": 3.861019390551165e-06, + "loss": 0.4704, + "step": 7461 + }, + { + "epoch": 0.59, + "grad_norm": 2.159400566809231, + "learning_rate": 3.859781067530605e-06, + "loss": 0.4998, + "step": 7462 + }, + { + "epoch": 0.59, + "grad_norm": 2.1453356253637037, + "learning_rate": 3.858542818280717e-06, + "loss": 0.4761, + "step": 7463 + }, + { + "epoch": 0.59, + "grad_norm": 2.4710743701490467, + "learning_rate": 3.857304642881617e-06, + "loss": 0.4898, + "step": 7464 + }, + { + "epoch": 0.59, + "grad_norm": 0.5577366553377581, + "learning_rate": 3.85606654141341e-06, + "loss": 0.4664, + "step": 7465 + }, + { + "epoch": 0.59, + "grad_norm": 1.8171179796873802, + "learning_rate": 3.854828513956204e-06, + "loss": 0.4188, + "step": 7466 + }, + { + "epoch": 0.59, + "grad_norm": 1.5071250447056637, + "learning_rate": 3.853590560590093e-06, + "loss": 0.5, + "step": 7467 + }, + { + "epoch": 0.59, + "grad_norm": 1.5365484925957675, + "learning_rate": 3.8523526813951726e-06, + "loss": 0.5401, + "step": 7468 + }, + { + "epoch": 0.59, + "grad_norm": 1.688150321189963, + "learning_rate": 3.851114876451533e-06, + "loss": 0.4435, + "step": 7469 + }, + { + "epoch": 0.59, + "grad_norm": 1.7088859417091902, + "learning_rate": 3.849877145839258e-06, + "loss": 0.486, + "step": 7470 + }, + { + "epoch": 0.59, + "grad_norm": 1.8653941969160455, + "learning_rate": 3.8486394896384285e-06, + "loss": 0.4473, + "step": 7471 + }, + { + "epoch": 0.59, + "grad_norm": 1.5045525801628634, + "learning_rate": 3.8474019079291156e-06, + "loss": 0.4758, + "step": 7472 + }, + { + "epoch": 0.59, + "grad_norm": 0.5475195448470077, + "learning_rate": 3.846164400791393e-06, + "loss": 0.4661, + "step": 7473 + }, + { + "epoch": 0.59, + "grad_norm": 1.7219368540708582, + "learning_rate": 3.844926968305322e-06, + "loss": 0.4266, + "step": 7474 + }, + { + "epoch": 0.59, + "grad_norm": 1.5260964229342389, + "learning_rate": 3.843689610550969e-06, + "loss": 0.4828, + "step": 7475 + }, + { + "epoch": 0.59, + "grad_norm": 0.5768506244730163, + "learning_rate": 3.842452327608382e-06, + "loss": 0.478, + "step": 7476 + }, + { + "epoch": 0.59, + "grad_norm": 0.5655196970847991, + "learning_rate": 3.841215119557619e-06, + "loss": 0.4644, + "step": 7477 + }, + { + "epoch": 0.59, + "grad_norm": 1.7569177427503508, + "learning_rate": 3.839977986478718e-06, + "loss": 0.4411, + "step": 7478 + }, + { + "epoch": 0.59, + "grad_norm": 17.69881687539802, + "learning_rate": 3.838740928451724e-06, + "loss": 0.493, + "step": 7479 + }, + { + "epoch": 0.59, + "grad_norm": 0.5822321813869796, + "learning_rate": 3.837503945556675e-06, + "loss": 0.4742, + "step": 7480 + }, + { + "epoch": 0.59, + "grad_norm": 3.416010237076649, + "learning_rate": 3.836267037873598e-06, + "loss": 0.4793, + "step": 7481 + }, + { + "epoch": 0.59, + "grad_norm": 1.941841473093685, + "learning_rate": 3.835030205482523e-06, + "loss": 0.4877, + "step": 7482 + }, + { + "epoch": 0.59, + "grad_norm": 2.4210351094703912, + "learning_rate": 3.833793448463469e-06, + "loss": 0.4359, + "step": 7483 + }, + { + "epoch": 0.59, + "grad_norm": 1.7215623875315207, + "learning_rate": 3.8325567668964535e-06, + "loss": 0.4537, + "step": 7484 + }, + { + "epoch": 0.59, + "grad_norm": 1.6726611613872002, + "learning_rate": 3.831320160861488e-06, + "loss": 0.4784, + "step": 7485 + }, + { + "epoch": 0.59, + "grad_norm": 2.8718943853501355, + "learning_rate": 3.830083630438579e-06, + "loss": 0.4971, + "step": 7486 + }, + { + "epoch": 0.59, + "grad_norm": 0.5493014932106179, + "learning_rate": 3.828847175707729e-06, + "loss": 0.4819, + "step": 7487 + }, + { + "epoch": 0.59, + "grad_norm": 1.484999202973541, + "learning_rate": 3.827610796748935e-06, + "loss": 0.4788, + "step": 7488 + }, + { + "epoch": 0.59, + "grad_norm": 1.8519129174780453, + "learning_rate": 3.826374493642187e-06, + "loss": 0.4409, + "step": 7489 + }, + { + "epoch": 0.59, + "grad_norm": 1.3662615260466489, + "learning_rate": 3.825138266467474e-06, + "loss": 0.4509, + "step": 7490 + }, + { + "epoch": 0.59, + "grad_norm": 2.390835430706696, + "learning_rate": 3.823902115304781e-06, + "loss": 0.5058, + "step": 7491 + }, + { + "epoch": 0.59, + "grad_norm": 1.8684492377471882, + "learning_rate": 3.8226660402340796e-06, + "loss": 0.4868, + "step": 7492 + }, + { + "epoch": 0.59, + "grad_norm": 0.5466470788148817, + "learning_rate": 3.821430041335349e-06, + "loss": 0.472, + "step": 7493 + }, + { + "epoch": 0.59, + "grad_norm": 1.5140001405143653, + "learning_rate": 3.820194118688549e-06, + "loss": 0.4725, + "step": 7494 + }, + { + "epoch": 0.59, + "grad_norm": 0.5908308800759554, + "learning_rate": 3.818958272373649e-06, + "loss": 0.4996, + "step": 7495 + }, + { + "epoch": 0.59, + "grad_norm": 2.0857593220473354, + "learning_rate": 3.817722502470601e-06, + "loss": 0.4328, + "step": 7496 + }, + { + "epoch": 0.59, + "grad_norm": 0.552885503155174, + "learning_rate": 3.816486809059363e-06, + "loss": 0.4837, + "step": 7497 + }, + { + "epoch": 0.59, + "grad_norm": 1.7835564762817329, + "learning_rate": 3.8152511922198784e-06, + "loss": 0.4516, + "step": 7498 + }, + { + "epoch": 0.59, + "grad_norm": 0.5387163079151308, + "learning_rate": 3.8140156520320935e-06, + "loss": 0.4789, + "step": 7499 + }, + { + "epoch": 0.59, + "grad_norm": 1.7272360991393496, + "learning_rate": 3.812780188575943e-06, + "loss": 0.4514, + "step": 7500 + }, + { + "epoch": 0.59, + "grad_norm": 7.6382759784888545, + "learning_rate": 3.8115448019313627e-06, + "loss": 0.4662, + "step": 7501 + }, + { + "epoch": 0.59, + "grad_norm": 1.8987028427705963, + "learning_rate": 3.810309492178278e-06, + "loss": 0.417, + "step": 7502 + }, + { + "epoch": 0.59, + "grad_norm": 1.7446791221552287, + "learning_rate": 3.809074259396612e-06, + "loss": 0.451, + "step": 7503 + }, + { + "epoch": 0.59, + "grad_norm": 1.4492615465546963, + "learning_rate": 3.8078391036662853e-06, + "loss": 0.4619, + "step": 7504 + }, + { + "epoch": 0.59, + "grad_norm": 1.495003964791977, + "learning_rate": 3.806604025067208e-06, + "loss": 0.4099, + "step": 7505 + }, + { + "epoch": 0.59, + "grad_norm": 1.8006795206588757, + "learning_rate": 3.80536902367929e-06, + "loss": 0.437, + "step": 7506 + }, + { + "epoch": 0.59, + "grad_norm": 1.6269746363166553, + "learning_rate": 3.804134099582432e-06, + "loss": 0.4218, + "step": 7507 + }, + { + "epoch": 0.59, + "grad_norm": 1.5231938858949519, + "learning_rate": 3.8028992528565353e-06, + "loss": 0.4254, + "step": 7508 + }, + { + "epoch": 0.59, + "grad_norm": 1.5450010685127566, + "learning_rate": 3.801664483581488e-06, + "loss": 0.4586, + "step": 7509 + }, + { + "epoch": 0.59, + "grad_norm": 2.043629432296741, + "learning_rate": 3.8004297918371847e-06, + "loss": 0.4135, + "step": 7510 + }, + { + "epoch": 0.59, + "grad_norm": 2.0453349676474017, + "learning_rate": 3.799195177703501e-06, + "loss": 0.4397, + "step": 7511 + }, + { + "epoch": 0.59, + "grad_norm": 1.9554347083676054, + "learning_rate": 3.797960641260322e-06, + "loss": 0.4888, + "step": 7512 + }, + { + "epoch": 0.59, + "grad_norm": 0.546082363924803, + "learning_rate": 3.7967261825875134e-06, + "loss": 0.4887, + "step": 7513 + }, + { + "epoch": 0.59, + "grad_norm": 2.3524054125950715, + "learning_rate": 3.795491801764948e-06, + "loss": 0.4741, + "step": 7514 + }, + { + "epoch": 0.59, + "grad_norm": 1.3866358505416807, + "learning_rate": 3.7942574988724884e-06, + "loss": 0.4694, + "step": 7515 + }, + { + "epoch": 0.59, + "grad_norm": 1.6045931111969536, + "learning_rate": 3.793023273989991e-06, + "loss": 0.4918, + "step": 7516 + }, + { + "epoch": 0.59, + "grad_norm": 1.614239410821719, + "learning_rate": 3.7917891271973086e-06, + "loss": 0.443, + "step": 7517 + }, + { + "epoch": 0.59, + "grad_norm": 2.711853004485514, + "learning_rate": 3.7905550585742888e-06, + "loss": 0.4609, + "step": 7518 + }, + { + "epoch": 0.59, + "grad_norm": 0.5234959674040762, + "learning_rate": 3.7893210682007752e-06, + "loss": 0.4931, + "step": 7519 + }, + { + "epoch": 0.59, + "grad_norm": 1.466008169418806, + "learning_rate": 3.7880871561566047e-06, + "loss": 0.4933, + "step": 7520 + }, + { + "epoch": 0.59, + "grad_norm": 2.504890924902262, + "learning_rate": 3.7868533225216097e-06, + "loss": 0.5272, + "step": 7521 + }, + { + "epoch": 0.59, + "grad_norm": 1.5312765398108208, + "learning_rate": 3.785619567375617e-06, + "loss": 0.4217, + "step": 7522 + }, + { + "epoch": 0.59, + "grad_norm": 0.5325962467639115, + "learning_rate": 3.784385890798451e-06, + "loss": 0.5045, + "step": 7523 + }, + { + "epoch": 0.59, + "grad_norm": 1.7512442740105427, + "learning_rate": 3.7831522928699268e-06, + "loss": 0.4251, + "step": 7524 + }, + { + "epoch": 0.59, + "grad_norm": 1.6435841437486938, + "learning_rate": 3.781918773669856e-06, + "loss": 0.4728, + "step": 7525 + }, + { + "epoch": 0.59, + "grad_norm": 3.2660627458672944, + "learning_rate": 3.78068533327805e-06, + "loss": 0.4717, + "step": 7526 + }, + { + "epoch": 0.59, + "grad_norm": 1.5254489562411702, + "learning_rate": 3.779451971774305e-06, + "loss": 0.4515, + "step": 7527 + }, + { + "epoch": 0.59, + "grad_norm": 2.122201481630115, + "learning_rate": 3.7782186892384237e-06, + "loss": 0.4786, + "step": 7528 + }, + { + "epoch": 0.59, + "grad_norm": 0.6071317332954509, + "learning_rate": 3.776985485750192e-06, + "loss": 0.4917, + "step": 7529 + }, + { + "epoch": 0.59, + "grad_norm": 1.9862601511739864, + "learning_rate": 3.7757523613894024e-06, + "loss": 0.4167, + "step": 7530 + }, + { + "epoch": 0.59, + "grad_norm": 1.8094644472009187, + "learning_rate": 3.7745193162358307e-06, + "loss": 0.4658, + "step": 7531 + }, + { + "epoch": 0.59, + "grad_norm": 1.699572412339297, + "learning_rate": 3.773286350369258e-06, + "loss": 0.3809, + "step": 7532 + }, + { + "epoch": 0.59, + "grad_norm": 0.545231109578005, + "learning_rate": 3.7720534638694528e-06, + "loss": 0.5017, + "step": 7533 + }, + { + "epoch": 0.59, + "grad_norm": 1.608139285228216, + "learning_rate": 3.770820656816183e-06, + "loss": 0.4317, + "step": 7534 + }, + { + "epoch": 0.59, + "grad_norm": 2.429044486095302, + "learning_rate": 3.7695879292892076e-06, + "loss": 0.4415, + "step": 7535 + }, + { + "epoch": 0.59, + "grad_norm": 1.72142634734567, + "learning_rate": 3.7683552813682832e-06, + "loss": 0.3902, + "step": 7536 + }, + { + "epoch": 0.59, + "grad_norm": 0.5539572115383771, + "learning_rate": 3.767122713133163e-06, + "loss": 0.4938, + "step": 7537 + }, + { + "epoch": 0.59, + "grad_norm": 1.9593632133811218, + "learning_rate": 3.765890224663589e-06, + "loss": 0.4668, + "step": 7538 + }, + { + "epoch": 0.59, + "grad_norm": 2.1967879909273456, + "learning_rate": 3.7646578160393036e-06, + "loss": 0.4349, + "step": 7539 + }, + { + "epoch": 0.59, + "grad_norm": 2.027903174373566, + "learning_rate": 3.763425487340041e-06, + "loss": 0.4834, + "step": 7540 + }, + { + "epoch": 0.59, + "grad_norm": 2.1474045819181034, + "learning_rate": 3.7621932386455327e-06, + "loss": 0.4856, + "step": 7541 + }, + { + "epoch": 0.59, + "grad_norm": 7.580774870613579, + "learning_rate": 3.7609610700355014e-06, + "loss": 0.4581, + "step": 7542 + }, + { + "epoch": 0.59, + "grad_norm": 2.02621066151538, + "learning_rate": 3.7597289815896685e-06, + "loss": 0.4255, + "step": 7543 + }, + { + "epoch": 0.59, + "grad_norm": 1.9599562980428238, + "learning_rate": 3.7584969733877467e-06, + "loss": 0.4301, + "step": 7544 + }, + { + "epoch": 0.59, + "grad_norm": 2.0818454639734445, + "learning_rate": 3.7572650455094494e-06, + "loss": 0.4945, + "step": 7545 + }, + { + "epoch": 0.59, + "grad_norm": 2.8782414535697307, + "learning_rate": 3.7560331980344756e-06, + "loss": 0.4382, + "step": 7546 + }, + { + "epoch": 0.59, + "grad_norm": 1.8792891455135292, + "learning_rate": 3.754801431042527e-06, + "loss": 0.4489, + "step": 7547 + }, + { + "epoch": 0.59, + "grad_norm": 1.6534409777082941, + "learning_rate": 3.753569744613298e-06, + "loss": 0.4199, + "step": 7548 + }, + { + "epoch": 0.59, + "grad_norm": 2.5917312383605022, + "learning_rate": 3.7523381388264747e-06, + "loss": 0.4519, + "step": 7549 + }, + { + "epoch": 0.59, + "grad_norm": 1.5565645484542814, + "learning_rate": 3.7511066137617424e-06, + "loss": 0.4131, + "step": 7550 + }, + { + "epoch": 0.59, + "grad_norm": 1.7810368096934819, + "learning_rate": 3.749875169498778e-06, + "loss": 0.491, + "step": 7551 + }, + { + "epoch": 0.59, + "grad_norm": 1.894550441998941, + "learning_rate": 3.748643806117256e-06, + "loss": 0.467, + "step": 7552 + }, + { + "epoch": 0.59, + "grad_norm": 2.0001669928032313, + "learning_rate": 3.747412523696841e-06, + "loss": 0.4612, + "step": 7553 + }, + { + "epoch": 0.59, + "grad_norm": 2.158579473814672, + "learning_rate": 3.746181322317199e-06, + "loss": 0.4467, + "step": 7554 + }, + { + "epoch": 0.59, + "grad_norm": 2.004628338356797, + "learning_rate": 3.7449502020579838e-06, + "loss": 0.4946, + "step": 7555 + }, + { + "epoch": 0.59, + "grad_norm": 2.235203119297206, + "learning_rate": 3.7437191629988497e-06, + "loss": 0.4834, + "step": 7556 + }, + { + "epoch": 0.59, + "grad_norm": 1.7976574735128181, + "learning_rate": 3.7424882052194413e-06, + "loss": 0.502, + "step": 7557 + }, + { + "epoch": 0.59, + "grad_norm": 0.5763262974118609, + "learning_rate": 3.7412573287994e-06, + "loss": 0.5, + "step": 7558 + }, + { + "epoch": 0.59, + "grad_norm": 2.143622612775322, + "learning_rate": 3.740026533818366e-06, + "loss": 0.4505, + "step": 7559 + }, + { + "epoch": 0.59, + "grad_norm": 3.0564485725564667, + "learning_rate": 3.7387958203559637e-06, + "loss": 0.4418, + "step": 7560 + }, + { + "epoch": 0.59, + "grad_norm": 2.153090058794941, + "learning_rate": 3.737565188491825e-06, + "loss": 0.4672, + "step": 7561 + }, + { + "epoch": 0.59, + "grad_norm": 1.3828542087319728, + "learning_rate": 3.7363346383055644e-06, + "loss": 0.453, + "step": 7562 + }, + { + "epoch": 0.59, + "grad_norm": 1.8189087695195227, + "learning_rate": 3.735104169876803e-06, + "loss": 0.4434, + "step": 7563 + }, + { + "epoch": 0.59, + "grad_norm": 1.7523173795846925, + "learning_rate": 3.7338737832851433e-06, + "loss": 0.4546, + "step": 7564 + }, + { + "epoch": 0.59, + "grad_norm": 2.9032817399030804, + "learning_rate": 3.7326434786101975e-06, + "loss": 0.4607, + "step": 7565 + }, + { + "epoch": 0.59, + "grad_norm": 4.218581318184691, + "learning_rate": 3.7314132559315565e-06, + "loss": 0.4295, + "step": 7566 + }, + { + "epoch": 0.59, + "grad_norm": 2.2020124480799166, + "learning_rate": 3.730183115328821e-06, + "loss": 0.4468, + "step": 7567 + }, + { + "epoch": 0.59, + "grad_norm": 0.5727377795894238, + "learning_rate": 3.7289530568815756e-06, + "loss": 0.4934, + "step": 7568 + }, + { + "epoch": 0.59, + "grad_norm": 2.1246769190061277, + "learning_rate": 3.727723080669405e-06, + "loss": 0.4806, + "step": 7569 + }, + { + "epoch": 0.59, + "grad_norm": 1.7364845315769442, + "learning_rate": 3.7264931867718874e-06, + "loss": 0.4584, + "step": 7570 + }, + { + "epoch": 0.59, + "grad_norm": 1.7137726274535838, + "learning_rate": 3.725263375268594e-06, + "loss": 0.5082, + "step": 7571 + }, + { + "epoch": 0.59, + "grad_norm": 0.5499911796395616, + "learning_rate": 3.7240336462390934e-06, + "loss": 0.5242, + "step": 7572 + }, + { + "epoch": 0.59, + "grad_norm": 0.5545901741647533, + "learning_rate": 3.7228039997629454e-06, + "loss": 0.4944, + "step": 7573 + }, + { + "epoch": 0.59, + "grad_norm": 0.5499841977782937, + "learning_rate": 3.7215744359197094e-06, + "loss": 0.4596, + "step": 7574 + }, + { + "epoch": 0.59, + "grad_norm": 2.011956204550276, + "learning_rate": 3.7203449547889337e-06, + "loss": 0.4785, + "step": 7575 + }, + { + "epoch": 0.59, + "grad_norm": 2.620227431304135, + "learning_rate": 3.719115556450167e-06, + "loss": 0.4439, + "step": 7576 + }, + { + "epoch": 0.6, + "grad_norm": 3.192602117767563, + "learning_rate": 3.7178862409829462e-06, + "loss": 0.43, + "step": 7577 + }, + { + "epoch": 0.6, + "grad_norm": 0.5603052883254783, + "learning_rate": 3.7166570084668103e-06, + "loss": 0.5065, + "step": 7578 + }, + { + "epoch": 0.6, + "grad_norm": 1.8028491058702008, + "learning_rate": 3.715427858981285e-06, + "loss": 0.4964, + "step": 7579 + }, + { + "epoch": 0.6, + "grad_norm": 0.5373297677139584, + "learning_rate": 3.7141987926058994e-06, + "loss": 0.4731, + "step": 7580 + }, + { + "epoch": 0.6, + "grad_norm": 1.6849411897059918, + "learning_rate": 3.712969809420167e-06, + "loss": 0.4768, + "step": 7581 + }, + { + "epoch": 0.6, + "grad_norm": 2.0207992359401916, + "learning_rate": 3.711740909503605e-06, + "loss": 0.4563, + "step": 7582 + }, + { + "epoch": 0.6, + "grad_norm": 1.5842188852604493, + "learning_rate": 3.710512092935722e-06, + "loss": 0.4106, + "step": 7583 + }, + { + "epoch": 0.6, + "grad_norm": 2.361761753527509, + "learning_rate": 3.7092833597960188e-06, + "loss": 0.3671, + "step": 7584 + }, + { + "epoch": 0.6, + "grad_norm": 1.6195660825297324, + "learning_rate": 3.7080547101639953e-06, + "loss": 0.4922, + "step": 7585 + }, + { + "epoch": 0.6, + "grad_norm": 0.5547700027767101, + "learning_rate": 3.70682614411914e-06, + "loss": 0.4974, + "step": 7586 + }, + { + "epoch": 0.6, + "grad_norm": 1.9321264448281175, + "learning_rate": 3.7055976617409434e-06, + "loss": 0.484, + "step": 7587 + }, + { + "epoch": 0.6, + "grad_norm": 0.5501168051819622, + "learning_rate": 3.7043692631088836e-06, + "loss": 0.4877, + "step": 7588 + }, + { + "epoch": 0.6, + "grad_norm": 1.4935324643422738, + "learning_rate": 3.7031409483024383e-06, + "loss": 0.45, + "step": 7589 + }, + { + "epoch": 0.6, + "grad_norm": 2.391824485815515, + "learning_rate": 3.701912717401076e-06, + "loss": 0.457, + "step": 7590 + }, + { + "epoch": 0.6, + "grad_norm": 1.8481491332077131, + "learning_rate": 3.700684570484264e-06, + "loss": 0.4821, + "step": 7591 + }, + { + "epoch": 0.6, + "grad_norm": 0.5923686483406625, + "learning_rate": 3.69945650763146e-06, + "loss": 0.4855, + "step": 7592 + }, + { + "epoch": 0.6, + "grad_norm": 1.6077703828433607, + "learning_rate": 3.698228528922117e-06, + "loss": 0.4407, + "step": 7593 + }, + { + "epoch": 0.6, + "grad_norm": 1.6431391995270086, + "learning_rate": 3.697000634435689e-06, + "loss": 0.4582, + "step": 7594 + }, + { + "epoch": 0.6, + "grad_norm": 1.7161481929933124, + "learning_rate": 3.695772824251611e-06, + "loss": 0.4024, + "step": 7595 + }, + { + "epoch": 0.6, + "grad_norm": 2.273388934066274, + "learning_rate": 3.6945450984493292e-06, + "loss": 0.4705, + "step": 7596 + }, + { + "epoch": 0.6, + "grad_norm": 1.7385253926499884, + "learning_rate": 3.6933174571082686e-06, + "loss": 0.4484, + "step": 7597 + }, + { + "epoch": 0.6, + "grad_norm": 1.4980242811352524, + "learning_rate": 3.6920899003078626e-06, + "loss": 0.4486, + "step": 7598 + }, + { + "epoch": 0.6, + "grad_norm": 0.5376661445788004, + "learning_rate": 3.6908624281275252e-06, + "loss": 0.4885, + "step": 7599 + }, + { + "epoch": 0.6, + "grad_norm": 1.5182183205931765, + "learning_rate": 3.68963504064668e-06, + "loss": 0.4329, + "step": 7600 + }, + { + "epoch": 0.6, + "grad_norm": 2.040766437166542, + "learning_rate": 3.6884077379447303e-06, + "loss": 0.4488, + "step": 7601 + }, + { + "epoch": 0.6, + "grad_norm": 1.6861908034684918, + "learning_rate": 3.687180520101086e-06, + "loss": 0.4064, + "step": 7602 + }, + { + "epoch": 0.6, + "grad_norm": 0.5336329362762963, + "learning_rate": 3.6859533871951436e-06, + "loss": 0.4938, + "step": 7603 + }, + { + "epoch": 0.6, + "grad_norm": 1.5557794001520622, + "learning_rate": 3.684726339306298e-06, + "loss": 0.4482, + "step": 7604 + }, + { + "epoch": 0.6, + "grad_norm": 1.6401115241273605, + "learning_rate": 3.6834993765139388e-06, + "loss": 0.4395, + "step": 7605 + }, + { + "epoch": 0.6, + "grad_norm": 0.5280644345305249, + "learning_rate": 3.682272498897447e-06, + "loss": 0.4762, + "step": 7606 + }, + { + "epoch": 0.6, + "grad_norm": 1.7972707938066323, + "learning_rate": 3.681045706536201e-06, + "loss": 0.4807, + "step": 7607 + }, + { + "epoch": 0.6, + "grad_norm": 1.77937635136034, + "learning_rate": 3.679818999509572e-06, + "loss": 0.4951, + "step": 7608 + }, + { + "epoch": 0.6, + "grad_norm": 1.8948050707483497, + "learning_rate": 3.678592377896927e-06, + "loss": 0.4506, + "step": 7609 + }, + { + "epoch": 0.6, + "grad_norm": 2.04197366815816, + "learning_rate": 3.677365841777626e-06, + "loss": 0.4523, + "step": 7610 + }, + { + "epoch": 0.6, + "grad_norm": 2.796247275760259, + "learning_rate": 3.6761393912310262e-06, + "loss": 0.4445, + "step": 7611 + }, + { + "epoch": 0.6, + "grad_norm": 1.9844649408118051, + "learning_rate": 3.674913026336474e-06, + "loss": 0.482, + "step": 7612 + }, + { + "epoch": 0.6, + "grad_norm": 0.5337023277196105, + "learning_rate": 3.6736867471733173e-06, + "loss": 0.4938, + "step": 7613 + }, + { + "epoch": 0.6, + "grad_norm": 0.5957026544127116, + "learning_rate": 3.6724605538208926e-06, + "loss": 0.4912, + "step": 7614 + }, + { + "epoch": 0.6, + "grad_norm": 2.3219845331608733, + "learning_rate": 3.6712344463585316e-06, + "loss": 0.5071, + "step": 7615 + }, + { + "epoch": 0.6, + "grad_norm": 1.6563201032923618, + "learning_rate": 3.6700084248655653e-06, + "loss": 0.4947, + "step": 7616 + }, + { + "epoch": 0.6, + "grad_norm": 1.7591117866407757, + "learning_rate": 3.6687824894213136e-06, + "loss": 0.4537, + "step": 7617 + }, + { + "epoch": 0.6, + "grad_norm": 1.6005192138344955, + "learning_rate": 3.6675566401050944e-06, + "loss": 0.4949, + "step": 7618 + }, + { + "epoch": 0.6, + "grad_norm": 3.641180484234126, + "learning_rate": 3.6663308769962168e-06, + "loss": 0.4143, + "step": 7619 + }, + { + "epoch": 0.6, + "grad_norm": 2.1313569069820075, + "learning_rate": 3.6651052001739874e-06, + "loss": 0.4465, + "step": 7620 + }, + { + "epoch": 0.6, + "grad_norm": 2.189939333902669, + "learning_rate": 3.6638796097177044e-06, + "loss": 0.4445, + "step": 7621 + }, + { + "epoch": 0.6, + "grad_norm": 2.3100238558012776, + "learning_rate": 3.6626541057066634e-06, + "loss": 0.4891, + "step": 7622 + }, + { + "epoch": 0.6, + "grad_norm": 2.471903935293427, + "learning_rate": 3.661428688220152e-06, + "loss": 0.4575, + "step": 7623 + }, + { + "epoch": 0.6, + "grad_norm": 0.5862996539903595, + "learning_rate": 3.660203357337454e-06, + "loss": 0.486, + "step": 7624 + }, + { + "epoch": 0.6, + "grad_norm": 1.918577782731966, + "learning_rate": 3.658978113137845e-06, + "loss": 0.4822, + "step": 7625 + }, + { + "epoch": 0.6, + "grad_norm": 1.4679381538545826, + "learning_rate": 3.6577529557005975e-06, + "loss": 0.4385, + "step": 7626 + }, + { + "epoch": 0.6, + "grad_norm": 1.6907090477792461, + "learning_rate": 3.6565278851049803e-06, + "loss": 0.45, + "step": 7627 + }, + { + "epoch": 0.6, + "grad_norm": 1.6200342886226105, + "learning_rate": 3.6553029014302476e-06, + "loss": 0.4712, + "step": 7628 + }, + { + "epoch": 0.6, + "grad_norm": 3.4922113654207108, + "learning_rate": 3.6540780047556624e-06, + "loss": 0.4301, + "step": 7629 + }, + { + "epoch": 0.6, + "grad_norm": 2.8690414486335882, + "learning_rate": 3.652853195160466e-06, + "loss": 0.5125, + "step": 7630 + }, + { + "epoch": 0.6, + "grad_norm": 2.230491976010944, + "learning_rate": 3.6516284727239094e-06, + "loss": 0.5053, + "step": 7631 + }, + { + "epoch": 0.6, + "grad_norm": 1.4495752858883177, + "learning_rate": 3.6504038375252228e-06, + "loss": 0.4868, + "step": 7632 + }, + { + "epoch": 0.6, + "grad_norm": 2.755647708294134, + "learning_rate": 3.649179289643646e-06, + "loss": 0.4187, + "step": 7633 + }, + { + "epoch": 0.6, + "grad_norm": 1.645839389034316, + "learning_rate": 3.647954829158398e-06, + "loss": 0.4644, + "step": 7634 + }, + { + "epoch": 0.6, + "grad_norm": 1.8180972266432045, + "learning_rate": 3.646730456148708e-06, + "loss": 0.4739, + "step": 7635 + }, + { + "epoch": 0.6, + "grad_norm": 1.9863988228909277, + "learning_rate": 3.6455061706937833e-06, + "loss": 0.5089, + "step": 7636 + }, + { + "epoch": 0.6, + "grad_norm": 1.6269204093773668, + "learning_rate": 3.6442819728728397e-06, + "loss": 0.5089, + "step": 7637 + }, + { + "epoch": 0.6, + "grad_norm": 2.313315457876071, + "learning_rate": 3.6430578627650793e-06, + "loss": 0.5363, + "step": 7638 + }, + { + "epoch": 0.6, + "grad_norm": 0.5678640422719986, + "learning_rate": 3.641833840449699e-06, + "loss": 0.5112, + "step": 7639 + }, + { + "epoch": 0.6, + "grad_norm": 1.5569381234824795, + "learning_rate": 3.640609906005893e-06, + "loss": 0.4471, + "step": 7640 + }, + { + "epoch": 0.6, + "grad_norm": 1.636436636698173, + "learning_rate": 3.6393860595128473e-06, + "loss": 0.423, + "step": 7641 + }, + { + "epoch": 0.6, + "grad_norm": 1.7017697145494652, + "learning_rate": 3.6381623010497447e-06, + "loss": 0.4687, + "step": 7642 + }, + { + "epoch": 0.6, + "grad_norm": 0.5275215880456167, + "learning_rate": 3.636938630695759e-06, + "loss": 0.4741, + "step": 7643 + }, + { + "epoch": 0.6, + "grad_norm": 1.6531210178449371, + "learning_rate": 3.6357150485300607e-06, + "loss": 0.4593, + "step": 7644 + }, + { + "epoch": 0.6, + "grad_norm": 0.5547175690167522, + "learning_rate": 3.634491554631814e-06, + "loss": 0.474, + "step": 7645 + }, + { + "epoch": 0.6, + "grad_norm": 0.5471233003117232, + "learning_rate": 3.6332681490801784e-06, + "loss": 0.5043, + "step": 7646 + }, + { + "epoch": 0.6, + "grad_norm": 1.6301986354812212, + "learning_rate": 3.6320448319543044e-06, + "loss": 0.4369, + "step": 7647 + }, + { + "epoch": 0.6, + "grad_norm": 0.5301163825353894, + "learning_rate": 3.6308216033333394e-06, + "loss": 0.4948, + "step": 7648 + }, + { + "epoch": 0.6, + "grad_norm": 2.7096141323044214, + "learning_rate": 3.629598463296429e-06, + "loss": 0.5076, + "step": 7649 + }, + { + "epoch": 0.6, + "grad_norm": 1.6477339742066848, + "learning_rate": 3.628375411922701e-06, + "loss": 0.4778, + "step": 7650 + }, + { + "epoch": 0.6, + "grad_norm": 1.7512362293284458, + "learning_rate": 3.6271524492912925e-06, + "loss": 0.4697, + "step": 7651 + }, + { + "epoch": 0.6, + "grad_norm": 3.786266437925051, + "learning_rate": 3.625929575481323e-06, + "loss": 0.5096, + "step": 7652 + }, + { + "epoch": 0.6, + "grad_norm": 1.6167109490259914, + "learning_rate": 3.624706790571914e-06, + "loss": 0.4651, + "step": 7653 + }, + { + "epoch": 0.6, + "grad_norm": 2.0516247553796467, + "learning_rate": 3.6234840946421757e-06, + "loss": 0.4268, + "step": 7654 + }, + { + "epoch": 0.6, + "grad_norm": 1.802407154057507, + "learning_rate": 3.6222614877712165e-06, + "loss": 0.4699, + "step": 7655 + }, + { + "epoch": 0.6, + "grad_norm": 2.0000480771424844, + "learning_rate": 3.6210389700381354e-06, + "loss": 0.4672, + "step": 7656 + }, + { + "epoch": 0.6, + "grad_norm": 0.6437022282604529, + "learning_rate": 3.619816541522031e-06, + "loss": 0.494, + "step": 7657 + }, + { + "epoch": 0.6, + "grad_norm": 2.0221556124662943, + "learning_rate": 3.6185942023019887e-06, + "loss": 0.4814, + "step": 7658 + }, + { + "epoch": 0.6, + "grad_norm": 1.556725181814122, + "learning_rate": 3.617371952457096e-06, + "loss": 0.5127, + "step": 7659 + }, + { + "epoch": 0.6, + "grad_norm": 0.5839996376768188, + "learning_rate": 3.616149792066428e-06, + "loss": 0.491, + "step": 7660 + }, + { + "epoch": 0.6, + "grad_norm": 1.8591946380951652, + "learning_rate": 3.6149277212090583e-06, + "loss": 0.5227, + "step": 7661 + }, + { + "epoch": 0.6, + "grad_norm": 1.8961270719862255, + "learning_rate": 3.6137057399640548e-06, + "loss": 0.4783, + "step": 7662 + }, + { + "epoch": 0.6, + "grad_norm": 4.348903325352107, + "learning_rate": 3.6124838484104734e-06, + "loss": 0.4407, + "step": 7663 + }, + { + "epoch": 0.6, + "grad_norm": 1.7047406478812857, + "learning_rate": 3.611262046627375e-06, + "loss": 0.4864, + "step": 7664 + }, + { + "epoch": 0.6, + "grad_norm": 2.268825372284758, + "learning_rate": 3.6100403346938027e-06, + "loss": 0.4386, + "step": 7665 + }, + { + "epoch": 0.6, + "grad_norm": 0.5518052260524733, + "learning_rate": 3.6088187126888065e-06, + "loss": 0.4873, + "step": 7666 + }, + { + "epoch": 0.6, + "grad_norm": 1.7415478170040337, + "learning_rate": 3.607597180691415e-06, + "loss": 0.5049, + "step": 7667 + }, + { + "epoch": 0.6, + "grad_norm": 1.985101611476486, + "learning_rate": 3.6063757387806685e-06, + "loss": 0.5149, + "step": 7668 + }, + { + "epoch": 0.6, + "grad_norm": 1.8462728255371739, + "learning_rate": 3.6051543870355856e-06, + "loss": 0.4513, + "step": 7669 + }, + { + "epoch": 0.6, + "grad_norm": 2.0338340757122033, + "learning_rate": 3.6039331255351915e-06, + "loss": 0.4564, + "step": 7670 + }, + { + "epoch": 0.6, + "grad_norm": 2.16442278147747, + "learning_rate": 3.6027119543584955e-06, + "loss": 0.447, + "step": 7671 + }, + { + "epoch": 0.6, + "grad_norm": 1.725383318224612, + "learning_rate": 3.601490873584508e-06, + "loss": 0.4166, + "step": 7672 + }, + { + "epoch": 0.6, + "grad_norm": 1.356614940511579, + "learning_rate": 3.6002698832922338e-06, + "loss": 0.4269, + "step": 7673 + }, + { + "epoch": 0.6, + "grad_norm": 1.6289791283552701, + "learning_rate": 3.5990489835606656e-06, + "loss": 0.4447, + "step": 7674 + }, + { + "epoch": 0.6, + "grad_norm": 2.3757520037540005, + "learning_rate": 3.597828174468797e-06, + "loss": 0.4668, + "step": 7675 + }, + { + "epoch": 0.6, + "grad_norm": 2.7843933827797462, + "learning_rate": 3.59660745609561e-06, + "loss": 0.3889, + "step": 7676 + }, + { + "epoch": 0.6, + "grad_norm": 1.5259484078944399, + "learning_rate": 3.5953868285200854e-06, + "loss": 0.451, + "step": 7677 + }, + { + "epoch": 0.6, + "grad_norm": 1.556694859058405, + "learning_rate": 3.594166291821194e-06, + "loss": 0.4587, + "step": 7678 + }, + { + "epoch": 0.6, + "grad_norm": 2.5429516423851934, + "learning_rate": 3.5929458460779067e-06, + "loss": 0.4964, + "step": 7679 + }, + { + "epoch": 0.6, + "grad_norm": 2.1582659769286963, + "learning_rate": 3.5917254913691807e-06, + "loss": 0.5236, + "step": 7680 + }, + { + "epoch": 0.6, + "grad_norm": 0.5376692050729317, + "learning_rate": 3.5905052277739744e-06, + "loss": 0.4764, + "step": 7681 + }, + { + "epoch": 0.6, + "grad_norm": 1.6952495652352306, + "learning_rate": 3.589285055371235e-06, + "loss": 0.4539, + "step": 7682 + }, + { + "epoch": 0.6, + "grad_norm": 3.1483993081415633, + "learning_rate": 3.5880649742399058e-06, + "loss": 0.4886, + "step": 7683 + }, + { + "epoch": 0.6, + "grad_norm": 0.5271286572373776, + "learning_rate": 3.5868449844589295e-06, + "loss": 0.4833, + "step": 7684 + }, + { + "epoch": 0.6, + "grad_norm": 1.9792085731753795, + "learning_rate": 3.58562508610723e-06, + "loss": 0.4368, + "step": 7685 + }, + { + "epoch": 0.6, + "grad_norm": 2.1420917594466338, + "learning_rate": 3.5844052792637395e-06, + "loss": 0.4507, + "step": 7686 + }, + { + "epoch": 0.6, + "grad_norm": 2.1414340784134756, + "learning_rate": 3.5831855640073755e-06, + "loss": 0.4712, + "step": 7687 + }, + { + "epoch": 0.6, + "grad_norm": 2.595631431578242, + "learning_rate": 3.581965940417052e-06, + "loss": 0.4725, + "step": 7688 + }, + { + "epoch": 0.6, + "grad_norm": 1.8903140052081127, + "learning_rate": 3.580746408571677e-06, + "loss": 0.5103, + "step": 7689 + }, + { + "epoch": 0.6, + "grad_norm": 3.040744442268335, + "learning_rate": 3.579526968550153e-06, + "loss": 0.4322, + "step": 7690 + }, + { + "epoch": 0.6, + "grad_norm": 1.32999625437455, + "learning_rate": 3.578307620431375e-06, + "loss": 0.4372, + "step": 7691 + }, + { + "epoch": 0.6, + "grad_norm": 1.6215834845809944, + "learning_rate": 3.5770883642942357e-06, + "loss": 0.4298, + "step": 7692 + }, + { + "epoch": 0.6, + "grad_norm": 1.6835619335881362, + "learning_rate": 3.5758692002176163e-06, + "loss": 0.4317, + "step": 7693 + }, + { + "epoch": 0.6, + "grad_norm": 1.6404630799702735, + "learning_rate": 3.574650128280397e-06, + "loss": 0.3905, + "step": 7694 + }, + { + "epoch": 0.6, + "grad_norm": 1.732495126062656, + "learning_rate": 3.57343114856145e-06, + "loss": 0.4396, + "step": 7695 + }, + { + "epoch": 0.6, + "grad_norm": 1.7875978818716862, + "learning_rate": 3.5722122611396416e-06, + "loss": 0.4899, + "step": 7696 + }, + { + "epoch": 0.6, + "grad_norm": 1.720936110711901, + "learning_rate": 3.5709934660938326e-06, + "loss": 0.4052, + "step": 7697 + }, + { + "epoch": 0.6, + "grad_norm": 2.1245678888879795, + "learning_rate": 3.569774763502875e-06, + "loss": 0.4774, + "step": 7698 + }, + { + "epoch": 0.6, + "grad_norm": 1.9644602792858576, + "learning_rate": 3.5685561534456224e-06, + "loss": 0.4137, + "step": 7699 + }, + { + "epoch": 0.6, + "grad_norm": 0.5660455247706425, + "learning_rate": 3.567337636000911e-06, + "loss": 0.4857, + "step": 7700 + }, + { + "epoch": 0.6, + "grad_norm": 0.5728810125649879, + "learning_rate": 3.5661192112475835e-06, + "loss": 0.4625, + "step": 7701 + }, + { + "epoch": 0.6, + "grad_norm": 1.6465914564422006, + "learning_rate": 3.564900879264464e-06, + "loss": 0.4365, + "step": 7702 + }, + { + "epoch": 0.6, + "grad_norm": 1.6289648893956155, + "learning_rate": 3.5636826401303838e-06, + "loss": 0.3971, + "step": 7703 + }, + { + "epoch": 0.61, + "grad_norm": 1.8366470678750322, + "learning_rate": 3.5624644939241545e-06, + "loss": 0.43, + "step": 7704 + }, + { + "epoch": 0.61, + "grad_norm": 2.464762688304494, + "learning_rate": 3.561246440724593e-06, + "loss": 0.4787, + "step": 7705 + }, + { + "epoch": 0.61, + "grad_norm": 2.12442404405517, + "learning_rate": 3.560028480610505e-06, + "loss": 0.5022, + "step": 7706 + }, + { + "epoch": 0.61, + "grad_norm": 2.1519336350063476, + "learning_rate": 3.5588106136606903e-06, + "loss": 0.4142, + "step": 7707 + }, + { + "epoch": 0.61, + "grad_norm": 2.2532236207468497, + "learning_rate": 3.5575928399539446e-06, + "loss": 0.4275, + "step": 7708 + }, + { + "epoch": 0.61, + "grad_norm": 2.0248187526216093, + "learning_rate": 3.556375159569054e-06, + "loss": 0.5054, + "step": 7709 + }, + { + "epoch": 0.61, + "grad_norm": 1.8092503635355976, + "learning_rate": 3.555157572584803e-06, + "loss": 0.4334, + "step": 7710 + }, + { + "epoch": 0.61, + "grad_norm": 1.6368092871328486, + "learning_rate": 3.553940079079966e-06, + "loss": 0.4125, + "step": 7711 + }, + { + "epoch": 0.61, + "grad_norm": 1.537712873303657, + "learning_rate": 3.5527226791333155e-06, + "loss": 0.4628, + "step": 7712 + }, + { + "epoch": 0.61, + "grad_norm": 3.4011925175557485, + "learning_rate": 3.5515053728236127e-06, + "loss": 0.4663, + "step": 7713 + }, + { + "epoch": 0.61, + "grad_norm": 1.683257425124835, + "learning_rate": 3.550288160229619e-06, + "loss": 0.4733, + "step": 7714 + }, + { + "epoch": 0.61, + "grad_norm": 1.7698660117742178, + "learning_rate": 3.549071041430084e-06, + "loss": 0.4566, + "step": 7715 + }, + { + "epoch": 0.61, + "grad_norm": 2.0152560370037573, + "learning_rate": 3.5478540165037534e-06, + "loss": 0.4896, + "step": 7716 + }, + { + "epoch": 0.61, + "grad_norm": 2.245931665025475, + "learning_rate": 3.546637085529371e-06, + "loss": 0.4544, + "step": 7717 + }, + { + "epoch": 0.61, + "grad_norm": 2.1520176645216127, + "learning_rate": 3.545420248585665e-06, + "loss": 0.4401, + "step": 7718 + }, + { + "epoch": 0.61, + "grad_norm": 1.5744639864516021, + "learning_rate": 3.5442035057513693e-06, + "loss": 0.4341, + "step": 7719 + }, + { + "epoch": 0.61, + "grad_norm": 1.7962498897551804, + "learning_rate": 3.5429868571051997e-06, + "loss": 0.4387, + "step": 7720 + }, + { + "epoch": 0.61, + "grad_norm": 1.9752687255806276, + "learning_rate": 3.5417703027258752e-06, + "loss": 0.4723, + "step": 7721 + }, + { + "epoch": 0.61, + "grad_norm": 1.6536907384050394, + "learning_rate": 3.5405538426921048e-06, + "loss": 0.4618, + "step": 7722 + }, + { + "epoch": 0.61, + "grad_norm": 2.3449518310382738, + "learning_rate": 3.539337477082592e-06, + "loss": 0.5182, + "step": 7723 + }, + { + "epoch": 0.61, + "grad_norm": 2.754947429676455, + "learning_rate": 3.538121205976033e-06, + "loss": 0.487, + "step": 7724 + }, + { + "epoch": 0.61, + "grad_norm": 2.1724998229459946, + "learning_rate": 3.5369050294511203e-06, + "loss": 0.4067, + "step": 7725 + }, + { + "epoch": 0.61, + "grad_norm": 3.1806490618679644, + "learning_rate": 3.5356889475865374e-06, + "loss": 0.4148, + "step": 7726 + }, + { + "epoch": 0.61, + "grad_norm": 1.9271024516365005, + "learning_rate": 3.534472960460965e-06, + "loss": 0.4665, + "step": 7727 + }, + { + "epoch": 0.61, + "grad_norm": 2.2766448537353607, + "learning_rate": 3.5332570681530735e-06, + "loss": 0.4711, + "step": 7728 + }, + { + "epoch": 0.61, + "grad_norm": 2.046013050676859, + "learning_rate": 3.5320412707415314e-06, + "loss": 0.4624, + "step": 7729 + }, + { + "epoch": 0.61, + "grad_norm": 0.6016884987781782, + "learning_rate": 3.5308255683049998e-06, + "loss": 0.4813, + "step": 7730 + }, + { + "epoch": 0.61, + "grad_norm": 1.7150542255811525, + "learning_rate": 3.5296099609221302e-06, + "loss": 0.4255, + "step": 7731 + }, + { + "epoch": 0.61, + "grad_norm": 0.5776614830265858, + "learning_rate": 3.528394448671575e-06, + "loss": 0.4957, + "step": 7732 + }, + { + "epoch": 0.61, + "grad_norm": 2.376625592506426, + "learning_rate": 3.527179031631972e-06, + "loss": 0.4828, + "step": 7733 + }, + { + "epoch": 0.61, + "grad_norm": 2.006374050378446, + "learning_rate": 3.52596370988196e-06, + "loss": 0.4355, + "step": 7734 + }, + { + "epoch": 0.61, + "grad_norm": 0.5526348399458957, + "learning_rate": 3.5247484835001654e-06, + "loss": 0.4468, + "step": 7735 + }, + { + "epoch": 0.61, + "grad_norm": 1.734668030416912, + "learning_rate": 3.5235333525652176e-06, + "loss": 0.4393, + "step": 7736 + }, + { + "epoch": 0.61, + "grad_norm": 5.957905405192002, + "learning_rate": 3.5223183171557274e-06, + "loss": 0.4626, + "step": 7737 + }, + { + "epoch": 0.61, + "grad_norm": 2.48630398247633, + "learning_rate": 3.5211033773503116e-06, + "loss": 0.4291, + "step": 7738 + }, + { + "epoch": 0.61, + "grad_norm": 1.8337337694717137, + "learning_rate": 3.51988853322757e-06, + "loss": 0.4484, + "step": 7739 + }, + { + "epoch": 0.61, + "grad_norm": 1.4987261368824543, + "learning_rate": 3.5186737848661044e-06, + "loss": 0.4366, + "step": 7740 + }, + { + "epoch": 0.61, + "grad_norm": 0.6530649763027468, + "learning_rate": 3.5174591323445097e-06, + "loss": 0.4905, + "step": 7741 + }, + { + "epoch": 0.61, + "grad_norm": 1.5850992959957473, + "learning_rate": 3.516244575741367e-06, + "loss": 0.5069, + "step": 7742 + }, + { + "epoch": 0.61, + "grad_norm": 1.354566002359784, + "learning_rate": 3.5150301151352613e-06, + "loss": 0.4495, + "step": 7743 + }, + { + "epoch": 0.61, + "grad_norm": 2.04868532090511, + "learning_rate": 3.513815750604763e-06, + "loss": 0.5042, + "step": 7744 + }, + { + "epoch": 0.61, + "grad_norm": 1.8840664840722436, + "learning_rate": 3.512601482228443e-06, + "loss": 0.4652, + "step": 7745 + }, + { + "epoch": 0.61, + "grad_norm": 1.6978204486549122, + "learning_rate": 3.5113873100848594e-06, + "loss": 0.4805, + "step": 7746 + }, + { + "epoch": 0.61, + "grad_norm": 1.8603135788949292, + "learning_rate": 3.5101732342525716e-06, + "loss": 0.4265, + "step": 7747 + }, + { + "epoch": 0.61, + "grad_norm": 1.6535146935256293, + "learning_rate": 3.5089592548101247e-06, + "loss": 0.415, + "step": 7748 + }, + { + "epoch": 0.61, + "grad_norm": 2.163528315457827, + "learning_rate": 3.5077453718360654e-06, + "loss": 0.4559, + "step": 7749 + }, + { + "epoch": 0.61, + "grad_norm": 1.526713512491908, + "learning_rate": 3.5065315854089267e-06, + "loss": 0.4709, + "step": 7750 + }, + { + "epoch": 0.61, + "grad_norm": 3.453791742377531, + "learning_rate": 3.50531789560724e-06, + "loss": 0.4481, + "step": 7751 + }, + { + "epoch": 0.61, + "grad_norm": 1.9470245829949138, + "learning_rate": 3.5041043025095333e-06, + "loss": 0.5051, + "step": 7752 + }, + { + "epoch": 0.61, + "grad_norm": 1.7638268250349165, + "learning_rate": 3.5028908061943177e-06, + "loss": 0.4463, + "step": 7753 + }, + { + "epoch": 0.61, + "grad_norm": 2.443999839196009, + "learning_rate": 3.5016774067401127e-06, + "loss": 0.5006, + "step": 7754 + }, + { + "epoch": 0.61, + "grad_norm": 1.797079983250265, + "learning_rate": 3.500464104225415e-06, + "loss": 0.4634, + "step": 7755 + }, + { + "epoch": 0.61, + "grad_norm": 1.6851447284671497, + "learning_rate": 3.499250898728731e-06, + "loss": 0.4115, + "step": 7756 + }, + { + "epoch": 0.61, + "grad_norm": 0.6006573267461806, + "learning_rate": 3.4980377903285496e-06, + "loss": 0.488, + "step": 7757 + }, + { + "epoch": 0.61, + "grad_norm": 2.4122841529459897, + "learning_rate": 3.4968247791033595e-06, + "loss": 0.4125, + "step": 7758 + }, + { + "epoch": 0.61, + "grad_norm": 1.7509895624742733, + "learning_rate": 3.4956118651316384e-06, + "loss": 0.5272, + "step": 7759 + }, + { + "epoch": 0.61, + "grad_norm": 0.5617247527871281, + "learning_rate": 3.494399048491862e-06, + "loss": 0.467, + "step": 7760 + }, + { + "epoch": 0.61, + "grad_norm": 1.5290189807365688, + "learning_rate": 3.4931863292624978e-06, + "loss": 0.4239, + "step": 7761 + }, + { + "epoch": 0.61, + "grad_norm": 1.7495206811458648, + "learning_rate": 3.4919737075220063e-06, + "loss": 0.4544, + "step": 7762 + }, + { + "epoch": 0.61, + "grad_norm": 1.792238642723388, + "learning_rate": 3.490761183348844e-06, + "loss": 0.4734, + "step": 7763 + }, + { + "epoch": 0.61, + "grad_norm": 2.231971424535322, + "learning_rate": 3.489548756821458e-06, + "loss": 0.4786, + "step": 7764 + }, + { + "epoch": 0.61, + "grad_norm": 1.5810388314843244, + "learning_rate": 3.488336428018293e-06, + "loss": 0.4624, + "step": 7765 + }, + { + "epoch": 0.61, + "grad_norm": 0.5418697024274549, + "learning_rate": 3.4871241970177822e-06, + "loss": 0.4978, + "step": 7766 + }, + { + "epoch": 0.61, + "grad_norm": 2.0461562546552328, + "learning_rate": 3.485912063898358e-06, + "loss": 0.4444, + "step": 7767 + }, + { + "epoch": 0.61, + "grad_norm": 1.715901377869296, + "learning_rate": 3.4847000287384416e-06, + "loss": 0.4539, + "step": 7768 + }, + { + "epoch": 0.61, + "grad_norm": 2.048781475932776, + "learning_rate": 3.4834880916164514e-06, + "loss": 0.4466, + "step": 7769 + }, + { + "epoch": 0.61, + "grad_norm": 2.2751284556126565, + "learning_rate": 3.4822762526107967e-06, + "loss": 0.43, + "step": 7770 + }, + { + "epoch": 0.61, + "grad_norm": 2.545284071763954, + "learning_rate": 3.4810645117998855e-06, + "loss": 0.4637, + "step": 7771 + }, + { + "epoch": 0.61, + "grad_norm": 0.5639688152372062, + "learning_rate": 3.4798528692621105e-06, + "loss": 0.4862, + "step": 7772 + }, + { + "epoch": 0.61, + "grad_norm": 1.7629806018659928, + "learning_rate": 3.4786413250758665e-06, + "loss": 0.4414, + "step": 7773 + }, + { + "epoch": 0.61, + "grad_norm": 1.4877699677539986, + "learning_rate": 3.4774298793195407e-06, + "loss": 0.5162, + "step": 7774 + }, + { + "epoch": 0.61, + "grad_norm": 1.8013514606503256, + "learning_rate": 3.476218532071509e-06, + "loss": 0.4667, + "step": 7775 + }, + { + "epoch": 0.61, + "grad_norm": 2.4161527665763827, + "learning_rate": 3.4750072834101457e-06, + "loss": 0.4967, + "step": 7776 + }, + { + "epoch": 0.61, + "grad_norm": 1.3764002296239217, + "learning_rate": 3.473796133413815e-06, + "loss": 0.4923, + "step": 7777 + }, + { + "epoch": 0.61, + "grad_norm": 2.0712571933892625, + "learning_rate": 3.4725850821608786e-06, + "loss": 0.4722, + "step": 7778 + }, + { + "epoch": 0.61, + "grad_norm": 1.5627575763616572, + "learning_rate": 3.471374129729689e-06, + "loss": 0.4301, + "step": 7779 + }, + { + "epoch": 0.61, + "grad_norm": 1.7541916555877992, + "learning_rate": 3.4701632761985937e-06, + "loss": 0.4467, + "step": 7780 + }, + { + "epoch": 0.61, + "grad_norm": 2.157940135597341, + "learning_rate": 3.4689525216459323e-06, + "loss": 0.4403, + "step": 7781 + }, + { + "epoch": 0.61, + "grad_norm": 1.6141707005086356, + "learning_rate": 3.4677418661500416e-06, + "loss": 0.4636, + "step": 7782 + }, + { + "epoch": 0.61, + "grad_norm": 2.0866488598430952, + "learning_rate": 3.466531309789246e-06, + "loss": 0.4302, + "step": 7783 + }, + { + "epoch": 0.61, + "grad_norm": 3.0030273116582396, + "learning_rate": 3.465320852641868e-06, + "loss": 0.4125, + "step": 7784 + }, + { + "epoch": 0.61, + "grad_norm": 2.394763879770947, + "learning_rate": 3.464110494786226e-06, + "loss": 0.492, + "step": 7785 + }, + { + "epoch": 0.61, + "grad_norm": 0.5618212109733438, + "learning_rate": 3.4629002363006225e-06, + "loss": 0.5017, + "step": 7786 + }, + { + "epoch": 0.61, + "grad_norm": 1.8232024912700124, + "learning_rate": 3.4616900772633664e-06, + "loss": 0.4764, + "step": 7787 + }, + { + "epoch": 0.61, + "grad_norm": 0.5426176265536636, + "learning_rate": 3.4604800177527476e-06, + "loss": 0.4823, + "step": 7788 + }, + { + "epoch": 0.61, + "grad_norm": 1.8896647741238566, + "learning_rate": 3.4592700578470594e-06, + "loss": 0.44, + "step": 7789 + }, + { + "epoch": 0.61, + "grad_norm": 2.4560253565396524, + "learning_rate": 3.458060197624581e-06, + "loss": 0.4781, + "step": 7790 + }, + { + "epoch": 0.61, + "grad_norm": 1.6343755930901187, + "learning_rate": 3.456850437163592e-06, + "loss": 0.4429, + "step": 7791 + }, + { + "epoch": 0.61, + "grad_norm": 2.2050646115609913, + "learning_rate": 3.4556407765423604e-06, + "loss": 0.5005, + "step": 7792 + }, + { + "epoch": 0.61, + "grad_norm": 1.759691106928934, + "learning_rate": 3.4544312158391515e-06, + "loss": 0.4281, + "step": 7793 + }, + { + "epoch": 0.61, + "grad_norm": 2.123578325195361, + "learning_rate": 3.453221755132219e-06, + "loss": 0.4433, + "step": 7794 + }, + { + "epoch": 0.61, + "grad_norm": 1.8119399115970658, + "learning_rate": 3.4520123944998164e-06, + "loss": 0.494, + "step": 7795 + }, + { + "epoch": 0.61, + "grad_norm": 0.5506108452279956, + "learning_rate": 3.4508031340201875e-06, + "loss": 0.4919, + "step": 7796 + }, + { + "epoch": 0.61, + "grad_norm": 1.5189440032600667, + "learning_rate": 3.449593973771569e-06, + "loss": 0.4446, + "step": 7797 + }, + { + "epoch": 0.61, + "grad_norm": 2.8629101391508636, + "learning_rate": 3.448384913832192e-06, + "loss": 0.4979, + "step": 7798 + }, + { + "epoch": 0.61, + "grad_norm": 1.7851831975856893, + "learning_rate": 3.4471759542802807e-06, + "loss": 0.4721, + "step": 7799 + }, + { + "epoch": 0.61, + "grad_norm": 0.5750204239526002, + "learning_rate": 3.4459670951940555e-06, + "loss": 0.4827, + "step": 7800 + }, + { + "epoch": 0.61, + "grad_norm": 2.304558033599131, + "learning_rate": 3.4447583366517246e-06, + "loss": 0.4497, + "step": 7801 + }, + { + "epoch": 0.61, + "grad_norm": 1.7172343639267629, + "learning_rate": 3.443549678731496e-06, + "loss": 0.4598, + "step": 7802 + }, + { + "epoch": 0.61, + "grad_norm": 1.4216467552979597, + "learning_rate": 3.4423411215115664e-06, + "loss": 0.4548, + "step": 7803 + }, + { + "epoch": 0.61, + "grad_norm": 1.8515890439306513, + "learning_rate": 3.4411326650701294e-06, + "loss": 0.4842, + "step": 7804 + }, + { + "epoch": 0.61, + "grad_norm": 1.7606044638048226, + "learning_rate": 3.4399243094853675e-06, + "loss": 0.455, + "step": 7805 + }, + { + "epoch": 0.61, + "grad_norm": 0.5638856299518022, + "learning_rate": 3.438716054835465e-06, + "loss": 0.4848, + "step": 7806 + }, + { + "epoch": 0.61, + "grad_norm": 0.5612651591630189, + "learning_rate": 3.437507901198588e-06, + "loss": 0.4679, + "step": 7807 + }, + { + "epoch": 0.61, + "grad_norm": 2.66777144876639, + "learning_rate": 3.4362998486529077e-06, + "loss": 0.4031, + "step": 7808 + }, + { + "epoch": 0.61, + "grad_norm": 2.955437203751179, + "learning_rate": 3.4350918972765818e-06, + "loss": 0.4433, + "step": 7809 + }, + { + "epoch": 0.61, + "grad_norm": 0.5119838559572328, + "learning_rate": 3.433884047147762e-06, + "loss": 0.46, + "step": 7810 + }, + { + "epoch": 0.61, + "grad_norm": 0.5606909319682777, + "learning_rate": 3.432676298344597e-06, + "loss": 0.4983, + "step": 7811 + }, + { + "epoch": 0.61, + "grad_norm": 1.8030241788304848, + "learning_rate": 3.4314686509452245e-06, + "loss": 0.4274, + "step": 7812 + }, + { + "epoch": 0.61, + "grad_norm": 1.6970846204874446, + "learning_rate": 3.430261105027779e-06, + "loss": 0.4511, + "step": 7813 + }, + { + "epoch": 0.61, + "grad_norm": 1.3881742015479352, + "learning_rate": 3.429053660670385e-06, + "loss": 0.4914, + "step": 7814 + }, + { + "epoch": 0.61, + "grad_norm": 1.9865297891816587, + "learning_rate": 3.4278463179511657e-06, + "loss": 0.5169, + "step": 7815 + }, + { + "epoch": 0.61, + "grad_norm": 3.260625087334255, + "learning_rate": 3.4266390769482316e-06, + "loss": 0.497, + "step": 7816 + }, + { + "epoch": 0.61, + "grad_norm": 0.5311746370469715, + "learning_rate": 3.4254319377396927e-06, + "loss": 0.4736, + "step": 7817 + }, + { + "epoch": 0.61, + "grad_norm": 0.5376192084856646, + "learning_rate": 3.4242249004036466e-06, + "loss": 0.484, + "step": 7818 + }, + { + "epoch": 0.61, + "grad_norm": 1.7203093064686812, + "learning_rate": 3.423017965018186e-06, + "loss": 0.4591, + "step": 7819 + }, + { + "epoch": 0.61, + "grad_norm": 1.752747589535994, + "learning_rate": 3.4218111316614042e-06, + "loss": 0.4442, + "step": 7820 + }, + { + "epoch": 0.61, + "grad_norm": 1.6260672806602947, + "learning_rate": 3.420604400411374e-06, + "loss": 0.3975, + "step": 7821 + }, + { + "epoch": 0.61, + "grad_norm": 1.7803915612396886, + "learning_rate": 3.4193977713461767e-06, + "loss": 0.4526, + "step": 7822 + }, + { + "epoch": 0.61, + "grad_norm": 2.1770460797315025, + "learning_rate": 3.4181912445438724e-06, + "loss": 0.4245, + "step": 7823 + }, + { + "epoch": 0.61, + "grad_norm": 0.6507188981396902, + "learning_rate": 3.4169848200825286e-06, + "loss": 0.4871, + "step": 7824 + }, + { + "epoch": 0.61, + "grad_norm": 1.6476040446734745, + "learning_rate": 3.415778498040192e-06, + "loss": 0.453, + "step": 7825 + }, + { + "epoch": 0.61, + "grad_norm": 1.6277957584157905, + "learning_rate": 3.4145722784949174e-06, + "loss": 0.4546, + "step": 7826 + }, + { + "epoch": 0.61, + "grad_norm": 1.975527833231104, + "learning_rate": 3.413366161524741e-06, + "loss": 0.5058, + "step": 7827 + }, + { + "epoch": 0.61, + "grad_norm": 1.7755100329824016, + "learning_rate": 3.4121601472076993e-06, + "loss": 0.4463, + "step": 7828 + }, + { + "epoch": 0.61, + "grad_norm": 1.977926139424973, + "learning_rate": 3.4109542356218185e-06, + "loss": 0.407, + "step": 7829 + }, + { + "epoch": 0.61, + "grad_norm": 1.743118723634884, + "learning_rate": 3.4097484268451185e-06, + "loss": 0.509, + "step": 7830 + }, + { + "epoch": 0.62, + "grad_norm": 2.356843936179444, + "learning_rate": 3.4085427209556175e-06, + "loss": 0.5006, + "step": 7831 + }, + { + "epoch": 0.62, + "grad_norm": 7.865525156171202, + "learning_rate": 3.4073371180313188e-06, + "loss": 0.4449, + "step": 7832 + }, + { + "epoch": 0.62, + "grad_norm": 0.5751941610924689, + "learning_rate": 3.4061316181502264e-06, + "loss": 0.4839, + "step": 7833 + }, + { + "epoch": 0.62, + "grad_norm": 0.5542393068656261, + "learning_rate": 3.404926221390332e-06, + "loss": 0.4946, + "step": 7834 + }, + { + "epoch": 0.62, + "grad_norm": 1.4082272471765855, + "learning_rate": 3.403720927829626e-06, + "loss": 0.4367, + "step": 7835 + }, + { + "epoch": 0.62, + "grad_norm": 2.180813954898519, + "learning_rate": 3.402515737546086e-06, + "loss": 0.4259, + "step": 7836 + }, + { + "epoch": 0.62, + "grad_norm": 1.42760682988305, + "learning_rate": 3.401310650617689e-06, + "loss": 0.4654, + "step": 7837 + }, + { + "epoch": 0.62, + "grad_norm": 4.389878005630295, + "learning_rate": 3.4001056671224007e-06, + "loss": 0.4532, + "step": 7838 + }, + { + "epoch": 0.62, + "grad_norm": 1.9018939273968047, + "learning_rate": 3.398900787138184e-06, + "loss": 0.4266, + "step": 7839 + }, + { + "epoch": 0.62, + "grad_norm": 2.6220294381130254, + "learning_rate": 3.39769601074299e-06, + "loss": 0.493, + "step": 7840 + }, + { + "epoch": 0.62, + "grad_norm": 0.5580460209290257, + "learning_rate": 3.396491338014769e-06, + "loss": 0.4951, + "step": 7841 + }, + { + "epoch": 0.62, + "grad_norm": 1.7355362742564588, + "learning_rate": 3.395286769031462e-06, + "loss": 0.4631, + "step": 7842 + }, + { + "epoch": 0.62, + "grad_norm": 1.909112107128007, + "learning_rate": 3.3940823038710003e-06, + "loss": 0.4539, + "step": 7843 + }, + { + "epoch": 0.62, + "grad_norm": 2.139015606390699, + "learning_rate": 3.392877942611314e-06, + "loss": 0.5055, + "step": 7844 + }, + { + "epoch": 0.62, + "grad_norm": 0.5704977562964734, + "learning_rate": 3.391673685330321e-06, + "loss": 0.4799, + "step": 7845 + }, + { + "epoch": 0.62, + "grad_norm": 1.6495593272371447, + "learning_rate": 3.3904695321059384e-06, + "loss": 0.4855, + "step": 7846 + }, + { + "epoch": 0.62, + "grad_norm": 1.6863620093371725, + "learning_rate": 3.3892654830160696e-06, + "loss": 0.4663, + "step": 7847 + }, + { + "epoch": 0.62, + "grad_norm": 0.5283551907601328, + "learning_rate": 3.388061538138619e-06, + "loss": 0.475, + "step": 7848 + }, + { + "epoch": 0.62, + "grad_norm": 0.5507922938811267, + "learning_rate": 3.3868576975514768e-06, + "loss": 0.4668, + "step": 7849 + }, + { + "epoch": 0.62, + "grad_norm": 2.081323072902518, + "learning_rate": 3.3856539613325322e-06, + "loss": 0.5799, + "step": 7850 + }, + { + "epoch": 0.62, + "grad_norm": 1.8406038584266533, + "learning_rate": 3.384450329559663e-06, + "loss": 0.4716, + "step": 7851 + }, + { + "epoch": 0.62, + "grad_norm": 1.6744057782957944, + "learning_rate": 3.3832468023107444e-06, + "loss": 0.4799, + "step": 7852 + }, + { + "epoch": 0.62, + "grad_norm": 2.040029528342024, + "learning_rate": 3.3820433796636443e-06, + "loss": 0.5024, + "step": 7853 + }, + { + "epoch": 0.62, + "grad_norm": 1.9859895638590808, + "learning_rate": 3.3808400616962186e-06, + "loss": 0.4092, + "step": 7854 + }, + { + "epoch": 0.62, + "grad_norm": 0.5933296790648138, + "learning_rate": 3.3796368484863253e-06, + "loss": 0.4938, + "step": 7855 + }, + { + "epoch": 0.62, + "grad_norm": 1.8712467601487086, + "learning_rate": 3.3784337401118055e-06, + "loss": 0.4442, + "step": 7856 + }, + { + "epoch": 0.62, + "grad_norm": 0.5791950781624288, + "learning_rate": 3.3772307366505045e-06, + "loss": 0.4936, + "step": 7857 + }, + { + "epoch": 0.62, + "grad_norm": 2.531033330104198, + "learning_rate": 3.376027838180249e-06, + "loss": 0.4727, + "step": 7858 + }, + { + "epoch": 0.62, + "grad_norm": 3.5337493463100933, + "learning_rate": 3.374825044778871e-06, + "loss": 0.4834, + "step": 7859 + }, + { + "epoch": 0.62, + "grad_norm": 2.020548936747588, + "learning_rate": 3.3736223565241838e-06, + "loss": 0.4177, + "step": 7860 + }, + { + "epoch": 0.62, + "grad_norm": 1.798624412810488, + "learning_rate": 3.3724197734940046e-06, + "loss": 0.4519, + "step": 7861 + }, + { + "epoch": 0.62, + "grad_norm": 1.9232709864825064, + "learning_rate": 3.3712172957661355e-06, + "loss": 0.4383, + "step": 7862 + }, + { + "epoch": 0.62, + "grad_norm": 1.7135987048794163, + "learning_rate": 3.3700149234183766e-06, + "loss": 0.5133, + "step": 7863 + }, + { + "epoch": 0.62, + "grad_norm": 1.6250844449972464, + "learning_rate": 3.368812656528521e-06, + "loss": 0.4448, + "step": 7864 + }, + { + "epoch": 0.62, + "grad_norm": 1.9731809579643913, + "learning_rate": 3.3676104951743514e-06, + "loss": 0.4484, + "step": 7865 + }, + { + "epoch": 0.62, + "grad_norm": 1.5124774234093759, + "learning_rate": 3.3664084394336494e-06, + "loss": 0.4165, + "step": 7866 + }, + { + "epoch": 0.62, + "grad_norm": 0.5933597967894093, + "learning_rate": 3.3652064893841825e-06, + "loss": 0.4702, + "step": 7867 + }, + { + "epoch": 0.62, + "grad_norm": 2.2325321504664326, + "learning_rate": 3.3640046451037185e-06, + "loss": 0.4463, + "step": 7868 + }, + { + "epoch": 0.62, + "grad_norm": 1.6516801948782192, + "learning_rate": 3.3628029066700123e-06, + "loss": 0.4821, + "step": 7869 + }, + { + "epoch": 0.62, + "grad_norm": 0.5375150685233474, + "learning_rate": 3.361601274160817e-06, + "loss": 0.4944, + "step": 7870 + }, + { + "epoch": 0.62, + "grad_norm": 1.9222730752437804, + "learning_rate": 3.3603997476538765e-06, + "loss": 0.4826, + "step": 7871 + }, + { + "epoch": 0.62, + "grad_norm": 2.067986319574155, + "learning_rate": 3.3591983272269272e-06, + "loss": 0.4259, + "step": 7872 + }, + { + "epoch": 0.62, + "grad_norm": 2.1019939910543015, + "learning_rate": 3.3579970129576987e-06, + "loss": 0.4327, + "step": 7873 + }, + { + "epoch": 0.62, + "grad_norm": 0.5445308673528748, + "learning_rate": 3.3567958049239154e-06, + "loss": 0.4822, + "step": 7874 + }, + { + "epoch": 0.62, + "grad_norm": 1.8008671060584631, + "learning_rate": 3.3555947032032956e-06, + "loss": 0.4652, + "step": 7875 + }, + { + "epoch": 0.62, + "grad_norm": 0.5415305989542187, + "learning_rate": 3.3543937078735457e-06, + "loss": 0.4675, + "step": 7876 + }, + { + "epoch": 0.62, + "grad_norm": 1.5604636611272438, + "learning_rate": 3.353192819012371e-06, + "loss": 0.4277, + "step": 7877 + }, + { + "epoch": 0.62, + "grad_norm": 1.6834465887369972, + "learning_rate": 3.3519920366974666e-06, + "loss": 0.4596, + "step": 7878 + }, + { + "epoch": 0.62, + "grad_norm": 1.7823758831624599, + "learning_rate": 3.3507913610065222e-06, + "loss": 0.4299, + "step": 7879 + }, + { + "epoch": 0.62, + "grad_norm": 3.0523132287458985, + "learning_rate": 3.3495907920172178e-06, + "loss": 0.4504, + "step": 7880 + }, + { + "epoch": 0.62, + "grad_norm": 1.742681823469371, + "learning_rate": 3.3483903298072317e-06, + "loss": 0.4517, + "step": 7881 + }, + { + "epoch": 0.62, + "grad_norm": 1.9376002305963953, + "learning_rate": 3.3471899744542296e-06, + "loss": 0.4936, + "step": 7882 + }, + { + "epoch": 0.62, + "grad_norm": 2.7434685830135765, + "learning_rate": 3.3459897260358753e-06, + "loss": 0.4754, + "step": 7883 + }, + { + "epoch": 0.62, + "grad_norm": 1.9164261097584991, + "learning_rate": 3.3447895846298207e-06, + "loss": 0.469, + "step": 7884 + }, + { + "epoch": 0.62, + "grad_norm": 2.144855079105593, + "learning_rate": 3.3435895503137156e-06, + "loss": 0.5029, + "step": 7885 + }, + { + "epoch": 0.62, + "grad_norm": 1.6841579489602816, + "learning_rate": 3.3423896231651987e-06, + "loss": 0.4398, + "step": 7886 + }, + { + "epoch": 0.62, + "grad_norm": 1.5998234698716693, + "learning_rate": 3.341189803261905e-06, + "loss": 0.4492, + "step": 7887 + }, + { + "epoch": 0.62, + "grad_norm": 1.581806227791593, + "learning_rate": 3.3399900906814618e-06, + "loss": 0.4437, + "step": 7888 + }, + { + "epoch": 0.62, + "grad_norm": 1.9191970825442766, + "learning_rate": 3.3387904855014865e-06, + "loss": 0.4437, + "step": 7889 + }, + { + "epoch": 0.62, + "grad_norm": 1.4261573668559586, + "learning_rate": 3.337590987799596e-06, + "loss": 0.4366, + "step": 7890 + }, + { + "epoch": 0.62, + "grad_norm": 2.2741061695266542, + "learning_rate": 3.3363915976533913e-06, + "loss": 0.4331, + "step": 7891 + }, + { + "epoch": 0.62, + "grad_norm": 1.7849203227975985, + "learning_rate": 3.335192315140477e-06, + "loss": 0.4371, + "step": 7892 + }, + { + "epoch": 0.62, + "grad_norm": 2.0927305285455735, + "learning_rate": 3.333993140338439e-06, + "loss": 0.4729, + "step": 7893 + }, + { + "epoch": 0.62, + "grad_norm": 3.4389659112552584, + "learning_rate": 3.3327940733248686e-06, + "loss": 0.4625, + "step": 7894 + }, + { + "epoch": 0.62, + "grad_norm": 4.581694530153472, + "learning_rate": 3.3315951141773372e-06, + "loss": 0.4346, + "step": 7895 + }, + { + "epoch": 0.62, + "grad_norm": 1.7276496191562274, + "learning_rate": 3.330396262973423e-06, + "loss": 0.4297, + "step": 7896 + }, + { + "epoch": 0.62, + "grad_norm": 5.372300333855026, + "learning_rate": 3.3291975197906826e-06, + "loss": 0.4566, + "step": 7897 + }, + { + "epoch": 0.62, + "grad_norm": 2.910193767981382, + "learning_rate": 3.327998884706679e-06, + "loss": 0.4835, + "step": 7898 + }, + { + "epoch": 0.62, + "grad_norm": 2.442835003729624, + "learning_rate": 3.3268003577989604e-06, + "loss": 0.4281, + "step": 7899 + }, + { + "epoch": 0.62, + "grad_norm": 0.5549435760047432, + "learning_rate": 3.3256019391450696e-06, + "loss": 0.4895, + "step": 7900 + }, + { + "epoch": 0.62, + "grad_norm": 2.0587156608800146, + "learning_rate": 3.3244036288225434e-06, + "loss": 0.4365, + "step": 7901 + }, + { + "epoch": 0.62, + "grad_norm": 1.8609002645804358, + "learning_rate": 3.3232054269089098e-06, + "loss": 0.4869, + "step": 7902 + }, + { + "epoch": 0.62, + "grad_norm": 0.6449230986259503, + "learning_rate": 3.3220073334816928e-06, + "loss": 0.4994, + "step": 7903 + }, + { + "epoch": 0.62, + "grad_norm": 1.374026547723643, + "learning_rate": 3.3208093486184044e-06, + "loss": 0.4406, + "step": 7904 + }, + { + "epoch": 0.62, + "grad_norm": 1.5552235253148115, + "learning_rate": 3.3196114723965563e-06, + "loss": 0.4867, + "step": 7905 + }, + { + "epoch": 0.62, + "grad_norm": 2.9794255584994165, + "learning_rate": 3.318413704893646e-06, + "loss": 0.4768, + "step": 7906 + }, + { + "epoch": 0.62, + "grad_norm": 2.2349932156156322, + "learning_rate": 3.3172160461871707e-06, + "loss": 0.4171, + "step": 7907 + }, + { + "epoch": 0.62, + "grad_norm": 2.574007186470273, + "learning_rate": 3.316018496354615e-06, + "loss": 0.4843, + "step": 7908 + }, + { + "epoch": 0.62, + "grad_norm": 1.8306596311851202, + "learning_rate": 3.3148210554734582e-06, + "loss": 0.4141, + "step": 7909 + }, + { + "epoch": 0.62, + "grad_norm": 1.8812035877703819, + "learning_rate": 3.3136237236211764e-06, + "loss": 0.4153, + "step": 7910 + }, + { + "epoch": 0.62, + "grad_norm": 1.5294741054654777, + "learning_rate": 3.312426500875233e-06, + "loss": 0.4559, + "step": 7911 + }, + { + "epoch": 0.62, + "grad_norm": 1.6202595309526575, + "learning_rate": 3.3112293873130885e-06, + "loss": 0.4442, + "step": 7912 + }, + { + "epoch": 0.62, + "grad_norm": 2.0492215835956067, + "learning_rate": 3.310032383012193e-06, + "loss": 0.457, + "step": 7913 + }, + { + "epoch": 0.62, + "grad_norm": 0.5840852166249517, + "learning_rate": 3.3088354880499914e-06, + "loss": 0.4918, + "step": 7914 + }, + { + "epoch": 0.62, + "grad_norm": 1.6449017353327084, + "learning_rate": 3.307638702503921e-06, + "loss": 0.4179, + "step": 7915 + }, + { + "epoch": 0.62, + "grad_norm": 2.079641154305045, + "learning_rate": 3.3064420264514135e-06, + "loss": 0.4878, + "step": 7916 + }, + { + "epoch": 0.62, + "grad_norm": 1.7624127495138453, + "learning_rate": 3.3052454599698904e-06, + "loss": 0.4971, + "step": 7917 + }, + { + "epoch": 0.62, + "grad_norm": 1.5915910402949223, + "learning_rate": 3.30404900313677e-06, + "loss": 0.509, + "step": 7918 + }, + { + "epoch": 0.62, + "grad_norm": 1.4128039550572555, + "learning_rate": 3.3028526560294593e-06, + "loss": 0.4692, + "step": 7919 + }, + { + "epoch": 0.62, + "grad_norm": 2.9023950105550256, + "learning_rate": 3.3016564187253623e-06, + "loss": 0.4534, + "step": 7920 + }, + { + "epoch": 0.62, + "grad_norm": 0.5172693137827169, + "learning_rate": 3.300460291301874e-06, + "loss": 0.5055, + "step": 7921 + }, + { + "epoch": 0.62, + "grad_norm": 1.4730317074876444, + "learning_rate": 3.2992642738363805e-06, + "loss": 0.4213, + "step": 7922 + }, + { + "epoch": 0.62, + "grad_norm": 1.8716503281730743, + "learning_rate": 3.298068366406265e-06, + "loss": 0.4746, + "step": 7923 + }, + { + "epoch": 0.62, + "grad_norm": 0.5640836734216811, + "learning_rate": 3.296872569088897e-06, + "loss": 0.4988, + "step": 7924 + }, + { + "epoch": 0.62, + "grad_norm": 1.6717212336333918, + "learning_rate": 3.295676881961649e-06, + "loss": 0.4428, + "step": 7925 + }, + { + "epoch": 0.62, + "grad_norm": 1.8771564598046564, + "learning_rate": 3.294481305101875e-06, + "loss": 0.4322, + "step": 7926 + }, + { + "epoch": 0.62, + "grad_norm": 1.6689898712788251, + "learning_rate": 3.2932858385869315e-06, + "loss": 0.4098, + "step": 7927 + }, + { + "epoch": 0.62, + "grad_norm": 2.45927235227874, + "learning_rate": 3.2920904824941592e-06, + "loss": 0.4255, + "step": 7928 + }, + { + "epoch": 0.62, + "grad_norm": 1.5211620052168389, + "learning_rate": 3.290895236900902e-06, + "loss": 0.4163, + "step": 7929 + }, + { + "epoch": 0.62, + "grad_norm": 1.5390395830382881, + "learning_rate": 3.2897001018844827e-06, + "loss": 0.4736, + "step": 7930 + }, + { + "epoch": 0.62, + "grad_norm": 2.0206430615475557, + "learning_rate": 3.2885050775222314e-06, + "loss": 0.4692, + "step": 7931 + }, + { + "epoch": 0.62, + "grad_norm": 1.4355797423529686, + "learning_rate": 3.2873101638914636e-06, + "loss": 0.4355, + "step": 7932 + }, + { + "epoch": 0.62, + "grad_norm": 1.7363073947224397, + "learning_rate": 3.2861153610694862e-06, + "loss": 0.4201, + "step": 7933 + }, + { + "epoch": 0.62, + "grad_norm": 1.9782500730005965, + "learning_rate": 3.284920669133604e-06, + "loss": 0.399, + "step": 7934 + }, + { + "epoch": 0.62, + "grad_norm": 2.7056295425714607, + "learning_rate": 3.2837260881611105e-06, + "loss": 0.476, + "step": 7935 + }, + { + "epoch": 0.62, + "grad_norm": 1.9469819569499474, + "learning_rate": 3.2825316182292944e-06, + "loss": 0.461, + "step": 7936 + }, + { + "epoch": 0.62, + "grad_norm": 1.4189068331601302, + "learning_rate": 3.2813372594154353e-06, + "loss": 0.4606, + "step": 7937 + }, + { + "epoch": 0.62, + "grad_norm": 1.7573392982079843, + "learning_rate": 3.280143011796808e-06, + "loss": 0.4559, + "step": 7938 + }, + { + "epoch": 0.62, + "grad_norm": 1.3430798599108236, + "learning_rate": 3.278948875450677e-06, + "loss": 0.4122, + "step": 7939 + }, + { + "epoch": 0.62, + "grad_norm": 1.614049877397535, + "learning_rate": 3.2777548504543033e-06, + "loss": 0.4883, + "step": 7940 + }, + { + "epoch": 0.62, + "grad_norm": 1.6703984558381462, + "learning_rate": 3.276560936884937e-06, + "loss": 0.3988, + "step": 7941 + }, + { + "epoch": 0.62, + "grad_norm": 2.205563079257479, + "learning_rate": 3.2753671348198225e-06, + "loss": 0.481, + "step": 7942 + }, + { + "epoch": 0.62, + "grad_norm": 1.922796140964399, + "learning_rate": 3.2741734443362007e-06, + "loss": 0.4399, + "step": 7943 + }, + { + "epoch": 0.62, + "grad_norm": 0.5353179007416976, + "learning_rate": 3.2729798655112966e-06, + "loss": 0.4797, + "step": 7944 + }, + { + "epoch": 0.62, + "grad_norm": 1.7840068548268462, + "learning_rate": 3.2717863984223384e-06, + "loss": 0.4758, + "step": 7945 + }, + { + "epoch": 0.62, + "grad_norm": 1.6810472733731971, + "learning_rate": 3.2705930431465383e-06, + "loss": 0.421, + "step": 7946 + }, + { + "epoch": 0.62, + "grad_norm": 1.443972184812371, + "learning_rate": 3.269399799761107e-06, + "loss": 0.4901, + "step": 7947 + }, + { + "epoch": 0.62, + "grad_norm": 1.8276815718515595, + "learning_rate": 3.2682066683432435e-06, + "loss": 0.4641, + "step": 7948 + }, + { + "epoch": 0.62, + "grad_norm": 1.9253746051345912, + "learning_rate": 3.2670136489701443e-06, + "loss": 0.4825, + "step": 7949 + }, + { + "epoch": 0.62, + "grad_norm": 1.8683854126505743, + "learning_rate": 3.2658207417189936e-06, + "loss": 0.456, + "step": 7950 + }, + { + "epoch": 0.62, + "grad_norm": 2.5639043127375922, + "learning_rate": 3.264627946666974e-06, + "loss": 0.4997, + "step": 7951 + }, + { + "epoch": 0.62, + "grad_norm": 0.5654546389418793, + "learning_rate": 3.263435263891255e-06, + "loss": 0.4731, + "step": 7952 + }, + { + "epoch": 0.62, + "grad_norm": 1.6031647601100334, + "learning_rate": 3.262242693469003e-06, + "loss": 0.4737, + "step": 7953 + }, + { + "epoch": 0.62, + "grad_norm": 1.6138786055201226, + "learning_rate": 3.261050235477375e-06, + "loss": 0.4761, + "step": 7954 + }, + { + "epoch": 0.62, + "grad_norm": 1.6477527683899091, + "learning_rate": 3.2598578899935217e-06, + "loss": 0.4499, + "step": 7955 + }, + { + "epoch": 0.62, + "grad_norm": 1.476899503814041, + "learning_rate": 3.2586656570945874e-06, + "loss": 0.4755, + "step": 7956 + }, + { + "epoch": 0.62, + "grad_norm": 0.5424033250130103, + "learning_rate": 3.2574735368577067e-06, + "loss": 0.4814, + "step": 7957 + }, + { + "epoch": 0.62, + "grad_norm": 1.9999283100451462, + "learning_rate": 3.256281529360009e-06, + "loss": 0.452, + "step": 7958 + }, + { + "epoch": 0.63, + "grad_norm": 1.9318026283154304, + "learning_rate": 3.255089634678614e-06, + "loss": 0.4355, + "step": 7959 + }, + { + "epoch": 0.63, + "grad_norm": 1.5149372586078518, + "learning_rate": 3.2538978528906396e-06, + "loss": 0.4025, + "step": 7960 + }, + { + "epoch": 0.63, + "grad_norm": 1.8710454965261167, + "learning_rate": 3.252706184073187e-06, + "loss": 0.4767, + "step": 7961 + }, + { + "epoch": 0.63, + "grad_norm": 1.441051841179274, + "learning_rate": 3.2515146283033617e-06, + "loss": 0.508, + "step": 7962 + }, + { + "epoch": 0.63, + "grad_norm": 2.296848851092269, + "learning_rate": 3.250323185658249e-06, + "loss": 0.4957, + "step": 7963 + }, + { + "epoch": 0.63, + "grad_norm": 1.4079413532839486, + "learning_rate": 3.249131856214941e-06, + "loss": 0.4179, + "step": 7964 + }, + { + "epoch": 0.63, + "grad_norm": 1.5754152520870217, + "learning_rate": 3.2479406400505076e-06, + "loss": 0.5209, + "step": 7965 + }, + { + "epoch": 0.63, + "grad_norm": 1.5213211066753851, + "learning_rate": 3.246749537242024e-06, + "loss": 0.4211, + "step": 7966 + }, + { + "epoch": 0.63, + "grad_norm": 1.9193424232770036, + "learning_rate": 3.245558547866554e-06, + "loss": 0.4788, + "step": 7967 + }, + { + "epoch": 0.63, + "grad_norm": 1.289393415635444, + "learning_rate": 3.2443676720011493e-06, + "loss": 0.3686, + "step": 7968 + }, + { + "epoch": 0.63, + "grad_norm": 0.5650251477823203, + "learning_rate": 3.2431769097228606e-06, + "loss": 0.4949, + "step": 7969 + }, + { + "epoch": 0.63, + "grad_norm": 1.6519145550502334, + "learning_rate": 3.241986261108726e-06, + "loss": 0.4008, + "step": 7970 + }, + { + "epoch": 0.63, + "grad_norm": 1.992369713104297, + "learning_rate": 3.240795726235782e-06, + "loss": 0.411, + "step": 7971 + }, + { + "epoch": 0.63, + "grad_norm": 1.9390246926125874, + "learning_rate": 3.2396053051810515e-06, + "loss": 0.4997, + "step": 7972 + }, + { + "epoch": 0.63, + "grad_norm": 1.3843742242235517, + "learning_rate": 3.238414998021557e-06, + "loss": 0.4668, + "step": 7973 + }, + { + "epoch": 0.63, + "grad_norm": 2.120412494402407, + "learning_rate": 3.2372248048343057e-06, + "loss": 0.448, + "step": 7974 + }, + { + "epoch": 0.63, + "grad_norm": 1.4194845590102416, + "learning_rate": 3.2360347256963052e-06, + "loss": 0.4875, + "step": 7975 + }, + { + "epoch": 0.63, + "grad_norm": 2.1248095422017603, + "learning_rate": 3.234844760684549e-06, + "loss": 0.4697, + "step": 7976 + }, + { + "epoch": 0.63, + "grad_norm": 1.9878960217134987, + "learning_rate": 3.233654909876027e-06, + "loss": 0.4531, + "step": 7977 + }, + { + "epoch": 0.63, + "grad_norm": 1.4744616690432975, + "learning_rate": 3.2324651733477245e-06, + "loss": 0.4368, + "step": 7978 + }, + { + "epoch": 0.63, + "grad_norm": 1.7841976390906658, + "learning_rate": 3.2312755511766104e-06, + "loss": 0.4787, + "step": 7979 + }, + { + "epoch": 0.63, + "grad_norm": 2.014484077126986, + "learning_rate": 3.230086043439656e-06, + "loss": 0.4394, + "step": 7980 + }, + { + "epoch": 0.63, + "grad_norm": 2.0449379572200406, + "learning_rate": 3.2288966502138187e-06, + "loss": 0.4092, + "step": 7981 + }, + { + "epoch": 0.63, + "grad_norm": 1.5871955385110332, + "learning_rate": 3.2277073715760508e-06, + "loss": 0.4131, + "step": 7982 + }, + { + "epoch": 0.63, + "grad_norm": 1.5920279765919743, + "learning_rate": 3.226518207603298e-06, + "loss": 0.4848, + "step": 7983 + }, + { + "epoch": 0.63, + "grad_norm": 1.6359311327224952, + "learning_rate": 3.2253291583724975e-06, + "loss": 0.4059, + "step": 7984 + }, + { + "epoch": 0.63, + "grad_norm": 0.5277722991203128, + "learning_rate": 3.2241402239605786e-06, + "loss": 0.4797, + "step": 7985 + }, + { + "epoch": 0.63, + "grad_norm": 2.244064979551664, + "learning_rate": 3.2229514044444642e-06, + "loss": 0.4659, + "step": 7986 + }, + { + "epoch": 0.63, + "grad_norm": 0.5458962162818889, + "learning_rate": 3.2217626999010687e-06, + "loss": 0.4857, + "step": 7987 + }, + { + "epoch": 0.63, + "grad_norm": 0.5918979008661037, + "learning_rate": 3.2205741104073e-06, + "loss": 0.4855, + "step": 7988 + }, + { + "epoch": 0.63, + "grad_norm": 1.6562618828911972, + "learning_rate": 3.21938563604006e-06, + "loss": 0.4953, + "step": 7989 + }, + { + "epoch": 0.63, + "grad_norm": 1.8316000382083715, + "learning_rate": 3.218197276876239e-06, + "loss": 0.4632, + "step": 7990 + }, + { + "epoch": 0.63, + "grad_norm": 1.8533171472734944, + "learning_rate": 3.217009032992724e-06, + "loss": 0.4905, + "step": 7991 + }, + { + "epoch": 0.63, + "grad_norm": 0.5431868150622786, + "learning_rate": 3.2158209044663914e-06, + "loss": 0.4732, + "step": 7992 + }, + { + "epoch": 0.63, + "grad_norm": 0.6060990819309332, + "learning_rate": 3.2146328913741134e-06, + "loss": 0.4774, + "step": 7993 + }, + { + "epoch": 0.63, + "grad_norm": 4.662887753681133, + "learning_rate": 3.2134449937927492e-06, + "loss": 0.4431, + "step": 7994 + }, + { + "epoch": 0.63, + "grad_norm": 1.8062566351055218, + "learning_rate": 3.2122572117991604e-06, + "loss": 0.4434, + "step": 7995 + }, + { + "epoch": 0.63, + "grad_norm": 1.4279412890223049, + "learning_rate": 3.211069545470187e-06, + "loss": 0.4133, + "step": 7996 + }, + { + "epoch": 0.63, + "grad_norm": 2.1378230697258975, + "learning_rate": 3.2098819948826787e-06, + "loss": 0.4815, + "step": 7997 + }, + { + "epoch": 0.63, + "grad_norm": 1.8181573222708045, + "learning_rate": 3.208694560113459e-06, + "loss": 0.4589, + "step": 7998 + }, + { + "epoch": 0.63, + "grad_norm": 2.0958304840534594, + "learning_rate": 3.2075072412393598e-06, + "loss": 0.4568, + "step": 7999 + }, + { + "epoch": 0.63, + "grad_norm": 0.5821864234010908, + "learning_rate": 3.2063200383371985e-06, + "loss": 0.4854, + "step": 8000 + }, + { + "epoch": 0.63, + "grad_norm": 1.391900787662937, + "learning_rate": 3.2051329514837827e-06, + "loss": 0.4533, + "step": 8001 + }, + { + "epoch": 0.63, + "grad_norm": 1.8159131729587703, + "learning_rate": 3.2039459807559183e-06, + "loss": 0.4724, + "step": 8002 + }, + { + "epoch": 0.63, + "grad_norm": 2.009238092162404, + "learning_rate": 3.2027591262303974e-06, + "loss": 0.4998, + "step": 8003 + }, + { + "epoch": 0.63, + "grad_norm": 1.4252764636384176, + "learning_rate": 3.201572387984012e-06, + "loss": 0.4433, + "step": 8004 + }, + { + "epoch": 0.63, + "grad_norm": 2.018465149736098, + "learning_rate": 3.200385766093539e-06, + "loss": 0.4605, + "step": 8005 + }, + { + "epoch": 0.63, + "grad_norm": 2.5426985878999355, + "learning_rate": 3.199199260635754e-06, + "loss": 0.4188, + "step": 8006 + }, + { + "epoch": 0.63, + "grad_norm": 1.2330984410304047, + "learning_rate": 3.1980128716874205e-06, + "loss": 0.46, + "step": 8007 + }, + { + "epoch": 0.63, + "grad_norm": 2.305929036362998, + "learning_rate": 3.1968265993252967e-06, + "loss": 0.4606, + "step": 8008 + }, + { + "epoch": 0.63, + "grad_norm": 2.671252549625628, + "learning_rate": 3.1956404436261333e-06, + "loss": 0.4148, + "step": 8009 + }, + { + "epoch": 0.63, + "grad_norm": 1.4690923419194715, + "learning_rate": 3.1944544046666715e-06, + "loss": 0.4506, + "step": 8010 + }, + { + "epoch": 0.63, + "grad_norm": 2.5490230250311785, + "learning_rate": 3.1932684825236504e-06, + "loss": 0.4588, + "step": 8011 + }, + { + "epoch": 0.63, + "grad_norm": 1.7944900908776351, + "learning_rate": 3.1920826772737916e-06, + "loss": 0.4948, + "step": 8012 + }, + { + "epoch": 0.63, + "grad_norm": 1.8714352414834001, + "learning_rate": 3.1908969889938224e-06, + "loss": 0.4797, + "step": 8013 + }, + { + "epoch": 0.63, + "grad_norm": 3.1637407287182735, + "learning_rate": 3.1897114177604483e-06, + "loss": 0.4783, + "step": 8014 + }, + { + "epoch": 0.63, + "grad_norm": 2.692237623732359, + "learning_rate": 3.1885259636503785e-06, + "loss": 0.4267, + "step": 8015 + }, + { + "epoch": 0.63, + "grad_norm": 4.496261233927252, + "learning_rate": 3.187340626740309e-06, + "loss": 0.4663, + "step": 8016 + }, + { + "epoch": 0.63, + "grad_norm": 1.6115033024535848, + "learning_rate": 3.1861554071069305e-06, + "loss": 0.3778, + "step": 8017 + }, + { + "epoch": 0.63, + "grad_norm": 3.5125823933719733, + "learning_rate": 3.184970304826923e-06, + "loss": 0.4488, + "step": 8018 + }, + { + "epoch": 0.63, + "grad_norm": 1.6241827244099758, + "learning_rate": 3.183785319976964e-06, + "loss": 0.4583, + "step": 8019 + }, + { + "epoch": 0.63, + "grad_norm": 1.4232988760178529, + "learning_rate": 3.182600452633717e-06, + "loss": 0.4403, + "step": 8020 + }, + { + "epoch": 0.63, + "grad_norm": 1.5565761774801736, + "learning_rate": 3.1814157028738447e-06, + "loss": 0.4374, + "step": 8021 + }, + { + "epoch": 0.63, + "grad_norm": 2.0081091714522756, + "learning_rate": 3.180231070773997e-06, + "loss": 0.4652, + "step": 8022 + }, + { + "epoch": 0.63, + "grad_norm": 1.81122017678456, + "learning_rate": 3.1790465564108185e-06, + "loss": 0.435, + "step": 8023 + }, + { + "epoch": 0.63, + "grad_norm": 1.9355948786756647, + "learning_rate": 3.177862159860946e-06, + "loss": 0.5227, + "step": 8024 + }, + { + "epoch": 0.63, + "grad_norm": 2.341033420041658, + "learning_rate": 3.1766778812010074e-06, + "loss": 0.4454, + "step": 8025 + }, + { + "epoch": 0.63, + "grad_norm": 2.693327908565767, + "learning_rate": 3.175493720507626e-06, + "loss": 0.4679, + "step": 8026 + }, + { + "epoch": 0.63, + "grad_norm": 1.6126328170916655, + "learning_rate": 3.174309677857412e-06, + "loss": 0.4641, + "step": 8027 + }, + { + "epoch": 0.63, + "grad_norm": 0.5479882850527806, + "learning_rate": 3.1731257533269753e-06, + "loss": 0.4954, + "step": 8028 + }, + { + "epoch": 0.63, + "grad_norm": 1.8022751356140543, + "learning_rate": 3.1719419469929106e-06, + "loss": 0.4255, + "step": 8029 + }, + { + "epoch": 0.63, + "grad_norm": 3.2481422930274566, + "learning_rate": 3.1707582589318135e-06, + "loss": 0.489, + "step": 8030 + }, + { + "epoch": 0.63, + "grad_norm": 1.718744611083982, + "learning_rate": 3.1695746892202613e-06, + "loss": 0.4121, + "step": 8031 + }, + { + "epoch": 0.63, + "grad_norm": 2.0009974172626177, + "learning_rate": 3.1683912379348346e-06, + "loss": 0.4638, + "step": 8032 + }, + { + "epoch": 0.63, + "grad_norm": 2.23184447424556, + "learning_rate": 3.1672079051520963e-06, + "loss": 0.4266, + "step": 8033 + }, + { + "epoch": 0.63, + "grad_norm": 1.5938644925610839, + "learning_rate": 3.16602469094861e-06, + "loss": 0.4337, + "step": 8034 + }, + { + "epoch": 0.63, + "grad_norm": 1.9317848396760005, + "learning_rate": 3.1648415954009278e-06, + "loss": 0.4429, + "step": 8035 + }, + { + "epoch": 0.63, + "grad_norm": 2.5146475565547917, + "learning_rate": 3.1636586185855923e-06, + "loss": 0.4407, + "step": 8036 + }, + { + "epoch": 0.63, + "grad_norm": 6.037298325627997, + "learning_rate": 3.1624757605791435e-06, + "loss": 0.4924, + "step": 8037 + }, + { + "epoch": 0.63, + "grad_norm": 1.6564079312427806, + "learning_rate": 3.161293021458108e-06, + "loss": 0.4264, + "step": 8038 + }, + { + "epoch": 0.63, + "grad_norm": 0.5724254414471611, + "learning_rate": 3.16011040129901e-06, + "loss": 0.4924, + "step": 8039 + }, + { + "epoch": 0.63, + "grad_norm": 0.5626876055955855, + "learning_rate": 3.15892790017836e-06, + "loss": 0.4699, + "step": 8040 + }, + { + "epoch": 0.63, + "grad_norm": 1.5152428500864998, + "learning_rate": 3.157745518172669e-06, + "loss": 0.4283, + "step": 8041 + }, + { + "epoch": 0.63, + "grad_norm": 2.3541681581131493, + "learning_rate": 3.1565632553584315e-06, + "loss": 0.4758, + "step": 8042 + }, + { + "epoch": 0.63, + "grad_norm": 1.8182511211798784, + "learning_rate": 3.1553811118121404e-06, + "loss": 0.4959, + "step": 8043 + }, + { + "epoch": 0.63, + "grad_norm": 1.6595722262963297, + "learning_rate": 3.1541990876102775e-06, + "loss": 0.4368, + "step": 8044 + }, + { + "epoch": 0.63, + "grad_norm": 1.916600925056693, + "learning_rate": 3.1530171828293177e-06, + "loss": 0.476, + "step": 8045 + }, + { + "epoch": 0.63, + "grad_norm": 0.5563422984011408, + "learning_rate": 3.151835397545734e-06, + "loss": 0.482, + "step": 8046 + }, + { + "epoch": 0.63, + "grad_norm": 1.6898001373334064, + "learning_rate": 3.1506537318359785e-06, + "loss": 0.4936, + "step": 8047 + }, + { + "epoch": 0.63, + "grad_norm": 3.379803320583547, + "learning_rate": 3.149472185776511e-06, + "loss": 0.404, + "step": 8048 + }, + { + "epoch": 0.63, + "grad_norm": 3.2505078527126456, + "learning_rate": 3.1482907594437682e-06, + "loss": 0.4348, + "step": 8049 + }, + { + "epoch": 0.63, + "grad_norm": 2.56534642167107, + "learning_rate": 3.147109452914195e-06, + "loss": 0.4742, + "step": 8050 + }, + { + "epoch": 0.63, + "grad_norm": 0.5524798132394851, + "learning_rate": 3.1459282662642125e-06, + "loss": 0.4895, + "step": 8051 + }, + { + "epoch": 0.63, + "grad_norm": 1.9226437449692777, + "learning_rate": 3.1447471995702484e-06, + "loss": 0.4338, + "step": 8052 + }, + { + "epoch": 0.63, + "grad_norm": 1.7843342537121427, + "learning_rate": 3.143566252908712e-06, + "loss": 0.4555, + "step": 8053 + }, + { + "epoch": 0.63, + "grad_norm": 0.5389714770665909, + "learning_rate": 3.142385426356012e-06, + "loss": 0.4771, + "step": 8054 + }, + { + "epoch": 0.63, + "grad_norm": 1.6747314127824016, + "learning_rate": 3.1412047199885433e-06, + "loss": 0.4711, + "step": 8055 + }, + { + "epoch": 0.63, + "grad_norm": 0.541902146971272, + "learning_rate": 3.1400241338826983e-06, + "loss": 0.4951, + "step": 8056 + }, + { + "epoch": 0.63, + "grad_norm": 1.5259650724766551, + "learning_rate": 3.1388436681148593e-06, + "loss": 0.4333, + "step": 8057 + }, + { + "epoch": 0.63, + "grad_norm": 0.5139520861642124, + "learning_rate": 3.137663322761399e-06, + "loss": 0.4646, + "step": 8058 + }, + { + "epoch": 0.63, + "grad_norm": 0.5510474395591083, + "learning_rate": 3.136483097898687e-06, + "loss": 0.4805, + "step": 8059 + }, + { + "epoch": 0.63, + "grad_norm": 0.5545211114698652, + "learning_rate": 3.1353029936030794e-06, + "loss": 0.5051, + "step": 8060 + }, + { + "epoch": 0.63, + "grad_norm": 1.3427772244684106, + "learning_rate": 3.13412300995093e-06, + "loss": 0.4181, + "step": 8061 + }, + { + "epoch": 0.63, + "grad_norm": 1.7862540897424357, + "learning_rate": 3.13294314701858e-06, + "loss": 0.472, + "step": 8062 + }, + { + "epoch": 0.63, + "grad_norm": 2.0012784823422267, + "learning_rate": 3.1317634048823665e-06, + "loss": 0.4313, + "step": 8063 + }, + { + "epoch": 0.63, + "grad_norm": 1.9447488432758286, + "learning_rate": 3.1305837836186147e-06, + "loss": 0.4518, + "step": 8064 + }, + { + "epoch": 0.63, + "grad_norm": 1.7511068712778122, + "learning_rate": 3.1294042833036508e-06, + "loss": 0.4662, + "step": 8065 + }, + { + "epoch": 0.63, + "grad_norm": 0.6246670455096237, + "learning_rate": 3.1282249040137784e-06, + "loss": 0.4946, + "step": 8066 + }, + { + "epoch": 0.63, + "grad_norm": 1.9213332752226853, + "learning_rate": 3.127045645825308e-06, + "loss": 0.505, + "step": 8067 + }, + { + "epoch": 0.63, + "grad_norm": 1.5316323389931839, + "learning_rate": 3.1258665088145345e-06, + "loss": 0.4271, + "step": 8068 + }, + { + "epoch": 0.63, + "grad_norm": 2.2221898538928326, + "learning_rate": 3.124687493057746e-06, + "loss": 0.5094, + "step": 8069 + }, + { + "epoch": 0.63, + "grad_norm": 1.998998066744559, + "learning_rate": 3.123508598631224e-06, + "loss": 0.4175, + "step": 8070 + }, + { + "epoch": 0.63, + "grad_norm": 1.6910597003026633, + "learning_rate": 3.12232982561124e-06, + "loss": 0.4154, + "step": 8071 + }, + { + "epoch": 0.63, + "grad_norm": 2.50627238938155, + "learning_rate": 3.1211511740740607e-06, + "loss": 0.513, + "step": 8072 + }, + { + "epoch": 0.63, + "grad_norm": 1.6611569410206988, + "learning_rate": 3.1199726440959414e-06, + "loss": 0.4707, + "step": 8073 + }, + { + "epoch": 0.63, + "grad_norm": 0.5404070059318264, + "learning_rate": 3.1187942357531343e-06, + "loss": 0.49, + "step": 8074 + }, + { + "epoch": 0.63, + "grad_norm": 2.0652982116200707, + "learning_rate": 3.1176159491218785e-06, + "loss": 0.5212, + "step": 8075 + }, + { + "epoch": 0.63, + "grad_norm": 1.5104505509760409, + "learning_rate": 3.116437784278408e-06, + "loss": 0.4528, + "step": 8076 + }, + { + "epoch": 0.63, + "grad_norm": 2.255184267332409, + "learning_rate": 3.1152597412989493e-06, + "loss": 0.4488, + "step": 8077 + }, + { + "epoch": 0.63, + "grad_norm": 3.6582545600955108, + "learning_rate": 3.1140818202597178e-06, + "loss": 0.4818, + "step": 8078 + }, + { + "epoch": 0.63, + "grad_norm": 2.194251705174403, + "learning_rate": 3.1129040212369286e-06, + "loss": 0.4302, + "step": 8079 + }, + { + "epoch": 0.63, + "grad_norm": 1.758176609347948, + "learning_rate": 3.111726344306778e-06, + "loss": 0.4358, + "step": 8080 + }, + { + "epoch": 0.63, + "grad_norm": 6.967028599070529, + "learning_rate": 3.1105487895454655e-06, + "loss": 0.4407, + "step": 8081 + }, + { + "epoch": 0.63, + "grad_norm": 2.0077199214668977, + "learning_rate": 3.109371357029172e-06, + "loss": 0.4499, + "step": 8082 + }, + { + "epoch": 0.63, + "grad_norm": 1.7427094493350916, + "learning_rate": 3.108194046834081e-06, + "loss": 0.4359, + "step": 8083 + }, + { + "epoch": 0.63, + "grad_norm": 1.7669299610378082, + "learning_rate": 3.107016859036357e-06, + "loss": 0.4728, + "step": 8084 + }, + { + "epoch": 0.63, + "grad_norm": 3.8281721309768395, + "learning_rate": 3.1058397937121694e-06, + "loss": 0.5006, + "step": 8085 + }, + { + "epoch": 0.64, + "grad_norm": 1.4780911207277574, + "learning_rate": 3.1046628509376666e-06, + "loss": 0.4571, + "step": 8086 + }, + { + "epoch": 0.64, + "grad_norm": 1.8557642033012556, + "learning_rate": 3.1034860307889993e-06, + "loss": 0.471, + "step": 8087 + }, + { + "epoch": 0.64, + "grad_norm": 1.5114915537277782, + "learning_rate": 3.1023093333423036e-06, + "loss": 0.4278, + "step": 8088 + }, + { + "epoch": 0.64, + "grad_norm": 1.9119672267915637, + "learning_rate": 3.101132758673712e-06, + "loss": 0.4498, + "step": 8089 + }, + { + "epoch": 0.64, + "grad_norm": 0.5585712901812594, + "learning_rate": 3.0999563068593485e-06, + "loss": 0.4689, + "step": 8090 + }, + { + "epoch": 0.64, + "grad_norm": 1.4099783684387304, + "learning_rate": 3.0987799779753245e-06, + "loss": 0.4387, + "step": 8091 + }, + { + "epoch": 0.64, + "grad_norm": 3.439110120624536, + "learning_rate": 3.0976037720977505e-06, + "loss": 0.4777, + "step": 8092 + }, + { + "epoch": 0.64, + "grad_norm": 1.8079205825377347, + "learning_rate": 3.0964276893027225e-06, + "loss": 0.4575, + "step": 8093 + }, + { + "epoch": 0.64, + "grad_norm": 2.6049746453974447, + "learning_rate": 3.0952517296663332e-06, + "loss": 0.452, + "step": 8094 + }, + { + "epoch": 0.64, + "grad_norm": 2.4190559385822734, + "learning_rate": 3.0940758932646646e-06, + "loss": 0.4526, + "step": 8095 + }, + { + "epoch": 0.64, + "grad_norm": 2.045835880420446, + "learning_rate": 3.0929001801737932e-06, + "loss": 0.4185, + "step": 8096 + }, + { + "epoch": 0.64, + "grad_norm": 2.451319067717711, + "learning_rate": 3.0917245904697857e-06, + "loss": 0.4764, + "step": 8097 + }, + { + "epoch": 0.64, + "grad_norm": 1.5588377528101665, + "learning_rate": 3.0905491242287004e-06, + "loss": 0.4713, + "step": 8098 + }, + { + "epoch": 0.64, + "grad_norm": 1.38529140347997, + "learning_rate": 3.0893737815265878e-06, + "loss": 0.4243, + "step": 8099 + }, + { + "epoch": 0.64, + "grad_norm": 2.017633293929869, + "learning_rate": 3.088198562439493e-06, + "loss": 0.4533, + "step": 8100 + }, + { + "epoch": 0.64, + "grad_norm": 2.114188137347059, + "learning_rate": 3.0870234670434506e-06, + "loss": 0.495, + "step": 8101 + }, + { + "epoch": 0.64, + "grad_norm": 1.8100860322323875, + "learning_rate": 3.0858484954144863e-06, + "loss": 0.4548, + "step": 8102 + }, + { + "epoch": 0.64, + "grad_norm": 2.1960210478501754, + "learning_rate": 3.0846736476286222e-06, + "loss": 0.4426, + "step": 8103 + }, + { + "epoch": 0.64, + "grad_norm": 0.5392042988016463, + "learning_rate": 3.083498923761866e-06, + "loss": 0.4804, + "step": 8104 + }, + { + "epoch": 0.64, + "grad_norm": 0.5374065509737296, + "learning_rate": 3.0823243238902235e-06, + "loss": 0.493, + "step": 8105 + }, + { + "epoch": 0.64, + "grad_norm": 1.5958863729190773, + "learning_rate": 3.0811498480896872e-06, + "loss": 0.4168, + "step": 8106 + }, + { + "epoch": 0.64, + "grad_norm": 2.9059708220234084, + "learning_rate": 3.079975496436247e-06, + "loss": 0.5173, + "step": 8107 + }, + { + "epoch": 0.64, + "grad_norm": 0.557041729571183, + "learning_rate": 3.07880126900588e-06, + "loss": 0.4891, + "step": 8108 + }, + { + "epoch": 0.64, + "grad_norm": 2.287258002531915, + "learning_rate": 3.0776271658745583e-06, + "loss": 0.4891, + "step": 8109 + }, + { + "epoch": 0.64, + "grad_norm": 1.5229634566716586, + "learning_rate": 3.0764531871182422e-06, + "loss": 0.4449, + "step": 8110 + }, + { + "epoch": 0.64, + "grad_norm": 1.4703024805919196, + "learning_rate": 3.0752793328128903e-06, + "loss": 0.4776, + "step": 8111 + }, + { + "epoch": 0.64, + "grad_norm": 1.8095430356990658, + "learning_rate": 3.0741056030344476e-06, + "loss": 0.4814, + "step": 8112 + }, + { + "epoch": 0.64, + "grad_norm": 1.8436207238147895, + "learning_rate": 3.0729319978588505e-06, + "loss": 0.4228, + "step": 8113 + }, + { + "epoch": 0.64, + "grad_norm": 0.5976924978678592, + "learning_rate": 3.0717585173620364e-06, + "loss": 0.4904, + "step": 8114 + }, + { + "epoch": 0.64, + "grad_norm": 1.5322658901538564, + "learning_rate": 3.07058516161992e-06, + "loss": 0.4538, + "step": 8115 + }, + { + "epoch": 0.64, + "grad_norm": 1.5642034487088647, + "learning_rate": 3.0694119307084224e-06, + "loss": 0.5059, + "step": 8116 + }, + { + "epoch": 0.64, + "grad_norm": 2.545348273048814, + "learning_rate": 3.0682388247034455e-06, + "loss": 0.4789, + "step": 8117 + }, + { + "epoch": 0.64, + "grad_norm": 1.5304868884014815, + "learning_rate": 3.0670658436808918e-06, + "loss": 0.4751, + "step": 8118 + }, + { + "epoch": 0.64, + "grad_norm": 1.771957334672217, + "learning_rate": 3.0658929877166464e-06, + "loss": 0.4728, + "step": 8119 + }, + { + "epoch": 0.64, + "grad_norm": 3.089018682062842, + "learning_rate": 3.0647202568865973e-06, + "loss": 0.4896, + "step": 8120 + }, + { + "epoch": 0.64, + "grad_norm": 1.5361638208940827, + "learning_rate": 3.063547651266614e-06, + "loss": 0.4434, + "step": 8121 + }, + { + "epoch": 0.64, + "grad_norm": 1.585971876334789, + "learning_rate": 3.0623751709325655e-06, + "loss": 0.4414, + "step": 8122 + }, + { + "epoch": 0.64, + "grad_norm": 1.5368608510986306, + "learning_rate": 3.061202815960308e-06, + "loss": 0.444, + "step": 8123 + }, + { + "epoch": 0.64, + "grad_norm": 0.5709463145910346, + "learning_rate": 3.0600305864256918e-06, + "loss": 0.4698, + "step": 8124 + }, + { + "epoch": 0.64, + "grad_norm": 1.491771962829028, + "learning_rate": 3.0588584824045607e-06, + "loss": 0.4388, + "step": 8125 + }, + { + "epoch": 0.64, + "grad_norm": 1.9512206471581415, + "learning_rate": 3.0576865039727443e-06, + "loss": 0.5017, + "step": 8126 + }, + { + "epoch": 0.64, + "grad_norm": 2.258454238825651, + "learning_rate": 3.056514651206072e-06, + "loss": 0.4689, + "step": 8127 + }, + { + "epoch": 0.64, + "grad_norm": 1.777205834553369, + "learning_rate": 3.055342924180358e-06, + "loss": 0.4352, + "step": 8128 + }, + { + "epoch": 0.64, + "grad_norm": 1.565830549088371, + "learning_rate": 3.054171322971414e-06, + "loss": 0.4808, + "step": 8129 + }, + { + "epoch": 0.64, + "grad_norm": 2.0270861352963903, + "learning_rate": 3.052999847655039e-06, + "loss": 0.4897, + "step": 8130 + }, + { + "epoch": 0.64, + "grad_norm": 1.503145951232974, + "learning_rate": 3.051828498307028e-06, + "loss": 0.4092, + "step": 8131 + }, + { + "epoch": 0.64, + "grad_norm": 1.7852418683173914, + "learning_rate": 3.0506572750031637e-06, + "loss": 0.4986, + "step": 8132 + }, + { + "epoch": 0.64, + "grad_norm": 1.7238034247339618, + "learning_rate": 3.0494861778192244e-06, + "loss": 0.4939, + "step": 8133 + }, + { + "epoch": 0.64, + "grad_norm": 0.5546890509743488, + "learning_rate": 3.0483152068309757e-06, + "loss": 0.4875, + "step": 8134 + }, + { + "epoch": 0.64, + "grad_norm": 0.5299291633299098, + "learning_rate": 3.047144362114181e-06, + "loss": 0.4628, + "step": 8135 + }, + { + "epoch": 0.64, + "grad_norm": 1.7034223165638394, + "learning_rate": 3.0459736437445923e-06, + "loss": 0.4468, + "step": 8136 + }, + { + "epoch": 0.64, + "grad_norm": 1.6480248461493903, + "learning_rate": 3.0448030517979523e-06, + "loss": 0.4549, + "step": 8137 + }, + { + "epoch": 0.64, + "grad_norm": 2.4255204878703114, + "learning_rate": 3.043632586349997e-06, + "loss": 0.4847, + "step": 8138 + }, + { + "epoch": 0.64, + "grad_norm": 1.889181245140569, + "learning_rate": 3.0424622474764544e-06, + "loss": 0.4336, + "step": 8139 + }, + { + "epoch": 0.64, + "grad_norm": 1.8857978326130735, + "learning_rate": 3.041292035253044e-06, + "loss": 0.4556, + "step": 8140 + }, + { + "epoch": 0.64, + "grad_norm": 1.933557231357053, + "learning_rate": 3.0401219497554757e-06, + "loss": 0.4665, + "step": 8141 + }, + { + "epoch": 0.64, + "grad_norm": 2.194890223121409, + "learning_rate": 3.0389519910594546e-06, + "loss": 0.4635, + "step": 8142 + }, + { + "epoch": 0.64, + "grad_norm": 0.5286822373454189, + "learning_rate": 3.0377821592406727e-06, + "loss": 0.4749, + "step": 8143 + }, + { + "epoch": 0.64, + "grad_norm": 1.6087002311691982, + "learning_rate": 3.03661245437482e-06, + "loss": 0.4766, + "step": 8144 + }, + { + "epoch": 0.64, + "grad_norm": 0.5570596661068346, + "learning_rate": 3.035442876537571e-06, + "loss": 0.4906, + "step": 8145 + }, + { + "epoch": 0.64, + "grad_norm": 1.8020194337526472, + "learning_rate": 3.0342734258045987e-06, + "loss": 0.499, + "step": 8146 + }, + { + "epoch": 0.64, + "grad_norm": 1.7282402057997694, + "learning_rate": 3.033104102251565e-06, + "loss": 0.4557, + "step": 8147 + }, + { + "epoch": 0.64, + "grad_norm": 1.2994123144187069, + "learning_rate": 3.031934905954121e-06, + "loss": 0.4468, + "step": 8148 + }, + { + "epoch": 0.64, + "grad_norm": 6.192736125801393, + "learning_rate": 3.0307658369879166e-06, + "loss": 0.4718, + "step": 8149 + }, + { + "epoch": 0.64, + "grad_norm": 2.055779567573067, + "learning_rate": 3.0295968954285843e-06, + "loss": 0.5116, + "step": 8150 + }, + { + "epoch": 0.64, + "grad_norm": 3.4246759953451695, + "learning_rate": 3.0284280813517573e-06, + "loss": 0.4248, + "step": 8151 + }, + { + "epoch": 0.64, + "grad_norm": 1.7816028714276833, + "learning_rate": 3.027259394833052e-06, + "loss": 0.4964, + "step": 8152 + }, + { + "epoch": 0.64, + "grad_norm": 1.3141912048608082, + "learning_rate": 3.026090835948086e-06, + "loss": 0.4142, + "step": 8153 + }, + { + "epoch": 0.64, + "grad_norm": 0.5367527116014242, + "learning_rate": 3.024922404772458e-06, + "loss": 0.5168, + "step": 8154 + }, + { + "epoch": 0.64, + "grad_norm": 1.5601912569698757, + "learning_rate": 3.0237541013817696e-06, + "loss": 0.4537, + "step": 8155 + }, + { + "epoch": 0.64, + "grad_norm": 1.9212808418463638, + "learning_rate": 3.022585925851602e-06, + "loss": 0.5344, + "step": 8156 + }, + { + "epoch": 0.64, + "grad_norm": 1.9249692455907497, + "learning_rate": 3.0214178782575398e-06, + "loss": 0.4792, + "step": 8157 + }, + { + "epoch": 0.64, + "grad_norm": 0.5553376168459427, + "learning_rate": 3.020249958675153e-06, + "loss": 0.4919, + "step": 8158 + }, + { + "epoch": 0.64, + "grad_norm": 0.5754873056269578, + "learning_rate": 3.0190821671800036e-06, + "loss": 0.483, + "step": 8159 + }, + { + "epoch": 0.64, + "grad_norm": 1.9695789467971254, + "learning_rate": 3.0179145038476476e-06, + "loss": 0.4505, + "step": 8160 + }, + { + "epoch": 0.64, + "grad_norm": 1.7924532510880093, + "learning_rate": 3.016746968753629e-06, + "loss": 0.4163, + "step": 8161 + }, + { + "epoch": 0.64, + "grad_norm": 1.8949640970240083, + "learning_rate": 3.0155795619734885e-06, + "loss": 0.4256, + "step": 8162 + }, + { + "epoch": 0.64, + "grad_norm": 8.498561765607665, + "learning_rate": 3.014412283582753e-06, + "loss": 0.45, + "step": 8163 + }, + { + "epoch": 0.64, + "grad_norm": 2.840557414291357, + "learning_rate": 3.0132451336569456e-06, + "loss": 0.4796, + "step": 8164 + }, + { + "epoch": 0.64, + "grad_norm": 1.3982628845738012, + "learning_rate": 3.0120781122715792e-06, + "loss": 0.4468, + "step": 8165 + }, + { + "epoch": 0.64, + "grad_norm": 1.411915879279902, + "learning_rate": 3.0109112195021597e-06, + "loss": 0.4517, + "step": 8166 + }, + { + "epoch": 0.64, + "grad_norm": 1.7663590898288921, + "learning_rate": 3.00974445542418e-06, + "loss": 0.4754, + "step": 8167 + }, + { + "epoch": 0.64, + "grad_norm": 4.5685165903965315, + "learning_rate": 3.00857782011313e-06, + "loss": 0.4122, + "step": 8168 + }, + { + "epoch": 0.64, + "grad_norm": 1.8319429357519368, + "learning_rate": 3.0074113136444926e-06, + "loss": 0.3956, + "step": 8169 + }, + { + "epoch": 0.64, + "grad_norm": 1.7582723100033337, + "learning_rate": 3.006244936093734e-06, + "loss": 0.4117, + "step": 8170 + }, + { + "epoch": 0.64, + "grad_norm": 0.5314806916210415, + "learning_rate": 3.0050786875363223e-06, + "loss": 0.4934, + "step": 8171 + }, + { + "epoch": 0.64, + "grad_norm": 2.2160980175623233, + "learning_rate": 3.0039125680477083e-06, + "loss": 0.4287, + "step": 8172 + }, + { + "epoch": 0.64, + "grad_norm": 0.5365521514075609, + "learning_rate": 3.0027465777033405e-06, + "loss": 0.4623, + "step": 8173 + }, + { + "epoch": 0.64, + "grad_norm": 3.4155931593043927, + "learning_rate": 3.0015807165786558e-06, + "loss": 0.4939, + "step": 8174 + }, + { + "epoch": 0.64, + "grad_norm": 1.4570468422069045, + "learning_rate": 3.000414984749086e-06, + "loss": 0.4386, + "step": 8175 + }, + { + "epoch": 0.64, + "grad_norm": 2.0362376372710047, + "learning_rate": 2.9992493822900493e-06, + "loss": 0.4244, + "step": 8176 + }, + { + "epoch": 0.64, + "grad_norm": 0.5268890266341643, + "learning_rate": 2.9980839092769617e-06, + "loss": 0.4952, + "step": 8177 + }, + { + "epoch": 0.64, + "grad_norm": 1.8424093816996632, + "learning_rate": 2.996918565785225e-06, + "loss": 0.4433, + "step": 8178 + }, + { + "epoch": 0.64, + "grad_norm": 1.5345731668737623, + "learning_rate": 2.9957533518902376e-06, + "loss": 0.3957, + "step": 8179 + }, + { + "epoch": 0.64, + "grad_norm": 1.7419438850059386, + "learning_rate": 2.9945882676673853e-06, + "loss": 0.4809, + "step": 8180 + }, + { + "epoch": 0.64, + "grad_norm": 1.837067050587271, + "learning_rate": 2.9934233131920492e-06, + "loss": 0.4431, + "step": 8181 + }, + { + "epoch": 0.64, + "grad_norm": 1.865787109757726, + "learning_rate": 2.992258488539601e-06, + "loss": 0.4587, + "step": 8182 + }, + { + "epoch": 0.64, + "grad_norm": 1.5083553290531755, + "learning_rate": 2.9910937937853997e-06, + "loss": 0.4388, + "step": 8183 + }, + { + "epoch": 0.64, + "grad_norm": 1.7808964105985536, + "learning_rate": 2.9899292290048044e-06, + "loss": 0.501, + "step": 8184 + }, + { + "epoch": 0.64, + "grad_norm": 1.73766643203524, + "learning_rate": 2.9887647942731563e-06, + "loss": 0.4108, + "step": 8185 + }, + { + "epoch": 0.64, + "grad_norm": 2.8654576279704345, + "learning_rate": 2.987600489665798e-06, + "loss": 0.4561, + "step": 8186 + }, + { + "epoch": 0.64, + "grad_norm": 1.986497821393684, + "learning_rate": 2.9864363152580524e-06, + "loss": 0.4547, + "step": 8187 + }, + { + "epoch": 0.64, + "grad_norm": 1.9048572442369114, + "learning_rate": 2.9852722711252456e-06, + "loss": 0.466, + "step": 8188 + }, + { + "epoch": 0.64, + "grad_norm": 0.575208243017937, + "learning_rate": 2.984108357342684e-06, + "loss": 0.4819, + "step": 8189 + }, + { + "epoch": 0.64, + "grad_norm": 2.2347412976706735, + "learning_rate": 2.9829445739856788e-06, + "loss": 0.4964, + "step": 8190 + }, + { + "epoch": 0.64, + "grad_norm": 2.598314507428206, + "learning_rate": 2.9817809211295174e-06, + "loss": 0.4724, + "step": 8191 + }, + { + "epoch": 0.64, + "grad_norm": 1.9845552463680445, + "learning_rate": 2.980617398849491e-06, + "loss": 0.4614, + "step": 8192 + }, + { + "epoch": 0.64, + "grad_norm": 1.7933392943722042, + "learning_rate": 2.979454007220879e-06, + "loss": 0.4422, + "step": 8193 + }, + { + "epoch": 0.64, + "grad_norm": 1.6753873903146643, + "learning_rate": 2.978290746318948e-06, + "loss": 0.4615, + "step": 8194 + }, + { + "epoch": 0.64, + "grad_norm": 1.9328252913577677, + "learning_rate": 2.9771276162189615e-06, + "loss": 0.4153, + "step": 8195 + }, + { + "epoch": 0.64, + "grad_norm": 1.7414439884538382, + "learning_rate": 2.975964616996172e-06, + "loss": 0.4571, + "step": 8196 + }, + { + "epoch": 0.64, + "grad_norm": 0.5633142584691037, + "learning_rate": 2.974801748725824e-06, + "loss": 0.4559, + "step": 8197 + }, + { + "epoch": 0.64, + "grad_norm": 1.641298584323138, + "learning_rate": 2.973639011483153e-06, + "loss": 0.5123, + "step": 8198 + }, + { + "epoch": 0.64, + "grad_norm": 1.4178594427642879, + "learning_rate": 2.9724764053433886e-06, + "loss": 0.4106, + "step": 8199 + }, + { + "epoch": 0.64, + "grad_norm": 2.5018990867862554, + "learning_rate": 2.971313930381747e-06, + "loss": 0.4464, + "step": 8200 + }, + { + "epoch": 0.64, + "grad_norm": 2.003077663117057, + "learning_rate": 2.970151586673441e-06, + "loss": 0.4763, + "step": 8201 + }, + { + "epoch": 0.64, + "grad_norm": 2.0523952963003933, + "learning_rate": 2.9689893742936715e-06, + "loss": 0.4352, + "step": 8202 + }, + { + "epoch": 0.64, + "grad_norm": 1.9556635359587315, + "learning_rate": 2.967827293317631e-06, + "loss": 0.4427, + "step": 8203 + }, + { + "epoch": 0.64, + "grad_norm": 7.1189133381954335, + "learning_rate": 2.9666653438205095e-06, + "loss": 0.4116, + "step": 8204 + }, + { + "epoch": 0.64, + "grad_norm": 1.72891559671769, + "learning_rate": 2.9655035258774772e-06, + "loss": 0.4634, + "step": 8205 + }, + { + "epoch": 0.64, + "grad_norm": 1.9393539305785878, + "learning_rate": 2.9643418395637076e-06, + "loss": 0.4805, + "step": 8206 + }, + { + "epoch": 0.64, + "grad_norm": 1.8869886549698118, + "learning_rate": 2.9631802849543566e-06, + "loss": 0.4627, + "step": 8207 + }, + { + "epoch": 0.64, + "grad_norm": 1.5599968952526517, + "learning_rate": 2.9620188621245784e-06, + "loss": 0.4699, + "step": 8208 + }, + { + "epoch": 0.64, + "grad_norm": 1.6625822354179296, + "learning_rate": 2.9608575711495126e-06, + "loss": 0.4827, + "step": 8209 + }, + { + "epoch": 0.64, + "grad_norm": 1.6679442508616296, + "learning_rate": 2.9596964121042947e-06, + "loss": 0.4738, + "step": 8210 + }, + { + "epoch": 0.64, + "grad_norm": 1.666576270733875, + "learning_rate": 2.9585353850640493e-06, + "loss": 0.4201, + "step": 8211 + }, + { + "epoch": 0.64, + "grad_norm": 1.896265582095068, + "learning_rate": 2.9573744901038948e-06, + "loss": 0.4471, + "step": 8212 + }, + { + "epoch": 0.65, + "grad_norm": 0.5792897886569046, + "learning_rate": 2.956213727298938e-06, + "loss": 0.5022, + "step": 8213 + }, + { + "epoch": 0.65, + "grad_norm": 1.7002635747453678, + "learning_rate": 2.9550530967242792e-06, + "loss": 0.4753, + "step": 8214 + }, + { + "epoch": 0.65, + "grad_norm": 1.500796774413423, + "learning_rate": 2.953892598455011e-06, + "loss": 0.4934, + "step": 8215 + }, + { + "epoch": 0.65, + "grad_norm": 1.598761322657962, + "learning_rate": 2.9527322325662137e-06, + "loss": 0.4485, + "step": 8216 + }, + { + "epoch": 0.65, + "grad_norm": 1.9065096359666602, + "learning_rate": 2.9515719991329643e-06, + "loss": 0.4833, + "step": 8217 + }, + { + "epoch": 0.65, + "grad_norm": 1.3686941393225618, + "learning_rate": 2.9504118982303247e-06, + "loss": 0.413, + "step": 8218 + }, + { + "epoch": 0.65, + "grad_norm": 1.3139040235088357, + "learning_rate": 2.9492519299333573e-06, + "loss": 0.4613, + "step": 8219 + }, + { + "epoch": 0.65, + "grad_norm": 1.6044871145105861, + "learning_rate": 2.9480920943171043e-06, + "loss": 0.4376, + "step": 8220 + }, + { + "epoch": 0.65, + "grad_norm": 1.2663822308648935, + "learning_rate": 2.9469323914566113e-06, + "loss": 0.3854, + "step": 8221 + }, + { + "epoch": 0.65, + "grad_norm": 1.520256989220859, + "learning_rate": 2.945772821426903e-06, + "loss": 0.4859, + "step": 8222 + }, + { + "epoch": 0.65, + "grad_norm": 1.5619964669443505, + "learning_rate": 2.94461338430301e-06, + "loss": 0.4116, + "step": 8223 + }, + { + "epoch": 0.65, + "grad_norm": 0.5446876132376467, + "learning_rate": 2.94345408015994e-06, + "loss": 0.4664, + "step": 8224 + }, + { + "epoch": 0.65, + "grad_norm": 0.5321443175789121, + "learning_rate": 2.9422949090727002e-06, + "loss": 0.4574, + "step": 8225 + }, + { + "epoch": 0.65, + "grad_norm": 0.5854540470994614, + "learning_rate": 2.9411358711162895e-06, + "loss": 0.4557, + "step": 8226 + }, + { + "epoch": 0.65, + "grad_norm": 0.5673594195810486, + "learning_rate": 2.9399769663656937e-06, + "loss": 0.4754, + "step": 8227 + }, + { + "epoch": 0.65, + "grad_norm": 1.6969962384126904, + "learning_rate": 2.938818194895894e-06, + "loss": 0.4986, + "step": 8228 + }, + { + "epoch": 0.65, + "grad_norm": 1.9260766414192725, + "learning_rate": 2.9376595567818598e-06, + "loss": 0.4723, + "step": 8229 + }, + { + "epoch": 0.65, + "grad_norm": 0.5517345577366306, + "learning_rate": 2.936501052098556e-06, + "loss": 0.459, + "step": 8230 + }, + { + "epoch": 0.65, + "grad_norm": 1.796896044021153, + "learning_rate": 2.935342680920933e-06, + "loss": 0.4982, + "step": 8231 + }, + { + "epoch": 0.65, + "grad_norm": 1.9217778951487259, + "learning_rate": 2.934184443323939e-06, + "loss": 0.4424, + "step": 8232 + }, + { + "epoch": 0.65, + "grad_norm": 1.5820370672495068, + "learning_rate": 2.9330263393825083e-06, + "loss": 0.4529, + "step": 8233 + }, + { + "epoch": 0.65, + "grad_norm": 1.4939031886591057, + "learning_rate": 2.93186836917157e-06, + "loss": 0.4198, + "step": 8234 + }, + { + "epoch": 0.65, + "grad_norm": 1.6538945168280013, + "learning_rate": 2.9307105327660424e-06, + "loss": 0.4836, + "step": 8235 + }, + { + "epoch": 0.65, + "grad_norm": 1.6010270670961608, + "learning_rate": 2.9295528302408344e-06, + "loss": 0.4439, + "step": 8236 + }, + { + "epoch": 0.65, + "grad_norm": 2.090542247128029, + "learning_rate": 2.9283952616708533e-06, + "loss": 0.467, + "step": 8237 + }, + { + "epoch": 0.65, + "grad_norm": 1.649877439268315, + "learning_rate": 2.927237827130986e-06, + "loss": 0.4123, + "step": 8238 + }, + { + "epoch": 0.65, + "grad_norm": 0.6017103665985941, + "learning_rate": 2.9260805266961223e-06, + "loss": 0.4958, + "step": 8239 + }, + { + "epoch": 0.65, + "grad_norm": 1.4772572546893066, + "learning_rate": 2.9249233604411333e-06, + "loss": 0.4387, + "step": 8240 + }, + { + "epoch": 0.65, + "grad_norm": 1.7097143592457535, + "learning_rate": 2.9237663284408897e-06, + "loss": 0.4016, + "step": 8241 + }, + { + "epoch": 0.65, + "grad_norm": 1.8169986595543601, + "learning_rate": 2.9226094307702476e-06, + "loss": 0.4432, + "step": 8242 + }, + { + "epoch": 0.65, + "grad_norm": 1.455766885992405, + "learning_rate": 2.921452667504059e-06, + "loss": 0.4199, + "step": 8243 + }, + { + "epoch": 0.65, + "grad_norm": 1.7458486229120573, + "learning_rate": 2.920296038717162e-06, + "loss": 0.4566, + "step": 8244 + }, + { + "epoch": 0.65, + "grad_norm": 0.5430650062282202, + "learning_rate": 2.9191395444843927e-06, + "loss": 0.4966, + "step": 8245 + }, + { + "epoch": 0.65, + "grad_norm": 1.8621342077114018, + "learning_rate": 2.917983184880571e-06, + "loss": 0.5137, + "step": 8246 + }, + { + "epoch": 0.65, + "grad_norm": 3.5263195270427516, + "learning_rate": 2.916826959980514e-06, + "loss": 0.4929, + "step": 8247 + }, + { + "epoch": 0.65, + "grad_norm": 1.7143124195598418, + "learning_rate": 2.9156708698590273e-06, + "loss": 0.396, + "step": 8248 + }, + { + "epoch": 0.65, + "grad_norm": 1.6512225744344413, + "learning_rate": 2.914514914590909e-06, + "loss": 0.5167, + "step": 8249 + }, + { + "epoch": 0.65, + "grad_norm": 4.046354989438265, + "learning_rate": 2.91335909425095e-06, + "loss": 0.5126, + "step": 8250 + }, + { + "epoch": 0.65, + "grad_norm": 1.8018917333452855, + "learning_rate": 2.912203408913925e-06, + "loss": 0.4447, + "step": 8251 + }, + { + "epoch": 0.65, + "grad_norm": 3.0472325603823958, + "learning_rate": 2.9110478586546086e-06, + "loss": 0.4253, + "step": 8252 + }, + { + "epoch": 0.65, + "grad_norm": 1.7327233528669361, + "learning_rate": 2.9098924435477636e-06, + "loss": 0.4946, + "step": 8253 + }, + { + "epoch": 0.65, + "grad_norm": 1.980114705290418, + "learning_rate": 2.908737163668145e-06, + "loss": 0.5031, + "step": 8254 + }, + { + "epoch": 0.65, + "grad_norm": 0.5340968871623029, + "learning_rate": 2.9075820190904936e-06, + "loss": 0.5056, + "step": 8255 + }, + { + "epoch": 0.65, + "grad_norm": 2.208119107141144, + "learning_rate": 2.9064270098895488e-06, + "loss": 0.4189, + "step": 8256 + }, + { + "epoch": 0.65, + "grad_norm": 1.639106613865686, + "learning_rate": 2.9052721361400375e-06, + "loss": 0.4234, + "step": 8257 + }, + { + "epoch": 0.65, + "grad_norm": 1.7769743000105889, + "learning_rate": 2.9041173979166813e-06, + "loss": 0.4568, + "step": 8258 + }, + { + "epoch": 0.65, + "grad_norm": 5.5079654871055235, + "learning_rate": 2.9029627952941854e-06, + "loss": 0.4062, + "step": 8259 + }, + { + "epoch": 0.65, + "grad_norm": 1.628065600571878, + "learning_rate": 2.901808328347251e-06, + "loss": 0.4684, + "step": 8260 + }, + { + "epoch": 0.65, + "grad_norm": 0.5794870164140321, + "learning_rate": 2.900653997150578e-06, + "loss": 0.4794, + "step": 8261 + }, + { + "epoch": 0.65, + "grad_norm": 1.8121446360000792, + "learning_rate": 2.8994998017788417e-06, + "loss": 0.482, + "step": 8262 + }, + { + "epoch": 0.65, + "grad_norm": 1.4823476219949823, + "learning_rate": 2.898345742306723e-06, + "loss": 0.3933, + "step": 8263 + }, + { + "epoch": 0.65, + "grad_norm": 3.0361918507623584, + "learning_rate": 2.8971918188088815e-06, + "loss": 0.5002, + "step": 8264 + }, + { + "epoch": 0.65, + "grad_norm": 2.0558414802069973, + "learning_rate": 2.896038031359982e-06, + "loss": 0.466, + "step": 8265 + }, + { + "epoch": 0.65, + "grad_norm": 2.1774481971590314, + "learning_rate": 2.8948843800346687e-06, + "loss": 0.351, + "step": 8266 + }, + { + "epoch": 0.65, + "grad_norm": 1.5087912390487856, + "learning_rate": 2.8937308649075836e-06, + "loss": 0.4186, + "step": 8267 + }, + { + "epoch": 0.65, + "grad_norm": 0.5652916642586763, + "learning_rate": 2.8925774860533524e-06, + "loss": 0.4465, + "step": 8268 + }, + { + "epoch": 0.65, + "grad_norm": 0.5489859745171022, + "learning_rate": 2.8914242435466043e-06, + "loss": 0.4959, + "step": 8269 + }, + { + "epoch": 0.65, + "grad_norm": 1.5001235535866595, + "learning_rate": 2.890271137461948e-06, + "loss": 0.4462, + "step": 8270 + }, + { + "epoch": 0.65, + "grad_norm": 2.343075215775687, + "learning_rate": 2.889118167873989e-06, + "loss": 0.4689, + "step": 8271 + }, + { + "epoch": 0.65, + "grad_norm": 1.9397795988072182, + "learning_rate": 2.8879653348573235e-06, + "loss": 0.4968, + "step": 8272 + }, + { + "epoch": 0.65, + "grad_norm": 1.6615440625796039, + "learning_rate": 2.886812638486537e-06, + "loss": 0.4679, + "step": 8273 + }, + { + "epoch": 0.65, + "grad_norm": 1.518529275538549, + "learning_rate": 2.885660078836211e-06, + "loss": 0.455, + "step": 8274 + }, + { + "epoch": 0.65, + "grad_norm": 2.1638509373764467, + "learning_rate": 2.8845076559809103e-06, + "loss": 0.4403, + "step": 8275 + }, + { + "epoch": 0.65, + "grad_norm": 0.5621496897834805, + "learning_rate": 2.8833553699951968e-06, + "loss": 0.4857, + "step": 8276 + }, + { + "epoch": 0.65, + "grad_norm": 1.3736689648682092, + "learning_rate": 2.882203220953621e-06, + "loss": 0.4315, + "step": 8277 + }, + { + "epoch": 0.65, + "grad_norm": 1.6886079423347138, + "learning_rate": 2.881051208930729e-06, + "loss": 0.4527, + "step": 8278 + }, + { + "epoch": 0.65, + "grad_norm": 2.3669679589894677, + "learning_rate": 2.87989933400105e-06, + "loss": 0.4652, + "step": 8279 + }, + { + "epoch": 0.65, + "grad_norm": 0.5709648877703546, + "learning_rate": 2.878747596239111e-06, + "loss": 0.4909, + "step": 8280 + }, + { + "epoch": 0.65, + "grad_norm": 1.702603130119065, + "learning_rate": 2.8775959957194268e-06, + "loss": 0.421, + "step": 8281 + }, + { + "epoch": 0.65, + "grad_norm": 0.5842850027379044, + "learning_rate": 2.8764445325165057e-06, + "loss": 0.4882, + "step": 8282 + }, + { + "epoch": 0.65, + "grad_norm": 1.5246457164700038, + "learning_rate": 2.875293206704845e-06, + "loss": 0.4402, + "step": 8283 + }, + { + "epoch": 0.65, + "grad_norm": 1.8083703239667417, + "learning_rate": 2.8741420183589348e-06, + "loss": 0.415, + "step": 8284 + }, + { + "epoch": 0.65, + "grad_norm": 1.676170385760368, + "learning_rate": 2.8729909675532563e-06, + "loss": 0.4529, + "step": 8285 + }, + { + "epoch": 0.65, + "grad_norm": 1.747353601678995, + "learning_rate": 2.871840054362278e-06, + "loss": 0.4205, + "step": 8286 + }, + { + "epoch": 0.65, + "grad_norm": 1.7839235546008447, + "learning_rate": 2.8706892788604645e-06, + "loss": 0.453, + "step": 8287 + }, + { + "epoch": 0.65, + "grad_norm": 1.7023344674913623, + "learning_rate": 2.8695386411222693e-06, + "loss": 0.5045, + "step": 8288 + }, + { + "epoch": 0.65, + "grad_norm": 3.2861785813172517, + "learning_rate": 2.8683881412221395e-06, + "loss": 0.4791, + "step": 8289 + }, + { + "epoch": 0.65, + "grad_norm": 1.211967090717653, + "learning_rate": 2.867237779234505e-06, + "loss": 0.3838, + "step": 8290 + }, + { + "epoch": 0.65, + "grad_norm": 1.865368609701087, + "learning_rate": 2.8660875552337974e-06, + "loss": 0.4767, + "step": 8291 + }, + { + "epoch": 0.65, + "grad_norm": 2.0595050006827917, + "learning_rate": 2.864937469294433e-06, + "loss": 0.4727, + "step": 8292 + }, + { + "epoch": 0.65, + "grad_norm": 1.8753089932077254, + "learning_rate": 2.8637875214908217e-06, + "loss": 0.4752, + "step": 8293 + }, + { + "epoch": 0.65, + "grad_norm": 0.573684085456631, + "learning_rate": 2.8626377118973645e-06, + "loss": 0.4867, + "step": 8294 + }, + { + "epoch": 0.65, + "grad_norm": 1.8250185123723592, + "learning_rate": 2.861488040588448e-06, + "loss": 0.4569, + "step": 8295 + }, + { + "epoch": 0.65, + "grad_norm": 3.0217553130530463, + "learning_rate": 2.8603385076384617e-06, + "loss": 0.4523, + "step": 8296 + }, + { + "epoch": 0.65, + "grad_norm": 0.5614057325078737, + "learning_rate": 2.859189113121772e-06, + "loss": 0.4893, + "step": 8297 + }, + { + "epoch": 0.65, + "grad_norm": 1.8505969155762023, + "learning_rate": 2.8580398571127486e-06, + "loss": 0.3796, + "step": 8298 + }, + { + "epoch": 0.65, + "grad_norm": 1.5173714784496488, + "learning_rate": 2.8568907396857415e-06, + "loss": 0.5161, + "step": 8299 + }, + { + "epoch": 0.65, + "grad_norm": 0.5475002076918463, + "learning_rate": 2.855741760915103e-06, + "loss": 0.501, + "step": 8300 + }, + { + "epoch": 0.65, + "grad_norm": 2.1594465900801243, + "learning_rate": 2.8545929208751653e-06, + "loss": 0.4234, + "step": 8301 + }, + { + "epoch": 0.65, + "grad_norm": 1.8103977212711726, + "learning_rate": 2.8534442196402617e-06, + "loss": 0.4662, + "step": 8302 + }, + { + "epoch": 0.65, + "grad_norm": 2.521399554962547, + "learning_rate": 2.8522956572847047e-06, + "loss": 0.4429, + "step": 8303 + }, + { + "epoch": 0.65, + "grad_norm": 1.4632441268805159, + "learning_rate": 2.851147233882812e-06, + "loss": 0.436, + "step": 8304 + }, + { + "epoch": 0.65, + "grad_norm": 1.7034870054871227, + "learning_rate": 2.849998949508883e-06, + "loss": 0.5054, + "step": 8305 + }, + { + "epoch": 0.65, + "grad_norm": 1.9543803211238833, + "learning_rate": 2.8488508042372076e-06, + "loss": 0.4428, + "step": 8306 + }, + { + "epoch": 0.65, + "grad_norm": 1.7372610364514034, + "learning_rate": 2.847702798142071e-06, + "loss": 0.3922, + "step": 8307 + }, + { + "epoch": 0.65, + "grad_norm": 2.0050480757573963, + "learning_rate": 2.8465549312977487e-06, + "loss": 0.4517, + "step": 8308 + }, + { + "epoch": 0.65, + "grad_norm": 1.9251687330436067, + "learning_rate": 2.845407203778506e-06, + "loss": 0.4397, + "step": 8309 + }, + { + "epoch": 0.65, + "grad_norm": 1.541528873678368, + "learning_rate": 2.844259615658598e-06, + "loss": 0.4849, + "step": 8310 + }, + { + "epoch": 0.65, + "grad_norm": 1.7542281992065132, + "learning_rate": 2.8431121670122724e-06, + "loss": 0.4707, + "step": 8311 + }, + { + "epoch": 0.65, + "grad_norm": 1.3925116782037286, + "learning_rate": 2.8419648579137677e-06, + "loss": 0.4252, + "step": 8312 + }, + { + "epoch": 0.65, + "grad_norm": 1.6915365684697257, + "learning_rate": 2.840817688437316e-06, + "loss": 0.3936, + "step": 8313 + }, + { + "epoch": 0.65, + "grad_norm": 1.7636968621207436, + "learning_rate": 2.839670658657133e-06, + "loss": 0.4472, + "step": 8314 + }, + { + "epoch": 0.65, + "grad_norm": 1.5575800331889267, + "learning_rate": 2.8385237686474303e-06, + "loss": 0.4699, + "step": 8315 + }, + { + "epoch": 0.65, + "grad_norm": 1.4642466297829557, + "learning_rate": 2.837377018482416e-06, + "loss": 0.4769, + "step": 8316 + }, + { + "epoch": 0.65, + "grad_norm": 0.5504902964577563, + "learning_rate": 2.836230408236278e-06, + "loss": 0.4717, + "step": 8317 + }, + { + "epoch": 0.65, + "grad_norm": 2.4868786209281994, + "learning_rate": 2.835083937983202e-06, + "loss": 0.4724, + "step": 8318 + }, + { + "epoch": 0.65, + "grad_norm": 1.466271145164999, + "learning_rate": 2.8339376077973628e-06, + "loss": 0.4861, + "step": 8319 + }, + { + "epoch": 0.65, + "grad_norm": 0.598786724765163, + "learning_rate": 2.832791417752929e-06, + "loss": 0.4953, + "step": 8320 + }, + { + "epoch": 0.65, + "grad_norm": 1.493471439789418, + "learning_rate": 2.8316453679240537e-06, + "loss": 0.4603, + "step": 8321 + }, + { + "epoch": 0.65, + "grad_norm": 2.8763989217535593, + "learning_rate": 2.830499458384886e-06, + "loss": 0.4989, + "step": 8322 + }, + { + "epoch": 0.65, + "grad_norm": 1.2399243465219496, + "learning_rate": 2.8293536892095653e-06, + "loss": 0.4023, + "step": 8323 + }, + { + "epoch": 0.65, + "grad_norm": 3.035756458421096, + "learning_rate": 2.828208060472224e-06, + "loss": 0.5114, + "step": 8324 + }, + { + "epoch": 0.65, + "grad_norm": 2.2155455974149905, + "learning_rate": 2.8270625722469776e-06, + "loss": 0.4782, + "step": 8325 + }, + { + "epoch": 0.65, + "grad_norm": 1.628003121768669, + "learning_rate": 2.8259172246079402e-06, + "loss": 0.4042, + "step": 8326 + }, + { + "epoch": 0.65, + "grad_norm": 0.5870712803585254, + "learning_rate": 2.8247720176292147e-06, + "loss": 0.461, + "step": 8327 + }, + { + "epoch": 0.65, + "grad_norm": 1.3971352518875195, + "learning_rate": 2.8236269513848935e-06, + "loss": 0.4305, + "step": 8328 + }, + { + "epoch": 0.65, + "grad_norm": 1.6430939273350604, + "learning_rate": 2.8224820259490636e-06, + "loss": 0.4626, + "step": 8329 + }, + { + "epoch": 0.65, + "grad_norm": 1.7146304547572568, + "learning_rate": 2.8213372413957947e-06, + "loss": 0.456, + "step": 8330 + }, + { + "epoch": 0.65, + "grad_norm": 1.9832411482587164, + "learning_rate": 2.8201925977991598e-06, + "loss": 0.477, + "step": 8331 + }, + { + "epoch": 0.65, + "grad_norm": 1.8731376214478976, + "learning_rate": 2.81904809523321e-06, + "loss": 0.4769, + "step": 8332 + }, + { + "epoch": 0.65, + "grad_norm": 2.2825281635178962, + "learning_rate": 2.8179037337719977e-06, + "loss": 0.4094, + "step": 8333 + }, + { + "epoch": 0.65, + "grad_norm": 1.8869194039794646, + "learning_rate": 2.8167595134895553e-06, + "loss": 0.485, + "step": 8334 + }, + { + "epoch": 0.65, + "grad_norm": 1.9438768694801347, + "learning_rate": 2.8156154344599207e-06, + "loss": 0.4637, + "step": 8335 + }, + { + "epoch": 0.65, + "grad_norm": 1.7127709198925196, + "learning_rate": 2.8144714967571075e-06, + "loss": 0.3932, + "step": 8336 + }, + { + "epoch": 0.65, + "grad_norm": 1.4615523343789412, + "learning_rate": 2.813327700455131e-06, + "loss": 0.4314, + "step": 8337 + }, + { + "epoch": 0.65, + "grad_norm": 0.565371734441393, + "learning_rate": 2.812184045627988e-06, + "loss": 0.4683, + "step": 8338 + }, + { + "epoch": 0.65, + "grad_norm": 1.7947520764056022, + "learning_rate": 2.811040532349678e-06, + "loss": 0.4466, + "step": 8339 + }, + { + "epoch": 0.65, + "grad_norm": 1.5716927810210861, + "learning_rate": 2.809897160694184e-06, + "loss": 0.4829, + "step": 8340 + }, + { + "epoch": 0.66, + "grad_norm": 1.761223595511178, + "learning_rate": 2.8087539307354754e-06, + "loss": 0.4292, + "step": 8341 + }, + { + "epoch": 0.66, + "grad_norm": 3.2627338861524247, + "learning_rate": 2.8076108425475223e-06, + "loss": 0.444, + "step": 8342 + }, + { + "epoch": 0.66, + "grad_norm": 1.4287526818049328, + "learning_rate": 2.806467896204279e-06, + "loss": 0.4552, + "step": 8343 + }, + { + "epoch": 0.66, + "grad_norm": 2.267878602824153, + "learning_rate": 2.8053250917796952e-06, + "loss": 0.4788, + "step": 8344 + }, + { + "epoch": 0.66, + "grad_norm": 2.8483628461080137, + "learning_rate": 2.8041824293477056e-06, + "loss": 0.4542, + "step": 8345 + }, + { + "epoch": 0.66, + "grad_norm": 1.6467845467099607, + "learning_rate": 2.8030399089822398e-06, + "loss": 0.4717, + "step": 8346 + }, + { + "epoch": 0.66, + "grad_norm": 2.6172119119763586, + "learning_rate": 2.801897530757218e-06, + "loss": 0.4363, + "step": 8347 + }, + { + "epoch": 0.66, + "grad_norm": 1.404458643547706, + "learning_rate": 2.800755294746552e-06, + "loss": 0.4321, + "step": 8348 + }, + { + "epoch": 0.66, + "grad_norm": 1.9416211053333847, + "learning_rate": 2.7996132010241406e-06, + "loss": 0.4185, + "step": 8349 + }, + { + "epoch": 0.66, + "grad_norm": 1.709488016779424, + "learning_rate": 2.7984712496638737e-06, + "loss": 0.3935, + "step": 8350 + }, + { + "epoch": 0.66, + "grad_norm": 3.800367358551493, + "learning_rate": 2.797329440739641e-06, + "loss": 0.5001, + "step": 8351 + }, + { + "epoch": 0.66, + "grad_norm": 0.5586600921127364, + "learning_rate": 2.7961877743253105e-06, + "loss": 0.4761, + "step": 8352 + }, + { + "epoch": 0.66, + "grad_norm": 1.9689953269674476, + "learning_rate": 2.7950462504947483e-06, + "loss": 0.4774, + "step": 8353 + }, + { + "epoch": 0.66, + "grad_norm": 1.546762584221291, + "learning_rate": 2.7939048693218084e-06, + "loss": 0.4389, + "step": 8354 + }, + { + "epoch": 0.66, + "grad_norm": 0.5685714991573594, + "learning_rate": 2.7927636308803406e-06, + "loss": 0.4797, + "step": 8355 + }, + { + "epoch": 0.66, + "grad_norm": 1.5265245492614934, + "learning_rate": 2.7916225352441757e-06, + "loss": 0.438, + "step": 8356 + }, + { + "epoch": 0.66, + "grad_norm": 1.5665146606972222, + "learning_rate": 2.790481582487144e-06, + "loss": 0.4888, + "step": 8357 + }, + { + "epoch": 0.66, + "grad_norm": 1.9738203611163148, + "learning_rate": 2.7893407726830633e-06, + "loss": 0.4465, + "step": 8358 + }, + { + "epoch": 0.66, + "grad_norm": 0.5877463799673889, + "learning_rate": 2.788200105905745e-06, + "loss": 0.5084, + "step": 8359 + }, + { + "epoch": 0.66, + "grad_norm": 2.289425328467674, + "learning_rate": 2.787059582228984e-06, + "loss": 0.4557, + "step": 8360 + }, + { + "epoch": 0.66, + "grad_norm": 1.3600632331522529, + "learning_rate": 2.7859192017265734e-06, + "loss": 0.4491, + "step": 8361 + }, + { + "epoch": 0.66, + "grad_norm": 0.5846674638150436, + "learning_rate": 2.7847789644722945e-06, + "loss": 0.4691, + "step": 8362 + }, + { + "epoch": 0.66, + "grad_norm": 1.7295862358588767, + "learning_rate": 2.783638870539918e-06, + "loss": 0.4621, + "step": 8363 + }, + { + "epoch": 0.66, + "grad_norm": 2.1293941330170356, + "learning_rate": 2.7824989200032098e-06, + "loss": 0.4651, + "step": 8364 + }, + { + "epoch": 0.66, + "grad_norm": 2.189229836462391, + "learning_rate": 2.7813591129359185e-06, + "loss": 0.4764, + "step": 8365 + }, + { + "epoch": 0.66, + "grad_norm": 1.8378040021790647, + "learning_rate": 2.78021944941179e-06, + "loss": 0.5054, + "step": 8366 + }, + { + "epoch": 0.66, + "grad_norm": 2.141738031654644, + "learning_rate": 2.7790799295045595e-06, + "loss": 0.4571, + "step": 8367 + }, + { + "epoch": 0.66, + "grad_norm": 3.002590834228274, + "learning_rate": 2.7779405532879545e-06, + "loss": 0.5377, + "step": 8368 + }, + { + "epoch": 0.66, + "grad_norm": 1.5624367821109841, + "learning_rate": 2.7768013208356835e-06, + "loss": 0.4628, + "step": 8369 + }, + { + "epoch": 0.66, + "grad_norm": 1.8460637358156968, + "learning_rate": 2.7756622322214646e-06, + "loss": 0.4131, + "step": 8370 + }, + { + "epoch": 0.66, + "grad_norm": 1.8655870718104048, + "learning_rate": 2.7745232875189866e-06, + "loss": 0.453, + "step": 8371 + }, + { + "epoch": 0.66, + "grad_norm": 1.6279504498558004, + "learning_rate": 2.7733844868019406e-06, + "loss": 0.4528, + "step": 8372 + }, + { + "epoch": 0.66, + "grad_norm": 5.404658256272628, + "learning_rate": 2.7722458301440053e-06, + "loss": 0.4496, + "step": 8373 + }, + { + "epoch": 0.66, + "grad_norm": 1.438696999281865, + "learning_rate": 2.771107317618851e-06, + "loss": 0.4156, + "step": 8374 + }, + { + "epoch": 0.66, + "grad_norm": 2.284069264363936, + "learning_rate": 2.769968949300139e-06, + "loss": 0.4312, + "step": 8375 + }, + { + "epoch": 0.66, + "grad_norm": 2.587769896459424, + "learning_rate": 2.7688307252615172e-06, + "loss": 0.476, + "step": 8376 + }, + { + "epoch": 0.66, + "grad_norm": 1.8013428202903021, + "learning_rate": 2.7676926455766284e-06, + "loss": 0.4594, + "step": 8377 + }, + { + "epoch": 0.66, + "grad_norm": 1.9031606263788614, + "learning_rate": 2.7665547103191054e-06, + "loss": 0.4725, + "step": 8378 + }, + { + "epoch": 0.66, + "grad_norm": 2.141558528674537, + "learning_rate": 2.765416919562573e-06, + "loss": 0.4454, + "step": 8379 + }, + { + "epoch": 0.66, + "grad_norm": 2.9625475439093094, + "learning_rate": 2.7642792733806405e-06, + "loss": 0.4441, + "step": 8380 + }, + { + "epoch": 0.66, + "grad_norm": 1.6386532775864326, + "learning_rate": 2.7631417718469147e-06, + "loss": 0.446, + "step": 8381 + }, + { + "epoch": 0.66, + "grad_norm": 1.5163095294031517, + "learning_rate": 2.7620044150349897e-06, + "loss": 0.4413, + "step": 8382 + }, + { + "epoch": 0.66, + "grad_norm": 1.7993799323997333, + "learning_rate": 2.760867203018451e-06, + "loss": 0.5195, + "step": 8383 + }, + { + "epoch": 0.66, + "grad_norm": 1.399157036297125, + "learning_rate": 2.759730135870878e-06, + "loss": 0.4437, + "step": 8384 + }, + { + "epoch": 0.66, + "grad_norm": 1.6099059156589026, + "learning_rate": 2.75859321366583e-06, + "loss": 0.5127, + "step": 8385 + }, + { + "epoch": 0.66, + "grad_norm": 2.072333514439562, + "learning_rate": 2.757456436476873e-06, + "loss": 0.462, + "step": 8386 + }, + { + "epoch": 0.66, + "grad_norm": 1.7351578955591256, + "learning_rate": 2.7563198043775493e-06, + "loss": 0.4621, + "step": 8387 + }, + { + "epoch": 0.66, + "grad_norm": 1.688031589463551, + "learning_rate": 2.755183317441399e-06, + "loss": 0.4848, + "step": 8388 + }, + { + "epoch": 0.66, + "grad_norm": 0.541015625171994, + "learning_rate": 2.754046975741951e-06, + "loss": 0.4711, + "step": 8389 + }, + { + "epoch": 0.66, + "grad_norm": 1.4927620728469397, + "learning_rate": 2.7529107793527284e-06, + "loss": 0.4529, + "step": 8390 + }, + { + "epoch": 0.66, + "grad_norm": 1.7690020372398174, + "learning_rate": 2.7517747283472364e-06, + "loss": 0.4486, + "step": 8391 + }, + { + "epoch": 0.66, + "grad_norm": 0.5273022297394172, + "learning_rate": 2.7506388227989778e-06, + "loss": 0.4845, + "step": 8392 + }, + { + "epoch": 0.66, + "grad_norm": 1.9213654365255597, + "learning_rate": 2.7495030627814455e-06, + "loss": 0.4256, + "step": 8393 + }, + { + "epoch": 0.66, + "grad_norm": 1.7624012089148893, + "learning_rate": 2.7483674483681213e-06, + "loss": 0.4227, + "step": 8394 + }, + { + "epoch": 0.66, + "grad_norm": 1.472248459469368, + "learning_rate": 2.7472319796324793e-06, + "loss": 0.4334, + "step": 8395 + }, + { + "epoch": 0.66, + "grad_norm": 1.7581027745886375, + "learning_rate": 2.746096656647979e-06, + "loss": 0.4965, + "step": 8396 + }, + { + "epoch": 0.66, + "grad_norm": 1.6414922311337243, + "learning_rate": 2.744961479488077e-06, + "loss": 0.4498, + "step": 8397 + }, + { + "epoch": 0.66, + "grad_norm": 3.3309071229241747, + "learning_rate": 2.743826448226217e-06, + "loss": 0.4042, + "step": 8398 + }, + { + "epoch": 0.66, + "grad_norm": 1.824415640490874, + "learning_rate": 2.7426915629358364e-06, + "loss": 0.4779, + "step": 8399 + }, + { + "epoch": 0.66, + "grad_norm": 1.6230445499614004, + "learning_rate": 2.7415568236903567e-06, + "loss": 0.396, + "step": 8400 + }, + { + "epoch": 0.66, + "grad_norm": 1.3902487783454, + "learning_rate": 2.740422230563197e-06, + "loss": 0.4316, + "step": 8401 + }, + { + "epoch": 0.66, + "grad_norm": 2.0682433053236386, + "learning_rate": 2.739287783627762e-06, + "loss": 0.4449, + "step": 8402 + }, + { + "epoch": 0.66, + "grad_norm": 3.075694457736074, + "learning_rate": 2.738153482957452e-06, + "loss": 0.4646, + "step": 8403 + }, + { + "epoch": 0.66, + "grad_norm": 1.8484991813968688, + "learning_rate": 2.7370193286256495e-06, + "loss": 0.4454, + "step": 8404 + }, + { + "epoch": 0.66, + "grad_norm": 2.293613668368275, + "learning_rate": 2.7358853207057394e-06, + "loss": 0.4534, + "step": 8405 + }, + { + "epoch": 0.66, + "grad_norm": 1.782463121768709, + "learning_rate": 2.7347514592710855e-06, + "loss": 0.4712, + "step": 8406 + }, + { + "epoch": 0.66, + "grad_norm": 1.5203310909192354, + "learning_rate": 2.733617744395049e-06, + "loss": 0.4353, + "step": 8407 + }, + { + "epoch": 0.66, + "grad_norm": 1.6304307686807669, + "learning_rate": 2.7324841761509793e-06, + "loss": 0.4466, + "step": 8408 + }, + { + "epoch": 0.66, + "grad_norm": 3.084056826536372, + "learning_rate": 2.7313507546122177e-06, + "loss": 0.4692, + "step": 8409 + }, + { + "epoch": 0.66, + "grad_norm": 0.5836718673910857, + "learning_rate": 2.7302174798520958e-06, + "loss": 0.466, + "step": 8410 + }, + { + "epoch": 0.66, + "grad_norm": 1.5349119451230968, + "learning_rate": 2.7290843519439326e-06, + "loss": 0.4278, + "step": 8411 + }, + { + "epoch": 0.66, + "grad_norm": 2.916947879716906, + "learning_rate": 2.72795137096104e-06, + "loss": 0.4467, + "step": 8412 + }, + { + "epoch": 0.66, + "grad_norm": 3.2891707112800375, + "learning_rate": 2.726818536976722e-06, + "loss": 0.5197, + "step": 8413 + }, + { + "epoch": 0.66, + "grad_norm": 1.438233219845548, + "learning_rate": 2.725685850064273e-06, + "loss": 0.4618, + "step": 8414 + }, + { + "epoch": 0.66, + "grad_norm": 1.7912730182133705, + "learning_rate": 2.7245533102969723e-06, + "loss": 0.4339, + "step": 8415 + }, + { + "epoch": 0.66, + "grad_norm": 1.7978089161464292, + "learning_rate": 2.7234209177480957e-06, + "loss": 0.4439, + "step": 8416 + }, + { + "epoch": 0.66, + "grad_norm": 1.8309355448519167, + "learning_rate": 2.7222886724909083e-06, + "loss": 0.4232, + "step": 8417 + }, + { + "epoch": 0.66, + "grad_norm": 1.4381159065529576, + "learning_rate": 2.7211565745986635e-06, + "loss": 0.4409, + "step": 8418 + }, + { + "epoch": 0.66, + "grad_norm": 1.4128080723197063, + "learning_rate": 2.7200246241446086e-06, + "loss": 0.442, + "step": 8419 + }, + { + "epoch": 0.66, + "grad_norm": 2.298622242161887, + "learning_rate": 2.718892821201975e-06, + "loss": 0.4353, + "step": 8420 + }, + { + "epoch": 0.66, + "grad_norm": 2.216261716874472, + "learning_rate": 2.717761165843995e-06, + "loss": 0.4954, + "step": 8421 + }, + { + "epoch": 0.66, + "grad_norm": 1.674568964980296, + "learning_rate": 2.7166296581438806e-06, + "loss": 0.4312, + "step": 8422 + }, + { + "epoch": 0.66, + "grad_norm": 1.825403042246879, + "learning_rate": 2.71549829817484e-06, + "loss": 0.4995, + "step": 8423 + }, + { + "epoch": 0.66, + "grad_norm": 1.8666790991962592, + "learning_rate": 2.7143670860100713e-06, + "loss": 0.4523, + "step": 8424 + }, + { + "epoch": 0.66, + "grad_norm": 2.1617083482127657, + "learning_rate": 2.7132360217227637e-06, + "loss": 0.4635, + "step": 8425 + }, + { + "epoch": 0.66, + "grad_norm": 18.667210762189818, + "learning_rate": 2.7121051053860925e-06, + "loss": 0.4483, + "step": 8426 + }, + { + "epoch": 0.66, + "grad_norm": 3.06093772514321, + "learning_rate": 2.7109743370732276e-06, + "loss": 0.4093, + "step": 8427 + }, + { + "epoch": 0.66, + "grad_norm": 1.3832167579246195, + "learning_rate": 2.7098437168573287e-06, + "loss": 0.4068, + "step": 8428 + }, + { + "epoch": 0.66, + "grad_norm": 1.8066684483924873, + "learning_rate": 2.7087132448115463e-06, + "loss": 0.4502, + "step": 8429 + }, + { + "epoch": 0.66, + "grad_norm": 1.5061844776910662, + "learning_rate": 2.707582921009021e-06, + "loss": 0.4126, + "step": 8430 + }, + { + "epoch": 0.66, + "grad_norm": 0.5631107837904047, + "learning_rate": 2.7064527455228796e-06, + "loss": 0.4904, + "step": 8431 + }, + { + "epoch": 0.66, + "grad_norm": 1.6146969850143555, + "learning_rate": 2.7053227184262454e-06, + "loss": 0.4155, + "step": 8432 + }, + { + "epoch": 0.66, + "grad_norm": 1.6989812615926074, + "learning_rate": 2.704192839792229e-06, + "loss": 0.4233, + "step": 8433 + }, + { + "epoch": 0.66, + "grad_norm": 1.5768343822537383, + "learning_rate": 2.7030631096939355e-06, + "loss": 0.4276, + "step": 8434 + }, + { + "epoch": 0.66, + "grad_norm": 0.5914072490165013, + "learning_rate": 2.7019335282044513e-06, + "loss": 0.4897, + "step": 8435 + }, + { + "epoch": 0.66, + "grad_norm": 1.9495717237674735, + "learning_rate": 2.700804095396862e-06, + "loss": 0.4889, + "step": 8436 + }, + { + "epoch": 0.66, + "grad_norm": 2.159008380093327, + "learning_rate": 2.6996748113442397e-06, + "loss": 0.399, + "step": 8437 + }, + { + "epoch": 0.66, + "grad_norm": 1.4205268812198915, + "learning_rate": 2.6985456761196504e-06, + "loss": 0.4456, + "step": 8438 + }, + { + "epoch": 0.66, + "grad_norm": 1.5707560431560048, + "learning_rate": 2.6974166897961407e-06, + "loss": 0.4781, + "step": 8439 + }, + { + "epoch": 0.66, + "grad_norm": 0.5525412916814305, + "learning_rate": 2.696287852446761e-06, + "loss": 0.4906, + "step": 8440 + }, + { + "epoch": 0.66, + "grad_norm": 0.5591902389242657, + "learning_rate": 2.6951591641445463e-06, + "loss": 0.4899, + "step": 8441 + }, + { + "epoch": 0.66, + "grad_norm": 1.6514237805596739, + "learning_rate": 2.694030624962516e-06, + "loss": 0.4483, + "step": 8442 + }, + { + "epoch": 0.66, + "grad_norm": 0.5570701168885115, + "learning_rate": 2.6929022349736876e-06, + "loss": 0.4705, + "step": 8443 + }, + { + "epoch": 0.66, + "grad_norm": 1.7112678720265293, + "learning_rate": 2.6917739942510666e-06, + "loss": 0.4201, + "step": 8444 + }, + { + "epoch": 0.66, + "grad_norm": 3.5183581164182973, + "learning_rate": 2.69064590286765e-06, + "loss": 0.4491, + "step": 8445 + }, + { + "epoch": 0.66, + "grad_norm": 1.977618137465116, + "learning_rate": 2.6895179608964215e-06, + "loss": 0.4835, + "step": 8446 + }, + { + "epoch": 0.66, + "grad_norm": 1.624648908913114, + "learning_rate": 2.6883901684103585e-06, + "loss": 0.4458, + "step": 8447 + }, + { + "epoch": 0.66, + "grad_norm": 1.7711426861807622, + "learning_rate": 2.6872625254824268e-06, + "loss": 0.4663, + "step": 8448 + }, + { + "epoch": 0.66, + "grad_norm": 1.3134556046696069, + "learning_rate": 2.686135032185587e-06, + "loss": 0.4006, + "step": 8449 + }, + { + "epoch": 0.66, + "grad_norm": 1.340126581507843, + "learning_rate": 2.6850076885927812e-06, + "loss": 0.4624, + "step": 8450 + }, + { + "epoch": 0.66, + "grad_norm": 0.5775121088150093, + "learning_rate": 2.683880494776948e-06, + "loss": 0.4938, + "step": 8451 + }, + { + "epoch": 0.66, + "grad_norm": 1.3875635327016926, + "learning_rate": 2.6827534508110204e-06, + "loss": 0.4328, + "step": 8452 + }, + { + "epoch": 0.66, + "grad_norm": 1.8446132874692402, + "learning_rate": 2.6816265567679112e-06, + "loss": 0.476, + "step": 8453 + }, + { + "epoch": 0.66, + "grad_norm": 1.967955552543585, + "learning_rate": 2.680499812720534e-06, + "loss": 0.5122, + "step": 8454 + }, + { + "epoch": 0.66, + "grad_norm": 2.5078882603539396, + "learning_rate": 2.67937321874178e-06, + "loss": 0.439, + "step": 8455 + }, + { + "epoch": 0.66, + "grad_norm": 1.8914479155023898, + "learning_rate": 2.678246774904546e-06, + "loss": 0.4425, + "step": 8456 + }, + { + "epoch": 0.66, + "grad_norm": 0.5940810631016251, + "learning_rate": 2.6771204812817077e-06, + "loss": 0.4842, + "step": 8457 + }, + { + "epoch": 0.66, + "grad_norm": 2.534488177194589, + "learning_rate": 2.675994337946135e-06, + "loss": 0.4495, + "step": 8458 + }, + { + "epoch": 0.66, + "grad_norm": 2.0514228141507864, + "learning_rate": 2.6748683449706887e-06, + "loss": 0.4997, + "step": 8459 + }, + { + "epoch": 0.66, + "grad_norm": 2.3883575395705976, + "learning_rate": 2.6737425024282206e-06, + "loss": 0.3829, + "step": 8460 + }, + { + "epoch": 0.66, + "grad_norm": 2.12968181554706, + "learning_rate": 2.6726168103915675e-06, + "loss": 0.4785, + "step": 8461 + }, + { + "epoch": 0.66, + "grad_norm": 1.4117946499171357, + "learning_rate": 2.6714912689335627e-06, + "loss": 0.4052, + "step": 8462 + }, + { + "epoch": 0.66, + "grad_norm": 1.8756036163836711, + "learning_rate": 2.6703658781270257e-06, + "loss": 0.4659, + "step": 8463 + }, + { + "epoch": 0.66, + "grad_norm": 1.468828749774537, + "learning_rate": 2.66924063804477e-06, + "loss": 0.4377, + "step": 8464 + }, + { + "epoch": 0.66, + "grad_norm": 2.309431145182377, + "learning_rate": 2.6681155487595967e-06, + "loss": 0.4578, + "step": 8465 + }, + { + "epoch": 0.66, + "grad_norm": 1.7996191243156232, + "learning_rate": 2.6669906103442957e-06, + "loss": 0.4708, + "step": 8466 + }, + { + "epoch": 0.66, + "grad_norm": 1.8765046758926833, + "learning_rate": 2.6658658228716503e-06, + "loss": 0.4406, + "step": 8467 + }, + { + "epoch": 0.67, + "grad_norm": 2.1124950900533443, + "learning_rate": 2.6647411864144322e-06, + "loss": 0.4353, + "step": 8468 + }, + { + "epoch": 0.67, + "grad_norm": 1.6868116547868814, + "learning_rate": 2.6636167010454074e-06, + "loss": 0.4134, + "step": 8469 + }, + { + "epoch": 0.67, + "grad_norm": 1.4886864699169808, + "learning_rate": 2.6624923668373225e-06, + "loss": 0.4429, + "step": 8470 + }, + { + "epoch": 0.67, + "grad_norm": 2.1118298259732167, + "learning_rate": 2.6613681838629245e-06, + "loss": 0.4653, + "step": 8471 + }, + { + "epoch": 0.67, + "grad_norm": 1.7302784910478501, + "learning_rate": 2.6602441521949455e-06, + "loss": 0.4425, + "step": 8472 + }, + { + "epoch": 0.67, + "grad_norm": 1.5630853160528666, + "learning_rate": 2.6591202719061093e-06, + "loss": 0.491, + "step": 8473 + }, + { + "epoch": 0.67, + "grad_norm": 2.1683949637166067, + "learning_rate": 2.657996543069129e-06, + "loss": 0.4348, + "step": 8474 + }, + { + "epoch": 0.67, + "grad_norm": 1.5872247644502637, + "learning_rate": 2.6568729657567083e-06, + "loss": 0.443, + "step": 8475 + }, + { + "epoch": 0.67, + "grad_norm": 1.612841290188638, + "learning_rate": 2.6557495400415435e-06, + "loss": 0.4887, + "step": 8476 + }, + { + "epoch": 0.67, + "grad_norm": 1.516457498165126, + "learning_rate": 2.654626265996315e-06, + "loss": 0.4446, + "step": 8477 + }, + { + "epoch": 0.67, + "grad_norm": 2.1979977524622476, + "learning_rate": 2.6535031436937e-06, + "loss": 0.3991, + "step": 8478 + }, + { + "epoch": 0.67, + "grad_norm": 1.5278825741034252, + "learning_rate": 2.6523801732063603e-06, + "loss": 0.4107, + "step": 8479 + }, + { + "epoch": 0.67, + "grad_norm": 0.5707282340576225, + "learning_rate": 2.6512573546069555e-06, + "loss": 0.4949, + "step": 8480 + }, + { + "epoch": 0.67, + "grad_norm": 1.8905851393611177, + "learning_rate": 2.6501346879681243e-06, + "loss": 0.4511, + "step": 8481 + }, + { + "epoch": 0.67, + "grad_norm": 0.5683133185116268, + "learning_rate": 2.6490121733625055e-06, + "loss": 0.5019, + "step": 8482 + }, + { + "epoch": 0.67, + "grad_norm": 1.6757818898066488, + "learning_rate": 2.647889810862724e-06, + "loss": 0.447, + "step": 8483 + }, + { + "epoch": 0.67, + "grad_norm": 0.5315035226698239, + "learning_rate": 2.6467676005413955e-06, + "loss": 0.4765, + "step": 8484 + }, + { + "epoch": 0.67, + "grad_norm": 2.624385860488442, + "learning_rate": 2.6456455424711235e-06, + "loss": 0.4318, + "step": 8485 + }, + { + "epoch": 0.67, + "grad_norm": 1.5090856842110851, + "learning_rate": 2.6445236367245037e-06, + "loss": 0.4684, + "step": 8486 + }, + { + "epoch": 0.67, + "grad_norm": 1.8167839396921261, + "learning_rate": 2.6434018833741265e-06, + "loss": 0.4296, + "step": 8487 + }, + { + "epoch": 0.67, + "grad_norm": 0.5642067530405176, + "learning_rate": 2.642280282492563e-06, + "loss": 0.4662, + "step": 8488 + }, + { + "epoch": 0.67, + "grad_norm": 2.0329295704740686, + "learning_rate": 2.6411588341523827e-06, + "loss": 0.429, + "step": 8489 + }, + { + "epoch": 0.67, + "grad_norm": 1.8371561954660316, + "learning_rate": 2.6400375384261356e-06, + "loss": 0.4821, + "step": 8490 + }, + { + "epoch": 0.67, + "grad_norm": 2.564146389919857, + "learning_rate": 2.638916395386377e-06, + "loss": 0.4751, + "step": 8491 + }, + { + "epoch": 0.67, + "grad_norm": 0.5552921235632774, + "learning_rate": 2.6377954051056374e-06, + "loss": 0.4775, + "step": 8492 + }, + { + "epoch": 0.67, + "grad_norm": 1.7284281295429405, + "learning_rate": 2.6366745676564445e-06, + "loss": 0.4726, + "step": 8493 + }, + { + "epoch": 0.67, + "grad_norm": 1.8370945594716481, + "learning_rate": 2.6355538831113153e-06, + "loss": 0.4031, + "step": 8494 + }, + { + "epoch": 0.67, + "grad_norm": 2.040817039820376, + "learning_rate": 2.634433351542759e-06, + "loss": 0.4739, + "step": 8495 + }, + { + "epoch": 0.67, + "grad_norm": 1.5078401204528307, + "learning_rate": 2.633312973023268e-06, + "loss": 0.453, + "step": 8496 + }, + { + "epoch": 0.67, + "grad_norm": 1.293889457053298, + "learning_rate": 2.632192747625332e-06, + "loss": 0.4403, + "step": 8497 + }, + { + "epoch": 0.67, + "grad_norm": 1.6745807258430505, + "learning_rate": 2.631072675421428e-06, + "loss": 0.471, + "step": 8498 + }, + { + "epoch": 0.67, + "grad_norm": 2.0634735259719865, + "learning_rate": 2.629952756484022e-06, + "loss": 0.4631, + "step": 8499 + }, + { + "epoch": 0.67, + "grad_norm": 1.8847559329080108, + "learning_rate": 2.628832990885575e-06, + "loss": 0.4264, + "step": 8500 + }, + { + "epoch": 0.67, + "grad_norm": 1.7825098481398827, + "learning_rate": 2.6277133786985297e-06, + "loss": 0.4513, + "step": 8501 + }, + { + "epoch": 0.67, + "grad_norm": 1.567724441603882, + "learning_rate": 2.626593919995325e-06, + "loss": 0.4635, + "step": 8502 + }, + { + "epoch": 0.67, + "grad_norm": 1.6950739132535348, + "learning_rate": 2.6254746148483888e-06, + "loss": 0.4178, + "step": 8503 + }, + { + "epoch": 0.67, + "grad_norm": 1.507370779644991, + "learning_rate": 2.6243554633301414e-06, + "loss": 0.4034, + "step": 8504 + }, + { + "epoch": 0.67, + "grad_norm": 1.9862938573743107, + "learning_rate": 2.623236465512985e-06, + "loss": 0.4807, + "step": 8505 + }, + { + "epoch": 0.67, + "grad_norm": 0.5831015312586768, + "learning_rate": 2.622117621469321e-06, + "loss": 0.4774, + "step": 8506 + }, + { + "epoch": 0.67, + "grad_norm": 1.4719524421445083, + "learning_rate": 2.6209989312715355e-06, + "loss": 0.3982, + "step": 8507 + }, + { + "epoch": 0.67, + "grad_norm": 1.6524371419219617, + "learning_rate": 2.6198803949920072e-06, + "loss": 0.4424, + "step": 8508 + }, + { + "epoch": 0.67, + "grad_norm": 0.5389395096091447, + "learning_rate": 2.618762012703104e-06, + "loss": 0.4764, + "step": 8509 + }, + { + "epoch": 0.67, + "grad_norm": 1.5818063829934148, + "learning_rate": 2.617643784477183e-06, + "loss": 0.4682, + "step": 8510 + }, + { + "epoch": 0.67, + "grad_norm": 1.8773826057078797, + "learning_rate": 2.616525710386595e-06, + "loss": 0.4931, + "step": 8511 + }, + { + "epoch": 0.67, + "grad_norm": 3.6377089041080937, + "learning_rate": 2.6154077905036734e-06, + "loss": 0.4367, + "step": 8512 + }, + { + "epoch": 0.67, + "grad_norm": 2.4027818187605505, + "learning_rate": 2.6142900249007487e-06, + "loss": 0.4937, + "step": 8513 + }, + { + "epoch": 0.67, + "grad_norm": 2.1403029345983993, + "learning_rate": 2.613172413650138e-06, + "loss": 0.4381, + "step": 8514 + }, + { + "epoch": 0.67, + "grad_norm": 1.5174338589195369, + "learning_rate": 2.612054956824152e-06, + "loss": 0.4698, + "step": 8515 + }, + { + "epoch": 0.67, + "grad_norm": 2.0666287030710415, + "learning_rate": 2.6109376544950847e-06, + "loss": 0.4822, + "step": 8516 + }, + { + "epoch": 0.67, + "grad_norm": 1.4342382356074892, + "learning_rate": 2.6098205067352255e-06, + "loss": 0.45, + "step": 8517 + }, + { + "epoch": 0.67, + "grad_norm": 0.5824856193519303, + "learning_rate": 2.6087035136168526e-06, + "loss": 0.4886, + "step": 8518 + }, + { + "epoch": 0.67, + "grad_norm": 3.1622282307878837, + "learning_rate": 2.6075866752122346e-06, + "loss": 0.4874, + "step": 8519 + }, + { + "epoch": 0.67, + "grad_norm": 2.3500344806688513, + "learning_rate": 2.606469991593631e-06, + "loss": 0.4618, + "step": 8520 + }, + { + "epoch": 0.67, + "grad_norm": 1.5917134376312063, + "learning_rate": 2.605353462833284e-06, + "loss": 0.4721, + "step": 8521 + }, + { + "epoch": 0.67, + "grad_norm": 1.693608857713611, + "learning_rate": 2.6042370890034387e-06, + "loss": 0.5131, + "step": 8522 + }, + { + "epoch": 0.67, + "grad_norm": 1.6137690527316064, + "learning_rate": 2.603120870176318e-06, + "loss": 0.5005, + "step": 8523 + }, + { + "epoch": 0.67, + "grad_norm": 3.1938158946481114, + "learning_rate": 2.602004806424144e-06, + "loss": 0.4397, + "step": 8524 + }, + { + "epoch": 0.67, + "grad_norm": 2.152766065455655, + "learning_rate": 2.600888897819117e-06, + "loss": 0.4384, + "step": 8525 + }, + { + "epoch": 0.67, + "grad_norm": 1.4056162054945518, + "learning_rate": 2.599773144433445e-06, + "loss": 0.4124, + "step": 8526 + }, + { + "epoch": 0.67, + "grad_norm": 0.5641305285671968, + "learning_rate": 2.5986575463393084e-06, + "loss": 0.4952, + "step": 8527 + }, + { + "epoch": 0.67, + "grad_norm": 1.7002122904668833, + "learning_rate": 2.5975421036088867e-06, + "loss": 0.4981, + "step": 8528 + }, + { + "epoch": 0.67, + "grad_norm": 1.69507720004764, + "learning_rate": 2.596426816314348e-06, + "loss": 0.4376, + "step": 8529 + }, + { + "epoch": 0.67, + "grad_norm": 1.576468601082067, + "learning_rate": 2.59531168452785e-06, + "loss": 0.4381, + "step": 8530 + }, + { + "epoch": 0.67, + "grad_norm": 1.8429147412969076, + "learning_rate": 2.5941967083215434e-06, + "loss": 0.4373, + "step": 8531 + }, + { + "epoch": 0.67, + "grad_norm": 1.965153882742663, + "learning_rate": 2.5930818877675595e-06, + "loss": 0.4456, + "step": 8532 + }, + { + "epoch": 0.67, + "grad_norm": 1.7499842478578287, + "learning_rate": 2.5919672229380287e-06, + "loss": 0.4288, + "step": 8533 + }, + { + "epoch": 0.67, + "grad_norm": 1.4858902344662497, + "learning_rate": 2.59085271390507e-06, + "loss": 0.4326, + "step": 8534 + }, + { + "epoch": 0.67, + "grad_norm": 1.699835041205681, + "learning_rate": 2.5897383607407894e-06, + "loss": 0.4256, + "step": 8535 + }, + { + "epoch": 0.67, + "grad_norm": 1.891612624963404, + "learning_rate": 2.5886241635172828e-06, + "loss": 0.4765, + "step": 8536 + }, + { + "epoch": 0.67, + "grad_norm": 1.4342743353297844, + "learning_rate": 2.587510122306638e-06, + "loss": 0.5153, + "step": 8537 + }, + { + "epoch": 0.67, + "grad_norm": 1.5554436417728643, + "learning_rate": 2.5863962371809327e-06, + "loss": 0.4616, + "step": 8538 + }, + { + "epoch": 0.67, + "grad_norm": 2.3285952490353705, + "learning_rate": 2.5852825082122344e-06, + "loss": 0.5175, + "step": 8539 + }, + { + "epoch": 0.67, + "grad_norm": 1.3601572878960517, + "learning_rate": 2.584168935472598e-06, + "loss": 0.4244, + "step": 8540 + }, + { + "epoch": 0.67, + "grad_norm": 1.7248061866880613, + "learning_rate": 2.583055519034069e-06, + "loss": 0.4202, + "step": 8541 + }, + { + "epoch": 0.67, + "grad_norm": 1.5417979528186434, + "learning_rate": 2.5819422589686897e-06, + "loss": 0.4044, + "step": 8542 + }, + { + "epoch": 0.67, + "grad_norm": 0.540173216546552, + "learning_rate": 2.5808291553484815e-06, + "loss": 0.496, + "step": 8543 + }, + { + "epoch": 0.67, + "grad_norm": 1.5293482959917508, + "learning_rate": 2.5797162082454625e-06, + "loss": 0.458, + "step": 8544 + }, + { + "epoch": 0.67, + "grad_norm": 1.450903397994852, + "learning_rate": 2.5786034177316384e-06, + "loss": 0.47, + "step": 8545 + }, + { + "epoch": 0.67, + "grad_norm": 1.8479480616204647, + "learning_rate": 2.5774907838790076e-06, + "loss": 0.403, + "step": 8546 + }, + { + "epoch": 0.67, + "grad_norm": 1.6077915845405977, + "learning_rate": 2.5763783067595517e-06, + "loss": 0.4278, + "step": 8547 + }, + { + "epoch": 0.67, + "grad_norm": 4.837164837608964, + "learning_rate": 2.575265986445249e-06, + "loss": 0.4479, + "step": 8548 + }, + { + "epoch": 0.67, + "grad_norm": 1.9004693878098304, + "learning_rate": 2.574153823008065e-06, + "loss": 0.4329, + "step": 8549 + }, + { + "epoch": 0.67, + "grad_norm": 1.6395205179966024, + "learning_rate": 2.5730418165199578e-06, + "loss": 0.5045, + "step": 8550 + }, + { + "epoch": 0.67, + "grad_norm": 2.2341567082617284, + "learning_rate": 2.5719299670528674e-06, + "loss": 0.4771, + "step": 8551 + }, + { + "epoch": 0.67, + "grad_norm": 1.986118391247093, + "learning_rate": 2.5708182746787326e-06, + "loss": 0.4866, + "step": 8552 + }, + { + "epoch": 0.67, + "grad_norm": 2.421030109128693, + "learning_rate": 2.5697067394694775e-06, + "loss": 0.453, + "step": 8553 + }, + { + "epoch": 0.67, + "grad_norm": 2.2025967267212385, + "learning_rate": 2.568595361497017e-06, + "loss": 0.4518, + "step": 8554 + }, + { + "epoch": 0.67, + "grad_norm": 2.110569780574495, + "learning_rate": 2.5674841408332573e-06, + "loss": 0.4297, + "step": 8555 + }, + { + "epoch": 0.67, + "grad_norm": 1.5789818236593, + "learning_rate": 2.5663730775500884e-06, + "loss": 0.4786, + "step": 8556 + }, + { + "epoch": 0.67, + "grad_norm": 1.850104729124118, + "learning_rate": 2.5652621717194015e-06, + "loss": 0.4091, + "step": 8557 + }, + { + "epoch": 0.67, + "grad_norm": 1.731488864548277, + "learning_rate": 2.564151423413065e-06, + "loss": 0.4604, + "step": 8558 + }, + { + "epoch": 0.67, + "grad_norm": 1.49003029567321, + "learning_rate": 2.5630408327029473e-06, + "loss": 0.3777, + "step": 8559 + }, + { + "epoch": 0.67, + "grad_norm": 2.0707340001976977, + "learning_rate": 2.5619303996608956e-06, + "loss": 0.4669, + "step": 8560 + }, + { + "epoch": 0.67, + "grad_norm": 1.3563625693508323, + "learning_rate": 2.5608201243587615e-06, + "loss": 0.4073, + "step": 8561 + }, + { + "epoch": 0.67, + "grad_norm": 0.5739350422634835, + "learning_rate": 2.559710006868372e-06, + "loss": 0.475, + "step": 8562 + }, + { + "epoch": 0.67, + "grad_norm": 1.6539876009477463, + "learning_rate": 2.558600047261556e-06, + "loss": 0.44, + "step": 8563 + }, + { + "epoch": 0.67, + "grad_norm": 2.9642129818604936, + "learning_rate": 2.557490245610118e-06, + "loss": 0.5015, + "step": 8564 + }, + { + "epoch": 0.67, + "grad_norm": 0.5945089426742003, + "learning_rate": 2.5563806019858685e-06, + "loss": 0.4827, + "step": 8565 + }, + { + "epoch": 0.67, + "grad_norm": 2.4074496052760956, + "learning_rate": 2.5552711164605993e-06, + "loss": 0.4507, + "step": 8566 + }, + { + "epoch": 0.67, + "grad_norm": 1.5297986086049986, + "learning_rate": 2.554161789106089e-06, + "loss": 0.4123, + "step": 8567 + }, + { + "epoch": 0.67, + "grad_norm": 1.9130238565323032, + "learning_rate": 2.553052619994111e-06, + "loss": 0.4755, + "step": 8568 + }, + { + "epoch": 0.67, + "grad_norm": 1.9626426660853369, + "learning_rate": 2.551943609196428e-06, + "loss": 0.4997, + "step": 8569 + }, + { + "epoch": 0.67, + "grad_norm": 1.8235505115878488, + "learning_rate": 2.5508347567847918e-06, + "loss": 0.4507, + "step": 8570 + }, + { + "epoch": 0.67, + "grad_norm": 1.5009336228453374, + "learning_rate": 2.5497260628309417e-06, + "loss": 0.4378, + "step": 8571 + }, + { + "epoch": 0.67, + "grad_norm": 4.299761076880709, + "learning_rate": 2.5486175274066103e-06, + "loss": 0.474, + "step": 8572 + }, + { + "epoch": 0.67, + "grad_norm": 2.0266383355768753, + "learning_rate": 2.547509150583518e-06, + "loss": 0.4721, + "step": 8573 + }, + { + "epoch": 0.67, + "grad_norm": 1.5312536057926136, + "learning_rate": 2.546400932433377e-06, + "loss": 0.4383, + "step": 8574 + }, + { + "epoch": 0.67, + "grad_norm": 1.8430456698327, + "learning_rate": 2.5452928730278838e-06, + "loss": 0.4483, + "step": 8575 + }, + { + "epoch": 0.67, + "grad_norm": 4.088097365766597, + "learning_rate": 2.544184972438729e-06, + "loss": 0.4242, + "step": 8576 + }, + { + "epoch": 0.67, + "grad_norm": 1.7534001890111448, + "learning_rate": 2.543077230737598e-06, + "loss": 0.4751, + "step": 8577 + }, + { + "epoch": 0.67, + "grad_norm": 2.376587771695501, + "learning_rate": 2.541969647996154e-06, + "loss": 0.4518, + "step": 8578 + }, + { + "epoch": 0.67, + "grad_norm": 1.7342260331926937, + "learning_rate": 2.540862224286058e-06, + "loss": 0.4812, + "step": 8579 + }, + { + "epoch": 0.67, + "grad_norm": 0.5734388931233861, + "learning_rate": 2.5397549596789596e-06, + "loss": 0.4952, + "step": 8580 + }, + { + "epoch": 0.67, + "grad_norm": 1.4332261127641648, + "learning_rate": 2.5386478542464983e-06, + "loss": 0.3881, + "step": 8581 + }, + { + "epoch": 0.67, + "grad_norm": 2.2727493909285594, + "learning_rate": 2.537540908060299e-06, + "loss": 0.3958, + "step": 8582 + }, + { + "epoch": 0.67, + "grad_norm": 1.4639662977067223, + "learning_rate": 2.5364341211919817e-06, + "loss": 0.4752, + "step": 8583 + }, + { + "epoch": 0.67, + "grad_norm": 0.5761633576145906, + "learning_rate": 2.5353274937131545e-06, + "loss": 0.4901, + "step": 8584 + }, + { + "epoch": 0.67, + "grad_norm": 1.6340536612938814, + "learning_rate": 2.534221025695416e-06, + "loss": 0.4408, + "step": 8585 + }, + { + "epoch": 0.67, + "grad_norm": 0.5646026971952806, + "learning_rate": 2.53311471721035e-06, + "loss": 0.4779, + "step": 8586 + }, + { + "epoch": 0.67, + "grad_norm": 3.5443148151215245, + "learning_rate": 2.5320085683295337e-06, + "loss": 0.4361, + "step": 8587 + }, + { + "epoch": 0.67, + "grad_norm": 1.3793015911878326, + "learning_rate": 2.530902579124535e-06, + "loss": 0.4311, + "step": 8588 + }, + { + "epoch": 0.67, + "grad_norm": 1.6410426161453069, + "learning_rate": 2.5297967496669096e-06, + "loss": 0.4645, + "step": 8589 + }, + { + "epoch": 0.67, + "grad_norm": 0.5528993356735784, + "learning_rate": 2.5286910800282048e-06, + "loss": 0.4958, + "step": 8590 + }, + { + "epoch": 0.67, + "grad_norm": 2.475619101163341, + "learning_rate": 2.52758557027995e-06, + "loss": 0.4419, + "step": 8591 + }, + { + "epoch": 0.67, + "grad_norm": 1.6119102276469228, + "learning_rate": 2.526480220493679e-06, + "loss": 0.4869, + "step": 8592 + }, + { + "epoch": 0.67, + "grad_norm": 1.7778331900365023, + "learning_rate": 2.5253750307408996e-06, + "loss": 0.4321, + "step": 8593 + }, + { + "epoch": 0.67, + "grad_norm": 1.5411127459319414, + "learning_rate": 2.5242700010931207e-06, + "loss": 0.473, + "step": 8594 + }, + { + "epoch": 0.68, + "grad_norm": 1.6033165290274263, + "learning_rate": 2.52316513162183e-06, + "loss": 0.495, + "step": 8595 + }, + { + "epoch": 0.68, + "grad_norm": 0.5167209242897366, + "learning_rate": 2.522060422398519e-06, + "loss": 0.4819, + "step": 8596 + }, + { + "epoch": 0.68, + "grad_norm": 2.009066943752502, + "learning_rate": 2.520955873494656e-06, + "loss": 0.4725, + "step": 8597 + }, + { + "epoch": 0.68, + "grad_norm": 0.5264861073931983, + "learning_rate": 2.519851484981704e-06, + "loss": 0.4712, + "step": 8598 + }, + { + "epoch": 0.68, + "grad_norm": 0.5077188082281688, + "learning_rate": 2.5187472569311167e-06, + "loss": 0.4536, + "step": 8599 + }, + { + "epoch": 0.68, + "grad_norm": 4.019115630620905, + "learning_rate": 2.517643189414336e-06, + "loss": 0.4781, + "step": 8600 + }, + { + "epoch": 0.68, + "grad_norm": 1.6532095102760669, + "learning_rate": 2.5165392825027957e-06, + "loss": 0.4258, + "step": 8601 + }, + { + "epoch": 0.68, + "grad_norm": 1.8547903271265982, + "learning_rate": 2.515435536267913e-06, + "loss": 0.4306, + "step": 8602 + }, + { + "epoch": 0.68, + "grad_norm": 2.2216175528845805, + "learning_rate": 2.514331950781101e-06, + "loss": 0.4566, + "step": 8603 + }, + { + "epoch": 0.68, + "grad_norm": 1.9686083682659197, + "learning_rate": 2.51322852611376e-06, + "loss": 0.4442, + "step": 8604 + }, + { + "epoch": 0.68, + "grad_norm": 1.5932490974825848, + "learning_rate": 2.5121252623372826e-06, + "loss": 0.4123, + "step": 8605 + }, + { + "epoch": 0.68, + "grad_norm": 1.5724265732507499, + "learning_rate": 2.5110221595230435e-06, + "loss": 0.4122, + "step": 8606 + }, + { + "epoch": 0.68, + "grad_norm": 0.5805141920196887, + "learning_rate": 2.5099192177424155e-06, + "loss": 0.4808, + "step": 8607 + }, + { + "epoch": 0.68, + "grad_norm": 1.4585497866315096, + "learning_rate": 2.5088164370667558e-06, + "loss": 0.4799, + "step": 8608 + }, + { + "epoch": 0.68, + "grad_norm": 3.169329087083359, + "learning_rate": 2.5077138175674144e-06, + "loss": 0.4654, + "step": 8609 + }, + { + "epoch": 0.68, + "grad_norm": 2.40047169601641, + "learning_rate": 2.5066113593157303e-06, + "loss": 0.4812, + "step": 8610 + }, + { + "epoch": 0.68, + "grad_norm": 1.8158049722535092, + "learning_rate": 2.505509062383026e-06, + "loss": 0.4584, + "step": 8611 + }, + { + "epoch": 0.68, + "grad_norm": 3.1137347449491664, + "learning_rate": 2.5044069268406256e-06, + "loss": 0.47, + "step": 8612 + }, + { + "epoch": 0.68, + "grad_norm": 1.7668796729463565, + "learning_rate": 2.503304952759831e-06, + "loss": 0.5049, + "step": 8613 + }, + { + "epoch": 0.68, + "grad_norm": 1.493389780900634, + "learning_rate": 2.502203140211939e-06, + "loss": 0.4732, + "step": 8614 + }, + { + "epoch": 0.68, + "grad_norm": 1.7745690779243597, + "learning_rate": 2.5011014892682366e-06, + "loss": 0.4365, + "step": 8615 + }, + { + "epoch": 0.68, + "grad_norm": 1.688350050063718, + "learning_rate": 2.5000000000000015e-06, + "loss": 0.4749, + "step": 8616 + }, + { + "epoch": 0.68, + "grad_norm": 2.5793764822172753, + "learning_rate": 2.498898672478493e-06, + "loss": 0.4334, + "step": 8617 + }, + { + "epoch": 0.68, + "grad_norm": 2.1798522161664264, + "learning_rate": 2.497797506774969e-06, + "loss": 0.4313, + "step": 8618 + }, + { + "epoch": 0.68, + "grad_norm": 1.5728479128864852, + "learning_rate": 2.4966965029606734e-06, + "loss": 0.4528, + "step": 8619 + }, + { + "epoch": 0.68, + "grad_norm": 2.1434383004298003, + "learning_rate": 2.495595661106839e-06, + "loss": 0.4011, + "step": 8620 + }, + { + "epoch": 0.68, + "grad_norm": 0.5424611765008539, + "learning_rate": 2.4944949812846915e-06, + "loss": 0.4689, + "step": 8621 + }, + { + "epoch": 0.68, + "grad_norm": 1.4775355701343522, + "learning_rate": 2.493394463565438e-06, + "loss": 0.4304, + "step": 8622 + }, + { + "epoch": 0.68, + "grad_norm": 1.6978010903989313, + "learning_rate": 2.4922941080202852e-06, + "loss": 0.4458, + "step": 8623 + }, + { + "epoch": 0.68, + "grad_norm": 1.4792508890475076, + "learning_rate": 2.4911939147204224e-06, + "loss": 0.4444, + "step": 8624 + }, + { + "epoch": 0.68, + "grad_norm": 1.5817313643310613, + "learning_rate": 2.490093883737033e-06, + "loss": 0.4179, + "step": 8625 + }, + { + "epoch": 0.68, + "grad_norm": 1.6923549140820773, + "learning_rate": 2.4889940151412824e-06, + "loss": 0.4164, + "step": 8626 + }, + { + "epoch": 0.68, + "grad_norm": 1.6833100477919725, + "learning_rate": 2.487894309004338e-06, + "loss": 0.3896, + "step": 8627 + }, + { + "epoch": 0.68, + "grad_norm": 1.9406500440740726, + "learning_rate": 2.4867947653973436e-06, + "loss": 0.4357, + "step": 8628 + }, + { + "epoch": 0.68, + "grad_norm": 1.4813109871955763, + "learning_rate": 2.4856953843914423e-06, + "loss": 0.5006, + "step": 8629 + }, + { + "epoch": 0.68, + "grad_norm": 3.4881134140075156, + "learning_rate": 2.484596166057757e-06, + "loss": 0.4452, + "step": 8630 + }, + { + "epoch": 0.68, + "grad_norm": 1.9944873329607868, + "learning_rate": 2.483497110467412e-06, + "loss": 0.4985, + "step": 8631 + }, + { + "epoch": 0.68, + "grad_norm": 3.022521918791755, + "learning_rate": 2.482398217691511e-06, + "loss": 0.4422, + "step": 8632 + }, + { + "epoch": 0.68, + "grad_norm": 1.6589273072944433, + "learning_rate": 2.481299487801151e-06, + "loss": 0.4147, + "step": 8633 + }, + { + "epoch": 0.68, + "grad_norm": 3.6240499914796653, + "learning_rate": 2.48020092086742e-06, + "loss": 0.3867, + "step": 8634 + }, + { + "epoch": 0.68, + "grad_norm": 2.3407068340163475, + "learning_rate": 2.4791025169613925e-06, + "loss": 0.431, + "step": 8635 + }, + { + "epoch": 0.68, + "grad_norm": 2.4916918372388985, + "learning_rate": 2.478004276154137e-06, + "loss": 0.4317, + "step": 8636 + }, + { + "epoch": 0.68, + "grad_norm": 1.6608103333475932, + "learning_rate": 2.476906198516703e-06, + "loss": 0.4204, + "step": 8637 + }, + { + "epoch": 0.68, + "grad_norm": 1.5124608373167754, + "learning_rate": 2.475808284120138e-06, + "loss": 0.5029, + "step": 8638 + }, + { + "epoch": 0.68, + "grad_norm": 0.5698826708287567, + "learning_rate": 2.4747105330354748e-06, + "loss": 0.5198, + "step": 8639 + }, + { + "epoch": 0.68, + "grad_norm": 1.4527197190331629, + "learning_rate": 2.4736129453337384e-06, + "loss": 0.4326, + "step": 8640 + }, + { + "epoch": 0.68, + "grad_norm": 1.9973030042093867, + "learning_rate": 2.4725155210859373e-06, + "loss": 0.4569, + "step": 8641 + }, + { + "epoch": 0.68, + "grad_norm": 2.7788717862448635, + "learning_rate": 2.4714182603630764e-06, + "loss": 0.4451, + "step": 8642 + }, + { + "epoch": 0.68, + "grad_norm": 0.5802608512960286, + "learning_rate": 2.4703211632361457e-06, + "loss": 0.4786, + "step": 8643 + }, + { + "epoch": 0.68, + "grad_norm": 1.6258557113890921, + "learning_rate": 2.4692242297761267e-06, + "loss": 0.4536, + "step": 8644 + }, + { + "epoch": 0.68, + "grad_norm": 1.6260850607556498, + "learning_rate": 2.4681274600539914e-06, + "loss": 0.4598, + "step": 8645 + }, + { + "epoch": 0.68, + "grad_norm": 1.7777149196640947, + "learning_rate": 2.4670308541406928e-06, + "loss": 0.4975, + "step": 8646 + }, + { + "epoch": 0.68, + "grad_norm": 2.3724422337579663, + "learning_rate": 2.465934412107189e-06, + "loss": 0.4833, + "step": 8647 + }, + { + "epoch": 0.68, + "grad_norm": 2.200358260515967, + "learning_rate": 2.4648381340244116e-06, + "loss": 0.4854, + "step": 8648 + }, + { + "epoch": 0.68, + "grad_norm": 1.669084606731499, + "learning_rate": 2.4637420199632906e-06, + "loss": 0.4839, + "step": 8649 + }, + { + "epoch": 0.68, + "grad_norm": 1.6659316772380972, + "learning_rate": 2.462646069994743e-06, + "loss": 0.4569, + "step": 8650 + }, + { + "epoch": 0.68, + "grad_norm": 1.5568474077030963, + "learning_rate": 2.4615502841896773e-06, + "loss": 0.4574, + "step": 8651 + }, + { + "epoch": 0.68, + "grad_norm": 2.3557823585281814, + "learning_rate": 2.460454662618985e-06, + "loss": 0.4195, + "step": 8652 + }, + { + "epoch": 0.68, + "grad_norm": 1.4003361552797249, + "learning_rate": 2.4593592053535548e-06, + "loss": 0.4713, + "step": 8653 + }, + { + "epoch": 0.68, + "grad_norm": 1.7484157385085242, + "learning_rate": 2.45826391246426e-06, + "loss": 0.4407, + "step": 8654 + }, + { + "epoch": 0.68, + "grad_norm": 4.131802398963274, + "learning_rate": 2.457168784021965e-06, + "loss": 0.4606, + "step": 8655 + }, + { + "epoch": 0.68, + "grad_norm": 0.5690043519831829, + "learning_rate": 2.4560738200975252e-06, + "loss": 0.4774, + "step": 8656 + }, + { + "epoch": 0.68, + "grad_norm": 0.5434630962379705, + "learning_rate": 2.4549790207617794e-06, + "loss": 0.4787, + "step": 8657 + }, + { + "epoch": 0.68, + "grad_norm": 1.564726948929696, + "learning_rate": 2.453884386085562e-06, + "loss": 0.4571, + "step": 8658 + }, + { + "epoch": 0.68, + "grad_norm": 1.9266085209091557, + "learning_rate": 2.452789916139694e-06, + "loss": 0.4381, + "step": 8659 + }, + { + "epoch": 0.68, + "grad_norm": 2.8626364164430838, + "learning_rate": 2.451695610994988e-06, + "loss": 0.4672, + "step": 8660 + }, + { + "epoch": 0.68, + "grad_norm": 1.561103422228639, + "learning_rate": 2.4506014707222387e-06, + "loss": 0.4394, + "step": 8661 + }, + { + "epoch": 0.68, + "grad_norm": 2.166281520429704, + "learning_rate": 2.4495074953922425e-06, + "loss": 0.4727, + "step": 8662 + }, + { + "epoch": 0.68, + "grad_norm": 2.4459813431808777, + "learning_rate": 2.448413685075774e-06, + "loss": 0.4384, + "step": 8663 + }, + { + "epoch": 0.68, + "grad_norm": 1.747174031589353, + "learning_rate": 2.447320039843603e-06, + "loss": 0.3458, + "step": 8664 + }, + { + "epoch": 0.68, + "grad_norm": 1.4678498545576237, + "learning_rate": 2.446226559766483e-06, + "loss": 0.4436, + "step": 8665 + }, + { + "epoch": 0.68, + "grad_norm": 0.5563641565233948, + "learning_rate": 2.4451332449151653e-06, + "loss": 0.4762, + "step": 8666 + }, + { + "epoch": 0.68, + "grad_norm": 2.638185921559047, + "learning_rate": 2.4440400953603864e-06, + "loss": 0.5084, + "step": 8667 + }, + { + "epoch": 0.68, + "grad_norm": 0.5263339387029313, + "learning_rate": 2.4429471111728683e-06, + "loss": 0.4799, + "step": 8668 + }, + { + "epoch": 0.68, + "grad_norm": 1.854657421494241, + "learning_rate": 2.441854292423327e-06, + "loss": 0.4648, + "step": 8669 + }, + { + "epoch": 0.68, + "grad_norm": 0.5429165018187546, + "learning_rate": 2.4407616391824656e-06, + "loss": 0.4962, + "step": 8670 + }, + { + "epoch": 0.68, + "grad_norm": 1.5265043076079936, + "learning_rate": 2.4396691515209813e-06, + "loss": 0.4327, + "step": 8671 + }, + { + "epoch": 0.68, + "grad_norm": 1.5618279874013237, + "learning_rate": 2.438576829509551e-06, + "loss": 0.4737, + "step": 8672 + }, + { + "epoch": 0.68, + "grad_norm": 2.1583303231816657, + "learning_rate": 2.437484673218849e-06, + "loss": 0.4506, + "step": 8673 + }, + { + "epoch": 0.68, + "grad_norm": 2.586101024056693, + "learning_rate": 2.4363926827195367e-06, + "loss": 0.5102, + "step": 8674 + }, + { + "epoch": 0.68, + "grad_norm": 1.8869976814534242, + "learning_rate": 2.435300858082266e-06, + "loss": 0.4521, + "step": 8675 + }, + { + "epoch": 0.68, + "grad_norm": 3.0961996825225055, + "learning_rate": 2.434209199377673e-06, + "loss": 0.4204, + "step": 8676 + }, + { + "epoch": 0.68, + "grad_norm": 1.6416289702852962, + "learning_rate": 2.433117706676386e-06, + "loss": 0.4125, + "step": 8677 + }, + { + "epoch": 0.68, + "grad_norm": 1.7160250390882927, + "learning_rate": 2.4320263800490302e-06, + "loss": 0.4856, + "step": 8678 + }, + { + "epoch": 0.68, + "grad_norm": 0.5549098930267665, + "learning_rate": 2.430935219566206e-06, + "loss": 0.5034, + "step": 8679 + }, + { + "epoch": 0.68, + "grad_norm": 0.5347285705408564, + "learning_rate": 2.4298442252985137e-06, + "loss": 0.4904, + "step": 8680 + }, + { + "epoch": 0.68, + "grad_norm": 2.6905874609255225, + "learning_rate": 2.4287533973165343e-06, + "loss": 0.4594, + "step": 8681 + }, + { + "epoch": 0.68, + "grad_norm": 1.7937293985435099, + "learning_rate": 2.4276627356908503e-06, + "loss": 0.427, + "step": 8682 + }, + { + "epoch": 0.68, + "grad_norm": 1.9039307597113626, + "learning_rate": 2.4265722404920206e-06, + "loss": 0.4979, + "step": 8683 + }, + { + "epoch": 0.68, + "grad_norm": 0.6277690795983729, + "learning_rate": 2.4254819117906003e-06, + "loss": 0.4736, + "step": 8684 + }, + { + "epoch": 0.68, + "grad_norm": 1.7101031588502282, + "learning_rate": 2.4243917496571325e-06, + "loss": 0.4466, + "step": 8685 + }, + { + "epoch": 0.68, + "grad_norm": 1.9178211256721815, + "learning_rate": 2.4233017541621504e-06, + "loss": 0.4364, + "step": 8686 + }, + { + "epoch": 0.68, + "grad_norm": 2.033452560934233, + "learning_rate": 2.4222119253761727e-06, + "loss": 0.5065, + "step": 8687 + }, + { + "epoch": 0.68, + "grad_norm": 1.4054579669995466, + "learning_rate": 2.4211222633697112e-06, + "loss": 0.4436, + "step": 8688 + }, + { + "epoch": 0.68, + "grad_norm": 1.5114033109457847, + "learning_rate": 2.4200327682132664e-06, + "loss": 0.4118, + "step": 8689 + }, + { + "epoch": 0.68, + "grad_norm": 1.5468618864721682, + "learning_rate": 2.418943439977325e-06, + "loss": 0.4454, + "step": 8690 + }, + { + "epoch": 0.68, + "grad_norm": 1.450922365663008, + "learning_rate": 2.41785427873237e-06, + "loss": 0.4554, + "step": 8691 + }, + { + "epoch": 0.68, + "grad_norm": 2.308682294931974, + "learning_rate": 2.4167652845488627e-06, + "loss": 0.4213, + "step": 8692 + }, + { + "epoch": 0.68, + "grad_norm": 0.551988616539631, + "learning_rate": 2.4156764574972618e-06, + "loss": 0.4751, + "step": 8693 + }, + { + "epoch": 0.68, + "grad_norm": 3.336032193924232, + "learning_rate": 2.4145877976480135e-06, + "loss": 0.4409, + "step": 8694 + }, + { + "epoch": 0.68, + "grad_norm": 2.0652399981037206, + "learning_rate": 2.413499305071555e-06, + "loss": 0.4625, + "step": 8695 + }, + { + "epoch": 0.68, + "grad_norm": 1.2609130787039096, + "learning_rate": 2.412410979838304e-06, + "loss": 0.3775, + "step": 8696 + }, + { + "epoch": 0.68, + "grad_norm": 1.3321685529579332, + "learning_rate": 2.4113228220186814e-06, + "loss": 0.4349, + "step": 8697 + }, + { + "epoch": 0.68, + "grad_norm": 1.8896809262748044, + "learning_rate": 2.410234831683084e-06, + "loss": 0.3922, + "step": 8698 + }, + { + "epoch": 0.68, + "grad_norm": 1.9269334997883887, + "learning_rate": 2.4091470089019044e-06, + "loss": 0.3903, + "step": 8699 + }, + { + "epoch": 0.68, + "grad_norm": 3.43302857964809, + "learning_rate": 2.4080593537455242e-06, + "loss": 0.468, + "step": 8700 + }, + { + "epoch": 0.68, + "grad_norm": 1.7937640036844844, + "learning_rate": 2.406971866284314e-06, + "loss": 0.4207, + "step": 8701 + }, + { + "epoch": 0.68, + "grad_norm": 2.698001434002493, + "learning_rate": 2.4058845465886334e-06, + "loss": 0.4083, + "step": 8702 + }, + { + "epoch": 0.68, + "grad_norm": 1.7994236500960634, + "learning_rate": 2.404797394728827e-06, + "loss": 0.5186, + "step": 8703 + }, + { + "epoch": 0.68, + "grad_norm": 1.8225428990303667, + "learning_rate": 2.403710410775234e-06, + "loss": 0.4087, + "step": 8704 + }, + { + "epoch": 0.68, + "grad_norm": 1.8496258455235655, + "learning_rate": 2.4026235947981815e-06, + "loss": 0.439, + "step": 8705 + }, + { + "epoch": 0.68, + "grad_norm": 1.910776108274756, + "learning_rate": 2.4015369468679866e-06, + "loss": 0.4466, + "step": 8706 + }, + { + "epoch": 0.68, + "grad_norm": 1.6735818153322384, + "learning_rate": 2.4004504670549493e-06, + "loss": 0.4096, + "step": 8707 + }, + { + "epoch": 0.68, + "grad_norm": 1.8797391118636217, + "learning_rate": 2.3993641554293673e-06, + "loss": 0.5002, + "step": 8708 + }, + { + "epoch": 0.68, + "grad_norm": 4.978327153942626, + "learning_rate": 2.398278012061522e-06, + "loss": 0.4767, + "step": 8709 + }, + { + "epoch": 0.68, + "grad_norm": 1.2698675281431169, + "learning_rate": 2.3971920370216873e-06, + "loss": 0.4504, + "step": 8710 + }, + { + "epoch": 0.68, + "grad_norm": 2.2319502387286105, + "learning_rate": 2.3961062303801215e-06, + "loss": 0.4783, + "step": 8711 + }, + { + "epoch": 0.68, + "grad_norm": 2.8360719569294925, + "learning_rate": 2.3950205922070742e-06, + "loss": 0.4039, + "step": 8712 + }, + { + "epoch": 0.68, + "grad_norm": 2.0316487686099727, + "learning_rate": 2.39393512257279e-06, + "loss": 0.4442, + "step": 8713 + }, + { + "epoch": 0.68, + "grad_norm": 1.801444738290949, + "learning_rate": 2.3928498215474933e-06, + "loss": 0.4964, + "step": 8714 + }, + { + "epoch": 0.68, + "grad_norm": 2.73184748549202, + "learning_rate": 2.391764689201404e-06, + "loss": 0.4394, + "step": 8715 + }, + { + "epoch": 0.68, + "grad_norm": 3.058526591733146, + "learning_rate": 2.390679725604724e-06, + "loss": 0.4617, + "step": 8716 + }, + { + "epoch": 0.68, + "grad_norm": 9.03967655046963, + "learning_rate": 2.3895949308276558e-06, + "loss": 0.4795, + "step": 8717 + }, + { + "epoch": 0.68, + "grad_norm": 1.7912932690383438, + "learning_rate": 2.3885103049403783e-06, + "loss": 0.4372, + "step": 8718 + }, + { + "epoch": 0.68, + "grad_norm": 0.5605117839335046, + "learning_rate": 2.3874258480130684e-06, + "loss": 0.4656, + "step": 8719 + }, + { + "epoch": 0.68, + "grad_norm": 0.556538726887634, + "learning_rate": 2.3863415601158884e-06, + "loss": 0.5036, + "step": 8720 + }, + { + "epoch": 0.68, + "grad_norm": 2.2463965567200543, + "learning_rate": 2.385257441318992e-06, + "loss": 0.5037, + "step": 8721 + }, + { + "epoch": 0.68, + "grad_norm": 1.813789513504855, + "learning_rate": 2.384173491692517e-06, + "loss": 0.4402, + "step": 8722 + }, + { + "epoch": 0.69, + "grad_norm": 1.757445889349721, + "learning_rate": 2.3830897113065947e-06, + "loss": 0.4805, + "step": 8723 + }, + { + "epoch": 0.69, + "grad_norm": 1.5351100616852873, + "learning_rate": 2.3820061002313454e-06, + "loss": 0.4626, + "step": 8724 + }, + { + "epoch": 0.69, + "grad_norm": 0.5616114610000059, + "learning_rate": 2.380922658536876e-06, + "loss": 0.4854, + "step": 8725 + }, + { + "epoch": 0.69, + "grad_norm": 2.431120166783938, + "learning_rate": 2.3798393862932873e-06, + "loss": 0.4711, + "step": 8726 + }, + { + "epoch": 0.69, + "grad_norm": 0.5083332342059331, + "learning_rate": 2.37875628357066e-06, + "loss": 0.4736, + "step": 8727 + }, + { + "epoch": 0.69, + "grad_norm": 1.6168500939661397, + "learning_rate": 2.377673350439072e-06, + "loss": 0.4297, + "step": 8728 + }, + { + "epoch": 0.69, + "grad_norm": 2.3929795471069135, + "learning_rate": 2.3765905869685883e-06, + "loss": 0.4428, + "step": 8729 + }, + { + "epoch": 0.69, + "grad_norm": 0.553332261763214, + "learning_rate": 2.3755079932292635e-06, + "loss": 0.4802, + "step": 8730 + }, + { + "epoch": 0.69, + "grad_norm": 1.4712408387825002, + "learning_rate": 2.3744255692911345e-06, + "loss": 0.4004, + "step": 8731 + }, + { + "epoch": 0.69, + "grad_norm": 0.5426212009283794, + "learning_rate": 2.37334331522424e-06, + "loss": 0.4802, + "step": 8732 + }, + { + "epoch": 0.69, + "grad_norm": 0.5377603289534132, + "learning_rate": 2.3722612310985953e-06, + "loss": 0.4664, + "step": 8733 + }, + { + "epoch": 0.69, + "grad_norm": 1.7730832787599258, + "learning_rate": 2.3711793169842114e-06, + "loss": 0.4055, + "step": 8734 + }, + { + "epoch": 0.69, + "grad_norm": 1.6340121522007784, + "learning_rate": 2.3700975729510868e-06, + "loss": 0.4396, + "step": 8735 + }, + { + "epoch": 0.69, + "grad_norm": 0.5760830756576245, + "learning_rate": 2.3690159990692087e-06, + "loss": 0.4766, + "step": 8736 + }, + { + "epoch": 0.69, + "grad_norm": 1.8506566204935986, + "learning_rate": 2.367934595408556e-06, + "loss": 0.4954, + "step": 8737 + }, + { + "epoch": 0.69, + "grad_norm": 2.1439958788541276, + "learning_rate": 2.3668533620390895e-06, + "loss": 0.456, + "step": 8738 + }, + { + "epoch": 0.69, + "grad_norm": 1.8333177686380815, + "learning_rate": 2.3657722990307658e-06, + "loss": 0.4659, + "step": 8739 + }, + { + "epoch": 0.69, + "grad_norm": 3.082158365443648, + "learning_rate": 2.3646914064535285e-06, + "loss": 0.4392, + "step": 8740 + }, + { + "epoch": 0.69, + "grad_norm": 2.02661052503492, + "learning_rate": 2.363610684377312e-06, + "loss": 0.4633, + "step": 8741 + }, + { + "epoch": 0.69, + "grad_norm": 2.160091996781083, + "learning_rate": 2.362530132872033e-06, + "loss": 0.4682, + "step": 8742 + }, + { + "epoch": 0.69, + "grad_norm": 0.5705629523476209, + "learning_rate": 2.3614497520076047e-06, + "loss": 0.4932, + "step": 8743 + }, + { + "epoch": 0.69, + "grad_norm": 0.556365044463721, + "learning_rate": 2.360369541853925e-06, + "loss": 0.4571, + "step": 8744 + }, + { + "epoch": 0.69, + "grad_norm": 1.6269510513434418, + "learning_rate": 2.3592895024808837e-06, + "loss": 0.4977, + "step": 8745 + }, + { + "epoch": 0.69, + "grad_norm": 0.5624560286531658, + "learning_rate": 2.3582096339583587e-06, + "loss": 0.4796, + "step": 8746 + }, + { + "epoch": 0.69, + "grad_norm": 2.10179051197027, + "learning_rate": 2.3571299363562104e-06, + "loss": 0.468, + "step": 8747 + }, + { + "epoch": 0.69, + "grad_norm": 1.2019275531017815, + "learning_rate": 2.356050409744302e-06, + "loss": 0.4014, + "step": 8748 + }, + { + "epoch": 0.69, + "grad_norm": 1.7148279912355804, + "learning_rate": 2.3549710541924714e-06, + "loss": 0.4015, + "step": 8749 + }, + { + "epoch": 0.69, + "grad_norm": 1.4593976164721274, + "learning_rate": 2.3538918697705553e-06, + "loss": 0.4381, + "step": 8750 + }, + { + "epoch": 0.69, + "grad_norm": 0.5434342728739264, + "learning_rate": 2.3528128565483698e-06, + "loss": 0.464, + "step": 8751 + }, + { + "epoch": 0.69, + "grad_norm": 2.370781730100156, + "learning_rate": 2.351734014595734e-06, + "loss": 0.4182, + "step": 8752 + }, + { + "epoch": 0.69, + "grad_norm": 1.917740908400957, + "learning_rate": 2.35065534398244e-06, + "loss": 0.4247, + "step": 8753 + }, + { + "epoch": 0.69, + "grad_norm": 3.5026413348510474, + "learning_rate": 2.3495768447782808e-06, + "loss": 0.4439, + "step": 8754 + }, + { + "epoch": 0.69, + "grad_norm": 1.764071283480364, + "learning_rate": 2.3484985170530315e-06, + "loss": 0.409, + "step": 8755 + }, + { + "epoch": 0.69, + "grad_norm": 0.542035515467006, + "learning_rate": 2.34742036087646e-06, + "loss": 0.4857, + "step": 8756 + }, + { + "epoch": 0.69, + "grad_norm": 1.8717198331653573, + "learning_rate": 2.346342376318323e-06, + "loss": 0.4635, + "step": 8757 + }, + { + "epoch": 0.69, + "grad_norm": 2.0116756100216424, + "learning_rate": 2.3452645634483613e-06, + "loss": 0.4856, + "step": 8758 + }, + { + "epoch": 0.69, + "grad_norm": 1.4223974963036041, + "learning_rate": 2.344186922336309e-06, + "loss": 0.4801, + "step": 8759 + }, + { + "epoch": 0.69, + "grad_norm": 1.368871237721846, + "learning_rate": 2.3431094530518888e-06, + "loss": 0.4299, + "step": 8760 + }, + { + "epoch": 0.69, + "grad_norm": 0.520330667529179, + "learning_rate": 2.342032155664813e-06, + "loss": 0.4864, + "step": 8761 + }, + { + "epoch": 0.69, + "grad_norm": 0.5705362844642544, + "learning_rate": 2.340955030244778e-06, + "loss": 0.4903, + "step": 8762 + }, + { + "epoch": 0.69, + "grad_norm": 7.216846974685011, + "learning_rate": 2.339878076861474e-06, + "loss": 0.4496, + "step": 8763 + }, + { + "epoch": 0.69, + "grad_norm": 1.322440611483879, + "learning_rate": 2.3388012955845786e-06, + "loss": 0.3751, + "step": 8764 + }, + { + "epoch": 0.69, + "grad_norm": 1.4974417035979108, + "learning_rate": 2.33772468648376e-06, + "loss": 0.4752, + "step": 8765 + }, + { + "epoch": 0.69, + "grad_norm": 1.5098265348911137, + "learning_rate": 2.3366482496286672e-06, + "loss": 0.4416, + "step": 8766 + }, + { + "epoch": 0.69, + "grad_norm": 1.4855040288373205, + "learning_rate": 2.335571985088951e-06, + "loss": 0.4577, + "step": 8767 + }, + { + "epoch": 0.69, + "grad_norm": 0.5736189378000146, + "learning_rate": 2.3344958929342436e-06, + "loss": 0.4831, + "step": 8768 + }, + { + "epoch": 0.69, + "grad_norm": 1.7676328408650561, + "learning_rate": 2.3334199732341634e-06, + "loss": 0.4434, + "step": 8769 + }, + { + "epoch": 0.69, + "grad_norm": 1.44241088476426, + "learning_rate": 2.3323442260583223e-06, + "loss": 0.4436, + "step": 8770 + }, + { + "epoch": 0.69, + "grad_norm": 2.3992978635751228, + "learning_rate": 2.33126865147632e-06, + "loss": 0.4679, + "step": 8771 + }, + { + "epoch": 0.69, + "grad_norm": 2.09087307772518, + "learning_rate": 2.3301932495577468e-06, + "loss": 0.4553, + "step": 8772 + }, + { + "epoch": 0.69, + "grad_norm": 1.944289284012359, + "learning_rate": 2.3291180203721757e-06, + "loss": 0.4548, + "step": 8773 + }, + { + "epoch": 0.69, + "grad_norm": 1.592144710149855, + "learning_rate": 2.328042963989175e-06, + "loss": 0.4152, + "step": 8774 + }, + { + "epoch": 0.69, + "grad_norm": 2.289419589274149, + "learning_rate": 2.326968080478299e-06, + "loss": 0.447, + "step": 8775 + }, + { + "epoch": 0.69, + "grad_norm": 2.0442389268677466, + "learning_rate": 2.3258933699090934e-06, + "loss": 0.4645, + "step": 8776 + }, + { + "epoch": 0.69, + "grad_norm": 1.4767862079824787, + "learning_rate": 2.3248188323510866e-06, + "loss": 0.4949, + "step": 8777 + }, + { + "epoch": 0.69, + "grad_norm": 0.5575791928662546, + "learning_rate": 2.323744467873802e-06, + "loss": 0.4806, + "step": 8778 + }, + { + "epoch": 0.69, + "grad_norm": 2.181235810894636, + "learning_rate": 2.3226702765467496e-06, + "loss": 0.5034, + "step": 8779 + }, + { + "epoch": 0.69, + "grad_norm": 2.2777678210077985, + "learning_rate": 2.3215962584394277e-06, + "loss": 0.5088, + "step": 8780 + }, + { + "epoch": 0.69, + "grad_norm": 10.552099831855452, + "learning_rate": 2.3205224136213257e-06, + "loss": 0.444, + "step": 8781 + }, + { + "epoch": 0.69, + "grad_norm": 1.7754346767581746, + "learning_rate": 2.3194487421619143e-06, + "loss": 0.4058, + "step": 8782 + }, + { + "epoch": 0.69, + "grad_norm": 2.344182909460469, + "learning_rate": 2.318375244130667e-06, + "loss": 0.5197, + "step": 8783 + }, + { + "epoch": 0.69, + "grad_norm": 1.3461524721266638, + "learning_rate": 2.3173019195970313e-06, + "loss": 0.4246, + "step": 8784 + }, + { + "epoch": 0.69, + "grad_norm": 1.664310168159418, + "learning_rate": 2.316228768630454e-06, + "loss": 0.4196, + "step": 8785 + }, + { + "epoch": 0.69, + "grad_norm": 2.003813943824683, + "learning_rate": 2.315155791300361e-06, + "loss": 0.4574, + "step": 8786 + }, + { + "epoch": 0.69, + "grad_norm": 1.5561281919265904, + "learning_rate": 2.3140829876761794e-06, + "loss": 0.4054, + "step": 8787 + }, + { + "epoch": 0.69, + "grad_norm": 1.8031096785030982, + "learning_rate": 2.3130103578273135e-06, + "loss": 0.5038, + "step": 8788 + }, + { + "epoch": 0.69, + "grad_norm": 1.8732727970377732, + "learning_rate": 2.311937901823162e-06, + "loss": 0.4708, + "step": 8789 + }, + { + "epoch": 0.69, + "grad_norm": 1.369895655389187, + "learning_rate": 2.3108656197331118e-06, + "loss": 0.4523, + "step": 8790 + }, + { + "epoch": 0.69, + "grad_norm": 2.2752789352993346, + "learning_rate": 2.309793511626538e-06, + "loss": 0.4474, + "step": 8791 + }, + { + "epoch": 0.69, + "grad_norm": 1.608232228016393, + "learning_rate": 2.308721577572807e-06, + "loss": 0.4534, + "step": 8792 + }, + { + "epoch": 0.69, + "grad_norm": 0.5324486160938781, + "learning_rate": 2.307649817641268e-06, + "loss": 0.4597, + "step": 8793 + }, + { + "epoch": 0.69, + "grad_norm": 0.5424116957412364, + "learning_rate": 2.3065782319012625e-06, + "loss": 0.4805, + "step": 8794 + }, + { + "epoch": 0.69, + "grad_norm": 1.4370830468060798, + "learning_rate": 2.3055068204221226e-06, + "loss": 0.4478, + "step": 8795 + }, + { + "epoch": 0.69, + "grad_norm": 1.6971845653349664, + "learning_rate": 2.3044355832731685e-06, + "loss": 0.4706, + "step": 8796 + }, + { + "epoch": 0.69, + "grad_norm": 1.4142916033456723, + "learning_rate": 2.3033645205237043e-06, + "loss": 0.4383, + "step": 8797 + }, + { + "epoch": 0.69, + "grad_norm": 0.6465752809575275, + "learning_rate": 2.3022936322430275e-06, + "loss": 0.4822, + "step": 8798 + }, + { + "epoch": 0.69, + "grad_norm": 2.1583248477051136, + "learning_rate": 2.3012229185004237e-06, + "loss": 0.4717, + "step": 8799 + }, + { + "epoch": 0.69, + "grad_norm": 1.5826932578766875, + "learning_rate": 2.3001523793651688e-06, + "loss": 0.5065, + "step": 8800 + }, + { + "epoch": 0.69, + "grad_norm": 0.5404024733051546, + "learning_rate": 2.299082014906519e-06, + "loss": 0.4674, + "step": 8801 + }, + { + "epoch": 0.69, + "grad_norm": 1.4957450252036628, + "learning_rate": 2.2980118251937315e-06, + "loss": 0.4216, + "step": 8802 + }, + { + "epoch": 0.69, + "grad_norm": 0.5752588296845698, + "learning_rate": 2.2969418102960467e-06, + "loss": 0.4983, + "step": 8803 + }, + { + "epoch": 0.69, + "grad_norm": 0.5223896494057233, + "learning_rate": 2.295871970282688e-06, + "loss": 0.4718, + "step": 8804 + }, + { + "epoch": 0.69, + "grad_norm": 1.9041600229989666, + "learning_rate": 2.294802305222876e-06, + "loss": 0.4312, + "step": 8805 + }, + { + "epoch": 0.69, + "grad_norm": 1.5169058452292408, + "learning_rate": 2.2937328151858156e-06, + "loss": 0.4399, + "step": 8806 + }, + { + "epoch": 0.69, + "grad_norm": 2.493607344269368, + "learning_rate": 2.2926635002407046e-06, + "loss": 0.4618, + "step": 8807 + }, + { + "epoch": 0.69, + "grad_norm": 0.5539477899012804, + "learning_rate": 2.2915943604567203e-06, + "loss": 0.4676, + "step": 8808 + }, + { + "epoch": 0.69, + "grad_norm": 1.7997869972089129, + "learning_rate": 2.2905253959030386e-06, + "loss": 0.4128, + "step": 8809 + }, + { + "epoch": 0.69, + "grad_norm": 1.7885843396232923, + "learning_rate": 2.2894566066488194e-06, + "loss": 0.4351, + "step": 8810 + }, + { + "epoch": 0.69, + "grad_norm": 1.51770918938979, + "learning_rate": 2.2883879927632136e-06, + "loss": 0.4564, + "step": 8811 + }, + { + "epoch": 0.69, + "grad_norm": 1.3128212868480458, + "learning_rate": 2.2873195543153555e-06, + "loss": 0.4388, + "step": 8812 + }, + { + "epoch": 0.69, + "grad_norm": 2.149298372185613, + "learning_rate": 2.2862512913743734e-06, + "loss": 0.4808, + "step": 8813 + }, + { + "epoch": 0.69, + "grad_norm": 1.6532677180646076, + "learning_rate": 2.285183204009383e-06, + "loss": 0.3938, + "step": 8814 + }, + { + "epoch": 0.69, + "grad_norm": 1.6286394433991378, + "learning_rate": 2.284115292289488e-06, + "loss": 0.4213, + "step": 8815 + }, + { + "epoch": 0.69, + "grad_norm": 1.89879466320086, + "learning_rate": 2.283047556283783e-06, + "loss": 0.3993, + "step": 8816 + }, + { + "epoch": 0.69, + "grad_norm": 1.566456301592447, + "learning_rate": 2.281979996061342e-06, + "loss": 0.4567, + "step": 8817 + }, + { + "epoch": 0.69, + "grad_norm": 1.5120286926995476, + "learning_rate": 2.2809126116912443e-06, + "loss": 0.45, + "step": 8818 + }, + { + "epoch": 0.69, + "grad_norm": 1.5369697485171483, + "learning_rate": 2.2798454032425413e-06, + "loss": 0.4496, + "step": 8819 + }, + { + "epoch": 0.69, + "grad_norm": 2.0389247744033914, + "learning_rate": 2.278778370784284e-06, + "loss": 0.4151, + "step": 8820 + }, + { + "epoch": 0.69, + "grad_norm": 8.838206905988423, + "learning_rate": 2.277711514385503e-06, + "loss": 0.4332, + "step": 8821 + }, + { + "epoch": 0.69, + "grad_norm": 1.4386275775480424, + "learning_rate": 2.2766448341152297e-06, + "loss": 0.4331, + "step": 8822 + }, + { + "epoch": 0.69, + "grad_norm": 1.61286913567288, + "learning_rate": 2.275578330042471e-06, + "loss": 0.4725, + "step": 8823 + }, + { + "epoch": 0.69, + "grad_norm": 1.8946472053284966, + "learning_rate": 2.2745120022362304e-06, + "loss": 0.4797, + "step": 8824 + }, + { + "epoch": 0.69, + "grad_norm": 1.6362844465428508, + "learning_rate": 2.2734458507654978e-06, + "loss": 0.4033, + "step": 8825 + }, + { + "epoch": 0.69, + "grad_norm": 1.6867924214973369, + "learning_rate": 2.2723798756992515e-06, + "loss": 0.4272, + "step": 8826 + }, + { + "epoch": 0.69, + "grad_norm": 1.9747343763163863, + "learning_rate": 2.2713140771064613e-06, + "loss": 0.4364, + "step": 8827 + }, + { + "epoch": 0.69, + "grad_norm": 2.0082363570557376, + "learning_rate": 2.2702484550560787e-06, + "loss": 0.4699, + "step": 8828 + }, + { + "epoch": 0.69, + "grad_norm": 2.10164129143919, + "learning_rate": 2.2691830096170503e-06, + "loss": 0.506, + "step": 8829 + }, + { + "epoch": 0.69, + "grad_norm": 1.854927152118923, + "learning_rate": 2.268117740858309e-06, + "loss": 0.5502, + "step": 8830 + }, + { + "epoch": 0.69, + "grad_norm": 1.4937754557846141, + "learning_rate": 2.2670526488487774e-06, + "loss": 0.4466, + "step": 8831 + }, + { + "epoch": 0.69, + "grad_norm": 1.4937465814850117, + "learning_rate": 2.265987733657363e-06, + "loss": 0.4091, + "step": 8832 + }, + { + "epoch": 0.69, + "grad_norm": 1.4422124553944382, + "learning_rate": 2.2649229953529656e-06, + "loss": 0.4708, + "step": 8833 + }, + { + "epoch": 0.69, + "grad_norm": 1.3899277112183073, + "learning_rate": 2.263858434004472e-06, + "loss": 0.4503, + "step": 8834 + }, + { + "epoch": 0.69, + "grad_norm": 1.5844040864446882, + "learning_rate": 2.2627940496807588e-06, + "loss": 0.4265, + "step": 8835 + }, + { + "epoch": 0.69, + "grad_norm": 1.6941744003047006, + "learning_rate": 2.2617298424506918e-06, + "loss": 0.4782, + "step": 8836 + }, + { + "epoch": 0.69, + "grad_norm": 5.717318726313983, + "learning_rate": 2.2606658123831175e-06, + "loss": 0.4421, + "step": 8837 + }, + { + "epoch": 0.69, + "grad_norm": 0.5847529787328378, + "learning_rate": 2.259601959546886e-06, + "loss": 0.4925, + "step": 8838 + }, + { + "epoch": 0.69, + "grad_norm": 0.5504788823467351, + "learning_rate": 2.2585382840108213e-06, + "loss": 0.4661, + "step": 8839 + }, + { + "epoch": 0.69, + "grad_norm": 1.710090047101358, + "learning_rate": 2.2574747858437425e-06, + "loss": 0.4775, + "step": 8840 + }, + { + "epoch": 0.69, + "grad_norm": 1.7289786463566346, + "learning_rate": 2.256411465114458e-06, + "loss": 0.4498, + "step": 8841 + }, + { + "epoch": 0.69, + "grad_norm": 2.107929901860206, + "learning_rate": 2.2553483218917644e-06, + "loss": 0.5099, + "step": 8842 + }, + { + "epoch": 0.69, + "grad_norm": 2.9207998072396633, + "learning_rate": 2.254285356244442e-06, + "loss": 0.432, + "step": 8843 + }, + { + "epoch": 0.69, + "grad_norm": 1.383567082357788, + "learning_rate": 2.2532225682412646e-06, + "loss": 0.4128, + "step": 8844 + }, + { + "epoch": 0.69, + "grad_norm": 1.4184029295374017, + "learning_rate": 2.2521599579509945e-06, + "loss": 0.449, + "step": 8845 + }, + { + "epoch": 0.69, + "grad_norm": 1.9508461714939482, + "learning_rate": 2.25109752544238e-06, + "loss": 0.4423, + "step": 8846 + }, + { + "epoch": 0.69, + "grad_norm": 2.008482865570436, + "learning_rate": 2.2500352707841622e-06, + "loss": 0.4669, + "step": 8847 + }, + { + "epoch": 0.69, + "grad_norm": 2.776515179958293, + "learning_rate": 2.2489731940450625e-06, + "loss": 0.4838, + "step": 8848 + }, + { + "epoch": 0.69, + "grad_norm": 3.317306725519793, + "learning_rate": 2.247911295293798e-06, + "loss": 0.4602, + "step": 8849 + }, + { + "epoch": 0.7, + "grad_norm": 0.5547771637607359, + "learning_rate": 2.2468495745990732e-06, + "loss": 0.4794, + "step": 8850 + }, + { + "epoch": 0.7, + "grad_norm": 2.491712390905574, + "learning_rate": 2.245788032029581e-06, + "loss": 0.4536, + "step": 8851 + }, + { + "epoch": 0.7, + "grad_norm": 1.7838476396887042, + "learning_rate": 2.244726667653997e-06, + "loss": 0.4319, + "step": 8852 + }, + { + "epoch": 0.7, + "grad_norm": 2.8499339875933396, + "learning_rate": 2.2436654815409964e-06, + "loss": 0.4707, + "step": 8853 + }, + { + "epoch": 0.7, + "grad_norm": 1.7661914415380568, + "learning_rate": 2.242604473759231e-06, + "loss": 0.4247, + "step": 8854 + }, + { + "epoch": 0.7, + "grad_norm": 1.6098408430758295, + "learning_rate": 2.241543644377352e-06, + "loss": 0.4587, + "step": 8855 + }, + { + "epoch": 0.7, + "grad_norm": 1.5159813986412654, + "learning_rate": 2.2404829934639865e-06, + "loss": 0.4422, + "step": 8856 + }, + { + "epoch": 0.7, + "grad_norm": 2.4452466883889428, + "learning_rate": 2.2394225210877653e-06, + "loss": 0.4382, + "step": 8857 + }, + { + "epoch": 0.7, + "grad_norm": 0.5300099823396506, + "learning_rate": 2.2383622273172934e-06, + "loss": 0.4384, + "step": 8858 + }, + { + "epoch": 0.7, + "grad_norm": 4.410036168698587, + "learning_rate": 2.2373021122211728e-06, + "loss": 0.4705, + "step": 8859 + }, + { + "epoch": 0.7, + "grad_norm": 1.6855616743177466, + "learning_rate": 2.2362421758679916e-06, + "loss": 0.4123, + "step": 8860 + }, + { + "epoch": 0.7, + "grad_norm": 2.601208286607932, + "learning_rate": 2.235182418326326e-06, + "loss": 0.4614, + "step": 8861 + }, + { + "epoch": 0.7, + "grad_norm": 1.449006226234115, + "learning_rate": 2.2341228396647437e-06, + "loss": 0.4655, + "step": 8862 + }, + { + "epoch": 0.7, + "grad_norm": 2.1004065503937044, + "learning_rate": 2.2330634399517926e-06, + "loss": 0.4471, + "step": 8863 + }, + { + "epoch": 0.7, + "grad_norm": 1.7483681544224434, + "learning_rate": 2.2320042192560175e-06, + "loss": 0.41, + "step": 8864 + }, + { + "epoch": 0.7, + "grad_norm": 1.8212878501151792, + "learning_rate": 2.230945177645949e-06, + "loss": 0.4355, + "step": 8865 + }, + { + "epoch": 0.7, + "grad_norm": 3.1134133405022233, + "learning_rate": 2.229886315190107e-06, + "loss": 0.4368, + "step": 8866 + }, + { + "epoch": 0.7, + "grad_norm": 1.653425199290218, + "learning_rate": 2.228827631956994e-06, + "loss": 0.4838, + "step": 8867 + }, + { + "epoch": 0.7, + "grad_norm": 3.5187129633694805, + "learning_rate": 2.227769128015109e-06, + "loss": 0.4635, + "step": 8868 + }, + { + "epoch": 0.7, + "grad_norm": 2.1050924485044678, + "learning_rate": 2.2267108034329343e-06, + "loss": 0.4461, + "step": 8869 + }, + { + "epoch": 0.7, + "grad_norm": 1.7443921139464742, + "learning_rate": 2.225652658278943e-06, + "loss": 0.4587, + "step": 8870 + }, + { + "epoch": 0.7, + "grad_norm": 1.9735929362535298, + "learning_rate": 2.2245946926215965e-06, + "loss": 0.4552, + "step": 8871 + }, + { + "epoch": 0.7, + "grad_norm": 1.4806779958726386, + "learning_rate": 2.22353690652934e-06, + "loss": 0.4096, + "step": 8872 + }, + { + "epoch": 0.7, + "grad_norm": 1.6664587028003306, + "learning_rate": 2.2224793000706165e-06, + "loss": 0.4042, + "step": 8873 + }, + { + "epoch": 0.7, + "grad_norm": 1.8698759958732756, + "learning_rate": 2.221421873313848e-06, + "loss": 0.4567, + "step": 8874 + }, + { + "epoch": 0.7, + "grad_norm": 3.2532583712821066, + "learning_rate": 2.2203646263274483e-06, + "loss": 0.437, + "step": 8875 + }, + { + "epoch": 0.7, + "grad_norm": 0.5752094964150016, + "learning_rate": 2.2193075591798214e-06, + "loss": 0.4742, + "step": 8876 + }, + { + "epoch": 0.7, + "grad_norm": 2.124275472782954, + "learning_rate": 2.21825067193936e-06, + "loss": 0.4375, + "step": 8877 + }, + { + "epoch": 0.7, + "grad_norm": 1.6022341797398343, + "learning_rate": 2.217193964674439e-06, + "loss": 0.4521, + "step": 8878 + }, + { + "epoch": 0.7, + "grad_norm": 2.669439483093396, + "learning_rate": 2.216137437453428e-06, + "loss": 0.4393, + "step": 8879 + }, + { + "epoch": 0.7, + "grad_norm": 0.5866524791355068, + "learning_rate": 2.2150810903446835e-06, + "loss": 0.4913, + "step": 8880 + }, + { + "epoch": 0.7, + "grad_norm": 1.6806622928456794, + "learning_rate": 2.214024923416549e-06, + "loss": 0.4437, + "step": 8881 + }, + { + "epoch": 0.7, + "grad_norm": 1.452514326850183, + "learning_rate": 2.2129689367373593e-06, + "loss": 0.4496, + "step": 8882 + }, + { + "epoch": 0.7, + "grad_norm": 1.5622107964001402, + "learning_rate": 2.2119131303754316e-06, + "loss": 0.4505, + "step": 8883 + }, + { + "epoch": 0.7, + "grad_norm": 2.68983625579445, + "learning_rate": 2.210857504399077e-06, + "loss": 0.535, + "step": 8884 + }, + { + "epoch": 0.7, + "grad_norm": 2.281983929721882, + "learning_rate": 2.209802058876594e-06, + "loss": 0.4703, + "step": 8885 + }, + { + "epoch": 0.7, + "grad_norm": 1.3099412208279417, + "learning_rate": 2.2087467938762686e-06, + "loss": 0.4176, + "step": 8886 + }, + { + "epoch": 0.7, + "grad_norm": 1.6730334850973296, + "learning_rate": 2.207691709466371e-06, + "loss": 0.4114, + "step": 8887 + }, + { + "epoch": 0.7, + "grad_norm": 1.5680134646993322, + "learning_rate": 2.206636805715171e-06, + "loss": 0.4425, + "step": 8888 + }, + { + "epoch": 0.7, + "grad_norm": 0.5500234817171427, + "learning_rate": 2.205582082690913e-06, + "loss": 0.4506, + "step": 8889 + }, + { + "epoch": 0.7, + "grad_norm": 1.4492339526020959, + "learning_rate": 2.204527540461841e-06, + "loss": 0.4644, + "step": 8890 + }, + { + "epoch": 0.7, + "grad_norm": 2.990111583427272, + "learning_rate": 2.203473179096176e-06, + "loss": 0.411, + "step": 8891 + }, + { + "epoch": 0.7, + "grad_norm": 0.5791222300962104, + "learning_rate": 2.2024189986621404e-06, + "loss": 0.4741, + "step": 8892 + }, + { + "epoch": 0.7, + "grad_norm": 1.6079402146861241, + "learning_rate": 2.2013649992279374e-06, + "loss": 0.4248, + "step": 8893 + }, + { + "epoch": 0.7, + "grad_norm": 2.5184362187464617, + "learning_rate": 2.200311180861756e-06, + "loss": 0.4492, + "step": 8894 + }, + { + "epoch": 0.7, + "grad_norm": 2.346843490522689, + "learning_rate": 2.199257543631779e-06, + "loss": 0.4288, + "step": 8895 + }, + { + "epoch": 0.7, + "grad_norm": 2.719411981071385, + "learning_rate": 2.198204087606175e-06, + "loss": 0.4164, + "step": 8896 + }, + { + "epoch": 0.7, + "grad_norm": 1.8754511192056444, + "learning_rate": 2.1971508128531034e-06, + "loss": 0.4817, + "step": 8897 + }, + { + "epoch": 0.7, + "grad_norm": 2.2119384502315884, + "learning_rate": 2.1960977194407052e-06, + "loss": 0.4503, + "step": 8898 + }, + { + "epoch": 0.7, + "grad_norm": 4.253599050783305, + "learning_rate": 2.195044807437117e-06, + "loss": 0.4391, + "step": 8899 + }, + { + "epoch": 0.7, + "grad_norm": 2.054085900477556, + "learning_rate": 2.1939920769104606e-06, + "loss": 0.4627, + "step": 8900 + }, + { + "epoch": 0.7, + "grad_norm": 2.163941639262285, + "learning_rate": 2.192939527928847e-06, + "loss": 0.4818, + "step": 8901 + }, + { + "epoch": 0.7, + "grad_norm": 2.033932696319072, + "learning_rate": 2.191887160560373e-06, + "loss": 0.4583, + "step": 8902 + }, + { + "epoch": 0.7, + "grad_norm": 1.4270976456516458, + "learning_rate": 2.190834974873124e-06, + "loss": 0.452, + "step": 8903 + }, + { + "epoch": 0.7, + "grad_norm": 11.860675687871824, + "learning_rate": 2.1897829709351816e-06, + "loss": 0.4249, + "step": 8904 + }, + { + "epoch": 0.7, + "grad_norm": 1.5612560136269833, + "learning_rate": 2.188731148814602e-06, + "loss": 0.449, + "step": 8905 + }, + { + "epoch": 0.7, + "grad_norm": 3.862368711462543, + "learning_rate": 2.1876795085794417e-06, + "loss": 0.405, + "step": 8906 + }, + { + "epoch": 0.7, + "grad_norm": 1.865863097226969, + "learning_rate": 2.1866280502977345e-06, + "loss": 0.459, + "step": 8907 + }, + { + "epoch": 0.7, + "grad_norm": 1.9782701601440664, + "learning_rate": 2.1855767740375156e-06, + "loss": 0.4475, + "step": 8908 + }, + { + "epoch": 0.7, + "grad_norm": 1.7652529203169653, + "learning_rate": 2.1845256798667956e-06, + "loss": 0.4677, + "step": 8909 + }, + { + "epoch": 0.7, + "grad_norm": 3.0507245313661517, + "learning_rate": 2.1834747678535818e-06, + "loss": 0.3972, + "step": 8910 + }, + { + "epoch": 0.7, + "grad_norm": 2.2667124990303824, + "learning_rate": 2.1824240380658655e-06, + "loss": 0.4434, + "step": 8911 + }, + { + "epoch": 0.7, + "grad_norm": 0.5420719700092652, + "learning_rate": 2.1813734905716305e-06, + "loss": 0.4724, + "step": 8912 + }, + { + "epoch": 0.7, + "grad_norm": 0.568467584990093, + "learning_rate": 2.1803231254388406e-06, + "loss": 0.4626, + "step": 8913 + }, + { + "epoch": 0.7, + "grad_norm": 1.6134359736873944, + "learning_rate": 2.179272942735457e-06, + "loss": 0.4745, + "step": 8914 + }, + { + "epoch": 0.7, + "grad_norm": 2.442892257311608, + "learning_rate": 2.1782229425294233e-06, + "loss": 0.4677, + "step": 8915 + }, + { + "epoch": 0.7, + "grad_norm": 2.100262255585309, + "learning_rate": 2.177173124888674e-06, + "loss": 0.4557, + "step": 8916 + }, + { + "epoch": 0.7, + "grad_norm": 2.3427099857604725, + "learning_rate": 2.176123489881133e-06, + "loss": 0.4695, + "step": 8917 + }, + { + "epoch": 0.7, + "grad_norm": 1.7194150082813031, + "learning_rate": 2.175074037574706e-06, + "loss": 0.4675, + "step": 8918 + }, + { + "epoch": 0.7, + "grad_norm": 2.1314240298383442, + "learning_rate": 2.1740247680372927e-06, + "loss": 0.4601, + "step": 8919 + }, + { + "epoch": 0.7, + "grad_norm": 1.855949414040905, + "learning_rate": 2.1729756813367807e-06, + "loss": 0.4619, + "step": 8920 + }, + { + "epoch": 0.7, + "grad_norm": 1.8873719564222389, + "learning_rate": 2.1719267775410456e-06, + "loss": 0.4642, + "step": 8921 + }, + { + "epoch": 0.7, + "grad_norm": 1.5828835973876518, + "learning_rate": 2.170878056717945e-06, + "loss": 0.4845, + "step": 8922 + }, + { + "epoch": 0.7, + "grad_norm": 1.7116217614610953, + "learning_rate": 2.169829518935337e-06, + "loss": 0.4399, + "step": 8923 + }, + { + "epoch": 0.7, + "grad_norm": 3.063976523957411, + "learning_rate": 2.1687811642610545e-06, + "loss": 0.3979, + "step": 8924 + }, + { + "epoch": 0.7, + "grad_norm": 0.5441462325303021, + "learning_rate": 2.167732992762927e-06, + "loss": 0.5037, + "step": 8925 + }, + { + "epoch": 0.7, + "grad_norm": 1.9677746957089874, + "learning_rate": 2.16668500450877e-06, + "loss": 0.4443, + "step": 8926 + }, + { + "epoch": 0.7, + "grad_norm": 1.6562941220793181, + "learning_rate": 2.165637199566387e-06, + "loss": 0.5172, + "step": 8927 + }, + { + "epoch": 0.7, + "grad_norm": 0.5685035522870938, + "learning_rate": 2.1645895780035703e-06, + "loss": 0.4541, + "step": 8928 + }, + { + "epoch": 0.7, + "grad_norm": 1.8629978214457865, + "learning_rate": 2.1635421398880973e-06, + "loss": 0.4443, + "step": 8929 + }, + { + "epoch": 0.7, + "grad_norm": 0.5595506995331464, + "learning_rate": 2.1624948852877375e-06, + "loss": 0.4741, + "step": 8930 + }, + { + "epoch": 0.7, + "grad_norm": 1.691843019428444, + "learning_rate": 2.1614478142702465e-06, + "loss": 0.4285, + "step": 8931 + }, + { + "epoch": 0.7, + "grad_norm": 3.5176076962656144, + "learning_rate": 2.160400926903371e-06, + "loss": 0.4491, + "step": 8932 + }, + { + "epoch": 0.7, + "grad_norm": 1.63210682468484, + "learning_rate": 2.159354223254839e-06, + "loss": 0.4373, + "step": 8933 + }, + { + "epoch": 0.7, + "grad_norm": 0.5803370079521174, + "learning_rate": 2.158307703392372e-06, + "loss": 0.4601, + "step": 8934 + }, + { + "epoch": 0.7, + "grad_norm": 1.8361520284800898, + "learning_rate": 2.1572613673836795e-06, + "loss": 0.4598, + "step": 8935 + }, + { + "epoch": 0.7, + "grad_norm": 2.644538121924615, + "learning_rate": 2.15621521529646e-06, + "loss": 0.4267, + "step": 8936 + }, + { + "epoch": 0.7, + "grad_norm": 0.5643953606874178, + "learning_rate": 2.155169247198393e-06, + "loss": 0.4821, + "step": 8937 + }, + { + "epoch": 0.7, + "grad_norm": 2.3279099554743983, + "learning_rate": 2.1541234631571533e-06, + "loss": 0.4323, + "step": 8938 + }, + { + "epoch": 0.7, + "grad_norm": 2.224607284366617, + "learning_rate": 2.1530778632404066e-06, + "loss": 0.5263, + "step": 8939 + }, + { + "epoch": 0.7, + "grad_norm": 1.9441309691186217, + "learning_rate": 2.1520324475157956e-06, + "loss": 0.4066, + "step": 8940 + }, + { + "epoch": 0.7, + "grad_norm": 1.6252829909269528, + "learning_rate": 2.1509872160509613e-06, + "loss": 0.3975, + "step": 8941 + }, + { + "epoch": 0.7, + "grad_norm": 1.6853830683461477, + "learning_rate": 2.1499421689135238e-06, + "loss": 0.5181, + "step": 8942 + }, + { + "epoch": 0.7, + "grad_norm": 1.5135213307313313, + "learning_rate": 2.148897306171103e-06, + "loss": 0.4361, + "step": 8943 + }, + { + "epoch": 0.7, + "grad_norm": 4.457617931056689, + "learning_rate": 2.1478526278912955e-06, + "loss": 0.4784, + "step": 8944 + }, + { + "epoch": 0.7, + "grad_norm": 1.5260377366743276, + "learning_rate": 2.1468081341416912e-06, + "loss": 0.4389, + "step": 8945 + }, + { + "epoch": 0.7, + "grad_norm": 1.9782107517266923, + "learning_rate": 2.145763824989868e-06, + "loss": 0.4756, + "step": 8946 + }, + { + "epoch": 0.7, + "grad_norm": 2.007025125467285, + "learning_rate": 2.144719700503394e-06, + "loss": 0.4461, + "step": 8947 + }, + { + "epoch": 0.7, + "grad_norm": 1.5798570109214638, + "learning_rate": 2.143675760749818e-06, + "loss": 0.4313, + "step": 8948 + }, + { + "epoch": 0.7, + "grad_norm": 2.011061969263494, + "learning_rate": 2.1426320057966844e-06, + "loss": 0.4546, + "step": 8949 + }, + { + "epoch": 0.7, + "grad_norm": 1.301058737234699, + "learning_rate": 2.1415884357115213e-06, + "loss": 0.459, + "step": 8950 + }, + { + "epoch": 0.7, + "grad_norm": 1.81892363766736, + "learning_rate": 2.1405450505618474e-06, + "loss": 0.4166, + "step": 8951 + }, + { + "epoch": 0.7, + "grad_norm": 1.6936392772729532, + "learning_rate": 2.1395018504151704e-06, + "loss": 0.4235, + "step": 8952 + }, + { + "epoch": 0.7, + "grad_norm": 1.5105893706325855, + "learning_rate": 2.1384588353389794e-06, + "loss": 0.3963, + "step": 8953 + }, + { + "epoch": 0.7, + "grad_norm": 2.321971548827934, + "learning_rate": 2.137416005400758e-06, + "loss": 0.4713, + "step": 8954 + }, + { + "epoch": 0.7, + "grad_norm": 1.8070538892836439, + "learning_rate": 2.136373360667977e-06, + "loss": 0.4819, + "step": 8955 + }, + { + "epoch": 0.7, + "grad_norm": 1.8640226762022198, + "learning_rate": 2.135330901208095e-06, + "loss": 0.4304, + "step": 8956 + }, + { + "epoch": 0.7, + "grad_norm": 1.6654939189875235, + "learning_rate": 2.1342886270885526e-06, + "loss": 0.4595, + "step": 8957 + }, + { + "epoch": 0.7, + "grad_norm": 1.4511470481884279, + "learning_rate": 2.133246538376791e-06, + "loss": 0.4263, + "step": 8958 + }, + { + "epoch": 0.7, + "grad_norm": 1.8949903729220299, + "learning_rate": 2.1322046351402264e-06, + "loss": 0.5357, + "step": 8959 + }, + { + "epoch": 0.7, + "grad_norm": 1.8483101409038127, + "learning_rate": 2.13116291744627e-06, + "loss": 0.4539, + "step": 8960 + }, + { + "epoch": 0.7, + "grad_norm": 1.8035643891043378, + "learning_rate": 2.1301213853623206e-06, + "loss": 0.4152, + "step": 8961 + }, + { + "epoch": 0.7, + "grad_norm": 1.956820849736373, + "learning_rate": 2.1290800389557626e-06, + "loss": 0.4353, + "step": 8962 + }, + { + "epoch": 0.7, + "grad_norm": 2.702113032256644, + "learning_rate": 2.1280388782939725e-06, + "loss": 0.4402, + "step": 8963 + }, + { + "epoch": 0.7, + "grad_norm": 1.754823921076016, + "learning_rate": 2.1269979034443076e-06, + "loss": 0.4655, + "step": 8964 + }, + { + "epoch": 0.7, + "grad_norm": 2.0251096046613744, + "learning_rate": 2.125957114474121e-06, + "loss": 0.4482, + "step": 8965 + }, + { + "epoch": 0.7, + "grad_norm": 1.431460070861409, + "learning_rate": 2.124916511450749e-06, + "loss": 0.4246, + "step": 8966 + }, + { + "epoch": 0.7, + "grad_norm": 2.4740637144662565, + "learning_rate": 2.1238760944415193e-06, + "loss": 0.4853, + "step": 8967 + }, + { + "epoch": 0.7, + "grad_norm": 0.5736244723557816, + "learning_rate": 2.122835863513742e-06, + "loss": 0.4817, + "step": 8968 + }, + { + "epoch": 0.7, + "grad_norm": 1.7406323530372831, + "learning_rate": 2.1217958187347202e-06, + "loss": 0.4716, + "step": 8969 + }, + { + "epoch": 0.7, + "grad_norm": 23.895272184965688, + "learning_rate": 2.120755960171745e-06, + "loss": 0.4219, + "step": 8970 + }, + { + "epoch": 0.7, + "grad_norm": 1.8415630562611935, + "learning_rate": 2.1197162878920914e-06, + "loss": 0.4629, + "step": 8971 + }, + { + "epoch": 0.7, + "grad_norm": 2.246323001883261, + "learning_rate": 2.1186768019630283e-06, + "loss": 0.4528, + "step": 8972 + }, + { + "epoch": 0.7, + "grad_norm": 0.5775077323595279, + "learning_rate": 2.1176375024518037e-06, + "loss": 0.4975, + "step": 8973 + }, + { + "epoch": 0.7, + "grad_norm": 2.0685712504700517, + "learning_rate": 2.1165983894256647e-06, + "loss": 0.4041, + "step": 8974 + }, + { + "epoch": 0.7, + "grad_norm": 1.8769084345059235, + "learning_rate": 2.1155594629518374e-06, + "loss": 0.4601, + "step": 8975 + }, + { + "epoch": 0.7, + "grad_norm": 0.5994686404379879, + "learning_rate": 2.1145207230975402e-06, + "loss": 0.4772, + "step": 8976 + }, + { + "epoch": 0.71, + "grad_norm": 2.042815409577239, + "learning_rate": 2.1134821699299746e-06, + "loss": 0.4437, + "step": 8977 + }, + { + "epoch": 0.71, + "grad_norm": 1.7654912509264267, + "learning_rate": 2.11244380351634e-06, + "loss": 0.4473, + "step": 8978 + }, + { + "epoch": 0.71, + "grad_norm": 2.423037569776539, + "learning_rate": 2.111405623923812e-06, + "loss": 0.4733, + "step": 8979 + }, + { + "epoch": 0.71, + "grad_norm": 1.496522276719993, + "learning_rate": 2.1103676312195613e-06, + "loss": 0.4791, + "step": 8980 + }, + { + "epoch": 0.71, + "grad_norm": 1.5477178295596188, + "learning_rate": 2.109329825470745e-06, + "loss": 0.4504, + "step": 8981 + }, + { + "epoch": 0.71, + "grad_norm": 2.1280318578890047, + "learning_rate": 2.1082922067445073e-06, + "loss": 0.4473, + "step": 8982 + }, + { + "epoch": 0.71, + "grad_norm": 1.5299712522540068, + "learning_rate": 2.1072547751079832e-06, + "loss": 0.4549, + "step": 8983 + }, + { + "epoch": 0.71, + "grad_norm": 2.2376025277870144, + "learning_rate": 2.1062175306282885e-06, + "loss": 0.4675, + "step": 8984 + }, + { + "epoch": 0.71, + "grad_norm": 0.5206579654859457, + "learning_rate": 2.1051804733725344e-06, + "loss": 0.445, + "step": 8985 + }, + { + "epoch": 0.71, + "grad_norm": 2.6246561210765194, + "learning_rate": 2.1041436034078165e-06, + "loss": 0.4884, + "step": 8986 + }, + { + "epoch": 0.71, + "grad_norm": 0.5689940381629038, + "learning_rate": 2.1031069208012215e-06, + "loss": 0.4835, + "step": 8987 + }, + { + "epoch": 0.71, + "grad_norm": 4.887581087377493, + "learning_rate": 2.102070425619817e-06, + "loss": 0.4305, + "step": 8988 + }, + { + "epoch": 0.71, + "grad_norm": 0.5533238802699011, + "learning_rate": 2.1010341179306655e-06, + "loss": 0.4725, + "step": 8989 + }, + { + "epoch": 0.71, + "grad_norm": 0.5630686490331619, + "learning_rate": 2.0999979978008134e-06, + "loss": 0.4891, + "step": 8990 + }, + { + "epoch": 0.71, + "grad_norm": 0.5760721725173477, + "learning_rate": 2.0989620652972993e-06, + "loss": 0.4765, + "step": 8991 + }, + { + "epoch": 0.71, + "grad_norm": 2.8820932190718747, + "learning_rate": 2.097926320487141e-06, + "loss": 0.4796, + "step": 8992 + }, + { + "epoch": 0.71, + "grad_norm": 2.417879228138292, + "learning_rate": 2.096890763437355e-06, + "loss": 0.4641, + "step": 8993 + }, + { + "epoch": 0.71, + "grad_norm": 1.7301338686617478, + "learning_rate": 2.0958553942149413e-06, + "loss": 0.4558, + "step": 8994 + }, + { + "epoch": 0.71, + "grad_norm": 0.5485477568464656, + "learning_rate": 2.0948202128868823e-06, + "loss": 0.4972, + "step": 8995 + }, + { + "epoch": 0.71, + "grad_norm": 1.5455553790606953, + "learning_rate": 2.0937852195201545e-06, + "loss": 0.434, + "step": 8996 + }, + { + "epoch": 0.71, + "grad_norm": 2.089880353129698, + "learning_rate": 2.0927504141817216e-06, + "loss": 0.4297, + "step": 8997 + }, + { + "epoch": 0.71, + "grad_norm": 2.806037263372854, + "learning_rate": 2.091715796938536e-06, + "loss": 0.4792, + "step": 8998 + }, + { + "epoch": 0.71, + "grad_norm": 2.1035219830148573, + "learning_rate": 2.0906813678575315e-06, + "loss": 0.4728, + "step": 8999 + }, + { + "epoch": 0.71, + "grad_norm": 2.228726052321276, + "learning_rate": 2.089647127005637e-06, + "loss": 0.4432, + "step": 9000 + }, + { + "epoch": 0.71, + "grad_norm": 1.5839180102619852, + "learning_rate": 2.0886130744497664e-06, + "loss": 0.404, + "step": 9001 + }, + { + "epoch": 0.71, + "grad_norm": 2.42224635361128, + "learning_rate": 2.087579210256822e-06, + "loss": 0.4393, + "step": 9002 + }, + { + "epoch": 0.71, + "grad_norm": 2.5257432628567837, + "learning_rate": 2.086545534493692e-06, + "loss": 0.4647, + "step": 9003 + }, + { + "epoch": 0.71, + "grad_norm": 1.4674013468719789, + "learning_rate": 2.085512047227252e-06, + "loss": 0.449, + "step": 9004 + }, + { + "epoch": 0.71, + "grad_norm": 2.060753583317764, + "learning_rate": 2.0844787485243745e-06, + "loss": 0.453, + "step": 9005 + }, + { + "epoch": 0.71, + "grad_norm": 1.7618055376179753, + "learning_rate": 2.083445638451906e-06, + "loss": 0.4613, + "step": 9006 + }, + { + "epoch": 0.71, + "grad_norm": 0.5422754477575824, + "learning_rate": 2.0824127170766904e-06, + "loss": 0.4661, + "step": 9007 + }, + { + "epoch": 0.71, + "grad_norm": 1.6175201461840238, + "learning_rate": 2.0813799844655525e-06, + "loss": 0.4802, + "step": 9008 + }, + { + "epoch": 0.71, + "grad_norm": 1.6608052641437412, + "learning_rate": 2.080347440685315e-06, + "loss": 0.4502, + "step": 9009 + }, + { + "epoch": 0.71, + "grad_norm": 1.6289195516378172, + "learning_rate": 2.079315085802776e-06, + "loss": 0.4424, + "step": 9010 + }, + { + "epoch": 0.71, + "grad_norm": 0.5768548825894335, + "learning_rate": 2.078282919884733e-06, + "loss": 0.487, + "step": 9011 + }, + { + "epoch": 0.71, + "grad_norm": 5.738946655970322, + "learning_rate": 2.077250942997959e-06, + "loss": 0.4867, + "step": 9012 + }, + { + "epoch": 0.71, + "grad_norm": 3.9968652139666454, + "learning_rate": 2.0762191552092297e-06, + "loss": 0.4363, + "step": 9013 + }, + { + "epoch": 0.71, + "grad_norm": 2.413532780583224, + "learning_rate": 2.075187556585294e-06, + "loss": 0.4895, + "step": 9014 + }, + { + "epoch": 0.71, + "grad_norm": 0.5493171712768339, + "learning_rate": 2.074156147192897e-06, + "loss": 0.4795, + "step": 9015 + }, + { + "epoch": 0.71, + "grad_norm": 2.2347939832975197, + "learning_rate": 2.0731249270987704e-06, + "loss": 0.4646, + "step": 9016 + }, + { + "epoch": 0.71, + "grad_norm": 1.638578408278056, + "learning_rate": 2.072093896369632e-06, + "loss": 0.4937, + "step": 9017 + }, + { + "epoch": 0.71, + "grad_norm": 1.6408582482948533, + "learning_rate": 2.07106305507219e-06, + "loss": 0.4195, + "step": 9018 + }, + { + "epoch": 0.71, + "grad_norm": 1.742528398604224, + "learning_rate": 2.070032403273135e-06, + "loss": 0.4688, + "step": 9019 + }, + { + "epoch": 0.71, + "grad_norm": 3.468767821780354, + "learning_rate": 2.0690019410391505e-06, + "loss": 0.4438, + "step": 9020 + }, + { + "epoch": 0.71, + "grad_norm": 2.3995615027725186, + "learning_rate": 2.067971668436907e-06, + "loss": 0.4166, + "step": 9021 + }, + { + "epoch": 0.71, + "grad_norm": 3.0048424312357143, + "learning_rate": 2.0669415855330616e-06, + "loss": 0.4124, + "step": 9022 + }, + { + "epoch": 0.71, + "grad_norm": 1.5710320791150603, + "learning_rate": 2.0659116923942575e-06, + "loss": 0.5199, + "step": 9023 + }, + { + "epoch": 0.71, + "grad_norm": 1.594266709523323, + "learning_rate": 2.0648819890871285e-06, + "loss": 0.4721, + "step": 9024 + }, + { + "epoch": 0.71, + "grad_norm": 1.558716562215718, + "learning_rate": 2.063852475678295e-06, + "loss": 0.4226, + "step": 9025 + }, + { + "epoch": 0.71, + "grad_norm": 1.4894564516670177, + "learning_rate": 2.0628231522343678e-06, + "loss": 0.4699, + "step": 9026 + }, + { + "epoch": 0.71, + "grad_norm": 1.7605007708350828, + "learning_rate": 2.061794018821936e-06, + "loss": 0.4051, + "step": 9027 + }, + { + "epoch": 0.71, + "grad_norm": 1.9585642006305908, + "learning_rate": 2.06076507550759e-06, + "loss": 0.4384, + "step": 9028 + }, + { + "epoch": 0.71, + "grad_norm": 1.738196552594092, + "learning_rate": 2.0597363223578996e-06, + "loss": 0.4248, + "step": 9029 + }, + { + "epoch": 0.71, + "grad_norm": 0.5472938030587672, + "learning_rate": 2.058707759439422e-06, + "loss": 0.4787, + "step": 9030 + }, + { + "epoch": 0.71, + "grad_norm": 2.8503939985995257, + "learning_rate": 2.057679386818704e-06, + "loss": 0.4982, + "step": 9031 + }, + { + "epoch": 0.71, + "grad_norm": 1.6700524863028299, + "learning_rate": 2.0566512045622805e-06, + "loss": 0.5455, + "step": 9032 + }, + { + "epoch": 0.71, + "grad_norm": 0.5123265766816918, + "learning_rate": 2.055623212736676e-06, + "loss": 0.4651, + "step": 9033 + }, + { + "epoch": 0.71, + "grad_norm": 0.5425342497021234, + "learning_rate": 2.054595411408396e-06, + "loss": 0.4521, + "step": 9034 + }, + { + "epoch": 0.71, + "grad_norm": 2.729591550286699, + "learning_rate": 2.0535678006439396e-06, + "loss": 0.471, + "step": 9035 + }, + { + "epoch": 0.71, + "grad_norm": 1.8776881341316929, + "learning_rate": 2.052540380509792e-06, + "loss": 0.4177, + "step": 9036 + }, + { + "epoch": 0.71, + "grad_norm": 1.6643027711160012, + "learning_rate": 2.051513151072428e-06, + "loss": 0.4367, + "step": 9037 + }, + { + "epoch": 0.71, + "grad_norm": 1.9135374397307274, + "learning_rate": 2.0504861123983042e-06, + "loss": 0.4616, + "step": 9038 + }, + { + "epoch": 0.71, + "grad_norm": 1.5873912859784722, + "learning_rate": 2.0494592645538686e-06, + "loss": 0.4571, + "step": 9039 + }, + { + "epoch": 0.71, + "grad_norm": 0.512977213640121, + "learning_rate": 2.048432607605563e-06, + "loss": 0.4658, + "step": 9040 + }, + { + "epoch": 0.71, + "grad_norm": 6.335823981705311, + "learning_rate": 2.047406141619804e-06, + "loss": 0.423, + "step": 9041 + }, + { + "epoch": 0.71, + "grad_norm": 1.4187137457180805, + "learning_rate": 2.046379866663007e-06, + "loss": 0.451, + "step": 9042 + }, + { + "epoch": 0.71, + "grad_norm": 2.03657010410487, + "learning_rate": 2.0453537828015653e-06, + "loss": 0.4411, + "step": 9043 + }, + { + "epoch": 0.71, + "grad_norm": 1.8738390984382853, + "learning_rate": 2.0443278901018725e-06, + "loss": 0.4555, + "step": 9044 + }, + { + "epoch": 0.71, + "grad_norm": 1.698094442229802, + "learning_rate": 2.0433021886302966e-06, + "loss": 0.4815, + "step": 9045 + }, + { + "epoch": 0.71, + "grad_norm": 1.8242115093743156, + "learning_rate": 2.0422766784532035e-06, + "loss": 0.4607, + "step": 9046 + }, + { + "epoch": 0.71, + "grad_norm": 2.1293595366669327, + "learning_rate": 2.041251359636936e-06, + "loss": 0.4592, + "step": 9047 + }, + { + "epoch": 0.71, + "grad_norm": 1.9513572425874517, + "learning_rate": 2.0402262322478387e-06, + "loss": 0.435, + "step": 9048 + }, + { + "epoch": 0.71, + "grad_norm": 5.9822646478562636, + "learning_rate": 2.0392012963522314e-06, + "loss": 0.461, + "step": 9049 + }, + { + "epoch": 0.71, + "grad_norm": 1.346509100721229, + "learning_rate": 2.038176552016426e-06, + "loss": 0.4662, + "step": 9050 + }, + { + "epoch": 0.71, + "grad_norm": 3.809089129866771, + "learning_rate": 2.037151999306723e-06, + "loss": 0.4367, + "step": 9051 + }, + { + "epoch": 0.71, + "grad_norm": 3.1681372025468284, + "learning_rate": 2.0361276382894104e-06, + "loss": 0.4285, + "step": 9052 + }, + { + "epoch": 0.71, + "grad_norm": 1.8654387318564098, + "learning_rate": 2.035103469030764e-06, + "loss": 0.4467, + "step": 9053 + }, + { + "epoch": 0.71, + "grad_norm": 1.5548453404824296, + "learning_rate": 2.0340794915970426e-06, + "loss": 0.4682, + "step": 9054 + }, + { + "epoch": 0.71, + "grad_norm": 4.623379998691161, + "learning_rate": 2.0330557060544982e-06, + "loss": 0.504, + "step": 9055 + }, + { + "epoch": 0.71, + "grad_norm": 2.3152380441153833, + "learning_rate": 2.0320321124693687e-06, + "loss": 0.4928, + "step": 9056 + }, + { + "epoch": 0.71, + "grad_norm": 1.44446344924222, + "learning_rate": 2.03100871090788e-06, + "loss": 0.4374, + "step": 9057 + }, + { + "epoch": 0.71, + "grad_norm": 2.9371770861031106, + "learning_rate": 2.029985501436243e-06, + "loss": 0.5088, + "step": 9058 + }, + { + "epoch": 0.71, + "grad_norm": 1.5161378238459426, + "learning_rate": 2.028962484120658e-06, + "loss": 0.4801, + "step": 9059 + }, + { + "epoch": 0.71, + "grad_norm": 0.596385458370049, + "learning_rate": 2.027939659027314e-06, + "loss": 0.475, + "step": 9060 + }, + { + "epoch": 0.71, + "grad_norm": 1.8433846742063151, + "learning_rate": 2.0269170262223857e-06, + "loss": 0.4706, + "step": 9061 + }, + { + "epoch": 0.71, + "grad_norm": 2.3099279524651686, + "learning_rate": 2.0258945857720368e-06, + "loss": 0.4553, + "step": 9062 + }, + { + "epoch": 0.71, + "grad_norm": 1.5185901334993461, + "learning_rate": 2.024872337742418e-06, + "loss": 0.468, + "step": 9063 + }, + { + "epoch": 0.71, + "grad_norm": 1.859923130356896, + "learning_rate": 2.0238502821996686e-06, + "loss": 0.408, + "step": 9064 + }, + { + "epoch": 0.71, + "grad_norm": 2.3521942265891753, + "learning_rate": 2.0228284192099106e-06, + "loss": 0.4353, + "step": 9065 + }, + { + "epoch": 0.71, + "grad_norm": 2.3306694335385116, + "learning_rate": 2.021806748839259e-06, + "loss": 0.5052, + "step": 9066 + }, + { + "epoch": 0.71, + "grad_norm": 1.783380075038979, + "learning_rate": 2.020785271153815e-06, + "loss": 0.4459, + "step": 9067 + }, + { + "epoch": 0.71, + "grad_norm": 2.0065249703452364, + "learning_rate": 2.019763986219669e-06, + "loss": 0.4692, + "step": 9068 + }, + { + "epoch": 0.71, + "grad_norm": 1.84834275329012, + "learning_rate": 2.018742894102892e-06, + "loss": 0.4778, + "step": 9069 + }, + { + "epoch": 0.71, + "grad_norm": 1.914032266461034, + "learning_rate": 2.0177219948695498e-06, + "loss": 0.4133, + "step": 9070 + }, + { + "epoch": 0.71, + "grad_norm": 1.6765420255273842, + "learning_rate": 2.0167012885856933e-06, + "loss": 0.4204, + "step": 9071 + }, + { + "epoch": 0.71, + "grad_norm": 2.2937124151101687, + "learning_rate": 2.015680775317361e-06, + "loss": 0.4293, + "step": 9072 + }, + { + "epoch": 0.71, + "grad_norm": 1.845473503153631, + "learning_rate": 2.0146604551305805e-06, + "loss": 0.4504, + "step": 9073 + }, + { + "epoch": 0.71, + "grad_norm": 0.5376945963040818, + "learning_rate": 2.0136403280913595e-06, + "loss": 0.4537, + "step": 9074 + }, + { + "epoch": 0.71, + "grad_norm": 2.420506663673671, + "learning_rate": 2.012620394265706e-06, + "loss": 0.4476, + "step": 9075 + }, + { + "epoch": 0.71, + "grad_norm": 1.46812424897621, + "learning_rate": 2.0116006537196033e-06, + "loss": 0.4879, + "step": 9076 + }, + { + "epoch": 0.71, + "grad_norm": 3.1475708348329037, + "learning_rate": 2.0105811065190306e-06, + "loss": 0.4734, + "step": 9077 + }, + { + "epoch": 0.71, + "grad_norm": 1.7199294451489255, + "learning_rate": 2.0095617527299467e-06, + "loss": 0.4712, + "step": 9078 + }, + { + "epoch": 0.71, + "grad_norm": 1.8570547107638602, + "learning_rate": 2.0085425924183083e-06, + "loss": 0.4731, + "step": 9079 + }, + { + "epoch": 0.71, + "grad_norm": 3.0896312839718028, + "learning_rate": 2.00752362565005e-06, + "loss": 0.4681, + "step": 9080 + }, + { + "epoch": 0.71, + "grad_norm": 3.061011832455128, + "learning_rate": 2.006504852491099e-06, + "loss": 0.475, + "step": 9081 + }, + { + "epoch": 0.71, + "grad_norm": 1.7548912097951428, + "learning_rate": 2.0054862730073647e-06, + "loss": 0.4754, + "step": 9082 + }, + { + "epoch": 0.71, + "grad_norm": 1.7752719396753005, + "learning_rate": 2.0044678872647555e-06, + "loss": 0.4792, + "step": 9083 + }, + { + "epoch": 0.71, + "grad_norm": 2.020631961341323, + "learning_rate": 2.0034496953291533e-06, + "loss": 0.4037, + "step": 9084 + }, + { + "epoch": 0.71, + "grad_norm": 1.5484566486418452, + "learning_rate": 2.002431697266436e-06, + "loss": 0.4601, + "step": 9085 + }, + { + "epoch": 0.71, + "grad_norm": 1.7170834054576911, + "learning_rate": 2.0014138931424658e-06, + "loss": 0.4607, + "step": 9086 + }, + { + "epoch": 0.71, + "grad_norm": 0.56762613332507, + "learning_rate": 2.000396283023095e-06, + "loss": 0.4715, + "step": 9087 + }, + { + "epoch": 0.71, + "grad_norm": 1.6321260865427072, + "learning_rate": 1.999378866974162e-06, + "loss": 0.4746, + "step": 9088 + }, + { + "epoch": 0.71, + "grad_norm": 0.5318285081404429, + "learning_rate": 1.9983616450614894e-06, + "loss": 0.4559, + "step": 9089 + }, + { + "epoch": 0.71, + "grad_norm": 1.9360996999888145, + "learning_rate": 1.9973446173508913e-06, + "loss": 0.4115, + "step": 9090 + }, + { + "epoch": 0.71, + "grad_norm": 1.920001063505966, + "learning_rate": 1.996327783908169e-06, + "loss": 0.5405, + "step": 9091 + }, + { + "epoch": 0.71, + "grad_norm": 1.637046337926212, + "learning_rate": 1.995311144799111e-06, + "loss": 0.4646, + "step": 9092 + }, + { + "epoch": 0.71, + "grad_norm": 2.0892662190016984, + "learning_rate": 1.9942947000894903e-06, + "loss": 0.4765, + "step": 9093 + }, + { + "epoch": 0.71, + "grad_norm": 1.5470911614006504, + "learning_rate": 1.99327844984507e-06, + "loss": 0.4392, + "step": 9094 + }, + { + "epoch": 0.71, + "grad_norm": 2.152385206313903, + "learning_rate": 1.992262394131601e-06, + "loss": 0.4473, + "step": 9095 + }, + { + "epoch": 0.71, + "grad_norm": 0.5429104070363848, + "learning_rate": 1.9912465330148208e-06, + "loss": 0.4847, + "step": 9096 + }, + { + "epoch": 0.71, + "grad_norm": 1.6103553685614254, + "learning_rate": 1.990230866560453e-06, + "loss": 0.4011, + "step": 9097 + }, + { + "epoch": 0.71, + "grad_norm": 0.5783865783756478, + "learning_rate": 1.989215394834212e-06, + "loss": 0.4858, + "step": 9098 + }, + { + "epoch": 0.71, + "grad_norm": 2.0908368753540607, + "learning_rate": 1.988200117901797e-06, + "loss": 0.4849, + "step": 9099 + }, + { + "epoch": 0.71, + "grad_norm": 1.8203995988067303, + "learning_rate": 1.9871850358288936e-06, + "loss": 0.4646, + "step": 9100 + }, + { + "epoch": 0.71, + "grad_norm": 0.5586448525058509, + "learning_rate": 1.986170148681176e-06, + "loss": 0.4825, + "step": 9101 + }, + { + "epoch": 0.71, + "grad_norm": 2.901636238012747, + "learning_rate": 1.9851554565243086e-06, + "loss": 0.509, + "step": 9102 + }, + { + "epoch": 0.71, + "grad_norm": 0.552763840138365, + "learning_rate": 1.9841409594239402e-06, + "loss": 0.4677, + "step": 9103 + }, + { + "epoch": 0.71, + "grad_norm": 1.4708208905118039, + "learning_rate": 1.9831266574457047e-06, + "loss": 0.4286, + "step": 9104 + }, + { + "epoch": 0.72, + "grad_norm": 1.9660444034794247, + "learning_rate": 1.982112550655228e-06, + "loss": 0.4539, + "step": 9105 + }, + { + "epoch": 0.72, + "grad_norm": 0.553095877280426, + "learning_rate": 1.9810986391181215e-06, + "loss": 0.4674, + "step": 9106 + }, + { + "epoch": 0.72, + "grad_norm": 0.5755890513631077, + "learning_rate": 1.980084922899984e-06, + "loss": 0.4651, + "step": 9107 + }, + { + "epoch": 0.72, + "grad_norm": 1.4373639473080866, + "learning_rate": 1.979071402066403e-06, + "loss": 0.4511, + "step": 9108 + }, + { + "epoch": 0.72, + "grad_norm": 0.5649220923690779, + "learning_rate": 1.9780580766829484e-06, + "loss": 0.5041, + "step": 9109 + }, + { + "epoch": 0.72, + "grad_norm": 0.539407448722253, + "learning_rate": 1.9770449468151833e-06, + "loss": 0.4831, + "step": 9110 + }, + { + "epoch": 0.72, + "grad_norm": 0.5224227635384933, + "learning_rate": 1.976032012528656e-06, + "loss": 0.4871, + "step": 9111 + }, + { + "epoch": 0.72, + "grad_norm": 1.609974931255902, + "learning_rate": 1.9750192738889023e-06, + "loss": 0.4124, + "step": 9112 + }, + { + "epoch": 0.72, + "grad_norm": 1.4419942911751824, + "learning_rate": 1.974006730961442e-06, + "loss": 0.451, + "step": 9113 + }, + { + "epoch": 0.72, + "grad_norm": 1.609049868820141, + "learning_rate": 1.9729943838117905e-06, + "loss": 0.4072, + "step": 9114 + }, + { + "epoch": 0.72, + "grad_norm": 1.6076993574642724, + "learning_rate": 1.971982232505441e-06, + "loss": 0.4505, + "step": 9115 + }, + { + "epoch": 0.72, + "grad_norm": 1.5596619700819974, + "learning_rate": 1.970970277107882e-06, + "loss": 0.4457, + "step": 9116 + }, + { + "epoch": 0.72, + "grad_norm": 1.5552951761548044, + "learning_rate": 1.9699585176845796e-06, + "loss": 0.4266, + "step": 9117 + }, + { + "epoch": 0.72, + "grad_norm": 1.6115211171873614, + "learning_rate": 1.9689469543009997e-06, + "loss": 0.4393, + "step": 9118 + }, + { + "epoch": 0.72, + "grad_norm": 1.6686097956781625, + "learning_rate": 1.967935587022588e-06, + "loss": 0.4445, + "step": 9119 + }, + { + "epoch": 0.72, + "grad_norm": 2.722510681302889, + "learning_rate": 1.9669244159147756e-06, + "loss": 0.4319, + "step": 9120 + }, + { + "epoch": 0.72, + "grad_norm": 1.5196452251857038, + "learning_rate": 1.9659134410429857e-06, + "loss": 0.4391, + "step": 9121 + }, + { + "epoch": 0.72, + "grad_norm": 1.4980451849113845, + "learning_rate": 1.9649026624726276e-06, + "loss": 0.4359, + "step": 9122 + }, + { + "epoch": 0.72, + "grad_norm": 0.5861242357061968, + "learning_rate": 1.963892080269098e-06, + "loss": 0.4685, + "step": 9123 + }, + { + "epoch": 0.72, + "grad_norm": 1.4866437437874906, + "learning_rate": 1.9628816944977773e-06, + "loss": 0.4336, + "step": 9124 + }, + { + "epoch": 0.72, + "grad_norm": 1.778217294257492, + "learning_rate": 1.961871505224038e-06, + "loss": 0.4535, + "step": 9125 + }, + { + "epoch": 0.72, + "grad_norm": 1.8422995642936062, + "learning_rate": 1.9608615125132387e-06, + "loss": 0.4909, + "step": 9126 + }, + { + "epoch": 0.72, + "grad_norm": 2.0333615509661427, + "learning_rate": 1.959851716430725e-06, + "loss": 0.4618, + "step": 9127 + }, + { + "epoch": 0.72, + "grad_norm": 2.595272262520607, + "learning_rate": 1.9588421170418265e-06, + "loss": 0.4823, + "step": 9128 + }, + { + "epoch": 0.72, + "grad_norm": 0.5525596636398344, + "learning_rate": 1.957832714411863e-06, + "loss": 0.4619, + "step": 9129 + }, + { + "epoch": 0.72, + "grad_norm": 1.5089312348105903, + "learning_rate": 1.9568235086061467e-06, + "loss": 0.5012, + "step": 9130 + }, + { + "epoch": 0.72, + "grad_norm": 0.5932644084783678, + "learning_rate": 1.9558144996899665e-06, + "loss": 0.4775, + "step": 9131 + }, + { + "epoch": 0.72, + "grad_norm": 2.4762967051107085, + "learning_rate": 1.9548056877286058e-06, + "loss": 0.4647, + "step": 9132 + }, + { + "epoch": 0.72, + "grad_norm": 2.0064854070553277, + "learning_rate": 1.953797072787334e-06, + "loss": 0.465, + "step": 9133 + }, + { + "epoch": 0.72, + "grad_norm": 1.6721437463299431, + "learning_rate": 1.952788654931408e-06, + "loss": 0.4281, + "step": 9134 + }, + { + "epoch": 0.72, + "grad_norm": 2.4337388986184765, + "learning_rate": 1.951780434226068e-06, + "loss": 0.3888, + "step": 9135 + }, + { + "epoch": 0.72, + "grad_norm": 1.85172524650859, + "learning_rate": 1.9507724107365465e-06, + "loss": 0.469, + "step": 9136 + }, + { + "epoch": 0.72, + "grad_norm": 1.901302928193326, + "learning_rate": 1.949764584528061e-06, + "loss": 0.5008, + "step": 9137 + }, + { + "epoch": 0.72, + "grad_norm": 1.6410772168762595, + "learning_rate": 1.948756955665819e-06, + "loss": 0.4112, + "step": 9138 + }, + { + "epoch": 0.72, + "grad_norm": 1.7054388433261165, + "learning_rate": 1.947749524215008e-06, + "loss": 0.4116, + "step": 9139 + }, + { + "epoch": 0.72, + "grad_norm": 2.0412121462996686, + "learning_rate": 1.94674229024081e-06, + "loss": 0.4587, + "step": 9140 + }, + { + "epoch": 0.72, + "grad_norm": 1.7362147716078151, + "learning_rate": 1.945735253808392e-06, + "loss": 0.3564, + "step": 9141 + }, + { + "epoch": 0.72, + "grad_norm": 0.617189113476337, + "learning_rate": 1.944728414982907e-06, + "loss": 0.4875, + "step": 9142 + }, + { + "epoch": 0.72, + "grad_norm": 2.093336202199005, + "learning_rate": 1.9437217738294987e-06, + "loss": 0.4067, + "step": 9143 + }, + { + "epoch": 0.72, + "grad_norm": 2.456723202227306, + "learning_rate": 1.9427153304132923e-06, + "loss": 0.4629, + "step": 9144 + }, + { + "epoch": 0.72, + "grad_norm": 1.3922613146136986, + "learning_rate": 1.941709084799404e-06, + "loss": 0.4023, + "step": 9145 + }, + { + "epoch": 0.72, + "grad_norm": 1.9525647966324091, + "learning_rate": 1.9407030370529377e-06, + "loss": 0.4519, + "step": 9146 + }, + { + "epoch": 0.72, + "grad_norm": 0.5550076223116884, + "learning_rate": 1.9396971872389843e-06, + "loss": 0.4837, + "step": 9147 + }, + { + "epoch": 0.72, + "grad_norm": 1.677272708607009, + "learning_rate": 1.938691535422616e-06, + "loss": 0.418, + "step": 9148 + }, + { + "epoch": 0.72, + "grad_norm": 1.7298142951579487, + "learning_rate": 1.9376860816689044e-06, + "loss": 0.4428, + "step": 9149 + }, + { + "epoch": 0.72, + "grad_norm": 1.5452945563889064, + "learning_rate": 1.9366808260428947e-06, + "loss": 0.4189, + "step": 9150 + }, + { + "epoch": 0.72, + "grad_norm": 1.767455687919906, + "learning_rate": 1.9356757686096297e-06, + "loss": 0.3797, + "step": 9151 + }, + { + "epoch": 0.72, + "grad_norm": 1.549514929634445, + "learning_rate": 1.9346709094341325e-06, + "loss": 0.499, + "step": 9152 + }, + { + "epoch": 0.72, + "grad_norm": 0.5308333091790249, + "learning_rate": 1.933666248581418e-06, + "loss": 0.482, + "step": 9153 + }, + { + "epoch": 0.72, + "grad_norm": 0.5551199359990271, + "learning_rate": 1.9326617861164876e-06, + "loss": 0.4677, + "step": 9154 + }, + { + "epoch": 0.72, + "grad_norm": 2.078679556563845, + "learning_rate": 1.9316575221043256e-06, + "loss": 0.4956, + "step": 9155 + }, + { + "epoch": 0.72, + "grad_norm": 1.6984412788820709, + "learning_rate": 1.930653456609908e-06, + "loss": 0.4739, + "step": 9156 + }, + { + "epoch": 0.72, + "grad_norm": 2.3863778539989897, + "learning_rate": 1.929649589698196e-06, + "loss": 0.4696, + "step": 9157 + }, + { + "epoch": 0.72, + "grad_norm": 2.3208282700755762, + "learning_rate": 1.9286459214341413e-06, + "loss": 0.4139, + "step": 9158 + }, + { + "epoch": 0.72, + "grad_norm": 0.5560264677295821, + "learning_rate": 1.927642451882676e-06, + "loss": 0.4525, + "step": 9159 + }, + { + "epoch": 0.72, + "grad_norm": 2.106273065112307, + "learning_rate": 1.9266391811087243e-06, + "loss": 0.518, + "step": 9160 + }, + { + "epoch": 0.72, + "grad_norm": 1.4334736069047371, + "learning_rate": 1.9256361091771974e-06, + "loss": 0.4998, + "step": 9161 + }, + { + "epoch": 0.72, + "grad_norm": 1.7386219616256984, + "learning_rate": 1.9246332361529945e-06, + "loss": 0.4515, + "step": 9162 + }, + { + "epoch": 0.72, + "grad_norm": 1.5183076130130302, + "learning_rate": 1.9236305621009958e-06, + "loss": 0.4485, + "step": 9163 + }, + { + "epoch": 0.72, + "grad_norm": 0.5867848655026159, + "learning_rate": 1.922628087086074e-06, + "loss": 0.4857, + "step": 9164 + }, + { + "epoch": 0.72, + "grad_norm": 0.5093415147121736, + "learning_rate": 1.921625811173093e-06, + "loss": 0.4676, + "step": 9165 + }, + { + "epoch": 0.72, + "grad_norm": 2.874306764023957, + "learning_rate": 1.920623734426893e-06, + "loss": 0.4964, + "step": 9166 + }, + { + "epoch": 0.72, + "grad_norm": 2.766747159618499, + "learning_rate": 1.9196218569123094e-06, + "loss": 0.4312, + "step": 9167 + }, + { + "epoch": 0.72, + "grad_norm": 1.5406331983404502, + "learning_rate": 1.918620178694161e-06, + "loss": 0.4471, + "step": 9168 + }, + { + "epoch": 0.72, + "grad_norm": 2.260759109015618, + "learning_rate": 1.917618699837259e-06, + "loss": 0.4369, + "step": 9169 + }, + { + "epoch": 0.72, + "grad_norm": 2.155857237969518, + "learning_rate": 1.9166174204063925e-06, + "loss": 0.4919, + "step": 9170 + }, + { + "epoch": 0.72, + "grad_norm": 1.8222656194680276, + "learning_rate": 1.9156163404663453e-06, + "loss": 0.4673, + "step": 9171 + }, + { + "epoch": 0.72, + "grad_norm": 1.747868198546036, + "learning_rate": 1.914615460081886e-06, + "loss": 0.4616, + "step": 9172 + }, + { + "epoch": 0.72, + "grad_norm": 2.2806562090282863, + "learning_rate": 1.913614779317772e-06, + "loss": 0.4271, + "step": 9173 + }, + { + "epoch": 0.72, + "grad_norm": 2.8840310733404526, + "learning_rate": 1.912614298238743e-06, + "loss": 0.4259, + "step": 9174 + }, + { + "epoch": 0.72, + "grad_norm": 2.1425635769923828, + "learning_rate": 1.9116140169095292e-06, + "loss": 0.4592, + "step": 9175 + }, + { + "epoch": 0.72, + "grad_norm": 1.554602568767729, + "learning_rate": 1.9106139353948493e-06, + "loss": 0.3769, + "step": 9176 + }, + { + "epoch": 0.72, + "grad_norm": 2.224383722419359, + "learning_rate": 1.909614053759406e-06, + "loss": 0.469, + "step": 9177 + }, + { + "epoch": 0.72, + "grad_norm": 2.878350654288025, + "learning_rate": 1.9086143720678925e-06, + "loss": 0.4799, + "step": 9178 + }, + { + "epoch": 0.72, + "grad_norm": 0.5199105857578176, + "learning_rate": 1.907614890384983e-06, + "loss": 0.4608, + "step": 9179 + }, + { + "epoch": 0.72, + "grad_norm": 2.448375439959061, + "learning_rate": 1.9066156087753451e-06, + "loss": 0.4877, + "step": 9180 + }, + { + "epoch": 0.72, + "grad_norm": 1.4772809464921797, + "learning_rate": 1.9056165273036303e-06, + "loss": 0.4041, + "step": 9181 + }, + { + "epoch": 0.72, + "grad_norm": 0.5585764725422497, + "learning_rate": 1.9046176460344802e-06, + "loss": 0.4724, + "step": 9182 + }, + { + "epoch": 0.72, + "grad_norm": 0.5483104127030562, + "learning_rate": 1.903618965032516e-06, + "loss": 0.4465, + "step": 9183 + }, + { + "epoch": 0.72, + "grad_norm": 1.759087172651646, + "learning_rate": 1.9026204843623569e-06, + "loss": 0.4474, + "step": 9184 + }, + { + "epoch": 0.72, + "grad_norm": 0.5382222947620958, + "learning_rate": 1.9016222040885994e-06, + "loss": 0.4849, + "step": 9185 + }, + { + "epoch": 0.72, + "grad_norm": 2.2451304371676537, + "learning_rate": 1.9006241242758322e-06, + "loss": 0.4411, + "step": 9186 + }, + { + "epoch": 0.72, + "grad_norm": 1.7069006085466891, + "learning_rate": 1.8996262449886294e-06, + "loss": 0.4407, + "step": 9187 + }, + { + "epoch": 0.72, + "grad_norm": 3.373472597296601, + "learning_rate": 1.8986285662915528e-06, + "loss": 0.4982, + "step": 9188 + }, + { + "epoch": 0.72, + "grad_norm": 1.7254849091100373, + "learning_rate": 1.8976310882491522e-06, + "loss": 0.4336, + "step": 9189 + }, + { + "epoch": 0.72, + "grad_norm": 2.0107084218907123, + "learning_rate": 1.8966338109259608e-06, + "loss": 0.4803, + "step": 9190 + }, + { + "epoch": 0.72, + "grad_norm": 1.7882591634732128, + "learning_rate": 1.8956367343865017e-06, + "loss": 0.4644, + "step": 9191 + }, + { + "epoch": 0.72, + "grad_norm": 1.8883673999002968, + "learning_rate": 1.8946398586952847e-06, + "loss": 0.4241, + "step": 9192 + }, + { + "epoch": 0.72, + "grad_norm": 0.5421223452292012, + "learning_rate": 1.8936431839168084e-06, + "loss": 0.4638, + "step": 9193 + }, + { + "epoch": 0.72, + "grad_norm": 1.3444914491058082, + "learning_rate": 1.8926467101155532e-06, + "loss": 0.5309, + "step": 9194 + }, + { + "epoch": 0.72, + "grad_norm": 0.5388673124207428, + "learning_rate": 1.8916504373559903e-06, + "loss": 0.465, + "step": 9195 + }, + { + "epoch": 0.72, + "grad_norm": 1.5987483196596364, + "learning_rate": 1.8906543657025784e-06, + "loss": 0.3882, + "step": 9196 + }, + { + "epoch": 0.72, + "grad_norm": 1.9093022392877852, + "learning_rate": 1.889658495219761e-06, + "loss": 0.4376, + "step": 9197 + }, + { + "epoch": 0.72, + "grad_norm": 5.7973848421637735, + "learning_rate": 1.888662825971972e-06, + "loss": 0.4228, + "step": 9198 + }, + { + "epoch": 0.72, + "grad_norm": 1.4185810799590417, + "learning_rate": 1.887667358023625e-06, + "loss": 0.4673, + "step": 9199 + }, + { + "epoch": 0.72, + "grad_norm": 1.4879203975084994, + "learning_rate": 1.8866720914391313e-06, + "loss": 0.446, + "step": 9200 + }, + { + "epoch": 0.72, + "grad_norm": 2.311890417718689, + "learning_rate": 1.8856770262828788e-06, + "loss": 0.4633, + "step": 9201 + }, + { + "epoch": 0.72, + "grad_norm": 2.1322965166696397, + "learning_rate": 1.884682162619249e-06, + "loss": 0.431, + "step": 9202 + }, + { + "epoch": 0.72, + "grad_norm": 2.0796013831000915, + "learning_rate": 1.8836875005126083e-06, + "loss": 0.5054, + "step": 9203 + }, + { + "epoch": 0.72, + "grad_norm": 2.3121422649011376, + "learning_rate": 1.8826930400273108e-06, + "loss": 0.421, + "step": 9204 + }, + { + "epoch": 0.72, + "grad_norm": 0.5678717134258826, + "learning_rate": 1.8816987812276937e-06, + "loss": 0.4659, + "step": 9205 + }, + { + "epoch": 0.72, + "grad_norm": 0.5420242019248788, + "learning_rate": 1.8807047241780868e-06, + "loss": 0.4641, + "step": 9206 + }, + { + "epoch": 0.72, + "grad_norm": 2.9214575866449635, + "learning_rate": 1.8797108689428034e-06, + "loss": 0.4601, + "step": 9207 + }, + { + "epoch": 0.72, + "grad_norm": 1.7336959115142554, + "learning_rate": 1.8787172155861449e-06, + "loss": 0.4275, + "step": 9208 + }, + { + "epoch": 0.72, + "grad_norm": 1.8769876056339727, + "learning_rate": 1.8777237641724018e-06, + "loss": 0.466, + "step": 9209 + }, + { + "epoch": 0.72, + "grad_norm": 1.426596782776659, + "learning_rate": 1.8767305147658443e-06, + "loss": 0.3977, + "step": 9210 + }, + { + "epoch": 0.72, + "grad_norm": 1.4462289258772492, + "learning_rate": 1.8757374674307365e-06, + "loss": 0.3972, + "step": 9211 + }, + { + "epoch": 0.72, + "grad_norm": 1.5932914619831127, + "learning_rate": 1.8747446222313282e-06, + "loss": 0.404, + "step": 9212 + }, + { + "epoch": 0.72, + "grad_norm": 0.5519372100243521, + "learning_rate": 1.873751979231856e-06, + "loss": 0.4905, + "step": 9213 + }, + { + "epoch": 0.72, + "grad_norm": 1.5142662442772574, + "learning_rate": 1.872759538496539e-06, + "loss": 0.4846, + "step": 9214 + }, + { + "epoch": 0.72, + "grad_norm": 1.498870063757825, + "learning_rate": 1.8717673000895892e-06, + "loss": 0.4203, + "step": 9215 + }, + { + "epoch": 0.72, + "grad_norm": 1.81282711386877, + "learning_rate": 1.870775264075203e-06, + "loss": 0.4317, + "step": 9216 + }, + { + "epoch": 0.72, + "grad_norm": 1.9882558339664969, + "learning_rate": 1.869783430517565e-06, + "loss": 0.408, + "step": 9217 + }, + { + "epoch": 0.72, + "grad_norm": 2.419733341748207, + "learning_rate": 1.8687917994808413e-06, + "loss": 0.4722, + "step": 9218 + }, + { + "epoch": 0.72, + "grad_norm": 1.3585326606217072, + "learning_rate": 1.867800371029193e-06, + "loss": 0.4435, + "step": 9219 + }, + { + "epoch": 0.72, + "grad_norm": 0.5414326528628461, + "learning_rate": 1.8668091452267656e-06, + "loss": 0.4861, + "step": 9220 + }, + { + "epoch": 0.72, + "grad_norm": 1.84589213537367, + "learning_rate": 1.865818122137686e-06, + "loss": 0.4375, + "step": 9221 + }, + { + "epoch": 0.72, + "grad_norm": 1.5199006412952465, + "learning_rate": 1.8648273018260737e-06, + "loss": 0.3748, + "step": 9222 + }, + { + "epoch": 0.72, + "grad_norm": 0.5208804858357531, + "learning_rate": 1.8638366843560335e-06, + "loss": 0.4541, + "step": 9223 + }, + { + "epoch": 0.72, + "grad_norm": 1.518205788056479, + "learning_rate": 1.8628462697916594e-06, + "loss": 0.454, + "step": 9224 + }, + { + "epoch": 0.72, + "grad_norm": 1.6400893076547673, + "learning_rate": 1.8618560581970258e-06, + "loss": 0.484, + "step": 9225 + }, + { + "epoch": 0.72, + "grad_norm": 1.4480287237737612, + "learning_rate": 1.8608660496361997e-06, + "loss": 0.4083, + "step": 9226 + }, + { + "epoch": 0.72, + "grad_norm": 1.5329479549070404, + "learning_rate": 1.8598762441732343e-06, + "loss": 0.4236, + "step": 9227 + }, + { + "epoch": 0.72, + "grad_norm": 2.1628207731848867, + "learning_rate": 1.8588866418721691e-06, + "loss": 0.4044, + "step": 9228 + }, + { + "epoch": 0.72, + "grad_norm": 2.7215815356493622, + "learning_rate": 1.8578972427970281e-06, + "loss": 0.45, + "step": 9229 + }, + { + "epoch": 0.72, + "grad_norm": 3.6611421374795294, + "learning_rate": 1.856908047011823e-06, + "loss": 0.494, + "step": 9230 + }, + { + "epoch": 0.72, + "grad_norm": 1.3915498632270553, + "learning_rate": 1.8559190545805589e-06, + "loss": 0.4175, + "step": 9231 + }, + { + "epoch": 0.73, + "grad_norm": 0.5541304575825532, + "learning_rate": 1.8549302655672175e-06, + "loss": 0.4754, + "step": 9232 + }, + { + "epoch": 0.73, + "grad_norm": 1.6711980062408236, + "learning_rate": 1.8539416800357752e-06, + "loss": 0.4579, + "step": 9233 + }, + { + "epoch": 0.73, + "grad_norm": 1.63264840669654, + "learning_rate": 1.8529532980501874e-06, + "loss": 0.4018, + "step": 9234 + }, + { + "epoch": 0.73, + "grad_norm": 1.5190269963064156, + "learning_rate": 1.8519651196744076e-06, + "loss": 0.3929, + "step": 9235 + }, + { + "epoch": 0.73, + "grad_norm": 1.5045470009754347, + "learning_rate": 1.8509771449723646e-06, + "loss": 0.4788, + "step": 9236 + }, + { + "epoch": 0.73, + "grad_norm": 0.5421763058944935, + "learning_rate": 1.849989374007981e-06, + "loss": 0.4854, + "step": 9237 + }, + { + "epoch": 0.73, + "grad_norm": 1.9098538486400658, + "learning_rate": 1.8490018068451644e-06, + "loss": 0.4452, + "step": 9238 + }, + { + "epoch": 0.73, + "grad_norm": 1.3888801205920416, + "learning_rate": 1.8480144435478104e-06, + "loss": 0.4627, + "step": 9239 + }, + { + "epoch": 0.73, + "grad_norm": 1.6722898497132048, + "learning_rate": 1.847027284179797e-06, + "loss": 0.4271, + "step": 9240 + }, + { + "epoch": 0.73, + "grad_norm": 1.5033792356736404, + "learning_rate": 1.8460403288049934e-06, + "loss": 0.4277, + "step": 9241 + }, + { + "epoch": 0.73, + "grad_norm": 1.391308248407966, + "learning_rate": 1.8450535774872546e-06, + "loss": 0.4019, + "step": 9242 + }, + { + "epoch": 0.73, + "grad_norm": 2.140068029200228, + "learning_rate": 1.844067030290423e-06, + "loss": 0.4693, + "step": 9243 + }, + { + "epoch": 0.73, + "grad_norm": 1.9755847421866868, + "learning_rate": 1.8430806872783269e-06, + "loss": 0.4441, + "step": 9244 + }, + { + "epoch": 0.73, + "grad_norm": 1.5460951761563018, + "learning_rate": 1.842094548514779e-06, + "loss": 0.4513, + "step": 9245 + }, + { + "epoch": 0.73, + "grad_norm": 2.2005221529226167, + "learning_rate": 1.8411086140635831e-06, + "loss": 0.4939, + "step": 9246 + }, + { + "epoch": 0.73, + "grad_norm": 1.8903767635733937, + "learning_rate": 1.8401228839885271e-06, + "loss": 0.438, + "step": 9247 + }, + { + "epoch": 0.73, + "grad_norm": 2.0393010721104523, + "learning_rate": 1.839137358353389e-06, + "loss": 0.5114, + "step": 9248 + }, + { + "epoch": 0.73, + "grad_norm": 1.946580249507811, + "learning_rate": 1.8381520372219265e-06, + "loss": 0.4953, + "step": 9249 + }, + { + "epoch": 0.73, + "grad_norm": 0.5556249492954387, + "learning_rate": 1.8371669206578914e-06, + "loss": 0.487, + "step": 9250 + }, + { + "epoch": 0.73, + "grad_norm": 1.9477080437888161, + "learning_rate": 1.8361820087250194e-06, + "loss": 0.4144, + "step": 9251 + }, + { + "epoch": 0.73, + "grad_norm": 2.4589068885400316, + "learning_rate": 1.8351973014870339e-06, + "loss": 0.448, + "step": 9252 + }, + { + "epoch": 0.73, + "grad_norm": 1.5362232240013247, + "learning_rate": 1.834212799007639e-06, + "loss": 0.4662, + "step": 9253 + }, + { + "epoch": 0.73, + "grad_norm": 1.5367898453772608, + "learning_rate": 1.8332285013505374e-06, + "loss": 0.4171, + "step": 9254 + }, + { + "epoch": 0.73, + "grad_norm": 1.8864570404520737, + "learning_rate": 1.83224440857941e-06, + "loss": 0.4638, + "step": 9255 + }, + { + "epoch": 0.73, + "grad_norm": 1.59273282789299, + "learning_rate": 1.8312605207579243e-06, + "loss": 0.4393, + "step": 9256 + }, + { + "epoch": 0.73, + "grad_norm": 0.5666707898029244, + "learning_rate": 1.8302768379497372e-06, + "loss": 0.5038, + "step": 9257 + }, + { + "epoch": 0.73, + "grad_norm": 4.463357693049538, + "learning_rate": 1.8292933602184926e-06, + "loss": 0.4675, + "step": 9258 + }, + { + "epoch": 0.73, + "grad_norm": 1.9139614599752526, + "learning_rate": 1.8283100876278214e-06, + "loss": 0.4533, + "step": 9259 + }, + { + "epoch": 0.73, + "grad_norm": 2.3427453874249586, + "learning_rate": 1.8273270202413362e-06, + "loss": 0.4201, + "step": 9260 + }, + { + "epoch": 0.73, + "grad_norm": 0.5240255596541301, + "learning_rate": 1.8263441581226433e-06, + "loss": 0.4584, + "step": 9261 + }, + { + "epoch": 0.73, + "grad_norm": 1.994949089005115, + "learning_rate": 1.825361501335331e-06, + "loss": 0.4578, + "step": 9262 + }, + { + "epoch": 0.73, + "grad_norm": 1.7777001466533116, + "learning_rate": 1.8243790499429787e-06, + "loss": 0.4535, + "step": 9263 + }, + { + "epoch": 0.73, + "grad_norm": 1.813926574327787, + "learning_rate": 1.8233968040091455e-06, + "loss": 0.4297, + "step": 9264 + }, + { + "epoch": 0.73, + "grad_norm": 4.038049685055486, + "learning_rate": 1.822414763597382e-06, + "loss": 0.4162, + "step": 9265 + }, + { + "epoch": 0.73, + "grad_norm": 0.5785531416329566, + "learning_rate": 1.8214329287712301e-06, + "loss": 0.4644, + "step": 9266 + }, + { + "epoch": 0.73, + "grad_norm": 1.816192217168559, + "learning_rate": 1.820451299594208e-06, + "loss": 0.4696, + "step": 9267 + }, + { + "epoch": 0.73, + "grad_norm": 1.6655227772043057, + "learning_rate": 1.819469876129829e-06, + "loss": 0.4297, + "step": 9268 + }, + { + "epoch": 0.73, + "grad_norm": 2.708322419406571, + "learning_rate": 1.8184886584415851e-06, + "loss": 0.4275, + "step": 9269 + }, + { + "epoch": 0.73, + "grad_norm": 1.6760379698829073, + "learning_rate": 1.8175076465929658e-06, + "loss": 0.4527, + "step": 9270 + }, + { + "epoch": 0.73, + "grad_norm": 1.545372721176282, + "learning_rate": 1.816526840647438e-06, + "loss": 0.4836, + "step": 9271 + }, + { + "epoch": 0.73, + "grad_norm": 2.1280552135995947, + "learning_rate": 1.8155462406684598e-06, + "loss": 0.4463, + "step": 9272 + }, + { + "epoch": 0.73, + "grad_norm": 1.8441588535034579, + "learning_rate": 1.814565846719471e-06, + "loss": 0.4879, + "step": 9273 + }, + { + "epoch": 0.73, + "grad_norm": 1.9454659232218547, + "learning_rate": 1.8135856588639083e-06, + "loss": 0.5036, + "step": 9274 + }, + { + "epoch": 0.73, + "grad_norm": 2.2368555782766117, + "learning_rate": 1.8126056771651835e-06, + "loss": 0.433, + "step": 9275 + }, + { + "epoch": 0.73, + "grad_norm": 2.0002766260060922, + "learning_rate": 1.8116259016867017e-06, + "loss": 0.4321, + "step": 9276 + }, + { + "epoch": 0.73, + "grad_norm": 1.5621497408715517, + "learning_rate": 1.8106463324918528e-06, + "loss": 0.4321, + "step": 9277 + }, + { + "epoch": 0.73, + "grad_norm": 1.4859241322226107, + "learning_rate": 1.809666969644014e-06, + "loss": 0.394, + "step": 9278 + }, + { + "epoch": 0.73, + "grad_norm": 3.315578274882957, + "learning_rate": 1.808687813206551e-06, + "loss": 0.4011, + "step": 9279 + }, + { + "epoch": 0.73, + "grad_norm": 1.7546561153206832, + "learning_rate": 1.8077088632428098e-06, + "loss": 0.4231, + "step": 9280 + }, + { + "epoch": 0.73, + "grad_norm": 1.6751986167005537, + "learning_rate": 1.806730119816129e-06, + "loss": 0.4582, + "step": 9281 + }, + { + "epoch": 0.73, + "grad_norm": 1.3984430950892055, + "learning_rate": 1.805751582989832e-06, + "loss": 0.4586, + "step": 9282 + }, + { + "epoch": 0.73, + "grad_norm": 0.5345368492872156, + "learning_rate": 1.804773252827231e-06, + "loss": 0.4621, + "step": 9283 + }, + { + "epoch": 0.73, + "grad_norm": 1.8772328323655694, + "learning_rate": 1.803795129391619e-06, + "loss": 0.4708, + "step": 9284 + }, + { + "epoch": 0.73, + "grad_norm": 1.5103311658715841, + "learning_rate": 1.8028172127462806e-06, + "loss": 0.437, + "step": 9285 + }, + { + "epoch": 0.73, + "grad_norm": 1.8392821683791205, + "learning_rate": 1.801839502954486e-06, + "loss": 0.4159, + "step": 9286 + }, + { + "epoch": 0.73, + "grad_norm": 1.9755264681625297, + "learning_rate": 1.8008620000794923e-06, + "loss": 0.4176, + "step": 9287 + }, + { + "epoch": 0.73, + "grad_norm": 4.744678326855679, + "learning_rate": 1.799884704184542e-06, + "loss": 0.4596, + "step": 9288 + }, + { + "epoch": 0.73, + "grad_norm": 1.3276883038910463, + "learning_rate": 1.798907615332865e-06, + "loss": 0.4224, + "step": 9289 + }, + { + "epoch": 0.73, + "grad_norm": 1.9183167014473168, + "learning_rate": 1.7979307335876795e-06, + "loss": 0.465, + "step": 9290 + }, + { + "epoch": 0.73, + "grad_norm": 1.7817855433985248, + "learning_rate": 1.7969540590121854e-06, + "loss": 0.4438, + "step": 9291 + }, + { + "epoch": 0.73, + "grad_norm": 1.5718021713930184, + "learning_rate": 1.7959775916695727e-06, + "loss": 0.4664, + "step": 9292 + }, + { + "epoch": 0.73, + "grad_norm": 0.5858898233712196, + "learning_rate": 1.7950013316230192e-06, + "loss": 0.4779, + "step": 9293 + }, + { + "epoch": 0.73, + "grad_norm": 2.5968143485916064, + "learning_rate": 1.7940252789356882e-06, + "loss": 0.4526, + "step": 9294 + }, + { + "epoch": 0.73, + "grad_norm": 2.3469827852523824, + "learning_rate": 1.793049433670726e-06, + "loss": 0.4858, + "step": 9295 + }, + { + "epoch": 0.73, + "grad_norm": 2.0758638365294835, + "learning_rate": 1.7920737958912704e-06, + "loss": 0.4653, + "step": 9296 + }, + { + "epoch": 0.73, + "grad_norm": 0.5430044710587435, + "learning_rate": 1.7910983656604436e-06, + "loss": 0.4693, + "step": 9297 + }, + { + "epoch": 0.73, + "grad_norm": 2.7602142893216746, + "learning_rate": 1.7901231430413545e-06, + "loss": 0.4879, + "step": 9298 + }, + { + "epoch": 0.73, + "grad_norm": 1.3472476607751918, + "learning_rate": 1.7891481280971014e-06, + "loss": 0.4691, + "step": 9299 + }, + { + "epoch": 0.73, + "grad_norm": 0.541496911455217, + "learning_rate": 1.7881733208907603e-06, + "loss": 0.4711, + "step": 9300 + }, + { + "epoch": 0.73, + "grad_norm": 1.4798230781164425, + "learning_rate": 1.787198721485407e-06, + "loss": 0.4784, + "step": 9301 + }, + { + "epoch": 0.73, + "grad_norm": 2.0178051855943138, + "learning_rate": 1.7862243299440917e-06, + "loss": 0.4781, + "step": 9302 + }, + { + "epoch": 0.73, + "grad_norm": 1.9868237610745132, + "learning_rate": 1.7852501463298605e-06, + "loss": 0.4474, + "step": 9303 + }, + { + "epoch": 0.73, + "grad_norm": 1.4928139370725808, + "learning_rate": 1.7842761707057355e-06, + "loss": 0.4428, + "step": 9304 + }, + { + "epoch": 0.73, + "grad_norm": 1.6491388189457152, + "learning_rate": 1.783302403134739e-06, + "loss": 0.4547, + "step": 9305 + }, + { + "epoch": 0.73, + "grad_norm": 0.5511368577955398, + "learning_rate": 1.7823288436798674e-06, + "loss": 0.4769, + "step": 9306 + }, + { + "epoch": 0.73, + "grad_norm": 1.5586850148950284, + "learning_rate": 1.7813554924041121e-06, + "loss": 0.4154, + "step": 9307 + }, + { + "epoch": 0.73, + "grad_norm": 1.447723229103301, + "learning_rate": 1.7803823493704426e-06, + "loss": 0.485, + "step": 9308 + }, + { + "epoch": 0.73, + "grad_norm": 1.5967824889120015, + "learning_rate": 1.7794094146418266e-06, + "loss": 0.4526, + "step": 9309 + }, + { + "epoch": 0.73, + "grad_norm": 1.7050161123000305, + "learning_rate": 1.7784366882812066e-06, + "loss": 0.4809, + "step": 9310 + }, + { + "epoch": 0.73, + "grad_norm": 1.9088807439335018, + "learning_rate": 1.7774641703515189e-06, + "loss": 0.4201, + "step": 9311 + }, + { + "epoch": 0.73, + "grad_norm": 1.4174311003575504, + "learning_rate": 1.7764918609156835e-06, + "loss": 0.4769, + "step": 9312 + }, + { + "epoch": 0.73, + "grad_norm": 0.6198413680197977, + "learning_rate": 1.7755197600366076e-06, + "loss": 0.4949, + "step": 9313 + }, + { + "epoch": 0.73, + "grad_norm": 0.5362808728343472, + "learning_rate": 1.774547867777187e-06, + "loss": 0.4641, + "step": 9314 + }, + { + "epoch": 0.73, + "grad_norm": 1.3923999536205864, + "learning_rate": 1.7735761842002986e-06, + "loss": 0.4607, + "step": 9315 + }, + { + "epoch": 0.73, + "grad_norm": 1.9420880552116575, + "learning_rate": 1.7726047093688098e-06, + "loss": 0.4563, + "step": 9316 + }, + { + "epoch": 0.73, + "grad_norm": 0.6010606847688019, + "learning_rate": 1.7716334433455746e-06, + "loss": 0.4841, + "step": 9317 + }, + { + "epoch": 0.73, + "grad_norm": 1.5922081536444679, + "learning_rate": 1.7706623861934341e-06, + "loss": 0.4559, + "step": 9318 + }, + { + "epoch": 0.73, + "grad_norm": 2.363557857965496, + "learning_rate": 1.7696915379752111e-06, + "loss": 0.4795, + "step": 9319 + }, + { + "epoch": 0.73, + "grad_norm": 2.130864049268851, + "learning_rate": 1.7687208987537197e-06, + "loss": 0.4896, + "step": 9320 + }, + { + "epoch": 0.73, + "grad_norm": 1.6236941079567966, + "learning_rate": 1.767750468591759e-06, + "loss": 0.5046, + "step": 9321 + }, + { + "epoch": 0.73, + "grad_norm": 0.571103114808913, + "learning_rate": 1.7667802475521145e-06, + "loss": 0.4693, + "step": 9322 + }, + { + "epoch": 0.73, + "grad_norm": 0.5359202050752966, + "learning_rate": 1.7658102356975588e-06, + "loss": 0.4683, + "step": 9323 + }, + { + "epoch": 0.73, + "grad_norm": 0.5456737112928702, + "learning_rate": 1.7648404330908496e-06, + "loss": 0.4762, + "step": 9324 + }, + { + "epoch": 0.73, + "grad_norm": 1.4335623476107258, + "learning_rate": 1.7638708397947345e-06, + "loss": 0.4111, + "step": 9325 + }, + { + "epoch": 0.73, + "grad_norm": 0.5906323543995217, + "learning_rate": 1.762901455871941e-06, + "loss": 0.4876, + "step": 9326 + }, + { + "epoch": 0.73, + "grad_norm": 1.5577277279246093, + "learning_rate": 1.761932281385188e-06, + "loss": 0.4681, + "step": 9327 + }, + { + "epoch": 0.73, + "grad_norm": 1.804430177140995, + "learning_rate": 1.7609633163971806e-06, + "loss": 0.424, + "step": 9328 + }, + { + "epoch": 0.73, + "grad_norm": 1.6115224028505788, + "learning_rate": 1.7599945609706115e-06, + "loss": 0.4694, + "step": 9329 + }, + { + "epoch": 0.73, + "grad_norm": 2.1762393898851204, + "learning_rate": 1.7590260151681543e-06, + "loss": 0.3745, + "step": 9330 + }, + { + "epoch": 0.73, + "grad_norm": 1.6722203559996953, + "learning_rate": 1.758057679052474e-06, + "loss": 0.4419, + "step": 9331 + }, + { + "epoch": 0.73, + "grad_norm": 1.8229225259778827, + "learning_rate": 1.7570895526862202e-06, + "loss": 0.4483, + "step": 9332 + }, + { + "epoch": 0.73, + "grad_norm": 2.07559915063841, + "learning_rate": 1.756121636132031e-06, + "loss": 0.4731, + "step": 9333 + }, + { + "epoch": 0.73, + "grad_norm": 1.4463621563456848, + "learning_rate": 1.75515392945253e-06, + "loss": 0.4036, + "step": 9334 + }, + { + "epoch": 0.73, + "grad_norm": 1.681805116418943, + "learning_rate": 1.7541864327103208e-06, + "loss": 0.4379, + "step": 9335 + }, + { + "epoch": 0.73, + "grad_norm": 1.9468794944631487, + "learning_rate": 1.7532191459680076e-06, + "loss": 0.4798, + "step": 9336 + }, + { + "epoch": 0.73, + "grad_norm": 1.58254993002841, + "learning_rate": 1.7522520692881661e-06, + "loss": 0.4886, + "step": 9337 + }, + { + "epoch": 0.73, + "grad_norm": 1.5990588669648145, + "learning_rate": 1.7512852027333694e-06, + "loss": 0.4393, + "step": 9338 + }, + { + "epoch": 0.73, + "grad_norm": 1.7985446580840798, + "learning_rate": 1.7503185463661666e-06, + "loss": 0.4427, + "step": 9339 + }, + { + "epoch": 0.73, + "grad_norm": 1.61036880072945, + "learning_rate": 1.7493521002491054e-06, + "loss": 0.4524, + "step": 9340 + }, + { + "epoch": 0.73, + "grad_norm": 2.0426783606222543, + "learning_rate": 1.74838586444471e-06, + "loss": 0.41, + "step": 9341 + }, + { + "epoch": 0.73, + "grad_norm": 2.1366774435231237, + "learning_rate": 1.7474198390154974e-06, + "loss": 0.4091, + "step": 9342 + }, + { + "epoch": 0.73, + "grad_norm": 1.7556505387703467, + "learning_rate": 1.7464540240239626e-06, + "loss": 0.4505, + "step": 9343 + }, + { + "epoch": 0.73, + "grad_norm": 2.6566058792548293, + "learning_rate": 1.7454884195325977e-06, + "loss": 0.4551, + "step": 9344 + }, + { + "epoch": 0.73, + "grad_norm": 1.9662522560558913, + "learning_rate": 1.7445230256038764e-06, + "loss": 0.5178, + "step": 9345 + }, + { + "epoch": 0.73, + "grad_norm": 1.509436944987248, + "learning_rate": 1.7435578423002553e-06, + "loss": 0.4226, + "step": 9346 + }, + { + "epoch": 0.73, + "grad_norm": 0.5255955807171936, + "learning_rate": 1.7425928696841815e-06, + "loss": 0.444, + "step": 9347 + }, + { + "epoch": 0.73, + "grad_norm": 2.0268476223997896, + "learning_rate": 1.741628107818088e-06, + "loss": 0.4717, + "step": 9348 + }, + { + "epoch": 0.73, + "grad_norm": 2.0849529328624525, + "learning_rate": 1.7406635567643948e-06, + "loss": 0.4512, + "step": 9349 + }, + { + "epoch": 0.73, + "grad_norm": 2.1912060512591713, + "learning_rate": 1.7396992165855047e-06, + "loss": 0.5107, + "step": 9350 + }, + { + "epoch": 0.73, + "grad_norm": 1.7214610611756402, + "learning_rate": 1.7387350873438097e-06, + "loss": 0.4646, + "step": 9351 + }, + { + "epoch": 0.73, + "grad_norm": 1.5924600883896598, + "learning_rate": 1.7377711691016885e-06, + "loss": 0.5431, + "step": 9352 + }, + { + "epoch": 0.73, + "grad_norm": 2.409935865581491, + "learning_rate": 1.736807461921507e-06, + "loss": 0.4661, + "step": 9353 + }, + { + "epoch": 0.73, + "grad_norm": 3.547876913119003, + "learning_rate": 1.7358439658656119e-06, + "loss": 0.4549, + "step": 9354 + }, + { + "epoch": 0.73, + "grad_norm": 1.966440331252713, + "learning_rate": 1.7348806809963404e-06, + "loss": 0.4637, + "step": 9355 + }, + { + "epoch": 0.73, + "grad_norm": 1.7112132617858995, + "learning_rate": 1.733917607376021e-06, + "loss": 0.4479, + "step": 9356 + }, + { + "epoch": 0.73, + "grad_norm": 1.4424168307166918, + "learning_rate": 1.7329547450669586e-06, + "loss": 0.4334, + "step": 9357 + }, + { + "epoch": 0.73, + "grad_norm": 1.4016292961403505, + "learning_rate": 1.73199209413145e-06, + "loss": 0.449, + "step": 9358 + }, + { + "epoch": 0.74, + "grad_norm": 2.4481316452250996, + "learning_rate": 1.7310296546317778e-06, + "loss": 0.4557, + "step": 9359 + }, + { + "epoch": 0.74, + "grad_norm": 1.5016184556563066, + "learning_rate": 1.730067426630212e-06, + "loss": 0.4622, + "step": 9360 + }, + { + "epoch": 0.74, + "grad_norm": 2.2849569471795887, + "learning_rate": 1.7291054101890048e-06, + "loss": 0.4717, + "step": 9361 + }, + { + "epoch": 0.74, + "grad_norm": 1.7797826265308634, + "learning_rate": 1.7281436053703987e-06, + "loss": 0.416, + "step": 9362 + }, + { + "epoch": 0.74, + "grad_norm": 2.8338266262279257, + "learning_rate": 1.7271820122366211e-06, + "loss": 0.4906, + "step": 9363 + }, + { + "epoch": 0.74, + "grad_norm": 2.488351220973661, + "learning_rate": 1.7262206308498874e-06, + "loss": 0.4814, + "step": 9364 + }, + { + "epoch": 0.74, + "grad_norm": 0.5364365567445476, + "learning_rate": 1.725259461272395e-06, + "loss": 0.469, + "step": 9365 + }, + { + "epoch": 0.74, + "grad_norm": 0.5475691234460252, + "learning_rate": 1.7242985035663312e-06, + "loss": 0.4836, + "step": 9366 + }, + { + "epoch": 0.74, + "grad_norm": 2.134605342680636, + "learning_rate": 1.7233377577938693e-06, + "loss": 0.4314, + "step": 9367 + }, + { + "epoch": 0.74, + "grad_norm": 1.9681300335814162, + "learning_rate": 1.7223772240171676e-06, + "loss": 0.4693, + "step": 9368 + }, + { + "epoch": 0.74, + "grad_norm": 1.38245372498438, + "learning_rate": 1.721416902298374e-06, + "loss": 0.4229, + "step": 9369 + }, + { + "epoch": 0.74, + "grad_norm": 3.2502208510489177, + "learning_rate": 1.7204567926996145e-06, + "loss": 0.4216, + "step": 9370 + }, + { + "epoch": 0.74, + "grad_norm": 0.5695482962660846, + "learning_rate": 1.7194968952830137e-06, + "loss": 0.4651, + "step": 9371 + }, + { + "epoch": 0.74, + "grad_norm": 1.94518794761535, + "learning_rate": 1.7185372101106706e-06, + "loss": 0.4214, + "step": 9372 + }, + { + "epoch": 0.74, + "grad_norm": 2.1270999082332507, + "learning_rate": 1.717577737244679e-06, + "loss": 0.4356, + "step": 9373 + }, + { + "epoch": 0.74, + "grad_norm": 2.803947583203103, + "learning_rate": 1.71661847674711e-06, + "loss": 0.4631, + "step": 9374 + }, + { + "epoch": 0.74, + "grad_norm": 0.5611308774126539, + "learning_rate": 1.7156594286800344e-06, + "loss": 0.4898, + "step": 9375 + }, + { + "epoch": 0.74, + "grad_norm": 1.7550734571660456, + "learning_rate": 1.7147005931054956e-06, + "loss": 0.4514, + "step": 9376 + }, + { + "epoch": 0.74, + "grad_norm": 1.603361465568647, + "learning_rate": 1.7137419700855302e-06, + "loss": 0.4781, + "step": 9377 + }, + { + "epoch": 0.74, + "grad_norm": 2.589085572953168, + "learning_rate": 1.7127835596821606e-06, + "loss": 0.438, + "step": 9378 + }, + { + "epoch": 0.74, + "grad_norm": 5.339629372614089, + "learning_rate": 1.7118253619573944e-06, + "loss": 0.4657, + "step": 9379 + }, + { + "epoch": 0.74, + "grad_norm": 2.815802526264609, + "learning_rate": 1.7108673769732275e-06, + "loss": 0.4439, + "step": 9380 + }, + { + "epoch": 0.74, + "grad_norm": 1.6688502173016868, + "learning_rate": 1.7099096047916375e-06, + "loss": 0.4582, + "step": 9381 + }, + { + "epoch": 0.74, + "grad_norm": 1.5982164421126173, + "learning_rate": 1.7089520454745912e-06, + "loss": 0.4237, + "step": 9382 + }, + { + "epoch": 0.74, + "grad_norm": 1.6970059074734893, + "learning_rate": 1.707994699084043e-06, + "loss": 0.4422, + "step": 9383 + }, + { + "epoch": 0.74, + "grad_norm": 1.4143705130229824, + "learning_rate": 1.7070375656819326e-06, + "loss": 0.4219, + "step": 9384 + }, + { + "epoch": 0.74, + "grad_norm": 0.5599999410261133, + "learning_rate": 1.706080645330182e-06, + "loss": 0.4577, + "step": 9385 + }, + { + "epoch": 0.74, + "grad_norm": 0.5586146254703207, + "learning_rate": 1.7051239380907054e-06, + "loss": 0.4774, + "step": 9386 + }, + { + "epoch": 0.74, + "grad_norm": 2.249217766061195, + "learning_rate": 1.7041674440253991e-06, + "loss": 0.4788, + "step": 9387 + }, + { + "epoch": 0.74, + "grad_norm": 2.395151535946371, + "learning_rate": 1.7032111631961502e-06, + "loss": 0.4569, + "step": 9388 + }, + { + "epoch": 0.74, + "grad_norm": 1.4964013857927392, + "learning_rate": 1.7022550956648237e-06, + "loss": 0.4444, + "step": 9389 + }, + { + "epoch": 0.74, + "grad_norm": 1.5279867411395442, + "learning_rate": 1.7012992414932773e-06, + "loss": 0.4287, + "step": 9390 + }, + { + "epoch": 0.74, + "grad_norm": 1.7686582251542629, + "learning_rate": 1.7003436007433583e-06, + "loss": 0.4121, + "step": 9391 + }, + { + "epoch": 0.74, + "grad_norm": 0.5200612202428085, + "learning_rate": 1.6993881734768897e-06, + "loss": 0.4541, + "step": 9392 + }, + { + "epoch": 0.74, + "grad_norm": 1.841342268550445, + "learning_rate": 1.6984329597556886e-06, + "loss": 0.4745, + "step": 9393 + }, + { + "epoch": 0.74, + "grad_norm": 1.7093319680666754, + "learning_rate": 1.697477959641556e-06, + "loss": 0.4789, + "step": 9394 + }, + { + "epoch": 0.74, + "grad_norm": 0.5436758927180081, + "learning_rate": 1.6965231731962811e-06, + "loss": 0.4904, + "step": 9395 + }, + { + "epoch": 0.74, + "grad_norm": 1.8113144895817426, + "learning_rate": 1.6955686004816335e-06, + "loss": 0.4521, + "step": 9396 + }, + { + "epoch": 0.74, + "grad_norm": 1.5108512793777908, + "learning_rate": 1.6946142415593748e-06, + "loss": 0.4239, + "step": 9397 + }, + { + "epoch": 0.74, + "grad_norm": 5.258795270221807, + "learning_rate": 1.6936600964912508e-06, + "loss": 0.5082, + "step": 9398 + }, + { + "epoch": 0.74, + "grad_norm": 1.7835922750894324, + "learning_rate": 1.6927061653389948e-06, + "loss": 0.414, + "step": 9399 + }, + { + "epoch": 0.74, + "grad_norm": 0.558931009445231, + "learning_rate": 1.6917524481643216e-06, + "loss": 0.465, + "step": 9400 + }, + { + "epoch": 0.74, + "grad_norm": 1.5990602333267177, + "learning_rate": 1.6907989450289375e-06, + "loss": 0.4732, + "step": 9401 + }, + { + "epoch": 0.74, + "grad_norm": 1.697666986131221, + "learning_rate": 1.6898456559945332e-06, + "loss": 0.3865, + "step": 9402 + }, + { + "epoch": 0.74, + "grad_norm": 1.7559005237630128, + "learning_rate": 1.6888925811227841e-06, + "loss": 0.4168, + "step": 9403 + }, + { + "epoch": 0.74, + "grad_norm": 2.1026836853962636, + "learning_rate": 1.687939720475355e-06, + "loss": 0.4133, + "step": 9404 + }, + { + "epoch": 0.74, + "grad_norm": 0.5645004944966443, + "learning_rate": 1.6869870741138906e-06, + "loss": 0.4667, + "step": 9405 + }, + { + "epoch": 0.74, + "grad_norm": 2.0396792270399624, + "learning_rate": 1.6860346421000311e-06, + "loss": 0.4497, + "step": 9406 + }, + { + "epoch": 0.74, + "grad_norm": 2.1391574834967404, + "learning_rate": 1.685082424495394e-06, + "loss": 0.4735, + "step": 9407 + }, + { + "epoch": 0.74, + "grad_norm": 1.2831335441920926, + "learning_rate": 1.6841304213615889e-06, + "loss": 0.437, + "step": 9408 + }, + { + "epoch": 0.74, + "grad_norm": 1.7141446056692833, + "learning_rate": 1.6831786327602045e-06, + "loss": 0.4728, + "step": 9409 + }, + { + "epoch": 0.74, + "grad_norm": 1.9154358509445302, + "learning_rate": 1.6822270587528273e-06, + "loss": 0.5231, + "step": 9410 + }, + { + "epoch": 0.74, + "grad_norm": 1.4284530672236355, + "learning_rate": 1.681275699401017e-06, + "loss": 0.3948, + "step": 9411 + }, + { + "epoch": 0.74, + "grad_norm": 1.648456691912696, + "learning_rate": 1.6803245547663278e-06, + "loss": 0.4027, + "step": 9412 + }, + { + "epoch": 0.74, + "grad_norm": 2.0201403472041424, + "learning_rate": 1.6793736249102976e-06, + "loss": 0.4401, + "step": 9413 + }, + { + "epoch": 0.74, + "grad_norm": 0.543950347878804, + "learning_rate": 1.6784229098944493e-06, + "loss": 0.4662, + "step": 9414 + }, + { + "epoch": 0.74, + "grad_norm": 1.7836923791801027, + "learning_rate": 1.6774724097802959e-06, + "loss": 0.4558, + "step": 9415 + }, + { + "epoch": 0.74, + "grad_norm": 1.5843778299191118, + "learning_rate": 1.676522124629329e-06, + "loss": 0.454, + "step": 9416 + }, + { + "epoch": 0.74, + "grad_norm": 2.1600457084528237, + "learning_rate": 1.675572054503033e-06, + "loss": 0.4858, + "step": 9417 + }, + { + "epoch": 0.74, + "grad_norm": 1.575299610913128, + "learning_rate": 1.6746221994628764e-06, + "loss": 0.4033, + "step": 9418 + }, + { + "epoch": 0.74, + "grad_norm": 3.4931203483121034, + "learning_rate": 1.6736725595703145e-06, + "loss": 0.4551, + "step": 9419 + }, + { + "epoch": 0.74, + "grad_norm": 1.8352633826801457, + "learning_rate": 1.6727231348867856e-06, + "loss": 0.4587, + "step": 9420 + }, + { + "epoch": 0.74, + "grad_norm": 1.5924941548761598, + "learning_rate": 1.671773925473717e-06, + "loss": 0.4667, + "step": 9421 + }, + { + "epoch": 0.74, + "grad_norm": 1.5431038026124573, + "learning_rate": 1.6708249313925217e-06, + "loss": 0.3973, + "step": 9422 + }, + { + "epoch": 0.74, + "grad_norm": 2.19925690089107, + "learning_rate": 1.6698761527045982e-06, + "loss": 0.4031, + "step": 9423 + }, + { + "epoch": 0.74, + "grad_norm": 1.6992884126470247, + "learning_rate": 1.6689275894713326e-06, + "loss": 0.4554, + "step": 9424 + }, + { + "epoch": 0.74, + "grad_norm": 1.5158162637385297, + "learning_rate": 1.6679792417540913e-06, + "loss": 0.4234, + "step": 9425 + }, + { + "epoch": 0.74, + "grad_norm": 1.5453318226443147, + "learning_rate": 1.6670311096142371e-06, + "loss": 0.4825, + "step": 9426 + }, + { + "epoch": 0.74, + "grad_norm": 1.3826283821909433, + "learning_rate": 1.666083193113109e-06, + "loss": 0.4079, + "step": 9427 + }, + { + "epoch": 0.74, + "grad_norm": 1.9418228459360969, + "learning_rate": 1.6651354923120367e-06, + "loss": 0.4107, + "step": 9428 + }, + { + "epoch": 0.74, + "grad_norm": 0.5313753955389569, + "learning_rate": 1.664188007272336e-06, + "loss": 0.4615, + "step": 9429 + }, + { + "epoch": 0.74, + "grad_norm": 21.753136229551092, + "learning_rate": 1.6632407380553085e-06, + "loss": 0.4598, + "step": 9430 + }, + { + "epoch": 0.74, + "grad_norm": 1.438014344140615, + "learning_rate": 1.6622936847222386e-06, + "loss": 0.4171, + "step": 9431 + }, + { + "epoch": 0.74, + "grad_norm": 1.954942575705391, + "learning_rate": 1.6613468473344013e-06, + "loss": 0.4117, + "step": 9432 + }, + { + "epoch": 0.74, + "grad_norm": 1.488685423206424, + "learning_rate": 1.6604002259530549e-06, + "loss": 0.4011, + "step": 9433 + }, + { + "epoch": 0.74, + "grad_norm": 1.6029407610868032, + "learning_rate": 1.659453820639446e-06, + "loss": 0.4283, + "step": 9434 + }, + { + "epoch": 0.74, + "grad_norm": 1.794421523022401, + "learning_rate": 1.658507631454806e-06, + "loss": 0.464, + "step": 9435 + }, + { + "epoch": 0.74, + "grad_norm": 10.459092209644698, + "learning_rate": 1.6575616584603493e-06, + "loss": 0.4383, + "step": 9436 + }, + { + "epoch": 0.74, + "grad_norm": 1.729153750110327, + "learning_rate": 1.6566159017172812e-06, + "loss": 0.5503, + "step": 9437 + }, + { + "epoch": 0.74, + "grad_norm": 1.2554255729493307, + "learning_rate": 1.6556703612867904e-06, + "loss": 0.4288, + "step": 9438 + }, + { + "epoch": 0.74, + "grad_norm": 2.413585252589411, + "learning_rate": 1.6547250372300538e-06, + "loss": 0.4772, + "step": 9439 + }, + { + "epoch": 0.74, + "grad_norm": 1.7488345497394422, + "learning_rate": 1.653779929608228e-06, + "loss": 0.4478, + "step": 9440 + }, + { + "epoch": 0.74, + "grad_norm": 1.7867792633302464, + "learning_rate": 1.6528350384824671e-06, + "loss": 0.4388, + "step": 9441 + }, + { + "epoch": 0.74, + "grad_norm": 0.5552530763629887, + "learning_rate": 1.6518903639138983e-06, + "loss": 0.4783, + "step": 9442 + }, + { + "epoch": 0.74, + "grad_norm": 1.403988482679584, + "learning_rate": 1.6509459059636445e-06, + "loss": 0.4146, + "step": 9443 + }, + { + "epoch": 0.74, + "grad_norm": 0.5187400008910756, + "learning_rate": 1.6500016646928069e-06, + "loss": 0.4572, + "step": 9444 + }, + { + "epoch": 0.74, + "grad_norm": 0.5429090828856518, + "learning_rate": 1.6490576401624803e-06, + "loss": 0.4624, + "step": 9445 + }, + { + "epoch": 0.74, + "grad_norm": 1.8945126541416097, + "learning_rate": 1.648113832433743e-06, + "loss": 0.4814, + "step": 9446 + }, + { + "epoch": 0.74, + "grad_norm": 0.5656748645440883, + "learning_rate": 1.6471702415676538e-06, + "loss": 0.4853, + "step": 9447 + }, + { + "epoch": 0.74, + "grad_norm": 2.007872785565353, + "learning_rate": 1.646226867625264e-06, + "loss": 0.4346, + "step": 9448 + }, + { + "epoch": 0.74, + "grad_norm": 1.958805964340988, + "learning_rate": 1.6452837106676089e-06, + "loss": 0.4651, + "step": 9449 + }, + { + "epoch": 0.74, + "grad_norm": 1.5760445978046895, + "learning_rate": 1.6443407707557103e-06, + "loss": 0.4191, + "step": 9450 + }, + { + "epoch": 0.74, + "grad_norm": 2.208537671216683, + "learning_rate": 1.6433980479505728e-06, + "loss": 0.5142, + "step": 9451 + }, + { + "epoch": 0.74, + "grad_norm": 1.6558865705304067, + "learning_rate": 1.6424555423131905e-06, + "loss": 0.4232, + "step": 9452 + }, + { + "epoch": 0.74, + "grad_norm": 1.7879914755597162, + "learning_rate": 1.6415132539045424e-06, + "loss": 0.4403, + "step": 9453 + }, + { + "epoch": 0.74, + "grad_norm": 1.735197036523958, + "learning_rate": 1.6405711827855952e-06, + "loss": 0.452, + "step": 9454 + }, + { + "epoch": 0.74, + "grad_norm": 1.7411948269339976, + "learning_rate": 1.639629329017296e-06, + "loss": 0.4381, + "step": 9455 + }, + { + "epoch": 0.74, + "grad_norm": 1.6955786518592089, + "learning_rate": 1.6386876926605816e-06, + "loss": 0.4735, + "step": 9456 + }, + { + "epoch": 0.74, + "grad_norm": 2.247884104685819, + "learning_rate": 1.6377462737763794e-06, + "loss": 0.4553, + "step": 9457 + }, + { + "epoch": 0.74, + "grad_norm": 1.9240998028511493, + "learning_rate": 1.6368050724255935e-06, + "loss": 0.4466, + "step": 9458 + }, + { + "epoch": 0.74, + "grad_norm": 1.7293031957088278, + "learning_rate": 1.6358640886691213e-06, + "loss": 0.4111, + "step": 9459 + }, + { + "epoch": 0.74, + "grad_norm": 0.559767266275453, + "learning_rate": 1.634923322567839e-06, + "loss": 0.4875, + "step": 9460 + }, + { + "epoch": 0.74, + "grad_norm": 3.048214493890338, + "learning_rate": 1.6339827741826181e-06, + "loss": 0.3973, + "step": 9461 + }, + { + "epoch": 0.74, + "grad_norm": 2.1239997429149633, + "learning_rate": 1.6330424435743076e-06, + "loss": 0.4267, + "step": 9462 + }, + { + "epoch": 0.74, + "grad_norm": 3.4019879411903484, + "learning_rate": 1.632102330803746e-06, + "loss": 0.4343, + "step": 9463 + }, + { + "epoch": 0.74, + "grad_norm": 1.9580660584644536, + "learning_rate": 1.631162435931758e-06, + "loss": 0.3897, + "step": 9464 + }, + { + "epoch": 0.74, + "grad_norm": 1.7203975426082199, + "learning_rate": 1.6302227590191543e-06, + "loss": 0.4667, + "step": 9465 + }, + { + "epoch": 0.74, + "grad_norm": 0.560517183033107, + "learning_rate": 1.6292833001267288e-06, + "loss": 0.4751, + "step": 9466 + }, + { + "epoch": 0.74, + "grad_norm": 1.865711654315479, + "learning_rate": 1.6283440593152644e-06, + "loss": 0.4865, + "step": 9467 + }, + { + "epoch": 0.74, + "grad_norm": 1.2372077869769353, + "learning_rate": 1.627405036645529e-06, + "loss": 0.4205, + "step": 9468 + }, + { + "epoch": 0.74, + "grad_norm": 1.7098413964338406, + "learning_rate": 1.6264662321782754e-06, + "loss": 0.4244, + "step": 9469 + }, + { + "epoch": 0.74, + "grad_norm": 1.503438159074684, + "learning_rate": 1.6255276459742452e-06, + "loss": 0.497, + "step": 9470 + }, + { + "epoch": 0.74, + "grad_norm": 1.606628312608506, + "learning_rate": 1.6245892780941601e-06, + "loss": 0.3892, + "step": 9471 + }, + { + "epoch": 0.74, + "grad_norm": 1.7582621109431151, + "learning_rate": 1.6236511285987333e-06, + "loss": 0.479, + "step": 9472 + }, + { + "epoch": 0.74, + "grad_norm": 1.7475186005544148, + "learning_rate": 1.6227131975486616e-06, + "loss": 0.4826, + "step": 9473 + }, + { + "epoch": 0.74, + "grad_norm": 0.5311138999144461, + "learning_rate": 1.6217754850046297e-06, + "loss": 0.4862, + "step": 9474 + }, + { + "epoch": 0.74, + "grad_norm": 1.850432386374592, + "learning_rate": 1.6208379910273014e-06, + "loss": 0.4534, + "step": 9475 + }, + { + "epoch": 0.74, + "grad_norm": 0.5669949954976595, + "learning_rate": 1.6199007156773378e-06, + "loss": 0.4969, + "step": 9476 + }, + { + "epoch": 0.74, + "grad_norm": 1.4869074465526095, + "learning_rate": 1.6189636590153746e-06, + "loss": 0.4501, + "step": 9477 + }, + { + "epoch": 0.74, + "grad_norm": 1.491255001316474, + "learning_rate": 1.6180268211020412e-06, + "loss": 0.4272, + "step": 9478 + }, + { + "epoch": 0.74, + "grad_norm": 1.9957735508183045, + "learning_rate": 1.617090201997945e-06, + "loss": 0.4353, + "step": 9479 + }, + { + "epoch": 0.74, + "grad_norm": 5.122290335082375, + "learning_rate": 1.616153801763689e-06, + "loss": 0.4475, + "step": 9480 + }, + { + "epoch": 0.74, + "grad_norm": 9.116308705620828, + "learning_rate": 1.615217620459857e-06, + "loss": 0.4787, + "step": 9481 + }, + { + "epoch": 0.74, + "grad_norm": 1.3082318900085268, + "learning_rate": 1.6142816581470156e-06, + "loss": 0.4574, + "step": 9482 + }, + { + "epoch": 0.74, + "grad_norm": 1.5697733745669809, + "learning_rate": 1.6133459148857216e-06, + "loss": 0.4371, + "step": 9483 + }, + { + "epoch": 0.74, + "grad_norm": 2.0734756647600965, + "learning_rate": 1.6124103907365168e-06, + "loss": 0.4638, + "step": 9484 + }, + { + "epoch": 0.74, + "grad_norm": 1.9356342589704505, + "learning_rate": 1.6114750857599299e-06, + "loss": 0.4503, + "step": 9485 + }, + { + "epoch": 0.74, + "grad_norm": 1.9925063712882152, + "learning_rate": 1.6105400000164707e-06, + "loss": 0.4001, + "step": 9486 + }, + { + "epoch": 0.75, + "grad_norm": 1.529947549966917, + "learning_rate": 1.609605133566639e-06, + "loss": 0.4708, + "step": 9487 + }, + { + "epoch": 0.75, + "grad_norm": 1.6495789914128554, + "learning_rate": 1.60867048647092e-06, + "loss": 0.4685, + "step": 9488 + }, + { + "epoch": 0.75, + "grad_norm": 1.6722346052075252, + "learning_rate": 1.6077360587897867e-06, + "loss": 0.4613, + "step": 9489 + }, + { + "epoch": 0.75, + "grad_norm": 1.6700327105747919, + "learning_rate": 1.6068018505836901e-06, + "loss": 0.4214, + "step": 9490 + }, + { + "epoch": 0.75, + "grad_norm": 1.6803857038403462, + "learning_rate": 1.6058678619130735e-06, + "loss": 0.4462, + "step": 9491 + }, + { + "epoch": 0.75, + "grad_norm": 1.682054659968237, + "learning_rate": 1.6049340928383694e-06, + "loss": 0.4963, + "step": 9492 + }, + { + "epoch": 0.75, + "grad_norm": 0.5366834013419136, + "learning_rate": 1.6040005434199869e-06, + "loss": 0.4837, + "step": 9493 + }, + { + "epoch": 0.75, + "grad_norm": 1.7070580308806913, + "learning_rate": 1.6030672137183283e-06, + "loss": 0.4479, + "step": 9494 + }, + { + "epoch": 0.75, + "grad_norm": 3.917254791485374, + "learning_rate": 1.6021341037937739e-06, + "loss": 0.4863, + "step": 9495 + }, + { + "epoch": 0.75, + "grad_norm": 1.808475486717067, + "learning_rate": 1.6012012137067013e-06, + "loss": 0.4548, + "step": 9496 + }, + { + "epoch": 0.75, + "grad_norm": 1.6753571088876864, + "learning_rate": 1.600268543517462e-06, + "loss": 0.4593, + "step": 9497 + }, + { + "epoch": 0.75, + "grad_norm": 0.4969076769005229, + "learning_rate": 1.5993360932864005e-06, + "loss": 0.4849, + "step": 9498 + }, + { + "epoch": 0.75, + "grad_norm": 1.9074840928268515, + "learning_rate": 1.5984038630738458e-06, + "loss": 0.4605, + "step": 9499 + }, + { + "epoch": 0.75, + "grad_norm": 1.6160669020640468, + "learning_rate": 1.5974718529401123e-06, + "loss": 0.4421, + "step": 9500 + }, + { + "epoch": 0.75, + "grad_norm": 5.887074553743196, + "learning_rate": 1.5965400629454975e-06, + "loss": 0.4424, + "step": 9501 + }, + { + "epoch": 0.75, + "grad_norm": 2.83158035012084, + "learning_rate": 1.595608493150288e-06, + "loss": 0.4464, + "step": 9502 + }, + { + "epoch": 0.75, + "grad_norm": 1.7320565965830719, + "learning_rate": 1.5946771436147561e-06, + "loss": 0.4332, + "step": 9503 + }, + { + "epoch": 0.75, + "grad_norm": 1.7855869176629924, + "learning_rate": 1.5937460143991579e-06, + "loss": 0.4244, + "step": 9504 + }, + { + "epoch": 0.75, + "grad_norm": 1.7249361960453422, + "learning_rate": 1.592815105563738e-06, + "loss": 0.4519, + "step": 9505 + }, + { + "epoch": 0.75, + "grad_norm": 0.584797664218502, + "learning_rate": 1.5918844171687225e-06, + "loss": 0.4496, + "step": 9506 + }, + { + "epoch": 0.75, + "grad_norm": 1.856571855287507, + "learning_rate": 1.590953949274327e-06, + "loss": 0.4652, + "step": 9507 + }, + { + "epoch": 0.75, + "grad_norm": 1.8055948117820615, + "learning_rate": 1.5900237019407511e-06, + "loss": 0.4248, + "step": 9508 + }, + { + "epoch": 0.75, + "grad_norm": 4.4239989251735885, + "learning_rate": 1.5890936752281822e-06, + "loss": 0.4466, + "step": 9509 + }, + { + "epoch": 0.75, + "grad_norm": 1.4372588451993604, + "learning_rate": 1.5881638691967876e-06, + "loss": 0.4347, + "step": 9510 + }, + { + "epoch": 0.75, + "grad_norm": 1.9260101718843334, + "learning_rate": 1.5872342839067305e-06, + "loss": 0.5115, + "step": 9511 + }, + { + "epoch": 0.75, + "grad_norm": 1.7245122706338096, + "learning_rate": 1.586304919418149e-06, + "loss": 0.4313, + "step": 9512 + }, + { + "epoch": 0.75, + "grad_norm": 1.7412950487570953, + "learning_rate": 1.5853757757911737e-06, + "loss": 0.4374, + "step": 9513 + }, + { + "epoch": 0.75, + "grad_norm": 1.78237475330738, + "learning_rate": 1.5844468530859193e-06, + "loss": 0.4631, + "step": 9514 + }, + { + "epoch": 0.75, + "grad_norm": 3.137020579989209, + "learning_rate": 1.5835181513624848e-06, + "loss": 0.4487, + "step": 9515 + }, + { + "epoch": 0.75, + "grad_norm": 2.6545369543014457, + "learning_rate": 1.5825896706809579e-06, + "loss": 0.4241, + "step": 9516 + }, + { + "epoch": 0.75, + "grad_norm": 1.8032011973221418, + "learning_rate": 1.5816614111014078e-06, + "loss": 0.4716, + "step": 9517 + }, + { + "epoch": 0.75, + "grad_norm": 0.5543070126425631, + "learning_rate": 1.5807333726838924e-06, + "loss": 0.4677, + "step": 9518 + }, + { + "epoch": 0.75, + "grad_norm": 1.5062740220644393, + "learning_rate": 1.5798055554884551e-06, + "loss": 0.4475, + "step": 9519 + }, + { + "epoch": 0.75, + "grad_norm": 6.3760098652872506, + "learning_rate": 1.5788779595751252e-06, + "loss": 0.4429, + "step": 9520 + }, + { + "epoch": 0.75, + "grad_norm": 1.9306794140545294, + "learning_rate": 1.5779505850039152e-06, + "loss": 0.4637, + "step": 9521 + }, + { + "epoch": 0.75, + "grad_norm": 0.5703665620093, + "learning_rate": 1.577023431834825e-06, + "loss": 0.4788, + "step": 9522 + }, + { + "epoch": 0.75, + "grad_norm": 2.1967489297370695, + "learning_rate": 1.576096500127841e-06, + "loss": 0.4769, + "step": 9523 + }, + { + "epoch": 0.75, + "grad_norm": 1.7063140774317545, + "learning_rate": 1.5751697899429346e-06, + "loss": 0.4176, + "step": 9524 + }, + { + "epoch": 0.75, + "grad_norm": 0.5706641851239264, + "learning_rate": 1.5742433013400644e-06, + "loss": 0.4747, + "step": 9525 + }, + { + "epoch": 0.75, + "grad_norm": 1.8131725871098845, + "learning_rate": 1.5733170343791675e-06, + "loss": 0.4647, + "step": 9526 + }, + { + "epoch": 0.75, + "grad_norm": 1.5558082167399254, + "learning_rate": 1.5723909891201794e-06, + "loss": 0.3696, + "step": 9527 + }, + { + "epoch": 0.75, + "grad_norm": 1.4513936995032763, + "learning_rate": 1.5714651656230085e-06, + "loss": 0.3986, + "step": 9528 + }, + { + "epoch": 0.75, + "grad_norm": 1.8326119490423778, + "learning_rate": 1.5705395639475578e-06, + "loss": 0.4423, + "step": 9529 + }, + { + "epoch": 0.75, + "grad_norm": 2.196816284472686, + "learning_rate": 1.5696141841537083e-06, + "loss": 0.4274, + "step": 9530 + }, + { + "epoch": 0.75, + "grad_norm": 1.7042805841630007, + "learning_rate": 1.5686890263013365e-06, + "loss": 0.4541, + "step": 9531 + }, + { + "epoch": 0.75, + "grad_norm": 1.8523043324055197, + "learning_rate": 1.5677640904502944e-06, + "loss": 0.4767, + "step": 9532 + }, + { + "epoch": 0.75, + "grad_norm": 0.540166566621854, + "learning_rate": 1.5668393766604255e-06, + "loss": 0.4581, + "step": 9533 + }, + { + "epoch": 0.75, + "grad_norm": 1.3155390370897255, + "learning_rate": 1.5659148849915585e-06, + "loss": 0.3972, + "step": 9534 + }, + { + "epoch": 0.75, + "grad_norm": 3.594892610223923, + "learning_rate": 1.564990615503507e-06, + "loss": 0.4966, + "step": 9535 + }, + { + "epoch": 0.75, + "grad_norm": 2.750396699990423, + "learning_rate": 1.5640665682560678e-06, + "loss": 0.4343, + "step": 9536 + }, + { + "epoch": 0.75, + "grad_norm": 1.8670077921769563, + "learning_rate": 1.563142743309027e-06, + "loss": 0.4287, + "step": 9537 + }, + { + "epoch": 0.75, + "grad_norm": 0.5857018292928747, + "learning_rate": 1.5622191407221543e-06, + "loss": 0.4792, + "step": 9538 + }, + { + "epoch": 0.75, + "grad_norm": 2.517864936443574, + "learning_rate": 1.5612957605552064e-06, + "loss": 0.442, + "step": 9539 + }, + { + "epoch": 0.75, + "grad_norm": 1.690461840207118, + "learning_rate": 1.5603726028679255e-06, + "loss": 0.3945, + "step": 9540 + }, + { + "epoch": 0.75, + "grad_norm": 2.520239876249671, + "learning_rate": 1.5594496677200366e-06, + "loss": 0.4275, + "step": 9541 + }, + { + "epoch": 0.75, + "grad_norm": 1.7890290913333489, + "learning_rate": 1.558526955171253e-06, + "loss": 0.4533, + "step": 9542 + }, + { + "epoch": 0.75, + "grad_norm": 2.323409867952726, + "learning_rate": 1.557604465281274e-06, + "loss": 0.4962, + "step": 9543 + }, + { + "epoch": 0.75, + "grad_norm": 1.6111690265022294, + "learning_rate": 1.5566821981097836e-06, + "loss": 0.42, + "step": 9544 + }, + { + "epoch": 0.75, + "grad_norm": 1.719315966510045, + "learning_rate": 1.5557601537164497e-06, + "loss": 0.4232, + "step": 9545 + }, + { + "epoch": 0.75, + "grad_norm": 3.8168772331157212, + "learning_rate": 1.5548383321609272e-06, + "loss": 0.454, + "step": 9546 + }, + { + "epoch": 0.75, + "grad_norm": 4.098846189225212, + "learning_rate": 1.5539167335028588e-06, + "loss": 0.492, + "step": 9547 + }, + { + "epoch": 0.75, + "grad_norm": 1.5404712007206263, + "learning_rate": 1.5529953578018691e-06, + "loss": 0.4703, + "step": 9548 + }, + { + "epoch": 0.75, + "grad_norm": 0.5318636608855462, + "learning_rate": 1.5520742051175708e-06, + "loss": 0.4772, + "step": 9549 + }, + { + "epoch": 0.75, + "grad_norm": 1.6133056320735004, + "learning_rate": 1.551153275509561e-06, + "loss": 0.4226, + "step": 9550 + }, + { + "epoch": 0.75, + "grad_norm": 0.5586398656457083, + "learning_rate": 1.5502325690374243e-06, + "loss": 0.4722, + "step": 9551 + }, + { + "epoch": 0.75, + "grad_norm": 1.7843552594030445, + "learning_rate": 1.5493120857607258e-06, + "loss": 0.4966, + "step": 9552 + }, + { + "epoch": 0.75, + "grad_norm": 2.3660796404631763, + "learning_rate": 1.5483918257390212e-06, + "loss": 0.4684, + "step": 9553 + }, + { + "epoch": 0.75, + "grad_norm": 1.4073057865492087, + "learning_rate": 1.5474717890318502e-06, + "loss": 0.4053, + "step": 9554 + }, + { + "epoch": 0.75, + "grad_norm": 1.5682954647072034, + "learning_rate": 1.5465519756987396e-06, + "loss": 0.4983, + "step": 9555 + }, + { + "epoch": 0.75, + "grad_norm": 2.5165934129897196, + "learning_rate": 1.5456323857991967e-06, + "loss": 0.4705, + "step": 9556 + }, + { + "epoch": 0.75, + "grad_norm": 1.702757020919363, + "learning_rate": 1.5447130193927202e-06, + "loss": 0.4301, + "step": 9557 + }, + { + "epoch": 0.75, + "grad_norm": 1.690970463130095, + "learning_rate": 1.5437938765387906e-06, + "loss": 0.3771, + "step": 9558 + }, + { + "epoch": 0.75, + "grad_norm": 1.878326313726984, + "learning_rate": 1.542874957296876e-06, + "loss": 0.46, + "step": 9559 + }, + { + "epoch": 0.75, + "grad_norm": 1.5634339958245722, + "learning_rate": 1.5419562617264312e-06, + "loss": 0.4798, + "step": 9560 + }, + { + "epoch": 0.75, + "grad_norm": 2.4606275065512824, + "learning_rate": 1.541037789886889e-06, + "loss": 0.4629, + "step": 9561 + }, + { + "epoch": 0.75, + "grad_norm": 1.7948825489499456, + "learning_rate": 1.5401195418376801e-06, + "loss": 0.3995, + "step": 9562 + }, + { + "epoch": 0.75, + "grad_norm": 1.8127300625938654, + "learning_rate": 1.5392015176382092e-06, + "loss": 0.476, + "step": 9563 + }, + { + "epoch": 0.75, + "grad_norm": 2.1496492507136784, + "learning_rate": 1.5382837173478748e-06, + "loss": 0.4718, + "step": 9564 + }, + { + "epoch": 0.75, + "grad_norm": 1.5008139438889498, + "learning_rate": 1.5373661410260515e-06, + "loss": 0.3987, + "step": 9565 + }, + { + "epoch": 0.75, + "grad_norm": 1.767988731802174, + "learning_rate": 1.5364487887321128e-06, + "loss": 0.533, + "step": 9566 + }, + { + "epoch": 0.75, + "grad_norm": 0.565293624225528, + "learning_rate": 1.535531660525405e-06, + "loss": 0.4886, + "step": 9567 + }, + { + "epoch": 0.75, + "grad_norm": 2.180293049041982, + "learning_rate": 1.5346147564652664e-06, + "loss": 0.4694, + "step": 9568 + }, + { + "epoch": 0.75, + "grad_norm": 1.9830831666312374, + "learning_rate": 1.53369807661102e-06, + "loss": 0.4644, + "step": 9569 + }, + { + "epoch": 0.75, + "grad_norm": 0.5465052335030506, + "learning_rate": 1.5327816210219736e-06, + "loss": 0.4645, + "step": 9570 + }, + { + "epoch": 0.75, + "grad_norm": 1.494685456068443, + "learning_rate": 1.531865389757422e-06, + "loss": 0.471, + "step": 9571 + }, + { + "epoch": 0.75, + "grad_norm": 1.5959141978547964, + "learning_rate": 1.5309493828766414e-06, + "loss": 0.4109, + "step": 9572 + }, + { + "epoch": 0.75, + "grad_norm": 1.700318916623548, + "learning_rate": 1.5300336004388976e-06, + "loss": 0.4169, + "step": 9573 + }, + { + "epoch": 0.75, + "grad_norm": 1.6458351947150365, + "learning_rate": 1.5291180425034403e-06, + "loss": 0.4473, + "step": 9574 + }, + { + "epoch": 0.75, + "grad_norm": 0.5505808226334474, + "learning_rate": 1.5282027091295071e-06, + "loss": 0.4915, + "step": 9575 + }, + { + "epoch": 0.75, + "grad_norm": 1.6946074093814796, + "learning_rate": 1.5272876003763154e-06, + "loss": 0.4233, + "step": 9576 + }, + { + "epoch": 0.75, + "grad_norm": 1.63728505550244, + "learning_rate": 1.5263727163030729e-06, + "loss": 0.4609, + "step": 9577 + }, + { + "epoch": 0.75, + "grad_norm": 1.5607493031343314, + "learning_rate": 1.5254580569689713e-06, + "loss": 0.4632, + "step": 9578 + }, + { + "epoch": 0.75, + "grad_norm": 1.698320408923765, + "learning_rate": 1.52454362243319e-06, + "loss": 0.417, + "step": 9579 + }, + { + "epoch": 0.75, + "grad_norm": 2.1133433949544456, + "learning_rate": 1.5236294127548883e-06, + "loss": 0.4674, + "step": 9580 + }, + { + "epoch": 0.75, + "grad_norm": 2.4136501817371303, + "learning_rate": 1.5227154279932143e-06, + "loss": 0.485, + "step": 9581 + }, + { + "epoch": 0.75, + "grad_norm": 2.4969586416456506, + "learning_rate": 1.5218016682073068e-06, + "loss": 0.4863, + "step": 9582 + }, + { + "epoch": 0.75, + "grad_norm": 1.4290515312334118, + "learning_rate": 1.5208881334562792e-06, + "loss": 0.5185, + "step": 9583 + }, + { + "epoch": 0.75, + "grad_norm": 3.081993900914485, + "learning_rate": 1.5199748237992384e-06, + "loss": 0.4862, + "step": 9584 + }, + { + "epoch": 0.75, + "grad_norm": 1.7437198546062906, + "learning_rate": 1.519061739295274e-06, + "loss": 0.4606, + "step": 9585 + }, + { + "epoch": 0.75, + "grad_norm": 1.5912700496356162, + "learning_rate": 1.5181488800034627e-06, + "loss": 0.3727, + "step": 9586 + }, + { + "epoch": 0.75, + "grad_norm": 1.7711274858532204, + "learning_rate": 1.5172362459828627e-06, + "loss": 0.4261, + "step": 9587 + }, + { + "epoch": 0.75, + "grad_norm": 1.4229496297205668, + "learning_rate": 1.5163238372925215e-06, + "loss": 0.4726, + "step": 9588 + }, + { + "epoch": 0.75, + "grad_norm": 1.83169196787833, + "learning_rate": 1.5154116539914709e-06, + "loss": 0.3916, + "step": 9589 + }, + { + "epoch": 0.75, + "grad_norm": 1.4578129682662675, + "learning_rate": 1.5144996961387292e-06, + "loss": 0.4679, + "step": 9590 + }, + { + "epoch": 0.75, + "grad_norm": 0.5427792402671615, + "learning_rate": 1.5135879637932966e-06, + "loss": 0.4881, + "step": 9591 + }, + { + "epoch": 0.75, + "grad_norm": 1.693736627150592, + "learning_rate": 1.5126764570141617e-06, + "loss": 0.4274, + "step": 9592 + }, + { + "epoch": 0.75, + "grad_norm": 1.4386507718320383, + "learning_rate": 1.5117651758602975e-06, + "loss": 0.4608, + "step": 9593 + }, + { + "epoch": 0.75, + "grad_norm": 0.5710268488838196, + "learning_rate": 1.5108541203906635e-06, + "loss": 0.4796, + "step": 9594 + }, + { + "epoch": 0.75, + "grad_norm": 2.1837446338123003, + "learning_rate": 1.5099432906642054e-06, + "loss": 0.3989, + "step": 9595 + }, + { + "epoch": 0.75, + "grad_norm": 2.3467324347962193, + "learning_rate": 1.5090326867398476e-06, + "loss": 0.4476, + "step": 9596 + }, + { + "epoch": 0.75, + "grad_norm": 1.9092909157617979, + "learning_rate": 1.5081223086765113e-06, + "loss": 0.4181, + "step": 9597 + }, + { + "epoch": 0.75, + "grad_norm": 1.6319775840444712, + "learning_rate": 1.5072121565330916e-06, + "loss": 0.3864, + "step": 9598 + }, + { + "epoch": 0.75, + "grad_norm": 2.9583816385388566, + "learning_rate": 1.5063022303684787e-06, + "loss": 0.4325, + "step": 9599 + }, + { + "epoch": 0.75, + "grad_norm": 3.5529603790001194, + "learning_rate": 1.5053925302415374e-06, + "loss": 0.4394, + "step": 9600 + }, + { + "epoch": 0.75, + "grad_norm": 1.4241026037242772, + "learning_rate": 1.5044830562111311e-06, + "loss": 0.4198, + "step": 9601 + }, + { + "epoch": 0.75, + "grad_norm": 1.8674276094398297, + "learning_rate": 1.5035738083360967e-06, + "loss": 0.5028, + "step": 9602 + }, + { + "epoch": 0.75, + "grad_norm": 1.7788418506101502, + "learning_rate": 1.5026647866752635e-06, + "loss": 0.3988, + "step": 9603 + }, + { + "epoch": 0.75, + "grad_norm": 0.5856901186244446, + "learning_rate": 1.501755991287443e-06, + "loss": 0.4846, + "step": 9604 + }, + { + "epoch": 0.75, + "grad_norm": 2.9780219845521128, + "learning_rate": 1.500847422231434e-06, + "loss": 0.4242, + "step": 9605 + }, + { + "epoch": 0.75, + "grad_norm": 1.9590526702794837, + "learning_rate": 1.4999390795660207e-06, + "loss": 0.4731, + "step": 9606 + }, + { + "epoch": 0.75, + "grad_norm": 1.4575906782303634, + "learning_rate": 1.4990309633499683e-06, + "loss": 0.4557, + "step": 9607 + }, + { + "epoch": 0.75, + "grad_norm": 1.658650490351291, + "learning_rate": 1.4981230736420332e-06, + "loss": 0.4157, + "step": 9608 + }, + { + "epoch": 0.75, + "grad_norm": 1.635053793350725, + "learning_rate": 1.4972154105009546e-06, + "loss": 0.403, + "step": 9609 + }, + { + "epoch": 0.75, + "grad_norm": 1.7937839923334173, + "learning_rate": 1.4963079739854574e-06, + "loss": 0.4697, + "step": 9610 + }, + { + "epoch": 0.75, + "grad_norm": 2.007751674592737, + "learning_rate": 1.4954007641542501e-06, + "loss": 0.4483, + "step": 9611 + }, + { + "epoch": 0.75, + "grad_norm": 1.9161990258725385, + "learning_rate": 1.4944937810660282e-06, + "loss": 0.4666, + "step": 9612 + }, + { + "epoch": 0.75, + "grad_norm": 2.2386863831883805, + "learning_rate": 1.493587024779473e-06, + "loss": 0.457, + "step": 9613 + }, + { + "epoch": 0.76, + "grad_norm": 0.5520118355003508, + "learning_rate": 1.492680495353252e-06, + "loss": 0.5036, + "step": 9614 + }, + { + "epoch": 0.76, + "grad_norm": 3.1244214042995644, + "learning_rate": 1.4917741928460134e-06, + "loss": 0.433, + "step": 9615 + }, + { + "epoch": 0.76, + "grad_norm": 1.4019502683591856, + "learning_rate": 1.4908681173163931e-06, + "loss": 0.4509, + "step": 9616 + }, + { + "epoch": 0.76, + "grad_norm": 1.9891680347264522, + "learning_rate": 1.4899622688230186e-06, + "loss": 0.4819, + "step": 9617 + }, + { + "epoch": 0.76, + "grad_norm": 2.159763312626244, + "learning_rate": 1.4890566474244923e-06, + "loss": 0.4488, + "step": 9618 + }, + { + "epoch": 0.76, + "grad_norm": 1.9339102527344705, + "learning_rate": 1.4881512531794074e-06, + "loss": 0.4697, + "step": 9619 + }, + { + "epoch": 0.76, + "grad_norm": 1.6953351534113665, + "learning_rate": 1.4872460861463428e-06, + "loss": 0.4052, + "step": 9620 + }, + { + "epoch": 0.76, + "grad_norm": 1.5247652228188053, + "learning_rate": 1.4863411463838622e-06, + "loss": 0.45, + "step": 9621 + }, + { + "epoch": 0.76, + "grad_norm": 0.5414550380181892, + "learning_rate": 1.4854364339505117e-06, + "loss": 0.4909, + "step": 9622 + }, + { + "epoch": 0.76, + "grad_norm": 1.5467590428689633, + "learning_rate": 1.4845319489048266e-06, + "loss": 0.4001, + "step": 9623 + }, + { + "epoch": 0.76, + "grad_norm": 1.3931780062135481, + "learning_rate": 1.4836276913053256e-06, + "loss": 0.4359, + "step": 9624 + }, + { + "epoch": 0.76, + "grad_norm": 1.416778010420891, + "learning_rate": 1.482723661210514e-06, + "loss": 0.3635, + "step": 9625 + }, + { + "epoch": 0.76, + "grad_norm": 0.5649439013472531, + "learning_rate": 1.4818198586788795e-06, + "loss": 0.495, + "step": 9626 + }, + { + "epoch": 0.76, + "grad_norm": 1.6238112884185232, + "learning_rate": 1.4809162837688973e-06, + "loss": 0.4046, + "step": 9627 + }, + { + "epoch": 0.76, + "grad_norm": 2.107925704751721, + "learning_rate": 1.4800129365390282e-06, + "loss": 0.4363, + "step": 9628 + }, + { + "epoch": 0.76, + "grad_norm": 2.321423205510961, + "learning_rate": 1.4791098170477168e-06, + "loss": 0.4585, + "step": 9629 + }, + { + "epoch": 0.76, + "grad_norm": 2.7286118436204627, + "learning_rate": 1.4782069253533965e-06, + "loss": 0.436, + "step": 9630 + }, + { + "epoch": 0.76, + "grad_norm": 1.8518329193865894, + "learning_rate": 1.4773042615144778e-06, + "loss": 0.392, + "step": 9631 + }, + { + "epoch": 0.76, + "grad_norm": 0.5473098367463618, + "learning_rate": 1.4764018255893686e-06, + "loss": 0.4798, + "step": 9632 + }, + { + "epoch": 0.76, + "grad_norm": 1.6071501785728153, + "learning_rate": 1.4754996176364494e-06, + "loss": 0.45, + "step": 9633 + }, + { + "epoch": 0.76, + "grad_norm": 1.4826179478004424, + "learning_rate": 1.4745976377140969e-06, + "loss": 0.4529, + "step": 9634 + }, + { + "epoch": 0.76, + "grad_norm": 1.7510696571586861, + "learning_rate": 1.4736958858806616e-06, + "loss": 0.4957, + "step": 9635 + }, + { + "epoch": 0.76, + "grad_norm": 1.679493740338074, + "learning_rate": 1.472794362194493e-06, + "loss": 0.3896, + "step": 9636 + }, + { + "epoch": 0.76, + "grad_norm": 2.0704481842740536, + "learning_rate": 1.4718930667139131e-06, + "loss": 0.4859, + "step": 9637 + }, + { + "epoch": 0.76, + "grad_norm": 2.13400537435942, + "learning_rate": 1.4709919994972366e-06, + "loss": 0.4419, + "step": 9638 + }, + { + "epoch": 0.76, + "grad_norm": 1.828773668708369, + "learning_rate": 1.4700911606027612e-06, + "loss": 0.4428, + "step": 9639 + }, + { + "epoch": 0.76, + "grad_norm": 1.6926569199841242, + "learning_rate": 1.46919055008877e-06, + "loss": 0.478, + "step": 9640 + }, + { + "epoch": 0.76, + "grad_norm": 0.5257171656459595, + "learning_rate": 1.4682901680135332e-06, + "loss": 0.4702, + "step": 9641 + }, + { + "epoch": 0.76, + "grad_norm": 1.9173447501888918, + "learning_rate": 1.4673900144353003e-06, + "loss": 0.4863, + "step": 9642 + }, + { + "epoch": 0.76, + "grad_norm": 1.6745972367466786, + "learning_rate": 1.4664900894123123e-06, + "loss": 0.4137, + "step": 9643 + }, + { + "epoch": 0.76, + "grad_norm": 1.9858876001897323, + "learning_rate": 1.4655903930027937e-06, + "loss": 0.4207, + "step": 9644 + }, + { + "epoch": 0.76, + "grad_norm": 2.8356990604039494, + "learning_rate": 1.4646909252649544e-06, + "loss": 0.4483, + "step": 9645 + }, + { + "epoch": 0.76, + "grad_norm": 1.7294439850184085, + "learning_rate": 1.4637916862569855e-06, + "loss": 0.4204, + "step": 9646 + }, + { + "epoch": 0.76, + "grad_norm": 1.3642459233212287, + "learning_rate": 1.4628926760370688e-06, + "loss": 0.425, + "step": 9647 + }, + { + "epoch": 0.76, + "grad_norm": 12.79114468182307, + "learning_rate": 1.461993894663369e-06, + "loss": 0.4828, + "step": 9648 + }, + { + "epoch": 0.76, + "grad_norm": 2.885191909398489, + "learning_rate": 1.461095342194036e-06, + "loss": 0.479, + "step": 9649 + }, + { + "epoch": 0.76, + "grad_norm": 0.5446385798199103, + "learning_rate": 1.460197018687206e-06, + "loss": 0.4573, + "step": 9650 + }, + { + "epoch": 0.76, + "grad_norm": 1.4947546523808137, + "learning_rate": 1.4592989242009953e-06, + "loss": 0.4321, + "step": 9651 + }, + { + "epoch": 0.76, + "grad_norm": 2.0610985084441262, + "learning_rate": 1.458401058793516e-06, + "loss": 0.4104, + "step": 9652 + }, + { + "epoch": 0.76, + "grad_norm": 1.8461076201673023, + "learning_rate": 1.4575034225228528e-06, + "loss": 0.4946, + "step": 9653 + }, + { + "epoch": 0.76, + "grad_norm": 1.7472979095764258, + "learning_rate": 1.4566060154470846e-06, + "loss": 0.4457, + "step": 9654 + }, + { + "epoch": 0.76, + "grad_norm": 2.242542189147063, + "learning_rate": 1.4557088376242718e-06, + "loss": 0.5229, + "step": 9655 + }, + { + "epoch": 0.76, + "grad_norm": 0.6033672748726733, + "learning_rate": 1.4548118891124623e-06, + "loss": 0.4779, + "step": 9656 + }, + { + "epoch": 0.76, + "grad_norm": 1.7161389469420267, + "learning_rate": 1.453915169969684e-06, + "loss": 0.408, + "step": 9657 + }, + { + "epoch": 0.76, + "grad_norm": 1.4131911299803783, + "learning_rate": 1.4530186802539558e-06, + "loss": 0.4149, + "step": 9658 + }, + { + "epoch": 0.76, + "grad_norm": 1.83104449805615, + "learning_rate": 1.4521224200232786e-06, + "loss": 0.4386, + "step": 9659 + }, + { + "epoch": 0.76, + "grad_norm": 1.9539439458959489, + "learning_rate": 1.4512263893356392e-06, + "loss": 0.4481, + "step": 9660 + }, + { + "epoch": 0.76, + "grad_norm": 2.296832807183728, + "learning_rate": 1.4503305882490126e-06, + "loss": 0.4384, + "step": 9661 + }, + { + "epoch": 0.76, + "grad_norm": 1.542830261491004, + "learning_rate": 1.4494350168213511e-06, + "loss": 0.4623, + "step": 9662 + }, + { + "epoch": 0.76, + "grad_norm": 1.8351525186761743, + "learning_rate": 1.4485396751105996e-06, + "loss": 0.418, + "step": 9663 + }, + { + "epoch": 0.76, + "grad_norm": 2.2847337403177157, + "learning_rate": 1.447644563174685e-06, + "loss": 0.4848, + "step": 9664 + }, + { + "epoch": 0.76, + "grad_norm": 0.534628725416741, + "learning_rate": 1.446749681071522e-06, + "loss": 0.4817, + "step": 9665 + }, + { + "epoch": 0.76, + "grad_norm": 1.6961743451282096, + "learning_rate": 1.445855028859003e-06, + "loss": 0.452, + "step": 9666 + }, + { + "epoch": 0.76, + "grad_norm": 0.5279799934124707, + "learning_rate": 1.4449606065950173e-06, + "loss": 0.4812, + "step": 9667 + }, + { + "epoch": 0.76, + "grad_norm": 1.6628108141104814, + "learning_rate": 1.4440664143374283e-06, + "loss": 0.4062, + "step": 9668 + }, + { + "epoch": 0.76, + "grad_norm": 1.9440863062281821, + "learning_rate": 1.443172452144092e-06, + "loss": 0.4815, + "step": 9669 + }, + { + "epoch": 0.76, + "grad_norm": 1.5487369552085108, + "learning_rate": 1.4422787200728421e-06, + "loss": 0.4611, + "step": 9670 + }, + { + "epoch": 0.76, + "grad_norm": 2.635404327655508, + "learning_rate": 1.441385218181507e-06, + "loss": 0.4288, + "step": 9671 + }, + { + "epoch": 0.76, + "grad_norm": 1.5539901173496475, + "learning_rate": 1.4404919465278938e-06, + "loss": 0.434, + "step": 9672 + }, + { + "epoch": 0.76, + "grad_norm": 1.7159622281753129, + "learning_rate": 1.439598905169794e-06, + "loss": 0.5101, + "step": 9673 + }, + { + "epoch": 0.76, + "grad_norm": 1.5960887156703283, + "learning_rate": 1.4387060941649878e-06, + "loss": 0.4447, + "step": 9674 + }, + { + "epoch": 0.76, + "grad_norm": 1.7102478509328658, + "learning_rate": 1.437813513571238e-06, + "loss": 0.4417, + "step": 9675 + }, + { + "epoch": 0.76, + "grad_norm": 1.638983654801771, + "learning_rate": 1.4369211634462955e-06, + "loss": 0.4657, + "step": 9676 + }, + { + "epoch": 0.76, + "grad_norm": 1.7982156288296596, + "learning_rate": 1.4360290438478913e-06, + "loss": 0.3686, + "step": 9677 + }, + { + "epoch": 0.76, + "grad_norm": 1.5548789151374398, + "learning_rate": 1.4351371548337457e-06, + "loss": 0.408, + "step": 9678 + }, + { + "epoch": 0.76, + "grad_norm": 5.045654916234841, + "learning_rate": 1.4342454964615627e-06, + "loss": 0.4873, + "step": 9679 + }, + { + "epoch": 0.76, + "grad_norm": 1.5091384744179397, + "learning_rate": 1.433354068789033e-06, + "loss": 0.3846, + "step": 9680 + }, + { + "epoch": 0.76, + "grad_norm": 2.0331724056136684, + "learning_rate": 1.4324628718738281e-06, + "loss": 0.4733, + "step": 9681 + }, + { + "epoch": 0.76, + "grad_norm": 1.8168027159557691, + "learning_rate": 1.4315719057736067e-06, + "loss": 0.3781, + "step": 9682 + }, + { + "epoch": 0.76, + "grad_norm": 1.8276691940837746, + "learning_rate": 1.4306811705460178e-06, + "loss": 0.4353, + "step": 9683 + }, + { + "epoch": 0.76, + "grad_norm": 1.7170013607744674, + "learning_rate": 1.4297906662486866e-06, + "loss": 0.4615, + "step": 9684 + }, + { + "epoch": 0.76, + "grad_norm": 1.521713131086413, + "learning_rate": 1.4289003929392303e-06, + "loss": 0.4631, + "step": 9685 + }, + { + "epoch": 0.76, + "grad_norm": 2.055493784451531, + "learning_rate": 1.4280103506752434e-06, + "loss": 0.4245, + "step": 9686 + }, + { + "epoch": 0.76, + "grad_norm": 1.836280143244017, + "learning_rate": 1.427120539514318e-06, + "loss": 0.427, + "step": 9687 + }, + { + "epoch": 0.76, + "grad_norm": 1.478532823067747, + "learning_rate": 1.4262309595140179e-06, + "loss": 0.4115, + "step": 9688 + }, + { + "epoch": 0.76, + "grad_norm": 7.312490839099188, + "learning_rate": 1.4253416107318997e-06, + "loss": 0.4503, + "step": 9689 + }, + { + "epoch": 0.76, + "grad_norm": 1.3954164426067273, + "learning_rate": 1.4244524932255026e-06, + "loss": 0.4376, + "step": 9690 + }, + { + "epoch": 0.76, + "grad_norm": 2.383651057757473, + "learning_rate": 1.4235636070523539e-06, + "loss": 0.4102, + "step": 9691 + }, + { + "epoch": 0.76, + "grad_norm": 1.7359183164421632, + "learning_rate": 1.4226749522699595e-06, + "loss": 0.4532, + "step": 9692 + }, + { + "epoch": 0.76, + "grad_norm": 0.576892797630708, + "learning_rate": 1.4217865289358163e-06, + "loss": 0.4861, + "step": 9693 + }, + { + "epoch": 0.76, + "grad_norm": 1.9919035029096992, + "learning_rate": 1.4208983371074032e-06, + "loss": 0.4788, + "step": 9694 + }, + { + "epoch": 0.76, + "grad_norm": 0.5189714383760866, + "learning_rate": 1.4200103768421857e-06, + "loss": 0.4827, + "step": 9695 + }, + { + "epoch": 0.76, + "grad_norm": 1.8403890793835973, + "learning_rate": 1.4191226481976156e-06, + "loss": 0.4472, + "step": 9696 + }, + { + "epoch": 0.76, + "grad_norm": 1.6339010124317952, + "learning_rate": 1.4182351512311237e-06, + "loss": 0.434, + "step": 9697 + }, + { + "epoch": 0.76, + "grad_norm": 2.086586402893464, + "learning_rate": 1.4173478860001328e-06, + "loss": 0.5368, + "step": 9698 + }, + { + "epoch": 0.76, + "grad_norm": 1.7552769623658644, + "learning_rate": 1.416460852562046e-06, + "loss": 0.5019, + "step": 9699 + }, + { + "epoch": 0.76, + "grad_norm": 1.704840294793175, + "learning_rate": 1.4155740509742566e-06, + "loss": 0.4632, + "step": 9700 + }, + { + "epoch": 0.76, + "grad_norm": 1.4441749072943602, + "learning_rate": 1.4146874812941337e-06, + "loss": 0.4812, + "step": 9701 + }, + { + "epoch": 0.76, + "grad_norm": 1.6511908555842614, + "learning_rate": 1.4138011435790433e-06, + "loss": 0.4349, + "step": 9702 + }, + { + "epoch": 0.76, + "grad_norm": 0.5422171133102842, + "learning_rate": 1.412915037886326e-06, + "loss": 0.4836, + "step": 9703 + }, + { + "epoch": 0.76, + "grad_norm": 1.8992118184942701, + "learning_rate": 1.4120291642733152e-06, + "loss": 0.42, + "step": 9704 + }, + { + "epoch": 0.76, + "grad_norm": 1.5200262967941722, + "learning_rate": 1.4111435227973208e-06, + "loss": 0.4373, + "step": 9705 + }, + { + "epoch": 0.76, + "grad_norm": 1.3503431631095235, + "learning_rate": 1.4102581135156468e-06, + "loss": 0.3944, + "step": 9706 + }, + { + "epoch": 0.76, + "grad_norm": 1.6109928393556285, + "learning_rate": 1.4093729364855785e-06, + "loss": 0.4584, + "step": 9707 + }, + { + "epoch": 0.76, + "grad_norm": 1.760680695785795, + "learning_rate": 1.408487991764383e-06, + "loss": 0.4235, + "step": 9708 + }, + { + "epoch": 0.76, + "grad_norm": 1.4555839654624207, + "learning_rate": 1.4076032794093158e-06, + "loss": 0.4189, + "step": 9709 + }, + { + "epoch": 0.76, + "grad_norm": 1.7517904030458151, + "learning_rate": 1.4067187994776166e-06, + "loss": 0.4695, + "step": 9710 + }, + { + "epoch": 0.76, + "grad_norm": 1.9138928488015565, + "learning_rate": 1.4058345520265127e-06, + "loss": 0.4391, + "step": 9711 + }, + { + "epoch": 0.76, + "grad_norm": 1.9197666283853205, + "learning_rate": 1.4049505371132095e-06, + "loss": 0.4925, + "step": 9712 + }, + { + "epoch": 0.76, + "grad_norm": 2.0472638830244523, + "learning_rate": 1.404066754794904e-06, + "loss": 0.4631, + "step": 9713 + }, + { + "epoch": 0.76, + "grad_norm": 7.207484982972802, + "learning_rate": 1.4031832051287752e-06, + "loss": 0.4425, + "step": 9714 + }, + { + "epoch": 0.76, + "grad_norm": 1.7066049891329393, + "learning_rate": 1.4022998881719891e-06, + "loss": 0.458, + "step": 9715 + }, + { + "epoch": 0.76, + "grad_norm": 1.6129827361634426, + "learning_rate": 1.4014168039816929e-06, + "loss": 0.3964, + "step": 9716 + }, + { + "epoch": 0.76, + "grad_norm": 2.110268117028117, + "learning_rate": 1.4005339526150197e-06, + "loss": 0.4511, + "step": 9717 + }, + { + "epoch": 0.76, + "grad_norm": 2.0035781261139505, + "learning_rate": 1.3996513341290946e-06, + "loss": 0.4731, + "step": 9718 + }, + { + "epoch": 0.76, + "grad_norm": 0.5289342487654921, + "learning_rate": 1.3987689485810168e-06, + "loss": 0.4722, + "step": 9719 + }, + { + "epoch": 0.76, + "grad_norm": 1.5322204423089738, + "learning_rate": 1.3978867960278786e-06, + "loss": 0.3959, + "step": 9720 + }, + { + "epoch": 0.76, + "grad_norm": 1.4515473684119469, + "learning_rate": 1.397004876526749e-06, + "loss": 0.4211, + "step": 9721 + }, + { + "epoch": 0.76, + "grad_norm": 1.8936942687901868, + "learning_rate": 1.3961231901346938e-06, + "loss": 0.4474, + "step": 9722 + }, + { + "epoch": 0.76, + "grad_norm": 10.613215466898822, + "learning_rate": 1.3952417369087528e-06, + "loss": 0.4439, + "step": 9723 + }, + { + "epoch": 0.76, + "grad_norm": 3.3998774352534946, + "learning_rate": 1.3943605169059554e-06, + "loss": 0.4603, + "step": 9724 + }, + { + "epoch": 0.76, + "grad_norm": 1.622744009927646, + "learning_rate": 1.3934795301833154e-06, + "loss": 0.4416, + "step": 9725 + }, + { + "epoch": 0.76, + "grad_norm": 1.3747065419814466, + "learning_rate": 1.392598776797834e-06, + "loss": 0.4738, + "step": 9726 + }, + { + "epoch": 0.76, + "grad_norm": 1.8742286718314007, + "learning_rate": 1.3917182568064907e-06, + "loss": 0.4832, + "step": 9727 + }, + { + "epoch": 0.76, + "grad_norm": 1.5305018360697198, + "learning_rate": 1.390837970266256e-06, + "loss": 0.4251, + "step": 9728 + }, + { + "epoch": 0.76, + "grad_norm": 1.694081055401206, + "learning_rate": 1.3899579172340839e-06, + "loss": 0.4881, + "step": 9729 + }, + { + "epoch": 0.76, + "grad_norm": 1.4563023400470618, + "learning_rate": 1.3890780977669117e-06, + "loss": 0.447, + "step": 9730 + }, + { + "epoch": 0.76, + "grad_norm": 1.5917020574858896, + "learning_rate": 1.388198511921664e-06, + "loss": 0.5025, + "step": 9731 + }, + { + "epoch": 0.76, + "grad_norm": 1.6261431732513043, + "learning_rate": 1.3873191597552465e-06, + "loss": 0.4911, + "step": 9732 + }, + { + "epoch": 0.76, + "grad_norm": 1.878397864435762, + "learning_rate": 1.3864400413245532e-06, + "loss": 0.4863, + "step": 9733 + }, + { + "epoch": 0.76, + "grad_norm": 1.7808028072086688, + "learning_rate": 1.3855611566864624e-06, + "loss": 0.4719, + "step": 9734 + }, + { + "epoch": 0.76, + "grad_norm": 1.6844207196460712, + "learning_rate": 1.384682505897838e-06, + "loss": 0.4208, + "step": 9735 + }, + { + "epoch": 0.76, + "grad_norm": 2.5627687040355727, + "learning_rate": 1.383804089015523e-06, + "loss": 0.4618, + "step": 9736 + }, + { + "epoch": 0.76, + "grad_norm": 1.6231723803329783, + "learning_rate": 1.3829259060963556e-06, + "loss": 0.4537, + "step": 9737 + }, + { + "epoch": 0.76, + "grad_norm": 1.5309707733894793, + "learning_rate": 1.3820479571971491e-06, + "loss": 0.4149, + "step": 9738 + }, + { + "epoch": 0.76, + "grad_norm": 2.0083088880484525, + "learning_rate": 1.3811702423747065e-06, + "loss": 0.4542, + "step": 9739 + }, + { + "epoch": 0.76, + "grad_norm": 2.424503448077591, + "learning_rate": 1.3802927616858157e-06, + "loss": 0.4607, + "step": 9740 + }, + { + "epoch": 0.77, + "grad_norm": 1.6868006566997644, + "learning_rate": 1.379415515187248e-06, + "loss": 0.5075, + "step": 9741 + }, + { + "epoch": 0.77, + "grad_norm": 2.148107458961988, + "learning_rate": 1.3785385029357613e-06, + "loss": 0.4764, + "step": 9742 + }, + { + "epoch": 0.77, + "grad_norm": 1.6220890563820314, + "learning_rate": 1.3776617249880947e-06, + "loss": 0.4464, + "step": 9743 + }, + { + "epoch": 0.77, + "grad_norm": 1.5943503955041192, + "learning_rate": 1.3767851814009759e-06, + "loss": 0.4558, + "step": 9744 + }, + { + "epoch": 0.77, + "grad_norm": 1.637870622727954, + "learning_rate": 1.375908872231116e-06, + "loss": 0.4537, + "step": 9745 + }, + { + "epoch": 0.77, + "grad_norm": 1.9365124975820953, + "learning_rate": 1.3750327975352128e-06, + "loss": 0.4461, + "step": 9746 + }, + { + "epoch": 0.77, + "grad_norm": 1.814167621035966, + "learning_rate": 1.3741569573699437e-06, + "loss": 0.4428, + "step": 9747 + }, + { + "epoch": 0.77, + "grad_norm": 1.437270261980394, + "learning_rate": 1.373281351791977e-06, + "loss": 0.4432, + "step": 9748 + }, + { + "epoch": 0.77, + "grad_norm": 1.654862592197974, + "learning_rate": 1.3724059808579614e-06, + "loss": 0.4031, + "step": 9749 + }, + { + "epoch": 0.77, + "grad_norm": 3.2013660152049366, + "learning_rate": 1.3715308446245341e-06, + "loss": 0.4495, + "step": 9750 + }, + { + "epoch": 0.77, + "grad_norm": 1.4853119548887193, + "learning_rate": 1.3706559431483163e-06, + "loss": 0.4481, + "step": 9751 + }, + { + "epoch": 0.77, + "grad_norm": 1.7335889532423319, + "learning_rate": 1.3697812764859075e-06, + "loss": 0.4155, + "step": 9752 + }, + { + "epoch": 0.77, + "grad_norm": 0.5775261972411954, + "learning_rate": 1.3689068446939046e-06, + "loss": 0.4711, + "step": 9753 + }, + { + "epoch": 0.77, + "grad_norm": 1.7033978075172276, + "learning_rate": 1.3680326478288768e-06, + "loss": 0.4691, + "step": 9754 + }, + { + "epoch": 0.77, + "grad_norm": 1.5869143012469478, + "learning_rate": 1.3671586859473879e-06, + "loss": 0.4545, + "step": 9755 + }, + { + "epoch": 0.77, + "grad_norm": 1.5480714256511907, + "learning_rate": 1.3662849591059756e-06, + "loss": 0.402, + "step": 9756 + }, + { + "epoch": 0.77, + "grad_norm": 1.789185545663958, + "learning_rate": 1.3654114673611768e-06, + "loss": 0.3675, + "step": 9757 + }, + { + "epoch": 0.77, + "grad_norm": 2.317689808810311, + "learning_rate": 1.3645382107694993e-06, + "loss": 0.3944, + "step": 9758 + }, + { + "epoch": 0.77, + "grad_norm": 1.4993788156783316, + "learning_rate": 1.363665189387444e-06, + "loss": 0.459, + "step": 9759 + }, + { + "epoch": 0.77, + "grad_norm": 2.4891979173073304, + "learning_rate": 1.362792403271494e-06, + "loss": 0.474, + "step": 9760 + }, + { + "epoch": 0.77, + "grad_norm": 1.8247490946867762, + "learning_rate": 1.361919852478119e-06, + "loss": 0.4309, + "step": 9761 + }, + { + "epoch": 0.77, + "grad_norm": 2.1624587517942544, + "learning_rate": 1.3610475370637694e-06, + "loss": 0.4307, + "step": 9762 + }, + { + "epoch": 0.77, + "grad_norm": 2.967821326849219, + "learning_rate": 1.360175457084883e-06, + "loss": 0.3983, + "step": 9763 + }, + { + "epoch": 0.77, + "grad_norm": 1.4049153607276919, + "learning_rate": 1.359303612597883e-06, + "loss": 0.4611, + "step": 9764 + }, + { + "epoch": 0.77, + "grad_norm": 1.5869511785434793, + "learning_rate": 1.3584320036591769e-06, + "loss": 0.4449, + "step": 9765 + }, + { + "epoch": 0.77, + "grad_norm": 1.4878839656524625, + "learning_rate": 1.3575606303251582e-06, + "loss": 0.4765, + "step": 9766 + }, + { + "epoch": 0.77, + "grad_norm": 2.1391865289760585, + "learning_rate": 1.3566894926522e-06, + "loss": 0.4418, + "step": 9767 + }, + { + "epoch": 0.77, + "grad_norm": 4.79582789396429, + "learning_rate": 1.3558185906966659e-06, + "loss": 0.4568, + "step": 9768 + }, + { + "epoch": 0.77, + "grad_norm": 1.5499720430556112, + "learning_rate": 1.354947924514901e-06, + "loss": 0.4457, + "step": 9769 + }, + { + "epoch": 0.77, + "grad_norm": 1.4413785096476557, + "learning_rate": 1.3540774941632395e-06, + "loss": 0.429, + "step": 9770 + }, + { + "epoch": 0.77, + "grad_norm": 3.3919295298247376, + "learning_rate": 1.353207299697991e-06, + "loss": 0.4387, + "step": 9771 + }, + { + "epoch": 0.77, + "grad_norm": 1.5018319651659298, + "learning_rate": 1.352337341175463e-06, + "loss": 0.4359, + "step": 9772 + }, + { + "epoch": 0.77, + "grad_norm": 0.5264492331350257, + "learning_rate": 1.3514676186519354e-06, + "loss": 0.4789, + "step": 9773 + }, + { + "epoch": 0.77, + "grad_norm": 2.327658411333996, + "learning_rate": 1.3505981321836804e-06, + "loss": 0.4332, + "step": 9774 + }, + { + "epoch": 0.77, + "grad_norm": 1.7856702861849734, + "learning_rate": 1.3497288818269515e-06, + "loss": 0.4272, + "step": 9775 + }, + { + "epoch": 0.77, + "grad_norm": 1.7233789048238943, + "learning_rate": 1.34885986763799e-06, + "loss": 0.4309, + "step": 9776 + }, + { + "epoch": 0.77, + "grad_norm": 1.6847997415998228, + "learning_rate": 1.3479910896730192e-06, + "loss": 0.4369, + "step": 9777 + }, + { + "epoch": 0.77, + "grad_norm": 1.7818468852197134, + "learning_rate": 1.3471225479882466e-06, + "loss": 0.4525, + "step": 9778 + }, + { + "epoch": 0.77, + "grad_norm": 0.5568295488108892, + "learning_rate": 1.3462542426398662e-06, + "loss": 0.4826, + "step": 9779 + }, + { + "epoch": 0.77, + "grad_norm": 0.5732143996625892, + "learning_rate": 1.3453861736840568e-06, + "loss": 0.4671, + "step": 9780 + }, + { + "epoch": 0.77, + "grad_norm": 0.5706864387830022, + "learning_rate": 1.3445183411769824e-06, + "loss": 0.4885, + "step": 9781 + }, + { + "epoch": 0.77, + "grad_norm": 1.815658911544847, + "learning_rate": 1.3436507451747883e-06, + "loss": 0.4788, + "step": 9782 + }, + { + "epoch": 0.77, + "grad_norm": 1.6617200736913993, + "learning_rate": 1.3427833857336075e-06, + "loss": 0.4253, + "step": 9783 + }, + { + "epoch": 0.77, + "grad_norm": 1.4782593291817319, + "learning_rate": 1.3419162629095572e-06, + "loss": 0.4129, + "step": 9784 + }, + { + "epoch": 0.77, + "grad_norm": 1.8616842953290473, + "learning_rate": 1.34104937675874e-06, + "loss": 0.5035, + "step": 9785 + }, + { + "epoch": 0.77, + "grad_norm": 2.0308596685482283, + "learning_rate": 1.3401827273372426e-06, + "loss": 0.4239, + "step": 9786 + }, + { + "epoch": 0.77, + "grad_norm": 2.62966683825278, + "learning_rate": 1.3393163147011323e-06, + "loss": 0.4814, + "step": 9787 + }, + { + "epoch": 0.77, + "grad_norm": 0.5726175159165826, + "learning_rate": 1.3384501389064709e-06, + "loss": 0.4621, + "step": 9788 + }, + { + "epoch": 0.77, + "grad_norm": 11.473701068945438, + "learning_rate": 1.3375842000092936e-06, + "loss": 0.4214, + "step": 9789 + }, + { + "epoch": 0.77, + "grad_norm": 1.7708789622124228, + "learning_rate": 1.336718498065629e-06, + "loss": 0.4704, + "step": 9790 + }, + { + "epoch": 0.77, + "grad_norm": 1.7663202022689908, + "learning_rate": 1.3358530331314828e-06, + "loss": 0.4112, + "step": 9791 + }, + { + "epoch": 0.77, + "grad_norm": 1.5463211867620723, + "learning_rate": 1.3349878052628545e-06, + "loss": 0.4477, + "step": 9792 + }, + { + "epoch": 0.77, + "grad_norm": 1.5831405557254699, + "learning_rate": 1.3341228145157198e-06, + "loss": 0.5089, + "step": 9793 + }, + { + "epoch": 0.77, + "grad_norm": 1.6562798189496315, + "learning_rate": 1.3332580609460432e-06, + "loss": 0.4121, + "step": 9794 + }, + { + "epoch": 0.77, + "grad_norm": 1.848636414408402, + "learning_rate": 1.3323935446097735e-06, + "loss": 0.4274, + "step": 9795 + }, + { + "epoch": 0.77, + "grad_norm": 0.5654602129231828, + "learning_rate": 1.3315292655628437e-06, + "loss": 0.4746, + "step": 9796 + }, + { + "epoch": 0.77, + "grad_norm": 0.5509125235374649, + "learning_rate": 1.3306652238611729e-06, + "loss": 0.456, + "step": 9797 + }, + { + "epoch": 0.77, + "grad_norm": 1.6180634667119023, + "learning_rate": 1.3298014195606601e-06, + "loss": 0.4136, + "step": 9798 + }, + { + "epoch": 0.77, + "grad_norm": 1.683392706441607, + "learning_rate": 1.3289378527171948e-06, + "loss": 0.4956, + "step": 9799 + }, + { + "epoch": 0.77, + "grad_norm": 2.0790657076862082, + "learning_rate": 1.3280745233866477e-06, + "loss": 0.3967, + "step": 9800 + }, + { + "epoch": 0.77, + "grad_norm": 3.349019600835016, + "learning_rate": 1.3272114316248774e-06, + "loss": 0.4709, + "step": 9801 + }, + { + "epoch": 0.77, + "grad_norm": 2.6468145669126755, + "learning_rate": 1.326348577487721e-06, + "loss": 0.4098, + "step": 9802 + }, + { + "epoch": 0.77, + "grad_norm": 1.7165961360368753, + "learning_rate": 1.3254859610310055e-06, + "loss": 0.4703, + "step": 9803 + }, + { + "epoch": 0.77, + "grad_norm": 2.425229221520358, + "learning_rate": 1.3246235823105424e-06, + "loss": 0.4426, + "step": 9804 + }, + { + "epoch": 0.77, + "grad_norm": 1.4234368658967118, + "learning_rate": 1.3237614413821266e-06, + "loss": 0.4654, + "step": 9805 + }, + { + "epoch": 0.77, + "grad_norm": 1.7920914462520112, + "learning_rate": 1.322899538301533e-06, + "loss": 0.4747, + "step": 9806 + }, + { + "epoch": 0.77, + "grad_norm": 1.843531405480349, + "learning_rate": 1.3220378731245308e-06, + "loss": 0.4622, + "step": 9807 + }, + { + "epoch": 0.77, + "grad_norm": 1.5824400175471236, + "learning_rate": 1.3211764459068688e-06, + "loss": 0.4787, + "step": 9808 + }, + { + "epoch": 0.77, + "grad_norm": 1.6375655512626193, + "learning_rate": 1.3203152567042759e-06, + "loss": 0.4738, + "step": 9809 + }, + { + "epoch": 0.77, + "grad_norm": 1.9648093470506545, + "learning_rate": 1.319454305572473e-06, + "loss": 0.4488, + "step": 9810 + }, + { + "epoch": 0.77, + "grad_norm": 0.541639658549343, + "learning_rate": 1.3185935925671612e-06, + "loss": 0.5024, + "step": 9811 + }, + { + "epoch": 0.77, + "grad_norm": 2.2444307203724163, + "learning_rate": 1.31773311774403e-06, + "loss": 0.3797, + "step": 9812 + }, + { + "epoch": 0.77, + "grad_norm": 1.7301548652235517, + "learning_rate": 1.3168728811587471e-06, + "loss": 0.4219, + "step": 9813 + }, + { + "epoch": 0.77, + "grad_norm": 2.0353222835866855, + "learning_rate": 1.316012882866971e-06, + "loss": 0.4517, + "step": 9814 + }, + { + "epoch": 0.77, + "grad_norm": 1.8052040968909633, + "learning_rate": 1.3151531229243424e-06, + "loss": 0.5088, + "step": 9815 + }, + { + "epoch": 0.77, + "grad_norm": 2.3425719734328476, + "learning_rate": 1.3142936013864876e-06, + "loss": 0.4234, + "step": 9816 + }, + { + "epoch": 0.77, + "grad_norm": 1.6281791239865038, + "learning_rate": 1.3134343183090137e-06, + "loss": 0.4386, + "step": 9817 + }, + { + "epoch": 0.77, + "grad_norm": 1.5342242556977543, + "learning_rate": 1.3125752737475167e-06, + "loss": 0.4532, + "step": 9818 + }, + { + "epoch": 0.77, + "grad_norm": 1.8926627896884447, + "learning_rate": 1.311716467757576e-06, + "loss": 0.449, + "step": 9819 + }, + { + "epoch": 0.77, + "grad_norm": 1.422622734935708, + "learning_rate": 1.3108579003947546e-06, + "loss": 0.3948, + "step": 9820 + }, + { + "epoch": 0.77, + "grad_norm": 0.5855818414106012, + "learning_rate": 1.309999571714603e-06, + "loss": 0.4644, + "step": 9821 + }, + { + "epoch": 0.77, + "grad_norm": 1.488293214863266, + "learning_rate": 1.3091414817726483e-06, + "loss": 0.4581, + "step": 9822 + }, + { + "epoch": 0.77, + "grad_norm": 2.205321705239975, + "learning_rate": 1.308283630624415e-06, + "loss": 0.4253, + "step": 9823 + }, + { + "epoch": 0.77, + "grad_norm": 2.285906483377957, + "learning_rate": 1.3074260183254e-06, + "loss": 0.4629, + "step": 9824 + }, + { + "epoch": 0.77, + "grad_norm": 1.5537878799615814, + "learning_rate": 1.3065686449310922e-06, + "loss": 0.4511, + "step": 9825 + }, + { + "epoch": 0.77, + "grad_norm": 1.7795200422370296, + "learning_rate": 1.3057115104969586e-06, + "loss": 0.3775, + "step": 9826 + }, + { + "epoch": 0.77, + "grad_norm": 3.870723241567607, + "learning_rate": 1.3048546150784609e-06, + "loss": 0.4295, + "step": 9827 + }, + { + "epoch": 0.77, + "grad_norm": 0.5407619162806406, + "learning_rate": 1.3039979587310343e-06, + "loss": 0.4587, + "step": 9828 + }, + { + "epoch": 0.77, + "grad_norm": 1.869630397445088, + "learning_rate": 1.3031415415101039e-06, + "loss": 0.4071, + "step": 9829 + }, + { + "epoch": 0.77, + "grad_norm": 1.965671893502372, + "learning_rate": 1.3022853634710803e-06, + "loss": 0.5198, + "step": 9830 + }, + { + "epoch": 0.77, + "grad_norm": 1.8517644058214042, + "learning_rate": 1.3014294246693565e-06, + "loss": 0.4298, + "step": 9831 + }, + { + "epoch": 0.77, + "grad_norm": 1.931629945425655, + "learning_rate": 1.3005737251603117e-06, + "loss": 0.4668, + "step": 9832 + }, + { + "epoch": 0.77, + "grad_norm": 0.5241974859314537, + "learning_rate": 1.2997182649993068e-06, + "loss": 0.4772, + "step": 9833 + }, + { + "epoch": 0.77, + "grad_norm": 1.7567851186971823, + "learning_rate": 1.298863044241689e-06, + "loss": 0.4202, + "step": 9834 + }, + { + "epoch": 0.77, + "grad_norm": 1.5190000238083787, + "learning_rate": 1.2980080629427904e-06, + "loss": 0.4438, + "step": 9835 + }, + { + "epoch": 0.77, + "grad_norm": 1.588945532635868, + "learning_rate": 1.2971533211579285e-06, + "loss": 0.4602, + "step": 9836 + }, + { + "epoch": 0.77, + "grad_norm": 1.658398718537271, + "learning_rate": 1.296298818942402e-06, + "loss": 0.5749, + "step": 9837 + }, + { + "epoch": 0.77, + "grad_norm": 2.326847104233507, + "learning_rate": 1.2954445563514966e-06, + "loss": 0.4386, + "step": 9838 + }, + { + "epoch": 0.77, + "grad_norm": 2.4916592985201653, + "learning_rate": 1.2945905334404823e-06, + "loss": 0.4342, + "step": 9839 + }, + { + "epoch": 0.77, + "grad_norm": 2.26873847420643, + "learning_rate": 1.2937367502646147e-06, + "loss": 0.4532, + "step": 9840 + }, + { + "epoch": 0.77, + "grad_norm": 0.5683758370996783, + "learning_rate": 1.2928832068791275e-06, + "loss": 0.4822, + "step": 9841 + }, + { + "epoch": 0.77, + "grad_norm": 1.5455769787056626, + "learning_rate": 1.2920299033392492e-06, + "loss": 0.4319, + "step": 9842 + }, + { + "epoch": 0.77, + "grad_norm": 1.9132281596517118, + "learning_rate": 1.2911768397001873e-06, + "loss": 0.4885, + "step": 9843 + }, + { + "epoch": 0.77, + "grad_norm": 1.7972331351230315, + "learning_rate": 1.2903240160171304e-06, + "loss": 0.4187, + "step": 9844 + }, + { + "epoch": 0.77, + "grad_norm": 0.5648386324281731, + "learning_rate": 1.2894714323452568e-06, + "loss": 0.4717, + "step": 9845 + }, + { + "epoch": 0.77, + "grad_norm": 2.1253718957580534, + "learning_rate": 1.288619088739727e-06, + "loss": 0.4887, + "step": 9846 + }, + { + "epoch": 0.77, + "grad_norm": 2.3566837018351228, + "learning_rate": 1.287766985255689e-06, + "loss": 0.4555, + "step": 9847 + }, + { + "epoch": 0.77, + "grad_norm": 0.5524634094666402, + "learning_rate": 1.2869151219482695e-06, + "loss": 0.4535, + "step": 9848 + }, + { + "epoch": 0.77, + "grad_norm": 1.7896042531605219, + "learning_rate": 1.286063498872584e-06, + "loss": 0.4642, + "step": 9849 + }, + { + "epoch": 0.77, + "grad_norm": 1.990259620710734, + "learning_rate": 1.2852121160837323e-06, + "loss": 0.4617, + "step": 9850 + }, + { + "epoch": 0.77, + "grad_norm": 2.172131941735793, + "learning_rate": 1.2843609736367978e-06, + "loss": 0.4756, + "step": 9851 + }, + { + "epoch": 0.77, + "grad_norm": 2.04232448806431, + "learning_rate": 1.2835100715868465e-06, + "loss": 0.4368, + "step": 9852 + }, + { + "epoch": 0.77, + "grad_norm": 2.0409907375930376, + "learning_rate": 1.2826594099889322e-06, + "loss": 0.4469, + "step": 9853 + }, + { + "epoch": 0.77, + "grad_norm": 2.1488936337963698, + "learning_rate": 1.2818089888980906e-06, + "loss": 0.4232, + "step": 9854 + }, + { + "epoch": 0.77, + "grad_norm": 0.5401719741359496, + "learning_rate": 1.2809588083693436e-06, + "loss": 0.4638, + "step": 9855 + }, + { + "epoch": 0.77, + "grad_norm": 2.1293472026315836, + "learning_rate": 1.2801088684576979e-06, + "loss": 0.4272, + "step": 9856 + }, + { + "epoch": 0.77, + "grad_norm": 1.6719735790227794, + "learning_rate": 1.2792591692181393e-06, + "loss": 0.4248, + "step": 9857 + }, + { + "epoch": 0.77, + "grad_norm": 1.784254522689791, + "learning_rate": 1.278409710705648e-06, + "loss": 0.4354, + "step": 9858 + }, + { + "epoch": 0.77, + "grad_norm": 1.5275281126781837, + "learning_rate": 1.277560492975179e-06, + "loss": 0.4255, + "step": 9859 + }, + { + "epoch": 0.77, + "grad_norm": 1.5984211169223819, + "learning_rate": 1.276711516081678e-06, + "loss": 0.4526, + "step": 9860 + }, + { + "epoch": 0.77, + "grad_norm": 3.807004717862255, + "learning_rate": 1.2758627800800677e-06, + "loss": 0.491, + "step": 9861 + }, + { + "epoch": 0.77, + "grad_norm": 1.9576853635297746, + "learning_rate": 1.2750142850252667e-06, + "loss": 0.5021, + "step": 9862 + }, + { + "epoch": 0.77, + "grad_norm": 1.8398768026562364, + "learning_rate": 1.2741660309721677e-06, + "loss": 0.4962, + "step": 9863 + }, + { + "epoch": 0.77, + "grad_norm": 6.199763645472166, + "learning_rate": 1.2733180179756515e-06, + "loss": 0.4189, + "step": 9864 + }, + { + "epoch": 0.77, + "grad_norm": 0.5138608536669774, + "learning_rate": 1.2724702460905853e-06, + "loss": 0.449, + "step": 9865 + }, + { + "epoch": 0.77, + "grad_norm": 0.5622340562567587, + "learning_rate": 1.271622715371818e-06, + "loss": 0.4845, + "step": 9866 + }, + { + "epoch": 0.77, + "grad_norm": 0.5342187282529401, + "learning_rate": 1.2707754258741845e-06, + "loss": 0.4659, + "step": 9867 + }, + { + "epoch": 0.77, + "grad_norm": 1.6251096065496584, + "learning_rate": 1.2699283776525017e-06, + "loss": 0.4797, + "step": 9868 + }, + { + "epoch": 0.78, + "grad_norm": 1.9495037112536284, + "learning_rate": 1.2690815707615727e-06, + "loss": 0.4678, + "step": 9869 + }, + { + "epoch": 0.78, + "grad_norm": 1.6086932940444034, + "learning_rate": 1.268235005256186e-06, + "loss": 0.4446, + "step": 9870 + }, + { + "epoch": 0.78, + "grad_norm": 1.7203885196696374, + "learning_rate": 1.267388681191114e-06, + "loss": 0.4484, + "step": 9871 + }, + { + "epoch": 0.78, + "grad_norm": 1.8644574076252163, + "learning_rate": 1.2665425986211094e-06, + "loss": 0.466, + "step": 9872 + }, + { + "epoch": 0.78, + "grad_norm": 1.7669623573024322, + "learning_rate": 1.2656967576009155e-06, + "loss": 0.422, + "step": 9873 + }, + { + "epoch": 0.78, + "grad_norm": 2.028300996152387, + "learning_rate": 1.2648511581852557e-06, + "loss": 0.4337, + "step": 9874 + }, + { + "epoch": 0.78, + "grad_norm": 1.842818952873279, + "learning_rate": 1.2640058004288402e-06, + "loss": 0.4038, + "step": 9875 + }, + { + "epoch": 0.78, + "grad_norm": 2.1706158749446196, + "learning_rate": 1.263160684386362e-06, + "loss": 0.4027, + "step": 9876 + }, + { + "epoch": 0.78, + "grad_norm": 1.5478624444558307, + "learning_rate": 1.2623158101124995e-06, + "loss": 0.4089, + "step": 9877 + }, + { + "epoch": 0.78, + "grad_norm": 1.4199987812309558, + "learning_rate": 1.2614711776619154e-06, + "loss": 0.4992, + "step": 9878 + }, + { + "epoch": 0.78, + "grad_norm": 1.7965907808414838, + "learning_rate": 1.2606267870892541e-06, + "loss": 0.4579, + "step": 9879 + }, + { + "epoch": 0.78, + "grad_norm": 2.326134044212904, + "learning_rate": 1.259782638449148e-06, + "loss": 0.4601, + "step": 9880 + }, + { + "epoch": 0.78, + "grad_norm": 2.4429646459903784, + "learning_rate": 1.2589387317962126e-06, + "loss": 0.4423, + "step": 9881 + }, + { + "epoch": 0.78, + "grad_norm": 1.849649173950172, + "learning_rate": 1.2580950671850488e-06, + "loss": 0.491, + "step": 9882 + }, + { + "epoch": 0.78, + "grad_norm": 2.05358154404609, + "learning_rate": 1.2572516446702376e-06, + "loss": 0.4952, + "step": 9883 + }, + { + "epoch": 0.78, + "grad_norm": 1.9121105454089926, + "learning_rate": 1.2564084643063485e-06, + "loss": 0.4083, + "step": 9884 + }, + { + "epoch": 0.78, + "grad_norm": 1.7235992118250365, + "learning_rate": 1.255565526147935e-06, + "loss": 0.4742, + "step": 9885 + }, + { + "epoch": 0.78, + "grad_norm": 1.9650005750129307, + "learning_rate": 1.2547228302495334e-06, + "loss": 0.459, + "step": 9886 + }, + { + "epoch": 0.78, + "grad_norm": 0.5334263622553529, + "learning_rate": 1.253880376665667e-06, + "loss": 0.4773, + "step": 9887 + }, + { + "epoch": 0.78, + "grad_norm": 0.5196671081364501, + "learning_rate": 1.2530381654508377e-06, + "loss": 0.4672, + "step": 9888 + }, + { + "epoch": 0.78, + "grad_norm": 4.599797577333411, + "learning_rate": 1.252196196659538e-06, + "loss": 0.471, + "step": 9889 + }, + { + "epoch": 0.78, + "grad_norm": 2.9724167519595683, + "learning_rate": 1.251354470346241e-06, + "loss": 0.4228, + "step": 9890 + }, + { + "epoch": 0.78, + "grad_norm": 1.8391791605091385, + "learning_rate": 1.250512986565408e-06, + "loss": 0.4731, + "step": 9891 + }, + { + "epoch": 0.78, + "grad_norm": 0.5489642498210031, + "learning_rate": 1.2496717453714769e-06, + "loss": 0.4806, + "step": 9892 + }, + { + "epoch": 0.78, + "grad_norm": 1.8260042414007853, + "learning_rate": 1.2488307468188805e-06, + "loss": 0.3926, + "step": 9893 + }, + { + "epoch": 0.78, + "grad_norm": 0.5828042894404579, + "learning_rate": 1.2479899909620258e-06, + "loss": 0.4676, + "step": 9894 + }, + { + "epoch": 0.78, + "grad_norm": 1.7757157675087814, + "learning_rate": 1.2471494778553128e-06, + "loss": 0.4054, + "step": 9895 + }, + { + "epoch": 0.78, + "grad_norm": 4.29806407000764, + "learning_rate": 1.2463092075531158e-06, + "loss": 0.4547, + "step": 9896 + }, + { + "epoch": 0.78, + "grad_norm": 1.6058518900894856, + "learning_rate": 1.245469180109804e-06, + "loss": 0.4906, + "step": 9897 + }, + { + "epoch": 0.78, + "grad_norm": 2.1894937114798796, + "learning_rate": 1.2446293955797262e-06, + "loss": 0.5012, + "step": 9898 + }, + { + "epoch": 0.78, + "grad_norm": 1.7441777520870199, + "learning_rate": 1.2437898540172122e-06, + "loss": 0.5031, + "step": 9899 + }, + { + "epoch": 0.78, + "grad_norm": 0.5323359671712155, + "learning_rate": 1.2429505554765813e-06, + "loss": 0.4729, + "step": 9900 + }, + { + "epoch": 0.78, + "grad_norm": 2.4256854764996074, + "learning_rate": 1.2421115000121347e-06, + "loss": 0.462, + "step": 9901 + }, + { + "epoch": 0.78, + "grad_norm": 1.5698182656364748, + "learning_rate": 1.2412726876781594e-06, + "loss": 0.4525, + "step": 9902 + }, + { + "epoch": 0.78, + "grad_norm": 1.3352994204416662, + "learning_rate": 1.2404341185289226e-06, + "loss": 0.4005, + "step": 9903 + }, + { + "epoch": 0.78, + "grad_norm": 1.9110936550177373, + "learning_rate": 1.2395957926186802e-06, + "loss": 0.4255, + "step": 9904 + }, + { + "epoch": 0.78, + "grad_norm": 0.5794053515373636, + "learning_rate": 1.2387577100016706e-06, + "loss": 0.4853, + "step": 9905 + }, + { + "epoch": 0.78, + "grad_norm": 1.7277781653299185, + "learning_rate": 1.2379198707321189e-06, + "loss": 0.4378, + "step": 9906 + }, + { + "epoch": 0.78, + "grad_norm": 1.9080405587604494, + "learning_rate": 1.237082274864228e-06, + "loss": 0.4573, + "step": 9907 + }, + { + "epoch": 0.78, + "grad_norm": 1.9469390874965564, + "learning_rate": 1.2362449224521905e-06, + "loss": 0.5221, + "step": 9908 + }, + { + "epoch": 0.78, + "grad_norm": 1.6041981928925877, + "learning_rate": 1.2354078135501857e-06, + "loss": 0.398, + "step": 9909 + }, + { + "epoch": 0.78, + "grad_norm": 1.4548058323468214, + "learning_rate": 1.234570948212369e-06, + "loss": 0.4097, + "step": 9910 + }, + { + "epoch": 0.78, + "grad_norm": 1.6171399005921256, + "learning_rate": 1.2337343264928863e-06, + "loss": 0.4321, + "step": 9911 + }, + { + "epoch": 0.78, + "grad_norm": 2.763854713977996, + "learning_rate": 1.232897948445866e-06, + "loss": 0.493, + "step": 9912 + }, + { + "epoch": 0.78, + "grad_norm": 1.5831070085581385, + "learning_rate": 1.2320618141254214e-06, + "loss": 0.4825, + "step": 9913 + }, + { + "epoch": 0.78, + "grad_norm": 1.866635424288943, + "learning_rate": 1.2312259235856471e-06, + "loss": 0.4508, + "step": 9914 + }, + { + "epoch": 0.78, + "grad_norm": 2.9752349673969443, + "learning_rate": 1.2303902768806252e-06, + "loss": 0.4699, + "step": 9915 + }, + { + "epoch": 0.78, + "grad_norm": 1.797462130394774, + "learning_rate": 1.2295548740644213e-06, + "loss": 0.5185, + "step": 9916 + }, + { + "epoch": 0.78, + "grad_norm": 4.304825930292483, + "learning_rate": 1.2287197151910862e-06, + "loss": 0.4356, + "step": 9917 + }, + { + "epoch": 0.78, + "grad_norm": 0.5086793026636335, + "learning_rate": 1.227884800314651e-06, + "loss": 0.4641, + "step": 9918 + }, + { + "epoch": 0.78, + "grad_norm": 1.4596519792437321, + "learning_rate": 1.2270501294891341e-06, + "loss": 0.4481, + "step": 9919 + }, + { + "epoch": 0.78, + "grad_norm": 0.5448801912343616, + "learning_rate": 1.2262157027685384e-06, + "loss": 0.4972, + "step": 9920 + }, + { + "epoch": 0.78, + "grad_norm": 2.2458879647467556, + "learning_rate": 1.2253815202068497e-06, + "loss": 0.4656, + "step": 9921 + }, + { + "epoch": 0.78, + "grad_norm": 2.253077786647236, + "learning_rate": 1.2245475818580404e-06, + "loss": 0.4609, + "step": 9922 + }, + { + "epoch": 0.78, + "grad_norm": 0.5611500626285899, + "learning_rate": 1.2237138877760623e-06, + "loss": 0.4501, + "step": 9923 + }, + { + "epoch": 0.78, + "grad_norm": 0.5840354398186929, + "learning_rate": 1.2228804380148556e-06, + "loss": 0.4863, + "step": 9924 + }, + { + "epoch": 0.78, + "grad_norm": 2.1218853810262748, + "learning_rate": 1.222047232628344e-06, + "loss": 0.4536, + "step": 9925 + }, + { + "epoch": 0.78, + "grad_norm": 2.214266944519692, + "learning_rate": 1.221214271670435e-06, + "loss": 0.4082, + "step": 9926 + }, + { + "epoch": 0.78, + "grad_norm": 1.474122939002315, + "learning_rate": 1.2203815551950165e-06, + "loss": 0.4345, + "step": 9927 + }, + { + "epoch": 0.78, + "grad_norm": 2.104951180310517, + "learning_rate": 1.2195490832559704e-06, + "loss": 0.4321, + "step": 9928 + }, + { + "epoch": 0.78, + "grad_norm": 1.8340179677762505, + "learning_rate": 1.2187168559071517e-06, + "loss": 0.4338, + "step": 9929 + }, + { + "epoch": 0.78, + "grad_norm": 1.9407527022312259, + "learning_rate": 1.2178848732024069e-06, + "loss": 0.4398, + "step": 9930 + }, + { + "epoch": 0.78, + "grad_norm": 1.6665816190566394, + "learning_rate": 1.2170531351955605e-06, + "loss": 0.4085, + "step": 9931 + }, + { + "epoch": 0.78, + "grad_norm": 2.4615416643228762, + "learning_rate": 1.2162216419404289e-06, + "loss": 0.438, + "step": 9932 + }, + { + "epoch": 0.78, + "grad_norm": 1.7694842972383025, + "learning_rate": 1.2153903934908084e-06, + "loss": 0.4936, + "step": 9933 + }, + { + "epoch": 0.78, + "grad_norm": 2.0637775037829873, + "learning_rate": 1.2145593899004777e-06, + "loss": 0.446, + "step": 9934 + }, + { + "epoch": 0.78, + "grad_norm": 3.450364050223724, + "learning_rate": 1.2137286312232022e-06, + "loss": 0.4466, + "step": 9935 + }, + { + "epoch": 0.78, + "grad_norm": 2.2291485954652237, + "learning_rate": 1.2128981175127314e-06, + "loss": 0.4485, + "step": 9936 + }, + { + "epoch": 0.78, + "grad_norm": 1.6169224279208168, + "learning_rate": 1.2120678488227994e-06, + "loss": 0.4509, + "step": 9937 + }, + { + "epoch": 0.78, + "grad_norm": 2.2904789201098508, + "learning_rate": 1.211237825207121e-06, + "loss": 0.4667, + "step": 9938 + }, + { + "epoch": 0.78, + "grad_norm": 1.6779455011223394, + "learning_rate": 1.2104080467193991e-06, + "loss": 0.4756, + "step": 9939 + }, + { + "epoch": 0.78, + "grad_norm": 1.8871182208201456, + "learning_rate": 1.2095785134133187e-06, + "loss": 0.4948, + "step": 9940 + }, + { + "epoch": 0.78, + "grad_norm": 1.7655107105732397, + "learning_rate": 1.208749225342552e-06, + "loss": 0.4383, + "step": 9941 + }, + { + "epoch": 0.78, + "grad_norm": 2.2695138181530354, + "learning_rate": 1.207920182560749e-06, + "loss": 0.4496, + "step": 9942 + }, + { + "epoch": 0.78, + "grad_norm": 1.8135839918449157, + "learning_rate": 1.2070913851215482e-06, + "loss": 0.424, + "step": 9943 + }, + { + "epoch": 0.78, + "grad_norm": 2.0693347817800203, + "learning_rate": 1.2062628330785758e-06, + "loss": 0.4829, + "step": 9944 + }, + { + "epoch": 0.78, + "grad_norm": 1.6251159898150476, + "learning_rate": 1.2054345264854344e-06, + "loss": 0.4375, + "step": 9945 + }, + { + "epoch": 0.78, + "grad_norm": 0.5535407938864633, + "learning_rate": 1.2046064653957147e-06, + "loss": 0.4467, + "step": 9946 + }, + { + "epoch": 0.78, + "grad_norm": 1.384064629267377, + "learning_rate": 1.2037786498629916e-06, + "loss": 0.4314, + "step": 9947 + }, + { + "epoch": 0.78, + "grad_norm": 1.5791156704300235, + "learning_rate": 1.2029510799408256e-06, + "loss": 0.4431, + "step": 9948 + }, + { + "epoch": 0.78, + "grad_norm": 1.6773274329880625, + "learning_rate": 1.2021237556827558e-06, + "loss": 0.4507, + "step": 9949 + }, + { + "epoch": 0.78, + "grad_norm": 1.431598623468071, + "learning_rate": 1.2012966771423112e-06, + "loss": 0.4252, + "step": 9950 + }, + { + "epoch": 0.78, + "grad_norm": 1.6592970970066585, + "learning_rate": 1.2004698443730018e-06, + "loss": 0.4528, + "step": 9951 + }, + { + "epoch": 0.78, + "grad_norm": 1.8403349594604146, + "learning_rate": 1.1996432574283245e-06, + "loss": 0.4828, + "step": 9952 + }, + { + "epoch": 0.78, + "grad_norm": 1.5082034479910564, + "learning_rate": 1.1988169163617557e-06, + "loss": 0.4643, + "step": 9953 + }, + { + "epoch": 0.78, + "grad_norm": 1.7542844335228094, + "learning_rate": 1.1979908212267599e-06, + "loss": 0.4257, + "step": 9954 + }, + { + "epoch": 0.78, + "grad_norm": 0.5690702918212723, + "learning_rate": 1.1971649720767847e-06, + "loss": 0.4725, + "step": 9955 + }, + { + "epoch": 0.78, + "grad_norm": 1.9817945666212584, + "learning_rate": 1.1963393689652603e-06, + "loss": 0.4124, + "step": 9956 + }, + { + "epoch": 0.78, + "grad_norm": 1.8299977612464928, + "learning_rate": 1.1955140119456049e-06, + "loss": 0.48, + "step": 9957 + }, + { + "epoch": 0.78, + "grad_norm": 1.6467384896579371, + "learning_rate": 1.1946889010712143e-06, + "loss": 0.3727, + "step": 9958 + }, + { + "epoch": 0.78, + "grad_norm": 2.3029892016173763, + "learning_rate": 1.193864036395474e-06, + "loss": 0.4493, + "step": 9959 + }, + { + "epoch": 0.78, + "grad_norm": 1.9224826709130185, + "learning_rate": 1.1930394179717513e-06, + "loss": 0.4679, + "step": 9960 + }, + { + "epoch": 0.78, + "grad_norm": 2.2186226574151795, + "learning_rate": 1.1922150458533993e-06, + "loss": 0.4339, + "step": 9961 + }, + { + "epoch": 0.78, + "grad_norm": 2.036283201440223, + "learning_rate": 1.19139092009375e-06, + "loss": 0.4187, + "step": 9962 + }, + { + "epoch": 0.78, + "grad_norm": 1.4004004656923177, + "learning_rate": 1.1905670407461295e-06, + "loss": 0.4193, + "step": 9963 + }, + { + "epoch": 0.78, + "grad_norm": 2.1458357186531694, + "learning_rate": 1.189743407863836e-06, + "loss": 0.4948, + "step": 9964 + }, + { + "epoch": 0.78, + "grad_norm": 1.6180800036755154, + "learning_rate": 1.1889200215001595e-06, + "loss": 0.387, + "step": 9965 + }, + { + "epoch": 0.78, + "grad_norm": 0.5314298802656144, + "learning_rate": 1.188096881708372e-06, + "loss": 0.4666, + "step": 9966 + }, + { + "epoch": 0.78, + "grad_norm": 1.9950293982609388, + "learning_rate": 1.18727398854173e-06, + "loss": 0.4727, + "step": 9967 + }, + { + "epoch": 0.78, + "grad_norm": 1.7834056046502953, + "learning_rate": 1.1864513420534746e-06, + "loss": 0.4238, + "step": 9968 + }, + { + "epoch": 0.78, + "grad_norm": 1.468152471658757, + "learning_rate": 1.1856289422968271e-06, + "loss": 0.4463, + "step": 9969 + }, + { + "epoch": 0.78, + "grad_norm": 1.9138845406716694, + "learning_rate": 1.184806789324997e-06, + "loss": 0.4564, + "step": 9970 + }, + { + "epoch": 0.78, + "grad_norm": 1.8219396894224027, + "learning_rate": 1.1839848831911772e-06, + "loss": 0.4664, + "step": 9971 + }, + { + "epoch": 0.78, + "grad_norm": 1.9908679180317457, + "learning_rate": 1.1831632239485446e-06, + "loss": 0.4371, + "step": 9972 + }, + { + "epoch": 0.78, + "grad_norm": 1.4926536057576503, + "learning_rate": 1.1823418116502566e-06, + "loss": 0.4576, + "step": 9973 + }, + { + "epoch": 0.78, + "grad_norm": 2.951163140894335, + "learning_rate": 1.18152064634946e-06, + "loss": 0.4512, + "step": 9974 + }, + { + "epoch": 0.78, + "grad_norm": 0.5585680077765179, + "learning_rate": 1.180699728099282e-06, + "loss": 0.4833, + "step": 9975 + }, + { + "epoch": 0.78, + "grad_norm": 2.2430595345526263, + "learning_rate": 1.1798790569528356e-06, + "loss": 0.4404, + "step": 9976 + }, + { + "epoch": 0.78, + "grad_norm": 1.8148424587147551, + "learning_rate": 1.179058632963218e-06, + "loss": 0.4307, + "step": 9977 + }, + { + "epoch": 0.78, + "grad_norm": 1.7420169557679468, + "learning_rate": 1.1782384561835052e-06, + "loss": 0.426, + "step": 9978 + }, + { + "epoch": 0.78, + "grad_norm": 2.0582407115018397, + "learning_rate": 1.177418526666768e-06, + "loss": 0.3959, + "step": 9979 + }, + { + "epoch": 0.78, + "grad_norm": 1.6864013496166899, + "learning_rate": 1.1765988444660508e-06, + "loss": 0.4894, + "step": 9980 + }, + { + "epoch": 0.78, + "grad_norm": 1.418173686730568, + "learning_rate": 1.1757794096343862e-06, + "loss": 0.4534, + "step": 9981 + }, + { + "epoch": 0.78, + "grad_norm": 1.7759228502152764, + "learning_rate": 1.1749602222247908e-06, + "loss": 0.4532, + "step": 9982 + }, + { + "epoch": 0.78, + "grad_norm": 1.4779370238483347, + "learning_rate": 1.1741412822902671e-06, + "loss": 0.4359, + "step": 9983 + }, + { + "epoch": 0.78, + "grad_norm": 1.6539670958047419, + "learning_rate": 1.1733225898837958e-06, + "loss": 0.4996, + "step": 9984 + }, + { + "epoch": 0.78, + "grad_norm": 1.7604816210236347, + "learning_rate": 1.172504145058347e-06, + "loss": 0.4669, + "step": 9985 + }, + { + "epoch": 0.78, + "grad_norm": 1.628253010602657, + "learning_rate": 1.1716859478668735e-06, + "loss": 0.4328, + "step": 9986 + }, + { + "epoch": 0.78, + "grad_norm": 1.576661623706323, + "learning_rate": 1.1708679983623122e-06, + "loss": 0.4275, + "step": 9987 + }, + { + "epoch": 0.78, + "grad_norm": 2.5528366063370465, + "learning_rate": 1.1700502965975808e-06, + "loss": 0.4376, + "step": 9988 + }, + { + "epoch": 0.78, + "grad_norm": 0.5575971980865699, + "learning_rate": 1.169232842625585e-06, + "loss": 0.488, + "step": 9989 + }, + { + "epoch": 0.78, + "grad_norm": 0.5605163162011814, + "learning_rate": 1.1684156364992133e-06, + "loss": 0.4654, + "step": 9990 + }, + { + "epoch": 0.78, + "grad_norm": 2.014104133480851, + "learning_rate": 1.1675986782713372e-06, + "loss": 0.49, + "step": 9991 + }, + { + "epoch": 0.78, + "grad_norm": 2.0143676496126663, + "learning_rate": 1.1667819679948145e-06, + "loss": 0.4435, + "step": 9992 + }, + { + "epoch": 0.78, + "grad_norm": 0.5501372127426185, + "learning_rate": 1.1659655057224834e-06, + "loss": 0.4581, + "step": 9993 + }, + { + "epoch": 0.78, + "grad_norm": 1.8008272911397039, + "learning_rate": 1.1651492915071678e-06, + "loss": 0.4275, + "step": 9994 + }, + { + "epoch": 0.78, + "grad_norm": 0.5457069084323105, + "learning_rate": 1.1643333254016765e-06, + "loss": 0.4718, + "step": 9995 + }, + { + "epoch": 0.79, + "grad_norm": 3.647085660820638, + "learning_rate": 1.1635176074588028e-06, + "loss": 0.5012, + "step": 9996 + }, + { + "epoch": 0.79, + "grad_norm": 1.421620859981154, + "learning_rate": 1.1627021377313186e-06, + "loss": 0.4216, + "step": 9997 + }, + { + "epoch": 0.79, + "grad_norm": 4.809655883939099, + "learning_rate": 1.1618869162719887e-06, + "loss": 0.5097, + "step": 9998 + }, + { + "epoch": 0.79, + "grad_norm": 2.29139997838529, + "learning_rate": 1.1610719431335531e-06, + "loss": 0.4347, + "step": 9999 + }, + { + "epoch": 0.79, + "grad_norm": 0.5989818814749023, + "learning_rate": 1.1602572183687411e-06, + "loss": 0.5224, + "step": 10000 + }, + { + "epoch": 0.79, + "grad_norm": 1.2937749957403455, + "learning_rate": 1.159442742030264e-06, + "loss": 0.4216, + "step": 10001 + }, + { + "epoch": 0.79, + "grad_norm": 1.4902885052366017, + "learning_rate": 1.1586285141708176e-06, + "loss": 0.5045, + "step": 10002 + }, + { + "epoch": 0.79, + "grad_norm": 1.7534818875848799, + "learning_rate": 1.157814534843082e-06, + "loss": 0.4205, + "step": 10003 + }, + { + "epoch": 0.79, + "grad_norm": 1.6569326609005501, + "learning_rate": 1.157000804099719e-06, + "loss": 0.4625, + "step": 10004 + }, + { + "epoch": 0.79, + "grad_norm": 2.1153820032184525, + "learning_rate": 1.1561873219933762e-06, + "loss": 0.4792, + "step": 10005 + }, + { + "epoch": 0.79, + "grad_norm": 2.313001344363856, + "learning_rate": 1.1553740885766857e-06, + "loss": 0.4271, + "step": 10006 + }, + { + "epoch": 0.79, + "grad_norm": 2.0491628184246715, + "learning_rate": 1.1545611039022637e-06, + "loss": 0.4403, + "step": 10007 + }, + { + "epoch": 0.79, + "grad_norm": 0.5193909216542221, + "learning_rate": 1.1537483680227058e-06, + "loss": 0.4806, + "step": 10008 + }, + { + "epoch": 0.79, + "grad_norm": 0.5499873585068724, + "learning_rate": 1.1529358809905971e-06, + "loss": 0.4809, + "step": 10009 + }, + { + "epoch": 0.79, + "grad_norm": 0.5125965011781745, + "learning_rate": 1.1521236428585047e-06, + "loss": 0.4527, + "step": 10010 + }, + { + "epoch": 0.79, + "grad_norm": 1.473926970040196, + "learning_rate": 1.1513116536789792e-06, + "loss": 0.4537, + "step": 10011 + }, + { + "epoch": 0.79, + "grad_norm": 1.4358835663442993, + "learning_rate": 1.1504999135045558e-06, + "loss": 0.4445, + "step": 10012 + }, + { + "epoch": 0.79, + "grad_norm": 1.646335808612089, + "learning_rate": 1.1496884223877498e-06, + "loss": 0.4642, + "step": 10013 + }, + { + "epoch": 0.79, + "grad_norm": 1.568675840165188, + "learning_rate": 1.1488771803810684e-06, + "loss": 0.4282, + "step": 10014 + }, + { + "epoch": 0.79, + "grad_norm": 1.5611465754971974, + "learning_rate": 1.1480661875369947e-06, + "loss": 0.4305, + "step": 10015 + }, + { + "epoch": 0.79, + "grad_norm": 1.6794813328280251, + "learning_rate": 1.1472554439080007e-06, + "loss": 0.4677, + "step": 10016 + }, + { + "epoch": 0.79, + "grad_norm": 2.440425831995939, + "learning_rate": 1.1464449495465368e-06, + "loss": 0.4176, + "step": 10017 + }, + { + "epoch": 0.79, + "grad_norm": 2.797709889059092, + "learning_rate": 1.1456347045050463e-06, + "loss": 0.415, + "step": 10018 + }, + { + "epoch": 0.79, + "grad_norm": 1.738777762315388, + "learning_rate": 1.144824708835947e-06, + "loss": 0.489, + "step": 10019 + }, + { + "epoch": 0.79, + "grad_norm": 1.7811877513908148, + "learning_rate": 1.1440149625916458e-06, + "loss": 0.4076, + "step": 10020 + }, + { + "epoch": 0.79, + "grad_norm": 0.5827299257063068, + "learning_rate": 1.1432054658245323e-06, + "loss": 0.4892, + "step": 10021 + }, + { + "epoch": 0.79, + "grad_norm": 1.8512464599301277, + "learning_rate": 1.1423962185869798e-06, + "loss": 0.44, + "step": 10022 + }, + { + "epoch": 0.79, + "grad_norm": 2.537118812682858, + "learning_rate": 1.1415872209313466e-06, + "loss": 0.4744, + "step": 10023 + }, + { + "epoch": 0.79, + "grad_norm": 1.5649324672683547, + "learning_rate": 1.140778472909972e-06, + "loss": 0.4827, + "step": 10024 + }, + { + "epoch": 0.79, + "grad_norm": 1.6239141164720323, + "learning_rate": 1.1399699745751813e-06, + "loss": 0.444, + "step": 10025 + }, + { + "epoch": 0.79, + "grad_norm": 1.6913938095130232, + "learning_rate": 1.1391617259792836e-06, + "loss": 0.5242, + "step": 10026 + }, + { + "epoch": 0.79, + "grad_norm": 2.5456945082691544, + "learning_rate": 1.1383537271745732e-06, + "loss": 0.4012, + "step": 10027 + }, + { + "epoch": 0.79, + "grad_norm": 1.6273633722324115, + "learning_rate": 1.1375459782133236e-06, + "loss": 0.4785, + "step": 10028 + }, + { + "epoch": 0.79, + "grad_norm": 2.3777686672987817, + "learning_rate": 1.1367384791477964e-06, + "loss": 0.4541, + "step": 10029 + }, + { + "epoch": 0.79, + "grad_norm": 1.8233022556053948, + "learning_rate": 1.1359312300302361e-06, + "loss": 0.4733, + "step": 10030 + }, + { + "epoch": 0.79, + "grad_norm": 0.554051549842203, + "learning_rate": 1.1351242309128713e-06, + "loss": 0.4697, + "step": 10031 + }, + { + "epoch": 0.79, + "grad_norm": 1.4047237450400798, + "learning_rate": 1.1343174818479103e-06, + "loss": 0.4407, + "step": 10032 + }, + { + "epoch": 0.79, + "grad_norm": 0.5839521666223214, + "learning_rate": 1.133510982887553e-06, + "loss": 0.4754, + "step": 10033 + }, + { + "epoch": 0.79, + "grad_norm": 2.292550186325647, + "learning_rate": 1.132704734083978e-06, + "loss": 0.4588, + "step": 10034 + }, + { + "epoch": 0.79, + "grad_norm": 1.5203647903507407, + "learning_rate": 1.1318987354893463e-06, + "loss": 0.4117, + "step": 10035 + }, + { + "epoch": 0.79, + "grad_norm": 2.0278095392008386, + "learning_rate": 1.1310929871558068e-06, + "loss": 0.4154, + "step": 10036 + }, + { + "epoch": 0.79, + "grad_norm": 1.501440224635936, + "learning_rate": 1.1302874891354893e-06, + "loss": 0.4418, + "step": 10037 + }, + { + "epoch": 0.79, + "grad_norm": 2.0775485183665126, + "learning_rate": 1.1294822414805106e-06, + "loss": 0.5021, + "step": 10038 + }, + { + "epoch": 0.79, + "grad_norm": 1.6244385112750148, + "learning_rate": 1.1286772442429667e-06, + "loss": 0.4278, + "step": 10039 + }, + { + "epoch": 0.79, + "grad_norm": 1.5100642143750571, + "learning_rate": 1.12787249747494e-06, + "loss": 0.4635, + "step": 10040 + }, + { + "epoch": 0.79, + "grad_norm": 1.5164894743984914, + "learning_rate": 1.127068001228498e-06, + "loss": 0.4437, + "step": 10041 + }, + { + "epoch": 0.79, + "grad_norm": 1.8078510962786887, + "learning_rate": 1.1262637555556905e-06, + "loss": 0.4517, + "step": 10042 + }, + { + "epoch": 0.79, + "grad_norm": 1.5671661342163807, + "learning_rate": 1.12545976050855e-06, + "loss": 0.4318, + "step": 10043 + }, + { + "epoch": 0.79, + "grad_norm": 0.5310635909925292, + "learning_rate": 1.1246560161390925e-06, + "loss": 0.4857, + "step": 10044 + }, + { + "epoch": 0.79, + "grad_norm": 1.8098884360179892, + "learning_rate": 1.1238525224993241e-06, + "loss": 0.4506, + "step": 10045 + }, + { + "epoch": 0.79, + "grad_norm": 2.029640331758974, + "learning_rate": 1.1230492796412258e-06, + "loss": 0.4725, + "step": 10046 + }, + { + "epoch": 0.79, + "grad_norm": 0.5222749375809408, + "learning_rate": 1.1222462876167684e-06, + "loss": 0.4676, + "step": 10047 + }, + { + "epoch": 0.79, + "grad_norm": 1.8984792153519288, + "learning_rate": 1.1214435464779006e-06, + "loss": 0.4544, + "step": 10048 + }, + { + "epoch": 0.79, + "grad_norm": 0.5707293588161633, + "learning_rate": 1.1206410562765647e-06, + "loss": 0.4921, + "step": 10049 + }, + { + "epoch": 0.79, + "grad_norm": 1.7187449274734654, + "learning_rate": 1.1198388170646758e-06, + "loss": 0.4653, + "step": 10050 + }, + { + "epoch": 0.79, + "grad_norm": 1.741097488078969, + "learning_rate": 1.119036828894141e-06, + "loss": 0.4321, + "step": 10051 + }, + { + "epoch": 0.79, + "grad_norm": 2.8068443278287956, + "learning_rate": 1.118235091816844e-06, + "loss": 0.4569, + "step": 10052 + }, + { + "epoch": 0.79, + "grad_norm": 2.0535043314844557, + "learning_rate": 1.1174336058846608e-06, + "loss": 0.4297, + "step": 10053 + }, + { + "epoch": 0.79, + "grad_norm": 0.5286886783440068, + "learning_rate": 1.1166323711494438e-06, + "loss": 0.4638, + "step": 10054 + }, + { + "epoch": 0.79, + "grad_norm": 1.6937354309953243, + "learning_rate": 1.1158313876630311e-06, + "loss": 0.4444, + "step": 10055 + }, + { + "epoch": 0.79, + "grad_norm": 1.8765475475668605, + "learning_rate": 1.1150306554772472e-06, + "loss": 0.4612, + "step": 10056 + }, + { + "epoch": 0.79, + "grad_norm": 2.992382484442274, + "learning_rate": 1.1142301746438978e-06, + "loss": 0.4235, + "step": 10057 + }, + { + "epoch": 0.79, + "grad_norm": 1.9812451679340213, + "learning_rate": 1.113429945214774e-06, + "loss": 0.3851, + "step": 10058 + }, + { + "epoch": 0.79, + "grad_norm": 1.4420701126305544, + "learning_rate": 1.1126299672416474e-06, + "loss": 0.4877, + "step": 10059 + }, + { + "epoch": 0.79, + "grad_norm": 1.8827501000418154, + "learning_rate": 1.111830240776276e-06, + "loss": 0.4571, + "step": 10060 + }, + { + "epoch": 0.79, + "grad_norm": 1.9510646900373567, + "learning_rate": 1.1110307658704023e-06, + "loss": 0.4637, + "step": 10061 + }, + { + "epoch": 0.79, + "grad_norm": 2.343497489191553, + "learning_rate": 1.1102315425757514e-06, + "loss": 0.4265, + "step": 10062 + }, + { + "epoch": 0.79, + "grad_norm": 1.461065753717907, + "learning_rate": 1.1094325709440306e-06, + "loss": 0.4333, + "step": 10063 + }, + { + "epoch": 0.79, + "grad_norm": 1.9664785611775422, + "learning_rate": 1.1086338510269324e-06, + "loss": 0.4431, + "step": 10064 + }, + { + "epoch": 0.79, + "grad_norm": 1.8027441426511144, + "learning_rate": 1.107835382876134e-06, + "loss": 0.4497, + "step": 10065 + }, + { + "epoch": 0.79, + "grad_norm": 1.9146061241076822, + "learning_rate": 1.1070371665432955e-06, + "loss": 0.4388, + "step": 10066 + }, + { + "epoch": 0.79, + "grad_norm": 6.802076654544321, + "learning_rate": 1.1062392020800571e-06, + "loss": 0.4759, + "step": 10067 + }, + { + "epoch": 0.79, + "grad_norm": 1.986980764309152, + "learning_rate": 1.1054414895380504e-06, + "loss": 0.4353, + "step": 10068 + }, + { + "epoch": 0.79, + "grad_norm": 2.175705275420327, + "learning_rate": 1.1046440289688859e-06, + "loss": 0.4657, + "step": 10069 + }, + { + "epoch": 0.79, + "grad_norm": 4.279797579439347, + "learning_rate": 1.1038468204241553e-06, + "loss": 0.4458, + "step": 10070 + }, + { + "epoch": 0.79, + "grad_norm": 0.5203828842706305, + "learning_rate": 1.1030498639554388e-06, + "loss": 0.4524, + "step": 10071 + }, + { + "epoch": 0.79, + "grad_norm": 1.9083944104413655, + "learning_rate": 1.1022531596142978e-06, + "loss": 0.4589, + "step": 10072 + }, + { + "epoch": 0.79, + "grad_norm": 1.4602612627777611, + "learning_rate": 1.1014567074522804e-06, + "loss": 0.407, + "step": 10073 + }, + { + "epoch": 0.79, + "grad_norm": 0.5200842627646898, + "learning_rate": 1.1006605075209127e-06, + "loss": 0.4651, + "step": 10074 + }, + { + "epoch": 0.79, + "grad_norm": 0.527062461962958, + "learning_rate": 1.0998645598717088e-06, + "loss": 0.4612, + "step": 10075 + }, + { + "epoch": 0.79, + "grad_norm": 1.9121978114494584, + "learning_rate": 1.099068864556166e-06, + "loss": 0.4242, + "step": 10076 + }, + { + "epoch": 0.79, + "grad_norm": 1.598748133599777, + "learning_rate": 1.0982734216257663e-06, + "loss": 0.4468, + "step": 10077 + }, + { + "epoch": 0.79, + "grad_norm": 3.644147805144959, + "learning_rate": 1.0974782311319705e-06, + "loss": 0.4153, + "step": 10078 + }, + { + "epoch": 0.79, + "grad_norm": 1.6746698043604378, + "learning_rate": 1.0966832931262266e-06, + "loss": 0.4945, + "step": 10079 + }, + { + "epoch": 0.79, + "grad_norm": 1.6062556421542185, + "learning_rate": 1.09588860765997e-06, + "loss": 0.4372, + "step": 10080 + }, + { + "epoch": 0.79, + "grad_norm": 1.9156149015828148, + "learning_rate": 1.0950941747846123e-06, + "loss": 0.4627, + "step": 10081 + }, + { + "epoch": 0.79, + "grad_norm": 0.5239708024677558, + "learning_rate": 1.0942999945515542e-06, + "loss": 0.4788, + "step": 10082 + }, + { + "epoch": 0.79, + "grad_norm": 1.7085165876123345, + "learning_rate": 1.093506067012175e-06, + "loss": 0.435, + "step": 10083 + }, + { + "epoch": 0.79, + "grad_norm": 0.5268933168649017, + "learning_rate": 1.092712392217845e-06, + "loss": 0.4544, + "step": 10084 + }, + { + "epoch": 0.79, + "grad_norm": 5.223037871441057, + "learning_rate": 1.0919189702199106e-06, + "loss": 0.4325, + "step": 10085 + }, + { + "epoch": 0.79, + "grad_norm": 0.5422378232099807, + "learning_rate": 1.0911258010697084e-06, + "loss": 0.4839, + "step": 10086 + }, + { + "epoch": 0.79, + "grad_norm": 1.3670030364387182, + "learning_rate": 1.0903328848185502e-06, + "loss": 0.4075, + "step": 10087 + }, + { + "epoch": 0.79, + "grad_norm": 1.4758455755496591, + "learning_rate": 1.0895402215177425e-06, + "loss": 0.4537, + "step": 10088 + }, + { + "epoch": 0.79, + "grad_norm": 1.2712702885966063, + "learning_rate": 1.0887478112185656e-06, + "loss": 0.3955, + "step": 10089 + }, + { + "epoch": 0.79, + "grad_norm": 0.5708667573810321, + "learning_rate": 1.0879556539722892e-06, + "loss": 0.4911, + "step": 10090 + }, + { + "epoch": 0.79, + "grad_norm": 1.8508735844326745, + "learning_rate": 1.0871637498301641e-06, + "loss": 0.4596, + "step": 10091 + }, + { + "epoch": 0.79, + "grad_norm": 2.5825759227327696, + "learning_rate": 1.0863720988434257e-06, + "loss": 0.4687, + "step": 10092 + }, + { + "epoch": 0.79, + "grad_norm": 0.531806573215688, + "learning_rate": 1.0855807010632941e-06, + "loss": 0.45, + "step": 10093 + }, + { + "epoch": 0.79, + "grad_norm": 0.5500343390307103, + "learning_rate": 1.0847895565409694e-06, + "loss": 0.4556, + "step": 10094 + }, + { + "epoch": 0.79, + "grad_norm": 0.5167681809806384, + "learning_rate": 1.0839986653276385e-06, + "loss": 0.4665, + "step": 10095 + }, + { + "epoch": 0.79, + "grad_norm": 1.7299713853242789, + "learning_rate": 1.083208027474471e-06, + "loss": 0.4988, + "step": 10096 + }, + { + "epoch": 0.79, + "grad_norm": 0.5516558890094574, + "learning_rate": 1.0824176430326217e-06, + "loss": 0.4862, + "step": 10097 + }, + { + "epoch": 0.79, + "grad_norm": 2.5045410156574004, + "learning_rate": 1.081627512053225e-06, + "loss": 0.4933, + "step": 10098 + }, + { + "epoch": 0.79, + "grad_norm": 1.9727140551739775, + "learning_rate": 1.0808376345874021e-06, + "loss": 0.4661, + "step": 10099 + }, + { + "epoch": 0.79, + "grad_norm": 1.6996984291407797, + "learning_rate": 1.0800480106862575e-06, + "loss": 0.4508, + "step": 10100 + }, + { + "epoch": 0.79, + "grad_norm": 2.0836489190082763, + "learning_rate": 1.0792586404008788e-06, + "loss": 0.4154, + "step": 10101 + }, + { + "epoch": 0.79, + "grad_norm": 1.8713811642073703, + "learning_rate": 1.0784695237823368e-06, + "loss": 0.4478, + "step": 10102 + }, + { + "epoch": 0.79, + "grad_norm": 5.34651316275778, + "learning_rate": 1.0776806608816863e-06, + "loss": 0.4779, + "step": 10103 + }, + { + "epoch": 0.79, + "grad_norm": 1.468786105798487, + "learning_rate": 1.0768920517499681e-06, + "loss": 0.4599, + "step": 10104 + }, + { + "epoch": 0.79, + "grad_norm": 1.7699723849202884, + "learning_rate": 1.0761036964382004e-06, + "loss": 0.4194, + "step": 10105 + }, + { + "epoch": 0.79, + "grad_norm": 1.704743010951251, + "learning_rate": 1.0753155949973903e-06, + "loss": 0.4728, + "step": 10106 + }, + { + "epoch": 0.79, + "grad_norm": 1.9431877626586613, + "learning_rate": 1.074527747478527e-06, + "loss": 0.4507, + "step": 10107 + }, + { + "epoch": 0.79, + "grad_norm": 1.6056503841010683, + "learning_rate": 1.0737401539325848e-06, + "loss": 0.4602, + "step": 10108 + }, + { + "epoch": 0.79, + "grad_norm": 2.379884067229351, + "learning_rate": 1.0729528144105171e-06, + "loss": 0.4448, + "step": 10109 + }, + { + "epoch": 0.79, + "grad_norm": 2.164551580461938, + "learning_rate": 1.0721657289632654e-06, + "loss": 0.5092, + "step": 10110 + }, + { + "epoch": 0.79, + "grad_norm": 0.5196280155115981, + "learning_rate": 1.0713788976417522e-06, + "loss": 0.4763, + "step": 10111 + }, + { + "epoch": 0.79, + "grad_norm": 0.569126974211081, + "learning_rate": 1.0705923204968855e-06, + "loss": 0.4598, + "step": 10112 + }, + { + "epoch": 0.79, + "grad_norm": 3.3847303267708413, + "learning_rate": 1.0698059975795566e-06, + "loss": 0.4451, + "step": 10113 + }, + { + "epoch": 0.79, + "grad_norm": 2.450786934867951, + "learning_rate": 1.0690199289406355e-06, + "loss": 0.4562, + "step": 10114 + }, + { + "epoch": 0.79, + "grad_norm": 0.5355269025012972, + "learning_rate": 1.0682341146309854e-06, + "loss": 0.4853, + "step": 10115 + }, + { + "epoch": 0.79, + "grad_norm": 1.4649698208335638, + "learning_rate": 1.0674485547014435e-06, + "loss": 0.4431, + "step": 10116 + }, + { + "epoch": 0.79, + "grad_norm": 1.8715216101613363, + "learning_rate": 1.0666632492028367e-06, + "loss": 0.4191, + "step": 10117 + }, + { + "epoch": 0.79, + "grad_norm": 0.5481972737754428, + "learning_rate": 1.0658781981859694e-06, + "loss": 0.4611, + "step": 10118 + }, + { + "epoch": 0.79, + "grad_norm": 0.5617909821057572, + "learning_rate": 1.0650934017016396e-06, + "loss": 0.4777, + "step": 10119 + }, + { + "epoch": 0.79, + "grad_norm": 7.594774739197863, + "learning_rate": 1.0643088598006174e-06, + "loss": 0.4439, + "step": 10120 + }, + { + "epoch": 0.79, + "grad_norm": 1.7465037877129606, + "learning_rate": 1.0635245725336647e-06, + "loss": 0.5007, + "step": 10121 + }, + { + "epoch": 0.79, + "grad_norm": 1.8449610142401933, + "learning_rate": 1.06274053995152e-06, + "loss": 0.4208, + "step": 10122 + }, + { + "epoch": 0.8, + "grad_norm": 1.7391144333403765, + "learning_rate": 1.061956762104913e-06, + "loss": 0.4399, + "step": 10123 + }, + { + "epoch": 0.8, + "grad_norm": 1.74180599198221, + "learning_rate": 1.0611732390445534e-06, + "loss": 0.4471, + "step": 10124 + }, + { + "epoch": 0.8, + "grad_norm": 1.545775071966047, + "learning_rate": 1.0603899708211312e-06, + "loss": 0.4709, + "step": 10125 + }, + { + "epoch": 0.8, + "grad_norm": 1.5968866404031512, + "learning_rate": 1.0596069574853246e-06, + "loss": 0.4572, + "step": 10126 + }, + { + "epoch": 0.8, + "grad_norm": 1.8624107754856385, + "learning_rate": 1.0588241990877924e-06, + "loss": 0.4588, + "step": 10127 + }, + { + "epoch": 0.8, + "grad_norm": 9.650270285665199, + "learning_rate": 1.0580416956791805e-06, + "loss": 0.4794, + "step": 10128 + }, + { + "epoch": 0.8, + "grad_norm": 3.595881396622812, + "learning_rate": 1.0572594473101134e-06, + "loss": 0.4619, + "step": 10129 + }, + { + "epoch": 0.8, + "grad_norm": 1.5992030755187427, + "learning_rate": 1.0564774540312016e-06, + "loss": 0.3955, + "step": 10130 + }, + { + "epoch": 0.8, + "grad_norm": 2.2050110777391208, + "learning_rate": 1.0556957158930397e-06, + "loss": 0.4798, + "step": 10131 + }, + { + "epoch": 0.8, + "grad_norm": 1.8822566815302626, + "learning_rate": 1.054914232946207e-06, + "loss": 0.4292, + "step": 10132 + }, + { + "epoch": 0.8, + "grad_norm": 1.8888032067142406, + "learning_rate": 1.0541330052412612e-06, + "loss": 0.3733, + "step": 10133 + }, + { + "epoch": 0.8, + "grad_norm": 5.122169954676104, + "learning_rate": 1.0533520328287466e-06, + "loss": 0.4389, + "step": 10134 + }, + { + "epoch": 0.8, + "grad_norm": 0.5848357495276764, + "learning_rate": 1.0525713157591955e-06, + "loss": 0.4672, + "step": 10135 + }, + { + "epoch": 0.8, + "grad_norm": 2.7891813849447913, + "learning_rate": 1.0517908540831146e-06, + "loss": 0.4424, + "step": 10136 + }, + { + "epoch": 0.8, + "grad_norm": 1.6355553467586057, + "learning_rate": 1.0510106478510006e-06, + "loss": 0.4461, + "step": 10137 + }, + { + "epoch": 0.8, + "grad_norm": 1.7127352407495429, + "learning_rate": 1.050230697113332e-06, + "loss": 0.4776, + "step": 10138 + }, + { + "epoch": 0.8, + "grad_norm": 0.5268855595120363, + "learning_rate": 1.0494510019205716e-06, + "loss": 0.4742, + "step": 10139 + }, + { + "epoch": 0.8, + "grad_norm": 1.4560115757899093, + "learning_rate": 1.0486715623231625e-06, + "loss": 0.4095, + "step": 10140 + }, + { + "epoch": 0.8, + "grad_norm": 1.8363434091402573, + "learning_rate": 1.047892378371534e-06, + "loss": 0.4253, + "step": 10141 + }, + { + "epoch": 0.8, + "grad_norm": 1.599107681893172, + "learning_rate": 1.0471134501160983e-06, + "loss": 0.4325, + "step": 10142 + }, + { + "epoch": 0.8, + "grad_norm": 2.280237265088869, + "learning_rate": 1.046334777607253e-06, + "loss": 0.4547, + "step": 10143 + }, + { + "epoch": 0.8, + "grad_norm": 1.6280701462092735, + "learning_rate": 1.0455563608953738e-06, + "loss": 0.4477, + "step": 10144 + }, + { + "epoch": 0.8, + "grad_norm": 7.5957231272663535, + "learning_rate": 1.0447782000308255e-06, + "loss": 0.486, + "step": 10145 + }, + { + "epoch": 0.8, + "grad_norm": 0.5661373243191992, + "learning_rate": 1.0440002950639533e-06, + "loss": 0.4782, + "step": 10146 + }, + { + "epoch": 0.8, + "grad_norm": 2.116309319991897, + "learning_rate": 1.0432226460450874e-06, + "loss": 0.4346, + "step": 10147 + }, + { + "epoch": 0.8, + "grad_norm": 2.563210296679415, + "learning_rate": 1.042445253024541e-06, + "loss": 0.4381, + "step": 10148 + }, + { + "epoch": 0.8, + "grad_norm": 1.4462553222118308, + "learning_rate": 1.0416681160526072e-06, + "loss": 0.4943, + "step": 10149 + }, + { + "epoch": 0.8, + "grad_norm": 0.5557270660788208, + "learning_rate": 1.0408912351795707e-06, + "loss": 0.4693, + "step": 10150 + }, + { + "epoch": 0.8, + "grad_norm": 0.5732995197259659, + "learning_rate": 1.0401146104556909e-06, + "loss": 0.4755, + "step": 10151 + }, + { + "epoch": 0.8, + "grad_norm": 0.5629590446885313, + "learning_rate": 1.0393382419312164e-06, + "loss": 0.4815, + "step": 10152 + }, + { + "epoch": 0.8, + "grad_norm": 2.0746227246568245, + "learning_rate": 1.0385621296563741e-06, + "loss": 0.4695, + "step": 10153 + }, + { + "epoch": 0.8, + "grad_norm": 0.5774929791279447, + "learning_rate": 1.0377862736813826e-06, + "loss": 0.4666, + "step": 10154 + }, + { + "epoch": 0.8, + "grad_norm": 2.250722500552263, + "learning_rate": 1.037010674056435e-06, + "loss": 0.4436, + "step": 10155 + }, + { + "epoch": 0.8, + "grad_norm": 0.5348842727797053, + "learning_rate": 1.0362353308317135e-06, + "loss": 0.4692, + "step": 10156 + }, + { + "epoch": 0.8, + "grad_norm": 0.5830933933544387, + "learning_rate": 1.035460244057378e-06, + "loss": 0.4945, + "step": 10157 + }, + { + "epoch": 0.8, + "grad_norm": 1.8278123751738586, + "learning_rate": 1.03468541378358e-06, + "loss": 0.3926, + "step": 10158 + }, + { + "epoch": 0.8, + "grad_norm": 1.703471935626578, + "learning_rate": 1.0339108400604497e-06, + "loss": 0.4468, + "step": 10159 + }, + { + "epoch": 0.8, + "grad_norm": 1.800743670222422, + "learning_rate": 1.0331365229380986e-06, + "loss": 0.4373, + "step": 10160 + }, + { + "epoch": 0.8, + "grad_norm": 1.5186875366033328, + "learning_rate": 1.0323624624666246e-06, + "loss": 0.4571, + "step": 10161 + }, + { + "epoch": 0.8, + "grad_norm": 2.0680486511731706, + "learning_rate": 1.0315886586961094e-06, + "loss": 0.4938, + "step": 10162 + }, + { + "epoch": 0.8, + "grad_norm": 0.5702533126976862, + "learning_rate": 1.030815111676618e-06, + "loss": 0.4813, + "step": 10163 + }, + { + "epoch": 0.8, + "grad_norm": 1.8372242516366455, + "learning_rate": 1.0300418214581953e-06, + "loss": 0.473, + "step": 10164 + }, + { + "epoch": 0.8, + "grad_norm": 3.3402608279790478, + "learning_rate": 1.029268788090873e-06, + "loss": 0.469, + "step": 10165 + }, + { + "epoch": 0.8, + "grad_norm": 1.5423028570717174, + "learning_rate": 1.0284960116246663e-06, + "loss": 0.485, + "step": 10166 + }, + { + "epoch": 0.8, + "grad_norm": 1.8477678663288564, + "learning_rate": 1.027723492109573e-06, + "loss": 0.4616, + "step": 10167 + }, + { + "epoch": 0.8, + "grad_norm": 0.5457813744984346, + "learning_rate": 1.0269512295955725e-06, + "loss": 0.4729, + "step": 10168 + }, + { + "epoch": 0.8, + "grad_norm": 2.2194939557053823, + "learning_rate": 1.0261792241326285e-06, + "loss": 0.3903, + "step": 10169 + }, + { + "epoch": 0.8, + "grad_norm": 1.984197579332957, + "learning_rate": 1.0254074757706927e-06, + "loss": 0.4394, + "step": 10170 + }, + { + "epoch": 0.8, + "grad_norm": 0.5415309075118928, + "learning_rate": 1.0246359845596927e-06, + "loss": 0.4477, + "step": 10171 + }, + { + "epoch": 0.8, + "grad_norm": 2.0898133796663094, + "learning_rate": 1.023864750549544e-06, + "loss": 0.4675, + "step": 10172 + }, + { + "epoch": 0.8, + "grad_norm": 1.9707821038854907, + "learning_rate": 1.0230937737901447e-06, + "loss": 0.445, + "step": 10173 + }, + { + "epoch": 0.8, + "grad_norm": 2.356601087943694, + "learning_rate": 1.0223230543313772e-06, + "loss": 0.4358, + "step": 10174 + }, + { + "epoch": 0.8, + "grad_norm": 2.1414530842857524, + "learning_rate": 1.0215525922231028e-06, + "loss": 0.4515, + "step": 10175 + }, + { + "epoch": 0.8, + "grad_norm": 1.755855608755755, + "learning_rate": 1.0207823875151718e-06, + "loss": 0.4537, + "step": 10176 + }, + { + "epoch": 0.8, + "grad_norm": 2.3748645557419277, + "learning_rate": 1.0200124402574146e-06, + "loss": 0.4621, + "step": 10177 + }, + { + "epoch": 0.8, + "grad_norm": 1.8587984269672981, + "learning_rate": 1.0192427504996471e-06, + "loss": 0.4121, + "step": 10178 + }, + { + "epoch": 0.8, + "grad_norm": 2.2371725035879613, + "learning_rate": 1.018473318291665e-06, + "loss": 0.4422, + "step": 10179 + }, + { + "epoch": 0.8, + "grad_norm": 1.5643178196292167, + "learning_rate": 1.0177041436832508e-06, + "loss": 0.4394, + "step": 10180 + }, + { + "epoch": 0.8, + "grad_norm": 1.76954558248805, + "learning_rate": 1.0169352267241694e-06, + "loss": 0.3916, + "step": 10181 + }, + { + "epoch": 0.8, + "grad_norm": 3.8456602351273816, + "learning_rate": 1.016166567464168e-06, + "loss": 0.4585, + "step": 10182 + }, + { + "epoch": 0.8, + "grad_norm": 1.9360083198211315, + "learning_rate": 1.0153981659529793e-06, + "loss": 0.4816, + "step": 10183 + }, + { + "epoch": 0.8, + "grad_norm": 1.9887583504601396, + "learning_rate": 1.0146300222403139e-06, + "loss": 0.4551, + "step": 10184 + }, + { + "epoch": 0.8, + "grad_norm": 1.6136260607012085, + "learning_rate": 1.0138621363758755e-06, + "loss": 0.4433, + "step": 10185 + }, + { + "epoch": 0.8, + "grad_norm": 2.386772755302714, + "learning_rate": 1.0130945084093412e-06, + "loss": 0.4134, + "step": 10186 + }, + { + "epoch": 0.8, + "grad_norm": 1.630934718784969, + "learning_rate": 1.0123271383903776e-06, + "loss": 0.4558, + "step": 10187 + }, + { + "epoch": 0.8, + "grad_norm": 1.4249852265575416, + "learning_rate": 1.0115600263686292e-06, + "loss": 0.4352, + "step": 10188 + }, + { + "epoch": 0.8, + "grad_norm": 14.472719227417778, + "learning_rate": 1.0107931723937326e-06, + "loss": 0.4985, + "step": 10189 + }, + { + "epoch": 0.8, + "grad_norm": 5.563726940326743, + "learning_rate": 1.0100265765152973e-06, + "loss": 0.4142, + "step": 10190 + }, + { + "epoch": 0.8, + "grad_norm": 1.7536737593022746, + "learning_rate": 1.0092602387829231e-06, + "loss": 0.456, + "step": 10191 + }, + { + "epoch": 0.8, + "grad_norm": 4.914078101950237, + "learning_rate": 1.008494159246191e-06, + "loss": 0.4663, + "step": 10192 + }, + { + "epoch": 0.8, + "grad_norm": 1.69250129495352, + "learning_rate": 1.0077283379546653e-06, + "loss": 0.4502, + "step": 10193 + }, + { + "epoch": 0.8, + "grad_norm": 0.5617546777973564, + "learning_rate": 1.0069627749578946e-06, + "loss": 0.4516, + "step": 10194 + }, + { + "epoch": 0.8, + "grad_norm": 0.535590358477303, + "learning_rate": 1.0061974703054078e-06, + "loss": 0.4522, + "step": 10195 + }, + { + "epoch": 0.8, + "grad_norm": 1.812779821157032, + "learning_rate": 1.00543242404672e-06, + "loss": 0.4317, + "step": 10196 + }, + { + "epoch": 0.8, + "grad_norm": 1.7600752864028086, + "learning_rate": 1.004667636231329e-06, + "loss": 0.3929, + "step": 10197 + }, + { + "epoch": 0.8, + "grad_norm": 0.543665390703115, + "learning_rate": 1.003903106908717e-06, + "loss": 0.4597, + "step": 10198 + }, + { + "epoch": 0.8, + "grad_norm": 1.467794311687954, + "learning_rate": 1.0031388361283446e-06, + "loss": 0.4686, + "step": 10199 + }, + { + "epoch": 0.8, + "grad_norm": 1.761914508718542, + "learning_rate": 1.0023748239396608e-06, + "loss": 0.4778, + "step": 10200 + }, + { + "epoch": 0.8, + "grad_norm": 1.5866404601386759, + "learning_rate": 1.0016110703920966e-06, + "loss": 0.4683, + "step": 10201 + }, + { + "epoch": 0.8, + "grad_norm": 1.8504312362624236, + "learning_rate": 1.0008475755350655e-06, + "loss": 0.4326, + "step": 10202 + }, + { + "epoch": 0.8, + "grad_norm": 1.7673290957430992, + "learning_rate": 1.000084339417966e-06, + "loss": 0.43, + "step": 10203 + }, + { + "epoch": 0.8, + "grad_norm": 1.4637852832878877, + "learning_rate": 9.993213620901748e-07, + "loss": 0.4636, + "step": 10204 + }, + { + "epoch": 0.8, + "grad_norm": 1.8306487204951127, + "learning_rate": 9.985586436010602e-07, + "loss": 0.4476, + "step": 10205 + }, + { + "epoch": 0.8, + "grad_norm": 1.5479792378137343, + "learning_rate": 9.977961839999656e-07, + "loss": 0.4287, + "step": 10206 + }, + { + "epoch": 0.8, + "grad_norm": 2.2335205284329724, + "learning_rate": 9.970339833362224e-07, + "loss": 0.4882, + "step": 10207 + }, + { + "epoch": 0.8, + "grad_norm": 1.7333382107062971, + "learning_rate": 9.962720416591443e-07, + "loss": 0.3799, + "step": 10208 + }, + { + "epoch": 0.8, + "grad_norm": 2.0320695475648916, + "learning_rate": 9.955103590180287e-07, + "loss": 0.4507, + "step": 10209 + }, + { + "epoch": 0.8, + "grad_norm": 0.5718613082461896, + "learning_rate": 9.947489354621525e-07, + "loss": 0.4728, + "step": 10210 + }, + { + "epoch": 0.8, + "grad_norm": 1.8458296128320457, + "learning_rate": 9.939877710407814e-07, + "loss": 0.501, + "step": 10211 + }, + { + "epoch": 0.8, + "grad_norm": 2.175564821698887, + "learning_rate": 9.932268658031607e-07, + "loss": 0.4355, + "step": 10212 + }, + { + "epoch": 0.8, + "grad_norm": 2.131333332923354, + "learning_rate": 9.924662197985212e-07, + "loss": 0.4889, + "step": 10213 + }, + { + "epoch": 0.8, + "grad_norm": 1.6126079770849477, + "learning_rate": 9.917058330760742e-07, + "loss": 0.4724, + "step": 10214 + }, + { + "epoch": 0.8, + "grad_norm": 2.8113405055326264, + "learning_rate": 9.909457056850159e-07, + "loss": 0.4848, + "step": 10215 + }, + { + "epoch": 0.8, + "grad_norm": 2.0699596809126817, + "learning_rate": 9.901858376745254e-07, + "loss": 0.4372, + "step": 10216 + }, + { + "epoch": 0.8, + "grad_norm": 5.057509902737468, + "learning_rate": 9.894262290937667e-07, + "loss": 0.4431, + "step": 10217 + }, + { + "epoch": 0.8, + "grad_norm": 2.174193223974668, + "learning_rate": 9.886668799918853e-07, + "loss": 0.4692, + "step": 10218 + }, + { + "epoch": 0.8, + "grad_norm": 3.042545122643649, + "learning_rate": 9.879077904180067e-07, + "loss": 0.4941, + "step": 10219 + }, + { + "epoch": 0.8, + "grad_norm": 0.6030123872528108, + "learning_rate": 9.87148960421248e-07, + "loss": 0.4786, + "step": 10220 + }, + { + "epoch": 0.8, + "grad_norm": 1.8409863573274394, + "learning_rate": 9.863903900507011e-07, + "loss": 0.4517, + "step": 10221 + }, + { + "epoch": 0.8, + "grad_norm": 0.5449606679771221, + "learning_rate": 9.856320793554463e-07, + "loss": 0.4546, + "step": 10222 + }, + { + "epoch": 0.8, + "grad_norm": 1.9986242920848671, + "learning_rate": 9.848740283845427e-07, + "loss": 0.44, + "step": 10223 + }, + { + "epoch": 0.8, + "grad_norm": 2.54743276200625, + "learning_rate": 9.841162371870388e-07, + "loss": 0.4556, + "step": 10224 + }, + { + "epoch": 0.8, + "grad_norm": 1.8319330720045848, + "learning_rate": 9.833587058119603e-07, + "loss": 0.4388, + "step": 10225 + }, + { + "epoch": 0.8, + "grad_norm": 1.677950802104717, + "learning_rate": 9.82601434308319e-07, + "loss": 0.4177, + "step": 10226 + }, + { + "epoch": 0.8, + "grad_norm": 0.5409169777748895, + "learning_rate": 9.81844422725109e-07, + "loss": 0.4715, + "step": 10227 + }, + { + "epoch": 0.8, + "grad_norm": 2.2197829296552825, + "learning_rate": 9.810876711113087e-07, + "loss": 0.3991, + "step": 10228 + }, + { + "epoch": 0.8, + "grad_norm": 1.7928168081855287, + "learning_rate": 9.803311795158804e-07, + "loss": 0.4411, + "step": 10229 + }, + { + "epoch": 0.8, + "grad_norm": 1.6313863757789209, + "learning_rate": 9.795749479877647e-07, + "loss": 0.47, + "step": 10230 + }, + { + "epoch": 0.8, + "grad_norm": 1.9420718221522324, + "learning_rate": 9.788189765758904e-07, + "loss": 0.4694, + "step": 10231 + }, + { + "epoch": 0.8, + "grad_norm": 1.6796659715781856, + "learning_rate": 9.780632653291676e-07, + "loss": 0.4478, + "step": 10232 + }, + { + "epoch": 0.8, + "grad_norm": 3.2581204677731908, + "learning_rate": 9.773078142964926e-07, + "loss": 0.4327, + "step": 10233 + }, + { + "epoch": 0.8, + "grad_norm": 1.8283624635012161, + "learning_rate": 9.765526235267375e-07, + "loss": 0.465, + "step": 10234 + }, + { + "epoch": 0.8, + "grad_norm": 1.6861377692948445, + "learning_rate": 9.757976930687645e-07, + "loss": 0.4414, + "step": 10235 + }, + { + "epoch": 0.8, + "grad_norm": 1.8839549032528675, + "learning_rate": 9.750430229714163e-07, + "loss": 0.4785, + "step": 10236 + }, + { + "epoch": 0.8, + "grad_norm": 1.8219899020139274, + "learning_rate": 9.742886132835195e-07, + "loss": 0.4568, + "step": 10237 + }, + { + "epoch": 0.8, + "grad_norm": 2.5542221458246024, + "learning_rate": 9.735344640538842e-07, + "loss": 0.4604, + "step": 10238 + }, + { + "epoch": 0.8, + "grad_norm": 0.5291885483800136, + "learning_rate": 9.727805753312998e-07, + "loss": 0.4669, + "step": 10239 + }, + { + "epoch": 0.8, + "grad_norm": 1.3028155652579894, + "learning_rate": 9.720269471645455e-07, + "loss": 0.4357, + "step": 10240 + }, + { + "epoch": 0.8, + "grad_norm": 1.55399613064397, + "learning_rate": 9.712735796023782e-07, + "loss": 0.4161, + "step": 10241 + }, + { + "epoch": 0.8, + "grad_norm": 0.5789348097584666, + "learning_rate": 9.705204726935391e-07, + "loss": 0.4795, + "step": 10242 + }, + { + "epoch": 0.8, + "grad_norm": 1.6750220213970957, + "learning_rate": 9.69767626486755e-07, + "loss": 0.4721, + "step": 10243 + }, + { + "epoch": 0.8, + "grad_norm": 2.4960982561925205, + "learning_rate": 9.690150410307342e-07, + "loss": 0.4055, + "step": 10244 + }, + { + "epoch": 0.8, + "grad_norm": 2.002021275009819, + "learning_rate": 9.682627163741653e-07, + "loss": 0.4747, + "step": 10245 + }, + { + "epoch": 0.8, + "grad_norm": 3.452102472246556, + "learning_rate": 9.675106525657252e-07, + "loss": 0.454, + "step": 10246 + }, + { + "epoch": 0.8, + "grad_norm": 0.5461733158292336, + "learning_rate": 9.667588496540703e-07, + "loss": 0.5022, + "step": 10247 + }, + { + "epoch": 0.8, + "grad_norm": 2.0046401364207433, + "learning_rate": 9.660073076878419e-07, + "loss": 0.4587, + "step": 10248 + }, + { + "epoch": 0.8, + "grad_norm": 1.884001614234772, + "learning_rate": 9.652560267156647e-07, + "loss": 0.4558, + "step": 10249 + }, + { + "epoch": 0.8, + "grad_norm": 2.2975346515012585, + "learning_rate": 9.645050067861433e-07, + "loss": 0.482, + "step": 10250 + }, + { + "epoch": 0.81, + "grad_norm": 3.4077653826448504, + "learning_rate": 9.63754247947869e-07, + "loss": 0.4467, + "step": 10251 + }, + { + "epoch": 0.81, + "grad_norm": 1.7385819549657786, + "learning_rate": 9.630037502494143e-07, + "loss": 0.5104, + "step": 10252 + }, + { + "epoch": 0.81, + "grad_norm": 1.8809381461273962, + "learning_rate": 9.62253513739338e-07, + "loss": 0.4465, + "step": 10253 + }, + { + "epoch": 0.81, + "grad_norm": 1.9198252244367415, + "learning_rate": 9.615035384661743e-07, + "loss": 0.4409, + "step": 10254 + }, + { + "epoch": 0.81, + "grad_norm": 1.571633584458511, + "learning_rate": 9.60753824478451e-07, + "loss": 0.4721, + "step": 10255 + }, + { + "epoch": 0.81, + "grad_norm": 1.6279471394407021, + "learning_rate": 9.600043718246704e-07, + "loss": 0.4298, + "step": 10256 + }, + { + "epoch": 0.81, + "grad_norm": 0.5252276430729529, + "learning_rate": 9.59255180553323e-07, + "loss": 0.483, + "step": 10257 + }, + { + "epoch": 0.81, + "grad_norm": 1.7938880903349763, + "learning_rate": 9.585062507128767e-07, + "loss": 0.4777, + "step": 10258 + }, + { + "epoch": 0.81, + "grad_norm": 1.5651769452791178, + "learning_rate": 9.577575823517904e-07, + "loss": 0.454, + "step": 10259 + }, + { + "epoch": 0.81, + "grad_norm": 2.2160556900538735, + "learning_rate": 9.570091755185024e-07, + "loss": 0.4307, + "step": 10260 + }, + { + "epoch": 0.81, + "grad_norm": 0.5279663375566758, + "learning_rate": 9.5626103026143e-07, + "loss": 0.4615, + "step": 10261 + }, + { + "epoch": 0.81, + "grad_norm": 1.8164623997303837, + "learning_rate": 9.555131466289792e-07, + "loss": 0.4442, + "step": 10262 + }, + { + "epoch": 0.81, + "grad_norm": 3.0924895469259237, + "learning_rate": 9.54765524669537e-07, + "loss": 0.4184, + "step": 10263 + }, + { + "epoch": 0.81, + "grad_norm": 2.130855503962928, + "learning_rate": 9.54018164431475e-07, + "loss": 0.4276, + "step": 10264 + }, + { + "epoch": 0.81, + "grad_norm": 2.3251020690627806, + "learning_rate": 9.532710659631434e-07, + "loss": 0.4154, + "step": 10265 + }, + { + "epoch": 0.81, + "grad_norm": 2.0217996353995815, + "learning_rate": 9.5252422931288e-07, + "loss": 0.4511, + "step": 10266 + }, + { + "epoch": 0.81, + "grad_norm": 2.1967683957335824, + "learning_rate": 9.517776545290041e-07, + "loss": 0.4048, + "step": 10267 + }, + { + "epoch": 0.81, + "grad_norm": 1.9186375248370817, + "learning_rate": 9.510313416598199e-07, + "loss": 0.4508, + "step": 10268 + }, + { + "epoch": 0.81, + "grad_norm": 2.486553294585594, + "learning_rate": 9.502852907536098e-07, + "loss": 0.5231, + "step": 10269 + }, + { + "epoch": 0.81, + "grad_norm": 1.8377520954849014, + "learning_rate": 9.495395018586423e-07, + "loss": 0.4577, + "step": 10270 + }, + { + "epoch": 0.81, + "grad_norm": 1.8711266526113088, + "learning_rate": 9.487939750231728e-07, + "loss": 0.4018, + "step": 10271 + }, + { + "epoch": 0.81, + "grad_norm": 2.4857814936425724, + "learning_rate": 9.480487102954322e-07, + "loss": 0.4872, + "step": 10272 + }, + { + "epoch": 0.81, + "grad_norm": 1.555337572606532, + "learning_rate": 9.473037077236413e-07, + "loss": 0.4755, + "step": 10273 + }, + { + "epoch": 0.81, + "grad_norm": 2.2174744059355738, + "learning_rate": 9.465589673559955e-07, + "loss": 0.424, + "step": 10274 + }, + { + "epoch": 0.81, + "grad_norm": 2.118347666019394, + "learning_rate": 9.458144892406851e-07, + "loss": 0.4992, + "step": 10275 + }, + { + "epoch": 0.81, + "grad_norm": 0.5432466175226598, + "learning_rate": 9.450702734258721e-07, + "loss": 0.463, + "step": 10276 + }, + { + "epoch": 0.81, + "grad_norm": 0.553520440288294, + "learning_rate": 9.44326319959708e-07, + "loss": 0.4794, + "step": 10277 + }, + { + "epoch": 0.81, + "grad_norm": 1.9463426116073292, + "learning_rate": 9.435826288903255e-07, + "loss": 0.4552, + "step": 10278 + }, + { + "epoch": 0.81, + "grad_norm": 2.213634467337301, + "learning_rate": 9.42839200265842e-07, + "loss": 0.4496, + "step": 10279 + }, + { + "epoch": 0.81, + "grad_norm": 1.7451035748962336, + "learning_rate": 9.420960341343533e-07, + "loss": 0.5214, + "step": 10280 + }, + { + "epoch": 0.81, + "grad_norm": 4.978678131634297, + "learning_rate": 9.413531305439428e-07, + "loss": 0.4649, + "step": 10281 + }, + { + "epoch": 0.81, + "grad_norm": 0.5653734423112623, + "learning_rate": 9.406104895426755e-07, + "loss": 0.4623, + "step": 10282 + }, + { + "epoch": 0.81, + "grad_norm": 1.705521175647822, + "learning_rate": 9.39868111178599e-07, + "loss": 0.4061, + "step": 10283 + }, + { + "epoch": 0.81, + "grad_norm": 2.249732587833357, + "learning_rate": 9.391259954997456e-07, + "loss": 0.4423, + "step": 10284 + }, + { + "epoch": 0.81, + "grad_norm": 1.7557518466595086, + "learning_rate": 9.383841425541268e-07, + "loss": 0.4089, + "step": 10285 + }, + { + "epoch": 0.81, + "grad_norm": 1.7884258835580211, + "learning_rate": 9.376425523897408e-07, + "loss": 0.469, + "step": 10286 + }, + { + "epoch": 0.81, + "grad_norm": 1.8110514647561224, + "learning_rate": 9.369012250545672e-07, + "loss": 0.475, + "step": 10287 + }, + { + "epoch": 0.81, + "grad_norm": 0.5295637057735154, + "learning_rate": 9.361601605965708e-07, + "loss": 0.4674, + "step": 10288 + }, + { + "epoch": 0.81, + "grad_norm": 0.5959763283997073, + "learning_rate": 9.354193590636945e-07, + "loss": 0.4581, + "step": 10289 + }, + { + "epoch": 0.81, + "grad_norm": 2.4084902483424653, + "learning_rate": 9.346788205038682e-07, + "loss": 0.46, + "step": 10290 + }, + { + "epoch": 0.81, + "grad_norm": 3.0642615119811527, + "learning_rate": 9.33938544965004e-07, + "loss": 0.4268, + "step": 10291 + }, + { + "epoch": 0.81, + "grad_norm": 1.9193524895874488, + "learning_rate": 9.331985324949988e-07, + "loss": 0.3976, + "step": 10292 + }, + { + "epoch": 0.81, + "grad_norm": 1.7494483074184157, + "learning_rate": 9.324587831417253e-07, + "loss": 0.4731, + "step": 10293 + }, + { + "epoch": 0.81, + "grad_norm": 2.0953088980565147, + "learning_rate": 9.317192969530492e-07, + "loss": 0.4876, + "step": 10294 + }, + { + "epoch": 0.81, + "grad_norm": 1.59158419850571, + "learning_rate": 9.309800739768137e-07, + "loss": 0.4666, + "step": 10295 + }, + { + "epoch": 0.81, + "grad_norm": 1.3949582223503207, + "learning_rate": 9.302411142608431e-07, + "loss": 0.4396, + "step": 10296 + }, + { + "epoch": 0.81, + "grad_norm": 6.514271748395274, + "learning_rate": 9.295024178529488e-07, + "loss": 0.406, + "step": 10297 + }, + { + "epoch": 0.81, + "grad_norm": 3.1462051896753236, + "learning_rate": 9.287639848009228e-07, + "loss": 0.4275, + "step": 10298 + }, + { + "epoch": 0.81, + "grad_norm": 1.6638335972914338, + "learning_rate": 9.280258151525429e-07, + "loss": 0.4812, + "step": 10299 + }, + { + "epoch": 0.81, + "grad_norm": 0.5485263012621387, + "learning_rate": 9.272879089555642e-07, + "loss": 0.4746, + "step": 10300 + }, + { + "epoch": 0.81, + "grad_norm": 1.4822644987320721, + "learning_rate": 9.265502662577303e-07, + "loss": 0.4342, + "step": 10301 + }, + { + "epoch": 0.81, + "grad_norm": 2.3107608682875815, + "learning_rate": 9.258128871067651e-07, + "loss": 0.4548, + "step": 10302 + }, + { + "epoch": 0.81, + "grad_norm": 2.0922364346486573, + "learning_rate": 9.250757715503783e-07, + "loss": 0.4197, + "step": 10303 + }, + { + "epoch": 0.81, + "grad_norm": 2.4410525414933155, + "learning_rate": 9.243389196362568e-07, + "loss": 0.4901, + "step": 10304 + }, + { + "epoch": 0.81, + "grad_norm": 1.7577238296997306, + "learning_rate": 9.23602331412074e-07, + "loss": 0.4418, + "step": 10305 + }, + { + "epoch": 0.81, + "grad_norm": 1.572376132149254, + "learning_rate": 9.228660069254908e-07, + "loss": 0.4062, + "step": 10306 + }, + { + "epoch": 0.81, + "grad_norm": 1.5418114903910072, + "learning_rate": 9.221299462241418e-07, + "loss": 0.4765, + "step": 10307 + }, + { + "epoch": 0.81, + "grad_norm": 1.5789479585277342, + "learning_rate": 9.213941493556522e-07, + "loss": 0.4395, + "step": 10308 + }, + { + "epoch": 0.81, + "grad_norm": 2.6820591129522606, + "learning_rate": 9.206586163676228e-07, + "loss": 0.4351, + "step": 10309 + }, + { + "epoch": 0.81, + "grad_norm": 3.199853441489854, + "learning_rate": 9.199233473076474e-07, + "loss": 0.4287, + "step": 10310 + }, + { + "epoch": 0.81, + "grad_norm": 2.1841568209031816, + "learning_rate": 9.191883422232923e-07, + "loss": 0.4498, + "step": 10311 + }, + { + "epoch": 0.81, + "grad_norm": 1.32579340643818, + "learning_rate": 9.184536011621131e-07, + "loss": 0.4044, + "step": 10312 + }, + { + "epoch": 0.81, + "grad_norm": 1.7108584612237394, + "learning_rate": 9.177191241716471e-07, + "loss": 0.4885, + "step": 10313 + }, + { + "epoch": 0.81, + "grad_norm": 1.5635000937771477, + "learning_rate": 9.169849112994145e-07, + "loss": 0.4314, + "step": 10314 + }, + { + "epoch": 0.81, + "grad_norm": 1.8802450056169515, + "learning_rate": 9.162509625929156e-07, + "loss": 0.483, + "step": 10315 + }, + { + "epoch": 0.81, + "grad_norm": 0.5880258198498999, + "learning_rate": 9.155172780996369e-07, + "loss": 0.4836, + "step": 10316 + }, + { + "epoch": 0.81, + "grad_norm": 1.942595668792461, + "learning_rate": 9.147838578670476e-07, + "loss": 0.4803, + "step": 10317 + }, + { + "epoch": 0.81, + "grad_norm": 1.9728958802542869, + "learning_rate": 9.140507019425981e-07, + "loss": 0.4667, + "step": 10318 + }, + { + "epoch": 0.81, + "grad_norm": 0.5643636428910844, + "learning_rate": 9.13317810373725e-07, + "loss": 0.4786, + "step": 10319 + }, + { + "epoch": 0.81, + "grad_norm": 0.5421649255762295, + "learning_rate": 9.12585183207842e-07, + "loss": 0.4662, + "step": 10320 + }, + { + "epoch": 0.81, + "grad_norm": 0.5524235680919116, + "learning_rate": 9.118528204923505e-07, + "loss": 0.4632, + "step": 10321 + }, + { + "epoch": 0.81, + "grad_norm": 1.7955650505880107, + "learning_rate": 9.111207222746332e-07, + "loss": 0.4266, + "step": 10322 + }, + { + "epoch": 0.81, + "grad_norm": 5.7307945898360115, + "learning_rate": 9.103888886020579e-07, + "loss": 0.4036, + "step": 10323 + }, + { + "epoch": 0.81, + "grad_norm": 1.6035151461432573, + "learning_rate": 9.096573195219704e-07, + "loss": 0.4487, + "step": 10324 + }, + { + "epoch": 0.81, + "grad_norm": 2.1929106941806102, + "learning_rate": 9.089260150817037e-07, + "loss": 0.4624, + "step": 10325 + }, + { + "epoch": 0.81, + "grad_norm": 3.5660181757846314, + "learning_rate": 9.081949753285718e-07, + "loss": 0.4302, + "step": 10326 + }, + { + "epoch": 0.81, + "grad_norm": 1.5867425608741932, + "learning_rate": 9.074642003098721e-07, + "loss": 0.438, + "step": 10327 + }, + { + "epoch": 0.81, + "grad_norm": 1.9704425104710601, + "learning_rate": 9.067336900728846e-07, + "loss": 0.4602, + "step": 10328 + }, + { + "epoch": 0.81, + "grad_norm": 1.5073233971025206, + "learning_rate": 9.060034446648735e-07, + "loss": 0.4338, + "step": 10329 + }, + { + "epoch": 0.81, + "grad_norm": 2.084706068950284, + "learning_rate": 9.052734641330846e-07, + "loss": 0.4191, + "step": 10330 + }, + { + "epoch": 0.81, + "grad_norm": 1.771881004230732, + "learning_rate": 9.045437485247449e-07, + "loss": 0.3819, + "step": 10331 + }, + { + "epoch": 0.81, + "grad_norm": 1.622861835942293, + "learning_rate": 9.038142978870673e-07, + "loss": 0.4189, + "step": 10332 + }, + { + "epoch": 0.81, + "grad_norm": 0.5937708279263001, + "learning_rate": 9.030851122672457e-07, + "loss": 0.4857, + "step": 10333 + }, + { + "epoch": 0.81, + "grad_norm": 2.5642762013169103, + "learning_rate": 9.023561917124596e-07, + "loss": 0.5107, + "step": 10334 + }, + { + "epoch": 0.81, + "grad_norm": 5.87685458201007, + "learning_rate": 9.016275362698662e-07, + "loss": 0.4746, + "step": 10335 + }, + { + "epoch": 0.81, + "grad_norm": 2.0194265904069972, + "learning_rate": 9.008991459866096e-07, + "loss": 0.4218, + "step": 10336 + }, + { + "epoch": 0.81, + "grad_norm": 1.6916732363421423, + "learning_rate": 9.001710209098163e-07, + "loss": 0.4287, + "step": 10337 + }, + { + "epoch": 0.81, + "grad_norm": 1.5632858739406676, + "learning_rate": 8.994431610865945e-07, + "loss": 0.4771, + "step": 10338 + }, + { + "epoch": 0.81, + "grad_norm": 1.5283897132872917, + "learning_rate": 8.987155665640368e-07, + "loss": 0.4633, + "step": 10339 + }, + { + "epoch": 0.81, + "grad_norm": 2.2585277428787034, + "learning_rate": 8.979882373892152e-07, + "loss": 0.4379, + "step": 10340 + }, + { + "epoch": 0.81, + "grad_norm": 1.59364825262942, + "learning_rate": 8.972611736091902e-07, + "loss": 0.4406, + "step": 10341 + }, + { + "epoch": 0.81, + "grad_norm": 1.599927625327477, + "learning_rate": 8.965343752709993e-07, + "loss": 0.3911, + "step": 10342 + }, + { + "epoch": 0.81, + "grad_norm": 1.6899787358683087, + "learning_rate": 8.958078424216676e-07, + "loss": 0.4299, + "step": 10343 + }, + { + "epoch": 0.81, + "grad_norm": 1.9168804904398595, + "learning_rate": 8.95081575108197e-07, + "loss": 0.4807, + "step": 10344 + }, + { + "epoch": 0.81, + "grad_norm": 3.069798021302125, + "learning_rate": 8.943555733775811e-07, + "loss": 0.425, + "step": 10345 + }, + { + "epoch": 0.81, + "grad_norm": 1.3943004014279132, + "learning_rate": 8.936298372767876e-07, + "loss": 0.3952, + "step": 10346 + }, + { + "epoch": 0.81, + "grad_norm": 2.9360949388802995, + "learning_rate": 8.929043668527715e-07, + "loss": 0.3859, + "step": 10347 + }, + { + "epoch": 0.81, + "grad_norm": 1.5787392059588699, + "learning_rate": 8.921791621524706e-07, + "loss": 0.465, + "step": 10348 + }, + { + "epoch": 0.81, + "grad_norm": 2.2700749449816957, + "learning_rate": 8.914542232228041e-07, + "loss": 0.4674, + "step": 10349 + }, + { + "epoch": 0.81, + "grad_norm": 0.5973971838846036, + "learning_rate": 8.907295501106755e-07, + "loss": 0.4775, + "step": 10350 + }, + { + "epoch": 0.81, + "grad_norm": 1.810281379395451, + "learning_rate": 8.900051428629686e-07, + "loss": 0.4809, + "step": 10351 + }, + { + "epoch": 0.81, + "grad_norm": 1.7596631884673937, + "learning_rate": 8.892810015265518e-07, + "loss": 0.4356, + "step": 10352 + }, + { + "epoch": 0.81, + "grad_norm": 2.383423820983331, + "learning_rate": 8.885571261482773e-07, + "loss": 0.4504, + "step": 10353 + }, + { + "epoch": 0.81, + "grad_norm": 3.5293575036812714, + "learning_rate": 8.878335167749797e-07, + "loss": 0.4547, + "step": 10354 + }, + { + "epoch": 0.81, + "grad_norm": 1.5329948584197477, + "learning_rate": 8.871101734534726e-07, + "loss": 0.4444, + "step": 10355 + }, + { + "epoch": 0.81, + "grad_norm": 2.0164031748314137, + "learning_rate": 8.863870962305571e-07, + "loss": 0.4541, + "step": 10356 + }, + { + "epoch": 0.81, + "grad_norm": 2.9878318176941, + "learning_rate": 8.856642851530145e-07, + "loss": 0.4141, + "step": 10357 + }, + { + "epoch": 0.81, + "grad_norm": 0.5456643252558189, + "learning_rate": 8.849417402676124e-07, + "loss": 0.4596, + "step": 10358 + }, + { + "epoch": 0.81, + "grad_norm": 1.8861312861035753, + "learning_rate": 8.842194616210953e-07, + "loss": 0.4527, + "step": 10359 + }, + { + "epoch": 0.81, + "grad_norm": 0.5713905447653566, + "learning_rate": 8.834974492601933e-07, + "loss": 0.4632, + "step": 10360 + }, + { + "epoch": 0.81, + "grad_norm": 1.4134592203611467, + "learning_rate": 8.827757032316242e-07, + "loss": 0.3976, + "step": 10361 + }, + { + "epoch": 0.81, + "grad_norm": 3.4968492898153474, + "learning_rate": 8.820542235820795e-07, + "loss": 0.4906, + "step": 10362 + }, + { + "epoch": 0.81, + "grad_norm": 3.0828079652407085, + "learning_rate": 8.813330103582396e-07, + "loss": 0.463, + "step": 10363 + }, + { + "epoch": 0.81, + "grad_norm": 1.705666856115678, + "learning_rate": 8.806120636067661e-07, + "loss": 0.4246, + "step": 10364 + }, + { + "epoch": 0.81, + "grad_norm": 1.518197682824045, + "learning_rate": 8.798913833743045e-07, + "loss": 0.4563, + "step": 10365 + }, + { + "epoch": 0.81, + "grad_norm": 1.5171383302517183, + "learning_rate": 8.791709697074791e-07, + "loss": 0.4838, + "step": 10366 + }, + { + "epoch": 0.81, + "grad_norm": 1.550740650647119, + "learning_rate": 8.784508226529015e-07, + "loss": 0.3847, + "step": 10367 + }, + { + "epoch": 0.81, + "grad_norm": 1.6963322196408366, + "learning_rate": 8.77730942257164e-07, + "loss": 0.4996, + "step": 10368 + }, + { + "epoch": 0.81, + "grad_norm": 1.7309523301521403, + "learning_rate": 8.770113285668436e-07, + "loss": 0.4089, + "step": 10369 + }, + { + "epoch": 0.81, + "grad_norm": 1.6586128791589392, + "learning_rate": 8.762919816284949e-07, + "loss": 0.37, + "step": 10370 + }, + { + "epoch": 0.81, + "grad_norm": 1.7656722472391493, + "learning_rate": 8.755729014886605e-07, + "loss": 0.5047, + "step": 10371 + }, + { + "epoch": 0.81, + "grad_norm": 0.5611340777647948, + "learning_rate": 8.748540881938644e-07, + "loss": 0.4674, + "step": 10372 + }, + { + "epoch": 0.81, + "grad_norm": 2.969783835894969, + "learning_rate": 8.741355417906116e-07, + "loss": 0.4201, + "step": 10373 + }, + { + "epoch": 0.81, + "grad_norm": 1.9939270554162345, + "learning_rate": 8.73417262325394e-07, + "loss": 0.4472, + "step": 10374 + }, + { + "epoch": 0.81, + "grad_norm": 5.854829268638572, + "learning_rate": 8.726992498446785e-07, + "loss": 0.4519, + "step": 10375 + }, + { + "epoch": 0.81, + "grad_norm": 2.349155445346147, + "learning_rate": 8.719815043949248e-07, + "loss": 0.4784, + "step": 10376 + }, + { + "epoch": 0.81, + "grad_norm": 5.8306678559264045, + "learning_rate": 8.712640260225663e-07, + "loss": 0.4709, + "step": 10377 + }, + { + "epoch": 0.82, + "grad_norm": 1.934961683327888, + "learning_rate": 8.70546814774026e-07, + "loss": 0.4467, + "step": 10378 + }, + { + "epoch": 0.82, + "grad_norm": 1.6903203542219105, + "learning_rate": 8.698298706957015e-07, + "loss": 0.4692, + "step": 10379 + }, + { + "epoch": 0.82, + "grad_norm": 1.7742561693359535, + "learning_rate": 8.691131938339842e-07, + "loss": 0.4619, + "step": 10380 + }, + { + "epoch": 0.82, + "grad_norm": 2.6538450090094314, + "learning_rate": 8.683967842352386e-07, + "loss": 0.5057, + "step": 10381 + }, + { + "epoch": 0.82, + "grad_norm": 1.821025368120122, + "learning_rate": 8.676806419458156e-07, + "loss": 0.4166, + "step": 10382 + }, + { + "epoch": 0.82, + "grad_norm": 3.3437838864374085, + "learning_rate": 8.669647670120496e-07, + "loss": 0.436, + "step": 10383 + }, + { + "epoch": 0.82, + "grad_norm": 3.3016530801643373, + "learning_rate": 8.662491594802563e-07, + "loss": 0.4759, + "step": 10384 + }, + { + "epoch": 0.82, + "grad_norm": 1.343888290083774, + "learning_rate": 8.655338193967367e-07, + "loss": 0.4342, + "step": 10385 + }, + { + "epoch": 0.82, + "grad_norm": 0.5671307345206104, + "learning_rate": 8.648187468077684e-07, + "loss": 0.4856, + "step": 10386 + }, + { + "epoch": 0.82, + "grad_norm": 2.809300376415277, + "learning_rate": 8.641039417596181e-07, + "loss": 0.4111, + "step": 10387 + }, + { + "epoch": 0.82, + "grad_norm": 1.7937873842626946, + "learning_rate": 8.63389404298532e-07, + "loss": 0.4076, + "step": 10388 + }, + { + "epoch": 0.82, + "grad_norm": 0.536953704284807, + "learning_rate": 8.626751344707418e-07, + "loss": 0.4688, + "step": 10389 + }, + { + "epoch": 0.82, + "grad_norm": 2.1994912871828878, + "learning_rate": 8.619611323224563e-07, + "loss": 0.4205, + "step": 10390 + }, + { + "epoch": 0.82, + "grad_norm": 1.4989333246230003, + "learning_rate": 8.612473978998726e-07, + "loss": 0.4447, + "step": 10391 + }, + { + "epoch": 0.82, + "grad_norm": 1.709582729769975, + "learning_rate": 8.605339312491679e-07, + "loss": 0.4478, + "step": 10392 + }, + { + "epoch": 0.82, + "grad_norm": 0.5257103068064729, + "learning_rate": 8.598207324165042e-07, + "loss": 0.4802, + "step": 10393 + }, + { + "epoch": 0.82, + "grad_norm": 2.548270998571477, + "learning_rate": 8.59107801448022e-07, + "loss": 0.4655, + "step": 10394 + }, + { + "epoch": 0.82, + "grad_norm": 1.6897082800647134, + "learning_rate": 8.583951383898464e-07, + "loss": 0.4818, + "step": 10395 + }, + { + "epoch": 0.82, + "grad_norm": 1.8162914781800854, + "learning_rate": 8.576827432880902e-07, + "loss": 0.4079, + "step": 10396 + }, + { + "epoch": 0.82, + "grad_norm": 1.5219332710308813, + "learning_rate": 8.569706161888402e-07, + "loss": 0.4292, + "step": 10397 + }, + { + "epoch": 0.82, + "grad_norm": 2.355615640721105, + "learning_rate": 8.56258757138172e-07, + "loss": 0.4518, + "step": 10398 + }, + { + "epoch": 0.82, + "grad_norm": 1.8559393422617085, + "learning_rate": 8.555471661821413e-07, + "loss": 0.4749, + "step": 10399 + }, + { + "epoch": 0.82, + "grad_norm": 2.511759368637973, + "learning_rate": 8.548358433667886e-07, + "loss": 0.4598, + "step": 10400 + }, + { + "epoch": 0.82, + "grad_norm": 1.8516738310912404, + "learning_rate": 8.541247887381326e-07, + "loss": 0.4902, + "step": 10401 + }, + { + "epoch": 0.82, + "grad_norm": 1.7290323042681175, + "learning_rate": 8.5341400234218e-07, + "loss": 0.5068, + "step": 10402 + }, + { + "epoch": 0.82, + "grad_norm": 1.4562906546670134, + "learning_rate": 8.527034842249165e-07, + "loss": 0.426, + "step": 10403 + }, + { + "epoch": 0.82, + "grad_norm": 1.7327589930599883, + "learning_rate": 8.519932344323139e-07, + "loss": 0.4195, + "step": 10404 + }, + { + "epoch": 0.82, + "grad_norm": 2.4249916623598478, + "learning_rate": 8.51283253010321e-07, + "loss": 0.4659, + "step": 10405 + }, + { + "epoch": 0.82, + "grad_norm": 0.5381119532301166, + "learning_rate": 8.505735400048748e-07, + "loss": 0.4634, + "step": 10406 + }, + { + "epoch": 0.82, + "grad_norm": 1.7241813810346882, + "learning_rate": 8.498640954618926e-07, + "loss": 0.4035, + "step": 10407 + }, + { + "epoch": 0.82, + "grad_norm": 0.5900907041544682, + "learning_rate": 8.491549194272736e-07, + "loss": 0.4742, + "step": 10408 + }, + { + "epoch": 0.82, + "grad_norm": 1.5392694820345831, + "learning_rate": 8.484460119469035e-07, + "loss": 0.4233, + "step": 10409 + }, + { + "epoch": 0.82, + "grad_norm": 1.798576735095686, + "learning_rate": 8.477373730666422e-07, + "loss": 0.4662, + "step": 10410 + }, + { + "epoch": 0.82, + "grad_norm": 4.4661322394029295, + "learning_rate": 8.470290028323441e-07, + "loss": 0.439, + "step": 10411 + }, + { + "epoch": 0.82, + "grad_norm": 3.05340044751119, + "learning_rate": 8.463209012898349e-07, + "loss": 0.5051, + "step": 10412 + }, + { + "epoch": 0.82, + "grad_norm": 0.541423998263109, + "learning_rate": 8.456130684849317e-07, + "loss": 0.4881, + "step": 10413 + }, + { + "epoch": 0.82, + "grad_norm": 1.449413169031657, + "learning_rate": 8.449055044634252e-07, + "loss": 0.4523, + "step": 10414 + }, + { + "epoch": 0.82, + "grad_norm": 3.611654802954893, + "learning_rate": 8.441982092710998e-07, + "loss": 0.4483, + "step": 10415 + }, + { + "epoch": 0.82, + "grad_norm": 4.5398940613810765, + "learning_rate": 8.434911829537129e-07, + "loss": 0.4558, + "step": 10416 + }, + { + "epoch": 0.82, + "grad_norm": 1.4587384967007637, + "learning_rate": 8.427844255570084e-07, + "loss": 0.4266, + "step": 10417 + }, + { + "epoch": 0.82, + "grad_norm": 2.041512412770492, + "learning_rate": 8.420779371267134e-07, + "loss": 0.4424, + "step": 10418 + }, + { + "epoch": 0.82, + "grad_norm": 1.6879387679801476, + "learning_rate": 8.413717177085367e-07, + "loss": 0.4453, + "step": 10419 + }, + { + "epoch": 0.82, + "grad_norm": 2.0737336203807497, + "learning_rate": 8.406657673481711e-07, + "loss": 0.4025, + "step": 10420 + }, + { + "epoch": 0.82, + "grad_norm": 2.6323107702586803, + "learning_rate": 8.399600860912882e-07, + "loss": 0.43, + "step": 10421 + }, + { + "epoch": 0.82, + "grad_norm": 11.93673320629374, + "learning_rate": 8.39254673983545e-07, + "loss": 0.4602, + "step": 10422 + }, + { + "epoch": 0.82, + "grad_norm": 1.7602673856759, + "learning_rate": 8.385495310705821e-07, + "loss": 0.466, + "step": 10423 + }, + { + "epoch": 0.82, + "grad_norm": 2.199525647565027, + "learning_rate": 8.378446573980215e-07, + "loss": 0.4203, + "step": 10424 + }, + { + "epoch": 0.82, + "grad_norm": 1.8172508039527577, + "learning_rate": 8.37140053011466e-07, + "loss": 0.4749, + "step": 10425 + }, + { + "epoch": 0.82, + "grad_norm": 1.555600705663999, + "learning_rate": 8.36435717956503e-07, + "loss": 0.4487, + "step": 10426 + }, + { + "epoch": 0.82, + "grad_norm": 1.5600937912843675, + "learning_rate": 8.357316522787024e-07, + "loss": 0.4818, + "step": 10427 + }, + { + "epoch": 0.82, + "grad_norm": 1.564108326374525, + "learning_rate": 8.35027856023617e-07, + "loss": 0.4336, + "step": 10428 + }, + { + "epoch": 0.82, + "grad_norm": 1.8942923099274132, + "learning_rate": 8.343243292367814e-07, + "loss": 0.5217, + "step": 10429 + }, + { + "epoch": 0.82, + "grad_norm": 0.5478950951530912, + "learning_rate": 8.3362107196371e-07, + "loss": 0.4748, + "step": 10430 + }, + { + "epoch": 0.82, + "grad_norm": 3.7064674315709056, + "learning_rate": 8.329180842499074e-07, + "loss": 0.4475, + "step": 10431 + }, + { + "epoch": 0.82, + "grad_norm": 2.417799205265473, + "learning_rate": 8.322153661408522e-07, + "loss": 0.4613, + "step": 10432 + }, + { + "epoch": 0.82, + "grad_norm": 0.5397847607225453, + "learning_rate": 8.315129176820108e-07, + "loss": 0.4745, + "step": 10433 + }, + { + "epoch": 0.82, + "grad_norm": 1.5819000931362215, + "learning_rate": 8.308107389188303e-07, + "loss": 0.4834, + "step": 10434 + }, + { + "epoch": 0.82, + "grad_norm": 2.067798031180775, + "learning_rate": 8.301088298967425e-07, + "loss": 0.4475, + "step": 10435 + }, + { + "epoch": 0.82, + "grad_norm": 1.584377177634961, + "learning_rate": 8.294071906611573e-07, + "loss": 0.4604, + "step": 10436 + }, + { + "epoch": 0.82, + "grad_norm": 4.267941992448607, + "learning_rate": 8.287058212574712e-07, + "loss": 0.4429, + "step": 10437 + }, + { + "epoch": 0.82, + "grad_norm": 1.66659983254258, + "learning_rate": 8.28004721731061e-07, + "loss": 0.4247, + "step": 10438 + }, + { + "epoch": 0.82, + "grad_norm": 0.5627511120482244, + "learning_rate": 8.273038921272897e-07, + "loss": 0.4869, + "step": 10439 + }, + { + "epoch": 0.82, + "grad_norm": 2.8690265461064324, + "learning_rate": 8.266033324914962e-07, + "loss": 0.4462, + "step": 10440 + }, + { + "epoch": 0.82, + "grad_norm": 1.6310837710931052, + "learning_rate": 8.259030428690079e-07, + "loss": 0.3962, + "step": 10441 + }, + { + "epoch": 0.82, + "grad_norm": 1.7576469164442905, + "learning_rate": 8.252030233051322e-07, + "loss": 0.4488, + "step": 10442 + }, + { + "epoch": 0.82, + "grad_norm": 2.0079198803829397, + "learning_rate": 8.245032738451598e-07, + "loss": 0.4568, + "step": 10443 + }, + { + "epoch": 0.82, + "grad_norm": 1.9395214021196525, + "learning_rate": 8.238037945343647e-07, + "loss": 0.4214, + "step": 10444 + }, + { + "epoch": 0.82, + "grad_norm": 1.7831455162113885, + "learning_rate": 8.231045854179981e-07, + "loss": 0.4366, + "step": 10445 + }, + { + "epoch": 0.82, + "grad_norm": 1.565411501852786, + "learning_rate": 8.224056465413033e-07, + "loss": 0.4624, + "step": 10446 + }, + { + "epoch": 0.82, + "grad_norm": 2.1263394985690365, + "learning_rate": 8.217069779494968e-07, + "loss": 0.4622, + "step": 10447 + }, + { + "epoch": 0.82, + "grad_norm": 1.5564967220278327, + "learning_rate": 8.210085796877842e-07, + "loss": 0.454, + "step": 10448 + }, + { + "epoch": 0.82, + "grad_norm": 1.8633097240743695, + "learning_rate": 8.203104518013477e-07, + "loss": 0.5128, + "step": 10449 + }, + { + "epoch": 0.82, + "grad_norm": 1.6792719783856753, + "learning_rate": 8.196125943353594e-07, + "loss": 0.4169, + "step": 10450 + }, + { + "epoch": 0.82, + "grad_norm": 2.193644710044736, + "learning_rate": 8.189150073349661e-07, + "loss": 0.4513, + "step": 10451 + }, + { + "epoch": 0.82, + "grad_norm": 0.5672486236560986, + "learning_rate": 8.182176908453026e-07, + "loss": 0.4728, + "step": 10452 + }, + { + "epoch": 0.82, + "grad_norm": 1.4385328704938103, + "learning_rate": 8.175206449114841e-07, + "loss": 0.3883, + "step": 10453 + }, + { + "epoch": 0.82, + "grad_norm": 1.7665782052965864, + "learning_rate": 8.168238695786085e-07, + "loss": 0.4588, + "step": 10454 + }, + { + "epoch": 0.82, + "grad_norm": 2.1037436145980655, + "learning_rate": 8.161273648917573e-07, + "loss": 0.3922, + "step": 10455 + }, + { + "epoch": 0.82, + "grad_norm": 2.0230891308927696, + "learning_rate": 8.154311308959911e-07, + "loss": 0.4362, + "step": 10456 + }, + { + "epoch": 0.82, + "grad_norm": 2.1669630473543564, + "learning_rate": 8.147351676363569e-07, + "loss": 0.4574, + "step": 10457 + }, + { + "epoch": 0.82, + "grad_norm": 1.7340495444242308, + "learning_rate": 8.140394751578818e-07, + "loss": 0.4766, + "step": 10458 + }, + { + "epoch": 0.82, + "grad_norm": 0.541187111779539, + "learning_rate": 8.133440535055787e-07, + "loss": 0.467, + "step": 10459 + }, + { + "epoch": 0.82, + "grad_norm": 1.7945687677158546, + "learning_rate": 8.126489027244367e-07, + "loss": 0.4707, + "step": 10460 + }, + { + "epoch": 0.82, + "grad_norm": 1.5948763989509176, + "learning_rate": 8.119540228594331e-07, + "loss": 0.4514, + "step": 10461 + }, + { + "epoch": 0.82, + "grad_norm": 2.0623891789263, + "learning_rate": 8.112594139555258e-07, + "loss": 0.4628, + "step": 10462 + }, + { + "epoch": 0.82, + "grad_norm": 1.5384214814025952, + "learning_rate": 8.105650760576544e-07, + "loss": 0.4185, + "step": 10463 + }, + { + "epoch": 0.82, + "grad_norm": 2.0423502272740617, + "learning_rate": 8.098710092107436e-07, + "loss": 0.4325, + "step": 10464 + }, + { + "epoch": 0.82, + "grad_norm": 1.702679491164323, + "learning_rate": 8.091772134596953e-07, + "loss": 0.416, + "step": 10465 + }, + { + "epoch": 0.82, + "grad_norm": 1.625813032010719, + "learning_rate": 8.084836888494008e-07, + "loss": 0.4229, + "step": 10466 + }, + { + "epoch": 0.82, + "grad_norm": 1.9260244142622303, + "learning_rate": 8.077904354247274e-07, + "loss": 0.4503, + "step": 10467 + }, + { + "epoch": 0.82, + "grad_norm": 1.6132423590534624, + "learning_rate": 8.070974532305281e-07, + "loss": 0.4557, + "step": 10468 + }, + { + "epoch": 0.82, + "grad_norm": 6.324202241150519, + "learning_rate": 8.064047423116395e-07, + "loss": 0.448, + "step": 10469 + }, + { + "epoch": 0.82, + "grad_norm": 1.664319176533905, + "learning_rate": 8.057123027128788e-07, + "loss": 0.4788, + "step": 10470 + }, + { + "epoch": 0.82, + "grad_norm": 0.5482131992208511, + "learning_rate": 8.050201344790442e-07, + "loss": 0.4679, + "step": 10471 + }, + { + "epoch": 0.82, + "grad_norm": 1.775362269388806, + "learning_rate": 8.043282376549194e-07, + "loss": 0.5079, + "step": 10472 + }, + { + "epoch": 0.82, + "grad_norm": 2.0416613657455334, + "learning_rate": 8.036366122852685e-07, + "loss": 0.4902, + "step": 10473 + }, + { + "epoch": 0.82, + "grad_norm": 1.5580741294411549, + "learning_rate": 8.029452584148395e-07, + "loss": 0.4381, + "step": 10474 + }, + { + "epoch": 0.82, + "grad_norm": 1.4791108179839572, + "learning_rate": 8.022541760883635e-07, + "loss": 0.4572, + "step": 10475 + }, + { + "epoch": 0.82, + "grad_norm": 1.9183146545131848, + "learning_rate": 8.015633653505494e-07, + "loss": 0.463, + "step": 10476 + }, + { + "epoch": 0.82, + "grad_norm": 1.9966746415375298, + "learning_rate": 8.00872826246093e-07, + "loss": 0.3996, + "step": 10477 + }, + { + "epoch": 0.82, + "grad_norm": 0.5423296937673391, + "learning_rate": 8.001825588196721e-07, + "loss": 0.4705, + "step": 10478 + }, + { + "epoch": 0.82, + "grad_norm": 2.409019202988364, + "learning_rate": 7.994925631159467e-07, + "loss": 0.3938, + "step": 10479 + }, + { + "epoch": 0.82, + "grad_norm": 1.658614317815432, + "learning_rate": 7.988028391795549e-07, + "loss": 0.4145, + "step": 10480 + }, + { + "epoch": 0.82, + "grad_norm": 0.548083257395584, + "learning_rate": 7.981133870551261e-07, + "loss": 0.4779, + "step": 10481 + }, + { + "epoch": 0.82, + "grad_norm": 1.6787045973936443, + "learning_rate": 7.974242067872628e-07, + "loss": 0.4615, + "step": 10482 + }, + { + "epoch": 0.82, + "grad_norm": 1.6661808469186754, + "learning_rate": 7.967352984205573e-07, + "loss": 0.4011, + "step": 10483 + }, + { + "epoch": 0.82, + "grad_norm": 2.441615375179931, + "learning_rate": 7.960466619995772e-07, + "loss": 0.4184, + "step": 10484 + }, + { + "epoch": 0.82, + "grad_norm": 1.93345218590861, + "learning_rate": 7.953582975688795e-07, + "loss": 0.4368, + "step": 10485 + }, + { + "epoch": 0.82, + "grad_norm": 1.7915889610293307, + "learning_rate": 7.946702051730005e-07, + "loss": 0.4641, + "step": 10486 + }, + { + "epoch": 0.82, + "grad_norm": 2.4432525914788856, + "learning_rate": 7.93982384856457e-07, + "loss": 0.4187, + "step": 10487 + }, + { + "epoch": 0.82, + "grad_norm": 1.6832473353779829, + "learning_rate": 7.932948366637516e-07, + "loss": 0.421, + "step": 10488 + }, + { + "epoch": 0.82, + "grad_norm": 1.840490557424005, + "learning_rate": 7.926075606393662e-07, + "loss": 0.5027, + "step": 10489 + }, + { + "epoch": 0.82, + "grad_norm": 0.5423022956824887, + "learning_rate": 7.9192055682777e-07, + "loss": 0.4685, + "step": 10490 + }, + { + "epoch": 0.82, + "grad_norm": 1.6758603850778215, + "learning_rate": 7.91233825273407e-07, + "loss": 0.5067, + "step": 10491 + }, + { + "epoch": 0.82, + "grad_norm": 2.779663489257076, + "learning_rate": 7.905473660207097e-07, + "loss": 0.4588, + "step": 10492 + }, + { + "epoch": 0.82, + "grad_norm": 1.9660009401909029, + "learning_rate": 7.898611791140915e-07, + "loss": 0.4682, + "step": 10493 + }, + { + "epoch": 0.82, + "grad_norm": 1.9830008667609105, + "learning_rate": 7.891752645979484e-07, + "loss": 0.4845, + "step": 10494 + }, + { + "epoch": 0.82, + "grad_norm": 3.082095451632507, + "learning_rate": 7.884896225166561e-07, + "loss": 0.4598, + "step": 10495 + }, + { + "epoch": 0.82, + "grad_norm": 1.8006279290736278, + "learning_rate": 7.878042529145746e-07, + "loss": 0.417, + "step": 10496 + }, + { + "epoch": 0.82, + "grad_norm": 1.941046344257525, + "learning_rate": 7.871191558360503e-07, + "loss": 0.5308, + "step": 10497 + }, + { + "epoch": 0.82, + "grad_norm": 1.6136279093325179, + "learning_rate": 7.864343313254041e-07, + "loss": 0.4389, + "step": 10498 + }, + { + "epoch": 0.82, + "grad_norm": 1.523167780875091, + "learning_rate": 7.857497794269459e-07, + "loss": 0.4443, + "step": 10499 + }, + { + "epoch": 0.82, + "grad_norm": 1.8694191145655241, + "learning_rate": 7.850655001849616e-07, + "loss": 0.4757, + "step": 10500 + }, + { + "epoch": 0.82, + "grad_norm": 2.0080366411927364, + "learning_rate": 7.843814936437277e-07, + "loss": 0.5011, + "step": 10501 + }, + { + "epoch": 0.82, + "grad_norm": 5.063271031104307, + "learning_rate": 7.836977598474955e-07, + "loss": 0.4765, + "step": 10502 + }, + { + "epoch": 0.82, + "grad_norm": 1.7384229019369908, + "learning_rate": 7.830142988405026e-07, + "loss": 0.4573, + "step": 10503 + }, + { + "epoch": 0.82, + "grad_norm": 0.5509632331126995, + "learning_rate": 7.823311106669685e-07, + "loss": 0.4835, + "step": 10504 + }, + { + "epoch": 0.83, + "grad_norm": 1.9171296323109506, + "learning_rate": 7.816481953710947e-07, + "loss": 0.4022, + "step": 10505 + }, + { + "epoch": 0.83, + "grad_norm": 1.3993772635970991, + "learning_rate": 7.809655529970633e-07, + "loss": 0.4226, + "step": 10506 + }, + { + "epoch": 0.83, + "grad_norm": 1.410184111225007, + "learning_rate": 7.802831835890418e-07, + "loss": 0.4142, + "step": 10507 + }, + { + "epoch": 0.83, + "grad_norm": 0.5820284432433327, + "learning_rate": 7.79601087191178e-07, + "loss": 0.4665, + "step": 10508 + }, + { + "epoch": 0.83, + "grad_norm": 0.5314403847116487, + "learning_rate": 7.789192638476029e-07, + "loss": 0.4649, + "step": 10509 + }, + { + "epoch": 0.83, + "grad_norm": 0.5282810007839512, + "learning_rate": 7.78237713602431e-07, + "loss": 0.4705, + "step": 10510 + }, + { + "epoch": 0.83, + "grad_norm": 2.1412412354295554, + "learning_rate": 7.775564364997545e-07, + "loss": 0.4004, + "step": 10511 + }, + { + "epoch": 0.83, + "grad_norm": 3.0842889626919487, + "learning_rate": 7.768754325836531e-07, + "loss": 0.4309, + "step": 10512 + }, + { + "epoch": 0.83, + "grad_norm": 2.091201168656958, + "learning_rate": 7.761947018981869e-07, + "loss": 0.4496, + "step": 10513 + }, + { + "epoch": 0.83, + "grad_norm": 1.6254376162901858, + "learning_rate": 7.755142444873992e-07, + "loss": 0.4252, + "step": 10514 + }, + { + "epoch": 0.83, + "grad_norm": 1.8434726523973886, + "learning_rate": 7.748340603953114e-07, + "loss": 0.4268, + "step": 10515 + }, + { + "epoch": 0.83, + "grad_norm": 1.5628724128854636, + "learning_rate": 7.741541496659344e-07, + "loss": 0.4491, + "step": 10516 + }, + { + "epoch": 0.83, + "grad_norm": 2.178307691818149, + "learning_rate": 7.734745123432552e-07, + "loss": 0.4543, + "step": 10517 + }, + { + "epoch": 0.83, + "grad_norm": 3.5852885678103217, + "learning_rate": 7.72795148471247e-07, + "loss": 0.4738, + "step": 10518 + }, + { + "epoch": 0.83, + "grad_norm": 2.0463769935032365, + "learning_rate": 7.721160580938603e-07, + "loss": 0.4684, + "step": 10519 + }, + { + "epoch": 0.83, + "grad_norm": 1.9538472192307197, + "learning_rate": 7.714372412550353e-07, + "loss": 0.4667, + "step": 10520 + }, + { + "epoch": 0.83, + "grad_norm": 2.305163123853526, + "learning_rate": 7.707586979986903e-07, + "loss": 0.4575, + "step": 10521 + }, + { + "epoch": 0.83, + "grad_norm": 2.0648033528671728, + "learning_rate": 7.700804283687241e-07, + "loss": 0.4717, + "step": 10522 + }, + { + "epoch": 0.83, + "grad_norm": 2.209266467059784, + "learning_rate": 7.694024324090205e-07, + "loss": 0.4482, + "step": 10523 + }, + { + "epoch": 0.83, + "grad_norm": 1.7943535833943023, + "learning_rate": 7.687247101634449e-07, + "loss": 0.4341, + "step": 10524 + }, + { + "epoch": 0.83, + "grad_norm": 0.5405830788229496, + "learning_rate": 7.680472616758467e-07, + "loss": 0.4759, + "step": 10525 + }, + { + "epoch": 0.83, + "grad_norm": 2.3315461778543876, + "learning_rate": 7.673700869900536e-07, + "loss": 0.4454, + "step": 10526 + }, + { + "epoch": 0.83, + "grad_norm": 1.6853774075410262, + "learning_rate": 7.666931861498788e-07, + "loss": 0.4232, + "step": 10527 + }, + { + "epoch": 0.83, + "grad_norm": 1.5817766033926233, + "learning_rate": 7.66016559199117e-07, + "loss": 0.4506, + "step": 10528 + }, + { + "epoch": 0.83, + "grad_norm": 1.8565430079435985, + "learning_rate": 7.653402061815462e-07, + "loss": 0.4171, + "step": 10529 + }, + { + "epoch": 0.83, + "grad_norm": 1.9356066932565676, + "learning_rate": 7.646641271409233e-07, + "loss": 0.4531, + "step": 10530 + }, + { + "epoch": 0.83, + "grad_norm": 1.3723100509113055, + "learning_rate": 7.639883221209899e-07, + "loss": 0.4384, + "step": 10531 + }, + { + "epoch": 0.83, + "grad_norm": 3.0359078845897915, + "learning_rate": 7.633127911654725e-07, + "loss": 0.415, + "step": 10532 + }, + { + "epoch": 0.83, + "grad_norm": 1.678473949071921, + "learning_rate": 7.626375343180742e-07, + "loss": 0.5014, + "step": 10533 + }, + { + "epoch": 0.83, + "grad_norm": 1.6838617121346133, + "learning_rate": 7.619625516224854e-07, + "loss": 0.4165, + "step": 10534 + }, + { + "epoch": 0.83, + "grad_norm": 2.398085281846617, + "learning_rate": 7.612878431223736e-07, + "loss": 0.4564, + "step": 10535 + }, + { + "epoch": 0.83, + "grad_norm": 1.6462042622435864, + "learning_rate": 7.606134088613954e-07, + "loss": 0.4807, + "step": 10536 + }, + { + "epoch": 0.83, + "grad_norm": 1.5108619678677597, + "learning_rate": 7.599392488831825e-07, + "loss": 0.4381, + "step": 10537 + }, + { + "epoch": 0.83, + "grad_norm": 2.517588977747294, + "learning_rate": 7.59265363231354e-07, + "loss": 0.5155, + "step": 10538 + }, + { + "epoch": 0.83, + "grad_norm": 2.2031479033436567, + "learning_rate": 7.585917519495084e-07, + "loss": 0.4414, + "step": 10539 + }, + { + "epoch": 0.83, + "grad_norm": 1.7215911115858906, + "learning_rate": 7.579184150812297e-07, + "loss": 0.4741, + "step": 10540 + }, + { + "epoch": 0.83, + "grad_norm": 2.114927707807161, + "learning_rate": 7.572453526700785e-07, + "loss": 0.4566, + "step": 10541 + }, + { + "epoch": 0.83, + "grad_norm": 2.809764037563999, + "learning_rate": 7.565725647596028e-07, + "loss": 0.3765, + "step": 10542 + }, + { + "epoch": 0.83, + "grad_norm": 1.888022302806769, + "learning_rate": 7.559000513933312e-07, + "loss": 0.4694, + "step": 10543 + }, + { + "epoch": 0.83, + "grad_norm": 5.3171258366055145, + "learning_rate": 7.552278126147744e-07, + "loss": 0.4423, + "step": 10544 + }, + { + "epoch": 0.83, + "grad_norm": 0.5389018315423004, + "learning_rate": 7.545558484674264e-07, + "loss": 0.4687, + "step": 10545 + }, + { + "epoch": 0.83, + "grad_norm": 1.8164428281260794, + "learning_rate": 7.538841589947599e-07, + "loss": 0.4714, + "step": 10546 + }, + { + "epoch": 0.83, + "grad_norm": 1.5812016049619626, + "learning_rate": 7.532127442402337e-07, + "loss": 0.4561, + "step": 10547 + }, + { + "epoch": 0.83, + "grad_norm": 3.7537437444967034, + "learning_rate": 7.525416042472877e-07, + "loss": 0.4706, + "step": 10548 + }, + { + "epoch": 0.83, + "grad_norm": 0.5567653317781628, + "learning_rate": 7.518707390593449e-07, + "loss": 0.4912, + "step": 10549 + }, + { + "epoch": 0.83, + "grad_norm": 2.6121919323594556, + "learning_rate": 7.512001487198051e-07, + "loss": 0.4809, + "step": 10550 + }, + { + "epoch": 0.83, + "grad_norm": 1.6031926595770527, + "learning_rate": 7.505298332720601e-07, + "loss": 0.4574, + "step": 10551 + }, + { + "epoch": 0.83, + "grad_norm": 2.4578192336840505, + "learning_rate": 7.498597927594748e-07, + "loss": 0.4564, + "step": 10552 + }, + { + "epoch": 0.83, + "grad_norm": 1.8163232364348196, + "learning_rate": 7.49190027225401e-07, + "loss": 0.42, + "step": 10553 + }, + { + "epoch": 0.83, + "grad_norm": 1.4241521212525727, + "learning_rate": 7.485205367131721e-07, + "loss": 0.4741, + "step": 10554 + }, + { + "epoch": 0.83, + "grad_norm": 1.860552172848616, + "learning_rate": 7.478513212661021e-07, + "loss": 0.4345, + "step": 10555 + }, + { + "epoch": 0.83, + "grad_norm": 0.5689153823071813, + "learning_rate": 7.471823809274909e-07, + "loss": 0.4787, + "step": 10556 + }, + { + "epoch": 0.83, + "grad_norm": 2.2615148959792037, + "learning_rate": 7.46513715740615e-07, + "loss": 0.4602, + "step": 10557 + }, + { + "epoch": 0.83, + "grad_norm": 2.254356198053311, + "learning_rate": 7.458453257487369e-07, + "loss": 0.453, + "step": 10558 + }, + { + "epoch": 0.83, + "grad_norm": 1.5706240347495042, + "learning_rate": 7.451772109951016e-07, + "loss": 0.4214, + "step": 10559 + }, + { + "epoch": 0.83, + "grad_norm": 2.302135577753881, + "learning_rate": 7.445093715229356e-07, + "loss": 0.5135, + "step": 10560 + }, + { + "epoch": 0.83, + "grad_norm": 9.317015326389319, + "learning_rate": 7.438418073754456e-07, + "loss": 0.4705, + "step": 10561 + }, + { + "epoch": 0.83, + "grad_norm": 1.55282029670015, + "learning_rate": 7.431745185958223e-07, + "loss": 0.4504, + "step": 10562 + }, + { + "epoch": 0.83, + "grad_norm": 1.48795831693043, + "learning_rate": 7.425075052272396e-07, + "loss": 0.4232, + "step": 10563 + }, + { + "epoch": 0.83, + "grad_norm": 2.096728701319735, + "learning_rate": 7.418407673128514e-07, + "loss": 0.4384, + "step": 10564 + }, + { + "epoch": 0.83, + "grad_norm": 1.5971121808522775, + "learning_rate": 7.411743048957965e-07, + "loss": 0.4235, + "step": 10565 + }, + { + "epoch": 0.83, + "grad_norm": 0.5255303463131702, + "learning_rate": 7.405081180191909e-07, + "loss": 0.4552, + "step": 10566 + }, + { + "epoch": 0.83, + "grad_norm": 3.2773446879108823, + "learning_rate": 7.398422067261396e-07, + "loss": 0.4764, + "step": 10567 + }, + { + "epoch": 0.83, + "grad_norm": 0.5612833892498487, + "learning_rate": 7.391765710597237e-07, + "loss": 0.4762, + "step": 10568 + }, + { + "epoch": 0.83, + "grad_norm": 7.219307298141768, + "learning_rate": 7.38511211063011e-07, + "loss": 0.4522, + "step": 10569 + }, + { + "epoch": 0.83, + "grad_norm": 0.5814998059988247, + "learning_rate": 7.37846126779046e-07, + "loss": 0.4762, + "step": 10570 + }, + { + "epoch": 0.83, + "grad_norm": 1.7357400480376641, + "learning_rate": 7.37181318250863e-07, + "loss": 0.4539, + "step": 10571 + }, + { + "epoch": 0.83, + "grad_norm": 1.417097618210183, + "learning_rate": 7.365167855214711e-07, + "loss": 0.4158, + "step": 10572 + }, + { + "epoch": 0.83, + "grad_norm": 1.6116881681166537, + "learning_rate": 7.358525286338664e-07, + "loss": 0.4058, + "step": 10573 + }, + { + "epoch": 0.83, + "grad_norm": 1.6494477995560357, + "learning_rate": 7.351885476310244e-07, + "loss": 0.4724, + "step": 10574 + }, + { + "epoch": 0.83, + "grad_norm": 1.8962213527443266, + "learning_rate": 7.345248425559043e-07, + "loss": 0.4352, + "step": 10575 + }, + { + "epoch": 0.83, + "grad_norm": 1.6675876274353516, + "learning_rate": 7.338614134514482e-07, + "loss": 0.4667, + "step": 10576 + }, + { + "epoch": 0.83, + "grad_norm": 2.1430755958484577, + "learning_rate": 7.331982603605764e-07, + "loss": 0.4544, + "step": 10577 + }, + { + "epoch": 0.83, + "grad_norm": 2.3960839407345467, + "learning_rate": 7.325353833261956e-07, + "loss": 0.429, + "step": 10578 + }, + { + "epoch": 0.83, + "grad_norm": 1.413680457817274, + "learning_rate": 7.31872782391193e-07, + "loss": 0.4515, + "step": 10579 + }, + { + "epoch": 0.83, + "grad_norm": 2.6001233398623165, + "learning_rate": 7.312104575984397e-07, + "loss": 0.3942, + "step": 10580 + }, + { + "epoch": 0.83, + "grad_norm": 1.4972765224705613, + "learning_rate": 7.305484089907838e-07, + "loss": 0.4443, + "step": 10581 + }, + { + "epoch": 0.83, + "grad_norm": 1.9366631257451463, + "learning_rate": 7.298866366110607e-07, + "loss": 0.511, + "step": 10582 + }, + { + "epoch": 0.83, + "grad_norm": 1.432471435266661, + "learning_rate": 7.292251405020862e-07, + "loss": 0.4247, + "step": 10583 + }, + { + "epoch": 0.83, + "grad_norm": 1.7552394220442418, + "learning_rate": 7.285639207066591e-07, + "loss": 0.4225, + "step": 10584 + }, + { + "epoch": 0.83, + "grad_norm": 1.2150558441291324, + "learning_rate": 7.279029772675572e-07, + "loss": 0.4029, + "step": 10585 + }, + { + "epoch": 0.83, + "grad_norm": 1.6752012027032102, + "learning_rate": 7.272423102275445e-07, + "loss": 0.4161, + "step": 10586 + }, + { + "epoch": 0.83, + "grad_norm": 1.8531403767023835, + "learning_rate": 7.26581919629366e-07, + "loss": 0.4319, + "step": 10587 + }, + { + "epoch": 0.83, + "grad_norm": 0.5229120767429257, + "learning_rate": 7.259218055157458e-07, + "loss": 0.4566, + "step": 10588 + }, + { + "epoch": 0.83, + "grad_norm": 1.8718696909609773, + "learning_rate": 7.252619679293937e-07, + "loss": 0.3962, + "step": 10589 + }, + { + "epoch": 0.83, + "grad_norm": 2.22950862965234, + "learning_rate": 7.246024069130004e-07, + "loss": 0.3811, + "step": 10590 + }, + { + "epoch": 0.83, + "grad_norm": 1.7451374258481898, + "learning_rate": 7.239431225092397e-07, + "loss": 0.4488, + "step": 10591 + }, + { + "epoch": 0.83, + "grad_norm": 0.5238133579910506, + "learning_rate": 7.232841147607639e-07, + "loss": 0.4457, + "step": 10592 + }, + { + "epoch": 0.83, + "grad_norm": 2.1362877755698304, + "learning_rate": 7.226253837102109e-07, + "loss": 0.4429, + "step": 10593 + }, + { + "epoch": 0.83, + "grad_norm": 1.7402655114408039, + "learning_rate": 7.219669294002002e-07, + "loss": 0.3976, + "step": 10594 + }, + { + "epoch": 0.83, + "grad_norm": 1.5434919750140725, + "learning_rate": 7.213087518733341e-07, + "loss": 0.3986, + "step": 10595 + }, + { + "epoch": 0.83, + "grad_norm": 1.8967123775462449, + "learning_rate": 7.206508511721933e-07, + "loss": 0.4552, + "step": 10596 + }, + { + "epoch": 0.83, + "grad_norm": 2.5326731976319565, + "learning_rate": 7.199932273393445e-07, + "loss": 0.4332, + "step": 10597 + }, + { + "epoch": 0.83, + "grad_norm": 1.6829190394143017, + "learning_rate": 7.19335880417335e-07, + "loss": 0.492, + "step": 10598 + }, + { + "epoch": 0.83, + "grad_norm": 1.2953998902493518, + "learning_rate": 7.186788104486936e-07, + "loss": 0.4273, + "step": 10599 + }, + { + "epoch": 0.83, + "grad_norm": 1.858386332745713, + "learning_rate": 7.180220174759345e-07, + "loss": 0.4003, + "step": 10600 + }, + { + "epoch": 0.83, + "grad_norm": 2.003565097261802, + "learning_rate": 7.173655015415465e-07, + "loss": 0.5118, + "step": 10601 + }, + { + "epoch": 0.83, + "grad_norm": 2.3693432797929233, + "learning_rate": 7.167092626880107e-07, + "loss": 0.4899, + "step": 10602 + }, + { + "epoch": 0.83, + "grad_norm": 1.6269481727048096, + "learning_rate": 7.160533009577808e-07, + "loss": 0.438, + "step": 10603 + }, + { + "epoch": 0.83, + "grad_norm": 2.058163598176839, + "learning_rate": 7.153976163932996e-07, + "loss": 0.4285, + "step": 10604 + }, + { + "epoch": 0.83, + "grad_norm": 1.6441881616436609, + "learning_rate": 7.147422090369854e-07, + "loss": 0.4741, + "step": 10605 + }, + { + "epoch": 0.83, + "grad_norm": 0.5615128754202016, + "learning_rate": 7.140870789312471e-07, + "loss": 0.4669, + "step": 10606 + }, + { + "epoch": 0.83, + "grad_norm": 2.3456162186333347, + "learning_rate": 7.134322261184662e-07, + "loss": 0.51, + "step": 10607 + }, + { + "epoch": 0.83, + "grad_norm": 1.7993210771523578, + "learning_rate": 7.127776506410134e-07, + "loss": 0.4509, + "step": 10608 + }, + { + "epoch": 0.83, + "grad_norm": 0.5409627791061821, + "learning_rate": 7.12123352541238e-07, + "loss": 0.4853, + "step": 10609 + }, + { + "epoch": 0.83, + "grad_norm": 1.772569212832061, + "learning_rate": 7.114693318614723e-07, + "loss": 0.4592, + "step": 10610 + }, + { + "epoch": 0.83, + "grad_norm": 0.5598474010629473, + "learning_rate": 7.108155886440321e-07, + "loss": 0.4624, + "step": 10611 + }, + { + "epoch": 0.83, + "grad_norm": 1.7482030922763923, + "learning_rate": 7.101621229312111e-07, + "loss": 0.4464, + "step": 10612 + }, + { + "epoch": 0.83, + "grad_norm": 2.039417855933371, + "learning_rate": 7.095089347652889e-07, + "loss": 0.462, + "step": 10613 + }, + { + "epoch": 0.83, + "grad_norm": 0.5682006385309659, + "learning_rate": 7.088560241885256e-07, + "loss": 0.457, + "step": 10614 + }, + { + "epoch": 0.83, + "grad_norm": 1.9173123853531797, + "learning_rate": 7.082033912431658e-07, + "loss": 0.464, + "step": 10615 + }, + { + "epoch": 0.83, + "grad_norm": 4.262541680649712, + "learning_rate": 7.075510359714305e-07, + "loss": 0.4394, + "step": 10616 + }, + { + "epoch": 0.83, + "grad_norm": 1.4127308117213195, + "learning_rate": 7.068989584155283e-07, + "loss": 0.4472, + "step": 10617 + }, + { + "epoch": 0.83, + "grad_norm": 1.699450627630125, + "learning_rate": 7.062471586176473e-07, + "loss": 0.4832, + "step": 10618 + }, + { + "epoch": 0.83, + "grad_norm": 2.1932431573612186, + "learning_rate": 7.055956366199595e-07, + "loss": 0.4566, + "step": 10619 + }, + { + "epoch": 0.83, + "grad_norm": 1.7379900483867532, + "learning_rate": 7.049443924646138e-07, + "loss": 0.4218, + "step": 10620 + }, + { + "epoch": 0.83, + "grad_norm": 1.2851025365906996, + "learning_rate": 7.042934261937484e-07, + "loss": 0.4281, + "step": 10621 + }, + { + "epoch": 0.83, + "grad_norm": 0.5554322575196475, + "learning_rate": 7.036427378494804e-07, + "loss": 0.4926, + "step": 10622 + }, + { + "epoch": 0.83, + "grad_norm": 0.5213528875887441, + "learning_rate": 7.029923274739053e-07, + "loss": 0.4723, + "step": 10623 + }, + { + "epoch": 0.83, + "grad_norm": 2.199635282276161, + "learning_rate": 7.023421951091053e-07, + "loss": 0.4222, + "step": 10624 + }, + { + "epoch": 0.83, + "grad_norm": 1.915413509761312, + "learning_rate": 7.016923407971432e-07, + "loss": 0.4813, + "step": 10625 + }, + { + "epoch": 0.83, + "grad_norm": 2.09530607964085, + "learning_rate": 7.010427645800655e-07, + "loss": 0.4433, + "step": 10626 + }, + { + "epoch": 0.83, + "grad_norm": 2.4282932174060985, + "learning_rate": 7.003934664998957e-07, + "loss": 0.4588, + "step": 10627 + }, + { + "epoch": 0.83, + "grad_norm": 1.528510681393632, + "learning_rate": 6.997444465986442e-07, + "loss": 0.3815, + "step": 10628 + }, + { + "epoch": 0.83, + "grad_norm": 1.3746625250870323, + "learning_rate": 6.990957049183011e-07, + "loss": 0.4101, + "step": 10629 + }, + { + "epoch": 0.83, + "grad_norm": 1.378387900466592, + "learning_rate": 6.984472415008408e-07, + "loss": 0.4524, + "step": 10630 + }, + { + "epoch": 0.83, + "grad_norm": 3.1088558292974695, + "learning_rate": 6.977990563882165e-07, + "loss": 0.4007, + "step": 10631 + }, + { + "epoch": 0.83, + "grad_norm": 1.8216017181001483, + "learning_rate": 6.971511496223643e-07, + "loss": 0.5203, + "step": 10632 + }, + { + "epoch": 0.84, + "grad_norm": 1.966337273078408, + "learning_rate": 6.965035212452043e-07, + "loss": 0.4615, + "step": 10633 + }, + { + "epoch": 0.84, + "grad_norm": 2.7270422239114036, + "learning_rate": 6.958561712986367e-07, + "loss": 0.4433, + "step": 10634 + }, + { + "epoch": 0.84, + "grad_norm": 1.7750105060863808, + "learning_rate": 6.952090998245453e-07, + "loss": 0.5078, + "step": 10635 + }, + { + "epoch": 0.84, + "grad_norm": 1.6679588176040823, + "learning_rate": 6.945623068647922e-07, + "loss": 0.4258, + "step": 10636 + }, + { + "epoch": 0.84, + "grad_norm": 2.3607718985061528, + "learning_rate": 6.939157924612272e-07, + "loss": 0.3984, + "step": 10637 + }, + { + "epoch": 0.84, + "grad_norm": 0.5537740725478609, + "learning_rate": 6.932695566556763e-07, + "loss": 0.4549, + "step": 10638 + }, + { + "epoch": 0.84, + "grad_norm": 2.2090078497127363, + "learning_rate": 6.926235994899528e-07, + "loss": 0.4594, + "step": 10639 + }, + { + "epoch": 0.84, + "grad_norm": 2.2198465741682236, + "learning_rate": 6.919779210058447e-07, + "loss": 0.4492, + "step": 10640 + }, + { + "epoch": 0.84, + "grad_norm": 1.4685837115302212, + "learning_rate": 6.913325212451322e-07, + "loss": 0.4262, + "step": 10641 + }, + { + "epoch": 0.84, + "grad_norm": 1.6927511911117012, + "learning_rate": 6.906874002495678e-07, + "loss": 0.3991, + "step": 10642 + }, + { + "epoch": 0.84, + "grad_norm": 1.6964784704818188, + "learning_rate": 6.900425580608916e-07, + "loss": 0.4419, + "step": 10643 + }, + { + "epoch": 0.84, + "grad_norm": 0.5397056382885204, + "learning_rate": 6.893979947208235e-07, + "loss": 0.4807, + "step": 10644 + }, + { + "epoch": 0.84, + "grad_norm": 1.6845128980202997, + "learning_rate": 6.887537102710662e-07, + "loss": 0.4431, + "step": 10645 + }, + { + "epoch": 0.84, + "grad_norm": 0.536268968551931, + "learning_rate": 6.881097047533047e-07, + "loss": 0.4872, + "step": 10646 + }, + { + "epoch": 0.84, + "grad_norm": 0.5194201580287918, + "learning_rate": 6.874659782092041e-07, + "loss": 0.4511, + "step": 10647 + }, + { + "epoch": 0.84, + "grad_norm": 1.3160002101378592, + "learning_rate": 6.868225306804132e-07, + "loss": 0.422, + "step": 10648 + }, + { + "epoch": 0.84, + "grad_norm": 1.7148668385569532, + "learning_rate": 6.861793622085622e-07, + "loss": 0.4259, + "step": 10649 + }, + { + "epoch": 0.84, + "grad_norm": 1.9040214725498845, + "learning_rate": 6.855364728352643e-07, + "loss": 0.408, + "step": 10650 + }, + { + "epoch": 0.84, + "grad_norm": 1.8621924258271785, + "learning_rate": 6.848938626021112e-07, + "loss": 0.461, + "step": 10651 + }, + { + "epoch": 0.84, + "grad_norm": 1.7124390789599107, + "learning_rate": 6.842515315506809e-07, + "loss": 0.4433, + "step": 10652 + }, + { + "epoch": 0.84, + "grad_norm": 2.032863350494302, + "learning_rate": 6.836094797225306e-07, + "loss": 0.4329, + "step": 10653 + }, + { + "epoch": 0.84, + "grad_norm": 0.5382223817268618, + "learning_rate": 6.829677071592e-07, + "loss": 0.4832, + "step": 10654 + }, + { + "epoch": 0.84, + "grad_norm": 2.312665022022054, + "learning_rate": 6.823262139022119e-07, + "loss": 0.464, + "step": 10655 + }, + { + "epoch": 0.84, + "grad_norm": 1.7930319769205327, + "learning_rate": 6.816849999930691e-07, + "loss": 0.4338, + "step": 10656 + }, + { + "epoch": 0.84, + "grad_norm": 1.7909627787217854, + "learning_rate": 6.810440654732592e-07, + "loss": 0.4075, + "step": 10657 + }, + { + "epoch": 0.84, + "grad_norm": 1.54975043837616, + "learning_rate": 6.804034103842471e-07, + "loss": 0.4038, + "step": 10658 + }, + { + "epoch": 0.84, + "grad_norm": 3.70822342377593, + "learning_rate": 6.797630347674833e-07, + "loss": 0.424, + "step": 10659 + }, + { + "epoch": 0.84, + "grad_norm": 2.0193018161484804, + "learning_rate": 6.791229386643999e-07, + "loss": 0.4893, + "step": 10660 + }, + { + "epoch": 0.84, + "grad_norm": 2.700245250499907, + "learning_rate": 6.784831221164113e-07, + "loss": 0.4483, + "step": 10661 + }, + { + "epoch": 0.84, + "grad_norm": 1.6318722132310792, + "learning_rate": 6.778435851649101e-07, + "loss": 0.4449, + "step": 10662 + }, + { + "epoch": 0.84, + "grad_norm": 1.6731301835254497, + "learning_rate": 6.772043278512747e-07, + "loss": 0.5147, + "step": 10663 + }, + { + "epoch": 0.84, + "grad_norm": 1.5094104295199884, + "learning_rate": 6.765653502168646e-07, + "loss": 0.3971, + "step": 10664 + }, + { + "epoch": 0.84, + "grad_norm": 0.5054938064665913, + "learning_rate": 6.759266523030217e-07, + "loss": 0.461, + "step": 10665 + }, + { + "epoch": 0.84, + "grad_norm": 3.5956615557668576, + "learning_rate": 6.752882341510663e-07, + "loss": 0.4563, + "step": 10666 + }, + { + "epoch": 0.84, + "grad_norm": 2.2219427086196077, + "learning_rate": 6.74650095802305e-07, + "loss": 0.4423, + "step": 10667 + }, + { + "epoch": 0.84, + "grad_norm": 2.0461035449146765, + "learning_rate": 6.74012237298024e-07, + "loss": 0.4971, + "step": 10668 + }, + { + "epoch": 0.84, + "grad_norm": 1.6654819515488573, + "learning_rate": 6.733746586794925e-07, + "loss": 0.4216, + "step": 10669 + }, + { + "epoch": 0.84, + "grad_norm": 1.8371940608724024, + "learning_rate": 6.727373599879617e-07, + "loss": 0.4323, + "step": 10670 + }, + { + "epoch": 0.84, + "grad_norm": 1.756572135555636, + "learning_rate": 6.721003412646604e-07, + "loss": 0.4476, + "step": 10671 + }, + { + "epoch": 0.84, + "grad_norm": 1.843198490948536, + "learning_rate": 6.714636025508081e-07, + "loss": 0.4297, + "step": 10672 + }, + { + "epoch": 0.84, + "grad_norm": 5.059736616774512, + "learning_rate": 6.708271438875968e-07, + "loss": 0.4469, + "step": 10673 + }, + { + "epoch": 0.84, + "grad_norm": 2.756322235707946, + "learning_rate": 6.701909653162076e-07, + "loss": 0.4584, + "step": 10674 + }, + { + "epoch": 0.84, + "grad_norm": 2.1092821559800905, + "learning_rate": 6.695550668777962e-07, + "loss": 0.4909, + "step": 10675 + }, + { + "epoch": 0.84, + "grad_norm": 2.4974114912072576, + "learning_rate": 6.689194486135092e-07, + "loss": 0.4561, + "step": 10676 + }, + { + "epoch": 0.84, + "grad_norm": 2.4713085961367547, + "learning_rate": 6.682841105644672e-07, + "loss": 0.4235, + "step": 10677 + }, + { + "epoch": 0.84, + "grad_norm": 2.003704943831131, + "learning_rate": 6.676490527717766e-07, + "loss": 0.4774, + "step": 10678 + }, + { + "epoch": 0.84, + "grad_norm": 2.847168231471243, + "learning_rate": 6.670142752765246e-07, + "loss": 0.3983, + "step": 10679 + }, + { + "epoch": 0.84, + "grad_norm": 1.6862587570595589, + "learning_rate": 6.663797781197812e-07, + "loss": 0.4546, + "step": 10680 + }, + { + "epoch": 0.84, + "grad_norm": 1.7068724774272455, + "learning_rate": 6.657455613425979e-07, + "loss": 0.4418, + "step": 10681 + }, + { + "epoch": 0.84, + "grad_norm": 0.5351257022235056, + "learning_rate": 6.651116249860057e-07, + "loss": 0.4459, + "step": 10682 + }, + { + "epoch": 0.84, + "grad_norm": 2.128996661846696, + "learning_rate": 6.644779690910208e-07, + "loss": 0.4545, + "step": 10683 + }, + { + "epoch": 0.84, + "grad_norm": 1.626387138762449, + "learning_rate": 6.638445936986393e-07, + "loss": 0.3931, + "step": 10684 + }, + { + "epoch": 0.84, + "grad_norm": 2.316828830127037, + "learning_rate": 6.632114988498417e-07, + "loss": 0.4442, + "step": 10685 + }, + { + "epoch": 0.84, + "grad_norm": 1.7934640524563779, + "learning_rate": 6.625786845855858e-07, + "loss": 0.4362, + "step": 10686 + }, + { + "epoch": 0.84, + "grad_norm": 2.5812846553598687, + "learning_rate": 6.61946150946815e-07, + "loss": 0.4114, + "step": 10687 + }, + { + "epoch": 0.84, + "grad_norm": 1.7822974221690324, + "learning_rate": 6.613138979744532e-07, + "loss": 0.414, + "step": 10688 + }, + { + "epoch": 0.84, + "grad_norm": 2.053307410662241, + "learning_rate": 6.606819257094066e-07, + "loss": 0.4487, + "step": 10689 + }, + { + "epoch": 0.84, + "grad_norm": 4.820572103939041, + "learning_rate": 6.600502341925624e-07, + "loss": 0.457, + "step": 10690 + }, + { + "epoch": 0.84, + "grad_norm": 1.8400476219224644, + "learning_rate": 6.594188234647913e-07, + "loss": 0.4406, + "step": 10691 + }, + { + "epoch": 0.84, + "grad_norm": 1.8500534810364186, + "learning_rate": 6.587876935669446e-07, + "loss": 0.4305, + "step": 10692 + }, + { + "epoch": 0.84, + "grad_norm": 2.781823387761016, + "learning_rate": 6.581568445398545e-07, + "loss": 0.4587, + "step": 10693 + }, + { + "epoch": 0.84, + "grad_norm": 3.4031605541648275, + "learning_rate": 6.575262764243368e-07, + "loss": 0.4209, + "step": 10694 + }, + { + "epoch": 0.84, + "grad_norm": 2.0484288923151746, + "learning_rate": 6.568959892611882e-07, + "loss": 0.4873, + "step": 10695 + }, + { + "epoch": 0.84, + "grad_norm": 1.8766202190608015, + "learning_rate": 6.562659830911883e-07, + "loss": 0.4816, + "step": 10696 + }, + { + "epoch": 0.84, + "grad_norm": 1.7377699983800727, + "learning_rate": 6.556362579550962e-07, + "loss": 0.4995, + "step": 10697 + }, + { + "epoch": 0.84, + "grad_norm": 1.6917016214919727, + "learning_rate": 6.550068138936555e-07, + "loss": 0.4447, + "step": 10698 + }, + { + "epoch": 0.84, + "grad_norm": 1.5259794808055107, + "learning_rate": 6.543776509475897e-07, + "loss": 0.4073, + "step": 10699 + }, + { + "epoch": 0.84, + "grad_norm": 1.6962471054798165, + "learning_rate": 6.537487691576044e-07, + "loss": 0.4328, + "step": 10700 + }, + { + "epoch": 0.84, + "grad_norm": 0.5605011100392531, + "learning_rate": 6.531201685643901e-07, + "loss": 0.4673, + "step": 10701 + }, + { + "epoch": 0.84, + "grad_norm": 2.81787194754758, + "learning_rate": 6.52491849208613e-07, + "loss": 0.3995, + "step": 10702 + }, + { + "epoch": 0.84, + "grad_norm": 1.9546298164334213, + "learning_rate": 6.518638111309261e-07, + "loss": 0.4361, + "step": 10703 + }, + { + "epoch": 0.84, + "grad_norm": 1.8255558778153074, + "learning_rate": 6.512360543719626e-07, + "loss": 0.4473, + "step": 10704 + }, + { + "epoch": 0.84, + "grad_norm": 2.445829030253032, + "learning_rate": 6.506085789723382e-07, + "loss": 0.3999, + "step": 10705 + }, + { + "epoch": 0.84, + "grad_norm": 2.191251097691553, + "learning_rate": 6.499813849726471e-07, + "loss": 0.443, + "step": 10706 + }, + { + "epoch": 0.84, + "grad_norm": 2.3895863329225655, + "learning_rate": 6.49354472413472e-07, + "loss": 0.4928, + "step": 10707 + }, + { + "epoch": 0.84, + "grad_norm": 2.005762921725672, + "learning_rate": 6.487278413353703e-07, + "loss": 0.4381, + "step": 10708 + }, + { + "epoch": 0.84, + "grad_norm": 2.0280845369821043, + "learning_rate": 6.481014917788859e-07, + "loss": 0.4735, + "step": 10709 + }, + { + "epoch": 0.84, + "grad_norm": 1.9265798976973052, + "learning_rate": 6.474754237845399e-07, + "loss": 0.4614, + "step": 10710 + }, + { + "epoch": 0.84, + "grad_norm": 0.5695735178076162, + "learning_rate": 6.468496373928412e-07, + "loss": 0.4654, + "step": 10711 + }, + { + "epoch": 0.84, + "grad_norm": 1.833141276809215, + "learning_rate": 6.462241326442775e-07, + "loss": 0.4018, + "step": 10712 + }, + { + "epoch": 0.84, + "grad_norm": 2.203113768191496, + "learning_rate": 6.455989095793158e-07, + "loss": 0.4186, + "step": 10713 + }, + { + "epoch": 0.84, + "grad_norm": 2.2250349157697764, + "learning_rate": 6.449739682384082e-07, + "loss": 0.4521, + "step": 10714 + }, + { + "epoch": 0.84, + "grad_norm": 1.6440796007398124, + "learning_rate": 6.443493086619884e-07, + "loss": 0.4536, + "step": 10715 + }, + { + "epoch": 0.84, + "grad_norm": 0.5371101995777372, + "learning_rate": 6.437249308904708e-07, + "loss": 0.482, + "step": 10716 + }, + { + "epoch": 0.84, + "grad_norm": 3.05407619295554, + "learning_rate": 6.431008349642509e-07, + "loss": 0.4365, + "step": 10717 + }, + { + "epoch": 0.84, + "grad_norm": 2.007079761125552, + "learning_rate": 6.424770209237069e-07, + "loss": 0.4205, + "step": 10718 + }, + { + "epoch": 0.84, + "grad_norm": 2.205391552639151, + "learning_rate": 6.418534888091998e-07, + "loss": 0.4217, + "step": 10719 + }, + { + "epoch": 0.84, + "grad_norm": 0.5514636227149246, + "learning_rate": 6.412302386610714e-07, + "loss": 0.4642, + "step": 10720 + }, + { + "epoch": 0.84, + "grad_norm": 0.5290898880410322, + "learning_rate": 6.406072705196437e-07, + "loss": 0.4854, + "step": 10721 + }, + { + "epoch": 0.84, + "grad_norm": 1.9994688694439704, + "learning_rate": 6.399845844252218e-07, + "loss": 0.4705, + "step": 10722 + }, + { + "epoch": 0.84, + "grad_norm": 0.5213265426713367, + "learning_rate": 6.393621804180961e-07, + "loss": 0.455, + "step": 10723 + }, + { + "epoch": 0.84, + "grad_norm": 1.4853465900859177, + "learning_rate": 6.387400585385311e-07, + "loss": 0.4234, + "step": 10724 + }, + { + "epoch": 0.84, + "grad_norm": 2.3982056414417987, + "learning_rate": 6.381182188267809e-07, + "loss": 0.4961, + "step": 10725 + }, + { + "epoch": 0.84, + "grad_norm": 1.839140825583552, + "learning_rate": 6.374966613230732e-07, + "loss": 0.4942, + "step": 10726 + }, + { + "epoch": 0.84, + "grad_norm": 0.5467197117303432, + "learning_rate": 6.368753860676269e-07, + "loss": 0.4632, + "step": 10727 + }, + { + "epoch": 0.84, + "grad_norm": 1.7892157611266242, + "learning_rate": 6.362543931006343e-07, + "loss": 0.4098, + "step": 10728 + }, + { + "epoch": 0.84, + "grad_norm": 2.2857626659349806, + "learning_rate": 6.356336824622744e-07, + "loss": 0.46, + "step": 10729 + }, + { + "epoch": 0.84, + "grad_norm": 0.5811190604182273, + "learning_rate": 6.350132541927057e-07, + "loss": 0.4885, + "step": 10730 + }, + { + "epoch": 0.84, + "grad_norm": 2.0408943141391895, + "learning_rate": 6.343931083320704e-07, + "loss": 0.4218, + "step": 10731 + }, + { + "epoch": 0.84, + "grad_norm": 1.4463160109204227, + "learning_rate": 6.337732449204886e-07, + "loss": 0.3877, + "step": 10732 + }, + { + "epoch": 0.84, + "grad_norm": 1.69962972865419, + "learning_rate": 6.331536639980667e-07, + "loss": 0.417, + "step": 10733 + }, + { + "epoch": 0.84, + "grad_norm": 1.7561165621652135, + "learning_rate": 6.325343656048899e-07, + "loss": 0.4256, + "step": 10734 + }, + { + "epoch": 0.84, + "grad_norm": 1.8768130782059609, + "learning_rate": 6.319153497810266e-07, + "loss": 0.4081, + "step": 10735 + }, + { + "epoch": 0.84, + "grad_norm": 1.502886706519453, + "learning_rate": 6.312966165665263e-07, + "loss": 0.4612, + "step": 10736 + }, + { + "epoch": 0.84, + "grad_norm": 2.171785716926124, + "learning_rate": 6.306781660014194e-07, + "loss": 0.4566, + "step": 10737 + }, + { + "epoch": 0.84, + "grad_norm": 3.624382424834933, + "learning_rate": 6.300599981257193e-07, + "loss": 0.4844, + "step": 10738 + }, + { + "epoch": 0.84, + "grad_norm": 1.547483583070166, + "learning_rate": 6.29442112979421e-07, + "loss": 0.4643, + "step": 10739 + }, + { + "epoch": 0.84, + "grad_norm": 2.058531002574587, + "learning_rate": 6.288245106025015e-07, + "loss": 0.423, + "step": 10740 + }, + { + "epoch": 0.84, + "grad_norm": 2.0185327289834234, + "learning_rate": 6.282071910349152e-07, + "loss": 0.4103, + "step": 10741 + }, + { + "epoch": 0.84, + "grad_norm": 0.5220032886691861, + "learning_rate": 6.275901543166074e-07, + "loss": 0.4724, + "step": 10742 + }, + { + "epoch": 0.84, + "grad_norm": 0.6127179183379431, + "learning_rate": 6.269734004874956e-07, + "loss": 0.4629, + "step": 10743 + }, + { + "epoch": 0.84, + "grad_norm": 2.0900446525253376, + "learning_rate": 6.26356929587485e-07, + "loss": 0.4338, + "step": 10744 + }, + { + "epoch": 0.84, + "grad_norm": 1.885178812776475, + "learning_rate": 6.257407416564576e-07, + "loss": 0.4254, + "step": 10745 + }, + { + "epoch": 0.84, + "grad_norm": 0.53358432699037, + "learning_rate": 6.251248367342833e-07, + "loss": 0.4727, + "step": 10746 + }, + { + "epoch": 0.84, + "grad_norm": 1.850430135838408, + "learning_rate": 6.2450921486081e-07, + "loss": 0.4494, + "step": 10747 + }, + { + "epoch": 0.84, + "grad_norm": 1.9969723715706393, + "learning_rate": 6.23893876075865e-07, + "loss": 0.4006, + "step": 10748 + }, + { + "epoch": 0.84, + "grad_norm": 2.007078430430243, + "learning_rate": 6.232788204192624e-07, + "loss": 0.4395, + "step": 10749 + }, + { + "epoch": 0.84, + "grad_norm": 2.389490615699665, + "learning_rate": 6.226640479307943e-07, + "loss": 0.4764, + "step": 10750 + }, + { + "epoch": 0.84, + "grad_norm": 1.8297756184440575, + "learning_rate": 6.220495586502367e-07, + "loss": 0.4661, + "step": 10751 + }, + { + "epoch": 0.84, + "grad_norm": 0.5956634187715218, + "learning_rate": 6.21435352617345e-07, + "loss": 0.4813, + "step": 10752 + }, + { + "epoch": 0.84, + "grad_norm": 2.0396832738084663, + "learning_rate": 6.208214298718579e-07, + "loss": 0.4326, + "step": 10753 + }, + { + "epoch": 0.84, + "grad_norm": 2.034878129679466, + "learning_rate": 6.202077904534959e-07, + "loss": 0.4262, + "step": 10754 + }, + { + "epoch": 0.84, + "grad_norm": 2.6042694590411406, + "learning_rate": 6.195944344019611e-07, + "loss": 0.4407, + "step": 10755 + }, + { + "epoch": 0.84, + "grad_norm": 1.9815024060941804, + "learning_rate": 6.189813617569352e-07, + "loss": 0.4244, + "step": 10756 + }, + { + "epoch": 0.84, + "grad_norm": 1.1980654936196093, + "learning_rate": 6.183685725580829e-07, + "loss": 0.4725, + "step": 10757 + }, + { + "epoch": 0.84, + "grad_norm": 0.5495542961519851, + "learning_rate": 6.177560668450539e-07, + "loss": 0.4988, + "step": 10758 + }, + { + "epoch": 0.84, + "grad_norm": 0.5828860896031169, + "learning_rate": 6.17143844657474e-07, + "loss": 0.4563, + "step": 10759 + }, + { + "epoch": 0.85, + "grad_norm": 1.6152607419167093, + "learning_rate": 6.165319060349551e-07, + "loss": 0.4755, + "step": 10760 + }, + { + "epoch": 0.85, + "grad_norm": 2.8293710683555, + "learning_rate": 6.159202510170847e-07, + "loss": 0.4816, + "step": 10761 + }, + { + "epoch": 0.85, + "grad_norm": 1.254212716978076, + "learning_rate": 6.15308879643442e-07, + "loss": 0.4216, + "step": 10762 + }, + { + "epoch": 0.85, + "grad_norm": 1.5356472859332781, + "learning_rate": 6.146977919535774e-07, + "loss": 0.3985, + "step": 10763 + }, + { + "epoch": 0.85, + "grad_norm": 1.7181886729506028, + "learning_rate": 6.140869879870287e-07, + "loss": 0.484, + "step": 10764 + }, + { + "epoch": 0.85, + "grad_norm": 0.5068125063763531, + "learning_rate": 6.134764677833149e-07, + "loss": 0.4658, + "step": 10765 + }, + { + "epoch": 0.85, + "grad_norm": 1.676032320145416, + "learning_rate": 6.128662313819362e-07, + "loss": 0.4163, + "step": 10766 + }, + { + "epoch": 0.85, + "grad_norm": 1.9742997718596353, + "learning_rate": 6.122562788223724e-07, + "loss": 0.4986, + "step": 10767 + }, + { + "epoch": 0.85, + "grad_norm": 1.5058476103902625, + "learning_rate": 6.116466101440871e-07, + "loss": 0.4671, + "step": 10768 + }, + { + "epoch": 0.85, + "grad_norm": 2.1133596615408363, + "learning_rate": 6.110372253865255e-07, + "loss": 0.4354, + "step": 10769 + }, + { + "epoch": 0.85, + "grad_norm": 1.5683201992219515, + "learning_rate": 6.104281245891141e-07, + "loss": 0.4767, + "step": 10770 + }, + { + "epoch": 0.85, + "grad_norm": 1.8443396414802622, + "learning_rate": 6.098193077912618e-07, + "loss": 0.4378, + "step": 10771 + }, + { + "epoch": 0.85, + "grad_norm": 1.9546640569703932, + "learning_rate": 6.092107750323562e-07, + "loss": 0.4426, + "step": 10772 + }, + { + "epoch": 0.85, + "grad_norm": 1.985797711230216, + "learning_rate": 6.086025263517692e-07, + "loss": 0.4264, + "step": 10773 + }, + { + "epoch": 0.85, + "grad_norm": 2.163814367467736, + "learning_rate": 6.079945617888544e-07, + "loss": 0.4111, + "step": 10774 + }, + { + "epoch": 0.85, + "grad_norm": 1.5198349301332297, + "learning_rate": 6.07386881382947e-07, + "loss": 0.4133, + "step": 10775 + }, + { + "epoch": 0.85, + "grad_norm": 1.4770641914466405, + "learning_rate": 6.0677948517336e-07, + "loss": 0.465, + "step": 10776 + }, + { + "epoch": 0.85, + "grad_norm": 1.5733373793669696, + "learning_rate": 6.061723731993951e-07, + "loss": 0.4215, + "step": 10777 + }, + { + "epoch": 0.85, + "grad_norm": 0.5659410848480282, + "learning_rate": 6.055655455003289e-07, + "loss": 0.4572, + "step": 10778 + }, + { + "epoch": 0.85, + "grad_norm": 1.5392384193905682, + "learning_rate": 6.049590021154233e-07, + "loss": 0.4376, + "step": 10779 + }, + { + "epoch": 0.85, + "grad_norm": 1.4422965485470225, + "learning_rate": 6.043527430839208e-07, + "loss": 0.4066, + "step": 10780 + }, + { + "epoch": 0.85, + "grad_norm": 2.01957811071372, + "learning_rate": 6.03746768445046e-07, + "loss": 0.4461, + "step": 10781 + }, + { + "epoch": 0.85, + "grad_norm": 4.566111995609609, + "learning_rate": 6.031410782380049e-07, + "loss": 0.4185, + "step": 10782 + }, + { + "epoch": 0.85, + "grad_norm": 1.6194995263676935, + "learning_rate": 6.025356725019832e-07, + "loss": 0.4011, + "step": 10783 + }, + { + "epoch": 0.85, + "grad_norm": 1.8875681478744082, + "learning_rate": 6.019305512761508e-07, + "loss": 0.4145, + "step": 10784 + }, + { + "epoch": 0.85, + "grad_norm": 1.7007208466583958, + "learning_rate": 6.013257145996587e-07, + "loss": 0.4673, + "step": 10785 + }, + { + "epoch": 0.85, + "grad_norm": 0.5395393529176088, + "learning_rate": 6.007211625116394e-07, + "loss": 0.4513, + "step": 10786 + }, + { + "epoch": 0.85, + "grad_norm": 0.5478781205843473, + "learning_rate": 6.001168950512048e-07, + "loss": 0.4726, + "step": 10787 + }, + { + "epoch": 0.85, + "grad_norm": 0.5763235899281748, + "learning_rate": 5.995129122574517e-07, + "loss": 0.4755, + "step": 10788 + }, + { + "epoch": 0.85, + "grad_norm": 1.990752247269342, + "learning_rate": 5.989092141694563e-07, + "loss": 0.488, + "step": 10789 + }, + { + "epoch": 0.85, + "grad_norm": 1.565399298756159, + "learning_rate": 5.983058008262777e-07, + "loss": 0.4447, + "step": 10790 + }, + { + "epoch": 0.85, + "grad_norm": 2.003246083554952, + "learning_rate": 5.97702672266956e-07, + "loss": 0.4448, + "step": 10791 + }, + { + "epoch": 0.85, + "grad_norm": 1.3993089887650623, + "learning_rate": 5.970998285305113e-07, + "loss": 0.4429, + "step": 10792 + }, + { + "epoch": 0.85, + "grad_norm": 2.3216908159238114, + "learning_rate": 5.964972696559496e-07, + "loss": 0.4244, + "step": 10793 + }, + { + "epoch": 0.85, + "grad_norm": 1.5344676289342227, + "learning_rate": 5.95894995682253e-07, + "loss": 0.3908, + "step": 10794 + }, + { + "epoch": 0.85, + "grad_norm": 2.512207562490718, + "learning_rate": 5.952930066483897e-07, + "loss": 0.4978, + "step": 10795 + }, + { + "epoch": 0.85, + "grad_norm": 0.5225455933231761, + "learning_rate": 5.946913025933049e-07, + "loss": 0.4646, + "step": 10796 + }, + { + "epoch": 0.85, + "grad_norm": 2.3586749670406686, + "learning_rate": 5.940898835559322e-07, + "loss": 0.4232, + "step": 10797 + }, + { + "epoch": 0.85, + "grad_norm": 1.5593505517596848, + "learning_rate": 5.934887495751796e-07, + "loss": 0.4223, + "step": 10798 + }, + { + "epoch": 0.85, + "grad_norm": 3.8220238770516963, + "learning_rate": 5.928879006899396e-07, + "loss": 0.4473, + "step": 10799 + }, + { + "epoch": 0.85, + "grad_norm": 0.5536027658926895, + "learning_rate": 5.92287336939088e-07, + "loss": 0.475, + "step": 10800 + }, + { + "epoch": 0.85, + "grad_norm": 1.4541958896894158, + "learning_rate": 5.916870583614792e-07, + "loss": 0.399, + "step": 10801 + }, + { + "epoch": 0.85, + "grad_norm": 1.637756580703952, + "learning_rate": 5.910870649959522e-07, + "loss": 0.4895, + "step": 10802 + }, + { + "epoch": 0.85, + "grad_norm": 1.5146636224458756, + "learning_rate": 5.904873568813236e-07, + "loss": 0.4372, + "step": 10803 + }, + { + "epoch": 0.85, + "grad_norm": 2.013438246491489, + "learning_rate": 5.89887934056394e-07, + "loss": 0.437, + "step": 10804 + }, + { + "epoch": 0.85, + "grad_norm": 1.9922980137485204, + "learning_rate": 5.892887965599464e-07, + "loss": 0.3867, + "step": 10805 + }, + { + "epoch": 0.85, + "grad_norm": 2.5359570081297016, + "learning_rate": 5.886899444307448e-07, + "loss": 0.4343, + "step": 10806 + }, + { + "epoch": 0.85, + "grad_norm": 1.4652502354706705, + "learning_rate": 5.88091377707532e-07, + "loss": 0.4436, + "step": 10807 + }, + { + "epoch": 0.85, + "grad_norm": 2.0566861749072802, + "learning_rate": 5.87493096429036e-07, + "loss": 0.5002, + "step": 10808 + }, + { + "epoch": 0.85, + "grad_norm": 2.24133630827586, + "learning_rate": 5.868951006339635e-07, + "loss": 0.4898, + "step": 10809 + }, + { + "epoch": 0.85, + "grad_norm": 4.30951765685275, + "learning_rate": 5.862973903610065e-07, + "loss": 0.493, + "step": 10810 + }, + { + "epoch": 0.85, + "grad_norm": 1.5417803423874918, + "learning_rate": 5.856999656488322e-07, + "loss": 0.4429, + "step": 10811 + }, + { + "epoch": 0.85, + "grad_norm": 1.911492046118922, + "learning_rate": 5.85102826536097e-07, + "loss": 0.4498, + "step": 10812 + }, + { + "epoch": 0.85, + "grad_norm": 0.5400918949336364, + "learning_rate": 5.845059730614339e-07, + "loss": 0.4527, + "step": 10813 + }, + { + "epoch": 0.85, + "grad_norm": 0.5202719026276379, + "learning_rate": 5.839094052634575e-07, + "loss": 0.4696, + "step": 10814 + }, + { + "epoch": 0.85, + "grad_norm": 0.5429198491748485, + "learning_rate": 5.833131231807654e-07, + "loss": 0.4536, + "step": 10815 + }, + { + "epoch": 0.85, + "grad_norm": 1.3623802649636514, + "learning_rate": 5.827171268519365e-07, + "loss": 0.4241, + "step": 10816 + }, + { + "epoch": 0.85, + "grad_norm": 2.160507762999765, + "learning_rate": 5.82121416315532e-07, + "loss": 0.4313, + "step": 10817 + }, + { + "epoch": 0.85, + "grad_norm": 0.5449607707765769, + "learning_rate": 5.815259916100918e-07, + "loss": 0.4767, + "step": 10818 + }, + { + "epoch": 0.85, + "grad_norm": 1.8159785381765345, + "learning_rate": 5.809308527741397e-07, + "loss": 0.367, + "step": 10819 + }, + { + "epoch": 0.85, + "grad_norm": 1.4829408520802603, + "learning_rate": 5.803359998461805e-07, + "loss": 0.459, + "step": 10820 + }, + { + "epoch": 0.85, + "grad_norm": 2.0951268489676216, + "learning_rate": 5.797414328647011e-07, + "loss": 0.437, + "step": 10821 + }, + { + "epoch": 0.85, + "grad_norm": 9.422689292584558, + "learning_rate": 5.791471518681679e-07, + "loss": 0.4157, + "step": 10822 + }, + { + "epoch": 0.85, + "grad_norm": 2.0619657511312353, + "learning_rate": 5.785531568950309e-07, + "loss": 0.3925, + "step": 10823 + }, + { + "epoch": 0.85, + "grad_norm": 1.7916179152446619, + "learning_rate": 5.779594479837209e-07, + "loss": 0.4389, + "step": 10824 + }, + { + "epoch": 0.85, + "grad_norm": 2.01182948138568, + "learning_rate": 5.773660251726493e-07, + "loss": 0.425, + "step": 10825 + }, + { + "epoch": 0.85, + "grad_norm": 1.9257717345465382, + "learning_rate": 5.767728885002116e-07, + "loss": 0.4323, + "step": 10826 + }, + { + "epoch": 0.85, + "grad_norm": 1.5709221319772775, + "learning_rate": 5.761800380047794e-07, + "loss": 0.4454, + "step": 10827 + }, + { + "epoch": 0.85, + "grad_norm": 1.5822537843938314, + "learning_rate": 5.755874737247141e-07, + "loss": 0.4192, + "step": 10828 + }, + { + "epoch": 0.85, + "grad_norm": 1.442436894068763, + "learning_rate": 5.749951956983501e-07, + "loss": 0.4441, + "step": 10829 + }, + { + "epoch": 0.85, + "grad_norm": 1.5107128702364907, + "learning_rate": 5.744032039640096e-07, + "loss": 0.4749, + "step": 10830 + }, + { + "epoch": 0.85, + "grad_norm": 0.5734869784195678, + "learning_rate": 5.738114985599902e-07, + "loss": 0.4802, + "step": 10831 + }, + { + "epoch": 0.85, + "grad_norm": 1.812160297709051, + "learning_rate": 5.732200795245785e-07, + "loss": 0.4362, + "step": 10832 + }, + { + "epoch": 0.85, + "grad_norm": 1.7317104284369116, + "learning_rate": 5.726289468960361e-07, + "loss": 0.4532, + "step": 10833 + }, + { + "epoch": 0.85, + "grad_norm": 2.071701908294174, + "learning_rate": 5.720381007126092e-07, + "loss": 0.4298, + "step": 10834 + }, + { + "epoch": 0.85, + "grad_norm": 1.503318717640116, + "learning_rate": 5.714475410125241e-07, + "loss": 0.3819, + "step": 10835 + }, + { + "epoch": 0.85, + "grad_norm": 3.0042796740810926, + "learning_rate": 5.708572678339902e-07, + "loss": 0.4414, + "step": 10836 + }, + { + "epoch": 0.85, + "grad_norm": 2.2880602654563855, + "learning_rate": 5.702672812151977e-07, + "loss": 0.3797, + "step": 10837 + }, + { + "epoch": 0.85, + "grad_norm": 1.6003581605275439, + "learning_rate": 5.696775811943167e-07, + "loss": 0.4342, + "step": 10838 + }, + { + "epoch": 0.85, + "grad_norm": 2.0254003841594748, + "learning_rate": 5.690881678095e-07, + "loss": 0.4486, + "step": 10839 + }, + { + "epoch": 0.85, + "grad_norm": 2.6130870161573747, + "learning_rate": 5.684990410988833e-07, + "loss": 0.4116, + "step": 10840 + }, + { + "epoch": 0.85, + "grad_norm": 1.5333646815175068, + "learning_rate": 5.679102011005821e-07, + "loss": 0.4902, + "step": 10841 + }, + { + "epoch": 0.85, + "grad_norm": 1.6606103665435128, + "learning_rate": 5.673216478526916e-07, + "loss": 0.4655, + "step": 10842 + }, + { + "epoch": 0.85, + "grad_norm": 1.4302699390926856, + "learning_rate": 5.667333813932924e-07, + "loss": 0.3913, + "step": 10843 + }, + { + "epoch": 0.85, + "grad_norm": 2.128687128433247, + "learning_rate": 5.66145401760444e-07, + "loss": 0.4219, + "step": 10844 + }, + { + "epoch": 0.85, + "grad_norm": 2.4151790613071067, + "learning_rate": 5.655577089921887e-07, + "loss": 0.4531, + "step": 10845 + }, + { + "epoch": 0.85, + "grad_norm": 1.960544175415978, + "learning_rate": 5.649703031265463e-07, + "loss": 0.4136, + "step": 10846 + }, + { + "epoch": 0.85, + "grad_norm": 0.5500490480852088, + "learning_rate": 5.643831842015252e-07, + "loss": 0.4777, + "step": 10847 + }, + { + "epoch": 0.85, + "grad_norm": 1.5071035218244582, + "learning_rate": 5.637963522551099e-07, + "loss": 0.4581, + "step": 10848 + }, + { + "epoch": 0.85, + "grad_norm": 0.5424968768062859, + "learning_rate": 5.632098073252668e-07, + "loss": 0.4728, + "step": 10849 + }, + { + "epoch": 0.85, + "grad_norm": 1.9658832981977032, + "learning_rate": 5.626235494499449e-07, + "loss": 0.4336, + "step": 10850 + }, + { + "epoch": 0.85, + "grad_norm": 1.806582633322733, + "learning_rate": 5.620375786670746e-07, + "loss": 0.4661, + "step": 10851 + }, + { + "epoch": 0.85, + "grad_norm": 2.019587679592214, + "learning_rate": 5.614518950145687e-07, + "loss": 0.4553, + "step": 10852 + }, + { + "epoch": 0.85, + "grad_norm": 1.9067432855093487, + "learning_rate": 5.608664985303175e-07, + "loss": 0.3983, + "step": 10853 + }, + { + "epoch": 0.85, + "grad_norm": 1.6535079530856447, + "learning_rate": 5.602813892521963e-07, + "loss": 0.432, + "step": 10854 + }, + { + "epoch": 0.85, + "grad_norm": 1.3740540783042183, + "learning_rate": 5.596965672180621e-07, + "loss": 0.4499, + "step": 10855 + }, + { + "epoch": 0.85, + "grad_norm": 0.5738241806876595, + "learning_rate": 5.591120324657518e-07, + "loss": 0.4707, + "step": 10856 + }, + { + "epoch": 0.85, + "grad_norm": 4.244159279122046, + "learning_rate": 5.585277850330828e-07, + "loss": 0.4346, + "step": 10857 + }, + { + "epoch": 0.85, + "grad_norm": 1.7453393786132343, + "learning_rate": 5.579438249578551e-07, + "loss": 0.4434, + "step": 10858 + }, + { + "epoch": 0.85, + "grad_norm": 1.6469922641319328, + "learning_rate": 5.573601522778527e-07, + "loss": 0.4563, + "step": 10859 + }, + { + "epoch": 0.85, + "grad_norm": 0.5708469778495701, + "learning_rate": 5.56776767030836e-07, + "loss": 0.4925, + "step": 10860 + }, + { + "epoch": 0.85, + "grad_norm": 1.7150207591033215, + "learning_rate": 5.561936692545511e-07, + "loss": 0.4137, + "step": 10861 + }, + { + "epoch": 0.85, + "grad_norm": 0.554753261979144, + "learning_rate": 5.556108589867204e-07, + "loss": 0.463, + "step": 10862 + }, + { + "epoch": 0.85, + "grad_norm": 1.8912597914779397, + "learning_rate": 5.550283362650549e-07, + "loss": 0.4092, + "step": 10863 + }, + { + "epoch": 0.85, + "grad_norm": 1.9109225767576261, + "learning_rate": 5.544461011272406e-07, + "loss": 0.4596, + "step": 10864 + }, + { + "epoch": 0.85, + "grad_norm": 1.6054952096209463, + "learning_rate": 5.538641536109491e-07, + "loss": 0.473, + "step": 10865 + }, + { + "epoch": 0.85, + "grad_norm": 2.460717511789836, + "learning_rate": 5.532824937538279e-07, + "loss": 0.4562, + "step": 10866 + }, + { + "epoch": 0.85, + "grad_norm": 1.8958464952457232, + "learning_rate": 5.527011215935152e-07, + "loss": 0.4841, + "step": 10867 + }, + { + "epoch": 0.85, + "grad_norm": 2.4102602154278276, + "learning_rate": 5.521200371676205e-07, + "loss": 0.4459, + "step": 10868 + }, + { + "epoch": 0.85, + "grad_norm": 2.813225515684679, + "learning_rate": 5.51539240513741e-07, + "loss": 0.4529, + "step": 10869 + }, + { + "epoch": 0.85, + "grad_norm": 2.1192440632913203, + "learning_rate": 5.509587316694537e-07, + "loss": 0.4115, + "step": 10870 + }, + { + "epoch": 0.85, + "grad_norm": 0.5054936648933233, + "learning_rate": 5.503785106723158e-07, + "loss": 0.4656, + "step": 10871 + }, + { + "epoch": 0.85, + "grad_norm": 1.310831228962782, + "learning_rate": 5.497985775598691e-07, + "loss": 0.4901, + "step": 10872 + }, + { + "epoch": 0.85, + "grad_norm": 21.034390259467692, + "learning_rate": 5.492189323696312e-07, + "loss": 0.4391, + "step": 10873 + }, + { + "epoch": 0.85, + "grad_norm": 1.8294287366217235, + "learning_rate": 5.486395751391066e-07, + "loss": 0.4328, + "step": 10874 + }, + { + "epoch": 0.85, + "grad_norm": 1.9746278140422306, + "learning_rate": 5.48060505905778e-07, + "loss": 0.4982, + "step": 10875 + }, + { + "epoch": 0.85, + "grad_norm": 0.5766145404121513, + "learning_rate": 5.474817247071118e-07, + "loss": 0.4914, + "step": 10876 + }, + { + "epoch": 0.85, + "grad_norm": 2.4515331033336865, + "learning_rate": 5.469032315805522e-07, + "loss": 0.4779, + "step": 10877 + }, + { + "epoch": 0.85, + "grad_norm": 3.3005267914200878, + "learning_rate": 5.463250265635284e-07, + "loss": 0.4441, + "step": 10878 + }, + { + "epoch": 0.85, + "grad_norm": 2.0966236538316863, + "learning_rate": 5.457471096934492e-07, + "loss": 0.4824, + "step": 10879 + }, + { + "epoch": 0.85, + "grad_norm": 1.540918847395035, + "learning_rate": 5.451694810077052e-07, + "loss": 0.4803, + "step": 10880 + }, + { + "epoch": 0.85, + "grad_norm": 1.6410188473473428, + "learning_rate": 5.445921405436682e-07, + "loss": 0.426, + "step": 10881 + }, + { + "epoch": 0.85, + "grad_norm": 3.3069103372556707, + "learning_rate": 5.440150883386913e-07, + "loss": 0.4174, + "step": 10882 + }, + { + "epoch": 0.85, + "grad_norm": 1.3113773610892514, + "learning_rate": 5.434383244301094e-07, + "loss": 0.3944, + "step": 10883 + }, + { + "epoch": 0.85, + "grad_norm": 1.7556081142813822, + "learning_rate": 5.428618488552378e-07, + "loss": 0.4558, + "step": 10884 + }, + { + "epoch": 0.85, + "grad_norm": 1.6830569044444779, + "learning_rate": 5.422856616513733e-07, + "loss": 0.4389, + "step": 10885 + }, + { + "epoch": 0.85, + "grad_norm": 0.5261315779865384, + "learning_rate": 5.417097628557955e-07, + "loss": 0.4508, + "step": 10886 + }, + { + "epoch": 0.86, + "grad_norm": 0.5823560919071649, + "learning_rate": 5.411341525057645e-07, + "loss": 0.4723, + "step": 10887 + }, + { + "epoch": 0.86, + "grad_norm": 1.5149878131534045, + "learning_rate": 5.405588306385201e-07, + "loss": 0.418, + "step": 10888 + }, + { + "epoch": 0.86, + "grad_norm": 1.9139853125008583, + "learning_rate": 5.399837972912858e-07, + "loss": 0.4733, + "step": 10889 + }, + { + "epoch": 0.86, + "grad_norm": 2.3057611157787754, + "learning_rate": 5.394090525012652e-07, + "loss": 0.4586, + "step": 10890 + }, + { + "epoch": 0.86, + "grad_norm": 1.8697118132545676, + "learning_rate": 5.388345963056451e-07, + "loss": 0.41, + "step": 10891 + }, + { + "epoch": 0.86, + "grad_norm": 1.8412086396994791, + "learning_rate": 5.382604287415893e-07, + "loss": 0.3979, + "step": 10892 + }, + { + "epoch": 0.86, + "grad_norm": 0.5431152410329332, + "learning_rate": 5.376865498462463e-07, + "loss": 0.4659, + "step": 10893 + }, + { + "epoch": 0.86, + "grad_norm": 2.8782354999479223, + "learning_rate": 5.371129596567476e-07, + "loss": 0.4359, + "step": 10894 + }, + { + "epoch": 0.86, + "grad_norm": 1.4738863201221684, + "learning_rate": 5.365396582102017e-07, + "loss": 0.4061, + "step": 10895 + }, + { + "epoch": 0.86, + "grad_norm": 1.865300320619555, + "learning_rate": 5.359666455437018e-07, + "loss": 0.4416, + "step": 10896 + }, + { + "epoch": 0.86, + "grad_norm": 1.9807918399546878, + "learning_rate": 5.353939216943183e-07, + "loss": 0.4199, + "step": 10897 + }, + { + "epoch": 0.86, + "grad_norm": 1.5496444305304409, + "learning_rate": 5.348214866991097e-07, + "loss": 0.4407, + "step": 10898 + }, + { + "epoch": 0.86, + "grad_norm": 1.664488471224294, + "learning_rate": 5.342493405951088e-07, + "loss": 0.4543, + "step": 10899 + }, + { + "epoch": 0.86, + "grad_norm": 0.5374334593117986, + "learning_rate": 5.336774834193343e-07, + "loss": 0.4635, + "step": 10900 + }, + { + "epoch": 0.86, + "grad_norm": 1.359286354499065, + "learning_rate": 5.33105915208782e-07, + "loss": 0.3914, + "step": 10901 + }, + { + "epoch": 0.86, + "grad_norm": 1.6388165609626366, + "learning_rate": 5.325346360004357e-07, + "loss": 0.4339, + "step": 10902 + }, + { + "epoch": 0.86, + "grad_norm": 1.5264002417774192, + "learning_rate": 5.319636458312532e-07, + "loss": 0.425, + "step": 10903 + }, + { + "epoch": 0.86, + "grad_norm": 2.0669631589083433, + "learning_rate": 5.313929447381777e-07, + "loss": 0.5052, + "step": 10904 + }, + { + "epoch": 0.86, + "grad_norm": 2.3205715580532176, + "learning_rate": 5.308225327581334e-07, + "loss": 0.4572, + "step": 10905 + }, + { + "epoch": 0.86, + "grad_norm": 2.8672978841287766, + "learning_rate": 5.302524099280243e-07, + "loss": 0.4327, + "step": 10906 + }, + { + "epoch": 0.86, + "grad_norm": 2.4016618676585857, + "learning_rate": 5.296825762847385e-07, + "loss": 0.4522, + "step": 10907 + }, + { + "epoch": 0.86, + "grad_norm": 0.5383318893556583, + "learning_rate": 5.29113031865141e-07, + "loss": 0.4663, + "step": 10908 + }, + { + "epoch": 0.86, + "grad_norm": 0.5331274848758256, + "learning_rate": 5.285437767060819e-07, + "loss": 0.473, + "step": 10909 + }, + { + "epoch": 0.86, + "grad_norm": 0.5353462402087394, + "learning_rate": 5.27974810844391e-07, + "loss": 0.4718, + "step": 10910 + }, + { + "epoch": 0.86, + "grad_norm": 2.467879041583342, + "learning_rate": 5.274061343168807e-07, + "loss": 0.4464, + "step": 10911 + }, + { + "epoch": 0.86, + "grad_norm": 1.811014807292322, + "learning_rate": 5.268377471603414e-07, + "loss": 0.4355, + "step": 10912 + }, + { + "epoch": 0.86, + "grad_norm": 1.9247970920665587, + "learning_rate": 5.262696494115483e-07, + "loss": 0.4841, + "step": 10913 + }, + { + "epoch": 0.86, + "grad_norm": 1.5240870374881936, + "learning_rate": 5.257018411072562e-07, + "loss": 0.4249, + "step": 10914 + }, + { + "epoch": 0.86, + "grad_norm": 1.6038187112985207, + "learning_rate": 5.25134322284202e-07, + "loss": 0.4438, + "step": 10915 + }, + { + "epoch": 0.86, + "grad_norm": 3.0437388161851175, + "learning_rate": 5.245670929791036e-07, + "loss": 0.4498, + "step": 10916 + }, + { + "epoch": 0.86, + "grad_norm": 1.5039161400519319, + "learning_rate": 5.240001532286598e-07, + "loss": 0.4436, + "step": 10917 + }, + { + "epoch": 0.86, + "grad_norm": 1.6662791337627645, + "learning_rate": 5.234335030695514e-07, + "loss": 0.4816, + "step": 10918 + }, + { + "epoch": 0.86, + "grad_norm": 1.5946887567226384, + "learning_rate": 5.228671425384385e-07, + "loss": 0.4654, + "step": 10919 + }, + { + "epoch": 0.86, + "grad_norm": 1.9045963341688525, + "learning_rate": 5.223010716719645e-07, + "loss": 0.4131, + "step": 10920 + }, + { + "epoch": 0.86, + "grad_norm": 2.4916448129115687, + "learning_rate": 5.217352905067536e-07, + "loss": 0.4642, + "step": 10921 + }, + { + "epoch": 0.86, + "grad_norm": 2.0009105942146745, + "learning_rate": 5.211697990794118e-07, + "loss": 0.4309, + "step": 10922 + }, + { + "epoch": 0.86, + "grad_norm": 1.776198953977961, + "learning_rate": 5.206045974265245e-07, + "loss": 0.4137, + "step": 10923 + }, + { + "epoch": 0.86, + "grad_norm": 2.171524575346769, + "learning_rate": 5.200396855846596e-07, + "loss": 0.4797, + "step": 10924 + }, + { + "epoch": 0.86, + "grad_norm": 1.8214414534489494, + "learning_rate": 5.194750635903667e-07, + "loss": 0.4846, + "step": 10925 + }, + { + "epoch": 0.86, + "grad_norm": 0.5260738739702246, + "learning_rate": 5.189107314801756e-07, + "loss": 0.4783, + "step": 10926 + }, + { + "epoch": 0.86, + "grad_norm": 1.514046117209759, + "learning_rate": 5.183466892905997e-07, + "loss": 0.4224, + "step": 10927 + }, + { + "epoch": 0.86, + "grad_norm": 1.571039921632034, + "learning_rate": 5.177829370581277e-07, + "loss": 0.4983, + "step": 10928 + }, + { + "epoch": 0.86, + "grad_norm": 1.657610666252025, + "learning_rate": 5.172194748192389e-07, + "loss": 0.4022, + "step": 10929 + }, + { + "epoch": 0.86, + "grad_norm": 0.5325803862687203, + "learning_rate": 5.166563026103844e-07, + "loss": 0.4814, + "step": 10930 + }, + { + "epoch": 0.86, + "grad_norm": 2.101495184303436, + "learning_rate": 5.160934204680029e-07, + "loss": 0.438, + "step": 10931 + }, + { + "epoch": 0.86, + "grad_norm": 2.245235616243393, + "learning_rate": 5.155308284285098e-07, + "loss": 0.4456, + "step": 10932 + }, + { + "epoch": 0.86, + "grad_norm": 1.8277363218902816, + "learning_rate": 5.149685265283078e-07, + "loss": 0.4615, + "step": 10933 + }, + { + "epoch": 0.86, + "grad_norm": 1.4798322389781593, + "learning_rate": 5.144065148037736e-07, + "loss": 0.4169, + "step": 10934 + }, + { + "epoch": 0.86, + "grad_norm": 0.5303499416254304, + "learning_rate": 5.13844793291271e-07, + "loss": 0.4855, + "step": 10935 + }, + { + "epoch": 0.86, + "grad_norm": 1.9138365050382489, + "learning_rate": 5.132833620271399e-07, + "loss": 0.4387, + "step": 10936 + }, + { + "epoch": 0.86, + "grad_norm": 2.3548282942326617, + "learning_rate": 5.127222210477067e-07, + "loss": 0.4752, + "step": 10937 + }, + { + "epoch": 0.86, + "grad_norm": 1.3340372347991212, + "learning_rate": 5.12161370389277e-07, + "loss": 0.444, + "step": 10938 + }, + { + "epoch": 0.86, + "grad_norm": 1.7390982665244552, + "learning_rate": 5.116008100881348e-07, + "loss": 0.431, + "step": 10939 + }, + { + "epoch": 0.86, + "grad_norm": 1.729033023504952, + "learning_rate": 5.110405401805485e-07, + "loss": 0.4388, + "step": 10940 + }, + { + "epoch": 0.86, + "grad_norm": 0.5368454767663509, + "learning_rate": 5.104805607027668e-07, + "loss": 0.4783, + "step": 10941 + }, + { + "epoch": 0.86, + "grad_norm": 1.6008487244985252, + "learning_rate": 5.099208716910209e-07, + "loss": 0.475, + "step": 10942 + }, + { + "epoch": 0.86, + "grad_norm": 1.7334560289882073, + "learning_rate": 5.0936147318152e-07, + "loss": 0.3736, + "step": 10943 + }, + { + "epoch": 0.86, + "grad_norm": 1.5585106415293764, + "learning_rate": 5.088023652104568e-07, + "loss": 0.4335, + "step": 10944 + }, + { + "epoch": 0.86, + "grad_norm": 1.775819680991112, + "learning_rate": 5.082435478140057e-07, + "loss": 0.4137, + "step": 10945 + }, + { + "epoch": 0.86, + "grad_norm": 1.9220030506697252, + "learning_rate": 5.076850210283224e-07, + "loss": 0.4376, + "step": 10946 + }, + { + "epoch": 0.86, + "grad_norm": 2.2901663379713417, + "learning_rate": 5.071267848895401e-07, + "loss": 0.4601, + "step": 10947 + }, + { + "epoch": 0.86, + "grad_norm": 2.1680285529747505, + "learning_rate": 5.065688394337765e-07, + "loss": 0.446, + "step": 10948 + }, + { + "epoch": 0.86, + "grad_norm": 1.8301262076141867, + "learning_rate": 5.060111846971327e-07, + "loss": 0.4721, + "step": 10949 + }, + { + "epoch": 0.86, + "grad_norm": 1.7211615153631636, + "learning_rate": 5.054538207156856e-07, + "loss": 0.4458, + "step": 10950 + }, + { + "epoch": 0.86, + "grad_norm": 0.5182029070851061, + "learning_rate": 5.048967475254968e-07, + "loss": 0.4562, + "step": 10951 + }, + { + "epoch": 0.86, + "grad_norm": 1.6189059306816527, + "learning_rate": 5.04339965162608e-07, + "loss": 0.4001, + "step": 10952 + }, + { + "epoch": 0.86, + "grad_norm": 1.9055168091727923, + "learning_rate": 5.037834736630431e-07, + "loss": 0.4732, + "step": 10953 + }, + { + "epoch": 0.86, + "grad_norm": 2.1205733333738115, + "learning_rate": 5.032272730628052e-07, + "loss": 0.4328, + "step": 10954 + }, + { + "epoch": 0.86, + "grad_norm": 1.9872429198721748, + "learning_rate": 5.026713633978797e-07, + "loss": 0.4345, + "step": 10955 + }, + { + "epoch": 0.86, + "grad_norm": 4.802664063243212, + "learning_rate": 5.021157447042341e-07, + "loss": 0.4033, + "step": 10956 + }, + { + "epoch": 0.86, + "grad_norm": 1.398615388778024, + "learning_rate": 5.015604170178168e-07, + "loss": 0.4453, + "step": 10957 + }, + { + "epoch": 0.86, + "grad_norm": 1.5025946655643436, + "learning_rate": 5.010053803745546e-07, + "loss": 0.4612, + "step": 10958 + }, + { + "epoch": 0.86, + "grad_norm": 1.3875108706688568, + "learning_rate": 5.004506348103594e-07, + "loss": 0.3964, + "step": 10959 + }, + { + "epoch": 0.86, + "grad_norm": 1.2982524845645382, + "learning_rate": 4.998961803611213e-07, + "loss": 0.4712, + "step": 10960 + }, + { + "epoch": 0.86, + "grad_norm": 1.3033359280099868, + "learning_rate": 4.993420170627139e-07, + "loss": 0.4223, + "step": 10961 + }, + { + "epoch": 0.86, + "grad_norm": 1.7744552376192633, + "learning_rate": 4.987881449509913e-07, + "loss": 0.4591, + "step": 10962 + }, + { + "epoch": 0.86, + "grad_norm": 1.6700940168289402, + "learning_rate": 4.982345640617853e-07, + "loss": 0.4446, + "step": 10963 + }, + { + "epoch": 0.86, + "grad_norm": 1.6172525467242715, + "learning_rate": 4.976812744309156e-07, + "loss": 0.404, + "step": 10964 + }, + { + "epoch": 0.86, + "grad_norm": 0.5541547890582302, + "learning_rate": 4.97128276094177e-07, + "loss": 0.4487, + "step": 10965 + }, + { + "epoch": 0.86, + "grad_norm": 2.795180220052461, + "learning_rate": 4.965755690873497e-07, + "loss": 0.4972, + "step": 10966 + }, + { + "epoch": 0.86, + "grad_norm": 1.772764178635714, + "learning_rate": 4.960231534461896e-07, + "loss": 0.3899, + "step": 10967 + }, + { + "epoch": 0.86, + "grad_norm": 1.6916870512908477, + "learning_rate": 4.954710292064418e-07, + "loss": 0.4254, + "step": 10968 + }, + { + "epoch": 0.86, + "grad_norm": 1.6832493013628937, + "learning_rate": 4.949191964038241e-07, + "loss": 0.4546, + "step": 10969 + }, + { + "epoch": 0.86, + "grad_norm": 1.4603010368650682, + "learning_rate": 4.943676550740428e-07, + "loss": 0.4426, + "step": 10970 + }, + { + "epoch": 0.86, + "grad_norm": 1.957370740015821, + "learning_rate": 4.938164052527778e-07, + "loss": 0.4446, + "step": 10971 + }, + { + "epoch": 0.86, + "grad_norm": 2.1051761391196138, + "learning_rate": 4.932654469756976e-07, + "loss": 0.4048, + "step": 10972 + }, + { + "epoch": 0.86, + "grad_norm": 1.4558718063756322, + "learning_rate": 4.927147802784482e-07, + "loss": 0.4611, + "step": 10973 + }, + { + "epoch": 0.86, + "grad_norm": 0.5121296279783231, + "learning_rate": 4.921644051966551e-07, + "loss": 0.4746, + "step": 10974 + }, + { + "epoch": 0.86, + "grad_norm": 1.7378633273797575, + "learning_rate": 4.916143217659286e-07, + "loss": 0.4456, + "step": 10975 + }, + { + "epoch": 0.86, + "grad_norm": 2.5963217259718285, + "learning_rate": 4.91064530021857e-07, + "loss": 0.4507, + "step": 10976 + }, + { + "epoch": 0.86, + "grad_norm": 1.5914908812146715, + "learning_rate": 4.905150300000133e-07, + "loss": 0.4951, + "step": 10977 + }, + { + "epoch": 0.86, + "grad_norm": 0.5716469843884027, + "learning_rate": 4.899658217359471e-07, + "loss": 0.4683, + "step": 10978 + }, + { + "epoch": 0.86, + "grad_norm": 2.0361782927180196, + "learning_rate": 4.894169052651926e-07, + "loss": 0.4851, + "step": 10979 + }, + { + "epoch": 0.86, + "grad_norm": 1.7434227998681935, + "learning_rate": 4.888682806232631e-07, + "loss": 0.4674, + "step": 10980 + }, + { + "epoch": 0.86, + "grad_norm": 1.4416440587621024, + "learning_rate": 4.883199478456558e-07, + "loss": 0.4076, + "step": 10981 + }, + { + "epoch": 0.86, + "grad_norm": 2.175439341363509, + "learning_rate": 4.877719069678449e-07, + "loss": 0.4031, + "step": 10982 + }, + { + "epoch": 0.86, + "grad_norm": 1.8180191666402474, + "learning_rate": 4.872241580252879e-07, + "loss": 0.4398, + "step": 10983 + }, + { + "epoch": 0.86, + "grad_norm": 2.0221037309435874, + "learning_rate": 4.866767010534268e-07, + "loss": 0.4792, + "step": 10984 + }, + { + "epoch": 0.86, + "grad_norm": 2.0927173211045034, + "learning_rate": 4.861295360876772e-07, + "loss": 0.4248, + "step": 10985 + }, + { + "epoch": 0.86, + "grad_norm": 2.521900667936592, + "learning_rate": 4.855826631634425e-07, + "loss": 0.4394, + "step": 10986 + }, + { + "epoch": 0.86, + "grad_norm": 1.962458602319682, + "learning_rate": 4.850360823161032e-07, + "loss": 0.4409, + "step": 10987 + }, + { + "epoch": 0.86, + "grad_norm": 2.8288190717926525, + "learning_rate": 4.844897935810245e-07, + "loss": 0.4034, + "step": 10988 + }, + { + "epoch": 0.86, + "grad_norm": 1.5754956543457765, + "learning_rate": 4.839437969935479e-07, + "loss": 0.3953, + "step": 10989 + }, + { + "epoch": 0.86, + "grad_norm": 2.4370702174886207, + "learning_rate": 4.833980925890003e-07, + "loss": 0.4565, + "step": 10990 + }, + { + "epoch": 0.86, + "grad_norm": 1.8381136063999433, + "learning_rate": 4.828526804026872e-07, + "loss": 0.4597, + "step": 10991 + }, + { + "epoch": 0.86, + "grad_norm": 2.4472657339368933, + "learning_rate": 4.823075604698979e-07, + "loss": 0.3795, + "step": 10992 + }, + { + "epoch": 0.86, + "grad_norm": 1.9818725862183755, + "learning_rate": 4.817627328258983e-07, + "loss": 0.4473, + "step": 10993 + }, + { + "epoch": 0.86, + "grad_norm": 2.50577307359984, + "learning_rate": 4.812181975059393e-07, + "loss": 0.4801, + "step": 10994 + }, + { + "epoch": 0.86, + "grad_norm": 1.5451923508688805, + "learning_rate": 4.806739545452516e-07, + "loss": 0.4783, + "step": 10995 + }, + { + "epoch": 0.86, + "grad_norm": 1.6751859823259365, + "learning_rate": 4.801300039790469e-07, + "loss": 0.4645, + "step": 10996 + }, + { + "epoch": 0.86, + "grad_norm": 0.5079735674860286, + "learning_rate": 4.795863458425193e-07, + "loss": 0.4326, + "step": 10997 + }, + { + "epoch": 0.86, + "grad_norm": 1.8596774659334627, + "learning_rate": 4.790429801708408e-07, + "loss": 0.4505, + "step": 10998 + }, + { + "epoch": 0.86, + "grad_norm": 1.6828437265802594, + "learning_rate": 4.784999069991675e-07, + "loss": 0.4265, + "step": 10999 + }, + { + "epoch": 0.86, + "grad_norm": 1.852011303837463, + "learning_rate": 4.779571263626348e-07, + "loss": 0.4021, + "step": 11000 + }, + { + "epoch": 0.86, + "grad_norm": 1.9506692527695206, + "learning_rate": 4.774146382963618e-07, + "loss": 0.458, + "step": 11001 + }, + { + "epoch": 0.86, + "grad_norm": 1.8583617719503498, + "learning_rate": 4.768724428354432e-07, + "loss": 0.4738, + "step": 11002 + }, + { + "epoch": 0.86, + "grad_norm": 2.193938393382482, + "learning_rate": 4.7633054001496313e-07, + "loss": 0.4351, + "step": 11003 + }, + { + "epoch": 0.86, + "grad_norm": 1.5042517900625831, + "learning_rate": 4.7578892986997804e-07, + "loss": 0.415, + "step": 11004 + }, + { + "epoch": 0.86, + "grad_norm": 2.115526154518036, + "learning_rate": 4.752476124355304e-07, + "loss": 0.4728, + "step": 11005 + }, + { + "epoch": 0.86, + "grad_norm": 1.587713683965019, + "learning_rate": 4.7470658774664334e-07, + "loss": 0.3938, + "step": 11006 + }, + { + "epoch": 0.86, + "grad_norm": 1.6697975863066161, + "learning_rate": 4.7416585583832054e-07, + "loss": 0.4207, + "step": 11007 + }, + { + "epoch": 0.86, + "grad_norm": 1.9247078608840664, + "learning_rate": 4.736254167455473e-07, + "loss": 0.4493, + "step": 11008 + }, + { + "epoch": 0.86, + "grad_norm": 3.6483100345600423, + "learning_rate": 4.730852705032868e-07, + "loss": 0.46, + "step": 11009 + }, + { + "epoch": 0.86, + "grad_norm": 1.7651148539912696, + "learning_rate": 4.725454171464883e-07, + "loss": 0.5046, + "step": 11010 + }, + { + "epoch": 0.86, + "grad_norm": 2.3647511722311183, + "learning_rate": 4.720058567100777e-07, + "loss": 0.501, + "step": 11011 + }, + { + "epoch": 0.86, + "grad_norm": 1.8047554035639717, + "learning_rate": 4.7146658922896607e-07, + "loss": 0.4499, + "step": 11012 + }, + { + "epoch": 0.86, + "grad_norm": 1.5396275407841262, + "learning_rate": 4.709276147380415e-07, + "loss": 0.4186, + "step": 11013 + }, + { + "epoch": 0.86, + "grad_norm": 2.373214488335486, + "learning_rate": 4.7038893327217504e-07, + "loss": 0.4491, + "step": 11014 + }, + { + "epoch": 0.87, + "grad_norm": 1.5752977691237755, + "learning_rate": 4.698505448662194e-07, + "loss": 0.4068, + "step": 11015 + }, + { + "epoch": 0.87, + "grad_norm": 1.6235367153773126, + "learning_rate": 4.693124495550072e-07, + "loss": 0.4333, + "step": 11016 + }, + { + "epoch": 0.87, + "grad_norm": 1.9077259348158078, + "learning_rate": 4.6877464737335344e-07, + "loss": 0.4533, + "step": 11017 + }, + { + "epoch": 0.87, + "grad_norm": 2.0716259787283136, + "learning_rate": 4.682371383560508e-07, + "loss": 0.4502, + "step": 11018 + }, + { + "epoch": 0.87, + "grad_norm": 8.23668520906571, + "learning_rate": 4.6769992253787865e-07, + "loss": 0.4499, + "step": 11019 + }, + { + "epoch": 0.87, + "grad_norm": 1.8833382083150905, + "learning_rate": 4.671629999535915e-07, + "loss": 0.4457, + "step": 11020 + }, + { + "epoch": 0.87, + "grad_norm": 1.9997230258626286, + "learning_rate": 4.666263706379287e-07, + "loss": 0.4033, + "step": 11021 + }, + { + "epoch": 0.87, + "grad_norm": 0.5658059775292958, + "learning_rate": 4.660900346256098e-07, + "loss": 0.462, + "step": 11022 + }, + { + "epoch": 0.87, + "grad_norm": 1.964958672410041, + "learning_rate": 4.655539919513347e-07, + "loss": 0.519, + "step": 11023 + }, + { + "epoch": 0.87, + "grad_norm": 1.9088727839339725, + "learning_rate": 4.65018242649784e-07, + "loss": 0.4605, + "step": 11024 + }, + { + "epoch": 0.87, + "grad_norm": 2.1864388778310757, + "learning_rate": 4.6448278675562066e-07, + "loss": 0.4195, + "step": 11025 + }, + { + "epoch": 0.87, + "grad_norm": 2.6351581907831445, + "learning_rate": 4.639476243034874e-07, + "loss": 0.4386, + "step": 11026 + }, + { + "epoch": 0.87, + "grad_norm": 2.464644086493101, + "learning_rate": 4.6341275532800936e-07, + "loss": 0.4323, + "step": 11027 + }, + { + "epoch": 0.87, + "grad_norm": 1.7431147858647225, + "learning_rate": 4.628781798637921e-07, + "loss": 0.4388, + "step": 11028 + }, + { + "epoch": 0.87, + "grad_norm": 1.7435532121922386, + "learning_rate": 4.623438979454203e-07, + "loss": 0.4644, + "step": 11029 + }, + { + "epoch": 0.87, + "grad_norm": 0.531897520069842, + "learning_rate": 4.618099096074624e-07, + "loss": 0.4585, + "step": 11030 + }, + { + "epoch": 0.87, + "grad_norm": 0.548102434405965, + "learning_rate": 4.612762148844668e-07, + "loss": 0.4373, + "step": 11031 + }, + { + "epoch": 0.87, + "grad_norm": 1.5863252569500517, + "learning_rate": 4.607428138109632e-07, + "loss": 0.4799, + "step": 11032 + }, + { + "epoch": 0.87, + "grad_norm": 1.5341649658252945, + "learning_rate": 4.602097064214611e-07, + "loss": 0.4108, + "step": 11033 + }, + { + "epoch": 0.87, + "grad_norm": 1.8485506589368792, + "learning_rate": 4.596768927504519e-07, + "loss": 0.4746, + "step": 11034 + }, + { + "epoch": 0.87, + "grad_norm": 1.406488734472628, + "learning_rate": 4.5914437283240797e-07, + "loss": 0.4303, + "step": 11035 + }, + { + "epoch": 0.87, + "grad_norm": 2.1023013369516166, + "learning_rate": 4.5861214670178454e-07, + "loss": 0.4305, + "step": 11036 + }, + { + "epoch": 0.87, + "grad_norm": 1.371459638917895, + "learning_rate": 4.5808021439301187e-07, + "loss": 0.4348, + "step": 11037 + }, + { + "epoch": 0.87, + "grad_norm": 2.7219818569349385, + "learning_rate": 4.5754857594050905e-07, + "loss": 0.4844, + "step": 11038 + }, + { + "epoch": 0.87, + "grad_norm": 1.3413849910636684, + "learning_rate": 4.570172313786719e-07, + "loss": 0.4409, + "step": 11039 + }, + { + "epoch": 0.87, + "grad_norm": 1.7792495385356395, + "learning_rate": 4.5648618074187576e-07, + "loss": 0.4492, + "step": 11040 + }, + { + "epoch": 0.87, + "grad_norm": 1.809878302252569, + "learning_rate": 4.5595542406448037e-07, + "loss": 0.4651, + "step": 11041 + }, + { + "epoch": 0.87, + "grad_norm": 1.5277547249318906, + "learning_rate": 4.5542496138082483e-07, + "loss": 0.4039, + "step": 11042 + }, + { + "epoch": 0.87, + "grad_norm": 2.2756840290280387, + "learning_rate": 4.5489479272522954e-07, + "loss": 0.4653, + "step": 11043 + }, + { + "epoch": 0.87, + "grad_norm": 2.234351622996232, + "learning_rate": 4.5436491813199537e-07, + "loss": 0.4686, + "step": 11044 + }, + { + "epoch": 0.87, + "grad_norm": 1.4289956882369947, + "learning_rate": 4.538353376354038e-07, + "loss": 0.4557, + "step": 11045 + }, + { + "epoch": 0.87, + "grad_norm": 1.4691577427748748, + "learning_rate": 4.5330605126971906e-07, + "loss": 0.4013, + "step": 11046 + }, + { + "epoch": 0.87, + "grad_norm": 4.75912939774734, + "learning_rate": 4.52777059069186e-07, + "loss": 0.5048, + "step": 11047 + }, + { + "epoch": 0.87, + "grad_norm": 1.631659226972406, + "learning_rate": 4.5224836106802725e-07, + "loss": 0.4171, + "step": 11048 + }, + { + "epoch": 0.87, + "grad_norm": 0.5459743177321604, + "learning_rate": 4.5171995730045103e-07, + "loss": 0.4701, + "step": 11049 + }, + { + "epoch": 0.87, + "grad_norm": 1.8045602501803502, + "learning_rate": 4.5119184780064326e-07, + "loss": 0.439, + "step": 11050 + }, + { + "epoch": 0.87, + "grad_norm": 2.2593935877446603, + "learning_rate": 4.5066403260277226e-07, + "loss": 0.4548, + "step": 11051 + }, + { + "epoch": 0.87, + "grad_norm": 1.7272508472345955, + "learning_rate": 4.5013651174098837e-07, + "loss": 0.4533, + "step": 11052 + }, + { + "epoch": 0.87, + "grad_norm": 2.5700512231630976, + "learning_rate": 4.496092852494183e-07, + "loss": 0.4275, + "step": 11053 + }, + { + "epoch": 0.87, + "grad_norm": 1.7779662086750219, + "learning_rate": 4.490823531621763e-07, + "loss": 0.5003, + "step": 11054 + }, + { + "epoch": 0.87, + "grad_norm": 1.755303279194439, + "learning_rate": 4.485557155133524e-07, + "loss": 0.4891, + "step": 11055 + }, + { + "epoch": 0.87, + "grad_norm": 1.3269153614416394, + "learning_rate": 4.480293723370188e-07, + "loss": 0.3829, + "step": 11056 + }, + { + "epoch": 0.87, + "grad_norm": 0.5321619909982683, + "learning_rate": 4.4750332366723103e-07, + "loss": 0.463, + "step": 11057 + }, + { + "epoch": 0.87, + "grad_norm": 1.7386251950915632, + "learning_rate": 4.4697756953802296e-07, + "loss": 0.451, + "step": 11058 + }, + { + "epoch": 0.87, + "grad_norm": 2.1384207812147475, + "learning_rate": 4.464521099834096e-07, + "loss": 0.5124, + "step": 11059 + }, + { + "epoch": 0.87, + "grad_norm": 0.5470255194597082, + "learning_rate": 4.4592694503738767e-07, + "loss": 0.4803, + "step": 11060 + }, + { + "epoch": 0.87, + "grad_norm": 1.8946665068640247, + "learning_rate": 4.4540207473393495e-07, + "loss": 0.478, + "step": 11061 + }, + { + "epoch": 0.87, + "grad_norm": 1.9282180672194968, + "learning_rate": 4.4487749910700983e-07, + "loss": 0.4003, + "step": 11062 + }, + { + "epoch": 0.87, + "grad_norm": 5.435205291567352, + "learning_rate": 4.44353218190553e-07, + "loss": 0.4354, + "step": 11063 + }, + { + "epoch": 0.87, + "grad_norm": 1.8126858841584292, + "learning_rate": 4.438292320184817e-07, + "loss": 0.3953, + "step": 11064 + }, + { + "epoch": 0.87, + "grad_norm": 1.747794806915221, + "learning_rate": 4.4330554062469944e-07, + "loss": 0.5295, + "step": 11065 + }, + { + "epoch": 0.87, + "grad_norm": 1.9048302633146534, + "learning_rate": 4.427821440430879e-07, + "loss": 0.4653, + "step": 11066 + }, + { + "epoch": 0.87, + "grad_norm": 1.4801072815876237, + "learning_rate": 4.422590423075107e-07, + "loss": 0.4039, + "step": 11067 + }, + { + "epoch": 0.87, + "grad_norm": 2.2195304546967476, + "learning_rate": 4.417362354518101e-07, + "loss": 0.4236, + "step": 11068 + }, + { + "epoch": 0.87, + "grad_norm": 2.03114532722095, + "learning_rate": 4.4121372350981237e-07, + "loss": 0.4227, + "step": 11069 + }, + { + "epoch": 0.87, + "grad_norm": 1.5380196553927141, + "learning_rate": 4.406915065153233e-07, + "loss": 0.4577, + "step": 11070 + }, + { + "epoch": 0.87, + "grad_norm": 0.5299973016113624, + "learning_rate": 4.401695845021298e-07, + "loss": 0.4727, + "step": 11071 + }, + { + "epoch": 0.87, + "grad_norm": 2.1166077012706555, + "learning_rate": 4.3964795750399746e-07, + "loss": 0.4643, + "step": 11072 + }, + { + "epoch": 0.87, + "grad_norm": 1.9628230671405245, + "learning_rate": 4.3912662555467775e-07, + "loss": 0.4022, + "step": 11073 + }, + { + "epoch": 0.87, + "grad_norm": 2.428174727125141, + "learning_rate": 4.386055886878998e-07, + "loss": 0.4568, + "step": 11074 + }, + { + "epoch": 0.87, + "grad_norm": 1.5599697611706638, + "learning_rate": 4.380848469373722e-07, + "loss": 0.4212, + "step": 11075 + }, + { + "epoch": 0.87, + "grad_norm": 2.3493566910356547, + "learning_rate": 4.375644003367874e-07, + "loss": 0.4529, + "step": 11076 + }, + { + "epoch": 0.87, + "grad_norm": 0.5665960389249406, + "learning_rate": 4.370442489198179e-07, + "loss": 0.4971, + "step": 11077 + }, + { + "epoch": 0.87, + "grad_norm": 1.640238378215373, + "learning_rate": 4.365243927201168e-07, + "loss": 0.4231, + "step": 11078 + }, + { + "epoch": 0.87, + "grad_norm": 2.0784165762985, + "learning_rate": 4.360048317713167e-07, + "loss": 0.4206, + "step": 11079 + }, + { + "epoch": 0.87, + "grad_norm": 1.783943435077842, + "learning_rate": 4.3548556610703394e-07, + "loss": 0.4323, + "step": 11080 + }, + { + "epoch": 0.87, + "grad_norm": 0.5721848985811098, + "learning_rate": 4.349665957608634e-07, + "loss": 0.4869, + "step": 11081 + }, + { + "epoch": 0.87, + "grad_norm": 1.7403417423485206, + "learning_rate": 4.3444792076638377e-07, + "loss": 0.4733, + "step": 11082 + }, + { + "epoch": 0.87, + "grad_norm": 1.5093830256977612, + "learning_rate": 4.339295411571498e-07, + "loss": 0.4959, + "step": 11083 + }, + { + "epoch": 0.87, + "grad_norm": 1.5818654274557407, + "learning_rate": 4.3341145696670086e-07, + "loss": 0.4678, + "step": 11084 + }, + { + "epoch": 0.87, + "grad_norm": 1.467640757569211, + "learning_rate": 4.3289366822855837e-07, + "loss": 0.4348, + "step": 11085 + }, + { + "epoch": 0.87, + "grad_norm": 2.107402953373491, + "learning_rate": 4.323761749762201e-07, + "loss": 0.4847, + "step": 11086 + }, + { + "epoch": 0.87, + "grad_norm": 2.0434247241650736, + "learning_rate": 4.318589772431686e-07, + "loss": 0.4694, + "step": 11087 + }, + { + "epoch": 0.87, + "grad_norm": 1.5864197595826266, + "learning_rate": 4.313420750628644e-07, + "loss": 0.4528, + "step": 11088 + }, + { + "epoch": 0.87, + "grad_norm": 0.5747658943338664, + "learning_rate": 4.308254684687524e-07, + "loss": 0.4639, + "step": 11089 + }, + { + "epoch": 0.87, + "grad_norm": 2.1738253868782342, + "learning_rate": 4.3030915749425527e-07, + "loss": 0.4832, + "step": 11090 + }, + { + "epoch": 0.87, + "grad_norm": 0.5485963096847947, + "learning_rate": 4.2979314217277746e-07, + "loss": 0.4454, + "step": 11091 + }, + { + "epoch": 0.87, + "grad_norm": 1.4513759898879213, + "learning_rate": 4.292774225377044e-07, + "loss": 0.4551, + "step": 11092 + }, + { + "epoch": 0.87, + "grad_norm": 2.359294888208332, + "learning_rate": 4.2876199862240397e-07, + "loss": 0.4143, + "step": 11093 + }, + { + "epoch": 0.87, + "grad_norm": 2.055505843926869, + "learning_rate": 4.282468704602216e-07, + "loss": 0.4714, + "step": 11094 + }, + { + "epoch": 0.87, + "grad_norm": 1.7351728595391682, + "learning_rate": 4.2773203808448573e-07, + "loss": 0.4229, + "step": 11095 + }, + { + "epoch": 0.87, + "grad_norm": 1.842195055628895, + "learning_rate": 4.272175015285057e-07, + "loss": 0.4131, + "step": 11096 + }, + { + "epoch": 0.87, + "grad_norm": 1.7882111978495752, + "learning_rate": 4.2670326082557167e-07, + "loss": 0.4603, + "step": 11097 + }, + { + "epoch": 0.87, + "grad_norm": 1.7327045972385173, + "learning_rate": 4.261893160089553e-07, + "loss": 0.4475, + "step": 11098 + }, + { + "epoch": 0.87, + "grad_norm": 2.650102187519729, + "learning_rate": 4.2567566711190554e-07, + "loss": 0.4435, + "step": 11099 + }, + { + "epoch": 0.87, + "grad_norm": 1.7589804294691203, + "learning_rate": 4.2516231416765583e-07, + "loss": 0.446, + "step": 11100 + }, + { + "epoch": 0.87, + "grad_norm": 1.8431037181242336, + "learning_rate": 4.246492572094202e-07, + "loss": 0.4329, + "step": 11101 + }, + { + "epoch": 0.87, + "grad_norm": 0.5659828880472241, + "learning_rate": 4.241364962703931e-07, + "loss": 0.4797, + "step": 11102 + }, + { + "epoch": 0.87, + "grad_norm": 1.8298914890583236, + "learning_rate": 4.236240313837475e-07, + "loss": 0.4847, + "step": 11103 + }, + { + "epoch": 0.87, + "grad_norm": 1.9504322843087247, + "learning_rate": 4.231118625826408e-07, + "loss": 0.4555, + "step": 11104 + }, + { + "epoch": 0.87, + "grad_norm": 1.527842224653605, + "learning_rate": 4.2259998990020925e-07, + "loss": 0.4283, + "step": 11105 + }, + { + "epoch": 0.87, + "grad_norm": 1.9166044470507195, + "learning_rate": 4.2208841336957075e-07, + "loss": 0.4529, + "step": 11106 + }, + { + "epoch": 0.87, + "grad_norm": 2.098630572928746, + "learning_rate": 4.2157713302382277e-07, + "loss": 0.422, + "step": 11107 + }, + { + "epoch": 0.87, + "grad_norm": 1.8797270150576728, + "learning_rate": 4.210661488960449e-07, + "loss": 0.4424, + "step": 11108 + }, + { + "epoch": 0.87, + "grad_norm": 1.4261053134837747, + "learning_rate": 4.20555461019298e-07, + "loss": 0.3732, + "step": 11109 + }, + { + "epoch": 0.87, + "grad_norm": 0.5512720004015147, + "learning_rate": 4.200450694266212e-07, + "loss": 0.4631, + "step": 11110 + }, + { + "epoch": 0.87, + "grad_norm": 1.5860907932607615, + "learning_rate": 4.195349741510374e-07, + "loss": 0.4602, + "step": 11111 + }, + { + "epoch": 0.87, + "grad_norm": 1.8385758329757602, + "learning_rate": 4.1902517522554866e-07, + "loss": 0.4846, + "step": 11112 + }, + { + "epoch": 0.87, + "grad_norm": 1.5236980189463631, + "learning_rate": 4.185156726831391e-07, + "loss": 0.4259, + "step": 11113 + }, + { + "epoch": 0.87, + "grad_norm": 1.6843359058832088, + "learning_rate": 4.180064665567718e-07, + "loss": 0.4068, + "step": 11114 + }, + { + "epoch": 0.87, + "grad_norm": 0.5489712408136727, + "learning_rate": 4.174975568793915e-07, + "loss": 0.4696, + "step": 11115 + }, + { + "epoch": 0.87, + "grad_norm": 1.714910879588225, + "learning_rate": 4.1698894368392527e-07, + "loss": 0.4368, + "step": 11116 + }, + { + "epoch": 0.87, + "grad_norm": 2.065725231462918, + "learning_rate": 4.1648062700327953e-07, + "loss": 0.4302, + "step": 11117 + }, + { + "epoch": 0.87, + "grad_norm": 2.6895355850815235, + "learning_rate": 4.1597260687034013e-07, + "loss": 0.4604, + "step": 11118 + }, + { + "epoch": 0.87, + "grad_norm": 2.3833732998140564, + "learning_rate": 4.1546488331797587e-07, + "loss": 0.4258, + "step": 11119 + }, + { + "epoch": 0.87, + "grad_norm": 3.4675202470914415, + "learning_rate": 4.149574563790382e-07, + "loss": 0.4398, + "step": 11120 + }, + { + "epoch": 0.87, + "grad_norm": 1.9140230969136391, + "learning_rate": 4.144503260863536e-07, + "loss": 0.4241, + "step": 11121 + }, + { + "epoch": 0.87, + "grad_norm": 1.6376183652564233, + "learning_rate": 4.139434924727359e-07, + "loss": 0.4102, + "step": 11122 + }, + { + "epoch": 0.87, + "grad_norm": 0.5572895620921952, + "learning_rate": 4.1343695557097273e-07, + "loss": 0.4689, + "step": 11123 + }, + { + "epoch": 0.87, + "grad_norm": 1.7408529025844286, + "learning_rate": 4.129307154138401e-07, + "loss": 0.4504, + "step": 11124 + }, + { + "epoch": 0.87, + "grad_norm": 2.098053940959106, + "learning_rate": 4.1242477203408904e-07, + "loss": 0.5142, + "step": 11125 + }, + { + "epoch": 0.87, + "grad_norm": 0.561504258885627, + "learning_rate": 4.119191254644539e-07, + "loss": 0.474, + "step": 11126 + }, + { + "epoch": 0.87, + "grad_norm": 3.8280724743599834, + "learning_rate": 4.1141377573764906e-07, + "loss": 0.4469, + "step": 11127 + }, + { + "epoch": 0.87, + "grad_norm": 1.7690310674729592, + "learning_rate": 4.1090872288637174e-07, + "loss": 0.4564, + "step": 11128 + }, + { + "epoch": 0.87, + "grad_norm": 0.5371923263062188, + "learning_rate": 4.1040396694329576e-07, + "loss": 0.4641, + "step": 11129 + }, + { + "epoch": 0.87, + "grad_norm": 1.7144337657109012, + "learning_rate": 4.098995079410789e-07, + "loss": 0.4896, + "step": 11130 + }, + { + "epoch": 0.87, + "grad_norm": 0.5535176915037792, + "learning_rate": 4.093953459123595e-07, + "loss": 0.4849, + "step": 11131 + }, + { + "epoch": 0.87, + "grad_norm": 1.6974414781112153, + "learning_rate": 4.088914808897554e-07, + "loss": 0.3875, + "step": 11132 + }, + { + "epoch": 0.87, + "grad_norm": 1.4327025747543147, + "learning_rate": 4.083879129058682e-07, + "loss": 0.4006, + "step": 11133 + }, + { + "epoch": 0.87, + "grad_norm": 2.531642427612457, + "learning_rate": 4.078846419932753e-07, + "loss": 0.4116, + "step": 11134 + }, + { + "epoch": 0.87, + "grad_norm": 1.3440089981440855, + "learning_rate": 4.0738166818453883e-07, + "loss": 0.4336, + "step": 11135 + }, + { + "epoch": 0.87, + "grad_norm": 2.1745102060302384, + "learning_rate": 4.0687899151220013e-07, + "loss": 0.487, + "step": 11136 + }, + { + "epoch": 0.87, + "grad_norm": 1.8795258286131635, + "learning_rate": 4.0637661200878363e-07, + "loss": 0.3881, + "step": 11137 + }, + { + "epoch": 0.87, + "grad_norm": 1.880244844636084, + "learning_rate": 4.0587452970678953e-07, + "loss": 0.4214, + "step": 11138 + }, + { + "epoch": 0.87, + "grad_norm": 1.4795103430022716, + "learning_rate": 4.05372744638704e-07, + "loss": 0.4745, + "step": 11139 + }, + { + "epoch": 0.87, + "grad_norm": 4.495783190228966, + "learning_rate": 4.048712568369911e-07, + "loss": 0.4568, + "step": 11140 + }, + { + "epoch": 0.87, + "grad_norm": 2.0233661883997396, + "learning_rate": 4.0437006633409715e-07, + "loss": 0.4748, + "step": 11141 + }, + { + "epoch": 0.88, + "grad_norm": 0.49765040198675514, + "learning_rate": 4.038691731624478e-07, + "loss": 0.4688, + "step": 11142 + }, + { + "epoch": 0.88, + "grad_norm": 2.3885208219486485, + "learning_rate": 4.0336857735445043e-07, + "loss": 0.4363, + "step": 11143 + }, + { + "epoch": 0.88, + "grad_norm": 2.017277712770393, + "learning_rate": 4.028682789424937e-07, + "loss": 0.4493, + "step": 11144 + }, + { + "epoch": 0.88, + "grad_norm": 1.5870012594064757, + "learning_rate": 4.0236827795894497e-07, + "loss": 0.4356, + "step": 11145 + }, + { + "epoch": 0.88, + "grad_norm": 1.4991139979388177, + "learning_rate": 4.018685744361539e-07, + "loss": 0.4765, + "step": 11146 + }, + { + "epoch": 0.88, + "grad_norm": 2.0035029807323346, + "learning_rate": 4.0136916840645077e-07, + "loss": 0.4185, + "step": 11147 + }, + { + "epoch": 0.88, + "grad_norm": 1.8553420701018526, + "learning_rate": 4.0087005990214813e-07, + "loss": 0.4239, + "step": 11148 + }, + { + "epoch": 0.88, + "grad_norm": 1.6431159990692625, + "learning_rate": 4.00371248955535e-07, + "loss": 0.4547, + "step": 11149 + }, + { + "epoch": 0.88, + "grad_norm": 2.006932616398925, + "learning_rate": 3.998727355988852e-07, + "loss": 0.4364, + "step": 11150 + }, + { + "epoch": 0.88, + "grad_norm": 1.6588544935478375, + "learning_rate": 3.9937451986445106e-07, + "loss": 0.4341, + "step": 11151 + }, + { + "epoch": 0.88, + "grad_norm": 1.607829236175498, + "learning_rate": 3.9887660178446753e-07, + "loss": 0.4366, + "step": 11152 + }, + { + "epoch": 0.88, + "grad_norm": 1.6364493090416499, + "learning_rate": 3.983789813911498e-07, + "loss": 0.4478, + "step": 11153 + }, + { + "epoch": 0.88, + "grad_norm": 1.6423815707995204, + "learning_rate": 3.978816587166906e-07, + "loss": 0.4389, + "step": 11154 + }, + { + "epoch": 0.88, + "grad_norm": 1.5917284737552029, + "learning_rate": 3.9738463379326965e-07, + "loss": 0.4257, + "step": 11155 + }, + { + "epoch": 0.88, + "grad_norm": 2.8601407457175227, + "learning_rate": 3.968879066530407e-07, + "loss": 0.4395, + "step": 11156 + }, + { + "epoch": 0.88, + "grad_norm": 1.71648421522421, + "learning_rate": 3.9639147732814356e-07, + "loss": 0.4904, + "step": 11157 + }, + { + "epoch": 0.88, + "grad_norm": 2.7223844515753464, + "learning_rate": 3.9589534585069366e-07, + "loss": 0.3652, + "step": 11158 + }, + { + "epoch": 0.88, + "grad_norm": 2.2905526601579997, + "learning_rate": 3.9539951225279374e-07, + "loss": 0.4261, + "step": 11159 + }, + { + "epoch": 0.88, + "grad_norm": 2.9678126799641222, + "learning_rate": 3.949039765665208e-07, + "loss": 0.4459, + "step": 11160 + }, + { + "epoch": 0.88, + "grad_norm": 1.7322365804689541, + "learning_rate": 3.944087388239376e-07, + "loss": 0.4157, + "step": 11161 + }, + { + "epoch": 0.88, + "grad_norm": 1.429809090981671, + "learning_rate": 3.9391379905708237e-07, + "loss": 0.4199, + "step": 11162 + }, + { + "epoch": 0.88, + "grad_norm": 0.5127176248970764, + "learning_rate": 3.934191572979795e-07, + "loss": 0.4496, + "step": 11163 + }, + { + "epoch": 0.88, + "grad_norm": 0.5215306684772689, + "learning_rate": 3.9292481357863175e-07, + "loss": 0.445, + "step": 11164 + }, + { + "epoch": 0.88, + "grad_norm": 0.5355196048807527, + "learning_rate": 3.9243076793102077e-07, + "loss": 0.4505, + "step": 11165 + }, + { + "epoch": 0.88, + "grad_norm": 1.9755860829691727, + "learning_rate": 3.91937020387112e-07, + "loss": 0.4156, + "step": 11166 + }, + { + "epoch": 0.88, + "grad_norm": 1.770825133660042, + "learning_rate": 3.9144357097885e-07, + "loss": 0.4661, + "step": 11167 + }, + { + "epoch": 0.88, + "grad_norm": 1.6167660160888846, + "learning_rate": 3.909504197381608e-07, + "loss": 0.4447, + "step": 11168 + }, + { + "epoch": 0.88, + "grad_norm": 2.168457025153742, + "learning_rate": 3.90457566696949e-07, + "loss": 0.4483, + "step": 11169 + }, + { + "epoch": 0.88, + "grad_norm": 1.2577243769088193, + "learning_rate": 3.899650118871029e-07, + "loss": 0.428, + "step": 11170 + }, + { + "epoch": 0.88, + "grad_norm": 1.9777920249551537, + "learning_rate": 3.8947275534048977e-07, + "loss": 0.4255, + "step": 11171 + }, + { + "epoch": 0.88, + "grad_norm": 0.5827854011512574, + "learning_rate": 3.8898079708895917e-07, + "loss": 0.481, + "step": 11172 + }, + { + "epoch": 0.88, + "grad_norm": 1.6881681193543336, + "learning_rate": 3.884891371643379e-07, + "loss": 0.4776, + "step": 11173 + }, + { + "epoch": 0.88, + "grad_norm": 1.9202001014464545, + "learning_rate": 3.87997775598436e-07, + "loss": 0.4255, + "step": 11174 + }, + { + "epoch": 0.88, + "grad_norm": 3.799322529494823, + "learning_rate": 3.8750671242304636e-07, + "loss": 0.4953, + "step": 11175 + }, + { + "epoch": 0.88, + "grad_norm": 1.5274787062340993, + "learning_rate": 3.870159476699381e-07, + "loss": 0.4343, + "step": 11176 + }, + { + "epoch": 0.88, + "grad_norm": 0.5083134779134512, + "learning_rate": 3.8652548137086297e-07, + "loss": 0.4482, + "step": 11177 + }, + { + "epoch": 0.88, + "grad_norm": 1.8203742842364898, + "learning_rate": 3.8603531355755454e-07, + "loss": 0.4405, + "step": 11178 + }, + { + "epoch": 0.88, + "grad_norm": 1.9579317628937916, + "learning_rate": 3.8554544426172566e-07, + "loss": 0.4467, + "step": 11179 + }, + { + "epoch": 0.88, + "grad_norm": 0.5503509841144032, + "learning_rate": 3.8505587351507003e-07, + "loss": 0.475, + "step": 11180 + }, + { + "epoch": 0.88, + "grad_norm": 2.1232782654738944, + "learning_rate": 3.845666013492616e-07, + "loss": 0.4173, + "step": 11181 + }, + { + "epoch": 0.88, + "grad_norm": 2.210604519155103, + "learning_rate": 3.8407762779595623e-07, + "loss": 0.4128, + "step": 11182 + }, + { + "epoch": 0.88, + "grad_norm": 1.7613806451425842, + "learning_rate": 3.835889528867914e-07, + "loss": 0.4544, + "step": 11183 + }, + { + "epoch": 0.88, + "grad_norm": 2.7304040712443616, + "learning_rate": 3.831005766533813e-07, + "loss": 0.4118, + "step": 11184 + }, + { + "epoch": 0.88, + "grad_norm": 1.467988302086924, + "learning_rate": 3.826124991273239e-07, + "loss": 0.4517, + "step": 11185 + }, + { + "epoch": 0.88, + "grad_norm": 1.4506887612500328, + "learning_rate": 3.8212472034019787e-07, + "loss": 0.4643, + "step": 11186 + }, + { + "epoch": 0.88, + "grad_norm": 2.19762559112552, + "learning_rate": 3.816372403235613e-07, + "loss": 0.4216, + "step": 11187 + }, + { + "epoch": 0.88, + "grad_norm": 0.5428591222245889, + "learning_rate": 3.8115005910895453e-07, + "loss": 0.4696, + "step": 11188 + }, + { + "epoch": 0.88, + "grad_norm": 1.6709501256682693, + "learning_rate": 3.806631767278951e-07, + "loss": 0.4457, + "step": 11189 + }, + { + "epoch": 0.88, + "grad_norm": 2.347818018623209, + "learning_rate": 3.801765932118873e-07, + "loss": 0.3829, + "step": 11190 + }, + { + "epoch": 0.88, + "grad_norm": 2.134665074654038, + "learning_rate": 3.7969030859240917e-07, + "loss": 0.4179, + "step": 11191 + }, + { + "epoch": 0.88, + "grad_norm": 1.9096282455278326, + "learning_rate": 3.792043229009246e-07, + "loss": 0.4159, + "step": 11192 + }, + { + "epoch": 0.88, + "grad_norm": 1.8413833469122418, + "learning_rate": 3.787186361688744e-07, + "loss": 0.4563, + "step": 11193 + }, + { + "epoch": 0.88, + "grad_norm": 1.9950115920133202, + "learning_rate": 3.7823324842768403e-07, + "loss": 0.5158, + "step": 11194 + }, + { + "epoch": 0.88, + "grad_norm": 2.31672365372418, + "learning_rate": 3.7774815970875624e-07, + "loss": 0.4626, + "step": 11195 + }, + { + "epoch": 0.88, + "grad_norm": 2.528051575456564, + "learning_rate": 3.772633700434769e-07, + "loss": 0.4833, + "step": 11196 + }, + { + "epoch": 0.88, + "grad_norm": 3.2679686430784325, + "learning_rate": 3.7677887946320767e-07, + "loss": 0.4963, + "step": 11197 + }, + { + "epoch": 0.88, + "grad_norm": 2.0822057107203524, + "learning_rate": 3.7629468799929845e-07, + "loss": 0.4516, + "step": 11198 + }, + { + "epoch": 0.88, + "grad_norm": 2.2584542499398155, + "learning_rate": 3.758107956830753e-07, + "loss": 0.458, + "step": 11199 + }, + { + "epoch": 0.88, + "grad_norm": 2.05002080538384, + "learning_rate": 3.753272025458432e-07, + "loss": 0.4493, + "step": 11200 + }, + { + "epoch": 0.88, + "grad_norm": 1.3085742794440849, + "learning_rate": 3.7484390861889153e-07, + "loss": 0.3982, + "step": 11201 + }, + { + "epoch": 0.88, + "grad_norm": 2.4998934254205554, + "learning_rate": 3.743609139334886e-07, + "loss": 0.479, + "step": 11202 + }, + { + "epoch": 0.88, + "grad_norm": 3.0320732209662187, + "learning_rate": 3.738782185208839e-07, + "loss": 0.4924, + "step": 11203 + }, + { + "epoch": 0.88, + "grad_norm": 2.8210167891000517, + "learning_rate": 3.7339582241230634e-07, + "loss": 0.4543, + "step": 11204 + }, + { + "epoch": 0.88, + "grad_norm": 2.3772200798027945, + "learning_rate": 3.7291372563896643e-07, + "loss": 0.4402, + "step": 11205 + }, + { + "epoch": 0.88, + "grad_norm": 1.4829788465856155, + "learning_rate": 3.7243192823205544e-07, + "loss": 0.4741, + "step": 11206 + }, + { + "epoch": 0.88, + "grad_norm": 2.1363770255691312, + "learning_rate": 3.719504302227461e-07, + "loss": 0.414, + "step": 11207 + }, + { + "epoch": 0.88, + "grad_norm": 0.5055676598897723, + "learning_rate": 3.714692316421886e-07, + "loss": 0.4652, + "step": 11208 + }, + { + "epoch": 0.88, + "grad_norm": 0.5650562037343111, + "learning_rate": 3.7098833252151633e-07, + "loss": 0.4449, + "step": 11209 + }, + { + "epoch": 0.88, + "grad_norm": 2.053416737057283, + "learning_rate": 3.7050773289184494e-07, + "loss": 0.4279, + "step": 11210 + }, + { + "epoch": 0.88, + "grad_norm": 1.9053979519239272, + "learning_rate": 3.700274327842662e-07, + "loss": 0.4762, + "step": 11211 + }, + { + "epoch": 0.88, + "grad_norm": 2.131708930989026, + "learning_rate": 3.6954743222985644e-07, + "loss": 0.4525, + "step": 11212 + }, + { + "epoch": 0.88, + "grad_norm": 2.9492309929440115, + "learning_rate": 3.6906773125966963e-07, + "loss": 0.4571, + "step": 11213 + }, + { + "epoch": 0.88, + "grad_norm": 2.2333894648454606, + "learning_rate": 3.685883299047438e-07, + "loss": 0.4187, + "step": 11214 + }, + { + "epoch": 0.88, + "grad_norm": 1.80297110438348, + "learning_rate": 3.681092281960935e-07, + "loss": 0.4388, + "step": 11215 + }, + { + "epoch": 0.88, + "grad_norm": 1.6756412323669294, + "learning_rate": 3.676304261647168e-07, + "loss": 0.4211, + "step": 11216 + }, + { + "epoch": 0.88, + "grad_norm": 2.1468896816334526, + "learning_rate": 3.671519238415916e-07, + "loss": 0.4483, + "step": 11217 + }, + { + "epoch": 0.88, + "grad_norm": 1.9665495799666641, + "learning_rate": 3.666737212576771e-07, + "loss": 0.4362, + "step": 11218 + }, + { + "epoch": 0.88, + "grad_norm": 1.5084502946220684, + "learning_rate": 3.661958184439113e-07, + "loss": 0.3862, + "step": 11219 + }, + { + "epoch": 0.88, + "grad_norm": 3.247172816702885, + "learning_rate": 3.657182154312139e-07, + "loss": 0.5086, + "step": 11220 + }, + { + "epoch": 0.88, + "grad_norm": 1.4733995805753137, + "learning_rate": 3.6524091225048573e-07, + "loss": 0.4426, + "step": 11221 + }, + { + "epoch": 0.88, + "grad_norm": 1.6741091252571123, + "learning_rate": 3.647639089326072e-07, + "loss": 0.4142, + "step": 11222 + }, + { + "epoch": 0.88, + "grad_norm": 1.9150597790588353, + "learning_rate": 3.6428720550844123e-07, + "loss": 0.4377, + "step": 11223 + }, + { + "epoch": 0.88, + "grad_norm": 0.5591527812479657, + "learning_rate": 3.638108020088271e-07, + "loss": 0.4739, + "step": 11224 + }, + { + "epoch": 0.88, + "grad_norm": 0.5785559033991536, + "learning_rate": 3.633346984645908e-07, + "loss": 0.4709, + "step": 11225 + }, + { + "epoch": 0.88, + "grad_norm": 1.9393217086635213, + "learning_rate": 3.6285889490653317e-07, + "loss": 0.5065, + "step": 11226 + }, + { + "epoch": 0.88, + "grad_norm": 2.0750015961136645, + "learning_rate": 3.623833913654401e-07, + "loss": 0.3872, + "step": 11227 + }, + { + "epoch": 0.88, + "grad_norm": 1.846531809293167, + "learning_rate": 3.619081878720726e-07, + "loss": 0.4624, + "step": 11228 + }, + { + "epoch": 0.88, + "grad_norm": 3.189120182490041, + "learning_rate": 3.6143328445718e-07, + "loss": 0.4669, + "step": 11229 + }, + { + "epoch": 0.88, + "grad_norm": 1.8147823092363453, + "learning_rate": 3.609586811514848e-07, + "loss": 0.4665, + "step": 11230 + }, + { + "epoch": 0.88, + "grad_norm": 1.8559804851063775, + "learning_rate": 3.6048437798569426e-07, + "loss": 0.3933, + "step": 11231 + }, + { + "epoch": 0.88, + "grad_norm": 1.6895121138224627, + "learning_rate": 3.600103749904954e-07, + "loss": 0.403, + "step": 11232 + }, + { + "epoch": 0.88, + "grad_norm": 1.8465626173969103, + "learning_rate": 3.595366721965554e-07, + "loss": 0.467, + "step": 11233 + }, + { + "epoch": 0.88, + "grad_norm": 0.5647456585426883, + "learning_rate": 3.590632696345231e-07, + "loss": 0.4701, + "step": 11234 + }, + { + "epoch": 0.88, + "grad_norm": 2.009013782082817, + "learning_rate": 3.5859016733502504e-07, + "loss": 0.4634, + "step": 11235 + }, + { + "epoch": 0.88, + "grad_norm": 2.586805668342278, + "learning_rate": 3.581173653286718e-07, + "loss": 0.4412, + "step": 11236 + }, + { + "epoch": 0.88, + "grad_norm": 2.030110058531091, + "learning_rate": 3.5764486364605223e-07, + "loss": 0.4486, + "step": 11237 + }, + { + "epoch": 0.88, + "grad_norm": 1.929175117218611, + "learning_rate": 3.571726623177385e-07, + "loss": 0.4102, + "step": 11238 + }, + { + "epoch": 0.88, + "grad_norm": 1.7564217298755755, + "learning_rate": 3.5670076137427847e-07, + "loss": 0.434, + "step": 11239 + }, + { + "epoch": 0.88, + "grad_norm": 1.8330279457739223, + "learning_rate": 3.5622916084620543e-07, + "loss": 0.4643, + "step": 11240 + }, + { + "epoch": 0.88, + "grad_norm": 2.1385356145707313, + "learning_rate": 3.557578607640305e-07, + "loss": 0.4109, + "step": 11241 + }, + { + "epoch": 0.88, + "grad_norm": 1.6829586330125625, + "learning_rate": 3.552868611582466e-07, + "loss": 0.4019, + "step": 11242 + }, + { + "epoch": 0.88, + "grad_norm": 6.521669728485595, + "learning_rate": 3.5481616205932767e-07, + "loss": 0.4217, + "step": 11243 + }, + { + "epoch": 0.88, + "grad_norm": 1.627417484542981, + "learning_rate": 3.5434576349772485e-07, + "loss": 0.5044, + "step": 11244 + }, + { + "epoch": 0.88, + "grad_norm": 1.8035499607896466, + "learning_rate": 3.5387566550387554e-07, + "loss": 0.4421, + "step": 11245 + }, + { + "epoch": 0.88, + "grad_norm": 2.2312701092618523, + "learning_rate": 3.5340586810819144e-07, + "loss": 0.4549, + "step": 11246 + }, + { + "epoch": 0.88, + "grad_norm": 2.8639923601783135, + "learning_rate": 3.5293637134106994e-07, + "loss": 0.4022, + "step": 11247 + }, + { + "epoch": 0.88, + "grad_norm": 1.3906026463714514, + "learning_rate": 3.524671752328857e-07, + "loss": 0.4425, + "step": 11248 + }, + { + "epoch": 0.88, + "grad_norm": 0.5507524293556348, + "learning_rate": 3.5199827981399594e-07, + "loss": 0.4714, + "step": 11249 + }, + { + "epoch": 0.88, + "grad_norm": 4.4334582474343, + "learning_rate": 3.515296851147365e-07, + "loss": 0.4561, + "step": 11250 + }, + { + "epoch": 0.88, + "grad_norm": 1.4977533269852066, + "learning_rate": 3.5106139116542594e-07, + "loss": 0.4253, + "step": 11251 + }, + { + "epoch": 0.88, + "grad_norm": 1.7299628951329051, + "learning_rate": 3.5059339799636163e-07, + "loss": 0.4709, + "step": 11252 + }, + { + "epoch": 0.88, + "grad_norm": 0.6331679750197606, + "learning_rate": 3.501257056378221e-07, + "loss": 0.4863, + "step": 11253 + }, + { + "epoch": 0.88, + "grad_norm": 2.233951295396871, + "learning_rate": 3.496583141200671e-07, + "loss": 0.4302, + "step": 11254 + }, + { + "epoch": 0.88, + "grad_norm": 1.5163712902913478, + "learning_rate": 3.4919122347333525e-07, + "loss": 0.4117, + "step": 11255 + }, + { + "epoch": 0.88, + "grad_norm": 1.801016535145051, + "learning_rate": 3.4872443372784726e-07, + "loss": 0.4446, + "step": 11256 + }, + { + "epoch": 0.88, + "grad_norm": 1.9450585436580055, + "learning_rate": 3.4825794491380405e-07, + "loss": 0.4472, + "step": 11257 + }, + { + "epoch": 0.88, + "grad_norm": 1.6690305201998483, + "learning_rate": 3.47791757061387e-07, + "loss": 0.4567, + "step": 11258 + }, + { + "epoch": 0.88, + "grad_norm": 1.596938399897498, + "learning_rate": 3.4732587020075594e-07, + "loss": 0.4042, + "step": 11259 + }, + { + "epoch": 0.88, + "grad_norm": 1.633037407709579, + "learning_rate": 3.468602843620561e-07, + "loss": 0.4794, + "step": 11260 + }, + { + "epoch": 0.88, + "grad_norm": 1.7956112080581035, + "learning_rate": 3.4639499957540843e-07, + "loss": 0.4679, + "step": 11261 + }, + { + "epoch": 0.88, + "grad_norm": 1.570303892062921, + "learning_rate": 3.4593001587091666e-07, + "loss": 0.4388, + "step": 11262 + }, + { + "epoch": 0.88, + "grad_norm": 0.5750040759752023, + "learning_rate": 3.454653332786634e-07, + "loss": 0.4611, + "step": 11263 + }, + { + "epoch": 0.88, + "grad_norm": 1.9763764169020825, + "learning_rate": 3.4500095182871505e-07, + "loss": 0.4749, + "step": 11264 + }, + { + "epoch": 0.88, + "grad_norm": 1.5271912631586293, + "learning_rate": 3.445368715511166e-07, + "loss": 0.4241, + "step": 11265 + }, + { + "epoch": 0.88, + "grad_norm": 2.3161719424963043, + "learning_rate": 3.4407309247589114e-07, + "loss": 0.4764, + "step": 11266 + }, + { + "epoch": 0.88, + "grad_norm": 2.3171954992292942, + "learning_rate": 3.4360961463304587e-07, + "loss": 0.4532, + "step": 11267 + }, + { + "epoch": 0.88, + "grad_norm": 1.732159638137134, + "learning_rate": 3.4314643805256787e-07, + "loss": 0.4134, + "step": 11268 + }, + { + "epoch": 0.89, + "grad_norm": 1.8425792793175741, + "learning_rate": 3.4268356276442373e-07, + "loss": 0.4409, + "step": 11269 + }, + { + "epoch": 0.89, + "grad_norm": 2.3244454559928642, + "learning_rate": 3.4222098879855946e-07, + "loss": 0.4513, + "step": 11270 + }, + { + "epoch": 0.89, + "grad_norm": 1.84844128783743, + "learning_rate": 3.4175871618490455e-07, + "loss": 0.4285, + "step": 11271 + }, + { + "epoch": 0.89, + "grad_norm": 1.8364989667274885, + "learning_rate": 3.412967449533666e-07, + "loss": 0.4, + "step": 11272 + }, + { + "epoch": 0.89, + "grad_norm": 1.720826969403055, + "learning_rate": 3.408350751338363e-07, + "loss": 0.4577, + "step": 11273 + }, + { + "epoch": 0.89, + "grad_norm": 1.4952967300242275, + "learning_rate": 3.403737067561802e-07, + "loss": 0.4244, + "step": 11274 + }, + { + "epoch": 0.89, + "grad_norm": 1.6792486451478024, + "learning_rate": 3.3991263985025e-07, + "loss": 0.4579, + "step": 11275 + }, + { + "epoch": 0.89, + "grad_norm": 2.405480274448575, + "learning_rate": 3.394518744458758e-07, + "loss": 0.4045, + "step": 11276 + }, + { + "epoch": 0.89, + "grad_norm": 0.5755293694158577, + "learning_rate": 3.389914105728687e-07, + "loss": 0.4711, + "step": 11277 + }, + { + "epoch": 0.89, + "grad_norm": 1.797505304131525, + "learning_rate": 3.385312482610209e-07, + "loss": 0.5049, + "step": 11278 + }, + { + "epoch": 0.89, + "grad_norm": 1.3801923791706727, + "learning_rate": 3.380713875401015e-07, + "loss": 0.4341, + "step": 11279 + }, + { + "epoch": 0.89, + "grad_norm": 1.7562848146804624, + "learning_rate": 3.3761182843986604e-07, + "loss": 0.4261, + "step": 11280 + }, + { + "epoch": 0.89, + "grad_norm": 2.2520444268079496, + "learning_rate": 3.3715257099004586e-07, + "loss": 0.4863, + "step": 11281 + }, + { + "epoch": 0.89, + "grad_norm": 1.6976790780636088, + "learning_rate": 3.3669361522035426e-07, + "loss": 0.4118, + "step": 11282 + }, + { + "epoch": 0.89, + "grad_norm": 1.4295970667448212, + "learning_rate": 3.3623496116048594e-07, + "loss": 0.4291, + "step": 11283 + }, + { + "epoch": 0.89, + "grad_norm": 1.5972504000799805, + "learning_rate": 3.357766088401149e-07, + "loss": 0.4549, + "step": 11284 + }, + { + "epoch": 0.89, + "grad_norm": 2.1297620285585066, + "learning_rate": 3.3531855828889517e-07, + "loss": 0.4567, + "step": 11285 + }, + { + "epoch": 0.89, + "grad_norm": 2.0201763946640727, + "learning_rate": 3.3486080953646196e-07, + "loss": 0.4452, + "step": 11286 + }, + { + "epoch": 0.89, + "grad_norm": 1.4356084084620493, + "learning_rate": 3.3440336261243213e-07, + "loss": 0.4678, + "step": 11287 + }, + { + "epoch": 0.89, + "grad_norm": 1.957601814454286, + "learning_rate": 3.3394621754640146e-07, + "loss": 0.4485, + "step": 11288 + }, + { + "epoch": 0.89, + "grad_norm": 2.204641086718789, + "learning_rate": 3.3348937436794683e-07, + "loss": 0.4006, + "step": 11289 + }, + { + "epoch": 0.89, + "grad_norm": 2.063164485718259, + "learning_rate": 3.3303283310662516e-07, + "loss": 0.4201, + "step": 11290 + }, + { + "epoch": 0.89, + "grad_norm": 1.918846002747808, + "learning_rate": 3.325765937919734e-07, + "loss": 0.4485, + "step": 11291 + }, + { + "epoch": 0.89, + "grad_norm": 2.001033479431572, + "learning_rate": 3.321206564535101e-07, + "loss": 0.4627, + "step": 11292 + }, + { + "epoch": 0.89, + "grad_norm": 1.5881907082489715, + "learning_rate": 3.3166502112073507e-07, + "loss": 0.4645, + "step": 11293 + }, + { + "epoch": 0.89, + "grad_norm": 1.463542343862739, + "learning_rate": 3.312096878231247e-07, + "loss": 0.4453, + "step": 11294 + }, + { + "epoch": 0.89, + "grad_norm": 0.5629842868900534, + "learning_rate": 3.307546565901415e-07, + "loss": 0.4811, + "step": 11295 + }, + { + "epoch": 0.89, + "grad_norm": 1.5926697933398304, + "learning_rate": 3.3029992745122307e-07, + "loss": 0.4473, + "step": 11296 + }, + { + "epoch": 0.89, + "grad_norm": 1.5122074229037237, + "learning_rate": 3.2984550043579145e-07, + "loss": 0.4471, + "step": 11297 + }, + { + "epoch": 0.89, + "grad_norm": 1.7315963044808953, + "learning_rate": 3.293913755732453e-07, + "loss": 0.4139, + "step": 11298 + }, + { + "epoch": 0.89, + "grad_norm": 1.6038755203354975, + "learning_rate": 3.2893755289296727e-07, + "loss": 0.4409, + "step": 11299 + }, + { + "epoch": 0.89, + "grad_norm": 0.6167336332645473, + "learning_rate": 3.2848403242432047e-07, + "loss": 0.4734, + "step": 11300 + }, + { + "epoch": 0.89, + "grad_norm": 2.255135510269154, + "learning_rate": 3.2803081419664483e-07, + "loss": 0.4282, + "step": 11301 + }, + { + "epoch": 0.89, + "grad_norm": 0.5328681183211792, + "learning_rate": 3.2757789823926354e-07, + "loss": 0.4575, + "step": 11302 + }, + { + "epoch": 0.89, + "grad_norm": 1.9695160746563396, + "learning_rate": 3.2712528458147984e-07, + "loss": 0.4667, + "step": 11303 + }, + { + "epoch": 0.89, + "grad_norm": 1.5743806879613491, + "learning_rate": 3.2667297325257864e-07, + "loss": 0.4437, + "step": 11304 + }, + { + "epoch": 0.89, + "grad_norm": 3.2580586402700162, + "learning_rate": 3.262209642818215e-07, + "loss": 0.4439, + "step": 11305 + }, + { + "epoch": 0.89, + "grad_norm": 2.411325193736013, + "learning_rate": 3.2576925769845393e-07, + "loss": 0.4498, + "step": 11306 + }, + { + "epoch": 0.89, + "grad_norm": 0.5480874047937919, + "learning_rate": 3.253178535317003e-07, + "loss": 0.4622, + "step": 11307 + }, + { + "epoch": 0.89, + "grad_norm": 3.868174691028942, + "learning_rate": 3.2486675181076786e-07, + "loss": 0.4257, + "step": 11308 + }, + { + "epoch": 0.89, + "grad_norm": 1.8368645604276665, + "learning_rate": 3.244159525648394e-07, + "loss": 0.4395, + "step": 11309 + }, + { + "epoch": 0.89, + "grad_norm": 2.212415453784581, + "learning_rate": 3.23965455823082e-07, + "loss": 0.438, + "step": 11310 + }, + { + "epoch": 0.89, + "grad_norm": 2.5916941555780544, + "learning_rate": 3.235152616146436e-07, + "loss": 0.4045, + "step": 11311 + }, + { + "epoch": 0.89, + "grad_norm": 1.6445125963401992, + "learning_rate": 3.230653699686498e-07, + "loss": 0.4447, + "step": 11312 + }, + { + "epoch": 0.89, + "grad_norm": 7.662999763081008, + "learning_rate": 3.2261578091420886e-07, + "loss": 0.457, + "step": 11313 + }, + { + "epoch": 0.89, + "grad_norm": 2.139957599849015, + "learning_rate": 3.2216649448040704e-07, + "loss": 0.4121, + "step": 11314 + }, + { + "epoch": 0.89, + "grad_norm": 1.384133598054696, + "learning_rate": 3.2171751069631494e-07, + "loss": 0.441, + "step": 11315 + }, + { + "epoch": 0.89, + "grad_norm": 1.7865731249453882, + "learning_rate": 3.212688295909788e-07, + "loss": 0.4112, + "step": 11316 + }, + { + "epoch": 0.89, + "grad_norm": 1.8810960806389665, + "learning_rate": 3.208204511934293e-07, + "loss": 0.4665, + "step": 11317 + }, + { + "epoch": 0.89, + "grad_norm": 1.5744258616121598, + "learning_rate": 3.203723755326754e-07, + "loss": 0.4612, + "step": 11318 + }, + { + "epoch": 0.89, + "grad_norm": 0.5348738960636521, + "learning_rate": 3.1992460263770785e-07, + "loss": 0.4721, + "step": 11319 + }, + { + "epoch": 0.89, + "grad_norm": 1.7017684044963908, + "learning_rate": 3.194771325374951e-07, + "loss": 0.5283, + "step": 11320 + }, + { + "epoch": 0.89, + "grad_norm": 2.2004786195128743, + "learning_rate": 3.1902996526098906e-07, + "loss": 0.4361, + "step": 11321 + }, + { + "epoch": 0.89, + "grad_norm": 2.0301826418746165, + "learning_rate": 3.18583100837121e-07, + "loss": 0.4674, + "step": 11322 + }, + { + "epoch": 0.89, + "grad_norm": 1.8679368170386168, + "learning_rate": 3.1813653929480215e-07, + "loss": 0.4034, + "step": 11323 + }, + { + "epoch": 0.89, + "grad_norm": 1.7842176739470226, + "learning_rate": 3.176902806629256e-07, + "loss": 0.4662, + "step": 11324 + }, + { + "epoch": 0.89, + "grad_norm": 1.8176489055260676, + "learning_rate": 3.172443249703616e-07, + "loss": 0.4688, + "step": 11325 + }, + { + "epoch": 0.89, + "grad_norm": 1.6398410562052315, + "learning_rate": 3.1679867224596426e-07, + "loss": 0.4201, + "step": 11326 + }, + { + "epoch": 0.89, + "grad_norm": 1.7633763032238536, + "learning_rate": 3.163533225185661e-07, + "loss": 0.4405, + "step": 11327 + }, + { + "epoch": 0.89, + "grad_norm": 2.1126162702991818, + "learning_rate": 3.159082758169818e-07, + "loss": 0.505, + "step": 11328 + }, + { + "epoch": 0.89, + "grad_norm": 4.093901737775044, + "learning_rate": 3.1546353217000337e-07, + "loss": 0.4494, + "step": 11329 + }, + { + "epoch": 0.89, + "grad_norm": 1.4453150266778187, + "learning_rate": 3.150190916064072e-07, + "loss": 0.4399, + "step": 11330 + }, + { + "epoch": 0.89, + "grad_norm": 1.901952173445501, + "learning_rate": 3.1457495415494643e-07, + "loss": 0.4039, + "step": 11331 + }, + { + "epoch": 0.89, + "grad_norm": 2.275905669997325, + "learning_rate": 3.1413111984435696e-07, + "loss": 0.4455, + "step": 11332 + }, + { + "epoch": 0.89, + "grad_norm": 2.1209537237929874, + "learning_rate": 3.136875887033541e-07, + "loss": 0.4637, + "step": 11333 + }, + { + "epoch": 0.89, + "grad_norm": 0.5452202945300068, + "learning_rate": 3.132443607606339e-07, + "loss": 0.4814, + "step": 11334 + }, + { + "epoch": 0.89, + "grad_norm": 2.0593482072253075, + "learning_rate": 3.1280143604487333e-07, + "loss": 0.4407, + "step": 11335 + }, + { + "epoch": 0.89, + "grad_norm": 0.5469790320314923, + "learning_rate": 3.123588145847273e-07, + "loss": 0.4508, + "step": 11336 + }, + { + "epoch": 0.89, + "grad_norm": 0.521174002232033, + "learning_rate": 3.1191649640883395e-07, + "loss": 0.4763, + "step": 11337 + }, + { + "epoch": 0.89, + "grad_norm": 1.686434817031486, + "learning_rate": 3.114744815458104e-07, + "loss": 0.4415, + "step": 11338 + }, + { + "epoch": 0.89, + "grad_norm": 2.547245960683708, + "learning_rate": 3.1103277002425494e-07, + "loss": 0.4387, + "step": 11339 + }, + { + "epoch": 0.89, + "grad_norm": 2.005989508669002, + "learning_rate": 3.105913618727452e-07, + "loss": 0.4284, + "step": 11340 + }, + { + "epoch": 0.89, + "grad_norm": 0.5506089563682616, + "learning_rate": 3.101502571198395e-07, + "loss": 0.4575, + "step": 11341 + }, + { + "epoch": 0.89, + "grad_norm": 2.850823503800598, + "learning_rate": 3.097094557940772e-07, + "loss": 0.3911, + "step": 11342 + }, + { + "epoch": 0.89, + "grad_norm": 2.611517155265725, + "learning_rate": 3.0926895792397773e-07, + "loss": 0.4857, + "step": 11343 + }, + { + "epoch": 0.89, + "grad_norm": 1.7819940264159992, + "learning_rate": 3.0882876353803994e-07, + "loss": 0.4477, + "step": 11344 + }, + { + "epoch": 0.89, + "grad_norm": 7.459706024797533, + "learning_rate": 3.0838887266474384e-07, + "loss": 0.4564, + "step": 11345 + }, + { + "epoch": 0.89, + "grad_norm": 1.9857827275160052, + "learning_rate": 3.0794928533255165e-07, + "loss": 0.4715, + "step": 11346 + }, + { + "epoch": 0.89, + "grad_norm": 1.852616245136004, + "learning_rate": 3.0751000156990175e-07, + "loss": 0.4654, + "step": 11347 + }, + { + "epoch": 0.89, + "grad_norm": 1.7587980624301263, + "learning_rate": 3.070710214052169e-07, + "loss": 0.434, + "step": 11348 + }, + { + "epoch": 0.89, + "grad_norm": 1.5713148745825898, + "learning_rate": 3.0663234486689674e-07, + "loss": 0.4282, + "step": 11349 + }, + { + "epoch": 0.89, + "grad_norm": 0.5889383695196914, + "learning_rate": 3.0619397198332565e-07, + "loss": 0.4835, + "step": 11350 + }, + { + "epoch": 0.89, + "grad_norm": 2.380683309233936, + "learning_rate": 3.057559027828633e-07, + "loss": 0.4882, + "step": 11351 + }, + { + "epoch": 0.89, + "grad_norm": 1.4736501459113696, + "learning_rate": 3.0531813729385294e-07, + "loss": 0.4491, + "step": 11352 + }, + { + "epoch": 0.89, + "grad_norm": 1.5551133746410761, + "learning_rate": 3.048806755446182e-07, + "loss": 0.4645, + "step": 11353 + }, + { + "epoch": 0.89, + "grad_norm": 1.8032008231829162, + "learning_rate": 3.0444351756346245e-07, + "loss": 0.4882, + "step": 11354 + }, + { + "epoch": 0.89, + "grad_norm": 3.6726290999534124, + "learning_rate": 3.0400666337866756e-07, + "loss": 0.4682, + "step": 11355 + }, + { + "epoch": 0.89, + "grad_norm": 2.491519928158693, + "learning_rate": 3.035701130184987e-07, + "loss": 0.4576, + "step": 11356 + }, + { + "epoch": 0.89, + "grad_norm": 2.344926261831781, + "learning_rate": 3.0313386651119935e-07, + "loss": 0.4824, + "step": 11357 + }, + { + "epoch": 0.89, + "grad_norm": 0.5741899869775209, + "learning_rate": 3.026979238849953e-07, + "loss": 0.4982, + "step": 11358 + }, + { + "epoch": 0.89, + "grad_norm": 1.9130461305494726, + "learning_rate": 3.0226228516809066e-07, + "loss": 0.4759, + "step": 11359 + }, + { + "epoch": 0.89, + "grad_norm": 1.3645009846826872, + "learning_rate": 3.018269503886706e-07, + "loss": 0.3737, + "step": 11360 + }, + { + "epoch": 0.89, + "grad_norm": 1.8493573749084138, + "learning_rate": 3.013919195749004e-07, + "loss": 0.4389, + "step": 11361 + }, + { + "epoch": 0.89, + "grad_norm": 2.4710799692358103, + "learning_rate": 3.0095719275492706e-07, + "loss": 0.5134, + "step": 11362 + }, + { + "epoch": 0.89, + "grad_norm": 1.99393386916813, + "learning_rate": 3.0052276995687637e-07, + "loss": 0.4512, + "step": 11363 + }, + { + "epoch": 0.89, + "grad_norm": 1.9106719714730473, + "learning_rate": 3.0008865120885356e-07, + "loss": 0.415, + "step": 11364 + }, + { + "epoch": 0.89, + "grad_norm": 2.705639712369887, + "learning_rate": 2.9965483653894846e-07, + "loss": 0.4809, + "step": 11365 + }, + { + "epoch": 0.89, + "grad_norm": 1.448323036169266, + "learning_rate": 2.992213259752258e-07, + "loss": 0.4438, + "step": 11366 + }, + { + "epoch": 0.89, + "grad_norm": 3.8341714831777236, + "learning_rate": 2.9878811954573374e-07, + "loss": 0.4113, + "step": 11367 + }, + { + "epoch": 0.89, + "grad_norm": 1.563245717435171, + "learning_rate": 2.9835521727850035e-07, + "loss": 0.4082, + "step": 11368 + }, + { + "epoch": 0.89, + "grad_norm": 1.8357519257263144, + "learning_rate": 2.979226192015339e-07, + "loss": 0.4238, + "step": 11369 + }, + { + "epoch": 0.89, + "grad_norm": 1.5867637843906715, + "learning_rate": 2.9749032534282405e-07, + "loss": 0.4019, + "step": 11370 + }, + { + "epoch": 0.89, + "grad_norm": 2.320149599026252, + "learning_rate": 2.9705833573033747e-07, + "loss": 0.4417, + "step": 11371 + }, + { + "epoch": 0.89, + "grad_norm": 0.5305416131784837, + "learning_rate": 2.96626650392024e-07, + "loss": 0.4602, + "step": 11372 + }, + { + "epoch": 0.89, + "grad_norm": 0.5310197285914908, + "learning_rate": 2.96195269355814e-07, + "loss": 0.4519, + "step": 11373 + }, + { + "epoch": 0.89, + "grad_norm": 2.206453211844582, + "learning_rate": 2.9576419264961684e-07, + "loss": 0.4195, + "step": 11374 + }, + { + "epoch": 0.89, + "grad_norm": 1.627075121936036, + "learning_rate": 2.953334203013225e-07, + "loss": 0.4471, + "step": 11375 + }, + { + "epoch": 0.89, + "grad_norm": 2.0570038282300405, + "learning_rate": 2.9490295233880083e-07, + "loss": 0.4401, + "step": 11376 + }, + { + "epoch": 0.89, + "grad_norm": 1.9653415679385258, + "learning_rate": 2.944727887899035e-07, + "loss": 0.4807, + "step": 11377 + }, + { + "epoch": 0.89, + "grad_norm": 0.5174323364634673, + "learning_rate": 2.9404292968246095e-07, + "loss": 0.47, + "step": 11378 + }, + { + "epoch": 0.89, + "grad_norm": 3.847390566248935, + "learning_rate": 2.9361337504428543e-07, + "loss": 0.5137, + "step": 11379 + }, + { + "epoch": 0.89, + "grad_norm": 1.6870357897874173, + "learning_rate": 2.9318412490316636e-07, + "loss": 0.4172, + "step": 11380 + }, + { + "epoch": 0.89, + "grad_norm": 1.6031952371289342, + "learning_rate": 2.927551792868782e-07, + "loss": 0.4591, + "step": 11381 + }, + { + "epoch": 0.89, + "grad_norm": 0.5360352078561933, + "learning_rate": 2.923265382231721e-07, + "loss": 0.4664, + "step": 11382 + }, + { + "epoch": 0.89, + "grad_norm": 1.888519235564012, + "learning_rate": 2.918982017397809e-07, + "loss": 0.3777, + "step": 11383 + }, + { + "epoch": 0.89, + "grad_norm": 2.22511278269066, + "learning_rate": 2.914701698644157e-07, + "loss": 0.4627, + "step": 11384 + }, + { + "epoch": 0.89, + "grad_norm": 2.7058166092141644, + "learning_rate": 2.910424426247721e-07, + "loss": 0.4003, + "step": 11385 + }, + { + "epoch": 0.89, + "grad_norm": 2.1476447299455805, + "learning_rate": 2.9061502004852194e-07, + "loss": 0.4712, + "step": 11386 + }, + { + "epoch": 0.89, + "grad_norm": 0.548036668656327, + "learning_rate": 2.9018790216331917e-07, + "loss": 0.4813, + "step": 11387 + }, + { + "epoch": 0.89, + "grad_norm": 2.0088872058147182, + "learning_rate": 2.897610889967983e-07, + "loss": 0.4794, + "step": 11388 + }, + { + "epoch": 0.89, + "grad_norm": 1.820277018405758, + "learning_rate": 2.8933458057657295e-07, + "loss": 0.3888, + "step": 11389 + }, + { + "epoch": 0.89, + "grad_norm": 1.7648849737627514, + "learning_rate": 2.8890837693023863e-07, + "loss": 0.4696, + "step": 11390 + }, + { + "epoch": 0.89, + "grad_norm": 1.6697290973578673, + "learning_rate": 2.8848247808536847e-07, + "loss": 0.4708, + "step": 11391 + }, + { + "epoch": 0.89, + "grad_norm": 1.7322855933915686, + "learning_rate": 2.8805688406951914e-07, + "loss": 0.4902, + "step": 11392 + }, + { + "epoch": 0.89, + "grad_norm": 1.6661093342394897, + "learning_rate": 2.8763159491022484e-07, + "loss": 0.4569, + "step": 11393 + }, + { + "epoch": 0.89, + "grad_norm": 0.550928315296681, + "learning_rate": 2.87206610635003e-07, + "loss": 0.4722, + "step": 11394 + }, + { + "epoch": 0.89, + "grad_norm": 1.6933594117270645, + "learning_rate": 2.867819312713471e-07, + "loss": 0.4697, + "step": 11395 + }, + { + "epoch": 0.89, + "grad_norm": 1.683783729663718, + "learning_rate": 2.863575568467353e-07, + "loss": 0.454, + "step": 11396 + }, + { + "epoch": 0.9, + "grad_norm": 1.631256911272982, + "learning_rate": 2.859334873886227e-07, + "loss": 0.4355, + "step": 11397 + }, + { + "epoch": 0.9, + "grad_norm": 1.62446836542238, + "learning_rate": 2.8550972292444744e-07, + "loss": 0.4442, + "step": 11398 + }, + { + "epoch": 0.9, + "grad_norm": 1.7297073422909943, + "learning_rate": 2.850862634816248e-07, + "loss": 0.3916, + "step": 11399 + }, + { + "epoch": 0.9, + "grad_norm": 1.6498571926319299, + "learning_rate": 2.8466310908755347e-07, + "loss": 0.4212, + "step": 11400 + }, + { + "epoch": 0.9, + "grad_norm": 2.2062971890786613, + "learning_rate": 2.842402597696109e-07, + "loss": 0.453, + "step": 11401 + }, + { + "epoch": 0.9, + "grad_norm": 1.9833679696466364, + "learning_rate": 2.8381771555515416e-07, + "loss": 0.5178, + "step": 11402 + }, + { + "epoch": 0.9, + "grad_norm": 1.458659727497805, + "learning_rate": 2.8339547647152186e-07, + "loss": 0.4372, + "step": 11403 + }, + { + "epoch": 0.9, + "grad_norm": 1.6669920176993276, + "learning_rate": 2.8297354254603226e-07, + "loss": 0.3598, + "step": 11404 + }, + { + "epoch": 0.9, + "grad_norm": 1.6152094735551394, + "learning_rate": 2.82551913805984e-07, + "loss": 0.4501, + "step": 11405 + }, + { + "epoch": 0.9, + "grad_norm": 1.8700175843001834, + "learning_rate": 2.821305902786553e-07, + "loss": 0.4593, + "step": 11406 + }, + { + "epoch": 0.9, + "grad_norm": 2.1110618775428094, + "learning_rate": 2.8170957199130545e-07, + "loss": 0.4096, + "step": 11407 + }, + { + "epoch": 0.9, + "grad_norm": 1.576036545255131, + "learning_rate": 2.812888589711743e-07, + "loss": 0.4446, + "step": 11408 + }, + { + "epoch": 0.9, + "grad_norm": 1.4702657997579975, + "learning_rate": 2.808684512454818e-07, + "loss": 0.424, + "step": 11409 + }, + { + "epoch": 0.9, + "grad_norm": 2.212978056980427, + "learning_rate": 2.804483488414261e-07, + "loss": 0.4408, + "step": 11410 + }, + { + "epoch": 0.9, + "grad_norm": 0.5835566426546058, + "learning_rate": 2.800285517861884e-07, + "loss": 0.4556, + "step": 11411 + }, + { + "epoch": 0.9, + "grad_norm": 3.628544965715317, + "learning_rate": 2.79609060106929e-07, + "loss": 0.4361, + "step": 11412 + }, + { + "epoch": 0.9, + "grad_norm": 1.8084291332964564, + "learning_rate": 2.7918987383078857e-07, + "loss": 0.4823, + "step": 11413 + }, + { + "epoch": 0.9, + "grad_norm": 0.5310464514391604, + "learning_rate": 2.787709929848881e-07, + "loss": 0.4619, + "step": 11414 + }, + { + "epoch": 0.9, + "grad_norm": 2.3998772101075545, + "learning_rate": 2.78352417596327e-07, + "loss": 0.4576, + "step": 11415 + }, + { + "epoch": 0.9, + "grad_norm": 2.1718037059775135, + "learning_rate": 2.779341476921887e-07, + "loss": 0.4376, + "step": 11416 + }, + { + "epoch": 0.9, + "grad_norm": 0.5205833530637872, + "learning_rate": 2.7751618329953376e-07, + "loss": 0.4961, + "step": 11417 + }, + { + "epoch": 0.9, + "grad_norm": 5.303675392030558, + "learning_rate": 2.770985244454044e-07, + "loss": 0.4473, + "step": 11418 + }, + { + "epoch": 0.9, + "grad_norm": 0.542182332825767, + "learning_rate": 2.766811711568207e-07, + "loss": 0.4652, + "step": 11419 + }, + { + "epoch": 0.9, + "grad_norm": 5.325617690032522, + "learning_rate": 2.7626412346078824e-07, + "loss": 0.424, + "step": 11420 + }, + { + "epoch": 0.9, + "grad_norm": 1.70053174705483, + "learning_rate": 2.7584738138428604e-07, + "loss": 0.4394, + "step": 11421 + }, + { + "epoch": 0.9, + "grad_norm": 2.0024030876864947, + "learning_rate": 2.7543094495427913e-07, + "loss": 0.4314, + "step": 11422 + }, + { + "epoch": 0.9, + "grad_norm": 1.6713767100690384, + "learning_rate": 2.7501481419770884e-07, + "loss": 0.4287, + "step": 11423 + }, + { + "epoch": 0.9, + "grad_norm": 1.766189884612762, + "learning_rate": 2.7459898914149963e-07, + "loss": 0.4385, + "step": 11424 + }, + { + "epoch": 0.9, + "grad_norm": 1.7557920078286193, + "learning_rate": 2.741834698125545e-07, + "loss": 0.4423, + "step": 11425 + }, + { + "epoch": 0.9, + "grad_norm": 0.5439389285858546, + "learning_rate": 2.737682562377564e-07, + "loss": 0.4793, + "step": 11426 + }, + { + "epoch": 0.9, + "grad_norm": 0.5388612115035586, + "learning_rate": 2.733533484439688e-07, + "loss": 0.462, + "step": 11427 + }, + { + "epoch": 0.9, + "grad_norm": 0.5729533662673044, + "learning_rate": 2.729387464580369e-07, + "loss": 0.475, + "step": 11428 + }, + { + "epoch": 0.9, + "grad_norm": 1.6717167305903013, + "learning_rate": 2.7252445030678475e-07, + "loss": 0.4029, + "step": 11429 + }, + { + "epoch": 0.9, + "grad_norm": 2.357655547104963, + "learning_rate": 2.7211046001701604e-07, + "loss": 0.4295, + "step": 11430 + }, + { + "epoch": 0.9, + "grad_norm": 1.6012259188828346, + "learning_rate": 2.716967756155153e-07, + "loss": 0.457, + "step": 11431 + }, + { + "epoch": 0.9, + "grad_norm": 1.6790183752889436, + "learning_rate": 2.712833971290479e-07, + "loss": 0.4483, + "step": 11432 + }, + { + "epoch": 0.9, + "grad_norm": 1.9276594267081877, + "learning_rate": 2.7087032458435914e-07, + "loss": 0.4404, + "step": 11433 + }, + { + "epoch": 0.9, + "grad_norm": 2.6764322425125178, + "learning_rate": 2.7045755800817364e-07, + "loss": 0.4475, + "step": 11434 + }, + { + "epoch": 0.9, + "grad_norm": 2.2012675353302806, + "learning_rate": 2.700450974271962e-07, + "loss": 0.4434, + "step": 11435 + }, + { + "epoch": 0.9, + "grad_norm": 2.2039079946270856, + "learning_rate": 2.696329428681149e-07, + "loss": 0.44, + "step": 11436 + }, + { + "epoch": 0.9, + "grad_norm": 0.5352792845476458, + "learning_rate": 2.6922109435759346e-07, + "loss": 0.4878, + "step": 11437 + }, + { + "epoch": 0.9, + "grad_norm": 1.687443549535484, + "learning_rate": 2.6880955192227885e-07, + "loss": 0.4485, + "step": 11438 + }, + { + "epoch": 0.9, + "grad_norm": 1.4417073023430471, + "learning_rate": 2.683983155887965e-07, + "loss": 0.4543, + "step": 11439 + }, + { + "epoch": 0.9, + "grad_norm": 1.4906272079818452, + "learning_rate": 2.6798738538375447e-07, + "loss": 0.437, + "step": 11440 + }, + { + "epoch": 0.9, + "grad_norm": 2.243457288432882, + "learning_rate": 2.675767613337371e-07, + "loss": 0.4233, + "step": 11441 + }, + { + "epoch": 0.9, + "grad_norm": 1.604024146459571, + "learning_rate": 2.671664434653132e-07, + "loss": 0.4342, + "step": 11442 + }, + { + "epoch": 0.9, + "grad_norm": 2.4625304856640504, + "learning_rate": 2.6675643180502865e-07, + "loss": 0.4449, + "step": 11443 + }, + { + "epoch": 0.9, + "grad_norm": 1.662440333718254, + "learning_rate": 2.663467263794123e-07, + "loss": 0.474, + "step": 11444 + }, + { + "epoch": 0.9, + "grad_norm": 1.8502671932455674, + "learning_rate": 2.65937327214969e-07, + "loss": 0.3918, + "step": 11445 + }, + { + "epoch": 0.9, + "grad_norm": 1.7680351458876735, + "learning_rate": 2.655282343381882e-07, + "loss": 0.4038, + "step": 11446 + }, + { + "epoch": 0.9, + "grad_norm": 2.1142607570033145, + "learning_rate": 2.65119447775537e-07, + "loss": 0.4809, + "step": 11447 + }, + { + "epoch": 0.9, + "grad_norm": 1.6581699666275387, + "learning_rate": 2.6471096755346314e-07, + "loss": 0.4627, + "step": 11448 + }, + { + "epoch": 0.9, + "grad_norm": 2.950991543973786, + "learning_rate": 2.6430279369839664e-07, + "loss": 0.4373, + "step": 11449 + }, + { + "epoch": 0.9, + "grad_norm": 0.5742220322790039, + "learning_rate": 2.638949262367424e-07, + "loss": 0.474, + "step": 11450 + }, + { + "epoch": 0.9, + "grad_norm": 2.1846163652528445, + "learning_rate": 2.634873651948922e-07, + "loss": 0.4476, + "step": 11451 + }, + { + "epoch": 0.9, + "grad_norm": 1.959303904278573, + "learning_rate": 2.6308011059921257e-07, + "loss": 0.4306, + "step": 11452 + }, + { + "epoch": 0.9, + "grad_norm": 2.045963755850218, + "learning_rate": 2.626731624760542e-07, + "loss": 0.4186, + "step": 11453 + }, + { + "epoch": 0.9, + "grad_norm": 1.6883988659983487, + "learning_rate": 2.622665208517433e-07, + "loss": 0.4469, + "step": 11454 + }, + { + "epoch": 0.9, + "grad_norm": 2.1911554097917083, + "learning_rate": 2.618601857525921e-07, + "loss": 0.4113, + "step": 11455 + }, + { + "epoch": 0.9, + "grad_norm": 1.7119376681944634, + "learning_rate": 2.614541572048884e-07, + "loss": 0.4641, + "step": 11456 + }, + { + "epoch": 0.9, + "grad_norm": 1.7382387058097901, + "learning_rate": 2.610484352349013e-07, + "loss": 0.4578, + "step": 11457 + }, + { + "epoch": 0.9, + "grad_norm": 1.842338592108334, + "learning_rate": 2.606430198688814e-07, + "loss": 0.4156, + "step": 11458 + }, + { + "epoch": 0.9, + "grad_norm": 2.6047270523266723, + "learning_rate": 2.602379111330583e-07, + "loss": 0.4578, + "step": 11459 + }, + { + "epoch": 0.9, + "grad_norm": 1.5707814143339303, + "learning_rate": 2.5983310905364267e-07, + "loss": 0.4273, + "step": 11460 + }, + { + "epoch": 0.9, + "grad_norm": 2.014504903027408, + "learning_rate": 2.594286136568236e-07, + "loss": 0.3978, + "step": 11461 + }, + { + "epoch": 0.9, + "grad_norm": 2.1328381876816347, + "learning_rate": 2.5902442496877123e-07, + "loss": 0.4932, + "step": 11462 + }, + { + "epoch": 0.9, + "grad_norm": 1.433364925832676, + "learning_rate": 2.586205430156369e-07, + "loss": 0.43, + "step": 11463 + }, + { + "epoch": 0.9, + "grad_norm": 2.1491250236619726, + "learning_rate": 2.5821696782355134e-07, + "loss": 0.4287, + "step": 11464 + }, + { + "epoch": 0.9, + "grad_norm": 1.9174821088621523, + "learning_rate": 2.578136994186242e-07, + "loss": 0.4623, + "step": 11465 + }, + { + "epoch": 0.9, + "grad_norm": 1.9713032113604674, + "learning_rate": 2.5741073782694747e-07, + "loss": 0.5045, + "step": 11466 + }, + { + "epoch": 0.9, + "grad_norm": 0.5722450411662502, + "learning_rate": 2.570080830745919e-07, + "loss": 0.4682, + "step": 11467 + }, + { + "epoch": 0.9, + "grad_norm": 5.043043298819018, + "learning_rate": 2.5660573518760833e-07, + "loss": 0.4239, + "step": 11468 + }, + { + "epoch": 0.9, + "grad_norm": 2.4525646592560792, + "learning_rate": 2.562036941920298e-07, + "loss": 0.461, + "step": 11469 + }, + { + "epoch": 0.9, + "grad_norm": 2.5506322590572963, + "learning_rate": 2.5580196011386507e-07, + "loss": 0.4796, + "step": 11470 + }, + { + "epoch": 0.9, + "grad_norm": 0.5296074489700937, + "learning_rate": 2.554005329791087e-07, + "loss": 0.4776, + "step": 11471 + }, + { + "epoch": 0.9, + "grad_norm": 1.62477518037893, + "learning_rate": 2.549994128137301e-07, + "loss": 0.4655, + "step": 11472 + }, + { + "epoch": 0.9, + "grad_norm": 0.5882824398765624, + "learning_rate": 2.5459859964368227e-07, + "loss": 0.4723, + "step": 11473 + }, + { + "epoch": 0.9, + "grad_norm": 2.2125354963534436, + "learning_rate": 2.541980934948979e-07, + "loss": 0.4498, + "step": 11474 + }, + { + "epoch": 0.9, + "grad_norm": 2.5654964210877123, + "learning_rate": 2.5379789439328893e-07, + "loss": 0.4927, + "step": 11475 + }, + { + "epoch": 0.9, + "grad_norm": 1.70510662301389, + "learning_rate": 2.5339800236474645e-07, + "loss": 0.4355, + "step": 11476 + }, + { + "epoch": 0.9, + "grad_norm": 2.6139649381691386, + "learning_rate": 2.529984174351441e-07, + "loss": 0.4594, + "step": 11477 + }, + { + "epoch": 0.9, + "grad_norm": 1.7511701146777976, + "learning_rate": 2.52599139630334e-07, + "loss": 0.4863, + "step": 11478 + }, + { + "epoch": 0.9, + "grad_norm": 1.7766756188876547, + "learning_rate": 2.5220016897614943e-07, + "loss": 0.468, + "step": 11479 + }, + { + "epoch": 0.9, + "grad_norm": 1.4947317481855442, + "learning_rate": 2.518015054984041e-07, + "loss": 0.4571, + "step": 11480 + }, + { + "epoch": 0.9, + "grad_norm": 1.987195885614004, + "learning_rate": 2.5140314922288854e-07, + "loss": 0.4745, + "step": 11481 + }, + { + "epoch": 0.9, + "grad_norm": 1.9337409032654684, + "learning_rate": 2.5100510017537773e-07, + "loss": 0.4362, + "step": 11482 + }, + { + "epoch": 0.9, + "grad_norm": 2.8481453198409, + "learning_rate": 2.5060735838162433e-07, + "loss": 0.4275, + "step": 11483 + }, + { + "epoch": 0.9, + "grad_norm": 1.9586256030002387, + "learning_rate": 2.5020992386736274e-07, + "loss": 0.4326, + "step": 11484 + }, + { + "epoch": 0.9, + "grad_norm": 1.300869195613282, + "learning_rate": 2.498127966583042e-07, + "loss": 0.4227, + "step": 11485 + }, + { + "epoch": 0.9, + "grad_norm": 0.6074281092426282, + "learning_rate": 2.4941597678014464e-07, + "loss": 0.4693, + "step": 11486 + }, + { + "epoch": 0.9, + "grad_norm": 1.4496022570779206, + "learning_rate": 2.490194642585564e-07, + "loss": 0.4339, + "step": 11487 + }, + { + "epoch": 0.9, + "grad_norm": 2.0571224216844652, + "learning_rate": 2.486232591191945e-07, + "loss": 0.4335, + "step": 11488 + }, + { + "epoch": 0.9, + "grad_norm": 2.359389310872638, + "learning_rate": 2.4822736138769064e-07, + "loss": 0.4979, + "step": 11489 + }, + { + "epoch": 0.9, + "grad_norm": 1.8411424742529219, + "learning_rate": 2.4783177108966107e-07, + "loss": 0.4602, + "step": 11490 + }, + { + "epoch": 0.9, + "grad_norm": 0.5397894798745554, + "learning_rate": 2.474364882507002e-07, + "loss": 0.4778, + "step": 11491 + }, + { + "epoch": 0.9, + "grad_norm": 1.7948989805836306, + "learning_rate": 2.4704151289638045e-07, + "loss": 0.4143, + "step": 11492 + }, + { + "epoch": 0.9, + "grad_norm": 1.9146951042539309, + "learning_rate": 2.466468450522574e-07, + "loss": 0.4293, + "step": 11493 + }, + { + "epoch": 0.9, + "grad_norm": 1.5695218852774204, + "learning_rate": 2.462524847438652e-07, + "loss": 0.3819, + "step": 11494 + }, + { + "epoch": 0.9, + "grad_norm": 2.000545115941838, + "learning_rate": 2.458584319967194e-07, + "loss": 0.4175, + "step": 11495 + }, + { + "epoch": 0.9, + "grad_norm": 1.574829380102967, + "learning_rate": 2.454646868363131e-07, + "loss": 0.4241, + "step": 11496 + }, + { + "epoch": 0.9, + "grad_norm": 1.6918417886550867, + "learning_rate": 2.4507124928812184e-07, + "loss": 0.4355, + "step": 11497 + }, + { + "epoch": 0.9, + "grad_norm": 1.9282118276250628, + "learning_rate": 2.4467811937760046e-07, + "loss": 0.3848, + "step": 11498 + }, + { + "epoch": 0.9, + "grad_norm": 4.751103273554088, + "learning_rate": 2.442852971301846e-07, + "loss": 0.4671, + "step": 11499 + }, + { + "epoch": 0.9, + "grad_norm": 16.979599586875334, + "learning_rate": 2.438927825712878e-07, + "loss": 0.4465, + "step": 11500 + }, + { + "epoch": 0.9, + "grad_norm": 1.6169477325319421, + "learning_rate": 2.4350057572630703e-07, + "loss": 0.4561, + "step": 11501 + }, + { + "epoch": 0.9, + "grad_norm": 1.5866350311536457, + "learning_rate": 2.431086766206159e-07, + "loss": 0.457, + "step": 11502 + }, + { + "epoch": 0.9, + "grad_norm": 1.527767351703748, + "learning_rate": 2.427170852795713e-07, + "loss": 0.3784, + "step": 11503 + }, + { + "epoch": 0.9, + "grad_norm": 2.884216163301964, + "learning_rate": 2.4232580172850793e-07, + "loss": 0.427, + "step": 11504 + }, + { + "epoch": 0.9, + "grad_norm": 0.5579895948039629, + "learning_rate": 2.4193482599274064e-07, + "loss": 0.4745, + "step": 11505 + }, + { + "epoch": 0.9, + "grad_norm": 2.8752488804287286, + "learning_rate": 2.4154415809756695e-07, + "loss": 0.3918, + "step": 11506 + }, + { + "epoch": 0.9, + "grad_norm": 0.5353563110752431, + "learning_rate": 2.4115379806826045e-07, + "loss": 0.4613, + "step": 11507 + }, + { + "epoch": 0.9, + "grad_norm": 1.7773073809189721, + "learning_rate": 2.4076374593007825e-07, + "loss": 0.4457, + "step": 11508 + }, + { + "epoch": 0.9, + "grad_norm": 1.6634825126636108, + "learning_rate": 2.4037400170825563e-07, + "loss": 0.4331, + "step": 11509 + }, + { + "epoch": 0.9, + "grad_norm": 2.2898802316211877, + "learning_rate": 2.399845654280092e-07, + "loss": 0.4896, + "step": 11510 + }, + { + "epoch": 0.9, + "grad_norm": 1.9025798992848089, + "learning_rate": 2.395954371145343e-07, + "loss": 0.4425, + "step": 11511 + }, + { + "epoch": 0.9, + "grad_norm": 0.5200549513959155, + "learning_rate": 2.3920661679300685e-07, + "loss": 0.4664, + "step": 11512 + }, + { + "epoch": 0.9, + "grad_norm": 0.5994037983035863, + "learning_rate": 2.3881810448858345e-07, + "loss": 0.4922, + "step": 11513 + }, + { + "epoch": 0.9, + "grad_norm": 1.627258752537254, + "learning_rate": 2.384299002264001e-07, + "loss": 0.4551, + "step": 11514 + }, + { + "epoch": 0.9, + "grad_norm": 3.4129050262860194, + "learning_rate": 2.3804200403157453e-07, + "loss": 0.4703, + "step": 11515 + }, + { + "epoch": 0.9, + "grad_norm": 2.5770364455989156, + "learning_rate": 2.37654415929201e-07, + "loss": 0.4302, + "step": 11516 + }, + { + "epoch": 0.9, + "grad_norm": 1.6457749771755348, + "learning_rate": 2.3726713594435623e-07, + "loss": 0.4548, + "step": 11517 + }, + { + "epoch": 0.9, + "grad_norm": 1.9965769024376148, + "learning_rate": 2.368801641020979e-07, + "loss": 0.4346, + "step": 11518 + }, + { + "epoch": 0.9, + "grad_norm": 2.170950569971861, + "learning_rate": 2.3649350042746265e-07, + "loss": 0.4839, + "step": 11519 + }, + { + "epoch": 0.9, + "grad_norm": 1.902604732530212, + "learning_rate": 2.3610714494546493e-07, + "loss": 0.4475, + "step": 11520 + }, + { + "epoch": 0.9, + "grad_norm": 1.5778077685182375, + "learning_rate": 2.3572109768110473e-07, + "loss": 0.4576, + "step": 11521 + }, + { + "epoch": 0.9, + "grad_norm": 0.5562866815272, + "learning_rate": 2.3533535865935597e-07, + "loss": 0.4886, + "step": 11522 + }, + { + "epoch": 0.9, + "grad_norm": 1.6254660906201128, + "learning_rate": 2.3494992790517757e-07, + "loss": 0.3944, + "step": 11523 + }, + { + "epoch": 0.91, + "grad_norm": 1.5323006559371795, + "learning_rate": 2.3456480544350345e-07, + "loss": 0.3926, + "step": 11524 + }, + { + "epoch": 0.91, + "grad_norm": 2.3175342823067306, + "learning_rate": 2.3417999129925374e-07, + "loss": 0.4315, + "step": 11525 + }, + { + "epoch": 0.91, + "grad_norm": 1.6252654989555713, + "learning_rate": 2.3379548549732456e-07, + "loss": 0.4237, + "step": 11526 + }, + { + "epoch": 0.91, + "grad_norm": 1.8537611675486931, + "learning_rate": 2.3341128806259162e-07, + "loss": 0.4407, + "step": 11527 + }, + { + "epoch": 0.91, + "grad_norm": 1.4698905978931054, + "learning_rate": 2.3302739901991277e-07, + "loss": 0.4562, + "step": 11528 + }, + { + "epoch": 0.91, + "grad_norm": 1.6425284597645118, + "learning_rate": 2.326438183941254e-07, + "loss": 0.4107, + "step": 11529 + }, + { + "epoch": 0.91, + "grad_norm": 1.5007639776151216, + "learning_rate": 2.3226054621004745e-07, + "loss": 0.464, + "step": 11530 + }, + { + "epoch": 0.91, + "grad_norm": 2.056905218586717, + "learning_rate": 2.3187758249247406e-07, + "loss": 0.4718, + "step": 11531 + }, + { + "epoch": 0.91, + "grad_norm": 1.5152957575333061, + "learning_rate": 2.3149492726618373e-07, + "loss": 0.4407, + "step": 11532 + }, + { + "epoch": 0.91, + "grad_norm": 0.5526221538305488, + "learning_rate": 2.3111258055593387e-07, + "loss": 0.4593, + "step": 11533 + }, + { + "epoch": 0.91, + "grad_norm": 1.8977832804005321, + "learning_rate": 2.3073054238646197e-07, + "loss": 0.4654, + "step": 11534 + }, + { + "epoch": 0.91, + "grad_norm": 4.3154200589164695, + "learning_rate": 2.3034881278248432e-07, + "loss": 0.4335, + "step": 11535 + }, + { + "epoch": 0.91, + "grad_norm": 1.7397881182992028, + "learning_rate": 2.2996739176869841e-07, + "loss": 0.405, + "step": 11536 + }, + { + "epoch": 0.91, + "grad_norm": 1.9948088429449033, + "learning_rate": 2.2958627936978394e-07, + "loss": 0.4391, + "step": 11537 + }, + { + "epoch": 0.91, + "grad_norm": 1.5491535799936376, + "learning_rate": 2.2920547561039563e-07, + "loss": 0.5129, + "step": 11538 + }, + { + "epoch": 0.91, + "grad_norm": 1.8423445602002524, + "learning_rate": 2.2882498051517266e-07, + "loss": 0.4742, + "step": 11539 + }, + { + "epoch": 0.91, + "grad_norm": 1.456786406085758, + "learning_rate": 2.284447941087309e-07, + "loss": 0.4763, + "step": 11540 + }, + { + "epoch": 0.91, + "grad_norm": 1.5865854909279886, + "learning_rate": 2.2806491641567007e-07, + "loss": 0.4411, + "step": 11541 + }, + { + "epoch": 0.91, + "grad_norm": 2.714004461077023, + "learning_rate": 2.2768534746056615e-07, + "loss": 0.4664, + "step": 11542 + }, + { + "epoch": 0.91, + "grad_norm": 1.8980090924975184, + "learning_rate": 2.273060872679772e-07, + "loss": 0.4344, + "step": 11543 + }, + { + "epoch": 0.91, + "grad_norm": 1.7582908799952768, + "learning_rate": 2.2692713586244086e-07, + "loss": 0.448, + "step": 11544 + }, + { + "epoch": 0.91, + "grad_norm": 1.7687432306654483, + "learning_rate": 2.2654849326847584e-07, + "loss": 0.4918, + "step": 11545 + }, + { + "epoch": 0.91, + "grad_norm": 2.365381174851048, + "learning_rate": 2.2617015951057808e-07, + "loss": 0.4205, + "step": 11546 + }, + { + "epoch": 0.91, + "grad_norm": 2.1308940718252325, + "learning_rate": 2.257921346132258e-07, + "loss": 0.45, + "step": 11547 + }, + { + "epoch": 0.91, + "grad_norm": 2.1258958104859493, + "learning_rate": 2.254144186008772e-07, + "loss": 0.444, + "step": 11548 + }, + { + "epoch": 0.91, + "grad_norm": 2.0536017186212545, + "learning_rate": 2.2503701149796942e-07, + "loss": 0.4369, + "step": 11549 + }, + { + "epoch": 0.91, + "grad_norm": 1.987336556551796, + "learning_rate": 2.246599133289218e-07, + "loss": 0.451, + "step": 11550 + }, + { + "epoch": 0.91, + "grad_norm": 0.534520600854061, + "learning_rate": 2.242831241181298e-07, + "loss": 0.4563, + "step": 11551 + }, + { + "epoch": 0.91, + "grad_norm": 1.8006901115048144, + "learning_rate": 2.239066438899723e-07, + "loss": 0.4474, + "step": 11552 + }, + { + "epoch": 0.91, + "grad_norm": 0.5611517247848092, + "learning_rate": 2.23530472668807e-07, + "loss": 0.4742, + "step": 11553 + }, + { + "epoch": 0.91, + "grad_norm": 2.43701236917575, + "learning_rate": 2.2315461047897225e-07, + "loss": 0.3991, + "step": 11554 + }, + { + "epoch": 0.91, + "grad_norm": 2.3178863987738128, + "learning_rate": 2.2277905734478466e-07, + "loss": 0.4664, + "step": 11555 + }, + { + "epoch": 0.91, + "grad_norm": 1.8549630883861659, + "learning_rate": 2.2240381329054317e-07, + "loss": 0.4583, + "step": 11556 + }, + { + "epoch": 0.91, + "grad_norm": 1.7553398528483806, + "learning_rate": 2.22028878340525e-07, + "loss": 0.4523, + "step": 11557 + }, + { + "epoch": 0.91, + "grad_norm": 1.9655068550644523, + "learning_rate": 2.2165425251898743e-07, + "loss": 0.4144, + "step": 11558 + }, + { + "epoch": 0.91, + "grad_norm": 1.857684725555167, + "learning_rate": 2.212799358501694e-07, + "loss": 0.4714, + "step": 11559 + }, + { + "epoch": 0.91, + "grad_norm": 1.9652848317952962, + "learning_rate": 2.2090592835828817e-07, + "loss": 0.4937, + "step": 11560 + }, + { + "epoch": 0.91, + "grad_norm": 2.1636250670862816, + "learning_rate": 2.2053223006754166e-07, + "loss": 0.4448, + "step": 11561 + }, + { + "epoch": 0.91, + "grad_norm": 1.8285878564358173, + "learning_rate": 2.2015884100210717e-07, + "loss": 0.4099, + "step": 11562 + }, + { + "epoch": 0.91, + "grad_norm": 1.4770952271413116, + "learning_rate": 2.1978576118614315e-07, + "loss": 0.4794, + "step": 11563 + }, + { + "epoch": 0.91, + "grad_norm": 1.642925565625324, + "learning_rate": 2.1941299064378697e-07, + "loss": 0.4624, + "step": 11564 + }, + { + "epoch": 0.91, + "grad_norm": 3.1383780131702723, + "learning_rate": 2.1904052939915654e-07, + "loss": 0.4576, + "step": 11565 + }, + { + "epoch": 0.91, + "grad_norm": 2.3160913095182063, + "learning_rate": 2.1866837747634984e-07, + "loss": 0.527, + "step": 11566 + }, + { + "epoch": 0.91, + "grad_norm": 1.4668568066766676, + "learning_rate": 2.1829653489944368e-07, + "loss": 0.4025, + "step": 11567 + }, + { + "epoch": 0.91, + "grad_norm": 2.0666368858527995, + "learning_rate": 2.1792500169249665e-07, + "loss": 0.4298, + "step": 11568 + }, + { + "epoch": 0.91, + "grad_norm": 1.8285084925825668, + "learning_rate": 2.1755377787954724e-07, + "loss": 0.4079, + "step": 11569 + }, + { + "epoch": 0.91, + "grad_norm": 1.5435821511655918, + "learning_rate": 2.171828634846107e-07, + "loss": 0.4442, + "step": 11570 + }, + { + "epoch": 0.91, + "grad_norm": 2.0623593439148813, + "learning_rate": 2.1681225853168619e-07, + "loss": 0.4361, + "step": 11571 + }, + { + "epoch": 0.91, + "grad_norm": 0.5450558251007471, + "learning_rate": 2.1644196304475285e-07, + "loss": 0.4657, + "step": 11572 + }, + { + "epoch": 0.91, + "grad_norm": 1.5219296057439682, + "learning_rate": 2.1607197704776593e-07, + "loss": 0.4352, + "step": 11573 + }, + { + "epoch": 0.91, + "grad_norm": 1.8415617436970722, + "learning_rate": 2.1570230056466412e-07, + "loss": 0.5017, + "step": 11574 + }, + { + "epoch": 0.91, + "grad_norm": 1.515362573435689, + "learning_rate": 2.1533293361936381e-07, + "loss": 0.4751, + "step": 11575 + }, + { + "epoch": 0.91, + "grad_norm": 2.1913867957065527, + "learning_rate": 2.149638762357653e-07, + "loss": 0.4693, + "step": 11576 + }, + { + "epoch": 0.91, + "grad_norm": 1.8116784112370488, + "learning_rate": 2.1459512843774344e-07, + "loss": 0.4606, + "step": 11577 + }, + { + "epoch": 0.91, + "grad_norm": 2.290734175068875, + "learning_rate": 2.1422669024915632e-07, + "loss": 0.431, + "step": 11578 + }, + { + "epoch": 0.91, + "grad_norm": 4.157617377698731, + "learning_rate": 2.1385856169384212e-07, + "loss": 0.429, + "step": 11579 + }, + { + "epoch": 0.91, + "grad_norm": 0.5308058918915947, + "learning_rate": 2.1349074279561842e-07, + "loss": 0.456, + "step": 11580 + }, + { + "epoch": 0.91, + "grad_norm": 2.46404386226592, + "learning_rate": 2.131232335782818e-07, + "loss": 0.4617, + "step": 11581 + }, + { + "epoch": 0.91, + "grad_norm": 1.6660260192758156, + "learning_rate": 2.1275603406560984e-07, + "loss": 0.4164, + "step": 11582 + }, + { + "epoch": 0.91, + "grad_norm": 0.5682696302155233, + "learning_rate": 2.123891442813597e-07, + "loss": 0.4748, + "step": 11583 + }, + { + "epoch": 0.91, + "grad_norm": 1.4616178722973794, + "learning_rate": 2.1202256424926904e-07, + "loss": 0.4912, + "step": 11584 + }, + { + "epoch": 0.91, + "grad_norm": 2.263021123018609, + "learning_rate": 2.1165629399305553e-07, + "loss": 0.4166, + "step": 11585 + }, + { + "epoch": 0.91, + "grad_norm": 1.7393670491155402, + "learning_rate": 2.1129033353641527e-07, + "loss": 0.468, + "step": 11586 + }, + { + "epoch": 0.91, + "grad_norm": 1.6085495433722117, + "learning_rate": 2.1092468290302649e-07, + "loss": 0.4779, + "step": 11587 + }, + { + "epoch": 0.91, + "grad_norm": 1.6744697093702912, + "learning_rate": 2.1055934211654527e-07, + "loss": 0.4602, + "step": 11588 + }, + { + "epoch": 0.91, + "grad_norm": 3.268146057217509, + "learning_rate": 2.1019431120060996e-07, + "loss": 0.456, + "step": 11589 + }, + { + "epoch": 0.91, + "grad_norm": 1.7486774258357767, + "learning_rate": 2.0982959017883554e-07, + "loss": 0.4097, + "step": 11590 + }, + { + "epoch": 0.91, + "grad_norm": 2.320142242361531, + "learning_rate": 2.09465179074822e-07, + "loss": 0.4064, + "step": 11591 + }, + { + "epoch": 0.91, + "grad_norm": 3.242958107989898, + "learning_rate": 2.0910107791214384e-07, + "loss": 0.4363, + "step": 11592 + }, + { + "epoch": 0.91, + "grad_norm": 1.981069144351185, + "learning_rate": 2.0873728671435834e-07, + "loss": 0.4408, + "step": 11593 + }, + { + "epoch": 0.91, + "grad_norm": 2.320356188281922, + "learning_rate": 2.0837380550500275e-07, + "loss": 0.435, + "step": 11594 + }, + { + "epoch": 0.91, + "grad_norm": 2.2015991794633907, + "learning_rate": 2.0801063430759384e-07, + "loss": 0.4901, + "step": 11595 + }, + { + "epoch": 0.91, + "grad_norm": 1.9990019182457437, + "learning_rate": 2.0764777314562834e-07, + "loss": 0.4306, + "step": 11596 + }, + { + "epoch": 0.91, + "grad_norm": 1.9163960061804437, + "learning_rate": 2.072852220425825e-07, + "loss": 0.4644, + "step": 11597 + }, + { + "epoch": 0.91, + "grad_norm": 1.778514925806966, + "learning_rate": 2.069229810219131e-07, + "loss": 0.4941, + "step": 11598 + }, + { + "epoch": 0.91, + "grad_norm": 1.6211055040473192, + "learning_rate": 2.0656105010705696e-07, + "loss": 0.4179, + "step": 11599 + }, + { + "epoch": 0.91, + "grad_norm": 1.6831907000069037, + "learning_rate": 2.0619942932143034e-07, + "loss": 0.385, + "step": 11600 + }, + { + "epoch": 0.91, + "grad_norm": 1.8198307124341628, + "learning_rate": 2.0583811868842952e-07, + "loss": 0.4616, + "step": 11601 + }, + { + "epoch": 0.91, + "grad_norm": 0.5259222358063694, + "learning_rate": 2.0547711823143024e-07, + "loss": 0.4636, + "step": 11602 + }, + { + "epoch": 0.91, + "grad_norm": 1.4619442220726642, + "learning_rate": 2.0511642797378995e-07, + "loss": 0.458, + "step": 11603 + }, + { + "epoch": 0.91, + "grad_norm": 2.066107231851946, + "learning_rate": 2.0475604793884385e-07, + "loss": 0.4493, + "step": 11604 + }, + { + "epoch": 0.91, + "grad_norm": 2.846102312571656, + "learning_rate": 2.0439597814990943e-07, + "loss": 0.5673, + "step": 11605 + }, + { + "epoch": 0.91, + "grad_norm": 1.682014747141216, + "learning_rate": 2.0403621863028022e-07, + "loss": 0.448, + "step": 11606 + }, + { + "epoch": 0.91, + "grad_norm": 1.7222181787300466, + "learning_rate": 2.0367676940323545e-07, + "loss": 0.405, + "step": 11607 + }, + { + "epoch": 0.91, + "grad_norm": 2.1382575977868075, + "learning_rate": 2.0331763049202868e-07, + "loss": 0.4307, + "step": 11608 + }, + { + "epoch": 0.91, + "grad_norm": 1.6968063808615348, + "learning_rate": 2.0295880191989637e-07, + "loss": 0.4625, + "step": 11609 + }, + { + "epoch": 0.91, + "grad_norm": 2.3394350511508932, + "learning_rate": 2.0260028371005324e-07, + "loss": 0.435, + "step": 11610 + }, + { + "epoch": 0.91, + "grad_norm": 0.574160397364898, + "learning_rate": 2.0224207588569743e-07, + "loss": 0.4739, + "step": 11611 + }, + { + "epoch": 0.91, + "grad_norm": 1.4157438572398997, + "learning_rate": 2.0188417847000264e-07, + "loss": 0.4301, + "step": 11612 + }, + { + "epoch": 0.91, + "grad_norm": 2.022310962266582, + "learning_rate": 2.015265914861242e-07, + "loss": 0.4337, + "step": 11613 + }, + { + "epoch": 0.91, + "grad_norm": 1.8392879442417496, + "learning_rate": 2.011693149571986e-07, + "loss": 0.5048, + "step": 11614 + }, + { + "epoch": 0.91, + "grad_norm": 0.5589643165917287, + "learning_rate": 2.0081234890634072e-07, + "loss": 0.4773, + "step": 11615 + }, + { + "epoch": 0.91, + "grad_norm": 1.7195664601762457, + "learning_rate": 2.0045569335664595e-07, + "loss": 0.4187, + "step": 11616 + }, + { + "epoch": 0.91, + "grad_norm": 1.308172576146993, + "learning_rate": 2.000993483311886e-07, + "loss": 0.4029, + "step": 11617 + }, + { + "epoch": 0.91, + "grad_norm": 0.5397308665185568, + "learning_rate": 1.9974331385302471e-07, + "loss": 0.4414, + "step": 11618 + }, + { + "epoch": 0.91, + "grad_norm": 1.6496242750131471, + "learning_rate": 1.9938758994518858e-07, + "loss": 0.4737, + "step": 11619 + }, + { + "epoch": 0.91, + "grad_norm": 0.5365566894805941, + "learning_rate": 1.990321766306963e-07, + "loss": 0.4651, + "step": 11620 + }, + { + "epoch": 0.91, + "grad_norm": 0.5735115217978257, + "learning_rate": 1.9867707393254055e-07, + "loss": 0.4782, + "step": 11621 + }, + { + "epoch": 0.91, + "grad_norm": 2.2155802736907844, + "learning_rate": 1.9832228187369795e-07, + "loss": 0.4699, + "step": 11622 + }, + { + "epoch": 0.91, + "grad_norm": 1.4716095473700237, + "learning_rate": 1.9796780047712183e-07, + "loss": 0.4297, + "step": 11623 + }, + { + "epoch": 0.91, + "grad_norm": 1.970630551248207, + "learning_rate": 1.976136297657477e-07, + "loss": 0.4208, + "step": 11624 + }, + { + "epoch": 0.91, + "grad_norm": 0.5832398212605996, + "learning_rate": 1.972597697624884e-07, + "loss": 0.4571, + "step": 11625 + }, + { + "epoch": 0.91, + "grad_norm": 1.4003509948693398, + "learning_rate": 1.9690622049024e-07, + "loss": 0.408, + "step": 11626 + }, + { + "epoch": 0.91, + "grad_norm": 2.8614870074939063, + "learning_rate": 1.9655298197187646e-07, + "loss": 0.4654, + "step": 11627 + }, + { + "epoch": 0.91, + "grad_norm": 1.4231787220132968, + "learning_rate": 1.9620005423025056e-07, + "loss": 0.4819, + "step": 11628 + }, + { + "epoch": 0.91, + "grad_norm": 0.5170338117692749, + "learning_rate": 1.958474372881969e-07, + "loss": 0.4627, + "step": 11629 + }, + { + "epoch": 0.91, + "grad_norm": 1.6075157976621832, + "learning_rate": 1.9549513116852937e-07, + "loss": 0.4597, + "step": 11630 + }, + { + "epoch": 0.91, + "grad_norm": 2.3814228706855194, + "learning_rate": 1.9514313589404254e-07, + "loss": 0.4149, + "step": 11631 + }, + { + "epoch": 0.91, + "grad_norm": 1.5144638306441636, + "learning_rate": 1.947914514875088e-07, + "loss": 0.4471, + "step": 11632 + }, + { + "epoch": 0.91, + "grad_norm": 2.827336960243377, + "learning_rate": 1.9444007797168208e-07, + "loss": 0.4891, + "step": 11633 + }, + { + "epoch": 0.91, + "grad_norm": 1.6389174844848375, + "learning_rate": 1.9408901536929592e-07, + "loss": 0.4981, + "step": 11634 + }, + { + "epoch": 0.91, + "grad_norm": 2.1337538953464175, + "learning_rate": 1.937382637030638e-07, + "loss": 0.4874, + "step": 11635 + }, + { + "epoch": 0.91, + "grad_norm": 2.3994266266616378, + "learning_rate": 1.9338782299567815e-07, + "loss": 0.4651, + "step": 11636 + }, + { + "epoch": 0.91, + "grad_norm": 2.3322566128995708, + "learning_rate": 1.9303769326981193e-07, + "loss": 0.41, + "step": 11637 + }, + { + "epoch": 0.91, + "grad_norm": 2.820720599621765, + "learning_rate": 1.9268787454812032e-07, + "loss": 0.3925, + "step": 11638 + }, + { + "epoch": 0.91, + "grad_norm": 0.5775772691937925, + "learning_rate": 1.9233836685323303e-07, + "loss": 0.4572, + "step": 11639 + }, + { + "epoch": 0.91, + "grad_norm": 1.8084868230218274, + "learning_rate": 1.9198917020776532e-07, + "loss": 0.4723, + "step": 11640 + }, + { + "epoch": 0.91, + "grad_norm": 1.3103122734403823, + "learning_rate": 1.9164028463430738e-07, + "loss": 0.4105, + "step": 11641 + }, + { + "epoch": 0.91, + "grad_norm": 2.1628537869307385, + "learning_rate": 1.91291710155434e-07, + "loss": 0.4057, + "step": 11642 + }, + { + "epoch": 0.91, + "grad_norm": 1.5206609882420195, + "learning_rate": 1.9094344679369602e-07, + "loss": 0.4058, + "step": 11643 + }, + { + "epoch": 0.91, + "grad_norm": 1.4943644369273077, + "learning_rate": 1.9059549457162652e-07, + "loss": 0.4503, + "step": 11644 + }, + { + "epoch": 0.91, + "grad_norm": 1.4623873662846762, + "learning_rate": 1.902478535117358e-07, + "loss": 0.4249, + "step": 11645 + }, + { + "epoch": 0.91, + "grad_norm": 2.6449577851391033, + "learning_rate": 1.8990052363651812e-07, + "loss": 0.4197, + "step": 11646 + }, + { + "epoch": 0.91, + "grad_norm": 3.5875981473689658, + "learning_rate": 1.8955350496844382e-07, + "loss": 0.4853, + "step": 11647 + }, + { + "epoch": 0.91, + "grad_norm": 1.6546410222302237, + "learning_rate": 1.8920679752996441e-07, + "loss": 0.3984, + "step": 11648 + }, + { + "epoch": 0.91, + "grad_norm": 1.680195054991957, + "learning_rate": 1.8886040134351248e-07, + "loss": 0.4421, + "step": 11649 + }, + { + "epoch": 0.91, + "grad_norm": 2.143420574787109, + "learning_rate": 1.8851431643149843e-07, + "loss": 0.4128, + "step": 11650 + }, + { + "epoch": 0.92, + "grad_norm": 0.5329201202497394, + "learning_rate": 1.8816854281631435e-07, + "loss": 0.4747, + "step": 11651 + }, + { + "epoch": 0.92, + "grad_norm": 1.9878864414228619, + "learning_rate": 1.8782308052033072e-07, + "loss": 0.3681, + "step": 11652 + }, + { + "epoch": 0.92, + "grad_norm": 1.7494448824314872, + "learning_rate": 1.8747792956589793e-07, + "loss": 0.4904, + "step": 11653 + }, + { + "epoch": 0.92, + "grad_norm": 1.7084955679775087, + "learning_rate": 1.8713308997534819e-07, + "loss": 0.4709, + "step": 11654 + }, + { + "epoch": 0.92, + "grad_norm": 0.5248012759503514, + "learning_rate": 1.8678856177099193e-07, + "loss": 0.4722, + "step": 11655 + }, + { + "epoch": 0.92, + "grad_norm": 2.0269088247804787, + "learning_rate": 1.8644434497511855e-07, + "loss": 0.4441, + "step": 11656 + }, + { + "epoch": 0.92, + "grad_norm": 2.018763957139318, + "learning_rate": 1.8610043960999968e-07, + "loss": 0.4347, + "step": 11657 + }, + { + "epoch": 0.92, + "grad_norm": 2.9314586164153997, + "learning_rate": 1.8575684569788422e-07, + "loss": 0.461, + "step": 11658 + }, + { + "epoch": 0.92, + "grad_norm": 1.9558629127624854, + "learning_rate": 1.8541356326100436e-07, + "loss": 0.5043, + "step": 11659 + }, + { + "epoch": 0.92, + "grad_norm": 0.549313648712087, + "learning_rate": 1.8507059232156677e-07, + "loss": 0.4707, + "step": 11660 + }, + { + "epoch": 0.92, + "grad_norm": 0.5672296151584785, + "learning_rate": 1.8472793290176428e-07, + "loss": 0.4638, + "step": 11661 + }, + { + "epoch": 0.92, + "grad_norm": 1.4357285619514586, + "learning_rate": 1.843855850237658e-07, + "loss": 0.4392, + "step": 11662 + }, + { + "epoch": 0.92, + "grad_norm": 1.9811760602233783, + "learning_rate": 1.8404354870971975e-07, + "loss": 0.452, + "step": 11663 + }, + { + "epoch": 0.92, + "grad_norm": 0.5425607736085312, + "learning_rate": 1.8370182398175563e-07, + "loss": 0.493, + "step": 11664 + }, + { + "epoch": 0.92, + "grad_norm": 1.715600325251407, + "learning_rate": 1.8336041086198353e-07, + "loss": 0.4879, + "step": 11665 + }, + { + "epoch": 0.92, + "grad_norm": 1.7025627579313158, + "learning_rate": 1.8301930937249247e-07, + "loss": 0.3811, + "step": 11666 + }, + { + "epoch": 0.92, + "grad_norm": 2.1102612868056516, + "learning_rate": 1.8267851953534975e-07, + "loss": 0.432, + "step": 11667 + }, + { + "epoch": 0.92, + "grad_norm": 1.9142350793186051, + "learning_rate": 1.8233804137260502e-07, + "loss": 0.4279, + "step": 11668 + }, + { + "epoch": 0.92, + "grad_norm": 1.3686310315645536, + "learning_rate": 1.8199787490628672e-07, + "loss": 0.4218, + "step": 11669 + }, + { + "epoch": 0.92, + "grad_norm": 2.094288253139774, + "learning_rate": 1.8165802015840394e-07, + "loss": 0.5162, + "step": 11670 + }, + { + "epoch": 0.92, + "grad_norm": 1.6267566053431828, + "learning_rate": 1.8131847715094354e-07, + "loss": 0.4153, + "step": 11671 + }, + { + "epoch": 0.92, + "grad_norm": 1.5263953086911533, + "learning_rate": 1.8097924590587345e-07, + "loss": 0.3842, + "step": 11672 + }, + { + "epoch": 0.92, + "grad_norm": 1.8573317461697971, + "learning_rate": 1.8064032644514284e-07, + "loss": 0.4834, + "step": 11673 + }, + { + "epoch": 0.92, + "grad_norm": 1.6050083062188751, + "learning_rate": 1.8030171879067858e-07, + "loss": 0.5432, + "step": 11674 + }, + { + "epoch": 0.92, + "grad_norm": 1.85345786438777, + "learning_rate": 1.7996342296438817e-07, + "loss": 0.4472, + "step": 11675 + }, + { + "epoch": 0.92, + "grad_norm": 1.9006927653013674, + "learning_rate": 1.7962543898815798e-07, + "loss": 0.4083, + "step": 11676 + }, + { + "epoch": 0.92, + "grad_norm": 1.43383714114517, + "learning_rate": 1.7928776688385774e-07, + "loss": 0.452, + "step": 11677 + }, + { + "epoch": 0.92, + "grad_norm": 0.5538166493161042, + "learning_rate": 1.7895040667333162e-07, + "loss": 0.4886, + "step": 11678 + }, + { + "epoch": 0.92, + "grad_norm": 1.5691896060784905, + "learning_rate": 1.7861335837840777e-07, + "loss": 0.4739, + "step": 11679 + }, + { + "epoch": 0.92, + "grad_norm": 2.831635097430381, + "learning_rate": 1.7827662202089147e-07, + "loss": 0.449, + "step": 11680 + }, + { + "epoch": 0.92, + "grad_norm": 0.5493448542797732, + "learning_rate": 1.7794019762257143e-07, + "loss": 0.4695, + "step": 11681 + }, + { + "epoch": 0.92, + "grad_norm": 1.595412397775315, + "learning_rate": 1.776040852052119e-07, + "loss": 0.3759, + "step": 11682 + }, + { + "epoch": 0.92, + "grad_norm": 1.810619668862836, + "learning_rate": 1.772682847905599e-07, + "loss": 0.4705, + "step": 11683 + }, + { + "epoch": 0.92, + "grad_norm": 1.7690280015596287, + "learning_rate": 1.769327964003409e-07, + "loss": 0.4505, + "step": 11684 + }, + { + "epoch": 0.92, + "grad_norm": 1.9531642864754117, + "learning_rate": 1.7659762005626025e-07, + "loss": 0.4933, + "step": 11685 + }, + { + "epoch": 0.92, + "grad_norm": 2.3832544272593834, + "learning_rate": 1.762627557800045e-07, + "loss": 0.4736, + "step": 11686 + }, + { + "epoch": 0.92, + "grad_norm": 1.6428866634346775, + "learning_rate": 1.7592820359323748e-07, + "loss": 0.4457, + "step": 11687 + }, + { + "epoch": 0.92, + "grad_norm": 3.459835124730534, + "learning_rate": 1.7559396351760516e-07, + "loss": 0.4679, + "step": 11688 + }, + { + "epoch": 0.92, + "grad_norm": 1.9869478851280151, + "learning_rate": 1.752600355747325e-07, + "loss": 0.4854, + "step": 11689 + }, + { + "epoch": 0.92, + "grad_norm": 1.8889244266455447, + "learning_rate": 1.7492641978622382e-07, + "loss": 0.4471, + "step": 11690 + }, + { + "epoch": 0.92, + "grad_norm": 2.8149760252284493, + "learning_rate": 1.7459311617366415e-07, + "loss": 0.3922, + "step": 11691 + }, + { + "epoch": 0.92, + "grad_norm": 1.7654334978113753, + "learning_rate": 1.7426012475861676e-07, + "loss": 0.4416, + "step": 11692 + }, + { + "epoch": 0.92, + "grad_norm": 0.546519247637802, + "learning_rate": 1.7392744556262665e-07, + "loss": 0.4635, + "step": 11693 + }, + { + "epoch": 0.92, + "grad_norm": 2.213320408149971, + "learning_rate": 1.735950786072177e-07, + "loss": 0.4437, + "step": 11694 + }, + { + "epoch": 0.92, + "grad_norm": 1.4796898018508646, + "learning_rate": 1.7326302391389327e-07, + "loss": 0.4866, + "step": 11695 + }, + { + "epoch": 0.92, + "grad_norm": 1.7478029664733126, + "learning_rate": 1.7293128150413674e-07, + "loss": 0.4483, + "step": 11696 + }, + { + "epoch": 0.92, + "grad_norm": 1.6975966311324375, + "learning_rate": 1.725998513994126e-07, + "loss": 0.4207, + "step": 11697 + }, + { + "epoch": 0.92, + "grad_norm": 2.0983968243646123, + "learning_rate": 1.722687336211626e-07, + "loss": 0.4871, + "step": 11698 + }, + { + "epoch": 0.92, + "grad_norm": 1.5189287837012568, + "learning_rate": 1.7193792819081012e-07, + "loss": 0.4396, + "step": 11699 + }, + { + "epoch": 0.92, + "grad_norm": 1.7631721230813067, + "learning_rate": 1.7160743512975753e-07, + "loss": 0.4379, + "step": 11700 + }, + { + "epoch": 0.92, + "grad_norm": 3.1318750828811144, + "learning_rate": 1.7127725445938827e-07, + "loss": 0.4334, + "step": 11701 + }, + { + "epoch": 0.92, + "grad_norm": 3.2315353334409838, + "learning_rate": 1.70947386201063e-07, + "loss": 0.4221, + "step": 11702 + }, + { + "epoch": 0.92, + "grad_norm": 1.5154710352699057, + "learning_rate": 1.7061783037612468e-07, + "loss": 0.4226, + "step": 11703 + }, + { + "epoch": 0.92, + "grad_norm": 1.5780235033149828, + "learning_rate": 1.7028858700589567e-07, + "loss": 0.4612, + "step": 11704 + }, + { + "epoch": 0.92, + "grad_norm": 2.6857071600323215, + "learning_rate": 1.6995965611167676e-07, + "loss": 0.459, + "step": 11705 + }, + { + "epoch": 0.92, + "grad_norm": 1.8636272294791172, + "learning_rate": 1.6963103771474977e-07, + "loss": 0.4818, + "step": 11706 + }, + { + "epoch": 0.92, + "grad_norm": 1.7058722445479702, + "learning_rate": 1.6930273183637547e-07, + "loss": 0.444, + "step": 11707 + }, + { + "epoch": 0.92, + "grad_norm": 0.5762754065314423, + "learning_rate": 1.6897473849779522e-07, + "loss": 0.4674, + "step": 11708 + }, + { + "epoch": 0.92, + "grad_norm": 1.896069031975638, + "learning_rate": 1.6864705772022927e-07, + "loss": 0.4441, + "step": 11709 + }, + { + "epoch": 0.92, + "grad_norm": 2.388427535974961, + "learning_rate": 1.6831968952487897e-07, + "loss": 0.3808, + "step": 11710 + }, + { + "epoch": 0.92, + "grad_norm": 2.209481080411894, + "learning_rate": 1.6799263393292352e-07, + "loss": 0.428, + "step": 11711 + }, + { + "epoch": 0.92, + "grad_norm": 0.5409480482351268, + "learning_rate": 1.6766589096552432e-07, + "loss": 0.4754, + "step": 11712 + }, + { + "epoch": 0.92, + "grad_norm": 1.6017806126677618, + "learning_rate": 1.6733946064382002e-07, + "loss": 0.4349, + "step": 11713 + }, + { + "epoch": 0.92, + "grad_norm": 2.117835367983883, + "learning_rate": 1.6701334298893146e-07, + "loss": 0.4711, + "step": 11714 + }, + { + "epoch": 0.92, + "grad_norm": 1.5793243300401658, + "learning_rate": 1.6668753802195624e-07, + "loss": 0.4055, + "step": 11715 + }, + { + "epoch": 0.92, + "grad_norm": 1.7178818244728493, + "learning_rate": 1.6636204576397474e-07, + "loss": 0.4223, + "step": 11716 + }, + { + "epoch": 0.92, + "grad_norm": 1.921887025210037, + "learning_rate": 1.660368662360462e-07, + "loss": 0.4347, + "step": 11717 + }, + { + "epoch": 0.92, + "grad_norm": 1.4684232702140438, + "learning_rate": 1.6571199945920824e-07, + "loss": 0.4019, + "step": 11718 + }, + { + "epoch": 0.92, + "grad_norm": 1.8215305646883584, + "learning_rate": 1.6538744545448016e-07, + "loss": 0.4282, + "step": 11719 + }, + { + "epoch": 0.92, + "grad_norm": 1.3457139713756714, + "learning_rate": 1.650632042428596e-07, + "loss": 0.4706, + "step": 11720 + }, + { + "epoch": 0.92, + "grad_norm": 1.8460228167536126, + "learning_rate": 1.647392758453259e-07, + "loss": 0.4424, + "step": 11721 + }, + { + "epoch": 0.92, + "grad_norm": 0.5670311731254835, + "learning_rate": 1.644156602828345e-07, + "loss": 0.4618, + "step": 11722 + }, + { + "epoch": 0.92, + "grad_norm": 0.5385163643714175, + "learning_rate": 1.640923575763248e-07, + "loss": 0.4623, + "step": 11723 + }, + { + "epoch": 0.92, + "grad_norm": 1.7504067138636776, + "learning_rate": 1.6376936774671282e-07, + "loss": 0.4806, + "step": 11724 + }, + { + "epoch": 0.92, + "grad_norm": 1.7419246664879728, + "learning_rate": 1.6344669081489684e-07, + "loss": 0.4719, + "step": 11725 + }, + { + "epoch": 0.92, + "grad_norm": 2.508917603475176, + "learning_rate": 1.6312432680175294e-07, + "loss": 0.5188, + "step": 11726 + }, + { + "epoch": 0.92, + "grad_norm": 0.5041972126193592, + "learning_rate": 1.6280227572813723e-07, + "loss": 0.4522, + "step": 11727 + }, + { + "epoch": 0.92, + "grad_norm": 1.8770011437191434, + "learning_rate": 1.6248053761488636e-07, + "loss": 0.4168, + "step": 11728 + }, + { + "epoch": 0.92, + "grad_norm": 2.065771088725907, + "learning_rate": 1.6215911248281646e-07, + "loss": 0.4602, + "step": 11729 + }, + { + "epoch": 0.92, + "grad_norm": 0.5790262580533339, + "learning_rate": 1.6183800035272312e-07, + "loss": 0.4678, + "step": 11730 + }, + { + "epoch": 0.92, + "grad_norm": 2.072923969698755, + "learning_rate": 1.615172012453825e-07, + "loss": 0.4187, + "step": 11731 + }, + { + "epoch": 0.92, + "grad_norm": 3.072401735812162, + "learning_rate": 1.6119671518154966e-07, + "loss": 0.4527, + "step": 11732 + }, + { + "epoch": 0.92, + "grad_norm": 2.287261237985155, + "learning_rate": 1.6087654218195914e-07, + "loss": 0.4429, + "step": 11733 + }, + { + "epoch": 0.92, + "grad_norm": 1.51499823437332, + "learning_rate": 1.6055668226732602e-07, + "loss": 0.4671, + "step": 11734 + }, + { + "epoch": 0.92, + "grad_norm": 0.5390546737187387, + "learning_rate": 1.602371354583443e-07, + "loss": 0.4722, + "step": 11735 + }, + { + "epoch": 0.92, + "grad_norm": 0.5772369128030052, + "learning_rate": 1.5991790177568967e-07, + "loss": 0.4917, + "step": 11736 + }, + { + "epoch": 0.92, + "grad_norm": 2.6141040547021595, + "learning_rate": 1.595989812400145e-07, + "loss": 0.5119, + "step": 11737 + }, + { + "epoch": 0.92, + "grad_norm": 2.1590915769767207, + "learning_rate": 1.5928037387195283e-07, + "loss": 0.4265, + "step": 11738 + }, + { + "epoch": 0.92, + "grad_norm": 1.7952018915691113, + "learning_rate": 1.5896207969211873e-07, + "loss": 0.4702, + "step": 11739 + }, + { + "epoch": 0.92, + "grad_norm": 2.3784287043932126, + "learning_rate": 1.586440987211052e-07, + "loss": 0.4555, + "step": 11740 + }, + { + "epoch": 0.92, + "grad_norm": 1.8408478264027306, + "learning_rate": 1.583264309794863e-07, + "loss": 0.4219, + "step": 11741 + }, + { + "epoch": 0.92, + "grad_norm": 1.5522152831646425, + "learning_rate": 1.5800907648781226e-07, + "loss": 0.401, + "step": 11742 + }, + { + "epoch": 0.92, + "grad_norm": 1.5848418673087012, + "learning_rate": 1.5769203526661725e-07, + "loss": 0.4836, + "step": 11743 + }, + { + "epoch": 0.92, + "grad_norm": 1.6714697910516616, + "learning_rate": 1.5737530733641316e-07, + "loss": 0.4593, + "step": 11744 + }, + { + "epoch": 0.92, + "grad_norm": 2.0432444829873164, + "learning_rate": 1.5705889271769193e-07, + "loss": 0.4546, + "step": 11745 + }, + { + "epoch": 0.92, + "grad_norm": 0.5415021101007814, + "learning_rate": 1.5674279143092386e-07, + "loss": 0.4756, + "step": 11746 + }, + { + "epoch": 0.92, + "grad_norm": 1.6342718162943994, + "learning_rate": 1.5642700349656315e-07, + "loss": 0.4146, + "step": 11747 + }, + { + "epoch": 0.92, + "grad_norm": 4.007351409577253, + "learning_rate": 1.561115289350379e-07, + "loss": 0.3962, + "step": 11748 + }, + { + "epoch": 0.92, + "grad_norm": 3.5274829211257144, + "learning_rate": 1.5579636776676065e-07, + "loss": 0.4788, + "step": 11749 + }, + { + "epoch": 0.92, + "grad_norm": 2.068920914082165, + "learning_rate": 1.554815200121207e-07, + "loss": 0.4625, + "step": 11750 + }, + { + "epoch": 0.92, + "grad_norm": 1.9001688101723404, + "learning_rate": 1.5516698569148946e-07, + "loss": 0.448, + "step": 11751 + }, + { + "epoch": 0.92, + "grad_norm": 2.072169792362131, + "learning_rate": 1.5485276482521683e-07, + "loss": 0.4276, + "step": 11752 + }, + { + "epoch": 0.92, + "grad_norm": 1.7416557488472462, + "learning_rate": 1.5453885743363152e-07, + "loss": 0.4124, + "step": 11753 + }, + { + "epoch": 0.92, + "grad_norm": 7.585707352257947, + "learning_rate": 1.5422526353704337e-07, + "loss": 0.4148, + "step": 11754 + }, + { + "epoch": 0.92, + "grad_norm": 1.5493074466594068, + "learning_rate": 1.5391198315574175e-07, + "loss": 0.4154, + "step": 11755 + }, + { + "epoch": 0.92, + "grad_norm": 1.6782831014129347, + "learning_rate": 1.5359901630999596e-07, + "loss": 0.4625, + "step": 11756 + }, + { + "epoch": 0.92, + "grad_norm": 0.5275036593292459, + "learning_rate": 1.5328636302005317e-07, + "loss": 0.4483, + "step": 11757 + }, + { + "epoch": 0.92, + "grad_norm": 1.8550485810493866, + "learning_rate": 1.5297402330614276e-07, + "loss": 0.4682, + "step": 11758 + }, + { + "epoch": 0.92, + "grad_norm": 1.5052002794253732, + "learning_rate": 1.5266199718847185e-07, + "loss": 0.4214, + "step": 11759 + }, + { + "epoch": 0.92, + "grad_norm": 0.5472971086148162, + "learning_rate": 1.5235028468722934e-07, + "loss": 0.44, + "step": 11760 + }, + { + "epoch": 0.92, + "grad_norm": 1.8911524856423803, + "learning_rate": 1.5203888582258187e-07, + "loss": 0.4484, + "step": 11761 + }, + { + "epoch": 0.92, + "grad_norm": 1.7015379917537463, + "learning_rate": 1.5172780061467551e-07, + "loss": 0.4875, + "step": 11762 + }, + { + "epoch": 0.92, + "grad_norm": 2.4568833628671496, + "learning_rate": 1.514170290836392e-07, + "loss": 0.4952, + "step": 11763 + }, + { + "epoch": 0.92, + "grad_norm": 0.5476256632750964, + "learning_rate": 1.511065712495774e-07, + "loss": 0.4603, + "step": 11764 + }, + { + "epoch": 0.92, + "grad_norm": 2.304871431191685, + "learning_rate": 1.5079642713257792e-07, + "loss": 0.4606, + "step": 11765 + }, + { + "epoch": 0.92, + "grad_norm": 1.8658886055179422, + "learning_rate": 1.5048659675270583e-07, + "loss": 0.4556, + "step": 11766 + }, + { + "epoch": 0.92, + "grad_norm": 1.7158709460536223, + "learning_rate": 1.5017708013000787e-07, + "loss": 0.4547, + "step": 11767 + }, + { + "epoch": 0.92, + "grad_norm": 0.5690187088992504, + "learning_rate": 1.4986787728450747e-07, + "loss": 0.4832, + "step": 11768 + }, + { + "epoch": 0.92, + "grad_norm": 1.66863306240449, + "learning_rate": 1.4955898823621084e-07, + "loss": 0.476, + "step": 11769 + }, + { + "epoch": 0.92, + "grad_norm": 1.6361697280701761, + "learning_rate": 1.49250413005102e-07, + "loss": 0.4889, + "step": 11770 + }, + { + "epoch": 0.92, + "grad_norm": 2.6268884981300578, + "learning_rate": 1.4894215161114722e-07, + "loss": 0.4754, + "step": 11771 + }, + { + "epoch": 0.92, + "grad_norm": 1.771910638155748, + "learning_rate": 1.4863420407428831e-07, + "loss": 0.4555, + "step": 11772 + }, + { + "epoch": 0.92, + "grad_norm": 1.403239337633192, + "learning_rate": 1.4832657041444986e-07, + "loss": 0.468, + "step": 11773 + }, + { + "epoch": 0.92, + "grad_norm": 2.128621604992297, + "learning_rate": 1.4801925065153544e-07, + "loss": 0.4175, + "step": 11774 + }, + { + "epoch": 0.92, + "grad_norm": 1.3295265146661326, + "learning_rate": 1.4771224480542857e-07, + "loss": 0.4661, + "step": 11775 + }, + { + "epoch": 0.92, + "grad_norm": 0.513466561118754, + "learning_rate": 1.4740555289599278e-07, + "loss": 0.4737, + "step": 11776 + }, + { + "epoch": 0.92, + "grad_norm": 2.3361652599373923, + "learning_rate": 1.4709917494306836e-07, + "loss": 0.4472, + "step": 11777 + }, + { + "epoch": 0.92, + "grad_norm": 1.9373870785458802, + "learning_rate": 1.467931109664794e-07, + "loss": 0.4632, + "step": 11778 + }, + { + "epoch": 0.93, + "grad_norm": 1.535779200380525, + "learning_rate": 1.4648736098602734e-07, + "loss": 0.4941, + "step": 11779 + }, + { + "epoch": 0.93, + "grad_norm": 1.5313879338354166, + "learning_rate": 1.4618192502149465e-07, + "loss": 0.3927, + "step": 11780 + }, + { + "epoch": 0.93, + "grad_norm": 0.505408323235525, + "learning_rate": 1.4587680309264053e-07, + "loss": 0.4684, + "step": 11781 + }, + { + "epoch": 0.93, + "grad_norm": 0.5658704565250217, + "learning_rate": 1.4557199521920806e-07, + "loss": 0.4579, + "step": 11782 + }, + { + "epoch": 0.93, + "grad_norm": 2.579745614303645, + "learning_rate": 1.452675014209165e-07, + "loss": 0.4513, + "step": 11783 + }, + { + "epoch": 0.93, + "grad_norm": 1.7317307861336095, + "learning_rate": 1.449633217174673e-07, + "loss": 0.4533, + "step": 11784 + }, + { + "epoch": 0.93, + "grad_norm": 1.5625232936981786, + "learning_rate": 1.4465945612853972e-07, + "loss": 0.3953, + "step": 11785 + }, + { + "epoch": 0.93, + "grad_norm": 1.8031530607266502, + "learning_rate": 1.4435590467379356e-07, + "loss": 0.3926, + "step": 11786 + }, + { + "epoch": 0.93, + "grad_norm": 1.767036524024196, + "learning_rate": 1.4405266737286926e-07, + "loss": 0.4477, + "step": 11787 + }, + { + "epoch": 0.93, + "grad_norm": 1.6921663471674042, + "learning_rate": 1.4374974424538446e-07, + "loss": 0.4732, + "step": 11788 + }, + { + "epoch": 0.93, + "grad_norm": 1.5205772729405225, + "learning_rate": 1.4344713531093845e-07, + "loss": 0.4489, + "step": 11789 + }, + { + "epoch": 0.93, + "grad_norm": 1.686222573396598, + "learning_rate": 1.4314484058910949e-07, + "loss": 0.4211, + "step": 11790 + }, + { + "epoch": 0.93, + "grad_norm": 0.5444070947995359, + "learning_rate": 1.4284286009945636e-07, + "loss": 0.4628, + "step": 11791 + }, + { + "epoch": 0.93, + "grad_norm": 1.589692956958823, + "learning_rate": 1.4254119386151567e-07, + "loss": 0.4839, + "step": 11792 + }, + { + "epoch": 0.93, + "grad_norm": 1.8288403791287917, + "learning_rate": 1.4223984189480512e-07, + "loss": 0.4608, + "step": 11793 + }, + { + "epoch": 0.93, + "grad_norm": 1.7461559182713902, + "learning_rate": 1.4193880421882245e-07, + "loss": 0.426, + "step": 11794 + }, + { + "epoch": 0.93, + "grad_norm": 1.814427882174773, + "learning_rate": 1.416380808530443e-07, + "loss": 0.4028, + "step": 11795 + }, + { + "epoch": 0.93, + "grad_norm": 1.8279203400288289, + "learning_rate": 1.4133767181692626e-07, + "loss": 0.4424, + "step": 11796 + }, + { + "epoch": 0.93, + "grad_norm": 0.5753394666998932, + "learning_rate": 1.410375771299044e-07, + "loss": 0.5023, + "step": 11797 + }, + { + "epoch": 0.93, + "grad_norm": 1.5094929337371277, + "learning_rate": 1.4073779681139655e-07, + "loss": 0.4065, + "step": 11798 + }, + { + "epoch": 0.93, + "grad_norm": 2.3697664726896503, + "learning_rate": 1.404383308807955e-07, + "loss": 0.4596, + "step": 11799 + }, + { + "epoch": 0.93, + "grad_norm": 1.9172992439108292, + "learning_rate": 1.4013917935747744e-07, + "loss": 0.4382, + "step": 11800 + }, + { + "epoch": 0.93, + "grad_norm": 2.181812083023224, + "learning_rate": 1.398403422607969e-07, + "loss": 0.44, + "step": 11801 + }, + { + "epoch": 0.93, + "grad_norm": 1.6667817268286442, + "learning_rate": 1.3954181961008895e-07, + "loss": 0.4038, + "step": 11802 + }, + { + "epoch": 0.93, + "grad_norm": 2.228499181348054, + "learning_rate": 1.3924361142466647e-07, + "loss": 0.4799, + "step": 11803 + }, + { + "epoch": 0.93, + "grad_norm": 0.5446933006211802, + "learning_rate": 1.389457177238235e-07, + "loss": 0.4672, + "step": 11804 + }, + { + "epoch": 0.93, + "grad_norm": 1.9628941708710212, + "learning_rate": 1.3864813852683346e-07, + "loss": 0.4206, + "step": 11805 + }, + { + "epoch": 0.93, + "grad_norm": 2.2225193719545264, + "learning_rate": 1.3835087385294988e-07, + "loss": 0.4079, + "step": 11806 + }, + { + "epoch": 0.93, + "grad_norm": 1.748008663483608, + "learning_rate": 1.3805392372140514e-07, + "loss": 0.4172, + "step": 11807 + }, + { + "epoch": 0.93, + "grad_norm": 1.8159308094436208, + "learning_rate": 1.3775728815141055e-07, + "loss": 0.4658, + "step": 11808 + }, + { + "epoch": 0.93, + "grad_norm": 1.6111833964296154, + "learning_rate": 1.3746096716215906e-07, + "loss": 0.4822, + "step": 11809 + }, + { + "epoch": 0.93, + "grad_norm": 2.1832674905720406, + "learning_rate": 1.3716496077282205e-07, + "loss": 0.479, + "step": 11810 + }, + { + "epoch": 0.93, + "grad_norm": 1.7019671962116305, + "learning_rate": 1.3686926900255082e-07, + "loss": 0.4273, + "step": 11811 + }, + { + "epoch": 0.93, + "grad_norm": 1.5820665714242095, + "learning_rate": 1.3657389187047622e-07, + "loss": 0.4271, + "step": 11812 + }, + { + "epoch": 0.93, + "grad_norm": 0.5017301672721985, + "learning_rate": 1.3627882939570848e-07, + "loss": 0.4603, + "step": 11813 + }, + { + "epoch": 0.93, + "grad_norm": 1.5894047672238247, + "learning_rate": 1.359840815973379e-07, + "loss": 0.4129, + "step": 11814 + }, + { + "epoch": 0.93, + "grad_norm": 0.5779610539283278, + "learning_rate": 1.3568964849443421e-07, + "loss": 0.4441, + "step": 11815 + }, + { + "epoch": 0.93, + "grad_norm": 1.4367344687851928, + "learning_rate": 1.3539553010604666e-07, + "loss": 0.4902, + "step": 11816 + }, + { + "epoch": 0.93, + "grad_norm": 1.6874972787753546, + "learning_rate": 1.3510172645120556e-07, + "loss": 0.4714, + "step": 11817 + }, + { + "epoch": 0.93, + "grad_norm": 2.867792170666657, + "learning_rate": 1.348082375489179e-07, + "loss": 0.4133, + "step": 11818 + }, + { + "epoch": 0.93, + "grad_norm": 2.038960097807598, + "learning_rate": 1.34515063418173e-07, + "loss": 0.403, + "step": 11819 + }, + { + "epoch": 0.93, + "grad_norm": 0.5394822185844133, + "learning_rate": 1.342222040779384e-07, + "loss": 0.4609, + "step": 11820 + }, + { + "epoch": 0.93, + "grad_norm": 7.48923574368581, + "learning_rate": 1.339296595471623e-07, + "loss": 0.4454, + "step": 11821 + }, + { + "epoch": 0.93, + "grad_norm": 2.888465791767646, + "learning_rate": 1.3363742984477178e-07, + "loss": 0.4448, + "step": 11822 + }, + { + "epoch": 0.93, + "grad_norm": 2.4698326655386835, + "learning_rate": 1.3334551498967342e-07, + "loss": 0.4045, + "step": 11823 + }, + { + "epoch": 0.93, + "grad_norm": 1.971319832167152, + "learning_rate": 1.3305391500075315e-07, + "loss": 0.4427, + "step": 11824 + }, + { + "epoch": 0.93, + "grad_norm": 1.7727151415737878, + "learning_rate": 1.327626298968787e-07, + "loss": 0.432, + "step": 11825 + }, + { + "epoch": 0.93, + "grad_norm": 2.2408672451898037, + "learning_rate": 1.32471659696895e-07, + "loss": 0.4308, + "step": 11826 + }, + { + "epoch": 0.93, + "grad_norm": 10.576973474127147, + "learning_rate": 1.32181004419627e-07, + "loss": 0.4112, + "step": 11827 + }, + { + "epoch": 0.93, + "grad_norm": 1.8961084154561636, + "learning_rate": 1.3189066408387963e-07, + "loss": 0.4268, + "step": 11828 + }, + { + "epoch": 0.93, + "grad_norm": 3.0636197551604933, + "learning_rate": 1.3160063870843842e-07, + "loss": 0.4425, + "step": 11829 + }, + { + "epoch": 0.93, + "grad_norm": 2.1016892919053314, + "learning_rate": 1.3131092831206727e-07, + "loss": 0.4146, + "step": 11830 + }, + { + "epoch": 0.93, + "grad_norm": 1.6057473556341042, + "learning_rate": 1.310215329135106e-07, + "loss": 0.4293, + "step": 11831 + }, + { + "epoch": 0.93, + "grad_norm": 0.5329633082483953, + "learning_rate": 1.3073245253149013e-07, + "loss": 0.4622, + "step": 11832 + }, + { + "epoch": 0.93, + "grad_norm": 0.5057841243858959, + "learning_rate": 1.3044368718471145e-07, + "loss": 0.4549, + "step": 11833 + }, + { + "epoch": 0.93, + "grad_norm": 2.395646040385583, + "learning_rate": 1.3015523689185515e-07, + "loss": 0.416, + "step": 11834 + }, + { + "epoch": 0.93, + "grad_norm": 2.01222230752039, + "learning_rate": 1.2986710167158466e-07, + "loss": 0.49, + "step": 11835 + }, + { + "epoch": 0.93, + "grad_norm": 6.185042725076882, + "learning_rate": 1.2957928154254174e-07, + "loss": 0.4607, + "step": 11836 + }, + { + "epoch": 0.93, + "grad_norm": 4.312650557878563, + "learning_rate": 1.2929177652334812e-07, + "loss": 0.4393, + "step": 11837 + }, + { + "epoch": 0.93, + "grad_norm": 2.371576932473127, + "learning_rate": 1.2900458663260506e-07, + "loss": 0.4326, + "step": 11838 + }, + { + "epoch": 0.93, + "grad_norm": 1.407646566140664, + "learning_rate": 1.287177118888927e-07, + "loss": 0.4396, + "step": 11839 + }, + { + "epoch": 0.93, + "grad_norm": 1.8307194252975234, + "learning_rate": 1.284311523107723e-07, + "loss": 0.4095, + "step": 11840 + }, + { + "epoch": 0.93, + "grad_norm": 1.289413248782656, + "learning_rate": 1.281449079167829e-07, + "loss": 0.3817, + "step": 11841 + }, + { + "epoch": 0.93, + "grad_norm": 1.6516666861123825, + "learning_rate": 1.2785897872544585e-07, + "loss": 0.4696, + "step": 11842 + }, + { + "epoch": 0.93, + "grad_norm": 1.4494553662953884, + "learning_rate": 1.275733647552585e-07, + "loss": 0.412, + "step": 11843 + }, + { + "epoch": 0.93, + "grad_norm": 1.4287845930609309, + "learning_rate": 1.2728806602470112e-07, + "loss": 0.419, + "step": 11844 + }, + { + "epoch": 0.93, + "grad_norm": 0.5509627825945641, + "learning_rate": 1.2700308255223115e-07, + "loss": 0.4777, + "step": 11845 + }, + { + "epoch": 0.93, + "grad_norm": 1.679124111023232, + "learning_rate": 1.2671841435628718e-07, + "loss": 0.4597, + "step": 11846 + }, + { + "epoch": 0.93, + "grad_norm": 1.7954668389820017, + "learning_rate": 1.2643406145528668e-07, + "loss": 0.4047, + "step": 11847 + }, + { + "epoch": 0.93, + "grad_norm": 2.2448436276847707, + "learning_rate": 1.2615002386762665e-07, + "loss": 0.4558, + "step": 11848 + }, + { + "epoch": 0.93, + "grad_norm": 2.3496923961014495, + "learning_rate": 1.2586630161168456e-07, + "loss": 0.4378, + "step": 11849 + }, + { + "epoch": 0.93, + "grad_norm": 3.1726860695772445, + "learning_rate": 1.2558289470581742e-07, + "loss": 0.5144, + "step": 11850 + }, + { + "epoch": 0.93, + "grad_norm": 1.6564585071185243, + "learning_rate": 1.2529980316835888e-07, + "loss": 0.4266, + "step": 11851 + }, + { + "epoch": 0.93, + "grad_norm": 0.5135238247412202, + "learning_rate": 1.2501702701762708e-07, + "loss": 0.4668, + "step": 11852 + }, + { + "epoch": 0.93, + "grad_norm": 0.5370747266075647, + "learning_rate": 1.2473456627191626e-07, + "loss": 0.4773, + "step": 11853 + }, + { + "epoch": 0.93, + "grad_norm": 0.5692125959339073, + "learning_rate": 1.2445242094950127e-07, + "loss": 0.4804, + "step": 11854 + }, + { + "epoch": 0.93, + "grad_norm": 1.5899320164988602, + "learning_rate": 1.2417059106863638e-07, + "loss": 0.4698, + "step": 11855 + }, + { + "epoch": 0.93, + "grad_norm": 0.5428892197021701, + "learning_rate": 1.2388907664755644e-07, + "loss": 0.4833, + "step": 11856 + }, + { + "epoch": 0.93, + "grad_norm": 2.1407326778783444, + "learning_rate": 1.236078777044747e-07, + "loss": 0.4428, + "step": 11857 + }, + { + "epoch": 0.93, + "grad_norm": 1.9358425855056336, + "learning_rate": 1.233269942575832e-07, + "loss": 0.4602, + "step": 11858 + }, + { + "epoch": 0.93, + "grad_norm": 2.0796795927982394, + "learning_rate": 1.2304642632505637e-07, + "loss": 0.4253, + "step": 11859 + }, + { + "epoch": 0.93, + "grad_norm": 2.2878413539009093, + "learning_rate": 1.227661739250452e-07, + "loss": 0.4794, + "step": 11860 + }, + { + "epoch": 0.93, + "grad_norm": 1.724457566600411, + "learning_rate": 1.2248623707568352e-07, + "loss": 0.4119, + "step": 11861 + }, + { + "epoch": 0.93, + "grad_norm": 0.6095098691158997, + "learning_rate": 1.2220661579508075e-07, + "loss": 0.4726, + "step": 11862 + }, + { + "epoch": 0.93, + "grad_norm": 2.341348735676004, + "learning_rate": 1.2192731010132853e-07, + "loss": 0.4496, + "step": 11863 + }, + { + "epoch": 0.93, + "grad_norm": 2.0959586589127315, + "learning_rate": 1.2164832001249903e-07, + "loss": 0.4609, + "step": 11864 + }, + { + "epoch": 0.93, + "grad_norm": 0.5378965620990586, + "learning_rate": 1.213696455466412e-07, + "loss": 0.4693, + "step": 11865 + }, + { + "epoch": 0.93, + "grad_norm": 0.5370492982726289, + "learning_rate": 1.2109128672178617e-07, + "loss": 0.455, + "step": 11866 + }, + { + "epoch": 0.93, + "grad_norm": 1.881210373608352, + "learning_rate": 1.208132435559406e-07, + "loss": 0.4274, + "step": 11867 + }, + { + "epoch": 0.93, + "grad_norm": 1.8043808582421739, + "learning_rate": 1.205355160670968e-07, + "loss": 0.4434, + "step": 11868 + }, + { + "epoch": 0.93, + "grad_norm": 2.2422465420098794, + "learning_rate": 1.2025810427322149e-07, + "loss": 0.3805, + "step": 11869 + }, + { + "epoch": 0.93, + "grad_norm": 2.0978954846853592, + "learning_rate": 1.1998100819226366e-07, + "loss": 0.4428, + "step": 11870 + }, + { + "epoch": 0.93, + "grad_norm": 3.235537665464353, + "learning_rate": 1.1970422784215007e-07, + "loss": 0.4483, + "step": 11871 + }, + { + "epoch": 0.93, + "grad_norm": 1.9616152643456302, + "learning_rate": 1.1942776324078975e-07, + "loss": 0.4477, + "step": 11872 + }, + { + "epoch": 0.93, + "grad_norm": 2.141307254675652, + "learning_rate": 1.1915161440606782e-07, + "loss": 0.4625, + "step": 11873 + }, + { + "epoch": 0.93, + "grad_norm": 0.5499304748956896, + "learning_rate": 1.188757813558511e-07, + "loss": 0.4539, + "step": 11874 + }, + { + "epoch": 0.93, + "grad_norm": 2.906511863159606, + "learning_rate": 1.1860026410798641e-07, + "loss": 0.4581, + "step": 11875 + }, + { + "epoch": 0.93, + "grad_norm": 1.8112240483443718, + "learning_rate": 1.1832506268029898e-07, + "loss": 0.4797, + "step": 11876 + }, + { + "epoch": 0.93, + "grad_norm": 1.688636235111136, + "learning_rate": 1.1805017709059452e-07, + "loss": 0.4368, + "step": 11877 + }, + { + "epoch": 0.93, + "grad_norm": 1.4453036420908236, + "learning_rate": 1.177756073566566e-07, + "loss": 0.4638, + "step": 11878 + }, + { + "epoch": 0.93, + "grad_norm": 1.9924358178709283, + "learning_rate": 1.1750135349625048e-07, + "loss": 0.4663, + "step": 11879 + }, + { + "epoch": 0.93, + "grad_norm": 1.9546958544048727, + "learning_rate": 1.1722741552711914e-07, + "loss": 0.4425, + "step": 11880 + }, + { + "epoch": 0.93, + "grad_norm": 1.8598562752448733, + "learning_rate": 1.1695379346698732e-07, + "loss": 0.3992, + "step": 11881 + }, + { + "epoch": 0.93, + "grad_norm": 1.381059754564645, + "learning_rate": 1.1668048733355642e-07, + "loss": 0.4664, + "step": 11882 + }, + { + "epoch": 0.93, + "grad_norm": 1.664146995013794, + "learning_rate": 1.1640749714451005e-07, + "loss": 0.4481, + "step": 11883 + }, + { + "epoch": 0.93, + "grad_norm": 4.693232257969307, + "learning_rate": 1.1613482291751022e-07, + "loss": 0.4816, + "step": 11884 + }, + { + "epoch": 0.93, + "grad_norm": 1.7378869961476118, + "learning_rate": 1.1586246467019891e-07, + "loss": 0.4262, + "step": 11885 + }, + { + "epoch": 0.93, + "grad_norm": 1.5184969052000743, + "learning_rate": 1.1559042242019592e-07, + "loss": 0.3681, + "step": 11886 + }, + { + "epoch": 0.93, + "grad_norm": 0.5455151557725871, + "learning_rate": 1.1531869618510327e-07, + "loss": 0.4499, + "step": 11887 + }, + { + "epoch": 0.93, + "grad_norm": 2.2787494235237395, + "learning_rate": 1.1504728598250136e-07, + "loss": 0.4623, + "step": 11888 + }, + { + "epoch": 0.93, + "grad_norm": 1.8588183388938833, + "learning_rate": 1.1477619182994948e-07, + "loss": 0.4085, + "step": 11889 + }, + { + "epoch": 0.93, + "grad_norm": 1.379633366592021, + "learning_rate": 1.1450541374498747e-07, + "loss": 0.3935, + "step": 11890 + }, + { + "epoch": 0.93, + "grad_norm": 1.7485782022556036, + "learning_rate": 1.142349517451341e-07, + "loss": 0.4665, + "step": 11891 + }, + { + "epoch": 0.93, + "grad_norm": 1.939857937332404, + "learning_rate": 1.1396480584788816e-07, + "loss": 0.4849, + "step": 11892 + }, + { + "epoch": 0.93, + "grad_norm": 1.8402968481618627, + "learning_rate": 1.1369497607072732e-07, + "loss": 0.4328, + "step": 11893 + }, + { + "epoch": 0.93, + "grad_norm": 2.24751357273807, + "learning_rate": 1.134254624311093e-07, + "loss": 0.4644, + "step": 11894 + }, + { + "epoch": 0.93, + "grad_norm": 2.503446791590182, + "learning_rate": 1.1315626494647126e-07, + "loss": 0.4626, + "step": 11895 + }, + { + "epoch": 0.93, + "grad_norm": 1.612353147795168, + "learning_rate": 1.1288738363423091e-07, + "loss": 0.4853, + "step": 11896 + }, + { + "epoch": 0.93, + "grad_norm": 1.7931178423103957, + "learning_rate": 1.1261881851178325e-07, + "loss": 0.4546, + "step": 11897 + }, + { + "epoch": 0.93, + "grad_norm": 2.317029515439848, + "learning_rate": 1.1235056959650381e-07, + "loss": 0.4648, + "step": 11898 + }, + { + "epoch": 0.93, + "grad_norm": 2.4037289430608917, + "learning_rate": 1.1208263690574983e-07, + "loss": 0.4456, + "step": 11899 + }, + { + "epoch": 0.93, + "grad_norm": 1.48387276655709, + "learning_rate": 1.1181502045685411e-07, + "loss": 0.4614, + "step": 11900 + }, + { + "epoch": 0.93, + "grad_norm": 1.7392189250613297, + "learning_rate": 1.1154772026713278e-07, + "loss": 0.4368, + "step": 11901 + }, + { + "epoch": 0.93, + "grad_norm": 1.8668597737426498, + "learning_rate": 1.1128073635387815e-07, + "loss": 0.4094, + "step": 11902 + }, + { + "epoch": 0.93, + "grad_norm": 1.7331114233805012, + "learning_rate": 1.1101406873436583e-07, + "loss": 0.4409, + "step": 11903 + }, + { + "epoch": 0.93, + "grad_norm": 0.5133502351552798, + "learning_rate": 1.1074771742584645e-07, + "loss": 0.4746, + "step": 11904 + }, + { + "epoch": 0.93, + "grad_norm": 0.5716708220319111, + "learning_rate": 1.1048168244555513e-07, + "loss": 0.4567, + "step": 11905 + }, + { + "epoch": 0.94, + "grad_norm": 1.4851554081146392, + "learning_rate": 1.1021596381070144e-07, + "loss": 0.4164, + "step": 11906 + }, + { + "epoch": 0.94, + "grad_norm": 1.4458411056171714, + "learning_rate": 1.0995056153847883e-07, + "loss": 0.4738, + "step": 11907 + }, + { + "epoch": 0.94, + "grad_norm": 2.5420012444926368, + "learning_rate": 1.0968547564605747e-07, + "loss": 0.3982, + "step": 11908 + }, + { + "epoch": 0.94, + "grad_norm": 2.7359012604853308, + "learning_rate": 1.094207061505892e-07, + "loss": 0.4411, + "step": 11909 + }, + { + "epoch": 0.94, + "grad_norm": 1.9127195990424601, + "learning_rate": 1.0915625306920307e-07, + "loss": 0.4159, + "step": 11910 + }, + { + "epoch": 0.94, + "grad_norm": 0.5491286141939281, + "learning_rate": 1.0889211641900932e-07, + "loss": 0.4727, + "step": 11911 + }, + { + "epoch": 0.94, + "grad_norm": 1.581806502731135, + "learning_rate": 1.0862829621709759e-07, + "loss": 0.4577, + "step": 11912 + }, + { + "epoch": 0.94, + "grad_norm": 0.5667499345099013, + "learning_rate": 1.0836479248053589e-07, + "loss": 0.4726, + "step": 11913 + }, + { + "epoch": 0.94, + "grad_norm": 2.7898013645130377, + "learning_rate": 1.0810160522637336e-07, + "loss": 0.4765, + "step": 11914 + }, + { + "epoch": 0.94, + "grad_norm": 0.5324253851749927, + "learning_rate": 1.078387344716375e-07, + "loss": 0.4558, + "step": 11915 + }, + { + "epoch": 0.94, + "grad_norm": 1.850392437971417, + "learning_rate": 1.0757618023333638e-07, + "loss": 0.4189, + "step": 11916 + }, + { + "epoch": 0.94, + "grad_norm": 1.6063968500130201, + "learning_rate": 1.0731394252845528e-07, + "loss": 0.4088, + "step": 11917 + }, + { + "epoch": 0.94, + "grad_norm": 1.954495736238232, + "learning_rate": 1.0705202137396231e-07, + "loss": 0.4775, + "step": 11918 + }, + { + "epoch": 0.94, + "grad_norm": 1.793191918679099, + "learning_rate": 1.0679041678680224e-07, + "loss": 0.4427, + "step": 11919 + }, + { + "epoch": 0.94, + "grad_norm": 1.5288792119075454, + "learning_rate": 1.0652912878390153e-07, + "loss": 0.4623, + "step": 11920 + }, + { + "epoch": 0.94, + "grad_norm": 1.5327368080932842, + "learning_rate": 1.0626815738216445e-07, + "loss": 0.47, + "step": 11921 + }, + { + "epoch": 0.94, + "grad_norm": 1.7568672842307334, + "learning_rate": 1.0600750259847581e-07, + "loss": 0.445, + "step": 11922 + }, + { + "epoch": 0.94, + "grad_norm": 1.6496598103123985, + "learning_rate": 1.057471644496999e-07, + "loss": 0.418, + "step": 11923 + }, + { + "epoch": 0.94, + "grad_norm": 1.8242774149216143, + "learning_rate": 1.0548714295267992e-07, + "loss": 0.4583, + "step": 11924 + }, + { + "epoch": 0.94, + "grad_norm": 1.909606474818618, + "learning_rate": 1.0522743812423852e-07, + "loss": 0.4842, + "step": 11925 + }, + { + "epoch": 0.94, + "grad_norm": 1.5362836509408702, + "learning_rate": 1.0496804998117893e-07, + "loss": 0.4117, + "step": 11926 + }, + { + "epoch": 0.94, + "grad_norm": 1.938663104269406, + "learning_rate": 1.0470897854028383e-07, + "loss": 0.4824, + "step": 11927 + }, + { + "epoch": 0.94, + "grad_norm": 1.7459478067827947, + "learning_rate": 1.0445022381831316e-07, + "loss": 0.4581, + "step": 11928 + }, + { + "epoch": 0.94, + "grad_norm": 1.799667090012614, + "learning_rate": 1.0419178583200851e-07, + "loss": 0.4329, + "step": 11929 + }, + { + "epoch": 0.94, + "grad_norm": 1.817532294050131, + "learning_rate": 1.0393366459809151e-07, + "loss": 0.4432, + "step": 11930 + }, + { + "epoch": 0.94, + "grad_norm": 0.5410605317043119, + "learning_rate": 1.0367586013326104e-07, + "loss": 0.4782, + "step": 11931 + }, + { + "epoch": 0.94, + "grad_norm": 6.0164918633586515, + "learning_rate": 1.0341837245419761e-07, + "loss": 0.4298, + "step": 11932 + }, + { + "epoch": 0.94, + "grad_norm": 1.8023616736617374, + "learning_rate": 1.0316120157755904e-07, + "loss": 0.4301, + "step": 11933 + }, + { + "epoch": 0.94, + "grad_norm": 2.08035251654865, + "learning_rate": 1.029043475199859e-07, + "loss": 0.4674, + "step": 11934 + }, + { + "epoch": 0.94, + "grad_norm": 1.5280545955846563, + "learning_rate": 1.026478102980949e-07, + "loss": 0.4237, + "step": 11935 + }, + { + "epoch": 0.94, + "grad_norm": 0.5355558222918961, + "learning_rate": 1.0239158992848442e-07, + "loss": 0.4775, + "step": 11936 + }, + { + "epoch": 0.94, + "grad_norm": 0.5548918877965017, + "learning_rate": 1.0213568642773064e-07, + "loss": 0.4738, + "step": 11937 + }, + { + "epoch": 0.94, + "grad_norm": 1.7010405354398268, + "learning_rate": 1.0188009981239089e-07, + "loss": 0.4567, + "step": 11938 + }, + { + "epoch": 0.94, + "grad_norm": 2.971682598987677, + "learning_rate": 1.0162483009900137e-07, + "loss": 0.4605, + "step": 11939 + }, + { + "epoch": 0.94, + "grad_norm": 1.8486414200518737, + "learning_rate": 1.0136987730407776e-07, + "loss": 0.421, + "step": 11940 + }, + { + "epoch": 0.94, + "grad_norm": 1.745743652754548, + "learning_rate": 1.0111524144411355e-07, + "loss": 0.4109, + "step": 11941 + }, + { + "epoch": 0.94, + "grad_norm": 1.6290193529352976, + "learning_rate": 1.0086092253558555e-07, + "loss": 0.4089, + "step": 11942 + }, + { + "epoch": 0.94, + "grad_norm": 2.399057861432048, + "learning_rate": 1.0060692059494726e-07, + "loss": 0.4336, + "step": 11943 + }, + { + "epoch": 0.94, + "grad_norm": 2.1815565391110283, + "learning_rate": 1.0035323563863164e-07, + "loss": 0.4547, + "step": 11944 + }, + { + "epoch": 0.94, + "grad_norm": 1.7158134763297408, + "learning_rate": 1.0009986768305225e-07, + "loss": 0.4844, + "step": 11945 + }, + { + "epoch": 0.94, + "grad_norm": 1.9191455209149968, + "learning_rate": 9.98468167446015e-08, + "loss": 0.4335, + "step": 11946 + }, + { + "epoch": 0.94, + "grad_norm": 1.612451584179278, + "learning_rate": 9.959408283965132e-08, + "loss": 0.3982, + "step": 11947 + }, + { + "epoch": 0.94, + "grad_norm": 2.330184118253598, + "learning_rate": 9.934166598455364e-08, + "loss": 0.4807, + "step": 11948 + }, + { + "epoch": 0.94, + "grad_norm": 1.6963019474966408, + "learning_rate": 9.908956619563925e-08, + "loss": 0.4656, + "step": 11949 + }, + { + "epoch": 0.94, + "grad_norm": 0.5340791840168415, + "learning_rate": 9.883778348921846e-08, + "loss": 0.4865, + "step": 11950 + }, + { + "epoch": 0.94, + "grad_norm": 1.7673530539000595, + "learning_rate": 9.858631788158269e-08, + "loss": 0.4379, + "step": 11951 + }, + { + "epoch": 0.94, + "grad_norm": 0.565390894706377, + "learning_rate": 9.833516938899889e-08, + "loss": 0.4651, + "step": 11952 + }, + { + "epoch": 0.94, + "grad_norm": 0.5371962289334098, + "learning_rate": 9.808433802771799e-08, + "loss": 0.4696, + "step": 11953 + }, + { + "epoch": 0.94, + "grad_norm": 1.865953783534155, + "learning_rate": 9.783382381396811e-08, + "loss": 0.4583, + "step": 11954 + }, + { + "epoch": 0.94, + "grad_norm": 1.8496518662667734, + "learning_rate": 9.758362676395628e-08, + "loss": 0.4186, + "step": 11955 + }, + { + "epoch": 0.94, + "grad_norm": 0.5498701615126924, + "learning_rate": 9.73337468938712e-08, + "loss": 0.4842, + "step": 11956 + }, + { + "epoch": 0.94, + "grad_norm": 1.653415683207238, + "learning_rate": 9.708418421987942e-08, + "loss": 0.4397, + "step": 11957 + }, + { + "epoch": 0.94, + "grad_norm": 4.015837003194512, + "learning_rate": 9.683493875812688e-08, + "loss": 0.4137, + "step": 11958 + }, + { + "epoch": 0.94, + "grad_norm": 1.6087063513080675, + "learning_rate": 9.65860105247396e-08, + "loss": 0.4619, + "step": 11959 + }, + { + "epoch": 0.94, + "grad_norm": 0.544591957260909, + "learning_rate": 9.6337399535823e-08, + "loss": 0.466, + "step": 11960 + }, + { + "epoch": 0.94, + "grad_norm": 1.4686581002285228, + "learning_rate": 9.608910580746145e-08, + "loss": 0.4207, + "step": 11961 + }, + { + "epoch": 0.94, + "grad_norm": 1.6442899317936135, + "learning_rate": 9.584112935572043e-08, + "loss": 0.4699, + "step": 11962 + }, + { + "epoch": 0.94, + "grad_norm": 2.27312457993616, + "learning_rate": 9.559347019664267e-08, + "loss": 0.421, + "step": 11963 + }, + { + "epoch": 0.94, + "grad_norm": 2.685791945300738, + "learning_rate": 9.53461283462509e-08, + "loss": 0.4614, + "step": 11964 + }, + { + "epoch": 0.94, + "grad_norm": 3.9907961783823493, + "learning_rate": 9.509910382054899e-08, + "loss": 0.4612, + "step": 11965 + }, + { + "epoch": 0.94, + "grad_norm": 1.7820450876703018, + "learning_rate": 9.485239663551859e-08, + "loss": 0.4697, + "step": 11966 + }, + { + "epoch": 0.94, + "grad_norm": 1.9810123048366193, + "learning_rate": 9.460600680712195e-08, + "loss": 0.4913, + "step": 11967 + }, + { + "epoch": 0.94, + "grad_norm": 1.764547570391522, + "learning_rate": 9.435993435129853e-08, + "loss": 0.4745, + "step": 11968 + }, + { + "epoch": 0.94, + "grad_norm": 1.8924129971128811, + "learning_rate": 9.411417928397115e-08, + "loss": 0.4798, + "step": 11969 + }, + { + "epoch": 0.94, + "grad_norm": 1.97297136998746, + "learning_rate": 9.386874162103821e-08, + "loss": 0.447, + "step": 11970 + }, + { + "epoch": 0.94, + "grad_norm": 1.5031608211588405, + "learning_rate": 9.362362137837978e-08, + "loss": 0.4246, + "step": 11971 + }, + { + "epoch": 0.94, + "grad_norm": 2.1415973499809677, + "learning_rate": 9.337881857185426e-08, + "loss": 0.4136, + "step": 11972 + }, + { + "epoch": 0.94, + "grad_norm": 3.3171970365795675, + "learning_rate": 9.31343332173007e-08, + "loss": 0.4183, + "step": 11973 + }, + { + "epoch": 0.94, + "grad_norm": 1.685742710756799, + "learning_rate": 9.289016533053696e-08, + "loss": 0.424, + "step": 11974 + }, + { + "epoch": 0.94, + "grad_norm": 1.8108158521644064, + "learning_rate": 9.264631492736043e-08, + "loss": 0.4484, + "step": 11975 + }, + { + "epoch": 0.94, + "grad_norm": 1.927120095481386, + "learning_rate": 9.24027820235468e-08, + "loss": 0.4734, + "step": 11976 + }, + { + "epoch": 0.94, + "grad_norm": 1.970899358122065, + "learning_rate": 9.215956663485348e-08, + "loss": 0.4696, + "step": 11977 + }, + { + "epoch": 0.94, + "grad_norm": 1.792981198572327, + "learning_rate": 9.191666877701677e-08, + "loss": 0.4062, + "step": 11978 + }, + { + "epoch": 0.94, + "grad_norm": 2.626086743518199, + "learning_rate": 9.167408846575022e-08, + "loss": 0.4593, + "step": 11979 + }, + { + "epoch": 0.94, + "grad_norm": 1.737525146522079, + "learning_rate": 9.143182571674957e-08, + "loss": 0.4608, + "step": 11980 + }, + { + "epoch": 0.94, + "grad_norm": 7.935269916152477, + "learning_rate": 9.118988054568844e-08, + "loss": 0.3818, + "step": 11981 + }, + { + "epoch": 0.94, + "grad_norm": 1.6439325173227501, + "learning_rate": 9.094825296822096e-08, + "loss": 0.4273, + "step": 11982 + }, + { + "epoch": 0.94, + "grad_norm": 1.7608430477874888, + "learning_rate": 9.070694299997906e-08, + "loss": 0.4532, + "step": 11983 + }, + { + "epoch": 0.94, + "grad_norm": 4.442612307711923, + "learning_rate": 9.046595065657637e-08, + "loss": 0.4609, + "step": 11984 + }, + { + "epoch": 0.94, + "grad_norm": 1.7294814570900503, + "learning_rate": 9.022527595360375e-08, + "loss": 0.4221, + "step": 11985 + }, + { + "epoch": 0.94, + "grad_norm": 0.5299164766351682, + "learning_rate": 8.998491890663319e-08, + "loss": 0.4836, + "step": 11986 + }, + { + "epoch": 0.94, + "grad_norm": 2.4702638164515807, + "learning_rate": 8.974487953121502e-08, + "loss": 0.4399, + "step": 11987 + }, + { + "epoch": 0.94, + "grad_norm": 2.6158716646144935, + "learning_rate": 8.950515784287961e-08, + "loss": 0.4619, + "step": 11988 + }, + { + "epoch": 0.94, + "grad_norm": 0.543920746919628, + "learning_rate": 8.926575385713731e-08, + "loss": 0.4503, + "step": 11989 + }, + { + "epoch": 0.94, + "grad_norm": 2.005943766363169, + "learning_rate": 8.902666758947632e-08, + "loss": 0.4827, + "step": 11990 + }, + { + "epoch": 0.94, + "grad_norm": 9.219600618481454, + "learning_rate": 8.878789905536533e-08, + "loss": 0.4136, + "step": 11991 + }, + { + "epoch": 0.94, + "grad_norm": 1.6054546682073494, + "learning_rate": 8.854944827025313e-08, + "loss": 0.4691, + "step": 11992 + }, + { + "epoch": 0.94, + "grad_norm": 1.7403299700870662, + "learning_rate": 8.831131524956682e-08, + "loss": 0.4097, + "step": 11993 + }, + { + "epoch": 0.94, + "grad_norm": 0.5500191021157173, + "learning_rate": 8.80735000087124e-08, + "loss": 0.4889, + "step": 11994 + }, + { + "epoch": 0.94, + "grad_norm": 1.8473662953143966, + "learning_rate": 8.783600256307701e-08, + "loss": 0.426, + "step": 11995 + }, + { + "epoch": 0.94, + "grad_norm": 2.2492541375490505, + "learning_rate": 8.759882292802668e-08, + "loss": 0.4592, + "step": 11996 + }, + { + "epoch": 0.94, + "grad_norm": 1.5293817637611689, + "learning_rate": 8.736196111890638e-08, + "loss": 0.468, + "step": 11997 + }, + { + "epoch": 0.94, + "grad_norm": 1.7112564801047327, + "learning_rate": 8.71254171510405e-08, + "loss": 0.4274, + "step": 11998 + }, + { + "epoch": 0.94, + "grad_norm": 0.5482680416774683, + "learning_rate": 8.688919103973292e-08, + "loss": 0.4805, + "step": 11999 + }, + { + "epoch": 0.94, + "grad_norm": 1.2921730067327755, + "learning_rate": 8.665328280026808e-08, + "loss": 0.4523, + "step": 12000 + }, + { + "epoch": 0.94, + "grad_norm": 1.8922694503442175, + "learning_rate": 8.641769244790766e-08, + "loss": 0.4368, + "step": 12001 + }, + { + "epoch": 0.94, + "grad_norm": 2.763895808284745, + "learning_rate": 8.618241999789556e-08, + "loss": 0.4167, + "step": 12002 + }, + { + "epoch": 0.94, + "grad_norm": 1.5721141942673393, + "learning_rate": 8.59474654654524e-08, + "loss": 0.427, + "step": 12003 + }, + { + "epoch": 0.94, + "grad_norm": 1.8860658456308081, + "learning_rate": 8.571282886577992e-08, + "loss": 0.4618, + "step": 12004 + }, + { + "epoch": 0.94, + "grad_norm": 0.5433129716077433, + "learning_rate": 8.547851021405873e-08, + "loss": 0.4994, + "step": 12005 + }, + { + "epoch": 0.94, + "grad_norm": 1.754331412458918, + "learning_rate": 8.524450952544949e-08, + "loss": 0.4298, + "step": 12006 + }, + { + "epoch": 0.94, + "grad_norm": 1.6353948259040547, + "learning_rate": 8.50108268150901e-08, + "loss": 0.4357, + "step": 12007 + }, + { + "epoch": 0.94, + "grad_norm": 2.384465789605593, + "learning_rate": 8.477746209810123e-08, + "loss": 0.4206, + "step": 12008 + }, + { + "epoch": 0.94, + "grad_norm": 0.5183290133880379, + "learning_rate": 8.454441538958025e-08, + "loss": 0.4745, + "step": 12009 + }, + { + "epoch": 0.94, + "grad_norm": 2.0721333614224546, + "learning_rate": 8.431168670460566e-08, + "loss": 0.4232, + "step": 12010 + }, + { + "epoch": 0.94, + "grad_norm": 4.984042666117495, + "learning_rate": 8.40792760582343e-08, + "loss": 0.4268, + "step": 12011 + }, + { + "epoch": 0.94, + "grad_norm": 0.5409935828601051, + "learning_rate": 8.384718346550302e-08, + "loss": 0.4686, + "step": 12012 + }, + { + "epoch": 0.94, + "grad_norm": 1.443993223301895, + "learning_rate": 8.361540894142816e-08, + "loss": 0.4204, + "step": 12013 + }, + { + "epoch": 0.94, + "grad_norm": 2.4094829799008024, + "learning_rate": 8.33839525010044e-08, + "loss": 0.3854, + "step": 12014 + }, + { + "epoch": 0.94, + "grad_norm": 2.3902238271405243, + "learning_rate": 8.315281415920751e-08, + "loss": 0.4347, + "step": 12015 + }, + { + "epoch": 0.94, + "grad_norm": 1.8123749124699657, + "learning_rate": 8.29219939309911e-08, + "loss": 0.4353, + "step": 12016 + }, + { + "epoch": 0.94, + "grad_norm": 3.148824752067686, + "learning_rate": 8.269149183128988e-08, + "loss": 0.4513, + "step": 12017 + }, + { + "epoch": 0.94, + "grad_norm": 3.3997758670334317, + "learning_rate": 8.246130787501583e-08, + "loss": 0.456, + "step": 12018 + }, + { + "epoch": 0.94, + "grad_norm": 2.482666262461286, + "learning_rate": 8.223144207706257e-08, + "loss": 0.4105, + "step": 12019 + }, + { + "epoch": 0.94, + "grad_norm": 1.5540410938636295, + "learning_rate": 8.200189445230212e-08, + "loss": 0.4704, + "step": 12020 + }, + { + "epoch": 0.94, + "grad_norm": 1.8120156422473894, + "learning_rate": 8.177266501558534e-08, + "loss": 0.4172, + "step": 12021 + }, + { + "epoch": 0.94, + "grad_norm": 1.8476028803184046, + "learning_rate": 8.154375378174317e-08, + "loss": 0.4135, + "step": 12022 + }, + { + "epoch": 0.94, + "grad_norm": 2.705026539757959, + "learning_rate": 8.131516076558543e-08, + "loss": 0.3417, + "step": 12023 + }, + { + "epoch": 0.94, + "grad_norm": 1.8386646225885541, + "learning_rate": 8.108688598190306e-08, + "loss": 0.4464, + "step": 12024 + }, + { + "epoch": 0.94, + "grad_norm": 0.5201120201285709, + "learning_rate": 8.085892944546425e-08, + "loss": 0.4511, + "step": 12025 + }, + { + "epoch": 0.94, + "grad_norm": 1.7331608776490044, + "learning_rate": 8.06312911710172e-08, + "loss": 0.3949, + "step": 12026 + }, + { + "epoch": 0.94, + "grad_norm": 1.7444903108343048, + "learning_rate": 8.040397117329068e-08, + "loss": 0.4283, + "step": 12027 + }, + { + "epoch": 0.94, + "grad_norm": 0.5353130995164832, + "learning_rate": 8.017696946699183e-08, + "loss": 0.4711, + "step": 12028 + }, + { + "epoch": 0.94, + "grad_norm": 1.8191062973975802, + "learning_rate": 7.995028606680666e-08, + "loss": 0.4239, + "step": 12029 + }, + { + "epoch": 0.94, + "grad_norm": 5.540892710039138, + "learning_rate": 7.972392098740234e-08, + "loss": 0.4686, + "step": 12030 + }, + { + "epoch": 0.94, + "grad_norm": 1.613203055076282, + "learning_rate": 7.949787424342326e-08, + "loss": 0.4596, + "step": 12031 + }, + { + "epoch": 0.94, + "grad_norm": 0.5382350056105901, + "learning_rate": 7.927214584949549e-08, + "loss": 0.4601, + "step": 12032 + }, + { + "epoch": 0.95, + "grad_norm": 0.5409039780461194, + "learning_rate": 7.904673582022182e-08, + "loss": 0.4626, + "step": 12033 + }, + { + "epoch": 0.95, + "grad_norm": 1.8419304174987445, + "learning_rate": 7.882164417018779e-08, + "loss": 0.4346, + "step": 12034 + }, + { + "epoch": 0.95, + "grad_norm": 3.046381171616162, + "learning_rate": 7.859687091395507e-08, + "loss": 0.4864, + "step": 12035 + }, + { + "epoch": 0.95, + "grad_norm": 1.7362520842688336, + "learning_rate": 7.837241606606704e-08, + "loss": 0.5111, + "step": 12036 + }, + { + "epoch": 0.95, + "grad_norm": 1.9985533586270405, + "learning_rate": 7.814827964104544e-08, + "loss": 0.441, + "step": 12037 + }, + { + "epoch": 0.95, + "grad_norm": 1.4875221980412505, + "learning_rate": 7.79244616533914e-08, + "loss": 0.46, + "step": 12038 + }, + { + "epoch": 0.95, + "grad_norm": 1.9080840248966062, + "learning_rate": 7.770096211758616e-08, + "loss": 0.449, + "step": 12039 + }, + { + "epoch": 0.95, + "grad_norm": 2.096578013172229, + "learning_rate": 7.747778104808922e-08, + "loss": 0.4699, + "step": 12040 + }, + { + "epoch": 0.95, + "grad_norm": 0.5348432113625232, + "learning_rate": 7.725491845934075e-08, + "loss": 0.4683, + "step": 12041 + }, + { + "epoch": 0.95, + "grad_norm": 2.390122300207347, + "learning_rate": 7.703237436575806e-08, + "loss": 0.4389, + "step": 12042 + }, + { + "epoch": 0.95, + "grad_norm": 1.7933012669045372, + "learning_rate": 7.681014878174187e-08, + "loss": 0.4556, + "step": 12043 + }, + { + "epoch": 0.95, + "grad_norm": 0.5535382962125575, + "learning_rate": 7.658824172166846e-08, + "loss": 0.4462, + "step": 12044 + }, + { + "epoch": 0.95, + "grad_norm": 1.4828021592565828, + "learning_rate": 7.636665319989522e-08, + "loss": 0.4121, + "step": 12045 + }, + { + "epoch": 0.95, + "grad_norm": 1.6677653342727699, + "learning_rate": 7.614538323075848e-08, + "loss": 0.4434, + "step": 12046 + }, + { + "epoch": 0.95, + "grad_norm": 2.8003114051797455, + "learning_rate": 7.592443182857401e-08, + "loss": 0.3884, + "step": 12047 + }, + { + "epoch": 0.95, + "grad_norm": 2.032165760348553, + "learning_rate": 7.570379900763813e-08, + "loss": 0.4645, + "step": 12048 + }, + { + "epoch": 0.95, + "grad_norm": 1.5871150212523286, + "learning_rate": 7.548348478222389e-08, + "loss": 0.4439, + "step": 12049 + }, + { + "epoch": 0.95, + "grad_norm": 1.7929947459670017, + "learning_rate": 7.526348916658654e-08, + "loss": 0.4837, + "step": 12050 + }, + { + "epoch": 0.95, + "grad_norm": 1.3993118595340133, + "learning_rate": 7.50438121749586e-08, + "loss": 0.4875, + "step": 12051 + }, + { + "epoch": 0.95, + "grad_norm": 1.9796587292852197, + "learning_rate": 7.482445382155368e-08, + "loss": 0.4285, + "step": 12052 + }, + { + "epoch": 0.95, + "grad_norm": 2.4114600189597066, + "learning_rate": 7.460541412056376e-08, + "loss": 0.4873, + "step": 12053 + }, + { + "epoch": 0.95, + "grad_norm": 1.9714049196770034, + "learning_rate": 7.438669308616031e-08, + "loss": 0.4226, + "step": 12054 + }, + { + "epoch": 0.95, + "grad_norm": 1.6486355661427448, + "learning_rate": 7.416829073249366e-08, + "loss": 0.3992, + "step": 12055 + }, + { + "epoch": 0.95, + "grad_norm": 1.6314097763733606, + "learning_rate": 7.395020707369527e-08, + "loss": 0.4039, + "step": 12056 + }, + { + "epoch": 0.95, + "grad_norm": 4.670382789402163, + "learning_rate": 7.373244212387499e-08, + "loss": 0.4523, + "step": 12057 + }, + { + "epoch": 0.95, + "grad_norm": 1.4325088291008001, + "learning_rate": 7.351499589712042e-08, + "loss": 0.4255, + "step": 12058 + }, + { + "epoch": 0.95, + "grad_norm": 1.6009504742716, + "learning_rate": 7.329786840750142e-08, + "loss": 0.4536, + "step": 12059 + }, + { + "epoch": 0.95, + "grad_norm": 1.743612576128621, + "learning_rate": 7.308105966906508e-08, + "loss": 0.4557, + "step": 12060 + }, + { + "epoch": 0.95, + "grad_norm": 1.7005295279212966, + "learning_rate": 7.286456969583911e-08, + "loss": 0.4287, + "step": 12061 + }, + { + "epoch": 0.95, + "grad_norm": 3.1464286316114465, + "learning_rate": 7.264839850183003e-08, + "loss": 0.4371, + "step": 12062 + }, + { + "epoch": 0.95, + "grad_norm": 1.5683967303142914, + "learning_rate": 7.24325461010239e-08, + "loss": 0.4288, + "step": 12063 + }, + { + "epoch": 0.95, + "grad_norm": 3.882147594621402, + "learning_rate": 7.221701250738622e-08, + "loss": 0.4865, + "step": 12064 + }, + { + "epoch": 0.95, + "grad_norm": 1.5881805689863375, + "learning_rate": 7.200179773486083e-08, + "loss": 0.4395, + "step": 12065 + }, + { + "epoch": 0.95, + "grad_norm": 2.1472904598078686, + "learning_rate": 7.17869017973727e-08, + "loss": 0.4904, + "step": 12066 + }, + { + "epoch": 0.95, + "grad_norm": 1.9338792417498625, + "learning_rate": 7.157232470882513e-08, + "loss": 0.4578, + "step": 12067 + }, + { + "epoch": 0.95, + "grad_norm": 0.5450994969963001, + "learning_rate": 7.13580664831015e-08, + "loss": 0.4691, + "step": 12068 + }, + { + "epoch": 0.95, + "grad_norm": 2.5002254441130725, + "learning_rate": 7.114412713406294e-08, + "loss": 0.423, + "step": 12069 + }, + { + "epoch": 0.95, + "grad_norm": 2.356320751954262, + "learning_rate": 7.093050667555168e-08, + "loss": 0.4727, + "step": 12070 + }, + { + "epoch": 0.95, + "grad_norm": 2.2678645827548665, + "learning_rate": 7.071720512138836e-08, + "loss": 0.4195, + "step": 12071 + }, + { + "epoch": 0.95, + "grad_norm": 1.41061581455058, + "learning_rate": 7.050422248537414e-08, + "loss": 0.3947, + "step": 12072 + }, + { + "epoch": 0.95, + "grad_norm": 0.5057553222935928, + "learning_rate": 7.029155878128746e-08, + "loss": 0.4665, + "step": 12073 + }, + { + "epoch": 0.95, + "grad_norm": 1.605696499396263, + "learning_rate": 7.007921402288842e-08, + "loss": 0.436, + "step": 12074 + }, + { + "epoch": 0.95, + "grad_norm": 1.8168632830913305, + "learning_rate": 6.986718822391491e-08, + "loss": 0.428, + "step": 12075 + }, + { + "epoch": 0.95, + "grad_norm": 1.4194114817596504, + "learning_rate": 6.965548139808542e-08, + "loss": 0.4175, + "step": 12076 + }, + { + "epoch": 0.95, + "grad_norm": 0.5924585939362528, + "learning_rate": 6.944409355909565e-08, + "loss": 0.4687, + "step": 12077 + }, + { + "epoch": 0.95, + "grad_norm": 0.5749752855860285, + "learning_rate": 6.923302472062354e-08, + "loss": 0.4824, + "step": 12078 + }, + { + "epoch": 0.95, + "grad_norm": 0.5438675571520081, + "learning_rate": 6.902227489632485e-08, + "loss": 0.4815, + "step": 12079 + }, + { + "epoch": 0.95, + "grad_norm": 2.4976537281507856, + "learning_rate": 6.881184409983421e-08, + "loss": 0.4177, + "step": 12080 + }, + { + "epoch": 0.95, + "grad_norm": 1.8124249486981636, + "learning_rate": 6.86017323447663e-08, + "loss": 0.4494, + "step": 12081 + }, + { + "epoch": 0.95, + "grad_norm": 1.6843678741358121, + "learning_rate": 6.839193964471525e-08, + "loss": 0.3948, + "step": 12082 + }, + { + "epoch": 0.95, + "grad_norm": 2.527285501728673, + "learning_rate": 6.818246601325463e-08, + "loss": 0.4232, + "step": 12083 + }, + { + "epoch": 0.95, + "grad_norm": 2.419924378425735, + "learning_rate": 6.797331146393638e-08, + "loss": 0.3952, + "step": 12084 + }, + { + "epoch": 0.95, + "grad_norm": 1.5542329978532223, + "learning_rate": 6.776447601029357e-08, + "loss": 0.4124, + "step": 12085 + }, + { + "epoch": 0.95, + "grad_norm": 2.381132581234206, + "learning_rate": 6.75559596658365e-08, + "loss": 0.4306, + "step": 12086 + }, + { + "epoch": 0.95, + "grad_norm": 1.6266062903633327, + "learning_rate": 6.734776244405661e-08, + "loss": 0.411, + "step": 12087 + }, + { + "epoch": 0.95, + "grad_norm": 1.9210250492394096, + "learning_rate": 6.713988435842367e-08, + "loss": 0.4704, + "step": 12088 + }, + { + "epoch": 0.95, + "grad_norm": 1.9287624211710335, + "learning_rate": 6.693232542238692e-08, + "loss": 0.4052, + "step": 12089 + }, + { + "epoch": 0.95, + "grad_norm": 1.9268528997362258, + "learning_rate": 6.672508564937618e-08, + "loss": 0.4309, + "step": 12090 + }, + { + "epoch": 0.95, + "grad_norm": 1.9476670251623442, + "learning_rate": 6.651816505279796e-08, + "loss": 0.4564, + "step": 12091 + }, + { + "epoch": 0.95, + "grad_norm": 1.7563075051366521, + "learning_rate": 6.631156364604153e-08, + "loss": 0.4855, + "step": 12092 + }, + { + "epoch": 0.95, + "grad_norm": 1.9679931000451556, + "learning_rate": 6.61052814424723e-08, + "loss": 0.4603, + "step": 12093 + }, + { + "epoch": 0.95, + "grad_norm": 0.5478583157897122, + "learning_rate": 6.58993184554374e-08, + "loss": 0.4738, + "step": 12094 + }, + { + "epoch": 0.95, + "grad_norm": 1.7105812459440384, + "learning_rate": 6.569367469826171e-08, + "loss": 0.472, + "step": 12095 + }, + { + "epoch": 0.95, + "grad_norm": 0.5291457567537177, + "learning_rate": 6.548835018425015e-08, + "loss": 0.4716, + "step": 12096 + }, + { + "epoch": 0.95, + "grad_norm": 2.1333666078378886, + "learning_rate": 6.528334492668764e-08, + "loss": 0.5043, + "step": 12097 + }, + { + "epoch": 0.95, + "grad_norm": 1.883826270473864, + "learning_rate": 6.507865893883692e-08, + "loss": 0.4708, + "step": 12098 + }, + { + "epoch": 0.95, + "grad_norm": 0.5617926602888514, + "learning_rate": 6.487429223394182e-08, + "loss": 0.4621, + "step": 12099 + }, + { + "epoch": 0.95, + "grad_norm": 0.5487135292953904, + "learning_rate": 6.467024482522344e-08, + "loss": 0.4556, + "step": 12100 + }, + { + "epoch": 0.95, + "grad_norm": 3.080735979067863, + "learning_rate": 6.4466516725884e-08, + "loss": 0.4251, + "step": 12101 + }, + { + "epoch": 0.95, + "grad_norm": 0.5550096338531417, + "learning_rate": 6.426310794910461e-08, + "loss": 0.4718, + "step": 12102 + }, + { + "epoch": 0.95, + "grad_norm": 1.3219470163828018, + "learning_rate": 6.406001850804588e-08, + "loss": 0.4469, + "step": 12103 + }, + { + "epoch": 0.95, + "grad_norm": 2.0619843531829734, + "learning_rate": 6.385724841584672e-08, + "loss": 0.4093, + "step": 12104 + }, + { + "epoch": 0.95, + "grad_norm": 2.5224796500989117, + "learning_rate": 6.36547976856261e-08, + "loss": 0.4842, + "step": 12105 + }, + { + "epoch": 0.95, + "grad_norm": 3.42412485217852, + "learning_rate": 6.345266633048241e-08, + "loss": 0.4935, + "step": 12106 + }, + { + "epoch": 0.95, + "grad_norm": 1.5558021060304932, + "learning_rate": 6.325085436349354e-08, + "loss": 0.4267, + "step": 12107 + }, + { + "epoch": 0.95, + "grad_norm": 3.229519954525349, + "learning_rate": 6.304936179771571e-08, + "loss": 0.4617, + "step": 12108 + }, + { + "epoch": 0.95, + "grad_norm": 2.776425275163013, + "learning_rate": 6.28481886461868e-08, + "loss": 0.4778, + "step": 12109 + }, + { + "epoch": 0.95, + "grad_norm": 2.0293452167969104, + "learning_rate": 6.264733492192088e-08, + "loss": 0.4219, + "step": 12110 + }, + { + "epoch": 0.95, + "grad_norm": 1.8168790932509682, + "learning_rate": 6.244680063791419e-08, + "loss": 0.3921, + "step": 12111 + }, + { + "epoch": 0.95, + "grad_norm": 2.568818975014232, + "learning_rate": 6.224658580713971e-08, + "loss": 0.5224, + "step": 12112 + }, + { + "epoch": 0.95, + "grad_norm": 1.3882406379384997, + "learning_rate": 6.204669044255151e-08, + "loss": 0.4598, + "step": 12113 + }, + { + "epoch": 0.95, + "grad_norm": 0.5767086610090396, + "learning_rate": 6.18471145570837e-08, + "loss": 0.4422, + "step": 12114 + }, + { + "epoch": 0.95, + "grad_norm": 1.9130092999954507, + "learning_rate": 6.164785816364704e-08, + "loss": 0.4806, + "step": 12115 + }, + { + "epoch": 0.95, + "grad_norm": 1.5033923641148645, + "learning_rate": 6.144892127513402e-08, + "loss": 0.4244, + "step": 12116 + }, + { + "epoch": 0.95, + "grad_norm": 3.46121190796068, + "learning_rate": 6.125030390441489e-08, + "loss": 0.448, + "step": 12117 + }, + { + "epoch": 0.95, + "grad_norm": 1.79412814643016, + "learning_rate": 6.105200606434102e-08, + "loss": 0.4732, + "step": 12118 + }, + { + "epoch": 0.95, + "grad_norm": 1.5249202068488625, + "learning_rate": 6.085402776774107e-08, + "loss": 0.4655, + "step": 12119 + }, + { + "epoch": 0.95, + "grad_norm": 1.580471592105387, + "learning_rate": 6.065636902742423e-08, + "loss": 0.5075, + "step": 12120 + }, + { + "epoch": 0.95, + "grad_norm": 0.504267498488612, + "learning_rate": 6.045902985617913e-08, + "loss": 0.4597, + "step": 12121 + }, + { + "epoch": 0.95, + "grad_norm": 1.452123702608791, + "learning_rate": 6.026201026677281e-08, + "loss": 0.4219, + "step": 12122 + }, + { + "epoch": 0.95, + "grad_norm": 1.9315333289952938, + "learning_rate": 6.006531027195283e-08, + "loss": 0.4432, + "step": 12123 + }, + { + "epoch": 0.95, + "grad_norm": 2.016069461751853, + "learning_rate": 5.986892988444404e-08, + "loss": 0.4479, + "step": 12124 + }, + { + "epoch": 0.95, + "grad_norm": 1.2834112262567636, + "learning_rate": 5.967286911695403e-08, + "loss": 0.4056, + "step": 12125 + }, + { + "epoch": 0.95, + "grad_norm": 2.1591761197743757, + "learning_rate": 5.9477127982165984e-08, + "loss": 0.4754, + "step": 12126 + }, + { + "epoch": 0.95, + "grad_norm": 1.9542634257540719, + "learning_rate": 5.9281706492745337e-08, + "loss": 0.4493, + "step": 12127 + }, + { + "epoch": 0.95, + "grad_norm": 1.4679496009583155, + "learning_rate": 5.9086604661334756e-08, + "loss": 0.3981, + "step": 12128 + }, + { + "epoch": 0.95, + "grad_norm": 0.5567170839503993, + "learning_rate": 5.8891822500557474e-08, + "loss": 0.4796, + "step": 12129 + }, + { + "epoch": 0.95, + "grad_norm": 1.5534469475314527, + "learning_rate": 5.869736002301507e-08, + "loss": 0.4428, + "step": 12130 + }, + { + "epoch": 0.95, + "grad_norm": 165.1323188151165, + "learning_rate": 5.8503217241289714e-08, + "loss": 0.4528, + "step": 12131 + }, + { + "epoch": 0.95, + "grad_norm": 2.1583905077703123, + "learning_rate": 5.83093941679419e-08, + "loss": 0.4755, + "step": 12132 + }, + { + "epoch": 0.95, + "grad_norm": 0.5168212715548587, + "learning_rate": 5.811589081551161e-08, + "loss": 0.4414, + "step": 12133 + }, + { + "epoch": 0.95, + "grad_norm": 0.5492419677075632, + "learning_rate": 5.7922707196518266e-08, + "loss": 0.4515, + "step": 12134 + }, + { + "epoch": 0.95, + "grad_norm": 1.3464442182342977, + "learning_rate": 5.772984332346132e-08, + "loss": 0.3602, + "step": 12135 + }, + { + "epoch": 0.95, + "grad_norm": 1.628395236424482, + "learning_rate": 5.753729920881745e-08, + "loss": 0.4281, + "step": 12136 + }, + { + "epoch": 0.95, + "grad_norm": 2.095843470734586, + "learning_rate": 5.7345074865045035e-08, + "loss": 0.4279, + "step": 12137 + }, + { + "epoch": 0.95, + "grad_norm": 1.6520764461172746, + "learning_rate": 5.715317030458078e-08, + "loss": 0.4681, + "step": 12138 + }, + { + "epoch": 0.95, + "grad_norm": 1.7276730164990128, + "learning_rate": 5.696158553984032e-08, + "loss": 0.4216, + "step": 12139 + }, + { + "epoch": 0.95, + "grad_norm": 1.624025671607213, + "learning_rate": 5.677032058321874e-08, + "loss": 0.4268, + "step": 12140 + }, + { + "epoch": 0.95, + "grad_norm": 2.2167050482682114, + "learning_rate": 5.6579375447090576e-08, + "loss": 0.4572, + "step": 12141 + }, + { + "epoch": 0.95, + "grad_norm": 1.7217232579254378, + "learning_rate": 5.638875014381096e-08, + "loss": 0.4298, + "step": 12142 + }, + { + "epoch": 0.95, + "grad_norm": 1.8711197038817466, + "learning_rate": 5.6198444685711116e-08, + "loss": 0.429, + "step": 12143 + }, + { + "epoch": 0.95, + "grad_norm": 1.8744199184200039, + "learning_rate": 5.6008459085105104e-08, + "loss": 0.4431, + "step": 12144 + }, + { + "epoch": 0.95, + "grad_norm": 3.3754029644782606, + "learning_rate": 5.58187933542842e-08, + "loss": 0.48, + "step": 12145 + }, + { + "epoch": 0.95, + "grad_norm": 1.603520604639879, + "learning_rate": 5.562944750551913e-08, + "loss": 0.4893, + "step": 12146 + }, + { + "epoch": 0.95, + "grad_norm": 1.6475419867874217, + "learning_rate": 5.5440421551061216e-08, + "loss": 0.396, + "step": 12147 + }, + { + "epoch": 0.95, + "grad_norm": 0.5232736559847088, + "learning_rate": 5.525171550313957e-08, + "loss": 0.4579, + "step": 12148 + }, + { + "epoch": 0.95, + "grad_norm": 0.5367792868273121, + "learning_rate": 5.506332937396386e-08, + "loss": 0.4752, + "step": 12149 + }, + { + "epoch": 0.95, + "grad_norm": 1.666094191354845, + "learning_rate": 5.487526317572156e-08, + "loss": 0.4597, + "step": 12150 + }, + { + "epoch": 0.95, + "grad_norm": 1.8068782758132576, + "learning_rate": 5.4687516920580716e-08, + "loss": 0.4569, + "step": 12151 + }, + { + "epoch": 0.95, + "grad_norm": 0.5510633400273337, + "learning_rate": 5.450009062068773e-08, + "loss": 0.4568, + "step": 12152 + }, + { + "epoch": 0.95, + "grad_norm": 2.3912687435455138, + "learning_rate": 5.431298428817011e-08, + "loss": 0.3873, + "step": 12153 + }, + { + "epoch": 0.95, + "grad_norm": 1.7530046700622814, + "learning_rate": 5.412619793513263e-08, + "loss": 0.4362, + "step": 12154 + }, + { + "epoch": 0.95, + "grad_norm": 0.5405423546670909, + "learning_rate": 5.3939731573659504e-08, + "loss": 0.4569, + "step": 12155 + }, + { + "epoch": 0.95, + "grad_norm": 2.2996063725361346, + "learning_rate": 5.3753585215816083e-08, + "loss": 0.4322, + "step": 12156 + }, + { + "epoch": 0.95, + "grad_norm": 1.7489389309917094, + "learning_rate": 5.356775887364496e-08, + "loss": 0.4511, + "step": 12157 + }, + { + "epoch": 0.95, + "grad_norm": 1.8659760704169672, + "learning_rate": 5.338225255916929e-08, + "loss": 0.4514, + "step": 12158 + }, + { + "epoch": 0.95, + "grad_norm": 2.5218827856497974, + "learning_rate": 5.319706628439003e-08, + "loss": 0.4333, + "step": 12159 + }, + { + "epoch": 0.95, + "grad_norm": 1.8867297634222016, + "learning_rate": 5.3012200061290374e-08, + "loss": 0.4457, + "step": 12160 + }, + { + "epoch": 0.96, + "grad_norm": 1.4000551009416529, + "learning_rate": 5.2827653901829646e-08, + "loss": 0.3902, + "step": 12161 + }, + { + "epoch": 0.96, + "grad_norm": 1.6200545568492413, + "learning_rate": 5.264342781794829e-08, + "loss": 0.3842, + "step": 12162 + }, + { + "epoch": 0.96, + "grad_norm": 2.328773792435486, + "learning_rate": 5.2459521821563994e-08, + "loss": 0.4506, + "step": 12163 + }, + { + "epoch": 0.96, + "grad_norm": 2.433551590036942, + "learning_rate": 5.22759359245778e-08, + "loss": 0.4113, + "step": 12164 + }, + { + "epoch": 0.96, + "grad_norm": 3.2131692410590946, + "learning_rate": 5.209267013886521e-08, + "loss": 0.5002, + "step": 12165 + }, + { + "epoch": 0.96, + "grad_norm": 2.9190334825657205, + "learning_rate": 5.1909724476284506e-08, + "loss": 0.455, + "step": 12166 + }, + { + "epoch": 0.96, + "grad_norm": 3.797200642284039, + "learning_rate": 5.172709894867178e-08, + "loss": 0.4511, + "step": 12167 + }, + { + "epoch": 0.96, + "grad_norm": 1.6986365940501422, + "learning_rate": 5.154479356784259e-08, + "loss": 0.4261, + "step": 12168 + }, + { + "epoch": 0.96, + "grad_norm": 3.283868373239577, + "learning_rate": 5.136280834559193e-08, + "loss": 0.483, + "step": 12169 + }, + { + "epoch": 0.96, + "grad_norm": 2.2296802254204233, + "learning_rate": 5.118114329369372e-08, + "loss": 0.4593, + "step": 12170 + }, + { + "epoch": 0.96, + "grad_norm": 1.6704065873590495, + "learning_rate": 5.09997984239019e-08, + "loss": 0.3777, + "step": 12171 + }, + { + "epoch": 0.96, + "grad_norm": 1.613054292023631, + "learning_rate": 5.0818773747948744e-08, + "loss": 0.4532, + "step": 12172 + }, + { + "epoch": 0.96, + "grad_norm": 1.6671710088338947, + "learning_rate": 5.0638069277547106e-08, + "loss": 0.4454, + "step": 12173 + }, + { + "epoch": 0.96, + "grad_norm": 1.7875288782769443, + "learning_rate": 5.045768502438708e-08, + "loss": 0.5207, + "step": 12174 + }, + { + "epoch": 0.96, + "grad_norm": 2.1578022203620972, + "learning_rate": 5.027762100014044e-08, + "loss": 0.4654, + "step": 12175 + }, + { + "epoch": 0.96, + "grad_norm": 2.2157935910493225, + "learning_rate": 5.0097877216456755e-08, + "loss": 0.461, + "step": 12176 + }, + { + "epoch": 0.96, + "grad_norm": 1.7125320311153123, + "learning_rate": 4.9918453684965064e-08, + "loss": 0.4692, + "step": 12177 + }, + { + "epoch": 0.96, + "grad_norm": 1.7097222247374564, + "learning_rate": 4.9739350417273846e-08, + "loss": 0.4589, + "step": 12178 + }, + { + "epoch": 0.96, + "grad_norm": 2.115027578580721, + "learning_rate": 4.956056742497106e-08, + "loss": 0.4324, + "step": 12179 + }, + { + "epoch": 0.96, + "grad_norm": 1.9496860954506348, + "learning_rate": 4.938210471962301e-08, + "loss": 0.4319, + "step": 12180 + }, + { + "epoch": 0.96, + "grad_norm": 1.6995035378925303, + "learning_rate": 4.920396231277713e-08, + "loss": 0.4313, + "step": 12181 + }, + { + "epoch": 0.96, + "grad_norm": 1.791403557214716, + "learning_rate": 4.902614021595809e-08, + "loss": 0.4617, + "step": 12182 + }, + { + "epoch": 0.96, + "grad_norm": 1.8926969624183836, + "learning_rate": 4.8848638440671114e-08, + "loss": 0.4601, + "step": 12183 + }, + { + "epoch": 0.96, + "grad_norm": 1.7642352819950302, + "learning_rate": 4.8671456998400924e-08, + "loss": 0.4473, + "step": 12184 + }, + { + "epoch": 0.96, + "grad_norm": 2.159602601577734, + "learning_rate": 4.8494595900609454e-08, + "loss": 0.4807, + "step": 12185 + }, + { + "epoch": 0.96, + "grad_norm": 2.128217556712262, + "learning_rate": 4.831805515874089e-08, + "loss": 0.4223, + "step": 12186 + }, + { + "epoch": 0.96, + "grad_norm": 1.4839653266225885, + "learning_rate": 4.814183478421608e-08, + "loss": 0.4569, + "step": 12187 + }, + { + "epoch": 0.96, + "grad_norm": 1.6367361874834316, + "learning_rate": 4.796593478843703e-08, + "loss": 0.4188, + "step": 12188 + }, + { + "epoch": 0.96, + "grad_norm": 6.874695916743228, + "learning_rate": 4.779035518278352e-08, + "loss": 0.4552, + "step": 12189 + }, + { + "epoch": 0.96, + "grad_norm": 1.878999225241796, + "learning_rate": 4.7615095978616465e-08, + "loss": 0.4184, + "step": 12190 + }, + { + "epoch": 0.96, + "grad_norm": 1.8945250703808998, + "learning_rate": 4.744015718727346e-08, + "loss": 0.3877, + "step": 12191 + }, + { + "epoch": 0.96, + "grad_norm": 1.4281495448627544, + "learning_rate": 4.726553882007379e-08, + "loss": 0.3986, + "step": 12192 + }, + { + "epoch": 0.96, + "grad_norm": 1.8338645632583692, + "learning_rate": 4.709124088831507e-08, + "loss": 0.4245, + "step": 12193 + }, + { + "epoch": 0.96, + "grad_norm": 3.9375192621050794, + "learning_rate": 4.691726340327274e-08, + "loss": 0.4672, + "step": 12194 + }, + { + "epoch": 0.96, + "grad_norm": 0.559265579246486, + "learning_rate": 4.674360637620501e-08, + "loss": 0.4865, + "step": 12195 + }, + { + "epoch": 0.96, + "grad_norm": 1.9501957853212961, + "learning_rate": 4.657026981834623e-08, + "loss": 0.4692, + "step": 12196 + }, + { + "epoch": 0.96, + "grad_norm": 1.7653227726396006, + "learning_rate": 4.6397253740911306e-08, + "loss": 0.4655, + "step": 12197 + }, + { + "epoch": 0.96, + "grad_norm": 2.1589931507658484, + "learning_rate": 4.622455815509297e-08, + "loss": 0.4443, + "step": 12198 + }, + { + "epoch": 0.96, + "grad_norm": 1.7887900082419774, + "learning_rate": 4.605218307206616e-08, + "loss": 0.4196, + "step": 12199 + }, + { + "epoch": 0.96, + "grad_norm": 0.5796258830006363, + "learning_rate": 4.588012850298251e-08, + "loss": 0.468, + "step": 12200 + }, + { + "epoch": 0.96, + "grad_norm": 2.020843040863877, + "learning_rate": 4.5708394458973684e-08, + "loss": 0.4111, + "step": 12201 + }, + { + "epoch": 0.96, + "grad_norm": 2.191250185470242, + "learning_rate": 4.5536980951150224e-08, + "loss": 0.4359, + "step": 12202 + }, + { + "epoch": 0.96, + "grad_norm": 2.2164140324103676, + "learning_rate": 4.536588799060326e-08, + "loss": 0.4356, + "step": 12203 + }, + { + "epoch": 0.96, + "grad_norm": 1.6856964277341124, + "learning_rate": 4.5195115588402285e-08, + "loss": 0.4402, + "step": 12204 + }, + { + "epoch": 0.96, + "grad_norm": 1.6158079158037137, + "learning_rate": 4.502466375559511e-08, + "loss": 0.4386, + "step": 12205 + }, + { + "epoch": 0.96, + "grad_norm": 1.8127577270789712, + "learning_rate": 4.485453250321015e-08, + "loss": 0.4428, + "step": 12206 + }, + { + "epoch": 0.96, + "grad_norm": 1.508409960259944, + "learning_rate": 4.4684721842254695e-08, + "loss": 0.4329, + "step": 12207 + }, + { + "epoch": 0.96, + "grad_norm": 0.5605080524883762, + "learning_rate": 4.4515231783716086e-08, + "loss": 0.4657, + "step": 12208 + }, + { + "epoch": 0.96, + "grad_norm": 1.6825498198534399, + "learning_rate": 4.434606233855887e-08, + "loss": 0.4074, + "step": 12209 + }, + { + "epoch": 0.96, + "grad_norm": 0.5272448982265743, + "learning_rate": 4.417721351772874e-08, + "loss": 0.4541, + "step": 12210 + }, + { + "epoch": 0.96, + "grad_norm": 1.4336231530695474, + "learning_rate": 4.4008685332149745e-08, + "loss": 0.4498, + "step": 12211 + }, + { + "epoch": 0.96, + "grad_norm": 1.96325752225603, + "learning_rate": 4.384047779272538e-08, + "loss": 0.4657, + "step": 12212 + }, + { + "epoch": 0.96, + "grad_norm": 1.7679051193304456, + "learning_rate": 4.367259091033915e-08, + "loss": 0.4016, + "step": 12213 + }, + { + "epoch": 0.96, + "grad_norm": 2.1357838687223287, + "learning_rate": 4.3505024695851826e-08, + "loss": 0.4352, + "step": 12214 + }, + { + "epoch": 0.96, + "grad_norm": 2.1831472218454198, + "learning_rate": 4.33377791601064e-08, + "loss": 0.3924, + "step": 12215 + }, + { + "epoch": 0.96, + "grad_norm": 1.585771024900894, + "learning_rate": 4.317085431392198e-08, + "loss": 0.4321, + "step": 12216 + }, + { + "epoch": 0.96, + "grad_norm": 1.6383268040798091, + "learning_rate": 4.3004250168098836e-08, + "loss": 0.4485, + "step": 12217 + }, + { + "epoch": 0.96, + "grad_norm": 1.8807358069181077, + "learning_rate": 4.283796673341667e-08, + "loss": 0.4061, + "step": 12218 + }, + { + "epoch": 0.96, + "grad_norm": 0.582170223839725, + "learning_rate": 4.267200402063298e-08, + "loss": 0.4569, + "step": 12219 + }, + { + "epoch": 0.96, + "grad_norm": 1.543225939011292, + "learning_rate": 4.250636204048586e-08, + "loss": 0.4511, + "step": 12220 + }, + { + "epoch": 0.96, + "grad_norm": 1.7590982331009064, + "learning_rate": 4.2341040803691166e-08, + "loss": 0.4552, + "step": 12221 + }, + { + "epoch": 0.96, + "grad_norm": 1.9528333234101705, + "learning_rate": 4.217604032094647e-08, + "loss": 0.4692, + "step": 12222 + }, + { + "epoch": 0.96, + "grad_norm": 2.125541703692806, + "learning_rate": 4.2011360602925987e-08, + "loss": 0.4275, + "step": 12223 + }, + { + "epoch": 0.96, + "grad_norm": 2.3007772555223593, + "learning_rate": 4.184700166028455e-08, + "loss": 0.4378, + "step": 12224 + }, + { + "epoch": 0.96, + "grad_norm": 1.5897748730079242, + "learning_rate": 4.168296350365641e-08, + "loss": 0.4518, + "step": 12225 + }, + { + "epoch": 0.96, + "grad_norm": 2.4700859364213206, + "learning_rate": 4.151924614365366e-08, + "loss": 0.3961, + "step": 12226 + }, + { + "epoch": 0.96, + "grad_norm": 1.6105085606298144, + "learning_rate": 4.135584959086947e-08, + "loss": 0.4616, + "step": 12227 + }, + { + "epoch": 0.96, + "grad_norm": 1.7761485445290064, + "learning_rate": 4.119277385587539e-08, + "loss": 0.4722, + "step": 12228 + }, + { + "epoch": 0.96, + "grad_norm": 1.8239033176559623, + "learning_rate": 4.1030018949221315e-08, + "loss": 0.4868, + "step": 12229 + }, + { + "epoch": 0.96, + "grad_norm": 1.9100291850567426, + "learning_rate": 4.086758488143827e-08, + "loss": 0.4459, + "step": 12230 + }, + { + "epoch": 0.96, + "grad_norm": 0.606513760791807, + "learning_rate": 4.0705471663035065e-08, + "loss": 0.4496, + "step": 12231 + }, + { + "epoch": 0.96, + "grad_norm": 1.6322041869369035, + "learning_rate": 4.054367930450054e-08, + "loss": 0.3929, + "step": 12232 + }, + { + "epoch": 0.96, + "grad_norm": 1.921367039598112, + "learning_rate": 4.038220781630131e-08, + "loss": 0.4198, + "step": 12233 + }, + { + "epoch": 0.96, + "grad_norm": 1.6654984627644491, + "learning_rate": 4.022105720888625e-08, + "loss": 0.4631, + "step": 12234 + }, + { + "epoch": 0.96, + "grad_norm": 1.9114406106986481, + "learning_rate": 4.0060227492680346e-08, + "loss": 0.4321, + "step": 12235 + }, + { + "epoch": 0.96, + "grad_norm": 1.9159482544959077, + "learning_rate": 3.9899718678088615e-08, + "loss": 0.4723, + "step": 12236 + }, + { + "epoch": 0.96, + "grad_norm": 3.6213108094001614, + "learning_rate": 3.973953077549719e-08, + "loss": 0.4484, + "step": 12237 + }, + { + "epoch": 0.96, + "grad_norm": 0.5540393688323295, + "learning_rate": 3.9579663795268894e-08, + "loss": 0.4497, + "step": 12238 + }, + { + "epoch": 0.96, + "grad_norm": 1.7377508136102546, + "learning_rate": 3.9420117747747675e-08, + "loss": 0.4299, + "step": 12239 + }, + { + "epoch": 0.96, + "grad_norm": 0.5321358645497081, + "learning_rate": 3.926089264325528e-08, + "loss": 0.4642, + "step": 12240 + }, + { + "epoch": 0.96, + "grad_norm": 2.102315152921353, + "learning_rate": 3.9101988492093457e-08, + "loss": 0.4868, + "step": 12241 + }, + { + "epoch": 0.96, + "grad_norm": 1.7144132487324786, + "learning_rate": 3.894340530454399e-08, + "loss": 0.3879, + "step": 12242 + }, + { + "epoch": 0.96, + "grad_norm": 1.8071250167394868, + "learning_rate": 3.87851430908659e-08, + "loss": 0.4078, + "step": 12243 + }, + { + "epoch": 0.96, + "grad_norm": 2.8470017989072924, + "learning_rate": 3.8627201861298756e-08, + "loss": 0.4481, + "step": 12244 + }, + { + "epoch": 0.96, + "grad_norm": 1.978625695595467, + "learning_rate": 3.846958162606163e-08, + "loss": 0.4615, + "step": 12245 + }, + { + "epoch": 0.96, + "grad_norm": 2.7093817018946185, + "learning_rate": 3.8312282395351897e-08, + "loss": 0.4345, + "step": 12246 + }, + { + "epoch": 0.96, + "grad_norm": 2.0909561078046197, + "learning_rate": 3.815530417934754e-08, + "loss": 0.4328, + "step": 12247 + }, + { + "epoch": 0.96, + "grad_norm": 2.491848422080435, + "learning_rate": 3.799864698820321e-08, + "loss": 0.4599, + "step": 12248 + }, + { + "epoch": 0.96, + "grad_norm": 2.258644642161358, + "learning_rate": 3.784231083205525e-08, + "loss": 0.4154, + "step": 12249 + }, + { + "epoch": 0.96, + "grad_norm": 2.2876207370542523, + "learning_rate": 3.76862957210189e-08, + "loss": 0.3973, + "step": 12250 + }, + { + "epoch": 0.96, + "grad_norm": 1.7184638176241906, + "learning_rate": 3.753060166518774e-08, + "loss": 0.4453, + "step": 12251 + }, + { + "epoch": 0.96, + "grad_norm": 1.77533974378186, + "learning_rate": 3.737522867463483e-08, + "loss": 0.485, + "step": 12252 + }, + { + "epoch": 0.96, + "grad_norm": 0.5971882777627019, + "learning_rate": 3.72201767594127e-08, + "loss": 0.4669, + "step": 12253 + }, + { + "epoch": 0.96, + "grad_norm": 1.5874999510333814, + "learning_rate": 3.7065445929552744e-08, + "loss": 0.4568, + "step": 12254 + }, + { + "epoch": 0.96, + "grad_norm": 3.8828794454335473, + "learning_rate": 3.691103619506642e-08, + "loss": 0.4453, + "step": 12255 + }, + { + "epoch": 0.96, + "grad_norm": 1.5288385871228325, + "learning_rate": 3.6756947565943504e-08, + "loss": 0.4698, + "step": 12256 + }, + { + "epoch": 0.96, + "grad_norm": 1.7319785221220998, + "learning_rate": 3.660318005215324e-08, + "loss": 0.4987, + "step": 12257 + }, + { + "epoch": 0.96, + "grad_norm": 2.5563629644788217, + "learning_rate": 3.644973366364435e-08, + "loss": 0.4029, + "step": 12258 + }, + { + "epoch": 0.96, + "grad_norm": 1.4510350742571931, + "learning_rate": 3.6296608410344434e-08, + "loss": 0.4419, + "step": 12259 + }, + { + "epoch": 0.96, + "grad_norm": 1.7345225042378463, + "learning_rate": 3.614380430216058e-08, + "loss": 0.4484, + "step": 12260 + }, + { + "epoch": 0.96, + "grad_norm": 1.6197498834075845, + "learning_rate": 3.5991321348979316e-08, + "loss": 0.4375, + "step": 12261 + }, + { + "epoch": 0.96, + "grad_norm": 1.7244885332334154, + "learning_rate": 3.583915956066553e-08, + "loss": 0.4308, + "step": 12262 + }, + { + "epoch": 0.96, + "grad_norm": 2.1979899734912385, + "learning_rate": 3.5687318947064676e-08, + "loss": 0.4077, + "step": 12263 + }, + { + "epoch": 0.96, + "grad_norm": 1.4689494533462557, + "learning_rate": 3.553579951799946e-08, + "loss": 0.4616, + "step": 12264 + }, + { + "epoch": 0.96, + "grad_norm": 1.7497343566923738, + "learning_rate": 3.53846012832737e-08, + "loss": 0.4211, + "step": 12265 + }, + { + "epoch": 0.96, + "grad_norm": 0.5627986779930028, + "learning_rate": 3.5233724252670134e-08, + "loss": 0.4668, + "step": 12266 + }, + { + "epoch": 0.96, + "grad_norm": 0.544455862008603, + "learning_rate": 3.508316843594983e-08, + "loss": 0.4734, + "step": 12267 + }, + { + "epoch": 0.96, + "grad_norm": 1.7824636533899578, + "learning_rate": 3.4932933842853344e-08, + "loss": 0.4768, + "step": 12268 + }, + { + "epoch": 0.96, + "grad_norm": 0.5483257313524167, + "learning_rate": 3.4783020483101226e-08, + "loss": 0.4608, + "step": 12269 + }, + { + "epoch": 0.96, + "grad_norm": 2.619005732447942, + "learning_rate": 3.463342836639239e-08, + "loss": 0.4387, + "step": 12270 + }, + { + "epoch": 0.96, + "grad_norm": 2.175539456416044, + "learning_rate": 3.4484157502404636e-08, + "loss": 0.4443, + "step": 12271 + }, + { + "epoch": 0.96, + "grad_norm": 1.4769824548680204, + "learning_rate": 3.433520790079692e-08, + "loss": 0.4229, + "step": 12272 + }, + { + "epoch": 0.96, + "grad_norm": 2.4195414604596017, + "learning_rate": 3.4186579571204856e-08, + "loss": 0.399, + "step": 12273 + }, + { + "epoch": 0.96, + "grad_norm": 1.9548132739694004, + "learning_rate": 3.403827252324521e-08, + "loss": 0.4315, + "step": 12274 + }, + { + "epoch": 0.96, + "grad_norm": 5.277843897588021, + "learning_rate": 3.389028676651307e-08, + "loss": 0.4425, + "step": 12275 + }, + { + "epoch": 0.96, + "grad_norm": 1.4213875533276281, + "learning_rate": 3.374262231058245e-08, + "loss": 0.445, + "step": 12276 + }, + { + "epoch": 0.96, + "grad_norm": 3.4152922269143065, + "learning_rate": 3.359527916500793e-08, + "loss": 0.4412, + "step": 12277 + }, + { + "epoch": 0.96, + "grad_norm": 1.7327429186035037, + "learning_rate": 3.3448257339322446e-08, + "loss": 0.4666, + "step": 12278 + }, + { + "epoch": 0.96, + "grad_norm": 1.5239804060265414, + "learning_rate": 3.3301556843036705e-08, + "loss": 0.4433, + "step": 12279 + }, + { + "epoch": 0.96, + "grad_norm": 1.6389073043257443, + "learning_rate": 3.3155177685643136e-08, + "loss": 0.5153, + "step": 12280 + }, + { + "epoch": 0.96, + "grad_norm": 1.9089056803210736, + "learning_rate": 3.3009119876612485e-08, + "loss": 0.4894, + "step": 12281 + }, + { + "epoch": 0.96, + "grad_norm": 1.7917171761536899, + "learning_rate": 3.286338342539386e-08, + "loss": 0.4897, + "step": 12282 + }, + { + "epoch": 0.96, + "grad_norm": 1.750330503380991, + "learning_rate": 3.271796834141694e-08, + "loss": 0.5017, + "step": 12283 + }, + { + "epoch": 0.96, + "grad_norm": 1.485750452860168, + "learning_rate": 3.257287463408865e-08, + "loss": 0.4362, + "step": 12284 + }, + { + "epoch": 0.96, + "grad_norm": 2.71813865742657, + "learning_rate": 3.242810231279814e-08, + "loss": 0.444, + "step": 12285 + }, + { + "epoch": 0.96, + "grad_norm": 1.508141695414921, + "learning_rate": 3.228365138691014e-08, + "loss": 0.4113, + "step": 12286 + }, + { + "epoch": 0.96, + "grad_norm": 1.902796484319638, + "learning_rate": 3.213952186577163e-08, + "loss": 0.4562, + "step": 12287 + }, + { + "epoch": 0.97, + "grad_norm": 1.550885577046666, + "learning_rate": 3.199571375870736e-08, + "loss": 0.4316, + "step": 12288 + }, + { + "epoch": 0.97, + "grad_norm": 0.5559343174319692, + "learning_rate": 3.185222707502156e-08, + "loss": 0.4691, + "step": 12289 + }, + { + "epoch": 0.97, + "grad_norm": 2.201786523902043, + "learning_rate": 3.1709061823997356e-08, + "loss": 0.4446, + "step": 12290 + }, + { + "epoch": 0.97, + "grad_norm": 1.7683053461597373, + "learning_rate": 3.156621801489734e-08, + "loss": 0.3689, + "step": 12291 + }, + { + "epoch": 0.97, + "grad_norm": 2.2700657734605088, + "learning_rate": 3.1423695656964123e-08, + "loss": 0.4509, + "step": 12292 + }, + { + "epoch": 0.97, + "grad_norm": 2.000739369004508, + "learning_rate": 3.128149475941755e-08, + "loss": 0.4454, + "step": 12293 + }, + { + "epoch": 0.97, + "grad_norm": 1.793179808234318, + "learning_rate": 3.1139615331458596e-08, + "loss": 0.4245, + "step": 12294 + }, + { + "epoch": 0.97, + "grad_norm": 2.038301359423851, + "learning_rate": 3.099805738226602e-08, + "loss": 0.4352, + "step": 12295 + }, + { + "epoch": 0.97, + "grad_norm": 0.5268989198223204, + "learning_rate": 3.085682092099918e-08, + "loss": 0.4585, + "step": 12296 + }, + { + "epoch": 0.97, + "grad_norm": 2.0993853307928116, + "learning_rate": 3.071590595679519e-08, + "loss": 0.4426, + "step": 12297 + }, + { + "epoch": 0.97, + "grad_norm": 2.0500907267790573, + "learning_rate": 3.057531249877233e-08, + "loss": 0.4231, + "step": 12298 + }, + { + "epoch": 0.97, + "grad_norm": 1.787674634499823, + "learning_rate": 3.043504055602498e-08, + "loss": 0.453, + "step": 12299 + }, + { + "epoch": 0.97, + "grad_norm": 0.5125314362336555, + "learning_rate": 3.0295090137630324e-08, + "loss": 0.4401, + "step": 12300 + }, + { + "epoch": 0.97, + "grad_norm": 1.743226770177226, + "learning_rate": 3.015546125264113e-08, + "loss": 0.387, + "step": 12301 + }, + { + "epoch": 0.97, + "grad_norm": 1.832886323963274, + "learning_rate": 3.001615391009349e-08, + "loss": 0.4518, + "step": 12302 + }, + { + "epoch": 0.97, + "grad_norm": 2.8439139939251135, + "learning_rate": 2.987716811899799e-08, + "loss": 0.4536, + "step": 12303 + }, + { + "epoch": 0.97, + "grad_norm": 1.7431015270352062, + "learning_rate": 2.9738503888348534e-08, + "loss": 0.4455, + "step": 12304 + }, + { + "epoch": 0.97, + "grad_norm": 1.5146613005705547, + "learning_rate": 2.960016122711573e-08, + "loss": 0.3875, + "step": 12305 + }, + { + "epoch": 0.97, + "grad_norm": 0.5169523606476568, + "learning_rate": 2.946214014425075e-08, + "loss": 0.436, + "step": 12306 + }, + { + "epoch": 0.97, + "grad_norm": 2.043498183237889, + "learning_rate": 2.932444064868256e-08, + "loss": 0.4254, + "step": 12307 + }, + { + "epoch": 0.97, + "grad_norm": 1.592863184890767, + "learning_rate": 2.9187062749320704e-08, + "loss": 0.4328, + "step": 12308 + }, + { + "epoch": 0.97, + "grad_norm": 1.6593745597861853, + "learning_rate": 2.9050006455053625e-08, + "loss": 0.4651, + "step": 12309 + }, + { + "epoch": 0.97, + "grad_norm": 1.8018584495223355, + "learning_rate": 2.891327177474812e-08, + "loss": 0.4699, + "step": 12310 + }, + { + "epoch": 0.97, + "grad_norm": 1.9301548187060964, + "learning_rate": 2.8776858717250445e-08, + "loss": 0.4405, + "step": 12311 + }, + { + "epoch": 0.97, + "grad_norm": 0.5258197416621908, + "learning_rate": 2.8640767291387427e-08, + "loss": 0.4509, + "step": 12312 + }, + { + "epoch": 0.97, + "grad_norm": 3.1931659833453594, + "learning_rate": 2.8504997505963695e-08, + "loss": 0.4655, + "step": 12313 + }, + { + "epoch": 0.97, + "grad_norm": 1.6087409343678611, + "learning_rate": 2.8369549369762218e-08, + "loss": 0.4666, + "step": 12314 + }, + { + "epoch": 0.97, + "grad_norm": 1.5624465476501812, + "learning_rate": 2.8234422891547654e-08, + "loss": 0.4252, + "step": 12315 + }, + { + "epoch": 0.97, + "grad_norm": 1.4663693670487221, + "learning_rate": 2.8099618080061896e-08, + "loss": 0.4408, + "step": 12316 + }, + { + "epoch": 0.97, + "grad_norm": 19.06635210944333, + "learning_rate": 2.796513494402686e-08, + "loss": 0.4401, + "step": 12317 + }, + { + "epoch": 0.97, + "grad_norm": 8.885232063756083, + "learning_rate": 2.7830973492143365e-08, + "loss": 0.4725, + "step": 12318 + }, + { + "epoch": 0.97, + "grad_norm": 2.111824727592062, + "learning_rate": 2.7697133733091686e-08, + "loss": 0.4869, + "step": 12319 + }, + { + "epoch": 0.97, + "grad_norm": 1.7981570083496166, + "learning_rate": 2.7563615675530452e-08, + "loss": 0.3751, + "step": 12320 + }, + { + "epoch": 0.97, + "grad_norm": 1.4905198613775308, + "learning_rate": 2.7430419328098867e-08, + "loss": 0.3908, + "step": 12321 + }, + { + "epoch": 0.97, + "grad_norm": 1.872076349735022, + "learning_rate": 2.7297544699413925e-08, + "loss": 0.3672, + "step": 12322 + }, + { + "epoch": 0.97, + "grad_norm": 5.421741257911467, + "learning_rate": 2.716499179807264e-08, + "loss": 0.4444, + "step": 12323 + }, + { + "epoch": 0.97, + "grad_norm": 1.624039835585337, + "learning_rate": 2.7032760632651477e-08, + "loss": 0.4546, + "step": 12324 + }, + { + "epoch": 0.97, + "grad_norm": 2.13699220968566, + "learning_rate": 2.690085121170527e-08, + "loss": 0.426, + "step": 12325 + }, + { + "epoch": 0.97, + "grad_norm": 1.371519041917397, + "learning_rate": 2.67692635437683e-08, + "loss": 0.3974, + "step": 12326 + }, + { + "epoch": 0.97, + "grad_norm": 2.7528633945651872, + "learning_rate": 2.6637997637353752e-08, + "loss": 0.4801, + "step": 12327 + }, + { + "epoch": 0.97, + "grad_norm": 1.7920116230840426, + "learning_rate": 2.6507053500955393e-08, + "loss": 0.4531, + "step": 12328 + }, + { + "epoch": 0.97, + "grad_norm": 1.8057573905124469, + "learning_rate": 2.6376431143044223e-08, + "loss": 0.4239, + "step": 12329 + }, + { + "epoch": 0.97, + "grad_norm": 2.1368041539187184, + "learning_rate": 2.6246130572071816e-08, + "loss": 0.4233, + "step": 12330 + }, + { + "epoch": 0.97, + "grad_norm": 1.5048181139696015, + "learning_rate": 2.6116151796467538e-08, + "loss": 0.4355, + "step": 12331 + }, + { + "epoch": 0.97, + "grad_norm": 1.6162267085902138, + "learning_rate": 2.598649482464244e-08, + "loss": 0.4565, + "step": 12332 + }, + { + "epoch": 0.97, + "grad_norm": 1.8058144886929828, + "learning_rate": 2.5857159664984254e-08, + "loss": 0.4149, + "step": 12333 + }, + { + "epoch": 0.97, + "grad_norm": 1.5896839669285732, + "learning_rate": 2.5728146325860183e-08, + "loss": 0.4573, + "step": 12334 + }, + { + "epoch": 0.97, + "grad_norm": 2.627759581529985, + "learning_rate": 2.5599454815618542e-08, + "loss": 0.3881, + "step": 12335 + }, + { + "epoch": 0.97, + "grad_norm": 1.8290772097067245, + "learning_rate": 2.547108514258434e-08, + "loss": 0.4765, + "step": 12336 + }, + { + "epoch": 0.97, + "grad_norm": 0.5478055014297066, + "learning_rate": 2.534303731506371e-08, + "loss": 0.474, + "step": 12337 + }, + { + "epoch": 0.97, + "grad_norm": 1.670776996613076, + "learning_rate": 2.5215311341340587e-08, + "loss": 0.5173, + "step": 12338 + }, + { + "epoch": 0.97, + "grad_norm": 2.5531225872303787, + "learning_rate": 2.5087907229679465e-08, + "loss": 0.4399, + "step": 12339 + }, + { + "epoch": 0.97, + "grad_norm": 2.125449561695311, + "learning_rate": 2.496082498832264e-08, + "loss": 0.4394, + "step": 12340 + }, + { + "epoch": 0.97, + "grad_norm": 3.18680603653225, + "learning_rate": 2.4834064625492426e-08, + "loss": 0.4753, + "step": 12341 + }, + { + "epoch": 0.97, + "grad_norm": 1.9005813533696778, + "learning_rate": 2.4707626149389486e-08, + "loss": 0.4632, + "step": 12342 + }, + { + "epoch": 0.97, + "grad_norm": 1.8745101854580857, + "learning_rate": 2.4581509568194496e-08, + "loss": 0.4664, + "step": 12343 + }, + { + "epoch": 0.97, + "grad_norm": 2.2581373497947888, + "learning_rate": 2.4455714890067593e-08, + "loss": 0.4004, + "step": 12344 + }, + { + "epoch": 0.97, + "grad_norm": 1.9772467133604987, + "learning_rate": 2.4330242123147273e-08, + "loss": 0.4486, + "step": 12345 + }, + { + "epoch": 0.97, + "grad_norm": 1.8955955870327175, + "learning_rate": 2.4205091275550928e-08, + "loss": 0.4303, + "step": 12346 + }, + { + "epoch": 0.97, + "grad_norm": 1.8156317972657583, + "learning_rate": 2.408026235537597e-08, + "loss": 0.4491, + "step": 12347 + }, + { + "epoch": 0.97, + "grad_norm": 1.5915565573319441, + "learning_rate": 2.395575537069872e-08, + "loss": 0.4276, + "step": 12348 + }, + { + "epoch": 0.97, + "grad_norm": 1.545999848699551, + "learning_rate": 2.3831570329574948e-08, + "loss": 0.4236, + "step": 12349 + }, + { + "epoch": 0.97, + "grad_norm": 1.5578919054368918, + "learning_rate": 2.3707707240038237e-08, + "loss": 0.4018, + "step": 12350 + }, + { + "epoch": 0.97, + "grad_norm": 2.0503366308644626, + "learning_rate": 2.3584166110103835e-08, + "loss": 0.4351, + "step": 12351 + }, + { + "epoch": 0.97, + "grad_norm": 2.0182241847117792, + "learning_rate": 2.3460946947763684e-08, + "loss": 0.4586, + "step": 12352 + }, + { + "epoch": 0.97, + "grad_norm": 1.723941289247087, + "learning_rate": 2.3338049760989745e-08, + "loss": 0.4359, + "step": 12353 + }, + { + "epoch": 0.97, + "grad_norm": 2.231863281440146, + "learning_rate": 2.3215474557733985e-08, + "loss": 0.4563, + "step": 12354 + }, + { + "epoch": 0.97, + "grad_norm": 2.476803359272614, + "learning_rate": 2.3093221345926732e-08, + "loss": 0.4407, + "step": 12355 + }, + { + "epoch": 0.97, + "grad_norm": 1.3673934526596825, + "learning_rate": 2.297129013347721e-08, + "loss": 0.4246, + "step": 12356 + }, + { + "epoch": 0.97, + "grad_norm": 2.6722621444142756, + "learning_rate": 2.2849680928274663e-08, + "loss": 0.4734, + "step": 12357 + }, + { + "epoch": 0.97, + "grad_norm": 1.9019907675007741, + "learning_rate": 2.2728393738186695e-08, + "loss": 0.4344, + "step": 12358 + }, + { + "epoch": 0.97, + "grad_norm": 1.6298637796191917, + "learning_rate": 2.260742857106035e-08, + "loss": 0.4627, + "step": 12359 + }, + { + "epoch": 0.97, + "grad_norm": 1.8226549735745319, + "learning_rate": 2.2486785434722712e-08, + "loss": 0.4339, + "step": 12360 + }, + { + "epoch": 0.97, + "grad_norm": 2.296583873858308, + "learning_rate": 2.2366464336978088e-08, + "loss": 0.4858, + "step": 12361 + }, + { + "epoch": 0.97, + "grad_norm": 3.453059205472766, + "learning_rate": 2.224646528561192e-08, + "loss": 0.4317, + "step": 12362 + }, + { + "epoch": 0.97, + "grad_norm": 1.9994295560013642, + "learning_rate": 2.2126788288387434e-08, + "loss": 0.3928, + "step": 12363 + }, + { + "epoch": 0.97, + "grad_norm": 0.5494188192752956, + "learning_rate": 2.2007433353048447e-08, + "loss": 0.4853, + "step": 12364 + }, + { + "epoch": 0.97, + "grad_norm": 3.112445706361296, + "learning_rate": 2.1888400487315997e-08, + "loss": 0.3999, + "step": 12365 + }, + { + "epoch": 0.97, + "grad_norm": 1.5452430482781085, + "learning_rate": 2.1769689698891705e-08, + "loss": 0.5108, + "step": 12366 + }, + { + "epoch": 0.97, + "grad_norm": 1.8492918119962793, + "learning_rate": 2.1651300995456647e-08, + "loss": 0.4506, + "step": 12367 + }, + { + "epoch": 0.97, + "grad_norm": 1.4972724682093894, + "learning_rate": 2.15332343846697e-08, + "loss": 0.432, + "step": 12368 + }, + { + "epoch": 0.97, + "grad_norm": 1.8818024684279302, + "learning_rate": 2.141548987416975e-08, + "loss": 0.4321, + "step": 12369 + }, + { + "epoch": 0.97, + "grad_norm": 1.9899017050695995, + "learning_rate": 2.1298067471575146e-08, + "loss": 0.4494, + "step": 12370 + }, + { + "epoch": 0.97, + "grad_norm": 1.7606900939875754, + "learning_rate": 2.1180967184482038e-08, + "loss": 0.4536, + "step": 12371 + }, + { + "epoch": 0.97, + "grad_norm": 1.968096899268923, + "learning_rate": 2.1064189020467694e-08, + "loss": 0.4077, + "step": 12372 + }, + { + "epoch": 0.97, + "grad_norm": 1.5584148411790155, + "learning_rate": 2.0947732987086633e-08, + "loss": 0.4563, + "step": 12373 + }, + { + "epoch": 0.97, + "grad_norm": 2.4329334558067295, + "learning_rate": 2.0831599091873934e-08, + "loss": 0.42, + "step": 12374 + }, + { + "epoch": 0.97, + "grad_norm": 1.6261095729089556, + "learning_rate": 2.0715787342343586e-08, + "loss": 0.453, + "step": 12375 + }, + { + "epoch": 0.97, + "grad_norm": 1.917747485383327, + "learning_rate": 2.060029774598793e-08, + "loss": 0.453, + "step": 12376 + }, + { + "epoch": 0.97, + "grad_norm": 1.5866844751993328, + "learning_rate": 2.0485130310279322e-08, + "loss": 0.3979, + "step": 12377 + }, + { + "epoch": 0.97, + "grad_norm": 1.862112901942078, + "learning_rate": 2.037028504266847e-08, + "loss": 0.4562, + "step": 12378 + }, + { + "epoch": 0.97, + "grad_norm": 0.5520047315398092, + "learning_rate": 2.0255761950586096e-08, + "loss": 0.4828, + "step": 12379 + }, + { + "epoch": 0.97, + "grad_norm": 2.8075645153806543, + "learning_rate": 2.014156104144127e-08, + "loss": 0.4607, + "step": 12380 + }, + { + "epoch": 0.97, + "grad_norm": 1.4334532620522746, + "learning_rate": 2.0027682322623087e-08, + "loss": 0.4378, + "step": 12381 + }, + { + "epoch": 0.97, + "grad_norm": 1.5306064228313454, + "learning_rate": 1.991412580149954e-08, + "loss": 0.4261, + "step": 12382 + }, + { + "epoch": 0.97, + "grad_norm": 1.9284743635933819, + "learning_rate": 1.9800891485416974e-08, + "loss": 0.4055, + "step": 12383 + }, + { + "epoch": 0.97, + "grad_norm": 0.5043631965454617, + "learning_rate": 1.9687979381702306e-08, + "loss": 0.4403, + "step": 12384 + }, + { + "epoch": 0.97, + "grad_norm": 0.5375569268696316, + "learning_rate": 1.9575389497659693e-08, + "loss": 0.468, + "step": 12385 + }, + { + "epoch": 0.97, + "grad_norm": 2.08727103091835, + "learning_rate": 1.9463121840574416e-08, + "loss": 0.4142, + "step": 12386 + }, + { + "epoch": 0.97, + "grad_norm": 1.6574493790025686, + "learning_rate": 1.9351176417710115e-08, + "loss": 0.4573, + "step": 12387 + }, + { + "epoch": 0.97, + "grad_norm": 1.6610549354469388, + "learning_rate": 1.923955323630877e-08, + "loss": 0.4364, + "step": 12388 + }, + { + "epoch": 0.97, + "grad_norm": 1.6561434018668182, + "learning_rate": 1.9128252303592942e-08, + "loss": 0.3962, + "step": 12389 + }, + { + "epoch": 0.97, + "grad_norm": 2.74990833792763, + "learning_rate": 1.9017273626762977e-08, + "loss": 0.4275, + "step": 12390 + }, + { + "epoch": 0.97, + "grad_norm": 2.0726889662546104, + "learning_rate": 1.89066172129998e-08, + "loss": 0.4517, + "step": 12391 + }, + { + "epoch": 0.97, + "grad_norm": 0.5685541489231947, + "learning_rate": 1.8796283069462683e-08, + "loss": 0.4848, + "step": 12392 + }, + { + "epoch": 0.97, + "grad_norm": 1.6556358915506932, + "learning_rate": 1.8686271203289254e-08, + "loss": 0.4567, + "step": 12393 + }, + { + "epoch": 0.97, + "grad_norm": 1.9056820492928797, + "learning_rate": 1.85765816215977e-08, + "loss": 0.4905, + "step": 12394 + }, + { + "epoch": 0.97, + "grad_norm": 1.9497018895096259, + "learning_rate": 1.8467214331485128e-08, + "loss": 0.4273, + "step": 12395 + }, + { + "epoch": 0.97, + "grad_norm": 1.7436125803114355, + "learning_rate": 1.835816934002699e-08, + "loss": 0.4769, + "step": 12396 + }, + { + "epoch": 0.97, + "grad_norm": 1.6042549482850064, + "learning_rate": 1.8249446654278745e-08, + "loss": 0.3671, + "step": 12397 + }, + { + "epoch": 0.97, + "grad_norm": 1.5176459774418807, + "learning_rate": 1.8141046281273667e-08, + "loss": 0.4364, + "step": 12398 + }, + { + "epoch": 0.97, + "grad_norm": 1.4346893079265186, + "learning_rate": 1.8032968228026692e-08, + "loss": 0.4627, + "step": 12399 + }, + { + "epoch": 0.97, + "grad_norm": 0.5640068258685682, + "learning_rate": 1.7925212501528898e-08, + "loss": 0.4771, + "step": 12400 + }, + { + "epoch": 0.97, + "grad_norm": 0.5889853626905102, + "learning_rate": 1.7817779108752486e-08, + "loss": 0.4757, + "step": 12401 + }, + { + "epoch": 0.97, + "grad_norm": 4.709577586347337, + "learning_rate": 1.771066805664856e-08, + "loss": 0.4608, + "step": 12402 + }, + { + "epoch": 0.97, + "grad_norm": 1.95364414895742, + "learning_rate": 1.760387935214658e-08, + "loss": 0.4285, + "step": 12403 + }, + { + "epoch": 0.97, + "grad_norm": 1.6627574704975139, + "learning_rate": 1.749741300215546e-08, + "loss": 0.4561, + "step": 12404 + }, + { + "epoch": 0.97, + "grad_norm": 2.3749276363110394, + "learning_rate": 1.7391269013564137e-08, + "loss": 0.4683, + "step": 12405 + }, + { + "epoch": 0.97, + "grad_norm": 1.897443922160629, + "learning_rate": 1.7285447393239896e-08, + "loss": 0.4692, + "step": 12406 + }, + { + "epoch": 0.97, + "grad_norm": 2.8067648012740145, + "learning_rate": 1.717994814802837e-08, + "loss": 0.4647, + "step": 12407 + }, + { + "epoch": 0.97, + "grad_norm": 2.3241150561047257, + "learning_rate": 1.707477128475632e-08, + "loss": 0.4062, + "step": 12408 + }, + { + "epoch": 0.97, + "grad_norm": 0.5614959190544031, + "learning_rate": 1.696991681022775e-08, + "loss": 0.4802, + "step": 12409 + }, + { + "epoch": 0.97, + "grad_norm": 1.7183736018550144, + "learning_rate": 1.686538473122723e-08, + "loss": 0.4653, + "step": 12410 + }, + { + "epoch": 0.97, + "grad_norm": 2.513748094700749, + "learning_rate": 1.6761175054517687e-08, + "loss": 0.356, + "step": 12411 + }, + { + "epoch": 0.97, + "grad_norm": 1.5766378957803453, + "learning_rate": 1.6657287786840947e-08, + "loss": 0.4472, + "step": 12412 + }, + { + "epoch": 0.97, + "grad_norm": 2.001976903061876, + "learning_rate": 1.655372293491886e-08, + "loss": 0.4559, + "step": 12413 + }, + { + "epoch": 0.97, + "grad_norm": 1.6760399318389718, + "learning_rate": 1.6450480505451614e-08, + "loss": 0.4358, + "step": 12414 + }, + { + "epoch": 0.98, + "grad_norm": 1.8678288470260627, + "learning_rate": 1.6347560505118877e-08, + "loss": 0.414, + "step": 12415 + }, + { + "epoch": 0.98, + "grad_norm": 1.7594823861541244, + "learning_rate": 1.6244962940579754e-08, + "loss": 0.4667, + "step": 12416 + }, + { + "epoch": 0.98, + "grad_norm": 1.6743331858330381, + "learning_rate": 1.6142687818471725e-08, + "loss": 0.4591, + "step": 12417 + }, + { + "epoch": 0.98, + "grad_norm": 0.5362704092320819, + "learning_rate": 1.6040735145411714e-08, + "loss": 0.4582, + "step": 12418 + }, + { + "epoch": 0.98, + "grad_norm": 2.1953725585209924, + "learning_rate": 1.5939104927997218e-08, + "loss": 0.4699, + "step": 12419 + }, + { + "epoch": 0.98, + "grad_norm": 1.7211307993693612, + "learning_rate": 1.5837797172801872e-08, + "loss": 0.4328, + "step": 12420 + }, + { + "epoch": 0.98, + "grad_norm": 0.5177193675839087, + "learning_rate": 1.5736811886381542e-08, + "loss": 0.4921, + "step": 12421 + }, + { + "epoch": 0.98, + "grad_norm": 1.6614835961182148, + "learning_rate": 1.563614907526878e-08, + "loss": 0.4426, + "step": 12422 + }, + { + "epoch": 0.98, + "grad_norm": 2.2722317561060597, + "learning_rate": 1.5535808745977264e-08, + "loss": 0.5116, + "step": 12423 + }, + { + "epoch": 0.98, + "grad_norm": 1.7264921149559496, + "learning_rate": 1.5435790904997916e-08, + "loss": 0.4587, + "step": 12424 + }, + { + "epoch": 0.98, + "grad_norm": 0.5321553756268829, + "learning_rate": 1.5336095558802776e-08, + "loss": 0.4614, + "step": 12425 + }, + { + "epoch": 0.98, + "grad_norm": 0.6128008886931653, + "learning_rate": 1.5236722713841136e-08, + "loss": 0.4895, + "step": 12426 + }, + { + "epoch": 0.98, + "grad_norm": 2.2370866148877075, + "learning_rate": 1.5137672376542843e-08, + "loss": 0.4655, + "step": 12427 + }, + { + "epoch": 0.98, + "grad_norm": 1.647920554101873, + "learning_rate": 1.5038944553316115e-08, + "loss": 0.4008, + "step": 12428 + }, + { + "epoch": 0.98, + "grad_norm": 1.5034680432661958, + "learning_rate": 1.4940539250548058e-08, + "loss": 0.4441, + "step": 12429 + }, + { + "epoch": 0.98, + "grad_norm": 2.362784014842779, + "learning_rate": 1.4842456474606359e-08, + "loss": 0.4948, + "step": 12430 + }, + { + "epoch": 0.98, + "grad_norm": 2.1619123305683585, + "learning_rate": 1.4744696231836497e-08, + "loss": 0.4318, + "step": 12431 + }, + { + "epoch": 0.98, + "grad_norm": 1.8835790837531212, + "learning_rate": 1.4647258528562858e-08, + "loss": 0.4293, + "step": 12432 + }, + { + "epoch": 0.98, + "grad_norm": 0.5230570462943147, + "learning_rate": 1.4550143371089841e-08, + "loss": 0.4644, + "step": 12433 + }, + { + "epoch": 0.98, + "grad_norm": 2.1458175927841308, + "learning_rate": 1.4453350765700757e-08, + "loss": 0.4646, + "step": 12434 + }, + { + "epoch": 0.98, + "grad_norm": 1.2452769847107312, + "learning_rate": 1.4356880718658373e-08, + "loss": 0.4045, + "step": 12435 + }, + { + "epoch": 0.98, + "grad_norm": 1.684451015977942, + "learning_rate": 1.426073323620325e-08, + "loss": 0.5126, + "step": 12436 + }, + { + "epoch": 0.98, + "grad_norm": 0.5564080886814182, + "learning_rate": 1.416490832455708e-08, + "loss": 0.4564, + "step": 12437 + }, + { + "epoch": 0.98, + "grad_norm": 0.5360364455890111, + "learning_rate": 1.4069405989918795e-08, + "loss": 0.4856, + "step": 12438 + }, + { + "epoch": 0.98, + "grad_norm": 1.7875962319350327, + "learning_rate": 1.3974226238467337e-08, + "loss": 0.4406, + "step": 12439 + }, + { + "epoch": 0.98, + "grad_norm": 1.2361012055622402, + "learning_rate": 1.3879369076361115e-08, + "loss": 0.435, + "step": 12440 + }, + { + "epoch": 0.98, + "grad_norm": 2.0111370813231217, + "learning_rate": 1.3784834509736888e-08, + "loss": 0.4896, + "step": 12441 + }, + { + "epoch": 0.98, + "grad_norm": 2.0991391373320645, + "learning_rate": 1.3690622544711429e-08, + "loss": 0.409, + "step": 12442 + }, + { + "epoch": 0.98, + "grad_norm": 1.5819138544642948, + "learning_rate": 1.3596733187379307e-08, + "loss": 0.4001, + "step": 12443 + }, + { + "epoch": 0.98, + "grad_norm": 1.6426891537401123, + "learning_rate": 1.3503166443815663e-08, + "loss": 0.4505, + "step": 12444 + }, + { + "epoch": 0.98, + "grad_norm": 0.536805548788582, + "learning_rate": 1.3409922320074543e-08, + "loss": 0.4558, + "step": 12445 + }, + { + "epoch": 0.98, + "grad_norm": 3.036213115241153, + "learning_rate": 1.3317000822187232e-08, + "loss": 0.4884, + "step": 12446 + }, + { + "epoch": 0.98, + "grad_norm": 0.5599256441556179, + "learning_rate": 1.3224401956167255e-08, + "loss": 0.471, + "step": 12447 + }, + { + "epoch": 0.98, + "grad_norm": 1.7180533833538991, + "learning_rate": 1.3132125728004818e-08, + "loss": 0.4318, + "step": 12448 + }, + { + "epoch": 0.98, + "grad_norm": 2.4900217684565824, + "learning_rate": 1.3040172143670148e-08, + "loss": 0.4495, + "step": 12449 + }, + { + "epoch": 0.98, + "grad_norm": 1.9021945468911312, + "learning_rate": 1.2948541209112375e-08, + "loss": 0.4474, + "step": 12450 + }, + { + "epoch": 0.98, + "grad_norm": 3.0403115975809984, + "learning_rate": 1.2857232930260089e-08, + "loss": 0.4302, + "step": 12451 + }, + { + "epoch": 0.98, + "grad_norm": 2.1025971055779142, + "learning_rate": 1.2766247313021341e-08, + "loss": 0.4879, + "step": 12452 + }, + { + "epoch": 0.98, + "grad_norm": 1.9755778633438108, + "learning_rate": 1.2675584363281979e-08, + "loss": 0.475, + "step": 12453 + }, + { + "epoch": 0.98, + "grad_norm": 1.6410505017855979, + "learning_rate": 1.2585244086907867e-08, + "loss": 0.4346, + "step": 12454 + }, + { + "epoch": 0.98, + "grad_norm": 1.320647738236473, + "learning_rate": 1.2495226489744328e-08, + "loss": 0.4043, + "step": 12455 + }, + { + "epoch": 0.98, + "grad_norm": 1.6080114488495076, + "learning_rate": 1.2405531577615037e-08, + "loss": 0.4014, + "step": 12456 + }, + { + "epoch": 0.98, + "grad_norm": 3.077484636984275, + "learning_rate": 1.2316159356323132e-08, + "loss": 0.451, + "step": 12457 + }, + { + "epoch": 0.98, + "grad_norm": 1.4593675162351005, + "learning_rate": 1.2227109831651763e-08, + "loss": 0.4389, + "step": 12458 + }, + { + "epoch": 0.98, + "grad_norm": 1.6844183378142294, + "learning_rate": 1.2138383009360765e-08, + "loss": 0.3992, + "step": 12459 + }, + { + "epoch": 0.98, + "grad_norm": 2.5246330768015395, + "learning_rate": 1.204997889519166e-08, + "loss": 0.4348, + "step": 12460 + }, + { + "epoch": 0.98, + "grad_norm": 2.085555009779293, + "learning_rate": 1.1961897494863761e-08, + "loss": 0.4301, + "step": 12461 + }, + { + "epoch": 0.98, + "grad_norm": 1.9416403658433656, + "learning_rate": 1.1874138814075842e-08, + "loss": 0.4542, + "step": 12462 + }, + { + "epoch": 0.98, + "grad_norm": 2.5386038278228558, + "learning_rate": 1.1786702858506138e-08, + "loss": 0.4181, + "step": 12463 + }, + { + "epoch": 0.98, + "grad_norm": 0.5406098473433208, + "learning_rate": 1.1699589633811236e-08, + "loss": 0.4708, + "step": 12464 + }, + { + "epoch": 0.98, + "grad_norm": 2.3531787931654895, + "learning_rate": 1.161279914562774e-08, + "loss": 0.4606, + "step": 12465 + }, + { + "epoch": 0.98, + "grad_norm": 1.3752613686105355, + "learning_rate": 1.1526331399570045e-08, + "loss": 0.4254, + "step": 12466 + }, + { + "epoch": 0.98, + "grad_norm": 1.6261018432549799, + "learning_rate": 1.1440186401233123e-08, + "loss": 0.4538, + "step": 12467 + }, + { + "epoch": 0.98, + "grad_norm": 1.7635071713066908, + "learning_rate": 1.1354364156189734e-08, + "loss": 0.438, + "step": 12468 + }, + { + "epoch": 0.98, + "grad_norm": 1.6820209462723312, + "learning_rate": 1.1268864669993773e-08, + "loss": 0.4385, + "step": 12469 + }, + { + "epoch": 0.98, + "grad_norm": 1.6472560419342481, + "learning_rate": 1.1183687948175814e-08, + "loss": 0.4733, + "step": 12470 + }, + { + "epoch": 0.98, + "grad_norm": 1.818097929327011, + "learning_rate": 1.109883399624756e-08, + "loss": 0.4247, + "step": 12471 + }, + { + "epoch": 0.98, + "grad_norm": 2.007440964239131, + "learning_rate": 1.1014302819697952e-08, + "loss": 0.4594, + "step": 12472 + }, + { + "epoch": 0.98, + "grad_norm": 1.6816141890342104, + "learning_rate": 1.0930094423996508e-08, + "loss": 0.4831, + "step": 12473 + }, + { + "epoch": 0.98, + "grad_norm": 1.686752515404565, + "learning_rate": 1.0846208814591642e-08, + "loss": 0.4609, + "step": 12474 + }, + { + "epoch": 0.98, + "grad_norm": 3.1718206277850065, + "learning_rate": 1.0762645996910126e-08, + "loss": 0.4832, + "step": 12475 + }, + { + "epoch": 0.98, + "grad_norm": 1.8219524276605636, + "learning_rate": 1.0679405976359303e-08, + "loss": 0.458, + "step": 12476 + }, + { + "epoch": 0.98, + "grad_norm": 2.070349208892602, + "learning_rate": 1.0596488758323198e-08, + "loss": 0.4414, + "step": 12477 + }, + { + "epoch": 0.98, + "grad_norm": 1.6795091354404088, + "learning_rate": 1.0513894348168074e-08, + "loss": 0.4133, + "step": 12478 + }, + { + "epoch": 0.98, + "grad_norm": 0.5278543022334349, + "learning_rate": 1.0431622751236326e-08, + "loss": 0.4756, + "step": 12479 + }, + { + "epoch": 0.98, + "grad_norm": 1.6986593718091938, + "learning_rate": 1.0349673972852025e-08, + "loss": 0.4301, + "step": 12480 + }, + { + "epoch": 0.98, + "grad_norm": 1.5694325962801887, + "learning_rate": 1.0268048018315935e-08, + "loss": 0.4633, + "step": 12481 + }, + { + "epoch": 0.98, + "grad_norm": 2.646592610833807, + "learning_rate": 1.0186744892909939e-08, + "loss": 0.4805, + "step": 12482 + }, + { + "epoch": 0.98, + "grad_norm": 1.6287190815935564, + "learning_rate": 1.0105764601894274e-08, + "loss": 0.4353, + "step": 12483 + }, + { + "epoch": 0.98, + "grad_norm": 1.5997022011670194, + "learning_rate": 1.0025107150508085e-08, + "loss": 0.439, + "step": 12484 + }, + { + "epoch": 0.98, + "grad_norm": 1.8898280559415683, + "learning_rate": 9.944772543969417e-09, + "loss": 0.4542, + "step": 12485 + }, + { + "epoch": 0.98, + "grad_norm": 1.9372268015740353, + "learning_rate": 9.864760787476336e-09, + "loss": 0.4609, + "step": 12486 + }, + { + "epoch": 0.98, + "grad_norm": 0.5533978310676768, + "learning_rate": 9.785071886205256e-09, + "loss": 0.4693, + "step": 12487 + }, + { + "epoch": 0.98, + "grad_norm": 1.637118589972661, + "learning_rate": 9.705705845312052e-09, + "loss": 0.4497, + "step": 12488 + }, + { + "epoch": 0.98, + "grad_norm": 2.1195306500732567, + "learning_rate": 9.626662669931507e-09, + "loss": 0.4441, + "step": 12489 + }, + { + "epoch": 0.98, + "grad_norm": 0.5406200919018584, + "learning_rate": 9.547942365177865e-09, + "loss": 0.4616, + "step": 12490 + }, + { + "epoch": 0.98, + "grad_norm": 1.633634812555877, + "learning_rate": 9.469544936143715e-09, + "loss": 0.4279, + "step": 12491 + }, + { + "epoch": 0.98, + "grad_norm": 0.6129967266922169, + "learning_rate": 9.391470387901669e-09, + "loss": 0.4574, + "step": 12492 + }, + { + "epoch": 0.98, + "grad_norm": 1.7075401056109174, + "learning_rate": 9.31371872550324e-09, + "loss": 0.4309, + "step": 12493 + }, + { + "epoch": 0.98, + "grad_norm": 1.5134047362786376, + "learning_rate": 9.23628995397774e-09, + "loss": 0.4877, + "step": 12494 + }, + { + "epoch": 0.98, + "grad_norm": 2.8002690524548104, + "learning_rate": 9.159184078336714e-09, + "loss": 0.434, + "step": 12495 + }, + { + "epoch": 0.98, + "grad_norm": 1.727504780408673, + "learning_rate": 9.082401103566729e-09, + "loss": 0.4242, + "step": 12496 + }, + { + "epoch": 0.98, + "grad_norm": 2.6428157749345744, + "learning_rate": 9.005941034636589e-09, + "loss": 0.4986, + "step": 12497 + }, + { + "epoch": 0.98, + "grad_norm": 1.6031208444443603, + "learning_rate": 8.929803876493448e-09, + "loss": 0.4342, + "step": 12498 + }, + { + "epoch": 0.98, + "grad_norm": 1.5116602925620355, + "learning_rate": 8.853989634062255e-09, + "loss": 0.4165, + "step": 12499 + }, + { + "epoch": 0.98, + "grad_norm": 0.5207485320793253, + "learning_rate": 8.778498312249083e-09, + "loss": 0.4639, + "step": 12500 + }, + { + "epoch": 0.98, + "grad_norm": 1.6557388787633334, + "learning_rate": 8.703329915937808e-09, + "loss": 0.4893, + "step": 12501 + }, + { + "epoch": 0.98, + "grad_norm": 1.555961993380804, + "learning_rate": 8.628484449991759e-09, + "loss": 0.4641, + "step": 12502 + }, + { + "epoch": 0.98, + "grad_norm": 1.7873939258482971, + "learning_rate": 8.553961919252618e-09, + "loss": 0.4899, + "step": 12503 + }, + { + "epoch": 0.98, + "grad_norm": 2.176924203274366, + "learning_rate": 8.479762328543195e-09, + "loss": 0.4841, + "step": 12504 + }, + { + "epoch": 0.98, + "grad_norm": 2.0594458894041816, + "learning_rate": 8.405885682662983e-09, + "loss": 0.4124, + "step": 12505 + }, + { + "epoch": 0.98, + "grad_norm": 1.5285612407395224, + "learning_rate": 8.332331986392606e-09, + "loss": 0.4813, + "step": 12506 + }, + { + "epoch": 0.98, + "grad_norm": 1.4827611726782732, + "learning_rate": 8.25910124448992e-09, + "loss": 0.4485, + "step": 12507 + }, + { + "epoch": 0.98, + "grad_norm": 1.9188966963848202, + "learning_rate": 8.186193461693359e-09, + "loss": 0.4439, + "step": 12508 + }, + { + "epoch": 0.98, + "grad_norm": 1.2877615313442303, + "learning_rate": 8.11360864272026e-09, + "loss": 0.4499, + "step": 12509 + }, + { + "epoch": 0.98, + "grad_norm": 2.1060685943110413, + "learning_rate": 8.041346792266313e-09, + "loss": 0.4501, + "step": 12510 + }, + { + "epoch": 0.98, + "grad_norm": 1.5991257398218082, + "learning_rate": 7.969407915007221e-09, + "loss": 0.4369, + "step": 12511 + }, + { + "epoch": 0.98, + "grad_norm": 2.3948486845115555, + "learning_rate": 7.897792015596484e-09, + "loss": 0.4424, + "step": 12512 + }, + { + "epoch": 0.98, + "grad_norm": 1.624702457056527, + "learning_rate": 7.826499098668728e-09, + "loss": 0.4198, + "step": 12513 + }, + { + "epoch": 0.98, + "grad_norm": 2.4245030949680832, + "learning_rate": 7.755529168835263e-09, + "loss": 0.4454, + "step": 12514 + }, + { + "epoch": 0.98, + "grad_norm": 1.6885482230210522, + "learning_rate": 7.684882230689083e-09, + "loss": 0.4082, + "step": 12515 + }, + { + "epoch": 0.98, + "grad_norm": 1.6320356095730846, + "learning_rate": 7.614558288799867e-09, + "loss": 0.4074, + "step": 12516 + }, + { + "epoch": 0.98, + "grad_norm": 1.812139307775009, + "learning_rate": 7.544557347717863e-09, + "loss": 0.4168, + "step": 12517 + }, + { + "epoch": 0.98, + "grad_norm": 0.5353127016172445, + "learning_rate": 7.47487941197167e-09, + "loss": 0.4709, + "step": 12518 + }, + { + "epoch": 0.98, + "grad_norm": 2.2160211052553445, + "learning_rate": 7.405524486070459e-09, + "loss": 0.3932, + "step": 12519 + }, + { + "epoch": 0.98, + "grad_norm": 1.4763512548568691, + "learning_rate": 7.336492574500087e-09, + "loss": 0.4209, + "step": 12520 + }, + { + "epoch": 0.98, + "grad_norm": 0.5663983031717892, + "learning_rate": 7.267783681728091e-09, + "loss": 0.4826, + "step": 12521 + }, + { + "epoch": 0.98, + "grad_norm": 2.200697643459193, + "learning_rate": 7.199397812198694e-09, + "loss": 0.4394, + "step": 12522 + }, + { + "epoch": 0.98, + "grad_norm": 1.911818981207784, + "learning_rate": 7.13133497033669e-09, + "loss": 0.4324, + "step": 12523 + }, + { + "epoch": 0.98, + "grad_norm": 0.59879226006257, + "learning_rate": 7.063595160546332e-09, + "loss": 0.4992, + "step": 12524 + }, + { + "epoch": 0.98, + "grad_norm": 2.970817586930159, + "learning_rate": 6.9961783872091185e-09, + "loss": 0.47, + "step": 12525 + }, + { + "epoch": 0.98, + "grad_norm": 1.7914245449334705, + "learning_rate": 6.929084654688223e-09, + "loss": 0.4855, + "step": 12526 + }, + { + "epoch": 0.98, + "grad_norm": 0.562617636154707, + "learning_rate": 6.862313967323508e-09, + "loss": 0.4592, + "step": 12527 + }, + { + "epoch": 0.98, + "grad_norm": 1.5172123400698967, + "learning_rate": 6.795866329434853e-09, + "loss": 0.4316, + "step": 12528 + }, + { + "epoch": 0.98, + "grad_norm": 2.0669896121000284, + "learning_rate": 6.729741745322149e-09, + "loss": 0.4048, + "step": 12529 + }, + { + "epoch": 0.98, + "grad_norm": 3.3857952260583914, + "learning_rate": 6.663940219263088e-09, + "loss": 0.4202, + "step": 12530 + }, + { + "epoch": 0.98, + "grad_norm": 2.043134969167867, + "learning_rate": 6.598461755514818e-09, + "loss": 0.4681, + "step": 12531 + }, + { + "epoch": 0.98, + "grad_norm": 1.604203662963431, + "learning_rate": 6.533306358313951e-09, + "loss": 0.4552, + "step": 12532 + }, + { + "epoch": 0.98, + "grad_norm": 2.1187301365065365, + "learning_rate": 6.468474031876004e-09, + "loss": 0.4618, + "step": 12533 + }, + { + "epoch": 0.98, + "grad_norm": 1.8939579672034357, + "learning_rate": 6.4039647803953994e-09, + "loss": 0.4519, + "step": 12534 + }, + { + "epoch": 0.98, + "grad_norm": 1.8553072704475941, + "learning_rate": 6.339778608046021e-09, + "loss": 0.4152, + "step": 12535 + }, + { + "epoch": 0.98, + "grad_norm": 1.3667794668592637, + "learning_rate": 6.275915518980102e-09, + "loss": 0.4094, + "step": 12536 + }, + { + "epoch": 0.98, + "grad_norm": 4.8616680695991965, + "learning_rate": 6.212375517329894e-09, + "loss": 0.4965, + "step": 12537 + }, + { + "epoch": 0.98, + "grad_norm": 1.7325493851720626, + "learning_rate": 6.149158607206552e-09, + "loss": 0.4418, + "step": 12538 + }, + { + "epoch": 0.98, + "grad_norm": 2.070073371111874, + "learning_rate": 6.086264792699581e-09, + "loss": 0.4642, + "step": 12539 + }, + { + "epoch": 0.98, + "grad_norm": 0.5267055785058776, + "learning_rate": 6.023694077878506e-09, + "loss": 0.4346, + "step": 12540 + }, + { + "epoch": 0.98, + "grad_norm": 1.897340562724816, + "learning_rate": 5.961446466791199e-09, + "loss": 0.4368, + "step": 12541 + }, + { + "epoch": 0.98, + "grad_norm": 1.5575401288335324, + "learning_rate": 5.899521963465549e-09, + "loss": 0.4369, + "step": 12542 + }, + { + "epoch": 0.99, + "grad_norm": 1.830500131131668, + "learning_rate": 5.837920571907796e-09, + "loss": 0.4485, + "step": 12543 + }, + { + "epoch": 0.99, + "grad_norm": 1.737592730959085, + "learning_rate": 5.7766422961030855e-09, + "loss": 0.42, + "step": 12544 + }, + { + "epoch": 0.99, + "grad_norm": 0.5180532914566667, + "learning_rate": 5.715687140016579e-09, + "loss": 0.4732, + "step": 12545 + }, + { + "epoch": 0.99, + "grad_norm": 2.5399953821077093, + "learning_rate": 5.6550551075923445e-09, + "loss": 0.4859, + "step": 12546 + }, + { + "epoch": 0.99, + "grad_norm": 1.835526287629584, + "learning_rate": 5.594746202751688e-09, + "loss": 0.4282, + "step": 12547 + }, + { + "epoch": 0.99, + "grad_norm": 1.9304038926712141, + "learning_rate": 5.534760429398156e-09, + "loss": 0.45, + "step": 12548 + }, + { + "epoch": 0.99, + "grad_norm": 1.8160448694699245, + "learning_rate": 5.475097791411421e-09, + "loss": 0.4387, + "step": 12549 + }, + { + "epoch": 0.99, + "grad_norm": 0.6161554019832003, + "learning_rate": 5.41575829265284e-09, + "loss": 0.4763, + "step": 12550 + }, + { + "epoch": 0.99, + "grad_norm": 2.1088372761369976, + "learning_rate": 5.356741936960452e-09, + "loss": 0.4589, + "step": 12551 + }, + { + "epoch": 0.99, + "grad_norm": 2.0716707289278915, + "learning_rate": 5.298048728152872e-09, + "loss": 0.5001, + "step": 12552 + }, + { + "epoch": 0.99, + "grad_norm": 1.6048323588845608, + "learning_rate": 5.2396786700281695e-09, + "loss": 0.4213, + "step": 12553 + }, + { + "epoch": 0.99, + "grad_norm": 2.185769982058536, + "learning_rate": 5.181631766362216e-09, + "loss": 0.4534, + "step": 12554 + }, + { + "epoch": 0.99, + "grad_norm": 1.8134513300830346, + "learning_rate": 5.12390802091034e-09, + "loss": 0.4304, + "step": 12555 + }, + { + "epoch": 0.99, + "grad_norm": 1.7380971816132587, + "learning_rate": 5.066507437407886e-09, + "loss": 0.457, + "step": 12556 + }, + { + "epoch": 0.99, + "grad_norm": 1.5501853884556158, + "learning_rate": 5.009430019567996e-09, + "loss": 0.4683, + "step": 12557 + }, + { + "epoch": 0.99, + "grad_norm": 2.783188806585626, + "learning_rate": 4.952675771083826e-09, + "loss": 0.4398, + "step": 12558 + }, + { + "epoch": 0.99, + "grad_norm": 2.3761827483572775, + "learning_rate": 4.896244695626884e-09, + "loss": 0.4314, + "step": 12559 + }, + { + "epoch": 0.99, + "grad_norm": 1.7419609398005242, + "learning_rate": 4.840136796848694e-09, + "loss": 0.4271, + "step": 12560 + }, + { + "epoch": 0.99, + "grad_norm": 1.4873744251207768, + "learning_rate": 4.784352078379129e-09, + "loss": 0.3904, + "step": 12561 + }, + { + "epoch": 0.99, + "grad_norm": 1.6377879071226575, + "learning_rate": 4.728890543827525e-09, + "loss": 0.4333, + "step": 12562 + }, + { + "epoch": 0.99, + "grad_norm": 1.74665235922418, + "learning_rate": 4.673752196782122e-09, + "loss": 0.4009, + "step": 12563 + }, + { + "epoch": 0.99, + "grad_norm": 2.6837070454110004, + "learning_rate": 4.618937040810068e-09, + "loss": 0.4144, + "step": 12564 + }, + { + "epoch": 0.99, + "grad_norm": 2.1052359316794678, + "learning_rate": 4.564445079457969e-09, + "loss": 0.461, + "step": 12565 + }, + { + "epoch": 0.99, + "grad_norm": 1.5818245045843733, + "learning_rate": 4.510276316251339e-09, + "loss": 0.4229, + "step": 12566 + }, + { + "epoch": 0.99, + "grad_norm": 3.779675202771309, + "learning_rate": 4.456430754695151e-09, + "loss": 0.4771, + "step": 12567 + }, + { + "epoch": 0.99, + "grad_norm": 2.127673950283077, + "learning_rate": 4.4029083982727314e-09, + "loss": 0.5127, + "step": 12568 + }, + { + "epoch": 0.99, + "grad_norm": 2.6734948827249356, + "learning_rate": 4.34970925044742e-09, + "loss": 0.4384, + "step": 12569 + }, + { + "epoch": 0.99, + "grad_norm": 2.8513899284300206, + "learning_rate": 4.296833314660354e-09, + "loss": 0.4326, + "step": 12570 + }, + { + "epoch": 0.99, + "grad_norm": 1.8720860971901838, + "learning_rate": 4.244280594333239e-09, + "loss": 0.4191, + "step": 12571 + }, + { + "epoch": 0.99, + "grad_norm": 1.5922233995268702, + "learning_rate": 4.192051092866134e-09, + "loss": 0.4358, + "step": 12572 + }, + { + "epoch": 0.99, + "grad_norm": 2.5047227126346328, + "learning_rate": 4.140144813637448e-09, + "loss": 0.4575, + "step": 12573 + }, + { + "epoch": 0.99, + "grad_norm": 1.5054844439859798, + "learning_rate": 4.08856176000616e-09, + "loss": 0.4882, + "step": 12574 + }, + { + "epoch": 0.99, + "grad_norm": 1.543994646107952, + "learning_rate": 4.037301935309601e-09, + "loss": 0.3717, + "step": 12575 + }, + { + "epoch": 0.99, + "grad_norm": 0.568550411337171, + "learning_rate": 3.986365342864007e-09, + "loss": 0.4686, + "step": 12576 + }, + { + "epoch": 0.99, + "grad_norm": 1.6765072465540531, + "learning_rate": 3.935751985965075e-09, + "loss": 0.4268, + "step": 12577 + }, + { + "epoch": 0.99, + "grad_norm": 1.5680153252172166, + "learning_rate": 3.885461867886852e-09, + "loss": 0.4592, + "step": 12578 + }, + { + "epoch": 0.99, + "grad_norm": 1.743475879591402, + "learning_rate": 3.835494991883959e-09, + "loss": 0.4043, + "step": 12579 + }, + { + "epoch": 0.99, + "grad_norm": 1.542144750645424, + "learning_rate": 3.785851361188808e-09, + "loss": 0.4415, + "step": 12580 + }, + { + "epoch": 0.99, + "grad_norm": 0.5333926961892341, + "learning_rate": 3.736530979012721e-09, + "loss": 0.4617, + "step": 12581 + }, + { + "epoch": 0.99, + "grad_norm": 1.6365016367833727, + "learning_rate": 3.6875338485475866e-09, + "loss": 0.4253, + "step": 12582 + }, + { + "epoch": 0.99, + "grad_norm": 0.5217973327164985, + "learning_rate": 3.6388599729625383e-09, + "loss": 0.4607, + "step": 12583 + }, + { + "epoch": 0.99, + "grad_norm": 16.75684485275876, + "learning_rate": 3.590509355407834e-09, + "loss": 0.4537, + "step": 12584 + }, + { + "epoch": 0.99, + "grad_norm": 1.7779382848013272, + "learning_rate": 3.5424819990104163e-09, + "loss": 0.4907, + "step": 12585 + }, + { + "epoch": 0.99, + "grad_norm": 0.5397302420331073, + "learning_rate": 3.49477790687891e-09, + "loss": 0.4697, + "step": 12586 + }, + { + "epoch": 0.99, + "grad_norm": 2.019218932599117, + "learning_rate": 3.4473970820986248e-09, + "loss": 0.437, + "step": 12587 + }, + { + "epoch": 0.99, + "grad_norm": 3.2831027218224276, + "learning_rate": 3.4003395277354413e-09, + "loss": 0.4429, + "step": 12588 + }, + { + "epoch": 0.99, + "grad_norm": 2.7428038821769722, + "learning_rate": 3.353605246833591e-09, + "loss": 0.4658, + "step": 12589 + }, + { + "epoch": 0.99, + "grad_norm": 1.8525318997912361, + "learning_rate": 3.307194242417322e-09, + "loss": 0.4467, + "step": 12590 + }, + { + "epoch": 0.99, + "grad_norm": 1.7412943231851326, + "learning_rate": 3.261106517489232e-09, + "loss": 0.4505, + "step": 12591 + }, + { + "epoch": 0.99, + "grad_norm": 1.8502048825724473, + "learning_rate": 3.215342075030825e-09, + "loss": 0.4773, + "step": 12592 + }, + { + "epoch": 0.99, + "grad_norm": 3.7600758568759587, + "learning_rate": 3.169900918003621e-09, + "loss": 0.4455, + "step": 12593 + }, + { + "epoch": 0.99, + "grad_norm": 1.8894457368547062, + "learning_rate": 3.1247830493469357e-09, + "loss": 0.4306, + "step": 12594 + }, + { + "epoch": 0.99, + "grad_norm": 1.7994146551691743, + "learning_rate": 3.0799884719795448e-09, + "loss": 0.4223, + "step": 12595 + }, + { + "epoch": 0.99, + "grad_norm": 1.6144149442660667, + "learning_rate": 3.035517188800796e-09, + "loss": 0.4332, + "step": 12596 + }, + { + "epoch": 0.99, + "grad_norm": 2.6487461277474638, + "learning_rate": 2.9913692026867227e-09, + "loss": 0.494, + "step": 12597 + }, + { + "epoch": 0.99, + "grad_norm": 1.8387610012853146, + "learning_rate": 2.947544516494483e-09, + "loss": 0.461, + "step": 12598 + }, + { + "epoch": 0.99, + "grad_norm": 0.5622533905295612, + "learning_rate": 2.9040431330584762e-09, + "loss": 0.4856, + "step": 12599 + }, + { + "epoch": 0.99, + "grad_norm": 3.730566070880151, + "learning_rate": 2.8608650551947835e-09, + "loss": 0.4485, + "step": 12600 + }, + { + "epoch": 0.99, + "grad_norm": 1.6813296552044827, + "learning_rate": 2.818010285695616e-09, + "loss": 0.4401, + "step": 12601 + }, + { + "epoch": 0.99, + "grad_norm": 1.8829375974243217, + "learning_rate": 2.7754788273337553e-09, + "loss": 0.453, + "step": 12602 + }, + { + "epoch": 0.99, + "grad_norm": 2.4322393591207865, + "learning_rate": 2.733270682861444e-09, + "loss": 0.4748, + "step": 12603 + }, + { + "epoch": 0.99, + "grad_norm": 1.8279137379874328, + "learning_rate": 2.6913858550098314e-09, + "loss": 0.4555, + "step": 12604 + }, + { + "epoch": 0.99, + "grad_norm": 1.620441580584949, + "learning_rate": 2.6498243464878614e-09, + "loss": 0.4354, + "step": 12605 + }, + { + "epoch": 0.99, + "grad_norm": 1.723250227563526, + "learning_rate": 2.6085861599844943e-09, + "loss": 0.4761, + "step": 12606 + }, + { + "epoch": 0.99, + "grad_norm": 1.653617630457076, + "learning_rate": 2.5676712981687058e-09, + "loss": 0.4205, + "step": 12607 + }, + { + "epoch": 0.99, + "grad_norm": 1.8081656612909613, + "learning_rate": 2.5270797636867127e-09, + "loss": 0.4807, + "step": 12608 + }, + { + "epoch": 0.99, + "grad_norm": 2.5980962280382385, + "learning_rate": 2.486811559165303e-09, + "loss": 0.4742, + "step": 12609 + }, + { + "epoch": 0.99, + "grad_norm": 1.6159725732225896, + "learning_rate": 2.4468666872096148e-09, + "loss": 0.4149, + "step": 12610 + }, + { + "epoch": 0.99, + "grad_norm": 1.651409029062181, + "learning_rate": 2.407245150404247e-09, + "loss": 0.4721, + "step": 12611 + }, + { + "epoch": 0.99, + "grad_norm": 1.4651318218104874, + "learning_rate": 2.36794695131215e-09, + "loss": 0.435, + "step": 12612 + }, + { + "epoch": 0.99, + "grad_norm": 1.8897248070299966, + "learning_rate": 2.3289720924762893e-09, + "loss": 0.4097, + "step": 12613 + }, + { + "epoch": 0.99, + "grad_norm": 0.5459885522426507, + "learning_rate": 2.2903205764185364e-09, + "loss": 0.4722, + "step": 12614 + }, + { + "epoch": 0.99, + "grad_norm": 0.5094813596235412, + "learning_rate": 2.251992405638559e-09, + "loss": 0.4614, + "step": 12615 + }, + { + "epoch": 0.99, + "grad_norm": 0.5326465389649856, + "learning_rate": 2.21398758261715e-09, + "loss": 0.4607, + "step": 12616 + }, + { + "epoch": 0.99, + "grad_norm": 1.8250086540855375, + "learning_rate": 2.176306109812898e-09, + "loss": 0.41, + "step": 12617 + }, + { + "epoch": 0.99, + "grad_norm": 1.9695860736088986, + "learning_rate": 2.138947989663298e-09, + "loss": 0.4135, + "step": 12618 + }, + { + "epoch": 0.99, + "grad_norm": 3.161567082916375, + "learning_rate": 2.101913224585861e-09, + "loss": 0.474, + "step": 12619 + }, + { + "epoch": 0.99, + "grad_norm": 2.2416319555802144, + "learning_rate": 2.065201816977003e-09, + "loss": 0.4995, + "step": 12620 + }, + { + "epoch": 0.99, + "grad_norm": 1.5850137362441887, + "learning_rate": 2.0288137692109357e-09, + "loss": 0.3967, + "step": 12621 + }, + { + "epoch": 0.99, + "grad_norm": 2.2629690739861874, + "learning_rate": 1.9927490836429975e-09, + "loss": 0.4719, + "step": 12622 + }, + { + "epoch": 0.99, + "grad_norm": 1.5208464892056612, + "learning_rate": 1.957007762605212e-09, + "loss": 0.4435, + "step": 12623 + }, + { + "epoch": 0.99, + "grad_norm": 1.7229118507059125, + "learning_rate": 1.9215898084112838e-09, + "loss": 0.4288, + "step": 12624 + }, + { + "epoch": 0.99, + "grad_norm": 1.941189359487513, + "learning_rate": 1.8864952233521584e-09, + "loss": 0.4395, + "step": 12625 + }, + { + "epoch": 0.99, + "grad_norm": 1.6019764991918644, + "learning_rate": 1.8517240096982414e-09, + "loss": 0.4247, + "step": 12626 + }, + { + "epoch": 0.99, + "grad_norm": 2.8986391093999524, + "learning_rate": 1.8172761696994001e-09, + "loss": 0.489, + "step": 12627 + }, + { + "epoch": 0.99, + "grad_norm": 2.0394481341760646, + "learning_rate": 1.7831517055849623e-09, + "loss": 0.487, + "step": 12628 + }, + { + "epoch": 0.99, + "grad_norm": 2.2593047336199894, + "learning_rate": 1.7493506195614963e-09, + "loss": 0.4614, + "step": 12629 + }, + { + "epoch": 0.99, + "grad_norm": 1.5642282037206607, + "learning_rate": 1.7158729138166963e-09, + "loss": 0.4371, + "step": 12630 + }, + { + "epoch": 0.99, + "grad_norm": 1.7932297997838356, + "learning_rate": 1.6827185905160526e-09, + "loss": 0.4141, + "step": 12631 + }, + { + "epoch": 0.99, + "grad_norm": 2.0383482222754656, + "learning_rate": 1.6498876518050711e-09, + "loss": 0.4467, + "step": 12632 + }, + { + "epoch": 0.99, + "grad_norm": 1.8206322122454668, + "learning_rate": 1.6173800998076084e-09, + "loss": 0.489, + "step": 12633 + }, + { + "epoch": 0.99, + "grad_norm": 1.9885811727605542, + "learning_rate": 1.585195936626982e-09, + "loss": 0.4226, + "step": 12634 + }, + { + "epoch": 0.99, + "grad_norm": 0.5597764070803066, + "learning_rate": 1.5533351643459704e-09, + "loss": 0.4735, + "step": 12635 + }, + { + "epoch": 0.99, + "grad_norm": 2.8587457064072965, + "learning_rate": 1.5217977850245925e-09, + "loss": 0.4089, + "step": 12636 + }, + { + "epoch": 0.99, + "grad_norm": 2.0254855543485153, + "learning_rate": 1.4905838007045482e-09, + "loss": 0.4698, + "step": 12637 + }, + { + "epoch": 0.99, + "grad_norm": 1.7780462741774332, + "learning_rate": 1.4596932134047782e-09, + "loss": 0.444, + "step": 12638 + }, + { + "epoch": 0.99, + "grad_norm": 1.5095873108317084, + "learning_rate": 1.4291260251236837e-09, + "loss": 0.4396, + "step": 12639 + }, + { + "epoch": 0.99, + "grad_norm": 2.416634595873887, + "learning_rate": 1.3988822378396827e-09, + "loss": 0.3945, + "step": 12640 + }, + { + "epoch": 0.99, + "grad_norm": 1.452242685705804, + "learning_rate": 1.3689618535084326e-09, + "loss": 0.4513, + "step": 12641 + }, + { + "epoch": 0.99, + "grad_norm": 0.5175640346460375, + "learning_rate": 1.3393648740667175e-09, + "loss": 0.444, + "step": 12642 + }, + { + "epoch": 0.99, + "grad_norm": 0.5284292319516303, + "learning_rate": 1.3100913014291172e-09, + "loss": 0.4663, + "step": 12643 + }, + { + "epoch": 0.99, + "grad_norm": 0.5361981828406261, + "learning_rate": 1.281141137489117e-09, + "loss": 0.4546, + "step": 12644 + }, + { + "epoch": 0.99, + "grad_norm": 2.400674992174665, + "learning_rate": 1.2525143841202182e-09, + "loss": 0.3892, + "step": 12645 + }, + { + "epoch": 0.99, + "grad_norm": 2.2224078853439946, + "learning_rate": 1.2242110431742727e-09, + "loss": 0.4413, + "step": 12646 + }, + { + "epoch": 0.99, + "grad_norm": 2.1059507453490065, + "learning_rate": 1.1962311164831485e-09, + "loss": 0.413, + "step": 12647 + }, + { + "epoch": 0.99, + "grad_norm": 2.1326559083608303, + "learning_rate": 1.1685746058565095e-09, + "loss": 0.4486, + "step": 12648 + }, + { + "epoch": 0.99, + "grad_norm": 2.012181921987285, + "learning_rate": 1.1412415130834797e-09, + "loss": 0.4484, + "step": 12649 + }, + { + "epoch": 0.99, + "grad_norm": 1.3939322267270886, + "learning_rate": 1.1142318399331997e-09, + "loss": 0.3886, + "step": 12650 + }, + { + "epoch": 0.99, + "grad_norm": 1.5624958257655308, + "learning_rate": 1.0875455881526055e-09, + "loss": 0.4612, + "step": 12651 + }, + { + "epoch": 0.99, + "grad_norm": 2.9654969853414817, + "learning_rate": 1.0611827594680934e-09, + "loss": 0.483, + "step": 12652 + }, + { + "epoch": 0.99, + "grad_norm": 1.4940693652885892, + "learning_rate": 1.0351433555860769e-09, + "loss": 0.4191, + "step": 12653 + }, + { + "epoch": 0.99, + "grad_norm": 1.9052627213490994, + "learning_rate": 1.0094273781907637e-09, + "loss": 0.4743, + "step": 12654 + }, + { + "epoch": 0.99, + "grad_norm": 1.6582757608629217, + "learning_rate": 9.840348289458234e-10, + "loss": 0.4644, + "step": 12655 + }, + { + "epoch": 0.99, + "grad_norm": 1.8414704778888966, + "learning_rate": 9.58965709494386e-10, + "loss": 0.4045, + "step": 12656 + }, + { + "epoch": 0.99, + "grad_norm": 1.5403204651039883, + "learning_rate": 9.342200214579323e-10, + "loss": 0.4109, + "step": 12657 + }, + { + "epoch": 0.99, + "grad_norm": 2.516150066535453, + "learning_rate": 9.097977664385138e-10, + "loss": 0.4368, + "step": 12658 + }, + { + "epoch": 0.99, + "grad_norm": 2.119426394438219, + "learning_rate": 8.856989460148679e-10, + "loss": 0.4289, + "step": 12659 + }, + { + "epoch": 0.99, + "grad_norm": 1.5717193044962845, + "learning_rate": 8.619235617474131e-10, + "loss": 0.4182, + "step": 12660 + }, + { + "epoch": 0.99, + "grad_norm": 1.5701016631999365, + "learning_rate": 8.38471615173253e-10, + "loss": 0.4469, + "step": 12661 + }, + { + "epoch": 0.99, + "grad_norm": 2.4526411242547237, + "learning_rate": 8.153431078106177e-10, + "loss": 0.5089, + "step": 12662 + }, + { + "epoch": 0.99, + "grad_norm": 1.8711326982969616, + "learning_rate": 7.925380411555327e-10, + "loss": 0.4796, + "step": 12663 + }, + { + "epoch": 0.99, + "grad_norm": 1.82933748367458, + "learning_rate": 7.700564166834845e-10, + "loss": 0.4149, + "step": 12664 + }, + { + "epoch": 0.99, + "grad_norm": 2.778140889547643, + "learning_rate": 7.478982358488651e-10, + "loss": 0.4111, + "step": 12665 + }, + { + "epoch": 0.99, + "grad_norm": 0.551549633281428, + "learning_rate": 7.260635000855276e-10, + "loss": 0.4412, + "step": 12666 + }, + { + "epoch": 0.99, + "grad_norm": 1.6775553700036507, + "learning_rate": 7.045522108056757e-10, + "loss": 0.4213, + "step": 12667 + }, + { + "epoch": 0.99, + "grad_norm": 1.4542510969415225, + "learning_rate": 6.833643694015291e-10, + "loss": 0.4203, + "step": 12668 + }, + { + "epoch": 0.99, + "grad_norm": 1.4612066197198725, + "learning_rate": 6.62499977243658e-10, + "loss": 0.4268, + "step": 12669 + }, + { + "epoch": 1.0, + "grad_norm": 1.6071274144746788, + "learning_rate": 6.419590356826489e-10, + "loss": 0.4404, + "step": 12670 + }, + { + "epoch": 1.0, + "grad_norm": 3.5635197407005026, + "learning_rate": 6.217415460463283e-10, + "loss": 0.4582, + "step": 12671 + }, + { + "epoch": 1.0, + "grad_norm": 2.5138247265949745, + "learning_rate": 6.018475096436494e-10, + "loss": 0.4735, + "step": 12672 + }, + { + "epoch": 1.0, + "grad_norm": 1.5127880143163386, + "learning_rate": 5.822769277613604e-10, + "loss": 0.4465, + "step": 12673 + }, + { + "epoch": 1.0, + "grad_norm": 1.926359490358433, + "learning_rate": 5.63029801665671e-10, + "loss": 0.4695, + "step": 12674 + }, + { + "epoch": 1.0, + "grad_norm": 2.4810396036485027, + "learning_rate": 5.44106132601696e-10, + "loss": 0.4371, + "step": 12675 + }, + { + "epoch": 1.0, + "grad_norm": 1.7347582054933806, + "learning_rate": 5.255059217940117e-10, + "loss": 0.4459, + "step": 12676 + }, + { + "epoch": 1.0, + "grad_norm": 2.2379992398321513, + "learning_rate": 5.072291704460997e-10, + "loss": 0.3806, + "step": 12677 + }, + { + "epoch": 1.0, + "grad_norm": 2.523233081733703, + "learning_rate": 4.892758797397923e-10, + "loss": 0.4726, + "step": 12678 + }, + { + "epoch": 1.0, + "grad_norm": 1.4533494827497468, + "learning_rate": 4.716460508380482e-10, + "loss": 0.4463, + "step": 12679 + }, + { + "epoch": 1.0, + "grad_norm": 1.5101038936748863, + "learning_rate": 4.543396848799564e-10, + "loss": 0.4486, + "step": 12680 + }, + { + "epoch": 1.0, + "grad_norm": 1.730805620410329, + "learning_rate": 4.373567829857317e-10, + "loss": 0.4636, + "step": 12681 + }, + { + "epoch": 1.0, + "grad_norm": 1.4986104858271505, + "learning_rate": 4.20697346254495e-10, + "loss": 0.4653, + "step": 12682 + }, + { + "epoch": 1.0, + "grad_norm": 1.7828414255189737, + "learning_rate": 4.043613757642728e-10, + "loss": 0.509, + "step": 12683 + }, + { + "epoch": 1.0, + "grad_norm": 1.7409368545554322, + "learning_rate": 3.8834887257088726e-10, + "loss": 0.4569, + "step": 12684 + }, + { + "epoch": 1.0, + "grad_norm": 1.8107875358792018, + "learning_rate": 3.7265983771184176e-10, + "loss": 0.4634, + "step": 12685 + }, + { + "epoch": 1.0, + "grad_norm": 2.1987832332917243, + "learning_rate": 3.5729427220076997e-10, + "loss": 0.3984, + "step": 12686 + }, + { + "epoch": 1.0, + "grad_norm": 1.5882555842386747, + "learning_rate": 3.422521770324316e-10, + "loss": 0.4267, + "step": 12687 + }, + { + "epoch": 1.0, + "grad_norm": 3.13433649911325, + "learning_rate": 3.275335531799373e-10, + "loss": 0.4363, + "step": 12688 + }, + { + "epoch": 1.0, + "grad_norm": 7.058063244847107, + "learning_rate": 3.131384015958583e-10, + "loss": 0.4152, + "step": 12689 + }, + { + "epoch": 1.0, + "grad_norm": 3.6775059766723466, + "learning_rate": 2.9906672321111664e-10, + "loss": 0.403, + "step": 12690 + }, + { + "epoch": 1.0, + "grad_norm": 2.2414625135016903, + "learning_rate": 2.853185189366503e-10, + "loss": 0.4378, + "step": 12691 + }, + { + "epoch": 1.0, + "grad_norm": 2.194515766554616, + "learning_rate": 2.718937896611928e-10, + "loss": 0.4709, + "step": 12692 + }, + { + "epoch": 1.0, + "grad_norm": 2.0364742613506723, + "learning_rate": 2.587925362540489e-10, + "loss": 0.4631, + "step": 12693 + }, + { + "epoch": 1.0, + "grad_norm": 1.7429766137277094, + "learning_rate": 2.460147595628737e-10, + "loss": 0.4512, + "step": 12694 + }, + { + "epoch": 1.0, + "grad_norm": 1.5571357964895418, + "learning_rate": 2.335604604131181e-10, + "loss": 0.4161, + "step": 12695 + }, + { + "epoch": 1.0, + "grad_norm": 1.32499533573689, + "learning_rate": 2.2142963961246932e-10, + "loss": 0.4438, + "step": 12696 + }, + { + "epoch": 1.0, + "grad_norm": 1.4826414172354452, + "learning_rate": 2.0962229794418975e-10, + "loss": 0.3903, + "step": 12697 + }, + { + "epoch": 1.0, + "grad_norm": 1.9143893265601541, + "learning_rate": 1.9813843617266793e-10, + "loss": 0.4406, + "step": 12698 + }, + { + "epoch": 1.0, + "grad_norm": 1.964557147942109, + "learning_rate": 1.8697805504175326e-10, + "loss": 0.4741, + "step": 12699 + }, + { + "epoch": 1.0, + "grad_norm": 0.5931841070559926, + "learning_rate": 1.761411552719805e-10, + "loss": 0.4752, + "step": 12700 + }, + { + "epoch": 1.0, + "grad_norm": 1.4855245468592573, + "learning_rate": 1.6562773756556573e-10, + "loss": 0.4715, + "step": 12701 + }, + { + "epoch": 1.0, + "grad_norm": 0.5683864777949463, + "learning_rate": 1.554378026025205e-10, + "loss": 0.4606, + "step": 12702 + }, + { + "epoch": 1.0, + "grad_norm": 1.5715258959469307, + "learning_rate": 1.4557135104231735e-10, + "loss": 0.4362, + "step": 12703 + }, + { + "epoch": 1.0, + "grad_norm": 1.816903592909708, + "learning_rate": 1.3602838352277936e-10, + "loss": 0.4721, + "step": 12704 + }, + { + "epoch": 1.0, + "grad_norm": 1.6929083222985504, + "learning_rate": 1.2680890066119055e-10, + "loss": 0.439, + "step": 12705 + }, + { + "epoch": 1.0, + "grad_norm": 2.2301044291630188, + "learning_rate": 1.1791290305485093e-10, + "loss": 0.3951, + "step": 12706 + }, + { + "epoch": 1.0, + "grad_norm": 1.4302259131335344, + "learning_rate": 1.0934039127885598e-10, + "loss": 0.4174, + "step": 12707 + }, + { + "epoch": 1.0, + "grad_norm": 2.782097524570738, + "learning_rate": 1.0109136588720703e-10, + "loss": 0.4739, + "step": 12708 + }, + { + "epoch": 1.0, + "grad_norm": 1.7439158114583095, + "learning_rate": 9.316582741503155e-11, + "loss": 0.4081, + "step": 12709 + }, + { + "epoch": 1.0, + "grad_norm": 1.991228070544835, + "learning_rate": 8.556377637414237e-11, + "loss": 0.4222, + "step": 12710 + }, + { + "epoch": 1.0, + "grad_norm": 0.55848693736396, + "learning_rate": 7.828521325636829e-11, + "loss": 0.4573, + "step": 12711 + }, + { + "epoch": 1.0, + "grad_norm": 1.7324051743234623, + "learning_rate": 7.133013853299897e-11, + "loss": 0.4124, + "step": 12712 + }, + { + "epoch": 1.0, + "grad_norm": 1.9375806151467723, + "learning_rate": 6.469855265367475e-11, + "loss": 0.4416, + "step": 12713 + }, + { + "epoch": 1.0, + "grad_norm": 1.5298100461225657, + "learning_rate": 5.839045604805194e-11, + "loss": 0.4134, + "step": 12714 + }, + { + "epoch": 1.0, + "grad_norm": 0.5515084004453629, + "learning_rate": 5.240584912358238e-11, + "loss": 0.4532, + "step": 12715 + }, + { + "epoch": 1.0, + "grad_norm": 0.5531722149703644, + "learning_rate": 4.674473226717879e-11, + "loss": 0.4737, + "step": 12716 + }, + { + "epoch": 1.0, + "grad_norm": 1.8600555374023193, + "learning_rate": 4.1407105846325015e-11, + "loss": 0.4011, + "step": 12717 + }, + { + "epoch": 1.0, + "grad_norm": 2.2951534367432265, + "learning_rate": 3.639297020519017e-11, + "loss": 0.4618, + "step": 12718 + }, + { + "epoch": 1.0, + "grad_norm": 0.5309432915281782, + "learning_rate": 3.170232566906961e-11, + "loss": 0.472, + "step": 12719 + }, + { + "epoch": 1.0, + "grad_norm": 1.648632433604236, + "learning_rate": 2.7335172541054222e-11, + "loss": 0.472, + "step": 12720 + }, + { + "epoch": 1.0, + "grad_norm": 1.9891922851986015, + "learning_rate": 2.329151110314065e-11, + "loss": 0.4146, + "step": 12721 + }, + { + "epoch": 1.0, + "grad_norm": 1.4863681096782582, + "learning_rate": 1.9571341617896643e-11, + "loss": 0.4691, + "step": 12722 + }, + { + "epoch": 1.0, + "grad_norm": 0.5474883211736816, + "learning_rate": 1.6174664325685485e-11, + "loss": 0.4772, + "step": 12723 + }, + { + "epoch": 1.0, + "grad_norm": 1.7968771768996952, + "learning_rate": 1.3101479445776222e-11, + "loss": 0.432, + "step": 12724 + }, + { + "epoch": 1.0, + "grad_norm": 1.7850241950049794, + "learning_rate": 1.0351787177453886e-11, + "loss": 0.4187, + "step": 12725 + }, + { + "epoch": 1.0, + "grad_norm": 1.6177864248250622, + "learning_rate": 7.925587698909276e-12, + "loss": 0.4347, + "step": 12726 + }, + { + "epoch": 1.0, + "grad_norm": 1.9385634257563205, + "learning_rate": 5.8228811661287244e-12, + "loss": 0.4436, + "step": 12727 + }, + { + "epoch": 1.0, + "grad_norm": 2.103776297698041, + "learning_rate": 4.043667716224775e-12, + "loss": 0.4528, + "step": 12728 + }, + { + "epoch": 1.0, + "grad_norm": 1.541259578282624, + "learning_rate": 2.5879474641055114e-12, + "loss": 0.4331, + "step": 12729 + }, + { + "epoch": 1.0, + "grad_norm": 1.7721900415780467, + "learning_rate": 1.4557205035847787e-12, + "loss": 0.4684, + "step": 12730 + }, + { + "epoch": 1.0, + "grad_norm": 1.6051586781797063, + "learning_rate": 6.46986907937297e-13, + "loss": 0.4647, + "step": 12731 + }, + { + "epoch": 1.0, + "grad_norm": 3.6430257866029137, + "learning_rate": 1.6174672934354819e-13, + "loss": 0.4528, + "step": 12732 + }, + { + "epoch": 1.0, + "grad_norm": 1.7634476721682628, + "learning_rate": 0.0, + "loss": 0.4617, + "step": 12733 + }, + { + "epoch": 1.0, + "step": 12733, + "total_flos": 3813892823678976.0, + "train_loss": 0.09614035478748979, + "train_runtime": 51328.1871, + "train_samples_per_second": 63.509, + "train_steps_per_second": 0.248 + } + ], + "logging_steps": 1.0, + "max_steps": 12733, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 2000, + "total_flos": 3813892823678976.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}