|
{ |
|
"best_metric": 2.2475366592407227, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-550", |
|
"epoch": 4.002709415217882, |
|
"eval_steps": 25, |
|
"global_step": 554, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.007225107247685708, |
|
"grad_norm": 64.54656219482422, |
|
"learning_rate": 3.7037037037037037e-06, |
|
"loss": 116.6258, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.007225107247685708, |
|
"eval_loss": 2.789177894592285, |
|
"eval_runtime": 0.6553, |
|
"eval_samples_per_second": 76.3, |
|
"eval_steps_per_second": 76.3, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.014450214495371415, |
|
"grad_norm": 54.67939758300781, |
|
"learning_rate": 7.4074074074074075e-06, |
|
"loss": 126.0822, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.021675321743057124, |
|
"grad_norm": 58.601707458496094, |
|
"learning_rate": 1.1111111111111112e-05, |
|
"loss": 128.7215, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.02890042899074283, |
|
"grad_norm": 45.49699020385742, |
|
"learning_rate": 1.4814814814814815e-05, |
|
"loss": 135.9089, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.03612553623842854, |
|
"grad_norm": 47.61927032470703, |
|
"learning_rate": 1.8518518518518518e-05, |
|
"loss": 136.4493, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.04335064348611425, |
|
"grad_norm": 46.85959243774414, |
|
"learning_rate": 2.2222222222222223e-05, |
|
"loss": 140.787, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.050575750733799954, |
|
"grad_norm": 48.88517379760742, |
|
"learning_rate": 2.5925925925925925e-05, |
|
"loss": 150.3139, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.05780085798148566, |
|
"grad_norm": 50.686092376708984, |
|
"learning_rate": 2.962962962962963e-05, |
|
"loss": 150.0237, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.06502596522917137, |
|
"grad_norm": 52.31294631958008, |
|
"learning_rate": 3.3333333333333335e-05, |
|
"loss": 158.4682, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.07225107247685708, |
|
"grad_norm": 47.285072326660156, |
|
"learning_rate": 3.7037037037037037e-05, |
|
"loss": 156.9395, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.07947617972454278, |
|
"grad_norm": 49.950775146484375, |
|
"learning_rate": 4.074074074074074e-05, |
|
"loss": 163.8341, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.0867012869722285, |
|
"grad_norm": 52.288211822509766, |
|
"learning_rate": 4.4444444444444447e-05, |
|
"loss": 158.1243, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.09392639421991421, |
|
"grad_norm": 50.71270751953125, |
|
"learning_rate": 4.814814814814815e-05, |
|
"loss": 158.3429, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.10115150146759991, |
|
"grad_norm": 51.72929763793945, |
|
"learning_rate": 5.185185185185185e-05, |
|
"loss": 172.155, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.10837660871528562, |
|
"grad_norm": 60.08763885498047, |
|
"learning_rate": 5.555555555555556e-05, |
|
"loss": 174.4488, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.11560171596297132, |
|
"grad_norm": 57.922359466552734, |
|
"learning_rate": 5.925925925925926e-05, |
|
"loss": 175.58, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.12282682321065704, |
|
"grad_norm": 70.09874725341797, |
|
"learning_rate": 6.296296296296296e-05, |
|
"loss": 183.6348, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.13005193045834273, |
|
"grad_norm": 63.958580017089844, |
|
"learning_rate": 6.666666666666667e-05, |
|
"loss": 187.0141, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.13727703770602845, |
|
"grad_norm": 65.56310272216797, |
|
"learning_rate": 7.037037037037038e-05, |
|
"loss": 189.6452, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.14450214495371416, |
|
"grad_norm": 70.80933380126953, |
|
"learning_rate": 7.407407407407407e-05, |
|
"loss": 184.7314, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.15172725220139988, |
|
"grad_norm": 78.70838165283203, |
|
"learning_rate": 7.777777777777778e-05, |
|
"loss": 202.2183, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.15895235944908556, |
|
"grad_norm": 69.47725677490234, |
|
"learning_rate": 8.148148148148148e-05, |
|
"loss": 190.7609, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.16617746669677128, |
|
"grad_norm": 73.57704162597656, |
|
"learning_rate": 8.518518518518518e-05, |
|
"loss": 201.8692, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.173402573944457, |
|
"grad_norm": 77.79843139648438, |
|
"learning_rate": 8.888888888888889e-05, |
|
"loss": 203.2968, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.1806276811921427, |
|
"grad_norm": 81.88188934326172, |
|
"learning_rate": 9.25925925925926e-05, |
|
"loss": 201.7194, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.1806276811921427, |
|
"eval_loss": 2.638108491897583, |
|
"eval_runtime": 0.6507, |
|
"eval_samples_per_second": 76.835, |
|
"eval_steps_per_second": 76.835, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.18785278843982842, |
|
"grad_norm": 83.04781341552734, |
|
"learning_rate": 9.62962962962963e-05, |
|
"loss": 198.6217, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.1950778956875141, |
|
"grad_norm": 85.5269775390625, |
|
"learning_rate": 0.0001, |
|
"loss": 195.5746, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.20230300293519982, |
|
"grad_norm": 87.42471313476562, |
|
"learning_rate": 9.999920042400544e-05, |
|
"loss": 202.2589, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.20952811018288553, |
|
"grad_norm": 94.57598876953125, |
|
"learning_rate": 9.999680172443598e-05, |
|
"loss": 203.4591, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.21675321743057124, |
|
"grad_norm": 95.3369369506836, |
|
"learning_rate": 9.999280398653359e-05, |
|
"loss": 202.5068, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.22397832467825693, |
|
"grad_norm": 105.30885314941406, |
|
"learning_rate": 9.998720735236468e-05, |
|
"loss": 209.9333, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.23120343192594264, |
|
"grad_norm": 109.33146667480469, |
|
"learning_rate": 9.998001202081524e-05, |
|
"loss": 209.2891, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.23842853917362836, |
|
"grad_norm": 115.16373443603516, |
|
"learning_rate": 9.997121824758367e-05, |
|
"loss": 204.809, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.24565364642131407, |
|
"grad_norm": 138.49798583984375, |
|
"learning_rate": 9.996082634517176e-05, |
|
"loss": 208.7707, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.25287875366899976, |
|
"grad_norm": 144.79405212402344, |
|
"learning_rate": 9.994883668287352e-05, |
|
"loss": 118.8286, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.26010386091668547, |
|
"grad_norm": 121.61618041992188, |
|
"learning_rate": 9.993524968676216e-05, |
|
"loss": 124.1272, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.2673289681643712, |
|
"grad_norm": 115.61026000976562, |
|
"learning_rate": 9.99200658396748e-05, |
|
"loss": 133.4779, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.2745540754120569, |
|
"grad_norm": 116.54967498779297, |
|
"learning_rate": 9.99032856811954e-05, |
|
"loss": 136.8838, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.2817791826597426, |
|
"grad_norm": 119.98362731933594, |
|
"learning_rate": 9.988490980763562e-05, |
|
"loss": 141.997, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.2890042899074283, |
|
"grad_norm": 103.40101623535156, |
|
"learning_rate": 9.98649388720136e-05, |
|
"loss": 149.3798, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.29622939715511404, |
|
"grad_norm": 78.41800689697266, |
|
"learning_rate": 9.984337358403068e-05, |
|
"loss": 144.1501, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.30345450440279975, |
|
"grad_norm": 70.429931640625, |
|
"learning_rate": 9.982021471004624e-05, |
|
"loss": 150.1001, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.3106796116504854, |
|
"grad_norm": 68.79240417480469, |
|
"learning_rate": 9.979546307305052e-05, |
|
"loss": 152.7282, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.3179047188981711, |
|
"grad_norm": 64.10746765136719, |
|
"learning_rate": 9.976911955263529e-05, |
|
"loss": 150.7592, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.32512982614585684, |
|
"grad_norm": 63.97102737426758, |
|
"learning_rate": 9.974118508496258e-05, |
|
"loss": 158.1265, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.33235493339354255, |
|
"grad_norm": 59.588539123535156, |
|
"learning_rate": 9.971166066273153e-05, |
|
"loss": 152.0917, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.33958004064122826, |
|
"grad_norm": 56.97199249267578, |
|
"learning_rate": 9.9680547335143e-05, |
|
"loss": 153.4449, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.346805147888914, |
|
"grad_norm": 60.04453659057617, |
|
"learning_rate": 9.964784620786228e-05, |
|
"loss": 164.4727, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.3540302551365997, |
|
"grad_norm": 64.35054016113281, |
|
"learning_rate": 9.961355844297988e-05, |
|
"loss": 171.0691, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.3612553623842854, |
|
"grad_norm": 62.75254440307617, |
|
"learning_rate": 9.957768525897023e-05, |
|
"loss": 171.733, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.3612553623842854, |
|
"eval_loss": 2.5156970024108887, |
|
"eval_runtime": 0.6517, |
|
"eval_samples_per_second": 76.722, |
|
"eval_steps_per_second": 76.722, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.3684804696319711, |
|
"grad_norm": 62.48736572265625, |
|
"learning_rate": 9.954022793064826e-05, |
|
"loss": 169.8293, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.37570557687965683, |
|
"grad_norm": 60.11751174926758, |
|
"learning_rate": 9.950118778912423e-05, |
|
"loss": 168.2545, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.3829306841273425, |
|
"grad_norm": 72.86707305908203, |
|
"learning_rate": 9.946056622175634e-05, |
|
"loss": 184.524, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.3901557913750282, |
|
"grad_norm": 68.04542541503906, |
|
"learning_rate": 9.941836467210152e-05, |
|
"loss": 185.4, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.3973808986227139, |
|
"grad_norm": 66.58963012695312, |
|
"learning_rate": 9.937458463986401e-05, |
|
"loss": 181.0175, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.40460600587039963, |
|
"grad_norm": 63.77046203613281, |
|
"learning_rate": 9.932922768084218e-05, |
|
"loss": 182.9669, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.41183111311808535, |
|
"grad_norm": 64.7295150756836, |
|
"learning_rate": 9.928229540687316e-05, |
|
"loss": 180.1315, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.41905622036577106, |
|
"grad_norm": 65.93685913085938, |
|
"learning_rate": 9.923378948577559e-05, |
|
"loss": 187.6352, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.4262813276134568, |
|
"grad_norm": 68.34617614746094, |
|
"learning_rate": 9.918371164129037e-05, |
|
"loss": 186.9591, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.4335064348611425, |
|
"grad_norm": 66.827880859375, |
|
"learning_rate": 9.913206365301939e-05, |
|
"loss": 187.058, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.4407315421088282, |
|
"grad_norm": 76.42950439453125, |
|
"learning_rate": 9.907884735636226e-05, |
|
"loss": 193.3964, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.44795664935651386, |
|
"grad_norm": 72.85874938964844, |
|
"learning_rate": 9.902406464245115e-05, |
|
"loss": 181.1469, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.45518175660419957, |
|
"grad_norm": 71.87165832519531, |
|
"learning_rate": 9.896771745808349e-05, |
|
"loss": 189.0317, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.4624068638518853, |
|
"grad_norm": 89.61693572998047, |
|
"learning_rate": 9.89098078056529e-05, |
|
"loss": 186.9491, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.469631971099571, |
|
"grad_norm": 82.98321533203125, |
|
"learning_rate": 9.885033774307798e-05, |
|
"loss": 181.5333, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.4768570783472567, |
|
"grad_norm": 85.03607940673828, |
|
"learning_rate": 9.87893093837291e-05, |
|
"loss": 186.9785, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.4840821855949424, |
|
"grad_norm": 97.14375305175781, |
|
"learning_rate": 9.872672489635346e-05, |
|
"loss": 191.9914, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.49130729284262814, |
|
"grad_norm": 132.39361572265625, |
|
"learning_rate": 9.866258650499787e-05, |
|
"loss": 194.7207, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.49853240009031385, |
|
"grad_norm": 79.9813003540039, |
|
"learning_rate": 9.859689648892982e-05, |
|
"loss": 114.7597, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.5057575073379995, |
|
"grad_norm": 62.79768371582031, |
|
"learning_rate": 9.852965718255638e-05, |
|
"loss": 117.4539, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.5129826145856853, |
|
"grad_norm": 65.06932830810547, |
|
"learning_rate": 9.84608709753414e-05, |
|
"loss": 126.1914, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.5202077218333709, |
|
"grad_norm": 63.830989837646484, |
|
"learning_rate": 9.839054031172038e-05, |
|
"loss": 132.7605, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.5274328290810567, |
|
"grad_norm": 56.553497314453125, |
|
"learning_rate": 9.831866769101381e-05, |
|
"loss": 127.5648, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.5346579363287424, |
|
"grad_norm": 57.750732421875, |
|
"learning_rate": 9.824525566733823e-05, |
|
"loss": 136.0356, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.5418830435764281, |
|
"grad_norm": 49.138206481933594, |
|
"learning_rate": 9.817030684951549e-05, |
|
"loss": 143.931, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.5418830435764281, |
|
"eval_loss": 2.4383444786071777, |
|
"eval_runtime": 0.6413, |
|
"eval_samples_per_second": 77.973, |
|
"eval_steps_per_second": 77.973, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.5491081508241138, |
|
"grad_norm": 47.41360855102539, |
|
"learning_rate": 9.809382390098004e-05, |
|
"loss": 140.3533, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.5563332580717995, |
|
"grad_norm": 47.02049255371094, |
|
"learning_rate": 9.801580953968435e-05, |
|
"loss": 145.456, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.5635583653194852, |
|
"grad_norm": 46.56816482543945, |
|
"learning_rate": 9.793626653800219e-05, |
|
"loss": 145.1962, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.5707834725671709, |
|
"grad_norm": 50.03951644897461, |
|
"learning_rate": 9.785519772263025e-05, |
|
"loss": 149.2638, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.5780085798148566, |
|
"grad_norm": 50.39605712890625, |
|
"learning_rate": 9.777260597448753e-05, |
|
"loss": 159.7057, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.5852336870625423, |
|
"grad_norm": 47.88210678100586, |
|
"learning_rate": 9.768849422861313e-05, |
|
"loss": 157.102, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.5924587943102281, |
|
"grad_norm": 48.747520446777344, |
|
"learning_rate": 9.760286547406186e-05, |
|
"loss": 162.9211, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.5996839015579137, |
|
"grad_norm": 47.951412200927734, |
|
"learning_rate": 9.7515722753798e-05, |
|
"loss": 164.9853, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.6069090088055995, |
|
"grad_norm": 51.72910690307617, |
|
"learning_rate": 9.74270691645872e-05, |
|
"loss": 164.8267, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.6141341160532852, |
|
"grad_norm": 55.5040397644043, |
|
"learning_rate": 9.73369078568864e-05, |
|
"loss": 172.8293, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.6213592233009708, |
|
"grad_norm": 57.3427848815918, |
|
"learning_rate": 9.724524203473197e-05, |
|
"loss": 169.605, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.6285843305486566, |
|
"grad_norm": 56.78362274169922, |
|
"learning_rate": 9.715207495562573e-05, |
|
"loss": 171.3703, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.6358094377963422, |
|
"grad_norm": 61.482643127441406, |
|
"learning_rate": 9.70574099304192e-05, |
|
"loss": 173.9247, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.643034545044028, |
|
"grad_norm": 60.386348724365234, |
|
"learning_rate": 9.6961250323196e-05, |
|
"loss": 168.3157, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.6502596522917137, |
|
"grad_norm": 64.57029724121094, |
|
"learning_rate": 9.686359955115235e-05, |
|
"loss": 177.178, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.6574847595393994, |
|
"grad_norm": 69.96235656738281, |
|
"learning_rate": 9.676446108447545e-05, |
|
"loss": 174.5947, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.6647098667870851, |
|
"grad_norm": 63.585506439208984, |
|
"learning_rate": 9.666383844622034e-05, |
|
"loss": 175.7491, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.6719349740347709, |
|
"grad_norm": 69.02960205078125, |
|
"learning_rate": 9.656173521218463e-05, |
|
"loss": 178.0728, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.6791600812824565, |
|
"grad_norm": 64.00545501708984, |
|
"learning_rate": 9.645815501078142e-05, |
|
"loss": 183.6833, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.6863851885301423, |
|
"grad_norm": 65.82809448242188, |
|
"learning_rate": 9.635310152291039e-05, |
|
"loss": 176.3168, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.693610295777828, |
|
"grad_norm": 74.30838012695312, |
|
"learning_rate": 9.624657848182693e-05, |
|
"loss": 184.4491, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.7008354030255136, |
|
"grad_norm": 80.60615539550781, |
|
"learning_rate": 9.61385896730096e-05, |
|
"loss": 179.8313, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.7080605102731994, |
|
"grad_norm": 80.63772583007812, |
|
"learning_rate": 9.602913893402546e-05, |
|
"loss": 188.0501, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.715285617520885, |
|
"grad_norm": 84.3320541381836, |
|
"learning_rate": 9.591823015439374e-05, |
|
"loss": 188.2311, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.7225107247685708, |
|
"grad_norm": 81.29312896728516, |
|
"learning_rate": 9.580586727544771e-05, |
|
"loss": 175.6401, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.7225107247685708, |
|
"eval_loss": 2.393695592880249, |
|
"eval_runtime": 0.6484, |
|
"eval_samples_per_second": 77.118, |
|
"eval_steps_per_second": 77.118, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.7297358320162565, |
|
"grad_norm": 97.86168670654297, |
|
"learning_rate": 9.569205429019452e-05, |
|
"loss": 183.8533, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.7369609392639422, |
|
"grad_norm": 120.66143035888672, |
|
"learning_rate": 9.557679524317331e-05, |
|
"loss": 183.9207, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.7441860465116279, |
|
"grad_norm": 59.1573600769043, |
|
"learning_rate": 9.54600942303115e-05, |
|
"loss": 111.658, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.7514111537593137, |
|
"grad_norm": 47.51983642578125, |
|
"learning_rate": 9.534195539877922e-05, |
|
"loss": 116.4757, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.7586362610069993, |
|
"grad_norm": 41.60358428955078, |
|
"learning_rate": 9.522238294684203e-05, |
|
"loss": 123.7686, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.765861368254685, |
|
"grad_norm": 43.5106201171875, |
|
"learning_rate": 9.510138112371153e-05, |
|
"loss": 127.939, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.7730864755023708, |
|
"grad_norm": 50.11954116821289, |
|
"learning_rate": 9.497895422939455e-05, |
|
"loss": 129.1992, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.7803115827500564, |
|
"grad_norm": 43.32356262207031, |
|
"learning_rate": 9.485510661454022e-05, |
|
"loss": 136.5778, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.7875366899977422, |
|
"grad_norm": 44.33633804321289, |
|
"learning_rate": 9.472984268028544e-05, |
|
"loss": 141.9382, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.7947617972454278, |
|
"grad_norm": 40.56188201904297, |
|
"learning_rate": 9.46031668780984e-05, |
|
"loss": 139.9151, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.8019869044931136, |
|
"grad_norm": 40.898475646972656, |
|
"learning_rate": 9.44750837096205e-05, |
|
"loss": 141.2808, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.8092120117407993, |
|
"grad_norm": 40.37065887451172, |
|
"learning_rate": 9.43455977265062e-05, |
|
"loss": 146.3503, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.816437118988485, |
|
"grad_norm": 42.96000289916992, |
|
"learning_rate": 9.421471353026149e-05, |
|
"loss": 151.9168, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.8236622262361707, |
|
"grad_norm": 43.85641098022461, |
|
"learning_rate": 9.40824357720802e-05, |
|
"loss": 153.8375, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.8308873334838563, |
|
"grad_norm": 47.796295166015625, |
|
"learning_rate": 9.394876915267878e-05, |
|
"loss": 148.2482, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.8381124407315421, |
|
"grad_norm": 45.746238708496094, |
|
"learning_rate": 9.381371842212923e-05, |
|
"loss": 157.8461, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.8453375479792278, |
|
"grad_norm": 47.07131576538086, |
|
"learning_rate": 9.36772883796903e-05, |
|
"loss": 158.5688, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.8525626552269135, |
|
"grad_norm": 47.55768585205078, |
|
"learning_rate": 9.353948387363699e-05, |
|
"loss": 162.7859, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.8597877624745992, |
|
"grad_norm": 52.72948455810547, |
|
"learning_rate": 9.340030980108816e-05, |
|
"loss": 162.023, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.867012869722285, |
|
"grad_norm": 55.043453216552734, |
|
"learning_rate": 9.325977110783264e-05, |
|
"loss": 169.4957, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.8742379769699706, |
|
"grad_norm": 54.188453674316406, |
|
"learning_rate": 9.311787278815328e-05, |
|
"loss": 165.3426, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.8814630842176564, |
|
"grad_norm": 58.27094650268555, |
|
"learning_rate": 9.297461988464967e-05, |
|
"loss": 172.4739, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.8886881914653421, |
|
"grad_norm": 56.99961853027344, |
|
"learning_rate": 9.28300174880588e-05, |
|
"loss": 162.9698, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.8959132987130277, |
|
"grad_norm": 61.62599182128906, |
|
"learning_rate": 9.268407073707426e-05, |
|
"loss": 172.4337, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.9031384059607135, |
|
"grad_norm": 61.69138717651367, |
|
"learning_rate": 9.253678481816351e-05, |
|
"loss": 171.1664, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.9031384059607135, |
|
"eval_loss": 2.360010862350464, |
|
"eval_runtime": 0.6436, |
|
"eval_samples_per_second": 77.684, |
|
"eval_steps_per_second": 77.684, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.9103635132083991, |
|
"grad_norm": 69.07987976074219, |
|
"learning_rate": 9.238816496538369e-05, |
|
"loss": 172.1746, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.9175886204560849, |
|
"grad_norm": 68.58795928955078, |
|
"learning_rate": 9.223821646019553e-05, |
|
"loss": 173.2199, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.9248137277037706, |
|
"grad_norm": 71.02379608154297, |
|
"learning_rate": 9.208694463127569e-05, |
|
"loss": 181.6123, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.9320388349514563, |
|
"grad_norm": 72.92750549316406, |
|
"learning_rate": 9.193435485432745e-05, |
|
"loss": 177.9379, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.939263942199142, |
|
"grad_norm": 71.89237213134766, |
|
"learning_rate": 9.178045255188955e-05, |
|
"loss": 179.2086, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.9464890494468278, |
|
"grad_norm": 75.93486785888672, |
|
"learning_rate": 9.162524319314366e-05, |
|
"loss": 176.8553, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.9537141566945134, |
|
"grad_norm": 74.88329315185547, |
|
"learning_rate": 9.146873229371984e-05, |
|
"loss": 186.3932, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.9609392639421992, |
|
"grad_norm": 74.99288940429688, |
|
"learning_rate": 9.131092541550072e-05, |
|
"loss": 181.7378, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.9681643711898849, |
|
"grad_norm": 83.1282958984375, |
|
"learning_rate": 9.115182816642369e-05, |
|
"loss": 182.5549, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.9753894784375705, |
|
"grad_norm": 88.4671401977539, |
|
"learning_rate": 9.099144620028166e-05, |
|
"loss": 177.1656, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.9826145856852563, |
|
"grad_norm": 138.13075256347656, |
|
"learning_rate": 9.082978521652222e-05, |
|
"loss": 179.7221, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.9898396929329419, |
|
"grad_norm": 57.780609130859375, |
|
"learning_rate": 9.066685096004499e-05, |
|
"loss": 136.3332, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.9970648001806277, |
|
"grad_norm": 62.77166748046875, |
|
"learning_rate": 9.050264922099755e-05, |
|
"loss": 161.6082, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 1.0042899074283134, |
|
"grad_norm": 58.23940658569336, |
|
"learning_rate": 9.033718583456961e-05, |
|
"loss": 142.1509, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 1.011515014675999, |
|
"grad_norm": 44.70951461791992, |
|
"learning_rate": 9.017046668078572e-05, |
|
"loss": 111.9812, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.018740121923685, |
|
"grad_norm": 32.90129089355469, |
|
"learning_rate": 9.000249768429621e-05, |
|
"loss": 121.872, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 1.0259652291713706, |
|
"grad_norm": 33.142757415771484, |
|
"learning_rate": 8.983328481416675e-05, |
|
"loss": 121.8843, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 1.0331903364190562, |
|
"grad_norm": 37.793434143066406, |
|
"learning_rate": 8.966283408366621e-05, |
|
"loss": 129.8525, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 1.0404154436667419, |
|
"grad_norm": 35.80763244628906, |
|
"learning_rate": 8.949115155005289e-05, |
|
"loss": 131.2904, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 1.0476405509144275, |
|
"grad_norm": 42.35646438598633, |
|
"learning_rate": 8.931824331435937e-05, |
|
"loss": 141.2367, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 1.0548656581621134, |
|
"grad_norm": 39.78588104248047, |
|
"learning_rate": 8.914411552117559e-05, |
|
"loss": 136.225, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 1.062090765409799, |
|
"grad_norm": 38.78228759765625, |
|
"learning_rate": 8.896877435843063e-05, |
|
"loss": 140.3088, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 1.0693158726574847, |
|
"grad_norm": 39.07134246826172, |
|
"learning_rate": 8.879222605717268e-05, |
|
"loss": 143.1922, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 1.0765409799051704, |
|
"grad_norm": 41.59137725830078, |
|
"learning_rate": 8.861447689134768e-05, |
|
"loss": 150.6414, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 1.0837660871528563, |
|
"grad_norm": 42.95624923706055, |
|
"learning_rate": 8.843553317757632e-05, |
|
"loss": 147.6945, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.0837660871528563, |
|
"eval_loss": 2.3478291034698486, |
|
"eval_runtime": 0.6442, |
|
"eval_samples_per_second": 77.616, |
|
"eval_steps_per_second": 77.616, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.090991194400542, |
|
"grad_norm": 42.467933654785156, |
|
"learning_rate": 8.825540127492967e-05, |
|
"loss": 140.8872, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 1.0982163016482276, |
|
"grad_norm": 42.057655334472656, |
|
"learning_rate": 8.807408758470302e-05, |
|
"loss": 153.8803, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 1.1054414088959132, |
|
"grad_norm": 43.9022102355957, |
|
"learning_rate": 8.789159855018858e-05, |
|
"loss": 154.978, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 1.112666516143599, |
|
"grad_norm": 48.20111846923828, |
|
"learning_rate": 8.770794065644639e-05, |
|
"loss": 161.4774, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 1.1198916233912848, |
|
"grad_norm": 53.01012420654297, |
|
"learning_rate": 8.752312043007396e-05, |
|
"loss": 166.7152, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.1271167306389704, |
|
"grad_norm": 54.39327621459961, |
|
"learning_rate": 8.73371444389742e-05, |
|
"loss": 162.619, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.134341837886656, |
|
"grad_norm": 54.17763900756836, |
|
"learning_rate": 8.715001929212214e-05, |
|
"loss": 169.5686, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 1.1415669451343418, |
|
"grad_norm": 51.94620895385742, |
|
"learning_rate": 8.696175163933004e-05, |
|
"loss": 165.3075, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 1.1487920523820276, |
|
"grad_norm": 57.945369720458984, |
|
"learning_rate": 8.677234817101101e-05, |
|
"loss": 168.0594, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 1.1560171596297133, |
|
"grad_norm": 59.11105728149414, |
|
"learning_rate": 8.658181561794137e-05, |
|
"loss": 164.0512, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.163242266877399, |
|
"grad_norm": 63.255611419677734, |
|
"learning_rate": 8.639016075102136e-05, |
|
"loss": 167.9124, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 1.1704673741250846, |
|
"grad_norm": 62.79253005981445, |
|
"learning_rate": 8.619739038103456e-05, |
|
"loss": 168.2611, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 1.1776924813727705, |
|
"grad_norm": 66.78952026367188, |
|
"learning_rate": 8.600351135840589e-05, |
|
"loss": 167.1303, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 1.1849175886204562, |
|
"grad_norm": 65.6444320678711, |
|
"learning_rate": 8.580853057295813e-05, |
|
"loss": 174.8452, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 1.1921426958681418, |
|
"grad_norm": 68.41908264160156, |
|
"learning_rate": 8.561245495366706e-05, |
|
"loss": 173.017, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.1993678031158275, |
|
"grad_norm": 72.44266510009766, |
|
"learning_rate": 8.541529146841526e-05, |
|
"loss": 179.2277, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 1.2065929103635131, |
|
"grad_norm": 79.08995819091797, |
|
"learning_rate": 8.521704712374453e-05, |
|
"loss": 162.5972, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 1.213818017611199, |
|
"grad_norm": 81.43531799316406, |
|
"learning_rate": 8.50177289646068e-05, |
|
"loss": 175.9867, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.2210431248588847, |
|
"grad_norm": 79.34605407714844, |
|
"learning_rate": 8.48173440741139e-05, |
|
"loss": 178.2415, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 1.2282682321065703, |
|
"grad_norm": 81.79105377197266, |
|
"learning_rate": 8.46158995732857e-05, |
|
"loss": 178.3049, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.235493339354256, |
|
"grad_norm": 85.38658142089844, |
|
"learning_rate": 8.44134026207972e-05, |
|
"loss": 174.3011, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 1.2427184466019416, |
|
"grad_norm": 104.00491333007812, |
|
"learning_rate": 8.420986041272407e-05, |
|
"loss": 182.388, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 1.2499435538496275, |
|
"grad_norm": 105.1968002319336, |
|
"learning_rate": 8.400528018228688e-05, |
|
"loss": 142.8601, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 1.2571686610973132, |
|
"grad_norm": 91.36693572998047, |
|
"learning_rate": 8.379966919959416e-05, |
|
"loss": 112.5672, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 1.2643937683449988, |
|
"grad_norm": 61.239349365234375, |
|
"learning_rate": 8.359303477138393e-05, |
|
"loss": 117.836, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.2643937683449988, |
|
"eval_loss": 2.333247423171997, |
|
"eval_runtime": 0.6482, |
|
"eval_samples_per_second": 77.14, |
|
"eval_steps_per_second": 77.14, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.2716188755926847, |
|
"grad_norm": 44.947166442871094, |
|
"learning_rate": 8.338538424076411e-05, |
|
"loss": 119.2028, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 1.2788439828403702, |
|
"grad_norm": 51.20450210571289, |
|
"learning_rate": 8.317672498695162e-05, |
|
"loss": 126.114, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 1.286069090088056, |
|
"grad_norm": 45.0206184387207, |
|
"learning_rate": 8.296706442500998e-05, |
|
"loss": 124.9417, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 1.2932941973357417, |
|
"grad_norm": 47.35272216796875, |
|
"learning_rate": 8.275641000558598e-05, |
|
"loss": 136.7816, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 1.3005193045834273, |
|
"grad_norm": 42.55514144897461, |
|
"learning_rate": 8.254476921464484e-05, |
|
"loss": 134.4095, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.3077444118311132, |
|
"grad_norm": 41.750221252441406, |
|
"learning_rate": 8.233214957320411e-05, |
|
"loss": 137.0252, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 1.3149695190787989, |
|
"grad_norm": 44.95912170410156, |
|
"learning_rate": 8.211855863706654e-05, |
|
"loss": 143.0184, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 1.3221946263264845, |
|
"grad_norm": 46.76531219482422, |
|
"learning_rate": 8.190400399655147e-05, |
|
"loss": 145.2914, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 1.3294197335741702, |
|
"grad_norm": 47.532100677490234, |
|
"learning_rate": 8.168849327622513e-05, |
|
"loss": 149.4524, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 1.3366448408218559, |
|
"grad_norm": 47.3936767578125, |
|
"learning_rate": 8.147203413462967e-05, |
|
"loss": 144.6819, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.3438699480695417, |
|
"grad_norm": 45.35457229614258, |
|
"learning_rate": 8.125463426401101e-05, |
|
"loss": 150.504, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 1.3510950553172274, |
|
"grad_norm": 48.21922302246094, |
|
"learning_rate": 8.103630139004553e-05, |
|
"loss": 155.5509, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 1.358320162564913, |
|
"grad_norm": 50.7261962890625, |
|
"learning_rate": 8.08170432715654e-05, |
|
"loss": 160.2927, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 1.3655452698125987, |
|
"grad_norm": 52.04977035522461, |
|
"learning_rate": 8.059686770028303e-05, |
|
"loss": 154.3667, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 1.3727703770602844, |
|
"grad_norm": 52.68675994873047, |
|
"learning_rate": 8.037578250051399e-05, |
|
"loss": 155.9036, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.3799954843079703, |
|
"grad_norm": 54.34752655029297, |
|
"learning_rate": 8.015379552889913e-05, |
|
"loss": 156.5415, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 1.387220591555656, |
|
"grad_norm": 51.08332443237305, |
|
"learning_rate": 7.993091467412527e-05, |
|
"loss": 164.3319, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 1.3944456988033416, |
|
"grad_norm": 57.97858428955078, |
|
"learning_rate": 7.970714785664492e-05, |
|
"loss": 170.2659, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 1.4016708060510275, |
|
"grad_norm": 63.892784118652344, |
|
"learning_rate": 7.948250302839476e-05, |
|
"loss": 169.5753, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 1.408895913298713, |
|
"grad_norm": 62.205596923828125, |
|
"learning_rate": 7.92569881725131e-05, |
|
"loss": 170.2137, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.4161210205463988, |
|
"grad_norm": 59.26729202270508, |
|
"learning_rate": 7.903061130305616e-05, |
|
"loss": 158.9264, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 1.4233461277940844, |
|
"grad_norm": 60.39152908325195, |
|
"learning_rate": 7.880338046471331e-05, |
|
"loss": 169.5714, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 1.43057123504177, |
|
"grad_norm": 71.50019073486328, |
|
"learning_rate": 7.857530373252116e-05, |
|
"loss": 171.8552, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 1.437796342289456, |
|
"grad_norm": 68.2151870727539, |
|
"learning_rate": 7.83463892115766e-05, |
|
"loss": 168.0481, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 1.4450214495371416, |
|
"grad_norm": 79.76618957519531, |
|
"learning_rate": 7.811664503674875e-05, |
|
"loss": 170.3496, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.4450214495371416, |
|
"eval_loss": 2.3155527114868164, |
|
"eval_runtime": 0.6439, |
|
"eval_samples_per_second": 77.647, |
|
"eval_steps_per_second": 77.647, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.4522465567848273, |
|
"grad_norm": 78.41180419921875, |
|
"learning_rate": 7.788607937238995e-05, |
|
"loss": 184.1455, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 1.459471664032513, |
|
"grad_norm": 73.47679138183594, |
|
"learning_rate": 7.765470041204553e-05, |
|
"loss": 168.6345, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 1.4666967712801986, |
|
"grad_norm": 84.49844360351562, |
|
"learning_rate": 7.742251637816274e-05, |
|
"loss": 179.5529, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 1.4739218785278845, |
|
"grad_norm": 83.9566879272461, |
|
"learning_rate": 7.718953552179841e-05, |
|
"loss": 175.5488, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 1.4811469857755701, |
|
"grad_norm": 85.65151977539062, |
|
"learning_rate": 7.695576612232591e-05, |
|
"loss": 174.084, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.4883720930232558, |
|
"grad_norm": 113.66490936279297, |
|
"learning_rate": 7.67212164871408e-05, |
|
"loss": 175.5837, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 1.4955972002709415, |
|
"grad_norm": 89.42914581298828, |
|
"learning_rate": 7.64858949513656e-05, |
|
"loss": 139.4319, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 1.502822307518627, |
|
"grad_norm": 38.07080078125, |
|
"learning_rate": 7.624980987755375e-05, |
|
"loss": 111.8192, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 1.510047414766313, |
|
"grad_norm": 33.71562576293945, |
|
"learning_rate": 7.601296965539225e-05, |
|
"loss": 116.1872, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 1.5172725220139986, |
|
"grad_norm": 42.460052490234375, |
|
"learning_rate": 7.577538270140358e-05, |
|
"loss": 123.3414, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.5244976292616843, |
|
"grad_norm": 39.355186462402344, |
|
"learning_rate": 7.553705745864661e-05, |
|
"loss": 127.2322, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 1.5317227365093702, |
|
"grad_norm": 39.048091888427734, |
|
"learning_rate": 7.529800239641664e-05, |
|
"loss": 126.9881, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 1.5389478437570556, |
|
"grad_norm": 39.43886184692383, |
|
"learning_rate": 7.505822600994424e-05, |
|
"loss": 135.6013, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 1.5461729510047415, |
|
"grad_norm": 37.615814208984375, |
|
"learning_rate": 7.481773682009356e-05, |
|
"loss": 130.4914, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 1.5533980582524272, |
|
"grad_norm": 40.4903450012207, |
|
"learning_rate": 7.457654337305941e-05, |
|
"loss": 141.4838, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.5606231655001128, |
|
"grad_norm": 36.969791412353516, |
|
"learning_rate": 7.433465424006356e-05, |
|
"loss": 136.31, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 1.5678482727477987, |
|
"grad_norm": 41.90666198730469, |
|
"learning_rate": 7.40920780170502e-05, |
|
"loss": 140.6337, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 1.5750733799954844, |
|
"grad_norm": 43.43820571899414, |
|
"learning_rate": 7.384882332438046e-05, |
|
"loss": 149.4489, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 1.58229848724317, |
|
"grad_norm": 45.44638442993164, |
|
"learning_rate": 7.360489880652599e-05, |
|
"loss": 144.4296, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 1.5895235944908557, |
|
"grad_norm": 45.75027847290039, |
|
"learning_rate": 7.336031313176187e-05, |
|
"loss": 151.5403, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.5967487017385413, |
|
"grad_norm": 45.903900146484375, |
|
"learning_rate": 7.311507499185849e-05, |
|
"loss": 148.2643, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 1.6039738089862272, |
|
"grad_norm": 52.591922760009766, |
|
"learning_rate": 7.286919310177274e-05, |
|
"loss": 157.8545, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 1.6111989162339129, |
|
"grad_norm": 52.4876708984375, |
|
"learning_rate": 7.262267619933825e-05, |
|
"loss": 159.6404, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 1.6184240234815985, |
|
"grad_norm": 54.48468780517578, |
|
"learning_rate": 7.23755330449549e-05, |
|
"loss": 166.2668, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 1.6256491307292844, |
|
"grad_norm": 52.32865524291992, |
|
"learning_rate": 7.212777242127752e-05, |
|
"loss": 166.8458, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.6256491307292844, |
|
"eval_loss": 2.3025312423706055, |
|
"eval_runtime": 0.6658, |
|
"eval_samples_per_second": 75.1, |
|
"eval_steps_per_second": 75.1, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.6328742379769698, |
|
"grad_norm": 55.27311325073242, |
|
"learning_rate": 7.187940313290375e-05, |
|
"loss": 166.1355, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 1.6400993452246557, |
|
"grad_norm": 59.219242095947266, |
|
"learning_rate": 7.163043400606118e-05, |
|
"loss": 169.2431, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 1.6473244524723414, |
|
"grad_norm": 58.01912307739258, |
|
"learning_rate": 7.13808738882937e-05, |
|
"loss": 163.1232, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 1.654549559720027, |
|
"grad_norm": 59.8082275390625, |
|
"learning_rate": 7.113073164814705e-05, |
|
"loss": 162.4904, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 1.661774666967713, |
|
"grad_norm": 65.7035140991211, |
|
"learning_rate": 7.088001617485369e-05, |
|
"loss": 168.8548, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.6689997742153984, |
|
"grad_norm": 66.40797424316406, |
|
"learning_rate": 7.062873637801692e-05, |
|
"loss": 175.5675, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 1.6762248814630842, |
|
"grad_norm": 69.01406860351562, |
|
"learning_rate": 7.037690118729421e-05, |
|
"loss": 170.8191, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 1.68344998871077, |
|
"grad_norm": 75.17335510253906, |
|
"learning_rate": 7.012451955207993e-05, |
|
"loss": 170.9042, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 1.6906750959584556, |
|
"grad_norm": 75.20256805419922, |
|
"learning_rate": 6.987160044118729e-05, |
|
"loss": 173.0992, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 1.6979002032061414, |
|
"grad_norm": 79.28140258789062, |
|
"learning_rate": 6.961815284252958e-05, |
|
"loss": 173.8614, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.705125310453827, |
|
"grad_norm": 76.99756622314453, |
|
"learning_rate": 6.936418576280083e-05, |
|
"loss": 181.1917, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 1.7123504177015128, |
|
"grad_norm": 81.97278594970703, |
|
"learning_rate": 6.910970822715577e-05, |
|
"loss": 180.226, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 1.7195755249491986, |
|
"grad_norm": 87.5104751586914, |
|
"learning_rate": 6.885472927888898e-05, |
|
"loss": 172.1478, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 1.726800632196884, |
|
"grad_norm": 95.62020874023438, |
|
"learning_rate": 6.859925797911362e-05, |
|
"loss": 169.1054, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 1.73402573944457, |
|
"grad_norm": 116.52254486083984, |
|
"learning_rate": 6.83433034064394e-05, |
|
"loss": 171.785, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.7412508466922556, |
|
"grad_norm": 90.70625305175781, |
|
"learning_rate": 6.808687465664996e-05, |
|
"loss": 134.6553, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 1.7484759539399413, |
|
"grad_norm": 60.01106643676758, |
|
"learning_rate": 6.782998084237966e-05, |
|
"loss": 110.2753, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 1.7557010611876271, |
|
"grad_norm": 43.751258850097656, |
|
"learning_rate": 6.757263109278972e-05, |
|
"loss": 115.0716, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 1.7629261684353126, |
|
"grad_norm": 31.402307510375977, |
|
"learning_rate": 6.731483455324374e-05, |
|
"loss": 121.6901, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 1.7701512756829985, |
|
"grad_norm": 34.220619201660156, |
|
"learning_rate": 6.705660038498282e-05, |
|
"loss": 125.6248, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 1.7773763829306841, |
|
"grad_norm": 36.18199157714844, |
|
"learning_rate": 6.679793776479994e-05, |
|
"loss": 124.9633, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 1.7846014901783698, |
|
"grad_norm": 38.51054763793945, |
|
"learning_rate": 6.653885588471386e-05, |
|
"loss": 134.9891, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 1.7918265974260557, |
|
"grad_norm": 38.865604400634766, |
|
"learning_rate": 6.627936395164243e-05, |
|
"loss": 137.1066, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 1.799051704673741, |
|
"grad_norm": 39.14519500732422, |
|
"learning_rate": 6.601947118707545e-05, |
|
"loss": 139.0853, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 1.806276811921427, |
|
"grad_norm": 43.307373046875, |
|
"learning_rate": 6.575918682674695e-05, |
|
"loss": 146.7656, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.806276811921427, |
|
"eval_loss": 2.2963333129882812, |
|
"eval_runtime": 0.6468, |
|
"eval_samples_per_second": 77.308, |
|
"eval_steps_per_second": 77.308, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.8135019191691126, |
|
"grad_norm": 39.974761962890625, |
|
"learning_rate": 6.549852012030699e-05, |
|
"loss": 140.9642, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 1.8207270264167983, |
|
"grad_norm": 40.40039825439453, |
|
"learning_rate": 6.523748033099296e-05, |
|
"loss": 144.6558, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 1.8279521336644842, |
|
"grad_norm": 44.24665832519531, |
|
"learning_rate": 6.497607673530033e-05, |
|
"loss": 148.5241, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 1.8351772409121698, |
|
"grad_norm": 43.769508361816406, |
|
"learning_rate": 6.47143186226532e-05, |
|
"loss": 146.4435, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 1.8424023481598555, |
|
"grad_norm": 46.50810623168945, |
|
"learning_rate": 6.445221529507384e-05, |
|
"loss": 155.2083, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.8496274554075414, |
|
"grad_norm": 49.342918395996094, |
|
"learning_rate": 6.418977606685244e-05, |
|
"loss": 150.5372, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 1.8568525626552268, |
|
"grad_norm": 46.81935119628906, |
|
"learning_rate": 6.392701026421602e-05, |
|
"loss": 153.1862, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 1.8640776699029127, |
|
"grad_norm": 54.023563385009766, |
|
"learning_rate": 6.366392722499689e-05, |
|
"loss": 160.635, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 1.8713027771505983, |
|
"grad_norm": 55.154781341552734, |
|
"learning_rate": 6.340053629830097e-05, |
|
"loss": 158.005, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 1.878527884398284, |
|
"grad_norm": 55.1218376159668, |
|
"learning_rate": 6.313684684417547e-05, |
|
"loss": 159.7618, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.8857529916459699, |
|
"grad_norm": 56.447635650634766, |
|
"learning_rate": 6.287286823327627e-05, |
|
"loss": 164.9144, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 1.8929780988936553, |
|
"grad_norm": 56.978450775146484, |
|
"learning_rate": 6.260860984653495e-05, |
|
"loss": 163.56, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 1.9002032061413412, |
|
"grad_norm": 63.379634857177734, |
|
"learning_rate": 6.234408107482537e-05, |
|
"loss": 172.3597, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 1.9074283133890269, |
|
"grad_norm": 62.546836853027344, |
|
"learning_rate": 6.207929131863004e-05, |
|
"loss": 169.4815, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 1.9146534206367125, |
|
"grad_norm": 62.5031852722168, |
|
"learning_rate": 6.181424998770595e-05, |
|
"loss": 172.2419, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 1.9218785278843984, |
|
"grad_norm": 69.73976135253906, |
|
"learning_rate": 6.154896650075027e-05, |
|
"loss": 172.6031, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 1.9291036351320838, |
|
"grad_norm": 72.4342269897461, |
|
"learning_rate": 6.128345028506553e-05, |
|
"loss": 176.2277, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 1.9363287423797697, |
|
"grad_norm": 72.21240234375, |
|
"learning_rate": 6.1017710776224744e-05, |
|
"loss": 170.4608, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 1.9435538496274554, |
|
"grad_norm": 74.69300079345703, |
|
"learning_rate": 6.0751757417736e-05, |
|
"loss": 169.2778, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 1.950778956875141, |
|
"grad_norm": 78.29647064208984, |
|
"learning_rate": 6.048559966070693e-05, |
|
"loss": 170.52, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.958004064122827, |
|
"grad_norm": 78.61617279052734, |
|
"learning_rate": 6.0219246963508746e-05, |
|
"loss": 173.9382, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 1.9652291713705126, |
|
"grad_norm": 88.84861755371094, |
|
"learning_rate": 5.995270879144027e-05, |
|
"loss": 176.2962, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 1.9724542786181982, |
|
"grad_norm": 103.87718963623047, |
|
"learning_rate": 5.968599461639144e-05, |
|
"loss": 174.6413, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 1.979679385865884, |
|
"grad_norm": 103.3280258178711, |
|
"learning_rate": 5.94191139165068e-05, |
|
"loss": 173.1013, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 1.9869044931135695, |
|
"grad_norm": 91.55598449707031, |
|
"learning_rate": 5.9152076175848594e-05, |
|
"loss": 142.9728, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.9869044931135695, |
|
"eval_loss": 2.2794442176818848, |
|
"eval_runtime": 0.6485, |
|
"eval_samples_per_second": 77.095, |
|
"eval_steps_per_second": 77.095, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.9941296003612554, |
|
"grad_norm": 53.11991500854492, |
|
"learning_rate": 5.888489088405983e-05, |
|
"loss": 154.9475, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 2.001354707608941, |
|
"grad_norm": 75.6795425415039, |
|
"learning_rate": 5.861756753602694e-05, |
|
"loss": 155.098, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 2.0085798148566267, |
|
"grad_norm": 34.77781677246094, |
|
"learning_rate": 5.835011563154249e-05, |
|
"loss": 108.0842, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 2.0158049221043126, |
|
"grad_norm": 32.095428466796875, |
|
"learning_rate": 5.8082544674967445e-05, |
|
"loss": 110.3337, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 2.023030029351998, |
|
"grad_norm": 36.59615707397461, |
|
"learning_rate": 5.7814864174893536e-05, |
|
"loss": 117.6389, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.030255136599684, |
|
"grad_norm": 36.38612365722656, |
|
"learning_rate": 5.754708364380531e-05, |
|
"loss": 125.0564, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 2.03748024384737, |
|
"grad_norm": 35.21025848388672, |
|
"learning_rate": 5.727921259774208e-05, |
|
"loss": 123.1134, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 2.0447053510950552, |
|
"grad_norm": 35.71826171875, |
|
"learning_rate": 5.7011260555959736e-05, |
|
"loss": 131.763, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 2.051930458342741, |
|
"grad_norm": 36.14569091796875, |
|
"learning_rate": 5.674323704059255e-05, |
|
"loss": 130.8396, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 2.0591555655904266, |
|
"grad_norm": 38.5828742980957, |
|
"learning_rate": 5.647515157631467e-05, |
|
"loss": 136.589, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 2.0663806728381124, |
|
"grad_norm": 40.066932678222656, |
|
"learning_rate": 5.6207013690001734e-05, |
|
"loss": 142.8923, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 2.0736057800857983, |
|
"grad_norm": 40.28878402709961, |
|
"learning_rate": 5.593883291039227e-05, |
|
"loss": 144.453, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 2.0808308873334838, |
|
"grad_norm": 42.93016815185547, |
|
"learning_rate": 5.5670618767749116e-05, |
|
"loss": 141.7153, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 2.0880559945811696, |
|
"grad_norm": 43.676109313964844, |
|
"learning_rate": 5.5402380793520714e-05, |
|
"loss": 145.1744, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 2.095281101828855, |
|
"grad_norm": 43.66289138793945, |
|
"learning_rate": 5.513412852000239e-05, |
|
"loss": 148.1503, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 2.102506209076541, |
|
"grad_norm": 45.55429458618164, |
|
"learning_rate": 5.486587147999762e-05, |
|
"loss": 145.9116, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 2.109731316324227, |
|
"grad_norm": 45.915687561035156, |
|
"learning_rate": 5.459761920647931e-05, |
|
"loss": 151.4164, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 2.1169564235719123, |
|
"grad_norm": 49.95574188232422, |
|
"learning_rate": 5.4329381232250895e-05, |
|
"loss": 157.1364, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 2.124181530819598, |
|
"grad_norm": 50.951595306396484, |
|
"learning_rate": 5.406116708960776e-05, |
|
"loss": 155.8816, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 2.1314066380672836, |
|
"grad_norm": 53.455631256103516, |
|
"learning_rate": 5.379298630999828e-05, |
|
"loss": 159.1212, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 2.1386317453149695, |
|
"grad_norm": 55.12257766723633, |
|
"learning_rate": 5.3524848423685356e-05, |
|
"loss": 159.7997, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 2.1458568525626553, |
|
"grad_norm": 55.63461685180664, |
|
"learning_rate": 5.325676295940746e-05, |
|
"loss": 159.3888, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 2.153081959810341, |
|
"grad_norm": 58.48078155517578, |
|
"learning_rate": 5.298873944404026e-05, |
|
"loss": 166.7546, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 2.1603070670580267, |
|
"grad_norm": 59.48677444458008, |
|
"learning_rate": 5.2720787402257935e-05, |
|
"loss": 169.3197, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 2.1675321743057125, |
|
"grad_norm": 60.309879302978516, |
|
"learning_rate": 5.245291635619469e-05, |
|
"loss": 163.7783, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.1675321743057125, |
|
"eval_loss": 2.2782297134399414, |
|
"eval_runtime": 0.6446, |
|
"eval_samples_per_second": 77.571, |
|
"eval_steps_per_second": 77.571, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.174757281553398, |
|
"grad_norm": 61.1097526550293, |
|
"learning_rate": 5.218513582510648e-05, |
|
"loss": 167.5735, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 2.181982388801084, |
|
"grad_norm": 64.91517639160156, |
|
"learning_rate": 5.191745532503257e-05, |
|
"loss": 165.7943, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 2.1892074960487693, |
|
"grad_norm": 66.6432113647461, |
|
"learning_rate": 5.1649884368457534e-05, |
|
"loss": 168.6365, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 2.196432603296455, |
|
"grad_norm": 72.80828857421875, |
|
"learning_rate": 5.1382432463973077e-05, |
|
"loss": 167.3139, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 2.203657710544141, |
|
"grad_norm": 69.28011322021484, |
|
"learning_rate": 5.1115109115940195e-05, |
|
"loss": 161.4875, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 2.2108828177918265, |
|
"grad_norm": 77.29905700683594, |
|
"learning_rate": 5.0847923824151424e-05, |
|
"loss": 172.7222, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 2.2181079250395124, |
|
"grad_norm": 81.0169448852539, |
|
"learning_rate": 5.058088608349323e-05, |
|
"loss": 174.758, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 2.225333032287198, |
|
"grad_norm": 84.8740005493164, |
|
"learning_rate": 5.031400538360858e-05, |
|
"loss": 168.4707, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 2.2325581395348837, |
|
"grad_norm": 91.66473388671875, |
|
"learning_rate": 5.004729120855973e-05, |
|
"loss": 169.2647, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 2.2397832467825696, |
|
"grad_norm": 105.14354705810547, |
|
"learning_rate": 4.9780753036491265e-05, |
|
"loss": 176.2409, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.247008354030255, |
|
"grad_norm": 112.60404205322266, |
|
"learning_rate": 4.9514400339293075e-05, |
|
"loss": 154.8602, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 2.254233461277941, |
|
"grad_norm": 56.13935470581055, |
|
"learning_rate": 4.9248242582264e-05, |
|
"loss": 105.6355, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 2.2614585685256268, |
|
"grad_norm": 47.51068115234375, |
|
"learning_rate": 4.898228922377526e-05, |
|
"loss": 110.0799, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 2.268683675773312, |
|
"grad_norm": 33.776058197021484, |
|
"learning_rate": 4.87165497149345e-05, |
|
"loss": 118.4811, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 2.275908783020998, |
|
"grad_norm": 37.834251403808594, |
|
"learning_rate": 4.8451033499249755e-05, |
|
"loss": 123.3947, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 2.2831338902686835, |
|
"grad_norm": 36.37396240234375, |
|
"learning_rate": 4.8185750012294065e-05, |
|
"loss": 123.4409, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 2.2903589975163694, |
|
"grad_norm": 36.17446517944336, |
|
"learning_rate": 4.7920708681369964e-05, |
|
"loss": 132.076, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 2.2975841047640553, |
|
"grad_norm": 38.38850784301758, |
|
"learning_rate": 4.765591892517464e-05, |
|
"loss": 134.8902, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 2.3048092120117407, |
|
"grad_norm": 37.74662399291992, |
|
"learning_rate": 4.739139015346508e-05, |
|
"loss": 135.0242, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 2.3120343192594266, |
|
"grad_norm": 39.41255187988281, |
|
"learning_rate": 4.7127131766723744e-05, |
|
"loss": 139.3454, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.319259426507112, |
|
"grad_norm": 43.438262939453125, |
|
"learning_rate": 4.6863153155824545e-05, |
|
"loss": 143.1548, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 2.326484533754798, |
|
"grad_norm": 45.747432708740234, |
|
"learning_rate": 4.659946370169903e-05, |
|
"loss": 145.9223, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 2.333709641002484, |
|
"grad_norm": 42.40370559692383, |
|
"learning_rate": 4.633607277500312e-05, |
|
"loss": 139.4424, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 2.3409347482501692, |
|
"grad_norm": 42.54023361206055, |
|
"learning_rate": 4.6072989735783986e-05, |
|
"loss": 145.1206, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 2.348159855497855, |
|
"grad_norm": 45.216739654541016, |
|
"learning_rate": 4.581022393314757e-05, |
|
"loss": 149.1618, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 2.348159855497855, |
|
"eval_loss": 2.2698802947998047, |
|
"eval_runtime": 0.65, |
|
"eval_samples_per_second": 76.919, |
|
"eval_steps_per_second": 76.919, |
|
"step": 325 |
|
}, |
|
    {
      "epoch": 2.355384962745541,
      "grad_norm": 49.58754348754883,
      "learning_rate": 4.554778470492619e-05,
      "loss": 151.4744,
      "step": 326
    },
    {
      "epoch": 2.3626100699932264,
      "grad_norm": 48.64653015136719,
      "learning_rate": 4.5285681377346836e-05,
      "loss": 153.4593,
      "step": 327
    },
    {
      "epoch": 2.3698351772409123,
      "grad_norm": 54.19842529296875,
      "learning_rate": 4.5023923264699663e-05,
      "loss": 158.7397,
      "step": 328
    },
    {
      "epoch": 2.3770602844885977,
      "grad_norm": 53.489540100097656,
      "learning_rate": 4.4762519669007075e-05,
      "loss": 156.4357,
      "step": 329
    },
    {
      "epoch": 2.3842853917362836,
      "grad_norm": 57.42572021484375,
      "learning_rate": 4.450147987969302e-05,
      "loss": 156.7323,
      "step": 330
    },
    {
      "epoch": 2.391510498983969,
      "grad_norm": 55.5129280090332,
      "learning_rate": 4.424081317325306e-05,
      "loss": 162.6986,
      "step": 331
    },
    {
      "epoch": 2.398735606231655,
      "grad_norm": 62.38901901245117,
      "learning_rate": 4.398052881292457e-05,
      "loss": 168.2654,
      "step": 332
    },
    {
      "epoch": 2.405960713479341,
      "grad_norm": 57.75197982788086,
      "learning_rate": 4.372063604835758e-05,
      "loss": 154.5503,
      "step": 333
    },
    {
      "epoch": 2.4131858207270263,
      "grad_norm": 64.6832504272461,
      "learning_rate": 4.3461144115286155e-05,
      "loss": 162.428,
      "step": 334
    },
    {
      "epoch": 2.420410927974712,
      "grad_norm": 63.21479797363281,
      "learning_rate": 4.320206223520006e-05,
      "loss": 159.031,
      "step": 335
    },
    {
      "epoch": 2.427636035222398,
      "grad_norm": 66.63702392578125,
      "learning_rate": 4.2943399615017196e-05,
      "loss": 166.0823,
      "step": 336
    },
    {
      "epoch": 2.4348611424700835,
      "grad_norm": 67.73152160644531,
      "learning_rate": 4.268516544675628e-05,
      "loss": 166.5575,
      "step": 337
    },
    {
      "epoch": 2.4420862497177693,
      "grad_norm": 74.20126342773438,
      "learning_rate": 4.2427368907210293e-05,
      "loss": 168.2213,
      "step": 338
    },
    {
      "epoch": 2.4493113569654548,
      "grad_norm": 84.52593231201172,
      "learning_rate": 4.217001915762033e-05,
      "loss": 172.4427,
      "step": 339
    },
    {
      "epoch": 2.4565364642131406,
      "grad_norm": 83.18832397460938,
      "learning_rate": 4.191312534335005e-05,
      "loss": 172.7155,
      "step": 340
    },
    {
      "epoch": 2.4637615714608265,
      "grad_norm": 83.47673797607422,
      "learning_rate": 4.165669659356062e-05,
      "loss": 172.4003,
      "step": 341
    },
    {
      "epoch": 2.470986678708512,
      "grad_norm": 86.96723937988281,
      "learning_rate": 4.1400742020886396e-05,
      "loss": 176.3645,
      "step": 342
    },
    {
      "epoch": 2.478211785956198,
      "grad_norm": 100.84264373779297,
      "learning_rate": 4.114527072111103e-05,
      "loss": 173.7337,
      "step": 343
    },
    {
      "epoch": 2.4854368932038833,
      "grad_norm": 106.68719482421875,
      "learning_rate": 4.0890291772844224e-05,
      "loss": 167.7532,
      "step": 344
    },
    {
      "epoch": 2.492662000451569,
      "grad_norm": 118.92356872558594,
      "learning_rate": 4.063581423719916e-05,
      "loss": 164.0733,
      "step": 345
    },
    {
      "epoch": 2.499887107699255,
      "grad_norm": 59.804168701171875,
      "learning_rate": 4.038184715747044e-05,
      "loss": 108.4363,
      "step": 346
    },
    {
      "epoch": 2.5071122149469405,
      "grad_norm": 50.58955764770508,
      "learning_rate": 4.012839955881273e-05,
      "loss": 110.6487,
      "step": 347
    },
    {
      "epoch": 2.5143373221946264,
      "grad_norm": 39.889163970947266,
      "learning_rate": 3.9875480447920076e-05,
      "loss": 116.4527,
      "step": 348
    },
    {
      "epoch": 2.5215624294423122,
      "grad_norm": 40.6741828918457,
      "learning_rate": 3.9623098812705803e-05,
      "loss": 122.8531,
      "step": 349
    },
    {
      "epoch": 2.5287875366899977,
      "grad_norm": 35.53072738647461,
      "learning_rate": 3.93712636219831e-05,
      "loss": 124.1339,
      "step": 350
    },
    {
      "epoch": 2.5287875366899977,
      "eval_loss": 2.266690254211426,
      "eval_runtime": 0.6451,
      "eval_samples_per_second": 77.513,
      "eval_steps_per_second": 77.513,
      "step": 350
    },
    {
      "epoch": 2.5360126439376836,
      "grad_norm": 40.804901123046875,
      "learning_rate": 3.9119983825146326e-05,
      "loss": 133.2125,
      "step": 351
    },
    {
      "epoch": 2.5432377511853694,
      "grad_norm": 40.934730529785156,
      "learning_rate": 3.886926835185297e-05,
      "loss": 129.789,
      "step": 352
    },
    {
      "epoch": 2.550462858433055,
      "grad_norm": 42.09627151489258,
      "learning_rate": 3.861912611170631e-05,
      "loss": 140.0207,
      "step": 353
    },
    {
      "epoch": 2.5576879656807403,
      "grad_norm": 41.50466537475586,
      "learning_rate": 3.8369565993938835e-05,
      "loss": 136.9768,
      "step": 354
    },
    {
      "epoch": 2.564913072928426,
      "grad_norm": 41.60817337036133,
      "learning_rate": 3.8120596867096255e-05,
      "loss": 143.2024,
      "step": 355
    },
    {
      "epoch": 2.572138180176112,
      "grad_norm": 42.11903762817383,
      "learning_rate": 3.7872227578722495e-05,
      "loss": 141.182,
      "step": 356
    },
    {
      "epoch": 2.5793632874237975,
      "grad_norm": 45.26333236694336,
      "learning_rate": 3.762446695504511e-05,
      "loss": 145.6521,
      "step": 357
    },
    {
      "epoch": 2.5865883946714834,
      "grad_norm": 43.8610725402832,
      "learning_rate": 3.7377323800661764e-05,
      "loss": 145.043,
      "step": 358
    },
    {
      "epoch": 2.5938135019191693,
      "grad_norm": 48.53872299194336,
      "learning_rate": 3.7130806898227276e-05,
      "loss": 154.6329,
      "step": 359
    },
    {
      "epoch": 2.6010386091668547,
      "grad_norm": 46.94786834716797,
      "learning_rate": 3.688492500814152e-05,
      "loss": 156.9177,
      "step": 360
    },
    {
      "epoch": 2.6082637164145406,
      "grad_norm": 52.20769500732422,
      "learning_rate": 3.663968686823814e-05,
      "loss": 156.2657,
      "step": 361
    },
    {
      "epoch": 2.6154888236622265,
      "grad_norm": 53.79183578491211,
      "learning_rate": 3.6395101193474024e-05,
      "loss": 154.101,
      "step": 362
    },
    {
      "epoch": 2.622713930909912,
      "grad_norm": 54.7728157043457,
      "learning_rate": 3.6151176675619555e-05,
      "loss": 159.0833,
      "step": 363
    },
    {
      "epoch": 2.6299390381575978,
      "grad_norm": 56.10276412963867,
      "learning_rate": 3.59079219829498e-05,
      "loss": 159.4493,
      "step": 364
    },
    {
      "epoch": 2.637164145405283,
      "grad_norm": 61.8068733215332,
      "learning_rate": 3.5665345759936454e-05,
      "loss": 158.6887,
      "step": 365
    },
    {
      "epoch": 2.644389252652969,
      "grad_norm": 64.09469604492188,
      "learning_rate": 3.542345662694061e-05,
      "loss": 163.7667,
      "step": 366
    },
    {
      "epoch": 2.6516143599006545,
      "grad_norm": 62.513427734375,
      "learning_rate": 3.518226317990646e-05,
      "loss": 165.2545,
      "step": 367
    },
    {
      "epoch": 2.6588394671483404,
      "grad_norm": 66.19265747070312,
      "learning_rate": 3.494177399005578e-05,
      "loss": 167.7175,
      "step": 368
    },
    {
      "epoch": 2.6660645743960263,
      "grad_norm": 61.81763458251953,
      "learning_rate": 3.470199760358339e-05,
      "loss": 172.3992,
      "step": 369
    },
    {
      "epoch": 2.6732896816437117,
      "grad_norm": 72.12129974365234,
      "learning_rate": 3.446294254135339e-05,
      "loss": 171.6202,
      "step": 370
    },
    {
      "epoch": 2.6805147888913976,
      "grad_norm": 73.13341522216797,
      "learning_rate": 3.422461729859643e-05,
      "loss": 172.6192,
      "step": 371
    },
    {
      "epoch": 2.6877398961390835,
      "grad_norm": 76.26922607421875,
      "learning_rate": 3.398703034460776e-05,
      "loss": 168.6027,
      "step": 372
    },
    {
      "epoch": 2.694965003386769,
      "grad_norm": 72.966064453125,
      "learning_rate": 3.3750190122446256e-05,
      "loss": 164.1373,
      "step": 373
    },
    {
      "epoch": 2.702190110634455,
      "grad_norm": 78.573974609375,
      "learning_rate": 3.3514105048634394e-05,
      "loss": 168.2193,
      "step": 374
    },
    {
      "epoch": 2.7094152178821407,
      "grad_norm": 83.50703430175781,
      "learning_rate": 3.327878351285922e-05,
      "loss": 172.5475,
      "step": 375
    },
    {
      "epoch": 2.7094152178821407,
      "eval_loss": 2.25943660736084,
      "eval_runtime": 0.6467,
      "eval_samples_per_second": 77.315,
      "eval_steps_per_second": 77.315,
      "step": 375
    },
    {
      "epoch": 2.716640325129826,
      "grad_norm": 84.6645736694336,
      "learning_rate": 3.304423387767411e-05,
      "loss": 173.0074,
      "step": 376
    },
    {
      "epoch": 2.723865432377512,
      "grad_norm": 87.42393493652344,
      "learning_rate": 3.28104644782016e-05,
      "loss": 163.5128,
      "step": 377
    },
    {
      "epoch": 2.7310905396251974,
      "grad_norm": 105.86749267578125,
      "learning_rate": 3.2577483621837276e-05,
      "loss": 164.7872,
      "step": 378
    },
    {
      "epoch": 2.7383156468728833,
      "grad_norm": 119.39720916748047,
      "learning_rate": 3.2345299587954484e-05,
      "loss": 161.0168,
      "step": 379
    },
    {
      "epoch": 2.7455407541205687,
      "grad_norm": 44.81121826171875,
      "learning_rate": 3.211392062761007e-05,
      "loss": 103.1053,
      "step": 380
    },
    {
      "epoch": 2.7527658613682546,
      "grad_norm": 34.9925651550293,
      "learning_rate": 3.1883354963251256e-05,
      "loss": 110.0836,
      "step": 381
    },
    {
      "epoch": 2.7599909686159405,
      "grad_norm": 33.172786712646484,
      "learning_rate": 3.1653610788423416e-05,
      "loss": 117.6201,
      "step": 382
    },
    {
      "epoch": 2.767216075863626,
      "grad_norm": 35.12316131591797,
      "learning_rate": 3.142469626747885e-05,
      "loss": 120.7938,
      "step": 383
    },
    {
      "epoch": 2.774441183111312,
      "grad_norm": 33.81804275512695,
      "learning_rate": 3.119661953528671e-05,
      "loss": 119.9144,
      "step": 384
    },
    {
      "epoch": 2.7816662903589977,
      "grad_norm": 38.326560974121094,
      "learning_rate": 3.0969388696943855e-05,
      "loss": 127.9329,
      "step": 385
    },
    {
      "epoch": 2.788891397606683,
      "grad_norm": 39.44643020629883,
      "learning_rate": 3.0743011827486914e-05,
      "loss": 131.7632,
      "step": 386
    },
    {
      "epoch": 2.796116504854369,
      "grad_norm": 39.9660530090332,
      "learning_rate": 3.0517496971605252e-05,
      "loss": 131.6603,
      "step": 387
    },
    {
      "epoch": 2.803341612102055,
      "grad_norm": 43.44757080078125,
      "learning_rate": 3.029285214335509e-05,
      "loss": 135.7212,
      "step": 388
    },
    {
      "epoch": 2.8105667193497403,
      "grad_norm": 39.718448638916016,
      "learning_rate": 3.0069085325874736e-05,
      "loss": 129.8581,
      "step": 389
    },
    {
      "epoch": 2.817791826597426,
      "grad_norm": 40.95348358154297,
      "learning_rate": 2.984620447110087e-05,
      "loss": 139.6507,
      "step": 390
    },
    {
      "epoch": 2.8250169338451117,
      "grad_norm": 45.0244140625,
      "learning_rate": 2.962421749948601e-05,
      "loss": 141.6569,
      "step": 391
    },
    {
      "epoch": 2.8322420410927975,
      "grad_norm": 44.68109130859375,
      "learning_rate": 2.940313229971699e-05,
      "loss": 141.7101,
      "step": 392
    },
    {
      "epoch": 2.839467148340483,
      "grad_norm": 46.43368148803711,
      "learning_rate": 2.9182956728434607e-05,
      "loss": 151.777,
      "step": 393
    },
    {
      "epoch": 2.846692255588169,
      "grad_norm": 48.510772705078125,
      "learning_rate": 2.8963698609954483e-05,
      "loss": 153.4996,
      "step": 394
    },
    {
      "epoch": 2.8539173628358547,
      "grad_norm": 51.50070571899414,
      "learning_rate": 2.8745365735988993e-05,
      "loss": 153.3897,
      "step": 395
    },
    {
      "epoch": 2.86114247008354,
      "grad_norm": 57.43215560913086,
      "learning_rate": 2.852796586537035e-05,
      "loss": 154.0306,
      "step": 396
    },
    {
      "epoch": 2.868367577331226,
      "grad_norm": 54.87392044067383,
      "learning_rate": 2.831150672377489e-05,
      "loss": 156.4159,
      "step": 397
    },
    {
      "epoch": 2.875592684578912,
      "grad_norm": 57.76106643676758,
      "learning_rate": 2.809599600344853e-05,
      "loss": 165.0454,
      "step": 398
    },
    {
      "epoch": 2.8828177918265974,
      "grad_norm": 57.79510498046875,
      "learning_rate": 2.7881441362933468e-05,
      "loss": 164.8601,
      "step": 399
    },
    {
      "epoch": 2.8900428990742832,
      "grad_norm": 57.36224365234375,
      "learning_rate": 2.766785042679591e-05,
      "loss": 162.4617,
      "step": 400
    },
    {
      "epoch": 2.8900428990742832,
      "eval_loss": 2.2592520713806152,
      "eval_runtime": 0.6479,
      "eval_samples_per_second": 77.176,
      "eval_steps_per_second": 77.176,
      "step": 400
    },
    {
      "epoch": 2.8972680063219687,
      "grad_norm": 59.14641189575195,
      "learning_rate": 2.745523078535517e-05,
      "loss": 158.6469,
      "step": 401
    },
    {
      "epoch": 2.9044931135696546,
      "grad_norm": 63.88818359375,
      "learning_rate": 2.724358999441402e-05,
      "loss": 159.6259,
      "step": 402
    },
    {
      "epoch": 2.91171822081734,
      "grad_norm": 68.26206970214844,
      "learning_rate": 2.7032935574990033e-05,
      "loss": 169.2703,
      "step": 403
    },
    {
      "epoch": 2.918943328065026,
      "grad_norm": 67.78248596191406,
      "learning_rate": 2.68232750130484e-05,
      "loss": 159.6917,
      "step": 404
    },
    {
      "epoch": 2.9261684353127118,
      "grad_norm": 67.05574798583984,
      "learning_rate": 2.6614615759235884e-05,
      "loss": 166.1517,
      "step": 405
    },
    {
      "epoch": 2.933393542560397,
      "grad_norm": 75.2081069946289,
      "learning_rate": 2.6406965228616087e-05,
      "loss": 168.4991,
      "step": 406
    },
    {
      "epoch": 2.940618649808083,
      "grad_norm": 76.87635803222656,
      "learning_rate": 2.620033080040585e-05,
      "loss": 172.8878,
      "step": 407
    },
    {
      "epoch": 2.947843757055769,
      "grad_norm": 74.64488983154297,
      "learning_rate": 2.599471981771314e-05,
      "loss": 169.2124,
      "step": 408
    },
    {
      "epoch": 2.9550688643034544,
      "grad_norm": 82.09602355957031,
      "learning_rate": 2.5790139587275948e-05,
      "loss": 166.7485,
      "step": 409
    },
    {
      "epoch": 2.9622939715511403,
      "grad_norm": 86.70449829101562,
      "learning_rate": 2.5586597379202805e-05,
      "loss": 172.9556,
      "step": 410
    },
    {
      "epoch": 2.969519078798826,
      "grad_norm": 93.79573822021484,
      "learning_rate": 2.5384100426714307e-05,
      "loss": 172.531,
      "step": 411
    },
    {
      "epoch": 2.9767441860465116,
      "grad_norm": 97.84060668945312,
      "learning_rate": 2.5182655925886123e-05,
      "loss": 168.0535,
      "step": 412
    },
    {
      "epoch": 2.9839692932941975,
      "grad_norm": 116.76630401611328,
      "learning_rate": 2.4982271035393208e-05,
      "loss": 156.5745,
      "step": 413
    },
    {
      "epoch": 2.991194400541883,
      "grad_norm": 41.00756072998047,
      "learning_rate": 2.4782952876255474e-05,
      "loss": 140.6133,
      "step": 414
    },
    {
      "epoch": 2.998419507789569,
      "grad_norm": 67.31522369384766,
      "learning_rate": 2.4584708531584742e-05,
      "loss": 160.9438,
      "step": 415
    },
    {
      "epoch": 3.0056446150372547,
      "grad_norm": 57.157806396484375,
      "learning_rate": 2.4387545046332956e-05,
      "loss": 118.3847,
      "step": 416
    },
    {
      "epoch": 3.01286972228494,
      "grad_norm": 43.13034439086914,
      "learning_rate": 2.4191469427041888e-05,
      "loss": 105.1958,
      "step": 417
    },
    {
      "epoch": 3.020094829532626,
      "grad_norm": 34.08012390136719,
      "learning_rate": 2.39964886415941e-05,
      "loss": 112.8705,
      "step": 418
    },
    {
      "epoch": 3.0273199367803114,
      "grad_norm": 34.88169860839844,
      "learning_rate": 2.3802609618965446e-05,
      "loss": 117.5818,
      "step": 419
    },
    {
      "epoch": 3.0345450440279973,
      "grad_norm": 32.31670379638672,
      "learning_rate": 2.360983924897866e-05,
      "loss": 122.0667,
      "step": 420
    },
    {
      "epoch": 3.041770151275683,
      "grad_norm": 34.80269241333008,
      "learning_rate": 2.3418184382058638e-05,
      "loss": 124.0351,
      "step": 421
    },
    {
      "epoch": 3.0489952585233686,
      "grad_norm": 37.697933197021484,
      "learning_rate": 2.3227651828989e-05,
      "loss": 137.7882,
      "step": 422
    },
    {
      "epoch": 3.0562203657710545,
      "grad_norm": 36.73977279663086,
      "learning_rate": 2.303824836066998e-05,
      "loss": 127.7736,
      "step": 423
    },
    {
      "epoch": 3.06344547301874,
      "grad_norm": 42.219444274902344,
      "learning_rate": 2.284998070787787e-05,
      "loss": 143.5189,
      "step": 424
    },
    {
      "epoch": 3.070670580266426,
      "grad_norm": 39.332454681396484,
      "learning_rate": 2.2662855561025804e-05,
      "loss": 130.0694,
      "step": 425
    },
    {
      "epoch": 3.070670580266426,
      "eval_loss": 2.2527289390563965,
      "eval_runtime": 0.6467,
      "eval_samples_per_second": 77.314,
      "eval_steps_per_second": 77.314,
      "step": 425
    },
    {
      "epoch": 3.0778956875141117,
      "grad_norm": 43.08782196044922,
      "learning_rate": 2.2476879569926048e-05,
      "loss": 143.561,
      "step": 426
    },
    {
      "epoch": 3.085120794761797,
      "grad_norm": 41.036651611328125,
      "learning_rate": 2.2292059343553596e-05,
      "loss": 139.8347,
      "step": 427
    },
    {
      "epoch": 3.092345902009483,
      "grad_norm": 44.45792770385742,
      "learning_rate": 2.210840144981144e-05,
      "loss": 143.4371,
      "step": 428
    },
    {
      "epoch": 3.0995710092571684,
      "grad_norm": 42.94414138793945,
      "learning_rate": 2.192591241529699e-05,
      "loss": 141.0129,
      "step": 429
    },
    {
      "epoch": 3.1067961165048543,
      "grad_norm": 49.134517669677734,
      "learning_rate": 2.1744598725070347e-05,
      "loss": 153.8074,
      "step": 430
    },
    {
      "epoch": 3.11402122375254,
      "grad_norm": 49.15544891357422,
      "learning_rate": 2.1564466822423672e-05,
      "loss": 153.712,
      "step": 431
    },
    {
      "epoch": 3.1212463310002256,
      "grad_norm": 51.34669876098633,
      "learning_rate": 2.1385523108652335e-05,
      "loss": 150.7699,
      "step": 432
    },
    {
      "epoch": 3.1284714382479115,
      "grad_norm": 50.598899841308594,
      "learning_rate": 2.1207773942827332e-05,
      "loss": 155.5012,
      "step": 433
    },
    {
      "epoch": 3.1356965454955974,
      "grad_norm": 58.15761184692383,
      "learning_rate": 2.103122564156937e-05,
      "loss": 158.1927,
      "step": 434
    },
    {
      "epoch": 3.142921652743283,
      "grad_norm": 56.7564811706543,
      "learning_rate": 2.0855884478824412e-05,
      "loss": 164.2387,
      "step": 435
    },
    {
      "epoch": 3.1501467599909687,
      "grad_norm": 58.64568328857422,
      "learning_rate": 2.0681756685640647e-05,
      "loss": 161.4923,
      "step": 436
    },
    {
      "epoch": 3.157371867238654,
      "grad_norm": 58.64356231689453,
      "learning_rate": 2.0508848449947114e-05,
      "loss": 155.2221,
      "step": 437
    },
    {
      "epoch": 3.16459697448634,
      "grad_norm": 65.33573150634766,
      "learning_rate": 2.0337165916333795e-05,
      "loss": 166.1156,
      "step": 438
    },
    {
      "epoch": 3.171822081734026,
      "grad_norm": 64.05957794189453,
      "learning_rate": 2.016671518583325e-05,
      "loss": 164.2138,
      "step": 439
    },
    {
      "epoch": 3.1790471889817113,
      "grad_norm": 68.86741638183594,
      "learning_rate": 1.9997502315703804e-05,
      "loss": 160.5822,
      "step": 440
    },
    {
      "epoch": 3.1862722962293972,
      "grad_norm": 71.23939514160156,
      "learning_rate": 1.98295333192143e-05,
      "loss": 167.138,
      "step": 441
    },
    {
      "epoch": 3.1934974034770827,
      "grad_norm": 78.74092102050781,
      "learning_rate": 1.9662814165430392e-05,
      "loss": 171.5105,
      "step": 442
    },
    {
      "epoch": 3.2007225107247685,
      "grad_norm": 74.84487915039062,
      "learning_rate": 1.9497350779002463e-05,
      "loss": 164.3165,
      "step": 443
    },
    {
      "epoch": 3.2079476179724544,
      "grad_norm": 75.05098724365234,
      "learning_rate": 1.9333149039955026e-05,
      "loss": 167.7948,
      "step": 444
    },
    {
      "epoch": 3.21517272522014,
      "grad_norm": 81.60777282714844,
      "learning_rate": 1.917021478347779e-05,
      "loss": 170.561,
      "step": 445
    },
    {
      "epoch": 3.2223978324678257,
      "grad_norm": 82.3056869506836,
      "learning_rate": 1.9008553799718355e-05,
      "loss": 164.4864,
      "step": 446
    },
    {
      "epoch": 3.2296229397155116,
      "grad_norm": 89.91145324707031,
      "learning_rate": 1.8848171833576322e-05,
      "loss": 163.1181,
      "step": 447
    },
    {
      "epoch": 3.236848046963197,
      "grad_norm": 98.04798126220703,
      "learning_rate": 1.8689074584499296e-05,
      "loss": 166.0624,
      "step": 448
    },
    {
      "epoch": 3.244073154210883,
      "grad_norm": 117.23175048828125,
      "learning_rate": 1.8531267706280154e-05,
      "loss": 173.7292,
      "step": 449
    },
    {
      "epoch": 3.2512982614585684,
      "grad_norm": 83.46915435791016,
      "learning_rate": 1.8374756806856357e-05,
      "loss": 121.3651,
      "step": 450
    },
    {
      "epoch": 3.2512982614585684,
      "eval_loss": 2.250525951385498,
      "eval_runtime": 0.6458,
      "eval_samples_per_second": 77.422,
      "eval_steps_per_second": 77.422,
      "step": 450
    },
    {
      "epoch": 3.2585233687062543,
      "grad_norm": 47.13894271850586,
      "learning_rate": 1.8219547448110454e-05,
      "loss": 111.1761,
      "step": 451
    },
    {
      "epoch": 3.26574847595394,
      "grad_norm": 39.46201705932617,
      "learning_rate": 1.806564514567258e-05,
      "loss": 114.0671,
      "step": 452
    },
    {
      "epoch": 3.2729735832016256,
      "grad_norm": 34.3983154296875,
      "learning_rate": 1.7913055368724318e-05,
      "loss": 119.771,
      "step": 453
    },
    {
      "epoch": 3.2801986904493114,
      "grad_norm": 34.957542419433594,
      "learning_rate": 1.7761783539804482e-05,
      "loss": 122.0474,
      "step": 454
    },
    {
      "epoch": 3.287423797696997,
      "grad_norm": 35.007755279541016,
      "learning_rate": 1.7611835034616314e-05,
      "loss": 123.7122,
      "step": 455
    },
    {
      "epoch": 3.2946489049446828,
      "grad_norm": 38.12125015258789,
      "learning_rate": 1.7463215181836497e-05,
      "loss": 132.5189,
      "step": 456
    },
    {
      "epoch": 3.3018740121923686,
      "grad_norm": 36.32997131347656,
      "learning_rate": 1.7315929262925756e-05,
      "loss": 128.9809,
      "step": 457
    },
    {
      "epoch": 3.309099119440054,
      "grad_norm": 39.102500915527344,
      "learning_rate": 1.71699825119412e-05,
      "loss": 132.5178,
      "step": 458
    },
    {
      "epoch": 3.31632422668774,
      "grad_norm": 39.0343132019043,
      "learning_rate": 1.7025380115350343e-05,
      "loss": 136.3037,
      "step": 459
    },
    {
      "epoch": 3.323549333935426,
      "grad_norm": 41.78243637084961,
      "learning_rate": 1.6882127211846727e-05,
      "loss": 142.4308,
      "step": 460
    },
    {
      "epoch": 3.3307744411831113,
      "grad_norm": 45.74725341796875,
      "learning_rate": 1.674022889216737e-05,
      "loss": 143.102,
      "step": 461
    },
    {
      "epoch": 3.337999548430797,
      "grad_norm": 42.37492370605469,
      "learning_rate": 1.6599690198911826e-05,
      "loss": 135.5431,
      "step": 462
    },
    {
      "epoch": 3.3452246556784826,
      "grad_norm": 44.97285461425781,
      "learning_rate": 1.6460516126363014e-05,
      "loss": 149.0417,
      "step": 463
    },
    {
      "epoch": 3.3524497629261685,
      "grad_norm": 48.054073333740234,
      "learning_rate": 1.632271162030971e-05,
      "loss": 153.1987,
      "step": 464
    },
    {
      "epoch": 3.359674870173854,
      "grad_norm": 49.26851272583008,
      "learning_rate": 1.6186281577870785e-05,
      "loss": 148.3528,
      "step": 465
    },
    {
      "epoch": 3.36689997742154,
      "grad_norm": 48.746395111083984,
      "learning_rate": 1.605123084732123e-05,
      "loss": 153.4831,
      "step": 466
    },
    {
      "epoch": 3.3741250846692257,
      "grad_norm": 54.20684051513672,
      "learning_rate": 1.59175642279198e-05,
      "loss": 150.8487,
      "step": 467
    },
    {
      "epoch": 3.381350191916911,
      "grad_norm": 52.09891128540039,
      "learning_rate": 1.578528646973852e-05,
      "loss": 156.4221,
      "step": 468
    },
    {
      "epoch": 3.388575299164597,
      "grad_norm": 55.3662109375,
      "learning_rate": 1.5654402273493805e-05,
      "loss": 158.3705,
      "step": 469
    },
    {
      "epoch": 3.395800406412283,
      "grad_norm": 56.87941360473633,
      "learning_rate": 1.552491629037952e-05,
      "loss": 156.8328,
      "step": 470
    },
    {
      "epoch": 3.4030255136599683,
      "grad_norm": 58.18745040893555,
      "learning_rate": 1.5396833121901592e-05,
      "loss": 158.3661,
      "step": 471
    },
    {
      "epoch": 3.410250620907654,
      "grad_norm": 63.58408737182617,
      "learning_rate": 1.5270157319714572e-05,
      "loss": 165.1025,
      "step": 472
    },
    {
      "epoch": 3.4174757281553396,
      "grad_norm": 64.96553039550781,
      "learning_rate": 1.514489338545978e-05,
      "loss": 156.8923,
      "step": 473
    },
    {
      "epoch": 3.4247008354030255,
      "grad_norm": 64.00413513183594,
      "learning_rate": 1.5021045770605458e-05,
      "loss": 164.6695,
      "step": 474
    },
    {
      "epoch": 3.4319259426507114,
      "grad_norm": 75.28882598876953,
      "learning_rate": 1.4898618876288473e-05,
      "loss": 168.7521,
      "step": 475
    },
    {
      "epoch": 3.4319259426507114,
      "eval_loss": 2.2525815963745117,
      "eval_runtime": 0.6432,
      "eval_samples_per_second": 77.738,
      "eval_steps_per_second": 77.738,
      "step": 475
    },
    {
      "epoch": 3.439151049898397,
      "grad_norm": 73.818359375,
      "learning_rate": 1.4777617053157982e-05,
      "loss": 164.7033,
      "step": 476
    },
    {
      "epoch": 3.4463761571460827,
      "grad_norm": 73.67526245117188,
      "learning_rate": 1.4658044601220777e-05,
      "loss": 169.974,
      "step": 477
    },
    {
      "epoch": 3.453601264393768,
      "grad_norm": 75.99698638916016,
      "learning_rate": 1.4539905769688514e-05,
      "loss": 165.4877,
      "step": 478
    },
    {
      "epoch": 3.460826371641454,
      "grad_norm": 80.24381256103516,
      "learning_rate": 1.4423204756826705e-05,
      "loss": 169.552,
      "step": 479
    },
    {
      "epoch": 3.46805147888914,
      "grad_norm": 86.68580627441406,
      "learning_rate": 1.4307945709805487e-05,
      "loss": 170.0956,
      "step": 480
    },
    {
      "epoch": 3.4752765861368253,
      "grad_norm": 89.91241455078125,
      "learning_rate": 1.4194132724552292e-05,
      "loss": 174.2864,
      "step": 481
    },
    {
      "epoch": 3.482501693384511,
      "grad_norm": 105.35623168945312,
      "learning_rate": 1.4081769845606262e-05,
      "loss": 170.3175,
      "step": 482
    },
    {
      "epoch": 3.489726800632197,
      "grad_norm": 115.0772705078125,
      "learning_rate": 1.3970861065974563e-05,
      "loss": 165.5754,
      "step": 483
    },
    {
      "epoch": 3.4969519078798825,
      "grad_norm": 76.55886840820312,
      "learning_rate": 1.3861410326990411e-05,
      "loss": 115.5443,
      "step": 484
    },
    {
      "epoch": 3.5041770151275684,
      "grad_norm": 39.43132400512695,
      "learning_rate": 1.3753421518173073e-05,
      "loss": 107.1197,
      "step": 485
    },
    {
      "epoch": 3.511402122375254,
      "grad_norm": 32.5174674987793,
      "learning_rate": 1.3646898477089626e-05,
      "loss": 113.2218,
      "step": 486
    },
    {
      "epoch": 3.5186272296229397,
      "grad_norm": 35.985008239746094,
      "learning_rate": 1.3541844989218578e-05,
      "loss": 119.1645,
      "step": 487
    },
    {
      "epoch": 3.525852336870625,
      "grad_norm": 32.62986755371094,
      "learning_rate": 1.3438264787815378e-05,
      "loss": 119.9635,
      "step": 488
    },
    {
      "epoch": 3.533077444118311,
      "grad_norm": 34.348697662353516,
      "learning_rate": 1.3336161553779664e-05,
      "loss": 123.3444,
      "step": 489
    },
    {
      "epoch": 3.540302551365997,
      "grad_norm": 35.503746032714844,
      "learning_rate": 1.323553891552456e-05,
      "loss": 130.0223,
      "step": 490
    },
    {
      "epoch": 3.5475276586136824,
      "grad_norm": 38.014793395996094,
      "learning_rate": 1.3136400448847655e-05,
      "loss": 132.8261,
      "step": 491
    },
    {
      "epoch": 3.5547527658613682,
      "grad_norm": 41.30305480957031,
      "learning_rate": 1.3038749676803994e-05,
      "loss": 137.0195,
      "step": 492
    },
    {
      "epoch": 3.561977873109054,
      "grad_norm": 38.00440979003906,
      "learning_rate": 1.2942590069580812e-05,
      "loss": 135.3861,
      "step": 493
    },
    {
      "epoch": 3.5692029803567396,
      "grad_norm": 41.92805099487305,
      "learning_rate": 1.2847925044374282e-05,
      "loss": 144.1726,
      "step": 494
    },
    {
      "epoch": 3.5764280876044254,
      "grad_norm": 42.240840911865234,
      "learning_rate": 1.275475796526802e-05,
      "loss": 146.6922,
      "step": 495
    },
    {
      "epoch": 3.5836531948521113,
      "grad_norm": 44.38275909423828,
      "learning_rate": 1.26630921431136e-05,
      "loss": 145.4355,
      "step": 496
    },
    {
      "epoch": 3.5908783020997967,
      "grad_norm": 44.79975891113281,
      "learning_rate": 1.2572930835412819e-05,
      "loss": 144.7125,
      "step": 497
    },
    {
      "epoch": 3.5981034093474826,
      "grad_norm": 47.101890563964844,
      "learning_rate": 1.2484277246202009e-05,
      "loss": 150.0654,
      "step": 498
    },
    {
      "epoch": 3.605328516595168,
      "grad_norm": 48.01686477661133,
      "learning_rate": 1.239713452593814e-05,
      "loss": 154.898,
      "step": 499
    },
    {
      "epoch": 3.612553623842854,
      "grad_norm": 54.491729736328125,
      "learning_rate": 1.2311505771386865e-05,
      "loss": 154.9589,
      "step": 500
    },
    {
      "epoch": 3.612553623842854,
      "eval_loss": 2.248832941055298,
      "eval_runtime": 0.6421,
      "eval_samples_per_second": 77.868,
      "eval_steps_per_second": 77.868,
      "step": 500
    },
    {
      "epoch": 3.6197787310905394,
      "grad_norm": 55.82506561279297,
      "learning_rate": 1.2227394025512476e-05,
      "loss": 157.7209,
      "step": 501
    },
    {
      "epoch": 3.6270038383382253,
      "grad_norm": 58.75271987915039,
      "learning_rate": 1.2144802277369761e-05,
      "loss": 160.4193,
      "step": 502
    },
    {
      "epoch": 3.634228945585911,
      "grad_norm": 56.1617546081543,
      "learning_rate": 1.2063733461997805e-05,
      "loss": 162.6156,
      "step": 503
    },
    {
      "epoch": 3.6414540528335966,
      "grad_norm": 60.91801071166992,
      "learning_rate": 1.1984190460315653e-05,
      "loss": 159.6019,
      "step": 504
    },
    {
      "epoch": 3.6486791600812825,
      "grad_norm": 60.85551071166992,
      "learning_rate": 1.1906176099019958e-05,
      "loss": 160.3419,
      "step": 505
    },
    {
      "epoch": 3.6559042673289683,
      "grad_norm": 61.02642059326172,
      "learning_rate": 1.1829693150484523e-05,
      "loss": 165.2598,
      "step": 506
    },
    {
      "epoch": 3.6631293745766538,
      "grad_norm": 68.14204406738281,
      "learning_rate": 1.1754744332661776e-05,
      "loss": 162.0726,
      "step": 507
    },
    {
      "epoch": 3.6703544818243397,
      "grad_norm": 67.21196746826172,
      "learning_rate": 1.1681332308986191e-05,
      "loss": 165.2303,
      "step": 508
    },
    {
      "epoch": 3.6775795890720255,
      "grad_norm": 68.1639633178711,
      "learning_rate": 1.1609459688279622e-05,
      "loss": 165.0496,
      "step": 509
    },
    {
      "epoch": 3.684804696319711,
      "grad_norm": 74.92154693603516,
      "learning_rate": 1.1539129024658605e-05,
      "loss": 165.1957,
      "step": 510
    },
    {
      "epoch": 3.692029803567397,
      "grad_norm": 72.8001937866211,
      "learning_rate": 1.1470342817443607e-05,
      "loss": 159.5798,
      "step": 511
    },
    {
      "epoch": 3.6992549108150823,
      "grad_norm": 81.59801483154297,
      "learning_rate": 1.140310351107019e-05,
      "loss": 171.2486,
      "step": 512
    },
    {
      "epoch": 3.706480018062768,
      "grad_norm": 78.66382598876953,
      "learning_rate": 1.133741349500213e-05,
      "loss": 165.4824,
      "step": 513
    },
    {
      "epoch": 3.7137051253104536,
      "grad_norm": 83.93260192871094,
      "learning_rate": 1.1273275103646545e-05,
      "loss": 172.9596,
      "step": 514
    },
    {
      "epoch": 3.7209302325581395,
      "grad_norm": 93.98062896728516,
      "learning_rate": 1.12106906162709e-05,
      "loss": 165.9155,
      "step": 515
    },
    {
      "epoch": 3.7281553398058254,
      "grad_norm": 96.30677795410156,
      "learning_rate": 1.114966225692203e-05,
      "loss": 166.0989,
      "step": 516
    },
    {
      "epoch": 3.735380447053511,
      "grad_norm": 119.76006317138672,
      "learning_rate": 1.1090192194347101e-05,
      "loss": 166.277,
      "step": 517
    },
    {
      "epoch": 3.7426055543011967,
      "grad_norm": 72.7205810546875,
      "learning_rate": 1.1032282541916521e-05,
      "loss": 117.604,
      "step": 518
    },
    {
      "epoch": 3.7498306615488826,
      "grad_norm": 35.77104949951172,
      "learning_rate": 1.0975935357548869e-05,
      "loss": 105.8642,
      "step": 519
    },
    {
      "epoch": 3.757055768796568,
      "grad_norm": 32.010738372802734,
      "learning_rate": 1.092115264363775e-05,
      "loss": 114.9323,
      "step": 520
    },
    {
      "epoch": 3.764280876044254,
      "grad_norm": 34.5285758972168,
      "learning_rate": 1.0867936346980626e-05,
      "loss": 117.0938,
      "step": 521
    },
    {
      "epoch": 3.7715059832919398,
      "grad_norm": 35.69192886352539,
      "learning_rate": 1.0816288358709636e-05,
      "loss": 124.612,
      "step": 522
    },
    {
      "epoch": 3.778731090539625,
      "grad_norm": 34.59403991699219,
      "learning_rate": 1.076621051422442e-05,
      "loss": 126.6394,
      "step": 523
    },
    {
      "epoch": 3.7859561977873106,
      "grad_norm": 36.92626190185547,
      "learning_rate": 1.0717704593126856e-05,
      "loss": 131.2098,
      "step": 524
    },
    {
      "epoch": 3.7931813050349965,
      "grad_norm": 37.527610778808594,
      "learning_rate": 1.067077231915783e-05,
      "loss": 131.3304,
      "step": 525
    },
    {
      "epoch": 3.7931813050349965,
      "eval_loss": 2.247295618057251,
      "eval_runtime": 0.6443,
      "eval_samples_per_second": 77.598,
      "eval_steps_per_second": 77.598,
      "step": 525
    },
    {
      "epoch": 3.8004064122826824,
      "grad_norm": 41.458560943603516,
      "learning_rate": 1.0625415360135994e-05,
      "loss": 136.7501,
      "step": 526
    },
    {
      "epoch": 3.807631519530368,
      "grad_norm": 41.02821731567383,
      "learning_rate": 1.0581635327898491e-05,
      "loss": 140.3124,
      "step": 527
    },
    {
      "epoch": 3.8148566267780537,
      "grad_norm": 41.52109146118164,
      "learning_rate": 1.053943377824367e-05,
      "loss": 138.0626,
      "step": 528
    },
    {
      "epoch": 3.8220817340257396,
      "grad_norm": 45.5300178527832,
      "learning_rate": 1.049881221087579e-05,
      "loss": 143.3791,
      "step": 529
    },
    {
      "epoch": 3.829306841273425,
      "grad_norm": 44.56905746459961,
      "learning_rate": 1.0459772069351755e-05,
      "loss": 140.6374,
      "step": 530
    },
    {
      "epoch": 3.836531948521111,
      "grad_norm": 47.68376541137695,
      "learning_rate": 1.0422314741029781e-05,
      "loss": 149.6589,
      "step": 531
    },
    {
      "epoch": 3.843757055768797,
      "grad_norm": 47.23084259033203,
      "learning_rate": 1.038644155702012e-05,
      "loss": 147.9767,
      "step": 532
    },
    {
      "epoch": 3.850982163016482,
      "grad_norm": 50.1689567565918,
      "learning_rate": 1.0352153792137733e-05,
      "loss": 157.9461,
      "step": 533
    },
    {
      "epoch": 3.858207270264168,
      "grad_norm": 54.78097915649414,
      "learning_rate": 1.0319452664857016e-05,
      "loss": 155.4814,
      "step": 534
    },
    {
      "epoch": 3.8654323775118535,
      "grad_norm": 54.88005447387695,
      "learning_rate": 1.0288339337268468e-05,
      "loss": 156.4116,
      "step": 535
    },
    {
      "epoch": 3.8726574847595394,
      "grad_norm": 54.529666900634766,
      "learning_rate": 1.0258814915037418e-05,
      "loss": 154.1808,
      "step": 536
    },
    {
      "epoch": 3.879882592007225,
      "grad_norm": 57.33926773071289,
      "learning_rate": 1.023088044736472e-05,
      "loss": 161.1651,
      "step": 537
    },
    {
      "epoch": 3.8871076992549107,
      "grad_norm": 60.16495895385742,
      "learning_rate": 1.0204536926949475e-05,
      "loss": 165.6093,
      "step": 538
    },
    {
      "epoch": 3.8943328065025966,
      "grad_norm": 61.08560562133789,
      "learning_rate": 1.0179785289953755e-05,
      "loss": 162.3731,
      "step": 539
    },
    {
      "epoch": 3.901557913750282,
      "grad_norm": 62.306461334228516,
      "learning_rate": 1.0156626415969325e-05,
      "loss": 160.4524,
      "step": 540
    },
    {
      "epoch": 3.908783020997968,
      "grad_norm": 64.85774230957031,
      "learning_rate": 1.0135061127986394e-05,
      "loss": 161.3555,
      "step": 541
    },
    {
      "epoch": 3.916008128245654,
      "grad_norm": 67.20696258544922,
      "learning_rate": 1.0115090192364367e-05,
      "loss": 164.4856,
      "step": 542
    },
    {
      "epoch": 3.9232332354933392,
      "grad_norm": 67.95388793945312,
      "learning_rate": 1.0096714318804607e-05,
      "loss": 167.9778,
      "step": 543
    },
    {
      "epoch": 3.930458342741025,
      "grad_norm": 71.8521499633789,
      "learning_rate": 1.0079934160325223e-05,
      "loss": 166.5036,
      "step": 544
    },
    {
      "epoch": 3.937683449988711,
      "grad_norm": 80.16471099853516,
      "learning_rate": 1.0064750313237851e-05,
      "loss": 165.6457,
      "step": 545
    },
    {
      "epoch": 3.9449085572363964,
      "grad_norm": 80.44886779785156,
      "learning_rate": 1.0051163317126472e-05,
      "loss": 167.0526,
      "step": 546
    },
    {
      "epoch": 3.9521336644840823,
      "grad_norm": 85.00817108154297,
      "learning_rate": 1.0039173654828249e-05,
      "loss": 164.8536,
      "step": 547
    },
    {
      "epoch": 3.9593587717317678,
      "grad_norm": 88.25836944580078,
      "learning_rate": 1.002878175241634e-05,
      "loss": 179.1164,
      "step": 548
    },
    {
      "epoch": 3.9665838789794536,
      "grad_norm": 90.4699935913086,
      "learning_rate": 1.0019987979184773e-05,
      "loss": 171.6953,
      "step": 549
    },
    {
      "epoch": 3.973808986227139,
      "grad_norm": 99.42991638183594,
      "learning_rate": 1.0012792647635323e-05,
      "loss": 167.9289,
      "step": 550
    },
    {
      "epoch": 3.973808986227139,
      "eval_loss": 2.2475366592407227,
      "eval_runtime": 0.6429,
      "eval_samples_per_second": 77.776,
      "eval_steps_per_second": 77.776,
      "step": 550
    },
    {
      "epoch": 3.981034093474825,
      "grad_norm": 122.0754623413086,
      "learning_rate": 1.0007196013466415e-05,
      "loss": 168.6182,
      "step": 551
    },
    {
      "epoch": 3.988259200722511,
      "grad_norm": 76.6998519897461,
      "learning_rate": 1.0003198275564018e-05,
      "loss": 141.2874,
      "step": 552
    },
    {
      "epoch": 3.9954843079701963,
      "grad_norm": 52.6463623046875,
      "learning_rate": 1.0000799575994581e-05,
      "loss": 158.1349,
      "step": 553
    },
    {
      "epoch": 4.002709415217882,
      "grad_norm": 66.10282135009766,
      "learning_rate": 1e-05,
      "loss": 140.7063,
      "step": 554
    }
  ],
  "logging_steps": 1,
  "max_steps": 554,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 50,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 20,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 0
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 9573407816417280.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}