{
"best_metric": 2.2475366592407227,
"best_model_checkpoint": "miner_id_24/checkpoint-550",
"epoch": 4.002709415217882,
"eval_steps": 25,
"global_step": 554,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.007225107247685708,
"grad_norm": 64.54656219482422,
"learning_rate": 3.7037037037037037e-06,
"loss": 116.6258,
"step": 1
},
{
"epoch": 0.007225107247685708,
"eval_loss": 2.789177894592285,
"eval_runtime": 0.6553,
"eval_samples_per_second": 76.3,
"eval_steps_per_second": 76.3,
"step": 1
},
{
"epoch": 0.014450214495371415,
"grad_norm": 54.67939758300781,
"learning_rate": 7.4074074074074075e-06,
"loss": 126.0822,
"step": 2
},
{
"epoch": 0.021675321743057124,
"grad_norm": 58.601707458496094,
"learning_rate": 1.1111111111111112e-05,
"loss": 128.7215,
"step": 3
},
{
"epoch": 0.02890042899074283,
"grad_norm": 45.49699020385742,
"learning_rate": 1.4814814814814815e-05,
"loss": 135.9089,
"step": 4
},
{
"epoch": 0.03612553623842854,
"grad_norm": 47.61927032470703,
"learning_rate": 1.8518518518518518e-05,
"loss": 136.4493,
"step": 5
},
{
"epoch": 0.04335064348611425,
"grad_norm": 46.85959243774414,
"learning_rate": 2.2222222222222223e-05,
"loss": 140.787,
"step": 6
},
{
"epoch": 0.050575750733799954,
"grad_norm": 48.88517379760742,
"learning_rate": 2.5925925925925925e-05,
"loss": 150.3139,
"step": 7
},
{
"epoch": 0.05780085798148566,
"grad_norm": 50.686092376708984,
"learning_rate": 2.962962962962963e-05,
"loss": 150.0237,
"step": 8
},
{
"epoch": 0.06502596522917137,
"grad_norm": 52.31294631958008,
"learning_rate": 3.3333333333333335e-05,
"loss": 158.4682,
"step": 9
},
{
"epoch": 0.07225107247685708,
"grad_norm": 47.285072326660156,
"learning_rate": 3.7037037037037037e-05,
"loss": 156.9395,
"step": 10
},
{
"epoch": 0.07947617972454278,
"grad_norm": 49.950775146484375,
"learning_rate": 4.074074074074074e-05,
"loss": 163.8341,
"step": 11
},
{
"epoch": 0.0867012869722285,
"grad_norm": 52.288211822509766,
"learning_rate": 4.4444444444444447e-05,
"loss": 158.1243,
"step": 12
},
{
"epoch": 0.09392639421991421,
"grad_norm": 50.71270751953125,
"learning_rate": 4.814814814814815e-05,
"loss": 158.3429,
"step": 13
},
{
"epoch": 0.10115150146759991,
"grad_norm": 51.72929763793945,
"learning_rate": 5.185185185185185e-05,
"loss": 172.155,
"step": 14
},
{
"epoch": 0.10837660871528562,
"grad_norm": 60.08763885498047,
"learning_rate": 5.555555555555556e-05,
"loss": 174.4488,
"step": 15
},
{
"epoch": 0.11560171596297132,
"grad_norm": 57.922359466552734,
"learning_rate": 5.925925925925926e-05,
"loss": 175.58,
"step": 16
},
{
"epoch": 0.12282682321065704,
"grad_norm": 70.09874725341797,
"learning_rate": 6.296296296296296e-05,
"loss": 183.6348,
"step": 17
},
{
"epoch": 0.13005193045834273,
"grad_norm": 63.958580017089844,
"learning_rate": 6.666666666666667e-05,
"loss": 187.0141,
"step": 18
},
{
"epoch": 0.13727703770602845,
"grad_norm": 65.56310272216797,
"learning_rate": 7.037037037037038e-05,
"loss": 189.6452,
"step": 19
},
{
"epoch": 0.14450214495371416,
"grad_norm": 70.80933380126953,
"learning_rate": 7.407407407407407e-05,
"loss": 184.7314,
"step": 20
},
{
"epoch": 0.15172725220139988,
"grad_norm": 78.70838165283203,
"learning_rate": 7.777777777777778e-05,
"loss": 202.2183,
"step": 21
},
{
"epoch": 0.15895235944908556,
"grad_norm": 69.47725677490234,
"learning_rate": 8.148148148148148e-05,
"loss": 190.7609,
"step": 22
},
{
"epoch": 0.16617746669677128,
"grad_norm": 73.57704162597656,
"learning_rate": 8.518518518518518e-05,
"loss": 201.8692,
"step": 23
},
{
"epoch": 0.173402573944457,
"grad_norm": 77.79843139648438,
"learning_rate": 8.888888888888889e-05,
"loss": 203.2968,
"step": 24
},
{
"epoch": 0.1806276811921427,
"grad_norm": 81.88188934326172,
"learning_rate": 9.25925925925926e-05,
"loss": 201.7194,
"step": 25
},
{
"epoch": 0.1806276811921427,
"eval_loss": 2.638108491897583,
"eval_runtime": 0.6507,
"eval_samples_per_second": 76.835,
"eval_steps_per_second": 76.835,
"step": 25
},
{
"epoch": 0.18785278843982842,
"grad_norm": 83.04781341552734,
"learning_rate": 9.62962962962963e-05,
"loss": 198.6217,
"step": 26
},
{
"epoch": 0.1950778956875141,
"grad_norm": 85.5269775390625,
"learning_rate": 0.0001,
"loss": 195.5746,
"step": 27
},
{
"epoch": 0.20230300293519982,
"grad_norm": 87.42471313476562,
"learning_rate": 9.999920042400544e-05,
"loss": 202.2589,
"step": 28
},
{
"epoch": 0.20952811018288553,
"grad_norm": 94.57598876953125,
"learning_rate": 9.999680172443598e-05,
"loss": 203.4591,
"step": 29
},
{
"epoch": 0.21675321743057124,
"grad_norm": 95.3369369506836,
"learning_rate": 9.999280398653359e-05,
"loss": 202.5068,
"step": 30
},
{
"epoch": 0.22397832467825693,
"grad_norm": 105.30885314941406,
"learning_rate": 9.998720735236468e-05,
"loss": 209.9333,
"step": 31
},
{
"epoch": 0.23120343192594264,
"grad_norm": 109.33146667480469,
"learning_rate": 9.998001202081524e-05,
"loss": 209.2891,
"step": 32
},
{
"epoch": 0.23842853917362836,
"grad_norm": 115.16373443603516,
"learning_rate": 9.997121824758367e-05,
"loss": 204.809,
"step": 33
},
{
"epoch": 0.24565364642131407,
"grad_norm": 138.49798583984375,
"learning_rate": 9.996082634517176e-05,
"loss": 208.7707,
"step": 34
},
{
"epoch": 0.25287875366899976,
"grad_norm": 144.79405212402344,
"learning_rate": 9.994883668287352e-05,
"loss": 118.8286,
"step": 35
},
{
"epoch": 0.26010386091668547,
"grad_norm": 121.61618041992188,
"learning_rate": 9.993524968676216e-05,
"loss": 124.1272,
"step": 36
},
{
"epoch": 0.2673289681643712,
"grad_norm": 115.61026000976562,
"learning_rate": 9.99200658396748e-05,
"loss": 133.4779,
"step": 37
},
{
"epoch": 0.2745540754120569,
"grad_norm": 116.54967498779297,
"learning_rate": 9.99032856811954e-05,
"loss": 136.8838,
"step": 38
},
{
"epoch": 0.2817791826597426,
"grad_norm": 119.98362731933594,
"learning_rate": 9.988490980763562e-05,
"loss": 141.997,
"step": 39
},
{
"epoch": 0.2890042899074283,
"grad_norm": 103.40101623535156,
"learning_rate": 9.98649388720136e-05,
"loss": 149.3798,
"step": 40
},
{
"epoch": 0.29622939715511404,
"grad_norm": 78.41800689697266,
"learning_rate": 9.984337358403068e-05,
"loss": 144.1501,
"step": 41
},
{
"epoch": 0.30345450440279975,
"grad_norm": 70.429931640625,
"learning_rate": 9.982021471004624e-05,
"loss": 150.1001,
"step": 42
},
{
"epoch": 0.3106796116504854,
"grad_norm": 68.79240417480469,
"learning_rate": 9.979546307305052e-05,
"loss": 152.7282,
"step": 43
},
{
"epoch": 0.3179047188981711,
"grad_norm": 64.10746765136719,
"learning_rate": 9.976911955263529e-05,
"loss": 150.7592,
"step": 44
},
{
"epoch": 0.32512982614585684,
"grad_norm": 63.97102737426758,
"learning_rate": 9.974118508496258e-05,
"loss": 158.1265,
"step": 45
},
{
"epoch": 0.33235493339354255,
"grad_norm": 59.588539123535156,
"learning_rate": 9.971166066273153e-05,
"loss": 152.0917,
"step": 46
},
{
"epoch": 0.33958004064122826,
"grad_norm": 56.97199249267578,
"learning_rate": 9.9680547335143e-05,
"loss": 153.4449,
"step": 47
},
{
"epoch": 0.346805147888914,
"grad_norm": 60.04453659057617,
"learning_rate": 9.964784620786228e-05,
"loss": 164.4727,
"step": 48
},
{
"epoch": 0.3540302551365997,
"grad_norm": 64.35054016113281,
"learning_rate": 9.961355844297988e-05,
"loss": 171.0691,
"step": 49
},
{
"epoch": 0.3612553623842854,
"grad_norm": 62.75254440307617,
"learning_rate": 9.957768525897023e-05,
"loss": 171.733,
"step": 50
},
{
"epoch": 0.3612553623842854,
"eval_loss": 2.5156970024108887,
"eval_runtime": 0.6517,
"eval_samples_per_second": 76.722,
"eval_steps_per_second": 76.722,
"step": 50
},
{
"epoch": 0.3684804696319711,
"grad_norm": 62.48736572265625,
"learning_rate": 9.954022793064826e-05,
"loss": 169.8293,
"step": 51
},
{
"epoch": 0.37570557687965683,
"grad_norm": 60.11751174926758,
"learning_rate": 9.950118778912423e-05,
"loss": 168.2545,
"step": 52
},
{
"epoch": 0.3829306841273425,
"grad_norm": 72.86707305908203,
"learning_rate": 9.946056622175634e-05,
"loss": 184.524,
"step": 53
},
{
"epoch": 0.3901557913750282,
"grad_norm": 68.04542541503906,
"learning_rate": 9.941836467210152e-05,
"loss": 185.4,
"step": 54
},
{
"epoch": 0.3973808986227139,
"grad_norm": 66.58963012695312,
"learning_rate": 9.937458463986401e-05,
"loss": 181.0175,
"step": 55
},
{
"epoch": 0.40460600587039963,
"grad_norm": 63.77046203613281,
"learning_rate": 9.932922768084218e-05,
"loss": 182.9669,
"step": 56
},
{
"epoch": 0.41183111311808535,
"grad_norm": 64.7295150756836,
"learning_rate": 9.928229540687316e-05,
"loss": 180.1315,
"step": 57
},
{
"epoch": 0.41905622036577106,
"grad_norm": 65.93685913085938,
"learning_rate": 9.923378948577559e-05,
"loss": 187.6352,
"step": 58
},
{
"epoch": 0.4262813276134568,
"grad_norm": 68.34617614746094,
"learning_rate": 9.918371164129037e-05,
"loss": 186.9591,
"step": 59
},
{
"epoch": 0.4335064348611425,
"grad_norm": 66.827880859375,
"learning_rate": 9.913206365301939e-05,
"loss": 187.058,
"step": 60
},
{
"epoch": 0.4407315421088282,
"grad_norm": 76.42950439453125,
"learning_rate": 9.907884735636226e-05,
"loss": 193.3964,
"step": 61
},
{
"epoch": 0.44795664935651386,
"grad_norm": 72.85874938964844,
"learning_rate": 9.902406464245115e-05,
"loss": 181.1469,
"step": 62
},
{
"epoch": 0.45518175660419957,
"grad_norm": 71.87165832519531,
"learning_rate": 9.896771745808349e-05,
"loss": 189.0317,
"step": 63
},
{
"epoch": 0.4624068638518853,
"grad_norm": 89.61693572998047,
"learning_rate": 9.89098078056529e-05,
"loss": 186.9491,
"step": 64
},
{
"epoch": 0.469631971099571,
"grad_norm": 82.98321533203125,
"learning_rate": 9.885033774307798e-05,
"loss": 181.5333,
"step": 65
},
{
"epoch": 0.4768570783472567,
"grad_norm": 85.03607940673828,
"learning_rate": 9.87893093837291e-05,
"loss": 186.9785,
"step": 66
},
{
"epoch": 0.4840821855949424,
"grad_norm": 97.14375305175781,
"learning_rate": 9.872672489635346e-05,
"loss": 191.9914,
"step": 67
},
{
"epoch": 0.49130729284262814,
"grad_norm": 132.39361572265625,
"learning_rate": 9.866258650499787e-05,
"loss": 194.7207,
"step": 68
},
{
"epoch": 0.49853240009031385,
"grad_norm": 79.9813003540039,
"learning_rate": 9.859689648892982e-05,
"loss": 114.7597,
"step": 69
},
{
"epoch": 0.5057575073379995,
"grad_norm": 62.79768371582031,
"learning_rate": 9.852965718255638e-05,
"loss": 117.4539,
"step": 70
},
{
"epoch": 0.5129826145856853,
"grad_norm": 65.06932830810547,
"learning_rate": 9.84608709753414e-05,
"loss": 126.1914,
"step": 71
},
{
"epoch": 0.5202077218333709,
"grad_norm": 63.830989837646484,
"learning_rate": 9.839054031172038e-05,
"loss": 132.7605,
"step": 72
},
{
"epoch": 0.5274328290810567,
"grad_norm": 56.553497314453125,
"learning_rate": 9.831866769101381e-05,
"loss": 127.5648,
"step": 73
},
{
"epoch": 0.5346579363287424,
"grad_norm": 57.750732421875,
"learning_rate": 9.824525566733823e-05,
"loss": 136.0356,
"step": 74
},
{
"epoch": 0.5418830435764281,
"grad_norm": 49.138206481933594,
"learning_rate": 9.817030684951549e-05,
"loss": 143.931,
"step": 75
},
{
"epoch": 0.5418830435764281,
"eval_loss": 2.4383444786071777,
"eval_runtime": 0.6413,
"eval_samples_per_second": 77.973,
"eval_steps_per_second": 77.973,
"step": 75
},
{
"epoch": 0.5491081508241138,
"grad_norm": 47.41360855102539,
"learning_rate": 9.809382390098004e-05,
"loss": 140.3533,
"step": 76
},
{
"epoch": 0.5563332580717995,
"grad_norm": 47.02049255371094,
"learning_rate": 9.801580953968435e-05,
"loss": 145.456,
"step": 77
},
{
"epoch": 0.5635583653194852,
"grad_norm": 46.56816482543945,
"learning_rate": 9.793626653800219e-05,
"loss": 145.1962,
"step": 78
},
{
"epoch": 0.5707834725671709,
"grad_norm": 50.03951644897461,
"learning_rate": 9.785519772263025e-05,
"loss": 149.2638,
"step": 79
},
{
"epoch": 0.5780085798148566,
"grad_norm": 50.39605712890625,
"learning_rate": 9.777260597448753e-05,
"loss": 159.7057,
"step": 80
},
{
"epoch": 0.5852336870625423,
"grad_norm": 47.88210678100586,
"learning_rate": 9.768849422861313e-05,
"loss": 157.102,
"step": 81
},
{
"epoch": 0.5924587943102281,
"grad_norm": 48.747520446777344,
"learning_rate": 9.760286547406186e-05,
"loss": 162.9211,
"step": 82
},
{
"epoch": 0.5996839015579137,
"grad_norm": 47.951412200927734,
"learning_rate": 9.7515722753798e-05,
"loss": 164.9853,
"step": 83
},
{
"epoch": 0.6069090088055995,
"grad_norm": 51.72910690307617,
"learning_rate": 9.74270691645872e-05,
"loss": 164.8267,
"step": 84
},
{
"epoch": 0.6141341160532852,
"grad_norm": 55.5040397644043,
"learning_rate": 9.73369078568864e-05,
"loss": 172.8293,
"step": 85
},
{
"epoch": 0.6213592233009708,
"grad_norm": 57.3427848815918,
"learning_rate": 9.724524203473197e-05,
"loss": 169.605,
"step": 86
},
{
"epoch": 0.6285843305486566,
"grad_norm": 56.78362274169922,
"learning_rate": 9.715207495562573e-05,
"loss": 171.3703,
"step": 87
},
{
"epoch": 0.6358094377963422,
"grad_norm": 61.482643127441406,
"learning_rate": 9.70574099304192e-05,
"loss": 173.9247,
"step": 88
},
{
"epoch": 0.643034545044028,
"grad_norm": 60.386348724365234,
"learning_rate": 9.6961250323196e-05,
"loss": 168.3157,
"step": 89
},
{
"epoch": 0.6502596522917137,
"grad_norm": 64.57029724121094,
"learning_rate": 9.686359955115235e-05,
"loss": 177.178,
"step": 90
},
{
"epoch": 0.6574847595393994,
"grad_norm": 69.96235656738281,
"learning_rate": 9.676446108447545e-05,
"loss": 174.5947,
"step": 91
},
{
"epoch": 0.6647098667870851,
"grad_norm": 63.585506439208984,
"learning_rate": 9.666383844622034e-05,
"loss": 175.7491,
"step": 92
},
{
"epoch": 0.6719349740347709,
"grad_norm": 69.02960205078125,
"learning_rate": 9.656173521218463e-05,
"loss": 178.0728,
"step": 93
},
{
"epoch": 0.6791600812824565,
"grad_norm": 64.00545501708984,
"learning_rate": 9.645815501078142e-05,
"loss": 183.6833,
"step": 94
},
{
"epoch": 0.6863851885301423,
"grad_norm": 65.82809448242188,
"learning_rate": 9.635310152291039e-05,
"loss": 176.3168,
"step": 95
},
{
"epoch": 0.693610295777828,
"grad_norm": 74.30838012695312,
"learning_rate": 9.624657848182693e-05,
"loss": 184.4491,
"step": 96
},
{
"epoch": 0.7008354030255136,
"grad_norm": 80.60615539550781,
"learning_rate": 9.61385896730096e-05,
"loss": 179.8313,
"step": 97
},
{
"epoch": 0.7080605102731994,
"grad_norm": 80.63772583007812,
"learning_rate": 9.602913893402546e-05,
"loss": 188.0501,
"step": 98
},
{
"epoch": 0.715285617520885,
"grad_norm": 84.3320541381836,
"learning_rate": 9.591823015439374e-05,
"loss": 188.2311,
"step": 99
},
{
"epoch": 0.7225107247685708,
"grad_norm": 81.29312896728516,
"learning_rate": 9.580586727544771e-05,
"loss": 175.6401,
"step": 100
},
{
"epoch": 0.7225107247685708,
"eval_loss": 2.393695592880249,
"eval_runtime": 0.6484,
"eval_samples_per_second": 77.118,
"eval_steps_per_second": 77.118,
"step": 100
},
{
"epoch": 0.7297358320162565,
"grad_norm": 97.86168670654297,
"learning_rate": 9.569205429019452e-05,
"loss": 183.8533,
"step": 101
},
{
"epoch": 0.7369609392639422,
"grad_norm": 120.66143035888672,
"learning_rate": 9.557679524317331e-05,
"loss": 183.9207,
"step": 102
},
{
"epoch": 0.7441860465116279,
"grad_norm": 59.1573600769043,
"learning_rate": 9.54600942303115e-05,
"loss": 111.658,
"step": 103
},
{
"epoch": 0.7514111537593137,
"grad_norm": 47.51983642578125,
"learning_rate": 9.534195539877922e-05,
"loss": 116.4757,
"step": 104
},
{
"epoch": 0.7586362610069993,
"grad_norm": 41.60358428955078,
"learning_rate": 9.522238294684203e-05,
"loss": 123.7686,
"step": 105
},
{
"epoch": 0.765861368254685,
"grad_norm": 43.5106201171875,
"learning_rate": 9.510138112371153e-05,
"loss": 127.939,
"step": 106
},
{
"epoch": 0.7730864755023708,
"grad_norm": 50.11954116821289,
"learning_rate": 9.497895422939455e-05,
"loss": 129.1992,
"step": 107
},
{
"epoch": 0.7803115827500564,
"grad_norm": 43.32356262207031,
"learning_rate": 9.485510661454022e-05,
"loss": 136.5778,
"step": 108
},
{
"epoch": 0.7875366899977422,
"grad_norm": 44.33633804321289,
"learning_rate": 9.472984268028544e-05,
"loss": 141.9382,
"step": 109
},
{
"epoch": 0.7947617972454278,
"grad_norm": 40.56188201904297,
"learning_rate": 9.46031668780984e-05,
"loss": 139.9151,
"step": 110
},
{
"epoch": 0.8019869044931136,
"grad_norm": 40.898475646972656,
"learning_rate": 9.44750837096205e-05,
"loss": 141.2808,
"step": 111
},
{
"epoch": 0.8092120117407993,
"grad_norm": 40.37065887451172,
"learning_rate": 9.43455977265062e-05,
"loss": 146.3503,
"step": 112
},
{
"epoch": 0.816437118988485,
"grad_norm": 42.96000289916992,
"learning_rate": 9.421471353026149e-05,
"loss": 151.9168,
"step": 113
},
{
"epoch": 0.8236622262361707,
"grad_norm": 43.85641098022461,
"learning_rate": 9.40824357720802e-05,
"loss": 153.8375,
"step": 114
},
{
"epoch": 0.8308873334838563,
"grad_norm": 47.796295166015625,
"learning_rate": 9.394876915267878e-05,
"loss": 148.2482,
"step": 115
},
{
"epoch": 0.8381124407315421,
"grad_norm": 45.746238708496094,
"learning_rate": 9.381371842212923e-05,
"loss": 157.8461,
"step": 116
},
{
"epoch": 0.8453375479792278,
"grad_norm": 47.07131576538086,
"learning_rate": 9.36772883796903e-05,
"loss": 158.5688,
"step": 117
},
{
"epoch": 0.8525626552269135,
"grad_norm": 47.55768585205078,
"learning_rate": 9.353948387363699e-05,
"loss": 162.7859,
"step": 118
},
{
"epoch": 0.8597877624745992,
"grad_norm": 52.72948455810547,
"learning_rate": 9.340030980108816e-05,
"loss": 162.023,
"step": 119
},
{
"epoch": 0.867012869722285,
"grad_norm": 55.043453216552734,
"learning_rate": 9.325977110783264e-05,
"loss": 169.4957,
"step": 120
},
{
"epoch": 0.8742379769699706,
"grad_norm": 54.188453674316406,
"learning_rate": 9.311787278815328e-05,
"loss": 165.3426,
"step": 121
},
{
"epoch": 0.8814630842176564,
"grad_norm": 58.27094650268555,
"learning_rate": 9.297461988464967e-05,
"loss": 172.4739,
"step": 122
},
{
"epoch": 0.8886881914653421,
"grad_norm": 56.99961853027344,
"learning_rate": 9.28300174880588e-05,
"loss": 162.9698,
"step": 123
},
{
"epoch": 0.8959132987130277,
"grad_norm": 61.62599182128906,
"learning_rate": 9.268407073707426e-05,
"loss": 172.4337,
"step": 124
},
{
"epoch": 0.9031384059607135,
"grad_norm": 61.69138717651367,
"learning_rate": 9.253678481816351e-05,
"loss": 171.1664,
"step": 125
},
{
"epoch": 0.9031384059607135,
"eval_loss": 2.360010862350464,
"eval_runtime": 0.6436,
"eval_samples_per_second": 77.684,
"eval_steps_per_second": 77.684,
"step": 125
},
{
"epoch": 0.9103635132083991,
"grad_norm": 69.07987976074219,
"learning_rate": 9.238816496538369e-05,
"loss": 172.1746,
"step": 126
},
{
"epoch": 0.9175886204560849,
"grad_norm": 68.58795928955078,
"learning_rate": 9.223821646019553e-05,
"loss": 173.2199,
"step": 127
},
{
"epoch": 0.9248137277037706,
"grad_norm": 71.02379608154297,
"learning_rate": 9.208694463127569e-05,
"loss": 181.6123,
"step": 128
},
{
"epoch": 0.9320388349514563,
"grad_norm": 72.92750549316406,
"learning_rate": 9.193435485432745e-05,
"loss": 177.9379,
"step": 129
},
{
"epoch": 0.939263942199142,
"grad_norm": 71.89237213134766,
"learning_rate": 9.178045255188955e-05,
"loss": 179.2086,
"step": 130
},
{
"epoch": 0.9464890494468278,
"grad_norm": 75.93486785888672,
"learning_rate": 9.162524319314366e-05,
"loss": 176.8553,
"step": 131
},
{
"epoch": 0.9537141566945134,
"grad_norm": 74.88329315185547,
"learning_rate": 9.146873229371984e-05,
"loss": 186.3932,
"step": 132
},
{
"epoch": 0.9609392639421992,
"grad_norm": 74.99288940429688,
"learning_rate": 9.131092541550072e-05,
"loss": 181.7378,
"step": 133
},
{
"epoch": 0.9681643711898849,
"grad_norm": 83.1282958984375,
"learning_rate": 9.115182816642369e-05,
"loss": 182.5549,
"step": 134
},
{
"epoch": 0.9753894784375705,
"grad_norm": 88.4671401977539,
"learning_rate": 9.099144620028166e-05,
"loss": 177.1656,
"step": 135
},
{
"epoch": 0.9826145856852563,
"grad_norm": 138.13075256347656,
"learning_rate": 9.082978521652222e-05,
"loss": 179.7221,
"step": 136
},
{
"epoch": 0.9898396929329419,
"grad_norm": 57.780609130859375,
"learning_rate": 9.066685096004499e-05,
"loss": 136.3332,
"step": 137
},
{
"epoch": 0.9970648001806277,
"grad_norm": 62.77166748046875,
"learning_rate": 9.050264922099755e-05,
"loss": 161.6082,
"step": 138
},
{
"epoch": 1.0042899074283134,
"grad_norm": 58.23940658569336,
"learning_rate": 9.033718583456961e-05,
"loss": 142.1509,
"step": 139
},
{
"epoch": 1.011515014675999,
"grad_norm": 44.70951461791992,
"learning_rate": 9.017046668078572e-05,
"loss": 111.9812,
"step": 140
},
{
"epoch": 1.018740121923685,
"grad_norm": 32.90129089355469,
"learning_rate": 9.000249768429621e-05,
"loss": 121.872,
"step": 141
},
{
"epoch": 1.0259652291713706,
"grad_norm": 33.142757415771484,
"learning_rate": 8.983328481416675e-05,
"loss": 121.8843,
"step": 142
},
{
"epoch": 1.0331903364190562,
"grad_norm": 37.793434143066406,
"learning_rate": 8.966283408366621e-05,
"loss": 129.8525,
"step": 143
},
{
"epoch": 1.0404154436667419,
"grad_norm": 35.80763244628906,
"learning_rate": 8.949115155005289e-05,
"loss": 131.2904,
"step": 144
},
{
"epoch": 1.0476405509144275,
"grad_norm": 42.35646438598633,
"learning_rate": 8.931824331435937e-05,
"loss": 141.2367,
"step": 145
},
{
"epoch": 1.0548656581621134,
"grad_norm": 39.78588104248047,
"learning_rate": 8.914411552117559e-05,
"loss": 136.225,
"step": 146
},
{
"epoch": 1.062090765409799,
"grad_norm": 38.78228759765625,
"learning_rate": 8.896877435843063e-05,
"loss": 140.3088,
"step": 147
},
{
"epoch": 1.0693158726574847,
"grad_norm": 39.07134246826172,
"learning_rate": 8.879222605717268e-05,
"loss": 143.1922,
"step": 148
},
{
"epoch": 1.0765409799051704,
"grad_norm": 41.59137725830078,
"learning_rate": 8.861447689134768e-05,
"loss": 150.6414,
"step": 149
},
{
"epoch": 1.0837660871528563,
"grad_norm": 42.95624923706055,
"learning_rate": 8.843553317757632e-05,
"loss": 147.6945,
"step": 150
},
{
"epoch": 1.0837660871528563,
"eval_loss": 2.3478291034698486,
"eval_runtime": 0.6442,
"eval_samples_per_second": 77.616,
"eval_steps_per_second": 77.616,
"step": 150
},
{
"epoch": 1.090991194400542,
"grad_norm": 42.467933654785156,
"learning_rate": 8.825540127492967e-05,
"loss": 140.8872,
"step": 151
},
{
"epoch": 1.0982163016482276,
"grad_norm": 42.057655334472656,
"learning_rate": 8.807408758470302e-05,
"loss": 153.8803,
"step": 152
},
{
"epoch": 1.1054414088959132,
"grad_norm": 43.9022102355957,
"learning_rate": 8.789159855018858e-05,
"loss": 154.978,
"step": 153
},
{
"epoch": 1.112666516143599,
"grad_norm": 48.20111846923828,
"learning_rate": 8.770794065644639e-05,
"loss": 161.4774,
"step": 154
},
{
"epoch": 1.1198916233912848,
"grad_norm": 53.01012420654297,
"learning_rate": 8.752312043007396e-05,
"loss": 166.7152,
"step": 155
},
{
"epoch": 1.1271167306389704,
"grad_norm": 54.39327621459961,
"learning_rate": 8.73371444389742e-05,
"loss": 162.619,
"step": 156
},
{
"epoch": 1.134341837886656,
"grad_norm": 54.17763900756836,
"learning_rate": 8.715001929212214e-05,
"loss": 169.5686,
"step": 157
},
{
"epoch": 1.1415669451343418,
"grad_norm": 51.94620895385742,
"learning_rate": 8.696175163933004e-05,
"loss": 165.3075,
"step": 158
},
{
"epoch": 1.1487920523820276,
"grad_norm": 57.945369720458984,
"learning_rate": 8.677234817101101e-05,
"loss": 168.0594,
"step": 159
},
{
"epoch": 1.1560171596297133,
"grad_norm": 59.11105728149414,
"learning_rate": 8.658181561794137e-05,
"loss": 164.0512,
"step": 160
},
{
"epoch": 1.163242266877399,
"grad_norm": 63.255611419677734,
"learning_rate": 8.639016075102136e-05,
"loss": 167.9124,
"step": 161
},
{
"epoch": 1.1704673741250846,
"grad_norm": 62.79253005981445,
"learning_rate": 8.619739038103456e-05,
"loss": 168.2611,
"step": 162
},
{
"epoch": 1.1776924813727705,
"grad_norm": 66.78952026367188,
"learning_rate": 8.600351135840589e-05,
"loss": 167.1303,
"step": 163
},
{
"epoch": 1.1849175886204562,
"grad_norm": 65.6444320678711,
"learning_rate": 8.580853057295813e-05,
"loss": 174.8452,
"step": 164
},
{
"epoch": 1.1921426958681418,
"grad_norm": 68.41908264160156,
"learning_rate": 8.561245495366706e-05,
"loss": 173.017,
"step": 165
},
{
"epoch": 1.1993678031158275,
"grad_norm": 72.44266510009766,
"learning_rate": 8.541529146841526e-05,
"loss": 179.2277,
"step": 166
},
{
"epoch": 1.2065929103635131,
"grad_norm": 79.08995819091797,
"learning_rate": 8.521704712374453e-05,
"loss": 162.5972,
"step": 167
},
{
"epoch": 1.213818017611199,
"grad_norm": 81.43531799316406,
"learning_rate": 8.50177289646068e-05,
"loss": 175.9867,
"step": 168
},
{
"epoch": 1.2210431248588847,
"grad_norm": 79.34605407714844,
"learning_rate": 8.48173440741139e-05,
"loss": 178.2415,
"step": 169
},
{
"epoch": 1.2282682321065703,
"grad_norm": 81.79105377197266,
"learning_rate": 8.46158995732857e-05,
"loss": 178.3049,
"step": 170
},
{
"epoch": 1.235493339354256,
"grad_norm": 85.38658142089844,
"learning_rate": 8.44134026207972e-05,
"loss": 174.3011,
"step": 171
},
{
"epoch": 1.2427184466019416,
"grad_norm": 104.00491333007812,
"learning_rate": 8.420986041272407e-05,
"loss": 182.388,
"step": 172
},
{
"epoch": 1.2499435538496275,
"grad_norm": 105.1968002319336,
"learning_rate": 8.400528018228688e-05,
"loss": 142.8601,
"step": 173
},
{
"epoch": 1.2571686610973132,
"grad_norm": 91.36693572998047,
"learning_rate": 8.379966919959416e-05,
"loss": 112.5672,
"step": 174
},
{
"epoch": 1.2643937683449988,
"grad_norm": 61.239349365234375,
"learning_rate": 8.359303477138393e-05,
"loss": 117.836,
"step": 175
},
{
"epoch": 1.2643937683449988,
"eval_loss": 2.333247423171997,
"eval_runtime": 0.6482,
"eval_samples_per_second": 77.14,
"eval_steps_per_second": 77.14,
"step": 175
},
{
"epoch": 1.2716188755926847,
"grad_norm": 44.947166442871094,
"learning_rate": 8.338538424076411e-05,
"loss": 119.2028,
"step": 176
},
{
"epoch": 1.2788439828403702,
"grad_norm": 51.20450210571289,
"learning_rate": 8.317672498695162e-05,
"loss": 126.114,
"step": 177
},
{
"epoch": 1.286069090088056,
"grad_norm": 45.0206184387207,
"learning_rate": 8.296706442500998e-05,
"loss": 124.9417,
"step": 178
},
{
"epoch": 1.2932941973357417,
"grad_norm": 47.35272216796875,
"learning_rate": 8.275641000558598e-05,
"loss": 136.7816,
"step": 179
},
{
"epoch": 1.3005193045834273,
"grad_norm": 42.55514144897461,
"learning_rate": 8.254476921464484e-05,
"loss": 134.4095,
"step": 180
},
{
"epoch": 1.3077444118311132,
"grad_norm": 41.750221252441406,
"learning_rate": 8.233214957320411e-05,
"loss": 137.0252,
"step": 181
},
{
"epoch": 1.3149695190787989,
"grad_norm": 44.95912170410156,
"learning_rate": 8.211855863706654e-05,
"loss": 143.0184,
"step": 182
},
{
"epoch": 1.3221946263264845,
"grad_norm": 46.76531219482422,
"learning_rate": 8.190400399655147e-05,
"loss": 145.2914,
"step": 183
},
{
"epoch": 1.3294197335741702,
"grad_norm": 47.532100677490234,
"learning_rate": 8.168849327622513e-05,
"loss": 149.4524,
"step": 184
},
{
"epoch": 1.3366448408218559,
"grad_norm": 47.3936767578125,
"learning_rate": 8.147203413462967e-05,
"loss": 144.6819,
"step": 185
},
{
"epoch": 1.3438699480695417,
"grad_norm": 45.35457229614258,
"learning_rate": 8.125463426401101e-05,
"loss": 150.504,
"step": 186
},
{
"epoch": 1.3510950553172274,
"grad_norm": 48.21922302246094,
"learning_rate": 8.103630139004553e-05,
"loss": 155.5509,
"step": 187
},
{
"epoch": 1.358320162564913,
"grad_norm": 50.7261962890625,
"learning_rate": 8.08170432715654e-05,
"loss": 160.2927,
"step": 188
},
{
"epoch": 1.3655452698125987,
"grad_norm": 52.04977035522461,
"learning_rate": 8.059686770028303e-05,
"loss": 154.3667,
"step": 189
},
{
"epoch": 1.3727703770602844,
"grad_norm": 52.68675994873047,
"learning_rate": 8.037578250051399e-05,
"loss": 155.9036,
"step": 190
},
{
"epoch": 1.3799954843079703,
"grad_norm": 54.34752655029297,
"learning_rate": 8.015379552889913e-05,
"loss": 156.5415,
"step": 191
},
{
"epoch": 1.387220591555656,
"grad_norm": 51.08332443237305,
"learning_rate": 7.993091467412527e-05,
"loss": 164.3319,
"step": 192
},
{
"epoch": 1.3944456988033416,
"grad_norm": 57.97858428955078,
"learning_rate": 7.970714785664492e-05,
"loss": 170.2659,
"step": 193
},
{
"epoch": 1.4016708060510275,
"grad_norm": 63.892784118652344,
"learning_rate": 7.948250302839476e-05,
"loss": 169.5753,
"step": 194
},
{
"epoch": 1.408895913298713,
"grad_norm": 62.205596923828125,
"learning_rate": 7.92569881725131e-05,
"loss": 170.2137,
"step": 195
},
{
"epoch": 1.4161210205463988,
"grad_norm": 59.26729202270508,
"learning_rate": 7.903061130305616e-05,
"loss": 158.9264,
"step": 196
},
{
"epoch": 1.4233461277940844,
"grad_norm": 60.39152908325195,
"learning_rate": 7.880338046471331e-05,
"loss": 169.5714,
"step": 197
},
{
"epoch": 1.43057123504177,
"grad_norm": 71.50019073486328,
"learning_rate": 7.857530373252116e-05,
"loss": 171.8552,
"step": 198
},
{
"epoch": 1.437796342289456,
"grad_norm": 68.2151870727539,
"learning_rate": 7.83463892115766e-05,
"loss": 168.0481,
"step": 199
},
{
"epoch": 1.4450214495371416,
"grad_norm": 79.76618957519531,
"learning_rate": 7.811664503674875e-05,
"loss": 170.3496,
"step": 200
},
{
"epoch": 1.4450214495371416,
"eval_loss": 2.3155527114868164,
"eval_runtime": 0.6439,
"eval_samples_per_second": 77.647,
"eval_steps_per_second": 77.647,
"step": 200
},
{
"epoch": 1.4522465567848273,
"grad_norm": 78.41180419921875,
"learning_rate": 7.788607937238995e-05,
"loss": 184.1455,
"step": 201
},
{
"epoch": 1.459471664032513,
"grad_norm": 73.47679138183594,
"learning_rate": 7.765470041204553e-05,
"loss": 168.6345,
"step": 202
},
{
"epoch": 1.4666967712801986,
"grad_norm": 84.49844360351562,
"learning_rate": 7.742251637816274e-05,
"loss": 179.5529,
"step": 203
},
{
"epoch": 1.4739218785278845,
"grad_norm": 83.9566879272461,
"learning_rate": 7.718953552179841e-05,
"loss": 175.5488,
"step": 204
},
{
"epoch": 1.4811469857755701,
"grad_norm": 85.65151977539062,
"learning_rate": 7.695576612232591e-05,
"loss": 174.084,
"step": 205
},
{
"epoch": 1.4883720930232558,
"grad_norm": 113.66490936279297,
"learning_rate": 7.67212164871408e-05,
"loss": 175.5837,
"step": 206
},
{
"epoch": 1.4955972002709415,
"grad_norm": 89.42914581298828,
"learning_rate": 7.64858949513656e-05,
"loss": 139.4319,
"step": 207
},
{
"epoch": 1.502822307518627,
"grad_norm": 38.07080078125,
"learning_rate": 7.624980987755375e-05,
"loss": 111.8192,
"step": 208
},
{
"epoch": 1.510047414766313,
"grad_norm": 33.71562576293945,
"learning_rate": 7.601296965539225e-05,
"loss": 116.1872,
"step": 209
},
{
"epoch": 1.5172725220139986,
"grad_norm": 42.460052490234375,
"learning_rate": 7.577538270140358e-05,
"loss": 123.3414,
"step": 210
},
{
"epoch": 1.5244976292616843,
"grad_norm": 39.355186462402344,
"learning_rate": 7.553705745864661e-05,
"loss": 127.2322,
"step": 211
},
{
"epoch": 1.5317227365093702,
"grad_norm": 39.048091888427734,
"learning_rate": 7.529800239641664e-05,
"loss": 126.9881,
"step": 212
},
{
"epoch": 1.5389478437570556,
"grad_norm": 39.43886184692383,
"learning_rate": 7.505822600994424e-05,
"loss": 135.6013,
"step": 213
},
{
"epoch": 1.5461729510047415,
"grad_norm": 37.615814208984375,
"learning_rate": 7.481773682009356e-05,
"loss": 130.4914,
"step": 214
},
{
"epoch": 1.5533980582524272,
"grad_norm": 40.4903450012207,
"learning_rate": 7.457654337305941e-05,
"loss": 141.4838,
"step": 215
},
{
"epoch": 1.5606231655001128,
"grad_norm": 36.969791412353516,
"learning_rate": 7.433465424006356e-05,
"loss": 136.31,
"step": 216
},
{
"epoch": 1.5678482727477987,
"grad_norm": 41.90666198730469,
"learning_rate": 7.40920780170502e-05,
"loss": 140.6337,
"step": 217
},
{
"epoch": 1.5750733799954844,
"grad_norm": 43.43820571899414,
"learning_rate": 7.384882332438046e-05,
"loss": 149.4489,
"step": 218
},
{
"epoch": 1.58229848724317,
"grad_norm": 45.44638442993164,
"learning_rate": 7.360489880652599e-05,
"loss": 144.4296,
"step": 219
},
{
"epoch": 1.5895235944908557,
"grad_norm": 45.75027847290039,
"learning_rate": 7.336031313176187e-05,
"loss": 151.5403,
"step": 220
},
{
"epoch": 1.5967487017385413,
"grad_norm": 45.903900146484375,
"learning_rate": 7.311507499185849e-05,
"loss": 148.2643,
"step": 221
},
{
"epoch": 1.6039738089862272,
"grad_norm": 52.591922760009766,
"learning_rate": 7.286919310177274e-05,
"loss": 157.8545,
"step": 222
},
{
"epoch": 1.6111989162339129,
"grad_norm": 52.4876708984375,
"learning_rate": 7.262267619933825e-05,
"loss": 159.6404,
"step": 223
},
{
"epoch": 1.6184240234815985,
"grad_norm": 54.48468780517578,
"learning_rate": 7.23755330449549e-05,
"loss": 166.2668,
"step": 224
},
{
"epoch": 1.6256491307292844,
"grad_norm": 52.32865524291992,
"learning_rate": 7.212777242127752e-05,
"loss": 166.8458,
"step": 225
},
{
"epoch": 1.6256491307292844,
"eval_loss": 2.3025312423706055,
"eval_runtime": 0.6658,
"eval_samples_per_second": 75.1,
"eval_steps_per_second": 75.1,
"step": 225
},
{
"epoch": 1.6328742379769698,
"grad_norm": 55.27311325073242,
"learning_rate": 7.187940313290375e-05,
"loss": 166.1355,
"step": 226
},
{
"epoch": 1.6400993452246557,
"grad_norm": 59.219242095947266,
"learning_rate": 7.163043400606118e-05,
"loss": 169.2431,
"step": 227
},
{
"epoch": 1.6473244524723414,
"grad_norm": 58.01912307739258,
"learning_rate": 7.13808738882937e-05,
"loss": 163.1232,
"step": 228
},
{
"epoch": 1.654549559720027,
"grad_norm": 59.8082275390625,
"learning_rate": 7.113073164814705e-05,
"loss": 162.4904,
"step": 229
},
{
"epoch": 1.661774666967713,
"grad_norm": 65.7035140991211,
"learning_rate": 7.088001617485369e-05,
"loss": 168.8548,
"step": 230
},
{
"epoch": 1.6689997742153984,
"grad_norm": 66.40797424316406,
"learning_rate": 7.062873637801692e-05,
"loss": 175.5675,
"step": 231
},
{
"epoch": 1.6762248814630842,
"grad_norm": 69.01406860351562,
"learning_rate": 7.037690118729421e-05,
"loss": 170.8191,
"step": 232
},
{
"epoch": 1.68344998871077,
"grad_norm": 75.17335510253906,
"learning_rate": 7.012451955207993e-05,
"loss": 170.9042,
"step": 233
},
{
"epoch": 1.6906750959584556,
"grad_norm": 75.20256805419922,
"learning_rate": 6.987160044118729e-05,
"loss": 173.0992,
"step": 234
},
{
"epoch": 1.6979002032061414,
"grad_norm": 79.28140258789062,
"learning_rate": 6.961815284252958e-05,
"loss": 173.8614,
"step": 235
},
{
"epoch": 1.705125310453827,
"grad_norm": 76.99756622314453,
"learning_rate": 6.936418576280083e-05,
"loss": 181.1917,
"step": 236
},
{
"epoch": 1.7123504177015128,
"grad_norm": 81.97278594970703,
"learning_rate": 6.910970822715577e-05,
"loss": 180.226,
"step": 237
},
{
"epoch": 1.7195755249491986,
"grad_norm": 87.5104751586914,
"learning_rate": 6.885472927888898e-05,
"loss": 172.1478,
"step": 238
},
{
"epoch": 1.726800632196884,
"grad_norm": 95.62020874023438,
"learning_rate": 6.859925797911362e-05,
"loss": 169.1054,
"step": 239
},
{
"epoch": 1.73402573944457,
"grad_norm": 116.52254486083984,
"learning_rate": 6.83433034064394e-05,
"loss": 171.785,
"step": 240
},
{
"epoch": 1.7412508466922556,
"grad_norm": 90.70625305175781,
"learning_rate": 6.808687465664996e-05,
"loss": 134.6553,
"step": 241
},
{
"epoch": 1.7484759539399413,
"grad_norm": 60.01106643676758,
"learning_rate": 6.782998084237966e-05,
"loss": 110.2753,
"step": 242
},
{
"epoch": 1.7557010611876271,
"grad_norm": 43.751258850097656,
"learning_rate": 6.757263109278972e-05,
"loss": 115.0716,
"step": 243
},
{
"epoch": 1.7629261684353126,
"grad_norm": 31.402307510375977,
"learning_rate": 6.731483455324374e-05,
"loss": 121.6901,
"step": 244
},
{
"epoch": 1.7701512756829985,
"grad_norm": 34.220619201660156,
"learning_rate": 6.705660038498282e-05,
"loss": 125.6248,
"step": 245
},
{
"epoch": 1.7773763829306841,
"grad_norm": 36.18199157714844,
"learning_rate": 6.679793776479994e-05,
"loss": 124.9633,
"step": 246
},
{
"epoch": 1.7846014901783698,
"grad_norm": 38.51054763793945,
"learning_rate": 6.653885588471386e-05,
"loss": 134.9891,
"step": 247
},
{
"epoch": 1.7918265974260557,
"grad_norm": 38.865604400634766,
"learning_rate": 6.627936395164243e-05,
"loss": 137.1066,
"step": 248
},
{
"epoch": 1.799051704673741,
"grad_norm": 39.14519500732422,
"learning_rate": 6.601947118707545e-05,
"loss": 139.0853,
"step": 249
},
{
"epoch": 1.806276811921427,
"grad_norm": 43.307373046875,
"learning_rate": 6.575918682674695e-05,
"loss": 146.7656,
"step": 250
},
{
"epoch": 1.806276811921427,
"eval_loss": 2.2963333129882812,
"eval_runtime": 0.6468,
"eval_samples_per_second": 77.308,
"eval_steps_per_second": 77.308,
"step": 250
},
{
"epoch": 1.8135019191691126,
"grad_norm": 39.974761962890625,
"learning_rate": 6.549852012030699e-05,
"loss": 140.9642,
"step": 251
},
{
"epoch": 1.8207270264167983,
"grad_norm": 40.40039825439453,
"learning_rate": 6.523748033099296e-05,
"loss": 144.6558,
"step": 252
},
{
"epoch": 1.8279521336644842,
"grad_norm": 44.24665832519531,
"learning_rate": 6.497607673530033e-05,
"loss": 148.5241,
"step": 253
},
{
"epoch": 1.8351772409121698,
"grad_norm": 43.769508361816406,
"learning_rate": 6.47143186226532e-05,
"loss": 146.4435,
"step": 254
},
{
"epoch": 1.8424023481598555,
"grad_norm": 46.50810623168945,
"learning_rate": 6.445221529507384e-05,
"loss": 155.2083,
"step": 255
},
{
"epoch": 1.8496274554075414,
"grad_norm": 49.342918395996094,
"learning_rate": 6.418977606685244e-05,
"loss": 150.5372,
"step": 256
},
{
"epoch": 1.8568525626552268,
"grad_norm": 46.81935119628906,
"learning_rate": 6.392701026421602e-05,
"loss": 153.1862,
"step": 257
},
{
"epoch": 1.8640776699029127,
"grad_norm": 54.023563385009766,
"learning_rate": 6.366392722499689e-05,
"loss": 160.635,
"step": 258
},
{
"epoch": 1.8713027771505983,
"grad_norm": 55.154781341552734,
"learning_rate": 6.340053629830097e-05,
"loss": 158.005,
"step": 259
},
{
"epoch": 1.878527884398284,
"grad_norm": 55.1218376159668,
"learning_rate": 6.313684684417547e-05,
"loss": 159.7618,
"step": 260
},
{
"epoch": 1.8857529916459699,
"grad_norm": 56.447635650634766,
"learning_rate": 6.287286823327627e-05,
"loss": 164.9144,
"step": 261
},
{
"epoch": 1.8929780988936553,
"grad_norm": 56.978450775146484,
"learning_rate": 6.260860984653495e-05,
"loss": 163.56,
"step": 262
},
{
"epoch": 1.9002032061413412,
"grad_norm": 63.379634857177734,
"learning_rate": 6.234408107482537e-05,
"loss": 172.3597,
"step": 263
},
{
"epoch": 1.9074283133890269,
"grad_norm": 62.546836853027344,
"learning_rate": 6.207929131863004e-05,
"loss": 169.4815,
"step": 264
},
{
"epoch": 1.9146534206367125,
"grad_norm": 62.5031852722168,
"learning_rate": 6.181424998770595e-05,
"loss": 172.2419,
"step": 265
},
{
"epoch": 1.9218785278843984,
"grad_norm": 69.73976135253906,
"learning_rate": 6.154896650075027e-05,
"loss": 172.6031,
"step": 266
},
{
"epoch": 1.9291036351320838,
"grad_norm": 72.4342269897461,
"learning_rate": 6.128345028506553e-05,
"loss": 176.2277,
"step": 267
},
{
"epoch": 1.9363287423797697,
"grad_norm": 72.21240234375,
"learning_rate": 6.1017710776224744e-05,
"loss": 170.4608,
"step": 268
},
{
"epoch": 1.9435538496274554,
"grad_norm": 74.69300079345703,
"learning_rate": 6.0751757417736e-05,
"loss": 169.2778,
"step": 269
},
{
"epoch": 1.950778956875141,
"grad_norm": 78.29647064208984,
"learning_rate": 6.048559966070693e-05,
"loss": 170.52,
"step": 270
},
{
"epoch": 1.958004064122827,
"grad_norm": 78.61617279052734,
"learning_rate": 6.0219246963508746e-05,
"loss": 173.9382,
"step": 271
},
{
"epoch": 1.9652291713705126,
"grad_norm": 88.84861755371094,
"learning_rate": 5.995270879144027e-05,
"loss": 176.2962,
"step": 272
},
{
"epoch": 1.9724542786181982,
"grad_norm": 103.87718963623047,
"learning_rate": 5.968599461639144e-05,
"loss": 174.6413,
"step": 273
},
{
"epoch": 1.979679385865884,
"grad_norm": 103.3280258178711,
"learning_rate": 5.94191139165068e-05,
"loss": 173.1013,
"step": 274
},
{
"epoch": 1.9869044931135695,
"grad_norm": 91.55598449707031,
"learning_rate": 5.9152076175848594e-05,
"loss": 142.9728,
"step": 275
},
{
"epoch": 1.9869044931135695,
"eval_loss": 2.2794442176818848,
"eval_runtime": 0.6485,
"eval_samples_per_second": 77.095,
"eval_steps_per_second": 77.095,
"step": 275
},
{
"epoch": 1.9941296003612554,
"grad_norm": 53.11991500854492,
"learning_rate": 5.888489088405983e-05,
"loss": 154.9475,
"step": 276
},
{
"epoch": 2.001354707608941,
"grad_norm": 75.6795425415039,
"learning_rate": 5.861756753602694e-05,
"loss": 155.098,
"step": 277
},
{
"epoch": 2.0085798148566267,
"grad_norm": 34.77781677246094,
"learning_rate": 5.835011563154249e-05,
"loss": 108.0842,
"step": 278
},
{
"epoch": 2.0158049221043126,
"grad_norm": 32.095428466796875,
"learning_rate": 5.8082544674967445e-05,
"loss": 110.3337,
"step": 279
},
{
"epoch": 2.023030029351998,
"grad_norm": 36.59615707397461,
"learning_rate": 5.7814864174893536e-05,
"loss": 117.6389,
"step": 280
},
{
"epoch": 2.030255136599684,
"grad_norm": 36.38612365722656,
"learning_rate": 5.754708364380531e-05,
"loss": 125.0564,
"step": 281
},
{
"epoch": 2.03748024384737,
"grad_norm": 35.21025848388672,
"learning_rate": 5.727921259774208e-05,
"loss": 123.1134,
"step": 282
},
{
"epoch": 2.0447053510950552,
"grad_norm": 35.71826171875,
"learning_rate": 5.7011260555959736e-05,
"loss": 131.763,
"step": 283
},
{
"epoch": 2.051930458342741,
"grad_norm": 36.14569091796875,
"learning_rate": 5.674323704059255e-05,
"loss": 130.8396,
"step": 284
},
{
"epoch": 2.0591555655904266,
"grad_norm": 38.5828742980957,
"learning_rate": 5.647515157631467e-05,
"loss": 136.589,
"step": 285
},
{
"epoch": 2.0663806728381124,
"grad_norm": 40.066932678222656,
"learning_rate": 5.6207013690001734e-05,
"loss": 142.8923,
"step": 286
},
{
"epoch": 2.0736057800857983,
"grad_norm": 40.28878402709961,
"learning_rate": 5.593883291039227e-05,
"loss": 144.453,
"step": 287
},
{
"epoch": 2.0808308873334838,
"grad_norm": 42.93016815185547,
"learning_rate": 5.5670618767749116e-05,
"loss": 141.7153,
"step": 288
},
{
"epoch": 2.0880559945811696,
"grad_norm": 43.676109313964844,
"learning_rate": 5.5402380793520714e-05,
"loss": 145.1744,
"step": 289
},
{
"epoch": 2.095281101828855,
"grad_norm": 43.66289138793945,
"learning_rate": 5.513412852000239e-05,
"loss": 148.1503,
"step": 290
},
{
"epoch": 2.102506209076541,
"grad_norm": 45.55429458618164,
"learning_rate": 5.486587147999762e-05,
"loss": 145.9116,
"step": 291
},
{
"epoch": 2.109731316324227,
"grad_norm": 45.915687561035156,
"learning_rate": 5.459761920647931e-05,
"loss": 151.4164,
"step": 292
},
{
"epoch": 2.1169564235719123,
"grad_norm": 49.95574188232422,
"learning_rate": 5.4329381232250895e-05,
"loss": 157.1364,
"step": 293
},
{
"epoch": 2.124181530819598,
"grad_norm": 50.951595306396484,
"learning_rate": 5.406116708960776e-05,
"loss": 155.8816,
"step": 294
},
{
"epoch": 2.1314066380672836,
"grad_norm": 53.455631256103516,
"learning_rate": 5.379298630999828e-05,
"loss": 159.1212,
"step": 295
},
{
"epoch": 2.1386317453149695,
"grad_norm": 55.12257766723633,
"learning_rate": 5.3524848423685356e-05,
"loss": 159.7997,
"step": 296
},
{
"epoch": 2.1458568525626553,
"grad_norm": 55.63461685180664,
"learning_rate": 5.325676295940746e-05,
"loss": 159.3888,
"step": 297
},
{
"epoch": 2.153081959810341,
"grad_norm": 58.48078155517578,
"learning_rate": 5.298873944404026e-05,
"loss": 166.7546,
"step": 298
},
{
"epoch": 2.1603070670580267,
"grad_norm": 59.48677444458008,
"learning_rate": 5.2720787402257935e-05,
"loss": 169.3197,
"step": 299
},
{
"epoch": 2.1675321743057125,
"grad_norm": 60.309879302978516,
"learning_rate": 5.245291635619469e-05,
"loss": 163.7783,
"step": 300
},
{
"epoch": 2.1675321743057125,
"eval_loss": 2.2782297134399414,
"eval_runtime": 0.6446,
"eval_samples_per_second": 77.571,
"eval_steps_per_second": 77.571,
"step": 300
},
{
"epoch": 2.174757281553398,
"grad_norm": 61.1097526550293,
"learning_rate": 5.218513582510648e-05,
"loss": 167.5735,
"step": 301
},
{
"epoch": 2.181982388801084,
"grad_norm": 64.91517639160156,
"learning_rate": 5.191745532503257e-05,
"loss": 165.7943,
"step": 302
},
{
"epoch": 2.1892074960487693,
"grad_norm": 66.6432113647461,
"learning_rate": 5.1649884368457534e-05,
"loss": 168.6365,
"step": 303
},
{
"epoch": 2.196432603296455,
"grad_norm": 72.80828857421875,
"learning_rate": 5.1382432463973077e-05,
"loss": 167.3139,
"step": 304
},
{
"epoch": 2.203657710544141,
"grad_norm": 69.28011322021484,
"learning_rate": 5.1115109115940195e-05,
"loss": 161.4875,
"step": 305
},
{
"epoch": 2.2108828177918265,
"grad_norm": 77.29905700683594,
"learning_rate": 5.0847923824151424e-05,
"loss": 172.7222,
"step": 306
},
{
"epoch": 2.2181079250395124,
"grad_norm": 81.0169448852539,
"learning_rate": 5.058088608349323e-05,
"loss": 174.758,
"step": 307
},
{
"epoch": 2.225333032287198,
"grad_norm": 84.8740005493164,
"learning_rate": 5.031400538360858e-05,
"loss": 168.4707,
"step": 308
},
{
"epoch": 2.2325581395348837,
"grad_norm": 91.66473388671875,
"learning_rate": 5.004729120855973e-05,
"loss": 169.2647,
"step": 309
},
{
"epoch": 2.2397832467825696,
"grad_norm": 105.14354705810547,
"learning_rate": 4.9780753036491265e-05,
"loss": 176.2409,
"step": 310
},
{
"epoch": 2.247008354030255,
"grad_norm": 112.60404205322266,
"learning_rate": 4.9514400339293075e-05,
"loss": 154.8602,
"step": 311
},
{
"epoch": 2.254233461277941,
"grad_norm": 56.13935470581055,
"learning_rate": 4.9248242582264e-05,
"loss": 105.6355,
"step": 312
},
{
"epoch": 2.2614585685256268,
"grad_norm": 47.51068115234375,
"learning_rate": 4.898228922377526e-05,
"loss": 110.0799,
"step": 313
},
{
"epoch": 2.268683675773312,
"grad_norm": 33.776058197021484,
"learning_rate": 4.87165497149345e-05,
"loss": 118.4811,
"step": 314
},
{
"epoch": 2.275908783020998,
"grad_norm": 37.834251403808594,
"learning_rate": 4.8451033499249755e-05,
"loss": 123.3947,
"step": 315
},
{
"epoch": 2.2831338902686835,
"grad_norm": 36.37396240234375,
"learning_rate": 4.8185750012294065e-05,
"loss": 123.4409,
"step": 316
},
{
"epoch": 2.2903589975163694,
"grad_norm": 36.17446517944336,
"learning_rate": 4.7920708681369964e-05,
"loss": 132.076,
"step": 317
},
{
"epoch": 2.2975841047640553,
"grad_norm": 38.38850784301758,
"learning_rate": 4.765591892517464e-05,
"loss": 134.8902,
"step": 318
},
{
"epoch": 2.3048092120117407,
"grad_norm": 37.74662399291992,
"learning_rate": 4.739139015346508e-05,
"loss": 135.0242,
"step": 319
},
{
"epoch": 2.3120343192594266,
"grad_norm": 39.41255187988281,
"learning_rate": 4.7127131766723744e-05,
"loss": 139.3454,
"step": 320
},
{
"epoch": 2.319259426507112,
"grad_norm": 43.438262939453125,
"learning_rate": 4.6863153155824545e-05,
"loss": 143.1548,
"step": 321
},
{
"epoch": 2.326484533754798,
"grad_norm": 45.747432708740234,
"learning_rate": 4.659946370169903e-05,
"loss": 145.9223,
"step": 322
},
{
"epoch": 2.333709641002484,
"grad_norm": 42.40370559692383,
"learning_rate": 4.633607277500312e-05,
"loss": 139.4424,
"step": 323
},
{
"epoch": 2.3409347482501692,
"grad_norm": 42.54023361206055,
"learning_rate": 4.6072989735783986e-05,
"loss": 145.1206,
"step": 324
},
{
"epoch": 2.348159855497855,
"grad_norm": 45.216739654541016,
"learning_rate": 4.581022393314757e-05,
"loss": 149.1618,
"step": 325
},
{
"epoch": 2.348159855497855,
"eval_loss": 2.2698802947998047,
"eval_runtime": 0.65,
"eval_samples_per_second": 76.919,
"eval_steps_per_second": 76.919,
"step": 325
},
{
"epoch": 2.355384962745541,
"grad_norm": 49.58754348754883,
"learning_rate": 4.554778470492619e-05,
"loss": 151.4744,
"step": 326
},
{
"epoch": 2.3626100699932264,
"grad_norm": 48.64653015136719,
"learning_rate": 4.5285681377346836e-05,
"loss": 153.4593,
"step": 327
},
{
"epoch": 2.3698351772409123,
"grad_norm": 54.19842529296875,
"learning_rate": 4.5023923264699663e-05,
"loss": 158.7397,
"step": 328
},
{
"epoch": 2.3770602844885977,
"grad_norm": 53.489540100097656,
"learning_rate": 4.4762519669007075e-05,
"loss": 156.4357,
"step": 329
},
{
"epoch": 2.3842853917362836,
"grad_norm": 57.42572021484375,
"learning_rate": 4.450147987969302e-05,
"loss": 156.7323,
"step": 330
},
{
"epoch": 2.391510498983969,
"grad_norm": 55.5129280090332,
"learning_rate": 4.424081317325306e-05,
"loss": 162.6986,
"step": 331
},
{
"epoch": 2.398735606231655,
"grad_norm": 62.38901901245117,
"learning_rate": 4.398052881292457e-05,
"loss": 168.2654,
"step": 332
},
{
"epoch": 2.405960713479341,
"grad_norm": 57.75197982788086,
"learning_rate": 4.372063604835758e-05,
"loss": 154.5503,
"step": 333
},
{
"epoch": 2.4131858207270263,
"grad_norm": 64.6832504272461,
"learning_rate": 4.3461144115286155e-05,
"loss": 162.428,
"step": 334
},
{
"epoch": 2.420410927974712,
"grad_norm": 63.21479797363281,
"learning_rate": 4.320206223520006e-05,
"loss": 159.031,
"step": 335
},
{
"epoch": 2.427636035222398,
"grad_norm": 66.63702392578125,
"learning_rate": 4.2943399615017196e-05,
"loss": 166.0823,
"step": 336
},
{
"epoch": 2.4348611424700835,
"grad_norm": 67.73152160644531,
"learning_rate": 4.268516544675628e-05,
"loss": 166.5575,
"step": 337
},
{
"epoch": 2.4420862497177693,
"grad_norm": 74.20126342773438,
"learning_rate": 4.2427368907210293e-05,
"loss": 168.2213,
"step": 338
},
{
"epoch": 2.4493113569654548,
"grad_norm": 84.52593231201172,
"learning_rate": 4.217001915762033e-05,
"loss": 172.4427,
"step": 339
},
{
"epoch": 2.4565364642131406,
"grad_norm": 83.18832397460938,
"learning_rate": 4.191312534335005e-05,
"loss": 172.7155,
"step": 340
},
{
"epoch": 2.4637615714608265,
"grad_norm": 83.47673797607422,
"learning_rate": 4.165669659356062e-05,
"loss": 172.4003,
"step": 341
},
{
"epoch": 2.470986678708512,
"grad_norm": 86.96723937988281,
"learning_rate": 4.1400742020886396e-05,
"loss": 176.3645,
"step": 342
},
{
"epoch": 2.478211785956198,
"grad_norm": 100.84264373779297,
"learning_rate": 4.114527072111103e-05,
"loss": 173.7337,
"step": 343
},
{
"epoch": 2.4854368932038833,
"grad_norm": 106.68719482421875,
"learning_rate": 4.0890291772844224e-05,
"loss": 167.7532,
"step": 344
},
{
"epoch": 2.492662000451569,
"grad_norm": 118.92356872558594,
"learning_rate": 4.063581423719916e-05,
"loss": 164.0733,
"step": 345
},
{
"epoch": 2.499887107699255,
"grad_norm": 59.804168701171875,
"learning_rate": 4.038184715747044e-05,
"loss": 108.4363,
"step": 346
},
{
"epoch": 2.5071122149469405,
"grad_norm": 50.58955764770508,
"learning_rate": 4.012839955881273e-05,
"loss": 110.6487,
"step": 347
},
{
"epoch": 2.5143373221946264,
"grad_norm": 39.889163970947266,
"learning_rate": 3.9875480447920076e-05,
"loss": 116.4527,
"step": 348
},
{
"epoch": 2.5215624294423122,
"grad_norm": 40.6741828918457,
"learning_rate": 3.9623098812705803e-05,
"loss": 122.8531,
"step": 349
},
{
"epoch": 2.5287875366899977,
"grad_norm": 35.53072738647461,
"learning_rate": 3.93712636219831e-05,
"loss": 124.1339,
"step": 350
},
{
"epoch": 2.5287875366899977,
"eval_loss": 2.266690254211426,
"eval_runtime": 0.6451,
"eval_samples_per_second": 77.513,
"eval_steps_per_second": 77.513,
"step": 350
},
{
"epoch": 2.5360126439376836,
"grad_norm": 40.804901123046875,
"learning_rate": 3.9119983825146326e-05,
"loss": 133.2125,
"step": 351
},
{
"epoch": 2.5432377511853694,
"grad_norm": 40.934730529785156,
"learning_rate": 3.886926835185297e-05,
"loss": 129.789,
"step": 352
},
{
"epoch": 2.550462858433055,
"grad_norm": 42.09627151489258,
"learning_rate": 3.861912611170631e-05,
"loss": 140.0207,
"step": 353
},
{
"epoch": 2.5576879656807403,
"grad_norm": 41.50466537475586,
"learning_rate": 3.8369565993938835e-05,
"loss": 136.9768,
"step": 354
},
{
"epoch": 2.564913072928426,
"grad_norm": 41.60817337036133,
"learning_rate": 3.8120596867096255e-05,
"loss": 143.2024,
"step": 355
},
{
"epoch": 2.572138180176112,
"grad_norm": 42.11903762817383,
"learning_rate": 3.7872227578722495e-05,
"loss": 141.182,
"step": 356
},
{
"epoch": 2.5793632874237975,
"grad_norm": 45.26333236694336,
"learning_rate": 3.762446695504511e-05,
"loss": 145.6521,
"step": 357
},
{
"epoch": 2.5865883946714834,
"grad_norm": 43.8610725402832,
"learning_rate": 3.7377323800661764e-05,
"loss": 145.043,
"step": 358
},
{
"epoch": 2.5938135019191693,
"grad_norm": 48.53872299194336,
"learning_rate": 3.7130806898227276e-05,
"loss": 154.6329,
"step": 359
},
{
"epoch": 2.6010386091668547,
"grad_norm": 46.94786834716797,
"learning_rate": 3.688492500814152e-05,
"loss": 156.9177,
"step": 360
},
{
"epoch": 2.6082637164145406,
"grad_norm": 52.20769500732422,
"learning_rate": 3.663968686823814e-05,
"loss": 156.2657,
"step": 361
},
{
"epoch": 2.6154888236622265,
"grad_norm": 53.79183578491211,
"learning_rate": 3.6395101193474024e-05,
"loss": 154.101,
"step": 362
},
{
"epoch": 2.622713930909912,
"grad_norm": 54.7728157043457,
"learning_rate": 3.6151176675619555e-05,
"loss": 159.0833,
"step": 363
},
{
"epoch": 2.6299390381575978,
"grad_norm": 56.10276412963867,
"learning_rate": 3.59079219829498e-05,
"loss": 159.4493,
"step": 364
},
{
"epoch": 2.637164145405283,
"grad_norm": 61.8068733215332,
"learning_rate": 3.5665345759936454e-05,
"loss": 158.6887,
"step": 365
},
{
"epoch": 2.644389252652969,
"grad_norm": 64.09469604492188,
"learning_rate": 3.542345662694061e-05,
"loss": 163.7667,
"step": 366
},
{
"epoch": 2.6516143599006545,
"grad_norm": 62.513427734375,
"learning_rate": 3.518226317990646e-05,
"loss": 165.2545,
"step": 367
},
{
"epoch": 2.6588394671483404,
"grad_norm": 66.19265747070312,
"learning_rate": 3.494177399005578e-05,
"loss": 167.7175,
"step": 368
},
{
"epoch": 2.6660645743960263,
"grad_norm": 61.81763458251953,
"learning_rate": 3.470199760358339e-05,
"loss": 172.3992,
"step": 369
},
{
"epoch": 2.6732896816437117,
"grad_norm": 72.12129974365234,
"learning_rate": 3.446294254135339e-05,
"loss": 171.6202,
"step": 370
},
{
"epoch": 2.6805147888913976,
"grad_norm": 73.13341522216797,
"learning_rate": 3.422461729859643e-05,
"loss": 172.6192,
"step": 371
},
{
"epoch": 2.6877398961390835,
"grad_norm": 76.26922607421875,
"learning_rate": 3.398703034460776e-05,
"loss": 168.6027,
"step": 372
},
{
"epoch": 2.694965003386769,
"grad_norm": 72.966064453125,
"learning_rate": 3.3750190122446256e-05,
"loss": 164.1373,
"step": 373
},
{
"epoch": 2.702190110634455,
"grad_norm": 78.573974609375,
"learning_rate": 3.3514105048634394e-05,
"loss": 168.2193,
"step": 374
},
{
"epoch": 2.7094152178821407,
"grad_norm": 83.50703430175781,
"learning_rate": 3.327878351285922e-05,
"loss": 172.5475,
"step": 375
},
{
"epoch": 2.7094152178821407,
"eval_loss": 2.25943660736084,
"eval_runtime": 0.6467,
"eval_samples_per_second": 77.315,
"eval_steps_per_second": 77.315,
"step": 375
},
{
"epoch": 2.716640325129826,
"grad_norm": 84.6645736694336,
"learning_rate": 3.304423387767411e-05,
"loss": 173.0074,
"step": 376
},
{
"epoch": 2.723865432377512,
"grad_norm": 87.42393493652344,
"learning_rate": 3.28104644782016e-05,
"loss": 163.5128,
"step": 377
},
{
"epoch": 2.7310905396251974,
"grad_norm": 105.86749267578125,
"learning_rate": 3.2577483621837276e-05,
"loss": 164.7872,
"step": 378
},
{
"epoch": 2.7383156468728833,
"grad_norm": 119.39720916748047,
"learning_rate": 3.2345299587954484e-05,
"loss": 161.0168,
"step": 379
},
{
"epoch": 2.7455407541205687,
"grad_norm": 44.81121826171875,
"learning_rate": 3.211392062761007e-05,
"loss": 103.1053,
"step": 380
},
{
"epoch": 2.7527658613682546,
"grad_norm": 34.9925651550293,
"learning_rate": 3.1883354963251256e-05,
"loss": 110.0836,
"step": 381
},
{
"epoch": 2.7599909686159405,
"grad_norm": 33.172786712646484,
"learning_rate": 3.1653610788423416e-05,
"loss": 117.6201,
"step": 382
},
{
"epoch": 2.767216075863626,
"grad_norm": 35.12316131591797,
"learning_rate": 3.142469626747885e-05,
"loss": 120.7938,
"step": 383
},
{
"epoch": 2.774441183111312,
"grad_norm": 33.81804275512695,
"learning_rate": 3.119661953528671e-05,
"loss": 119.9144,
"step": 384
},
{
"epoch": 2.7816662903589977,
"grad_norm": 38.326560974121094,
"learning_rate": 3.0969388696943855e-05,
"loss": 127.9329,
"step": 385
},
{
"epoch": 2.788891397606683,
"grad_norm": 39.44643020629883,
"learning_rate": 3.0743011827486914e-05,
"loss": 131.7632,
"step": 386
},
{
"epoch": 2.796116504854369,
"grad_norm": 39.9660530090332,
"learning_rate": 3.0517496971605252e-05,
"loss": 131.6603,
"step": 387
},
{
"epoch": 2.803341612102055,
"grad_norm": 43.44757080078125,
"learning_rate": 3.029285214335509e-05,
"loss": 135.7212,
"step": 388
},
{
"epoch": 2.8105667193497403,
"grad_norm": 39.718448638916016,
"learning_rate": 3.0069085325874736e-05,
"loss": 129.8581,
"step": 389
},
{
"epoch": 2.817791826597426,
"grad_norm": 40.95348358154297,
"learning_rate": 2.984620447110087e-05,
"loss": 139.6507,
"step": 390
},
{
"epoch": 2.8250169338451117,
"grad_norm": 45.0244140625,
"learning_rate": 2.962421749948601e-05,
"loss": 141.6569,
"step": 391
},
{
"epoch": 2.8322420410927975,
"grad_norm": 44.68109130859375,
"learning_rate": 2.940313229971699e-05,
"loss": 141.7101,
"step": 392
},
{
"epoch": 2.839467148340483,
"grad_norm": 46.43368148803711,
"learning_rate": 2.9182956728434607e-05,
"loss": 151.777,
"step": 393
},
{
"epoch": 2.846692255588169,
"grad_norm": 48.510772705078125,
"learning_rate": 2.8963698609954483e-05,
"loss": 153.4996,
"step": 394
},
{
"epoch": 2.8539173628358547,
"grad_norm": 51.50070571899414,
"learning_rate": 2.8745365735988993e-05,
"loss": 153.3897,
"step": 395
},
{
"epoch": 2.86114247008354,
"grad_norm": 57.43215560913086,
"learning_rate": 2.852796586537035e-05,
"loss": 154.0306,
"step": 396
},
{
"epoch": 2.868367577331226,
"grad_norm": 54.87392044067383,
"learning_rate": 2.831150672377489e-05,
"loss": 156.4159,
"step": 397
},
{
"epoch": 2.875592684578912,
"grad_norm": 57.76106643676758,
"learning_rate": 2.809599600344853e-05,
"loss": 165.0454,
"step": 398
},
{
"epoch": 2.8828177918265974,
"grad_norm": 57.79510498046875,
"learning_rate": 2.7881441362933468e-05,
"loss": 164.8601,
"step": 399
},
{
"epoch": 2.8900428990742832,
"grad_norm": 57.36224365234375,
"learning_rate": 2.766785042679591e-05,
"loss": 162.4617,
"step": 400
},
{
"epoch": 2.8900428990742832,
"eval_loss": 2.2592520713806152,
"eval_runtime": 0.6479,
"eval_samples_per_second": 77.176,
"eval_steps_per_second": 77.176,
"step": 400
},
{
"epoch": 2.8972680063219687,
"grad_norm": 59.14641189575195,
"learning_rate": 2.745523078535517e-05,
"loss": 158.6469,
"step": 401
},
{
"epoch": 2.9044931135696546,
"grad_norm": 63.88818359375,
"learning_rate": 2.724358999441402e-05,
"loss": 159.6259,
"step": 402
},
{
"epoch": 2.91171822081734,
"grad_norm": 68.26206970214844,
"learning_rate": 2.7032935574990033e-05,
"loss": 169.2703,
"step": 403
},
{
"epoch": 2.918943328065026,
"grad_norm": 67.78248596191406,
"learning_rate": 2.68232750130484e-05,
"loss": 159.6917,
"step": 404
},
{
"epoch": 2.9261684353127118,
"grad_norm": 67.05574798583984,
"learning_rate": 2.6614615759235884e-05,
"loss": 166.1517,
"step": 405
},
{
"epoch": 2.933393542560397,
"grad_norm": 75.2081069946289,
"learning_rate": 2.6406965228616087e-05,
"loss": 168.4991,
"step": 406
},
{
"epoch": 2.940618649808083,
"grad_norm": 76.87635803222656,
"learning_rate": 2.620033080040585e-05,
"loss": 172.8878,
"step": 407
},
{
"epoch": 2.947843757055769,
"grad_norm": 74.64488983154297,
"learning_rate": 2.599471981771314e-05,
"loss": 169.2124,
"step": 408
},
{
"epoch": 2.9550688643034544,
"grad_norm": 82.09602355957031,
"learning_rate": 2.5790139587275948e-05,
"loss": 166.7485,
"step": 409
},
{
"epoch": 2.9622939715511403,
"grad_norm": 86.70449829101562,
"learning_rate": 2.5586597379202805e-05,
"loss": 172.9556,
"step": 410
},
{
"epoch": 2.969519078798826,
"grad_norm": 93.79573822021484,
"learning_rate": 2.5384100426714307e-05,
"loss": 172.531,
"step": 411
},
{
"epoch": 2.9767441860465116,
"grad_norm": 97.84060668945312,
"learning_rate": 2.5182655925886123e-05,
"loss": 168.0535,
"step": 412
},
{
"epoch": 2.9839692932941975,
"grad_norm": 116.76630401611328,
"learning_rate": 2.4982271035393208e-05,
"loss": 156.5745,
"step": 413
},
{
"epoch": 2.991194400541883,
"grad_norm": 41.00756072998047,
"learning_rate": 2.4782952876255474e-05,
"loss": 140.6133,
"step": 414
},
{
"epoch": 2.998419507789569,
"grad_norm": 67.31522369384766,
"learning_rate": 2.4584708531584742e-05,
"loss": 160.9438,
"step": 415
},
{
"epoch": 3.0056446150372547,
"grad_norm": 57.157806396484375,
"learning_rate": 2.4387545046332956e-05,
"loss": 118.3847,
"step": 416
},
{
"epoch": 3.01286972228494,
"grad_norm": 43.13034439086914,
"learning_rate": 2.4191469427041888e-05,
"loss": 105.1958,
"step": 417
},
{
"epoch": 3.020094829532626,
"grad_norm": 34.08012390136719,
"learning_rate": 2.39964886415941e-05,
"loss": 112.8705,
"step": 418
},
{
"epoch": 3.0273199367803114,
"grad_norm": 34.88169860839844,
"learning_rate": 2.3802609618965446e-05,
"loss": 117.5818,
"step": 419
},
{
"epoch": 3.0345450440279973,
"grad_norm": 32.31670379638672,
"learning_rate": 2.360983924897866e-05,
"loss": 122.0667,
"step": 420
},
{
"epoch": 3.041770151275683,
"grad_norm": 34.80269241333008,
"learning_rate": 2.3418184382058638e-05,
"loss": 124.0351,
"step": 421
},
{
"epoch": 3.0489952585233686,
"grad_norm": 37.697933197021484,
"learning_rate": 2.3227651828989e-05,
"loss": 137.7882,
"step": 422
},
{
"epoch": 3.0562203657710545,
"grad_norm": 36.73977279663086,
"learning_rate": 2.303824836066998e-05,
"loss": 127.7736,
"step": 423
},
{
"epoch": 3.06344547301874,
"grad_norm": 42.219444274902344,
"learning_rate": 2.284998070787787e-05,
"loss": 143.5189,
"step": 424
},
{
"epoch": 3.070670580266426,
"grad_norm": 39.332454681396484,
"learning_rate": 2.2662855561025804e-05,
"loss": 130.0694,
"step": 425
},
{
"epoch": 3.070670580266426,
"eval_loss": 2.2527289390563965,
"eval_runtime": 0.6467,
"eval_samples_per_second": 77.314,
"eval_steps_per_second": 77.314,
"step": 425
},
{
"epoch": 3.0778956875141117,
"grad_norm": 43.08782196044922,
"learning_rate": 2.2476879569926048e-05,
"loss": 143.561,
"step": 426
},
{
"epoch": 3.085120794761797,
"grad_norm": 41.036651611328125,
"learning_rate": 2.2292059343553596e-05,
"loss": 139.8347,
"step": 427
},
{
"epoch": 3.092345902009483,
"grad_norm": 44.45792770385742,
"learning_rate": 2.210840144981144e-05,
"loss": 143.4371,
"step": 428
},
{
"epoch": 3.0995710092571684,
"grad_norm": 42.94414138793945,
"learning_rate": 2.192591241529699e-05,
"loss": 141.0129,
"step": 429
},
{
"epoch": 3.1067961165048543,
"grad_norm": 49.134517669677734,
"learning_rate": 2.1744598725070347e-05,
"loss": 153.8074,
"step": 430
},
{
"epoch": 3.11402122375254,
"grad_norm": 49.15544891357422,
"learning_rate": 2.1564466822423672e-05,
"loss": 153.712,
"step": 431
},
{
"epoch": 3.1212463310002256,
"grad_norm": 51.34669876098633,
"learning_rate": 2.1385523108652335e-05,
"loss": 150.7699,
"step": 432
},
{
"epoch": 3.1284714382479115,
"grad_norm": 50.598899841308594,
"learning_rate": 2.1207773942827332e-05,
"loss": 155.5012,
"step": 433
},
{
"epoch": 3.1356965454955974,
"grad_norm": 58.15761184692383,
"learning_rate": 2.103122564156937e-05,
"loss": 158.1927,
"step": 434
},
{
"epoch": 3.142921652743283,
"grad_norm": 56.7564811706543,
"learning_rate": 2.0855884478824412e-05,
"loss": 164.2387,
"step": 435
},
{
"epoch": 3.1501467599909687,
"grad_norm": 58.64568328857422,
"learning_rate": 2.0681756685640647e-05,
"loss": 161.4923,
"step": 436
},
{
"epoch": 3.157371867238654,
"grad_norm": 58.64356231689453,
"learning_rate": 2.0508848449947114e-05,
"loss": 155.2221,
"step": 437
},
{
"epoch": 3.16459697448634,
"grad_norm": 65.33573150634766,
"learning_rate": 2.0337165916333795e-05,
"loss": 166.1156,
"step": 438
},
{
"epoch": 3.171822081734026,
"grad_norm": 64.05957794189453,
"learning_rate": 2.016671518583325e-05,
"loss": 164.2138,
"step": 439
},
{
"epoch": 3.1790471889817113,
"grad_norm": 68.86741638183594,
"learning_rate": 1.9997502315703804e-05,
"loss": 160.5822,
"step": 440
},
{
"epoch": 3.1862722962293972,
"grad_norm": 71.23939514160156,
"learning_rate": 1.98295333192143e-05,
"loss": 167.138,
"step": 441
},
{
"epoch": 3.1934974034770827,
"grad_norm": 78.74092102050781,
"learning_rate": 1.9662814165430392e-05,
"loss": 171.5105,
"step": 442
},
{
"epoch": 3.2007225107247685,
"grad_norm": 74.84487915039062,
"learning_rate": 1.9497350779002463e-05,
"loss": 164.3165,
"step": 443
},
{
"epoch": 3.2079476179724544,
"grad_norm": 75.05098724365234,
"learning_rate": 1.9333149039955026e-05,
"loss": 167.7948,
"step": 444
},
{
"epoch": 3.21517272522014,
"grad_norm": 81.60777282714844,
"learning_rate": 1.917021478347779e-05,
"loss": 170.561,
"step": 445
},
{
"epoch": 3.2223978324678257,
"grad_norm": 82.3056869506836,
"learning_rate": 1.9008553799718355e-05,
"loss": 164.4864,
"step": 446
},
{
"epoch": 3.2296229397155116,
"grad_norm": 89.91145324707031,
"learning_rate": 1.8848171833576322e-05,
"loss": 163.1181,
"step": 447
},
{
"epoch": 3.236848046963197,
"grad_norm": 98.04798126220703,
"learning_rate": 1.8689074584499296e-05,
"loss": 166.0624,
"step": 448
},
{
"epoch": 3.244073154210883,
"grad_norm": 117.23175048828125,
"learning_rate": 1.8531267706280154e-05,
"loss": 173.7292,
"step": 449
},
{
"epoch": 3.2512982614585684,
"grad_norm": 83.46915435791016,
"learning_rate": 1.8374756806856357e-05,
"loss": 121.3651,
"step": 450
},
{
"epoch": 3.2512982614585684,
"eval_loss": 2.250525951385498,
"eval_runtime": 0.6458,
"eval_samples_per_second": 77.422,
"eval_steps_per_second": 77.422,
"step": 450
},
{
"epoch": 3.2585233687062543,
"grad_norm": 47.13894271850586,
"learning_rate": 1.8219547448110454e-05,
"loss": 111.1761,
"step": 451
},
{
"epoch": 3.26574847595394,
"grad_norm": 39.46201705932617,
"learning_rate": 1.806564514567258e-05,
"loss": 114.0671,
"step": 452
},
{
"epoch": 3.2729735832016256,
"grad_norm": 34.3983154296875,
"learning_rate": 1.7913055368724318e-05,
"loss": 119.771,
"step": 453
},
{
"epoch": 3.2801986904493114,
"grad_norm": 34.957542419433594,
"learning_rate": 1.7761783539804482e-05,
"loss": 122.0474,
"step": 454
},
{
"epoch": 3.287423797696997,
"grad_norm": 35.007755279541016,
"learning_rate": 1.7611835034616314e-05,
"loss": 123.7122,
"step": 455
},
{
"epoch": 3.2946489049446828,
"grad_norm": 38.12125015258789,
"learning_rate": 1.7463215181836497e-05,
"loss": 132.5189,
"step": 456
},
{
"epoch": 3.3018740121923686,
"grad_norm": 36.32997131347656,
"learning_rate": 1.7315929262925756e-05,
"loss": 128.9809,
"step": 457
},
{
"epoch": 3.309099119440054,
"grad_norm": 39.102500915527344,
"learning_rate": 1.71699825119412e-05,
"loss": 132.5178,
"step": 458
},
{
"epoch": 3.31632422668774,
"grad_norm": 39.0343132019043,
"learning_rate": 1.7025380115350343e-05,
"loss": 136.3037,
"step": 459
},
{
"epoch": 3.323549333935426,
"grad_norm": 41.78243637084961,
"learning_rate": 1.6882127211846727e-05,
"loss": 142.4308,
"step": 460
},
{
"epoch": 3.3307744411831113,
"grad_norm": 45.74725341796875,
"learning_rate": 1.674022889216737e-05,
"loss": 143.102,
"step": 461
},
{
"epoch": 3.337999548430797,
"grad_norm": 42.37492370605469,
"learning_rate": 1.6599690198911826e-05,
"loss": 135.5431,
"step": 462
},
{
"epoch": 3.3452246556784826,
"grad_norm": 44.97285461425781,
"learning_rate": 1.6460516126363014e-05,
"loss": 149.0417,
"step": 463
},
{
"epoch": 3.3524497629261685,
"grad_norm": 48.054073333740234,
"learning_rate": 1.632271162030971e-05,
"loss": 153.1987,
"step": 464
},
{
"epoch": 3.359674870173854,
"grad_norm": 49.26851272583008,
"learning_rate": 1.6186281577870785e-05,
"loss": 148.3528,
"step": 465
},
{
"epoch": 3.36689997742154,
"grad_norm": 48.746395111083984,
"learning_rate": 1.605123084732123e-05,
"loss": 153.4831,
"step": 466
},
{
"epoch": 3.3741250846692257,
"grad_norm": 54.20684051513672,
"learning_rate": 1.59175642279198e-05,
"loss": 150.8487,
"step": 467
},
{
"epoch": 3.381350191916911,
"grad_norm": 52.09891128540039,
"learning_rate": 1.578528646973852e-05,
"loss": 156.4221,
"step": 468
},
{
"epoch": 3.388575299164597,
"grad_norm": 55.3662109375,
"learning_rate": 1.5654402273493805e-05,
"loss": 158.3705,
"step": 469
},
{
"epoch": 3.395800406412283,
"grad_norm": 56.87941360473633,
"learning_rate": 1.552491629037952e-05,
"loss": 156.8328,
"step": 470
},
{
"epoch": 3.4030255136599683,
"grad_norm": 58.18745040893555,
"learning_rate": 1.5396833121901592e-05,
"loss": 158.3661,
"step": 471
},
{
"epoch": 3.410250620907654,
"grad_norm": 63.58408737182617,
"learning_rate": 1.5270157319714572e-05,
"loss": 165.1025,
"step": 472
},
{
"epoch": 3.4174757281553396,
"grad_norm": 64.96553039550781,
"learning_rate": 1.514489338545978e-05,
"loss": 156.8923,
"step": 473
},
{
"epoch": 3.4247008354030255,
"grad_norm": 64.00413513183594,
"learning_rate": 1.5021045770605458e-05,
"loss": 164.6695,
"step": 474
},
{
"epoch": 3.4319259426507114,
"grad_norm": 75.28882598876953,
"learning_rate": 1.4898618876288473e-05,
"loss": 168.7521,
"step": 475
},
{
"epoch": 3.4319259426507114,
"eval_loss": 2.2525815963745117,
"eval_runtime": 0.6432,
"eval_samples_per_second": 77.738,
"eval_steps_per_second": 77.738,
"step": 475
},
{
"epoch": 3.439151049898397,
"grad_norm": 73.818359375,
"learning_rate": 1.4777617053157982e-05,
"loss": 164.7033,
"step": 476
},
{
"epoch": 3.4463761571460827,
"grad_norm": 73.67526245117188,
"learning_rate": 1.4658044601220777e-05,
"loss": 169.974,
"step": 477
},
{
"epoch": 3.453601264393768,
"grad_norm": 75.99698638916016,
"learning_rate": 1.4539905769688514e-05,
"loss": 165.4877,
"step": 478
},
{
"epoch": 3.460826371641454,
"grad_norm": 80.24381256103516,
"learning_rate": 1.4423204756826705e-05,
"loss": 169.552,
"step": 479
},
{
"epoch": 3.46805147888914,
"grad_norm": 86.68580627441406,
"learning_rate": 1.4307945709805487e-05,
"loss": 170.0956,
"step": 480
},
{
"epoch": 3.4752765861368253,
"grad_norm": 89.91241455078125,
"learning_rate": 1.4194132724552292e-05,
"loss": 174.2864,
"step": 481
},
{
"epoch": 3.482501693384511,
"grad_norm": 105.35623168945312,
"learning_rate": 1.4081769845606262e-05,
"loss": 170.3175,
"step": 482
},
{
"epoch": 3.489726800632197,
"grad_norm": 115.0772705078125,
"learning_rate": 1.3970861065974563e-05,
"loss": 165.5754,
"step": 483
},
{
"epoch": 3.4969519078798825,
"grad_norm": 76.55886840820312,
"learning_rate": 1.3861410326990411e-05,
"loss": 115.5443,
"step": 484
},
{
"epoch": 3.5041770151275684,
"grad_norm": 39.43132400512695,
"learning_rate": 1.3753421518173073e-05,
"loss": 107.1197,
"step": 485
},
{
"epoch": 3.511402122375254,
"grad_norm": 32.5174674987793,
"learning_rate": 1.3646898477089626e-05,
"loss": 113.2218,
"step": 486
},
{
"epoch": 3.5186272296229397,
"grad_norm": 35.985008239746094,
"learning_rate": 1.3541844989218578e-05,
"loss": 119.1645,
"step": 487
},
{
"epoch": 3.525852336870625,
"grad_norm": 32.62986755371094,
"learning_rate": 1.3438264787815378e-05,
"loss": 119.9635,
"step": 488
},
{
"epoch": 3.533077444118311,
"grad_norm": 34.348697662353516,
"learning_rate": 1.3336161553779664e-05,
"loss": 123.3444,
"step": 489
},
{
"epoch": 3.540302551365997,
"grad_norm": 35.503746032714844,
"learning_rate": 1.323553891552456e-05,
"loss": 130.0223,
"step": 490
},
{
"epoch": 3.5475276586136824,
"grad_norm": 38.014793395996094,
"learning_rate": 1.3136400448847655e-05,
"loss": 132.8261,
"step": 491
},
{
"epoch": 3.5547527658613682,
"grad_norm": 41.30305480957031,
"learning_rate": 1.3038749676803994e-05,
"loss": 137.0195,
"step": 492
},
{
"epoch": 3.561977873109054,
"grad_norm": 38.00440979003906,
"learning_rate": 1.2942590069580812e-05,
"loss": 135.3861,
"step": 493
},
{
"epoch": 3.5692029803567396,
"grad_norm": 41.92805099487305,
"learning_rate": 1.2847925044374282e-05,
"loss": 144.1726,
"step": 494
},
{
"epoch": 3.5764280876044254,
"grad_norm": 42.240840911865234,
"learning_rate": 1.275475796526802e-05,
"loss": 146.6922,
"step": 495
},
{
"epoch": 3.5836531948521113,
"grad_norm": 44.38275909423828,
"learning_rate": 1.26630921431136e-05,
"loss": 145.4355,
"step": 496
},
{
"epoch": 3.5908783020997967,
"grad_norm": 44.79975891113281,
"learning_rate": 1.2572930835412819e-05,
"loss": 144.7125,
"step": 497
},
{
"epoch": 3.5981034093474826,
"grad_norm": 47.101890563964844,
"learning_rate": 1.2484277246202009e-05,
"loss": 150.0654,
"step": 498
},
{
"epoch": 3.605328516595168,
"grad_norm": 48.01686477661133,
"learning_rate": 1.239713452593814e-05,
"loss": 154.898,
"step": 499
},
{
"epoch": 3.612553623842854,
"grad_norm": 54.491729736328125,
"learning_rate": 1.2311505771386865e-05,
"loss": 154.9589,
"step": 500
},
{
"epoch": 3.612553623842854,
"eval_loss": 2.248832941055298,
"eval_runtime": 0.6421,
"eval_samples_per_second": 77.868,
"eval_steps_per_second": 77.868,
"step": 500
},
{
"epoch": 3.6197787310905394,
"grad_norm": 55.82506561279297,
"learning_rate": 1.2227394025512476e-05,
"loss": 157.7209,
"step": 501
},
{
"epoch": 3.6270038383382253,
"grad_norm": 58.75271987915039,
"learning_rate": 1.2144802277369761e-05,
"loss": 160.4193,
"step": 502
},
{
"epoch": 3.634228945585911,
"grad_norm": 56.1617546081543,
"learning_rate": 1.2063733461997805e-05,
"loss": 162.6156,
"step": 503
},
{
"epoch": 3.6414540528335966,
"grad_norm": 60.91801071166992,
"learning_rate": 1.1984190460315653e-05,
"loss": 159.6019,
"step": 504
},
{
"epoch": 3.6486791600812825,
"grad_norm": 60.85551071166992,
"learning_rate": 1.1906176099019958e-05,
"loss": 160.3419,
"step": 505
},
{
"epoch": 3.6559042673289683,
"grad_norm": 61.02642059326172,
"learning_rate": 1.1829693150484523e-05,
"loss": 165.2598,
"step": 506
},
{
"epoch": 3.6631293745766538,
"grad_norm": 68.14204406738281,
"learning_rate": 1.1754744332661776e-05,
"loss": 162.0726,
"step": 507
},
{
"epoch": 3.6703544818243397,
"grad_norm": 67.21196746826172,
"learning_rate": 1.1681332308986191e-05,
"loss": 165.2303,
"step": 508
},
{
"epoch": 3.6775795890720255,
"grad_norm": 68.1639633178711,
"learning_rate": 1.1609459688279622e-05,
"loss": 165.0496,
"step": 509
},
{
"epoch": 3.684804696319711,
"grad_norm": 74.92154693603516,
"learning_rate": 1.1539129024658605e-05,
"loss": 165.1957,
"step": 510
},
{
"epoch": 3.692029803567397,
"grad_norm": 72.8001937866211,
"learning_rate": 1.1470342817443607e-05,
"loss": 159.5798,
"step": 511
},
{
"epoch": 3.6992549108150823,
"grad_norm": 81.59801483154297,
"learning_rate": 1.140310351107019e-05,
"loss": 171.2486,
"step": 512
},
{
"epoch": 3.706480018062768,
"grad_norm": 78.66382598876953,
"learning_rate": 1.133741349500213e-05,
"loss": 165.4824,
"step": 513
},
{
"epoch": 3.7137051253104536,
"grad_norm": 83.93260192871094,
"learning_rate": 1.1273275103646545e-05,
"loss": 172.9596,
"step": 514
},
{
"epoch": 3.7209302325581395,
"grad_norm": 93.98062896728516,
"learning_rate": 1.12106906162709e-05,
"loss": 165.9155,
"step": 515
},
{
"epoch": 3.7281553398058254,
"grad_norm": 96.30677795410156,
"learning_rate": 1.114966225692203e-05,
"loss": 166.0989,
"step": 516
},
{
"epoch": 3.735380447053511,
"grad_norm": 119.76006317138672,
"learning_rate": 1.1090192194347101e-05,
"loss": 166.277,
"step": 517
},
{
"epoch": 3.7426055543011967,
"grad_norm": 72.7205810546875,
"learning_rate": 1.1032282541916521e-05,
"loss": 117.604,
"step": 518
},
{
"epoch": 3.7498306615488826,
"grad_norm": 35.77104949951172,
"learning_rate": 1.0975935357548869e-05,
"loss": 105.8642,
"step": 519
},
{
"epoch": 3.757055768796568,
"grad_norm": 32.010738372802734,
"learning_rate": 1.092115264363775e-05,
"loss": 114.9323,
"step": 520
},
{
"epoch": 3.764280876044254,
"grad_norm": 34.5285758972168,
"learning_rate": 1.0867936346980626e-05,
"loss": 117.0938,
"step": 521
},
{
"epoch": 3.7715059832919398,
"grad_norm": 35.69192886352539,
"learning_rate": 1.0816288358709636e-05,
"loss": 124.612,
"step": 522
},
{
"epoch": 3.778731090539625,
"grad_norm": 34.59403991699219,
"learning_rate": 1.076621051422442e-05,
"loss": 126.6394,
"step": 523
},
{
"epoch": 3.7859561977873106,
"grad_norm": 36.92626190185547,
"learning_rate": 1.0717704593126856e-05,
"loss": 131.2098,
"step": 524
},
{
"epoch": 3.7931813050349965,
"grad_norm": 37.527610778808594,
"learning_rate": 1.067077231915783e-05,
"loss": 131.3304,
"step": 525
},
{
"epoch": 3.7931813050349965,
"eval_loss": 2.247295618057251,
"eval_runtime": 0.6443,
"eval_samples_per_second": 77.598,
"eval_steps_per_second": 77.598,
"step": 525
},
{
"epoch": 3.8004064122826824,
"grad_norm": 41.458560943603516,
"learning_rate": 1.0625415360135994e-05,
"loss": 136.7501,
"step": 526
},
{
"epoch": 3.807631519530368,
"grad_norm": 41.02821731567383,
"learning_rate": 1.0581635327898491e-05,
"loss": 140.3124,
"step": 527
},
{
"epoch": 3.8148566267780537,
"grad_norm": 41.52109146118164,
"learning_rate": 1.053943377824367e-05,
"loss": 138.0626,
"step": 528
},
{
"epoch": 3.8220817340257396,
"grad_norm": 45.5300178527832,
"learning_rate": 1.049881221087579e-05,
"loss": 143.3791,
"step": 529
},
{
"epoch": 3.829306841273425,
"grad_norm": 44.56905746459961,
"learning_rate": 1.0459772069351755e-05,
"loss": 140.6374,
"step": 530
},
{
"epoch": 3.836531948521111,
"grad_norm": 47.68376541137695,
"learning_rate": 1.0422314741029781e-05,
"loss": 149.6589,
"step": 531
},
{
"epoch": 3.843757055768797,
"grad_norm": 47.23084259033203,
"learning_rate": 1.038644155702012e-05,
"loss": 147.9767,
"step": 532
},
{
"epoch": 3.850982163016482,
"grad_norm": 50.1689567565918,
"learning_rate": 1.0352153792137733e-05,
"loss": 157.9461,
"step": 533
},
{
"epoch": 3.858207270264168,
"grad_norm": 54.78097915649414,
"learning_rate": 1.0319452664857016e-05,
"loss": 155.4814,
"step": 534
},
{
"epoch": 3.8654323775118535,
"grad_norm": 54.88005447387695,
"learning_rate": 1.0288339337268468e-05,
"loss": 156.4116,
"step": 535
},
{
"epoch": 3.8726574847595394,
"grad_norm": 54.529666900634766,
"learning_rate": 1.0258814915037418e-05,
"loss": 154.1808,
"step": 536
},
{
"epoch": 3.879882592007225,
"grad_norm": 57.33926773071289,
"learning_rate": 1.023088044736472e-05,
"loss": 161.1651,
"step": 537
},
{
"epoch": 3.8871076992549107,
"grad_norm": 60.16495895385742,
"learning_rate": 1.0204536926949475e-05,
"loss": 165.6093,
"step": 538
},
{
"epoch": 3.8943328065025966,
"grad_norm": 61.08560562133789,
"learning_rate": 1.0179785289953755e-05,
"loss": 162.3731,
"step": 539
},
{
"epoch": 3.901557913750282,
"grad_norm": 62.306461334228516,
"learning_rate": 1.0156626415969325e-05,
"loss": 160.4524,
"step": 540
},
{
"epoch": 3.908783020997968,
"grad_norm": 64.85774230957031,
"learning_rate": 1.0135061127986394e-05,
"loss": 161.3555,
"step": 541
},
{
"epoch": 3.916008128245654,
"grad_norm": 67.20696258544922,
"learning_rate": 1.0115090192364367e-05,
"loss": 164.4856,
"step": 542
},
{
"epoch": 3.9232332354933392,
"grad_norm": 67.95388793945312,
"learning_rate": 1.0096714318804607e-05,
"loss": 167.9778,
"step": 543
},
{
"epoch": 3.930458342741025,
"grad_norm": 71.8521499633789,
"learning_rate": 1.0079934160325223e-05,
"loss": 166.5036,
"step": 544
},
{
"epoch": 3.937683449988711,
"grad_norm": 80.16471099853516,
"learning_rate": 1.0064750313237851e-05,
"loss": 165.6457,
"step": 545
},
{
"epoch": 3.9449085572363964,
"grad_norm": 80.44886779785156,
"learning_rate": 1.0051163317126472e-05,
"loss": 167.0526,
"step": 546
},
{
"epoch": 3.9521336644840823,
"grad_norm": 85.00817108154297,
"learning_rate": 1.0039173654828249e-05,
"loss": 164.8536,
"step": 547
},
{
"epoch": 3.9593587717317678,
"grad_norm": 88.25836944580078,
"learning_rate": 1.002878175241634e-05,
"loss": 179.1164,
"step": 548
},
{
"epoch": 3.9665838789794536,
"grad_norm": 90.4699935913086,
"learning_rate": 1.0019987979184773e-05,
"loss": 171.6953,
"step": 549
},
{
"epoch": 3.973808986227139,
"grad_norm": 99.42991638183594,
"learning_rate": 1.0012792647635323e-05,
"loss": 167.9289,
"step": 550
},
{
"epoch": 3.973808986227139,
"eval_loss": 2.2475366592407227,
"eval_runtime": 0.6429,
"eval_samples_per_second": 77.776,
"eval_steps_per_second": 77.776,
"step": 550
},
{
"epoch": 3.981034093474825,
"grad_norm": 122.0754623413086,
"learning_rate": 1.0007196013466415e-05,
"loss": 168.6182,
"step": 551
},
{
"epoch": 3.988259200722511,
"grad_norm": 76.6998519897461,
"learning_rate": 1.0003198275564018e-05,
"loss": 141.2874,
"step": 552
},
{
"epoch": 3.9954843079701963,
"grad_norm": 52.6463623046875,
"learning_rate": 1.0000799575994581e-05,
"loss": 158.1349,
"step": 553
},
{
"epoch": 4.002709415217882,
"grad_norm": 66.10282135009766,
"learning_rate": 1e-05,
"loss": 140.7063,
"step": 554
}
],
"logging_steps": 1,
"max_steps": 554,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 20,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9573407816417280.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}