{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.9933642609665863,
"eval_steps": 500,
"global_step": 34000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.011744670855599272,
"grad_norm": 0.025151433423161507,
"learning_rate": 0.0001997648165569144,
"loss": 1.2331,
"step": 100
},
{
"epoch": 0.023489341711198545,
"grad_norm": 0.028844915330410004,
"learning_rate": 0.0001992944496707432,
"loss": 1.1461,
"step": 200
},
{
"epoch": 0.03523401256679781,
"grad_norm": 0.0280243381857872,
"learning_rate": 0.00019882408278457198,
"loss": 1.1523,
"step": 300
},
{
"epoch": 0.04697868342239709,
"grad_norm": 0.029297035187482834,
"learning_rate": 0.00019835371589840077,
"loss": 1.1273,
"step": 400
},
{
"epoch": 0.05872335427799636,
"grad_norm": 0.02906208299100399,
"learning_rate": 0.00019788334901222956,
"loss": 1.1149,
"step": 500
},
{
"epoch": 0.07046802513359562,
"grad_norm": 0.027980709448456764,
"learning_rate": 0.00019741298212605835,
"loss": 1.1108,
"step": 600
},
{
"epoch": 0.08221269598919491,
"grad_norm": 0.03009636141359806,
"learning_rate": 0.00019694261523988714,
"loss": 1.1041,
"step": 700
},
{
"epoch": 0.09395736684479418,
"grad_norm": 0.02974865213036537,
"learning_rate": 0.00019647224835371593,
"loss": 1.1088,
"step": 800
},
{
"epoch": 0.10570203770039345,
"grad_norm": 0.03112063929438591,
"learning_rate": 0.0001960018814675447,
"loss": 1.1052,
"step": 900
},
{
"epoch": 0.11744670855599272,
"grad_norm": 0.03213803470134735,
"learning_rate": 0.00019553151458137348,
"loss": 1.1037,
"step": 1000
},
{
"epoch": 0.12919137941159198,
"grad_norm": 0.029907317832112312,
"learning_rate": 0.00019506114769520227,
"loss": 1.0927,
"step": 1100
},
{
"epoch": 0.14093605026719125,
"grad_norm": 0.03029988706111908,
"learning_rate": 0.00019459078080903106,
"loss": 1.0926,
"step": 1200
},
{
"epoch": 0.15268072112279055,
"grad_norm": 0.03094552271068096,
"learning_rate": 0.00019412041392285985,
"loss": 1.0977,
"step": 1300
},
{
"epoch": 0.16442539197838982,
"grad_norm": 0.031248079612851143,
"learning_rate": 0.00019365004703668864,
"loss": 1.0914,
"step": 1400
},
{
"epoch": 0.1761700628339891,
"grad_norm": 0.03235971927642822,
"learning_rate": 0.00019317968015051742,
"loss": 1.1114,
"step": 1500
},
{
"epoch": 0.18791473368958836,
"grad_norm": 0.031699199229478836,
"learning_rate": 0.0001927093132643462,
"loss": 1.0883,
"step": 1600
},
{
"epoch": 0.19965940454518763,
"grad_norm": 0.032460007816553116,
"learning_rate": 0.00019223894637817498,
"loss": 1.0925,
"step": 1700
},
{
"epoch": 0.2114040754007869,
"grad_norm": 0.03195160627365112,
"learning_rate": 0.00019176857949200376,
"loss": 1.0849,
"step": 1800
},
{
"epoch": 0.22314874625638617,
"grad_norm": 0.031754713505506516,
"learning_rate": 0.00019129821260583255,
"loss": 1.0872,
"step": 1900
},
{
"epoch": 0.23489341711198544,
"grad_norm": 0.0315755270421505,
"learning_rate": 0.00019082784571966134,
"loss": 1.0899,
"step": 2000
},
{
"epoch": 0.2466380879675847,
"grad_norm": 0.03236432373523712,
"learning_rate": 0.00019035747883349013,
"loss": 1.0846,
"step": 2100
},
{
"epoch": 0.25838275882318396,
"grad_norm": 0.034620147198438644,
"learning_rate": 0.00018988711194731892,
"loss": 1.0813,
"step": 2200
},
{
"epoch": 0.2701274296787832,
"grad_norm": 0.03313825652003288,
"learning_rate": 0.0001894167450611477,
"loss": 1.0829,
"step": 2300
},
{
"epoch": 0.2818721005343825,
"grad_norm": 0.033103689551353455,
"learning_rate": 0.0001889463781749765,
"loss": 1.0889,
"step": 2400
},
{
"epoch": 0.2936167713899818,
"grad_norm": 0.0330510288476944,
"learning_rate": 0.0001884760112888053,
"loss": 1.0792,
"step": 2500
},
{
"epoch": 0.3053614422455811,
"grad_norm": 0.03478708490729332,
"learning_rate": 0.00018800564440263405,
"loss": 1.0819,
"step": 2600
},
{
"epoch": 0.31710611310118036,
"grad_norm": 0.033343035727739334,
"learning_rate": 0.00018753527751646284,
"loss": 1.0774,
"step": 2700
},
{
"epoch": 0.32885078395677964,
"grad_norm": 0.034466274082660675,
"learning_rate": 0.00018706491063029163,
"loss": 1.0814,
"step": 2800
},
{
"epoch": 0.3405954548123789,
"grad_norm": 0.034216560423374176,
"learning_rate": 0.00018659454374412042,
"loss": 1.0823,
"step": 2900
},
{
"epoch": 0.3523401256679782,
"grad_norm": 0.033088088035583496,
"learning_rate": 0.0001861241768579492,
"loss": 1.0745,
"step": 3000
},
{
"epoch": 0.36408479652357745,
"grad_norm": 0.03337638080120087,
"learning_rate": 0.000185653809971778,
"loss": 1.0876,
"step": 3100
},
{
"epoch": 0.3758294673791767,
"grad_norm": 0.03440937399864197,
"learning_rate": 0.00018518344308560678,
"loss": 1.0708,
"step": 3200
},
{
"epoch": 0.387574138234776,
"grad_norm": 0.03419345244765282,
"learning_rate": 0.00018471307619943557,
"loss": 1.0849,
"step": 3300
},
{
"epoch": 0.39931880909037526,
"grad_norm": 0.03453630208969116,
"learning_rate": 0.00018424270931326436,
"loss": 1.0716,
"step": 3400
},
{
"epoch": 0.41106347994597453,
"grad_norm": 0.033510930836200714,
"learning_rate": 0.00018377234242709312,
"loss": 1.0708,
"step": 3500
},
{
"epoch": 0.4228081508015738,
"grad_norm": 0.03349142521619797,
"learning_rate": 0.00018330197554092191,
"loss": 1.0813,
"step": 3600
},
{
"epoch": 0.43455282165717307,
"grad_norm": 0.0351400151848793,
"learning_rate": 0.0001828316086547507,
"loss": 1.0835,
"step": 3700
},
{
"epoch": 0.44629749251277234,
"grad_norm": 0.034197255969047546,
"learning_rate": 0.0001823612417685795,
"loss": 1.0762,
"step": 3800
},
{
"epoch": 0.4580421633683716,
"grad_norm": 0.03609395772218704,
"learning_rate": 0.00018189087488240828,
"loss": 1.0665,
"step": 3900
},
{
"epoch": 0.4697868342239709,
"grad_norm": 0.034626953303813934,
"learning_rate": 0.00018142050799623707,
"loss": 1.073,
"step": 4000
},
{
"epoch": 0.48153150507957015,
"grad_norm": 0.03524584695696831,
"learning_rate": 0.00018095014111006586,
"loss": 1.0655,
"step": 4100
},
{
"epoch": 0.4932761759351694,
"grad_norm": 0.0350763201713562,
"learning_rate": 0.00018047977422389465,
"loss": 1.0733,
"step": 4200
},
{
"epoch": 0.5050208467907686,
"grad_norm": 0.0361902192234993,
"learning_rate": 0.0001800094073377234,
"loss": 1.0837,
"step": 4300
},
{
"epoch": 0.5167655176463679,
"grad_norm": 0.03573866933584213,
"learning_rate": 0.0001795390404515522,
"loss": 1.0807,
"step": 4400
},
{
"epoch": 0.5285101885019672,
"grad_norm": 0.03497539460659027,
"learning_rate": 0.000179068673565381,
"loss": 1.0599,
"step": 4500
},
{
"epoch": 0.5402548593575665,
"grad_norm": 0.034839775413274765,
"learning_rate": 0.00017859830667920978,
"loss": 1.0732,
"step": 4600
},
{
"epoch": 0.5519995302131657,
"grad_norm": 0.035876356065273285,
"learning_rate": 0.0001781279397930386,
"loss": 1.0674,
"step": 4700
},
{
"epoch": 0.563744201068765,
"grad_norm": 0.034889355301856995,
"learning_rate": 0.00017765757290686738,
"loss": 1.0701,
"step": 4800
},
{
"epoch": 0.5754888719243643,
"grad_norm": 0.03712477535009384,
"learning_rate": 0.00017718720602069617,
"loss": 1.0532,
"step": 4900
},
{
"epoch": 0.5872335427799636,
"grad_norm": 0.036021534353494644,
"learning_rate": 0.00017671683913452493,
"loss": 1.0734,
"step": 5000
},
{
"epoch": 0.5989782136355629,
"grad_norm": 0.03627597168087959,
"learning_rate": 0.00017624647224835372,
"loss": 1.0704,
"step": 5100
},
{
"epoch": 0.6107228844911622,
"grad_norm": 0.03433089703321457,
"learning_rate": 0.0001757761053621825,
"loss": 1.0741,
"step": 5200
},
{
"epoch": 0.6224675553467615,
"grad_norm": 0.03627678006887436,
"learning_rate": 0.0001753057384760113,
"loss": 1.0626,
"step": 5300
},
{
"epoch": 0.6342122262023607,
"grad_norm": 0.035330165177583694,
"learning_rate": 0.0001748353715898401,
"loss": 1.0597,
"step": 5400
},
{
"epoch": 0.64595689705796,
"grad_norm": 0.03662644326686859,
"learning_rate": 0.00017436500470366888,
"loss": 1.0712,
"step": 5500
},
{
"epoch": 0.6577015679135593,
"grad_norm": 0.03487861156463623,
"learning_rate": 0.00017389463781749767,
"loss": 1.0526,
"step": 5600
},
{
"epoch": 0.6694462387691585,
"grad_norm": 0.036091409623622894,
"learning_rate": 0.00017342427093132646,
"loss": 1.0641,
"step": 5700
},
{
"epoch": 0.6811909096247578,
"grad_norm": 0.037575751543045044,
"learning_rate": 0.00017295390404515525,
"loss": 1.0593,
"step": 5800
},
{
"epoch": 0.6929355804803571,
"grad_norm": 0.03606625273823738,
"learning_rate": 0.000172483537158984,
"loss": 1.067,
"step": 5900
},
{
"epoch": 0.7046802513359564,
"grad_norm": 0.03537227585911751,
"learning_rate": 0.0001720131702728128,
"loss": 1.0635,
"step": 6000
},
{
"epoch": 0.7164249221915556,
"grad_norm": 0.03722114861011505,
"learning_rate": 0.0001715428033866416,
"loss": 1.0704,
"step": 6100
},
{
"epoch": 0.7281695930471549,
"grad_norm": 0.03697160631418228,
"learning_rate": 0.00017107243650047038,
"loss": 1.0523,
"step": 6200
},
{
"epoch": 0.7399142639027542,
"grad_norm": 0.03575093299150467,
"learning_rate": 0.00017060206961429917,
"loss": 1.0629,
"step": 6300
},
{
"epoch": 0.7516589347583534,
"grad_norm": 0.0371549054980278,
"learning_rate": 0.00017013170272812795,
"loss": 1.068,
"step": 6400
},
{
"epoch": 0.7634036056139527,
"grad_norm": 0.03723159059882164,
"learning_rate": 0.00016966133584195674,
"loss": 1.0515,
"step": 6500
},
{
"epoch": 0.775148276469552,
"grad_norm": 0.03721994906663895,
"learning_rate": 0.00016919096895578553,
"loss": 1.0628,
"step": 6600
},
{
"epoch": 0.7868929473251512,
"grad_norm": 0.03621937334537506,
"learning_rate": 0.00016872060206961432,
"loss": 1.0691,
"step": 6700
},
{
"epoch": 0.7986376181807505,
"grad_norm": 0.03620177507400513,
"learning_rate": 0.00016825023518344308,
"loss": 1.0455,
"step": 6800
},
{
"epoch": 0.8103822890363498,
"grad_norm": 0.037204090505838394,
"learning_rate": 0.00016777986829727187,
"loss": 1.072,
"step": 6900
},
{
"epoch": 0.8221269598919491,
"grad_norm": 0.03673955425620079,
"learning_rate": 0.00016730950141110066,
"loss": 1.0649,
"step": 7000
},
{
"epoch": 0.8338716307475483,
"grad_norm": 0.036923281848430634,
"learning_rate": 0.00016683913452492945,
"loss": 1.0661,
"step": 7100
},
{
"epoch": 0.8456163016031476,
"grad_norm": 0.03734573721885681,
"learning_rate": 0.00016636876763875824,
"loss": 1.0636,
"step": 7200
},
{
"epoch": 0.8573609724587469,
"grad_norm": 0.03644491732120514,
"learning_rate": 0.00016589840075258703,
"loss": 1.0558,
"step": 7300
},
{
"epoch": 0.8691056433143461,
"grad_norm": 0.03660232573747635,
"learning_rate": 0.00016542803386641582,
"loss": 1.0671,
"step": 7400
},
{
"epoch": 0.8808503141699454,
"grad_norm": 0.036776404827833176,
"learning_rate": 0.0001649576669802446,
"loss": 1.0643,
"step": 7500
},
{
"epoch": 0.8925949850255447,
"grad_norm": 0.03714260086417198,
"learning_rate": 0.00016448730009407337,
"loss": 1.0645,
"step": 7600
},
{
"epoch": 0.904339655881144,
"grad_norm": 0.03652457147836685,
"learning_rate": 0.00016401693320790216,
"loss": 1.0541,
"step": 7700
},
{
"epoch": 0.9160843267367432,
"grad_norm": 0.03829098492860794,
"learning_rate": 0.00016354656632173095,
"loss": 1.0585,
"step": 7800
},
{
"epoch": 0.9278289975923425,
"grad_norm": 0.036905597895383835,
"learning_rate": 0.00016307619943555974,
"loss": 1.0563,
"step": 7900
},
{
"epoch": 0.9395736684479418,
"grad_norm": 0.036890897899866104,
"learning_rate": 0.00016260583254938853,
"loss": 1.059,
"step": 8000
},
{
"epoch": 0.951318339303541,
"grad_norm": 0.03704727813601494,
"learning_rate": 0.00016213546566321731,
"loss": 1.0567,
"step": 8100
},
{
"epoch": 0.9630630101591403,
"grad_norm": 0.03992870822548866,
"learning_rate": 0.0001616650987770461,
"loss": 1.0675,
"step": 8200
},
{
"epoch": 0.9748076810147396,
"grad_norm": 0.03689022362232208,
"learning_rate": 0.0001611947318908749,
"loss": 1.0592,
"step": 8300
},
{
"epoch": 0.9865523518703389,
"grad_norm": 0.03615827485918999,
"learning_rate": 0.00016072436500470368,
"loss": 1.0404,
"step": 8400
},
{
"epoch": 0.9982970227259381,
"grad_norm": 0.03881993889808655,
"learning_rate": 0.00016025399811853244,
"loss": 1.0506,
"step": 8500
},
{
"epoch": 1.0100416935815373,
"grad_norm": 0.03911367058753967,
"learning_rate": 0.00015978363123236123,
"loss": 1.0508,
"step": 8600
},
{
"epoch": 1.0217863644371366,
"grad_norm": 0.03765745460987091,
"learning_rate": 0.00015931326434619002,
"loss": 1.0432,
"step": 8700
},
{
"epoch": 1.0335310352927358,
"grad_norm": 0.038481660187244415,
"learning_rate": 0.0001588428974600188,
"loss": 1.0493,
"step": 8800
},
{
"epoch": 1.045275706148335,
"grad_norm": 0.03929019346833229,
"learning_rate": 0.0001583725305738476,
"loss": 1.0584,
"step": 8900
},
{
"epoch": 1.0570203770039344,
"grad_norm": 0.03770457208156586,
"learning_rate": 0.0001579021636876764,
"loss": 1.0497,
"step": 9000
},
{
"epoch": 1.0687650478595336,
"grad_norm": 0.037688400596380234,
"learning_rate": 0.00015743179680150518,
"loss": 1.0432,
"step": 9100
},
{
"epoch": 1.080509718715133,
"grad_norm": 0.0388583168387413,
"learning_rate": 0.00015696142991533397,
"loss": 1.0406,
"step": 9200
},
{
"epoch": 1.0922543895707322,
"grad_norm": 0.0381295308470726,
"learning_rate": 0.00015649106302916276,
"loss": 1.0343,
"step": 9300
},
{
"epoch": 1.1039990604263314,
"grad_norm": 0.03957001864910126,
"learning_rate": 0.00015602069614299155,
"loss": 1.0445,
"step": 9400
},
{
"epoch": 1.1157437312819307,
"grad_norm": 0.03933073207736015,
"learning_rate": 0.00015555032925682034,
"loss": 1.0498,
"step": 9500
},
{
"epoch": 1.12748840213753,
"grad_norm": 0.03816806897521019,
"learning_rate": 0.00015507996237064912,
"loss": 1.0441,
"step": 9600
},
{
"epoch": 1.1392330729931293,
"grad_norm": 0.038057826459407806,
"learning_rate": 0.0001546095954844779,
"loss": 1.0404,
"step": 9700
},
{
"epoch": 1.1509777438487285,
"grad_norm": 0.03927973657846451,
"learning_rate": 0.0001541392285983067,
"loss": 1.0435,
"step": 9800
},
{
"epoch": 1.1627224147043278,
"grad_norm": 0.039503954350948334,
"learning_rate": 0.0001536688617121355,
"loss": 1.0592,
"step": 9900
},
{
"epoch": 1.174467085559927,
"grad_norm": 0.04005419462919235,
"learning_rate": 0.00015319849482596428,
"loss": 1.0445,
"step": 10000
},
{
"epoch": 1.1862117564155263,
"grad_norm": 0.03986848145723343,
"learning_rate": 0.00015272812793979304,
"loss": 1.0569,
"step": 10100
},
{
"epoch": 1.1979564272711256,
"grad_norm": 0.03886035457253456,
"learning_rate": 0.00015225776105362183,
"loss": 1.0473,
"step": 10200
},
{
"epoch": 1.2097010981267249,
"grad_norm": 0.03941928222775459,
"learning_rate": 0.00015178739416745062,
"loss": 1.0399,
"step": 10300
},
{
"epoch": 1.2214457689823242,
"grad_norm": 0.04138774052262306,
"learning_rate": 0.0001513170272812794,
"loss": 1.0387,
"step": 10400
},
{
"epoch": 1.2331904398379234,
"grad_norm": 0.040227312594652176,
"learning_rate": 0.0001508466603951082,
"loss": 1.0592,
"step": 10500
},
{
"epoch": 1.2449351106935227,
"grad_norm": 0.03977705165743828,
"learning_rate": 0.000150376293508937,
"loss": 1.047,
"step": 10600
},
{
"epoch": 1.256679781549122,
"grad_norm": 0.040624938905239105,
"learning_rate": 0.00014990592662276578,
"loss": 1.0528,
"step": 10700
},
{
"epoch": 1.2684244524047212,
"grad_norm": 0.03948456794023514,
"learning_rate": 0.00014943555973659457,
"loss": 1.0413,
"step": 10800
},
{
"epoch": 1.2801691232603205,
"grad_norm": 0.0396355502307415,
"learning_rate": 0.00014896519285042333,
"loss": 1.0403,
"step": 10900
},
{
"epoch": 1.2919137941159198,
"grad_norm": 0.04015343636274338,
"learning_rate": 0.00014849482596425212,
"loss": 1.057,
"step": 11000
},
{
"epoch": 1.303658464971519,
"grad_norm": 0.03963370993733406,
"learning_rate": 0.0001480244590780809,
"loss": 1.0474,
"step": 11100
},
{
"epoch": 1.3154031358271183,
"grad_norm": 0.03991986811161041,
"learning_rate": 0.0001475540921919097,
"loss": 1.0333,
"step": 11200
},
{
"epoch": 1.3271478066827176,
"grad_norm": 0.040591537952423096,
"learning_rate": 0.00014708372530573848,
"loss": 1.0401,
"step": 11300
},
{
"epoch": 1.3388924775383169,
"grad_norm": 0.039190664887428284,
"learning_rate": 0.00014661335841956727,
"loss": 1.0467,
"step": 11400
},
{
"epoch": 1.3506371483939161,
"grad_norm": 0.03903213143348694,
"learning_rate": 0.00014614299153339606,
"loss": 1.0557,
"step": 11500
},
{
"epoch": 1.3623818192495154,
"grad_norm": 0.03976823762059212,
"learning_rate": 0.00014567262464722485,
"loss": 1.055,
"step": 11600
},
{
"epoch": 1.3741264901051147,
"grad_norm": 0.04024571180343628,
"learning_rate": 0.00014520225776105364,
"loss": 1.0375,
"step": 11700
},
{
"epoch": 1.385871160960714,
"grad_norm": 0.040485769510269165,
"learning_rate": 0.0001447318908748824,
"loss": 1.0407,
"step": 11800
},
{
"epoch": 1.3976158318163132,
"grad_norm": 0.04040844738483429,
"learning_rate": 0.0001442615239887112,
"loss": 1.0395,
"step": 11900
},
{
"epoch": 1.4093605026719125,
"grad_norm": 0.039215583354234695,
"learning_rate": 0.00014379115710253998,
"loss": 1.0417,
"step": 12000
},
{
"epoch": 1.4211051735275118,
"grad_norm": 0.040224071592092514,
"learning_rate": 0.00014332079021636877,
"loss": 1.0342,
"step": 12100
},
{
"epoch": 1.432849844383111,
"grad_norm": 0.03919661417603493,
"learning_rate": 0.00014285042333019756,
"loss": 1.0396,
"step": 12200
},
{
"epoch": 1.4445945152387103,
"grad_norm": 0.0384608618915081,
"learning_rate": 0.00014238005644402635,
"loss": 1.0467,
"step": 12300
},
{
"epoch": 1.4563391860943098,
"grad_norm": 0.039908237755298615,
"learning_rate": 0.00014190968955785514,
"loss": 1.0539,
"step": 12400
},
{
"epoch": 1.468083856949909,
"grad_norm": 0.0399697907269001,
"learning_rate": 0.00014143932267168393,
"loss": 1.0445,
"step": 12500
},
{
"epoch": 1.4798285278055083,
"grad_norm": 0.04020760953426361,
"learning_rate": 0.0001409689557855127,
"loss": 1.0456,
"step": 12600
},
{
"epoch": 1.4915731986611076,
"grad_norm": 0.04049207270145416,
"learning_rate": 0.00014049858889934148,
"loss": 1.0402,
"step": 12700
},
{
"epoch": 1.5033178695167067,
"grad_norm": 0.0405813567340374,
"learning_rate": 0.00014002822201317027,
"loss": 1.0327,
"step": 12800
},
{
"epoch": 1.515062540372306,
"grad_norm": 0.04098460450768471,
"learning_rate": 0.00013955785512699906,
"loss": 1.0426,
"step": 12900
},
{
"epoch": 1.5268072112279052,
"grad_norm": 0.03912067785859108,
"learning_rate": 0.00013908748824082785,
"loss": 1.0398,
"step": 13000
},
{
"epoch": 1.5385518820835045,
"grad_norm": 0.040918510407209396,
"learning_rate": 0.00013861712135465663,
"loss": 1.0511,
"step": 13100
},
{
"epoch": 1.5502965529391037,
"grad_norm": 0.039643533527851105,
"learning_rate": 0.00013814675446848542,
"loss": 1.0408,
"step": 13200
},
{
"epoch": 1.562041223794703,
"grad_norm": 0.04073023423552513,
"learning_rate": 0.0001376763875823142,
"loss": 1.0465,
"step": 13300
},
{
"epoch": 1.5737858946503023,
"grad_norm": 0.03993350267410278,
"learning_rate": 0.000137206020696143,
"loss": 1.0481,
"step": 13400
},
{
"epoch": 1.5855305655059015,
"grad_norm": 0.0393822006881237,
"learning_rate": 0.00013673565380997176,
"loss": 1.0378,
"step": 13500
},
{
"epoch": 1.5972752363615008,
"grad_norm": 0.041657913476228714,
"learning_rate": 0.00013626528692380055,
"loss": 1.0305,
"step": 13600
},
{
"epoch": 1.6090199072171,
"grad_norm": 0.0409579873085022,
"learning_rate": 0.00013579492003762934,
"loss": 1.0433,
"step": 13700
},
{
"epoch": 1.6207645780726994,
"grad_norm": 0.039673928171396255,
"learning_rate": 0.00013532455315145813,
"loss": 1.0286,
"step": 13800
},
{
"epoch": 1.6325092489282986,
"grad_norm": 0.04071459546685219,
"learning_rate": 0.00013485418626528692,
"loss": 1.0413,
"step": 13900
},
{
"epoch": 1.644253919783898,
"grad_norm": 0.04058365523815155,
"learning_rate": 0.00013438381937911574,
"loss": 1.0465,
"step": 14000
},
{
"epoch": 1.6559985906394972,
"grad_norm": 0.04230272024869919,
"learning_rate": 0.00013391345249294453,
"loss": 1.0304,
"step": 14100
},
{
"epoch": 1.6677432614950964,
"grad_norm": 0.04120560362935066,
"learning_rate": 0.0001334430856067733,
"loss": 1.0459,
"step": 14200
},
{
"epoch": 1.6794879323506957,
"grad_norm": 0.0403909832239151,
"learning_rate": 0.00013297271872060208,
"loss": 1.0402,
"step": 14300
},
{
"epoch": 1.691232603206295,
"grad_norm": 0.04099250212311745,
"learning_rate": 0.00013250235183443087,
"loss": 1.0477,
"step": 14400
},
{
"epoch": 1.7029772740618943,
"grad_norm": 0.039906516671180725,
"learning_rate": 0.00013203198494825965,
"loss": 1.057,
"step": 14500
},
{
"epoch": 1.7147219449174935,
"grad_norm": 0.04008522629737854,
"learning_rate": 0.00013156161806208844,
"loss": 1.0433,
"step": 14600
},
{
"epoch": 1.7264666157730928,
"grad_norm": 0.0416274331510067,
"learning_rate": 0.00013109125117591723,
"loss": 1.0405,
"step": 14700
},
{
"epoch": 1.738211286628692,
"grad_norm": 0.04142718017101288,
"learning_rate": 0.00013062088428974602,
"loss": 1.0349,
"step": 14800
},
{
"epoch": 1.7499559574842916,
"grad_norm": 0.0407978855073452,
"learning_rate": 0.0001301505174035748,
"loss": 1.0415,
"step": 14900
},
{
"epoch": 1.7617006283398908,
"grad_norm": 0.03977083042263985,
"learning_rate": 0.0001296801505174036,
"loss": 1.0522,
"step": 15000
},
{
"epoch": 1.77344529919549,
"grad_norm": 0.04186280444264412,
"learning_rate": 0.00012920978363123236,
"loss": 1.0515,
"step": 15100
},
{
"epoch": 1.7851899700510894,
"grad_norm": 0.04049232602119446,
"learning_rate": 0.00012873941674506115,
"loss": 1.0517,
"step": 15200
},
{
"epoch": 1.7969346409066886,
"grad_norm": 0.039164550602436066,
"learning_rate": 0.00012826904985888994,
"loss": 1.0403,
"step": 15300
},
{
"epoch": 1.808679311762288,
"grad_norm": 0.04166054725646973,
"learning_rate": 0.00012779868297271873,
"loss": 1.0469,
"step": 15400
},
{
"epoch": 1.8204239826178872,
"grad_norm": 0.0396597720682621,
"learning_rate": 0.00012732831608654752,
"loss": 1.0433,
"step": 15500
},
{
"epoch": 1.8321686534734865,
"grad_norm": 0.041060902178287506,
"learning_rate": 0.0001268579492003763,
"loss": 1.0298,
"step": 15600
},
{
"epoch": 1.8439133243290857,
"grad_norm": 0.04100984334945679,
"learning_rate": 0.0001263875823142051,
"loss": 1.0423,
"step": 15700
},
{
"epoch": 1.855657995184685,
"grad_norm": 0.03933743014931679,
"learning_rate": 0.00012591721542803389,
"loss": 1.0459,
"step": 15800
},
{
"epoch": 1.8674026660402843,
"grad_norm": 0.04171588271856308,
"learning_rate": 0.00012544684854186265,
"loss": 1.0411,
"step": 15900
},
{
"epoch": 1.8791473368958835,
"grad_norm": 0.04075104370713234,
"learning_rate": 0.00012497648165569144,
"loss": 1.0521,
"step": 16000
},
{
"epoch": 1.8908920077514828,
"grad_norm": 0.04037100449204445,
"learning_rate": 0.00012450611476952023,
"loss": 1.0482,
"step": 16100
},
{
"epoch": 1.902636678607082,
"grad_norm": 0.04113980382680893,
"learning_rate": 0.00012403574788334901,
"loss": 1.0335,
"step": 16200
},
{
"epoch": 1.9143813494626813,
"grad_norm": 0.04102000594139099,
"learning_rate": 0.0001235653809971778,
"loss": 1.0462,
"step": 16300
},
{
"epoch": 1.9261260203182806,
"grad_norm": 0.041277866810560226,
"learning_rate": 0.0001230950141110066,
"loss": 1.0472,
"step": 16400
},
{
"epoch": 1.93787069117388,
"grad_norm": 0.040296610444784164,
"learning_rate": 0.00012262464722483538,
"loss": 1.0455,
"step": 16500
},
{
"epoch": 1.9496153620294792,
"grad_norm": 0.04030190408229828,
"learning_rate": 0.00012215428033866417,
"loss": 1.036,
"step": 16600
},
{
"epoch": 1.9613600328850784,
"grad_norm": 0.04076563939452171,
"learning_rate": 0.00012168391345249295,
"loss": 1.0355,
"step": 16700
},
{
"epoch": 1.9731047037406777,
"grad_norm": 0.042260345071554184,
"learning_rate": 0.00012121354656632174,
"loss": 1.0318,
"step": 16800
},
{
"epoch": 1.984849374596277,
"grad_norm": 0.04094604775309563,
"learning_rate": 0.00012074317968015052,
"loss": 1.0272,
"step": 16900
},
{
"epoch": 1.9965940454518762,
"grad_norm": 0.04106820747256279,
"learning_rate": 0.0001202728127939793,
"loss": 1.0248,
"step": 17000
},
{
"epoch": 2.0083387163074753,
"grad_norm": 0.041894011199474335,
"learning_rate": 0.00011980244590780809,
"loss": 1.0415,
"step": 17100
},
{
"epoch": 2.0200833871630746,
"grad_norm": 0.04177311062812805,
"learning_rate": 0.00011933207902163688,
"loss": 1.034,
"step": 17200
},
{
"epoch": 2.031828058018674,
"grad_norm": 0.04308000206947327,
"learning_rate": 0.00011886171213546567,
"loss": 1.0164,
"step": 17300
},
{
"epoch": 2.043572728874273,
"grad_norm": 0.04087553173303604,
"learning_rate": 0.00011839134524929444,
"loss": 1.0271,
"step": 17400
},
{
"epoch": 2.0553173997298724,
"grad_norm": 0.04184258356690407,
"learning_rate": 0.00011792097836312323,
"loss": 1.0214,
"step": 17500
},
{
"epoch": 2.0670620705854716,
"grad_norm": 0.0424019880592823,
"learning_rate": 0.00011745061147695202,
"loss": 1.041,
"step": 17600
},
{
"epoch": 2.078806741441071,
"grad_norm": 0.04137306660413742,
"learning_rate": 0.00011698024459078081,
"loss": 1.0182,
"step": 17700
},
{
"epoch": 2.09055141229667,
"grad_norm": 0.04244280233979225,
"learning_rate": 0.0001165098777046096,
"loss": 1.0269,
"step": 17800
},
{
"epoch": 2.1022960831522695,
"grad_norm": 0.04111913591623306,
"learning_rate": 0.00011603951081843838,
"loss": 1.0335,
"step": 17900
},
{
"epoch": 2.1140407540078687,
"grad_norm": 0.0424952507019043,
"learning_rate": 0.00011556914393226716,
"loss": 1.0266,
"step": 18000
},
{
"epoch": 2.125785424863468,
"grad_norm": 0.0427490659058094,
"learning_rate": 0.00011509877704609595,
"loss": 1.0274,
"step": 18100
},
{
"epoch": 2.1375300957190673,
"grad_norm": 0.04336949810385704,
"learning_rate": 0.00011462841015992474,
"loss": 1.0268,
"step": 18200
},
{
"epoch": 2.1492747665746665,
"grad_norm": 0.04262473061680794,
"learning_rate": 0.00011415804327375352,
"loss": 1.0214,
"step": 18300
},
{
"epoch": 2.161019437430266,
"grad_norm": 0.04471023753285408,
"learning_rate": 0.00011368767638758231,
"loss": 1.0295,
"step": 18400
},
{
"epoch": 2.172764108285865,
"grad_norm": 0.044171739369630814,
"learning_rate": 0.0001132173095014111,
"loss": 1.0296,
"step": 18500
},
{
"epoch": 2.1845087791414644,
"grad_norm": 0.04301764816045761,
"learning_rate": 0.0001127469426152399,
"loss": 1.0258,
"step": 18600
},
{
"epoch": 2.196253449997064,
"grad_norm": 0.04325546696782112,
"learning_rate": 0.00011227657572906869,
"loss": 1.0288,
"step": 18700
},
{
"epoch": 2.207998120852663,
"grad_norm": 0.04137617349624634,
"learning_rate": 0.00011180620884289748,
"loss": 1.0385,
"step": 18800
},
{
"epoch": 2.2197427917082626,
"grad_norm": 0.043259892612695694,
"learning_rate": 0.00011133584195672627,
"loss": 1.0264,
"step": 18900
},
{
"epoch": 2.2314874625638614,
"grad_norm": 0.044129449874162674,
"learning_rate": 0.00011086547507055504,
"loss": 1.0313,
"step": 19000
},
{
"epoch": 2.243232133419461,
"grad_norm": 0.04146253690123558,
"learning_rate": 0.00011039510818438383,
"loss": 1.0304,
"step": 19100
},
{
"epoch": 2.25497680427506,
"grad_norm": 0.042836517095565796,
"learning_rate": 0.00010992474129821262,
"loss": 1.0281,
"step": 19200
},
{
"epoch": 2.2667214751306597,
"grad_norm": 0.04558749496936798,
"learning_rate": 0.00010945437441204141,
"loss": 1.03,
"step": 19300
},
{
"epoch": 2.2784661459862585,
"grad_norm": 0.04284907504916191,
"learning_rate": 0.00010898400752587018,
"loss": 1.0469,
"step": 19400
},
{
"epoch": 2.2902108168418582,
"grad_norm": 0.04364863410592079,
"learning_rate": 0.00010851364063969897,
"loss": 1.0295,
"step": 19500
},
{
"epoch": 2.301955487697457,
"grad_norm": 0.042074691504240036,
"learning_rate": 0.00010804327375352776,
"loss": 1.0458,
"step": 19600
},
{
"epoch": 2.3137001585530568,
"grad_norm": 0.043050698935985565,
"learning_rate": 0.00010757290686735655,
"loss": 1.0438,
"step": 19700
},
{
"epoch": 2.3254448294086556,
"grad_norm": 0.04526820033788681,
"learning_rate": 0.00010710253998118534,
"loss": 1.0283,
"step": 19800
},
{
"epoch": 2.3371895002642553,
"grad_norm": 0.04481109231710434,
"learning_rate": 0.00010663217309501412,
"loss": 1.0205,
"step": 19900
},
{
"epoch": 2.348934171119854,
"grad_norm": 0.04517560824751854,
"learning_rate": 0.0001061618062088429,
"loss": 1.0179,
"step": 20000
},
{
"epoch": 2.3607962886840097,
"grad_norm": 0.045082226395606995,
"learning_rate": 0.0001056914393226717,
"loss": 1.0406,
"step": 20100
},
{
"epoch": 2.372540959539609,
"grad_norm": 0.042884670197963715,
"learning_rate": 0.00010522107243650048,
"loss": 1.0221,
"step": 20200
},
{
"epoch": 2.3842856303952082,
"grad_norm": 0.04309197515249252,
"learning_rate": 0.00010475070555032926,
"loss": 1.0287,
"step": 20300
},
{
"epoch": 2.3960303012508075,
"grad_norm": 0.04434290900826454,
"learning_rate": 0.00010428033866415805,
"loss": 1.0269,
"step": 20400
},
{
"epoch": 2.407774972106407,
"grad_norm": 0.044556260108947754,
"learning_rate": 0.00010380997177798684,
"loss": 1.0454,
"step": 20500
},
{
"epoch": 2.419519642962006,
"grad_norm": 0.043353039771318436,
"learning_rate": 0.00010333960489181563,
"loss": 1.0401,
"step": 20600
},
{
"epoch": 2.4312643138176053,
"grad_norm": 0.045345306396484375,
"learning_rate": 0.0001028692380056444,
"loss": 1.0172,
"step": 20700
},
{
"epoch": 2.4430089846732046,
"grad_norm": 0.043075308203697205,
"learning_rate": 0.00010239887111947319,
"loss": 1.0413,
"step": 20800
},
{
"epoch": 2.454753655528804,
"grad_norm": 0.044308003038167953,
"learning_rate": 0.00010192850423330198,
"loss": 1.0292,
"step": 20900
},
{
"epoch": 2.466498326384403,
"grad_norm": 0.04506301134824753,
"learning_rate": 0.00010145813734713077,
"loss": 1.0382,
"step": 21000
},
{
"epoch": 2.4782429972400024,
"grad_norm": 0.04396146163344383,
"learning_rate": 0.00010098777046095956,
"loss": 1.0337,
"step": 21100
},
{
"epoch": 2.4899876680956017,
"grad_norm": 0.044499751180410385,
"learning_rate": 0.00010051740357478833,
"loss": 1.0407,
"step": 21200
},
{
"epoch": 2.501732338951201,
"grad_norm": 0.042769189924001694,
"learning_rate": 0.00010004703668861712,
"loss": 1.031,
"step": 21300
},
{
"epoch": 2.5134770098068,
"grad_norm": 0.0427669994533062,
"learning_rate": 9.957666980244591e-05,
"loss": 1.0193,
"step": 21400
},
{
"epoch": 2.5252216806623995,
"grad_norm": 0.04454643651843071,
"learning_rate": 9.91063029162747e-05,
"loss": 1.0256,
"step": 21500
},
{
"epoch": 2.5369663515179988,
"grad_norm": 0.042179521173238754,
"learning_rate": 9.863593603010348e-05,
"loss": 1.0293,
"step": 21600
},
{
"epoch": 2.548711022373598,
"grad_norm": 0.04245784506201744,
"learning_rate": 9.816556914393227e-05,
"loss": 1.0138,
"step": 21700
},
{
"epoch": 2.5604556932291973,
"grad_norm": 0.04282999783754349,
"learning_rate": 9.769520225776106e-05,
"loss": 1.0216,
"step": 21800
},
{
"epoch": 2.5722003640847966,
"grad_norm": 0.04309820756316185,
"learning_rate": 9.722483537158984e-05,
"loss": 1.0436,
"step": 21900
},
{
"epoch": 2.583945034940396,
"grad_norm": 0.0428336001932621,
"learning_rate": 9.675446848541862e-05,
"loss": 1.0327,
"step": 22000
},
{
"epoch": 2.595689705795995,
"grad_norm": 0.0433744341135025,
"learning_rate": 9.628410159924742e-05,
"loss": 1.0248,
"step": 22100
},
{
"epoch": 2.6074343766515944,
"grad_norm": 0.0440264493227005,
"learning_rate": 9.581373471307621e-05,
"loss": 1.0276,
"step": 22200
},
{
"epoch": 2.6191790475071937,
"grad_norm": 0.04474351555109024,
"learning_rate": 9.5343367826905e-05,
"loss": 1.0392,
"step": 22300
},
{
"epoch": 2.630923718362793,
"grad_norm": 0.04529641568660736,
"learning_rate": 9.487300094073378e-05,
"loss": 1.0436,
"step": 22400
},
{
"epoch": 2.642668389218392,
"grad_norm": 0.04383498802781105,
"learning_rate": 9.440263405456257e-05,
"loss": 1.0271,
"step": 22500
},
{
"epoch": 2.6544130600739915,
"grad_norm": 0.04416006803512573,
"learning_rate": 9.393226716839135e-05,
"loss": 1.035,
"step": 22600
},
{
"epoch": 2.6661577309295907,
"grad_norm": 0.0439978651702404,
"learning_rate": 9.346190028222014e-05,
"loss": 1.0242,
"step": 22700
},
{
"epoch": 2.67790240178519,
"grad_norm": 0.043451737612485886,
"learning_rate": 9.299153339604892e-05,
"loss": 1.0266,
"step": 22800
},
{
"epoch": 2.6896470726407893,
"grad_norm": 0.043133124709129333,
"learning_rate": 9.252116650987771e-05,
"loss": 1.0313,
"step": 22900
},
{
"epoch": 2.7013917434963886,
"grad_norm": 0.04365682601928711,
"learning_rate": 9.20507996237065e-05,
"loss": 1.0164,
"step": 23000
},
{
"epoch": 2.713136414351988,
"grad_norm": 0.045253172516822815,
"learning_rate": 9.158043273753529e-05,
"loss": 1.0234,
"step": 23100
},
{
"epoch": 2.724881085207587,
"grad_norm": 0.04371510446071625,
"learning_rate": 9.111006585136406e-05,
"loss": 1.0262,
"step": 23200
},
{
"epoch": 2.7366257560631864,
"grad_norm": 0.04663108289241791,
"learning_rate": 9.063969896519285e-05,
"loss": 1.027,
"step": 23300
},
{
"epoch": 2.7483704269187856,
"grad_norm": 0.043591178953647614,
"learning_rate": 9.016933207902164e-05,
"loss": 1.0362,
"step": 23400
},
{
"epoch": 2.760115097774385,
"grad_norm": 0.04423443600535393,
"learning_rate": 8.969896519285043e-05,
"loss": 1.0262,
"step": 23500
},
{
"epoch": 2.771859768629984,
"grad_norm": 0.045264832675457,
"learning_rate": 8.922859830667922e-05,
"loss": 1.0227,
"step": 23600
},
{
"epoch": 2.7836044394855834,
"grad_norm": 0.04213082045316696,
"learning_rate": 8.8758231420508e-05,
"loss": 1.0161,
"step": 23700
},
{
"epoch": 2.7953491103411827,
"grad_norm": 0.04401146247982979,
"learning_rate": 8.828786453433678e-05,
"loss": 1.0269,
"step": 23800
},
{
"epoch": 2.807093781196782,
"grad_norm": 0.043770622462034225,
"learning_rate": 8.781749764816557e-05,
"loss": 1.0429,
"step": 23900
},
{
"epoch": 2.8188384520523813,
"grad_norm": 0.04466963931918144,
"learning_rate": 8.734713076199436e-05,
"loss": 1.0309,
"step": 24000
},
{
"epoch": 2.8305831229079805,
"grad_norm": 0.042598120868206024,
"learning_rate": 8.687676387582314e-05,
"loss": 1.0321,
"step": 24100
},
{
"epoch": 2.84232779376358,
"grad_norm": 0.04534047096967697,
"learning_rate": 8.640639698965193e-05,
"loss": 1.0335,
"step": 24200
},
{
"epoch": 2.854072464619179,
"grad_norm": 0.044474124908447266,
"learning_rate": 8.593603010348071e-05,
"loss": 1.0303,
"step": 24300
},
{
"epoch": 2.8658171354747783,
"grad_norm": 0.04398440942168236,
"learning_rate": 8.546566321730952e-05,
"loss": 1.0228,
"step": 24400
},
{
"epoch": 2.8775618063303776,
"grad_norm": 0.043202903121709824,
"learning_rate": 8.499529633113829e-05,
"loss": 1.0299,
"step": 24500
},
{
"epoch": 2.889306477185977,
"grad_norm": 0.04326211288571358,
"learning_rate": 8.452492944496708e-05,
"loss": 1.0324,
"step": 24600
},
{
"epoch": 2.901051148041576,
"grad_norm": 0.044613469392061234,
"learning_rate": 8.405456255879587e-05,
"loss": 1.0269,
"step": 24700
},
{
"epoch": 2.9127958188971754,
"grad_norm": 0.04421741142868996,
"learning_rate": 8.358419567262466e-05,
"loss": 1.0193,
"step": 24800
},
{
"epoch": 2.9245404897527747,
"grad_norm": 0.044949114322662354,
"learning_rate": 8.311382878645344e-05,
"loss": 1.0296,
"step": 24900
},
{
"epoch": 2.936285160608374,
"grad_norm": 0.044719964265823364,
"learning_rate": 8.264346190028222e-05,
"loss": 1.031,
"step": 25000
},
{
"epoch": 2.9480298314639732,
"grad_norm": 0.04360034689307213,
"learning_rate": 8.217309501411101e-05,
"loss": 1.0163,
"step": 25100
},
{
"epoch": 2.9597745023195725,
"grad_norm": 0.0441979356110096,
"learning_rate": 8.17027281279398e-05,
"loss": 1.0325,
"step": 25200
},
{
"epoch": 2.971519173175172,
"grad_norm": 0.044677652418613434,
"learning_rate": 8.123236124176858e-05,
"loss": 1.0286,
"step": 25300
},
{
"epoch": 2.983263844030771,
"grad_norm": 0.042885322123765945,
"learning_rate": 8.076199435559737e-05,
"loss": 1.0365,
"step": 25400
},
{
"epoch": 2.9950085148863703,
"grad_norm": 0.042082566767930984,
"learning_rate": 8.029162746942616e-05,
"loss": 1.0351,
"step": 25500
},
{
"epoch": 3.0067531857419696,
"grad_norm": 0.04490746557712555,
"learning_rate": 7.982126058325495e-05,
"loss": 1.0112,
"step": 25600
},
{
"epoch": 3.018497856597569,
"grad_norm": 0.048318084329366684,
"learning_rate": 7.935089369708372e-05,
"loss": 1.0144,
"step": 25700
},
{
"epoch": 3.030242527453168,
"grad_norm": 0.04372231662273407,
"learning_rate": 7.888052681091251e-05,
"loss": 1.0081,
"step": 25800
},
{
"epoch": 3.0419871983087674,
"grad_norm": 0.04528006911277771,
"learning_rate": 7.84101599247413e-05,
"loss": 1.0215,
"step": 25900
},
{
"epoch": 3.0537318691643667,
"grad_norm": 0.04795797914266586,
"learning_rate": 7.793979303857009e-05,
"loss": 1.0226,
"step": 26000
},
{
"epoch": 3.065476540019966,
"grad_norm": 0.04441961273550987,
"learning_rate": 7.746942615239888e-05,
"loss": 1.0298,
"step": 26100
},
{
"epoch": 3.077221210875565,
"grad_norm": 0.044861868023872375,
"learning_rate": 7.699905926622765e-05,
"loss": 1.0158,
"step": 26200
},
{
"epoch": 3.0889658817311645,
"grad_norm": 0.04549916088581085,
"learning_rate": 7.652869238005644e-05,
"loss": 1.0107,
"step": 26300
},
{
"epoch": 3.1007105525867638,
"grad_norm": 0.04485148563981056,
"learning_rate": 7.605832549388523e-05,
"loss": 1.0295,
"step": 26400
},
{
"epoch": 3.112455223442363,
"grad_norm": 0.0463709756731987,
"learning_rate": 7.558795860771402e-05,
"loss": 1.0237,
"step": 26500
},
{
"epoch": 3.1241998942979623,
"grad_norm": 0.04507851600646973,
"learning_rate": 7.51175917215428e-05,
"loss": 1.0257,
"step": 26600
},
{
"epoch": 3.1359445651535616,
"grad_norm": 0.04443085938692093,
"learning_rate": 7.46472248353716e-05,
"loss": 1.0161,
"step": 26700
},
{
"epoch": 3.147689236009161,
"grad_norm": 0.04493951424956322,
"learning_rate": 7.417685794920039e-05,
"loss": 1.0103,
"step": 26800
},
{
"epoch": 3.15943390686476,
"grad_norm": 0.04466501250863075,
"learning_rate": 7.370649106302918e-05,
"loss": 1.0184,
"step": 26900
},
{
"epoch": 3.1711785777203594,
"grad_norm": 0.04674587398767471,
"learning_rate": 7.323612417685795e-05,
"loss": 1.0234,
"step": 27000
},
{
"epoch": 3.1829232485759587,
"grad_norm": 0.04568205028772354,
"learning_rate": 7.276575729068674e-05,
"loss": 1.0177,
"step": 27100
},
{
"epoch": 3.194667919431558,
"grad_norm": 0.04736079275608063,
"learning_rate": 7.229539040451553e-05,
"loss": 1.0238,
"step": 27200
},
{
"epoch": 3.206412590287157,
"grad_norm": 0.04510754346847534,
"learning_rate": 7.182502351834432e-05,
"loss": 1.0239,
"step": 27300
},
{
"epoch": 3.2181572611427565,
"grad_norm": 0.04676396772265434,
"learning_rate": 7.13546566321731e-05,
"loss": 1.0226,
"step": 27400
},
{
"epoch": 3.2299019319983557,
"grad_norm": 0.04639539122581482,
"learning_rate": 7.088428974600188e-05,
"loss": 1.0304,
"step": 27500
},
{
"epoch": 3.241646602853955,
"grad_norm": 0.046673484146595,
"learning_rate": 7.041392285983067e-05,
"loss": 1.029,
"step": 27600
},
{
"epoch": 3.2533912737095543,
"grad_norm": 0.04434806853532791,
"learning_rate": 6.994355597365946e-05,
"loss": 1.0231,
"step": 27700
},
{
"epoch": 3.2651359445651535,
"grad_norm": 0.046948377043008804,
"learning_rate": 6.947318908748824e-05,
"loss": 1.0212,
"step": 27800
},
{
"epoch": 3.276880615420753,
"grad_norm": 0.045691922307014465,
"learning_rate": 6.900282220131703e-05,
"loss": 1.0228,
"step": 27900
},
{
"epoch": 3.288625286276352,
"grad_norm": 0.04534591734409332,
"learning_rate": 6.853245531514582e-05,
"loss": 1.0311,
"step": 28000
},
{
"epoch": 3.3003699571319514,
"grad_norm": 0.045218247920274734,
"learning_rate": 6.80620884289746e-05,
"loss": 1.0145,
"step": 28100
},
{
"epoch": 3.3121146279875506,
"grad_norm": 0.046320728957653046,
"learning_rate": 6.75917215428034e-05,
"loss": 1.0184,
"step": 28200
},
{
"epoch": 3.32385929884315,
"grad_norm": 0.04595513269305229,
"learning_rate": 6.712135465663217e-05,
"loss": 1.0181,
"step": 28300
},
{
"epoch": 3.335603969698749,
"grad_norm": 0.04726444184780121,
"learning_rate": 6.665098777046096e-05,
"loss": 1.019,
"step": 28400
},
{
"epoch": 3.3473486405543484,
"grad_norm": 0.0476396419107914,
"learning_rate": 6.618062088428975e-05,
"loss": 1.0109,
"step": 28500
},
{
"epoch": 3.3590933114099477,
"grad_norm": 0.04619845747947693,
"learning_rate": 6.571025399811854e-05,
"loss": 1.0322,
"step": 28600
},
{
"epoch": 3.370837982265547,
"grad_norm": 0.04548267647624016,
"learning_rate": 6.523988711194731e-05,
"loss": 1.0088,
"step": 28700
},
{
"epoch": 3.3825826531211463,
"grad_norm": 0.04472291097044945,
"learning_rate": 6.47695202257761e-05,
"loss": 1.0371,
"step": 28800
},
{
"epoch": 3.3943273239767455,
"grad_norm": 0.04602396488189697,
"learning_rate": 6.429915333960489e-05,
"loss": 1.0246,
"step": 28900
},
{
"epoch": 3.406071994832345,
"grad_norm": 0.0454532653093338,
"learning_rate": 6.382878645343368e-05,
"loss": 1.0254,
"step": 29000
},
{
"epoch": 3.417816665687944,
"grad_norm": 0.04494043067097664,
"learning_rate": 6.335841956726247e-05,
"loss": 1.0193,
"step": 29100
},
{
"epoch": 3.4295613365435433,
"grad_norm": 0.045226361602544785,
"learning_rate": 6.288805268109126e-05,
"loss": 1.0075,
"step": 29200
},
{
"epoch": 3.4413060073991426,
"grad_norm": 0.04578743502497673,
"learning_rate": 6.241768579492005e-05,
"loss": 1.0175,
"step": 29300
},
{
"epoch": 3.453050678254742,
"grad_norm": 0.04741055890917778,
"learning_rate": 6.194731890874884e-05,
"loss": 1.025,
"step": 29400
},
{
"epoch": 3.464795349110341,
"grad_norm": 0.046121254563331604,
"learning_rate": 6.147695202257761e-05,
"loss": 1.0122,
"step": 29500
},
{
"epoch": 3.4765400199659404,
"grad_norm": 0.04572110250592232,
"learning_rate": 6.10065851364064e-05,
"loss": 1.0226,
"step": 29600
},
{
"epoch": 3.4882846908215397,
"grad_norm": 0.04542776942253113,
"learning_rate": 6.053621825023519e-05,
"loss": 1.0139,
"step": 29700
},
{
"epoch": 3.500029361677139,
"grad_norm": 0.04612453654408455,
"learning_rate": 6.006585136406397e-05,
"loss": 1.0137,
"step": 29800
},
{
"epoch": 3.5117740325327382,
"grad_norm": 0.04599248990416527,
"learning_rate": 5.959548447789276e-05,
"loss": 1.0254,
"step": 29900
},
{
"epoch": 3.5235187033883375,
"grad_norm": 0.047213006764650345,
"learning_rate": 5.9125117591721544e-05,
"loss": 1.0202,
"step": 30000
},
{
"epoch": 3.5352633742439368,
"grad_norm": 0.04772693291306496,
"learning_rate": 5.865475070555033e-05,
"loss": 1.0158,
"step": 30100
},
{
"epoch": 3.547008045099536,
"grad_norm": 0.04780668392777443,
"learning_rate": 5.8184383819379116e-05,
"loss": 1.0122,
"step": 30200
},
{
"epoch": 3.5587527159551353,
"grad_norm": 0.04593056067824364,
"learning_rate": 5.7714016933207905e-05,
"loss": 1.0196,
"step": 30300
},
{
"epoch": 3.5704973868107346,
"grad_norm": 0.046468161046504974,
"learning_rate": 5.724365004703669e-05,
"loss": 1.016,
"step": 30400
},
{
"epoch": 3.582242057666334,
"grad_norm": 0.046613674610853195,
"learning_rate": 5.6773283160865476e-05,
"loss": 1.028,
"step": 30500
},
{
"epoch": 3.593986728521933,
"grad_norm": 0.0453767292201519,
"learning_rate": 5.6302916274694265e-05,
"loss": 1.0179,
"step": 30600
},
{
"epoch": 3.6057313993775324,
"grad_norm": 0.0448760949075222,
"learning_rate": 5.583254938852305e-05,
"loss": 1.0107,
"step": 30700
},
{
"epoch": 3.6174760702331317,
"grad_norm": 0.04624709486961365,
"learning_rate": 5.5362182502351837e-05,
"loss": 1.0191,
"step": 30800
},
{
"epoch": 3.629220741088731,
"grad_norm": 0.04776296019554138,
"learning_rate": 5.489181561618062e-05,
"loss": 1.0145,
"step": 30900
},
{
"epoch": 3.64096541194433,
"grad_norm": 0.044639695435762405,
"learning_rate": 5.442144873000941e-05,
"loss": 1.0286,
"step": 31000
},
{
"epoch": 3.6527100827999295,
"grad_norm": 0.04474237933754921,
"learning_rate": 5.395108184383819e-05,
"loss": 1.0202,
"step": 31100
},
{
"epoch": 3.6644547536555288,
"grad_norm": 0.045259665697813034,
"learning_rate": 5.348071495766698e-05,
"loss": 1.0034,
"step": 31200
},
{
"epoch": 3.676199424511128,
"grad_norm": 0.04631993547081947,
"learning_rate": 5.3010348071495775e-05,
"loss": 1.026,
"step": 31300
},
{
"epoch": 3.6879440953667273,
"grad_norm": 0.04611456021666527,
"learning_rate": 5.253998118532456e-05,
"loss": 1.0084,
"step": 31400
},
{
"epoch": 3.6996887662223266,
"grad_norm": 0.04521900787949562,
"learning_rate": 5.206961429915335e-05,
"loss": 1.019,
"step": 31500
},
{
"epoch": 3.711433437077926,
"grad_norm": 0.04682457074522972,
"learning_rate": 5.1599247412982136e-05,
"loss": 1.029,
"step": 31600
},
{
"epoch": 3.723178107933525,
"grad_norm": 0.04528072476387024,
"learning_rate": 5.112888052681092e-05,
"loss": 1.0381,
"step": 31700
},
{
"epoch": 3.7349227787891244,
"grad_norm": 0.044861361384391785,
"learning_rate": 5.065851364063971e-05,
"loss": 1.0215,
"step": 31800
},
{
"epoch": 3.7466674496447236,
"grad_norm": 0.04482056945562363,
"learning_rate": 5.018814675446849e-05,
"loss": 1.0113,
"step": 31900
},
{
"epoch": 3.758412120500323,
"grad_norm": 0.046249981969594955,
"learning_rate": 4.971777986829728e-05,
"loss": 1.0162,
"step": 32000
},
{
"epoch": 3.7702155147102,
"grad_norm": 0.04573667049407959,
"learning_rate": 4.924741298212606e-05,
"loss": 1.0207,
"step": 32100
},
{
"epoch": 3.7819601855657994,
"grad_norm": 0.045440200716257095,
"learning_rate": 4.877704609595485e-05,
"loss": 1.0204,
"step": 32200
},
{
"epoch": 3.7937048564213987,
"grad_norm": 0.043568406254053116,
"learning_rate": 4.830667920978363e-05,
"loss": 1.0218,
"step": 32300
},
{
"epoch": 3.805449527276998,
"grad_norm": 0.04695621505379677,
"learning_rate": 4.783631232361242e-05,
"loss": 1.016,
"step": 32400
},
{
"epoch": 3.817194198132597,
"grad_norm": 0.04511117562651634,
"learning_rate": 4.7365945437441204e-05,
"loss": 1.018,
"step": 32500
},
{
"epoch": 3.8289388689881965,
"grad_norm": 0.0471058115363121,
"learning_rate": 4.689557855126999e-05,
"loss": 1.0265,
"step": 32600
},
{
"epoch": 3.8406835398437957,
"grad_norm": 0.04763401299715042,
"learning_rate": 4.6425211665098775e-05,
"loss": 1.0207,
"step": 32700
},
{
"epoch": 3.852428210699395,
"grad_norm": 0.04863814637064934,
"learning_rate": 4.5954844778927564e-05,
"loss": 1.0176,
"step": 32800
},
{
"epoch": 3.8641728815549943,
"grad_norm": 0.04665295407176018,
"learning_rate": 4.5484477892756347e-05,
"loss": 1.0143,
"step": 32900
},
{
"epoch": 3.8759175524105935,
"grad_norm": 0.04519110545516014,
"learning_rate": 4.501411100658514e-05,
"loss": 1.0206,
"step": 33000
},
{
"epoch": 3.887662223266193,
"grad_norm": 0.04543546214699745,
"learning_rate": 4.4543744120413925e-05,
"loss": 1.0204,
"step": 33100
},
{
"epoch": 3.899406894121792,
"grad_norm": 0.04484422877430916,
"learning_rate": 4.4073377234242714e-05,
"loss": 1.0166,
"step": 33200
},
{
"epoch": 3.9111515649773914,
"grad_norm": 0.04460673779249191,
"learning_rate": 4.3603010348071496e-05,
"loss": 1.0116,
"step": 33300
},
{
"epoch": 3.9228962358329906,
"grad_norm": 0.04780727997422218,
"learning_rate": 4.3132643461900285e-05,
"loss": 1.0127,
"step": 33400
},
{
"epoch": 3.93464090668859,
"grad_norm": 0.045138854533433914,
"learning_rate": 4.2662276575729074e-05,
"loss": 1.0173,
"step": 33500
},
{
"epoch": 3.946385577544189,
"grad_norm": 0.046860042959451675,
"learning_rate": 4.219190968955786e-05,
"loss": 1.0164,
"step": 33600
},
{
"epoch": 3.9581302483997884,
"grad_norm": 0.04478363320231438,
"learning_rate": 4.1721542803386646e-05,
"loss": 1.0339,
"step": 33700
},
{
"epoch": 3.9698749192553877,
"grad_norm": 0.04575762897729874,
"learning_rate": 4.125117591721543e-05,
"loss": 1.01,
"step": 33800
},
{
"epoch": 3.981619590110987,
"grad_norm": 0.04523037001490593,
"learning_rate": 4.078080903104422e-05,
"loss": 1.0145,
"step": 33900
},
{
"epoch": 3.9933642609665863,
"grad_norm": 0.04593048244714737,
"learning_rate": 4.0310442144873e-05,
"loss": 1.0131,
"step": 34000
}
],
"logging_steps": 100,
"max_steps": 42570,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.302351602279881e+20,
"train_batch_size": 9,
"trial_name": null,
"trial_params": null
}