hllj's picture
Upload folder using huggingface_hub
6fb20af verified
raw
history blame
177 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.3665823527255398,
"eval_steps": 1000,
"global_step": 10000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0003665823527255398,
"grad_norm": 0.8778485808644284,
"learning_rate": 2e-07,
"loss": 2.1465,
"step": 10
},
{
"epoch": 0.0007331647054510796,
"grad_norm": 1.0370696683685088,
"learning_rate": 4e-07,
"loss": 2.1972,
"step": 20
},
{
"epoch": 0.0010997470581766194,
"grad_norm": 1.0006676078231553,
"learning_rate": 6e-07,
"loss": 2.1582,
"step": 30
},
{
"epoch": 0.0014663294109021592,
"grad_norm": 0.8854477289760336,
"learning_rate": 8e-07,
"loss": 2.1934,
"step": 40
},
{
"epoch": 0.001832911763627699,
"grad_norm": 0.8999727006888211,
"learning_rate": 1e-06,
"loss": 2.1904,
"step": 50
},
{
"epoch": 0.002199494116353239,
"grad_norm": 0.932364223152173,
"learning_rate": 9.999996672053607e-07,
"loss": 2.1706,
"step": 60
},
{
"epoch": 0.0025660764690787785,
"grad_norm": 1.0299012086021375,
"learning_rate": 9.999986688218858e-07,
"loss": 2.1958,
"step": 70
},
{
"epoch": 0.0029326588218043185,
"grad_norm": 0.9395158606106717,
"learning_rate": 9.999970048509042e-07,
"loss": 2.2273,
"step": 80
},
{
"epoch": 0.003299241174529858,
"grad_norm": 0.9869960358591985,
"learning_rate": 9.999946752946311e-07,
"loss": 2.1807,
"step": 90
},
{
"epoch": 0.003665823527255398,
"grad_norm": 0.98825421384792,
"learning_rate": 9.999916801561675e-07,
"loss": 2.1348,
"step": 100
},
{
"epoch": 0.004032405879980938,
"grad_norm": 1.1988395000442367,
"learning_rate": 9.999880194395004e-07,
"loss": 2.1377,
"step": 110
},
{
"epoch": 0.004398988232706478,
"grad_norm": 1.129064025809237,
"learning_rate": 9.99983693149503e-07,
"loss": 2.1565,
"step": 120
},
{
"epoch": 0.004765570585432017,
"grad_norm": 1.0050118479797396,
"learning_rate": 9.999787012919342e-07,
"loss": 2.1701,
"step": 130
},
{
"epoch": 0.005132152938157557,
"grad_norm": 0.9232759625522824,
"learning_rate": 9.999730438734393e-07,
"loss": 2.0963,
"step": 140
},
{
"epoch": 0.0054987352908830965,
"grad_norm": 1.0348403490845175,
"learning_rate": 9.999667209015492e-07,
"loss": 2.1989,
"step": 150
},
{
"epoch": 0.005865317643608637,
"grad_norm": 1.0493408122676058,
"learning_rate": 9.999597323846806e-07,
"loss": 2.1707,
"step": 160
},
{
"epoch": 0.0062318999963341766,
"grad_norm": 1.116513730433909,
"learning_rate": 9.99952078332137e-07,
"loss": 2.1614,
"step": 170
},
{
"epoch": 0.006598482349059716,
"grad_norm": 0.9558367370618089,
"learning_rate": 9.999437587541072e-07,
"loss": 2.1214,
"step": 180
},
{
"epoch": 0.006965064701785256,
"grad_norm": 1.0990453159310916,
"learning_rate": 9.999347736616657e-07,
"loss": 2.1514,
"step": 190
},
{
"epoch": 0.007331647054510796,
"grad_norm": 1.051146838955259,
"learning_rate": 9.999251230667734e-07,
"loss": 2.1672,
"step": 200
},
{
"epoch": 0.007698229407236336,
"grad_norm": 1.0528334484392676,
"learning_rate": 9.99914806982277e-07,
"loss": 2.1651,
"step": 210
},
{
"epoch": 0.008064811759961876,
"grad_norm": 1.0488001209067876,
"learning_rate": 9.999038254219094e-07,
"loss": 2.1269,
"step": 220
},
{
"epoch": 0.008431394112687415,
"grad_norm": 1.0423933094923075,
"learning_rate": 9.998921784002884e-07,
"loss": 2.1409,
"step": 230
},
{
"epoch": 0.008797976465412955,
"grad_norm": 1.2035163212207243,
"learning_rate": 9.998798659329188e-07,
"loss": 2.0949,
"step": 240
},
{
"epoch": 0.009164558818138494,
"grad_norm": 1.0311622443925152,
"learning_rate": 9.998668880361902e-07,
"loss": 2.1572,
"step": 250
},
{
"epoch": 0.009531141170864035,
"grad_norm": 1.0199238986570556,
"learning_rate": 9.99853244727379e-07,
"loss": 2.0908,
"step": 260
},
{
"epoch": 0.009897723523589575,
"grad_norm": 1.1052910194491554,
"learning_rate": 9.998389360246465e-07,
"loss": 2.1046,
"step": 270
},
{
"epoch": 0.010264305876315114,
"grad_norm": 1.0244380828171549,
"learning_rate": 9.998239619470404e-07,
"loss": 2.1351,
"step": 280
},
{
"epoch": 0.010630888229040654,
"grad_norm": 1.0080176905815665,
"learning_rate": 9.998083225144936e-07,
"loss": 2.089,
"step": 290
},
{
"epoch": 0.010997470581766193,
"grad_norm": 0.9588881775099163,
"learning_rate": 9.997920177478252e-07,
"loss": 2.0186,
"step": 300
},
{
"epoch": 0.011364052934491733,
"grad_norm": 1.0223619251237732,
"learning_rate": 9.997750476687394e-07,
"loss": 2.0966,
"step": 310
},
{
"epoch": 0.011730635287217274,
"grad_norm": 1.1940399230837102,
"learning_rate": 9.99757412299827e-07,
"loss": 2.1036,
"step": 320
},
{
"epoch": 0.012097217639942813,
"grad_norm": 0.9943487033980454,
"learning_rate": 9.997391116645635e-07,
"loss": 2.0628,
"step": 330
},
{
"epoch": 0.012463799992668353,
"grad_norm": 1.03891573450971,
"learning_rate": 9.997201457873104e-07,
"loss": 2.0691,
"step": 340
},
{
"epoch": 0.012830382345393894,
"grad_norm": 1.116344520158988,
"learning_rate": 9.997005146933144e-07,
"loss": 2.0524,
"step": 350
},
{
"epoch": 0.013196964698119432,
"grad_norm": 0.9966017657422209,
"learning_rate": 9.996802184087082e-07,
"loss": 2.0779,
"step": 360
},
{
"epoch": 0.013563547050844973,
"grad_norm": 1.0412743923430994,
"learning_rate": 9.996592569605099e-07,
"loss": 2.0376,
"step": 370
},
{
"epoch": 0.013930129403570512,
"grad_norm": 1.1118998023014073,
"learning_rate": 9.996376303766227e-07,
"loss": 2.015,
"step": 380
},
{
"epoch": 0.014296711756296052,
"grad_norm": 1.0325566872435106,
"learning_rate": 9.996153386858355e-07,
"loss": 2.0249,
"step": 390
},
{
"epoch": 0.014663294109021592,
"grad_norm": 0.9345504257678122,
"learning_rate": 9.995923819178226e-07,
"loss": 2.0451,
"step": 400
},
{
"epoch": 0.015029876461747131,
"grad_norm": 0.8875269101106378,
"learning_rate": 9.995687601031435e-07,
"loss": 2.0108,
"step": 410
},
{
"epoch": 0.015396458814472672,
"grad_norm": 1.0784341870798066,
"learning_rate": 9.99544473273243e-07,
"loss": 2.0201,
"step": 420
},
{
"epoch": 0.015763041167198212,
"grad_norm": 0.9379135038421763,
"learning_rate": 9.995195214604515e-07,
"loss": 1.941,
"step": 430
},
{
"epoch": 0.016129623519923753,
"grad_norm": 0.9126909079244707,
"learning_rate": 9.994939046979838e-07,
"loss": 1.9684,
"step": 440
},
{
"epoch": 0.01649620587264929,
"grad_norm": 0.8838022442791796,
"learning_rate": 9.994676230199407e-07,
"loss": 2.0389,
"step": 450
},
{
"epoch": 0.01686278822537483,
"grad_norm": 0.8836839199930503,
"learning_rate": 9.994406764613082e-07,
"loss": 1.9666,
"step": 460
},
{
"epoch": 0.01722937057810037,
"grad_norm": 1.0627568898996331,
"learning_rate": 9.994130650579563e-07,
"loss": 2.0156,
"step": 470
},
{
"epoch": 0.01759595293082591,
"grad_norm": 0.9141641845780258,
"learning_rate": 9.993847888466408e-07,
"loss": 1.9649,
"step": 480
},
{
"epoch": 0.01796253528355145,
"grad_norm": 0.9929808622960486,
"learning_rate": 9.993558478650027e-07,
"loss": 1.951,
"step": 490
},
{
"epoch": 0.01832911763627699,
"grad_norm": 0.9649106649125109,
"learning_rate": 9.993262421515677e-07,
"loss": 2.0194,
"step": 500
},
{
"epoch": 0.01869569998900253,
"grad_norm": 0.9646184299435382,
"learning_rate": 9.992959717457456e-07,
"loss": 2.0054,
"step": 510
},
{
"epoch": 0.01906228234172807,
"grad_norm": 0.9754107205971403,
"learning_rate": 9.992650366878326e-07,
"loss": 1.9614,
"step": 520
},
{
"epoch": 0.01942886469445361,
"grad_norm": 0.825876663123403,
"learning_rate": 9.99233437019008e-07,
"loss": 2.0141,
"step": 530
},
{
"epoch": 0.01979544704717915,
"grad_norm": 0.9898145517539251,
"learning_rate": 9.992011727813372e-07,
"loss": 1.9788,
"step": 540
},
{
"epoch": 0.020162029399904687,
"grad_norm": 1.1244188599069105,
"learning_rate": 9.991682440177694e-07,
"loss": 1.9034,
"step": 550
},
{
"epoch": 0.020528611752630228,
"grad_norm": 1.1497344942569774,
"learning_rate": 9.991346507721387e-07,
"loss": 1.9211,
"step": 560
},
{
"epoch": 0.020895194105355768,
"grad_norm": 0.9021316458842555,
"learning_rate": 9.991003930891637e-07,
"loss": 1.9182,
"step": 570
},
{
"epoch": 0.02126177645808131,
"grad_norm": 0.8307709564470201,
"learning_rate": 9.990654710144475e-07,
"loss": 1.9272,
"step": 580
},
{
"epoch": 0.02162835881080685,
"grad_norm": 0.8745951617052735,
"learning_rate": 9.990298845944777e-07,
"loss": 1.9499,
"step": 590
},
{
"epoch": 0.021994941163532386,
"grad_norm": 0.8243921045085457,
"learning_rate": 9.98993633876626e-07,
"loss": 1.9221,
"step": 600
},
{
"epoch": 0.022361523516257927,
"grad_norm": 0.9285168979863858,
"learning_rate": 9.989567189091486e-07,
"loss": 1.8804,
"step": 610
},
{
"epoch": 0.022728105868983467,
"grad_norm": 0.9675998606348684,
"learning_rate": 9.98919139741186e-07,
"loss": 1.9019,
"step": 620
},
{
"epoch": 0.023094688221709007,
"grad_norm": 0.8852104273861887,
"learning_rate": 9.988808964227629e-07,
"loss": 1.8772,
"step": 630
},
{
"epoch": 0.023461270574434548,
"grad_norm": 0.819719680853091,
"learning_rate": 9.988419890047877e-07,
"loss": 1.9171,
"step": 640
},
{
"epoch": 0.023827852927160085,
"grad_norm": 0.93140794342887,
"learning_rate": 9.988024175390533e-07,
"loss": 1.8467,
"step": 650
},
{
"epoch": 0.024194435279885625,
"grad_norm": 0.8360802933834758,
"learning_rate": 9.987621820782363e-07,
"loss": 1.9233,
"step": 660
},
{
"epoch": 0.024561017632611166,
"grad_norm": 0.8157180427592693,
"learning_rate": 9.987212826758975e-07,
"loss": 1.9473,
"step": 670
},
{
"epoch": 0.024927599985336706,
"grad_norm": 0.9793002573948607,
"learning_rate": 9.98679719386481e-07,
"loss": 1.8931,
"step": 680
},
{
"epoch": 0.025294182338062247,
"grad_norm": 0.8445420197840301,
"learning_rate": 9.986374922653154e-07,
"loss": 1.8686,
"step": 690
},
{
"epoch": 0.025660764690787787,
"grad_norm": 0.8584605142905422,
"learning_rate": 9.985946013686119e-07,
"loss": 1.8967,
"step": 700
},
{
"epoch": 0.026027347043513324,
"grad_norm": 0.98656156834715,
"learning_rate": 9.985510467534664e-07,
"loss": 1.8635,
"step": 710
},
{
"epoch": 0.026393929396238865,
"grad_norm": 0.9182458113746159,
"learning_rate": 9.985068284778577e-07,
"loss": 1.8693,
"step": 720
},
{
"epoch": 0.026760511748964405,
"grad_norm": 0.8330989668660308,
"learning_rate": 9.984619466006485e-07,
"loss": 1.8613,
"step": 730
},
{
"epoch": 0.027127094101689946,
"grad_norm": 0.8644736624360776,
"learning_rate": 9.98416401181584e-07,
"loss": 1.8628,
"step": 740
},
{
"epoch": 0.027493676454415486,
"grad_norm": 0.987168924150431,
"learning_rate": 9.98370192281294e-07,
"loss": 1.8943,
"step": 750
},
{
"epoch": 0.027860258807141023,
"grad_norm": 0.8720418625775509,
"learning_rate": 9.983233199612903e-07,
"loss": 1.9446,
"step": 760
},
{
"epoch": 0.028226841159866563,
"grad_norm": 0.7953663245922279,
"learning_rate": 9.982757842839687e-07,
"loss": 1.9014,
"step": 770
},
{
"epoch": 0.028593423512592104,
"grad_norm": 0.9296681817326182,
"learning_rate": 9.98227585312607e-07,
"loss": 1.8108,
"step": 780
},
{
"epoch": 0.028960005865317644,
"grad_norm": 0.8062000633701384,
"learning_rate": 9.981787231113675e-07,
"loss": 1.8345,
"step": 790
},
{
"epoch": 0.029326588218043185,
"grad_norm": 0.7938194156111642,
"learning_rate": 9.981291977452939e-07,
"loss": 1.8941,
"step": 800
},
{
"epoch": 0.029693170570768722,
"grad_norm": 0.9291321405470028,
"learning_rate": 9.980790092803135e-07,
"loss": 1.8403,
"step": 810
},
{
"epoch": 0.030059752923494262,
"grad_norm": 0.8275423223500764,
"learning_rate": 9.980281577832363e-07,
"loss": 1.8402,
"step": 820
},
{
"epoch": 0.030426335276219803,
"grad_norm": 0.8980283349268403,
"learning_rate": 9.979766433217545e-07,
"loss": 1.8691,
"step": 830
},
{
"epoch": 0.030792917628945343,
"grad_norm": 0.7768796883189981,
"learning_rate": 9.979244659644429e-07,
"loss": 1.888,
"step": 840
},
{
"epoch": 0.031159499981670884,
"grad_norm": 0.818398169635764,
"learning_rate": 9.978716257807593e-07,
"loss": 1.8814,
"step": 850
},
{
"epoch": 0.031526082334396424,
"grad_norm": 0.8442121417280394,
"learning_rate": 9.97818122841043e-07,
"loss": 1.8369,
"step": 860
},
{
"epoch": 0.031892664687121965,
"grad_norm": 0.8176757534156489,
"learning_rate": 9.977639572165162e-07,
"loss": 1.8591,
"step": 870
},
{
"epoch": 0.032259247039847505,
"grad_norm": 0.8029579269470367,
"learning_rate": 9.97709128979283e-07,
"loss": 1.8866,
"step": 880
},
{
"epoch": 0.03262582939257304,
"grad_norm": 0.8812915944662771,
"learning_rate": 9.976536382023294e-07,
"loss": 1.8366,
"step": 890
},
{
"epoch": 0.03299241174529858,
"grad_norm": 0.777876054228082,
"learning_rate": 9.97597484959524e-07,
"loss": 1.8322,
"step": 900
},
{
"epoch": 0.03335899409802412,
"grad_norm": 0.9073927568433396,
"learning_rate": 9.975406693256162e-07,
"loss": 1.8238,
"step": 910
},
{
"epoch": 0.03372557645074966,
"grad_norm": 1.154230547383887,
"learning_rate": 9.974831913762382e-07,
"loss": 1.8574,
"step": 920
},
{
"epoch": 0.0340921588034752,
"grad_norm": 0.8196714978615802,
"learning_rate": 9.974250511879031e-07,
"loss": 1.8423,
"step": 930
},
{
"epoch": 0.03445874115620074,
"grad_norm": 0.9288752746341313,
"learning_rate": 9.97366248838006e-07,
"loss": 1.8993,
"step": 940
},
{
"epoch": 0.03482532350892628,
"grad_norm": 0.7950657259868453,
"learning_rate": 9.973067844048235e-07,
"loss": 1.8741,
"step": 950
},
{
"epoch": 0.03519190586165182,
"grad_norm": 0.796086365915343,
"learning_rate": 9.972466579675131e-07,
"loss": 1.7832,
"step": 960
},
{
"epoch": 0.03555848821437736,
"grad_norm": 0.9066172708399791,
"learning_rate": 9.97185869606114e-07,
"loss": 1.8462,
"step": 970
},
{
"epoch": 0.0359250705671029,
"grad_norm": 1.038083569499433,
"learning_rate": 9.971244194015463e-07,
"loss": 1.858,
"step": 980
},
{
"epoch": 0.036291652919828436,
"grad_norm": 0.9051533251684815,
"learning_rate": 9.97062307435611e-07,
"loss": 1.8387,
"step": 990
},
{
"epoch": 0.03665823527255398,
"grad_norm": 0.8381523935993735,
"learning_rate": 9.969995337909908e-07,
"loss": 1.8361,
"step": 1000
},
{
"epoch": 0.03665823527255398,
"eval_accuracy": 0.5988169778677517,
"eval_loss": 1.8318405151367188,
"eval_runtime": 308.5555,
"eval_samples_per_second": 10.718,
"eval_steps_per_second": 0.894,
"step": 1000
},
{
"epoch": 0.03702481762527952,
"grad_norm": 0.8427628207388767,
"learning_rate": 9.969360985512478e-07,
"loss": 1.8265,
"step": 1010
},
{
"epoch": 0.03739139997800506,
"grad_norm": 0.8552215254960128,
"learning_rate": 9.968720018008264e-07,
"loss": 1.858,
"step": 1020
},
{
"epoch": 0.0377579823307306,
"grad_norm": 0.9770990446912831,
"learning_rate": 9.968072436250502e-07,
"loss": 1.8336,
"step": 1030
},
{
"epoch": 0.03812456468345614,
"grad_norm": 0.8749109462328284,
"learning_rate": 9.967418241101245e-07,
"loss": 1.8659,
"step": 1040
},
{
"epoch": 0.03849114703618168,
"grad_norm": 1.0370092544039358,
"learning_rate": 9.966757433431338e-07,
"loss": 1.7817,
"step": 1050
},
{
"epoch": 0.03885772938890722,
"grad_norm": 0.9115228378829131,
"learning_rate": 9.966090014120439e-07,
"loss": 1.8024,
"step": 1060
},
{
"epoch": 0.03922431174163276,
"grad_norm": 0.8868427346212977,
"learning_rate": 9.965415984056998e-07,
"loss": 1.8437,
"step": 1070
},
{
"epoch": 0.0395908940943583,
"grad_norm": 0.9053364161480404,
"learning_rate": 9.96473534413827e-07,
"loss": 1.817,
"step": 1080
},
{
"epoch": 0.039957476447083834,
"grad_norm": 0.9133195528454671,
"learning_rate": 9.964048095270312e-07,
"loss": 1.7877,
"step": 1090
},
{
"epoch": 0.040324058799809374,
"grad_norm": 1.0646101033232054,
"learning_rate": 9.963354238367971e-07,
"loss": 1.784,
"step": 1100
},
{
"epoch": 0.040690641152534915,
"grad_norm": 0.7708104862115812,
"learning_rate": 9.962653774354897e-07,
"loss": 1.8534,
"step": 1110
},
{
"epoch": 0.041057223505260455,
"grad_norm": 0.8675790148592712,
"learning_rate": 9.96194670416353e-07,
"loss": 1.8549,
"step": 1120
},
{
"epoch": 0.041423805857985996,
"grad_norm": 0.8417668918121122,
"learning_rate": 9.961233028735107e-07,
"loss": 1.816,
"step": 1130
},
{
"epoch": 0.041790388210711536,
"grad_norm": 0.8168288703880237,
"learning_rate": 9.960512749019661e-07,
"loss": 1.8512,
"step": 1140
},
{
"epoch": 0.04215697056343708,
"grad_norm": 0.8018545416660454,
"learning_rate": 9.95978586597601e-07,
"loss": 1.832,
"step": 1150
},
{
"epoch": 0.04252355291616262,
"grad_norm": 0.9865966895727584,
"learning_rate": 9.959052380571764e-07,
"loss": 1.853,
"step": 1160
},
{
"epoch": 0.04289013526888816,
"grad_norm": 0.8107907928839149,
"learning_rate": 9.958312293783327e-07,
"loss": 1.85,
"step": 1170
},
{
"epoch": 0.0432567176216137,
"grad_norm": 0.9230676080344427,
"learning_rate": 9.957565606595882e-07,
"loss": 1.7839,
"step": 1180
},
{
"epoch": 0.04362329997433924,
"grad_norm": 0.9011134249108275,
"learning_rate": 9.956812320003407e-07,
"loss": 1.7649,
"step": 1190
},
{
"epoch": 0.04398988232706477,
"grad_norm": 0.8877055310067349,
"learning_rate": 9.956052435008657e-07,
"loss": 1.8358,
"step": 1200
},
{
"epoch": 0.04435646467979031,
"grad_norm": 0.9441745533847735,
"learning_rate": 9.955285952623177e-07,
"loss": 1.8217,
"step": 1210
},
{
"epoch": 0.04472304703251585,
"grad_norm": 0.9280531244485228,
"learning_rate": 9.954512873867292e-07,
"loss": 1.8273,
"step": 1220
},
{
"epoch": 0.04508962938524139,
"grad_norm": 1.0733510489183336,
"learning_rate": 9.95373319977011e-07,
"loss": 1.8289,
"step": 1230
},
{
"epoch": 0.045456211737966934,
"grad_norm": 0.9194393203848475,
"learning_rate": 9.952946931369512e-07,
"loss": 1.8134,
"step": 1240
},
{
"epoch": 0.045822794090692474,
"grad_norm": 0.8924651164337065,
"learning_rate": 9.952154069712164e-07,
"loss": 1.8233,
"step": 1250
},
{
"epoch": 0.046189376443418015,
"grad_norm": 0.9645620934573451,
"learning_rate": 9.951354615853506e-07,
"loss": 1.7951,
"step": 1260
},
{
"epoch": 0.046555958796143555,
"grad_norm": 0.9514951845878826,
"learning_rate": 9.950548570857755e-07,
"loss": 1.8034,
"step": 1270
},
{
"epoch": 0.046922541148869096,
"grad_norm": 1.0861848487934576,
"learning_rate": 9.949735935797898e-07,
"loss": 1.7845,
"step": 1280
},
{
"epoch": 0.047289123501594636,
"grad_norm": 0.9444165617124335,
"learning_rate": 9.948916711755702e-07,
"loss": 1.8499,
"step": 1290
},
{
"epoch": 0.04765570585432017,
"grad_norm": 0.9296489213610688,
"learning_rate": 9.948090899821695e-07,
"loss": 1.8362,
"step": 1300
},
{
"epoch": 0.04802228820704571,
"grad_norm": 0.9031404187157595,
"learning_rate": 9.947258501095183e-07,
"loss": 1.7987,
"step": 1310
},
{
"epoch": 0.04838887055977125,
"grad_norm": 0.9893576898507132,
"learning_rate": 9.946419516684238e-07,
"loss": 1.7901,
"step": 1320
},
{
"epoch": 0.04875545291249679,
"grad_norm": 0.8312432281714202,
"learning_rate": 9.945573947705696e-07,
"loss": 1.7877,
"step": 1330
},
{
"epoch": 0.04912203526522233,
"grad_norm": 0.9503234488792208,
"learning_rate": 9.944721795285161e-07,
"loss": 1.7814,
"step": 1340
},
{
"epoch": 0.04948861761794787,
"grad_norm": 0.8138144516056374,
"learning_rate": 9.943863060557e-07,
"loss": 1.7973,
"step": 1350
},
{
"epoch": 0.04985519997067341,
"grad_norm": 1.0236050868655204,
"learning_rate": 9.942997744664346e-07,
"loss": 1.766,
"step": 1360
},
{
"epoch": 0.05022178232339895,
"grad_norm": 0.8876253030811799,
"learning_rate": 9.942125848759084e-07,
"loss": 1.8025,
"step": 1370
},
{
"epoch": 0.05058836467612449,
"grad_norm": 0.9143837255426513,
"learning_rate": 9.941247374001864e-07,
"loss": 1.8256,
"step": 1380
},
{
"epoch": 0.050954947028850034,
"grad_norm": 0.7919956208916636,
"learning_rate": 9.940362321562095e-07,
"loss": 1.7966,
"step": 1390
},
{
"epoch": 0.051321529381575574,
"grad_norm": 0.9593927463945575,
"learning_rate": 9.939470692617936e-07,
"loss": 1.756,
"step": 1400
},
{
"epoch": 0.05168811173430111,
"grad_norm": 1.0264148022637987,
"learning_rate": 9.938572488356309e-07,
"loss": 1.7938,
"step": 1410
},
{
"epoch": 0.05205469408702665,
"grad_norm": 1.0694910008156386,
"learning_rate": 9.937667709972882e-07,
"loss": 1.7151,
"step": 1420
},
{
"epoch": 0.05242127643975219,
"grad_norm": 1.106949179035861,
"learning_rate": 9.936756358672075e-07,
"loss": 1.7566,
"step": 1430
},
{
"epoch": 0.05278785879247773,
"grad_norm": 0.8484995009187619,
"learning_rate": 9.935838435667062e-07,
"loss": 1.8061,
"step": 1440
},
{
"epoch": 0.05315444114520327,
"grad_norm": 0.9442924790988804,
"learning_rate": 9.93491394217976e-07,
"loss": 1.7938,
"step": 1450
},
{
"epoch": 0.05352102349792881,
"grad_norm": 0.8835040984395444,
"learning_rate": 9.933982879440838e-07,
"loss": 1.7801,
"step": 1460
},
{
"epoch": 0.05388760585065435,
"grad_norm": 0.951681021528121,
"learning_rate": 9.933045248689704e-07,
"loss": 1.7839,
"step": 1470
},
{
"epoch": 0.05425418820337989,
"grad_norm": 0.8986214443009446,
"learning_rate": 9.932101051174513e-07,
"loss": 1.8251,
"step": 1480
},
{
"epoch": 0.05462077055610543,
"grad_norm": 0.8136477078651573,
"learning_rate": 9.93115028815216e-07,
"loss": 1.8429,
"step": 1490
},
{
"epoch": 0.05498735290883097,
"grad_norm": 1.0031260237221131,
"learning_rate": 9.93019296088828e-07,
"loss": 1.7663,
"step": 1500
},
{
"epoch": 0.055353935261556506,
"grad_norm": 0.9959012828848206,
"learning_rate": 9.92922907065725e-07,
"loss": 1.8269,
"step": 1510
},
{
"epoch": 0.055720517614282046,
"grad_norm": 0.8915575658825868,
"learning_rate": 9.928258618742176e-07,
"loss": 1.7696,
"step": 1520
},
{
"epoch": 0.056087099967007586,
"grad_norm": 0.9963782636445598,
"learning_rate": 9.927281606434902e-07,
"loss": 1.7738,
"step": 1530
},
{
"epoch": 0.05645368231973313,
"grad_norm": 0.9381564546633785,
"learning_rate": 9.92629803503601e-07,
"loss": 1.7333,
"step": 1540
},
{
"epoch": 0.05682026467245867,
"grad_norm": 1.0017202007335113,
"learning_rate": 9.925307905854807e-07,
"loss": 1.8095,
"step": 1550
},
{
"epoch": 0.05718684702518421,
"grad_norm": 1.0543725728983615,
"learning_rate": 9.924311220209332e-07,
"loss": 1.7571,
"step": 1560
},
{
"epoch": 0.05755342937790975,
"grad_norm": 1.0455383232236297,
"learning_rate": 9.92330797942635e-07,
"loss": 1.7605,
"step": 1570
},
{
"epoch": 0.05792001173063529,
"grad_norm": 0.8416991518569622,
"learning_rate": 9.922298184841356e-07,
"loss": 1.7703,
"step": 1580
},
{
"epoch": 0.05828659408336083,
"grad_norm": 0.92044213042727,
"learning_rate": 9.921281837798565e-07,
"loss": 1.7051,
"step": 1590
},
{
"epoch": 0.05865317643608637,
"grad_norm": 0.9422384532621354,
"learning_rate": 9.920258939650918e-07,
"loss": 1.7882,
"step": 1600
},
{
"epoch": 0.0590197587888119,
"grad_norm": 1.1464397608985724,
"learning_rate": 9.919229491760074e-07,
"loss": 1.7504,
"step": 1610
},
{
"epoch": 0.059386341141537444,
"grad_norm": 1.1503410560007548,
"learning_rate": 9.918193495496411e-07,
"loss": 1.7755,
"step": 1620
},
{
"epoch": 0.059752923494262984,
"grad_norm": 1.034854775422536,
"learning_rate": 9.917150952239028e-07,
"loss": 1.8109,
"step": 1630
},
{
"epoch": 0.060119505846988525,
"grad_norm": 0.9357240877838402,
"learning_rate": 9.916101863375734e-07,
"loss": 1.812,
"step": 1640
},
{
"epoch": 0.060486088199714065,
"grad_norm": 1.2613406348730127,
"learning_rate": 9.915046230303055e-07,
"loss": 1.7299,
"step": 1650
},
{
"epoch": 0.060852670552439606,
"grad_norm": 0.991269818479319,
"learning_rate": 9.913984054426226e-07,
"loss": 1.6839,
"step": 1660
},
{
"epoch": 0.061219252905165146,
"grad_norm": 1.0426302229265827,
"learning_rate": 9.91291533715919e-07,
"loss": 1.6983,
"step": 1670
},
{
"epoch": 0.061585835257890686,
"grad_norm": 1.0623577818006307,
"learning_rate": 9.911840079924607e-07,
"loss": 1.7586,
"step": 1680
},
{
"epoch": 0.06195241761061623,
"grad_norm": 0.9792793493189645,
"learning_rate": 9.910758284153834e-07,
"loss": 1.7863,
"step": 1690
},
{
"epoch": 0.06231899996334177,
"grad_norm": 1.1013133546227525,
"learning_rate": 9.90966995128693e-07,
"loss": 1.7586,
"step": 1700
},
{
"epoch": 0.0626855823160673,
"grad_norm": 1.2653001609685381,
"learning_rate": 9.908575082772664e-07,
"loss": 1.7087,
"step": 1710
},
{
"epoch": 0.06305216466879285,
"grad_norm": 1.2600949114865185,
"learning_rate": 9.907473680068501e-07,
"loss": 1.6974,
"step": 1720
},
{
"epoch": 0.06341874702151838,
"grad_norm": 1.0352843166386823,
"learning_rate": 9.906365744640605e-07,
"loss": 1.7247,
"step": 1730
},
{
"epoch": 0.06378532937424393,
"grad_norm": 1.0534586823177523,
"learning_rate": 9.905251277963838e-07,
"loss": 1.7989,
"step": 1740
},
{
"epoch": 0.06415191172696946,
"grad_norm": 1.0901888662447625,
"learning_rate": 9.904130281521749e-07,
"loss": 1.7495,
"step": 1750
},
{
"epoch": 0.06451849407969501,
"grad_norm": 1.0657237836075932,
"learning_rate": 9.903002756806589e-07,
"loss": 1.7393,
"step": 1760
},
{
"epoch": 0.06488507643242054,
"grad_norm": 1.0695629454280169,
"learning_rate": 9.901868705319291e-07,
"loss": 1.784,
"step": 1770
},
{
"epoch": 0.06525165878514608,
"grad_norm": 0.9206279700392275,
"learning_rate": 9.900728128569482e-07,
"loss": 1.758,
"step": 1780
},
{
"epoch": 0.06561824113787162,
"grad_norm": 1.0410164391482535,
"learning_rate": 9.899581028075473e-07,
"loss": 1.7252,
"step": 1790
},
{
"epoch": 0.06598482349059716,
"grad_norm": 0.9377493357256449,
"learning_rate": 9.898427405364262e-07,
"loss": 1.74,
"step": 1800
},
{
"epoch": 0.0663514058433227,
"grad_norm": 1.1272971880737597,
"learning_rate": 9.897267261971524e-07,
"loss": 1.7524,
"step": 1810
},
{
"epoch": 0.06671798819604824,
"grad_norm": 1.0979559562270786,
"learning_rate": 9.896100599441618e-07,
"loss": 1.6988,
"step": 1820
},
{
"epoch": 0.06708457054877379,
"grad_norm": 0.961855276743755,
"learning_rate": 9.894927419327576e-07,
"loss": 1.7327,
"step": 1830
},
{
"epoch": 0.06745115290149932,
"grad_norm": 0.97235897562474,
"learning_rate": 9.893747723191118e-07,
"loss": 1.7544,
"step": 1840
},
{
"epoch": 0.06781773525422487,
"grad_norm": 1.1764451813427488,
"learning_rate": 9.892561512602626e-07,
"loss": 1.7616,
"step": 1850
},
{
"epoch": 0.0681843176069504,
"grad_norm": 0.9690232157285822,
"learning_rate": 9.891368789141158e-07,
"loss": 1.7386,
"step": 1860
},
{
"epoch": 0.06855089995967593,
"grad_norm": 1.131145797735988,
"learning_rate": 9.89016955439444e-07,
"loss": 1.7473,
"step": 1870
},
{
"epoch": 0.06891748231240148,
"grad_norm": 1.1996910697441496,
"learning_rate": 9.88896380995887e-07,
"loss": 1.7502,
"step": 1880
},
{
"epoch": 0.06928406466512702,
"grad_norm": 1.2280647210603344,
"learning_rate": 9.887751557439513e-07,
"loss": 1.7547,
"step": 1890
},
{
"epoch": 0.06965064701785256,
"grad_norm": 1.0705375351848956,
"learning_rate": 9.886532798450085e-07,
"loss": 1.7577,
"step": 1900
},
{
"epoch": 0.0700172293705781,
"grad_norm": 1.0083918166967278,
"learning_rate": 9.88530753461298e-07,
"loss": 1.7193,
"step": 1910
},
{
"epoch": 0.07038381172330364,
"grad_norm": 1.0053388433251793,
"learning_rate": 9.884075767559236e-07,
"loss": 1.7635,
"step": 1920
},
{
"epoch": 0.07075039407602918,
"grad_norm": 1.1405257537860627,
"learning_rate": 9.88283749892856e-07,
"loss": 1.7859,
"step": 1930
},
{
"epoch": 0.07111697642875472,
"grad_norm": 1.3872222978621402,
"learning_rate": 9.881592730369305e-07,
"loss": 1.6823,
"step": 1940
},
{
"epoch": 0.07148355878148026,
"grad_norm": 1.0500974949147595,
"learning_rate": 9.880341463538483e-07,
"loss": 1.7268,
"step": 1950
},
{
"epoch": 0.0718501411342058,
"grad_norm": 1.1146107157958263,
"learning_rate": 9.879083700101754e-07,
"loss": 1.7324,
"step": 1960
},
{
"epoch": 0.07221672348693134,
"grad_norm": 1.0782444093138666,
"learning_rate": 9.877819441733421e-07,
"loss": 1.7219,
"step": 1970
},
{
"epoch": 0.07258330583965687,
"grad_norm": 1.1066515564824118,
"learning_rate": 9.876548690116443e-07,
"loss": 1.6974,
"step": 1980
},
{
"epoch": 0.07294988819238242,
"grad_norm": 1.0551270004207765,
"learning_rate": 9.875271446942416e-07,
"loss": 1.7086,
"step": 1990
},
{
"epoch": 0.07331647054510795,
"grad_norm": 1.0172022580059552,
"learning_rate": 9.873987713911579e-07,
"loss": 1.7281,
"step": 2000
},
{
"epoch": 0.07331647054510795,
"eval_accuracy": 0.6153943652920695,
"eval_loss": 1.7325148582458496,
"eval_runtime": 307.9034,
"eval_samples_per_second": 10.74,
"eval_steps_per_second": 0.896,
"step": 2000
},
{
"epoch": 0.0736830528978335,
"grad_norm": 1.0319650415221862,
"learning_rate": 9.872697492732805e-07,
"loss": 1.699,
"step": 2010
},
{
"epoch": 0.07404963525055903,
"grad_norm": 0.9982774529316707,
"learning_rate": 9.871400785123615e-07,
"loss": 1.7476,
"step": 2020
},
{
"epoch": 0.07441621760328458,
"grad_norm": 1.1272779709424325,
"learning_rate": 9.870097592810156e-07,
"loss": 1.7911,
"step": 2030
},
{
"epoch": 0.07478279995601012,
"grad_norm": 1.0356947186293473,
"learning_rate": 9.86878791752721e-07,
"loss": 1.7038,
"step": 2040
},
{
"epoch": 0.07514938230873566,
"grad_norm": 0.9227271241300935,
"learning_rate": 9.867471761018187e-07,
"loss": 1.789,
"step": 2050
},
{
"epoch": 0.0755159646614612,
"grad_norm": 1.1484518524699514,
"learning_rate": 9.86614912503513e-07,
"loss": 1.7706,
"step": 2060
},
{
"epoch": 0.07588254701418674,
"grad_norm": 0.8955923870076745,
"learning_rate": 9.864820011338698e-07,
"loss": 1.7543,
"step": 2070
},
{
"epoch": 0.07624912936691228,
"grad_norm": 1.1335067807492596,
"learning_rate": 9.863484421698182e-07,
"loss": 1.7155,
"step": 2080
},
{
"epoch": 0.07661571171963781,
"grad_norm": 1.1784649675887455,
"learning_rate": 9.86214235789149e-07,
"loss": 1.7198,
"step": 2090
},
{
"epoch": 0.07698229407236336,
"grad_norm": 0.9990776315852751,
"learning_rate": 9.860793821705153e-07,
"loss": 1.7088,
"step": 2100
},
{
"epoch": 0.07734887642508889,
"grad_norm": 1.8933737366748618,
"learning_rate": 9.859438814934306e-07,
"loss": 1.7815,
"step": 2110
},
{
"epoch": 0.07771545877781444,
"grad_norm": 1.0824373033670114,
"learning_rate": 9.858077339382708e-07,
"loss": 1.7056,
"step": 2120
},
{
"epoch": 0.07808204113053997,
"grad_norm": 1.0459040499217758,
"learning_rate": 9.856709396862727e-07,
"loss": 1.7587,
"step": 2130
},
{
"epoch": 0.07844862348326552,
"grad_norm": 1.1273027866420589,
"learning_rate": 9.855334989195338e-07,
"loss": 1.6718,
"step": 2140
},
{
"epoch": 0.07881520583599105,
"grad_norm": 1.1216307142085522,
"learning_rate": 9.853954118210124e-07,
"loss": 1.6925,
"step": 2150
},
{
"epoch": 0.0791817881887166,
"grad_norm": 1.2320479842440668,
"learning_rate": 9.852566785745269e-07,
"loss": 1.7128,
"step": 2160
},
{
"epoch": 0.07954837054144213,
"grad_norm": 1.0679388999130817,
"learning_rate": 9.851172993647562e-07,
"loss": 1.7063,
"step": 2170
},
{
"epoch": 0.07991495289416767,
"grad_norm": 1.2733808120999472,
"learning_rate": 9.849772743772387e-07,
"loss": 1.69,
"step": 2180
},
{
"epoch": 0.08028153524689322,
"grad_norm": 1.240045987921097,
"learning_rate": 9.848366037983728e-07,
"loss": 1.7382,
"step": 2190
},
{
"epoch": 0.08064811759961875,
"grad_norm": 1.0370629833579919,
"learning_rate": 9.846952878154162e-07,
"loss": 1.7135,
"step": 2200
},
{
"epoch": 0.0810146999523443,
"grad_norm": 1.1809158590474762,
"learning_rate": 9.845533266164856e-07,
"loss": 1.7197,
"step": 2210
},
{
"epoch": 0.08138128230506983,
"grad_norm": 1.0143562772242192,
"learning_rate": 9.844107203905567e-07,
"loss": 1.7062,
"step": 2220
},
{
"epoch": 0.08174786465779538,
"grad_norm": 1.1841441026483928,
"learning_rate": 9.842674693274639e-07,
"loss": 1.6766,
"step": 2230
},
{
"epoch": 0.08211444701052091,
"grad_norm": 1.1281564379658906,
"learning_rate": 9.841235736179e-07,
"loss": 1.6485,
"step": 2240
},
{
"epoch": 0.08248102936324646,
"grad_norm": 1.2660731034162191,
"learning_rate": 9.83979033453416e-07,
"loss": 1.7513,
"step": 2250
},
{
"epoch": 0.08284761171597199,
"grad_norm": 1.1670722746985231,
"learning_rate": 9.8383384902642e-07,
"loss": 1.7282,
"step": 2260
},
{
"epoch": 0.08321419406869754,
"grad_norm": 1.1924698170354644,
"learning_rate": 9.836880205301795e-07,
"loss": 1.7339,
"step": 2270
},
{
"epoch": 0.08358077642142307,
"grad_norm": 1.0522491790203259,
"learning_rate": 9.835415481588173e-07,
"loss": 1.6907,
"step": 2280
},
{
"epoch": 0.0839473587741486,
"grad_norm": 1.1650865835189006,
"learning_rate": 9.83394432107315e-07,
"loss": 1.718,
"step": 2290
},
{
"epoch": 0.08431394112687415,
"grad_norm": 0.9881537861019963,
"learning_rate": 9.832466725715097e-07,
"loss": 1.7423,
"step": 2300
},
{
"epoch": 0.08468052347959969,
"grad_norm": 1.0843420992658444,
"learning_rate": 9.830982697480958e-07,
"loss": 1.7112,
"step": 2310
},
{
"epoch": 0.08504710583232523,
"grad_norm": 1.1947303847486304,
"learning_rate": 9.829492238346244e-07,
"loss": 1.6813,
"step": 2320
},
{
"epoch": 0.08541368818505077,
"grad_norm": 1.04336555772043,
"learning_rate": 9.82799535029502e-07,
"loss": 1.6871,
"step": 2330
},
{
"epoch": 0.08578027053777632,
"grad_norm": 1.3465243494238373,
"learning_rate": 9.826492035319911e-07,
"loss": 1.7358,
"step": 2340
},
{
"epoch": 0.08614685289050185,
"grad_norm": 1.1173189734449491,
"learning_rate": 9.824982295422097e-07,
"loss": 1.7047,
"step": 2350
},
{
"epoch": 0.0865134352432274,
"grad_norm": 1.2520018391632697,
"learning_rate": 9.823466132611313e-07,
"loss": 1.6984,
"step": 2360
},
{
"epoch": 0.08688001759595293,
"grad_norm": 1.03470369404529,
"learning_rate": 9.82194354890584e-07,
"loss": 1.7278,
"step": 2370
},
{
"epoch": 0.08724659994867848,
"grad_norm": 1.0164204083388344,
"learning_rate": 9.820414546332513e-07,
"loss": 1.7458,
"step": 2380
},
{
"epoch": 0.08761318230140401,
"grad_norm": 1.2348821126024987,
"learning_rate": 9.818879126926701e-07,
"loss": 1.7343,
"step": 2390
},
{
"epoch": 0.08797976465412954,
"grad_norm": 1.0011105767660962,
"learning_rate": 9.817337292732328e-07,
"loss": 1.7131,
"step": 2400
},
{
"epoch": 0.08834634700685509,
"grad_norm": 1.0710762717577924,
"learning_rate": 9.815789045801847e-07,
"loss": 1.6617,
"step": 2410
},
{
"epoch": 0.08871292935958063,
"grad_norm": 1.1055970569118785,
"learning_rate": 9.814234388196252e-07,
"loss": 1.758,
"step": 2420
},
{
"epoch": 0.08907951171230617,
"grad_norm": 1.013594052614807,
"learning_rate": 9.81267332198507e-07,
"loss": 1.6906,
"step": 2430
},
{
"epoch": 0.0894460940650317,
"grad_norm": 1.0649424099545044,
"learning_rate": 9.811105849246359e-07,
"loss": 1.6896,
"step": 2440
},
{
"epoch": 0.08981267641775725,
"grad_norm": 1.7084885584877294,
"learning_rate": 9.809531972066705e-07,
"loss": 1.6614,
"step": 2450
},
{
"epoch": 0.09017925877048279,
"grad_norm": 1.5758236147361129,
"learning_rate": 9.807951692541217e-07,
"loss": 1.6952,
"step": 2460
},
{
"epoch": 0.09054584112320833,
"grad_norm": 1.3585874981966901,
"learning_rate": 9.806365012773532e-07,
"loss": 1.7113,
"step": 2470
},
{
"epoch": 0.09091242347593387,
"grad_norm": 1.3061869321513975,
"learning_rate": 9.804771934875807e-07,
"loss": 1.6796,
"step": 2480
},
{
"epoch": 0.09127900582865942,
"grad_norm": 1.1540286110201206,
"learning_rate": 9.803172460968705e-07,
"loss": 1.7097,
"step": 2490
},
{
"epoch": 0.09164558818138495,
"grad_norm": 1.2915686809771951,
"learning_rate": 9.80156659318142e-07,
"loss": 1.7138,
"step": 2500
},
{
"epoch": 0.09201217053411048,
"grad_norm": 1.1468908768097306,
"learning_rate": 9.799954333651642e-07,
"loss": 1.7038,
"step": 2510
},
{
"epoch": 0.09237875288683603,
"grad_norm": 1.257655656482852,
"learning_rate": 9.79833568452558e-07,
"loss": 1.677,
"step": 2520
},
{
"epoch": 0.09274533523956156,
"grad_norm": 1.6361492549326027,
"learning_rate": 9.796710647957944e-07,
"loss": 1.6155,
"step": 2530
},
{
"epoch": 0.09311191759228711,
"grad_norm": 1.1505717408841072,
"learning_rate": 9.795079226111949e-07,
"loss": 1.6811,
"step": 2540
},
{
"epoch": 0.09347849994501264,
"grad_norm": 1.1983166183129195,
"learning_rate": 9.793441421159308e-07,
"loss": 1.7203,
"step": 2550
},
{
"epoch": 0.09384508229773819,
"grad_norm": 1.1985818933727272,
"learning_rate": 9.79179723528023e-07,
"loss": 1.7232,
"step": 2560
},
{
"epoch": 0.09421166465046373,
"grad_norm": 1.0143700528752713,
"learning_rate": 9.790146670663422e-07,
"loss": 1.6916,
"step": 2570
},
{
"epoch": 0.09457824700318927,
"grad_norm": 1.121117592417204,
"learning_rate": 9.788489729506082e-07,
"loss": 1.6683,
"step": 2580
},
{
"epoch": 0.0949448293559148,
"grad_norm": 1.339002521581536,
"learning_rate": 9.78682641401389e-07,
"loss": 1.6622,
"step": 2590
},
{
"epoch": 0.09531141170864034,
"grad_norm": 1.1212646774920143,
"learning_rate": 9.785156726401019e-07,
"loss": 1.687,
"step": 2600
},
{
"epoch": 0.09567799406136589,
"grad_norm": 1.2061879994547406,
"learning_rate": 9.78348066889012e-07,
"loss": 1.6652,
"step": 2610
},
{
"epoch": 0.09604457641409142,
"grad_norm": 1.225185884537581,
"learning_rate": 9.781798243712326e-07,
"loss": 1.6948,
"step": 2620
},
{
"epoch": 0.09641115876681697,
"grad_norm": 1.0146497215382635,
"learning_rate": 9.780109453107245e-07,
"loss": 1.7009,
"step": 2630
},
{
"epoch": 0.0967777411195425,
"grad_norm": 1.2171300466801498,
"learning_rate": 9.77841429932296e-07,
"loss": 1.7087,
"step": 2640
},
{
"epoch": 0.09714432347226805,
"grad_norm": 1.0629828650910798,
"learning_rate": 9.77671278461602e-07,
"loss": 1.7316,
"step": 2650
},
{
"epoch": 0.09751090582499358,
"grad_norm": 1.1754432625786018,
"learning_rate": 9.775004911251448e-07,
"loss": 1.6953,
"step": 2660
},
{
"epoch": 0.09787748817771913,
"grad_norm": 1.3069724342535498,
"learning_rate": 9.773290681502727e-07,
"loss": 1.7057,
"step": 2670
},
{
"epoch": 0.09824407053044466,
"grad_norm": 1.3314679455466842,
"learning_rate": 9.7715700976518e-07,
"loss": 1.6842,
"step": 2680
},
{
"epoch": 0.09861065288317021,
"grad_norm": 1.3928937247531508,
"learning_rate": 9.769843161989079e-07,
"loss": 1.7052,
"step": 2690
},
{
"epoch": 0.09897723523589574,
"grad_norm": 1.3389115391442472,
"learning_rate": 9.768109876813417e-07,
"loss": 1.6905,
"step": 2700
},
{
"epoch": 0.09934381758862128,
"grad_norm": 1.2854315608533564,
"learning_rate": 9.76637024443213e-07,
"loss": 1.6806,
"step": 2710
},
{
"epoch": 0.09971039994134683,
"grad_norm": 1.24293956575573,
"learning_rate": 9.764624267160975e-07,
"loss": 1.6922,
"step": 2720
},
{
"epoch": 0.10007698229407236,
"grad_norm": 1.2809307536658918,
"learning_rate": 9.762871947324165e-07,
"loss": 1.7001,
"step": 2730
},
{
"epoch": 0.1004435646467979,
"grad_norm": 1.1615070632030087,
"learning_rate": 9.761113287254345e-07,
"loss": 1.6747,
"step": 2740
},
{
"epoch": 0.10081014699952344,
"grad_norm": 1.245140216818738,
"learning_rate": 9.75934828929261e-07,
"loss": 1.6469,
"step": 2750
},
{
"epoch": 0.10117672935224899,
"grad_norm": 1.152316966014997,
"learning_rate": 9.757576955788486e-07,
"loss": 1.6773,
"step": 2760
},
{
"epoch": 0.10154331170497452,
"grad_norm": 1.1064605629765938,
"learning_rate": 9.755799289099932e-07,
"loss": 1.6447,
"step": 2770
},
{
"epoch": 0.10190989405770007,
"grad_norm": 1.1150499110452152,
"learning_rate": 9.754015291593343e-07,
"loss": 1.7168,
"step": 2780
},
{
"epoch": 0.1022764764104256,
"grad_norm": 1.3016769905995789,
"learning_rate": 9.752224965643536e-07,
"loss": 1.7209,
"step": 2790
},
{
"epoch": 0.10264305876315115,
"grad_norm": 1.332321427009131,
"learning_rate": 9.750428313633757e-07,
"loss": 1.6247,
"step": 2800
},
{
"epoch": 0.10300964111587668,
"grad_norm": 1.311092146207188,
"learning_rate": 9.748625337955667e-07,
"loss": 1.6366,
"step": 2810
},
{
"epoch": 0.10337622346860222,
"grad_norm": 1.1634742047900515,
"learning_rate": 9.746816041009351e-07,
"loss": 1.7143,
"step": 2820
},
{
"epoch": 0.10374280582132776,
"grad_norm": 1.1916284602740692,
"learning_rate": 9.745000425203307e-07,
"loss": 1.6568,
"step": 2830
},
{
"epoch": 0.1041093881740533,
"grad_norm": 1.280571751055567,
"learning_rate": 9.743178492954442e-07,
"loss": 1.6303,
"step": 2840
},
{
"epoch": 0.10447597052677884,
"grad_norm": 1.3621017517970784,
"learning_rate": 9.741350246688076e-07,
"loss": 1.7569,
"step": 2850
},
{
"epoch": 0.10484255287950438,
"grad_norm": 1.1019913075705825,
"learning_rate": 9.739515688837927e-07,
"loss": 1.6934,
"step": 2860
},
{
"epoch": 0.10520913523222993,
"grad_norm": 1.3868159647800968,
"learning_rate": 9.73767482184612e-07,
"loss": 1.6267,
"step": 2870
},
{
"epoch": 0.10557571758495546,
"grad_norm": 1.4881189853618986,
"learning_rate": 9.73582764816318e-07,
"loss": 1.7354,
"step": 2880
},
{
"epoch": 0.105942299937681,
"grad_norm": 1.5118948532986631,
"learning_rate": 9.733974170248025e-07,
"loss": 1.6856,
"step": 2890
},
{
"epoch": 0.10630888229040654,
"grad_norm": 1.4796154680218983,
"learning_rate": 9.732114390567963e-07,
"loss": 1.7045,
"step": 2900
},
{
"epoch": 0.10667546464313207,
"grad_norm": 1.2560441338500297,
"learning_rate": 9.730248311598694e-07,
"loss": 1.6466,
"step": 2910
},
{
"epoch": 0.10704204699585762,
"grad_norm": 1.1595828068992133,
"learning_rate": 9.728375935824301e-07,
"loss": 1.6822,
"step": 2920
},
{
"epoch": 0.10740862934858315,
"grad_norm": 1.3126146065763922,
"learning_rate": 9.726497265737252e-07,
"loss": 1.6723,
"step": 2930
},
{
"epoch": 0.1077752117013087,
"grad_norm": 1.2296488317137073,
"learning_rate": 9.724612303838393e-07,
"loss": 1.6647,
"step": 2940
},
{
"epoch": 0.10814179405403423,
"grad_norm": 1.170972623285309,
"learning_rate": 9.722721052636944e-07,
"loss": 1.6955,
"step": 2950
},
{
"epoch": 0.10850837640675978,
"grad_norm": 1.2633141406462256,
"learning_rate": 9.720823514650495e-07,
"loss": 1.6332,
"step": 2960
},
{
"epoch": 0.10887495875948532,
"grad_norm": 1.2911934178837097,
"learning_rate": 9.718919692405014e-07,
"loss": 1.7218,
"step": 2970
},
{
"epoch": 0.10924154111221086,
"grad_norm": 1.1657180939495957,
"learning_rate": 9.717009588434822e-07,
"loss": 1.6067,
"step": 2980
},
{
"epoch": 0.1096081234649364,
"grad_norm": 1.239214562886889,
"learning_rate": 9.715093205282615e-07,
"loss": 1.7067,
"step": 2990
},
{
"epoch": 0.10997470581766194,
"grad_norm": 1.3619661984646028,
"learning_rate": 9.713170545499435e-07,
"loss": 1.6978,
"step": 3000
},
{
"epoch": 0.10997470581766194,
"eval_accuracy": 0.6262376782115725,
"eval_loss": 1.6762739419937134,
"eval_runtime": 309.1255,
"eval_samples_per_second": 10.698,
"eval_steps_per_second": 0.893,
"step": 3000
},
{
"epoch": 0.11034128817038748,
"grad_norm": 1.2670499181513593,
"learning_rate": 9.711241611644688e-07,
"loss": 1.677,
"step": 3010
},
{
"epoch": 0.11070787052311301,
"grad_norm": 1.2403940254412753,
"learning_rate": 9.709306406286129e-07,
"loss": 1.6604,
"step": 3020
},
{
"epoch": 0.11107445287583856,
"grad_norm": 1.3312898520587448,
"learning_rate": 9.707364931999864e-07,
"loss": 1.6867,
"step": 3030
},
{
"epoch": 0.11144103522856409,
"grad_norm": 1.3495930407749666,
"learning_rate": 9.70541719137034e-07,
"loss": 1.6617,
"step": 3040
},
{
"epoch": 0.11180761758128964,
"grad_norm": 1.1396532709110236,
"learning_rate": 9.703463186990346e-07,
"loss": 1.7035,
"step": 3050
},
{
"epoch": 0.11217419993401517,
"grad_norm": 1.2231802562577823,
"learning_rate": 9.701502921461013e-07,
"loss": 1.6723,
"step": 3060
},
{
"epoch": 0.11254078228674072,
"grad_norm": 1.3403523967021675,
"learning_rate": 9.699536397391806e-07,
"loss": 1.6698,
"step": 3070
},
{
"epoch": 0.11290736463946625,
"grad_norm": 1.3447918453958256,
"learning_rate": 9.697563617400516e-07,
"loss": 1.6716,
"step": 3080
},
{
"epoch": 0.1132739469921918,
"grad_norm": 1.2969348535087712,
"learning_rate": 9.695584584113267e-07,
"loss": 1.6949,
"step": 3090
},
{
"epoch": 0.11364052934491733,
"grad_norm": 1.1643584556065927,
"learning_rate": 9.693599300164508e-07,
"loss": 1.6713,
"step": 3100
},
{
"epoch": 0.11400711169764288,
"grad_norm": 1.2242377804664155,
"learning_rate": 9.691607768197002e-07,
"loss": 1.6386,
"step": 3110
},
{
"epoch": 0.11437369405036842,
"grad_norm": 1.319822492671326,
"learning_rate": 9.689609990861837e-07,
"loss": 1.6816,
"step": 3120
},
{
"epoch": 0.11474027640309395,
"grad_norm": 1.3781452196212938,
"learning_rate": 9.687605970818408e-07,
"loss": 1.6784,
"step": 3130
},
{
"epoch": 0.1151068587558195,
"grad_norm": 1.2168088100404522,
"learning_rate": 9.68559571073443e-07,
"loss": 1.6982,
"step": 3140
},
{
"epoch": 0.11547344110854503,
"grad_norm": 1.4540401524570652,
"learning_rate": 9.68357921328591e-07,
"loss": 1.6718,
"step": 3150
},
{
"epoch": 0.11584002346127058,
"grad_norm": 1.3143498063269197,
"learning_rate": 9.681556481157171e-07,
"loss": 1.6709,
"step": 3160
},
{
"epoch": 0.11620660581399611,
"grad_norm": 1.1946622719420839,
"learning_rate": 9.679527517040831e-07,
"loss": 1.6747,
"step": 3170
},
{
"epoch": 0.11657318816672166,
"grad_norm": 1.286257203814063,
"learning_rate": 9.6774923236378e-07,
"loss": 1.699,
"step": 3180
},
{
"epoch": 0.11693977051944719,
"grad_norm": 1.3969179686751765,
"learning_rate": 9.675450903657286e-07,
"loss": 1.6228,
"step": 3190
},
{
"epoch": 0.11730635287217274,
"grad_norm": 1.1607892230894732,
"learning_rate": 9.673403259816787e-07,
"loss": 1.6538,
"step": 3200
},
{
"epoch": 0.11767293522489827,
"grad_norm": 1.4009629932701972,
"learning_rate": 9.671349394842075e-07,
"loss": 1.6401,
"step": 3210
},
{
"epoch": 0.1180395175776238,
"grad_norm": 1.5024706182569632,
"learning_rate": 9.669289311467216e-07,
"loss": 1.6508,
"step": 3220
},
{
"epoch": 0.11840609993034935,
"grad_norm": 1.9466998313668968,
"learning_rate": 9.66722301243455e-07,
"loss": 1.6662,
"step": 3230
},
{
"epoch": 0.11877268228307489,
"grad_norm": 1.6928758946763174,
"learning_rate": 9.665150500494686e-07,
"loss": 1.681,
"step": 3240
},
{
"epoch": 0.11913926463580043,
"grad_norm": 1.5050927792757436,
"learning_rate": 9.66307177840651e-07,
"loss": 1.6669,
"step": 3250
},
{
"epoch": 0.11950584698852597,
"grad_norm": 1.179067981511082,
"learning_rate": 9.66098684893717e-07,
"loss": 1.6503,
"step": 3260
},
{
"epoch": 0.11987242934125152,
"grad_norm": 1.7279906281142485,
"learning_rate": 9.658895714862082e-07,
"loss": 1.6331,
"step": 3270
},
{
"epoch": 0.12023901169397705,
"grad_norm": 1.1891919657193728,
"learning_rate": 9.656798378964918e-07,
"loss": 1.6111,
"step": 3280
},
{
"epoch": 0.1206055940467026,
"grad_norm": 1.7749941957068498,
"learning_rate": 9.654694844037607e-07,
"loss": 1.666,
"step": 3290
},
{
"epoch": 0.12097217639942813,
"grad_norm": 1.5093366351881725,
"learning_rate": 9.65258511288033e-07,
"loss": 1.6569,
"step": 3300
},
{
"epoch": 0.12133875875215368,
"grad_norm": 1.2872309950824516,
"learning_rate": 9.650469188301512e-07,
"loss": 1.6697,
"step": 3310
},
{
"epoch": 0.12170534110487921,
"grad_norm": 1.2299002535631731,
"learning_rate": 9.648347073117832e-07,
"loss": 1.6413,
"step": 3320
},
{
"epoch": 0.12207192345760474,
"grad_norm": 1.407253463937065,
"learning_rate": 9.6462187701542e-07,
"loss": 1.6757,
"step": 3330
},
{
"epoch": 0.12243850581033029,
"grad_norm": 1.166071729039829,
"learning_rate": 9.644084282243768e-07,
"loss": 1.6654,
"step": 3340
},
{
"epoch": 0.12280508816305583,
"grad_norm": 1.558952263125209,
"learning_rate": 9.641943612227921e-07,
"loss": 1.6807,
"step": 3350
},
{
"epoch": 0.12317167051578137,
"grad_norm": 1.3374281457093373,
"learning_rate": 9.639796762956276e-07,
"loss": 1.6664,
"step": 3360
},
{
"epoch": 0.1235382528685069,
"grad_norm": 1.1902844247942133,
"learning_rate": 9.637643737286667e-07,
"loss": 1.6914,
"step": 3370
},
{
"epoch": 0.12390483522123245,
"grad_norm": 1.2998133772041194,
"learning_rate": 9.63548453808516e-07,
"loss": 1.7112,
"step": 3380
},
{
"epoch": 0.12427141757395799,
"grad_norm": 1.3162405748836254,
"learning_rate": 9.633319168226036e-07,
"loss": 1.6936,
"step": 3390
},
{
"epoch": 0.12463799992668353,
"grad_norm": 1.3677758198871173,
"learning_rate": 9.631147630591782e-07,
"loss": 1.6883,
"step": 3400
},
{
"epoch": 0.12500458227940908,
"grad_norm": 1.2054292111865461,
"learning_rate": 9.62896992807311e-07,
"loss": 1.6576,
"step": 3410
},
{
"epoch": 0.1253711646321346,
"grad_norm": 1.156101638091166,
"learning_rate": 9.626786063568925e-07,
"loss": 1.6667,
"step": 3420
},
{
"epoch": 0.12573774698486015,
"grad_norm": 1.3745543808654352,
"learning_rate": 9.624596039986343e-07,
"loss": 1.6712,
"step": 3430
},
{
"epoch": 0.1261043293375857,
"grad_norm": 1.178401890967186,
"learning_rate": 9.622399860240679e-07,
"loss": 1.6474,
"step": 3440
},
{
"epoch": 0.12647091169031122,
"grad_norm": 1.4332376083467566,
"learning_rate": 9.620197527255436e-07,
"loss": 1.6655,
"step": 3450
},
{
"epoch": 0.12683749404303676,
"grad_norm": 1.2402171846377348,
"learning_rate": 9.617989043962315e-07,
"loss": 1.6349,
"step": 3460
},
{
"epoch": 0.1272040763957623,
"grad_norm": 1.1586534075249035,
"learning_rate": 9.615774413301201e-07,
"loss": 1.6514,
"step": 3470
},
{
"epoch": 0.12757065874848786,
"grad_norm": 1.3594354851138566,
"learning_rate": 9.613553638220162e-07,
"loss": 1.6516,
"step": 3480
},
{
"epoch": 0.12793724110121338,
"grad_norm": 1.6613648157437189,
"learning_rate": 9.611326721675447e-07,
"loss": 1.6111,
"step": 3490
},
{
"epoch": 0.12830382345393893,
"grad_norm": 1.1659314128590663,
"learning_rate": 9.60909366663148e-07,
"loss": 1.6144,
"step": 3500
},
{
"epoch": 0.12867040580666447,
"grad_norm": 1.3825427999836462,
"learning_rate": 9.606854476060858e-07,
"loss": 1.6355,
"step": 3510
},
{
"epoch": 0.12903698815939002,
"grad_norm": 1.3221664320987678,
"learning_rate": 9.604609152944339e-07,
"loss": 1.6582,
"step": 3520
},
{
"epoch": 0.12940357051211554,
"grad_norm": 1.223865417664176,
"learning_rate": 9.602357700270848e-07,
"loss": 1.6629,
"step": 3530
},
{
"epoch": 0.1297701528648411,
"grad_norm": 1.2654800350319806,
"learning_rate": 9.600100121037478e-07,
"loss": 1.6746,
"step": 3540
},
{
"epoch": 0.13013673521756663,
"grad_norm": 1.5629673478694224,
"learning_rate": 9.597836418249463e-07,
"loss": 1.598,
"step": 3550
},
{
"epoch": 0.13050331757029215,
"grad_norm": 1.434783120339992,
"learning_rate": 9.5955665949202e-07,
"loss": 1.6667,
"step": 3560
},
{
"epoch": 0.1308698999230177,
"grad_norm": 1.391092196783546,
"learning_rate": 9.593290654071227e-07,
"loss": 1.6533,
"step": 3570
},
{
"epoch": 0.13123648227574325,
"grad_norm": 1.4923072292703214,
"learning_rate": 9.591008598732227e-07,
"loss": 1.6742,
"step": 3580
},
{
"epoch": 0.1316030646284688,
"grad_norm": 1.313620532521857,
"learning_rate": 9.588720431941024e-07,
"loss": 1.643,
"step": 3590
},
{
"epoch": 0.13196964698119432,
"grad_norm": 1.527900388849829,
"learning_rate": 9.586426156743576e-07,
"loss": 1.6466,
"step": 3600
},
{
"epoch": 0.13233622933391986,
"grad_norm": 1.3345529937125478,
"learning_rate": 9.584125776193977e-07,
"loss": 1.6242,
"step": 3610
},
{
"epoch": 0.1327028116866454,
"grad_norm": 1.1722053149478573,
"learning_rate": 9.581819293354437e-07,
"loss": 1.6361,
"step": 3620
},
{
"epoch": 0.13306939403937096,
"grad_norm": 1.448965551365503,
"learning_rate": 9.579506711295303e-07,
"loss": 1.6766,
"step": 3630
},
{
"epoch": 0.13343597639209648,
"grad_norm": 1.435539195626326,
"learning_rate": 9.57718803309503e-07,
"loss": 1.6639,
"step": 3640
},
{
"epoch": 0.13380255874482203,
"grad_norm": 1.5710598550118229,
"learning_rate": 9.574863261840195e-07,
"loss": 1.6821,
"step": 3650
},
{
"epoch": 0.13416914109754757,
"grad_norm": 1.3432388820323078,
"learning_rate": 9.572532400625486e-07,
"loss": 1.6578,
"step": 3660
},
{
"epoch": 0.1345357234502731,
"grad_norm": 1.4304292951831412,
"learning_rate": 9.570195452553692e-07,
"loss": 1.6683,
"step": 3670
},
{
"epoch": 0.13490230580299864,
"grad_norm": 1.293030659950829,
"learning_rate": 9.567852420735707e-07,
"loss": 1.6712,
"step": 3680
},
{
"epoch": 0.1352688881557242,
"grad_norm": 1.5727628914988818,
"learning_rate": 9.565503308290529e-07,
"loss": 1.6362,
"step": 3690
},
{
"epoch": 0.13563547050844973,
"grad_norm": 1.6929875598843593,
"learning_rate": 9.56314811834524e-07,
"loss": 1.6734,
"step": 3700
},
{
"epoch": 0.13600205286117525,
"grad_norm": 1.5989548687758315,
"learning_rate": 9.560786854035027e-07,
"loss": 1.6449,
"step": 3710
},
{
"epoch": 0.1363686352139008,
"grad_norm": 1.5032676879166582,
"learning_rate": 9.558419518503146e-07,
"loss": 1.6572,
"step": 3720
},
{
"epoch": 0.13673521756662635,
"grad_norm": 1.4171570128132858,
"learning_rate": 9.55604611490095e-07,
"loss": 1.6084,
"step": 3730
},
{
"epoch": 0.13710179991935187,
"grad_norm": 1.445587424899926,
"learning_rate": 9.553666646387859e-07,
"loss": 1.6226,
"step": 3740
},
{
"epoch": 0.13746838227207742,
"grad_norm": 1.3746442868420083,
"learning_rate": 9.55128111613137e-07,
"loss": 1.6244,
"step": 3750
},
{
"epoch": 0.13783496462480296,
"grad_norm": 1.379515983296158,
"learning_rate": 9.548889527307052e-07,
"loss": 1.6178,
"step": 3760
},
{
"epoch": 0.1382015469775285,
"grad_norm": 1.3571114141269711,
"learning_rate": 9.546491883098536e-07,
"loss": 1.6295,
"step": 3770
},
{
"epoch": 0.13856812933025403,
"grad_norm": 1.463273179907825,
"learning_rate": 9.544088186697514e-07,
"loss": 1.6252,
"step": 3780
},
{
"epoch": 0.13893471168297958,
"grad_norm": 1.409249057690562,
"learning_rate": 9.541678441303736e-07,
"loss": 1.6226,
"step": 3790
},
{
"epoch": 0.13930129403570513,
"grad_norm": 1.2549772425250405,
"learning_rate": 9.539262650125003e-07,
"loss": 1.6904,
"step": 3800
},
{
"epoch": 0.13966787638843067,
"grad_norm": 1.398529314496367,
"learning_rate": 9.536840816377163e-07,
"loss": 1.641,
"step": 3810
},
{
"epoch": 0.1400344587411562,
"grad_norm": 1.4089240361542354,
"learning_rate": 9.534412943284111e-07,
"loss": 1.6749,
"step": 3820
},
{
"epoch": 0.14040104109388174,
"grad_norm": 1.2690921990550241,
"learning_rate": 9.53197903407778e-07,
"loss": 1.6483,
"step": 3830
},
{
"epoch": 0.1407676234466073,
"grad_norm": 1.443019453596183,
"learning_rate": 9.529539091998138e-07,
"loss": 1.5942,
"step": 3840
},
{
"epoch": 0.1411342057993328,
"grad_norm": 1.3973353826502415,
"learning_rate": 9.527093120293179e-07,
"loss": 1.6637,
"step": 3850
},
{
"epoch": 0.14150078815205835,
"grad_norm": 1.612241752672322,
"learning_rate": 9.524641122218934e-07,
"loss": 1.6144,
"step": 3860
},
{
"epoch": 0.1418673705047839,
"grad_norm": 1.6392078912198202,
"learning_rate": 9.522183101039447e-07,
"loss": 1.599,
"step": 3870
},
{
"epoch": 0.14223395285750945,
"grad_norm": 1.3307238721886945,
"learning_rate": 9.519719060026784e-07,
"loss": 1.6692,
"step": 3880
},
{
"epoch": 0.14260053521023497,
"grad_norm": 1.3570795255125636,
"learning_rate": 9.517249002461023e-07,
"loss": 1.6871,
"step": 3890
},
{
"epoch": 0.14296711756296052,
"grad_norm": 1.4037736413570712,
"learning_rate": 9.514772931630253e-07,
"loss": 1.5922,
"step": 3900
},
{
"epoch": 0.14333369991568606,
"grad_norm": 1.6691508908927133,
"learning_rate": 9.512290850830564e-07,
"loss": 1.5939,
"step": 3910
},
{
"epoch": 0.1437002822684116,
"grad_norm": 1.2746936442730004,
"learning_rate": 9.509802763366052e-07,
"loss": 1.6376,
"step": 3920
},
{
"epoch": 0.14406686462113713,
"grad_norm": 1.7263750991736497,
"learning_rate": 9.507308672548803e-07,
"loss": 1.6251,
"step": 3930
},
{
"epoch": 0.14443344697386268,
"grad_norm": 1.6162337099963227,
"learning_rate": 9.504808581698898e-07,
"loss": 1.6855,
"step": 3940
},
{
"epoch": 0.14480002932658823,
"grad_norm": 1.4400774058967862,
"learning_rate": 9.502302494144405e-07,
"loss": 1.6688,
"step": 3950
},
{
"epoch": 0.14516661167931375,
"grad_norm": 1.4106971014212684,
"learning_rate": 9.499790413221372e-07,
"loss": 1.6212,
"step": 3960
},
{
"epoch": 0.1455331940320393,
"grad_norm": 1.549216443416639,
"learning_rate": 9.49727234227383e-07,
"loss": 1.6316,
"step": 3970
},
{
"epoch": 0.14589977638476484,
"grad_norm": 1.2499725096259189,
"learning_rate": 9.494748284653779e-07,
"loss": 1.6113,
"step": 3980
},
{
"epoch": 0.1462663587374904,
"grad_norm": 1.8429540203762498,
"learning_rate": 9.492218243721192e-07,
"loss": 1.6424,
"step": 3990
},
{
"epoch": 0.1466329410902159,
"grad_norm": 1.4097823826329705,
"learning_rate": 9.489682222844004e-07,
"loss": 1.5986,
"step": 4000
},
{
"epoch": 0.1466329410902159,
"eval_accuracy": 0.634133690356089,
"eval_loss": 1.6327084302902222,
"eval_runtime": 310.7367,
"eval_samples_per_second": 10.642,
"eval_steps_per_second": 0.888,
"step": 4000
},
{
"epoch": 0.14699952344294145,
"grad_norm": 1.4923503061339742,
"learning_rate": 9.487140225398112e-07,
"loss": 1.6354,
"step": 4010
},
{
"epoch": 0.147366105795667,
"grad_norm": 1.4794551483340477,
"learning_rate": 9.484592254767368e-07,
"loss": 1.6337,
"step": 4020
},
{
"epoch": 0.14773268814839255,
"grad_norm": 1.5712257291796352,
"learning_rate": 9.482038314343577e-07,
"loss": 1.6569,
"step": 4030
},
{
"epoch": 0.14809927050111807,
"grad_norm": 1.7977345143090582,
"learning_rate": 9.479478407526489e-07,
"loss": 1.6489,
"step": 4040
},
{
"epoch": 0.14846585285384362,
"grad_norm": 1.3741458319499518,
"learning_rate": 9.476912537723797e-07,
"loss": 1.6133,
"step": 4050
},
{
"epoch": 0.14883243520656916,
"grad_norm": 1.4690331639136838,
"learning_rate": 9.474340708351131e-07,
"loss": 1.6232,
"step": 4060
},
{
"epoch": 0.14919901755929468,
"grad_norm": 1.2959341038239927,
"learning_rate": 9.471762922832059e-07,
"loss": 1.6136,
"step": 4070
},
{
"epoch": 0.14956559991202023,
"grad_norm": 1.3662274482371721,
"learning_rate": 9.469179184598068e-07,
"loss": 1.6568,
"step": 4080
},
{
"epoch": 0.14993218226474578,
"grad_norm": 1.6303487241504246,
"learning_rate": 9.46658949708858e-07,
"loss": 1.5929,
"step": 4090
},
{
"epoch": 0.15029876461747133,
"grad_norm": 1.5690296034603222,
"learning_rate": 9.463993863750927e-07,
"loss": 1.6273,
"step": 4100
},
{
"epoch": 0.15066534697019685,
"grad_norm": 1.4565888691647535,
"learning_rate": 9.461392288040364e-07,
"loss": 1.6111,
"step": 4110
},
{
"epoch": 0.1510319293229224,
"grad_norm": 1.3399651168141258,
"learning_rate": 9.458784773420052e-07,
"loss": 1.6317,
"step": 4120
},
{
"epoch": 0.15139851167564794,
"grad_norm": 1.4314663401678571,
"learning_rate": 9.456171323361057e-07,
"loss": 1.6149,
"step": 4130
},
{
"epoch": 0.1517650940283735,
"grad_norm": 1.8610614612324794,
"learning_rate": 9.45355194134235e-07,
"loss": 1.6129,
"step": 4140
},
{
"epoch": 0.152131676381099,
"grad_norm": 1.4894532553388709,
"learning_rate": 9.450926630850795e-07,
"loss": 1.609,
"step": 4150
},
{
"epoch": 0.15249825873382455,
"grad_norm": 1.4046406522547454,
"learning_rate": 9.44829539538115e-07,
"loss": 1.5696,
"step": 4160
},
{
"epoch": 0.1528648410865501,
"grad_norm": 1.507747542986857,
"learning_rate": 9.445658238436056e-07,
"loss": 1.6105,
"step": 4170
},
{
"epoch": 0.15323142343927562,
"grad_norm": 1.5105255618831799,
"learning_rate": 9.443015163526043e-07,
"loss": 1.6656,
"step": 4180
},
{
"epoch": 0.15359800579200117,
"grad_norm": 1.409667843388443,
"learning_rate": 9.440366174169514e-07,
"loss": 1.6143,
"step": 4190
},
{
"epoch": 0.15396458814472672,
"grad_norm": 1.4899089219548238,
"learning_rate": 9.437711273892748e-07,
"loss": 1.6434,
"step": 4200
},
{
"epoch": 0.15433117049745226,
"grad_norm": 1.3835730704800184,
"learning_rate": 9.435050466229892e-07,
"loss": 1.5896,
"step": 4210
},
{
"epoch": 0.15469775285017778,
"grad_norm": 1.5192649294767298,
"learning_rate": 9.432383754722953e-07,
"loss": 1.5982,
"step": 4220
},
{
"epoch": 0.15506433520290333,
"grad_norm": 1.414847151501446,
"learning_rate": 9.429711142921804e-07,
"loss": 1.6195,
"step": 4230
},
{
"epoch": 0.15543091755562888,
"grad_norm": 1.6343731391974052,
"learning_rate": 9.427032634384166e-07,
"loss": 1.6571,
"step": 4240
},
{
"epoch": 0.15579749990835443,
"grad_norm": 1.3341873108704791,
"learning_rate": 9.424348232675612e-07,
"loss": 1.6592,
"step": 4250
},
{
"epoch": 0.15616408226107995,
"grad_norm": 1.6008064117545706,
"learning_rate": 9.421657941369561e-07,
"loss": 1.5976,
"step": 4260
},
{
"epoch": 0.1565306646138055,
"grad_norm": 1.5239464972441716,
"learning_rate": 9.418961764047271e-07,
"loss": 1.6696,
"step": 4270
},
{
"epoch": 0.15689724696653104,
"grad_norm": 1.4769248460119957,
"learning_rate": 9.416259704297836e-07,
"loss": 1.5887,
"step": 4280
},
{
"epoch": 0.15726382931925656,
"grad_norm": 1.5681596592695635,
"learning_rate": 9.413551765718178e-07,
"loss": 1.6013,
"step": 4290
},
{
"epoch": 0.1576304116719821,
"grad_norm": 1.631287334977878,
"learning_rate": 9.410837951913049e-07,
"loss": 1.5945,
"step": 4300
},
{
"epoch": 0.15799699402470765,
"grad_norm": 1.4050312863210865,
"learning_rate": 9.408118266495019e-07,
"loss": 1.6402,
"step": 4310
},
{
"epoch": 0.1583635763774332,
"grad_norm": 1.5578526902775003,
"learning_rate": 9.405392713084475e-07,
"loss": 1.5887,
"step": 4320
},
{
"epoch": 0.15873015873015872,
"grad_norm": 1.838536265304532,
"learning_rate": 9.402661295309613e-07,
"loss": 1.6579,
"step": 4330
},
{
"epoch": 0.15909674108288427,
"grad_norm": 1.399860997384879,
"learning_rate": 9.399924016806442e-07,
"loss": 1.6393,
"step": 4340
},
{
"epoch": 0.15946332343560982,
"grad_norm": 1.5068872354692342,
"learning_rate": 9.397180881218764e-07,
"loss": 1.615,
"step": 4350
},
{
"epoch": 0.15982990578833534,
"grad_norm": 1.3780932641355175,
"learning_rate": 9.394431892198187e-07,
"loss": 1.5897,
"step": 4360
},
{
"epoch": 0.16019648814106088,
"grad_norm": 1.3266983904985465,
"learning_rate": 9.391677053404102e-07,
"loss": 1.622,
"step": 4370
},
{
"epoch": 0.16056307049378643,
"grad_norm": 1.620877234564149,
"learning_rate": 9.388916368503695e-07,
"loss": 1.5967,
"step": 4380
},
{
"epoch": 0.16092965284651198,
"grad_norm": 1.4779982203811086,
"learning_rate": 9.386149841171927e-07,
"loss": 1.6698,
"step": 4390
},
{
"epoch": 0.1612962351992375,
"grad_norm": 1.8674907963100393,
"learning_rate": 9.38337747509154e-07,
"loss": 1.587,
"step": 4400
},
{
"epoch": 0.16166281755196305,
"grad_norm": 1.253158061665667,
"learning_rate": 9.380599273953052e-07,
"loss": 1.5428,
"step": 4410
},
{
"epoch": 0.1620293999046886,
"grad_norm": 1.3525050799204679,
"learning_rate": 9.37781524145474e-07,
"loss": 1.6247,
"step": 4420
},
{
"epoch": 0.16239598225741414,
"grad_norm": 1.4613300416955568,
"learning_rate": 9.375025381302654e-07,
"loss": 1.6224,
"step": 4430
},
{
"epoch": 0.16276256461013966,
"grad_norm": 1.2944336505844816,
"learning_rate": 9.372229697210592e-07,
"loss": 1.6073,
"step": 4440
},
{
"epoch": 0.1631291469628652,
"grad_norm": 1.5174622698952627,
"learning_rate": 9.369428192900108e-07,
"loss": 1.6071,
"step": 4450
},
{
"epoch": 0.16349572931559075,
"grad_norm": 1.338534858401422,
"learning_rate": 9.366620872100508e-07,
"loss": 1.6601,
"step": 4460
},
{
"epoch": 0.16386231166831627,
"grad_norm": 1.6728271928417346,
"learning_rate": 9.363807738548834e-07,
"loss": 1.551,
"step": 4470
},
{
"epoch": 0.16422889402104182,
"grad_norm": 1.302057455107361,
"learning_rate": 9.360988795989873e-07,
"loss": 1.6131,
"step": 4480
},
{
"epoch": 0.16459547637376737,
"grad_norm": 1.3688499844245678,
"learning_rate": 9.358164048176136e-07,
"loss": 1.6117,
"step": 4490
},
{
"epoch": 0.16496205872649292,
"grad_norm": 1.8246828901080199,
"learning_rate": 9.355333498867869e-07,
"loss": 1.5894,
"step": 4500
},
{
"epoch": 0.16532864107921844,
"grad_norm": 1.6028775096282735,
"learning_rate": 9.352497151833038e-07,
"loss": 1.614,
"step": 4510
},
{
"epoch": 0.16569522343194398,
"grad_norm": 1.4820831927771527,
"learning_rate": 9.349655010847329e-07,
"loss": 1.6046,
"step": 4520
},
{
"epoch": 0.16606180578466953,
"grad_norm": 1.7672157547664196,
"learning_rate": 9.346807079694139e-07,
"loss": 1.5998,
"step": 4530
},
{
"epoch": 0.16642838813739508,
"grad_norm": 1.399533793932768,
"learning_rate": 9.34395336216457e-07,
"loss": 1.6209,
"step": 4540
},
{
"epoch": 0.1667949704901206,
"grad_norm": 1.3639375879771105,
"learning_rate": 9.341093862057432e-07,
"loss": 1.6321,
"step": 4550
},
{
"epoch": 0.16716155284284615,
"grad_norm": 1.5049904120253712,
"learning_rate": 9.338228583179231e-07,
"loss": 1.5531,
"step": 4560
},
{
"epoch": 0.1675281351955717,
"grad_norm": 1.2985124195396522,
"learning_rate": 9.335357529344162e-07,
"loss": 1.5925,
"step": 4570
},
{
"epoch": 0.1678947175482972,
"grad_norm": 1.6446327484619145,
"learning_rate": 9.332480704374113e-07,
"loss": 1.5926,
"step": 4580
},
{
"epoch": 0.16826129990102276,
"grad_norm": 1.6322229820052805,
"learning_rate": 9.329598112098649e-07,
"loss": 1.6415,
"step": 4590
},
{
"epoch": 0.1686278822537483,
"grad_norm": 1.4469690988313273,
"learning_rate": 9.326709756355018e-07,
"loss": 1.5885,
"step": 4600
},
{
"epoch": 0.16899446460647385,
"grad_norm": 2.0102392352379415,
"learning_rate": 9.323815640988135e-07,
"loss": 1.559,
"step": 4610
},
{
"epoch": 0.16936104695919937,
"grad_norm": 2.121900247865438,
"learning_rate": 9.320915769850585e-07,
"loss": 1.628,
"step": 4620
},
{
"epoch": 0.16972762931192492,
"grad_norm": 1.6562713457587275,
"learning_rate": 9.318010146802615e-07,
"loss": 1.6442,
"step": 4630
},
{
"epoch": 0.17009421166465047,
"grad_norm": 1.825933954099794,
"learning_rate": 9.315098775712127e-07,
"loss": 1.5848,
"step": 4640
},
{
"epoch": 0.17046079401737602,
"grad_norm": 2.2902161148174445,
"learning_rate": 9.312181660454677e-07,
"loss": 1.5825,
"step": 4650
},
{
"epoch": 0.17082737637010154,
"grad_norm": 1.392734199429953,
"learning_rate": 9.309258804913465e-07,
"loss": 1.6126,
"step": 4660
},
{
"epoch": 0.17119395872282708,
"grad_norm": 1.565256666892175,
"learning_rate": 9.306330212979334e-07,
"loss": 1.6022,
"step": 4670
},
{
"epoch": 0.17156054107555263,
"grad_norm": 1.7600380550932417,
"learning_rate": 9.303395888550763e-07,
"loss": 1.5663,
"step": 4680
},
{
"epoch": 0.17192712342827815,
"grad_norm": 1.5247880984614344,
"learning_rate": 9.300455835533863e-07,
"loss": 1.6012,
"step": 4690
},
{
"epoch": 0.1722937057810037,
"grad_norm": 1.7352070019598504,
"learning_rate": 9.297510057842367e-07,
"loss": 1.5681,
"step": 4700
},
{
"epoch": 0.17266028813372924,
"grad_norm": 1.6435683033446582,
"learning_rate": 9.294558559397633e-07,
"loss": 1.6687,
"step": 4710
},
{
"epoch": 0.1730268704864548,
"grad_norm": 1.3964234370853204,
"learning_rate": 9.291601344128631e-07,
"loss": 1.5829,
"step": 4720
},
{
"epoch": 0.1733934528391803,
"grad_norm": 1.76715189072495,
"learning_rate": 9.288638415971944e-07,
"loss": 1.5724,
"step": 4730
},
{
"epoch": 0.17376003519190586,
"grad_norm": 1.3087839062281306,
"learning_rate": 9.285669778871758e-07,
"loss": 1.6033,
"step": 4740
},
{
"epoch": 0.1741266175446314,
"grad_norm": 1.7592015890177557,
"learning_rate": 9.282695436779857e-07,
"loss": 1.5787,
"step": 4750
},
{
"epoch": 0.17449319989735695,
"grad_norm": 1.5281595493710598,
"learning_rate": 9.279715393655625e-07,
"loss": 1.5593,
"step": 4760
},
{
"epoch": 0.17485978225008247,
"grad_norm": 1.738599325299021,
"learning_rate": 9.276729653466029e-07,
"loss": 1.5669,
"step": 4770
},
{
"epoch": 0.17522636460280802,
"grad_norm": 1.594132633669574,
"learning_rate": 9.273738220185624e-07,
"loss": 1.623,
"step": 4780
},
{
"epoch": 0.17559294695553357,
"grad_norm": 2.226861365359913,
"learning_rate": 9.27074109779654e-07,
"loss": 1.6368,
"step": 4790
},
{
"epoch": 0.1759595293082591,
"grad_norm": 1.7870988536401553,
"learning_rate": 9.267738290288484e-07,
"loss": 1.5905,
"step": 4800
},
{
"epoch": 0.17632611166098464,
"grad_norm": 1.6753244560734581,
"learning_rate": 9.264729801658726e-07,
"loss": 1.588,
"step": 4810
},
{
"epoch": 0.17669269401371018,
"grad_norm": 1.5163383708898754,
"learning_rate": 9.261715635912105e-07,
"loss": 1.6068,
"step": 4820
},
{
"epoch": 0.17705927636643573,
"grad_norm": 1.6054513357762625,
"learning_rate": 9.258695797061011e-07,
"loss": 1.5623,
"step": 4830
},
{
"epoch": 0.17742585871916125,
"grad_norm": 1.7549519455125482,
"learning_rate": 9.255670289125392e-07,
"loss": 1.6342,
"step": 4840
},
{
"epoch": 0.1777924410718868,
"grad_norm": 1.5524081159338652,
"learning_rate": 9.252639116132737e-07,
"loss": 1.5866,
"step": 4850
},
{
"epoch": 0.17815902342461234,
"grad_norm": 1.5466546969225983,
"learning_rate": 9.249602282118078e-07,
"loss": 1.6022,
"step": 4860
},
{
"epoch": 0.1785256057773379,
"grad_norm": 1.4959615382996556,
"learning_rate": 9.246559791123984e-07,
"loss": 1.6196,
"step": 4870
},
{
"epoch": 0.1788921881300634,
"grad_norm": 1.4914720900146645,
"learning_rate": 9.243511647200554e-07,
"loss": 1.5919,
"step": 4880
},
{
"epoch": 0.17925877048278896,
"grad_norm": 1.5337435868741187,
"learning_rate": 9.240457854405411e-07,
"loss": 1.6044,
"step": 4890
},
{
"epoch": 0.1796253528355145,
"grad_norm": 1.6816858785763387,
"learning_rate": 9.237398416803702e-07,
"loss": 1.5634,
"step": 4900
},
{
"epoch": 0.17999193518824003,
"grad_norm": 1.8428666379108207,
"learning_rate": 9.234333338468079e-07,
"loss": 1.5595,
"step": 4910
},
{
"epoch": 0.18035851754096557,
"grad_norm": 1.4112423758680814,
"learning_rate": 9.231262623478712e-07,
"loss": 1.5958,
"step": 4920
},
{
"epoch": 0.18072509989369112,
"grad_norm": 1.9379415330464052,
"learning_rate": 9.228186275923271e-07,
"loss": 1.6132,
"step": 4930
},
{
"epoch": 0.18109168224641667,
"grad_norm": 1.6478659028610085,
"learning_rate": 9.225104299896923e-07,
"loss": 1.5253,
"step": 4940
},
{
"epoch": 0.1814582645991422,
"grad_norm": 1.4723128432871142,
"learning_rate": 9.222016699502329e-07,
"loss": 1.6025,
"step": 4950
},
{
"epoch": 0.18182484695186774,
"grad_norm": 1.7186069161894069,
"learning_rate": 9.218923478849636e-07,
"loss": 1.5888,
"step": 4960
},
{
"epoch": 0.18219142930459328,
"grad_norm": 2.0518524516759706,
"learning_rate": 9.215824642056473e-07,
"loss": 1.6131,
"step": 4970
},
{
"epoch": 0.18255801165731883,
"grad_norm": 1.7336503978028492,
"learning_rate": 9.212720193247946e-07,
"loss": 1.5725,
"step": 4980
},
{
"epoch": 0.18292459401004435,
"grad_norm": 1.4722133429873332,
"learning_rate": 9.209610136556629e-07,
"loss": 1.5547,
"step": 4990
},
{
"epoch": 0.1832911763627699,
"grad_norm": 1.6753596780660358,
"learning_rate": 9.206494476122565e-07,
"loss": 1.5997,
"step": 5000
},
{
"epoch": 0.1832911763627699,
"eval_accuracy": 0.642745649510724,
"eval_loss": 1.587723731994629,
"eval_runtime": 309.6063,
"eval_samples_per_second": 10.681,
"eval_steps_per_second": 0.891,
"step": 5000
},
{
"epoch": 0.18365775871549544,
"grad_norm": 1.5685677710443469,
"learning_rate": 9.203373216093253e-07,
"loss": 1.5679,
"step": 5010
},
{
"epoch": 0.18402434106822096,
"grad_norm": 1.8335955057050302,
"learning_rate": 9.200246360623647e-07,
"loss": 1.5621,
"step": 5020
},
{
"epoch": 0.1843909234209465,
"grad_norm": 1.522191845438261,
"learning_rate": 9.19711391387615e-07,
"loss": 1.5729,
"step": 5030
},
{
"epoch": 0.18475750577367206,
"grad_norm": 1.6776006382527855,
"learning_rate": 9.193975880020609e-07,
"loss": 1.59,
"step": 5040
},
{
"epoch": 0.1851240881263976,
"grad_norm": 1.626198881855077,
"learning_rate": 9.190832263234307e-07,
"loss": 1.5274,
"step": 5050
},
{
"epoch": 0.18549067047912313,
"grad_norm": 1.7849118070867178,
"learning_rate": 9.18768306770196e-07,
"loss": 1.5976,
"step": 5060
},
{
"epoch": 0.18585725283184867,
"grad_norm": 1.6492509263028217,
"learning_rate": 9.184528297615706e-07,
"loss": 1.574,
"step": 5070
},
{
"epoch": 0.18622383518457422,
"grad_norm": 1.6650634512326183,
"learning_rate": 9.181367957175111e-07,
"loss": 1.6145,
"step": 5080
},
{
"epoch": 0.18659041753729974,
"grad_norm": 1.728522905813247,
"learning_rate": 9.178202050587152e-07,
"loss": 1.623,
"step": 5090
},
{
"epoch": 0.1869569998900253,
"grad_norm": 1.5996442049523565,
"learning_rate": 9.175030582066215e-07,
"loss": 1.5807,
"step": 5100
},
{
"epoch": 0.18732358224275084,
"grad_norm": 2.127736796999369,
"learning_rate": 9.17185355583409e-07,
"loss": 1.6288,
"step": 5110
},
{
"epoch": 0.18769016459547638,
"grad_norm": 1.7060344023543381,
"learning_rate": 9.16867097611997e-07,
"loss": 1.5706,
"step": 5120
},
{
"epoch": 0.1880567469482019,
"grad_norm": 1.6633154215840553,
"learning_rate": 9.165482847160433e-07,
"loss": 1.6202,
"step": 5130
},
{
"epoch": 0.18842332930092745,
"grad_norm": 2.008854546754292,
"learning_rate": 9.162289173199449e-07,
"loss": 1.5684,
"step": 5140
},
{
"epoch": 0.188789911653653,
"grad_norm": 1.8267125273776432,
"learning_rate": 9.159089958488368e-07,
"loss": 1.5463,
"step": 5150
},
{
"epoch": 0.18915649400637854,
"grad_norm": 1.5564239251002085,
"learning_rate": 9.155885207285917e-07,
"loss": 1.5432,
"step": 5160
},
{
"epoch": 0.18952307635910406,
"grad_norm": 1.6146271060205803,
"learning_rate": 9.152674923858192e-07,
"loss": 1.5524,
"step": 5170
},
{
"epoch": 0.1898896587118296,
"grad_norm": 1.5552810397285535,
"learning_rate": 9.149459112478653e-07,
"loss": 1.5704,
"step": 5180
},
{
"epoch": 0.19025624106455516,
"grad_norm": 1.5384519496242604,
"learning_rate": 9.146237777428119e-07,
"loss": 1.5832,
"step": 5190
},
{
"epoch": 0.19062282341728068,
"grad_norm": 2.017102331377888,
"learning_rate": 9.143010922994761e-07,
"loss": 1.5652,
"step": 5200
},
{
"epoch": 0.19098940577000623,
"grad_norm": 1.8257390842642465,
"learning_rate": 9.139778553474102e-07,
"loss": 1.6286,
"step": 5210
},
{
"epoch": 0.19135598812273177,
"grad_norm": 1.8375892545538077,
"learning_rate": 9.136540673169e-07,
"loss": 1.5999,
"step": 5220
},
{
"epoch": 0.19172257047545732,
"grad_norm": 2.0587302949543327,
"learning_rate": 9.133297286389652e-07,
"loss": 1.5976,
"step": 5230
},
{
"epoch": 0.19208915282818284,
"grad_norm": 2.011881523827466,
"learning_rate": 9.130048397453586e-07,
"loss": 1.5948,
"step": 5240
},
{
"epoch": 0.1924557351809084,
"grad_norm": 1.8390608792602066,
"learning_rate": 9.126794010685652e-07,
"loss": 1.6149,
"step": 5250
},
{
"epoch": 0.19282231753363394,
"grad_norm": 1.9246481251033047,
"learning_rate": 9.123534130418022e-07,
"loss": 1.5918,
"step": 5260
},
{
"epoch": 0.19318889988635948,
"grad_norm": 1.716961973736044,
"learning_rate": 9.120268760990177e-07,
"loss": 1.5423,
"step": 5270
},
{
"epoch": 0.193555482239085,
"grad_norm": 2.0653331266058053,
"learning_rate": 9.116997906748906e-07,
"loss": 1.5646,
"step": 5280
},
{
"epoch": 0.19392206459181055,
"grad_norm": 1.518359023904073,
"learning_rate": 9.113721572048303e-07,
"loss": 1.5893,
"step": 5290
},
{
"epoch": 0.1942886469445361,
"grad_norm": 1.5221964255305394,
"learning_rate": 9.110439761249752e-07,
"loss": 1.5944,
"step": 5300
},
{
"epoch": 0.19465522929726162,
"grad_norm": 1.591016019300809,
"learning_rate": 9.107152478721929e-07,
"loss": 1.5957,
"step": 5310
},
{
"epoch": 0.19502181164998716,
"grad_norm": 1.6048630337553804,
"learning_rate": 9.103859728840797e-07,
"loss": 1.5373,
"step": 5320
},
{
"epoch": 0.1953883940027127,
"grad_norm": 1.8089344462427293,
"learning_rate": 9.10056151598959e-07,
"loss": 1.5484,
"step": 5330
},
{
"epoch": 0.19575497635543826,
"grad_norm": 1.7077347921127968,
"learning_rate": 9.097257844558821e-07,
"loss": 1.5688,
"step": 5340
},
{
"epoch": 0.19612155870816378,
"grad_norm": 2.0584080275062706,
"learning_rate": 9.093948718946265e-07,
"loss": 1.5202,
"step": 5350
},
{
"epoch": 0.19648814106088933,
"grad_norm": 1.6275162784009292,
"learning_rate": 9.090634143556961e-07,
"loss": 1.5851,
"step": 5360
},
{
"epoch": 0.19685472341361487,
"grad_norm": 1.7941515009032263,
"learning_rate": 9.087314122803198e-07,
"loss": 1.5794,
"step": 5370
},
{
"epoch": 0.19722130576634042,
"grad_norm": 1.72604148825101,
"learning_rate": 9.083988661104519e-07,
"loss": 1.5966,
"step": 5380
},
{
"epoch": 0.19758788811906594,
"grad_norm": 1.7824620622659664,
"learning_rate": 9.080657762887706e-07,
"loss": 1.5893,
"step": 5390
},
{
"epoch": 0.1979544704717915,
"grad_norm": 1.710078177829696,
"learning_rate": 9.077321432586779e-07,
"loss": 1.5668,
"step": 5400
},
{
"epoch": 0.19832105282451704,
"grad_norm": 1.8516264946489545,
"learning_rate": 9.073979674642991e-07,
"loss": 1.6049,
"step": 5410
},
{
"epoch": 0.19868763517724256,
"grad_norm": 2.1561627747886583,
"learning_rate": 9.070632493504815e-07,
"loss": 1.585,
"step": 5420
},
{
"epoch": 0.1990542175299681,
"grad_norm": 1.912041110250784,
"learning_rate": 9.06727989362795e-07,
"loss": 1.5196,
"step": 5430
},
{
"epoch": 0.19942079988269365,
"grad_norm": 1.8404077118276456,
"learning_rate": 9.063921879475306e-07,
"loss": 1.611,
"step": 5440
},
{
"epoch": 0.1997873822354192,
"grad_norm": 1.5865821224681815,
"learning_rate": 9.060558455516996e-07,
"loss": 1.5739,
"step": 5450
},
{
"epoch": 0.20015396458814472,
"grad_norm": 1.9756512969668862,
"learning_rate": 9.057189626230341e-07,
"loss": 1.5002,
"step": 5460
},
{
"epoch": 0.20052054694087026,
"grad_norm": 1.5812577707350812,
"learning_rate": 9.053815396099851e-07,
"loss": 1.5869,
"step": 5470
},
{
"epoch": 0.2008871292935958,
"grad_norm": 2.0162867580185555,
"learning_rate": 9.050435769617231e-07,
"loss": 1.5559,
"step": 5480
},
{
"epoch": 0.20125371164632136,
"grad_norm": 1.899649598636165,
"learning_rate": 9.047050751281368e-07,
"loss": 1.5407,
"step": 5490
},
{
"epoch": 0.20162029399904688,
"grad_norm": 1.9101266806326496,
"learning_rate": 9.043660345598322e-07,
"loss": 1.5576,
"step": 5500
},
{
"epoch": 0.20198687635177243,
"grad_norm": 2.0420669589479403,
"learning_rate": 9.040264557081334e-07,
"loss": 1.557,
"step": 5510
},
{
"epoch": 0.20235345870449797,
"grad_norm": 1.9260883055795428,
"learning_rate": 9.036863390250801e-07,
"loss": 1.5521,
"step": 5520
},
{
"epoch": 0.2027200410572235,
"grad_norm": 1.6555197284342995,
"learning_rate": 9.033456849634284e-07,
"loss": 1.5717,
"step": 5530
},
{
"epoch": 0.20308662340994904,
"grad_norm": 2.153362825776131,
"learning_rate": 9.030044939766497e-07,
"loss": 1.5713,
"step": 5540
},
{
"epoch": 0.2034532057626746,
"grad_norm": 1.910089724316295,
"learning_rate": 9.026627665189303e-07,
"loss": 1.5697,
"step": 5550
},
{
"epoch": 0.20381978811540014,
"grad_norm": 1.7762617538543,
"learning_rate": 9.0232050304517e-07,
"loss": 1.5239,
"step": 5560
},
{
"epoch": 0.20418637046812566,
"grad_norm": 1.7174298843577596,
"learning_rate": 9.019777040109831e-07,
"loss": 1.5276,
"step": 5570
},
{
"epoch": 0.2045529528208512,
"grad_norm": 1.6862369469038345,
"learning_rate": 9.016343698726961e-07,
"loss": 1.5541,
"step": 5580
},
{
"epoch": 0.20491953517357675,
"grad_norm": 1.875834526669963,
"learning_rate": 9.01290501087348e-07,
"loss": 1.555,
"step": 5590
},
{
"epoch": 0.2052861175263023,
"grad_norm": 1.7840227955187389,
"learning_rate": 9.009460981126898e-07,
"loss": 1.5872,
"step": 5600
},
{
"epoch": 0.20565269987902782,
"grad_norm": 1.668168953110993,
"learning_rate": 9.006011614071829e-07,
"loss": 1.599,
"step": 5610
},
{
"epoch": 0.20601928223175336,
"grad_norm": 1.6951419814826267,
"learning_rate": 9.002556914300001e-07,
"loss": 1.5599,
"step": 5620
},
{
"epoch": 0.2063858645844789,
"grad_norm": 2.031183645077938,
"learning_rate": 8.999096886410234e-07,
"loss": 1.5697,
"step": 5630
},
{
"epoch": 0.20675244693720443,
"grad_norm": 2.2433698552413595,
"learning_rate": 8.995631535008442e-07,
"loss": 1.5751,
"step": 5640
},
{
"epoch": 0.20711902928992998,
"grad_norm": 1.96339871171306,
"learning_rate": 8.992160864707629e-07,
"loss": 1.5922,
"step": 5650
},
{
"epoch": 0.20748561164265553,
"grad_norm": 1.7341008984989021,
"learning_rate": 8.988684880127877e-07,
"loss": 1.5476,
"step": 5660
},
{
"epoch": 0.20785219399538107,
"grad_norm": 1.6011033018349554,
"learning_rate": 8.985203585896339e-07,
"loss": 1.5337,
"step": 5670
},
{
"epoch": 0.2082187763481066,
"grad_norm": 1.804008259917083,
"learning_rate": 8.981716986647241e-07,
"loss": 1.548,
"step": 5680
},
{
"epoch": 0.20858535870083214,
"grad_norm": 1.7644993504571036,
"learning_rate": 8.978225087021872e-07,
"loss": 1.5566,
"step": 5690
},
{
"epoch": 0.2089519410535577,
"grad_norm": 2.1995890332913812,
"learning_rate": 8.974727891668568e-07,
"loss": 1.509,
"step": 5700
},
{
"epoch": 0.2093185234062832,
"grad_norm": 1.7307439040874695,
"learning_rate": 8.971225405242724e-07,
"loss": 1.5792,
"step": 5710
},
{
"epoch": 0.20968510575900876,
"grad_norm": 1.8843347719325225,
"learning_rate": 8.967717632406775e-07,
"loss": 1.5745,
"step": 5720
},
{
"epoch": 0.2100516881117343,
"grad_norm": 1.8994279922279045,
"learning_rate": 8.964204577830193e-07,
"loss": 1.5346,
"step": 5730
},
{
"epoch": 0.21041827046445985,
"grad_norm": 2.0146207080838305,
"learning_rate": 8.960686246189479e-07,
"loss": 1.5724,
"step": 5740
},
{
"epoch": 0.21078485281718537,
"grad_norm": 1.9175010632666802,
"learning_rate": 8.957162642168164e-07,
"loss": 1.482,
"step": 5750
},
{
"epoch": 0.21115143516991092,
"grad_norm": 1.6492564643172203,
"learning_rate": 8.953633770456791e-07,
"loss": 1.5635,
"step": 5760
},
{
"epoch": 0.21151801752263646,
"grad_norm": 1.8913486368556613,
"learning_rate": 8.950099635752919e-07,
"loss": 1.5634,
"step": 5770
},
{
"epoch": 0.211884599875362,
"grad_norm": 1.7405053491856226,
"learning_rate": 8.946560242761114e-07,
"loss": 1.5475,
"step": 5780
},
{
"epoch": 0.21225118222808753,
"grad_norm": 1.7166883252641594,
"learning_rate": 8.943015596192938e-07,
"loss": 1.516,
"step": 5790
},
{
"epoch": 0.21261776458081308,
"grad_norm": 1.935712334758643,
"learning_rate": 8.93946570076695e-07,
"loss": 1.5575,
"step": 5800
},
{
"epoch": 0.21298434693353863,
"grad_norm": 1.9385604701128256,
"learning_rate": 8.935910561208693e-07,
"loss": 1.5634,
"step": 5810
},
{
"epoch": 0.21335092928626415,
"grad_norm": 2.557688500744313,
"learning_rate": 8.932350182250694e-07,
"loss": 1.5103,
"step": 5820
},
{
"epoch": 0.2137175116389897,
"grad_norm": 1.7120107495237882,
"learning_rate": 8.928784568632454e-07,
"loss": 1.5332,
"step": 5830
},
{
"epoch": 0.21408409399171524,
"grad_norm": 1.9120958570178155,
"learning_rate": 8.925213725100439e-07,
"loss": 1.5902,
"step": 5840
},
{
"epoch": 0.2144506763444408,
"grad_norm": 2.0551912368717984,
"learning_rate": 8.921637656408081e-07,
"loss": 1.5784,
"step": 5850
},
{
"epoch": 0.2148172586971663,
"grad_norm": 1.9480411905431083,
"learning_rate": 8.918056367315765e-07,
"loss": 1.5551,
"step": 5860
},
{
"epoch": 0.21518384104989186,
"grad_norm": 2.072902657734444,
"learning_rate": 8.914469862590825e-07,
"loss": 1.5555,
"step": 5870
},
{
"epoch": 0.2155504234026174,
"grad_norm": 1.9451661388320578,
"learning_rate": 8.910878147007544e-07,
"loss": 1.5513,
"step": 5880
},
{
"epoch": 0.21591700575534295,
"grad_norm": 2.0629785589418104,
"learning_rate": 8.907281225347132e-07,
"loss": 1.5553,
"step": 5890
},
{
"epoch": 0.21628358810806847,
"grad_norm": 1.863954721076218,
"learning_rate": 8.903679102397735e-07,
"loss": 1.5691,
"step": 5900
},
{
"epoch": 0.21665017046079402,
"grad_norm": 1.8545804685124208,
"learning_rate": 8.900071782954424e-07,
"loss": 1.5331,
"step": 5910
},
{
"epoch": 0.21701675281351956,
"grad_norm": 1.8522158136831326,
"learning_rate": 8.896459271819181e-07,
"loss": 1.5481,
"step": 5920
},
{
"epoch": 0.21738333516624508,
"grad_norm": 2.114169763199409,
"learning_rate": 8.892841573800909e-07,
"loss": 1.5574,
"step": 5930
},
{
"epoch": 0.21774991751897063,
"grad_norm": 2.2195708048317897,
"learning_rate": 8.889218693715405e-07,
"loss": 1.5632,
"step": 5940
},
{
"epoch": 0.21811649987169618,
"grad_norm": 1.9709151192601133,
"learning_rate": 8.885590636385373e-07,
"loss": 1.5861,
"step": 5950
},
{
"epoch": 0.21848308222442173,
"grad_norm": 1.9808333239294875,
"learning_rate": 8.881957406640402e-07,
"loss": 1.5065,
"step": 5960
},
{
"epoch": 0.21884966457714725,
"grad_norm": 2.442742784557856,
"learning_rate": 8.878319009316973e-07,
"loss": 1.5445,
"step": 5970
},
{
"epoch": 0.2192162469298728,
"grad_norm": 2.311119780435353,
"learning_rate": 8.874675449258439e-07,
"loss": 1.5483,
"step": 5980
},
{
"epoch": 0.21958282928259834,
"grad_norm": 2.0035864035930655,
"learning_rate": 8.871026731315031e-07,
"loss": 1.5516,
"step": 5990
},
{
"epoch": 0.2199494116353239,
"grad_norm": 1.9235134048584597,
"learning_rate": 8.867372860343843e-07,
"loss": 1.5841,
"step": 6000
},
{
"epoch": 0.2199494116353239,
"eval_accuracy": 0.6509060196907062,
"eval_loss": 1.540500521659851,
"eval_runtime": 311.0144,
"eval_samples_per_second": 10.633,
"eval_steps_per_second": 0.887,
"step": 6000
},
{
"epoch": 0.2203159939880494,
"grad_norm": 1.7524109005789064,
"learning_rate": 8.863713841208831e-07,
"loss": 1.5597,
"step": 6010
},
{
"epoch": 0.22068257634077496,
"grad_norm": 1.6692328056749952,
"learning_rate": 8.860049678780803e-07,
"loss": 1.4923,
"step": 6020
},
{
"epoch": 0.2210491586935005,
"grad_norm": 1.9399213197528828,
"learning_rate": 8.856380377937411e-07,
"loss": 1.552,
"step": 6030
},
{
"epoch": 0.22141574104622602,
"grad_norm": 2.2904467183798753,
"learning_rate": 8.852705943563153e-07,
"loss": 1.5254,
"step": 6040
},
{
"epoch": 0.22178232339895157,
"grad_norm": 1.8153750134894717,
"learning_rate": 8.849026380549354e-07,
"loss": 1.5141,
"step": 6050
},
{
"epoch": 0.22214890575167712,
"grad_norm": 2.618147882062693,
"learning_rate": 8.84534169379417e-07,
"loss": 1.5427,
"step": 6060
},
{
"epoch": 0.22251548810440266,
"grad_norm": 1.7910988941866253,
"learning_rate": 8.84165188820258e-07,
"loss": 1.5024,
"step": 6070
},
{
"epoch": 0.22288207045712818,
"grad_norm": 2.1174011777995565,
"learning_rate": 8.837956968686371e-07,
"loss": 1.5354,
"step": 6080
},
{
"epoch": 0.22324865280985373,
"grad_norm": 1.9009206870385398,
"learning_rate": 8.834256940164142e-07,
"loss": 1.5147,
"step": 6090
},
{
"epoch": 0.22361523516257928,
"grad_norm": 1.8496325535415874,
"learning_rate": 8.830551807561291e-07,
"loss": 1.5179,
"step": 6100
},
{
"epoch": 0.22398181751530483,
"grad_norm": 1.662570964745413,
"learning_rate": 8.826841575810011e-07,
"loss": 1.5187,
"step": 6110
},
{
"epoch": 0.22434839986803035,
"grad_norm": 1.8932960142147148,
"learning_rate": 8.823126249849283e-07,
"loss": 1.511,
"step": 6120
},
{
"epoch": 0.2247149822207559,
"grad_norm": 2.055911875635135,
"learning_rate": 8.819405834624869e-07,
"loss": 1.5155,
"step": 6130
},
{
"epoch": 0.22508156457348144,
"grad_norm": 2.0651755539958603,
"learning_rate": 8.815680335089308e-07,
"loss": 1.4753,
"step": 6140
},
{
"epoch": 0.22544814692620696,
"grad_norm": 2.0717254734315405,
"learning_rate": 8.811949756201902e-07,
"loss": 1.5565,
"step": 6150
},
{
"epoch": 0.2258147292789325,
"grad_norm": 1.9847422671401158,
"learning_rate": 8.808214102928721e-07,
"loss": 1.5438,
"step": 6160
},
{
"epoch": 0.22618131163165806,
"grad_norm": 2.4190623603018806,
"learning_rate": 8.804473380242583e-07,
"loss": 1.5399,
"step": 6170
},
{
"epoch": 0.2265478939843836,
"grad_norm": 2.20009570928599,
"learning_rate": 8.80072759312306e-07,
"loss": 1.5398,
"step": 6180
},
{
"epoch": 0.22691447633710912,
"grad_norm": 1.9921790637181438,
"learning_rate": 8.796976746556462e-07,
"loss": 1.4771,
"step": 6190
},
{
"epoch": 0.22728105868983467,
"grad_norm": 2.0203680363068344,
"learning_rate": 8.793220845535838e-07,
"loss": 1.5176,
"step": 6200
},
{
"epoch": 0.22764764104256022,
"grad_norm": 2.7532988176359754,
"learning_rate": 8.789459895060962e-07,
"loss": 1.5371,
"step": 6210
},
{
"epoch": 0.22801422339528576,
"grad_norm": 1.937352911027064,
"learning_rate": 8.785693900138329e-07,
"loss": 1.5356,
"step": 6220
},
{
"epoch": 0.22838080574801128,
"grad_norm": 1.9964616803134492,
"learning_rate": 8.781922865781151e-07,
"loss": 1.56,
"step": 6230
},
{
"epoch": 0.22874738810073683,
"grad_norm": 2.106377863408321,
"learning_rate": 8.778146797009349e-07,
"loss": 1.559,
"step": 6240
},
{
"epoch": 0.22911397045346238,
"grad_norm": 1.6409859726466804,
"learning_rate": 8.774365698849547e-07,
"loss": 1.5116,
"step": 6250
},
{
"epoch": 0.2294805528061879,
"grad_norm": 2.305691070208384,
"learning_rate": 8.770579576335058e-07,
"loss": 1.5683,
"step": 6260
},
{
"epoch": 0.22984713515891345,
"grad_norm": 1.7207294769909895,
"learning_rate": 8.766788434505887e-07,
"loss": 1.4618,
"step": 6270
},
{
"epoch": 0.230213717511639,
"grad_norm": 1.9323445658200624,
"learning_rate": 8.762992278408723e-07,
"loss": 1.5618,
"step": 6280
},
{
"epoch": 0.23058029986436454,
"grad_norm": 1.999152732092489,
"learning_rate": 8.759191113096927e-07,
"loss": 1.5569,
"step": 6290
},
{
"epoch": 0.23094688221709006,
"grad_norm": 1.8502749258838977,
"learning_rate": 8.755384943630529e-07,
"loss": 1.5114,
"step": 6300
},
{
"epoch": 0.2313134645698156,
"grad_norm": 2.0061014414371003,
"learning_rate": 8.751573775076219e-07,
"loss": 1.5011,
"step": 6310
},
{
"epoch": 0.23168004692254116,
"grad_norm": 2.064565021271191,
"learning_rate": 8.747757612507345e-07,
"loss": 1.5588,
"step": 6320
},
{
"epoch": 0.23204662927526667,
"grad_norm": 1.878533236916369,
"learning_rate": 8.743936461003898e-07,
"loss": 1.5179,
"step": 6330
},
{
"epoch": 0.23241321162799222,
"grad_norm": 2.080116702687917,
"learning_rate": 8.740110325652515e-07,
"loss": 1.5211,
"step": 6340
},
{
"epoch": 0.23277979398071777,
"grad_norm": 2.2534624739469433,
"learning_rate": 8.736279211546465e-07,
"loss": 1.5077,
"step": 6350
},
{
"epoch": 0.23314637633344332,
"grad_norm": 2.1778452457873527,
"learning_rate": 8.732443123785644e-07,
"loss": 1.5385,
"step": 6360
},
{
"epoch": 0.23351295868616884,
"grad_norm": 2.0802562378092317,
"learning_rate": 8.72860206747657e-07,
"loss": 1.5053,
"step": 6370
},
{
"epoch": 0.23387954103889438,
"grad_norm": 2.197133342414823,
"learning_rate": 8.724756047732376e-07,
"loss": 1.5223,
"step": 6380
},
{
"epoch": 0.23424612339161993,
"grad_norm": 2.3786394596220437,
"learning_rate": 8.720905069672799e-07,
"loss": 1.5124,
"step": 6390
},
{
"epoch": 0.23461270574434548,
"grad_norm": 1.8455501641424978,
"learning_rate": 8.717049138424182e-07,
"loss": 1.525,
"step": 6400
},
{
"epoch": 0.234979288097071,
"grad_norm": 2.0418699202678727,
"learning_rate": 8.713188259119452e-07,
"loss": 1.5082,
"step": 6410
},
{
"epoch": 0.23534587044979655,
"grad_norm": 1.8308136052916946,
"learning_rate": 8.709322436898135e-07,
"loss": 1.4779,
"step": 6420
},
{
"epoch": 0.2357124528025221,
"grad_norm": 2.155105815758525,
"learning_rate": 8.705451676906328e-07,
"loss": 1.5101,
"step": 6430
},
{
"epoch": 0.2360790351552476,
"grad_norm": 1.9647757860923412,
"learning_rate": 8.701575984296702e-07,
"loss": 1.5105,
"step": 6440
},
{
"epoch": 0.23644561750797316,
"grad_norm": 2.051510082680593,
"learning_rate": 8.6976953642285e-07,
"loss": 1.503,
"step": 6450
},
{
"epoch": 0.2368121998606987,
"grad_norm": 2.1386714707947534,
"learning_rate": 8.693809821867517e-07,
"loss": 1.5282,
"step": 6460
},
{
"epoch": 0.23717878221342426,
"grad_norm": 2.1401411616284167,
"learning_rate": 8.689919362386104e-07,
"loss": 1.4949,
"step": 6470
},
{
"epoch": 0.23754536456614977,
"grad_norm": 1.956666297999974,
"learning_rate": 8.686023990963157e-07,
"loss": 1.4993,
"step": 6480
},
{
"epoch": 0.23791194691887532,
"grad_norm": 2.0257118859168672,
"learning_rate": 8.682123712784112e-07,
"loss": 1.5186,
"step": 6490
},
{
"epoch": 0.23827852927160087,
"grad_norm": 1.895169068962553,
"learning_rate": 8.678218533040937e-07,
"loss": 1.526,
"step": 6500
},
{
"epoch": 0.23864511162432642,
"grad_norm": 6.529056788123207,
"learning_rate": 8.67430845693212e-07,
"loss": 1.4975,
"step": 6510
},
{
"epoch": 0.23901169397705194,
"grad_norm": 2.078820041783562,
"learning_rate": 8.670393489662673e-07,
"loss": 1.5147,
"step": 6520
},
{
"epoch": 0.23937827632977748,
"grad_norm": 2.313941233193865,
"learning_rate": 8.666473636444116e-07,
"loss": 1.5103,
"step": 6530
},
{
"epoch": 0.23974485868250303,
"grad_norm": 2.204068052979437,
"learning_rate": 8.662548902494473e-07,
"loss": 1.5197,
"step": 6540
},
{
"epoch": 0.24011144103522855,
"grad_norm": 2.6677538134182033,
"learning_rate": 8.658619293038265e-07,
"loss": 1.4539,
"step": 6550
},
{
"epoch": 0.2404780233879541,
"grad_norm": 2.1826711924398876,
"learning_rate": 8.654684813306508e-07,
"loss": 1.4569,
"step": 6560
},
{
"epoch": 0.24084460574067965,
"grad_norm": 2.4513733249404037,
"learning_rate": 8.650745468536691e-07,
"loss": 1.472,
"step": 6570
},
{
"epoch": 0.2412111880934052,
"grad_norm": 1.9341316559705668,
"learning_rate": 8.64680126397279e-07,
"loss": 1.5128,
"step": 6580
},
{
"epoch": 0.2415777704461307,
"grad_norm": 2.2183441842361753,
"learning_rate": 8.642852204865243e-07,
"loss": 1.5409,
"step": 6590
},
{
"epoch": 0.24194435279885626,
"grad_norm": 2.270638521627112,
"learning_rate": 8.638898296470953e-07,
"loss": 1.4992,
"step": 6600
},
{
"epoch": 0.2423109351515818,
"grad_norm": 2.6732843475957146,
"learning_rate": 8.634939544053279e-07,
"loss": 1.5335,
"step": 6610
},
{
"epoch": 0.24267751750430736,
"grad_norm": 1.9291920434342291,
"learning_rate": 8.630975952882026e-07,
"loss": 1.4627,
"step": 6620
},
{
"epoch": 0.24304409985703287,
"grad_norm": 2.05169281240212,
"learning_rate": 8.627007528233445e-07,
"loss": 1.5257,
"step": 6630
},
{
"epoch": 0.24341068220975842,
"grad_norm": 2.42497111676382,
"learning_rate": 8.623034275390214e-07,
"loss": 1.5445,
"step": 6640
},
{
"epoch": 0.24377726456248397,
"grad_norm": 2.1919485638499903,
"learning_rate": 8.619056199641444e-07,
"loss": 1.5115,
"step": 6650
},
{
"epoch": 0.2441438469152095,
"grad_norm": 2.3664261903908343,
"learning_rate": 8.615073306282663e-07,
"loss": 1.4846,
"step": 6660
},
{
"epoch": 0.24451042926793504,
"grad_norm": 2.7278440906317387,
"learning_rate": 8.611085600615812e-07,
"loss": 1.5419,
"step": 6670
},
{
"epoch": 0.24487701162066058,
"grad_norm": 2.326361941668607,
"learning_rate": 8.607093087949244e-07,
"loss": 1.5447,
"step": 6680
},
{
"epoch": 0.24524359397338613,
"grad_norm": 2.101465809666948,
"learning_rate": 8.603095773597702e-07,
"loss": 1.5147,
"step": 6690
},
{
"epoch": 0.24561017632611165,
"grad_norm": 2.121131443755951,
"learning_rate": 8.599093662882326e-07,
"loss": 1.5046,
"step": 6700
},
{
"epoch": 0.2459767586788372,
"grad_norm": 2.004374535392673,
"learning_rate": 8.595086761130641e-07,
"loss": 1.5104,
"step": 6710
},
{
"epoch": 0.24634334103156275,
"grad_norm": 2.330571487353144,
"learning_rate": 8.591075073676548e-07,
"loss": 1.489,
"step": 6720
},
{
"epoch": 0.2467099233842883,
"grad_norm": 1.954097712061658,
"learning_rate": 8.587058605860319e-07,
"loss": 1.4628,
"step": 6730
},
{
"epoch": 0.2470765057370138,
"grad_norm": 2.287871494329092,
"learning_rate": 8.583037363028591e-07,
"loss": 1.4966,
"step": 6740
},
{
"epoch": 0.24744308808973936,
"grad_norm": 2.2507921472351837,
"learning_rate": 8.579011350534355e-07,
"loss": 1.5148,
"step": 6750
},
{
"epoch": 0.2478096704424649,
"grad_norm": 2.2811051866364034,
"learning_rate": 8.574980573736951e-07,
"loss": 1.5123,
"step": 6760
},
{
"epoch": 0.24817625279519043,
"grad_norm": 2.0762345472822106,
"learning_rate": 8.570945038002066e-07,
"loss": 1.5538,
"step": 6770
},
{
"epoch": 0.24854283514791597,
"grad_norm": 2.0481616873032618,
"learning_rate": 8.566904748701718e-07,
"loss": 1.5162,
"step": 6780
},
{
"epoch": 0.24890941750064152,
"grad_norm": 1.977911548805274,
"learning_rate": 8.562859711214252e-07,
"loss": 1.4945,
"step": 6790
},
{
"epoch": 0.24927599985336707,
"grad_norm": 2.166946374211255,
"learning_rate": 8.558809930924336e-07,
"loss": 1.5143,
"step": 6800
},
{
"epoch": 0.2496425822060926,
"grad_norm": 2.265635068798512,
"learning_rate": 8.554755413222952e-07,
"loss": 1.5079,
"step": 6810
},
{
"epoch": 0.25000916455881816,
"grad_norm": 2.376856602321205,
"learning_rate": 8.550696163507384e-07,
"loss": 1.5187,
"step": 6820
},
{
"epoch": 0.2503757469115437,
"grad_norm": 2.329411952961872,
"learning_rate": 8.54663218718122e-07,
"loss": 1.4985,
"step": 6830
},
{
"epoch": 0.2507423292642692,
"grad_norm": 2.127867609490789,
"learning_rate": 8.542563489654337e-07,
"loss": 1.5249,
"step": 6840
},
{
"epoch": 0.2511089116169948,
"grad_norm": 2.3846188422530545,
"learning_rate": 8.5384900763429e-07,
"loss": 1.5157,
"step": 6850
},
{
"epoch": 0.2514754939697203,
"grad_norm": 1.9837481727043949,
"learning_rate": 8.534411952669348e-07,
"loss": 1.5185,
"step": 6860
},
{
"epoch": 0.2518420763224458,
"grad_norm": 2.0300743472877776,
"learning_rate": 8.530329124062392e-07,
"loss": 1.4726,
"step": 6870
},
{
"epoch": 0.2522086586751714,
"grad_norm": 3.41153757527899,
"learning_rate": 8.526241595957007e-07,
"loss": 1.482,
"step": 6880
},
{
"epoch": 0.2525752410278969,
"grad_norm": 2.7170854102243043,
"learning_rate": 8.52214937379442e-07,
"loss": 1.4518,
"step": 6890
},
{
"epoch": 0.25294182338062243,
"grad_norm": 2.5040883653748294,
"learning_rate": 8.518052463022112e-07,
"loss": 1.4506,
"step": 6900
},
{
"epoch": 0.253308405733348,
"grad_norm": 2.1362380301717807,
"learning_rate": 8.513950869093802e-07,
"loss": 1.4975,
"step": 6910
},
{
"epoch": 0.2536749880860735,
"grad_norm": 56.61497948468882,
"learning_rate": 8.509844597469442e-07,
"loss": 1.5211,
"step": 6920
},
{
"epoch": 0.2540415704387991,
"grad_norm": 2.161248343347086,
"learning_rate": 8.505733653615217e-07,
"loss": 1.5123,
"step": 6930
},
{
"epoch": 0.2544081527915246,
"grad_norm": 2.197831076147601,
"learning_rate": 8.501618043003522e-07,
"loss": 1.4735,
"step": 6940
},
{
"epoch": 0.25477473514425014,
"grad_norm": 2.730731478650521,
"learning_rate": 8.497497771112975e-07,
"loss": 1.5154,
"step": 6950
},
{
"epoch": 0.2551413174969757,
"grad_norm": 2.625261843658038,
"learning_rate": 8.49337284342839e-07,
"loss": 1.4642,
"step": 6960
},
{
"epoch": 0.25550789984970124,
"grad_norm": 3.6302229703502302,
"learning_rate": 8.489243265440785e-07,
"loss": 1.4339,
"step": 6970
},
{
"epoch": 0.25587448220242676,
"grad_norm": 2.2912655831406408,
"learning_rate": 8.485109042647361e-07,
"loss": 1.5021,
"step": 6980
},
{
"epoch": 0.25624106455515233,
"grad_norm": 8.005970124630041,
"learning_rate": 8.48097018055151e-07,
"loss": 1.4777,
"step": 6990
},
{
"epoch": 0.25660764690787785,
"grad_norm": 2.2515437376163097,
"learning_rate": 8.476826684662797e-07,
"loss": 1.5096,
"step": 7000
},
{
"epoch": 0.25660764690787785,
"eval_accuracy": 0.6611285662580546,
"eval_loss": 1.4870213270187378,
"eval_runtime": 310.8369,
"eval_samples_per_second": 10.639,
"eval_steps_per_second": 0.888,
"step": 7000
},
{
"epoch": 0.25697422926060337,
"grad_norm": 2.531506922529387,
"learning_rate": 8.472678560496955e-07,
"loss": 1.4718,
"step": 7010
},
{
"epoch": 0.25734081161332895,
"grad_norm": 2.6738422568666778,
"learning_rate": 8.468525813575875e-07,
"loss": 1.4849,
"step": 7020
},
{
"epoch": 0.25770739396605447,
"grad_norm": 2.3045631257315256,
"learning_rate": 8.464368449427608e-07,
"loss": 1.3982,
"step": 7030
},
{
"epoch": 0.25807397631878004,
"grad_norm": 2.3127941331475586,
"learning_rate": 8.460206473586347e-07,
"loss": 1.4584,
"step": 7040
},
{
"epoch": 0.25844055867150556,
"grad_norm": 2.624025522294039,
"learning_rate": 8.456039891592424e-07,
"loss": 1.5064,
"step": 7050
},
{
"epoch": 0.2588071410242311,
"grad_norm": 2.4392755048359906,
"learning_rate": 8.451868708992305e-07,
"loss": 1.4744,
"step": 7060
},
{
"epoch": 0.25917372337695666,
"grad_norm": 2.244873049339989,
"learning_rate": 8.447692931338577e-07,
"loss": 1.4866,
"step": 7070
},
{
"epoch": 0.2595403057296822,
"grad_norm": 2.7693601328533846,
"learning_rate": 8.443512564189947e-07,
"loss": 1.4264,
"step": 7080
},
{
"epoch": 0.2599068880824077,
"grad_norm": 2.18123288795935,
"learning_rate": 8.439327613111231e-07,
"loss": 1.4487,
"step": 7090
},
{
"epoch": 0.26027347043513327,
"grad_norm": 2.770780437192883,
"learning_rate": 8.435138083673343e-07,
"loss": 1.5298,
"step": 7100
},
{
"epoch": 0.2606400527878588,
"grad_norm": 2.2581904540642737,
"learning_rate": 8.430943981453298e-07,
"loss": 1.4801,
"step": 7110
},
{
"epoch": 0.2610066351405843,
"grad_norm": 2.3222299759291674,
"learning_rate": 8.426745312034192e-07,
"loss": 1.4896,
"step": 7120
},
{
"epoch": 0.2613732174933099,
"grad_norm": 2.0280868196158908,
"learning_rate": 8.422542081005209e-07,
"loss": 1.4466,
"step": 7130
},
{
"epoch": 0.2617397998460354,
"grad_norm": 2.224282133830904,
"learning_rate": 8.418334293961593e-07,
"loss": 1.5286,
"step": 7140
},
{
"epoch": 0.262106382198761,
"grad_norm": 2.223919368251033,
"learning_rate": 8.414121956504665e-07,
"loss": 1.5043,
"step": 7150
},
{
"epoch": 0.2624729645514865,
"grad_norm": 2.505467964910925,
"learning_rate": 8.409905074241796e-07,
"loss": 1.4781,
"step": 7160
},
{
"epoch": 0.262839546904212,
"grad_norm": 2.0986445187287077,
"learning_rate": 8.405683652786411e-07,
"loss": 1.4804,
"step": 7170
},
{
"epoch": 0.2632061292569376,
"grad_norm": 2.490412539205642,
"learning_rate": 8.401457697757972e-07,
"loss": 1.518,
"step": 7180
},
{
"epoch": 0.2635727116096631,
"grad_norm": 2.6915376209294917,
"learning_rate": 8.397227214781983e-07,
"loss": 1.4812,
"step": 7190
},
{
"epoch": 0.26393929396238863,
"grad_norm": 2.3046153435535235,
"learning_rate": 8.392992209489973e-07,
"loss": 1.5159,
"step": 7200
},
{
"epoch": 0.2643058763151142,
"grad_norm": 2.508127660367551,
"learning_rate": 8.388752687519489e-07,
"loss": 1.4451,
"step": 7210
},
{
"epoch": 0.2646724586678397,
"grad_norm": 3.1862145718553245,
"learning_rate": 8.384508654514091e-07,
"loss": 1.4609,
"step": 7220
},
{
"epoch": 0.26503904102056525,
"grad_norm": 2.5580838478505803,
"learning_rate": 8.380260116123343e-07,
"loss": 1.4331,
"step": 7230
},
{
"epoch": 0.2654056233732908,
"grad_norm": 2.257862509636175,
"learning_rate": 8.376007078002813e-07,
"loss": 1.45,
"step": 7240
},
{
"epoch": 0.26577220572601634,
"grad_norm": 2.288080123372639,
"learning_rate": 8.371749545814051e-07,
"loss": 1.4389,
"step": 7250
},
{
"epoch": 0.2661387880787419,
"grad_norm": 2.396647723381076,
"learning_rate": 8.367487525224592e-07,
"loss": 1.4366,
"step": 7260
},
{
"epoch": 0.26650537043146744,
"grad_norm": 2.2979084143372868,
"learning_rate": 8.363221021907949e-07,
"loss": 1.4818,
"step": 7270
},
{
"epoch": 0.26687195278419296,
"grad_norm": 2.1808515998354694,
"learning_rate": 8.358950041543598e-07,
"loss": 1.4542,
"step": 7280
},
{
"epoch": 0.26723853513691853,
"grad_norm": 2.230268806261455,
"learning_rate": 8.354674589816977e-07,
"loss": 1.4329,
"step": 7290
},
{
"epoch": 0.26760511748964405,
"grad_norm": 2.927648869466954,
"learning_rate": 8.350394672419474e-07,
"loss": 1.5225,
"step": 7300
},
{
"epoch": 0.26797169984236957,
"grad_norm": 2.112114910370922,
"learning_rate": 8.346110295048425e-07,
"loss": 1.4225,
"step": 7310
},
{
"epoch": 0.26833828219509515,
"grad_norm": 2.660467378126346,
"learning_rate": 8.341821463407101e-07,
"loss": 1.5031,
"step": 7320
},
{
"epoch": 0.26870486454782067,
"grad_norm": 3.003354330326063,
"learning_rate": 8.337528183204704e-07,
"loss": 1.4707,
"step": 7330
},
{
"epoch": 0.2690714469005462,
"grad_norm": 2.623779251977545,
"learning_rate": 8.333230460156355e-07,
"loss": 1.4794,
"step": 7340
},
{
"epoch": 0.26943802925327176,
"grad_norm": 3.101895766048754,
"learning_rate": 8.32892829998309e-07,
"loss": 1.4667,
"step": 7350
},
{
"epoch": 0.2698046116059973,
"grad_norm": 2.960369047027641,
"learning_rate": 8.324621708411854e-07,
"loss": 1.5522,
"step": 7360
},
{
"epoch": 0.2701711939587228,
"grad_norm": 2.524100342925903,
"learning_rate": 8.320310691175489e-07,
"loss": 1.4526,
"step": 7370
},
{
"epoch": 0.2705377763114484,
"grad_norm": 2.62363195310582,
"learning_rate": 8.315995254012726e-07,
"loss": 1.4018,
"step": 7380
},
{
"epoch": 0.2709043586641739,
"grad_norm": 1.9920146887682115,
"learning_rate": 8.311675402668188e-07,
"loss": 1.3965,
"step": 7390
},
{
"epoch": 0.27127094101689947,
"grad_norm": 2.18110821192289,
"learning_rate": 8.307351142892364e-07,
"loss": 1.4842,
"step": 7400
},
{
"epoch": 0.271637523369625,
"grad_norm": 2.2188567896520497,
"learning_rate": 8.303022480441617e-07,
"loss": 1.4159,
"step": 7410
},
{
"epoch": 0.2720041057223505,
"grad_norm": 2.858166839750072,
"learning_rate": 8.298689421078171e-07,
"loss": 1.3954,
"step": 7420
},
{
"epoch": 0.2723706880750761,
"grad_norm": 2.740212521082454,
"learning_rate": 8.294351970570099e-07,
"loss": 1.4861,
"step": 7430
},
{
"epoch": 0.2727372704278016,
"grad_norm": 3.419233012340433,
"learning_rate": 8.290010134691326e-07,
"loss": 1.4824,
"step": 7440
},
{
"epoch": 0.2731038527805271,
"grad_norm": 2.4809215592986966,
"learning_rate": 8.285663919221606e-07,
"loss": 1.4938,
"step": 7450
},
{
"epoch": 0.2734704351332527,
"grad_norm": 2.607478119047904,
"learning_rate": 8.281313329946531e-07,
"loss": 1.419,
"step": 7460
},
{
"epoch": 0.2738370174859782,
"grad_norm": 2.8279213303777753,
"learning_rate": 8.276958372657512e-07,
"loss": 1.4801,
"step": 7470
},
{
"epoch": 0.27420359983870374,
"grad_norm": 2.585541966605194,
"learning_rate": 8.272599053151774e-07,
"loss": 1.4154,
"step": 7480
},
{
"epoch": 0.2745701821914293,
"grad_norm": 2.7236239018595336,
"learning_rate": 8.268235377232351e-07,
"loss": 1.4741,
"step": 7490
},
{
"epoch": 0.27493676454415483,
"grad_norm": 2.2739375571211844,
"learning_rate": 8.263867350708072e-07,
"loss": 1.4447,
"step": 7500
},
{
"epoch": 0.2753033468968804,
"grad_norm": 2.936703619541737,
"learning_rate": 8.259494979393562e-07,
"loss": 1.4811,
"step": 7510
},
{
"epoch": 0.2756699292496059,
"grad_norm": 2.644051786280347,
"learning_rate": 8.255118269109229e-07,
"loss": 1.4359,
"step": 7520
},
{
"epoch": 0.27603651160233145,
"grad_norm": 2.814370164816269,
"learning_rate": 8.250737225681254e-07,
"loss": 1.4697,
"step": 7530
},
{
"epoch": 0.276403093955057,
"grad_norm": 2.7487477516640664,
"learning_rate": 8.246351854941589e-07,
"loss": 1.4677,
"step": 7540
},
{
"epoch": 0.27676967630778254,
"grad_norm": 2.7840690479403807,
"learning_rate": 8.241962162727946e-07,
"loss": 1.462,
"step": 7550
},
{
"epoch": 0.27713625866050806,
"grad_norm": 2.9784690105392366,
"learning_rate": 8.237568154883788e-07,
"loss": 1.4439,
"step": 7560
},
{
"epoch": 0.27750284101323364,
"grad_norm": 2.8948634927350105,
"learning_rate": 8.233169837258325e-07,
"loss": 1.4705,
"step": 7570
},
{
"epoch": 0.27786942336595916,
"grad_norm": 2.612491147603324,
"learning_rate": 8.228767215706503e-07,
"loss": 1.467,
"step": 7580
},
{
"epoch": 0.2782360057186847,
"grad_norm": 2.8002040163179736,
"learning_rate": 8.224360296088995e-07,
"loss": 1.4573,
"step": 7590
},
{
"epoch": 0.27860258807141025,
"grad_norm": 2.8029823959562155,
"learning_rate": 8.219949084272201e-07,
"loss": 1.4804,
"step": 7600
},
{
"epoch": 0.27896917042413577,
"grad_norm": 2.6888372781846375,
"learning_rate": 8.21553358612823e-07,
"loss": 1.4633,
"step": 7610
},
{
"epoch": 0.27933575277686135,
"grad_norm": 2.279721839418087,
"learning_rate": 8.2111138075349e-07,
"loss": 1.4713,
"step": 7620
},
{
"epoch": 0.27970233512958687,
"grad_norm": 2.3829035564919807,
"learning_rate": 8.206689754375724e-07,
"loss": 1.4387,
"step": 7630
},
{
"epoch": 0.2800689174823124,
"grad_norm": 3.7962407630882384,
"learning_rate": 8.202261432539907e-07,
"loss": 1.4025,
"step": 7640
},
{
"epoch": 0.28043549983503796,
"grad_norm": 2.797043930833034,
"learning_rate": 8.197828847922337e-07,
"loss": 1.4576,
"step": 7650
},
{
"epoch": 0.2808020821877635,
"grad_norm": 3.256545613051792,
"learning_rate": 8.193392006423574e-07,
"loss": 1.432,
"step": 7660
},
{
"epoch": 0.281168664540489,
"grad_norm": 2.432668523438971,
"learning_rate": 8.188950913949848e-07,
"loss": 1.456,
"step": 7670
},
{
"epoch": 0.2815352468932146,
"grad_norm": 2.4546993774133856,
"learning_rate": 8.184505576413043e-07,
"loss": 1.392,
"step": 7680
},
{
"epoch": 0.2819018292459401,
"grad_norm": 3.0030506631971776,
"learning_rate": 8.180055999730702e-07,
"loss": 1.365,
"step": 7690
},
{
"epoch": 0.2822684115986656,
"grad_norm": 2.9439493487762465,
"learning_rate": 8.175602189826001e-07,
"loss": 1.4292,
"step": 7700
},
{
"epoch": 0.2826349939513912,
"grad_norm": 2.620909787731563,
"learning_rate": 8.171144152627761e-07,
"loss": 1.4251,
"step": 7710
},
{
"epoch": 0.2830015763041167,
"grad_norm": 3.263683256322055,
"learning_rate": 8.16668189407042e-07,
"loss": 1.3899,
"step": 7720
},
{
"epoch": 0.2833681586568423,
"grad_norm": 2.5437523385064953,
"learning_rate": 8.162215420094045e-07,
"loss": 1.3683,
"step": 7730
},
{
"epoch": 0.2837347410095678,
"grad_norm": 2.4580551613838844,
"learning_rate": 8.15774473664431e-07,
"loss": 1.3732,
"step": 7740
},
{
"epoch": 0.2841013233622933,
"grad_norm": 2.8279077970597184,
"learning_rate": 8.153269849672493e-07,
"loss": 1.419,
"step": 7750
},
{
"epoch": 0.2844679057150189,
"grad_norm": 3.041958703900493,
"learning_rate": 8.148790765135465e-07,
"loss": 1.4356,
"step": 7760
},
{
"epoch": 0.2848344880677444,
"grad_norm": 2.4582661578514426,
"learning_rate": 8.144307488995689e-07,
"loss": 1.4378,
"step": 7770
},
{
"epoch": 0.28520107042046994,
"grad_norm": 2.8361019596271726,
"learning_rate": 8.139820027221208e-07,
"loss": 1.4111,
"step": 7780
},
{
"epoch": 0.2855676527731955,
"grad_norm": 2.4415137770737427,
"learning_rate": 8.135328385785631e-07,
"loss": 1.4996,
"step": 7790
},
{
"epoch": 0.28593423512592103,
"grad_norm": 2.1392002967653094,
"learning_rate": 8.130832570668139e-07,
"loss": 1.433,
"step": 7800
},
{
"epoch": 0.28630081747864655,
"grad_norm": 3.061322031102369,
"learning_rate": 8.126332587853462e-07,
"loss": 1.4051,
"step": 7810
},
{
"epoch": 0.2866673998313721,
"grad_norm": 3.2748819767509354,
"learning_rate": 8.12182844333188e-07,
"loss": 1.3863,
"step": 7820
},
{
"epoch": 0.28703398218409765,
"grad_norm": 3.1866933217967603,
"learning_rate": 8.117320143099216e-07,
"loss": 1.4173,
"step": 7830
},
{
"epoch": 0.2874005645368232,
"grad_norm": 2.9290211285749175,
"learning_rate": 8.11280769315682e-07,
"loss": 1.4395,
"step": 7840
},
{
"epoch": 0.28776714688954874,
"grad_norm": 2.7212160772193474,
"learning_rate": 8.108291099511571e-07,
"loss": 1.4503,
"step": 7850
},
{
"epoch": 0.28813372924227426,
"grad_norm": 2.3892746869258317,
"learning_rate": 8.10377036817586e-07,
"loss": 1.4368,
"step": 7860
},
{
"epoch": 0.28850031159499984,
"grad_norm": 3.4107926691510277,
"learning_rate": 8.099245505167589e-07,
"loss": 1.4623,
"step": 7870
},
{
"epoch": 0.28886689394772536,
"grad_norm": 3.1259277735027307,
"learning_rate": 8.094716516510156e-07,
"loss": 1.4412,
"step": 7880
},
{
"epoch": 0.2892334763004509,
"grad_norm": 2.9135343767151154,
"learning_rate": 8.090183408232459e-07,
"loss": 1.4187,
"step": 7890
},
{
"epoch": 0.28960005865317645,
"grad_norm": 3.30617041516701,
"learning_rate": 8.085646186368867e-07,
"loss": 1.4176,
"step": 7900
},
{
"epoch": 0.28996664100590197,
"grad_norm": 3.1801194693312556,
"learning_rate": 8.081104856959238e-07,
"loss": 1.4534,
"step": 7910
},
{
"epoch": 0.2903332233586275,
"grad_norm": 3.2431476470574983,
"learning_rate": 8.07655942604889e-07,
"loss": 1.3469,
"step": 7920
},
{
"epoch": 0.29069980571135307,
"grad_norm": 3.1005913247685237,
"learning_rate": 8.072009899688605e-07,
"loss": 1.417,
"step": 7930
},
{
"epoch": 0.2910663880640786,
"grad_norm": 2.953054099149132,
"learning_rate": 8.067456283934614e-07,
"loss": 1.4252,
"step": 7940
},
{
"epoch": 0.29143297041680416,
"grad_norm": 2.6363992565855803,
"learning_rate": 8.062898584848592e-07,
"loss": 1.4499,
"step": 7950
},
{
"epoch": 0.2917995527695297,
"grad_norm": 2.7290690238502635,
"learning_rate": 8.05833680849765e-07,
"loss": 1.4716,
"step": 7960
},
{
"epoch": 0.2921661351222552,
"grad_norm": 3.21591143424738,
"learning_rate": 8.053770960954328e-07,
"loss": 1.3969,
"step": 7970
},
{
"epoch": 0.2925327174749808,
"grad_norm": 3.8732639515812575,
"learning_rate": 8.049201048296585e-07,
"loss": 1.463,
"step": 7980
},
{
"epoch": 0.2928992998277063,
"grad_norm": 2.9966394441630126,
"learning_rate": 8.044627076607789e-07,
"loss": 1.4545,
"step": 7990
},
{
"epoch": 0.2932658821804318,
"grad_norm": 3.1577560282041017,
"learning_rate": 8.040049051976713e-07,
"loss": 1.4682,
"step": 8000
},
{
"epoch": 0.2932658821804318,
"eval_accuracy": 0.6739903313977985,
"eval_loss": 1.4271955490112305,
"eval_runtime": 311.2156,
"eval_samples_per_second": 10.626,
"eval_steps_per_second": 0.887,
"step": 8000
},
{
"epoch": 0.2936324645331574,
"grad_norm": 2.957786000444244,
"learning_rate": 8.035466980497526e-07,
"loss": 1.4592,
"step": 8010
},
{
"epoch": 0.2939990468858829,
"grad_norm": 2.765279941343725,
"learning_rate": 8.030880868269785e-07,
"loss": 1.4404,
"step": 8020
},
{
"epoch": 0.29436562923860843,
"grad_norm": 2.803405395861366,
"learning_rate": 8.026290721398421e-07,
"loss": 1.3642,
"step": 8030
},
{
"epoch": 0.294732211591334,
"grad_norm": 3.134947642226663,
"learning_rate": 8.02169654599374e-07,
"loss": 1.4662,
"step": 8040
},
{
"epoch": 0.2950987939440595,
"grad_norm": 3.3888445829207923,
"learning_rate": 8.017098348171411e-07,
"loss": 1.4092,
"step": 8050
},
{
"epoch": 0.2954653762967851,
"grad_norm": 2.595961601811049,
"learning_rate": 8.012496134052457e-07,
"loss": 1.3772,
"step": 8060
},
{
"epoch": 0.2958319586495106,
"grad_norm": 3.724884065568925,
"learning_rate": 8.007889909763246e-07,
"loss": 1.3862,
"step": 8070
},
{
"epoch": 0.29619854100223614,
"grad_norm": 3.6608857589920754,
"learning_rate": 8.003279681435482e-07,
"loss": 1.444,
"step": 8080
},
{
"epoch": 0.2965651233549617,
"grad_norm": 2.7154240671865213,
"learning_rate": 7.998665455206206e-07,
"loss": 1.4285,
"step": 8090
},
{
"epoch": 0.29693170570768723,
"grad_norm": 2.7151538150939927,
"learning_rate": 7.994047237217776e-07,
"loss": 1.4489,
"step": 8100
},
{
"epoch": 0.29729828806041275,
"grad_norm": 2.9729575587995742,
"learning_rate": 7.989425033617863e-07,
"loss": 1.4289,
"step": 8110
},
{
"epoch": 0.2976648704131383,
"grad_norm": 3.298808013574498,
"learning_rate": 7.984798850559447e-07,
"loss": 1.4607,
"step": 8120
},
{
"epoch": 0.29803145276586385,
"grad_norm": 3.1491445672684866,
"learning_rate": 7.980168694200804e-07,
"loss": 1.4097,
"step": 8130
},
{
"epoch": 0.29839803511858937,
"grad_norm": 3.6399703354293007,
"learning_rate": 7.975534570705497e-07,
"loss": 1.3743,
"step": 8140
},
{
"epoch": 0.29876461747131494,
"grad_norm": 3.2547493183004974,
"learning_rate": 7.970896486242374e-07,
"loss": 1.4346,
"step": 8150
},
{
"epoch": 0.29913119982404046,
"grad_norm": 3.421650269839234,
"learning_rate": 7.966254446985553e-07,
"loss": 1.43,
"step": 8160
},
{
"epoch": 0.29949778217676604,
"grad_norm": 3.797293850962011,
"learning_rate": 7.961608459114416e-07,
"loss": 1.4651,
"step": 8170
},
{
"epoch": 0.29986436452949156,
"grad_norm": 3.5920498224364508,
"learning_rate": 7.956958528813604e-07,
"loss": 1.3738,
"step": 8180
},
{
"epoch": 0.3002309468822171,
"grad_norm": 3.238482918382144,
"learning_rate": 7.952304662273003e-07,
"loss": 1.3987,
"step": 8190
},
{
"epoch": 0.30059752923494265,
"grad_norm": 2.7498611423368176,
"learning_rate": 7.947646865687742e-07,
"loss": 1.4181,
"step": 8200
},
{
"epoch": 0.30096411158766817,
"grad_norm": 4.031428344222072,
"learning_rate": 7.942985145258179e-07,
"loss": 1.4294,
"step": 8210
},
{
"epoch": 0.3013306939403937,
"grad_norm": 2.643218639323195,
"learning_rate": 7.938319507189894e-07,
"loss": 1.4302,
"step": 8220
},
{
"epoch": 0.30169727629311927,
"grad_norm": 3.1275133100531227,
"learning_rate": 7.933649957693689e-07,
"loss": 1.348,
"step": 8230
},
{
"epoch": 0.3020638586458448,
"grad_norm": 3.521399879217592,
"learning_rate": 7.928976502985565e-07,
"loss": 1.4328,
"step": 8240
},
{
"epoch": 0.3024304409985703,
"grad_norm": 3.1834120547065665,
"learning_rate": 7.924299149286725e-07,
"loss": 1.4742,
"step": 8250
},
{
"epoch": 0.3027970233512959,
"grad_norm": 3.631213709741295,
"learning_rate": 7.919617902823563e-07,
"loss": 1.4068,
"step": 8260
},
{
"epoch": 0.3031636057040214,
"grad_norm": 2.726938578010126,
"learning_rate": 7.914932769827653e-07,
"loss": 1.4359,
"step": 8270
},
{
"epoch": 0.303530188056747,
"grad_norm": 3.7017959652425882,
"learning_rate": 7.910243756535744e-07,
"loss": 1.3344,
"step": 8280
},
{
"epoch": 0.3038967704094725,
"grad_norm": 3.3417669291832066,
"learning_rate": 7.90555086918975e-07,
"loss": 1.4121,
"step": 8290
},
{
"epoch": 0.304263352762198,
"grad_norm": 2.733351967687222,
"learning_rate": 7.900854114036743e-07,
"loss": 1.3732,
"step": 8300
},
{
"epoch": 0.3046299351149236,
"grad_norm": 3.1756478835337476,
"learning_rate": 7.89615349732894e-07,
"loss": 1.4007,
"step": 8310
},
{
"epoch": 0.3049965174676491,
"grad_norm": 3.238758242953075,
"learning_rate": 7.891449025323703e-07,
"loss": 1.4288,
"step": 8320
},
{
"epoch": 0.30536309982037463,
"grad_norm": 2.6053607033892043,
"learning_rate": 7.886740704283525e-07,
"loss": 1.4156,
"step": 8330
},
{
"epoch": 0.3057296821731002,
"grad_norm": 3.4053915363354417,
"learning_rate": 7.88202854047602e-07,
"loss": 1.3763,
"step": 8340
},
{
"epoch": 0.3060962645258257,
"grad_norm": 3.715425460301463,
"learning_rate": 7.877312540173922e-07,
"loss": 1.4036,
"step": 8350
},
{
"epoch": 0.30646284687855124,
"grad_norm": 2.9427971805533697,
"learning_rate": 7.872592709655066e-07,
"loss": 1.4385,
"step": 8360
},
{
"epoch": 0.3068294292312768,
"grad_norm": 3.5845846532616426,
"learning_rate": 7.867869055202392e-07,
"loss": 1.415,
"step": 8370
},
{
"epoch": 0.30719601158400234,
"grad_norm": 3.331222139254396,
"learning_rate": 7.863141583103927e-07,
"loss": 1.4126,
"step": 8380
},
{
"epoch": 0.3075625939367279,
"grad_norm": 3.1984388430808406,
"learning_rate": 7.85841029965278e-07,
"loss": 1.3826,
"step": 8390
},
{
"epoch": 0.30792917628945343,
"grad_norm": 3.1255012278404615,
"learning_rate": 7.853675211147134e-07,
"loss": 1.383,
"step": 8400
},
{
"epoch": 0.30829575864217895,
"grad_norm": 3.329583698840508,
"learning_rate": 7.848936323890239e-07,
"loss": 1.3931,
"step": 8410
},
{
"epoch": 0.3086623409949045,
"grad_norm": 3.9347250968462055,
"learning_rate": 7.844193644190396e-07,
"loss": 1.415,
"step": 8420
},
{
"epoch": 0.30902892334763005,
"grad_norm": 4.137255951707039,
"learning_rate": 7.839447178360963e-07,
"loss": 1.3998,
"step": 8430
},
{
"epoch": 0.30939550570035557,
"grad_norm": 2.6794621566293917,
"learning_rate": 7.834696932720331e-07,
"loss": 1.4228,
"step": 8440
},
{
"epoch": 0.30976208805308114,
"grad_norm": 2.726588078339754,
"learning_rate": 7.829942913591925e-07,
"loss": 1.4486,
"step": 8450
},
{
"epoch": 0.31012867040580666,
"grad_norm": 3.6162463016794026,
"learning_rate": 7.825185127304194e-07,
"loss": 1.4051,
"step": 8460
},
{
"epoch": 0.3104952527585322,
"grad_norm": 2.910711368055256,
"learning_rate": 7.820423580190603e-07,
"loss": 1.41,
"step": 8470
},
{
"epoch": 0.31086183511125776,
"grad_norm": 4.136385316326493,
"learning_rate": 7.815658278589619e-07,
"loss": 1.3859,
"step": 8480
},
{
"epoch": 0.3112284174639833,
"grad_norm": 2.1538443576824404,
"learning_rate": 7.810889228844708e-07,
"loss": 1.4113,
"step": 8490
},
{
"epoch": 0.31159499981670885,
"grad_norm": 3.1055419264140727,
"learning_rate": 7.806116437304331e-07,
"loss": 1.4327,
"step": 8500
},
{
"epoch": 0.31196158216943437,
"grad_norm": 3.183052960747229,
"learning_rate": 7.801339910321922e-07,
"loss": 1.4179,
"step": 8510
},
{
"epoch": 0.3123281645221599,
"grad_norm": 4.6955784323633925,
"learning_rate": 7.796559654255894e-07,
"loss": 1.3961,
"step": 8520
},
{
"epoch": 0.31269474687488547,
"grad_norm": 3.227174794853267,
"learning_rate": 7.79177567546962e-07,
"loss": 1.4082,
"step": 8530
},
{
"epoch": 0.313061329227611,
"grad_norm": 2.8264595214995243,
"learning_rate": 7.78698798033143e-07,
"loss": 1.4136,
"step": 8540
},
{
"epoch": 0.3134279115803365,
"grad_norm": 3.7915043909577624,
"learning_rate": 7.782196575214601e-07,
"loss": 1.3758,
"step": 8550
},
{
"epoch": 0.3137944939330621,
"grad_norm": 4.070976938559408,
"learning_rate": 7.777401466497349e-07,
"loss": 1.3915,
"step": 8560
},
{
"epoch": 0.3141610762857876,
"grad_norm": 3.3538502722425916,
"learning_rate": 7.772602660562819e-07,
"loss": 1.3718,
"step": 8570
},
{
"epoch": 0.3145276586385131,
"grad_norm": 3.230342363406807,
"learning_rate": 7.767800163799081e-07,
"loss": 1.3408,
"step": 8580
},
{
"epoch": 0.3148942409912387,
"grad_norm": 3.6144160833487415,
"learning_rate": 7.762993982599113e-07,
"loss": 1.4296,
"step": 8590
},
{
"epoch": 0.3152608233439642,
"grad_norm": 3.1182771552970374,
"learning_rate": 7.758184123360803e-07,
"loss": 1.3858,
"step": 8600
},
{
"epoch": 0.3156274056966898,
"grad_norm": 3.5319206230022977,
"learning_rate": 7.75337059248693e-07,
"loss": 1.4342,
"step": 8610
},
{
"epoch": 0.3159939880494153,
"grad_norm": 4.327639493570607,
"learning_rate": 7.748553396385163e-07,
"loss": 1.3915,
"step": 8620
},
{
"epoch": 0.31636057040214083,
"grad_norm": 3.9982142503751326,
"learning_rate": 7.743732541468053e-07,
"loss": 1.363,
"step": 8630
},
{
"epoch": 0.3167271527548664,
"grad_norm": 2.8786530129074728,
"learning_rate": 7.738908034153015e-07,
"loss": 1.3589,
"step": 8640
},
{
"epoch": 0.3170937351075919,
"grad_norm": 4.4947342914569095,
"learning_rate": 7.734079880862333e-07,
"loss": 1.3506,
"step": 8650
},
{
"epoch": 0.31746031746031744,
"grad_norm": 3.1518608629753477,
"learning_rate": 7.729248088023139e-07,
"loss": 1.3847,
"step": 8660
},
{
"epoch": 0.317826899813043,
"grad_norm": 3.8964914548994534,
"learning_rate": 7.724412662067415e-07,
"loss": 1.3616,
"step": 8670
},
{
"epoch": 0.31819348216576854,
"grad_norm": 4.158332163473049,
"learning_rate": 7.719573609431971e-07,
"loss": 1.3477,
"step": 8680
},
{
"epoch": 0.31856006451849406,
"grad_norm": 5.31244346458908,
"learning_rate": 7.714730936558455e-07,
"loss": 1.3885,
"step": 8690
},
{
"epoch": 0.31892664687121963,
"grad_norm": 3.5750048314109946,
"learning_rate": 7.709884649893328e-07,
"loss": 1.3763,
"step": 8700
},
{
"epoch": 0.31929322922394515,
"grad_norm": 3.5013927398683444,
"learning_rate": 7.70503475588786e-07,
"loss": 1.3437,
"step": 8710
},
{
"epoch": 0.31965981157667067,
"grad_norm": 3.772854937898392,
"learning_rate": 7.700181260998131e-07,
"loss": 1.434,
"step": 8720
},
{
"epoch": 0.32002639392939625,
"grad_norm": 3.939247516045474,
"learning_rate": 7.695324171685004e-07,
"loss": 1.384,
"step": 8730
},
{
"epoch": 0.32039297628212177,
"grad_norm": 3.3160045433400334,
"learning_rate": 7.690463494414137e-07,
"loss": 1.3681,
"step": 8740
},
{
"epoch": 0.32075955863484734,
"grad_norm": 3.2760601494452533,
"learning_rate": 7.685599235655955e-07,
"loss": 1.3576,
"step": 8750
},
{
"epoch": 0.32112614098757286,
"grad_norm": 3.917398028616676,
"learning_rate": 7.680731401885658e-07,
"loss": 1.4109,
"step": 8760
},
{
"epoch": 0.3214927233402984,
"grad_norm": 4.3801775022523355,
"learning_rate": 7.675859999583202e-07,
"loss": 1.3688,
"step": 8770
},
{
"epoch": 0.32185930569302396,
"grad_norm": 3.52546033919284,
"learning_rate": 7.670985035233291e-07,
"loss": 1.3803,
"step": 8780
},
{
"epoch": 0.3222258880457495,
"grad_norm": 3.4568824402601925,
"learning_rate": 7.666106515325374e-07,
"loss": 1.3615,
"step": 8790
},
{
"epoch": 0.322592470398475,
"grad_norm": 2.7983015500958826,
"learning_rate": 7.661224446353634e-07,
"loss": 1.3767,
"step": 8800
},
{
"epoch": 0.32295905275120057,
"grad_norm": 3.4581919245368904,
"learning_rate": 7.656338834816976e-07,
"loss": 1.3768,
"step": 8810
},
{
"epoch": 0.3233256351039261,
"grad_norm": 3.7176544154346054,
"learning_rate": 7.651449687219018e-07,
"loss": 1.3312,
"step": 8820
},
{
"epoch": 0.3236922174566516,
"grad_norm": 3.6712040176600502,
"learning_rate": 7.646557010068091e-07,
"loss": 1.3981,
"step": 8830
},
{
"epoch": 0.3240587998093772,
"grad_norm": 2.8962404949789637,
"learning_rate": 7.641660809877222e-07,
"loss": 1.4085,
"step": 8840
},
{
"epoch": 0.3244253821621027,
"grad_norm": 5.2069626245172635,
"learning_rate": 7.636761093164126e-07,
"loss": 1.3489,
"step": 8850
},
{
"epoch": 0.3247919645148283,
"grad_norm": 3.3614052591604793,
"learning_rate": 7.631857866451204e-07,
"loss": 1.391,
"step": 8860
},
{
"epoch": 0.3251585468675538,
"grad_norm": 3.1183008582079417,
"learning_rate": 7.626951136265523e-07,
"loss": 1.3966,
"step": 8870
},
{
"epoch": 0.3255251292202793,
"grad_norm": 4.337276600886146,
"learning_rate": 7.622040909138818e-07,
"loss": 1.3566,
"step": 8880
},
{
"epoch": 0.3258917115730049,
"grad_norm": 4.083650404603487,
"learning_rate": 7.617127191607479e-07,
"loss": 1.3928,
"step": 8890
},
{
"epoch": 0.3262582939257304,
"grad_norm": 3.847428171873619,
"learning_rate": 7.612209990212543e-07,
"loss": 1.3259,
"step": 8900
},
{
"epoch": 0.32662487627845593,
"grad_norm": 3.2197146488177384,
"learning_rate": 7.607289311499678e-07,
"loss": 1.376,
"step": 8910
},
{
"epoch": 0.3269914586311815,
"grad_norm": 3.4983962191005373,
"learning_rate": 7.60236516201919e-07,
"loss": 1.3927,
"step": 8920
},
{
"epoch": 0.32735804098390703,
"grad_norm": 3.610610377134006,
"learning_rate": 7.597437548326002e-07,
"loss": 1.3792,
"step": 8930
},
{
"epoch": 0.32772462333663255,
"grad_norm": 5.095826376758547,
"learning_rate": 7.592506476979644e-07,
"loss": 1.358,
"step": 8940
},
{
"epoch": 0.3280912056893581,
"grad_norm": 3.3863305431901183,
"learning_rate": 7.587571954544254e-07,
"loss": 1.3983,
"step": 8950
},
{
"epoch": 0.32845778804208364,
"grad_norm": 3.5975350890244067,
"learning_rate": 7.582633987588563e-07,
"loss": 1.4057,
"step": 8960
},
{
"epoch": 0.3288243703948092,
"grad_norm": 3.848485096118636,
"learning_rate": 7.577692582685886e-07,
"loss": 1.3814,
"step": 8970
},
{
"epoch": 0.32919095274753474,
"grad_norm": 3.157404479059578,
"learning_rate": 7.572747746414117e-07,
"loss": 1.4095,
"step": 8980
},
{
"epoch": 0.32955753510026026,
"grad_norm": 4.1043127446716285,
"learning_rate": 7.567799485355715e-07,
"loss": 1.3755,
"step": 8990
},
{
"epoch": 0.32992411745298583,
"grad_norm": 3.7156219870736615,
"learning_rate": 7.562847806097696e-07,
"loss": 1.3526,
"step": 9000
},
{
"epoch": 0.32992411745298583,
"eval_accuracy": 0.688625248964108,
"eval_loss": 1.3686386346817017,
"eval_runtime": 311.2444,
"eval_samples_per_second": 10.625,
"eval_steps_per_second": 0.887,
"step": 9000
},
{
"epoch": 0.33029069980571135,
"grad_norm": 4.016168592808031,
"learning_rate": 7.557892715231634e-07,
"loss": 1.3607,
"step": 9010
},
{
"epoch": 0.33065728215843687,
"grad_norm": 3.504820069720998,
"learning_rate": 7.552934219353638e-07,
"loss": 1.3833,
"step": 9020
},
{
"epoch": 0.33102386451116245,
"grad_norm": 3.3563895186210875,
"learning_rate": 7.547972325064351e-07,
"loss": 1.393,
"step": 9030
},
{
"epoch": 0.33139044686388797,
"grad_norm": 3.401944814988902,
"learning_rate": 7.543007038968939e-07,
"loss": 1.3708,
"step": 9040
},
{
"epoch": 0.3317570292166135,
"grad_norm": 4.8917426491539935,
"learning_rate": 7.538038367677087e-07,
"loss": 1.329,
"step": 9050
},
{
"epoch": 0.33212361156933906,
"grad_norm": 4.014824315681244,
"learning_rate": 7.53306631780298e-07,
"loss": 1.3464,
"step": 9060
},
{
"epoch": 0.3324901939220646,
"grad_norm": 3.9395593086417637,
"learning_rate": 7.52809089596531e-07,
"loss": 1.4059,
"step": 9070
},
{
"epoch": 0.33285677627479016,
"grad_norm": 3.5141323515233274,
"learning_rate": 7.523112108787247e-07,
"loss": 1.3467,
"step": 9080
},
{
"epoch": 0.3332233586275157,
"grad_norm": 4.310837199551292,
"learning_rate": 7.518129962896448e-07,
"loss": 1.3432,
"step": 9090
},
{
"epoch": 0.3335899409802412,
"grad_norm": 4.049279934012434,
"learning_rate": 7.513144464925036e-07,
"loss": 1.4107,
"step": 9100
},
{
"epoch": 0.33395652333296677,
"grad_norm": 5.43599736913238,
"learning_rate": 7.508155621509603e-07,
"loss": 1.3779,
"step": 9110
},
{
"epoch": 0.3343231056856923,
"grad_norm": 4.312594101718665,
"learning_rate": 7.503163439291187e-07,
"loss": 1.3279,
"step": 9120
},
{
"epoch": 0.3346896880384178,
"grad_norm": 3.7888042986131794,
"learning_rate": 7.498167924915276e-07,
"loss": 1.3422,
"step": 9130
},
{
"epoch": 0.3350562703911434,
"grad_norm": 4.6227274755808665,
"learning_rate": 7.493169085031791e-07,
"loss": 1.3489,
"step": 9140
},
{
"epoch": 0.3354228527438689,
"grad_norm": 4.440746888404653,
"learning_rate": 7.48816692629508e-07,
"loss": 1.3955,
"step": 9150
},
{
"epoch": 0.3357894350965944,
"grad_norm": 3.1422454499623753,
"learning_rate": 7.483161455363909e-07,
"loss": 1.3613,
"step": 9160
},
{
"epoch": 0.33615601744932,
"grad_norm": 3.894653506327936,
"learning_rate": 7.478152678901455e-07,
"loss": 1.4148,
"step": 9170
},
{
"epoch": 0.3365225998020455,
"grad_norm": 5.433033949859381,
"learning_rate": 7.473140603575294e-07,
"loss": 1.3144,
"step": 9180
},
{
"epoch": 0.3368891821547711,
"grad_norm": 3.975951714183405,
"learning_rate": 7.468125236057392e-07,
"loss": 1.3691,
"step": 9190
},
{
"epoch": 0.3372557645074966,
"grad_norm": 4.918343199781564,
"learning_rate": 7.463106583024099e-07,
"loss": 1.3848,
"step": 9200
},
{
"epoch": 0.33762234686022213,
"grad_norm": 4.865872631877682,
"learning_rate": 7.458084651156138e-07,
"loss": 1.3612,
"step": 9210
},
{
"epoch": 0.3379889292129477,
"grad_norm": 4.124355883120795,
"learning_rate": 7.453059447138597e-07,
"loss": 1.3922,
"step": 9220
},
{
"epoch": 0.33835551156567323,
"grad_norm": 3.4927433175723968,
"learning_rate": 7.448030977660921e-07,
"loss": 1.3209,
"step": 9230
},
{
"epoch": 0.33872209391839875,
"grad_norm": 3.5565740075352887,
"learning_rate": 7.4429992494169e-07,
"loss": 1.3137,
"step": 9240
},
{
"epoch": 0.3390886762711243,
"grad_norm": 3.2292820179583335,
"learning_rate": 7.437964269104663e-07,
"loss": 1.3469,
"step": 9250
},
{
"epoch": 0.33945525862384984,
"grad_norm": 5.260253752526274,
"learning_rate": 7.432926043426668e-07,
"loss": 1.3067,
"step": 9260
},
{
"epoch": 0.33982184097657536,
"grad_norm": 4.394976349303848,
"learning_rate": 7.427884579089691e-07,
"loss": 1.3423,
"step": 9270
},
{
"epoch": 0.34018842332930094,
"grad_norm": 3.396422180187779,
"learning_rate": 7.422839882804825e-07,
"loss": 1.3449,
"step": 9280
},
{
"epoch": 0.34055500568202646,
"grad_norm": 4.387777704799267,
"learning_rate": 7.417791961287457e-07,
"loss": 1.3274,
"step": 9290
},
{
"epoch": 0.34092158803475203,
"grad_norm": 4.664699242153168,
"learning_rate": 7.412740821257275e-07,
"loss": 1.3147,
"step": 9300
},
{
"epoch": 0.34128817038747755,
"grad_norm": 3.393736360787831,
"learning_rate": 7.407686469438248e-07,
"loss": 1.3934,
"step": 9310
},
{
"epoch": 0.34165475274020307,
"grad_norm": 4.750927708757991,
"learning_rate": 7.40262891255862e-07,
"loss": 1.4067,
"step": 9320
},
{
"epoch": 0.34202133509292865,
"grad_norm": 3.428169411059033,
"learning_rate": 7.397568157350903e-07,
"loss": 1.3411,
"step": 9330
},
{
"epoch": 0.34238791744565417,
"grad_norm": 4.302469394811799,
"learning_rate": 7.392504210551865e-07,
"loss": 1.299,
"step": 9340
},
{
"epoch": 0.3427544997983797,
"grad_norm": 7.00981557963566,
"learning_rate": 7.387437078902523e-07,
"loss": 1.3573,
"step": 9350
},
{
"epoch": 0.34312108215110526,
"grad_norm": 5.566063359486336,
"learning_rate": 7.382366769148136e-07,
"loss": 1.3497,
"step": 9360
},
{
"epoch": 0.3434876645038308,
"grad_norm": 3.4660448886166244,
"learning_rate": 7.37729328803819e-07,
"loss": 1.4092,
"step": 9370
},
{
"epoch": 0.3438542468565563,
"grad_norm": 3.702869545438875,
"learning_rate": 7.372216642326394e-07,
"loss": 1.3603,
"step": 9380
},
{
"epoch": 0.3442208292092819,
"grad_norm": 4.231146103126798,
"learning_rate": 7.367136838770671e-07,
"loss": 1.3428,
"step": 9390
},
{
"epoch": 0.3445874115620074,
"grad_norm": 4.554271919619236,
"learning_rate": 7.362053884133146e-07,
"loss": 1.3311,
"step": 9400
},
{
"epoch": 0.34495399391473297,
"grad_norm": 4.041325390537124,
"learning_rate": 7.35696778518014e-07,
"loss": 1.3471,
"step": 9410
},
{
"epoch": 0.3453205762674585,
"grad_norm": 5.283681695413367,
"learning_rate": 7.351878548682155e-07,
"loss": 1.3334,
"step": 9420
},
{
"epoch": 0.345687158620184,
"grad_norm": 4.104429136831335,
"learning_rate": 7.34678618141388e-07,
"loss": 1.3443,
"step": 9430
},
{
"epoch": 0.3460537409729096,
"grad_norm": 4.637839526253117,
"learning_rate": 7.341690690154161e-07,
"loss": 1.3383,
"step": 9440
},
{
"epoch": 0.3464203233256351,
"grad_norm": 6.447434633082354,
"learning_rate": 7.336592081686007e-07,
"loss": 1.3769,
"step": 9450
},
{
"epoch": 0.3467869056783606,
"grad_norm": 4.989354934531907,
"learning_rate": 7.331490362796579e-07,
"loss": 1.3651,
"step": 9460
},
{
"epoch": 0.3471534880310862,
"grad_norm": 4.121285832330203,
"learning_rate": 7.326385540277171e-07,
"loss": 1.319,
"step": 9470
},
{
"epoch": 0.3475200703838117,
"grad_norm": 3.7909593948348284,
"learning_rate": 7.321277620923217e-07,
"loss": 1.3743,
"step": 9480
},
{
"epoch": 0.34788665273653724,
"grad_norm": 3.3733089497346853,
"learning_rate": 7.316166611534267e-07,
"loss": 1.3743,
"step": 9490
},
{
"epoch": 0.3482532350892628,
"grad_norm": 3.7253741770570823,
"learning_rate": 7.311052518913989e-07,
"loss": 1.2903,
"step": 9500
},
{
"epoch": 0.34861981744198833,
"grad_norm": 4.039793671210928,
"learning_rate": 7.305935349870155e-07,
"loss": 1.2862,
"step": 9510
},
{
"epoch": 0.3489863997947139,
"grad_norm": 4.342535349346429,
"learning_rate": 7.300815111214628e-07,
"loss": 1.3808,
"step": 9520
},
{
"epoch": 0.34935298214743943,
"grad_norm": 5.42799281760455,
"learning_rate": 7.29569180976336e-07,
"loss": 1.3523,
"step": 9530
},
{
"epoch": 0.34971956450016495,
"grad_norm": 5.020277916958928,
"learning_rate": 7.290565452336381e-07,
"loss": 1.3256,
"step": 9540
},
{
"epoch": 0.3500861468528905,
"grad_norm": 4.373712918374428,
"learning_rate": 7.285436045757789e-07,
"loss": 1.2827,
"step": 9550
},
{
"epoch": 0.35045272920561604,
"grad_norm": 6.179796353095443,
"learning_rate": 7.280303596855737e-07,
"loss": 1.3197,
"step": 9560
},
{
"epoch": 0.35081931155834156,
"grad_norm": 5.167300912494304,
"learning_rate": 7.275168112462433e-07,
"loss": 1.331,
"step": 9570
},
{
"epoch": 0.35118589391106714,
"grad_norm": 4.118700000532668,
"learning_rate": 7.270029599414125e-07,
"loss": 1.3529,
"step": 9580
},
{
"epoch": 0.35155247626379266,
"grad_norm": 3.6038833094843516,
"learning_rate": 7.264888064551089e-07,
"loss": 1.3258,
"step": 9590
},
{
"epoch": 0.3519190586165182,
"grad_norm": 3.5142758374979524,
"learning_rate": 7.259743514717627e-07,
"loss": 1.3377,
"step": 9600
},
{
"epoch": 0.35228564096924375,
"grad_norm": 4.1250041287694685,
"learning_rate": 7.254595956762053e-07,
"loss": 1.3135,
"step": 9610
},
{
"epoch": 0.35265222332196927,
"grad_norm": 3.132058137932181,
"learning_rate": 7.249445397536686e-07,
"loss": 1.3349,
"step": 9620
},
{
"epoch": 0.35301880567469485,
"grad_norm": 3.399519224329254,
"learning_rate": 7.244291843897839e-07,
"loss": 1.3052,
"step": 9630
},
{
"epoch": 0.35338538802742037,
"grad_norm": 4.712619284275666,
"learning_rate": 7.239135302705816e-07,
"loss": 1.3065,
"step": 9640
},
{
"epoch": 0.3537519703801459,
"grad_norm": 3.734161433235809,
"learning_rate": 7.23397578082489e-07,
"loss": 1.3094,
"step": 9650
},
{
"epoch": 0.35411855273287146,
"grad_norm": 5.100823292959423,
"learning_rate": 7.228813285123308e-07,
"loss": 1.3331,
"step": 9660
},
{
"epoch": 0.354485135085597,
"grad_norm": 4.534677424827633,
"learning_rate": 7.223647822473271e-07,
"loss": 1.3912,
"step": 9670
},
{
"epoch": 0.3548517174383225,
"grad_norm": 3.470979394380451,
"learning_rate": 7.218479399750934e-07,
"loss": 1.3476,
"step": 9680
},
{
"epoch": 0.3552182997910481,
"grad_norm": 4.753775104454421,
"learning_rate": 7.21330802383639e-07,
"loss": 1.3167,
"step": 9690
},
{
"epoch": 0.3555848821437736,
"grad_norm": 3.412263014571041,
"learning_rate": 7.208133701613665e-07,
"loss": 1.3358,
"step": 9700
},
{
"epoch": 0.3559514644964991,
"grad_norm": 4.131601355517602,
"learning_rate": 7.202956439970704e-07,
"loss": 1.3244,
"step": 9710
},
{
"epoch": 0.3563180468492247,
"grad_norm": 5.122163472630932,
"learning_rate": 7.197776245799367e-07,
"loss": 1.2796,
"step": 9720
},
{
"epoch": 0.3566846292019502,
"grad_norm": 5.335391466451254,
"learning_rate": 7.192593125995418e-07,
"loss": 1.3161,
"step": 9730
},
{
"epoch": 0.3570512115546758,
"grad_norm": 4.103339016303858,
"learning_rate": 7.187407087458518e-07,
"loss": 1.4146,
"step": 9740
},
{
"epoch": 0.3574177939074013,
"grad_norm": 5.904708913785668,
"learning_rate": 7.182218137092204e-07,
"loss": 1.3092,
"step": 9750
},
{
"epoch": 0.3577843762601268,
"grad_norm": 4.187532290173183,
"learning_rate": 7.1770262818039e-07,
"loss": 1.2946,
"step": 9760
},
{
"epoch": 0.3581509586128524,
"grad_norm": 4.6467762537942,
"learning_rate": 7.17183152850489e-07,
"loss": 1.3212,
"step": 9770
},
{
"epoch": 0.3585175409655779,
"grad_norm": 4.424491675585427,
"learning_rate": 7.16663388411032e-07,
"loss": 1.3167,
"step": 9780
},
{
"epoch": 0.35888412331830344,
"grad_norm": 4.460602913760459,
"learning_rate": 7.161433355539181e-07,
"loss": 1.3514,
"step": 9790
},
{
"epoch": 0.359250705671029,
"grad_norm": 7.380392542181771,
"learning_rate": 7.156229949714307e-07,
"loss": 1.305,
"step": 9800
},
{
"epoch": 0.35961728802375453,
"grad_norm": 3.677155226574757,
"learning_rate": 7.15102367356236e-07,
"loss": 1.3175,
"step": 9810
},
{
"epoch": 0.35998387037648005,
"grad_norm": 2.995203775176967,
"learning_rate": 7.145814534013821e-07,
"loss": 1.3833,
"step": 9820
},
{
"epoch": 0.36035045272920563,
"grad_norm": 3.5086546677463364,
"learning_rate": 7.140602538002989e-07,
"loss": 1.3858,
"step": 9830
},
{
"epoch": 0.36071703508193115,
"grad_norm": 3.523795917156669,
"learning_rate": 7.135387692467957e-07,
"loss": 1.3375,
"step": 9840
},
{
"epoch": 0.3610836174346567,
"grad_norm": 3.7313877963514,
"learning_rate": 7.130170004350617e-07,
"loss": 1.3094,
"step": 9850
},
{
"epoch": 0.36145019978738224,
"grad_norm": 4.442532041857861,
"learning_rate": 7.124949480596644e-07,
"loss": 1.3121,
"step": 9860
},
{
"epoch": 0.36181678214010776,
"grad_norm": 5.641090705197642,
"learning_rate": 7.119726128155487e-07,
"loss": 1.3387,
"step": 9870
},
{
"epoch": 0.36218336449283334,
"grad_norm": 9.369536303911914,
"learning_rate": 7.114499953980362e-07,
"loss": 1.3413,
"step": 9880
},
{
"epoch": 0.36254994684555886,
"grad_norm": 4.32109030408511,
"learning_rate": 7.109270965028238e-07,
"loss": 1.3636,
"step": 9890
},
{
"epoch": 0.3629165291982844,
"grad_norm": 6.871086039775216,
"learning_rate": 7.104039168259834e-07,
"loss": 1.352,
"step": 9900
},
{
"epoch": 0.36328311155100995,
"grad_norm": 4.509944406939018,
"learning_rate": 7.098804570639605e-07,
"loss": 1.2874,
"step": 9910
},
{
"epoch": 0.36364969390373547,
"grad_norm": 4.612863347134658,
"learning_rate": 7.093567179135738e-07,
"loss": 1.2676,
"step": 9920
},
{
"epoch": 0.364016276256461,
"grad_norm": 4.091094769005595,
"learning_rate": 7.088327000720131e-07,
"loss": 1.3038,
"step": 9930
},
{
"epoch": 0.36438285860918657,
"grad_norm": 4.977334963231582,
"learning_rate": 7.083084042368401e-07,
"loss": 1.3008,
"step": 9940
},
{
"epoch": 0.3647494409619121,
"grad_norm": 5.166826475680081,
"learning_rate": 7.077838311059862e-07,
"loss": 1.2881,
"step": 9950
},
{
"epoch": 0.36511602331463766,
"grad_norm": 4.01832965003142,
"learning_rate": 7.072589813777518e-07,
"loss": 1.3523,
"step": 9960
},
{
"epoch": 0.3654826056673632,
"grad_norm": 3.8045628665321214,
"learning_rate": 7.067338557508055e-07,
"loss": 1.3155,
"step": 9970
},
{
"epoch": 0.3658491880200887,
"grad_norm": 4.344284713227578,
"learning_rate": 7.062084549241833e-07,
"loss": 1.3314,
"step": 9980
},
{
"epoch": 0.3662157703728143,
"grad_norm": 4.559382806632024,
"learning_rate": 7.056827795972876e-07,
"loss": 1.3242,
"step": 9990
},
{
"epoch": 0.3665823527255398,
"grad_norm": 8.960735940046002,
"learning_rate": 7.051568304698862e-07,
"loss": 1.2563,
"step": 10000
},
{
"epoch": 0.3665823527255398,
"eval_accuracy": 0.7009188125309459,
"eval_loss": 1.3158118724822998,
"eval_runtime": 311.2198,
"eval_samples_per_second": 10.626,
"eval_steps_per_second": 0.887,
"step": 10000
}
],
"logging_steps": 10,
"max_steps": 27279,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 10000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1085213557587968.0,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}