{
"best_metric": 0.4339977502822876,
"best_model_checkpoint": "saves/CADICA_qwenvl_stenosis_detect_scale4/lora/sft/checkpoint-1250",
"epoch": 0.41205253669842906,
"eval_steps": 50,
"global_step": 1600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0012876641771825909,
"grad_norm": 13.245840411597928,
"learning_rate": 2.9411764705882355e-06,
"loss": 2.8889,
"num_input_tokens_seen": 52840,
"step": 5
},
{
"epoch": 0.0025753283543651817,
"grad_norm": 12.237619501215374,
"learning_rate": 5.882352941176471e-06,
"loss": 2.8165,
"num_input_tokens_seen": 105528,
"step": 10
},
{
"epoch": 0.0038629925315477724,
"grad_norm": 16.29688816410412,
"learning_rate": 8.823529411764707e-06,
"loss": 2.8363,
"num_input_tokens_seen": 158768,
"step": 15
},
{
"epoch": 0.0051506567087303634,
"grad_norm": 11.576419511120797,
"learning_rate": 1.1764705882352942e-05,
"loss": 2.6853,
"num_input_tokens_seen": 210816,
"step": 20
},
{
"epoch": 0.006438320885912954,
"grad_norm": 6.9672256792859,
"learning_rate": 1.4705882352941177e-05,
"loss": 2.2992,
"num_input_tokens_seen": 262936,
"step": 25
},
{
"epoch": 0.007725985063095545,
"grad_norm": 3.1837818528204305,
"learning_rate": 1.7647058823529414e-05,
"loss": 1.8923,
"num_input_tokens_seen": 315264,
"step": 30
},
{
"epoch": 0.009013649240278136,
"grad_norm": 2.835950303969337,
"learning_rate": 2.058823529411765e-05,
"loss": 1.6984,
"num_input_tokens_seen": 367840,
"step": 35
},
{
"epoch": 0.010301313417460727,
"grad_norm": 2.223740001042382,
"learning_rate": 2.3529411764705884e-05,
"loss": 1.6434,
"num_input_tokens_seen": 420112,
"step": 40
},
{
"epoch": 0.011588977594643318,
"grad_norm": 1.9880935044313244,
"learning_rate": 2.647058823529412e-05,
"loss": 1.4659,
"num_input_tokens_seen": 472728,
"step": 45
},
{
"epoch": 0.012876641771825908,
"grad_norm": 1.7151131700495934,
"learning_rate": 2.9411764705882354e-05,
"loss": 1.3506,
"num_input_tokens_seen": 524648,
"step": 50
},
{
"epoch": 0.012876641771825908,
"eval_loss": 1.1727452278137207,
"eval_runtime": 66.3207,
"eval_samples_per_second": 1.809,
"eval_steps_per_second": 0.452,
"num_input_tokens_seen": 524648,
"step": 50
},
{
"epoch": 0.014164305949008499,
"grad_norm": 1.47475981537851,
"learning_rate": 3.235294117647059e-05,
"loss": 1.1455,
"num_input_tokens_seen": 576472,
"step": 55
},
{
"epoch": 0.01545197012619109,
"grad_norm": 1.7476693647440722,
"learning_rate": 3.529411764705883e-05,
"loss": 0.9971,
"num_input_tokens_seen": 628056,
"step": 60
},
{
"epoch": 0.01673963430337368,
"grad_norm": 1.3384365493212875,
"learning_rate": 3.8235294117647055e-05,
"loss": 0.9073,
"num_input_tokens_seen": 680448,
"step": 65
},
{
"epoch": 0.018027298480556272,
"grad_norm": 0.9014358219807773,
"learning_rate": 4.11764705882353e-05,
"loss": 0.8386,
"num_input_tokens_seen": 733664,
"step": 70
},
{
"epoch": 0.01931496265773886,
"grad_norm": 0.8007820009902022,
"learning_rate": 4.411764705882353e-05,
"loss": 0.7827,
"num_input_tokens_seen": 786096,
"step": 75
},
{
"epoch": 0.020602626834921454,
"grad_norm": 0.6701003454307716,
"learning_rate": 4.705882352941177e-05,
"loss": 0.7814,
"num_input_tokens_seen": 838192,
"step": 80
},
{
"epoch": 0.021890291012104043,
"grad_norm": 0.8973165751658843,
"learning_rate": 5e-05,
"loss": 0.7297,
"num_input_tokens_seen": 890112,
"step": 85
},
{
"epoch": 0.023177955189286635,
"grad_norm": 0.9060968630490469,
"learning_rate": 5.294117647058824e-05,
"loss": 0.7894,
"num_input_tokens_seen": 943472,
"step": 90
},
{
"epoch": 0.024465619366469224,
"grad_norm": 0.9520214202472889,
"learning_rate": 5.588235294117647e-05,
"loss": 0.7758,
"num_input_tokens_seen": 996872,
"step": 95
},
{
"epoch": 0.025753283543651816,
"grad_norm": 0.8226006535044261,
"learning_rate": 5.882352941176471e-05,
"loss": 0.7577,
"num_input_tokens_seen": 1049816,
"step": 100
},
{
"epoch": 0.025753283543651816,
"eval_loss": 0.7517351508140564,
"eval_runtime": 38.7829,
"eval_samples_per_second": 3.094,
"eval_steps_per_second": 0.774,
"num_input_tokens_seen": 1049816,
"step": 100
},
{
"epoch": 0.027040947720834405,
"grad_norm": 0.7251208491150668,
"learning_rate": 6.176470588235295e-05,
"loss": 0.7579,
"num_input_tokens_seen": 1102584,
"step": 105
},
{
"epoch": 0.028328611898016998,
"grad_norm": 0.8217419839297042,
"learning_rate": 6.470588235294118e-05,
"loss": 0.7659,
"num_input_tokens_seen": 1155512,
"step": 110
},
{
"epoch": 0.029616276075199587,
"grad_norm": 0.6768053879888967,
"learning_rate": 6.764705882352942e-05,
"loss": 0.7469,
"num_input_tokens_seen": 1207976,
"step": 115
},
{
"epoch": 0.03090394025238218,
"grad_norm": 1.9562630849642013,
"learning_rate": 7.058823529411765e-05,
"loss": 0.7353,
"num_input_tokens_seen": 1259776,
"step": 120
},
{
"epoch": 0.03219160442956477,
"grad_norm": 0.6439041597153087,
"learning_rate": 7.352941176470589e-05,
"loss": 0.7537,
"num_input_tokens_seen": 1312760,
"step": 125
},
{
"epoch": 0.03347926860674736,
"grad_norm": 0.6124318582166212,
"learning_rate": 7.647058823529411e-05,
"loss": 0.7669,
"num_input_tokens_seen": 1365616,
"step": 130
},
{
"epoch": 0.03476693278392995,
"grad_norm": 0.7593534002488418,
"learning_rate": 7.941176470588235e-05,
"loss": 0.722,
"num_input_tokens_seen": 1417544,
"step": 135
},
{
"epoch": 0.036054596961112545,
"grad_norm": 0.7827834651032061,
"learning_rate": 8.23529411764706e-05,
"loss": 0.7502,
"num_input_tokens_seen": 1469856,
"step": 140
},
{
"epoch": 0.037342261138295134,
"grad_norm": 0.5444126155596626,
"learning_rate": 8.529411764705883e-05,
"loss": 0.7174,
"num_input_tokens_seen": 1521496,
"step": 145
},
{
"epoch": 0.03862992531547772,
"grad_norm": 0.40878703812837747,
"learning_rate": 8.823529411764706e-05,
"loss": 0.7018,
"num_input_tokens_seen": 1573376,
"step": 150
},
{
"epoch": 0.03862992531547772,
"eval_loss": 0.7309949994087219,
"eval_runtime": 38.2005,
"eval_samples_per_second": 3.141,
"eval_steps_per_second": 0.785,
"num_input_tokens_seen": 1573376,
"step": 150
},
{
"epoch": 0.03991758949266031,
"grad_norm": 0.5536144453733772,
"learning_rate": 9.11764705882353e-05,
"loss": 0.738,
"num_input_tokens_seen": 1626136,
"step": 155
},
{
"epoch": 0.04120525366984291,
"grad_norm": 0.5151715191704441,
"learning_rate": 9.411764705882353e-05,
"loss": 0.7579,
"num_input_tokens_seen": 1678760,
"step": 160
},
{
"epoch": 0.042492917847025496,
"grad_norm": 0.5209077394596254,
"learning_rate": 9.705882352941177e-05,
"loss": 0.7502,
"num_input_tokens_seen": 1731240,
"step": 165
},
{
"epoch": 0.043780582024208085,
"grad_norm": 0.721213601237688,
"learning_rate": 0.0001,
"loss": 0.7448,
"num_input_tokens_seen": 1783816,
"step": 170
},
{
"epoch": 0.045068246201390674,
"grad_norm": 0.48666007914879555,
"learning_rate": 9.999940874631277e-05,
"loss": 0.6648,
"num_input_tokens_seen": 1834592,
"step": 175
},
{
"epoch": 0.04635591037857327,
"grad_norm": 0.5136600613696797,
"learning_rate": 9.999763499923432e-05,
"loss": 0.7759,
"num_input_tokens_seen": 1888176,
"step": 180
},
{
"epoch": 0.04764357455575586,
"grad_norm": 0.6706281530046975,
"learning_rate": 9.999467880071402e-05,
"loss": 0.7167,
"num_input_tokens_seen": 1940280,
"step": 185
},
{
"epoch": 0.04893123873293845,
"grad_norm": 0.5159139445497618,
"learning_rate": 9.999054022066641e-05,
"loss": 0.7483,
"num_input_tokens_seen": 1993096,
"step": 190
},
{
"epoch": 0.050218902910121044,
"grad_norm": 0.40251006129746847,
"learning_rate": 9.998521935696953e-05,
"loss": 0.7464,
"num_input_tokens_seen": 2045648,
"step": 195
},
{
"epoch": 0.05150656708730363,
"grad_norm": 0.4811730853311867,
"learning_rate": 9.997871633546257e-05,
"loss": 0.7594,
"num_input_tokens_seen": 2099008,
"step": 200
},
{
"epoch": 0.05150656708730363,
"eval_loss": 0.7274295687675476,
"eval_runtime": 38.079,
"eval_samples_per_second": 3.151,
"eval_steps_per_second": 0.788,
"num_input_tokens_seen": 2099008,
"step": 200
},
{
"epoch": 0.05279423126448622,
"grad_norm": 0.591934959695668,
"learning_rate": 9.997103130994296e-05,
"loss": 0.706,
"num_input_tokens_seen": 2151680,
"step": 205
},
{
"epoch": 0.05408189544166881,
"grad_norm": 0.48253717444489286,
"learning_rate": 9.996216446216267e-05,
"loss": 0.7186,
"num_input_tokens_seen": 2203784,
"step": 210
},
{
"epoch": 0.055369559618851406,
"grad_norm": 0.5274315079401322,
"learning_rate": 9.995211600182397e-05,
"loss": 0.7009,
"num_input_tokens_seen": 2255632,
"step": 215
},
{
"epoch": 0.056657223796033995,
"grad_norm": 0.32879215224292613,
"learning_rate": 9.994088616657444e-05,
"loss": 0.6801,
"num_input_tokens_seen": 2308096,
"step": 220
},
{
"epoch": 0.057944887973216584,
"grad_norm": 0.37171195071448215,
"learning_rate": 9.992847522200133e-05,
"loss": 0.7569,
"num_input_tokens_seen": 2361168,
"step": 225
},
{
"epoch": 0.05923255215039917,
"grad_norm": 0.4120941016934064,
"learning_rate": 9.99148834616253e-05,
"loss": 0.7402,
"num_input_tokens_seen": 2413896,
"step": 230
},
{
"epoch": 0.06052021632758177,
"grad_norm": 0.5998680948310651,
"learning_rate": 9.990011120689351e-05,
"loss": 0.7191,
"num_input_tokens_seen": 2466136,
"step": 235
},
{
"epoch": 0.06180788050476436,
"grad_norm": 0.538488141249078,
"learning_rate": 9.988415880717194e-05,
"loss": 0.7274,
"num_input_tokens_seen": 2518848,
"step": 240
},
{
"epoch": 0.06309554468194695,
"grad_norm": 0.4393093124760277,
"learning_rate": 9.986702663973722e-05,
"loss": 0.7704,
"num_input_tokens_seen": 2572384,
"step": 245
},
{
"epoch": 0.06438320885912954,
"grad_norm": 0.6116643616510118,
"learning_rate": 9.98487151097676e-05,
"loss": 0.7346,
"num_input_tokens_seen": 2625352,
"step": 250
},
{
"epoch": 0.06438320885912954,
"eval_loss": 0.7181503176689148,
"eval_runtime": 38.0986,
"eval_samples_per_second": 3.15,
"eval_steps_per_second": 0.787,
"num_input_tokens_seen": 2625352,
"step": 250
},
{
"epoch": 0.06567087303631212,
"grad_norm": 0.41200227731339506,
"learning_rate": 9.98292246503335e-05,
"loss": 0.7408,
"num_input_tokens_seen": 2678216,
"step": 255
},
{
"epoch": 0.06695853721349472,
"grad_norm": 0.44521059732114987,
"learning_rate": 9.980855572238714e-05,
"loss": 0.7044,
"num_input_tokens_seen": 2730664,
"step": 260
},
{
"epoch": 0.06824620139067732,
"grad_norm": 0.571896859428363,
"learning_rate": 9.978670881475172e-05,
"loss": 0.7334,
"num_input_tokens_seen": 2783584,
"step": 265
},
{
"epoch": 0.0695338655678599,
"grad_norm": 0.3907697039722125,
"learning_rate": 9.976368444410985e-05,
"loss": 0.7075,
"num_input_tokens_seen": 2836152,
"step": 270
},
{
"epoch": 0.0708215297450425,
"grad_norm": 0.4507806825752261,
"learning_rate": 9.973948315499126e-05,
"loss": 0.7039,
"num_input_tokens_seen": 2887808,
"step": 275
},
{
"epoch": 0.07210919392222509,
"grad_norm": 0.41330504132984697,
"learning_rate": 9.971410551976002e-05,
"loss": 0.6953,
"num_input_tokens_seen": 2939656,
"step": 280
},
{
"epoch": 0.07339685809940767,
"grad_norm": 0.4625671909482009,
"learning_rate": 9.968755213860094e-05,
"loss": 0.7022,
"num_input_tokens_seen": 2991632,
"step": 285
},
{
"epoch": 0.07468452227659027,
"grad_norm": 0.6553627840267285,
"learning_rate": 9.96598236395054e-05,
"loss": 0.6796,
"num_input_tokens_seen": 3043616,
"step": 290
},
{
"epoch": 0.07597218645377285,
"grad_norm": 0.5157886895754477,
"learning_rate": 9.96309206782565e-05,
"loss": 0.7346,
"num_input_tokens_seen": 3096920,
"step": 295
},
{
"epoch": 0.07725985063095545,
"grad_norm": 0.5672965149433489,
"learning_rate": 9.960084393841355e-05,
"loss": 0.6815,
"num_input_tokens_seen": 3149032,
"step": 300
},
{
"epoch": 0.07725985063095545,
"eval_loss": 0.7073924541473389,
"eval_runtime": 38.1842,
"eval_samples_per_second": 3.143,
"eval_steps_per_second": 0.786,
"num_input_tokens_seen": 3149032,
"step": 300
},
{
"epoch": 0.07854751480813804,
"grad_norm": 0.4479276285203507,
"learning_rate": 9.956959413129585e-05,
"loss": 0.7208,
"num_input_tokens_seen": 3201560,
"step": 305
},
{
"epoch": 0.07983517898532062,
"grad_norm": 0.368457437106614,
"learning_rate": 9.953717199596598e-05,
"loss": 0.7144,
"num_input_tokens_seen": 3254632,
"step": 310
},
{
"epoch": 0.08112284316250322,
"grad_norm": 0.5531413254856732,
"learning_rate": 9.95035782992122e-05,
"loss": 0.6861,
"num_input_tokens_seen": 3306432,
"step": 315
},
{
"epoch": 0.08241050733968582,
"grad_norm": 0.41513991799613037,
"learning_rate": 9.94688138355304e-05,
"loss": 0.6836,
"num_input_tokens_seen": 3358392,
"step": 320
},
{
"epoch": 0.0836981715168684,
"grad_norm": 0.47052274706452957,
"learning_rate": 9.943287942710527e-05,
"loss": 0.7353,
"num_input_tokens_seen": 3411424,
"step": 325
},
{
"epoch": 0.08498583569405099,
"grad_norm": 0.6322586593511644,
"learning_rate": 9.939577592379088e-05,
"loss": 0.6774,
"num_input_tokens_seen": 3462992,
"step": 330
},
{
"epoch": 0.08627349987123359,
"grad_norm": 0.4129597798905344,
"learning_rate": 9.935750420309055e-05,
"loss": 0.7331,
"num_input_tokens_seen": 3516136,
"step": 335
},
{
"epoch": 0.08756116404841617,
"grad_norm": 0.4031509882699161,
"learning_rate": 9.931806517013612e-05,
"loss": 0.6939,
"num_input_tokens_seen": 3568360,
"step": 340
},
{
"epoch": 0.08884882822559877,
"grad_norm": 0.4444358747076587,
"learning_rate": 9.927745975766654e-05,
"loss": 0.7158,
"num_input_tokens_seen": 3620696,
"step": 345
},
{
"epoch": 0.09013649240278135,
"grad_norm": 0.5290547365449167,
"learning_rate": 9.923568892600578e-05,
"loss": 0.6932,
"num_input_tokens_seen": 3673152,
"step": 350
},
{
"epoch": 0.09013649240278135,
"eval_loss": 0.7044599056243896,
"eval_runtime": 38.2709,
"eval_samples_per_second": 3.136,
"eval_steps_per_second": 0.784,
"num_input_tokens_seen": 3673152,
"step": 350
},
{
"epoch": 0.09142415657996394,
"grad_norm": 0.47530311368359207,
"learning_rate": 9.91927536630402e-05,
"loss": 0.6778,
"num_input_tokens_seen": 3725296,
"step": 355
},
{
"epoch": 0.09271182075714654,
"grad_norm": 0.38913022785688944,
"learning_rate": 9.91486549841951e-05,
"loss": 0.6857,
"num_input_tokens_seen": 3777552,
"step": 360
},
{
"epoch": 0.09399948493432912,
"grad_norm": 0.4834773141333328,
"learning_rate": 9.91033939324107e-05,
"loss": 0.7184,
"num_input_tokens_seen": 3830200,
"step": 365
},
{
"epoch": 0.09528714911151172,
"grad_norm": 0.5862045807150876,
"learning_rate": 9.905697157811761e-05,
"loss": 0.7196,
"num_input_tokens_seen": 3883200,
"step": 370
},
{
"epoch": 0.09657481328869431,
"grad_norm": 0.4576971522205563,
"learning_rate": 9.900938901921131e-05,
"loss": 0.6914,
"num_input_tokens_seen": 3935576,
"step": 375
},
{
"epoch": 0.0978624774658769,
"grad_norm": 0.49551517524520683,
"learning_rate": 9.896064738102635e-05,
"loss": 0.6681,
"num_input_tokens_seen": 3987624,
"step": 380
},
{
"epoch": 0.09915014164305949,
"grad_norm": 0.8198390819787913,
"learning_rate": 9.891074781630966e-05,
"loss": 0.6723,
"num_input_tokens_seen": 4039680,
"step": 385
},
{
"epoch": 0.10043780582024209,
"grad_norm": 0.7034626469978683,
"learning_rate": 9.885969150519331e-05,
"loss": 0.6498,
"num_input_tokens_seen": 4091216,
"step": 390
},
{
"epoch": 0.10172546999742467,
"grad_norm": 0.8838075623197742,
"learning_rate": 9.88074796551666e-05,
"loss": 0.7311,
"num_input_tokens_seen": 4144264,
"step": 395
},
{
"epoch": 0.10301313417460727,
"grad_norm": 0.7342758386202114,
"learning_rate": 9.875411350104744e-05,
"loss": 0.7089,
"num_input_tokens_seen": 4197072,
"step": 400
},
{
"epoch": 0.10301313417460727,
"eval_loss": 0.6847750544548035,
"eval_runtime": 37.9238,
"eval_samples_per_second": 3.164,
"eval_steps_per_second": 0.791,
"num_input_tokens_seen": 4197072,
"step": 400
},
{
"epoch": 0.10430079835178985,
"grad_norm": 0.8113533605928532,
"learning_rate": 9.86995943049533e-05,
"loss": 0.7021,
"num_input_tokens_seen": 4249656,
"step": 405
},
{
"epoch": 0.10558846252897244,
"grad_norm": 1.1772677082041305,
"learning_rate": 9.864392335627117e-05,
"loss": 0.6943,
"num_input_tokens_seen": 4302944,
"step": 410
},
{
"epoch": 0.10687612670615504,
"grad_norm": 1.6493280510697776,
"learning_rate": 9.858710197162721e-05,
"loss": 0.7146,
"num_input_tokens_seen": 4355480,
"step": 415
},
{
"epoch": 0.10816379088333762,
"grad_norm": 3.0159798803441715,
"learning_rate": 9.852913149485556e-05,
"loss": 0.6312,
"num_input_tokens_seen": 4407688,
"step": 420
},
{
"epoch": 0.10945145506052022,
"grad_norm": 1.7981196843056153,
"learning_rate": 9.847001329696653e-05,
"loss": 0.6877,
"num_input_tokens_seen": 4459736,
"step": 425
},
{
"epoch": 0.11073911923770281,
"grad_norm": 1.5783278376799834,
"learning_rate": 9.840974877611422e-05,
"loss": 0.6975,
"num_input_tokens_seen": 4512928,
"step": 430
},
{
"epoch": 0.1120267834148854,
"grad_norm": 3.306646516615779,
"learning_rate": 9.834833935756344e-05,
"loss": 0.651,
"num_input_tokens_seen": 4565840,
"step": 435
},
{
"epoch": 0.11331444759206799,
"grad_norm": 2.3184973874904005,
"learning_rate": 9.828578649365601e-05,
"loss": 0.685,
"num_input_tokens_seen": 4618168,
"step": 440
},
{
"epoch": 0.11460211176925057,
"grad_norm": 1.602690016495642,
"learning_rate": 9.822209166377635e-05,
"loss": 0.6258,
"num_input_tokens_seen": 4669784,
"step": 445
},
{
"epoch": 0.11588977594643317,
"grad_norm": 2.6770797227308196,
"learning_rate": 9.815725637431662e-05,
"loss": 0.6732,
"num_input_tokens_seen": 4722528,
"step": 450
},
{
"epoch": 0.11588977594643317,
"eval_loss": 0.6526497006416321,
"eval_runtime": 39.085,
"eval_samples_per_second": 3.07,
"eval_steps_per_second": 0.768,
"num_input_tokens_seen": 4722528,
"step": 450
},
{
"epoch": 0.11717744012361576,
"grad_norm": 2.1823349329218074,
"learning_rate": 9.809128215864097e-05,
"loss": 0.6544,
"num_input_tokens_seen": 4774400,
"step": 455
},
{
"epoch": 0.11846510430079835,
"grad_norm": 1.434521593914191,
"learning_rate": 9.802417057704931e-05,
"loss": 0.652,
"num_input_tokens_seen": 4826704,
"step": 460
},
{
"epoch": 0.11975276847798094,
"grad_norm": 2.399754385687283,
"learning_rate": 9.795592321674045e-05,
"loss": 0.6582,
"num_input_tokens_seen": 4880072,
"step": 465
},
{
"epoch": 0.12104043265516354,
"grad_norm": 3.9235176077985536,
"learning_rate": 9.788654169177453e-05,
"loss": 0.6506,
"num_input_tokens_seen": 4931968,
"step": 470
},
{
"epoch": 0.12232809683234612,
"grad_norm": 3.659330745777227,
"learning_rate": 9.781602764303487e-05,
"loss": 0.6551,
"num_input_tokens_seen": 4983656,
"step": 475
},
{
"epoch": 0.12361576100952872,
"grad_norm": 1.9670601503398757,
"learning_rate": 9.774438273818911e-05,
"loss": 0.6978,
"num_input_tokens_seen": 5036528,
"step": 480
},
{
"epoch": 0.12490342518671131,
"grad_norm": 1.308580869419328,
"learning_rate": 9.767160867164979e-05,
"loss": 0.6407,
"num_input_tokens_seen": 5088768,
"step": 485
},
{
"epoch": 0.1261910893638939,
"grad_norm": 1.7349486072682865,
"learning_rate": 9.759770716453436e-05,
"loss": 0.6641,
"num_input_tokens_seen": 5142080,
"step": 490
},
{
"epoch": 0.1274787535410765,
"grad_norm": 2.993327939872198,
"learning_rate": 9.752267996462434e-05,
"loss": 0.6588,
"num_input_tokens_seen": 5194432,
"step": 495
},
{
"epoch": 0.12876641771825909,
"grad_norm": 2.6430988002320976,
"learning_rate": 9.744652884632406e-05,
"loss": 0.6304,
"num_input_tokens_seen": 5246640,
"step": 500
},
{
"epoch": 0.12876641771825909,
"eval_loss": 0.6272165775299072,
"eval_runtime": 39.4177,
"eval_samples_per_second": 3.044,
"eval_steps_per_second": 0.761,
"num_input_tokens_seen": 5246640,
"step": 500
},
{
"epoch": 0.13005408189544168,
"grad_norm": 2.6047672112920286,
"learning_rate": 9.736925561061871e-05,
"loss": 0.5741,
"num_input_tokens_seen": 5299024,
"step": 505
},
{
"epoch": 0.13134174607262425,
"grad_norm": 2.4706517190834063,
"learning_rate": 9.729086208503174e-05,
"loss": 0.6535,
"num_input_tokens_seen": 5352664,
"step": 510
},
{
"epoch": 0.13262941024980685,
"grad_norm": 2.031672226684599,
"learning_rate": 9.721135012358156e-05,
"loss": 0.6081,
"num_input_tokens_seen": 5406008,
"step": 515
},
{
"epoch": 0.13391707442698944,
"grad_norm": 2.773997809426142,
"learning_rate": 9.713072160673777e-05,
"loss": 0.6792,
"num_input_tokens_seen": 5459368,
"step": 520
},
{
"epoch": 0.13520473860417204,
"grad_norm": 5.083057729524855,
"learning_rate": 9.704897844137673e-05,
"loss": 0.6821,
"num_input_tokens_seen": 5512960,
"step": 525
},
{
"epoch": 0.13649240278135463,
"grad_norm": 3.0440654843385584,
"learning_rate": 9.696612256073633e-05,
"loss": 0.5835,
"num_input_tokens_seen": 5565368,
"step": 530
},
{
"epoch": 0.1377800669585372,
"grad_norm": 3.7400231170971323,
"learning_rate": 9.688215592437039e-05,
"loss": 0.6129,
"num_input_tokens_seen": 5618008,
"step": 535
},
{
"epoch": 0.1390677311357198,
"grad_norm": 6.340287952379529,
"learning_rate": 9.679708051810221e-05,
"loss": 0.5765,
"num_input_tokens_seen": 5670072,
"step": 540
},
{
"epoch": 0.1403553953129024,
"grad_norm": 3.6351560550229207,
"learning_rate": 9.67108983539777e-05,
"loss": 0.6325,
"num_input_tokens_seen": 5722936,
"step": 545
},
{
"epoch": 0.141643059490085,
"grad_norm": 3.8363425916745117,
"learning_rate": 9.662361147021779e-05,
"loss": 0.5596,
"num_input_tokens_seen": 5774880,
"step": 550
},
{
"epoch": 0.141643059490085,
"eval_loss": 0.5832681059837341,
"eval_runtime": 38.2495,
"eval_samples_per_second": 3.137,
"eval_steps_per_second": 0.784,
"num_input_tokens_seen": 5774880,
"step": 550
},
{
"epoch": 0.14293072366726758,
"grad_norm": 3.911447203674744,
"learning_rate": 9.653522193117013e-05,
"loss": 0.5073,
"num_input_tokens_seen": 5826608,
"step": 555
},
{
"epoch": 0.14421838784445018,
"grad_norm": 3.3501835856945763,
"learning_rate": 9.644573182726035e-05,
"loss": 0.5652,
"num_input_tokens_seen": 5879776,
"step": 560
},
{
"epoch": 0.14550605202163275,
"grad_norm": 8.75758822201328,
"learning_rate": 9.63551432749426e-05,
"loss": 0.5727,
"num_input_tokens_seen": 5932888,
"step": 565
},
{
"epoch": 0.14679371619881534,
"grad_norm": 4.351029258458384,
"learning_rate": 9.626345841664953e-05,
"loss": 0.6251,
"num_input_tokens_seen": 5984648,
"step": 570
},
{
"epoch": 0.14808138037599794,
"grad_norm": 7.617020699535255,
"learning_rate": 9.617067942074153e-05,
"loss": 0.6508,
"num_input_tokens_seen": 6037000,
"step": 575
},
{
"epoch": 0.14936904455318054,
"grad_norm": 7.293430172750479,
"learning_rate": 9.607680848145558e-05,
"loss": 0.6686,
"num_input_tokens_seen": 6090512,
"step": 580
},
{
"epoch": 0.15065670873036313,
"grad_norm": 3.3635276124166653,
"learning_rate": 9.598184781885318e-05,
"loss": 0.5793,
"num_input_tokens_seen": 6143320,
"step": 585
},
{
"epoch": 0.1519443729075457,
"grad_norm": 2.7589160396339407,
"learning_rate": 9.588579967876806e-05,
"loss": 0.5954,
"num_input_tokens_seen": 6195720,
"step": 590
},
{
"epoch": 0.1532320370847283,
"grad_norm": 1.582169884399532,
"learning_rate": 9.578866633275288e-05,
"loss": 0.5644,
"num_input_tokens_seen": 6247592,
"step": 595
},
{
"epoch": 0.1545197012619109,
"grad_norm": 3.891844940061855,
"learning_rate": 9.569045007802559e-05,
"loss": 0.5794,
"num_input_tokens_seen": 6299656,
"step": 600
},
{
"epoch": 0.1545197012619109,
"eval_loss": 0.6039358973503113,
"eval_runtime": 38.3138,
"eval_samples_per_second": 3.132,
"eval_steps_per_second": 0.783,
"num_input_tokens_seen": 6299656,
"step": 600
},
{
"epoch": 0.1558073654390935,
"grad_norm": 5.90634634073773,
"learning_rate": 9.55911532374151e-05,
"loss": 0.6106,
"num_input_tokens_seen": 6351680,
"step": 605
},
{
"epoch": 0.15709502961627608,
"grad_norm": 3.5429043559071034,
"learning_rate": 9.549077815930636e-05,
"loss": 0.5812,
"num_input_tokens_seen": 6403648,
"step": 610
},
{
"epoch": 0.15838269379345868,
"grad_norm": 2.8753548663225144,
"learning_rate": 9.538932721758474e-05,
"loss": 0.5992,
"num_input_tokens_seen": 6456328,
"step": 615
},
{
"epoch": 0.15967035797064125,
"grad_norm": 2.4013005755622467,
"learning_rate": 9.528680281157999e-05,
"loss": 0.587,
"num_input_tokens_seen": 6509024,
"step": 620
},
{
"epoch": 0.16095802214782384,
"grad_norm": 3.860358696946306,
"learning_rate": 9.518320736600943e-05,
"loss": 0.5836,
"num_input_tokens_seen": 6561336,
"step": 625
},
{
"epoch": 0.16224568632500644,
"grad_norm": 3.187917212328382,
"learning_rate": 9.507854333092063e-05,
"loss": 0.5913,
"num_input_tokens_seen": 6614024,
"step": 630
},
{
"epoch": 0.16353335050218903,
"grad_norm": 3.5342177024321586,
"learning_rate": 9.497281318163346e-05,
"loss": 0.5693,
"num_input_tokens_seen": 6666416,
"step": 635
},
{
"epoch": 0.16482101467937163,
"grad_norm": 3.90374612709263,
"learning_rate": 9.486601941868154e-05,
"loss": 0.572,
"num_input_tokens_seen": 6718200,
"step": 640
},
{
"epoch": 0.1661086788565542,
"grad_norm": 4.4270591027201665,
"learning_rate": 9.475816456775313e-05,
"loss": 0.6111,
"num_input_tokens_seen": 6771256,
"step": 645
},
{
"epoch": 0.1673963430337368,
"grad_norm": 5.04761388655614,
"learning_rate": 9.464925117963133e-05,
"loss": 0.5959,
"num_input_tokens_seen": 6824008,
"step": 650
},
{
"epoch": 0.1673963430337368,
"eval_loss": 0.5542036890983582,
"eval_runtime": 68.9048,
"eval_samples_per_second": 1.742,
"eval_steps_per_second": 0.435,
"num_input_tokens_seen": 6824008,
"step": 650
},
{
"epoch": 0.1686840072109194,
"grad_norm": 3.428410481447858,
"learning_rate": 9.453928183013385e-05,
"loss": 0.5344,
"num_input_tokens_seen": 6875432,
"step": 655
},
{
"epoch": 0.16997167138810199,
"grad_norm": 2.9137495299009846,
"learning_rate": 9.442825912005202e-05,
"loss": 0.56,
"num_input_tokens_seen": 6927768,
"step": 660
},
{
"epoch": 0.17125933556528458,
"grad_norm": 4.2956604210715925,
"learning_rate": 9.431618567508933e-05,
"loss": 0.5701,
"num_input_tokens_seen": 6980544,
"step": 665
},
{
"epoch": 0.17254699974246718,
"grad_norm": 4.3977584083656405,
"learning_rate": 9.420306414579925e-05,
"loss": 0.5604,
"num_input_tokens_seen": 7032584,
"step": 670
},
{
"epoch": 0.17383466391964975,
"grad_norm": 4.48381006313936,
"learning_rate": 9.408889720752266e-05,
"loss": 0.5763,
"num_input_tokens_seen": 7085048,
"step": 675
},
{
"epoch": 0.17512232809683234,
"grad_norm": 2.189534287393346,
"learning_rate": 9.397368756032445e-05,
"loss": 0.5962,
"num_input_tokens_seen": 7137952,
"step": 680
},
{
"epoch": 0.17640999227401494,
"grad_norm": 3.34591241093722,
"learning_rate": 9.385743792892982e-05,
"loss": 0.5935,
"num_input_tokens_seen": 7190584,
"step": 685
},
{
"epoch": 0.17769765645119753,
"grad_norm": 2.7509902524242507,
"learning_rate": 9.374015106265968e-05,
"loss": 0.5267,
"num_input_tokens_seen": 7243440,
"step": 690
},
{
"epoch": 0.17898532062838013,
"grad_norm": 2.322454948468365,
"learning_rate": 9.362182973536569e-05,
"loss": 0.5351,
"num_input_tokens_seen": 7295568,
"step": 695
},
{
"epoch": 0.1802729848055627,
"grad_norm": 3.4615171229405046,
"learning_rate": 9.35024767453647e-05,
"loss": 0.5014,
"num_input_tokens_seen": 7347040,
"step": 700
},
{
"epoch": 0.1802729848055627,
"eval_loss": 0.5440100431442261,
"eval_runtime": 39.1181,
"eval_samples_per_second": 3.068,
"eval_steps_per_second": 0.767,
"num_input_tokens_seen": 7347040,
"step": 700
},
{
"epoch": 0.1815606489827453,
"grad_norm": 4.815426816055898,
"learning_rate": 9.338209491537257e-05,
"loss": 0.543,
"num_input_tokens_seen": 7399584,
"step": 705
},
{
"epoch": 0.1828483131599279,
"grad_norm": 7.294932559918336,
"learning_rate": 9.326068709243727e-05,
"loss": 0.4995,
"num_input_tokens_seen": 7452928,
"step": 710
},
{
"epoch": 0.18413597733711048,
"grad_norm": 3.6946433405013495,
"learning_rate": 9.313825614787177e-05,
"loss": 0.5109,
"num_input_tokens_seen": 7505112,
"step": 715
},
{
"epoch": 0.18542364151429308,
"grad_norm": 4.339671310261357,
"learning_rate": 9.301480497718593e-05,
"loss": 0.4932,
"num_input_tokens_seen": 7557608,
"step": 720
},
{
"epoch": 0.18671130569147568,
"grad_norm": 11.604530853746237,
"learning_rate": 9.289033650001817e-05,
"loss": 0.5573,
"num_input_tokens_seen": 7610048,
"step": 725
},
{
"epoch": 0.18799896986865824,
"grad_norm": 5.990020165378009,
"learning_rate": 9.276485366006634e-05,
"loss": 0.5305,
"num_input_tokens_seen": 7662056,
"step": 730
},
{
"epoch": 0.18928663404584084,
"grad_norm": 4.709895983169237,
"learning_rate": 9.263835942501807e-05,
"loss": 0.5369,
"num_input_tokens_seen": 7713656,
"step": 735
},
{
"epoch": 0.19057429822302344,
"grad_norm": 4.873824727341975,
"learning_rate": 9.251085678648072e-05,
"loss": 0.5397,
"num_input_tokens_seen": 7765992,
"step": 740
},
{
"epoch": 0.19186196240020603,
"grad_norm": 3.288968567031419,
"learning_rate": 9.238234875991046e-05,
"loss": 0.5116,
"num_input_tokens_seen": 7818448,
"step": 745
},
{
"epoch": 0.19314962657738863,
"grad_norm": 4.778741391076671,
"learning_rate": 9.225283838454111e-05,
"loss": 0.541,
"num_input_tokens_seen": 7870520,
"step": 750
},
{
"epoch": 0.19314962657738863,
"eval_loss": 0.5273815989494324,
"eval_runtime": 39.1812,
"eval_samples_per_second": 3.063,
"eval_steps_per_second": 0.766,
"num_input_tokens_seen": 7870520,
"step": 750
},
{
"epoch": 0.1944372907545712,
"grad_norm": 4.544356566141105,
"learning_rate": 9.21223287233121e-05,
"loss": 0.4961,
"num_input_tokens_seen": 7922736,
"step": 755
},
{
"epoch": 0.1957249549317538,
"grad_norm": 7.025876813077666,
"learning_rate": 9.199082286279622e-05,
"loss": 0.4956,
"num_input_tokens_seen": 7975304,
"step": 760
},
{
"epoch": 0.1970126191089364,
"grad_norm": 4.9360968239249985,
"learning_rate": 9.185832391312644e-05,
"loss": 0.4997,
"num_input_tokens_seen": 8027448,
"step": 765
},
{
"epoch": 0.19830028328611898,
"grad_norm": 10.528361984915874,
"learning_rate": 9.172483500792244e-05,
"loss": 0.5214,
"num_input_tokens_seen": 8080944,
"step": 770
},
{
"epoch": 0.19958794746330158,
"grad_norm": 9.264531258094065,
"learning_rate": 9.159035930421658e-05,
"loss": 0.6098,
"num_input_tokens_seen": 8133392,
"step": 775
},
{
"epoch": 0.20087561164048418,
"grad_norm": 1.9709167614209242,
"learning_rate": 9.145489998237902e-05,
"loss": 0.5046,
"num_input_tokens_seen": 8185360,
"step": 780
},
{
"epoch": 0.20216327581766674,
"grad_norm": 7.5915211434567595,
"learning_rate": 9.131846024604274e-05,
"loss": 0.5803,
"num_input_tokens_seen": 8237672,
"step": 785
},
{
"epoch": 0.20345093999484934,
"grad_norm": 3.251682970663388,
"learning_rate": 9.11810433220276e-05,
"loss": 0.5365,
"num_input_tokens_seen": 8289688,
"step": 790
},
{
"epoch": 0.20473860417203193,
"grad_norm": 4.341533737034294,
"learning_rate": 9.104265246026415e-05,
"loss": 0.5259,
"num_input_tokens_seen": 8341624,
"step": 795
},
{
"epoch": 0.20602626834921453,
"grad_norm": 5.463180544339495,
"learning_rate": 9.090329093371666e-05,
"loss": 0.5291,
"num_input_tokens_seen": 8393696,
"step": 800
},
{
"epoch": 0.20602626834921453,
"eval_loss": 0.5219093561172485,
"eval_runtime": 39.7455,
"eval_samples_per_second": 3.019,
"eval_steps_per_second": 0.755,
"num_input_tokens_seen": 8393696,
"step": 800
},
{
"epoch": 0.20731393252639713,
"grad_norm": 4.254130676908817,
"learning_rate": 9.076296203830579e-05,
"loss": 0.5449,
"num_input_tokens_seen": 8446496,
"step": 805
},
{
"epoch": 0.2086015967035797,
"grad_norm": 5.6525741285524145,
"learning_rate": 9.062166909283062e-05,
"loss": 0.5625,
"num_input_tokens_seen": 8499544,
"step": 810
},
{
"epoch": 0.2098892608807623,
"grad_norm": 3.8041246225911345,
"learning_rate": 9.047941543889014e-05,
"loss": 0.5564,
"num_input_tokens_seen": 8552568,
"step": 815
},
{
"epoch": 0.2111769250579449,
"grad_norm": 3.803732280546421,
"learning_rate": 9.033620444080428e-05,
"loss": 0.5487,
"num_input_tokens_seen": 8605560,
"step": 820
},
{
"epoch": 0.21246458923512748,
"grad_norm": 2.8518948364927925,
"learning_rate": 9.019203948553422e-05,
"loss": 0.5719,
"num_input_tokens_seen": 8657704,
"step": 825
},
{
"epoch": 0.21375225341231008,
"grad_norm": 3.939376115862177,
"learning_rate": 9.004692398260244e-05,
"loss": 0.5235,
"num_input_tokens_seen": 8711088,
"step": 830
},
{
"epoch": 0.21503991758949267,
"grad_norm": 6.635912128499916,
"learning_rate": 8.9900861364012e-05,
"loss": 0.5566,
"num_input_tokens_seen": 8763712,
"step": 835
},
{
"epoch": 0.21632758176667524,
"grad_norm": 3.7547407090496687,
"learning_rate": 8.975385508416532e-05,
"loss": 0.482,
"num_input_tokens_seen": 8815760,
"step": 840
},
{
"epoch": 0.21761524594385784,
"grad_norm": 4.093006904445721,
"learning_rate": 8.960590861978265e-05,
"loss": 0.5046,
"num_input_tokens_seen": 8867720,
"step": 845
},
{
"epoch": 0.21890291012104043,
"grad_norm": 11.397392997722068,
"learning_rate": 8.945702546981969e-05,
"loss": 0.5063,
"num_input_tokens_seen": 8919608,
"step": 850
},
{
"epoch": 0.21890291012104043,
"eval_loss": 0.5525640249252319,
"eval_runtime": 39.0469,
"eval_samples_per_second": 3.073,
"eval_steps_per_second": 0.768,
"num_input_tokens_seen": 8919608,
"step": 850
},
{
"epoch": 0.22019057429822303,
"grad_norm": 4.339535962830116,
"learning_rate": 8.930720915538487e-05,
"loss": 0.5853,
"num_input_tokens_seen": 8971048,
"step": 855
},
{
"epoch": 0.22147823847540563,
"grad_norm": 6.118436891847819,
"learning_rate": 8.915646321965614e-05,
"loss": 0.5534,
"num_input_tokens_seen": 9022936,
"step": 860
},
{
"epoch": 0.2227659026525882,
"grad_norm": 3.3997835203618667,
"learning_rate": 8.900479122779712e-05,
"loss": 0.5623,
"num_input_tokens_seen": 9075336,
"step": 865
},
{
"epoch": 0.2240535668297708,
"grad_norm": 4.188326935911128,
"learning_rate": 8.885219676687277e-05,
"loss": 0.5561,
"num_input_tokens_seen": 9127688,
"step": 870
},
{
"epoch": 0.22534123100695339,
"grad_norm": 5.220175192497493,
"learning_rate": 8.869868344576459e-05,
"loss": 0.5449,
"num_input_tokens_seen": 9180624,
"step": 875
},
{
"epoch": 0.22662889518413598,
"grad_norm": 2.2022914161050577,
"learning_rate": 8.854425489508532e-05,
"loss": 0.5062,
"num_input_tokens_seen": 9233176,
"step": 880
},
{
"epoch": 0.22791655936131858,
"grad_norm": 4.62379059067999,
"learning_rate": 8.838891476709288e-05,
"loss": 0.5033,
"num_input_tokens_seen": 9286688,
"step": 885
},
{
"epoch": 0.22920422353850115,
"grad_norm": 3.639684630492015,
"learning_rate": 8.823266673560426e-05,
"loss": 0.4845,
"num_input_tokens_seen": 9339600,
"step": 890
},
{
"epoch": 0.23049188771568374,
"grad_norm": 4.131757647310936,
"learning_rate": 8.807551449590846e-05,
"loss": 0.5595,
"num_input_tokens_seen": 9391536,
"step": 895
},
{
"epoch": 0.23177955189286634,
"grad_norm": 4.771128685196347,
"learning_rate": 8.791746176467907e-05,
"loss": 0.5251,
"num_input_tokens_seen": 9443616,
"step": 900
},
{
"epoch": 0.23177955189286634,
"eval_loss": 0.49604204297065735,
"eval_runtime": 39.5289,
"eval_samples_per_second": 3.036,
"eval_steps_per_second": 0.759,
"num_input_tokens_seen": 9443616,
"step": 900
},
{
"epoch": 0.23306721607004893,
"grad_norm": 6.849781513397169,
"learning_rate": 8.775851227988656e-05,
"loss": 0.5774,
"num_input_tokens_seen": 9497304,
"step": 905
},
{
"epoch": 0.23435488024723153,
"grad_norm": 2.526801567699946,
"learning_rate": 8.759866980070963e-05,
"loss": 0.5441,
"num_input_tokens_seen": 9549416,
"step": 910
},
{
"epoch": 0.23564254442441412,
"grad_norm": 3.1008408808291503,
"learning_rate": 8.743793810744654e-05,
"loss": 0.4898,
"num_input_tokens_seen": 9601800,
"step": 915
},
{
"epoch": 0.2369302086015967,
"grad_norm": 4.120824184689494,
"learning_rate": 8.727632100142551e-05,
"loss": 0.4681,
"num_input_tokens_seen": 9653600,
"step": 920
},
{
"epoch": 0.2382178727787793,
"grad_norm": 5.251488809494114,
"learning_rate": 8.711382230491493e-05,
"loss": 0.4946,
"num_input_tokens_seen": 9707224,
"step": 925
},
{
"epoch": 0.23950553695596188,
"grad_norm": 6.885034741125289,
"learning_rate": 8.695044586103296e-05,
"loss": 0.5517,
"num_input_tokens_seen": 9760096,
"step": 930
},
{
"epoch": 0.24079320113314448,
"grad_norm": 4.6246077239626855,
"learning_rate": 8.678619553365659e-05,
"loss": 0.6064,
"num_input_tokens_seen": 9812672,
"step": 935
},
{
"epoch": 0.24208086531032708,
"grad_norm": 5.621020693846077,
"learning_rate": 8.662107520733027e-05,
"loss": 0.5398,
"num_input_tokens_seen": 9866200,
"step": 940
},
{
"epoch": 0.24336852948750964,
"grad_norm": 3.1921985322817092,
"learning_rate": 8.64550887871741e-05,
"loss": 0.5068,
"num_input_tokens_seen": 9918160,
"step": 945
},
{
"epoch": 0.24465619366469224,
"grad_norm": 2.3689648161336465,
"learning_rate": 8.628824019879137e-05,
"loss": 0.5862,
"num_input_tokens_seen": 9970600,
"step": 950
},
{
"epoch": 0.24465619366469224,
"eval_loss": 0.5085262656211853,
"eval_runtime": 39.0437,
"eval_samples_per_second": 3.073,
"eval_steps_per_second": 0.768,
"num_input_tokens_seen": 9970600,
"step": 950
},
{
"epoch": 0.24594385784187484,
"grad_norm": 2.8827978223065363,
"learning_rate": 8.612053338817581e-05,
"loss": 0.4549,
"num_input_tokens_seen": 10022248,
"step": 955
},
{
"epoch": 0.24723152201905743,
"grad_norm": 6.662877258417003,
"learning_rate": 8.595197232161824e-05,
"loss": 0.4791,
"num_input_tokens_seen": 10075280,
"step": 960
},
{
"epoch": 0.24851918619624003,
"grad_norm": 8.140970355143077,
"learning_rate": 8.578256098561275e-05,
"loss": 0.4833,
"num_input_tokens_seen": 10128392,
"step": 965
},
{
"epoch": 0.24980685037342262,
"grad_norm": 3.243184767888501,
"learning_rate": 8.561230338676239e-05,
"loss": 0.4672,
"num_input_tokens_seen": 10180720,
"step": 970
},
{
"epoch": 0.2510945145506052,
"grad_norm": 6.588760068173114,
"learning_rate": 8.544120355168451e-05,
"loss": 0.5205,
"num_input_tokens_seen": 10233256,
"step": 975
},
{
"epoch": 0.2523821787277878,
"grad_norm": 2.6240987196110837,
"learning_rate": 8.526926552691544e-05,
"loss": 0.5124,
"num_input_tokens_seen": 10284928,
"step": 980
},
{
"epoch": 0.2536698429049704,
"grad_norm": 8.242761558538728,
"learning_rate": 8.509649337881483e-05,
"loss": 0.5034,
"num_input_tokens_seen": 10338208,
"step": 985
},
{
"epoch": 0.254957507082153,
"grad_norm": 8.922137566500533,
"learning_rate": 8.492289119346943e-05,
"loss": 0.5226,
"num_input_tokens_seen": 10390224,
"step": 990
},
{
"epoch": 0.25624517125933555,
"grad_norm": 4.922275874717211,
"learning_rate": 8.474846307659658e-05,
"loss": 0.5399,
"num_input_tokens_seen": 10443080,
"step": 995
},
{
"epoch": 0.25753283543651817,
"grad_norm": 6.866585614783304,
"learning_rate": 8.457321315344694e-05,
"loss": 0.483,
"num_input_tokens_seen": 10495592,
"step": 1000
},
{
"epoch": 0.25753283543651817,
"eval_loss": 0.5305114388465881,
"eval_runtime": 38.9297,
"eval_samples_per_second": 3.082,
"eval_steps_per_second": 0.771,
"num_input_tokens_seen": 10495592,
"step": 1000
},
{
"epoch": 0.25882049961370074,
"grad_norm": 8.233033578002926,
"learning_rate": 8.439714556870704e-05,
"loss": 0.568,
"num_input_tokens_seen": 10548136,
"step": 1005
},
{
"epoch": 0.26010816379088336,
"grad_norm": 5.3701298824478485,
"learning_rate": 8.422026448640124e-05,
"loss": 0.4335,
"num_input_tokens_seen": 10600048,
"step": 1010
},
{
"epoch": 0.26139582796806593,
"grad_norm": 5.491882026124958,
"learning_rate": 8.40425740897932e-05,
"loss": 0.5385,
"num_input_tokens_seen": 10652160,
"step": 1015
},
{
"epoch": 0.2626834921452485,
"grad_norm": 5.479941792055548,
"learning_rate": 8.386407858128706e-05,
"loss": 0.5171,
"num_input_tokens_seen": 10705208,
"step": 1020
},
{
"epoch": 0.2639711563224311,
"grad_norm": 3.489116106033337,
"learning_rate": 8.368478218232787e-05,
"loss": 0.5201,
"num_input_tokens_seen": 10758688,
"step": 1025
},
{
"epoch": 0.2652588204996137,
"grad_norm": 5.923123692460237,
"learning_rate": 8.350468913330192e-05,
"loss": 0.5521,
"num_input_tokens_seen": 10811408,
"step": 1030
},
{
"epoch": 0.2665464846767963,
"grad_norm": 2.7605406738569824,
"learning_rate": 8.33238036934364e-05,
"loss": 0.4938,
"num_input_tokens_seen": 10864144,
"step": 1035
},
{
"epoch": 0.2678341488539789,
"grad_norm": 5.500647711838314,
"learning_rate": 8.31421301406986e-05,
"loss": 0.4828,
"num_input_tokens_seen": 10916952,
"step": 1040
},
{
"epoch": 0.26912181303116145,
"grad_norm": 6.823855575342733,
"learning_rate": 8.29596727716949e-05,
"loss": 0.5491,
"num_input_tokens_seen": 10968824,
"step": 1045
},
{
"epoch": 0.2704094772083441,
"grad_norm": 5.409054743152559,
"learning_rate": 8.277643590156894e-05,
"loss": 0.4628,
"num_input_tokens_seen": 11021656,
"step": 1050
},
{
"epoch": 0.2704094772083441,
"eval_loss": 0.5039986371994019,
"eval_runtime": 40.3009,
"eval_samples_per_second": 2.978,
"eval_steps_per_second": 0.744,
"num_input_tokens_seen": 11021656,
"step": 1050
},
{
"epoch": 0.27169714138552664,
"grad_norm": 3.2588151986321994,
"learning_rate": 8.259242386389973e-05,
"loss": 0.4586,
"num_input_tokens_seen": 11074336,
"step": 1055
},
{
"epoch": 0.27298480556270927,
"grad_norm": 12.995641199019554,
"learning_rate": 8.240764101059912e-05,
"loss": 0.4939,
"num_input_tokens_seen": 11126776,
"step": 1060
},
{
"epoch": 0.27427246973989183,
"grad_norm": 8.713479932798109,
"learning_rate": 8.222209171180883e-05,
"loss": 0.4978,
"num_input_tokens_seen": 11179680,
"step": 1065
},
{
"epoch": 0.2755601339170744,
"grad_norm": 3.6728132957332016,
"learning_rate": 8.203578035579715e-05,
"loss": 0.5695,
"num_input_tokens_seen": 11231616,
"step": 1070
},
{
"epoch": 0.276847798094257,
"grad_norm": 9.661110166832387,
"learning_rate": 8.184871134885513e-05,
"loss": 0.4635,
"num_input_tokens_seen": 11283720,
"step": 1075
},
{
"epoch": 0.2781354622714396,
"grad_norm": 5.4096015474623576,
"learning_rate": 8.166088911519235e-05,
"loss": 0.4974,
"num_input_tokens_seen": 11336144,
"step": 1080
},
{
"epoch": 0.2794231264486222,
"grad_norm": 5.353663008589148,
"learning_rate": 8.147231809683236e-05,
"loss": 0.4439,
"num_input_tokens_seen": 11389128,
"step": 1085
},
{
"epoch": 0.2807107906258048,
"grad_norm": 3.863008112890598,
"learning_rate": 8.128300275350756e-05,
"loss": 0.4368,
"num_input_tokens_seen": 11441864,
"step": 1090
},
{
"epoch": 0.2819984548029874,
"grad_norm": 5.545035623030093,
"learning_rate": 8.109294756255375e-05,
"loss": 0.4895,
"num_input_tokens_seen": 11494880,
"step": 1095
},
{
"epoch": 0.28328611898017,
"grad_norm": 5.124762488175073,
"learning_rate": 8.090215701880419e-05,
"loss": 0.4825,
"num_input_tokens_seen": 11547008,
"step": 1100
},
{
"epoch": 0.28328611898017,
"eval_loss": 0.4798590838909149,
"eval_runtime": 40.6942,
"eval_samples_per_second": 2.949,
"eval_steps_per_second": 0.737,
"num_input_tokens_seen": 11547008,
"step": 1100
},
{
"epoch": 0.28457378315735254,
"grad_norm": 11.308296783543483,
"learning_rate": 8.07106356344834e-05,
"loss": 0.4927,
"num_input_tokens_seen": 11600032,
"step": 1105
},
{
"epoch": 0.28586144733453517,
"grad_norm": 4.902660398367944,
"learning_rate": 8.051838793910038e-05,
"loss": 0.4353,
"num_input_tokens_seen": 11652120,
"step": 1110
},
{
"epoch": 0.28714911151171774,
"grad_norm": 4.185631754620407,
"learning_rate": 8.032541847934146e-05,
"loss": 0.4891,
"num_input_tokens_seen": 11705184,
"step": 1115
},
{
"epoch": 0.28843677568890036,
"grad_norm": 6.049695709018542,
"learning_rate": 8.013173181896283e-05,
"loss": 0.4497,
"num_input_tokens_seen": 11758032,
"step": 1120
},
{
"epoch": 0.28972443986608293,
"grad_norm": 4.598736726589848,
"learning_rate": 7.993733253868256e-05,
"loss": 0.4927,
"num_input_tokens_seen": 11810736,
"step": 1125
},
{
"epoch": 0.2910121040432655,
"grad_norm": 41.010822412039396,
"learning_rate": 7.974222523607236e-05,
"loss": 0.4853,
"num_input_tokens_seen": 11863152,
"step": 1130
},
{
"epoch": 0.2922997682204481,
"grad_norm": 5.591270811303827,
"learning_rate": 7.954641452544865e-05,
"loss": 0.4458,
"num_input_tokens_seen": 11914536,
"step": 1135
},
{
"epoch": 0.2935874323976307,
"grad_norm": 4.526048407550314,
"learning_rate": 7.934990503776363e-05,
"loss": 0.3976,
"num_input_tokens_seen": 11966064,
"step": 1140
},
{
"epoch": 0.2948750965748133,
"grad_norm": 4.778105875378293,
"learning_rate": 7.915270142049566e-05,
"loss": 0.508,
"num_input_tokens_seen": 12018928,
"step": 1145
},
{
"epoch": 0.2961627607519959,
"grad_norm": 8.075837130866274,
"learning_rate": 7.89548083375394e-05,
"loss": 0.4553,
"num_input_tokens_seen": 12071088,
"step": 1150
},
{
"epoch": 0.2961627607519959,
"eval_loss": 0.45381438732147217,
"eval_runtime": 38.3303,
"eval_samples_per_second": 3.131,
"eval_steps_per_second": 0.783,
"num_input_tokens_seen": 12071088,
"step": 1150
},
{
"epoch": 0.29745042492917845,
"grad_norm": 5.66991445612284,
"learning_rate": 7.875623046909544e-05,
"loss": 0.4192,
"num_input_tokens_seen": 12122128,
"step": 1155
},
{
"epoch": 0.29873808910636107,
"grad_norm": 11.08291356725024,
"learning_rate": 7.855697251155967e-05,
"loss": 0.433,
"num_input_tokens_seen": 12174288,
"step": 1160
},
{
"epoch": 0.30002575328354364,
"grad_norm": 8.191495602021662,
"learning_rate": 7.835703917741212e-05,
"loss": 0.4817,
"num_input_tokens_seen": 12227008,
"step": 1165
},
{
"epoch": 0.30131341746072626,
"grad_norm": 7.763763600628314,
"learning_rate": 7.81564351951057e-05,
"loss": 0.485,
"num_input_tokens_seen": 12280168,
"step": 1170
},
{
"epoch": 0.30260108163790883,
"grad_norm": 5.347838532189795,
"learning_rate": 7.795516530895414e-05,
"loss": 0.4532,
"num_input_tokens_seen": 12333072,
"step": 1175
},
{
"epoch": 0.3038887458150914,
"grad_norm": 7.959591215701365,
"learning_rate": 7.775323427901993e-05,
"loss": 0.4643,
"num_input_tokens_seen": 12386208,
"step": 1180
},
{
"epoch": 0.305176409992274,
"grad_norm": 6.676689561663868,
"learning_rate": 7.755064688100171e-05,
"loss": 0.4577,
"num_input_tokens_seen": 12439304,
"step": 1185
},
{
"epoch": 0.3064640741694566,
"grad_norm": 6.976246725003336,
"learning_rate": 7.734740790612136e-05,
"loss": 0.4666,
"num_input_tokens_seen": 12491360,
"step": 1190
},
{
"epoch": 0.3077517383466392,
"grad_norm": 6.034570050567919,
"learning_rate": 7.714352216101055e-05,
"loss": 0.407,
"num_input_tokens_seen": 12544264,
"step": 1195
},
{
"epoch": 0.3090394025238218,
"grad_norm": 4.583037231101643,
"learning_rate": 7.693899446759727e-05,
"loss": 0.454,
"num_input_tokens_seen": 12596208,
"step": 1200
},
{
"epoch": 0.3090394025238218,
"eval_loss": 0.49250805377960205,
"eval_runtime": 38.6863,
"eval_samples_per_second": 3.102,
"eval_steps_per_second": 0.775,
"num_input_tokens_seen": 12596208,
"step": 1200
},
{
"epoch": 0.31032706670100435,
"grad_norm": 4.0964966925406365,
"learning_rate": 7.673382966299163e-05,
"loss": 0.5226,
"num_input_tokens_seen": 12648936,
"step": 1205
},
{
"epoch": 0.311614730878187,
"grad_norm": 7.87992303723905,
"learning_rate": 7.65280325993715e-05,
"loss": 0.4757,
"num_input_tokens_seen": 12702432,
"step": 1210
},
{
"epoch": 0.31290239505536954,
"grad_norm": 6.822793875901239,
"learning_rate": 7.63216081438678e-05,
"loss": 0.451,
"num_input_tokens_seen": 12755128,
"step": 1215
},
{
"epoch": 0.31419005923255217,
"grad_norm": 8.804840574778536,
"learning_rate": 7.611456117844934e-05,
"loss": 0.4155,
"num_input_tokens_seen": 12808152,
"step": 1220
},
{
"epoch": 0.31547772340973473,
"grad_norm": 12.832933509895003,
"learning_rate": 7.59068965998074e-05,
"loss": 0.4094,
"num_input_tokens_seen": 12861592,
"step": 1225
},
{
"epoch": 0.31676538758691736,
"grad_norm": 3.769639586972444,
"learning_rate": 7.569861931923989e-05,
"loss": 0.4663,
"num_input_tokens_seen": 12914240,
"step": 1230
},
{
"epoch": 0.3180530517640999,
"grad_norm": 5.011688667303979,
"learning_rate": 7.548973426253521e-05,
"loss": 0.468,
"num_input_tokens_seen": 12967472,
"step": 1235
},
{
"epoch": 0.3193407159412825,
"grad_norm": 5.925703481508644,
"learning_rate": 7.528024636985575e-05,
"loss": 0.4744,
"num_input_tokens_seen": 13020232,
"step": 1240
},
{
"epoch": 0.3206283801184651,
"grad_norm": 3.511846132089351,
"learning_rate": 7.507016059562107e-05,
"loss": 0.4269,
"num_input_tokens_seen": 13073032,
"step": 1245
},
{
"epoch": 0.3219160442956477,
"grad_norm": 6.878508053492975,
"learning_rate": 7.485948190839077e-05,
"loss": 0.4725,
"num_input_tokens_seen": 13125624,
"step": 1250
},
{
"epoch": 0.3219160442956477,
"eval_loss": 0.4339977502822876,
"eval_runtime": 39.1132,
"eval_samples_per_second": 3.068,
"eval_steps_per_second": 0.767,
"num_input_tokens_seen": 13125624,
"step": 1250
},
{
"epoch": 0.3232037084728303,
"grad_norm": 3.2225418900054184,
"learning_rate": 7.464821529074679e-05,
"loss": 0.4196,
"num_input_tokens_seen": 13178656,
"step": 1255
},
{
"epoch": 0.3244913726500129,
"grad_norm": 5.7056125199065475,
"learning_rate": 7.443636573917585e-05,
"loss": 0.4349,
"num_input_tokens_seen": 13231224,
"step": 1260
},
{
"epoch": 0.32577903682719545,
"grad_norm": 3.1679429520474587,
"learning_rate": 7.422393826395108e-05,
"loss": 0.4726,
"num_input_tokens_seen": 13283208,
"step": 1265
},
{
"epoch": 0.32706670100437807,
"grad_norm": 5.409673500894723,
"learning_rate": 7.40109378890136e-05,
"loss": 0.4604,
"num_input_tokens_seen": 13335808,
"step": 1270
},
{
"epoch": 0.32835436518156064,
"grad_norm": 6.011303613930208,
"learning_rate": 7.379736965185368e-05,
"loss": 0.4606,
"num_input_tokens_seen": 13389112,
"step": 1275
},
{
"epoch": 0.32964202935874326,
"grad_norm": 11.490498301960598,
"learning_rate": 7.358323860339165e-05,
"loss": 0.4487,
"num_input_tokens_seen": 13441816,
"step": 1280
},
{
"epoch": 0.33092969353592583,
"grad_norm": 8.761206465870922,
"learning_rate": 7.336854980785839e-05,
"loss": 0.422,
"num_input_tokens_seen": 13493592,
"step": 1285
},
{
"epoch": 0.3322173577131084,
"grad_norm": 8.457687965106274,
"learning_rate": 7.315330834267553e-05,
"loss": 0.5397,
"num_input_tokens_seen": 13545696,
"step": 1290
},
{
"epoch": 0.333505021890291,
"grad_norm": 6.1852361009354295,
"learning_rate": 7.293751929833553e-05,
"loss": 0.5022,
"num_input_tokens_seen": 13597560,
"step": 1295
},
{
"epoch": 0.3347926860674736,
"grad_norm": 3.157280649859201,
"learning_rate": 7.272118777828108e-05,
"loss": 0.4794,
"num_input_tokens_seen": 13650264,
"step": 1300
},
{
"epoch": 0.3347926860674736,
"eval_loss": 0.4991846978664398,
"eval_runtime": 38.2504,
"eval_samples_per_second": 3.137,
"eval_steps_per_second": 0.784,
"num_input_tokens_seen": 13650264,
"step": 1300
},
{
"epoch": 0.3360803502446562,
"grad_norm": 6.386835645613503,
"learning_rate": 7.250431889878455e-05,
"loss": 0.4971,
"num_input_tokens_seen": 13702584,
"step": 1305
},
{
"epoch": 0.3373680144218388,
"grad_norm": 4.797592029689297,
"learning_rate": 7.228691778882693e-05,
"loss": 0.4574,
"num_input_tokens_seen": 13755024,
"step": 1310
},
{
"epoch": 0.33865567859902135,
"grad_norm": 3.659831343491765,
"learning_rate": 7.20689895899765e-05,
"loss": 0.4463,
"num_input_tokens_seen": 13807528,
"step": 1315
},
{
"epoch": 0.33994334277620397,
"grad_norm": 8.104230440489859,
"learning_rate": 7.185053945626733e-05,
"loss": 0.4549,
"num_input_tokens_seen": 13859760,
"step": 1320
},
{
"epoch": 0.34123100695338654,
"grad_norm": 4.000749012853666,
"learning_rate": 7.163157255407732e-05,
"loss": 0.4073,
"num_input_tokens_seen": 13911656,
"step": 1325
},
{
"epoch": 0.34251867113056916,
"grad_norm": 4.431361614574065,
"learning_rate": 7.141209406200599e-05,
"loss": 0.433,
"num_input_tokens_seen": 13963816,
"step": 1330
},
{
"epoch": 0.34380633530775173,
"grad_norm": 3.9352317738395635,
"learning_rate": 7.1192109170752e-05,
"loss": 0.4244,
"num_input_tokens_seen": 14016256,
"step": 1335
},
{
"epoch": 0.34509399948493436,
"grad_norm": 4.571632866024196,
"learning_rate": 7.097162308299054e-05,
"loss": 0.4448,
"num_input_tokens_seen": 14068768,
"step": 1340
},
{
"epoch": 0.3463816636621169,
"grad_norm": 4.2711556426666375,
"learning_rate": 7.07506410132501e-05,
"loss": 0.4608,
"num_input_tokens_seen": 14121272,
"step": 1345
},
{
"epoch": 0.3476693278392995,
"grad_norm": 4.49067434213006,
"learning_rate": 7.052916818778918e-05,
"loss": 0.3994,
"num_input_tokens_seen": 14173240,
"step": 1350
},
{
"epoch": 0.3476693278392995,
"eval_loss": 0.460835725069046,
"eval_runtime": 38.3552,
"eval_samples_per_second": 3.129,
"eval_steps_per_second": 0.782,
"num_input_tokens_seen": 14173240,
"step": 1350
},
{
"epoch": 0.3489569920164821,
"grad_norm": 6.100571377010892,
"learning_rate": 7.030720984447279e-05,
"loss": 0.41,
"num_input_tokens_seen": 14226032,
"step": 1355
},
{
"epoch": 0.3502446561936647,
"grad_norm": 3.531812694789996,
"learning_rate": 7.008477123264848e-05,
"loss": 0.3751,
"num_input_tokens_seen": 14278128,
"step": 1360
},
{
"epoch": 0.3515323203708473,
"grad_norm": 13.528736327050117,
"learning_rate": 6.986185761302224e-05,
"loss": 0.4814,
"num_input_tokens_seen": 14330624,
"step": 1365
},
{
"epoch": 0.3528199845480299,
"grad_norm": 6.2453361475565305,
"learning_rate": 6.963847425753403e-05,
"loss": 0.5007,
"num_input_tokens_seen": 14382416,
"step": 1370
},
{
"epoch": 0.35410764872521244,
"grad_norm": 3.5868157849734925,
"learning_rate": 6.941462644923318e-05,
"loss": 0.4335,
"num_input_tokens_seen": 14434896,
"step": 1375
},
{
"epoch": 0.35539531290239507,
"grad_norm": 7.0930284762784925,
"learning_rate": 6.919031948215335e-05,
"loss": 0.4427,
"num_input_tokens_seen": 14487152,
"step": 1380
},
{
"epoch": 0.35668297707957763,
"grad_norm": 1.8673746248959853,
"learning_rate": 6.896555866118741e-05,
"loss": 0.42,
"num_input_tokens_seen": 14539608,
"step": 1385
},
{
"epoch": 0.35797064125676026,
"grad_norm": 3.29378340171418,
"learning_rate": 6.87403493019619e-05,
"loss": 0.4573,
"num_input_tokens_seen": 14592168,
"step": 1390
},
{
"epoch": 0.3592583054339428,
"grad_norm": 4.710051493913417,
"learning_rate": 6.851469673071143e-05,
"loss": 0.4341,
"num_input_tokens_seen": 14643920,
"step": 1395
},
{
"epoch": 0.3605459696111254,
"grad_norm": 5.46737560287727,
"learning_rate": 6.828860628415253e-05,
"loss": 0.437,
"num_input_tokens_seen": 14697136,
"step": 1400
},
{
"epoch": 0.3605459696111254,
"eval_loss": 0.46620962023735046,
"eval_runtime": 38.4197,
"eval_samples_per_second": 3.123,
"eval_steps_per_second": 0.781,
"num_input_tokens_seen": 14697136,
"step": 1400
},
{
"epoch": 0.361833633788308,
"grad_norm": 5.6011715346425355,
"learning_rate": 6.806208330935766e-05,
"loss": 0.4377,
"num_input_tokens_seen": 14749168,
"step": 1405
},
{
"epoch": 0.3631212979654906,
"grad_norm": 8.725023519965001,
"learning_rate": 6.783513316362855e-05,
"loss": 0.412,
"num_input_tokens_seen": 14801568,
"step": 1410
},
{
"epoch": 0.3644089621426732,
"grad_norm": 8.12664534705471,
"learning_rate": 6.760776121436962e-05,
"loss": 0.4441,
"num_input_tokens_seen": 14853384,
"step": 1415
},
{
"epoch": 0.3656966263198558,
"grad_norm": 3.5568354734329244,
"learning_rate": 6.737997283896103e-05,
"loss": 0.4576,
"num_input_tokens_seen": 14906632,
"step": 1420
},
{
"epoch": 0.36698429049703835,
"grad_norm": 2.9816566580274007,
"learning_rate": 6.715177342463145e-05,
"loss": 0.3853,
"num_input_tokens_seen": 14959240,
"step": 1425
},
{
"epoch": 0.36827195467422097,
"grad_norm": 9.270651786172323,
"learning_rate": 6.692316836833065e-05,
"loss": 0.3755,
"num_input_tokens_seen": 15012256,
"step": 1430
},
{
"epoch": 0.36955961885140354,
"grad_norm": 7.022055493979997,
"learning_rate": 6.6694163076602e-05,
"loss": 0.5384,
"num_input_tokens_seen": 15064664,
"step": 1435
},
{
"epoch": 0.37084728302858616,
"grad_norm": 3.764454647275643,
"learning_rate": 6.646476296545434e-05,
"loss": 0.4377,
"num_input_tokens_seen": 15117384,
"step": 1440
},
{
"epoch": 0.37213494720576873,
"grad_norm": 5.3073636057406794,
"learning_rate": 6.623497346023418e-05,
"loss": 0.3876,
"num_input_tokens_seen": 15169880,
"step": 1445
},
{
"epoch": 0.37342261138295135,
"grad_norm": 3.8443265684988392,
"learning_rate": 6.60047999954972e-05,
"loss": 0.4065,
"num_input_tokens_seen": 15222568,
"step": 1450
},
{
"epoch": 0.37342261138295135,
"eval_loss": 0.4395444095134735,
"eval_runtime": 38.336,
"eval_samples_per_second": 3.13,
"eval_steps_per_second": 0.783,
"num_input_tokens_seen": 15222568,
"step": 1450
},
{
"epoch": 0.3747102755601339,
"grad_norm": 8.614661225033187,
"learning_rate": 6.57742480148798e-05,
"loss": 0.4231,
"num_input_tokens_seen": 15275288,
"step": 1455
},
{
"epoch": 0.3759979397373165,
"grad_norm": 3.107561516867378,
"learning_rate": 6.554332297097031e-05,
"loss": 0.4301,
"num_input_tokens_seen": 15328072,
"step": 1460
},
{
"epoch": 0.3772856039144991,
"grad_norm": 2.9024892391048867,
"learning_rate": 6.53120303251801e-05,
"loss": 0.446,
"num_input_tokens_seen": 15379120,
"step": 1465
},
{
"epoch": 0.3785732680916817,
"grad_norm": 2.7506997409330105,
"learning_rate": 6.508037554761432e-05,
"loss": 0.3764,
"num_input_tokens_seen": 15431104,
"step": 1470
},
{
"epoch": 0.3798609322688643,
"grad_norm": 5.7118625908326734,
"learning_rate": 6.484836411694267e-05,
"loss": 0.4423,
"num_input_tokens_seen": 15482816,
"step": 1475
},
{
"epoch": 0.3811485964460469,
"grad_norm": 4.701095405963631,
"learning_rate": 6.461600152026965e-05,
"loss": 0.4439,
"num_input_tokens_seen": 15534896,
"step": 1480
},
{
"epoch": 0.38243626062322944,
"grad_norm": 5.574717716204205,
"learning_rate": 6.438329325300499e-05,
"loss": 0.4408,
"num_input_tokens_seen": 15587496,
"step": 1485
},
{
"epoch": 0.38372392480041206,
"grad_norm": 4.6497322752918,
"learning_rate": 6.415024481873352e-05,
"loss": 0.4086,
"num_input_tokens_seen": 15639672,
"step": 1490
},
{
"epoch": 0.38501158897759463,
"grad_norm": 5.427307211472868,
"learning_rate": 6.391686172908506e-05,
"loss": 0.4489,
"num_input_tokens_seen": 15693120,
"step": 1495
},
{
"epoch": 0.38629925315477726,
"grad_norm": 5.005547973733715,
"learning_rate": 6.368314950360415e-05,
"loss": 0.4338,
"num_input_tokens_seen": 15744848,
"step": 1500
},
{
"epoch": 0.38629925315477726,
"eval_loss": 0.45475366711616516,
"eval_runtime": 38.3957,
"eval_samples_per_second": 3.125,
"eval_steps_per_second": 0.781,
"num_input_tokens_seen": 15744848,
"step": 1500
},
{
"epoch": 0.3875869173319598,
"grad_norm": 5.097132399629058,
"learning_rate": 6.344911366961934e-05,
"loss": 0.4558,
"num_input_tokens_seen": 15797632,
"step": 1505
},
{
"epoch": 0.3888745815091424,
"grad_norm": 4.502325593575991,
"learning_rate": 6.321475976211266e-05,
"loss": 0.4518,
"num_input_tokens_seen": 15850040,
"step": 1510
},
{
"epoch": 0.390162245686325,
"grad_norm": 6.425152572566654,
"learning_rate": 6.298009332358856e-05,
"loss": 0.4092,
"num_input_tokens_seen": 15902496,
"step": 1515
},
{
"epoch": 0.3914499098635076,
"grad_norm": 3.968135032555422,
"learning_rate": 6.274511990394294e-05,
"loss": 0.478,
"num_input_tokens_seen": 15954936,
"step": 1520
},
{
"epoch": 0.3927375740406902,
"grad_norm": 4.636757769906518,
"learning_rate": 6.250984506033183e-05,
"loss": 0.4294,
"num_input_tokens_seen": 16007624,
"step": 1525
},
{
"epoch": 0.3940252382178728,
"grad_norm": 2.7967900169696347,
"learning_rate": 6.227427435703997e-05,
"loss": 0.3846,
"num_input_tokens_seen": 16059440,
"step": 1530
},
{
"epoch": 0.39531290239505534,
"grad_norm": 2.983520749639549,
"learning_rate": 6.203841336534924e-05,
"loss": 0.4372,
"num_input_tokens_seen": 16111136,
"step": 1535
},
{
"epoch": 0.39660056657223797,
"grad_norm": 8.364510466670477,
"learning_rate": 6.180226766340688e-05,
"loss": 0.484,
"num_input_tokens_seen": 16163976,
"step": 1540
},
{
"epoch": 0.39788823074942054,
"grad_norm": 4.45878743373729,
"learning_rate": 6.156584283609359e-05,
"loss": 0.3965,
"num_input_tokens_seen": 16217192,
"step": 1545
},
{
"epoch": 0.39917589492660316,
"grad_norm": 2.6831990995391717,
"learning_rate": 6.132914447489137e-05,
"loss": 0.3872,
"num_input_tokens_seen": 16269896,
"step": 1550
},
{
"epoch": 0.39917589492660316,
"eval_loss": 0.4416767656803131,
"eval_runtime": 38.4671,
"eval_samples_per_second": 3.12,
"eval_steps_per_second": 0.78,
"num_input_tokens_seen": 16269896,
"step": 1550
},
{
"epoch": 0.4004635591037857,
"grad_norm": 4.920079251827062,
"learning_rate": 6.109217817775139e-05,
"loss": 0.4593,
"num_input_tokens_seen": 16322496,
"step": 1555
},
{
"epoch": 0.40175122328096835,
"grad_norm": 9.068094163618136,
"learning_rate": 6.085494954896156e-05,
"loss": 0.4865,
"num_input_tokens_seen": 16375320,
"step": 1560
},
{
"epoch": 0.4030388874581509,
"grad_norm": 9.316944070527988,
"learning_rate": 6.061746419901388e-05,
"loss": 0.4422,
"num_input_tokens_seen": 16428096,
"step": 1565
},
{
"epoch": 0.4043265516353335,
"grad_norm": 2.4617418860122213,
"learning_rate": 6.0379727744471936e-05,
"loss": 0.3538,
"num_input_tokens_seen": 16480832,
"step": 1570
},
{
"epoch": 0.4056142158125161,
"grad_norm": 5.028400110331736,
"learning_rate": 6.014174580783794e-05,
"loss": 0.3923,
"num_input_tokens_seen": 16534016,
"step": 1575
},
{
"epoch": 0.4069018799896987,
"grad_norm": 6.638266454273257,
"learning_rate": 5.990352401741981e-05,
"loss": 0.3967,
"num_input_tokens_seen": 16586216,
"step": 1580
},
{
"epoch": 0.4081895441668813,
"grad_norm": 6.928848680437489,
"learning_rate": 5.9665068007197976e-05,
"loss": 0.4212,
"num_input_tokens_seen": 16639312,
"step": 1585
},
{
"epoch": 0.40947720834406387,
"grad_norm": 4.2324092477507005,
"learning_rate": 5.94263834166923e-05,
"loss": 0.3489,
"num_input_tokens_seen": 16692328,
"step": 1590
},
{
"epoch": 0.41076487252124644,
"grad_norm": 5.607976113391715,
"learning_rate": 5.918747589082853e-05,
"loss": 0.4105,
"num_input_tokens_seen": 16745088,
"step": 1595
},
{
"epoch": 0.41205253669842906,
"grad_norm": 5.155332109104381,
"learning_rate": 5.8948351079804875e-05,
"loss": 0.3914,
"num_input_tokens_seen": 16798768,
"step": 1600
},
{
"epoch": 0.41205253669842906,
"eval_loss": 0.4657597243785858,
"eval_runtime": 38.2951,
"eval_samples_per_second": 3.134,
"eval_steps_per_second": 0.783,
"num_input_tokens_seen": 16798768,
"step": 1600
}
],
"logging_steps": 5,
"max_steps": 3400,
"num_input_tokens_seen": 16798768,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1108323298967552.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}