{
  "best_metric": 0.4339977502822876,
  "best_model_checkpoint": "saves/CADICA_qwenvl_stenosis_detect_scale4/lora/sft/checkpoint-1250",
  "epoch": 0.41205253669842906,
  "eval_steps": 50,
  "global_step": 1600,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0012876641771825909,
      "grad_norm": 13.245840411597928,
      "learning_rate": 2.9411764705882355e-06,
      "loss": 2.8889,
      "num_input_tokens_seen": 52840,
      "step": 5
    },
    {
      "epoch": 0.0025753283543651817,
      "grad_norm": 12.237619501215374,
      "learning_rate": 5.882352941176471e-06,
      "loss": 2.8165,
      "num_input_tokens_seen": 105528,
      "step": 10
    },
    {
      "epoch": 0.0038629925315477724,
      "grad_norm": 16.29688816410412,
      "learning_rate": 8.823529411764707e-06,
      "loss": 2.8363,
      "num_input_tokens_seen": 158768,
      "step": 15
    },
    {
      "epoch": 0.0051506567087303634,
      "grad_norm": 11.576419511120797,
      "learning_rate": 1.1764705882352942e-05,
      "loss": 2.6853,
      "num_input_tokens_seen": 210816,
      "step": 20
    },
    {
      "epoch": 0.006438320885912954,
      "grad_norm": 6.9672256792859,
      "learning_rate": 1.4705882352941177e-05,
      "loss": 2.2992,
      "num_input_tokens_seen": 262936,
      "step": 25
    },
    {
      "epoch": 0.007725985063095545,
      "grad_norm": 3.1837818528204305,
      "learning_rate": 1.7647058823529414e-05,
      "loss": 1.8923,
      "num_input_tokens_seen": 315264,
      "step": 30
    },
    {
      "epoch": 0.009013649240278136,
      "grad_norm": 2.835950303969337,
      "learning_rate": 2.058823529411765e-05,
      "loss": 1.6984,
      "num_input_tokens_seen": 367840,
      "step": 35
    },
    {
      "epoch": 0.010301313417460727,
      "grad_norm": 2.223740001042382,
      "learning_rate": 2.3529411764705884e-05,
      "loss": 1.6434,
      "num_input_tokens_seen": 420112,
      "step": 40
    },
    {
      "epoch": 0.011588977594643318,
      "grad_norm": 1.9880935044313244,
      "learning_rate": 2.647058823529412e-05,
      "loss": 1.4659,
      "num_input_tokens_seen": 472728,
      "step": 45
    },
    {
      "epoch": 0.012876641771825908,
      "grad_norm": 1.7151131700495934,
      "learning_rate": 2.9411764705882354e-05,
      "loss": 1.3506,
      "num_input_tokens_seen": 524648,
      "step": 50
    },
    {
      "epoch": 0.012876641771825908,
      "eval_loss": 1.1727452278137207,
      "eval_runtime": 66.3207,
      "eval_samples_per_second": 1.809,
      "eval_steps_per_second": 0.452,
      "num_input_tokens_seen": 524648,
      "step": 50
    },
    {
      "epoch": 0.014164305949008499,
      "grad_norm": 1.47475981537851,
      "learning_rate": 3.235294117647059e-05,
      "loss": 1.1455,
      "num_input_tokens_seen": 576472,
      "step": 55
    },
    {
      "epoch": 0.01545197012619109,
      "grad_norm": 1.7476693647440722,
      "learning_rate": 3.529411764705883e-05,
      "loss": 0.9971,
      "num_input_tokens_seen": 628056,
      "step": 60
    },
    {
      "epoch": 0.01673963430337368,
      "grad_norm": 1.3384365493212875,
      "learning_rate": 3.8235294117647055e-05,
      "loss": 0.9073,
      "num_input_tokens_seen": 680448,
      "step": 65
    },
    {
      "epoch": 0.018027298480556272,
      "grad_norm": 0.9014358219807773,
      "learning_rate": 4.11764705882353e-05,
      "loss": 0.8386,
      "num_input_tokens_seen": 733664,
      "step": 70
    },
    {
      "epoch": 0.01931496265773886,
      "grad_norm": 0.8007820009902022,
      "learning_rate": 4.411764705882353e-05,
      "loss": 0.7827,
      "num_input_tokens_seen": 786096,
      "step": 75
    },
    {
      "epoch": 0.020602626834921454,
      "grad_norm": 0.6701003454307716,
      "learning_rate": 4.705882352941177e-05,
      "loss": 0.7814,
      "num_input_tokens_seen": 838192,
      "step": 80
    },
    {
      "epoch": 0.021890291012104043,
      "grad_norm": 0.8973165751658843,
      "learning_rate": 5e-05,
      "loss": 0.7297,
      "num_input_tokens_seen": 890112,
      "step": 85
    },
    {
      "epoch": 0.023177955189286635,
      "grad_norm": 0.9060968630490469,
      "learning_rate": 5.294117647058824e-05,
      "loss": 0.7894,
      "num_input_tokens_seen": 943472,
      "step": 90
    },
    {
      "epoch": 0.024465619366469224,
      "grad_norm": 0.9520214202472889,
      "learning_rate": 5.588235294117647e-05,
      "loss": 0.7758,
      "num_input_tokens_seen": 996872,
      "step": 95
    },
    {
      "epoch": 0.025753283543651816,
      "grad_norm": 0.8226006535044261,
      "learning_rate": 5.882352941176471e-05,
      "loss": 0.7577,
      "num_input_tokens_seen": 1049816,
      "step": 100
    },
    {
      "epoch": 0.025753283543651816,
      "eval_loss": 0.7517351508140564,
      "eval_runtime": 38.7829,
      "eval_samples_per_second": 3.094,
      "eval_steps_per_second": 0.774,
      "num_input_tokens_seen": 1049816,
      "step": 100
    },
    {
      "epoch": 0.027040947720834405,
      "grad_norm": 0.7251208491150668,
      "learning_rate": 6.176470588235295e-05,
      "loss": 0.7579,
      "num_input_tokens_seen": 1102584,
      "step": 105
    },
    {
      "epoch": 0.028328611898016998,
      "grad_norm": 0.8217419839297042,
      "learning_rate": 6.470588235294118e-05,
      "loss": 0.7659,
      "num_input_tokens_seen": 1155512,
      "step": 110
    },
    {
      "epoch": 0.029616276075199587,
      "grad_norm": 0.6768053879888967,
      "learning_rate": 6.764705882352942e-05,
      "loss": 0.7469,
      "num_input_tokens_seen": 1207976,
      "step": 115
    },
    {
      "epoch": 0.03090394025238218,
      "grad_norm": 1.9562630849642013,
      "learning_rate": 7.058823529411765e-05,
      "loss": 0.7353,
      "num_input_tokens_seen": 1259776,
      "step": 120
    },
    {
      "epoch": 0.03219160442956477,
      "grad_norm": 0.6439041597153087,
      "learning_rate": 7.352941176470589e-05,
      "loss": 0.7537,
      "num_input_tokens_seen": 1312760,
      "step": 125
    },
    {
      "epoch": 0.03347926860674736,
      "grad_norm": 0.6124318582166212,
      "learning_rate": 7.647058823529411e-05,
      "loss": 0.7669,
      "num_input_tokens_seen": 1365616,
      "step": 130
    },
    {
      "epoch": 0.03476693278392995,
      "grad_norm": 0.7593534002488418,
      "learning_rate": 7.941176470588235e-05,
      "loss": 0.722,
      "num_input_tokens_seen": 1417544,
      "step": 135
    },
    {
      "epoch": 0.036054596961112545,
      "grad_norm": 0.7827834651032061,
      "learning_rate": 8.23529411764706e-05,
      "loss": 0.7502,
      "num_input_tokens_seen": 1469856,
      "step": 140
    },
    {
      "epoch": 0.037342261138295134,
      "grad_norm": 0.5444126155596626,
      "learning_rate": 8.529411764705883e-05,
      "loss": 0.7174,
      "num_input_tokens_seen": 1521496,
      "step": 145
    },
    {
      "epoch": 0.03862992531547772,
      "grad_norm": 0.40878703812837747,
      "learning_rate": 8.823529411764706e-05,
      "loss": 0.7018,
      "num_input_tokens_seen": 1573376,
      "step": 150
    },
    {
      "epoch": 0.03862992531547772,
      "eval_loss": 0.7309949994087219,
      "eval_runtime": 38.2005,
      "eval_samples_per_second": 3.141,
      "eval_steps_per_second": 0.785,
      "num_input_tokens_seen": 1573376,
      "step": 150
    },
    {
      "epoch": 0.03991758949266031,
      "grad_norm": 0.5536144453733772,
      "learning_rate": 9.11764705882353e-05,
      "loss": 0.738,
      "num_input_tokens_seen": 1626136,
      "step": 155
    },
    {
      "epoch": 0.04120525366984291,
      "grad_norm": 0.5151715191704441,
      "learning_rate": 9.411764705882353e-05,
      "loss": 0.7579,
      "num_input_tokens_seen": 1678760,
      "step": 160
    },
    {
      "epoch": 0.042492917847025496,
      "grad_norm": 0.5209077394596254,
      "learning_rate": 9.705882352941177e-05,
      "loss": 0.7502,
      "num_input_tokens_seen": 1731240,
      "step": 165
    },
    {
      "epoch": 0.043780582024208085,
      "grad_norm": 0.721213601237688,
      "learning_rate": 0.0001,
      "loss": 0.7448,
      "num_input_tokens_seen": 1783816,
      "step": 170
    },
    {
      "epoch": 0.045068246201390674,
      "grad_norm": 0.48666007914879555,
      "learning_rate": 9.999940874631277e-05,
      "loss": 0.6648,
      "num_input_tokens_seen": 1834592,
      "step": 175
    },
    {
      "epoch": 0.04635591037857327,
      "grad_norm": 0.5136600613696797,
      "learning_rate": 9.999763499923432e-05,
      "loss": 0.7759,
      "num_input_tokens_seen": 1888176,
      "step": 180
    },
    {
      "epoch": 0.04764357455575586,
      "grad_norm": 0.6706281530046975,
      "learning_rate": 9.999467880071402e-05,
      "loss": 0.7167,
      "num_input_tokens_seen": 1940280,
      "step": 185
    },
    {
      "epoch": 0.04893123873293845,
      "grad_norm": 0.5159139445497618,
      "learning_rate": 9.999054022066641e-05,
      "loss": 0.7483,
      "num_input_tokens_seen": 1993096,
      "step": 190
    },
    {
      "epoch": 0.050218902910121044,
      "grad_norm": 0.40251006129746847,
      "learning_rate": 9.998521935696953e-05,
      "loss": 0.7464,
      "num_input_tokens_seen": 2045648,
      "step": 195
    },
    {
      "epoch": 0.05150656708730363,
      "grad_norm": 0.4811730853311867,
      "learning_rate": 9.997871633546257e-05,
      "loss": 0.7594,
      "num_input_tokens_seen": 2099008,
      "step": 200
    },
    {
      "epoch": 0.05150656708730363,
      "eval_loss": 0.7274295687675476,
      "eval_runtime": 38.079,
      "eval_samples_per_second": 3.151,
      "eval_steps_per_second": 0.788,
      "num_input_tokens_seen": 2099008,
      "step": 200
    },
    {
      "epoch": 0.05279423126448622,
      "grad_norm": 0.591934959695668,
      "learning_rate": 9.997103130994296e-05,
      "loss": 0.706,
      "num_input_tokens_seen": 2151680,
      "step": 205
    },
    {
      "epoch": 0.05408189544166881,
      "grad_norm": 0.48253717444489286,
      "learning_rate": 9.996216446216267e-05,
      "loss": 0.7186,
      "num_input_tokens_seen": 2203784,
      "step": 210
    },
    {
      "epoch": 0.055369559618851406,
      "grad_norm": 0.5274315079401322,
      "learning_rate": 9.995211600182397e-05,
      "loss": 0.7009,
      "num_input_tokens_seen": 2255632,
      "step": 215
    },
    {
      "epoch": 0.056657223796033995,
      "grad_norm": 0.32879215224292613,
      "learning_rate": 9.994088616657444e-05,
      "loss": 0.6801,
      "num_input_tokens_seen": 2308096,
      "step": 220
    },
    {
      "epoch": 0.057944887973216584,
      "grad_norm": 0.37171195071448215,
      "learning_rate": 9.992847522200133e-05,
      "loss": 0.7569,
      "num_input_tokens_seen": 2361168,
      "step": 225
    },
    {
      "epoch": 0.05923255215039917,
      "grad_norm": 0.4120941016934064,
      "learning_rate": 9.99148834616253e-05,
      "loss": 0.7402,
      "num_input_tokens_seen": 2413896,
      "step": 230
    },
    {
      "epoch": 0.06052021632758177,
      "grad_norm": 0.5998680948310651,
      "learning_rate": 9.990011120689351e-05,
      "loss": 0.7191,
      "num_input_tokens_seen": 2466136,
      "step": 235
    },
    {
      "epoch": 0.06180788050476436,
      "grad_norm": 0.538488141249078,
      "learning_rate": 9.988415880717194e-05,
      "loss": 0.7274,
      "num_input_tokens_seen": 2518848,
      "step": 240
    },
    {
      "epoch": 0.06309554468194695,
      "grad_norm": 0.4393093124760277,
      "learning_rate": 9.986702663973722e-05,
      "loss": 0.7704,
      "num_input_tokens_seen": 2572384,
      "step": 245
    },
    {
      "epoch": 0.06438320885912954,
      "grad_norm": 0.6116643616510118,
      "learning_rate": 9.98487151097676e-05,
      "loss": 0.7346,
      "num_input_tokens_seen": 2625352,
      "step": 250
    },
    {
      "epoch": 0.06438320885912954,
      "eval_loss": 0.7181503176689148,
      "eval_runtime": 38.0986,
      "eval_samples_per_second": 3.15,
      "eval_steps_per_second": 0.787,
      "num_input_tokens_seen": 2625352,
      "step": 250
    },
    {
      "epoch": 0.06567087303631212,
      "grad_norm": 0.41200227731339506,
      "learning_rate": 9.98292246503335e-05,
      "loss": 0.7408,
      "num_input_tokens_seen": 2678216,
      "step": 255
    },
    {
      "epoch": 0.06695853721349472,
      "grad_norm": 0.44521059732114987,
      "learning_rate": 9.980855572238714e-05,
      "loss": 0.7044,
      "num_input_tokens_seen": 2730664,
      "step": 260
    },
    {
      "epoch": 0.06824620139067732,
      "grad_norm": 0.571896859428363,
      "learning_rate": 9.978670881475172e-05,
      "loss": 0.7334,
      "num_input_tokens_seen": 2783584,
      "step": 265
    },
    {
      "epoch": 0.0695338655678599,
      "grad_norm": 0.3907697039722125,
      "learning_rate": 9.976368444410985e-05,
      "loss": 0.7075,
      "num_input_tokens_seen": 2836152,
      "step": 270
    },
    {
      "epoch": 0.0708215297450425,
      "grad_norm": 0.4507806825752261,
      "learning_rate": 9.973948315499126e-05,
      "loss": 0.7039,
      "num_input_tokens_seen": 2887808,
      "step": 275
    },
    {
      "epoch": 0.07210919392222509,
      "grad_norm": 0.41330504132984697,
      "learning_rate": 9.971410551976002e-05,
      "loss": 0.6953,
      "num_input_tokens_seen": 2939656,
      "step": 280
    },
    {
      "epoch": 0.07339685809940767,
      "grad_norm": 0.4625671909482009,
      "learning_rate": 9.968755213860094e-05,
      "loss": 0.7022,
      "num_input_tokens_seen": 2991632,
      "step": 285
    },
    {
      "epoch": 0.07468452227659027,
      "grad_norm": 0.6553627840267285,
      "learning_rate": 9.96598236395054e-05,
      "loss": 0.6796,
      "num_input_tokens_seen": 3043616,
      "step": 290
    },
    {
      "epoch": 0.07597218645377285,
      "grad_norm": 0.5157886895754477,
      "learning_rate": 9.96309206782565e-05,
      "loss": 0.7346,
      "num_input_tokens_seen": 3096920,
      "step": 295
    },
    {
      "epoch": 0.07725985063095545,
      "grad_norm": 0.5672965149433489,
      "learning_rate": 9.960084393841355e-05,
      "loss": 0.6815,
      "num_input_tokens_seen": 3149032,
      "step": 300
    },
    {
      "epoch": 0.07725985063095545,
      "eval_loss": 0.7073924541473389,
      "eval_runtime": 38.1842,
      "eval_samples_per_second": 3.143,
      "eval_steps_per_second": 0.786,
      "num_input_tokens_seen": 3149032,
      "step": 300
    },
    {
      "epoch": 0.07854751480813804,
      "grad_norm": 0.4479276285203507,
      "learning_rate": 9.956959413129585e-05,
      "loss": 0.7208,
      "num_input_tokens_seen": 3201560,
      "step": 305
    },
    {
      "epoch": 0.07983517898532062,
      "grad_norm": 0.368457437106614,
      "learning_rate": 9.953717199596598e-05,
      "loss": 0.7144,
      "num_input_tokens_seen": 3254632,
      "step": 310
    },
    {
      "epoch": 0.08112284316250322,
      "grad_norm": 0.5531413254856732,
      "learning_rate": 9.95035782992122e-05,
      "loss": 0.6861,
      "num_input_tokens_seen": 3306432,
      "step": 315
    },
    {
      "epoch": 0.08241050733968582,
      "grad_norm": 0.41513991799613037,
      "learning_rate": 9.94688138355304e-05,
      "loss": 0.6836,
      "num_input_tokens_seen": 3358392,
      "step": 320
    },
    {
      "epoch": 0.0836981715168684,
      "grad_norm": 0.47052274706452957,
      "learning_rate": 9.943287942710527e-05,
      "loss": 0.7353,
      "num_input_tokens_seen": 3411424,
      "step": 325
    },
    {
      "epoch": 0.08498583569405099,
      "grad_norm": 0.6322586593511644,
      "learning_rate": 9.939577592379088e-05,
      "loss": 0.6774,
      "num_input_tokens_seen": 3462992,
      "step": 330
    },
    {
      "epoch": 0.08627349987123359,
      "grad_norm": 0.4129597798905344,
      "learning_rate": 9.935750420309055e-05,
      "loss": 0.7331,
      "num_input_tokens_seen": 3516136,
      "step": 335
    },
    {
      "epoch": 0.08756116404841617,
      "grad_norm": 0.4031509882699161,
      "learning_rate": 9.931806517013612e-05,
      "loss": 0.6939,
      "num_input_tokens_seen": 3568360,
      "step": 340
    },
    {
      "epoch": 0.08884882822559877,
      "grad_norm": 0.4444358747076587,
      "learning_rate": 9.927745975766654e-05,
      "loss": 0.7158,
      "num_input_tokens_seen": 3620696,
      "step": 345
    },
    {
      "epoch": 0.09013649240278135,
      "grad_norm": 0.5290547365449167,
      "learning_rate": 9.923568892600578e-05,
      "loss": 0.6932,
      "num_input_tokens_seen": 3673152,
      "step": 350
    },
    {
      "epoch": 0.09013649240278135,
      "eval_loss": 0.7044599056243896,
      "eval_runtime": 38.2709,
      "eval_samples_per_second": 3.136,
      "eval_steps_per_second": 0.784,
      "num_input_tokens_seen": 3673152,
      "step": 350
    },
    {
      "epoch": 0.09142415657996394,
      "grad_norm": 0.47530311368359207,
      "learning_rate": 9.91927536630402e-05,
      "loss": 0.6778,
      "num_input_tokens_seen": 3725296,
      "step": 355
    },
    {
      "epoch": 0.09271182075714654,
      "grad_norm": 0.38913022785688944,
      "learning_rate": 9.91486549841951e-05,
      "loss": 0.6857,
      "num_input_tokens_seen": 3777552,
      "step": 360
    },
    {
      "epoch": 0.09399948493432912,
      "grad_norm": 0.4834773141333328,
      "learning_rate": 9.91033939324107e-05,
      "loss": 0.7184,
      "num_input_tokens_seen": 3830200,
      "step": 365
    },
    {
      "epoch": 0.09528714911151172,
      "grad_norm": 0.5862045807150876,
      "learning_rate": 9.905697157811761e-05,
      "loss": 0.7196,
      "num_input_tokens_seen": 3883200,
      "step": 370
    },
    {
      "epoch": 0.09657481328869431,
      "grad_norm": 0.4576971522205563,
      "learning_rate": 9.900938901921131e-05,
      "loss": 0.6914,
      "num_input_tokens_seen": 3935576,
      "step": 375
    },
    {
      "epoch": 0.0978624774658769,
      "grad_norm": 0.49551517524520683,
      "learning_rate": 9.896064738102635e-05,
      "loss": 0.6681,
      "num_input_tokens_seen": 3987624,
      "step": 380
    },
    {
      "epoch": 0.09915014164305949,
      "grad_norm": 0.8198390819787913,
      "learning_rate": 9.891074781630966e-05,
      "loss": 0.6723,
      "num_input_tokens_seen": 4039680,
      "step": 385
    },
    {
      "epoch": 0.10043780582024209,
      "grad_norm": 0.7034626469978683,
      "learning_rate": 9.885969150519331e-05,
      "loss": 0.6498,
      "num_input_tokens_seen": 4091216,
      "step": 390
    },
    {
      "epoch": 0.10172546999742467,
      "grad_norm": 0.8838075623197742,
      "learning_rate": 9.88074796551666e-05,
      "loss": 0.7311,
      "num_input_tokens_seen": 4144264,
      "step": 395
    },
    {
      "epoch": 0.10301313417460727,
      "grad_norm": 0.7342758386202114,
      "learning_rate": 9.875411350104744e-05,
      "loss": 0.7089,
      "num_input_tokens_seen": 4197072,
      "step": 400
    },
    {
      "epoch": 0.10301313417460727,
      "eval_loss": 0.6847750544548035,
      "eval_runtime": 37.9238,
      "eval_samples_per_second": 3.164,
      "eval_steps_per_second": 0.791,
      "num_input_tokens_seen": 4197072,
      "step": 400
    },
    {
      "epoch": 0.10430079835178985,
      "grad_norm": 0.8113533605928532,
      "learning_rate": 9.86995943049533e-05,
      "loss": 0.7021,
      "num_input_tokens_seen": 4249656,
      "step": 405
    },
    {
      "epoch": 0.10558846252897244,
      "grad_norm": 1.1772677082041305,
      "learning_rate": 9.864392335627117e-05,
      "loss": 0.6943,
      "num_input_tokens_seen": 4302944,
      "step": 410
    },
    {
      "epoch": 0.10687612670615504,
      "grad_norm": 1.6493280510697776,
      "learning_rate": 9.858710197162721e-05,
      "loss": 0.7146,
      "num_input_tokens_seen": 4355480,
      "step": 415
    },
    {
      "epoch": 0.10816379088333762,
      "grad_norm": 3.0159798803441715,
      "learning_rate": 9.852913149485556e-05,
      "loss": 0.6312,
      "num_input_tokens_seen": 4407688,
      "step": 420
    },
    {
      "epoch": 0.10945145506052022,
      "grad_norm": 1.7981196843056153,
      "learning_rate": 9.847001329696653e-05,
      "loss": 0.6877,
      "num_input_tokens_seen": 4459736,
      "step": 425
    },
    {
      "epoch": 0.11073911923770281,
      "grad_norm": 1.5783278376799834,
      "learning_rate": 9.840974877611422e-05,
      "loss": 0.6975,
      "num_input_tokens_seen": 4512928,
      "step": 430
    },
    {
      "epoch": 0.1120267834148854,
      "grad_norm": 3.306646516615779,
      "learning_rate": 9.834833935756344e-05,
      "loss": 0.651,
      "num_input_tokens_seen": 4565840,
      "step": 435
    },
    {
      "epoch": 0.11331444759206799,
      "grad_norm": 2.3184973874904005,
      "learning_rate": 9.828578649365601e-05,
      "loss": 0.685,
      "num_input_tokens_seen": 4618168,
      "step": 440
    },
    {
      "epoch": 0.11460211176925057,
      "grad_norm": 1.602690016495642,
      "learning_rate": 9.822209166377635e-05,
      "loss": 0.6258,
      "num_input_tokens_seen": 4669784,
      "step": 445
    },
    {
      "epoch": 0.11588977594643317,
      "grad_norm": 2.6770797227308196,
      "learning_rate": 9.815725637431662e-05,
      "loss": 0.6732,
      "num_input_tokens_seen": 4722528,
      "step": 450
    },
    {
      "epoch": 0.11588977594643317,
      "eval_loss": 0.6526497006416321,
      "eval_runtime": 39.085,
      "eval_samples_per_second": 3.07,
      "eval_steps_per_second": 0.768,
      "num_input_tokens_seen": 4722528,
      "step": 450
    },
    {
      "epoch": 0.11717744012361576,
      "grad_norm": 2.1823349329218074,
      "learning_rate": 9.809128215864097e-05,
      "loss": 0.6544,
      "num_input_tokens_seen": 4774400,
      "step": 455
    },
    {
      "epoch": 0.11846510430079835,
      "grad_norm": 1.434521593914191,
      "learning_rate": 9.802417057704931e-05,
      "loss": 0.652,
      "num_input_tokens_seen": 4826704,
      "step": 460
    },
    {
      "epoch": 0.11975276847798094,
      "grad_norm": 2.399754385687283,
      "learning_rate": 9.795592321674045e-05,
      "loss": 0.6582,
      "num_input_tokens_seen": 4880072,
      "step": 465
    },
    {
      "epoch": 0.12104043265516354,
      "grad_norm": 3.9235176077985536,
      "learning_rate": 9.788654169177453e-05,
      "loss": 0.6506,
      "num_input_tokens_seen": 4931968,
      "step": 470
    },
    {
      "epoch": 0.12232809683234612,
      "grad_norm": 3.659330745777227,
      "learning_rate": 9.781602764303487e-05,
      "loss": 0.6551,
      "num_input_tokens_seen": 4983656,
      "step": 475
    },
    {
      "epoch": 0.12361576100952872,
      "grad_norm": 1.9670601503398757,
      "learning_rate": 9.774438273818911e-05,
      "loss": 0.6978,
      "num_input_tokens_seen": 5036528,
      "step": 480
    },
    {
      "epoch": 0.12490342518671131,
      "grad_norm": 1.308580869419328,
      "learning_rate": 9.767160867164979e-05,
      "loss": 0.6407,
      "num_input_tokens_seen": 5088768,
      "step": 485
    },
    {
      "epoch": 0.1261910893638939,
      "grad_norm": 1.7349486072682865,
      "learning_rate": 9.759770716453436e-05,
      "loss": 0.6641,
      "num_input_tokens_seen": 5142080,
      "step": 490
    },
    {
      "epoch": 0.1274787535410765,
      "grad_norm": 2.993327939872198,
      "learning_rate": 9.752267996462434e-05,
      "loss": 0.6588,
      "num_input_tokens_seen": 5194432,
      "step": 495
    },
    {
      "epoch": 0.12876641771825909,
      "grad_norm": 2.6430988002320976,
      "learning_rate": 9.744652884632406e-05,
      "loss": 0.6304,
      "num_input_tokens_seen": 5246640,
      "step": 500
    },
    {
      "epoch": 0.12876641771825909,
      "eval_loss": 0.6272165775299072,
      "eval_runtime": 39.4177,
      "eval_samples_per_second": 3.044,
      "eval_steps_per_second": 0.761,
      "num_input_tokens_seen": 5246640,
      "step": 500
    },
    {
      "epoch": 0.13005408189544168,
      "grad_norm": 2.6047672112920286,
      "learning_rate": 9.736925561061871e-05,
      "loss": 0.5741,
      "num_input_tokens_seen": 5299024,
      "step": 505
    },
    {
      "epoch": 0.13134174607262425,
      "grad_norm": 2.4706517190834063,
      "learning_rate": 9.729086208503174e-05,
      "loss": 0.6535,
      "num_input_tokens_seen": 5352664,
      "step": 510
    },
    {
      "epoch": 0.13262941024980685,
      "grad_norm": 2.031672226684599,
      "learning_rate": 9.721135012358156e-05,
      "loss": 0.6081,
      "num_input_tokens_seen": 5406008,
      "step": 515
    },
    {
      "epoch": 0.13391707442698944,
      "grad_norm": 2.773997809426142,
      "learning_rate": 9.713072160673777e-05,
      "loss": 0.6792,
      "num_input_tokens_seen": 5459368,
      "step": 520
    },
    {
      "epoch": 0.13520473860417204,
      "grad_norm": 5.083057729524855,
      "learning_rate": 9.704897844137673e-05,
      "loss": 0.6821,
      "num_input_tokens_seen": 5512960,
      "step": 525
    },
    {
      "epoch": 0.13649240278135463,
      "grad_norm": 3.0440654843385584,
      "learning_rate": 9.696612256073633e-05,
      "loss": 0.5835,
      "num_input_tokens_seen": 5565368,
      "step": 530
    },
    {
      "epoch": 0.1377800669585372,
      "grad_norm": 3.7400231170971323,
      "learning_rate": 9.688215592437039e-05,
      "loss": 0.6129,
      "num_input_tokens_seen": 5618008,
      "step": 535
    },
    {
      "epoch": 0.1390677311357198,
      "grad_norm": 6.340287952379529,
      "learning_rate": 9.679708051810221e-05,
      "loss": 0.5765,
      "num_input_tokens_seen": 5670072,
      "step": 540
    },
    {
      "epoch": 0.1403553953129024,
      "grad_norm": 3.6351560550229207,
      "learning_rate": 9.67108983539777e-05,
      "loss": 0.6325,
      "num_input_tokens_seen": 5722936,
      "step": 545
    },
    {
      "epoch": 0.141643059490085,
      "grad_norm": 3.8363425916745117,
      "learning_rate": 9.662361147021779e-05,
      "loss": 0.5596,
      "num_input_tokens_seen": 5774880,
      "step": 550
    },
    {
      "epoch": 0.141643059490085,
      "eval_loss": 0.5832681059837341,
      "eval_runtime": 38.2495,
      "eval_samples_per_second": 3.137,
      "eval_steps_per_second": 0.784,
      "num_input_tokens_seen": 5774880,
      "step": 550
    },
    {
      "epoch": 0.14293072366726758,
      "grad_norm": 3.911447203674744,
      "learning_rate": 9.653522193117013e-05,
      "loss": 0.5073,
      "num_input_tokens_seen": 5826608,
      "step": 555
    },
    {
      "epoch": 0.14421838784445018,
      "grad_norm": 3.3501835856945763,
      "learning_rate": 9.644573182726035e-05,
      "loss": 0.5652,
      "num_input_tokens_seen": 5879776,
      "step": 560
    },
    {
      "epoch": 0.14550605202163275,
      "grad_norm": 8.75758822201328,
      "learning_rate": 9.63551432749426e-05,
      "loss": 0.5727,
      "num_input_tokens_seen": 5932888,
      "step": 565
    },
    {
      "epoch": 0.14679371619881534,
      "grad_norm": 4.351029258458384,
      "learning_rate": 9.626345841664953e-05,
      "loss": 0.6251,
      "num_input_tokens_seen": 5984648,
      "step": 570
    },
    {
      "epoch": 0.14808138037599794,
      "grad_norm": 7.617020699535255,
      "learning_rate": 9.617067942074153e-05,
      "loss": 0.6508,
      "num_input_tokens_seen": 6037000,
      "step": 575
    },
    {
      "epoch": 0.14936904455318054,
      "grad_norm": 7.293430172750479,
      "learning_rate": 9.607680848145558e-05,
      "loss": 0.6686,
      "num_input_tokens_seen": 6090512,
      "step": 580
    },
    {
      "epoch": 0.15065670873036313,
      "grad_norm": 3.3635276124166653,
      "learning_rate": 9.598184781885318e-05,
      "loss": 0.5793,
      "num_input_tokens_seen": 6143320,
      "step": 585
    },
    {
      "epoch": 0.1519443729075457,
      "grad_norm": 2.7589160396339407,
      "learning_rate": 9.588579967876806e-05,
      "loss": 0.5954,
      "num_input_tokens_seen": 6195720,
      "step": 590
    },
    {
      "epoch": 0.1532320370847283,
      "grad_norm": 1.582169884399532,
      "learning_rate": 9.578866633275288e-05,
      "loss": 0.5644,
      "num_input_tokens_seen": 6247592,
      "step": 595
    },
    {
      "epoch": 0.1545197012619109,
      "grad_norm": 3.891844940061855,
      "learning_rate": 9.569045007802559e-05,
      "loss": 0.5794,
      "num_input_tokens_seen": 6299656,
      "step": 600
    },
    {
      "epoch": 0.1545197012619109,
      "eval_loss": 0.6039358973503113,
      "eval_runtime": 38.3138,
      "eval_samples_per_second": 3.132,
      "eval_steps_per_second": 0.783,
      "num_input_tokens_seen": 6299656,
      "step": 600
    },
    {
      "epoch": 0.1558073654390935,
      "grad_norm": 5.90634634073773,
      "learning_rate": 9.55911532374151e-05,
      "loss": 0.6106,
      "num_input_tokens_seen": 6351680,
      "step": 605
    },
    {
      "epoch": 0.15709502961627608,
      "grad_norm": 3.5429043559071034,
      "learning_rate": 9.549077815930636e-05,
      "loss": 0.5812,
      "num_input_tokens_seen": 6403648,
      "step": 610
    },
    {
      "epoch": 0.15838269379345868,
      "grad_norm": 2.8753548663225144,
      "learning_rate": 9.538932721758474e-05,
      "loss": 0.5992,
      "num_input_tokens_seen": 6456328,
      "step": 615
    },
    {
      "epoch": 0.15967035797064125,
      "grad_norm": 2.4013005755622467,
      "learning_rate": 9.528680281157999e-05,
      "loss": 0.587,
      "num_input_tokens_seen": 6509024,
      "step": 620
    },
    {
      "epoch": 0.16095802214782384,
      "grad_norm": 3.860358696946306,
      "learning_rate": 9.518320736600943e-05,
      "loss": 0.5836,
      "num_input_tokens_seen": 6561336,
      "step": 625
    },
    {
      "epoch": 0.16224568632500644,
      "grad_norm": 3.187917212328382,
      "learning_rate": 9.507854333092063e-05,
      "loss": 0.5913,
      "num_input_tokens_seen": 6614024,
      "step": 630
    },
    {
      "epoch": 0.16353335050218903,
      "grad_norm": 3.5342177024321586,
      "learning_rate": 9.497281318163346e-05,
      "loss": 0.5693,
      "num_input_tokens_seen": 6666416,
      "step": 635
    },
    {
      "epoch": 0.16482101467937163,
      "grad_norm": 3.90374612709263,
      "learning_rate": 9.486601941868154e-05,
      "loss": 0.572,
      "num_input_tokens_seen": 6718200,
      "step": 640
    },
    {
      "epoch": 0.1661086788565542,
      "grad_norm": 4.4270591027201665,
      "learning_rate": 9.475816456775313e-05,
      "loss": 0.6111,
      "num_input_tokens_seen": 6771256,
      "step": 645
    },
    {
      "epoch": 0.1673963430337368,
      "grad_norm": 5.04761388655614,
      "learning_rate": 9.464925117963133e-05,
      "loss": 0.5959,
      "num_input_tokens_seen": 6824008,
      "step": 650
    },
    {
      "epoch": 0.1673963430337368,
      "eval_loss": 0.5542036890983582,
      "eval_runtime": 68.9048,
      "eval_samples_per_second": 1.742,
      "eval_steps_per_second": 0.435,
      "num_input_tokens_seen": 6824008,
      "step": 650
    },
    {
      "epoch": 0.1686840072109194,
      "grad_norm": 3.428410481447858,
      "learning_rate": 9.453928183013385e-05,
      "loss": 0.5344,
      "num_input_tokens_seen": 6875432,
      "step": 655
    },
    {
      "epoch": 0.16997167138810199,
      "grad_norm": 2.9137495299009846,
      "learning_rate": 9.442825912005202e-05,
      "loss": 0.56,
      "num_input_tokens_seen": 6927768,
      "step": 660
    },
    {
      "epoch": 0.17125933556528458,
      "grad_norm": 4.2956604210715925,
      "learning_rate": 9.431618567508933e-05,
      "loss": 0.5701,
      "num_input_tokens_seen": 6980544,
      "step": 665
    },
    {
      "epoch": 0.17254699974246718,
      "grad_norm": 4.3977584083656405,
      "learning_rate": 9.420306414579925e-05,
      "loss": 0.5604,
      "num_input_tokens_seen": 7032584,
      "step": 670
    },
    {
      "epoch": 0.17383466391964975,
      "grad_norm": 4.48381006313936,
      "learning_rate": 9.408889720752266e-05,
      "loss": 0.5763,
      "num_input_tokens_seen": 7085048,
      "step": 675
    },
    {
      "epoch": 0.17512232809683234,
      "grad_norm": 2.189534287393346,
      "learning_rate": 9.397368756032445e-05,
      "loss": 0.5962,
      "num_input_tokens_seen": 7137952,
      "step": 680
    },
    {
      "epoch": 0.17640999227401494,
      "grad_norm": 3.34591241093722,
      "learning_rate": 9.385743792892982e-05,
      "loss": 0.5935,
      "num_input_tokens_seen": 7190584,
      "step": 685
    },
    {
      "epoch": 0.17769765645119753,
      "grad_norm": 2.7509902524242507,
      "learning_rate": 9.374015106265968e-05,
      "loss": 0.5267,
      "num_input_tokens_seen": 7243440,
      "step": 690
    },
    {
      "epoch": 0.17898532062838013,
      "grad_norm": 2.322454948468365,
      "learning_rate": 9.362182973536569e-05,
      "loss": 0.5351,
      "num_input_tokens_seen": 7295568,
      "step": 695
    },
    {
      "epoch": 0.1802729848055627,
      "grad_norm": 3.4615171229405046,
      "learning_rate": 9.35024767453647e-05,
      "loss": 0.5014,
      "num_input_tokens_seen": 7347040,
      "step": 700
    },
    {
      "epoch": 0.1802729848055627,
      "eval_loss": 0.5440100431442261,
      "eval_runtime": 39.1181,
      "eval_samples_per_second": 3.068,
      "eval_steps_per_second": 0.767,
      "num_input_tokens_seen": 7347040,
      "step": 700
    },
    {
      "epoch": 0.1815606489827453,
      "grad_norm": 4.815426816055898,
      "learning_rate": 9.338209491537257e-05,
      "loss": 0.543,
      "num_input_tokens_seen": 7399584,
      "step": 705
    },
    {
      "epoch": 0.1828483131599279,
      "grad_norm": 7.294932559918336,
      "learning_rate": 9.326068709243727e-05,
      "loss": 0.4995,
      "num_input_tokens_seen": 7452928,
      "step": 710
    },
    {
      "epoch": 0.18413597733711048,
      "grad_norm": 3.6946433405013495,
      "learning_rate": 9.313825614787177e-05,
      "loss": 0.5109,
      "num_input_tokens_seen": 7505112,
      "step": 715
    },
    {
      "epoch": 0.18542364151429308,
      "grad_norm": 4.339671310261357,
      "learning_rate": 9.301480497718593e-05,
      "loss": 0.4932,
      "num_input_tokens_seen": 7557608,
      "step": 720
    },
    {
      "epoch": 0.18671130569147568,
      "grad_norm": 11.604530853746237,
      "learning_rate": 9.289033650001817e-05,
      "loss": 0.5573,
      "num_input_tokens_seen": 7610048,
      "step": 725
    },
    {
      "epoch": 0.18799896986865824,
      "grad_norm": 5.990020165378009,
      "learning_rate": 9.276485366006634e-05,
      "loss": 0.5305,
      "num_input_tokens_seen": 7662056,
      "step": 730
    },
    {
      "epoch": 0.18928663404584084,
      "grad_norm": 4.709895983169237,
      "learning_rate": 9.263835942501807e-05,
      "loss": 0.5369,
      "num_input_tokens_seen": 7713656,
      "step": 735
    },
    {
      "epoch": 0.19057429822302344,
      "grad_norm": 4.873824727341975,
      "learning_rate": 9.251085678648072e-05,
      "loss": 0.5397,
      "num_input_tokens_seen": 7765992,
      "step": 740
    },
    {
      "epoch": 0.19186196240020603,
      "grad_norm": 3.288968567031419,
      "learning_rate": 9.238234875991046e-05,
      "loss": 0.5116,
      "num_input_tokens_seen": 7818448,
      "step": 745
    },
    {
      "epoch": 0.19314962657738863,
      "grad_norm": 4.778741391076671,
      "learning_rate": 9.225283838454111e-05,
      "loss": 0.541,
      "num_input_tokens_seen": 7870520,
      "step": 750
    },
    {
      "epoch": 0.19314962657738863,
      "eval_loss": 0.5273815989494324,
      "eval_runtime": 39.1812,
      "eval_samples_per_second": 3.063,
      "eval_steps_per_second": 0.766,
      "num_input_tokens_seen": 7870520,
      "step": 750
    },
    {
      "epoch": 0.1944372907545712,
      "grad_norm": 4.544356566141105,
      "learning_rate": 9.21223287233121e-05,
      "loss": 0.4961,
      "num_input_tokens_seen": 7922736,
      "step": 755
    },
    {
      "epoch": 0.1957249549317538,
      "grad_norm": 7.025876813077666,
      "learning_rate": 9.199082286279622e-05,
      "loss": 0.4956,
      "num_input_tokens_seen": 7975304,
      "step": 760
    },
    {
      "epoch": 0.1970126191089364,
      "grad_norm": 4.9360968239249985,
      "learning_rate": 9.185832391312644e-05,
      "loss": 0.4997,
      "num_input_tokens_seen": 8027448,
      "step": 765
    },
    {
      "epoch": 0.19830028328611898,
      "grad_norm": 10.528361984915874,
      "learning_rate": 9.172483500792244e-05,
      "loss": 0.5214,
      "num_input_tokens_seen": 8080944,
      "step": 770
    },
    {
      "epoch": 0.19958794746330158,
      "grad_norm": 9.264531258094065,
      "learning_rate": 9.159035930421658e-05,
      "loss": 0.6098,
      "num_input_tokens_seen": 8133392,
      "step": 775
    },
    {
      "epoch": 0.20087561164048418,
      "grad_norm": 1.9709167614209242,
      "learning_rate": 9.145489998237902e-05,
      "loss": 0.5046,
      "num_input_tokens_seen": 8185360,
      "step": 780
    },
    {
      "epoch": 0.20216327581766674,
      "grad_norm": 7.5915211434567595,
      "learning_rate": 9.131846024604274e-05,
      "loss": 0.5803,
      "num_input_tokens_seen": 8237672,
      "step": 785
    },
    {
      "epoch": 0.20345093999484934,
      "grad_norm": 3.251682970663388,
      "learning_rate": 9.11810433220276e-05,
      "loss": 0.5365,
      "num_input_tokens_seen": 8289688,
      "step": 790
    },
    {
      "epoch": 0.20473860417203193,
      "grad_norm": 4.341533737034294,
      "learning_rate": 9.104265246026415e-05,
      "loss": 0.5259,
      "num_input_tokens_seen": 8341624,
      "step": 795
    },
    {
      "epoch": 0.20602626834921453,
      "grad_norm": 5.463180544339495,
      "learning_rate": 9.090329093371666e-05,
      "loss": 0.5291,
      "num_input_tokens_seen": 8393696,
      "step": 800
    },
    {
      "epoch": 0.20602626834921453,
      "eval_loss": 0.5219093561172485,
      "eval_runtime": 39.7455,
      "eval_samples_per_second": 3.019,
      "eval_steps_per_second": 0.755,
      "num_input_tokens_seen": 8393696,
      "step": 800
    },
    {
      "epoch": 0.20731393252639713,
      "grad_norm": 4.254130676908817,
      "learning_rate": 9.076296203830579e-05,
      "loss": 0.5449,
      "num_input_tokens_seen": 8446496,
      "step": 805
    },
    {
      "epoch": 0.2086015967035797,
      "grad_norm": 5.6525741285524145,
      "learning_rate": 9.062166909283062e-05,
      "loss": 0.5625,
      "num_input_tokens_seen": 8499544,
      "step": 810
    },
    {
      "epoch": 0.2098892608807623,
      "grad_norm": 3.8041246225911345,
      "learning_rate": 9.047941543889014e-05,
      "loss": 0.5564,
      "num_input_tokens_seen": 8552568,
      "step": 815
    },
    {
      "epoch": 0.2111769250579449,
      "grad_norm": 3.803732280546421,
      "learning_rate": 9.033620444080428e-05,
      "loss": 0.5487,
      "num_input_tokens_seen": 8605560,
      "step": 820
    },
    {
      "epoch": 0.21246458923512748,
      "grad_norm": 2.8518948364927925,
      "learning_rate": 9.019203948553422e-05,
      "loss": 0.5719,
      "num_input_tokens_seen": 8657704,
      "step": 825
    },
    {
      "epoch": 0.21375225341231008,
      "grad_norm": 3.939376115862177,
      "learning_rate": 9.004692398260244e-05,
      "loss": 0.5235,
      "num_input_tokens_seen": 8711088,
      "step": 830
    },
    {
      "epoch": 0.21503991758949267,
      "grad_norm": 6.635912128499916,
      "learning_rate": 8.9900861364012e-05,
      "loss": 0.5566,
      "num_input_tokens_seen": 8763712,
      "step": 835
    },
    {
      "epoch": 0.21632758176667524,
      "grad_norm": 3.7547407090496687,
      "learning_rate": 8.975385508416532e-05,
      "loss": 0.482,
      "num_input_tokens_seen": 8815760,
      "step": 840
    },
    {
      "epoch": 0.21761524594385784,
      "grad_norm": 4.093006904445721,
      "learning_rate": 8.960590861978265e-05,
      "loss": 0.5046,
      "num_input_tokens_seen": 8867720,
      "step": 845
    },
    {
      "epoch": 0.21890291012104043,
      "grad_norm": 11.397392997722068,
      "learning_rate": 8.945702546981969e-05,
      "loss": 0.5063,
      "num_input_tokens_seen": 8919608,
      "step": 850
    },
    {
      "epoch": 0.21890291012104043,
      "eval_loss": 0.5525640249252319,
      "eval_runtime": 39.0469,
      "eval_samples_per_second": 3.073,
      "eval_steps_per_second": 0.768,
      "num_input_tokens_seen": 8919608,
      "step": 850
    },
    {
      "epoch": 0.22019057429822303,
      "grad_norm": 4.339535962830116,
      "learning_rate": 8.930720915538487e-05,
      "loss": 0.5853,
      "num_input_tokens_seen": 8971048,
      "step": 855
    },
    {
      "epoch": 0.22147823847540563,
      "grad_norm": 6.118436891847819,
      "learning_rate": 8.915646321965614e-05,
      "loss": 0.5534,
      "num_input_tokens_seen": 9022936,
      "step": 860
    },
    {
      "epoch": 0.2227659026525882,
      "grad_norm": 3.3997835203618667,
      "learning_rate": 8.900479122779712e-05,
      "loss": 0.5623,
      "num_input_tokens_seen": 9075336,
      "step": 865
    },
    {
      "epoch": 0.2240535668297708,
      "grad_norm": 4.188326935911128,
      "learning_rate": 8.885219676687277e-05,
      "loss": 0.5561,
      "num_input_tokens_seen": 9127688,
      "step": 870
    },
    {
      "epoch": 0.22534123100695339,
      "grad_norm": 5.220175192497493,
      "learning_rate": 8.869868344576459e-05,
      "loss": 0.5449,
      "num_input_tokens_seen": 9180624,
      "step": 875
    },
    {
      "epoch": 0.22662889518413598,
      "grad_norm": 2.2022914161050577,
      "learning_rate": 8.854425489508532e-05,
      "loss": 0.5062,
      "num_input_tokens_seen": 9233176,
      "step": 880
    },
    {
      "epoch": 0.22791655936131858,
      "grad_norm": 4.62379059067999,
      "learning_rate": 8.838891476709288e-05,
      "loss": 0.5033,
      "num_input_tokens_seen": 9286688,
      "step": 885
    },
    {
      "epoch": 0.22920422353850115,
      "grad_norm": 3.639684630492015,
      "learning_rate": 8.823266673560426e-05,
      "loss": 0.4845,
      "num_input_tokens_seen": 9339600,
      "step": 890
    },
    {
      "epoch": 0.23049188771568374,
      "grad_norm": 4.131757647310936,
      "learning_rate": 8.807551449590846e-05,
      "loss": 0.5595,
      "num_input_tokens_seen": 9391536,
      "step": 895
    },
    {
      "epoch": 0.23177955189286634,
      "grad_norm": 4.771128685196347,
      "learning_rate": 8.791746176467907e-05,
      "loss": 0.5251,
      "num_input_tokens_seen": 9443616,
      "step": 900
    },
    {
      "epoch": 0.23177955189286634,
      "eval_loss": 0.49604204297065735,
      "eval_runtime": 39.5289,
      "eval_samples_per_second": 3.036,
      "eval_steps_per_second": 0.759,
      "num_input_tokens_seen": 9443616,
      "step": 900
    },
    {
      "epoch": 0.23306721607004893,
      "grad_norm": 6.849781513397169,
      "learning_rate": 8.775851227988656e-05,
      "loss": 0.5774,
      "num_input_tokens_seen": 9497304,
      "step": 905
    },
    {
      "epoch": 0.23435488024723153,
      "grad_norm": 2.526801567699946,
      "learning_rate": 8.759866980070963e-05,
      "loss": 0.5441,
      "num_input_tokens_seen": 9549416,
      "step": 910
    },
    {
      "epoch": 0.23564254442441412,
      "grad_norm": 3.1008408808291503,
      "learning_rate": 8.743793810744654e-05,
      "loss": 0.4898,
      "num_input_tokens_seen": 9601800,
      "step": 915
    },
    {
      "epoch": 0.2369302086015967,
      "grad_norm": 4.120824184689494,
      "learning_rate": 8.727632100142551e-05,
      "loss": 0.4681,
      "num_input_tokens_seen": 9653600,
      "step": 920
    },
    {
      "epoch": 0.2382178727787793,
      "grad_norm": 5.251488809494114,
      "learning_rate": 8.711382230491493e-05,
      "loss": 0.4946,
      "num_input_tokens_seen": 9707224,
      "step": 925
    },
    {
      "epoch": 0.23950553695596188,
      "grad_norm": 6.885034741125289,
      "learning_rate": 8.695044586103296e-05,
      "loss": 0.5517,
      "num_input_tokens_seen": 9760096,
      "step": 930
    },
    {
      "epoch": 0.24079320113314448,
      "grad_norm": 4.6246077239626855,
      "learning_rate": 8.678619553365659e-05,
      "loss": 0.6064,
      "num_input_tokens_seen": 9812672,
      "step": 935
    },
    {
      "epoch": 0.24208086531032708,
      "grad_norm": 5.621020693846077,
      "learning_rate": 8.662107520733027e-05,
      "loss": 0.5398,
      "num_input_tokens_seen": 9866200,
      "step": 940
    },
    {
      "epoch": 0.24336852948750964,
      "grad_norm": 3.1921985322817092,
      "learning_rate": 8.64550887871741e-05,
      "loss": 0.5068,
      "num_input_tokens_seen": 9918160,
      "step": 945
    },
    {
      "epoch": 0.24465619366469224,
      "grad_norm": 2.3689648161336465,
      "learning_rate": 8.628824019879137e-05,
      "loss": 0.5862,
      "num_input_tokens_seen": 9970600,
      "step": 950
    },
    {
      "epoch": 0.24465619366469224,
      "eval_loss": 0.5085262656211853,
      "eval_runtime": 39.0437,
      "eval_samples_per_second": 3.073,
      "eval_steps_per_second": 0.768,
      "num_input_tokens_seen": 9970600,
      "step": 950
    },
    {
      "epoch": 0.24594385784187484,
      "grad_norm": 2.8827978223065363,
      "learning_rate": 8.612053338817581e-05,
      "loss": 0.4549,
      "num_input_tokens_seen": 10022248,
      "step": 955
    },
    {
      "epoch": 0.24723152201905743,
      "grad_norm": 6.662877258417003,
      "learning_rate": 8.595197232161824e-05,
      "loss": 0.4791,
      "num_input_tokens_seen": 10075280,
      "step": 960
    },
    {
      "epoch": 0.24851918619624003,
      "grad_norm": 8.140970355143077,
      "learning_rate": 8.578256098561275e-05,
      "loss": 0.4833,
      "num_input_tokens_seen": 10128392,
      "step": 965
    },
    {
      "epoch": 0.24980685037342262,
      "grad_norm": 3.243184767888501,
      "learning_rate": 8.561230338676239e-05,
      "loss": 0.4672,
      "num_input_tokens_seen": 10180720,
      "step": 970
    },
    {
      "epoch": 0.2510945145506052,
      "grad_norm": 6.588760068173114,
      "learning_rate": 8.544120355168451e-05,
      "loss": 0.5205,
      "num_input_tokens_seen": 10233256,
      "step": 975
    },
    {
      "epoch": 0.2523821787277878,
      "grad_norm": 2.6240987196110837,
      "learning_rate": 8.526926552691544e-05,
      "loss": 0.5124,
      "num_input_tokens_seen": 10284928,
      "step": 980
    },
    {
      "epoch": 0.2536698429049704,
      "grad_norm": 8.242761558538728,
      "learning_rate": 8.509649337881483e-05,
      "loss": 0.5034,
      "num_input_tokens_seen": 10338208,
      "step": 985
    },
    {
      "epoch": 0.254957507082153,
      "grad_norm": 8.922137566500533,
      "learning_rate": 8.492289119346943e-05,
      "loss": 0.5226,
      "num_input_tokens_seen": 10390224,
      "step": 990
    },
    {
      "epoch": 0.25624517125933555,
      "grad_norm": 4.922275874717211,
      "learning_rate": 8.474846307659658e-05,
      "loss": 0.5399,
      "num_input_tokens_seen": 10443080,
      "step": 995
    },
    {
      "epoch": 0.25753283543651817,
      "grad_norm": 6.866585614783304,
      "learning_rate": 8.457321315344694e-05,
      "loss": 0.483,
      "num_input_tokens_seen": 10495592,
      "step": 1000
    },
    {
      "epoch": 0.25753283543651817,
      "eval_loss": 0.5305114388465881,
      "eval_runtime": 38.9297,
      "eval_samples_per_second": 3.082,
      "eval_steps_per_second": 0.771,
      "num_input_tokens_seen": 10495592,
      "step": 1000
    },
    {
      "epoch": 0.25882049961370074,
      "grad_norm": 8.233033578002926,
      "learning_rate": 8.439714556870704e-05,
      "loss": 0.568,
      "num_input_tokens_seen": 10548136,
      "step": 1005
    },
    {
      "epoch": 0.26010816379088336,
      "grad_norm": 5.3701298824478485,
      "learning_rate": 8.422026448640124e-05,
      "loss": 0.4335,
      "num_input_tokens_seen": 10600048,
      "step": 1010
    },
    {
      "epoch": 0.26139582796806593,
      "grad_norm": 5.491882026124958,
      "learning_rate": 8.40425740897932e-05,
      "loss": 0.5385,
      "num_input_tokens_seen": 10652160,
      "step": 1015
    },
    {
      "epoch": 0.2626834921452485,
      "grad_norm": 5.479941792055548,
      "learning_rate": 8.386407858128706e-05,
      "loss": 0.5171,
      "num_input_tokens_seen": 10705208,
      "step": 1020
    },
    {
      "epoch": 0.2639711563224311,
      "grad_norm": 3.489116106033337,
      "learning_rate": 8.368478218232787e-05,
      "loss": 0.5201,
      "num_input_tokens_seen": 10758688,
      "step": 1025
    },
    {
      "epoch": 0.2652588204996137,
      "grad_norm": 5.923123692460237,
      "learning_rate": 8.350468913330192e-05,
      "loss": 0.5521,
      "num_input_tokens_seen": 10811408,
      "step": 1030
    },
    {
      "epoch": 0.2665464846767963,
      "grad_norm": 2.7605406738569824,
      "learning_rate": 8.33238036934364e-05,
      "loss": 0.4938,
      "num_input_tokens_seen": 10864144,
      "step": 1035
    },
    {
      "epoch": 0.2678341488539789,
      "grad_norm": 5.500647711838314,
      "learning_rate": 8.31421301406986e-05,
      "loss": 0.4828,
      "num_input_tokens_seen": 10916952,
      "step": 1040
    },
    {
      "epoch": 0.26912181303116145,
      "grad_norm": 6.823855575342733,
      "learning_rate": 8.29596727716949e-05,
      "loss": 0.5491,
      "num_input_tokens_seen": 10968824,
      "step": 1045
    },
    {
      "epoch": 0.2704094772083441,
      "grad_norm": 5.409054743152559,
      "learning_rate": 8.277643590156894e-05,
      "loss": 0.4628,
      "num_input_tokens_seen": 11021656,
      "step": 1050
    },
    {
      "epoch": 0.2704094772083441,
      "eval_loss": 0.5039986371994019,
      "eval_runtime": 40.3009,
      "eval_samples_per_second": 2.978,
      "eval_steps_per_second": 0.744,
      "num_input_tokens_seen": 11021656,
      "step": 1050
    },
    {
      "epoch": 0.27169714138552664,
      "grad_norm": 3.2588151986321994,
      "learning_rate": 8.259242386389973e-05,
      "loss": 0.4586,
      "num_input_tokens_seen": 11074336,
      "step": 1055
    },
    {
      "epoch": 0.27298480556270927,
      "grad_norm": 12.995641199019554,
      "learning_rate": 8.240764101059912e-05,
      "loss": 0.4939,
      "num_input_tokens_seen": 11126776,
      "step": 1060
    },
    {
      "epoch": 0.27427246973989183,
      "grad_norm": 8.713479932798109,
      "learning_rate": 8.222209171180883e-05,
      "loss": 0.4978,
      "num_input_tokens_seen": 11179680,
      "step": 1065
    },
    {
      "epoch": 0.2755601339170744,
      "grad_norm": 3.6728132957332016,
      "learning_rate": 8.203578035579715e-05,
      "loss": 0.5695,
      "num_input_tokens_seen": 11231616,
      "step": 1070
    },
    {
      "epoch": 0.276847798094257,
      "grad_norm": 9.661110166832387,
      "learning_rate": 8.184871134885513e-05,
      "loss": 0.4635,
      "num_input_tokens_seen": 11283720,
      "step": 1075
    },
    {
      "epoch": 0.2781354622714396,
      "grad_norm": 5.4096015474623576,
      "learning_rate": 8.166088911519235e-05,
      "loss": 0.4974,
      "num_input_tokens_seen": 11336144,
      "step": 1080
    },
    {
      "epoch": 0.2794231264486222,
      "grad_norm": 5.353663008589148,
      "learning_rate": 8.147231809683236e-05,
      "loss": 0.4439,
      "num_input_tokens_seen": 11389128,
      "step": 1085
    },
    {
      "epoch": 0.2807107906258048,
      "grad_norm": 3.863008112890598,
      "learning_rate": 8.128300275350756e-05,
      "loss": 0.4368,
      "num_input_tokens_seen": 11441864,
      "step": 1090
    },
    {
      "epoch": 0.2819984548029874,
      "grad_norm": 5.545035623030093,
      "learning_rate": 8.109294756255375e-05,
      "loss": 0.4895,
      "num_input_tokens_seen": 11494880,
      "step": 1095
    },
    {
      "epoch": 0.28328611898017,
      "grad_norm": 5.124762488175073,
      "learning_rate": 8.090215701880419e-05,
      "loss": 0.4825,
      "num_input_tokens_seen": 11547008,
      "step": 1100
    },
    {
      "epoch": 0.28328611898017,
      "eval_loss": 0.4798590838909149,
      "eval_runtime": 40.6942,
      "eval_samples_per_second": 2.949,
      "eval_steps_per_second": 0.737,
      "num_input_tokens_seen": 11547008,
      "step": 1100
    },
    {
      "epoch": 0.28457378315735254,
      "grad_norm": 11.308296783543483,
      "learning_rate": 8.07106356344834e-05,
      "loss": 0.4927,
      "num_input_tokens_seen": 11600032,
      "step": 1105
    },
    {
      "epoch": 0.28586144733453517,
      "grad_norm": 4.902660398367944,
      "learning_rate": 8.051838793910038e-05,
      "loss": 0.4353,
      "num_input_tokens_seen": 11652120,
      "step": 1110
    },
    {
      "epoch": 0.28714911151171774,
      "grad_norm": 4.185631754620407,
      "learning_rate": 8.032541847934146e-05,
      "loss": 0.4891,
      "num_input_tokens_seen": 11705184,
      "step": 1115
    },
    {
      "epoch": 0.28843677568890036,
      "grad_norm": 6.049695709018542,
      "learning_rate": 8.013173181896283e-05,
      "loss": 0.4497,
      "num_input_tokens_seen": 11758032,
      "step": 1120
    },
    {
      "epoch": 0.28972443986608293,
      "grad_norm": 4.598736726589848,
      "learning_rate": 7.993733253868256e-05,
      "loss": 0.4927,
      "num_input_tokens_seen": 11810736,
      "step": 1125
    },
    {
      "epoch": 0.2910121040432655,
      "grad_norm": 41.010822412039396,
      "learning_rate": 7.974222523607236e-05,
      "loss": 0.4853,
      "num_input_tokens_seen": 11863152,
      "step": 1130
    },
    {
      "epoch": 0.2922997682204481,
      "grad_norm": 5.591270811303827,
      "learning_rate": 7.954641452544865e-05,
      "loss": 0.4458,
      "num_input_tokens_seen": 11914536,
      "step": 1135
    },
    {
      "epoch": 0.2935874323976307,
      "grad_norm": 4.526048407550314,
      "learning_rate": 7.934990503776363e-05,
      "loss": 0.3976,
      "num_input_tokens_seen": 11966064,
      "step": 1140
    },
    {
      "epoch": 0.2948750965748133,
      "grad_norm": 4.778105875378293,
      "learning_rate": 7.915270142049566e-05,
      "loss": 0.508,
      "num_input_tokens_seen": 12018928,
      "step": 1145
    },
    {
      "epoch": 0.2961627607519959,
      "grad_norm": 8.075837130866274,
      "learning_rate": 7.89548083375394e-05,
      "loss": 0.4553,
      "num_input_tokens_seen": 12071088,
      "step": 1150
    },
    {
      "epoch": 0.2961627607519959,
      "eval_loss": 0.45381438732147217,
      "eval_runtime": 38.3303,
      "eval_samples_per_second": 3.131,
      "eval_steps_per_second": 0.783,
      "num_input_tokens_seen": 12071088,
      "step": 1150
    },
    {
      "epoch": 0.29745042492917845,
      "grad_norm": 5.66991445612284,
      "learning_rate": 7.875623046909544e-05,
      "loss": 0.4192,
      "num_input_tokens_seen": 12122128,
      "step": 1155
    },
    {
      "epoch": 0.29873808910636107,
      "grad_norm": 11.08291356725024,
      "learning_rate": 7.855697251155967e-05,
      "loss": 0.433,
      "num_input_tokens_seen": 12174288,
      "step": 1160
    },
    {
      "epoch": 0.30002575328354364,
      "grad_norm": 8.191495602021662,
      "learning_rate": 7.835703917741212e-05,
      "loss": 0.4817,
      "num_input_tokens_seen": 12227008,
      "step": 1165
    },
    {
      "epoch": 0.30131341746072626,
      "grad_norm": 7.763763600628314,
      "learning_rate": 7.81564351951057e-05,
      "loss": 0.485,
      "num_input_tokens_seen": 12280168,
      "step": 1170
    },
    {
      "epoch": 0.30260108163790883,
      "grad_norm": 5.347838532189795,
      "learning_rate": 7.795516530895414e-05,
      "loss": 0.4532,
      "num_input_tokens_seen": 12333072,
      "step": 1175
    },
    {
      "epoch": 0.3038887458150914,
      "grad_norm": 7.959591215701365,
      "learning_rate": 7.775323427901993e-05,
      "loss": 0.4643,
      "num_input_tokens_seen": 12386208,
      "step": 1180
    },
    {
      "epoch": 0.305176409992274,
      "grad_norm": 6.676689561663868,
      "learning_rate": 7.755064688100171e-05,
      "loss": 0.4577,
      "num_input_tokens_seen": 12439304,
      "step": 1185
    },
    {
      "epoch": 0.3064640741694566,
      "grad_norm": 6.976246725003336,
      "learning_rate": 7.734740790612136e-05,
      "loss": 0.4666,
      "num_input_tokens_seen": 12491360,
      "step": 1190
    },
    {
      "epoch": 0.3077517383466392,
      "grad_norm": 6.034570050567919,
      "learning_rate": 7.714352216101055e-05,
      "loss": 0.407,
      "num_input_tokens_seen": 12544264,
      "step": 1195
    },
    {
      "epoch": 0.3090394025238218,
      "grad_norm": 4.583037231101643,
      "learning_rate": 7.693899446759727e-05,
      "loss": 0.454,
      "num_input_tokens_seen": 12596208,
      "step": 1200
    },
    {
      "epoch": 0.3090394025238218,
      "eval_loss": 0.49250805377960205,
      "eval_runtime": 38.6863,
      "eval_samples_per_second": 3.102,
      "eval_steps_per_second": 0.775,
      "num_input_tokens_seen": 12596208,
      "step": 1200
    },
    {
      "epoch": 0.31032706670100435,
      "grad_norm": 4.0964966925406365,
      "learning_rate": 7.673382966299163e-05,
      "loss": 0.5226,
      "num_input_tokens_seen": 12648936,
      "step": 1205
    },
    {
      "epoch": 0.311614730878187,
      "grad_norm": 7.87992303723905,
      "learning_rate": 7.65280325993715e-05,
      "loss": 0.4757,
      "num_input_tokens_seen": 12702432,
      "step": 1210
    },
    {
      "epoch": 0.31290239505536954,
      "grad_norm": 6.822793875901239,
      "learning_rate": 7.63216081438678e-05,
      "loss": 0.451,
      "num_input_tokens_seen": 12755128,
      "step": 1215
    },
    {
      "epoch": 0.31419005923255217,
      "grad_norm": 8.804840574778536,
      "learning_rate": 7.611456117844934e-05,
      "loss": 0.4155,
      "num_input_tokens_seen": 12808152,
      "step": 1220
    },
    {
      "epoch": 0.31547772340973473,
      "grad_norm": 12.832933509895003,
      "learning_rate": 7.59068965998074e-05,
      "loss": 0.4094,
      "num_input_tokens_seen": 12861592,
      "step": 1225
    },
    {
      "epoch": 0.31676538758691736,
      "grad_norm": 3.769639586972444,
      "learning_rate": 7.569861931923989e-05,
      "loss": 0.4663,
      "num_input_tokens_seen": 12914240,
      "step": 1230
    },
    {
      "epoch": 0.3180530517640999,
      "grad_norm": 5.011688667303979,
      "learning_rate": 7.548973426253521e-05,
      "loss": 0.468,
      "num_input_tokens_seen": 12967472,
      "step": 1235
    },
    {
      "epoch": 0.3193407159412825,
      "grad_norm": 5.925703481508644,
      "learning_rate": 7.528024636985575e-05,
      "loss": 0.4744,
      "num_input_tokens_seen": 13020232,
      "step": 1240
    },
    {
      "epoch": 0.3206283801184651,
      "grad_norm": 3.511846132089351,
      "learning_rate": 7.507016059562107e-05,
      "loss": 0.4269,
      "num_input_tokens_seen": 13073032,
      "step": 1245
    },
    {
      "epoch": 0.3219160442956477,
      "grad_norm": 6.878508053492975,
      "learning_rate": 7.485948190839077e-05,
      "loss": 0.4725,
      "num_input_tokens_seen": 13125624,
      "step": 1250
    },
    {
      "epoch": 0.3219160442956477,
      "eval_loss": 0.4339977502822876,
      "eval_runtime": 39.1132,
      "eval_samples_per_second": 3.068,
      "eval_steps_per_second": 0.767,
      "num_input_tokens_seen": 13125624,
      "step": 1250
    },
    {
      "epoch": 0.3232037084728303,
      "grad_norm": 3.2225418900054184,
      "learning_rate": 7.464821529074679e-05,
      "loss": 0.4196,
      "num_input_tokens_seen": 13178656,
      "step": 1255
    },
    {
      "epoch": 0.3244913726500129,
      "grad_norm": 5.7056125199065475,
      "learning_rate": 7.443636573917585e-05,
      "loss": 0.4349,
      "num_input_tokens_seen": 13231224,
      "step": 1260
    },
    {
      "epoch": 0.32577903682719545,
      "grad_norm": 3.1679429520474587,
      "learning_rate": 7.422393826395108e-05,
      "loss": 0.4726,
      "num_input_tokens_seen": 13283208,
      "step": 1265
    },
    {
      "epoch": 0.32706670100437807,
      "grad_norm": 5.409673500894723,
      "learning_rate": 7.40109378890136e-05,
      "loss": 0.4604,
      "num_input_tokens_seen": 13335808,
      "step": 1270
    },
    {
      "epoch": 0.32835436518156064,
      "grad_norm": 6.011303613930208,
      "learning_rate": 7.379736965185368e-05,
      "loss": 0.4606,
      "num_input_tokens_seen": 13389112,
      "step": 1275
    },
    {
      "epoch": 0.32964202935874326,
      "grad_norm": 11.490498301960598,
      "learning_rate": 7.358323860339165e-05,
      "loss": 0.4487,
      "num_input_tokens_seen": 13441816,
      "step": 1280
    },
    {
      "epoch": 0.33092969353592583,
      "grad_norm": 8.761206465870922,
      "learning_rate": 7.336854980785839e-05,
      "loss": 0.422,
      "num_input_tokens_seen": 13493592,
      "step": 1285
    },
    {
      "epoch": 0.3322173577131084,
      "grad_norm": 8.457687965106274,
      "learning_rate": 7.315330834267553e-05,
      "loss": 0.5397,
      "num_input_tokens_seen": 13545696,
      "step": 1290
    },
    {
      "epoch": 0.333505021890291,
      "grad_norm": 6.1852361009354295,
      "learning_rate": 7.293751929833553e-05,
      "loss": 0.5022,
      "num_input_tokens_seen": 13597560,
      "step": 1295
    },
    {
      "epoch": 0.3347926860674736,
      "grad_norm": 3.157280649859201,
      "learning_rate": 7.272118777828108e-05,
      "loss": 0.4794,
      "num_input_tokens_seen": 13650264,
      "step": 1300
    },
    {
      "epoch": 0.3347926860674736,
      "eval_loss": 0.4991846978664398,
      "eval_runtime": 38.2504,
      "eval_samples_per_second": 3.137,
      "eval_steps_per_second": 0.784,
      "num_input_tokens_seen": 13650264,
      "step": 1300
    },
    {
      "epoch": 0.3360803502446562,
      "grad_norm": 6.386835645613503,
      "learning_rate": 7.250431889878455e-05,
      "loss": 0.4971,
      "num_input_tokens_seen": 13702584,
      "step": 1305
    },
    {
      "epoch": 0.3373680144218388,
      "grad_norm": 4.797592029689297,
      "learning_rate": 7.228691778882693e-05,
      "loss": 0.4574,
      "num_input_tokens_seen": 13755024,
      "step": 1310
    },
    {
      "epoch": 0.33865567859902135,
      "grad_norm": 3.659831343491765,
      "learning_rate": 7.20689895899765e-05,
      "loss": 0.4463,
      "num_input_tokens_seen": 13807528,
      "step": 1315
    },
    {
      "epoch": 0.33994334277620397,
      "grad_norm": 8.104230440489859,
      "learning_rate": 7.185053945626733e-05,
      "loss": 0.4549,
      "num_input_tokens_seen": 13859760,
      "step": 1320
    },
    {
      "epoch": 0.34123100695338654,
      "grad_norm": 4.000749012853666,
      "learning_rate": 7.163157255407732e-05,
      "loss": 0.4073,
      "num_input_tokens_seen": 13911656,
      "step": 1325
    },
    {
      "epoch": 0.34251867113056916,
      "grad_norm": 4.431361614574065,
      "learning_rate": 7.141209406200599e-05,
      "loss": 0.433,
      "num_input_tokens_seen": 13963816,
      "step": 1330
    },
    {
      "epoch": 0.34380633530775173,
      "grad_norm": 3.9352317738395635,
      "learning_rate": 7.1192109170752e-05,
      "loss": 0.4244,
      "num_input_tokens_seen": 14016256,
      "step": 1335
    },
    {
      "epoch": 0.34509399948493436,
      "grad_norm": 4.571632866024196,
|
"learning_rate": 7.097162308299054e-05, |
|
"loss": 0.4448, |
|
"num_input_tokens_seen": 14068768, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.3463816636621169, |
|
"grad_norm": 4.2711556426666375, |
|
"learning_rate": 7.07506410132501e-05, |
|
"loss": 0.4608, |
|
"num_input_tokens_seen": 14121272, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 0.3476693278392995, |
|
"grad_norm": 4.49067434213006, |
|
"learning_rate": 7.052916818778918e-05, |
|
"loss": 0.3994, |
|
"num_input_tokens_seen": 14173240, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.3476693278392995, |
|
"eval_loss": 0.460835725069046, |
|
"eval_runtime": 38.3552, |
|
"eval_samples_per_second": 3.129, |
|
"eval_steps_per_second": 0.782, |
|
"num_input_tokens_seen": 14173240, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.3489569920164821, |
|
"grad_norm": 6.100571377010892, |
|
"learning_rate": 7.030720984447279e-05, |
|
"loss": 0.41, |
|
"num_input_tokens_seen": 14226032, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 0.3502446561936647, |
|
"grad_norm": 3.531812694789996, |
|
"learning_rate": 7.008477123264848e-05, |
|
"loss": 0.3751, |
|
"num_input_tokens_seen": 14278128, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.3515323203708473, |
|
"grad_norm": 13.528736327050117, |
|
"learning_rate": 6.986185761302224e-05, |
|
"loss": 0.4814, |
|
"num_input_tokens_seen": 14330624, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 0.3528199845480299, |
|
"grad_norm": 6.2453361475565305, |
|
"learning_rate": 6.963847425753403e-05, |
|
"loss": 0.5007, |
|
"num_input_tokens_seen": 14382416, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.35410764872521244, |
|
"grad_norm": 3.5868157849734925, |
|
"learning_rate": 6.941462644923318e-05, |
|
"loss": 0.4335, |
|
"num_input_tokens_seen": 14434896, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 0.35539531290239507, |
|
"grad_norm": 7.0930284762784925, |
|
"learning_rate": 6.919031948215335e-05, |
|
"loss": 0.4427, |
|
"num_input_tokens_seen": 14487152, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.35668297707957763, |
|
"grad_norm": 1.8673746248959853, |
|
"learning_rate": 6.896555866118741e-05, |
|
"loss": 0.42, |
|
"num_input_tokens_seen": 14539608, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 0.35797064125676026, |
|
"grad_norm": 3.29378340171418, |
|
"learning_rate": 6.87403493019619e-05, |
|
"loss": 0.4573, |
|
"num_input_tokens_seen": 14592168, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.3592583054339428, |
|
"grad_norm": 4.710051493913417, |
|
"learning_rate": 6.851469673071143e-05, |
|
"loss": 0.4341, |
|
"num_input_tokens_seen": 14643920, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 0.3605459696111254, |
|
"grad_norm": 5.46737560287727, |
|
"learning_rate": 6.828860628415253e-05, |
|
"loss": 0.437, |
|
"num_input_tokens_seen": 14697136, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.3605459696111254, |
|
"eval_loss": 0.46620962023735046, |
|
"eval_runtime": 38.4197, |
|
"eval_samples_per_second": 3.123, |
|
"eval_steps_per_second": 0.781, |
|
"num_input_tokens_seen": 14697136, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.361833633788308, |
|
"grad_norm": 5.6011715346425355, |
|
"learning_rate": 6.806208330935766e-05, |
|
"loss": 0.4377, |
|
"num_input_tokens_seen": 14749168, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 0.3631212979654906, |
|
"grad_norm": 8.725023519965001, |
|
"learning_rate": 6.783513316362855e-05, |
|
"loss": 0.412, |
|
"num_input_tokens_seen": 14801568, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.3644089621426732, |
|
"grad_norm": 8.12664534705471, |
|
"learning_rate": 6.760776121436962e-05, |
|
"loss": 0.4441, |
|
"num_input_tokens_seen": 14853384, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 0.3656966263198558, |
|
"grad_norm": 3.5568354734329244, |
|
"learning_rate": 6.737997283896103e-05, |
|
"loss": 0.4576, |
|
"num_input_tokens_seen": 14906632, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.36698429049703835, |
|
"grad_norm": 2.9816566580274007, |
|
"learning_rate": 6.715177342463145e-05, |
|
"loss": 0.3853, |
|
"num_input_tokens_seen": 14959240, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 0.36827195467422097, |
|
"grad_norm": 9.270651786172323, |
|
"learning_rate": 6.692316836833065e-05, |
|
"loss": 0.3755, |
|
"num_input_tokens_seen": 15012256, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.36955961885140354, |
|
"grad_norm": 7.022055493979997, |
|
"learning_rate": 6.6694163076602e-05, |
|
"loss": 0.5384, |
|
"num_input_tokens_seen": 15064664, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 0.37084728302858616, |
|
"grad_norm": 3.764454647275643, |
|
"learning_rate": 6.646476296545434e-05, |
|
"loss": 0.4377, |
|
"num_input_tokens_seen": 15117384, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.37213494720576873, |
|
"grad_norm": 5.3073636057406794, |
|
"learning_rate": 6.623497346023418e-05, |
|
"loss": 0.3876, |
|
"num_input_tokens_seen": 15169880, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 0.37342261138295135, |
|
"grad_norm": 3.8443265684988392, |
|
"learning_rate": 6.60047999954972e-05, |
|
"loss": 0.4065, |
|
"num_input_tokens_seen": 15222568, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.37342261138295135, |
|
"eval_loss": 0.4395444095134735, |
|
"eval_runtime": 38.336, |
|
"eval_samples_per_second": 3.13, |
|
"eval_steps_per_second": 0.783, |
|
"num_input_tokens_seen": 15222568, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.3747102755601339, |
|
"grad_norm": 8.614661225033187, |
|
"learning_rate": 6.57742480148798e-05, |
|
"loss": 0.4231, |
|
"num_input_tokens_seen": 15275288, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 0.3759979397373165, |
|
"grad_norm": 3.107561516867378, |
|
"learning_rate": 6.554332297097031e-05, |
|
"loss": 0.4301, |
|
"num_input_tokens_seen": 15328072, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.3772856039144991, |
|
"grad_norm": 2.9024892391048867, |
|
"learning_rate": 6.53120303251801e-05, |
|
"loss": 0.446, |
|
"num_input_tokens_seen": 15379120, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 0.3785732680916817, |
|
"grad_norm": 2.7506997409330105, |
|
"learning_rate": 6.508037554761432e-05, |
|
"loss": 0.3764, |
|
"num_input_tokens_seen": 15431104, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.3798609322688643, |
|
"grad_norm": 5.7118625908326734, |
|
"learning_rate": 6.484836411694267e-05, |
|
"loss": 0.4423, |
|
"num_input_tokens_seen": 15482816, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 0.3811485964460469, |
|
"grad_norm": 4.701095405963631, |
|
"learning_rate": 6.461600152026965e-05, |
|
"loss": 0.4439, |
|
"num_input_tokens_seen": 15534896, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.38243626062322944, |
|
"grad_norm": 5.574717716204205, |
|
"learning_rate": 6.438329325300499e-05, |
|
"loss": 0.4408, |
|
"num_input_tokens_seen": 15587496, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 0.38372392480041206, |
|
"grad_norm": 4.6497322752918, |
|
"learning_rate": 6.415024481873352e-05, |
|
"loss": 0.4086, |
|
"num_input_tokens_seen": 15639672, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.38501158897759463, |
|
"grad_norm": 5.427307211472868, |
|
"learning_rate": 6.391686172908506e-05, |
|
"loss": 0.4489, |
|
"num_input_tokens_seen": 15693120, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 0.38629925315477726, |
|
"grad_norm": 5.005547973733715, |
|
"learning_rate": 6.368314950360415e-05, |
|
"loss": 0.4338, |
|
"num_input_tokens_seen": 15744848, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.38629925315477726, |
|
"eval_loss": 0.45475366711616516, |
|
"eval_runtime": 38.3957, |
|
"eval_samples_per_second": 3.125, |
|
"eval_steps_per_second": 0.781, |
|
"num_input_tokens_seen": 15744848, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.3875869173319598, |
|
"grad_norm": 5.097132399629058, |
|
"learning_rate": 6.344911366961934e-05, |
|
"loss": 0.4558, |
|
"num_input_tokens_seen": 15797632, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 0.3888745815091424, |
|
"grad_norm": 4.502325593575991, |
|
"learning_rate": 6.321475976211266e-05, |
|
"loss": 0.4518, |
|
"num_input_tokens_seen": 15850040, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.390162245686325, |
|
"grad_norm": 6.425152572566654, |
|
"learning_rate": 6.298009332358856e-05, |
|
"loss": 0.4092, |
|
"num_input_tokens_seen": 15902496, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 0.3914499098635076, |
|
"grad_norm": 3.968135032555422, |
|
"learning_rate": 6.274511990394294e-05, |
|
"loss": 0.478, |
|
"num_input_tokens_seen": 15954936, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.3927375740406902, |
|
"grad_norm": 4.636757769906518, |
|
"learning_rate": 6.250984506033183e-05, |
|
"loss": 0.4294, |
|
"num_input_tokens_seen": 16007624, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 0.3940252382178728, |
|
"grad_norm": 2.7967900169696347, |
|
"learning_rate": 6.227427435703997e-05, |
|
"loss": 0.3846, |
|
"num_input_tokens_seen": 16059440, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.39531290239505534, |
|
"grad_norm": 2.983520749639549, |
|
"learning_rate": 6.203841336534924e-05, |
|
"loss": 0.4372, |
|
"num_input_tokens_seen": 16111136, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 0.39660056657223797, |
|
"grad_norm": 8.364510466670477, |
|
"learning_rate": 6.180226766340688e-05, |
|
"loss": 0.484, |
|
"num_input_tokens_seen": 16163976, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.39788823074942054, |
|
"grad_norm": 4.45878743373729, |
|
"learning_rate": 6.156584283609359e-05, |
|
"loss": 0.3965, |
|
"num_input_tokens_seen": 16217192, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 0.39917589492660316, |
|
"grad_norm": 2.6831990995391717, |
|
"learning_rate": 6.132914447489137e-05, |
|
"loss": 0.3872, |
|
"num_input_tokens_seen": 16269896, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.39917589492660316, |
|
"eval_loss": 0.4416767656803131, |
|
"eval_runtime": 38.4671, |
|
"eval_samples_per_second": 3.12, |
|
"eval_steps_per_second": 0.78, |
|
"num_input_tokens_seen": 16269896, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.4004635591037857, |
|
"grad_norm": 4.920079251827062, |
|
"learning_rate": 6.109217817775139e-05, |
|
"loss": 0.4593, |
|
"num_input_tokens_seen": 16322496, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 0.40175122328096835, |
|
"grad_norm": 9.068094163618136, |
|
"learning_rate": 6.085494954896156e-05, |
|
"loss": 0.4865, |
|
"num_input_tokens_seen": 16375320, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.4030388874581509, |
|
"grad_norm": 9.316944070527988, |
|
"learning_rate": 6.061746419901388e-05, |
|
"loss": 0.4422, |
|
"num_input_tokens_seen": 16428096, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 0.4043265516353335, |
|
"grad_norm": 2.4617418860122213, |
|
"learning_rate": 6.0379727744471936e-05, |
|
"loss": 0.3538, |
|
"num_input_tokens_seen": 16480832, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.4056142158125161, |
|
"grad_norm": 5.028400110331736, |
|
"learning_rate": 6.014174580783794e-05, |
|
"loss": 0.3923, |
|
"num_input_tokens_seen": 16534016, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 0.4069018799896987, |
|
"grad_norm": 6.638266454273257, |
|
"learning_rate": 5.990352401741981e-05, |
|
"loss": 0.3967, |
|
"num_input_tokens_seen": 16586216, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.4081895441668813, |
|
"grad_norm": 6.928848680437489, |
|
"learning_rate": 5.9665068007197976e-05, |
|
"loss": 0.4212, |
|
"num_input_tokens_seen": 16639312, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 0.40947720834406387, |
|
"grad_norm": 4.2324092477507005, |
|
"learning_rate": 5.94263834166923e-05, |
|
"loss": 0.3489, |
|
"num_input_tokens_seen": 16692328, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.41076487252124644, |
|
"grad_norm": 5.607976113391715, |
|
"learning_rate": 5.918747589082853e-05, |
|
"loss": 0.4105, |
|
"num_input_tokens_seen": 16745088, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 0.41205253669842906, |
|
"grad_norm": 5.155332109104381, |
|
"learning_rate": 5.8948351079804875e-05, |
|
"loss": 0.3914, |
|
"num_input_tokens_seen": 16798768, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.41205253669842906, |
|
"eval_loss": 0.4657597243785858, |
|
"eval_runtime": 38.2951, |
|
"eval_samples_per_second": 3.134, |
|
"eval_steps_per_second": 0.783, |
|
"num_input_tokens_seen": 16798768, |
|
"step": 1600 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 3400, |
|
"num_input_tokens_seen": 16798768, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1108323298967552.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|