|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.274274937560528, |
|
"global_step": 50000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01, |
|
"learning_rate": 5.999999999999999e-06, |
|
"loss": 0.8907, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"learning_rate": 1.1999999999999999e-05, |
|
"loss": 0.7216, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_loss": 0.9141247272491455, |
|
"eval_runtime": 1.2377, |
|
"eval_samples_per_second": 807.918, |
|
"eval_steps_per_second": 12.927, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"learning_rate": 1.7999999999999997e-05, |
|
"loss": 0.7122, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"learning_rate": 2.3999999999999997e-05, |
|
"loss": 0.7114, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_loss": 0.9156034588813782, |
|
"eval_runtime": 1.1631, |
|
"eval_samples_per_second": 859.777, |
|
"eval_steps_per_second": 13.756, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"learning_rate": 2.9999999999999997e-05, |
|
"loss": 0.7111, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"learning_rate": 3.5999999999999994e-05, |
|
"loss": 0.711, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_loss": 0.915696918964386, |
|
"eval_runtime": 1.149, |
|
"eval_samples_per_second": 870.294, |
|
"eval_steps_per_second": 13.925, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"learning_rate": 4.2e-05, |
|
"loss": 0.7107, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"learning_rate": 4.7999999999999994e-05, |
|
"loss": 0.7102, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"eval_loss": 0.8820520043373108, |
|
"eval_runtime": 1.2331, |
|
"eval_samples_per_second": 810.943, |
|
"eval_steps_per_second": 12.975, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"learning_rate": 5.399999999999999e-05, |
|
"loss": 0.7096, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"learning_rate": 5.9999999999999995e-05, |
|
"loss": 0.709, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"eval_loss": 0.8773962259292603, |
|
"eval_runtime": 1.22, |
|
"eval_samples_per_second": 819.664, |
|
"eval_steps_per_second": 13.115, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"learning_rate": 6.599999999999999e-05, |
|
"loss": 0.7087, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"learning_rate": 7.199999999999999e-05, |
|
"loss": 0.7083, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"eval_loss": 0.8944988250732422, |
|
"eval_runtime": 1.1878, |
|
"eval_samples_per_second": 841.893, |
|
"eval_steps_per_second": 13.47, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"learning_rate": 7.8e-05, |
|
"loss": 0.6868, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"learning_rate": 8.4e-05, |
|
"loss": 0.6516, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"eval_loss": 0.8944807648658752, |
|
"eval_runtime": 1.2213, |
|
"eval_samples_per_second": 818.801, |
|
"eval_steps_per_second": 13.101, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"learning_rate": 8.999999999999999e-05, |
|
"loss": 0.6267, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"learning_rate": 9.599999999999999e-05, |
|
"loss": 0.6042, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"eval_loss": 0.8855597972869873, |
|
"eval_runtime": 1.1662, |
|
"eval_samples_per_second": 857.462, |
|
"eval_steps_per_second": 13.719, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"learning_rate": 0.000102, |
|
"loss": 0.5883, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"learning_rate": 0.00010799999999999998, |
|
"loss": 0.5732, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"eval_loss": 0.8907042741775513, |
|
"eval_runtime": 1.2023, |
|
"eval_samples_per_second": 831.767, |
|
"eval_steps_per_second": 13.308, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"learning_rate": 0.00011399999999999999, |
|
"loss": 0.559, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"learning_rate": 0.00011999999999999999, |
|
"loss": 0.5506, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"eval_loss": 0.8921875953674316, |
|
"eval_runtime": 1.197, |
|
"eval_samples_per_second": 835.455, |
|
"eval_steps_per_second": 13.367, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"learning_rate": 0.00012599999999999997, |
|
"loss": 0.5444, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"learning_rate": 0.00013199999999999998, |
|
"loss": 0.5385, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"eval_loss": 0.8903993964195251, |
|
"eval_runtime": 1.1996, |
|
"eval_samples_per_second": 833.63, |
|
"eval_steps_per_second": 13.338, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"learning_rate": 0.000138, |
|
"loss": 0.532, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"learning_rate": 0.00014399999999999998, |
|
"loss": 0.5276, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"eval_loss": 0.8864779472351074, |
|
"eval_runtime": 1.1846, |
|
"eval_samples_per_second": 844.173, |
|
"eval_steps_per_second": 13.507, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"learning_rate": 0.00015, |
|
"loss": 0.5218, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"learning_rate": 0.000156, |
|
"loss": 0.517, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"eval_loss": 0.88374263048172, |
|
"eval_runtime": 1.1943, |
|
"eval_samples_per_second": 837.343, |
|
"eval_steps_per_second": 13.397, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"learning_rate": 0.000162, |
|
"loss": 0.5124, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"learning_rate": 0.000168, |
|
"loss": 0.5077, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"eval_loss": 0.8863275647163391, |
|
"eval_runtime": 1.1844, |
|
"eval_samples_per_second": 844.316, |
|
"eval_steps_per_second": 13.509, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"learning_rate": 0.00017399999999999997, |
|
"loss": 0.5027, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"learning_rate": 0.00017999999999999998, |
|
"loss": 0.498, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"eval_loss": 0.8769687414169312, |
|
"eval_runtime": 1.1484, |
|
"eval_samples_per_second": 870.806, |
|
"eval_steps_per_second": 13.933, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"learning_rate": 0.000186, |
|
"loss": 0.4938, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"learning_rate": 0.00019199999999999998, |
|
"loss": 0.4897, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"eval_loss": 0.8793530464172363, |
|
"eval_runtime": 1.2247, |
|
"eval_samples_per_second": 816.516, |
|
"eval_steps_per_second": 13.064, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"learning_rate": 0.000198, |
|
"loss": 0.4849, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"learning_rate": 0.000204, |
|
"loss": 0.4791, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"eval_loss": 0.879580557346344, |
|
"eval_runtime": 1.1672, |
|
"eval_samples_per_second": 856.75, |
|
"eval_steps_per_second": 13.708, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"learning_rate": 0.00020999999999999998, |
|
"loss": 0.4744, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"learning_rate": 0.00021599999999999996, |
|
"loss": 0.4698, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"eval_loss": 0.8753651976585388, |
|
"eval_runtime": 1.1516, |
|
"eval_samples_per_second": 868.392, |
|
"eval_steps_per_second": 13.894, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"learning_rate": 0.00022199999999999998, |
|
"loss": 0.4644, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"learning_rate": 0.00022799999999999999, |
|
"loss": 0.4592, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"eval_loss": 0.8825145363807678, |
|
"eval_runtime": 1.2668, |
|
"eval_samples_per_second": 789.404, |
|
"eval_steps_per_second": 12.63, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"learning_rate": 0.000234, |
|
"loss": 0.4542, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"learning_rate": 0.00023999999999999998, |
|
"loss": 0.4489, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"eval_loss": 0.878689169883728, |
|
"eval_runtime": 1.1589, |
|
"eval_samples_per_second": 862.885, |
|
"eval_steps_per_second": 13.806, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"learning_rate": 0.00024599999999999996, |
|
"loss": 0.4437, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"learning_rate": 0.00025199999999999995, |
|
"loss": 0.439, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"eval_loss": 0.8742354512214661, |
|
"eval_runtime": 1.1831, |
|
"eval_samples_per_second": 845.245, |
|
"eval_steps_per_second": 13.524, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"learning_rate": 0.000258, |
|
"loss": 0.4339, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"learning_rate": 0.00026399999999999997, |
|
"loss": 0.4292, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"eval_loss": 0.8848758935928345, |
|
"eval_runtime": 1.1886, |
|
"eval_samples_per_second": 841.313, |
|
"eval_steps_per_second": 13.461, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"learning_rate": 0.00027, |
|
"loss": 0.4251, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"learning_rate": 0.000276, |
|
"loss": 0.4212, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"eval_loss": 0.8841528296470642, |
|
"eval_runtime": 1.2211, |
|
"eval_samples_per_second": 818.95, |
|
"eval_steps_per_second": 13.103, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"learning_rate": 0.00028199999999999997, |
|
"loss": 0.4175, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"learning_rate": 0.00028799999999999995, |
|
"loss": 0.4142, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"eval_loss": 0.8811973929405212, |
|
"eval_runtime": 1.275, |
|
"eval_samples_per_second": 784.285, |
|
"eval_steps_per_second": 12.549, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"learning_rate": 0.000294, |
|
"loss": 0.4109, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"learning_rate": 0.0003, |
|
"loss": 0.4076, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"eval_loss": 0.8659318089485168, |
|
"eval_runtime": 1.2881, |
|
"eval_samples_per_second": 776.338, |
|
"eval_steps_per_second": 12.421, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"learning_rate": 0.00029999920715161553, |
|
"loss": 0.4045, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"learning_rate": 0.0002999968286151326, |
|
"loss": 0.4017, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"eval_loss": 0.874371349811554, |
|
"eval_runtime": 1.2282, |
|
"eval_samples_per_second": 814.215, |
|
"eval_steps_per_second": 13.027, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"learning_rate": 0.0002999928644165624, |
|
"loss": 0.3987, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"learning_rate": 0.0002999873145992569, |
|
"loss": 0.3958, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"eval_loss": 0.8821514844894409, |
|
"eval_runtime": 1.1414, |
|
"eval_samples_per_second": 876.15, |
|
"eval_steps_per_second": 14.018, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"learning_rate": 0.000299980179223908, |
|
"loss": 0.3933, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"learning_rate": 0.0002999714583685469, |
|
"loss": 0.3907, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"eval_loss": 0.8762433528900146, |
|
"eval_runtime": 1.2113, |
|
"eval_samples_per_second": 825.526, |
|
"eval_steps_per_second": 13.208, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"learning_rate": 0.00029996115212854366, |
|
"loss": 0.3886, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"learning_rate": 0.00029994926061660554, |
|
"loss": 0.3863, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"eval_loss": 0.8757530450820923, |
|
"eval_runtime": 1.1818, |
|
"eval_samples_per_second": 846.134, |
|
"eval_steps_per_second": 13.538, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"learning_rate": 0.0002999357839627762, |
|
"loss": 0.3841, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"learning_rate": 0.00029992072231443425, |
|
"loss": 0.382, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"eval_loss": 0.8755055069923401, |
|
"eval_runtime": 1.2044, |
|
"eval_samples_per_second": 830.296, |
|
"eval_steps_per_second": 13.285, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"learning_rate": 0.0002999040758362914, |
|
"loss": 0.3804, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"learning_rate": 0.00029988584471039094, |
|
"loss": 0.378, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"eval_loss": 0.8780514001846313, |
|
"eval_runtime": 1.1913, |
|
"eval_samples_per_second": 839.425, |
|
"eval_steps_per_second": 13.431, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"learning_rate": 0.0002998660291361054, |
|
"loss": 0.3763, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"learning_rate": 0.0002998446293301349, |
|
"loss": 0.3748, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"eval_loss": 0.8814873099327087, |
|
"eval_runtime": 1.3032, |
|
"eval_samples_per_second": 767.316, |
|
"eval_steps_per_second": 12.277, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"learning_rate": 0.0002998216455265042, |
|
"loss": 0.3735, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"learning_rate": 0.00029979707797656046, |
|
"loss": 0.3716, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"eval_loss": 0.868879497051239, |
|
"eval_runtime": 1.2062, |
|
"eval_samples_per_second": 829.022, |
|
"eval_steps_per_second": 13.264, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"learning_rate": 0.00029977092694897053, |
|
"loss": 0.3702, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"learning_rate": 0.0002997431927297178, |
|
"loss": 0.3689, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"eval_loss": 0.8758594989776611, |
|
"eval_runtime": 1.2131, |
|
"eval_samples_per_second": 824.364, |
|
"eval_steps_per_second": 13.19, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"learning_rate": 0.00029971387562209936, |
|
"loss": 0.3677, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"learning_rate": 0.00029968297594672226, |
|
"loss": 0.3665, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"eval_loss": 0.8690294623374939, |
|
"eval_runtime": 1.1601, |
|
"eval_samples_per_second": 861.986, |
|
"eval_steps_per_second": 13.792, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"learning_rate": 0.0002996504940415005, |
|
"loss": 0.365, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"learning_rate": 0.00029961643026165096, |
|
"loss": 0.364, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"eval_loss": 0.8695961236953735, |
|
"eval_runtime": 1.2184, |
|
"eval_samples_per_second": 820.732, |
|
"eval_steps_per_second": 13.132, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"learning_rate": 0.00029958078497968973, |
|
"loss": 0.3627, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"learning_rate": 0.0002995435585854278, |
|
"loss": 0.3614, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"eval_loss": 0.8684411644935608, |
|
"eval_runtime": 1.2551, |
|
"eval_samples_per_second": 796.719, |
|
"eval_steps_per_second": 12.747, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"learning_rate": 0.0002995047514859671, |
|
"loss": 0.3603, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"learning_rate": 0.0002994643641056959, |
|
"loss": 0.3592, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"eval_loss": 0.8597822189331055, |
|
"eval_runtime": 1.198, |
|
"eval_samples_per_second": 834.721, |
|
"eval_steps_per_second": 13.356, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"learning_rate": 0.000299422396886284, |
|
"loss": 0.358, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"learning_rate": 0.0002993788502866783, |
|
"loss": 0.3571, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"eval_loss": 0.8572097420692444, |
|
"eval_runtime": 1.21, |
|
"eval_samples_per_second": 826.463, |
|
"eval_steps_per_second": 13.223, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"learning_rate": 0.00029933372478309746, |
|
"loss": 0.3562, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"learning_rate": 0.00029928702086902664, |
|
"loss": 0.3555, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"eval_loss": 0.8637193441390991, |
|
"eval_runtime": 1.4051, |
|
"eval_samples_per_second": 711.674, |
|
"eval_steps_per_second": 11.387, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"learning_rate": 0.00029923873905521244, |
|
"loss": 0.3545, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"learning_rate": 0.000299188879869657, |
|
"loss": 0.3535, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"eval_loss": 0.8638287782669067, |
|
"eval_runtime": 1.2182, |
|
"eval_samples_per_second": 820.876, |
|
"eval_steps_per_second": 13.134, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"learning_rate": 0.00029913744385761244, |
|
"loss": 0.3524, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"learning_rate": 0.00029908443158157465, |
|
"loss": 0.3518, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"eval_loss": 0.8664878606796265, |
|
"eval_runtime": 1.2547, |
|
"eval_samples_per_second": 796.987, |
|
"eval_steps_per_second": 12.752, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"learning_rate": 0.0002990298436212775, |
|
"loss": 0.3511, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"learning_rate": 0.0002989736805736861, |
|
"loss": 0.3502, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"eval_loss": 0.8559480905532837, |
|
"eval_runtime": 1.2272, |
|
"eval_samples_per_second": 814.861, |
|
"eval_steps_per_second": 13.038, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"learning_rate": 0.00029891594305299065, |
|
"loss": 0.3494, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"learning_rate": 0.00029885663169059926, |
|
"loss": 0.3488, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"eval_loss": 0.859957754611969, |
|
"eval_runtime": 1.2944, |
|
"eval_samples_per_second": 772.587, |
|
"eval_steps_per_second": 12.361, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"learning_rate": 0.0002987957471351316, |
|
"loss": 0.3478, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"learning_rate": 0.00029873329005241137, |
|
"loss": 0.3469, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"eval_loss": 0.852756917476654, |
|
"eval_runtime": 1.2478, |
|
"eval_samples_per_second": 801.439, |
|
"eval_steps_per_second": 12.823, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"learning_rate": 0.00029866926112545925, |
|
"loss": 0.3464, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"learning_rate": 0.00029860366105448534, |
|
"loss": 0.3459, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"eval_loss": 0.8597527146339417, |
|
"eval_runtime": 1.1814, |
|
"eval_samples_per_second": 846.435, |
|
"eval_steps_per_second": 13.543, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"learning_rate": 0.00029853649055688143, |
|
"loss": 0.3451, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"learning_rate": 0.00029846775036721337, |
|
"loss": 0.3444, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"eval_loss": 0.860701322555542, |
|
"eval_runtime": 1.2083, |
|
"eval_samples_per_second": 827.633, |
|
"eval_steps_per_second": 13.242, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"learning_rate": 0.0002983974412372129, |
|
"loss": 0.3438, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"learning_rate": 0.00029832556393576934, |
|
"loss": 0.3428, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"eval_loss": 0.865045964717865, |
|
"eval_runtime": 1.2493, |
|
"eval_samples_per_second": 800.445, |
|
"eval_steps_per_second": 12.807, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"learning_rate": 0.0002982521192489214, |
|
"loss": 0.3425, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"learning_rate": 0.0002981771079798483, |
|
"loss": 0.342, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"eval_loss": 0.8640099167823792, |
|
"eval_runtime": 1.2369, |
|
"eval_samples_per_second": 808.489, |
|
"eval_steps_per_second": 12.936, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"learning_rate": 0.00029810053094886136, |
|
"loss": 0.3417, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"learning_rate": 0.00029802238899339473, |
|
"loss": 0.3408, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"eval_loss": 0.8549481630325317, |
|
"eval_runtime": 1.3108, |
|
"eval_samples_per_second": 762.871, |
|
"eval_steps_per_second": 12.206, |
|
"step": 50000 |
|
} |
|
], |
|
"max_steps": 500000, |
|
"num_train_epochs": 13, |
|
"total_flos": 1.597422214959455e+21, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|