gpt2-starter / trainer_state.json
louis030195's picture
update
4416589
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 99.9971346704871,
"global_step": 17400,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 1.15,
"eval_loss": 3.23046875,
"eval_runtime": 11.7347,
"eval_samples_per_second": 68.259,
"eval_steps_per_second": 8.607,
"step": 200
},
{
"epoch": 2.3,
"eval_loss": 3.255859375,
"eval_runtime": 11.7326,
"eval_samples_per_second": 68.271,
"eval_steps_per_second": 8.608,
"step": 400
},
{
"epoch": 2.87,
"learning_rate": 5e-05,
"loss": 2.0922,
"step": 500
},
{
"epoch": 3.45,
"eval_loss": 3.279296875,
"eval_runtime": 11.7456,
"eval_samples_per_second": 68.196,
"eval_steps_per_second": 8.599,
"step": 600
},
{
"epoch": 4.6,
"eval_loss": 3.294921875,
"eval_runtime": 11.7533,
"eval_samples_per_second": 68.151,
"eval_steps_per_second": 8.593,
"step": 800
},
{
"epoch": 5.74,
"learning_rate": 5e-05,
"loss": 1.931,
"step": 1000
},
{
"epoch": 5.74,
"eval_loss": 3.310546875,
"eval_runtime": 11.7529,
"eval_samples_per_second": 68.153,
"eval_steps_per_second": 8.594,
"step": 1000
},
{
"epoch": 6.89,
"eval_loss": 3.30078125,
"eval_runtime": 11.76,
"eval_samples_per_second": 68.112,
"eval_steps_per_second": 8.588,
"step": 1200
},
{
"epoch": 8.05,
"eval_loss": 3.3359375,
"eval_runtime": 11.7654,
"eval_samples_per_second": 68.081,
"eval_steps_per_second": 8.584,
"step": 1400
},
{
"epoch": 8.62,
"learning_rate": 5e-05,
"loss": 1.8589,
"step": 1500
},
{
"epoch": 9.19,
"eval_loss": 3.341796875,
"eval_runtime": 11.7717,
"eval_samples_per_second": 68.045,
"eval_steps_per_second": 8.58,
"step": 1600
},
{
"epoch": 10.34,
"eval_loss": 3.353515625,
"eval_runtime": 11.7648,
"eval_samples_per_second": 68.085,
"eval_steps_per_second": 8.585,
"step": 1800
},
{
"epoch": 11.49,
"learning_rate": 5e-05,
"loss": 1.7982,
"step": 2000
},
{
"epoch": 11.49,
"eval_loss": 3.373046875,
"eval_runtime": 11.771,
"eval_samples_per_second": 68.049,
"eval_steps_per_second": 8.58,
"step": 2000
},
{
"epoch": 12.64,
"eval_loss": 3.37109375,
"eval_runtime": 11.7674,
"eval_samples_per_second": 68.069,
"eval_steps_per_second": 8.583,
"step": 2200
},
{
"epoch": 13.79,
"eval_loss": 3.40625,
"eval_runtime": 11.7675,
"eval_samples_per_second": 68.069,
"eval_steps_per_second": 8.583,
"step": 2400
},
{
"epoch": 14.37,
"learning_rate": 5e-05,
"loss": 1.7446,
"step": 2500
},
{
"epoch": 14.94,
"eval_loss": 3.4140625,
"eval_runtime": 11.7686,
"eval_samples_per_second": 68.062,
"eval_steps_per_second": 8.582,
"step": 2600
},
{
"epoch": 16.09,
"eval_loss": 3.45703125,
"eval_runtime": 11.7612,
"eval_samples_per_second": 68.105,
"eval_steps_per_second": 8.588,
"step": 2800
},
{
"epoch": 17.24,
"learning_rate": 5e-05,
"loss": 1.6947,
"step": 3000
},
{
"epoch": 17.24,
"eval_loss": 3.498046875,
"eval_runtime": 11.7651,
"eval_samples_per_second": 68.082,
"eval_steps_per_second": 8.585,
"step": 3000
},
{
"epoch": 18.39,
"eval_loss": 3.51171875,
"eval_runtime": 11.7624,
"eval_samples_per_second": 68.098,
"eval_steps_per_second": 8.587,
"step": 3200
},
{
"epoch": 19.54,
"eval_loss": 3.533203125,
"eval_runtime": 11.7616,
"eval_samples_per_second": 68.103,
"eval_steps_per_second": 8.587,
"step": 3400
},
{
"epoch": 20.11,
"learning_rate": 5e-05,
"loss": 1.6464,
"step": 3500
},
{
"epoch": 20.69,
"eval_loss": 3.55078125,
"eval_runtime": 11.7586,
"eval_samples_per_second": 68.121,
"eval_steps_per_second": 8.589,
"step": 3600
},
{
"epoch": 21.84,
"eval_loss": 3.580078125,
"eval_runtime": 11.772,
"eval_samples_per_second": 68.043,
"eval_steps_per_second": 8.58,
"step": 3800
},
{
"epoch": 22.99,
"learning_rate": 5e-05,
"loss": 1.5981,
"step": 4000
},
{
"epoch": 22.99,
"eval_loss": 3.6015625,
"eval_runtime": 11.7661,
"eval_samples_per_second": 68.077,
"eval_steps_per_second": 8.584,
"step": 4000
},
{
"epoch": 24.14,
"eval_loss": 3.6796875,
"eval_runtime": 11.7648,
"eval_samples_per_second": 68.085,
"eval_steps_per_second": 8.585,
"step": 4200
},
{
"epoch": 25.29,
"eval_loss": 3.7265625,
"eval_runtime": 11.7655,
"eval_samples_per_second": 68.08,
"eval_steps_per_second": 8.584,
"step": 4400
},
{
"epoch": 25.86,
"learning_rate": 5e-05,
"loss": 1.5539,
"step": 4500
},
{
"epoch": 26.44,
"eval_loss": 3.736328125,
"eval_runtime": 11.7698,
"eval_samples_per_second": 68.056,
"eval_steps_per_second": 8.581,
"step": 4600
},
{
"epoch": 27.58,
"eval_loss": 3.755859375,
"eval_runtime": 11.7676,
"eval_samples_per_second": 68.068,
"eval_steps_per_second": 8.583,
"step": 4800
},
{
"epoch": 28.73,
"learning_rate": 5e-05,
"loss": 1.5105,
"step": 5000
},
{
"epoch": 28.73,
"eval_loss": 3.79296875,
"eval_runtime": 11.7657,
"eval_samples_per_second": 68.079,
"eval_steps_per_second": 8.584,
"step": 5000
},
{
"epoch": 29.88,
"eval_loss": 3.859375,
"eval_runtime": 11.7702,
"eval_samples_per_second": 68.053,
"eval_steps_per_second": 8.581,
"step": 5200
},
{
"epoch": 31.03,
"eval_loss": 3.951171875,
"eval_runtime": 11.7682,
"eval_samples_per_second": 68.065,
"eval_steps_per_second": 8.582,
"step": 5400
},
{
"epoch": 31.61,
"learning_rate": 5e-05,
"loss": 1.4699,
"step": 5500
},
{
"epoch": 32.18,
"eval_loss": 3.953125,
"eval_runtime": 11.768,
"eval_samples_per_second": 68.066,
"eval_steps_per_second": 8.583,
"step": 5600
},
{
"epoch": 33.33,
"eval_loss": 3.95703125,
"eval_runtime": 11.7717,
"eval_samples_per_second": 68.044,
"eval_steps_per_second": 8.58,
"step": 5800
},
{
"epoch": 34.48,
"learning_rate": 5e-05,
"loss": 1.4317,
"step": 6000
},
{
"epoch": 34.48,
"eval_loss": 4.0234375,
"eval_runtime": 11.7662,
"eval_samples_per_second": 68.076,
"eval_steps_per_second": 8.584,
"step": 6000
},
{
"epoch": 35.63,
"eval_loss": 4.0859375,
"eval_runtime": 11.7718,
"eval_samples_per_second": 68.044,
"eval_steps_per_second": 8.58,
"step": 6200
},
{
"epoch": 36.78,
"eval_loss": 4.09765625,
"eval_runtime": 11.7694,
"eval_samples_per_second": 68.058,
"eval_steps_per_second": 8.582,
"step": 6400
},
{
"epoch": 37.36,
"learning_rate": 5e-05,
"loss": 1.3947,
"step": 6500
},
{
"epoch": 37.93,
"eval_loss": 4.11328125,
"eval_runtime": 11.769,
"eval_samples_per_second": 68.06,
"eval_steps_per_second": 8.582,
"step": 6600
},
{
"epoch": 39.08,
"eval_loss": 4.1796875,
"eval_runtime": 11.7687,
"eval_samples_per_second": 68.062,
"eval_steps_per_second": 8.582,
"step": 6800
},
{
"epoch": 40.23,
"learning_rate": 5e-05,
"loss": 1.3589,
"step": 7000
},
{
"epoch": 40.23,
"eval_loss": 4.20703125,
"eval_runtime": 11.7645,
"eval_samples_per_second": 68.086,
"eval_steps_per_second": 8.585,
"step": 7000
},
{
"epoch": 41.38,
"eval_loss": 4.2734375,
"eval_runtime": 11.7699,
"eval_samples_per_second": 68.055,
"eval_steps_per_second": 8.581,
"step": 7200
},
{
"epoch": 42.53,
"eval_loss": 4.29296875,
"eval_runtime": 11.7707,
"eval_samples_per_second": 68.05,
"eval_steps_per_second": 8.581,
"step": 7400
},
{
"epoch": 43.1,
"learning_rate": 5e-05,
"loss": 1.3248,
"step": 7500
},
{
"epoch": 43.68,
"eval_loss": 4.30859375,
"eval_runtime": 11.7694,
"eval_samples_per_second": 68.058,
"eval_steps_per_second": 8.582,
"step": 7600
},
{
"epoch": 44.83,
"eval_loss": 4.34765625,
"eval_runtime": 11.7695,
"eval_samples_per_second": 68.057,
"eval_steps_per_second": 8.582,
"step": 7800
},
{
"epoch": 45.97,
"learning_rate": 5e-05,
"loss": 1.2899,
"step": 8000
},
{
"epoch": 45.97,
"eval_loss": 4.375,
"eval_runtime": 11.7703,
"eval_samples_per_second": 68.053,
"eval_steps_per_second": 8.581,
"step": 8000
},
{
"epoch": 47.13,
"eval_loss": 4.4609375,
"eval_runtime": 11.7682,
"eval_samples_per_second": 68.065,
"eval_steps_per_second": 8.582,
"step": 8200
},
{
"epoch": 48.28,
"eval_loss": 4.47265625,
"eval_runtime": 11.77,
"eval_samples_per_second": 68.054,
"eval_steps_per_second": 8.581,
"step": 8400
},
{
"epoch": 48.85,
"learning_rate": 5e-05,
"loss": 1.2585,
"step": 8500
},
{
"epoch": 49.42,
"eval_loss": 4.5546875,
"eval_runtime": 11.7706,
"eval_samples_per_second": 68.051,
"eval_steps_per_second": 8.581,
"step": 8600
},
{
"epoch": 50.57,
"eval_loss": 4.54296875,
"eval_runtime": 11.7686,
"eval_samples_per_second": 68.062,
"eval_steps_per_second": 8.582,
"step": 8800
},
{
"epoch": 51.72,
"learning_rate": 5e-05,
"loss": 1.2273,
"step": 9000
},
{
"epoch": 51.72,
"eval_loss": 4.55859375,
"eval_runtime": 11.7687,
"eval_samples_per_second": 68.062,
"eval_steps_per_second": 8.582,
"step": 9000
},
{
"epoch": 52.87,
"eval_loss": 4.58984375,
"eval_runtime": 11.7666,
"eval_samples_per_second": 68.074,
"eval_steps_per_second": 8.584,
"step": 9200
},
{
"epoch": 54.02,
"eval_loss": 4.62109375,
"eval_runtime": 11.7682,
"eval_samples_per_second": 68.065,
"eval_steps_per_second": 8.582,
"step": 9400
},
{
"epoch": 54.6,
"learning_rate": 5e-05,
"loss": 1.1995,
"step": 9500
},
{
"epoch": 55.17,
"eval_loss": 4.671875,
"eval_runtime": 11.7698,
"eval_samples_per_second": 68.056,
"eval_steps_per_second": 8.581,
"step": 9600
},
{
"epoch": 56.32,
"eval_loss": 4.71875,
"eval_runtime": 11.7677,
"eval_samples_per_second": 68.068,
"eval_steps_per_second": 8.583,
"step": 9800
},
{
"epoch": 57.47,
"learning_rate": 5e-05,
"loss": 1.1713,
"step": 10000
},
{
"epoch": 57.47,
"eval_loss": 4.72265625,
"eval_runtime": 11.7696,
"eval_samples_per_second": 68.057,
"eval_steps_per_second": 8.581,
"step": 10000
},
{
"epoch": 58.62,
"eval_loss": 4.7578125,
"eval_runtime": 11.7712,
"eval_samples_per_second": 68.047,
"eval_steps_per_second": 8.58,
"step": 10200
},
{
"epoch": 59.77,
"eval_loss": 4.77734375,
"eval_runtime": 11.7676,
"eval_samples_per_second": 68.068,
"eval_steps_per_second": 8.583,
"step": 10400
},
{
"epoch": 60.34,
"learning_rate": 5e-05,
"loss": 1.144,
"step": 10500
},
{
"epoch": 60.92,
"eval_loss": 4.78515625,
"eval_runtime": 11.7665,
"eval_samples_per_second": 68.075,
"eval_steps_per_second": 8.584,
"step": 10600
},
{
"epoch": 62.07,
"eval_loss": 4.859375,
"eval_runtime": 11.7665,
"eval_samples_per_second": 68.074,
"eval_steps_per_second": 8.584,
"step": 10800
},
{
"epoch": 63.22,
"learning_rate": 5e-05,
"loss": 1.1196,
"step": 11000
},
{
"epoch": 63.22,
"eval_loss": 4.859375,
"eval_runtime": 11.7664,
"eval_samples_per_second": 68.075,
"eval_steps_per_second": 8.584,
"step": 11000
},
{
"epoch": 64.37,
"eval_loss": 4.90234375,
"eval_runtime": 11.767,
"eval_samples_per_second": 68.072,
"eval_steps_per_second": 8.583,
"step": 11200
},
{
"epoch": 65.52,
"eval_loss": 4.921875,
"eval_runtime": 11.7663,
"eval_samples_per_second": 68.076,
"eval_steps_per_second": 8.584,
"step": 11400
},
{
"epoch": 66.09,
"learning_rate": 5e-05,
"loss": 1.0945,
"step": 11500
},
{
"epoch": 66.66,
"eval_loss": 4.94140625,
"eval_runtime": 11.7257,
"eval_samples_per_second": 68.312,
"eval_steps_per_second": 8.614,
"step": 11600
},
{
"epoch": 67.81,
"eval_loss": 4.97265625,
"eval_runtime": 11.7376,
"eval_samples_per_second": 68.242,
"eval_steps_per_second": 8.605,
"step": 11800
},
{
"epoch": 68.96,
"learning_rate": 5e-05,
"loss": 1.0698,
"step": 12000
},
{
"epoch": 68.96,
"eval_loss": 4.98046875,
"eval_runtime": 11.7405,
"eval_samples_per_second": 68.226,
"eval_steps_per_second": 8.603,
"step": 12000
},
{
"epoch": 70.11,
"eval_loss": 5.0234375,
"eval_runtime": 11.7527,
"eval_samples_per_second": 68.155,
"eval_steps_per_second": 8.594,
"step": 12200
},
{
"epoch": 71.26,
"eval_loss": 5.0546875,
"eval_runtime": 11.7518,
"eval_samples_per_second": 68.16,
"eval_steps_per_second": 8.594,
"step": 12400
},
{
"epoch": 71.84,
"learning_rate": 5e-05,
"loss": 1.047,
"step": 12500
},
{
"epoch": 72.41,
"eval_loss": 5.0859375,
"eval_runtime": 11.7522,
"eval_samples_per_second": 68.157,
"eval_steps_per_second": 8.594,
"step": 12600
},
{
"epoch": 73.56,
"eval_loss": 5.109375,
"eval_runtime": 11.7581,
"eval_samples_per_second": 68.123,
"eval_steps_per_second": 8.59,
"step": 12800
},
{
"epoch": 74.71,
"learning_rate": 5e-05,
"loss": 1.0242,
"step": 13000
},
{
"epoch": 74.71,
"eval_loss": 5.1328125,
"eval_runtime": 11.7589,
"eval_samples_per_second": 68.119,
"eval_steps_per_second": 8.589,
"step": 13000
},
{
"epoch": 75.86,
"eval_loss": 5.1484375,
"eval_runtime": 11.7654,
"eval_samples_per_second": 68.081,
"eval_steps_per_second": 8.585,
"step": 13200
},
{
"epoch": 77.01,
"eval_loss": 5.171875,
"eval_runtime": 11.7655,
"eval_samples_per_second": 68.08,
"eval_steps_per_second": 8.584,
"step": 13400
},
{
"epoch": 77.58,
"learning_rate": 5e-05,
"loss": 1.0042,
"step": 13500
},
{
"epoch": 78.16,
"eval_loss": 5.21484375,
"eval_runtime": 11.7554,
"eval_samples_per_second": 68.139,
"eval_steps_per_second": 8.592,
"step": 13600
},
{
"epoch": 79.31,
"eval_loss": 5.25390625,
"eval_runtime": 11.7677,
"eval_samples_per_second": 68.068,
"eval_steps_per_second": 8.583,
"step": 13800
},
{
"epoch": 80.46,
"learning_rate": 5e-05,
"loss": 0.983,
"step": 14000
},
{
"epoch": 80.46,
"eval_loss": 5.2421875,
"eval_runtime": 11.7633,
"eval_samples_per_second": 68.093,
"eval_steps_per_second": 8.586,
"step": 14000
},
{
"epoch": 81.61,
"eval_loss": 5.27734375,
"eval_runtime": 11.7597,
"eval_samples_per_second": 68.114,
"eval_steps_per_second": 8.589,
"step": 14200
},
{
"epoch": 82.76,
"eval_loss": 5.28515625,
"eval_runtime": 11.7676,
"eval_samples_per_second": 68.068,
"eval_steps_per_second": 8.583,
"step": 14400
},
{
"epoch": 83.33,
"learning_rate": 5e-05,
"loss": 0.9641,
"step": 14500
},
{
"epoch": 83.91,
"eval_loss": 5.32421875,
"eval_runtime": 11.7617,
"eval_samples_per_second": 68.102,
"eval_steps_per_second": 8.587,
"step": 14600
},
{
"epoch": 85.06,
"eval_loss": 5.375,
"eval_runtime": 11.7644,
"eval_samples_per_second": 68.087,
"eval_steps_per_second": 8.585,
"step": 14800
},
{
"epoch": 86.21,
"learning_rate": 5e-05,
"loss": 0.9448,
"step": 15000
},
{
"epoch": 86.21,
"eval_loss": 5.38671875,
"eval_runtime": 11.7479,
"eval_samples_per_second": 68.182,
"eval_steps_per_second": 8.597,
"step": 15000
},
{
"epoch": 87.36,
"eval_loss": 5.40625,
"eval_runtime": 11.7632,
"eval_samples_per_second": 68.094,
"eval_steps_per_second": 8.586,
"step": 15200
},
{
"epoch": 88.5,
"eval_loss": 5.41796875,
"eval_runtime": 11.7578,
"eval_samples_per_second": 68.125,
"eval_steps_per_second": 8.59,
"step": 15400
},
{
"epoch": 89.08,
"learning_rate": 5e-05,
"loss": 0.9253,
"step": 15500
},
{
"epoch": 89.65,
"eval_loss": 5.453125,
"eval_runtime": 11.7633,
"eval_samples_per_second": 68.093,
"eval_steps_per_second": 8.586,
"step": 15600
},
{
"epoch": 90.8,
"eval_loss": 5.44921875,
"eval_runtime": 11.755,
"eval_samples_per_second": 68.141,
"eval_steps_per_second": 8.592,
"step": 15800
},
{
"epoch": 91.95,
"learning_rate": 5e-05,
"loss": 0.907,
"step": 16000
},
{
"epoch": 91.95,
"eval_loss": 5.484375,
"eval_runtime": 11.7719,
"eval_samples_per_second": 68.044,
"eval_steps_per_second": 8.58,
"step": 16000
},
{
"epoch": 93.1,
"eval_loss": 5.54296875,
"eval_runtime": 11.7524,
"eval_samples_per_second": 68.157,
"eval_steps_per_second": 8.594,
"step": 16200
},
{
"epoch": 94.25,
"eval_loss": 5.56640625,
"eval_runtime": 11.762,
"eval_samples_per_second": 68.1,
"eval_steps_per_second": 8.587,
"step": 16400
},
{
"epoch": 94.83,
"learning_rate": 5e-05,
"loss": 0.8889,
"step": 16500
},
{
"epoch": 95.4,
"eval_loss": 5.58984375,
"eval_runtime": 11.7505,
"eval_samples_per_second": 68.167,
"eval_steps_per_second": 8.595,
"step": 16600
},
{
"epoch": 96.55,
"eval_loss": 5.59375,
"eval_runtime": 11.7537,
"eval_samples_per_second": 68.149,
"eval_steps_per_second": 8.593,
"step": 16800
},
{
"epoch": 97.7,
"learning_rate": 5e-05,
"loss": 0.8717,
"step": 17000
},
{
"epoch": 97.7,
"eval_loss": 5.6328125,
"eval_runtime": 11.7619,
"eval_samples_per_second": 68.101,
"eval_steps_per_second": 8.587,
"step": 17000
},
{
"epoch": 98.85,
"eval_loss": 5.64453125,
"eval_runtime": 11.7552,
"eval_samples_per_second": 68.14,
"eval_steps_per_second": 8.592,
"step": 17200
},
{
"epoch": 100.0,
"eval_loss": 5.671875,
"eval_runtime": 11.7611,
"eval_samples_per_second": 68.106,
"eval_steps_per_second": 8.588,
"step": 17400
},
{
"epoch": 100.0,
"step": 17400,
"total_flos": 2.1345852491772723e+17,
"train_loss": 0.3251776333512931,
"train_runtime": 8837.2329,
"train_samples_per_second": 31.594,
"train_steps_per_second": 1.969
}
],
"max_steps": 17400,
"num_train_epochs": 100,
"total_flos": 2.1345852491772723e+17,
"trial_name": null,
"trial_params": null
}