|
{ |
|
"best_metric": 5.4548899332682295, |
|
"best_model_checkpoint": "./results/checkpoint-5496", |
|
"epoch": 7.0, |
|
"eval_steps": 500, |
|
"global_step": 6412, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.1091703056768559, |
|
"grad_norm": 50.38945388793945, |
|
"learning_rate": 9.863537117903931e-05, |
|
"loss": 46.0228, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2183406113537118, |
|
"grad_norm": 53.34562683105469, |
|
"learning_rate": 9.727074235807861e-05, |
|
"loss": 16.1163, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.32751091703056767, |
|
"grad_norm": 47.549678802490234, |
|
"learning_rate": 9.59061135371179e-05, |
|
"loss": 10.3137, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.4366812227074236, |
|
"grad_norm": 27.560810089111328, |
|
"learning_rate": 9.454148471615721e-05, |
|
"loss": 8.4484, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.5458515283842795, |
|
"grad_norm": 43.655635833740234, |
|
"learning_rate": 9.317685589519652e-05, |
|
"loss": 7.5983, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.6550218340611353, |
|
"grad_norm": 36.29502868652344, |
|
"learning_rate": 9.18122270742358e-05, |
|
"loss": 7.5018, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.7641921397379913, |
|
"grad_norm": 46.47073745727539, |
|
"learning_rate": 9.044759825327511e-05, |
|
"loss": 7.1886, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.8733624454148472, |
|
"grad_norm": 40.817569732666016, |
|
"learning_rate": 8.908296943231441e-05, |
|
"loss": 7.136, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.982532751091703, |
|
"grad_norm": 40.99913024902344, |
|
"learning_rate": 8.771834061135371e-05, |
|
"loss": 6.9784, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_avg_mae": 7.428911844889323, |
|
"eval_loss": 7.4289116859436035, |
|
"eval_mae_lex": 7.114953517913818, |
|
"eval_mae_sem": 5.620931625366211, |
|
"eval_mae_syn": 9.550849914550781, |
|
"eval_runtime": 27.041, |
|
"eval_samples_per_second": 270.959, |
|
"eval_steps_per_second": 8.469, |
|
"step": 916 |
|
}, |
|
{ |
|
"epoch": 1.091703056768559, |
|
"grad_norm": 40.483001708984375, |
|
"learning_rate": 8.635371179039302e-05, |
|
"loss": 6.7681, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.2008733624454149, |
|
"grad_norm": 28.894540786743164, |
|
"learning_rate": 8.498908296943232e-05, |
|
"loss": 6.5716, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.3100436681222707, |
|
"grad_norm": 28.377840042114258, |
|
"learning_rate": 8.362445414847162e-05, |
|
"loss": 6.796, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.4192139737991267, |
|
"grad_norm": 39.30733108520508, |
|
"learning_rate": 8.225982532751092e-05, |
|
"loss": 6.6517, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 1.5283842794759825, |
|
"grad_norm": 29.63848304748535, |
|
"learning_rate": 8.089519650655023e-05, |
|
"loss": 6.7767, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.6375545851528384, |
|
"grad_norm": 52.78211975097656, |
|
"learning_rate": 7.953056768558951e-05, |
|
"loss": 6.6008, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.7467248908296944, |
|
"grad_norm": 52.6399040222168, |
|
"learning_rate": 7.816593886462883e-05, |
|
"loss": 6.5132, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.8558951965065502, |
|
"grad_norm": 25.399370193481445, |
|
"learning_rate": 7.680131004366813e-05, |
|
"loss": 6.5244, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.965065502183406, |
|
"grad_norm": 40.24644470214844, |
|
"learning_rate": 7.543668122270742e-05, |
|
"loss": 6.4174, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_avg_mae": 6.124305725097656, |
|
"eval_loss": 6.124305725097656, |
|
"eval_mae_lex": 5.557982921600342, |
|
"eval_mae_sem": 4.011363983154297, |
|
"eval_mae_syn": 8.803570747375488, |
|
"eval_runtime": 26.9945, |
|
"eval_samples_per_second": 271.426, |
|
"eval_steps_per_second": 8.483, |
|
"step": 1832 |
|
}, |
|
{ |
|
"epoch": 2.074235807860262, |
|
"grad_norm": 72.3259048461914, |
|
"learning_rate": 7.407205240174672e-05, |
|
"loss": 6.362, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 2.183406113537118, |
|
"grad_norm": 44.780540466308594, |
|
"learning_rate": 7.270742358078603e-05, |
|
"loss": 6.2052, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.2925764192139737, |
|
"grad_norm": 42.22085952758789, |
|
"learning_rate": 7.134279475982533e-05, |
|
"loss": 6.0878, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 2.4017467248908297, |
|
"grad_norm": 31.405487060546875, |
|
"learning_rate": 6.997816593886463e-05, |
|
"loss": 6.1432, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.5109170305676853, |
|
"grad_norm": 40.08876037597656, |
|
"learning_rate": 6.861353711790393e-05, |
|
"loss": 6.0203, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 2.6200873362445414, |
|
"grad_norm": 25.91780662536621, |
|
"learning_rate": 6.724890829694324e-05, |
|
"loss": 6.05, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2.7292576419213974, |
|
"grad_norm": 30.461589813232422, |
|
"learning_rate": 6.588427947598254e-05, |
|
"loss": 6.0259, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 2.8384279475982535, |
|
"grad_norm": 35.90027618408203, |
|
"learning_rate": 6.451965065502183e-05, |
|
"loss": 6.1552, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 2.947598253275109, |
|
"grad_norm": 26.988929748535156, |
|
"learning_rate": 6.315502183406113e-05, |
|
"loss": 5.8932, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_avg_mae": 5.737476348876953, |
|
"eval_loss": 5.737475872039795, |
|
"eval_mae_lex": 5.095954895019531, |
|
"eval_mae_sem": 3.588836669921875, |
|
"eval_mae_syn": 8.527636528015137, |
|
"eval_runtime": 27.0753, |
|
"eval_samples_per_second": 270.616, |
|
"eval_steps_per_second": 8.458, |
|
"step": 2748 |
|
}, |
|
{ |
|
"epoch": 3.056768558951965, |
|
"grad_norm": 31.56240463256836, |
|
"learning_rate": 6.179039301310045e-05, |
|
"loss": 5.7562, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 3.165938864628821, |
|
"grad_norm": 30.02605628967285, |
|
"learning_rate": 6.042576419213974e-05, |
|
"loss": 5.7653, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 3.2751091703056767, |
|
"grad_norm": 37.670711517333984, |
|
"learning_rate": 5.9061135371179045e-05, |
|
"loss": 5.8259, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 3.3842794759825328, |
|
"grad_norm": 29.38449478149414, |
|
"learning_rate": 5.769650655021834e-05, |
|
"loss": 5.5527, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 3.493449781659389, |
|
"grad_norm": 41.96727752685547, |
|
"learning_rate": 5.633187772925764e-05, |
|
"loss": 5.7041, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 3.6026200873362444, |
|
"grad_norm": 40.971092224121094, |
|
"learning_rate": 5.4967248908296945e-05, |
|
"loss": 5.8564, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 3.7117903930131004, |
|
"grad_norm": 28.331939697265625, |
|
"learning_rate": 5.360262008733624e-05, |
|
"loss": 5.6168, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 3.8209606986899565, |
|
"grad_norm": 26.781084060668945, |
|
"learning_rate": 5.223799126637555e-05, |
|
"loss": 5.6631, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 3.930131004366812, |
|
"grad_norm": 32.894012451171875, |
|
"learning_rate": 5.087336244541485e-05, |
|
"loss": 5.6886, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_avg_mae": 6.235666910807292, |
|
"eval_loss": 6.2356672286987305, |
|
"eval_mae_lex": 5.677294731140137, |
|
"eval_mae_sem": 4.446841716766357, |
|
"eval_mae_syn": 8.582864761352539, |
|
"eval_runtime": 27.0543, |
|
"eval_samples_per_second": 270.826, |
|
"eval_steps_per_second": 8.464, |
|
"step": 3664 |
|
}, |
|
{ |
|
"epoch": 4.039301310043668, |
|
"grad_norm": 35.94241714477539, |
|
"learning_rate": 4.950873362445415e-05, |
|
"loss": 5.6333, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 4.148471615720524, |
|
"grad_norm": 25.45247459411621, |
|
"learning_rate": 4.814410480349345e-05, |
|
"loss": 5.3055, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 4.25764192139738, |
|
"grad_norm": 30.626169204711914, |
|
"learning_rate": 4.6779475982532754e-05, |
|
"loss": 5.4211, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 4.366812227074236, |
|
"grad_norm": 25.812488555908203, |
|
"learning_rate": 4.5414847161572056e-05, |
|
"loss": 5.328, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 4.475982532751091, |
|
"grad_norm": 33.51716232299805, |
|
"learning_rate": 4.405021834061135e-05, |
|
"loss": 5.4752, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 4.585152838427947, |
|
"grad_norm": 32.362186431884766, |
|
"learning_rate": 4.268558951965066e-05, |
|
"loss": 5.3976, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 4.6943231441048034, |
|
"grad_norm": 32.369476318359375, |
|
"learning_rate": 4.1320960698689957e-05, |
|
"loss": 5.1836, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 4.8034934497816595, |
|
"grad_norm": 35.22241973876953, |
|
"learning_rate": 3.995633187772926e-05, |
|
"loss": 5.3904, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 4.9126637554585155, |
|
"grad_norm": 34.892250061035156, |
|
"learning_rate": 3.859170305676856e-05, |
|
"loss": 5.2038, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_avg_mae": 6.149158477783203, |
|
"eval_loss": 6.149158954620361, |
|
"eval_mae_lex": 5.426638126373291, |
|
"eval_mae_sem": 4.464701175689697, |
|
"eval_mae_syn": 8.556136131286621, |
|
"eval_runtime": 26.9461, |
|
"eval_samples_per_second": 271.913, |
|
"eval_steps_per_second": 8.498, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 5.021834061135372, |
|
"grad_norm": 30.598581314086914, |
|
"learning_rate": 3.7227074235807864e-05, |
|
"loss": 5.3014, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 5.131004366812227, |
|
"grad_norm": 25.072534561157227, |
|
"learning_rate": 3.586244541484716e-05, |
|
"loss": 4.9525, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 5.240174672489083, |
|
"grad_norm": 34.00102233886719, |
|
"learning_rate": 3.449781659388647e-05, |
|
"loss": 4.9499, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 5.349344978165939, |
|
"grad_norm": 30.76056480407715, |
|
"learning_rate": 3.3133187772925765e-05, |
|
"loss": 4.9841, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 5.458515283842795, |
|
"grad_norm": 28.28791618347168, |
|
"learning_rate": 3.176855895196507e-05, |
|
"loss": 5.1702, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 5.567685589519651, |
|
"grad_norm": 32.07628631591797, |
|
"learning_rate": 3.0403930131004366e-05, |
|
"loss": 5.0461, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 5.676855895196507, |
|
"grad_norm": 39.26410675048828, |
|
"learning_rate": 2.9039301310043672e-05, |
|
"loss": 4.8651, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 5.786026200873362, |
|
"grad_norm": 25.232053756713867, |
|
"learning_rate": 2.767467248908297e-05, |
|
"loss": 4.8736, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 5.895196506550218, |
|
"grad_norm": 29.347366333007812, |
|
"learning_rate": 2.631004366812227e-05, |
|
"loss": 4.9244, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_avg_mae": 5.4548899332682295, |
|
"eval_loss": 5.45488977432251, |
|
"eval_mae_lex": 4.807984352111816, |
|
"eval_mae_sem": 3.6341991424560547, |
|
"eval_mae_syn": 7.922485828399658, |
|
"eval_runtime": 27.0225, |
|
"eval_samples_per_second": 271.145, |
|
"eval_steps_per_second": 8.474, |
|
"step": 5496 |
|
}, |
|
{ |
|
"epoch": 6.004366812227074, |
|
"grad_norm": 27.099462509155273, |
|
"learning_rate": 2.4945414847161576e-05, |
|
"loss": 5.0121, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 6.11353711790393, |
|
"grad_norm": 26.199542999267578, |
|
"learning_rate": 2.3580786026200875e-05, |
|
"loss": 4.6621, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 6.222707423580786, |
|
"grad_norm": 31.04909324645996, |
|
"learning_rate": 2.2216157205240178e-05, |
|
"loss": 4.6792, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 6.331877729257642, |
|
"grad_norm": 26.449748992919922, |
|
"learning_rate": 2.0851528384279477e-05, |
|
"loss": 4.6471, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 6.441048034934497, |
|
"grad_norm": 27.806798934936523, |
|
"learning_rate": 1.948689956331878e-05, |
|
"loss": 4.5161, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 6.550218340611353, |
|
"grad_norm": 39.113399505615234, |
|
"learning_rate": 1.812227074235808e-05, |
|
"loss": 4.5787, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 6.6593886462882095, |
|
"grad_norm": 30.493192672729492, |
|
"learning_rate": 1.675764192139738e-05, |
|
"loss": 4.6041, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 6.7685589519650655, |
|
"grad_norm": 35.86678695678711, |
|
"learning_rate": 1.5393013100436683e-05, |
|
"loss": 4.6345, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 6.877729257641922, |
|
"grad_norm": 44.66313934326172, |
|
"learning_rate": 1.4028384279475984e-05, |
|
"loss": 4.6063, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 6.986899563318778, |
|
"grad_norm": 34.84800720214844, |
|
"learning_rate": 1.2663755458515283e-05, |
|
"loss": 4.6152, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_avg_mae": 5.481756210327148, |
|
"eval_loss": 5.481756210327148, |
|
"eval_mae_lex": 4.816911220550537, |
|
"eval_mae_sem": 3.748530626296997, |
|
"eval_mae_syn": 7.87982702255249, |
|
"eval_runtime": 27.0121, |
|
"eval_samples_per_second": 271.248, |
|
"eval_steps_per_second": 8.478, |
|
"step": 6412 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 7328, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 8, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.3494363633370368e+16, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|