|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 10.0, |
|
"eval_steps": 500, |
|
"global_step": 1250, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.19323205947875977, |
|
"learning_rate": 0.0002, |
|
"loss": 1.4874, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.1678144633769989, |
|
"learning_rate": 0.0002, |
|
"loss": 1.4509, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.08337455242872238, |
|
"learning_rate": 0.0002, |
|
"loss": 1.3405, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.09026125818490982, |
|
"learning_rate": 0.0002, |
|
"loss": 1.3735, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.1073032096028328, |
|
"learning_rate": 0.0002, |
|
"loss": 1.4068, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.13652268052101135, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2271, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 0.12082232534885406, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2864, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.12123199552297592, |
|
"learning_rate": 0.0002, |
|
"loss": 1.254, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 0.11623656004667282, |
|
"learning_rate": 0.0002, |
|
"loss": 1.3337, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.13121260702610016, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2837, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 0.17798519134521484, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1528, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 0.16624127328395844, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1691, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 0.13809803128242493, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1826, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 0.13886697590351105, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2243, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.20055805146694183, |
|
"learning_rate": 0.0002, |
|
"loss": 1.2453, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"grad_norm": 0.2664979100227356, |
|
"learning_rate": 0.0002, |
|
"loss": 1.068, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"grad_norm": 0.19323182106018066, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1088, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 0.2079138457775116, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0796, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"grad_norm": 0.20442678034305573, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1325, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.25957778096199036, |
|
"learning_rate": 0.0002, |
|
"loss": 1.1482, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 4.2, |
|
"grad_norm": 0.3366886079311371, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0235, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 4.4, |
|
"grad_norm": 0.25876155495643616, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9673, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 4.6, |
|
"grad_norm": 0.239900603890419, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0075, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 4.8, |
|
"grad_norm": 0.23491528630256653, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0195, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.27541735768318176, |
|
"learning_rate": 0.0002, |
|
"loss": 1.0431, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 5.2, |
|
"grad_norm": 0.3689779043197632, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9048, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 5.4, |
|
"grad_norm": 0.2586508095264435, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8714, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 5.6, |
|
"grad_norm": 0.2392464578151703, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8859, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 5.8, |
|
"grad_norm": 0.2529771029949188, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9311, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 0.5787801146507263, |
|
"learning_rate": 0.0002, |
|
"loss": 0.9657, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 6.2, |
|
"grad_norm": 0.5453413128852844, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8175, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 6.4, |
|
"grad_norm": 0.34291961789131165, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8445, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 6.6, |
|
"grad_norm": 0.28864824771881104, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8116, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 6.8, |
|
"grad_norm": 0.22972020506858826, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7752, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"grad_norm": 0.43756699562072754, |
|
"learning_rate": 0.0002, |
|
"loss": 0.8068, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 7.2, |
|
"grad_norm": 0.5593159794807434, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7582, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 7.4, |
|
"grad_norm": 0.3569984436035156, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7165, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 7.6, |
|
"grad_norm": 0.3286341428756714, |
|
"learning_rate": 0.0002, |
|
"loss": 0.706, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 7.8, |
|
"grad_norm": 0.27032291889190674, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6735, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 0.35773584246635437, |
|
"learning_rate": 0.0002, |
|
"loss": 0.7105, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 8.2, |
|
"grad_norm": 0.6158097982406616, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6855, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 8.4, |
|
"grad_norm": 0.3763855993747711, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6066, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 8.6, |
|
"grad_norm": 0.3213497996330261, |
|
"learning_rate": 0.0002, |
|
"loss": 0.623, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 8.8, |
|
"grad_norm": 0.3130660355091095, |
|
"learning_rate": 0.0002, |
|
"loss": 0.587, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"grad_norm": 0.49706026911735535, |
|
"learning_rate": 0.0002, |
|
"loss": 0.6161, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 9.2, |
|
"grad_norm": 0.606447696685791, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5802, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 9.4, |
|
"grad_norm": 0.42242899537086487, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5627, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 9.6, |
|
"grad_norm": 0.37915894389152527, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5408, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 9.8, |
|
"grad_norm": 0.3458578586578369, |
|
"learning_rate": 0.0002, |
|
"loss": 0.524, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.5249849557876587, |
|
"learning_rate": 0.0002, |
|
"loss": 0.5091, |
|
"step": 1250 |
|
} |
|
], |
|
"logging_steps": 25, |
|
"max_steps": 1250, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 25, |
|
"total_flos": 1.3841243942078054e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|