{
  "best_metric": 2.9599573612213135,
  "best_model_checkpoint": "models/opt-babylm2-rewritten-clean-spacy-32k-earlystop-40epochs_seed-42_1e-3/checkpoint-48293",
  "epoch": 28.0,
  "eval_steps": 500,
  "global_step": 54089,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.5176653293645658,
      "grad_norm": 0.516268789768219,
      "learning_rate": 3.125e-05,
      "loss": 5.9216,
      "step": 1000
    },
    {
      "epoch": 0.9996117510029766,
      "eval_accuracy": 0.32528310960329715,
      "eval_loss": 4.013359069824219,
      "eval_runtime": 112.6568,
      "eval_samples_per_second": 463.265,
      "eval_steps_per_second": 7.243,
      "step": 1931
    },
    {
      "epoch": 1.0353306587291315,
      "grad_norm": 0.6370756030082703,
      "learning_rate": 6.25e-05,
      "loss": 4.1987,
      "step": 2000
    },
    {
      "epoch": 1.5529959880936974,
      "grad_norm": 0.6017232537269592,
      "learning_rate": 9.375e-05,
      "loss": 3.7977,
      "step": 3000
    },
    {
      "epoch": 1.9997411673353178,
      "eval_accuracy": 0.3639096213308086,
      "eval_loss": 3.544811725616455,
      "eval_runtime": 112.8804,
      "eval_samples_per_second": 462.348,
      "eval_steps_per_second": 7.229,
      "step": 3863
    },
    {
      "epoch": 2.070661317458263,
      "grad_norm": 0.5702515840530396,
      "learning_rate": 0.000125,
      "loss": 3.5582,
      "step": 4000
    },
    {
      "epoch": 2.588326646822829,
      "grad_norm": 0.48817870020866394,
      "learning_rate": 0.00015625,
      "loss": 3.3887,
      "step": 5000
    },
    {
      "epoch": 2.9998705836676587,
      "eval_accuracy": 0.3840780857274889,
      "eval_loss": 3.324249744415283,
      "eval_runtime": 112.7528,
      "eval_samples_per_second": 462.871,
      "eval_steps_per_second": 7.237,
      "step": 5795
    },
    {
      "epoch": 3.105991976187395,
      "grad_norm": 0.45357462763786316,
      "learning_rate": 0.0001875,
      "loss": 3.2719,
      "step": 6000
    },
    {
      "epoch": 3.6236573055519608,
      "grad_norm": 0.42539018392562866,
      "learning_rate": 0.00021875,
      "loss": 3.1805,
      "step": 7000
    },
    {
      "epoch": 4.0,
      "eval_accuracy": 0.3949032381682315,
      "eval_loss": 3.2081618309020996,
      "eval_runtime": 112.6095,
      "eval_samples_per_second": 463.46,
      "eval_steps_per_second": 7.246,
      "step": 7727
    },
    {
      "epoch": 4.141322634916526,
      "grad_norm": 0.41741499304771423,
      "learning_rate": 0.00025,
      "loss": 3.1173,
      "step": 8000
    },
    {
      "epoch": 4.658987964281092,
      "grad_norm": 0.3810145854949951,
      "learning_rate": 0.00028125000000000003,
      "loss": 3.0632,
      "step": 9000
    },
    {
      "epoch": 4.999611751002977,
      "eval_accuracy": 0.401180377880219,
      "eval_loss": 3.143218517303467,
      "eval_runtime": 112.4876,
      "eval_samples_per_second": 463.962,
      "eval_steps_per_second": 7.254,
      "step": 9658
    },
    {
      "epoch": 5.176653293645658,
      "grad_norm": 0.3555419445037842,
      "learning_rate": 0.0003125,
      "loss": 3.0212,
      "step": 10000
    },
    {
      "epoch": 5.694318623010224,
      "grad_norm": 0.3318658173084259,
      "learning_rate": 0.00034375,
      "loss": 2.9865,
      "step": 11000
    },
    {
      "epoch": 5.999741167335317,
      "eval_accuracy": 0.4055885546400971,
      "eval_loss": 3.101013422012329,
      "eval_runtime": 112.8154,
      "eval_samples_per_second": 462.614,
      "eval_steps_per_second": 7.233,
      "step": 11590
    },
    {
      "epoch": 6.21198395237479,
      "grad_norm": 0.3243854343891144,
      "learning_rate": 0.000375,
      "loss": 2.9568,
      "step": 12000
    },
    {
      "epoch": 6.729649281739356,
      "grad_norm": 0.3086845874786377,
      "learning_rate": 0.00040625000000000004,
      "loss": 2.9347,
      "step": 13000
    },
    {
      "epoch": 6.999870583667659,
      "eval_accuracy": 0.4087078510269791,
      "eval_loss": 3.071547746658325,
      "eval_runtime": 113.6377,
      "eval_samples_per_second": 459.267,
      "eval_steps_per_second": 7.181,
      "step": 13522
    },
    {
      "epoch": 7.2473146111039215,
      "grad_norm": 0.29632025957107544,
      "learning_rate": 0.0004375,
      "loss": 2.9084,
      "step": 14000
    },
    {
      "epoch": 7.764979940468487,
      "grad_norm": 0.28605663776397705,
      "learning_rate": 0.0004686875,
      "loss": 2.8953,
      "step": 15000
    },
    {
      "epoch": 8.0,
      "eval_accuracy": 0.4107785654978604,
      "eval_loss": 3.053938388824463,
      "eval_runtime": 112.9423,
      "eval_samples_per_second": 462.094,
      "eval_steps_per_second": 7.225,
      "step": 15454
    },
    {
      "epoch": 8.282645269833052,
      "grad_norm": 0.2786637246608734,
      "learning_rate": 0.0004999375,
      "loss": 2.8698,
      "step": 16000
    },
    {
      "epoch": 8.80031059919762,
      "grad_norm": 0.2667602002620697,
      "learning_rate": 0.00053115625,
      "loss": 2.8689,
      "step": 17000
    },
    {
      "epoch": 8.999611751002977,
      "eval_accuracy": 0.4122456033572655,
      "eval_loss": 3.039193868637085,
      "eval_runtime": 112.7699,
      "eval_samples_per_second": 462.801,
      "eval_steps_per_second": 7.236,
      "step": 17385
    },
    {
      "epoch": 9.317975928562184,
      "grad_norm": 0.25813835859298706,
      "learning_rate": 0.00056240625,
      "loss": 2.8401,
      "step": 18000
    },
    {
      "epoch": 9.835641257926751,
      "grad_norm": 0.2392367571592331,
      "learning_rate": 0.00059365625,
      "loss": 2.8456,
      "step": 19000
    },
    {
      "epoch": 9.999741167335317,
      "eval_accuracy": 0.4133619617611367,
      "eval_loss": 3.0309925079345703,
      "eval_runtime": 112.8237,
      "eval_samples_per_second": 462.58,
      "eval_steps_per_second": 7.233,
      "step": 19317
    },
    {
      "epoch": 10.353306587291316,
      "grad_norm": 0.2465026080608368,
      "learning_rate": 0.00062490625,
      "loss": 2.8163,
      "step": 20000
    },
    {
      "epoch": 10.870971916655883,
      "grad_norm": 0.21547040343284607,
      "learning_rate": 0.000656125,
      "loss": 2.8298,
      "step": 21000
    },
    {
      "epoch": 10.99987058366766,
      "eval_accuracy": 0.41438394403555634,
      "eval_loss": 3.0251340866088867,
      "eval_runtime": 112.6781,
      "eval_samples_per_second": 463.178,
      "eval_steps_per_second": 7.242,
      "step": 21249
    },
    {
      "epoch": 11.388637246020448,
      "grad_norm": 0.23142270743846893,
      "learning_rate": 0.0006873749999999999,
      "loss": 2.798,
      "step": 22000
    },
    {
      "epoch": 11.906302575385013,
      "grad_norm": 0.2184012234210968,
      "learning_rate": 0.00071859375,
      "loss": 2.817,
      "step": 23000
    },
    {
      "epoch": 12.0,
      "eval_accuracy": 0.4152235609706615,
      "eval_loss": 3.0175206661224365,
      "eval_runtime": 112.4939,
      "eval_samples_per_second": 463.936,
      "eval_steps_per_second": 7.254,
      "step": 23181
    },
    {
      "epoch": 12.42396790474958,
      "grad_norm": 0.211518794298172,
      "learning_rate": 0.0007498437500000001,
      "loss": 2.7828,
      "step": 24000
    },
    {
      "epoch": 12.941633234114144,
      "grad_norm": 0.2092377245426178,
      "learning_rate": 0.00078109375,
      "loss": 2.8069,
      "step": 25000
    },
    {
      "epoch": 12.999611751002977,
      "eval_accuracy": 0.41580334298885296,
      "eval_loss": 3.0118961334228516,
      "eval_runtime": 112.5807,
      "eval_samples_per_second": 463.578,
      "eval_steps_per_second": 7.248,
      "step": 25112
    },
    {
      "epoch": 13.459298563478711,
      "grad_norm": 0.20688970386981964,
      "learning_rate": 0.00081234375,
      "loss": 2.7707,
      "step": 26000
    },
    {
      "epoch": 13.976963892843276,
      "grad_norm": 0.20183704793453217,
      "learning_rate": 0.00084353125,
      "loss": 2.7996,
      "step": 27000
    },
    {
      "epoch": 13.999741167335317,
      "eval_accuracy": 0.4162821365373128,
      "eval_loss": 3.005990743637085,
      "eval_runtime": 112.5919,
      "eval_samples_per_second": 463.533,
      "eval_steps_per_second": 7.247,
      "step": 27044
    },
    {
      "epoch": 14.494629222207843,
      "grad_norm": 0.19293886423110962,
      "learning_rate": 0.00087478125,
      "loss": 2.7615,
      "step": 28000
    },
    {
      "epoch": 14.99987058366766,
      "eval_accuracy": 0.4170801257847458,
      "eval_loss": 3.0038492679595947,
      "eval_runtime": 112.6692,
      "eval_samples_per_second": 463.214,
      "eval_steps_per_second": 7.242,
      "step": 28976
    },
    {
      "epoch": 15.012294551572408,
      "grad_norm": 0.1958475559949875,
      "learning_rate": 0.0009060312499999999,
      "loss": 2.7934,
      "step": 29000
    },
    {
      "epoch": 15.529959880936975,
      "grad_norm": 0.18981003761291504,
      "learning_rate": 0.00093728125,
      "loss": 2.7575,
      "step": 30000
    },
    {
      "epoch": 16.0,
      "eval_accuracy": 0.4168810041740398,
      "eval_loss": 3.0022430419921875,
      "eval_runtime": 112.632,
      "eval_samples_per_second": 463.367,
      "eval_steps_per_second": 7.245,
      "step": 30908
    },
    {
      "epoch": 16.04762521030154,
      "grad_norm": 0.21319861710071564,
      "learning_rate": 0.00096853125,
      "loss": 2.7826,
      "step": 31000
    },
    {
      "epoch": 16.565290539666105,
      "grad_norm": 0.188828244805336,
      "learning_rate": 0.00099975,
      "loss": 2.7573,
      "step": 32000
    },
    {
      "epoch": 16.999611751002977,
      "eval_accuracy": 0.4178921662552739,
      "eval_loss": 2.9961841106414795,
      "eval_runtime": 112.6097,
      "eval_samples_per_second": 463.459,
      "eval_steps_per_second": 7.246,
      "step": 32839
    },
    {
      "epoch": 17.082955869030673,
      "grad_norm": 0.1903599202632904,
      "learning_rate": 0.0009780725022104334,
      "loss": 2.7729,
      "step": 33000
    },
    {
      "epoch": 17.60062119839524,
      "grad_norm": 0.18594609200954437,
      "learning_rate": 0.0009559902740937224,
      "loss": 2.7451,
      "step": 34000
    },
    {
      "epoch": 17.99974116733532,
      "eval_accuracy": 0.41887905804207104,
      "eval_loss": 2.98665452003479,
      "eval_runtime": 112.7053,
      "eval_samples_per_second": 463.066,
      "eval_steps_per_second": 7.24,
      "step": 34771
    },
    {
      "epoch": 18.118286527759803,
      "grad_norm": 0.19103111326694489,
      "learning_rate": 0.0009338859416445623,
      "loss": 2.7475,
      "step": 35000
    },
    {
      "epoch": 18.63595185712437,
      "grad_norm": 0.17397448420524597,
      "learning_rate": 0.0009118037135278515,
      "loss": 2.7275,
      "step": 36000
    },
    {
      "epoch": 18.999870583667658,
      "eval_accuracy": 0.4201490782172229,
      "eval_loss": 2.98036527633667,
      "eval_runtime": 112.4975,
      "eval_samples_per_second": 463.921,
      "eval_steps_per_second": 7.253,
      "step": 36703
    },
    {
      "epoch": 19.153617186488933,
      "grad_norm": 0.18536260724067688,
      "learning_rate": 0.0008896993810786914,
      "loss": 2.7238,
      "step": 37000
    },
    {
      "epoch": 19.671282515853502,
      "grad_norm": 0.17299328744411469,
      "learning_rate": 0.0008676171529619805,
      "loss": 2.7099,
      "step": 38000
    },
    {
      "epoch": 20.0,
      "eval_accuracy": 0.42075012492063313,
      "eval_loss": 2.9760243892669678,
      "eval_runtime": 112.7131,
      "eval_samples_per_second": 463.034,
      "eval_steps_per_second": 7.24,
      "step": 38635
    },
    {
      "epoch": 20.188947845218067,
      "grad_norm": 0.19630002975463867,
      "learning_rate": 0.0008455128205128205,
      "loss": 2.7028,
      "step": 39000
    },
    {
      "epoch": 20.706613174582632,
      "grad_norm": 0.18252891302108765,
      "learning_rate": 0.0008234084880636605,
      "loss": 2.693,
      "step": 40000
    },
    {
      "epoch": 20.999611751002977,
      "eval_accuracy": 0.4216216764536817,
      "eval_loss": 2.968324899673462,
      "eval_runtime": 112.7059,
      "eval_samples_per_second": 463.064,
      "eval_steps_per_second": 7.24,
      "step": 40566
    },
    {
      "epoch": 21.224278503947197,
      "grad_norm": 0.19568035006523132,
      "learning_rate": 0.0008013262599469496,
      "loss": 2.6802,
      "step": 41000
    },
    {
      "epoch": 21.741943833311765,
      "grad_norm": 0.2092587798833847,
      "learning_rate": 0.0007792219274977895,
      "loss": 2.6785,
      "step": 42000
    },
    {
      "epoch": 21.99974116733532,
      "eval_accuracy": 0.4221156483286934,
      "eval_loss": 2.96663761138916,
      "eval_runtime": 112.5823,
      "eval_samples_per_second": 463.572,
      "eval_steps_per_second": 7.248,
      "step": 42498
    },
    {
      "epoch": 22.25960916267633,
      "grad_norm": 0.2045181393623352,
      "learning_rate": 0.0007571175950486296,
      "loss": 2.6616,
      "step": 43000
    },
    {
      "epoch": 22.777274492040895,
      "grad_norm": 0.19560641050338745,
      "learning_rate": 0.0007350132625994696,
      "loss": 2.6628,
      "step": 44000
    },
    {
      "epoch": 22.999870583667658,
      "eval_accuracy": 0.4226662007972378,
      "eval_loss": 2.9646129608154297,
      "eval_runtime": 112.7148,
      "eval_samples_per_second": 463.027,
      "eval_steps_per_second": 7.24,
      "step": 44430
    },
    {
      "epoch": 23.29493982140546,
      "grad_norm": 0.19825534522533417,
      "learning_rate": 0.0007129089301503095,
      "loss": 2.6395,
      "step": 45000
    },
    {
      "epoch": 23.812605150770025,
      "grad_norm": 0.1957850456237793,
      "learning_rate": 0.0006908267020335986,
      "loss": 2.6501,
      "step": 46000
    },
    {
      "epoch": 24.0,
      "eval_accuracy": 0.42281715752022214,
      "eval_loss": 2.9626119136810303,
      "eval_runtime": 111.9607,
      "eval_samples_per_second": 466.146,
      "eval_steps_per_second": 7.288,
      "step": 46362
    },
    {
      "epoch": 24.330270480134594,
      "grad_norm": 0.2085314244031906,
      "learning_rate": 0.0006687223695844385,
      "loss": 2.6181,
      "step": 47000
    },
    {
      "epoch": 24.84793580949916,
      "grad_norm": 0.21406565606594086,
      "learning_rate": 0.0006466401414677277,
      "loss": 2.6343,
      "step": 48000
    },
    {
      "epoch": 24.999611751002977,
      "eval_accuracy": 0.42334742212654364,
      "eval_loss": 2.9599573612213135,
      "eval_runtime": 111.9994,
      "eval_samples_per_second": 465.985,
      "eval_steps_per_second": 7.286,
      "step": 48293
    },
    {
      "epoch": 25.365601138863724,
      "grad_norm": 0.20139683783054352,
      "learning_rate": 0.0006245358090185677,
      "loss": 2.598,
      "step": 49000
    },
    {
      "epoch": 25.88326646822829,
      "grad_norm": 0.20590080320835114,
      "learning_rate": 0.0006024535809018568,
      "loss": 2.6198,
      "step": 50000
    },
    {
      "epoch": 25.99974116733532,
      "eval_accuracy": 0.42357351907998303,
      "eval_loss": 2.9637885093688965,
      "eval_runtime": 112.1406,
      "eval_samples_per_second": 465.398,
      "eval_steps_per_second": 7.277,
      "step": 50225
    },
    {
      "epoch": 26.400931797592857,
      "grad_norm": 0.2109968364238739,
      "learning_rate": 0.0005803492484526968,
      "loss": 2.5789,
      "step": 51000
    },
    {
      "epoch": 26.918597126957422,
      "grad_norm": 0.21926765143871307,
      "learning_rate": 0.0005582670203359858,
      "loss": 2.604,
      "step": 52000
    },
    {
      "epoch": 26.999870583667658,
      "eval_accuracy": 0.4239798023060537,
      "eval_loss": 2.9604480266571045,
      "eval_runtime": 112.1419,
      "eval_samples_per_second": 465.392,
      "eval_steps_per_second": 7.276,
      "step": 52157
    },
    {
      "epoch": 27.436262456321987,
      "grad_norm": 0.2280665636062622,
      "learning_rate": 0.0005361626878868259,
      "loss": 2.5576,
      "step": 53000
    },
    {
      "epoch": 27.953927785686552,
      "grad_norm": 0.218441903591156,
      "learning_rate": 0.000514080459770115,
      "loss": 2.5876,
      "step": 54000
    },
    {
      "epoch": 28.0,
      "eval_accuracy": 0.42452238991016983,
      "eval_loss": 2.960141658782959,
      "eval_runtime": 111.7102,
      "eval_samples_per_second": 467.191,
      "eval_steps_per_second": 7.305,
      "step": 54089
    },
    {
      "epoch": 28.0,
      "step": 54089,
      "total_flos": 1.808986925039616e+18,
      "train_loss": 2.9159927132606205,
      "train_runtime": 57317.9169,
      "train_samples_per_second": 345.105,
      "train_steps_per_second": 1.348
    }
  ],
  "logging_steps": 1000,
  "max_steps": 77240,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 40,
  "save_steps": 500,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 3,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 3
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.808986925039616e+18,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}