|
{ |
|
"best_metric": 0.10608438402414322, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-500", |
|
"epoch": 0.2720348204570185, |
|
"eval_steps": 50, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.000544069640914037, |
|
"eval_loss": 0.3388690948486328, |
|
"eval_runtime": 65.0573, |
|
"eval_samples_per_second": 5.949, |
|
"eval_steps_per_second": 1.491, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.00544069640914037, |
|
"grad_norm": 0.17373442649841309, |
|
"learning_rate": 4.2000000000000004e-05, |
|
"loss": 0.2499, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.01088139281828074, |
|
"grad_norm": 0.2752276062965393, |
|
"learning_rate": 8.400000000000001e-05, |
|
"loss": 0.2266, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.01632208922742111, |
|
"grad_norm": 0.2970355451107025, |
|
"learning_rate": 0.000126, |
|
"loss": 0.1705, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.02176278563656148, |
|
"grad_norm": 0.2946202754974365, |
|
"learning_rate": 0.00016800000000000002, |
|
"loss": 0.1953, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.02720348204570185, |
|
"grad_norm": 0.16966816782951355, |
|
"learning_rate": 0.00021, |
|
"loss": 0.1967, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.02720348204570185, |
|
"eval_loss": 0.14745917916297913, |
|
"eval_runtime": 65.2419, |
|
"eval_samples_per_second": 5.932, |
|
"eval_steps_per_second": 1.487, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.03264417845484222, |
|
"grad_norm": 0.2769581973552704, |
|
"learning_rate": 0.00020974422527728155, |
|
"loss": 0.1933, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.03808487486398259, |
|
"grad_norm": 0.4267159104347229, |
|
"learning_rate": 0.0002089781472178649, |
|
"loss": 0.2355, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.04352557127312296, |
|
"grad_norm": 0.6827275156974792, |
|
"learning_rate": 0.0002077054980770496, |
|
"loss": 0.368, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.04896626768226333, |
|
"grad_norm": 1.049018383026123, |
|
"learning_rate": 0.00020593247807352348, |
|
"loss": 0.4278, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.0544069640914037, |
|
"grad_norm": 1.1495347023010254, |
|
"learning_rate": 0.00020366772518252038, |
|
"loss": 0.6546, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0544069640914037, |
|
"eval_loss": 0.16608624160289764, |
|
"eval_runtime": 65.2368, |
|
"eval_samples_per_second": 5.932, |
|
"eval_steps_per_second": 1.487, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.05984766050054407, |
|
"grad_norm": 0.08401616662740707, |
|
"learning_rate": 0.0002009222730524731, |
|
"loss": 0.1106, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.06528835690968444, |
|
"grad_norm": 0.1952836960554123, |
|
"learning_rate": 0.00019770949725018733, |
|
"loss": 0.1357, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.07072905331882481, |
|
"grad_norm": 0.1629328429698944, |
|
"learning_rate": 0.00019404505009642473, |
|
"loss": 0.1751, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.07616974972796518, |
|
"grad_norm": 0.14246857166290283, |
|
"learning_rate": 0.0001899467844093695, |
|
"loss": 0.1564, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.08161044613710555, |
|
"grad_norm": 0.45524969696998596, |
|
"learning_rate": 0.00018543466652749268, |
|
"loss": 0.2137, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.08161044613710555, |
|
"eval_loss": 0.13314945995807648, |
|
"eval_runtime": 65.6995, |
|
"eval_samples_per_second": 5.89, |
|
"eval_steps_per_second": 1.476, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.08705114254624592, |
|
"grad_norm": 0.2759036123752594, |
|
"learning_rate": 0.00018053067903555837, |
|
"loss": 0.2328, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.09249183895538629, |
|
"grad_norm": 0.34225738048553467, |
|
"learning_rate": 0.00017525871366768012, |
|
"loss": 0.2684, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.09793253536452666, |
|
"grad_norm": 0.5663359761238098, |
|
"learning_rate": 0.00016964445490919413, |
|
"loss": 0.3034, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.10337323177366703, |
|
"grad_norm": 0.4630700945854187, |
|
"learning_rate": 0.00016371525486442843, |
|
"loss": 0.3356, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.1088139281828074, |
|
"grad_norm": 1.2807186841964722, |
|
"learning_rate": 0.0001575, |
|
"loss": 0.4927, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.1088139281828074, |
|
"eval_loss": 0.15209516882896423, |
|
"eval_runtime": 65.2491, |
|
"eval_samples_per_second": 5.931, |
|
"eval_steps_per_second": 1.487, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.11425462459194777, |
|
"grad_norm": 0.1879313737154007, |
|
"learning_rate": 0.00015102897041285315, |
|
"loss": 0.1106, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.11969532100108814, |
|
"grad_norm": 0.2476097196340561, |
|
"learning_rate": 0.00014433369230867077, |
|
"loss": 0.1203, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.1251360174102285, |
|
"grad_norm": 0.195736363530159, |
|
"learning_rate": 0.0001374467844093695, |
|
"loss": 0.129, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.1305767138193689, |
|
"grad_norm": 0.23351465165615082, |
|
"learning_rate": 0.0001304017990379651, |
|
"loss": 0.1806, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.13601741022850924, |
|
"grad_norm": 0.20477648079395294, |
|
"learning_rate": 0.0001232330586550277, |
|
"loss": 0.161, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.13601741022850924, |
|
"eval_loss": 0.1386343389749527, |
|
"eval_runtime": 65.6794, |
|
"eval_samples_per_second": 5.892, |
|
"eval_steps_per_second": 1.477, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.14145810663764963, |
|
"grad_norm": 0.3020540475845337, |
|
"learning_rate": 0.00011597548864310363, |
|
"loss": 0.1828, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.14689880304678998, |
|
"grad_norm": 0.5656459331512451, |
|
"learning_rate": 0.00010866444715376263, |
|
"loss": 0.211, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.15233949945593037, |
|
"grad_norm": 0.6137648224830627, |
|
"learning_rate": 0.00010133555284623744, |
|
"loss": 0.2859, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.15778019586507072, |
|
"grad_norm": 1.091139316558838, |
|
"learning_rate": 9.402451135689641e-05, |
|
"loss": 0.3492, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.1632208922742111, |
|
"grad_norm": 0.78134685754776, |
|
"learning_rate": 8.676694134497232e-05, |
|
"loss": 0.4765, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.1632208922742111, |
|
"eval_loss": 0.12224029749631882, |
|
"eval_runtime": 65.225, |
|
"eval_samples_per_second": 5.933, |
|
"eval_steps_per_second": 1.487, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.16866158868335146, |
|
"grad_norm": 0.06177017092704773, |
|
"learning_rate": 7.95982009620349e-05, |
|
"loss": 0.0763, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.17410228509249184, |
|
"grad_norm": 0.22023071348667145, |
|
"learning_rate": 7.255321559063053e-05, |
|
"loss": 0.0765, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.1795429815016322, |
|
"grad_norm": 0.22449415922164917, |
|
"learning_rate": 6.566630769132923e-05, |
|
"loss": 0.1294, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.18498367791077258, |
|
"grad_norm": 0.2885405719280243, |
|
"learning_rate": 5.897102958714686e-05, |
|
"loss": 0.1433, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.19042437431991294, |
|
"grad_norm": 0.1774507761001587, |
|
"learning_rate": 5.250000000000002e-05, |
|
"loss": 0.1847, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.19042437431991294, |
|
"eval_loss": 0.11864767968654633, |
|
"eval_runtime": 65.2912, |
|
"eval_samples_per_second": 5.927, |
|
"eval_steps_per_second": 1.486, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.19586507072905332, |
|
"grad_norm": 0.27000942826271057, |
|
"learning_rate": 4.62847451355716e-05, |
|
"loss": 0.1948, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.20130576713819368, |
|
"grad_norm": 0.3730561435222626, |
|
"learning_rate": 4.035554509080588e-05, |
|
"loss": 0.1986, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.20674646354733406, |
|
"grad_norm": 0.4994836151599884, |
|
"learning_rate": 3.474128633231992e-05, |
|
"loss": 0.2262, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.21218715995647444, |
|
"grad_norm": 0.7578539848327637, |
|
"learning_rate": 2.946932096444165e-05, |
|
"loss": 0.3025, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.2176278563656148, |
|
"grad_norm": 1.3834261894226074, |
|
"learning_rate": 2.456533347250732e-05, |
|
"loss": 0.5652, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.2176278563656148, |
|
"eval_loss": 0.1065056249499321, |
|
"eval_runtime": 65.2426, |
|
"eval_samples_per_second": 5.932, |
|
"eval_steps_per_second": 1.487, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.22306855277475518, |
|
"grad_norm": 0.11437083035707474, |
|
"learning_rate": 2.005321559063053e-05, |
|
"loss": 0.0697, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.22850924918389554, |
|
"grad_norm": 0.10665331780910492, |
|
"learning_rate": 1.5954949903575276e-05, |
|
"loss": 0.0688, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.23394994559303592, |
|
"grad_norm": 0.27620697021484375, |
|
"learning_rate": 1.2290502749812666e-05, |
|
"loss": 0.1334, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.23939064200217627, |
|
"grad_norm": 0.16248451173305511, |
|
"learning_rate": 9.077726947526898e-06, |
|
"loss": 0.1574, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.24483133841131666, |
|
"grad_norm": 0.19599585235118866, |
|
"learning_rate": 6.332274817479627e-06, |
|
"loss": 0.1893, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.24483133841131666, |
|
"eval_loss": 0.10668922960758209, |
|
"eval_runtime": 65.667, |
|
"eval_samples_per_second": 5.893, |
|
"eval_steps_per_second": 1.477, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.250272034820457, |
|
"grad_norm": 0.19068965315818787, |
|
"learning_rate": 4.067521926476516e-06, |
|
"loss": 0.1663, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.25571273122959737, |
|
"grad_norm": 0.6219221353530884, |
|
"learning_rate": 2.294501922950403e-06, |
|
"loss": 0.2751, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.2611534276387378, |
|
"grad_norm": 0.6672576665878296, |
|
"learning_rate": 1.021852782135112e-06, |
|
"loss": 0.2909, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.26659412404787813, |
|
"grad_norm": 0.7676600217819214, |
|
"learning_rate": 2.5577472271845927e-07, |
|
"loss": 0.3287, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.2720348204570185, |
|
"grad_norm": 0.9298309683799744, |
|
"learning_rate": 0.0, |
|
"loss": 0.4581, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.2720348204570185, |
|
"eval_loss": 0.10608438402414322, |
|
"eval_runtime": 65.7082, |
|
"eval_samples_per_second": 5.89, |
|
"eval_steps_per_second": 1.476, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.3049289243623424e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|