{ "best_metric": 0.10608438402414322, "best_model_checkpoint": "miner_id_24/checkpoint-500", "epoch": 0.2720348204570185, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000544069640914037, "eval_loss": 0.3388690948486328, "eval_runtime": 65.0573, "eval_samples_per_second": 5.949, "eval_steps_per_second": 1.491, "step": 1 }, { "epoch": 0.00544069640914037, "grad_norm": 0.17373442649841309, "learning_rate": 4.2000000000000004e-05, "loss": 0.2499, "step": 10 }, { "epoch": 0.01088139281828074, "grad_norm": 0.2752276062965393, "learning_rate": 8.400000000000001e-05, "loss": 0.2266, "step": 20 }, { "epoch": 0.01632208922742111, "grad_norm": 0.2970355451107025, "learning_rate": 0.000126, "loss": 0.1705, "step": 30 }, { "epoch": 0.02176278563656148, "grad_norm": 0.2946202754974365, "learning_rate": 0.00016800000000000002, "loss": 0.1953, "step": 40 }, { "epoch": 0.02720348204570185, "grad_norm": 0.16966816782951355, "learning_rate": 0.00021, "loss": 0.1967, "step": 50 }, { "epoch": 0.02720348204570185, "eval_loss": 0.14745917916297913, "eval_runtime": 65.2419, "eval_samples_per_second": 5.932, "eval_steps_per_second": 1.487, "step": 50 }, { "epoch": 0.03264417845484222, "grad_norm": 0.2769581973552704, "learning_rate": 0.00020974422527728155, "loss": 0.1933, "step": 60 }, { "epoch": 0.03808487486398259, "grad_norm": 0.4267159104347229, "learning_rate": 0.0002089781472178649, "loss": 0.2355, "step": 70 }, { "epoch": 0.04352557127312296, "grad_norm": 0.6827275156974792, "learning_rate": 0.0002077054980770496, "loss": 0.368, "step": 80 }, { "epoch": 0.04896626768226333, "grad_norm": 1.049018383026123, "learning_rate": 0.00020593247807352348, "loss": 0.4278, "step": 90 }, { "epoch": 0.0544069640914037, "grad_norm": 1.1495347023010254, "learning_rate": 0.00020366772518252038, "loss": 0.6546, "step": 100 }, { "epoch": 0.0544069640914037, "eval_loss": 0.16608624160289764, "eval_runtime": 65.2368, "eval_samples_per_second": 5.932, "eval_steps_per_second": 1.487, "step": 100 }, { "epoch": 0.05984766050054407, "grad_norm": 0.08401616662740707, "learning_rate": 0.0002009222730524731, "loss": 0.1106, "step": 110 }, { "epoch": 0.06528835690968444, "grad_norm": 0.1952836960554123, "learning_rate": 0.00019770949725018733, "loss": 0.1357, "step": 120 }, { "epoch": 0.07072905331882481, "grad_norm": 0.1629328429698944, "learning_rate": 0.00019404505009642473, "loss": 0.1751, "step": 130 }, { "epoch": 0.07616974972796518, "grad_norm": 0.14246857166290283, "learning_rate": 0.0001899467844093695, "loss": 0.1564, "step": 140 }, { "epoch": 0.08161044613710555, "grad_norm": 0.45524969696998596, "learning_rate": 0.00018543466652749268, "loss": 0.2137, "step": 150 }, { "epoch": 0.08161044613710555, "eval_loss": 0.13314945995807648, "eval_runtime": 65.6995, "eval_samples_per_second": 5.89, "eval_steps_per_second": 1.476, "step": 150 }, { "epoch": 0.08705114254624592, "grad_norm": 0.2759036123752594, "learning_rate": 0.00018053067903555837, "loss": 0.2328, "step": 160 }, { "epoch": 0.09249183895538629, "grad_norm": 0.34225738048553467, "learning_rate": 0.00017525871366768012, "loss": 0.2684, "step": 170 }, { "epoch": 0.09793253536452666, "grad_norm": 0.5663359761238098, "learning_rate": 0.00016964445490919413, "loss": 0.3034, "step": 180 }, { "epoch": 0.10337323177366703, "grad_norm": 0.4630700945854187, "learning_rate": 0.00016371525486442843, "loss": 0.3356, "step": 190 }, { "epoch": 0.1088139281828074, "grad_norm": 1.2807186841964722, "learning_rate": 0.0001575, "loss": 0.4927, "step": 200 }, { "epoch": 0.1088139281828074, "eval_loss": 0.15209516882896423, "eval_runtime": 65.2491, "eval_samples_per_second": 5.931, "eval_steps_per_second": 1.487, "step": 200 }, { "epoch": 0.11425462459194777, "grad_norm": 0.1879313737154007, "learning_rate": 0.00015102897041285315, "loss": 0.1106, "step": 210 }, { "epoch": 0.11969532100108814, "grad_norm": 0.2476097196340561, "learning_rate": 0.00014433369230867077, "loss": 0.1203, "step": 220 }, { "epoch": 0.1251360174102285, "grad_norm": 0.195736363530159, "learning_rate": 0.0001374467844093695, "loss": 0.129, "step": 230 }, { "epoch": 0.1305767138193689, "grad_norm": 0.23351465165615082, "learning_rate": 0.0001304017990379651, "loss": 0.1806, "step": 240 }, { "epoch": 0.13601741022850924, "grad_norm": 0.20477648079395294, "learning_rate": 0.0001232330586550277, "loss": 0.161, "step": 250 }, { "epoch": 0.13601741022850924, "eval_loss": 0.1386343389749527, "eval_runtime": 65.6794, "eval_samples_per_second": 5.892, "eval_steps_per_second": 1.477, "step": 250 }, { "epoch": 0.14145810663764963, "grad_norm": 0.3020540475845337, "learning_rate": 0.00011597548864310363, "loss": 0.1828, "step": 260 }, { "epoch": 0.14689880304678998, "grad_norm": 0.5656459331512451, "learning_rate": 0.00010866444715376263, "loss": 0.211, "step": 270 }, { "epoch": 0.15233949945593037, "grad_norm": 0.6137648224830627, "learning_rate": 0.00010133555284623744, "loss": 0.2859, "step": 280 }, { "epoch": 0.15778019586507072, "grad_norm": 1.091139316558838, "learning_rate": 9.402451135689641e-05, "loss": 0.3492, "step": 290 }, { "epoch": 0.1632208922742111, "grad_norm": 0.78134685754776, "learning_rate": 8.676694134497232e-05, "loss": 0.4765, "step": 300 }, { "epoch": 0.1632208922742111, "eval_loss": 0.12224029749631882, "eval_runtime": 65.225, "eval_samples_per_second": 5.933, "eval_steps_per_second": 1.487, "step": 300 }, { "epoch": 0.16866158868335146, "grad_norm": 0.06177017092704773, "learning_rate": 7.95982009620349e-05, "loss": 0.0763, "step": 310 }, { "epoch": 0.17410228509249184, "grad_norm": 0.22023071348667145, "learning_rate": 7.255321559063053e-05, "loss": 0.0765, "step": 320 }, { "epoch": 0.1795429815016322, "grad_norm": 0.22449415922164917, "learning_rate": 6.566630769132923e-05, "loss": 0.1294, "step": 330 }, { "epoch": 0.18498367791077258, "grad_norm": 0.2885405719280243, "learning_rate": 5.897102958714686e-05, "loss": 0.1433, "step": 340 }, { "epoch": 0.19042437431991294, "grad_norm": 0.1774507761001587, "learning_rate": 5.250000000000002e-05, "loss": 0.1847, "step": 350 }, { "epoch": 0.19042437431991294, "eval_loss": 0.11864767968654633, "eval_runtime": 65.2912, "eval_samples_per_second": 5.927, "eval_steps_per_second": 1.486, "step": 350 }, { "epoch": 0.19586507072905332, "grad_norm": 0.27000942826271057, "learning_rate": 4.62847451355716e-05, "loss": 0.1948, "step": 360 }, { "epoch": 0.20130576713819368, "grad_norm": 0.3730561435222626, "learning_rate": 4.035554509080588e-05, "loss": 0.1986, "step": 370 }, { "epoch": 0.20674646354733406, "grad_norm": 0.4994836151599884, "learning_rate": 3.474128633231992e-05, "loss": 0.2262, "step": 380 }, { "epoch": 0.21218715995647444, "grad_norm": 0.7578539848327637, "learning_rate": 2.946932096444165e-05, "loss": 0.3025, "step": 390 }, { "epoch": 0.2176278563656148, "grad_norm": 1.3834261894226074, "learning_rate": 2.456533347250732e-05, "loss": 0.5652, "step": 400 }, { "epoch": 0.2176278563656148, "eval_loss": 0.1065056249499321, "eval_runtime": 65.2426, "eval_samples_per_second": 5.932, "eval_steps_per_second": 1.487, "step": 400 }, { "epoch": 0.22306855277475518, "grad_norm": 0.11437083035707474, "learning_rate": 2.005321559063053e-05, "loss": 0.0697, "step": 410 }, { "epoch": 0.22850924918389554, "grad_norm": 0.10665331780910492, "learning_rate": 1.5954949903575276e-05, "loss": 0.0688, "step": 420 }, { "epoch": 0.23394994559303592, "grad_norm": 0.27620697021484375, "learning_rate": 1.2290502749812666e-05, "loss": 0.1334, "step": 430 }, { "epoch": 0.23939064200217627, "grad_norm": 0.16248451173305511, "learning_rate": 9.077726947526898e-06, "loss": 0.1574, "step": 440 }, { "epoch": 0.24483133841131666, "grad_norm": 0.19599585235118866, "learning_rate": 6.332274817479627e-06, "loss": 0.1893, "step": 450 }, { "epoch": 0.24483133841131666, "eval_loss": 0.10668922960758209, "eval_runtime": 65.667, "eval_samples_per_second": 5.893, "eval_steps_per_second": 1.477, "step": 450 }, { "epoch": 0.250272034820457, "grad_norm": 0.19068965315818787, "learning_rate": 4.067521926476516e-06, "loss": 0.1663, "step": 460 }, { "epoch": 0.25571273122959737, "grad_norm": 0.6219221353530884, "learning_rate": 2.294501922950403e-06, "loss": 0.2751, "step": 470 }, { "epoch": 0.2611534276387378, "grad_norm": 0.6672576665878296, "learning_rate": 1.021852782135112e-06, "loss": 0.2909, "step": 480 }, { "epoch": 0.26659412404787813, "grad_norm": 0.7676600217819214, "learning_rate": 2.5577472271845927e-07, "loss": 0.3287, "step": 490 }, { "epoch": 0.2720348204570185, "grad_norm": 0.9298309683799744, "learning_rate": 0.0, "loss": 0.4581, "step": 500 }, { "epoch": 0.2720348204570185, "eval_loss": 0.10608438402414322, "eval_runtime": 65.7082, "eval_samples_per_second": 5.89, "eval_steps_per_second": 1.476, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3049289243623424e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }