{ "best_metric": 2.425506353378296, "best_model_checkpoint": "miner_id_24/checkpoint-500", "epoch": 0.03208213025344883, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 6.416426050689766e-05, "eval_loss": 3.0761466026306152, "eval_runtime": 248.0097, "eval_samples_per_second": 26.463, "eval_steps_per_second": 6.617, "step": 1 }, { "epoch": 0.0006416426050689766, "grad_norm": 2.1181607246398926, "learning_rate": 4.08e-05, "loss": 2.6512, "step": 10 }, { "epoch": 0.0012832852101379532, "grad_norm": 2.143641948699951, "learning_rate": 8.16e-05, "loss": 2.553, "step": 20 }, { "epoch": 0.0019249278152069298, "grad_norm": 2.2535758018493652, "learning_rate": 0.0001224, "loss": 2.4612, "step": 30 }, { "epoch": 0.0025665704202759063, "grad_norm": 2.534200668334961, "learning_rate": 0.0001632, "loss": 2.521, "step": 40 }, { "epoch": 0.003208213025344883, "grad_norm": 3.303925037384033, "learning_rate": 0.000204, "loss": 2.5419, "step": 50 }, { "epoch": 0.003208213025344883, "eval_loss": 2.524552345275879, "eval_runtime": 247.4632, "eval_samples_per_second": 26.521, "eval_steps_per_second": 6.631, "step": 50 }, { "epoch": 0.0038498556304138597, "grad_norm": 1.3380857706069946, "learning_rate": 0.00020375153312650207, "loss": 2.4129, "step": 60 }, { "epoch": 0.004491498235482836, "grad_norm": 1.7602264881134033, "learning_rate": 0.00020300734301164017, "loss": 2.4986, "step": 70 }, { "epoch": 0.005133140840551813, "grad_norm": 1.8537044525146484, "learning_rate": 0.00020177105527484818, "loss": 2.441, "step": 80 }, { "epoch": 0.005774783445620789, "grad_norm": 2.583019971847534, "learning_rate": 0.00020004869298570854, "loss": 2.5052, "step": 90 }, { "epoch": 0.006416426050689766, "grad_norm": 3.4130592346191406, "learning_rate": 0.00019784864732016265, "loss": 2.5278, "step": 100 }, { "epoch": 0.006416426050689766, "eval_loss": 2.5519673824310303, "eval_runtime": 247.2928, "eval_samples_per_second": 26.539, "eval_steps_per_second": 6.636, "step": 100 }, { "epoch": 0.007058068655758742, "grad_norm": 1.2456212043762207, "learning_rate": 0.00019518163667954527, "loss": 2.4501, "step": 110 }, { "epoch": 0.007699711260827719, "grad_norm": 1.8833853006362915, "learning_rate": 0.00019206065447161056, "loss": 2.4523, "step": 120 }, { "epoch": 0.008341353865896696, "grad_norm": 2.167327404022217, "learning_rate": 0.00018850090580795544, "loss": 2.5085, "step": 130 }, { "epoch": 0.008982996470965673, "grad_norm": 2.533130645751953, "learning_rate": 0.00018451973342624464, "loss": 2.4788, "step": 140 }, { "epoch": 0.009624639076034648, "grad_norm": 3.3289971351623535, "learning_rate": 0.00018013653319813575, "loss": 2.3569, "step": 150 }, { "epoch": 0.009624639076034648, "eval_loss": 2.561899423599243, "eval_runtime": 248.4044, "eval_samples_per_second": 26.421, "eval_steps_per_second": 6.606, "step": 150 }, { "epoch": 0.010266281681103625, "grad_norm": 1.4331955909729004, "learning_rate": 0.0001753726596345424, "loss": 2.5594, "step": 160 }, { "epoch": 0.010907924286172602, "grad_norm": 1.6859421730041504, "learning_rate": 0.00017025132184860355, "loss": 2.4436, "step": 170 }, { "epoch": 0.011549566891241578, "grad_norm": 2.1262240409851074, "learning_rate": 0.00016479747048321714, "loss": 2.5153, "step": 180 }, { "epoch": 0.012191209496310555, "grad_norm": 2.288388252258301, "learning_rate": 0.00015903767615401616, "loss": 2.4702, "step": 190 }, { "epoch": 0.012832852101379532, "grad_norm": 3.3808329105377197, "learning_rate": 0.000153, "loss": 2.4684, "step": 200 }, { "epoch": 0.012832852101379532, "eval_loss": 2.4939587116241455, "eval_runtime": 247.9007, "eval_samples_per_second": 26.474, "eval_steps_per_second": 6.62, "step": 200 }, { "epoch": 0.013474494706448507, "grad_norm": 1.3569692373275757, "learning_rate": 0.0001467138569724859, "loss": 2.463, "step": 210 }, { "epoch": 0.014116137311517485, "grad_norm": 1.664777159690857, "learning_rate": 0.00014020987252842305, "loss": 2.5294, "step": 220 }, { "epoch": 0.014757779916586462, "grad_norm": 1.891471266746521, "learning_rate": 0.00013351973342624464, "loss": 2.4667, "step": 230 }, { "epoch": 0.015399422521655439, "grad_norm": 2.472409248352051, "learning_rate": 0.00012667603335116609, "loss": 2.4393, "step": 240 }, { "epoch": 0.016041065126724416, "grad_norm": 3.5608251094818115, "learning_rate": 0.00011971211412202691, "loss": 2.3954, "step": 250 }, { "epoch": 0.016041065126724416, "eval_loss": 2.4938225746154785, "eval_runtime": 247.9352, "eval_samples_per_second": 26.471, "eval_steps_per_second": 6.619, "step": 250 }, { "epoch": 0.01668270773179339, "grad_norm": 1.4132589101791382, "learning_rate": 0.00011266190325330066, "loss": 2.4294, "step": 260 }, { "epoch": 0.017324350336862367, "grad_norm": 1.6854068040847778, "learning_rate": 0.00010555974866365511, "loss": 2.4818, "step": 270 }, { "epoch": 0.017965992941931345, "grad_norm": 1.9312472343444824, "learning_rate": 9.844025133634492e-05, "loss": 2.575, "step": 280 }, { "epoch": 0.01860763554700032, "grad_norm": 2.2843751907348633, "learning_rate": 9.133809674669937e-05, "loss": 2.4306, "step": 290 }, { "epoch": 0.019249278152069296, "grad_norm": 3.1359288692474365, "learning_rate": 8.428788587797311e-05, "loss": 2.4642, "step": 300 }, { "epoch": 0.019249278152069296, "eval_loss": 2.4635369777679443, "eval_runtime": 248.3558, "eval_samples_per_second": 26.426, "eval_steps_per_second": 6.607, "step": 300 }, { "epoch": 0.019890920757138275, "grad_norm": 1.1439577341079712, "learning_rate": 7.73239666488339e-05, "loss": 2.4846, "step": 310 }, { "epoch": 0.02053256336220725, "grad_norm": 1.5837206840515137, "learning_rate": 7.048026657375537e-05, "loss": 2.4171, "step": 320 }, { "epoch": 0.021174205967276226, "grad_norm": 1.7678472995758057, "learning_rate": 6.379012747157697e-05, "loss": 2.3862, "step": 330 }, { "epoch": 0.021815848572345205, "grad_norm": 2.200498342514038, "learning_rate": 5.7286143027514095e-05, "loss": 2.4574, "step": 340 }, { "epoch": 0.02245749117741418, "grad_norm": 2.8402225971221924, "learning_rate": 5.100000000000002e-05, "loss": 2.458, "step": 350 }, { "epoch": 0.02245749117741418, "eval_loss": 2.4442172050476074, "eval_runtime": 248.9891, "eval_samples_per_second": 26.359, "eval_steps_per_second": 6.591, "step": 350 }, { "epoch": 0.023099133782483156, "grad_norm": 1.2951734066009521, "learning_rate": 4.496232384598384e-05, "loss": 2.3697, "step": 360 }, { "epoch": 0.023740776387552134, "grad_norm": 1.5272811651229858, "learning_rate": 3.9202529516782854e-05, "loss": 2.4261, "step": 370 }, { "epoch": 0.02438241899262111, "grad_norm": 1.9741696119308472, "learning_rate": 3.374867815139649e-05, "loss": 2.4812, "step": 380 }, { "epoch": 0.025024061597690085, "grad_norm": 1.9671711921691895, "learning_rate": 2.8627340365457602e-05, "loss": 2.414, "step": 390 }, { "epoch": 0.025665704202759064, "grad_norm": 3.4139561653137207, "learning_rate": 2.3863466801864254e-05, "loss": 2.3964, "step": 400 }, { "epoch": 0.025665704202759064, "eval_loss": 2.4310266971588135, "eval_runtime": 248.5605, "eval_samples_per_second": 26.404, "eval_steps_per_second": 6.602, "step": 400 }, { "epoch": 0.02630734680782804, "grad_norm": 1.3299930095672607, "learning_rate": 1.9480266573755372e-05, "loss": 2.4124, "step": 410 }, { "epoch": 0.026948989412897015, "grad_norm": 1.694121241569519, "learning_rate": 1.5499094192044554e-05, "loss": 2.4622, "step": 420 }, { "epoch": 0.027590632017965994, "grad_norm": 1.9327712059020996, "learning_rate": 1.1939345528389446e-05, "loss": 2.4714, "step": 430 }, { "epoch": 0.02823227462303497, "grad_norm": 2.3316149711608887, "learning_rate": 8.818363320454701e-06, "loss": 2.4211, "step": 440 }, { "epoch": 0.028873917228103944, "grad_norm": 3.5221705436706543, "learning_rate": 6.1513526798373514e-06, "loss": 2.4161, "step": 450 }, { "epoch": 0.028873917228103944, "eval_loss": 2.426518440246582, "eval_runtime": 248.4092, "eval_samples_per_second": 26.42, "eval_steps_per_second": 6.606, "step": 450 }, { "epoch": 0.029515559833172923, "grad_norm": 1.2081670761108398, "learning_rate": 3.9513070142914725e-06, "loss": 2.4222, "step": 460 }, { "epoch": 0.0301572024382419, "grad_norm": 1.6027675867080688, "learning_rate": 2.2289447251518195e-06, "loss": 2.4569, "step": 470 }, { "epoch": 0.030798845043310877, "grad_norm": 2.0222573280334473, "learning_rate": 9.92656988359823e-07, "loss": 2.4387, "step": 480 }, { "epoch": 0.03144048764837985, "grad_norm": 2.331239700317383, "learning_rate": 2.4846687349793185e-07, "loss": 2.4296, "step": 490 }, { "epoch": 0.03208213025344883, "grad_norm": 3.7559499740600586, "learning_rate": 0.0, "loss": 2.5162, "step": 500 }, { "epoch": 0.03208213025344883, "eval_loss": 2.425506353378296, "eval_runtime": 248.6079, "eval_samples_per_second": 26.399, "eval_steps_per_second": 6.601, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.8513699793731584e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }