{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9212295869356388, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03842459173871278, "grad_norm": 0.7461163997650146, "learning_rate": 8.333333333333334e-05, "loss": 1.2598, "step": 10 }, { "epoch": 0.07684918347742556, "grad_norm": 0.25502100586891174, "learning_rate": 0.0001666666666666667, "loss": 0.7563, "step": 20 }, { "epoch": 0.11527377521613832, "grad_norm": 0.14209185540676117, "learning_rate": 0.00019996891820008164, "loss": 0.63, "step": 30 }, { "epoch": 0.15369836695485112, "grad_norm": 0.10494557023048401, "learning_rate": 0.0001997790438338385, "loss": 0.5646, "step": 40 }, { "epoch": 0.19212295869356388, "grad_norm": 0.11090180277824402, "learning_rate": 0.0001994168902089112, "loss": 0.5158, "step": 50 }, { "epoch": 0.23054755043227665, "grad_norm": 0.09561982750892639, "learning_rate": 0.00019888308262251285, "loss": 0.5171, "step": 60 }, { "epoch": 0.2689721421709894, "grad_norm": 0.13890881836414337, "learning_rate": 0.0001981785427508966, "loss": 0.5013, "step": 70 }, { "epoch": 0.30739673390970224, "grad_norm": 0.09685279428958893, "learning_rate": 0.00019730448705798239, "loss": 0.4803, "step": 80 }, { "epoch": 0.345821325648415, "grad_norm": 0.0978529155254364, "learning_rate": 0.0001962624246950012, "loss": 0.4824, "step": 90 }, { "epoch": 0.38424591738712777, "grad_norm": 0.09823426604270935, "learning_rate": 0.0001950541548947829, "loss": 0.4765, "step": 100 }, { "epoch": 0.42267050912584053, "grad_norm": 0.11479681730270386, "learning_rate": 0.0001936817638651871, "loss": 0.4804, "step": 110 }, { "epoch": 0.4610951008645533, "grad_norm": 0.1102244183421135, "learning_rate": 0.00019214762118704076, "loss": 0.4735, "step": 120 }, { "epoch": 0.49951969260326606, "grad_norm": 0.09442220628261566, "learning_rate": 0.00019045437572280194, "loss": 0.4654, "step": 130 }, { "epoch": 0.5379442843419788, "grad_norm": 0.0998912900686264, "learning_rate": 0.00018860495104301345, "loss": 0.4714, "step": 140 }, { "epoch": 0.5763688760806917, "grad_norm": 0.12593407928943634, "learning_rate": 0.00018660254037844388, "loss": 0.4652, "step": 150 }, { "epoch": 0.6147934678194045, "grad_norm": 0.10841673612594604, "learning_rate": 0.0001844506011066308, "loss": 0.4633, "step": 160 }, { "epoch": 0.6532180595581172, "grad_norm": 0.09892784804105759, "learning_rate": 0.00018215284878234642, "loss": 0.461, "step": 170 }, { "epoch": 0.69164265129683, "grad_norm": 0.5387171506881714, "learning_rate": 0.00017971325072229226, "loss": 0.4591, "step": 180 }, { "epoch": 0.7300672430355427, "grad_norm": 0.11192867159843445, "learning_rate": 0.0001771360191551, "loss": 0.4592, "step": 190 }, { "epoch": 0.7684918347742555, "grad_norm": 0.10694364458322525, "learning_rate": 0.00017442560394846516, "loss": 0.4574, "step": 200 }, { "epoch": 0.8069164265129684, "grad_norm": 0.10873424261808395, "learning_rate": 0.00017158668492597186, "loss": 0.4492, "step": 210 }, { "epoch": 0.8453410182516811, "grad_norm": 0.11694315820932388, "learning_rate": 0.0001686241637868734, "loss": 0.4467, "step": 220 }, { "epoch": 0.8837656099903939, "grad_norm": 0.10100408643484116, "learning_rate": 0.000165543155642781, "loss": 0.4488, "step": 230 }, { "epoch": 0.9221902017291066, "grad_norm": 0.10397649556398392, "learning_rate": 0.00016234898018587337, "loss": 0.447, "step": 240 }, { "epoch": 0.9606147934678194, "grad_norm": 0.10007993876934052, "learning_rate": 0.00015904715250387498, "loss": 0.4428, "step": 250 }, { "epoch": 0.9990393852065321, "grad_norm": 0.10865867137908936, "learning_rate": 0.00015564337355766412, "loss": 0.4452, "step": 260 }, { "epoch": 1.037463976945245, "grad_norm": 0.10476306080818176, "learning_rate": 0.0001521435203379498, "loss": 0.4367, "step": 270 }, { "epoch": 1.0758885686839577, "grad_norm": 0.10958375781774521, "learning_rate": 0.00014855363571801523, "loss": 0.4336, "step": 280 }, { "epoch": 1.1143131604226706, "grad_norm": 0.11801016330718994, "learning_rate": 0.00014487991802004623, "loss": 0.4346, "step": 290 }, { "epoch": 1.1527377521613833, "grad_norm": 0.11526134610176086, "learning_rate": 0.00014112871031306119, "loss": 0.4221, "step": 300 }, { "epoch": 1.191162343900096, "grad_norm": 0.10705429315567017, "learning_rate": 0.0001373064894609194, "loss": 0.4363, "step": 310 }, { "epoch": 1.229586935638809, "grad_norm": 0.09906008094549179, "learning_rate": 0.00013341985493931877, "loss": 0.4359, "step": 320 }, { "epoch": 1.2680115273775217, "grad_norm": 0.11935935914516449, "learning_rate": 0.00012947551744109043, "loss": 0.429, "step": 330 }, { "epoch": 1.3064361191162344, "grad_norm": 0.11651390045881271, "learning_rate": 0.0001254802872894655, "loss": 0.4295, "step": 340 }, { "epoch": 1.344860710854947, "grad_norm": 0.13374051451683044, "learning_rate": 0.00012144106267931876, "loss": 0.43, "step": 350 }, { "epoch": 1.38328530259366, "grad_norm": 0.10709749907255173, "learning_rate": 0.00011736481776669306, "loss": 0.4229, "step": 360 }, { "epoch": 1.4217098943323727, "grad_norm": 0.10747699439525604, "learning_rate": 0.00011325859062716795, "loss": 0.4255, "step": 370 }, { "epoch": 1.4601344860710854, "grad_norm": 0.1302700638771057, "learning_rate": 0.00010912947110386484, "loss": 0.4314, "step": 380 }, { "epoch": 1.4985590778097984, "grad_norm": 0.10743537545204163, "learning_rate": 0.00010498458856606972, "loss": 0.4242, "step": 390 }, { "epoch": 1.536983669548511, "grad_norm": 0.11519400030374527, "learning_rate": 0.00010083109959960973, "loss": 0.4216, "step": 400 }, { "epoch": 1.5754082612872238, "grad_norm": 0.11456304788589478, "learning_rate": 9.667617565023735e-05, "loss": 0.4315, "step": 410 }, { "epoch": 1.6138328530259365, "grad_norm": 0.10759314894676208, "learning_rate": 9.252699064135758e-05, "loss": 0.4199, "step": 420 }, { "epoch": 1.6522574447646494, "grad_norm": 0.1024770587682724, "learning_rate": 8.839070858747697e-05, "loss": 0.4272, "step": 430 }, { "epoch": 1.6906820365033621, "grad_norm": 0.10736548155546188, "learning_rate": 8.427447122476148e-05, "loss": 0.4232, "step": 440 }, { "epoch": 1.729106628242075, "grad_norm": 0.1060362458229065, "learning_rate": 8.018538568006027e-05, "loss": 0.4237, "step": 450 }, { "epoch": 1.7675312199807878, "grad_norm": 0.10463803261518478, "learning_rate": 7.613051219968623e-05, "loss": 0.4247, "step": 460 }, { "epoch": 1.8059558117195005, "grad_norm": 0.10327400267124176, "learning_rate": 7.211685195914097e-05, "loss": 0.4196, "step": 470 }, { "epoch": 1.8443804034582132, "grad_norm": 0.13895417749881744, "learning_rate": 6.815133497483157e-05, "loss": 0.4205, "step": 480 }, { "epoch": 1.882804995196926, "grad_norm": 0.10684759169816971, "learning_rate": 6.424080813865138e-05, "loss": 0.4224, "step": 490 }, { "epoch": 1.9212295869356388, "grad_norm": 0.13927388191223145, "learning_rate": 6.039202339608432e-05, "loss": 0.4196, "step": 500 } ], "logging_steps": 10, "max_steps": 780, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.71469629094101e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }