{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 774, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03875968992248062, "grad_norm": 4.869916612403419, "learning_rate": 5e-06, "loss": 0.7587, "step": 10 }, { "epoch": 0.07751937984496124, "grad_norm": 2.356800536439144, "learning_rate": 5e-06, "loss": 0.6463, "step": 20 }, { "epoch": 0.11627906976744186, "grad_norm": 2.980197187385176, "learning_rate": 5e-06, "loss": 0.619, "step": 30 }, { "epoch": 0.15503875968992248, "grad_norm": 2.1795876864921673, "learning_rate": 5e-06, "loss": 0.6062, "step": 40 }, { "epoch": 0.1937984496124031, "grad_norm": 3.8053120183248743, "learning_rate": 5e-06, "loss": 0.5945, "step": 50 }, { "epoch": 0.23255813953488372, "grad_norm": 2.266077683631817, "learning_rate": 5e-06, "loss": 0.5926, "step": 60 }, { "epoch": 0.2713178294573643, "grad_norm": 2.6521624393299073, "learning_rate": 5e-06, "loss": 0.5891, "step": 70 }, { "epoch": 0.31007751937984496, "grad_norm": 1.8499417581910265, "learning_rate": 5e-06, "loss": 0.5807, "step": 80 }, { "epoch": 0.3488372093023256, "grad_norm": 1.448091288991347, "learning_rate": 5e-06, "loss": 0.5764, "step": 90 }, { "epoch": 0.3875968992248062, "grad_norm": 1.331827196567518, "learning_rate": 5e-06, "loss": 0.5762, "step": 100 }, { "epoch": 0.4263565891472868, "grad_norm": 1.3983432251748389, "learning_rate": 5e-06, "loss": 0.5721, "step": 110 }, { "epoch": 0.46511627906976744, "grad_norm": 1.2625603233326237, "learning_rate": 5e-06, "loss": 0.5668, "step": 120 }, { "epoch": 0.5038759689922481, "grad_norm": 1.1208975511203456, "learning_rate": 5e-06, "loss": 0.5691, "step": 130 }, { "epoch": 0.5426356589147286, "grad_norm": 1.5271934431585836, "learning_rate": 5e-06, "loss": 0.5667, "step": 140 }, { "epoch": 0.5813953488372093, "grad_norm": 1.3849439710808884, "learning_rate": 5e-06, "loss": 0.5707, "step": 150 }, { "epoch": 0.6201550387596899, "grad_norm": 1.784019748077074, "learning_rate": 5e-06, "loss": 0.5602, "step": 160 }, { "epoch": 0.6589147286821705, "grad_norm": 1.2242333910528385, "learning_rate": 5e-06, "loss": 0.5592, "step": 170 }, { "epoch": 0.6976744186046512, "grad_norm": 1.0610732976745183, "learning_rate": 5e-06, "loss": 0.5563, "step": 180 }, { "epoch": 0.7364341085271318, "grad_norm": 1.286529481018184, "learning_rate": 5e-06, "loss": 0.5646, "step": 190 }, { "epoch": 0.7751937984496124, "grad_norm": 1.733002678157722, "learning_rate": 5e-06, "loss": 0.5554, "step": 200 }, { "epoch": 0.813953488372093, "grad_norm": 1.566488894753038, "learning_rate": 5e-06, "loss": 0.5587, "step": 210 }, { "epoch": 0.8527131782945736, "grad_norm": 1.45338662358728, "learning_rate": 5e-06, "loss": 0.5565, "step": 220 }, { "epoch": 0.8914728682170543, "grad_norm": 1.5845555463612138, "learning_rate": 5e-06, "loss": 0.5537, "step": 230 }, { "epoch": 0.9302325581395349, "grad_norm": 1.310120628712062, "learning_rate": 5e-06, "loss": 0.5544, "step": 240 }, { "epoch": 0.9689922480620154, "grad_norm": 1.2711146035798313, "learning_rate": 5e-06, "loss": 0.5488, "step": 250 }, { "epoch": 1.0077519379844961, "grad_norm": 1.279690552953799, "learning_rate": 5e-06, "loss": 0.5403, "step": 260 }, { "epoch": 1.0465116279069768, "grad_norm": 1.403934103523871, "learning_rate": 5e-06, "loss": 0.4888, "step": 270 }, { "epoch": 1.0852713178294573, "grad_norm": 1.7860715320959983, "learning_rate": 5e-06, "loss": 0.4881, "step": 280 }, { "epoch": 1.124031007751938, "grad_norm": 2.6582065674591817, "learning_rate": 5e-06, "loss": 0.4826, "step": 290 }, { "epoch": 1.1627906976744187, "grad_norm": 2.5637630175870156, "learning_rate": 5e-06, "loss": 0.4795, "step": 300 }, { "epoch": 1.2015503875968991, "grad_norm": 1.836601052714763, "learning_rate": 5e-06, "loss": 0.4851, "step": 310 }, { "epoch": 1.2403100775193798, "grad_norm": 2.7460689111042726, "learning_rate": 5e-06, "loss": 0.4819, "step": 320 }, { "epoch": 1.2790697674418605, "grad_norm": 1.960316959139004, "learning_rate": 5e-06, "loss": 0.4825, "step": 330 }, { "epoch": 1.3178294573643412, "grad_norm": 1.5245042026619613, "learning_rate": 5e-06, "loss": 0.4872, "step": 340 }, { "epoch": 1.3565891472868217, "grad_norm": 1.1226006584443515, "learning_rate": 5e-06, "loss": 0.4848, "step": 350 }, { "epoch": 1.3953488372093024, "grad_norm": 1.6342475055168568, "learning_rate": 5e-06, "loss": 0.4891, "step": 360 }, { "epoch": 1.4341085271317828, "grad_norm": 1.1926368928566515, "learning_rate": 5e-06, "loss": 0.4878, "step": 370 }, { "epoch": 1.4728682170542635, "grad_norm": 1.6154699522052325, "learning_rate": 5e-06, "loss": 0.4865, "step": 380 }, { "epoch": 1.5116279069767442, "grad_norm": 1.3392832383092106, "learning_rate": 5e-06, "loss": 0.4913, "step": 390 }, { "epoch": 1.550387596899225, "grad_norm": 1.4892182638716063, "learning_rate": 5e-06, "loss": 0.4875, "step": 400 }, { "epoch": 1.5891472868217056, "grad_norm": 1.3885053163345062, "learning_rate": 5e-06, "loss": 0.4839, "step": 410 }, { "epoch": 1.627906976744186, "grad_norm": 1.1196443684292803, "learning_rate": 5e-06, "loss": 0.4891, "step": 420 }, { "epoch": 1.6666666666666665, "grad_norm": 1.1238270569651316, "learning_rate": 5e-06, "loss": 0.4928, "step": 430 }, { "epoch": 1.7054263565891472, "grad_norm": 1.298011997164272, "learning_rate": 5e-06, "loss": 0.4893, "step": 440 }, { "epoch": 1.744186046511628, "grad_norm": 1.3168801932240708, "learning_rate": 5e-06, "loss": 0.4925, "step": 450 }, { "epoch": 1.7829457364341086, "grad_norm": 1.2894860735538498, "learning_rate": 5e-06, "loss": 0.4897, "step": 460 }, { "epoch": 1.8217054263565893, "grad_norm": 1.1452568334867488, "learning_rate": 5e-06, "loss": 0.4918, "step": 470 }, { "epoch": 1.8604651162790697, "grad_norm": 1.7898485533967223, "learning_rate": 5e-06, "loss": 0.4952, "step": 480 }, { "epoch": 1.8992248062015504, "grad_norm": 2.5528753434996063, "learning_rate": 5e-06, "loss": 0.4919, "step": 490 }, { "epoch": 1.937984496124031, "grad_norm": 2.4812121040462762, "learning_rate": 5e-06, "loss": 0.4952, "step": 500 }, { "epoch": 1.9767441860465116, "grad_norm": 1.8738858925975534, "learning_rate": 5e-06, "loss": 0.4912, "step": 510 }, { "epoch": 2.0155038759689923, "grad_norm": 2.9903948329494052, "learning_rate": 5e-06, "loss": 0.4648, "step": 520 }, { "epoch": 2.054263565891473, "grad_norm": 1.240867023517709, "learning_rate": 5e-06, "loss": 0.4224, "step": 530 }, { "epoch": 2.0930232558139537, "grad_norm": 1.3414112700662149, "learning_rate": 5e-06, "loss": 0.4193, "step": 540 }, { "epoch": 2.1317829457364343, "grad_norm": 1.3756538218381091, "learning_rate": 5e-06, "loss": 0.4206, "step": 550 }, { "epoch": 2.1705426356589146, "grad_norm": 1.1988023419162364, "learning_rate": 5e-06, "loss": 0.4268, "step": 560 }, { "epoch": 2.2093023255813953, "grad_norm": 1.4882384822811707, "learning_rate": 5e-06, "loss": 0.4256, "step": 570 }, { "epoch": 2.248062015503876, "grad_norm": 1.3650791004947405, "learning_rate": 5e-06, "loss": 0.4277, "step": 580 }, { "epoch": 2.2868217054263567, "grad_norm": 1.2416611465338827, "learning_rate": 5e-06, "loss": 0.4302, "step": 590 }, { "epoch": 2.3255813953488373, "grad_norm": 1.5098800156810854, "learning_rate": 5e-06, "loss": 0.4287, "step": 600 }, { "epoch": 2.3643410852713176, "grad_norm": 1.6277010120108575, "learning_rate": 5e-06, "loss": 0.4333, "step": 610 }, { "epoch": 2.4031007751937983, "grad_norm": 1.3823219785892285, "learning_rate": 5e-06, "loss": 0.4335, "step": 620 }, { "epoch": 2.441860465116279, "grad_norm": 1.262448760242348, "learning_rate": 5e-06, "loss": 0.4329, "step": 630 }, { "epoch": 2.4806201550387597, "grad_norm": 1.4732939558325473, "learning_rate": 5e-06, "loss": 0.4353, "step": 640 }, { "epoch": 2.5193798449612403, "grad_norm": 2.7196278212283906, "learning_rate": 5e-06, "loss": 0.434, "step": 650 }, { "epoch": 2.558139534883721, "grad_norm": 2.146043169743785, "learning_rate": 5e-06, "loss": 0.4375, "step": 660 }, { "epoch": 2.5968992248062017, "grad_norm": 2.2197008538350613, "learning_rate": 5e-06, "loss": 0.4381, "step": 670 }, { "epoch": 2.6356589147286824, "grad_norm": 2.0029057458346236, "learning_rate": 5e-06, "loss": 0.4307, "step": 680 }, { "epoch": 2.6744186046511627, "grad_norm": 1.9016342995088265, "learning_rate": 5e-06, "loss": 0.433, "step": 690 }, { "epoch": 2.7131782945736433, "grad_norm": 1.7743717503501029, "learning_rate": 5e-06, "loss": 0.4323, "step": 700 }, { "epoch": 2.751937984496124, "grad_norm": 1.578135945766867, "learning_rate": 5e-06, "loss": 0.4301, "step": 710 }, { "epoch": 2.7906976744186047, "grad_norm": 1.7449167163303743, "learning_rate": 5e-06, "loss": 0.4354, "step": 720 }, { "epoch": 2.8294573643410854, "grad_norm": 1.6551665508907096, "learning_rate": 5e-06, "loss": 0.4366, "step": 730 }, { "epoch": 2.8682170542635657, "grad_norm": 1.5568238591625354, "learning_rate": 5e-06, "loss": 0.437, "step": 740 }, { "epoch": 2.9069767441860463, "grad_norm": 1.7009594737316032, "learning_rate": 5e-06, "loss": 0.4349, "step": 750 }, { "epoch": 2.945736434108527, "grad_norm": 1.4680051710135147, "learning_rate": 5e-06, "loss": 0.4369, "step": 760 }, { "epoch": 2.9844961240310077, "grad_norm": 2.462482590739916, "learning_rate": 5e-06, "loss": 0.4377, "step": 770 }, { "epoch": 3.0, "step": 774, "total_flos": 2591282618695680.0, "train_loss": 0.5006180582736506, "train_runtime": 11465.4014, "train_samples_per_second": 69.07, "train_steps_per_second": 0.068 } ], "logging_steps": 10, "max_steps": 774, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2591282618695680.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }