{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.07836990595611286, "eval_steps": 9, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007836990595611285, "eval_loss": 1.8189666271209717, "eval_runtime": 154.0325, "eval_samples_per_second": 13.952, "eval_steps_per_second": 1.746, "step": 1 }, { "epoch": 0.0023510971786833857, "grad_norm": 0.19960135221481323, "learning_rate": 3e-05, "loss": 1.6224, "step": 3 }, { "epoch": 0.004702194357366771, "grad_norm": 0.20366176962852478, "learning_rate": 6e-05, "loss": 1.788, "step": 6 }, { "epoch": 0.007053291536050157, "grad_norm": 0.24395181238651276, "learning_rate": 9e-05, "loss": 1.7563, "step": 9 }, { "epoch": 0.007053291536050157, "eval_loss": 1.7944647073745728, "eval_runtime": 155.6578, "eval_samples_per_second": 13.806, "eval_steps_per_second": 1.728, "step": 9 }, { "epoch": 0.009404388714733543, "grad_norm": 0.22205477952957153, "learning_rate": 9.987820251299122e-05, "loss": 1.6841, "step": 12 }, { "epoch": 0.011755485893416929, "grad_norm": 0.2233169674873352, "learning_rate": 9.924038765061042e-05, "loss": 1.7981, "step": 15 }, { "epoch": 0.014106583072100314, "grad_norm": 0.21325641870498657, "learning_rate": 9.806308479691595e-05, "loss": 1.7515, "step": 18 }, { "epoch": 0.014106583072100314, "eval_loss": 1.7263827323913574, "eval_runtime": 155.6956, "eval_samples_per_second": 13.803, "eval_steps_per_second": 1.728, "step": 18 }, { "epoch": 0.016457680250783698, "grad_norm": 0.21644531190395355, "learning_rate": 9.635919272833938e-05, "loss": 1.6503, "step": 21 }, { "epoch": 0.018808777429467086, "grad_norm": 0.1924390196800232, "learning_rate": 9.414737964294636e-05, "loss": 1.5309, "step": 24 }, { "epoch": 0.02115987460815047, "grad_norm": 0.23553432524204254, "learning_rate": 9.145187862775209e-05, "loss": 1.4196, "step": 27 }, { "epoch": 0.02115987460815047, "eval_loss": 1.6870815753936768, "eval_runtime": 155.6773, "eval_samples_per_second": 13.804, "eval_steps_per_second": 1.728, "step": 27 }, { "epoch": 0.023510971786833857, "grad_norm": 0.2909448742866516, "learning_rate": 8.83022221559489e-05, "loss": 1.5234, "step": 30 }, { "epoch": 0.02586206896551724, "grad_norm": 0.2146679311990738, "learning_rate": 8.473291852294987e-05, "loss": 1.4488, "step": 33 }, { "epoch": 0.02821316614420063, "grad_norm": 0.2212241291999817, "learning_rate": 8.07830737662829e-05, "loss": 1.4463, "step": 36 }, { "epoch": 0.02821316614420063, "eval_loss": 1.6592437028884888, "eval_runtime": 155.6643, "eval_samples_per_second": 13.805, "eval_steps_per_second": 1.728, "step": 36 }, { "epoch": 0.030564263322884012, "grad_norm": 0.31677696108818054, "learning_rate": 7.649596321166024e-05, "loss": 1.7469, "step": 39 }, { "epoch": 0.032915360501567396, "grad_norm": 0.23331013321876526, "learning_rate": 7.191855733945387e-05, "loss": 1.6296, "step": 42 }, { "epoch": 0.03526645768025078, "grad_norm": 0.27310678362846375, "learning_rate": 6.710100716628344e-05, "loss": 1.5908, "step": 45 }, { "epoch": 0.03526645768025078, "eval_loss": 1.6446130275726318, "eval_runtime": 155.692, "eval_samples_per_second": 13.803, "eval_steps_per_second": 1.728, "step": 45 }, { "epoch": 0.03761755485893417, "grad_norm": 0.2733047604560852, "learning_rate": 6.209609477998338e-05, "loss": 1.6959, "step": 48 }, { "epoch": 0.039968652037617555, "grad_norm": 0.24993999302387238, "learning_rate": 5.695865504800327e-05, "loss": 1.6182, "step": 51 }, { "epoch": 0.04231974921630094, "grad_norm": 0.25631314516067505, "learning_rate": 5.174497483512506e-05, "loss": 1.5026, "step": 54 }, { "epoch": 0.04231974921630094, "eval_loss": 1.6353027820587158, "eval_runtime": 155.6626, "eval_samples_per_second": 13.806, "eval_steps_per_second": 1.728, "step": 54 }, { "epoch": 0.04467084639498432, "grad_norm": 0.23046250641345978, "learning_rate": 4.6512176312793736e-05, "loss": 1.5225, "step": 57 }, { "epoch": 0.047021943573667714, "grad_norm": 0.2635563910007477, "learning_rate": 4.131759111665349e-05, "loss": 1.6317, "step": 60 }, { "epoch": 0.0493730407523511, "grad_norm": 0.272748738527298, "learning_rate": 3.6218132209150045e-05, "loss": 1.6508, "step": 63 }, { "epoch": 0.0493730407523511, "eval_loss": 1.628738284111023, "eval_runtime": 155.6556, "eval_samples_per_second": 13.806, "eval_steps_per_second": 1.728, "step": 63 }, { "epoch": 0.05172413793103448, "grad_norm": 0.31287166476249695, "learning_rate": 3.12696703292044e-05, "loss": 1.5983, "step": 66 }, { "epoch": 0.054075235109717866, "grad_norm": 0.2865971028804779, "learning_rate": 2.6526421860705473e-05, "loss": 1.5986, "step": 69 }, { "epoch": 0.05642633228840126, "grad_norm": 0.24794569611549377, "learning_rate": 2.2040354826462668e-05, "loss": 1.6311, "step": 72 }, { "epoch": 0.05642633228840126, "eval_loss": 1.624565839767456, "eval_runtime": 155.6863, "eval_samples_per_second": 13.803, "eval_steps_per_second": 1.728, "step": 72 }, { "epoch": 0.05877742946708464, "grad_norm": 0.30881521105766296, "learning_rate": 1.7860619515673033e-05, "loss": 1.5812, "step": 75 }, { "epoch": 0.061128526645768025, "grad_norm": 0.3214222490787506, "learning_rate": 1.4033009983067452e-05, "loss": 1.5031, "step": 78 }, { "epoch": 0.06347962382445141, "grad_norm": 0.9007675647735596, "learning_rate": 1.0599462319663905e-05, "loss": 1.6133, "step": 81 }, { "epoch": 0.06347962382445141, "eval_loss": 1.6222409009933472, "eval_runtime": 155.6887, "eval_samples_per_second": 13.803, "eval_steps_per_second": 1.728, "step": 81 }, { "epoch": 0.06583072100313479, "grad_norm": 0.27317968010902405, "learning_rate": 7.597595192178702e-06, "loss": 1.5434, "step": 84 }, { "epoch": 0.06818181818181818, "grad_norm": 0.3382052481174469, "learning_rate": 5.060297685041659e-06, "loss": 1.5874, "step": 87 }, { "epoch": 0.07053291536050156, "grad_norm": 0.2950480878353119, "learning_rate": 3.0153689607045845e-06, "loss": 1.5417, "step": 90 }, { "epoch": 0.07053291536050156, "eval_loss": 1.6213549375534058, "eval_runtime": 155.6058, "eval_samples_per_second": 13.811, "eval_steps_per_second": 1.729, "step": 90 }, { "epoch": 0.07288401253918496, "grad_norm": 0.27879786491394043, "learning_rate": 1.4852136862001764e-06, "loss": 1.7045, "step": 93 }, { "epoch": 0.07523510971786834, "grad_norm": 0.2570805847644806, "learning_rate": 4.865965629214819e-07, "loss": 1.4772, "step": 96 }, { "epoch": 0.07758620689655173, "grad_norm": 0.23328937590122223, "learning_rate": 3.04586490452119e-08, "loss": 1.6005, "step": 99 }, { "epoch": 0.07758620689655173, "eval_loss": 1.6211639642715454, "eval_runtime": 155.6403, "eval_samples_per_second": 13.807, "eval_steps_per_second": 1.728, "step": 99 } ], "logging_steps": 3, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 9, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.444156864022446e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }