{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.975951903807615, "eval_steps": 500, "global_step": 124, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.16032064128256512, "grad_norm": 96.1263392680429, "learning_rate": 7.692307692307693e-05, "loss": 1.184, "step": 5 }, { "epoch": 0.32064128256513025, "grad_norm": 109.03932941785152, "learning_rate": 0.00015384615384615385, "loss": 5.0239, "step": 10 }, { "epoch": 0.48096192384769537, "grad_norm": 57.030962020859526, "learning_rate": 0.00019983983492623833, "loss": 4.0648, "step": 15 }, { "epoch": 0.6412825651302605, "grad_norm": 22.869725355526718, "learning_rate": 0.0001980438647961327, "loss": 2.8094, "step": 20 }, { "epoch": 0.8016032064128257, "grad_norm": 6.985773379251052, "learning_rate": 0.00019428774454610843, "loss": 2.2094, "step": 25 }, { "epoch": 0.9619238476953907, "grad_norm": 5.841783328387945, "learning_rate": 0.00018864656872260985, "loss": 1.3499, "step": 30 }, { "epoch": 1.122244488977956, "grad_norm": 4.328203171108087, "learning_rate": 0.0001812331190023886, "loss": 1.2527, "step": 35 }, { "epoch": 1.282565130260521, "grad_norm": 1.8271946123422904, "learning_rate": 0.00017219560939545246, "loss": 0.9903, "step": 40 }, { "epoch": 1.4428857715430863, "grad_norm": 1.3376797882778457, "learning_rate": 0.00016171472306414554, "loss": 0.8082, "step": 45 }, { "epoch": 1.6032064128256514, "grad_norm": 0.8625176200253305, "learning_rate": 0.00015000000000000001, "loss": 0.7515, "step": 50 }, { "epoch": 1.7635270541082164, "grad_norm": 0.9297452179429324, "learning_rate": 0.00013728564777803088, "loss": 0.6321, "step": 55 }, { "epoch": 1.9238476953907817, "grad_norm": 1.1609492766676697, "learning_rate": 0.0001238258591423165, "loss": 0.5802, "step": 60 }, { "epoch": 2.0841683366733466, "grad_norm": 0.6088051647121085, "learning_rate": 0.00010988973003642499, "loss": 0.4983, "step": 65 }, { "epoch": 2.244488977955912, "grad_norm": 0.45623167145983035, "learning_rate": 9.57558796803852e-05, "loss": 0.4119, "step": 70 }, { "epoch": 2.404809619238477, "grad_norm": 0.43383089288281307, "learning_rate": 8.170688025276134e-05, "loss": 0.3915, "step": 75 }, { "epoch": 2.565130260521042, "grad_norm": 0.44731640463982364, "learning_rate": 6.802360754287547e-05, "loss": 0.3917, "step": 80 }, { "epoch": 2.7254509018036073, "grad_norm": 0.4205909891049952, "learning_rate": 5.497962551823266e-05, "loss": 0.3886, "step": 85 }, { "epoch": 2.8857715430861726, "grad_norm": 0.4168469096572953, "learning_rate": 4.283571707415214e-05, "loss": 0.3689, "step": 90 }, { "epoch": 3.0460921843687374, "grad_norm": 0.6798047385077177, "learning_rate": 3.1834670310046734e-05, "loss": 0.3212, "step": 95 }, { "epoch": 3.2064128256513027, "grad_norm": 0.3963204162500924, "learning_rate": 2.2196424568156073e-05, "loss": 0.1699, "step": 100 }, { "epoch": 3.3667334669338675, "grad_norm": 0.38329008287670924, "learning_rate": 1.4113673277957395e-05, "loss": 0.1585, "step": 105 }, { "epoch": 3.527054108216433, "grad_norm": 0.34785356511529864, "learning_rate": 7.74801151675314e-06, "loss": 0.1499, "step": 110 }, { "epoch": 3.687374749498998, "grad_norm": 0.3409235398527789, "learning_rate": 3.226705306650113e-06, "loss": 0.1438, "step": 115 }, { "epoch": 3.847695390781563, "grad_norm": 0.365385237116056, "learning_rate": 6.401472380297091e-07, "loss": 0.1525, "step": 120 }, { "epoch": 3.975951903807615, "step": 124, "total_flos": 135489736671232.0, "train_loss": 1.0211431388893435, "train_runtime": 8462.237, "train_samples_per_second": 3.773, "train_steps_per_second": 0.015 } ], "logging_steps": 5, "max_steps": 124, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 135489736671232.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }