{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9785407725321889, "eval_steps": 300, "global_step": 116, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08583690987124463, "grad_norm": 1.0348295145965056, "learning_rate": 2.0833333333333336e-05, "loss": 0.63, "mean_token_accuracy": 0.8028752787393729, "step": 5 }, { "epoch": 0.17167381974248927, "grad_norm": 0.7946003237280415, "learning_rate": 4.166666666666667e-05, "loss": 0.6382, "mean_token_accuracy": 0.7991655463152572, "step": 10 }, { "epoch": 0.2575107296137339, "grad_norm": 0.7838232140655009, "learning_rate": 4.989741394042727e-05, "loss": 0.6105, "mean_token_accuracy": 0.8056573072191984, "step": 15 }, { "epoch": 0.34334763948497854, "grad_norm": 0.6462660097267386, "learning_rate": 4.92735454356513e-05, "loss": 0.606, "mean_token_accuracy": 0.8061553724053724, "step": 20 }, { "epoch": 0.4291845493562232, "grad_norm": 0.5037099303179456, "learning_rate": 4.8096988312782174e-05, "loss": 0.6115, "mean_token_accuracy": 0.8043439717150109, "step": 25 }, { "epoch": 0.5150214592274678, "grad_norm": 0.6195545306913001, "learning_rate": 4.639453180753619e-05, "loss": 0.5917, "mean_token_accuracy": 0.8100629578754578, "step": 30 }, { "epoch": 0.6008583690987125, "grad_norm": 0.5391133807181806, "learning_rate": 4.420493945100702e-05, "loss": 0.5984, "mean_token_accuracy": 0.8075732600732601, "step": 35 }, { "epoch": 0.6866952789699571, "grad_norm": 0.4989577694128677, "learning_rate": 4.157806645601988e-05, "loss": 0.5906, "mean_token_accuracy": 0.8098496642246642, "step": 40 }, { "epoch": 0.7725321888412017, "grad_norm": 0.44652843585336766, "learning_rate": 3.857372455503697e-05, "loss": 0.5855, "mean_token_accuracy": 0.8110664682539681, "step": 45 }, { "epoch": 0.8583690987124464, "grad_norm": 0.440623988108488, "learning_rate": 3.526032013631893e-05, "loss": 0.5825, "mean_token_accuracy": 0.8121346519902609, "step": 50 }, { "epoch": 0.944206008583691, "grad_norm": 0.4801584677439355, "learning_rate": 3.1713296686859426e-05, "loss": 0.5732, "mean_token_accuracy": 0.8148958333333332, "step": 55 }, { "epoch": 1.0171673819742488, "grad_norm": 0.8143816005601359, "learning_rate": 2.8013417006383076e-05, "loss": 0.54, "mean_token_accuracy": 0.8251169871573535, "step": 60 }, { "epoch": 1.1030042918454936, "grad_norm": 0.6905303409845385, "learning_rate": 2.4244924304977785e-05, "loss": 0.4399, "mean_token_accuracy": 0.8520162575203752, "step": 65 }, { "epoch": 1.1888412017167382, "grad_norm": 0.5001697685249674, "learning_rate": 2.0493624054652357e-05, "loss": 0.4337, "mean_token_accuracy": 0.8533382229436546, "step": 70 }, { "epoch": 1.2746781115879828, "grad_norm": 0.4316206882931221, "learning_rate": 1.6844930269478274e-05, "loss": 0.4279, "mean_token_accuracy": 0.8551797161172161, "step": 75 }, { "epoch": 1.3605150214592274, "grad_norm": 0.38409385416618463, "learning_rate": 1.3381920698905787e-05, "loss": 0.4156, "mean_token_accuracy": 0.8587591575091574, "step": 80 }, { "epoch": 1.4463519313304722, "grad_norm": 0.39576280367792716, "learning_rate": 1.0183445215899584e-05, "loss": 0.4236, "mean_token_accuracy": 0.8561346275221495, "step": 85 }, { "epoch": 1.5321888412017168, "grad_norm": 0.3840880238814308, "learning_rate": 7.3223304703363135e-06, "loss": 0.4158, "mean_token_accuracy": 0.859030630897102, "step": 90 }, { "epoch": 1.6180257510729614, "grad_norm": 0.33692841072311475, "learning_rate": 4.86372168622635e-06, "loss": 0.4231, "mean_token_accuracy": 0.8563792442426716, "step": 95 }, { "epoch": 1.703862660944206, "grad_norm": 0.32222211768399056, "learning_rate": 2.8635993586697553e-06, "loss": 0.4168, "mean_token_accuracy": 0.8585276149893059, "step": 100 }, { "epoch": 1.7896995708154506, "grad_norm": 0.3378812679021034, "learning_rate": 1.3675046241339918e-06, "loss": 0.402, "mean_token_accuracy": 0.8626781898656898, "step": 105 }, { "epoch": 1.8755364806866952, "grad_norm": 0.3155391223091413, "learning_rate": 4.095023263214121e-07, "loss": 0.4084, "mean_token_accuracy": 0.8610657051282053, "step": 110 }, { "epoch": 1.9613733905579398, "grad_norm": 0.30286260681832744, "learning_rate": 1.1405387761664887e-08, "loss": 0.4117, "mean_token_accuracy": 0.8597779304029303, "step": 115 }, { "epoch": 1.9785407725321889, "mean_token_accuracy": 0.8636809371184371, "step": 116, "total_flos": 120709586092032.0, "train_loss": 0.5114190966404718, "train_runtime": 2623.0811, "train_samples_per_second": 5.684, "train_steps_per_second": 0.044 } ], "logging_steps": 5, "max_steps": 116, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 120709586092032.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }