{ "best_metric": null, "best_model_checkpoint": null, "epoch": 6.153846153846154, "eval_steps": 50, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.15384615384615385, "grad_norm": 25.85894203186035, "learning_rate": 3.0769230769230774e-06, "loss": 1.0987, "step": 50 }, { "epoch": 0.15384615384615385, "eval_loss": 0.7039205431938171, "eval_runtime": 1.7807, "eval_samples_per_second": 64.02, "eval_steps_per_second": 3.369, "step": 50 }, { "epoch": 0.3076923076923077, "grad_norm": 10.326680183410645, "learning_rate": 6.153846153846155e-06, "loss": 0.6388, "step": 100 }, { "epoch": 0.3076923076923077, "eval_loss": 0.6586376428604126, "eval_runtime": 1.795, "eval_samples_per_second": 63.511, "eval_steps_per_second": 3.343, "step": 100 }, { "epoch": 0.46153846153846156, "grad_norm": 2.875251531600952, "learning_rate": 9.230769230769232e-06, "loss": 0.6463, "step": 150 }, { "epoch": 0.46153846153846156, "eval_loss": 0.6574278473854065, "eval_runtime": 1.7356, "eval_samples_per_second": 65.682, "eval_steps_per_second": 3.457, "step": 150 }, { "epoch": 0.6153846153846154, "grad_norm": 2.7418408393859863, "learning_rate": 1.230769230769231e-05, "loss": 0.6721, "step": 200 }, { "epoch": 0.6153846153846154, "eval_loss": 0.6629524827003479, "eval_runtime": 1.7437, "eval_samples_per_second": 65.378, "eval_steps_per_second": 3.441, "step": 200 }, { "epoch": 0.7692307692307693, "grad_norm": 1.8382577896118164, "learning_rate": 1.5384615384615387e-05, "loss": 0.6288, "step": 250 }, { "epoch": 0.7692307692307693, "eval_loss": 0.673968493938446, "eval_runtime": 1.8191, "eval_samples_per_second": 62.667, "eval_steps_per_second": 3.298, "step": 250 }, { "epoch": 0.9230769230769231, "grad_norm": 2.8036742210388184, "learning_rate": 1.8461538461538465e-05, "loss": 0.7276, "step": 300 }, { "epoch": 0.9230769230769231, "eval_loss": 0.6912825107574463, "eval_runtime": 1.7707, "eval_samples_per_second": 64.38, "eval_steps_per_second": 3.388, "step": 300 }, { "epoch": 1.0769230769230769, "grad_norm": 1.469930648803711, "learning_rate": 1.9996395276708856e-05, "loss": 0.5365, "step": 350 }, { "epoch": 1.0769230769230769, "eval_loss": 0.7528238296508789, "eval_runtime": 1.7853, "eval_samples_per_second": 63.856, "eval_steps_per_second": 3.361, "step": 350 }, { "epoch": 1.2307692307692308, "grad_norm": 2.5635602474212646, "learning_rate": 1.9967573081342103e-05, "loss": 0.4279, "step": 400 }, { "epoch": 1.2307692307692308, "eval_loss": 0.7624219655990601, "eval_runtime": 1.8133, "eval_samples_per_second": 62.869, "eval_steps_per_second": 3.309, "step": 400 }, { "epoch": 1.3846153846153846, "grad_norm": 1.6066973209381104, "learning_rate": 1.9910011792459086e-05, "loss": 0.42, "step": 450 }, { "epoch": 1.3846153846153846, "eval_loss": 0.7675647139549255, "eval_runtime": 1.7503, "eval_samples_per_second": 65.132, "eval_steps_per_second": 3.428, "step": 450 }, { "epoch": 1.5384615384615383, "grad_norm": 2.782292604446411, "learning_rate": 1.9823877374156647e-05, "loss": 0.4284, "step": 500 }, { "epoch": 1.5384615384615383, "eval_loss": 0.7663527131080627, "eval_runtime": 1.7731, "eval_samples_per_second": 64.293, "eval_steps_per_second": 3.384, "step": 500 }, { "epoch": 1.6923076923076923, "grad_norm": 1.985113263130188, "learning_rate": 1.9709418174260523e-05, "loss": 0.4615, "step": 550 }, { "epoch": 1.6923076923076923, "eval_loss": 0.7588106989860535, "eval_runtime": 1.743, "eval_samples_per_second": 65.406, "eval_steps_per_second": 3.442, "step": 550 }, { "epoch": 1.8461538461538463, "grad_norm": 3.593240261077881, "learning_rate": 1.9566964208274254e-05, "loss": 0.4493, "step": 600 }, { "epoch": 1.8461538461538463, "eval_loss": 0.7685819864273071, "eval_runtime": 1.7745, "eval_samples_per_second": 64.242, "eval_steps_per_second": 3.381, "step": 600 }, { "epoch": 2.0, "grad_norm": 2.46645450592041, "learning_rate": 1.9396926207859085e-05, "loss": 0.4103, "step": 650 }, { "epoch": 2.0, "eval_loss": 0.7486200928688049, "eval_runtime": 1.7602, "eval_samples_per_second": 64.766, "eval_steps_per_second": 3.409, "step": 650 }, { "epoch": 2.1538461538461537, "grad_norm": 2.113468647003174, "learning_rate": 1.9199794436588244e-05, "loss": 0.2133, "step": 700 }, { "epoch": 2.1538461538461537, "eval_loss": 0.8937916159629822, "eval_runtime": 1.7444, "eval_samples_per_second": 65.35, "eval_steps_per_second": 3.439, "step": 700 }, { "epoch": 2.3076923076923075, "grad_norm": 1.9733953475952148, "learning_rate": 1.8976137276390145e-05, "loss": 0.2181, "step": 750 }, { "epoch": 2.3076923076923075, "eval_loss": 0.8633579611778259, "eval_runtime": 1.7477, "eval_samples_per_second": 65.228, "eval_steps_per_second": 3.433, "step": 750 }, { "epoch": 2.4615384615384617, "grad_norm": 1.7347004413604736, "learning_rate": 1.8726599588756144e-05, "loss": 0.2005, "step": 800 }, { "epoch": 2.4615384615384617, "eval_loss": 0.8777443766593933, "eval_runtime": 1.7381, "eval_samples_per_second": 65.588, "eval_steps_per_second": 3.452, "step": 800 }, { "epoch": 2.6153846153846154, "grad_norm": 1.5702624320983887, "learning_rate": 1.845190085543795e-05, "loss": 0.2157, "step": 850 }, { "epoch": 2.6153846153846154, "eval_loss": 0.8784948587417603, "eval_runtime": 1.7673, "eval_samples_per_second": 64.505, "eval_steps_per_second": 3.395, "step": 850 }, { "epoch": 2.769230769230769, "grad_norm": 2.3403327465057373, "learning_rate": 1.8152833103995443e-05, "loss": 0.2115, "step": 900 }, { "epoch": 2.769230769230769, "eval_loss": 0.8863385915756226, "eval_runtime": 1.7696, "eval_samples_per_second": 64.421, "eval_steps_per_second": 3.391, "step": 900 }, { "epoch": 2.9230769230769234, "grad_norm": 1.9724828004837036, "learning_rate": 1.7830258624176224e-05, "loss": 0.2225, "step": 950 }, { "epoch": 2.9230769230769234, "eval_loss": 0.8796689510345459, "eval_runtime": 1.7943, "eval_samples_per_second": 63.534, "eval_steps_per_second": 3.344, "step": 950 }, { "epoch": 3.076923076923077, "grad_norm": 1.9262828826904297, "learning_rate": 1.7485107481711014e-05, "loss": 0.1693, "step": 1000 }, { "epoch": 3.076923076923077, "eval_loss": 0.9404221177101135, "eval_runtime": 1.8296, "eval_samples_per_second": 62.309, "eval_steps_per_second": 3.279, "step": 1000 }, { "epoch": 3.230769230769231, "grad_norm": 1.1772091388702393, "learning_rate": 1.7118374836693407e-05, "loss": 0.1256, "step": 1050 }, { "epoch": 3.230769230769231, "eval_loss": 0.9360042810440063, "eval_runtime": 1.8904, "eval_samples_per_second": 60.305, "eval_steps_per_second": 3.174, "step": 1050 }, { "epoch": 3.3846153846153846, "grad_norm": 1.739654302597046, "learning_rate": 1.67311180742757e-05, "loss": 0.125, "step": 1100 }, { "epoch": 3.3846153846153846, "eval_loss": 0.9563117623329163, "eval_runtime": 1.7652, "eval_samples_per_second": 64.581, "eval_steps_per_second": 3.399, "step": 1100 }, { "epoch": 3.5384615384615383, "grad_norm": 1.3015375137329102, "learning_rate": 1.6324453755953772e-05, "loss": 0.1281, "step": 1150 }, { "epoch": 3.5384615384615383, "eval_loss": 0.9608204960823059, "eval_runtime": 1.7445, "eval_samples_per_second": 65.347, "eval_steps_per_second": 3.439, "step": 1150 }, { "epoch": 3.6923076923076925, "grad_norm": 2.1856415271759033, "learning_rate": 1.5899554400231233e-05, "loss": 0.1264, "step": 1200 }, { "epoch": 3.6923076923076925, "eval_loss": 0.9677081108093262, "eval_runtime": 1.7748, "eval_samples_per_second": 64.233, "eval_steps_per_second": 3.381, "step": 1200 }, { "epoch": 3.8461538461538463, "grad_norm": 1.8629130125045776, "learning_rate": 1.5457645101945046e-05, "loss": 0.1267, "step": 1250 }, { "epoch": 3.8461538461538463, "eval_loss": 0.9708028435707092, "eval_runtime": 1.7595, "eval_samples_per_second": 64.791, "eval_steps_per_second": 3.41, "step": 1250 }, { "epoch": 4.0, "grad_norm": 2.2338545322418213, "learning_rate": 1.5000000000000002e-05, "loss": 0.1324, "step": 1300 }, { "epoch": 4.0, "eval_loss": 0.9690905809402466, "eval_runtime": 1.7742, "eval_samples_per_second": 64.255, "eval_steps_per_second": 3.382, "step": 1300 }, { "epoch": 4.153846153846154, "grad_norm": 1.040000557899475, "learning_rate": 1.4527938603696376e-05, "loss": 0.0931, "step": 1350 }, { "epoch": 4.153846153846154, "eval_loss": 0.9888765811920166, "eval_runtime": 1.7454, "eval_samples_per_second": 65.315, "eval_steps_per_second": 3.438, "step": 1350 }, { "epoch": 4.3076923076923075, "grad_norm": 0.49024778604507446, "learning_rate": 1.404282198824305e-05, "loss": 0.0829, "step": 1400 }, { "epoch": 4.3076923076923075, "eval_loss": 1.0040687322616577, "eval_runtime": 1.798, "eval_samples_per_second": 63.406, "eval_steps_per_second": 3.337, "step": 1400 }, { "epoch": 4.461538461538462, "grad_norm": 1.1068203449249268, "learning_rate": 1.3546048870425356e-05, "loss": 0.0908, "step": 1450 }, { "epoch": 4.461538461538462, "eval_loss": 1.0206270217895508, "eval_runtime": 1.7439, "eval_samples_per_second": 65.37, "eval_steps_per_second": 3.441, "step": 1450 }, { "epoch": 4.615384615384615, "grad_norm": 1.7601001262664795, "learning_rate": 1.303905157574247e-05, "loss": 0.0845, "step": 1500 }, { "epoch": 4.615384615384615, "eval_loss": 1.026615023612976, "eval_runtime": 1.7715, "eval_samples_per_second": 64.352, "eval_steps_per_second": 3.387, "step": 1500 }, { "epoch": 4.769230769230769, "grad_norm": 1.9115608930587769, "learning_rate": 1.2523291908642219e-05, "loss": 0.087, "step": 1550 }, { "epoch": 4.769230769230769, "eval_loss": 1.03285551071167, "eval_runtime": 1.7503, "eval_samples_per_second": 65.13, "eval_steps_per_second": 3.428, "step": 1550 }, { "epoch": 4.923076923076923, "grad_norm": 0.8985757231712341, "learning_rate": 1.2000256937760446e-05, "loss": 0.0833, "step": 1600 }, { "epoch": 4.923076923076923, "eval_loss": 1.0319923162460327, "eval_runtime": 1.8324, "eval_samples_per_second": 62.213, "eval_steps_per_second": 3.274, "step": 1600 }, { "epoch": 5.076923076923077, "grad_norm": 0.524580717086792, "learning_rate": 1.1471454708317163e-05, "loss": 0.0758, "step": 1650 }, { "epoch": 5.076923076923077, "eval_loss": 1.0695827007293701, "eval_runtime": 1.8442, "eval_samples_per_second": 61.814, "eval_steps_per_second": 3.253, "step": 1650 }, { "epoch": 5.230769230769231, "grad_norm": 0.7894753813743591, "learning_rate": 1.0938409894031793e-05, "loss": 0.0645, "step": 1700 }, { "epoch": 5.230769230769231, "eval_loss": 1.0913734436035156, "eval_runtime": 1.77, "eval_samples_per_second": 64.406, "eval_steps_per_second": 3.39, "step": 1700 }, { "epoch": 5.384615384615385, "grad_norm": 0.9538184404373169, "learning_rate": 1.0402659401094154e-05, "loss": 0.0586, "step": 1750 }, { "epoch": 5.384615384615385, "eval_loss": 1.0830014944076538, "eval_runtime": 1.7555, "eval_samples_per_second": 64.939, "eval_steps_per_second": 3.418, "step": 1750 }, { "epoch": 5.538461538461538, "grad_norm": 0.7752998471260071, "learning_rate": 9.865747936866027e-06, "loss": 0.0611, "step": 1800 }, { "epoch": 5.538461538461538, "eval_loss": 1.0868384838104248, "eval_runtime": 1.762, "eval_samples_per_second": 64.699, "eval_steps_per_second": 3.405, "step": 1800 }, { "epoch": 5.6923076923076925, "grad_norm": 1.0308915376663208, "learning_rate": 9.329223556089976e-06, "loss": 0.0675, "step": 1850 }, { "epoch": 5.6923076923076925, "eval_loss": 1.0818272829055786, "eval_runtime": 1.7338, "eval_samples_per_second": 65.753, "eval_steps_per_second": 3.461, "step": 1850 }, { "epoch": 5.846153846153846, "grad_norm": 0.47693803906440735, "learning_rate": 8.79463319744677e-06, "loss": 0.0631, "step": 1900 }, { "epoch": 5.846153846153846, "eval_loss": 1.0780550241470337, "eval_runtime": 1.778, "eval_samples_per_second": 64.118, "eval_steps_per_second": 3.375, "step": 1900 }, { "epoch": 6.0, "grad_norm": 0.3759777247905731, "learning_rate": 8.263518223330698e-06, "loss": 0.0609, "step": 1950 }, { "epoch": 6.0, "eval_loss": 1.0708911418914795, "eval_runtime": 1.7659, "eval_samples_per_second": 64.557, "eval_steps_per_second": 3.398, "step": 1950 }, { "epoch": 6.153846153846154, "grad_norm": 0.34113025665283203, "learning_rate": 7.73740997570278e-06, "loss": 0.0473, "step": 2000 }, { "epoch": 6.153846153846154, "eval_loss": 1.1129051446914673, "eval_runtime": 1.7485, "eval_samples_per_second": 65.199, "eval_steps_per_second": 3.432, "step": 2000 } ], "logging_steps": 50, "max_steps": 3250, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 1000, "total_flos": 1.223215769756631e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }