{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 50, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05, "grad_norm": 7.943530082702637, "learning_rate": 1.0000000000000002e-06, "loss": 1.8114, "step": 50 }, { "epoch": 0.05, "eval_loss": 1.1615628004074097, "eval_runtime": 3.153, "eval_samples_per_second": 49.476, "eval_steps_per_second": 2.537, "step": 50 }, { "epoch": 0.1, "grad_norm": 5.800487518310547, "learning_rate": 2.0000000000000003e-06, "loss": 0.9446, "step": 100 }, { "epoch": 0.1, "eval_loss": 0.8041068315505981, "eval_runtime": 3.0734, "eval_samples_per_second": 50.759, "eval_steps_per_second": 2.603, "step": 100 }, { "epoch": 0.15, "grad_norm": 3.3142590522766113, "learning_rate": 3e-06, "loss": 0.829, "step": 150 }, { "epoch": 0.15, "eval_loss": 0.7686946392059326, "eval_runtime": 3.0993, "eval_samples_per_second": 50.334, "eval_steps_per_second": 2.581, "step": 150 }, { "epoch": 0.2, "grad_norm": 4.635634422302246, "learning_rate": 4.000000000000001e-06, "loss": 0.8045, "step": 200 }, { "epoch": 0.2, "eval_loss": 0.754519522190094, "eval_runtime": 3.0961, "eval_samples_per_second": 50.386, "eval_steps_per_second": 2.584, "step": 200 }, { "epoch": 0.25, "grad_norm": 3.970125913619995, "learning_rate": 5e-06, "loss": 0.7957, "step": 250 }, { "epoch": 0.25, "eval_loss": 0.7530429363250732, "eval_runtime": 3.0913, "eval_samples_per_second": 50.465, "eval_steps_per_second": 2.588, "step": 250 }, { "epoch": 0.3, "grad_norm": 3.469130516052246, "learning_rate": 6e-06, "loss": 0.7833, "step": 300 }, { "epoch": 0.3, "eval_loss": 0.7474463582038879, "eval_runtime": 3.0973, "eval_samples_per_second": 50.366, "eval_steps_per_second": 2.583, "step": 300 }, { "epoch": 0.35, "grad_norm": 2.752779722213745, "learning_rate": 7e-06, "loss": 0.8005, "step": 350 }, { "epoch": 0.35, "eval_loss": 0.7482121586799622, "eval_runtime": 3.0919, "eval_samples_per_second": 50.454, "eval_steps_per_second": 2.587, "step": 350 }, { "epoch": 0.4, "grad_norm": 2.9983832836151123, "learning_rate": 8.000000000000001e-06, "loss": 0.771, "step": 400 }, { "epoch": 0.4, "eval_loss": 0.7438180446624756, "eval_runtime": 3.0859, "eval_samples_per_second": 50.553, "eval_steps_per_second": 2.592, "step": 400 }, { "epoch": 0.45, "grad_norm": 2.574100971221924, "learning_rate": 9e-06, "loss": 0.7449, "step": 450 }, { "epoch": 0.45, "eval_loss": 0.7498941421508789, "eval_runtime": 3.0388, "eval_samples_per_second": 51.336, "eval_steps_per_second": 2.633, "step": 450 }, { "epoch": 0.5, "grad_norm": 2.4326021671295166, "learning_rate": 1e-05, "loss": 0.7533, "step": 500 }, { "epoch": 0.5, "eval_loss": 0.7558138966560364, "eval_runtime": 3.0822, "eval_samples_per_second": 50.614, "eval_steps_per_second": 2.596, "step": 500 }, { "epoch": 0.55, "grad_norm": 2.791947364807129, "learning_rate": 1.1000000000000001e-05, "loss": 0.8062, "step": 550 }, { "epoch": 0.55, "eval_loss": 0.7553198337554932, "eval_runtime": 3.0823, "eval_samples_per_second": 50.611, "eval_steps_per_second": 2.595, "step": 550 }, { "epoch": 0.6, "grad_norm": 2.409931182861328, "learning_rate": 1.2e-05, "loss": 0.8354, "step": 600 }, { "epoch": 0.6, "eval_loss": 0.7591729760169983, "eval_runtime": 3.0925, "eval_samples_per_second": 50.444, "eval_steps_per_second": 2.587, "step": 600 }, { "epoch": 0.65, "grad_norm": 2.654364585876465, "learning_rate": 1.3000000000000001e-05, "loss": 0.8084, "step": 650 }, { "epoch": 0.65, "eval_loss": 0.7605798244476318, "eval_runtime": 3.0938, "eval_samples_per_second": 50.423, "eval_steps_per_second": 2.586, "step": 650 }, { "epoch": 0.7, "grad_norm": 2.8715052604675293, "learning_rate": 1.4e-05, "loss": 0.7464, "step": 700 }, { "epoch": 0.7, "eval_loss": 0.768295168876648, "eval_runtime": 3.0904, "eval_samples_per_second": 50.479, "eval_steps_per_second": 2.589, "step": 700 }, { "epoch": 0.75, "grad_norm": 2.547102451324463, "learning_rate": 1.5000000000000002e-05, "loss": 0.7652, "step": 750 }, { "epoch": 0.75, "eval_loss": 0.7770843505859375, "eval_runtime": 3.0851, "eval_samples_per_second": 50.566, "eval_steps_per_second": 2.593, "step": 750 }, { "epoch": 0.8, "grad_norm": 2.6396355628967285, "learning_rate": 1.6000000000000003e-05, "loss": 0.7807, "step": 800 }, { "epoch": 0.8, "eval_loss": 0.7774565815925598, "eval_runtime": 3.0842, "eval_samples_per_second": 50.581, "eval_steps_per_second": 2.594, "step": 800 }, { "epoch": 0.85, "grad_norm": 2.243042230606079, "learning_rate": 1.7e-05, "loss": 0.8038, "step": 850 }, { "epoch": 0.85, "eval_loss": 0.778019905090332, "eval_runtime": 3.0739, "eval_samples_per_second": 50.75, "eval_steps_per_second": 2.603, "step": 850 }, { "epoch": 0.9, "grad_norm": 2.5405020713806152, "learning_rate": 1.8e-05, "loss": 0.8292, "step": 900 }, { "epoch": 0.9, "eval_loss": 0.7810414433479309, "eval_runtime": 3.0805, "eval_samples_per_second": 50.641, "eval_steps_per_second": 2.597, "step": 900 }, { "epoch": 0.95, "grad_norm": 2.9373252391815186, "learning_rate": 1.9e-05, "loss": 0.8057, "step": 950 }, { "epoch": 0.95, "eval_loss": 0.7795991897583008, "eval_runtime": 3.0813, "eval_samples_per_second": 50.627, "eval_steps_per_second": 2.596, "step": 950 }, { "epoch": 1.0, "grad_norm": 2.0142874717712402, "learning_rate": 2e-05, "loss": 0.8265, "step": 1000 }, { "epoch": 1.0, "eval_loss": 0.7865928411483765, "eval_runtime": 3.0922, "eval_samples_per_second": 50.449, "eval_steps_per_second": 2.587, "step": 1000 }, { "epoch": 1.05, "grad_norm": 2.4737377166748047, "learning_rate": 1.9998476951563914e-05, "loss": 0.567, "step": 1050 }, { "epoch": 1.05, "eval_loss": 0.8242304921150208, "eval_runtime": 3.0914, "eval_samples_per_second": 50.462, "eval_steps_per_second": 2.588, "step": 1050 }, { "epoch": 1.1, "grad_norm": 1.9739576578140259, "learning_rate": 1.999390827019096e-05, "loss": 0.5251, "step": 1100 }, { "epoch": 1.1, "eval_loss": 0.8305041193962097, "eval_runtime": 3.0703, "eval_samples_per_second": 50.81, "eval_steps_per_second": 2.606, "step": 1100 }, { "epoch": 1.15, "grad_norm": 3.079801321029663, "learning_rate": 1.9986295347545738e-05, "loss": 0.5836, "step": 1150 }, { "epoch": 1.15, "eval_loss": 0.8230717778205872, "eval_runtime": 3.0905, "eval_samples_per_second": 50.478, "eval_steps_per_second": 2.589, "step": 1150 }, { "epoch": 1.2, "grad_norm": 2.380758047103882, "learning_rate": 1.9975640502598243e-05, "loss": 0.5404, "step": 1200 }, { "epoch": 1.2, "eval_loss": 0.8192301392555237, "eval_runtime": 3.0886, "eval_samples_per_second": 50.508, "eval_steps_per_second": 2.59, "step": 1200 }, { "epoch": 1.25, "grad_norm": 2.047494649887085, "learning_rate": 1.9961946980917457e-05, "loss": 0.5448, "step": 1250 }, { "epoch": 1.25, "eval_loss": 0.8297872543334961, "eval_runtime": 3.0696, "eval_samples_per_second": 50.821, "eval_steps_per_second": 2.606, "step": 1250 }, { "epoch": 1.3, "grad_norm": 2.4927003383636475, "learning_rate": 1.9945218953682736e-05, "loss": 0.5505, "step": 1300 }, { "epoch": 1.3, "eval_loss": 0.8312389850616455, "eval_runtime": 3.0792, "eval_samples_per_second": 50.662, "eval_steps_per_second": 2.598, "step": 1300 }, { "epoch": 1.35, "grad_norm": 3.409341812133789, "learning_rate": 1.9925461516413224e-05, "loss": 0.5726, "step": 1350 }, { "epoch": 1.35, "eval_loss": 0.8322041630744934, "eval_runtime": 3.0883, "eval_samples_per_second": 50.514, "eval_steps_per_second": 2.59, "step": 1350 }, { "epoch": 1.4, "grad_norm": 1.5658016204833984, "learning_rate": 1.9902680687415704e-05, "loss": 0.5687, "step": 1400 }, { "epoch": 1.4, "eval_loss": 0.8255723118782043, "eval_runtime": 3.0902, "eval_samples_per_second": 50.481, "eval_steps_per_second": 2.589, "step": 1400 }, { "epoch": 1.45, "grad_norm": 2.2273943424224854, "learning_rate": 1.9876883405951378e-05, "loss": 0.5768, "step": 1450 }, { "epoch": 1.45, "eval_loss": 0.8274821639060974, "eval_runtime": 3.0903, "eval_samples_per_second": 50.48, "eval_steps_per_second": 2.589, "step": 1450 }, { "epoch": 1.5, "grad_norm": 1.8873156309127808, "learning_rate": 1.9848077530122083e-05, "loss": 0.5437, "step": 1500 }, { "epoch": 1.5, "eval_loss": 0.8350476026535034, "eval_runtime": 3.0892, "eval_samples_per_second": 50.498, "eval_steps_per_second": 2.59, "step": 1500 }, { "epoch": 1.55, "grad_norm": 1.8373135328292847, "learning_rate": 1.9816271834476642e-05, "loss": 0.569, "step": 1550 }, { "epoch": 1.55, "eval_loss": 0.8110507726669312, "eval_runtime": 3.096, "eval_samples_per_second": 50.388, "eval_steps_per_second": 2.584, "step": 1550 }, { "epoch": 1.6, "grad_norm": 2.555246591567993, "learning_rate": 1.9781476007338058e-05, "loss": 0.5672, "step": 1600 }, { "epoch": 1.6, "eval_loss": 0.8229615688323975, "eval_runtime": 3.1033, "eval_samples_per_second": 50.269, "eval_steps_per_second": 2.578, "step": 1600 }, { "epoch": 1.65, "grad_norm": 2.032984972000122, "learning_rate": 1.9743700647852356e-05, "loss": 0.5709, "step": 1650 }, { "epoch": 1.65, "eval_loss": 0.8179122805595398, "eval_runtime": 3.0703, "eval_samples_per_second": 50.81, "eval_steps_per_second": 2.606, "step": 1650 }, { "epoch": 1.7, "grad_norm": 2.0338752269744873, "learning_rate": 1.9702957262759964e-05, "loss": 0.5945, "step": 1700 }, { "epoch": 1.7, "eval_loss": 0.8166740536689758, "eval_runtime": 3.0576, "eval_samples_per_second": 51.021, "eval_steps_per_second": 2.616, "step": 1700 }, { "epoch": 1.75, "grad_norm": 2.7427613735198975, "learning_rate": 1.9659258262890683e-05, "loss": 0.5756, "step": 1750 }, { "epoch": 1.75, "eval_loss": 0.8103773593902588, "eval_runtime": 3.0935, "eval_samples_per_second": 50.429, "eval_steps_per_second": 2.586, "step": 1750 }, { "epoch": 1.8, "grad_norm": 2.876005172729492, "learning_rate": 1.961261695938319e-05, "loss": 0.5701, "step": 1800 }, { "epoch": 1.8, "eval_loss": 0.8226655125617981, "eval_runtime": 3.099, "eval_samples_per_second": 50.338, "eval_steps_per_second": 2.581, "step": 1800 }, { "epoch": 1.85, "grad_norm": 2.338442087173462, "learning_rate": 1.9563047559630356e-05, "loss": 0.5765, "step": 1850 }, { "epoch": 1.85, "eval_loss": 0.8165319561958313, "eval_runtime": 3.0865, "eval_samples_per_second": 50.543, "eval_steps_per_second": 2.592, "step": 1850 }, { "epoch": 1.9, "grad_norm": 1.9952592849731445, "learning_rate": 1.9510565162951538e-05, "loss": 0.6023, "step": 1900 }, { "epoch": 1.9, "eval_loss": 0.8059903979301453, "eval_runtime": 3.057, "eval_samples_per_second": 51.03, "eval_steps_per_second": 2.617, "step": 1900 }, { "epoch": 1.95, "grad_norm": 2.47471284866333, "learning_rate": 1.945518575599317e-05, "loss": 0.5544, "step": 1950 }, { "epoch": 1.95, "eval_loss": 0.8095174431800842, "eval_runtime": 3.1028, "eval_samples_per_second": 50.277, "eval_steps_per_second": 2.578, "step": 1950 }, { "epoch": 2.0, "grad_norm": 2.391505002975464, "learning_rate": 1.9396926207859085e-05, "loss": 0.5932, "step": 2000 }, { "epoch": 2.0, "eval_loss": 0.8160108923912048, "eval_runtime": 3.0868, "eval_samples_per_second": 50.537, "eval_steps_per_second": 2.592, "step": 2000 } ], "logging_steps": 50, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 400, "total_flos": 1.140965468316631e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }