{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2, "eval_steps": 2000, "global_step": 4000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 0.9697519540786743, "learning_rate": 1e-06, "loss": 0.1629, "step": 100 }, { "epoch": 0.01, "grad_norm": 1.3848124742507935, "learning_rate": 9.898989898989898e-07, "loss": 0.14, "step": 200 }, { "epoch": 0.01, "grad_norm": 0.9986572265625, "learning_rate": 9.797979797979797e-07, "loss": 0.1354, "step": 300 }, { "epoch": 0.02, "grad_norm": 0.11438798904418945, "learning_rate": 9.696969696969698e-07, "loss": 0.1182, "step": 400 }, { "epoch": 0.03, "grad_norm": 0.8548241257667542, "learning_rate": 9.595959595959596e-07, "loss": 0.1192, "step": 500 }, { "epoch": 0.03, "grad_norm": 1.5312464237213135, "learning_rate": 9.494949494949495e-07, "loss": 0.0997, "step": 600 }, { "epoch": 0.04, "grad_norm": 0.9692059755325317, "learning_rate": 9.393939393939395e-07, "loss": 0.102, "step": 700 }, { "epoch": 0.04, "grad_norm": 0.42864611744880676, "learning_rate": 9.292929292929292e-07, "loss": 0.0901, "step": 800 }, { "epoch": 0.04, "grad_norm": 0.852543830871582, "learning_rate": 9.191919191919192e-07, "loss": 0.0937, "step": 900 }, { "epoch": 0.05, "grad_norm": 0.5718303322792053, "learning_rate": 9.09090909090909e-07, "loss": 0.093, "step": 1000 }, { "epoch": 0.06, "grad_norm": 0.9396565556526184, "learning_rate": 8.98989898989899e-07, "loss": 0.0892, "step": 1100 }, { "epoch": 0.06, "grad_norm": 0.08157779276371002, "learning_rate": 8.888888888888888e-07, "loss": 0.0934, "step": 1200 }, { "epoch": 0.07, "grad_norm": 0.8076322078704834, "learning_rate": 8.787878787878787e-07, "loss": 0.0725, "step": 1300 }, { "epoch": 0.07, "grad_norm": 1.5076119899749756, "learning_rate": 8.686868686868687e-07, "loss": 0.0835, "step": 1400 }, { "epoch": 0.07, "grad_norm": 1.1567238569259644, "learning_rate": 8.585858585858586e-07, "loss": 0.0747, "step": 1500 }, { "epoch": 0.08, "grad_norm": 0.6817927956581116, "learning_rate": 8.484848484848484e-07, "loss": 0.0903, "step": 1600 }, { "epoch": 0.09, "grad_norm": 0.6467050313949585, "learning_rate": 8.383838383838383e-07, "loss": 0.0721, "step": 1700 }, { "epoch": 0.09, "grad_norm": 1.8435570001602173, "learning_rate": 8.282828282828283e-07, "loss": 0.0847, "step": 1800 }, { "epoch": 0.1, "grad_norm": 0.6265794634819031, "learning_rate": 8.181818181818182e-07, "loss": 0.0687, "step": 1900 }, { "epoch": 0.1, "grad_norm": 1.360060453414917, "learning_rate": 8.08080808080808e-07, "loss": 0.0748, "step": 2000 }, { "epoch": 0.1, "eval_loss": 0.06745574623346329, "eval_runtime": 304.7718, "eval_samples_per_second": 3.281, "eval_steps_per_second": 0.82, "step": 2000 }, { "epoch": 0.1, "grad_norm": 0.5074820518493652, "learning_rate": 7.97979797979798e-07, "loss": 0.0699, "step": 2100 }, { "epoch": 0.11, "grad_norm": 1.3634223937988281, "learning_rate": 7.878787878787878e-07, "loss": 0.0682, "step": 2200 }, { "epoch": 0.12, "grad_norm": 3.0763113498687744, "learning_rate": 7.777777777777778e-07, "loss": 0.0786, "step": 2300 }, { "epoch": 0.12, "grad_norm": 1.5437021255493164, "learning_rate": 7.676767676767675e-07, "loss": 0.065, "step": 2400 }, { "epoch": 0.12, "grad_norm": 1.5890172719955444, "learning_rate": 7.575757575757575e-07, "loss": 0.0772, "step": 2500 }, { "epoch": 0.13, "grad_norm": 0.9904383420944214, "learning_rate": 7.474747474747475e-07, "loss": 0.0735, "step": 2600 }, { "epoch": 0.14, "grad_norm": 0.9233341813087463, "learning_rate": 7.373737373737373e-07, "loss": 0.0695, "step": 2700 }, { "epoch": 0.14, "grad_norm": 0.10603007674217224, "learning_rate": 7.272727272727272e-07, "loss": 0.0595, "step": 2800 }, { "epoch": 0.14, "grad_norm": 0.9929779767990112, "learning_rate": 7.171717171717171e-07, "loss": 0.0595, "step": 2900 }, { "epoch": 0.15, "grad_norm": 0.4711184799671173, "learning_rate": 7.07070707070707e-07, "loss": 0.0606, "step": 3000 }, { "epoch": 0.15, "grad_norm": 0.5627967715263367, "learning_rate": 6.96969696969697e-07, "loss": 0.0713, "step": 3100 }, { "epoch": 0.16, "grad_norm": 0.7645086646080017, "learning_rate": 6.868686868686868e-07, "loss": 0.0672, "step": 3200 }, { "epoch": 0.17, "grad_norm": 0.8071433901786804, "learning_rate": 6.767676767676767e-07, "loss": 0.0809, "step": 3300 }, { "epoch": 0.17, "grad_norm": 1.281545877456665, "learning_rate": 6.666666666666666e-07, "loss": 0.0577, "step": 3400 }, { "epoch": 0.17, "grad_norm": 1.15431547164917, "learning_rate": 6.565656565656566e-07, "loss": 0.0613, "step": 3500 }, { "epoch": 0.18, "grad_norm": 0.42522451281547546, "learning_rate": 6.464646464646465e-07, "loss": 0.0531, "step": 3600 }, { "epoch": 0.18, "grad_norm": 1.2078278064727783, "learning_rate": 6.363636363636363e-07, "loss": 0.0692, "step": 3700 }, { "epoch": 0.19, "grad_norm": 2.207512855529785, "learning_rate": 6.262626262626263e-07, "loss": 0.0508, "step": 3800 }, { "epoch": 0.2, "grad_norm": 1.696768045425415, "learning_rate": 6.161616161616161e-07, "loss": 0.0655, "step": 3900 }, { "epoch": 0.2, "grad_norm": 0.6295761466026306, "learning_rate": 6.060606060606061e-07, "loss": 0.0577, "step": 4000 }, { "epoch": 0.2, "eval_loss": 0.057661667466163635, "eval_runtime": 304.4519, "eval_samples_per_second": 3.285, "eval_steps_per_second": 0.821, "step": 4000 } ], "logging_steps": 100, "max_steps": 10000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "total_flos": 3.26411004936192e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }