{ "best_metric": 5.173658792045899e-05, "best_model_checkpoint": "res/checkpoint-400", "epoch": 0.5633802816901409, "eval_steps": 200, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 3.3899288177490234, "learning_rate": 5e-06, "loss": 0.6908, "step": 10 }, { "epoch": 0.03, "grad_norm": 1.9440199136734009, "learning_rate": 1e-05, "loss": 0.6992, "step": 20 }, { "epoch": 0.04, "grad_norm": Infinity, "learning_rate": 1.45e-05, "loss": 0.6318, "step": 30 }, { "epoch": 0.06, "grad_norm": 4.202402591705322, "learning_rate": 1.9e-05, "loss": 0.4382, "step": 40 }, { "epoch": 0.07, "grad_norm": 21.559049606323242, "learning_rate": 2.4e-05, "loss": 0.0718, "step": 50 }, { "epoch": 0.08, "grad_norm": 0.11590419709682465, "learning_rate": 2.9e-05, "loss": 0.0055, "step": 60 }, { "epoch": 0.1, "grad_norm": 0.02931913733482361, "learning_rate": 3.35e-05, "loss": 0.0175, "step": 70 }, { "epoch": 0.11, "grad_norm": 31.5627498626709, "learning_rate": 3.85e-05, "loss": 0.159, "step": 80 }, { "epoch": 0.13, "grad_norm": 0.10093926638364792, "learning_rate": 4.35e-05, "loss": 0.0993, "step": 90 }, { "epoch": 0.14, "grad_norm": 0.02363261766731739, "learning_rate": 4.85e-05, "loss": 0.0013, "step": 100 }, { "epoch": 0.15, "grad_norm": 0.018094830214977264, "learning_rate": 4.9426229508196726e-05, "loss": 0.0008, "step": 110 }, { "epoch": 0.17, "grad_norm": 0.010088359005749226, "learning_rate": 4.860655737704918e-05, "loss": 0.0005, "step": 120 }, { "epoch": 0.18, "grad_norm": 0.00800824724137783, "learning_rate": 4.778688524590164e-05, "loss": 0.0004, "step": 130 }, { "epoch": 0.2, "grad_norm": 0.03682546317577362, "learning_rate": 4.704918032786885e-05, "loss": 0.0664, "step": 140 }, { "epoch": 0.21, "grad_norm": 0.006090231705456972, "learning_rate": 4.622950819672132e-05, "loss": 0.0002, "step": 150 }, { "epoch": 0.23, "grad_norm": 0.00360495550557971, "learning_rate": 4.540983606557377e-05, "loss": 0.0002, "step": 160 }, { "epoch": 0.24, "grad_norm": 0.004050845745950937, "learning_rate": 4.459016393442623e-05, "loss": 0.0002, "step": 170 }, { "epoch": 0.25, "grad_norm": 0.0043782140128314495, "learning_rate": 4.377049180327869e-05, "loss": 0.0002, "step": 180 }, { "epoch": 0.27, "grad_norm": 0.004006043076515198, "learning_rate": 4.295081967213115e-05, "loss": 0.0001, "step": 190 }, { "epoch": 0.28, "grad_norm": 0.003426899667829275, "learning_rate": 4.213114754098361e-05, "loss": 0.1068, "step": 200 }, { "epoch": 0.28, "eval_accuracy": 0.9987674609695973, "eval_f1": 0.9987956643918106, "eval_loss": 0.002453433582559228, "eval_precision": 0.9983948635634029, "eval_recall": 0.9991967871485944, "eval_runtime": 335.4821, "eval_samples_per_second": 7.255, "eval_steps_per_second": 0.909, "step": 200 }, { "epoch": 0.3, "grad_norm": 0.015531109645962715, "learning_rate": 4.131147540983607e-05, "loss": 0.062, "step": 210 }, { "epoch": 0.31, "grad_norm": 0.02330106310546398, "learning_rate": 4.049180327868853e-05, "loss": 0.0089, "step": 220 }, { "epoch": 0.32, "grad_norm": 0.007218244019895792, "learning_rate": 3.9672131147540983e-05, "loss": 0.0004, "step": 230 }, { "epoch": 0.34, "grad_norm": 0.008732643909752369, "learning_rate": 3.8852459016393444e-05, "loss": 0.0003, "step": 240 }, { "epoch": 0.35, "grad_norm": 0.007611180655658245, "learning_rate": 3.8032786885245905e-05, "loss": 0.0003, "step": 250 }, { "epoch": 0.37, "grad_norm": 0.004713522270321846, "learning_rate": 3.721311475409836e-05, "loss": 0.0002, "step": 260 }, { "epoch": 0.38, "grad_norm": 0.005065000616014004, "learning_rate": 3.6393442622950826e-05, "loss": 0.0002, "step": 270 }, { "epoch": 0.39, "grad_norm": 0.0033236700110137463, "learning_rate": 3.557377049180328e-05, "loss": 0.0002, "step": 280 }, { "epoch": 0.41, "grad_norm": 0.003254691371694207, "learning_rate": 3.475409836065574e-05, "loss": 0.0001, "step": 290 }, { "epoch": 0.42, "grad_norm": 0.0026583385188132524, "learning_rate": 3.39344262295082e-05, "loss": 0.0001, "step": 300 }, { "epoch": 0.44, "grad_norm": 0.0026992058847099543, "learning_rate": 3.3114754098360655e-05, "loss": 0.0001, "step": 310 }, { "epoch": 0.45, "grad_norm": 0.002891330514103174, "learning_rate": 3.2295081967213116e-05, "loss": 0.0001, "step": 320 }, { "epoch": 0.46, "grad_norm": 0.0022049755789339542, "learning_rate": 3.1475409836065576e-05, "loss": 0.0001, "step": 330 }, { "epoch": 0.48, "grad_norm": 0.002381928265094757, "learning_rate": 3.065573770491804e-05, "loss": 0.0001, "step": 340 }, { "epoch": 0.49, "grad_norm": 0.0018912949599325657, "learning_rate": 2.9836065573770494e-05, "loss": 0.0001, "step": 350 }, { "epoch": 0.51, "grad_norm": 0.00272244680672884, "learning_rate": 2.901639344262295e-05, "loss": 0.0001, "step": 360 }, { "epoch": 0.52, "grad_norm": 0.0019356077536940575, "learning_rate": 2.819672131147541e-05, "loss": 0.0001, "step": 370 }, { "epoch": 0.54, "grad_norm": 0.002229230711236596, "learning_rate": 2.737704918032787e-05, "loss": 0.0001, "step": 380 }, { "epoch": 0.55, "grad_norm": 0.002221511909738183, "learning_rate": 2.6557377049180327e-05, "loss": 0.0001, "step": 390 }, { "epoch": 0.56, "grad_norm": 0.0020540908444672823, "learning_rate": 2.573770491803279e-05, "loss": 0.0001, "step": 400 }, { "epoch": 0.56, "eval_accuracy": 1.0, "eval_f1": 1.0, "eval_loss": 5.173658792045899e-05, "eval_precision": 1.0, "eval_recall": 1.0, "eval_runtime": 333.7573, "eval_samples_per_second": 7.293, "eval_steps_per_second": 0.914, "step": 400 } ], "logging_steps": 10, "max_steps": 710, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 400, "total_flos": 4203850314547200.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }