{ "best_metric": null, "best_model_checkpoint": null, "epoch": 30.0, "eval_steps": 500, "global_step": 24210, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "grad_norm": 5.218414306640625, "learning_rate": 9.668731928954978e-06, "loss": 1.2041, "step": 807 }, { "epoch": 1.0, "eval_accuracy": 0.3661971688270569, "eval_loss": 1.1999133825302124, "eval_runtime": 16.8219, "eval_samples_per_second": 4.221, "eval_steps_per_second": 4.221, "step": 807 }, { "epoch": 2.0, "grad_norm": 14.095768928527832, "learning_rate": 9.335398595621644e-06, "loss": 1.067, "step": 1614 }, { "epoch": 2.0, "eval_accuracy": 0.4507042169570923, "eval_loss": 1.4613122940063477, "eval_runtime": 16.9697, "eval_samples_per_second": 4.184, "eval_steps_per_second": 4.184, "step": 1614 }, { "epoch": 3.0, "grad_norm": 8.218274116516113, "learning_rate": 9.002478314745973e-06, "loss": 1.1007, "step": 2421 }, { "epoch": 3.0, "eval_accuracy": 0.4084506928920746, "eval_loss": 1.5213311910629272, "eval_runtime": 16.8544, "eval_samples_per_second": 4.213, "eval_steps_per_second": 4.213, "step": 2421 }, { "epoch": 4.0, "grad_norm": 43.27208709716797, "learning_rate": 8.669558033870302e-06, "loss": 1.1945, "step": 3228 }, { "epoch": 4.0, "eval_accuracy": 0.6056337952613831, "eval_loss": 1.664249062538147, "eval_runtime": 16.9603, "eval_samples_per_second": 4.186, "eval_steps_per_second": 4.186, "step": 3228 }, { "epoch": 5.0, "grad_norm": 13.959416389465332, "learning_rate": 8.336637752994631e-06, "loss": 1.3665, "step": 4035 }, { "epoch": 5.0, "eval_accuracy": 0.43661972880363464, "eval_loss": 2.9907848834991455, "eval_runtime": 16.9865, "eval_samples_per_second": 4.18, "eval_steps_per_second": 4.18, "step": 4035 }, { "epoch": 6.0, "grad_norm": 2.819016456604004, "learning_rate": 8.003304419661297e-06, "loss": 1.4506, "step": 4842 }, { "epoch": 6.0, "eval_accuracy": 0.6056337952613831, "eval_loss": 1.9229768514633179, "eval_runtime": 17.0241, "eval_samples_per_second": 4.171, "eval_steps_per_second": 4.171, "step": 4842 }, { "epoch": 7.0, "grad_norm": 0.291822224855423, "learning_rate": 7.669971086327965e-06, "loss": 1.495, "step": 5649 }, { "epoch": 7.0, "eval_accuracy": 0.6760563254356384, "eval_loss": 1.6813377141952515, "eval_runtime": 17.1264, "eval_samples_per_second": 4.146, "eval_steps_per_second": 4.146, "step": 5649 }, { "epoch": 8.0, "grad_norm": 61.62895584106445, "learning_rate": 7.336637752994631e-06, "loss": 1.2605, "step": 6456 }, { "epoch": 8.0, "eval_accuracy": 0.6619718074798584, "eval_loss": 1.893676519393921, "eval_runtime": 17.0263, "eval_samples_per_second": 4.17, "eval_steps_per_second": 4.17, "step": 6456 }, { "epoch": 9.0, "grad_norm": 137.62139892578125, "learning_rate": 7.003717472118959e-06, "loss": 1.2713, "step": 7263 }, { "epoch": 9.0, "eval_accuracy": 0.6901408433914185, "eval_loss": 1.6283684968948364, "eval_runtime": 17.106, "eval_samples_per_second": 4.151, "eval_steps_per_second": 4.151, "step": 7263 }, { "epoch": 10.0, "grad_norm": 0.8416326642036438, "learning_rate": 6.670384138785626e-06, "loss": 1.2608, "step": 8070 }, { "epoch": 10.0, "eval_accuracy": 0.6478873491287231, "eval_loss": 1.9437721967697144, "eval_runtime": 16.9644, "eval_samples_per_second": 4.185, "eval_steps_per_second": 4.185, "step": 8070 }, { "epoch": 11.0, "grad_norm": 29.88570785522461, "learning_rate": 6.337050805452293e-06, "loss": 1.2068, "step": 8877 }, { "epoch": 11.0, "eval_accuracy": 0.7183098793029785, "eval_loss": 1.5236804485321045, "eval_runtime": 17.1219, "eval_samples_per_second": 4.147, "eval_steps_per_second": 4.147, "step": 8877 }, { "epoch": 12.0, "grad_norm": 0.15335923433303833, "learning_rate": 6.004130524576621e-06, "loss": 1.0478, "step": 9684 }, { "epoch": 12.0, "eval_accuracy": 0.6338028311729431, "eval_loss": 2.000718832015991, "eval_runtime": 16.9949, "eval_samples_per_second": 4.178, "eval_steps_per_second": 4.178, "step": 9684 }, { "epoch": 13.0, "grad_norm": 0.08654139935970306, "learning_rate": 5.670797191243288e-06, "loss": 1.1282, "step": 10491 }, { "epoch": 13.0, "eval_accuracy": 0.7464788556098938, "eval_loss": 1.5307363271713257, "eval_runtime": 17.1388, "eval_samples_per_second": 4.143, "eval_steps_per_second": 4.143, "step": 10491 }, { "epoch": 14.0, "grad_norm": 598.8184204101562, "learning_rate": 5.337463857909955e-06, "loss": 0.9433, "step": 11298 }, { "epoch": 14.0, "eval_accuracy": 0.6478873491287231, "eval_loss": 2.0042455196380615, "eval_runtime": 17.1036, "eval_samples_per_second": 4.151, "eval_steps_per_second": 4.151, "step": 11298 }, { "epoch": 15.0, "grad_norm": 201.67433166503906, "learning_rate": 5.004130524576621e-06, "loss": 0.9574, "step": 12105 }, { "epoch": 15.0, "eval_accuracy": 0.6338028311729431, "eval_loss": 2.198476791381836, "eval_runtime": 17.0314, "eval_samples_per_second": 4.169, "eval_steps_per_second": 4.169, "step": 12105 }, { "epoch": 16.0, "grad_norm": 0.1339723765850067, "learning_rate": 4.670797191243288e-06, "loss": 0.8737, "step": 12912 }, { "epoch": 16.0, "eval_accuracy": 0.6478873491287231, "eval_loss": 2.156816005706787, "eval_runtime": 17.0458, "eval_samples_per_second": 4.165, "eval_steps_per_second": 4.165, "step": 12912 }, { "epoch": 17.0, "grad_norm": 2.1163623332977295, "learning_rate": 4.338289962825279e-06, "loss": 0.8937, "step": 13719 }, { "epoch": 17.0, "eval_accuracy": 0.6197183132171631, "eval_loss": 2.2980363368988037, "eval_runtime": 16.9937, "eval_samples_per_second": 4.178, "eval_steps_per_second": 4.178, "step": 13719 }, { "epoch": 18.0, "grad_norm": 542.3316040039062, "learning_rate": 4.005369681949608e-06, "loss": 0.8681, "step": 14526 }, { "epoch": 18.0, "eval_accuracy": 0.6197183132171631, "eval_loss": 2.3267853260040283, "eval_runtime": 16.9917, "eval_samples_per_second": 4.179, "eval_steps_per_second": 4.179, "step": 14526 }, { "epoch": 19.0, "grad_norm": 0.029439380392432213, "learning_rate": 3.672036348616275e-06, "loss": 0.8005, "step": 15333 }, { "epoch": 19.0, "eval_accuracy": 0.6478873491287231, "eval_loss": 2.4827253818511963, "eval_runtime": 17.024, "eval_samples_per_second": 4.171, "eval_steps_per_second": 4.171, "step": 15333 }, { "epoch": 20.0, "grad_norm": 0.3387238681316376, "learning_rate": 3.3387030152829415e-06, "loss": 0.8176, "step": 16140 }, { "epoch": 20.0, "eval_accuracy": 0.6338028311729431, "eval_loss": 2.4842429161071777, "eval_runtime": 16.9608, "eval_samples_per_second": 4.186, "eval_steps_per_second": 4.186, "step": 16140 }, { "epoch": 21.0, "grad_norm": 0.11516053974628448, "learning_rate": 3.0053696819496083e-06, "loss": 0.8133, "step": 16947 }, { "epoch": 21.0, "eval_accuracy": 0.6760563254356384, "eval_loss": 2.061955213546753, "eval_runtime": 17.156, "eval_samples_per_second": 4.138, "eval_steps_per_second": 4.138, "step": 16947 }, { "epoch": 22.0, "grad_norm": 0.07013405114412308, "learning_rate": 2.672036348616274e-06, "loss": 0.7404, "step": 17754 }, { "epoch": 22.0, "eval_accuracy": 0.6478873491287231, "eval_loss": 2.414825916290283, "eval_runtime": 17.2261, "eval_samples_per_second": 4.122, "eval_steps_per_second": 4.122, "step": 17754 }, { "epoch": 23.0, "grad_norm": 0.10270074754953384, "learning_rate": 2.3387030152829414e-06, "loss": 0.7134, "step": 18561 }, { "epoch": 23.0, "eval_accuracy": 0.6760563254356384, "eval_loss": 2.338927984237671, "eval_runtime": 17.2024, "eval_samples_per_second": 4.127, "eval_steps_per_second": 4.127, "step": 18561 }, { "epoch": 24.0, "grad_norm": 633.0289916992188, "learning_rate": 2.00578273440727e-06, "loss": 0.6573, "step": 19368 }, { "epoch": 24.0, "eval_accuracy": 0.6197183132171631, "eval_loss": 2.697195529937744, "eval_runtime": 17.2791, "eval_samples_per_second": 4.109, "eval_steps_per_second": 4.109, "step": 19368 }, { "epoch": 25.0, "grad_norm": 0.12489618360996246, "learning_rate": 1.6724494010739365e-06, "loss": 0.6848, "step": 20175 }, { "epoch": 25.0, "eval_accuracy": 0.6760563254356384, "eval_loss": 2.337477922439575, "eval_runtime": 17.079, "eval_samples_per_second": 4.157, "eval_steps_per_second": 4.157, "step": 20175 }, { "epoch": 26.0, "grad_norm": 0.11721812188625336, "learning_rate": 1.3391160677406032e-06, "loss": 0.6161, "step": 20982 }, { "epoch": 26.0, "eval_accuracy": 0.6619718074798584, "eval_loss": 2.479071617126465, "eval_runtime": 17.1585, "eval_samples_per_second": 4.138, "eval_steps_per_second": 4.138, "step": 20982 }, { "epoch": 27.0, "grad_norm": 0.09104170650243759, "learning_rate": 1.0057827344072698e-06, "loss": 0.6301, "step": 21789 }, { "epoch": 27.0, "eval_accuracy": 0.6478873491287231, "eval_loss": 2.3807287216186523, "eval_runtime": 17.3153, "eval_samples_per_second": 4.1, "eval_steps_per_second": 4.1, "step": 21789 }, { "epoch": 28.0, "grad_norm": 0.048452723771333694, "learning_rate": 6.724494010739364e-07, "loss": 0.5758, "step": 22596 }, { "epoch": 28.0, "eval_accuracy": 0.6901408433914185, "eval_loss": 2.224256753921509, "eval_runtime": 17.1054, "eval_samples_per_second": 4.151, "eval_steps_per_second": 4.151, "step": 22596 }, { "epoch": 29.0, "grad_norm": 79.71731567382812, "learning_rate": 3.395291201982652e-07, "loss": 0.5598, "step": 23403 }, { "epoch": 29.0, "eval_accuracy": 0.6619718074798584, "eval_loss": 2.41304874420166, "eval_runtime": 17.2038, "eval_samples_per_second": 4.127, "eval_steps_per_second": 4.127, "step": 23403 }, { "epoch": 30.0, "grad_norm": 0.05571676418185234, "learning_rate": 7.021891780256092e-09, "loss": 0.6066, "step": 24210 }, { "epoch": 30.0, "eval_accuracy": 0.6619718074798584, "eval_loss": 2.4623985290527344, "eval_runtime": 17.2965, "eval_samples_per_second": 4.105, "eval_steps_per_second": 4.105, "step": 24210 }, { "epoch": 30.0, "step": 24210, "total_flos": 6.696180305029608e+18, "train_loss": 0.9601948892119895, "train_runtime": 9628.1801, "train_samples_per_second": 2.514, "train_steps_per_second": 2.514 } ], "logging_steps": 12, "max_steps": 24210, "num_input_tokens_seen": 0, "num_train_epochs": 30, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.696180305029608e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }