barc0/induction-10k-50seeds-gpt4omini-llama3.1-8b-instruct-lora64_lr2e-4_epoch3

+---
+base_model: meta-llama/Meta-Llama-3.1-8B-Instruct
+library_name: peft
+license: llama3.1
+tags:
+- trl
+- sft
+- generated_from_trainer
+model-index:
+- name: induction-10k-50seeds-gpt4omini-llama3.1-8b-instruct-lora64_lr2e-4_epoch3
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# induction-10k-50seeds-gpt4omini-llama3.1-8b-instruct-lora64_lr2e-4_epoch3
+This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) on an unknown dataset.
+It achieves the following results on the evaluation set (final evaluation, step 231, epoch 2.9806):
+- Loss: 0.3508
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 0.0002
+- train_batch_size: 8
+- eval_batch_size: 4
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 8
+- gradient_accumulation_steps: 2
+- total_train_batch_size: 128
+- total_eval_batch_size: 32
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_ratio: 0.1
+- num_epochs: 3
+### Training results
+| Training Loss | Epoch  | Step | Validation Loss |
+|:-------------:|:------:|:----:|:---------------:|
+| 0.3733        | 0.9935 | 77   | 0.3744          |
+| 0.33          | 2.0    | 155  | 0.3544          |
+| 0.3079        | 2.9806 | 231  | 0.3508          |
+### Framework versions
+- PEFT 0.13.0
+- Transformers 4.45.0.dev0
+- Pytorch 2.4.0+cu121
+- Datasets 3.0.1
+- Tokenizers 0.19.1

all_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 2.9806451612903224,
+    "total_flos": 9.324729662937498e+16,
+    "train_loss": 0.3951803825118325,
+    "train_runtime": 2997.4381,
+    "train_samples": 9863,
+    "train_samples_per_second": 9.871,
+    "train_steps_per_second": 0.077
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 2.9806451612903224,
+    "total_flos": 9.324729662937498e+16,
+    "train_loss": 0.3951803825118325,
+    "train_runtime": 2997.4381,
+    "train_samples": 9863,
+    "train_samples_per_second": 9.871,
+    "train_steps_per_second": 0.077
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1683 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 2.9806451612903224,
+  "eval_steps": 500,
+  "global_step": 231,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.012903225806451613,
+      "grad_norm": 0.882150089808769,
+      "learning_rate": 8.333333333333334e-06,
+      "loss": 1.3191,
+      "step": 1
+    },
+    {
+      "epoch": 0.025806451612903226,
+      "grad_norm": 0.8369153094823952,
+      "learning_rate": 1.6666666666666667e-05,
+      "loss": 1.249,
+      "step": 2
+    },
+    {
+      "epoch": 0.03870967741935484,
+      "grad_norm": 0.8525103918091212,
+      "learning_rate": 2.5e-05,
+      "loss": 1.2775,
+      "step": 3
+    },
+    {
+      "epoch": 0.05161290322580645,
+      "grad_norm": 0.8113130093304075,
+      "learning_rate": 3.3333333333333335e-05,
+      "loss": 1.2577,
+      "step": 4
+    },
+    {
+      "epoch": 0.06451612903225806,
+      "grad_norm": 0.7691226782403744,
+      "learning_rate": 4.166666666666667e-05,
+      "loss": 1.2275,
+      "step": 5
+    },
+    {
+      "epoch": 0.07741935483870968,
+      "grad_norm": 0.5954210054804412,
+      "learning_rate": 5e-05,
+      "loss": 1.1159,
+      "step": 6
+    },
+    {
+      "epoch": 0.09032258064516129,
+      "grad_norm": 0.48189256930049384,
+      "learning_rate": 5.833333333333334e-05,
+      "loss": 1.0593,
+      "step": 7
+    },
+    {
+      "epoch": 0.1032258064516129,
+      "grad_norm": 0.5241879927945232,
+      "learning_rate": 6.666666666666667e-05,
+      "loss": 1.0031,
+      "step": 8
+    },
+    {
+      "epoch": 0.11612903225806452,
+      "grad_norm": 0.5751865259411146,
+      "learning_rate": 7.500000000000001e-05,
+      "loss": 0.9263,
+      "step": 9
+    },
+    {
+      "epoch": 0.12903225806451613,
+      "grad_norm": 0.5686526755807603,
+      "learning_rate": 8.333333333333334e-05,
+      "loss": 0.8146,
+      "step": 10
+    },
+    {
+      "epoch": 0.14193548387096774,
+      "grad_norm": 0.5156906474251192,
+      "learning_rate": 9.166666666666667e-05,
+      "loss": 0.7583,
+      "step": 11
+    },
+    {
+      "epoch": 0.15483870967741936,
+      "grad_norm": 0.4901634328534619,
+      "learning_rate": 0.0001,
+      "loss": 0.6686,
+      "step": 12
+    },
+    {
+      "epoch": 0.16774193548387098,
+      "grad_norm": 0.376084270046461,
+      "learning_rate": 0.00010833333333333333,
+      "loss": 0.6005,
+      "step": 13
+    },
+    {
+      "epoch": 0.18064516129032257,
+      "grad_norm": 0.2761318809240614,
+      "learning_rate": 0.00011666666666666668,
+      "loss": 0.5741,
+      "step": 14
+    },
+    {
+      "epoch": 0.1935483870967742,
+      "grad_norm": 0.25038763704461725,
+      "learning_rate": 0.000125,
+      "loss": 0.5465,
+      "step": 15
+    },
+    {
+      "epoch": 0.2064516129032258,
+      "grad_norm": 0.2214903977106201,
+      "learning_rate": 0.00013333333333333334,
+      "loss": 0.5138,
+      "step": 16
+    },
+    {
+      "epoch": 0.21935483870967742,
+      "grad_norm": 0.28905541505099525,
+      "learning_rate": 0.00014166666666666668,
+      "loss": 0.5247,
+      "step": 17
+    },
+    {
+      "epoch": 0.23225806451612904,
+      "grad_norm": 0.20699066633757193,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.4978,
+      "step": 18
+    },
+    {
+      "epoch": 0.24516129032258063,
+      "grad_norm": 0.219457528851344,
+      "learning_rate": 0.00015833333333333332,
+      "loss": 0.4924,
+      "step": 19
+    },
+    {
+      "epoch": 0.25806451612903225,
+      "grad_norm": 0.16596853789220767,
+      "learning_rate": 0.0001666666666666667,
+      "loss": 0.4759,
+      "step": 20
+    },
+    {
+      "epoch": 0.2709677419354839,
+      "grad_norm": 0.13228412371333673,
+      "learning_rate": 0.000175,
+      "loss": 0.4613,
+      "step": 21
+    },
+    {
+      "epoch": 0.2838709677419355,
+      "grad_norm": 0.1421107856190867,
+      "learning_rate": 0.00018333333333333334,
+      "loss": 0.4852,
+      "step": 22
+    },
+    {
+      "epoch": 0.2967741935483871,
+      "grad_norm": 0.12552928984887968,
+      "learning_rate": 0.00019166666666666667,
+      "loss": 0.4786,
+      "step": 23
+    },
+    {
+      "epoch": 0.3096774193548387,
+      "grad_norm": 0.11489463060846784,
+      "learning_rate": 0.0002,
+      "loss": 0.4532,
+      "step": 24
+    },
+    {
+      "epoch": 0.3225806451612903,
+      "grad_norm": 0.11476879539402507,
+      "learning_rate": 0.00019998848349441062,
+      "loss": 0.4454,
+      "step": 25
+    },
+    {
+      "epoch": 0.33548387096774196,
+      "grad_norm": 0.1256602270101812,
+      "learning_rate": 0.00019995393663024054,
+      "loss": 0.4513,
+      "step": 26
+    },
+    {
+      "epoch": 0.34838709677419355,
+      "grad_norm": 0.11833482485698336,
+      "learning_rate": 0.00019989636736467278,
+      "loss": 0.44,
+      "step": 27
+    },
+    {
+      "epoch": 0.36129032258064514,
+      "grad_norm": 0.11124019681377781,
+      "learning_rate": 0.00019981578895764273,
+      "loss": 0.4439,
+      "step": 28
+    },
+    {
+      "epoch": 0.3741935483870968,
+      "grad_norm": 0.10954971384477814,
+      "learning_rate": 0.00019971221996878394,
+      "loss": 0.4274,
+      "step": 29
+    },
+    {
+      "epoch": 0.3870967741935484,
+      "grad_norm": 0.11422715129880294,
+      "learning_rate": 0.00019958568425315314,
+      "loss": 0.4254,
+      "step": 30
+    },
+    {
+      "epoch": 0.4,
+      "grad_norm": 0.11262310014016527,
+      "learning_rate": 0.00019943621095573586,
+      "loss": 0.4204,
+      "step": 31
+    },
+    {
+      "epoch": 0.4129032258064516,
+      "grad_norm": 0.11143099554463408,
+      "learning_rate": 0.00019926383450473344,
+      "loss": 0.4105,
+      "step": 32
+    },
+    {
+      "epoch": 0.4258064516129032,
+      "grad_norm": 0.1088260973247734,
+      "learning_rate": 0.00019906859460363307,
+      "loss": 0.4136,
+      "step": 33
+    },
+    {
+      "epoch": 0.43870967741935485,
+      "grad_norm": 0.10400753996611788,
+      "learning_rate": 0.00019885053622206304,
+      "loss": 0.4213,
+      "step": 34
+    },
+    {
+      "epoch": 0.45161290322580644,
+      "grad_norm": 0.09587900896302251,
+      "learning_rate": 0.0001986097095854347,
+      "loss": 0.4085,
+      "step": 35
+    },
+    {
+      "epoch": 0.4645161290322581,
+      "grad_norm": 0.10119603747308556,
+      "learning_rate": 0.0001983461701633742,
+      "loss": 0.4181,
+      "step": 36
+    },
+    {
+      "epoch": 0.4774193548387097,
+      "grad_norm": 0.10062413136253176,
+      "learning_rate": 0.00019805997865694614,
+      "loss": 0.4098,
+      "step": 37
+    },
+    {
+      "epoch": 0.49032258064516127,
+      "grad_norm": 0.09162394941720846,
+      "learning_rate": 0.0001977512009846721,
+      "loss": 0.4085,
+      "step": 38
+    },
+    {
+      "epoch": 0.5032258064516129,
+      "grad_norm": 0.09269316443279575,
+      "learning_rate": 0.00019741990826734794,
+      "loss": 0.3994,
+      "step": 39
+    },
+    {
+      "epoch": 0.5161290322580645,
+      "grad_norm": 0.08782581803238095,
+      "learning_rate": 0.00019706617681166218,
+      "loss": 0.3983,
+      "step": 40
+    },
+    {
+      "epoch": 0.5290322580645161,
+      "grad_norm": 0.08665646987756218,
+      "learning_rate": 0.00019669008809262062,
+      "loss": 0.3938,
+      "step": 41
+    },
+    {
+      "epoch": 0.5419354838709678,
+      "grad_norm": 0.09289388957990503,
+      "learning_rate": 0.00019629172873477995,
+      "loss": 0.396,
+      "step": 42
+    },
+    {
+      "epoch": 0.5548387096774193,
+      "grad_norm": 0.09203344649472522,
+      "learning_rate": 0.00019587119049229557,
+      "loss": 0.4052,
+      "step": 43
+    },
+    {
+      "epoch": 0.567741935483871,
+      "grad_norm": 0.08209774194723368,
+      "learning_rate": 0.0001954285702277879,
+      "loss": 0.3959,
+      "step": 44
+    },
+    {
+      "epoch": 0.5806451612903226,
+      "grad_norm": 0.08595872863630391,
+      "learning_rate": 0.00019496396989003193,
+      "loss": 0.397,
+      "step": 45
+    },
+    {
+      "epoch": 0.5935483870967742,
+      "grad_norm": 0.09041908237644536,
+      "learning_rate": 0.00019447749649047542,
+      "loss": 0.3992,
+      "step": 46
+    },
+    {
+      "epoch": 0.6064516129032258,
+      "grad_norm": 0.08321976348844515,
+      "learning_rate": 0.00019396926207859084,
+      "loss": 0.4095,
+      "step": 47
+    },
+    {
+      "epoch": 0.6193548387096774,
+      "grad_norm": 0.07887604040253807,
+      "learning_rate": 0.00019343938371606712,
+      "loss": 0.3866,
+      "step": 48
+    },
+    {
+      "epoch": 0.632258064516129,
+      "grad_norm": 0.08329265943906447,
+      "learning_rate": 0.00019288798344984672,
+      "loss": 0.3985,
+      "step": 49
+    },
+    {
+      "epoch": 0.6451612903225806,
+      "grad_norm": 0.08661703211305888,
+      "learning_rate": 0.00019231518828401458,
+      "loss": 0.3925,
+      "step": 50
+    },
+    {
+      "epoch": 0.6580645161290323,
+      "grad_norm": 0.08382217550700771,
+      "learning_rate": 0.00019172113015054532,
+      "loss": 0.3862,
+      "step": 51
+    },
+    {
+      "epoch": 0.6709677419354839,
+      "grad_norm": 0.08245124856491458,
+      "learning_rate": 0.00019110594587891519,
+      "loss": 0.3847,
+      "step": 52
+    },
+    {
+      "epoch": 0.6838709677419355,
+      "grad_norm": 0.08319716279149986,
+      "learning_rate": 0.00019046977716458626,
+      "loss": 0.3775,
+      "step": 53
+    },
+    {
+      "epoch": 0.6967741935483871,
+      "grad_norm": 0.08074648144423298,
+      "learning_rate": 0.0001898127705363696,
+      "loss": 0.3786,
+      "step": 54
+    },
+    {
+      "epoch": 0.7096774193548387,
+      "grad_norm": 0.08472762376284584,
+      "learning_rate": 0.0001891350773226754,
+      "loss": 0.3923,
+      "step": 55
+    },
+    {
+      "epoch": 0.7225806451612903,
+      "grad_norm": 0.08398076059437376,
+      "learning_rate": 0.00018843685361665723,
+      "loss": 0.3709,
+      "step": 56
+    },
+    {
+      "epoch": 0.7354838709677419,
+      "grad_norm": 0.08465216102770419,
+      "learning_rate": 0.00018771826024025946,
+      "loss": 0.3818,
+      "step": 57
+    },
+    {
+      "epoch": 0.7483870967741936,
+      "grad_norm": 0.09145572810056589,
+      "learning_rate": 0.00018697946270717467,
+      "loss": 0.39,
+      "step": 58
+    },
+    {
+      "epoch": 0.7612903225806451,
+      "grad_norm": 0.08415188367023674,
+      "learning_rate": 0.00018622063118472134,
+      "loss": 0.3733,
+      "step": 59
+    },
+    {
+      "epoch": 0.7741935483870968,
+      "grad_norm": 0.08576290382509591,
+      "learning_rate": 0.00018544194045464886,
+      "loss": 0.3878,
+      "step": 60
+    },
+    {
+      "epoch": 0.7870967741935484,
+      "grad_norm": 0.0844142047859298,
+      "learning_rate": 0.00018464356987288013,
+      "loss": 0.3637,
+      "step": 61
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.08918487261557899,
+      "learning_rate": 0.00018382570332820043,
+      "loss": 0.3775,
+      "step": 62
+    },
+    {
+      "epoch": 0.8129032258064516,
+      "grad_norm": 0.0795181880669878,
+      "learning_rate": 0.00018298852919990252,
+      "loss": 0.3853,
+      "step": 63
+    },
+    {
+      "epoch": 0.8258064516129032,
+      "grad_norm": 0.08173055996583302,
+      "learning_rate": 0.0001821322403143969,
+      "loss": 0.38,
+      "step": 64
+    },
+    {
+      "epoch": 0.8387096774193549,
+      "grad_norm": 0.08525070031165603,
+      "learning_rate": 0.0001812570339007983,
+      "loss": 0.3778,
+      "step": 65
+    },
+    {
+      "epoch": 0.8516129032258064,
+      "grad_norm": 0.08531235204546653,
+      "learning_rate": 0.00018036311154549784,
+      "loss": 0.3727,
+      "step": 66
+    },
+    {
+      "epoch": 0.864516129032258,
+      "grad_norm": 0.08169851479895494,
+      "learning_rate": 0.00017945067914573146,
+      "loss": 0.365,
+      "step": 67
+    },
+    {
+      "epoch": 0.8774193548387097,
+      "grad_norm": 0.08463789046916101,
+      "learning_rate": 0.0001785199468621559,
+      "loss": 0.3752,
+      "step": 68
+    },
+    {
+      "epoch": 0.8903225806451613,
+      "grad_norm": 0.09441843624235378,
+      "learning_rate": 0.000177571129070442,
+      "loss": 0.3665,
+      "step": 69
+    },
+    {
+      "epoch": 0.9032258064516129,
+      "grad_norm": 0.08530939476149231,
+      "learning_rate": 0.0001766044443118978,
+      "loss": 0.3926,
+      "step": 70
+    },
+    {
+      "epoch": 0.9161290322580645,
+      "grad_norm": 0.0836606457284625,
+      "learning_rate": 0.00017562011524313185,
+      "loss": 0.3844,
+      "step": 71
+    },
+    {
+      "epoch": 0.9290322580645162,
+      "grad_norm": 0.09868625782773943,
+      "learning_rate": 0.00017461836858476856,
+      "loss": 0.3835,
+      "step": 72
+    },
+    {
+      "epoch": 0.9419354838709677,
+      "grad_norm": 0.082132336261239,
+      "learning_rate": 0.00017359943506922774,
+      "loss": 0.3792,
+      "step": 73
+    },
+    {
+      "epoch": 0.9548387096774194,
+      "grad_norm": 0.08948965393301354,
+      "learning_rate": 0.0001725635493875799,
+      "loss": 0.3813,
+      "step": 74
+    },
+    {
+      "epoch": 0.967741935483871,
+      "grad_norm": 0.08539410389371488,
+      "learning_rate": 0.00017151095013548994,
+      "loss": 0.3774,
+      "step": 75
+    },
+    {
+      "epoch": 0.9806451612903225,
+      "grad_norm": 0.08690404790165682,
+      "learning_rate": 0.00017044187975826124,
+      "loss": 0.3762,
+      "step": 76
+    },
+    {
+      "epoch": 0.9935483870967742,
+      "grad_norm": 0.09039522496805455,
+      "learning_rate": 0.0001693565844949933,
+      "loss": 0.3733,
+      "step": 77
+    },
+    {
+      "epoch": 0.9935483870967742,
+      "eval_loss": 0.3743511736392975,
+      "eval_runtime": 42.1339,
+      "eval_samples_per_second": 24.66,
+      "eval_steps_per_second": 0.783,
+      "step": 77
+    },
+    {
+      "epoch": 1.0064516129032257,
+      "grad_norm": 0.09165665911792642,
+      "learning_rate": 0.00016825531432186543,
+      "loss": 0.3532,
+      "step": 78
+    },
+    {
+      "epoch": 1.0193548387096774,
+      "grad_norm": 0.0801922544260219,
+      "learning_rate": 0.0001671383228945597,
+      "loss": 0.347,
+      "step": 79
+    },
+    {
+      "epoch": 1.032258064516129,
+      "grad_norm": 0.08352186065175837,
+      "learning_rate": 0.00016600586748983641,
+      "loss": 0.3566,
+      "step": 80
+    },
+    {
+      "epoch": 1.0451612903225806,
+      "grad_norm": 0.08793176795367076,
+      "learning_rate": 0.0001648582089462756,
+      "loss": 0.3473,
+      "step": 81
+    },
+    {
+      "epoch": 1.0580645161290323,
+      "grad_norm": 0.08913951531063671,
+      "learning_rate": 0.00016369561160419784,
+      "loss": 0.342,
+      "step": 82
+    },
+    {
+      "epoch": 1.070967741935484,
+      "grad_norm": 0.08309712335786672,
+      "learning_rate": 0.0001625183432447789,
+      "loss": 0.345,
+      "step": 83
+    },
+    {
+      "epoch": 1.0838709677419356,
+      "grad_norm": 0.08725330804483407,
+      "learning_rate": 0.00016132667502837165,
+      "loss": 0.3523,
+      "step": 84
+    },
+    {
+      "epoch": 1.096774193548387,
+      "grad_norm": 0.08680862762413778,
+      "learning_rate": 0.00016012088143204953,
+      "loss": 0.3554,
+      "step": 85
+    },
+    {
+      "epoch": 1.1096774193548387,
+      "grad_norm": 0.0863782848559528,
+      "learning_rate": 0.00015890124018638638,
+      "loss": 0.364,
+      "step": 86
+    },
+    {
+      "epoch": 1.1225806451612903,
+      "grad_norm": 0.08388848992116194,
+      "learning_rate": 0.00015766803221148673,
+      "loss": 0.3568,
+      "step": 87
+    },
+    {
+      "epoch": 1.135483870967742,
+      "grad_norm": 0.08226994751114965,
+      "learning_rate": 0.00015642154155228122,
+      "loss": 0.3489,
+      "step": 88
+    },
+    {
+      "epoch": 1.1483870967741936,
+      "grad_norm": 0.08575965994905438,
+      "learning_rate": 0.00015516205531310273,
+      "loss": 0.3466,
+      "step": 89
+    },
+    {
+      "epoch": 1.1612903225806452,
+      "grad_norm": 0.0895747440427046,
+      "learning_rate": 0.00015388986359155758,
+      "loss": 0.3488,
+      "step": 90
+    },
+    {
+      "epoch": 1.1741935483870969,
+      "grad_norm": 0.08403222320010312,
+      "learning_rate": 0.00015260525941170712,
+      "loss": 0.356,
+      "step": 91
+    },
+    {
+      "epoch": 1.1870967741935483,
+      "grad_norm": 0.08627434364043794,
+      "learning_rate": 0.0001513085386565758,
+      "loss": 0.3519,
+      "step": 92
+    },
+    {
+      "epoch": 1.2,
+      "grad_norm": 0.08925414655300028,
+      "learning_rate": 0.00015000000000000001,
+      "loss": 0.3523,
+      "step": 93
+    },
+    {
+      "epoch": 1.2129032258064516,
+      "grad_norm": 0.09120079741968923,
+      "learning_rate": 0.00014867994483783485,
+      "loss": 0.3555,
+      "step": 94
+    },
+    {
+      "epoch": 1.2258064516129032,
+      "grad_norm": 0.08519037826685563,
+      "learning_rate": 0.0001473486772185334,
+      "loss": 0.3551,
+      "step": 95
+    },
+    {
+      "epoch": 1.238709677419355,
+      "grad_norm": 0.08814591743170447,
+      "learning_rate": 0.00014600650377311522,
+      "loss": 0.3535,
+      "step": 96
+    },
+    {
+      "epoch": 1.2516129032258063,
+      "grad_norm": 0.08812877093082108,
+      "learning_rate": 0.00014465373364454001,
+      "loss": 0.3498,
+      "step": 97
+    },
+    {
+      "epoch": 1.2645161290322582,
+      "grad_norm": 0.08596197743921638,
+      "learning_rate": 0.00014329067841650274,
+      "loss": 0.3484,
+      "step": 98
+    },
+    {
+      "epoch": 1.2774193548387096,
+      "grad_norm": 0.09025513346881896,
+      "learning_rate": 0.00014191765204166643,
+      "loss": 0.3465,
+      "step": 99
+    },
+    {
+      "epoch": 1.2903225806451613,
+      "grad_norm": 0.08665409616008209,
+      "learning_rate": 0.00014053497076934948,
+      "loss": 0.35,
+      "step": 100
+    },
+    {
+      "epoch": 1.303225806451613,
+      "grad_norm": 0.09012608398761074,
+      "learning_rate": 0.00013914295307268396,
+      "loss": 0.3516,
+      "step": 101
+    },
+    {
+      "epoch": 1.3161290322580645,
+      "grad_norm": 0.09456407877563842,
+      "learning_rate": 0.00013774191957526143,
+      "loss": 0.3639,
+      "step": 102
+    },
+    {
+      "epoch": 1.3290322580645162,
+      "grad_norm": 0.0888376260234129,
+      "learning_rate": 0.00013633219297728416,
+      "loss": 0.3396,
+      "step": 103
+    },
+    {
+      "epoch": 1.3419354838709676,
+      "grad_norm": 0.08652600639054038,
+      "learning_rate": 0.00013491409798123687,
+      "loss": 0.3445,
+      "step": 104
+    },
+    {
+      "epoch": 1.3548387096774195,
+      "grad_norm": 0.09269194410505097,
+      "learning_rate": 0.00013348796121709862,
+      "loss": 0.3555,
+      "step": 105
+    },
+    {
+      "epoch": 1.367741935483871,
+      "grad_norm": 0.09421096011594207,
+      "learning_rate": 0.00013205411116710972,
+      "loss": 0.3508,
+      "step": 106
+    },
+    {
+      "epoch": 1.3806451612903226,
+      "grad_norm": 0.09286783444235318,
+      "learning_rate": 0.00013061287809011242,
+      "loss": 0.3571,
+      "step": 107
+    },
+    {
+      "epoch": 1.3935483870967742,
+      "grad_norm": 0.08172852976047028,
+      "learning_rate": 0.0001291645939454825,
+      "loss": 0.3488,
+      "step": 108
+    },
+    {
+      "epoch": 1.4064516129032258,
+      "grad_norm": 0.09033973727962885,
+      "learning_rate": 0.0001277095923166689,
+      "loss": 0.3498,
+      "step": 109
+    },
+    {
+      "epoch": 1.4193548387096775,
+      "grad_norm": 0.09628933362833343,
+      "learning_rate": 0.00012624820833435937,
+      "loss": 0.3472,
+      "step": 110
+    },
+    {
+      "epoch": 1.432258064516129,
+      "grad_norm": 0.08471497514674803,
+      "learning_rate": 0.00012478077859929,
+      "loss": 0.3353,
+      "step": 111
+    },
+    {
+      "epoch": 1.4451612903225808,
+      "grad_norm": 0.08976133324522119,
+      "learning_rate": 0.00012330764110471566,
+      "loss": 0.3468,
+      "step": 112
+    },
+    {
+      "epoch": 1.4580645161290322,
+      "grad_norm": 0.09634877556737409,
+      "learning_rate": 0.00012182913515856015,
+      "loss": 0.3541,
+      "step": 113
+    },
+    {
+      "epoch": 1.4709677419354839,
+      "grad_norm": 0.09348923296138459,
+      "learning_rate": 0.0001203456013052634,
+      "loss": 0.3521,
+      "step": 114
+    },
+    {
+      "epoch": 1.4838709677419355,
+      "grad_norm": 0.09437711091684706,
+      "learning_rate": 0.00011885738124734358,
+      "loss": 0.3566,
+      "step": 115
+    },
+    {
+      "epoch": 1.4967741935483871,
+      "grad_norm": 0.08916702937111011,
+      "learning_rate": 0.00011736481776669306,
+      "loss": 0.3458,
+      "step": 116
+    },
+    {
+      "epoch": 1.5096774193548388,
+      "grad_norm": 0.09100601467580355,
+      "learning_rate": 0.00011586825464562514,
+      "loss": 0.3593,
+      "step": 117
+    },
+    {
+      "epoch": 1.5225806451612902,
+      "grad_norm": 0.08990470683690902,
+      "learning_rate": 0.00011436803658769082,
+      "loss": 0.3434,
+      "step": 118
+    },
+    {
+      "epoch": 1.535483870967742,
+      "grad_norm": 0.0932653393737011,
+      "learning_rate": 0.00011286450913828312,
+      "loss": 0.342,
+      "step": 119
+    },
+    {
+      "epoch": 1.5483870967741935,
+      "grad_norm": 0.08960531773257623,
+      "learning_rate": 0.00011135801860504749,
+      "loss": 0.3628,
+      "step": 120
+    },
+    {
+      "epoch": 1.5612903225806452,
+      "grad_norm": 0.09275069273094473,
+      "learning_rate": 0.00010984891197811687,
+      "loss": 0.3513,
+      "step": 121
+    },
+    {
+      "epoch": 1.5741935483870968,
+      "grad_norm": 0.09527469311088294,
+      "learning_rate": 0.00010833753685018935,
+      "loss": 0.3556,
+      "step": 122
+    },
+    {
+      "epoch": 1.5870967741935482,
+      "grad_norm": 0.09323849659154124,
+      "learning_rate": 0.0001068242413364671,
+      "loss": 0.3448,
+      "step": 123
+    },
+    {
+      "epoch": 1.6,
+      "grad_norm": 0.08474554028292876,
+      "learning_rate": 0.00010530937399447496,
+      "loss": 0.3499,
+      "step": 124
+    },
+    {
+      "epoch": 1.6129032258064515,
+      "grad_norm": 0.09382059811382143,
+      "learning_rate": 0.00010379328374377715,
+      "loss": 0.3384,
+      "step": 125
+    },
+    {
+      "epoch": 1.6258064516129034,
+      "grad_norm": 0.09276702527842776,
+      "learning_rate": 0.00010227631978561056,
+      "loss": 0.3444,
+      "step": 126
+    },
+    {
+      "epoch": 1.6387096774193548,
+      "grad_norm": 0.08750152088472078,
+      "learning_rate": 0.00010075883152245334,
+      "loss": 0.3569,
+      "step": 127
+    },
+    {
+      "epoch": 1.6516129032258065,
+      "grad_norm": 0.08714445180642569,
+      "learning_rate": 9.92411684775467e-05,
+      "loss": 0.342,
+      "step": 128
+    },
+    {
+      "epoch": 1.664516129032258,
+      "grad_norm": 0.08469902272466831,
+      "learning_rate": 9.772368021438943e-05,
+      "loss": 0.3342,
+      "step": 129
+    },
+    {
+      "epoch": 1.6774193548387095,
+      "grad_norm": 0.08724585745005611,
+      "learning_rate": 9.620671625622288e-05,
+      "loss": 0.3335,
+      "step": 130
+    },
+    {
+      "epoch": 1.6903225806451614,
+      "grad_norm": 0.09087336723016343,
+      "learning_rate": 9.469062600552509e-05,
+      "loss": 0.3447,
+      "step": 131
+    },
+    {
+      "epoch": 1.7032258064516128,
+      "grad_norm": 0.08863278083042062,
+      "learning_rate": 9.317575866353292e-05,
+      "loss": 0.3487,
+      "step": 132
+    },
+    {
+      "epoch": 1.7161290322580647,
+      "grad_norm": 0.08343459715762,
+      "learning_rate": 9.166246314981066e-05,
+      "loss": 0.3454,
+      "step": 133
+    },
+    {
+      "epoch": 1.729032258064516,
+      "grad_norm": 0.08837483796029806,
+      "learning_rate": 9.015108802188313e-05,
+      "loss": 0.3484,
+      "step": 134
+    },
+    {
+      "epoch": 1.7419354838709677,
+      "grad_norm": 0.08762249376974672,
+      "learning_rate": 8.86419813949525e-05,
+      "loss": 0.3447,
+      "step": 135
+    },
+    {
+      "epoch": 1.7548387096774194,
+      "grad_norm": 0.08446853010895118,
+      "learning_rate": 8.713549086171691e-05,
+      "loss": 0.3466,
+      "step": 136
+    },
+    {
+      "epoch": 1.7677419354838708,
+      "grad_norm": 0.08897676787603495,
+      "learning_rate": 8.563196341230919e-05,
+      "loss": 0.3434,
+      "step": 137
+    },
+    {
+      "epoch": 1.7806451612903227,
+      "grad_norm": 0.09210810174866911,
+      "learning_rate": 8.413174535437487e-05,
+      "loss": 0.355,
+      "step": 138
+    },
+    {
+      "epoch": 1.793548387096774,
+      "grad_norm": 0.0877098792555575,
+      "learning_rate": 8.263518223330697e-05,
+      "loss": 0.3392,
+      "step": 139
+    },
+    {
+      "epoch": 1.8064516129032258,
+      "grad_norm": 0.09059259587839792,
+      "learning_rate": 8.114261875265643e-05,
+      "loss": 0.3465,
+      "step": 140
+    },
+    {
+      "epoch": 1.8193548387096774,
+      "grad_norm": 0.09043152099082513,
+      "learning_rate": 7.965439869473664e-05,
+      "loss": 0.3409,
+      "step": 141
+    },
+    {
+      "epoch": 1.832258064516129,
+      "grad_norm": 0.08863483273837267,
+      "learning_rate": 7.817086484143986e-05,
+      "loss": 0.3497,
+      "step": 142
+    },
+    {
+      "epoch": 1.8451612903225807,
+      "grad_norm": 0.08351509862847174,
+      "learning_rate": 7.669235889528436e-05,
+      "loss": 0.3484,
+      "step": 143
+    },
+    {
+      "epoch": 1.8580645161290321,
+      "grad_norm": 0.08881689002413959,
+      "learning_rate": 7.521922140071002e-05,
+      "loss": 0.3428,
+      "step": 144
+    },
+    {
+      "epoch": 1.870967741935484,
+      "grad_norm": 0.08962413300366581,
+      "learning_rate": 7.375179166564063e-05,
+      "loss": 0.3353,
+      "step": 145
+    },
+    {
+      "epoch": 1.8838709677419354,
+      "grad_norm": 0.08991947191225944,
+      "learning_rate": 7.229040768333115e-05,
+      "loss": 0.3366,
+      "step": 146
+    },
+    {
+      "epoch": 1.896774193548387,
+      "grad_norm": 0.0890545628104281,
+      "learning_rate": 7.08354060545175e-05,
+      "loss": 0.3381,
+      "step": 147
+    },
+    {
+      "epoch": 1.9096774193548387,
+      "grad_norm": 0.09306016588414409,
+      "learning_rate": 6.93871219098876e-05,
+      "loss": 0.3356,
+      "step": 148
+    },
+    {
+      "epoch": 1.9225806451612903,
+      "grad_norm": 0.08816048934545212,
+      "learning_rate": 6.79458888328903e-05,
+      "loss": 0.3412,
+      "step": 149
+    },
+    {
+      "epoch": 1.935483870967742,
+      "grad_norm": 0.09006593042575502,
+      "learning_rate": 6.651203878290139e-05,
+      "loss": 0.3471,
+      "step": 150
+    },
+    {
+      "epoch": 1.9483870967741934,
+      "grad_norm": 0.08499237638300171,
+      "learning_rate": 6.508590201876317e-05,
+      "loss": 0.335,
+      "step": 151
+    },
+    {
+      "epoch": 1.9612903225806453,
+      "grad_norm": 0.09566747308379261,
+      "learning_rate": 6.366780702271589e-05,
+      "loss": 0.3395,
+      "step": 152
+    },
+    {
+      "epoch": 1.9741935483870967,
+      "grad_norm": 0.0915253754596643,
+      "learning_rate": 6.225808042473858e-05,
+      "loss": 0.3488,
+      "step": 153
+    },
+    {
+      "epoch": 1.9870967741935484,
+      "grad_norm": 0.08657357278603872,
+      "learning_rate": 6.085704692731609e-05,
+      "loss": 0.3344,
+      "step": 154
+    },
+    {
+      "epoch": 2.0,
+      "grad_norm": 0.08950726731743963,
+      "learning_rate": 5.9465029230650534e-05,
+      "loss": 0.33,
+      "step": 155
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 0.35439133644104004,
+      "eval_runtime": 36.1469,
+      "eval_samples_per_second": 28.744,
+      "eval_steps_per_second": 0.913,
+      "step": 155
+    },
+    {
+      "epoch": 2.0129032258064514,
+      "grad_norm": 0.08961232668946545,
+      "learning_rate": 5.8082347958333625e-05,
+      "loss": 0.3273,
+      "step": 156
+    },
+    {
+      "epoch": 2.0258064516129033,
+      "grad_norm": 0.09402916213349197,
+      "learning_rate": 5.670932158349731e-05,
+      "loss": 0.3218,
+      "step": 157
+    },
+    {
+      "epoch": 2.0387096774193547,
+      "grad_norm": 0.08520247695821515,
+      "learning_rate": 5.5346266355459995e-05,
+      "loss": 0.3089,
+      "step": 158
+    },
+    {
+      "epoch": 2.0516129032258066,
+      "grad_norm": 0.08637288183919145,
+      "learning_rate": 5.399349622688479e-05,
+      "loss": 0.3266,
+      "step": 159
+    },
+    {
+      "epoch": 2.064516129032258,
+      "grad_norm": 0.08823864345930746,
+      "learning_rate": 5.26513227814666e-05,
+      "loss": 0.329,
+      "step": 160
+    },
+    {
+      "epoch": 2.07741935483871,
+      "grad_norm": 0.09384371931382793,
+      "learning_rate": 5.1320055162165115e-05,
+      "loss": 0.3275,
+      "step": 161
+    },
+    {
+      "epoch": 2.0903225806451613,
+      "grad_norm": 0.09516405744887674,
+      "learning_rate": 5.000000000000002e-05,
+      "loss": 0.332,
+      "step": 162
+    },
+    {
+      "epoch": 2.1032258064516127,
+      "grad_norm": 0.08966279182804247,
+      "learning_rate": 4.869146134342426e-05,
+      "loss": 0.3247,
+      "step": 163
+    },
+    {
+      "epoch": 2.1161290322580646,
+      "grad_norm": 0.08700940402163973,
+      "learning_rate": 4.739474058829289e-05,
+      "loss": 0.3221,
+      "step": 164
+    },
+    {
+      "epoch": 2.129032258064516,
+      "grad_norm": 0.08984677102800173,
+      "learning_rate": 4.611013640844245e-05,
+      "loss": 0.3272,
+      "step": 165
+    },
+    {
+      "epoch": 2.141935483870968,
+      "grad_norm": 0.08964202186304891,
+      "learning_rate": 4.483794468689728e-05,
+      "loss": 0.3188,
+      "step": 166
+    },
+    {
+      "epoch": 2.1548387096774193,
+      "grad_norm": 0.09997697429798251,
+      "learning_rate": 4.357845844771881e-05,
+      "loss": 0.3383,
+      "step": 167
+    },
+    {
+      "epoch": 2.167741935483871,
+      "grad_norm": 0.09510073376177604,
+      "learning_rate": 4.2331967788513295e-05,
+      "loss": 0.3252,
+      "step": 168
+    },
+    {
+      "epoch": 2.1806451612903226,
+      "grad_norm": 0.09107612709336496,
+      "learning_rate": 4.109875981361363e-05,
+      "loss": 0.3217,
+      "step": 169
+    },
+    {
+      "epoch": 2.193548387096774,
+      "grad_norm": 0.08804927379783276,
+      "learning_rate": 3.987911856795047e-05,
+      "loss": 0.3173,
+      "step": 170
+    },
+    {
+      "epoch": 2.206451612903226,
+      "grad_norm": 0.0916081059987062,
+      "learning_rate": 3.8673324971628357e-05,
+      "loss": 0.3285,
+      "step": 171
+    },
+    {
+      "epoch": 2.2193548387096773,
+      "grad_norm": 0.09226628432750343,
+      "learning_rate": 3.7481656755221125e-05,
+      "loss": 0.3154,
+      "step": 172
+    },
+    {
+      "epoch": 2.232258064516129,
+      "grad_norm": 0.09145015878266409,
+      "learning_rate": 3.630438839580217e-05,
+      "loss": 0.3087,
+      "step": 173
+    },
+    {
+      "epoch": 2.2451612903225806,
+      "grad_norm": 0.08786201399591659,
+      "learning_rate": 3.5141791053724405e-05,
+      "loss": 0.3151,
+      "step": 174
+    },
+    {
+      "epoch": 2.258064516129032,
+      "grad_norm": 0.09259402512083086,
+      "learning_rate": 3.399413251016359e-05,
+      "loss": 0.3369,
+      "step": 175
+    },
+    {
+      "epoch": 2.270967741935484,
+      "grad_norm": 0.09311260751337232,
+      "learning_rate": 3.2861677105440336e-05,
+      "loss": 0.3051,
+      "step": 176
+    },
+    {
+      "epoch": 2.2838709677419353,
+      "grad_norm": 0.09217712904693832,
+      "learning_rate": 3.174468567813461e-05,
+      "loss": 0.3199,
+      "step": 177
+    },
+    {
+      "epoch": 2.296774193548387,
+      "grad_norm": 0.09141877592974519,
+      "learning_rate": 3.0643415505006735e-05,
+      "loss": 0.3229,
+      "step": 178
+    },
+    {
+      "epoch": 2.3096774193548386,
+      "grad_norm": 0.09528833689903496,
+      "learning_rate": 2.9558120241738784e-05,
+      "loss": 0.3286,
+      "step": 179
+    },
+    {
+      "epoch": 2.3225806451612905,
+      "grad_norm": 0.09070636787107308,
+      "learning_rate": 2.8489049864510054e-05,
+      "loss": 0.3348,
+      "step": 180
+    },
+    {
+      "epoch": 2.335483870967742,
+      "grad_norm": 0.09307512327341362,
+      "learning_rate": 2.7436450612420095e-05,
+      "loss": 0.3256,
+      "step": 181
+    },
+    {
+      "epoch": 2.3483870967741938,
+      "grad_norm": 0.09127823479306682,
+      "learning_rate": 2.640056493077231e-05,
+      "loss": 0.3181,
+      "step": 182
+    },
+    {
+      "epoch": 2.361290322580645,
+      "grad_norm": 0.09246009256113925,
+      "learning_rate": 2.5381631415231454e-05,
+      "loss": 0.3391,
+      "step": 183
+    },
+    {
+      "epoch": 2.3741935483870966,
+      "grad_norm": 0.09095352379758655,
+      "learning_rate": 2.4379884756868167e-05,
+      "loss": 0.3172,
+      "step": 184
+    },
+    {
+      "epoch": 2.3870967741935485,
+      "grad_norm": 0.0926880163626768,
+      "learning_rate": 2.339555568810221e-05,
+      "loss": 0.3177,
+      "step": 185
+    },
+    {
+      "epoch": 2.4,
+      "grad_norm": 0.09094474131194094,
+      "learning_rate": 2.242887092955801e-05,
+      "loss": 0.3199,
+      "step": 186
+    },
+    {
+      "epoch": 2.412903225806452,
+      "grad_norm": 0.09106546035353981,
+      "learning_rate": 2.1480053137844115e-05,
+      "loss": 0.3222,
+      "step": 187
+    },
+    {
+      "epoch": 2.425806451612903,
+      "grad_norm": 0.08873018715134598,
+      "learning_rate": 2.054932085426856e-05,
+      "loss": 0.3118,
+      "step": 188
+    },
+    {
+      "epoch": 2.4387096774193546,
+      "grad_norm": 0.0932765377498955,
+      "learning_rate": 1.9636888454502178e-05,
+      "loss": 0.3358,
+      "step": 189
+    },
+    {
+      "epoch": 2.4516129032258065,
+      "grad_norm": 0.09181586534157822,
+      "learning_rate": 1.8742966099201697e-05,
+      "loss": 0.3157,
+      "step": 190
+    },
+    {
+      "epoch": 2.464516129032258,
+      "grad_norm": 0.0929486436457203,
+      "learning_rate": 1.7867759685603114e-05,
+      "loss": 0.3154,
+      "step": 191
+    },
+    {
+      "epoch": 2.47741935483871,
+      "grad_norm": 0.09188630220285351,
+      "learning_rate": 1.7011470800097496e-05,
+      "loss": 0.3181,
+      "step": 192
+    },
+    {
+      "epoch": 2.490322580645161,
+      "grad_norm": 0.09574286894431329,
+      "learning_rate": 1.6174296671799572e-05,
+      "loss": 0.3222,
+      "step": 193
+    },
+    {
+      "epoch": 2.5032258064516126,
+      "grad_norm": 0.09145354457132104,
+      "learning_rate": 1.5356430127119913e-05,
+      "loss": 0.3222,
+      "step": 194
+    },
+    {
+      "epoch": 2.5161290322580645,
+      "grad_norm": 0.09039580690260736,
+      "learning_rate": 1.4558059545351143e-05,
+      "loss": 0.324,
+      "step": 195
+    },
+    {
+      "epoch": 2.5290322580645164,
+      "grad_norm": 0.08979381831653434,
+      "learning_rate": 1.3779368815278647e-05,
+      "loss": 0.3107,
+      "step": 196
+    },
+    {
+      "epoch": 2.541935483870968,
+      "grad_norm": 0.09526292697431937,
+      "learning_rate": 1.302053729282533e-05,
+      "loss": 0.3219,
+      "step": 197
+    },
+    {
+      "epoch": 2.554838709677419,
+      "grad_norm": 0.09310358146453943,
+      "learning_rate": 1.2281739759740574e-05,
+      "loss": 0.3214,
+      "step": 198
+    },
+    {
+      "epoch": 2.567741935483871,
+      "grad_norm": 0.09212645063531479,
+      "learning_rate": 1.1563146383342772e-05,
+      "loss": 0.3154,
+      "step": 199
+    },
+    {
+      "epoch": 2.5806451612903225,
+      "grad_norm": 0.09533681862557382,
+      "learning_rate": 1.0864922677324618e-05,
+      "loss": 0.319,
+      "step": 200
+    },
+    {
+      "epoch": 2.5935483870967744,
+      "grad_norm": 0.09551418366783314,
+      "learning_rate": 1.01872294636304e-05,
+      "loss": 0.3333,
+      "step": 201
+    },
+    {
+      "epoch": 2.606451612903226,
+      "grad_norm": 0.08930212325894361,
+      "learning_rate": 9.530222835413738e-06,
+      "loss": 0.3048,
+      "step": 202
+    },
+    {
+      "epoch": 2.6193548387096772,
+      "grad_norm": 0.09220378121771236,
+      "learning_rate": 8.894054121084838e-06,
+      "loss": 0.3146,
+      "step": 203
+    },
+    {
+      "epoch": 2.632258064516129,
+      "grad_norm": 0.09150774720724307,
+      "learning_rate": 8.278869849454718e-06,
+      "loss": 0.3311,
+      "step": 204
+    },
+    {
+      "epoch": 2.6451612903225805,
+      "grad_norm": 0.09261513270619316,
+      "learning_rate": 7.684811715985429e-06,
+      "loss": 0.3172,
+      "step": 205
+    },
+    {
+      "epoch": 2.6580645161290324,
+      "grad_norm": 0.0941004102909483,
+      "learning_rate": 7.1120165501533e-06,
+      "loss": 0.3347,
+      "step": 206
+    },
+    {
+      "epoch": 2.670967741935484,
+      "grad_norm": 0.08707518610128166,
+      "learning_rate": 6.560616283932897e-06,
+      "loss": 0.3116,
+      "step": 207
+    },
+    {
+      "epoch": 2.6838709677419352,
+      "grad_norm": 0.08648707636296159,
+      "learning_rate": 6.030737921409169e-06,
+      "loss": 0.3144,
+      "step": 208
+    },
+    {
+      "epoch": 2.696774193548387,
+      "grad_norm": 0.09169150101119816,
+      "learning_rate": 5.52250350952459e-06,
+      "loss": 0.3255,
+      "step": 209
+    },
+    {
+      "epoch": 2.709677419354839,
+      "grad_norm": 0.09060072523264334,
+      "learning_rate": 5.036030109968082e-06,
+      "loss": 0.3183,
+      "step": 210
+    },
+    {
+      "epoch": 2.7225806451612904,
+      "grad_norm": 0.09077216490604942,
+      "learning_rate": 4.5714297722121106e-06,
+      "loss": 0.321,
+      "step": 211
+    },
+    {
+      "epoch": 2.735483870967742,
+      "grad_norm": 0.09088968433443333,
+      "learning_rate": 4.128809507704445e-06,
+      "loss": 0.3172,
+      "step": 212
+    },
+    {
+      "epoch": 2.7483870967741937,
+      "grad_norm": 0.09191902683388614,
+      "learning_rate": 3.7082712652200867e-06,
+      "loss": 0.3261,
+      "step": 213
+    },
+    {
+      "epoch": 2.761290322580645,
+      "grad_norm": 0.08843215800144302,
+      "learning_rate": 3.3099119073793928e-06,
+      "loss": 0.3158,
+      "step": 214
+    },
+    {
+      "epoch": 2.774193548387097,
+      "grad_norm": 0.09079938334868655,
+      "learning_rate": 2.9338231883378366e-06,
+      "loss": 0.3178,
+      "step": 215
+    },
+    {
+      "epoch": 2.7870967741935484,
+      "grad_norm": 0.09122789808454786,
+      "learning_rate": 2.580091732652101e-06,
+      "loss": 0.3282,
+      "step": 216
+    },
+    {
+      "epoch": 2.8,
+      "grad_norm": 0.09380292374109117,
+      "learning_rate": 2.248799015327907e-06,
+      "loss": 0.3359,
+      "step": 217
+    },
+    {
+      "epoch": 2.8129032258064517,
+      "grad_norm": 0.09035917420929797,
+      "learning_rate": 1.9400213430538773e-06,
+      "loss": 0.3169,
+      "step": 218
+    },
+    {
+      "epoch": 2.825806451612903,
+      "grad_norm": 0.09195121657817087,
+      "learning_rate": 1.6538298366257976e-06,
+      "loss": 0.3314,
+      "step": 219
+    },
+    {
+      "epoch": 2.838709677419355,
+      "grad_norm": 0.09166102367139951,
+      "learning_rate": 1.3902904145653096e-06,
+      "loss": 0.3258,
+      "step": 220
+    },
+    {
+      "epoch": 2.8516129032258064,
+      "grad_norm": 0.0921992572010057,
+      "learning_rate": 1.1494637779369766e-06,
+      "loss": 0.3298,
+      "step": 221
+    },
+    {
+      "epoch": 2.864516129032258,
+      "grad_norm": 0.09068261067988724,
+      "learning_rate": 9.314053963669245e-07,
+      "loss": 0.3214,
+      "step": 222
+    },
+    {
+      "epoch": 2.8774193548387097,
+      "grad_norm": 0.09417924199778298,
+      "learning_rate": 7.361654952665609e-07,
+      "loss": 0.3134,
+      "step": 223
+    },
+    {
+      "epoch": 2.8903225806451616,
+      "grad_norm": 0.0901765977296441,
+      "learning_rate": 5.637890442641402e-07,
+      "loss": 0.3221,
+      "step": 224
+    },
+    {
+      "epoch": 2.903225806451613,
+      "grad_norm": 0.09094506589085496,
+      "learning_rate": 4.143157468468717e-07,
+      "loss": 0.3128,
+      "step": 225
+    },
+    {
+      "epoch": 2.9161290322580644,
+      "grad_norm": 0.08772549933058231,
+      "learning_rate": 2.877800312160783e-07,
+      "loss": 0.3248,
+      "step": 226
+    },
+    {
+      "epoch": 2.9290322580645163,
+      "grad_norm": 0.09191883931659987,
+      "learning_rate": 1.8421104235727405e-07,
+      "loss": 0.3114,
+      "step": 227
+    },
+    {
+      "epoch": 2.9419354838709677,
+      "grad_norm": 0.08876137430429,
+      "learning_rate": 1.0363263532724432e-07,
+      "loss": 0.3127,
+      "step": 228
+    },
+    {
+      "epoch": 2.9548387096774196,
+      "grad_norm": 0.09157045134043748,
+      "learning_rate": 4.606336975948589e-08,
+      "loss": 0.3275,
+      "step": 229
+    },
+    {
+      "epoch": 2.967741935483871,
+      "grad_norm": 0.08940213355520302,
+      "learning_rate": 1.1516505589381776e-08,
+      "loss": 0.3246,
+      "step": 230
+    },
+    {
+      "epoch": 2.9806451612903224,
+      "grad_norm": 0.0895898052255747,
+      "learning_rate": 0.0,
+      "loss": 0.3079,
+      "step": 231
+    },
+    {
+      "epoch": 2.9806451612903224,
+      "eval_loss": 0.3507891595363617,
+      "eval_runtime": 36.0777,
+      "eval_samples_per_second": 28.799,
+      "eval_steps_per_second": 0.915,
+      "step": 231
+    },
+    {
+      "epoch": 2.9806451612903224,
+      "step": 231,
+      "total_flos": 9.324729662937498e+16,
+      "train_loss": 0.3951803825118325,
+      "train_runtime": 2997.4381,
+      "train_samples_per_second": 9.871,
+      "train_steps_per_second": 0.077
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 231,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 9.324729662937498e+16,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}