Upload model fine-tuned from codet5p-220m using strategy src_fm_fc_dctx

Browse files

Files changed (12) hide show

README.md +52 -3
added_tokens.json +8 -0
config.json +33 -0
generation_config.json +8 -0
merges.txt +0 -0
pytorch_model.bin +3 -0
special_tokens_map.json +59 -0
tokenizer.json +0 -0
tokenizer_config.json +113 -0
trainer_state.json +1161 -0
training_args.bin +3 -0
vocab.json +0 -0

README.md CHANGED Viewed

@@ -1,3 +1,52 @@
----
-license: apache-2.0
----

+---
+license: bsd-3-clause
+base_model: Salesforce/codet5p-220m
+tags:
+- generated_from_trainer
+model-index:
+- name: dynamtests_01_codet5p_src_fm_fc_dctx
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# dynamtests_01_codet5p_src_fm_fc_dctx
+This model is a fine-tuned version of [Salesforce/codet5p-220m](https://huggingface.co/Salesforce/codet5p-220m) on an unknown dataset.
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 2e-05
+- train_batch_size: 8
+- eval_batch_size: 32
+- seed: 42
+- gradient_accumulation_steps: 8
+- total_train_batch_size: 64
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: linear
+- lr_scheduler_warmup_steps: 1974
+- num_epochs: 4
+- mixed_precision_training: Native AMP
+### Framework versions
+- Transformers 4.40.0
+- Pytorch 2.1.0
+- Datasets 3.0.0
+- Tokenizers 0.19.1

added_tokens.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "</ECTX>": 32103,
+  "</FCTX>": 32101,
+  "</PRIVATE_FCTX>": 32105,
+  "<ECTX>": 32102,
+  "<FCTX>": 32100,
+  "<PRIVATE_FCTX>": 32104
+}

config.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "_name_or_path": "Salesforce/codet5p-220m",
+  "architectures": [
+    "T5ForConditionalGeneration"
+  ],
+  "bos_token_id": 1,
+  "classifier_dropout": 0.0,
+  "d_ff": 3072,
+  "d_kv": 64,
+  "d_model": 768,
+  "decoder_start_token_id": 0,
+  "dense_act_fn": "relu",
+  "dropout_rate": 0.1,
+  "eos_token_id": 2,
+  "feed_forward_proj": "relu",
+  "initializer_factor": 1.0,
+  "is_encoder_decoder": true,
+  "is_gated_act": false,
+  "layer_norm_epsilon": 1e-06,
+  "model_type": "t5",
+  "n_positions": 1024,
+  "num_decoder_layers": 12,
+  "num_heads": 12,
+  "num_layers": 12,
+  "output_past": true,
+  "pad_token_id": 0,
+  "relative_attention_max_distance": 128,
+  "relative_attention_num_buckets": 32,
+  "torch_dtype": "float32",
+  "transformers_version": "4.40.0",
+  "use_cache": true,
+  "vocab_size": 32106
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "decoder_start_token_id": 0,
+  "eos_token_id": 2,
+  "pad_token_id": 0,
+  "transformers_version": "4.40.0"
+}

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c89fe1d2448925fbe7c35105204834a24d4703a6a86010241401f99b19e8db6d
+size 891635790

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,59 @@

+{
+  "additional_special_tokens": [
+    "<FCTX>",
+    "</FCTX>",
+    "<ECTX>",
+    "</ECTX>",
+    "<PRIVATE_FCTX>",
+    "</PRIVATE_FCTX>"
+  ],
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "cls_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<pad>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "sep_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,113 @@

+{
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<pad>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "3": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "4": {
+      "content": "<mask>",
+      "lstrip": true,
+      "normalized": true,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32100": {
+      "content": "<FCTX>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32101": {
+      "content": "</FCTX>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32102": {
+      "content": "<ECTX>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32103": {
+      "content": "</ECTX>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32104": {
+      "content": "<PRIVATE_FCTX>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "32105": {
+      "content": "</PRIVATE_FCTX>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [
+    "<FCTX>",
+    "</FCTX>",
+    "<ECTX>",
+    "</ECTX>",
+    "<PRIVATE_FCTX>",
+    "</PRIVATE_FCTX>"
+  ],
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": true,
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "errors": "replace",
+  "mask_token": "<mask>",
+  "model_max_length": 1024,
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "tokenizer_class": "RobertaTokenizer",
+  "trim_offsets": true,
+  "unk_token": "<unk>"
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1161 @@

+{
+  "best_metric": 1.0085912942886353,
+  "best_model_checkpoint": "/root/finetuning_executions/dynamtests_01_codet5p_src_fm_fc_dctx/checkpoint-39484",
+  "epoch": 3.999797396545611,
+  "eval_steps": 500,
+  "global_step": 39484,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.025325431798612165,
+      "grad_norm": 1.3758184909820557,
+      "learning_rate": 2.462006079027356e-06,
+      "loss": 2.5021,
+      "step": 250
+    },
+    {
+      "epoch": 0.05065086359722433,
+      "grad_norm": 1.0281180143356323,
+      "learning_rate": 4.994934143870315e-06,
+      "loss": 1.2333,
+      "step": 500
+    },
+    {
+      "epoch": 0.0759762953958365,
+      "grad_norm": 1.0643247365951538,
+      "learning_rate": 7.527862208713273e-06,
+      "loss": 1.1968,
+      "step": 750
+    },
+    {
+      "epoch": 0.10130172719444866,
+      "grad_norm": 1.0396573543548584,
+      "learning_rate": 1.006079027355623e-05,
+      "loss": 1.1765,
+      "step": 1000
+    },
+    {
+      "epoch": 0.12662715899306085,
+      "grad_norm": 1.0461889505386353,
+      "learning_rate": 1.259371833839919e-05,
+      "loss": 1.154,
+      "step": 1250
+    },
+    {
+      "epoch": 0.151952590791673,
+      "grad_norm": 0.9765501022338867,
+      "learning_rate": 1.5126646403242148e-05,
+      "loss": 1.1434,
+      "step": 1500
+    },
+    {
+      "epoch": 0.17727802259028516,
+      "grad_norm": 1.5702638626098633,
+      "learning_rate": 1.765957446808511e-05,
+      "loss": 1.1293,
+      "step": 1750
+    },
+    {
+      "epoch": 0.20260345438889732,
+      "grad_norm": 0.9935035705566406,
+      "learning_rate": 1.998986936816849e-05,
+      "loss": 1.1071,
+      "step": 2000
+    },
+    {
+      "epoch": 0.2279288861875095,
+      "grad_norm": 0.9715584516525269,
+      "learning_rate": 1.985657158091176e-05,
+      "loss": 1.0968,
+      "step": 2250
+    },
+    {
+      "epoch": 0.2532543179861217,
+      "grad_norm": 0.9306278228759766,
+      "learning_rate": 1.9723273793655026e-05,
+      "loss": 1.0923,
+      "step": 2500
+    },
+    {
+      "epoch": 0.2785797497847338,
+      "grad_norm": 1.0911701917648315,
+      "learning_rate": 1.9589976006398295e-05,
+      "loss": 1.0749,
+      "step": 2750
+    },
+    {
+      "epoch": 0.303905181583346,
+      "grad_norm": 1.024587869644165,
+      "learning_rate": 1.9456678219141565e-05,
+      "loss": 1.0695,
+      "step": 3000
+    },
+    {
+      "epoch": 0.32923061338195814,
+      "grad_norm": 0.9198426008224487,
+      "learning_rate": 1.932338043188483e-05,
+      "loss": 1.0677,
+      "step": 3250
+    },
+    {
+      "epoch": 0.3545560451805703,
+      "grad_norm": 0.97704017162323,
+      "learning_rate": 1.91900826446281e-05,
+      "loss": 1.0551,
+      "step": 3500
+    },
+    {
+      "epoch": 0.3798814769791825,
+      "grad_norm": 0.8315013647079468,
+      "learning_rate": 1.905678485737137e-05,
+      "loss": 1.0388,
+      "step": 3750
+    },
+    {
+      "epoch": 0.40520690877779464,
+      "grad_norm": 0.8992215991020203,
+      "learning_rate": 1.8923487070114637e-05,
+      "loss": 1.0451,
+      "step": 4000
+    },
+    {
+      "epoch": 0.4305323405764068,
+      "grad_norm": 1.0379681587219238,
+      "learning_rate": 1.8790189282857906e-05,
+      "loss": 1.0355,
+      "step": 4250
+    },
+    {
+      "epoch": 0.455857772375019,
+      "grad_norm": 0.9491779804229736,
+      "learning_rate": 1.8656891495601176e-05,
+      "loss": 1.0374,
+      "step": 4500
+    },
+    {
+      "epoch": 0.48118320417363114,
+      "grad_norm": 0.8982920050621033,
+      "learning_rate": 1.8523593708344442e-05,
+      "loss": 1.0267,
+      "step": 4750
+    },
+    {
+      "epoch": 0.5065086359722434,
+      "grad_norm": 0.8307796716690063,
+      "learning_rate": 1.8390295921087712e-05,
+      "loss": 1.0161,
+      "step": 5000
+    },
+    {
+      "epoch": 0.5318340677708555,
+      "grad_norm": 0.9225348830223083,
+      "learning_rate": 1.825699813383098e-05,
+      "loss": 1.0142,
+      "step": 5250
+    },
+    {
+      "epoch": 0.5571594995694676,
+      "grad_norm": 0.8506412506103516,
+      "learning_rate": 1.8123700346574248e-05,
+      "loss": 1.0091,
+      "step": 5500
+    },
+    {
+      "epoch": 0.5824849313680798,
+      "grad_norm": 0.9221129417419434,
+      "learning_rate": 1.7990402559317517e-05,
+      "loss": 1.0004,
+      "step": 5750
+    },
+    {
+      "epoch": 0.607810363166692,
+      "grad_norm": 0.9478843212127686,
+      "learning_rate": 1.7857104772060784e-05,
+      "loss": 1.0046,
+      "step": 6000
+    },
+    {
+      "epoch": 0.6331357949653041,
+      "grad_norm": 2.4678380489349365,
+      "learning_rate": 1.7723806984804053e-05,
+      "loss": 1.0008,
+      "step": 6250
+    },
+    {
+      "epoch": 0.6584612267639163,
+      "grad_norm": 0.8661640286445618,
+      "learning_rate": 1.7590509197547323e-05,
+      "loss": 1.0027,
+      "step": 6500
+    },
+    {
+      "epoch": 0.6837866585625285,
+      "grad_norm": 0.8918209671974182,
+      "learning_rate": 1.7457211410290592e-05,
+      "loss": 0.9995,
+      "step": 6750
+    },
+    {
+      "epoch": 0.7091120903611406,
+      "grad_norm": 0.8873420357704163,
+      "learning_rate": 1.732391362303386e-05,
+      "loss": 0.9932,
+      "step": 7000
+    },
+    {
+      "epoch": 0.7344375221597528,
+      "grad_norm": 0.8874693512916565,
+      "learning_rate": 1.719061583577713e-05,
+      "loss": 0.9785,
+      "step": 7250
+    },
+    {
+      "epoch": 0.759762953958365,
+      "grad_norm": 0.8625825643539429,
+      "learning_rate": 1.7057318048520395e-05,
+      "loss": 0.9817,
+      "step": 7500
+    },
+    {
+      "epoch": 0.7850883857569771,
+      "grad_norm": 0.9080167412757874,
+      "learning_rate": 1.6924020261263664e-05,
+      "loss": 0.9783,
+      "step": 7750
+    },
+    {
+      "epoch": 0.8104138175555893,
+      "grad_norm": 0.9515653848648071,
+      "learning_rate": 1.6790722474006934e-05,
+      "loss": 0.984,
+      "step": 8000
+    },
+    {
+      "epoch": 0.8357392493542015,
+      "grad_norm": 0.8340937495231628,
+      "learning_rate": 1.66574246867502e-05,
+      "loss": 0.9819,
+      "step": 8250
+    },
+    {
+      "epoch": 0.8610646811528136,
+      "grad_norm": 0.9471110701560974,
+      "learning_rate": 1.652412689949347e-05,
+      "loss": 0.974,
+      "step": 8500
+    },
+    {
+      "epoch": 0.8863901129514258,
+      "grad_norm": 0.8862300515174866,
+      "learning_rate": 1.639082911223674e-05,
+      "loss": 0.9653,
+      "step": 8750
+    },
+    {
+      "epoch": 0.911715544750038,
+      "grad_norm": 0.8293271064758301,
+      "learning_rate": 1.6257531324980006e-05,
+      "loss": 0.9615,
+      "step": 9000
+    },
+    {
+      "epoch": 0.9370409765486502,
+      "grad_norm": 0.9127717614173889,
+      "learning_rate": 1.6124233537723275e-05,
+      "loss": 0.9671,
+      "step": 9250
+    },
+    {
+      "epoch": 0.9623664083472623,
+      "grad_norm": 0.8955530524253845,
+      "learning_rate": 1.599093575046654e-05,
+      "loss": 0.9606,
+      "step": 9500
+    },
+    {
+      "epoch": 0.9876918401458745,
+      "grad_norm": 0.9058429002761841,
+      "learning_rate": 1.585763796320981e-05,
+      "loss": 0.9586,
+      "step": 9750
+    },
+    {
+      "epoch": 0.9999493491364028,
+      "eval_loss": 1.0199180841445923,
+      "eval_runtime": 1639.424,
+      "eval_samples_per_second": 40.384,
+      "eval_steps_per_second": 1.262,
+      "step": 9871
+    },
+    {
+      "epoch": 1.0130172719444868,
+      "grad_norm": 0.8560991287231445,
+      "learning_rate": 1.572434017595308e-05,
+      "loss": 0.939,
+      "step": 10000
+    },
+    {
+      "epoch": 1.0383427037430988,
+      "grad_norm": 0.8284989595413208,
+      "learning_rate": 1.559104238869635e-05,
+      "loss": 0.9317,
+      "step": 10250
+    },
+    {
+      "epoch": 1.063668135541711,
+      "grad_norm": 0.8807125091552734,
+      "learning_rate": 1.5457744601439617e-05,
+      "loss": 0.9363,
+      "step": 10500
+    },
+    {
+      "epoch": 1.088993567340323,
+      "grad_norm": 0.8796108961105347,
+      "learning_rate": 1.5324446814182886e-05,
+      "loss": 0.9236,
+      "step": 10750
+    },
+    {
+      "epoch": 1.1143189991389353,
+      "grad_norm": 0.8540758490562439,
+      "learning_rate": 1.5191149026926156e-05,
+      "loss": 0.9233,
+      "step": 11000
+    },
+    {
+      "epoch": 1.1396444309375475,
+      "grad_norm": 0.8657658696174622,
+      "learning_rate": 1.5057851239669424e-05,
+      "loss": 0.9346,
+      "step": 11250
+    },
+    {
+      "epoch": 1.1649698627361595,
+      "grad_norm": 0.8995893001556396,
+      "learning_rate": 1.4924553452412692e-05,
+      "loss": 0.917,
+      "step": 11500
+    },
+    {
+      "epoch": 1.1902952945347718,
+      "grad_norm": 0.9290043115615845,
+      "learning_rate": 1.479125566515596e-05,
+      "loss": 0.9338,
+      "step": 11750
+    },
+    {
+      "epoch": 1.215620726333384,
+      "grad_norm": 0.8952407836914062,
+      "learning_rate": 1.4657957877899228e-05,
+      "loss": 0.9257,
+      "step": 12000
+    },
+    {
+      "epoch": 1.240946158131996,
+      "grad_norm": 0.8839919567108154,
+      "learning_rate": 1.4524660090642497e-05,
+      "loss": 0.9104,
+      "step": 12250
+    },
+    {
+      "epoch": 1.2662715899306083,
+      "grad_norm": 0.9677265286445618,
+      "learning_rate": 1.4391362303385765e-05,
+      "loss": 0.9259,
+      "step": 12500
+    },
+    {
+      "epoch": 1.2915970217292205,
+      "grad_norm": 0.9325098395347595,
+      "learning_rate": 1.4258064516129033e-05,
+      "loss": 0.9203,
+      "step": 12750
+    },
+    {
+      "epoch": 1.3169224535278325,
+      "grad_norm": 0.778640866279602,
+      "learning_rate": 1.4124766728872301e-05,
+      "loss": 0.912,
+      "step": 13000
+    },
+    {
+      "epoch": 1.3422478853264448,
+      "grad_norm": 0.8638414144515991,
+      "learning_rate": 1.399146894161557e-05,
+      "loss": 0.9181,
+      "step": 13250
+    },
+    {
+      "epoch": 1.367573317125057,
+      "grad_norm": 1.0181560516357422,
+      "learning_rate": 1.3858171154358839e-05,
+      "loss": 0.9146,
+      "step": 13500
+    },
+    {
+      "epoch": 1.392898748923669,
+      "grad_norm": 0.9884174466133118,
+      "learning_rate": 1.3724873367102107e-05,
+      "loss": 0.9037,
+      "step": 13750
+    },
+    {
+      "epoch": 1.4182241807222813,
+      "grad_norm": 1.1058709621429443,
+      "learning_rate": 1.3591575579845375e-05,
+      "loss": 0.9159,
+      "step": 14000
+    },
+    {
+      "epoch": 1.4435496125208935,
+      "grad_norm": 1.0129822492599487,
+      "learning_rate": 1.3458277792588642e-05,
+      "loss": 0.8993,
+      "step": 14250
+    },
+    {
+      "epoch": 1.4688750443195056,
+      "grad_norm": 0.8782840967178345,
+      "learning_rate": 1.3324980005331914e-05,
+      "loss": 0.9061,
+      "step": 14500
+    },
+    {
+      "epoch": 1.4942004761181178,
+      "grad_norm": 0.9015256762504578,
+      "learning_rate": 1.3191682218075182e-05,
+      "loss": 0.9122,
+      "step": 14750
+    },
+    {
+      "epoch": 1.51952590791673,
+      "grad_norm": 0.9482327103614807,
+      "learning_rate": 1.305838443081845e-05,
+      "loss": 0.9091,
+      "step": 15000
+    },
+    {
+      "epoch": 1.544851339715342,
+      "grad_norm": 0.8400648236274719,
+      "learning_rate": 1.2925086643561718e-05,
+      "loss": 0.9064,
+      "step": 15250
+    },
+    {
+      "epoch": 1.5701767715139543,
+      "grad_norm": 0.9606112837791443,
+      "learning_rate": 1.2791788856304987e-05,
+      "loss": 0.8983,
+      "step": 15500
+    },
+    {
+      "epoch": 1.5955022033125665,
+      "grad_norm": 0.944854736328125,
+      "learning_rate": 1.2658491069048255e-05,
+      "loss": 0.9055,
+      "step": 15750
+    },
+    {
+      "epoch": 1.6208276351111786,
+      "grad_norm": 0.8674355745315552,
+      "learning_rate": 1.2525193281791523e-05,
+      "loss": 0.8895,
+      "step": 16000
+    },
+    {
+      "epoch": 1.6461530669097908,
+      "grad_norm": 0.8632267713546753,
+      "learning_rate": 1.2391895494534791e-05,
+      "loss": 0.8876,
+      "step": 16250
+    },
+    {
+      "epoch": 1.671478498708403,
+      "grad_norm": 0.903851330280304,
+      "learning_rate": 1.2258597707278059e-05,
+      "loss": 0.9007,
+      "step": 16500
+    },
+    {
+      "epoch": 1.696803930507015,
+      "grad_norm": 0.9242746829986572,
+      "learning_rate": 1.2125833111170356e-05,
+      "loss": 0.8953,
+      "step": 16750
+    },
+    {
+      "epoch": 1.7221293623056273,
+      "grad_norm": 0.9627535343170166,
+      "learning_rate": 1.1992535323913624e-05,
+      "loss": 0.8881,
+      "step": 17000
+    },
+    {
+      "epoch": 1.7474547941042395,
+      "grad_norm": 0.8524439334869385,
+      "learning_rate": 1.1859237536656893e-05,
+      "loss": 0.8913,
+      "step": 17250
+    },
+    {
+      "epoch": 1.7727802259028516,
+      "grad_norm": 0.9666581749916077,
+      "learning_rate": 1.1726472940549186e-05,
+      "loss": 0.8972,
+      "step": 17500
+    },
+    {
+      "epoch": 1.7981056577014638,
+      "grad_norm": 0.8809413909912109,
+      "learning_rate": 1.1593175153292454e-05,
+      "loss": 0.8858,
+      "step": 17750
+    },
+    {
+      "epoch": 1.823431089500076,
+      "grad_norm": 0.903626024723053,
+      "learning_rate": 1.1459877366035726e-05,
+      "loss": 0.8907,
+      "step": 18000
+    },
+    {
+      "epoch": 1.848756521298688,
+      "grad_norm": 0.8203657865524292,
+      "learning_rate": 1.1326579578778994e-05,
+      "loss": 0.8849,
+      "step": 18250
+    },
+    {
+      "epoch": 1.8740819530973003,
+      "grad_norm": 0.8978894948959351,
+      "learning_rate": 1.1193281791522262e-05,
+      "loss": 0.8831,
+      "step": 18500
+    },
+    {
+      "epoch": 1.8994073848959125,
+      "grad_norm": 0.9283676743507385,
+      "learning_rate": 1.105998400426553e-05,
+      "loss": 0.8828,
+      "step": 18750
+    },
+    {
+      "epoch": 1.9247328166945246,
+      "grad_norm": 0.9514400959014893,
+      "learning_rate": 1.09266862170088e-05,
+      "loss": 0.8761,
+      "step": 19000
+    },
+    {
+      "epoch": 1.9500582484931368,
+      "grad_norm": 0.8809083104133606,
+      "learning_rate": 1.0793388429752067e-05,
+      "loss": 0.8861,
+      "step": 19250
+    },
+    {
+      "epoch": 1.975383680291749,
+      "grad_norm": 0.8767380118370056,
+      "learning_rate": 1.0660090642495335e-05,
+      "loss": 0.8811,
+      "step": 19500
+    },
+    {
+      "epoch": 2.0,
+      "eval_loss": 1.0132982730865479,
+      "eval_runtime": 1638.7739,
+      "eval_samples_per_second": 40.4,
+      "eval_steps_per_second": 1.263,
+      "step": 19743
+    },
+    {
+      "epoch": 2.000709112090361,
+      "grad_norm": 0.9499657154083252,
+      "learning_rate": 1.0526792855238603e-05,
+      "loss": 0.8821,
+      "step": 19750
+    },
+    {
+      "epoch": 2.0260345438889735,
+      "grad_norm": 0.7973400950431824,
+      "learning_rate": 1.0393495067981871e-05,
+      "loss": 0.8608,
+      "step": 20000
+    },
+    {
+      "epoch": 2.0513599756875855,
+      "grad_norm": 0.9120876789093018,
+      "learning_rate": 1.0260197280725142e-05,
+      "loss": 0.8574,
+      "step": 20250
+    },
+    {
+      "epoch": 2.0766854074861976,
+      "grad_norm": 0.8534213304519653,
+      "learning_rate": 1.012689949346841e-05,
+      "loss": 0.8604,
+      "step": 20500
+    },
+    {
+      "epoch": 2.1020108392848096,
+      "grad_norm": 0.8855152726173401,
+      "learning_rate": 9.993601706211678e-06,
+      "loss": 0.8696,
+      "step": 20750
+    },
+    {
+      "epoch": 2.127336271083422,
+      "grad_norm": 0.9473037123680115,
+      "learning_rate": 9.860303918954946e-06,
+      "loss": 0.859,
+      "step": 21000
+    },
+    {
+      "epoch": 2.152661702882034,
+      "grad_norm": 0.9424493908882141,
+      "learning_rate": 9.727006131698214e-06,
+      "loss": 0.8577,
+      "step": 21250
+    },
+    {
+      "epoch": 2.177987134680646,
+      "grad_norm": 0.8066829442977905,
+      "learning_rate": 9.593708344441484e-06,
+      "loss": 0.8611,
+      "step": 21500
+    },
+    {
+      "epoch": 2.2033125664792585,
+      "grad_norm": 0.9358799457550049,
+      "learning_rate": 9.460410557184752e-06,
+      "loss": 0.8572,
+      "step": 21750
+    },
+    {
+      "epoch": 2.2286379982778706,
+      "grad_norm": 0.9982028007507324,
+      "learning_rate": 9.32711276992802e-06,
+      "loss": 0.8492,
+      "step": 22000
+    },
+    {
+      "epoch": 2.2539634300764826,
+      "grad_norm": 0.8830463290214539,
+      "learning_rate": 9.19381498267129e-06,
+      "loss": 0.8539,
+      "step": 22250
+    },
+    {
+      "epoch": 2.279288861875095,
+      "grad_norm": 0.9708883762359619,
+      "learning_rate": 9.061050386563584e-06,
+      "loss": 0.8495,
+      "step": 22500
+    },
+    {
+      "epoch": 2.304614293673707,
+      "grad_norm": 0.8464154005050659,
+      "learning_rate": 8.927752599306852e-06,
+      "loss": 0.8519,
+      "step": 22750
+    },
+    {
+      "epoch": 2.329939725472319,
+      "grad_norm": 0.9446752667427063,
+      "learning_rate": 8.79445481205012e-06,
+      "loss": 0.8545,
+      "step": 23000
+    },
+    {
+      "epoch": 2.3552651572709316,
+      "grad_norm": 0.9621785283088684,
+      "learning_rate": 8.66115702479339e-06,
+      "loss": 0.8482,
+      "step": 23250
+    },
+    {
+      "epoch": 2.3805905890695436,
+      "grad_norm": 0.9067039489746094,
+      "learning_rate": 8.527859237536658e-06,
+      "loss": 0.8582,
+      "step": 23500
+    },
+    {
+      "epoch": 2.4059160208681556,
+      "grad_norm": 0.9858378767967224,
+      "learning_rate": 8.394561450279926e-06,
+      "loss": 0.8565,
+      "step": 23750
+    },
+    {
+      "epoch": 2.431241452666768,
+      "grad_norm": 0.9362533092498779,
+      "learning_rate": 8.261263663023195e-06,
+      "loss": 0.8531,
+      "step": 24000
+    },
+    {
+      "epoch": 2.45656688446538,
+      "grad_norm": 0.9225192666053772,
+      "learning_rate": 8.127965875766463e-06,
+      "loss": 0.8555,
+      "step": 24250
+    },
+    {
+      "epoch": 2.481892316263992,
+      "grad_norm": 0.9358901381492615,
+      "learning_rate": 7.994668088509731e-06,
+      "loss": 0.8523,
+      "step": 24500
+    },
+    {
+      "epoch": 2.5072177480626046,
+      "grad_norm": 0.9531691670417786,
+      "learning_rate": 7.861370301252999e-06,
+      "loss": 0.8443,
+      "step": 24750
+    },
+    {
+      "epoch": 2.5325431798612166,
+      "grad_norm": 0.9370359182357788,
+      "learning_rate": 7.728072513996269e-06,
+      "loss": 0.8451,
+      "step": 25000
+    },
+    {
+      "epoch": 2.5578686116598286,
+      "grad_norm": 0.8840625882148743,
+      "learning_rate": 7.5947747267395365e-06,
+      "loss": 0.8533,
+      "step": 25250
+    },
+    {
+      "epoch": 2.583194043458441,
+      "grad_norm": 0.9283475875854492,
+      "learning_rate": 7.461476939482805e-06,
+      "loss": 0.8425,
+      "step": 25500
+    },
+    {
+      "epoch": 2.608519475257053,
+      "grad_norm": 0.908301055431366,
+      "learning_rate": 7.328179152226073e-06,
+      "loss": 0.8433,
+      "step": 25750
+    },
+    {
+      "epoch": 2.633844907055665,
+      "grad_norm": 0.9126138091087341,
+      "learning_rate": 7.194881364969342e-06,
+      "loss": 0.8401,
+      "step": 26000
+    },
+    {
+      "epoch": 2.6591703388542776,
+      "grad_norm": 0.8935621976852417,
+      "learning_rate": 7.061583577712611e-06,
+      "loss": 0.8418,
+      "step": 26250
+    },
+    {
+      "epoch": 2.6844957706528896,
+      "grad_norm": 0.8745056986808777,
+      "learning_rate": 6.928285790455879e-06,
+      "loss": 0.837,
+      "step": 26500
+    },
+    {
+      "epoch": 2.7098212024515016,
+      "grad_norm": 0.948512077331543,
+      "learning_rate": 6.795521194348175e-06,
+      "loss": 0.8411,
+      "step": 26750
+    },
+    {
+      "epoch": 2.735146634250114,
+      "grad_norm": 1.008754014968872,
+      "learning_rate": 6.6622234070914425e-06,
+      "loss": 0.8355,
+      "step": 27000
+    },
+    {
+      "epoch": 2.760472066048726,
+      "grad_norm": 1.0162386894226074,
+      "learning_rate": 6.528925619834712e-06,
+      "loss": 0.8405,
+      "step": 27250
+    },
+    {
+      "epoch": 2.785797497847338,
+      "grad_norm": 0.9260863661766052,
+      "learning_rate": 6.39562783257798e-06,
+      "loss": 0.8354,
+      "step": 27500
+    },
+    {
+      "epoch": 2.8111229296459506,
+      "grad_norm": 0.9513674378395081,
+      "learning_rate": 6.262330045321248e-06,
+      "loss": 0.8392,
+      "step": 27750
+    },
+    {
+      "epoch": 2.8364483614445626,
+      "grad_norm": 1.0211256742477417,
+      "learning_rate": 6.129032258064517e-06,
+      "loss": 0.8324,
+      "step": 28000
+    },
+    {
+      "epoch": 2.8617737932431746,
+      "grad_norm": 0.9345864057540894,
+      "learning_rate": 5.995734470807785e-06,
+      "loss": 0.8412,
+      "step": 28250
+    },
+    {
+      "epoch": 2.887099225041787,
+      "grad_norm": 0.8973652124404907,
+      "learning_rate": 5.8624366835510535e-06,
+      "loss": 0.8368,
+      "step": 28500
+    },
+    {
+      "epoch": 2.912424656840399,
+      "grad_norm": 0.8682575225830078,
+      "learning_rate": 5.7291388962943215e-06,
+      "loss": 0.8316,
+      "step": 28750
+    },
+    {
+      "epoch": 2.937750088639011,
+      "grad_norm": 0.9307655096054077,
+      "learning_rate": 5.595841109037591e-06,
+      "loss": 0.8365,
+      "step": 29000
+    },
+    {
+      "epoch": 2.9630755204376236,
+      "grad_norm": 1.0200505256652832,
+      "learning_rate": 5.462543321780859e-06,
+      "loss": 0.8395,
+      "step": 29250
+    },
+    {
+      "epoch": 2.9884009522362356,
+      "grad_norm": 1.0397480726242065,
+      "learning_rate": 5.329245534524128e-06,
+      "loss": 0.834,
+      "step": 29500
+    },
+    {
+      "epoch": 2.999949349136403,
+      "eval_loss": 1.0098419189453125,
+      "eval_runtime": 1638.8769,
+      "eval_samples_per_second": 40.398,
+      "eval_steps_per_second": 1.262,
+      "step": 29614
+    },
+    {
+      "epoch": 3.0137263840348476,
+      "grad_norm": 1.0044381618499756,
+      "learning_rate": 5.195947747267396e-06,
+      "loss": 0.8305,
+      "step": 29750
+    },
+    {
+      "epoch": 3.03905181583346,
+      "grad_norm": 1.0036746263504028,
+      "learning_rate": 5.062649960010664e-06,
+      "loss": 0.8284,
+      "step": 30000
+    },
+    {
+      "epoch": 3.064377247632072,
+      "grad_norm": 0.8896681070327759,
+      "learning_rate": 4.9293521727539325e-06,
+      "loss": 0.8296,
+      "step": 30250
+    },
+    {
+      "epoch": 3.089702679430684,
+      "grad_norm": 0.9313392639160156,
+      "learning_rate": 4.796587576646228e-06,
+      "loss": 0.8249,
+      "step": 30500
+    },
+    {
+      "epoch": 3.1150281112292966,
+      "grad_norm": 0.9054111838340759,
+      "learning_rate": 4.663822980538523e-06,
+      "loss": 0.8217,
+      "step": 30750
+    },
+    {
+      "epoch": 3.1403535430279086,
+      "grad_norm": 0.9691897630691528,
+      "learning_rate": 4.530525193281792e-06,
+      "loss": 0.8182,
+      "step": 31000
+    },
+    {
+      "epoch": 3.1656789748265206,
+      "grad_norm": 1.0348809957504272,
+      "learning_rate": 4.39722740602506e-06,
+      "loss": 0.8205,
+      "step": 31250
+    },
+    {
+      "epoch": 3.191004406625133,
+      "grad_norm": 0.8919842839241028,
+      "learning_rate": 4.263929618768329e-06,
+      "loss": 0.8142,
+      "step": 31500
+    },
+    {
+      "epoch": 3.216329838423745,
+      "grad_norm": 0.8951621651649475,
+      "learning_rate": 4.130631831511598e-06,
+      "loss": 0.8264,
+      "step": 31750
+    },
+    {
+      "epoch": 3.241655270222357,
+      "grad_norm": 0.9754297733306885,
+      "learning_rate": 3.9973340442548655e-06,
+      "loss": 0.8176,
+      "step": 32000
+    },
+    {
+      "epoch": 3.2669807020209696,
+      "grad_norm": 1.0148935317993164,
+      "learning_rate": 3.864036256998134e-06,
+      "loss": 0.8252,
+      "step": 32250
+    },
+    {
+      "epoch": 3.2923061338195816,
+      "grad_norm": 0.9025924801826477,
+      "learning_rate": 3.7307384697414027e-06,
+      "loss": 0.816,
+      "step": 32500
+    },
+    {
+      "epoch": 3.3176315656181936,
+      "grad_norm": 0.9092098474502563,
+      "learning_rate": 3.597440682484671e-06,
+      "loss": 0.8201,
+      "step": 32750
+    },
+    {
+      "epoch": 3.342956997416806,
+      "grad_norm": 0.9289584755897522,
+      "learning_rate": 3.4641428952279394e-06,
+      "loss": 0.8161,
+      "step": 33000
+    },
+    {
+      "epoch": 3.368282429215418,
+      "grad_norm": 0.9961521029472351,
+      "learning_rate": 3.3308451079712077e-06,
+      "loss": 0.8188,
+      "step": 33250
+    },
+    {
+      "epoch": 3.39360786101403,
+      "grad_norm": 1.0093194246292114,
+      "learning_rate": 3.1975473207144765e-06,
+      "loss": 0.8188,
+      "step": 33500
+    },
+    {
+      "epoch": 3.4189332928126426,
+      "grad_norm": 1.016876220703125,
+      "learning_rate": 3.064249533457745e-06,
+      "loss": 0.8119,
+      "step": 33750
+    },
+    {
+      "epoch": 3.4442587246112546,
+      "grad_norm": 1.0821869373321533,
+      "learning_rate": 2.9309517462010133e-06,
+      "loss": 0.8221,
+      "step": 34000
+    },
+    {
+      "epoch": 3.4695841564098666,
+      "grad_norm": 0.9766045808792114,
+      "learning_rate": 2.797653958944282e-06,
+      "loss": 0.8132,
+      "step": 34250
+    },
+    {
+      "epoch": 3.494909588208479,
+      "grad_norm": 0.8767127990722656,
+      "learning_rate": 2.6643561716875504e-06,
+      "loss": 0.8166,
+      "step": 34500
+    },
+    {
+      "epoch": 3.520235020007091,
+      "grad_norm": 0.9653995633125305,
+      "learning_rate": 2.5310583844308183e-06,
+      "loss": 0.8074,
+      "step": 34750
+    },
+    {
+      "epoch": 3.545560451805703,
+      "grad_norm": 0.8945389986038208,
+      "learning_rate": 2.397760597174087e-06,
+      "loss": 0.8176,
+      "step": 35000
+    },
+    {
+      "epoch": 3.5708858836043156,
+      "grad_norm": 0.9447450637817383,
+      "learning_rate": 2.2649960010663825e-06,
+      "loss": 0.8235,
+      "step": 35250
+    },
+    {
+      "epoch": 3.5962113154029276,
+      "grad_norm": 1.0400015115737915,
+      "learning_rate": 2.131698213809651e-06,
+      "loss": 0.8136,
+      "step": 35500
+    },
+    {
+      "epoch": 3.6215367472015396,
+      "grad_norm": 0.9300839900970459,
+      "learning_rate": 1.9984004265529192e-06,
+      "loss": 0.8197,
+      "step": 35750
+    },
+    {
+      "epoch": 3.646862179000152,
+      "grad_norm": 0.9101824164390564,
+      "learning_rate": 1.865102639296188e-06,
+      "loss": 0.8078,
+      "step": 36000
+    },
+    {
+      "epoch": 3.672187610798764,
+      "grad_norm": 0.9514500498771667,
+      "learning_rate": 1.7318048520394562e-06,
+      "loss": 0.8084,
+      "step": 36250
+    },
+    {
+      "epoch": 3.697513042597376,
+      "grad_norm": 0.9441540241241455,
+      "learning_rate": 1.5985070647827248e-06,
+      "loss": 0.8213,
+      "step": 36500
+    },
+    {
+      "epoch": 3.7228384743959886,
+      "grad_norm": 1.0184293985366821,
+      "learning_rate": 1.4652092775259933e-06,
+      "loss": 0.8094,
+      "step": 36750
+    },
+    {
+      "epoch": 3.7481639061946006,
+      "grad_norm": 0.991316556930542,
+      "learning_rate": 1.3319114902692617e-06,
+      "loss": 0.8106,
+      "step": 37000
+    },
+    {
+      "epoch": 3.7734893379932126,
+      "grad_norm": 0.9887702465057373,
+      "learning_rate": 1.19861370301253e-06,
+      "loss": 0.8185,
+      "step": 37250
+    },
+    {
+      "epoch": 3.798814769791825,
+      "grad_norm": 0.9897658228874207,
+      "learning_rate": 1.0653159157557984e-06,
+      "loss": 0.8069,
+      "step": 37500
+    },
+    {
+      "epoch": 3.824140201590437,
+      "grad_norm": 0.9137114882469177,
+      "learning_rate": 9.32018128499067e-07,
+      "loss": 0.8114,
+      "step": 37750
+    },
+    {
+      "epoch": 3.849465633389049,
+      "grad_norm": 0.9579175710678101,
+      "learning_rate": 7.987203412423355e-07,
+      "loss": 0.819,
+      "step": 38000
+    },
+    {
+      "epoch": 3.8747910651876616,
+      "grad_norm": 0.9389879107475281,
+      "learning_rate": 6.654225539856039e-07,
+      "loss": 0.8165,
+      "step": 38250
+    },
+    {
+      "epoch": 3.9001164969862736,
+      "grad_norm": 0.9765516519546509,
+      "learning_rate": 5.321247667288723e-07,
+      "loss": 0.8191,
+      "step": 38500
+    },
+    {
+      "epoch": 3.9254419287848856,
+      "grad_norm": 1.0299735069274902,
+      "learning_rate": 3.9882697947214085e-07,
+      "loss": 0.8182,
+      "step": 38750
+    },
+    {
+      "epoch": 3.950767360583498,
+      "grad_norm": 0.9844255447387695,
+      "learning_rate": 2.6552919221540927e-07,
+      "loss": 0.8175,
+      "step": 39000
+    },
+    {
+      "epoch": 3.97609279238211,
+      "grad_norm": 0.9425697922706604,
+      "learning_rate": 1.3223140495867768e-07,
+      "loss": 0.813,
+      "step": 39250
+    },
+    {
+      "epoch": 3.999797396545611,
+      "eval_loss": 1.0085912942886353,
+      "eval_runtime": 1638.7267,
+      "eval_samples_per_second": 40.401,
+      "eval_steps_per_second": 1.263,
+      "step": 39484
+    },
+    {
+      "epoch": 3.999797396545611,
+      "step": 39484,
+      "total_flos": 3.077643948911493e+18,
+      "train_loss": 0.9114745237365729,
+      "train_runtime": 159688.734,
+      "train_samples_per_second": 15.825,
+      "train_steps_per_second": 0.247
+    }
+  ],
+  "logging_steps": 250,
+  "max_steps": 39484,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 4,
+  "save_steps": 500,
+  "total_flos": 3.077643948911493e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:aecc845e0935bcd44dbbe3b9f46d94173a31bb0bc3a77f8b9c2482246435ad0f
+size 5240

vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff