24/08/18 Initial Commit

Browse files

Files changed (13) hide show

README.md +142 -3
config.json +31 -0
generation_config.json +6 -0
model.safetensors +3 -0
optimizer.pt +3 -0
pytorch_model.bin +3 -0
rng_state.pth +3 -0
scheduler.pt +3 -0
special_tokens_map.json +24 -0
tokenizer.model +3 -0
tokenizer_config.json +43 -0
trainer_state.json +0 -0
training_args.bin +3 -0

README.md CHANGED Viewed

@@ -1,3 +1,142 @@
----
-license: apache-2.0
----

+---
+license: apache-2.0
+base_model: keeeeenw/MicroLlama
+tags:
+- generated_from_trainer
+model-index:
+- name: medusa-microllama_305M_stage2
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+[<img src="https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/OpenAccess-AI-Collective/axolotl)
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="200" height="32"/>](https://wandb.ai/momorami-kaist/medusa_test/runs/eg00n44l)
+# medusa-microllama_305M_stage2
+This model is a fine-tuned version of [keeeeenw/MicroLlama](https://huggingface.co/keeeeenw/MicroLlama) on an unspecified dataset (no dataset name was recorded by the Trainer).
+It achieves the following results on the evaluation set:
+- Loss: 3.5262
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 0.0005
+- train_batch_size: 2
+- eval_batch_size: 2
+- seed: 42
+- gradient_accumulation_steps: 4
+- total_train_batch_size: 8
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_steps: 40
+- num_epochs: 2
+### Training results
+| Training Loss | Epoch  | Step | Validation Loss |
+|:-------------:|:------:|:----:|:---------------:|
+| 4.6913        | 0.0244 | 40   | 4.7578          |
+| 4.8782        | 0.0489 | 80   | 4.8017          |
+| 4.642         | 0.0733 | 120  | 4.7973          |
+| 4.4601        | 0.0978 | 160  | 4.7589          |
+| 4.4806        | 0.1222 | 200  | 4.6955          |
+| 4.4856        | 0.1467 | 240  | 4.6196          |
+| 4.4671        | 0.1711 | 280  | 4.5750          |
+| 4.3228        | 0.1955 | 320  | 4.5563          |
+| 4.1184        | 0.2200 | 360  | 4.5274          |
+| 3.9986        | 0.2444 | 400  | 4.5031          |
+| 4.2603        | 0.2689 | 440  | 4.4637          |
+| 4.1344        | 0.2933 | 480  | 4.4349          |
+| 4.1973        | 0.3178 | 520  | 4.4106          |
+| 4.3961        | 0.3422 | 560  | 4.4202          |
+| 4.1814        | 0.3666 | 600  | 4.3732          |
+| 4.1685        | 0.3911 | 640  | 4.3877          |
+| 4.3108        | 0.4155 | 680  | 4.3262          |
+| 4.6294        | 0.4400 | 720  | 4.3108          |
+| 4.3653        | 0.4644 | 760  | 4.2880          |
+| 4.1505        | 0.4888 | 800  | 4.2835          |
+| 3.8278        | 0.5133 | 840  | 4.2623          |
+| 4.3567        | 0.5377 | 880  | 4.2253          |
+| 4.2782        | 0.5622 | 920  | 4.1919          |
+| 4.1025        | 0.5866 | 960  | 4.1846          |
+| 4.2819        | 0.6111 | 1000 | 4.1637          |
+| 3.9919        | 0.6355 | 1040 | 4.1323          |
+| 4.1932        | 0.6599 | 1080 | 4.1017          |
+| 4.0949        | 0.6844 | 1120 | 4.1085          |
+| 3.7266        | 0.7088 | 1160 | 4.0668          |
+| 4.1255        | 0.7333 | 1200 | 4.0500          |
+| 4.3707        | 0.7577 | 1240 | 4.0207          |
+| 4.1965        | 0.7822 | 1280 | 4.0065          |
+| 3.4585        | 0.8066 | 1320 | 3.9363          |
+| 3.7242        | 0.8310 | 1360 | 3.8893          |
+| 3.9228        | 0.8555 | 1400 | 3.8569          |
+| 4.2051        | 0.8799 | 1440 | 3.8412          |
+| 3.6795        | 0.9044 | 1480 | 3.8245          |
+| 3.2453        | 0.9288 | 1520 | 3.8132          |
+| 3.5941        | 0.9533 | 1560 | 3.7907          |
+| 3.6246        | 0.9777 | 1600 | 3.7573          |
+| 2.8637        | 1.0021 | 1640 | 3.7530          |
+| 2.8495        | 1.0266 | 1680 | 3.7741          |
+| 3.0246        | 1.0510 | 1720 | 3.7690          |
+| 2.99          | 1.0755 | 1760 | 3.7464          |
+| 3.1902        | 1.0999 | 1800 | 3.7347          |
+| 2.8099        | 1.1244 | 1840 | 3.7278          |
+| 2.7652        | 1.1488 | 1880 | 3.7245          |
+| 2.6362        | 1.1732 | 1920 | 3.7034          |
+| 2.8562        | 1.1977 | 1960 | 3.6871          |
+| 3.1712        | 1.2221 | 2000 | 3.6786          |
+| 2.7405        | 1.2466 | 2040 | 3.6709          |
+| 2.734         | 1.2710 | 2080 | 3.6404          |
+| 3.1788        | 1.2954 | 2120 | 3.6310          |
+| 2.9609        | 1.3199 | 2160 | 3.6176          |
+| 3.0737        | 1.3443 | 2200 | 3.6136          |
+| 2.751         | 1.3688 | 2240 | 3.5960          |
+| 2.7105        | 1.3932 | 2280 | 3.5872          |
+| 2.8158        | 1.4177 | 2320 | 3.5848          |
+| 3.03          | 1.4421 | 2360 | 3.5679          |
+| 2.8122        | 1.4665 | 2400 | 3.5718          |
+| 2.5581        | 1.4910 | 2440 | 3.5568          |
+| 2.9845        | 1.5154 | 2480 | 3.5496          |
+| 2.83          | 1.5399 | 2520 | 3.5440          |
+| 2.7004        | 1.5643 | 2560 | 3.5402          |
+| 2.8271        | 1.5888 | 2600 | 3.5406          |
+| 2.5315        | 1.6132 | 2640 | 3.5316          |
+| 2.6001        | 1.6376 | 2680 | 3.5346          |
+| 2.4959        | 1.6621 | 2720 | 3.5298          |
+| 2.9174        | 1.6865 | 2760 | 3.5304          |
+| 2.7219        | 1.7110 | 2800 | 3.5286          |
+| 2.5395        | 1.7354 | 2840 | 3.5279          |
+| 2.7464        | 1.7599 | 2880 | 3.5284          |
+| 2.7532        | 1.7843 | 2920 | 3.5274          |
+| 2.6472        | 1.8087 | 2960 | 3.5270          |
+| 2.8263        | 1.8332 | 3000 | 3.5268          |
+| 2.916         | 1.8576 | 3040 | 3.5263          |
+| 3.0202        | 1.8821 | 3080 | 3.5262          |
+| 2.7152        | 1.9065 | 3120 | 3.5261          |
+| 2.7628        | 1.9310 | 3160 | 3.5261          |
+| 2.783         | 1.9554 | 3200 | 3.5263          |
+| 3.2587        | 1.9798 | 3240 | 3.5262          |
+### Framework versions
+- Transformers 4.43.0
+- Pytorch 2.3.1
+- Datasets 2.15.0
+- Tokenizers 0.19.1

config.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "_name_or_path": "keeeeenw/MicroLlama",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "hidden_act": "silu",
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 5632,
+  "max_position_embeddings": 2048,
+  "medusa_num_heads": 4,
+  "medusa_num_layers": 1,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 12,
+  "num_key_value_heads": 4,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": null,
+  "rope_theta": 10000.0,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.43.0",
+  "use_cache": false,
+  "vocab_size": 32000
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "transformers_version": "4.43.0"
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:be4eb34ca0e3fc34f34baac894f0f44680a3f058668a92ad29c0e762bf753f32
+size 879828432

optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:30135acbe7354f09dd92c92216071e9d4d877727c1b50266f08977b59f3a744a
+size 881840890

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:742702a217271b44f6ba575b260d03262a496587d84b8b63e8e3f4397032ac7c
+size 879855990

rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9196a1e708bf24d6abba41cce3f8558820acc3e50f9394c5955e29eb41ffea3d
+size 14244

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8039d280dad3070e169b806490ff8e0bd5e1077911a754b77ef2909860fde236
+size 1064

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "</s>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+size 499723

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": true,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "bos_token": "<s>",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": true,
+  "model_max_length": 2048,
+  "pad_token": "</s>",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false,
+  "use_fast": true
+}

trainer_state.json ADDED Viewed

The diff for this file is too large to render. See raw diff

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9fe8eb74cf7eb4ffabe9c2817f4ff602404e640fa2f457eb78e8f5a5d58eab00
+size 5624