Delta-Vector committed
Commit a576a5c · verified · 1 Parent(s): a4c6e28

Update README.md

Files changed (1):
  1. README.md +0 -170
README.md CHANGED
@@ -1,170 +0,0 @@
- ---
- library_name: peft
- tags:
- - generated_from_trainer
- datasets:
- - Mielikki/Erebus-87k
- - NewEden/Orion-Asstr-Stories-16K
- base_model: unsloth_phi-4
- model-index:
- - name: phi4-pt-out-r2
-   results: []
- ---
-
- <!-- This model card has been generated automatically according to the information the Trainer had access to. You
- should probably proofread and complete it, then remove this comment. -->
-
- [<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
- <details><summary>See axolotl config</summary>
-
- axolotl version: `0.6.0`
- ```yaml
- base_model: unsloth_phi-4
- model_type: AutoModelForCausalLM
- tokenizer_type: AutoTokenizer
-
- #hub_model_id: NewEden/Phi4-pretrain
- #hub_strategy: "all_checkpoints"
- #push_dataset_to_hub:
- #hf_use_auth_token: true
-
- plugins:
- - axolotl.integrations.liger.LigerPlugin
- liger_rope: true
- liger_rms_norm: true
- liger_swiglu: true
- liger_fused_linear_cross_entropy: true
-
- #plugins:
- # - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
-
- #cut_cross_entropy: true
-
- load_in_8bit: false
- load_in_4bit: false
- strict: false
-
- datasets:
- - path: Mielikki/Erebus-87k
-   type: completion
-   field: body
- - path: NewEden/Orion-Asstr-Stories-16K
-   type: completion
-   field: content
- shuffle_merged_datasets: true
- dataset_prepared_path: prepared_data
- val_set_size: 0.0
- output_dir: ./phi4-pt-out-r2
-
- sequence_len: 16384
- sample_packing: true
- pad_to_sequence_len: true
-
- adapter: lora
- lora_model_dir:
- lora_r: 128
- lora_alpha: 16
- lora_dropout: 0.05
- lora_target_modules:
- - gate_proj
- - down_proj
- - up_proj
- - q_proj
- - v_proj
- - k_proj
- - o_proj
-
- lora_modules_to_save:
- - embed_tokens
- - lm_head
-
-
- wandb_project: mag-phi
- wandb_entity:
- wandb_watch:
- wandb_name: attempt-02
- wandb_log_model:
-
- gradient_accumulation_steps: 4
- micro_batch_size: 2
- num_epochs: 1
- optimizer: paged_ademamix_8bit
- lr_scheduler: cosine
- learning_rate: 0.00001
-
- train_on_inputs: false
- group_by_length: false
- bf16: auto
- fp16:
- tf32: false
-
- gradient_checkpointing: unsloth
- early_stopping_patience:
- resume_from_checkpoint:
- local_rank:
- logging_steps: 1
- xformers_attention:
- flash_attention: true
-
- warmup_steps: 10
- evals_per_epoch: 4
- eval_table_size:
- eval_max_new_tokens: 128
- saves_per_epoch: 4
- debug:
- deepspeed: /workspace/axolotl/deepspeed_configs/zero3_bf16_cpuoffload_params.json
- weight_decay: 0.01
- fsdp:
- fsdp_config:
-
- ```
-
- </details><br>
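
The config above trains a LoRA adapter on top of a base model rather than a full set of weights. Below is a minimal loading sketch (not part of the original card); the repo ids `unsloth/phi-4` and `NewEden/phi4-pt-out-r2` are assumptions standing in for the actual base-model and adapter locations.

```python
# Hypothetical loading sketch -- the repo ids below are assumptions, not
# confirmed locations of this adapter or its base model.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

BASE_ID = "unsloth/phi-4"              # assumed Hub mirror of the local "unsloth_phi-4" base
ADAPTER_ID = "NewEden/phi4-pt-out-r2"  # assumed upload of the ./phi4-pt-out-r2 output dir

tokenizer = AutoTokenizer.from_pretrained(BASE_ID)
base = AutoModelForCausalLM.from_pretrained(BASE_ID, torch_dtype=torch.bfloat16)

# Attach the LoRA weights; embed_tokens and lm_head (lora_modules_to_save)
# are restored from the adapter checkpoint by PEFT.
model = PeftModel.from_pretrained(base, ADAPTER_ID)
model.eval()

# Completion-style prompt, matching the completion-type training data.
inputs = tokenizer("The lantern flickered once before", return_tensors="pt")
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```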
-
- # phi4-pt-out-r2
-
- This model is a LoRA adapter trained on top of unsloth_phi-4 using the Mielikki/Erebus-87k and NewEden/Orion-Asstr-Stories-16K datasets.
-
- ## Model description
-
- More information needed
-
- ## Intended uses & limitations
-
- More information needed
-
- ## Training and evaluation data
-
- More information needed
-
- ## Training procedure
-
- ### Training hyperparameters
-
- The following hyperparameters were used during training:
- - learning_rate: 1e-05
- - train_batch_size: 2
- - eval_batch_size: 2
- - seed: 42
- - distributed_type: multi-GPU
- - num_devices: 4
- - gradient_accumulation_steps: 4
- - total_train_batch_size: 32 (see the check below this list)
- - total_eval_batch_size: 8
- - optimizer: OptimizerNames.PAGED_ADEMAMIX_8BIT (no additional optimizer arguments)
- - lr_scheduler_type: cosine
- - lr_scheduler_warmup_steps: 10
- - num_epochs: 1.0
-
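As a quick sanity check (an illustrative addition, not part of the original card), the effective batch size listed above follows from the per-device micro batch size, gradient accumulation, and device count:

```python
# Effective batch size implied by the values above.
micro_batch_size = 2             # per-device train batch size
gradient_accumulation_steps = 4
num_devices = 4                  # multi-GPU run

total_train_batch_size = micro_batch_size * gradient_accumulation_steps * num_devices
assert total_train_batch_size == 32  # matches total_train_batch_size above
print(total_train_batch_size)
```
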
- ### Training results
-
-
- ### Framework versions
-
- - PEFT 0.14.0
- - Transformers 4.48.1
- - Pytorch 2.5.1+cu124
- - Datasets 3.2.0
- - Tokenizers 0.21.0
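
A small helper (an illustrative addition, not part of the original card) to confirm a local environment matches the versions listed above before loading the adapter:

```python
# Compare installed package versions against those listed in this card.
from importlib.metadata import PackageNotFoundError, version

EXPECTED = {
    "peft": "0.14.0",
    "transformers": "4.48.1",
    "torch": "2.5.1+cu124",
    "datasets": "3.2.0",
    "tokenizers": "0.21.0",
}

for package, expected in EXPECTED.items():
    try:
        installed = version(package)
    except PackageNotFoundError:
        installed = "not installed"
    marker = "OK" if installed == expected else "differs"
    print(f"{package:>12}: expected {expected}, installed {installed} ({marker})")
```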