# Mistral-7B-v0.1-onmt / mistral-finetune.yaml
# Corpus opts:
data:
    alpaca:
        path_src: "alpaca_clean.txt"
        transforms: [sentencepiece, filtertoolong]
        weight: 10
    sharegpt:
        path_src: "sharegpt.txt"
        transforms: [sentencepiece, filtertoolong]
        weight: 10
    osst1:
        path_src: "osst1.flattened.txt"
        transforms: [sentencepiece, filtertoolong]
        weight: 10
    valid:
        path_src: "dataAI/valid.txt"
        transforms: [sentencepiece]
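# Note: corpus weights set the sampling ratio between the training sets;
# with equal weights of 10 the three corpora are mixed in roughly equal
# proportion (the valid set is only used for evaluation).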
### Transform related opts:
#### Subword
src_subword_model: "tokenizer.model"
tgt_subword_model: "tokenizer.model"
#### Filter
src_seq_length: 1792
tgt_seq_length: 1792
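# Note: with the filtertoolong transform, examples whose source or target
# side exceeds 1792 subword tokens (after sentencepiece) are dropped
# rather than truncated.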
#truncated_decoder: 32
# silently ignore empty lines in the data
skip_empty_level: silent
# General opts
train_from: "mistral-onmt.pt"
save_model: "mistral-onmt-sft"
save_format: pytorch
keep_checkpoint: 10
save_checkpoint_steps: 100
seed: 1234
report_every: 10
train_steps: 1000
valid_steps: 100
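# Note: a checkpoint is written every 100 steps and validation runs at the
# same interval; over the 1000-step run that is 10 checkpoints, all of
# which fit under keep_checkpoint: 10.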
# Batching
bucket_size: 32768
num_workers: 1
world_size: 2
gpu_ranks: [0,1]
parallel_mode: tensor_parallel
batch_type: "tokens"
batch_size: 1792
valid_batch_size: 512
batch_size_multiple: 1
accum_count: [8]
accum_steps: [0]
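# Note: batch_size is counted in tokens (batch_type "tokens"), so each
# forward pass sees up to ~1792 tokens and, with accum_count 8, roughly
# 1792 * 8 = 14336 tokens contribute to each optimizer step. Under
# tensor_parallel the two GPUs shard one model replica rather than the
# data, so the effective batch is not multiplied by world_size as it
# would be in data-parallel training.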
override_opts: true # CAREFUL: this requires all settings to be defined below
share_vocab: true
save_data: "mistral-7B"
src_vocab: "mistral.vocab"
src_vocab_size: 32000
tgt_vocab_size: 32000
decoder_start_token: '<s>'
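# Note: because override_opts is true, the options in this file replace
# those stored in the train_from checkpoint, which is why the full model
# description is repeated below. share_vocab with a single 32000-entry
# sentencepiece vocab matches the Mistral tokenizer; '<s>' is the BOS
# token used to start decoding.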
# Optimization
model_dtype: "fp8"
apex_opt_level: ""
optim: "fusedadam"
learning_rate: 0.0001
warmup_steps: 100
decay_method: "none"
#learning_rate_decay: 0.98
#start_decay_steps: 100
#decay_steps: 10
adam_beta2: 0.998
max_grad_norm: 0
label_smoothing: 0.0
param_init: 0
param_init_glorot: true
normalization: "tokens"
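# Note: with decay_method "none" and the decay options left commented out,
# the learning rate stays at 1e-4 (apart from warmup); max_grad_norm 0
# disables gradient clipping, and fusedadam expects NVIDIA Apex to be
# installed.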
# 4/8-bit quantization
quant_layers: ['w_1', 'w_2', 'w_3', 'linear_values', 'linear_query', 'linear_keys', 'final_linear']
quant_type: "bnb_FP4"
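# Note: quant_type "bnb_FP4" loads the listed attention and feed-forward
# Linear layers as 4-bit bitsandbytes modules; combined with the LoRA
# adapters below this is effectively a QLoRA-style fine-tune in which only
# the adapter weights are trained.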
# LoRA
lora_layers: ['linear_values', 'linear_query', 'linear_keys', 'final_linear']
lora_rank: 4
lora_dropout: 0.05
lora_alpha: 8
lora_embedding: false
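# Note: LoRA adapters are added only to the attention projections listed
# above; following the usual LoRA convention the update is scaled by
# lora_alpha / lora_rank = 8 / 4 = 2, and lora_embedding: false leaves the
# embeddings untouched.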
# Checkpointing
#use_ckpting: ['ffn', 'lora']
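# Note: uncommenting use_ckpting enables activation (gradient)
# checkpointing on the FFN and LoRA modules, trading extra compute for a
# lower memory footprint.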
# Model
model_task: lm
encoder_type: transformer_lm
decoder_type: transformer_lm
layer_norm: rms
norm_eps: 1e-5
pos_ffn_activation_fn: 'silu'
max_relative_positions: -1
position_encoding: false
add_qkvbias: false
add_ffnbias: false
parallel_residual: false
dec_layers: 32
heads: 32
num_kv: 8
sliding_window: 128
hidden_size: 4096
word_vec_size: 4096
transformer_ff: 14336
dropout_steps: [0]
dropout: [0.0]
attention_dropout: [0.0]
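# Note: these dimensions follow Mistral-7B: 32 decoder layers, hidden size
# 4096 with 32 heads (head dim 128), 8 KV heads for grouped-query
# attention (4 query heads per KV head), a gated SiLU feed-forward of
# 14336, RMSNorm, and rotary position embeddings (max_relative_positions
# -1). The 128-token sliding_window is shorter than the 4096-token window
# of the released model, presumably to save memory at this sequence length.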