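# Corpus opts: three instruction-tuning corpora mixed with equal
# sampling weight (10 each), plus a held-out validation set.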
data:
    alpaca:
        path_src: "alpaca_clean.txt"
        transforms: [sentencepiece, filtertoolong]
        weight: 10
    sharegpt:
        path_src: "sharegpt.txt"
        transforms: [sentencepiece, filtertoolong]
        weight: 10
    osst1:
        path_src: "osst1.flattened.txt"
        transforms: [sentencepiece, filtertoolong]
        weight: 10
    valid:
        path_src: "dataAI/valid.txt"
        transforms: [sentencepiece]
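
### Transform related opts:
#### Subword: the same SentencePiece model is applied on both sides
#### (presumably Mistral's tokenizer.model, matching the checkpoint below).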
src_subword_model: "tokenizer.model"
tgt_subword_model: "tokenizer.model"
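
#### Filter: drop examples longer than 1792 subword tokens.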
src_seq_length: 1792
tgt_seq_length: 1792
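
# Silently skip empty lines in the data instead of raising an error.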
skip_empty_level: silent
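
# General opts: resume from the converted Mistral checkpoint and run a
# short 1000-step SFT pass, validating and checkpointing every 100 steps.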
train_from: "mistral-onmt.pt"
save_model: "mistral-onmt-sft"
save_format: pytorch
keep_checkpoint: 10
save_checkpoint_steps: 100
seed: 1234
report_every: 10
train_steps: 1000
valid_steps: 100
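
# Batching: token-based batches of 1792 with 8-step gradient accumulation
# (roughly 14k tokens per optimizer step), using tensor parallelism over
# the 2 GPUs listed in gpu_ranks.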
bucket_size: 32768
num_workers: 1
world_size: 2
gpu_ranks: [0, 1]
parallel_mode: tensor_parallel
batch_type: "tokens"
batch_size: 1792
valid_batch_size: 512
batch_size_multiple: 1
accum_count: [8]
accum_steps: [0]
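
# Override the options stored in the checkpoint with the values in this
# file; with this flag set, every setting that matters must be redefined below.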
override_opts: true
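
# Vocab: a single shared 32k vocabulary (presumably exported from
# tokenizer.model into mistral.vocab).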
share_vocab: true
save_data: "mistral-7B"
src_vocab: "mistral.vocab"
src_vocab_size: 32000
tgt_vocab_size: 32000
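
# Decoder-only generation starts from the BOS token.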
decoder_start_token: '<s>'
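
# Optimization: fused Adam in fp16; with decay_method "none" the 1e-4
# learning rate stays flat (the 100-step warmup is likely inert here).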
model_dtype: "fp16"
apex_opt_level: ""
optim: "fusedadam"
learning_rate: 0.0001
warmup_steps: 100
decay_method: "none"
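
# max_grad_norm 0 disables gradient clipping; the init flags are moot
# for weights loaded from the checkpoint.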
adam_beta2: 0.998
max_grad_norm: 0
label_smoothing: 0.0
param_init: 0
param_init_glorot: true
normalization: "tokens"
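
# Quantization: load the FFN (w_1/w_2/w_3) and attention projections in
# 4-bit bitsandbytes FP4, so only the LoRA adapters below train in
# higher precision (QLoRA-style fine-tuning).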
quant_layers: ['w_1', 'w_2', 'w_3', 'linear_values', 'linear_query', 'linear_keys', 'final_linear']
quant_type: "bnb_FP4"
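
# LoRA: rank-4 adapters on the attention projections only, scaled by
# alpha/rank = 2; token embeddings stay frozen.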
lora_layers: ['linear_values', 'linear_query', 'linear_keys', 'final_linear']
lora_rank: 4
lora_dropout: 0.05
lora_alpha: 8
lora_embedding: false
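
# Model: the Mistral-7B architecture. Decoder-only transformer with
# 32 layers, 32 heads over 8 KV heads (grouped-query attention), hidden
# size 4096, SwiGLU-style FFN of 14336, RMSNorm, and rotary positions
# (max_relative_positions: -1). Note: sliding_window is set to 128,
# far below Mistral's native 4096, presumably to save attention memory.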
model_task: lm
encoder_type: transformer_lm
decoder_type: transformer_lm
layer_norm: rms
norm_eps: 1e-5
pos_ffn_activation_fn: 'silu'
max_relative_positions: -1
position_encoding: false
add_qkvbias: false
add_ffnbias: false
parallel_residual: false
dec_layers: 32
heads: 32
num_kv: 8
sliding_window: 128
hidden_size: 4096
word_vec_size: 4096
transformer_ff: 14336
dropout_steps: [0]
dropout: [0.0]
attention_dropout: [0.0]