File size: 2,654 Bytes
4f907f4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
max_seq_len: 8192
global_seed: 17

# Run Name
run_name: mpt-30b-orca-4ep_flan3m # If left blank, will be read from env var $RUN_NAME

model:
  name: hf_causal_lm
  pretrained: true
  pretrained_model_name_or_path: mosaicml/mpt-30b
  init_device: mixed
  config_overrides:
    max_seq_len: ${max_seq_len}
    attn_config:
      attn_impl: triton
      # Set this to `true` if using `train_loader.dataset.packing_ratio` below
      attn_uses_sequence_id: false

# Tokenizer
tokenizer:
  name: mosaicml/mpt-30b
  kwargs:
    model_max_length: ${max_seq_len}


# Dataloaders
train_loader:
  name: finetuning
  dataset:
    hf_name: csv
    hf_kwargs:
      data_dir: ~/mpt/llm-foundry/data/orca_3m_gpt3.5
    preprocessing_fn:
    split: train
    max_seq_len: ${max_seq_len}
    allow_pad_trimming: false
    decoder_only_format: true
    # # Use `python llmfoundry/data/packing.py --yaml-path /path/to/this/yaml/ ...`
    # # to profile this run's optimal packing_ratio as it depends on GPU count,
    # # batch size, sequence length
    packing_ratio: 19.0
    shuffle: true
  drop_last: true
  num_workers: 8
  pin_memory: false
  prefetch_factor: 2
  persistent_workers: true
  timeout: 0

# Optimization
scheduler:
  name: linear_decay_with_warmup  # linear no warmup is HF default which dolly used
  t_warmup: 200ba  # add some warmup though, seems to help with MPT
  alpha_f: 0

optimizer:
  # Based on Dolly
  name: decoupled_lionw
  lr: 1.0e-6
  betas:
  - 0.9
  - 0.999
  eps: 1.0e-8
  weight_decay: 0

algorithms:
  gradient_clipping:
    clipping_type: norm
    clipping_threshold: 1.0

max_duration: 4ep  # 2-3 epochs seems like the sweet spot
eval_interval: 1
# eval_subset_num_batches: -1
# eval_first: true
global_train_batch_size: 8  # somewhere in the 6-8 * numgpus range seems good

# System
seed: ${global_seed}
# device_eval_batch_size: 8
device_train_microbatch_size: 2
# device_train_microbatch_size: auto
precision: amp_bf16

# FSDP
fsdp_config:
  sharding_strategy: FULL_SHARD
  mixed_precision: PURE
  activation_checkpointing: true
  activation_checkpointing_reentrant: false
  activation_cpu_offload: false
  limit_all_gathers: true
  verbose: false

# Logging
progress_bar: false
log_to_console: true
console_log_interval: 1ba

callbacks:
  speed_monitor:
    window_size: 10
  lr_monitor: {}
  memory_monitor: {}
  runtime_estimator: {}

# loggers:
#   wandb: {}

# Checkpoint to local filesystem or remote object store
save_interval: 1ep
save_num_checkpoints_to_keep: 4  # Important, this cleans up checkpoints saved to DISK
save_folder: ./{run_name}/checkpoints
# save_folder: s3://my-bucket/my-folder/{run_name}/checkpoints