# ELLM_Star / config.yaml
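# Full training configuration as saved by LightningCLI. Assuming a standard
# LightningCLI entry point (the script name is not part of this file), a run
# would typically be launched with something like:
#   python main.py fit --config config.yaml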
# lightning.pytorch==2.4.0.dev20240728
seed_everything: 123
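# Trainer: 8 GPUs on a single node with DeepSpeed ZeRO stage 2 (optimizer-state
# and gradient partitioning) and bf16 precision. Parameter offload to CPU is
# requested (offload_parameters / offload_params_device), although DeepSpeed
# normally honors parameter offload only under ZeRO stage 3, so at stage 2 it
# may have no effect. Effective batch size per optimizer step:
# 8 devices x 1 sequence per device x 8 accumulation steps = 64 sequences.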
trainer:
  accelerator: gpu
  strategy:
    class_path: lightning.pytorch.strategies.DeepSpeedStrategy
    init_args:
      accelerator: null
      zero_optimization: true
      stage: 2
      remote_device: null
      offload_optimizer: false
      offload_parameters: true
      offload_params_device: cpu
      nvme_path: /local_nvme
      params_buffer_count: 5
      params_buffer_size: 100000000
      max_in_cpu: 1000000000
      offload_optimizer_device: cpu
      optimizer_buffer_count: 4
      block_size: 1048576
      queue_depth: 8
      single_submit: false
      overlap_events: true
      thread_count: 1
      pin_memory: true
      sub_group_size: 1000000000000
      contiguous_gradients: true
      overlap_comm: true
      allgather_partitions: true
      reduce_scatter: true
      allgather_bucket_size: 200000000
      reduce_bucket_size: 200000000
      zero_allow_untested_optimizer: true
      logging_batch_size_per_gpu: auto
      config: null
      logging_level: 30
      parallel_devices: null
      cluster_environment: null
      loss_scale: 0.0
      initial_scale_power: 16
      loss_scale_window: 1000
      hysteresis: 2
      min_loss_scale: 1
      partition_activations: false
      cpu_checkpointing: false
      contiguous_memory_optimization: false
      synchronize_checkpoint_boundary: false
      load_full_weights: false
      precision_plugin: null
      process_group_backend: null
  devices: 8
  num_nodes: 1
  precision: bf16-true
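  # Logging: TensorBoard event files under /media/logs/main; they can be viewed
  # with, e.g., `tensorboard --logdir /media/logs`.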
  logger:
    class_path: lightning.pytorch.loggers.TensorBoardLogger
    init_args:
      save_dir: /media/logs
      name: main
      version: null
      log_graph: false
      default_hp_metric: true
      prefix: ''
      sub_dir: null
      comment: ''
      purge_step: null
      max_queue: 10
      flush_secs: 120
      filename_suffix: ''
  callbacks: null
  fast_dev_run: false
  max_epochs: 2
  min_epochs: null
  max_steps: -1
  min_steps: null
  max_time: null
  limit_train_batches: null
  limit_val_batches: null
  limit_test_batches: null
  limit_predict_batches: null
  overfit_batches: 0.0
  val_check_interval: null
  check_val_every_n_epoch: 1
  num_sanity_val_steps: 0
  log_every_n_steps: 1
  enable_checkpointing: null
  enable_progress_bar: null
  enable_model_summary: null
  accumulate_grad_batches: 8
  gradient_clip_val: null
  gradient_clip_algorithm: null
  deterministic: null
  benchmark: null
  inference_mode: true
  use_distributed_sampler: true
  profiler: null
  detect_anomaly: false
  barebones: false
  plugins: null
  sync_batchnorm: false
  reload_dataloaders_every_n_epochs: 0
  default_root_dir: null
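# Model: Mistral-7B-v0.2 loaded from the local unsloth checkpoint in bfloat16.
# The num_thoughts / thought_length / lookahead_tokens fields appear to control
# the model's thought-generation rollout; their exact semantics are defined in
# the model code, not in this file. AdamW is used at lr 1.0e-06 with
# weight_decay 0.001; warmup (warmup_steps: 20) is handled through the model
# config, and the separate scheduler entry is left null.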
model:
  config:
    model_name: Mistral-7B-v0.2
    dtype: bfloat16
    num_thoughts: 2
    thought_length: 8
    lookahead_tokens: 4
    embedding_grad_weights: 100.0
    temperature: 1.0
    do_sample: true
    train_max_length: 120
    offload_cache: false
    top_k: null
    top_p: null
    checkpoint_dir: /media/models/unsloth/Mistral-7B-v0.2
    weight_decay: 0.001
    warmup_steps: 20
    policy_weight: 1.0
    init_lr: 1.0e-06
  optimizer:
    class_path: torch.optim.AdamW
    init_args:
      lr: 1.0e-06
      betas:
      - 0.9
      - 0.999
      eps: 1.0e-08
      weight_decay: 0.001
      amsgrad: false
      maximize: false
      foreach: null
      capturable: false
      differentiable: false
      fused: null
  scheduler: null
ckpt_path: null
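# Data: OpenWebMath samples tokenized with the same Mistral checkpoint.
# With num_samples: 2048 and val_split_fraction: 0.125, the split works out to
# 256 validation and 1792 training examples; sequences are capped at 120 tokens
# and ignore_index: -100 matches the standard cross-entropy ignore index.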
data:
  class_path: src.dataset.OpenWebMathDataModule
  init_args:
    data_path: /media/datasets/openwebmath
    tokenizer:
      class_path: src.dataset.SpecialTokenizer
      init_args:
        checkpoint_dir: /media/models/unsloth/Mistral-7B-v0.2
    batch_size: 1
    max_seq_length: 120
    num_samples: 2048
    ignore_index: -100
    val_split_fraction: 0.125
    seed: 42
    num_workers: 1