---
# Seed value supplied to the CLI's `seed_everything` option.
seed_everything: 123

# Trainer settings: one node, 8 GPU devices, bf16-true precision.
trainer:
  accelerator: gpu
  # DeepSpeed strategy with ZeRO enabled at stage 2.
  strategy:
    class_path: lightning.pytorch.strategies.DeepSpeedStrategy
    init_args:
      accelerator: null
      zero_optimization: true
      stage: 2
      remote_device: null
      offload_optimizer: false
      # NOTE(review): DeepSpeedStrategy documents `offload_parameters` as a
      # ZeRO Stage 3 option; with `stage: 2` it may have no effect. Confirm
      # whether stage 3 was intended or this flag should be false.
      offload_parameters: true
      offload_params_device: cpu
      nvme_path: /local_nvme
      params_buffer_count: 5
      params_buffer_size: 100000000
      max_in_cpu: 1000000000
      offload_optimizer_device: cpu
      optimizer_buffer_count: 4
      block_size: 1048576
      queue_depth: 8
      single_submit: false
      overlap_events: true
      thread_count: 1
      pin_memory: true
      sub_group_size: 1000000000000
      contiguous_gradients: true
      overlap_comm: true
      allgather_partitions: true
      reduce_scatter: true
      allgather_bucket_size: 200000000
      reduce_bucket_size: 200000000
      zero_allow_untested_optimizer: true
      logging_batch_size_per_gpu: auto
      config: null
      logging_level: 30
      parallel_devices: null
      cluster_environment: null
      loss_scale: 0.0
      initial_scale_power: 16
      loss_scale_window: 1000
      hysteresis: 2
      min_loss_scale: 1
      partition_activations: false
      cpu_checkpointing: false
      contiguous_memory_optimization: false
      synchronize_checkpoint_boundary: false
      load_full_weights: false
      precision_plugin: null
      process_group_backend: null
  devices: 8
  num_nodes: 1
  precision: bf16-true
  # TensorBoard logger writing under /media/logs with run name "main".
  logger:
    class_path: lightning.pytorch.loggers.TensorBoardLogger
    init_args:
      save_dir: /media/logs
      name: main
      version: null
      log_graph: false
      default_hp_metric: true
      prefix: ''
      sub_dir: null
      comment: ''
      purge_step: null
      max_queue: 10
      flush_secs: 120
      filename_suffix: ''
  callbacks: null
  fast_dev_run: false
  # Run length: 2 epochs; max_steps is -1.
  max_epochs: 2
  min_epochs: null
  max_steps: -1
  min_steps: null
  max_time: null
  limit_train_batches: null
  limit_val_batches: null
  limit_test_batches: null
  limit_predict_batches: null
  overfit_batches: 0.0
  val_check_interval: null
  check_val_every_n_epoch: 1
  num_sanity_val_steps: 0
  log_every_n_steps: 1
  enable_checkpointing: null
  enable_progress_bar: null
  enable_model_summary: null
  accumulate_grad_batches: 8
  gradient_clip_val: null
  gradient_clip_algorithm: null
  deterministic: null
  benchmark: null
  inference_mode: true
  use_distributed_sampler: true
  profiler: null
  detect_anomaly: false
  barebones: false
  plugins: null
  sync_batchnorm: false
  reload_dataloaders_every_n_epochs: 0
  default_root_dir: null

# Model settings.
# NOTE(review): the nesting in this stanza is a reconstruction. Two placements
# should be confirmed against the model class's constructor:
#   1. whether `checkpoint_dir` .. `init_lr` belong inside `config` (as
#      written here) or directly under `model`;
#   2. whether `optimizer` / `scheduler` are arguments of `model` (as written
#      here) or top-level keys.
model:
  config:
    model_name: Mistral-7B-v0.2
    dtype: bfloat16
    num_thoughts: 2
    thought_length: 8
    lookahead_tokens: 4
    embedding_grad_weights: 100.0
    temperature: 1.0
    do_sample: true
    train_max_length: 120
    offload_cache: false
    top_k: null
    top_p: null
    checkpoint_dir: /media/models/unsloth/Mistral-7B-v0.2
    weight_decay: 0.001
    warmup_steps: 20
    policy_weight: 1.0
    init_lr: 1.0e-06
  # AdamW; `lr` and `weight_decay` carry the same values as `init_lr` and
  # `weight_decay` above.
  optimizer:
    class_path: torch.optim.AdamW
    init_args:
      lr: 1.0e-06
      betas:
        - 0.9
        - 0.999
      eps: 1.0e-08
      weight_decay: 0.001
      amsgrad: false
      maximize: false
      foreach: null
      capturable: false
      differentiable: false
      fused: null
  scheduler: null

# No checkpoint path is set.
ckpt_path: null

# Data module: reads from /media/datasets/openwebmath.
data:
  class_path: src.dataset.OpenWebMathDataModule
  init_args:
    data_path: /media/datasets/openwebmath
    # Tokenizer uses the same checkpoint directory value as the model stanza.
    tokenizer:
      class_path: src.dataset.SpecialTokenizer
      init_args:
        checkpoint_dir: /media/models/unsloth/Mistral-7B-v0.2
    # NOTE(review): the keys below are assumed to be data-module arguments
    # rather than tokenizer arguments; confirm against OpenWebMathDataModule.
    batch_size: 1
    max_seq_length: 120
    num_samples: 2048
    ignore_index: -100
    val_split_fraction: 0.125
    seed: 42
    num_workers: 1