dataset_args:
  path: argilla/10k_prompts_dpo

format_args:
  prompt_format: zephyr

model_args:
  pretrained_model_name_or_path: alignment-handbook/zephyr-7b-sft-full
  torch_dtype: float16
  quantization_config:
    quant_method: bitsandbytes
    load_in_4bit: true

peft_config:
  r: 16
  lora_alpha: 16
  lora_dropout: 0.05
  bias: none
  task_type: CAUSAL_LM
  target_modules:
    - k_proj
    - gate_proj
    - v_proj
    - up_proj
    - q_proj
    - o_proj
    - down_proj

wandb_args:
  entity: argilla-io
  project: dibt-dpo
  name: zephyr-7b-lora-dpo-dibt-openhermes-params-v0

training_args:
  # `trl.DPOTrainer`
  beta: 0.1
  max_length: 1536
  max_prompt_length: 1024
  loss_type: sigmoid
  # `transformers.Trainer`
  bf16: true
  do_eval: true
  do_train: true
  evaluation_strategy: steps
  eval_steps: 20
  gradient_accumulation_steps: 4
  gradient_checkpointing: true
  hub_model_id: plaguss/zephyr-7b-lora-dpo-dibt-v0
  hub_model_revision: v0
  hub_strategy: every_save
  hub_private_repo: true
  push_to_hub: true
  learning_rate: 5.0e-5
  logging_steps: 10
  lr_scheduler_type: cosine
  num_train_epochs: 2
  optim: paged_adamw_32bit
  output_dir: data/zephyr-7b-sft-lora-dpo-v0
  load_best_model_at_end: true
  metric_for_best_model: rewards/accuracies
  greater_is_better: true
  per_device_train_batch_size: 4
  per_device_eval_batch_size: 16
  save_strategy: steps
  save_total_limit: null
  seed: 42
  warmup_ratio: 0.1
  report_to:
    - wandb

use_accelerate: false
use_unsloth: false
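For context, here is a minimal sketch of how the sections above map onto the libraries they configure (`transformers`, `peft`, `trl`), assuming the TRL 0.7.x-era `DPOTrainer` signature, where `beta`, `max_length`, `max_prompt_length`, and `loss_type` are passed to the trainer directly (newer TRL versions move them to `trl.DPOConfig`). The eval split size and the W&B environment-variable wiring are illustrative assumptions; the YAML is consumed by a training script that is not shown here.

```python
import os

import torch
from datasets import load_dataset
from peft import LoraConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import DPOTrainer

# `wandb_args` -> W&B run metadata, wired through environment variables here
os.environ["WANDB_ENTITY"] = "argilla-io"
os.environ["WANDB_PROJECT"] = "dibt-dpo"
os.environ["WANDB_NAME"] = "zephyr-7b-lora-dpo-dibt-openhermes-params-v0"

model_name = "alignment-handbook/zephyr-7b-sft-full"

# `model_args` -> 4-bit bitsandbytes loading
# (`quant_method: bitsandbytes` is implied by using BitsAndBytesConfig)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# `peft_config` -> LoRA adapter definition
peft_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "k_proj", "gate_proj", "v_proj", "up_proj",
        "q_proj", "o_proj", "down_proj",
    ],
)

# `dataset_args` -> the DPO preference pairs; the 5% eval split is an assumption
dataset = load_dataset("argilla/10k_prompts_dpo", split="train")
dataset = dataset.train_test_split(test_size=0.05, seed=42)

# `training_args`, `transformers.Trainer` half
# (`hub_model_revision` is handled by the training script, not TrainingArguments)
training_args = TrainingArguments(
    output_dir="data/zephyr-7b-sft-lora-dpo-v0",
    bf16=True,
    do_train=True,
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=20,
    save_strategy="steps",
    save_total_limit=None,
    logging_steps=10,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    learning_rate=5.0e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    num_train_epochs=2,
    optim="paged_adamw_32bit",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=16,
    load_best_model_at_end=True,
    metric_for_best_model="rewards/accuracies",
    greater_is_better=True,
    seed=42,
    push_to_hub=True,
    hub_model_id="plaguss/zephyr-7b-lora-dpo-dibt-v0",
    hub_strategy="every_save",
    hub_private_repo=True,
    report_to=["wandb"],
)

# `training_args`, `trl.DPOTrainer` half -> passed to the trainer itself.
# With a `peft_config` and `ref_model=None`, TRL reuses the frozen base
# weights as the implicit DPO reference model, so no second copy is loaded.
trainer = DPOTrainer(
    model,
    ref_model=None,
    args=training_args,
    beta=0.1,
    loss_type="sigmoid",
    max_length=1536,
    max_prompt_length=1024,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    peft_config=peft_config,
)
trainer.train()
```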