dataset_args:
  path: argilla/10k_prompts_dpo

format_args:
  prompt_format: zephyr

model_args:
  pretrained_model_name_or_path: alignment-handbook/zephyr-7b-sft-full
  torch_dtype: float16
  quantization_config:
    quant_method: bitsandbytes
    load_in_4bit: true

peft_config:
  r: 16
  lora_alpha: 16
  lora_dropout: 0.05
  bias: none
  task_type: CAUSAL_LM
  target_modules:
    - k_proj
    - gate_proj
    - v_proj
    - up_proj
    - q_proj
    - o_proj
    - down_proj

wandb_args:
  entity: argilla-io
  project: dibt-dpo
  name: zephyr-7b-lora-dpo-dibt-openhermes-params-v0

training_args:
  # `trl.DPOTrainer`
  beta: 0.1
  max_length: 1536
  max_prompt_length: 1024
  loss_type: sigmoid
  # `transformers.Trainer`
  bf16: true
  do_eval: true
  do_train: true
  evaluation_strategy: steps
  eval_steps: 20
  gradient_accumulation_steps: 4
  gradient_checkpointing: true
  hub_model_id: plaguss/zephyr-7b-lora-dpo-dibt-v0
  hub_model_revision: v0
  hub_strategy: every_save
  hub_private_repo: true
  push_to_hub: true
  learning_rate: 5.0e-5
  logging_steps: 10
  lr_scheduler_type: cosine
  num_train_epochs: 2
  optim: paged_adamw_32bit
  output_dir: data/zephyr-7b-sft-lora-dpo-v0
  load_best_model_at_end: true
  metric_for_best_model: rewards/accuracies
  greater_is_better: true
  per_device_train_batch_size: 4
  per_device_eval_batch_size: 16
  save_strategy: steps
  save_total_limit: null
  seed: 42
  warmup_ratio: 0.1
  report_to:
    - wandb

use_accelerate: false
use_unsloth: false
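For context, here is a minimal sketch of how the sections above map onto the libraries they configure (`transformers`, `peft`, `trl`), assuming the TRL 0.7.x-era `DPOTrainer` signature, where `beta`, `max_length`, `max_prompt_length`, and `loss_type` are passed to the trainer directly (newer TRL versions move them to `trl.DPOConfig`). The eval split size and the W&B environment-variable wiring are illustrative assumptions; the YAML is consumed by a training script that is not shown here.

```python
import os

import torch
from datasets import load_dataset
from peft import LoraConfig
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import DPOTrainer

# `wandb_args` -> W&B run metadata, wired through environment variables here
os.environ["WANDB_ENTITY"] = "argilla-io"
os.environ["WANDB_PROJECT"] = "dibt-dpo"
os.environ["WANDB_NAME"] = "zephyr-7b-lora-dpo-dibt-openhermes-params-v0"

model_name = "alignment-handbook/zephyr-7b-sft-full"

# `model_args` -> 4-bit bitsandbytes loading
# (`quant_method: bitsandbytes` is implied by using BitsAndBytesConfig)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# `peft_config` -> LoRA adapter definition
peft_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "k_proj", "gate_proj", "v_proj", "up_proj",
        "q_proj", "o_proj", "down_proj",
    ],
)

# `dataset_args` -> the DPO preference pairs; the 5% eval split is an assumption
dataset = load_dataset("argilla/10k_prompts_dpo", split="train")
dataset = dataset.train_test_split(test_size=0.05, seed=42)

# `training_args`, `transformers.Trainer` half
# (`hub_model_revision` is handled by the training script, not TrainingArguments)
training_args = TrainingArguments(
    output_dir="data/zephyr-7b-sft-lora-dpo-v0",
    bf16=True,
    do_train=True,
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=20,
    save_strategy="steps",
    save_total_limit=None,
    logging_steps=10,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    learning_rate=5.0e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    num_train_epochs=2,
    optim="paged_adamw_32bit",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=16,
    load_best_model_at_end=True,
    metric_for_best_model="rewards/accuracies",
    greater_is_better=True,
    seed=42,
    push_to_hub=True,
    hub_model_id="plaguss/zephyr-7b-lora-dpo-dibt-v0",
    hub_strategy="every_save",
    hub_private_repo=True,
    report_to=["wandb"],
)

# `training_args`, `trl.DPOTrainer` half -> passed to the trainer itself.
# With a `peft_config` and `ref_model=None`, TRL reuses the frozen base
# weights as the implicit DPO reference model, so no second copy is loaded.
trainer = DPOTrainer(
    model,
    ref_model=None,
    args=training_args,
    beta=0.1,
    loss_type="sigmoid",
    max_length=1536,
    max_prompt_length=1024,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    peft_config=peft_config,
)
trainer.train()
```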