---
license: mit
datasets:
- Vi-VLM/Vista
language:
- vi
---

LLaVA-Qwen1.5-1.8B trained with LoRA on a subset of the Vi-LLaVA Complex Reasoning split of the Vista dataset (Vi-VLM/Vista). Final training loss: ~1.5.

Training script

```bash
deepspeed moellava/train/train_mem.py \
    --lora_enable True --lora_r 128 --lora_alpha 256 --mm_projector_lr 0.00000125 \
    --lora_path /kaggle/temp/lora-llavaqwen \
    --deepspeed ./scripts/zero3.json \
    --model_name_or_path Qwen/Qwen1.5-1.8B \
    --version qwen \
    --data_path /kaggle/temp/vi_llava_train.json \
    --image_folder /kaggle/input/coco-2017-dataset/coco2017/train2017 \
    --image_tower google/siglip-base-patch16-256-multilingual \
    --image_projector_type mlp2x_gelu \
    --pretrain_mm_mlp_adapter /kaggle/temp/pt-llavaqwen1.5-1.8b/mm_projector.bin \
    --mm_vision_select_layer -2 \
    --mm_use_im_start_end False \
    --mm_use_im_patch_token False \
    --image_aspect_ratio pad \
    --group_by_modality_length True \
    --fp16 True \
    --output_dir ./checkpoints/ft-lora-llavaqwen1.5-1.8b-complex_reasoning \
    --num_train_epochs 1 \
    --per_device_train_batch_size 2 \
    --per_device_eval_batch_size 4 \
    --gradient_accumulation_steps 8 \
    --evaluation_strategy "no" \
    --save_strategy "steps" \
    --save_steps 100 \
    --save_total_limit 1 \
    --learning_rate 1e-5 \
    --weight_decay 0. \
    --warmup_ratio 0 \
    --lr_scheduler_type "cosine" \
    --logging_steps 5 \
    --tf32 False \
    --model_max_length 1024 \
    --gradient_checkpointing True \
    --dataloader_num_workers 4 \
    --lazy_preprocess True \
    --report_to wandb \
    --run_name ft-llava-qwen1.5-1.8b-lora-vista_reasoning-cont \
    --push_to_hub True
```
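The file passed to `--data_path` is expected to be in the LLaVA conversation format consumed by `train_mem.py`. The record below is only an illustrative sketch (the `id`, image filename, and Vietnamese text are made up); the actual Vista subset may carry additional fields.

```python
import json

# Hypothetical example record in the LLaVA conversation format expected by
# train_mem.py: "image" is a path relative to --image_folder, and the
# "<image>" placeholder marks where the image features enter the prompt.
example_record = {
    "id": "000000033471",                 # made-up id
    "image": "000000033471.jpg",          # file under coco2017/train2017
    "conversations": [
        {"from": "human", "value": "<image>\nMô tả chi tiết bức ảnh này."},
        {"from": "gpt", "value": "Bức ảnh cho thấy ..."},
    ],
}

# The training file is a JSON list of such records.
with open("vi_llava_train.json", "w", encoding="utf-8") as f:
    json.dump([example_record], f, ensure_ascii=False, indent=2)
```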
Python code to merge LoRA

```python
from typing import Optional, List


class ModelArguments:
    model_name_or_path: Optional[str] = "facebook/opt-125m"
    version: Optional[str] = "v0"
    freeze_backbone: bool = False
    tune_mm_mlp_adapter: bool = False
    mm_vision_select_layer: Optional[int] = -1  # default to the last layer
    pretrain_mm_mlp_adapter: Optional[str] = None
    mm_use_im_start_end: bool = False
    mm_use_im_patch_token: bool = True
    mm_vision_select_feature: Optional[str] = "patch"
    # ===================================================================
    image_tower: Optional[str] = 'google/siglip-base-patch16-256-multilingual'
    video_tower: Optional[str] = None
    image_projector_type: Optional[str] = 'linear'
    video_projector_type: Optional[str] = 'linear'
    video_global_proj: bool = False
    video_temproal_proj: bool = False
    video_spatial_proj: bool = False
    # ===================================================================
    only_lora_ffn: bool = True
    moe_enable: bool = False
    train_modules: Optional[List[str]] = None
    moe_mode: str = "sparse"
    moe_layers_idx: Optional[List[int]] = None
    ep_size: int = 1
    num_experts: Optional[List[int]] = 4
    top_k_experts: int = 2
    capacity_factor: float = 1.
    eval_capacity_factor: float = 2.
    min_capacity: int = 0
    use_residual: bool = False
    router_aux_loss_coef: float = 0.01


class DataArguments:
    lazy_preprocess: bool = False
    is_multimodal: bool = False
    image_aspect_ratio: str = 'pad'
    # ===================================================================
    data_path: Optional[List[str]] = None
    image_folder: Optional[str] = None
    video_folder: Optional[str] = None
    num_frames: int = 8


model_args = ModelArguments()
data_args = DataArguments()

import torch
from peft import PeftModel
from moellava.model import LlavaQwen1_5ForCausalLM

model_name_or_path = 'Qwen/Qwen1.5-1.8B'
lora_path = 'llavaqwen1.5-lora'

# Load the base LLM, cast to fp16, then attach the LoRA adapter
model = LlavaQwen1_5ForCausalLM.from_pretrained(model_name_or_path)
model.to(torch.float16)
model = PeftModel.from_pretrained(model, lora_path)

import transformers

tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_args.model_name_or_path,
    model_max_length=1024,
    padding_side="right",
    use_fast=False,
)
tokenizer.add_special_tokens({'unk_token': '<|extra_0|>'})

# Rebuild the vision side (SigLIP image tower + mm projector) so the merged
# checkpoint contains the full multimodal model
model.get_model().initialize_vision_modules(model_args=model_args)
image_tower = model.get_image_tower()
image_tower.to(dtype=torch.float16)

data_args.image_processor = image_tower.image_processor
data_args.is_multimodal = True

model.config.image_aspect_ratio = data_args.image_aspect_ratio
model.config.tokenizer_padding_side = tokenizer.padding_side
model.config.mm_use_im_start_end = data_args.mm_use_im_start_end = model_args.mm_use_im_start_end
model.config.mm_use_im_patch_token = model_args.mm_use_im_patch_token
model.initialize_vision_tokenizer(model_args, tokenizer=tokenizer)

# Fold the LoRA weights into the base model and save the merged checkpoint
merged_model = model.merge_and_unload()
merged_model.save_pretrained("llava-qwen1.5-1.8b-complex_reasoning-merged")
```
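`save_pretrained` above writes only the merged model weights and config. A minimal follow-up sketch (assuming the same Python session as the merge script, so `tokenizer` is still in scope) saves the tokenizer next to them and reloads the folder to check that it works as a standalone checkpoint:

```python
import torch
from moellava.model import LlavaQwen1_5ForCausalLM

output_dir = "llava-qwen1.5-1.8b-complex_reasoning-merged"

# Store the tokenizer (including the added unk token) alongside the merged
# weights so the directory can be shared or pushed to the Hub as one unit.
tokenizer.save_pretrained(output_dir)

# Reload the merged checkpoint in fp16 to confirm it loads on its own.
reloaded = LlavaQwen1_5ForCausalLM.from_pretrained(output_dir, torch_dtype=torch.float16)
reloaded.eval()
```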