diff --git a/app.py b/app.py
index 5cc265e4965b21fd51f9accf7aafbebd89067761..f0e7b442ce0f4bfa76d4e55dc085c1820e97910a 100644
--- a/app.py
+++ b/app.py
@@ -1,14 +1,485 @@
+import torch
+import os
+import argparse
+import numpy as np
+import copy
 import gradio as gr
+import re
+import torchaudio
+import io
+import cv2
+import math
 import spaces
-import torch
+from numba import jit
+from huggingface_hub import snapshot_download
+
+from vita.constants import DEFAULT_AUDIO_TOKEN, DEFAULT_IMAGE_TOKEN, MAX_IMAGE_LENGTH, MIN_IMAGE_LENGTH, IMAGE_TOKEN_INDEX, AUDIO_TOKEN_INDEX
+from vita.conversation import conv_templates, SeparatorStyle
+from vita.util.mm_utils import tokenizer_image_token, tokenizer_image_audio_token
+from PIL import Image
+from decord import VideoReader, cpu
+from vita.model.builder import load_pretrained_model
+from vita.model.vita_tts.decoder.llm2tts import llm2TTS
+from vita.model.language_model.vita_qwen2 import VITAQwen2Config, VITAQwen2ForCausalLM
+
+decoder_topk = 2
+codec_chunk_size = 40
+codec_padding_size = 10
+
+PUNCTUATION = "！？。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏."
+
+MODEL_NAME = "VITA-MLLM/VITA-1.5"
+model_path = snapshot_download(MODEL_NAME, local_dir="VITA_ckpt")
+tokenizer, model, feature_extractor, context_len = load_pretrained_model(
+    model_path, model_base=None, model_name="VITA-1.5", model_type="qwen2p5_instruct"
+)
+llm_embedding = model.get_input_embeddings().cuda()
+tts = llm2TTS(os.path.join(model_path, 'vita_tts_ckpt/'))
+
+@jit
+def float_to_int16(audio: np.ndarray) -> np.ndarray:
+    am = int(math.ceil(float(np.abs(audio).max())) * 32768)
+    am = 32767 * 32768 // am
+    return np.multiply(audio, am).astype(np.int16)
+
+
+def remove_special_characters(input_str):
+    # Remove special tokens
+    special_tokens = ['☞', '☟', '☜', '', '<|im_end|>']
+    for token in special_tokens:
+        input_str = input_str.replace(token, '')
+    return input_str
+
+
+def replace_equation(sentence):
+    special_notations = {
+        "sin": " sine ",
+        "cos": " cosine ",
+        "tan": " tangent ",
+        "cot": " cotangent ",
+        "sec": " secant ",
+        "csc": " cosecant ",
+        "log": " logarithm ",
+        "exp": "e^",
+        "sqrt": "根号 ",
+        "abs": "绝对值 ",
+    }
+
+    special_operators = {
+        "+": "加",
+        "-": "减",
+        "*": "乘",
+        "/": "除",
+        "=": "等于",
+        '!=': '不等于',
+        '>': '大于',
+        '<': '小于',
+        '>=': '大于等于',
+        '<=': '小于等于',
+    }
+
+    greek_letters = {
+        "α": "alpha ",
+        "β": "beta ",
+        "γ": "gamma ",
+        "δ": "delta ",
+        "ε": "epsilon ",
+        "ζ": "zeta ",
+        "η": "eta ",
+        "θ": "theta ",
+        "ι": "iota ",
+        "κ": "kappa ",
+        "λ": "lambda ",
+        "μ": "mu ",
+        "ν": "nu ",
+        "ξ": "xi ",
+        "ο": "omicron ",
+        "π": "派 ",
+        "ρ": "rho ",
+        "σ": "sigma ",
+        "τ": "tau ",
+        "υ": "upsilon ",
+        "φ": "phi ",
+        "χ": "chi ",
+        "ψ": "psi ",
+        "ω": "omega "
+    }
+
+    sentence = sentence.replace('**', ' ')
+
+    sentence = re.sub(r'(?
+        if start_time > end_time:
+            start_time, end_time = end_time, start_time
+        elif start_time == end_time:
+            end_time = start_time + 1
+
+    if os.path.exists(video_path):
+        vreader = VideoReader(video_path, ctx=cpu(0))
+    else:
+        raise FileNotFoundError
+
+    fps = vreader.get_avg_fps()
+    f_start = 0 if start_time is None else int(start_time * fps)
+    f_end = int(min(1000000000 if end_time is None else end_time * fps, len(vreader) - 1))
+    num_frames = f_end - f_start + 1
+
+    if num_frames > 0:
+        sample_fps = int(video_framerate)
+        t_stride = int(round(float(fps) / sample_fps))
+        all_pos = list(range(f_start, f_end + 1, t_stride))
+
+        if len(all_pos) > max_frames:
+            sample_pos = [all_pos[_] for _ in np.linspace(0, len(all_pos) - 1, num=max_frames, dtype=int)]
+        elif len(all_pos) < min_frames:
+            sample_pos = [all_pos[_] for _ in np.linspace(0, len(all_pos) - 1, num=min_frames, dtype=int)]
+        else:
+            sample_pos = all_pos
+
+        patch_images = [Image.fromarray(f).convert("RGB") for f in vreader.get_batch(sample_pos).asnumpy()]
+        return patch_images, len(patch_images)
+    else:
+        print(f"video path: {video_path} error.")
 
-zero = torch.Tensor([0]).cuda()
-print(zero.device) # <-- 'cpu' 🤔
 
+def _parse_text(text):
+    # Render model output as HTML for the Gradio chatbot: wrap fenced code blocks and escape markup.
+    lines = text.split("\n")
+    lines = [line for line in lines if line != ""]
+    count = 0
+
+    for i, line in enumerate(lines):
+        if "```" in line:
+            count += 1
+            items = line.split("`")
+            if count % 2 == 1:
+                lines[i] = f'<pre><code class="language-{items[-1]}">'
+            else:
+                lines[i] = "<br></code></pre>"
+        else:
+            if i > 0 and count % 2 == 1:
+                line = line.replace("`", r"\`")
+                line = line.replace("<", "&lt;")
+                line = line.replace(">", "&gt;")
+                line = line.replace(" ", "&nbsp;")
+                line = line.replace("*", "&ast;")
+                line = line.replace("_", "&lowbar;")
+                line = line.replace("-", "&#45;")
+                line = line.replace(".", "&#46;")
+                line = line.replace("!", "&#33;")
+                line = line.replace("(", "&#40;")
+                line = line.replace(")", "&#41;")
+                line = line.replace("$", "&#36;")
+            lines[i] = "<br>" + line
+
+    return "".join(lines)
+
+
+@spaces.GPU
+def predict(_chatbot, task_history):
+    # Assemble the multimodal prompt (text, images, video frames, audio) from the chat history and run VITA generation.
+    chat_query = task_history[-1][0]
+    print(task_history)
+
+    conv_mode = "qwen2p5_instruct"
+    conv = conv_templates[conv_mode].copy()
+
+    all_audio_path = []
+    all_visual_tensor = []
+
+    qs = ''
+    input_mode = 'lang'
+    for i, (q, a) in enumerate(task_history):
+        if isinstance(q, (tuple, list)):
+            if is_image(q[0]):
+                images = [Image.open(q[0]).convert("RGB")]
+                all_visual_tensor.extend(images)
+                input_mode = 'image'
+                qs += DEFAULT_IMAGE_TOKEN * len(images) + '\n'
+            elif is_video(q[0]):
+                video_frames, slice_len = _get_rawvideo_dec(q[0])
+                all_visual_tensor.extend(video_frames)
+                input_mode = 'video'
+                qs += DEFAULT_IMAGE_TOKEN * slice_len + '\n'
+            elif is_wav(q[0]):
+                if a is not None and a.startswith('☜'):
+                    continue
+                else:
+                    all_audio_path.append(q[0])
+                    new_q = qs + DEFAULT_AUDIO_TOKEN
+                    qs = ''
+                    conv.append_message(conv.roles[0], new_q)
+                    conv.append_message(conv.roles[1], a)
+        else:
+            new_q = qs + q
+            qs = ''
+            conv.append_message(conv.roles[0], new_q)
+            conv.append_message(conv.roles[1], a)
+
+    prompt = conv.get_prompt(input_mode)
+
+    audio_list = []  # stays empty when the turn carries no audio inputs
+    if all_audio_path != []:
+        input_ids = tokenizer_image_audio_token(
+            prompt, tokenizer,
+            image_token_index=IMAGE_TOKEN_INDEX,
+            audio_token_index=AUDIO_TOKEN_INDEX
+        )
+        for single_audio_path in all_audio_path:
+            try:
+                audio, original_sr = torchaudio.load(single_audio_path)
+                target_sr = 16000
+                if original_sr != target_sr:
+                    resampler = torchaudio.transforms.Resample(orig_freq=original_sr, new_freq=target_sr)
+                    audio = resampler(audio)
+                audio_features = feature_extractor(audio, sampling_rate=target_sr, return_tensors="pt")["input_features"]
+                audio_list.append(audio_features.squeeze(0))
+            except Exception as e:
+                print(f"Error processing {single_audio_path}: {e}")
+    else:
+        input_ids = tokenizer_image_token(
+            prompt, tokenizer,
+            image_token_index=IMAGE_TOKEN_INDEX
+        )
+
+    if all_visual_tensor == [] and all_audio_path == []:
+        datapromt = {
+            "prompt_token_ids": input_ids,
+        }
+    elif all_visual_tensor != [] and all_audio_path == []:
+        datapromt = {
+            "prompt_token_ids": input_ids,
+            "multi_modal_data": {
+                "image": all_visual_tensor
+            },
+        }
+    elif all_visual_tensor == [] and all_audio_path != []:
+        datapromt = {
+            "prompt_token_ids": input_ids,
+            "multi_modal_data": {
+                "audio": audio_list
+            },
+        }
+    else:
+        datapromt = {
+            "prompt_token_ids": input_ids,
+            "multi_modal_data": {
+                "image": all_visual_tensor,
+                "audio": audio_list
+            },
+        }
+
+    print(datapromt)
+
+    with torch.inference_mode():
+        output_ids = model.generate(
+            input_ids,
+            images=all_visual_tensor,
+            audios=audio_list,
+            do_sample=False,
+            temperature=0.01,
+            top_p=None,
+            num_beams=1,
+            output_scores=True,
+            return_dict_in_generate=True,
+            max_new_tokens=1024,
+            use_cache=True,
+        )
+
+    outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=False)[0]
+    outputs = outputs.strip()
+
+    task_history[-1] = (chat_query, outputs)
+    remove_special_characters_output = remove_special_characters(outputs)
+    _chatbot[-1] = (chat_query, _parse_text(remove_special_characters_output))
+    print("query", chat_query)
+    print("task_history", task_history)
+    print(_chatbot)
+    print("answer: ", outputs)
+    yield _chatbot
+
+
+def add_text(history, task_history, text):
+    task_text = text
+    if len(text) >= 2 and text[-1] in PUNCTUATION and text[-2] not in PUNCTUATION:
+        task_text = text[:-1]
+    history = history + [(_parse_text(text), None)]
+    task_history = task_history + [(task_text, None)]
+    return history, task_history, ""
+
+def add_file(history, task_history, file):
+    history = history + [((file.name,), None)]
+    task_history = task_history + [((file.name,), None)]
+    return history, task_history
+
+def add_audio(history, task_history, file):
+    print(file)
+    if file is None:
+        return history, task_history
+    history = history + [((file,), None)]
+    task_history = task_history + [((file,), None)]
+    return history, task_history
+
+def add_video(history, task_history, file):
+    print(file)
+    if file is None:
+        return history, task_history
+    new_file_name = file.replace(".webm", ".mp4")
+    if file.endswith(".webm"):
+        convert_webm_to_mp4(file, new_file_name)
+    task_history = task_history + [((new_file_name,), None)]
+    return history, task_history
+
+
+def reset_user_input():
+    return gr.update(value="")
+
+def reset_state(task_history):
+    task_history.clear()
+    return []
 
 @spaces.GPU
-def greet(n):
-    print(zero.device) # <-- 'cuda:0' 🤗
-    return f"Hello {zero + n} Tensor"
+def stream_audio_output(history, task_history):
+    text = task_history[-1][-1]
+    if not text:
+        # import pdb;pdb.set_trace()
+        yield None, None
+    llm_resounse = replace_equation(remove_special_characters(text))
+    # print('tts_text', llm_resounse)
+    for idx, text in enumerate(split_into_sentences(llm_resounse)):
+        embeddings = llm_embedding(torch.tensor(tokenizer.encode(text)).cuda())
+        for seg in tts.run(embeddings.reshape(-1, 896).unsqueeze(0), decoder_topk,
+                           None,
+                           codec_chunk_size, codec_padding_size):
+            if idx == 0:
+                try:
+                    split_idx = torch.nonzero(seg.abs() > 0.03, as_tuple=True)[-1][0]
+                    seg = seg[:, :, split_idx:]
+                except:
+                    print('Do not need to split')
+                    pass
+
+            if seg is not None and len(seg) > 0:
+                seg = seg.to(torch.float32).cpu().numpy()
+                yield 24000, float_to_int16(seg).T
+
+
+with gr.Blocks(title="VideoMLLM") as demo:
+    gr.Markdown("""
+VITA
""") + chatbot = gr.Chatbot(label='VITA', elem_classes="control-height", height=500) + query = gr.Textbox(lines=2, label='Text Input') + task_history = gr.State([]) + with gr.Row(): + add_text_button = gr.Button("Submit Text (提交文本)") + add_audio_button = gr.Button("Submit Audio (提交音频)") + with gr.Row(): + with gr.Column(scale=2): + addfile_btn = gr.UploadButton("📁 Upload (上传文件[视频,图片])", file_types=["video", "image"]) + video_input = gr.Video(sources=[ "webcam"], height=400, width=700, container=True, interactive=True, show_download_button=True, label="📹 Video Recording (视频录制)") + + with gr.Column(scale=1): + empty_bin = gr.Button("🧹 Clear History (清除历史)") + record_btn = gr.Audio(sources=[ "microphone","upload"], type="filepath", label="🎤 Record or Upload Audio (录音或上传音频)", show_download_button=True, waveform_options=gr.WaveformOptions(sample_rate=16000)) + audio_output = gr.Audio( + label="Output Audio", + value=None, + format= "wav", + autoplay=True, + streaming=True, + interactive=False, + show_label=True, + waveform_options=gr.WaveformOptions( + sample_rate=24000, + ), + ) + + + add_text_button.click(add_text, [chatbot, task_history, query], [chatbot, task_history], show_progress=True).then( + reset_user_input, [], [query] + ).then( + predict, [chatbot, task_history], [chatbot], show_progress=True + ).then( + stream_audio_output,[chatbot, task_history], [audio_output], + ) + + + video_input.stop_recording(add_video, [chatbot, task_history, video_input], [chatbot, task_history], show_progress=True) + empty_bin.click(reset_state, [task_history], [chatbot], show_progress=True) + addfile_btn.upload(add_file, [chatbot, task_history, addfile_btn], [chatbot, task_history], show_progress=True) + + + + add_audio_button.click(add_audio, [chatbot, task_history,record_btn], [chatbot, task_history], show_progress=True).then( + predict, [chatbot, task_history], [chatbot], show_progress=True + ).then( + stream_audio_output,[chatbot, task_history], [audio_output], + ) + -demo = gr.Interface(fn=greet, inputs=gr.Number(), outputs=gr.Text()) -demo.launch() +demo.launch(server_port=18806) diff --git a/vita/config/__init__.py b/vita/config/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..57a4e3cd2742b36618ee8838e7573fdfa06b3939 --- /dev/null +++ b/vita/config/__init__.py @@ -0,0 +1,10 @@ +from .dataset_config import * + +NaturalCap0 = [ShareGPT4V0] +NaturalCap = [ShareGPT4V] + +DataConfig = { + "Pretrain_video": NaturalCap0, +} + +NoPatchSets = ["khair", "jester"] diff --git a/vita/config/dataset_config.py b/vita/config/dataset_config.py new file mode 100644 index 0000000000000000000000000000000000000000..eac21c8e3422ba71a6184dd0f45743b6be2c9153 --- /dev/null +++ b/vita/config/dataset_config.py @@ -0,0 +1,8 @@ +AudioFolder = "" +FolderDict = { + #### NaturalCap + "sharegpt4": "", +} +#### NaturalCap +ShareGPT4V = {"chat_path": ""} +ShareGPT4V0 = {"chat_path": ""} diff --git a/vita/constants.py b/vita/constants.py new file mode 100644 index 0000000000000000000000000000000000000000..8302610d120eec46970b3d174fedc395ea2f376b --- /dev/null +++ b/vita/constants.py @@ -0,0 +1,14 @@ +# Model Constants +MAX_IMAGE_LENGTH = 16 # 8#16#32#64 +MIN_IMAGE_LENGTH = 4 +IGNORE_INDEX = -100 +IMAGE_TOKEN_INDEX = -200 +AUDIO_TOKEN_INDEX = -500 +DEFAULT_IMAGE_TOKEN = "" +DEFAULT_VIDEO_TOKEN = "