dongyh20 committed
Commit 1938217 · 1 Parent(s): a4679af

update space

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. app.py +382 -0
  2. ola/CosyVoice +1 -0
  3. ola/__pycache__/arguments.cpython-310.pyc +0 -0
  4. ola/__pycache__/arguments.cpython-38.pyc +0 -0
  5. ola/__pycache__/constants.cpython-310.pyc +0 -0
  6. ola/__pycache__/constants.cpython-38.pyc +0 -0
  7. ola/__pycache__/conversation.cpython-310.pyc +0 -0
  8. ola/__pycache__/conversation.cpython-38.pyc +0 -0
  9. ola/__pycache__/mm_utils.cpython-310.pyc +0 -0
  10. ola/__pycache__/mm_utils.cpython-38.pyc +0 -0
  11. ola/__pycache__/utils.cpython-310.pyc +0 -0
  12. ola/__pycache__/utils.cpython-38.pyc +0 -0
  13. ola/arguments.py +65 -0
  14. ola/constants.py +14 -0
  15. ola/conversation.py +254 -0
  16. ola/datasets/__init__.py +0 -0
  17. ola/datasets/__pycache__/__init__.cpython-310.pyc +0 -0
  18. ola/datasets/__pycache__/__init__.cpython-38.pyc +0 -0
  19. ola/datasets/__pycache__/preprocess.cpython-310.pyc +0 -0
  20. ola/datasets/__pycache__/preprocess.cpython-38.pyc +0 -0
  21. ola/datasets/preprocess.py +413 -0
  22. ola/mm_utils.py +272 -0
  23. ola/model/__init__.py +1 -0
  24. ola/model/__pycache__/__init__.cpython-310.pyc +0 -0
  25. ola/model/__pycache__/__init__.cpython-38.pyc +0 -0
  26. ola/model/__pycache__/builder.cpython-310.pyc +0 -0
  27. ola/model/__pycache__/builder.cpython-38.pyc +0 -0
  28. ola/model/__pycache__/ola_arch.cpython-310.pyc +0 -0
  29. ola/model/__pycache__/ola_arch.cpython-38.pyc +0 -0
  30. ola/model/builder.py +91 -0
  31. ola/model/language_model/__pycache__/ola_qwen.cpython-310.pyc +0 -0
  32. ola/model/language_model/__pycache__/ola_qwen.cpython-38.pyc +0 -0
  33. ola/model/language_model/ola_qwen.py +237 -0
  34. ola/model/multimodal_encoder/__pycache__/builder.cpython-310.pyc +0 -0
  35. ola/model/multimodal_encoder/__pycache__/builder.cpython-38.pyc +0 -0
  36. ola/model/multimodal_encoder/__pycache__/oryx_vit.cpython-310.pyc +0 -0
  37. ola/model/multimodal_encoder/__pycache__/oryx_vit.cpython-38.pyc +0 -0
  38. ola/model/multimodal_encoder/builder.py +9 -0
  39. ola/model/multimodal_encoder/oryx_vit.py +1126 -0
  40. ola/model/multimodal_projector/__pycache__/builder.cpython-310.pyc +0 -0
  41. ola/model/multimodal_projector/__pycache__/builder.cpython-38.pyc +0 -0
  42. ola/model/multimodal_projector/__pycache__/pooler_projector.cpython-310.pyc +0 -0
  43. ola/model/multimodal_projector/__pycache__/pooler_projector.cpython-38.pyc +0 -0
  44. ola/model/multimodal_projector/builder.py +179 -0
  45. ola/model/multimodal_projector/pooler_projector.py +74 -0
  46. ola/model/multimodal_resampler/__pycache__/builder.cpython-310.pyc +0 -0
  47. ola/model/multimodal_resampler/__pycache__/builder.cpython-38.pyc +0 -0
  48. ola/model/multimodal_resampler/__pycache__/perceiver.cpython-310.pyc +0 -0
  49. ola/model/multimodal_resampler/__pycache__/perceiver.cpython-38.pyc +0 -0
  50. ola/model/multimodal_resampler/builder.py +24 -0
app.py ADDED
@@ -0,0 +1,382 @@
import os

# These flags are read at import time by ola.mm_utils and the related model code,
# so they must be set before the ola imports below.
os.environ['LOWRES_RESIZE'] = '384x32'
os.environ['HIGHRES_BASE'] = '0x32'
os.environ['VIDEO_RESIZE'] = "0x64"
os.environ['VIDEO_MAXRES'] = "480"
os.environ['VIDEO_MINRES'] = "288"
os.environ['MAXRES'] = '1536'
os.environ['MINRES'] = '0'
os.environ['REGIONAL_POOL'] = '2x'
os.environ['FORCE_NO_DOWNSAMPLE'] = '1'
os.environ['LOAD_VISION_EARLY'] = '1'
os.environ['SKIP_LOAD_VIT'] = '1'


import gradio as gr
import torch
import re
from decord import VideoReader, cpu
from PIL import Image
import numpy as np
import transformers
import moviepy.editor as mp
from typing import Dict, Optional, Sequence, List
import librosa
import whisper

# import subprocess
# subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)

import sys
sys.path.append('./ola/CosyVoice/')
from ola.conversation import conv_templates, SeparatorStyle
from ola.model.builder import load_pretrained_model
from ola.utils import disable_torch_init
from ola.datasets.preprocess import tokenizer_image_token, tokenizer_speech_image_token, tokenizer_speech_question_image_token
from ola.mm_utils import get_model_name_from_path, KeywordsStoppingCriteria, process_anyres_video, process_anyres_highres_image_genli
from ola.constants import IGNORE_INDEX, DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX, DEFAULT_SPEECH_TOKEN
# from ola.CosyVoice.cosyvoice.cli.cosyvoice import CosyVoice

model_path = "/mnt/lzy/ola-model/Ola-7b"
tokenizer, model, image_processor, _ = load_pretrained_model(model_path, None)
model = model.to('cuda').eval()
model = model.bfloat16()

# tts_model = CosyVoice('CosyVoice/pretrained_models/CosyVoice-300M-SFT', load_jit=True, load_onnx=False, fp16=True)
# OUTPUT_SPEECH = False

USE_SPEECH = False

title_markdown = """
<div style="display: flex; justify-content: left; align-items: center; text-align: left; background: linear-gradient(45deg, rgba(204,255,231, 0.8), rgba(204,255,231, 0.3)); border-radius: 10px; box-shadow: 0 8px 16px 0 rgba(0,0,0,0.1);"> <a href="https://llava-vl.github.io/blog/2024-04-30-llava-next-video/" style="margin-right: 20px; text-decoration: none; display: flex; align-items: center;">
  <img src="https://ola-omni.github.io/static/images/icon.png" alt="Oryx" style="max-width: 80px; height: auto; border-radius: 10px;">
</a>
<div>
<h2><a href="https://github.com/Ola-Omni/Ola">Ola: Pushing the Frontiers of Omni-Modal Language Model with Progressive Modality Alignment</a></h2>
<h5 style="margin: 0;"><a href="https://ola-omni.github.io/">Project Page</a> | <a href="https://github.com/Ola-Omni/Ola">Github</a> | <a href="https://huggingface.co/THUdyh/Ola-7b">Huggingface</a> | <a href="https://arxiv.org/abs/2502.04328">Paper</a></h5>
</div>
</div>
"""

bibtext = """
### Citation
```
@article{liu2025ola,
  title={Ola: Pushing the Frontiers of Omni-Modal Language Model with Progressive Modality Alignment},
  author={Liu, Zuyan and Dong, Yuhao and Wang, Jiahui and Liu, Ziwei and Hu, Winston and Lu, Jiwen and Rao, Yongming},
  journal={arXiv preprint arXiv:2502.04328},
  year={2025}
}
```
"""
cur_dir = os.path.dirname(os.path.abspath(__file__))


def load_audio(audio_file_name):
    # Load a 16 kHz waveform, split it into 30 s (480000-sample) chunks,
    # and return Whisper log-mel features plus the raw chunks.
    speech_wav, samplerate = librosa.load(audio_file_name, sr=16000)
    if len(speech_wav.shape) > 1:
        speech_wav = speech_wav[:, 0]
    speech_wav = speech_wav.astype(np.float32)
    CHUNK_LIM = 480000
    SAMPLE_RATE = 16000
    speechs = []
    speech_wavs = []

    if len(speech_wav) <= CHUNK_LIM:
        speech = whisper.pad_or_trim(speech_wav)
        speech_wav = whisper.pad_or_trim(speech_wav)
        speechs.append(speech)
        speech_wavs.append(torch.from_numpy(speech_wav).unsqueeze(0))
    else:
        for i in range(0, len(speech_wav), CHUNK_LIM):
            chunk = speech_wav[i : i + CHUNK_LIM]
            if len(chunk) < CHUNK_LIM:
                chunk = whisper.pad_or_trim(chunk)
            speechs.append(chunk)
            speech_wavs.append(torch.from_numpy(chunk).unsqueeze(0))
    mels = []
    for chunk in speechs:
        chunk = whisper.log_mel_spectrogram(chunk, n_mels=128).permute(1, 0).unsqueeze(0)
        mels.append(chunk)

    mels = torch.cat(mels, dim=0)
    speech_wavs = torch.cat(speech_wavs, dim=0)
    if mels.shape[0] > 25:
        mels = mels[:25]
        speech_wavs = speech_wavs[:25]

    speech_length = torch.LongTensor([mels.shape[1]] * mels.shape[0])
    speech_chunks = torch.LongTensor([mels.shape[0]])
    return mels, speech_length, speech_chunks, speech_wavs

def extract_audio(videos_file_path):
    my_clip = mp.VideoFileClip(videos_file_path)
    return my_clip.audio

def ola_inference(multimodal, audio_path):
    visual, text = multimodal["files"][0], multimodal["text"]
    if visual.endswith("image2.png"):
        modality = "video"
        visual = f"{cur_dir}/case/case1.mp4"
    if visual.endswith(".mp4"):
        modality = "video"
    else:
        modality = "image"

    # If a standalone audio clip is provided, use it as the speech input;
    # otherwise, for video inputs, parse the audio track of the video.
    if audio_path:
        USE_SPEECH = True
    elif modality == "video":
        USE_SPEECH = True
    else:
        USE_SPEECH = False

    speechs = []
    speech_lengths = []
    speech_wavs = []
    speech_chunks = []
    if modality == "video":
        vr = VideoReader(visual, ctx=cpu(0))
        total_frame_num = len(vr)
        fps = round(vr.get_avg_fps())
        uniform_sampled_frames = np.linspace(0, total_frame_num - 1, 64, dtype=int)
        frame_idx = uniform_sampled_frames.tolist()
        spare_frames = vr.get_batch(frame_idx).asnumpy()
        video = [Image.fromarray(frame) for frame in spare_frames]
    else:
        image = [Image.open(visual)]
        image_sizes = [image[0].size]

    if USE_SPEECH and audio_path:
        speech, speech_length, speech_chunk, speech_wav = load_audio(audio_path)
        speechs.append(speech.bfloat16().to('cuda'))
        speech_lengths.append(speech_length.to('cuda'))
        speech_chunks.append(speech_chunk.to('cuda'))
        speech_wavs.append(speech_wav.to('cuda'))
        print('load audio')
    elif USE_SPEECH and not audio_path:
        # parse the audio track of the video
        audio = extract_audio(visual)
        audio.write_audiofile("./video_audio.wav")
        video_audio_path = './video_audio.wav'
        speech, speech_length, speech_chunk, speech_wav = load_audio(video_audio_path)
        speechs.append(speech.bfloat16().to('cuda'))
        speech_lengths.append(speech_length.to('cuda'))
        speech_chunks.append(speech_chunk.to('cuda'))
        speech_wavs.append(speech_wav.to('cuda'))
    else:
        # no speech input: feed zero-filled placeholders
        speechs = [torch.zeros(1, 3000, 128).bfloat16().to('cuda')]
        speech_lengths = [torch.LongTensor([3000]).to('cuda')]
        speech_wavs = [torch.zeros([1, 480000]).to('cuda')]
        speech_chunks = [torch.LongTensor([1]).to('cuda')]

    conv_mode = "qwen_1_5"
    if text:
        qs = text
    else:
        qs = ''
    if USE_SPEECH and audio_path:
        qs = DEFAULT_IMAGE_TOKEN + "\n" + "User's question in speech: " + DEFAULT_SPEECH_TOKEN + '\n'
    elif USE_SPEECH:
        qs = DEFAULT_SPEECH_TOKEN + DEFAULT_IMAGE_TOKEN + "\n" + qs
    else:
        qs = DEFAULT_IMAGE_TOKEN + "\n" + qs

    conv = conv_templates[conv_mode].copy()
    conv.append_message(conv.roles[0], qs)
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()
    if USE_SPEECH and audio_path:
        input_ids = tokenizer_speech_question_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to('cuda')
    elif USE_SPEECH:
        input_ids = tokenizer_speech_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to('cuda')
    else:
        input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to('cuda')

    if modality == "video":
        video_processed = []
        for idx, frame in enumerate(video):
            image_processor.do_resize = False
            image_processor.do_center_crop = False
            frame = process_anyres_video(frame, image_processor)

            if frame_idx is not None and idx in frame_idx:
                video_processed.append(frame.unsqueeze(0))
            elif frame_idx is None:
                video_processed.append(frame.unsqueeze(0))

        if frame_idx is None:
            frame_idx = np.arange(0, len(video_processed), dtype=int).tolist()

        video_processed = torch.cat(video_processed, dim=0).bfloat16().to("cuda")
        video_processed = (video_processed, video_processed)

        video_data = (video_processed, (384, 384), "video")
    else:
        image_processor.do_resize = False
        image_processor.do_center_crop = False
        image_tensor, image_highres_tensor = [], []
        for visual in image:
            image_tensor_, image_highres_tensor_ = process_anyres_highres_image_genli(visual, image_processor)
            image_tensor.append(image_tensor_)
            image_highres_tensor.append(image_highres_tensor_)
        if all(x.shape == image_tensor[0].shape for x in image_tensor):
            image_tensor = torch.stack(image_tensor, dim=0)
        if all(x.shape == image_highres_tensor[0].shape for x in image_highres_tensor):
            image_highres_tensor = torch.stack(image_highres_tensor, dim=0)
        if type(image_tensor) is list:
            image_tensor = [_image.bfloat16().to("cuda") for _image in image_tensor]
        else:
            image_tensor = image_tensor.bfloat16().to("cuda")
        if type(image_highres_tensor) is list:
            image_highres_tensor = [_image.bfloat16().to("cuda") for _image in image_highres_tensor]
        else:
            image_highres_tensor = image_highres_tensor.bfloat16().to("cuda")

    pad_token_ids = 151643  # Qwen2 "<|endoftext|>" token, used as the pad id

    attention_masks = input_ids.ne(pad_token_ids).long().to('cuda')
    stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
    keywords = [stop_str]
    stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)

    gen_kwargs = {}

    if "max_new_tokens" not in gen_kwargs:
        gen_kwargs["max_new_tokens"] = 1024
    if "temperature" not in gen_kwargs:
        gen_kwargs["temperature"] = 0.2
    if "top_p" not in gen_kwargs:
        gen_kwargs["top_p"] = None
    if "num_beams" not in gen_kwargs:
        gen_kwargs["num_beams"] = 1

    with torch.inference_mode():
        if modality == "video":
            output_ids = model.generate(
                inputs=input_ids,
                images=video_data[0][0],
                images_highres=video_data[0][1],
                modalities=video_data[2],
                speech=speechs,
                speech_lengths=speech_lengths,
                speech_chunks=speech_chunks,
                speech_wav=speech_wavs,
                attention_mask=attention_masks,
                use_cache=True,
                stopping_criteria=[stopping_criteria],
                do_sample=True if gen_kwargs["temperature"] > 0 else False,
                temperature=gen_kwargs["temperature"],
                top_p=gen_kwargs["top_p"],
                num_beams=gen_kwargs["num_beams"],
                max_new_tokens=gen_kwargs["max_new_tokens"],
            )
        else:
            output_ids = model.generate(
                inputs=input_ids,
                images=image_tensor,
                images_highres=image_highres_tensor,
                image_sizes=image_sizes,
                modalities=['image'],
                speech=speechs,
                speech_lengths=speech_lengths,
                speech_chunks=speech_chunks,
                speech_wav=speech_wavs,
                attention_mask=attention_masks,
                use_cache=True,
                stopping_criteria=[stopping_criteria],
                do_sample=True if gen_kwargs["temperature"] > 0 else False,
                temperature=gen_kwargs["temperature"],
                top_p=gen_kwargs["top_p"],
                num_beams=gen_kwargs["num_beams"],
                max_new_tokens=gen_kwargs["max_new_tokens"],
            )

    outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
    outputs = outputs.strip()
    if outputs.endswith(stop_str):
        outputs = outputs[:-len(stop_str)]
    outputs = outputs.strip()

    # if OUTPUT_SPEECH:
    #     voice_all = []
    #     for i, j in enumerate(cosyvoice.inference_sft('Visual data comes in various forms, ranging from small icons of just a few pixels to long videos spanning hours. Existing multi-modal LLMs usually standardize these diverse visual inputs to a fixed resolution for visual encoders and yield similar numbers of tokens for LLMs. This approach is non-optimal for multimodal understanding and inefficient for processing inputs with long and short visual contents. To solve the problem, we propose Oryx, a unified multimodal architecture for the spatial-temporal understanding of images, videos, and multi-view 3D scenes. Oryx offers an on-demand solution to seamlessly and efficiently process visual inputs with arbitrary spatial sizes and temporal lengths through two core innovations: 1) a pre-trained OryxViT model that can encode images at any resolution into LLM-friendly visual representations; 2) a dynamic compressor module that supports 1x to 16x compression on visual tokens by request. These design features enable Oryx to accommodate extremely long visual contexts, such as videos, with lower resolution and high compression while maintaining high recognition precision for tasks like document understanding with native resolution and no compression. Beyond the architectural improvements, enhanced data curation and specialized training on long-context retrieval and spatial-aware data help Oryx achieve strong capabilities in image, video, and 3D multimodal understanding simultaneously. ', '英文女', stream=False)):
    #         voice_all.append(j['tts_speech'])
    #     voice_all = torch.cat(voice_all, dim=1)
    #     torchaudio.save('sft.wav', voice_all, 22050)
    #     return outputs, "sft.wav"
    # else:
    return outputs, None

# Define input and output for the Gradio interface
demo = gr.Interface(
    fn=ola_inference,
    inputs=[gr.MultimodalTextbox(file_types=[".mp4", "image"], placeholder="Enter message or upload file..."), gr.Audio(type="filepath")],
    outputs=["text", "audio"],
    # examples=[
    #     {
    #         "files": [f"{cur_dir}/case/image2.png"],
    #         "text": "Describe what is happening in this video in detail.",
    #     },
    #     {
    #         "files": [f"{cur_dir}/case/image.png"],
    #         "text": "Describe this icon.",
    #     },
    # ],
    title="Ola Demo",
    description=title_markdown,
    article=bibtext,
)

# textbox = gr.Textbox(
#     show_label=False, placeholder="Enter text and press ENTER", container=False, max_lines=100
# )
# with gr.Blocks(
#     title="Oryx-7B",
#     theme="finlaymacklon/smooth_slate",
#     css=".message-wrap.svelte-1lcyrx4>div.svelte-1lcyrx4 img {min-width: 50px}",
#     fill_height=True
# ) as demo:
#     html_header = "https://oryx-mllm.github.io/"
#     gr.HTML(html_header)

#     with gr.Row(equal_height=True):
#         with gr.Column(scale=3):
#             with gr.Row():
#                 video = gr.Video(label="Input Video", height=400)
#             cur_dir = os.path.dirname(os.path.abspath(__file__))
#             with gr.Row():
#                 gr.Examples(
#                     examples=[
#                         [
#                             f"{cur_dir}/case/case1.mp4",
#                             "Describe what is happening in this video in detail.",
#                         ],
#                     ],
#                     inputs=[video, textbox],
#                 )

#         with gr.Column(scale=7):
#             chatbot = gr.Chatbot(label="Oryx", bubble_full_width=False, height=660)
#             with gr.Row():
#                 with gr.Column(scale=8):
#                     textbox.render()
#                 with gr.Column(scale=1, min_width=50):
#                     submit_btn = gr.Button(
#                         value="Send", variant="primary", interactive=True
#                     )
#     # with gr.Row(elem_id="buttons") as button_row:
#     #     upvote_btn = gr.Button(value="👍 Upvote", interactive=True)
#     #     downvote_btn = gr.Button(value="👎 Downvote", interactive=True)
#     #     flag_btn = gr.Button(value="⚠️ Flag", interactive=True)
#     #     clear_btn = gr.Button(value="🗑️ Clear history", interactive=True)

#     submit_btn.click(
#         oryx_inference,
#         [video, textbox],
#         [chatbot, textbox, video],
#     )

# Launch the Gradio app
demo.launch(server_name="0.0.0.0", server_port=80)
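
For reference, a minimal sketch of how the `ola_inference` entry point above can be exercised directly, bypassing `gr.Interface`. It assumes the checkpoint configured in `model_path` is available locally; the `case/case1.mp4` path and the question text are illustrative placeholders, not part of the commit.

# Illustrative only: call ola_inference() the way the Gradio MultimodalTextbox would.
if __name__ == "__main__":
    request = {
        "files": ["case/case1.mp4"],  # a video (or image) input
        "text": "Describe what is happening in this video in detail.",
    }
    answer, speech_out = ola_inference(request, audio_path=None)
    print(answer)       # decoded text response
    print(speech_out)   # None unless the commented-out CosyVoice branch is re-enabled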
ola/CosyVoice ADDED
@@ -0,0 +1 @@
Subproject commit 027e1ccb82ce59bbc12f35a96e0f92625cf18369
ola/__pycache__/arguments.cpython-310.pyc ADDED
Binary file (2.65 kB).
ola/__pycache__/arguments.cpython-38.pyc ADDED
Binary file (2.64 kB).
ola/__pycache__/constants.cpython-310.pyc ADDED
Binary file (508 Bytes).
ola/__pycache__/constants.cpython-38.pyc ADDED
Binary file (506 Bytes).
ola/__pycache__/conversation.cpython-310.pyc ADDED
Binary file (6.21 kB).
ola/__pycache__/conversation.cpython-38.pyc ADDED
Binary file (6.28 kB).
ola/__pycache__/mm_utils.cpython-310.pyc ADDED
Binary file (6.44 kB).
ola/__pycache__/mm_utils.cpython-38.pyc ADDED
Binary file (6.41 kB).
ola/__pycache__/utils.cpython-310.pyc ADDED
Binary file (7.5 kB).
ola/__pycache__/utils.cpython-38.pyc ADDED
Binary file (7.53 kB).
ola/arguments.py ADDED
@@ -0,0 +1,65 @@
import transformers

from dataclasses import dataclass, field
from typing import Optional


@dataclass
class ModelArguments:
    model_name_or_path: Optional[str] = field(default="facebook/opt-125m")
    version: Optional[str] = field(default="v0")
    freeze_backbone: bool = field(default=False)
    tune_speech_projector: bool = field(default=False)
    tune_speech_encoder: bool = field(default=False)
    tune_speech_generator_only: bool = field(default=False)
    speech_encoder_type: Optional[str] = field(default=None)
    speech_encoder: Optional[str] = field(default=None)
    pretrain_speech_projector: Optional[str] = field(default=None)
    speech_projector_type: Optional[str] = field(default='linear')
    speech_encoder_ds_rate: int = 5
    speech_encoder_hidden_size: int = 1280


@dataclass
class DataArguments:
    data_path: str = field(default=None,
                           metadata={"help": "Path to the training data."})
    is_multimodal: bool = False
    input_type: str = field(default="mel")
    speech_normalize: bool = False
    mel_size: int = 128
    has_tgt_units: bool = False


@dataclass
class TrainingArguments(transformers.TrainingArguments):
    cache_dir: Optional[str] = field(default=None)
    optim: str = field(default="adamw_torch")
    freeze_speech_projector: bool = field(default=False)
    model_max_length: int = field(
        default=512,
        metadata={
            "help":
            "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
        },
    )
    double_quant: bool = field(
        default=True,
        metadata={"help": "Compress the quantization statistics through double quantization."}
    )
    quant_type: str = field(
        default="nf4",
        metadata={"help": "Quantization data type to use. Should be one of `fp4` or `nf4`."}
    )
    bits: int = field(
        default=16,
        metadata={"help": "How many bits to use."}
    )
    lora_enable: bool = False
    lora_r: int = 64
    lora_alpha: int = 16
    lora_dropout: float = 0.05
    lora_weight_path: str = ""
    lora_bias: str = "none"
    speech_projector_lr: Optional[float] = None
    group_by_modality_length: bool = field(default=False)
ola/constants.py ADDED
@@ -0,0 +1,14 @@
CONTROLLER_HEART_BEAT_EXPIRATION = 30
WORKER_HEART_BEAT_INTERVAL = 15

LOGDIR = "."

# Model Constants
IGNORE_INDEX = -100
SPEECH_TOKEN_INDEX = -200
DEFAULT_SPEECH_TOKEN = "<speech>"
IMAGE_TOKEN_INDEX = -300
DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
DEFAULT_IM_START_TOKEN = "<im_start>"
DEFAULT_IM_END_TOKEN = "<im_end>"
ola/conversation.py ADDED
@@ -0,0 +1,254 @@
1
+ import dataclasses
2
+ from enum import auto, Enum
3
+ from typing import List, Any, Union, Tuple
4
+ import base64
5
+ from io import BytesIO
6
+ from PIL import Image
7
+
8
+
9
+ class SeparatorStyle(Enum):
10
+ """Different separator style."""
11
+ TWO = auto()
12
+ PLAIN = auto()
13
+ CHATML = auto()
14
+ LLAMA_2 = auto()
15
+ LLAMA_3 = auto()
16
+ QWEN2 = auto()
17
+
18
+
19
+ @dataclasses.dataclass
20
+ class Conversation:
21
+ """A class that keeps all conversation history."""
22
+ system: str
23
+ roles: List[str]
24
+ messages: List[List[str]]
25
+ offset: int
26
+ sep_style: SeparatorStyle = SeparatorStyle.PLAIN
27
+ sep: str = "###"
28
+ sep2: str = None
29
+ version: str = "Unknown"
30
+
31
+ tokenizer_id: str = ""
32
+ tokenizer: Any = None
33
+ # Stop criteria (the default one is EOS token)
34
+ stop_str: Union[str, List[str]] = None
35
+ # Stops generation if meeting any token in this list
36
+ stop_token_ids: List[int] = None
37
+
38
+ skip_next: bool = False
39
+
40
+ def get_prompt(self):
41
+ messages = self.messages
42
+
43
+ if self.sep_style == SeparatorStyle.TWO:
44
+ seps = [self.sep, self.sep2]
45
+ ret = self.system + seps[0]
46
+ for i, (role, message) in enumerate(messages):
47
+ if message:
48
+ if type(message) is tuple:
49
+ message = message[0]
50
+ ret += role + ": " + message + seps[i % 2]
51
+ else:
52
+ ret += role + ":"
53
+ elif self.sep_style == SeparatorStyle.LLAMA_3:
54
+ wrap_sys = lambda msg: f"<|start_header_id|>system<|end_header_id|>\n\n{msg}<|eot_id|>" if len(msg) > 0 else msg
55
+ ret = "<|begin_of_text|>" + wrap_sys(self.system)
56
+ for i, (role, message) in enumerate(messages):
57
+ if message:
58
+ if type(message) is tuple:
59
+ message = message[0]
60
+ ret += f"<|start_header_id|>{role}<|end_header_id|>\n\n"
61
+ ret += message.strip() + self.sep2
62
+ else:
63
+ ret += f"<|start_header_id|>{role}<|end_header_id|>\n\n"
64
+ return ret
65
+ elif self.sep_style == SeparatorStyle.LLAMA_2:
66
+ wrap_sys = lambda msg: f"<<SYS>>\n{msg}\n<</SYS>>\n\n" if len(msg) > 0 else msg
67
+ wrap_inst = lambda msg: f"[INST] {msg} [/INST]"
68
+ ret = ""
69
+
70
+ for i, (role, message) in enumerate(messages):
71
+ if i == 0:
72
+ assert message, "first message should not be none"
73
+ assert role == self.roles[0], "first message should come from user"
74
+ if message:
75
+ if type(message) is tuple:
76
+ message, _, _ = message
77
+ if i == 0:
78
+ message = wrap_sys(self.system) + message
79
+ if i % 2 == 0:
80
+ message = wrap_inst(message)
81
+ ret += self.sep + message
82
+ else:
83
+ ret += " " + message + " " + self.sep2
84
+ else:
85
+ ret += ""
86
+ ret = ret.lstrip(self.sep)
87
+ elif self.sep_style == SeparatorStyle.PLAIN:
88
+ seps = [self.sep, self.sep2]
89
+ ret = self.system
90
+ for i, (role, message) in enumerate(messages):
91
+ if message:
92
+ if type(message) is tuple:
93
+ message, _, _ = message
94
+ ret += message + seps[i % 2]
95
+ else:
96
+ ret += ""
97
+
98
+ elif self.sep_style == SeparatorStyle.CHATML:
99
+ ret = "" if self.system == "" else self.system + self.sep + "\n"
100
+ for role, message in messages:
101
+ if message:
102
+ if type(message) is tuple:
103
+ raise ValueError("Tuple not supported in CHATML")
104
+ message, images = message
105
+ message = "<speech>" * len(images) + message
106
+ ret += role + "\n" + message + self.sep + "\n"
107
+ else:
108
+ ret += role + "\n"
109
+ return ret
110
+ elif self.sep_style == SeparatorStyle.QWEN2:
111
+ start = '<|im_start|>'
112
+ end = '<|im_end|>\n'
113
+ ret = start + 'system\n' + self.system + end
114
+ for i, (role, message) in enumerate(messages):
115
+ if message:
116
+ if type(message) is tuple:
117
+ message, _, _ = message
118
+
119
+ if message.endswith('<|endoftext|>'):
120
+ message = message.replace('<|endoftext|>', '')
121
+ ret += start + role + "\n" + message + end + '<|endoftext|>'
122
+ else:
123
+ assert not '<|endoftext|>' in message, f"Invalid message: {message}"
124
+ ret += start + role + "\n" + message + end
125
+ else:
126
+ ret += start + role + "\n"
127
+ else:
128
+ raise ValueError(f"Invalid style: {self.sep_style}")
129
+
130
+ return ret
131
+
132
+ def append_message(self, role, message):
133
+ self.messages.append([role, message])
134
+
135
+ def to_gradio_chatbot(self):
136
+ ret = []
137
+ for i, (role, msg) in enumerate(self.messages[self.offset:]):
138
+ if i % 2 == 0:
139
+ if type(msg) is tuple:
140
+ msg, speech = msg
141
+ ret.append([msg, None])
142
+ else:
143
+ ret.append([msg, None])
144
+ else:
145
+ ret[-1][-1] = msg
146
+ return ret
147
+
148
+ def copy(self):
149
+ return Conversation(
150
+ system=self.system,
151
+ roles=self.roles,
152
+ messages=[[x, y] for x, y in self.messages],
153
+ offset=self.offset,
154
+ sep_style=self.sep_style,
155
+ sep=self.sep,
156
+ sep2=self.sep2,
157
+ version=self.version)
158
+
159
+ def dict(self):
160
+ if len(self.get_images()) > 0:
161
+ return {
162
+ "system": self.system,
163
+ "roles": self.roles,
164
+ "messages": [[x, y[0] if type(y) is tuple else y] for x, y in self.messages],
165
+ "offset": self.offset,
166
+ "sep": self.sep,
167
+ "sep2": self.sep2,
168
+ }
169
+ return {
170
+ "system": self.system,
171
+ "roles": self.roles,
172
+ "messages": self.messages,
173
+ "offset": self.offset,
174
+ "sep": self.sep,
175
+ "sep2": self.sep2,
176
+ }
177
+
178
+ conv_vicuna_v1 = Conversation(
179
+ system="A chat between a curious user and an artificial intelligence assistant. " "The assistant gives helpful, detailed, and polite answers to the user's questions.",
180
+ roles=("USER", "ASSISTANT"),
181
+ version="v1",
182
+ messages=[],
183
+ offset=0,
184
+ sep_style=SeparatorStyle.TWO,
185
+ sep=" ",
186
+ sep2="</s>",
187
+ )
188
+
189
+ conv_llama_2 = Conversation(
190
+ system="You are a helpful language and speech assistant. " "You are able to understand the speech content that the user provides, " "and assist the user with a variety of tasks using natural language.",
191
+ roles=("USER", "ASSISTANT"),
192
+ version="llama_v2",
193
+ messages=[],
194
+ offset=0,
195
+ sep_style=SeparatorStyle.LLAMA_2,
196
+ sep="<s>",
197
+ sep2="</s>",
198
+ )
199
+
200
+ conv_llama_3 = Conversation(
201
+ system="You are a helpful language and speech assistant. " "You are able to understand the speech content that the user provides, " "and assist the user with a variety of tasks using natural language.",
202
+ roles=("user", "assistant"),
203
+ version="llama_v3",
204
+ messages=[],
205
+ offset=0,
206
+ sep_style=SeparatorStyle.LLAMA_3,
207
+ sep="",
208
+ sep2="<|eot_id|>"
209
+ )
210
+
211
+
212
+ conv_qwen_v1 = Conversation(
213
+ system="You are a helpful assistant.",
214
+ roles=("user", "assistant"),
215
+ version="v1",
216
+ messages=(),
217
+ offset=0,
218
+ sep_style=SeparatorStyle.QWEN2,
219
+ )
220
+
221
+ conv_plain = Conversation(
222
+ system="",
223
+ roles=("", ""),
224
+ messages=(
225
+ ),
226
+ offset=0,
227
+ sep_style=SeparatorStyle.PLAIN,
228
+ sep="</s>",
229
+ )
230
+
231
+ conv_qwen = Conversation(
232
+ system="""<|im_start|>system
233
+ You are a helpful assistant.""",
234
+ roles=("<|im_start|>user", "<|im_start|>assistant"),
235
+ version="qwen",
236
+ messages=[],
237
+ offset=0,
238
+ sep_style=SeparatorStyle.CHATML,
239
+ sep="<|im_end|>",
240
+ )
241
+
242
+ default_conversation = conv_llama_3
243
+ conv_templates = {
244
+ "v1": conv_vicuna_v1,
245
+ "plain": conv_plain,
246
+ "llama_2": conv_llama_2,
247
+ "llama_3": conv_llama_3,
248
+ 'v1_qwen2': conv_qwen_v1,
249
+ "qwen_1_5": conv_qwen,
250
+ }
251
+
252
+
253
+ if __name__ == "__main__":
254
+ print(default_conversation.get_prompt())
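
As a usage note, a minimal sketch of how app.py drives the `qwen_1_5` (CHATML-style) template registered above; the question string is a placeholder.

from ola.conversation import conv_templates

conv = conv_templates["qwen_1_5"].copy()
conv.append_message(conv.roles[0], "<image>\nWhat is shown in this image?")
conv.append_message(conv.roles[1], None)  # leave the assistant turn open for generation
print(conv.get_prompt())
# Produces (roughly):
# <|im_start|>system
# You are a helpful assistant.<|im_end|>
# <|im_start|>user
# <image>
# What is shown in this image?<|im_end|>
# <|im_start|>assistant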
ola/datasets/__init__.py ADDED
File without changes
ola/datasets/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (135 Bytes).
ola/datasets/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (133 Bytes).
ola/datasets/__pycache__/preprocess.cpython-310.pyc ADDED
Binary file (10.2 kB).
ola/datasets/__pycache__/preprocess.cpython-38.pyc ADDED
Binary file (10.9 kB).
ola/datasets/preprocess.py ADDED
@@ -0,0 +1,413 @@
1
+ import copy
2
+ import torch
3
+ import transformers
4
+ import tokenizers
5
+
6
+ from typing import Dict, Sequence
7
+
8
+ from ola.constants import IGNORE_INDEX, DEFAULT_SPEECH_TOKEN, IMAGE_TOKEN_INDEX
9
+ from ola import conversation as conversation_lib
10
+ from ola.model import *
11
+ from ola.arguments import DataArguments
12
+ from ola.constants import SPEECH_TOKEN_INDEX
13
+
14
+ from packaging import version
15
+
16
+ IS_TOKENIZER_GREATER_THAN_0_14 = version.parse(tokenizers.__version__) >= version.parse('0.14')
17
+
18
+
19
+ def tokenizer_speech_token(prompt, tokenizer, speech_token_index=SPEECH_TOKEN_INDEX, return_tensors=None):
20
+ prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<speech>')]
21
+
22
+ def insert_separator(X, sep):
23
+ return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1]
24
+
25
+ input_ids = []
26
+ offset = 0
27
+ if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
28
+ offset = 1
29
+ input_ids.append(prompt_chunks[0][0])
30
+
31
+ for x in insert_separator(prompt_chunks, [speech_token_index] * (offset + 1)):
32
+ input_ids.extend(x[offset:])
33
+
34
+ if return_tensors is not None:
35
+ if return_tensors == 'pt':
36
+ return torch.tensor(input_ids, dtype=torch.long)
37
+ raise ValueError(f'Unsupported tensor type: {return_tensors}')
38
+ return input_ids
39
+
40
+
41
+ def preprocess_multimodal(
42
+ sources: Sequence[str],
43
+ data_args: DataArguments
44
+ ) -> Dict:
45
+ is_multimodal = data_args.is_multimodal
46
+ if not is_multimodal:
47
+ return sources
48
+
49
+ for source in sources:
50
+ for sentence in source:
51
+ if DEFAULT_SPEECH_TOKEN in sentence['value']:
52
+ sentence['value'] = sentence['value'].replace(DEFAULT_SPEECH_TOKEN, '').strip()
53
+ sentence['value'] = DEFAULT_SPEECH_TOKEN + '\n' + sentence['value']
54
+ sentence['value'] = sentence['value'].strip()
55
+
56
+ return sources
57
+
58
+ def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None):
59
+ prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<image>')]
60
+
61
+ def insert_separator(X, sep):
62
+ return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1]
63
+
64
+ input_ids = []
65
+ offset = 0
66
+ if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
67
+ offset = 1
68
+ input_ids.append(prompt_chunks[0][0])
69
+
70
+ for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):
71
+ input_ids.extend(x[offset:])
72
+
73
+ if return_tensors is not None:
74
+ if return_tensors == 'pt':
75
+ return torch.tensor(input_ids, dtype=torch.long)
76
+ raise ValueError(f'Unsupported tensor type: {return_tensors}')
77
+ return input_ids
78
+
79
+ def tokenizer_speech_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, speech_token_idx=SPEECH_TOKEN_INDEX, return_tensors=None):
80
+ prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split('<speech><image>')]
81
+
82
+ def insert_separator(X, sep):
83
+ return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1]
84
+
85
+ input_ids = []
86
+ offset = 0
87
+ if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
88
+ offset = 1
89
+ input_ids.append(prompt_chunks[0][0])
90
+
91
+ for x in insert_separator(prompt_chunks, [speech_token_idx, image_token_index] * (offset + 1)):
92
+ input_ids.extend(x[offset:])
93
+
94
+ if return_tensors is not None:
95
+ if return_tensors == 'pt':
96
+ return torch.tensor(input_ids, dtype=torch.long)
97
+ raise ValueError(f'Unsupported tensor type: {return_tensors}')
98
+ return input_ids
99
+
100
+ def tokenizer_speech_question_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, speech_token_idx=SPEECH_TOKEN_INDEX, return_tensors=None):
101
+ prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("<image>\nUser's question in speech: <speech>\n")]
102
+
103
+ def insert_separator(X, sep):
104
+ return [ele for sublist in zip(X, [sep]*len(X)) for ele in sublist][:-1]
105
+
106
+ input_ids = []
107
+ offset = 0
108
+ if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
109
+ offset = 1
110
+ input_ids.append(prompt_chunks[0][0])
111
+
112
+ nl_tokens = tokenizer("\n").input_ids[0]
113
+ special_chunks = [image_token_index, nl_tokens]
114
+ special_chunks.extend(tokenizer("User's question in speech: ").input_ids)
115
+ special_chunks.extend([speech_token_idx, nl_tokens])
116
+
117
+ for x in insert_separator(prompt_chunks, special_chunks):
118
+ input_ids.extend(x[offset:])
119
+
120
+ if return_tensors is not None:
121
+ if return_tensors == 'pt':
122
+ return torch.tensor(input_ids, dtype=torch.long)
123
+ raise ValueError(f'Unsupported tensor type: {return_tensors}')
124
+ return input_ids
125
+
126
+ def preprocess_llama_2(
127
+ sources,
128
+ tokenizer: transformers.PreTrainedTokenizer,
129
+ has_speech: bool = False
130
+ ) -> Dict:
131
+ conv = conversation_lib.default_conversation.copy()
132
+ roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
133
+
134
+ # Apply prompt templates
135
+ conversations = []
136
+ for i, source in enumerate(sources):
137
+ if roles[source[0]["from"]] != conv.roles[0]:
138
+ # Skip the first one if it is not from human
139
+ source = source[1:]
140
+
141
+ conv.messages = []
142
+ for j, sentence in enumerate(source):
143
+ role = roles[sentence["from"]]
144
+ assert role == conv.roles[j % 2], f"{i}"
145
+ conv.append_message(role, sentence["value"])
146
+ conversations.append(conv.get_prompt())
147
+
148
+ # Tokenize conversations
149
+
150
+ if has_speech:
151
+ input_ids = torch.stack([tokenizer_speech_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0)
152
+ else:
153
+ input_ids = tokenizer(
154
+ conversations,
155
+ return_tensors="pt",
156
+ padding="longest",
157
+ max_length=tokenizer.model_max_length,
158
+ truncation=True,
159
+ ).input_ids
160
+
161
+ targets = input_ids.clone()
162
+
163
+ assert conv.sep_style == conversation_lib.SeparatorStyle.LLAMA_2
164
+
165
+ # Mask targets
166
+ sep = "[/INST] "
167
+ for conversation, target in zip(conversations, targets):
168
+ total_len = int(target.ne(tokenizer.pad_token_id).sum())
169
+
170
+ rounds = conversation.split(conv.sep2)
171
+ cur_len = 1
172
+ target[:cur_len] = IGNORE_INDEX
173
+ for i, rou in enumerate(rounds):
174
+ if rou == "":
175
+ break
176
+
177
+ parts = rou.split(sep)
178
+ if len(parts) != 2:
179
+ break
180
+ parts[0] += sep
181
+
182
+ if has_speech:
183
+ round_len = len(tokenizer_speech_token(rou, tokenizer))
184
+ instruction_len = len(tokenizer_speech_token(parts[0], tokenizer)) - 2
185
+ else:
186
+ round_len = len(tokenizer(rou).input_ids)
187
+ instruction_len = len(tokenizer(parts[0]).input_ids) - 2
188
+
189
+ target[cur_len : cur_len + instruction_len] = IGNORE_INDEX
190
+
191
+ cur_len += round_len
192
+ target[cur_len:] = IGNORE_INDEX
193
+
194
+ if cur_len < tokenizer.model_max_length:
195
+ if cur_len != total_len:
196
+ target[:] = IGNORE_INDEX
197
+ print(
198
+ f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
199
+ f" (ignored)"
200
+ )
201
+
202
+ return dict(
203
+ input_ids=input_ids,
204
+ labels=targets,
205
+ )
206
+
207
+
208
+ def preprocess_llama_3(
209
+ sources,
210
+ tokenizer: transformers.PreTrainedTokenizer,
211
+ has_speech: bool = False
212
+ ) -> Dict:
213
+ conv = conversation_lib.default_conversation.copy()
214
+ roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
215
+
216
+ # Apply prompt templates
217
+ conversations = []
218
+ for i, source in enumerate(sources):
219
+ if roles[source[0]["from"]] != conv.roles[0]:
220
+ # Skip the first one if it is not from human
221
+ source = source[1:]
222
+
223
+ assert len(source) == 2, "now only support single-turn conversation"
224
+
225
+ conv.messages = []
226
+ for j, sentence in enumerate(source):
227
+ role = roles[sentence["from"]]
228
+ assert role == conv.roles[j % 2], f"{i}"
229
+ conv.append_message(role, sentence["value"])
230
+ conversations.append(conv.get_prompt())
231
+
232
+ # Tokenize conversations
233
+
234
+ if has_speech:
235
+ input_ids = torch.stack([tokenizer_speech_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0)
236
+ else:
237
+ input_ids = tokenizer(
238
+ conversations,
239
+ return_tensors="pt",
240
+ padding="longest",
241
+ max_length=tokenizer.model_max_length,
242
+ truncation=True,
243
+ ).input_ids
244
+
245
+ targets = input_ids.clone()
246
+
247
+ assert conv.sep_style == conversation_lib.SeparatorStyle.LLAMA_3
248
+
249
+ # Mask targets
250
+ sep = "<|start_header_id|>" + conv.roles[1] + "<|end_header_id|>\n\n"
251
+ for conversation, target in zip(conversations, targets):
252
+ total_len = int(target.ne(tokenizer.pad_token_id).sum())
253
+
254
+ cur_len = 1
255
+ target[:cur_len] = IGNORE_INDEX
256
+ parts = conversation.split(sep)
257
+ parts[0] += sep
258
+
259
+ if has_speech:
260
+ conversation_len = len(tokenizer_speech_token(conversation, tokenizer))
261
+ instruction_len = len(tokenizer_speech_token(parts[0], tokenizer)) - 1
262
+ else:
263
+ conversation_len = len(tokenizer(conversation).input_ids)
264
+ instruction_len = len(tokenizer(parts[0]).input_ids) - 1
265
+
266
+ target[cur_len : cur_len + instruction_len] = IGNORE_INDEX
267
+ cur_len += conversation_len
268
+ target[cur_len:] = IGNORE_INDEX
269
+
270
+ # if cur_len < tokenizer.model_max_length:
271
+ # if cur_len != total_len:
272
+ # target[:] = IGNORE_INDEX
273
+ # print(
274
+ # f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
275
+ # f" (ignored)"
276
+ # )
277
+
278
+ return dict(
279
+ input_ids=input_ids,
280
+ labels=targets,
281
+ )
282
+
283
+
284
+ def preprocess_v1(
285
+ sources,
286
+ tokenizer: transformers.PreTrainedTokenizer,
287
+ has_speech: bool = False
288
+ ) -> Dict:
289
+ conv = conversation_lib.default_conversation.copy()
290
+ roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
291
+
292
+ # Apply prompt templates
293
+ conversations = []
294
+ for i, source in enumerate(sources):
295
+ if roles[source[0]["from"]] != conv.roles[0]:
296
+ # Skip the first one if it is not from human
297
+ source = source[1:]
298
+
299
+ conv.messages = []
300
+ for j, sentence in enumerate(source):
301
+ role = roles[sentence["from"]]
302
+ assert role == conv.roles[j % 2], f"{i}"
303
+ conv.append_message(role, sentence["value"])
304
+ conversations.append(conv.get_prompt())
305
+
306
+ # Tokenize conversations
307
+
308
+ if has_speech:
309
+ input_ids = torch.stack([tokenizer_speech_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0)
310
+ else:
311
+ input_ids = tokenizer(
312
+ conversations,
313
+ return_tensors="pt",
314
+ padding="longest",
315
+ max_length=tokenizer.model_max_length,
316
+ truncation=True,
317
+ ).input_ids
318
+
319
+ targets = input_ids.clone()
320
+
321
+ assert conv.sep_style == conversation_lib.SeparatorStyle.TWO
322
+
323
+ # Mask targets
324
+ sep = conv.sep + conv.roles[1] + ": "
325
+ for conversation, target in zip(conversations, targets):
326
+ total_len = int(target.ne(tokenizer.pad_token_id).sum())
327
+
328
+ rounds = conversation.split(conv.sep2)
329
+ cur_len = 1
330
+ target[:cur_len] = IGNORE_INDEX
331
+ for i, rou in enumerate(rounds):
332
+ if rou == "":
333
+ break
334
+
335
+ parts = rou.split(sep)
336
+ if len(parts) != 2:
337
+ break
338
+ parts[0] += sep
339
+
340
+ if has_speech:
341
+ round_len = len(tokenizer_speech_token(rou, tokenizer))
342
+ instruction_len = len(tokenizer_speech_token(parts[0], tokenizer)) - 2
343
+ else:
344
+ round_len = len(tokenizer(rou).input_ids)
345
+ instruction_len = len(tokenizer(parts[0]).input_ids) - 2
346
+
347
+ # FIXME: tokenizer bug
348
+ if i != 0 and not tokenizer.legacy and IS_TOKENIZER_GREATER_THAN_0_14:
349
+ round_len -= 1
350
+ instruction_len -= 1
351
+
352
+ target[cur_len : cur_len + instruction_len] = IGNORE_INDEX
353
+
354
+ cur_len += round_len
355
+ target[cur_len:] = IGNORE_INDEX
356
+
357
+ if cur_len < tokenizer.model_max_length:
358
+ if cur_len != total_len:
359
+ target[:] = IGNORE_INDEX
360
+ print(
361
+ f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
362
+ f" (ignored)"
363
+ )
364
+
365
+ return dict(
366
+ input_ids=input_ids,
367
+ labels=targets,
368
+ )
369
+
370
+
371
+ def preprocess_plain(
372
+ sources: Sequence[str],
373
+ tokenizer: transformers.PreTrainedTokenizer,
374
+ ) -> Dict:
375
+ # add end signal and concatenate together
376
+ conversations = []
377
+ for source in sources:
378
+ assert len(source) == 2
379
+ assert DEFAULT_SPEECH_TOKEN in source[0]['value']
380
+ source[0]['value'] = DEFAULT_SPEECH_TOKEN
381
+ conversation = source[0]['value'] + source[1]['value'] + conversation_lib.default_conversation.sep
382
+ conversations.append(conversation)
383
+ # tokenize conversations
384
+ input_ids = [tokenizer_speech_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations]
385
+ targets = copy.deepcopy(input_ids)
386
+ for target, source in zip(targets, sources):
387
+ tokenized_len = len(tokenizer_speech_token(source[0]['value'], tokenizer))
388
+ target[:tokenized_len] = IGNORE_INDEX
389
+
390
+ return dict(input_ids=input_ids, labels=targets)
391
+
392
+
393
+ def preprocess(
394
+ sources: Sequence[str],
395
+ tokenizer: transformers.PreTrainedTokenizer,
396
+ has_speech: bool = False
397
+ ) -> Dict:
398
+ """
399
+ Given a list of sources, each is a conversation list. This transform:
400
+ 1. Add signal '### ' at the beginning each sentence, with end signal '\n';
401
+ 2. Concatenate conversations together;
402
+ 3. Tokenize the concatenated conversation;
403
+ 4. Make a deepcopy as the target. Mask human words with IGNORE_INDEX.
404
+ """
405
+ if conversation_lib.default_conversation.sep_style == conversation_lib.SeparatorStyle.PLAIN:
406
+ return preprocess_plain(sources, tokenizer)
407
+ if conversation_lib.default_conversation.sep_style == conversation_lib.SeparatorStyle.LLAMA_2:
408
+ return preprocess_llama_2(sources, tokenizer, has_speech=has_speech)
409
+ if conversation_lib.default_conversation.version.startswith("v1"):
410
+ return preprocess_v1(sources, tokenizer, has_speech=has_speech)
411
+ if conversation_lib.default_conversation.sep_style == conversation_lib.SeparatorStyle.LLAMA_3:
412
+ return preprocess_llama_3(sources, tokenizer, has_speech=has_speech)
413
+ raise NotImplementedError
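
For intuition, a small sketch of what `tokenizer_image_token` returns: the prompt is split on `<image>` and the image placeholder index is spliced between the tokenized chunks. The `ToyTokenizer` below is a stand-in for the real Qwen tokenizer, used only to make the splicing visible.

from types import SimpleNamespace
from ola.datasets.preprocess import tokenizer_image_token

class ToyTokenizer:
    bos_token_id = None  # no BOS token, so the offset handling is skipped
    def __call__(self, text):
        # fake "token ids": one id per character
        return SimpleNamespace(input_ids=[ord(c) for c in text])

ids = tokenizer_image_token("<image>\nHi", ToyTokenizer(), image_token_index=-300)
print(ids)  # [-300, 10, 72, 105] -> placeholder id followed by the ids for "\nHi"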
ola/mm_utils.py ADDED
@@ -0,0 +1,272 @@
1
+ from PIL import Image
2
+ import base64
3
+ import math
4
+ import ast
5
+
6
+ import torch
7
+ from transformers import StoppingCriteria
8
+ import os
9
+ import io
10
+
11
+ if 'VIDEO_RESIZE' in os.environ:
12
+ # highresxpatch
13
+ VIDEO_RESIZE = os.environ['VIDEO_RESIZE']
14
+ video_base, video_ps = VIDEO_RESIZE.split('x')
15
+ video_base = int(video_base)
16
+ video_ps = int(video_ps)
17
+ print(f"VIDEO_RESIZE is set as {VIDEO_RESIZE}, {video_base}, {video_ps}")
18
+ else:
19
+ HIGHRES_BASE = None
20
+
21
+ if 'HIGHRES_BASE' in os.environ:
22
+ # highresxpatch
23
+ HIGHRES_BASE = os.environ['HIGHRES_BASE']
24
+ highres_base, highres_ps = HIGHRES_BASE.split('x')
25
+ highres_base = int(highres_base)
26
+ highres_ps = int(highres_ps)
27
+ print(f"HIGHRES_BASE is set as {HIGHRES_BASE}, {highres_base}, {highres_ps}")
28
+ else:
29
+ HIGHRES_BASE = None
30
+
31
+ if 'MAXRES' in os.environ:
32
+ # highresxpatch
33
+ MAXRES = int(os.environ['MAXRES'])
34
+ print(f"MAXRES is set as {MAXRES}")
35
+ else:
36
+ MAXRES = 1536
37
+
38
+ if 'MINRES' in os.environ:
39
+ # highresxpatch
40
+ MINRES = int(os.environ['MINRES'])
41
+ print(f"MINRES is set as {MINRES}")
42
+ else:
43
+ MINRES = 0
44
+
45
+ if 'VIDEO_MAXRES' in os.environ:
46
+ # highresxpatch
47
+ VIDEO_MAXRES = int(os.environ['VIDEO_MAXRES'])
48
+ print(f"VIDEO_MAXRES is set as {VIDEO_MAXRES}")
49
+ else:
50
+ VIDEO_MAXRES = 1536
51
+
52
+ if 'VIDEO_MINRES' in os.environ:
53
+ # highresxpatch
54
+ VIDEO_MINRES = int(os.environ['VIDEO_MINRES'])
55
+ print(f"VIDEO_MINRES is set as {VIDEO_MINRES}")
56
+ else:
57
+ MINRES = 0
58
+
59
+ if 'PAD2STRIDE' in os.environ:
60
+ # highresxpatch
61
+ PAD2STRIDE = True
62
+ print(f"PAD2STRIDE is set")
63
+ else:
64
+ PAD2STRIDE = False
65
+
66
+ if 'LOWRES_RESIZE' in os.environ:
67
+ LOWRES_RESIZE = os.environ['LOWRES_RESIZE']
68
+ print(f"LOWRES_RESIZE is set as {LOWRES_RESIZE}")
69
+ if 'x' in LOWRES_RESIZE:
70
+ size, ps = LOWRES_RESIZE.split('x')
71
+ size = int(size)
72
+ ps = int(ps)
73
+ LOWRES_RESIZE = (size, ps)
74
+ else:
75
+ LOWRES_RESIZE = int(LOWRES_RESIZE)
76
+ else:
77
+ LOWRES_RESIZE = None
78
+
79
+
80
+ def pad_image(image, target_resolution, value=0):
81
+ """
82
+ Resize and pad an image to a target resolution while maintaining aspect ratio.
83
+
84
+ Args:
85
+ image (PIL.Image.Image): The input image.
86
+ target_resolution (tuple): The target resolution (width, height) of the image.
87
+
88
+ Returns:
89
+ PIL.Image.Image: The resized and padded image.
90
+ """
91
+ original_width, original_height = image.size
92
+ target_width, target_height = target_resolution
93
+ # Create a new image with the target size and paste the resized image onto it
94
+ new_image = Image.new('RGB', (target_width, target_height), (value, value, value))
95
+ paste_x = (target_width - original_width) // 2
96
+ paste_y = (target_height - original_height) // 2
97
+ new_image.paste(image, (paste_x, paste_y))
98
+ return new_image
99
+
100
+ def resize_images(image, patch_size=14, base_size=896):
101
+ h, w = image.size
102
+ if base_size == 0:
103
+ if h * w > MAXRES * MAXRES:
104
+ # print(f'{h}x{w} larger than max size {MAXRES}, resize to {MAXRES}')
105
+ scale = MAXRES * MAXRES / (h * w)
106
+ scale = math.sqrt(scale)
107
+ elif h * w < MINRES * MINRES:
108
+ # print(f'{h}x{w} smaller than max size {MINRES}, resize to {MINRES}')
109
+ scale = MINRES * MINRES / (h * w)
110
+ scale = math.sqrt(scale)
111
+ else:
112
+ scale = None
113
+ else:
114
+ scale = base_size * base_size / (h * w)
115
+ scale = math.sqrt(scale)
116
+
117
+
118
+ if scale is not None:
119
+ new_h = int(h * scale / patch_size) * patch_size
120
+ new_w = int(w * scale / patch_size) * patch_size
121
+ new_h = max(new_h, patch_size)
122
+ new_w = max(new_w, patch_size)
123
+ image = image.resize((new_h, new_w))
124
+ elif PAD2STRIDE:
125
+ if h % patch_size == 0:
126
+ new_h = h
127
+ else:
128
+ new_h = (h // patch_size + 1) * patch_size
129
+
130
+ if w % patch_size == 0:
131
+ new_w = w
132
+ else:
133
+ new_w = (w // patch_size + 1) * patch_size
134
+ image = pad_image(image, (new_h, new_w), value=127)
135
+ else:
136
+ scale = 1.0
137
+ new_h = int(h * scale / patch_size) * patch_size
138
+ new_w = int(w * scale / patch_size) * patch_size
139
+ new_h = max(new_h, patch_size)
140
+ new_w = max(new_w, patch_size)
141
+ image = image.resize((new_h, new_w))
142
+
143
+ return image
144
+
145
+ def resize_video(image, patch_size=14, base_size=896):
146
+ h, w = image.size
147
+ if base_size == 0:
148
+ if h * w > VIDEO_MAXRES * VIDEO_MAXRES:
149
+ # print(f'{h}x{w} larger than max size {MAXRES}, resize to {MAXRES}')
150
+ scale = VIDEO_MAXRES * VIDEO_MAXRES / (h * w)
151
+ scale = math.sqrt(scale)
152
+ elif h * w < VIDEO_MINRES * VIDEO_MINRES:
153
+ # print(f'{h}x{w} smaller than max size {MINRES}, resize to {MINRES}')
154
+ scale = VIDEO_MINRES * VIDEO_MINRES / (h * w)
155
+ scale = math.sqrt(scale)
156
+ else:
157
+ scale = None
158
+ else:
159
+ scale = base_size * base_size / (h * w)
160
+ scale = math.sqrt(scale)
161
+
162
+ if scale is not None:
163
+ new_h = int(h * scale / patch_size) * patch_size
164
+ new_w = int(w * scale / patch_size) * patch_size
165
+ image = image.resize((new_h, new_w))
166
+ elif PAD2STRIDE:
167
+ if h % patch_size == 0:
168
+ new_h = h
169
+ else:
170
+ new_h = (h // patch_size + 1) * patch_size
171
+
172
+ if w % patch_size == 0:
173
+ new_w = w
174
+ else:
175
+ new_w = (w // patch_size + 1) * patch_size
176
+ image = pad_image(image, (new_h, new_w), value=127)
177
+ else:
178
+ scale = 1.0
179
+ new_h = int(h * scale / patch_size) * patch_size
180
+ new_w = int(w * scale / patch_size) * patch_size
181
+ image = image.resize((new_h, new_w))
182
+
183
+ return image
184
+
185
+ def process_anyres_video(image, processor):
186
+ if VIDEO_RESIZE is not None:
187
+ image = resize_video(image, patch_size=video_ps, base_size=video_base)
188
+ image = processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
189
+ return image.unsqueeze(0)
190
+ else:
191
+ raise ValueError("VIDEO_RESIZE is not set")
192
+
193
+ def process_anyres_highres_image_genli(image, processor):
194
+ h, w = image.size
195
+ if h < 32 and w < 32:
196
+ min_size = min(h, w)
197
+ ratio = 64 / min_size
198
+ image = image.resize((int(h * ratio), int(w * ratio)))
199
+ elif h < 32:
200
+ ratio = 64 / h
201
+ image = image.resize((int(h * ratio), int(w * ratio)))
202
+ elif w < 32:
203
+ ratio = 64 / w
204
+ image = image.resize((int(h * ratio), int(w * ratio)))
205
+ if HIGHRES_BASE is not None:
206
+ image = resize_images(image, patch_size=highres_ps, base_size=highres_base)
207
+
208
+ if LOWRES_RESIZE is not None:
209
+ image_original_resize = resize_images(image, patch_size=LOWRES_RESIZE[1], base_size=LOWRES_RESIZE[0])
210
+ else:
211
+ image_original_resize = image.resize((384, 384))
212
+
213
+ # image_patches = [image_original_resize] + [image_original_resize]
214
+ # image_patches = [processor.preprocess(image_patch, return_tensors='pt')['pixel_values'][0]
215
+ # for image_patch in image_patches]
216
+ image_patches = processor.preprocess(image_original_resize, return_tensors='pt')['pixel_values'][0]
217
+ image_padded = processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
218
+ # return torch.stack(image_patches, dim=0), image_padded.unsqueeze(0)
219
+ return image_patches.unsqueeze(0), image_padded.unsqueeze(0)
220
+
221
+ def read_image_patch(patch_info):
222
+ if 'img_path' in patch_info.keys():
223
+ image = Image.open(patch_info['img_path']).convert('RGB')
224
+ else:
225
+ if 'image_encoing' in patch_info.keys():
226
+ patch_info['image_encoding'] = patch_info['image_encoing']
227
+ image_file_name = patch_info['patch']
228
+ start_bytes = int(patch_info['start_num'])
229
+ file_size = int(patch_info['size'])
230
+
231
+ with open(image_file_name, 'rb') as f:
232
+ f.seek(start_bytes)
233
+ if 'image_encoding' in patch_info.keys() and patch_info['image_encoding'] == 'base64':
234
+ image = Image.open(io.BytesIO(base64.b64decode(f.read(file_size).decode()))).convert("RGB")
235
+ else:
236
+ image = Image.open(io.BytesIO(f.read(file_size))).convert("RGB")
237
+ return image
238
+
239
+
240
+ def get_model_name_from_path(model_path):
241
+ model_path = model_path.strip("/")
242
+ model_paths = model_path.split("/")
243
+ if model_paths[-1].startswith('checkpoint-'):
244
+ return model_paths[-2] + "_" + model_paths[-1]
245
+ else:
246
+ return model_paths[-1]
247
+
248
+
249
+ class KeywordsStoppingCriteria(StoppingCriteria):
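+ # Stop generation once any keyword appears, checked both against the trailing
+ # token ids and against the decoded tail of the output.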
250
+ def __init__(self, keywords, tokenizer, input_ids):
251
+ self.keywords = keywords
252
+ self.keyword_ids = []
253
+ for keyword in keywords:
254
+ cur_keyword_ids = tokenizer(keyword).input_ids
255
+ if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id:
256
+ cur_keyword_ids = cur_keyword_ids[1:]
257
+ self.keyword_ids.append(torch.tensor(cur_keyword_ids))
258
+ self.tokenizer = tokenizer
259
+ self.start_len = input_ids.shape[1]
260
+
261
+ def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
262
+ assert output_ids.shape[0] == 1, "Only support batch size 1 (yet)" # TODO
263
+ offset = min(output_ids.shape[1] - self.start_len, 3)
264
+ self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids]
265
+ for keyword_id in self.keyword_ids:
266
+ if (output_ids[0, -keyword_id.shape[0]:] == keyword_id).all():
267
+ return True
268
+ outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0]
269
+ for keyword in self.keywords:
270
+ if keyword in outputs:
271
+ return True
272
+ return False
ola/model/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .language_model.ola_qwen import OlaQwenForCausalLM, OlaConfigQwen
ola/model/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (222 Bytes). View file
 
ola/model/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (220 Bytes). View file
 
ola/model/__pycache__/builder.cpython-310.pyc ADDED
Binary file (3.27 kB). View file
 
ola/model/__pycache__/builder.cpython-38.pyc ADDED
Binary file (3.34 kB). View file
 
ola/model/__pycache__/ola_arch.cpython-310.pyc ADDED
Binary file (11.8 kB). View file
 
ola/model/__pycache__/ola_arch.cpython-38.pyc ADDED
Binary file (12 kB). View file
 
ola/model/builder.py ADDED
@@ -0,0 +1,91 @@
1
+ import os
2
+ import warnings
3
+ import shutil
4
+
5
+ from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig
6
+ import torch
7
+ from ola.model import *
8
+ from ola.model.speech_encoder.builder import build_speech_encoder
9
+
10
+ def load_pretrained_model(model_path, model_base, is_lora=False, s2s=False, load_8bit=False, load_4bit=False, device="cuda", use_flash_attn=False, **kwargs):
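+ # Three loading paths: (1) a LoRA checkpoint merged onto model_base, (2) a base
+ # model plus separately saved speech_projector weights, (3) a fully merged
+ # checkpoint. The speech encoder and vision tower are then built and moved to
+ # the target device.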
11
+ if load_8bit:
12
+ kwargs['load_in_8bit'] = True
13
+ elif load_4bit:
14
+ kwargs['load_in_4bit'] = True
15
+ kwargs['quantization_config'] = BitsAndBytesConfig(
16
+ load_in_4bit=True,
17
+ bnb_4bit_compute_dtype=torch.float16,
18
+ bnb_4bit_use_double_quant=True,
19
+ bnb_4bit_quant_type='nf4'
20
+ )
21
+ else:
22
+ kwargs['torch_dtype'] = torch.bfloat16
23
+
24
+ if use_flash_attn:
25
+ kwargs['attn_implementation'] = 'flash_attention_2'
26
+
27
+ model_cls = OlaQwenForCausalLM
28
+
29
+ # Load OmniSpeech model
30
+ if is_lora:
31
+ assert model_base is not None, "model_base is required for LoRA models."
32
+ from ola.model.language_model.ola_qwen import OlaConfigQwen
33
+ lora_cfg_pretrained = OlaConfigQwen.from_pretrained(model_path)
34
+ tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
35
+ print('Loading OmniSpeech from base model...')
36
+ model = model_cls.from_pretrained(model_base, low_cpu_mem_usage=False, config=lora_cfg_pretrained, **kwargs)
37
+ print('Loading additional OmniSpeech weights...')
38
+ if os.path.exists(os.path.join(model_path, 'non_lora_trainables.bin')):
39
+ non_lora_trainables = torch.load(os.path.join(model_path, 'non_lora_trainables.bin'), map_location='cpu')
40
+ non_lora_trainables = {(k[11:] if k.startswith('base_model.') else k): v for k, v in non_lora_trainables.items()}
41
+ if any(k.startswith('model.model.') for k in non_lora_trainables):
42
+ non_lora_trainables = {(k[6:] if k.startswith('model.') else k): v for k, v in non_lora_trainables.items()}
43
+ model.load_state_dict(non_lora_trainables, strict=False)
44
+
45
+ from peft import PeftModel
46
+ print('Loading LoRA weights...')
47
+ model = PeftModel.from_pretrained(model, model_path)
48
+ print('Merging LoRA weights...')
49
+ model = model.merge_and_unload()
50
+ print('Model is loaded...')
51
+ elif model_base is not None:
52
+ print('Loading OmniSpeech from base model...')
53
+ tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
54
+ cfg_pretrained = AutoConfig.from_pretrained(model_path)
55
+ model = model_cls.from_pretrained(model_base, low_cpu_mem_usage=False, config=cfg_pretrained, **kwargs)
56
+
57
+ speech_projector_weights = torch.load(os.path.join(model_path, 'speech_projector.bin'), map_location='cpu')
58
+ speech_projector_weights = {k: v.to(torch.float16) for k, v in speech_projector_weights.items()}
59
+ model.load_state_dict(speech_projector_weights, strict=False)
60
+ model = model.to(device=device)
61
+ else:
62
+ tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
63
+ model = model_cls.from_pretrained(
64
+ model_path,
65
+ low_cpu_mem_usage=False,
66
+ **kwargs
67
+ )
68
+ model = model.to(device=device)
69
+
70
+ model.get_model().speech_encoder = build_speech_encoder(model.config)
71
+ model.get_model().speech_encoder.to(device=device, dtype=torch.float16)
72
+
73
+ image_processor = None
74
+ model.resize_token_embeddings(len(tokenizer))
75
+ vision_tower = model.get_vision_tower()
76
+ print("Loading vision tower...")
77
+ if not vision_tower.is_loaded:
78
+ vision_tower.load_model(device_map=device)
79
+ if device != "auto":
80
+ vision_tower.to(device="cuda", dtype=torch.bfloat16)
81
+ else:
82
+ vision_tower.to(device="cuda:0", dtype=torch.bfloat16)
83
+ image_processor = vision_tower.image_processor
84
+ print("Loading vision tower succeeded.")
85
+
86
+ if hasattr(model.config, "max_sequence_length"):
87
+ context_len = model.config.max_sequence_length
88
+ else:
89
+ context_len = 16384
90
+
91
+ return tokenizer, model, image_processor, context_len
ola/model/language_model/__pycache__/ola_qwen.cpython-310.pyc ADDED
Binary file (5.31 kB). View file
 
ola/model/language_model/__pycache__/ola_qwen.cpython-38.pyc ADDED
Binary file (5.26 kB). View file
 
ola/model/language_model/ola_qwen.py ADDED
@@ -0,0 +1,237 @@
1
+ from typing import List, Optional, Tuple, Union
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+ import transformers
7
+ from transformers import AutoConfig, AutoModelForCausalLM
8
+
9
+
10
+ from transformers.modeling_outputs import CausalLMOutputWithPast
11
+ from transformers.generation.utils import GenerateOutput
12
+
13
+ from ..ola_arch import OlaMetaModel, OlaMetaForCausalLM
14
+ from transformers import Qwen2Config, Qwen2Model, Qwen2ForCausalLM
15
+
16
+
17
+ class OlaConfigQwen(Qwen2Config):
18
+ model_type = "ola_qwen"
19
+
20
+
21
+ class OlaQwenModel(OlaMetaModel, Qwen2Model):
22
+ config_class = OlaConfigQwen
23
+
24
+ def __init__(self, config: Qwen2Config):
25
+ super(OlaQwenModel, self).__init__(config)
26
+
27
+
28
+ class OlaQwenForCausalLM(Qwen2ForCausalLM, OlaMetaForCausalLM):
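+ # Qwen2 causal LM extended with Ola's multimodal path: speech/image/video inputs
+ # are converted to embeddings by prepare_inputs_labels_for_speech_vision_text
+ # before the standard forward / generate.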
29
+ config_class = OlaConfigQwen
30
+
31
+ def __init__(self, config):
32
+ super(Qwen2ForCausalLM, self).__init__(config)
33
+
34
+ config.rope_scaling = None
35
+ self.model = OlaQwenModel(config)
36
+ self.vocab_size = config.vocab_size
37
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
38
+
39
+ # Initialize weights and apply final processing
40
+ self.post_init()
41
+
42
+ def get_model(self):
43
+ return self.model
44
+
45
+ def forward(
46
+ self,
47
+ input_ids: torch.LongTensor = None,
48
+ attention_mask: Optional[torch.Tensor] = None,
49
+ position_ids: Optional[torch.LongTensor] = None,
50
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
51
+ inputs_embeds: Optional[torch.FloatTensor] = None,
52
+ labels: Optional[torch.LongTensor] = None,
53
+ use_cache: Optional[bool] = None,
54
+ output_attentions: Optional[bool] = None,
55
+ output_hidden_states: Optional[bool] = None,
56
+ speech: Optional[torch.FloatTensor] = None,
57
+ speech_lengths: Optional[torch.LongTensor] = None,
58
+ speech_chunks: Optional[torch.LongTensor] = None,
59
+ speech_wav: Optional[torch.FloatTensor] = None,
60
+ images: Optional[torch.FloatTensor] = None,
61
+ images_highres: Optional[List[torch.FloatTensor]] = None,
62
+ image_sizes: Optional[List[List[int]]] = None,
63
+ modalities: Optional[List[str]] = ["image"],
64
+ return_dict: Optional[bool] = None,
65
+ cache_position: Optional[torch.LongTensor] = None,
66
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
67
+
68
+ if inputs_embeds is None:
69
+ (
70
+ input_ids,
71
+ position_ids,
72
+ attention_mask,
73
+ past_key_values,
74
+ inputs_embeds,
75
+ labels
76
+ ) = self.prepare_inputs_labels_for_speech_vision_text(
77
+ input_ids,
78
+ position_ids,
79
+ attention_mask,
80
+ past_key_values,
81
+ labels,
82
+ speech,
83
+ speech_lengths,
84
+ speech_chunks,
85
+ speech_wav,
86
+ images,
87
+ modalities,
88
+ image_sizes,
89
+ images_highres
90
+ )
91
+
92
+ if labels is None:
93
+ return super().forward(
94
+ input_ids=input_ids,
95
+ attention_mask=attention_mask,
96
+ position_ids=position_ids,
97
+ past_key_values=past_key_values,
98
+ inputs_embeds=inputs_embeds,
99
+ use_cache=use_cache,
100
+ output_attentions=output_attentions,
101
+ output_hidden_states=output_hidden_states,
102
+ return_dict=return_dict
103
+ )
104
+ else:
105
+ return self.forward_llm_efficient(
106
+ input_ids=input_ids,
107
+ attention_mask=attention_mask,
108
+ position_ids=position_ids,
109
+ past_key_values=past_key_values,
110
+ inputs_embeds=inputs_embeds,
111
+ labels=labels,
112
+ use_cache=use_cache,
113
+ output_attentions=output_attentions,
114
+ output_hidden_states=output_hidden_states,
115
+ return_dict=return_dict
116
+ )
117
+
118
+
119
+ def forward_llm_efficient(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict):
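+ # Memory-efficient loss: run the backbone once, keep only hidden states whose
+ # shifted label is supervised (label > -1), and apply lm_head to that subset
+ # before computing the cross-entropy loss.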
120
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
121
+ output_hidden_states = (
122
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
123
+ )
124
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
125
+
126
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
127
+ outputs = self.model(
128
+ input_ids=input_ids,
129
+ attention_mask=attention_mask,
130
+ position_ids=position_ids,
131
+ past_key_values=past_key_values,
132
+ inputs_embeds=inputs_embeds,
133
+ use_cache=use_cache,
134
+ output_attentions=output_attentions,
135
+ output_hidden_states=output_hidden_states,
136
+ return_dict=return_dict,
137
+ )
138
+
139
+ hidden_states = outputs[0]
140
+ hidden_dim = hidden_states.size(-1)
141
+ shift_labels = labels[..., 1:].contiguous().reshape(-1)
142
+ shift_hidden_states = hidden_states[..., :-1, :].contiguous().reshape(-1, hidden_dim)
143
+ assert shift_labels.size(0) == shift_hidden_states.size(0)
144
+ mask = shift_labels > -1
145
+ assert mask.float().sum() > 0
146
+ shift_labels = shift_labels[mask]
147
+ shift_hidden_states = shift_hidden_states[mask, :]
148
+ logits = self.lm_head(shift_hidden_states)
149
+ logits = logits.float()
150
+ loss_fct = nn.CrossEntropyLoss()
151
+ loss = loss_fct(logits, shift_labels)
152
+
153
+
154
+ if not return_dict:
155
+ output = (logits,) + outputs[1:]
156
+ return (loss,) + output if loss is not None else output
157
+
158
+
159
+ return CausalLMOutputWithPast(
160
+ loss=loss,
161
+ logits=logits,
162
+ past_key_values=outputs.past_key_values,
163
+ hidden_states=outputs.hidden_states,
164
+ attentions=outputs.attentions,
165
+ )
166
+
167
+ @torch.no_grad()
168
+ def generate(
169
+ self,
170
+ inputs: Optional[torch.Tensor] = None,
171
+ speech: Optional[torch.Tensor] = None,
172
+ speech_lengths: Optional[torch.Tensor] = None,
173
+ speech_chunks: Optional[torch.Tensor] = None,
174
+ speech_wav: Optional[torch.FloatTensor] = None,
175
+ images: Optional[torch.Tensor] = None,
176
+ images_highres: Optional[List[torch.FloatTensor]] = None,
177
+ image_sizes: Optional[torch.Tensor] = None,
178
+ modalities: Optional[List[str]] = ["image"],
179
+ **kwargs,
180
+ ) -> Union[GenerateOutput, torch.LongTensor]:
181
+ position_ids = kwargs.pop("position_ids", None)
182
+ attention_mask = kwargs.pop("attention_mask", None)
183
+ if "inputs_embeds" in kwargs:
184
+ raise NotImplementedError("`inputs_embeds` is not supported")
185
+
186
+ (
187
+ inputs,
188
+ position_ids,
189
+ attention_mask,
190
+ _,
191
+ inputs_embeds,
192
+ _
193
+ ) = self.prepare_inputs_labels_for_speech_vision_text(
194
+ inputs,
195
+ position_ids,
196
+ attention_mask,
197
+ None,
198
+ None,
199
+ speech,
200
+ speech_lengths,
201
+ speech_chunks,
202
+ speech_wav,
203
+ images,
204
+ modalities,
205
+ image_sizes,
206
+ images_highres
207
+ )
208
+
209
+ return super().generate(
210
+ position_ids=position_ids,
211
+ attention_mask=attention_mask,
212
+ inputs_embeds=inputs_embeds,
213
+ **kwargs
214
+ )
215
+
216
+ def prepare_inputs_for_generation(self, input_ids, past_key_values=None,
217
+ inputs_embeds=None, **kwargs):
218
+ speech = kwargs.pop("speech", None)
219
+ speech_lengths = kwargs.pop("speech_lengths", None)
220
+ speech_chunks = kwargs.pop("speech_chunks", None)
221
+ images = kwargs.pop("images", None)
222
+ image_sizes = kwargs.pop("image_sizes", None)
223
+ inputs = super().prepare_inputs_for_generation(
224
+ input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
225
+ )
226
+ if speech is not None:
227
+ inputs['speech'] = speech
228
+ inputs['speech_lengths'] = speech_lengths
229
+ inputs['speech_chunks'] = speech_chunks
230
+ if images is not None:
231
+ inputs["images"] = images
232
+ if image_sizes is not None:
233
+ inputs["image_sizes"] = image_sizes
234
+ return inputs
235
+
236
+ AutoConfig.register("ola_qwen", OlaConfigQwen)
237
+ AutoModelForCausalLM.register(OlaConfigQwen, OlaQwenForCausalLM)
ola/model/multimodal_encoder/__pycache__/builder.cpython-310.pyc ADDED
Binary file (579 Bytes). View file
 
ola/model/multimodal_encoder/__pycache__/builder.cpython-38.pyc ADDED
Binary file (577 Bytes). View file
 
ola/model/multimodal_encoder/__pycache__/oryx_vit.cpython-310.pyc ADDED
Binary file (28.8 kB). View file
 
ola/model/multimodal_encoder/__pycache__/oryx_vit.cpython-38.pyc ADDED
Binary file (28.7 kB). View file
 
ola/model/multimodal_encoder/builder.py ADDED
@@ -0,0 +1,9 @@
1
+ import os
2
+ from .oryx_vit import SigLIPViTAnysizeWrapper
3
+
4
+ def build_vision_tower(vision_tower_cfg, **kwargs):
5
+ vision_tower = getattr(vision_tower_cfg, 'vision_tower', getattr(vision_tower_cfg, 'mm_vision_tower', None))
6
+ is_absolute_path_exists = os.path.exists(vision_tower)
7
+ print(f"Building OryxViTWrapper from {vision_tower}...")
8
+ # path = vision_tower.split(":")[1]
9
+ return SigLIPViTAnysizeWrapper(vision_tower, path=vision_tower, args=vision_tower_cfg, **kwargs)
ola/model/multimodal_encoder/oryx_vit.py ADDED
@@ -0,0 +1,1126 @@
1
+ import math
2
+ import warnings
3
+ from dataclasses import dataclass
4
+ from functools import partial
5
+ from typing import (
6
+ Callable,
7
+ Dict,
8
+ Final,
9
+ List,
10
+ Literal,
11
+ Optional,
12
+ Sequence,
13
+ Set,
14
+ Tuple,
15
+ Type,
16
+ Union,
17
+ )
18
+
19
+ from torch.utils.checkpoint import checkpoint
20
+ import torch
21
+ import torch.nn as nn
22
+ import torch.nn.functional as F
23
+ try:
24
+ from timm.layers import (
25
+ AttentionPoolLatent,
26
+ DropPath,
27
+ LayerType,
28
+ Mlp,
29
+ PatchDropout,
30
+ PatchEmbed,
31
+ resample_abs_pos_embed,
32
+ )
33
+ from timm.models._manipulate import checkpoint_seq, named_apply
34
+ except ImportError:
35
+ print('Wrong timm version')
36
+
37
+ from flash_attn import flash_attn_func, flash_attn_varlen_func
38
+
39
+ from typing import Optional
40
+
41
+ import logging
42
+ import torch
43
+ import torch.nn as nn
44
+ import torch.nn.functional as F
45
+
46
+ import deepspeed
47
+ import os
48
+ if 'LOAD_VISION_EARLY' in os.environ:
49
+ print("LOAD_VISION_EARLY is set")
50
+ LOAD_VISION_EARLY = True
51
+ else:
52
+ LOAD_VISION_EARLY = False
53
+
54
+
55
+ if 'SKIP_LOAD_VIT' in os.environ:
56
+ print("SKIP_LOAD_VIT is set")
57
+ SKIP_LOAD_VIT = True
58
+ else:
59
+ SKIP_LOAD_VIT = False
60
+
61
+ if 'VIT_WITH_GRAD' in os.environ:
62
+ print("VIT_WITH_GRAD is set")
63
+ VIT_WITH_GRAD = True
64
+ else:
65
+ VIT_WITH_GRAD = False
66
+
67
+
68
+ if 'FIX_SIZE' in os.environ:
69
+ print("FIX_SIZE is set")
70
+ FIX_SIZE = True
71
+ else:
72
+ FIX_SIZE = False
73
+
74
+
75
+ if 'ANYRES_SPLIT' in os.environ:
76
+ ANYRES_SPLIT = int(os.environ['ANYRES_SPLIT'])
77
+ print(f"ANYRES_SPLIT is set as {ANYRES_SPLIT}")
78
+ else:
79
+ ANYRES_SPLIT = None
80
+
81
+
82
+ if 'FORCE_NO_DOWNSAMPLE' in os.environ:
83
+ print("FORCE_NO_DOWNSAMPLE is set")
84
+ FORCE_NO_DOWNSAMPLE = True
85
+ else:
86
+ FORCE_NO_DOWNSAMPLE = False
87
+
88
+ if 'EVAL_72B' in os.environ:
89
+ print("EVAL_72B is set")
90
+ EVAL_72B = True
91
+ else:
92
+ EVAL_72B = False
93
+
94
+ def _no_grad_trunc_normal_(tensor, mean, std, a, b):
95
+ # Cut & paste from PyTorch official master until it's in a few official releases - RW
96
+ # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
97
+ def norm_cdf(x):
98
+ # Computes standard normal cumulative distribution function
99
+ return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
100
+
101
+ if (mean < a - 2 * std) or (mean > b + 2 * std):
102
+ warnings.warn(
103
+ "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
104
+ "The distribution of values may be incorrect.",
105
+ stacklevel=2,
106
+ )
107
+
108
+ with torch.no_grad():
109
+ # Values are generated by using a truncated uniform distribution and
110
+ # then using the inverse CDF for the normal distribution.
111
+ # Get upper and lower cdf values
112
+ l = norm_cdf((a - mean) / std) # noqa: E741
113
+ u = norm_cdf((b - mean) / std)
114
+
115
+ # Uniformly fill tensor with values from [l, u], then translate to
116
+ # [2l-1, 2u-1].
117
+ tensor.uniform_(2 * l - 1, 2 * u - 1)
118
+
119
+ # Use inverse cdf transform for normal distribution to get truncated
120
+ # standard normal
121
+ tensor.erfinv_()
122
+
123
+ # Transform to proper mean, std
124
+ tensor.mul_(std * math.sqrt(2.0))
125
+ tensor.add_(mean)
126
+
127
+ # Clamp to ensure it's in the proper range
128
+ tensor.clamp_(min=a, max=b)
129
+ return tensor
130
+
131
+
132
+ def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0):
133
+ # type: (torch.Tensor, float, float, float, float) -> torch.Tensor
134
+ r"""The original timm.models.layers.weight_init.trunc_normal_ cannot handle bfloat16 yet, so here we first
135
+ convert the tensor to float32, apply trunc_normal_() in float32, and then convert it back to its original dtype.
136
+ Fills the input Tensor with values drawn from a truncated normal distribution. The values are effectively drawn
137
+ from the normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
138
+ with values outside :math:`[a, b]` redrawn until they are within
139
+ the bounds. The method used for generating the random values works
140
+ best when :math:`a \leq \text{mean} \leq b`.
141
+ Args:
142
+ tensor: an n-dimensional `torch.Tensor`
143
+ mean: the mean of the normal distribution
144
+ std: the standard deviation of the normal distribution
145
+ a: the minimum cutoff value
146
+ b: the maximum cutoff value
147
+ Examples:
148
+ >>> w = torch.empty(3, 5)
149
+ >>> nn.init.trunc_normal_(w)
150
+ """
151
+
152
+ with torch.no_grad():
153
+ dtype = tensor.dtype
154
+ tensor_fp32 = tensor.float()
155
+ tensor_fp32 = _no_grad_trunc_normal_(tensor_fp32, mean, std, a, b)
156
+ tensor_dtype = tensor_fp32.to(dtype=dtype)
157
+ tensor.copy_(tensor_dtype)
158
+
159
+
160
+ def init_weights(self):
161
+ if self.pos_embed is not None:
162
+ trunc_normal_(self.pos_embed, std=self.pos_embed.shape[1] ** -0.5)
163
+ trunc_normal_(self.latent, std=self.latent_dim**-0.5)
164
+
165
+
166
+ def init_weights_vit_timm(module: nn.Module, name: str = "") -> None:
167
+ """ViT weight initialization, original timm impl (for reproducibility)"""
168
+ if isinstance(module, nn.Linear):
169
+ trunc_normal_(module.weight, std=0.02)
170
+ if module.bias is not None:
171
+ nn.init.zeros_(module.bias)
172
+ elif hasattr(module, "init_weights"):
173
+ module.init_weights()
174
+
175
+
176
+ class Attention(nn.Module):
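+ # Multi-head attention backed by FlashAttention; when cu_slens is provided, the
+ # varlen kernel attends within each packed (variable-length) image sequence separately.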
177
+ fused_attn: Final[bool]
178
+
179
+ def __init__(
180
+ self,
181
+ dim: int,
182
+ num_heads: int = 8,
183
+ qkv_bias: bool = False,
184
+ qk_norm: bool = False,
185
+ attn_drop: float = 0.0,
186
+ proj_drop: float = 0.0,
187
+ norm_layer: nn.Module = nn.LayerNorm,
188
+ ) -> None:
189
+ super().__init__()
190
+ assert dim % num_heads == 0, "dim should be divisible by num_heads"
191
+ self.num_heads = num_heads
192
+ self.head_dim = dim // num_heads
193
+ self.scale = self.head_dim**-0.5
194
+ # self.fused_attn = use_fused_attn()
195
+ self.fused_attn = True
196
+
197
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
198
+ self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
199
+ self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
200
+ self.attn_drop = nn.Dropout(attn_drop)
201
+ self.proj = nn.Linear(dim, dim)
202
+ self.proj_drop = nn.Dropout(proj_drop) if proj_drop > 0.0 else nn.Identity()
203
+
204
+ def forward(self, x: torch.Tensor, cu_slens=None) -> torch.Tensor:
205
+ B, N, C = x.shape
206
+ qkv = (
207
+ self.qkv(x)
208
+ .reshape(B, N, 3, self.num_heads, self.head_dim)
209
+ .permute(2, 0, 3, 1, 4)
210
+ )
211
+ q, k, v = qkv.unbind(0)
212
+ q, k = self.q_norm(q), self.k_norm(k)
213
+
214
+ if cu_slens is not None:
215
+ q = q.permute(0, 2, 1, 3) # B, num_heads, N, C -> B, N, num_heads, C
216
+ k = k.permute(0, 2, 1, 3)
217
+ v = v.permute(0, 2, 1, 3)
218
+ max_seqlen = torch.max(cu_slens[1:] - cu_slens[:-1]).item()
219
+ x = flash_attn_varlen_func(
220
+ q.squeeze(0),
221
+ k.squeeze(0),
222
+ v.squeeze(0),
223
+ cu_seqlens_q=cu_slens,
224
+ cu_seqlens_k=cu_slens,
225
+ max_seqlen_q=max_seqlen,
226
+ max_seqlen_k=max_seqlen,
227
+ softmax_scale=self.scale,
228
+ causal=False,
229
+ )
230
+
231
+ x = x.reshape(B, N, -1)
232
+ x = self.proj(x)
233
+ x = self.proj_drop(x)
234
+
235
+ else:
236
+ q = q.permute(0, 2, 1, 3) # B, num_heads, N, C -> B, N, num_heads, C
237
+ k = k.permute(0, 2, 1, 3)
238
+ v = v.permute(0, 2, 1, 3)
239
+ x = flash_attn_func(q, k, v, softmax_scale=self.scale) # -> b, n, h, c
240
+
241
+ x = x.reshape(B, N, -1)
242
+ x = self.proj(x)
243
+ x = self.proj_drop(x)
244
+ # if self.fused_attn:
245
+ # x = F.scaled_dot_product_attention(
246
+ # q,
247
+ # k,
248
+ # v,
249
+ # dropout_p=self.attn_drop.p if self.training else 0.0,
250
+ # )
251
+ # else:
252
+ # q = q * self.scale
253
+ # attn = q @ k.transpose(-2, -1)
254
+ # attn = attn.softmax(dim=-1)
255
+ # attn = self.attn_drop(attn)
256
+ # x = attn @ v
257
+
258
+ # x = x.transpose(1, 2).reshape(B, N, C)
259
+ # x = self.proj(x)
260
+ # x = self.proj_drop(x)
261
+ return x
262
+
263
+
264
+ class LayerScale(nn.Module):
265
+ def __init__(
266
+ self,
267
+ dim: int,
268
+ init_values: float = 1e-5,
269
+ inplace: bool = False,
270
+ ) -> None:
271
+ super().__init__()
272
+ self.inplace = inplace
273
+ self.gamma = nn.Parameter(init_values * torch.ones(dim))
274
+
275
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
276
+ return x.mul_(self.gamma) if self.inplace else x * self.gamma
277
+
278
+
279
+ class Block(nn.Module):
280
+ def __init__(
281
+ self,
282
+ dim: int,
283
+ num_heads: int,
284
+ mlp_ratio: float = 4.0,
285
+ qkv_bias: bool = False,
286
+ qk_norm: bool = False,
287
+ proj_drop: float = 0.0,
288
+ attn_drop: float = 0.0,
289
+ init_values: Optional[float] = None,
290
+ drop_path: float = 0.0,
291
+ act_layer: nn.Module = nn.GELU,
292
+ norm_layer: nn.Module = nn.LayerNorm,
293
+ mlp_layer: nn.Module = Mlp,
294
+ ) -> None:
295
+ super().__init__()
296
+ self.norm1 = norm_layer(dim)
297
+ self.attn = Attention(
298
+ dim,
299
+ num_heads=num_heads,
300
+ qkv_bias=qkv_bias,
301
+ qk_norm=qk_norm,
302
+ attn_drop=attn_drop,
303
+ proj_drop=proj_drop,
304
+ norm_layer=norm_layer,
305
+ )
306
+ self.ls1 = (
307
+ LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
308
+ )
309
+ self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
310
+
311
+ self.norm2 = norm_layer(dim)
312
+ self.mlp = mlp_layer(
313
+ in_features=dim,
314
+ hidden_features=int(dim * mlp_ratio),
315
+ act_layer=act_layer,
316
+ drop=proj_drop,
317
+ )
318
+ self.ls2 = (
319
+ LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
320
+ )
321
+ self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
322
+
323
+ def forward(self, x: torch.Tensor, cu_slens=None) -> torch.Tensor:
324
+ x = x + self.drop_path1(self.ls1(self.attn(self.norm1(x), cu_slens=cu_slens)))
325
+ x = x + self.drop_path2(self.ls2(self.mlp(self.norm2(x))))
326
+ return x
327
+
328
+
329
+ class VisionTransformer(nn.Module):
330
+ """Vision Transformer
331
+
332
+ A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale`
333
+ - https://arxiv.org/abs/2010.11929
334
+ """
335
+
336
+ dynamic_img_size: Final[bool]
337
+
338
+ def __init__(
339
+ self,
340
+ img_size: Union[int, Tuple[int, int]] = 224,
341
+ patch_size: Union[int, Tuple[int, int]] = 16,
342
+ in_chans: int = 3,
343
+ num_classes: int = 1000,
344
+ global_pool: Literal["", "avg", "token", "map"] = "token",
345
+ embed_dim: int = 768,
346
+ depth: int = 12,
347
+ num_heads: int = 12,
348
+ mlp_ratio: float = 4.0,
349
+ qkv_bias: bool = True,
350
+ qk_norm: bool = False,
351
+ init_values: Optional[float] = None,
352
+ class_token: bool = True,
353
+ no_embed_class: bool = False,
354
+ reg_tokens: int = 0,
355
+ pre_norm: bool = False,
356
+ fc_norm: Optional[bool] = None,
357
+ dynamic_img_size: bool = False,
358
+ dynamic_img_pad: bool = False,
359
+ drop_rate: float = 0.0,
360
+ pos_drop_rate: float = 0.0,
361
+ patch_drop_rate: float = 0.0,
362
+ proj_drop_rate: float = 0.0,
363
+ attn_drop_rate: float = 0.0,
364
+ drop_path_rate: float = 0.0,
365
+ weight_init: Literal["skip", "jax", "jax_nlhb", "moco", ""] = "",
366
+ embed_layer: Callable = PatchEmbed,
367
+ norm_layer: Optional[LayerType] = None,
368
+ act_layer: Optional[LayerType] = None,
369
+ strict_img_size: bool = False,
370
+ block_fn: Type[nn.Module] = Block,
371
+ mlp_layer: Type[nn.Module] = Mlp,
372
+ ignore_head: bool = False,
373
+ add_patch2x2: bool = False,
374
+ ) -> None:
375
+ """
376
+ Args:
377
+ img_size: Input image size.
378
+ patch_size: Patch size.
379
+ in_chans: Number of image input channels.
380
+ num_classes: Number of classes for the classification head.
381
+ global_pool: Type of global pooling for final sequence (default: 'token').
382
+ embed_dim: Transformer embedding dimension.
383
+ depth: Depth of transformer.
384
+ num_heads: Number of attention heads.
385
+ mlp_ratio: Ratio of mlp hidden dim to embedding dim.
386
+ qkv_bias: Enable bias for qkv projections if True.
387
+ init_values: Layer-scale init values (layer-scale enabled if not None).
388
+ class_token: Use class token.
389
+ no_embed_class: Don't include position embeddings for class (or reg) tokens.
390
+ reg_tokens: Number of register tokens.
391
+ fc_norm: Pre head norm after pool (instead of before), if None, enabled when global_pool == 'avg'.
392
+ drop_rate: Head dropout rate.
393
+ pos_drop_rate: Position embedding dropout rate.
394
+ attn_drop_rate: Attention dropout rate.
395
+ drop_path_rate: Stochastic depth rate.
396
+ weight_init: Weight initialization scheme.
397
+ embed_layer: Patch embedding layer.
398
+ norm_layer: Normalization layer.
399
+ act_layer: MLP activation layer.
400
+ block_fn: Transformer block layer.
401
+ """
402
+ super().__init__()
403
+ assert global_pool in ("", "avg", "token", "map")
404
+ assert class_token or global_pool != "token"
405
+ use_fc_norm = global_pool == "avg" if fc_norm is None else fc_norm
406
+ # norm_layer = get_norm_layer(norm_layer) or partial(nn.LayerNorm, eps=1e-6)
407
+ # act_layer = get_act_layer(act_layer) or nn.GELU
408
+ norm_layer = partial(nn.LayerNorm, eps=1e-6)
409
+ act_layer = nn.GELU
410
+
411
+ self.num_classes = num_classes
412
+ self.global_pool = global_pool
413
+ self.num_features = self.embed_dim = (
414
+ embed_dim # num_features for consistency with other models
415
+ )
416
+ self.num_prefix_tokens = 1 if class_token else 0
417
+ self.num_prefix_tokens += reg_tokens
418
+ self.num_reg_tokens = reg_tokens
419
+ self.has_class_token = class_token
420
+ self.no_embed_class = (
421
+ no_embed_class # don't embed prefix positions (includes reg)
422
+ )
423
+ self.dynamic_img_size = dynamic_img_size
424
+ self.grad_checkpointing = False
425
+ self.ignore_head = ignore_head
426
+
427
+ embed_args = {}
428
+ if dynamic_img_size:
429
+ # flatten deferred until after pos embed
430
+ embed_args.update(dict(strict_img_size=False, output_fmt="NHWC"))
431
+ self.patch_embed = embed_layer(
432
+ img_size=img_size,
433
+ patch_size=patch_size,
434
+ in_chans=in_chans,
435
+ embed_dim=embed_dim,
436
+ bias=not pre_norm, # disable bias if pre-norm is used (e.g. CLIP)
437
+ dynamic_img_pad=dynamic_img_pad,
438
+ strict_img_size=strict_img_size,
439
+ **embed_args,
440
+ )
441
+ num_patches = self.patch_embed.num_patches
442
+
443
+ self.cls_token = (
444
+ nn.Parameter(torch.zeros(1, 1, embed_dim)) if class_token else None
445
+ )
446
+ self.reg_token = (
447
+ nn.Parameter(torch.zeros(1, reg_tokens, embed_dim)) if reg_tokens else None
448
+ )
449
+ embed_len = (
450
+ num_patches if no_embed_class else num_patches + self.num_prefix_tokens
451
+ )
452
+ self.pos_embed = nn.Parameter(torch.randn(1, embed_len, embed_dim) * 0.02)
453
+
454
+
455
+ # deepspeed.zero.register_external_parameter(self, self.pos_embed)
456
+ # deepspeed.zero.register_external_parameter(self, self.patch_embed.proj.weight)
457
+ # deepspeed.zero.register_external_parameter(self, self.patch_embed.proj.bias)
458
+ # print(self.patch_embed.state_dict().keys())
459
+
460
+
461
+ self.pos_drop = nn.Dropout(p=pos_drop_rate)
462
+ if patch_drop_rate > 0:
463
+ self.patch_drop = PatchDropout(
464
+ patch_drop_rate,
465
+ num_prefix_tokens=self.num_prefix_tokens,
466
+ )
467
+ else:
468
+ self.patch_drop = nn.Identity()
469
+ self.norm_pre = norm_layer(embed_dim) if pre_norm else nn.Identity()
470
+
471
+ dpr = [
472
+ x.item() for x in torch.linspace(0, drop_path_rate, depth)
473
+ ] # stochastic depth decay rule
474
+ self.blocks = nn.Sequential(
475
+ *[
476
+ block_fn(
477
+ dim=embed_dim,
478
+ num_heads=num_heads,
479
+ mlp_ratio=mlp_ratio,
480
+ qkv_bias=qkv_bias,
481
+ qk_norm=qk_norm,
482
+ init_values=init_values,
483
+ proj_drop=proj_drop_rate,
484
+ attn_drop=attn_drop_rate,
485
+ drop_path=dpr[i],
486
+ norm_layer=norm_layer,
487
+ act_layer=act_layer,
488
+ mlp_layer=mlp_layer,
489
+ )
490
+ for i in range(depth)
491
+ ]
492
+ )
493
+
494
+
495
+ if add_patch2x2:
496
+ if add_patch2x2 == 'v2':
497
+ self.downsample = nn.Sequential(
498
+ nn.Conv2d(embed_dim, embed_dim*2, kernel_size=2, stride=2),
499
+ nn.GELU(),
500
+ nn.Conv2d(embed_dim*2, embed_dim*4, 1)
501
+ )
502
+ else:
503
+ mid_dim = embed_dim * 2
504
+ self.downsample = nn.Sequential(
505
+ nn.Conv2d(embed_dim, mid_dim, kernel_size=2, stride=2),
506
+ nn.GELU(),
507
+ nn.Conv2d(mid_dim, mid_dim, 1)
508
+ )
509
+
510
+ else:
511
+ self.downsample = None
512
+
513
+
514
+ # self.norm = norm_layer(embed_dim) if not use_fc_norm else nn.Identity()
515
+
516
+ # # Classifier Head
517
+ # if global_pool == "map":
518
+ # AttentionPoolLatent.init_weights = init_weights
519
+ # self.attn_pool = AttentionPoolLatent(
520
+ # self.embed_dim,
521
+ # num_heads=num_heads,
522
+ # mlp_ratio=mlp_ratio,
523
+ # norm_layer=norm_layer,
524
+ # )
525
+ # else:
526
+ # self.attn_pool = None
527
+ # self.fc_norm = norm_layer(embed_dim) if use_fc_norm else nn.Identity()
528
+ # self.head_drop = nn.Dropout(drop_rate)
529
+ # self.head = (
530
+ # nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
531
+ # )
532
+
533
+ # if weight_init != "skip":
534
+ # self.init_weights(weight_init)
535
+
536
+ def init_weights(self, mode: Literal["jax", "jax_nlhb", "moco", ""] = "") -> None:
537
+ assert mode in ("jax", "jax_nlhb", "moco", "")
538
+ # head_bias = -math.log(self.num_classes) if "nlhb" in mode else 0.0
539
+ trunc_normal_(self.pos_embed, std=0.02)
540
+ if self.cls_token is not None:
541
+ nn.init.normal_(self.cls_token, std=1e-6)
542
+ named_apply(init_weights_vit_timm, self)
543
+
544
+ @torch.jit.ignore
545
+ def no_weight_decay(self) -> Set:
546
+ return {"pos_embed", "cls_token", "dist_token"}
547
+
548
+ @torch.jit.ignore
549
+ def group_matcher(self, coarse: bool = False) -> Dict:
550
+ return dict(
551
+ stem=r"^cls_token|pos_embed|patch_embed", # stem and embed
552
+ blocks=[(r"^blocks\.(\d+)", None), (r"^norm", (99999,))],
553
+ )
554
+
555
+ @torch.jit.ignore
556
+ def set_grad_checkpointing(self, enable: bool = True) -> None:
557
+ self.grad_checkpointing = enable
558
+
559
+ @torch.jit.ignore
560
+ def get_classifier(self) -> nn.Module:
561
+ return self.head
562
+
563
+ def reset_classifier(self, num_classes: int, global_pool=None) -> None:
564
+ self.num_classes = num_classes
565
+ if global_pool is not None:
566
+ assert global_pool in ("", "avg", "token", "map")
567
+ if global_pool == "map" and self.attn_pool is None:
568
+ assert (
569
+ False
570
+ ), "Cannot currently add attention pooling in reset_classifier()."
571
+ elif global_pool != "map" and self.attn_pool is not None:
572
+ self.attn_pool = None # remove attention pooling
573
+ self.global_pool = global_pool
574
+ self.head = (
575
+ nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
576
+ )
577
+
578
+ def rescale_positional_embedding(self, out_size):
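+ # Bilinearly interpolate the square positional-embedding grid to the (h, w)
+ # token grid of the current input; returns the original embedding if sizes match.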
579
+ h, w = out_size
580
+ pos_embed_shape = int((self.pos_embed.shape[1]) ** 0.5)
581
+ if (h, w) == (pos_embed_shape, pos_embed_shape):
582
+ return self.pos_embed
583
+ rescaled_positional_embedding = \
584
+ self.pos_embed.new_zeros(1, h*w, self.pos_embed.shape[2])
585
+ pe_2d = self.pos_embed[0].T.contiguous().view(1, -1, pos_embed_shape, pos_embed_shape)
586
+ pe_2d = F.interpolate(pe_2d, out_size, mode='bilinear', align_corners=False).view(-1, h*w)
587
+ rescaled_positional_embedding[0] = pe_2d.T.contiguous()
588
+ return rescaled_positional_embedding
589
+
590
+ def _pos_embed(self, x: torch.Tensor) -> torch.Tensor:
591
+ if self.dynamic_img_size:
592
+ B, H, W, C = x.shape
593
+ pos_embed = resample_abs_pos_embed(
594
+ self.pos_embed,
595
+ (H, W),
596
+ num_prefix_tokens=0 if self.no_embed_class else self.num_prefix_tokens,
597
+ )
598
+ x = x.view(B, -1, C)
599
+ else:
600
+ pos_embed = self.pos_embed
601
+
602
+ to_cat = []
603
+ if self.cls_token is not None:
604
+ to_cat.append(self.cls_token.expand(x.shape[0], -1, -1))
605
+ if self.reg_token is not None:
606
+ to_cat.append(self.reg_token.expand(x.shape[0], -1, -1))
607
+
608
+ if self.no_embed_class:
609
+ # deit-3, updated JAX (big vision)
610
+ # position embedding does not overlap with class token, add then concat
611
+ x = x + pos_embed
612
+ if to_cat:
613
+ x = torch.cat(to_cat + [x], dim=1)
614
+ else:
615
+ # original timm, JAX, and deit vit impl
616
+ # pos_embed has entry for class token, concat then add
617
+ if to_cat:
618
+ x = torch.cat(to_cat + [x], dim=1)
619
+ x = x + pos_embed
620
+
621
+ return self.pos_drop(x)
622
+
623
+ def _intermediate_layers(
624
+ self,
625
+ x: torch.Tensor,
626
+ n: Union[int, Sequence] = 1,
627
+ ) -> List[torch.Tensor]:
628
+ outputs, num_blocks = [], len(self.blocks)
629
+ take_indices = set(
630
+ range(num_blocks - n, num_blocks) if isinstance(n, int) else n
631
+ )
632
+
633
+ # forward pass
634
+ x = self.patch_embed(x)
635
+ x = self._pos_embed(x)
636
+ x = self.patch_drop(x)
637
+ x = self.norm_pre(x)
638
+ for i, blk in enumerate(self.blocks):
639
+ x = blk(x)
640
+ if i in take_indices:
641
+ outputs.append(x)
642
+
643
+ return outputs
644
+
645
+ def get_intermediate_layers(
646
+ self,
647
+ x: torch.Tensor,
648
+ n: Union[int, Sequence] = 1,
649
+ reshape: bool = False,
650
+ return_prefix_tokens: bool = False,
651
+ norm: bool = False,
652
+ ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]:
653
+ """Intermediate layer accessor (NOTE: This is a WIP experiment).
654
+ Inspired by DINO / DINOv2 interface
655
+ """
656
+ # take the last n blocks if n is an int; if n is a sequence, select blocks by matching indices
657
+ outputs = self._intermediate_layers(x, n)
658
+ if norm:
659
+ outputs = [self.norm(out) for out in outputs]
660
+ prefix_tokens = [out[:, 0 : self.num_prefix_tokens] for out in outputs]
661
+ outputs = [out[:, self.num_prefix_tokens :] for out in outputs]
662
+
663
+ if reshape:
664
+ grid_size = self.patch_embed.grid_size
665
+ outputs = [
666
+ out.reshape(x.shape[0], grid_size[0], grid_size[1], -1)
667
+ .permute(0, 3, 1, 2)
668
+ .contiguous()
669
+ for out in outputs
670
+ ]
671
+
672
+ if return_prefix_tokens:
673
+ return tuple(zip(outputs, prefix_tokens))
674
+ return tuple(outputs)
675
+
676
+ def forward_features_list(self, x_list):
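+ # Variable-resolution batching: pad each image to a multiple of the patch size,
+ # concatenate all token sequences along the sequence dimension, and attend with
+ # varlen flash attention (cu_seqlens) so images of different sizes share one pass.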
677
+ x_all = []
678
+ image_sizes = []
679
+ for x in x_list:
680
+ if EVAL_72B:
681
+ x = x.to('cuda:0')
682
+ bs, _, h, w = x.shape
683
+
684
+ # fix patch size=14 in datasets
685
+ pad_h = (self.patch_embed.patch_size[0] - h % self.patch_embed.patch_size[0]) % self.patch_embed.patch_size[0]
686
+ pad_w = (self.patch_embed.patch_size[1] - w % self.patch_embed.patch_size[1]) % self.patch_embed.patch_size[1]
687
+ x = F.pad(x, (0, pad_w, 0, pad_h))
688
+
689
+ bs, _, h, w = x.shape
690
+
691
+ h = h // self.patch_embed.patch_size[0]
692
+ w = w // self.patch_embed.patch_size[1]
693
+
694
+ x = self.patch_embed(x)
695
+ # x = self._pos_embed(x)
696
+ x = x + self.rescale_positional_embedding(out_size=(h, w))
697
+ x = self.patch_drop(x)
698
+ x = self.norm_pre(x)
699
+ x_all.append(x)
700
+ image_sizes.append((h, w))
701
+
702
+ slen = [xi.size(1) for xi in x_all]
703
+ x = torch.cat(x_all, dim=1)
704
+
705
+ cu_indices = [0, ]
706
+ for i in slen:
707
+ cu_indices.append(cu_indices[-1] + i)
708
+
709
+ cu_slens = torch.tensor(cu_indices, dtype=torch.int32).to(x.device)
710
+ for idx, blk in enumerate(self.blocks):
711
+ if self.grad_checkpointing and not torch.jit.is_scripting():
712
+ x = checkpoint(blk, x, cu_slens, use_reentrant=True)
713
+ else:
714
+ x = blk(x, cu_slens=cu_slens)
715
+ feats = x.split(slen, dim=1) #[(1, slen, c)]
716
+
717
+ if self.downsample is not None:
718
+ new_feats = []
719
+ new_sizes = []
720
+ for f, s in zip(feats, image_sizes):
721
+ h, w = s
722
+ b, n, c = f.size()
723
+ f = f.reshape(b, h, w, c).permute(0, 3, 1, 2)
724
+ f = self.downsample(f)
725
+ b, c, h, w = f.size()
726
+ f = f.permute(0, 2, 3, 1).reshape(b, h*w, c)
727
+ new_feats.append(f)
728
+ new_sizes.append((h, w))
729
+ return new_feats, new_sizes
730
+
731
+
732
+ return feats, image_sizes
733
+
734
+ def forward_features(self, x: torch.Tensor) -> torch.Tensor:
735
+ if EVAL_72B:
736
+ x = x.to('cuda:0')
737
+ bs, _, h, w = x.shape
738
+ h = h // self.patch_embed.patch_size[0]
739
+ w = w // self.patch_embed.patch_size[1]
740
+
741
+ x = self.patch_embed(x)
742
+ # x = self._pos_embed(x)
743
+ x = x + self.rescale_positional_embedding(out_size=(h, w))
744
+ x = self.patch_drop(x)
745
+ x = self.norm_pre(x)
746
+ if self.grad_checkpointing and not torch.jit.is_scripting():
747
+ x = checkpoint_seq(self.blocks, x)
748
+ else:
749
+ x = self.blocks(x)
750
+
751
+ if self.downsample is not None:
752
+ b, n, c = x.size()
753
+ x = x.reshape(b, h, w, c).permute(0, 3, 1, 2)
754
+ x = self.downsample(x)
755
+ b, c, h, w = x.size()
756
+ x = x.permute(0, 2, 3, 1).reshape(b, h*w, c)
757
+ new_feats = x
758
+ new_sizes = (h, w)
759
+ return new_feats, new_sizes
760
+
761
+ return x, (h, w)
762
+
763
+ def forward_head(self, x: torch.Tensor, pre_logits: bool = False) -> torch.Tensor:
764
+ x = self.norm(x)
765
+ if self.attn_pool is not None:
766
+ x = self.attn_pool(x)
767
+ elif self.global_pool == "avg":
768
+ x = x[:, self.num_prefix_tokens :].mean(dim=1)
769
+ elif self.global_pool:
770
+ x = x[:, 0] # class token
771
+ x = self.fc_norm(x)
772
+ x = self.head_drop(x)
773
+ return x if pre_logits else self.head(x)
774
+
775
+ def forward(self, x, cal_attn_pool=False):
776
+ if type(x) is list:
777
+ x, image_sizes = self.forward_features_list(x)
778
+ return x, image_sizes, None
779
+ else:
780
+ x, image_sizes = self.forward_features(x)
781
+ return x, image_sizes, None
782
+
783
+ @dataclass
784
+ class SigLIPVisionCfg:
785
+ width: int = 1152
786
+ layers: Union[Tuple[int, int, int, int], int] = 27
787
+ heads: int = 16
788
+ patch_size: int = 14
789
+ image_size: Union[Tuple[int, int], int] = 336
790
+ global_pool: str = "map"
791
+ mlp_ratio: float = 3.7362
792
+ class_token: bool = False
793
+ num_classes: int = 0
794
+ use_checkpoint: bool = False
795
+
796
+
797
+ SigLIP_MODEL_CONFIG = {
798
+ "siglip_so400m_patch14_384": {
799
+ "image_size": 384,
800
+ "patch_size": 14,
801
+ "width": 1152,
802
+ "layers": 27,
803
+ "heads": 16,
804
+ "mlp_ratio": 3.7362,
805
+ "global_pool": "map",
806
+ "use_checkpoint": False,
807
+ },
808
+ "siglip_so400m_patch16_384": {
809
+ "image_size": 384,
810
+ "patch_size": 16,
811
+ "width": 1152,
812
+ "layers": 27,
813
+ "heads": 16,
814
+ "mlp_ratio": 3.7362,
815
+ "global_pool": "map",
816
+ "use_checkpoint": False,
817
+ },
818
+ "siglip_so400m_patch14_224": {
819
+ "image_size": 224,
820
+ "patch_size": 14,
821
+ "width": 1152,
822
+ "layers": 27,
823
+ "heads": 16,
824
+ "mlp_ratio": 3.7362,
825
+ "global_pool": "map",
826
+ "use_checkpoint": False,
827
+ },
828
+ "siglip_large_patch16_384": {
829
+ "image_size": 384,
830
+ "patch_size": 16,
831
+ "width": 1024,
832
+ "layers": 24,
833
+ "heads": 16,
834
+ "mlp_ratio": 4,
835
+ "global_pool": "map",
836
+ "use_checkpoint": False,
837
+ },
838
+ }
839
+
840
+
841
+ def resize_evaclip_pos_embed(model: VisionTransformer, interpolation: str = 'bicubic'):
842
+ # interpolate position embedding
843
+ orig_size = 24
844
+ new_size = 128
845
+ pos_tokens = model.pos_embed
846
+ pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, model.embed_dim).permute(0, 3, 1, 2)
847
+ pos_tokens = torch.nn.functional.interpolate(
848
+ pos_tokens, size=(new_size, new_size), mode=interpolation, align_corners=False)
849
+ pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
850
+ model.pos_embed = nn.Parameter(pos_tokens, requires_grad=True)
851
+ return model
852
+
853
+ def create_siglip_vit(
854
+ model_name: str = "siglip_so400m_patch14_384",
855
+ image_size: int = 384,
856
+ select_layer: int = -1,
857
+ path: str = "",
858
+ gradient_checkpointing: bool = False,
859
+ **kwargs,
860
+ ):
861
+ assert (
862
+ model_name in SigLIP_MODEL_CONFIG.keys()
863
+ ), f"model name should be in {SigLIP_MODEL_CONFIG.keys()}"
864
+
865
+ vision_cfg = SigLIPVisionCfg(**SigLIP_MODEL_CONFIG[model_name])
866
+
867
+ if select_layer <= 0:
868
+ layers = min(vision_cfg.layers, vision_cfg.layers + select_layer + 1)
869
+ else:
870
+ layers = min(vision_cfg.layers, select_layer)
871
+
872
+
873
+
874
+ if 'patch2x2' in path or 'patch4x4' in path:
875
+ add_patch2x2 = True
876
+ else:
877
+ add_patch2x2 = False
878
+
879
+ if 'patch4x4pool' in path or 'patch2x2from4x4' in path:
880
+ add_patch2x2 = 'v2'
881
+
882
+ if FORCE_NO_DOWNSAMPLE:
883
+ add_patch2x2 = False
884
+
885
+ model = VisionTransformer(
886
+ img_size=2048,
887
+ patch_size=16,
888
+ embed_dim=vision_cfg.width,
889
+ depth=layers,
890
+ num_heads=vision_cfg.heads,
891
+ mlp_ratio=vision_cfg.mlp_ratio,
892
+ class_token=vision_cfg.class_token,
893
+ global_pool=vision_cfg.global_pool,
894
+ dynamic_img_pad=False,
895
+ strict_img_size=False,
896
+ ignore_head=kwargs.get("ignore_head", False),
897
+ weight_init=kwargs.get("weight_init", "skip"),
898
+ num_classes=0,
899
+ add_patch2x2=add_patch2x2
900
+ )
901
+
902
+ if not SKIP_LOAD_VIT:
903
+ if path is not None and os.path.exists(path):
904
+ ckpt = path
905
+ else:
906
+ raise ValueError(f"Model checkpoint not found at {path}")
907
+ state_dict = torch.load(ckpt, map_location="cpu")
908
+ print('loading vision backbone from', path)
909
+
910
+ if 'genli' in path:
911
+ new_sd = {}
912
+ for k in state_dict.keys():
913
+ if k.startswith('base_model.model.model.vision_tower.vision_tower.'):
914
+ new_k = k.replace('base_model.model.model.vision_tower.vision_tower.', '')
915
+ new_sd[new_k] = state_dict[k]
916
+
917
+ if add_patch2x2:
918
+ if k.startswith('base_model.model.model.mm_projector.proj'):
919
+ new_k = k.replace('base_model.model.model.mm_projector.proj', 'downsample')
920
+ new_sd[new_k] = state_dict[k]
921
+
922
+ elif 'distill' in path:
923
+ new_sd = {}
924
+ state_dict = state_dict['model']
925
+ for k in state_dict.keys():
926
+ if k.startswith('vision_tower.'):
927
+ new_k = k.replace('vision_tower.', '')
928
+ new_sd[new_k] = state_dict[k]
929
+ else:
930
+ raise NotImplementedError
931
+ msg = model.load_state_dict(new_sd, strict=False)
932
+ print(msg)
933
+
934
+ else:
935
+ print("#### Skip loading vision backbone")
936
+
937
+ if gradient_checkpointing:
938
+ model.set_grad_checkpointing(True)
939
+ return model
940
+
941
+ from transformers import CLIPImageProcessor
942
+ import torch.distributed as dist
943
+
944
+ class SigLIPViTAnysizeWrapper(nn.Module):
945
+ def __init__(self, vision_tower, path, args, delay_load=False):
946
+ super().__init__()
947
+
948
+ self.is_loaded = False
949
+
950
+ self.vision_tower_name = vision_tower
951
+ self.args = args
952
+ self.path = path
953
+
954
+ self.select_layer = -1
955
+ if self.select_layer < -1: self.select_layer += 1
956
+ self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
957
+
958
+ self.output_dim = 1152
959
+ if not FORCE_NO_DOWNSAMPLE:
960
+ if 'patch2x2' in path or 'patch4x4' in path:
961
+ self.output_dim = 1152*2
962
+
963
+ if 'patch4x4pool' in path or 'patch2x2from4x4' in path:
964
+ self.output_dim = 1152*4
965
+
966
+ if not delay_load or LOAD_VISION_EARLY:
967
+ self.load_model()
968
+ elif getattr(args, "unfreeze_mm_vision_tower", False):
969
+ # TODO: better detector is needed.
970
+ print(f"The checkpoint seems to contain `vision_tower` weights: `unfreeze_mm_vision_tower`: True.")
971
+ self.load_model()
972
+
973
+ def load_model(self, device_map=None):
974
+ if self.is_loaded:
975
+ print('{} is already loaded, `load_model` called again, skipping.'.format(self.vision_tower_name))
976
+ return
977
+
978
+ self.image_processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14")
979
+ if self.args.mm_projector_type == "conv_mlp" or self.args.mm_projector_type == "multipath_conv_mlp" or self.args.mm_projector_type == "multipath_conv_mlp_woconv":
980
+ self.image_processor.crop_size['height'] = 384
981
+ self.image_processor.crop_size['width'] = 384
982
+ self.image_processor.size['shortest_edge'] = 384
983
+ print("Resizing CLIP image processor to 384...")
984
+ self.image_processor.image_mean = [0.5, 0.5, 0.5]
985
+ self.image_processor.image_std = [0.5, 0.5, 0.5]
986
+ print("Loading vision model...")
987
+ if VIT_WITH_GRAD:
988
+ self.vision_tower = create_siglip_vit(path=self.path, model_name='siglip_so400m_patch16_384',
989
+ gradient_checkpointing=True)
990
+ self.vision_tower.train()
991
+ else:
992
+ self.vision_tower = create_siglip_vit(path=self.path, model_name='siglip_so400m_patch16_384',
993
+ gradient_checkpointing=False)
994
+ for p in self.vision_tower.parameters():
995
+ p.requires_grad = False
996
+ self.vision_tower.eval()
997
+ self.is_loaded = True
998
+
999
+ def train(self, mode = True):
1000
+ self.training = mode
1001
+
1002
+ if self.is_loaded and not VIT_WITH_GRAD:
1003
+ self.vision_tower.eval()
1004
+
1005
+ def split_images(self, images, split_res=512, base_size=32):
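+ # Tile images larger than split_res**2 into sub-images aligned to base_size,
+ # recording crop coordinates (in base_size units) so features can be reassembled.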
1006
+ split_images = []
1007
+ sub_images_info = []
1008
+ for image in images:
1009
+ now_sub_images = []
1010
+ _, c, h, w = image.shape
1011
+ if h * w <= split_res * split_res:
1012
+ split_images.append(image)
1013
+ sub_images_info.append(
1014
+ (
1015
+ 1, 1, 1, h // base_size, w // base_size, [(0, h // base_size, 0, w // base_size)]
1016
+ )
1017
+ )
1018
+ continue
1019
+ nsplit_h = math.ceil(h / split_res)
1020
+ nsplit_w = math.ceil(w / split_res)
1021
+ sub_h = int(h / nsplit_h / base_size) * base_size
1022
+ sub_w = int(w / nsplit_w / base_size) * base_size
1023
+ crop_infos = []
1024
+ for i in range(nsplit_h):
1025
+ for j in range(nsplit_w):
1026
+ begin_h = i * sub_h
1027
+ begin_w = j * sub_w
1028
+
1029
+ if i == nsplit_h - 1:
1030
+ end_h = h
1031
+ else:
1032
+ end_h = (i + 1) * sub_h
1033
+
1034
+ if j == nsplit_w - 1:
1035
+ end_w = w
1036
+ else:
1037
+ end_w = (j + 1) * sub_w
1038
+
1039
+ assert (end_h - begin_h) % base_size == 0 and (end_w - begin_w) % base_size == 0
1040
+
1041
+ sub_image = image[:, :, begin_h:end_h, begin_w:end_w]
1042
+ now_sub_images.append(sub_image)
1043
+ crop_infos.append(
1044
+ (begin_h // base_size, end_h // base_size, begin_w // base_size, end_w // base_size)
1045
+ )
1046
+
1047
+ split_images += now_sub_images
1048
+ sub_images_info.append(
1049
+ (
1050
+ len(now_sub_images), nsplit_h, nsplit_w, h // base_size, w // base_size, crop_infos
1051
+ )
1052
+ )
1053
+
1054
+ return split_images, sub_images_info
1055
+
1056
+
1057
+ def unsplit_images(self, features, sizes, sub_images_info):
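+ # Scatter per-tile features back into a single (total_h, total_w) feature map
+ # using the crop coordinates recorded by split_images.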
1058
+ new_features = []
1059
+ for feature, size in zip(features, sizes):
1060
+ h, w = size
1061
+ new_features.append(
1062
+ feature.reshape(1, h, w, -1)
1063
+ )
1064
+
1065
+ fused_images = []
1066
+ images_sizes = []
1067
+ sub_count = 0
1068
+ for n_split, nsplit_h, nsplit_w, total_h, total_w, crop_infos in sub_images_info:
1069
+ sub_features = new_features[sub_count:sub_count+n_split]
1070
+ sub_count += n_split
1071
+
1072
+ total_feature = new_features[0].new_zeros(1, total_h, total_w, self.hidden_size)
1073
+ for feature, (begin_h, end_h, begin_w, end_w) in zip(sub_features, crop_infos):
1074
+ total_feature[:, begin_h:end_h, begin_w:end_w] += feature
1075
+
1076
+ fused_images.append(total_feature.reshape(1, total_h * total_w, self.hidden_size))
1077
+ images_sizes.append((total_h, total_w))
1078
+
1079
+ return fused_images, images_sizes
1080
+
1081
+
1082
+
1083
+ def forward_func(self, images, force_fix_size=False, cal_attn_pool=False):
1084
+ if type(images) is list:
1085
+ xs = [x.to(self.dtype) for x in images]
1086
+ image_features, img_size, cls_token = self.vision_tower(xs, cal_attn_pool=cal_attn_pool)
1087
+ image_features = [x.to(images[0].dtype) for x in image_features]
1088
+
1089
+ else:
1090
+ image_forward_outs, img_size, cls_token = self.vision_tower(images.to(self.dtype), cal_attn_pool=cal_attn_pool)
1091
+ image_features = image_forward_outs.to(images.dtype)
1092
+
1093
+ return image_features, img_size, cls_token
1094
+
1095
+ def forward(self, images, cal_attn_pool=False):
1096
+ if VIT_WITH_GRAD:
1097
+ image_features, img_size, cls_token = self.forward_func(images, cal_attn_pool=cal_attn_pool)
1098
+ return image_features, img_size
1099
+ else:
1100
+ with torch.no_grad():
1101
+ image_features, img_size, cls_token = self.forward_func(images, cal_attn_pool=cal_attn_pool)
1102
+ return image_features, img_size
1103
+
1104
+
1105
+ @property
1106
+ def dummy_feature(self):
1107
+ return torch.zeros(1, 1152, device=self.device, dtype=self.dtype)
1108
+
1109
+ @property
1110
+ def dtype(self):
1111
+ return self.vision_tower.pos_embed.dtype
1112
+
1113
+ @property
1114
+ def device(self):
1115
+ return self.vision_tower.pos_embed.device
1116
+
1117
+ @property
1118
+ def hidden_size(self):
1119
+ return self.output_dim
1120
+
1121
+ @property
1122
+ def config(self):
1123
+ return type('LLaVAConfigWrapper', (), {
1124
+ # 'image_size': 224,
1125
+ 'patch_size': 16,
1126
+ })()
ola/model/multimodal_projector/__pycache__/builder.cpython-310.pyc ADDED
Binary file (4.61 kB). View file
 
ola/model/multimodal_projector/__pycache__/builder.cpython-38.pyc ADDED
Binary file (4.62 kB). View file
 
ola/model/multimodal_projector/__pycache__/pooler_projector.cpython-310.pyc ADDED
Binary file (2.76 kB). View file
 
ola/model/multimodal_projector/__pycache__/pooler_projector.cpython-38.pyc ADDED
Binary file (2.78 kB). View file
 
ola/model/multimodal_projector/builder.py ADDED
@@ -0,0 +1,179 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import re
4
+
5
+ import math
6
+
7
+ from .pooler_projector import NormalizedDwPooler
8
+ import os
9
+ import math
10
+
11
+
12
+ if 'REGIONAL_POOL' in os.environ:
13
+ REGIONAL_POOL = os.environ['REGIONAL_POOL']
14
+ else:
15
+ REGIONAL_POOL = '2x'
16
+ print(f"REGIONAL_POOL is set as {REGIONAL_POOL}")
17
+
18
+ class IdentityMap(nn.Module):
19
+ def __init__(self):
20
+ super().__init__()
21
+
22
+ def forward(self, x, *args, **kwargs):
23
+ return x
24
+
25
+ @property
26
+ def config(self):
27
+ return {"mm_projector_type": 'identity'}
28
+
29
+
30
+ class SimpleResBlock(nn.Module):
31
+ def __init__(self, channels):
32
+ super().__init__()
33
+ self.pre_norm = nn.LayerNorm(channels)
34
+
35
+ self.proj = nn.Sequential(
36
+ nn.Linear(channels, channels),
37
+ nn.GELU(),
38
+ nn.Linear(channels, channels)
39
+ )
40
+ def forward(self, x):
41
+ x = self.pre_norm(x)
42
+ return x + self.proj(x)
43
+
44
+ class OlaMLP(nn.Module):
45
+ def __init__(self, in_channels, out_channels, twoview=False):
46
+ super().__init__()
47
+
48
+ self.proj1 = nn.Linear(in_channels, out_channels)
49
+ self.proj2 = nn.Linear(out_channels, out_channels)
50
+ self.act = nn.GELU()
51
+ self.pooler = NormalizedDwPooler(out_channels)
52
+
53
+ embed_std = 1 / math.sqrt(out_channels)
54
+ self.image_newline = nn.Parameter(
55
+ torch.randn(out_channels) * embed_std
56
+ )
57
+ self.image_begin = nn.Parameter(
58
+ torch.randn(out_channels) * embed_std
59
+ )
60
+ self.image_end = nn.Parameter(
61
+ torch.randn(out_channels) * embed_std
62
+ )
63
+
64
+ if twoview:
65
+ self.image_sep = nn.Parameter(
66
+ torch.randn(out_channels) * embed_std
67
+ )
68
+
69
+ def forward(self, x, size=(16,16), x2=None, size2=(16, 16), modalities='image'):
70
+
71
+ if modalities in ['image', 'text']:
72
+ h, w = size
73
+ dtype = x.dtype
74
+ x = x.reshape(x.shape[0], h, w, -1)
75
+ x = self.proj1(x)
76
+ x = self.pooler(x, forward_type=REGIONAL_POOL)
77
+ x = self.act(x)
78
+ x = self.proj2(x)
79
+
80
+
81
+ b, h, w, c = x.shape
82
+ x = torch.cat([
83
+ x,
84
+ self.image_newline.reshape(1, 1, 1, c).expand(b, h, 1, c).to(dtype)
85
+ ], dim=2)
86
+ x = x.reshape(b, -1, c)
87
+
88
+ if x2 is not None:
89
+ h2, w2 = size2
90
+ x2 = x2.reshape(x2.shape[0], h2, w2, -1)
91
+ x2 = self.proj1(x2)
92
+ x2 = self.pooler(x2, forward_type=REGIONAL_POOL)
93
+ x2 = self.act(x2)
94
+ x2 = self.proj2(x2)
95
+
96
+ b2, h2, w2, c2 = x2.shape
97
+ x2 = torch.cat([
98
+ x2,
99
+ self.image_newline.reshape(1, 1, 1, c).expand(b, h2, 1, c).to(dtype)
100
+ ], dim=2)
101
+ x2 = x2.reshape(b, -1, c)
102
+ sep = self.image_sep.reshape(1, 1, -1).expand(b, 1, c2).to(dtype)
103
+ x = torch.cat([x, sep, x2], dim=1)
104
+
105
+ begin = self.image_begin.reshape(1, 1, -1).expand(b, 1, c).to(dtype)
106
+ end = self.image_end.reshape(1, 1, -1).expand(b, 1, c).to(dtype)
107
+ x = torch.cat([begin, x, end], dim=1)
108
+ return x
109
+ elif modalities in ['video']:
110
+ # x2 is the true feature, ignore x
111
+ h, w = size
112
+ dtype = x.dtype
113
+ x = x.reshape(x.shape[0], h, w, -1)
114
+ x1 = self.proj1(x)
115
+ x1 = self.pooler(x1, forward_type=REGIONAL_POOL)
116
+ x1 = self.proj2(x1).mean() * 0.0
117
+
118
+ h2, w2 = size2
119
+ x2 = x2.reshape(x2.shape[0], h2, w2, -1)
120
+ x2 = self.proj1(x2)
121
+ x2 = self.pooler(x2, forward_type=REGIONAL_POOL)
122
+ x2 = self.act(x2)
123
+ x2 = self.proj2(x2)
124
+
125
+ b2, h2, w2, c = x2.shape
126
+ x2 = torch.cat([
127
+ x2,
128
+ self.image_newline.reshape(1, 1, 1, c).expand(b2, h2, 1, c).to(dtype)
129
+ ], dim=2)
130
+
131
+ x2 = x2.reshape(b2, -1, c)
132
+
133
+ sep = self.image_sep.reshape(1, 1, -1).expand(b2, 1, c).to(dtype)
134
+ x2 = torch.cat([x2, sep], dim=1)
135
+
136
+ x2 = x2.flatten(0, 1)
137
+
138
+ begin = self.image_begin.reshape(1, -1).expand(1, c).to(dtype)
139
+ end = self.image_end.reshape(1, -1).expand(1, c).to(dtype)
140
+ x2 = torch.cat([begin, x2, end], dim=0)
141
+ x2 = x2.unsqueeze(0)
142
+ return x2
143
+ else:
144
+ raise ValueError(f'Unknown modalities: {modalities}')
145
+
146
+ def build_vision_projector(config, delay_load=False, **kwargs):
147
+ projector_type = getattr(config, 'mm_projector_type', 'linear')
148
+
149
+ if projector_type == 'linear':
150
+ return nn.Linear(config.mm_hidden_size, config.hidden_size)
151
+
152
+ elif projector_type == 'ola_mlp':
153
+ return OlaMLP(config.mm_hidden_size, config.hidden_size, twoview=True)
154
+
155
+ mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type)
156
+ if mlp_gelu_match:
157
+ mlp_depth = int(mlp_gelu_match.group(1))
158
+ modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)]
159
+ for _ in range(1, mlp_depth):
160
+ modules.append(nn.GELU())
161
+ modules.append(nn.Linear(config.hidden_size, config.hidden_size))
162
+ return nn.Sequential(*modules)
163
+
164
+ mlp_gelu_resnet_match = re.match(r'^mlp(\d+)x_res(\d+)x_gelu$', projector_type)
165
+ if mlp_gelu_resnet_match:
166
+ mlp_depth = int(mlp_gelu_resnet_match.group(1))
167
+ res_depth = int(mlp_gelu_resnet_match.group(2))
168
+ modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)]
169
+ for _ in range(1, mlp_depth):
170
+ modules.append(nn.GELU())
171
+ modules.append(nn.Linear(config.hidden_size, config.hidden_size))
172
+ for _ in range(res_depth):
173
+ modules.append(SimpleResBlock(config.hidden_size))
174
+ return nn.Sequential(*modules)
175
+
176
+ if projector_type == 'identity':
177
+ return IdentityMap()
178
+
179
+ raise ValueError(f'Unknown projector type: {projector_type}')
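A hedged usage sketch for `build_vision_projector` with the `ola_mlp` projector type. It assumes the `ola` package from this Space is importable (repo root on `PYTHONPATH`, `torch` and `transformers` installed); the `SimpleNamespace` config fields and the dimensions are illustrative placeholders, not values read from the Space's config.

```python
from types import SimpleNamespace
import torch

from ola.model.multimodal_projector.builder import build_vision_projector

cfg = SimpleNamespace(
    mm_projector_type='ola_mlp',
    mm_hidden_size=1152,   # vision-tower feature dim (assumed)
    hidden_size=3584,      # LLM hidden dim (assumed)
)
projector = build_vision_projector(cfg)

feats = torch.randn(1, 16 * 16, cfg.mm_hidden_size)     # one 16x16 feature grid
tokens = projector(feats, size=(16, 16), modalities='image')
print(tokens.shape)   # torch.Size([1, 74, 3584]) with the default '2x' pooling
```

With the default `REGIONAL_POOL='2x'`, the 16x16 grid is pooled to 8x8, a learned newline token is appended to each of the 8 rows, and begin/end tokens wrap the sequence, giving 8 * 9 + 2 = 74 tokens.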
ola/model/multimodal_projector/pooler_projector.py ADDED
@@ -0,0 +1,74 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import math
+
+ from transformers.models.clip.modeling_clip import CLIPVisionModel
+ import os
+
+ if 'NORMALIZE_POOL' in os.environ:
+     NORMALIZE_POOL = bool(int(os.environ['NORMALIZE_POOL']))
+     print(f'NORMALIZE_POOL: {NORMALIZE_POOL}')
+ else:
+     NORMALIZE_POOL = True
+
+
+ class PoolerProjector(nn.Module):
+     def __init__(self, config, vision_cfg):
+         super().__init__()
+         self._config = config
+         self.hw = vision_cfg.image_size // vision_cfg.patch_size
+
+         self.conv_pool = nn.Conv2d(
+             config.mm_hidden_size, config.hidden_size,
+             kernel_size=2, stride=2
+         )
+
+         self.proj = nn.Sequential(
+             nn.GELU(),
+             nn.Linear(config.hidden_size, config.hidden_size),
+         )
+
+     def forward(self, x, *args, **kwargs):
+         height = width = self.hw
+         assert height * width == x.shape[1]
+         x = x.view(x.shape[0], height, width, -1).permute(0, 3, 1, 2)
+         x = self.conv_pool(x)
+         x = x.flatten(2).transpose(1, 2)
+         x = self.proj(x)
+         return x
+
+     @property
+     def config(self):
+         return {"mm_projector_type": 'pooler'}
+
+
+ class NormalizedDwPooler(nn.Module):
+     def __init__(self, dim):
+         super().__init__()
+         self.dim = dim
+         self.predictor = nn.Sequential(
+             nn.Linear(dim*2, dim),
+             nn.GELU(),
+             nn.Linear(dim, dim),
+         )
+
+     def forward(self, x, forward_type='2x'):
+         B, H, W, C = x.shape
+
+         if forward_type == '2x':
+             new_x = x.reshape(B, H//2, 2, W//2, 2, C).permute(0, 1, 3, 2, 4, 5).reshape(B, H//2, W//2, 4, C)
+             pooled_x = new_x.mean(-2, keepdim=True).expand(-1, -1, -1, 4, -1)
+             fused_x = torch.cat([new_x, pooled_x], dim=-1)
+         elif forward_type == '1x':
+             new_x = x.reshape(B, H, W, 1, C)
+             fused_x = torch.cat([new_x, new_x], dim=-1)
+         elif forward_type == '4x':
+             new_x = x.reshape(B, H//4, 4, W//4, 4, C).permute(0, 1, 3, 2, 4, 5).reshape(B, H//4, W//4, 16, C)
+             pooled_x = new_x.mean(-2, keepdim=True).expand(-1, -1, -1, 16, -1)
+             fused_x = torch.cat([new_x, pooled_x], dim=-1)
+
+         score = self.predictor(fused_x)
+         normalized_score = F.softmax(score, dim=-2)
+         new_x = (new_x * normalized_score).sum(dim=-2)
+         return new_x
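A small shape check for `NormalizedDwPooler`, again assuming the `ola` package is importable: `'2x'` merges each 2x2 window of a `(B, H, W, C)` grid into one vector via a learned softmax over the four positions, halving the spatial resolution while keeping the channel dimension. The dimensions below are illustrative.

```python
import torch
from ola.model.multimodal_projector.pooler_projector import NormalizedDwPooler

pooler = NormalizedDwPooler(dim=64)
x = torch.randn(2, 16, 16, 64)               # (B, H, W, C), H and W divisible by 2
print(pooler(x, forward_type='2x').shape)    # torch.Size([2, 8, 8, 64])
print(pooler(x, forward_type='1x').shape)    # torch.Size([2, 16, 16, 64]) (no pooling)
```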
ola/model/multimodal_resampler/__pycache__/builder.cpython-310.pyc ADDED
Binary file (1.18 kB)
 
ola/model/multimodal_resampler/__pycache__/builder.cpython-38.pyc ADDED
Binary file (1.18 kB)
 
ola/model/multimodal_resampler/__pycache__/perceiver.cpython-310.pyc ADDED
Binary file (2.81 kB)
 
ola/model/multimodal_resampler/__pycache__/perceiver.cpython-38.pyc ADDED
Binary file (2.83 kB)
 
ola/model/multimodal_resampler/builder.py ADDED
@@ -0,0 +1,24 @@
+ import torch
+
+ from .perceiver import DynamicCompressor
+
+ class IdentityMap(torch.nn.Module):
+     def __init__(self):
+         super().__init__()
+
+     def forward(self, x, *args, **kwargs):
+         return x
+
+     @property
+     def config(self):
+         return {"mm_resampler_type": None}
+
+ def build_vision_resampler(model_args, delay_load=False, **kwargs):
+     # import pdb;pdb.set_trace()
+     resampler_type = getattr(model_args, 'mm_resampler_type', None)
+     if resampler_type == 'dynamic_compressor':
+         return DynamicCompressor(model_args, **kwargs)
+     elif resampler_type is None:
+         return IdentityMap()
+     else:
+         raise ValueError(f'Unknown resampler type: {resampler_type}')
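Finally, a hedged sketch of `build_vision_resampler`: when `mm_resampler_type` is unset it falls back to the pass-through `IdentityMap`, while `'dynamic_compressor'` selects the `DynamicCompressor` from `perceiver.py`. The `SimpleNamespace` stand-in for `model_args` and the feature shape are illustrative, and the import assumes the `ola` package is on `PYTHONPATH`.

```python
from types import SimpleNamespace
import torch

from ola.model.multimodal_resampler.builder import build_vision_resampler

# No resampler configured -> IdentityMap, which returns its input unchanged.
resampler = build_vision_resampler(SimpleNamespace(mm_resampler_type=None))
feats = torch.randn(1, 256, 1152)   # illustrative (batch, tokens, channels)
print(resampler(feats).shape)       # torch.Size([1, 256, 1152])
```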