"""VideoLLaMA3 model configuration."""
|
|
|
import importlib.util
import os.path as osp
from typing import Any, Dict, Optional, Union

from transformers import AutoConfig, AutoModel, PretrainedConfig, Qwen2Config
|
|
|
# Prefer the package-relative import. If it fails, load the sibling file that
# sits next to this module directly from disk instead.
#
# ImportError (not only its subclass ModuleNotFoundError) is caught because a
# relative import in a module that has no parent package raises a plain
# ImportError ("attempted relative import with no known parent package").
try:
    from .configuration_videollama3_encoder import Videollama3VisionEncoderConfig
except ImportError:
    spec = importlib.util.spec_from_file_location(
        "configuration_videollama3_encoder",
        osp.join(osp.dirname(__file__), "configuration_videollama3_encoder.py"),
    )
    configuration_videollama3_encoder = importlib.util.module_from_spec(spec)
    # Execute the file so that its top-level definitions exist on the module.
    spec.loader.exec_module(configuration_videollama3_encoder)
    Videollama3VisionEncoderConfig = (
        configuration_videollama3_encoder.Videollama3VisionEncoderConfig
    )
|
|
|
# Prefer the package-relative import. If it fails, load the sibling file that
# sits next to this module directly from disk instead.
#
# ImportError (not only its subclass ModuleNotFoundError) is caught because a
# relative import in a module that has no parent package raises a plain
# ImportError ("attempted relative import with no known parent package").
try:
    from .modeling_videollama3_encoder import Videollama3VisionEncoderModel
except ImportError:
    spec = importlib.util.spec_from_file_location(
        "modeling_videollama3_encoder",
        osp.join(osp.dirname(__file__), "modeling_videollama3_encoder.py"),
    )
    modeling_videollama3_encoder = importlib.util.module_from_spec(spec)
    # Execute the file so that its top-level definitions exist on the module.
    spec.loader.exec_module(modeling_videollama3_encoder)
    Videollama3VisionEncoderModel = (
        modeling_videollama3_encoder.Videollama3VisionEncoderModel
    )
|
|
|
# Register the vision encoder with the Auto* factories at import time:
# first the config class under its model-type string, then the model class
# keyed by that config class.
AutoConfig.register("videollama3_vision_encoder", Videollama3VisionEncoderConfig)
AutoModel.register(Videollama3VisionEncoderConfig, Videollama3VisionEncoderModel)
|
|
|
|
|
# Sentinel default for ``vision_encoder_config``: distinguishes "argument
# omitted" from an explicit ``None`` without using a shared mutable ``{}``
# as the default value.
_UNSET: Any = object()


class Videollama3Qwen2Config(Qwen2Config):
    """VideoLLaMA3 configuration that extends ``Qwen2Config``.

    On top of the ``Qwen2Config`` fields it stores a vision-encoder
    identifier, a vision-encoder sub-configuration and three multimodal
    settings (see ``__init__``).
    """

    model_type = "videollama3_qwen2"
    sub_configs = {"vision_encoder_config": Videollama3VisionEncoderConfig}

    def __init__(
        self,
        vision_encoder: Optional[str] = None,
        vision_encoder_config: Optional[Union[Dict[str, Any], PretrainedConfig]] = _UNSET,
        mm_projector_type: str = "mlp2x_gelu",
        use_token_compression: bool = True,
        image_token_index: int = -1,
        **kwargs,
    ):
        """Build the configuration.

        Args:
            vision_encoder: Stored unchanged as ``self.vision_encoder``.
                Presumably a name or path identifying the vision encoder;
                confirm against the model code.
            vision_encoder_config: One of
                - omitted: a ``Videollama3VisionEncoderConfig`` is built with
                  no arguments;
                - a mapping: unpacked as keyword arguments into
                  ``Videollama3VisionEncoderConfig``;
                - a ``PretrainedConfig`` instance: stored as is;
                - ``None``: stored as ``None``.
            mm_projector_type: Stored unchanged as ``self.mm_projector_type``.
            use_token_compression: Stored unchanged as
                ``self.use_token_compression``.
            image_token_index: Stored unchanged as ``self.image_token_index``.
            **kwargs: Forwarded to ``Qwen2Config.__init__``.
        """
        super().__init__(**kwargs)
        self.model_type = "videollama3_qwen2"

        self.vision_encoder = vision_encoder

        if vision_encoder_config is _UNSET:
            # Fresh dict per call; yields the same default sub-config that the
            # former ``{}`` default produced.
            vision_encoder_config = {}
        if vision_encoder_config is not None and not isinstance(vision_encoder_config, PretrainedConfig):
            vision_encoder_config = Videollama3VisionEncoderConfig(**vision_encoder_config)
        self.vision_encoder_config = vision_encoder_config

        self.mm_projector_type = mm_projector_type
        self.use_token_compression = use_token_compression
        self.image_token_index = image_token_index
|
|