import os
from typing import List

import cv2
import numpy as np
import torch
import torch.nn as nn
from PIL import Image

from utils.text_encoder import text_encoder
from utils.vision_encoder import get_vision_encoder


class VideoCLIP_XL(nn.Module):
    """Container module holding a text encoder and a vision encoder.

    Both encoders come from project-local helpers (``text_encoder.load`` and
    ``get_vision_encoder``) and have ``.float()`` called on them before being
    stored as ``self.text_model`` and ``self.vision_model``.
    """

    def __init__(self):
        """Build both encoders and attach them to this module."""
        super().__init__()

        # NOTE(review): presumably both helpers return ``nn.Module`` objects,
        # in which case ``.float()`` casts their floating-point parameters and
        # buffers to float32 and returns the same module -- confirm against
        # ``utils.text_encoder`` / ``utils.vision_encoder``.
        text_branch = text_encoder.load()
        self.text_model = text_branch.float()

        vision_branch = get_vision_encoder()
        self.vision_model = vision_branch.float()