import os | |
from typing import List | |
import cv2 | |
import numpy as np | |
import torch | |
import torch.nn as nn | |
from PIL import Image | |
from utils.text_encoder import text_encoder | |
from utils.vision_encoder import get_vision_encoder | |
class VideoCLIP_XL(nn.Module):
    """Container module holding a text model and a vision model.

    Attributes:
        text_model: Object returned by ``text_encoder.load()`` after
            calling ``.float()`` on it.
        vision_model: Object returned by ``get_vision_encoder()`` after
            calling ``.float()`` on it.
    """

    def __init__(self):
        """Load both sub-models and attach them as attributes."""
        super().__init__()

        # Text side is assigned first so attribute/submodule registration
        # order stays text -> vision.
        loaded_text = text_encoder.load()
        # NOTE(review): presumably an nn.Module, so .float() casts its
        # floating-point parameters/buffers to float32 -- confirm.
        self.text_model = loaded_text.float()

        loaded_vision = get_vision_encoder()
        self.vision_model = loaded_vision.float()