import requests
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
import torch
from PIL import Image
# Model 1: ViT-GPT2 image captioning (nlpconnect/vit-gpt2-image-captioning).
model1 = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
feature_extractor1 = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer1 = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

device1 = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model1.to(device1)

# Generation settings shared by the model-1 helpers: short captions, beam search.
max_length = 16
num_beams = 4
gen_kwargs = {"max_length": max_length, "num_beams": num_beams}
def image_to_text_model_1(image_url):
    raw_image = Image.open(requests.get(image_url, stream=True).raw).convert('RGB')
    pixel_values = feature_extractor1(images=[raw_image], return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device1)
    output_ids = model1.generate(pixel_values, **gen_kwargs)
    preds = tokenizer1.batch_decode(output_ids, skip_special_tokens=True)
    preds = [pred.strip() for pred in preds]
    return preds
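# Example usage (a sketch; the URL is a public COCO sample image used only for illustration):
# print(image_to_text_model_1("http://images.cocodataset.org/val2017/000000039769.jpg"))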
def bytes_to_text_model_1(bts):
    # Despite the name, this accepts an in-memory image (numpy array or PIL image),
    # not raw bytes; ViTImageProcessor handles the conversion.
    pixel_values = feature_extractor1(images=[bts], return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device1)
    output_ids = model1.generate(pixel_values, **gen_kwargs)
    preds = tokenizer1.batch_decode(output_ids, skip_special_tokens=True)
    preds = [pred.strip() for pred in preds]
    print(preds[0])
# Model 2: FuseCap (BLIP) captioning with a text prompt.
from transformers import BlipProcessor, BlipForConditionalGeneration
device2 = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
processor2 = BlipProcessor.from_pretrained("noamrot/FuseCap")
model2 = BlipForConditionalGeneration.from_pretrained("noamrot/FuseCap").to(device2)
def image_to_text_model_2(img_url):
    raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
    text = "a picture of "
    inputs = processor2(raw_image, text, return_tensors="pt").to(device2)
    out = model2.generate(**inputs, num_beams=3)
    print(processor2.decode(out[0], skip_special_tokens=True))
def bytes_to_text_model_2(byts):
    # Accepts an in-memory image (numpy array or PIL image), not raw bytes.
    text = "a picture of "
    inputs = processor2(byts, text, return_tensors="pt").to(device2)
    out = model2.generate(**inputs, num_beams=3)
    print(processor2.decode(out[0], skip_special_tokens=True))
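# Example usage (a sketch): the helper also works on an image loaded from disk;
# "frame.jpg" below is a hypothetical local file.
# bytes_to_text_model_2(Image.open("frame.jpg").convert("RGB"))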
# Model 3: BLIP large unconditional captioning (Salesforce/blip-image-captioning-large).
processor3 = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model3 = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")
def image_to_text_model_3(img_url):
    raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
    # Unconditional captioning: no text prompt is passed to the processor.
    inputs = processor3(raw_image, return_tensors="pt")
    out = model3.generate(**inputs)
    print(processor3.decode(out[0], skip_special_tokens=True))
def bytes_to_text_model_3(byts):
    # Accepts an in-memory image (numpy array or PIL image); unconditional captioning.
    inputs = processor3(byts, return_tensors="pt")
    out = model3.generate(**inputs)
    print(processor3.decode(out[0], skip_special_tokens=True))
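# Example usage (a sketch; the same illustrative COCO URL as above):
# image_to_text_model_3("http://images.cocodataset.org/val2017/000000039769.jpg")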
import cv2

def FrameCapture(path):
    # Caption every 20th frame of the video with all three models.
    vidObj = cv2.VideoCapture(path)
    count = 0
    success = True
    while success:
        success, image = vidObj.read()
        if not success:
            break
        if count % 20 == 0:
            # OpenCV decodes frames as BGR; convert to RGB before captioning.
            frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            print("NEW FRAME")
            print("MODEL 1")
            bytes_to_text_model_1(frame)
            print("MODEL 2")
            bytes_to_text_model_2(frame)
            print("MODEL 3")
            bytes_to_text_model_3(frame)
            print("\n\n")
        count += 1
    vidObj.release()

FrameCapture("animation.mp4")
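# Optional variant (a sketch, not part of the original script): sample roughly one
# frame per second using the video's reported FPS instead of a fixed 20-frame stride.
# The input path "animation.mp4" is the same file assumed above.
def frame_capture_per_second(path):
    vidObj = cv2.VideoCapture(path)
    fps = vidObj.get(cv2.CAP_PROP_FPS) or 25.0  # fall back if FPS metadata is missing
    stride = max(int(round(fps)), 1)
    count = 0
    while True:
        success, image = vidObj.read()
        if not success:
            break
        if count % stride == 0:
            bytes_to_text_model_1(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        count += 1
    vidObj.release()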