import requests
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
import torch
from PIL import Image

# Model 1: ViT encoder + GPT-2 decoder captioner
model1 = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
feature_extractor1 = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer1 = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

device1 = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model1.to(device1)

# Generation settings shared by the model-1 helpers below
max_length = 16
num_beams = 4
gen_kwargs = {"max_length": max_length, "num_beams": num_beams}

def image_to_text_model_1(image_url):
    raw_image = Image.open(requests.get(image_url, stream=True).raw).convert('RGB')

    pixel_values = feature_extractor1(images=[raw_image], return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device1)

    output_ids = model1.generate(pixel_values, **gen_kwargs)

    preds = tokenizer1.batch_decode(output_ids, skip_special_tokens=True)
    preds = [pred.strip() for pred in preds]
    return preds
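
# Example usage of the URL-based helper (the URL below is a placeholder, not a real asset):
# print(image_to_text_model_1("https://example.com/sample.jpg"))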


def bytes_to_text_model_1(frame):
    # `frame` should be an RGB image (PIL Image or HxWx3 numpy array), not raw bytes
    pixel_values = feature_extractor1(images=[frame], return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device1)

    output_ids = model1.generate(pixel_values, **gen_kwargs)

    preds = tokenizer1.batch_decode(output_ids, skip_special_tokens=True)
    preds = [pred.strip() for pred in preds]
    print(preds[0])


from transformers import BlipProcessor, BlipForConditionalGeneration

# Model 2: FuseCap, a BLIP-based captioner
device2 = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
processor2 = BlipProcessor.from_pretrained("noamrot/FuseCap")
model2 = BlipForConditionalGeneration.from_pretrained("noamrot/FuseCap").to(device2)


def image_to_text_model_2(img_url):
    raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
    # FuseCap captions are conditioned on a text prefix
    text = "a picture of "
    inputs = processor2(raw_image, text, return_tensors="pt").to(device2)

    out = model2.generate(**inputs, num_beams=3)
    print(processor2.decode(out[0], skip_special_tokens=True))


def bytes_to_text_model_2(frame):
    # `frame` should be an RGB image (PIL Image or HxWx3 numpy array)
    text = "a picture of "
    inputs = processor2(frame, text, return_tensors="pt").to(device2)

    out = model2.generate(**inputs, num_beams=3)
    print(processor2.decode(out[0], skip_special_tokens=True))
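
# Note: BlipProcessor (like the ViT processor above) accepts PIL images as well as
# numpy arrays, so decoded video frames can be passed in directly once converted to RGB.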


# Model 3: Salesforce BLIP-large captioner (left on CPU; no .to(device) call)
processor3 = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-large")
model3 = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-large")


def image_to_text_model_3(img_url):
    raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
    # Unconditional captioning; for conditional captioning, pass a prompt instead:
    # inputs = processor3(raw_image, "a picture of", return_tensors="pt")
    inputs = processor3(raw_image, return_tensors="pt")

    out = model3.generate(**inputs)
    print(processor3.decode(out[0], skip_special_tokens=True))


def bytes_to_text_model_3(frame):
    # `frame` should be an RGB image (PIL Image or HxWx3 numpy array)
    inputs = processor3(frame, return_tensors="pt")

    out = model3.generate(**inputs)
    print(processor3.decode(out[0], skip_special_tokens=True))


import cv2


def FrameCapture(path):
    vidObj = cv2.VideoCapture(path)
    count = 0

    while True:
        success, image = vidObj.read()
        if not success:
            # End of the video (or the file could not be read)
            break

        # Caption every 20th frame with all three models
        if count % 20 == 0:
            # OpenCV decodes frames as BGR; the captioners expect RGB
            rgb_frame = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

            print("NEW FRAME")
            print("MODEL 1")
            bytes_to_text_model_1(rgb_frame)
            print("MODEL 2")
            bytes_to_text_model_2(rgb_frame)
            print("MODEL 3")
            bytes_to_text_model_3(rgb_frame)

            print("\n\n")

        count += 1

    vidObj.release()


FrameCapture("animation.mp4")
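# "animation.mp4" is an assumed local video path; point FrameCapture at any
# video file available on disk to caption its frames.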