import requests
import torch
import gradio as gr
from PIL import Image
from huggingface_hub import hf_hub_download
from open_flamingo import create_model_and_transforms

model, image_processor, tokenizer = create_model_and_transforms(
    clip_vision_encoder_path="ViT-L-14",
    clip_vision_encoder_pretrained="openai",
    lang_encoder_path="anas-awadalla/mpt-1b-redpajama-200b",
    tokenizer_path="anas-awadalla/mpt-1b-redpajama-200b",
    cross_attn_every_n_layers=1,
)

# Grab the model checkpoint from the Hugging Face Hub.
checkpoint_path = hf_hub_download("openflamingo/OpenFlamingo-3B-vitl-mpt1b", "checkpoint.pt")
model.load_state_dict(torch.load(checkpoint_path), strict=False)

"""
Step 1: Load images
The first two images are in-context examples; the third is the default query image.
"""
demo_image_one = Image.open(
    requests.get(
        "http://images.cocodataset.org/val2017/000000039769.jpg", stream=True
    ).raw
)

demo_image_two = Image.open(
    requests.get(
        "http://images.cocodataset.org/test-stuff2017/000000028137.jpg", stream=True
    ).raw
)

query_image = Image.open(
    requests.get(
        "http://images.cocodataset.org/test-stuff2017/000000028352.jpg", stream=True
    ).raw
)

"""
Step 2: Preprocessing images
Details: For OpenFlamingo, we expect the images to be a torch tensor of shape
batch_size x num_media x num_frames x channels x height x width.
In this case batch_size = 1, num_media = 3, num_frames = 1,
channels = 3, height = 224, width = 224.

Step 3: Preprocessing text
Details: In the text we expect an <image> special token to indicate where an
image is. We also expect an <|endofchunk|> special token to indicate the end
of the text portion associated with an image.

Step 4: Generate text

Steps 2-4 happen inside predict_caption below, so the demo can condition on
the user-provided query image and prompt.
"""

DEFAULT_PROMPT = (
    "<image>An image of two cats.<|endofchunk|>"
    "<image>An image of a bathroom sink.<|endofchunk|>"
    "<image>An image of"
)


def predict_caption(image, prompt):
    assert isinstance(prompt, str)

    # Step 2: preprocess the two in-context images plus the user's query image.
    vision_x = [
        image_processor(demo_image_one).unsqueeze(0),
        image_processor(demo_image_two).unsqueeze(0),
        image_processor(image).unsqueeze(0),
    ]
    vision_x = torch.cat(vision_x, dim=0)
    vision_x = vision_x.unsqueeze(1).unsqueeze(0)

    # Step 3: preprocess the text prompt.
    tokenizer.padding_side = "left"  # For generation, padding tokens should be on the left.
    lang_x = tokenizer([prompt], return_tensors="pt")

    # Step 4: generate text conditioned on the images and prompt.
    generated_text = model.generate(
        vision_x=vision_x,
        lang_x=lang_x["input_ids"],
        attention_mask=lang_x["attention_mask"],
        max_new_tokens=20,
        num_beams=3,
    )
    caption = tokenizer.decode(generated_text[0])
    return caption


iface = gr.Interface(
    fn=predict_caption,
    inputs=[gr.Image(type="pil"), gr.Textbox(value=DEFAULT_PROMPT, label="Prompt")],
    outputs="text",
)
iface.launch()