# File size: 2,563 Bytes
# Hex identifiers shown by the web file viewer (presumably commit hashes):
# 58e7332 c4b6d3a 58e7332 4fbba2c 58e7332 534a65b 58e7332 04f5d32
# (The viewer's line-number gutter, 1-78, was page residue and is not part of the script.)
# -*- coding: utf-8 -*-
"""image_caption
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1wo4dOccibBJyLj9E3anSLGeMCWbnIPS1
"""
#pip install transformers -q
#pip install gradio -q
from PIL import Image
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, PreTrainedTokenizerFast
import requests
# Hub checkpoint identifiers for the three pretrained components used below.
_CAPTION_MODEL_ID = "nlpconnect/vit-gpt2-image-captioning"
_FEATURE_EXTRACTOR_ID = "google/vit-base-patch16-224-in21k"
_TOKENIZER_ID = "distilgpt2"

# Vision encoder + text decoder that produces caption token ids.
model = VisionEncoderDecoderModel.from_pretrained(_CAPTION_MODEL_ID)

# Image preprocessing and token decoding.
# NOTE(review): both are loaded from different checkpoints than the captioning
# model itself; presumably their configuration/vocabulary is compatible with
# it -- confirm against the model card.
vit_feature_extractor = ViTFeatureExtractor.from_pretrained(_FEATURE_EXTRACTOR_ID)
tokenizer = PreTrainedTokenizerFast.from_pretrained(_TOKENIZER_ID)
#url = 'https://d2gp644kobdlm6.cloudfront.net/wp-content/uploads/2016/06/bigstock-Shocked-and-surprised-boy-on-t-113798588-300x212.jpg'
#with Image.open(requests.get(url, stream=True).raw) as img:
#pixel_values = vit_feature_extractor(images=img, return_tensors="pt").pixel_values
#encoder_outputs = model.generate(pixel_values.to('cpu'),num_beams=5)
#generated_sentences = tokenizer.batch_decode(encoder_outputs, skip_special_tokens=True)
#generated_sentences
#naive text processing
#generated_sentences[0].split('.')[0]
# inference function
def vit2distilgpt2(img):
    """Generate a short caption for an image.

    The image is preprocessed with the module-level ``vit_feature_extractor``,
    caption token ids are produced by ``model.generate`` with beam search
    (``num_beams=5``), and the ids are decoded with the module-level
    ``tokenizer`` (special tokens skipped).

    Args:
        img: Image to caption; it is handed unchanged to
            ``vit_feature_extractor``. ``None`` is accepted and treated as
            "no image".

    Returns:
        The first decoded sequence, cut off before its first ``'.'``.
        An empty string when ``img`` is ``None``.
    """
    # A missing image yields an empty caption instead of being forwarded to
    # the feature extractor.
    if img is None:
        return ""

    pixel_values = vit_feature_extractor(images=img, return_tensors="pt").pixel_values

    # Move the inputs to whatever device the model lives on rather than
    # hard-coding 'cpu', so the function keeps working if the model is moved.
    generated_ids = model.generate(pixel_values.to(model.device), num_beams=5)
    generated_sentences = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

    # Naive text processing: keep only the text before the first period.
    return generated_sentences[0].split(".")[0]
#!wget https://media.glamour.com/photos/5f171c4fd35176eaedb36823/master/w_2560%2Cc_limit/bike.jpg
import gradio as gr
# Gradio components: one image input delivered as a PIL image, one text output.
inputs = [
    gr.inputs.Image(type="pil", label="Original Image"),
]
outputs = [
    gr.outputs.Textbox(label="Caption"),
]

# Static text passed to the interface (title, description, article).
title = "Image Captioning using ViT + GPT2"
description = "ViT and GPT2 are used to generate Image Caption for the uploaded image. COCO Dataset was used for training. This image captioning model might have some biases that we couldn't figure during our stress testing, so if you find any bias (gender, race and so on) please use `Flag` button to flag the image with bias"
# NOTE(review): the host "huggingface.co./" in this link looks malformed;
# the intended repository URL cannot be determined from this file -- confirm.
article = " <a href='https://huggingface.co./vit2distilgpt2'>Model Repo on Hugging Face Model Hub</a>"

# Example inputs offered to the interface. The paths are relative, so the
# files are presumably expected next to this script -- TODO confirm.
examples = [
    ["bike.jpg"],
    ["Image1.png"],
    ["Image2.png"],
    ["Image3.png"],
    ["images.jpg"],
]

# Build the interface around the captioning function and start it.
gr.Interface(
    vit2distilgpt2,
    inputs,
    outputs,
    title=title,
    description=description,
    article=article,
    examples=examples,
    theme="huggingface",
).launch(debug=True, enable_queue=True)