Update app.py
app.py CHANGED
@@ -4,18 +4,42 @@ from transformers import GPT2TokenizerFast, ViTImageProcessor, VisionEncoderDecoderModel
 
 # Setup device, model, tokenizer, and feature extractor
 device = 'cpu'
-
-
-
-
+
+
+model_checkpoint1 = "Stoneman/IG-caption-generator-vit-gpt2-last-block"
+feature_extractor1 = ViTImageProcessor.from_pretrained(model_checkpoint1)
+tokenizer1 = GPT2TokenizerFast.from_pretrained(model_checkpoint1)
+model1 = VisionEncoderDecoderModel.from_pretrained(model_checkpoint1).to(device)
+
+model_checkpoint2 = "Stoneman/IG-caption-generator-vit-gpt2-all"
+model2 = VisionEncoderDecoderModel.from_pretrained(model_checkpoint2).to(device)
+
+model_checkpoint3 = "Stoneman/IG-caption-generator-nlpconnect-last-block"
+model3 = VisionEncoderDecoderModel.from_pretrained(model_checkpoint3).to(device)
+
+model_checkpoint4 = "Stoneman/IG-caption-generator-nlpconnect-all"
+model4 = VisionEncoderDecoderModel.from_pretrained(model_checkpoint4).to(device)
+
+models = {
+    1: model1,
+    2: model2,
+    3: model3,
+    4: model4
+}
 
 # Prediction function
 def predict(image, max_length=128):
+    captions = {}
+
     image = image.convert('RGB')
-    pixel_values =
-
-
-
+    pixel_values = feature_extractor1(images=image, return_tensors="pt").pixel_values.to(device)
+    for i in range(1,5):
+        caption_ids = models[i].generate(pixel_values, max_length=max_length)[0]
+        caption_text = tokenizer1.decode(caption_ids, skip_special_tokens=True)
+        captions[i] = caption_text
+    # Return a single string with all captions
+    return '\n\n'.join(f'Model {i}: {caption}' for i, caption in captions.items())
+
 
 # Define input and output components
 input_component = gr.components.Image(label="Upload any Image", type="pil")
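
The hunk stops at the image input component, so the output component and the gr.Interface wiring sit outside this diff. For reference, here is a minimal sketch of how the updated predict could be hooked up at the bottom of app.py, assuming a plain textbox output and a default launch; the output label and Interface call below are assumptions, not part of this commit's visible hunk:

# Sketch only (not in this commit's hunk): assumed tail of app.py.
# predict and input_component are defined in the diff above.
output_component = gr.components.Textbox(label="Generated captions")  # assumed output widget

demo = gr.Interface(
    fn=predict,              # returns one string with all four models' captions
    inputs=input_component,  # PIL image input from the diff
    outputs=output_component,
)
demo.launch()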