import os

import gradio as gr
from PIL import Image
from transformers import AutoProcessor, BlipForConditionalGeneration

# Load the processor and captioning model once at import time so every
# request reuses the same instances instead of reloading the weights.
processor = AutoProcessor.from_pretrained("shadowlilac/visor")
model = BlipForConditionalGeneration.from_pretrained(
    "shadowlilac/visor",
    ignore_mismatched_sizes=True,
)


def generate_caption(image):
    """Return a text caption for *image* generated by the BLIP model.

    Args:
        image: Array-like pixel data accepted by ``PIL.Image.fromarray``
            (presumably the NumPy array Gradio's ``"image"`` input
            delivers -- confirm against the Gradio version in use), or
            ``None`` when no image is currently provided.

    Returns:
        The decoded caption with special tokens stripped, or an empty
        string when *image* is ``None``.
    """
    # The interface runs in live mode, so this callback can fire when the
    # user clears the input; ``Image.fromarray(None)`` would raise, so
    # return an empty caption instead.
    if image is None:
        return ""

    raw_image = Image.fromarray(image)
    inputs = processor(raw_image, return_tensors="pt")
    # max_length caps the length of the generated token sequence.
    out = model.generate(**inputs, max_length=200)
    return processor.decode(out[0], skip_special_tokens=True)


# Create a Gradio interface
iface = gr.Interface(
    fn=generate_caption,
    inputs="image",
    outputs="text",
    live=True,
)

# Launch the Gradio app, listening on all network interfaces at port 7860.
iface.launch(server_name="0.0.0.0", server_port=7860)