import os

import gradio as gr
from PIL import Image
from transformers import AutoProcessor, Blip2ForConditionalGeneration

# Load the processor and BLIP-2 captioning model from the Hugging Face Hub,
# authenticating with the token in the HUGGING_FACE_HUB_TOKEN environment variable.
processor = AutoProcessor.from_pretrained(
    "shadowlilac/visor-v2",
    token=os.getenv("HUGGING_FACE_HUB_TOKEN"),
)
model = Blip2ForConditionalGeneration.from_pretrained(
    "shadowlilac/visor-v2",
    ignore_mismatched_sizes=True,
    token=os.getenv("HUGGING_FACE_HUB_TOKEN"),
)


def generate_caption(image):
    # Gradio passes the uploaded image as a NumPy array; convert it to a PIL image.
    raw_image = Image.fromarray(image)
    # Preprocess the image into model-ready tensors and generate a caption.
    inputs = processor(raw_image, return_tensors="pt")
    out = model.generate(**inputs, max_length=200)
    caption = processor.decode(out[0], skip_special_tokens=True)
    return caption


# Create a Gradio interface that maps an image input to a text caption.
iface = gr.Interface(
    fn=generate_caption,
    inputs="image",
    outputs="text",
    live=True,
)

# Launch the Gradio app, listening on all interfaces at port 7860.
iface.launch(server_name="0.0.0.0", server_port=7860)