import gradio as gr
import numpy as np
from transformers import pipeline

# NOTE: "Salesforce/blip-image-captioning-large" is a higher-quality (but heavier)
# alternative for the captioning pipeline.
caption = pipeline("image-to-text", model="nlpconnect/vit-gpt2-image-captioning")
generate = pipeline("text-generation", model="openai-community/gpt2")
tts = pipeline(task="text-to-speech", model="facebook/mms-tts-eng")


def run_caption(img):
    """Return a text caption for a PIL image using the image-to-text pipeline."""
    res = caption(img, max_new_tokens=128)
    return res[0]["generated_text"]


def run_generate(txt):
    """Continue *txt* with GPT-2 and return the full generated string.

    Uses max_new_tokens (not max_length) so the budget applies to newly
    generated tokens regardless of how long the prompt is.
    """
    res = generate(txt, max_new_tokens=50)
    return res[0]["generated_text"]


def run_tts(txt):
    """Synthesize speech for *txt*.

    Returns (sampling_rate, int16 waveform) — the tuple format Gradio's
    Audio component expects for numpy output.
    """
    res = tts(txt)
    # Model emits float samples in [-1, 1]; scale to int16. Clip before the
    # cast: a sample at exactly +1.0 would otherwise wrap 32768 -> -32768.
    samples = res["audio"].reshape(-1) * (2**15 - 1)
    audio = np.clip(samples, -(2**15), 2**15 - 1).astype(np.int16)
    return res["sampling_rate"], audio


def run_caption_tts(img):
    """Caption an image, then speak the caption."""
    return run_tts(run_caption(img))


def run_caption_generate_tts(img):
    """Caption an image, extend the caption with GPT-2, then speak the result."""
    return run_tts(run_generate(run_caption(img)))


# Five sub-apps rendered into one page; gr.Interface auto-renders when
# constructed inside a gr.Blocks context.
with gr.Blocks() as demo:
    gr.Interface(
        run_caption,
        inputs=gr.Image(type="pil"),
        outputs="text",
    )
    gr.Interface(
        run_generate,
        inputs="text",
        outputs="text",
    )
    gr.Interface(
        run_tts,
        inputs=gr.Textbox(),
        outputs="audio",
    )
    gr.Interface(
        run_caption_tts,
        inputs=gr.Image(type="pil"),
        outputs="audio",
    )
    gr.Interface(
        run_caption_generate_tts,
        inputs=gr.Image(type="pil"),
        outputs="audio",
    )

if __name__ == "__main__":
    demo.launch()