"""Gradio demo that narrates, as speech, the objects detected in an image."""

import os

import gradio as gr
from transformers import pipeline

from helper import summarize_predictions_natural_language

# Object-detection pipeline used to find the objects in an uploaded image.
od_pipe = pipeline("object-detection", model="facebook/detr-resnet-50")

# Text-to-speech pipeline used to turn the textual summary into audio.
tts_pipe = pipeline("text-to-speech", model="kakao-enterprise/vits-ljs")


def dictate_whatisin_image(pil_image):
    """Describe the objects found in an image as spoken audio.

    The image is run through the object-detection pipeline, the detections
    are turned into a sentence by ``summarize_predictions_natural_language``,
    and that sentence is synthesized by the text-to-speech pipeline.

    Args:
        pil_image: The image to describe. The Gradio input below is
            configured with ``type="pil"``, so this is a PIL image.

    Returns:
        A ``(sampling_rate, audio)`` tuple, matching what the
        ``gr.Audio(type="numpy")`` output component is given below.
    """
    pipeline_output = od_pipe(pil_image)
    text = summarize_predictions_natural_language(pipeline_output)
    narrated_text = tts_pipe(text)
    # Only the first entry of "audio" is returned.
    # NOTE(review): presumably "audio" carries a leading batch axis and [0]
    # selects the single generated waveform -- confirm against the TTS model.
    return (narrated_text["sampling_rate"], narrated_text["audio"][0])


# Build the audio-powered AI assistant UI.
demo = gr.Interface(
    fn=dictate_whatisin_image,
    inputs=gr.Image(label="Input image", type="pil"),
    outputs=gr.Audio(
        label="Listen to what is inside the image",
        type="numpy",
        autoplay=True,
    ),
    title="AI Powered Audio Assistant",
    description="Dictate what is inside the Image",
)

demo.launch()