from transformers import pipeline from pytube import YouTube import gradio as gr import requests pipe = pipeline(model="Silemo/whisper-it") # change to "your-username/the-name-you-picked" def download_audio(audio_url, filename): # URL of the image to be downloaded is defined as audio_url r = requests.get(audio_url) # create HTTP response object # send a HTTP request to the server and save # the HTTP response in a response object called r with open(filename,'wb') as f: # Saving received content as a mp3 file in # binary format # write the contents of the response (r.content) # to a new file in binary mode. f.write(r.content) def transcribe(audio): text = pipe(audio)["text"] return text def transcribe_video(url): yt = YouTube(url) stream = yt.streams.get_audio_only() # Saves the audio in the /audio folder audio = stream.download() text = transcribe(audio) return text audio1_url = "https://github.com/Silemo/sml-lab2-2023-manfredi-meneghin/raw/main/task1/audio/offer.mp3" audio1_filename = "offer.mp3" download_audio(audio1_url, audio1_filename) audio2_url = "https://github.com/Silemo/sml-lab2-2023-manfredi-meneghin/raw/main/task1/audio/fantozzi.mp3" audio2_filename = "fantozzi.mp3" download_audio(audio2_url, audio2_filename) # Multiple interfaces using tabs -> https://github.com/gradio-app/gradio/issues/450 io1 = gr.Interface( fn = transcribe, inputs = gr.Audio(sources=["microphone", "upload"], type="filepath"), outputs = "text", examples=[ [audio1_filename], [audio2_filename], ], title = "Whisper Small - Italian - Microphone or Audio file", description = "Realtime demo for Italian speech recognition using a fine-tuned Whisper small model. It uses the computer microphone or an audio file as audio input", ) io2 = gr.Interface( fn = transcribe_video, inputs = gr.Textbox(label = "YouTube URL", placeholder = "https://youtu.be/9DImRZERJNs?si=1Lme7o_KH2oCxU7y"), outputs = "text", examples=[ # Per me รจ la cipolla ["https://youtu.be/QbwZlURClSA?si=DKMtIiKE-nO2mfcV"], # Breaking Italy - Lollobrigida ferma il treno ["https://youtu.be/9MPBN0tnA_E?si=8-hqkJS05LNkWprX&t=2"], # Mussolini discorso ["https://youtu.be/UmnxcjRk37Q?si=uxt8oqnMDJ3vFzIB&t=77"], ], title = "Whisper Small - Italian - YouTube link", description = "Realtime demo for Italian speech recognition using a fine-tuned Whisper small model. It uses a YouTube link as audio input", ) gr.TabbedInterface( [io1, io2], {"Microphone or audio file", "YouTube"} ).launch()