import os

import gradio as gr
import torch
import torchaudio
import yt_dlp
from pydub import AudioSegment
from pydub.silence import split_on_silence
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor


def download_video(url):
    """Download a video and extract its audio track as WAV.

    :param url: The URL of the video to download.
    :return: Path to the downloaded audio file.
    """
    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
            'preferredquality': '192',
        }],
        'outtmpl': 'downloaded_audio.%(ext)s',
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([url])
    return 'downloaded_audio.wav'


def split_audio(audio_file, min_silence_len=500, silence_thresh=-40, keep_silence=200, max_length=30000) -> list:
    """Split the audio file into chunks at points of silence.

    :param audio_file: Path to the audio file.
    :param min_silence_len: Minimum length of silence (in ms) to consider it as a split point.
    :param silence_thresh: Silence threshold (in dBFS).
    :param keep_silence: Amount of silence (in ms) to leave at the beginning and end of each chunk.
    :param max_length: Maximum length of each chunk (in ms).
    :return: List of paths to the audio chunks.
    """
    # Load the audio file
    audio = AudioSegment.from_file(audio_file)

    # Split the audio file into chunks at points of silence
    chunks = split_on_silence(
        audio,
        min_silence_len=min_silence_len,
        silence_thresh=silence_thresh,
        keep_silence=keep_silence
    )

    # Further split any chunk that exceeds the maximum length
    split_chunks = []
    for chunk in chunks:
        if len(chunk) <= max_length:
            split_chunks.append(chunk)
        else:
            # Slicing an AudioSegment with a step yields consecutive slices of that length (in ms)
            split_chunks.extend(chunk[::max_length])

    # Export the chunks to files
    chunk_filenames = []
    for i, chunk in enumerate(split_chunks):
        chunk_name = f"chunk{i}.wav"
        chunk.export(chunk_name, format="wav")
        chunk_filenames.append(chunk_name)

    return chunk_filenames


# Load the Hugging Face model and processor
processor = AutoProcessor.from_pretrained("GroupSix/whisper-small-sv")
model = AutoModelForSpeechSeq2Seq.from_pretrained("GroupSix/whisper-small-sv")


def transcribe_audio(segment, num_segments):
    """Transcribe a single audio chunk with the Whisper model.

    :param segment: Path to the audio chunk.
    :param num_segments: Total number of chunks (used for progress logging).
    :return: The transcribed text.
    """
    print(f"Current segment: {segment} (out of {num_segments})")

    # Load the audio file
    waveform, sample_rate = torchaudio.load(segment)

    # Resample to 16 kHz if necessary (the expected input rate for Whisper)
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)

    # Prepare the model input
    inputs = processor(waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt")

    # Generate the transcription
    with torch.no_grad():
        generated_ids = model.generate(**inputs)

    # Decode the output and select the first transcription
    decoded_output = processor.batch_decode(generated_ids, skip_special_tokens=True)
    transcription = decoded_output[0] if decoded_output else ""

    return transcription


# Main function to tie everything together
def process_video(url):
    # Download and split the audio
    audio_file = download_video(url)
    segments = split_audio(audio_file)

    # Transcribe each segment
    transcriptions = [transcribe_audio(segment, len(segments)) for segment in segments]

    # Delete the audio file and the chunks
    os.remove(audio_file)
    for segment in segments:
        os.remove(segment)

    return transcriptions


# Gradio interface
iface = gr.Interface(
    fn=process_video,
    inputs=gr.Textbox(label="Swedish YouTube Video URL"),
    outputs=gr.Textbox(label="Transcriptions"),
    # examples=[
    #     ["https://www.youtube.com/watch?v=hcxwTgEC7IM"],  # Fred på jorden
    #     ["https://www.youtube.com/watch?v=AzlipxrzMe4"],  # Jerry talar spanska
    #     ["https://www.youtube.com/watch?v=H_16_5kGh3I"],  # Det heter näsa, inte nos!
    #     ["https://www.youtube.com/watch?v=v2m4V6FUseQ"],  # Ove blir arg på pantsystemet
    #     ["https://www.youtube.com/watch?v=oA5QJHBNQkU"],  # Hur mår björnen egentligen? (takes too long)
    # ]
)

iface.launch()