import spaces
import torch
import gradio as gr
import tempfile
import os
import uuid
import scipy.io.wavfile
import time
import numpy as np
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, WhisperTokenizer, pipeline
from kokoro import KPipeline
import soundfile as sf
import subprocess

# Optional: install FlashAttention for faster attention kernels on supported GPUs.
# subprocess.run(
#     "pip install flash-attn --no-build-isolation",
#     env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
#     shell=True,
# )

device = "cuda" if torch.cuda.is_available() else "cpu"
# float16 is only well supported on GPU; fall back to float32 on CPU.
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

MODEL_NAME = "openai/whisper-large-v3-turbo"

# attn_implementation="flash_attention_2"
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model.to(device)

# 🇺🇸 'a' => American English
# 🇬🇧 'b' => British English
# 🇫🇷 'f' => French fr-fr
tts_pipeline = KPipeline(lang_code='a', device=device)  # <= make sure lang_code matches voice

processor = AutoProcessor.from_pretrained(MODEL_NAME)
tokenizer = WhisperTokenizer.from_pretrained(MODEL_NAME)

pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=10,
    torch_dtype=torch_dtype,
    device=device,
)


@spaces.GPU
def stream_transcribe(stream, new_chunk):
    """Append the new microphone chunk to the running buffer and re-transcribe it."""
    start_time = time.time()
    try:
        sr, y = new_chunk
        y[y != y] = 0  # replace NaNs with silence
        # Convert to mono if stereo
        if y.ndim > 1:
            y = y.mean(axis=1)
        y = y.astype(np.float32)
        # Normalize, guarding against an all-zero (silent) chunk
        peak = np.max(np.abs(y))
        if peak > 0:
            y /= peak
        if stream is not None:
            stream = np.concatenate([stream, y])
        else:
            stream = y
        transcription = pipe({"sampling_rate": sr, "raw": stream})["text"]
        latency = time.time() - start_time
        return stream, transcription, f"{latency:.2f}"
    except Exception as e:
        print(f"Error during transcription: {e}")
        return stream, str(e), "Error"


def clear():
    return ""


def clear_state():
    return None


@spaces.GPU
def tts(target_text):
    """Synthesize speech with Kokoro and return (sample_rate, waveform)."""
    generator = tts_pipeline(
        target_text,
        voice='af_heart',  # <= change voice here
        speed=1,
        split_pattern=r'\n+'
    )
    audios = []
    for _gs, _ps, audio in generator:  # (graphemes, phonemes, audio) per text segment
        audios.append(audio.cpu().numpy())
    return (24000, np.concatenate(audios))


with gr.Blocks() as microphone:
    with gr.Column():
        gr.Markdown(
            "# Realtime Whisper Large V3 Turbo\n"
            "Transcribe audio in realtime. This demo uses the checkpoint "
            f"[{MODEL_NAME}](https://huggingface.co./{MODEL_NAME}) and 🤗 Transformers.\n"
            "Note: the first transcription takes about 5 seconds to appear; after that it streams smoothly."
        )
        with gr.Row():
            input_audio_microphone = gr.Audio(streaming=True)
            output = gr.Textbox(label="Transcription", value="")
            latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0", scale=0)
        with gr.Row():
            clear_button = gr.Button("Clear Output")
        state = gr.State()
        input_audio_microphone.stream(
            stream_transcribe,
            [state, input_audio_microphone],
            [state, output, latency_textbox],
            time_limit=30,
            stream_every=2,
            concurrency_limit=None,
        )
        clear_button.click(clear_state, outputs=[state]).then(clear, outputs=[output])

        # Text-to-speech controls (Kokoro)
        gen_text_input = gr.Textbox(label="Text to Generate", lines=10)
        generate_btn = gr.Button("Synthesize", variant="primary")
        audio_output = gr.Audio(label="Synthesized Audio")
        generate_btn.click(
            tts,
            inputs=[gen_text_input],
            outputs=[audio_output],
        )

with gr.Blocks(theme=gr.themes.Ocean()) as demo:
    gr.TabbedInterface([microphone], ["vc chat"])

demo.launch()
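
# Illustrative usage note (not part of the original app): tts() returns a
# (sample_rate, waveform) tuple at 24 kHz, so it can be smoke-tested directly
# in a Python shell and written to disk with the soundfile import above:
#
#   sr, audio = tts("Hello from Kokoro.")
#   sf.write("sample.wav", audio, sr)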