import spaces
import torch
import gradio as gr
import tempfile
import os
import uuid
import scipy.io.wavfile
import time
import numpy as np
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, WhisperTokenizer, pipeline
from kokoro import KPipeline
import soundfile as sf
import subprocess
# subprocess.run(
#     "pip install flash-attn --no-build-isolation",
#     env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
#     shell=True,
# )
device = "cuda" if torch.cuda.is_available() else "cpu" | |
torch_dtype = torch.float16 | |
MODEL_NAME = "openai/whisper-large-v3-turbo" | |
# attn_implementation="flash_attention_2" | |
model = AutoModelForSpeechSeq2Seq.from_pretrained( | |
MODEL_NAME, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True, | |
) | |
model.to(device) | |
# 🇺🇸 'a' => American English
# 🇬🇧 'b' => British English
# 🇫🇷 'f' => French fr-fr
tts_pipeline = KPipeline(lang_code='a', device=device)  # <= make sure lang_code matches voice
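# Kokoro voice names are prefixed by language and gender (e.g. 'af_heart' = American English,
# female), so the first letter of the voice should match lang_code above.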
processor = AutoProcessor.from_pretrained(MODEL_NAME)
tokenizer = WhisperTokenizer.from_pretrained(MODEL_NAME)

pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=tokenizer,
    feature_extractor=processor.feature_extractor,
    chunk_length_s=10,
    torch_dtype=torch_dtype,
    device=device,
)
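# The ASR pipeline processes audio in 10-second chunks (chunk_length_s=10). Because the
# handler below re-transcribes the full accumulated stream on every update, earlier text
# can be revised as more context arrives.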
def stream_transcribe(stream, new_chunk):
    start_time = time.time()
    try:
        sr, y = new_chunk

        # Replace NaN samples with silence
        y[y != y] = 0

        # Convert to mono if stereo
        if y.ndim > 1:
            y = y.mean(axis=1)

        y = y.astype(np.float32)

        # Normalize to [-1, 1], guarding against an all-zero chunk
        peak = np.max(np.abs(y))
        if peak > 0:
            y /= peak

        # Accumulate audio so the whole utterance is transcribed each time
        if stream is not None:
            stream = np.concatenate([stream, y])
        else:
            stream = y

        transcription = pipe({"sampling_rate": sr, "raw": stream})["text"]
        latency = time.time() - start_time
        return stream, transcription, f"{latency:.2f}"
    except Exception as e:
        print(f"Error during Transcription: {e}")
        return stream, str(e), "Error"
def clear():
    return ""

def clear_state():
    return None
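# Kokoro TTS: the pipeline yields (graphemes, phonemes, audio) tuples, one per text segment
# (split on newlines via split_pattern); the 24 kHz chunks are concatenated into one waveform.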
def tts(target_text):
    generator = tts_pipeline(
        target_text, voice='af_heart',  # <= change voice here
        speed=1, split_pattern=r'\n+'
    )
    audios = []
    for gs, ps, audio in generator:
        audios.append(audio.cpu().numpy())
    return (24000, np.concatenate(audios))
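# UI: a single tab combining streaming microphone transcription with a text-to-speech panel.
# The .stream() event below delivers a new audio chunk roughly every 2 seconds
# (stream_every=2) and caps each recording session at 30 seconds (time_limit=30).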
with gr.Blocks() as microphone:
    with gr.Column():
        gr.Markdown(f"# Realtime Whisper Large V3 Turbo: \n Transcribe audio in realtime. This demo uses the checkpoint [{MODEL_NAME}](https://huggingface.co./{MODEL_NAME}) and 🤗 Transformers.\n Note: the first transcription takes about 5 seconds to appear; after that, updates stream smoothly.")
        with gr.Row():
            input_audio_microphone = gr.Audio(streaming=True)
            output = gr.Textbox(label="Transcription", value="")
            latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0", scale=0)
        with gr.Row():
            clear_button = gr.Button("Clear Output")
        state = gr.State()
        input_audio_microphone.stream(
            stream_transcribe,
            [state, input_audio_microphone],
            [state, output, latency_textbox],
            time_limit=30,
            stream_every=2,
            concurrency_limit=None,
        )
        clear_button.click(clear_state, outputs=[state]).then(clear, outputs=[output])

        gen_text_input = gr.Textbox(label="Text to Generate", lines=10)
        generate_btn = gr.Button("Synthesize", variant="primary")
        audio_output = gr.Audio(label="Synthesized Audio")
        generate_btn.click(
            tts,
            inputs=[gen_text_input],
            outputs=[audio_output],
        )
with gr.Blocks(theme=gr.themes.Ocean()) as demo:
    gr.TabbedInterface([microphone], ["vc chat"])

demo.launch()