Spaces:

srinivasbilla
/

vc_chat

Running

App Files Files Community

vc_chat / app.py

srinivasbilla

Update app.py

6f15984 verified 24 days ago

raw

history blame contribute delete

3.84 kB


	import spaces
	import torch
	import gradio as gr
	import tempfile
	import os
	import uuid
	import scipy.io.wavfile
	import time
	import numpy as np
	from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, WhisperTokenizer, pipeline
	from kokoro import KPipeline
	import soundfile as sf
	import subprocess
	# subprocess.run(
	# "pip install flash-attn --no-build-isolation",
	# env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
	# shell=True,
	# )

	# Run on the GPU when one is visible, otherwise fall back to the CPU.
	device = "cuda" if torch.cuda.is_available() else "cpu"
	# Half precision only pays off on GPU; on CPU float16 is slow and some ops
	# are not implemented for it, so use float32 there.
	torch_dtype = torch.float16 if device == "cuda" else torch.float32
	MODEL_NAME = "openai/whisper-large-v3-turbo"
	# attn_implementation="flash_attention_2"
	model = AutoModelForSpeechSeq2Seq.from_pretrained(
	    MODEL_NAME,
	    torch_dtype=torch_dtype,
	    low_cpu_mem_usage=True,
	    use_safetensors=True,
	)
	model.to(device)


	# 🇺🇸 'a' => American English
	# 🇬🇧 'b' => British English
	# 🇫🇷 'f' => French fr-fr
	tts_pipeline = KPipeline(lang_code='a', device=device)  # <= make sure lang_code matches voice

	processor = AutoProcessor.from_pretrained(MODEL_NAME)
	tokenizer = WhisperTokenizer.from_pretrained(MODEL_NAME)

	# Speech-recognition pipeline shared by every transcription request.
	pipe = pipeline(
	    task="automatic-speech-recognition",
	    model=model,
	    tokenizer=tokenizer,
	    feature_extractor=processor.feature_extractor,
	    chunk_length_s=10,
	    torch_dtype=torch_dtype,
	    device=device,
	)

	@spaces.GPU
	def stream_transcribe(stream, new_chunk):
	    """Transcribe the microphone audio accumulated so far.

	    The new chunk is converted to mono float32, peak-normalized, appended to
	    the running buffer, and the whole buffer is passed to the ASR pipeline.

	    Args:
	        stream: Audio accumulated by earlier calls, or ``None`` when no audio
	            has been buffered yet.
	        new_chunk: ``(sampling_rate, samples)`` tuple for the latest chunk.

	    Returns:
	        A ``(stream, text, latency)`` tuple: the updated buffer, the
	        transcription of the whole buffer, and the processing time in seconds
	        formatted with two decimals. On failure the buffer is returned as it
	        was when the error occurred, ``text`` is the error message and
	        ``latency`` is ``"Error"``.
	    """
	    start_time = time.time()
	    try:
	        sr, y = new_chunk
	        # NaN is the only value that differs from itself; zero those samples.
	        y[y != y] = 0
	        # Convert to mono if stereo
	        if y.ndim > 1:
	            y = y.mean(axis=1)

	        y = y.astype(np.float32)
	        # Peak-normalize, but leave silent or empty chunks untouched: dividing
	        # by a zero peak would fill the buffer with NaN/inf values.
	        peak = np.max(np.abs(y)) if y.size else 0.0
	        if peak > 0:
	            y /= peak

	        if stream is not None:
	            stream = np.concatenate([stream, y])
	        else:
	            stream = y

	        transcription = pipe({"sampling_rate": sr, "raw": stream})["text"]
	        latency = time.time() - start_time

	        return stream, transcription, f"{latency:.2f}"
	    except Exception as e:
	        # Best effort: report the failure in the UI and keep the stream alive.
	        print(f"Error during Transcription: {e}")
	        return stream, str(e), "Error"

	def clear():
	    """Return an empty string, used to blank the transcription textbox."""
	    empty_text = ""
	    return empty_text

	def clear_state():
	    """Return ``None`` so the accumulated audio state is reset."""
	    reset_value = None
	    return reset_value

	@spaces.GPU
	def tts(target_text):
	    """Synthesize speech for ``target_text`` with the Kokoro TTS pipeline.

	    The text is split on blank-line runs (``split_pattern=r'\\n+'``); the audio
	    produced for each segment is concatenated into one waveform.

	    Args:
	        target_text: Text to convert to speech.

	    Returns:
	        ``(24000, samples)``: the sample rate handed to the audio widget and
	        the concatenated waveform as a NumPy array.

	    Raises:
	        gr.Error: If ``target_text`` is empty/whitespace or the pipeline
	            yields no audio, instead of failing inside ``np.concatenate``.
	    """
	    if not target_text or not target_text.strip():
	        raise gr.Error("Please enter some text to synthesize.")

	    generator = tts_pipeline(
	        target_text, voice='af_heart',  # <= change voice here
	        speed=1, split_pattern=r'\n+'
	    )
	    # Each item is a (graphemes, phonemes, audio) triple; only audio is used.
	    audios = [audio.cpu().numpy() for _, _, audio in generator]
	    if not audios:
	        raise gr.Error("No audio was generated for the given text.")
	    return (24000, np.concatenate(audios))

	# Single-tab UI: live microphone transcription on top, text-to-speech below.
	with gr.Blocks() as microphone:
	    with gr.Column():
	        gr.Markdown(f"# Realtime Whisper Large V3 Turbo: \n Transcribe Audio in Realtime. This Demo uses the Checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers.\n Note: The first token takes about 5 seconds. After that, it works flawlessly.")
	        with gr.Row():
	            input_audio_microphone = gr.Audio(streaming=True)
	            output = gr.Textbox(label="Transcription", value="")
	            latency_textbox = gr.Textbox(label="Latency (seconds)", value="0.0", scale=0)
	        with gr.Row():
	            clear_button = gr.Button("Clear Output")
	        # Holds the audio accumulated across streaming callbacks.
	        state = gr.State()
	        input_audio_microphone.stream(
	            stream_transcribe,
	            [state, input_audio_microphone],
	            [state, output, latency_textbox],
	            time_limit=30,
	            stream_every=2,
	            concurrency_limit=None,
	        )
	        # Reset the buffered audio first, then blank the transcription box.
	        clear_button.click(clear_state, outputs=[state]).then(clear, outputs=[output])

	        gen_text_input = gr.Textbox(label="Text to Generate", lines=10)

	        generate_btn = gr.Button("Synthesize", variant="primary")

	        audio_output = gr.Audio(label="Synthesized Audio")

	        generate_btn.click(
	            tts,
	            inputs=[
	                gen_text_input,
	            ],
	            outputs=[audio_output],
	        )

	with gr.Blocks(theme=gr.themes.Ocean()) as demo:
	    gr.TabbedInterface([microphone], ["vc chat"])

	demo.launch()