Spaces:
Running
Running
import gradio as gr | |
from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor | |
import torch | |
import numpy as np | |
# Run on the first CUDA GPU when one is available, otherwise fall back to CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Half precision is selected only when not running on CPU.
torch_dtype = torch.float16 if device != "cpu" else torch.float32
print("Device:", device)

# NOTE: `model_id` and `torch_dtype` are not consumed by the pipelines below.
# They belong to an alternative setup that loads a generic Whisper checkpoint
# ("openai/whisper-large-v3", or the lighter "openai/whisper-medium") through
# AutoModelForSpeechSeq2Seq / AutoProcessor and a chunked ASR pipeline
# (chunk_length_s=30, batch_size=16, return_timestamps=True).
model_id = "openai/whisper-large-v3"

# Speech recognition.  The checkpoint id indicates a French Whisper model.
# It is placed on `device` like the two pipelines below; without the explicit
# argument it would stay on the pipeline's default device even when a GPU
# was selected above.
pipe_transcription = pipeline(
    "automatic-speech-recognition",
    model="pierreguillou/whisper-medium-french",
    device=device,
)
# Text translation (checkpoint id indicates French -> English).
pipe_translate = pipeline("translation", model="Helsinki-NLP/opus-mt-fr-en", device=device)
# Text-to-speech.  Better quality, way faster than bark.
pipe_tts = pipeline("text-to-speech", model="facebook/mms-tts-eng", device=device)
def get_translation(text):
    """Translate ``text`` with the module-level ``pipe_translate`` pipeline.

    Args:
        text: The text to translate.

    Returns:
        The ``"translation_text"`` field of the first pipeline result, or an
        empty string when ``text`` is ``None``, empty or whitespace-only.
    """
    # Nothing to translate: skip the model call instead of feeding it an
    # empty prompt.
    if text is None or (isinstance(text, str) and not text.strip()):
        return ""
    return pipe_translate(text)[0]["translation_text"]
def get_transcript(voice):
    """Transcribe a recording and return the translation of the transcript.

    Args:
        voice: Audio input forwarded to ``pipe_transcription``.  The UI's
            microphone component is configured with ``type="filepath"``.

    Returns:
        The result of ``get_translation`` applied to the ``"text"`` field of
        the transcription, or an empty string when ``voice`` is ``None`` or
        an empty path (e.g. the button was pressed before recording).
    """
    # No recording available: return early rather than letting the ASR
    # pipeline raise on a missing input.
    if voice is None or (isinstance(voice, str) and not voice):
        return ""
    transcript = pipe_transcription(voice)["text"]
    return get_translation(transcript)
def get_audio(text):
    """Synthesize speech for ``text`` with the module-level ``pipe_tts``.

    Args:
        text: The text to read aloud.

    Returns:
        A ``(sampling_rate, samples)`` tuple where ``samples`` is the
        transposed int16 waveform, or ``None`` when ``text`` is ``None``,
        empty or whitespace-only (there is nothing to synthesize).
    """
    if text is None or (isinstance(text, str) and not text.strip()):
        return None
    speech = pipe_tts(text)
    # Clamp to [-1, 1] before scaling so that any out-of-range sample
    # saturates instead of wrapping around in the int16 cast.
    waveform = np.clip(speech["audio"], -1.0, 1.0)
    pcm = (waveform * 32767).astype(np.int16)
    return speech["sampling_rate"], pcm.T
# Two-tab Gradio interface.  Each tab produces a translated text and can then
# turn that text into audio; the first tab starts from typed text, the second
# from a microphone recording.
with gr.Blocks() as demo:
    # --- Tab 1: typed text -> translation -> speech -------------------------
    with gr.Tab("Texte (rapide)"):
        typed_source = gr.Textbox(
            label="Input text", info="Your text", lines=3, placeholder="Écrire le texte à traduire"
        )
        typed_translate_btn = gr.Button("Traduire...")
        typed_result = gr.Textbox(
            label="Output text", info="Your text", lines=3, placeholder="Votre traduction"
        )
        typed_speak_btn = gr.Button("Générer audio...")

        typed_translate_btn.click(get_translation, inputs=[typed_source], outputs=[typed_result])

        # The player is created after the buttons so it is laid out last.
        typed_player = gr.Audio(label="Output")
        typed_speak_btn.click(get_audio, inputs=[typed_result], outputs=[typed_player])

    # --- Tab 2: recording -> transcription + translation -> speech ----------
    with gr.Tab("Voix (plus lent)"):
        recording = gr.Audio(sources=["microphone"], type="filepath")
        voice_translate_btn = gr.Button("Traduire votre enregistrement !")
        voice_result = gr.Textbox(
            label="Texte traduit", info="Votre texte", lines=3, placeholder="Votre traduction"
        )
        voice_speak_btn = gr.Button("Générer audio !")

        voice_translate_btn.click(get_transcript, inputs=[recording], outputs=[voice_result])

        # The player is created after the buttons so it is laid out last.
        voice_player = gr.Audio(label="Output")
        voice_speak_btn.click(get_audio, inputs=[voice_result], outputs=[voice_player])

# Start the web server for the interface.
demo.launch()