import gradio as gr
import numpy as np
import torch
from datasets import load_dataset  # only needed by the disabled SpeechT5 code below
from transformers import pipeline

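# run the models on the GPU when one is available, otherwise fall back to the CPU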
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# load the speech translation checkpoint (Whisper translates speech in any supported language to English)
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-base", device=device)


def translate(audio):
    # run Whisper in translation mode and cap the output at 256 new tokens
    outputs = asr_pipe(audio, max_new_tokens=256, generate_kwargs={"task": "translate"})
    return outputs["text"]

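# --- previous implementation, kept for reference: English SpeechT5 text-to-speech ---
# (disabled below by wrapping it in a triple-quoted string)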
'''
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

# load text-to-speech checkpoint and speaker embeddings
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)


def synthesise_old(text):
    inputs = processor(text=text, return_tensors="pt")
    speech = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings.to(device), vocoder=vocoder)
    return speech.cpu()


def speech_to_speech_translation_old(audio):
    translated_text = translate(audio)
    synthesised_speech = synthesise_old(translated_text)
    synthesised_speech = (synthesised_speech.numpy() * 32767).astype(np.int16)
    return 16000, synthesised_speech
'''

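# --- current implementation: translate the English text to French with T5, then synthesise French speech with MMS ---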
from transformers import VitsModel, VitsTokenizer


# load the English-to-French translation pipeline (defaults to Google's t5-base)
en_fr_translator = pipeline("translation_en_to_fr")

# load the MMS French text-to-speech model and its tokenizer
model_new = VitsModel.from_pretrained("facebook/mms-tts-fra")
tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-fra")


def synthesise(text):
    # translate the English text to French before synthesis
    translation_to_french = en_fr_translator(text)
    french_text = translation_to_french[0]["translation_text"]

    inputs = tokenizer(french_text, return_tensors="pt")
    input_ids = inputs["input_ids"]

    # inference only, so no gradients are needed
    with torch.no_grad():
        outputs = model_new(input_ids)

    # waveform has shape (batch, num_samples)
    speech = outputs["waveform"]
    return speech


def speech_to_speech_translation(audio):
    translated_text = translate(audio)
    synthesised_speech = synthesise(translated_text)
    # Gradio's numpy audio output expects (sample_rate, int16 array);
    # MMS produces float32 audio in [-1, 1] at 16 kHz, so rescale to int16
    synthesised_speech = (synthesised_speech[0].numpy() * 32767).astype(np.int16)
    return 16000, synthesised_speech


title = "Cascaded STST"
description = """
Demo for cascaded speech-to-speech translation (STST), mapping source speech in any language to target speech in French.
The demo uses OpenAI's [Whisper Base](https://huggingface.co./openai/whisper-base) model for speech translation,
Google's [T5](https://huggingface.co./t5-base) for translating from English to French,
and Facebook's [Massive Multilingual Speech (MMS)](https://huggingface.co./facebook/mms-tts) model for text-to-speech:

![Cascaded STST](https://huggingface.co./datasets/huggingface-course/audio-course-images/resolve/main/s2st_cascaded.png "Diagram of cascaded speech to speech translation")
"""

demo = gr.Blocks()

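# NOTE: gr.Audio(source=..., type="filepath") is the Gradio 3.x API;
# Gradio 4 renamed `source` to `sources=[...]`, so pin gradio<4 when reusing this code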
mic_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    title=title,
    description=description,
    api_name="predict",
)

file_translate = gr.Interface(
    fn=speech_to_speech_translation,
    inputs=gr.Audio(source="upload", type="filepath"),
    outputs=gr.Audio(label="Generated Speech", type="numpy"),
    examples=[["./example.wav"]],
    title=title,
    description=description,
    api_name="predict_upload",
)

with demo:
    gr.TabbedInterface([mic_translate, file_translate], ["Microphone", "Audio File"])

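# enable queuing so that long-running inference requests don't time out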
demo.queue()
demo.launch()