# Spaces:

# Edmon02
# /

# SpeechT5_hy

# Sleeping

# File size: 6,666 Bytes

import gradio as gr
import librosa
import numpy as np
import torch

import string
import httpx
import inflect
import re

from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan


# Base checkpoint id; only the processor is loaded from it.
checkpoint = "microsoft/speecht5_tts"
# Used by predict() to turn the cleaned-up text into model input ids.
processor = SpeechT5Processor.from_pretrained(checkpoint)
# Fine-tuned TTS weights (the "_hy" suffix suggests Armenian -- confirm against the model card).
model = SpeechT5ForTextToSpeech.from_pretrained("Edmon02/speecht5_finetuned_voxpopuli_hy")
# Vocoder handed to model.generate_speech() in predict().
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")


# Maps a three-letter speaker key to the .npy file that predict() loads with
# np.load(); the key is the first three characters of the speaker label
# selected in the UI.
speaker_embeddings = {
    "BDL": "cmu_us_bdl_arctic-wav-arctic_a0009.npy",
}

def convert_number_to_words(number: float) -> str:
    """Spell out *number* in English and translate the phrase to Armenian.

    The number is rendered as English words with ``inflect`` and the phrase
    is then sent to the MyMemory translation web API.

    The request is made with httpx's synchronous API: the previous version
    called ``asyncio.run`` without importing ``asyncio`` (a ``NameError`` on
    every call), and ``asyncio.run`` cannot be used from a thread that
    already has a running event loop anyway.

    Args:
        number: The numeric value to verbalise.

    Returns:
        The ``translatedText`` field of the MyMemory response.

    Raises:
        httpx.HTTPError: If the request fails or the server answers with a
            4xx/5xx status.
        KeyError: If the response JSON lacks the expected fields.
    """
    words = inflect.engine().number_to_words(number)

    # You can change 'en' to the appropriate source language code
    source_lang = 'en'
    # You can change 'hy' to the appropriate target language code
    target_lang = 'hy'

    # Passing the query via ``params`` lets httpx percent-encode the text and
    # the '|' separator instead of splicing raw text into the URL.
    response = httpx.get(
        'https://api.mymemory.translated.net/get',
        params={'q': words, 'langpair': f'{source_lang}|{target_lang}'},
    )
    # Fail loudly on an HTTP error rather than trying to parse an error body.
    response.raise_for_status()

    return response.json()['responseData']['translatedText']

def process_text(text: str) -> str:
    """Replace every digit-bearing token of *text* with spelled-out words.

    The input is coerced to ``str`` and split on whitespace. A token that
    contains at least one digit is replaced by the result of
    ``convert_number_to_words`` applied to the integer formed from the
    token's digits only (all other characters of that token are discarded).
    Tokens without digits are kept unchanged. The tokens are re-joined with
    single spaces, so runs of whitespace collapse.
    """
    tokens = []
    for token in str(text).split():
        if re.search(r'\d', token) is None:
            # Not numeric: keep the token exactly as written.
            tokens.append(token)
            continue
        digits_only = ''.join(ch for ch in token if ch.isdigit())
        tokens.append(convert_number_to_words(int(digits_only)))

    return ' '.join(tokens)

# Ordered (source, target) pairs applied one after another by cleanup_text().
# The letters 'ո' and 'ե' and the digraphs 'ու', 'եւ', 'եվ' are not listed
# here; cleanup_text() handles them before this table is applied.
replacements = [
    # Punctuation and quotation marks
    ("՚", "?"),
    ("՛", ""),
    ("՝", ""),
    ("«", "\""),
    ("»", "\""),
    ("՞", "?"),
    # Letters
    ("ա", "a"),
    ("բ", "b"),
    ("գ", "g"),
    ("դ", "d"),
    ("զ", "z"),
    ("է", "e"),
    ("ը", "e'"),
    ("թ", "t'"),
    ("ժ", "jh"),
    ("ի", "i"),
    ("լ", "l"),
    ("խ", "kh"),
    ("ծ", "ts"),
    ("կ", "k"),
    ("հ", "h"),
    ("ձ", "dz"),
    ("ղ", "gh"),
    ("ճ", "ch"),
    ("մ", "m"),
    ("յ", "y"),
    ("ն", "n"),
    ("շ", "sh"),
    ("չ", "ch'"),
    ("պ", "p"),
    ("ջ", "j"),
    ("ռ", "r"),
    ("ս", "s"),
    ("վ", "v"),
    ("տ", "t"),
    ("ր", "r"),
    ("ց", "ts'"),
    ("ւ", ""),
    ("փ", "p'"),
    ("ք", "k'"),
    ("և", "yev"),
    ("օ", "o"),
    ("ֆ", "f"),
    # Remaining symbols
    ("։", "."),
    ("–", "-"),
    ("†", "e'"),
]


def cleanup_text(text):
    """Lower-case *text* and transliterate it to Latin characters.

    Steps, in order:

    1. Remove every character in ``string.punctuation`` and lower-case.
    2. Rewrite the digraphs 'ու', 'եւ' and 'եվ', which must happen before
       their component letters are handled.
    3. Rewrite 'ո' / 'ե' as "vo" / "ye" at the start of a word and as
       "o" / "e" elsewhere.
    4. Apply every pair in ``replacements``.

    A word start is the beginning of the text or any position preceded by
    whitespace. Previously only a literal preceding space counted, so the
    first word of the input was transliterated with "o"/"e".

    Args:
        text: The string to normalise.

    Returns:
        The transliterated string.
    """
    # ASCII punctuation is stripped up front; the punctuation produced by
    # ``replacements`` below is added afterwards and therefore survives.
    normalized_text = text.translate(
        str.maketrans("", "", string.punctuation)
    ).lower()

    normalized_text = normalized_text.replace("ու", "u")
    normalized_text = normalized_text.replace("եւ", "yev")
    normalized_text = normalized_text.replace("եվ", "yev")

    # 'ո' at the beginning of a word ("not preceded by a non-space" also
    # matches at the very start of the text), then anywhere else.
    normalized_text = re.sub(r"(?<!\S)ո", "vo", normalized_text)
    normalized_text = normalized_text.replace("ո", "o")

    # Same treatment for 'ե'.
    normalized_text = re.sub(r"(?<!\S)ե", "ye", normalized_text)
    normalized_text = normalized_text.replace("ե", "e")

    # Apply other replacements
    for src, dst in replacements:
        normalized_text = normalized_text.replace(src, dst)

    return normalized_text

def predict(text, speaker):
    """Synthesise speech for *text* with the voice selected by *speaker*.

    Blank input short-circuits to an empty clip. Otherwise the text goes
    through ``process_text`` and ``cleanup_text``, is encoded by the
    processor, truncated to the model's ``max_text_positions``, and passed to
    ``model.generate_speech`` together with the speaker embedding loaded from
    the file registered under the first three characters of *speaker*.

    Returns:
        A ``(16000, samples)`` tuple where ``samples`` is an ``int16`` NumPy
        array, the pair handed to the ``gr.Audio(type="numpy")`` output.
    """
    if not text.strip():
        return (16000, np.zeros(0).astype(np.int16))

    normalized = cleanup_text(process_text(text))

    encoded = processor(text=normalized, return_tensors="pt")

    # Limit input length to what the model's configuration allows.
    max_positions = model.config.max_text_positions
    token_ids = encoded["input_ids"][..., :max_positions]

    embedding_file = speaker_embeddings[speaker[:3]]
    voice = np.load(embedding_file).astype(np.float32)
    voice = torch.tensor(voice).unsqueeze(0)

    waveform = model.generate_speech(token_ids, voice, vocoder=vocoder)

    # Scale to 16-bit PCM.
    pcm = (waveform.numpy() * 32767).astype(np.int16)
    return (16000, pcm)


# Page heading shown by the Gradio interface.
title = "SpeechT5_hy: Speech Synthesis"

# HTML blurb shown under the title. It describes what this space actually
# serves: an Armenian fine-tuned checkpoint with a fixed list of speakers.
description = """
The <b>SpeechT5</b> model is pre-trained on text as well as speech inputs, with targets that are also a mix of text and speech.
By pre-training on text and speech at the same time, it learns unified representations for both, resulting in improved modeling capabilities.

SpeechT5 can be fine-tuned for different speech tasks. This space demonstrates a <b>text-to-speech</b> (TTS) checkpoint fine-tuned for the Armenian language.

See also the <a href="https://huggingface.co./spaces/Matthijs/speecht5-asr-demo">speech recognition (ASR) demo</a>
and the <a href="https://huggingface.co./spaces/Matthijs/speecht5-vc-demo">voice conversion demo</a>.

Refer to <a href="https://colab.research.google.com/drive/1i7I5pzBcU3WDFarDnzweIj4-sVVoIUFJ">this Colab notebook</a> to learn how to fine-tune the SpeechT5 TTS model on your own dataset or language.

<b>How to use:</b> Enter some Armenian text and choose a speaker. The output is a mel spectrogram, which is converted to a mono 16 kHz waveform by the
HiFi-GAN vocoder. Because the model always applies random dropout, each attempt will give slightly different results.
"""

# Sample [text, speaker label] pairs.
# NOTE(review): this list is not passed to gr.Interface below, so it is not
# shown in the UI. The sentences are English, and the CLB / RMS / SLT labels
# have no entry in speaker_embeddings -- confirm whether it should be
# replaced or removed.
examples = [
    ["It is not in the stars to hold our destiny but in ourselves.", "BDL (male)"],
    ["The octopus and Oliver went to the opera in October.", "CLB (female)"],
    ["She sells seashells by the seashore. I saw a kitten eating chicken in the kitchen.", "RMS (male)"],
    ["Brisk brave brigadiers brandished broad bright blades, blunderbusses, and bludgeons—balancing them badly.", "SLT (female)"],
    ["A synonym for cinnamon is a cinnamon synonym.", "BDL (male)"],
    ["How much wood would a woodchuck chuck if a woodchuck could chuck wood? He would chuck, he would, as much as he could, and chuck as much wood as a woodchuck would if a woodchuck could chuck wood.", "CLB (female)"],
]

# Build the Gradio UI around predict() and start serving it at import time.
gr.Interface(
    fn=predict,
    inputs=[
        gr.Text(label="Input Text"),
        # predict() uses only the first three characters of the selected
        # label ("BDL") to look up the speaker embedding file.
        gr.Radio(label="Speaker", choices=[
            "BDL (female)"
        ],
        value="BDL (female)"),
    ],
    outputs=[
        # type="numpy" matches the (sample_rate, int16 array) tuple that
        # predict() returns.
        gr.Audio(label="Generated Speech", type="numpy"),
    ],
    title=title,
    description=description,
).launch()