import http.client
import json
import os
import tempfile
import urllib.request
from pathlib import Path
from typing import Tuple

import numpy as np
from scipy.io import wavfile

from TTS.utils.audio.numpy_transforms import save_wav
from TTS.utils.manage import ModelManager
from TTS.utils.synthesizer import Synthesizer


class Speaker(object):
    """Convert dict to object."""

    def __init__(self, d, is_voice=False):
        self.is_voice = is_voice
        for k, v in d.items():
            # Recurse into nested dicts and lists of dicts so every level supports
            # attribute access. (The type check must be on the value `v`; JSON keys
            # are always strings, so checking `k` never matched.)
            if isinstance(v, (list, tuple)):
                setattr(self, k, [Speaker(x) if isinstance(x, dict) else x for x in v])
            else:
                setattr(self, k, Speaker(v) if isinstance(v, dict) else v)

    def __repr__(self):
        return str(self.__dict__)
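

# Illustrative sketch: `Speaker` turns a nested API response into attribute access.
# The field names below are made up for the example; real responses come from the
# Coqui Studio endpoints used in `CS_API`.
#   >>> s = Speaker({"id": "abc123", "name": "Ada", "samples": [{"url": "..."}]})
#   >>> s.name
#   'Ada'
#   >>> s.samples[0].url
#   '...'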


class CS_API:
    """🐸Coqui Studio API Wrapper.

    🐸Coqui Studio is the most advanced voice generation platform. You can generate new voices by voice cloning, voice
    interpolation, or our unique prompt to voice technology. It also provides a set of built-in voices with different
    characteristics. You can use these voices to generate new audio files or use them in your applications.
    You can use all the built-in and your own 🐸Coqui Studio speakers with this API with an API token.
    You can sign up to 🐸Coqui Studio from https://app.coqui.ai/auth/signup and get an API token from
    https://app.coqui.ai/account. You can either set the token as an environment variable with
    `export COQUI_STUDIO_TOKEN=<token>` or pass it as `CS_API(api_token=<token>)`.
    Visit https://app.coqui.ai/api for more information.

    Example listing all available speakers:
        >>> from TTS.api import CS_API
        >>> tts = CS_API()
        >>> tts.speakers

    Example listing all emotions:
        >>> from TTS.api import CS_API
        >>> tts = CS_API()
        >>> tts.emotions

    Example with a built-in 🐸 speaker:
        >>> from TTS.api import CS_API
        >>> tts = CS_API()
        >>> wav, sr = tts.tts("Hello world", speaker_name="Claribel Dervla")
        >>> filepath = tts.tts_to_file(text="Hello world!", speaker_name=tts.speakers[0].name, file_path="output.wav")
    """

    def __init__(self, api_token=None):
        self.api_token = api_token
        self.api_prefix = "/api/v2"
        self.headers = None
        self._speakers = None
        self._check_token()

    @property
    def speakers(self):
        if self._speakers is None:
            self._speakers = self.list_all_speakers()
        return self._speakers

    @property
    def emotions(self):
        """Return a list of available emotions.

        TODO: Get this from the API endpoint.
        """
        return ["Neutral", "Happy", "Sad", "Angry", "Dull"]

    def _check_token(self):
        if self.api_token is None:
            self.api_token = os.environ.get("COQUI_STUDIO_TOKEN")
        self.headers = {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_token}"}
        if not self.api_token:
            raise ValueError(
                "No API token found for 🐸Coqui Studio voices - https://coqui.ai.\n"
                "Visit 🔗https://app.coqui.ai/account to get one.\n"
                "Set it as an environment variable `export COQUI_STUDIO_TOKEN=<token>`\n"
                "or pass it as `CS_API(api_token=<token>)`."
            )

    def list_all_speakers(self):
        """Return both built-in Coqui Studio speakers and custom voices created by the user."""
        return self.list_speakers() + self.list_voices()

    def list_speakers(self):
        """List built-in Coqui Studio speakers."""
        self._check_token()
        conn = http.client.HTTPSConnection("app.coqui.ai")
        conn.request("GET", f"{self.api_prefix}/speakers", headers=self.headers)
        res = conn.getresponse()
        data = res.read()
        return [Speaker(s) for s in json.loads(data)["result"]]

    def list_voices(self):
        """List custom voices created by the user."""
        conn = http.client.HTTPSConnection("app.coqui.ai")
        conn.request("GET", f"{self.api_prefix}/voices", headers=self.headers)
        res = conn.getresponse()
        data = res.read()
        return [Speaker(s, True) for s in json.loads(data)["result"]]
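
    # Illustrative sketch: inspecting the combined speaker list. The attribute
    # names (`name`, `is_voice`) follow the fields this module itself reads.
    #   >>> api = CS_API()  # assumes COQUI_STUDIO_TOKEN is set
    #   >>> for s in api.speakers:
    #   ...     print(s.name, "(custom voice)" if s.is_voice else "(built-in)")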

    def list_speakers_as_tts_models(self):
        """List speakers in ModelManager format."""
        models = []
        for speaker in self.speakers:
            model = f"coqui_studio/en/{speaker.name}/coqui_studio"
            models.append(model)
        return models

    def name_to_speaker(self, name):
        for speaker in self.speakers:
            if speaker.name == name:
                return speaker
        raise ValueError(f"Speaker {name} not found.")

    def id_to_speaker(self, speaker_id):
        for speaker in self.speakers:
            if speaker.id == speaker_id:
                return speaker
        raise ValueError(f"Speaker {speaker_id} not found.")

    @staticmethod
    def url_to_np(url):
        """Download a wav file from a URL and return it as a numpy array with its sample rate."""
        tmp_file, _ = urllib.request.urlretrieve(url)
        rate, data = wavfile.read(tmp_file)
        return data, rate

    @staticmethod
    def _create_payload(text, speaker, emotion, speed):
        payload = {}
        # Custom voices and built-in speakers are addressed by different keys.
        if speaker.is_voice:
            payload["voice_id"] = speaker.id
        else:
            payload["speaker_id"] = speaker.id
        payload.update(
            {
                "emotion": emotion,
                "name": speaker.name,
                "text": text,
                "speed": speed,
            }
        )
        return payload
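
    # Illustrative sketch of the payload `_create_payload` builds for a built-in
    # speaker (the id shown here is a placeholder):
    #   {"speaker_id": "abc123", "emotion": "Neutral", "name": "Claribel Dervla",
    #    "text": "Hello world", "speed": 1.0}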

    def tts(
        self,
        text: str,
        speaker_name: str = None,
        speaker_id=None,
        emotion="Neutral",
        speed=1.0,
        language=None,
    ) -> Tuple[np.ndarray, int]:
        """Synthesize speech from text.

        Args:
            text (str): Text to synthesize.
            speaker_name (str): Name of the speaker. You can get the list of speakers with `list_speakers()` and
                voices (user generated speakers) with `list_voices()`.
            speaker_id (str): Speaker ID. If None, the speaker name is used.
            emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull".
            speed (float): Speed of the speech. 1.0 is normal speed.
            language (str): Language of the text. If None, the default language of the speaker is used.
        """
        self._check_token()
        if speaker_name is None and speaker_id is None:
            raise ValueError(" [!] Please provide either a `speaker_name` or a `speaker_id`.")
        if speaker_id is None:
            speaker = self.name_to_speaker(speaker_name)
        else:
            speaker = self.id_to_speaker(speaker_id)
        conn = http.client.HTTPSConnection("app.coqui.ai")
        payload = self._create_payload(text, speaker, emotion, speed)
        conn.request("POST", f"{self.api_prefix}/samples", json.dumps(payload), self.headers)
        res = conn.getresponse()
        data = res.read()
        try:
            wav, sr = self.url_to_np(json.loads(data)["audio_url"])
        except KeyError as e:
            raise ValueError(f" [!] 🐸 API returned error: {data}") from e
        return wav, sr

    def tts_to_file(
        self,
        text: str,
        speaker_name: str,
        speaker_id=None,
        emotion="Neutral",
        speed=1.0,
        language=None,
        file_path: str = None,
    ) -> str:
        """Synthesize speech from text and save it to a file.

        Args:
            text (str): Text to synthesize.
            speaker_name (str): Name of the speaker. You can get the list of speakers with `list_speakers()` and
                voices (user generated speakers) with `list_voices()`.
            speaker_id (str): Speaker ID. If None, the speaker name is used.
            emotion (str): Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull".
            speed (float): Speed of the speech. 1.0 is normal speed.
            language (str): Language of the text. If None, the default language of the speaker is used.
            file_path (str): Path to save the file. If None, a temporary file is created.
        """
        if file_path is None:
            # `tempfile.mktemp` is deprecated; create a named temporary file instead.
            file_path = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name
        wav, sr = self.tts(text, speaker_name, speaker_id, emotion, speed, language)
        wavfile.write(file_path, sr, wav)
        return file_path
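

# Illustrative one-shot usage of `CS_API` (assumes a valid COQUI_STUDIO_TOKEN in
# the environment; the available speaker names depend on your account):
#   >>> api = CS_API()
#   >>> wav, sr = api.tts("Hello world", speaker_name=api.speakers[0].name)
#   >>> path = api.tts_to_file("Hello world", speaker_name=api.speakers[0].name, file_path="hello.wav")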


class TTS:
    """TODO: Add voice conversion and Capacitron support."""

    def __init__(
        self,
        model_name: str = None,
        model_path: str = None,
        config_path: str = None,
        vocoder_path: str = None,
        vocoder_config_path: str = None,
        progress_bar: bool = True,
        gpu=False,
    ):
"""🐸TTS python interface that allows to load and use the released models. |
|
|
|
Example with a multi-speaker model: |
|
>>> from TTS.api import TTS |
|
>>> tts = TTS(TTS.list_models()[0]) |
|
>>> wav = tts.tts("This is a test! This is also a test!!", speaker=tts.speakers[0], language=tts.languages[0]) |
|
>>> tts.tts_to_file(text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path="output.wav") |
|
|
|
Example with a single-speaker model: |
|
>>> tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False, gpu=False) |
|
>>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav") |
|
|
|
Example loading a model from a path: |
|
>>> tts = TTS(model_path="/path/to/checkpoint_100000.pth", config_path="/path/to/config.json", progress_bar=False, gpu=False) |
|
>>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav") |
|
|
|
Example voice cloning with YourTTS in English, French and Portuguese: |
|
>>> tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=True) |
|
>>> tts.tts_to_file("This is voice cloning.", speaker_wav="my/cloning/audio.wav", language="en", file_path="thisisit.wav") |
|
>>> tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr", file_path="thisisit.wav") |
|
>>> tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt", file_path="thisisit.wav") |
|
|
|
Args: |
|
model_name (str, optional): Model name to load. You can list models by ```tts.models```. Defaults to None. |
|
model_path (str, optional): Path to the model checkpoint. Defaults to None. |
|
config_path (str, optional): Path to the model config. Defaults to None. |
|
vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None. |
|
vocoder_config_path (str, optional): Path to the vocoder config. Defaults to None. |
|
progress_bar (bool, optional): Whether to pring a progress bar while downloading a model. Defaults to True. |
|
gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False. |
|
""" |
|
        self.manager = ModelManager(models_file=self.get_models_file_path(), progress_bar=progress_bar, verbose=False)

        self.synthesizer = None
        self.voice_converter = None
        self.csapi = None
        self.model_name = None

        if model_name:
            self.load_tts_model_by_name(model_name, gpu)

        if model_path:
            self.load_tts_model_by_path(
                model_path, config_path, vocoder_path=vocoder_path, vocoder_config=vocoder_config_path, gpu=gpu
            )

    @property
    def models(self):
        return self.manager.list_tts_models()

    @property
    def is_multi_speaker(self):
        if hasattr(self.synthesizer.tts_model, "speaker_manager") and self.synthesizer.tts_model.speaker_manager:
            return self.synthesizer.tts_model.speaker_manager.num_speakers > 1
        return False

    @property
    def is_coqui_studio(self):
        # Guard against `model_name` being None (e.g. when a model is loaded by path).
        return self.model_name is not None and "coqui_studio" in self.model_name

    @property
    def is_multi_lingual(self):
        if hasattr(self.synthesizer.tts_model, "language_manager") and self.synthesizer.tts_model.language_manager:
            return self.synthesizer.tts_model.language_manager.num_languages > 1
        return False

    @property
    def speakers(self):
        if not self.is_multi_speaker:
            return None
        return self.synthesizer.tts_model.speaker_manager.speaker_names

    @property
    def languages(self):
        if not self.is_multi_lingual:
            return None
        return self.synthesizer.tts_model.language_manager.language_names

    @staticmethod
    def get_models_file_path():
        return Path(__file__).parent / ".models.json"

    @staticmethod
    def list_models():
        try:
            csapi = CS_API()
            models = csapi.list_speakers_as_tts_models()
        except ValueError as e:
            # No Coqui Studio token available; fall back to the released models only.
            print(e)
            models = []
        manager = ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False, verbose=False)
        return manager.list_tts_models() + models

    def download_model_by_name(self, model_name: str):
        model_path, config_path, model_item = self.manager.download_model(model_name)
        if model_item.get("default_vocoder") is None:
            return model_path, config_path, None, None
        vocoder_path, vocoder_config_path, _ = self.manager.download_model(model_item["default_vocoder"])
        return model_path, config_path, vocoder_path, vocoder_config_path
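
    # Illustrative sketch (assumes `tts = TTS()` has been constructed): the method
    # always returns a 4-tuple, with None vocoder entries when the model has no
    # default vocoder. The model name is one already used in this module's docstrings.
    #   >>> paths = tts.download_model_by_name("tts_models/de/thorsten/tacotron2-DDC")
    #   >>> model_path, config_path, vocoder_path, vocoder_config_path = paths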

    def load_vc_model_by_name(self, model_name: str, gpu: bool = False):
        """Load one of the voice conversion models by name.

        Args:
            model_name (str): Model name to load. You can list models by ```tts.models```.
            gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
        """
        model_path, config_path, _, _ = self.download_model_by_name(model_name)
        self.voice_converter = Synthesizer(vc_checkpoint=model_path, vc_config=config_path, use_cuda=gpu)
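
    # Illustrative sketch: load the same FreeVC model that `tts_with_vc` below
    # falls back to, then run a file-to-file conversion through it (the wav file
    # names are placeholders).
    #   >>> tts = TTS()
    #   >>> tts.load_vc_model_by_name("voice_conversion_models/multilingual/vctk/freevc24")
    #   >>> wav = tts.voice_conversion(source_wav="source.wav", target_wav="target.wav")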

    def load_tts_model_by_name(self, model_name: str, gpu: bool = False):
        """Load one of 🐸TTS models by name.

        Args:
            model_name (str): Model name to load. You can list models by ```tts.models```.
            gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.

        TODO: Add tests
        """
        self.synthesizer = None
        self.csapi = None
        self.model_name = model_name

        if "coqui_studio" in model_name:
            # Coqui Studio models are served through the API; no local synthesizer is needed.
            self.csapi = CS_API()
        else:
            model_path, config_path, vocoder_path, vocoder_config_path = self.download_model_by_name(model_name)

            self.synthesizer = Synthesizer(
                tts_checkpoint=model_path,
                tts_config_path=config_path,
                tts_speakers_file=None,
                tts_languages_file=None,
                vocoder_checkpoint=vocoder_path,
                vocoder_config=vocoder_config_path,
                encoder_checkpoint=None,
                encoder_config=None,
                use_cuda=gpu,
            )

    def load_tts_model_by_path(
        self, model_path: str, config_path: str, vocoder_path: str = None, vocoder_config: str = None, gpu: bool = False
    ):
        """Load a model from a path.

        Args:
            model_path (str): Path to the model checkpoint.
            config_path (str): Path to the model config.
            vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None.
            vocoder_config (str, optional): Path to the vocoder config. Defaults to None.
            gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
        """
        self.synthesizer = Synthesizer(
            tts_checkpoint=model_path,
            tts_config_path=config_path,
            tts_speakers_file=None,
            tts_languages_file=None,
            vocoder_checkpoint=vocoder_path,
            vocoder_config=vocoder_config,
            encoder_checkpoint=None,
            encoder_config=None,
            use_cuda=gpu,
        )

    def _check_arguments(
        self,
        speaker: str = None,
        language: str = None,
        speaker_wav: str = None,
        emotion: str = None,
        speed: float = None,
    ) -> None:
        """Check if the arguments are valid for the loaded model."""
        if not self.is_coqui_studio:
            if self.is_multi_speaker and (speaker is None and speaker_wav is None):
                raise ValueError("Model is multi-speaker but no `speaker` is provided.")
            if self.is_multi_lingual and language is None:
                raise ValueError("Model is multi-lingual but no `language` is provided.")
            if not self.is_multi_speaker and speaker is not None:
                raise ValueError("Model is not multi-speaker but `speaker` is provided.")
            if not self.is_multi_lingual and language is not None:
                raise ValueError("Model is not multi-lingual but `language` is provided.")
            # Either argument alone is already unsupported outside Coqui Studio.
            if emotion is not None or speed is not None:
                raise ValueError("Emotion and speed can only be used with Coqui Studio models.")
        else:
            if emotion is None:
                emotion = "Neutral"
            if speed is None:
                speed = 1.0
            if speaker_wav is not None:
                raise ValueError("Coqui Studio models do not support `speaker_wav` argument.")
            if speaker is not None:
                raise ValueError("Coqui Studio models do not support `speaker` argument.")
            if language is not None and language != "en":
                raise ValueError("Coqui Studio models currently support only `language=en` argument.")
            if emotion not in ["Neutral", "Happy", "Sad", "Angry", "Dull"]:
                raise ValueError(f"Emotion - `{emotion}` - must be one of `Neutral`, `Happy`, `Sad`, `Angry`, `Dull`.")

    def tts_coqui_studio(
        self,
        text: str,
        speaker_name: str = None,
        language: str = None,
        emotion: str = "Neutral",
        speed: float = 1.0,
        file_path: str = None,
    ):
        """Convert text to speech using Coqui Studio models. Use the `CS_API` class if you are only interested in the API.

        Args:
            text (str):
                Input text to synthesize.
            speaker_name (str, optional):
                Speaker name from Coqui Studio. Defaults to None.
            language (str, optional):
                Language code. Coqui Studio currently supports only English. Defaults to None.
            emotion (str, optional):
                Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Defaults to "Neutral".
            speed (float, optional):
                Speed of the speech. Defaults to 1.0.
            file_path (str, optional):
                Path to save the output file. When None it returns the `np.ndarray` of waveform. Defaults to None.
        """
        speaker_name = self.model_name.split("/")[2]
        if file_path is not None:
            # Save to file and return the path, as documented above.
            return self.csapi.tts_to_file(
                text=text,
                speaker_name=speaker_name,
                language=language,
                speed=speed,
                emotion=emotion,
                file_path=file_path,
            )
        return self.csapi.tts(text=text, speaker_name=speaker_name, language=language, speed=speed, emotion=emotion)[0]

    def tts(
        self,
        text: str,
        speaker: str = None,
        language: str = None,
        speaker_wav: str = None,
        emotion: str = None,
        speed: float = None,
    ):
        """Convert text to speech.

        Args:
            text (str):
                Input text to synthesize.
            speaker (str, optional):
                Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
                `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
            language (str, optional):
                Language code for multi-lingual models. You can check whether loaded model is multi-lingual
                `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
            speaker_wav (str, optional):
                Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
                Defaults to None.
            emotion (str, optional):
                Emotion to use for 🐸Coqui Studio models. If None, Studio models use "Neutral". Defaults to None.
            speed (float, optional):
                Speed factor to use for 🐸Coqui Studio models, between 0 and 2.0. If None, Studio models use 1.0.
                Defaults to None.
        """
        self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav, emotion=emotion, speed=speed)
        if self.csapi is not None:
            return self.tts_coqui_studio(
                text=text, speaker_name=speaker, language=language, emotion=emotion, speed=speed
            )

        wav = self.synthesizer.tts(
            text=text,
            speaker_name=speaker,
            language_name=language,
            speaker_wav=speaker_wav,
            reference_wav=None,
            style_wav=None,
            style_text=None,
            reference_speaker_name=None,
        )
        return wav

    def tts_to_file(
        self,
        text: str,
        speaker: str = None,
        language: str = None,
        speaker_wav: str = None,
        emotion: str = "Neutral",
        speed: float = 1.0,
        file_path: str = "output.wav",
    ):
        """Convert text to speech and save the result to a file.

        Args:
            text (str):
                Input text to synthesize.
            speaker (str, optional):
                Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
                `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
            language (str, optional):
                Language code for multi-lingual models. You can check whether loaded model is multi-lingual
                `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
            speaker_wav (str, optional):
                Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
                Defaults to None.
            emotion (str, optional):
                Emotion to use for 🐸Coqui Studio models. Defaults to "Neutral".
            speed (float, optional):
                Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0. Defaults to 1.0.
            file_path (str, optional):
                Output file path. Defaults to "output.wav".
        """
        self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav)

        if self.csapi is not None:
            return self.tts_coqui_studio(
                text=text, speaker_name=speaker, language=language, emotion=emotion, speed=speed, file_path=file_path
            )
        wav = self.tts(text=text, speaker=speaker, language=language, speaker_wav=speaker_wav)
        self.synthesizer.save_wav(wav=wav, path=file_path)
        return file_path

    def voice_conversion(
        self,
        source_wav: str,
        target_wav: str,
    ):
        """Voice conversion with FreeVC. Convert the source wav to the target speaker's voice.

        Args:
            source_wav (str):
                Path to the source wav file.
            target_wav (str):
                Path to the target wav file.
        """
        # Use the dedicated voice conversion synthesizer loaded by `load_vc_model_by_name`.
        wav = self.voice_converter.voice_conversion(source_wav=source_wav, target_wav=target_wav)
        return wav

    def tts_with_vc(self, text: str, language: str = None, speaker_wav: str = None):
        """Convert text to speech with voice conversion.

        It combines tts with voice conversion to fake voice cloning.

        - Convert text to speech with tts.
        - Convert the output wav to target speaker with voice conversion.

        Args:
            text (str):
                Input text to synthesize.
            language (str, optional):
                Language code for multi-lingual models. You can check whether loaded model is multi-lingual
                `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
            speaker_wav (str, optional):
                Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
                Defaults to None.
        """
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
            # Synthesize to a temporary file first.
            self.tts_to_file(text=text, speaker=None, language=language, file_path=fp.name)
        if self.voice_converter is None:
            # Lazy-load the default voice conversion model on first use.
            self.load_vc_model_by_name("voice_conversion_models/multilingual/vctk/freevc24")
        wav = self.voice_converter.voice_conversion(source_wav=fp.name, target_wav=speaker_wav)
        return wav
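
    # Illustrative sketch (file names are placeholders): synthesize with a
    # single-speaker model, then convert the result to the reference voice.
    #   >>> tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC")
    #   >>> wav = tts.tts_with_vc("Ich bin eine Testnachricht.", speaker_wav="target_speaker.wav")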

    def tts_with_vc_to_file(
        self, text: str, language: str = None, speaker_wav: str = None, file_path: str = "output.wav"
    ):
        """Convert text to speech with voice conversion and save to file.

        Check `tts_with_vc` for more details.

        Args:
            text (str):
                Input text to synthesize.
            language (str, optional):
                Language code for multi-lingual models. You can check whether loaded model is multi-lingual
                `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
            speaker_wav (str, optional):
                Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
                Defaults to None.
            file_path (str, optional):
                Output file path. Defaults to "output.wav".
        """
        wav = self.tts_with_vc(text=text, language=language, speaker_wav=speaker_wav)
        save_wav(wav=wav, path=file_path, sample_rate=self.voice_converter.vc_config.audio.output_sample_rate)
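

# Illustrative end-to-end sketch (file names are placeholders):
#   >>> tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC")
#   >>> tts.tts_with_vc_to_file(
#   ...     "Ich bin eine Testnachricht.", speaker_wav="target_speaker.wav", file_path="cloned.wav"
#   ... )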