"""Gradio demo that synthesizes speech with Tortoise TTS in a selectable voice."""

import torch
import torchaudio
import torch.nn as nn
import torch.nn.functional as F
import IPython

import sys
import subprocess

# Tortoise is installed at start-up rather than through a requirements file.
subprocess.check_call(
    [
        sys.executable,
        "-m",
        "pip",
        "install",
        "--upgrade",
        "--force-reinstall",
        "git+https://github.com/osanseviero/tortoise-tts.git",
    ]
)
# entmax could not be installed at same time as torch
subprocess.check_call([sys.executable, "-m", "pip", "install", "entmax"])

# These imports must come after the installs above, which provide the packages.
import gradio as gr

from tortoise_tts.api import TextToSpeech
from tortoise_tts.utils.audio import load_audio, get_voices

# Sample rate the conditioning clips are loaded at.
CONDITIONING_SAMPLE_RATE = 22050
# Tortoise's README saves generated audio at 24 kHz.
# NOTE(review): confirm this matches the installed fork.
OUTPUT_SAMPLE_RATE = 24000

# This will download all the models used by Tortoise from the HF hub.
tts = TextToSpeech()

# Voice names offered in the UI dropdown.
# NOTE(review): "William" is capitalized unlike the others — confirm it matches
# a key returned by get_voices().
voices = [
    "angie",
    "daniel",
    "deniro",
    "emma",
    "freeman",
    "geralt",
    "halle",
    "jlaw",
    "lj",
    "snakes",
    "tom",
    "William",
]

# Lookup of reference clips per voice. Kept under its own name so it no longer
# overwrites the list of dropdown choices above.
voice_paths = get_voices()

preset = "fastest"


def inference(text, voice):
    """Generate speech for ``text`` conditioned on the clips of ``voice``.

    Args:
        text: The sentence(s) to synthesize.
        voice: A voice name from ``voices``. An integer index into ``voices``
            is also accepted for backward compatibility.

    Returns:
        Whatever ``tts.tts_with_preset`` returns for the module-level
        ``preset``.

    Raises:
        KeyError: If ``voice`` has no entry in ``voice_paths``.
        IndexError: If ``voice`` is an integer outside ``voices``.
    """
    if isinstance(voice, int):
        voice = voices[voice]
    conds = [
        load_audio(cond_path, CONDITIONING_SAMPLE_RATE)
        for cond_path in voice_paths[voice]
    ]
    return tts.tts_with_preset(text, conds, preset)


def generate_audio(text, voice):
    """Gradio callback: synthesize ``text`` and return ``(sample_rate, samples)``.

    Gradio's audio output takes a ``(sample_rate, numpy_array)`` tuple, so the
    generated tensor is flattened and converted here.
    """
    gen = inference(text, voice)
    # assumes a single generated clip with singleton batch/channel dims
    # — TODO confirm against the installed tortoise_tts version
    samples = gen.squeeze().detach().cpu().numpy()
    return OUTPUT_SAMPLE_RATE, samples


text = "Joining two modalities results in a surprising increase in generalization! What would happen if we combined them all?"

iface = gr.Interface(
    generate_audio,
    inputs=[
        gr.inputs.Textbox(type="text", default=text, label="Text"),
        # type="value" passes the selected name, which is what inference
        # uses to look up the voice.
        gr.inputs.Dropdown(voices, type="value", label="Voice"),
    ],
    outputs="audio",
)
iface.launch()