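# Gradio demo for TorToiSe, a multi-voice text-to-speech system.
# It installs the tortoise-tts package at startup, downloads the pretrained
# models from the Hugging Face Hub, and serves a simple web UI.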
import IPython

import sys
import subprocess

# Install the tortoise-tts fork at startup so the package is available in this environment
subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", "--force-reinstall", "git+https://github.com/osanseviero/tortoise-tts.git"])

# entmax could not be installed at same time as torch
subprocess.check_call([sys.executable, "-m", "pip", "install", "entmax"])

from tortoise_tts.api import TextToSpeech
from tortoise_tts.utils.audio import load_audio, get_voices
import torch
import torchaudio
import gradio as gr

# Run on the GPU when available, otherwise fall back to CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# This will download all the models used by Tortoise from the HF hub
tts = TextToSpeech(device=device)

voices = [
  "angie",
  "daniel",
  "deniro",
  "emma",
  "freeman",
  "geralt",
  "halle",
  "jlaw",
  "lj",
  "snakes",
  "William",
]
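# get_voices() returns a mapping from voice name to its conditioning audio clips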
voice_paths = get_voices()
print(voice_paths)

# Quality/speed trade-off preset passed to tts_with_preset
preset = "fast"

def inference(text, voice):
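    """Synthesize the given text in the selected voice and return the path to the generated WAV."""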
    text = text[:256]  # cap the prompt length
    cond_paths = voice_paths[voice]
    conds = []
    print(voice_paths, voice, cond_paths)
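    # Load each conditioning clip for the chosen voice at 22.05 kHz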
    for cond_path in cond_paths:
        c = load_audio(cond_path, 22050)
        conds.append(c)
    print(text, conds, preset)
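    # Generate speech conditioned on the reference clips and save it as a 24 kHz WAV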
    gen = tts.tts_with_preset(text, conds, preset)
    print("gen")
    torchaudio.save('generated.wav', gen.squeeze(0).cpu(), 24000)
    return "generated.wav"
    

 
text = "Joining two modalities results in a surprising increase in generalization! What would happen if we combined them all?"
examples = [
    [text, "angie"],
    [text, "emma"],
    ["how are you doing this day", "freeman"]
]

# Build the web UI: a Blocks layout with the TTS interface in a tab
block = gr.Blocks()
with block:
    gr.Markdown("# TorToiSe")
    gr.Markdown("A multi-voice TTS system trained with an emphasis on quality")
    with gr.Tabs():
      with gr.TabItem("Pre-recorded voices"):
        iface = gr.Interface(
          inference,
          inputs=[
              gr.inputs.Textbox(type="str", default=text, label="Text", lines=3),
              gr.inputs.Dropdown(voices),
          ],
          outputs="audio",
          enable_queue=True,
          examples=examples,
        )

    gr.Markdown("This demo shows the ultra fast option in the TorToiSe system. For more info check the <a href='https://github.com/neonbjb/tortoise-tts' target='_blank'>Repository</a>.",)

block.launch()