File size: 3,626 Bytes
4408097
 
 
 
ba3f0c0
 
4408097
ba3f0c0
 
4408097
ba3f0c0
 
4408097
ba3f0c0
 
 
 
4408097
ba3f0c0
 
 
4408097
ba3f0c0
 
4408097
ba3f0c0
 
 
 
 
 
 
4408097
ba3f0c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4408097
ba3f0c0
4408097
ba3f0c0
 
4408097
ba3f0c0
 
 
 
 
 
4408097
ba3f0c0
 
4408097
ba3f0c0
 
 
 
 
 
4408097
 
ba3f0c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import os
import shutil
from pathlib import Path
import streamlit as st
import torchaudio
import IPython

from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice

# Initialize TextToSpeech model
# Streamlit re-executes this script from the top on every widget interaction,
# so the model is built inside a cached loader instead of on each rerun.
# Older Streamlit releases expose the same idea under a different name; when
# neither decorator exists the loader simply runs uncached, as before.
_cache_model = (
    getattr(st, "cache_resource", None)
    or getattr(st, "experimental_singleton", None)
    or (lambda func: func)
)


@_cache_model
def _load_tts():
    """Build the Tortoise TextToSpeech model with its default settings."""
    return TextToSpeech()


tts = _load_tts()

# Constants
# Preset names offered in the "Preset" drop-down and handed to the model.
# NOTE(review): confirm that the installed Tortoise build accepts every preset
# listed here (in particular "very_fast").
PRESETS = [
    "ultra_fast",
    "fast",
    "standard",
    "high_quality",
    "very_fast",
]
# Working folders, relative to the directory the app is started from.
UPLOAD_FOLDER, OUTPUT_FOLDER = "./uploads", "./output"

# Make sure both working folders are present before the UI is built
for _working_dir in (UPLOAD_FOLDER, OUTPUT_FOLDER):
    os.makedirs(_working_dir, exist_ok=True)

# Page heading shown at the top of the main area
st.title("Tortoise Text-to-Speech App")

# Sidebar section: collect the .wav recordings that will make up a new voice.
# Several files may be selected at once; only the "wav" extension is accepted.
st.sidebar.header("Upload Audio Samples")
uploaded_files = st.sidebar.file_uploader(
    label="Upload Audio Samples for a New Voice",
    type=["wav"],
    accept_multiple_files=True,
)

# Create a new voice
voice_name = st.sidebar.text_input("New Voice Name", help="Enter a name for your new voice.")

if st.sidebar.button("Create Voice"):
    # The typed name becomes a folder that is deleted and recreated below, so
    # keep only letters, digits, "-" and "_". Spaces still turn into "_" as
    # before, while path separators and dots can no longer steer the
    # rmtree/mkdir calls outside the voices folder.
    new_voice_name = "".join(
        ch if ch.isalnum() or ch in "-_" else "_" for ch in voice_name.strip()
    )

    if not new_voice_name.strip("_"):
        st.sidebar.warning("Please enter a name for the new voice.")
    elif not uploaded_files:
        # Without samples we would wipe an existing voice of the same name and
        # leave an empty folder behind.
        st.sidebar.warning("Please upload at least one .wav sample before creating a voice.")
    else:
        voices_dir = Path("tortoise/voices") / new_voice_name
        # Start from a clean folder so samples from an earlier voice with the
        # same name do not linger.
        if voices_dir.exists():
            shutil.rmtree(voices_dir)
        voices_dir.mkdir(parents=True)

        for index, uploaded_file in enumerate(uploaded_files):
            # getvalue() returns the complete upload regardless of the
            # buffer's current read position.
            sample_path = voices_dir / f"voice_sample{index}.wav"
            sample_path.write_bytes(uploaded_file.getvalue())

        # Report the folder name actually created; it is the one that shows
        # up in the voice selector.
        st.sidebar.success(f"Voice '{new_voice_name}' created successfully!")

# Input text and settings
st.header("Text-to-Speech Generation")
text = st.text_area(
    "Enter Text",
    help="Enter the text you want to convert to speech.",
    value="Joining two modalities results in a surprising increase in generalization! What would happen if we combined them all?",
)

preset = st.selectbox("Preset", PRESETS, help="Select a voice preset.")

# Offer every voice folder except the "cond_latent_example" entry.
# Only directories count as voices (stray files are skipped), the list is
# sorted so the drop-down order is stable between reruns, and a missing voices
# folder yields an empty list instead of an exception.
_voices_root = Path("tortoise/voices")
if _voices_root.is_dir():
    voices = sorted(
        entry.name
        for entry in _voices_root.iterdir()
        if entry.is_dir() and entry.name != "cond_latent_example"
    )
else:
    voices = []
voice = st.selectbox("Voice", voices, help="Select a voice to use for generation.")

# Generate speech
if st.button("Generate Speech"):
    # Validate the voice picked in the "Voice" selector (it is empty when no
    # voices are available), not the "New Voice Name" text box, so existing
    # voices can be used without typing a new name.
    if not voice:
        st.warning("Please create a voice first.")
    elif not text.strip():
        st.warning("Please enter some text to convert to speech.")
    else:
        st.info("Generating speech...")

        # Load the reference material for the selected voice
        voice_samples, conditioning_latents = load_voice(voice)

        # Generate speech with Tortoise using the chosen preset
        gen = tts.tts_with_preset(
            text,
            voice_samples=voice_samples,
            conditioning_latents=conditioning_latents,
            preset=preset,
        )

        # Save the result as a 24 kHz wav file; each run overwrites the last
        output_path = os.path.join(OUTPUT_FOLDER, "generated.wav")
        torchaudio.save(output_path, gen.squeeze(0).cpu(), 24000)

        # Show a single audio player for the generated file.
        # The former "Play Audio" checkbox was dropped: toggling a widget
        # reruns the script with the button reported as not clicked, so its
        # branch never executed, and the IPython audio object it built was
        # discarded without being rendered.
        st.subheader("Generated Output")
        st.audio(output_path, format="audio/wav")

        st.success("Speech generated successfully!")

# Sidebar action: empty the upload and output folders.
# Each folder is removed with everything in it and then recreated empty.
if st.sidebar.button("Clean Up"):
    for _scratch_dir in (UPLOAD_FOLDER, OUTPUT_FOLDER):
        shutil.rmtree(_scratch_dir)
        os.makedirs(_scratch_dir, exist_ok=True)
    st.sidebar.success("Clean up completed!")

# Sidebar footer: a short description of what the app offers
st.sidebar.header("Information")
_about_text = (
    "This app allows you to create a new voice by uploading .wav files. You can then generate speech "
    "using the selected voice and preset. You can play the generated audio and clean up uploaded files and the output directory when needed."
)
st.sidebar.markdown(_about_text)