# Source: Hugging Face Space "djkesu/tortoise5c" (status: Running).
# File size: 3,626 bytes.

import os
import shutil
from pathlib import Path
import streamlit as st
import torchaudio
import IPython

from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice

# Initialize TextToSpeech model
def _load_tts():
    """Construct the Tortoise ``TextToSpeech`` engine used for generation."""
    return TextToSpeech()


# Streamlit re-executes this script on every widget interaction, so an
# unconditional ``TextToSpeech()`` call here is repeated on every rerun.
# Wrap the loader in Streamlit's resource cache when the installed version
# provides one (``cache_resource``, or the older ``experimental_singleton``);
# otherwise fall back to constructing the engine directly.
_resource_cache = getattr(st, "cache_resource", None) or getattr(
    st, "experimental_singleton", None
)
tts = _resource_cache(_load_tts)() if _resource_cache else _load_tts()

# Preset names offered in the "Preset" selector and passed to the TTS engine.
# NOTE(review): confirm every entry is a preset the installed Tortoise
# version accepts.
PRESETS = [
    "ultra_fast",
    "fast",
    "standard",
    "high_quality",
    "very_fast",
]

# Working directories, relative to the process's current directory.
UPLOAD_FOLDER = "./uploads"
OUTPUT_FOLDER = "./output"

# Make sure both working directories exist before any widget needs them.
for folder in (UPLOAD_FOLDER, OUTPUT_FOLDER):
    os.makedirs(folder, exist_ok=True)

# Page title, followed by the sidebar section for uploading voice samples.
st.title("Tortoise Text-to-Speech App")

with st.sidebar:
    st.header("Upload Audio Samples")
    # Only .wav files are accepted; several may be selected at once.
    uploaded_files = st.file_uploader(
        label="Upload Audio Samples for a New Voice",
        type=["wav"],
        accept_multiple_files=True,
    )

# Create a new voice
voice_name = st.sidebar.text_input("New Voice Name", help="Enter a name for your new voice.")

if st.sidebar.button("Create Voice"):
    # The name becomes a directory that is removed and recreated below, so
    # keep only characters that cannot form a path: separators and "." are
    # dropped. Otherwise input such as "../.." would point rmtree at a
    # directory outside the voices folder.
    new_voice_name = "".join(
        ch
        for ch in voice_name.strip().replace(" ", "_")
        if ch.isalnum() or ch in "_-"
    )

    if not new_voice_name:
        st.sidebar.warning("Please enter a voice name using letters, digits, '_' or '-'.")
    elif not uploaded_files:
        # Without this check an existing voice of the same name would be
        # wiped and replaced by an empty directory.
        st.sidebar.warning("Please upload at least one .wav sample before creating a voice.")
    else:
        voices_dir = os.path.join("tortoise", "voices", new_voice_name)

        # Start from a clean directory so stale samples never mix with new ones.
        if os.path.exists(voices_dir):
            shutil.rmtree(voices_dir)
        os.makedirs(voices_dir)

        # Store each upload as voice_sample<index>.wav inside the voice directory.
        for index, uploaded_file in enumerate(uploaded_files):
            sample_path = os.path.join(voices_dir, f"voice_sample{index}.wav")
            with open(sample_path, "wb") as wav_file:
                wav_file.write(uploaded_file.read())

        # Report the directory name actually created, not the raw input.
        st.sidebar.success(f"Voice '{new_voice_name}' created successfully!")

# Main-panel inputs: the text to synthesise and the generation preset.
st.header("Text-to-Speech Generation")

# Sample sentence pre-filled in the text box.
default_text = "Joining two modalities results in a surprising increase in generalization! What would happen if we combined them all?"
text = st.text_area(
    "Enter Text",
    value=default_text,
    help="Enter the text you want to convert to speech.",
)

# The first entry of PRESETS is selected by default.
preset = st.selectbox(
    "Preset",
    PRESETS,
    help="Select a voice preset.",
)

# List the available voices: every sub-directory of the voices folder except
# "cond_latent_example". Plain files are skipped, a missing voices folder
# yields an empty list instead of an exception, and the names are sorted so
# the selector order is stable between reruns.
voices_root = "tortoise/voices"
voices = sorted(
    v
    for v in (os.listdir(voices_root) if os.path.isdir(voices_root) else [])
    if v != "cond_latent_example" and os.path.isdir(os.path.join(voices_root, v))
)
voice = st.selectbox("Voice", voices, help="Select a voice to use for generation.")

# Generate speech
if st.button("Generate Speech"):
    # Validate the inputs this step actually uses: the voice chosen in the
    # "Voice" selector and the text to speak. The "New Voice Name" box only
    # matters when creating a voice, so it is not consulted here.
    if not voice:
        st.warning("Please create a voice first.")
    elif not text.strip():
        st.warning("Please enter some text to convert to speech.")
    else:
        st.info("Generating speech...")

        # Load voice samples
        voice_samples, conditioning_latents = load_voice(voice)

        # Generate speech with Tortoise
        gen = tts.tts_with_preset(
            text,
            voice_samples=voice_samples,
            conditioning_latents=conditioning_latents,
            preset=preset,
        )

        # Save the generated audio; 24000 is passed as the sample rate.
        output_path = os.path.join(OUTPUT_FOLDER, "generated.wav")
        torchaudio.save(output_path, gen.squeeze(0).cpu(), 24000)

        # Show the result once. A widget nested inside this button branch
        # would trigger a rerun in which the button is no longer pressed,
        # so no extra "play" toggle is offered; the player has its own controls.
        st.subheader("Generated Output")
        st.audio(output_path, format="audio/wav")

        st.success("Speech generated successfully!")

# Sidebar action: empty the upload and output folders by deleting and
# recreating each one in turn.
if st.sidebar.button("Clean Up"):
    for folder in (UPLOAD_FOLDER, OUTPUT_FOLDER):
        shutil.rmtree(folder)
        os.makedirs(folder, exist_ok=True)
    st.sidebar.success("Clean up completed!")

# Sidebar help section: a short description of what the app does.
about_text = (
    "This app allows you to create a new voice by uploading .wav files. You can then generate speech "
    "using the selected voice and preset. You can play the generated audio and clean up uploaded files and the output directory when needed."
)
st.sidebar.header("Information")
st.sidebar.markdown(about_text)