# tortoise5c / app.py
# Uploaded by djkesu, commit ba3f0c0: "Simplified app.py" (3.63 kB)
import os
import shutil
from pathlib import Path
import streamlit as st
import torchaudio
import IPython
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice
# Initialize TextToSpeech model
# Streamlit re-executes this script from top to bottom on every widget
# interaction, so a bare ``TextToSpeech()`` call here would rebuild the model
# on each rerun. Wrap the constructor in Streamlit's resource cache when the
# installed version provides one (``cache_resource``, or the older
# ``experimental_singleton``); otherwise construct it directly as before.
_cache_resource = getattr(st, "cache_resource", None) or getattr(
    st, "experimental_singleton", None
)


def _load_tts():
    """Build and return the Tortoise ``TextToSpeech`` model."""
    return TextToSpeech()


if _cache_resource is not None:
    _load_tts = _cache_resource(_load_tts)

tts = _load_tts()
# Constants
PRESETS = [
    "ultra_fast",
    "fast",
    "standard",
    "high_quality",
    "very_fast",
]
UPLOAD_FOLDER = "./uploads"
OUTPUT_FOLDER = "./output"

# Make sure both working directories are present; existing ones are left
# untouched thanks to exist_ok=True.
for _folder in (UPLOAD_FOLDER, OUTPUT_FOLDER):
    os.makedirs(_folder, exist_ok=True)
# Streamlit UI elements
st.title("Tortoise Text-to-Speech App")

# Sidebar uploader: accepts any number of .wav files in one go.
st.sidebar.header("Upload Audio Samples")
uploaded_files = st.sidebar.file_uploader(
    label="Upload Audio Samples for a New Voice",
    type=["wav"],
    accept_multiple_files=True,
)
# Create a new voice
voice_name = st.sidebar.text_input("New Voice Name", help="Enter a name for your new voice.")


def _safe_voice_dirname(raw_name):
    """Reduce a user-typed voice name to a single safe directory name.

    Surrounding whitespace is stripped. Alphanumeric characters, ``_`` and
    ``-`` are kept; every other character (spaces, dots, path separators, ...)
    is replaced with ``_``. The result therefore can never contain ``..`` or a
    separator, so it cannot escape the voices directory.

    Returns an empty string when *raw_name* is blank.
    """
    return "".join(
        ch if ch.isalnum() or ch in "_-" else "_" for ch in raw_name.strip()
    )


if st.sidebar.button("Create Voice"):
    new_voice_name = _safe_voice_dirname(voice_name)
    if not new_voice_name:
        st.sidebar.warning("Please enter a name for the new voice.")
    elif not uploaded_files:
        # Check uploads *before* touching the disk so that an existing voice
        # of the same name is not wiped and replaced by an empty directory.
        st.sidebar.warning("Please upload at least one .wav sample for the new voice.")
    else:
        voices_dir = f"./tortoise/voices/{new_voice_name}/"
        # Re-creating a voice replaces all of its previous samples.
        if os.path.exists(voices_dir):
            shutil.rmtree(voices_dir)
        os.makedirs(voices_dir)
        for index, uploaded_file in enumerate(uploaded_files):
            # getvalue() returns the whole buffer regardless of the current
            # read position of the uploaded file object.
            with open(f"{voices_dir}voice_sample{index}.wav", "wb") as wav_file:
                wav_file.write(uploaded_file.getvalue())
        # Report the directory name actually used, since that is what the
        # voice is stored under.
        st.sidebar.success(f"Voice '{new_voice_name}' created successfully!")
# Input text and settings
st.header("Text-to-Speech Generation")

# Sample sentence pre-filled in the text area.
_default_text = "Joining two modalities results in a surprising increase in generalization! What would happen if we combined them all?"
text = st.text_area(
    "Enter Text",
    value=_default_text,
    help="Enter the text you want to convert to speech.",
)
preset = st.selectbox("Preset", PRESETS, help="Select a voice preset.")

# Offer every entry found under tortoise/voices as a selectable voice, except
# the entry named "cond_latent_example".
voices = []
for _entry in os.listdir("tortoise/voices"):
    if _entry != "cond_latent_example":
        voices.append(_entry)
voice = st.selectbox("Voice", voices, help="Select a voice to use for generation.")
# Generate speech
if st.button("Generate Speech"):
    # Gate on the voice chosen in the "Voice" selectbox (empty when no voice
    # is available), not on the unrelated "New Voice Name" text box.
    if not voice:
        st.warning("Please create a voice first.")
    elif not text.strip():
        st.warning("Please enter some text to convert to speech.")
    else:
        # The spinner is shown only while the block below is running.
        with st.spinner("Generating speech..."):
            # Load voice samples
            voice_samples, conditioning_latents = load_voice(voice)

            # Generate speech with Tortoise
            gen = tts.tts_with_preset(
                text,
                voice_samples=voice_samples,
                conditioning_latents=conditioning_latents,
                preset=preset,
            )

            # Drop the leading dimension, move the result to the CPU and write
            # it out as a .wav file with a 24000 Hz sample rate.
            output_path = os.path.join(OUTPUT_FOLDER, "generated.wav")
            torchaudio.save(output_path, gen.squeeze(0).cpu(), 24000)

        # Show generated output. A single audio player is rendered; it is the
        # playback control for the generated file.
        st.subheader("Generated Output")
        st.audio(output_path, format="audio/wav")

        st.success("Speech generated successfully!")
# Clean up uploaded files and output directory
if st.sidebar.button("Clean Up"):
    # Empty each working folder by removing it entirely and creating it anew:
    # uploads first, then output.
    for _stale_dir in (UPLOAD_FOLDER, OUTPUT_FOLDER):
        shutil.rmtree(_stale_dir)
        os.makedirs(_stale_dir, exist_ok=True)
    st.sidebar.success("Clean up completed!")
# Display information
st.sidebar.header("Information")

# Short usage summary rendered at the bottom of the sidebar.
_about_text = (
    "This app allows you to create a new voice by uploading .wav files. You can then generate speech "
    "using the selected voice and preset. You can play the generated audio and clean up uploaded files and the output directory when needed."
)
st.sidebar.markdown(_about_text)