Spaces:
Running
Running
File size: 3,626 Bytes
4408097 ba3f0c0 4408097 ba3f0c0 4408097 ba3f0c0 4408097 ba3f0c0 4408097 ba3f0c0 4408097 ba3f0c0 4408097 ba3f0c0 4408097 ba3f0c0 4408097 ba3f0c0 4408097 ba3f0c0 4408097 ba3f0c0 4408097 ba3f0c0 4408097 ba3f0c0 4408097 ba3f0c0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 |
import os
import shutil
from pathlib import Path
import streamlit as st
import torchaudio
import IPython
from tortoise.api import TextToSpeech
from tortoise.utils.audio import load_voice
# Initialize TextToSpeech model
# Streamlit re-executes this script from the top on every widget interaction,
# so a bare ``TextToSpeech()`` call here would rebuild the model on every
# click. Caching it as a resource builds it once per server process instead.
# ``st.cache_resource`` is the current API; ``st.experimental_singleton`` is
# its predecessor on older Streamlit releases.
_cache_resource = getattr(st, "cache_resource", None) or st.experimental_singleton


@_cache_resource
def _load_tts():
    """Build the shared TextToSpeech instance (cached across script reruns)."""
    return TextToSpeech()


tts = _load_tts()

# Constants
# NOTE(review): each preset name is passed verbatim to tts.tts_with_preset();
# confirm the installed tortoise build accepts "very_fast".
PRESETS = ["ultra_fast", "fast", "standard", "high_quality", "very_fast"]
UPLOAD_FOLDER = "./uploads"
OUTPUT_FOLDER = "./output"

# Create upload and output directories if they don't exist
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
os.makedirs(OUTPUT_FOLDER, exist_ok=True)
# Page title
st.title("Tortoise Text-to-Speech App")

# Sidebar section where the reference clips for a new voice are selected.
# Only .wav files are accepted, and several can be chosen at once.
_sidebar = st.sidebar
_sidebar.header("Upload Audio Samples")
uploaded_files = _sidebar.file_uploader(
    label="Upload Audio Samples for a New Voice",
    type=["wav"],
    accept_multiple_files=True,
)
# Create a new voice: write every uploaded sample into
# ./tortoise/voices/<name>/, replacing any existing voice of the same name.
voice_name = st.sidebar.text_input("New Voice Name", help="Enter a name for your new voice.")
if st.sidebar.button("Create Voice"):
    new_voice_name = voice_name.strip().replace(" ", "_")
    # The name is interpolated into a path that is deleted with
    # shutil.rmtree() below, so it must be one plain path component.
    # Without this check a name such as "../.." would remove directories
    # outside tortoise/voices.
    name_is_safe = all(ch.isalnum() or ch in "_-" for ch in new_voice_name)

    if not new_voice_name:
        st.sidebar.warning("Please enter a name for the new voice.")
    elif not name_is_safe:
        st.sidebar.error(
            "Voice names may only contain letters, digits, spaces, '-' and '_'."
        )
    elif not uploaded_files:
        # Refuse to create (or wipe) a voice when there is nothing to store.
        st.sidebar.warning("Please upload at least one .wav sample first.")
    else:
        voices_dir = f"./tortoise/voices/{new_voice_name}/"
        if os.path.exists(voices_dir):
            shutil.rmtree(voices_dir)
        os.makedirs(voices_dir)
        for index, uploaded_file in enumerate(uploaded_files):
            # getvalue() returns the whole buffer regardless of the stream's
            # current read position, unlike read().
            bytes_data = uploaded_file.getvalue()
            with open(f"{voices_dir}voice_sample{index}.wav", "wb") as wav_file:
                wav_file.write(bytes_data)
        st.sidebar.success(f"Voice '{voice_name}' created successfully!")
# Input text and settings
st.header("Text-to-Speech Generation")
text = st.text_area(
    "Enter Text",
    help="Enter the text you want to convert to speech.",
    value="Joining two modalities results in a surprising increase in generalization! What would happen if we combined them all?",
)
preset = st.selectbox("Preset", PRESETS, help="Select a voice preset.")

# Build the list of selectable voices. Each voice is a sub-directory of
# tortoise/voices, so stray files are skipped. The list is sorted because
# os.listdir() returns entries in arbitrary order, which would make the
# default selection unstable between reruns. A missing voices directory
# yields an empty list instead of crashing the app.
_voices_root = "tortoise/voices"
try:
    _voice_entries = os.listdir(_voices_root)
except FileNotFoundError:
    _voice_entries = []
voices = sorted(
    v
    for v in _voice_entries
    if v != "cond_latent_example" and os.path.isdir(os.path.join(_voices_root, v))
)
voice = st.selectbox("Voice", voices, help="Select a voice to use for generation.")
# Generate speech for the entered text with the selected voice and preset.
if st.button("Generate Speech"):
    # Validate the voice actually chosen in the "Voice" selectbox. The
    # "New Voice Name" text box is only relevant when creating a voice, so
    # checking it here would block generation with an existing voice.
    if not voice:
        st.warning("Please create a voice first.")
    elif not text.strip():
        st.warning("Please enter some text to convert to speech.")
    else:
        st.info("Generating speech...")

        # Load voice samples
        voice_samples, conditioning_latents = load_voice(voice)

        # Generate speech with Tortoise
        gen = tts.tts_with_preset(
            text,
            voice_samples=voice_samples,
            conditioning_latents=conditioning_latents,
            preset=preset,
        )

        # Save the generated audio; 24000 is the sample rate written to the
        # file (presumably Tortoise's output rate -- confirm against the
        # installed version).
        output_path = os.path.join(OUTPUT_FOLDER, "generated.wav")
        torchaudio.save(output_path, gen.squeeze(0).cpu(), 24000)

        # Show the generated output once. st.audio renders a player with its
        # own play control, so no separate "Play Audio" checkbox is needed:
        # a checkbox nested inside a button branch vanishes on the rerun it
        # triggers, and a bare IPython.display.Audio(...) expression renders
        # nothing in Streamlit.
        st.subheader("Generated Output")
        st.audio(output_path, format="audio/wav")
        st.success("Speech generated successfully!")
# Sidebar "Clean Up" action: delete the upload and output folders and
# recreate each one empty, in that order.
if st.sidebar.button("Clean Up"):
    for _scratch_dir in (UPLOAD_FOLDER, OUTPUT_FOLDER):
        shutil.rmtree(_scratch_dir)
        os.makedirs(_scratch_dir, exist_ok=True)
    st.sidebar.success("Clean up completed!")
# Sidebar help panel: a short description of what the app offers.
_info_text = (
    "This app allows you to create a new voice by uploading .wav files. You can then generate speech "
    "using the selected voice and preset. You can play the generated audio and clean up uploaded files and the output directory when needed."
)
st.sidebar.header("Information")
st.sidebar.markdown(_info_text)
|