Spaces:

djkesu
/

tortoise5c

Running

App Files Community

tortoise5c / app.py

djkesu

Simplified app.py

ba3f0c0 about 1 year ago

raw

history blame

3.63 kB

	import os
	import shutil
	from pathlib import Path
	import streamlit as st
	import torchaudio
	import IPython

	from tortoise.api import TextToSpeech
	from tortoise.utils.audio import load_voice

	@st.cache_resource
	def _load_tts_model():
	    """Build the Tortoise ``TextToSpeech`` model once and reuse it.

	    Streamlit re-executes this script from the top on every widget
	    interaction. Without caching, ``TextToSpeech()`` would be constructed
	    again on each of those reruns.
	    """
	    return TextToSpeech()


	# Initialize TextToSpeech model (shared across Streamlit reruns)
	tts = _load_tts_model()

	# Constants: generation presets offered in the UI and the working folders
	PRESETS = [
	    "ultra_fast",
	    "fast",
	    "standard",
	    "high_quality",
	    "very_fast",
	]
	UPLOAD_FOLDER = "./uploads"
	OUTPUT_FOLDER = "./output"

	# Make sure both working folders are present before the rest of the script runs
	for _folder in (UPLOAD_FOLDER, OUTPUT_FOLDER):
	    Path(_folder).mkdir(parents=True, exist_ok=True)

	# Page title
	st.title("Tortoise Text-to-Speech App")

	# Sidebar: uploader for the .wav samples that will make up a new voice
	with st.sidebar:
	    st.header("Upload Audio Samples")
	    uploaded_files = st.file_uploader(
	        label="Upload Audio Samples for a New Voice",
	        type=["wav"],
	        accept_multiple_files=True,
	    )

	# Create a new voice: copy the uploaded samples into ./tortoise/voices/<name>/
	voice_name = st.sidebar.text_input("New Voice Name", help="Enter a name for your new voice.")

	if st.sidebar.button("Create Voice"):
	    # Folder name for the voice: trimmed, with spaces replaced by underscores
	    new_voice_name = voice_name.strip().replace(" ", "_")

	    if new_voice_name == "":
	        # Previously an empty name made the button do nothing without feedback
	        st.sidebar.warning("Please enter a name for the new voice.")
	    elif (
	        new_voice_name in (".", "..")
	        or "/" in new_voice_name
	        or "\\" in new_voice_name
	    ):
	        # The name is interpolated into a path that is removed with
	        # shutil.rmtree below, so it must be a single plain path component;
	        # otherwise a name such as ".." would delete a parent directory.
	        st.sidebar.error("Voice name must not contain path separators or be '.' or '..'.")
	    elif not uploaded_files:
	        # Without samples we would only wipe any existing voice of the same
	        # name and leave an empty folder behind.
	        st.sidebar.warning("Please upload at least one .wav sample before creating a voice.")
	    else:
	        voices_dir = f"./tortoise/voices/{new_voice_name}/"

	        # Start from an empty folder so samples of an older voice with the
	        # same name are not mixed with the new ones
	        if os.path.exists(voices_dir):
	            shutil.rmtree(voices_dir)
	        os.makedirs(voices_dir)

	        for index, uploaded_file in enumerate(uploaded_files):
	            # getvalue() returns the whole upload regardless of the current
	            # read position of the file-like object
	            with open(f"{voices_dir}voice_sample{index}.wav", "wb") as wav_file:
	                wav_file.write(uploaded_file.getvalue())

	        # Report the stored name, i.e. the folder name created above
	        st.sidebar.success(f"Voice '{new_voice_name}' created successfully!")

	# Main panel: the text to speak and the generation preset
	st.header("Text-to-Speech Generation")

	_default_text = "Joining two modalities results in a surprising increase in generalization! What would happen if we combined them all?"
	text = st.text_area(
	    label="Enter Text",
	    value=_default_text,
	    help="Enter the text you want to convert to speech.",
	)

	preset = st.selectbox(
	    label="Preset",
	    options=PRESETS,
	    help="Select a voice preset.",
	)

	# Voices available for generation. os.listdir returns entries in arbitrary
	# order, so sort them to keep the selectbox stable between reruns. Voices
	# are created as directories by the "Create Voice" step, so stray files in
	# the folder are skipped; "cond_latent_example" is excluded as before.
	_voices_root = "tortoise/voices"
	voices = sorted(
	    v
	    for v in os.listdir(_voices_root)
	    if v != "cond_latent_example" and os.path.isdir(os.path.join(_voices_root, v))
	)
	voice = st.selectbox("Voice", voices, help="Select a voice to use for generation.")

	# Generate speech with the voice chosen in the "Voice" selectbox
	if st.button("Generate Speech"):
	    if not voice:
	        # Validate the selected voice (the value passed to load_voice below),
	        # not the "New Voice Name" text field: that field is only used when
	        # creating a voice and is normally empty when picking an existing one.
	        st.warning("Please create a voice first.")
	    elif not text.strip():
	        st.warning("Please enter some text to convert to speech.")
	    else:
	        st.info("Generating speech...")

	        # Load voice samples
	        voice_samples, conditioning_latents = load_voice(voice)

	        # Generate speech with Tortoise
	        gen = tts.tts_with_preset(
	            text,
	            voice_samples=voice_samples,
	            conditioning_latents=conditioning_latents,
	            preset=preset,
	        )

	        # Save the generated audio
	        output_path = os.path.join(OUTPUT_FOLDER, "generated.wav")
	        torchaudio.save(output_path, gen.squeeze(0).cpu(), 24000)

	        # Show generated output. The player is rendered once (it used to be
	        # emitted twice). The former "Play Audio" checkbox was dropped: it
	        # lived inside this button branch, so ticking it reran the script
	        # with the button no longer pressed, and it only built an
	        # IPython.display.Audio object that was never handed to Streamlit.
	        st.subheader("Generated Output")
	        st.audio(output_path, format="audio/wav")

	        st.success("Speech generated successfully!")

	# Sidebar action: empty the upload and output folders
	if st.sidebar.button("Clean Up"):
	    # Remove each working folder with its contents, then put back an empty one
	    for _stale_folder in (UPLOAD_FOLDER, OUTPUT_FOLDER):
	        shutil.rmtree(_stale_folder)
	        os.makedirs(_stale_folder, exist_ok=True)
	    st.sidebar.success("Clean up completed!")

	# Sidebar: short description of what the app does
	with st.sidebar:
	    st.header("Information")
	    st.markdown(
	        "This app allows you to create a new voice by uploading .wav files. You can then generate speech "
	        "using the selected voice and preset. You can play the generated audio and clean up uploaded files and the output directory when needed."
	    )