# voice_clone_v2 / app.py
import subprocess
import sys

# Install the local (patched) transformers package in editable mode before
# anything else imports transformers.
local_package_paths = ["./transformers"]
for package_path in local_package_paths:
    subprocess.run([sys.executable, "-m", "pip", "install", "-e", package_path], check=True)
import gradio as gr
from share_btn import community_icon_html, loading_icon_html, share_js
import os
import shutil
import re
#from huggingface_hub import snapshot_download
import numpy as np
from scipy.io import wavfile
from scipy.io.wavfile import write, read
from pydub import AudioSegment
# Feature flag: set ALLOW_FILE_UPLOAD="True" to enable direct WAV uploads.
file_upload_available = os.environ.get("ALLOW_FILE_UPLOAD")
MAX_NUMBER_SENTENCES = 10
import json
with open("characters.json", "r") as file:
data = json.load(file)
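# Expected characters.json shape (field names come from the comprehension
# below; the values shown here are illustrative only):
# [{"image": "img/ana.png", "title": "Ana", "speaker": "ana"}, ...]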
characters = [
{
"image": item["image"],
"title": item["title"],
"speaker": item["speaker"]
}
for item in data
]
from TTS.api import TTS
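# Defensive sketch (an addition, not in the upstream app): fall back to CPU
# when no GPU is visible instead of hard-failing on gpu=True. torch is
# already installed as a dependency of the TTS package.
import torch
use_gpu = torch.cuda.is_available()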
tts = TTS("tts_models/multilingual/multi-dataset/bark", gpu=use_gpu)
def cut_wav(input_path, max_duration):
# Load the WAV file
audio = AudioSegment.from_wav(input_path)
# Calculate the duration of the audio
audio_duration = len(audio) / 1000 # Convert milliseconds to seconds
# Determine the duration to cut (maximum of max_duration and actual audio duration)
cut_duration = min(max_duration, audio_duration)
# Cut the audio
cut_audio = audio[:int(cut_duration * 1000)] # Convert seconds to milliseconds
# Get the input file name without extension
file_name = os.path.splitext(os.path.basename(input_path))[0]
# Construct the output file path with the original file name and "_cut" suffix
output_path = f"{file_name}_cut.wav"
# Save the cut audio as a new WAV file
cut_audio.export(output_path, format="wav")
return output_path
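# Example usage (hypothetical file name; cut_wav is not wired into the UI
# below and is kept as a utility):
#   trimmed = cut_wav("my_voice.wav", max_duration=20)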
def load_hidden(audio_in):
    # Pass-through: mirror the uploaded file into the hidden audio slot.
    return audio_in
def load_hidden_mic(audio_in):
print("USER RECORDED A NEW SAMPLE")
library_path = 'bark_voices'
folder_name = 'audio-0-100'
second_folder_name = 'audio-0-100_cleaned'
folder_path = os.path.join(library_path, folder_name)
second_folder_path = os.path.join(library_path, second_folder_name)
    print("Cleaning up util files from any previous recording, if present:")
if os.path.exists(folder_path):
try:
shutil.rmtree(folder_path)
print(f"Successfully deleted the folder previously created from last raw recorded sample: {folder_path}")
except OSError as e:
print(f"Error: {folder_path} - {e.strerror}")
else:
print(f"OK, the folder for a raw recorded sample does not exist: {folder_path}")
if os.path.exists(second_folder_path):
try:
shutil.rmtree(second_folder_path)
print(f"Successfully deleted the folder previously created from last cleaned recorded sample: {second_folder_path}")
except OSError as e:
print(f"Error: {second_folder_path} - {e.strerror}")
else:
print(f"Ok, the folder for a cleaned recorded sample does not exist: {second_folder_path}")
return audio_in
def clear_clean_check():
    # Reset the "Clean sample?" checkbox.
    return False
def wipe_npz_file(folder_path):
    # Logging-only hook for now: fired whenever a user swaps or clears an
    # audio input; the actual folder cleanup happens in load_hidden_mic.
    print("A user is manipulating audio inputs")
def split_process(audio, chosen_out_track):
gr.Info("Cleaning your audio sample...")
os.makedirs("out", exist_ok=True)
write('test.wav', audio[0], audio[1])
os.system("python3 -m demucs.separate -n mdx_extra_q -j 4 test.wav -o out")
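    # demucs writes one file per stem under out/<model>/<input-name>/,
    # e.g. out/mdx_extra_q/test/vocals.wav for the call above.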
#return "./out/mdx_extra_q/test/vocals.wav","./out/mdx_extra_q/test/bass.wav","./out/mdx_extra_q/test/drums.wav","./out/mdx_extra_q/test/other.wav"
if chosen_out_track == "vocals":
print("Audio sample cleaned")
return "./out/mdx_extra_q/test/vocals.wav"
elif chosen_out_track == "bass":
return "./out/mdx_extra_q/test/bass.wav"
elif chosen_out_track == "drums":
return "./out/mdx_extra_q/test/drums.wav"
elif chosen_out_track == "other":
return "./out/mdx_extra_q/test/other.wav"
elif chosen_out_track == "all-in":
return "test.wav"
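# Helper for a character-library gr.Gallery (not wired into this build's UI):
# Gradio passes the clicked cell as gr.SelectData, whose .index maps back
# into the `characters` list loaded above.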
def update_selection(selected_state: gr.SelectData):
    c_title = characters[selected_state.index]["title"]
    return c_title, selected_state
def infer(prompt, input_wav_file, clean_audio, hidden_numpy_audio):
    print("\n----- NEW INFERENCE -----\n")
    if prompt == "":
        gr.Warning("Don't forget to provide a TTS prompt!")
    if clean_audio:
        print("We want to clean the audio sample")
# Extract the file name without the extension
new_name = os.path.splitext(os.path.basename(input_wav_file))[0]
print(f"FILE BASENAME is: {new_name}")
if os.path.exists(os.path.join("bark_voices", f"{new_name}_cleaned")):
print("This file has already been cleaned")
check_name = os.path.join("bark_voices", f"{new_name}_cleaned")
source_path = os.path.join(check_name, f"{new_name}_cleaned.wav")
else:
print("This file is new, we need to clean and store it")
source_path = split_process(hidden_numpy_audio, "vocals")
# Rename the file
new_path = os.path.join(os.path.dirname(source_path), f"{new_name}_cleaned.wav")
os.rename(source_path, new_path)
source_path = new_path
    else:
        print("We do NOT want to clean the audio sample")
# Path to your WAV file
source_path = input_wav_file
# Destination directory
destination_directory = "bark_voices"
# Extract the file name without the extension
file_name = os.path.splitext(os.path.basename(source_path))[0]
# Construct the full destination directory path
destination_path = os.path.join(destination_directory, file_name)
# Create the new directory
os.makedirs(destination_path, exist_ok=True)
# Move the WAV file to the new directory
shutil.move(source_path, os.path.join(destination_path, f"{file_name}.wav"))
# β€”β€”β€”β€”β€”
# Split the text into sentences based on common punctuation marks
sentences = re.split(r'(?<=[.!?])\s+', prompt)
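    # e.g. "Hi there! How are you? Great." -> ["Hi there!", "How are you?", "Great."]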
    if len(sentences) > MAX_NUMBER_SENTENCES:
        gr.Info(f"Your text is too long. To keep this demo enjoyable for everyone, we only kept the first {MAX_NUMBER_SENTENCES} sentences :) Duplicate this space and raise MAX_NUMBER_SENTENCES for longer texts ;)")
        # Keep only the first MAX_NUMBER_SENTENCES sentences
        prompt = ' '.join(sentences[:MAX_NUMBER_SENTENCES])
gr.Info("Generating audio from prompt")
tts.tts_to_file(text=prompt,
file_path="output.wav",
voice_dir="bark_voices/",
speaker=f"{file_name}")
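    # Coqui's Bark integration reads bark_voices/<speaker>/<speaker>.wav and,
    # on first use, caches a generated .npz voice embedding alongside it.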
    # Surface that cached .npz to the user; pick it out explicitly instead of
    # relying on directory listing order.
    contents = os.listdir(f"bark_voices/{file_name}")
    npz_name = next((item for item in contents if item.endswith(".npz")), None)
    for item in contents:
        print(item)
    print("Preparing final waveform video ...")
    tts_video = gr.make_waveform(audio="output.wav")
    print(tts_video)
    print("FINISHED")
    # Four return values, matching the four output components wired below
    # (cloned_out, video_out, npz_file, folder_path).
    return gr.update(value="output.wav", visible=True), tts_video, gr.update(value=f"bark_voices/{file_name}/{npz_name}", visible=True), destination_path
def infer_from_c(prompt, c_name):
    print("\n----- NEW INFERENCE -----\n")
    if prompt == "":
        gr.Warning("Don't forget to provide a TTS prompt!")
        print("Warning about empty prompt sent to user")
print(f"USING VOICE LIBRARY: {c_name}")
# Split the text into sentences based on common punctuation marks
sentences = re.split(r'(?<=[.!?])\s+', prompt)
    if len(sentences) > MAX_NUMBER_SENTENCES:
        gr.Info(f"Your text is too long. To keep this demo enjoyable for everyone, we only kept the first {MAX_NUMBER_SENTENCES} sentences :) Duplicate this space and raise MAX_NUMBER_SENTENCES for longer texts ;)")
        # Keep only the first MAX_NUMBER_SENTENCES sentences
        prompt = ' '.join(sentences[:MAX_NUMBER_SENTENCES])
    if c_name == "":
        gr.Warning("Voice character is not properly selected. Please ensure that the name of the chosen voice is specified in the Character Name input.")
        print("Warning about Voice Name sent to user")
    else:
        print(f"Generating audio from prompt with {c_name} ;)")
        tts.tts_to_file(text=prompt,
                        file_path="output.wav",
                        voice_dir="examples/library/",
                        speaker=f"{c_name}")
    print("Preparing final waveform video ...")
    tts_video = gr.make_waveform(audio="output.wav")
    print(tts_video)
    print("FINISHED")
    # Three return values: cloned audio, waveform video, and the .npz voice file.
    return gr.update(value="output.wav", visible=True), tts_video, gr.update(value=f"examples/library/{c_name}/{c_name}.npz", visible=True)
css = """
#col-container {max-width: 780px; margin-left: auto; margin-right: auto;}
a {text-decoration-line: underline; font-weight: 600;}
.mic-wrap > button {
width: 100%;
height: 60px;
font-size: 1.4em!important;
}
.record-icon.svelte-1thnwz {
display: flex;
position: relative;
margin-right: var(--size-2);
width: unset;
height: unset;
}
span.record-icon > span.dot.svelte-1thnwz {
width: 20px!important;
height: 20px!important;
}
.animate-spin {
animation: spin 1s linear infinite;
}
@keyframes spin {
from {
transform: rotate(0deg);
}
to {
transform: rotate(360deg);
}
}
#share-btn-container {
display: flex;
padding-left: 0.5rem !important;
padding-right: 0.5rem !important;
background-color: #000000;
justify-content: center;
align-items: center;
border-radius: 9999px !important;
max-width: 15rem;
height: 36px;
}
div#share-btn-container > div {
flex-direction: row;
background: black;
align-items: center;
}
#share-btn-container:hover {
background-color: #060606;
}
#share-btn {
all: initial;
color: #ffffff;
font-weight: 600;
cursor:pointer;
font-family: 'IBM Plex Sans', sans-serif;
margin-left: 0.5rem !important;
padding-top: 0.5rem !important;
padding-bottom: 0.5rem !important;
right:0;
}
#share-btn * {
all: unset;
}
#share-btn-container div:nth-child(-n+2){
width: auto !important;
min-height: 0px !important;
}
#share-btn-container .wrap {
display: none !important;
}
#share-btn-container.hidden {
display: none!important;
}
img[src*='#center'] {
display: block;
margin: auto;
}
.footer {
margin-bottom: 45px;
margin-top: 10px;
text-align: center;
border-bottom: 1px solid #e5e5e5;
}
.footer>p {
font-size: .8rem;
display: inline-block;
padding: 0 10px;
transform: translateY(10px);
background: white;
}
.dark .footer {
border-color: #303030;
}
.dark .footer>p {
background: #0b0f19;
}
.disclaimer {
text-align: left;
}
.disclaimer > p {
font-size: .8rem;
}
"""
with gr.Blocks(css=css) as demo:
with gr.Column(elem_id="col-container"):
gr.Markdown("""
<h1 style="text-align: center;">Voice Cloning Demo</h1>
""")
with gr.Row():
with gr.Column():
prompt = gr.Textbox(
label = "Text to speech prompt",
                    info = "One or two sentences at a time works best (max: 10)",
placeholder = "Hello friend! How are you today?",
elem_id = "tts-prompt"
)
with gr.Column():
                # Uploads are gated behind the ALLOW_FILE_UPLOAD env flag read
                # at the top of the file.
                audio_in = gr.Audio(
                    label="WAV voice to clone",
                    type="filepath",
                    source="upload",
                    interactive = (file_upload_available == "True")
                )
hidden_audio_numpy = gr.Audio(type="numpy", visible=False)
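                # `infer` expects a clean-audio flag as its third argument; this
                # checkbox is an assumed addition (mirroring the microphone tab)
                # so the click handler below can pass that flag.
                clean_sample = gr.Checkbox(label="Clean sample?", value=False)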
submit_btn = gr.Button("Submit")
with gr.Tab("Microphone"):
texts_samples = gr.Textbox(label = "Helpers",
info = "You can read out loud one of these sentences if you do not know what to record :)",
value = """"Jazz, a quirky mix of groovy saxophones and wailing trumpets, echoes through the vibrant city streets."
β€”β€”β€”
"A majestic orchestra plays enchanting melodies, filling the air with harmony."
β€”β€”β€”
"The exquisite aroma of freshly baked bread wafts from a cozy bakery, enticing passersby."
β€”β€”β€”
"A thunderous roar shakes the ground as a massive jet takes off into the sky, leaving trails of white behind."
β€”β€”β€”
"Laughter erupts from a park where children play, their innocent voices rising like tinkling bells."
β€”β€”β€”
"Waves crash on the beach, and seagulls caw as they soar overhead, a symphony of nature's sounds."
β€”β€”β€”
"In the distance, a blacksmith hammers red-hot metal, the rhythmic clang punctuating the day."
β€”β€”β€”
"As evening falls, a soft hush blankets the world, crickets chirping in a soothing rhythm."
""",
interactive = False,
lines = 5
)
micro_in = gr.Audio(
label="Record voice to clone",
type="filepath",
source="microphone",
interactive = True
)
                    clean_micro = gr.Checkbox(label="Clean sample?", value=False)
micro_submit_btn = gr.Button("Submit")
audio_in.upload(fn=load_hidden, inputs=[audio_in], outputs=[hidden_audio_numpy], queue=False)
micro_in.stop_recording(fn=load_hidden_mic, inputs=[micro_in], outputs=[hidden_audio_numpy], queue=False)
with gr.Column():
cloned_out = gr.Audio(
label="Text to speech output",
visible = False
)
video_out = gr.Video(
label = "Waveform video",
elem_id = "voice-video-out"
)
npz_file = gr.File(
label = ".npz file",
visible = False
)
folder_path = gr.Textbox(visible=False)
audio_in.change(fn=wipe_npz_file, inputs=[folder_path], queue=False)
micro_in.clear(fn=wipe_npz_file, inputs=[folder_path], queue=False)
        # Pass the upload tab's clean-audio flag so all four of infer's
        # parameters are supplied positionally.
        submit_btn.click(
            fn = infer,
            inputs = [
                prompt,
                audio_in,
                clean_sample,
                hidden_audio_numpy
            ],
            outputs = [
                cloned_out,
                video_out,
                npz_file,
                folder_path
            ]
        )
micro_submit_btn.click(
fn = infer,
inputs = [
prompt,
micro_in,
clean_micro,
hidden_audio_numpy
],
outputs = [
cloned_out,
video_out,
npz_file,
folder_path
]
)
demo.queue(api_open=False, max_size=10).launch()