|
import gradio as gr |
|
import os |
|
os.system("pip install -q piper-tts==1.2.0") |
|
os.system("pip install -q -r requirements_xtts.txt") |
|
os.system("pip install -q TTS==0.21.1 --no-deps") |
|
import spaces |
|
import librosa |
|
from soni_translate.logging_setup import ( |
|
logger, |
|
set_logging_level, |
|
configure_logging_libs, |
|
); configure_logging_libs() |
|
import whisperx |
|
import torch |
|
import os |
|
from soni_translate.audio_segments import create_translated_audio |
|
from soni_translate.text_to_speech import ( |
|
audio_segmentation_to_voice, |
|
edge_tts_voices_list, |
|
coqui_xtts_voices_list, |
|
piper_tts_voices_list, |
|
create_wav_file_vc, |
|
accelerate_segments, |
|
) |
|
from soni_translate.translate_segments import ( |
|
translate_text, |
|
TRANSLATION_PROCESS_OPTIONS, |
|
DOCS_TRANSLATION_PROCESS_OPTIONS |
|
) |
|
from soni_translate.preprocessor import ( |
|
audio_video_preprocessor, |
|
audio_preprocessor, |
|
) |
|
from soni_translate.postprocessor import ( |
|
OUTPUT_TYPE_OPTIONS, |
|
DOCS_OUTPUT_TYPE_OPTIONS, |
|
sound_separate, |
|
get_no_ext_filename, |
|
media_out, |
|
get_subtitle_speaker, |
|
) |
|
from soni_translate.language_configuration import ( |
|
LANGUAGES, |
|
UNIDIRECTIONAL_L_LIST, |
|
LANGUAGES_LIST, |
|
BARK_VOICES_LIST, |
|
VITS_VOICES_LIST, |
|
OPENAI_TTS_MODELS, |
|
) |
|
from soni_translate.utils import ( |
|
remove_files, |
|
download_list, |
|
upload_model_list, |
|
download_manager, |
|
run_command, |
|
is_audio_file, |
|
is_subtitle_file, |
|
copy_files, |
|
get_valid_files, |
|
get_link_list, |
|
remove_directory_contents, |
|
) |
|
from soni_translate.mdx_net import ( |
|
UVR_MODELS, |
|
MDX_DOWNLOAD_LINK, |
|
mdxnet_models_dir, |
|
) |
|
from soni_translate.speech_segmentation import ( |
|
ASR_MODEL_OPTIONS, |
|
COMPUTE_TYPE_GPU, |
|
COMPUTE_TYPE_CPU, |
|
find_whisper_models, |
|
transcribe_speech, |
|
align_speech, |
|
diarize_speech, |
|
diarization_models, |
|
) |
|
from soni_translate.text_multiformat_processor import ( |
|
BORDER_COLORS, |
|
srt_file_to_segments, |
|
document_preprocessor, |
|
determine_chunk_size, |
|
plain_text_to_segments, |
|
segments_to_plain_text, |
|
process_subtitles, |
|
linguistic_level_segments, |
|
break_aling_segments, |
|
doc_to_txtximg_pages, |
|
page_data_to_segments, |
|
update_page_data, |
|
fix_timestamps_docs, |
|
create_video_from_images, |
|
merge_video_and_audio, |
|
) |
|
from soni_translate.languages_gui import language_data, news |
|
import copy |
|
import logging |
|
import json |
|
from pydub import AudioSegment |
|
from voice_main import ClassVoices |
|
import argparse |
|
import time |
|
import hashlib |
|
import sys |
|
|
|
# Working directories (relative to the current working directory) that are
# created at import time so the rest of the module can write into them.
directories = [
    "downloads",
    "logs",
    "weights",
    "clean_song_output",
    "_XTTS_",
    f"audio2{os.sep}audio",
    "audio",
    "outputs",
]

# A plain loop instead of a list comprehension used only for its side effects;
# exist_ok=True makes the call idempotent and removes the check-then-create
# race of the previous `os.path.exists` test.
for _directory in directories:
    os.makedirs(_directory, exist_ok=True)
|
|
|
|
|
class TTS_Info:
    """Gather the voice names offered by each text-to-speech back end.

    The per-engine lists are collected once at construction time, except for
    the Coqui XTTS list, which is re-read on every call to ``tts_list``.
    """

    def __init__(self, piper_enabled, xtts_enabled):
        """Collect the static voice lists.

        Args:
            piper_enabled: When truthy, ``piper_tts_voices_list()`` is queried;
                otherwise the Piper (VITS ONNX) list is left empty.
            xtts_enabled: When truthy, ``tts_list`` queries
                ``coqui_xtts_voices_list()``.
        """
        self.list_edge = edge_tts_voices_list()
        self.list_bark = list(BARK_VOICES_LIST.keys())
        self.list_vits = list(VITS_VOICES_LIST.keys())
        self.list_openai_tts = OPENAI_TTS_MODELS
        self.piper_enabled = piper_enabled
        self.list_vits_onnx = (
            piper_tts_voices_list() if self.piper_enabled else []
        )
        self.xtts_enabled = xtts_enabled
        # Refreshed by tts_list(); defined here so the attribute exists even
        # if it is read before the first tts_list() call.
        self.list_coqui_xtts = []

    def tts_list(self):
        """Return every selectable voice name.

        XTTS voices come first in the order returned by
        ``coqui_xtts_voices_list()``; all other voices follow, sorted
        together. Bark voices are left out when the ``ZERO_GPU`` environment
        variable equals ``"TRUE"``.
        """
        self.list_coqui_xtts = (
            coqui_xtts_voices_list() if self.xtts_enabled else []
        )
        bark_voices = (
            self.list_bark if os.environ.get("ZERO_GPU") != "TRUE" else []
        )
        list_tts = self.list_coqui_xtts + sorted(
            self.list_edge
            + bark_voices
            + self.list_vits
            + self.list_openai_tts
            + self.list_vits_onnx
        )
        return list_tts
|
|
|
|
|
def prog_disp(msg, percent, is_gui, progress=None):
    """Log a progress message and mirror it to the GUI progress callback.

    Args:
        msg: Text to log and to show as the progress description.
        percent: Progress value passed through to ``progress``.
        is_gui: When truthy, ``progress`` is invoked in addition to logging.
        progress: Optional callable invoked as ``progress(percent, desc=msg)``.
    """
    logger.info(msg)
    # `progress` defaults to None, so calling it unconditionally in GUI mode
    # raised TypeError whenever the caller did not supply a callback.
    if is_gui and progress is not None:
        progress(percent, desc=msg)
|
|
|
|
|
def warn_disp(wrn_lang, is_gui):
    """Emit a warning to the log and, in GUI mode, as a Gradio warning.

    Args:
        wrn_lang: Warning text.
        is_gui: When truthy, the text is also raised through ``gr.Warning``.
    """
    logger.warning(wrn_lang)
    if not is_gui:
        return
    gr.Warning(wrn_lang)
|
|
|
|
|
class SoniTrCache:
    """Parameter-keyed cache for the ordered steps of the pipeline.

    ``cache`` maps each step name to the parameter list it last ran with;
    ``cache_data`` maps each step name to a dict of attribute values that are
    restored onto the instance when that step is found in the cache.

    Bookkeeping is deferred by one step: the parameters and data of a step are
    written when ``task_in_cache`` is called for the *following* step.
    """

    def __init__(self):
        self.cache = {
            'media': [[]],
            'refine_vocals': [],
            'transcript_align': [],
            'break_align': [],
            'diarize': [],
            'translate': [],
            'subs_and_edit': [],
            'tts': [],
            'acc_and_vc': [],
            'mix_aud': [],
            'output': []
        }

        # Every slot is read through `.items()` and reset to `{}` when
        # flushed, so the slots start out as dicts too (they used to start as
        # lists, which is inconsistent with both uses).
        self.cache_data = {step: {} for step in self.cache}

        self.cache_keys = list(self.cache.keys())
        self.first_task = self.cache_keys[0]
        self.last_task = self.cache_keys[-1]

        # Step and parameters seen by the previous task_in_cache() call.
        self.pre_step = None
        self.pre_params = []

    def set_variable(self, variable_name, value):
        """Set attribute ``variable_name`` of this instance to ``value``."""
        setattr(self, variable_name, value)

    def task_in_cache(self, step: str, params: list, previous_step_data: dict):
        """Tell whether ``step`` can be skipped because its params are cached.

        Side effects, in order:
          * if a previous step is pending, its parameters and a deep copy of
            ``previous_step_data`` are recorded under that previous step;
          * on a hit, every entry of ``cache_data[step]`` is deep-copied onto
            the instance as an attribute;
          * on a miss, ``step`` and every later step are flushed.

        Args:
            step: Name of the step about to run; must be a key of ``cache``.
            params: Parameters the step would run with.
            previous_step_data: Attribute name -> value produced by the
                previous step, stored for later restoration.

        Returns:
            True when ``params`` equal the cached parameters for ``step``,
            False otherwise.
        """
        self.pre_step_cache = None

        # Starting a new run: nothing from an earlier run is pending.
        if step == self.first_task:
            self.pre_step = None

        if self.pre_step:
            self.cache[self.pre_step] = self.pre_params
            self.cache_data[self.pre_step] = copy.deepcopy(previous_step_data)

        self.pre_params = params

        if params == self.cache[step]:
            logger.debug(f"In cache: {str(step)}")

            for key, value in self.cache_data[step].items():
                self.set_variable(key, copy.deepcopy(value))
                logger.debug(
                    f"Chache load: {str(key)}"
                )

            self.pre_step = step
            return True

        logger.debug(f"Flush next and caching {str(step)}")
        selected_index = self.cache_keys.index(step)

        # Invalidate this step and everything downstream of it.
        for key in self.cache_keys[selected_index:]:
            self.cache[key] = []
            self.cache_data[key] = {}

        self.pre_step = step
        return False

    def clear_cache(self, media, force=False):
        """Flush the whole cache when the media changed or ``force`` is set.

        Args:
            media: Identifier of the current media, compared with the first
                cached parameter of the ``"media"`` step.
            force: Flush regardless of the comparison.
        """
        # Keep the `[0]` lookup below valid after a flush emptied the slot.
        if not self.cache["media"]:
            self.cache["media"] = [[]]

        if media != self.cache["media"][0] or force:
            self.cache = {key: [] for key in self.cache}
            self.cache["media"] = [[]]
            # Drop the stored deep copies as well; once the parameters are
            # gone they can never be restored and only hold memory.
            self.cache_data = {key: {} for key in self.cache_data}

            logger.info("Cache flushed")
|
|
|
|
|
def get_hash(filepath):
    """Return the first 18 hex characters of the BLAKE2b digest of a file.

    The file is consumed in 8192-byte reads.
    """
    with open(filepath, 'rb') as stream:
        digest = hashlib.blake2b()
        for block in iter(lambda: stream.read(8192), b""):
            digest.update(block)

    return digest.hexdigest()[:18]
|
|
|
|
|
def check_openai_api_key():
    """Ensure the ``OPENAI_API_KEY`` environment variable is set.

    Raises:
        ValueError: If the variable is missing or empty.
    """
    api_key = os.environ.get("OPENAI_API_KEY")
    if api_key:
        return

    message = (
        "To use GPT for translation, please set up your OpenAI API key "
        "as an environment variable in Linux as follows: "
        "export OPENAI_API_KEY='your-api-key-here'. Or change the "
        "translation process in Advanced settings."
    )
    raise ValueError(message)
|
|
|
|
|
class SoniTranslate(SoniTrCache): |
|
def __init__(self, cpu_mode=False):
    """Pick the compute device and prepare voice conversion and TTS voices.

    Args:
        cpu_mode: Force the ``SONITR_DEVICE`` environment variable to
            ``"cpu"``; otherwise it is ``"cuda"`` when
            ``torch.cuda.is_available()`` and ``"cpu"`` if not.
    """
    super().__init__()

    if cpu_mode:
        selected_device = "cpu"
    elif torch.cuda.is_available():
        selected_device = "cuda"
    else:
        selected_device = "cpu"
    os.environ["SONITR_DEVICE"] = selected_device

    self.device = os.environ.get("SONITR_DEVICE")
    # With ZERO_GPU=TRUE the instance always reports "cuda".
    if os.environ.get("ZERO_GPU") == "TRUE":
        self.device = "cuda"

    # State carried between successive conversion calls.
    self.result_diarize = None
    self.align_language = None
    self.result_source_lang = None
    self.edit_subs_complete = False
    self.voiceless_id = None
    self.burn_subs_id = None

    self.vci = ClassVoices(only_cpu=cpu_mode)

    self.tts_voices = self.get_tts_voice_list()

    logger.info(f"Working in: {self.device}")
|
|
|
def get_tts_voice_list(self):
    """Probe the optional TTS packages and return the available voices.

    Piper and Coqui XTTS count as enabled when their import succeeds. When
    XTTS is importable, ``COQUI_TOS_AGREED`` is set to ``"1"`` in the
    environment. The resulting ``TTS_Info`` is kept on ``self.tts_info``.

    Returns:
        The list produced by ``TTS_Info.tts_list()``.
    """
    # --- Piper -----------------------------------------------------------
    try:
        from piper import PiperVoice  # noqa: F401  (import used as a probe)

        piper_enabled = True
        logger.info("PIPER TTS enabled")
    except Exception as exc:
        logger.debug(str(exc))
        piper_enabled = False
        logger.info("PIPER TTS disabled")

    # --- Coqui XTTS ------------------------------------------------------
    try:
        from TTS.api import TTS  # noqa: F401  (import used as a probe)

        xtts_enabled = True
        logger.info("Coqui XTTS enabled")
        license_notice = (
            "In this app, by using Coqui TTS (text-to-speech), you "
            "acknowledge and agree to the license.\n"
            "You confirm that you have read, understood, and agreed "
            "to the Terms and Conditions specified at the following "
            "link:\nhttps://coqui.ai/cpml.txt."
        )
        logger.info(license_notice)
        os.environ["COQUI_TOS_AGREED"] = "1"
    except Exception as exc:
        logger.debug(str(exc))
        xtts_enabled = False
        logger.info("Coqui XTTS disabled")

    self.tts_info = TTS_Info(piper_enabled, xtts_enabled)

    return self.tts_info.tts_list()
|
|
|
def batch_multilingual_media_conversion(self, *kwargs):
    """Run ``multilingual_media_conversion`` over a batch of media inputs.

    The positional arguments follow the parameter order of
    ``multilingual_media_conversion`` (without ``self``): index 0 holds the
    uploaded files, index 1 a comma-separated string of links, index 2 a
    comma-separated string of paths, indices 31 and 32 the
    ``get_translated_text`` / ``get_video_from_text_json`` flags, and the last
    one the GUI flag. Everything from index 3 on is forwarded unchanged.

    The contents of the ``outputs`` directory are removed before processing.

    Returns:
        The direct result of a single conversion when either text flag is
        set; otherwise a flat list with the outputs of every processed media.
    """
    uploaded_media = [] if kwargs[0] is None else kwargs[0]

    link_pieces = [piece.strip() for piece in kwargs[1].split(',')]
    linked_media = get_link_list(link_pieces)

    path_pieces = [piece.strip() for piece in kwargs[2].split(',')]
    local_media = get_valid_files(path_pieces)

    get_text_flag = kwargs[31]
    from_json_flag = kwargs[32]

    gui_mode = kwargs[-1]

    forwarded = kwargs[3:]

    candidates = uploaded_media + linked_media + local_media
    media_batch = [item for item in candidates if item != ""]
    if not media_batch:
        # Let the conversion itself report the missing input.
        media_batch = [None]
    logger.debug(str(media_batch))

    remove_directory_contents("outputs")

    # Subtitle editing works on a single media only.
    if get_text_flag or from_json_flag:
        return self.multilingual_media_conversion(
            media_batch[0], "", "", *forwarded
        )

    limited = (
        os.getenv("DEMO") == "SET_LIMIT" or os.getenv("ZERO_GPU") == "TRUE"
    )
    if limited:
        media_batch = media_batch[:1]

    collected = []
    several = len(media_batch) > 1
    for media in media_batch:
        produced = self.multilingual_media_conversion(
            media, "", "", *forwarded
        )

        if isinstance(produced, str):
            produced = [produced]
        collected.extend(produced)

        if gui_mode and several:
            gr.Info(f"Done: {os.path.basename(produced[0])}")

    return collected
|
|
|
def multilingual_media_conversion(
    self,
    media_file=None,
    link_media="",
    directory_input="",
    YOUR_HF_TOKEN="",
    preview=False,
    transcriber_model="large-v3",
    batch_size=4,
    compute_type="auto",
    origin_language="Automatic detection",
    target_language="English (en)",
    min_speakers=1,
    max_speakers=1,
    tts_voice00="en-US-EmmaMultilingualNeural-Female",
    tts_voice01="en-US-AndrewMultilingualNeural-Male",
    tts_voice02="en-US-AvaMultilingualNeural-Female",
    tts_voice03="en-US-BrianMultilingualNeural-Male",
    tts_voice04="de-DE-SeraphinaMultilingualNeural-Female",
    tts_voice05="de-DE-FlorianMultilingualNeural-Male",
    tts_voice06="fr-FR-VivienneMultilingualNeural-Female",
    tts_voice07="fr-FR-RemyMultilingualNeural-Male",
    tts_voice08="en-US-EmmaMultilingualNeural-Female",
    tts_voice09="en-US-AndrewMultilingualNeural-Male",
    tts_voice10="en-US-EmmaMultilingualNeural-Female",
    tts_voice11="en-US-AndrewMultilingualNeural-Male",
    video_output_name="",
    mix_method_audio="Adjusting volumes and mixing audio",
    max_accelerate_audio=2.1,
    acceleration_rate_regulation=False,
    volume_original_audio=0.25,
    volume_translated_audio=1.80,
    output_format_subtitle="srt",
    get_translated_text=False,
    get_video_from_text_json=False,
    text_json="{}",
    avoid_overlap=False,
    vocal_refinement=False,
    literalize_numbers=True,
    segment_duration_limit=15,
    diarization_model="pyannote_2.1",
    translate_process="google_translator_batch",
    subtitle_file=None,
    output_type="video (mp4)",
    voiceless_track=False,
    voice_imitation=False,
    voice_imitation_max_segments=3,
    voice_imitation_vocals_dereverb=False,
    voice_imitation_remove_previous=True,
    voice_imitation_method="freevc",
    dereverb_automatic_xtts=True,
    text_segmentation_scale="sentence",
    divide_text_segments_by="",
    soft_subtitles_to_video=True,
    burn_subtitles_to_video=False,
    enable_cache=True,
    custom_voices=False,
    custom_voices_workers=1,
    is_gui=False,
    progress=gr.Progress(),
):
    """Transcribe, translate and dub one media file (or subtitle file).

    The work is split into the steps named in ``SoniTrCache.cache``; each
    step is skipped when ``task_in_cache`` reports that it already ran with
    the same parameters.

    Args:
        media_file: Path string or object exposing ``.name``. When ``None``,
            ``directory_input`` is used if it exists on disk, else
            ``link_media``. A subtitle file given here is treated as
            ``subtitle_file``.
        link_media, directory_input: Fallback media locations (see above).
        YOUR_HF_TOKEN: Hugging Face token. When empty, the ``YOUR_HF_TOKEN``
            environment variable is used; a token is mandatory unless
            diarization is disabled or ``max_speakers == 1``.
        preview, transcriber_model, batch_size, compute_type: Forwarded to
            the media preprocessors / ``transcribe_speech``. ``compute_type``
            is forced to ``"float32"`` on CPU when not in
            ``COMPUTE_TYPE_CPU``.
        origin_language, target_language: Keys of ``LANGUAGES``.
        min_speakers, max_speakers, diarization_model: Diarization settings.
        tts_voice00 .. tts_voice11: One TTS voice per speaker slot.
        video_output_name: Base name forwarded to ``media_out``.
        mix_method_audio, volume_original_audio, volume_translated_audio:
            Control how the dubbed track is mixed with the original audio.
        max_accelerate_audio, acceleration_rate_regulation, avoid_overlap:
            Forwarded to ``accelerate_segments`` /
            ``create_translated_audio``.
        output_format_subtitle: Subtitle format, or ``"disable"``.
        get_translated_text: Stop after translation and return the segments
            as a JSON string for manual editing.
        get_video_from_text_json: Continue from a previous
            ``get_translated_text`` run using the edited ``text_json``.
        text_json: JSON list with ``"text"`` and ``"speaker"`` per segment.
        vocal_refinement, voiceless_track: Enable the UVR-based vocal
            clean-up / voice removal passes.
        literalize_numbers, segment_duration_limit: Forwarded to
            ``transcribe_speech``.
        translate_process: Forwarded to ``translate_text``.
        subtitle_file: Optional subtitle file used instead of transcription.
        output_type: Selects what is produced and returned.
        voice_imitation*, dereverb_automatic_xtts, custom_voices,
            custom_voices_workers: Voice conversion options.
        text_segmentation_scale, divide_text_segments_by: Segment splitting.
        soft_subtitles_to_video, burn_subtitles_to_video: Subtitle embedding.
        enable_cache: When false the step cache is flushed first.
        is_gui, progress: GUI flag and progress callback for ``prog_disp``.

    Returns:
        Depending on the options: a JSON string (``get_translated_text``), a
        list of file names (sound separation, ``"subtitle"``), or whatever
        ``media_out`` / ``get_subtitle_speaker`` return for the chosen
        ``output_type``.

    Raises:
        ValueError: Missing token or input, unsupported source language,
            incompatible option combinations, or no speech found.
        RuntimeError: Demo limits exceeded (``IS_DEMO=TRUE``).
        Exception: A subtitle file was given with automatic language
            detection.
    """
    # ------------------------------------------------------------------
    # Argument validation and normalisation
    # ------------------------------------------------------------------
    if not YOUR_HF_TOKEN:
        YOUR_HF_TOKEN = os.getenv("YOUR_HF_TOKEN")
        if diarization_model == "disable" or max_speakers == 1:
            # No diarization: an absent token is fine, but keep it a str
            # because it is sliced later on.
            if YOUR_HF_TOKEN is None:
                YOUR_HF_TOKEN = ""
        elif not YOUR_HF_TOKEN:
            raise ValueError("No valid Hugging Face token")
    else:
        # An explicitly supplied token is exported for later use.
        os.environ["YOUR_HF_TOKEN"] = YOUR_HF_TOKEN

    if (
        "gpt" in translate_process
        or transcriber_model == "OpenAI_API_Whisper"
        or "OpenAI-TTS" in tts_voice00
    ):
        check_openai_api_key()

    if media_file is None:
        media_file = (
            directory_input
            if os.path.exists(directory_input)
            else link_media
        )
    media_file = (
        media_file if isinstance(media_file, str) else media_file.name
    )

    if is_subtitle_file(media_file):
        subtitle_file = media_file
        media_file = ""

    if media_file is None:
        media_file = ""

    if not origin_language:
        origin_language = "Automatic detection"

    if origin_language in UNIDIRECTIONAL_L_LIST and not subtitle_file:
        raise ValueError(
            f"The language '{origin_language}' "
            "is not supported for transcription (ASR)."
        )

    if get_translated_text:
        self.edit_subs_complete = False
    if get_video_from_text_json:
        if not self.edit_subs_complete:
            raise ValueError("Generate the transcription first.")

    if (
        ("sound" in output_type or output_type == "raw media")
        and (get_translated_text or get_video_from_text_json)
    ):
        raise ValueError(
            "Please disable 'edit generate subtitles' "
            f"first to acquire the {output_type}."
        )

    TRANSLATE_AUDIO_TO = LANGUAGES[target_language]
    SOURCE_LANGUAGE = LANGUAGES[origin_language]

    if (
        transcriber_model == "OpenAI_API_Whisper"
        and SOURCE_LANGUAGE == "zh-TW"
    ):
        logger.warning(
            "OpenAI API Whisper only supports Chinese (Simplified)."
        )
        SOURCE_LANGUAGE = "zh"

    if (
        text_segmentation_scale in ["word", "character"]
        and "subtitle" not in output_type
    ):
        wrn_lang = (
            "Text segmentation by words or characters is typically"
            " used for generating subtitles. If subtitles are not the"
            " intended output, consider selecting 'sentence' "
            "segmentation method to ensure optimal results."
        )
        warn_disp(wrn_lang, is_gui)

    if tts_voice00[:2].lower() != TRANSLATE_AUDIO_TO[:2].lower():
        wrn_lang = (
            "Make sure to select a 'TTS Speaker' suitable for"
            " the translation language to avoid errors with the TTS."
        )
        warn_disp(wrn_lang, is_gui)

    if "_XTTS_" in tts_voice00 and voice_imitation:
        wrn_lang = (
            "When you select XTTS, it is advisable "
            "to disable Voice Imitation."
        )
        warn_disp(wrn_lang, is_gui)

    if custom_voices and voice_imitation:
        wrn_lang = (
            "When you use R.V.C. models, it is advisable"
            " to disable Voice Imitation."
        )
        warn_disp(wrn_lang, is_gui)

    if not media_file and not subtitle_file:
        raise ValueError(
            "Specifify a media or SRT file in advanced settings"
        )

    if subtitle_file:
        subtitle_file = (
            subtitle_file
            if isinstance(subtitle_file, str)
            else subtitle_file.name
        )

    if subtitle_file and SOURCE_LANGUAGE == "Automatic detection":
        raise Exception(
            "To use an SRT file, you need to specify its "
            "original language (Source language)"
        )

    if not media_file and subtitle_file:
        # Subtitle-only input: synthesise a silent track long enough to
        # hold every segment so the rest of the pipeline has audio.
        diarization_model = "disable"
        media_file = "audio_support.wav"
        if not get_video_from_text_json:
            remove_files(media_file)
            srt_data = srt_file_to_segments(subtitle_file)
            total_duration = srt_data["segments"][-1]["end"] + 30.
            support_audio = AudioSegment.silent(
                duration=int(total_duration * 1000)
            )
            support_audio.export(
                media_file, format="wav"
            )
            logger.info("Supporting audio for the SRT file, created.")

    if "SET_LIMIT" == os.getenv("DEMO"):
        preview = True
        mix_method_audio = "Adjusting volumes and mixing audio"
        transcriber_model = "medium"
        logger.info(
            "DEMO; set preview=True; Generation is limited to "
            "10 seconds to prevent CPU errors. No limitations with GPU.\n"
            "DEMO; set Adjusting volumes and mixing audio\n"
            "DEMO; set whisper model to medium"
        )

    if self.device == "cpu" and compute_type not in COMPUTE_TYPE_CPU:
        logger.info("Compute type changed to float32")
        compute_type = "float32"

    # Fixed working-file names (relative to the current directory).
    base_video_file = "Video.mp4"
    base_audio_wav = "audio.wav"
    dub_audio_file = "audio_dub_solo.ogg"
    vocals_audio_file = "audio_Vocals_DeReverb.wav"
    voiceless_audio_file = "audio_Voiceless.wav"
    mix_audio_file = "audio_mix.mp3"
    vid_subs = "video_subs_file.mp4"
    video_output_file = "video_dub.mp4"

    # The cache is keyed on the file content when the media is a local
    # file, otherwise on the string itself.
    if os.path.exists(media_file):
        media_base_hash = get_hash(media_file)
    else:
        media_base_hash = media_file
    self.clear_cache(media_base_hash, force=(not enable_cache))

    # ------------------------------------------------------------------
    # Media -> transcription -> diarization -> translation
    # (skipped entirely when continuing from an edited JSON)
    # ------------------------------------------------------------------
    if not get_video_from_text_json:
        self.result_diarize = (
            self.align_language
        ) = self.result_source_lang = None
        if not self.task_in_cache("media", [media_base_hash, preview], {}):
            if is_audio_file(media_file):
                prog_disp(
                    "Processing audio...", 0.15, is_gui, progress=progress
                )
                audio_preprocessor(preview, media_file, base_audio_wav)
            else:
                prog_disp(
                    "Processing video...", 0.15, is_gui, progress=progress
                )
                audio_video_preprocessor(
                    preview, media_file, base_video_file, base_audio_wav
                )
            logger.debug("Set file complete.")

        if "sound" in output_type:
            prog_disp(
                "Separating sounds in the file...",
                0.50,
                is_gui,
                progress=progress
            )
            separate_out = sound_separate(base_audio_wav, output_type)
            final_outputs = []
            for out in separate_out:
                final_name = media_out(
                    media_file,
                    f"{get_no_ext_filename(out)}",
                    video_output_name,
                    "wav",
                    file_obj=out,
                )
                final_outputs.append(final_name)
            logger.info(f"Done: {str(final_outputs)}")
            return final_outputs

        if output_type == "raw media":
            output = media_out(
                media_file,
                "raw_media",
                video_output_name,
                "wav" if is_audio_file(media_file) else "mp4",
                file_obj=base_audio_wav if is_audio_file(media_file) else base_video_file,
            )
            logger.info(f"Done: {output}")
            return output

        if os.environ.get("IS_DEMO") == "TRUE":
            duration_verify = librosa.get_duration(filename=base_audio_wav)
            logger.info(f"Duration: {duration_verify} seconds")
            if duration_verify > 1500:
                raise RuntimeError(
                    "The audio is too long to process in this demo. Alternatively, you"
                    " can install the app locally or use the Colab notebook available "
                    "in the SoniTranslate repository."
                )
            elif duration_verify > 300:
                tts_voices_list = [
                    tts_voice00, tts_voice01, tts_voice02, tts_voice03, tts_voice04,
                    tts_voice05, tts_voice06, tts_voice07, tts_voice08, tts_voice09,
                    tts_voice10, tts_voice11
                ]

                if any("_XTTS_" in tts_voice_ for tts_voice_ in tts_voices_list):
                    raise RuntimeError(
                        "XTTS is too slow to be used for audio longer than 5 "
                        "minutes in this demo. Alternatively, you can install "
                        "the app locally or use the Colab notebook available in"
                        " the SoniTranslate repository."
                    )

        if not self.task_in_cache("refine_vocals", [vocal_refinement], {}):
            self.vocals = None
            if vocal_refinement:
                # Best effort: on failure the unrefined audio is used.
                try:
                    from soni_translate.mdx_net import process_uvr_task
                    _, _, _, _, file_vocals = process_uvr_task(
                        orig_song_path=base_audio_wav,
                        main_vocals=False,
                        dereverb=True,
                        remove_files_output_dir=True,
                    )
                    remove_files(vocals_audio_file)
                    copy_files(file_vocals, ".")
                    self.vocals = vocals_audio_file
                except Exception as error:
                    logger.error(str(error))

        if not self.task_in_cache("transcript_align", [
            subtitle_file,
            SOURCE_LANGUAGE,
            transcriber_model,
            compute_type,
            batch_size,
            literalize_numbers,
            segment_duration_limit,
            (
                "l_unit"
                if text_segmentation_scale in ["word", "character"]
                and subtitle_file
                else "sentence"
            )
        ], {"vocals": self.vocals}):
            if subtitle_file:
                prog_disp(
                    "From SRT file...", 0.30, is_gui, progress=progress
                )
                audio = whisperx.load_audio(
                    base_audio_wav if not self.vocals else self.vocals
                )
                self.result = srt_file_to_segments(subtitle_file)
                self.result["language"] = SOURCE_LANGUAGE
            else:
                prog_disp(
                    "Transcribing...", 0.30, is_gui, progress=progress
                )
                SOURCE_LANGUAGE = (
                    None
                    if SOURCE_LANGUAGE == "Automatic detection"
                    else SOURCE_LANGUAGE
                )
                audio, self.result = transcribe_speech(
                    base_audio_wav if not self.vocals else self.vocals,
                    transcriber_model,
                    compute_type,
                    batch_size,
                    SOURCE_LANGUAGE,
                    literalize_numbers,
                    segment_duration_limit,
                )
            logger.debug(
                "Transcript complete, "
                f"segments count {len(self.result['segments'])}"
            )

            self.align_language = self.result["language"]
            if (
                not subtitle_file
                or text_segmentation_scale in ["word", "character"]
            ):
                prog_disp("Aligning...", 0.45, is_gui, progress=progress)
                # Best effort: on failure the unaligned result is kept.
                try:
                    if self.align_language in ["vi"]:
                        logger.info(
                            "Deficient alignment for the "
                            f"{self.align_language} language, skipping the"
                            " process. It is suggested to reduce the "
                            "duration of the segments as an alternative."
                        )
                    else:
                        self.result = align_speech(audio, self.result)
                        logger.debug(
                            "Align complete, "
                            f"segments count {len(self.result['segments'])}"
                        )
                except Exception as error:
                    logger.error(str(error))

            if self.result["segments"] == []:
                raise ValueError("No active speech found in audio")

        if not self.task_in_cache("break_align", [
            divide_text_segments_by,
            text_segmentation_scale,
            self.align_language
        ], {
            "result": self.result,
            "align_language": self.align_language
        }):
            if self.align_language in ["ja", "zh", "zh-TW"]:
                divide_text_segments_by += "|!|?|...|。"
            if text_segmentation_scale in ["word", "character"]:
                self.result = linguistic_level_segments(
                    self.result,
                    text_segmentation_scale,
                )
            elif divide_text_segments_by:
                try:
                    self.result = break_aling_segments(
                        self.result,
                        break_characters=divide_text_segments_by,
                    )
                except Exception as error:
                    logger.error(str(error))

        if not self.task_in_cache("diarize", [
            min_speakers,
            max_speakers,
            # Only half of the token takes part in the cache key.
            YOUR_HF_TOKEN[:len(YOUR_HF_TOKEN)//2],
            diarization_model
        ], {
            "result": self.result
        }):
            prog_disp("Diarizing...", 0.60, is_gui, progress=progress)
            diarize_model_select = diarization_models[diarization_model]
            self.result_diarize = diarize_speech(
                base_audio_wav if not self.vocals else self.vocals,
                self.result,
                min_speakers,
                max_speakers,
                YOUR_HF_TOKEN,
                diarize_model_select,
            )
            logger.debug("Diarize complete")
        # Keep an untranslated copy for the source-language subtitles.
        self.result_source_lang = copy.deepcopy(self.result_diarize)

        if not self.task_in_cache("translate", [
            TRANSLATE_AUDIO_TO,
            translate_process
        ], {
            "result_diarize": self.result_diarize
        }):
            prog_disp("Translating...", 0.70, is_gui, progress=progress)
            lang_source = (
                self.align_language
                if self.align_language
                else SOURCE_LANGUAGE
            )
            self.result_diarize["segments"] = translate_text(
                self.result_diarize["segments"],
                TRANSLATE_AUDIO_TO,
                translate_process,
                chunk_size=1800,
                source=lang_source,
            )
            logger.debug("Translation complete")
            logger.debug(self.result_diarize)

    # ------------------------------------------------------------------
    # Optional manual editing round-trip
    # ------------------------------------------------------------------
    if get_translated_text:
        json_data = []
        for segment in self.result_diarize["segments"]:
            start = segment["start"]
            text = segment["text"]
            # "SPEAKER_00" -> 1, "SPEAKER_01" -> 2, ...
            speaker = int(segment.get("speaker", "SPEAKER_00")[-2:]) + 1
            json_data.append(
                {"start": start, "text": text, "speaker": speaker}
            )

        # ensure_ascii=False keeps non-ASCII text readable while leaving
        # quotes, backslashes and control characters escaped. The previous
        # `.encode().decode("unicode_escape")` round trip un-escaped those
        # too (and split non-BMP characters into lone surrogates), which
        # produced JSON that `json.loads` rejects on the way back in.
        json_string = json.dumps(json_data, indent=2, ensure_ascii=False)
        logger.info("Done")
        self.edit_subs_complete = True
        return json_string

    if get_video_from_text_json:
        if self.result_diarize is None:
            raise ValueError("Generate the transcription first.")

        # Entries are matched to the stored segments by position.
        text_json_loaded = json.loads(text_json)
        for i, segment in enumerate(self.result_diarize["segments"]):
            segment["text"] = text_json_loaded[i]["text"]
            segment["speaker"] = "SPEAKER_{:02d}".format(
                int(text_json_loaded[i]["speaker"]) - 1
            )

    # ------------------------------------------------------------------
    # Subtitles
    # ------------------------------------------------------------------
    if not self.task_in_cache("subs_and_edit", [
        copy.deepcopy(self.result_diarize),
        output_format_subtitle,
        TRANSLATE_AUDIO_TO
    ], {
        "result_diarize": self.result_diarize
    }):
        if output_format_subtitle == "disable":
            self.sub_file = "sub_tra.srt"
        elif output_format_subtitle != "ass":
            self.sub_file = process_subtitles(
                self.result_source_lang,
                self.align_language,
                self.result_diarize,
                output_format_subtitle,
                TRANSLATE_AUDIO_TO,
            )

        # SRT files are always generated, whatever format was requested.
        if output_format_subtitle != "srt":
            _ = process_subtitles(
                self.result_source_lang,
                self.align_language,
                self.result_diarize,
                "srt",
                TRANSLATE_AUDIO_TO,
            )

        if output_format_subtitle == "ass":
            convert_ori = "ffmpeg -i sub_ori.srt sub_ori.ass -y"
            convert_tra = "ffmpeg -i sub_tra.srt sub_tra.ass -y"
            self.sub_file = "sub_tra.ass"
            run_command(convert_ori)
            run_command(convert_tra)

    format_sub = (
        output_format_subtitle
        if output_format_subtitle != "disable"
        else "srt"
    )

    if output_type == "subtitle":
        out_subs = []
        tra_subs = media_out(
            media_file,
            TRANSLATE_AUDIO_TO,
            video_output_name,
            format_sub,
            file_obj=self.sub_file,
        )
        out_subs.append(tra_subs)

        ori_subs = media_out(
            media_file,
            self.align_language,
            video_output_name,
            format_sub,
            file_obj=f"sub_ori.{format_sub}",
        )
        out_subs.append(ori_subs)
        logger.info(f"Done: {out_subs}")
        return out_subs

    if output_type == "subtitle [by speaker]":
        output = get_subtitle_speaker(
            media_file,
            result=self.result_diarize,
            language=TRANSLATE_AUDIO_TO,
            extension=format_sub,
            base_name=video_output_name,
        )
        logger.info(f"Done: {str(output)}")
        return output

    if "video [subtitled]" in output_type:
        output = media_out(
            media_file,
            TRANSLATE_AUDIO_TO + "_subtitled",
            video_output_name,
            "wav" if is_audio_file(media_file) else (
                "mkv" if "mkv" in output_type else "mp4"
            ),
            file_obj=base_audio_wav if is_audio_file(media_file) else base_video_file,
            soft_subtitles=False if is_audio_file(media_file) else True,
            subtitle_files=output_format_subtitle,
        )
        msg_out = output[0] if isinstance(output, list) else output
        logger.info(f"Done: {msg_out}")
        return output

    # ------------------------------------------------------------------
    # Speech synthesis and voice conversion
    # ------------------------------------------------------------------
    if not self.task_in_cache("tts", [
        TRANSLATE_AUDIO_TO,
        tts_voice00,
        tts_voice01,
        tts_voice02,
        tts_voice03,
        tts_voice04,
        tts_voice05,
        tts_voice06,
        tts_voice07,
        tts_voice08,
        tts_voice09,
        tts_voice10,
        tts_voice11,
        dereverb_automatic_xtts
    ], {
        "sub_file": self.sub_file
    }):
        prog_disp("Text to speech...", 0.80, is_gui, progress=progress)
        self.valid_speakers = audio_segmentation_to_voice(
            self.result_diarize,
            TRANSLATE_AUDIO_TO,
            is_gui,
            tts_voice00,
            tts_voice01,
            tts_voice02,
            tts_voice03,
            tts_voice04,
            tts_voice05,
            tts_voice06,
            tts_voice07,
            tts_voice08,
            tts_voice09,
            tts_voice10,
            tts_voice11,
            dereverb_automatic_xtts,
        )

    if not self.task_in_cache("acc_and_vc", [
        max_accelerate_audio,
        acceleration_rate_regulation,
        voice_imitation,
        voice_imitation_max_segments,
        voice_imitation_remove_previous,
        voice_imitation_vocals_dereverb,
        voice_imitation_method,
        custom_voices,
        custom_voices_workers,
        copy.deepcopy(self.vci.model_config),
        avoid_overlap
    ], {
        "valid_speakers": self.valid_speakers
    }):
        audio_files, speakers_list = accelerate_segments(
            self.result_diarize,
            max_accelerate_audio,
            self.valid_speakers,
            acceleration_rate_regulation,
        )

        if voice_imitation:
            prog_disp(
                "Voice Imitation...", 0.85, is_gui, progress=progress
            )
            from soni_translate.text_to_speech import toneconverter

            # Best effort: a failure leaves the plain TTS audio in place.
            try:
                toneconverter(
                    copy.deepcopy(self.result_diarize),
                    voice_imitation_max_segments,
                    voice_imitation_remove_previous,
                    voice_imitation_vocals_dereverb,
                    voice_imitation_method,
                )
            except Exception as error:
                logger.error(str(error))

        if custom_voices:
            prog_disp(
                "Applying customized voices...",
                0.90,
                is_gui,
                progress=progress,
            )

            try:
                self.vci(
                    audio_files,
                    speakers_list,
                    overwrite=True,
                    parallel_workers=custom_voices_workers,
                )
                self.vci.unload_models()
            except Exception as error:
                logger.error(str(error))

        prog_disp(
            "Creating final translated video...",
            0.95,
            is_gui,
            progress=progress,
        )
        remove_files(dub_audio_file)
        create_translated_audio(
            self.result_diarize,
            audio_files,
            dub_audio_file,
            False,
            avoid_overlap,
        )

    # ------------------------------------------------------------------
    # Mixing
    # ------------------------------------------------------------------
    hash_base_audio_wav = get_hash(base_audio_wav)
    if voiceless_track:
        # The voiceless track is only rebuilt when the base audio changed.
        if self.voiceless_id != hash_base_audio_wav:
            from soni_translate.mdx_net import process_uvr_task

            try:
                remove_files(voiceless_audio_file)
                uvr_voiceless_audio_wav, _ = process_uvr_task(
                    orig_song_path=base_audio_wav,
                    song_id="voiceless",
                    only_voiceless=True,
                    remove_files_output_dir=False,
                )
                copy_files(uvr_voiceless_audio_wav, ".")
                base_audio_wav = voiceless_audio_file
                self.voiceless_id = hash_base_audio_wav

            except Exception as error:
                logger.error(str(error))
        else:
            base_audio_wav = voiceless_audio_file

    if not self.task_in_cache("mix_aud", [
        mix_method_audio,
        volume_original_audio,
        volume_translated_audio,
        voiceless_track
    ], {}):
        remove_files(mix_audio_file)
        command_volume_mix = f'ffmpeg -y -i {base_audio_wav} -i {dub_audio_file} -filter_complex "[0:0]volume={volume_original_audio}[a];[1:0]volume={volume_translated_audio}[b];[a][b]amix=inputs=2:duration=longest" -c:a libmp3lame {mix_audio_file}'
        command_background_mix = f'ffmpeg -i {base_audio_wav} -i {dub_audio_file} -filter_complex "[1:a]asplit=2[sc][mix];[0:a][sc]sidechaincompress=threshold=0.003:ratio=20[bg]; [bg][mix]amerge[final]" -map [final] {mix_audio_file}'
        if mix_method_audio == "Adjusting volumes and mixing audio":
            run_command(command_volume_mix)
        else:
            # Fall back to the plain volume mix if the side-chain mix fails.
            try:
                run_command(command_background_mix)
            except Exception as error_mix:
                logger.error(str(error_mix))
                run_command(command_volume_mix)

    if "audio" in output_type or is_audio_file(media_file):
        output = media_out(
            media_file,
            TRANSLATE_AUDIO_TO,
            video_output_name,
            "wav" if "wav" in output_type else (
                "ogg" if "ogg" in output_type else "mp3"
            ),
            file_obj=mix_audio_file,
            subtitle_files=output_format_subtitle,
        )
        msg_out = output[0] if isinstance(output, list) else output
        logger.info(f"Done: {msg_out}")
        return output

    # ------------------------------------------------------------------
    # Video output
    # ------------------------------------------------------------------
    hash_base_video_file = get_hash(base_video_file)

    if burn_subtitles_to_video:
        hashvideo_text = [
            hash_base_video_file,
            [seg["text"] for seg in self.result_diarize["segments"]]
        ]
        # Re-burn only when the video or the subtitle text changed.
        if self.burn_subs_id != hashvideo_text:
            try:
                logger.info("Burn subtitles")
                remove_files(vid_subs)
                command = f"ffmpeg -i {base_video_file} -y -vf subtitles=sub_tra.srt -max_muxing_queue_size 9999 {vid_subs}"
                run_command(command)
                base_video_file = vid_subs
                self.burn_subs_id = hashvideo_text
            except Exception as error:
                logger.error(str(error))
        else:
            base_video_file = vid_subs

    if not self.task_in_cache("output", [
        hash_base_video_file,
        hash_base_audio_wav,
        burn_subtitles_to_video
    ], {}):
        remove_files(video_output_file)
        run_command(
            f"ffmpeg -i {base_video_file} -i {mix_audio_file} -c:v copy -c:a copy -map 0:v -map 1:a -shortest {video_output_file}"
        )

    output = media_out(
        media_file,
        TRANSLATE_AUDIO_TO,
        video_output_name,
        "mkv" if "mkv" in output_type else "mp4",
        file_obj=video_output_file,
        soft_subtitles=soft_subtitles_to_video,
        subtitle_files=output_format_subtitle,
    )
    msg_out = output[0] if isinstance(output, list) else output
    logger.info(f"Done: {msg_out}")

    return output
|
|
|
def hook_beta_processor(
    self,
    document,
    tgt_lang,
    translate_process,
    ori_lang,
    tts,
    name_final_file,
    custom_voices,
    custom_voices_workers,
    output_type,
    chunk_size,
    width,
    height,
    start_page,
    end_page,
    bcolor,
    is_gui,
    progress
):
    """Build a narrated video ("videobook") from a document.

    The stages run strictly in this order: page extraction
    (``doc_to_txtximg_pages``), translation (``translate_text``),
    text-to-speech (``audio_segmentation_to_voice``), optional voice
    conversion through ``self.vci``, audio assembly into
    ``audio_book.wav``, video creation from the page data, and a final
    merge that is packaged by ``media_out``.

    Args:
        document: Document handed to ``doc_to_txtximg_pages`` and used by
            ``media_out`` as the reference input.
        tgt_lang: Target language passed to the translator and the TTS.
        translate_process: Translation backend identifier.
        ori_lang: Source language passed to the translator as ``source``.
        tts: TTS voice; also used to pick a chunk size when none is given.
        name_final_file: Output name forwarded to ``media_out``.
        custom_voices: When truthy, ``self.vci`` is applied to the
            generated audio files and its models are unloaded afterwards.
        custom_voices_workers: ``parallel_workers`` value for ``self.vci``.
        output_type: Output selector; "mkv" is produced when the string
            contains "mkv", otherwise "mp4".
        chunk_size: TTS chunk size; a falsy value falls back to
            ``determine_chunk_size(tts)``.
        width, height, start_page, end_page, bcolor: Forwarded unchanged
            to ``doc_to_txtximg_pages``.
        is_gui, progress: Forwarded to ``prog_disp`` for progress display.

    Returns:
        Whatever ``media_out`` returns for the merged video.
    """

    def notify(message, fraction):
        # Every stage reports progress with the same GUI arguments.
        prog_disp(message, fraction, is_gui, progress=progress)

    notify("Processing pages...", 0.10)
    pages = doc_to_txtximg_pages(
        document, width, height, start_page, end_page, bcolor
    )
    # First segmentation pass uses a fixed size of 1700 for translation.
    speech_map = page_data_to_segments(pages, 1700)

    notify("Translating...", 0.20)
    translated_segments = translate_text(
        speech_map["segments"],
        tgt_lang,
        translate_process,
        chunk_size=0,
        source=ori_lang,
    )
    speech_map["segments"] = translated_segments
    # A falsy chunk size means "let the TTS engine decide".
    tts_chunk_size = chunk_size or determine_chunk_size(tts)
    pages = update_page_data(speech_map, pages)

    notify("Text to speech...", 0.30)
    # Second segmentation pass, now sized for the TTS engine.
    speech_map = page_data_to_segments(pages, tts_chunk_size)
    valid_speakers = audio_segmentation_to_voice(
        speech_map,
        tgt_lang,
        is_gui,
        tts,
    )

    # Rate 1.0 is passed through to accelerate_segments as-is.
    audio_files, speakers_list = accelerate_segments(
        speech_map,
        1.0,
        valid_speakers,
    )

    if custom_voices:
        notify("Applying customized voices...", 0.60)
        self.vci(
            audio_files,
            speakers_list,
            overwrite=True,
            parallel_workers=custom_voices_workers,
        )
        self.vci.unload_models()

    speech_map = fix_timestamps_docs(speech_map, audio_files)
    final_wav_file = "audio_book.wav"
    # Drop any leftover file from a previous run before writing a new one.
    remove_files(final_wav_file)

    notify("Creating audio file...", 0.70)
    create_translated_audio(speech_map, audio_files, final_wav_file, False)

    notify("Creating video file...", 0.80)
    video_doc = create_video_from_images(pages, speech_map)

    notify("Merging...", 0.90)
    vid_out = merge_video_and_audio(video_doc, final_wav_file)

    if "mkv" in output_type:
        container = "mkv"
    else:
        container = "mp4"
    output = media_out(
        document,
        tgt_lang,
        name_final_file,
        container,
        file_obj=vid_out,
    )
    logger.info(f"Done: {output}")
    return output
|
|
|
def multilingual_docs_conversion(
    self,
    string_text="",
    document=None,
    directory_input="",
    origin_language="English (en)",
    target_language="English (en)",
    tts_voice00="en-US-EmmaMultilingualNeural-Female",
    name_final_file="",
    translate_process="google_translator",
    output_type="audio",
    chunk_size=None,
    custom_voices=False,
    custom_voices_workers=1,
    start_page=1,
    end_page=99999,
    width=1280,
    height=720,
    bcolor="dynamic",
    is_gui=False,
    progress=gr.Progress(),
):
    """Translate a document or raw text and export it as text, audio or video.

    Input resolution: ``document`` wins when it is not ``None``; otherwise
    ``directory_input`` is used if that path exists on disk; otherwise
    ``string_text`` is treated as the content itself.

    Args:
        string_text: Raw text input; also passed to ``self.clear_cache``.
        document: Path string or an object exposing ``.name``.
        directory_input: Filesystem path to a document.
        origin_language, target_language: Keys of ``LANGUAGES``.
        tts_voice00: TTS voice used for the narration.
        name_final_file: Output name forwarded to ``media_out``.
        translate_process: Translation backend; the value
            "disable_translation" keeps the source language.
        output_type: Output selector ("videobook", "book (txt)", or an
            audio type containing "mp3"/"ogg"; anything else yields wav).
        chunk_size: TTS chunk size; falsy falls back to
            ``determine_chunk_size(tts_voice00)``.
        custom_voices, custom_voices_workers: Optional ``self.vci`` pass.
        start_page, end_page: Page range for document preprocessing.
        width, height, bcolor: Videobook rendering options.
        is_gui, progress: Forwarded to ``prog_disp``.

    Returns:
        The videobook result from ``hook_beta_processor``, the
        preprocessed text path (txt output without translation), or the
        value returned by ``media_out``.

    Raises:
        Exception: When no input data could be resolved.
        RuntimeError: When ``IS_DEMO`` is "TRUE" and the input is a file.
        ValueError: When a videobook is requested for a non-PDF document.
    """

    def notify(message, fraction):
        # Every stage reports progress with the same GUI arguments.
        prog_disp(message, fraction, is_gui, progress=progress)

    if "gpt" in translate_process:
        check_openai_api_key()

    source_code = LANGUAGES[origin_language]
    translation_enabled = translate_process != "disable_translation"
    if translation_enabled:
        target_code = LANGUAGES[target_language]
    else:
        target_code = source_code
        logger.info("No translation")
    # Compare the two-letter prefixes of the voice and the target language.
    if tts_voice00[:2].lower() != target_code[:2].lower():
        logger.debug(
            "Make sure to select a 'TTS Speaker' suitable for the "
            "translation language to avoid errors with the TTS."
        )

    self.clear_cache(string_text, force=True)

    # Resolve which of the three possible inputs to use.
    is_string = False
    if document is None:
        is_string = not os.path.exists(directory_input)
        document = string_text if is_string else directory_input
    if not isinstance(document, str):
        # Non-string inputs are expected to expose their path as ``.name``.
        document = document.name
    if not document:
        raise Exception("No data found")

    if os.environ.get("IS_DEMO") == "TRUE" and not is_string:
        raise RuntimeError(
            "This option is disabled in this demo. "
            "Alternatively, you can install "
            "the app locally or use the Colab notebook available in"
            " the SoniTranslate repository."
        )

    if "videobook" in output_type:
        if not document.lower().endswith(".pdf"):
            raise ValueError(
                "Videobooks are only compatible with PDF files."
            )
        # Videobooks are delegated entirely to the page-based pipeline.
        return self.hook_beta_processor(
            document,
            target_code,
            translate_process,
            source_code,
            tts_voice00,
            name_final_file,
            custom_voices,
            custom_voices_workers,
            output_type,
            chunk_size,
            width,
            height,
            start_page,
            end_page,
            bcolor,
            is_gui,
            progress
        )

    notify("Processing text...", 0.15)
    result_file_path, result_text = document_preprocessor(
        document, is_string, start_page, end_page
    )
    # Reference handed to media_out: the generated text file for raw-string
    # input, otherwise the original document path.
    media_reference = result_file_path if is_string else document
    wants_txt = output_type == "book (txt)"

    if wants_txt and not translation_enabled:
        return result_file_path

    if os.getenv("DEMO") == "SET_LIMIT":
        result_text = result_text[:50]
        logger.info(
            "DEMO; Generation is limited to 50 characters to prevent "
            "CPU errors. No limitations with GPU.\n"
        )

    if translation_enabled:
        # Segment with a fixed size of 1700 for the translator.
        segments_data = plain_text_to_segments(result_text, 1700)
        notify("Translating...", 0.30)
        segments_data["segments"] = translate_text(
            segments_data["segments"],
            target_code,
            translate_process,
            chunk_size=0,
            source=source_code,
        )
        txt_file_path, result_text = segments_to_plain_text(segments_data)

        if wants_txt:
            return media_out(
                media_reference,
                target_code,
                name_final_file,
                "txt",
                file_obj=txt_file_path,
            )

    # Re-segment the (possibly translated) text at the TTS chunk size.
    tts_chunk_size = chunk_size or determine_chunk_size(tts_voice00)
    segments_data = plain_text_to_segments(result_text, tts_chunk_size)
    logger.debug(segments_data)

    notify("Text to speech...", 0.45)
    valid_speakers = audio_segmentation_to_voice(
        segments_data,
        target_code,
        is_gui,
        tts_voice00,
    )

    # Rate 1.0 is passed through to accelerate_segments as-is.
    audio_files, speakers_list = accelerate_segments(
        segments_data,
        1.0,
        valid_speakers,
    )

    if custom_voices:
        notify("Applying customized voices...", 0.80)
        self.vci(
            audio_files,
            speakers_list,
            overwrite=True,
            parallel_workers=custom_voices_workers,
        )
        self.vci.unload_models()

    notify("Creating final audio file...", 0.90)
    final_wav_file = "audio_book.wav"
    # Drop any leftover file from a previous run before writing a new one.
    remove_files(final_wav_file)
    create_translated_audio(
        segments_data, audio_files, final_wav_file, True
    )

    # mp3 takes precedence over ogg; wav is the fallback.
    if "mp3" in output_type:
        audio_ext = "mp3"
    elif "ogg" in output_type:
        audio_ext = "ogg"
    else:
        audio_ext = "wav"

    output = media_out(
        media_reference,
        target_code,
        name_final_file,
        audio_ext,
        file_obj=final_wav_file,
    )
    logger.info(f"Done: {output}")
    return output
|
|
|
|
|
# HTML banner shown through gr.Markdown as the first element of the interface.
title = (
    "<center><strong><font size='7'>"
    "📽️ SoniTranslate 🈷️"
    "</font></strong></center>"
)
|
|
|
|
|
def create_gui(theme, logs_in_gui=False): |
|
with gr.Blocks(theme=theme) as app: |
|
gr.Markdown(title) |
|
gr.Markdown(lg_conf["description"]) |
|
|
|
if os.environ.get("ZERO_GPU") == "TRUE": |
|
gr.Markdown( |
|
""" |
|
|
|
<details> |
|
<summary style="font-size: 1.5em;">⚠️ Important ⚠️</summary> |
|
<ul> |
|
<li>🚀 This demo uses a zero GPU setup only for the transcription and diarization process. Everything else runs on the CPU. It is recommended to use videos no longer than 15 minutes. ⏳</li> |
|
<li>❗ If you see `queue` when using this, it means another user is currently using it, and you need to wait until they are finished.</li> |
|
<li>🔒 Some functions are disabled, but if you duplicate this with a GPU and set the value in secrets "ZERO_GPU" to FALSE, you can use the app with full GPU acceleration. ⚡</li> |
|
</ul> |
|
</details> |
|
""" |
|
) |
|
|
|
with gr.Tab(lg_conf["tab_translate"]): |
|
with gr.Row(): |
|
with gr.Column(): |
|
input_data_type = gr.Dropdown( |
|
["SUBMIT VIDEO", "URL", "Find Video Path"], |
|
value="SUBMIT VIDEO", |
|
label=lg_conf["video_source"], |
|
) |
|
|
|
def swap_visibility(data_type):
    """Show only the input widget that matches the selected media source.

    Returns a 3-tuple of ``gr.update`` objects, one per output of the
    ``input_data_type.change`` handler (file upload, URL box, path box).
    Every widget is reset when the mode changes: the upload to ``None``,
    the text boxes to an empty string. An unknown ``data_type`` returns
    ``None``.
    """
    # (mode label, value the matching widget is reset to), in output order.
    widgets = (
        ("SUBMIT VIDEO", None),
        ("URL", ""),
        ("Find Video Path", ""),
    )
    if all(data_type != mode for mode, _ in widgets):
        return None
    return tuple(
        gr.update(visible=(data_type == mode), value=reset_value)
        for mode, reset_value in widgets
    )
|
|
|
video_input = gr.File( |
|
label="VIDEO", |
|
file_count="multiple", |
|
type="filepath", |
|
) |
|
blink_input = gr.Textbox( |
|
visible=False, |
|
label=lg_conf["link_label"], |
|
info=lg_conf["link_info"], |
|
placeholder=lg_conf["link_ph"], |
|
) |
|
directory_input = gr.Textbox( |
|
visible=False, |
|
label=lg_conf["dir_label"], |
|
info=lg_conf["dir_info"], |
|
placeholder=lg_conf["dir_ph"], |
|
) |
|
input_data_type.change( |
|
fn=swap_visibility, |
|
inputs=input_data_type, |
|
outputs=[video_input, blink_input, directory_input], |
|
) |
|
|
|
gr.HTML() |
|
|
|
SOURCE_LANGUAGE = gr.Dropdown( |
|
LANGUAGES_LIST, |
|
value=LANGUAGES_LIST[0], |
|
label=lg_conf["sl_label"], |
|
info=lg_conf["sl_info"], |
|
) |
|
TRANSLATE_AUDIO_TO = gr.Dropdown( |
|
LANGUAGES_LIST[1:], |
|
value="English (en)", |
|
label=lg_conf["tat_label"], |
|
info=lg_conf["tat_info"], |
|
) |
|
|
|
gr.HTML("<hr></h2>") |
|
|
|
gr.Markdown(lg_conf["num_speakers"]) |
|
MAX_TTS = 12 |
|
min_speakers = gr.Slider( |
|
1, |
|
MAX_TTS, |
|
value=1, |
|
label=lg_conf["min_sk"], |
|
step=1, |
|
visible=False, |
|
) |
|
max_speakers = gr.Slider( |
|
1, |
|
MAX_TTS, |
|
value=2, |
|
step=1, |
|
label=lg_conf["max_sk"], |
|
) |
|
gr.Markdown(lg_conf["tts_select"]) |
|
|
|
def submit(value):
    """Reveal the first ``value`` speaker dropdowns and hide the rest.

    Returns one ``gr.update`` per TTS slot (``MAX_TTS`` in total), in slot
    order; slot ``i`` is visible exactly when ``i < value``.
    """
    return [gr.update(visible=slot < value) for slot in range(MAX_TTS)]
|
|
|
tts_voice00 = gr.Dropdown( |
|
SoniTr.tts_info.tts_list(), |
|
value="en-US-EmmaMultilingualNeural-Female", |
|
label=lg_conf["sk1"], |
|
visible=True, |
|
interactive=True, |
|
) |
|
tts_voice01 = gr.Dropdown( |
|
SoniTr.tts_info.tts_list(), |
|
value="en-US-AndrewMultilingualNeural-Male", |
|
label=lg_conf["sk2"], |
|
visible=True, |
|
interactive=True, |
|
) |
|
tts_voice02 = gr.Dropdown( |
|
SoniTr.tts_info.tts_list(), |
|
value="en-US-AvaMultilingualNeural-Female", |
|
label=lg_conf["sk3"], |
|
visible=False, |
|
interactive=True, |
|
) |
|
tts_voice03 = gr.Dropdown( |
|
SoniTr.tts_info.tts_list(), |
|
value="en-US-BrianMultilingualNeural-Male", |
|
label=lg_conf["sk4"], |
|
visible=False, |
|
interactive=True, |
|
) |
|
# Fifth speaker slot. It previously reused lg_conf["sk4"], which is already
# the label of tts_voice03, so two dropdowns showed the same caption.
# NOTE(review): assumes lg_conf defines "sk5" like its sk4/sk6 neighbours.
tts_voice04 = gr.Dropdown(
    SoniTr.tts_info.tts_list(),
    value="de-DE-SeraphinaMultilingualNeural-Female",
    label=lg_conf["sk5"],
    visible=False,
    interactive=True,
)
|
tts_voice05 = gr.Dropdown( |
|
SoniTr.tts_info.tts_list(), |
|
value="de-DE-FlorianMultilingualNeural-Male", |
|
label=lg_conf["sk6"], |
|
visible=False, |
|
interactive=True, |
|
) |
|
tts_voice06 = gr.Dropdown( |
|
SoniTr.tts_info.tts_list(), |
|
value="fr-FR-VivienneMultilingualNeural-Female", |
|
label=lg_conf["sk7"], |
|
visible=False, |
|
interactive=True, |
|
) |
|
tts_voice07 = gr.Dropdown( |
|
SoniTr.tts_info.tts_list(), |
|
value="fr-FR-RemyMultilingualNeural-Male", |
|
label=lg_conf["sk8"], |
|
visible=False, |
|
interactive=True, |
|
) |
|
tts_voice08 = gr.Dropdown( |
|
SoniTr.tts_info.tts_list(), |
|
value="en-US-EmmaMultilingualNeural-Female", |
|
label=lg_conf["sk9"], |
|
visible=False, |
|
interactive=True, |
|
) |
|
tts_voice09 = gr.Dropdown( |
|
SoniTr.tts_info.tts_list(), |
|
value="en-US-AndrewMultilingualNeural-Male", |
|
label=lg_conf["sk10"], |
|
visible=False, |
|
interactive=True, |
|
) |
|
tts_voice10 = gr.Dropdown( |
|
SoniTr.tts_info.tts_list(), |
|
value="en-US-EmmaMultilingualNeural-Female", |
|
label=lg_conf["sk11"], |
|
visible=False, |
|
interactive=True, |
|
) |
|
tts_voice11 = gr.Dropdown( |
|
SoniTr.tts_info.tts_list(), |
|
value="en-US-AndrewMultilingualNeural-Male", |
|
label=lg_conf["sk12"], |
|
visible=False, |
|
interactive=True, |
|
) |
|
max_speakers.change( |
|
submit, |
|
max_speakers, |
|
[ |
|
tts_voice00, |
|
tts_voice01, |
|
tts_voice02, |
|
tts_voice03, |
|
tts_voice04, |
|
tts_voice05, |
|
tts_voice06, |
|
tts_voice07, |
|
tts_voice08, |
|
tts_voice09, |
|
tts_voice10, |
|
tts_voice11, |
|
], |
|
) |
|
|
|
with gr.Column(): |
|
with gr.Accordion( |
|
lg_conf["vc_title"], |
|
open=False, |
|
): |
|
gr.Markdown(lg_conf["vc_subtitle"]) |
|
voice_imitation_gui = gr.Checkbox( |
|
False, |
|
label=lg_conf["vc_active_label"], |
|
info=lg_conf["vc_active_info"], |
|
) |
|
openvoice_models = ["openvoice", "openvoice_v2"] |
|
voice_imitation_method_options = ( |
|
["freevc"] + openvoice_models |
|
if SoniTr.tts_info.xtts_enabled |
|
else openvoice_models |
|
) |
|
voice_imitation_method_gui = gr.Dropdown( |
|
voice_imitation_method_options, |
|
value=voice_imitation_method_options[0], |
|
label=lg_conf["vc_method_label"], |
|
info=lg_conf["vc_method_info"], |
|
) |
|
voice_imitation_max_segments_gui = gr.Slider( |
|
label=lg_conf["vc_segments_label"], |
|
info=lg_conf["vc_segments_info"], |
|
value=3, |
|
step=1, |
|
minimum=1, |
|
maximum=10, |
|
visible=True, |
|
interactive=True, |
|
) |
|
voice_imitation_vocals_dereverb_gui = gr.Checkbox( |
|
False, |
|
label=lg_conf["vc_dereverb_label"], |
|
info=lg_conf["vc_dereverb_info"], |
|
) |
|
voice_imitation_remove_previous_gui = gr.Checkbox( |
|
True, |
|
label=lg_conf["vc_remove_label"], |
|
info=lg_conf["vc_remove_info"], |
|
) |
|
|
|
if SoniTr.tts_info.xtts_enabled: |
|
with gr.Column(): |
|
with gr.Accordion( |
|
lg_conf["xtts_title"], |
|
open=False, |
|
): |
|
gr.Markdown(lg_conf["xtts_subtitle"]) |
|
wav_speaker_file = gr.File( |
|
label=lg_conf["xtts_file_label"] |
|
) |
|
wav_speaker_name = gr.Textbox( |
|
label=lg_conf["xtts_name_label"], |
|
value="", |
|
info=lg_conf["xtts_name_info"], |
|
placeholder="default_name", |
|
lines=1, |
|
) |
|
wav_speaker_start = gr.Number( |
|
label="Time audio start", |
|
value=0, |
|
visible=False, |
|
) |
|
wav_speaker_end = gr.Number( |
|
label="Time audio end", |
|
value=0, |
|
visible=False, |
|
) |
|
wav_speaker_dir = gr.Textbox( |
|
label="Directory save", |
|
value="_XTTS_", |
|
visible=False, |
|
) |
|
wav_speaker_dereverb = gr.Checkbox( |
|
True, |
|
label=lg_conf["xtts_dereverb_label"], |
|
info=lg_conf["xtts_dereverb_info"] |
|
) |
|
wav_speaker_output = gr.HTML() |
|
create_xtts_wav = gr.Button( |
|
lg_conf["xtts_button"] |
|
) |
|
gr.Markdown(lg_conf["xtts_footer"]) |
|
else: |
|
wav_speaker_dereverb = gr.Checkbox( |
|
False, |
|
label=lg_conf["xtts_dereverb_label"], |
|
info=lg_conf["xtts_dereverb_info"], |
|
visible=False |
|
) |
|
|
|
with gr.Column(): |
|
with gr.Accordion( |
|
lg_conf["extra_setting"], open=False |
|
): |
|
audio_accelerate = gr.Slider( |
|
label=lg_conf["acc_max_label"], |
|
value=1.9, |
|
step=0.1, |
|
minimum=1.0, |
|
maximum=2.5, |
|
visible=True, |
|
interactive=True, |
|
info=lg_conf["acc_max_info"], |
|
) |
|
acceleration_rate_regulation_gui = gr.Checkbox( |
|
False, |
|
label=lg_conf["acc_rate_label"], |
|
info=lg_conf["acc_rate_info"], |
|
) |
|
avoid_overlap_gui = gr.Checkbox( |
|
False, |
|
label=lg_conf["or_label"], |
|
info=lg_conf["or_info"], |
|
) |
|
|
|
gr.HTML("<hr></h2>") |
|
|
|
audio_mix_options = [ |
|
"Mixing audio with sidechain compression", |
|
"Adjusting volumes and mixing audio", |
|
] |
|
AUDIO_MIX = gr.Dropdown( |
|
audio_mix_options, |
|
value=audio_mix_options[1], |
|
label=lg_conf["aud_mix_label"], |
|
info=lg_conf["aud_mix_info"], |
|
) |
|
volume_original_mix = gr.Slider( |
|
label=lg_conf["vol_ori"], |
|
info="for Adjusting volumes and mixing audio", |
|
value=0.25, |
|
step=0.05, |
|
minimum=0.0, |
|
maximum=2.50, |
|
visible=True, |
|
interactive=True, |
|
) |
|
volume_translated_mix = gr.Slider( |
|
label=lg_conf["vol_tra"], |
|
info="for Adjusting volumes and mixing audio", |
|
value=1.80, |
|
step=0.05, |
|
minimum=0.0, |
|
maximum=2.50, |
|
visible=True, |
|
interactive=True, |
|
) |
|
main_voiceless_track = gr.Checkbox( |
|
label=lg_conf["voiceless_tk_label"], |
|
info=lg_conf["voiceless_tk_info"], |
|
) |
|
|
|
gr.HTML("<hr></h2>") |
|
sub_type_options = [ |
|
"disable", |
|
"srt", |
|
"vtt", |
|
"ass", |
|
"txt", |
|
"tsv", |
|
"json", |
|
"aud", |
|
] |
|
|
|
sub_type_output = gr.Dropdown( |
|
sub_type_options, |
|
value=sub_type_options[1], |
|
label=lg_conf["sub_type"], |
|
) |
|
soft_subtitles_to_video_gui = gr.Checkbox( |
|
label=lg_conf["soft_subs_label"], |
|
info=lg_conf["soft_subs_info"], |
|
) |
|
burn_subtitles_to_video_gui = gr.Checkbox( |
|
label=lg_conf["burn_subs_label"], |
|
info=lg_conf["burn_subs_info"], |
|
) |
|
|
|
gr.HTML("<hr></h2>") |
|
gr.Markdown(lg_conf["whisper_title"]) |
|
literalize_numbers_gui = gr.Checkbox( |
|
True, |
|
label=lg_conf["lnum_label"], |
|
info=lg_conf["lnum_info"], |
|
) |
|
vocal_refinement_gui = gr.Checkbox( |
|
False, |
|
label=lg_conf["scle_label"], |
|
info=lg_conf["scle_info"], |
|
) |
|
segment_duration_limit_gui = gr.Slider( |
|
label=lg_conf["sd_limit_label"], |
|
info=lg_conf["sd_limit_info"], |
|
value=15, |
|
step=1, |
|
minimum=1, |
|
maximum=30, |
|
) |
|
whisper_model_default = ( |
|
"large-v3" |
|
if SoniTr.device == "cuda" |
|
else "medium" |
|
) |
|
|
|
WHISPER_MODEL_SIZE = gr.Dropdown( |
|
ASR_MODEL_OPTIONS + find_whisper_models(), |
|
value=whisper_model_default, |
|
label="Whisper ASR model", |
|
info=lg_conf["asr_model_info"], |
|
allow_custom_value=True, |
|
) |
|
com_t_opt, com_t_default = ( |
|
[COMPUTE_TYPE_GPU, "float16"] |
|
if SoniTr.device == "cuda" |
|
else [COMPUTE_TYPE_CPU, "float32"] |
|
) |
|
compute_type = gr.Dropdown( |
|
com_t_opt, |
|
value=com_t_default, |
|
label=lg_conf["ctype_label"], |
|
info=lg_conf["ctype_info"], |
|
) |
|
batch_size_value = 8 if os.environ.get("ZERO_GPU") != "TRUE" else 32 |
|
batch_size = gr.Slider( |
|
minimum=1, |
|
maximum=32, |
|
value=batch_size_value, |
|
label=lg_conf["batchz_label"], |
|
info=lg_conf["batchz_info"], |
|
step=1, |
|
) |
|
input_srt = gr.File( |
|
label=lg_conf["srt_file_label"], |
|
file_types=[".srt", ".ass", ".vtt"], |
|
height=130, |
|
) |
|
|
|
gr.HTML("<hr></h2>") |
|
text_segmentation_options = [ |
|
"sentence", |
|
"word", |
|
"character" |
|
] |
|
text_segmentation_scale_gui = gr.Dropdown( |
|
text_segmentation_options, |
|
value=text_segmentation_options[0], |
|
label=lg_conf["tsscale_label"], |
|
info=lg_conf["tsscale_info"], |
|
) |
|
divide_text_segments_by_gui = gr.Textbox( |
|
label=lg_conf["divide_text_label"], |
|
value="", |
|
info=lg_conf["divide_text_info"], |
|
) |
|
|
|
gr.HTML("<hr></h2>") |
|
pyannote_models_list = list( |
|
diarization_models.keys() |
|
) |
|
diarization_process_dropdown = gr.Dropdown( |
|
pyannote_models_list, |
|
value=pyannote_models_list[1], |
|
label=lg_conf["diarization_label"], |
|
) |
|
translate_process_dropdown = gr.Dropdown( |
|
TRANSLATION_PROCESS_OPTIONS, |
|
value=TRANSLATION_PROCESS_OPTIONS[0], |
|
label=lg_conf["tr_process_label"], |
|
) |
|
|
|
gr.HTML("<hr></h2>") |
|
main_output_type = gr.Dropdown( |
|
OUTPUT_TYPE_OPTIONS, |
|
value=OUTPUT_TYPE_OPTIONS[0], |
|
label=lg_conf["out_type_label"], |
|
) |
|
VIDEO_OUTPUT_NAME = gr.Textbox( |
|
label=lg_conf["out_name_label"], |
|
value="", |
|
info=lg_conf["out_name_info"], |
|
) |
|
play_sound_gui = gr.Checkbox( |
|
True, |
|
label=lg_conf["task_sound_label"], |
|
info=lg_conf["task_sound_info"], |
|
) |
|
enable_cache_gui = gr.Checkbox( |
|
True, |
|
label=lg_conf["cache_label"], |
|
info=lg_conf["cache_info"], |
|
) |
|
PREVIEW = gr.Checkbox( |
|
label="Preview", info=lg_conf["preview_info"] |
|
) |
|
is_gui_dummy_check = gr.Checkbox( |
|
True, visible=False |
|
) |
|
|
|
with gr.Column(variant="compact"): |
|
edit_sub_check = gr.Checkbox( |
|
label=lg_conf["edit_sub_label"], |
|
info=lg_conf["edit_sub_info"], |
|
interactive=False, |
|
) |
|
dummy_false_check = gr.Checkbox( |
|
False, |
|
visible=False, |
|
) |
|
|
|
def visible_component_subs(input_bool):
    """Show or hide the subtitle button and editor together.

    Returns a pair of ``gr.update`` objects whose ``visible`` flag is
    ``True`` when ``input_bool`` is truthy and ``False`` otherwise.
    """
    shown = True if input_bool else False
    return gr.update(visible=shown), gr.update(visible=shown)
|
|
|
subs_button = gr.Button( |
|
lg_conf["button_subs"], |
|
variant="primary", |
|
visible=False, |
|
) |
|
subs_edit_space = gr.Textbox( |
|
visible=False, |
|
lines=10, |
|
label=lg_conf["editor_sub_label"], |
|
info=lg_conf["editor_sub_info"], |
|
placeholder=lg_conf["editor_sub_ph"], |
|
) |
|
edit_sub_check.change( |
|
visible_component_subs, |
|
[edit_sub_check], |
|
[subs_button, subs_edit_space], |
|
) |
|
|
|
with gr.Row(): |
|
video_button = gr.Button( |
|
lg_conf["button_translate"], |
|
variant="primary", |
|
) |
|
with gr.Row(): |
|
video_output = gr.File( |
|
label=lg_conf["output_result_label"], |
|
file_count="multiple", |
|
interactive=False, |
|
|
|
) |
|
|
|
gr.HTML("<hr></h2>") |
|
|
|
if ( |
|
os.getenv("YOUR_HF_TOKEN") is None |
|
or os.getenv("YOUR_HF_TOKEN") == "" |
|
): |
|
HFKEY = gr.Textbox( |
|
visible=True, |
|
label="HF Token", |
|
info=lg_conf["ht_token_info"], |
|
placeholder=lg_conf["ht_token_ph"], |
|
) |
|
else: |
|
HFKEY = gr.Textbox( |
|
visible=False, |
|
label="HF Token", |
|
info=lg_conf["ht_token_info"], |
|
placeholder=lg_conf["ht_token_ph"], |
|
) |
|
|
|
gr.Examples( |
|
examples=[ |
|
[ |
|
["./assets/Video_main.mp4"], |
|
"", |
|
"", |
|
"", |
|
False, |
|
whisper_model_default, |
|
batch_size_value, |
|
com_t_default, |
|
"Spanish (es)", |
|
"English (en)", |
|
1, |
|
2, |
|
"en-US-EmmaMultilingualNeural-Female", |
|
"en-US-AndrewMultilingualNeural-Male", |
|
], |
|
], |
|
fn=SoniTr.batch_multilingual_media_conversion, |
|
inputs=[ |
|
video_input, |
|
blink_input, |
|
directory_input, |
|
HFKEY, |
|
PREVIEW, |
|
WHISPER_MODEL_SIZE, |
|
batch_size, |
|
compute_type, |
|
SOURCE_LANGUAGE, |
|
TRANSLATE_AUDIO_TO, |
|
min_speakers, |
|
max_speakers, |
|
tts_voice00, |
|
tts_voice01, |
|
], |
|
outputs=[video_output], |
|
cache_examples=False, |
|
) |
|
|
|
with gr.Tab(lg_conf["tab_docs"]): |
|
with gr.Column(): |
|
with gr.Accordion("Docs", open=True): |
|
with gr.Column(variant="compact"): |
|
with gr.Column(): |
|
input_doc_type = gr.Dropdown( |
|
[ |
|
"WRITE TEXT", |
|
"SUBMIT DOCUMENT", |
|
"Find Document Path", |
|
], |
|
value="SUBMIT DOCUMENT", |
|
label=lg_conf["docs_input_label"], |
|
info=lg_conf["docs_input_info"], |
|
) |
|
|
|
def swap_visibility(data_type):
    """Show only the input widget that matches the selected document source.

    Returns a 3-tuple of ``gr.update`` objects, one per output of the
    ``input_doc_type.change`` handler (text box, file upload, path box).
    Every widget is reset when the mode changes: the text boxes to an
    empty string, the upload to ``None``. An unknown ``data_type``
    returns ``None``.
    """
    # (mode label, value the matching widget is reset to), in output order.
    widgets = (
        ("WRITE TEXT", ""),
        ("SUBMIT DOCUMENT", None),
        ("Find Document Path", ""),
    )
    if all(data_type != mode for mode, _ in widgets):
        return None
    return tuple(
        gr.update(visible=(data_type == mode), value=reset_value)
        for mode, reset_value in widgets
    )
|
|
|
text_docs = gr.Textbox( |
|
label="Text", |
|
value="This is an example", |
|
info="Write a text", |
|
placeholder="...", |
|
lines=5, |
|
visible=False, |
|
) |
|
input_docs = gr.File( |
|
label="Document", visible=True |
|
) |
|
directory_input_docs = gr.Textbox( |
|
visible=False, |
|
label="Document Path", |
|
info="Example: /home/my_doc.pdf", |
|
placeholder="Path goes here...", |
|
) |
|
input_doc_type.change( |
|
fn=swap_visibility, |
|
inputs=input_doc_type, |
|
outputs=[ |
|
text_docs, |
|
input_docs, |
|
directory_input_docs, |
|
], |
|
) |
|
|
|
gr.HTML() |
|
|
|
tts_documents = gr.Dropdown( |
|
list( |
|
filter( |
|
lambda x: x != "_XTTS_/AUTOMATIC.wav", |
|
SoniTr.tts_info.tts_list(), |
|
) |
|
), |
|
value="en-US-EmmaMultilingualNeural-Female", |
|
label="TTS", |
|
visible=True, |
|
interactive=True, |
|
) |
|
|
|
gr.HTML() |
|
|
|
docs_SOURCE_LANGUAGE = gr.Dropdown( |
|
LANGUAGES_LIST[1:], |
|
value="English (en)", |
|
label=lg_conf["sl_label"], |
|
info=lg_conf["docs_source_info"], |
|
) |
|
docs_TRANSLATE_TO = gr.Dropdown( |
|
LANGUAGES_LIST[1:], |
|
value="English (en)", |
|
label=lg_conf["tat_label"], |
|
info=lg_conf["tat_info"], |
|
) |
|
|
|
with gr.Column(): |
|
with gr.Accordion( |
|
lg_conf["extra_setting"], open=False |
|
): |
|
docs_translate_process_dropdown = gr.Dropdown( |
|
DOCS_TRANSLATION_PROCESS_OPTIONS, |
|
value=DOCS_TRANSLATION_PROCESS_OPTIONS[ |
|
0 |
|
], |
|
label="Translation process", |
|
) |
|
|
|
gr.HTML("<hr></h2>") |
|
|
|
docs_output_type = gr.Dropdown( |
|
DOCS_OUTPUT_TYPE_OPTIONS, |
|
value=DOCS_OUTPUT_TYPE_OPTIONS[2], |
|
label="Output type", |
|
) |
|
docs_OUTPUT_NAME = gr.Textbox( |
|
label="Final file name", |
|
value="", |
|
info=lg_conf["out_name_info"], |
|
) |
|
docs_chunk_size = gr.Number( |
|
label=lg_conf["chunk_size_label"], |
|
value=0, |
|
visible=True, |
|
interactive=True, |
|
info=lg_conf["chunk_size_info"], |
|
) |
|
gr.HTML("<hr></h2>") |
|
start_page_gui = gr.Number( |
|
step=1, |
|
value=1, |
|
minimum=1, |
|
maximum=99999, |
|
label="Start page", |
|
) |
|
end_page_gui = gr.Number( |
|
step=1, |
|
value=99999, |
|
minimum=1, |
|
maximum=99999, |
|
label="End page", |
|
) |
|
gr.HTML("<hr>Videobook config</h2>") |
|
videobook_width_gui = gr.Number( |
|
step=1, |
|
value=1280, |
|
minimum=100, |
|
maximum=4096, |
|
label="Width", |
|
) |
|
videobook_height_gui = gr.Number( |
|
step=1, |
|
value=720, |
|
minimum=100, |
|
maximum=4096, |
|
label="Height", |
|
) |
|
videobook_bcolor_gui = gr.Dropdown( |
|
BORDER_COLORS, |
|
value=BORDER_COLORS[0], |
|
label="Border color", |
|
) |
|
docs_dummy_check = gr.Checkbox( |
|
True, visible=False |
|
) |
|
|
|
with gr.Row(): |
|
docs_button = gr.Button( |
|
lg_conf["docs_button"], |
|
variant="primary", |
|
) |
|
with gr.Row(): |
|
docs_output = gr.File( |
|
label="Result", |
|
interactive=False, |
|
) |
|
|
|
with gr.Tab("Custom voice R.V.C. (Optional)"): |
|
|
|
with gr.Column(): |
|
with gr.Accordion("Get the R.V.C. Models", open=True): |
|
url_links = gr.Textbox( |
|
label="URLs", |
|
value="", |
|
info=lg_conf["cv_url_info"], |
|
placeholder="urls here...", |
|
lines=1, |
|
) |
|
download_finish = gr.HTML() |
|
download_button = gr.Button("DOWNLOAD MODELS") |
|
|
|
def update_models():
    """Refresh the choices of every model and index dropdown.

    Calls ``upload_model_list()`` once and returns ``MAX_TTS + 1`` model
    updates followed by ``MAX_TTS + 1`` index updates. Index dropdowns
    are also cleared (``value=None``).
    """
    models_path, index_path = upload_model_list()
    slots = range(MAX_TTS + 1)
    model_updates = [gr.update(choices=models_path) for _ in slots]
    index_updates = [
        gr.update(choices=index_path, value=None) for _ in slots
    ]
    return model_updates + index_updates
|
|
|
with gr.Column(): |
|
with gr.Accordion(lg_conf["replace_title"], open=False): |
|
with gr.Column(variant="compact"): |
|
with gr.Column(): |
|
gr.Markdown(lg_conf["sec1_title"]) |
|
enable_custom_voice = gr.Checkbox( |
|
False, |
|
label="ENABLE", |
|
info=lg_conf["enable_replace"] |
|
) |
|
workers_custom_voice = gr.Number( |
|
step=1, |
|
value=1, |
|
minimum=1, |
|
maximum=50, |
|
label="workers", |
|
visible=False, |
|
) |
|
|
|
gr.Markdown(lg_conf["sec2_title"]) |
|
gr.Markdown(lg_conf["sec2_subtitle"]) |
|
|
|
PITCH_ALGO_OPT = [ |
|
"pm", |
|
"harvest", |
|
"crepe", |
|
"rmvpe", |
|
"rmvpe+", |
|
] |
|
|
|
def model_conf():
    """Create a "Model" dropdown whose choices come from ``models_path``."""
    options = {
        "label": "Model",
        "visible": True,
        "interactive": True,
    }
    return gr.Dropdown(models_path, **options)
|
|
|
def pitch_algo_conf():
    """Create the "Pitch algorithm" dropdown.

    Choices are ``PITCH_ALGO_OPT``; the entry at index 3 is preselected.
    """
    default_algorithm = PITCH_ALGO_OPT[3]
    options = {
        "value": default_algorithm,
        "label": "Pitch algorithm",
        "visible": True,
        "interactive": True,
    }
    return gr.Dropdown(PITCH_ALGO_OPT, **options)
|
|
|
def pitch_lvl_conf():
    """Create the "Pitch level" slider: integer steps from -24 to 24, default 0."""
    options = {
        "label": "Pitch level",
        "minimum": -24,
        "maximum": 24,
        "step": 1,
        "value": 0,
        "visible": True,
        "interactive": True,
    }
    return gr.Slider(**options)
|
|
|
def index_conf():
    """Create an "Index" dropdown over ``index_path`` with nothing selected."""
    options = {
        "value": None,
        "label": "Index",
        "visible": True,
        "interactive": True,
    }
    return gr.Dropdown(index_path, **options)
|
|
|
def index_inf_conf():
    """Create the "Index influence" slider: range 0 to 1, default 0.75."""
    options = {
        "minimum": 0,
        "maximum": 1,
        "label": "Index influence",
        "value": 0.75,
    }
    return gr.Slider(**options)
|
|
|
def respiration_filter_conf():
    """Create the "Respiration median filtering" slider: 0 to 7, step 1, default 3."""
    options = {
        "minimum": 0,
        "maximum": 7,
        "label": "Respiration median filtering",
        "value": 3,
        "step": 1,
        "interactive": True,
    }
    return gr.Slider(**options)
|
|
|
def envelope_ratio_conf():
    """Create the "Envelope ratio" slider: range 0 to 1, default 0.25."""
    options = {
        "minimum": 0,
        "maximum": 1,
        "label": "Envelope ratio",
        "value": 0.25,
        "interactive": True,
    }
    return gr.Slider(**options)
|
|
|
def consonant_protec_conf():
    """Create the "Consonant breath protection" slider: range 0 to 0.5, default 0.5."""
    options = {
        "minimum": 0,
        "maximum": 0.5,
        "label": "Consonant breath protection",
        "value": 0.5,
        "interactive": True,
    }
    return gr.Slider(**options)
|
|
|
def button_conf(tts_name):
    """Create the primary "apply" button for one speaker.

    The caption is the localized ``cv_button_apply`` text followed by a
    space and ``tts_name``.
    """
    caption = " ".join((lg_conf["cv_button_apply"], tts_name))
    return gr.Button(caption, variant="primary")
|
|
|
TTS_TABS = [ |
|
'TTS Speaker {:02d}'.format(i) for i in range(1, MAX_TTS+1) |
|
] |
|
|
|
CV_SUBTITLES = [ |
|
lg_conf["cv_tts1"], |
|
lg_conf["cv_tts2"], |
|
lg_conf["cv_tts3"], |
|
lg_conf["cv_tts4"], |
|
lg_conf["cv_tts5"], |
|
lg_conf["cv_tts6"], |
|
lg_conf["cv_tts7"], |
|
lg_conf["cv_tts8"], |
|
lg_conf["cv_tts9"], |
|
lg_conf["cv_tts10"], |
|
lg_conf["cv_tts11"], |
|
lg_conf["cv_tts12"], |
|
] |
|
|
|
configs_storage = [] |
|
|
|
for i in range(MAX_TTS): |
|
with gr.Accordion(CV_SUBTITLES[i], open=False): |
|
gr.Markdown(TTS_TABS[i]) |
|
with gr.Column(): |
|
tag_gui = gr.Textbox( |
|
value=TTS_TABS[i], visible=False |
|
) |
|
model_gui = model_conf() |
|
pitch_algo_gui = pitch_algo_conf() |
|
pitch_lvl_gui = pitch_lvl_conf() |
|
index_gui = index_conf() |
|
index_inf_gui = index_inf_conf() |
|
rmf_gui = respiration_filter_conf() |
|
er_gui = envelope_ratio_conf() |
|
cbp_gui = consonant_protec_conf() |
|
|
|
with gr.Row(variant="compact"): |
|
button_config = button_conf( |
|
TTS_TABS[i] |
|
) |
|
|
|
confirm_conf = gr.HTML() |
|
|
|
button_config.click( |
|
SoniTr.vci.apply_conf, |
|
inputs=[ |
|
tag_gui, |
|
model_gui, |
|
pitch_algo_gui, |
|
pitch_lvl_gui, |
|
index_gui, |
|
index_inf_gui, |
|
rmf_gui, |
|
er_gui, |
|
cbp_gui, |
|
], |
|
outputs=[confirm_conf], |
|
) |
|
|
|
configs_storage.append({ |
|
"tag": tag_gui, |
|
"model": model_gui, |
|
"index": index_gui, |
|
}) |
|
|
|
with gr.Column(): |
|
with gr.Accordion("Test R.V.C.", open=False): |
|
with gr.Row(variant="compact"): |
|
text_test = gr.Textbox( |
|
label="Text", |
|
value="This is an example", |
|
info="write a text", |
|
placeholder="...", |
|
lines=5, |
|
) |
|
with gr.Column(): |
|
tts_test = gr.Dropdown( |
|
sorted(SoniTr.tts_info.list_edge), |
|
value="en-GB-ThomasNeural-Male", |
|
label="TTS", |
|
visible=True, |
|
interactive=True, |
|
) |
|
model_test = model_conf() |
|
index_test = index_conf() |
|
pitch_test = pitch_lvl_conf() |
|
pitch_alg_test = pitch_algo_conf() |
|
with gr.Row(variant="compact"): |
|
button_test = gr.Button("Test audio") |
|
|
|
with gr.Column(): |
|
with gr.Row(): |
|
original_ttsvoice = gr.Audio() |
|
ttsvoice = gr.Audio() |
|
|
|
button_test.click( |
|
SoniTr.vci.make_test, |
|
inputs=[ |
|
text_test, |
|
tts_test, |
|
model_test, |
|
index_test, |
|
pitch_test, |
|
pitch_alg_test, |
|
], |
|
outputs=[ttsvoice, original_ttsvoice], |
|
) |
|
|
|
download_button.click( |
|
download_list, |
|
[url_links], |
|
[download_finish], |
|
queue=False |
|
).then( |
|
update_models, |
|
[], |
|
[ |
|
elem["model"] for elem in configs_storage |
|
] + [model_test] + [ |
|
elem["index"] for elem in configs_storage |
|
] + [index_test], |
|
) |
|
|
|
with gr.Tab(lg_conf["tab_help"]): |
|
gr.Markdown(lg_conf["tutorial"]) |
|
gr.Markdown(news) |
|
|
|
def play_sound_alert(play_sound):
    """Generate the values pushed to the notification audio component.

    Args:
        play_sound: Flag read from the UI; any falsy value disables the alert.

    Yields:
        ``None`` first, then the path ``"assets/sound_alert.mp3"``. Each value
        is preceded by a 0.25 second pause. Nothing is yielded when
        ``play_sound`` is falsy.
    """
    if play_sound:
        # NOTE(review): the leading None presumably clears the player so the
        # same clip can autoplay again on a later run - confirm.
        for audio_value in (None, "assets/sound_alert.mp3"):
            time.sleep(0.25)
            yield audio_value
|
|
|
sound_alert_notification = gr.Audio( |
|
value=None, |
|
type="filepath", |
|
format="mp3", |
|
autoplay=True, |
|
visible=False, |
|
) |
|
|
|
if logs_in_gui: |
|
logger.info("Logs in gui need public url") |
|
|
|
class Logger:
    """Tee-style stream: every write goes to the terminal and to a log file.

    Attributes:
        terminal: The ``sys.stdout`` object that was current when the
            instance was created.
        log: Text file opened for writing (truncated) at ``filename``.
    """

    def __init__(self, filename):
        """Capture the current ``sys.stdout`` and open ``filename`` for writing."""
        self.terminal = sys.stdout
        # Keep the platform default encoding (so readers using the default
        # still decode the file) but never raise on characters that encoding
        # cannot represent; they are written as backslash escapes instead.
        self.log = open(filename, "w", errors="backslashreplace")

    def write(self, message):
        """Write ``message`` to the terminal first, then to the log file."""
        self.terminal.write(message)
        self.log.write(message)

    def flush(self):
        """Flush both underlying streams."""
        self.terminal.flush()
        self.log.flush()

    def isatty(self):
        """Always report that this stream is not a TTY."""
        return False
|
|
|
sys.stdout = Logger("output.log") |
|
|
|
def read_logs():
    """Return the full text currently stored in ``output.log``.

    ``sys.stdout`` is flushed first so that buffered output reaches the file
    before it is read.

    Returns:
        The file contents as a string. Bytes that cannot be decoded with the
        default encoding are replaced rather than raising, so a periodic
        caller keeps working.
    """
    sys.stdout.flush()
    with open("output.log", "r", errors="replace") as log_file:
        return log_file.read()
|
|
|
with gr.Accordion("Logs", open=False): |
|
logs = gr.Textbox(label=">>>") |
|
app.load(read_logs, None, logs, every=1) |
|
|
|
if SoniTr.tts_info.xtts_enabled: |
|
|
|
def update_tts_list():
    """Build refreshed ``choices`` updates for every TTS voice selector.

    Returns:
        A list of ``gr.update`` objects: ``MAX_TTS`` entries carrying the
        full voice list, followed by one entry whose choices omit
        ``"_XTTS_/AUTOMATIC.wav"``.
    """
    # Query the voice list once; the original asked for it again for every
    # selector even though the answer is the same within one refresh.
    voices = list(SoniTr.tts_info.tts_list())
    document_voices = [
        voice for voice in voices if voice != "_XTTS_/AUTOMATIC.wav"
    ]

    # Each update gets its own list so no two components share one object.
    updates = [gr.update(choices=list(voices)) for _ in range(MAX_TTS)]
    updates.append(gr.update(choices=document_voices))
    return updates
|
|
|
create_xtts_wav.click( |
|
create_wav_file_vc, |
|
inputs=[ |
|
wav_speaker_name, |
|
wav_speaker_file, |
|
wav_speaker_start, |
|
wav_speaker_end, |
|
wav_speaker_dir, |
|
wav_speaker_dereverb, |
|
], |
|
outputs=[wav_speaker_output], |
|
).then( |
|
update_tts_list, |
|
None, |
|
[ |
|
tts_voice00, |
|
tts_voice01, |
|
tts_voice02, |
|
tts_voice03, |
|
tts_voice04, |
|
tts_voice05, |
|
tts_voice06, |
|
tts_voice07, |
|
tts_voice08, |
|
tts_voice09, |
|
tts_voice10, |
|
tts_voice11, |
|
tts_documents, |
|
], |
|
) |
|
|
|
|
|
subs_button.click( |
|
SoniTr.batch_multilingual_media_conversion, |
|
inputs=[ |
|
video_input, |
|
blink_input, |
|
directory_input, |
|
HFKEY, |
|
PREVIEW, |
|
WHISPER_MODEL_SIZE, |
|
batch_size, |
|
compute_type, |
|
SOURCE_LANGUAGE, |
|
TRANSLATE_AUDIO_TO, |
|
min_speakers, |
|
max_speakers, |
|
tts_voice00, |
|
tts_voice01, |
|
tts_voice02, |
|
tts_voice03, |
|
tts_voice04, |
|
tts_voice05, |
|
tts_voice06, |
|
tts_voice07, |
|
tts_voice08, |
|
tts_voice09, |
|
tts_voice10, |
|
tts_voice11, |
|
VIDEO_OUTPUT_NAME, |
|
AUDIO_MIX, |
|
audio_accelerate, |
|
acceleration_rate_regulation_gui, |
|
volume_original_mix, |
|
volume_translated_mix, |
|
sub_type_output, |
|
edit_sub_check, |
|
dummy_false_check, |
|
subs_edit_space, |
|
avoid_overlap_gui, |
|
vocal_refinement_gui, |
|
literalize_numbers_gui, |
|
segment_duration_limit_gui, |
|
diarization_process_dropdown, |
|
translate_process_dropdown, |
|
input_srt, |
|
main_output_type, |
|
main_voiceless_track, |
|
voice_imitation_gui, |
|
voice_imitation_max_segments_gui, |
|
voice_imitation_vocals_dereverb_gui, |
|
voice_imitation_remove_previous_gui, |
|
voice_imitation_method_gui, |
|
wav_speaker_dereverb, |
|
text_segmentation_scale_gui, |
|
divide_text_segments_by_gui, |
|
soft_subtitles_to_video_gui, |
|
burn_subtitles_to_video_gui, |
|
enable_cache_gui, |
|
enable_custom_voice, |
|
workers_custom_voice, |
|
is_gui_dummy_check, |
|
], |
|
outputs=subs_edit_space, |
|
).then( |
|
play_sound_alert, [play_sound_gui], [sound_alert_notification] |
|
) |
|
|
|
|
|
video_button.click( |
|
SoniTr.batch_multilingual_media_conversion, |
|
inputs=[ |
|
video_input, |
|
blink_input, |
|
directory_input, |
|
HFKEY, |
|
PREVIEW, |
|
WHISPER_MODEL_SIZE, |
|
batch_size, |
|
compute_type, |
|
SOURCE_LANGUAGE, |
|
TRANSLATE_AUDIO_TO, |
|
min_speakers, |
|
max_speakers, |
|
tts_voice00, |
|
tts_voice01, |
|
tts_voice02, |
|
tts_voice03, |
|
tts_voice04, |
|
tts_voice05, |
|
tts_voice06, |
|
tts_voice07, |
|
tts_voice08, |
|
tts_voice09, |
|
tts_voice10, |
|
tts_voice11, |
|
VIDEO_OUTPUT_NAME, |
|
AUDIO_MIX, |
|
audio_accelerate, |
|
acceleration_rate_regulation_gui, |
|
volume_original_mix, |
|
volume_translated_mix, |
|
sub_type_output, |
|
dummy_false_check, |
|
edit_sub_check, |
|
subs_edit_space, |
|
avoid_overlap_gui, |
|
vocal_refinement_gui, |
|
literalize_numbers_gui, |
|
segment_duration_limit_gui, |
|
diarization_process_dropdown, |
|
translate_process_dropdown, |
|
input_srt, |
|
main_output_type, |
|
main_voiceless_track, |
|
voice_imitation_gui, |
|
voice_imitation_max_segments_gui, |
|
voice_imitation_vocals_dereverb_gui, |
|
voice_imitation_remove_previous_gui, |
|
voice_imitation_method_gui, |
|
wav_speaker_dereverb, |
|
text_segmentation_scale_gui, |
|
divide_text_segments_by_gui, |
|
soft_subtitles_to_video_gui, |
|
burn_subtitles_to_video_gui, |
|
enable_cache_gui, |
|
enable_custom_voice, |
|
workers_custom_voice, |
|
is_gui_dummy_check, |
|
], |
|
outputs=video_output, |
|
trigger_mode="multiple", |
|
).then( |
|
play_sound_alert, [play_sound_gui], [sound_alert_notification] |
|
) |
|
|
|
|
|
docs_button.click( |
|
SoniTr.multilingual_docs_conversion, |
|
inputs=[ |
|
text_docs, |
|
input_docs, |
|
directory_input_docs, |
|
docs_SOURCE_LANGUAGE, |
|
docs_TRANSLATE_TO, |
|
tts_documents, |
|
docs_OUTPUT_NAME, |
|
docs_translate_process_dropdown, |
|
docs_output_type, |
|
docs_chunk_size, |
|
enable_custom_voice, |
|
workers_custom_voice, |
|
start_page_gui, |
|
end_page_gui, |
|
videobook_width_gui, |
|
videobook_height_gui, |
|
videobook_bcolor_gui, |
|
docs_dummy_check, |
|
], |
|
outputs=docs_output, |
|
trigger_mode="multiple", |
|
).then( |
|
play_sound_alert, [play_sound_gui], [sound_alert_notification] |
|
) |
|
|
|
return app |
|
|
|
|
|
def get_language_config(language_data, language=None, base_key="english"):
    """Return the entry for ``language`` with missing keys taken from the base.

    Args:
        language_data: Mapping of language name to a dict of entries.
        language: Name of the requested language.
        base_key: Name of the language used as fallback.

    Returns:
        ``language_data[base_key]`` (or ``None`` if absent) when ``language``
        is not a key of ``language_data``; an error is logged in that case.
        Otherwise a new dict holding the requested language's entries plus
        every base entry it lacks. ``language_data`` itself is not modified.
    """
    base_lang = language_data.get(base_key)

    if language not in language_data:
        logger.error(
            f"Language {language} not found, defaulting to {base_key}"
        )
        return base_lang

    # Work on a copy: filling in fallbacks must not rewrite the caller's table.
    lg_conf = dict(language_data[language])
    # A missing base language simply contributes no fallbacks.
    for key, value in (base_lang or {}).items():
        lg_conf.setdefault(key, value)

    return lg_conf
|
|
|
|
|
def create_parser():
    """Build the command-line parser for the application.

    Options: ``--theme``, ``--public_url``, ``--logs_in_gui``,
    ``--verbosity_level``, ``--language`` and ``--cpu_mode``. Defaults are
    shown in ``--help`` through ``ArgumentDefaultsHelpFormatter``.

    Returns:
        The configured ``argparse.ArgumentParser``.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        "--theme",
        type=str,
        default="Taithrah/Minimal",
        help=(
            "Specify the theme; find themes in "
            "https://huggingface.co/spaces/gradio/theme-gallery;"
            " Example: --theme aliabid94/new-theme"
        ),
    )
    parser.add_argument(
        "--public_url",
        action="store_true",
        default=False,
        help="Enable public link",
    )
    parser.add_argument(
        "--logs_in_gui",
        action="store_true",
        default=False,
        help="Displays the operations performed in Logs",
    )
    parser.add_argument(
        "--verbosity_level",
        type=str,
        default="info",
        help=(
            "Set logger verbosity level: "
            "debug, info, warning, error, or critical"
        ),
    )
    parser.add_argument(
        "--language",
        type=str,
        default="english",
        help=" Select the language of the interface: english, spanish",
    )
    parser.add_argument(
        "--cpu_mode",
        action="store_true",
        default=False,
        help="Enable CPU mode to run the program without utilizing GPU acceleration.",
    )
    return parser
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: parse CLI options, fetch the UVR models, create the
    # module-level objects the GUI code reads, then launch the Gradio app.
    parser = create_parser()
    args = parser.parse_args()

    set_logging_level(args.verbosity_level)

    # Join URL parts with "/" explicitly; os.path.join would insert the
    # platform separator ("\\" on Windows) when the base lacks a trailing "/".
    # NOTE(review): assumes MDX_DOWNLOAD_LINK is a URL string - confirm.
    mdx_base_url = (
        MDX_DOWNLOAD_LINK
        if MDX_DOWNLOAD_LINK.endswith("/")
        else MDX_DOWNLOAD_LINK + "/"
    )
    for id_model in UVR_MODELS:
        download_manager(mdx_base_url + id_model, mdxnet_models_dir)

    models_path, index_path = upload_model_list()

    # NOTE(review): when ZERO_GPU == "TRUE" the string "cpu" is passed as
    # cpu_mode instead of the parsed boolean - confirm SoniTranslate only
    # checks truthiness.
    SoniTr = SoniTranslate(
        cpu_mode=(
            args.cpu_mode if os.environ.get("ZERO_GPU") != "TRUE" else "cpu"
        )
    )

    lg_conf = get_language_config(language_data, language=args.language)

    app = create_gui(args.theme, logs_in_gui=args.logs_in_gui)

    app.queue()

    app.launch(
        max_threads=1,
        share=args.public_url,
        show_error=True,
        quiet=False,
        debug=logger.isEnabledFor(logging.DEBUG),
    )
|
|