|
import subprocess
import tempfile

import gradio as gr
import librosa
import torch
from gtts import gTTS
from neon_tts_plugin_coqui import CoquiTTS
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
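
# Pipeline: uploaded video -> ffmpeg (720p video + 16 kHz WAV) -> Wav2Vec2
# speech recognition -> NLLB-200 translation -> TTS (Coqui or gTTS) ->
# Wav2Lip lip-sync of the new audio onto the original face.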
|
|
|
|
|
# Default languages; the live per-session values are kept in the gr.State
# components defined in the UI below.
language_input_audio = 'en'
language_output_audio = 'ch'
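
# NLLB-200 identifies languages by FLORES-200 codes (e.g. 'eng_Latn'), so the
# app's two-letter codes are mapped through this dict before translation. Note
# the nonstandard 'ch' key this app uses for Chinese.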
|
dict_lang = {
    'en': 'eng_Latn',
    'es': 'spa_Latn',
    'fr': 'fra_Latn',
    'de': 'deu_Latn',
    'pl': 'pol_Latn',
    'uk': 'ukr_Cyrl',
    'ro': 'ron_Latn',
    'hu': 'hun_Latn',
    'bg': 'bul_Cyrl',
    'nl': 'nld_Latn',
    'fi': 'fin_Latn',
    'sl': 'slv_Latn',
    'lv': 'lvs_Latn',
    'ga': 'gle_Latn',
    'ch': 'zho_Hans',  # Simplified, to match the gTTS 'zh-CN' voice used below
    'ru': 'rus_Cyrl'
}
|
|
|
|
|
def radio_lang_input(lang):
    # Push the selected input-audio language into the `var` State component.
    return lang


def radio_input(lang):
    # Push the selected output-audio language into the `var_lang` State component.
    return lang
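
# The two callbacks above feed gr.State components, which Gradio scopes per
# browser session, so concurrent users don't overwrite each other's choices.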
|
|
|
|
|
|
|
def video_load(video, language_input_audio, language_output_audio):
    # Downscale the uploaded video to 720 px wide (height auto, kept even) so
    # Wav2Lip's face detector gets a predictable input size.
    subprocess.run(f'ffmpeg -y -i "{video}" -vf scale=720:-2 video720p.mp4', shell=True)

    # Extract the audio track as 16 kHz WAV, the sample rate Wav2Vec2 expects;
    # librosa.load below downmixes the stereo track to mono.
    subprocess.run('ffmpeg -y -i video720p.mp4 -vn -ar 16000 -ac 2 -ab 192K -f wav sound_from_input_video.wav', shell=True)
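
    # Caveat: shell=True with interpolated paths is fragile; Gradio temp paths
    # are usually safe, but shlex.quote() from the stdlib would be more robust.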
|
|
|
|
|
|
|
    # Speech-to-text. The two supported input languages differ only in the
    # Wav2Vec2 checkpoint, so select it from a dict instead of duplicating code.
    asr_checkpoints = {
        'en': "facebook/wav2vec2-base-960h",
        'ru': "jonatasgrosman/wav2vec2-large-xlsr-53-russian",
    }
    processor = Wav2Vec2Processor.from_pretrained(asr_checkpoints[language_input_audio])
    model = Wav2Vec2ForCTC.from_pretrained(asr_checkpoints[language_input_audio])
    audio, rate = librosa.load('sound_from_input_video.wav', sr=16000)
    input_values = processor(audio, sampling_rate=rate, return_tensors="pt", padding="longest").input_values
    with torch.no_grad():
        logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]
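
    # Caveat: the whole track is decoded in one forward pass, so long videos can
    # exhaust memory. transformers' "automatic-speech-recognition" pipeline with
    # chunk_length_s set would be a more scalable alternative here.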
|
|
|
|
|
    # Translate the transcript with NLLB-200; source/target languages use the
    # FLORES-200 codes from dict_lang.
    model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
    tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
    device = 0 if torch.cuda.is_available() else -1
    translation_pipeline = pipeline("translation", model=model, tokenizer=tokenizer,
                                    src_lang=dict_lang[language_input_audio],
                                    tgt_lang=dict_lang[language_output_audio],
                                    max_length=2000000, device=device)
    result = translation_pipeline(transcription)
    text_translations = result[0]['translation_text']
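
    # Caveat: max_length here only caps generation length; NLLB-200 is trained
    # on sentence-level data, so long transcripts usually translate better when
    # split into sentences and translated piecewise.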
|
|
|
|
|
|
|
    # The TTS and lip-sync steps are identical to the standalone helpers defined
    # below, so reuse them instead of duplicating the branches.
    audio = text_to_audio_custom(text_translations, language_output_audio)
    video = audio_to_video_custom(audio)
    return text_translations, audio, video
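
# Note: inference.py and wav2lip_gan.pth are assumed to come from the Wav2Lip
# repository, checked out into the working directory. --nosmooth disables
# temporal smoothing of the detected face box and --pads 0 20 0 0 grows the box
# 20 px downward so the chin stays inside the synced region.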
|
|
|
|
|
|
|
def audio_to_video_custom(audio):
    # Lip-sync the (possibly edited) audio onto the 720p face video produced by
    # video_load(), using the Wav2Lip GAN checkpoint.
    subprocess.run(f'python inference.py --checkpoint_path wav2lip_gan.pth --face video720p.mp4 --audio "{audio}" --nosmooth --pads 0 20 0 0', shell=True)
    return 'results/result_voice.mp4'
|
|
|
|
|
|
|
def text_to_audio_custom(text_translations, language_output_audio):
    # Dispatch TTS: gTTS for Russian and Chinese, the Coqui neon plugin for the
    # other supported languages.
    if language_output_audio == 'ru':
        tts = gTTS(text_translations, lang='ru')
        tts.save('ru.mp3')
        audio = 'ru.mp3'
    elif language_output_audio in ['en', 'es', 'fr', 'de', 'pl', 'uk', 'ro', 'hu', 'bg', 'nl', 'fi', 'sl', 'lv', 'ga']:
        coquiTTS = CoquiTTS()
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
            coquiTTS.get_tts(text_translations, fp, speaker={"language": language_output_audio})
        audio = fp.name
    elif language_output_audio == 'ch':
        tts = gTTS(text_translations, lang='zh-CN')
        tts.save('china.mp3')
        audio = 'china.mp3'
    return audio
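
# Note: gTTS synthesizes via Google Translate's web endpoint, so the Russian and
# Chinese branches need network access; the Coqui models run locally.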
|
|
|
|
|
with gr.Blocks(title="Speak video in any language") as demo:

    # Per-session state for the selected input/output languages.
    var = gr.State('en')
    var_lang = gr.State('ch')

    gr.Markdown("Service for translating videos into other languages while lip-syncing the speaker's face to the new audio.")
    gr.Markdown("The uploaded video should contain only a face, preferably without sudden head movements.")
|
    with gr.Row():
        with gr.Column():
            radio_input_lang_video = gr.Radio(['en', 'ru'], value="en", label='Select input video language')
            input_video = gr.Video(label="Input Video")
            radio = gr.Radio(['en', 'es', 'fr', 'de', 'pl', 'uk', 'ro', 'hu', 'bg', 'nl', 'fi', 'sl', 'lv', 'ga', 'ch', 'ru'], value="ch", label='Choose the language you want to speak')
            btn_1 = gr.Button("1. Generate video with translated audio")

        with gr.Column():
            # The intermediate results are editable so the translation and audio
            # can be tweaked before re-running the later stages.
            translations_text = gr.Text(label="Generated Translations Text", interactive=True)
            btn_3 = gr.Button("2. Generate speech from edited translation")
            translations_audio = gr.Audio(label="Generated Translations Audio", interactive=True, type="filepath")
            btn_2 = gr.Button("3. Generate video with custom audio")
            video_output = gr.Video(interactive=False, label="Generated Translations Video")
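
    # Wire the controls: radio changes persist the language choices in State, and
    # the three buttons run the pipeline stages defined above.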
|
|
|
    radio_input_lang_video.change(fn=radio_lang_input, inputs=radio_input_lang_video, outputs=var)
    radio.change(fn=radio_input, inputs=radio, outputs=var_lang)

    btn_1.click(video_load, inputs=[input_video, var, var_lang], outputs=[translations_text, translations_audio, video_output])
    btn_2.click(audio_to_video_custom, inputs=[translations_audio], outputs=[video_output])
    btn_3.click(text_to_audio_custom, inputs=[translations_text, var_lang], outputs=[translations_audio])
|
|
|
demo.launch(show_api=False) |
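
# show_api=False hides the auto-generated API docs; on a hosted notebook,
# launch(share=True) could be used instead to get a public link.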
|
|