import gradio as gr
import subprocess
import shlex
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import torch
import librosa
import tempfile
from neon_tts_plugin_coqui import CoquiTTS
from gtts import gTTS
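# This app shells out to external tools: ffmpeg must be on PATH, and the
# Wav2Lip files (inference.py, wav2lip_gan.pth) are expected next to this
# script, as implied by the subprocess calls below.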
# variables: default input/output languages, updated by the radio buttons below
language_input_audio = 'en'
language_output_audio = 'ch'
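# maps the short language codes used in the UI to the FLORES-200 codes
# that the NLLB-200 translation model expects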
dict_lang = {
    'en': 'eng_Latn',
    'es': 'spa_Latn',
    'fr': 'fra_Latn',
    'de': 'deu_Latn',
    'pl': 'pol_Latn',
    'uk': 'ukr_Cyrl',
    'ro': 'ron_Latn',
    'hu': 'hun_Latn',
    'bg': 'bul_Cyrl',
    'nl': 'nld_Latn',
    'fi': 'fin_Latn',
    'sl': 'slv_Latn',
    'lv': 'lvs_Latn',
    'ga': 'gle_Latn',
    'ch': 'zho_Hans',  # simplified Chinese, to match the gTTS 'zh-CN' voice used below
    'ru': 'rus_Cyrl'
}
# functions
# callback that stores the language of the input audio
def radio_lang_input(lang):
    language_input_audio = lang
    return {var: language_input_audio}
# callback that stores the language of the output audio
def radio_input(lang):
    language_output_audio = lang
    return {var_lang: language_output_audio}
##
# convert the input video file to text, audio, and video
def video_load(video, language_input_audio, language_output_audio):
    # downscale the video so its width is 720 px (-2 keeps the height even and preserves the aspect ratio)
    subprocess.run(f'ffmpeg -y -i {shlex.quote(video)} -vf scale=720:-2 video720p.mp4', shell=True)
    # extract the audio track as a 16 kHz WAV, the sampling rate Wav2Vec2 expects
    subprocess.run('ffmpeg -y -i video720p.mp4 -vn -ar 16000 -ac 2 -ab 192K -f wav sound_from_input_video.wav', shell=True)
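    # Wav2Vec2 expects mono 16 kHz input; librosa.load below resamples and
    # downmixes the stereo WAV to mono by default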
    # transcribe the audio with a language-specific Wav2Vec2 checkpoint
    asr_checkpoints = {
        'en': "facebook/wav2vec2-base-960h",
        'ru': "jonatasgrosman/wav2vec2-large-xlsr-53-russian",
    }
    processor = Wav2Vec2Processor.from_pretrained(asr_checkpoints[language_input_audio])
    model = Wav2Vec2ForCTC.from_pretrained(asr_checkpoints[language_input_audio])
    audio, rate = librosa.load('sound_from_input_video.wav', sr=16000)
    input_values = processor(audio, sampling_rate=rate, return_tensors="pt", padding="longest").input_values
    # retrieve logits, then greedy-decode: take the argmax over the vocabulary at each frame
    logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]
    # translate the transcription with NLLB-200
    model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
    tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
    device = 0 if torch.cuda.is_available() else -1
    translation_pipeline = pipeline("translation", model=model, tokenizer=tokenizer,
                                    src_lang=dict_lang[language_input_audio],
                                    tgt_lang=dict_lang[language_output_audio],
                                    max_length=2000000, device=device)
    result = translation_pipeline(transcription)
    text_translations = result[0]['translation_text']
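    # max_length here effectively removes the generation cap; very long
    # transcripts can still be cut off by the model's own context window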
    # convert the translated text to audio
    # Russian: gTTS
    if language_output_audio == 'ru':
        tts = gTTS(text_translations, lang='ru')
        tts.save('ru.mp3')
        audio = 'ru.mp3'
    # European languages covered by the Coqui TTS plugin
    if language_output_audio in ['en', 'es', 'fr', 'de', 'pl', 'uk', 'ro', 'hu', 'bg', 'nl', 'fi', 'sl', 'lv', 'ga']:
        coquiTTS = CoquiTTS()
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
            coquiTTS.get_tts(text_translations, fp, speaker={"language": language_output_audio})
        audio = fp.name
    # Chinese: gTTS
    if language_output_audio == 'ch':
        tts = gTTS(text_translations, lang='zh-CN')
        tts.save('china.mp3')
        audio = 'china.mp3'
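    # note: both gTTS branches call Google's online TTS endpoint, so the app
    # needs outbound network access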
    # lip-sync the generated audio onto the face video with Wav2Lip
    subprocess.run(f'python inference.py --checkpoint_path wav2lip_gan.pth --face video720p.mp4 --audio {shlex.quote(audio)} --nosmooth --pads 0 20 0 0', shell=True)
    video = 'results/result_voice.mp4'
    return text_translations, audio, video
##
# create a video from (possibly edited) audio; reuses video720p.mp4,
# so "Generate video with translated audio" must have run first
def audio_to_video_custom(audio):
    subprocess.run(f'python inference.py --checkpoint_path wav2lip_gan.pth --face video720p.mp4 --audio {shlex.quote(audio)} --nosmooth --pads 0 20 0 0', shell=True)
    video = 'results/result_voice.mp4'
    return video
##
# create audio from a custom (hand-edited) translation
def text_to_audio_custom(text_translations, language_output_audio):
    # Russian: gTTS
    if language_output_audio == 'ru':
        tts = gTTS(text_translations, lang='ru')
        tts.save('ru.mp3')
        audio = 'ru.mp3'
    # European languages covered by the Coqui TTS plugin
    if language_output_audio in ['en', 'es', 'fr', 'de', 'pl', 'uk', 'ro', 'hu', 'bg', 'nl', 'fi', 'sl', 'lv', 'ga']:
        coquiTTS = CoquiTTS()
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
            coquiTTS.get_tts(text_translations, fp, speaker={"language": language_output_audio})
        audio = fp.name
    # Chinese: gTTS
    if language_output_audio == 'ch':
        tts = gTTS(text_translations, lang='zh-CN')
        tts.save('china.mp3')
        audio = 'china.mp3'
    return audio
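# example (hypothetical values): regenerate speech after hand-editing a translation
#   text_to_audio_custom("Bonjour tout le monde", "fr")  # -> path to a temporary .wav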
##### blocks
with gr.Blocks(title="Speak video in any language") as demo:
    # state variables holding the selected input/output languages
    var = gr.State('en')
    var_lang = gr.State('ch')
    # markdown text
    gr.Markdown("Service for translating videos into other languages while preserving the speaker's facial expressions.")
    gr.Markdown("The uploaded video must show a single face, preferably without sudden head movements.")
    with gr.Row():
        with gr.Column():
            # radio button to change the input language
            radio_input_lang_video = gr.Radio(['en', 'ru'], value="en", label='Select input video language')
            # video input
            input_video = gr.Video(label="Input Video")
            # radio button to change the output language
            radio = gr.Radio(['en', 'es', 'fr', 'de', 'pl', 'uk', 'ro', 'hu', 'bg', 'nl', 'fi', 'sl', 'lv', 'ga', 'ch', 'ru'], value="ch", label='Choose the language you want to speak')
            # main button
            btn_1 = gr.Button("1. Generate video with translated audio")
        with gr.Column():
            # text output
            translations_text = gr.Text(label="Generated Translations Text", interactive=True)
            # button to generate audio from the (editable) text
            btn_3 = gr.Button("Generate custom translations to speech")
            # audio output
            translations_audio = gr.Audio(label="Generated Translations Audio", interactive=True, type="filepath")
            # button to generate video from the (editable) audio
            btn_2 = gr.Button("Generate video with custom audio")
            # video output
            video_output = gr.Video(interactive=False, label="Generated Translations Video")
    # change the input video language
    radio_input_lang_video.change(fn=radio_lang_input, inputs=radio_input_lang_video, outputs=var)
    # change the output language
    radio.change(fn=radio_input, inputs=radio, outputs=var_lang)
    # main button: full pipeline (ASR -> translation -> TTS -> lip-sync)
    btn_1.click(video_load, inputs=[input_video, var, var_lang], outputs=[translations_text, translations_audio, video_output])
    # button click: custom audio to video
    btn_2.click(audio_to_video_custom, inputs=[translations_audio], outputs=[video_output])
    # button click: custom text to audio
    btn_3.click(text_to_audio_custom, inputs=[translations_text, var_lang], outputs=[translations_audio])
demo.launch(show_api=False)