import gradio as gr
from tqdm.asyncio import tqdm_asyncio
import os
import time
import asyncio
from concurrent.futures import ThreadPoolExecutor

from pipeline import (
    extract_audio_from_video,
    transcribe_and_preprocess_audio,
    translation_main,
    tts_main,
    create_combined_output,
    create_combined_output_subprocess,
)
from pipeline import translation_hdr, translation_url, LANG_DICT

# shared thread pool for the blocking pipeline stages
executor = ThreadPoolExecutor()


async def process_video_translation(
    input_video, language, speaker, progress=gr.Progress(track_tqdm=True)
):
    if input_video is None:
        gr.Info("Please upload a video file", duration=2)
        return

    total_stages = 5

    # add a timestamp to the output video name
    timestamp = time.strftime("%M%S")
    output_video = f"{input_video.split('.')[0]}_dubbed_{timestamp}.mp4"

    # delete the output video if it already exists
    try:
        os.remove(output_video)
        print(f"Deleted existing output video: {output_video}")
    except FileNotFoundError:
        print(f"No existing output video found: {output_video}")

    with tqdm_asyncio(total=total_stages, desc="Processing video translation") as pbar:
        # stage 1: extract audio from video (blocking call, offloaded to the thread pool)
        progress(0.1, desc="Extracting audio from video")
        output_audio_path = await asyncio.get_event_loop().run_in_executor(
            executor, extract_audio_from_video, input_video
        )
        pbar.update(1)

        # stage 2: transcribe audio
        progress(0.2, desc="Transcribing audio")
        sentences = await asyncio.get_event_loop().run_in_executor(
            executor, transcribe_and_preprocess_audio, output_audio_path
        )
        pbar.update(1)

        # stage 3: translate to the target language
        progress(0.4, desc=f"Translating to {language}")
        khaya_translations = await translation_main(
            sentences, translation_url, translation_hdr, LANG_DICT[language]
        )
        pbar.update(1)

        # stage 4: convert the translations to speech
        progress(0.7, desc="Converting to speech")
        output_audio = await tts_main(khaya_translations, speaker, LANG_DICT[language])
        pbar.update(1)

        # stage 5: combine the new audio track with the original video
        progress(1.0, desc="Combining audio and video")
        output_video = await asyncio.get_event_loop().run_in_executor(
            executor,
            create_combined_output_subprocess,
            input_video,
            output_audio,
            output_video,
        )
        pbar.update(1)

    print("Video translation completed")
    gr.Info("Video translation completed", duration=2)
    print(f"Output video: {output_video}")

    return output_video


app_theme = gr.themes.Ocean(
    text_size="lg",
    spacing_size="lg",
)


def update_speaker_choices(language):
    # Twi has both voices; Ewe currently has a male voice only
    if language == "Twi":
        return gr.update(choices=["male", "female"], value="male")
    elif language == "Ewe":
        return gr.update(choices=["male"], value="male")


with gr.Blocks(
    theme=app_theme,
    title="Video Dubbing Interface",
) as demo:
    # header row: logo | title | logo
    with gr.Row(variant="compact"):
        with gr.Column(
            scale=1,
            min_width=100,
        ):
            gr.Image(
                "Color.png",
                show_label=False,
                height=100,
                show_download_button=False,
                show_fullscreen_button=False,
                container=False,
                show_share_button=False,
            )
        with gr.Column(
            scale=3,
            variant="default",
        ):
            gr.HTML(
                """
                <div style="text-align: center;">
                    <h1>African Language Video Dubbing POC</h1>
                </div>
                """,
            )
        with gr.Column(
            scale=1,
            min_width=100,
        ):
            gr.Image(
                "NLPGhana_logo_1.png",
                show_label=False,
                height=100,
                show_download_button=False,
                show_fullscreen_button=False,
                container=False,
                show_share_button=False,
            )

    gr.HTML("<hr>")
    gr.HTML("<hr>")

    # main interface components
    with gr.Row():
        with gr.Column():
            input_video = gr.Video(label="Input Video", sources=["upload"])
            input_language = gr.Radio(
                label="Select Language",
                choices=["Twi", "Ewe"],
                value="Twi",
                min_width=50,
                container=True,
                show_label=True,
            )
            speaker_choices = (
                ["male", "female"] if input_language.value == "Twi" else ["male"]
            )
            input_speaker = gr.Radio(
                label="Select Speaker",
                choices=speaker_choices,
                value="male",
                min_width=50,
                container=True,
                show_label=True,
            )
            submit = gr.Button("Process Video", scale=1)
        output_video = gr.Video(label="Processed Video")

    # update the speaker choices based on the selected language
    input_language.change(
        update_speaker_choices,
        inputs=input_language,
        outputs=input_speaker,
    )

    submit.click(
        process_video_translation,
        inputs=[input_video, input_language, input_speaker],
        outputs=output_video,
    )

    gr.HTML("<hr>")

# Launch the interface
demo.launch(debug=True)