fix order of tts output
- app.py +61 -18
- pipeline.py +71 -6
app.py
CHANGED
@@ -2,6 +2,8 @@ import gradio as gr
 from tqdm.asyncio import tqdm_asyncio
 import os
 import time
+import asyncio
+from concurrent.futures import ThreadPoolExecutor
 
 from pipeline import (
     extract_audio_from_video,
@@ -9,18 +11,21 @@ from pipeline import (
     translation_main,
     tts_main,
     create_combined_output,
+    create_combined_output_subprocess,
 )
-from pipeline import translation_hdr, translation_url,
+from pipeline import translation_hdr, translation_url, LANG_DICT
+
+executor = ThreadPoolExecutor()
 
 
 async def process_video_translation(
-    input_video, speaker, progress=gr.Progress(track_tqdm=True)
+    input_video, speaker, language, progress=gr.Progress(track_tqdm=True)
 ):
     if input_video is None:
         gr.Info("Please upload a video file", duration=2)
         return
 
-    total_stages =
+    total_stages = 5
 
     # add time stamp to output video
     timestamp = time.strftime("%M%S")
@@ -38,29 +43,40 @@ async def process_video_translation(
 
         # stage 1: extract audio from video
         progress(0.1, desc="Extracting audio from video")
-        output_audio_path =
+        output_audio_path = await asyncio.get_event_loop().run_in_executor(
+            executor, extract_audio_from_video, input_video
+        )
         pbar.update(1)
 
-        # transcribe audio
+        # stage 2: transcribe audio
        progress(0.2, desc="Transcribing audio")
-        sentences =
+        sentences = await asyncio.get_event_loop().run_in_executor(
+            executor, transcribe_and_preprocess_audio, output_audio_path
+        )
         pbar.update(1)
 
-        # translate to twi
+        # stage 3: translate to twi
         progress(0.4, desc="Translating to Twi")
         khaya_translations = await translation_main(
-            sentences, translation_url, translation_hdr,
+            sentences, translation_url, translation_hdr, LANG_DICT[language]
         )
         pbar.update(1)
 
-        # convert to speech
+        # stage 4: convert to speech
         progress(0.7, desc="Converting to speech")
-        output_audio = await tts_main(khaya_translations, speaker)
+        output_audio = await tts_main(khaya_translations, speaker, LANG_DICT[language])
         # print(tts_output_files)
         pbar.update(1)
 
+        # stage 5: combine audio streams
         progress(1.0, desc="Combining audio and video")
-        output_video =
+        output_video = await asyncio.get_event_loop().run_in_executor(
+            executor,
+            create_combined_output_subprocess,
+            input_video,
+            output_audio,
+            output_video,
+        )
         pbar.update(1)
 
         print("Video translation completed")
@@ -74,11 +90,20 @@ app_theme = gr.themes.Ocean(
     text_size="lg",
     spacing_size="lg",
 )
+
+
+def update_speaker_choices(language):
+    if language == "Twi":
+        return gr.update(choices=["male", "female"], value="male")
+    elif language == "Ewe":
+        return gr.update(choices=["male"], value="male")
+
+
 with gr.Blocks(
     theme=app_theme,
     title="Video Dubbing Interface",
 ) as demo:
-    with gr.Row(variant="
+    with gr.Row(variant="compact"):
         with gr.Column(
             scale=1,
             min_width=0,
@@ -86,14 +111,14 @@ with gr.Blocks(
             gr.Image(
                 "logo_2.jpeg",
                 show_label=False,
-                height=
+                height=100,
                 show_download_button=False,
                 show_fullscreen_button=False,
                 container=False,
                 show_share_button=False,
             )
         with gr.Column(
-            scale=
+            scale=3,
             variant="default",
         ):
             gr.HTML(
@@ -113,7 +138,7 @@ with gr.Blocks(
             gr.Image(
                 "NLPGhana_logo_1.png",
                 show_label=False,
-                height=
+                height=100,
                 show_download_button=False,
                 show_fullscreen_button=False,
                 container=False,
@@ -127,19 +152,37 @@ with gr.Blocks(
     with gr.Row():
         with gr.Column():
             input_video = gr.Video(label="Input Video", sources=["upload"])
+            input_language = gr.Radio(
+                label="Select Language",
+                choices=["Twi", "Ewe"],
+                value="Twi",
+                min_width=50,
+                container=True,
+                show_label=True,
+            )
+            print(input_language.value)
+            speaker_choices = (
+                ["male", "female"] if input_language.value == "Twi" else ["male"]
+            )
             input_speaker = gr.Radio(
                 label="Select Speaker",
-                choices=
-                value="
+                choices=speaker_choices,
+                value="male",
                 min_width=50,
                 container=True,
                 show_label=True,
             )
             submit = gr.Button("Process Video", scale=1)
             output_video = gr.Video(label="Processed Video")
+            # Update the speaker choices based on the selected language
+            input_language.change(
+                update_speaker_choices,
+                inputs=input_language,
+                outputs=input_speaker,
+            )
             submit.click(
                 process_video_translation,
-                inputs=[input_video, input_speaker],
+                inputs=[input_video, input_language, input_speaker],
                 outputs=output_video,
             )
 
pipeline.py
CHANGED
@@ -13,6 +13,7 @@ import ffmpeg
 import torch
 import aiofiles
 import tempfile
+import subprocess
 
 
 # load khaya token from environment
@@ -38,7 +39,7 @@ tts_header = {
     "Ocp-Apim-Subscription-Key": f"{KHAYA_TOKEN}",
 }
 
-
+LANG_DICT = {"Twi": "tw", "Ewe": "ee"}
 
 # Check if GPU is available
 pipe_device = 0 if torch.cuda.is_available() else -1
@@ -84,17 +85,29 @@ async def translation_main(sentences, url, headers, lang):
         asyncio.as_completed(tasks), total=len(tasks), desc="Translating Sentences"
     ):
         index, result = await f
+        # TODO: handle error response
         khaya_translations[index] = result
 
     return khaya_translations
 
 
 async def convert_text_to_speech(
-    session,
+    session,
+    tts_url,
+    tts_header,
+    text,
+    text_index,
+    language,
+    speaker,
+    semaphore,
+    output_dir,
 ):
-    speaker_dict = {
-
-
+    speaker_dict = {
+        "tw": {"male": "twi_speaker_5", "female": "twi_speaker_7"},
+        "ee": {"male": "ewe_speaker_3", "female": None},
+    }
+    speaker_id = speaker_dict[language][speaker]
+    data = {"text": text, "language": language, "speaker_id": speaker_id}
 
     try:
         async with semaphore:
@@ -114,7 +127,7 @@ async def convert_text_to_speech(
         print(f"Unexpected error: {e}")
 
 
-async def tts_main(khaya_translations, speaker):
+async def tts_main(khaya_translations, speaker, language):
     with tempfile.TemporaryDirectory() as temp_dir:
         async with aiohttp.ClientSession() as session:
             semaphore = asyncio.Semaphore(3)
@@ -125,6 +138,7 @@ async def tts_main(khaya_translations, speaker):
                 tts_header,
                 sent,
                 text_index,
+                language,
                 speaker,
                 semaphore,
                 temp_dir,
@@ -182,6 +196,9 @@ def transcribe_and_preprocess_audio(input_audio):
 
 
 def combine_audio_streams(list_of_output_chunks, output_audio):
+    list_of_output_chunks = sorted(
+        list_of_output_chunks, key=lambda x: int(x.split("_")[1].split("/")[-1])
+    )
     input_streams = [ffmpeg.input(chunk) for chunk in list_of_output_chunks]
     concatenated = ffmpeg.concat(*input_streams, v=0, a=1).output(f"{output_audio}")
 
@@ -209,3 +226,51 @@ def create_combined_output(input_video, output_audio, output_video):
     except ffmpeg.Error as e:
         print(e.stderr.decode())
         raise e
+
+
+def create_combined_output_subprocess(input_video, output_audio, output_video):
+    video_duration = get_media_duration(input_video)
+    audio_duration = get_media_duration(output_audio)
+
+    speed_factor = calculate_speed_factor(video_duration, audio_duration)
+    print(f"Speed factor: {speed_factor}")
+
+    try:
+        command = [
+            "ffmpeg",
+            "-i",
+            f"{input_video}",
+            "-i",
+            f"{output_audio}",
+            "-filter:a",
+            f"atempo={speed_factor}",
+            "-c:v",
+            "copy",
+            "-map",
+            "0:v:0",
+            "-map",
+            "1:a:0",
+            f"{output_video}",
+        ]
+        subprocess.run(command, check=True)
+        print("Video and audio combined successfully")
+        return output_video
+    except subprocess.CalledProcessError as e:
+        print(e.stderr.decode())
+        raise e
+
+
+def get_media_duration(media_file):
+    """
+    Get the duration of a media file in seconds.
+    """
+    probe = ffmpeg.probe(media_file)
+    duration = float(probe["format"]["duration"])
+    return duration
+
+
+def calculate_speed_factor(video_duration, audio_duration):
+    """
+    Calculate the speed factor to align audio with video.
+    """
+    return audio_duration / video_duration
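The new create_combined_output_subprocess stretches or compresses the dubbed audio with ffmpeg's atempo filter so it spans the original video before the streams are muxed. A rough sketch of that duration-matching calculation, assuming the ffmpeg-python package is installed and using placeholder file names:

import ffmpeg


def duration_seconds(path):
    # container duration as reported by ffprobe
    return float(ffmpeg.probe(path)["format"]["duration"])


def speed_factor_for(video_path, audio_path):
    # >1 speeds the dubbed audio up, <1 slows it down to fit the video;
    # note that older ffmpeg builds cap a single atempo filter at the 0.5-2.0 range
    return duration_seconds(audio_path) / duration_seconds(video_path)


if __name__ == "__main__":
    # placeholder paths; any ffprobe-readable video/audio pair will do
    print(f"atempo={speed_factor_for('input.mp4', 'dubbed.wav')}")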