fix order of tts output
- app.py +61 -18
- pipeline.py +71 -6
app.py
CHANGED
@@ -2,6 +2,8 @@ import gradio as gr
 from tqdm.asyncio import tqdm_asyncio
 import os
 import time
+import asyncio
+from concurrent.futures import ThreadPoolExecutor
 
 from pipeline import (
     extract_audio_from_video,
@@ -9,18 +11,21 @@ from pipeline import (
     translation_main,
     tts_main,
     create_combined_output,
+    create_combined_output_subprocess,
 )
-from pipeline import translation_hdr, translation_url,
+from pipeline import translation_hdr, translation_url, LANG_DICT
+
+executor = ThreadPoolExecutor()
 
 
 async def process_video_translation(
-    input_video, speaker, progress=gr.Progress(track_tqdm=True)
+    input_video, speaker, language, progress=gr.Progress(track_tqdm=True)
 ):
     if input_video is None:
         gr.Info("Please upload a video file", duration=2)
         return
 
-    total_stages =
+    total_stages = 5
 
     # add time stamp to output video
     timestamp = time.strftime("%M%S")
@@ -38,29 +43,40 @@ async def process_video_translation(
 
         # stage 1: extract audio from video
         progress(0.1, desc="Extracting audio from video")
-        output_audio_path =
+        output_audio_path = await asyncio.get_event_loop().run_in_executor(
+            executor, extract_audio_from_video, input_video
+        )
         pbar.update(1)
 
-        # transcribe audio
+        # stage 2: transcribe audio
        progress(0.2, desc="Transcribing audio")
-        sentences =
+        sentences = await asyncio.get_event_loop().run_in_executor(
+            executor, transcribe_and_preprocess_audio, output_audio_path
+        )
         pbar.update(1)
 
-        # translate to twi
+        # stage 3: translate to twi
         progress(0.4, desc="Translating to Twi")
         khaya_translations = await translation_main(
-            sentences, translation_url, translation_hdr,
+            sentences, translation_url, translation_hdr, LANG_DICT[language]
         )
         pbar.update(1)
 
-        # convert to speech
+        # stage 4: convert to speech
         progress(0.7, desc="Converting to speech")
-        output_audio = await tts_main(khaya_translations, speaker)
+        output_audio = await tts_main(khaya_translations, speaker, LANG_DICT[language])
         # print(tts_output_files)
         pbar.update(1)
 
+        # stage 5: combine audio streams
         progress(1.0, desc="Combining audio and video")
-        output_video =
+        output_video = await asyncio.get_event_loop().run_in_executor(
+            executor,
+            create_combined_output_subprocess,
+            input_video,
+            output_audio,
+            output_video,
+        )
         pbar.update(1)
 
         print("Video translation completed")
@@ -74,11 +90,20 @@ app_theme = gr.themes.Ocean(
     text_size="lg",
     spacing_size="lg",
 )
+
+
+def update_speaker_choices(language):
+    if language == "Twi":
+        return gr.update(choices=["male", "female"], value="male")
+    elif language == "Ewe":
+        return gr.update(choices=["male"], value="male")
+
+
 with gr.Blocks(
     theme=app_theme,
     title="Video Dubbing Interface",
 ) as demo:
-    with gr.Row(variant="
+    with gr.Row(variant="compact"):
         with gr.Column(
             scale=1,
             min_width=0,
@@ -86,14 +111,14 @@ with gr.Blocks(
             gr.Image(
                 "logo_2.jpeg",
                 show_label=False,
-                height=
+                height=100,
                 show_download_button=False,
                 show_fullscreen_button=False,
                 container=False,
                 show_share_button=False,
             )
         with gr.Column(
-            scale=
+            scale=3,
             variant="default",
         ):
             gr.HTML(
@@ -113,7 +138,7 @@ with gr.Blocks(
             gr.Image(
                 "NLPGhana_logo_1.png",
                 show_label=False,
-                height=
+                height=100,
                 show_download_button=False,
                 show_fullscreen_button=False,
                 container=False,
@@ -127,19 +152,37 @@ with gr.Blocks(
     with gr.Row():
         with gr.Column():
             input_video = gr.Video(label="Input Video", sources=["upload"])
+            input_language = gr.Radio(
+                label="Select Language",
+                choices=["Twi", "Ewe"],
+                value="Twi",
+                min_width=50,
+                container=True,
+                show_label=True,
+            )
+            print(input_language.value)
+            speaker_choices = (
+                ["male", "female"] if input_language.value == "Twi" else ["male"]
+            )
             input_speaker = gr.Radio(
                 label="Select Speaker",
-                choices=
-                value="
+                choices=speaker_choices,
+                value="male",
                 min_width=50,
                 container=True,
                 show_label=True,
             )
             submit = gr.Button("Process Video", scale=1)
             output_video = gr.Video(label="Processed Video")
+            # Update the speaker choices based on the selected language
+            input_language.change(
+                update_speaker_choices,
+                inputs=input_language,
+                outputs=input_speaker,
+            )
             submit.click(
                 process_video_translation,
-                inputs=[input_video, input_speaker],
+                inputs=[input_video, input_language, input_speaker],
                 outputs=output_video,
             )
 
pipeline.py
CHANGED
@@ -13,6 +13,7 @@ import ffmpeg
 import torch
 import aiofiles
 import tempfile
+import subprocess
 
 
 # load khaya token from environment
@@ -38,7 +39,7 @@ tts_header = {
     "Ocp-Apim-Subscription-Key": f"{KHAYA_TOKEN}",
 }
 
-
+LANG_DICT = {"Twi": "tw", "Ewe": "ee"}
 
 # Check if GPU is available
 pipe_device = 0 if torch.cuda.is_available() else -1
@@ -84,17 +85,29 @@ async def translation_main(sentences, url, headers, lang):
         asyncio.as_completed(tasks), total=len(tasks), desc="Translating Sentences"
     ):
         index, result = await f
+        # TODO: handle error response
         khaya_translations[index] = result
 
     return khaya_translations
 
 
 async def convert_text_to_speech(
-    session,
+    session,
+    tts_url,
+    tts_header,
+    text,
+    text_index,
+    language,
+    speaker,
+    semaphore,
+    output_dir,
 ):
-    speaker_dict = {
-
-
+    speaker_dict = {
+        "tw": {"male": "twi_speaker_5", "female": "twi_speaker_7"},
+        "ee": {"male": "ewe_speaker_3", "female": None},
+    }
+    speaker_id = speaker_dict[language][speaker]
+    data = {"text": text, "language": language, "speaker_id": speaker_id}
 
     try:
         async with semaphore:
@@ -114,7 +127,7 @@ async def convert_text_to_speech(
         print(f"Unexpected error: {e}")
 
 
-async def tts_main(khaya_translations, speaker):
+async def tts_main(khaya_translations, speaker, language):
     with tempfile.TemporaryDirectory() as temp_dir:
         async with aiohttp.ClientSession() as session:
             semaphore = asyncio.Semaphore(3)
@@ -125,6 +138,7 @@ async def tts_main(khaya_translations, speaker):
                 tts_header,
                 sent,
                 text_index,
+                language,
                 speaker,
                 semaphore,
                 temp_dir,
@@ -182,6 +196,9 @@ def transcribe_and_preprocess_audio(input_audio):
 
 
 def combine_audio_streams(list_of_output_chunks, output_audio):
+    list_of_output_chunks = sorted(
+        list_of_output_chunks, key=lambda x: int(x.split("_")[1].split("/")[-1])
+    )
     input_streams = [ffmpeg.input(chunk) for chunk in list_of_output_chunks]
     concatenated = ffmpeg.concat(*input_streams, v=0, a=1).output(f"{output_audio}")
 
@@ -209,3 +226,51 @@ def create_combined_output(input_video, output_audio, output_video):
     except ffmpeg.Error as e:
         print(e.stderr.decode())
         raise e
+
+
+def create_combined_output_subprocess(input_video, output_audio, output_video):
+    video_duration = get_media_duration(input_video)
+    audio_duration = get_media_duration(output_audio)
+
+    speed_factor = calculate_speed_factor(video_duration, audio_duration)
+    print(f"Speed factor: {speed_factor}")
+
+    try:
+        command = [
+            "ffmpeg",
+            "-i",
+            f"{input_video}",
+            "-i",
+            f"{output_audio}",
+            "-filter:a",
+            f"atempo={speed_factor}",
+            "-c:v",
+            "copy",
+            "-map",
+            "0:v:0",
+            "-map",
+            "1:a:0",
+            f"{output_video}",
+        ]
+        subprocess.run(command, check=True)
+        print("Video and audio combined successfully")
+        return output_video
+    except subprocess.CalledProcessError as e:
+        print(e.stderr.decode())
+        raise e
+
+
+def get_media_duration(media_file):
+    """
+    Get the duration of a media file in seconds.
+    """
+    probe = ffmpeg.probe(media_file)
+    duration = float(probe["format"]["duration"])
+    return duration
+
+
+def calculate_speed_factor(video_duration, audio_duration):
+    """
+    Calculate the speed factor to align audio with video.
+    """
+    return audio_duration / video_duration
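The new create_combined_output_subprocess stretches or compresses the dubbed audio with ffmpeg's atempo filter so it spans the original video before the streams are muxed. A rough sketch of that duration-matching calculation, assuming the ffmpeg-python package is installed and using placeholder file names:

import ffmpeg


def duration_seconds(path):
    # container duration as reported by ffprobe
    return float(ffmpeg.probe(path)["format"]["duration"])


def speed_factor_for(video_path, audio_path):
    # >1 speeds the dubbed audio up, <1 slows it down to fit the video;
    # note that older ffmpeg builds cap a single atempo filter at the 0.5-2.0 range
    return duration_seconds(audio_path) / duration_seconds(video_path)


if __name__ == "__main__":
    # placeholder paths; any ffprobe-readable video/audio pair will do
    print(f"atempo={speed_factor_for('input.mp4', 'dubbed.wav')}")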