first draft of frontend
- NLPGhana_logo_2.png +0 -0
- app.py +135 -0
- logo_2.jpeg +0 -0
- pipeline.py +253 -0
NLPGhana_logo_2.png
ADDED
app.py
ADDED
@@ -0,0 +1,135 @@
import gradio as gr
from tqdm import tqdm

from pipeline import (
    extract_audio_from_video,
    transcribe_and_preprocess_audio,
    translation_main,
    tts_main,
    combine_audio_streams,
    create_combined_output,
)
from pipeline import translation_hdr, translation_url, LANG


async def process_video_translation(
    input_video, speaker, progress=gr.Progress(track_tqdm=True)
):
    total_stages = 6
    output_video = f"{input_video.split('.')[0]}_translated.mp4"
    with tqdm(total=total_stages, desc="Processing video translation") as pbar:
        progress(0.1, desc="Extracting audio from video")
        pbar.update(1)
        output_audio_path = extract_audio_from_video(input_video)

        # transcribe audio
        pbar.set_description("Transcribing audio")
        pbar.update(1)
        sentences = transcribe_and_preprocess_audio(output_audio_path)

        # translate to twi
        pbar.set_description("Translating to Twi")
        khaya_translations = await translation_main(
            sentences, translation_url, translation_hdr, LANG
        )
        # create output files
        print("Creating output files")
        list_of_output_chunks = [
            f"translated_{i}.wav" for i in range(len(khaya_translations))
        ]
        pbar.update(1)

        # convert to speech
        pbar.set_description("Converting to speech")
        await tts_main(khaya_translations, speaker, list_of_output_chunks)
        pbar.update(1)

        # combine audio streams
        print("Combining audio streams")
        pbar.set_description("Combining audio streams")
        output_audio = combine_audio_streams(
            list_of_output_chunks, "combined_audio.wav"
        )
        pbar.update(1)

        pbar.set_description("Combining audio and video")
        create_combined_output(input_video, output_audio, output_video)
        pbar.update(1)

        print("Video translation completed")
        gr.Info("Video translation completed", duration=2)

    return output_video


with gr.Blocks(
    theme=gr.themes.Soft(),
    title="Video Dubbing Interface",
) as demo:
    with gr.Row(variant="default"):
        with gr.Column(
            scale=1,
            min_width=0,
        ):
            gr.Image(
                "logo_2.jpeg",
                show_label=False,
                width=150,
                height=150,
                show_download_button=False,
                show_fullscreen_button=False,
                container=False,
            )
        with gr.Column(
            scale=2,
        ):
            gr.Markdown("# Video Dubbing Interface", height=100)
        with gr.Column(
            scale=1,
            min_width=0,
        ):
            gr.Image(
                "NLPGhana_logo_2.png",
                show_label=False,
                width=50,
                height=150,
                show_download_button=False,
                show_fullscreen_button=False,
                container=False,
            )

    # main interface components
    with gr.Row():
        input_video = gr.Video(label="Input Video", sources=["upload"])
        input_speaker = gr.Radio(
            label="Select Speaker",
            choices=["male", "female"],
            value="female",
            min_width=50,
            container=True,
        )
        output_video = gr.Video(label="Processed Video")

    with gr.Row():

        # process video translation
        submit = gr.Button("Process Video", scale=1)
        submit.click(
            process_video_translation,
            inputs=[input_video, input_speaker],
            outputs=output_video,
        )


# # Define the Gradio interface
# interface = gr.Interface(
#     fn=process_video_translation,  # Function to process the video
#     inputs=gr.Video(label="Input Video"),  # Video file input
#     outputs=gr.Video(label="Processed Video"),  # Video file output
#     title="Video Processing Interface",
#     description="Upload a video, and the processed video will be returned.",
#     theme="light",
# )

# Launch the interface
demo.launch(debug=True)
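For context on the handler above: the output path is derived from the uploaded file's path by splitting on the first dot, so the dubbed video lands next to the input with a _translated suffix. A small illustration (the example path is hypothetical; Gradio passes a temporary upload path like this to the handler):

    input_video = "/tmp/gradio/upload/my_clip.mov"  # hypothetical upload path
    output_video = f"{input_video.split('.')[0]}_translated.mp4"
    # -> "/tmp/gradio/upload/my_clip_translated.mp4"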
logo_2.jpeg
ADDED
pipeline.py
ADDED
@@ -0,0 +1,253 @@
# %%

# %load_ext autoreload
# %autoreload 2

from transformers import pipeline
import re
from num2words import num2words
import aiohttp
from aiohttp import ClientSession
from aiohttp_retry import RetryClient, ExponentialRetry
from tqdm import tqdm
import asyncio
import os
from dotenv import load_dotenv
import requests
import ffmpeg


# load khaya token from environment
load_dotenv()

# load khaya token
KHAYA_TOKEN = os.getenv("KHAYA_TOKEN")

translation_url = "https://translation-api.ghananlp.org/v1/translate"

translation_hdr = {
    # Request headers
    "Content-Type": "application/json",
    "Cache-Control": "no-cache",
    "Ocp-Apim-Subscription-Key": KHAYA_TOKEN,
}

LANG = "tw"


def replace_numbers_with_words(text):
    def replace(match):
        return num2words(match.group().replace(",", ""), lang="en")

    return re.sub(r"[\d]+[.,\d]+", replace, text)


async def fetch(session, url, headers, data, semaphore, index):
    async with semaphore:
        try:
            async with session.post(
                url, headers=headers, json=data, timeout=10
            ) as response:
                response.raise_for_status()
                return index, await response.json()
        except aiohttp.ClientError as e:
            print(f"Request error: {e}")
            return index, str(e)
        except Exception as e:
            print(f"Unexpected error: {e}")
            return index, str(e)


async def translation_main(sentences, url, headers, lang):
    khaya_translations = [None] * len(sentences)
    semaphore = asyncio.Semaphore(2)  # limit the number of concurrent requests
    retry_options = ExponentialRetry(
        attempts=3,
    )

    async with RetryClient(ClientSession(), retry_options=retry_options) as session:
        tasks = []
        for index, sent in enumerate(sentences):
            data = {"in": sent, "lang": f"en-{lang}"}
            tasks.append(fetch(session, url, headers, data, semaphore, index))

        for f in tqdm(
            asyncio.as_completed(tasks), total=len(tasks), desc="Translating Sentences"
        ):
            index, result = await f
            khaya_translations[index] = result

    return khaya_translations


async def convert_text_to_speech(session, text, speaker, output_file):
    speaker_dict = {"male": "twi_speaker_5", "female": "twi_speaker_7"}
    speaker_id = speaker_dict[speaker]
    try:
        tts_url = "https://tts-backend-nlpghana-staging.azurewebsites.net/v0/tts"  # Replace with your TTS API URL
        data = {"text": text, "language": LANG, "speaker_id": speaker_id}
        hdr = {
            # Request headers
            "Content-Type": "application/json",
            "Cache-Control": "no-cache",
            "Ocp-Apim-Subscription-Key": f"{KHAYA_TOKEN}",
        }
        async with session.post(tts_url, headers=hdr, json=data) as response:
            response.raise_for_status()
            with open(output_file, "wb") as file:
                while True:
                    chunk = await response.content.read(1024)
                    if not chunk:
                        break
                    file.write(chunk)
    except aiohttp.ClientError as e:
        print(f"Request error: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")


async def tts_main(khaya_translations, speaker, list_of_output_chunks):
    async with aiohttp.ClientSession() as session:
        tasks = []
        for i, sent in enumerate(khaya_translations):
            output_file = list_of_output_chunks[i]
            tasks.append(convert_text_to_speech(session, sent, speaker, output_file))

        for f in tqdm(
            asyncio.as_completed(tasks), total=len(tasks), desc="Converting to Speech"
        ):
            await f


# %%

# filename = "CoolVision-Uzbekistan.mov"
output_path = "/Users/lawrenceadu-gyamfi/Documents/PERSONAL/GHANANLP/PROJECTS/SAINT/Examples/test_pipeline"
input_video = "test_input_video.mov"
input_audio = "input_audio.aac"
output_audio = "output_audio.wav"
output_video = "test_output_video.mp4"
filename_with_path = f"{output_path}/{input_video}"


# %%
# only need to run this once
# !ffmpeg -i {output_path}/{input_video} -vn -acodec copy {output_path}/{input_audio} -y
def extract_audio_from_video(input_video):
    if input_video:
        output_audio_path = "separated_audio.aac"
        try:
            (
                ffmpeg.input(f"{input_video}")
                .output(f"{output_audio_path}", acodec="copy", vn=None)
                .run(overwrite_output=True)
            )
            print("Audio extracted successfully")
            return output_audio_path
        except ffmpeg.Error as e:
            print(e.stderr.decode())
            raise e


# %%
# ASR pipeline
def transcribe_and_preprocess_audio(input_audio):
    asr = pipeline(
        "automatic-speech-recognition", model="openai/whisper-large-v3", device=0
    )
    pipeline_whisper_output = asr(
        f"{input_audio}",
        return_timestamps=True,
    )

    # preprocess the output before machine translation
    sentences = pipeline_whisper_output["text"].split(". ")
    sentences = [el.strip() for el in sentences if el]

    # replace numbers with words
    sentences = [replace_numbers_with_words(sent) for sent in sentences]
    return sentences


# %%
# combine the audio files
def combine_audio_streams(list_of_output_chunks, output_audio):
    input_streams = [ffmpeg.input(chunk) for chunk in list_of_output_chunks]
    concatenated = ffmpeg.concat(*input_streams, v=0, a=1).output(f"{output_audio}")

    try:
        concatenated.run(overwrite_output=True)
        return output_audio
    except ffmpeg.Error as e:
        print(e.stderr.decode())


# %%
# combine the audio and video
def create_combined_output(input_video, output_audio, output_video):
    try:
        video = ffmpeg.input(f"{input_video}")
        audio = ffmpeg.input(f"{output_audio}")  # .filter_('atempo', 1.09580838323)
        (
            ffmpeg.output(
                video["v"],
                audio["a"],
                filename=f"{output_video}",
                vcodec="copy",
            ).run(overwrite_output=True)
        )
        print("Video and audio combined successfully")
        return output_video
    except ffmpeg.Error as e:
        print(e.stderr.decode())


# %%


async def process_video_translation(input_video, output_video, speaker="female"):
    # speaker selects the Twi TTS voice ("male" or "female", see speaker_dict above)
    print("Processing video translation")

    print("Extracting audio from video")
    output_audio_path = extract_audio_from_video(input_video)

    # transcribe audio
    print("Transcribing audio")
    sentences = transcribe_and_preprocess_audio(output_audio_path)

    # translate to twi
    print("Translating to Twi")
    khaya_translations = await translation_main(
        sentences, translation_url, translation_hdr, LANG
    )

    # create output files
    print("Creating output files")
    list_of_output_chunks = [
        f"translated_{i}.wav" for i in range(len(khaya_translations))
    ]

    # convert to speech
    print("Converting to speech")
    await tts_main(khaya_translations, speaker, list_of_output_chunks)

    # combine audio streams
    print("Combining audio streams")
    output_audio = combine_audio_streams(list_of_output_chunks, "combined_audio.wav")

    print("Combining audio and video")
    create_combined_output(input_video, output_audio, output_video)

    print("Video translation completed")

    return output_video


# %%
# test_input_video = "../Examples/test_pipeline/test_input_video.mov"
# test_output_video = "test_output_video.mp4"


# await process_video_translation(test_input_video, test_output_video)

# %%
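Beyond the notebook-style cells above, the same end-to-end function can also be driven from a plain script. A minimal sketch, assuming the test clip names used in the commented-out cell, a KHAYA_TOKEN entry in .env, and a GPU for the Whisper ASR pipeline (it is created with device=0):

    import asyncio

    from pipeline import process_video_translation

    if __name__ == "__main__":
        # hypothetical local test run; file names mirror the commented-out cell above
        asyncio.run(
            process_video_translation("test_input_video.mov", "test_output_video.mp4")
        )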