Lagyamfi committed
Commit c59a3c0 · Parent: 3c713dd

first draft of frontend

Files changed (4)
  1. NLPGhana_logo_2.png +0 -0
  2. app.py +135 -0
  3. logo_2.jpeg +0 -0
  4. pipeline.py +253 -0
NLPGhana_logo_2.png ADDED
app.py ADDED
@@ -0,0 +1,135 @@
+ import gradio as gr
+ from tqdm import tqdm
+
+ from pipeline import (
+     extract_audio_from_video,
+     transcribe_and_preprocess_audio,
+     translation_main,
+     tts_main,
+     combine_audio_streams,
+     create_combined_output,
+ )
+ from pipeline import translation_hdr, translation_url, LANG
+
+
+ async def process_video_translation(
+     input_video, speaker, progress=gr.Progress(track_tqdm=True)
+ ):
+     total_stages = 6
+     output_video = f"{input_video.split('.')[0]}_translated.mp4"
+     with tqdm(total=total_stages, desc="Processing video translation") as pbar:
+         # extract audio from the uploaded video
+         progress(0.1, desc="Extracting audio from video")
+         pbar.update(1)
+         output_audio_path = extract_audio_from_video(input_video)
+
+         # transcribe audio
+         pbar.set_description("Transcribing audio")
+         pbar.update(1)
+         sentences = transcribe_and_preprocess_audio(output_audio_path)
+
+         # translate to Twi
+         pbar.set_description("Translating to Twi")
+         khaya_translations = await translation_main(
+             sentences, translation_url, translation_hdr, LANG
+         )
+         # create output files, one audio chunk per translated sentence
+         print("Creating output files")
+         list_of_output_chunks = [
+             f"translated_{i}.wav" for i in range(len(khaya_translations))
+         ]
+         pbar.update(1)
+
+         # convert to speech
+         pbar.set_description("Converting to speech")
+         await tts_main(khaya_translations, speaker, list_of_output_chunks)
+         pbar.update(1)
+
+         # combine audio streams
+         print("Combining audio streams")
+         pbar.set_description("Combining audio streams")
+         output_audio = combine_audio_streams(
+             list_of_output_chunks, "combined_audio.wav"
+         )
+         pbar.update(1)
+
+         # mux the dubbed audio back onto the original video
+         pbar.set_description("Combining audio and video")
+         create_combined_output(input_video, output_audio, output_video)
+         pbar.update(1)
+
+     print("Video translation completed")
+     gr.Info("Video translation completed", duration=2)
+
+     return output_video
+
+
+ with gr.Blocks(
+     theme=gr.themes.Soft(),
+     title="Video Dubbing Interface",
+ ) as demo:
+     with gr.Row(variant="default"):
+         with gr.Column(
+             scale=1,
+             min_width=0,
+         ):
+             gr.Image(
+                 "logo_2.jpeg",
+                 show_label=False,
+                 width=150,
+                 height=150,
+                 show_download_button=False,
+                 show_fullscreen_button=False,
+                 container=False,
+             )
+         with gr.Column(
+             scale=2,
+         ):
+             gr.Markdown("# Video Dubbing Interface", height=100)
+         with gr.Column(
+             scale=1,
+             min_width=0,
+         ):
+             gr.Image(
+                 "NLPGhana_logo_2.png",
+                 show_label=False,
+                 width=50,
+                 height=150,
+                 show_download_button=False,
+                 show_fullscreen_button=False,
+                 container=False,
+             )
+
+     # main interface components
+     with gr.Row():
+         input_video = gr.Video(label="Input Video", sources=["upload"])
+         input_speaker = gr.Radio(
+             label="Select Speaker",
+             choices=["male", "female"],
+             value="female",
+             min_width=50,
+             container=True,
+         )
+         output_video = gr.Video(label="Processed Video")
+
+     with gr.Row():
+         # process video translation
+         submit = gr.Button("Process Video", scale=1)
+         submit.click(
+             process_video_translation,
+             inputs=[input_video, input_speaker],
+             outputs=output_video,
+         )
+
+
+ # # Define the Gradio interface
+ # interface = gr.Interface(
+ #     fn=process_video_translation,  # Function to process the video
+ #     inputs=gr.Video(label="Input Video"),  # Video file input
+ #     outputs=gr.Video(label="Processed Video"),  # Video file output
+ #     title="Video Processing Interface",
+ #     description="Upload a video, and the processed video will be returned.",
+ #     theme="light",
+ # )
+
+ # Launch the interface
+ demo.launch(debug=True)
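
The handler above leans on gr.Progress(track_tqdm=True), which mirrors any tqdm bar created inside the function into the web UI's progress display. A minimal, self-contained sketch of that pattern (a toy task for illustration, not part of this commit):

import time

import gradio as gr
from tqdm import tqdm


def slow_task(n, progress=gr.Progress(track_tqdm=True)):
    # each tqdm update is mirrored into the Gradio progress bar
    for _ in tqdm(range(int(n)), desc="Working"):
        time.sleep(0.1)
    return f"done ({int(n)} steps)"


demo = gr.Interface(fn=slow_task, inputs=gr.Number(value=20), outputs="text")
demo.launch()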
logo_2.jpeg ADDED
pipeline.py ADDED
@@ -0,0 +1,253 @@
+ # %%
+
+ # %load_ext autoreload
+ # %autoreload 2
+
+ from transformers import pipeline
+ import re
+ from num2words import num2words
+ import aiohttp
+ from aiohttp import ClientSession
+ from aiohttp_retry import RetryClient, ExponentialRetry
+ from tqdm import tqdm
+ import asyncio
+ import os
+ from dotenv import load_dotenv
+ import requests
+ import ffmpeg
+
+
+ # load the Khaya API token from a local .env file
+ load_dotenv()
+ KHAYA_TOKEN = os.getenv("KHAYA_TOKEN")
+
+ translation_url = "https://translation-api.ghananlp.org/v1/translate"
+
+ translation_hdr = {
+     # Request headers
+     "Content-Type": "application/json",
+     "Cache-Control": "no-cache",
+     "Ocp-Apim-Subscription-Key": KHAYA_TOKEN,
+ }
+
+ LANG = "tw"
+
+
+ def replace_numbers_with_words(text):
39
+ def replace(match):
40
+ return num2words(match.group().replace(",", ""), lang="en")
41
+
42
+ return re.sub(r"[\d]+[.,\d]+", replace, text)
43
+
44
+
45
+ async def fetch(session, url, headers, data, semaphore, index):
46
+ async with semaphore:
47
+ try:
48
+ async with session.post(
49
+ url, headers=headers, json=data, timeout=10
50
+ ) as response:
51
+ response.raise_for_status()
52
+ return index, await response.json()
53
+ except aiohttp.ClientError as e:
54
+ print(f"Request error: {e}")
55
+ return index, str(e)
56
+ except Exception as e:
57
+ print(f"Unexpected error: {e}")
58
+ return index, str(e)
59
+
60
+
61
+ async def translation_main(sentences, url, headers, lang):
62
+ khaya_translations = [None] * len(sentences)
63
+ semaphore = asyncio.Semaphore(2) # limit the number of concurrent requests
64
+ retry_options = ExponentialRetry(
65
+ attempts=3,
66
+ )
67
+
68
+ async with RetryClient(ClientSession(), retry_options=retry_options) as session:
69
+ tasks = []
70
+ for index, sent in enumerate(sentences):
71
+ data = {"in": sent, "lang": f"en-{lang}"}
72
+ tasks.append(fetch(session, url, headers, data, semaphore, index))
73
+
74
+ for f in tqdm(
75
+ asyncio.as_completed(tasks), total=len(tasks), desc="Translating Sentences"
76
+ ):
77
+ index, result = await f
78
+ khaya_translations[index] = result
79
+
80
+ return khaya_translations
81
+
82
+
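+ # %%
+ # hypothetical usage sketch (run in a notebook cell, with a valid token):
+ # sample_sentences = ["Hello world.", "How are you today?"]
+ # twi_sentences = await translation_main(
+ #     sample_sentences, translation_url, translation_hdr, LANG
+ # )
+
+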
+ async def convert_text_to_speech(session, text, speaker, output_file):
+     speaker_dict = {"male": "twi_speaker_5", "female": "twi_speaker_7"}
+     speaker_id = speaker_dict[speaker]
+     try:
+         tts_url = "https://tts-backend-nlpghana-staging.azurewebsites.net/v0/tts"  # Replace with your TTS API URL
+         data = {"text": text, "language": LANG, "speaker_id": speaker_id}
+         hdr = {
+             # Request headers
+             "Content-Type": "application/json",
+             "Cache-Control": "no-cache",
+             "Ocp-Apim-Subscription-Key": KHAYA_TOKEN,
+         }
+         async with session.post(tts_url, headers=hdr, json=data) as response:
+             response.raise_for_status()
+             # stream the synthesized audio to disk in 1 KiB chunks
+             with open(output_file, "wb") as file:
+                 while True:
+                     chunk = await response.content.read(1024)
+                     if not chunk:
+                         break
+                     file.write(chunk)
+     except aiohttp.ClientError as e:
+         print(f"Request error: {e}")
+     except Exception as e:
+         print(f"Unexpected error: {e}")
+
+
+ async def tts_main(khaya_translations, speaker, list_of_output_chunks):
+     async with aiohttp.ClientSession() as session:
+         tasks = []
+         for i, sent in enumerate(khaya_translations):
+             output_file = list_of_output_chunks[i]
+             tasks.append(convert_text_to_speech(session, sent, speaker, output_file))
+
+         for f in tqdm(
+             asyncio.as_completed(tasks), total=len(tasks), desc="Converting to Speech"
+         ):
+             await f
+
+
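+ # %%
+ # hypothetical usage sketch: synthesize one translated sentence to one wav chunk
+ # await tts_main(["<translated twi text>"], "female", ["sample_chunk.wav"])
+
+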
+ # %%
+
+ # filename = "CoolVision-Uzbekistan.mov"
+ output_path = "/Users/lawrenceadu-gyamfi/Documents/PERSONAL/GHANANLP/PROJECTS/SAINT/Examples/test_pipeline"
+ input_video = "test_input_video.mov"
+ input_audio = "input_audio.aac"
+ output_audio = "output_audio.wav"
+ output_video = "test_output_video.mp4"
+ filename_with_path = f"{output_path}/{input_video}"
+
+
+ # %%
+ # only need to run this once
+ # !ffmpeg -i {output_path}/{input_video} -vn -acodec copy {output_path}/{input_audio} -y
+ def extract_audio_from_video(input_video):
+     if input_video:
+         output_audio_path = "separated_audio.aac"
+         try:
+             (
+                 ffmpeg.input(input_video)
+                 .output(output_audio_path, acodec="copy", vn=None)
+                 .run(overwrite_output=True)
+             )
+             print("Audio extracted successfully")
+             return output_audio_path
+         except ffmpeg.Error as e:
+             print(e.stderr.decode())
+             raise e
+
+
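+ # %%
+ # hypothetical usage sketch, reusing the test paths above:
+ # audio_path = extract_audio_from_video(filename_with_path)
+
+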
+ # %%
+ # ASR pipeline
+ def transcribe_and_preprocess_audio(input_audio):
+     asr = pipeline(
+         "automatic-speech-recognition", model="openai/whisper-large-v3", device=0
+     )
+     pipeline_whisper_output = asr(
+         input_audio,
+         return_timestamps=True,
+     )
+
+     # preprocess the output before machine translation
+     sentences = pipeline_whisper_output["text"].split(". ")
+     sentences = [el.strip() for el in sentences if el]
+
+     # replace numbers with words
+     sentences = [replace_numbers_with_words(sent) for sent in sentences]
+     return sentences
+
+
+ # %%
+ # combine the audio files
+ def combine_audio_streams(list_of_output_chunks, output_audio):
+     input_streams = [ffmpeg.input(chunk) for chunk in list_of_output_chunks]
+     concatenated = ffmpeg.concat(*input_streams, v=0, a=1).output(output_audio)
+
+     try:
+         concatenated.run(overwrite_output=True)
+         return output_audio
+     except ffmpeg.Error as e:
+         print(e.stderr.decode())
+         raise e
+
+
+ # %%
+ # combine the audio and video
+ def create_combined_output(input_video, output_audio, output_video):
+     try:
+         video = ffmpeg.input(input_video)
+         audio = ffmpeg.input(output_audio)  # .filter_('atempo', 1.09580838323)
+         (
+             ffmpeg.output(
+                 video["v"],
+                 audio["a"],
+                 filename=output_video,
+                 vcodec="copy",
+             ).run(overwrite_output=True)
+         )
+         print("Video and audio combined successfully")
+         return output_video
+     except ffmpeg.Error as e:
+         print(e.stderr.decode())
+         raise e
+
+
+ # %%
+
+
+ async def process_video_translation(input_video, output_video, speaker="female"):
+     print("Processing video translation")
+
+     print("Extracting audio from video")
+     output_audio_path = extract_audio_from_video(input_video)
+
+     # transcribe audio
+     print("Transcribing audio")
+     sentences = transcribe_and_preprocess_audio(output_audio_path)
+
+     # translate to Twi
+     print("Translating to Twi")
+     khaya_translations = await translation_main(
+         sentences, translation_url, translation_hdr, LANG
+     )
+
+     # create output files
+     print("Creating output files")
+     list_of_output_chunks = [
+         f"translated_{i}.wav" for i in range(len(khaya_translations))
+     ]
+
+     # convert to speech
+     print("Converting to speech")
+     await tts_main(khaya_translations, speaker, list_of_output_chunks)
+
+     # combine audio streams
+     print("Combining audio streams")
+     output_audio = combine_audio_streams(list_of_output_chunks, "combined_audio.wav")
+
+     print("Combining audio and video")
+     create_combined_output(input_video, output_audio, output_video)
+
+     print("Video translation completed")
+
+     return output_video
+
+
+ # %%
+ # test_input_video = "../Examples/test_pipeline/test_input_video.mov"
+ # test_output_video = "test_output_video.mp4"
+
+
+ # await process_video_translation(test_input_video, test_output_video)
+
+ # %%
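
Because pipeline.py is written as a notebook-style script (the # %% cells), the end-to-end flow can also be exercised headlessly with asyncio. A minimal sketch, assuming ffmpeg and a CUDA device (Whisper is loaded with device=0) are available and KHAYA_TOKEN is set in .env; "my_clip.mov" and "my_clip_dubbed.mp4" are hypothetical file names:

import asyncio

from pipeline import process_video_translation

# extract audio → transcribe → translate → synthesize → mux, end to end
asyncio.run(
    process_video_translation("my_clip.mov", "my_clip_dubbed.mp4", speaker="female")
)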