Lagyamfi committed
Commit 9d616d2 · 1 parent: db6bb35

fix order of tts output

Files changed (2):
  1. app.py +61 -18
  2. pipeline.py +71 -6
app.py CHANGED
@@ -2,6 +2,8 @@ import gradio as gr
 from tqdm.asyncio import tqdm_asyncio
 import os
 import time
+import asyncio
+from concurrent.futures import ThreadPoolExecutor
 
 from pipeline import (
     extract_audio_from_video,
@@ -9,18 +11,21 @@ from pipeline import (
     translation_main,
     tts_main,
     create_combined_output,
+    create_combined_output_subprocess,
 )
-from pipeline import translation_hdr, translation_url, LANG
+from pipeline import translation_hdr, translation_url, LANG_DICT
+
+executor = ThreadPoolExecutor()
 
 
 async def process_video_translation(
-    input_video, speaker, progress=gr.Progress(track_tqdm=True)
+    input_video, speaker, language, progress=gr.Progress(track_tqdm=True)
 ):
     if input_video is None:
         gr.Info("Please upload a video file", duration=2)
         return
 
-    total_stages = 6
+    total_stages = 5
 
     # add time stamp to output video
     timestamp = time.strftime("%M%S")
@@ -38,29 +43,40 @@ async def process_video_translation(
 
     # stage 1: extract audio from video
     progress(0.1, desc="Extracting audio from video")
-    output_audio_path = extract_audio_from_video(input_video)
+    output_audio_path = await asyncio.get_event_loop().run_in_executor(
+        executor, extract_audio_from_video, input_video
+    )
     pbar.update(1)
 
-    # transcribe audio
+    # stage 2: transcribe audio
     progress(0.2, desc="Transcribing audio")
-    sentences = transcribe_and_preprocess_audio(output_audio_path)
+    sentences = await asyncio.get_event_loop().run_in_executor(
+        executor, transcribe_and_preprocess_audio, output_audio_path
+    )
     pbar.update(1)
 
-    # translate to twi
+    # stage 3: translate to twi
     progress(0.4, desc="Translating to Twi")
     khaya_translations = await translation_main(
-        sentences, translation_url, translation_hdr, LANG
+        sentences, translation_url, translation_hdr, LANG_DICT[language]
     )
     pbar.update(1)
 
-    # convert to speech
+    # stage 4: convert to speech
     progress(0.7, desc="Converting to speech")
-    output_audio = await tts_main(khaya_translations, speaker)
+    output_audio = await tts_main(khaya_translations, speaker, LANG_DICT[language])
     # print(tts_output_files)
     pbar.update(1)
 
+    # stage 5: combine audio streams
     progress(1.0, desc="Combining audio and video")
-    output_video = create_combined_output(input_video, output_audio, output_video)
+    output_video = await asyncio.get_event_loop().run_in_executor(
+        executor,
+        create_combined_output_subprocess,
+        input_video,
+        output_audio,
+        output_video,
+    )
     pbar.update(1)
 
     print("Video translation completed")
@@ -74,11 +90,20 @@ app_theme = gr.themes.Ocean(
     text_size="lg",
     spacing_size="lg",
 )
+
+
+def update_speaker_choices(language):
+    if language == "Twi":
+        return gr.update(choices=["male", "female"], value="male")
+    elif language == "Ewe":
+        return gr.update(choices=["male"], value="male")
+
+
 with gr.Blocks(
     theme=app_theme,
     title="Video Dubbing Interface",
 ) as demo:
-    with gr.Row(variant="default"):
+    with gr.Row(variant="compact"):
         with gr.Column(
             scale=1,
             min_width=0,
@@ -86,14 +111,14 @@ with gr.Blocks(
             gr.Image(
                 "logo_2.jpeg",
                 show_label=False,
-                height=200,
+                height=100,
                 show_download_button=False,
                 show_fullscreen_button=False,
                 container=False,
                 show_share_button=False,
             )
         with gr.Column(
-            scale=6,
+            scale=3,
             variant="default",
         ):
             gr.HTML(
@@ -113,7 +138,7 @@ with gr.Blocks(
             gr.Image(
                 "NLPGhana_logo_1.png",
                 show_label=False,
-                height=200,
+                height=100,
                 show_download_button=False,
                 show_fullscreen_button=False,
                 container=False,
@@ -127,19 +152,37 @@ with gr.Blocks(
     with gr.Row():
         with gr.Column():
            input_video = gr.Video(label="Input Video", sources=["upload"])
+            input_language = gr.Radio(
+                label="Select Language",
+                choices=["Twi", "Ewe"],
+                value="Twi",
+                min_width=50,
+                container=True,
+                show_label=True,
+            )
+            print(input_language.value)
+            speaker_choices = (
+                ["male", "female"] if input_language.value == "Twi" else ["male"]
+            )
             input_speaker = gr.Radio(
                 label="Select Speaker",
-                choices=["male", "female"],
-                value="female",
+                choices=speaker_choices,
+                value="male",
                 min_width=50,
                 container=True,
                 show_label=True,
             )
             submit = gr.Button("Process Video", scale=1)
             output_video = gr.Video(label="Processed Video")
+            # Update the speaker choices based on the selected language
+            input_language.change(
+                update_speaker_choices,
+                inputs=input_language,
+                outputs=input_speaker,
+            )
             submit.click(
                 process_video_translation,
-                inputs=[input_video, input_speaker],
+                inputs=[input_video, input_language, input_speaker],
                 outputs=output_video,
             )
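Note: the big structural change in app.py is that the blocking, CPU-bound stages (audio extraction, transcription, and the final mux) now run on a ThreadPoolExecutor via run_in_executor, so the coroutine, and with it Gradio's event loop and progress callbacks, stays responsive; the translation and TTS stages, which are already async, are awaited directly. A minimal, self-contained sketch of that offloading pattern (slow_stage is an invented stand-in for calls like extract_audio_from_video, not code from this repo):

import asyncio
import time
from concurrent.futures import ThreadPoolExecutor

executor = ThreadPoolExecutor()


def slow_stage(name: str) -> str:
    # stand-in for a blocking call such as extract_audio_from_video
    time.sleep(1)
    return f"{name} done"


async def main():
    loop = asyncio.get_event_loop()
    # blocking work runs in the thread pool; the event loop keeps
    # serving progress callbacks and other coroutines meanwhile
    result = await loop.run_in_executor(executor, slow_stage, "extract")
    print(result)


asyncio.run(main())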
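Note: on the UI side, the speaker radio is now tied to the language radio through input_language.change and gr.update. Two details of the committed wiring are worth flagging: the module-level speaker_choices expression is evaluated once at build time (input_language.value is just the initial "Twi"), so it is the change handler that actually restricts Ewe to a male speaker at runtime; and submit.click passes inputs=[input_video, input_language, input_speaker] while the handler signature is (input_video, speaker, language, ...), so the language and speaker values arrive positionally swapped. A stripped-down sketch of the change-listener pattern (standalone illustration, not the repo's file):

import gradio as gr


def update_speaker_choices(language):
    # restrict the speaker options to what the TTS backend supports
    if language == "Twi":
        return gr.update(choices=["male", "female"], value="male")
    return gr.update(choices=["male"], value="male")


with gr.Blocks() as demo:
    language = gr.Radio(label="Select Language", choices=["Twi", "Ewe"], value="Twi")
    speaker = gr.Radio(label="Select Speaker", choices=["male", "female"], value="male")
    # re-render the speaker radio whenever the language changes
    language.change(update_speaker_choices, inputs=language, outputs=speaker)

demo.launch()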
 
pipeline.py CHANGED
@@ -13,6 +13,7 @@ import ffmpeg
 import torch
 import aiofiles
 import tempfile
+import subprocess
 
 
 # load khaya token from environment
@@ -38,7 +39,7 @@ tts_header = {
     "Ocp-Apim-Subscription-Key": f"{KHAYA_TOKEN}",
 }
 
-LANG = "tw"
+LANG_DICT = {"Twi": "tw", "Ewe": "ee"}
 
 # Check if GPU is available
 pipe_device = 0 if torch.cuda.is_available() else -1
@@ -84,17 +85,29 @@ async def translation_main(sentences, url, headers, lang):
         asyncio.as_completed(tasks), total=len(tasks), desc="Translating Sentences"
     ):
         index, result = await f
+        # TODO: handle error response
         khaya_translations[index] = result
 
     return khaya_translations
 
 
 async def convert_text_to_speech(
-    session, tts_url, tts_header, text, text_index, speaker, semaphore, output_dir
+    session,
+    tts_url,
+    tts_header,
+    text,
+    text_index,
+    language,
+    speaker,
+    semaphore,
+    output_dir,
 ):
-    speaker_dict = {"male": "twi_speaker_5", "female": "twi_speaker_7"}
-    speaker_id = speaker_dict[speaker]
-    data = {"text": text, "language": LANG, "speaker_id": speaker_id}
+    speaker_dict = {
+        "tw": {"male": "twi_speaker_5", "female": "twi_speaker_7"},
+        "ee": {"male": "ewe_speaker_3", "female": None},
+    }
+    speaker_id = speaker_dict[language][speaker]
+    data = {"text": text, "language": language, "speaker_id": speaker_id}
 
     try:
         async with semaphore:
@@ -114,7 +127,7 @@ async def convert_text_to_speech(
         print(f"Unexpected error: {e}")
 
 
-async def tts_main(khaya_translations, speaker):
+async def tts_main(khaya_translations, speaker, language):
     with tempfile.TemporaryDirectory() as temp_dir:
         async with aiohttp.ClientSession() as session:
             semaphore = asyncio.Semaphore(3)
@@ -125,6 +138,7 @@ async def tts_main(khaya_translations, speaker):
                 tts_header,
                 sent,
                 text_index,
+                language,
                 speaker,
                 semaphore,
                 temp_dir,
@@ -182,6 +196,9 @@ def transcribe_and_preprocess_audio(input_audio):
 
 
 def combine_audio_streams(list_of_output_chunks, output_audio):
+    list_of_output_chunks = sorted(
+        list_of_output_chunks, key=lambda x: int(x.split("_")[1].split("/")[-1])
+    )
     input_streams = [ffmpeg.input(chunk) for chunk in list_of_output_chunks]
     concatenated = ffmpeg.concat(*input_streams, v=0, a=1).output(f"{output_audio}")
 
@@ -209,3 +226,51 @@ def create_combined_output(input_video, output_audio, output_video):
     except ffmpeg.Error as e:
         print(e.stderr.decode())
         raise e
+
+
+def create_combined_output_subprocess(input_video, output_audio, output_video):
+    video_duration = get_media_duration(input_video)
+    audio_duration = get_media_duration(output_audio)
+
+    speed_factor = calculate_speed_factor(video_duration, audio_duration)
+    print(f"Speed factor: {speed_factor}")
+
+    try:
+        command = [
+            "ffmpeg",
+            "-i",
+            f"{input_video}",
+            "-i",
+            f"{output_audio}",
+            "-filter:a",
+            f"atempo={speed_factor}",
+            "-c:v",
+            "copy",
+            "-map",
+            "0:v:0",
+            "-map",
+            "1:a:0",
+            f"{output_video}",
+        ]
+        subprocess.run(command, check=True)
+        print("Video and audio combined successfully")
+        return output_video
+    except subprocess.CalledProcessError as e:
+        print(e.stderr.decode())
+        raise e
+
+
+def get_media_duration(media_file):
+    """
+    Get the duration of a media file in seconds.
+    """
+    probe = ffmpeg.probe(media_file)
+    duration = float(probe["format"]["duration"])
+    return duration
+
+
+def calculate_speed_factor(video_duration, audio_duration):
+    """
+    Calculate the speed factor to align audio with video.
+    """
+    return audio_duration / video_duration
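Note: the change that gives the commit its name is the sorted(...) call added at the top of combine_audio_streams. The TTS stage writes chunks concurrently (three at a time under the semaphore), so a plain directory listing, or a lexicographic sort, would splice chunk 10 in before chunk 2. The committed key, int(x.split("_")[1].split("/")[-1]), is tied to the repo's exact chunk-naming scheme; a sketch of the same idea with a more defensive key (the file names below are invented for illustration):

import os
import re


def numeric_chunk_key(path: str) -> int:
    # pull the first run of digits out of the file name, so that
    # ".../tts_chunk_10.wav" sorts after ".../tts_chunk_2.wav"
    match = re.search(r"\d+", os.path.basename(path))
    if match is None:
        raise ValueError(f"no chunk index in {path!r}")
    return int(match.group())


chunks = [
    "/tmp/out/tts_chunk_10.wav",
    "/tmp/out/tts_chunk_2.wav",
    "/tmp/out/tts_chunk_1.wav",
]
print(sorted(chunks))                         # lexicographic: 1, 10, 2
print(sorted(chunks, key=numeric_chunk_key))  # numeric: 1, 2, 10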
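Note: create_combined_output_subprocess time-stretches the dubbed audio with ffmpeg's atempo filter so it spans the original video duration, using the factor audio_duration / video_duration. Two caveats apply to the committed version: a single atempo instance only accepts factors in [0.5, 2.0] on older ffmpeg builds, so larger corrections need a chain such as atempo=2.0,atempo=1.5; and subprocess.run is invoked without capture_output=True, which leaves e.stderr as None, so the .decode() in the except branch would itself raise. A hedged sketch that handles both (helper names here are mine, not the repo's):

import subprocess


def atempo_chain(factor: float) -> str:
    # decompose an arbitrary tempo factor into atempo stages,
    # each within the filter's safe [0.5, 2.0] range
    parts = []
    while factor > 2.0:
        parts.append("atempo=2.0")
        factor /= 2.0
    while factor < 0.5:
        parts.append("atempo=0.5")
        factor /= 0.5
    parts.append(f"atempo={factor:.6f}")
    return ",".join(parts)


def mux_with_tempo(video: str, audio: str, out: str, factor: float) -> None:
    command = [
        "ffmpeg", "-y",
        "-i", video,
        "-i", audio,
        "-filter:a", atempo_chain(factor),
        "-c:v", "copy",
        "-map", "0:v:0",
        "-map", "1:a:0",
        out,
    ]
    # capture stderr so the error path can actually report it
    result = subprocess.run(command, capture_output=True, check=False)
    if result.returncode != 0:
        raise RuntimeError(result.stderr.decode())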