Fix type annotation and add ngram_filtering
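The core of the annotation fix: the module previously imported `typing.Union`, and the commit replaces that import with `from __future__ import annotations` (PEP 563), which turns every annotation into a lazily evaluated string, so the PEP 604 `str | None` syntax used below parses on any Python from 3.7 up, not just 3.10+. A minimal sketch of the pattern (the `greet` function is a hypothetical illustration, not code from app.py):

from __future__ import annotations  # PEP 563: annotations become lazy strings


def greet(name: str | None = None) -> str:
    # The `str | None` annotation is never evaluated at runtime, so it is
    # legal here even on interpreters that predate PEP 604 (Python < 3.10).
    return f"Hello, {name or 'world'}"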
app.py CHANGED
@@ -1,6 +1,7 @@
+from __future__ import annotations
+
 import os
 
-from typing import Union
 import gradio as gr
 import numpy as np
 import torch
@@ -49,14 +50,14 @@ translator = Translator(
 def predict(
     task_name: str,
     audio_source: str,
-    input_audio_mic: Union[str, None],
-    input_audio_file: Union[str, None],
-    input_text: Union[str, None],
-    source_language: Union[str, None],
+    input_audio_mic: str | None,
+    input_audio_file: str | None,
+    input_text: str | None,
+    source_language: str | None,
     target_language: str,
-) -> tuple[str, str]:
+) -> tuple[tuple[int, np.ndarray] | None, str]:
     task_name = task_name.split()[0]
-    source_language_code = LANGUAGE_NAME_TO_CODE[source_language]
+    source_language_code = LANGUAGE_NAME_TO_CODE[source_language] if source_language else None
     target_language_code = LANGUAGE_NAME_TO_CODE[target_language]
 
     if task_name in ["S2ST", "S2TT", "ASR"]:
@@ -79,6 +80,7 @@ def predict(
         task_str=task_name,
         tgt_lang=target_language_code,
         src_lang=source_language_code,
+        ngram_filtering=True,
     )
     if task_name in ["S2ST", "T2ST"]:
         return (sr, wav.cpu().detach().numpy()), text_out
@@ -86,7 +88,7 @@ def predict(
     return None, text_out
 
 
-def process_s2st_example(input_audio_file: str, target_language: str) -> tuple[str, str]:
+def process_s2st_example(input_audio_file: str, target_language: str) -> tuple[tuple[int, np.ndarray] | None, str]:
     return predict(
         task_name="S2ST",
         audio_source="file",
@@ -98,7 +100,7 @@ def process_s2st_example(input_audio_file: str, target_language: str) -> tuple[s
     )
 
 
-def process_s2tt_example(input_audio_file: str, target_language: str) -> tuple[str, str]:
+def process_s2tt_example(input_audio_file: str, target_language: str) -> tuple[tuple[int, np.ndarray] | None, str]:
     return predict(
         task_name="S2TT",
         audio_source="file",
@@ -110,7 +112,9 @@ def process_s2tt_example(input_audio_file: str, target_language: str) -> tuple[s
     )
 
 
-def process_t2st_example(input_text: str, source_language: str, target_language: str) -> tuple[str, str]:
+def process_t2st_example(
+    input_text: str, source_language: str, target_language: str
+) -> tuple[tuple[int, np.ndarray] | None, str]:
     return predict(
         task_name="T2ST",
         audio_source="",
@@ -122,7 +126,9 @@ def process_t2st_example(input_text: str, source_language: str, target_language:
     )
 
 
-def process_t2tt_example(input_text: str, source_language: str, target_language: str) -> tuple[str, str]:
+def process_t2tt_example(
+    input_text: str, source_language: str, target_language: str
+) -> tuple[tuple[int, np.ndarray] | None, str]:
     return predict(
         task_name="T2TT",
         audio_source="",
@@ -134,7 +140,7 @@ def process_t2tt_example(input_text: str, source_language: str, target_language:
     )
 
 
-def process_asr_example(input_audio_file: str, target_language: str) -> tuple[str, str]:
+def process_asr_example(input_audio_file: str, target_language: str) -> tuple[tuple[int, np.ndarray] | None, str]:
     return predict(
         task_name="ASR",
         audio_source="file",
@@ -317,10 +323,16 @@ with gr.Blocks(css="style.css") as demo:
         examples=[
             ["My favorite animal is the elephant.", "English", "French"],
             ["My favorite animal is the elephant.", "English", "Mandarin Chinese"],
-            [
-                …
-                …
-                …
+            [
+                "Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
+                "English",
+                "Hindi",
+            ],
+            [
+                "Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
+                "English",
+                "Spanish",
+            ],
         ],
         inputs=[input_text, source_language, target_language],
         outputs=[output_audio, output_text],
@@ -332,10 +344,16 @@ with gr.Blocks(css="style.css") as demo:
         examples=[
            ["My favorite animal is the elephant.", "English", "French"],
            ["My favorite animal is the elephant.", "English", "Mandarin Chinese"],
-            [
-                …
-                …
-                …
+            [
+                "Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
+                "English",
+                "Hindi",
+            ],
+            [
+                "Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
+                "English",
+                "Spanish",
+            ],
         ],
         inputs=[input_text, source_language, target_language],
         outputs=[output_audio, output_text],
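Beyond the annotation cleanup, the one behavioral change is the `ngram_filtering=True` keyword added to the `translator.predict(...)` call, which enables filtering of degenerate repeated n-grams in the model output. A sketch of the call site after this commit; the positional input argument and the `text_out, wav, sr` unpacking are assumptions inferred from how the surrounding code uses the results, while the keyword arguments are taken directly from the diff:

# Sketch only: the first argument and the result unpacking are assumed from context.
text_out, wav, sr = translator.predict(
    input_text,                     # or an audio file path for S2ST/S2TT/ASR
    task_str=task_name,             # "S2ST", "S2TT", "T2ST", "T2TT", or "ASR"
    tgt_lang=target_language_code,
    src_lang=source_language_code,
    ngram_filtering=True,           # suppress repeated n-grams in the output
)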