MiKAI13 and reach-vb (HF staff) committed
Commit 5ed6c4c
0 Parent(s)

Duplicate from facebook/seamless_m4t

Co-authored-by: Vaibhav Srivastav <[email protected]>

Files changed (9)
  1. .gitattributes +36 -0
  2. Dockerfile +56 -0
  3. README.md +12 -0
  4. app.py +434 -0
  5. assets/sample_input.mp3 +3 -0
  6. assets/sample_input_2.mp3 +3 -0
  7. lang_list.py +254 -0
  8. requirements.txt +6 -0
  9. style.css +16 -0
.gitattributes ADDED
@@ -0,0 +1,36 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,56 @@
+ FROM nvidia/cuda:11.7.1-cudnn8-devel-ubuntu22.04
+ ENV DEBIAN_FRONTEND=noninteractive
+ RUN apt-get update && \
+     apt-get upgrade -y && \
+     apt-get install -y --no-install-recommends \
+     git \
+     git-lfs \
+     wget \
+     curl \
+     # python build dependencies \
+     build-essential \
+     libssl-dev \
+     zlib1g-dev \
+     libbz2-dev \
+     libreadline-dev \
+     libsqlite3-dev \
+     libncursesw5-dev \
+     xz-utils \
+     tk-dev \
+     libxml2-dev \
+     libxmlsec1-dev \
+     libffi-dev \
+     liblzma-dev \
+     # gradio dependencies \
+     ffmpeg \
+     # fairseq2 dependencies \
+     libsndfile-dev && \
+     apt-get clean && \
+     rm -rf /var/lib/apt/lists/*
+
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:${PATH}
+ WORKDIR ${HOME}/app
+
+ RUN curl https://pyenv.run | bash
+ ENV PATH=${HOME}/.pyenv/shims:${HOME}/.pyenv/bin:${PATH}
+ ARG PYTHON_VERSION=3.10.12
+ RUN pyenv install ${PYTHON_VERSION} && \
+     pyenv global ${PYTHON_VERSION} && \
+     pyenv rehash && \
+     pip install --no-cache-dir -U pip setuptools wheel
+
+ COPY --chown=1000 ./requirements.txt /tmp/requirements.txt
+ RUN pip install --no-cache-dir --upgrade -r /tmp/requirements.txt
+
+ COPY --chown=1000 . ${HOME}/app
+ ENV PYTHONPATH=${HOME}/app \
+     PYTHONUNBUFFERED=1 \
+     GRADIO_ALLOW_FLAGGING=never \
+     GRADIO_NUM_PORTS=1 \
+     GRADIO_SERVER_NAME=0.0.0.0 \
+     GRADIO_THEME=huggingface \
+     SYSTEM=spaces
+ CMD ["python", "app.py"]
README.md ADDED
@@ -0,0 +1,12 @@
+ ---
+ title: Seamless M4T
+ emoji: 📞
+ colorFrom: blue
+ colorTo: yellow
+ sdk: docker
+ pinned: false
+ suggested_hardware: t4-medium
+ duplicated_from: facebook/seamless_m4t
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,434 @@
+ from __future__ import annotations
+
+ import os
+
+ import gradio as gr
+ import numpy as np
+ import torch
+ import torchaudio
+ from seamless_communication.models.inference.translator import Translator
+
+ from lang_list import (
+     LANGUAGE_NAME_TO_CODE,
+     S2ST_TARGET_LANGUAGE_NAMES,
+     S2TT_TARGET_LANGUAGE_NAMES,
+     T2TT_TARGET_LANGUAGE_NAMES,
+     TEXT_SOURCE_LANGUAGE_NAMES,
+ )
+
+ DESCRIPTION = """# SeamlessM4T
+
+ [SeamlessM4T](https://github.com/facebookresearch/seamless_communication) is designed to provide high-quality
+ translation, allowing people from different linguistic communities to communicate effortlessly through speech and text.
+
+ This unified model enables multiple tasks like Speech-to-Speech (S2ST), Speech-to-Text (S2TT), Text-to-Speech (T2ST)
+ translation and more, without relying on multiple separate models.
+ """
+
+ CACHE_EXAMPLES = os.getenv("CACHE_EXAMPLES") == "1"
+
+ TASK_NAMES = [
+     "S2ST (Speech to Speech translation)",
+     "S2TT (Speech to Text translation)",
+     "T2ST (Text to Speech translation)",
+     "T2TT (Text to Text translation)",
+     "ASR (Automatic Speech Recognition)",
+ ]
+ AUDIO_SAMPLE_RATE = 16000.0
+ MAX_INPUT_AUDIO_LENGTH = 60  # in seconds
+ DEFAULT_TARGET_LANGUAGE = "French"
+
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+ translator = Translator(
+     model_name_or_card="seamlessM4T_large",
+     vocoder_name_or_card="vocoder_36langs",
+     device=device,
+     sample_rate=AUDIO_SAMPLE_RATE,
+ )
+
+
+ def predict(
+     task_name: str,
+     audio_source: str,
+     input_audio_mic: str | None,
+     input_audio_file: str | None,
+     input_text: str | None,
+     source_language: str | None,
+     target_language: str,
+ ) -> tuple[tuple[int, np.ndarray] | None, str]:
+     task_name = task_name.split()[0]
+     source_language_code = LANGUAGE_NAME_TO_CODE[source_language] if source_language else None
+     target_language_code = LANGUAGE_NAME_TO_CODE[target_language]
+
+     if task_name in ["S2ST", "S2TT", "ASR"]:
+         if audio_source == "microphone":
+             input_data = input_audio_mic
+         else:
+             input_data = input_audio_file
+
+         arr, org_sr = torchaudio.load(input_data)
+         new_arr = torchaudio.functional.resample(arr, orig_freq=org_sr, new_freq=AUDIO_SAMPLE_RATE)
+         max_length = int(MAX_INPUT_AUDIO_LENGTH * AUDIO_SAMPLE_RATE)
+         if new_arr.shape[1] > max_length:
+             new_arr = new_arr[:, :max_length]
+             gr.Warning(f"Input audio is too long. Only the first {MAX_INPUT_AUDIO_LENGTH} seconds is used.")
+         torchaudio.save(input_data, new_arr, sample_rate=int(AUDIO_SAMPLE_RATE))
+     else:
+         input_data = input_text
+     text_out, wav, sr = translator.predict(
+         input=input_data,
+         task_str=task_name,
+         tgt_lang=target_language_code,
+         src_lang=source_language_code,
+         ngram_filtering=True,
+     )
+     if task_name in ["S2ST", "T2ST"]:
+         return (sr, wav.cpu().detach().numpy()), text_out
+     else:
+         return None, text_out
+
+
+ def process_s2st_example(input_audio_file: str, target_language: str) -> tuple[tuple[int, np.ndarray] | None, str]:
+     return predict(
+         task_name="S2ST",
+         audio_source="file",
+         input_audio_mic=None,
+         input_audio_file=input_audio_file,
+         input_text=None,
+         source_language=None,
+         target_language=target_language,
+     )
+
+
+ def process_s2tt_example(input_audio_file: str, target_language: str) -> tuple[tuple[int, np.ndarray] | None, str]:
+     return predict(
+         task_name="S2TT",
+         audio_source="file",
+         input_audio_mic=None,
+         input_audio_file=input_audio_file,
+         input_text=None,
+         source_language=None,
+         target_language=target_language,
+     )
+
+
+ def process_t2st_example(
+     input_text: str, source_language: str, target_language: str
+ ) -> tuple[tuple[int, np.ndarray] | None, str]:
+     return predict(
+         task_name="T2ST",
+         audio_source="",
+         input_audio_mic=None,
+         input_audio_file=None,
+         input_text=input_text,
+         source_language=source_language,
+         target_language=target_language,
+     )
+
+
+ def process_t2tt_example(
+     input_text: str, source_language: str, target_language: str
+ ) -> tuple[tuple[int, np.ndarray] | None, str]:
+     return predict(
+         task_name="T2TT",
+         audio_source="",
+         input_audio_mic=None,
+         input_audio_file=None,
+         input_text=input_text,
+         source_language=source_language,
+         target_language=target_language,
+     )
+
+
+ def process_asr_example(input_audio_file: str, target_language: str) -> tuple[tuple[int, np.ndarray] | None, str]:
+     return predict(
+         task_name="ASR",
+         audio_source="file",
+         input_audio_mic=None,
+         input_audio_file=input_audio_file,
+         input_text=None,
+         source_language=None,
+         target_language=target_language,
+     )
+
+
+ def update_audio_ui(audio_source: str) -> tuple[dict, dict]:
+     mic = audio_source == "microphone"
+     return (
+         gr.update(visible=mic, value=None),  # input_audio_mic
+         gr.update(visible=not mic, value=None),  # input_audio_file
+     )
+
+
+ def update_input_ui(task_name: str) -> tuple[dict, dict, dict, dict]:
+     task_name = task_name.split()[0]
+     if task_name == "S2ST":
+         return (
+             gr.update(visible=True),  # audio_box
+             gr.update(visible=False),  # input_text
+             gr.update(visible=False),  # source_language
+             gr.update(
+                 visible=True, choices=S2ST_TARGET_LANGUAGE_NAMES, value=DEFAULT_TARGET_LANGUAGE
+             ),  # target_language
+         )
+     elif task_name == "S2TT":
+         return (
+             gr.update(visible=True),  # audio_box
+             gr.update(visible=False),  # input_text
+             gr.update(visible=False),  # source_language
+             gr.update(
+                 visible=True, choices=S2TT_TARGET_LANGUAGE_NAMES, value=DEFAULT_TARGET_LANGUAGE
+             ),  # target_language
+         )
+     elif task_name == "T2ST":
+         return (
+             gr.update(visible=False),  # audio_box
+             gr.update(visible=True),  # input_text
+             gr.update(visible=True),  # source_language
+             gr.update(
+                 visible=True, choices=S2ST_TARGET_LANGUAGE_NAMES, value=DEFAULT_TARGET_LANGUAGE
+             ),  # target_language
+         )
+     elif task_name == "T2TT":
+         return (
+             gr.update(visible=False),  # audio_box
+             gr.update(visible=True),  # input_text
+             gr.update(visible=True),  # source_language
+             gr.update(
+                 visible=True, choices=T2TT_TARGET_LANGUAGE_NAMES, value=DEFAULT_TARGET_LANGUAGE
+             ),  # target_language
+         )
+     elif task_name == "ASR":
+         return (
+             gr.update(visible=True),  # audio_box
+             gr.update(visible=False),  # input_text
+             gr.update(visible=False),  # source_language
+             gr.update(
+                 visible=True, choices=S2TT_TARGET_LANGUAGE_NAMES, value=DEFAULT_TARGET_LANGUAGE
+             ),  # target_language
+         )
+     else:
+         raise ValueError(f"Unknown task: {task_name}")
+
+
+ def update_output_ui(task_name: str) -> tuple[dict, dict]:
+     task_name = task_name.split()[0]
+     if task_name in ["S2ST", "T2ST"]:
+         return (
+             gr.update(visible=True, value=None),  # output_audio
+             gr.update(value=None),  # output_text
+         )
+     elif task_name in ["S2TT", "T2TT", "ASR"]:
+         return (
+             gr.update(visible=False, value=None),  # output_audio
+             gr.update(value=None),  # output_text
+         )
+     else:
+         raise ValueError(f"Unknown task: {task_name}")
+
+
+ def update_example_ui(task_name: str) -> tuple[dict, dict, dict, dict, dict]:
+     task_name = task_name.split()[0]
+     return (
+         gr.update(visible=task_name == "S2ST"),  # s2st_example_row
+         gr.update(visible=task_name == "S2TT"),  # s2tt_example_row
+         gr.update(visible=task_name == "T2ST"),  # t2st_example_row
+         gr.update(visible=task_name == "T2TT"),  # t2tt_example_row
+         gr.update(visible=task_name == "ASR"),  # asr_example_row
+     )
+
+
+ with gr.Blocks(css="style.css") as demo:
+     gr.Markdown(DESCRIPTION)
+     gr.DuplicateButton(
+         value="Duplicate Space for private use",
+         elem_id="duplicate-button",
+         visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",
+     )
+     with gr.Group():
+         task_name = gr.Dropdown(
+             label="Task",
+             choices=TASK_NAMES,
+             value=TASK_NAMES[0],
+         )
+         with gr.Row():
+             source_language = gr.Dropdown(
+                 label="Source language",
+                 choices=TEXT_SOURCE_LANGUAGE_NAMES,
+                 value="English",
+                 visible=False,
+             )
+             target_language = gr.Dropdown(
+                 label="Target language",
+                 choices=S2ST_TARGET_LANGUAGE_NAMES,
+                 value=DEFAULT_TARGET_LANGUAGE,
+             )
+         with gr.Row() as audio_box:
+             audio_source = gr.Radio(
+                 label="Audio source",
+                 choices=["file", "microphone"],
+                 value="file",
+             )
+             input_audio_mic = gr.Audio(
+                 label="Input speech",
+                 type="filepath",
+                 source="microphone",
+                 visible=False,
+             )
+             input_audio_file = gr.Audio(
+                 label="Input speech",
+                 type="filepath",
+                 source="upload",
+                 visible=True,
+             )
+         input_text = gr.Textbox(label="Input text", visible=False)
+         btn = gr.Button("Translate")
+     with gr.Column():
+         output_audio = gr.Audio(
+             label="Translated speech",
+             autoplay=False,
+             streaming=False,
+             type="numpy",
+         )
+         output_text = gr.Textbox(label="Translated text")
+
+     with gr.Row(visible=True) as s2st_example_row:
+         s2st_examples = gr.Examples(
+             examples=[
+                 ["assets/sample_input.mp3", "French"],
+                 ["assets/sample_input.mp3", "Mandarin Chinese"],
+                 ["assets/sample_input_2.mp3", "Hindi"],
+                 ["assets/sample_input_2.mp3", "Spanish"],
+             ],
+             inputs=[input_audio_file, target_language],
+             outputs=[output_audio, output_text],
+             fn=process_s2st_example,
+             cache_examples=CACHE_EXAMPLES,
+         )
+     with gr.Row(visible=False) as s2tt_example_row:
+         s2tt_examples = gr.Examples(
+             examples=[
+                 ["assets/sample_input.mp3", "French"],
+                 ["assets/sample_input.mp3", "Mandarin Chinese"],
+                 ["assets/sample_input_2.mp3", "Hindi"],
+                 ["assets/sample_input_2.mp3", "Spanish"],
+             ],
+             inputs=[input_audio_file, target_language],
+             outputs=[output_audio, output_text],
+             fn=process_s2tt_example,
+             cache_examples=CACHE_EXAMPLES,
+         )
+     with gr.Row(visible=False) as t2st_example_row:
+         t2st_examples = gr.Examples(
+             examples=[
+                 ["My favorite animal is the elephant.", "English", "French"],
+                 ["My favorite animal is the elephant.", "English", "Mandarin Chinese"],
+                 [
+                     "Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
+                     "English",
+                     "Hindi",
+                 ],
+                 [
+                     "Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
+                     "English",
+                     "Spanish",
+                 ],
+             ],
+             inputs=[input_text, source_language, target_language],
+             outputs=[output_audio, output_text],
+             fn=process_t2st_example,
+             cache_examples=CACHE_EXAMPLES,
+         )
+     with gr.Row(visible=False) as t2tt_example_row:
+         t2tt_examples = gr.Examples(
+             examples=[
+                 ["My favorite animal is the elephant.", "English", "French"],
+                 ["My favorite animal is the elephant.", "English", "Mandarin Chinese"],
+                 [
+                     "Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
+                     "English",
+                     "Hindi",
+                 ],
+                 [
+                     "Meta AI's Seamless M4T model is democratising spoken communication across language barriers",
+                     "English",
+                     "Spanish",
+                 ],
+             ],
+             inputs=[input_text, source_language, target_language],
+             outputs=[output_audio, output_text],
+             fn=process_t2tt_example,
+             cache_examples=CACHE_EXAMPLES,
+         )
+     with gr.Row(visible=False) as asr_example_row:
+         asr_examples = gr.Examples(
+             examples=[
+                 ["assets/sample_input.mp3", "English"],
+                 ["assets/sample_input_2.mp3", "English"],
+             ],
+             inputs=[input_audio_file, target_language],
+             outputs=[output_audio, output_text],
+             fn=process_asr_example,
+             cache_examples=CACHE_EXAMPLES,
+         )
+
+     audio_source.change(
+         fn=update_audio_ui,
+         inputs=audio_source,
+         outputs=[
+             input_audio_mic,
+             input_audio_file,
+         ],
+         queue=False,
+         api_name=False,
+     )
+     task_name.change(
+         fn=update_input_ui,
+         inputs=task_name,
+         outputs=[
+             audio_box,
+             input_text,
+             source_language,
+             target_language,
+         ],
+         queue=False,
+         api_name=False,
+     ).then(
+         fn=update_output_ui,
+         inputs=task_name,
+         outputs=[output_audio, output_text],
+         queue=False,
+         api_name=False,
+     ).then(
+         fn=update_example_ui,
+         inputs=task_name,
+         outputs=[
+             s2st_example_row,
+             s2tt_example_row,
+             t2st_example_row,
+             t2tt_example_row,
+             asr_example_row,
+         ],
+         queue=False,
+         api_name=False,
+     )
+
+     btn.click(
+         fn=predict,
+         inputs=[
+             task_name,
+             audio_source,
+             input_audio_mic,
+             input_audio_file,
+             input_text,
+             source_language,
+             target_language,
+         ],
+         outputs=[output_audio, output_text],
+         api_name="run",
+     )
+ demo.queue(max_size=50).launch()
+
+ # Linking models to the space
+ # 'facebook/seamless-m4t-large'
+ # 'facebook/SONAR'
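
Because btn.click registers the prediction function with api_name="run", a running copy of this Space can also be queried programmatically. A minimal sketch using gradio_client (not pinned in requirements.txt; the Space id below is an assumption taken from the duplicated_from field):

# Sketch only: call the Space's "run" endpoint from Python.
# Assumes `pip install gradio_client` and a public, running Space;
# "facebook/seamless_m4t" is the upstream Space this repo duplicates.
from gradio_client import Client

client = Client("facebook/seamless_m4t")
audio_out, text_out = client.predict(
    "T2TT (Text to Text translation)",      # task_name
    "",                                     # audio_source (unused for text tasks)
    None,                                   # input_audio_mic
    None,                                   # input_audio_file
    "My favorite animal is the elephant.",  # input_text
    "English",                              # source_language
    "French",                               # target_language
    api_name="/run",
)
print(text_out)  # translated text; audio_out is None for T2TT
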
assets/sample_input.mp3 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:982369687f05bf8fcd6923c4ffcccda0fcce92f44eceae5a9d00a431f07ea87b
+ size 10272
assets/sample_input_2.mp3 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6a505a4641e3f5f0ddec9508832793aa20e63d2545530b66bc04a9bd19a742e6
+ size 30624
lang_list.py ADDED
@@ -0,0 +1,254 @@
+ # Language dict
+ language_code_to_name = {
+     "afr": "Afrikaans",
+     "amh": "Amharic",
+     "arb": "Modern Standard Arabic",
+     "ary": "Moroccan Arabic",
+     "arz": "Egyptian Arabic",
+     "asm": "Assamese",
+     "ast": "Asturian",
+     "azj": "North Azerbaijani",
+     "bel": "Belarusian",
+     "ben": "Bengali",
+     "bos": "Bosnian",
+     "bul": "Bulgarian",
+     "cat": "Catalan",
+     "ceb": "Cebuano",
+     "ces": "Czech",
+     "ckb": "Central Kurdish",
+     "cmn": "Mandarin Chinese",
+     "cym": "Welsh",
+     "dan": "Danish",
+     "deu": "German",
+     "ell": "Greek",
+     "eng": "English",
+     "est": "Estonian",
+     "eus": "Basque",
+     "fin": "Finnish",
+     "fra": "French",
+     "gaz": "West Central Oromo",
+     "gle": "Irish",
+     "glg": "Galician",
+     "guj": "Gujarati",
+     "heb": "Hebrew",
+     "hin": "Hindi",
+     "hrv": "Croatian",
+     "hun": "Hungarian",
+     "hye": "Armenian",
+     "ibo": "Igbo",
+     "ind": "Indonesian",
+     "isl": "Icelandic",
+     "ita": "Italian",
+     "jav": "Javanese",
+     "jpn": "Japanese",
+     "kam": "Kamba",
+     "kan": "Kannada",
+     "kat": "Georgian",
+     "kaz": "Kazakh",
+     "kea": "Kabuverdianu",
+     "khk": "Halh Mongolian",
+     "khm": "Khmer",
+     "kir": "Kyrgyz",
+     "kor": "Korean",
+     "lao": "Lao",
+     "lit": "Lithuanian",
+     "ltz": "Luxembourgish",
+     "lug": "Ganda",
+     "luo": "Luo",
+     "lvs": "Standard Latvian",
+     "mai": "Maithili",
+     "mal": "Malayalam",
+     "mar": "Marathi",
+     "mkd": "Macedonian",
+     "mlt": "Maltese",
+     "mni": "Meitei",
+     "mya": "Burmese",
+     "nld": "Dutch",
+     "nno": "Norwegian Nynorsk",
+     "nob": "Norwegian Bokm\u00e5l",
+     "npi": "Nepali",
+     "nya": "Nyanja",
+     "oci": "Occitan",
+     "ory": "Odia",
+     "pan": "Punjabi",
+     "pbt": "Southern Pashto",
+     "pes": "Western Persian",
+     "pol": "Polish",
+     "por": "Portuguese",
+     "ron": "Romanian",
+     "rus": "Russian",
+     "slk": "Slovak",
+     "slv": "Slovenian",
+     "sna": "Shona",
+     "snd": "Sindhi",
+     "som": "Somali",
+     "spa": "Spanish",
+     "srp": "Serbian",
+     "swe": "Swedish",
+     "swh": "Swahili",
+     "tam": "Tamil",
+     "tel": "Telugu",
+     "tgk": "Tajik",
+     "tgl": "Tagalog",
+     "tha": "Thai",
+     "tur": "Turkish",
+     "ukr": "Ukrainian",
+     "urd": "Urdu",
+     "uzn": "Northern Uzbek",
+     "vie": "Vietnamese",
+     "xho": "Xhosa",
+     "yor": "Yoruba",
+     "yue": "Cantonese",
+     "zlm": "Colloquial Malay",
+     "zsm": "Standard Malay",
+     "zul": "Zulu",
+ }
+ LANGUAGE_NAME_TO_CODE = {v: k for k, v in language_code_to_name.items()}
+
+ # Source langs: S2ST / S2TT / ASR don't need source lang
+ # T2TT / T2ST use this
+ text_source_language_codes = [
+     "afr",
+     "amh",
+     "arb",
+     "ary",
+     "arz",
+     "asm",
+     "azj",
+     "bel",
+     "ben",
+     "bos",
+     "bul",
+     "cat",
+     "ceb",
+     "ces",
+     "ckb",
+     "cmn",
+     "cym",
+     "dan",
+     "deu",
+     "ell",
+     "eng",
+     "est",
+     "eus",
+     "fin",
+     "fra",
+     "gaz",
+     "gle",
+     "glg",
+     "guj",
+     "heb",
+     "hin",
+     "hrv",
+     "hun",
+     "hye",
+     "ibo",
+     "ind",
+     "isl",
+     "ita",
+     "jav",
+     "jpn",
+     "kan",
+     "kat",
+     "kaz",
+     "khk",
+     "khm",
+     "kir",
+     "kor",
+     "lao",
+     "lit",
+     "lug",
+     "luo",
+     "lvs",
+     "mai",
+     "mal",
+     "mar",
+     "mkd",
+     "mlt",
+     "mni",
+     "mya",
+     "nld",
+     "nno",
+     "nob",
+     "npi",
+     "nya",
+     "ory",
+     "pan",
+     "pbt",
+     "pes",
+     "pol",
+     "por",
+     "ron",
+     "rus",
+     "slk",
+     "slv",
+     "sna",
+     "snd",
+     "som",
+     "spa",
+     "srp",
+     "swe",
+     "swh",
+     "tam",
+     "tel",
+     "tgk",
+     "tgl",
+     "tha",
+     "tur",
+     "ukr",
+     "urd",
+     "uzn",
+     "vie",
+     "yor",
+     "yue",
+     "zsm",
+     "zul",
+ ]
+ TEXT_SOURCE_LANGUAGE_NAMES = sorted([language_code_to_name[code] for code in text_source_language_codes])
+
+ # Target langs:
+ # S2ST / T2ST
+ s2st_target_language_codes = [
+     "eng",
+     "arb",
+     "ben",
+     "cat",
+     "ces",
+     "cmn",
+     "cym",
+     "dan",
+     "deu",
+     "est",
+     "fin",
+     "fra",
+     "hin",
+     "ind",
+     "ita",
+     "jpn",
+     "kor",
+     "mlt",
+     "nld",
+     "pes",
+     "pol",
+     "por",
+     "ron",
+     "rus",
+     "slk",
+     "spa",
+     "swe",
+     "swh",
+     "tel",
+     "tgl",
+     "tha",
+     "tur",
+     "ukr",
+     "urd",
+     "uzn",
+     "vie",
+ ]
+ S2ST_TARGET_LANGUAGE_NAMES = sorted([language_code_to_name[code] for code in s2st_target_language_codes])
+
+ # S2TT / ASR
+ S2TT_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES
+ # T2TT
+ T2TT_TARGET_LANGUAGE_NAMES = TEXT_SOURCE_LANGUAGE_NAMES
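
As a quick, illustrative sanity check of these mappings (a hypothetical script, not part of this commit; assumes lang_list.py is importable from the app directory):

# Hypothetical check of the lang_list.py mappings.
from lang_list import LANGUAGE_NAME_TO_CODE, S2ST_TARGET_LANGUAGE_NAMES

# Display names invert back to the three-letter codes the model expects.
assert LANGUAGE_NAME_TO_CODE["French"] == "fra"
assert LANGUAGE_NAME_TO_CODE["Mandarin Chinese"] == "cmn"
# Every speech-output target must resolve to a known code; the count lines
# up with the "vocoder_36langs" vocoder that app.py loads.
assert all(name in LANGUAGE_NAME_TO_CODE for name in S2ST_TARGET_LANGUAGE_NAMES)
print(len(S2ST_TARGET_LANGUAGE_NAMES))  # 36
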
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ fairseq2==0.1.0
+ git+https://github.com/facebookresearch/seamless_communication
+ gradio==3.40.1
+ huggingface_hub==0.16.4
+ torch==2.0.1
+ torchaudio==2.0.2
style.css ADDED
@@ -0,0 +1,16 @@
+ h1 {
+   text-align: center;
+ }
+
+ #duplicate-button {
+   margin: auto;
+   color: #fff;
+   background: #1565c0;
+   border-radius: 100vh;
+ }
+
+ #component-0 {
+   max-width: 730px;
+   margin: auto;
+   padding-top: 1.5rem;
+ }