Spaces:

facebook
/

seamless_m4t

Running on T4

App Files Files Community

Vaibhav Srivastav

hysts HF staff committed on Aug 21, 2023

Commit

8fb8950

•

1 Parent(s): da26cb0

Initial commit

Browse files

Co-authored-by: hysts <[email protected]>

Files changed (6) hide show

Dockerfile +65 -0
README.md +3 -2
app.py +221 -0
mlg_config.json +186 -0
requirements.txt +4 -0
style.css +16 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,65 @@

+# CUDA 11.7.1 / cuDNN 8 development image on Ubuntu 22.04.
+FROM nvidia/cuda:11.7.1-cudnn8-devel-ubuntu22.04
+# Keep apt from prompting for input during the build.
+ENV DEBIAN_FRONTEND=noninteractive
+# System packages: VCS/download tools, the libraries needed to compile Python
+# via pyenv, plus ffmpeg (gradio) and libsndfile (fairseq2). The apt lists are
+# removed in the same layer to keep the image small.
+RUN apt-get update && \
+    apt-get upgrade -y && \
+    apt-get install -y --no-install-recommends \
+    git \
+    git-lfs \
+    wget \
+    curl \
+    # python build dependencies \
+    build-essential \
+    libssl-dev \
+    zlib1g-dev \
+    libbz2-dev \
+    libreadline-dev \
+    libsqlite3-dev \
+    libncursesw5-dev \
+    xz-utils \
+    tk-dev \
+    libxml2-dev \
+    libxmlsec1-dev \
+    libffi-dev \
+    liblzma-dev \
+    # gradio dependencies \
+    ffmpeg \
+    # fairseq2 dependencies \
+    libsndfile-dev && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+# Everything below runs as an unprivileged user with UID 1000.
+RUN useradd -m -u 1000 user
+USER user
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:${PATH}
+WORKDIR ${HOME}/app
+# Install pyenv for that user and put its shims first on PATH.
+RUN curl https://pyenv.run | bash
+ENV PATH=${HOME}/.pyenv/shims:${HOME}/.pyenv/bin:${PATH}
+# Python version built by pyenv; override with --build-arg PYTHON_VERSION=...
+ARG PYTHON_VERSION=3.10.12
+RUN pyenv install ${PYTHON_VERSION} && \
+    pyenv global ${PYTHON_VERSION} && \
+    pyenv rehash && \
+    pip install --no-cache-dir -U pip setuptools wheel
+# fairseq2 0.1.0rc0 is pulled from TestPyPI via the extra index.
+# --no-cache-dir is used on every pip call so no pip cache ends up in a layer.
+RUN pip install --no-cache-dir torch==2.0.1 gradio==3.40.1 && \
+    pip install --no-cache-dir --extra-index-url https://test.pypi.org/simple/ fairseq2==0.1.0rc0
+# Clone seamless_communication with the GITHUB_TOKEN build secret, install it,
+# and delete the checkout in the same layer so neither the sources nor the
+# token-bearing .git/config remain in the image.
+RUN --mount=type=secret,id=GITHUB_TOKEN,mode=0444,required=true \
+    git clone https://$(cat /run/secrets/GITHUB_TOKEN)@github.com/fairinternal/seamless_communication && \
+    cd seamless_communication && \
+    pip install --no-cache-dir . && \
+    cd .. && \
+    rm -rf seamless_communication
+# Install the pinned requirements before copying the app so this layer is
+# reused when only application files change.
+COPY ./requirements.txt /tmp/requirements.txt
+RUN pip install --no-cache-dir --upgrade -r /tmp/requirements.txt
+COPY --chown=1000 . ${HOME}/app
+# Runtime configuration read by Python and Gradio.
+ENV PYTHONPATH=${HOME}/app \
+    PYTHONUNBUFFERED=1 \
+    GRADIO_ALLOW_FLAGGING=never \
+    GRADIO_NUM_PORTS=1 \
+    GRADIO_SERVER_NAME=0.0.0.0 \
+    GRADIO_THEME=huggingface \
+    SYSTEM=spaces
+CMD ["python", "app.py"]

README.md CHANGED Viewed

@@ -1,10 +1,11 @@
 ---
-title: Seamless M4t
-emoji: 🌖
 colorFrom: blue
 colorTo: yellow
 sdk: docker
 pinned: false
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Seamless M4T
+emoji: 📞
 colorFrom: blue
 colorTo: yellow
 sdk: docker
 pinned: false
+suggested_hardware: t4-medium
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,221 @@

+import json
+import os
+import gradio as gr
+import numpy as np
+import torch
+import torchaudio
+from seamless_communication.models.inference.translator import Translator
+DESCRIPTION = "# SeamlessM4T"
+with open("./mlg_config.json", "r") as f:
+    lang_idx_map = json.loads(f.read())
+LANGUAGES = lang_idx_map["multilingual"].keys()
+TASK_NAMES = [
+    "S2ST (Speech to Speech translation)",
+    "S2TT (Speech to Text translation)",
+    "T2ST (Text to Speech translation)",
+    "T2TT (Text to Text translation)",
+    "ASR (Automatic Speech Recognition)",
+]
+AUDIO_SAMPLE_RATE = 16000.0
+MAX_INPUT_AUDIO_LENGTH = 60  # in seconds
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+translator = Translator(
+    model_name_or_card="multitask_unity_large",
+    vocoder_name_or_card="vocoder_36langs",
+    device=device,
+    sample_rate=AUDIO_SAMPLE_RATE,
+)
+def predict(
+    task_name: str,
+    audio_source: str,
+    input_audio_mic: str,
+    input_audio_file: str,
+    input_text: str,
+    source_language: str,
+    target_language: str,
+) -> tuple[tuple[int, np.ndarray] | None, str]:
+    task_name = task_name.split()[0]
+    if task_name in ["S2ST", "S2TT", "ASR"]:
+        if audio_source == "microphone":
+            input_data = input_audio_mic
+        else:
+            input_data = input_audio_file
+        arr, org_sr = torchaudio.load(input_data)
+        new_arr = torchaudio.functional.resample(arr, orig_freq=org_sr, new_freq=AUDIO_SAMPLE_RATE)
+        max_length = int(MAX_INPUT_AUDIO_LENGTH * AUDIO_SAMPLE_RATE)
+        if new_arr.shape[1] > max_length:
+            new_arr = new_arr[:, :max_length]
+            gr.Warning(f"Input audio is too long. Only the first {MAX_INPUT_AUDIO_LENGTH} seconds is used.")
+        torchaudio.save(input_data, new_arr, sample_rate=int(AUDIO_SAMPLE_RATE))
+    else:
+        input_data = input_text
+    text_out, wav, sr = translator.predict(
+        input=input_data,
+        task_str=task_name,
+        tgt_lang=target_language,
+        src_lang=source_language,
+    )
+    if task_name in ["S2ST", "T2ST"]:
+        return (sr, wav.cpu().detach().numpy()), text_out
+    else:
+        return None, text_out
+def update_audio_ui(audio_source: str) -> tuple[dict, dict]:
+    mic = audio_source == "microphone"
+    return (
+        gr.update(visible=mic, value=None),  # input_audio_mic
+        gr.update(visible=not mic, value=None),  # input_audio_file
+    )
+def update_input_ui(task_name: str) -> tuple[dict, dict, dict, dict]:
+    task_name = task_name.split()[0]
+    if task_name in ["S2ST", "S2TT"]:
+        return (
+            gr.update(visible=True),  # audio_box
+            gr.update(visible=False),  # input_text
+            gr.update(visible=False),  # source_language
+            gr.update(visible=True),  # target_language
+        )
+    elif task_name in ["T2ST", "T2TT"]:
+        return (
+            gr.update(visible=False),  # audio_box
+            gr.update(visible=True),  # input_text
+            gr.update(visible=True),  # source_language
+            gr.update(visible=True),  # target_language
+        )
+    elif task_name == "ASR":
+        return (
+            gr.update(visible=True),  # audio_box
+            gr.update(visible=False),  # input_text
+            gr.update(visible=False),  # source_language
+            gr.update(visible=True),  # target_language
+        )
+    else:
+        raise ValueError(f"Unknown task: {task_name}")
+def update_output_ui(task_name: str) -> tuple[dict, dict]:
+    task_name = task_name.split()[0]
+    if task_name in ["S2ST", "T2ST"]:
+        return (
+            gr.update(visible=True, value=None),  # output_audio
+            gr.update(value=None),  # output_text
+        )
+    elif task_name in ["S2TT", "T2TT", "ASR"]:
+        return (
+            gr.update(visible=False, value=None),  # output_audio
+            gr.update(value=None),  # output_text
+        )
+    else:
+        raise ValueError(f"Unknown task: {task_name}")
+with gr.Blocks(css="style.css") as demo:  # page layout, styled by style.css
+    gr.Markdown(DESCRIPTION)
+    gr.DuplicateButton(
+        value="Duplicate Space for private use",
+        elem_id="duplicate-button",
+        visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",  # shown only when SHOW_DUPLICATE_BUTTON=1
+    )
+    with gr.Group():
+        task_name = gr.Dropdown(
+            label="Task",
+            choices=TASK_NAMES,
+            value=TASK_NAMES[0],  # first task is selected by default
+        )
+        with gr.Row():
+            source_language = gr.Dropdown(
+                label="Source language",
+                choices=LANGUAGES,
+                value="eng",
+                visible=False,  # hidden initially; update_input_ui toggles it per task
+            )
+            target_language = gr.Dropdown(
+                label="Target language",
+                choices=LANGUAGES,
+                value="fra",
+            )
+        with gr.Row() as audio_box:  # shown or hidden as a unit by update_input_ui
+            audio_source = gr.Radio(
+                label="Audio source",
+                choices=["file", "microphone"],
+                value="file",
+            )
+            input_audio_mic = gr.Audio(
+                label="Input speech",
+                type="filepath",
+                source="microphone",
+                visible=False,  # hidden because the default audio source is "file"
+            )
+            input_audio_file = gr.Audio(
+                label="Input speech",
+                type="filepath",
+                source="upload",
+                visible=True,
+            )
+        input_text = gr.Textbox(label="Input text", visible=False)  # toggled per task by update_input_ui
+        btn = gr.Button("Translate")
+        with gr.Column():
+            output_audio = gr.Audio(
+                label="Translated speech",
+                autoplay=False,
+                streaming=False,
+                type="numpy",
+            )
+            output_text = gr.Textbox(label="Translated text")
+    audio_source.change(  # swap between the microphone and upload widgets
+        fn=update_audio_ui,
+        inputs=audio_source,
+        outputs=[
+            input_audio_mic,
+            input_audio_file,
+        ],
+        queue=False,
+        api_name=False,
+    )
+    task_name.change(  # on task change, first adjust which inputs are visible...
+        fn=update_input_ui,
+        inputs=task_name,
+        outputs=[
+            audio_box,
+            input_text,
+            source_language,
+            target_language,
+        ],
+        queue=False,
+        api_name=False,
+    ).then(  # ...then clear the outputs and toggle the audio player
+        fn=update_output_ui,
+        inputs=task_name,
+        outputs=[output_audio, output_text],
+        queue=False,
+        api_name=False,
+    )
+    btn.click(  # run inference; the inputs list follows predict()'s parameter order
+        fn=predict,
+        inputs=[
+            task_name,
+            audio_source,
+            input_audio_mic,
+            input_audio_file,
+            input_text,
+            source_language,
+            target_language,
+        ],
+        outputs=[output_audio, output_text],
+        api_name="run",
+    )
+demo.queue(max_size=50).launch()  # enable the request queue (max_size=50) and start the server

mlg_config.json ADDED Viewed

	@@ -0,0 +1,186 @@

+{
+    "multilingual": {
+        "arb": 0,
+        "ben": 1,
+        "cat": 2,
+        "ces": 3,
+        "cmn": 4,
+        "cym": 5,
+        "dan": 6,
+        "deu": 7,
+        "eng": 8,
+        "est": 9,
+        "fin": 10,
+        "fra": 11,
+        "hin": 12,
+        "ind": 13,
+        "ita": 14,
+        "jpn": 15,
+        "kor": 16,
+        "mlt": 17,
+        "nld": 18,
+        "pes": 19,
+        "pol": 20,
+        "por": 21,
+        "ron": 22,
+        "rus": 23,
+        "slk": 24,
+        "spa": 25,
+        "swe": 26,
+        "swh": 27,
+        "tel": 28,
+        "tgl": 29,
+        "tha": 30,
+        "tur": 31,
+        "ukr": 32,
+        "urd": 33,
+        "uzn": 34,
+        "vie": 35
+    },
+    "multispkr": {
+        "arb": [
+            0
+        ],
+        "ben": [
+            2,
+            1
+        ],
+        "cat": [
+            3
+        ],
+        "ces": [
+            4
+        ],
+        "cmn": [
+            5
+        ],
+        "cym": [
+            6
+        ],
+        "dan": [
+            7,
+            8
+        ],
+        "deu": [
+            9
+        ],
+        "eng": [
+            10
+        ],
+        "est": [
+            11,
+            12,
+            13
+        ],
+        "fin": [
+            14
+        ],
+        "fra": [
+            15
+        ],
+        "hin": [
+            16
+        ],
+        "ind": [
+            17,
+            24,
+            18,
+            20,
+            19,
+            21,
+            23,
+            27,
+            26,
+            22,
+            25
+        ],
+        "ita": [
+            29,
+            28
+        ],
+        "jpn": [
+            30
+        ],
+        "kor": [
+            31
+        ],
+        "mlt": [
+            32,
+            33,
+            34
+        ],
+        "nld": [
+            35
+        ],
+        "pes": [
+            36
+        ],
+        "pol": [
+            37
+        ],
+        "por": [
+            38
+        ],
+        "ron": [
+            39
+        ],
+        "rus": [
+            40
+        ],
+        "slk": [
+            41
+        ],
+        "spa": [
+            42
+        ],
+        "swe": [
+            43,
+            45,
+            44
+        ],
+        "swh": [
+            46,
+            48,
+            47
+        ],
+        "tel": [
+            49
+        ],
+        "tgl": [
+            50
+        ],
+        "tha": [
+            51,
+            54,
+            55,
+            52,
+            53
+        ],
+        "tur": [
+            58,
+            57,
+            56
+        ],
+        "ukr": [
+            59
+        ],
+        "urd": [
+            60,
+            61,
+            62
+        ],
+        "uzn": [
+            63,
+            64,
+            65
+        ],
+        "vie": [
+            66,
+            67,
+            70,
+            71,
+            68,
+            69
+        ]
+    }
+}

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+gradio==3.40.1
+huggingface_hub==0.16.4
+torch==2.0.1
+torchaudio==2.0.2

style.css ADDED Viewed

	@@ -0,0 +1,16 @@

+h1 {  /* centre top-level headings */
+  text-align: center;
+}
+#duplicate-button {  /* matches elem_id="duplicate-button" set in app.py */
+  margin: auto;
+  color: #fff;
+  background: #1565c0;
+  border-radius: 100vh;  /* very large radius gives fully rounded ends */
+}
+#component-0 {  /* presumably the outermost Gradio container; confirm the generated id */
+  max-width: 730px;
+  margin: auto;
+  padding-top: 1.5rem;
+}