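"""Gradio demo: speech-to-text translation (S2TT) with SeamlessM4T."""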
from __future__ import annotations
import os
import gradio as gr
import numpy as np
import torch
import torchaudio
from seamless_communication.models.inference import Translator
from lang_list import LANGUAGE_NAME_TO_CODE, S2TT_TARGET_LANGUAGE_NAMES
DESCRIPTION = """# Speech to Text Translation
[SeamlessM4T](https://github.com/facebookresearch/seamless_communication) is designed to provide high-quality
translation, allowing people from different linguistic communities to communicate effortlessly through speech and text.
This unified model enables multiple tasks like Speech-to-Speech (S2ST), Speech-to-Text (S2TT), Text-to-Speech (T2ST)
translation and more, without relying on multiple separate models. Here the task is to do the speech to text translation.
"""
CACHE_EXAMPLES = os.getenv("CACHE_EXAMPLES") == "1"
AUDIO_SAMPLE_RATE = 16000  # SeamlessM4T expects 16 kHz input audio
MAX_INPUT_AUDIO_LENGTH = 1800  # in seconds
DEFAULT_TARGET_LANGUAGE = "French"
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
translator = Translator(
model_name_or_card="seamlessM4T_medium",
vocoder_name_or_card="vocoder_36langs",
device=device,
)
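# Note: "seamlessM4T_medium" trades some quality for speed and memory; the
# larger "seamlessM4T_large" checkpoint (also paired with "vocoder_36langs")
# should be a drop-in alternative if more GPU memory is available.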
task_name = "S2TT (Speech to Text translation)"
task_name = task_name.split()[0]
def predict(
audio_source: str,
input_audio_mic: str | None,
input_audio_file: str | None,
target_language: str,
) -> str:
target_language_code = LANGUAGE_NAME_TO_CODE[target_language]
if audio_source == "microphone":
input_data = input_audio_mic
else:
input_data = input_audio_file
    # Load the audio, resample it to the model's expected rate, truncate
    # overly long clips, and write the result back to the same file path.
    arr, org_sr = torchaudio.load(input_data)
    new_arr = torchaudio.functional.resample(arr, orig_freq=org_sr, new_freq=AUDIO_SAMPLE_RATE)
    max_length = int(MAX_INPUT_AUDIO_LENGTH * AUDIO_SAMPLE_RATE)
    if new_arr.shape[1] > max_length:
        new_arr = new_arr[:, :max_length]
        gr.Warning(f"Input audio is too long. Only the first {MAX_INPUT_AUDIO_LENGTH} seconds is used.")
    torchaudio.save(input_data, new_arr, sample_rate=AUDIO_SAMPLE_RATE)
    # For S2TT the vocoder output (wav, sr) is unused; only text is returned.
    text_out, wav, sr = translator.predict(
        input=input_data,
        task_str=task_name,
        tgt_lang=target_language_code,
        ngram_filtering=True,
    )
    return str(text_out)
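# Example of calling predict() directly, bypassing the UI (the file path
# below is hypothetical):
#   predict("file", None, "sample.wav", "French")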
def update_audio_ui(audio_source: str) -> tuple[dict, dict]:
mic = audio_source == "microphone"
return (
gr.update(visible=mic, value=None), # input_audio_mic
gr.update(visible=not mic, value=None), # input_audio_file
)
with gr.Blocks(css="style.css") as demo:
gr.Markdown(DESCRIPTION)
with gr.Group():
with gr.Row():
target_language = gr.Dropdown(
label="Target language",
choices=S2TT_TARGET_LANGUAGE_NAMES,
)
with gr.Row() as audio_box:
audio_source = gr.Radio(
label="Audio source",
choices=["file", "microphone"],
value="file",
)
input_audio_mic = gr.Audio(
label="Input speech",
type="filepath",
source="microphone",
visible=False,
)
input_audio_file = gr.Audio(
label="Input speech",
type="filepath",
source="upload",
visible=True,
)
btn = gr.Button("Translate")
with gr.Column():
output_text = gr.Textbox(label="Translated text")
audio_source.change(
fn=update_audio_ui,
inputs=audio_source,
outputs=[
input_audio_mic,
input_audio_file,
],
queue=False,
api_name=False,
)
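    # queue=False skips the request queue and api_name=False hides this
    # purely cosmetic visibility toggle from the auto-generated API docs.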
btn.click(
fn=predict,
inputs=[
audio_source,
input_audio_mic,
input_audio_file,
target_language,
],
outputs=[output_text],
api_name="run",
)
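# A minimal launch; on a shared or GPU-backed deployment, demo.queue().launch()
# is a common alternative for serializing concurrent requests.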
demo.launch()