"""Gradio demo: turn a line of English text into speech with SeamlessM4T."""

import io
from typing import Optional

import gradio as gr
import scipy.io.wavfile as wavfile
import torch
from transformers import AutoProcessor, SeamlessM4TModel

# Checkpoint shared by the processor and the model.
MODEL_ID = "facebook/hf-seamless-m4t-medium"

# SeamlessM4T language codes. The demo reads English text and speaks English;
# change TGT_LANG to have the model translate while synthesizing.
SRC_LANG = "eng"
TGT_LANG = "eng"

# NOTE: despite its name this is the checkpoint's *processor* (it wraps the
# text tokenizer); the name is kept so existing references keep working.
tokenizer = AutoProcessor.from_pretrained(MODEL_ID)
model = SeamlessM4TModel.from_pretrained(MODEL_ID)

text = "some example text in the English language"


def greet(text: str) -> Optional[bytes]:
    """Synthesize speech for *text* and return it as WAV-encoded bytes.

    Args:
        text: The English sentence(s) to speak.

    Returns:
        The contents of a complete WAV file, or ``None`` when *text* is
        empty or whitespace-only (there is nothing to synthesize).
    """
    if not text or not text.strip():
        return None

    inputs = tokenizer(text=text, src_lang=SRC_LANG, return_tensors="pt")

    # Speech comes from ``generate``; the plain forward pass only yields
    # sequence-to-sequence outputs and has no waveform. The first element of
    # the result is the batch of waveforms.
    with torch.no_grad():
        speech = model.generate(**inputs, tgt_lang=TGT_LANG)[0]

    # Single-item batch -> 1-D NumPy array, which is what wavfile.write needs.
    samples = speech.cpu().numpy().squeeze()

    # Encode in memory: avoids a shared "tmp.wav" that concurrent requests
    # would overwrite, and leaves no file handle open.
    buffer = io.BytesIO()
    wavfile.write(buffer, rate=model.config.sampling_rate, data=samples)
    return buffer.getvalue()


iface = gr.Interface(fn=greet, inputs="text", outputs="audio")
iface.launch()