Spaces:
Running
on
Zero
Running
on
Zero
File size: 1,892 Bytes
7d1c060 d04b177 7d1c060 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 |
# `spaces` provides the @spaces.GPU decorator used below. It is imported
# first because ZeroGPU expects to be loaded before CUDA is initialized.
import spaces

import gradio as gr
import soundfile as sf
import torch
from diffusers import StableAudioPipeline
# Fetch the pretrained Stable Audio Open checkpoint in half precision and
# move the resulting pipeline onto the CUDA device in a single expression.
pipe = StableAudioPipeline.from_pretrained(
    "stabilityai/stable-audio-open-1.0",
    torch_dtype=torch.float16,
).to("cuda")
# Generate a sound from a text prompt and return the path of the saved WAV file
@spaces.GPU
def generate_sound(
    prompt: str,
    negative_prompt: str,
    seed: int,
    inference_steps: int,
    duration: float,
    waveforms: int,
) -> str:
    """Generate audio from a text prompt and save it as a WAV file.

    Args:
        prompt: Text describing the sound to generate.
        negative_prompt: Text describing what the sound should not contain.
        seed: Seed for the CUDA random generator; coerced to ``int``.
        inference_steps: Number of inference steps handed to the pipeline;
            coerced to ``int``.
        duration: Value passed to the pipeline as ``audio_end_in_s``.
        waveforms: Number of waveforms requested per prompt; coerced to
            ``int``. Only the first generated waveform is written to disk.

    Returns:
        The path ``"generated_sound.wav"``; the file is overwritten on
        every call.
    """
    # UI widgets may deliver whole numbers as floats (e.g. 42.0), while
    # ``manual_seed`` and the step/waveform counts require real integers.
    seed = int(seed)
    inference_steps = int(inference_steps)
    waveforms = int(waveforms)

    # Seed a CUDA generator so the same inputs reproduce the same audio.
    generator = torch.Generator("cuda").manual_seed(seed)

    # Run the audio generation.
    audio = pipe(
        prompt,
        negative_prompt=negative_prompt,
        num_inference_steps=inference_steps,
        audio_end_in_s=duration,
        num_waveforms_per_prompt=waveforms,
        generator=generator,
    ).audios

    # Keep only the first waveform, as a float32 NumPy array on the CPU.
    # NOTE(review): the transpose presumably converts the pipeline's
    # (channels, frames) layout to soundfile's (frames, channels) - confirm.
    output = audio[0].T.float().cpu().numpy()
    sf.write("generated_sound.wav", output, pipe.vae.sampling_rate)
    return "generated_sound.wav"
# Define the Gradio interface.
# Components are created from the top-level `gr` namespace with `value=` for
# initial values: the legacy `gr.inputs.*` namespace and its `default=`
# keyword were removed in Gradio 4, and this matches `gr.Audio` below.
# The input order must match the positional parameters of `generate_sound`.
app = gr.Interface(
    fn=generate_sound,
    inputs=[
        gr.Textbox(
            label="Text Prompt",
            placeholder="Describe the sound you'd like to generate...",
        ),
        gr.Textbox(
            label="Negative Prompt",
            placeholder="Describe what you don't want in the sound...",
        ),
        gr.Slider(label="Seed", minimum=0, maximum=10000, step=1, value=0),
        gr.Slider(label="Inference Steps", minimum=50, maximum=500, step=10, value=200),
        gr.Slider(label="Duration (seconds)", minimum=1.0, maximum=30.0, step=0.5, value=10.0),
        gr.Slider(label="Number of Waveforms", minimum=1, maximum=5, step=1, value=1),
    ],
    outputs=gr.Audio(label="Generated Sound"),
    # NOTE(review): the title says "Text-to-Speech" while the description
    # talks about generic audio - confirm the intended wording.
    title="StableAudio Text-to-Speech Generator",
    description="Generate high-quality audio from text using StableAudio.",
)

# Start the web server for the interface.
app.launch()