import argparse

import gradio as gr

from audiodiffusion import AudioDiffusion


def generate_spectrogram_audio_and_loop(audio_file, model_id):
    # Log inputs for debugging.
    print(audio_file)
    print(model_id)
    # Load the selected pre-trained model and run style transfer on the uploaded audio.
    audio_diffusion = AudioDiffusion(model_id=model_id)
    image, (sample_rate, audio) = audio_diffusion.generate_spectrogram_and_audio_from_audio(
        audio_file)
    # Try to extract a seamless loop; fall back to the full clip if none is found.
    loop = AudioDiffusion.loop_it(audio, sample_rate)
    if loop is None:
        loop = audio
    return image, (sample_rate, audio), (sample_rate, loop)


demo = gr.Interface(
    fn=generate_spectrogram_audio_and_loop,
    title="Audio Diffusion",
    description="Forked from https://huggingface.co./spaces/teticio/audio-diffusion. "
    "Performs style transfer on audio using Hugging Face diffusers, producing a "
    "5-second clip that carries elements of the uploaded audio. Generation takes "
    "about 2 hours without a GPU, so why not bake a cake in the meantime? (Or try "
    "the faster teticio/audio-diffusion-ddim-256 model.) The style-transfer code "
    "already existed in teticio's repo and notebooks; this space simply hooks it "
    "up to a Gradio interface. It still needs more testing; future work includes "
    "exposing the number of steps, adding inpainting and outpainting, and getting "
    "the API working with the updated pipelines.",
    inputs=[
        gr.Audio(source="upload", type="filepath"),
        gr.Dropdown(label="Model",
                    choices=[
                        "teticio/audio-diffusion-256",
                        "teticio/audio-diffusion-breaks-256",
                        "teticio/audio-diffusion-instrumental-hiphop-256",
                        "teticio/audio-diffusion-ddim-256"
                    ],
                    value="teticio/audio-diffusion-256")
    ],
    outputs=[
        gr.Image(label="Mel spectrogram", image_mode="L"),
        gr.Audio(label="Audio"),
        gr.Audio(label="Loop"),
    ],
    allow_flagging="never")

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--port", type=int)
    # server_name is a hostname/IP, so parse it as a string (the original int type was a bug).
    parser.add_argument("--server", type=str)
    args = parser.parse_args()
    demo.launch(server_name=args.server or "0.0.0.0", server_port=args.port)
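
# Example usage, a minimal sketch assuming this script is saved as app.py (the
# filename, port, and host below are illustrative, not mandated by the source):
#
#   python app.py --port 7860 --server 0.0.0.0
#
# With no arguments, the app binds to 0.0.0.0 and lets Gradio pick its default port.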