Spaces: Running on Zero
Upload 8 files
- app.py +135 -91
- data/train_multi-event_v3.json +0 -0
- data/train_single-event_multi_v3.json +0 -0
- data/train_single-event_single_v3.json +0 -0
- llm_preprocess.py +108 -0
- pico_model.py +32 -1
- requirements.txt +30 -29
app.py
CHANGED
@@ -1,91 +1,135 @@
-
-import os
-import json
-import numpy as np
-import torch
-import soundfile as sf
-import gradio as gr
-from diffusers import DDPMScheduler
-from pico_model import PicoDiffusion
-from audioldm.variational_autoencoder.autoencoder import AutoencoderKL
-
-class dotdict(dict):
-    """dot.notation access to dictionary attributes"""
-    __getattr__ = dict.get
-    __setattr__ = dict.__setitem__
-    __delattr__ = dict.__delitem__
-
-class InferRunner:
-    def __init__(self, device):
-        vae_config = json.load(open("ckpts/ldm/vae_config.json"))
-        self.vae = AutoencoderKL(**vae_config).to(device)
-        vae_weights = torch.load("ckpts/ldm/pytorch_model_vae.bin", map_location=device)
-        self.vae.load_state_dict(vae_weights)
-
-        train_args = dotdict(json.loads(open("ckpts/pico_model/summary.jsonl").readlines()[0]))
-        self.pico_model = PicoDiffusion(
-            scheduler_name=train_args.scheduler_name,
-            unet_model_config_path=train_args.unet_model_config,
-            snr_gamma=train_args.snr_gamma,
-            freeze_text_encoder_ckpt="ckpts/laion_clap/630k-audioset-best.pt",
-            diffusion_pt="ckpts/pico_model/diffusion.pt",
-        ).eval().to(device)
-        self.scheduler = DDPMScheduler.from_pretrained(train_args.scheduler_name, subfolder="scheduler")
-
-device = "cuda" if torch.cuda.is_available() else "cpu"
-runner = InferRunner(device)
-event_list =
[Removed lines 38-91 of the old app.py did not survive extraction; only the truncated fragments "with" (old line 58) and "prompt =" (old line 68) are visible in the rendered diff.]
+
+import os
+import json
+import numpy as np
+import torch
+import soundfile as sf
+import gradio as gr
+from diffusers import DDPMScheduler
+from pico_model import PicoDiffusion
+from audioldm.variational_autoencoder.autoencoder import AutoencoderKL
+from llm_preprocess import get_event, preprocess_gemini, preprocess_gpt
+class dotdict(dict):
+    """dot.notation access to dictionary attributes"""
+    __getattr__ = dict.get
+    __setattr__ = dict.__setitem__
+    __delattr__ = dict.__delitem__
+
+class InferRunner:
+    def __init__(self, device):
+        vae_config = json.load(open("ckpts/ldm/vae_config.json"))
+        self.vae = AutoencoderKL(**vae_config).to(device)
+        vae_weights = torch.load("ckpts/ldm/pytorch_model_vae.bin", map_location=device)
+        self.vae.load_state_dict(vae_weights)
+
+        train_args = dotdict(json.loads(open("ckpts/pico_model/summary.jsonl").readlines()[0]))
+        self.pico_model = PicoDiffusion(
+            scheduler_name=train_args.scheduler_name,
+            unet_model_config_path=train_args.unet_model_config,
+            snr_gamma=train_args.snr_gamma,
+            freeze_text_encoder_ckpt="ckpts/laion_clap/630k-audioset-best.pt",
+            diffusion_pt="ckpts/pico_model/diffusion.pt",
+        ).eval().to(device)
+        self.scheduler = DDPMScheduler.from_pretrained(train_args.scheduler_name, subfolder="scheduler")
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+runner = InferRunner(device)
+event_list = get_event()
+def infer(caption, num_steps=200, guidance_scale=3.0, audio_len=16000*10):
+    with torch.no_grad():
+        latents = runner.pico_model.demo_inference(caption, runner.scheduler, num_steps=num_steps, guidance_scale=guidance_scale, num_samples_per_prompt=1, disable_progress=True)
+        mel = runner.vae.decode_first_stage(latents)
+        wave = runner.vae.decode_to_waveform(mel)[0][:audio_len]
+    outpath = f"output.wav"
+    sf.write(outpath, wave, samplerate=16000, subtype='PCM_16')
+    return outpath
+
+def preprocess(caption):
+    output = preprocess_gemini(caption)
+    return output, output
+
+with gr.Blocks() as demo:
+    with gr.Row():
+        gr.Markdown("## PicoAudio")
+    with gr.Row():
+        description_text = f"18 events: {', '.join(event_list)}"
+        gr.Markdown(description_text)
+
+    with gr.Row():
+        gr.Markdown("## Step1")
+    with gr.Row():
+        preprocess_description_text = f"preprocess: free-text to timestamp caption via LLM"
+        gr.Markdown(preprocess_description_text)
+    with gr.Row():
+        with gr.Column():
+            freetext_prompt = gr.Textbox(label="Prompt: Input your free-text caption here. (e.g. a dog barks three times.)",
+                                         value="a dog barks three times.",)
+            preprocess_run_button = gr.Button()
+            prompt = None
+        with gr.Column():
+            freetext_prompt_out = gr.Textbox(label="Preprocess output")
+    with gr.Row():
+        with gr.Column():
+            gr.Examples(
+                examples = [["spraying two times then gunshot three times."],
+                            ["a dog barks three times."],
+                            ["cow mooing two times."],],
+                inputs = [freetext_prompt],
+                outputs = [prompt]
+            )
+        with gr.Column():
+            pass
+
+
+    with gr.Row():
+        gr.Markdown("## Step2")
+    with gr.Row():
+        with gr.Column():
+            prompt = gr.Textbox(label="Prompt: Input your caption formatted as 'event1 at onset1-offset1_onset2-offset2 and event2 at onset1-offset1'.",
+                                value="spraying at 0.38-1.176_3.06-3.856 and gunshot at 1.729-3.729_4.367-6.367_7.031-9.031.",)
+            generate_run_button = gr.Button()
+            with gr.Accordion("Advanced options", open=False):
+                num_steps = gr.Slider(label="num_steps", minimum=1, maximum=300, value=200, step=1)
+                guidance_scale = gr.Slider(label="guidance_scale", minimum=0.1, maximum=8.0, value=3.0, step=0.1)
+        with gr.Column():
+            outaudio = gr.Audio()
+    # use the preprocess() wrapper so both the Step 2 prompt box and the preview box receive the result
+    preprocess_run_button.click(fn=preprocess, inputs=[freetext_prompt], outputs=[prompt, freetext_prompt_out])
+    generate_run_button.click(fn=infer, inputs=[prompt, num_steps, guidance_scale], outputs=[outaudio])
+
+    with gr.Row():
+        with gr.Column():
+            gr.Examples(
+                examples = [["spraying at 0.38-1.176_3.06-3.856 and gunshot at 1.729-3.729_4.367-6.367_7.031-9.031."],
+                            ["dog_barking at 0.562-2.562_4.25-6.25."],
+                            ["cow_mooing at 0.958-3.582_5.272-7.896."],],
+                inputs = [prompt, num_steps, guidance_scale],
+                outputs = [outaudio]
+            )
+        with gr.Column():
+            pass
+
+
+demo.launch()
+
+
+# description_text = f"18 events: {', '.join(event_list)}"
+# prompt = gr.Textbox(label="Prompt: Input your caption formatted as 'event1 at onset1-offset1_onset2-offset2 and event2 at onset1-offset1'.",
+#                     value="spraying at 0.38-1.176_3.06-3.856 and gunshot at 1.729-3.729_4.367-6.367_7.031-9.031.",)
+# outaudio = gr.Audio()
+# num_steps = gr.Slider(label="num_steps", minimum=1, maximum=300, value=200, step=1)
+# guidance_scale = gr.Slider(label="guidance_scale", minimum=0.1, maximum=8.0, value=3.0, step=0.1)
+# gr_interface = gr.Interface(
+#     fn=infer,
+#     inputs=[prompt, num_steps, guidance_scale],
+#     outputs=[outaudio],
+#     title="PicoAudio",
+#     description=description_text,
+#     allow_flagging=False,
+#     examples=[
+#         ["spraying at 0.38-1.176_3.06-3.856 and gunshot at 1.729-3.729_4.367-6.367_7.031-9.031."],
+#         ["dog_barking at 0.562-2.562_4.25-6.25."],
+#         ["cow_mooing at 0.958-3.582_5.272-7.896."],
+#     ],
+#     cache_examples="lazy",  # Turn on to cache.
+# )
+# gr_interface.queue(10).launch()
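
For orientation, here is a minimal sketch (not part of the commit) of how the two-step flow wired above could be exercised without the UI, e.g. from within app.py before demo.launch(). It assumes the ckpts/ checkpoints are present and an LLM API key is configured for llm_preprocess; the printed caption is illustrative only.

# Sketch only: drive Step 1 (LLM preprocessing) and Step 2 (diffusion inference) directly.
free_text = "a dog barks three times."
timestamp_caption = preprocess_gemini(free_text)   # Step 1: free text -> timestamp caption
print(timestamp_caption)                           # illustrative output: "dog_barking at 0.5-2.5_4.0-6.0."
wav_path = infer(timestamp_caption, num_steps=200, guidance_scale=3.0)  # Step 2: caption -> ~10 s waveform
print(wav_path)                                    # "output.wav"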
data/train_multi-event_v3.json
ADDED
The diff for this file is too large to render. See raw diff.
data/train_single-event_multi_v3.json
ADDED
The diff for this file is too large to render. See raw diff.
data/train_single-event_single_v3.json
ADDED
The diff for this file is too large to render. See raw diff.
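
These three training files are read line by line in llm_preprocess.get_prompt(), which expects each line to be a JSON object with a 'captions' field (free text) and an 'onset' field (timing format). A hedged sketch of that shape, with illustrative values rather than actual entries from the files:

# Hypothetical line shape, inferred only from how get_prompt() consumes these files;
# the field values below are illustrative, not copied from the data.
import json
line = '{"captions": "a dog barks two times", "onset": "dog_barking__0.5-2.5_4.0-6.0"}'
data = json.loads(line)
pair = f"{data['captions']}~{data['onset']}"   # the "input~output" pair appended to the LLM prompt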
llm_preprocess.py
ADDED
@@ -0,0 +1,108 @@
+"""
+At the command line, only need to run once to install the package via pip:
+
+$ pip install google-generativeai
+"""
+
+from pathlib import Path
+import os
+import json
+import re
+
+def get_event():
+    event_list = [
+        "burping_belching",          # 0
+        "car_horn_honking",          #
+        "cat_meowing",               #
+        "cow_mooing",                #
+        "dog_barking",               #
+        "door_knocking",             #
+        "door_slamming",             #
+        "explosion",                 #
+        "gunshot",                   # 8
+        "sheep_goat_bleating",       #
+        "sneeze",                    #
+        "spraying",                  #
+        "thump_thud",                #
+        "train_horn",                #
+        "tapping_clicking_clanking", #
+        "woman_laughing",            #
+        "duck_quacking",             # 16
+        "whistling",                 #
+    ]
+    return event_list
+
+def get_prompt():
+    train_json_list = ["data/train_multi-event_v3.json",
+                       "data/train_single-event_multi_v3.json",
+                       "data/train_single-event_single_v3.json"]
+    learn_pair = ""
+    for train_json in train_json_list:
+        with open(train_json, 'r') as train_file:
+            for idx, line in enumerate(train_file):
+                if idx >= 300: break
+                data = json.loads(line.strip())
+                learn_pair += f"{str(idx)}:{data['captions']}~{data['onset']}. "
+    preffix_prompt = "You need to convert the input sentence into the following standard timing format: 'event1--event2-- ... --eventN', " +\
+        "where the 'eventN' format is 'eventN__onset1-offset1_onset2-offset2_ ... _onsetK-offsetK'. " +\
+        "The 'onset-offset' inside needs to be determined based on common sense and the examples I provide, with a duration not less than 1 and not greater than 4. Every 'onsetk-offsetk' placeholder should be replaced by numbers. " +\
+        "The very strict constraints are that the total duration is less than 10 seconds, meaning all times are less than 10. It is preferred that events do not overlap as much as possible. " +\
+        "Now, I will provide you with 300 examples from the training set for your learning, each example in the format 'index: input~output'. " +\
+        learn_pair +\
+        f"You need to map events to 18 given events: {', '.join(get_event())}"
+    #print(preffix_prompt)
+    return preffix_prompt
+
+
+def postprocess(caption):
+    caption = caption.replace('__', ' at ').replace('--', ' and ')
+    return caption
+
+def preprocess_gemini(free_text_caption):
+    preffix_prompt = get_prompt()
+    import google.generativeai as genai
+    genai.configure(api_key=os.environ["GOOGLE_API_KEY"])  # hardcoded key redacted; read it from the environment
+
+    # Set up the model
+    generation_config = {
+        "temperature": 1,
+        "top_p": 0.95,
+        "top_k": 64,
+        "max_output_tokens": 8192,
+    }
+
+    model = genai.GenerativeModel(model_name="gemini-1.5-flash",
+                                  generation_config=generation_config,)
+
+    prompt_parts = [
+        preffix_prompt +
+        f"Please convert the following inputs into the standard timing format:{free_text_caption}. You should only output results in the standard timing format. Do not output anything other than the format and do not add symbols.",
+    ]
+
+    timestampCaption = model.generate_content(prompt_parts)
+
+    # output = "dog_barking at 0.562-2.562_4.25-6.25_7.01-8.21."
+    return postprocess(timestampCaption.text)  # .text extracts the string from the response object
+
+def preprocess_gpt(free_text_caption):
+    preffix_prompt = get_prompt()
+    from openai import OpenAI
+    client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])  # hardcoded key redacted; read it from the environment
+    completion_start = client.chat.completions.create(
+        model="gpt-4-1106-preview",
+        messages=[{
+            "role": "user",
+            "content":
+                preffix_prompt +
+                f"Please convert the following inputs into the standard timing format:{free_text_caption}. You should only output results in the standard timing format. Do not output anything other than the format and do not add symbols."
+        }]
+    )
+
+    timestampCaption = completion_start.choices[0].message.content
+    # output = "dog_barking at 0.562-2.562_4.25-6.25_7.01-8.21."
+    return postprocess(timestampCaption)
+
+if __name__ == "__main__":
+    caption = preprocess_gemini("spraying two times then gunshot three times.")
+    print(caption)
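
A small worked example of the conversion that postprocess() performs: it rewrites the separators of the LLM's standard timing format ('__' and '--') into the ' at ' / ' and ' caption style that app.py feeds to the diffusion model. The event names and times below are the same ones used in the Step 2 default prompt above.

# postprocess() only swaps separators; everything else passes through unchanged.
llm_output = "spraying__0.38-1.176_3.06-3.856--gunshot__1.729-3.729_4.367-6.367_7.031-9.031"
print(postprocess(llm_output))
# -> spraying at 0.38-1.176_3.06-3.856 and gunshot at 1.729-3.729_4.367-6.367_7.031-9.031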
pico_model.py
CHANGED
@@ -10,6 +10,37 @@ from diffusers import DDPMScheduler, UNet2DConditionModel
 
 from audioldm.audio.stft import TacotronSTFT
 from audioldm.variational_autoencoder.autoencoder import AutoencoderKL
+from audioldm.utils import default_audioldm_config, get_metadata
+
+
+
+def build_pretrained_models(name):
+    checkpoint = torch.load(get_metadata()[name]["path"], map_location="cpu")
+    scale_factor = checkpoint["state_dict"]["scale_factor"].item()
+
+    vae_state_dict = {k[18:]: v for k, v in checkpoint["state_dict"].items() if "first_stage_model." in k}
+
+    config = default_audioldm_config(name)
+    vae_config = config["model"]["params"]["first_stage_config"]["params"]
+    vae_config["scale_factor"] = scale_factor
+
+    vae = AutoencoderKL(**vae_config)
+    vae.load_state_dict(vae_state_dict)
+
+    fn_STFT = TacotronSTFT(
+        config["preprocessing"]["stft"]["filter_length"],
+        config["preprocessing"]["stft"]["hop_length"],
+        config["preprocessing"]["stft"]["win_length"],
+        config["preprocessing"]["mel"]["n_mel_channels"],
+        config["preprocessing"]["audio"]["sampling_rate"],
+        config["preprocessing"]["mel"]["mel_fmin"],
+        config["preprocessing"]["mel"]["mel_fmax"],
+    )
+
+    vae.eval()
+    fn_STFT.eval()
+
+    return vae, fn_STFT
 
 def _init_layer(layer):
     """Initialize a Linear or Convolutional layer. """
@@ -229,7 +260,7 @@ class PicoDiffusion(ClapText_Onset_2_Audio_Diffusion):
         ckpt = clap_load_state_dict(freeze_text_encoder_ckpt, skip_params=True)
         del_parameter_key = ["text_branch.embeddings.position_ids"]
         ckpt = {f"freeze_text_encoder.model.{k}":v for k, v in ckpt.items() if k not in del_parameter_key}
-        diffusion_ckpt = torch.load(diffusion_pt
+        diffusion_ckpt = torch.load(diffusion_pt)
         del diffusion_ckpt["class_emb.weight"]
        ckpt.update(diffusion_ckpt)
         self.load_state_dict(ckpt)
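
A hedged usage sketch for the newly added build_pretrained_models() helper: the argument must be a checkpoint name known to audioldm's get_metadata(), and that checkpoint must already be downloaded. The name used below is an assumption, not something this commit pins down.

# Sketch only: "audioldm-s-full" is an assumed get_metadata() key.
vae, fn_STFT = build_pretrained_models("audioldm-s-full")
print(type(vae).__name__, type(fn_STFT).__name__)   # AutoencoderKL TacotronSTFT
# Both objects come back in eval() mode, ready for mel decoding / STFT feature extraction.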
requirements.txt
CHANGED
@@ -1,29 +1,30 @@
-torch==2.0.1
-torchaudio==2.0.2
-torchvision==0.15.2
-transformers==4.37.2
-accelerate==0.26.1
-datasets==2.16.1
-diffusers==0.18.2
-einops==0.7.0
-h5py==3.10.0
-huggingface_hub==0.20.3
-importlib_metadata==7.0.1
-librosa==0.10.1
-matplotlib==3.8.2
-numpy==1.23.5
-omegaconf==2.0.6
-packaging==23.2
-pandas==2.2.0
-progressbar33==2.4
-protobuf==3.20.*
-resampy==0.4.2
-scikit_image==0.22.0
-scikit_learn==1.4.0
-scipy==1.12.0
-soundfile==0.12.1
-ssr_eval==0.0.7
-torchlibrosa==0.1.0
-tqdm==4.63.1
-laion-clap==1.1.4
-gradio
+torch==2.0.1
+torchaudio==2.0.2
+torchvision==0.15.2
+transformers==4.37.2
+accelerate==0.26.1
+datasets==2.16.1
+diffusers==0.18.2
+einops==0.7.0
+h5py==3.10.0
+huggingface_hub==0.20.3
+importlib_metadata==7.0.1
+librosa==0.10.1
+matplotlib==3.8.2
+numpy==1.23.5
+omegaconf==2.0.6
+packaging==23.2
+pandas==2.2.0
+progressbar33==2.4
+protobuf==3.20.*
+resampy==0.4.2
+scikit_image==0.22.0
+scikit_learn==1.4.0
+scipy==1.12.0
+soundfile==0.12.1
+ssr_eval==0.0.7
+torchlibrosa==0.1.0
+tqdm==4.63.1
+laion-clap==1.1.4
+gradio
+google-generativeai
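
The only substantive change here is the new google-generativeai dependency used by llm_preprocess.py; every other line reappears unchanged (the +30/-29 stat suggests the whole file was rewritten, most likely a line-ending change). A minimal check that the new dependency resolves, assuming the requirements have been installed:

# After `pip install -r requirements.txt`, the Gemini SDK should import cleanly.
import google.generativeai as genai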