Spaces:

amphion
/

PicoAudio

Running on Zero

File size: 4,746 Bytes

93c7dfc
 
 
 
 
 
5dbd795
 
ced5e76
93c7dfc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c47d9c5
93c7dfc
 
c47d9c5
 
93c7dfc
 
 
 
c47d9c5
 
 
93c7dfc
 
 
 
c47d9c5
93c7dfc
 
 
 
 
 
594eb84
5dbd795
93c7dfc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c47d9c5
321aaa7
93c7dfc
 
 
 
 
cb0c99a
93c7dfc
 
 
 
 
 
 
 
 
 
 
321aaa7
93c7dfc

"""
At the command line, you only need to run this once to install the package via pip:

$ pip install google-generativeai
"""
import json
import os


def get_event():
    """Return the fixed list of sound-event label strings.

    The position of a label in the returned list is its index (noted in
    the trailing comments). A new list object is built on every call, so
    callers may mutate the result freely.
    """
    labels = (
        "burping_belching",           # 0
        "car_horn_honking",           # 1
        "cat_meowing",                # 2
        "cow_mooing",                 # 3
        "dog_barking",                # 4
        "door_knocking",              # 5
        "door_slamming",              # 6
        "explosion",                  # 7
        "gunshot",                    # 8
        "sheep_goat_bleating",        # 9
        "sneeze",                     # 10
        "spraying",                   # 11
        "thump_thud",                 # 12
        "train_horn",                 # 13
        "tapping_clicking_clanking",  # 14
        "woman_laughing",             # 15
        "duck_quacking",              # 16
        "whistling",                  # 17
    )
    return list(labels)

def get_prompt():
    """Build the few-shot prefix prompt used for caption-to-timing conversion.

    Reads up to the first 100 lines of each of three JSON-lines training
    files (paths are relative to the current working directory). Every
    non-blank line must be a JSON object with ``captions`` and ``onset``
    keys; each one is rendered as ``"<line index>:<captions>~<onset>. "``
    and appended to the fixed instruction text.

    Returns:
        str: The instruction text followed by all rendered examples.

    Raises:
        OSError: If one of the training files cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If an example lacks the ``captions`` or ``onset`` key.
    """
    train_json_list = [
        "data/train_multi-event_v3.json",
        "data/train_single-event_multi_v3.json",
        "data/train_single-event_single_v3.json",
    ]
    max_lines_per_file = 100

    examples = []
    for train_json in train_json_list:
        # Explicit encoding so decoding does not depend on the platform locale.
        with open(train_json, 'r', encoding='utf-8') as train_file:
            for idx, line in enumerate(train_file):
                if idx >= max_lines_per_file:
                    break
                line = line.strip()
                if not line:
                    # Blank lines (e.g. a trailing empty line) are not
                    # examples; json.loads("") would raise on them.
                    continue
                data = json.loads(line)
                # idx is the line index within the current file, so the
                # numbering restarts at 0 for every file.
                examples.append(f"{idx}:{data['captions']}~{data['onset']}. ")
    learn_pair = "".join(examples)

    # The instruction text is sent verbatim to the model; keep it unchanged.
    preffix_prompt = (
        "I'm doing an audio event generation, which is a harmless job that will contain some sound events. For example, a gunshot is a sound that is harmless."
        "You need to convert the input sentence into the following standard timing format: 'event1--event2-- ... --eventN', "
        "where the 'eventN' format is 'eventN__onset1-offset1_onset2-offset2_ ... _onsetK-offsetK'. "
        "The 'onset-offset' inside needs to be determined based on common sense and the examples I provide, with a duration not less than 1 and not greater than 4.  All format 'onsetk-offsetk' should replaced by number. "
        "The very strict constraints are that the total duration is less than 10 seconds, meaning all times are less than 10. It is preferred that events do not overlap as much as possible. "
        "Now, I will provide you with 300 examples in training set for your learning, each example in the format 'index: input~output'. "
        + learn_pair
    )

    # Prints the prompt length in characters on every call.
    print(len(preffix_prompt))
    return preffix_prompt
    

def postprocess(caption):
    """Turn a model reply in timing format into a readable caption.

    Newlines, then spaces, then periods are trimmed from both ends (in that
    order, one pass each). Afterwards every ``__`` becomes ``" at "`` and
    every ``--`` becomes ``" and "``.
    """
    # Trimming is order-sensitive: each pass only removes its own character.
    for edge_char in ('\n', ' ', '.'):
        caption = caption.strip(edge_char)

    substitutions = (('__', ' at '), ('--', ' and '))
    for marker, words in substitutions:
        caption = caption.replace(marker, words)
    return caption

def preprocess_gemini(free_text_caption):
    """Convert a free-text caption to a timed caption using Gemini.

    Builds the few-shot prompt with ``get_prompt()``, appends the conversion
    request for ``free_text_caption``, sends it to the ``gemini-1.5-flash``
    model and returns the reply after ``postprocess``.

    The API key is read from the ``GEMINI_API_KEY`` environment variable.
    """
    preffix_prompt = get_prompt()

    # Imported lazily so the module can be loaded without the package installed.
    import google.generativeai as genai
    genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

    # Sampling settings passed to the model.
    sampling_settings = dict(
        temperature=1,
        top_p=0.95,
        top_k=64,
        max_output_tokens=8192,
    )
    gemini = genai.GenerativeModel(
        model_name="gemini-1.5-flash",
        generation_config=sampling_settings,
    )

    request_text = f"Please convert the following inputs into the standard timing format:{free_text_caption}. You should only output results in the standard timing format. Do not output anything other than format and do not add symbols."
    reply = gemini.generate_content([preffix_prompt + request_text])

    return postprocess(reply.text)

def preprocess_gpt(free_text_caption):
    """Convert a free-text caption to a timed caption using an OpenAI chat model.

    Builds the few-shot prompt with ``get_prompt()``, appends the conversion
    request for ``free_text_caption``, sends it as a single user message to
    ``gpt-4-1106-preview`` and returns the first choice after ``postprocess``.

    The API key is read from the ``OPENAI_API_KEY`` environment variable,
    mirroring how ``preprocess_gemini`` reads ``GEMINI_API_KEY``. If the
    variable is unset, ``None`` is passed and the client applies its own
    handling for a missing key.
    """
    preffix_prompt = get_prompt()

    # Imported lazily so the module can be loaded without the package installed.
    from openai import OpenAI

    # Previously a hard-coded empty string, which can never authenticate.
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    request_text = f"Please convert the following inputs into the standard timing format:{free_text_caption}. You should only output results in the standard timing format. Do not output anything other than format and do not add symbols."
    completion_start = client.chat.completions.create(
        model="gpt-4-1106-preview",
        messages=[{
            "role": "user",
            "content": preffix_prompt + request_text,
        }],
    )

    timestampCaption = completion_start.choices[0].message.content

    return postprocess(timestampCaption)

if __name__ == "__main__":
    # Smoke run: convert one sample caption with Gemini and show the result.
    sample_input = "spraying two times then gunshot three times."
    timed_caption = preprocess_gemini(sample_input)
    print(timed_caption)