YuE-music-generator-demo-zero

Running on Zero

App Files Files Community

KingNish committed on Jan 29

Commit

b8a38aa

1 Parent(s): 773a80a

modified: app.py

Browse files

Files changed (1) hide show

app.py +354 -113

app.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import gradio as gr
 import subprocess
-import os
 import shutil
 import tempfile
 import spaces
@@ -27,10 +27,10 @@ def install_flash_attn():
 # Install flash-attn
 install_flash_attn()
-from huggingface_hub import snapshot_download
 # Create xcodec_mini_infer folder
-folder_path = './inference/xcodec_mini_infer'
 # Create the folder if it doesn't exist
 if not os.path.exists(folder_path):
@@ -41,22 +41,347 @@ else:
 snapshot_download(
     repo_id = "m-a-p/xcodec_mini_infer",
-    local_dir = "./inference/xcodec_mini_infer"
 )
-# Change to the "inference" directory
-inference_dir = "./inference"
-try:
-    os.chdir(inference_dir)
-    print(f"Changed working directory to: {os.getcwd()}")
-except FileNotFoundError:
-    print(f"Directory not found: {inference_dir}")
-    exit(1)
 def empty_output_folder(output_dir):
     # List all files in the output directory
     files = os.listdir(output_dir)
     # Iterate over the files and remove them
     for file in files:
         file_path = os.path.join(output_dir, file)
@@ -70,54 +395,8 @@ def empty_output_folder(output_dir):
         except Exception as e:
             print(f"Error deleting file {file_path}: {e}")
-# Function to create a temporary file with string content
-def create_temp_file(content, prefix, suffix=".txt"):
-    temp_file = tempfile.NamedTemporaryFile(delete=False, mode="w", prefix=prefix, suffix=suffix)
-    # Ensure content ends with newline and normalize line endings
-    content = content.strip() + "\n\n"  # Add extra newline at end
-    content = content.replace("\r\n", "\n").replace("\r", "\n")
-    temp_file.write(content)
-    temp_file.close()
-    # Debug: Print file contents
-    print(f"\nContent written to {prefix}{suffix}:")
-    print(content)
-    print("---")
-    return temp_file.name
-def get_last_mp3_file(output_dir):
-    # List all files in the output directory
-    files = os.listdir(output_dir)
-    # Filter only .mp3 files
-    mp3_files = [file for file in files if file.endswith('.mp3')]
-    if not mp3_files:
-        print("No .mp3 files found in the output folder.")
-        return None
-    # Get the full path for the mp3 files
-    mp3_files_with_path = [os.path.join(output_dir, file) for file in mp3_files]
-    # Sort the files based on the modification time (most recent first)
-    mp3_files_with_path.sort(key=lambda x: os.path.getmtime(x), reverse=True)
-    # Return the most recent .mp3 file
-    return mp3_files_with_path[0]
-device = torch.device(f"cuda" if torch.cuda.is_available() else "cpu")
-model = AutoModelForCausalLM.from_pretrained(
-    "m-a-p/YuE-s1-7B-anneal-en-cot",
-    torch_dtype=torch.float16,
-    attn_implementation="flash_attention_2", # To enable flashattn, you have to install flash-attn
-    )
-model.to(device)
-model.eval()
 @spaces.GPU(duration=120)
-def infer(genre_txt_content, lyrics_txt_content, num_segments=2, max_new_tokens=200):
     # Ensure the output folder exists
     output_dir = "./output"
@@ -125,55 +404,17 @@ def infer(genre_txt_content, lyrics_txt_content, num_segments=2, max_new_tokens=
     print(f"Output folder ensured at: {output_dir}")
     empty_output_folder(output_dir)
-    # Command and arguments with optimized settings
-    command = [
-        "python", "infer.py",
-        "--stage1_model", model,
-        # "--stage2_model", "m-a-p/YuE-s2-1B-general",
-        "--genre_txt", f"{genre_txt_content}",
-        "--lyrics_txt", f"{lyrics_txt_content}",
-        "--run_n_segments", f"{num_segments}",
-        # "--stage2_batch_size", "4",
-        "--output_dir", f"{output_dir}",
-        "--cuda_idx", "0",
-        "--max_new_tokens", f"{max_new_tokens}",
-        # "--disable_offload_model"
-    ]
-    # Set up environment variables for CUDA with optimized settings
-    env = os.environ.copy()
-    # Execute the command
-    try:
-        subprocess.run(command, check=True, env=env)
-        print("Command executed successfully!")
-        # Check and print the contents of the output folder
-        output_files = os.listdir(output_dir)
-        if output_files:
-            print("Output folder contents:")
-            for file in output_files:
-                print(f"- {file}")
-            last_mp3 = get_last_mp3_file(output_dir)
-            if last_mp3:
-                print("Last .mp3 file:", last_mp3)
-                return last_mp3
-            else:
-                return None
-        else:
-            print("Output folder is empty.")
-            return None
-    except subprocess.CalledProcessError as e:
-        print(f"Error occurred: {e}")
         return None
-    finally:
-        # Clean up temporary files
-        print("Temporary files deleted.")
-# Gradio
 with gr.Blocks() as demo:
     with gr.Column():
@@ -182,7 +423,7 @@ with gr.Blocks() as demo:
         <div style="display:flex;column-gap:4px;">
             <a href="https://github.com/multimodal-art-projection/YuE">
                 <img src='https://img.shields.io/badge/GitHub-Repo-blue'>
-            </a>
             <a href="https://map-yue.github.io">
                 <img src='https://img.shields.io/badge/Project-Page-green'>
             </a>
@@ -195,7 +436,7 @@ with gr.Blocks() as demo:
             with gr.Column():
                 genre_txt = gr.Textbox(label="Genre")
                 lyrics_txt = gr.Textbox(label="Lyrics")
             with gr.Column():
                 if is_shared_ui:
                     num_segments = gr.Number(label="Number of Segments", value=2, interactive=True)
@@ -242,16 +483,16 @@ Through the highs and lows, I'mma keep it real
 Living out my dreams with this mic and a deal
                     """
                 ]
-            ],
              inputs = [genre_txt, lyrics_txt],
             outputs = [music_out],
             cache_examples = False,
             # cache_mode="lazy",
-            fn=infer
         )
     submit_btn.click(
-        fn = infer,
         inputs = [genre_txt, lyrics_txt, num_segments, max_new_tokens],
         outputs = [music_out]
     )

 import gradio as gr
 import subprocess
+import os
 import shutil
 import tempfile
 import spaces
 # Install flash-attn
 install_flash_attn()
+from huggingface_hub import snapshot_download
 # Create xcodec_mini_infer folder
+folder_path = './xcodec_mini_infer'
 # Create the folder if it doesn't exist
 if not os.path.exists(folder_path):
 snapshot_download(
     repo_id = "m-a-p/xcodec_mini_infer",
+    local_dir = "./xcodec_mini_infer"
 )
+# Add xcodec_mini_infer and descriptaudiocodec to sys path
+import sys
+sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'xcodec_mini_infer'))
+sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'xcodec_mini_infer', 'descriptaudiocodec'))
+import argparse
+import numpy as np
+import json
+from omegaconf import OmegaConf
+import torchaudio
+from torchaudio.transforms import Resample
+import soundfile as sf
+import uuid
+from tqdm import tqdm
+from einops import rearrange
+from codecmanipulator import CodecManipulator
+from mmtokenizer import _MMSentencePieceTokenizer
+from transformers import AutoTokenizer, AutoModelForCausalLM, LogitsProcessor, LogitsProcessorList
+import glob
+import time
+import copy
+from collections import Counter
+from models.soundstream_hubert_new import SoundStream
+from vocoder import build_codec_model, process_audio
+from post_process_audio import replace_low_freq_with_energy_matched
+import re
# --- Arguments and model loading, adapted from infer.py ---
# The parser is kept so the defaults stay in one place, but it is fed an empty
# argv below: inside the Gradio app there is no command line to read.
parser = argparse.ArgumentParser()

# Model configuration
parser.add_argument(
    "--stage1_model",
    type=str,
    default="m-a-p/YuE-s1-7B-anneal-en-cot",
    help="The model checkpoint path or identifier for the Stage 1 model.",
)
parser.add_argument(
    "--max_new_tokens",
    type=int,
    default=3000,
    help="The maximum number of new tokens to generate in one pass during text generation.",
)
parser.add_argument(
    "--run_n_segments",
    type=int,
    default=2,
    help="The number of segments to process during the generation.",
)

# Prompt (genre/lyrics are optional here because the UI supplies them per request)
parser.add_argument(
    "--genre_txt",
    type=str,
    default="",
    help="The file path to a text file containing genre tags that describe the musical style or characteristics (e.g., instrumental, genre, mood, vocal timbre, vocal gender). This is used as part of the generation prompt.",
)
parser.add_argument(
    "--lyrics_txt",
    type=str,
    default="",
    help="The file path to a text file containing the lyrics for the music generation. These lyrics will be processed and split into structured segments to guide the generation process.",
)
parser.add_argument(
    "--use_audio_prompt",
    action="store_true",
    help="If set, the model will use an audio file as a prompt during generation. The audio file should be specified using --audio_prompt_path.",
)
parser.add_argument(
    "--audio_prompt_path",
    type=str,
    default="",
    help="The file path to an audio file to use as a reference prompt when --use_audio_prompt is enabled.",
)
parser.add_argument(
    "--prompt_start_time",
    type=float,
    default=0.0,
    help="The start time in seconds to extract the audio prompt from the given audio file.",
)
parser.add_argument(
    "--prompt_end_time",
    type=float,
    default=30.0,
    help="The end time in seconds to extract the audio prompt from the given audio file.",
)

# Output
parser.add_argument(
    "--output_dir",
    type=str,
    default="./output",
    help="The directory where generated outputs will be saved.",
)
parser.add_argument(
    "--keep_intermediate",
    action="store_true",
    help="If set, intermediate outputs will be saved during processing.",
)
parser.add_argument(
    "--disable_offload_model",
    action="store_true",
    help="If set, the model will not be offloaded from the GPU to CPU after Stage 1 inference.",
)
parser.add_argument("--cuda_idx", type=int, default=0)

# Config for xcodec and upsampler
parser.add_argument(
    '--basic_model_config',
    default='./xcodec_mini_infer/final_ckpt/config.yaml',
    help='YAML files for xcodec configurations.',
)
parser.add_argument(
    '--resume_path',
    default='./xcodec_mini_infer/final_ckpt/ckpt_00360000.pth',
    help='Path to the xcodec checkpoint.',
)
parser.add_argument(
    '--config_path',
    type=str,
    default='./xcodec_mini_infer/decoders/config.yaml',
    help='Path to Vocos config file.',
)
parser.add_argument(
    '--vocal_decoder_path',
    type=str,
    default='./xcodec_mini_infer/decoders/decoder_131000.pth',
    help='Path to Vocos decoder weights.',
)
parser.add_argument(
    '--inst_decoder_path',
    type=str,
    default='./xcodec_mini_infer/decoders/decoder_151000.pth',
    help='Path to Vocos decoder weights.',
)
parser.add_argument(
    '-r',
    '--rescale',
    action='store_true',
    help='Rescale output to avoid clipping.',
)

# Parse an empty argv so only the defaults above are used.
args = parser.parse_args([])

if args.use_audio_prompt and not args.audio_prompt_path:
    raise FileNotFoundError("Please offer audio prompt filepath using '--audio_prompt_path', when you enable 'use_audio_prompt'!")

# Distinct names so these defaults do not collide with the loaded model object
# or with the per-request token budget coming from the UI.
model_name = args.stage1_model
cuda_idx = args.cuda_idx
max_new_tokens_config = args.max_new_tokens

stage1_output_dir = os.path.join(args.output_dir, "stage1")
os.makedirs(stage1_output_dir, exist_ok=True)

# Pick the configured CUDA device when one is available, otherwise the CPU.
device = torch.device(f"cuda:{cuda_idx}" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Tokenizer and codec-id helper
mmtokenizer = _MMSentencePieceTokenizer("./mm_tokenizer_v0.2_hf/tokenizer.model")
codectool = CodecManipulator("xcodec", 0, 1)

# Codec model: the class is looked up by the name stored in the YAML config.
# NOTE(security): eval() executes whatever string the config holds; this is
# only acceptable while the config file comes from a trusted source.
model_config = OmegaConf.load(args.basic_model_config)
codec_class = eval(model_config.generator.name)
codec_model = codec_class(**model_config.generator.config).to(device)

# Weights are read onto the CPU and copied into the already-placed module.
parameter_dict = torch.load(args.resume_path, map_location='cpu')
codec_model.load_state_dict(parameter_dict['codec_model'])
codec_model.eval()
class BlockTokenRangeProcessor(LogitsProcessor):
    """Logits processor that masks out a contiguous range of token ids.

    Every id in the half-open interval ``[start_id, end_id)`` has its score
    overwritten with negative infinity each time the processor is called.
    """

    def __init__(self, start_id, end_id):
        # Build the id list once; it is reused unchanged on every call.
        self.blocked_token_ids = [token_id for token_id in range(start_id, end_id)]

    def __call__(self, input_ids, scores):
        """Mask the blocked columns of ``scores`` in place and return it."""
        blocked_columns = self.blocked_token_ids
        scores[:, blocked_columns] = float("-inf")
        return scores
def load_audio_mono(filepath, sampling_rate=16000):
    """Load an audio file as a single-channel waveform at ``sampling_rate`` Hz.

    Args:
        filepath: Path handed to ``torchaudio.load``.
        sampling_rate: Target sample rate; the audio is resampled only when
            the file's own rate differs.

    Returns:
        The waveform averaged over its first axis (kept with size 1).
    """
    waveform, source_rate = torchaudio.load(filepath)
    # Down-mix by averaging over dim 0 while keeping that dimension.
    mono = waveform.mean(dim=0, keepdim=True)
    if source_rate == sampling_rate:
        return mono
    to_target_rate = Resample(orig_freq=source_rate, new_freq=sampling_rate)
    return to_target_rate(mono)
def split_lyrics(lyrics):
    """Split raw lyrics into normalised ``[tag]`` sections.

    A section starts at a ``[tag]`` marker (``tag`` made of word characters)
    and runs until a newline that is directly followed by the next ``[`` or
    by the end of the text. The final section may also simply run to the end
    of the text, so lyrics typed without a trailing newline keep their last
    section instead of silently losing it.

    Args:
        lyrics: The full lyrics text.

    Returns:
        A list of strings, one per section, each formatted as
        ``"[tag]\\n<stripped body>\\n\\n"``. Empty when no tag is found.
    """
    # First alternative: the original terminator (newline before "[" or end).
    # Second alternative: end of text without a trailing newline.
    pattern = r"\[(\w+)\](.*?)(?:\n(?=\[|\Z)|\Z)"
    segments = re.findall(pattern, lyrics, re.DOTALL)
    return [f"[{tag}]\n{body.strip()}\n\n" for tag, body in segments]
def generate_music(genres, lyrics_content, num_segments_run, max_new_tokens_run):
    """Generate a mixed song from genre tags and sectioned lyrics.

    The lyrics are split into ``[tag]`` sections and the stage-1 language
    model generates codec tokens section by section. The tokens are split
    into a vocal and an instrumental track, decoded to 16 kHz audio with
    ``codec_model``, upsampled by the vocoder decoders, and finally the two
    mixes are combined by ``replace_low_freq_with_energy_matched``.

    Args:
        genres: Genre/style tags; inserted into the instruction prompt and
            into the output file names.
        lyrics_content: Raw lyrics text made of ``[tag]`` sections.
        num_segments_run: Upper bound on the number of lyric sections to
            generate.
        max_new_tokens_run: ``max_new_tokens`` passed to ``model.generate``
            for every section.

    Returns:
        Path of the final mixed audio file, or ``None`` when the vocoder
        stems could not be mixed.

    Raises:
        ValueError: If no lyric section is selected for generation, or if the
            generated sequence has unbalanced start/end-of-audio tokens.
    """
    lyrics = split_lyrics(lyrics_content)

    # Instruction: prompt_texts[0] is the global prompt, the sections follow.
    full_lyrics = "\n".join(lyrics)
    prompt_texts = [f"Generate music from the given lyrics segment by segment.\n[Genre] {genres}\n{full_lyrics}"]
    prompt_texts += lyrics

    random_id = uuid.uuid4()

    # Suggested decoding config
    top_p = 0.93
    temperature = 1.0
    repetition_penalty = 1.2

    # Special tokens
    start_of_segment = mmtokenizer.tokenize('[start_of_segment]')
    end_of_segment = mmtokenizer.tokenize('[end_of_segment]')

    # NOTE(review): the bound is len(lyrics), not len(prompt_texts), so the
    # last lyric section is never generated. Kept as-is; confirm it is intended.
    run_n_segments = min(num_segments_run + 1, len(lyrics))
    if run_n_segments < 2:
        # Index 0 is only the instruction prompt; without at least one real
        # section nothing is generated and the code below would fail on None.
        raise ValueError(
            "Nothing to generate: need at least two lyric sections and "
            f"num_segments >= 1 (got {len(lyrics)} section(s), num_segments={num_segments_run})."
        )

    # ------------------------------------------------------------------
    # Stage 1: generate codec tokens section by section
    # ------------------------------------------------------------------
    raw_output = None
    for i, p in enumerate(tqdm(prompt_texts[:run_n_segments])):
        if i == 0:
            # The instruction prompt is prepended to the first section below.
            continue
        section_text = p.replace('[start_of_segment]', '').replace('[end_of_segment]', '')
        guidance_scale = 1.5 if i <= 1 else 1.2
        if i == 1:
            if args.use_audio_prompt:
                audio_prompt = load_audio_mono(args.audio_prompt_path)
                audio_prompt.unsqueeze_(0)
                with torch.no_grad():
                    raw_codes = codec_model.encode(audio_prompt.to(device), target_bw=0.5)
                raw_codes = raw_codes.transpose(0, 1)
                raw_codes = raw_codes.cpu().numpy().astype(np.int16)
                # Format audio prompt
                code_ids = codectool.npy2ids(raw_codes[0])
                audio_prompt_codec = code_ids[int(args.prompt_start_time * 50): int(args.prompt_end_time * 50)]  # 50 is tps of xcodec
                audio_prompt_codec_ids = [mmtokenizer.soa] + codectool.sep_ids + audio_prompt_codec + [mmtokenizer.eoa]
                sentence_ids = mmtokenizer.tokenize("[start_of_reference]") + audio_prompt_codec_ids + mmtokenizer.tokenize("[end_of_reference]")
                head_id = mmtokenizer.tokenize(prompt_texts[0]) + sentence_ids
            else:
                head_id = mmtokenizer.tokenize(prompt_texts[0])
            prompt_ids = head_id + start_of_segment + mmtokenizer.tokenize(section_text) + [mmtokenizer.soa] + codectool.sep_ids
        else:
            prompt_ids = end_of_segment + start_of_segment + mmtokenizer.tokenize(section_text) + [mmtokenizer.soa] + codectool.sep_ids

        prompt_ids = torch.as_tensor(prompt_ids).unsqueeze(0).to(device)
        input_ids = torch.cat([raw_output, prompt_ids], dim=1) if i > 1 else prompt_ids

        # Use window slicing in case the sequence exceeds the model context.
        # The room reserved for new tokens must match the budget actually
        # passed to generate(), not the unrelated argparse default.
        max_context = 16384 - max_new_tokens_run - 1
        if input_ids.shape[-1] > max_context:
            print(f'Section {i}: output length {input_ids.shape[-1]} exceeding context length {max_context}, now using the last {max_context} tokens.')
            input_ids = input_ids[:, -(max_context):]

        with torch.no_grad():
            output_seq = model.generate(
                input_ids=input_ids,
                max_new_tokens=max_new_tokens_run,
                min_new_tokens=100,
                do_sample=True,
                top_p=top_p,
                temperature=temperature,
                repetition_penalty=repetition_penalty,
                eos_token_id=mmtokenizer.eoa,
                pad_token_id=mmtokenizer.eoa,
                # NOTE(review): the second range (32016, 32016) is empty and
                # therefore blocks nothing; kept unchanged, confirm the intent.
                logits_processor=LogitsProcessorList([BlockTokenRangeProcessor(0, 32002), BlockTokenRangeProcessor(32016, 32016)]),
                guidance_scale=guidance_scale,
                )
            # Close the audio span when generation stopped on the token budget.
            if output_seq[0][-1].item() != mmtokenizer.eoa:
                tensor_eoa = torch.as_tensor([[mmtokenizer.eoa]]).to(model.device)
                output_seq = torch.cat((output_seq, tensor_eoa), dim=1)

        if i > 1:
            raw_output = torch.cat([raw_output, prompt_ids, output_seq[:, input_ids.shape[-1]:]], dim=1)
        else:
            raw_output = output_seq
        # Running length of the accumulated token sequence.
        print(raw_output.shape[-1])

    # ------------------------------------------------------------------
    # Sanity check and split into vocal / instrumental code tracks
    # ------------------------------------------------------------------
    ids = raw_output[0].cpu().numpy()
    soa_idx = np.where(ids == mmtokenizer.soa)[0].tolist()
    eoa_idx = np.where(ids == mmtokenizer.eoa)[0].tolist()
    if len(soa_idx) != len(eoa_idx):
        raise ValueError(f'invalid pairs of soa and eoa, Num of soa: {len(soa_idx)}, Num of eoa: {len(eoa_idx)}')

    vocals = []
    instrumentals = []
    # With an audio prompt the first soa/eoa pair is the reference, not output.
    range_begin = 1 if args.use_audio_prompt else 0
    for soa, eoa in zip(soa_idx[range_begin:], eoa_idx[range_begin:]):
        codec_ids = ids[soa + 1:eoa]
        # Drop a leading 32016 token when present.
        if codec_ids.size and codec_ids[0] == 32016:
            codec_ids = codec_ids[1:]
        # Keep an even number of ids so they de-interleave into two rows.
        codec_ids = codec_ids[:2 * (codec_ids.shape[0] // 2)]
        vocal_row, inst_row = rearrange(codec_ids, "(n b) -> b n", b=2)
        vocals.append(codectool.ids2npy(vocal_row))
        instrumentals.append(codectool.ids2npy(inst_row))
    vocals = np.concatenate(vocals, axis=1)
    instrumentals = np.concatenate(instrumentals, axis=1)

    # Genre text is user input: keep it from introducing path components.
    safe_genres = genres.replace(' ', '-')
    for sep in filter(None, {os.sep, os.altsep}):
        safe_genres = safe_genres.replace(sep, '-')

    def track_name(role, ext):
        """Build the file name for one track role ('vocal', 'instrumental', 'mixed')."""
        stem = f"cot_{safe_genres}_tp{top_p}_T{temperature}_rp{repetition_penalty}_maxtk{max_new_tokens_run}_{role}_{random_id}"
        return stem.replace('.', '@') + ext

    # The caller empties the output folder before each run, so make sure the
    # stage-1 directory created at import time still exists.
    os.makedirs(stage1_output_dir, exist_ok=True)
    vocal_save_path = os.path.join(stage1_output_dir, track_name('vocal', '.npy'))
    inst_save_path = os.path.join(stage1_output_dir, track_name('instrumental', '.npy'))
    np.save(vocal_save_path, vocals)
    np.save(inst_save_path, instrumentals)

    # The stage-1 model is deliberately kept loaded for the next request.

    print("Converting to Audio...")

    def save_audio(wav: torch.Tensor, path, sample_rate: int, rescale: bool = False):
        """Write ``wav`` to ``path``, rescaling or clamping to avoid clipping."""
        os.makedirs(os.path.dirname(path), exist_ok=True)
        limit = 0.99
        max_val = wav.abs().max()
        wav = wav * min(limit / max_val, 1) if rescale else wav.clamp(-limit, limit)
        torchaudio.save(str(path), wav, sample_rate=sample_rate, encoding='PCM_S', bits_per_sample=16)

    # ------------------------------------------------------------------
    # Reconstruct 16 kHz tracks with the codec decoder
    # ------------------------------------------------------------------
    recons_output_dir = os.path.join(args.output_dir, "recons")
    recons_mix_dir = os.path.join(recons_output_dir, 'mix')
    os.makedirs(recons_mix_dir, exist_ok=True)
    # Roles are tracked explicitly instead of searching file names for
    # "instrumental", which misfires when the genre text contains that word.
    recons_paths = {}
    for role, codes in (('vocal', vocals), ('instrumental', instrumentals)):
        with torch.no_grad():
            decoded_waveform = codec_model.decode(torch.as_tensor(codes.astype(np.int16), dtype=torch.long).unsqueeze(0).permute(1, 0, 2).to(device))
        decoded_waveform = decoded_waveform.cpu().squeeze(0)
        save_path = os.path.join(recons_output_dir, track_name(role, '.mp3'))
        save_audio(decoded_waveform, save_path, 16000)
        recons_paths[role] = save_path

    # Mix the reconstructed tracks (best effort, as before).
    mixed_name = track_name('mixed', '.mp3')
    recons_mix = os.path.join(recons_mix_dir, mixed_name)
    recons_mix_ok = False
    try:
        instrumental_stem, sr = sf.read(recons_paths['instrumental'])
        vocal_stem, _ = sf.read(recons_paths['vocal'])
        sf.write(recons_mix, vocal_stem + instrumental_stem, sr)
        recons_mix_ok = True
    except Exception as e:
        print(e)

    # ------------------------------------------------------------------
    # Vocoder: upsample both stems
    # ------------------------------------------------------------------
    vocal_decoder, inst_decoder = build_codec_model(args.config_path, args.vocal_decoder_path, args.inst_decoder_path)
    vocoder_output_dir = os.path.join(args.output_dir, 'vocoder')
    vocoder_stems_dir = os.path.join(vocoder_output_dir, 'stems')
    vocoder_mix_dir = os.path.join(vocoder_output_dir, 'mix')
    os.makedirs(vocoder_mix_dir, exist_ok=True)
    os.makedirs(vocoder_stems_dir, exist_ok=True)

    vocal_output = process_audio(
        vocal_save_path,
        os.path.join(vocoder_stems_dir, 'vocal.mp3'),
        args.rescale,
        args,
        vocal_decoder,
        codec_model
    )
    instrumental_output = process_audio(
        inst_save_path,
        os.path.join(vocoder_stems_dir, 'instrumental.mp3'),
        args.rescale,
        args,
        inst_decoder,
        codec_model
    )

    # Mix the vocoder stems into their own directory so the 16 kHz
    # reconstruction mix is not overwritten before post-processing reads it.
    vocoder_mix = os.path.join(vocoder_mix_dir, mixed_name)
    try:
        mix_output = instrumental_output + vocal_output
        save_audio(mix_output, vocoder_mix, 44100, args.rescale)
        print(f"Created mix: {vocoder_mix}")
    except RuntimeError as e:
        print(e)
        print(f"mix {vocoder_mix} failed! inst: {instrumental_output.shape}, vocal: {vocal_output.shape}")
        # Without a vocoder mix there is nothing to post-process or return.
        return None

    # ------------------------------------------------------------------
    # Post process
    # ------------------------------------------------------------------
    final_output_path = os.path.join(args.output_dir, mixed_name)
    if recons_mix_ok:
        replace_low_freq_with_energy_matched(
            a_file=recons_mix,      # mix of the 16 kHz reconstructions
            b_file=vocoder_mix,     # mix of the vocoder outputs
            c_file=final_output_path,
            cutoff_freq=5500.0
        )
    else:
        # The reconstruction mix could not be produced; deliver the vocoder
        # mix unchanged rather than failing the whole request.
        print("16 kHz reconstruction mix unavailable; returning the vocoder mix without low-frequency post-processing.")
        shutil.copyfile(vocoder_mix, final_output_path)
    print("All process Done")
    return final_output_path
# Stage-1 language model, loaded once at import time and shared by every
# request handled by the Gradio callbacks. The checkpoint comes from
# `model_name` (args.stage1_model) so the configured default is the single
# source of truth instead of a second hard-coded identifier.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    attn_implementation="flash_attention_2",  # requires the flash-attn package
)
model = model.to(device)
model.eval()
 def empty_output_folder(output_dir):
     # List all files in the output directory
     files = os.listdir(output_dir)
     # Iterate over the files and remove them
     for file in files:
         file_path = os.path.join(output_dir, file)
         except Exception as e:
             print(f"Error deleting file {file_path}: {e}")
 @spaces.GPU(duration=120)
+def infer_gradio(genre_txt_content, lyrics_txt_content, num_segments=2, max_new_tokens=200): # Modified: Renamed infer to infer_gradio to avoid conflict
     # Ensure the output folder exists
     output_dir = "./output"
     print(f"Output folder ensured at: {output_dir}")
     empty_output_folder(output_dir)
+    # Call the generation function directly
+    output_audio_path = generate_music(genre_txt_content, lyrics_txt_content, int(num_segments), int(max_new_tokens)) # Modified: Call generate_music and pass num_segments and max_new_tokens as int
+    if output_audio_path and os.path.exists(output_audio_path):
+        print("Generated audio file:", output_audio_path)
+        return output_audio_path
+    else:
+        print("No audio file generated or path is invalid.")
         return None
 with gr.Blocks() as demo:
     with gr.Column():
         <div style="display:flex;column-gap:4px;">
             <a href="https://github.com/multimodal-art-projection/YuE">
                 <img src='https://img.shields.io/badge/GitHub-Repo-blue'>
+            </a>
             <a href="https://map-yue.github.io">
                 <img src='https://img.shields.io/badge/Project-Page-green'>
             </a>
             with gr.Column():
                 genre_txt = gr.Textbox(label="Genre")
                 lyrics_txt = gr.Textbox(label="Lyrics")
             with gr.Column():
                 if is_shared_ui:
                     num_segments = gr.Number(label="Number of Segments", value=2, interactive=True)
 Living out my dreams with this mic and a deal
                     """
                 ]
+            ],
              inputs = [genre_txt, lyrics_txt],
             outputs = [music_out],
             cache_examples = False,
             # cache_mode="lazy",
+            fn=infer_gradio # Modified: Use infer_gradio
         )
     submit_btn.click(
+        fn = infer_gradio, # Modified: Use infer_gradio
         inputs = [genre_txt, lyrics_txt, num_segments, max_new_tokens],
         outputs = [music_out]
     )