Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -119,6 +119,35 @@ def split_lyrics(lyrics: str):
     return structured_lyrics

 @spaces.GPU(duration=175)
+def requires_cuda(input_ids, max_new_tokens, top_p, temperature, repetition_penalty, guidance_scale):
+    """
+    This function wraps the heavy GPU inference that uses torch.autocast and torch.inference_mode.
+    It calls model.generate with the appropriate parameters and returns the generated sequence.
+    """
+    with torch.inference_mode(), torch.autocast(device_type='cuda', dtype=torch.float16):
+        output_seq = model.generate(
+            input_ids=input_ids,
+            max_new_tokens=max_new_tokens,
+            min_new_tokens=100,  # Keep min_new_tokens to avoid short generations
+            do_sample=True,
+            top_p=top_p,
+            temperature=temperature,
+            repetition_penalty=repetition_penalty,
+            eos_token_id=mmtokenizer.eoa,
+            pad_token_id=mmtokenizer.eoa,
+            logits_processor=LogitsProcessorList([
+                BlockTokenRangeProcessor(0, 32002),
+                BlockTokenRangeProcessor(32016, 32016)
+            ]),
+            guidance_scale=guidance_scale,
+            use_cache=True
+        )
+    # If the output does not end with the EOS token, append it.
+    if output_seq[0][-1].item() != mmtokenizer.eoa:
+        tensor_eoa = torch.as_tensor([[mmtokenizer.eoa]]).to(model.device)
+        output_seq = torch.cat((output_seq, tensor_eoa), dim=1)
+    return output_seq
+
 def generate_music(
     genre_txt=None,
     lyrics_txt=None,
@@ -171,7 +200,7 @@ def generate_music(

     for i, p in enumerate(tqdm(prompt_texts[:run_n_segments])):
         section_text = p.replace('[start_of_segment]', '').replace('[end_of_segment]', '')
-        guidance_scale = 1.5 if i <= 1 else 1.2 #
+        guidance_scale = 1.5 if i <= 1 else 1.2 # Adjust guidance scale per segment
         if i == 0:
             continue
         if i == 1:
@@ -182,56 +211,30 @@ def generate_music(
                 raw_codes = codec_model.encode(audio_prompt.to(device), target_bw=0.5)
                 raw_codes = raw_codes.transpose(0, 1)
                 raw_codes = raw_codes.cpu().numpy().astype(np.int16)
-                # Format audio prompt
                 code_ids = codectool.npy2ids(raw_codes[0])
-                audio_prompt_codec = code_ids[int(prompt_start_time * 50): int(prompt_end_time * 50)]
-                audio_prompt_codec_ids = [mmtokenizer.soa] + codectool.sep_ids + audio_prompt_codec + [
-                    mmtokenizer.eoa]
-                sentence_ids = mmtokenizer.tokenize("[start_of_reference]") + audio_prompt_codec_ids + mmtokenizer.tokenize(
-                    "[end_of_reference]")
+                audio_prompt_codec = code_ids[int(prompt_start_time * 50): int(prompt_end_time * 50)]
+                audio_prompt_codec_ids = [mmtokenizer.soa] + codectool.sep_ids + audio_prompt_codec + [mmtokenizer.eoa]
+                sentence_ids = mmtokenizer.tokenize("[start_of_reference]") + audio_prompt_codec_ids + mmtokenizer.tokenize("[end_of_reference]")
                 head_id = mmtokenizer.tokenize(prompt_texts[0]) + sentence_ids
             else:
                 head_id = mmtokenizer.tokenize(prompt_texts[0])
             prompt_ids = head_id + start_of_segment + mmtokenizer.tokenize(section_text) + [mmtokenizer.soa] + codectool.sep_ids
         else:
             prompt_ids = end_of_segment + start_of_segment + mmtokenizer.tokenize(section_text) + [mmtokenizer.soa] + codectool.sep_ids
-
+
         prompt_ids = torch.as_tensor(prompt_ids).unsqueeze(0).to(device)
         input_ids = torch.cat([raw_output, prompt_ids], dim=1) if i > 1 else prompt_ids
-
-        #
+
+        # Window slicing in case the sequence exceeds the model's context length
         max_context = 16384 - max_new_tokens - 1
         if input_ids.shape[-1] > max_context:
             print(
                 f'Section {i}: output length {input_ids.shape[-1]} exceeding context length {max_context}, now using the last {max_context} tokens.')
             input_ids = input_ids[:, -(max_context):]
-
-
-
-
-        """
-        with torch.inference_mode(), torch.autocast(device_type='cuda', dtype=torch.float16):
-            output_seq = model.generate(
-                input_ids=input_ids,
-                max_new_tokens=max_new_tokens,
-                min_new_tokens=100,  # Keep min_new_tokens to avoid short generations
-                do_sample=True,
-                top_p=top_p,
-                temperature=temperature,
-                repetition_penalty=repetition_penalty,
-                eos_token_id=mmtokenizer.eoa,
-                pad_token_id=mmtokenizer.eoa,
-                logits_processor=LogitsProcessorList([BlockTokenRangeProcessor(0, 32002), BlockTokenRangeProcessor(32016, 32016)]),
-                guidance_scale=guidance_scale,
-                use_cache=True
-            )
-        if output_seq[0][-1].item() != mmtokenizer.eoa:
-            tensor_eoa = torch.as_tensor([[mmtokenizer.eoa]]).to(model.device)
-            output_seq = torch.cat((output_seq, tensor_eoa), dim=1)
-        return output_seq
-
-        output_seq = model_inference(input_ids, max_new_tokens, top_p, temperature, repetition_penalty, guidance_scale)
-
+
+        # Perform the GPU-heavy inference using the requires_cuda function.
+        output_seq = requires_cuda(input_ids, max_new_tokens, top_p, temperature, repetition_penalty, guidance_scale)
+
         if i > 1:
             raw_output = torch.cat([raw_output, prompt_ids, output_seq[:, input_ids.shape[-1]:]], dim=1)
         else:
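Background on the change: on a ZeroGPU Space, a GPU is attached only while a function decorated with @spaces.GPU is executing, so this commit isolates the heavy model.generate call in the decorated requires_cuda helper and leaves prompt assembly, context-window slicing, and post-processing in generate_music on the CPU. Below is a minimal sketch of that pattern, not the Space's actual code; the model object and every name other than spaces.GPU are placeholders.

import spaces
import torch

# `model` stands in for the causal LM the Space loads at startup (placeholder).

@spaces.GPU(duration=175)  # A GPU is allocated only for calls into this function.
def run_on_gpu(input_ids, max_new_tokens):
    # Heavy inference runs here, under inference_mode and fp16 autocast.
    with torch.inference_mode(), torch.autocast(device_type='cuda', dtype=torch.float16):
        return model.generate(input_ids=input_ids, max_new_tokens=max_new_tokens)

def generate(prompt_ids):
    # CPU-side work (tokenizing, slicing, concatenation) stays outside the
    # decorated function so it does not count against the GPU duration budget.
    input_ids = torch.as_tensor(prompt_ids).unsqueeze(0)
    return run_on_gpu(input_ids, max_new_tokens=3000)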
119 |
return structured_lyrics
|
120 |
|
121 |
@spaces.GPU(duration=175)
|
122 |
+
def requires_cuda(input_ids, max_new_tokens, top_p, temperature, repetition_penalty, guidance_scale):
|
123 |
+
"""
|
124 |
+
This function wraps the heavy GPU inference that uses torch.autocast and torch.inference_mode.
|
125 |
+
It calls model.generate with the appropriate parameters and returns the generated sequence.
|
126 |
+
"""
|
127 |
+
with torch.inference_mode(), torch.autocast(device_type='cuda', dtype=torch.float16):
|
128 |
+
output_seq = model.generate(
|
129 |
+
input_ids=input_ids,
|
130 |
+
max_new_tokens=max_new_tokens,
|
131 |
+
min_new_tokens=100, # Keep min_new_tokens to avoid short generations
|
132 |
+
do_sample=True,
|
133 |
+
top_p=top_p,
|
134 |
+
temperature=temperature,
|
135 |
+
repetition_penalty=repetition_penalty,
|
136 |
+
eos_token_id=mmtokenizer.eoa,
|
137 |
+
pad_token_id=mmtokenizer.eoa,
|
138 |
+
logits_processor=LogitsProcessorList([
|
139 |
+
BlockTokenRangeProcessor(0, 32002),
|
140 |
+
BlockTokenRangeProcessor(32016, 32016)
|
141 |
+
]),
|
142 |
+
guidance_scale=guidance_scale,
|
143 |
+
use_cache=True
|
144 |
+
)
|
145 |
+
# If the output does not end with the EOS token, append it.
|
146 |
+
if output_seq[0][-1].item() != mmtokenizer.eoa:
|
147 |
+
tensor_eoa = torch.as_tensor([[mmtokenizer.eoa]]).to(model.device)
|
148 |
+
output_seq = torch.cat((output_seq, tensor_eoa), dim=1)
|
149 |
+
return output_seq
|
150 |
+
|
151 |
def generate_music(
|
152 |
genre_txt=None,
|
153 |
lyrics_txt=None,
|
|
|
200 |
|
201 |
for i, p in enumerate(tqdm(prompt_texts[:run_n_segments])):
|
202 |
section_text = p.replace('[start_of_segment]', '').replace('[end_of_segment]', '')
|
203 |
+
guidance_scale = 1.5 if i <= 1 else 1.2 # Adjust guidance scale per segment
|
204 |
if i == 0:
|
205 |
continue
|
206 |
if i == 1:
|
|
|
211 |
raw_codes = codec_model.encode(audio_prompt.to(device), target_bw=0.5)
|
212 |
raw_codes = raw_codes.transpose(0, 1)
|
213 |
raw_codes = raw_codes.cpu().numpy().astype(np.int16)
|
|
|
214 |
code_ids = codectool.npy2ids(raw_codes[0])
|
215 |
+
audio_prompt_codec = code_ids[int(prompt_start_time * 50): int(prompt_end_time * 50)]
|
216 |
+
audio_prompt_codec_ids = [mmtokenizer.soa] + codectool.sep_ids + audio_prompt_codec + [mmtokenizer.eoa]
|
217 |
+
sentence_ids = mmtokenizer.tokenize("[start_of_reference]") + audio_prompt_codec_ids + mmtokenizer.tokenize("[end_of_reference]")
|
|
|
|
|
218 |
head_id = mmtokenizer.tokenize(prompt_texts[0]) + sentence_ids
|
219 |
else:
|
220 |
head_id = mmtokenizer.tokenize(prompt_texts[0])
|
221 |
prompt_ids = head_id + start_of_segment + mmtokenizer.tokenize(section_text) + [mmtokenizer.soa] + codectool.sep_ids
|
222 |
else:
|
223 |
prompt_ids = end_of_segment + start_of_segment + mmtokenizer.tokenize(section_text) + [mmtokenizer.soa] + codectool.sep_ids
|
224 |
+
|
225 |
prompt_ids = torch.as_tensor(prompt_ids).unsqueeze(0).to(device)
|
226 |
input_ids = torch.cat([raw_output, prompt_ids], dim=1) if i > 1 else prompt_ids
|
227 |
+
|
228 |
+
# Window slicing in case the sequence exceeds the model's context length
|
229 |
max_context = 16384 - max_new_tokens - 1
|
230 |
if input_ids.shape[-1] > max_context:
|
231 |
print(
|
232 |
f'Section {i}: output length {input_ids.shape[-1]} exceeding context length {max_context}, now using the last {max_context} tokens.')
|
233 |
input_ids = input_ids[:, -(max_context):]
|
234 |
+
|
235 |
+
# Perform the GPU-heavy inference using the requires_cuda function.
|
236 |
+
output_seq = requires_cuda(input_ids, max_new_tokens, top_p, temperature, repetition_penalty, guidance_scale)
|
237 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
238 |
if i > 1:
|
239 |
raw_output = torch.cat([raw_output, prompt_ids, output_seq[:, input_ids.shape[-1]:]], dim=1)
|
240 |
else:
|
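The generate call also constrains sampling with a LogitsProcessorList built from two BlockTokenRangeProcessor instances. That class is defined elsewhere in app.py and is not part of this diff; as an assumption about its behavior, a processor of this kind typically masks a contiguous range of token ids so they can never be sampled, roughly as sketched below. The class name and the exact boundary convention here are illustrative, not the Space's implementation.

import torch
from transformers import LogitsProcessor

class BlockTokenRange(LogitsProcessor):
    """Illustrative: forbid sampling of token ids from start_id through end_id."""

    def __init__(self, start_id: int, end_id: int):
        self.start_id = start_id
        self.end_id = end_id

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        # Logits set to -inf get zero probability after softmax, so these ids are never chosen.
        scores[:, self.start_id:self.end_id + 1] = float("-inf")
        return scores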