ing0 committed
Commit 9743771 · 1 Parent(s): 1a3ee96
app.py CHANGED
@@ -54,14 +54,14 @@ def R1_infer1(theme, tags_gen, language):
     client = OpenAI(api_key=os.getenv('HS_DP_API'), base_url = "https://ark.cn-beijing.volces.com/api/v3")
 
     llm_prompt = """
-    请围绕"{theme}"主题生成一首符合"{tags}"风格的完整歌词。生成的{language}语言的歌词。
+    请围绕"{theme}"主题生成一首符合"{tags}"风格的语言为{language}的完整歌词。同时我希望你生成的歌词严格符合下述要求:
     ### **歌曲结构要求**
     1. 歌词应富有变化,使情绪递进,整体连贯有层次感。**每行歌词长度应自然变化**,切勿长度一致,导致很格式化。
-    2. **时间戳分配应根据歌曲的标签\歌词的情感、节奏来合理推测**,而非机械地按照歌词长度分配。
+    2. **时间戳分配应根据歌曲的标签、歌词的情感、节奏来合理推测**,而非机械地按照歌词长度分配。
     ### **歌曲内容要求**
     1. **第一句歌词的时间戳应考虑前奏长度**,避免歌词从 `[00:00.00]` 直接开始。
     2. **严格按照 LRC 格式输出歌词**,每行格式为 `[mm:ss.xx]歌词内容`。
-    3. 输出的歌词不能有空行、括号,不能有其他解释内容,例如:副歌、桥段、结尾。
+    3. 输出的歌词不能有空行、括号,严禁出现除了时间戳和歌词以外的内容,例如:副歌、桥段、结尾等段落注释。
     4. 输出必须是**纯净的 LRC**。
     """
 
@@ -156,8 +156,8 @@ with gr.Blocks(css=css) as demo:
     - Each line must follow: `[mm:ss.xx]Lyric content`
     - Example of valid format:
     ```
-    [00:07.23]Fight me fight me fight me
-    [00:08.73]You made me so unlike me
+    [00:10.00]Moonlight spills through broken blinds
+    [00:13.20]Your shadow dances on the dashboard shrine
     ```
 
     2. **Generation Duration Limits**
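The prompt above requires every generated line to match the `[mm:ss.xx]歌词内容` pattern, with no blank lines and no section labels. The sketch below is not part of this commit; it is a minimal, hypothetical post-processing helper (the name `filter_lrc` is invented) showing one way to enforce that constraint on raw model output:

```python
import re

# Matches "[mm:ss.xx]lyric text" with nothing before or after the timestamp.
LRC_LINE = re.compile(r"^\[\d{2}:\d{2}\.\d{2}\].+$")

def filter_lrc(raw_text: str) -> str:
    """Hypothetical helper: drop blank lines and anything that is not a
    well-formed LRC line (e.g. section labels such as 'Chorus')."""
    lines = (line.strip() for line in raw_text.splitlines())
    return "\n".join(line for line in lines if LRC_LINE.match(line))

# Example:
# filter_lrc("Chorus\n[00:10.00]Moonlight spills through broken blinds\n")
# -> "[00:10.00]Moonlight spills through broken blinds"
```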
diffrhythm/infer/infer.py CHANGED
@@ -8,7 +8,8 @@ from tqdm import tqdm
 import random
 import numpy as np
 import time
-import spaces
+import io
+import pydub
 
 from diffrhythm.infer.infer_utils import (
     get_reference_latent,
@@ -18,7 +19,6 @@ from diffrhythm.infer.infer_utils import (
     get_negative_style_prompt
 )
 
-@spaces.GPU
 def decode_audio(latents, vae_model, chunked=False, overlap=32, chunk_size=128):
     downsampling_ratio = 2048
     io_channels = 2
@@ -74,7 +74,6 @@ def decode_audio(latents, vae_model, chunked=False, overlap=32, chunk_size=128):
             y_final[:,:,t_start:t_end] = y_chunk[:,:,chunk_start:chunk_end]
     return y_final
 
-@spaces.GPU
 def inference(cfm_model, vae_model, cond, text, duration, style_prompt, negative_style_prompt, steps, sway_sampling_coef, start_time):
     # import pdb; pdb.set_trace()
     s_t = time.time()
@@ -91,7 +90,7 @@ def inference(cfm_model, vae_model, cond, text, duration, style_prompt, negative
         start_time=start_time
     )
 
-    # generated = generated.to(torch.float32)
+    generated = generated.to(torch.float32)
     latent = generated.transpose(1, 2) # [b d t]
     e_t = time.time()
     print(f"**** cfm time : {e_t-s_t} ****")
@@ -104,8 +103,18 @@ def inference(cfm_model, vae_model, cond, text, duration, style_prompt, negative
     output_tensor = output.to(torch.float32).div(torch.max(torch.abs(output))).clamp(-1, 1).cpu()
     output_np = output_tensor.numpy().T.astype(np.float32)
     print(f"**** vae time : {time.time()-e_t} ****")
+    e_t = time.time()
     print(output_np.mean(), output_np.min(), output_np.max(), output_np.std())
-    return (44100, output_np)
+    # return (44100, output_np)
+
+    buffer = io.BytesIO()
+
+    output_np = np.int16(output_np * 2**15)
+    song = pydub.AudioSegment(output_np.tobytes(), frame_rate=44100, sample_width=2, channels=2)
+    song.export(buffer, format="mp3", bitrate="320k")
+    print(f"**** buffer time : {time.time()-e_t} ****")
+    return buffer.getvalue()
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
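The new return path builds the MP3 in memory: the normalized float32 waveform is scaled to signed 16-bit PCM, wrapped in a `pydub.AudioSegment`, and exported at 320 kbps. As a self-contained sketch of that conversion (assuming `pydub` with an ffmpeg backend is installed; the helper name `pcm_float_to_mp3_bytes` is invented):

```python
import io
import numpy as np
import pydub

def pcm_float_to_mp3_bytes(samples: np.ndarray, sample_rate: int = 44100) -> bytes:
    """Minimal sketch mirroring the new return path: `samples` is a float32
    array of shape (num_frames, 2) with values normalized to [-1, 1]."""
    pcm16 = np.int16(samples * 2**15)  # scale to signed 16-bit PCM
    segment = pydub.AudioSegment(
        pcm16.tobytes(), frame_rate=sample_rate, sample_width=2, channels=2
    )
    buffer = io.BytesIO()
    segment.export(buffer, format="mp3", bitrate="320k")  # requires ffmpeg on PATH
    return buffer.getvalue()
```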
diffrhythm/infer/infer_utils.py CHANGED
@@ -35,9 +35,6 @@ def prepare_model(device):
35
  # prepare vae
36
  vae_ckpt_path = hf_hub_download(repo_id="ASLP-lab/DiffRhythm-vae", filename="vae_model.pt")
37
  vae = torch.jit.load(vae_ckpt_path, map_location='cpu').to(device)
38
- print("********* vae.parameters() ", next(vae.parameters()).dtype)
39
- vae = vae.half()
40
- print("********* vae half parameters() ", next(vae.parameters()).dtype)
41
  return cfm, tokenizer, muq, vae
42
 
43
 
 
35
  # prepare vae
36
  vae_ckpt_path = hf_hub_download(repo_id="ASLP-lab/DiffRhythm-vae", filename="vae_model.pt")
37
  vae = torch.jit.load(vae_ckpt_path, map_location='cpu').to(device)
 
 
 
38
  return cfm, tokenizer, muq, vae
39
 
40
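With the `.half()` cast removed, the TorchScript VAE keeps the precision it was saved with. If the parameter dtype ever needs checking again, a minimal sketch along the lines of the deleted debug prints (repo id and filename taken from the diff; running it downloads the checkpoint):

```python
import torch
from huggingface_hub import hf_hub_download

# Load the VAE the same way prepare_model() does, then inspect its parameter dtype.
vae_ckpt_path = hf_hub_download(repo_id="ASLP-lab/DiffRhythm-vae", filename="vae_model.pt")
vae = torch.jit.load(vae_ckpt_path, map_location="cpu")
print("vae parameter dtype:", next(vae.parameters()).dtype)  # float32 unless the checkpoint itself is half precision
```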