Spaces:

ASLP-lab
/

DiffRhythm

Running on Zero

App Files Files Community

ing0 committed 3 days ago

Commit

2a3c97e

1 Parent(s): ee08889

Zero GPU

Browse files

Files changed (3) hide show

app.py +86 -103
diffrhythm/infer/infer.py +4 -5
diffrhythm/infer/infer_utils.py +1 -2

app.py CHANGED Viewed

@@ -9,6 +9,7 @@ from einops import rearrange
 import argparse
 import json
 import os
 from tqdm import tqdm
 import random
 import numpy as np
@@ -22,13 +23,15 @@ from diffrhythm.infer.infer_utils import (
 )
 from diffrhythm.infer.infer import inference
-device='cpu'
 cfm, tokenizer, muq, vae = prepare_model(device)
 cfm = torch.compile(cfm)
-def infer_music(lrc, ref_audio_path, steps, sway_sampling_coef_bool, max_frames=2048, device='cpu'):
-    sway_sampling_coef = -1 if sway_sampling_coef_bool else None
     lrc_prompt, start_time = get_lrc_token(lrc, tokenizer, device)
     style_prompt = get_style_prompt(muq, ref_audio_path)
     negative_style_prompt = get_negative_style_prompt(device)
@@ -48,7 +51,7 @@ def infer_music(lrc, ref_audio_path, steps, sway_sampling_coef_bool, max_frames=
 def R1_infer1(theme, tags_gen, language):
     try:
-        client = OpenAI(api_key="XXXX", base_url = "https://ark.cn-beijing.volces.com/api/v3")
         llm_prompt = """
         请围绕"{theme}"主题生成一首符合"{tags}"风格的完整歌词。生成的{language}语言的歌词。
@@ -82,7 +85,7 @@ def R1_infer1(theme, tags_gen, language):
 def R1_infer2(tags_lyrics, lyrics_input):
-    client = OpenAI(api_key="XXX", base_url = "https://ark.cn-beijing.volces.com/api/v3")
     llm_prompt = """
     {lyrics_input}这是一首歌的歌词,每一行是一句歌词,{tags_lyrics}是我希望这首歌的风格，我现在想要给这首歌的每一句歌词打时间戳得到LRC，我希望时间戳分配应根据歌曲的标签、歌词的情感、节奏来合理推测，而非机械地按照歌词长度分配。第一句歌词的时间戳应考虑前奏长度，避免歌词从 `[00:00.00]` 直接开始。严格按照 LRC 格式输出歌词，每行格式为 `[mm:ss.xx]歌词内容`。最后的结果只输出LRC,不需要其他的解释。
@@ -110,10 +113,32 @@ css = """
     white-space: pre-wrap;  /* 保留换行 */
     line-height: 1.5;  /* 行高优化 */
 }
 """
 with gr.Blocks(css=css) as demo:
-    gr.Markdown("# DiffRhythm")
     with gr.Tabs() as tabs:
@@ -144,31 +169,26 @@ with gr.Blocks(css=css) as demo:
                         placeholder="Input the full lyrics",
                         lines=12,
                         max_lines=50,
-                        elem_classes="lyrics-scroll-box"
                     )
-                    audio_prompt = gr.Audio(label="Audio Prompt", type="filepath")
                 with gr.Column():
                     steps = gr.Slider(
                                     minimum=10,
-                                    maximum=40,
                                     value=32,
                                     step=1,
                                     label="Diffusion Steps",
                                     interactive=True,
                                     elem_id="step_slider"
                                 )
-                    sway_sampling_coef_bool = gr.Radio(
-                                    choices=[("False", False), ("True", True)],
-                                    label="Use sway_sampling_coef",
-                                    value=False,
-                                    interactive=True,
-                                    elem_classes="horizontal-radio"
-                                )
                     lyrics_btn = gr.Button("Submit", variant="primary")
                     audio_output = gr.Audio(label="Audio Result", type="filepath", elem_id="audio_output")
             gr.Examples(
                 examples=[
                     ["./gift_of_the_world.wav"],
@@ -177,59 +197,21 @@ with gr.Blocks(css=css) as demo:
                 ],
                 inputs=[audio_prompt],
                 label="Audio Examples",
-                examples_per_page=3
             )
             gr.Examples(
                 examples=[
-                    ["""[00:10.00]Moonlight spills through broken blinds
-[00:13.20]Your shadow dances on the dashboard shrine
-[00:16.85]Neon ghosts in gasoline rain
-[00:20.40]I hear your laughter down the midnight train
-[00:24.15]Static whispers through frayed wires
-[00:27.65]Guitar strings hum our cathedral choirs
-[00:31.30]Flicker screens show reruns of June
-[00:34.90]I'm drowning in this mercury lagoon
-[00:38.55]Electric veins pulse through concrete skies
-[00:42.10]Your name echoes in the hollow where my heartbeat lies
-[00:45.75]We're satellites trapped in parallel light
-[00:49.25]Burning through the atmosphere of endless night
-[01:00.00]Dusty vinyl spins reverse
-[01:03.45]Our polaroid timeline bleeds through the verse
-[01:07.10]Telescope aimed at dead stars
-[01:10.65]Still tracing constellations through prison bars
-[01:14.30]Electric veins pulse through concrete skies
-[01:17.85]Your name echoes in the hollow where my heartbeat lies
-[01:21.50]We're satellites trapped in parallel light
-[01:25.05]Burning through the atmosphere of endless night
-[02:10.00]Clockwork gears grind moonbeams to rust
-[02:13.50]Our fingerprint smudged by interstellar dust
-[02:17.15]Velvet thunder rolls through my veins
-[02:20.70]Chasing phantom trains through solar plane
-[02:24.35]Electric veins pulse through concrete skies
-[02:27.90]Your name echoes in the hollow where my heartbeat lies"""],
-                ["""[00:05.00]Stardust whispers in your eyes
-[00:09.30]Moonlight paints our silhouettes
-[00:13.75]Tides bring secrets from the deep
-[00:18.20]Where forever's breath is kept
-[00:22.90]We dance through constellations' maze
-[00:27.15]Footprints melt in cosmic waves
-[00:31.65]Horizons hum our silent vow
-[00:36.10]Time unravels here and now
-[00:40.85]Eternal embers in the night oh oh oh
-[00:45.25]Healing scars with liquid light
-[00:49.70]Galaxies write our refrain
-[00:54.15]Love reborn in endless rain
-[01:15.30]Paper boats of memories
-[01:19.75]Float through veins of ancient trees
-[01:24.20]Your laughter spins aurora threads
-[01:28.65]Weaving dawn through featherbed"""]
                 ],
-                inputs=[lrc],
                 label="Lrc Examples",
-                examples_per_page=2
             )
         # page 2
         with gr.Tab("LLM Generate LRC", id=1):
             with gr.Row():
@@ -241,8 +223,26 @@ with gr.Blocks(css=css) as demo:
                         gr.Markdown("### Method 1: Generate from Theme")
                         theme = gr.Textbox(label="theme", placeholder="Enter song theme, e.g. Love and Heartbreak")
                         tags_gen = gr.Textbox(label="tags", placeholder="Example: male pop confidence healing")
-                        language = gr.Dropdown(["zh", "en"], label="language", value="en")
                         gen_from_theme_btn = gr.Button("Generate LRC (From Theme)", variant="primary")
                     with gr.Group(visible=True):
                         gr.Markdown("### Method 2: Add Timestamps to Lyrics")
@@ -250,55 +250,37 @@ with gr.Blocks(css=css) as demo:
                         lyrics_input = gr.Textbox(
                             label="Raw Lyrics (without timestamps)",
                             placeholder="Enter plain lyrics (without timestamps), e.g.:\nYesterday\nAll my troubles...",
-                            lines=12,
                             max_lines=50,
                             elem_classes="lyrics-scroll-box"
                         )
                         gen_from_lyrics_btn = gr.Button("Generate LRC (From Lyrics)", variant="primary")
                 with gr.Column():
                     lrc_output = gr.Textbox(
                         label="Generated LRC Lyrics",
                         placeholder="Timed lyrics will appear here",
-                        lines=50,
                         elem_classes="lrc-output",
                         show_copy_button=True
                     )
-            # Examples section
-            gr.Examples(
-                examples=[
-                    [
-                        "Love and Heartbreak",
-                        "female vocal emotional piano pop",
-                        "en"
-                    ],
-                    [
-                        "Heroic Epic",
-                        "male choir orchestral powerful",
-                        "zh"
-                    ]
-                ],
-                inputs=[theme, tags_gen, language],
-                label="Examples: Generate from Theme"
-            )
-            gr.Examples(
-                examples=[
-                    [
-                        "acoustic folk happy",
-                        """I'm sitting here in the boring room
-                        It's just another rainy Sunday afternoon"""
-                    ],
-                    [
-                        "electronic dance energetic",
-                        """We're living in a material world
-                        And I am a material girl"""
-                    ]
-                ],
-                inputs=[tags_lyrics, lyrics_input],
-                label="Examples: Generate from Lyrics"
-            )
             # Bind functions
             gen_from_theme_btn.click(
@@ -321,10 +303,11 @@ with gr.Blocks(css=css) as demo:
     lyrics_btn.click(
         fn=infer_music,
-        inputs=[lrc, audio_prompt, steps, sway_sampling_coef_bool],
         outputs=audio_output
     )
 demo.queue().launch(show_api=False, show_error=True)

 import argparse
 import json
 import os
+import spaces
 from tqdm import tqdm
 import random
 import numpy as np
 )
 from diffrhythm.infer.infer import inference
+device='cuda'
 cfm, tokenizer, muq, vae = prepare_model(device)
 cfm = torch.compile(cfm)
+@spaces.GPU
+def infer_music(lrc, ref_audio_path, steps, max_frames=2048, device='cuda'):
+    sway_sampling_coef = -1 if steps < 32 else None
     lrc_prompt, start_time = get_lrc_token(lrc, tokenizer, device)
     style_prompt = get_style_prompt(muq, ref_audio_path)
     negative_style_prompt = get_negative_style_prompt(device)
 def R1_infer1(theme, tags_gen, language):
     try:
+        client = OpenAI(api_key="3581722f-9abc-49cf-9792-fa962cad9c4f", base_url = "https://ark.cn-beijing.volces.com/api/v3")
         llm_prompt = """
         请围绕"{theme}"主题生成一首符合"{tags}"风格的完整歌词。生成的{language}语言的歌词。
 def R1_infer2(tags_lyrics, lyrics_input):
+    client = OpenAI(api_key="3581722f-9abc-49cf-9792-fa962cad9c4f", base_url = "https://ark.cn-beijing.volces.com/api/v3")
     llm_prompt = """
     {lyrics_input}这是一首歌的歌词,每一行是一句歌词,{tags_lyrics}是我希望这首歌的风格，我现在想要给这首歌的每一句歌词打时间戳得到LRC，我希望时间戳分配应根据歌曲的标签、歌词的情感、节奏来合理推测，而非机械地按照歌词长度分配。第一句歌词的时间戳应考虑前奏长度，避免歌词从 `[00:00.00]` 直接开始。严格按照 LRC 格式输出歌词，每行格式为 `[mm:ss.xx]歌词内容`。最后的结果只输出LRC,不需要其他的解释。
     white-space: pre-wrap;  /* 保留换行 */
     line-height: 1.5;  /* 行高优化 */
 }
+.gr-examples {
+    background: transparent !important;
+    border: 1px solid #e0e0e0 !important;
+    border-radius: 8px;
+    margin: 1rem 0 !important;
+    padding: 1rem !important;
+}
 """
 with gr.Blocks(css=css) as demo:
+    gr.Markdown("<h1 style='text-align: center'>DiffRhythm(谛韵)</h1>")
+    gr.HTML("""
+        <div style="display:flex; justify-content: center; column-gap:4px;">
+            <a href="https://github.com/ASLP-lab/DiffRhythm">
+                <img src='https://img.shields.io/badge/Arxiv-Paper-blue'>
+            </a>
+            <a href="https://github.com/ASLP-lab/DiffRhythm">
+                <img src='https://img.shields.io/badge/GitHub-Repo-green'>
+            </a>
+            <a href="https://aslp-lab.github.io/DiffRhythm.github.io/">
+                <img src='https://img.shields.io/badge/Project-Page-brown'>
+            </a>
+        </div>
+        """)
     with gr.Tabs() as tabs:
                         placeholder="Input the full lyrics",
                         lines=12,
                         max_lines=50,
+                        elem_classes="lyrics-scroll-box",
+                        value="""[00:05.00]Stardust whispers in your eyes\n[00:09.30]Moonlight paints our silhouettes\n[00:13.75]Tides bring secrets from the deep\n[00:18.20]Where forever's breath is kept\n[00:22.90]We dance through constellations' maze\n[00:27.15]Footprints melt in cosmic waves\n[00:31.65]Horizons hum our silent vow\n[00:36.10]Time unravels here and now\n[00:40.85]Eternal embers in the night oh oh oh\n[00:45.25]Healing scars with liquid light\n[00:49.70]Galaxies write our refrain\n[00:54.15]Love reborn in endless rain\n[01:00.00]Interlude\n[01:15.30]Paper boats of memories\n[01:19.75]Float through veins of ancient trees\n[01:24.20]Your laughter spins aurora threads\n[01:28.65]Weaving dawn through featherbed"""
                     )
+                    audio_prompt = gr.Audio(label="Audio Prompt", type="filepath", value="./gift_of_the_world.wav")
                 with gr.Column():
                     steps = gr.Slider(
                                     minimum=10,
+                                    maximum=100,
                                     value=32,
                                     step=1,
                                     label="Diffusion Steps",
                                     interactive=True,
                                     elem_id="step_slider"
                                 )
                     lyrics_btn = gr.Button("Submit", variant="primary")
                     audio_output = gr.Audio(label="Audio Result", type="filepath", elem_id="audio_output")
             gr.Examples(
                 examples=[
                     ["./gift_of_the_world.wav"],
                 ],
                 inputs=[audio_prompt],
                 label="Audio Examples",
+                examples_per_page=3,
+                elem_id="audio-examples-container"
             )
             gr.Examples(
                 examples=[
+                    ["""[00:05.00]Stardust whispers in your eyes\n[00:09.30]Moonlight paints our silhouettes\n[00:13.75]Tides bring secrets from the deep\n[00:18.20]Where forever's breath is kept\n[00:22.90]We dance through constellations' maze\n[00:27.15]Footprints melt in cosmic waves\n[00:31.65]Horizons hum our silent vow\n[00:36.10]Time unravels here and now\n[00:40.85]Eternal embers in the night oh oh oh\n[00:45.25]Healing scars with liquid light\n[00:49.70]Galaxies write our refrain\n[00:54.15]Love reborn in endless rain\n[01:00.00]Interlude\n[01:15.30]Paper boats of memories\n[01:19.75]Float through veins of ancient trees\n[01:24.20]Your laughter spins aurora threads\n[01:28.65]Weaving dawn through featherbed"""],
+                    ["""[00:10.00]Moonlight spills through broken blinds\n[00:13.20]Your shadow dances on the dashboard shrine\n[00:16.85]Neon ghosts in gasoline rain\n[00:20.40]I hear your laughter down the midnight train\n[00:24.15]Static whispers through frayed wires\n[00:27.65]Guitar strings hum our cathedral choirs\n[00:31.30]Flicker screens show reruns of June\n[00:34.90]I'm drowning in this mercury lagoon\n[00:38.55]Electric veins pulse through concrete skies\n[00:42.10]Your name echoes in the hollow where my heartbeat lies\n[00:45.75]We're satellites trapped in parallel light\n[00:49.25]Burning through the atmosphere of endless night\n[01:00.00]Dusty vinyl spins reverse\n[01:03.45]Our polaroid timeline bleeds through the verse\n[01:07.10]Telescope aimed at dead stars\n[01:10.65]Still tracing constellations through prison bars\n[01:14.30]Electric veins pulse through concrete skies\n[01:17.85]Your name echoes in the hollow where my heartbeat lies\n[01:21.50]We're satellites trapped in parallel light\n[01:25.05]Burning through the atmosphere of endless night\n[02:10.00]Clockwork gears grind moonbeams to rust\n[02:13.50]Our fingerprint smudged by interstellar dust\n[02:17.15]Velvet thunder rolls through my veins\n[02:20.70]Chasing phantom trains through solar plane\n[02:24.35]Electric veins pulse through concrete skies\n[02:27.90]Your name echoes in the hollow where my heartbeat lies"""]
                 ],
+                inputs=[lrc],
                 label="Lrc Examples",
+                examples_per_page=2,
+                elem_id="lrc-examples-container",
             )
         # page 2
         with gr.Tab("LLM Generate LRC", id=1):
             with gr.Row():
                         gr.Markdown("### Method 1: Generate from Theme")
                         theme = gr.Textbox(label="theme", placeholder="Enter song theme, e.g. Love and Heartbreak")
                         tags_gen = gr.Textbox(label="tags", placeholder="Example: male pop confidence healing")
+                        # language = gr.Dropdown(["zh", "en"], label="language", value="en")
+                        language = gr.Radio(["zh", "en"], label="Language", value="en")
                         gen_from_theme_btn = gr.Button("Generate LRC (From Theme)", variant="primary")
+                        gr.Examples(
+                            examples=[
+                                [
+                                    "Love and Heartbreak",
+                                    "vocal emotional piano pop",
+                                    "en"
+                                ],
+                                [
+                                    "Heroic Epic",
+                                    "choir orchestral powerful",
+                                    "zh"
+                                ]
+                            ],
+                            inputs=[theme, tags_gen, language],
+                            label="Examples: Generate from Theme"
+                        )
                     with gr.Group(visible=True):
                         gr.Markdown("### Method 2: Add Timestamps to Lyrics")
                         lyrics_input = gr.Textbox(
                             label="Raw Lyrics (without timestamps)",
                             placeholder="Enter plain lyrics (without timestamps), e.g.:\nYesterday\nAll my troubles...",
+                            lines=10,
                             max_lines=50,
                             elem_classes="lyrics-scroll-box"
                         )
                         gen_from_lyrics_btn = gr.Button("Generate LRC (From Lyrics)", variant="primary")
+                        gr.Examples(
+                            examples=[
+                                [
+                                    "acoustic folk happy",
+                                    """I'm sitting here in the boring room\nIt's just another rainy Sunday afternoon"""
+                                ],
+                                [
+                                    "electronic dance energetic",
+                                    """We're living in a material world\nAnd I am a material girl"""
+                                ]
+                            ],
+                            inputs=[tags_lyrics, lyrics_input],
+                            label="Examples: Generate from Lyrics"
+                        )
                 with gr.Column():
                     lrc_output = gr.Textbox(
                         label="Generated LRC Lyrics",
                         placeholder="Timed lyrics will appear here",
+                        lines=57,
                         elem_classes="lrc-output",
                         show_copy_button=True
                     )
             # Bind functions
             gen_from_theme_btn.click(
     lyrics_btn.click(
         fn=infer_music,
+        inputs=[lrc, audio_prompt, steps],
         outputs=audio_output
     )
 demo.queue().launch(show_api=False, show_error=True)

diffrhythm/infer/infer.py CHANGED Viewed

@@ -90,14 +90,13 @@ def inference(cfm_model, vae_model, cond, text, duration, style_prompt, negative
         generated = generated.to(torch.float32)
         latent = generated.transpose(1, 2) # [b d t]
-        output = decode_audio(latent, vae_model)
         # Rearrange audio batch to a single sequence
         output = rearrange(output, "b d n -> d (b n)")
-        # Peak normalize, clip, convert to int16, and save to file
-        output = output.to(torch.float32).div(torch.max(torch.abs(output))).clamp(-1, 1).mul(32767).to(torch.int16).cpu()
-        return output
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()

         generated = generated.to(torch.float32)
         latent = generated.transpose(1, 2) # [b d t]
+        output = decode_audio(latent, vae_model, chunked=False)
         # Rearrange audio batch to a single sequence
         output = rearrange(output, "b d n -> d (b n)")
+        output_tensor = output.to(torch.float32).div(torch.max(torch.abs(output))).clamp(-1, 1).cpu()
+        output_np = output_tensor.numpy().T.astype(np.float32)
+        return (44100, output_np)
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()

diffrhythm/infer/infer_utils.py CHANGED Viewed

@@ -34,8 +34,7 @@ def prepare_model(device):
     # prepare vae
     vae_ckpt_path = hf_hub_download(repo_id="ASLP-lab/DiffRhythm-vae", filename="vae_model.pt")
-    print(f"****************** {device} ******************")
-    vae = torch.jit.load(vae_ckpt_path, map_location=device)
     return cfm, tokenizer, muq, vae

     # prepare vae
     vae_ckpt_path = hf_hub_download(repo_id="ASLP-lab/DiffRhythm-vae", filename="vae_model.pt")
+    vae = torch.jit.load(vae_ckpt_path).to(device)
     return cfm, tokenizer, muq, vae