Spaces:

ASLP-lab
/

DiffRhythm

Running on Zero

App Files Files Community

ing0 committed 2 days ago

Commit

d411505

1 Parent(s): b311ba6

cn lyrics example

Browse files

Files changed (4) hide show

app.py +33 -30
diffrhythm/infer/infer_utils.py +1 -0
src/prompt/rap_cn.wav +0 -0
src/prompt/rap_en.wav +0 -0

app.py CHANGED Viewed

@@ -29,15 +29,18 @@ device='cuda'
 cfm, tokenizer, muq, vae = prepare_model(device)
 cfm = torch.compile(cfm)
-@spaces.GPU
 def infer_music(lrc, ref_audio_path, seed=42, randomize_seed=False, steps=32, file_type='wav', max_frames=2048, device='cuda'):
     if randomize_seed:
         seed = random.randint(0, MAX_SEED)
     torch.manual_seed(seed)
     sway_sampling_coef = -1 if steps < 32 else None
-    lrc_prompt, start_time = get_lrc_token(lrc, tokenizer, device)
-    style_prompt = get_style_prompt(muq, ref_audio_path)
     negative_style_prompt = get_negative_style_prompt(device)
     latent_prompt = get_reference_latent(device, max_frames)
     generated_song = inference(cfm_model=cfm,
@@ -169,7 +172,7 @@ with gr.Blocks(css=css) as demo:
             with gr.Row():
                 with gr.Column():
                     lrc = gr.Textbox(
-                        label="Lrc",
                         placeholder="Input the full lyrics",
                         lines=12,
                         max_lines=50,
@@ -181,26 +184,23 @@ with gr.Blocks(css=css) as demo:
                 with gr.Column():
                     with gr.Accordion("Best Practices Guide", open=True):
                         gr.Markdown("""
-                        1. **Lyrics Format Requirements**
-                        - Each line must follow: `[mm:ss.xx]Lyric content`
-                        - Example of valid format:
-                            ```
-                            [00:10.00]Moonlight spills through broken blinds
-                            [00:13.20]Your shadow dances on the dashboard shrine
-                            ```
-                        2. **Generation Duration Limits**
-                        - Current version supports maximum **95 seconds** of music generation
-                        - Total timestamps should not exceed 01:35.00 (95 seconds)
-                        3. **Audio Prompt Requirements**
-                        - Reference audio should be ≥ 1 second, audio >10 seconds will be randomly clipped into 10 seconds
-                        - For optimal results, the 10-second clips should be carefully selected
-                        - Shorter clips may lead to incoherent generation
-                        4. **Supported Languages**
-                        - **Chinese and English**
-                        - More languages comming soon
                         """)
                     lyrics_btn = gr.Button("Generate", variant="primary")
@@ -239,23 +239,26 @@ with gr.Blocks(css=css) as demo:
                     ["./src/prompt/classic_en.wav"],
                     ["./src/prompt/jazz_cn.wav"],
                     ["./src/prompt/jazz_en.wav"],
                     ["./src/prompt/default.wav"]
                 ],
                 inputs=[audio_prompt],
                 label="Audio Examples",
-                examples_per_page=11,
                 elem_id="audio-examples-container"
             )
             gr.Examples(
                 examples=[
                     ["""[00:10.00]Moonlight spills through broken blinds\n[00:13.20]Your shadow dances on the dashboard shrine\n[00:16.85]Neon ghosts in gasoline rain\n[00:20.40]I hear your laughter down the midnight train\n[00:24.15]Static whispers through frayed wires\n[00:27.65]Guitar strings hum our cathedral choirs\n[00:31.30]Flicker screens show reruns of June\n[00:34.90]I'm drowning in this mercury lagoon\n[00:38.55]Electric veins pulse through concrete skies\n[00:42.10]Your name echoes in the hollow where my heartbeat lies\n[00:45.75]We're satellites trapped in parallel light\n[00:49.25]Burning through the atmosphere of endless night\n[01:00.00]Dusty vinyl spins reverse\n[01:03.45]Our polaroid timeline bleeds through the verse\n[01:07.10]Telescope aimed at dead stars\n[01:10.65]Still tracing constellations through prison bars\n[01:14.30]Electric veins pulse through concrete skies\n[01:17.85]Your name echoes in the hollow where my heartbeat lies\n[01:21.50]We're satellites trapped in parallel light\n[01:25.05]Burning through the atmosphere of endless night\n[02:10.00]Clockwork gears grind moonbeams to rust\n[02:13.50]Our fingerprint smudged by interstellar dust\n[02:17.15]Velvet thunder rolls through my veins\n[02:20.70]Chasing phantom trains through solar plane\n[02:24.35]Electric veins pulse through concrete skies\n[02:27.90]Your name echoes in the hollow where my heartbeat lies"""],
-                    ["""[00:04.34]Tell me that I'm special\n[00:06.57]Tell me I look pretty\n[00:08.46]Tell me I'm a little angel\n[00:10.58]Sweetheart of your city\n[00:13.64]Say what I'm dying to hear\n[00:17.35]Cause I'm dying to hear you\n[00:20.86]Tell me I'm that new thing\n[00:22.93]Tell me that I'm relevant\n[00:24.96]Tell me that I got a big heart\n[00:27.04]Then back it up with evidence\n[00:29.94]I need it and I don't know why\n[00:34.28]This late at night\n[00:36.32]Isn't it lonely\n[00:39.24]I'd do anything to make you want me\n[00:43.40]I'd give it all up if you told me\n[00:47.42]That I'd be\n[00:49.43]The number one girl in your eyes\n[00:52.85]Your one and only\n[00:55.74]So what's it gon' take for you to want me\n[00:59.78]I'd give it all up if you told me\n[01:03.89]That I'd be\n[01:05.94]The number one girl in your eyes\n[01:11.34]Tell me I'm going real big places\n[01:14.32]Down to earth so friendly\n[01:16.30]And even through all the phases\n[01:18.46]Tell me you accept me\n[01:21.56]Well that's all I'm dying to hear\n[01:25.30]Yeah I'm dying to hear you\n[01:28.91]Tell me that you need me\n[01:30.85]Tell me that I'm loved\n[01:32.90]Tell me that I'm worth it"""]
                 ],
                 inputs=[lrc],
                 label="Lrc Examples",
-                examples_per_page=2,
                 elem_id="lrc-examples-container",
             )
@@ -270,7 +273,7 @@ with gr.Blocks(css=css) as demo:
                         gr.Markdown("### Method 1: Generate from Theme")
                         theme = gr.Textbox(label="theme", placeholder="Enter song theme, e.g: Love and Heartbreak")
                         tags_gen = gr.Textbox(label="tags", placeholder="Enter song tags, e.g: pop confidence healing")
-                        language = gr.Radio(["zh", "en"], label="Language", value="en")
                         gen_from_theme_btn = gr.Button("Generate LRC (From Theme)", variant="primary")
                         gr.Examples(
@@ -283,7 +286,7 @@ with gr.Blocks(css=css) as demo:
                                 [
                                     "Heroic Epic",
                                     "choir orchestral powerful",
-                                    "zh"
                                 ]
                             ],
                             inputs=[theme, tags_gen, language],
@@ -321,7 +324,7 @@ with gr.Blocks(css=css) as demo:
                 with gr.Column():
                     lrc_output = gr.Textbox(
-                        label="Generated LRC Lyrics",
                         placeholder="Timed lyrics will appear here",
                         lines=57,
                         elem_classes="lrc-output",

 cfm, tokenizer, muq, vae = prepare_model(device)
 cfm = torch.compile(cfm)
+@spaces.GPU(duration=20)
 def infer_music(lrc, ref_audio_path, seed=42, randomize_seed=False, steps=32, file_type='wav', max_frames=2048, device='cuda'):
     if randomize_seed:
         seed = random.randint(0, MAX_SEED)
     torch.manual_seed(seed)
     sway_sampling_coef = -1 if steps < 32 else None
+    try:
+        lrc_prompt, start_time = get_lrc_token(lrc, tokenizer, device)
+        style_prompt = get_style_prompt(muq, ref_audio_path)
+    except Exception as e:
+        raise gr.Error(f"Error: {str(e)}")
     negative_style_prompt = get_negative_style_prompt(device)
     latent_prompt = get_reference_latent(device, max_frames)
     generated_song = inference(cfm_model=cfm,
             with gr.Row():
                 with gr.Column():
                     lrc = gr.Textbox(
+                        label="Lyrics",
                         placeholder="Input the full lyrics",
                         lines=12,
                         max_lines=50,
                 with gr.Column():
                     with gr.Accordion("Best Practices Guide", open=True):
                         gr.Markdown("""
+1. **Lyrics Format Requirements**
+    - Each line must follow: `[mm:ss.xx]Lyric content`
+    - Example of valid format:
+    ```
+    [00:10.00]Moonlight spills through broken blinds
+    [00:13.20]Your shadow dances on the dashboard shrine
+    ```
+2. **Generation Duration Limits**
+    - Current version supports maximum **95 seconds** of music generation
+    - Total timestamps should not exceed 01:35.00 (95 seconds)
+3. **Audio Prompt Requirements**
+    - Reference audio should be ≥ 1 second, audio >10 seconds will be randomly clipped into 10 seconds
+    - For optimal results, the 10-second clips should be carefully selected
+    - Shorter clips may lead to incoherent generation
+4. **Supported Languages**
+    - **Chinese and English**
+    - More languages coming soon
                         """)
                     lyrics_btn = gr.Button("Generate", variant="primary")
                     ["./src/prompt/classic_en.wav"],
                     ["./src/prompt/jazz_cn.wav"],
                     ["./src/prompt/jazz_en.wav"],
+                    ["./src/prompt/rap_cn.wav"],
+                    ["./src/prompt/rap_en.wav"],
                     ["./src/prompt/default.wav"]
                 ],
                 inputs=[audio_prompt],
                 label="Audio Examples",
+                examples_per_page=13,
                 elem_id="audio-examples-container"
             )
             gr.Examples(
                 examples=[
                     ["""[00:10.00]Moonlight spills through broken blinds\n[00:13.20]Your shadow dances on the dashboard shrine\n[00:16.85]Neon ghosts in gasoline rain\n[00:20.40]I hear your laughter down the midnight train\n[00:24.15]Static whispers through frayed wires\n[00:27.65]Guitar strings hum our cathedral choirs\n[00:31.30]Flicker screens show reruns of June\n[00:34.90]I'm drowning in this mercury lagoon\n[00:38.55]Electric veins pulse through concrete skies\n[00:42.10]Your name echoes in the hollow where my heartbeat lies\n[00:45.75]We're satellites trapped in parallel light\n[00:49.25]Burning through the atmosphere of endless night\n[01:00.00]Dusty vinyl spins reverse\n[01:03.45]Our polaroid timeline bleeds through the verse\n[01:07.10]Telescope aimed at dead stars\n[01:10.65]Still tracing constellations through prison bars\n[01:14.30]Electric veins pulse through concrete skies\n[01:17.85]Your name echoes in the hollow where my heartbeat lies\n[01:21.50]We're satellites trapped in parallel light\n[01:25.05]Burning through the atmosphere of endless night\n[02:10.00]Clockwork gears grind moonbeams to rust\n[02:13.50]Our fingerprint smudged by interstellar dust\n[02:17.15]Velvet thunder rolls through my veins\n[02:20.70]Chasing phantom trains through solar plane\n[02:24.35]Electric veins pulse through concrete skies\n[02:27.90]Your name echoes in the hollow where my heartbeat lies"""],
+                    ["""[00:04.34]Tell me that I'm special\n[00:06.57]Tell me I look pretty\n[00:08.46]Tell me I'm a little angel\n[00:10.58]Sweetheart of your city\n[00:13.64]Say what I'm dying to hear\n[00:17.35]Cause I'm dying to hear you\n[00:20.86]Tell me I'm that new thing\n[00:22.93]Tell me that I'm relevant\n[00:24.96]Tell me that I got a big heart\n[00:27.04]Then back it up with evidence\n[00:29.94]I need it and I don't know why\n[00:34.28]This late at night\n[00:36.32]Isn't it lonely\n[00:39.24]I'd do anything to make you want me\n[00:43.40]I'd give it all up if you told me\n[00:47.42]That I'd be\n[00:49.43]The number one girl in your eyes\n[00:52.85]Your one and only\n[00:55.74]So what's it gon' take for you to want me\n[00:59.78]I'd give it all up if you told me\n[01:03.89]That I'd be\n[01:05.94]The number one girl in your eyes\n[01:11.34]Tell me I'm going real big places\n[01:14.32]Down to earth so friendly\n[01:16.30]And even through all the phases\n[01:18.46]Tell me you accept me\n[01:21.56]Well that's all I'm dying to hear\n[01:25.30]Yeah I'm dying to hear you\n[01:28.91]Tell me that you need me\n[01:30.85]Tell me that I'm loved\n[01:32.90]Tell me that I'm worth it"""],
+                    ["""[00:04.27]只因你太美 baby\n[00:08.95]只因你实在是太美 baby\n[00:13.99]只因你太美 baby\n[00:18.89]迎面走来的你让我如此蠢蠢欲动\n[00:20.88]这种感觉我从未有\n[00:21.79]Cause I got a crush on you who you\n[00:25.74]你是我的我是你的谁\n[00:28.09]再多一眼看一眼就会爆炸\n[00:30.31]再近一点靠近点快被融化\n[00:32.49]想要把你占为己有 baby\n[00:34.60]不管走到哪里\n[00:35.44]都会想起的人是你 you you\n[00:38.12]我应该拿你怎样\n[00:39.61]Uh 所有人都在看着你\n[00:42.36]我的心总是不安\n[00:44.18]Oh 我现在已病入膏肓\n[00:46.63]Eh oh\n[00:47.84]难道真的因你而疯狂吗\n[00:51.57]我本来不是这种人\n[00:53.59]因你变成奇怪的人\n[00:55.77]第一次呀变成这样的我\n[01:01.23]不管我怎么去否认\n[01:03.21]只因你太美 baby\n[01:11.46]只因你实在是太美 baby\n[01:16.75]只因你太美 baby\n[01:21.09]Oh eh oh\n[01:22.82]现在确认地告诉我\n[01:25.26]Oh eh oh\n[01:27.31]你到底属于谁\n[01:29.98]Oh eh oh\n[01:31.70]现在确认地告诉我\n[01:34.45]Oh eh oh\n[01:36.35]你到底属于谁\n[01:37.65]就是现在告诉我\n[01:40.00]跟着那节奏 缓缓 make wave\n"""]
                 ],
                 inputs=[lrc],
                 label="Lrc Examples",
+                examples_per_page=3,
                 elem_id="lrc-examples-container",
             )
                         gr.Markdown("### Method 1: Generate from Theme")
                         theme = gr.Textbox(label="theme", placeholder="Enter song theme, e.g: Love and Heartbreak")
                         tags_gen = gr.Textbox(label="tags", placeholder="Enter song tags, e.g: pop confidence healing")
+                        language = gr.Radio(["cn", "en"], label="Language", value="en")
                         gen_from_theme_btn = gr.Button("Generate LRC (From Theme)", variant="primary")
                         gr.Examples(
                                 [
                                     "Heroic Epic",
                                     "choir orchestral powerful",
+                                    "cn"
                                 ]
                             ],
                             inputs=[theme, tags_gen, language],
                 with gr.Column():
                     lrc_output = gr.Textbox(
+                        label="Generated LRC",
                         placeholder="Timed lyrics will appear here",
                         lines=57,
                         elem_classes="lrc-output",

diffrhythm/infer/infer_utils.py CHANGED Viewed

@@ -56,6 +56,7 @@ def get_style_prompt(model, wav_path):
     audio, _ = librosa.load(wav_path, sr=24000)
     audio_len = librosa.get_duration(y=audio, sr=24000)
     assert audio_len >= 1, "Input audio length shorter than 1 second"
     if audio_len > 10:

     audio, _ = librosa.load(wav_path, sr=24000)
     audio_len = librosa.get_duration(y=audio, sr=24000)
     assert audio_len >= 1, "Input audio length shorter than 1 second"
     if audio_len > 10:

src/prompt/rap_cn.wav ADDED Viewed

Binary file (441 kB). View file

src/prompt/rap_en.wav ADDED Viewed

Binary file (882 kB). View file