Spaces: Running on Zero

cpu test

- app.py +24 -9
- diffrhythm/infer/infer.py +6 -5

app.py
CHANGED
@@ -22,16 +22,13 @@ from diffrhythm.infer.infer_utils import (
 )
 from diffrhythm.infer.infer import inference
 
-device='cuda'
+device='cpu'
 cfm, tokenizer, muq, vae = prepare_model(device)
 cfm = torch.compile(cfm)
 
-def infer_music(lrc, ref_audio_path, max_frames=2048, device='cuda'):
-
-
-    # print(lrc_list)
-
-    # return "./gift_of_the_world.wav"
+def infer_music(lrc, ref_audio_path, steps, sway_sampling_coef_bool, max_frames=2048, device='cpu'):
+
+    sway_sampling_coef = -1 if sway_sampling_coef_bool else None
     lrc_prompt, start_time = get_lrc_token(lrc, tokenizer, device)
     style_prompt = get_style_prompt(muq, ref_audio_path)
     negative_style_prompt = get_negative_style_prompt(device)
@@ -43,6 +40,8 @@ def infer_music(lrc, ref_audio_path, max_frames=2048, device='cuda'):
         duration=max_frames,
         style_prompt=style_prompt,
         negative_style_prompt=negative_style_prompt,
+        steps=steps,
+        sway_sampling_coef=sway_sampling_coef,
         start_time=start_time
     )
     return generated_song
@@ -150,6 +149,22 @@ with gr.Blocks(css=css) as demo:
             audio_prompt = gr.Audio(label="Audio Prompt", type="filepath")
 
         with gr.Column():
+            steps = gr.Slider(
+                minimum=10,
+                maximum=40,
+                value=32,
+                step=1,
+                label="Diffusion Steps",
+                interactive=True,
+                elem_id="step_slider"
+            )
+            sway_sampling_coef_bool = gr.Radio(
+                choices=[("False", False), ("True", True)],
+                label="Use sway_sampling_coef",
+                value=False,
+                interactive=True,
+                elem_classes="horizontal-radio"
+            )
             lyrics_btn = gr.Button("Submit", variant="primary")
             audio_output = gr.Audio(label="Audio Result", type="filepath", elem_id="audio_output")
 
@@ -210,7 +225,7 @@ with gr.Blocks(css=css) as demo:
             [01:24.20]Your laughter spins aurora threads
             [01:28.65]Weaving dawn through featherbed"""]
         ],
-        inputs=[lrc],
+        inputs=[lrc],
         label="Lrc Examples",
         examples_per_page=2
     )
@@ -306,7 +321,7 @@ with gr.Blocks(css=css) as demo:
 
     lyrics_btn.click(
         fn=infer_music,
-        inputs=[lrc, audio_prompt],
+        inputs=[lrc, audio_prompt, steps, sway_sampling_coef_bool],
         outputs=audio_output
     )
 
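Note: Gradio passes `inputs` to the callback positionally, which is why the new `steps` and `sway_sampling_coef_bool` components are listed before the defaulted `max_frames`/`device` parameters of `infer_music`. A minimal sketch of the same wiring pattern, with a hypothetical stub (`infer_music_stub`) standing in for the real model call, assuming Gradio's tuple-choice Radio semantics that the diff itself uses:

import gradio as gr

def infer_music_stub(lrc, ref_audio_path, steps, sway_sampling_coef_bool):
    # Mirror the diff: the Radio yields a bool, mapped to the sampler's
    # coefficient (-1 enables sway sampling, None keeps the schedule uniform).
    sway_sampling_coef = -1 if sway_sampling_coef_bool else None
    return f"steps={steps}, sway_sampling_coef={sway_sampling_coef}"

with gr.Blocks() as demo:
    lrc = gr.Textbox(label="Lrc")
    audio_prompt = gr.Audio(label="Audio Prompt", type="filepath")
    steps = gr.Slider(minimum=10, maximum=40, value=32, step=1, label="Diffusion Steps")
    sway_sampling_coef_bool = gr.Radio(
        choices=[("False", False), ("True", True)], value=False,
        label="Use sway_sampling_coef")
    result = gr.Textbox(label="Result")
    btn = gr.Button("Submit")
    # The order of inputs must match the callback's positional parameters.
    btn.click(fn=infer_music_stub,
              inputs=[lrc, audio_prompt, steps, sway_sampling_coef_bool],
              outputs=result)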
diffrhythm/infer/infer.py
CHANGED
@@ -72,7 +72,7 @@ def decode_audio(latents, vae_model, chunked=False, overlap=32, chunk_size=128):
             y_final[:,:,t_start:t_end] = y_chunk[:,:,chunk_start:chunk_end]
     return y_final
 
-def inference(cfm_model, vae_model, cond, text, duration, style_prompt, negative_style_prompt, start_time):
+def inference(cfm_model, vae_model, cond, text, duration, style_prompt, negative_style_prompt, steps, sway_sampling_coef, start_time):
     # import pdb; pdb.set_trace()
     with torch.inference_mode():
         generated, _ = cfm_model.sample(
@@ -81,8 +81,9 @@ def inference(cfm_model, vae_model, cond, text, duration, style_prompt, negative
             duration=duration,
             style_prompt=style_prompt,
             negative_style_prompt=negative_style_prompt,
-            steps=32,
+            steps=steps,
             cfg_strength=4.0,
+            sway_sampling_coef=sway_sampling_coef,
             start_time=start_time
         )
 
@@ -100,10 +101,10 @@ def inference(cfm_model, vae_model, cond, text, duration, style_prompt, negative
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('--lrc-path', type=str, default="
-    parser.add_argument('--ref-audio-path', type=str, default="
+    parser.add_argument('--lrc-path', type=str, default="example/eg.lrc") # lyrics of target song
+    parser.add_argument('--ref-audio-path', type=str, default="example/eg.mp3") # reference audio as style prompt for target song
     parser.add_argument('--audio-length', type=int, default=95) # length of target song
-    parser.add_argument('--output-dir', type=str, default="
+    parser.add_argument('--output-dir', type=str, default="example/output")
     args = parser.parse_args()
 
     device = 'cuda'
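For background on the new sway_sampling_coef argument: in F5-TTS-style flow-matching samplers (which cfm_model.sample appears to follow, though this diff does not show its internals), a negative coefficient warps the uniform timestep grid toward t = 0, so more of the step budget is spent early in generation. A sketch of that warp under that assumption; the helper name and exact formula are illustrative, not code from this repo:

import torch
from typing import Optional

def sway_timesteps(steps: int, sway_sampling_coef: Optional[float]) -> torch.Tensor:
    """Uniform t in [0, 1], optionally warped by sway sampling.

    With coef = -1 (what the UI's "True" maps to), points crowd toward t = 0,
    spending more sampling steps early; None leaves the schedule uniform,
    matching the diff's "False" branch.
    """
    t = torch.linspace(0, 1, steps + 1)
    if sway_sampling_coef is not None:
        # F5-TTS-style warp: t' = t + s * (cos(pi/2 * t) - 1 + t)
        t = t + sway_sampling_coef * (torch.cos(torch.pi / 2 * t) - 1 + t)
    return t

With the new argparse defaults, running python diffrhythm/infer/infer.py with no flags should pick up example/eg.lrc and example/eg.mp3 and write output under example/output.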