ing0 committed
Commit 3bfd709 · Parent(s): 2597df1
Files changed (3):
  1. app.py +2 -2
  2. diffrhythm/infer/infer.py +7 -1
  3. requirements.txt +1 -0
app.py CHANGED
@@ -125,7 +125,7 @@ css = """
 """
 
 with gr.Blocks(css=css) as demo:
-    gr.Markdown("<h1 style='text-align: center'>DiffRhythm(谛韵)</h1>")
+    gr.Markdown("<h1 style='text-align: center'>DiffRhythm (谛韵)</h1>")
     gr.HTML("""
     <div style="display:flex; justify-content: center; column-gap:4px;">
         <a href="https://github.com/ASLP-lab/DiffRhythm">
@@ -172,7 +172,7 @@ with gr.Blocks(css=css) as demo:
                 elem_classes="lyrics-scroll-box",
                 value="""[00:05.00]Stardust whispers in your eyes\n[00:09.30]Moonlight paints our silhouettes\n[00:13.75]Tides bring secrets from the deep\n[00:18.20]Where forever's breath is kept\n[00:22.90]We dance through constellations' maze\n[00:27.15]Footprints melt in cosmic waves\n[00:31.65]Horizons hum our silent vow\n[00:36.10]Time unravels here and now\n[00:40.85]Eternal embers in the night oh oh oh\n[00:45.25]Healing scars with liquid light\n[00:49.70]Galaxies write our refrain\n[00:54.15]Love reborn in endless rain\n[01:00.00]Interlude\n[01:15.30]Paper boats of memories\n[01:19.75]Float through veins of ancient trees\n[01:24.20]Your laughter spins aurora threads\n[01:28.65]Weaving dawn through featherbed"""
             )
-            audio_prompt = gr.Audio(label="Audio Prompt", type="filepath", value="./gift_of_the_world.wav")
+            audio_prompt = gr.Audio(label="Audio Prompt", type="filepath", value="./prompt/gift_of_the_world.wav")
 
         with gr.Column():
             steps = gr.Slider(
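Note on the audio-prompt change: with type="filepath", Gradio hands the handler a path string rather than raw samples, so the new default "./prompt/gift_of_the_world.wav" must exist in the Space's working directory. A minimal sketch of that wiring, assuming a hypothetical callback name generate_song (the real app wires its own handler):

    import gradio as gr

    def generate_song(audio_path: str):
        # With type="filepath", the component delivers the prompt as a
        # filesystem path; the default file must exist on disk.
        print(f"style prompt file: {audio_path}")
        return audio_path  # echo back, for demonstration only

    with gr.Blocks() as demo:
        audio_prompt = gr.Audio(label="Audio Prompt", type="filepath",
                                value="./prompt/gift_of_the_world.wav")
        out = gr.Audio(label="Output")
        btn = gr.Button("Generate")
        btn.click(generate_song, inputs=audio_prompt, outputs=out)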
diffrhythm/infer/infer.py CHANGED
@@ -74,6 +74,7 @@ def decode_audio(latents, vae_model, chunked=False, overlap=32, chunk_size=128):
 
 def inference(cfm_model, vae_model, cond, text, duration, style_prompt, negative_style_prompt, steps, sway_sampling_coef, start_time):
     # import pdb; pdb.set_trace()
+    s_t = time.time()
     with torch.inference_mode():
         generated, _ = cfm_model.sample(
             cond=cond,
@@ -89,13 +90,18 @@ def inference(cfm_model, vae_model, cond, text, duration, style_prompt, negative
 
     generated = generated.to(torch.float32)
     latent = generated.transpose(1, 2)  # [b d t]
-
+    e_t = time.time()
+    print(f"**** cfm time : {e_t - s_t} ****")
+    print(latent.mean(), latent.min(), latent.max(), latent.std())
     output = decode_audio(latent, vae_model, chunked=False)
+    print(output.mean(), output.min(), output.max(), output.std())
 
     # Rearrange audio batch to a single sequence
     output = rearrange(output, "b d n -> d (b n)")
     output_tensor = output.to(torch.float32).div(torch.max(torch.abs(output))).clamp(-1, 1).cpu()
     output_np = output_tensor.numpy().T.astype(np.float32)
+    print(f"**** vae time : {time.time() - e_t} ****")
+    print(output_np.mean(), output_np.min(), output_np.max(), output_np.std())
     return (44100, output_np)
 
 if __name__ == "__main__":
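The added s_t/e_t prints time the CFM sampling stage and the VAE decode stage separately. One caveat: CUDA kernels launch asynchronously, so wall-clock time.time() around GPU work can under-report unless the device is synchronized first. A minimal sketch of a reusable timer that accounts for this (the cuda_timer helper is illustrative, not part of this repo):

    import time
    from contextlib import contextmanager

    import torch

    @contextmanager
    def cuda_timer(label: str):
        # Synchronize before and after so the measured span covers the
        # asynchronous CUDA kernels finishing, not just being launched.
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        start = time.time()
        yield
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        print(f"**** {label} time : {time.time() - start} ****")

    # Usage mirroring the instrumented inference() above:
    # with cuda_timer("cfm"):
    #     generated, _ = cfm_model.sample(...)
    # with cuda_timer("vae"):
    #     output = decode_audio(latent, vae_model, chunked=False)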
requirements.txt CHANGED
@@ -31,3 +31,4 @@ lazy_loader==0.4
 scipy==1.15.2
 ftfy==6.3.1
 torchdiffeq==0.2.5
+https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.2cxx11abiTRUE-cp310-cp310-linux_x86_64.whl
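The pinned flash-attn wheel's filename encodes its build constraints: CUDA 12, torch 2.2, CPython 3.10, and the cxx11 ABI. A mismatched runtime typically fails at import time with undefined-symbol errors rather than at install time. A quick sanity check one could run in the target environment before installing (a sketch, not part of the commit):

    # Verify the runtime matches the wheel tags
    # (cu12 / torch2.2 / cp310 / cxx11abiTRUE).
    import sys
    import torch

    print("python :", sys.version_info[:2])           # expect (3, 10) for cp310
    print("torch  :", torch.__version__)              # expect 2.2.x
    print("cuda   :", torch.version.cuda)             # expect 12.x
    print("cxx11  :", torch.compiled_with_cxx11_abi())  # expect True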