ing0 committed
Commit 3bfd709 · Parent(s): 2597df1
Files changed (3):
  1. app.py +2 -2
  2. diffrhythm/infer/infer.py +7 -1
  3. requirements.txt +1 -0
app.py CHANGED
@@ -125,7 +125,7 @@ css = """
 """
 
 with gr.Blocks(css=css) as demo:
-    gr.Markdown("<h1 style='text-align: center'>DiffRhythm(谛韵)</h1>")
+    gr.Markdown("<h1 style='text-align: center'>DiffRhythm (谛韵)</h1>")
     gr.HTML("""
     <div style="display:flex; justify-content: center; column-gap:4px;">
         <a href="https://github.com/ASLP-lab/DiffRhythm">
@@ -172,7 +172,7 @@ with gr.Blocks(css=css) as demo:
                 elem_classes="lyrics-scroll-box",
                 value="""[00:05.00]Stardust whispers in your eyes\n[00:09.30]Moonlight paints our silhouettes\n[00:13.75]Tides bring secrets from the deep\n[00:18.20]Where forever's breath is kept\n[00:22.90]We dance through constellations' maze\n[00:27.15]Footprints melt in cosmic waves\n[00:31.65]Horizons hum our silent vow\n[00:36.10]Time unravels here and now\n[00:40.85]Eternal embers in the night oh oh oh\n[00:45.25]Healing scars with liquid light\n[00:49.70]Galaxies write our refrain\n[00:54.15]Love reborn in endless rain\n[01:00.00]Interlude\n[01:15.30]Paper boats of memories\n[01:19.75]Float through veins of ancient trees\n[01:24.20]Your laughter spins aurora threads\n[01:28.65]Weaving dawn through featherbed"""
             )
-            audio_prompt = gr.Audio(label="Audio Prompt", type="filepath", value="./gift_of_the_world.wav")
+            audio_prompt = gr.Audio(label="Audio Prompt", type="filepath", value="./prompt/gift_of_the_world.wav")
 
         with gr.Column():
             steps = gr.Slider(
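Note on the audio-prompt change: with type="filepath", Gradio hands the handler a path string rather than raw samples, so the new default "./prompt/gift_of_the_world.wav" must exist in the Space's working directory. A minimal sketch of that wiring, assuming a hypothetical callback name generate_song (the real app wires its own handler):

    import gradio as gr

    def generate_song(audio_path: str):
        # With type="filepath", the component delivers the prompt as a
        # filesystem path; the default file must exist on disk.
        print(f"style prompt file: {audio_path}")
        return audio_path  # echo back, for demonstration only

    with gr.Blocks() as demo:
        audio_prompt = gr.Audio(label="Audio Prompt", type="filepath",
                                value="./prompt/gift_of_the_world.wav")
        out = gr.Audio(label="Output")
        btn = gr.Button("Generate")
        btn.click(generate_song, inputs=audio_prompt, outputs=out)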
diffrhythm/infer/infer.py CHANGED
@@ -74,6 +74,7 @@ def decode_audio(latents, vae_model, chunked=False, overlap=32, chunk_size=128):
 
 def inference(cfm_model, vae_model, cond, text, duration, style_prompt, negative_style_prompt, steps, sway_sampling_coef, start_time):
     # import pdb; pdb.set_trace()
+    s_t = time.time()
     with torch.inference_mode():
         generated, _ = cfm_model.sample(
             cond=cond,
@@ -89,13 +90,18 @@ def inference(cfm_model, vae_model, cond, text, duration, style_prompt, negative
 
     generated = generated.to(torch.float32)
     latent = generated.transpose(1, 2)  # [b d t]
-
+    e_t = time.time()
+    print(f"**** cfm time : {e_t - s_t} ****")
+    print(latent.mean(), latent.min(), latent.max(), latent.std())
     output = decode_audio(latent, vae_model, chunked=False)
+    print(output.mean(), output.min(), output.max(), output.std())
 
     # Rearrange audio batch to a single sequence
     output = rearrange(output, "b d n -> d (b n)")
     output_tensor = output.to(torch.float32).div(torch.max(torch.abs(output))).clamp(-1, 1).cpu()
     output_np = output_tensor.numpy().T.astype(np.float32)
+    print(f"**** vae time : {time.time() - e_t} ****")
+    print(output_np.mean(), output_np.min(), output_np.max(), output_np.std())
     return (44100, output_np)
 
 if __name__ == "__main__":
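The added s_t/e_t prints time the CFM sampling stage and the VAE decode stage separately. One caveat: CUDA kernels launch asynchronously, so wall-clock time.time() around GPU work can under-report unless the device is synchronized first. A minimal sketch of a reusable timer that accounts for this (the cuda_timer helper is illustrative, not part of this repo):

    import time
    from contextlib import contextmanager

    import torch

    @contextmanager
    def cuda_timer(label: str):
        # Synchronize before and after so the measured span covers the
        # asynchronous CUDA kernels finishing, not just being launched.
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        start = time.time()
        yield
        if torch.cuda.is_available():
            torch.cuda.synchronize()
        print(f"**** {label} time : {time.time() - start} ****")

    # Usage mirroring the instrumented inference() above:
    # with cuda_timer("cfm"):
    #     generated, _ = cfm_model.sample(...)
    # with cuda_timer("vae"):
    #     output = decode_audio(latent, vae_model, chunked=False)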
requirements.txt CHANGED
@@ -31,3 +31,4 @@ lazy_loader==0.4
 scipy==1.15.2
 ftfy==6.3.1
 torchdiffeq==0.2.5
+https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.2cxx11abiTRUE-cp310-cp310-linux_x86_64.whl
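The pinned flash-attn wheel's filename encodes its build constraints: CUDA 12, torch 2.2, CPython 3.10, and the cxx11 ABI. A mismatched runtime typically fails at import time with undefined-symbol errors rather than at install time. A quick sanity check one could run in the target environment before installing (a sketch, not part of the commit):

    # Verify the runtime matches the wheel tags
    # (cu12 / torch2.2 / cp310 / cxx11abiTRUE).
    import sys
    import torch

    print("python :", sys.version_info[:2])           # expect (3, 10) for cp310
    print("torch  :", torch.__version__)              # expect 2.2.x
    print("cuda   :", torch.version.cuda)             # expect 12.x
    print("cxx11  :", torch.compiled_with_cxx11_abi())  # expect True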