Spaces:
Running
on
Zero
Running
on
Zero
Zero GPU
Browse files- app.py +2 -2
- diffrhythm/infer/infer.py +7 -1
- requirements.txt +1 -0
app.py
CHANGED
@@ -125,7 +125,7 @@ css = """
|
|
125 |
"""
|
126 |
|
127 |
with gr.Blocks(css=css) as demo:
|
128 |
-
gr.Markdown("<h1 style='text-align: center'>DiffRhythm(谛韵)</h1>")
|
129 |
gr.HTML("""
|
130 |
<div style="display:flex; justify-content: center; column-gap:4px;">
|
131 |
<a href="https://github.com/ASLP-lab/DiffRhythm">
|
@@ -172,7 +172,7 @@ with gr.Blocks(css=css) as demo:
|
|
172 |
elem_classes="lyrics-scroll-box",
|
173 |
value="""[00:05.00]Stardust whispers in your eyes\n[00:09.30]Moonlight paints our silhouettes\n[00:13.75]Tides bring secrets from the deep\n[00:18.20]Where forever's breath is kept\n[00:22.90]We dance through constellations' maze\n[00:27.15]Footprints melt in cosmic waves\n[00:31.65]Horizons hum our silent vow\n[00:36.10]Time unravels here and now\n[00:40.85]Eternal embers in the night oh oh oh\n[00:45.25]Healing scars with liquid light\n[00:49.70]Galaxies write our refrain\n[00:54.15]Love reborn in endless rain\n[01:00.00]Interlude\n[01:15.30]Paper boats of memories\n[01:19.75]Float through veins of ancient trees\n[01:24.20]Your laughter spins aurora threads\n[01:28.65]Weaving dawn through featherbed"""
|
174 |
)
|
175 |
-
audio_prompt = gr.Audio(label="Audio Prompt", type="filepath", value="./gift_of_the_world.wav")
|
176 |
|
177 |
with gr.Column():
|
178 |
steps = gr.Slider(
|
|
|
125 |
"""
|
126 |
|
127 |
with gr.Blocks(css=css) as demo:
|
128 |
+
gr.Markdown("<h1 style='text-align: center'>DiffRhythm (谛韵)</h1>")
|
129 |
gr.HTML("""
|
130 |
<div style="display:flex; justify-content: center; column-gap:4px;">
|
131 |
<a href="https://github.com/ASLP-lab/DiffRhythm">
|
|
|
172 |
elem_classes="lyrics-scroll-box",
|
173 |
value="""[00:05.00]Stardust whispers in your eyes\n[00:09.30]Moonlight paints our silhouettes\n[00:13.75]Tides bring secrets from the deep\n[00:18.20]Where forever's breath is kept\n[00:22.90]We dance through constellations' maze\n[00:27.15]Footprints melt in cosmic waves\n[00:31.65]Horizons hum our silent vow\n[00:36.10]Time unravels here and now\n[00:40.85]Eternal embers in the night oh oh oh\n[00:45.25]Healing scars with liquid light\n[00:49.70]Galaxies write our refrain\n[00:54.15]Love reborn in endless rain\n[01:00.00]Interlude\n[01:15.30]Paper boats of memories\n[01:19.75]Float through veins of ancient trees\n[01:24.20]Your laughter spins aurora threads\n[01:28.65]Weaving dawn through featherbed"""
|
174 |
)
|
175 |
+
audio_prompt = gr.Audio(label="Audio Prompt", type="filepath", value="./prompt/gift_of_the_world.wav")
|
176 |
|
177 |
with gr.Column():
|
178 |
steps = gr.Slider(
|
diffrhythm/infer/infer.py
CHANGED
@@ -74,6 +74,7 @@ def decode_audio(latents, vae_model, chunked=False, overlap=32, chunk_size=128):
|
|
74 |
|
75 |
def inference(cfm_model, vae_model, cond, text, duration, style_prompt, negative_style_prompt, steps, sway_sampling_coef, start_time):
|
76 |
# import pdb; pdb.set_trace()
|
|
|
77 |
with torch.inference_mode():
|
78 |
generated, _ = cfm_model.sample(
|
79 |
cond=cond,
|
@@ -89,13 +90,18 @@ def inference(cfm_model, vae_model, cond, text, duration, style_prompt, negative
|
|
89 |
|
90 |
generated = generated.to(torch.float32)
|
91 |
latent = generated.transpose(1, 2) # [b d t]
|
92 |
-
|
|
|
|
|
93 |
output = decode_audio(latent, vae_model, chunked=False)
|
|
|
94 |
|
95 |
# Rearrange audio batch to a single sequence
|
96 |
output = rearrange(output, "b d n -> d (b n)")
|
97 |
output_tensor = output.to(torch.float32).div(torch.max(torch.abs(output))).clamp(-1, 1).cpu()
|
98 |
output_np = output_tensor.numpy().T.astype(np.float32)
|
|
|
|
|
99 |
return (44100, output_np)
|
100 |
|
101 |
if __name__ == "__main__":
|
|
|
74 |
|
75 |
def inference(cfm_model, vae_model, cond, text, duration, style_prompt, negative_style_prompt, steps, sway_sampling_coef, start_time):
|
76 |
# import pdb; pdb.set_trace()
|
77 |
+
s_t = time.time()
|
78 |
with torch.inference_mode():
|
79 |
generated, _ = cfm_model.sample(
|
80 |
cond=cond,
|
|
|
90 |
|
91 |
generated = generated.to(torch.float32)
|
92 |
latent = generated.transpose(1, 2) # [b d t]
|
93 |
+
e_t = time.time()
|
94 |
+
print(f"**** cfm time : {e_t-s_t} ****")
|
95 |
+
print(latent.mean(), latent.min(), latent.max(), latent.std())
|
96 |
output = decode_audio(latent, vae_model, chunked=False)
|
97 |
+
print(output.mean(), output.min(), output.max(), output.std())
|
98 |
|
99 |
# Rearrange audio batch to a single sequence
|
100 |
output = rearrange(output, "b d n -> d (b n)")
|
101 |
output_tensor = output.to(torch.float32).div(torch.max(torch.abs(output))).clamp(-1, 1).cpu()
|
102 |
output_np = output_tensor.numpy().T.astype(np.float32)
|
103 |
+
print(f"**** vae time : {time.time()-e_t} ****")
|
104 |
+
print(output_np.mean(), output_np.min(), output_np.max(), output_np.std())
|
105 |
return (44100, output_np)
|
106 |
|
107 |
if __name__ == "__main__":
|
requirements.txt
CHANGED
@@ -31,3 +31,4 @@ lazy_loader==0.4
|
|
31 |
scipy==1.15.2
|
32 |
ftfy==6.3.1
|
33 |
torchdiffeq==0.2.5
|
|
|
|
31 |
scipy==1.15.2
|
32 |
ftfy==6.3.1
|
33 |
torchdiffeq==0.2.5
|
34 |
+
https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.2cxx11abiTRUE-cp310-cp310-linux_x86_64.whl
|