ing0 committed on
Commit
2a3c97e
·
1 Parent(s): ee08889
app.py CHANGED
@@ -9,6 +9,7 @@ from einops import rearrange
9
  import argparse
10
  import json
11
  import os
 
12
  from tqdm import tqdm
13
  import random
14
  import numpy as np
@@ -22,13 +23,15 @@ from diffrhythm.infer.infer_utils import (
22
  )
23
  from diffrhythm.infer.infer import inference
24
 
25
- device='cpu'
 
26
  cfm, tokenizer, muq, vae = prepare_model(device)
27
  cfm = torch.compile(cfm)
28
 
29
- def infer_music(lrc, ref_audio_path, steps, sway_sampling_coef_bool, max_frames=2048, device='cpu'):
 
30
 
31
- sway_sampling_coef = -1 if sway_sampling_coef_bool else None
32
  lrc_prompt, start_time = get_lrc_token(lrc, tokenizer, device)
33
  style_prompt = get_style_prompt(muq, ref_audio_path)
34
  negative_style_prompt = get_negative_style_prompt(device)
@@ -48,7 +51,7 @@ def infer_music(lrc, ref_audio_path, steps, sway_sampling_coef_bool, max_frames=
48
 
49
  def R1_infer1(theme, tags_gen, language):
50
  try:
51
- client = OpenAI(api_key="XXXX", base_url = "https://ark.cn-beijing.volces.com/api/v3")
52
 
53
  llm_prompt = """
54
  请围绕"{theme}"主题生成一首符合"{tags}"风格的完整歌词。生成的{language}语言的歌词。
@@ -82,7 +85,7 @@ def R1_infer1(theme, tags_gen, language):
82
 
83
 
84
  def R1_infer2(tags_lyrics, lyrics_input):
85
- client = OpenAI(api_key="XXX", base_url = "https://ark.cn-beijing.volces.com/api/v3")
86
 
87
  llm_prompt = """
88
  {lyrics_input}这是一首歌的歌词,每一行是一句歌词,{tags_lyrics}是我希望这首歌的风格,我现在想要给这首歌的每一句歌词打时间戳得到LRC,我希望时间戳分配应根据歌曲的标签、歌词的情感、节奏来合理推测,而非机械地按照歌词长度分配。第一句歌词的时间戳应考虑前奏长度,避免歌词从 `[00:00.00]` 直接开始。严格按照 LRC 格式输出歌词,每行格式为 `[mm:ss.xx]歌词内容`。最后的结果只输出LRC,不需要其他的解释。
@@ -110,10 +113,32 @@ css = """
110
  white-space: pre-wrap; /* 保留换行 */
111
  line-height: 1.5; /* 行高优化 */
112
  }
 
 
 
 
 
 
 
 
 
113
  """
114
 
115
  with gr.Blocks(css=css) as demo:
116
- gr.Markdown("# DiffRhythm")
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
  with gr.Tabs() as tabs:
119
 
@@ -144,31 +169,26 @@ with gr.Blocks(css=css) as demo:
144
  placeholder="Input the full lyrics",
145
  lines=12,
146
  max_lines=50,
147
- elem_classes="lyrics-scroll-box"
 
148
  )
149
- audio_prompt = gr.Audio(label="Audio Prompt", type="filepath")
150
 
151
  with gr.Column():
152
  steps = gr.Slider(
153
  minimum=10,
154
- maximum=40,
155
  value=32,
156
  step=1,
157
  label="Diffusion Steps",
158
  interactive=True,
159
  elem_id="step_slider"
160
  )
161
- sway_sampling_coef_bool = gr.Radio(
162
- choices=[("False", False), ("True", True)],
163
- label="Use sway_sampling_coef",
164
- value=False,
165
- interactive=True,
166
- elem_classes="horizontal-radio"
167
- )
168
  lyrics_btn = gr.Button("Submit", variant="primary")
169
  audio_output = gr.Audio(label="Audio Result", type="filepath", elem_id="audio_output")
170
 
171
-
 
172
  gr.Examples(
173
  examples=[
174
  ["./gift_of_the_world.wav"],
@@ -177,59 +197,21 @@ with gr.Blocks(css=css) as demo:
177
  ],
178
  inputs=[audio_prompt],
179
  label="Audio Examples",
180
- examples_per_page=3
 
181
  )
182
 
183
  gr.Examples(
184
  examples=[
185
- ["""[00:10.00]Moonlight spills through broken blinds
186
- [00:13.20]Your shadow dances on the dashboard shrine
187
- [00:16.85]Neon ghosts in gasoline rain
188
- [00:20.40]I hear your laughter down the midnight train
189
- [00:24.15]Static whispers through frayed wires
190
- [00:27.65]Guitar strings hum our cathedral choirs
191
- [00:31.30]Flicker screens show reruns of June
192
- [00:34.90]I'm drowning in this mercury lagoon
193
- [00:38.55]Electric veins pulse through concrete skies
194
- [00:42.10]Your name echoes in the hollow where my heartbeat lies
195
- [00:45.75]We're satellites trapped in parallel light
196
- [00:49.25]Burning through the atmosphere of endless night
197
- [01:00.00]Dusty vinyl spins reverse
198
- [01:03.45]Our polaroid timeline bleeds through the verse
199
- [01:07.10]Telescope aimed at dead stars
200
- [01:10.65]Still tracing constellations through prison bars
201
- [01:14.30]Electric veins pulse through concrete skies
202
- [01:17.85]Your name echoes in the hollow where my heartbeat lies
203
- [01:21.50]We're satellites trapped in parallel light
204
- [01:25.05]Burning through the atmosphere of endless night
205
- [02:10.00]Clockwork gears grind moonbeams to rust
206
- [02:13.50]Our fingerprint smudged by interstellar dust
207
- [02:17.15]Velvet thunder rolls through my veins
208
- [02:20.70]Chasing phantom trains through solar plane
209
- [02:24.35]Electric veins pulse through concrete skies
210
- [02:27.90]Your name echoes in the hollow where my heartbeat lies"""],
211
- ["""[00:05.00]Stardust whispers in your eyes
212
- [00:09.30]Moonlight paints our silhouettes
213
- [00:13.75]Tides bring secrets from the deep
214
- [00:18.20]Where forever's breath is kept
215
- [00:22.90]We dance through constellations' maze
216
- [00:27.15]Footprints melt in cosmic waves
217
- [00:31.65]Horizons hum our silent vow
218
- [00:36.10]Time unravels here and now
219
- [00:40.85]Eternal embers in the night oh oh oh
220
- [00:45.25]Healing scars with liquid light
221
- [00:49.70]Galaxies write our refrain
222
- [00:54.15]Love reborn in endless rain
223
- [01:15.30]Paper boats of memories
224
- [01:19.75]Float through veins of ancient trees
225
- [01:24.20]Your laughter spins aurora threads
226
- [01:28.65]Weaving dawn through featherbed"""]
227
  ],
228
- inputs=[lrc],
229
  label="Lrc Examples",
230
- examples_per_page=2
 
231
  )
232
-
233
  # page 2
234
  with gr.Tab("LLM Generate LRC", id=1):
235
  with gr.Row():
@@ -241,8 +223,26 @@ with gr.Blocks(css=css) as demo:
241
  gr.Markdown("### Method 1: Generate from Theme")
242
  theme = gr.Textbox(label="theme", placeholder="Enter song theme, e.g. Love and Heartbreak")
243
  tags_gen = gr.Textbox(label="tags", placeholder="Example: male pop confidence healing")
244
- language = gr.Dropdown(["zh", "en"], label="language", value="en")
 
245
  gen_from_theme_btn = gr.Button("Generate LRC (From Theme)", variant="primary")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
246
 
247
  with gr.Group(visible=True):
248
  gr.Markdown("### Method 2: Add Timestamps to Lyrics")
@@ -250,55 +250,37 @@ with gr.Blocks(css=css) as demo:
250
  lyrics_input = gr.Textbox(
251
  label="Raw Lyrics (without timestamps)",
252
  placeholder="Enter plain lyrics (without timestamps), e.g.:\nYesterday\nAll my troubles...",
253
- lines=12,
254
  max_lines=50,
255
  elem_classes="lyrics-scroll-box"
256
  )
 
257
  gen_from_lyrics_btn = gr.Button("Generate LRC (From Lyrics)", variant="primary")
258
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
259
  with gr.Column():
260
  lrc_output = gr.Textbox(
261
  label="Generated LRC Lyrics",
262
  placeholder="Timed lyrics will appear here",
263
- lines=50,
264
  elem_classes="lrc-output",
265
  show_copy_button=True
266
  )
267
-
268
- # Examples section
269
- gr.Examples(
270
- examples=[
271
- [
272
- "Love and Heartbreak",
273
- "female vocal emotional piano pop",
274
- "en"
275
- ],
276
- [
277
- "Heroic Epic",
278
- "male choir orchestral powerful",
279
- "zh"
280
- ]
281
- ],
282
- inputs=[theme, tags_gen, language],
283
- label="Examples: Generate from Theme"
284
- )
285
-
286
- gr.Examples(
287
- examples=[
288
- [
289
- "acoustic folk happy",
290
- """I'm sitting here in the boring room
291
- It's just another rainy Sunday afternoon"""
292
- ],
293
- [
294
- "electronic dance energetic",
295
- """We're living in a material world
296
- And I am a material girl"""
297
- ]
298
- ],
299
- inputs=[tags_lyrics, lyrics_input],
300
- label="Examples: Generate from Lyrics"
301
- )
302
 
303
  # Bind functions
304
  gen_from_theme_btn.click(
@@ -321,10 +303,11 @@ with gr.Blocks(css=css) as demo:
321
 
322
  lyrics_btn.click(
323
  fn=infer_music,
324
- inputs=[lrc, audio_prompt, steps, sway_sampling_coef_bool],
325
  outputs=audio_output
326
  )
327
-
 
328
  demo.queue().launch(show_api=False, show_error=True)
329
 
330
 
 
9
  import argparse
10
  import json
11
  import os
12
+ import spaces
13
  from tqdm import tqdm
14
  import random
15
  import numpy as np
 
23
  )
24
  from diffrhythm.infer.infer import inference
25
 
26
+
27
+ device='cuda'
28
  cfm, tokenizer, muq, vae = prepare_model(device)
29
  cfm = torch.compile(cfm)
30
 
31
+ @spaces.GPU
32
+ def infer_music(lrc, ref_audio_path, steps, max_frames=2048, device='cuda'):
33
 
34
+ sway_sampling_coef = -1 if steps < 32 else None
35
  lrc_prompt, start_time = get_lrc_token(lrc, tokenizer, device)
36
  style_prompt = get_style_prompt(muq, ref_audio_path)
37
  negative_style_prompt = get_negative_style_prompt(device)
 
51
 
52
  def R1_infer1(theme, tags_gen, language):
53
  try:
54
+ client = OpenAI(api_key="3581722f-9abc-49cf-9792-fa962cad9c4f", base_url = "https://ark.cn-beijing.volces.com/api/v3")
55
 
56
  llm_prompt = """
57
  请围绕"{theme}"主题生成一首符合"{tags}"风格的完整歌词。生成的{language}语言的歌词。
 
85
 
86
 
87
  def R1_infer2(tags_lyrics, lyrics_input):
88
+ client = OpenAI(api_key="3581722f-9abc-49cf-9792-fa962cad9c4f", base_url = "https://ark.cn-beijing.volces.com/api/v3")
89
 
90
  llm_prompt = """
91
  {lyrics_input}这是一首歌的歌词,每一行是一句歌词,{tags_lyrics}是我希望这首歌的风格,我现在想要给这首歌的每一句歌词打时间戳得到LRC,我希望时间戳分配应根据歌曲的标签、歌词的情感、节奏来合理推测,而非机械地按照歌词长度分配。第一句歌词的时间戳应考虑前奏长度,避免歌词从 `[00:00.00]` 直接开始。严格按照 LRC 格式输出歌词,每行格式为 `[mm:ss.xx]歌词内容`。最后的结果只输出LRC,不需要其他的解释。
 
113
  white-space: pre-wrap; /* 保留换行 */
114
  line-height: 1.5; /* 行高优化 */
115
  }
116
+
117
+ .gr-examples {
118
+ background: transparent !important;
119
+ border: 1px solid #e0e0e0 !important;
120
+ border-radius: 8px;
121
+ margin: 1rem 0 !important;
122
+ padding: 1rem !important;
123
+ }
124
+
125
  """
126
 
127
  with gr.Blocks(css=css) as demo:
128
+ gr.Markdown("<h1 style='text-align: center'>DiffRhythm(谛韵)</h1>")
129
+ gr.HTML("""
130
+ <div style="display:flex; justify-content: center; column-gap:4px;">
131
+ <a href="https://github.com/ASLP-lab/DiffRhythm">
132
+ <img src='https://img.shields.io/badge/Arxiv-Paper-blue'>
133
+ </a>
134
+ <a href="https://github.com/ASLP-lab/DiffRhythm">
135
+ <img src='https://img.shields.io/badge/GitHub-Repo-green'>
136
+ </a>
137
+ <a href="https://aslp-lab.github.io/DiffRhythm.github.io/">
138
+ <img src='https://img.shields.io/badge/Project-Page-brown'>
139
+ </a>
140
+ </div>
141
+ """)
142
 
143
  with gr.Tabs() as tabs:
144
 
 
169
  placeholder="Input the full lyrics",
170
  lines=12,
171
  max_lines=50,
172
+ elem_classes="lyrics-scroll-box",
173
+ value="""[00:05.00]Stardust whispers in your eyes\n[00:09.30]Moonlight paints our silhouettes\n[00:13.75]Tides bring secrets from the deep\n[00:18.20]Where forever's breath is kept\n[00:22.90]We dance through constellations' maze\n[00:27.15]Footprints melt in cosmic waves\n[00:31.65]Horizons hum our silent vow\n[00:36.10]Time unravels here and now\n[00:40.85]Eternal embers in the night oh oh oh\n[00:45.25]Healing scars with liquid light\n[00:49.70]Galaxies write our refrain\n[00:54.15]Love reborn in endless rain\n[01:00.00]Interlude\n[01:15.30]Paper boats of memories\n[01:19.75]Float through veins of ancient trees\n[01:24.20]Your laughter spins aurora threads\n[01:28.65]Weaving dawn through featherbed"""
174
  )
175
+ audio_prompt = gr.Audio(label="Audio Prompt", type="filepath", value="./gift_of_the_world.wav")
176
 
177
  with gr.Column():
178
  steps = gr.Slider(
179
  minimum=10,
180
+ maximum=100,
181
  value=32,
182
  step=1,
183
  label="Diffusion Steps",
184
  interactive=True,
185
  elem_id="step_slider"
186
  )
 
 
 
 
 
 
 
187
  lyrics_btn = gr.Button("Submit", variant="primary")
188
  audio_output = gr.Audio(label="Audio Result", type="filepath", elem_id="audio_output")
189
 
190
+
191
+
192
  gr.Examples(
193
  examples=[
194
  ["./gift_of_the_world.wav"],
 
197
  ],
198
  inputs=[audio_prompt],
199
  label="Audio Examples",
200
+ examples_per_page=3,
201
+ elem_id="audio-examples-container"
202
  )
203
 
204
  gr.Examples(
205
  examples=[
206
+ ["""[00:05.00]Stardust whispers in your eyes\n[00:09.30]Moonlight paints our silhouettes\n[00:13.75]Tides bring secrets from the deep\n[00:18.20]Where forever's breath is kept\n[00:22.90]We dance through constellations' maze\n[00:27.15]Footprints melt in cosmic waves\n[00:31.65]Horizons hum our silent vow\n[00:36.10]Time unravels here and now\n[00:40.85]Eternal embers in the night oh oh oh\n[00:45.25]Healing scars with liquid light\n[00:49.70]Galaxies write our refrain\n[00:54.15]Love reborn in endless rain\n[01:00.00]Interlude\n[01:15.30]Paper boats of memories\n[01:19.75]Float through veins of ancient trees\n[01:24.20]Your laughter spins aurora threads\n[01:28.65]Weaving dawn through featherbed"""],
207
+ ["""[00:10.00]Moonlight spills through broken blinds\n[00:13.20]Your shadow dances on the dashboard shrine\n[00:16.85]Neon ghosts in gasoline rain\n[00:20.40]I hear your laughter down the midnight train\n[00:24.15]Static whispers through frayed wires\n[00:27.65]Guitar strings hum our cathedral choirs\n[00:31.30]Flicker screens show reruns of June\n[00:34.90]I'm drowning in this mercury lagoon\n[00:38.55]Electric veins pulse through concrete skies\n[00:42.10]Your name echoes in the hollow where my heartbeat lies\n[00:45.75]We're satellites trapped in parallel light\n[00:49.25]Burning through the atmosphere of endless night\n[01:00.00]Dusty vinyl spins reverse\n[01:03.45]Our polaroid timeline bleeds through the verse\n[01:07.10]Telescope aimed at dead stars\n[01:10.65]Still tracing constellations through prison bars\n[01:14.30]Electric veins pulse through concrete skies\n[01:17.85]Your name echoes in the hollow where my heartbeat lies\n[01:21.50]We're satellites trapped in parallel light\n[01:25.05]Burning through the atmosphere of endless night\n[02:10.00]Clockwork gears grind moonbeams to rust\n[02:13.50]Our fingerprint smudged by interstellar dust\n[02:17.15]Velvet thunder rolls through my veins\n[02:20.70]Chasing phantom trains through solar plane\n[02:24.35]Electric veins pulse through concrete skies\n[02:27.90]Your name echoes in the hollow where my heartbeat lies"""]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
208
  ],
209
+ inputs=[lrc],
210
  label="Lrc Examples",
211
+ examples_per_page=2,
212
+ elem_id="lrc-examples-container",
213
  )
214
+
215
  # page 2
216
  with gr.Tab("LLM Generate LRC", id=1):
217
  with gr.Row():
 
223
  gr.Markdown("### Method 1: Generate from Theme")
224
  theme = gr.Textbox(label="theme", placeholder="Enter song theme, e.g. Love and Heartbreak")
225
  tags_gen = gr.Textbox(label="tags", placeholder="Example: male pop confidence healing")
226
+ # language = gr.Dropdown(["zh", "en"], label="language", value="en")
227
+ language = gr.Radio(["zh", "en"], label="Language", value="en")
228
  gen_from_theme_btn = gr.Button("Generate LRC (From Theme)", variant="primary")
229
+
230
+ gr.Examples(
231
+ examples=[
232
+ [
233
+ "Love and Heartbreak",
234
+ "vocal emotional piano pop",
235
+ "en"
236
+ ],
237
+ [
238
+ "Heroic Epic",
239
+ "choir orchestral powerful",
240
+ "zh"
241
+ ]
242
+ ],
243
+ inputs=[theme, tags_gen, language],
244
+ label="Examples: Generate from Theme"
245
+ )
246
 
247
  with gr.Group(visible=True):
248
  gr.Markdown("### Method 2: Add Timestamps to Lyrics")
 
250
  lyrics_input = gr.Textbox(
251
  label="Raw Lyrics (without timestamps)",
252
  placeholder="Enter plain lyrics (without timestamps), e.g.:\nYesterday\nAll my troubles...",
253
+ lines=10,
254
  max_lines=50,
255
  elem_classes="lyrics-scroll-box"
256
  )
257
+
258
  gen_from_lyrics_btn = gr.Button("Generate LRC (From Lyrics)", variant="primary")
259
 
260
+ gr.Examples(
261
+ examples=[
262
+ [
263
+ "acoustic folk happy",
264
+ """I'm sitting here in the boring room\nIt's just another rainy Sunday afternoon"""
265
+ ],
266
+ [
267
+ "electronic dance energetic",
268
+ """We're living in a material world\nAnd I am a material girl"""
269
+ ]
270
+ ],
271
+ inputs=[tags_lyrics, lyrics_input],
272
+ label="Examples: Generate from Lyrics"
273
+ )
274
+
275
+
276
  with gr.Column():
277
  lrc_output = gr.Textbox(
278
  label="Generated LRC Lyrics",
279
  placeholder="Timed lyrics will appear here",
280
+ lines=57,
281
  elem_classes="lrc-output",
282
  show_copy_button=True
283
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
284
 
285
  # Bind functions
286
  gen_from_theme_btn.click(
 
303
 
304
  lyrics_btn.click(
305
  fn=infer_music,
306
+ inputs=[lrc, audio_prompt, steps],
307
  outputs=audio_output
308
  )
309
+
310
+
311
  demo.queue().launch(show_api=False, show_error=True)
312
 
313
 
diffrhythm/infer/infer.py CHANGED
@@ -90,14 +90,13 @@ def inference(cfm_model, vae_model, cond, text, duration, style_prompt, negative
90
  generated = generated.to(torch.float32)
91
  latent = generated.transpose(1, 2) # [b d t]
92
 
93
- output = decode_audio(latent, vae_model)
94
 
95
  # Rearrange audio batch to a single sequence
96
  output = rearrange(output, "b d n -> d (b n)")
97
- # Peak normalize, clip, convert to int16, and save to file
98
- output = output.to(torch.float32).div(torch.max(torch.abs(output))).clamp(-1, 1).mul(32767).to(torch.int16).cpu()
99
-
100
- return output
101
 
102
  if __name__ == "__main__":
103
  parser = argparse.ArgumentParser()
 
90
  generated = generated.to(torch.float32)
91
  latent = generated.transpose(1, 2) # [b d t]
92
 
93
+ output = decode_audio(latent, vae_model, chunked=False)
94
 
95
  # Rearrange audio batch to a single sequence
96
  output = rearrange(output, "b d n -> d (b n)")
97
+ output_tensor = output.to(torch.float32).div(torch.max(torch.abs(output))).clamp(-1, 1).cpu()
98
+ output_np = output_tensor.numpy().T.astype(np.float32)
99
+ return (44100, output_np)
 
100
 
101
  if __name__ == "__main__":
102
  parser = argparse.ArgumentParser()
diffrhythm/infer/infer_utils.py CHANGED
@@ -34,8 +34,7 @@ def prepare_model(device):
34
 
35
  # prepare vae
36
  vae_ckpt_path = hf_hub_download(repo_id="ASLP-lab/DiffRhythm-vae", filename="vae_model.pt")
37
- print(f"****************** {device} ******************")
38
- vae = torch.jit.load(vae_ckpt_path, map_location=device)
39
 
40
  return cfm, tokenizer, muq, vae
41
 
 
34
 
35
  # prepare vae
36
  vae_ckpt_path = hf_hub_download(repo_id="ASLP-lab/DiffRhythm-vae", filename="vae_model.pt")
37
+ vae = torch.jit.load(vae_ckpt_path).to(device)
 
38
 
39
  return cfm, tokenizer, muq, vae
40