ing0 committed
Commit 9743771 · 1 Parent(s): 1a3ee96
app.py CHANGED
@@ -54,14 +54,14 @@ def R1_infer1(theme, tags_gen, language):
     client = OpenAI(api_key=os.getenv('HS_DP_API'), base_url = "https://ark.cn-beijing.volces.com/api/v3")
 
     llm_prompt = """
-    请围绕"{theme}"主题生成一首符合"{tags}"风格的完整歌词。生成的{language}语言的歌词。
+    请围绕"{theme}"主题生成一首符合"{tags}"风格的语言为{language}的完整歌词。同时我希望你生成的歌词严格符合下述要求:
     ### **歌曲结构要求**
     1. 歌词应富有变化,使情绪递进,整体连贯有层次感。**每行歌词长度应自然变化**,切勿长度一致,导致很格式化。
-    2. **时间戳分配应根据歌曲的标签\歌词的情感、节奏来合理推测**,而非机械地按照歌词长度分配。
+    2. **时间戳分配应根据歌曲的标签、歌词的情感、节奏来合理推测**,而非机械地按照歌词长度分配。
     ### **歌曲内容要求**
     1. **第一句歌词的时间戳应考虑前奏长度**,避免歌词从 `[00:00.00]` 直接开始。
     2. **严格按照 LRC 格式输出歌词**,每行格式为 `[mm:ss.xx]歌词内容`。
-    3. 输出的歌词不能有空行、括号,不能有其他解释内容,例如:副歌、桥段、结尾。
+    3. 输出的歌词不能有空行、括号,严禁出现除了时间戳和歌词以外的内容,例如:副歌、桥段、结尾等段落注释。
     4. 输出必须是**纯净的 LRC**。
     """
 
@@ -156,8 +156,8 @@ with gr.Blocks(css=css) as demo:
     - Each line must follow: `[mm:ss.xx]Lyric content`
     - Example of valid format:
     ```
-    [00:07.23]Fight me fight me fight me
-    [00:08.73]You made me so unlike me
+    [00:10.00]Moonlight spills through broken blinds
+    [00:13.20]Your shadow dances on the dashboard shrine
     ```
 
     2. **Generation Duration Limits**
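The prompt above requires every generated line to match the `[mm:ss.xx]歌词内容` pattern, with no blank lines and no section labels. The sketch below is not part of this commit; it is a minimal, hypothetical post-processing helper (the name `filter_lrc` is invented) showing one way to enforce that constraint on raw model output:

```python
import re

# Matches "[mm:ss.xx]lyric text" with nothing before or after the timestamp.
LRC_LINE = re.compile(r"^\[\d{2}:\d{2}\.\d{2}\].+$")

def filter_lrc(raw_text: str) -> str:
    """Hypothetical helper: drop blank lines and anything that is not a
    well-formed LRC line (e.g. section labels such as 'Chorus')."""
    lines = (line.strip() for line in raw_text.splitlines())
    return "\n".join(line for line in lines if LRC_LINE.match(line))

# Example:
# filter_lrc("Chorus\n[00:10.00]Moonlight spills through broken blinds\n")
# -> "[00:10.00]Moonlight spills through broken blinds"
```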
diffrhythm/infer/infer.py CHANGED
@@ -8,7 +8,8 @@ from tqdm import tqdm
 import random
 import numpy as np
 import time
-import spaces
+import io
+import pydub
 
 from diffrhythm.infer.infer_utils import (
     get_reference_latent,
@@ -18,7 +19,6 @@ from diffrhythm.infer.infer_utils import (
     get_negative_style_prompt
 )
 
-@spaces.GPU
 def decode_audio(latents, vae_model, chunked=False, overlap=32, chunk_size=128):
     downsampling_ratio = 2048
     io_channels = 2
@@ -74,7 +74,6 @@ def decode_audio(latents, vae_model, chunked=False, overlap=32, chunk_size=128):
             y_final[:,:,t_start:t_end] = y_chunk[:,:,chunk_start:chunk_end]
     return y_final
 
-@spaces.GPU
 def inference(cfm_model, vae_model, cond, text, duration, style_prompt, negative_style_prompt, steps, sway_sampling_coef, start_time):
     # import pdb; pdb.set_trace()
     s_t = time.time()
@@ -91,7 +90,7 @@ def inference(cfm_model, vae_model, cond, text, duration, style_prompt, negative
         start_time=start_time
     )
 
-    # generated = generated.to(torch.float32)
+    generated = generated.to(torch.float32)
     latent = generated.transpose(1, 2) # [b d t]
     e_t = time.time()
     print(f"**** cfm time : {e_t-s_t} ****")
@@ -104,8 +103,18 @@ def inference(cfm_model, vae_model, cond, text, duration, style_prompt, negative
     output_tensor = output.to(torch.float32).div(torch.max(torch.abs(output))).clamp(-1, 1).cpu()
     output_np = output_tensor.numpy().T.astype(np.float32)
     print(f"**** vae time : {time.time()-e_t} ****")
+    e_t = time.time()
     print(output_np.mean(), output_np.min(), output_np.max(), output_np.std())
-    return (44100, output_np)
+    # return (44100, output_np)
+
+    buffer = io.BytesIO()
+
+    output_np = np.int16(output_np * 2**15)
+    song = pydub.AudioSegment(output_np.tobytes(), frame_rate=44100, sample_width=2, channels=2)
+    song.export(buffer, format="mp3", bitrate="320k")
+    print(f"**** buffer time : {time.time()-e_t} ****")
+    return buffer.getvalue()
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
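The new return path builds the MP3 in memory: the normalized float32 waveform is scaled to signed 16-bit PCM, wrapped in a `pydub.AudioSegment`, and exported at 320 kbps. As a self-contained sketch of that conversion (assuming `pydub` with an ffmpeg backend is installed; the helper name `pcm_float_to_mp3_bytes` is invented):

```python
import io
import numpy as np
import pydub

def pcm_float_to_mp3_bytes(samples: np.ndarray, sample_rate: int = 44100) -> bytes:
    """Minimal sketch mirroring the new return path: `samples` is a float32
    array of shape (num_frames, 2) with values normalized to [-1, 1]."""
    pcm16 = np.int16(samples * 2**15)  # scale to signed 16-bit PCM
    segment = pydub.AudioSegment(
        pcm16.tobytes(), frame_rate=sample_rate, sample_width=2, channels=2
    )
    buffer = io.BytesIO()
    segment.export(buffer, format="mp3", bitrate="320k")  # requires ffmpeg on PATH
    return buffer.getvalue()
```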
diffrhythm/infer/infer_utils.py CHANGED
@@ -35,9 +35,6 @@ def prepare_model(device):
35
  # prepare vae
36
  vae_ckpt_path = hf_hub_download(repo_id="ASLP-lab/DiffRhythm-vae", filename="vae_model.pt")
37
  vae = torch.jit.load(vae_ckpt_path, map_location='cpu').to(device)
38
- print("********* vae.parameters() ", next(vae.parameters()).dtype)
39
- vae = vae.half()
40
- print("********* vae half parameters() ", next(vae.parameters()).dtype)
41
  return cfm, tokenizer, muq, vae
42
 
43
 
 
35
  # prepare vae
36
  vae_ckpt_path = hf_hub_download(repo_id="ASLP-lab/DiffRhythm-vae", filename="vae_model.pt")
37
  vae = torch.jit.load(vae_ckpt_path, map_location='cpu').to(device)
 
 
 
38
  return cfm, tokenizer, muq, vae
39
 
40
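With the `.half()` cast removed, the TorchScript VAE keeps the precision it was saved with. If the parameter dtype ever needs checking again, a minimal sketch along the lines of the deleted debug prints (repo id and filename taken from the diff; running it downloads the checkpoint):

```python
import torch
from huggingface_hub import hf_hub_download

# Load the VAE the same way prepare_model() does, then inspect its parameter dtype.
vae_ckpt_path = hf_hub_download(repo_id="ASLP-lab/DiffRhythm-vae", filename="vae_model.pt")
vae = torch.jit.load(vae_ckpt_path, map_location="cpu")
print("vae parameter dtype:", next(vae.parameters()).dtype)  # float32 unless the checkpoint itself is half precision
```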