ing0 committed
Commit 4e97955 · Parent: b96e750
diffrhythm/infer/infer.py CHANGED
@@ -72,7 +72,7 @@ def decode_audio(latents, vae_model, chunked=False, overlap=32, chunk_size=128):
             y_final[:,:,t_start:t_end] = y_chunk[:,:,chunk_start:chunk_end]
     return y_final
 
-def inference(cfm_model, vae_model, cond, text, duration, style_prompt, negative_style_prompt, start_time):
+def inference(cfm_model, vae_model, cond, text, duration, style_prompt, negative_style_prompt, start_time, steps):
     # import pdb; pdb.set_trace()
     with torch.inference_mode():
         generated, _ = cfm_model.sample(
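The hunk is truncated before the body of cfm_model.sample(, so how the new steps argument is consumed is not shown here; presumably it sets the number of sampling steps in the CFM solver. A minimal sketch of calling the updated signature (the placeholder inputs below are assumptions for illustration, not from this commit):

import torch
from diffrhythm.infer.infer import inference
from diffrhythm.infer.infer_utils import prepare_model, get_reference_latent, get_negative_style_prompt

device = "cuda" if torch.cuda.is_available() else "cpu"
cfm, tokenizer, muq, vae = prepare_model(device)

max_frames = 2048  # assumed frame budget; not specified in this diff
output = inference(
    cfm_model=cfm,
    vae_model=vae,
    cond=get_reference_latent(device, max_frames),
    text=lyrics_tokens,            # tokenized lyrics, prepared elsewhere in the pipeline
    duration=max_frames,           # assumed to be measured in latent frames
    style_prompt=style_embedding,  # MuQ-MuLan style embedding, prepared elsewhere
    negative_style_prompt=get_negative_style_prompt(device),
    start_time=0.0,                # assumed
    steps=32,                      # new in this commit: sampling step count
)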
diffrhythm/infer/infer_utils.py CHANGED
@@ -6,14 +6,14 @@ from muq import MuQMuLan
 from mutagen.mp3 import MP3
 import os
 import numpy as np
-
+from huggingface_hub import hf_hub_download
 from diffrhythm.model import DiT, CFM
 
 
 def prepare_model(device):
     # prepare cfm model
-    dit_ckpt_path = "/home/node59_tmpdata3/hkchen/music_opensource/dit_model_dpo_normal.pt"
-    dit_config_path = "/home/node59_tmpdata3/hkchen/DiffRhythm/diffrhythm/diffrhythm/config/diffrhythm-1b.json"
+    dit_ckpt_path = hf_hub_download(repo_id="ASLP-lab/DiffRhythm-base", filename="cfm_model.pt")
+    dit_config_path = "./diffrhythm/config/diffrhythm-1b.json"
     with open(dit_config_path) as f:
         model_config = json.load(f)
     dit_model_cls = DiT
@@ -33,7 +33,8 @@ def prepare_model(device):
     muq = muq.to(device).eval()
 
     # prepare vae
-    vae = torch.jit.load("/home/node59_tmpdata3/hkchen/F5-TTS-V0/infer/vae_infer.pt").to(device)
+    vae_ckpt_path = hf_hub_download(repo_id="ASLP-lab/DiffRhythm-vae", filename="vae_model.pt")
+    vae = torch.jit.load(vae_ckpt_path).to(device)
 
     return cfm, tokenizer, muq, vae
 
@@ -43,7 +44,7 @@ def get_reference_latent(device, max_frames):
     return torch.zeros(1, max_frames, 64).to(device)
 
 def get_negative_style_prompt(device):
-    file_path = "/home/node59_tmpdata3/hkchen/DiffRhythm/diffrhythm/diffrhythm/infer/example/vocal.npy"
+    file_path = "./prompt/negative_prompt.npy"
     vocal_stlye = np.load(file_path)
 
     vocal_stlye = torch.from_numpy(vocal_stlye).to(device) # [1, 512]
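Replacing the absolute /home paths with hf_hub_download makes the checkpoints resolve on any machine: the file is fetched from the Hub on first use, cached locally (by default under ~/.cache/huggingface/hub), and later calls return the cached path without re-downloading. A standalone check of the new VAE path, with repo_id and filename taken from the diff above (map_location="cpu" is an added precaution for machines without a GPU, not part of this commit):

import torch
from huggingface_hub import hf_hub_download

vae_ckpt_path = hf_hub_download(repo_id="ASLP-lab/DiffRhythm-vae", filename="vae_model.pt")
print(vae_ckpt_path)  # local cache path returned by the Hub client
vae = torch.jit.load(vae_ckpt_path, map_location="cpu").eval()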
prompt/negative_prompt.npy ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6cb7d74eb7a8eda12acb8247b21d373928301db8a8cb0db480d341799fed3ce5
+size 2176
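The commit checks in only the Git LFS pointer; the real 2176-byte payload is fetched by git lfs pull (or automatically on clone when git-lfs is installed). That size is consistent with a float32 array of shape [1, 512] (512 * 4 = 2048 data bytes plus a 128-byte .npy header), matching the [1, 512] comment in get_negative_style_prompt. A quick sanity check after fetching:

import numpy as np

negative_style = np.load("./prompt/negative_prompt.npy")
print(negative_style.shape, negative_style.dtype)  # expected (1, 512) float32; inferred from the size, not verified here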