ing0 committed
Commit 97f0d6e · 1 Parent(s): ea04001
app.py CHANGED
@@ -13,7 +13,7 @@ from tqdm import tqdm
 import random
 import numpy as np
 import sys
-from diffrhythm.infer.infer_utils import (
+from huggface_diffrhythm.space.DiffRhythm.diffrhythm.infer.infer_utils import (
     get_reference_latent,
     get_lrc_token,
     get_style_prompt,
diffrhythm/g2p/g2p/mandarin.py CHANGED
@@ -187,7 +187,10 @@ with open(
 ) as fread:
     txt_list = fread.readlines()
     for txt in txt_list:
-        word, pinyin = txt.strip().split("\t")
+        try:
+            word, pinyin = txt.strip().split("\t")
+        except:
+            print(txt.strip())
         word_pinyin_dict[word] = pinyin
 fread.close()

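Note on the hunk above: the bare except prints the malformed line, but execution then still falls through to word_pinyin_dict[word] = pinyin with word/pinyin left over from the previous iteration. Below is a minimal sketch of a stricter variant of the same pattern; the "pinyin_table.txt" path is a placeholder, not the repo's actual file, and only ValueError is caught so the bad line is skipped rather than re-inserted.

# Sketch only: a more defensive version of the dictionary load above.
word_pinyin_dict = {}
with open("pinyin_table.txt", encoding="utf-8") as fread:  # placeholder path
    for txt in fread:
        try:
            # each line is expected to be "<word>\t<pinyin>"
            word, pinyin = txt.strip().split("\t")
        except ValueError:
            # malformed line: report it and skip instead of reusing stale values
            print("skipping malformed line:", txt.strip())
            continue
        word_pinyin_dict[word] = pinyin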
diffrhythm/model/cfm.py CHANGED
@@ -193,25 +193,28 @@ class CFM(nn.Module):
         # test for no ref audio
         if no_ref_audio:
             cond = torch.zeros_like(cond)
+
+        start_time_embed, positive_text_embed, positive_text_residuals = self.transformer.forward_timestep_invariant(text, step_cond.shape[1], drop_text=False, start_time=start_time)
+        _, negative_text_embed, negative_text_residuals = self.transformer.forward_timestep_invariant(text, step_cond.shape[1], drop_text=True, start_time=start_time)
+
+        text_embed = torch.cat([positive_text_embed, negative_text_embed], 0)
+        text_residuals = [torch.cat([a, b], 0) for a, b in zip(positive_text_residuals, negative_text_residuals)]
+        step_cond = torch.cat([step_cond, step_cond], 0)
+        style_prompt = torch.cat([style_prompt, negative_style_prompt], 0)
+        start_time_embed = torch.cat([start_time_embed, start_time_embed], 0)


         def fn(t, x):
-            # at each step, conditioning is fixed
-            # step_cond = torch.where(cond_mask, cond, torch.zeros_like(cond))
-
-            # predict flow
+            x = torch.cat([x, x], 0)
             pred = self.transformer(
-                x=x, cond=step_cond, text=text, time=t, mask=mask, drop_audio_cond=False, drop_text=False, drop_prompt=False,
-                style_prompt=style_prompt, style_prompt_lens=style_prompt_lens, start_time=start_time
+                x=x, text_embed=text_embed, text_residuals=text_residuals, cond=step_cond, time=t,
+                drop_audio_cond=True, drop_prompt=False, style_prompt=style_prompt, start_time=start_time_embed
             )
-            if cfg_strength < 1e-5:
-                return pred

-            null_pred = self.transformer(
-                x=x, cond=step_cond, text=text, time=t, mask=mask, drop_audio_cond=True, drop_text=True, drop_prompt=False,
-                style_prompt=negative_style_prompt, style_prompt_lens=style_prompt_lens, start_time=start_time
-            )
-            return pred + (pred - null_pred) * cfg_strength
+            positive_pred, negative_pred = pred.chunk(2, 0)
+            cfg_pred = positive_pred + (positive_pred - negative_pred) * cfg_strength
+
+            return cfg_pred

         # noise input
         # to make sure batch inference result is same with different batch size, and for sure single inference
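What the cfm.py hunk does: instead of calling the transformer twice per ODE step (once with the positive text/style conditioning, once with the dropped/negative conditioning), the two branches are concatenated along the batch dimension, one forward pass produces both predictions, and chunk(2, 0) splits them back out for the classifier-free-guidance extrapolation. A minimal sketch of the batching trick follows; guided_step and the toy lambda model are illustrative stand-ins, not repo code.

import torch

def guided_step(model, x, pos_cond, neg_cond, cfg_strength):
    # Duplicate the input and stack positive/negative conditioning on the batch
    # axis so a single forward pass covers both CFG branches.
    x2 = torch.cat([x, x], dim=0)
    cond2 = torch.cat([pos_cond, neg_cond], dim=0)
    pred = model(x2, cond2)
    positive_pred, negative_pred = pred.chunk(2, dim=0)
    # Classifier-free guidance: extrapolate away from the unconditional prediction.
    return positive_pred + (positive_pred - negative_pred) * cfg_strength

# Toy usage: a stand-in "model" that just adds its conditioning to the input.
model = lambda x, c: x + c
x = torch.randn(2, 8)
out = guided_step(model, x, pos_cond=torch.randn(2, 8), neg_cond=torch.zeros(2, 8), cfg_strength=2.0)
print(out.shape)  # torch.Size([2, 8])

One behavioral difference worth noting from the diff itself: the old positive branch ran with drop_audio_cond=False and there was an early return when cfg_strength < 1e-5; the batched version drops the audio condition for both halves and always applies the guidance formula.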
diffrhythm/model/dit.py CHANGED
@@ -15,7 +15,7 @@ import torch
 import torch.nn.functional as F

 from x_transformers.x_transformers import RotaryEmbedding
-from transformers.models.llama.modeling_llama import LlamaDecoderLayer
+from transformers.models.llama.modeling_llama import LlamaDecoderLayer, LlamaRotaryEmbedding
 from transformers.models.llama import LlamaConfig
 from torch.utils.checkpoint import checkpoint

@@ -28,7 +28,8 @@ from diffrhythm.model.modules import (
     precompute_freqs_cis,
     get_pos_embed_indices,
 )
-
+from liger_kernel.transformers import apply_liger_kernel_to_llama
+apply_liger_kernel_to_llama()

 # Text embedding

@@ -134,9 +135,11 @@
         #)
         llama_config = LlamaConfig(hidden_size=dim, intermediate_size=dim * ff_mult, hidden_act='silu')
         llama_config._attn_implementation = 'sdpa'
+        #llama_config._attn_implementation = ''
         self.transformer_blocks = nn.ModuleList(
             [LlamaDecoderLayer(llama_config, layer_idx=i) for i in range(depth)]
         )
+        self.rotary_emb = LlamaRotaryEmbedding(config=llama_config)
         self.long_skip_connection = nn.Linear(dim * 2, dim, bias=False) if long_skip_connection else None

         self.text_fusion_linears = nn.ModuleList(
@@ -157,60 +160,53 @@
         # if use_style_prompt:
         #     self.prompt_rnn = nn.LSTM(64, cond_dim, 1, batch_first=True)

+    def forward_timestep_invariant(self, text, seq_len, drop_text, start_time):
+        s_t = self.start_time_embed(start_time)
+        text_embed = self.text_embed(text, seq_len, drop_text=drop_text)
+        text_residuals = []
+        for layer in self.text_fusion_linears:
+            text_residual = layer(text_embed)
+            text_residuals.append(text_residual)
+        return s_t, text_embed, text_residuals
+
     def forward(
         self,
         x: float["b n d"],  # nosied input audio  # noqa: F722
+        text_embed: int["b nt"],  # text  # noqa: F722
+        text_residuals,
         cond: float["b n d"],  # masked cond audio  # noqa: F722
-        text: int["b nt"],  # text  # noqa: F722
         time: float["b"] | float[""],  # time step  # noqa: F821 F722
         drop_audio_cond,  # cfg for cond audio
-        drop_text,  # cfg for text
        drop_prompt=False,
         style_prompt=None,  # [b d t]
-        style_prompt_lens=None,
-        mask: bool["b n"] | None = None,  # noqa: F722
-        grad_ckpt=False,
         start_time=None,
     ):
         batch, seq_len = x.shape[0], x.shape[1]
         if time.ndim == 0:
             time = time.repeat(batch)

-        # t: conditioning time, c: context (text + masked cond audio), x: noised input audio
         t = self.time_embed(time)
-        s_t = self.start_time_embed(start_time)
-        c = t + s_t
-        text_embed = self.text_embed(text, seq_len, drop_text=drop_text)
+        c = t + start_time

-        # import pdb; pdb.set_trace()
         if drop_prompt:
             style_prompt = torch.zeros_like(style_prompt)
-        # if self.training:
-        #     packed_style_prompt = torch.nn.utils.rnn.pack_padded_sequence(style_prompt.transpose(1, 2), style_prompt_lens.cpu(), batch_first=True, enforce_sorted=False)
-        # else:
-        #     packed_style_prompt = style_prompt.transpose(1, 2)
-        #print(packed_style_prompt.shape)
-        # _, style_emb = self.prompt_rnn.forward(packed_style_prompt)
-        # _, (h_n, c_n) = self.prompt_rnn.forward(packed_style_prompt)
-        # style_emb = h_n.squeeze(0) # 1, B, dim -> B, dim

-        style_emb = style_prompt  # [b, 512]
+        style_embed = style_prompt  # [b, 512]

-        x = self.input_embed(x, cond, text_embed, style_emb, c, drop_audio_cond=drop_audio_cond)
+        x = self.input_embed(x, cond, text_embed, style_embed, c, drop_audio_cond=drop_audio_cond)

         if self.long_skip_connection is not None:
             residual = x

         pos_ids = torch.arange(x.shape[1], device=x.device)
         pos_ids = pos_ids.unsqueeze(0).repeat(x.shape[0], 1)
+        rotary_embed = self.rotary_emb(x, pos_ids)
+
         for i, block in enumerate(self.transformer_blocks):
-            if not grad_ckpt:
-                x, *_ = block(x, position_ids=pos_ids)
-            else:
-                x, *_ = checkpoint(block, x, position_ids=pos_ids, use_reentrant=False)
+            x, *_ = block(x, position_embeddings=rotary_embed)
             if i < self.depth // 2:
-                x = x + self.text_fusion_linears[i](text_embed)
+                x = x + text_residuals[i]

         if self.long_skip_connection is not None:
             x = self.long_skip_connection(torch.cat((x, residual), dim=-1))
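The dit.py changes split the DiT forward pass into a timestep-invariant part and a per-step part: forward_timestep_invariant() computes the start-time embedding, the text embedding, and the per-layer text-fusion residuals once, while forward() now receives text_embed/text_residuals plus a precomputed start_time embedding, applies rotary position embeddings from a shared LlamaRotaryEmbedding via position_embeddings=, and drops the grad_ckpt/mask/style_prompt_lens arguments. Below is a minimal sketch of the caching idea only; TinyDiT and all of its names are illustrative, not the repo's modules.

import torch
import torch.nn as nn

class TinyDiT(nn.Module):
    # Illustrative module: shows hoisting timestep-invariant work out of the
    # ODE-solver loop, mirroring forward_timestep_invariant() in the hunk above.
    def __init__(self, dim=32, depth=4, vocab=100):
        super().__init__()
        self.text_embed = nn.Embedding(vocab, dim)
        self.text_fusion_linears = nn.ModuleList([nn.Linear(dim, dim) for _ in range(depth // 2)])
        self.blocks = nn.ModuleList([nn.Linear(dim, dim) for _ in range(depth)])

    def forward_timestep_invariant(self, text):
        # The text embedding and its fusion residuals do not depend on the ODE
        # time t, so compute them once per sample rather than once per step.
        text_embed = self.text_embed(text)
        residuals = [layer(text_embed) for layer in self.text_fusion_linears]
        return text_embed, residuals

    def forward(self, x, text_residuals):
        for i, block in enumerate(self.blocks):
            x = block(x)
            if i < len(self.text_fusion_linears):
                x = x + text_residuals[i]  # reuse the cached residuals every step
        return x

model = TinyDiT()
text = torch.randint(0, 100, (1, 16))
_, residuals = model.forward_timestep_invariant(text)
x = torch.randn(1, 16, 32)
for _ in range(8):  # stand-in for the ODE solver steps
    x = model(x, residuals)
print(x.shape)  # torch.Size([1, 16, 32])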