KingNish committed on
Commit
2d64f5c
·
verified ·
1 Parent(s): b52edce

Upload ./vocos/heads.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. vocos/heads.py +170 -0
vocos/heads.py ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional
2
+
3
+ import torch
4
+ from torch import nn
5
+ from torchaudio.functional.functional import _hz_to_mel, _mel_to_hz
6
+
7
+ from vocos.spectral_ops import IMDCT, ISTFT
8
+ from vocos.modules import symexp
9
+
10
+
11
class FourierHead(nn.Module):
    """Abstract parent for heads that turn hidden features back into audio."""

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Map model features to a time-domain signal; concrete heads override this.

        Args:
            x (Tensor): Features shaped (B, L, H) -- batch size, sequence length,
                and model (hidden) dimension respectively.

        Returns:
            Tensor: Time-domain audio shaped (B, T), with T the output length.

        Raises:
            NotImplementedError: Always, since the base class defines no transform.
        """
        message = "Subclasses must implement the forward method."
        raise NotImplementedError(message)
24
+
25
+
26
class ISTFTHead(FourierHead):
    """
    ISTFT Head module for predicting STFT complex coefficients.

    Args:
        dim (int): Hidden dimension of the model.
        n_fft (int): Size of Fourier transform.
        hop_length (int): The distance between neighboring sliding window frames, which should align with
                          the resolution of the input features.
        padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
        ckpt (str, optional): Path to a checkpoint readable by ``torch.load`` that contains the entries
                              "head.out.weight" and "head.out.bias". When given, the output projection
                              is initialized from those tensors. Defaults to None.

    Raises:
        KeyError: If ``ckpt`` is given but lacks "head.out.weight" or "head.out.bias".
        RuntimeError: If the checkpoint tensors do not match the shape of the output projection.
    """

    def __init__(self, dim: int, n_fft: int, hop_length: int, padding: str = "same", ckpt: Optional[str] = None):
        super().__init__()
        # Two equally sized halves are predicted per frame: log-magnitude and phase.
        out_dim = n_fft + 2
        self.out = torch.nn.Linear(dim, out_dim)
        self.istft = ISTFT(n_fft=n_fft, hop_length=hop_length, win_length=n_fft, padding=padding)
        # load the checkpoint if provided
        if ckpt is not None:
            # NOTE(security): torch.load unpickles the file; only pass checkpoints from trusted sources.
            params = torch.load(ckpt, map_location="cpu")
            # Previously the two tensors were read and then discarded, so `ckpt` had no effect.
            # load_state_dict copies them into the projection and validates their shapes.
            self.out.load_state_dict(
                {
                    "weight": params["head.out.weight"],
                    "bias": params["head.out.bias"],
                }
            )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Forward pass of the ISTFTHead module.

        Args:
            x (Tensor): Input tensor of shape (B, L, H), where B is the batch size,
                        L is the sequence length, and H denotes the model dimension.

        Returns:
            Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal.
        """
        # Project, then move the coefficient axis to dim 1 so it can be split in half.
        x = self.out(x).transpose(1, 2)
        mag, p = x.chunk(2, dim=1)
        mag = torch.exp(mag)
        mag = torch.clip(mag, max=1e2)  # safeguard to prevent excessively large magnitudes
        # Phase wrapping happens implicitly here: cos/sin give the real and imaginary parts
        # directly, so there is no need to recover the angle with atan2 and exponentiate it.
        real = torch.cos(p)
        imag = torch.sin(p)
        S = mag * (real + 1j * imag)
        audio = self.istft(S)
        return audio
76
+
77
+
78
class IMDCTSymExpHead(FourierHead):
    """
    IMDCT Head module for predicting MDCT coefficients with symmetric exponential function

    Args:
        dim (int): Hidden dimension of the model.
        mdct_frame_len (int): Length of the MDCT frame.
        padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
        sample_rate (int, optional): The sample rate of the audio. If provided, the last layer will be initialized
                                     based on perceptual scaling. Defaults to None.
        clip_audio (bool, optional): Whether to clip the audio output within the range of [-1.0, 1.0]. Defaults to False.
    """

    def __init__(
        self,
        dim: int,
        mdct_frame_len: int,
        padding: str = "same",
        sample_rate: Optional[int] = None,
        clip_audio: bool = False,
    ):
        super().__init__()
        # One coefficient is predicted per MDCT bin, i.e. half the frame length.
        out_dim = mdct_frame_len // 2
        self.out = nn.Linear(dim, out_dim)
        self.imdct = IMDCT(frame_len=mdct_frame_len, padding=padding)
        self.clip_audio = clip_audio

        if sample_rate is not None:
            # optionally init the last layer following mel-scale
            m_max = _hz_to_mel(sample_rate // 2)
            m_pts = torch.linspace(0, m_max, out_dim)
            f_pts = _mel_to_hz(m_pts)
            # Per-output scale that falls from 1 at the lowest point to 0 at the highest.
            scale = 1 - (f_pts / f_pts.max())

            with torch.no_grad():
                # Scale each output row of the projection weight in place.
                self.out.weight.mul_(scale.view(-1, 1))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Forward pass of the IMDCTSymExpHead module.

        Args:
            x (Tensor): Input tensor of shape (B, L, H), where B is the batch size,
                        L is the sequence length, and H denotes the model dimension.

        Returns:
            Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal.
        """
        x = self.out(x)
        x = symexp(x)
        x = torch.clip(x, min=-1e2, max=1e2)  # safeguard to prevent excessively large magnitudes
        audio = self.imdct(x)
        if self.clip_audio:
            # Clip the reconstructed waveform. The previous code clipped the MDCT
            # coefficients `x` here and returned them in place of the audio.
            audio = torch.clip(audio, min=-1.0, max=1.0)

        return audio
134
+
135
+
136
class IMDCTCosHead(FourierHead):
    """
    IMDCT Head module for predicting MDCT coefficients with parametrizing MDCT = exp(m) · cos(p)

    Args:
        dim (int): Hidden dimension of the model.
        mdct_frame_len (int): Length of the MDCT frame.
        padding (str, optional): Type of padding. Options are "center" or "same". Defaults to "same".
        clip_audio (bool, optional): Whether to clip the audio output within the range of [-1.0, 1.0]. Defaults to False.
    """

    def __init__(self, dim: int, mdct_frame_len: int, padding: str = "same", clip_audio: bool = False):
        super().__init__()
        self.clip_audio = clip_audio
        # The projection emits mdct_frame_len values per frame, split in forward() into m and p halves.
        self.out = nn.Linear(dim, mdct_frame_len)
        self.imdct = IMDCT(frame_len=mdct_frame_len, padding=padding)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Forward pass of the IMDCTCosHead module.

        Args:
            x (Tensor): Input tensor of shape (B, L, H), where B is the batch size,
                        L is the sequence length, and H denotes the model dimension.

        Returns:
            Tensor: Reconstructed time-domain audio signal of shape (B, T), where T is the length of the output signal.
        """
        x = self.out(x)
        # First half of the last axis is the log-magnitude m, second half the phase-like term p.
        m, p = x.chunk(2, dim=2)
        m = torch.exp(m).clip(max=1e2)  # safeguard to prevent excessively large magnitudes
        audio = self.imdct(m * torch.cos(p))
        if self.clip_audio:
            # Clip the reconstructed waveform. The previous code clipped the raw
            # projection output `x` here and returned it in place of the audio.
            audio = torch.clip(audio, min=-1.0, max=1.0)
        return audio