VoiceCloning-be's picture
h
3a478bf
raw
history blame
7.66 kB
import torch
from torch.nn.utils import remove_weight_norm
from torch.nn.utils.parametrizations import weight_norm
from typing import Optional
from rvc.lib.algorithm.residuals import LRELU_SLOPE, ResBlock1, ResBlock2
from rvc.lib.algorithm.commons import init_weights
class Generator(torch.nn.Module):
"""Generator for synthesizing audio. Optimized for performance and quality.
Args:
initial_channel (int): Number of channels in the initial convolutional layer.
resblock (str): Type of residual block to use (1 or 2).
resblock_kernel_sizes (list): Kernel sizes of the residual blocks.
resblock_dilation_sizes (list): Dilation rates of the residual blocks.
upsample_rates (list): Upsampling rates.
upsample_initial_channel (int): Number of channels in the initial upsampling layer.
upsample_kernel_sizes (list): Kernel sizes of the upsampling layers.
gin_channels (int, optional): Number of channels for the global conditioning input. Defaults to 0.
"""
def __init__(
self,
initial_channel,
resblock,
resblock_kernel_sizes,
resblock_dilation_sizes,
upsample_rates,
upsample_initial_channel,
upsample_kernel_sizes,
gin_channels=0,
):
super(Generator, self).__init__()
self.num_kernels = len(resblock_kernel_sizes)
self.num_upsamples = len(upsample_rates)
self.conv_pre = torch.nn.Conv1d(
initial_channel, upsample_initial_channel, 7, 1, padding=3
)
resblock = ResBlock1 if resblock == "1" else ResBlock2
self.ups_and_resblocks = torch.nn.ModuleList()
for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
self.ups_and_resblocks.append(
weight_norm(
torch.nn.ConvTranspose1d(
upsample_initial_channel // (2**i),
upsample_initial_channel // (2 ** (i + 1)),
k,
u,
padding=(k - u) // 2,
)
)
)
ch = upsample_initial_channel // (2 ** (i + 1))
for j, (k, d) in enumerate(
zip(resblock_kernel_sizes, resblock_dilation_sizes)
):
self.ups_and_resblocks.append(resblock(ch, k, d))
self.conv_post = torch.nn.Conv1d(ch, 1, 7, 1, padding=3, bias=False)
self.ups_and_resblocks.apply(init_weights)
if gin_channels != 0:
self.cond = torch.nn.Conv1d(gin_channels, upsample_initial_channel, 1)
def forward(self, x: torch.Tensor, g: Optional[torch.Tensor] = None):
x = self.conv_pre(x)
if g is not None:
x = x + self.cond(g)
resblock_idx = 0
for _ in range(self.num_upsamples):
x = torch.nn.functional.leaky_relu(x, LRELU_SLOPE)
x = self.ups_and_resblocks[resblock_idx](x)
resblock_idx += 1
xs = 0
for _ in range(self.num_kernels):
xs += self.ups_and_resblocks[resblock_idx](x)
resblock_idx += 1
x = xs / self.num_kernels
x = torch.nn.functional.leaky_relu(x)
x = self.conv_post(x)
x = torch.tanh(x)
return x
def __prepare_scriptable__(self):
"""Prepares the module for scripting."""
for l in self.ups_and_resblocks:
for hook in l._forward_pre_hooks.values():
if (
hook.__module__ == "torch.nn.utils.parametrizations.weight_norm"
and hook.__class__.__name__ == "WeightNorm"
):
torch.nn.utils.remove_weight_norm(l)
return self
def remove_weight_norm(self):
"""Removes weight normalization from the upsampling and residual blocks."""
for l in self.ups_and_resblocks:
remove_weight_norm(l)
class SineGen(torch.nn.Module):
"""Sine wave generator.
Args:
samp_rate (int): Sampling rate in Hz.
harmonic_num (int, optional): Number of harmonic overtones. Defaults to 0.
sine_amp (float, optional): Amplitude of sine waveform. Defaults to 0.1.
noise_std (float, optional): Standard deviation of Gaussian noise. Defaults to 0.003.
voiced_threshold (float, optional): F0 threshold for voiced/unvoiced classification. Defaults to 0.
flag_for_pulse (bool, optional): Whether this SineGen is used inside PulseGen. Defaults to False.
"""
def __init__(
self,
samp_rate,
harmonic_num=0,
sine_amp=0.1,
noise_std=0.003,
voiced_threshold=0,
flag_for_pulse=False,
):
super(SineGen, self).__init__()
self.sine_amp = sine_amp
self.noise_std = noise_std
self.harmonic_num = harmonic_num
self.dim = self.harmonic_num + 1
self.sample_rate = samp_rate
self.voiced_threshold = voiced_threshold
def _f02uv(self, f0):
"""Converts F0 to voiced/unvoiced signal.
Args:
f0 (torch.Tensor): F0 tensor with shape (batch_size, length, 1)..
"""
uv = torch.ones_like(f0)
uv = uv * (f0 > self.voiced_threshold)
return uv
def forward(self, f0: torch.Tensor, upp: int):
"""Generates sine waves.
Args:
f0 (torch.Tensor): F0 tensor with shape (batch_size, length, 1).
upp (int): Upsampling factor.
"""
with torch.no_grad():
f0 = f0[:, None].transpose(1, 2)
f0_buf = torch.zeros(f0.shape[0], f0.shape[1], self.dim, device=f0.device)
f0_buf[:, :, 0] = f0[:, :, 0]
f0_buf[:, :, 1:] = (
f0_buf[:, :, 0:1]
* torch.arange(2, self.harmonic_num + 2, device=f0.device)[
None, None, :
]
)
rad_values = (f0_buf / float(self.sample_rate)) % 1
rand_ini = torch.rand(
f0_buf.shape[0], f0_buf.shape[2], device=f0_buf.device
)
rand_ini[:, 0] = 0
rad_values[:, 0, :] = rad_values[:, 0, :] + rand_ini
tmp_over_one = torch.cumsum(rad_values, 1)
tmp_over_one *= upp
tmp_over_one = torch.nn.functional.interpolate(
tmp_over_one.transpose(2, 1),
scale_factor=float(upp),
mode="linear",
align_corners=True,
).transpose(2, 1)
rad_values = torch.nn.functional.interpolate(
rad_values.transpose(2, 1), scale_factor=float(upp), mode="nearest"
).transpose(2, 1)
tmp_over_one %= 1
tmp_over_one_idx = (tmp_over_one[:, 1:, :] - tmp_over_one[:, :-1, :]) < 0
cumsum_shift = torch.zeros_like(rad_values)
cumsum_shift[:, 1:, :] = tmp_over_one_idx * -1.0
sine_waves = torch.sin(
torch.cumsum(rad_values + cumsum_shift, dim=1) * 2 * torch.pi
)
sine_waves = sine_waves * self.sine_amp
uv = self._f02uv(f0)
uv = torch.nn.functional.interpolate(
uv.transpose(2, 1), scale_factor=float(upp), mode="nearest"
).transpose(2, 1)
noise_amp = uv * self.noise_std + (1 - uv) * self.sine_amp / 3
noise = noise_amp * torch.randn_like(sine_waves)
sine_waves = sine_waves * uv + noise
return sine_waves, uv, noise