marigold334 committed
Commit 41989ff • 1 Parent(s): 397c86d

Upload 10 files

- Hmodel.py +289 -0
- app.py +135 -0
- commons.py +214 -0
- datautils.py +67 -0
- log/1038_eunsik_01/Glow_TTS_00289602.pt +3 -0
- log/1038_eunsik_01/HiFI_GAN_00257000.pt +3 -0
- model.py +154 -0
- module.py +299 -0
- requirements.txt +7 -0
Hmodel.py
ADDED
@@ -0,0 +1,289 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils import weight_norm, spectral_norm, remove_weight_norm

# Based on the V2 model.
class ResBlock(nn.Module):
    def __init__(self, channels, kernel_size):
        """
        channels: number of input/output channels
        kernel_size: one of 3, 7, 11
        """
        super(ResBlock, self).__init__()
        # padding = (kernel_size-1)*dilation//2 ("same")
        self.convs1 = nn.ModuleList([
            weight_norm(nn.Conv1d(channels, channels, kernel_size, stride=1, dilation=1,
                                  padding=(kernel_size-1)*1//2)),
            weight_norm(nn.Conv1d(channels, channels, kernel_size, stride=1, dilation=1,
                                  padding=(kernel_size-1)*1//2))
        ])
        self.convs2 = nn.ModuleList([
            weight_norm(nn.Conv1d(channels, channels, kernel_size, stride=1, dilation=3,
                                  padding=(kernel_size-1)*3//2)),
            weight_norm(nn.Conv1d(channels, channels, kernel_size, stride=1, dilation=1,
                                  padding=(kernel_size-1)*1//2))
        ])
        self.convs3 = nn.ModuleList([
            weight_norm(nn.Conv1d(channels, channels, kernel_size, stride=1, dilation=5,
                                  padding=(kernel_size-1)*5//2)),
            weight_norm(nn.Conv1d(channels, channels, kernel_size, stride=1, dilation=1,
                                  padding=(kernel_size-1)*1//2))
        ])
        # renamed from self.modules, which shadowed nn.Module.modules()
        self.conv_groups = [self.convs1, self.convs2, self.convs3]

        # initialize weights from a normal distribution with mean 0, std 0.01
        for module in self.conv_groups:
            for conv in module:
                nn.init.normal_(conv.weight, mean=0.0, std=0.01)

    def forward(self, x):
        """
        =====inputs=====
        x: (B, channels, F)  # input features derived from the mel-spectrogram
        =====outputs=====
        x: (B, channels, F)  # output features
        """
        for module in self.conv_groups:
            for conv in module:
                y = F.leaky_relu(x, 0.1)
                y = conv(y)
                x = x + y
        return x

    def remove_weight_norm(self):
        for module in self.conv_groups:
            for conv in module:
                remove_weight_norm(conv)

class MRF(nn.Module):
    def __init__(self, channels):
        """
        channels: number of input/output channels
        """
        super(MRF, self).__init__()
        self.res_blocks = nn.ModuleList([
            ResBlock(channels, kernel_size=3),
            ResBlock(channels, kernel_size=7),
            ResBlock(channels, kernel_size=11),
        ])

    def forward(self, x):
        """
        =====inputs=====
        x: (B, channels, F)
        =====outputs=====
        x: (B, channels, F)
        """
        skip_list = []
        for res_block in self.res_blocks:
            skip_x = res_block(x)
            skip_list.append(skip_x)
        x = sum(skip_list) / len(self.res_blocks)
        return x

    def remove_weight_norm(self):
        for block in self.res_blocks:
            block.remove_weight_norm()

class Generator(nn.Module):
    def __init__(self):
        super(Generator, self).__init__()
        self.pre_conv = weight_norm(nn.Conv1d(80, 128, kernel_size=7, stride=1, dilation=1,
                                              padding=(7-1)//2))  # (B, 80, F) -> (B, 128, F)
        nn.init.normal_(self.pre_conv.weight, mean=0.0, std=0.01)  # not in the paper authors' implementation

        self.up_convs = nn.ModuleList()
        self.mrfs = nn.ModuleList()
        ku = [16, 16, 4, 4]
        for i in range(4):
            # upsample by a factor of ku[i]//2
            channels = 128//(2**(i+1))
            up_conv = weight_norm(nn.ConvTranspose1d(128//(2**i), channels, kernel_size=ku[i], stride=ku[i]//2,
                                                     padding=(ku[i]-ku[i]//2)//2))
            # (B, 128, F) -(1)-> (B, 64, F*8) -(2)-> (B, 32, F*8*8) -(3)-> (B, 16, F*8*8*2) -(4)-> (B, 8, F*8*8*2*2)
            nn.init.normal_(up_conv.weight, mean=0.0, std=0.01)
            self.up_convs.append(up_conv)

            # MRF
            mrf = MRF(channels)  # (B, channels, F) -> (B, channels, F)
            self.mrfs.append(mrf)

        self.post_conv = weight_norm(nn.Conv1d(8, 1, kernel_size=7, stride=1, dilation=1,
                                               padding=(7-1)//2))  # (B, 8, F*256) -> (B, 1, F*256)
        nn.init.normal_(self.post_conv.weight, mean=0.0, std=0.01)

    def forward(self, x):
        """
        =====inputs=====
        x: (B, 80, F)  # mel-spectrogram
        =====outputs=====
        x: (B, 1, F*256)  # waveform
        """
        x = self.pre_conv(x)  # (B, 80, F) -> (B, 128, F)
        for i in range(4):
            x = F.leaky_relu(x, 0.1)
            x = self.up_convs[i](x)
            x = self.mrfs[i](x)
            # final: (B, 128, F) -> (B, 8, F*256)
        x = F.leaky_relu(x, 0.1)
        x = self.post_conv(x)  # (B, 8, F*256) -> (B, 1, F*256)
        x = torch.tanh(x)
        return x

    def remove_weight_norm(self):
        print('Removing weight norm...')
        for l in self.up_convs:
            remove_weight_norm(l)
        for l in self.mrfs:
            l.remove_weight_norm()
        remove_weight_norm(self.pre_conv)
        remove_weight_norm(self.post_conv)

class SubPD(nn.Module):
    def __init__(self, period):
        # period: one of 2, 3, 5, 7, 11
        super(SubPD, self).__init__()
        self.period = period

        self.convs = nn.ModuleList()
        channels = 1
        for i in range(1, 5):  # implemented as in the paper, rather than the authors' modified implementation
            conv = weight_norm(nn.Conv2d(channels, 2**(5+i), kernel_size=(5, 1), stride=(3, 1), dilation=1, padding=0))
            self.convs.append(conv)
            channels = 2**(5+i)
        # (B, 1, [T/p]+1, p) -(1)-> (B, 64, ?, p) -(2)-> (B, 128, ?, p) -(3)-> (B, 256, ?, p) -(4)-> (B, 512, ?, p)
        last_conv = weight_norm(nn.Conv2d(channels, 1024, kernel_size=(5, 1), stride=(1, 1), dilation=1,
                                          padding=(2, 0)))  # (B, 512, ?, p) -> (B, 1024, ?, p)
        self.convs.append(last_conv)

        self.post_conv = weight_norm(nn.Conv2d(1024, 1, kernel_size=(3, 1), stride=(1, 1), dilation=1,
                                               padding=(1, 0)))  # (B, 1024, ?, p) -> (B, 1, ?, p)

    def forward(self, waveform):
        """
        =====inputs=====
        waveform: (B, 1, T)
        =====outputs=====
        x: (B, ?)  # flattened real/fake score vector (raw scores; no sigmoid)
        features: list of all intermediate features (used for the Feature Matching Loss)
        """
        features = []

        B, _, T = waveform.size()
        P = self.period
        # padding
        if T % P != 0:
            padding = P - (T % P)
            waveform = F.pad(waveform, (0, padding), "reflect")  # pad 0 on the left and `padding` on the right; reflect mirrors the signal
            # e.g. padding [1, 2, 3, 4, 5] by 2 on the left and 3 on the right in reflect mode -> [3, 2, 1, 2, 3, 4, 5, 4, 3, 2]
            T += padding
        # reshape
        x = waveform.view(B, 1, T//P, P)  # (B, 1, [T/P]+1, P)

        for conv in self.convs:
            x = conv(x)
            x = F.leaky_relu(x, 0.1)
            features.append(x)
        x = self.post_conv(x)
        features.append(x)
        x = torch.flatten(x, 1, -1)  # flatten from dim 1 to the last dim | (B, ?)
        # no sigmoid or clipping: the LSGAN losses in commons.py operate on raw scores
        return x, features

class MPD(nn.Module):
    def __init__(self):
        super(MPD, self).__init__()
        self.sub_pds = nn.ModuleList([
            SubPD(2), SubPD(3), SubPD(5), SubPD(7), SubPD(11),
        ])  # (B, 1, T) -> (B, ?), features list

    def forward(self, real_waveform, gen_waveform):
        """
        =====inputs=====
        real_waveform: (B, 1, T)  # real speech
        gen_waveform: (B, 1, T)  # generated speech
        =====outputs=====
        real_outputs: list of (B, ?) (len=5)  # SubPD outputs for real speech
        gen_outputs: list of (B, ?)  # SubPD outputs for generated speech
        real_features: list of feature lists  # SubPD features for real speech
        gen_features: list of feature lists  # SubPD features for generated speech
        """
        real_outputs, gen_outputs, real_features, gen_features = [], [], [], []
        for sub_pd in self.sub_pds:
            real_output, real_feature = sub_pd(real_waveform)
            gen_output, gen_feature = sub_pd(gen_waveform)
            real_outputs.append(real_output)
            gen_outputs.append(gen_output)
            real_features.append(real_feature)
            gen_features.append(gen_feature)
        return real_outputs, gen_outputs, real_features, gen_features

class SubSD(nn.Module):
    def __init__(self, first=False):
        """
        first: boolean (if True, apply spectral normalization)
        """
        super(SubSD, self).__init__()
        norm = spectral_norm if first else weight_norm  # spectral_norm if first is True, otherwise weight_norm
        self.convs = nn.ModuleList([  # implemented to match the MelGAN paper
            norm(nn.Conv1d(1, 16, kernel_size=15, stride=1, padding=(15-1)//2)),  # (B, 1, T) -> (B, 16, T)
            norm(nn.Conv1d(16, 64, kernel_size=41, stride=4, groups=4, padding=(41-1)//2)),  # (B, 16, T) -> (B, 64, ~T/4)
            norm(nn.Conv1d(64, 256, kernel_size=41, stride=4, groups=16, padding=(41-1)//2)),  # (B, 64, ~T/4) -> (B, 256, ~T/16)
            norm(nn.Conv1d(256, 1024, kernel_size=41, stride=4, groups=64, padding=(41-1)//2)),  # (B, 256, ~T/16) -> (B, 1024, ~T/64)
            norm(nn.Conv1d(1024, 1024, kernel_size=41, stride=4, groups=256, padding=(41-1)//2)),  # (B, 1024, ~T/64) -> (B, 1024, ~T/256)
            norm(nn.Conv1d(1024, 1024, kernel_size=5, stride=1, padding=(5-1)//2))  # (B, 1024, ~T/256) -> (B, 1024, ~T/256)
        ])
        self.post_conv = norm(nn.Conv1d(1024, 1, kernel_size=3, stride=1, padding=(3-1)//2))  # (B, 1024, ?) -> (B, 1, ?)

    def forward(self, waveform):
        """
        =====inputs=====
        waveform: (B, 1, T)
        =====outputs=====
        x: (B, ?)  # flattened real/fake score vector (raw scores; no sigmoid)
        features: list of all intermediate features (used for the Feature Matching Loss)
        """
        features = []
        x = waveform
        for conv in self.convs:
            x = conv(x)
            x = F.leaky_relu(x, 0.1)
            features.append(x)
        x = self.post_conv(x)  # (B, 1, ?)
        features.append(x)
        x = x.squeeze(1)  # (B, ?)
        # as in SubPD, raw scores are returned; the LSGAN losses need no sigmoid or clipping
        return x, features

class MSD(nn.Module):
    def __init__(self):
        super(MSD, self).__init__()
        self.sub_sds = nn.ModuleList([
            SubSD(first=True), SubSD(), SubSD()
        ])  # (B, 1, T) -> (B, ?), features list
        self.avgpool = nn.AvgPool1d(kernel_size=4, stride=2, padding=2)  # x2 downsampling

    def forward(self, real_waveform, gen_waveform):
        """
        =====inputs=====
        real_waveform: (B, 1, T)  # real speech
        gen_waveform: (B, 1, T)  # generated speech
        =====outputs=====
        real_outputs: list of (B, ?) (len=3)  # SubSD outputs for real speech
        gen_outputs: list of (B, ?)  # SubSD outputs for generated speech
        real_features: list of feature lists  # SubSD features for real speech
        gen_features: list of feature lists  # SubSD features for generated speech
        """
        real_outputs, gen_outputs, real_features, gen_features = [], [], [], []
        for idx, sub_sd in enumerate(self.sub_sds):
            if idx != 0:
                real_waveform = self.avgpool(real_waveform)
                gen_waveform = self.avgpool(gen_waveform)
            real_output, real_feature = sub_sd(real_waveform)
            gen_output, gen_feature = sub_sd(gen_waveform)
            real_outputs.append(real_output)
            gen_outputs.append(gen_output)
            real_features.append(real_feature)
            gen_features.append(gen_feature)
        return real_outputs, gen_outputs, real_features, gen_features
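A minimal vocoding sketch for the Generator above (hypothetical usage, not part of the commit): it pushes a dummy 80-band mel-spectrogram through the model and checks the expected 256x upsampling (8*8*2*2 from the four transposed convolutions).

import torch
from Hmodel import Generator

vocoder = Generator()
vocoder.eval()
vocoder.remove_weight_norm()   # fuse weight norm for inference
mel = torch.randn(1, 80, 100)  # dummy mel: (B, 80, F)
with torch.no_grad():
    wav = vocoder(mel)         # (B, 1, F*256)
print(wav.shape)               # torch.Size([1, 1, 25600])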
app.py
ADDED
@@ -0,0 +1,135 @@
import streamlit as st
import soundfile as sf
import timeit
import uuid
import re

import os

import torch

from datautils import *
from model import Generator as Glow_model
from utils import scan_checkpoint, plot_mel, plot_alignment  # note: utils.py is not among the files in this commit
from Hmodel import Generator as GAN_model
from scipy.io.wavfile import write  # needed by TTS.inference; missing from the original imports

MAX_WAV_VALUE = 32768.0
device = torch.device('cuda:0')
torch.cuda.manual_seed(1234)
name = '1038_eunsik_01'
out_dir = 'cache_sound'  # assumed output directory; the original used out_dir without defining it

# Nix
from nix.models.TTS import NixTTSInference

def init_session_state():
    # Model
    if "init_model" not in st.session_state:
        st.session_state.init_model = True
        st.session_state.model_variant = "KSS"
        st.session_state.TTS = NixTTSInference("assets/nix-ljspeech-sdp-v0.1")

def update_model():
    if st.session_state.model_variant == "KSS":
        st.session_state.TTS = NixTTSInference("assets/nix-ljspeech-v0.1")
    elif st.session_state.model_variant == "은식":
        st.session_state.TTS = NixTTSInference("assets/nix-ljspeech-sdp-v0.1")

def update_session_state(state_id, state_value):
    st.session_state[f"{state_id}"] = state_value

def centered_text(input_text, mode="h1"):
    st.markdown(
        f"<{mode} style='text-align: center;'>{input_text}</{mode}>", unsafe_allow_html=True)

def generate_voice(input_text):
    # TTS Inference
    c, c_length, phoneme = st.session_state.TTS.tokenize(input_text)
    voice = st.session_state.TTS.vocalize(c, c_length)

    # Save audio (bug in Streamlit, can't play numpy arrays directly)
    sf.write(f"cache_sound/{input_text}.wav", voice[0, 0], 22050)

    # Play audio
    st.audio(f"cache_sound/{input_text}.wav", format="audio/wav")
    os.remove(f"cache_sound/{input_text}.wav")
    st.caption("Generated Voice")

st.set_page_config(
    page_title="소신 Team Demo",
    page_icon="🔉",
)

init_session_state()

centered_text("🔉 소신 Team Demo")
centered_text("mel generator : Glow-TTS, vocoder : HiFi-GAN", "h5")
st.write(" ")

mode = "p"
st.markdown(
    f"<{mode} style='text-align: left;'><small>This is a demo trained on our own voice. The \"KSS\" voice was trained 3 times; \"은식\" was fine-tuned from \"KSS\" 3 times. We adapted this demo format from the Nix-TTS Interactive Demo.</small></{mode}>",
    unsafe_allow_html = True
)

st.write(" ")
st.write(" ")
col1, col2 = st.columns(2)

with col1:
    input_text = st.text_input(
        "한글로만 입력해주세요",  # "Please enter Hangul only"
        value = "딥러닝은 정말 재밌어!",  # "Deep learning is really fun!"
    )
with col2:
    model_variant = st.selectbox("목소리 선택해주세요", options = ["KSS", "은식"], index = 1)  # "Please choose a voice"
    if model_variant != st.session_state.model_variant:
        # Update variant choice
        update_session_state("model_variant", model_variant)
        # Re-load model
        update_model()

button_gen = st.button("Generate Voice")
if button_gen:
    generate_voice(input_text)


class TTS:
    def __init__(self, model_variant):
        # models are moved to `device` so the weights match the inference inputs below
        self.flowgenerator = Glow_model(n_vocab = 70, h_c = 192, f_c = 768, f_c_dp = 256, out_c = 80, k_s = 3, k_s_dec = 5, heads = 2, layers_enc = 6).to(device)
        self.voicegenerator = GAN_model().to(device)
        if model_variant == '은식':
            last_chpt1 = './log/1038_eunsik_01/Glow_TTS_00289602.pt'
            check_point = torch.load(last_chpt1)
            self.flowgenerator.load_state_dict(check_point['generator'])
            self.flowgenerator.decoder.skip()
            self.flowgenerator.eval()
        if model_variant == '은식':
            last_chpt2 = './log/1038_eunsik_01/HiFI_GAN_00257000.pt'
            check_point = torch.load(last_chpt2)
            self.voicegenerator.load_state_dict(check_point['gen_model'])
            self.voicegenerator.eval()
            self.voicegenerator.remove_weight_norm()

    def inference(self, input_text):
        # strip punctuation, then convert the text to a symbol-id sequence
        # (the original referenced `sentence` and `text` before they were assigned)
        filters = '([.,!?])'
        sentence = re.sub(re.compile(filters), '', input_text)
        x = text_to_sequence(sentence)
        x = torch.tensor(x).unsqueeze(0).to(device).long()
        x_length = torch.tensor(x.shape[1]).unsqueeze(0).to(device)

        with torch.no_grad():
            noise_scale = .667
            length_scale = 1.0
            (y_gen_tst, *_), *_, (attn_gen, *_) = self.flowgenerator(x, x_length, gen = True, noise_scale = noise_scale, length_scale = length_scale)
            y = self.voicegenerator(y_gen_tst)
            audio = y.squeeze() * MAX_WAV_VALUE
            audio = audio.cpu().numpy().astype('int16')

        output_file = os.path.join(out_dir, 'gen_' + sentence[:3] + '.wav')
        write(output_file, 22050, audio)
        print(f'{sentence} is stored in {out_dir}')
        # optional debug visualization; fig1/fig2 and ipd were never defined in this
        # file, so the originally unreachable calls are kept as comments:
        # plot_mel(y_gen_tst[0].data.cpu().numpy())
        # plot_alignment(attn_gen[0, 0].data.cpu().numpy(), sequence_to_text(x[0].data.cpu().numpy()))
        # ipd.display(fig1, fig2)
        # ipd.Audio(filename=output_file)
        return audio
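A hypothetical driver for the TTS class above, assuming a CUDA device and the two checkpoints under ./log/1038_eunsik_01/ shipped in this commit:

tts = TTS('은식')                               # Glow-TTS mel generator + HiFi-GAN vocoder
audio = tts.inference('딥러닝은 정말 재밌어!')  # int16 waveform at 22050 Hz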
commons.py
ADDED
@@ -0,0 +1,214 @@
import torch
import torch.nn.functional as F
from module import *
import numpy as np
import math


def sequence_mask(length, max_length=None):
    if max_length is None:
        max_length = max(length)
    x = torch.arange(max_length, dtype=length.dtype, device=length.device)
    return x.unsqueeze(0) < length.unsqueeze(1)


def maximum_path(value, mask, max_neg_val=-np.inf):
    """ Numpy-friendly version. It's about 4 times faster than the torch version.
    value: [b, t_x, t_y]
    mask: [b, t_x, t_y]
    """
    value = value * mask

    device = value.device
    dtype = value.dtype
    value = value.cpu().detach().numpy()
    mask = mask.cpu().detach().numpy().astype(bool)  # np.bool is deprecated; plain bool is equivalent

    b, t_x, t_y = value.shape
    direction = np.zeros(value.shape, dtype=np.int64)
    v = np.zeros((b, t_x), dtype=np.float32)
    x_range = np.arange(t_x, dtype=np.float32).reshape(1, -1)

    for j in range(t_y):
        v0 = np.pad(v, [[0, 0], [1, 0]], mode="constant", constant_values=max_neg_val)[:, :-1]
        v1 = v
        max_mask = (v1 >= v0)
        v_max = np.where(max_mask, v1, v0)
        direction[:, :, j] = max_mask

        index_mask = (x_range <= j)
        v = np.where(index_mask, v_max + value[:, :, j], max_neg_val)
    direction = np.where(mask, direction, 1)

    path = np.zeros(value.shape, dtype=np.float32)
    index = mask[:, :, 0].sum(1).astype(np.int64) - 1
    index_range = np.arange(b)

    for j in reversed(range(t_y)):
        path[index_range, index, j] = 1
        index = index + direction[index_range, index, j] - 1

    path = path * mask.astype(np.float32)
    path = torch.from_numpy(path).to(device=device, dtype=dtype)
    return path


def generate_path(duration, mask):
    """
    duration: [b, t_x]
    mask: [b, t_x, t_y]
    """
    device = duration.device

    b, t_x, t_y = mask.shape
    cum_duration = torch.cumsum(duration, 1)
    path = torch.zeros(b, t_x, t_y, dtype=mask.dtype).to(device=device)

    cum_duration_flat = cum_duration.view(b * t_x)
    path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
    path = path.view(b, t_x, t_y)
    path = path - F.pad(path, (0, 0, 1, 0, 0, 0))[:, :-1]
    path = path * mask
    return path

def mle_loss(z, m, logs, logdet, mask):
    # negative normal log-likelihood without the constant term
    l = torch.sum(logs) + 0.5 * torch.sum(torch.exp(-2 * logs) * ((z - m)**2))
    l = l - torch.sum(logdet)  # log jacobian determinant
    # average across batch, channel and time axes
    l = l / torch.sum(torch.ones_like(z) * mask)
    l = l + 0.5 * math.log(2 * math.pi)  # add the remaining constant term
    return l


def duration_loss(logw, logw_, lengths):
    l = torch.sum((logw - logw_)**2) / torch.sum(lengths)
    return l


class AttrDict(dict):
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self

def GAN_Loss_Generator(gen_outputs):
    """
    gen_outputs: list of (B, ?)  # outputs of MPD (len=5) or MSD (len=3)
    """
    loss = 0
    for DG in gen_outputs:
        loss += torch.mean((DG-1)**2)
    return loss

def GAN_Loss_Discriminator(real_outputs, gen_outputs):
    """
    real_outputs: list of (B, ?)  # outputs of MPD (len=5) or MSD (len=3)
    gen_outputs: list of (B, ?)  # outputs of MPD (len=5) or MSD (len=3)
    """
    loss = 0
    for D, DG in zip(real_outputs, gen_outputs):
        loss += torch.mean((D-1)**2 + DG**2)
    return loss

def Mel_Spectrogram_Loss(real_mel, gen_mel):
    """
    real_mel: (B, F, 80)  # mel-spectrogram taken from the dataloader
    gen_mel: (B, F, 80)  # mel-spectrogram of the waveform produced by the Generator
    """
    loss = F.l1_loss(real_mel, gen_mel)
    return 45*loss

def Feature_Matching_Loss(real_features, gen_features):
    """
    real_features: list of lists  # outputs of MPD (len=[5, 6]) or MSD (len=[3, 7])
    gen_features: list of lists  # outputs of MPD (len=[5, 6]) or MSD (len=[3, 7])
    """
    loss = 0
    for Ds, DGs in zip(real_features, gen_features):
        for D, DG in zip(Ds, DGs):
            loss += torch.mean(torch.abs(D - DG))
    return 2*loss

def Final_Loss_Generator(mpd_gen_outputs, mpd_real_features, mpd_gen_features,
                         msd_gen_outputs, msd_real_features, msd_gen_features,
                         real_mel, gen_mel):
    """
    =====inputs=====
    args [0:3]: the last three MPD outputs (gen_outputs, real_features, gen_features)
    args [3:6]: the last three MSD outputs
    args [6:8]: real_mel and gen_mel
    =====outputs=====
    Generator_Loss
    Mel_Loss
    """
    Gen_Adv1 = GAN_Loss_Generator(mpd_gen_outputs)
    Gen_Adv2 = GAN_Loss_Generator(msd_gen_outputs)
    Adv = Gen_Adv1 + Gen_Adv2
    FM1 = Feature_Matching_Loss(mpd_real_features, mpd_gen_features)
    FM2 = Feature_Matching_Loss(msd_real_features, msd_gen_features)
    FM = FM1 + FM2
    Mel_Loss = Mel_Spectrogram_Loss(real_mel, gen_mel)
    Generator_Loss = Adv + FM + Mel_Loss

    return Generator_Loss, Mel_Loss, Adv, FM

def Final_Loss_Discriminator(mpd_real_outputs, mpd_gen_outputs,
                             msd_real_outputs, msd_gen_outputs):
    """
    =====inputs=====
    args [0:2]: the first two MPD outputs (real_outputs, gen_outputs)
    args [2:4]: the first two MSD outputs
    =====outputs=====
    Discriminator_Loss
    """
    Disc_Adv1 = GAN_Loss_Discriminator(mpd_real_outputs, mpd_gen_outputs)
    Disc_Adv2 = GAN_Loss_Discriminator(msd_real_outputs, msd_gen_outputs)
    Discriminator_Loss = Disc_Adv1 + Disc_Adv2

    return Discriminator_Loss

class Adam():
    def __init__(self, params, scheduler, dim_model, warmup_steps=4000, lr=1e0, betas=(0.9, 0.98), eps=1e-9):
        self.params = params
        self.scheduler = scheduler
        self.dim_model = dim_model
        self.warmup_steps = warmup_steps
        self.lr = lr
        self.betas = betas
        self.eps = eps
        self.step_num = 1
        self.cur_lr = lr * self._get_lr_scale()

        self._optim = torch.optim.Adam(params, lr=self.cur_lr, betas=betas, eps=eps)
        self.param_groups = self._optim.param_groups

    def _get_lr_scale(self):
        if self.scheduler == "noam":
            return np.power(self.dim_model, -0.5) * np.min([np.power(self.step_num, -0.5), self.step_num * np.power(self.warmup_steps, -1.5)])
        else:
            return 1

    def _update_learning_rate(self):
        self.step_num += 1
        if self.scheduler == "noam":
            self.cur_lr = self.lr * self._get_lr_scale()
            for param_group in self._optim.param_groups:
                param_group['lr'] = self.cur_lr

    def get_lr(self):
        return self.cur_lr

    def step(self):
        self._optim.step()
        self._update_learning_rate()

    def zero_grad(self):
        self._optim.zero_grad()

    def load_state_dict(self, d):
        self._optim.load_state_dict(d)

    def state_dict(self):
        return self._optim.state_dict()
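For context, a sketch of one HiFi-GAN-style loss computation wiring the MPD/MSD discriminators from Hmodel.py into the helpers above; the tensors and shapes are dummies, so treat this as an illustration rather than the project's actual training loop.

import torch
from Hmodel import Generator, MPD, MSD
from commons import Final_Loss_Generator, Final_Loss_Discriminator
from datautils import mel_spectrogram

gen, mpd, msd = Generator(), MPD(), MSD()
real_wav = torch.rand(2, 1, 8192) * 2 - 1        # dummy waveform in [-1, 1]
real_mel = mel_spectrogram(real_wav.squeeze(1))  # (B, 80, F)
gen_wav = gen(real_mel)                          # (B, 1, F*256)
gen_mel = mel_spectrogram(gen_wav.squeeze(1))

# discriminator loss (generated audio detached)
mpd_out = mpd(real_wav, gen_wav.detach())
msd_out = msd(real_wav, gen_wav.detach())
d_loss = Final_Loss_Discriminator(mpd_out[0], mpd_out[1], msd_out[0], msd_out[1])

# generator loss = adversarial + feature matching + mel L1
_, mpd_g, mpd_rf, mpd_gf = mpd(real_wav, gen_wav)
_, msd_g, msd_rf, msd_gf = msd(real_wav, gen_wav)
g_loss, mel_loss, adv, fm = Final_Loss_Generator(mpd_g, mpd_rf, mpd_gf,
                                                 msd_g, msd_rf, msd_gf,
                                                 real_mel, gen_mel)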
datautils.py
ADDED
@@ -0,0 +1,67 @@
from jamo import hangul_to_jamo
import librosa
import torch

sample_rate = 22050
preemphasis = 0.97
n_fft = 1024
hop_length = 256
win_length = 1024
ref_db = 20
max_db = 100
mel_dim = 80

PAD = '_'
EOS = '~'
SPACE = ' '

JAMO_LEADS = "".join([chr(_) for _ in range(0x1100, 0x1113)])
JAMO_VOWELS = "".join([chr(_) for _ in range(0x1161, 0x1176)])
JAMO_TAILS = "".join([chr(_) for _ in range(0x11A8, 0x11C3)])

VALID_CHARS = JAMO_LEADS + JAMO_VOWELS + JAMO_TAILS + SPACE
symbols = PAD + EOS + VALID_CHARS

_symbol_to_id = {s: i for i, s in enumerate(symbols)}
_id_to_symbol = {i: s for i, s in enumerate(symbols)}

# Decompose the text into initial/medial/final jamo and return the corresponding symbol ids.
def text_to_sequence(text):
    sequence = []
    if not 0x1100 <= ord(text[0]) <= 0x1113:
        text = ''.join(list(hangul_to_jamo(text)))
    for s in text:
        sequence.append(_symbol_to_id[s])
    sequence.append(_symbol_to_id['~'])
    return sequence

def sequence_to_text(sequence):
    result = ''
    for symbol_id in sequence:
        if symbol_id in _id_to_symbol:
            s = _id_to_symbol[symbol_id]
            result += s
    return result.replace('}{', ' ')

def mel_spectrogram(y, n_fft=1024, num_mels=80, sampling_rate=22050, hop_size=256, win_size=1024, fmin=0, fmax=8000, center=False):
    """
    if torch.min(y) < -1.:
        print('min value is ', torch.min(y))
    if torch.max(y) > 1.:
        print('max value is ', torch.max(y))
    """

    mel = librosa.filters.mel(sampling_rate, n_fft, num_mels, fmin, fmax)  # positional args match librosa 0.7.2 (pinned in requirements.txt)

    y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
    y = y.squeeze(1)

    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=torch.hann_window(win_size).to(y.device),
                      center=center, pad_mode='reflect', normalized=False, onesided=True)

    spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))

    spec = torch.matmul(torch.from_numpy(mel).float().to(y.device), spec)
    spec = torch.log(torch.clamp(spec, min=1e-5) * 1)

    return spec
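A quick round-trip check of the jamo tokenizer above (hypothetical usage):

from datautils import text_to_sequence, sequence_to_text

seq = text_to_sequence('안녕')  # Hangul -> jamo -> symbol ids, with EOS ('~') appended
print(seq)
print(sequence_to_text(seq))    # decomposed jamo string ending in '~'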
log/1038_eunsik_01/Glow_TTS_00289602.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:7c84ba863d822d97db5c00dc4e69fcdf15d9040c456cab6503179bb220748cee
size 279930587
log/1038_eunsik_01/HiFI_GAN_00257000.pt
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2db9dd4f1fa06c40d98a19ac4e1740e390d66de5cef3138d670a29d8f917bddb
size 421187547
model.py
ADDED
@@ -0,0 +1,154 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from module import *
from commons import *
import math


class Generator(nn.Module):
    def __init__(self, n_vocab, h_c, f_c, f_c_dp, out_c, k_s = 3, k_s_dec = 5, heads=2, layers_enc = 6):
        super().__init__()
        self.encoder = Encoder(n_vocab, out_c, h_c, f_c, f_c_dp, heads = heads, layers = layers_enc, k_s = k_s)
        self.decoder = Decoder(in_c = out_c, hi_c = h_c, k_s = k_s_dec)

    def forward(self, x, x_lengths, y=None, y_lengths=None, gen = False, noise_scale=1., length_scale=1.):
        x_m, x_logs, logw, x_mask = self.encoder(x, x_lengths)
        if gen:
            w = torch.exp(logw) * x_mask * length_scale
            w_ceil = torch.ceil(w)
            y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
            y_max_length = None
            y, y_lengths, y_max_length = self.preprocess(y, y_lengths, y_max_length)

            z_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), 1).to(x_mask.dtype)
            attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(z_mask, 2)
            attn = generate_path(w_ceil.squeeze(1), attn_mask.squeeze(1)).unsqueeze(1)

            z_m = torch.matmul(attn.squeeze(1).transpose(1, 2), x_m.transpose(1, 2)).transpose(1, 2)  # [b, t', t], [b, t, d] -> [b, d, t']
            z_logs = torch.matmul(attn.squeeze(1).transpose(1, 2), x_logs.transpose(1, 2)).transpose(1, 2)  # [b, t', t], [b, t, d] -> [b, d, t']
            logw_ = torch.log(1e-8 + torch.sum(attn, -1)) * x_mask

            z = (z_m + torch.exp(z_logs) * torch.randn_like(z_m) * noise_scale) * z_mask
            y, logdet = self.decoder(z, z_mask, reverse=True)
            return (y, z_m, z_logs, logdet, z_mask), (x_m, x_logs, x_mask), (attn, logw, logw_)

        else:
            y_max_length = y.size(2)
            y, y_lengths, y_max_length = self.preprocess(y, y_lengths, y_max_length)
            z_mask = torch.unsqueeze(sequence_mask(y_lengths, y_max_length), 1).to(x_mask.dtype)
            attn_mask = torch.unsqueeze(x_mask, -1) * torch.unsqueeze(z_mask, 2)

            z, logdet = self.decoder(y, z_mask, reverse=False)
            with torch.no_grad():
                x_s_sq_r = torch.exp(-2 * x_logs)
                logp1 = torch.sum(-0.5 * math.log(2 * math.pi) - x_logs, [1]).unsqueeze(-1)  # [b, t, 1]
                logp2 = torch.matmul(x_s_sq_r.transpose(1, 2), -0.5 * (z ** 2))  # [b, t, d] x [b, d, t'] = [b, t, t']
                logp3 = torch.matmul((x_m * x_s_sq_r).transpose(1, 2), z)  # [b, t, d] x [b, d, t'] = [b, t, t']
                logp4 = torch.sum(-0.5 * (x_m ** 2) * x_s_sq_r, [1]).unsqueeze(-1)  # [b, t, 1]
                logp = logp1 + logp2 + logp3 + logp4  # [b, t, t']

                attn = maximum_path(logp, attn_mask.squeeze(1)).unsqueeze(1).detach()
            z_m = torch.matmul(attn.squeeze(1).transpose(1, 2), x_m.transpose(1, 2)).transpose(1, 2)  # [b, t', t], [b, t, d] -> [b, d, t']
            z_logs = torch.matmul(attn.squeeze(1).transpose(1, 2), x_logs.transpose(1, 2)).transpose(1, 2)  # [b, t', t], [b, t, d] -> [b, d, t']
            logw_ = torch.log(1e-8 + torch.sum(attn, -1)) * x_mask

            return (z, z_m, z_logs, logdet, z_mask), (x_m, x_logs, x_mask), (attn, logw, logw_)

    def preprocess(self, y, y_lengths, y_max_length):
        if y_max_length is not None:
            y_max_length = (y_max_length // 2) * 2
            y = y[:, :, :y_max_length]
        y_lengths = (y_lengths // 2) * 2
        return y, y_lengths, y_max_length


class Encoder(nn.Module):
    def __init__(self, n_vocab, out_c, h_c, f_c, f_c_dp, heads, layers, k_s, p=0.1, mean_only = True):
        super().__init__()
        self.h_c = h_c
        self.mean_only = mean_only
        self.emb = nn.Embedding(n_vocab, h_c)
        nn.init.normal_(self.emb.weight, 0.0, h_c**(-0.5))
        self.prenet = Prenet(in_c = h_c, hi_c = h_c, out_c = h_c, k_s = 5)
        self.drop = nn.Dropout(p=p)
        self.atten_layers = nn.ModuleList()
        self.norm_layers = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        for i in range(layers):
            self.atten_layers.append(MultiheadAttention(h_c, h_c, heads, window_size=4, heads_share=True, p=0.1, block_length=None))
            self.norm_layers.extend([Layernorm(h_c), Layernorm(h_c)])
            self.ffn_layers.append(FFN(h_c, f_c, k_s, p))
        self.proj_m = nn.Conv1d(h_c, out_c, 1)
        if not mean_only:
            self.proj_s = nn.Conv1d(h_c, out_c, 1)
        self.proj_w = DurationPredictor(h_c, f_c_dp, k_s, p=p)

    def forward(self, x, x_length):
        x = self.emb(x) * math.sqrt(self.h_c)  # [b,t,h]  (torch.sqrt on an integer tensor would fail)
        x = torch.transpose(x, 1, -1)  # [b,h,t]
        x_mask = torch.unsqueeze(sequence_mask(x_length, x.size(2)), 1).to(x.dtype)

        x = self.prenet(x, x_mask)
        atten_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        for i in range(len(self.atten_layers)):
            x = x * x_mask
            y = self.drop(self.atten_layers[i](x, atten_mask))
            x = self.norm_layers[2*i](x+y)

            y = self.drop(self.ffn_layers[i](x, x_mask))
            x = self.norm_layers[2*i+1](x+y)
        x = x*x_mask

        x_m = self.proj_m(x)
        if not self.mean_only:
            x_logs = self.proj_s(x)  # fixed: the original called self.proj_m here
        else:
            x_logs = torch.zeros_like(x_m)
        logw = self.proj_w(x.detach(), x_mask)

        return x_m, x_logs, logw, x_mask

class Decoder(nn.Module):
    def __init__(self, in_c, hi_c, k_s, d_l = 1, blocks = 12, splits = 4):
        super().__init__()
        self.flows = nn.ModuleList()
        for _ in range(blocks):
            self.flows.extend([ActNorm(in_c*2), InvConvNear(splits = splits), Couplinglayer(in_c*2, hi_c, k_s, d_l = d_l)])

    def forward(self, x, x_mask = None, reverse = False):
        if not reverse:
            flows = self.flows
            tot_logdet = 0
        else:
            flows = reversed(self.flows)
            tot_logdet = None

        b, c, t = x.shape
        t = t - t % 2
        if x_mask is None:
            mask = torch.ones(b, 1, t//2, device=x.device, dtype=x.dtype)
        else:
            mask = x_mask[:, :, 1::2]
        x = x[:, :, :t].reshape(b, c, t//2, 2).transpose(2, 3).contiguous().reshape(b, 2*c, t//2) * mask  # [b, 2c, t/2]
        for f in flows:
            x, logdet = f(x, mask, reverse = reverse)
            if not reverse:
                tot_logdet = tot_logdet + logdet
        if x_mask is None:
            mask = torch.ones(b, 1, t, device=x.device, dtype=x.dtype)
        else:
            mask = x_mask[:, :, :t]
        x = x.reshape(b, c, 2, t//2).transpose(2, 3).contiguous().reshape(b, c, t) * mask  # [b, c, t]
        return x, tot_logdet

    def skip(self):
        for f in self.flows:
            f.skip()

    def ddi_init(self):
        for i, f in enumerate(self.flows):
            if i % 3 == 0:
                f.set_ddi()
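A minimal, untested training-step sketch for the Glow-TTS Generator above, pairing its forward pass (gen=False) with the MLE and duration losses from commons.py; every tensor here is a dummy with assumed shapes.

import torch
from model import Generator
from commons import mle_loss, duration_loss

model = Generator(n_vocab=70, h_c=192, f_c=768, f_c_dp=256, out_c=80)
x = torch.randint(2, 70, (2, 20))  # dummy token ids (B, T_text)
x_lengths = torch.tensor([20, 16])
y = torch.randn(2, 80, 120)        # dummy target mels (B, 80, T_mel)
y_lengths = torch.tensor([120, 100])

(z, z_m, z_logs, logdet, z_mask), _, (attn, logw, logw_) = \
    model(x, x_lengths, y=y, y_lengths=y_lengths, gen=False)
loss = mle_loss(z, z_m, z_logs, logdet, z_mask) + duration_loss(logw, logw_, x_lengths)
loss.backward()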
module.py
ADDED
@@ -0,0 +1,299 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

######################################### encoder ##############################################

class Layernorm(nn.Module):
    def __init__(self, channels):
        super().__init__()
        self.gamma = nn.Parameter(torch.ones(1, channels))
        self.beta = nn.Parameter(torch.zeros(1, channels))

    def forward(self, x):
        m = torch.mean(x, dim = 1, keepdim = True)
        v = torch.mean((x-m)**2, dim = 1, keepdim = True)
        x = (x - m) * torch.rsqrt(v + 1e-4)  # normalization
        n = len(x.shape)
        shape = [1, -1] + [1]*(n-2)
        x = x*self.gamma.reshape(*shape) + self.beta.reshape(*shape)
        return x

class Prenet(nn.Module):
    def __init__(self, in_c, hi_c, out_c, k_s = 5, layers = 3, p = 0.05):
        super().__init__()
        self.crn = nn.ModuleList()
        self.crn.extend([nn.Conv1d(in_c, hi_c, k_s, padding = k_s//2), Layernorm(hi_c), nn.ReLU(), nn.Dropout(p=p)])
        self.crn.extend([nn.Conv1d(hi_c, hi_c, k_s, padding = k_s//2), Layernorm(hi_c), nn.ReLU(), nn.Dropout(p=p)])
        self.crn.extend([nn.Conv1d(hi_c, hi_c, k_s, padding = k_s//2), Layernorm(hi_c), nn.ReLU(), nn.Dropout(p=p)])

        self.proj = nn.Conv1d(hi_c, out_c, 1)
        self.proj.weight.data.zero_()
        self.proj.bias.data.zero_()

    def forward(self, start, x_mask=1):
        x = start
        for layer in self.crn:
            x = layer(x)  # [b, c, t]
            x = x * x_mask
        x = self.proj(x) + start  # [b, c, t]
        end = x * x_mask
        return end  # [b, c, t]

class MultiheadAttention(nn.Module):
    def __init__(self, c, out_c, heads, window_size=4, heads_share=True, p=0.1, block_length=None):
        super().__init__()

        self.k = c // heads
        self.window_size = window_size
        self.proj_q = nn.Conv1d(c, c, 1)
        self.proj_k = nn.Conv1d(c, c, 1)
        self.proj_v = nn.Conv1d(c, c, 1)

        nn.init.xavier_uniform_(self.proj_q.weight)
        nn.init.xavier_uniform_(self.proj_k.weight)
        nn.init.xavier_uniform_(self.proj_v.weight)

        n_heads_rel = 1 if heads_share else heads
        self.d_k = (self.k)**(-0.5)
        self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size*2 + 1, self.k) * self.d_k)
        self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size*2 + 1, self.k) * self.d_k)

        self.conv_o = nn.Conv1d(c, out_c, 1)
        self.drop = nn.Dropout(p=p)

    def forward(self, x, attn_mask=None):
        query, key, value = self.proj_q(x), self.proj_k(x), self.proj_v(x)
        b, c, t = query.shape
        h, k = c // self.k, self.k

        query = query.reshape(b, h, k, t)
        key = key.reshape(b, h, k, t)
        value = value.reshape(b, h, k, t)

        matrix = self.get_relative_matrix(self.emb_rel_k, t)
        rel_logit = torch.matmul(matrix.unsqueeze(0), query)  # [1,1,2t-1,k] * [b,h,k,t] = [b,h,2t-1,t]
        abs_logit = self.rel_to_abs(rel_logit.transpose(2, 3))
        local_score = abs_logit * self.d_k

        score = torch.matmul(query.transpose(2, 3), key) * self.d_k + local_score
        if attn_mask is not None:
            score = score.masked_fill(attn_mask == 0, -1e4)

        align = F.softmax(score, dim = -1)
        atten = self.drop(align)
        self.atten = atten

        matrix = self.get_relative_matrix(self.emb_rel_v, t).transpose(1, 2)  # [1,k,2t-1]
        weight = self.abs_to_rel(atten).transpose(2, 3)  # [b,h,2t-1,t]
        output = torch.matmul(value, atten) + torch.matmul(matrix.unsqueeze(0), weight)  # [b,h,k,t]
        x = self.conv_o(output.contiguous().reshape(b, c, t))
        return x

    def get_relative_matrix(self, emb_rel_k, t):
        s = self.window_size
        pad_size = max(t - s - 1, 0)
        start = max(s + 1 - t, 0)
        emb_rel_k = F.pad(emb_rel_k, (0, 0, pad_size, pad_size))
        return emb_rel_k[:, start:start + 2*t - 1]  # fixed: the original sliced start+2*t+1, which over-slices for short sequences

    def rel_to_abs(self, x):
        b, h, t, _ = x.shape
        x = F.pad(x, (0, 1)).reshape(b, h, 2*t*t)
        x = F.pad(x, (0, t-1)).reshape(b, h, t+1, 2*t-1)[:, :, :t, t-1:]
        return x

    def abs_to_rel(self, x):
        b, h, t, t = x.shape
        x = F.pad(x, (0, t-1)).reshape(b, h, 2*t*t - t)
        x = F.pad(x, (t, 0)).reshape(b, h, t, 2*t)[:, :, :, 1:]
        return x

class FFN(nn.Module):
    def __init__(self, h_c, f_c, k_s, p = 0.1):
        super().__init__()
        self.conv1 = nn.Conv1d(h_c, f_c, k_s, padding=k_s//2)
        self.conv2 = nn.Conv1d(f_c, h_c, k_s, padding=k_s//2)
        self.drop = nn.Dropout(p=p)

    def forward(self, x, x_mask = None):
        x = self.conv2(self.drop(F.relu(self.conv1(x*x_mask)))*x_mask)
        return x * x_mask

class DurationPredictor(nn.Module):
    def __init__(self, in_c, f_c, k_s, p=0.1):
        super().__init__()
        self.block1 = nn.Sequential(nn.Conv1d(in_c, f_c, k_s, padding=k_s//2),
                                    nn.ReLU(),
                                    Layernorm(f_c),
                                    nn.Dropout(p=p))
        self.block2 = nn.Sequential(nn.Conv1d(f_c, f_c, k_s, padding=k_s//2),
                                    nn.ReLU(),
                                    Layernorm(f_c),
                                    nn.Dropout(p=p))
        self.proj = nn.Conv1d(f_c, 1, 1)

    def forward(self, x, x_mask):
        x = self.block1(x * x_mask)
        x = self.block2(x * x_mask)
        x = self.proj(x * x_mask)
        return x * x_mask

######################################### decoder ##############################################
# TorchScript: static typing of tensors, computation-graph optimization, and
# ahead-of-time compilation accelerate this fused gate.

@torch.jit.script
def fuse_tan_sig_add(x: torch.Tensor, mid: int) -> torch.Tensor:
    a, b = x[:, :mid, :], x[:, mid:, :]
    return torch.sigmoid(a) * torch.tanh(b)

class WN(nn.Module):  # non-causal WaveNet without dilation
    def __init__(self, hi_c, k_s, d_l = 1, layers = 3, p=0.05):
        super().__init__()
        self.hi_c = hi_c
        self.resblocks = nn.ModuleList()
        self.skipblocks = nn.ModuleList()
        self.drop = nn.Dropout(p=p)
        for i in range(layers):
            res_layer = nn.Conv1d(hi_c, 2*hi_c, k_s, dilation=d_l, padding=k_s//2)
            res_layer = nn.utils.weight_norm(res_layer, name = 'weight')
            self.resblocks.append(res_layer)
            if i == layers - 1:
                skip_layer = nn.Conv1d(hi_c, hi_c, 1)  # last layer: skip output only
            else:
                skip_layer = nn.Conv1d(hi_c, 2*hi_c, 1)
            skip_layer = nn.utils.weight_norm(skip_layer, name = 'weight')
            self.skipblocks.append(skip_layer)

    def forward(self, x, x_mask = None):
        mid = self.hi_c
        end = torch.zeros_like(x, dtype=x.dtype)
        for i in range(len(self.resblocks)):
            x = self.drop(self.resblocks[i](x))  # [b, 2c, t]
            x = fuse_tan_sig_add(x, mid)  # [b, c, t]
            y = self.skipblocks[i](x)
            if i == len(self.resblocks) - 1:
                end = end + y  # last layer
            else:
                x = (x + y[:, :mid, :]) * x_mask
                end = end + y[:, mid:, :]
        return end * x_mask

    def skip(self):
        for layer1, layer2 in zip(self.resblocks, self.skipblocks):
            nn.utils.remove_weight_norm(layer1)
            nn.utils.remove_weight_norm(layer2)

class Couplinglayer(nn.Module):
    def __init__(self, in_c, hi_c, k_s, d_l = 1):
        super().__init__()
        s_proj = nn.Conv1d(in_c//2, hi_c, 1)
        self.start = nn.utils.weight_norm(s_proj, name = 'weight')
        # Initializing the last layer to 0 makes the affine coupling layers
        # do nothing at first. This helps stabilize training (from the Glow paper).
        self.end = nn.Conv1d(hi_c, in_c, 1)
        self.end.weight.data.zero_()
        self.end.bias.data.zero_()
        self.wn = WN(hi_c, k_s, d_l)

    # y = x * exp(logs) + t
    def forward(self, x, x_mask=None, reverse = False):
        if x_mask is None:
            x_mask = 1
        mid = x.shape[1]//2  # split the channels in half
        x_0, x_1 = x[:, :mid, :], x[:, mid:, :]
        z_1 = self.end(self.wn(self.start(x_1) * x_mask, x_mask))
        logs, t = z_1[:, mid:, :], z_1[:, :mid, :]
        if reverse:
            x_0 = torch.exp(-logs) * (x_0 - t) * x_mask
            logdet = None
        else:
            x_0 = torch.exp(logs + 1e-4) * x_0 + t
            logdet = torch.sum(logs * x_mask, [1, 2])  # sum(log(s))
        z = torch.cat([x_0, x_1], dim = 1)
        return z, logdet

    def skip(self):
        self.wn.skip()

class InvConvNear(nn.Module):
    def __init__(self, splits = 4):
        super().__init__()
        self.splits = splits
        w_init = torch.linalg.qr(torch.randn((splits, splits)).normal_())[0]  # orthonormal matrix
        if torch.det(w_init) < 0:
            w_init[0, :] = -w_init[0, :]
        self.weight = nn.Parameter(w_init)

    def forward(self, x, x_mask=None, reverse = False):
        b, c, t = x.shape

        if x_mask is None:
            x_mask = 1
            x_len = torch.ones(b) * t  # [b]
        else:
            x_len = torch.sum(x_mask, [1, 2])

        s = self.splits
        x = x.reshape(b, 2, c//s, s//2, t)  # split channels into 2 groups
        x = x.permute(0, 1, 3, 2, 4).contiguous().reshape(b, s, c//s, t)

        if reverse:
            if hasattr(self, "weight_inv"):
                weight = self.weight_inv
            else:  # fixed: the original recomputed the inverse unconditionally
                weight = torch.inverse(self.weight).to(dtype=self.weight.dtype)
            logdet = None
        else:
            weight = self.weight
            logdet = torch.logdet(weight) * (c//s) * x_len  # h*w*log(det(W)); no LU decomposition needed here

        weight = weight.unsqueeze(-1).unsqueeze(-1)
        z = F.conv2d(x, weight)  # z = matmul(weight, x_ij) for i, j in h = c//s, w = t

        z = z.reshape(b, 2, s//2, c//s, t).permute(0, 1, 3, 2, 4).contiguous().reshape(b, c, t) * x_mask
        return z, logdet

    def skip(self):
        # cache the inverse for fast reverse passes (fixed typo: was self.weigth_inv)
        self.weight_inv = torch.inverse(self.weight.float()).to(dtype=self.weight.dtype)


class ActNorm(nn.Module):
    def __init__(self, hi_c, ddi = False):  # ddi: data-dependent initialization
        super().__init__()
        self.logs = nn.Parameter(torch.zeros(1, hi_c, 1))
        self.bias = nn.Parameter(torch.zeros(1, hi_c, 1))
        self.ddi = ddi

    def forward(self, x, x_mask = None, reverse = False):
        b, _, t = x.shape
        if x_mask is None:
            x_mask = torch.ones(b, 1, t).to(device = x.device, dtype = x.dtype)
        x_len = torch.sum(x_mask, [1, 2])
        if self.ddi:
            self.initialize(x, x_mask)
            self.ddi = False
        # y = exp(logs) * x + bias -> normalization in the channel dim
        if reverse:
            z = (x - self.bias) * torch.exp(-self.logs) * x_mask
            logdet = None
        else:
            z = (torch.exp(self.logs) * x + self.bias) * x_mask
            logdet = torch.sum(self.logs, [1, 2]) * x_len
        return z, logdet

    def initialize(self, x, x_mask):
        with torch.no_grad():
            n = torch.sum(x_mask, [0, 2])
            m = torch.sum(x * x_mask, [0, 2]) / n
            m_s = torch.sum(x * x * x_mask, [0, 2]) / n
            v = m_s - m**2
            logs = 0.5 * torch.log(torch.clamp_min(v, 1e-6))

            init_bias = (-m * torch.exp(-logs)).reshape(*self.bias.shape).to(dtype = self.bias.dtype)  # -m/s (fixed: was -m/exp(-logs))
            init_logs = (-logs).reshape(*self.logs.shape).to(dtype = self.logs.dtype)  # -logs

            self.bias.data.copy_(init_bias)
            self.logs.data.copy_(init_logs)

    def set_ddi(self):
        self.ddi = True

    def skip(self):
        pass
requirements.txt
ADDED
@@ -0,0 +1,7 @@
torch==1.4.0
numpy==1.17.4
librosa==0.7.2
scipy==1.4.1
tensorboard==2.0
soundfile==0.10.3.post1
matplotlib==3.1.3