unilight committed
Commit 052c3ff
1 Parent(s): 781dc92
Files changed (3)
  1. app.py +184 -0
  2. models/modules.py +59 -0
  3. models/sslmos.py +212 -0
app.py ADDED
@@ -0,0 +1,184 @@
+ import os
+ from glob import glob
+
+ import torch.nn.functional as F
+ import torchaudio
+ from loguru import logger
+ import soundfile as sf
+ import librosa
+ import gradio as gr
+
+ from huggingface_hub import hf_hub_download
+ import time
+ import torch
+ import yaml
+
+ # from s3prl_vc.upstream.interface import get_upstream
+ # from s3prl.nn import Featurizer
+ # import s3prl_vc.models
+ # from s3prl_vc.utils import read_hdf5
+ # from s3prl_vc.vocoder import Vocoder
+
+
+ # ---------- Settings ----------
+ GPU_ID = '-1'
+ os.environ['CUDA_VISIBLE_DEVICES'] = GPU_ID
+ DEVICE = 'cuda' if GPU_ID != '-1' else 'cpu'
+
+ SERVER_PORT = 42208
+ SERVER_NAME = "0.0.0.0"
+ SSL_DIR = './keyble_ssl'
+
+ FS = 16000
+ resamplers = {}
+ MIN_REQUIRED_WAV_LENGTH = 1040
+
+ # EXAMPLE_DIR = './examples'
+ # en_examples = sorted(glob(os.path.join(EXAMPLE_DIR, "en", '*.wav')))
+ # jp_examples = sorted(glob(os.path.join(EXAMPLE_DIR, "jp", '*.wav')))
+ # zh_examples = sorted(glob(os.path.join(EXAMPLE_DIR, "zh", '*.wav')))
+
+ # TRGSPKS = ["TEF1", "TEF2", "TEM1", "TEM2"]
+
+ # ref_samples = {
+ #     trgspk: sorted(glob(os.path.join("./ref_samples", trgspk, '*.wav')))
+ #     for trgspk in TRGSPKS
+ # }
+
+ # ---------- Logging ----------
+ logger.add('app.log', mode='a')
+ logger.info('============================= App restarted =============================')
+
+ # ---------- Download models ----------
+ logger.info('============================= Download models ===========================')
+
+ model_paths = {
+     "SSL-MOS, all training sets": {
+         "ckpt": hf_hub_download(repo_id="unilight/sheet-models", filename="bvcc+nisqa+pstn+singmos+somos+tencent+tmhint-qi/sslmos+mdf/2337/checkpoint-86000steps.pkl"),
+         "config": hf_hub_download(repo_id="unilight/sheet-models", filename="bvcc+nisqa+pstn+singmos+somos+tencent+tmhint-qi/sslmos+mdf/2337/config.yml"),
+     }
+ }
+
+ # ---------- Model ----------
+ models = {}
+ configs = {}  # keep each model's config so predict() can look it up by name
+ for name, path_dict in model_paths.items():
+     logger.info(f'============================= Setting up model for {name} =============')
+     checkpoint_path = path_dict["ckpt"]
+     config_path = path_dict["config"]
+     with open(config_path) as f:
+         config = yaml.load(f, Loader=yaml.Loader)
+
+     if config["model_type"] == "SSLMOS":
+         from models.sslmos import SSLMOS
+         model = SSLMOS(
+             config["model_input"],
+             num_listeners=config.get("num_listeners", None),
+             num_domains=config.get("num_domains", None),
+             **config["model_params"],
+         ).to(DEVICE)
+     model.load_state_dict(torch.load(checkpoint_path, map_location="cpu")["model"])
+     model = model.eval().to(DEVICE)
+     logger.info(f"Loaded model parameters from {checkpoint_path}.")
+
+     models[name] = model
+     configs[name] = config
+
+ def read_wav(wav_path):
+     # read waveform
+     waveform, sample_rate = torchaudio.load(
+         wav_path, channels_first=False
+     )  # waveform: [T, 1]
+
+     # resample if needed (resamplers are cached per sample-rate pair)
+     if sample_rate != FS:
+         resampler_key = f"{sample_rate}-{FS}"
+         if resampler_key not in resamplers:
+             resamplers[resampler_key] = torchaudio.transforms.Resample(
+                 sample_rate, FS, dtype=waveform.dtype
+             )
+         waveform = resamplers[resampler_key](waveform)
+
+     waveform = waveform.squeeze(-1)
+
+     # always pad to a minimum length
+     if waveform.shape[0] < MIN_REQUIRED_WAV_LENGTH:
+         to_pad = MIN_REQUIRED_WAV_LENGTH - waveform.shape[0]
+         waveform = F.pad(waveform, (to_pad // 2, to_pad - to_pad // 2), "constant", 0)
+
+     return waveform, FS  # the waveform is always at FS after resampling
+
+ def predict(model_name, wav_file):
+     x, fs = read_wav(wav_file)
+     logger.info('wav file loaded')
+
+     # set up model input
+     config = configs[model_name]
+     model_input = x.unsqueeze(0).to(DEVICE)
+     model_lengths = model_input.new_tensor([model_input.size(1)]).long()
+     inputs = {
+         config["model_input"]: model_input,
+         config["model_input"] + "_lengths": model_lengths,
+     }
+
+     with torch.no_grad():
+         # model forward
+         if config["inference_mode"] == "mean_listener":
+             outputs = models[model_name].mean_listener_inference(inputs)
+         elif config["inference_mode"] == "mean_net":
+             outputs = models[model_name].mean_net_inference(inputs)
+
+     pred_mean_scores = outputs["scores"].cpu().detach().numpy()[0]
+
+     return pred_mean_scores
+
+ with gr.Blocks(title="SSL-MOS: automatic speech quality assessment demo") as demo:
+     gr.Markdown(
+         """
+         # SSL-MOS: automatic speech quality assessment demo
+         **SSL-MOS** predicts the mean opinion score (MOS) of an utterance using a self-supervised speech representation (S3R) backbone.
+         In this demo, you can record your voice, and the model will predict its quality as a score on a 1-5 scale. The checkpoint was trained on a combination of MOS datasets (BVCC, NISQA, PSTN, SingMOS, SOMOS, Tencent, TMHINT-QI) and is downloaded from [unilight/sheet-models](https://huggingface.co/unilight/sheet-models).
+         """
+     )
+
+     with gr.Row():
+         with gr.Column():
+             gr.Markdown("## Record your speech here!")
+             input_wav = gr.Audio(label="Input speech", source='microphone', type='filepath')
+
+             gr.Markdown("## Select a model!")
+             model_name = gr.Radio(label="Model", choices=list(model_paths.keys()))
+
+             evaluate_btn = gr.Button(value="Evaluate!")
+             # gr.Markdown("### You can use these examples if using a microphone is too troublesome!")
+             # gr.Markdown("I recorded the samples using my MacBook Pro, so there might be some noise.")
+             # gr.Examples(
+             #     examples=en_examples,
+             #     inputs=input_wav,
+             #     label="English examples"
+             # )
+             # gr.Examples(
+             #     examples=jp_examples,
+             #     inputs=input_wav,
+             #     label="Japanese examples"
+             # )
+             # gr.Examples(
+             #     examples=zh_examples,
+             #     inputs=input_wav,
+             #     label="Mandarin examples"
+             # )
+
+         with gr.Column():
+             gr.Markdown("## The predicted score is here:")
+             output_score = gr.Textbox(label="Prediction", interactive=False)
+             evaluate_btn.click(predict, [model_name, input_wav], output_score)
+
+ if __name__ == '__main__':
+     try:
+         demo.launch(debug=True,
+                     enable_queue=True,
+                     )
+     except KeyboardInterrupt as e:
+         print(e)
+
+     finally:
+         demo.close()
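
For reference, once the module-level setup in app.py has executed (downloading the checkpoint and populating models/configs), predict can be exercised without the Gradio UI. A minimal sketch; "sample.wav" is a placeholder path, not a file shipped with this commit:

    # assumes the module-level setup in app.py has already run
    score = predict("SSL-MOS, all training sets", "sample.wav")
    print(f"predicted MOS: {float(score):.2f}")  # a scalar roughly in the 1-5 range
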
models/modules.py ADDED
@@ -0,0 +1,59 @@
+ # -*- coding: utf-8 -*-
+
+ # Copyright 2024 Wen-Chin Huang
+ # MIT License (https://opensource.org/licenses/MIT)
+
+ # LDNet modules
+ # taken from: https://github.com/unilight/LDNet/blob/main/models/modules.py (written by myself)
+
+ import torch
+ from torch import nn
+
+ STRIDE = 3
+
+ class Projection(nn.Module):
+     def __init__(
+         self,
+         in_dim,
+         hidden_dim,
+         activation,
+         output_type,
+         _output_dim,
+         output_step=1.0,
+         range_clipping=False,
+     ):
+         super(Projection, self).__init__()
+         self.output_type = output_type
+         self.range_clipping = range_clipping
+         if output_type == "scalar":
+             output_dim = 1
+             if range_clipping:
+                 self.proj = nn.Tanh()
+         elif output_type == "categorical":
+             output_dim = _output_dim
+             self.output_step = output_step
+         else:
+             raise NotImplementedError("wrong output_type: {}".format(output_type))
+
+         self.net = nn.Sequential(
+             nn.Linear(in_dim, hidden_dim),
+             activation(),
+             nn.Dropout(0.3),
+             nn.Linear(hidden_dim, output_dim),
+         )
+
+     def forward(self, x, inference=False):
+         output = self.net(x)
+
+         # scalar / categorical
+         if self.output_type == "scalar":
+             # range clipping: map the tanh output in (-1, 1) to the MOS range (1, 5)
+             if self.range_clipping:
+                 return self.proj(output) * 2.0 + 3
+             else:
+                 return output
+         else:
+             if inference:
+                 # decode the argmax class index back into a score
+                 return torch.argmax(output, dim=-1) * self.output_step + 1
+             else:
+                 return output
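
For reference, a minimal sketch of how the two Projection heads behave. The dimensions below (768-dim input features, 17 classes) are illustrative assumptions, not values taken from the committed configs:

    import torch
    from torch import nn
    from models.modules import Projection

    x = torch.randn(2, 10, 768)  # (batch, time, feat_dim), hypothetical sizes

    # scalar head with range clipping: tanh output is rescaled into (1, 5)
    scalar_head = Projection(768, 64, nn.ReLU, "scalar", None, range_clipping=True)
    print(scalar_head(x).shape)  # torch.Size([2, 10, 1])

    # categorical head: 17 classes spaced 0.25 apart cover scores 1.0, 1.25, ..., 5.0
    cat_head = Projection(768, 64, nn.ReLU, "categorical", 17, output_step=0.25)
    print(cat_head(x).shape)                  # torch.Size([2, 10, 17]) logits for training
    print(cat_head(x, inference=True).shape)  # torch.Size([2, 10]) decoded scores
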
models/sslmos.py ADDED
@@ -0,0 +1,212 @@
+ # -*- coding: utf-8 -*-
+
+ # Copyright 2024 Wen-Chin Huang
+ # MIT License (https://opensource.org/licenses/MIT)
+
+ # SSLMOS model
+ # modified from: https://github.com/nii-yamagishilab/mos-finetune-ssl/blob/main/mos_fairseq.py (written by Erica Cooper)
+
+ import torch
+ import torch.nn as nn
+ from .modules import Projection
+
+
+ class SSLMOS(torch.nn.Module):
+     def __init__(
+         self,
+         model_input: str,  # name of the input feature (e.g. "waveform"); passed positionally by app.py
+         # model related
+         ssl_module: str,
+         s3prl_name: str,
+         ssl_model_output_dim: int,
+         ssl_model_layer_idx: int,
+         # mean net related
+         mean_net_dnn_dim: int = 64,
+         mean_net_output_type: str = "scalar",
+         mean_net_output_dim: int = 5,
+         mean_net_output_step: float = 0.25,
+         mean_net_range_clipping: bool = True,
+         # listener related
+         use_listener_modeling: bool = False,
+         num_listeners: int = None,
+         listener_emb_dim: int = None,
+         use_mean_listener: bool = True,
+         num_domains: int = None,  # accepted because app.py passes it; domain modeling is not used in this model
+         # decoder related
+         decoder_type: str = "ffn",
+         decoder_dnn_dim: int = 64,
+         output_type: str = "scalar",
+         range_clipping: bool = True,
+     ):
+         super().__init__()
+         self.use_mean_listener = use_mean_listener
+         self.output_type = output_type
+
+         # define ssl model
+         if ssl_module == "s3prl":
+             from s3prl.nn import S3PRLUpstream
+
+             if s3prl_name in S3PRLUpstream.available_names():
+                 self.ssl_model = S3PRLUpstream(s3prl_name)
+                 self.ssl_model_layer_idx = ssl_model_layer_idx
+             else:
+                 raise NotImplementedError
+         else:
+             raise NotImplementedError
+
+         # default uses ffn type mean net
+         self.mean_net_dnn = Projection(
+             ssl_model_output_dim,
+             mean_net_dnn_dim,
+             nn.ReLU,
+             mean_net_output_type,
+             mean_net_output_dim,
+             mean_net_output_step,
+             mean_net_range_clipping,
+         )
+
+         # listener modeling related
+         self.use_listener_modeling = use_listener_modeling
+         if use_listener_modeling:
+             self.num_listeners = num_listeners
+             self.listener_embeddings = nn.Embedding(
+                 num_embeddings=num_listeners, embedding_dim=listener_emb_dim
+             )
+             # define decoder
+             self.decoder_type = decoder_type
+             if decoder_type == "ffn":
+                 decoder_dnn_input_dim = ssl_model_output_dim + listener_emb_dim
+             else:
+                 raise NotImplementedError
+             # there is always a dnn on top of the decoder
+             self.activation = nn.ReLU
+             self.decoder_dnn = Projection(
+                 decoder_dnn_input_dim,
+                 decoder_dnn_dim,
+                 self.activation,
+                 output_type,
+                 None,  # output dim is only needed for categorical outputs
+                 range_clipping=range_clipping,
+             )
+
+     def get_num_params(self):
+         return sum(p.numel() for n, p in self.named_parameters())
+
+     def forward(self, inputs):
+         """Calculate forward propagation.
+
+         Args:
+             inputs: dict containing
+                 waveform: (batch, time)
+                 waveform_lengths: (batch,)
+                 listener_idxs: (batch,), only needed when listener modeling is used
+         """
+         waveform = inputs["waveform"]
+         waveform_lengths = inputs["waveform_lengths"]
+
+         batch, time = waveform.shape
+
+         # get listener embedding
+         if self.use_listener_modeling:
+             listener_ids = inputs["listener_idxs"]
+             # NOTE(unilight): not tested yet
+             listener_embs = self.listener_embeddings(listener_ids)  # (batch, emb_dim)
+             listener_embs = torch.stack(
+                 [listener_embs for i in range(time)], dim=1
+             )  # (batch, time, feat_dim)
+
+         # ssl model forward
+         all_encoder_outputs, all_encoder_outputs_lens = self.ssl_model(
+             waveform, waveform_lengths
+         )
+         encoder_outputs = all_encoder_outputs[self.ssl_model_layer_idx]
+         encoder_outputs_lens = all_encoder_outputs_lens[self.ssl_model_layer_idx]
+
+         # inject listener embedding
+         if self.use_listener_modeling:
+             # NOTE(unilight): not tested yet
+             encoder_outputs = encoder_outputs.view(
+                 (batch, time, -1)
+             )  # (batch, time, feat_dim)
+             decoder_inputs = torch.cat(
+                 [encoder_outputs, listener_embs], dim=-1
+             )  # concat along feature dimension
+         else:
+             decoder_inputs = encoder_outputs
+
+         # masked mean pooling (currently disabled; frame-level scores are averaged at inference instead)
+         # masks = make_non_pad_mask(encoder_outputs_lens)
+         # masks = masks.unsqueeze(-1).to(decoder_inputs.device)  # [B, max_time, 1]
+         # decoder_inputs = torch.sum(decoder_inputs * masks, dim=1) / encoder_outputs_lens.unsqueeze(-1)
+
+         # mean net
+         mean_net_outputs = self.mean_net_dnn(
+             decoder_inputs
+         )  # [batch, time, 1 (scalar) / output_dim (categorical)]
+
+         # decoder
+         if self.use_listener_modeling:
+             if self.decoder_type == "rnn":
+                 decoder_outputs, (h, c) = self.decoder_rnn(decoder_inputs)
+             else:
+                 decoder_outputs = decoder_inputs
+             decoder_outputs = self.decoder_dnn(
+                 decoder_outputs
+             )  # [batch, time, 1 (scalar) / output_dim (categorical)]
+
+         # set outputs
+         # return lengths for masked loss calculation
+         ret = {
+             "waveform_lengths": waveform_lengths,
+             "frame_lengths": encoder_outputs_lens,
+         }
+
+         # define scores
+         ret["mean_scores"] = mean_net_outputs
+         ret["ld_scores"] = decoder_outputs if self.use_listener_modeling else None
+
+         return ret
+
+     def mean_net_inference(self, inputs):
+         waveform = inputs["waveform"]
+         waveform_lengths = inputs["waveform_lengths"]
+
+         # ssl model forward
+         all_encoder_outputs, all_encoder_outputs_lens = self.ssl_model(
+             waveform, waveform_lengths
+         )
+         encoder_outputs = all_encoder_outputs[self.ssl_model_layer_idx]
+
+         # mean net: frame-level scores, then average over time
+         decoder_inputs = encoder_outputs
+         mean_net_outputs = self.mean_net_dnn(
+             decoder_inputs, inference=True
+         )  # [batch, time, 1 (scalar) / output_dim (categorical)]
+         mean_net_outputs = mean_net_outputs.squeeze(-1)
+         scores = torch.mean(mean_net_outputs, dim=1)  # [batch]
+
+         return {
+             "ssl_embeddings": encoder_outputs,
+             "scores": scores,
+         }
+
+     def mean_net_inference_p1(self, waveform, waveform_lengths):
+         # ssl model forward
+         all_encoder_outputs, _ = self.ssl_model(waveform, waveform_lengths)
+         encoder_outputs = all_encoder_outputs[self.ssl_model_layer_idx]
+         return encoder_outputs
+
+     def mean_net_inference_p2(self, encoder_outputs):
+         # mean net (mirrors the second half of mean_net_inference)
+         mean_net_outputs = self.mean_net_dnn(
+             encoder_outputs, inference=True
+         )  # [batch, time, 1 (scalar) / output_dim (categorical)]
+         mean_net_outputs = mean_net_outputs.squeeze(-1)
+         scores = torch.mean(mean_net_outputs, dim=1)
+
+         return scores
+
+     def get_ssl_embeddings(self, inputs):
+         waveform = inputs["waveform"]
+         waveform_lengths = inputs["waveform_lengths"]
+
+         all_encoder_outputs, all_encoder_outputs_lens = self.ssl_model(
+             waveform, waveform_lengths
+         )
+         encoder_outputs = all_encoder_outputs[self.ssl_model_layer_idx]
+         return encoder_outputs
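
For reference, a minimal sketch of instantiating SSLMOS and scoring one utterance. The upstream name "wav2vec2", layer index 9, and 768-dim output are illustrative assumptions; the app reads the real values from the downloaded config.yml:

    import torch
    from models.sslmos import SSLMOS

    model = SSLMOS(
        "waveform",
        ssl_module="s3prl",
        s3prl_name="wav2vec2",       # any name in S3PRLUpstream.available_names() works
        ssl_model_output_dim=768,
        ssl_model_layer_idx=9,
    ).eval()

    wav = torch.randn(1, 16000)  # 1 second of 16 kHz audio
    lengths = torch.tensor([wav.size(1)]).long()
    with torch.no_grad():
        out = model.mean_net_inference({"waveform": wav, "waveform_lengths": lengths})
    print(out["scores"])  # predicted MOS, shape [1]

Note that with randomly initialized weights the printed score is meaningless; the demo loads trained parameters from the checkpoint before calling mean_net_inference.
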