Upload ./vocos/pretrained.py with huggingface_hub

vocos/pretrained.py (ADDED, +204 -0)
from __future__ import annotations

from typing import Any, Dict, Tuple, Union, Optional

import torch
import yaml
from huggingface_hub import hf_hub_download
from torch import nn
from vocos.feature_extractors import FeatureExtractor, EncodecFeatures
from vocos.heads import FourierHead
from vocos.models import Backbone


def instantiate_class(args: Union[Any, Tuple[Any, ...]], init: Dict[str, Any]) -> Any:
    """Instantiates a class with the given args and init.

    Args:
        args: Positional arguments required for instantiation.
        init: Dict of the form {"class_path":...,"init_args":...}.

    Returns:
        The instantiated class object.
    """
    kwargs = init.get("init_args", {})
    if not isinstance(args, tuple):
        args = (args,)
    class_module, class_name = init["class_path"].rsplit(".", 1)
    module = __import__(class_module, fromlist=[class_name])
    args_class = getattr(module, class_name)
    return args_class(*args, **kwargs)
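For illustration, a minimal sketch of the config dict that instantiate_class expects (not part of this commit). The class path below is only a stand-in; any importable class path works.

# Illustrative only: instantiate torch.nn.Linear from a {"class_path", "init_args"} dict.
layer = instantiate_class(
    args=(),
    init={"class_path": "torch.nn.Linear", "init_args": {"in_features": 4, "out_features": 2}},
)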
class Vocos(nn.Module):
    """
    The Vocos class represents a Fourier-based neural vocoder for audio synthesis.
    This class is primarily designed for inference, with support for loading from pretrained
    model checkpoints. It consists of three main components: a feature extractor,
    a backbone, and a head.
    """

    def __init__(
        self, feature_extractor: FeatureExtractor, backbone: Backbone, head: FourierHead,
    ):
        super().__init__()
        self.feature_extractor = feature_extractor
        self.backbone = backbone
        self.head = head

    @classmethod
    def from_hparams(cls, config_path: str) -> Vocos:
        """
        Class method to create a new Vocos model instance from hyperparameters stored in a yaml configuration file.
        """
        with open(config_path, "r") as f:
            config = yaml.safe_load(f)
        feature_extractor = instantiate_class(args=(), init=config["feature_extractor"])
        backbone = instantiate_class(args=(), init=config["backbone"])
        head = instantiate_class(args=(), init=config["head"])
        model = cls(feature_extractor=feature_extractor, backbone=backbone, head=head)
        return model

    @classmethod
    def from_pretrained(cls, repo_id: str, revision: Optional[str] = None) -> Vocos:
        """
        Class method to create a new Vocos model instance from a pre-trained model stored in the Hugging Face model hub.
        """
        config_path = hf_hub_download(repo_id=repo_id, filename="config.yaml", revision=revision)
        model_path = hf_hub_download(repo_id=repo_id, filename="pytorch_model.bin", revision=revision)
        model = cls.from_hparams(config_path)
        state_dict = torch.load(model_path, map_location="cpu")
        if isinstance(model.feature_extractor, EncodecFeatures):
            encodec_parameters = {
                "feature_extractor.encodec." + key: value
                for key, value in model.feature_extractor.encodec.state_dict().items()
            }
            state_dict.update(encodec_parameters)
        model.load_state_dict(state_dict)
        model.eval()
        return model

    @torch.inference_mode()
    def forward(self, audio_input: torch.Tensor, **kwargs: Any) -> torch.Tensor:
        """
        Method to run a copy-synthesis from audio waveform. The feature extractor first processes the audio input,
        which is then passed through the backbone and the head to reconstruct the audio output.

        Args:
            audio_input (Tensor): The input tensor representing the audio waveform of shape (B, T),
                where B is the batch size and T is the waveform length.

        Returns:
            Tensor: The output tensor representing the reconstructed audio waveform of shape (B, T).
        """
        features = self.feature_extractor(audio_input, **kwargs)
        audio_output = self.decode(features, **kwargs)
        return audio_output

    @torch.inference_mode()
    def decode(self, features_input: torch.Tensor, **kwargs: Any) -> torch.Tensor:
        """
        Method to decode audio waveform from already calculated features. The features input is passed through
        the backbone and the head to reconstruct the audio output.

        Args:
            features_input (Tensor): The input tensor of features of shape (B, C, L), where B is the batch size,
                C denotes the feature dimension, and L is the sequence length.

        Returns:
            Tensor: The output tensor representing the reconstructed audio waveform of shape (B, T).
        """
        x = self.backbone(features_input, **kwargs)
        audio_output = self.head(x)
        return audio_output

    @torch.inference_mode()
    def codes_to_features(self, codes: torch.Tensor) -> torch.Tensor:
        """
        Transforms an input sequence of discrete tokens (codes) into feature embeddings using the feature extractor's
        codebook weights.

        Args:
            codes (Tensor): The input tensor. Expected shape is (K, L) or (K, B, L),
                where K is the number of codebooks, B is the batch size and L is the sequence length.

        Returns:
            Tensor: Features of shape (B, C, L), where B is the batch size, C denotes the feature dimension,
                and L is the sequence length.
        """
        assert isinstance(
            self.feature_extractor, EncodecFeatures
        ), "Feature extractor should be an instance of EncodecFeatures"

        if codes.dim() == 2:
            codes = codes.unsqueeze(1)

        n_bins = self.feature_extractor.encodec.quantizer.bins
        offsets = torch.arange(0, n_bins * len(codes), n_bins, device=codes.device)
        embeddings_idxs = codes + offsets.view(-1, 1, 1)
        features = torch.nn.functional.embedding(embeddings_idxs, self.feature_extractor.codebook_weights).sum(dim=0)
        features = features.transpose(1, 2)

        return features
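As a point of reference, a minimal copy-synthesis sketch with the Vocos class above (not part of the diff). The repo id and the random waveform are illustrative assumptions.

# Assumption: the "charactr/vocos-mel-24khz" checkpoint is available on the Hub
# and the input is a mono 24 kHz waveform of shape (B, T).
import torch
from vocos.pretrained import Vocos

model = Vocos.from_pretrained("charactr/vocos-mel-24khz")
audio = torch.randn(1, 24000)  # stand-in for one second of real 24 kHz audio
reconstructed = model(audio)   # copy-synthesis: features -> backbone -> head -> (B, T)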

class VocosDecoder(nn.Module):
    """
    The VocosDecoder class represents the decoder part of the Vocos vocoder, reconstructing
    an audio waveform from precomputed features. This class is primarily designed for
    inference and consists of two main components: a backbone and a head.
    """

    def __init__(
        self, backbone: Backbone, head: FourierHead,
    ):
        super().__init__()
        self.backbone = backbone
        self.head = head

    @classmethod
    def from_hparams(cls, config_path: str) -> VocosDecoder:
        """
        Class method to create a new VocosDecoder model instance from hyperparameters stored in a yaml configuration file.
        """
        with open(config_path, "r") as f:
            config = yaml.safe_load(f)
        backbone = instantiate_class(args=(), init=config["backbone"])
        head = instantiate_class(args=(), init=config["head"])
        model = cls(backbone=backbone, head=head)
        return model

    @torch.inference_mode()
    def forward(self, features: torch.Tensor, **kwargs: Any) -> torch.Tensor:
        """
        Method to reconstruct an audio waveform from precomputed features. The features are
        passed through the backbone and the head to produce the audio output.

        Args:
            features (Tensor): The input tensor of features of shape (B, C, L), where B is the batch size,
                C denotes the feature dimension, and L is the sequence length.

        Returns:
            Tensor: The output tensor representing the reconstructed audio waveform of shape (B, T).
        """
        audio_output = self.decode(features, **kwargs)
        return audio_output

    @torch.inference_mode()
    def decode(self, features_input: torch.Tensor, **kwargs: Any) -> torch.Tensor:
        """
        Method to decode audio waveform from already calculated features. The features input is passed through
        the backbone and the head to reconstruct the audio output.

        Args:
            features_input (Tensor): The input tensor of features of shape (B, C, L), where B is the batch size,
                C denotes the feature dimension, and L is the sequence length.

        Returns:
            Tensor: The output tensor representing the reconstructed audio waveform of shape (B, T).
        """
        x = self.backbone(features_input, **kwargs)
        audio_output = self.head(x)
        return audio_output
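Finally, a hedged sketch of the decoder-only path. The config file name and the feature shape are illustrative assumptions; the feature dimension C must match the backbone's expected input channels from the config.

# Assumption: "decoder_config.yaml" is a local config with "backbone" and "head"
# entries, and the configured backbone expects 100 input channels.
import torch
from vocos.pretrained import VocosDecoder

decoder = VocosDecoder.from_hparams("decoder_config.yaml")
decoder.eval()

features = torch.randn(1, 100, 256)  # (B, C, L): batch, feature dim, frames
waveform = decoder(features)         # reconstructed audio of shape (B, T)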