KingNish committed
Commit f72005d · verified · 1 Parent(s): 6f1fc14

Upload ./vocos/pretrained.py with huggingface_hub

Files changed (1):
vocos/pretrained.py +204 -0
vocos/pretrained.py ADDED
@@ -0,0 +1,204 @@
+ from __future__ import annotations
+
+ from typing import Any, Dict, Tuple, Union, Optional
+
+ import torch
+ import yaml
+ from huggingface_hub import hf_hub_download
+ from torch import nn
+ from vocos.feature_extractors import FeatureExtractor, EncodecFeatures
+ from vocos.heads import FourierHead
+ from vocos.models import Backbone
+
+
+ def instantiate_class(args: Union[Any, Tuple[Any, ...]], init: Dict[str, Any]) -> Any:
+     """Instantiates a class with the given args and init.
+
+     Args:
+         args: Positional arguments required for instantiation.
+         init: Dict of the form {"class_path": ..., "init_args": ...}.
+
+     Returns:
+         The instantiated class object.
+     """
+     kwargs = init.get("init_args", {})
+     if not isinstance(args, tuple):
+         args = (args,)
+     class_module, class_name = init["class_path"].rsplit(".", 1)
+     module = __import__(class_module, fromlist=[class_name])
+     args_class = getattr(module, class_name)
+     return args_class(*args, **kwargs)
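As an illustrative sketch of the config convention this loader consumes (torch.nn.Linear stands in as an arbitrary importable class; it is not something Vocos itself instantiates):

    layer = instantiate_class(
        args=(),
        init={"class_path": "torch.nn.Linear",            # any importable class works
              "init_args": {"in_features": 4, "out_features": 2}},
    )
    print(type(layer))  # a torch.nn.Linear instance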
+
+
+ class Vocos(nn.Module):
+     """
+     The Vocos class represents a Fourier-based neural vocoder for audio synthesis.
+     This class is primarily designed for inference, with support for loading from pretrained
+     model checkpoints. It consists of three main components: a feature extractor,
+     a backbone, and a head.
+     """
+
+     def __init__(
+         self, feature_extractor: FeatureExtractor, backbone: Backbone, head: FourierHead,
+     ):
+         super().__init__()
+         self.feature_extractor = feature_extractor
+         self.backbone = backbone
+         self.head = head
+
+     @classmethod
+     def from_hparams(cls, config_path: str) -> Vocos:
+         """
+         Class method to create a new Vocos model instance from hyperparameters stored in a YAML configuration file.
+         """
+         with open(config_path, "r") as f:
+             config = yaml.safe_load(f)
+         feature_extractor = instantiate_class(args=(), init=config["feature_extractor"])
+         backbone = instantiate_class(args=(), init=config["backbone"])
+         head = instantiate_class(args=(), init=config["head"])
+         model = cls(feature_extractor=feature_extractor, backbone=backbone, head=head)
+         return model
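from_hparams expects the YAML file to carry feature_extractor, backbone, and head entries in the {"class_path", "init_args"} form consumed by instantiate_class. A hypothetical round-trip sketch (the class paths and hyperparameters below are assumptions for illustration, not values from a published checkpoint):

    cfg = {
        "feature_extractor": {
            "class_path": "vocos.feature_extractors.MelSpectrogramFeatures",  # assumed
            "init_args": {"sample_rate": 24000},
        },
        "backbone": {
            "class_path": "vocos.models.VocosBackbone",  # assumed
            "init_args": {"input_channels": 100, "dim": 512,
                          "intermediate_dim": 1536, "num_layers": 8},
        },
        "head": {
            "class_path": "vocos.heads.ISTFTHead",  # assumed
            "init_args": {"dim": 512, "n_fft": 1024, "hop_length": 256},
        },
    }
    with open("config.yaml", "w") as f:
        yaml.safe_dump(cfg, f)
    model = Vocos.from_hparams("config.yaml")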
+
+     @classmethod
+     def from_pretrained(cls, repo_id: str, revision: Optional[str] = None) -> Vocos:
+         """
+         Class method to create a new Vocos model instance from a pre-trained model stored in the Hugging Face model hub.
+         """
+         config_path = hf_hub_download(repo_id=repo_id, filename="config.yaml", revision=revision)
+         model_path = hf_hub_download(repo_id=repo_id, filename="pytorch_model.bin", revision=revision)
+         model = cls.from_hparams(config_path)
+         state_dict = torch.load(model_path, map_location="cpu")
+         if isinstance(model.feature_extractor, EncodecFeatures):
+             # Overlay EnCodec weights from the freshly instantiated feature extractor so
+             # that load_state_dict receives a complete, matching set of keys.
+             encodec_parameters = {
+                 "feature_extractor.encodec." + key: value
+                 for key, value in model.feature_extractor.encodec.state_dict().items()
+             }
+             state_dict.update(encodec_parameters)
+         model.load_state_dict(state_dict)
+         model.eval()
+         return model
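A minimal loading sketch; the repo id below is an assumed example, and any Hub repo exposing config.yaml plus pytorch_model.bin in this layout would work the same way:

    model = Vocos.from_pretrained("charactr/vocos-mel-24khz")  # assumed repo id
    n_params = sum(p.numel() for p in model.parameters())
    print(f"{n_params / 1e6:.1f}M parameters")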
+
+     @torch.inference_mode()
+     def forward(self, audio_input: torch.Tensor, **kwargs: Any) -> torch.Tensor:
+         """
+         Method to run copy-synthesis from an audio waveform. The feature extractor first processes the audio input,
+         which is then passed through the backbone and the head to reconstruct the audio output.
+
+         Args:
+             audio_input (Tensor): The input tensor representing the audio waveform of shape (B, T),
+                 where B is the batch size and T is the waveform length.
+
+         Returns:
+             Tensor: The output tensor representing the reconstructed audio waveform of shape (B, T).
+         """
+         features = self.feature_extractor(audio_input, **kwargs)
+         audio_output = self.decode(features, **kwargs)
+         return audio_output
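Copy-synthesis sketch (the one-second batch below is synthetic, and the 24 kHz rate is an assumption tied to the checkpoint's feature extractor):

    audio_in = torch.randn(1, 24000)   # (B, T): one second at an assumed 24 kHz
    audio_out = model(audio_in)        # extract features, then decode
    print(audio_out.shape)             # (1, T'), T' close to 24000 up to hop padding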
+
+     @torch.inference_mode()
+     def decode(self, features_input: torch.Tensor, **kwargs: Any) -> torch.Tensor:
+         """
+         Method to decode an audio waveform from already calculated features. The features input is passed through
+         the backbone and the head to reconstruct the audio output.
+
+         Args:
+             features_input (Tensor): The input tensor of features of shape (B, C, L), where B is the batch size,
+                 C denotes the feature dimension, and L is the sequence length.
+
+         Returns:
+             Tensor: The output tensor representing the reconstructed audio waveform of shape (B, T).
+         """
+         x = self.backbone(features_input, **kwargs)
+         audio_output = self.head(x)
+         return audio_output
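Decoding from precomputed features (the 100-channel feature dimension is an assumption matching a mel-style config):

    features = torch.randn(1, 100, 256)   # (B, C, L); C=100 assumed
    audio = model.decode(features)        # (B, T)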
+
+     @torch.inference_mode()
+     def codes_to_features(self, codes: torch.Tensor) -> torch.Tensor:
+         """
+         Transforms an input sequence of discrete tokens (codes) into feature embeddings using the feature extractor's
+         codebook weights.
+
+         Args:
+             codes (Tensor): The input tensor. Expected shape is (K, L) or (K, B, L),
+                 where K is the number of codebooks, B is the batch size and L is the sequence length.
+
+         Returns:
+             Tensor: Features of shape (B, C, L), where B is the batch size, C denotes the feature dimension,
+                 and L is the sequence length.
+         """
+         assert isinstance(
+             self.feature_extractor, EncodecFeatures
+         ), "Feature extractor should be an instance of EncodecFeatures"
+
+         if codes.dim() == 2:
+             codes = codes.unsqueeze(1)
+
+         # Each of the K codebooks occupies its own n_bins-wide slice of the flattened
+         # codebook table, so a per-codebook offset is added before the embedding lookup.
+         n_bins = self.feature_extractor.encodec.quantizer.bins
+         offsets = torch.arange(0, n_bins * len(codes), n_bins, device=codes.device)
+         embeddings_idxs = codes + offsets.view(-1, 1, 1)
+         # Sum the per-codebook embeddings and move the channel axis to dim 1.
+         features = torch.nn.functional.embedding(embeddings_idxs, self.feature_extractor.codebook_weights).sum(dim=0)
+         features = features.transpose(1, 2)
+
+         return features
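Token-to-audio sketch under the assumption of an EnCodec-based checkpoint; the 8 codebooks, 1024 bins, and the bandwidth_id kwarg forwarded through decode to the backbone are all assumptions here:

    codes = torch.randint(0, 1024, (8, 1, 200))       # (K, B, L); K=8, 1024 bins assumed
    features = model.codes_to_features(codes)         # (B, C, L)
    audio = model.decode(features, bandwidth_id=torch.tensor([2]))  # assumed kwarg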
+
+
+ class VocosDecoder(nn.Module):
+     """
+     The VocosDecoder class represents the decoder part of a Fourier-based neural vocoder.
+     This class is primarily designed for inference. It consists of two main components:
+     a backbone and a head.
+     """
+
+     def __init__(
+         self, backbone: Backbone, head: FourierHead,
+     ):
+         super().__init__()
+         self.backbone = backbone
+         self.head = head
+
+     @classmethod
+     def from_hparams(cls, config_path: str) -> VocosDecoder:
+         """
+         Class method to create a new VocosDecoder instance from hyperparameters stored in a YAML configuration file.
+         """
+         with open(config_path, "r") as f:
+             config = yaml.safe_load(f)
+         backbone = instantiate_class(args=(), init=config["backbone"])
+         head = instantiate_class(args=(), init=config["head"])
+         model = cls(backbone=backbone, head=head)
+         return model
+
+     @torch.inference_mode()
+     def forward(self, features: torch.Tensor, **kwargs: Any) -> torch.Tensor:
+         """
+         Method to reconstruct an audio waveform from precomputed features. The features are passed
+         through the backbone and the head to reconstruct the audio output.
+
+         Args:
+             features (Tensor): The input tensor of features of shape (B, C, L), where B is the batch size,
+                 C denotes the feature dimension, and L is the sequence length.
+
+         Returns:
+             Tensor: The output tensor representing the reconstructed audio waveform of shape (B, T).
+         """
+         audio_output = self.decode(features, **kwargs)
+         return audio_output
+
+     @torch.inference_mode()
+     def decode(self, features_input: torch.Tensor, **kwargs: Any) -> torch.Tensor:
+         """
+         Method to decode an audio waveform from already calculated features. The features input is passed through
+         the backbone and the head to reconstruct the audio output.
+
+         Args:
+             features_input (Tensor): The input tensor of features of shape (B, C, L), where B is the batch size,
+                 C denotes the feature dimension, and L is the sequence length.
+
+         Returns:
+             Tensor: The output tensor representing the reconstructed audio waveform of shape (B, T).
+         """
+         x = self.backbone(features_input, **kwargs)
+         audio_output = self.head(x)
+         return audio_output
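A VocosDecoder usage sketch (the config file name is hypothetical, and its backbone/head entries are assumed to follow the same {"class_path", "init_args"} convention as above):

    decoder = VocosDecoder.from_hparams("decoder_config.yaml")  # hypothetical path
    decoder.eval()
    features = torch.randn(1, 100, 256)   # (B, C, L); C=100 assumed
    audio = decoder(features)             # (B, T)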