Text-to-Speech
English
geneing committed on
Commit
2d2f498
·
1 Parent(s): 8fdffc3

Fixed a missing change. Updated models_onnx.py from the upstream models.py.

Browse files
Files changed (2) hide show
  1. kokoro.py +1 -1
  2. models_onnx.py +4 -224
kokoro.py CHANGED
@@ -116,7 +116,7 @@ def forward(model, tokens, ref_s, speed):
116
  tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
117
  input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
118
  text_mask = length_to_mask(input_lengths).to(device)
119
- bert_dur = model.bert(tokens, attention_mask=(~text_mask).int())
120
  d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
121
  s = ref_s[:, 128:]
122
  d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)
 
116
  tokens = torch.LongTensor([[0, *tokens, 0]]).to(device)
117
  input_lengths = torch.LongTensor([tokens.shape[-1]]).to(device)
118
  text_mask = length_to_mask(input_lengths).to(device)
119
+ bert_dur = model.bert(tokens)
120
  d_en = model.bert_encoder(bert_dur).transpose(-1, -2)
121
  s = ref_s[:, 128:]
122
  d = model.predictor.text_encoder(d_en, s, input_lengths, text_mask)
models_onnx.py CHANGED
@@ -1,6 +1,5 @@
1
  # https://github.com/yl4579/StyleTTS2/blob/main/models.py
2
- from ast import Tuple
3
- from istftnet import Decoder
4
  from munch import Munch
5
  from pathlib import Path
6
  from plbert import load_plbert
@@ -12,118 +11,6 @@ import torch
12
  import torch.nn as nn
13
  import torch.nn.functional as F
14
 
15
- class LearnedDownSample(nn.Module):
16
- def __init__(self, layer_type, dim_in):
17
- super().__init__()
18
- self.layer_type = layer_type
19
-
20
- if self.layer_type == 'none':
21
- self.conv = nn.Identity()
22
- elif self.layer_type == 'timepreserve':
23
- self.conv = spectral_norm(nn.Conv2d(dim_in, dim_in, kernel_size=(3, 1), stride=(2, 1), groups=dim_in, padding=(1, 0)))
24
- elif self.layer_type == 'half':
25
- self.conv = spectral_norm(nn.Conv2d(dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2), groups=dim_in, padding=1))
26
- else:
27
- raise RuntimeError('Got unexpected donwsampletype %s, expected is [none, timepreserve, half]' % self.layer_type)
28
-
29
- def forward(self, x):
30
- return self.conv(x)
31
-
32
- class LearnedUpSample(nn.Module):
33
- def __init__(self, layer_type, dim_in):
34
- super().__init__()
35
- self.layer_type = layer_type
36
-
37
- if self.layer_type == 'none':
38
- self.conv = nn.Identity()
39
- elif self.layer_type == 'timepreserve':
40
- self.conv = nn.ConvTranspose2d(dim_in, dim_in, kernel_size=(3, 1), stride=(2, 1), groups=dim_in, output_padding=(1, 0), padding=(1, 0))
41
- elif self.layer_type == 'half':
42
- self.conv = nn.ConvTranspose2d(dim_in, dim_in, kernel_size=(3, 3), stride=(2, 2), groups=dim_in, output_padding=1, padding=1)
43
- else:
44
- raise RuntimeError('Got unexpected upsampletype %s, expected is [none, timepreserve, half]' % self.layer_type)
45
-
46
-
47
- def forward(self, x):
48
- return self.conv(x)
49
-
50
- class DownSample(nn.Module):
51
- def __init__(self, layer_type):
52
- super().__init__()
53
- self.layer_type = layer_type
54
-
55
- def forward(self, x):
56
- if self.layer_type == 'none':
57
- return x
58
- elif self.layer_type == 'timepreserve':
59
- return F.avg_pool2d(x, (2, 1))
60
- elif self.layer_type == 'half':
61
- if x.shape[-1] % 2 != 0:
62
- x = torch.cat([x, x[..., -1].unsqueeze(-1)], dim=-1)
63
- return F.avg_pool2d(x, 2)
64
- else:
65
- raise RuntimeError('Got unexpected donwsampletype %s, expected is [none, timepreserve, half]' % self.layer_type)
66
-
67
-
68
- class UpSample(nn.Module):
69
- def __init__(self, layer_type):
70
- super().__init__()
71
- self.layer_type = layer_type
72
-
73
- def forward(self, x):
74
- if self.layer_type == 'none':
75
- return x
76
- elif self.layer_type == 'timepreserve':
77
- return F.interpolate(x, scale_factor=(2, 1), mode='nearest')
78
- elif self.layer_type == 'half':
79
- return F.interpolate(x, scale_factor=2, mode='nearest')
80
- else:
81
- raise RuntimeError('Got unexpected upsampletype %s, expected is [none, timepreserve, half]' % self.layer_type)
82
-
83
-
84
- class ResBlk(nn.Module):
85
- def __init__(self, dim_in, dim_out, actv=nn.LeakyReLU(0.2),
86
- normalize=False, downsample='none'):
87
- super().__init__()
88
- self.actv = actv
89
- self.normalize = normalize
90
- self.downsample = DownSample(downsample)
91
- self.downsample_res = LearnedDownSample(downsample, dim_in)
92
- self.learned_sc = dim_in != dim_out
93
- self._build_weights(dim_in, dim_out)
94
-
95
- def _build_weights(self, dim_in, dim_out):
96
- self.conv1 = spectral_norm(nn.Conv2d(dim_in, dim_in, 3, 1, 1))
97
- self.conv2 = spectral_norm(nn.Conv2d(dim_in, dim_out, 3, 1, 1))
98
- if self.normalize:
99
- self.norm1 = nn.InstanceNorm2d(dim_in, affine=True)
100
- self.norm2 = nn.InstanceNorm2d(dim_in, affine=True)
101
- if self.learned_sc:
102
- self.conv1x1 = spectral_norm(nn.Conv2d(dim_in, dim_out, 1, 1, 0, bias=False))
103
-
104
- def _shortcut(self, x):
105
- if self.learned_sc:
106
- x = self.conv1x1(x)
107
- if self.downsample:
108
- x = self.downsample(x)
109
- return x
110
-
111
- def _residual(self, x):
112
- if self.normalize:
113
- x = self.norm1(x)
114
- x = self.actv(x)
115
- x = self.conv1(x)
116
- x = self.downsample_res(x)
117
- if self.normalize:
118
- x = self.norm2(x)
119
- x = self.actv(x)
120
- x = self.conv2(x)
121
- return x
122
-
123
- def forward(self, x):
124
- x = self._shortcut(x) + self._residual(x)
125
- return x / np.sqrt(2) # unit variance
126
-
127
  class LinearNorm(torch.nn.Module):
128
  def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
129
  super(LinearNorm, self).__init__()
@@ -136,98 +23,6 @@ class LinearNorm(torch.nn.Module):
136
  def forward(self, x):
137
  return self.linear_layer(x)
138
 
139
- class Discriminator2d(nn.Module):
140
- def __init__(self, dim_in=48, num_domains=1, max_conv_dim=384, repeat_num=4):
141
- super().__init__()
142
- blocks = []
143
- blocks += [spectral_norm(nn.Conv2d(1, dim_in, 3, 1, 1))]
144
-
145
- for lid in range(repeat_num):
146
- dim_out = min(dim_in*2, max_conv_dim)
147
- blocks += [ResBlk(dim_in, dim_out, downsample='half')]
148
- dim_in = dim_out
149
-
150
- blocks += [nn.LeakyReLU(0.2)]
151
- blocks += [spectral_norm(nn.Conv2d(dim_out, dim_out, 5, 1, 0))]
152
- blocks += [nn.LeakyReLU(0.2)]
153
- blocks += [nn.AdaptiveAvgPool2d(1)]
154
- blocks += [spectral_norm(nn.Conv2d(dim_out, num_domains, 1, 1, 0))]
155
- self.main = nn.Sequential(*blocks)
156
-
157
- def get_feature(self, x):
158
- features = []
159
- for l in self.main:
160
- x = l(x)
161
- features.append(x)
162
- out = features[-1]
163
- out = out.view(out.size(0), -1) # (batch, num_domains)
164
- return out, features
165
-
166
- def forward(self, x):
167
- out, features = self.get_feature(x)
168
- out = out.squeeze() # (batch)
169
- return out, features
170
-
171
- class ResBlk1d(nn.Module):
172
- def __init__(self, dim_in, dim_out, actv=nn.LeakyReLU(0.2),
173
- normalize=False, downsample='none', dropout_p=0.2):
174
- super().__init__()
175
- self.actv = actv
176
- self.normalize = normalize
177
- self.downsample_type = downsample
178
- self.learned_sc = dim_in != dim_out
179
- self._build_weights(dim_in, dim_out)
180
- self.dropout_p = dropout_p
181
-
182
- if self.downsample_type == 'none':
183
- self.pool = nn.Identity()
184
- else:
185
- self.pool = weight_norm(nn.Conv1d(dim_in, dim_in, kernel_size=3, stride=2, groups=dim_in, padding=1))
186
-
187
- def _build_weights(self, dim_in, dim_out):
188
- self.conv1 = weight_norm(nn.Conv1d(dim_in, dim_in, 3, 1, 1))
189
- self.conv2 = weight_norm(nn.Conv1d(dim_in, dim_out, 3, 1, 1))
190
- if self.normalize:
191
- self.norm1 = nn.InstanceNorm1d(dim_in, affine=True)
192
- self.norm2 = nn.InstanceNorm1d(dim_in, affine=True)
193
- if self.learned_sc:
194
- self.conv1x1 = weight_norm(nn.Conv1d(dim_in, dim_out, 1, 1, 0, bias=False))
195
-
196
- def downsample(self, x):
197
- if self.downsample_type == 'none':
198
- return x
199
- else:
200
- if x.shape[-1] % 2 != 0:
201
- x = torch.cat([x, x[..., -1].unsqueeze(-1)], dim=-1)
202
- return F.avg_pool1d(x, 2)
203
-
204
- def _shortcut(self, x):
205
- if self.learned_sc:
206
- x = self.conv1x1(x)
207
- x = self.downsample(x)
208
- return x
209
-
210
- def _residual(self, x):
211
- if self.normalize:
212
- x = self.norm1(x)
213
- x = self.actv(x)
214
- x = F.dropout(x, p=self.dropout_p, training=self.training)
215
-
216
- x = self.conv1(x)
217
- x = self.pool(x)
218
- if self.normalize:
219
- x = self.norm2(x)
220
-
221
- x = self.actv(x)
222
- x = F.dropout(x, p=self.dropout_p, training=self.training)
223
-
224
- x = self.conv2(x)
225
- return x
226
-
227
- def forward(self, x):
228
- x = self._shortcut(x) + self._residual(x)
229
- return x / np.sqrt(2) # unit variance
230
-
231
  class LayerNorm(nn.Module):
232
  def __init__(self, channels, eps=1e-5):
233
  super().__init__()
@@ -312,19 +107,6 @@ class TextEncoder(nn.Module):
312
  return mask
313
 
314
 
315
-
316
- class AdaIN1d(nn.Module):
317
- def __init__(self, style_dim, num_features):
318
- super().__init__()
319
- self.norm = nn.InstanceNorm1d(num_features, affine=False)
320
- self.fc = nn.Linear(style_dim, num_features*2)
321
-
322
- def forward(self, x, s):
323
- h = self.fc(s)
324
- h = h.view(h.size(0), h.size(1), 1)
325
- gamma, beta = torch.chunk(h, chunks=2, dim=1)
326
- return (1 + gamma) * self.norm(x) + beta
327
-
328
  class UpSample1d(nn.Module):
329
  def __init__(self, layer_type):
330
  super().__init__()
@@ -406,6 +188,7 @@ class AdaLayerNorm(nn.Module):
406
 
407
 
408
  class ProsodyPredictor(nn.Module):
 
409
  def __init__(self, style_dim, d_hid, nlayers, max_dur=50, dropout=0.1):
410
  super().__init__()
411
 
@@ -418,7 +201,6 @@ class ProsodyPredictor(nn.Module):
418
  self.duration_proj = LinearNorm(d_hid, max_dur)
419
 
420
  self.shared = nn.LSTM(d_hid + style_dim, d_hid // 2, 1, batch_first=True, bidirectional=True)
421
-
422
  self.F0 = nn.ModuleList()
423
  self.F0.append(AdainResBlk1d(d_hid, d_hid, style_dim, dropout_p=dropout))
424
  self.F0.append(AdainResBlk1d(d_hid, d_hid // 2, style_dim, upsample=True, dropout_p=dropout))
@@ -462,6 +244,7 @@ class ProsodyPredictor(nn.Module):
462
 
463
  return duration.squeeze(-1), en
464
 
 
465
  def F0Ntrain(self, x: torch.Tensor, s: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
466
  x1 = x.transpose(-1, -2)
467
  x2, _temp = self.shared(x1)
@@ -574,6 +357,7 @@ def recursive_munch(d):
574
  else:
575
  return d
576
 
 
577
  def build_model(path: str, device: str):
578
  config = Path(__file__).parent / 'config.json'
579
  assert config.exists(), f'Config path incorrect: config.json not found at {config}'
@@ -587,17 +371,14 @@ def build_model(path: str, device: str):
587
  resblock_dilation_sizes=args.decoder.resblock_dilation_sizes,
588
  upsample_kernel_sizes=args.decoder.upsample_kernel_sizes,
589
  gen_istft_n_fft=args.decoder.gen_istft_n_fft, gen_istft_hop_size=args.decoder.gen_istft_hop_size)
590
-
591
  text_encoder = TextEncoder(channels=args.hidden_dim, kernel_size=5, depth=args.n_layer, n_symbols=args.n_token)
592
  predictor = ProsodyPredictor(style_dim=args.style_dim, d_hid=args.hidden_dim, nlayers=args.n_layer, max_dur=args.max_dur, dropout=args.dropout)
593
  bert = load_plbert()
594
  bert_encoder = nn.Linear(bert.config.hidden_size, args.hidden_dim)
595
-
596
  for parent in [bert, bert_encoder, predictor, decoder, text_encoder]:
597
  for child in parent.children():
598
  if isinstance(child, nn.RNNBase):
599
  child.flatten_parameters()
600
-
601
  model = Munch(
602
  bert=bert.to(device).eval(),
603
  bert_encoder=bert_encoder.to(device).eval(),
@@ -605,7 +386,6 @@ def build_model(path: str, device: str):
605
  decoder=decoder.to(device).eval(),
606
  text_encoder=text_encoder.to(device).eval(),
607
  )
608
-
609
  for key, state_dict in torch.load(path, map_location='cpu', weights_only=True)['net'].items():
610
  assert key in model, key
611
  try:
 
1
  # https://github.com/yl4579/StyleTTS2/blob/main/models.py
2
+ from istftnet import AdaIN1d, Decoder
 
3
  from munch import Munch
4
  from pathlib import Path
5
  from plbert import load_plbert
 
11
  import torch.nn as nn
12
  import torch.nn.functional as F
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  class LinearNorm(torch.nn.Module):
15
  def __init__(self, in_dim, out_dim, bias=True, w_init_gain='linear'):
16
  super(LinearNorm, self).__init__()
 
23
  def forward(self, x):
24
  return self.linear_layer(x)
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  class LayerNorm(nn.Module):
27
  def __init__(self, channels, eps=1e-5):
28
  super().__init__()
 
107
  return mask
108
 
109
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  class UpSample1d(nn.Module):
111
  def __init__(self, layer_type):
112
  super().__init__()
 
188
 
189
 
190
  class ProsodyPredictor(nn.Module):
191
+
192
  def __init__(self, style_dim, d_hid, nlayers, max_dur=50, dropout=0.1):
193
  super().__init__()
194
 
 
201
  self.duration_proj = LinearNorm(d_hid, max_dur)
202
 
203
  self.shared = nn.LSTM(d_hid + style_dim, d_hid // 2, 1, batch_first=True, bidirectional=True)
 
204
  self.F0 = nn.ModuleList()
205
  self.F0.append(AdainResBlk1d(d_hid, d_hid, style_dim, dropout_p=dropout))
206
  self.F0.append(AdainResBlk1d(d_hid, d_hid // 2, style_dim, upsample=True, dropout_p=dropout))
 
244
 
245
  return duration.squeeze(-1), en
246
 
247
+
248
  def F0Ntrain(self, x: torch.Tensor, s: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
249
  x1 = x.transpose(-1, -2)
250
  x2, _temp = self.shared(x1)
 
357
  else:
358
  return d
359
 
360
+
361
  def build_model(path: str, device: str):
362
  config = Path(__file__).parent / 'config.json'
363
  assert config.exists(), f'Config path incorrect: config.json not found at {config}'
 
371
  resblock_dilation_sizes=args.decoder.resblock_dilation_sizes,
372
  upsample_kernel_sizes=args.decoder.upsample_kernel_sizes,
373
  gen_istft_n_fft=args.decoder.gen_istft_n_fft, gen_istft_hop_size=args.decoder.gen_istft_hop_size)
 
374
  text_encoder = TextEncoder(channels=args.hidden_dim, kernel_size=5, depth=args.n_layer, n_symbols=args.n_token)
375
  predictor = ProsodyPredictor(style_dim=args.style_dim, d_hid=args.hidden_dim, nlayers=args.n_layer, max_dur=args.max_dur, dropout=args.dropout)
376
  bert = load_plbert()
377
  bert_encoder = nn.Linear(bert.config.hidden_size, args.hidden_dim)
 
378
  for parent in [bert, bert_encoder, predictor, decoder, text_encoder]:
379
  for child in parent.children():
380
  if isinstance(child, nn.RNNBase):
381
  child.flatten_parameters()
 
382
  model = Munch(
383
  bert=bert.to(device).eval(),
384
  bert_encoder=bert_encoder.to(device).eval(),
 
386
  decoder=decoder.to(device).eval(),
387
  text_encoder=text_encoder.to(device).eval(),
388
  )
 
389
  for key, state_dict in torch.load(path, map_location='cpu', weights_only=True)['net'].items():
390
  assert key in model, key
391
  try: