Staticaliza committed
Commit f70c122 · verified · 1 parent: 2c7f7e6

Upload 3 files

modules/campplus/DTDNN.py ADDED
@@ -0,0 +1,115 @@
+# Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+
+from collections import OrderedDict
+
+import torch
+from torch import nn
+import torch.nn.functional as F
+
+from modules.campplus.layers import DenseLayer, StatsPool, TDNNLayer, CAMDenseTDNNBlock, TransitLayer, BasicResBlock, get_nonlinear
+
+
+class FCM(nn.Module):
+    def __init__(self,
+                 block=BasicResBlock,
+                 num_blocks=[2, 2],
+                 m_channels=32,
+                 feat_dim=80):
+        super(FCM, self).__init__()
+        self.in_planes = m_channels
+        self.conv1 = nn.Conv2d(1, m_channels, kernel_size=3, stride=1, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(m_channels)
+
+        self.layer1 = self._make_layer(block, m_channels, num_blocks[0], stride=2)
+        self.layer2 = self._make_layer(block, m_channels, num_blocks[1], stride=2)
+
+        self.conv2 = nn.Conv2d(m_channels, m_channels, kernel_size=3, stride=(2, 1), padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(m_channels)
+        self.out_channels = m_channels * (feat_dim // 8)
+
+    def _make_layer(self, block, planes, num_blocks, stride):
+        strides = [stride] + [1] * (num_blocks - 1)
+        layers = []
+        for stride in strides:
+            layers.append(block(self.in_planes, planes, stride))
+            self.in_planes = planes * block.expansion
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        x = x.unsqueeze(1)
+        out = F.relu(self.bn1(self.conv1(x)))
+        out = self.layer1(out)
+        out = self.layer2(out)
+        out = F.relu(self.bn2(self.conv2(out)))
+
+        shape = out.shape
+        out = out.reshape(shape[0], shape[1]*shape[2], shape[3])
+        return out
+
+class CAMPPlus(nn.Module):
+    def __init__(self,
+                 feat_dim=80,
+                 embedding_size=512,
+                 growth_rate=32,
+                 bn_size=4,
+                 init_channels=128,
+                 config_str='batchnorm-relu',
+                 memory_efficient=True):
+        super(CAMPPlus, self).__init__()
+
+        self.head = FCM(feat_dim=feat_dim)
+        channels = self.head.out_channels
+
+        self.xvector = nn.Sequential(
+            OrderedDict([
+
+                ('tdnn',
+                 TDNNLayer(channels,
+                           init_channels,
+                           5,
+                           stride=2,
+                           dilation=1,
+                           padding=-1,
+                           config_str=config_str)),
+            ]))
+        channels = init_channels
+        for i, (num_layers, kernel_size,
+                dilation) in enumerate(zip((12, 24, 16), (3, 3, 3), (1, 2, 2))):
+            block = CAMDenseTDNNBlock(num_layers=num_layers,
+                                      in_channels=channels,
+                                      out_channels=growth_rate,
+                                      bn_channels=bn_size * growth_rate,
+                                      kernel_size=kernel_size,
+                                      dilation=dilation,
+                                      config_str=config_str,
+                                      memory_efficient=memory_efficient)
+            self.xvector.add_module('block%d' % (i + 1), block)
+            channels = channels + num_layers * growth_rate
+            self.xvector.add_module(
+                'transit%d' % (i + 1),
+                TransitLayer(channels,
+                             channels // 2,
+                             bias=False,
+                             config_str=config_str))
+            channels //= 2
+
+        self.xvector.add_module(
+            'out_nonlinear', get_nonlinear(config_str, channels))
+
+        self.xvector.add_module('stats', StatsPool())
+        self.xvector.add_module(
+            'dense',
+            DenseLayer(channels * 2, embedding_size, config_str='batchnorm_'))
+
+        for m in self.modules():
+            if isinstance(m, (nn.Conv1d, nn.Linear)):
+                nn.init.kaiming_normal_(m.weight.data)
+                if m.bias is not None:
+                    nn.init.zeros_(m.bias)
+
+    def forward(self, x):
+        x = x.permute(0, 2, 1)  # (B,T,F) => (B,F,T)
+        x = self.head(x)
+        x = self.xvector(x)
+        return x
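For reference, a minimal smoke-test sketch for this model, assuming the input is a batch of 80-dimensional fbank features shaped (batch, frames, feat_dim), as implied by the (B,T,F) => (B,F,T) permute in CAMPPlus.forward; the random tensor and shapes below are placeholders, not part of this commit.

# Sketch only: exercise CAMPPlus on dummy fbank features.
import torch
from modules.campplus.DTDNN import CAMPPlus

model = CAMPPlus(feat_dim=80, embedding_size=512).eval()
fbank = torch.randn(2, 200, 80)   # 2 utterances, 200 frames, 80 mel bins (placeholder data)
with torch.no_grad():
    emb = model(fbank)
print(emb.shape)                  # expected: torch.Size([2, 512])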
modules/campplus/classifier.py ADDED
@@ -0,0 +1,70 @@
+# Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from modules.campplus.layers import DenseLayer
+
+
+class CosineClassifier(nn.Module):
+    def __init__(
+        self,
+        input_dim,
+        num_blocks=0,
+        inter_dim=512,
+        out_neurons=1000,
+    ):
+
+        super().__init__()
+        self.blocks = nn.ModuleList()
+
+        for index in range(num_blocks):
+            self.blocks.append(
+                DenseLayer(input_dim, inter_dim, config_str='batchnorm')
+            )
+            input_dim = inter_dim
+
+        self.weight = nn.Parameter(
+            torch.FloatTensor(out_neurons, input_dim)
+        )
+        nn.init.xavier_uniform_(self.weight)
+
+    def forward(self, x):
+        # x: [B, dim]
+        for layer in self.blocks:
+            x = layer(x)
+
+        # normalized
+        x = F.linear(F.normalize(x), F.normalize(self.weight))
+        return x
+
+class LinearClassifier(nn.Module):
+    def __init__(
+        self,
+        input_dim,
+        num_blocks=0,
+        inter_dim=512,
+        out_neurons=1000,
+    ):
+
+        super().__init__()
+        self.blocks = nn.ModuleList()
+
+        self.nonlinear = nn.ReLU(inplace=True)
+        for index in range(num_blocks):
+            self.blocks.append(
+                DenseLayer(input_dim, inter_dim, bias=True)
+            )
+            input_dim = inter_dim
+
+        self.linear = nn.Linear(input_dim, out_neurons, bias=True)
+
+    def forward(self, x):
+        # x: [B, dim]
+        x = self.nonlinear(x)
+        for layer in self.blocks:
+            x = layer(x)
+        x = self.linear(x)
+        return x
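A small usage sketch for the cosine head, assuming 512-dimensional speaker embeddings and 1000 training speakers (both placeholder values); the scaled cross-entropy below is just one common way to train on cosine logits, not necessarily the recipe used with these files.

# Sketch only: CosineClassifier on top of speaker embeddings (placeholder shapes).
import torch
import torch.nn.functional as F
from modules.campplus.classifier import CosineClassifier

clf = CosineClassifier(input_dim=512, out_neurons=1000)
emb = torch.randn(8, 512)                      # stand-in for CAMPPlus embeddings
labels = torch.randint(0, 1000, (8,))

logits = clf(emb)                              # cosine similarities in [-1, 1], shape (8, 1000)
loss = F.cross_entropy(logits * 32.0, labels)  # scale factor is a typical choice, not from this repo
loss.backward()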
modules/campplus/layers.py ADDED
@@ -0,0 +1,253 @@
+# Copyright 3D-Speaker (https://github.com/alibaba-damo-academy/3D-Speaker). All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint as cp
+from torch import nn
+
+
+def get_nonlinear(config_str, channels):
+    nonlinear = nn.Sequential()
+    for name in config_str.split('-'):
+        if name == 'relu':
+            nonlinear.add_module('relu', nn.ReLU(inplace=True))
+        elif name == 'prelu':
+            nonlinear.add_module('prelu', nn.PReLU(channels))
+        elif name == 'batchnorm':
+            nonlinear.add_module('batchnorm', nn.BatchNorm1d(channels))
+        elif name == 'batchnorm_':
+            nonlinear.add_module('batchnorm',
+                                 nn.BatchNorm1d(channels, affine=False))
+        else:
+            raise ValueError('Unexpected module ({}).'.format(name))
+    return nonlinear
+
+def statistics_pooling(x, dim=-1, keepdim=False, unbiased=True, eps=1e-2):
+    mean = x.mean(dim=dim)
+    std = x.std(dim=dim, unbiased=unbiased)
+    stats = torch.cat([mean, std], dim=-1)
+    if keepdim:
+        stats = stats.unsqueeze(dim=dim)
+    return stats
+
+
+class StatsPool(nn.Module):
+    def forward(self, x):
+        return statistics_pooling(x)
+
+
+class TDNNLayer(nn.Module):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 kernel_size,
+                 stride=1,
+                 padding=0,
+                 dilation=1,
+                 bias=False,
+                 config_str='batchnorm-relu'):
+        super(TDNNLayer, self).__init__()
+        if padding < 0:
+            assert kernel_size % 2 == 1, 'Expect equal paddings, but got even kernel size ({})'.format(
+                kernel_size)
+            padding = (kernel_size - 1) // 2 * dilation
+        self.linear = nn.Conv1d(in_channels,
+                                out_channels,
+                                kernel_size,
+                                stride=stride,
+                                padding=padding,
+                                dilation=dilation,
+                                bias=bias)
+        self.nonlinear = get_nonlinear(config_str, out_channels)
+
+    def forward(self, x):
+        x = self.linear(x)
+        x = self.nonlinear(x)
+        return x
+
+
+class CAMLayer(nn.Module):
+    def __init__(self,
+                 bn_channels,
+                 out_channels,
+                 kernel_size,
+                 stride,
+                 padding,
+                 dilation,
+                 bias,
+                 reduction=2):
+        super(CAMLayer, self).__init__()
+        self.linear_local = nn.Conv1d(bn_channels,
+                                      out_channels,
+                                      kernel_size,
+                                      stride=stride,
+                                      padding=padding,
+                                      dilation=dilation,
+                                      bias=bias)
+        self.linear1 = nn.Conv1d(bn_channels, bn_channels // reduction, 1)
+        self.relu = nn.ReLU(inplace=True)
+        self.linear2 = nn.Conv1d(bn_channels // reduction, out_channels, 1)
+        self.sigmoid = nn.Sigmoid()
+
+    def forward(self, x):
+        y = self.linear_local(x)
+        context = x.mean(-1, keepdim=True)+self.seg_pooling(x)
+        context = self.relu(self.linear1(context))
+        m = self.sigmoid(self.linear2(context))
+        return y*m
+
+    def seg_pooling(self, x, seg_len=100, stype='avg'):
+        if stype == 'avg':
+            seg = F.avg_pool1d(x, kernel_size=seg_len, stride=seg_len, ceil_mode=True)
+        elif stype == 'max':
+            seg = F.max_pool1d(x, kernel_size=seg_len, stride=seg_len, ceil_mode=True)
+        else:
+            raise ValueError('Wrong segment pooling type.')
+        shape = seg.shape
+        seg = seg.unsqueeze(-1).expand(*shape, seg_len).reshape(*shape[:-1], -1)
+        seg = seg[..., :x.shape[-1]]
+        return seg
+
+
+class CAMDenseTDNNLayer(nn.Module):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 bn_channels,
+                 kernel_size,
+                 stride=1,
+                 dilation=1,
+                 bias=False,
+                 config_str='batchnorm-relu',
+                 memory_efficient=False):
+        super(CAMDenseTDNNLayer, self).__init__()
+        assert kernel_size % 2 == 1, 'Expect equal paddings, but got even kernel size ({})'.format(
+            kernel_size)
+        padding = (kernel_size - 1) // 2 * dilation
+        self.memory_efficient = memory_efficient
+        self.nonlinear1 = get_nonlinear(config_str, in_channels)
+        self.linear1 = nn.Conv1d(in_channels, bn_channels, 1, bias=False)
+        self.nonlinear2 = get_nonlinear(config_str, bn_channels)
+        self.cam_layer = CAMLayer(bn_channels,
+                                  out_channels,
+                                  kernel_size,
+                                  stride=stride,
+                                  padding=padding,
+                                  dilation=dilation,
+                                  bias=bias)
+
+    def bn_function(self, x):
+        return self.linear1(self.nonlinear1(x))
+
+    def forward(self, x):
+        if self.training and self.memory_efficient:
+            x = cp.checkpoint(self.bn_function, x)
+        else:
+            x = self.bn_function(x)
+        x = self.cam_layer(self.nonlinear2(x))
+        return x
+
+
+class CAMDenseTDNNBlock(nn.ModuleList):
+    def __init__(self,
+                 num_layers,
+                 in_channels,
+                 out_channels,
+                 bn_channels,
+                 kernel_size,
+                 stride=1,
+                 dilation=1,
+                 bias=False,
+                 config_str='batchnorm-relu',
+                 memory_efficient=False):
+        super(CAMDenseTDNNBlock, self).__init__()
+        for i in range(num_layers):
+            layer = CAMDenseTDNNLayer(in_channels=in_channels + i * out_channels,
+                                      out_channels=out_channels,
+                                      bn_channels=bn_channels,
+                                      kernel_size=kernel_size,
+                                      stride=stride,
+                                      dilation=dilation,
+                                      bias=bias,
+                                      config_str=config_str,
+                                      memory_efficient=memory_efficient)
+            self.add_module('tdnnd%d' % (i + 1), layer)
+
+    def forward(self, x):
+        for layer in self:
+            x = torch.cat([x, layer(x)], dim=1)
+        return x
+
+
+class TransitLayer(nn.Module):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 bias=True,
+                 config_str='batchnorm-relu'):
+        super(TransitLayer, self).__init__()
+        self.nonlinear = get_nonlinear(config_str, in_channels)
+        self.linear = nn.Conv1d(in_channels, out_channels, 1, bias=bias)
+
+    def forward(self, x):
+        x = self.nonlinear(x)
+        x = self.linear(x)
+        return x
+
+
+class DenseLayer(nn.Module):
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 bias=False,
+                 config_str='batchnorm-relu'):
+        super(DenseLayer, self).__init__()
+        self.linear = nn.Conv1d(in_channels, out_channels, 1, bias=bias)
+        self.nonlinear = get_nonlinear(config_str, out_channels)
+
+    def forward(self, x):
+        if len(x.shape) == 2:
+            x = self.linear(x.unsqueeze(dim=-1)).squeeze(dim=-1)
+        else:
+            x = self.linear(x)
+        x = self.nonlinear(x)
+        return x
+
+
+class BasicResBlock(nn.Module):
+    expansion = 1
+
+    def __init__(self, in_planes, planes, stride=1):
+        super(BasicResBlock, self).__init__()
+        self.conv1 = nn.Conv2d(in_planes,
+                               planes,
+                               kernel_size=3,
+                               stride=(stride, 1),
+                               padding=1,
+                               bias=False)
+        self.bn1 = nn.BatchNorm2d(planes)
+        self.conv2 = nn.Conv2d(planes,
+                               planes,
+                               kernel_size=3,
+                               stride=1,
+                               padding=1,
+                               bias=False)
+        self.bn2 = nn.BatchNorm2d(planes)
+
+        self.shortcut = nn.Sequential()
+        if stride != 1 or in_planes != self.expansion * planes:
+            self.shortcut = nn.Sequential(
+                nn.Conv2d(in_planes,
+                          self.expansion * planes,
+                          kernel_size=1,
+                          stride=(stride, 1),
+                          bias=False),
+                nn.BatchNorm2d(self.expansion * planes))
+
+    def forward(self, x):
+        out = F.relu(self.bn1(self.conv1(x)))
+        out = self.bn2(self.conv2(out))
+        out += self.shortcut(x)
+        out = F.relu(out)
+        return out
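To illustrate how these building blocks compose, a short sketch with arbitrary shapes: a CAMDenseTDNNBlock grows the channel dimension by num_layers * out_channels through dense concatenation, and StatsPool doubles the channel dimension by concatenating the mean and standard deviation over time.

# Sketch only: dense-block growth and statistics pooling (shapes are illustrative).
import torch
from modules.campplus.layers import CAMDenseTDNNBlock, StatsPool

x = torch.randn(2, 128, 200)   # (batch, channels, frames), placeholder data
block = CAMDenseTDNNBlock(num_layers=4, in_channels=128, out_channels=32,
                          bn_channels=128, kernel_size=3, dilation=1)
y = block(x)
print(y.shape)                 # (2, 128 + 4 * 32, 200) = (2, 256, 200)

pooled = StatsPool()(y)
print(pooled.shape)            # (2, 512): mean and std concatenated over the time axis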