Spaces:

tobiasc
/

conex

Build error

App Files Files Community

conex / espnet2 /asr /preencoder /sinc.py

tobiasc

Initial commit

ad16788 over 2 years ago

raw

history blame contribute delete

10.3 kB

	#!/usr/bin/env python3
	# 2020, Technische Universität München; Ludwig Kürzinger
	# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)

	"""Sinc convolutions for raw audio input."""

	from collections import OrderedDict
	import math
	from typing import Optional
	from typing import Tuple
	from typing import Union

	import humanfriendly
	import torch
	from typeguard import check_argument_types

	from espnet2.asr.preencoder.abs_preencoder import AbsPreEncoder
	from espnet2.layers.sinc_conv import LogCompression
	from espnet2.layers.sinc_conv import SincConv


	class LightweightSincConvs(AbsPreEncoder):
	    """Lightweight Sinc Convolutions.

	    Instead of using precomputed features, end-to-end speech recognition
	    can also be done directly from raw audio using sinc convolutions, as
	    described in "Lightweight End-to-End Speech Recognition from Raw Audio
	    Data Using Sinc-Convolutions" by Kürzinger et al.
	    https://arxiv.org/abs/2010.07597

	    To use Sinc convolutions in your model instead of the default f-bank
	    frontend, set this module as your pre-encoder with `preencoder: sinc`
	    and use the input of the sliding window frontend with
	    `frontend: sliding_window` in your yaml configuration file.
	    So that the process flow is:

	    Frontend (SlidingWindow) -> SpecAug -> Normalization ->
	    Pre-encoder (LightweightSincConvs) -> Encoder -> Decoder

	    Note that this method also performs data augmentation in time domain
	    (vs. in spectral domain in the default frontend).
	    Use `plot_sinc_filters.py` to visualize the learned Sinc filters.
	    """

	    def __init__(
	        self,
	        fs: Union[int, str, float] = 16000,
	        in_channels: int = 1,
	        out_channels: int = 256,
	        activation_type: str = "leakyrelu",
	        dropout_type: str = "dropout",
	        windowing_type: str = "hamming",
	        scale_type: str = "mel",
	    ):
	        """Initialize the module.

	        Args:
	            fs: Sample rate. A string value is converted to a number with
	                `humanfriendly.parse_size`.
	            in_channels: Number of input channels.
	            out_channels: Number of output channels (for each input channel).
	            activation_type: Choice of activation function
	                ("leakyrelu" or "relu").
	            dropout_type: Choice of dropout function
	                ("dropout", "spatial" or "dropout2d").
	            windowing_type: Choice of windowing function.
	            scale_type: Choice of filter-bank initialization scale.

	        Raises:
	            NotImplementedError: If `dropout_type` or `activation_type` is
	                not one of the supported choices.
	        """
	        assert check_argument_types()
	        super().__init__()
	        if isinstance(fs, str):
	            fs = humanfriendly.parse_size(fs)
	        self.fs = fs
	        self.in_channels = in_channels
	        self.out_channels = out_channels
	        self.activation_type = activation_type
	        self.dropout_type = dropout_type
	        self.windowing_type = windowing_type
	        self.scale_type = scale_type

	        # Lookup tables from configuration string to layer class.
	        self.choices_dropout = {
	            "dropout": torch.nn.Dropout,
	            "spatial": SpatialDropout,
	            "dropout2d": torch.nn.Dropout2d,
	        }
	        if dropout_type not in self.choices_dropout:
	            raise NotImplementedError(
	                f"Dropout type has to be one of "
	                f"{list(self.choices_dropout.keys())}",
	            )

	        self.choices_activation = {
	            "leakyrelu": torch.nn.LeakyReLU,
	            "relu": torch.nn.ReLU,
	        }
	        if activation_type not in self.choices_activation:
	            raise NotImplementedError(
	                f"Activation type has to be one of "
	                f"{list(self.choices_activation.keys())}",
	            )

	        # initialization
	        self._create_sinc_convs()
	        # Sinc filters require custom initialization
	        self.espnet_initialization_fn()

	    def _create_sinc_convs(self):
	        """Build the sinc filter layer and the convolutional blocks.

	        Stores the sinc layer as `self.filters` and the full stack
	        (SincConvBlock followed by DConvBlock1..5) as `self.blocks`.
	        """
	        blocks = OrderedDict()

	        # SincConvBlock
	        out_channels = 128
	        self.filters = SincConv(
	            self.in_channels,
	            out_channels,
	            kernel_size=101,
	            stride=1,
	            fs=self.fs,
	            window_func=self.windowing_type,
	            scale_type=self.scale_type,
	        )
	        block = OrderedDict(
	            [
	                ("Filters", self.filters),
	                ("LogCompression", LogCompression()),
	                ("BatchNorm", torch.nn.BatchNorm1d(out_channels, affine=True)),
	                ("AvgPool", torch.nn.AvgPool1d(2)),
	            ]
	        )
	        blocks["SincConvBlock"] = torch.nn.Sequential(block)
	        in_channels = out_channels

	        # First convolutional block, connects the sinc output to the
	        # front-end "body"
	        out_channels = 128
	        blocks["DConvBlock1"] = self.gen_lsc_block(
	            in_channels,
	            out_channels,
	            depthwise_kernel_size=25,
	            depthwise_stride=2,
	            pointwise_groups=0,
	            avgpool=True,
	            dropout_probability=0.1,
	        )
	        in_channels = out_channels

	        # Second convolutional block, multiple convolutional layers
	        out_channels = self.out_channels
	        for layer in [2, 3, 4]:
	            blocks[f"DConvBlock{layer}"] = self.gen_lsc_block(
	                in_channels, out_channels, depthwise_kernel_size=9, depthwise_stride=1
	            )
	            in_channels = out_channels

	        # Third Convolutional block, acts as coupling to encoder
	        out_channels = self.out_channels
	        blocks["DConvBlock5"] = self.gen_lsc_block(
	            in_channels,
	            out_channels,
	            depthwise_kernel_size=7,
	            depthwise_stride=1,
	            pointwise_groups=0,
	        )

	        self.blocks = torch.nn.Sequential(blocks)

	    def gen_lsc_block(
	        self,
	        in_channels: int,
	        out_channels: int,
	        depthwise_kernel_size: int = 9,
	        depthwise_stride: int = 1,
	        depthwise_groups: Optional[int] = None,
	        pointwise_groups: int = 0,
	        dropout_probability: float = 0.15,
	        avgpool: bool = False,
	    ) -> torch.nn.Sequential:
	        """Generate a convolutional block for Lightweight Sinc convolutions.

	        Each block consists of a depthwise convolution, an optional pointwise
	        convolution (making it depthwise-separable), an activation, a
	        batch-normalization layer, an optional average-pooling layer and
	        dropout, in that order.

	        Args:
	            in_channels: Number of input channels.
	            out_channels: Number of output channels.
	            depthwise_kernel_size: Kernel size of the depthwise convolution.
	            depthwise_stride: Stride of the depthwise convolution.
	            depthwise_groups: Number of groups of the depthwise convolution.
	                If not given (None or 0), the greatest common divisor of
	                `in_channels` and `out_channels` is used.
	            pointwise_groups: Number of groups of the pointwise convolution.
	                If 0, no pointwise convolution is added.
	            dropout_probability: Dropout probability in the block.
	            avgpool: If True, an AvgPool layer is inserted.

	        Returns:
	            torch.nn.Sequential: Neural network building block.
	        """
	        block = OrderedDict()
	        if not depthwise_groups:
	            # The group count must divide both channel counts, otherwise
	            # Conv1d rejects it; the GCD is the largest such value.
	            depthwise_groups = math.gcd(in_channels, out_channels)
	        block["depthwise"] = torch.nn.Conv1d(
	            in_channels,
	            out_channels,
	            depthwise_kernel_size,
	            depthwise_stride,
	            groups=depthwise_groups,
	        )
	        if pointwise_groups:
	            block["pointwise"] = torch.nn.Conv1d(
	                out_channels, out_channels, 1, 1, groups=pointwise_groups
	            )
	        block["activation"] = self.choices_activation[self.activation_type]()
	        block["batchnorm"] = torch.nn.BatchNorm1d(out_channels, affine=True)
	        if avgpool:
	            block["avgpool"] = torch.nn.AvgPool1d(2)
	        dropout_class = self.choices_dropout[self.dropout_type]
	        block["dropout"] = dropout_class(dropout_probability)
	        return torch.nn.Sequential(block)

	    def espnet_initialization_fn(self):
	        """Initialize sinc filters with filterbank values.

	        Additionally resets every affine BatchNorm1d layer found in
	        `self.blocks` to weight 1 and bias 0.
	        """
	        self.filters.init_filters()
	        for block in self.blocks:
	            for layer in block:
	                if isinstance(layer, torch.nn.BatchNorm1d) and layer.affine:
	                    layer.weight.data[:] = 1.0
	                    layer.bias.data[:] = 0.0

	    def forward(
	        self, input: torch.Tensor, input_lengths: torch.Tensor
	    ) -> Tuple[torch.Tensor, torch.Tensor]:
	        """Apply Lightweight Sinc Convolutions.

	        The input shall be formatted as (B, T, C_in, D_in)
	        with B as batch size, T as time dimension, C_in as channels,
	        and D_in as feature dimension.

	        The output will then be (B, T, C_out*D_out)
	        with C_out and D_out as output dimensions.

	        The current module structure only handles D_in=400, so that D_out=1.
	        Remark for the multichannel case: C_out is the number of out_channels
	        given at initialization multiplied with C_in.

	        Args:
	            input: Framed input of shape (B, T, C_in, D_in).
	            input_lengths: Lengths belonging to `input`; passed through
	                unchanged.

	        Returns:
	            Tuple of the output tensor of shape (B, T, C_out*D_out) and the
	            unmodified `input_lengths`.
	        """
	        # Transform input data:
	        #   (B, T, C_in, D_in) -> (B*T, C_in, D_in)
	        # reshape (rather than view) also accepts non-contiguous inputs.
	        B, T, C_in, D_in = input.size()
	        input_frames = input.reshape(B * T, C_in, D_in)
	        # Call the module itself instead of .forward() so that registered
	        # hooks are executed.
	        output_frames = self.blocks(input_frames)

	        # ---TRANSFORM: (B*T, C_out, D_out) -> (B, T, C_out*D_out)
	        _, C_out, D_out = output_frames.size()
	        output_frames = output_frames.reshape(B, T, C_out * D_out)
	        return output_frames, input_lengths  # no state in this layer

	    def output_size(self) -> int:
	        """Get the output size (out_channels multiplied by in_channels)."""
	        return self.out_channels * self.in_channels


	class SpatialDropout(torch.nn.Module):
	    """Spatial dropout module.

	    Apply dropout to full channels on tensors of input (B, C, D).

	    The input is permuted with the configured dimension order, passed
	    through `torch.nn.Dropout2d`, and permuted back to its original
	    dimension order.
	    """

	    def __init__(
	        self,
	        dropout_probability: float = 0.15,
	        shape: Optional[Union[tuple, list]] = None,
	    ):
	        """Initialize.

	        Args:
	            dropout_probability: Dropout probability.
	            shape (tuple, list): Dimension order handed to `permute` before
	                dropout is applied. Defaults to (0, 2, 1).
	        """
	        assert check_argument_types()
	        super().__init__()
	        if shape is None:
	            shape = (0, 2, 1)
	        self.dropout = torch.nn.Dropout2d(dropout_probability)
	        # Stored wrapped in a 1-tuple; forward() unpacks it again.
	        self.shape = (shape,)

	    def forward(self, x: torch.Tensor) -> torch.Tensor:
	        """Forward of spatial dropout module.

	        Args:
	            x: Input tensor whose number of dimensions matches the
	                configured dimension order.

	        Returns:
	            Tensor with the same dimension order as `x`.
	        """
	        order = self.shape[0]
	        rank = len(order)
	        # Applying `order` a second time only restores the layout when the
	        # permutation is its own inverse (true for the default (0, 2, 1)).
	        # Use the real inverse so any dimension order round-trips.
	        # `% rank` maps negative dimension indices to their positive form.
	        inverse = sorted(range(rank), key=lambda i: order[i] % rank)
	        y = x.permute(*order)
	        y = self.dropout(y)
	        return y.permute(*inverse)