AudioToken

Sleeping

guyyariv

AudioTokenDemo

1b92e8f over 1 year ago

12.6 kB

	#!/usr/bin/env python
	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	from torch.autograd import Variable
	from itertools import product, permutations, combinations_with_replacement, chain


	class Unary(nn.Module):
	    """Unary factor: scores every entity of one utility from its own embedding."""

	    def __init__(self, embed_size):
	        """
	        Captures local entity information
	        :param embed_size: the embedding dimension
	        """
	        super(Unary, self).__init__()
	        # Kernel-size-1 convolutions act as per-entity linear layers.
	        self.embed = nn.Conv1d(embed_size, embed_size, 1)
	        self.feature_reduce = nn.Conv1d(embed_size, 1, 1)

	    def forward(self, X):
	        """
	        Compute one scalar potential per entity.
	        :param X: tensor laid out as (batch, entities, embed_size); the last
	                  two axes are swapped so the embedding becomes the Conv1d
	                  channel axis
	        :return: tensor of shape (batch, entities)
	        """
	        X = X.transpose(1, 2)

	        X_embed = self.embed(X)

	        # F.dropout defaults to training=True, which would keep dropping
	        # activations in eval mode; tie it to the module's own mode instead.
	        X_nl_embed = F.dropout(F.relu(X_embed), training=self.training)
	        X_poten = self.feature_reduce(X_nl_embed)
	        # Drop the singleton channel axis produced by feature_reduce.
	        return X_poten.squeeze(1)


	class Pairwise(nn.Module):
	    """Pairwise factor: scores entities through cosine similarity with another
	    utility's entities (or with the same utility's entities when no second
	    utility is supplied)."""

	    def __init__(self, embed_x_size, x_spatial_dim=None, embed_y_size=None, y_spatial_dim=None):
	        """
	        Captures interaction between utilities or entities of the same utility
	        :param embed_x_size: the embedding dimension of the first utility
	        :param x_spatial_dim: the spatial dimension of the first utility for batch norm and weighted marginalization
	        :param embed_y_size: the embedding dimension of the second utility (none for self-interactions)
	        :param y_spatial_dim: the spatial dimension of the second utility for batch norm and weighted marginalization
	        """

	        super(Pairwise, self).__init__()
	        # Fall back to the first utility's values only for the argument that
	        # is actually missing. (The fallback used to be keyed on
	        # y_spatial_dim, which discarded an explicit embed_y_size whenever no
	        # spatial sizes were given.)
	        embed_y_size = embed_y_size if embed_y_size is not None else embed_x_size
	        self.y_spatial_dim = y_spatial_dim if y_spatial_dim is not None else x_spatial_dim

	        # Both utilities are projected into a common space of the larger size.
	        self.embed_size = max(embed_x_size, embed_y_size)
	        self.x_spatial_dim = x_spatial_dim

	        self.embed_X = nn.Conv1d(embed_x_size, self.embed_size, 1)
	        self.embed_Y = nn.Conv1d(embed_y_size, self.embed_size, 1)
	        if x_spatial_dim is not None:
	            # With known spatial sizes the similarity map is batch-normalised
	            # and marginalised with learned weights instead of a plain mean.
	            self.normalize_S = nn.BatchNorm1d(self.x_spatial_dim * self.y_spatial_dim)

	            self.margin_X = nn.Conv1d(self.y_spatial_dim, 1, 1)
	            self.margin_Y = nn.Conv1d(self.x_spatial_dim, 1, 1)

	    def forward(self, X, Y=None):
	        """
	        Compute pairwise potentials.
	        :param X: first utility, laid out as (batch, x_entities, embed_x_size)
	        :param Y: second utility, laid out as (batch, y_entities, embed_y_size);
	                  when None, X interacts with itself
	        :return: X potentials of shape (batch, x_entities) when Y is None,
	                 otherwise the tuple (X potentials, Y potentials) where the
	                 Y potentials have shape (batch, y_entities)
	        """

	        X_t = X.transpose(1, 2)
	        Y_t = Y.transpose(1, 2) if Y is not None else X_t

	        X_embed = self.embed_X(X_t)
	        Y_embed = self.embed_Y(Y_t)

	        # F.normalize works on dim 1 by default, i.e. the embedding axis here,
	        # so the product below is a cosine-similarity map.
	        X_norm = F.normalize(X_embed)
	        Y_norm = F.normalize(Y_embed)

	        # S[b, i, j] = similarity between entity i of X and entity j of Y.
	        S = X_norm.transpose(1, 2).bmm(Y_norm)
	        if self.x_spatial_dim is not None:
	            S = self.normalize_S(S.view(-1, self.x_spatial_dim * self.y_spatial_dim)) \
	                .view(-1, self.x_spatial_dim, self.y_spatial_dim)

	            # Learned weighted marginalisation over the other utility's axis.
	            X_poten = self.margin_X(S.transpose(1, 2)).transpose(1, 2).squeeze(2)
	            Y_poten = self.margin_Y(S).transpose(1, 2).squeeze(2)
	        else:
	            # Unknown spatial sizes: marginalise with a uniform mean.
	            X_poten = S.mean(dim=2, keepdim=False)
	            Y_poten = S.mean(dim=1, keepdim=False)

	        if Y is None:
	            return X_poten
	        else:
	            return X_poten, Y_poten


	class Atten(nn.Module):
	    """Attention over a list of utilities built from unary, self and pairwise
	    factors (plus optional priors) that are fused per utility by a learned
	    bias-free 1x1 convolution and turned into softmax weights."""

	    def __init__(self, util_e, sharing_factor_weights=None, prior_flag=False,
	                 sizes=None, size_force=False, pairwise_flag=True,
	                 unary_flag=True, self_flag=True):
	        """
	        The class performs an attention on a given list of utilities representation.
	        :param util_e: the embedding dimensions
	        :param sharing_factor_weights: To share weights, provide a dict of tuples:
	                {idx: (num_utils, connected utils)}
	                Note, for efficiency, the shared utils (i.e., history) are connected to ans
	                and question only. None or an empty container means no sharing;
	                an iterable of (idx, (num_utils, connected utils)) pairs is
	                accepted as well.
	                TODO: connections between shared utils
	        :param prior_flag: is prior factor provided
	        :param sizes: the spatial dimension (used for batch-norm and weighted marginalization)
	        :param size_force: force spatial size with adaptive avg pooling.
	        :param pairwise_flag: use pairwise interaction between utilities
	        :param unary_flag: use local information
	        :param self_flag: use self interactions between a utility's entities
	        """
	        super(Atten, self).__init__()
	        self.util_e = util_e

	        self.prior_flag = prior_flag

	        self.n_utils = len(util_e)

	        self.spatial_pool = nn.ModuleDict()

	        self.un_models = nn.ModuleList()

	        self.self_flag = self_flag
	        self.pairwise_flag = pairwise_flag
	        self.unary_flag = unary_flag
	        self.size_force = size_force

	        if sizes is None or len(sizes) == 0:
	            sizes = [None for _ in util_e]

	        # Normalise to a dict once so every loop below can rely on .items()
	        # and on membership tests against utility indices.
	        self.sharing_factor_weights = dict(sharing_factor_weights) \
	            if sharing_factor_weights else {}

	        # Unary models; optionally force the provided size
	        for idx, e_dim in enumerate(util_e):
	            self.un_models.append(Unary(e_dim))
	            if self.size_force:
	                self.spatial_pool[str(idx)] = nn.AdaptiveAvgPool1d(sizes[idx])

	        # Pairwise
	        self.pp_models = nn.ModuleDict()
	        for ((idx1, e_dim_1), (idx2, e_dim_2)) \
	                in combinations_with_replacement(enumerate(util_e), 2):
	            # self
	            if self.self_flag and idx1 == idx2:
	                self.pp_models[str(idx1)] = Pairwise(e_dim_1, sizes[idx1])
	            else:
	                if pairwise_flag:
	                    if idx1 in self.sharing_factor_weights:
	                        # not connected
	                        if idx2 not in self.sharing_factor_weights[idx1][1]:
	                            continue
	                    if idx2 in self.sharing_factor_weights:
	                        # not connected
	                        if idx1 not in self.sharing_factor_weights[idx2][1]:
	                            continue
	                    # NOTE: with self_flag=False this also registers an
	                    # (idx, idx) model that forward never uses; it is kept so
	                    # the set of registered parameters stays unchanged.
	                    self.pp_models[str((idx1, idx2))] = Pairwise(e_dim_1, sizes[idx1], e_dim_2, sizes[idx2])

	        # Handle reduce potentials (with scalars)
	        self.reduce_potentials = nn.ModuleList()

	        self.num_of_potentials = dict()

	        self.default_num_of_potentials = 0

	        if self.self_flag:
	            self.default_num_of_potentials += 1
	        if self.unary_flag:
	            self.default_num_of_potentials += 1
	        if self.prior_flag:
	            self.default_num_of_potentials += 1
	        for idx in range(self.n_utils):
	            self.num_of_potentials[idx] = self.default_num_of_potentials

	        # All other utilities
	        if pairwise_flag:
	            for idx, (num_utils, connected_utils) in self.sharing_factor_weights.items():
	                for c_u in connected_utils:
	                    # A connected utility receives one factor per shared copy,
	                    # the shared utility one factor per connection.
	                    self.num_of_potentials[c_u] += num_utils
	                    self.num_of_potentials[idx] += 1
	            for k in self.num_of_potentials:
	                if k not in self.sharing_factor_weights:
	                    self.num_of_potentials[k] += (self.n_utils - 1) \
	                                                 - len(self.sharing_factor_weights)

	        for idx in range(self.n_utils):
	            self.reduce_potentials.append(nn.Conv1d(self.num_of_potentials[idx],
	                                                    1, 1, bias=False))

	    def forward(self, utils, priors=None):
	        """
	        Attend over every utility.
	        :param utils: sequence with one tensor per utility (len == n_utils);
	                non-shared utilities are consumed as (batch, entities, embed).
	                Shared utilities are reshaped from a 4-D tensor when
	                size_force is set
	        :param priors: None when prior_flag is False, otherwise a sequence of
	                n_utils entries, each a tensor or None (None is replaced by
	                a zero prior)
	        :return: list with one attended tensor per utility, each the
	                softmax-weighted sum of that utility's entities
	        """
	        assert self.n_utils == len(utils)
	        assert (priors is None and not self.prior_flag) \
	            or (priors is not None
	                and self.prior_flag
	                and len(priors) == self.n_utils)
	        # Work on shallow copies: pooled tensors are written back by index and
	        # the caller's containers must not be modified.
	        utils = list(utils)
	        if priors is not None:
	            priors = list(priors)

	        b_size = utils[0].size(0)
	        util_factors = dict()
	        attention = list()

	        # Force size, constant size is used for pairwise batch normalization
	        if self.size_force:
	            for i, (num_utils, _) in self.sharing_factor_weights.items():
	                if str(i) not in self.spatial_pool.keys():
	                    continue
	                else:
	                    # Fold the shared copies into the batch axis before pooling.
	                    high_util = utils[i]
	                    high_util = high_util.view(num_utils * b_size, high_util.size(2), high_util.size(3))
	                    high_util = high_util.transpose(1, 2)
	                    utils[i] = self.spatial_pool[str(i)](high_util).transpose(1, 2)

	            for i in range(self.n_utils):
	                if i in self.sharing_factor_weights \
	                        or str(i) not in self.spatial_pool.keys():
	                    continue
	                utils[i] = utils[i].transpose(1, 2)
	                utils[i] = self.spatial_pool[str(i)](utils[i]).transpose(1, 2)
	                if self.prior_flag and priors[i] is not None:
	                    priors[i] = self.spatial_pool[str(i)](priors[i].unsqueeze(1)).squeeze(1)

	        # handle Shared weights
	        for i, (num_utils, connected_list) in self.sharing_factor_weights.items():
	            if self.unary_flag:
	                util_factors.setdefault(i, []).append(self.un_models[i](utils[i]))

	            if self.self_flag:
	                util_factors.setdefault(i, []).append(self.pp_models[str(i)](utils[i]))

	            if self.pairwise_flag:
	                for j in connected_list:
	                    other_util = utils[j]
	                    # Repeat the connected utility once per shared copy so it
	                    # lines up with the folded batch axis of utils[i].
	                    expanded_util = other_util.unsqueeze(1).expand(b_size,
	                                                                   num_utils,
	                                                                   other_util.size(1),
	                                                                   other_util.size(2)).contiguous().view(
	                        b_size * num_utils,
	                        other_util.size(1),
	                        other_util.size(2))

	                    # Pairwise models are keyed by the ordered index pair.
	                    if i < j:
	                        factor_ij, factor_ji = self.pp_models[str((i, j))](utils[i], expanded_util)
	                    else:
	                        factor_ji, factor_ij = self.pp_models[str((j, i))](expanded_util, utils[i])
	                    # setdefault: the list may not exist yet when both the
	                    # unary and the self factors are disabled.
	                    util_factors.setdefault(i, []).append(factor_ij)
	                    util_factors.setdefault(j, []).append(factor_ji.view(b_size, num_utils, factor_ji.size(1)))

	        # handle local factors
	        for i in range(self.n_utils):
	            if i in self.sharing_factor_weights:
	                continue
	            if self.unary_flag:
	                util_factors.setdefault(i, []).append(self.un_models[i](utils[i]))
	            if self.self_flag:
	                util_factors.setdefault(i, []).append(self.pp_models[str(i)](utils[i]))

	        # joint
	        if self.pairwise_flag:
	            for (i, j) in combinations_with_replacement(range(self.n_utils), 2):
	                if i in self.sharing_factor_weights \
	                        or j in self.sharing_factor_weights:
	                    continue
	                if i == j:
	                    continue
	                else:
	                    factor_ij, factor_ji = self.pp_models[str((i, j))](utils[i], utils[j])
	                    util_factors.setdefault(i, []).append(factor_ij)
	                    util_factors.setdefault(j, []).append(factor_ji)

	        # perform attention
	        for i in range(self.n_utils):
	            if self.prior_flag:
	                prior = priors[i]
	                if prior is None:
	                    # The prior counts as a single potential, so take one
	                    # channel of a 3-D (shared) factor as the shape template.
	                    # zeros_like already matches device and dtype; no
	                    # explicit .cuda() so CPU execution works too.
	                    reference = util_factors[i][0]
	                    if reference.dim() == 3:
	                        reference = reference[:, 0]
	                    prior = torch.zeros_like(reference)

	                util_factors[i].append(prior)

	            # Stack all potentials along the channel axis (2-D factors get a
	            # singleton channel), fuse them, and normalise over entities.
	            util_factors[i] = torch.cat([p if len(p.size()) == 3 else p.unsqueeze(1)
	                                         for p in util_factors[i]], dim=1)
	            util_factors[i] = self.reduce_potentials[i](util_factors[i]).squeeze(1)
	            util_factors[i] = F.softmax(util_factors[i], dim=1).unsqueeze(2)
	            attention.append(torch.bmm(utils[i].transpose(1, 2), util_factors[i]).squeeze(2))

	        return attention


	class NaiveAttention(nn.Module):
	    """Parameter-free stand-in for attention, used for ablation analysis."""

	    def __init__(self):
	        """
	        Used for ablation analysis - removing attention.
	        """
	        super(NaiveAttention, self).__init__()

	    def forward(self, utils, priors):
	        """
	        Pool every utility without learned attention.
	        :param utils: iterable of utilities; an entry that is exactly a tuple
	                is treated as a spatial utility and only its element [1] is used
	        :param priors: iterable aligned with utils; each entry is a tensor or None
	        :return: (atten, spatial_atten) - pooled results for the plain
	                utilities and for the tuple utilities respectively
	        """
	        pooled = []
	        spatial_pooled = []
	        for util, prior in zip(utils, priors):
	            has_prior = prior is not None

	            # Plain utility: prior-weighted sum over axis 1, or a uniform mean.
	            if type(util) is not tuple:
	                if has_prior:
	                    weights = prior.unsqueeze(2)
	                    pooled.append(torch.bmm(util.transpose(1, 2), weights).squeeze(2))
	                else:
	                    pooled.append(util.mean(1))
	                continue

	            # Tuple utility: only the second element carries the features.
	            util = util[1]
	            num_elements = util.shape[0]
	            if not has_prior:
	                spatial_pooled.append(util.mean(2))
	                continue

	            # Flatten all leading axes into one batch axis for bmm, then
	            # restore the leading axis afterwards.
	            # NOTE(review): the shapes the prior must have here are not
	            # visible from this file - confirm against the caller.
	            util = util.view(-1, util.shape[-2], util.shape[-1])
	            prior = prior.view(-1, prior.shape[-2], prior.shape[-1])
	            mixed = torch.bmm(prior.transpose(1, 2), util).squeeze(2)
	            spatial_pooled.append(
	                mixed.view(num_elements, -1, util.shape[-2], util.shape[-1]))
	        return pooled, spatial_pooled