Spaces:

oncomark
/

ai

Running

App Files Files Community

ai / mlm.py

iamspriyadarshi

Upload 13 files

11767f5 verified 22 days ago

raw

history blame contribute delete

4.55 kB

	"""
	Method MLM.
	Code to run the Multivariate Linear Model (MLM) method.
	"""

	import numpy as np
	import pandas as pd
	from scipy.sparse import csr_matrix

	from pre import extract, match, rename_net, get_net_mat, filt_min_n, return_data

	from scipy import stats

	from tqdm.auto import tqdm

	import numba as nb


	@nb.njit(nb.f4[:, :](nb.f4[:, :], nb.f4[:, :], nb.f4[:, :], nb.i8), parallel=True, cache=True)
	def fit_mlm(X, y, inv, df):
	    """
	    Fit a least-squares model of `y` on `X` and return the t-value of every coefficient.

	    Parameters
	    ----------
	    X : float32 2D array
	        Design matrix; its second axis is the number of covariates.
	    y : float32 2D array
	        Response matrix; its second axis is the number of samples.
	    inv : float32 2D array
	        Matrix whose diagonal turns a residual variance into coefficient variances.
	        Presumably inv(X.T @ X) -- it is computed by the caller, not here.
	    df : int
	        Degrees of freedom used to normalise the residual sums of squares.

	    Returns
	    -------
	    float32 2D array of shape (n_samples, n_covariates) holding the t-values.

	    Raises
	    ------
	    ValueError
	        If `np.linalg.lstsq` returns no residuals.
	    """
	    X = np.ascontiguousarray(X)
	    n_samples = y.shape[1]
	    n_fsets = X.shape[1]
	    coef, sse, _, _ = np.linalg.lstsq(X, y)
	    if len(sse) == 0:
	        raise ValueError("""Couldn\'t fit a multivariate linear model. This can happen because there are more sources
	(covariates) than unique targets (samples), or because the network\'s matrix rank is smaller than the number of
	sources.""")

	    # Residual variance per sample.
	    mse = sse / df

	    # Only the diagonal of `inv` is ever used, and it does not depend on the sample: extract it once
	    # instead of building a full (n_fsets x n_fsets) scaled matrix for every sample.
	    inv_diag = np.diag(inv)

	    se = np.zeros((n_samples, n_fsets), dtype=nb.f4)
	    for i in nb.prange(n_samples):
	        # Standard error of each coefficient for sample i.
	        se[i] = np.sqrt(mse[i] * inv_diag)

	    t = coef.T / se
	    return t.astype(nb.f4)


	def mlm(mat, net, batch_size=10000, verbose=False):
	    """
	    Compute MLM t-values and two-sided p-values for every sample in `mat`.

	    Parameters
	    ----------
	    mat : 2D array or scipy.sparse.csr_matrix
	        Data with samples on the first axis. A `csr_matrix` is densified and processed in batches of rows;
	        anything else is passed to `fit_mlm` in one go.
	    net : 2D array
	        Matrix of shape (n_features, n_fsets). An intercept column of ones is prepended before fitting.
	        `fit_mlm` is compiled for float32 inputs, so `net` and `mat` are expected to be float32.
	    batch_size : int
	        Number of rows of a sparse `mat` that are densified per batch.
	    verbose : bool
	        Whether to show a progress bar while iterating over batches.

	    Returns
	    -------
	    es : 2D array of shape (n_samples, n_fsets)
	        t-values of the covariates in `net` (the intercept column is dropped).
	    pvals : 2D array, same shape as `es`
	        Two-sided p-values of `es` under a Student t distribution with `df` degrees of freedom.
	    """

	    # Get dims
	    n_samples = mat.shape[0]
	    n_features, n_fsets = net.shape

	    # Add intercept to network
	    net = np.column_stack((np.ones((n_features, ), dtype=np.float32), net))

	    # Compute inv and df for lm
	    inv = np.linalg.inv(np.dot(net.T, net))
	    df = n_features - n_fsets - 1

	    if isinstance(mat, csr_matrix):
	        # Init empty acts
	        n_batches = int(np.ceil(n_samples / batch_size))
	        es = np.zeros((n_samples, n_fsets), dtype=np.float32)
	        for i in tqdm(range(n_batches), disable=not verbose):

	            # Subset batch (slicing past the last row is clipped by the sparse matrix / numpy)
	            srt, end = i * batch_size, i * batch_size + batch_size
	            y = mat[srt:end].toarray().T

	            # Compute MLM for batch, dropping the intercept column
	            es[srt:end] = fit_mlm(net, y, inv, df)[:, 1:]
	    else:
	        # Compute MLM for all, dropping the intercept column
	        es = fit_mlm(net, mat.T, inv, df)[:, 1:]

	    # Get p-values. The survival function is used instead of `1 - cdf` because the subtraction cancels
	    # catastrophically for large |t| and collapses the p-value to exactly 0.
	    pvals = 2 * stats.t.sf(np.abs(es), df)

	    return es, pvals


	def run_mlm(mat, net, source='source', target='target', weight='weight', batch_size=10000,
	            min_n=5, verbose=False, use_raw=True):
	    """
	    Multivariate Linear Model (MLM).

	    For every sample, MLM fits a single linear model in which the observed molecular readouts in `mat` are the
	    response and the regulator weights in `net` are the covariates. Target features without a weight for a given
	    regulator are set to zero. The t-values of the fitted coefficients are reported as the regulator activities
	    (`mlm_estimate`).

	    Parameters
	    ----------
	    mat : list, DataFrame or AnnData
	        List of [features, matrix], dataframe (samples x features) or an AnnData instance.
	    net : DataFrame
	        Network in long format.
	    source : str
	        Column name in net with source nodes.
	    target : str
	        Column name in net with target nodes.
	    weight : str
	        Column name in net with weights.
	    batch_size : int
	        Number of samples processed per batch. Larger values use more memory but run faster.
	    min_n : int
	        Minimum number of targets per source. Sources with fewer targets are removed.
	    verbose : bool
	        Whether to show progress.
	    use_raw : bool
	        Use raw attribute of mat if present.

	    Returns
	    -------
	    estimate : DataFrame
	        MLM scores. Stored in `.obsm['mlm_estimate']` if `mat` is AnnData.
	    pvals : DataFrame
	        Obtained p-values. Stored in `.obsm['mlm_pvals']` if `mat` is AnnData.
	    """

	    # Pull the data matrix plus its row (sample) and column (feature) labels out of the input
	    values, sample_ids, feature_ids = extract(mat, use_raw=use_raw, verbose=verbose)

	    # Normalise the network, drop small sources and pivot it into a matrix
	    long_net = rename_net(net, source=source, target=target, weight=weight)
	    long_net = filt_min_n(feature_ids, long_net, min_n=min_n)
	    sources, targets, net_mat = get_net_mat(long_net)

	    # Align the network rows with the features of the data
	    net_mat = match(feature_ids, targets, net_mat)

	    if verbose:
	        message = 'Running mlm on mat with {0} samples and {1} targets for {2} sources.'
	        print(message.format(values.shape[0], len(feature_ids), net_mat.shape[1]))

	    # Fit the models
	    scores, pvalues = mlm(values, net_mat, batch_size=batch_size, verbose=verbose)

	    # Wrap both result matrices as labelled, named dataframes
	    estimate = pd.DataFrame(scores, index=sample_ids, columns=sources)
	    pvals = pd.DataFrame(pvalues, index=sample_ids, columns=sources)
	    estimate.name = 'mlm_estimate'
	    pvals.name = 'mlm_pvals'

	    return return_data(mat=mat, results=(estimate, pvals))