ai / mlm.py
iamspriyadarshi's picture
Upload 13 files
11767f5 verified
"""
Method MLM.
Code to run the Multivariate Linear Model (MLM) method.
"""
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from pre import extract, match, rename_net, get_net_mat, filt_min_n, return_data
from scipy import stats
from tqdm.auto import tqdm
import numba as nb
@nb.njit(nb.f4[:, :](nb.f4[:, :], nb.f4[:, :], nb.f4[:, :], nb.i8), parallel=True, cache=True)
def fit_mlm(X, y, inv, df):
X = np.ascontiguousarray(X)
n_samples = y.shape[1]
n_fsets = X.shape[1]
coef, sse, _, _ = np.linalg.lstsq(X, y)
if len(sse) == 0:
raise ValueError("""Couldn\'t fit a multivariate linear model. This can happen because there are more sources
(covariates) than unique targets (samples), or because the network\'s matrix rank is smaller than the number of
sources.""")
sse = sse / df
se = np.zeros((n_samples, n_fsets), dtype=nb.f4)
for i in nb.prange(n_samples):
se[i] = np.sqrt(np.diag(sse[i] * inv))
t = coef.T/se
return t.astype(nb.f4)
def mlm(mat, net, batch_size=10000, verbose=False):
# Get dims
n_samples = mat.shape[0]
n_features, n_fsets = net.shape
# Add intercept to network
net = np.column_stack((np.ones((n_features, ), dtype=np.float32), net))
# Compute inv and df for lm
inv = np.linalg.inv(np.dot(net.T, net))
df = n_features - n_fsets - 1
if isinstance(mat, csr_matrix):
# Init empty acts
n_batches = int(np.ceil(n_samples / batch_size))
es = np.zeros((n_samples, n_fsets), dtype=np.float32)
for i in tqdm(range(n_batches), disable=not verbose):
# Subset batch
srt, end = i * batch_size, i * batch_size + batch_size
y = mat[srt:end].toarray().T
# Compute MLM for batch
es[srt:end] = fit_mlm(net, y, inv, df)[:, 1:]
else:
# Compute MLM for all
es = fit_mlm(net, mat.T, inv, df)[:, 1:]
# Get p-values
pvals = 2 * (1 - stats.t.cdf(np.abs(es), df))
return es, pvals
def run_mlm(mat, net, source='source', target='target', weight='weight', batch_size=10000,
min_n=5, verbose=False, use_raw=True):
"""
Multivariate Linear Model (MLM).
MLM fits a multivariate linear model for each sample, where the observed molecular readouts in `mat` are the response
variable and the regulator weights in `net` are the covariates. Target features with no associated weight are set to
zero. The obtained t-values from the fitted model are the activities (`mlm_estimate`) of the regulators in `net`.
Parameters
----------
mat : list, DataFrame or AnnData
List of [features, matrix], dataframe (samples x features) or an AnnData instance.
net : DataFrame
Network in long format.
source : str
Column name in net with source nodes.
target : str
Column name in net with target nodes.
weight : str
Column name in net with weights.
batch_size : int
Size of the samples to use for each batch. Increasing this will consume more memmory but it will run faster.
min_n : int
Minimum of targets per source. If less, sources are removed.
verbose : bool
Whether to show progress.
use_raw : bool
Use raw attribute of mat if present.
Returns
-------
estimate : DataFrame
MLM scores. Stored in `.obsm['mlm_estimate']` if `mat` is AnnData.
pvals : DataFrame
Obtained p-values. Stored in `.obsm['mlm_pvals']` if `mat` is AnnData.
"""
# Extract sparse matrix and array of genes
m, r, c = extract(mat, use_raw=use_raw, verbose=verbose)
# Transform net
net = rename_net(net, source=source, target=target, weight=weight)
net = filt_min_n(c, net, min_n=min_n)
sources, targets, net = get_net_mat(net)
# Match arrays
net = match(c, targets, net)
if verbose:
print('Running mlm on mat with {0} samples and {1} targets for {2} sources.'.format(m.shape[0], len(c), net.shape[1]))
# Run MLM
estimate, pvals = mlm(m, net, batch_size=batch_size, verbose=verbose)
# Transform to df
estimate = pd.DataFrame(estimate, index=r, columns=sources)
estimate.name = 'mlm_estimate'
pvals = pd.DataFrame(pvals, index=r, columns=sources)
pvals.name = 'mlm_pvals'
return return_data(mat=mat, results=(estimate, pvals))