|
"""
|
|
Preprocessing functions.
|
|
Functions to preprocess the data before running any method.
|
|
"""
|
|
|
|
import numpy as np
|
|
from scipy.sparse import csr_matrix, issparse
|
|
import pandas as pd
|
|
import logging
|
|
from anndata import AnnData
|
|
from numpy.random import default_rng
|
|
|
|
|
|
def check_mat(m, r, c, verbose=False):
|
|
|
|
|
|
if issparse(m) and not isinstance(m, csr_matrix):
|
|
m = csr_matrix(m)
|
|
|
|
|
|
if type(m) is csr_matrix:
|
|
msk_features = m.getnnz(axis=0) == 0
|
|
else:
|
|
msk_features = np.count_nonzero(m, axis=0) == 0
|
|
n_empty_features = np.sum(msk_features)
|
|
if n_empty_features > 0:
|
|
if verbose:
|
|
print("{0} features of mat are empty, they will be removed.".format(n_empty_features))
|
|
c = c[~msk_features]
|
|
m = m[:, ~msk_features]
|
|
|
|
|
|
if np.any(c[1:] == c[:-1]):
|
|
raise ValueError("""mat contains repeated feature names, please make them unique.""")
|
|
|
|
|
|
if type(m) is csr_matrix:
|
|
msk_samples = m.getnnz(axis=1) == 0
|
|
else:
|
|
msk_samples = np.count_nonzero(m, axis=1) == 0
|
|
n_empty_samples = np.sum(msk_samples)
|
|
if n_empty_samples > 0:
|
|
if verbose:
|
|
print("{0} samples of mat are empty, they will be removed.".format(n_empty_samples))
|
|
r = r[~msk_samples]
|
|
m = m[~msk_samples]
|
|
|
|
|
|
if np.any(~np.isfinite(m.data)):
|
|
raise ValueError("""mat contains non finite values (nan or inf), please set them to 0 or remove them.""")
|
|
|
|
return m, r, c
|
|
|
|
|
|
def extract(mat, use_raw=True, verbose=False, dtype=np.float32):
|
|
"""
|
|
Processes different input types so that they can be used downstream.
|
|
|
|
Parameters
|
|
----------
|
|
mat : list, pd.DataFrame or AnnData
|
|
List of [matrix, samples, features], dataframe (samples x features) or an AnnData instance.
|
|
use_raw : bool
|
|
Use `raw` attribute of `adata` if present.
|
|
dtype : type
|
|
Type of float used.
|
|
|
|
Returns
|
|
-------
|
|
m : csr_matrix
|
|
Sparse matrix containing molecular readouts or statistics.
|
|
r : ndarray
|
|
Array of sample names.
|
|
c : ndarray
|
|
Array of feature names.
|
|
"""
|
|
|
|
if type(mat) is list:
|
|
m, r, c = mat
|
|
m = np.array(m, dtype=dtype)
|
|
r = np.array(r, dtype='U')
|
|
c = np.array(c, dtype='U')
|
|
elif type(mat) is pd.DataFrame:
|
|
m = mat.values.astype(dtype)
|
|
r = mat.index.values.astype('U')
|
|
c = mat.columns.values.astype('U')
|
|
elif type(mat) is AnnData:
|
|
if use_raw:
|
|
if mat.raw is None:
|
|
raise ValueError("Received `use_raw=True`, but `mat.raw` is empty.")
|
|
m = mat.raw.X.astype(dtype)
|
|
c = mat.raw.var.index.values.astype('U')
|
|
else:
|
|
m = mat.X.astype(dtype)
|
|
c = mat.var.index.values.astype('U')
|
|
r = mat.obs.index.values.astype('U')
|
|
|
|
else:
|
|
raise ValueError("""mat must be a list of [matrix, samples, features], dataframe (samples x features) or an AnnData
|
|
instance.""")
|
|
|
|
|
|
m, r, c = check_mat(m, r, c, verbose=verbose)
|
|
|
|
|
|
msk = np.argsort(c)
|
|
|
|
return m[:, msk].astype(dtype), r.astype('U'), c[msk].astype('U')
|
|
|
|
|
|
def filt_min_n(c, net, min_n=5):
|
|
"""
|
|
Removes sources of a `net` with less than min_n targets.
|
|
|
|
First it filters target features in `net` that are not in `mat` and then removes sources with less than `min_n` targets.
|
|
|
|
Parameters
|
|
----------
|
|
c : ndarray
|
|
Column names of `mat`.
|
|
net : DataFrame
|
|
Network in long format.
|
|
min_n : int
|
|
Minimum of targets per source. If less, sources are removed.
|
|
|
|
Returns
|
|
-------
|
|
net : DataFrame
|
|
Filtered net in long format.
|
|
"""
|
|
|
|
|
|
msk = np.isin(net['target'].values.astype('U'), c)
|
|
net = net.iloc[msk]
|
|
|
|
|
|
sources, counts = np.unique(net['source'].values.astype('U'), return_counts=True)
|
|
|
|
|
|
msk = np.isin(net['source'].values.astype('U'), sources[counts >= min_n])
|
|
|
|
|
|
net = net[msk]
|
|
|
|
if net.shape[0] == 0:
|
|
raise ValueError("""No sources with more than min_n={0} targets. Make sure mat and net have shared target features or
|
|
reduce the number assigned to min_n""".format(min_n))
|
|
|
|
return net
|
|
|
|
|
|
def match(c, r, net):
|
|
"""
|
|
Matches `mat` with a regulatory adjacency matrix.
|
|
|
|
Parameters
|
|
----------
|
|
c : ndarray
|
|
Column names of `mat`.
|
|
r : ndarray
|
|
Row names of `net`.
|
|
net : ndarray
|
|
Regulatory adjacency matrix.
|
|
|
|
Returns
|
|
-------
|
|
regX : ndarray
|
|
Matching regulatory adjacency matrix.
|
|
"""
|
|
|
|
|
|
regX = np.zeros((c.shape[0], net.shape[1]), dtype=np.float32)
|
|
|
|
|
|
c_dict = {gene: i for i, gene in enumerate(c)}
|
|
idxs = [c_dict[gene] for gene in r if gene in c_dict]
|
|
|
|
|
|
regX[idxs, :] = net[: len(idxs), :]
|
|
|
|
return regX
|
|
|
|
|
|
def rename_net(net, source='source', target='target', weight='weight'):
|
|
"""
|
|
Renames input network to match decoupler's format (source, target, weight).
|
|
|
|
Parameters
|
|
----------
|
|
net : DataFrame
|
|
Network in long format.
|
|
source : str
|
|
Column name where to extract source features.
|
|
target : str
|
|
Column name where to extract target features.
|
|
weight : str, None
|
|
Column name where to extract features' weights. If no weights are available, set to None.
|
|
|
|
Returns
|
|
-------
|
|
net : DataFrame
|
|
Renamed network.
|
|
"""
|
|
|
|
|
|
msg = 'Column name "{0}" not found in net. Please specify a valid column.'
|
|
assert source in net.columns, msg.format(source)
|
|
assert target in net.columns, msg.format(target)
|
|
if weight is not None:
|
|
assert weight in net.columns, msg.format(weight) + """Alternatively, set to None if no weights are available."""
|
|
else:
|
|
net = net.copy()
|
|
net['weight'] = 1.0
|
|
weight = 'weight'
|
|
|
|
|
|
net = net.rename(columns={source: 'source', target: 'target', weight: 'weight'})
|
|
|
|
|
|
net = net.reindex(columns=['source', 'target', 'weight'])
|
|
|
|
|
|
is_d = net.duplicated(['source', 'target']).sum()
|
|
if is_d > 0:
|
|
raise ValueError('net contains repeated edges, please remove them.')
|
|
|
|
return net
|
|
|
|
|
|
def get_net_mat(net):
|
|
"""
|
|
Transforms a given network to a regulatory adjacency matrix (targets x sources).
|
|
|
|
Parameters
|
|
----------
|
|
net : DataFrame
|
|
Network in long format.
|
|
|
|
Returns
|
|
-------
|
|
sources : ndarray
|
|
Array of source names.
|
|
targets : ndarray
|
|
Array of target names.
|
|
X : ndarray
|
|
Array of interactions bewteen sources and targets (target x source).
|
|
"""
|
|
|
|
|
|
X = net.pivot(columns='source', index='target', values='weight')
|
|
X[np.isnan(X)] = 0
|
|
|
|
|
|
sources = X.columns.values
|
|
targets = X.index.values
|
|
X = X.values
|
|
|
|
return sources.astype('U'), targets.astype('U'), X.astype(np.float32)
|
|
|
|
|
|
def mask_features(mat, log=False, thr=1, use_raw=False):
|
|
if log:
|
|
thr = np.exp(thr) - 1
|
|
if type(mat) is list:
|
|
m, r, c = mat
|
|
m[m < thr] = 0.0
|
|
return [m, r, c]
|
|
elif type(mat) is pd.DataFrame:
|
|
mat.loc[:, :] = np.where(mat.values < thr, 0.0, mat.values)
|
|
return mat
|
|
elif type(mat) is AnnData:
|
|
if use_raw:
|
|
if mat.raw is None:
|
|
raise ValueError("Received `use_raw=True`, but `mat.raw` is empty.")
|
|
mat.raw.X[mat.raw.X < thr] = 0.0
|
|
else:
|
|
mat.X[mat.X < thr] = 0.0
|
|
else:
|
|
raise ValueError("""mat must be a list of [matrix, samples, features], dataframe (samples x features) or an AnnData
|
|
instance.""")
|
|
|
|
|
|
def add_to_anndata(mat, results):
|
|
for result in results:
|
|
if result is not None:
|
|
mat.obsm[result.name] = result
|
|
|
|
|
|
def return_data(mat, results):
|
|
if isinstance(mat, AnnData):
|
|
if mat.obs_names.size != results[0].index.size:
|
|
logging.warning('Provided AnnData contains empty observations. Returning repaired object.')
|
|
mat = mat[results[0].index, :].copy()
|
|
add_to_anndata(mat, results)
|
|
return mat
|
|
else:
|
|
add_to_anndata(mat, results)
|
|
return None
|
|
else:
|
|
return tuple([result for result in results if result is not None])
|
|
|
|
|
|
def break_ties(m, c, seed):
|
|
|
|
rng = default_rng(seed=seed)
|
|
idx = np.arange(c.size)
|
|
idx = rng.choice(idx, c.size, replace=False)
|
|
m, c = m[:, idx], c[idx]
|
|
return m, c
|
|
|