File size: 9,348 Bytes
11767f5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 |
"""
Preprocessing functions.
Functions to preprocess the data before running any method.
"""
import numpy as np
from scipy.sparse import csr_matrix, issparse
import pandas as pd
import logging
from anndata import AnnData
from numpy.random import default_rng
def check_mat(m, r, c, verbose=False):
    """
    Validates a data matrix and removes empty features and samples.

    Parameters
    ----------
    m : ndarray or sparse matrix
        Matrix of values (samples x features). Any sparse format is accepted
        and converted to csr.
    r : ndarray
        Array of sample names, one per row of `m`.
    c : ndarray
        Array of feature names, one per column of `m`.
    verbose : bool
        Whether to print how many empty features/samples are removed.

    Returns
    -------
    m : ndarray or csr_matrix
        Matrix without all-zero columns and rows.
    r : ndarray
        Sample names of the kept rows.
    c : ndarray
        Feature names of the kept columns.

    Raises
    ------
    ValueError
        If feature names are repeated or if `m` contains nan or inf.
    """
    # Accept any sparse format but transform to csr
    if issparse(m) and not isinstance(m, csr_matrix):
        m = csr_matrix(m)
    is_sparse = isinstance(m, csr_matrix)

    # Check for empty features
    if is_sparse:
        msk_features = m.getnnz(axis=0) == 0
    else:
        msk_features = np.count_nonzero(m, axis=0) == 0
    n_empty_features = np.sum(msk_features)
    if n_empty_features > 0:
        if verbose:
            print("{0} features of mat are empty, they will be removed.".format(n_empty_features))
        c = c[~msk_features]
        m = m[:, ~msk_features]

    # Check for repeated features. The names are not sorted at this point, so
    # comparing neighbours would miss non-adjacent duplicates.
    if np.unique(c).size != len(c):
        raise ValueError("""mat contains repeated feature names, please make them unique.""")

    # Check for empty samples
    if is_sparse:
        msk_samples = m.getnnz(axis=1) == 0
    else:
        msk_samples = np.count_nonzero(m, axis=1) == 0
    n_empty_samples = np.sum(msk_samples)
    if n_empty_samples > 0:
        if verbose:
            print("{0} samples of mat are empty, they will be removed.".format(n_empty_samples))
        r = r[~msk_samples]
        m = m[~msk_samples]

    # Check for non finite values. For csr only the stored entries need to be
    # inspected; for dense input inspect the array itself.
    values = m.data if is_sparse else m
    if not np.all(np.isfinite(values)):
        raise ValueError("""mat contains non finite values (nan or inf), please set them to 0 or remove them.""")

    return m, r, c
def extract(mat, use_raw=True, verbose=False, dtype=np.float32):
    """
    Converts the supported input containers into a (matrix, samples, features) triplet.

    Parameters
    ----------
    mat : list, pd.DataFrame or AnnData
        List of [matrix, samples, features], dataframe (samples x features) or an AnnData instance.
    use_raw : bool
        For AnnData input, read from the `raw` attribute instead of the main one.
    verbose : bool
        Forwarded to `check_mat`.
    dtype : type
        Float type the matrix is cast to.

    Returns
    -------
    m : ndarray or sparse matrix
        Matrix of values with columns ordered by feature name.
    r : ndarray
        Array of sample names.
    c : ndarray
        Array of sorted feature names.

    Raises
    ------
    ValueError
        If `mat` is of an unsupported type, or `use_raw=True` and `mat.raw` is None.
    """
    kind = type(mat)
    if kind is list:
        values, samples, features = mat
        m = np.array(values, dtype=dtype)
        r = np.array(samples, dtype='U')
        c = np.array(features, dtype='U')
    elif kind is pd.DataFrame:
        m = mat.values.astype(dtype)
        r = mat.index.values.astype('U')
        c = mat.columns.values.astype('U')
    elif kind is AnnData:
        if use_raw and mat.raw is None:
            raise ValueError("Received `use_raw=True`, but `mat.raw` is empty.")
        # Values and feature names come from the same layer, sample names always from obs
        layer = mat.raw if use_raw else mat
        m = layer.X.astype(dtype)
        c = layer.var.index.values.astype('U')
        r = mat.obs.index.values.astype('U')
    else:
        raise ValueError(
            "mat must be a list of [matrix, samples, features], dataframe (samples x features) or an AnnData\n"
            "instance."
        )

    # Drop empty rows/columns and reject invalid values
    m, r, c = check_mat(m, r, c, verbose=verbose)

    # Order columns alphabetically by feature name
    order = np.argsort(c)
    sorted_m = m[:, order].astype(dtype)
    sorted_c = c[order].astype('U')
    return sorted_m, r.astype('U'), sorted_c
def filt_min_n(c, net, min_n=5):
    """
    Keeps only the sources of `net` that have at least `min_n` targets present in `mat`.

    Edges whose target is not among the features of `mat` are discarded first,
    then every source left with fewer than `min_n` targets is dropped.

    Parameters
    ----------
    c : ndarray
        Column (feature) names of `mat`.
    net : DataFrame
        Network in long format with `source` and `target` columns.
    min_n : int
        Minimum number of targets a source needs to be kept.

    Returns
    -------
    net : DataFrame
        Filtered net in long format.

    Raises
    ------
    ValueError
        If no edge is left after filtering.
    """
    # Edges whose target exists in mat
    target_names = net['target'].values.astype('U')
    net = net.iloc[np.isin(target_names, c)]

    # Number of remaining targets per source
    source_names = net['source'].values.astype('U')
    uniq_sources, n_targets = np.unique(source_names, return_counts=True)
    kept_sources = uniq_sources[n_targets >= min_n]

    # Edges belonging to sufficiently large sources
    net = net[np.isin(source_names, kept_sources)]

    if net.shape[0] == 0:
        raise ValueError(
            "No sources with more than min_n={0} targets. Make sure mat and net have shared target features or\n"
            "reduce the number assigned to min_n".format(min_n)
        )
    return net
def match(c, r, net):
    """
    Matches `mat` with a regulatory adjacency matrix.

    Builds a matrix with one row per feature of `mat`. Rows of `net` are copied
    to the position of the feature with the same name; features of `mat` that
    are not in `net` stay at 0 and rows of `net` whose name is not in `c` are
    ignored.

    Parameters
    ----------
    c : ndarray
        Column names of `mat`.
    r : ndarray
        Row names of `net`.
    net : ndarray
        Regulatory adjacency matrix (rows named by `r`).

    Returns
    -------
    regX : ndarray
        Matching regulatory adjacency matrix of shape (len(c), net.shape[1]) and dtype float32.
    """
    # Init empty regX
    regX = np.zeros((c.shape[0], net.shape[1]), dtype=np.float32)

    # Position of every feature of mat
    c_dict = {gene: i for i, gene in enumerate(c)}

    # Keep, for every matched gene, both its row in net and its row in regX so
    # that rows stay aligned even when some genes of r are missing from c.
    net_rows, mat_rows = [], []
    for i, gene in enumerate(r):
        j = c_dict.get(gene)
        if j is not None:
            net_rows.append(i)
            mat_rows.append(j)

    # Populate regX using advanced indexing
    if mat_rows:
        regX[mat_rows, :] = net[net_rows, :]

    return regX
def rename_net(net, source='source', target='target', weight='weight'):
    """
    Renames input network to match decoupler's format (source, target, weight).

    Parameters
    ----------
    net : DataFrame
        Network in long format.
    source : str
        Column name where to extract source features.
    target : str
        Column name where to extract target features.
    weight : str, None
        Column name where to extract features' weights. If no weights are available, set to None,
        in which case every edge gets a weight of 1.0.

    Returns
    -------
    net : DataFrame
        New dataframe with exactly the columns `source`, `target` and `weight`. The input is not modified.

    Raises
    ------
    AssertionError
        If any of the requested columns is not in `net`.
    ValueError
        If the same (source, target) pair appears more than once.
    """
    # Check if names are in columns. Raised explicitly (instead of `assert`) so
    # the validation is not stripped when running with `python -O`.
    msg = 'Column name "{0}" not found in net. Please specify a valid column.'
    if source not in net.columns:
        raise AssertionError(msg.format(source))
    if target not in net.columns:
        raise AssertionError(msg.format(target))
    if weight is not None and weight not in net.columns:
        raise AssertionError(msg.format(weight) + " Alternatively, set to None if no weights are available.")

    # Select the requested columns BEFORE renaming: otherwise an unrelated
    # column already called e.g. 'weight' would clash with the renamed one.
    if weight is None:
        net = net[[source, target]].rename(columns={source: 'source', target: 'target'})
        net = net.assign(weight=1.0)
    else:
        net = net[[source, target, weight]].rename(
            columns={source: 'source', target: 'target', weight: 'weight'}
        )

    # Check if duplicated
    if net.duplicated(['source', 'target']).any():
        raise ValueError('net contains repeated edges, please remove them.')

    return net
def get_net_mat(net):
    """
    Turns a long-format network into a dense adjacency matrix (targets x sources).

    Parameters
    ----------
    net : DataFrame
        Network in long format with `source`, `target` and `weight` columns.

    Returns
    -------
    sources : ndarray
        Array of source names (columns of `X`).
    targets : ndarray
        Array of target names (rows of `X`).
    X : ndarray
        Array of interactions between sources and targets (target x source), as float32.
        Source-target pairs absent from `net` are 0.
    """
    # One row per target, one column per source
    wide = net.pivot(index='target', columns='source', values='weight')

    # Node names
    source_names = wide.columns.values.astype('U')
    target_names = wide.index.values.astype('U')

    # Missing interactions come out of the pivot as NaN; set them to 0
    weights = wide.values
    weights = np.where(np.isnan(weights), 0, weights)

    return source_names, target_names, weights.astype(np.float32)
def mask_features(mat, log=False, thr=1, use_raw=False):
    """
    Sets to 0 every value of `mat` that is below a threshold, in place.

    Parameters
    ----------
    mat : list, pd.DataFrame or AnnData
        List of [matrix, samples, features], dataframe (samples x features) or an AnnData instance.
    log : bool
        If True, the threshold actually used is `exp(thr) - 1`.
    thr : float
        Values strictly below this threshold are set to 0.
    use_raw : bool
        For AnnData input, mask `mat.raw.X` instead of `mat.X`.

    Returns
    -------
    list, pd.DataFrame or None
        A new list holding the same (masked) objects for list input, the same
        dataframe for dataframe input, and nothing for AnnData input.

    Raises
    ------
    ValueError
        If `mat` is of an unsupported type, or `use_raw=True` and `mat.raw` is None.
    """
    cutoff = np.exp(thr) - 1 if log else thr

    if type(mat) is list:
        values, samples, features = mat
        values[values < cutoff] = 0.0
        return [values, samples, features]

    if type(mat) is pd.DataFrame:
        mat.loc[:, :] = np.where(mat.values < cutoff, 0.0, mat.values)
        return mat

    if type(mat) is AnnData:
        if not use_raw:
            mat.X[mat.X < cutoff] = 0.0
            return None
        if mat.raw is None:
            raise ValueError("Received `use_raw=True`, but `mat.raw` is empty.")
        mat.raw.X[mat.raw.X < cutoff] = 0.0
        return None

    raise ValueError(
        "mat must be a list of [matrix, samples, features], dataframe (samples x features) or an AnnData\n"
        "instance."
    )
def add_to_anndata(mat, results):
    """
    Stores every non-None result in `mat.obsm`, keyed by the result's `name` attribute.

    Parameters
    ----------
    mat : AnnData
        Object whose `obsm` mapping is updated in place.
    results : iterable
        Results to store; None entries are skipped.
    """
    for res in results:
        if res is None:
            continue
        mat.obsm[res.name] = res
def return_data(mat, results):
    """
    Hands the results back in the form that matches the type of the input.

    Parameters
    ----------
    mat : AnnData or other
        Original input given by the caller.
    results : sequence
        Results to return; entries may be None.

    Returns
    -------
    tuple, AnnData or None
        For non-AnnData input, a tuple with the non-None results. For AnnData
        input the results are stored in `obsm`: None is returned when `mat`
        has as many observations as the first result, otherwise a copy of
        `mat` subset to the observations of the first result is returned.
    """
    if not isinstance(mat, AnnData):
        return tuple(res for res in results if res is not None)

    kept_obs = results[0].index
    if mat.obs_names.size == kept_obs.size:
        # Nothing was dropped, annotate the caller's object directly
        add_to_anndata(mat, results)
        return None

    logging.warning('Provided AnnData contains empty observations. Returning repaired object.')
    repaired = mat[kept_obs, :].copy()
    add_to_anndata(repaired, results)
    return repaired
def break_ties(m, c, seed):
    """
    Shuffles the feature order of `m` and `c` with a seeded generator.

    Parameters
    ----------
    m : ndarray or sparse matrix
        Matrix whose columns are features.
    c : ndarray
        Array of feature names, one per column of `m`.
    seed : int
        Seed for the random number generator.

    Returns
    -------
    m : ndarray or sparse matrix
        Matrix with columns reordered.
    c : ndarray
        Feature names in the same new order.
    """
    # Draw every column position once, without replacement, so ties are broken randomly
    n_features = c.size
    generator = default_rng(seed=seed)
    order = generator.choice(np.arange(n_features), n_features, replace=False)
    return m[:, order], c[order]
|