"""
Preprocessing functions.
Functions to preprocess the data before running any method.
"""

import logging

import numpy as np
import pandas as pd
from anndata import AnnData
from numpy.random import default_rng
from scipy.sparse import csr_matrix, issparse


def check_mat(m, r, c, verbose=False):
    """
    Validates a data matrix and removes empty features and samples.

    Parameters
    ----------
    m : ndarray or sparse matrix
        Data matrix (samples x features). Any sparse format is converted to csr.
    r : ndarray
        Array of sample names.
    c : ndarray
        Array of feature names.
    verbose : bool
        Whether to print how many empty features/samples are removed.

    Returns
    -------
    m : ndarray or csr_matrix
        Matrix without all-zero columns and rows.
    r : ndarray
        Sample names of the kept rows.
    c : ndarray
        Feature names of the kept columns.

    Raises
    ------
    ValueError
        If feature names are repeated or the matrix contains nan/inf values.
    """

    # Accept any sparse format but transform to csr
    if issparse(m) and not isinstance(m, csr_matrix):
        m = csr_matrix(m)
    is_sparse = isinstance(m, csr_matrix)

    # Check for empty features
    if is_sparse:
        msk_features = m.getnnz(axis=0) == 0
    else:
        msk_features = np.count_nonzero(m, axis=0) == 0
    n_empty_features = np.sum(msk_features)
    if n_empty_features > 0:
        if verbose:
            print("{0} features of mat are empty, they will be removed.".format(n_empty_features))
        c = c[~msk_features]
        m = m[:, ~msk_features]

    # Check for repeated features. Names are not sorted at this point, so
    # comparing neighbours would miss non-adjacent duplicates.
    if np.unique(c).size != np.size(c):
        raise ValueError("mat contains repeated feature names, please make them unique.")

    # Check for empty samples
    if is_sparse:
        msk_samples = m.getnnz(axis=1) == 0
    else:
        msk_samples = np.count_nonzero(m, axis=1) == 0
    n_empty_samples = np.sum(msk_samples)
    if n_empty_samples > 0:
        if verbose:
            print("{0} samples of mat are empty, they will be removed.".format(n_empty_samples))
        r = r[~msk_samples]
        m = m[~msk_samples]

    # Check for non finite values (only stored values need checking when sparse)
    values = m.data if is_sparse else m
    if np.any(~np.isfinite(values)):
        raise ValueError("mat contains non finite values (nan or inf), please set them to 0 or remove them.")

    return m, r, c


def extract(mat, use_raw=True, verbose=False, dtype=np.float32):
    """
    Processes different input types so that they can be used downstream.

    Parameters
    ----------
    mat : list, pd.DataFrame or AnnData
        List of [matrix, samples, features], dataframe (samples x features) or an AnnData instance.
    use_raw : bool
        Use `raw` attribute of `adata` if present.
    verbose : bool
        Whether to print how many empty features/samples are removed.
    dtype : type
        Type of float used.

    Returns
    -------
    m : ndarray or csr_matrix
        Matrix containing molecular readouts or statistics, with columns sorted by feature name.
    r : ndarray
        Array of sample names.
    c : ndarray
        Array of sorted feature names.

    Raises
    ------
    ValueError
        If `mat` has an unsupported type, if `use_raw=True` but `mat.raw` is missing, or if the
        matrix fails the checks in `check_mat`.
    """

    if isinstance(mat, list):
        m, r, c = mat
        # Keep sparse input sparse; np.array cannot densify a scipy sparse matrix
        m = m.astype(dtype) if issparse(m) else np.array(m, dtype=dtype)
        r = np.array(r, dtype='U')
        c = np.array(c, dtype='U')
    elif isinstance(mat, pd.DataFrame):
        m = mat.values.astype(dtype)
        r = mat.index.values.astype('U')
        c = mat.columns.values.astype('U')
    elif isinstance(mat, AnnData):
        if use_raw:
            if mat.raw is None:
                raise ValueError("Received `use_raw=True`, but `mat.raw` is empty.")
            m = mat.raw.X.astype(dtype)
            c = mat.raw.var.index.values.astype('U')
        else:
            m = mat.X.astype(dtype)
            c = mat.var.index.values.astype('U')
        r = mat.obs.index.values.astype('U')
    else:
        raise ValueError(
            "mat must be a list of [matrix, samples, features], dataframe (samples x features) "
            "or an AnnData instance."
        )

    # Check mat for empty or not finite values
    m, r, c = check_mat(m, r, c, verbose=verbose)

    # Sort genes
    order = np.argsort(c)

    return m[:, order].astype(dtype), r.astype('U'), c[order].astype('U')


def filt_min_n(c, net, min_n=5):
    """
    Removes sources of a `net` with less than min_n targets.

    First it filters target features in `net` that are not in `mat` and then removes sources with
    less than `min_n` targets.

    Parameters
    ----------
    c : ndarray
        Column names of `mat`.
    net : DataFrame
        Network in long format.
    min_n : int
        Minimum of targets per source. If less, sources are removed.

    Returns
    -------
    net : DataFrame
        Filtered net in long format.

    Raises
    ------
    ValueError
        If no edges remain after filtering.
    """

    # Find shared targets between mat and net
    msk = np.isin(net['target'].values.astype('U'), c)
    net = net.iloc[msk]

    # Count targets per source
    net_sources = net['source'].values.astype('U')
    sources, counts = np.unique(net_sources, return_counts=True)

    # Keep sources with at least min_n targets
    msk = np.isin(net_sources, sources[counts >= min_n])
    net = net[msk]

    if net.shape[0] == 0:
        raise ValueError(
            "No sources with more than min_n={0} targets. Make sure mat and net have shared target "
            "features or reduce the number assigned to min_n".format(min_n)
        )

    return net


def match(c, r, net):
    """
    Matches `mat` with a regulatory adjacency matrix.

    Parameters
    ----------
    c : ndarray
        Column names of `mat`.
    r : ndarray
        Row names of `net`.
    net : ndarray
        Regulatory adjacency matrix.

    Returns
    -------
    regX : ndarray
        Matching regulatory adjacency matrix with one row per entry of `c`. Rows of features that
        are absent from `r` are left as zeros; rows of `net` whose name is absent from `c` are
        dropped.
    """

    # Init empty regX
    regX = np.zeros((c.shape[0], net.shape[1]), dtype=np.float32)

    # Position of every feature name in c
    c_pos = {gene: i for i, gene in enumerate(c)}

    # Collect, for every row name found in c, its row in net and its row in regX.
    # Both sides must be tracked so rows stay aligned when some names are missing.
    net_rows, reg_rows = [], []
    for i, gene in enumerate(r):
        j = c_pos.get(gene)
        if j is not None:
            net_rows.append(i)
            reg_rows.append(j)
    net_rows = np.asarray(net_rows, dtype=np.intp)
    reg_rows = np.asarray(reg_rows, dtype=np.intp)

    # Populate regX using advanced indexing
    regX[reg_rows, :] = net[net_rows, :]

    return regX


def rename_net(net, source='source', target='target', weight='weight'):
    """
    Renames input network to match decoupler's format (source, target, weight).

    Parameters
    ----------
    net : DataFrame
        Network in long format.
    source : str
        Column name where to extract source features.
    target : str
        Column name where to extract target features.
    weight : str, None
        Column name where to extract features' weights. If no weights are available, set to None.

    Returns
    -------
    net : DataFrame
        Renamed network.

    Raises
    ------
    AssertionError
        If any of the given column names is not in `net`.
    ValueError
        If `net` contains repeated (source, target) edges.
    """

    # Check if names are in columns. AssertionError is raised explicitly (instead of using
    # `assert`) so the validation still runs when Python is started with -O.
    msg = 'Column name "{0}" not found in net. Please specify a valid column.'
    if source not in net.columns:
        raise AssertionError(msg.format(source))
    if target not in net.columns:
        raise AssertionError(msg.format(target))
    if weight is not None:
        if weight not in net.columns:
            raise AssertionError(
                msg.format(weight) + " Alternatively, set to None if no weights are available."
            )
    else:
        # No weights given: work on a copy and use a constant weight of 1
        net = net.copy()
        net['weight'] = 1.0
        weight = 'weight'

    # Rename
    net = net.rename(columns={source: 'source', target: 'target', weight: 'weight'})

    # Keep only the three expected columns, in order
    net = net.reindex(columns=['source', 'target', 'weight'])

    # Check if duplicated
    n_dup = net.duplicated(['source', 'target']).sum()
    if n_dup > 0:
        raise ValueError('net contains repeated edges, please remove them.')

    return net


def get_net_mat(net):
    """
    Transforms a given network to a regulatory adjacency matrix (targets x sources).

    Parameters
    ----------
    net : DataFrame
        Network in long format.

    Returns
    -------
    sources : ndarray
        Array of source names.
    targets : ndarray
        Array of target names.
    X : ndarray
        Array of interactions between sources and targets (target x source).
    """

    # Pivot df to a wider format, missing interactions become 0
    X = net.pivot(columns='source', index='target', values='weight').fillna(0)

    # Store node names and weights
    sources = X.columns.values
    targets = X.index.values
    X = X.values

    return sources.astype('U'), targets.astype('U'), X.astype(np.float32)


def mask_features(mat, log=False, thr=1, use_raw=False):
    """
    Sets to 0 all values of `mat` that are below a threshold. The input is modified in place.

    Parameters
    ----------
    mat : list, pd.DataFrame or AnnData
        List of [matrix, samples, features], dataframe (samples x features) or an AnnData instance.
    log : bool
        If True, `thr` is replaced by `exp(thr) - 1` before comparing.
        NOTE(review): presumably for a `thr` given in log1p scale, confirm with callers.
    thr : float
        Values strictly lower than this are set to 0.
    use_raw : bool
        For AnnData input, mask `mat.raw.X` instead of `mat.X`.

    Returns
    -------
    mat : list, pd.DataFrame or None
        The masked list or dataframe. Nothing is returned for AnnData input.

    Raises
    ------
    ValueError
        If `mat` has an unsupported type, or if `use_raw=True` but `mat.raw` is missing.
    """

    if log:
        thr = np.exp(thr) - 1
    if isinstance(mat, list):
        m, r, c = mat
        m[m < thr] = 0.0
        return [m, r, c]
    elif isinstance(mat, pd.DataFrame):
        mat.loc[:, :] = np.where(mat.values < thr, 0.0, mat.values)
        return mat
    elif isinstance(mat, AnnData):
        if use_raw:
            if mat.raw is None:
                raise ValueError("Received `use_raw=True`, but `mat.raw` is empty.")
            mat.raw.X[mat.raw.X < thr] = 0.0
        else:
            mat.X[mat.X < thr] = 0.0
    else:
        raise ValueError(
            "mat must be a list of [matrix, samples, features], dataframe (samples x features) "
            "or an AnnData instance."
        )


def add_to_anndata(mat, results):
    """
    Stores every non-None result in `mat.obsm` under the key given by its `name` attribute.

    Parameters
    ----------
    mat : AnnData
        Object whose `obsm` is updated in place.
    results : iterable
        Results to store, None entries are skipped.
    """

    for result in results:
        if result is not None:
            mat.obsm[result.name] = result


def return_data(mat, results):
    """
    Returns results in a way that depends on the type of the input `mat`.

    Parameters
    ----------
    mat : list, pd.DataFrame or AnnData
        Input that was used to compute `results`.
    results : sequence
        Results, possibly containing None entries. For AnnData input the first one must have
        an `index`.

    Returns
    -------
    None, AnnData or tuple
        For AnnData input results are stored in `obsm`: None is returned if `mat` was updated in
        place, or a subsetted copy if the number of observations differs from the number of rows
        of the first result. For any other input, a tuple with the non-None results.
    """

    if not isinstance(mat, AnnData):
        return tuple(result for result in results if result is not None)

    if mat.obs_names.size != results[0].index.size:
        logging.warning('Provided AnnData contains empty observations. Returning repaired object.')
        # Keep only the observations that have results and hand back the new object
        mat = mat[results[0].index, :].copy()
        add_to_anndata(mat, results)
        return mat

    add_to_anndata(mat, results)
    return None


def break_ties(m, c, seed):
    """
    Randomizes feature order to break ties randomly.

    Parameters
    ----------
    m : ndarray or sparse matrix
        Data matrix (samples x features).
    c : ndarray
        Array of feature names.
    seed : int
        Seed for the random number generator.

    Returns
    -------
    m : ndarray or sparse matrix
        Matrix with shuffled columns.
    c : ndarray
        Feature names in the same shuffled order.
    """

    rng = default_rng(seed=seed)
    idx = np.arange(c.size)
    # Sampling all positions without replacement yields a random permutation
    idx = rng.choice(idx, c.size, replace=False)
    m, c = m[:, idx], c[idx]

    return m, c