Package dropkick
Automated cell filtering and QC of single-cell RNA sequencing data
Expand source code
# -*- coding: utf-8 -*-
"""
Automated cell filtering and QC of single-cell RNA sequencing data
"""
from .api import (
    dropkick,
    recipe_dropkick,
    plot_thresh_obs,
    coef_inventory,
    coef_plot,
    score_plot,
)
from .qc import qc_summary
__all__ = [
    "dropkick",
    "recipe_dropkick",
    "plot_thresh_obs",
    "coef_inventory",
    "coef_plot",
    "score_plot",
    "qc_summary",
]
from ._version import get_versions
__version__ = get_versions()["version"]
del get_versions

Sub-modules
- dropkick.api
- 
Functions for cell filtering of scRNA-seq data via dropkick 
- dropkick.errors
- 
Error reporting for dropkick model 
- dropkick.logistic
- 
Logistic regression model functions for dropkick 
- dropkick.qc
- 
Automated ambient gene testing and counts data QC 
- dropkick.scorer
- 
Modified version of `sklearn.metrics.scorer` to allow for scoring the entire lambda path of a `glmnet` model …
- dropkick.util
- 
Utility functions for scoring dropkick logistic regression model 
Functions
- def coef_inventory(adata, n=10)
- 
Returns highest and lowest coefficient values from logistic regression model, along with sparsity Parameters- adata:- anndata.AnnData
- object generated from dropkick()
- n:- int, optional- (default=10)
- number of genes to show at top and bottom of coefficient list
 Returnsprints top and bottom ngenes by their coefficient values to consoleExpand source codedef coef_inventory(adata, n=10): """ Returns highest and lowest coefficient values from logistic regression model, along with sparsity Parameters ---------- adata : anndata.AnnData object generated from `dropkick` n : int, optional (default=10) number of genes to show at top and bottom of coefficient list Returns ------- prints top and bottom `n` genes by their coefficient values to console """ print("\nTop HVGs by coefficient value (good cells):") print(adata.var.loc[-adata.var.dropkick_coef.isna(), "dropkick_coef"].nlargest(n)) print("\nBottom HVGs by coefficient value (bad droplets):") print(adata.var.loc[-adata.var.dropkick_coef.isna(), "dropkick_coef"].nsmallest(n)) n_zero = (adata.var.dropkick_coef == 0).sum() n_coef = (-adata.var.dropkick_coef.isna()).sum() sparsity = round((n_zero / n_coef) * 100, 3) print( "\n{} coefficients equal to zero. Model sparsity: {} %\n".format( n_zero, sparsity ) )
- def coef_plot(adata, save_to=None, verbose=True)
- 
Plots dropkick coefficient values and cross validation (CV) scores for tested values of lambda (`lambda_path`)

Parameters
- adata : anndata.AnnData
- object generated from dropkick()
- save_to:- str, optional- (default=None)
- path to .pngfile for saving figure
- verbose:- bool, optional- (default=True)
- print updates to console
 Returns- fig:- matplotlib.figure
- plot of CV scores (mean +/- SEM) and coefficient values (coef_path) versus log(lambda_path). includes indicator of chosen lambda value.
 if save_tois not None, write to file instead of returningfigobject.Expand source codedef coef_plot(adata, save_to=None, verbose=True): """ Plots dropkick coefficient values and cross validation (CV) scores for tested values of lambda (`lambda_path`) Parameters ---------- adata : anndata.AnnData object generated from `dropkick` save_to : str, optional (default=None) path to `.png` file for saving figure verbose : bool, optional (default=True) print updates to console Returns ------- fig : matplotlib.figure plot of CV scores (mean +/- SEM) and coefficient values (`coef_path`) versus log(`lambda_path`). includes indicator of chosen lambda value. if `save_to` is not None, write to file instead of returning `fig` object. """ cmap = cm.get_cmap("coolwarm") fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(7, 7), sharex=True) # plot coefficient values versus log(lambda) on top axis axes[0].set_ylabel("Coefficient Value", fontsize=12) axes[0].plot( np.log(adata.uns["dropkick_args"]["lambda_path"]), adata.uns["dropkick_args"]["coef_path"], alpha=0.5, linewidth=2, ) axes[0].tick_params(axis="both", which="major", labelsize=12) # plot vertical line at chosen lambda value and add to legend axes[0].axvline( np.log(adata.uns["dropkick_args"]["chosen_lambda"]), label=None, color="k", ls="--", linewidth=2, ) # plot total model sparsity and top three genes by coefficient value # get range of values for positioning text val_range = ( adata.uns["dropkick_args"]["coef_path"].max() - adata.uns["dropkick_args"]["coef_path"].min() ) # put model sparsity on top n_zero = (adata.var.dropkick_coef == 0).sum() n_coef = (-adata.var.dropkick_coef.isna()).sum() sparsity = round((n_zero / n_coef) * 100, 2) axes[0].text( x=np.log(adata.uns["dropkick_args"]["chosen_lambda"]), y=adata.var.dropkick_coef.max() + (0.24 * val_range), s=" Sparsity: {} %".format(sparsity), fontsize=12, color="k", ) # add top three genes by coefficient value as annotation [ axes[0].text( 
x=np.log(adata.uns["dropkick_args"]["chosen_lambda"]), y=adata.var.dropkick_coef.max() + (0.18 * val_range) - (0.06 * val_range * x), s=" " + adata.var.loc[-adata.var.dropkick_coef.isna(), "dropkick_coef"] .nlargest(3) .index[x], fontsize=12, fontstyle="italic", color="g", ) for x in range(3) ] # add bottom three genes by coefficient value as annotation [ axes[0].text( x=np.log(adata.uns["dropkick_args"]["chosen_lambda"]), y=adata.var.dropkick_coef.min() - (0.24 * val_range) + (0.06 * val_range * x), s=" " + adata.var.loc[-adata.var.dropkick_coef.isna(), "dropkick_coef"] .nsmallest(3) .index[x], fontsize=12, fontstyle="italic", color=cmap(1.0), ) for x in range(3) ] # plot CV scores versus log(lambda) on right y-axis axes[1].plot( np.log(adata.uns["dropkick_args"]["lambda_path"]), -2 * adata.uns["dropkick_args"]["cv_mean_score"], label="Mean Deviance", color=cmap(0.0), linewidth=2, ) axes[1].fill_between( np.log(adata.uns["dropkick_args"]["lambda_path"]), y1=(-2 * adata.uns["dropkick_args"]["cv_mean_score"]) - 2 * adata.uns["dropkick_args"]["cv_standard_error"], y2=(-2 * adata.uns["dropkick_args"]["cv_mean_score"]) + 2 * adata.uns["dropkick_args"]["cv_standard_error"], color=cmap(0.0), alpha=0.3, label="Deviance SEM", ) # plot vertical line at chosen lambda value and add to legend axes[1].axvline( np.log(adata.uns["dropkick_args"]["chosen_lambda"]), label="Chosen lambda: {:.2e}".format( adata.uns["dropkick_args"]["chosen_lambda"][0] ), color="k", ls="--", linewidth=2, ) axes[1].set_xlabel("Log (lambda)", fontsize=12) axes[1].set_ylabel("Binomial Deviance", fontsize=12) axes[1].tick_params(axis="both", which="major", labelsize=12) axes[1].legend(fontsize=12) plt.tight_layout() if save_to: if verbose: print("Saving coefficient plot to {}".format(save_to)) fig.savefig(save_to, dpi=200) else: return fig
- def dropkick(adata, min_genes=50, mito_names='^mt-|^MT-', n_ambient=10, n_hvgs=2000, metrics=['arcsinh_n_genes_by_counts'], thresh_methods=['multiotsu'], directions=['above'], alphas=[0.1], max_iter=2000, n_jobs=2, seed=18, verbose=True)
- 
Generates logistic regression model of cell quality Parameters- adata:- anndata.AnnData
- object containing unfiltered, raw scRNA-seq counts in .Xlayer
- min_genes:- int, optional- (default=50)
- threshold for minimum genes detected. ignores all cells with less than min_genes (dropkick label = 0).
- mito_names:- str, optional- (default="^mt-|^MT-")
- substring encompassing mitochondrial gene names for calculation of mito expression
- n_ambient:- int, optional- (default=10)
- number of ambient genes to call. top genes by cells.
- n_hvgs:- intor- None, optional- (default=2000)
- number of HVGs to calculate using Seurat method. if None, do not calculate HVGs
- metrics:- listof- str, optional- (default="arcsinh_n_genes_by_counts")
- name of column(s) to threshold from adata.obs
- thresh_methods:- listof- str {"otsu","multiotsu","li","mean"}, optional
- (default="multiotsu")
- automated thresholding method(s) corresponding to each element in metrics
- directions:- listof- str {"above","below"}, optional- (default="above")
- which direction to keep during training (dropkick label = 1) corresponding
to each element in metrics
- alphas:- listof- float, optional- (default=0.1)
- alpha value(s) to test using glmnet with n-fold cross validation
- max_iter:- int, optional- (default=2000)
- number of iterations for glmnet optimization
- n_jobs:- int, optional- (default=2)
- number of threads for cross validation by glmnet
- seed:- int, optional- (default=18)
- random state for cross validation by glmnet
- verbose:- bool, optional- (default=True)
- verbosity for glmnet training and warnings
 Returns- rc:- LogisticRegression
- trained logistic regression classifier
 updates adatainplace to include "train", "dropkick_score", and "dropkick_label" columns in.obsExpand source codedef dropkick( adata, min_genes=50, mito_names="^mt-|^MT-", n_ambient=10, n_hvgs=2000, metrics=["arcsinh_n_genes_by_counts"], thresh_methods=["multiotsu"], directions=["above"], alphas=[0.1], max_iter=2000, n_jobs=2, seed=18, verbose=True, ): """ Generates logistic regression model of cell quality Parameters ---------- adata : anndata.AnnData object containing unfiltered, raw scRNA-seq counts in `.X` layer min_genes : int, optional (default=50) threshold for minimum genes detected. ignores all cells with less than min_genes (dropkick label = 0). mito_names : str, optional (default="^mt-|^MT-") substring encompassing mitochondrial gene names for calculation of mito expression n_ambient : int, optional (default=10) number of ambient genes to call. top genes by cells. n_hvgs : int or None, optional (default=2000) number of HVGs to calculate using Seurat method. if None, do not calculate HVGs metrics : list of str, optional (default="arcsinh_n_genes_by_counts") name of column(s) to threshold from `adata.obs` thresh_methods : list of str {"otsu","multiotsu","li","mean"}, optional (default="multiotsu") automated thresholding method(s) corresponding to each element in `metrics` directions : list of str {"above","below"}, optional (default="above") which direction to keep during training (dropkick label = 1) corresponding to each element in `metrics` alphas : list of float, optional (default=0.1) alpha value(s) to test using glmnet with n-fold cross validation max_iter : int, optional (default=2000) number of iterations for glmnet optimization n_jobs : int, optional (default=2) number of threads for cross validation by glmnet seed : int, optional (default=18) random state for cross validation by glmnet verbose : bool, optional (default=True) verbosity for glmnet training and warnings Returns ------- rc : LogisticRegression trained logistic regression classifier 
updates `adata` inplace to include "train", "dropkick_score", and "dropkick_label" columns in `.obs` """ # 0) preprocess counts and calculate required QC metrics a = adata.copy() # make copy of anndata before manipulating a = recipe_dropkick( a, filter=True, min_genes=min_genes, calc_metrics=True, mito_names=mito_names, n_ambient=n_ambient, target_sum=None, n_hvgs=n_hvgs, X_final="arcsinh_norm", verbose=verbose, ) # 1) threshold chosen heuristics using automated methods if verbose: print("Thresholding on heuristics for training labels:\n\t{}".format(metrics)) # convert args to list if isinstance(metrics, str): metrics = [metrics] if isinstance(thresh_methods, str): thresh_methods = [thresh_methods] if isinstance(directions, str): directions = [directions] adata_thresh = auto_thresh_obs( a, methods=thresh_methods, obs_cols=metrics, directions=directions ) # 2) create labels from combination of thresholds a = filter_thresh_obs( a, adata_thresh, obs_cols=metrics, inclusive=True, name="train", verbose=verbose ) X = a[:, a.var.highly_variable].X.copy() # X for testing y = a.obs["train"].copy(deep=True) # final y is "train" labels from step 2 if verbose: print("Training dropkick with alphas:\n\t{}".format(alphas)) if len(alphas) > 1: # 3.1) cross-validation to choose alpha and lambda values cv_scores = {"rc": [], "lambda": [], "alpha": [], "score": []} # dictionary o/p for alpha in alphas: rc = LogitNet( alpha=alpha, n_lambda=100, standardize=False, scoring="log_loss", cut_point=1.0, n_splits=5, max_iter=max_iter, n_jobs=n_jobs, random_state=seed, verbose=verbose, ) rc.fit(adata=a, y=y, n_hvgs=n_hvgs) cv_scores["rc"].append(rc) cv_scores["alpha"].append(alpha) cv_scores["lambda"].append(rc.lambda_best_) cv_scores["score"].append(rc.score(X, y, lamb=rc.lambda_best_)) # determine optimal lambda and alpha values by accuracy score lambda_ = cv_scores["lambda"][ cv_scores["score"].index(max(cv_scores["score"])) ] # choose alpha value alpha_ = cv_scores["alpha"][ 
cv_scores["score"].index(max(cv_scores["score"])) ] # choose l1 ratio rc_ = cv_scores["rc"][ cv_scores["score"].index(max(cv_scores["score"])) ] # choose classifier print( "Chosen lambda value:\n\t{}\nChosen alpha value:\n\t{}".format( lambda_, alpha_ ) ) else: # 3.2) train model with single alpha value rc_ = LogitNet( alpha=alphas[0], n_lambda=100, standardize=False, scoring="log_loss", cut_point=1.0, n_splits=5, max_iter=max_iter, n_jobs=n_jobs, random_state=seed, verbose=verbose, ) rc_.fit(adata=a, y=y, n_hvgs=n_hvgs) print("Chosen lambda value:\n\t{}".format(rc_.lambda_best_)) lambda_, alpha_ = rc_.lambda_best_, alphas[0] # 4) use model to assign scores and labels to original adata print("Assigning scores and labels") if "dropkick_score" in adata.obs.columns: print("Warning: Overwriting existing dropkick scores in .obs") adata.obs["dropkick_score"] = np.nan # initialize dropkick_score as NaNs adata.obs.loc[a.obs_names, "dropkick_score"] = rc_.predict_proba(X)[:, 1] adata.obs.dropkick_score.fillna(0, inplace=True) # fill ignored cells with zeros if "dropkick_label" in adata.obs.columns: print("Warning: Overwriting existing dropkick labels in .obs") adata.obs["dropkick_label"] = np.nan # initialize dropkick_label as NaNs adata.obs.loc[a.obs_names, "dropkick_label"] = rc_.predict(X) adata.obs.dropkick_label.fillna(0, inplace=True) # fill ignored cells with zeros adata.obs.dropkick_label = ( adata.obs.dropkick_label.astype(bool).astype(str).astype("category") ) # convert to categorical strings # add heuristics to .obs for metric in metrics: adata.obs[metric] = np.nan # initialize as NaNs adata.obs.loc[a.obs_names, metric] = a.obs[metric] adata.obs[metric].fillna(0, inplace=True) # fill ignored cells with zeros # add dropkick coefficients to genes used in model (hvgs from `a`) adata.var["dropkick_coef"] = np.nan # initialize gene coefs as NaNs adata.var.loc[ a.var_names[a.var.highly_variable], "dropkick_coef" ] = rc_.coef_.squeeze() # 5) save model hyperparameters 
in .uns adata.uns["dropkick_thresholds"] = adata_thresh adata.uns["dropkick_args"] = { "n_ambient": n_ambient, "n_hvgs": n_hvgs, "metrics": metrics, "thresh_methods": thresh_methods, "alphas": alphas, "chosen_alpha": alpha_, "lambda_path": rc_.lambda_path_, "chosen_lambda": lambda_, "coef_path": rc_.coef_path_.squeeze().T, "cv_mean_score": rc_.cv_mean_score_, "cv_standard_error": rc_.cv_standard_error_, "max_iter": max_iter, "seed": seed, } # save command-line arguments to .uns for reference print("Done!\n") return rc_
- def plot_thresh_obs(adata, thresholds, bins=40, axes=None, save_to=None, verbose=True)
- 
Plots automated thresholding on metrics in `adata.obs` as output by `auto_thresh_obs()`

Parameters
- adata : anndata.AnnData
- object containing unfiltered scRNA-seq data
- thresholds:- dict
- output of auto_thresh_obs()function
- bins:- int, optional- (default=40)
- number of bins for histogram
- axes:- matplotlib.axes.Axes, optional- (default=None)
- single ax or list of axes objects corresponding to number of thresholds to
plot. ignored if save_tois not None.
- save_to:- str, optional- (default=None)
- path to .pngfile for saving figure; returns figure by default
- verbose:- bool, optional- (default=True)
- print updates to console
 Returns- plot of distributions of obs_colsin thresholds dictionary with corresponding
- thresholds
 Expand source codedef plot_thresh_obs(adata, thresholds, bins=40, axes=None, save_to=None, verbose=True): """ Plots automated thresholding on metrics in `adata.obs` as output by `auto_thresh_obs()` Parameters ---------- adata : anndata.AnnData object containing unfiltered scRNA-seq data thresholds : dict output of `auto_thresh_obs()` function bins : int, optional (default=40) number of bins for histogram axes : matplotlib.axes.Axes, optional (default=None) single ax or list of axes objects corresponding to number of thresholds to plot. ignored if `save_to` is not None. save_to : str, optional (default=None) path to `.png` file for saving figure; returns figure by default verbose : bool, optional (default=True) print updates to console Returns ------- plot of distributions of `obs_cols` in thresholds dictionary with corresponding thresholds """ if save_to or not axes: fig, axes = plt.subplots( ncols=len(thresholds), nrows=1, figsize=(len(thresholds) * 4, 4), sharey=True, ) # if multiple plots, loop through axes if len(thresholds) > 1: axes[0].set_ylabel("cells") for i in range(len(thresholds)): axes[i].hist(adata.obs[list(thresholds.keys())[i]], bins=bins) if isinstance(list(thresholds.values())[i]["thresh"], np.ndarray): [ axes[i].axvline(_x, color="r") for _x in list(thresholds.values())[i]["thresh"] ] else: axes[i].axvline(list(thresholds.values())[i], color="r") axes[i].set_title(list(thresholds.keys())[i]) # if single plot, only one set of axes in subplot else: axes.set_ylabel("cells") axes.hist(adata.obs[list(thresholds.keys())[0]], bins=bins) if isinstance(list(thresholds.values())[0]["thresh"], np.ndarray): [ axes.axvline(_x, color="r") for _x in list(thresholds.values())[0]["thresh"] ] else: axes.axvline(list(thresholds.values())[0]["thresh"], color="r") axes.set_title(list(thresholds.keys())[0]) plt.tight_layout() if save_to: if verbose: print("Saving threshold plot to {}".format(save_to)) fig.savefig(save_to, dpi=200) elif not axes: return fig
- def qc_summary(adata, mito=True, ambient=True, genes=True, fig=None, save_to=None, verbose=True)
- 
Plots summary of counts distribution and ambient genes Parameters- adata:- anndata.AnnData
- object containing unfiltered scRNA-seq data
- mito:- bool, optional- (default=True)
- show pct_counts_mitoas points
- ambient:- bool, optional- (default=True)
- show pct_counts_ambientas points
- genes:- bool, optional- (default=True)
- show n_genes_by_countsas points
- fig:- matplotlib.figure, optional- (default=None)
- figure object for plotting. if None, create new.
- save_to:- str, optional- (default=None)
- path to .pngfile for saving figure; returns figure by default
- verbose:- bool, optional- (default=True)
- print updates to console
 Returns- fig:- matplotlib.figure
- counts_plot(),- sc.pl.highest_expr_genes(), and- dropout_plot()in single figure
 Expand source codedef qc_summary( adata, mito=True, ambient=True, genes=True, fig=None, save_to=None, verbose=True ): """ Plots summary of counts distribution and ambient genes Parameters ---------- adata : anndata.AnnData object containing unfiltered scRNA-seq data mito : bool, optional (default=True) show `pct_counts_mito` as points ambient : bool, optional (default=True) show `pct_counts_ambient` as points genes : bool, optional (default=True) show `n_genes_by_counts` as points fig : matplotlib.figure, optional (default=None) figure object for plotting. if None, create new. save_to : str, optional (default=None) path to `.png` file for saving figure; returns figure by default verbose : bool, optional (default=True) print updates to console Returns ------- fig : matplotlib.figure `counts_plot()`, `sc.pl.highest_expr_genes()`, and `dropout_plot()` in single figure """ if not fig: fig = plt.figure(figsize=(14, 7)) # arrange axes as subplots gs = gridspec.GridSpec(nrows=3, ncols=3, figure=fig) ax1 = plt.subplot(gs[0:4, 0:2]) ax2 = plt.subplot(gs[0:2, 2]) # add plots to axes counts_plot(adata, ax=ax1, show=False, genes=genes, ambient=ambient, mito=mito) dropout_plot(adata, ax=ax2, show=False) gs.tight_layout(figure=fig, w_pad=1.8) # return if save_to is not None: if verbose: print("Saving QC plot to {}".format(save_to)) fig.savefig(save_to, dpi=200) else: return fig
- def recipe_dropkick(adata, filter=True, min_genes=50, calc_metrics=True, mito_names='^mt-|^MT-', n_ambient=10, target_sum=None, n_hvgs=2000, X_final='raw_counts', verbose=True)
- 
dropkick preprocessing recipe Parameters- adata:- AnnData.AnnData
- object with raw counts data in .X
- filter:- bool, optional- (default=True)
- remove cells with less than min_genes detected and genes with zero total counts
- min_genes:- int, optional- (default=50)
- threshold for minimum genes detected. ignored if filter==False.
- calc_metrics:- bool, optional- (default=True)
- if False, do not calculate metrics in .obs/.var
- mito_names:- str, optional- (default="^mt-|^MT-")
- substring encompassing mitochondrial gene names for calculation of mito
expression. ignored if calc_metrics==False.
- n_ambient:- int, optional- (default=10)
- number of ambient genes to call (top genes by cells). ignored if
calc_metrics==False.
- target_sum:- int, optional- (default=None)
- total sum of counts for each cell prior to arcsinh or log1p transformations. if None, use median counts.
- n_hvgs:- int, optional- (default=2000)
- number of HVGs to calculate using Seurat method. if None, do not calculate HVGs.
- X_final:- str, optional- (default="raw_counts")
- which normalization layer should be left in .Xslot? ("raw_counts", "arcsinh_norm", "log1p_norm")
- verbose:- bool, optional- (default=True)
- print updates to the console
 Returns- adata:- AnnData.AnnData
- updated object includes:
- useful .obsand.varcolumns (ifcalc_metrics==True) ("total_counts", "pct_counts_mito", "n_genes_by_counts", etc.) - raw counts (adata.layers["raw_counts"]) - arcsinh-transformed normalized counts (adata.layers["arcsinh_norm"]) - highly variable genes if desired (adata.var["highly_variable"])
Expand source code

def recipe_dropkick(
    adata,
    filter=True,
    min_genes=50,
    calc_metrics=True,
    mito_names="^mt-|^MT-",
    n_ambient=10,
    target_sum=None,
    n_hvgs=2000,
    X_final="raw_counts",
    verbose=True,
):
    """
    dropkick preprocessing recipe

    Parameters
    ----------
    adata : AnnData.AnnData
        object with raw counts data in `.X`
    filter : bool, optional (default=True)
        remove cells with less than min_genes detected and genes with zero
        total counts
    min_genes : int, optional (default=50)
        threshold for minimum genes detected. ignored if `filter`==False.
    calc_metrics : bool, optional (default=True)
        if False, do not calculate metrics in `.obs`/`.var`
    mito_names : str, optional (default="^mt-|^MT-")
        substring encompassing mitochondrial gene names for calculation of
        mito expression. ignored if `calc_metrics`==False.
    n_ambient : int, optional (default=10)
        number of ambient genes to call (top genes by cells). ignored if
        `calc_metrics`==False.
    target_sum : int, optional (default=None)
        total sum of counts for each cell prior to arcsinh or log1p
        transformations. if None, use median counts.
    n_hvgs : int, optional (default=2000)
        number of HVGs to calculate using Seurat method. if None, do not
        calculate HVGs.
    X_final : str, optional (default="raw_counts")
        which normalization layer should be left in `.X` slot?
        ("raw_counts", "arcsinh_norm", "log1p_norm")
    verbose : bool, optional (default=True)
        print updates to the console

    Returns
    -------
    adata : AnnData.AnnData
        updated object includes:
        - useful `.obs` and `.var` columns (if `calc_metrics`==True)
          ("total_counts", "pct_counts_mito", "n_genes_by_counts", etc.)
        - raw counts (`adata.layers["raw_counts"]`)
        - arcsinh-transformed normalized counts (`adata.layers["arcsinh_norm"]`)
        - highly variable genes if desired (`adata.var["highly_variable"]`)
    """
    if filter:
        # remove cells and genes with zero total counts
        orig_shape = adata.shape
        sc.pp.filter_cells(adata, min_genes=min_genes)
        sc.pp.filter_genes(adata, min_counts=1)
        if verbose:
            if adata.shape[0] != orig_shape[0]:
                print(
                    "Ignoring {} barcodes with less than {} genes detected".format(
                        orig_shape[0] - adata.shape[0], min_genes
                    )
                )
            if adata.shape[1] != orig_shape[1]:
                print(
                    "Ignoring {} genes with zero total counts".format(
                        orig_shape[1] - adata.shape[1]
                    )
                )
        # drop the helper columns scanpy's filters leave behind
        adata.obs.drop(columns=["n_genes"], inplace=True)
        adata.var.drop(columns=["n_counts"], inplace=True)
    # store raw counts before manipulation
    adata.layers["raw_counts"] = adata.X.copy()
    if calc_metrics:
        # identify mitochondrial genes
        adata.var["mito"] = adata.var_names.str.contains(mito_names)
        # identify putative ambient genes by lowest dropout pct (top n_ambient)
        # dropout pct = fraction of cells where the gene is not detected
        adata.var["pct_dropout_by_counts"] = np.array(
            (1 - (adata.X.astype(bool).sum(axis=0) / adata.n_obs)) * 100
        ).squeeze()
        lowest_dropout = adata.var.pct_dropout_by_counts.nsmallest(n=n_ambient).min()
        highest_dropout = adata.var.pct_dropout_by_counts.nsmallest(n=n_ambient).max()
        # NOTE: ties at the cutoff can flag more than n_ambient genes
        adata.var["ambient"] = adata.var.pct_dropout_by_counts <= highest_dropout
        # reorder genes by dropout rate
        adata = adata[:, np.argsort(adata.var.pct_dropout_by_counts)].copy()
        if verbose:
            print(
                "Top {} ambient genes have dropout rates between {} and {} percent:\n\t{}".format(
                    len(adata.var_names[adata.var.ambient]),
                    round(lowest_dropout, 3),
                    round(highest_dropout, 3),
                    adata.var_names[adata.var.ambient].tolist(),
                )
            )
        # calculate standard qc .obs and .var
        sc.pp.calculate_qc_metrics(
            adata, qc_vars=["mito", "ambient"], inplace=True, percent_top=None
        )
        # remove pesky unneeded columns from .obs and .var
        adata.obs.drop(
            columns=adata.obs.columns[adata.obs.columns.str.startswith("log1p_")].union(
                adata.obs.columns[adata.obs.columns.str.contains("total_counts_")]
            ),
            inplace=True,
        )
        adata.var.drop(
            columns=adata.var.columns[adata.var.columns.str.startswith("log1p_")],
            inplace=True,
        )
        # other arcsinh-transformed metrics
        adata.obs["arcsinh_total_counts"] = np.arcsinh(adata.obs["total_counts"])
        adata.obs["arcsinh_n_genes_by_counts"] = np.arcsinh(
            adata.obs["n_genes_by_counts"]
        )
    # normalize counts before transforming
    # NOTE(review): `layers`/`layer_norm` kwargs were removed from
    # sc.pp.normalize_total in newer scanpy — confirm pinned version
    sc.pp.normalize_total(adata, target_sum=target_sum, layers=None, layer_norm=None)
    adata.layers["norm_counts"] = adata.X.copy()
    # HVGs
    if n_hvgs is not None:
        if verbose:
            print("Determining {} highly variable genes".format(n_hvgs))
        # log1p transform for HVGs (adata.layers["log1p_norm"])
        sc.pp.log1p(adata)
        adata.layers["log1p_norm"] = adata.X.copy()  # save to .layers
        sc.pp.highly_variable_genes(
            adata, n_top_genes=n_hvgs, n_bins=20, flavor="seurat"
        )
        adata.var.drop(columns=["dispersions", "dispersions_norm"], inplace=True)
    # arcsinh-transform normalized counts (adata.layers["arcsinh_norm"])
    adata.X = np.arcsinh(adata.layers["norm_counts"])
    sc.pp.scale(adata)  # scale genes for feeding into model
    adata.layers[
        "arcsinh_norm"
    ] = adata.X.copy()  # save arcsinh scaled counts in .layers
    # remove unneeded stuff
    del adata.layers["norm_counts"]
    # set .X as desired for downstream processing; default raw_counts
    if (X_final != "raw_counts") & verbose:
        print("Setting {} layer to .X".format(X_final))
    adata.X = adata.layers[X_final].copy()
    return adata
- def score_plot(adata, metrics=['arcsinh_n_genes_by_counts', 'pct_counts_ambient'], save_to=None, verbose=True)
- 
Plots scatter of barcodes across two metrics, with points colored by `dropkick_score`. Shows automated thresholding on metrics in `adata.obs` as output by `auto_thresh_obs()`

Parameters
- adata : anndata.AnnData
- object generated from dropkick()
- metrics:- listof- str, optional
- (default=["arcsinh_n_genes_by_counts","pct_counts_ambient"])
- names of metrics to plot scatter and histograms for
- save_to:- str, optional- (default=None)
- path to .pngfile for saving figure; returns figure if None
- verbose:- bool, optional- (default=True)
- print updates to console
 Returns- g:- seaborn.jointgrid
- joint plot of metric distributions colored by dropkick_scoreand containing corresponding training thresholds
 if save_tois not None, write to file instead of returninggobject.Expand source codedef score_plot( adata, metrics=["arcsinh_n_genes_by_counts", "pct_counts_ambient"], save_to=None, verbose=True, ): """ Plots scatter of barcodes across two metrics, with points colored by `dropkick_score`. Shows automated thresholding on metrics in `adata.obs` as output by `auto_thresh_obs()` Parameters ---------- adata : anndata.AnnData object generated from `dropkick` metrics : list of str, optional (default=["arcsinh_n_genes_by_counts","pct_counts_ambient"]) names of metrics to plot scatter and histograms for save_to : str, optional (default=None) path to `.png` file for saving figure; returns figure if None verbose : bool, optional (default=True) print updates to console Returns ------- g : seaborn.jointgrid joint plot of metric distributions colored by `dropkick_score` and containing corresponding training thresholds if `save_to` is not None, write to file instead of returning `g` object. """ # initialize joint plot object g = sns.jointplot( x=adata.obs[metrics[0]], y=adata.obs[metrics[1]], height=7, space=0, color="k", marginal_kws=dict(bins=40), ) # change to focus on scatter plot g.ax_joint.cla() plt.sca(g.ax_joint) # set axes labels plt.xlabel(metrics[0], fontsize=12) plt.ylabel(metrics[1], fontsize=12) # scatter plot, color by dropkick_score points = plt.scatter( x=adata.obs[metrics[0]], y=adata.obs[metrics[1]], c=adata.obs["dropkick_score"], s=25, cmap="coolwarm_r", alpha=0.5, ) plt.tick_params(axis="both", which="major", labelsize=12) # plot training thresholds on scatter if metrics[0] in adata.uns["dropkick_thresholds"]: if isinstance( adata.uns["dropkick_thresholds"][metrics[0]]["thresh"], np.ndarray ): [ plt.axvline(_x, linestyle="--", color="k", linewidth=2) for _x in adata.uns["dropkick_thresholds"][metrics[0]]["thresh"] ] else: plt.axvline( adata.uns["dropkick_thresholds"][metrics[0]]["thresh"], linestyle="--", color="k", linewidth=2, ) if metrics[1] in 
adata.uns["dropkick_thresholds"]: if isinstance( adata.uns["dropkick_thresholds"][metrics[1]]["thresh"], np.ndarray ): [ plt.axhline(_x, linestyle="--", color="k", linewidth=2) for _x in adata.uns["dropkick_thresholds"][metrics[1]]["thresh"] ] else: plt.axhline( adata.uns["dropkick_thresholds"][metrics[1]]["thresh"], linestyle="--", color="k", linewidth=2, ) # change focus to x margin plot to continue threshold line if metrics[0] in adata.uns["dropkick_thresholds"]: plt.sca(g.ax_marg_x) if isinstance( adata.uns["dropkick_thresholds"][metrics[0]]["thresh"], np.ndarray ): [ plt.axvline(_x, linestyle="--", color="k", linewidth=2) for _x in adata.uns["dropkick_thresholds"][metrics[0]]["thresh"] ] else: plt.axvline( adata.uns["dropkick_thresholds"][metrics[0]]["thresh"], linestyle="--", color="k", linewidth=2, ) # change focus to y margin plot to continue threshold line if metrics[1] in adata.uns["dropkick_thresholds"]: plt.sca(g.ax_marg_y) if isinstance( adata.uns["dropkick_thresholds"][metrics[1]]["thresh"], np.ndarray ): [ plt.axhline(_x, linestyle="--", color="k", linewidth=2) for _x in adata.uns["dropkick_thresholds"][metrics[1]]["thresh"] ] else: plt.axhline( adata.uns["dropkick_thresholds"][metrics[1]]["thresh"], linestyle="--", color="k", linewidth=2, ) # add colorbar inside scatter axes axins1 = inset_axes( g.ax_joint, width="40%", # width = 40% of parent_bbox width height="4%", # height : 4% loc="upper right", ) cbar = plt.colorbar( points, cax=axins1, drawedges=False, label="dropkick_score", orientation="horizontal", ticks=[0.1, 0.5, 0.9], ) cbar.ax.tick_params(labelsize=12) cbar.solids.set_edgecolor("face") # add histogram of scores on top of colorbar axins2 = inset_axes( g.ax_joint, width="40%", # width = 40% of parent_bbox width height="4%", # height : 4% loc="upper right", ) _ = axins2.hist( # only include scores > 0 in hist so you can see distribution adata.obs.loc[adata.obs["dropkick_score"] > 0.0, "dropkick_score"], bins=40, color="k", alpha=0.7, 
histtype="step", ) axins2.axis("off") if save_to is not None: if verbose: print("Saving score plot to {}".format(save_to)) g.savefig(save_to, dpi=200) else: return g