Package dropkick

Automated cell filtering and QC of single-cell RNA sequencing data

Expand source code
# -*- coding: utf-8 -*-
"""
Automated cell filtering and QC of single-cell RNA sequencing data
"""
# re-export the public API at the package root so users can do
# `import dropkick; dropkick.dropkick(adata)`
from .api import (
    dropkick,
    recipe_dropkick,
    plot_thresh_obs,
    coef_inventory,
    coef_plot,
    score_plot,
)
from .qc import qc_summary

# names exported by `from dropkick import *`
__all__ = [
    "dropkick",
    "recipe_dropkick",
    "plot_thresh_obs",
    "coef_inventory",
    "coef_plot",
    "score_plot",
    "qc_summary",
]

# versioneer-style version discovery; drop the helper from the package
# namespace once __version__ is populated
from ._version import get_versions

__version__ = get_versions()["version"]
del get_versions

Sub-modules

dropkick.api

Functions for cell filtering of scRNA-seq data via dropkick

dropkick.errors

Error reporting for dropkick model

dropkick.logistic

Logistic regression model functions for dropkick

dropkick.qc

Automated ambient gene testing and counts data QC

dropkick.scorer

Modified version of sklearn.metrics.scorer to allow for scoring the entire lambda path of a glmnet model …

dropkick.util

Utility functions for scoring dropkick logistic regression model

Functions

def coef_inventory(adata, n=10)

Returns highest and lowest coefficient values from logistic regression model, along with sparsity

Parameters

adata : anndata.AnnData
object generated from dropkick()
n : int, optional (default=10)
number of genes to show at top and bottom of coefficient list

Returns

prints top and bottom n genes by their coefficient values to console

Expand source code
def coef_inventory(adata, n=10):
    """
    Returns highest and lowest coefficient values from logistic regression model,
    along with sparsity

    Parameters
    ----------

    adata : anndata.AnnData
        object generated from `dropkick`
    n : int, optional (default=10)
        number of genes to show at top and bottom of coefficient list

    Returns
    -------

    prints top and bottom `n` genes by their coefficient values to console
    """
    # boolean mask of genes that received a fitted coefficient.
    # NOTE: use `~` for boolean negation — unary `-` on a boolean Series
    # is deprecated and raises TypeError in modern pandas.
    has_coef = ~adata.var.dropkick_coef.isna()
    print("\nTop HVGs by coefficient value (good cells):")
    print(adata.var.loc[has_coef, "dropkick_coef"].nlargest(n))
    print("\nBottom HVGs by coefficient value (bad droplets):")
    print(adata.var.loc[has_coef, "dropkick_coef"].nsmallest(n))
    # sparsity = fraction of fitted coefficients that are exactly zero
    n_zero = (adata.var.dropkick_coef == 0).sum()
    n_coef = has_coef.sum()
    sparsity = round((n_zero / n_coef) * 100, 3)
    print(
        "\n{} coefficients equal to zero. Model sparsity: {} %\n".format(
            n_zero, sparsity
        )
    )
def coef_plot(adata, save_to=None, verbose=True)

Plots dropkick coefficient values and cross validation (CV) scores for tested values of lambda (lambda_path)

Parameters

adata : anndata.AnnData
object generated from dropkick()
save_to : str, optional (default=None)
path to .png file for saving figure
verbose : bool, optional (default=True)
print updates to console

Returns

fig : matplotlib.figure
plot of CV scores (mean +/- SEM) and coefficient values (coef_path) versus log(lambda_path). includes indicator of chosen lambda value.

if save_to is not None, write to file instead of returning fig object.

Expand source code
def coef_plot(adata, save_to=None, verbose=True):
    """
    Plots dropkick coefficient values and cross validation (CV) scores for tested
    values of lambda (`lambda_path`)

    Parameters
    ----------

    adata : anndata.AnnData
        object generated from `dropkick`
    save_to : str, optional (default=None)
        path to `.png` file for saving figure
    verbose : bool, optional (default=True)
        print updates to console

    Returns
    -------

    fig : matplotlib.figure
        plot of CV scores (mean +/- SEM) and coefficient values (`coef_path`) versus
        log(`lambda_path`). includes indicator of chosen lambda value.

    if `save_to` is not None, write to file instead of returning `fig` object.
    """
    # NOTE(review): cm.get_cmap is deprecated in matplotlib >= 3.7 — kept for
    # backward compatibility; consider matplotlib.colormaps["coolwarm"]
    cmap = cm.get_cmap("coolwarm")
    fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(7, 7), sharex=True)
    # boolean mask of genes with a fitted coefficient.
    # BUG FIX: use `~` for boolean negation — unary `-` on a boolean Series
    # is deprecated and raises TypeError in modern pandas.
    has_coef = ~adata.var.dropkick_coef.isna()
    # plot coefficient values versus log(lambda) on top axis
    axes[0].set_ylabel("Coefficient Value", fontsize=12)
    axes[0].plot(
        np.log(adata.uns["dropkick_args"]["lambda_path"]),
        adata.uns["dropkick_args"]["coef_path"],
        alpha=0.5,
        linewidth=2,
    )
    axes[0].tick_params(axis="both", which="major", labelsize=12)
    # plot vertical line at chosen lambda value and add to legend
    axes[0].axvline(
        np.log(adata.uns["dropkick_args"]["chosen_lambda"]),
        label=None,
        color="k",
        ls="--",
        linewidth=2,
    )
    # plot total model sparsity and top three genes by coefficient value
    # get range of values for positioning text
    val_range = (
        adata.uns["dropkick_args"]["coef_path"].max()
        - adata.uns["dropkick_args"]["coef_path"].min()
    )
    # put model sparsity on top
    n_zero = (adata.var.dropkick_coef == 0).sum()
    n_coef = has_coef.sum()
    sparsity = round((n_zero / n_coef) * 100, 2)
    axes[0].text(
        x=np.log(adata.uns["dropkick_args"]["chosen_lambda"]),
        y=adata.var.dropkick_coef.max() + (0.24 * val_range),
        s=" Sparsity: {} %".format(sparsity),
        fontsize=12,
        color="k",
    )
    # add top three genes by coefficient value as annotation
    [
        axes[0].text(
            x=np.log(adata.uns["dropkick_args"]["chosen_lambda"]),
            y=adata.var.dropkick_coef.max()
            + (0.18 * val_range)
            - (0.06 * val_range * x),
            s=" "
            + adata.var.loc[has_coef, "dropkick_coef"]
            .nlargest(3)
            .index[x],
            fontsize=12,
            fontstyle="italic",
            color="g",
        )
        for x in range(3)
    ]
    # add bottom three genes by coefficient value as annotation
    [
        axes[0].text(
            x=np.log(adata.uns["dropkick_args"]["chosen_lambda"]),
            y=adata.var.dropkick_coef.min()
            - (0.24 * val_range)
            + (0.06 * val_range * x),
            s=" "
            + adata.var.loc[has_coef, "dropkick_coef"]
            .nsmallest(3)
            .index[x],
            fontsize=12,
            fontstyle="italic",
            color=cmap(1.0),
        )
        for x in range(3)
    ]

    # plot CV scores versus log(lambda) on bottom axis
    # (-2 * log-loss score gives binomial deviance)
    axes[1].plot(
        np.log(adata.uns["dropkick_args"]["lambda_path"]),
        -2 * adata.uns["dropkick_args"]["cv_mean_score"],
        label="Mean Deviance",
        color=cmap(0.0),
        linewidth=2,
    )
    axes[1].fill_between(
        np.log(adata.uns["dropkick_args"]["lambda_path"]),
        y1=(-2 * adata.uns["dropkick_args"]["cv_mean_score"])
        - 2 * adata.uns["dropkick_args"]["cv_standard_error"],
        y2=(-2 * adata.uns["dropkick_args"]["cv_mean_score"])
        + 2 * adata.uns["dropkick_args"]["cv_standard_error"],
        color=cmap(0.0),
        alpha=0.3,
        label="Deviance SEM",
    )
    # plot vertical line at chosen lambda value and add to legend
    axes[1].axvline(
        np.log(adata.uns["dropkick_args"]["chosen_lambda"]),
        label="Chosen lambda: {:.2e}".format(
            adata.uns["dropkick_args"]["chosen_lambda"][0]
        ),
        color="k",
        ls="--",
        linewidth=2,
    )
    axes[1].set_xlabel("Log (lambda)", fontsize=12)
    axes[1].set_ylabel("Binomial Deviance", fontsize=12)
    axes[1].tick_params(axis="both", which="major", labelsize=12)
    axes[1].legend(fontsize=12)

    plt.tight_layout()
    if save_to:
        if verbose:
            print("Saving coefficient plot to {}".format(save_to))
        fig.savefig(save_to, dpi=200)
    else:
        return fig
def dropkick(adata, min_genes=50, mito_names='^mt-|^MT-', n_ambient=10, n_hvgs=2000, metrics=['arcsinh_n_genes_by_counts'], thresh_methods=['multiotsu'], directions=['above'], alphas=[0.1], max_iter=2000, n_jobs=2, seed=18, verbose=True)

Generates logistic regression model of cell quality

Parameters

adata : anndata.AnnData
object containing unfiltered, raw scRNA-seq counts in .X layer
min_genes : int, optional (default=50)
threshold for minimum genes detected. ignores all cells with less than min_genes (dropkick label = 0).
mito_names : str, optional (default="^mt-|^MT-")
substring encompassing mitochondrial gene names for calculation of mito expression
n_ambient : int, optional (default=10)
number of ambient genes to call. top genes by cells.
n_hvgs : int or None, optional (default=2000)
number of HVGs to calculate using Seurat method. if None, do not calculate HVGs
metrics : list of str, optional (default="arcsinh_n_genes_by_counts")
name of column(s) to threshold from adata.obs
thresh_methods : list of str {"otsu","multiotsu","li","mean"}, optional (default="multiotsu")
automated thresholding method(s) corresponding to each element in metrics
directions : list of str {"above","below"}, optional (default="above")
which direction to keep during training (dropkick label = 1) corresponding to each element in metrics
alphas : list of float, optional (default=0.1)
alpha value(s) to test using glmnet with n-fold cross validation
max_iter : int, optional (default=2000)
number of iterations for glmnet optimization
n_jobs : int, optional (default=2)
number of threads for cross validation by glmnet
seed : int, optional (default=18)
random state for cross validation by glmnet
verbose : bool, optional (default=True)
verbosity for glmnet training and warnings

Returns

rc : LogisticRegression
trained logistic regression classifier

updates adata inplace to include "train", "dropkick_score", and "dropkick_label" columns in .obs

Expand source code
def dropkick(
    adata,
    min_genes=50,
    mito_names="^mt-|^MT-",
    n_ambient=10,
    n_hvgs=2000,
    metrics=None,
    thresh_methods=None,
    directions=None,
    alphas=None,
    max_iter=2000,
    n_jobs=2,
    seed=18,
    verbose=True,
):
    """
    Generates logistic regression model of cell quality

    Parameters
    ----------

    adata : anndata.AnnData
        object containing unfiltered, raw scRNA-seq counts in `.X` layer
    min_genes : int, optional (default=50)
        threshold for minimum genes detected. ignores all cells with less than
        min_genes (dropkick label = 0).
    mito_names : str, optional (default="^mt-|^MT-")
        substring encompassing mitochondrial gene names for calculation of mito
        expression
    n_ambient : int, optional (default=10)
        number of ambient genes to call. top genes by cells.
    n_hvgs : int or None, optional (default=2000)
        number of HVGs to calculate using Seurat method. if None, do not calculate
        HVGs
    metrics : list of str, optional (default=["arcsinh_n_genes_by_counts"])
        name of column(s) to threshold from `adata.obs`
    thresh_methods : list of str {"otsu","multiotsu","li","mean"}, optional
    (default=["multiotsu"])
        automated thresholding method(s) corresponding to each element in `metrics`
    directions : list of str {"above","below"}, optional (default=["above"])
        which direction to keep during training (dropkick label = 1) corresponding
        to each element in `metrics`
    alphas : list of float, optional (default=[0.1])
        alpha value(s) to test using glmnet with n-fold cross validation
    max_iter : int, optional (default=2000)
        number of iterations for glmnet optimization
    n_jobs : int, optional (default=2)
        number of threads for cross validation by glmnet
    seed : int, optional (default=18)
        random state for cross validation by glmnet
    verbose : bool, optional (default=True)
        verbosity for glmnet training and warnings

    Returns
    -------

    rc : LogisticRegression
        trained logistic regression classifier

    updates `adata` inplace to include "train", "dropkick_score", and
    "dropkick_label" columns in `.obs`
    """
    # resolve list defaults here rather than in the signature to avoid the
    # shared-mutable-default-argument pitfall
    if metrics is None:
        metrics = ["arcsinh_n_genes_by_counts"]
    if thresh_methods is None:
        thresh_methods = ["multiotsu"]
    if directions is None:
        directions = ["above"]
    if alphas is None:
        alphas = [0.1]

    # 0) preprocess counts and calculate required QC metrics
    a = adata.copy()  # make copy of anndata before manipulating
    a = recipe_dropkick(
        a,
        filter=True,
        min_genes=min_genes,
        calc_metrics=True,
        mito_names=mito_names,
        n_ambient=n_ambient,
        target_sum=None,
        n_hvgs=n_hvgs,
        X_final="arcsinh_norm",
        verbose=verbose,
    )

    # 1) threshold chosen heuristics using automated methods
    if verbose:
        print("Thresholding on heuristics for training labels:\n\t{}".format(metrics))
    # convert bare-string args to single-element lists
    if isinstance(metrics, str):
        metrics = [metrics]
    if isinstance(thresh_methods, str):
        thresh_methods = [thresh_methods]
    if isinstance(directions, str):
        directions = [directions]
    adata_thresh = auto_thresh_obs(
        a, methods=thresh_methods, obs_cols=metrics, directions=directions
    )

    # 2) create labels from combination of thresholds
    a = filter_thresh_obs(
        a, adata_thresh, obs_cols=metrics, inclusive=True, name="train", verbose=verbose
    )

    X = a[:, a.var.highly_variable].X.copy()  # X for testing
    y = a.obs["train"].copy(deep=True)  # final y is "train" labels from step 2
    if verbose:
        print("Training dropkick with alphas:\n\t{}".format(alphas))

    if len(alphas) > 1:
        # 3.1) cross-validation to choose alpha and lambda values
        cv_scores = {"rc": [], "lambda": [], "alpha": [], "score": []}  # dictionary o/p
        for alpha in alphas:
            rc = LogitNet(
                alpha=alpha,
                n_lambda=100,
                standardize=False,
                scoring="log_loss",
                cut_point=1.0,
                n_splits=5,
                max_iter=max_iter,
                n_jobs=n_jobs,
                random_state=seed,
                verbose=verbose,
            )
            rc.fit(adata=a, y=y, n_hvgs=n_hvgs)
            cv_scores["rc"].append(rc)
            cv_scores["alpha"].append(alpha)
            cv_scores["lambda"].append(rc.lambda_best_)
            cv_scores["score"].append(rc.score(X, y, lamb=rc.lambda_best_))
        # determine optimal lambda and alpha values by accuracy score
        # (compute the argmax once instead of three times)
        best = cv_scores["score"].index(max(cv_scores["score"]))
        lambda_ = cv_scores["lambda"][best]  # choose lambda value
        alpha_ = cv_scores["alpha"][best]  # choose l1 ratio
        rc_ = cv_scores["rc"][best]  # choose classifier
        if verbose:
            print(
                "Chosen lambda value:\n\t{}\nChosen alpha value:\n\t{}".format(
                    lambda_, alpha_
                )
            )
    else:
        # 3.2) train model with single alpha value
        rc_ = LogitNet(
            alpha=alphas[0],
            n_lambda=100,
            standardize=False,
            scoring="log_loss",
            cut_point=1.0,
            n_splits=5,
            max_iter=max_iter,
            n_jobs=n_jobs,
            random_state=seed,
            verbose=verbose,
        )
        rc_.fit(adata=a, y=y, n_hvgs=n_hvgs)
        if verbose:
            print("Chosen lambda value:\n\t{}".format(rc_.lambda_best_))
        lambda_, alpha_ = rc_.lambda_best_, alphas[0]

    # 4) use model to assign scores and labels to original adata
    if verbose:
        print("Assigning scores and labels")
    if "dropkick_score" in adata.obs.columns:
        print("Warning: Overwriting existing dropkick scores in .obs")

    # scores: NaN for barcodes filtered out in step 0, then fill with 0.
    # assign back to the column instead of chained `fillna(inplace=True)`,
    # which is deprecated in modern pandas.
    adata.obs["dropkick_score"] = np.nan  # initialize dropkick_score as NaNs
    adata.obs.loc[a.obs_names, "dropkick_score"] = rc_.predict_proba(X)[:, 1]
    adata.obs["dropkick_score"] = adata.obs["dropkick_score"].fillna(0)

    if "dropkick_label" in adata.obs.columns:
        print("Warning: Overwriting existing dropkick labels in .obs")

    adata.obs["dropkick_label"] = np.nan  # initialize dropkick_label as NaNs
    adata.obs.loc[a.obs_names, "dropkick_label"] = rc_.predict(X)
    # fill ignored cells with zeros, then convert to categorical strings
    adata.obs["dropkick_label"] = (
        adata.obs["dropkick_label"].fillna(0).astype(bool).astype(str).astype("category")
    )

    # add heuristics to .obs
    for metric in metrics:
        adata.obs[metric] = np.nan  # initialize as NaNs
        adata.obs.loc[a.obs_names, metric] = a.obs[metric]
        adata.obs[metric] = adata.obs[metric].fillna(0)  # fill ignored cells with zeros

    # add dropkick coefficients to genes used in model (hvgs from `a`)
    adata.var["dropkick_coef"] = np.nan  # initialize gene coefs as NaNs
    adata.var.loc[
        a.var_names[a.var.highly_variable], "dropkick_coef"
    ] = rc_.coef_.squeeze()

    # 5) save model hyperparameters in .uns
    adata.uns["dropkick_thresholds"] = adata_thresh
    adata.uns["dropkick_args"] = {
        "n_ambient": n_ambient,
        "n_hvgs": n_hvgs,
        "metrics": metrics,
        "thresh_methods": thresh_methods,
        "alphas": alphas,
        "chosen_alpha": alpha_,
        "lambda_path": rc_.lambda_path_,
        "chosen_lambda": lambda_,
        "coef_path": rc_.coef_path_.squeeze().T,
        "cv_mean_score": rc_.cv_mean_score_,
        "cv_standard_error": rc_.cv_standard_error_,
        "max_iter": max_iter,
        "seed": seed,
    }  # save command-line arguments to .uns for reference

    if verbose:
        print("Done!\n")
    return rc_
def plot_thresh_obs(adata, thresholds, bins=40, axes=None, save_to=None, verbose=True)

Plots automated thresholding on metrics in adata.obs as output by auto_thresh_obs()

Parameters

adata : anndata.AnnData
object containing unfiltered scRNA-seq data
thresholds : dict
output of auto_thresh_obs() function
bins : int, optional (default=40)
number of bins for histogram
axes : matplotlib.axes.Axes, optional (default=None)
single ax or list of axes objects corresponding to number of thresholds to plot. ignored if save_to is not None.
save_to : str, optional (default=None)
path to .png file for saving figure; returns figure by default
verbose : bool, optional (default=True)
print updates to console

Returns

plot of distributions of obs_cols in thresholds dictionary with corresponding
thresholds
 
Expand source code
def plot_thresh_obs(adata, thresholds, bins=40, axes=None, save_to=None, verbose=True):
    """
    Plots automated thresholding on metrics in `adata.obs` as output by
    `auto_thresh_obs()`

    Parameters
    ----------

    adata : anndata.AnnData
        object containing unfiltered scRNA-seq data
    thresholds : dict
        output of `auto_thresh_obs()` function
    bins : int, optional (default=40)
        number of bins for histogram
    axes : matplotlib.axes.Axes, optional (default=None)
        single ax or list of axes objects corresponding to number of thresholds to
        plot. ignored if `save_to` is not None.
    bins : int, optional (default=40)
        number of bins for histogram
    save_to : str, optional (default=None)
        path to `.png` file for saving figure; returns figure by default
    verbose : bool, optional (default=True)
        print updates to console

    Returns
    -------

    plot of distributions of `obs_cols` in thresholds dictionary with corresponding
    thresholds
    """
    # only set when we create the figure ourselves; tells us whether to
    # return it at the end
    fig = None
    if save_to or axes is None:
        fig, axes = plt.subplots(
            ncols=len(thresholds),
            nrows=1,
            figsize=(len(thresholds) * 4, 4),
            sharey=True,
        )
    # if multiple plots, loop through axes
    if len(thresholds) > 1:
        axes[0].set_ylabel("cells")
        for i, (col, spec) in enumerate(thresholds.items()):
            axes[i].hist(adata.obs[col], bins=bins)
            thresh = spec["thresh"]
            if isinstance(thresh, np.ndarray):
                # multi-threshold methods (e.g. multiotsu) return an array
                for _x in thresh:
                    axes[i].axvline(_x, color="r")
            else:
                # BUG FIX: index the "thresh" key like the single-plot branch
                # below; the original passed the whole spec dict to axvline
                axes[i].axvline(thresh, color="r")
            axes[i].set_title(col)
    # if single plot, only one set of axes in subplot
    else:
        axes.set_ylabel("cells")
        axes.hist(adata.obs[list(thresholds.keys())[0]], bins=bins)
        if isinstance(list(thresholds.values())[0]["thresh"], np.ndarray):
            [
                axes.axvline(_x, color="r")
                for _x in list(thresholds.values())[0]["thresh"]
            ]
        else:
            axes.axvline(list(thresholds.values())[0]["thresh"], color="r")
        axes.set_title(list(thresholds.keys())[0])
    plt.tight_layout()
    if save_to:
        if verbose:
            print("Saving threshold plot to {}".format(save_to))
        fig.savefig(save_to, dpi=200)
    elif fig is not None:
        # BUG FIX: original tested `not axes` after reassigning `axes`, so
        # the created figure was never returned (and `not axes` on a numpy
        # array of Axes raises ValueError); return fig whenever we made it
        return fig
def qc_summary(adata, mito=True, ambient=True, genes=True, fig=None, save_to=None, verbose=True)

Plots summary of counts distribution and ambient genes

Parameters

adata : anndata.AnnData
object containing unfiltered scRNA-seq data
mito : bool, optional (default=True)
show pct_counts_mito as points
ambient : bool, optional (default=True)
show pct_counts_ambient as points
genes : bool, optional (default=True)
show n_genes_by_counts as points
fig : matplotlib.figure, optional (default=None)
figure object for plotting. if None, create new.
save_to : str, optional (default=None)
path to .png file for saving figure; returns figure by default
verbose : bool, optional (default=True)
print updates to console

Returns

fig : matplotlib.figure
counts_plot(), sc.pl.highest_expr_genes(), and dropout_plot() in single figure
Expand source code
def qc_summary(
    adata, mito=True, ambient=True, genes=True, fig=None, save_to=None, verbose=True
):
    """
    Plots summary of counts distribution and ambient genes

    Parameters
    ----------

    adata : anndata.AnnData
        object containing unfiltered scRNA-seq data
    mito : bool, optional (default=True)
        show `pct_counts_mito` as points
    ambient : bool, optional (default=True)
        show `pct_counts_ambient` as points
    genes : bool, optional (default=True)
        show `n_genes_by_counts` as points
    fig : matplotlib.figure, optional (default=None)
        figure object for plotting. if None, create new.
    save_to : str, optional (default=None)
        path to `.png` file for saving figure; returns figure by default
    verbose : bool, optional (default=True)
        print updates to console

    Returns
    -------

    fig : matplotlib.figure
        `counts_plot()`, `sc.pl.highest_expr_genes()`, and `dropout_plot()` in single
        figure
    """
    # create a fresh figure only when the caller didn't supply one
    if not fig:
        fig = plt.figure(figsize=(14, 7))
    # 3x3 grid layout: counts plot fills the left two columns,
    # dropout plot occupies the upper-right cells
    grid = gridspec.GridSpec(nrows=3, ncols=3, figure=fig)
    counts_ax = plt.subplot(grid[0:4, 0:2])
    dropout_ax = plt.subplot(grid[0:2, 2])
    # render each panel onto its axes
    counts_plot(adata, ax=counts_ax, show=False, genes=genes, ambient=ambient, mito=mito)
    dropout_plot(adata, ax=dropout_ax, show=False)
    grid.tight_layout(figure=fig, w_pad=1.8)
    # either save to disk or hand the figure back to the caller
    if save_to is None:
        return fig
    if verbose:
        print("Saving QC plot to {}".format(save_to))
    fig.savefig(save_to, dpi=200)
def recipe_dropkick(adata, filter=True, min_genes=50, calc_metrics=True, mito_names='^mt-|^MT-', n_ambient=10, target_sum=None, n_hvgs=2000, X_final='raw_counts', verbose=True)

dropkick preprocessing recipe

Parameters

adata : anndata.AnnData
object with raw counts data in .X
filter : bool, optional (default=True)
remove cells with less than min_genes detected and genes with zero total counts
min_genes : int, optional (default=50)
threshold for minimum genes detected. ignored if filter==False.
calc_metrics : bool, optional (default=True)
if False, do not calculate metrics in .obs/.var
mito_names : str, optional (default="^mt-|^MT-")
substring encompassing mitochondrial gene names for calculation of mito expression. ignored if calc_metrics==False.
n_ambient : int, optional (default=10)
number of ambient genes to call (top genes by cells). ignored if calc_metrics==False.
target_sum : int, optional (default=None)
total sum of counts for each cell prior to arcsinh or log1p transformations. if None, use median counts.
n_hvgs : int, optional (default=2000)
number of HVGs to calculate using Seurat method. if None, do not calculate HVGs.
X_final : str, optional (default="raw_counts")
which normalization layer should be left in .X slot? ("raw_counts", "arcsinh_norm", "log1p_norm")
verbose : bool, optional (default=True)
print updates to the console

Returns

adata : anndata.AnnData
updated object includes: - useful .obs and .var columns (if calc_metrics==True) ("total_counts", "pct_counts_mito", "n_genes_by_counts", etc.) - raw counts (adata.layers["raw_counts"]) - arcsinh-transformed normalized counts (adata.layers["arcsinh_norm"]) - highly variable genes if desired (adata.var["highly_variable"])
Expand source code
def recipe_dropkick(
    adata,
    filter=True,
    min_genes=50,
    calc_metrics=True,
    mito_names="^mt-|^MT-",
    n_ambient=10,
    target_sum=None,
    n_hvgs=2000,
    X_final="raw_counts",
    verbose=True,
):
    """
    dropkick preprocessing recipe

    Parameters
    ----------

    adata : anndata.AnnData
        object with raw counts data in `.X`
    filter : bool, optional (default=True)
        remove cells with less than min_genes detected and genes with zero total counts
    min_genes : int, optional (default=50)
        threshold for minimum genes detected. ignored if `filter`==False.
    calc_metrics : bool, optional (default=True)
        if False, do not calculate metrics in `.obs`/`.var`
    mito_names : str, optional (default="^mt-|^MT-")
        substring encompassing mitochondrial gene names for calculation of mito
        expression. ignored if `calc_metrics`==False.
    n_ambient : int, optional (default=10)
        number of ambient genes to call (top genes by cells). ignored if
        `calc_metrics`==False.
    target_sum : int, optional (default=None)
        total sum of counts for each cell prior to arcsinh or log1p transformations.
        if None, use median counts.
    n_hvgs : int, optional (default=2000)
        number of HVGs to calculate using Seurat method. if None, do not calculate
        HVGs.
    X_final : str, optional (default="raw_counts")
        which normalization layer should be left in `.X` slot? ("raw_counts",
        "arcsinh_norm", "log1p_norm")
    verbose : bool, optional (default=True)
        print updates to the console

    Returns
    -------

    adata : anndata.AnnData
        updated object includes:
        - useful `.obs` and `.var` columns (if `calc_metrics`==True)
            ("total_counts", "pct_counts_mito", "n_genes_by_counts", etc.)
        - raw counts (`adata.layers["raw_counts"]`)
        - arcsinh-transformed normalized counts (`adata.layers["arcsinh_norm"]`)
        - highly variable genes if desired (`adata.var["highly_variable"]`)
    """
    if filter:
        # remove cells and genes with zero total counts
        orig_shape = adata.shape
        sc.pp.filter_cells(adata, min_genes=min_genes)
        sc.pp.filter_genes(adata, min_counts=1)
        if verbose:
            # report how many barcodes/genes were dropped by the filters above
            if adata.shape[0] != orig_shape[0]:
                print(
                    "Ignoring {} barcodes with less than {} genes detected".format(
                        orig_shape[0] - adata.shape[0], min_genes
                    )
                )
            if adata.shape[1] != orig_shape[1]:
                print(
                    "Ignoring {} genes with zero total counts".format(
                        orig_shape[1] - adata.shape[1]
                    )
                )
        # drop the bookkeeping columns added by the scanpy filter calls
        adata.obs.drop(columns=["n_genes"], inplace=True)
        adata.var.drop(columns=["n_counts"], inplace=True)

    # store raw counts before manipulation
    adata.layers["raw_counts"] = adata.X.copy()

    if calc_metrics:
        # identify mitochondrial genes by name pattern
        adata.var["mito"] = adata.var_names.str.contains(mito_names)
        # identify putative ambient genes by lowest dropout pct (top n_ambient);
        # dropout pct = fraction of cells in which the gene is NOT detected
        adata.var["pct_dropout_by_counts"] = np.array(
            (1 - (adata.X.astype(bool).sum(axis=0) / adata.n_obs)) * 100
        ).squeeze()
        lowest_dropout = adata.var.pct_dropout_by_counts.nsmallest(n=n_ambient).min()
        highest_dropout = adata.var.pct_dropout_by_counts.nsmallest(n=n_ambient).max()
        # flag everything at or below the n_ambient-th smallest dropout
        # (may flag more than n_ambient genes if there are ties)
        adata.var["ambient"] = adata.var.pct_dropout_by_counts <= highest_dropout
        # reorder genes by dropout rate
        adata = adata[:, np.argsort(adata.var.pct_dropout_by_counts)].copy()
        if verbose:
            print(
                "Top {} ambient genes have dropout rates between {} and {} percent:\n\t{}".format(
                    len(adata.var_names[adata.var.ambient]),
                    round(lowest_dropout, 3),
                    round(highest_dropout, 3),
                    adata.var_names[adata.var.ambient].tolist(),
                )
            )
        # calculate standard qc .obs and .var
        sc.pp.calculate_qc_metrics(
            adata, qc_vars=["mito", "ambient"], inplace=True, percent_top=None
        )
        # remove pesky unneeded columns from .obs and .var
        adata.obs.drop(
            columns=adata.obs.columns[adata.obs.columns.str.startswith("log1p_")].union(
                adata.obs.columns[adata.obs.columns.str.contains("total_counts_")]
            ),
            inplace=True,
        )
        adata.var.drop(
            columns=adata.var.columns[adata.var.columns.str.startswith("log1p_")],
            inplace=True,
        )
        # other arcsinh-transformed metrics (variance-stabilized heuristics
        # used downstream for thresholding)
        adata.obs["arcsinh_total_counts"] = np.arcsinh(adata.obs["total_counts"])
        adata.obs["arcsinh_n_genes_by_counts"] = np.arcsinh(
            adata.obs["n_genes_by_counts"]
        )

    # normalize counts before transforming; target_sum=None uses median counts
    sc.pp.normalize_total(adata, target_sum=target_sum, layers=None, layer_norm=None)
    adata.layers["norm_counts"] = adata.X.copy()

    # HVGs
    if n_hvgs is not None:
        if verbose:
            print("Determining {} highly variable genes".format(n_hvgs))
        # log1p transform for HVGs (adata.layers["log1p_norm"])
        sc.pp.log1p(adata)
        adata.layers["log1p_norm"] = adata.X.copy()  # save to .layers
        sc.pp.highly_variable_genes(
            adata, n_top_genes=n_hvgs, n_bins=20, flavor="seurat"
        )
        adata.var.drop(columns=["dispersions", "dispersions_norm"], inplace=True)

    # arcsinh-transform normalized counts (adata.layers["arcsinh_norm"]);
    # NOTE: starts from norm_counts, so it is independent of the log1p above
    adata.X = np.arcsinh(adata.layers["norm_counts"])
    sc.pp.scale(adata)  # scale genes for feeding into model
    adata.layers[
        "arcsinh_norm"
    ] = adata.X.copy()  # save arcsinh scaled counts in .layers
    # remove unneeded intermediate layer
    del adata.layers["norm_counts"]

    # set .X as desired for downstream processing; default raw_counts
    if (X_final != "raw_counts") & verbose:
        print("Setting {} layer to .X".format(X_final))
    adata.X = adata.layers[X_final].copy()

    return adata
def score_plot(adata, metrics=['arcsinh_n_genes_by_counts', 'pct_counts_ambient'], save_to=None, verbose=True)

Plots scatter of barcodes across two metrics, with points colored by dropkick_score. Shows automated thresholding on metrics in adata.obs as output by auto_thresh_obs()

Parameters

adata : anndata.AnnData
object generated from dropkick()
metrics : list of str, optional (default=["arcsinh_n_genes_by_counts","pct_counts_ambient"])
names of metrics to plot scatter and histograms for
save_to : str, optional (default=None)
path to .png file for saving figure; returns figure if None
verbose : bool, optional (default=True)
print updates to console

Returns

g : seaborn.jointgrid
joint plot of metric distributions colored by dropkick_score and containing corresponding training thresholds

if save_to is not None, write to file instead of returning g object.

Expand source code
def score_plot(
    adata,
    metrics=["arcsinh_n_genes_by_counts", "pct_counts_ambient"],
    save_to=None,
    verbose=True,
):
    """
    Plots scatter of barcodes across two metrics, with points colored by
    `dropkick_score`. Shows automated thresholding on metrics in `adata.obs` as output
    by `auto_thresh_obs()`

    Parameters
    ----------

    adata : anndata.AnnData
        object generated from `dropkick`
    metrics : list of str, optional
        (default=["arcsinh_n_genes_by_counts","pct_counts_ambient"])
        names of metrics to plot scatter and histograms for
    save_to : str, optional (default=None)
        path to `.png` file for saving figure; returns figure if None
    verbose : bool, optional (default=True)
        print updates to console

    Returns
    -------

    g : seaborn.JointGrid
        joint plot of metric distributions colored by `dropkick_score` and containing
        corresponding training thresholds

    if `save_to` is not None, write to file instead of returning `g` object.
    """

    def _draw_thresh(metric, line_fn):
        # Draw dashed training-threshold line(s) for `metric` on the current
        # axes. `thresh` may be a scalar or an ndarray of multiple cutoffs;
        # `line_fn` is plt.axvline (x-axis metric) or plt.axhline (y-axis).
        thresh = adata.uns["dropkick_thresholds"][metric]["thresh"]
        if isinstance(thresh, np.ndarray):
            for t in thresh:
                line_fn(t, linestyle="--", color="k", linewidth=2)
        else:
            line_fn(thresh, linestyle="--", color="k", linewidth=2)

    # initialize joint plot object
    g = sns.jointplot(
        x=adata.obs[metrics[0]],
        y=adata.obs[metrics[1]],
        height=7,
        space=0,
        color="k",
        marginal_kws=dict(bins=40),
    )
    # change to focus on scatter plot
    g.ax_joint.cla()
    plt.sca(g.ax_joint)
    # set axes labels
    plt.xlabel(metrics[0], fontsize=12)
    plt.ylabel(metrics[1], fontsize=12)
    # scatter plot, color by dropkick_score
    points = plt.scatter(
        x=adata.obs[metrics[0]],
        y=adata.obs[metrics[1]],
        c=adata.obs["dropkick_score"],
        s=25,
        cmap="coolwarm_r",
        alpha=0.5,
    )
    plt.tick_params(axis="both", which="major", labelsize=12)
    # plot training thresholds on scatter (joint axes currently active)
    if metrics[0] in adata.uns["dropkick_thresholds"]:
        _draw_thresh(metrics[0], plt.axvline)
    if metrics[1] in adata.uns["dropkick_thresholds"]:
        _draw_thresh(metrics[1], plt.axhline)
    # change focus to x margin plot to continue threshold line
    if metrics[0] in adata.uns["dropkick_thresholds"]:
        plt.sca(g.ax_marg_x)
        _draw_thresh(metrics[0], plt.axvline)
    # change focus to y margin plot to continue threshold line
    if metrics[1] in adata.uns["dropkick_thresholds"]:
        plt.sca(g.ax_marg_y)
        _draw_thresh(metrics[1], plt.axhline)
    # add colorbar inside scatter axes
    axins1 = inset_axes(
        g.ax_joint,
        width="40%",  # width = 40% of parent_bbox width
        height="4%",  # height : 4%
        loc="upper right",
    )
    cbar = plt.colorbar(
        points,
        cax=axins1,
        drawedges=False,
        label="dropkick_score",
        orientation="horizontal",
        ticks=[0.1, 0.5, 0.9],
    )
    cbar.ax.tick_params(labelsize=12)
    cbar.solids.set_edgecolor("face")
    # add histogram of scores on top of colorbar (same inset location)
    axins2 = inset_axes(
        g.ax_joint,
        width="40%",  # width = 40% of parent_bbox width
        height="4%",  # height : 4%
        loc="upper right",
    )
    _ = axins2.hist(
        # only include scores > 0 in hist so you can see distribution
        adata.obs.loc[adata.obs["dropkick_score"] > 0.0, "dropkick_score"],
        bins=40,
        color="k",
        alpha=0.7,
        histtype="step",
    )
    axins2.axis("off")
    if save_to is not None:
        if verbose:
            print("Saving score plot to {}".format(save_to))
        g.savefig(save_to, dpi=200)
    else:
        return g