Randomized quantiles

Table of Contents

Introduction

We previously illustrated an example of our approach to testing GOF using real data. Here, we characterize the variability in the randomized quantiles that went into that illustration.

Setup

import anndata
import numpy as np
import pandas as pd
import scipy.optimize as so
import scipy.special as sp
import scipy.stats as st
import scmodes
import sqlite3
import rpy2.robjects.packages
import rpy2.robjects.pandas2ri
rpy2.robjects.pandas2ri.activate()
ashr = rpy2.robjects.packages.importr('ashr')
%matplotlib inline
%config InlineBackend.figure_formats = set(['retina'])
import colorcet
import matplotlib.pyplot as plt
plt.rcParams['figure.facecolor'] = 'w'
plt.rcParams['font.family'] = 'Nimbus Sans'

Data

Read the data.

gene_info = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-genes.txt.gz', sep='\t', index_col=0)
cytotoxic_t = scmodes.dataset.read_10x('/project2/mstephens/aksarkar/projects/singlecell-ideas/data/10xgenomics/cytotoxic_t/filtered_matrices_mex/hg19/', return_df=True)
gene = 'ENSG00000109475'
x = cytotoxic_t.loc[:,gene]
s = cytotoxic_t.sum(axis=1)
y = np.arange(x.max() + 1)

Seed 2

Generate an independent draw of randomized quantiles.

rpp = dict()
pmf = dict()
np.random.seed(2)
gamma_res = scmodes.ebpm.ebpm_gamma(x, s)
rpp['Gamma'] = scmodes.benchmark.gof._rpp(
  scmodes.benchmark.gof._zig_cdf(x - 1, size=s, log_mu=gamma_res[0], log_phi=-gamma_res[1]),
  scmodes.benchmark.gof._zig_pmf(x, size=s, log_mu=gamma_res[0], log_phi=-gamma_res[1]))
pmf['Gamma'] = np.array([scmodes.benchmark.gof._zig_pmf(k, size=s, log_mu=gamma_res[0], log_phi=-gamma_res[1]).mean()
                         for k in y])

point_gamma_res = scmodes.ebpm.ebpm_point_gamma(x, s)
rpp['ZIG'] = scmodes.benchmark.gof._rpp(
  scmodes.benchmark.gof._zig_cdf(x - 1, size=s, log_mu=point_gamma_res[0], log_phi=-point_gamma_res[1], logodds=point_gamma_res[2]),
  scmodes.benchmark.gof._zig_pmf(x, size=s, log_mu=point_gamma_res[0], log_phi=-point_gamma_res[1], logodds=point_gamma_res[2]))
pmf['ZIG'] = np.array([scmodes.benchmark.gof._zig_pmf(k, size=s, log_mu=point_gamma_res[0], log_phi=-point_gamma_res[1], logodds=point_gamma_res[2]).mean()
                       for k in y])

zief_res = scmodes.ebpm.ebpm_point_expfam(x, s)
rpp['ZIEF'] = scmodes.benchmark.gof._rpp(
  scmodes.benchmark.gof._point_expfam_cdf(x.values.ravel() - 1, size=s, res=zief_res),
  scmodes.benchmark.gof._point_expfam_pmf(x.values.ravel(), size=s, res=zief_res))
# We need np.full here because _point_expfam_pmf does not broadcast
pmf['ZIEF'] = np.array([scmodes.benchmark.gof._point_expfam_pmf(np.full(x.shape, k), size=s, res=zief_res).mean()
                        for k in y])

unimodal_res = scmodes.ebpm.ebpm_unimodal(x, s)
rpp['Unimodal'] = scmodes.benchmark.gof._rpp(
  scmodes.benchmark.gof._ash_cdf(x - 1, s=s, fit=unimodal_res),
  scmodes.benchmark.gof._ash_pmf(x, s=s, fit=unimodal_res))
# It is simpler to compute this here than to mess with the ash_data object
g = np.array(unimodal_res.rx2('fitted_g'))
a = np.fmin(g[1], g[2])
b = np.fmax(g[1], g[2])
comp_dens_conv = np.array([((st.gamma(a=k + 1, scale=1 / s.values.reshape(-1, 1)).cdf(b.reshape(1, -1)) - st.gamma(a=k + 1, scale=1 / s.values.reshape(-1, 1)).cdf(a.reshape(1, -1))) / np.outer(s, b - a)).mean(axis=0) for k in y])
comp_dens_conv[:,0] = st.poisson(mu=s.values.reshape(-1, 1) * b[0]).pmf(y).mean(axis=0)
pmf['Unimodal'] = comp_dens_conv @ g[0]

npmle_res = scmodes.ebpm.ebpm_npmle(x, s)
rpp['NPMLE'] = scmodes.benchmark.gof._rpp(
  scmodes.benchmark.gof._ash_cdf(x - 1, s=s, fit=npmle_res),
  scmodes.benchmark.gof._ash_pmf(x, s=s, fit=npmle_res))
g = np.array(npmle_res.rx2('fitted_g'))
a = np.fmin(g[1], g[2])
b = np.fmax(g[1], g[2])
comp_dens_conv = np.array([((st.gamma(a=k + 1, scale=1 / s.values.reshape(-1, 1)).cdf(b.reshape(1, -1)) - st.gamma(a=k + 1, scale=1 / s.values.reshape(-1, 1)).cdf(a.reshape(1, -1))) / np.outer(s, b - a)).mean(axis=0) for k in y])
pmf['NPMLE'] = comp_dens_conv @ g[0]

Plot the randomized quantiles against uniform quantiles.

plt.clf()
plt.gcf().set_size_inches(3, 3)
grid = np.linspace(0, 1, x.shape[0] + 1)[1:]
plt.xscale('log')
plt.yscale('log')
for c, k in zip(plt.get_cmap('Dark2').colors, ['Gamma', 'ZIG', 'Unimodal', 'ZIEF', 'NPMLE']):
  if k in rpp:
    plt.plot(grid, np.sort(rpp[k]), color=c, lw=1, marker=None, label=k)
plt.plot([1e-5, 1], [1e-5, 1], lw=1, ls=':', c='0.5')
plt.legend(frameon=False)
plt.xlim(1e-5, 1)
plt.ylim(1e-5, 1)
plt.xlabel('Theoretical quantiles')
plt.ylabel('Randomized quantiles')
plt.tight_layout()

cytotoxic_t-rpl34-rpp-2.png

Seed 3

Generate an independent draw of randomized quantiles.

rpp = dict()
pmf = dict()
np.random.seed(3)
gamma_res = scmodes.ebpm.ebpm_gamma(x, s)
rpp['Gamma'] = scmodes.benchmark.gof._rpp(
  scmodes.benchmark.gof._zig_cdf(x - 1, size=s, log_mu=gamma_res[0], log_phi=-gamma_res[1]),
  scmodes.benchmark.gof._zig_pmf(x, size=s, log_mu=gamma_res[0], log_phi=-gamma_res[1]))
pmf['Gamma'] = np.array([scmodes.benchmark.gof._zig_pmf(k, size=s, log_mu=gamma_res[0], log_phi=-gamma_res[1]).mean()
                         for k in y])

point_gamma_res = scmodes.ebpm.ebpm_point_gamma(x, s)
rpp['ZIG'] = scmodes.benchmark.gof._rpp(
  scmodes.benchmark.gof._zig_cdf(x - 1, size=s, log_mu=point_gamma_res[0], log_phi=-point_gamma_res[1], logodds=point_gamma_res[2]),
  scmodes.benchmark.gof._zig_pmf(x, size=s, log_mu=point_gamma_res[0], log_phi=-point_gamma_res[1], logodds=point_gamma_res[2]))
pmf['ZIG'] = np.array([scmodes.benchmark.gof._zig_pmf(k, size=s, log_mu=point_gamma_res[0], log_phi=-point_gamma_res[1], logodds=point_gamma_res[2]).mean()
                       for k in y])

zief_res = scmodes.ebpm.ebpm_point_expfam(x, s)
rpp['ZIEF'] = scmodes.benchmark.gof._rpp(
  scmodes.benchmark.gof._point_expfam_cdf(x.values.ravel() - 1, size=s, res=zief_res),
  scmodes.benchmark.gof._point_expfam_pmf(x.values.ravel(), size=s, res=zief_res))
# We need np.full here because _point_expfam_pmf does not broadcast
pmf['ZIEF'] = np.array([scmodes.benchmark.gof._point_expfam_pmf(np.full(x.shape, k), size=s, res=zief_res).mean()
                        for k in y])

unimodal_res = scmodes.ebpm.ebpm_unimodal(x, s)
rpp['Unimodal'] = scmodes.benchmark.gof._rpp(
  scmodes.benchmark.gof._ash_cdf(x - 1, s=s, fit=unimodal_res),
  scmodes.benchmark.gof._ash_pmf(x, s=s, fit=unimodal_res))
# It is simpler to compute this here than to mess with the ash_data object
g = np.array(unimodal_res.rx2('fitted_g'))
a = np.fmin(g[1], g[2])
b = np.fmax(g[1], g[2])
comp_dens_conv = np.array([((st.gamma(a=k + 1, scale=1 / s.values.reshape(-1, 1)).cdf(b.reshape(1, -1)) - st.gamma(a=k + 1, scale=1 / s.values.reshape(-1, 1)).cdf(a.reshape(1, -1))) / np.outer(s, b - a)).mean(axis=0) for k in y])
comp_dens_conv[:,0] = st.poisson(mu=s.values.reshape(-1, 1) * b[0]).pmf(y).mean(axis=0)
pmf['Unimodal'] = comp_dens_conv @ g[0]

npmle_res = scmodes.ebpm.ebpm_npmle(x, s)
rpp['NPMLE'] = scmodes.benchmark.gof._rpp(
  scmodes.benchmark.gof._ash_cdf(x - 1, s=s, fit=npmle_res),
  scmodes.benchmark.gof._ash_pmf(x, s=s, fit=npmle_res))
g = np.array(npmle_res.rx2('fitted_g'))
a = np.fmin(g[1], g[2])
b = np.fmax(g[1], g[2])
comp_dens_conv = np.array([((st.gamma(a=k + 1, scale=1 / s.values.reshape(-1, 1)).cdf(b.reshape(1, -1)) - st.gamma(a=k + 1, scale=1 / s.values.reshape(-1, 1)).cdf(a.reshape(1, -1))) / np.outer(s, b - a)).mean(axis=0) for k in y])
pmf['NPMLE'] = comp_dens_conv @ g[0]

Plot the randomized quantiles against uniform quantiles.

plt.clf()
plt.gcf().set_size_inches(3, 3)
grid = np.linspace(0, 1, x.shape[0] + 1)[1:]
plt.xscale('log')
plt.yscale('log')
for c, k in zip(plt.get_cmap('Dark2').colors, ['Gamma', 'ZIG', 'Unimodal', 'ZIEF', 'NPMLE']):
  if k in rpp:
    plt.plot(grid, np.sort(rpp[k]), color=c, lw=1, marker=None, label=k)
plt.plot([1e-5, 1], [1e-5, 1], lw=1, ls=':', c='0.5')
plt.legend(frameon=False)
plt.xlim(1e-5, 1)
plt.ylim(1e-5, 1)
plt.xlabel('Theoretical quantiles')
plt.ylabel('Randomized quantiles')
plt.tight_layout()

cytotoxic_t-rpl34-rpp-3.png

Seed 10

Generate an independent draw of randomized quantiles.

rpp = dict()
pmf = dict()
np.random.seed(10)
gamma_res = scmodes.ebpm.ebpm_gamma(x, s)
rpp['Gamma'] = scmodes.benchmark.gof._rpp(
  scmodes.benchmark.gof._zig_cdf(x - 1, size=s, log_mu=gamma_res[0], log_phi=-gamma_res[1]),
  scmodes.benchmark.gof._zig_pmf(x, size=s, log_mu=gamma_res[0], log_phi=-gamma_res[1]))
pmf['Gamma'] = np.array([scmodes.benchmark.gof._zig_pmf(k, size=s, log_mu=gamma_res[0], log_phi=-gamma_res[1]).mean()
                         for k in y])

point_gamma_res = scmodes.ebpm.ebpm_point_gamma(x, s)
rpp['ZIG'] = scmodes.benchmark.gof._rpp(
  scmodes.benchmark.gof._zig_cdf(x - 1, size=s, log_mu=point_gamma_res[0], log_phi=-point_gamma_res[1], logodds=point_gamma_res[2]),
  scmodes.benchmark.gof._zig_pmf(x, size=s, log_mu=point_gamma_res[0], log_phi=-point_gamma_res[1], logodds=point_gamma_res[2]))
pmf['ZIG'] = np.array([scmodes.benchmark.gof._zig_pmf(k, size=s, log_mu=point_gamma_res[0], log_phi=-point_gamma_res[1], logodds=point_gamma_res[2]).mean()
                       for k in y])

zief_res = scmodes.ebpm.ebpm_point_expfam(x, s)
rpp['ZIEF'] = scmodes.benchmark.gof._rpp(
  scmodes.benchmark.gof._point_expfam_cdf(x.values.ravel() - 1, size=s, res=zief_res),
  scmodes.benchmark.gof._point_expfam_pmf(x.values.ravel(), size=s, res=zief_res))
# We need np.full here because _point_expfam_pmf does not broadcast
pmf['ZIEF'] = np.array([scmodes.benchmark.gof._point_expfam_pmf(np.full(x.shape, k), size=s, res=zief_res).mean()
                        for k in y])

unimodal_res = scmodes.ebpm.ebpm_unimodal(x, s)
rpp['Unimodal'] = scmodes.benchmark.gof._rpp(
  scmodes.benchmark.gof._ash_cdf(x - 1, s=s, fit=unimodal_res),
  scmodes.benchmark.gof._ash_pmf(x, s=s, fit=unimodal_res))
# It is simpler to compute this here than to mess with the ash_data object
g = np.array(unimodal_res.rx2('fitted_g'))
a = np.fmin(g[1], g[2])
b = np.fmax(g[1], g[2])
comp_dens_conv = np.array([((st.gamma(a=k + 1, scale=1 / s.values.reshape(-1, 1)).cdf(b.reshape(1, -1)) - st.gamma(a=k + 1, scale=1 / s.values.reshape(-1, 1)).cdf(a.reshape(1, -1))) / np.outer(s, b - a)).mean(axis=0) for k in y])
comp_dens_conv[:,0] = st.poisson(mu=s.values.reshape(-1, 1) * b[0]).pmf(y).mean(axis=0)
pmf['Unimodal'] = comp_dens_conv @ g[0]

npmle_res = scmodes.ebpm.ebpm_npmle(x, s)
rpp['NPMLE'] = scmodes.benchmark.gof._rpp(
  scmodes.benchmark.gof._ash_cdf(x - 1, s=s, fit=npmle_res),
  scmodes.benchmark.gof._ash_pmf(x, s=s, fit=npmle_res))
g = np.array(npmle_res.rx2('fitted_g'))
a = np.fmin(g[1], g[2])
b = np.fmax(g[1], g[2])
comp_dens_conv = np.array([((st.gamma(a=k + 1, scale=1 / s.values.reshape(-1, 1)).cdf(b.reshape(1, -1)) - st.gamma(a=k + 1, scale=1 / s.values.reshape(-1, 1)).cdf(a.reshape(1, -1))) / np.outer(s, b - a)).mean(axis=0) for k in y])
pmf['NPMLE'] = comp_dens_conv @ g[0]

Plot the randomized quantiles against uniform quantiles.

plt.clf()
plt.gcf().set_size_inches(3, 3)
grid = np.linspace(0, 1, x.shape[0] + 1)[1:]
plt.xscale('log')
plt.yscale('log')
for c, k in zip(plt.get_cmap('Dark2').colors, ['Gamma', 'ZIG', 'Unimodal', 'ZIEF', 'NPMLE']):
  if k in rpp:
    plt.plot(grid, np.sort(rpp[k]), color=c, lw=1, marker=None, label=k)
plt.plot([1e-5, 1], [1e-5, 1], lw=1, ls=':', c='0.5')
plt.legend(frameon=False)
plt.xlim(1e-5, 1)
plt.ylim(1e-5, 1)
plt.xlabel('Theoretical quantiles')
plt.ylabel('Randomized quantiles')
plt.tight_layout()

cytotoxic_t-rpl34-rpp-4.png

Expected randomized quantiles

The randomized quantiles are

\begin{equation} u_i \mid x_i \sim \operatorname{Uniform}(\hat{F}(x - 1), \hat{F}(x)) \end{equation}

for \(i = 1, \ldots, n\); therefore, their expected value is

\begin{equation} \operatorname{E}[u_i \mid x_i] = \frac{\hat{F}(x - 1) + \hat{F}(x)}{2} = \hat{F}(x - 1) + \frac{1}{2}\hat{f}(x). \end{equation}

Assuming \(x \sim \hat{F}\), \(u_i \sim \operatorname{Uniform}(0, 1)\) and the \(k\)th order statistic

\begin{equation} u_{(k)} \sim \operatorname{Beta}(k, n - k + 1). \end{equation}

Plot the expected randomized quantiles against Uniform quantiles. Superimpose the \((0.025, 0.975)\) quantiles of the distribution of Uniform order statistics.

erpp = dict()
gamma_res = scmodes.ebpm.ebpm_gamma(x, s)
erpp['Gamma'] = (scmodes.benchmark.gof._zig_cdf(x - 1, size=s, log_mu=gamma_res[0], log_phi=-gamma_res[1]) +
                 .5 * scmodes.benchmark.gof._zig_pmf(x, size=s, log_mu=gamma_res[0], log_phi=-gamma_res[1]))

point_gamma_res = scmodes.ebpm.ebpm_point_gamma(x, s)
erpp['ZIG'] = (scmodes.benchmark.gof._zig_cdf(x - 1, size=s, log_mu=point_gamma_res[0], log_phi=-point_gamma_res[1], logodds=-point_gamma_res[2]) +
               .5 * scmodes.benchmark.gof._zig_pmf(x, size=s, log_mu=point_gamma_res[0], log_phi=-point_gamma_res[1], logodds=-point_gamma_res[2]))

zief_res = scmodes.ebpm.ebpm_point_expfam(x, s)
erpp['ZIEF'] = (scmodes.benchmark.gof._point_expfam_cdf(x.values.ravel() - 1, size=s, res=zief_res) +
                .5 * scmodes.benchmark.gof._point_expfam_pmf(x.values.ravel(), size=s, res=zief_res))

unimodal_res = scmodes.ebpm.ebpm_unimodal(x, s)
erpp['Unimodal'] = (scmodes.benchmark.gof._ash_cdf(x - 1, s=s, fit=unimodal_res) +
                    .5 * scmodes.benchmark.gof._ash_pmf(x, s=s, fit=unimodal_res))

npmle_res = scmodes.ebpm.ebpm_npmle(x, s)
erpp['NPMLE'] = (scmodes.benchmark.gof._ash_cdf(x - 1, s=s, fit=npmle_res) +
                 .5 * scmodes.benchmark.gof._ash_pmf(x, s=s, fit=npmle_res))
grid = np.linspace(0, 1, x.shape[0] + 1)[1:]
F = st.beta(np.arange(1, 1 + x.shape[0]), 1 + x.shape[0] - np.arange(1, 1 + x.shape[0]))

plt.clf()
plt.gcf().set_size_inches(3, 3)
plt.xscale('log')
plt.yscale('log')
for c, k in zip(plt.get_cmap('Dark2').colors, ['Gamma', 'ZIG', 'Unimodal', 'ZIEF', 'NPMLE']):
  if k in erpp:
    plt.plot(grid, np.sort(erpp[k]), color=c, lw=1, marker=None, label=k)
plt.fill_between(grid, F.ppf(.025), F.ppf(.975), color='0.85')
plt.legend(frameon=False, handletextpad=0.25)
lim = [1e-5, 1]
plt.plot(lim, lim, lw=1, ls=':', c='0.5')
plt.xlim(lim)
plt.ylim(lim)
plt.xlabel('Theoretical quantiles')
plt.ylabel('Expected randomized quantiles')
plt.tight_layout()

erpp.png

Plot the expected randomized quantiles on a linear scale.

grid = np.linspace(0, 1, x.shape[0] + 1)[1:]
plt.clf()
plt.gcf().set_size_inches(3, 3)
for c, k in zip(plt.get_cmap('Dark2').colors, ['Gamma', 'ZIG', 'Unimodal', 'ZIEF', 'NPMLE']):
  if k == 'ZIG':
    ls = (0, (4, 4))
  else:
    ls = '-'
  if k in erpp:
    plt.plot(grid, np.sort(erpp[k]), color=c, lw=1, ls=ls, marker=None, label=k)
plt.legend(frameon=False, handletextpad=0.25)
lim = [0, 1]
plt.plot(lim, lim, lw=1, ls=':', c='0.5')
plt.xlim(lim)
plt.ylim(lim)
plt.xlabel('Theoretical quantiles')
plt.ylabel('Expected randomized quantiles')
plt.tight_layout()

erpp-linear.png

Gamma model

Look at the estimated Gamma model parameters.

pd.Series({k: v for k, v in zip(('log_mean', 'log_inv_disp', 'llik'), gamma_res)})
log_mean           -4.567842
log_inv_disp        2.392556
llik           -31464.243518
dtype: float64

Look at the estimated point-Gamma model parameters.

pd.Series({k: v for k, v in zip(('log_mean', 'log_inv_disp', 'logodds', 'llik'), point_gamma_res)})
log_mean           -4.567096
log_inv_disp        2.404508
logodds            -7.125454
llik           -31459.278213
dtype: float64

Estimate how many excess zeros in this data are predicted by the point-Gamma model.

sp.expit(point_gamma_res[2]) * x.shape[0]
8.205192265041903

Compute the difference in marginal log likelihood between a point-Gamma and a Gamma expression model.

point_gamma_res[-1] - gamma_res[-1]
4.965304532452137

Compute the difference in marginal log likelihood between a unimodal and a Gamma expression model.

np.array(unimodal_res.rx2('loglik')) - gamma_res[-1]
array([265.32446831])

Dependency on size factors

Look at the distribution of size factors.

plt.clf()
plt.gcf().set_size_inches(2.5, 2.5)
plt.hist(s, bins=15, density=True, color='0.7')
plt.xlabel('Size factor')
plt.ylabel('Density')
plt.tight_layout()

size.png

Look at the distribution of \(x_{ij} / x_{i+}\).

plt.clf()
plt.gcf().set_size_inches(2.5, 2.5)
plt.hist(x / s, bins=15, density=True, color='0.7')
plt.xlabel('$x_i / s_i$')
plt.ylabel('Density')
plt.tight_layout()

mle.png

Plot the expected randomized quantile against the size factor, to see if they are not independent.

plt.clf()
fig, ax = plt.subplots(1, 5, sharey=True)
fig.set_size_inches(7.5, 2.5)
for a, k in zip(ax, erpp):
  a.scatter(s, erpp[k], s=1, c='k')
  a.set_title(k)
  a.set_xlabel('Size factor')
ax[0].set_ylabel('Expected randomized quantile')
fig.tight_layout()

rpp-size.png

Plot the example again.

pmf = dict()
pmf['Gamma'] = np.array([scmodes.benchmark.gof._zig_pmf(k, size=s, log_mu=gamma_res[0], log_phi=-gamma_res[1]).mean()
                         for k in y])
pmf['ZIG'] = np.array([scmodes.benchmark.gof._zig_pmf(k, size=s, log_mu=point_gamma_res[0], log_phi=-point_gamma_res[1], logodds=-point_gamma_res[2]).mean()
                       for k in y])
pmf['ZIEF'] = np.array([scmodes.benchmark.gof._point_expfam_pmf(np.full(x.shape, k), size=s, res=zief_res).mean()
                        for k in y])

g = np.array(unimodal_res.rx2('fitted_g'))
g = g[:,g[0] > 1e-8]
a = np.fmin(g[1], g[2])
b = np.fmax(g[1], g[2])
comp_dens_conv = np.array([((st.gamma(a=k + 1, scale=1 / s.values.reshape(-1, 1)).cdf(b.reshape(1, -1)) - st.gamma(a=k + 1, scale=1 / s.values.reshape(-1, 1)).cdf(a.reshape(1, -1))) / np.outer(s, b - a)).mean(axis=0) for k in y])
comp_dens_conv[:,0] = st.poisson(mu=s.values.reshape(-1, 1) * b[0]).pmf(y).mean(axis=0)
pmf['Unimodal'] = comp_dens_conv @ g[0]

g = np.array(npmle_res.rx2('fitted_g'))
g = g[:,g[0] > 1e-8]
a = np.fmin(g[1], g[2])
b = np.fmax(g[1], g[2])
comp_dens_conv = np.array([((st.gamma(a=k + 1, scale=1 / s.values.reshape(-1, 1)).cdf(b.reshape(1, -1)) - st.gamma(a=k + 1, scale=1 / s.values.reshape(-1, 1)).cdf(a.reshape(1, -1))) / np.outer(s, b - a)).mean(axis=0) for k in y])
pmf['NPMLE'] = comp_dens_conv @ g[0]
plt.clf()
plt.gcf().set_size_inches(4, 2.5)
plt.hist(x, bins=y, color='0.7', density=True, label=None)
for c, k in zip(plt.get_cmap('Dark2').colors, ['Gamma', 'ZIG', 'Unimodal', 'ZIEF', 'NPMLE']):
  plt.plot(y + .5, pmf[k], color=c, lw=1, label=k)
plt.legend(frameon=False)
plt.title(gene_info.loc[gene, 'name'])
plt.xlabel('Number of molecules')
plt.ylabel('Density')
plt.tight_layout()

rpl34.png

Take the fitted NPMLE \(\hat{g}\). For each observed \(x_{i+}\), draw one \(\lambda_i \sim \hat{g}\), then convolve with the measurement model.

np.random.seed(0)
g = np.array(unimodal_res.rx2('fitted_g'))
g = g[:,g[0] > 1e-8]
a = np.fmin(g[1], g[2])
b = np.fmax(g[1], g[2])
z = np.random.choice(g.shape[1], size=x.shape[0], p=g[0])
lam = np.random.uniform(a[z], b[z])
x = np.random.poisson(s * lam)
plt.clf()
plt.gcf().set_size_inches(4, 2.5)
plt.hist(x, bins=y, color='0.7', density=True, label=None)
for c, k in zip(plt.get_cmap('Dark2').colors, ['Gamma', 'ZIG', 'Unimodal', 'ZIEF', 'NPMLE']):
  plt.plot(y + .5, pmf[k], color=c, lw=1, label=k)
plt.legend(frameon=False)
plt.title('Random sample from unimodal')
plt.xlabel('Number of molecules')
plt.ylabel('Density')
plt.tight_layout()

ppc.png

Fit a dependent expression model.

def loss(a, x, s):
  return -np.array(scmodes.ebpm.ebpm_unimodal(x, np.exp(a * np.log(s))).rx2('loglik'))[0]

opt = so.minimize_scalar(loss, bracket=[0, 1], method='brent', args=(x, s))
assert opt.success
opt.x, opt.fun, -unimodal_res.rx2('loglik')[0]
(1.0177157921515285, 31202.115784911897, 31198.919049387503)

iPSC example

Pick a different, better-behaved example.

ipsc = anndata.read_h5ad('/project2/mstephens/aksarkar/projects/singlecell-ideas/data/ipsc/ipsc.h5ad')
gene = 'PACRG'
x = ipsc[:,ipsc.var['name'] == gene].X.A.ravel()
s = ipsc.obs['mol_hs'].values.ravel()
y = np.arange(x.max() + 1)
grid = np.linspace(0, (x / s).max(), 1000)
pmf = dict()
erpp = dict()
cdf = dict()

gamma_res = scmodes.ebpm.ebpm_gamma(x, s)
pmf['Gamma'] = np.array([scmodes.benchmark.gof._zig_pmf(k, size=s, log_mu=gamma_res[0], log_phi=-gamma_res[1]).mean()
                         for k in y])
cdf['Gamma'] = st.gamma(a=np.exp(gamma_res[1]), scale=np.exp(gamma_res[0] - gamma_res[1])).cdf(grid)
erpp['Gamma'] = (scmodes.benchmark.gof._zig_cdf(x - 1, size=s, log_mu=gamma_res[0], log_phi=-gamma_res[1]) +
                 .5 * scmodes.benchmark.gof._zig_pmf(x, size=s, log_mu=gamma_res[0], log_phi=-gamma_res[1]))

point_gamma_res = scmodes.ebpm.ebpm_point_gamma(x, s)
cdf['ZIG'] = sp.expit(point_gamma_res[2]) + sp.expit(-point_gamma_res[2]) * st.gamma(a=np.exp(point_gamma_res[1]), scale=np.exp(point_gamma_res[0] - point_gamma_res[1])).cdf(grid)
pmf['ZIG'] = np.array([scmodes.benchmark.gof._zig_pmf(k, size=s, log_mu=point_gamma_res[0], log_phi=-point_gamma_res[1], logodds=point_gamma_res[2]).mean()
                       for k in y])
erpp['ZIG'] = (scmodes.benchmark.gof._zig_cdf(x - 1, size=s, log_mu=point_gamma_res[0], log_phi=-point_gamma_res[1], logodds=point_gamma_res[2]) +
               .5 * scmodes.benchmark.gof._zig_pmf(x, size=s, log_mu=point_gamma_res[0], log_phi=-point_gamma_res[1], logodds=-point_gamma_res[2]))

zief_res = scmodes.ebpm.ebpm_point_expfam(x, s)
pmf['ZIEF'] = np.array([scmodes.benchmark.gof._point_expfam_pmf(np.full(x.shape, k), size=s, res=zief_res).mean()
                        for k in y])
cdf['ZIEF'] = np.interp(grid, np.array(zief_res.slots['distribution'])[:,0], np.cumsum(np.array(zief_res.slots['distribution'])[:,1]))
erpp['ZIEF'] = (scmodes.benchmark.gof._point_expfam_cdf(x - 1, size=s, res=zief_res) +
                .5 * scmodes.benchmark.gof._point_expfam_pmf(x, size=s, res=zief_res))

unimodal_res = scmodes.ebpm.ebpm_unimodal(x, s)
g = np.array(unimodal_res.rx2('fitted_g'))
g = g[:,g[0] > 1e-8]
a = np.fmin(g[1], g[2])
b = np.fmax(g[1], g[2])
comp_dens_conv = np.array([((st.gamma(a=k + 1, scale=1 / s.reshape(-1, 1)).cdf(b.reshape(1, -1)) - st.gamma(a=k + 1, scale=1 / s.reshape(-1, 1)).cdf(a.reshape(1, -1))) / np.outer(s, b - a)).mean(axis=0) for k in y])
comp_dens_conv[:,0] = st.poisson(mu=s.reshape(-1, 1) * b[0]).pmf(y).mean(axis=0)
pmf['Unimodal'] = comp_dens_conv @ g[0]
cdf['Unimodal'] = ashr.cdf_ash(unimodal_res, grid).rx2('y').ravel()
erpp['Unimodal'] = (scmodes.benchmark.gof._ash_cdf(x - 1, s=s, fit=unimodal_res) +
                    .5 * scmodes.benchmark.gof._ash_pmf(x, s=s, fit=unimodal_res))

npmle_res = scmodes.ebpm.ebpm_npmle(x, s, step=1e-6)
g = np.array(npmle_res.rx2('fitted_g'))
g = g[:,g[0] > 1e-8]
a = np.fmin(g[1], g[2])
b = np.fmax(g[1], g[2])
comp_dens_conv = np.array([((st.gamma(a=k + 1, scale=1 / s.reshape(-1, 1)).cdf(b.reshape(1, -1)) - st.gamma(a=k + 1, scale=1 / s.reshape(-1, 1)).cdf(a.reshape(1, -1))) / np.outer(s, b - a)).mean(axis=0) for k in y])
pmf['NPMLE'] = comp_dens_conv @ g[0]
cdf['NPMLE'] = ashr.cdf_ash(npmle_res, grid).rx2('y').ravel()
erpp['NPMLE'] = (scmodes.benchmark.gof._ash_cdf(x - 1, s=s, fit=npmle_res) +
                 .5 * scmodes.benchmark.gof._ash_pmf(x, s=s, fit=npmle_res))
plt.clf()
fig, ax = plt.subplots(2, 1)
fig.set_size_inches(4, 4)
ax[0].hist(x, bins=y, color='k', label=None)
for c, k in zip(plt.get_cmap('Dark2').colors, ['Gamma', 'ZIG', 'Unimodal', 'ZIEF', 'NPMLE']):
  ax[0].plot(y + .5, x.shape[0] * pmf[k], color=c, lw=1, label=k)
ax[0].legend(frameon=False)
ax[0].set_title(gene)
ax[0].set_xlabel('Number of molecules')
ax[0].set_ylabel('Number of cells')

for c, k in zip(plt.get_cmap('Dark2').colors, ['Gamma', 'ZIG', 'Unimodal', 'ZIEF', 'NPMLE']):
  ax[1].plot(grid, cdf[k], color=c, lw=1, label=k)
ax[1].set_xlabel('Latent gene expression')
ax[1].set_ylabel('CDF')
ax[1].legend(frameon=False)
fig.tight_layout()

ipsc-ex.png

opt = so.minimize_scalar(lambda a: -scmodes.ebpm.ebpm_point_gamma(x, np.exp((1 - a) * np.log(s)))[-1], bracket=[0, 1])
opt
fun: 19321.753576794905
nfev: 16
nit: 12
success: True
x: 0.7713271091374253
point_gamma_res = scmodes.ebpm.ebpm_point_gamma(x, np.exp((1 - opt.x) * np.log(s)))
k = rf'ZIG ($\alpha$ = {opt.x:.3g})'
plt.clf()
fig, ax = plt.subplots(2, 1)
fig.set_size_inches(4, 4)
ax[0].hist(x, bins=y, color='k', label=None)
for c, k in zip(plt.get_cmap('Paired').colors, pmf):
  if k.startswith('ZIG'):
    ax[0].plot(y + .5, x.shape[0] * pmf[k], color=c, lw=1, label=k)
ax[0].legend(frameon=False)
ax[0].set_title(gene)
ax[0].set_xlabel('Number of molecules')
ax[0].set_ylabel('Number of cells')

for q in (.1, .25, .5, .75, .9):
  temp = np.percentile(sp.expit(point_gamma_res[2]) + sp.expit(-point_gamma_res[2]) * st.gamma(a=np.exp(point_gamma_res[1]), scale=np.exp(point_gamma_res[0] - point_gamma_res[1]) / np.exp(opt.x * np.log(s)).reshape(-1, 1)).cdf(grid), 100 * q, axis=0)
  ax[1].plot(grid, temp, color=colorcet.cm['fire'](q), lw=1, label=f'{q:.2g}')
ax[1].set_ylim(0, 1)
ax[1].set_xlabel('Latent gene expression')
ax[1].set_ylabel('CDF')
ax[1].legend(frameon=False)
fig.tight_layout()

ipsc-ex2.png

Author: Abhishek Sarkar

Created: 2020-08-08 Sat 14:16

Validate