Manuscript figures

Setup
Figure 1
- Preprint
- Revision
Figure 2

Setup

import anndata
import numpy as np
import os
import pandas as pd
import scanpy as sc
import scmodes
import scipy.stats as st
import scipy.special as sp
import sqlite3

import rpy2.robjects.packages
import rpy2.robjects.pandas2ri
rpy2.robjects.pandas2ri.activate()
ashr = rpy2.robjects.packages.importr('ashr')

%matplotlib inline
%config InlineBackend.figure_formats = set(['svg'])

import matplotlib.pyplot as plt
plt.rcParams['figure.facecolor'] = 'w'
plt.rcParams['font.family'] = 'Nimbus Sans'

Figure 1

Preprint

Draw an example of our approach to testing GOF using real data.

gene_info = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-genes.txt.gz', sep='\t', index_col=0)

cytotoxic_t = scmodes.dataset.read_10x('/project2/mstephens/aksarkar/projects/singlecell-ideas/data/10xgenomics/cytotoxic_t/filtered_matrices_mex/hg19/', return_df=True)

gene = 'ENSG00000109475'
x = cytotoxic_t.loc[:,gene]
s = cytotoxic_t.sum(axis=1)
y = np.arange(x.max() + 1)

rpp = dict()
pmf = dict()
np.random.seed(1)

gamma_res = scmodes.ebpm.ebpm_gamma(x, s)
rpp['Gamma'] = scmodes.benchmark.gof._rpp(
  scmodes.benchmark.gof._zig_cdf(x - 1, size=s, log_mu=gamma_res[0], log_phi=-gamma_res[1]),
  scmodes.benchmark.gof._zig_pmf(x, size=s, log_mu=gamma_res[0], log_phi=-gamma_res[1]))
pmf['Gamma'] = np.array([scmodes.benchmark.gof._zig_pmf(k, size=s, log_mu=gamma_res[0], log_phi=-gamma_res[1]).mean()
                         for k in y])

point_gamma_res = scmodes.ebpm.ebpm_point_gamma(x, s)
rpp['ZIG'] = scmodes.benchmark.gof._rpp(
  scmodes.benchmark.gof._zig_cdf(x - 1, size=s, log_mu=point_gamma_res[0], log_phi=-point_gamma_res[1], logodds=point_gamma_res[2]),
  scmodes.benchmark.gof._zig_pmf(x, size=s, log_mu=point_gamma_res[0], log_phi=-point_gamma_res[1], logodds=point_gamma_res[2]))
pmf['ZIG'] = np.array([scmodes.benchmark.gof._zig_pmf(k, size=s, log_mu=point_gamma_res[0], log_phi=-point_gamma_res[1], logodds=point_gamma_res[2]).mean()
                       for k in y])

zief_res = scmodes.ebpm.ebpm_point_expfam(x, s)
rpp['ZIEF'] = scmodes.benchmark.gof._rpp(
  scmodes.benchmark.gof._point_expfam_cdf(x.values.ravel() - 1, size=s, res=zief_res),
  scmodes.benchmark.gof._point_expfam_pmf(x.values.ravel(), size=s, res=zief_res))
# We need np.full here because _point_expfam_pmf does not broadcast
pmf['ZIEF'] = np.array([scmodes.benchmark.gof._point_expfam_pmf(np.full(x.shape, k), size=s, res=zief_res).mean()
                        for k in y])

unimodal_res = scmodes.ebpm.ebpm_unimodal(x, s)
rpp['Unimodal'] = scmodes.benchmark.gof._rpp(
  scmodes.benchmark.gof._ash_cdf(x - 1, s=s, fit=unimodal_res),
  scmodes.benchmark.gof._ash_pmf(x, s=s, fit=unimodal_res))
# It is simpler to compute this here than to mess with the ash_data object
g = np.array(unimodal_res.rx2('fitted_g'))
a = np.fmin(g[1], g[2])
b = np.fmax(g[1], g[2])
comp_dens_conv = np.array([((st.gamma(a=k + 1, scale=1 / s.values.reshape(-1, 1)).cdf(b.reshape(1, -1)) - st.gamma(a=k + 1, scale=1 / s.values.reshape(-1, 1)).cdf(a.reshape(1, -1))) / np.outer(s, b - a)).mean(axis=0) for k in y])
comp_dens_conv[:,0] = st.poisson(mu=s.values.reshape(-1, 1) * b[0]).pmf(y).mean(axis=0)
pmf['Unimodal'] = comp_dens_conv @ g[0]

npmle_res = scmodes.ebpm.ebpm_npmle(x, s)
rpp['NPMLE'] = scmodes.benchmark.gof._rpp(
  scmodes.benchmark.gof._ash_cdf(x - 1, s=s, fit=npmle_res),
  scmodes.benchmark.gof._ash_pmf(x, s=s, fit=npmle_res))
g = np.array(npmle_res.rx2('fitted_g'))
a = np.fmin(g[1], g[2])
b = np.fmax(g[1], g[2])
comp_dens_conv = np.array([((st.gamma(a=k + 1, scale=1 / s.values.reshape(-1, 1)).cdf(b.reshape(1, -1)) - st.gamma(a=k + 1, scale=1 / s.values.reshape(-1, 1)).cdf(a.reshape(1, -1))) / np.outer(s, b - a)).mean(axis=0) for k in y])
pmf['NPMLE'] = comp_dens_conv @ g[0]

plt.clf()
fig, ax = plt.subplots(2, 1)
fig.set_size_inches(4, 4)
ax[0].hist(x, bins=y, color='k', label=None)
for c, k in zip(plt.get_cmap('Dark2').colors, ['Gamma', 'ZIG', 'Unimodal', 'ZIEF', 'NPMLE']):
  ax[0].plot(y + .5, x.shape[0] * pmf[k], color=c, lw=1, label=k)
ax[0].legend(frameon=False)
ax[0].set_title(gene_info.loc[gene, 'name'])
ax[0].set_xlabel('Number of molecules')
ax[0].set_ylabel('Number of cells')

for c, k in zip(plt.get_cmap('Dark2').colors, ['Gamma', 'ZIG', 'Unimodal', 'ZIEF', 'NPMLE']):
  ax[1].plot(*getattr(scmodes.deconvolve, f'fit_{k.lower()}')(x, s), color=c, lw=1, label=k)
ax[1].set_xlabel('Latent gene expression')
ax[1].set_ylabel('CDF')
ax[1].legend(frameon=False)
fig.tight_layout()

plt.clf()
plt.gcf().set_size_inches(3, 3)
grid = np.linspace(0, 1, x.shape[0] + 1)[1:]
plt.xscale('log')
plt.yscale('log')
for c, k in zip(plt.get_cmap('Dark2').colors, ['Gamma', 'ZIG', 'Unimodal', 'ZIEF', 'NPMLE']):
  if k in rpp:
    plt.plot(grid, np.sort(rpp[k]), color=c, lw=1, marker=None, label=k)
plt.plot([1e-5, 1], [1e-5, 1], lw=1, ls=':', c='0.5')
plt.legend(frameon=False)
plt.xlim(1e-5, 1)
plt.ylim(1e-5, 1)
plt.xlabel('Theoretical quantiles')
plt.ylabel('Randomized quantiles')
plt.tight_layout()

Draw the histogram of GOF test \(p\)-values.

gof_res = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-modes/data/gof/gof.txt.gz', sep='\t', index_col=0)
plt.clf()
fig, ax = plt.subplots(1, 6, sharey=True)
fig.set_size_inches(8, 2)
for a, t, (k, g) in zip(ax.ravel(),
                        ['T cell', 'B cell', 'iPSC', 'T cell/B cell', 'Cytotoxic/naive T', 'PBMC'],
                        gof_res.groupby('dataset')):
  a.hist(g.loc[g['method'] == 'gamma', 'p'], bins=np.linspace(0, 1, 11), color='0.7', density=True)
  a.axhline(y=1, c='k', ls=':', lw=1)
  a.set_xlim([0, 1])
  a.set_title(t)
ax[0].set_ylabel('Density')
for a in ax:
  a.set_xlabel('$p$-value')
fig.tight_layout()

Draw examples of multi-modal gene expression in iPSCs

ipsc_res = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-modes/data/deconv-generalization/ipsc-gpu.txt.gz', index_col=0, sep='\t')
lrt = st.chi2(1).sf(-2 * (ipsc_res['gamma'] - ipsc_res['point_gamma']))
query = list(ipsc_res[lrt < .05 / lrt.shape[0]].index)

# SKP1: top eQTL
query.append('ENSG00000113558')
ipsc_counts = scmodes.dataset.ipsc(prefix='/project2/mstephens/aksarkar/projects/singlecell-qtl/data/',
                                   query=query, return_df=True)

annotation = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-annotation.txt', sep='\t')
keep_samples = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/quality-single-cells.txt', index_col=0, header=None, sep='\t')
annotation = annotation.loc[keep_samples.values.ravel()]
s = annotation['mol_hs']
donor_info = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-modes/data/20130606_sample_info.txt', sep='\t')

ipsc_log_mu = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/density-estimation/design1/zi2-log-mu.txt.gz', sep=' ', index_col=0)
ipsc_log_phi = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/density-estimation/design1/zi2-log-phi.txt.gz', sep=' ', index_col=0)
ipsc_logodds = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/density-estimation/design1/zi2-logodds.txt.gz', sep=' ', index_col=0)
del ipsc_log_mu['NA18498']

def point_gamma_cdf(grid, log_mu, log_phi, logodds):
  res = st.gamma(a=np.exp(-log_phi), scale=np.exp(log_mu + log_phi)).cdf(grid)
  res *= sp.expit(-logodds)
  res += sp.expit(logodds)
  return res

lam = ipsc_counts.values / s.values.reshape(-1, 1)
grid = np.linspace(lam.min(), lam.max(), 1000)
cdfs = np.array([[point_gamma_cdf(grid,
                                  ipsc_log_mu.loc[j, k],
                                  ipsc_log_phi.loc[j, k],
                                  ipsc_logodds.loc[j, k])
                  for k in ipsc_log_mu]
                 for j in ipsc_counts.columns])

colors = {k: 'r' if v == 'male' else 'b' for k, v in
          donor_info.set_index('Sample')
          .filter(items=list(ipsc_log_mu.columns), axis=0)
          ['Gender']
          .iteritems()}

k = 'ENSG00000129824'
x = ipsc_counts.loc[:,k].values
plt.clf()
fig, ax = plt.subplots(2, 1)
fig.set_size_inches(4, 4)

ax[0].hist(x, bins=np.arange(x.max() + 1), color='k')
ax[0].set_title(gene_info.loc[k, 'name'])
ax[0].set_xlabel('Number of molecules')
ax[0].set_ylabel('Number of cells')

for f, k in zip(cdfs[2], ipsc_log_mu.columns):
  ax[1].plot(grid, f, lw=1, alpha=0.2, c=colors.get(k, 'k'))
ax[1].set_xlim(0, 2e-4)
ax[1].set_xticks(np.linspace(0, 2e-4, 3))
dummy = [plt.Line2D([0], [0], c='r'), plt.Line2D([0], [0], c='b')]
ax[1].legend(dummy, ['Male', 'Female'], title='Sex', frameon=False)
ax[1].set_ylabel('CDF')
ax[1].set_xlabel('Latent gene expression')

fig.tight_layout()

Include a gene with an eQTL to illustrate how marginally “multi-modal” might not be the correct question to ask.

mean_qtl_stats = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-mapping/zinb/mean.txt.gz', sep=' ', index_col=0)

with sqlite3.connect('/project2/mstephens/aksarkar/projects/singlecell-qtl/browser/browser.db') as conn:
  geno = pd.read_sql('select * from mean_qtl_geno where gene = ?', con=conn, params=('ENSG00000113558',)).set_index('ind')['value']

cm = plt.get_cmap('Dark2')
k = 'ENSG00000113558'
x = ipsc_counts.loc[:,k].values
plt.clf()
fig, ax = plt.subplots(2, 1)
fig.set_size_inches(4, 4)

ax[0].hist(x, bins=np.arange(x.max() + 1), color='k')
ax[0].set_title(gene_info.loc[k, 'name'])
ax[0].set_xlabel('Number of molecules')
ax[0].set_ylabel('Number of cells')

for f, k in zip(cdfs[1], ipsc_log_mu.columns):
  ax[1].plot(grid, f, lw=1, alpha=0.3, c=cm(int(geno[k])))
dummy = [plt.Line2D([0], [0], c=cm(i)) for i in range(3)]
# https://www.ncbi.nlm.nih.gov/snp/?term=rs13356194
ax[1].legend(dummy, ['CC', 'CT', 'TT'], title=mean_qtl_stats.loc['ENSG00000113558', 'id'].split('.')[0], frameon=False)
ax[1].set_ylabel('CDF')
ax[1].set_xlabel('Latent gene expression')

fig.tight_layout()

Draw examples of bimodal genes detected in an in silico mixture.

def _mix_10x(k1, k2, min_detect=0.01, return_y=False):
  x1 = scmodes.dataset.read_10x(f'/project2/mstephens/aksarkar/projects/singlecell-ideas/data/10xgenomics/{k1}/filtered_matrices_mex/hg19/', return_df=True, min_detect=0)
  x2 = scmodes.dataset.read_10x(f'/project2/mstephens/aksarkar/projects/singlecell-ideas/data/10xgenomics/{k2}/filtered_matrices_mex/hg19/', return_df=True, min_detect=0)
  x, y = scmodes.dataset.synthetic_mix(x1, x2, min_detect=min_detect)
  if return_y:
    return x, y
  else:
    return x

def _cd8_cd19_mix(**kwargs):
  return _mix_10x('cytotoxic_t', 'b_cells', **kwargs)

mix_x, mix_y = _cd8_cd19_mix(return_y=True)

gof_res = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-modes/data/gof/gof.txt.gz', sep='\t', index_col=0)

sig0 = gof_res[gof_res['method'] == 'gamma'].groupby('dataset').apply(lambda x: x.loc[x['p'] < 0.05 / x.shape[0]]).reset_index(drop=True)
nsig1 = gof_res[gof_res['method'] == 'zig'].groupby('dataset').apply(lambda x: x.loc[x['p'] >= 0.05 / x.shape[0]]).reset_index(drop=True)
examples = sig0.merge(nsig1, on=['dataset', 'gene'], suffixes=['_gamma', '_zig'])
del examples['method_gamma']
del examples['method_zig']

plt.clf()
fig, ax = plt.subplots(2, 2)
fig.set_size_inches(4, 4)
for a, k in zip(ax.ravel(), examples.sort_values('p_gamma')['gene'].head(n=4)):
  for i, (l, c) in enumerate(zip(['B cell', 'T cell'], ['r', 'k'])):
    x = mix_x.loc[mix_y == i,k]
    a.hist(x, bins=np.arange(x.max() + 1), color=c, label=l, alpha=0.5)
    a.set_title(gene_info.loc[k, 'name'])
for a in ax:
  a[0].set_ylabel('Number of cells')
for a in ax.T:
  a[-1].set_xlabel('Number of molecules')
ax[0,0].legend(frameon=False)
fig.tight_layout()

Revision

Pick a better example gene.

k = 'ENSG00000129824'
x = ipsc_counts.loc[:,k].values
y = np.arange(x.max() + 1)

grid = np.linspace(0, (x / s).max(), 1000)
pmf = dict()
cdf = dict()

gamma_res = scmodes.ebpm.ebpm_gamma(x, s)
pmf['Gamma'] = np.array([scmodes.benchmark.gof._zig_pmf(k, size=s, log_mu=gamma_res[0], log_phi=-gamma_res[1]).mean()
                         for k in y])
cdf['Gamma'] = st.gamma(a=np.exp(gamma_res[1]), scale=np.exp(gamma_res[0] - gamma_res[1])).cdf(grid)

point_gamma_res = scmodes.ebpm.ebpm_point_gamma(x, s)
cdf['Point-Gamma'] = sp.expit(point_gamma_res[2]) + sp.expit(-point_gamma_res[2]) * st.gamma(a=np.exp(point_gamma_res[1]), scale=np.exp(point_gamma_res[0] - point_gamma_res[1])).cdf(grid)
pmf['Point-Gamma'] = np.array([scmodes.benchmark.gof._zig_pmf(k, size=s, log_mu=point_gamma_res[0], log_phi=-point_gamma_res[1], logodds=point_gamma_res[2]).mean()
                       for k in y])

unimodal_res = scmodes.ebpm.ebpm_unimodal(x, s)
g = np.array(unimodal_res.rx2('fitted_g'))
g = g[:,g[0] > 1e-8]
a = np.fmin(g[1], g[2])
b = np.fmax(g[1], g[2])
comp_dens_conv = np.array([((st.gamma(a=k + 1, scale=1 / s.values.reshape(-1, 1)).cdf(b.reshape(1, -1)) - st.gamma(a=k + 1, scale=1 / s.values.reshape(-1, 1)).cdf(a.reshape(1, -1))) / np.outer(s, b - a)).mean(axis=0) for k in y])
comp_dens_conv[:,0] = st.poisson(mu=s.values.reshape(-1, 1) * b[0]).pmf(y).mean(axis=0)
pmf['Unimodal'] = comp_dens_conv @ g[0]
cdf['Unimodal'] = ashr.cdf_ash(unimodal_res, grid).rx2('y').ravel()

npmle_res = scmodes.ebpm.ebpm_npmle(x, s, K=512, max_grid_updates=5, tol=1e-5)
g = np.array(npmle_res.rx2('fitted_g'))
g = g[:,g[0] > 1e-8]
a = np.fmin(g[1], g[2])
b = np.fmax(g[1], g[2])
comp_dens_conv = np.array([((st.gamma(a=k + 1, scale=1 / s.values.reshape(-1, 1)).cdf(b.reshape(1, -1)) - st.gamma(a=k + 1, scale=1 / s.values.reshape(-1, 1)).cdf(a.reshape(1, -1))) / np.outer(s, b - a)).mean(axis=0) for k in y])
pmf['Non-parametric'] = comp_dens_conv @ g[0]
cdf['Non-parametric'] = ashr.cdf_ash(npmle_res, grid).rx2('y').ravel()

cm = plt.get_cmap('Dark2')
plt.clf()
fig, ax = plt.subplots(1, 2)
fig.set_size_inches(7.5, 2)
ax[0].hist(x, bins=y, color='k', label=None)
for c, k in zip(cm.colors, pmf):
  ax[0].plot(y + .5, x.shape[0] * pmf[k], color=c, lw=1, label=k)
ax[0].legend(frameon=False)
ax[0].set_title(gene_info.loc['ENSG00000129824', 'name'])
ax[0].set_xlabel('Number of molecules')
ax[0].set_ylabel('Number of cells')

for c, k in zip(cm.colors, cdf):
  ax[1].plot(grid, cdf[k], color=c, lw=1, label=k)
ax[1].set_title(gene_info.loc['ENSG00000129824', 'name'])
ax[1].set_xlabel('Latent gene expression')
ax[1].set_ylabel('CDF')
ax[1].legend(frameon=False)
fig.tight_layout()

Plot the NB measurement model results.

fits = dict()
for k in control:
  fits[k] = np.load(f'/project2/mstephens/aksarkar/projects/singlecell-modes/data/ebnbm/nb-heuristic-{k}.npy')

labels = ['Chromium (1)', 'Chromium (2)', 'Drop-Seq', 'GemCode', 'InDrops']
keys = ['chromium1', 'chromium2', 'dropseq', 'gemcode', 'indrops']

cm = plt.get_cmap('Dark2')
plt.clf()
plt.gcf().set_size_inches(5.5, 2.5)
plt.xscale('log')
grid = np.logspace(-3, -2, 100)
for i, (k, l) in enumerate(zip(keys, labels)):
  temp = fits[k][:,:,-1].sum(axis=1) / np.log(10)
  diff = temp - temp.max()
  plt.plot(grid, diff, lw=1, label=l, c=cm(i + 3))
plt.ylim(-10, 0)
plt.axhline(y=-1, lw=1, ls=':', c='k')
plt.legend(title='Dataset', frameon=False, loc='center left', bbox_to_anchor=(1, .5), ncol=2)
plt.xlabel(r'Measurement dispersion')
plt.ylabel('Difference log$_{10}$ likelihood\nfrom best')
plt.tight_layout()

Plot the model comparison results.

thresh = 10
query1 = compare(llik[llik['dataset'].isin(control)], baseline='gamma', thresh=thresh, agg='mean')
keys = ['chromium1', 'chromium2', 'dropseq', 'gemcode', 'indrops']
labels = ['Chromium (1)', 'Chromium (2)', 'Drop-Seq', 'GemCode', 'InDrops']
plt.clf()
plt.gcf().set_size_inches(2.5, 3)
plt.bar(range(query1.shape[0]), query1.loc[keys, 'point_gamma'], color='k')
plt.xticks(range(query1.shape[0]), labels, rotation=90)
plt.title('Point-Gamma vs.\nGamma')
plt.ylabel(f'Fraction of genes with\nlikelihood ratio > {thresh}')
plt.tight_layout()

thresh = 100
keys = ['b_cells', 'cytotoxic_t', 'ipsc', 'cytotoxic_t-b_cells', 'cytotoxic_t-naive_t', 'kidney', 'pbmc_10k_v3', 'brain', 'retina']
labels = ['B cell', 'T cell', 'iPSC', 'T cell/B cell', 'Cyto/naive T', 'Kidney', 'PBMC', 'Brain', 'Retina']
query1 = compare(llik[llik['dataset'].isin(keys)], baseline='gamma', thresh=thresh, agg='mean')
query2 = compare(llik[llik['dataset'].isin(keys)], baseline='unimodal', thresh=thresh, agg='mean')

plt.clf()
fig, ax = plt.subplots(1, 3)
fig.set_size_inches(7.5, 3)
ax[0].bar(range(query1.shape[0]), query1.loc[keys, 'point_gamma'], color='k')
ax[0].set_title('Point-Gamma vs.\nGamma')

ax[1].bar(range(query1.shape[0]), query1.loc[keys, 'unimodal'], color='k')
ax[1].set_title('Unimodal vs.\nGamma')

ax[2].bar(range(query2.shape[0]), query2.loc[keys, 'npmle'], color='k')
ax[2].set_title('Non-parametric vs.\nUnimodal')

for a in ax:
  a.set_xticks(range(query2.shape[0]))
  a.set_xticklabels(labels, rotation=90)
  a.set_xlabel('Dataset')
  a.set_ylabel(f'Fraction of genes with\nlikelihood ratio > {thresh}')
fig.tight_layout()

thresh = 10
keys = ['b_cells', 'cytotoxic_t', 'ipsc', 'cytotoxic_t-b_cells', 'cytotoxic_t-naive_t', 'kidney', 'pbmc_10k_v3', 'brain', 'retina']
labels = ['B cell', 'T cell', 'iPSC', 'T cell/B cell', 'Cyto/naive T', 'Kidney', 'PBMC', 'Brain', 'Retina']
query1 = compare(llik[llik['dataset'].isin(keys)], baseline='gamma', thresh=thresh, agg='mean')

plt.clf()
plt.gcf().set_size_inches(2.5, 3)
plt.bar(range(query1.shape[0]), query1.loc[keys, 'point_gamma'], color='k')
plt.title('Point-Gamma vs.\nGamma')
plt.xticks(range(query1.shape[0]), labels, rotation=90)
plt.xlabel('Dataset')
plt.ylabel(f'Fraction of genes with\nlikelihood ratio > {thresh}')
plt.tight_layout()

Plot an example of unimodal over Gamma.

dat = data['pbmc_10k_v3']()
pmf = dict()
cdf = dict()
gene = 'ENSG00000163736'

x = dat[:,dat.var['ensg'] == gene].X.A.ravel()
s = dat.obs['size']
y = np.arange(x.max() + 1)
lam = x / s
grid = np.logspace(-11, -3, 1000)

print(f'fitting {gene} (Gamma)')
temp = scmodes.ebpm.ebpm_gamma(x, s, tol=1e-5, extrapolate=True)
cdf['Gamma'] = st.gamma(a=np.exp(temp[1]), scale=np.exp(temp[0] - temp[1])).cdf(grid)
pmf['Gamma'] = np.array([scmodes.benchmark.gof._zig_pmf(k, size=s, log_mu=temp[0], log_phi=-temp[1]).mean()
                      for k in y])

print(f'fitting {gene} (Point-Gamma)')
temp = scmodes.ebpm.ebpm_point_gamma(x, s, tol=1e-5, extrapolate=True)
cdf['Point-Gamma'] = sp.expit(temp[2]) + sp.expit(-temp[2]) * st.gamma(a=np.exp(temp[1]), scale=np.exp(temp[0] - temp[1])).cdf(grid)
pmf['Point-Gamma'] = np.array([scmodes.benchmark.gof._zig_pmf(k, size=s, log_mu=temp[0], log_phi=-temp[1], logodds=temp[2]).mean()
                      for k in y])

print(f'fitting {gene} (Unimodal)')
unimodal_res = scmodes.ebpm.ebpm_unimodal(x, s, mixcompdist='halfuniform')
g = np.array(unimodal_res.rx2('fitted_g'))
a = np.fmin(g[1], g[2])
b = np.fmax(g[1], g[2])
comp_dens_conv = np.array([((st.gamma(a=k + 1, scale=1 / s.values.reshape(-1, 1)).cdf(b.reshape(1, -1)) - st.gamma(a=k + 1, scale=1 / s.values.reshape(-1, 1)).cdf(a.reshape(1, -1))) / np.outer(s, b - a)).mean(axis=0) for k in y])
comp_dens_conv[:,0] = st.poisson(mu=s.values.reshape(-1, 1) * b[0]).pmf(y).mean(axis=0)
pmf['Unimodal'] = comp_dens_conv @ g[0]
cdf['Unimodal'] = ashr.cdf_ash(unimodal_res, grid).rx2('y').ravel()

cm = plt.get_cmap('Dark2')
plt.clf()
fig, ax = plt.subplots(2, 1)
fig.set_size_inches(3.75, 4)

ax[0].hist(x, bins=y, color='k')
ax[1].hist(x, bins=y, color='0.7')
for i, (k, ls) in enumerate(zip(pmf, ['-', (0, (3, 3)), '-'])):
  ax[1].plot(y + 0.5, x.shape[0] * pmf[k], lw=1, ls=ls, c=cm(i), label=k)
ax[0].set_xlabel('Number of molecules')
ax[0].set_ylabel('Number of cells')
ax[0].set_title(dat.var.loc[dat.var['ensg'] == gene, 'name'][0])
ax[1].legend(frameon=False)
ax[1].set_xlabel('Number of molecules')
ax[1].set_ylabel('Number of cells')
ax[1].set_ylim(0, 10)
ax[1].set_yticks([0, 5, 10])
fig.tight_layout()

Figure 2

Plot the Poisson thinning benchmark.

pois_thin_res = (pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-modes/data/lra-generalization/lra-generalization.txt.gz', sep='\t', header=[0, 1], index_col=[0, 1, 2])
                 .xs(8, level=1)
                 ['validation']
                 [['nmf', 'glmpca', 'pvae', 'nbvae']])
del pois_thin_res['nbvae']
pois_thin_res = pois_thin_res.drop('liver-caudate-lobe', level=0)
titles = ['T cell', 'B cell', 'iPSC', 'T cell/\nB cell', 'Cytotoxic/\nnaive T', 'Kidney', 'PBMC', 'Brain', 'Retina']

plt.clf()
fig, ax = plt.subplots(1, len(titles), sharey=True)
fig.set_size_inches(8, 2.5)
for a, (k, g), t in zip(ax.ravel(), pois_thin_res.groupby(level=0, sort=False), titles):
  for x, m in enumerate(g.columns):
    a.scatter(x + np.random.normal(scale=0.1, size=g.shape[0]), g[m].values, s=4, c='k', zorder=3)
  a.grid(c='0.8', lw=1, axis='x')
  a.set_xlim(-0.5, 2.5)
  a.set_xticks(np.arange(3))
  a.set_xticklabels([m.upper() for m in g.columns], rotation=90)
  a.set_xlabel('Method')
  a.set_title(t)
ax[0].set_ylabel('Log likelihood of\nheld-out molecules')
fig.tight_layout()

Plot the distribution of log likelihood differences.

train = np.load('/scratch/midway2/aksarkar/modes/fitted-values/b_cells-train.npy')
test = np.load('/scratch/midway2/aksarkar/modes/fitted-values/b_cells-test.npy')
lam0 = np.load('/scratch/midway2/aksarkar/modes/fitted-values/b_cells-nmf-lam.npy')
lam1 = np.load('/scratch/midway2/aksarkar/modes/fitted-values/b_cells-glmpca-lam.npy')
lam2 = np.load('/scratch/midway2/aksarkar/modes/fitted-values/b_cells-pvae-lam.npy')

s = (test.sum(axis=1) / train.sum(axis=1)).reshape(-1, 1)
llik0 = np.ma.masked_invalid(st.poisson(s * lam0).logpmf(test))
llik1 = st.poisson(s * lam1).logpmf(test)
llik2 = st.poisson(s * lam2).logpmf(test)

%matplotlib agg
plt.clf()
fig, ax = plt.subplots(1, 2, sharex=True, sharey=True)
fig.set_size_inches(8, 2)

grid = np.arange(test.max())
for i in grid:
  ax[0].boxplot((llik1 - llik0)[test == i].ravel(), positions=[i], widths=[0.5], medianprops={'color': 'k'}, flierprops={'marker': '.', 'markersize': 2})
ax[0].axhline(y=0, ls=':', lw=1, c='r')
ax[0].set_xlabel('Observation in validation set')
ax[0].set_ylabel('Improvement log lik\nover NMF')
ax[0].set_title('GLMPCA')

for i in grid:
  ax[1].boxplot((llik2 - llik0)[test == i].ravel(), positions=[i], widths=[0.5], medianprops={'color': 'k'}, flierprops={'marker': '.', 'markersize': 2})
ax[1].axhline(y=0, ls=':', lw=1, c='r')
ax[1].set_title('PVAE')
ax[1].set_xticks(grid[::5])
ax[1].set_xticklabels(grid[::5].astype(int))
ax[1].set_xlim(-.5, grid.max() - .5)
ax[1].set_xlabel('Observation in validation set')

fig.tight_layout()
plt.savefig('/project2/mstephens/aksarkar/projects/singlecell-modes/org/figure/figures.org/b_cells-llik-diff.png', dpi=300, format='png')

Plot the fitted values against each other.

%matplotlib agg
plt.clf()
fig, ax = plt.subplots(1, 6, sharey=True)
fig.set_size_inches(8, 2)

lim = [0, 20]
lam0 = np.load('/scratch/midway2/aksarkar/modes/fitted-values/b_cells-nmf-lam.npy')
lam1 = np.load('/scratch/midway2/aksarkar/modes/fitted-values/b_cells-glmpca-lam.npy')
lam2 = np.load('/scratch/midway2/aksarkar/modes/fitted-values/b_cells-pvae-lam.npy')

ax[0].scatter(np.sqrt(lam1.ravel()), np.sqrt(lam0.ravel()), s=1, c='k', alpha=0.1)
ax[0].plot(lim, lim, lw=1, ls=':', c='r')
ax[0].set_xlabel(r'GLMPCA $\sqrt{\hat\lambda}$')
ax[0].set_ylabel(r'NMF $\sqrt{\hat\lambda}$')
ax[0].set_title('B cell')

ax[1].scatter(np.sqrt(lam2.ravel()), np.sqrt(lam0.ravel()), s=1, c='k', alpha=0.1)
ax[1].plot(lim, lim, lw=1, ls=':', c='r')
ax[1].set_xlabel(r'PVAE $\sqrt{\hat\lambda}$')
ax[1].set_title('B cell')

lam0 = np.load('/scratch/midway2/aksarkar/modes/fitted-values/cytotoxic_t-b_cells-nmf-lam.npy')
lam1 = np.load('/scratch/midway2/aksarkar/modes/fitted-values/cytotoxic_t-b_cells-glmpca-lam.npy')
lam2 = np.load('/scratch/midway2/aksarkar/modes/fitted-values/cytotoxic_t-b_cells-pvae-lam.npy')

ax[2].scatter(np.sqrt(lam1.ravel()), np.sqrt(lam0.ravel()), s=1, c='k', alpha=0.1)
ax[2].plot(lim, lim, lw=1, ls=':', c='r')
ax[2].set_xlabel(r'GLMPCA $\sqrt{\hat\lambda}$')
ax[2].set_title('T cell/B cell')

ax[3].scatter(np.sqrt(lam2.ravel()), np.sqrt(lam0.ravel()), s=1, c='k', alpha=0.1)
ax[3].plot(lim, lim, lw=1, ls=':', c='r')
ax[3].set_xlabel(r'PVAE $\sqrt{\hat\lambda}$')
ax[3].set_title('T cell/B cell')

lam0 = np.load('/scratch/midway2/aksarkar/modes/fitted-values/pbmcs_10k_v3-nmf-lam.npy')
lam1 = np.load('/scratch/midway2/aksarkar/modes/fitted-values/pbmcs_10k_v3-glmpca-lam.npy')
lam2 = np.load('/scratch/midway2/aksarkar/modes/fitted-values/pbmcs_10k_v3-pvae-lam.npy')

ax[4].scatter(np.sqrt(lam1.ravel()), np.sqrt(lam0.ravel()), s=1, c='k', alpha=0.1)
ax[4].plot(lim, lim, lw=1, ls=':', c='r')
ax[4].set_xlabel(r'GLMPCA $\sqrt{\hat\lambda}$')
ax[4].set_title('PBMC')

ax[5].scatter(np.sqrt(lam2.ravel()), np.sqrt(lam0.ravel()), s=1, c='k', alpha=0.1)
ax[5].plot(lim, lim, lw=1, ls=':', c='r')
ax[5].set_xlabel(r'PVAE $\sqrt{\hat\lambda}$')
ax[5].set_title('PBMC')

fig.tight_layout()
plt.savefig('/project2/mstephens/aksarkar/projects/singlecell-modes/org/figure/figures.org/fitted-values.png', dpi=300, format='png')