Deconvolution of near-Poisson data

Table of Contents

Introduction

Deconvolution of Poisson data assuming a point mass is trivial:

\begin{align*} x_i &\sim \operatorname{Poisson}(s_i \lambda_i)\\ \lambda_i &\sim \delta_\mu(\cdot)\\ \hat\mu &= \frac{\sum_i x_i}{\sum_i s_i} \end{align*}

More general approaches should all be able to fit this case. Here, we investigate cases where they fail.

Setup

import numpy as np
import pandas as pd
import scipy.special as sp
import scipy.stats as st
import scmodes
import scqtl.simple

import rpy2.robjects.packages
import rpy2.robjects.pandas2ri

rpy2.robjects.pandas2ri.activate()

ashr = rpy2.robjects.packages.importr('ashr')
descend = rpy2.robjects.packages.importr('descend')
%matplotlib inline
%config InlineBackend.figure_formats = set(['retina'])
import matplotlib.pyplot as plt
plt.rcParams['figure.facecolor'] = 'w'
plt.rcParams['font.family'] = 'Nimbus Sans'

Results

Extract the examples.

Read the results of benchmarking deconvolution methods on homogeneous tissues.

benchmark = {}
for data in ('cytotoxic_t', 'b_cells', 'ipsc'):
  benchmark[data] = (
    pd.read_csv(f'/project2/mstephens/aksarkar/projects/singlecell-modes/data/deconv-generalization/{data}-gpu.txt.gz', index_col=0, sep='\t')
    .merge(pd.read_csv(f'/project2/mstephens/aksarkar/projects/singlecell-modes/data/deconv-generalization/{data}-unimodal.txt.gz', index_col=0, sep='\t'), left_index=True, right_index=True)
    .merge(pd.read_csv(f'/project2/mstephens/aksarkar/projects/singlecell-modes/data/deconv-generalization/{data}-zief.txt.gz', index_col=0, sep='\t'), left_index=True, right_index=True)
    .merge(pd.read_csv(f'/project2/mstephens/aksarkar/projects/singlecell-modes/data/deconv-generalization/{data}-npmle.txt.gz', index_col=0, sep='\t'), left_index=True, right_index=True))

Extract the genes for which ZIEF does best.

query = benchmark['b_cells'][benchmark['b_cells']['zief'] > benchmark['b_cells']['gamma']].index

Read the count matrix.

b_cells = scmodes.dataset.read_10x('/project2/mstephens/aksarkar/projects/singlecell-ideas/data/10xgenomics/b_cells/filtered_matrices_mex/hg19/', return_df=True)
s = b_cells.sum(axis=1)

Read the gene metadata.

gene_info = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-genes.txt.gz', sep='\t', index_col=0)

Deconvolve specific examples

Fit various deconvolutions.

x = b_cells.loc[:,query[0]]
lam = x / s
K = 100
grid = np.linspace(0, lam.max(), K + 1)
gamma_res = scqtl.simple.fit_nb(x, s)
zig_res = scqtl.simple.fit_zinb(x, s)
unimodal_res = ashr.ash_workhorse(
    pd.Series(np.zeros(x.shape)),
    1,
    outputlevel=pd.Series(['fitted_g', 'loglik']),
    lik=ashr.lik_pois(y=x, scale=s, link='identity'),
    mixsd=pd.Series(np.geomspace(lam.min() + 1e-8, lam.max(), 25)),
    mode=pd.Series([lam.min(), lam.max()]))
zief_res = descend.deconvSingle(x, scaling_consts=s, verbose=False)
npmle_res = ashr.ash_workhorse(
    pd.Series(np.zeros(x.shape)),
    1,
    outputlevel=pd.Series(['fitted_g', 'loglik']),
    lik=ashr.lik_pois(y=x, scale=s, link='identity'),
    g=ashr.unimix(pd.Series(np.ones(K) / K), pd.Series(grid[:-1]), pd.Series(grid[1:])))

Discretize the CDFs.

grid = np.linspace(lam.min(), 1e-3, 1000)
gamma_cdf = st.gamma(a=gamma_res[1], scale=gamma_res[0] / gamma_res[1]).cdf(grid)
zig_cdf = gamma_cdf * sp.expit(-zig_res[2]) + sp.expit(zig_res[2])
unimodal_cdf = ashr.cdf_ash(unimodal_res, grid)
zief_g = np.array(zief_res.slots['distribution'])[:,:2]
zief_cdf = np.array([zief_g[:,0], np.cumsum(zief_g[:,1])])
npmle_cdf = ashr.cdf_ash(npmle_res, grid)
plt.clf()
fig, ax = plt.subplots(2, 1)
fig.set_size_inches(6, 4)
ax[0].hist(x, bins=np.arange(x.max() + 1), color='k')
ax[0].set_xlabel('Num mols')
ax[0].set_ylabel('Num cells')

cm = plt.get_cmap('Dark2').colors
ax[1].set_xlim(0, 1e-3)
ax[1].plot(grid, gamma_cdf, color=cm[0], lw=1, label='Gamma')
ax[1].plot(grid, zig_cdf, color=cm[1], lw=1, label='ZIG')
ax[1].plot(np.array(unimodal_cdf.rx2('x')),
           np.array(unimodal_cdf.rx2('y')).ravel(), c=cm[2], lw=1, label='Unimodal')
ax[1].plot(zief_cdf[0], zief_cdf[1], c=cm[3], lw=1, label='ZIEF')
ax[1].plot(np.array(npmle_cdf.rx2('x')),
           np.array(npmle_cdf.rx2('y')).ravel(), c=cm[4], lw=1, label='NPMLE')
ax[1].set_xlabel('Latent gene expression')
ax[1].set_ylabel('CDF')
ax[1].legend(frameon=False)

fig.tight_layout()

park7-example.png

Zoom in around 0.

plt.clf()
plt.gcf().set_size_inches(3, 3)
cm = plt.get_cmap('Dark2').colors
plt.xlim(0, 1e-4)
plt.ylim(0, .1)
plt.xticks([0, 5e-5, 1e-4])
plt.plot(grid, gamma_cdf, color=cm[0], lw=1, label='Gamma')
plt.plot(grid, zig_cdf, color=cm[1], lw=1, label='ZIG')
plt.plot(np.array(unimodal_cdf.rx2('x')),
         np.array(unimodal_cdf.rx2('y')).ravel(), c=cm[2], lw=1, label='Unimodal')
plt.plot(zief_cdf[0], zief_cdf[1], c=cm[3], lw=1, label='ZIEF')
plt.plot(np.array(npmle_cdf.rx2('x')),
         np.array(npmle_cdf.rx2('y')).ravel(), c=cm[4], lw=1, label='NPMLE')
plt.xlabel('Latent gene expression')
plt.ylabel('CDF')
plt.legend(frameon=False)
plt.tight_layout()

park7-example-inset.png

Report the training log likelihoods.

point_llik = st.poisson(mu=x.sum() / s.sum()).logpmf(x)
gamma_llik = st.nbinom(n=gamma_res[1], p=1 / (1 + s * gamma_res[0] / gamma_res[1])).logpmf(x)
zig_llik = np.where(x < 1,
                    -np.log1p(np.exp(-zig_res[2])) + np.log1p(np.exp(gamma_llik - zig_res[2])), 
                    -np.log1p(np.exp(zig_res[2])) + gamma_llik)
unimodal_llik = np.array(unimodal_res.rx2('loglik'))
zief_llik = np.where(x < 1,
                     np.log(st.poisson(mu=s.values.reshape(-1, 1) * zief_g[:,0]).pmf(x.values.reshape(-1, 1)).dot(zief_g[:,1])),
                     np.log(st.poisson(mu=s.values.reshape(-1, 1) * zief_g[1:,0]).pmf(x.values.reshape(-1, 1)).dot(zief_g[1:,1])))
npmle_llik = np.array(npmle_res.rx2('loglik'))
pd.Series({'Point': point_llik.sum(),
           'Gamma': gamma_llik.sum(),
           'ZIG': zig_llik.sum(),
           'Unimodal': unimodal_llik.sum(),
           'ZIEF': zief_llik.sum(),
           'NPMLE': npmle_llik.sum()})
Point      -31653.253236
Gamma       -7702.696681
ZIG         -7702.818529
Unimodal    -7702.849389
ZIEF        -7702.299609
NPMLE       -7701.940404
dtype: float64

The reason ZIEF appeared anomalous in our benchmark was that our implementation of the validation set log likelihood, marginalizing over the estimated latent distribution \(\hat{g}\), was incorrect.

Poisson ash

Draw some near-Poisson data.

data, _ = scqtl.simulation.simulate(num_samples=1000, logodds=-5, seed=2)
x = pd.Series(data[:,0])
s = pd.Series(data[:,1])
lam = x / s

Write out the data.

rpy2.robjects.r['saveRDS'](pd.DataFrame({'x': x, 's': s}), 'pois-mode-est.Rds')
rpy2.rinterface.NULL

Fit Poisson convolved with point mass.

fit0 = scqtl.simple.fit_pois(x, s)

Fit ZINB.

fit1 = scqtl.simple.fit_zinb(x, s)

Fit Poisson ash.

fit2 = ashr.ash_workhorse(
  # these are ignored by ash
  pd.Series(np.zeros(x.shape)),
  1,
  outputlevel=pd.Series(['loglik', 'fitted_g']),
  lik=ashr.lik_pois(y=x, scale=s, link='identity'),
  mixcompdist='halfuniform',
  mixsd=pd.Series(np.geomspace(lam.min() + 1e-8, lam.max(), 25)))

Report the log likelihood.

pd.Series({'pointmass': fit0[-1],
           'zig': fit1[-1],
           'unimodal': np.array(fit2.rx2('loglik'))[0]})
pointmass   -2008.663140
zig         -2008.407921
unimodal    -2077.552567
dtype: float64

Plot the data and estimated \(\hat{g}\). Also plot the estimated mode.

cm = plt.get_cmap('Dark2')
plt.clf()
fig, ax = plt.subplots(2, 1)
fig.set_size_inches(5, 4)
ax[0].hist(x.values, bins=np.arange(x.max() + 1), color='0.5')
ax[0].set_xlim(0, x.max())
ax[0].set_xlabel('Number of molecules')
ax[0].set_ylabel('Number of cells')

ax[1].axvline(x=fit0[0], c=cm(0), lw=1, ls='--', label='Point mass')
ax[1].plot(grid, zinb_cdf, lw=1, c=cm(1), label='ZIG')
ax[1].axvline(x=fit1[0], c=cm(1), ls=':', lw=1)
ax[1].plot(grid, Fx, lw=1, c=cm(2), label='Unimodal')
ax[1].axvline(x=np.array(fit2.rx2('fitted_g').rx2('a'))[0], c=cm(1), ls=':', lw=1)
ax[1].legend(frameon=False)
ax[1].set_xlabel('Latent gene expression')
ax[1].set_ylabel('CDF')

fig.tight_layout()

pois-point-unimodal-example.png

Examine mode estimation.

mode = ashr.ash_estmode(
  betahat=pd.Series(np.zeros(x.shape)),
  sebetahat=1,
  modemin=lam.min(),
  modemax=lam.max(),
  lik=ashr.lik_pois(y=x, scale=s, link='identity'),
  mixcompdist='halfuniform',
  mixsd=pd.Series(np.geomspace(lam.min() + 1e-8, lam.max(), 25)))
mode = np.array(mode)[0]
fit3 = ashr.ash_workhorse(
  # these are ignored by ash
  pd.Series(np.zeros(x.shape)),
  1,
  outputlevel=pd.Series(['loglik', 'fitted_g']),
  lik=ashr.lik_pois(y=x, scale=s, link='identity'),
  mixcompdist='halfuniform',
  mixsd=pd.Series(np.geomspace(lam.min() + 1e-8, lam.max(), 25)),
  mode=mode)

Evaluate the ash log likelihood on a grid of modes, over the space that ash.estmode optimizes over.

grid = np.linspace(lam.min(), lam.max(), 1000)[1:]
llik = []
for l in grid:
  fit = ashr.ash_workhorse(
         # these are ignored by ash
         pd.Series(np.zeros(x.shape)),
         1,
         outputlevel=pd.Series(['loglik']),
         lik=ashr.lik_pois(y=x, scale=s, link='identity'),
         mixcompdist='halfuniform',
         mixsd=pd.Series(np.geomspace(lam.min() + 1e-8, lam.max(), 25)),
         mode=l)
  llik.append(np.array(fit.rx2('loglik'))[0])
llik = np.array(llik)

Plot the marginal likelihood against the mode.

cm = plt.get_cmap('Dark2')
plt.clf()
fig, ax = plt.subplots(2, 1, sharex=True)
fig.set_size_inches(6, 3)
for a in ax:
  a.plot(grid, llik, lw=1, c='k')
  for i, (m, l) in enumerate(zip([fit0[0], fit1[0], np.array(fit2.rx2('fitted_g').rx2('a'))[0], mode, grid[np.argmax(llik)]], ['Point', 'ZIG', 'ash', 'Two step', 'Grid'])):
    a.axvline(x=m, lw=1, c=cm(i), ls=':', label=l)

ax[1].legend(frameon=False, bbox_to_anchor=(1, .5), loc='center left')
ax[1].set_xlabel('Mode')
ax[1].set_ylim(-2009, -2008)
for a in ax:
  a.set_ylabel('Log lik')
fig.tight_layout()

mode-search.png

Report the log likelihood of the two step approach.

pd.Series({'pointmass': fit0[-1],
           'zig': fit1[-1],
           'unimodal': np.array(fit2.rx2('loglik'))[0],
           'twostep': np.array(fit3.rx2('loglik'))[0],
          })
pointmass   -2008.663140
zig         -2008.407921
unimodal    -2077.552567
twostep     -2008.891981
dtype: float64

Author: Abhishek Sarkar

Created: 2019-11-24 Sun 17:24

Validate