NPMLE on C1 spike-in data
Introduction
We found that the NPMLE overfits spike-in data from C1, but not spike-in data from other platforms.
Setup
import numpy as np
import pandas as pd
import scipy.stats as st
import scqtl.simple
import sklearn.model_selection as skms

import rpy2.robjects.packages
import rpy2.robjects.pandas2ri

rpy2.robjects.pandas2ri.activate()
ashr = rpy2.robjects.packages.importr('ashr')
%matplotlib inline
%config InlineBackend.figure_formats = set(['retina'])
import matplotlib.pyplot as plt
plt.rcParams['figure.facecolor'] = 'w'
Results
Benchmark
Read the results.
benchmark = {}
for name in ('dropseq', 'indrops', 'chromium1', 'chromium2', 'gemcode', 'c1'):
  benchmark[name] = (
    pd.read_csv(f'/project2/mstephens/aksarkar/projects/singlecell-modes/data/deconv-generalization/{name}-gpu.txt.gz', index_col=0, sep='\t')
    .merge(pd.read_csv(f'/project2/mstephens/aksarkar/projects/singlecell-modes/data/deconv-generalization/{name}-unimodal.txt.gz', index_col=0, sep='\t'), left_index=True, right_index=True)
    .merge(pd.read_csv(f'/project2/mstephens/aksarkar/projects/singlecell-modes/data/deconv-generalization/{name}-zief.txt.gz', index_col=0, sep='\t'), left_index=True, right_index=True)
    .merge(pd.read_csv(f'/project2/mstephens/aksarkar/projects/singlecell-modes/data/deconv-generalization/{name}-npmle.txt.gz', index_col=0, sep='\t'), left_index=True, right_index=True))
Find the genes where the NPMLE does worst relative to the Gamma model.
T = benchmark['c1']
llik_diff = T['npmle'] - T['gamma']
query = llik_diff.sort_values().head(n=12).index
# data is assumed to be a dict mapping dataset names to loader functions,
# defined elsewhere in this analysis
c1 = data['c1']()
plt.clf()
fig, ax = plt.subplots(4, 3)
fig.set_size_inches(7, 5)
for a, k in zip(ax.ravel(), query):
  a.hist(c1[:,k], bins=np.arange(c1[:,k].max() + 1), color='k')
  a.text(x=.95, y=.95, s=f'$\\Delta l$={llik_diff.loc[k]:.1f}',
         horizontalalignment='right', verticalalignment='top', transform=a.transAxes)
for y in range(ax.shape[0]):
  ax[y][0].set_ylabel('Num cells')
for x in range(ax.shape[1]):
  ax[-1][x].set_xlabel('Num mols')
fig.tight_layout()
Extract examples
# Take the worst-fitting gene and the per-cell size factors, then hold out 10% of cells
x = c1[:,query][:,0]
s = c1.sum(axis=1)
lam = x / s
train, val, train_s, val_s = skms.train_test_split(x, s, test_size=0.1, random_state=1)
train_lam = train / train_s
# Fit the Gamma model (via NB) and the NPMLE (uniform mixture over K grid
# segments) on the training cells, then evaluate the fitted NPMLE on the
# validation cells
K = 200
grid = np.linspace(0, train_lam.max(), K + 1)
gamma_res = scqtl.simple.fit_nb(train, train_s)
npmle_res = ashr.ash_workhorse(
  pd.Series(np.zeros(train.shape)),
  1,
  outputlevel=pd.Series(['fitted_g', 'loglik']),
  lik=ashr.lik_pois(y=pd.Series(train), scale=pd.Series(train_s), link='identity'),
  g=ashr.unimix(pd.Series(np.ones(K) / K), pd.Series(grid[:-1]), pd.Series(grid[1:])))
npmle_val_res = ashr.ash_workhorse(
  pd.Series(np.zeros(val.shape)),
  1,
  outputlevel='loglik',
  lik=ashr.lik_pois(y=pd.Series(val), scale=pd.Series(val_s), link='identity'),
  fixg=True,
  g=npmle_res.rx2('fitted_g'))
gamma_train_llik = gamma_res[-1]
npmle_train_llik = np.array(npmle_res.rx2('loglik'))
gamma_val_llik = st.nbinom(n=gamma_res[1], p=1 / (1 + val_s * gamma_res[0] / gamma_res[1])).logpmf(val).sum()
npmle_val_llik = np.array(npmle_val_res.rx2('loglik'))
npmle_train_llik.sum() - gamma_train_llik.sum()
-13918.570811914533
npmle_val_llik.sum() - gamma_val_llik.sum()
-1543.3543034150318
lam_grid = np.linspace(0, lam.max(), 5000)
gamma_cdf = st.gamma(a=gamma_res[1], scale=gamma_res[0] / gamma_res[1]).cdf(lam_grid)
npmle_cdf = ashr.cdf_ash(npmle_res, lam_grid)
plt.clf()
fig, ax = plt.subplots(2, 1)
fig.set_size_inches(5, 3)
h = ax[0].hist(train, bins=np.arange(0, x.max() + 1, 1), color='k', label='Training')
ax[0].hist(val, bottom=h[0], bins=np.arange(0, x.max() + 1, 1), color='r', label='Validation')
ax[0].legend(frameon=False)
ax[0].set_xlabel('Num mols')
ax[0].set_ylabel('Num cells')
ax[1].set_xlim(0, 0.01)
ax[1].plot(np.array(npmle_cdf.rx2('x')).ravel(), np.array(npmle_cdf.rx2('y')).ravel(), c='b', label='NPMLE', lw=1)
ax[1].plot(lam_grid, gamma_cdf, c='orange', label='Gamma', lw=1)
ax[1].set_xlabel('Latent gene expression')
ax[1].set_ylabel('CDF')
ax[1].legend(frameon=False)
fig.tight_layout()
Look at the grid.
lam.max()
1.0
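To see how coarse this grid is relative to the data, we can compare the width of a grid segment to typical values of \(\hat\lambda\) (a minimal diagnostic sketch, reusing grid and lam from above; the quantile choices are arbitrary).

# Each of the K = 200 uniform segments has width train_lam.max() / K; compare
# this to where the per-cell MLEs \hat\lambda actually concentrate
print('segment width:', grid[1] - grid[0])
print('lam quantiles (50/90/99%):', np.percentile(lam, [50, 90, 99]))
print('fraction of cells in the first segment:', (lam < grid[1]).mean())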
Investigate where the MLE \(\hat\lambda = x / s = 1\).
np.where(s == 1)
(array([867, 869, 872, 873, 874, 875, 877, 883, 887, 893, 895, 896, 897, 898]),)
c1[867]
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
Check whether a size factor equal to 1 is sensible.
annotations = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-annotation.txt', sep='\t')
keep_samples = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/quality-single-cells.txt', sep='\t', header=None, index_col=0)
annotations.loc[np.logical_and(keep_samples.values.ravel(), (annotations['mol_ercc'] == 1).values.ravel())].shape[0]
10
The NPMLE does poorly because our choice of grid is suboptimal: the handful of quality cells with only one spike-in molecule push \(\hat\lambda_{\max}\) to 1, so the uniform grid over \([0, \hat\lambda_{\max}]\) with \(K = 200\) segments is far too coarse in the region where most of the mass actually lies.
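One possible fix (a sketch we have not run here, reusing the same ashr calls as above) is to build the grid from the bulk of the empirical distribution of \(\hat\lambda\) rather than from its maximum, so that a few cells with size factor 1 cannot stretch the support.

# Hypothetical alternative grid: cap the support at a high quantile of the
# per-cell MLEs instead of the maximum (the 99.5% cutoff is an arbitrary choice)
K = 200
cap = np.percentile(train_lam, 99.5)
grid_alt = np.linspace(0, cap, K + 1)
npmle_alt_res = ashr.ash_workhorse(
  pd.Series(np.zeros(train.shape)),
  1,
  outputlevel=pd.Series(['fitted_g', 'loglik']),
  lik=ashr.lik_pois(y=pd.Series(train), scale=pd.Series(train_s), link='identity'),
  g=ashr.unimix(pd.Series(np.ones(K) / K), pd.Series(grid_alt[:-1]), pd.Series(grid_alt[1:])))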