Comparison of fitted values

Introduction

We previously found that NMF, GLM-PCA, and PVAE all had similar average performance on a binomial thinning benchmark. Here, we investigate the per-observation performance of these methods.
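
For reference, binomial thinning splits each count x_ij into a training part drawn as Binomial(x_ij, p) and a validation part equal to the remainder; if x_ij is Poisson, the two parts are independent Poisson samples. A minimal sketch of the idea (the actual split below uses scmodes.benchmark.train_test_split; binomial_thin, frac, and seed are illustrative names):

import numpy as np

def binomial_thin(x, frac=0.5, seed=0):
  # Draw the training matrix entrywise as Binomial(x, frac); the
  # validation matrix is the remainder, so train + test == x
  rng = np.random.default_rng(seed)
  train = rng.binomial(x.astype(int), frac)
  test = x - train
  return train, test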

Setup

import anndata
import numpy as np
import pandas as pd
import scanpy as sc
import scipy.stats as st
import scmodes
import torch
%matplotlib inline
%config InlineBackend.figure_formats = set(['retina'])
import matplotlib.pyplot as plt
plt.rcParams['figure.facecolor'] = 'w'
plt.rcParams['font.family'] = 'Nimbus Sans'

Results

Datasets

Follow the data processing for the Poisson thinning analysis.
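
Here and below, data is assumed to be a dict mapping dataset names to callables that return dense cells-by-genes count matrices. A hypothetical sketch of the accessors (the _load_10x helper and the paths are placeholders, not the real definitions):

import functools
import scanpy as sc

# Hypothetical sketch: each accessor lazily loads a 10x Genomics
# dataset as a dense cells-by-genes count matrix
def _load_10x(path):
  return sc.read_10x_mtx(path).X.toarray()

data = {
  'b_cells': functools.partial(_load_10x, '/path/to/b_cells/'),
  # ...remaining datasets
}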

B cells

Split the data into training and validation sets using binomial thinning, then fit NMF, GLMPCA, PVAE, and NBVAE.

# Important: this implicitly sets the random seed
x = data['b_cells']()
train, test = scmodes.benchmark.train_test_split(x)
np.save('/scratch/midway2/aksarkar/modes/fitted-values/b_cells-train', train)
np.save('/scratch/midway2/aksarkar/modes/fitted-values/b_cells-test', test)
sbatch --partition=broadwl -n1 --mem=8G --time=24:00:00
#!/bin/bash
source activate scmodes
python <<EOF
<<imports>>
train = np.load('/scratch/midway2/aksarkar/modes/fitted-values/b_cells-train.npy')
l, f, _ = scmodes.lra.nmf(train, rank=8, tol=1e-2, max_iters=100000, verbose=True)
lam0 = l @ f.T
np.save('/scratch/midway2/aksarkar/modes/fitted-values/b_cells-nmf-lam', lam0)

l, f, _ = scmodes.lra.glmpca(train, rank=8, tol=1e-2, max_iters=100000, verbose=True)
lam1 = np.exp(l @ f.T)
np.save('/scratch/midway2/aksarkar/modes/fitted-values/b_cells-glmpca-lam', lam1)
EOF
sbatch --partition=gpu2 --gres=gpu:1 --mem=8G
#!/bin/bash
source activate scmodes
python <<EOF
<<imports>>
train = np.load('/scratch/midway2/aksarkar/modes/fitted-values/b_cells-train.npy')
torch.manual_seed(1)
xt = torch.tensor(train, dtype=torch.float)
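# test_size=0.1 is assumed to hold out 10% of cells for tracking the
# validation ELBO during training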
m = scmodes.lra.PVAE(input_dim=train.shape[1], latent_dim=8).fit(xt, test_size=0.1, lr=1e-3, max_epochs=200, trace=True)
lam2 = m.denoise(xt, n_samples=100)
np.save('/scratch/midway2/aksarkar/modes/fitted-values/b_cells-pvae-lam', lam2)
np.save('/scratch/midway2/aksarkar/modes/fitted-values/b_cells-pvae-trace', np.array(m.trace))

m = scmodes.lra.NBVAE(input_dim=train.shape[1], latent_dim=8, disp_by_gene=True).fit(xt, test_size=0.1, lr=1e-3, max_epochs=400, trace=True)
lam3 = m.denoise(xt, n_samples=100)
np.save('/scratch/midway2/aksarkar/modes/fitted-values/b_cells-nbvae-log_inv_disp', 
        m.log_inv_disp.detach().cpu().numpy())
np.save('/scratch/midway2/aksarkar/modes/fitted-values/b_cells-nbvae-lam', lam3)
np.save('/scratch/midway2/aksarkar/modes/fitted-values/b_cells-nbvae-trace', np.array(m.trace))
EOF

Examine the VAE optimization traces.

cm = plt.get_cmap('Paired')
plt.clf()
plt.gcf().set_size_inches(4, 2)
trace = np.load('/scratch/midway2/aksarkar/modes/fitted-values/b_cells-pvae-trace.npy')
trace2 = np.load('/scratch/midway2/aksarkar/modes/fitted-values/b_cells-nbvae-trace.npy')
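# trace columns hold summed per-epoch losses; dividing by the assumed
# 90%/10% split sizes (900/100 cells) gives per-cell averages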
for i, (t, s, k) in enumerate(zip(trace.T, [900, 100], ['PVAE train', 'PVAE val'])):
  plt.plot(np.log(t / s), lw=1, c=cm(i), label=k)
for i, (t, s, k) in enumerate(zip(trace2.T, [900, 100], ['NBVAE train', 'NBVAE val'])):
  plt.plot(np.log(t / s), lw=1, c=cm(i + 2), label=k)
plt.legend(frameon=False)
plt.xlabel('Epoch')
plt.ylabel('Avg log neg ELBO')
plt.tight_layout()

b_cells-pvae-trace.png

Read the results.

train = np.load('/scratch/midway2/aksarkar/modes/fitted-values/b_cells-train.npy')
test = np.load('/scratch/midway2/aksarkar/modes/fitted-values/b_cells-test.npy')
lam0 = np.load('/scratch/midway2/aksarkar/modes/fitted-values/b_cells-nmf-lam.npy')
lam1 = np.load('/scratch/midway2/aksarkar/modes/fitted-values/b_cells-glmpca-lam.npy')
lam2 = np.load('/scratch/midway2/aksarkar/modes/fitted-values/b_cells-pvae-lam.npy')
lam3 = np.load('/scratch/midway2/aksarkar/modes/fitted-values/b_cells-nbvae-lam.npy')
nbvae_log_inv_disp = np.load('/scratch/midway2/aksarkar/modes/fitted-values/b_cells-nbvae-log_inv_disp.npy')

Compare the fitted values of each method against those of NMF, plotting every 50th value to keep the scatter legible.

plt.clf()
fig, ax = plt.subplots(1, 3, sharey=True)
fig.set_size_inches(5.5, 2.5)

ax[0].scatter(np.sqrt(lam1.ravel()[::50]), np.sqrt(lam0.ravel()[::50]), s=1, c='k', alpha=0.1)
ax[0].plot([0, 11], [0, 11], lw=1, ls=':', c='r')
ax[0].set_xlabel(r'GLMPCA $\sqrt{\hat\lambda}$')
ax[0].set_ylabel(r'NMF $\sqrt{\hat\lambda}$')

ax[1].scatter(np.sqrt(lam2.ravel()[::50]), np.sqrt(lam0.ravel()[::50]), s=1, c='k', alpha=0.1)
ax[1].plot([0, 11], [0, 11], lw=1, ls=':', c='r')
ax[1].set_xlabel(r'PVAE $\sqrt{\hat\lambda}$')

ax[2].scatter(np.sqrt(lam3.ravel()[::50]), np.sqrt(lam0.ravel()[::50]), s=1, c='k', alpha=0.1)
ax[2].plot([0, 11], [0, 11], lw=1, ls=':', c='r')
ax[2].set_xlabel(r'NBVAE $\sqrt{\hat\lambda}$')

a = fig.add_subplot(111, frame_on=False, xticks=[], yticks=[])
a.set_title('B cell')
fig.tight_layout()

b_cells.png

Compare the fitted values of each method against NMF, restricted to observations that were equal to 1 in the validation data. (The thinned counts are stored as floating point, so equality is tested with np.isclose.)

plt.clf()
fig, ax = plt.subplots(2, 2)
fig.set_size_inches(4, 4)

query = np.isclose(test, 1)
lim = [-400, 5]
ax[0][0].scatter(np.log(lam1[query].ravel()), np.log(lam0[query].ravel()), s=1, c='k', alpha=0.1)
ax[0][0].plot(lim, lim, lw=1, ls=':', c='r')
ax[0][0].set_xlim(lim)
ax[0][0].set_ylim(lim)
ax[0][0].set_ylabel(r'NMF $\log\ \hat\lambda$')

ax[0][1].scatter(np.log(lam2[query].ravel()), np.log(lam0[query].ravel()), s=1, c='k', alpha=0.1)
ax[0][1].plot(lim, lim, lw=1, ls=':', c='r')
ax[0][1].set_xlim(lim)
ax[0][1].set_ylim(lim)

lim = [-20, 5]
ax[0][0].add_patch(plt.Rectangle((lim[0], lim[0]), lim[1] - lim[0], lim[1] - lim[0], fill=False, color='b'))
ax[0][1].add_patch(plt.Rectangle((lim[0], lim[0]), lim[1] - lim[0], lim[1] - lim[0], fill=False, color='b'))

ax[1][0].scatter(np.log(lam1[query].ravel()), np.log(lam0[query].ravel()), s=1, c='k', alpha=0.1)
ax[1][0].plot(lim, lim, lw=1, ls=':', c='r')
ax[1][0].set_xlim(lim)
ax[1][0].set_ylim(lim)
ax[1][0].set_xlabel(r'GLMPCA $\log\ \hat\lambda$')
ax[1][0].set_ylabel(r'NMF $\log\ \hat\lambda$')

ax[1][1].scatter(np.log(lam2[query].ravel()), np.log(lam0[query].ravel()), s=1, c='k', alpha=0.1)
ax[1][1].plot(lim, lim, lw=1, ls=':', c='r')
ax[1][1].set_xlim(lim)
ax[1][1].set_ylim(lim)
ax[1][1].set_xlabel(r'PVAE $\log\ \hat\lambda$')

a = fig.add_subplot(111, frame_on=False, xticks=[], yticks=[])
a.set_title('B cell validation $x = 1$')
fig.tight_layout()

b_cells-val-1.png

Repeat for observations which were 0 in the validation data.

plt.clf()
fig, ax = plt.subplots(2, 2)
fig.set_size_inches(4, 4)

query = np.isclose(test, 0)
lim = [-400, 5]
ax[0][0].scatter(np.log(lam1[query].ravel()[::50]), np.log(lam0[query].ravel()[::50]), s=1, c='k', alpha=0.1)
ax[0][0].plot(lim, lim, lw=1, ls=':', c='r')
ax[0][0].set_xlim(lim)
ax[0][0].set_ylim(lim)
ax[0][0].set_ylabel(r'NMF $\log\ \hat\lambda$')

ax[0][1].scatter(np.log(lam2[query].ravel()[::50]), np.log(lam0[query].ravel()[::50]), s=1, c='k', alpha=0.1)
ax[0][1].plot(lim, lim, lw=1, ls=':', c='r')
ax[0][1].set_xlim(lim)
ax[0][1].set_ylim(lim)

lim = [-20, 5]
ax[0][0].add_patch(plt.Rectangle((lim[0], lim[0]), lim[1] - lim[0], lim[1] - lim[0], fill=False, color='b'))
ax[0][1].add_patch(plt.Rectangle((lim[0], lim[0]), lim[1] - lim[0], lim[1] - lim[0], fill=False, color='b'))

ax[1][0].scatter(np.log(lam1[query].ravel()[::50]), np.log(lam0[query].ravel()[::50]), s=1, c='k', alpha=0.1)
ax[1][0].plot(lim, lim, lw=1, ls=':', c='r')
ax[1][0].set_xlim(lim)
ax[1][0].set_ylim(lim)
ax[1][0].set_xlabel(r'GLMPCA $\log\ \hat\lambda$')
ax[1][0].set_ylabel(r'NMF $\log\ \hat\lambda$')

ax[1][1].scatter(np.log(lam2[query].ravel()[::50]), np.log(lam0[query].ravel()[::50]), s=1, c='k', alpha=0.1)
ax[1][1].plot(lim, lim, lw=1, ls=':', c='r')
ax[1][1].set_xlim(lim)
ax[1][1].set_ylim(lim)
ax[1][1].set_xlabel(r'PVAE $\log\ \hat\lambda$')

a = fig.add_subplot(111, frame_on=False, xticks=[], yticks=[])
a.set_title('B cell validation $x = 0$')
fig.tight_layout()

b_cells-val-0.png

Look at the average training log likelihoods.

llik0 = st.poisson(lam0).logpmf(train)
llik1 = st.poisson(lam1).logpmf(train)
llik2 = st.poisson(lam2).logpmf(train)
llik3 = st.poisson(lam3).logpmf(train)
pd.Series({'NMF': llik0.mean(), 'GLMPCA': llik1.mean(), 'PVAE': llik2.mean(), 'NBVAE': llik3.mean()})
NMF      -1.038913
GLMPCA   -1.036029
PVAE     -1.062526
NBVAE    -0.830541
dtype: float64

Look at the distribution of per-observation training log likelihood differences, stratified by observed value.

plt.clf()
fig, ax = plt.subplots(4, 1, sharex=True)
fig.set_size_inches(7, 6)

grid = np.arange(train.max() + 1)
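# one boxplot of per-observation log likelihood differences per
# distinct training count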
for i in grid:
  ax[0].boxplot((llik1 - llik0)[train == i].ravel(), positions=[i], widths=[0.5], medianprops={'color': 'k'}, flierprops={'marker': '.', 'markersize': 2})
ax[0].axhline(y=0, ls=':', lw=1, c='r')
ax[0].set_ylabel('Improvement train\nlog lik over NMF')
ax[0].set_title('GLMPCA')

for i in grid:
  ax[1].boxplot((llik2 - llik0)[train == i].ravel(), positions=[i], widths=[0.5], medianprops={'color': 'k'}, flierprops={'marker': '.', 'markersize': 2})
ax[1].axhline(y=0, ls=':', lw=1, c='r')
ax[1].set_ylabel('Improvement train\nlog lik over NMF')
ax[1].set_title('PVAE')

for i in grid:
  ax[2].boxplot((llik3 - llik0)[train == i].ravel(), positions=[i], widths=[0.5], medianprops={'color': 'k'}, flierprops={'marker': '.', 'markersize': 2})
ax[2].axhline(y=0, ls=':', lw=1, c='r')
ax[2].set_ylabel('Improvement train\nlog lik over NMF')
ax[2].set_title('NBVAE')

h, e = np.histogram(train.ravel(), bins=grid, density=True)
ax[3].bar(e[:-1], h, color='k')
ax[3].set_ylabel('Relative frequency')
ax[3].set_xticks(grid[::5])
ax[3].set_xticklabels(grid[::5].astype(int))
ax[3].set_xlim(-.5, grid.max() - .5)
ax[3].set_xlabel('Observation in training set')

fig.tight_layout()

b_cells-train-llik-diff.png

Look at the total training log likelihood differences, stratified by observed value.

plt.clf()
fig, ax = plt.subplots(3, 1, sharex=True)
fig.set_size_inches(7, 4)

grid = np.arange(train.max() + 1)
ax[0].bar(grid, np.array([(llik1 - llik0)[train == i].sum() for i in grid]), color='k')
ax[0].axhline(y=0, ls=':', lw=1, c='r')
ax[0].set_ylabel('Improvement train\nlog lik over NMF')
ax[0].set_title('GLMPCA')

ax[1].bar(grid, np.array([(llik2 - llik0)[train == i].sum() for i in grid]), color='k')
ax[1].axhline(y=0, ls=':', lw=1, c='r')
ax[1].set_ylabel('Improvement train\nlog lik over NMF')
ax[1].set_title('PVAE')

ax[2].bar(grid, np.array([(llik3 - llik0)[train == i].sum() for i in grid]), color='k')
ax[2].axhline(y=0, ls=':', lw=1, c='r')
ax[2].set_ylabel('Improvement train\nlog lik over NMF')
ax[2].set_title('NBVAE')
ax[2].set_xticks(grid[::5])
ax[2].set_xticklabels(grid[::5].astype(int))
ax[2].set_xlim(-.5, grid.max() - .5)
ax[2].set_xlabel('Observation in training set')

fig.tight_layout()

b_cells-train-sum-llik-diff.png

Repeat the analysis for the validation data. The fitted values are on the scale of the training data, so rescale them by per-cell size factors $s_i = \sum_j \text{test}_{ij} / \sum_j \text{train}_{ij}$ before evaluating the validation log likelihood.

s = (test.sum(axis=1) / train.sum(axis=1)).reshape(-1, 1)
llik0 = st.poisson(s * lam0).logpmf(test)
llik1 = st.poisson(s * lam1).logpmf(test)
llik2 = st.poisson(s * lam2).logpmf(test)
llik3 = st.poisson(s * lam3).logpmf(test)
pd.Series({'NMF': llik0.mean(), 'GLMPCA': llik1.mean(), 'PVAE': llik2.mean(), 'NBVAE': llik3.mean()})
NMF      -1.132348
GLMPCA   -1.087027
PVAE     -1.071313
NBVAE    -1.129745
dtype: float64
plt.clf()
fig, ax = plt.subplots(4, 1, sharex=True)
fig.set_size_inches(7, 6)

grid = np.arange(test.max() + 1)
for i in grid:
  ax[0].boxplot((llik1 - llik0)[test == i].ravel(), positions=[i], widths=[0.5], medianprops={'color': 'k'}, flierprops={'marker': '.', 'markersize': 2})
ax[0].axhline(y=0, ls=':', lw=1, c='r')
ax[0].set_ylabel('Improvement val log lik\nover NMF')
ax[0].set_title('GLMPCA')

for i in grid:
  ax[1].boxplot((llik2 - llik0)[test == i].ravel(), positions=[i], widths=[0.5], medianprops={'color': 'k'}, flierprops={'marker': '.', 'markersize': 2})
ax[1].axhline(y=0, ls=':', lw=1, c='r')
ax[1].set_ylabel('Improvement val log lik\nover NMF')
ax[1].set_title('PVAE')

for i in grid:
  ax[2].boxplot((llik3 - llik0)[test == i].ravel(), positions=[i], widths=[0.5], medianprops={'color': 'k'}, flierprops={'marker': '.', 'markersize': 2})
ax[2].axhline(y=0, ls=':', lw=1, c='r')
ax[2].set_ylabel('Improvement val log lik\nover NMF')
ax[2].set_title('NBVAE')

h, e = np.histogram(test.ravel(), bins=np.arange(test.max() + 1), density=True)
ax[3].bar(e[:-1], h, color='k')
ax[3].set_ylabel('Relative frequency')
ax[3].set_xticks(grid[::5])
ax[3].set_xticklabels(grid[::5].astype(int))
ax[3].set_xlim(-.5, grid.max() - .5)
ax[3].set_xlabel('Observation in validation set')

fig.tight_layout()

b_cells-llik-diff.png

plt.clf()
fig, ax = plt.subplots(3, 1, sharex=True)
fig.set_size_inches(7, 4)

grid = np.arange(test.max() + 1)
ax[0].bar(grid, np.array([(llik1 - llik0)[test == i].sum() for i in grid]), color='k')
ax[0].axhline(y=0, ls=':', lw=1, c='r', zorder=-3)
ax[0].set_ylabel('Improvement val log lik\nover NMF')
ax[0].set_title('GLMPCA')

ax[1].bar(grid, np.array([(llik2 - llik0)[test == i].sum() for i in grid]), color='k')
ax[1].axhline(y=0, ls=':', lw=1, c='r', zorder=-3)
ax[1].set_ylabel('Improvement val log lik\nover NMF')
ax[1].set_title('PVAE')

ax[2].bar(grid, np.array([(llik3 - llik0)[test == i].sum() for i in grid]), color='k')
ax[2].axhline(y=0, ls=':', lw=1, c='r', zorder=-3)
ax[2].set_ylabel('Improvement val log lik\nover NMF')
ax[2].set_title('NBVAE')
ax[2].set_xticks(grid[::5])
ax[2].set_xticklabels(grid[::5].astype(int))
ax[2].set_xlim(-.5, grid.max() - .5)
ax[2].set_xlabel('Observation in validation set')

fig.tight_layout()

b_cells-sum-llik-diff.png

Count the fraction of observations equal to one in the validation set.

(test == 1).sum() / np.prod(test.shape)
0.187488

For observations which were equal to one in the validation set, look at the distribution of training set values.

pd.Series(dict(enumerate([np.logical_and(train == i, test == 1).sum() / (test == 1).sum() for i in range(4)])))
0    0.540323
1    0.254025
2    0.107712
3    0.050151
dtype: float64

Look at the distribution of (a random sample of) randomized quantiles for each method.
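
For a count $x$ under a model with CDF $F$ and pmf $f$, the randomized quantile is $q = F(x - 1) + u f(x)$ with $u \sim \mathrm{Uniform}(0, 1)$; if the model is correctly specified, $q$ is uniformly distributed on $[0, 1]$ (Dunn and Smyth 1996).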

train_qs = []
test_qs = []
for l in [lam0, lam1, lam2]:
  F = st.poisson(l)
  q = np.sort((F.cdf(train - 1) + np.random.uniform(size=train.shape) * F.pmf(train)).ravel()[::10])
  train_qs.append(q)
  q = np.sort((F.cdf(test - 1) + np.random.uniform(size=test.shape) * F.pmf(test)).ravel()[::10])
  test_qs.append(q)
for l in [lam3]:
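  # NB with mean l and inverse dispersion exp(log_inv_disp), i.e.
  # n = exp(log_inv_disp), p = n / (n + l)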
  F = st.nbinom(n=np.exp(nbvae_log_inv_disp), p=1 / (1 + l * np.exp(-nbvae_log_inv_disp)))
  q = np.sort((F.cdf(train - 1) + np.random.uniform(size=train.shape) * F.pmf(train)).ravel()[::10])
  train_qs.append(q)
  q = np.sort((F.cdf(test - 1) + np.random.uniform(size=test.shape) * F.pmf(test)).ravel()[::10])
  test_qs.append(q)
cm = plt.get_cmap('Dark2')
plt.clf()
fig, ax = plt.subplots(1, 2, sharey=True)
fig.set_size_inches(5.5, 3)
lim = [0, 1]
for a, qs, t in zip(ax.ravel(), [train_qs, test_qs], ['Training', 'Validation']):
  for i, (q, k) in enumerate(zip(qs, ['NMF', 'GLMPCA', 'PVAE', 'NBVAE'])):
    x = np.linspace(0, 1, q.shape[0] + 1)[1:]
    a.plot(x, np.sort(q), lw=1, c=cm(i), label=k)
    a.plot(lim, lim, lw=1, ls=':', c='0.7')
  a.set_xlim(lim)
  a.set_ylim(lim)
  a.set_xlabel('Theoretical quantile')
  a.set_title(t)
ax[0].set_ylabel('Randomized quantile')
ax[1].legend(frameon=False)
fig.tight_layout()

b_cells-rpp.png

T cell/B cell

Split the data into training and validation sets using binomial thinning, then fit NMF, GLMPCA, and PVAE.

# Important: this implicitly sets the random seed
x = data['cytotoxic_t-b_cells']()
train, test = scmodes.benchmark.train_test_split(x)
np.save('/scratch/midway2/aksarkar/modes/fitted-values/cytotoxic_t-b_cells-train', train)
np.save('/scratch/midway2/aksarkar/modes/fitted-values/cytotoxic_t-b_cells-test', test)
sbatch --partition=broadwl -n1 --mem=8G --time=24:00:00
#!/bin/bash
source activate scmodes
python <<EOF
<<imports>>
train = np.load('/scratch/midway2/aksarkar/modes/fitted-values/cytotoxic_t-b_cells-train.npy')
l, f, _ = scmodes.lra.nmf(train, rank=8, tol=1e-2, max_iters=100000, verbose=True)
lam0 = l @ f.T
np.save('/scratch/midway2/aksarkar/modes/fitted-values/cytotoxic_t-b_cells-nmf-lam', lam0)

l, f, _ = scmodes.lra.glmpca(train, rank=8, tol=1e-2, max_iters=100000, verbose=True)
lam1 = np.exp(l @ f.T)
np.save('/scratch/midway2/aksarkar/modes/fitted-values/cytotoxic_t-b_cells-glmpca-lam', lam1)
EOF
sbatch --partition=gpu2 --gres=gpu:1 --mem=8G
#!/bin/bash
source activate scmodes
python <<EOF
<<imports>>
train = np.load('/scratch/midway2/aksarkar/modes/fitted-values/cytotoxic_t-b_cells-train.npy')
xt = torch.tensor(train, dtype=torch.float)
m = scmodes.lra.PVAE(input_dim=train.shape[1], latent_dim=8).fit(xt, test_size=0.1, lr=1e-3, max_epochs=200, trace=True)
lam2 = m.denoise(xt, n_samples=100)
np.save('/scratch/midway2/aksarkar/modes/fitted-values/cytotoxic_t-b_cells-pvae-lam', lam2)
np.save('/scratch/midway2/aksarkar/modes/fitted-values/cytotoxic_t-b_cells-pvae-trace', np.array(m.trace))
EOF

Read the results.

train = np.load('/scratch/midway2/aksarkar/modes/fitted-values/cytotoxic_t-b_cells-train.npy')
test = np.load('/scratch/midway2/aksarkar/modes/fitted-values/cytotoxic_t-b_cells-test.npy')
lam0 = np.load('/scratch/midway2/aksarkar/modes/fitted-values/cytotoxic_t-b_cells-nmf-lam.npy')
lam1 = np.load('/scratch/midway2/aksarkar/modes/fitted-values/cytotoxic_t-b_cells-glmpca-lam.npy')
lam2 = np.load('/scratch/midway2/aksarkar/modes/fitted-values/cytotoxic_t-b_cells-pvae-lam.npy')

Plot the results.

plt.clf()
fig, ax = plt.subplots(1, 2, sharey=True)
fig.set_size_inches(4, 2.5)

ax[0].scatter(np.sqrt(lam1.ravel()[::50]), np.sqrt(lam0.ravel()[::50]), s=1, c='k', alpha=0.1)
ax[0].plot([0, 10], [0, 10], lw=1, ls=':', c='r')
ax[0].set_xlabel(r'GLMPCA $\sqrt{\hat\lambda}$')
ax[0].set_ylabel(r'NMF $\sqrt{\hat\lambda}$')

ax[1].scatter(np.sqrt(lam2.ravel()[::50]), np.sqrt(lam0.ravel()[::50]), s=1, c='k', alpha=0.1)
ax[1].plot([0, 10], [0, 10], lw=1, ls=':', c='r')
ax[1].set_xlabel(r'PVAE $\sqrt{\hat\lambda}$')

a = fig.add_subplot(111, frame_on=False, xticks=[], yticks=[])
a.set_title('T cell/B cell')
fig.tight_layout()

cytotoxic_t-b_cells.png

PBMC

Split the data into training and validation sets using binomial thinning, then fit NMF, GLMPCA, and PVAE.

# Important: this implicitly sets the random seed
x = data['pbmc_10k_v3']()
train, test = scmodes.benchmark.train_test_split(x)
np.save('/scratch/midway2/aksarkar/modes/fitted-values/pbmcs_10k_v3-train', train)
np.save('/scratch/midway2/aksarkar/modes/fitted-values/pbmcs_10k_v3-test', test)
sbatch --partition=broadwl -n1 --mem=8G --time=24:00:00
#!/bin/bash
source activate scmodes
python <<EOF
<<imports>>
train = np.load('/scratch/midway2/aksarkar/modes/fitted-values/pbmcs_10k_v3-train.npy')
l, f, _ = scmodes.lra.nmf(train, rank=8, tol=1e-2, max_iters=100000, verbose=True)
lam0 = l @ f.T
np.save('/scratch/midway2/aksarkar/modes/fitted-values/pbmcs_10k_v3-nmf-lam', lam0)

l, f, _ = scmodes.lra.glmpca(train, rank=8, tol=1e-2, max_iters=100000, verbose=True)
lam1 = np.exp(l @ f.T)
np.save('/scratch/midway2/aksarkar/modes/fitted-values/pbmcs_10k_v3-glmpca-lam', lam1)
EOF
sbatch --partition=gpu2 --gres=gpu:1 --mem=8G
#!/bin/bash
source activate scmodes
python <<EOF
<<imports>>
train = np.load('/scratch/midway2/aksarkar/modes/fitted-values/pbmcs_10k_v3-train.npy')
xt = torch.tensor(train, dtype=torch.float)
m = scmodes.lra.PVAE(input_dim=train.shape[1], latent_dim=8).fit(xt, test_size=0.1, lr=1e-3, max_epochs=200, trace=True)
lam2 = m.denoise(xt, n_samples=100)
np.save('/scratch/midway2/aksarkar/modes/fitted-values/pbmcs_10k_v3-pvae-lam', lam2)
np.save('/scratch/midway2/aksarkar/modes/fitted-values/pbmcs_10k_v3-pvae-trace', np.array(m.trace))
EOF

Read the results.

train = np.load('/scratch/midway2/aksarkar/modes/fitted-values/pbmcs_10k_v3-train.npy')
test = np.load('/scratch/midway2/aksarkar/modes/fitted-values/pbmcs_10k_v3-test.npy')
lam0 = np.load('/scratch/midway2/aksarkar/modes/fitted-values/pbmcs_10k_v3-nmf-lam.npy')
lam1 = np.load('/scratch/midway2/aksarkar/modes/fitted-values/pbmcs_10k_v3-glmpca-lam.npy')
lam2 = np.load('/scratch/midway2/aksarkar/modes/fitted-values/pbmcs_10k_v3-pvae-lam.npy')

Plot the results.

lim = [0, 3]
plt.clf()
fig, ax = plt.subplots(1, 2, sharey=True)
fig.set_size_inches(4, 2.5)

ax[0].scatter(np.sqrt(lam1.ravel()[::50]), np.sqrt(lam0.ravel()[::50]), s=1, c='k', alpha=0.1)
ax[0].plot(lim, lim, lw=1, ls=':', c='r')
ax[0].set_xlabel(r'GLMPCA $\sqrt{\hat\lambda}$')
ax[0].set_ylabel(r'NMF $\sqrt{\hat\lambda}$')

ax[1].scatter(np.sqrt(lam2.ravel()[::50]), np.sqrt(lam0.ravel()[::50]), s=1, c='k', alpha=0.1)
ax[1].plot(lim, lim, lw=1, ls=':', c='r')
ax[1].set_xlabel(r'PVAE $\sqrt{\hat\lambda}$')

a = fig.add_subplot(111, frame_on=False, xticks=[], yticks=[])
a.set_title('PBMC')
fig.tight_layout()

pbmcs_10k_v3.png

Liver

Split the data into training and validation sets using binomial thinning, then fit NMF, GLMPCA, and PVAE.

# Important: this implicitly sets the random seed
x = data['liver-caudate-lobe']()
train, test = scmodes.benchmark.train_test_split(x)
np.save('/scratch/midway2/aksarkar/modes/fitted-values/liver-caudate-lobe-train', train)
np.save('/scratch/midway2/aksarkar/modes/fitted-values/liver-caudate-lobe-test', test)
sbatch --partition=broadwl -n1 --mem=8G --time=2:00:00
#!/bin/bash
source activate scmodes
python <<EOF
<<imports>>
train = np.load('/scratch/midway2/aksarkar/modes/fitted-values/liver-caudate-lobe-train.npy')
l, f, _ = scmodes.lra.nmf(train, rank=8, tol=1e-2, max_iters=100000, verbose=True)
lam0 = l @ f.T
np.save('/scratch/midway2/aksarkar/modes/fitted-values/liver-caudate-lobe-nmf-lam', lam0)

l, f, _ = scmodes.lra.glmpca(train, rank=8, tol=1e-2, max_iters=100000, verbose=True)
lam1 = np.exp(l @ f.T)
np.save('/scratch/midway2/aksarkar/modes/fitted-values/liver-caudate-lobe-glmpca-lam', lam1)
EOF
sbatch --partition=gpu2 --gres=gpu:1 --mem=8G
#!/bin/bash
source activate scmodes
python <<EOF
<<imports>>
train = np.load('/scratch/midway2/aksarkar/modes/fitted-values/liver-caudate-lobe-train.npy')
torch.manual_seed(1)
xt = torch.tensor(train, dtype=torch.float)
m = scmodes.lra.PVAE(input_dim=train.shape[1], latent_dim=8).fit(xt, test_size=0.1, lr=1e-3, max_epochs=1600, trace=True)
lam2 = m.denoise(xt, n_samples=100)
np.save('/scratch/midway2/aksarkar/modes/fitted-values/liver-caudate-lobe-pvae-lam', lam2)
np.save('/scratch/midway2/aksarkar/modes/fitted-values/liver-caudate-lobe-pvae-trace', np.array(m.trace))
EOF

Examine the VAE optimization traces.

cm = plt.get_cmap('Paired')
plt.clf()
plt.gcf().set_size_inches(4, 2)
trace = np.load('/scratch/midway2/aksarkar/modes/fitted-values/liver-caudate-lobe-pvae-trace.npy')
for i, (t, s, k) in enumerate(zip(trace.T, [900, 100], ['PVAE train', 'PVAE val'])):
  plt.plot(np.log(t / s), lw=1, c=cm(i), label=k)
plt.legend(frameon=False)
plt.xlabel('Epoch')
plt.ylabel('Avg log neg ELBO')
plt.tight_layout()

liver-caudate-lobe-pvae-trace.png

Read the results.

train = np.load('/scratch/midway2/aksarkar/modes/fitted-values/liver-caudate-lobe-train.npy')
test = np.load('/scratch/midway2/aksarkar/modes/fitted-values/liver-caudate-lobe-test.npy')
lam0 = np.load('/scratch/midway2/aksarkar/modes/fitted-values/liver-caudate-lobe-nmf-lam.npy')
lam1 = np.load('/scratch/midway2/aksarkar/modes/fitted-values/liver-caudate-lobe-glmpca-lam.npy')
lam2 = np.load('/scratch/midway2/aksarkar/modes/fitted-values/liver-caudate-lobe-pvae-lam.npy')
s = (test.sum(axis=1) / train.sum(axis=1)).reshape(-1, 1)
llik0 = st.poisson(s * lam0).logpmf(test)
llik1 = st.poisson(s * lam1).logpmf(test)
llik2 = st.poisson(s * lam2).logpmf(test)
pd.Series({'NMF': llik0.mean(), 'GLMPCA': llik1.mean(), 'PVAE': llik2.mean()})
NMF      -0.961543
GLMPCA   -0.940434
PVAE     -0.979979
dtype: float64

Plot the results.

plt.clf()
fig, ax = plt.subplots(2, 1, sharex=True)
fig.set_size_inches(7, 4)

grid = np.arange(50)
ax[0].bar(grid, np.array([(llik1 - llik0)[test == i].sum() for i in grid]), color='k')
ax[0].axhline(y=0, ls=':', lw=1, c='r', zorder=-3)
ax[0].set_ylabel('Improvement val log lik\nover NMF')
ax[0].set_title('GLMPCA')

ax[1].bar(grid, np.array([(llik2 - llik0)[test == i].sum() for i in grid]), color='k')
ax[1].axhline(y=0, ls=':', lw=1, c='r', zorder=-3)
ax[1].set_ylabel('Improvement val log lik\nover NMF')
ax[1].set_title('PVAE')

ax[-1].set_xticks(grid[::5])
ax[-1].set_xticklabels(grid[::5].astype(int))
ax[-1].set_xlim(-.5, grid.max() - .5)
ax[-1].set_xlabel('Observation in validation set')

fig.tight_layout()

liver-caudate-lobe-llik-diff.png

lim = [0, 30]
plt.clf()
fig, ax = plt.subplots(1, 2, sharey=True)
fig.set_size_inches(4, 2.5)

ax[0].scatter(np.sqrt(lam1.ravel()[::50]), np.sqrt(lam0.ravel()[::50]), s=1, c='k', alpha=0.1)
ax[0].plot(lim, lim, lw=1, ls=':', c='r')
ax[0].set_xlabel(r'GLMPCA $\sqrt{\hat\lambda}$')
ax[0].set_ylabel(r'NMF $\sqrt{\hat\lambda}$')

ax[1].scatter(np.sqrt(lam2.ravel()[::50]), np.sqrt(lam0.ravel()[::50]), s=1, c='k', alpha=0.1)
ax[1].plot(lim, lim, lw=1, ls=':', c='r')
ax[1].set_xlabel(r'PVAE $\sqrt{\hat\lambda}$')

a = fig.add_subplot(111, frame_on=False, xticks=[], yticks=[])
a.set_title('Liver')
fig.tight_layout()

liver-caudate-lobe.png

Author: Abhishek Sarkar

Created: 2020-09-14 Mon 13:11
