Validation set log likelihood comparison

Introduction
Setup
Results
- 90/10 split
- 80/20 split

Introduction

We find examples where ZIEF has better out-of-sample log likelihood than NPMLE. Investigate why this happens.

Setup

import numpy as np
import pandas as pd
import sklearn.model_selection as skms
import rpy2.robjects.packages
import rpy2.robjects.pandas2ri

rpy2.robjects.pandas2ri.activate()

ashr = rpy2.robjects.packages.importr('ashr')
descend = rpy2.robjects.packages.importr('descend')

%matplotlib inline
%config InlineBackend.figure_formats = set(['retina'])

import matplotlib.pyplot as plt
plt.rcParams['figure.facecolor'] = 'w'

Results

90/10 split

chromium2 = data['chromium2']()
query = benchmark['chromium2']['zief'] > benchmark['chromium2']['npmle']
x = chromium2[:,query][:,0]
s = chromium1.sum(axis=1)

train, val, train_s, val_s = skms.train_test_split(x, s, test_size=0.1, random_state=1)
train_lam = train / train_s

K = 100
grid = np.linspace(0, lam.max(), K + 1)
zief_res = descend.deconvSingle(train, scaling_consts=train_s, verbose=False)
npmle_res = ashr.ash_workhorse(
  pd.Series(np.zeros(train.shape)),
  1,
  outputlevel=pd.Series(['fitted_g', 'loglik']),
  lik=ashr.lik_pois(y=pd.Series(train), scale=pd.Series(train_s), link='identity'),
  g=ashr.unimix(pd.Series(np.ones(K) / K), pd.Series(grid[:-1]), pd.Series(grid[1:])))
npmle_val_res = ashr.ash_workhorse(
  pd.Series(np.zeros(val.shape)),
  1,
  outputlevel='loglik',
  lik=ashr.lik_pois(y=pd.Series(val), scale=pd.Series(val_s), link='identity'),
  fixg=True,
  g=npmle_res.rx2('fitted_g'))

zief_train_llik = np.where(train < 1,
                     np.log(st.poisson(mu=train_s.reshape(-1, 1) * zief_g[:,0]).pmf(train.reshape(-1, 1)).dot(zief_g[:,1])),
                     np.log(st.poisson(mu=train_s.reshape(-1, 1) * zief_g[1:,0]).pmf(train.reshape(-1, 1)).dot(zief_g[1:,1])))
npmle_train_llik = np.array(npmle_res.rx2('loglik'))


zief_val_llik = np.where(val < 1,
                     np.log(st.poisson(mu=val_s.reshape(-1, 1) * zief_g[:,0]).pmf(val.reshape(-1, 1)).dot(zief_g[:,1])),
                     np.log(st.poisson(mu=val_s.reshape(-1, 1) * zief_g[1:,0]).pmf(val.reshape(-1, 1)).dot(zief_g[1:,1])))
npmle_val_llik = np.array(npmle_val_res.rx2('loglik'))

zief_train_llik.sum() - npmle_train_llik.sum()

-6.1758928957397075

zief_val_llik.sum() - npmle_val_llik.sum()

0.4706860378236115

zief_g = np.array(zief_res.slots['distribution'])[:,:2]
zief_cdf = np.array([zief_g[:,0], np.cumsum(zief_g[:,1])])
npmle_cdf = ashr.cdf_ash(npmle_res, np.linspace(0, lam.max(), 1000))

plt.clf()
fig, ax = plt.subplots(2, 1)
fig.set_size_inches(5, 3)
h = ax[0].hist(train, bins=np.arange(0, x.max() + 1, 10), color='k', label='Training')
ax[0].hist(val, bottom=h[0], bins=np.arange(0, x.max() + 1, 10), color='r', label='Validation')
ax[0].legend(frameon=False)
ax[0].set_xlabel('Num mols')
ax[0].set_ylabel('Num cells')

ax[1].set_xlim(0.13, 0.18)
ax[1].plot(np.array(npmle_cdf.rx2('x')).ravel(), np.array(npmle_cdf.rx2('y')).ravel(), c='b', label='NPMLE', lw=1)
ax[1].plot(zief_cdf[0], zief_cdf[1], c='orange', label='ZIEF', lw=1)
ax[1].set_xlabel('Latent gene expression')
ax[1].set_ylabel('CDF')
ax[1].legend(frameon=False)

fig.tight_layout()

80/20 split

Repeat the analysis, holding out a larger validation set.

train, val, train_s, val_s = skms.train_test_split(x, s, test_size=0.2, random_state=1)
train_lam = train / train_s

K = 100
grid = np.linspace(0, lam.max(), K + 1)
zief_res = descend.deconvSingle(train, scaling_consts=train_s, verbose=False)
npmle_res = ashr.ash_workhorse(
  pd.Series(np.zeros(train.shape)),
  1,
  outputlevel=pd.Series(['fitted_g', 'loglik']),
  lik=ashr.lik_pois(y=pd.Series(train), scale=pd.Series(train_s), link='identity'),
  g=ashr.unimix(pd.Series(np.ones(K) / K), pd.Series(grid[:-1]), pd.Series(grid[1:])))
npmle_val_res = ashr.ash_workhorse(
  pd.Series(np.zeros(val.shape)),
  1,
  outputlevel='loglik',
  lik=ashr.lik_pois(y=pd.Series(val), scale=pd.Series(val_s), link='identity'),
  fixg=True,
  g=npmle_res.rx2('fitted_g'))

zief_train_llik = np.where(train < 1,
                     np.log(st.poisson(mu=train_s.reshape(-1, 1) * zief_g[:,0]).pmf(train.reshape(-1, 1)).dot(zief_g[:,1])),
                     np.log(st.poisson(mu=train_s.reshape(-1, 1) * zief_g[1:,0]).pmf(train.reshape(-1, 1)).dot(zief_g[1:,1])))
npmle_train_llik = np.array(npmle_res.rx2('loglik'))


zief_val_llik = np.where(val < 1,
                     np.log(st.poisson(mu=val_s.reshape(-1, 1) * zief_g[:,0]).pmf(val.reshape(-1, 1)).dot(zief_g[:,1])),
                     np.log(st.poisson(mu=val_s.reshape(-1, 1) * zief_g[1:,0]).pmf(val.reshape(-1, 1)).dot(zief_g[1:,1])))
npmle_val_llik = np.array(npmle_val_res.rx2('loglik'))

zief_train_llik.sum() - npmle_train_llik.sum()

-4.889467453983343

zief_val_llik.sum() - npmle_val_llik.sum()

-0.6343187388595197

zief_g = np.array(zief_res.slots['distribution'])[:,:2]
zief_cdf = np.array([zief_g[:,0], np.cumsum(zief_g[:,1])])
npmle_cdf = ashr.cdf_ash(npmle_res, np.linspace(0, lam.max(), 1000))

plt.clf()
fig, ax = plt.subplots(2, 1)
fig.set_size_inches(5, 3)
h = ax[0].hist(train, bins=np.arange(0, x.max() + 1, 10), color='k', label='Training')
ax[0].hist(val, bottom=h[0], bins=np.arange(0, x.max() + 1, 10), color='r', label='Validation')
ax[0].legend(frameon=False)
ax[0].set_xlabel('Num mols')
ax[0].set_ylabel('Num cells')

ax[1].set_xlim(0.13, 0.18)
ax[1].plot(np.array(npmle_cdf.rx2('x')).ravel(), np.array(npmle_cdf.rx2('y')).ravel(), c='b', label='NPMLE', lw=1)
ax[1].plot(zief_cdf[0], zief_cdf[1], c='orange', label='ZIEF', lw=1)
ax[1].set_xlabel('Latent gene expression')
ax[1].set_ylabel('CDF')
ax[1].legend(frameon=False)

fig.tight_layout()

Validation set log likelihood comparison

Table of Contents

Introduction

Setup

Results

90/10 split

80/20 split