Distribution deconvolution examples

Table of Contents

Introduction

We systematically compared different deconvolution methods on their ability to generalize to held out data. Here, we look at the data and fitted distributions for specific examples.

Setup

import functools as ft
import multiprocessing as mp
import numpy as np
import pandas as pd
import scanpy
import scipy.stats as st
import scipy.special as sp
import scmodes
import sklearn.model_selection as skms

import rpy2.robjects.packages
import rpy2.robjects.pandas2ri
import rpy2.robjects.numpy2ri

rpy2.robjects.pandas2ri.activate()
rpy2.robjects.numpy2ri.activate()

ashr = rpy2.robjects.packages.importr('ashr')
descend = rpy2.robjects.packages.importr('descend')
%matplotlib inline
%config InlineBackend.figure_formats = set(['retina'])
import colorcet
import matplotlib.pyplot as plt
plt.rcParams['figure.facecolor'] = 'w'
plt.rcParams['font.family'] = 'Nimbus Sans'

Results

Test

b_cells = scmodes.dataset.read_10x('/project2/mstephens/aksarkar/projects/singlecell-ideas/data/10xgenomics/b_cells/filtered_matrices_mex/hg19/', return_df=True)
llik_diff = benchmark['b_cells']['npmle'] - benchmark['b_cells']['gamma']
query = llik_diff[llik_diff > 0].sort_values(ascending=False).head(n=1).index
x = b_cells[query].values.ravel()
s = b_cells.sum(axis=1).values.ravel()
plt.clf()
fig, ax = plt.subplots(2, 1)
fig.set_size_inches(6, 4)
ax[0].hist(x, bins=np.arange(x.max() + 1), color='k')
ax[0].set_xlabel('Num mols')
ax[0].set_ylabel('Num cells')

for c, k in zip(plt.get_cmap('Dark2').colors, ['Gamma', 'ZIG', 'Unimodal', 'ZIEF', 'NPMLE']):
  ax[1].plot(*getattr(scmodes.deconvolve, f'fit_{k.lower()}')(x, s), color=c, lw=1, label=k)
ax[1].set_xlabel('Latent gene expression')
ax[1].set_ylabel('CDF')
ax[1].legend(frameon=False)
fig.tight_layout()

test.png

Metadata

Read gene metadata.

gene_info = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-genes.txt.gz', sep='\t', index_col=0)

Spike-in data

Read the results.

benchmark = {
  k: g.reset_index(level=0, drop=True) for k, g in
  (pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-modes/data/deconv-generalization/control-kl.txt.gz',
               sep='\t', index_col=[0, 1])
   .groupby(level=0))}

We are primarily interested in cases where NPMLE does better than unimodal, because these could potentially be cases where zeros drive the difference in generalization performance.

chromium1 = data['chromium1']()
llik_diff = benchmark['chromium1']['npmle'] - benchmark['chromium1']['unimodal']
query = llik_diff[np.logical_and(chromium1.sum(axis=0) > 0, llik_diff > 0)].sort_values(ascending=False).head(n=12).index
plt.clf()
fig, ax = plt.subplots(4, 3)
fig.set_size_inches(7, 5)
for a, k in zip(ax.ravel(), query):
  a.hist(chromium1[:,k], bins=np.arange(chromium1[:,k].max() + 2), color='k')
  a.text(x=.95, y=.95,
         s=f"$\Delta KL$={llik_diff.loc[k]:.1f}",
         horizontalalignment='right',
         verticalalignment='top',
         transform=a.transAxes)
for y in range(ax.shape[0]):
  ax[y][0].set_ylabel('Num cells')
for x in range(ax.shape[1]):
  ax[-1][x].set_xlabel('Num mols')
fig.tight_layout()

chromium1-npmle-examples.png

chromium2 = data['chromium2']()
llik_diff = benchmark['chromium2']['npmle'] - benchmark['chromium2']['unimodal']
query = llik_diff[np.logical_and(chromium2.sum(axis=0) > 0, llik_diff > 0)].sort_values(ascending=False).head(n=12).index
plt.clf()
fig, ax = plt.subplots(4, 3)
fig.set_size_inches(7, 5)
for a, k in zip(ax.ravel(), query):
  a.hist(chromium2[:,k], bins=np.arange(chromium2[:,k].max() + 2), color='k')
  a.text(x=.95, y=.95,
         s=f"$\Delta KL$={llik_diff.loc[k]:.1f}",
         horizontalalignment='right',
         verticalalignment='top',
         transform=a.transAxes)
for y in range(ax.shape[0]):
  ax[y][0].set_ylabel('Num cells')
for x in range(ax.shape[1]):
  ax[-1][x].set_xlabel('Num mols')
fig.tight_layout()

chromium2-npmle-examples.png

dropseq = data['dropseq']()
llik_diff = benchmark['dropseq']['npmle'] - benchmark['dropseq']['unimodal']
query = benchmark['dropseq'].loc[llik_diff[llik_diff > 0].sort_values(ascending=False).head(n=12).index].index
plt.clf()
fig, ax = plt.subplots(4, 3)
fig.set_size_inches(7, 5)
for a, k in zip(ax.ravel(), query):
  a.hist(dropseq[:,k], bins=np.arange(dropseq[:,k].max() + 2), color='k')
  a.text(x=.95, y=.95,
         s=f"$\Delta KL$={llik_diff.loc[k]:.1f}",
         horizontalalignment='right',
         verticalalignment='top',
         transform=a.transAxes)
for y in range(ax.shape[0]):
  ax[y][0].set_ylabel('Num cells')
for x in range(ax.shape[1]):
  ax[-1][x].set_xlabel('Num mols')
fig.tight_layout()

dropseq-npmle-examples.png

gemcode = data['gemcode']()
llik_diff = benchmark['gemcode']['npmle'] - benchmark['gemcode']['unimodal']
query = llik_diff[np.logical_and(gemcode.sum(axis=0) > 0, llik_diff > 0)].sort_values(ascending=False).head(n=12).index
plt.clf()
fig, ax = plt.subplots(4, 3)
fig.set_size_inches(7, 5)
for a, k in zip(ax.ravel(), query):
  a.hist(gemcode[:,k], bins=np.arange(gemcode[:,k].max() + 2), color='k')
  a.text(x=.95, y=.95,
         s=f"$\Delta KL$={llik_diff.loc[k]:.1f}",
         horizontalalignment='right',
         verticalalignment='top',
         transform=a.transAxes)
for y in range(ax.shape[0]):
  ax[y][0].set_ylabel('Num cells')
for x in range(ax.shape[1]):
  ax[-1][x].set_xlabel('Num mols')
fig.tight_layout()

gemcode-npmle-examples.png

indrops = data['indrops']()
llik_diff = benchmark['indrops']['npmle'] - benchmark['indrops']['unimodal']
query = llik_diff[np.logical_and(indrops.sum(axis=0) > 0, llik_diff > 0)].sort_values(ascending=False).head(n=12).index
plt.clf()
fig, ax = plt.subplots(4, 3)
fig.set_size_inches(7, 5)
for a, k in zip(ax.ravel(), query):
  a.hist(indrops[:,k], bins=np.arange(indrops[:,k].max() + 2), color='k')
  a.text(x=.95, y=.95,
         s=f"$\Delta KL$={llik_diff.loc[k]:.1f}",
         horizontalalignment='right',
         verticalalignment='top',
         transform=a.transAxes)
for y in range(ax.shape[0]):
  ax[y][0].set_ylabel('Num cells')
for x in range(ax.shape[1]):
  ax[-1][x].set_xlabel('Num mols')
fig.tight_layout()

indrops-npmle-examples.png

Homogeneous populations

Read the results.

benchmark = {
  k: g.reset_index(level=0, drop=True) for k, g in
  (pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-modes/data/deconv-generalization/homogeneous-kl.txt.gz',
               sep='\t', index_col=[0, 1])
   .groupby(level=0))}

Focus on cases where NPMLE does better than unimodal. Look at B cells.

b_cells = scmodes.dataset.read_10x('/project2/mstephens/aksarkar/projects/singlecell-ideas/data/10xgenomics/b_cells/filtered_matrices_mex/hg19/', return_df=True)
llik_diff = benchmark['b_cells']['npmle'] - benchmark['b_cells']['unimodal']
query = llik_diff[llik_diff > 0].sort_values(ascending=False).head(n=12).index
plt.clf()
fig, ax = plt.subplots(3, 4)
fig.set_size_inches(7, 5)
for a, k in zip(ax.ravel(), query):
  a.hist(b_cells.loc[:,k], bins=np.arange(b_cells.loc[:,k].max() + 1), color='k')
  a.text(x=.95, y=.95,
         s=f"$\Delta KL$={llik_diff.loc[k]:.1g}",
         horizontalalignment='right',
         verticalalignment='top',
         transform=a.transAxes)
  a.set_title(gene_info.loc[k, 'name'] if k in gene_info.index else k)
for y in range(ax.shape[0]):
  ax[y][0].set_ylabel('Num cells')
for x in range(ax.shape[1]):
  ax[-1][x].set_xlabel('Num mols')
fig.tight_layout()

b-cell-npmle-examples.png

Look at cytotoxic T cells.

cytotoxic_t = scmodes.dataset.read_10x('/project2/mstephens/aksarkar/projects/singlecell-ideas/data/10xgenomics/cytotoxic_t/filtered_matrices_mex/hg19/', return_df=True)
llik_diff = benchmark['cytotoxic_t']['npmle'] - benchmark['cytotoxic_t']['unimodal']
query = llik_diff[llik_diff > 0].sort_values(ascending=False).head(n=12).index
plt.clf()
fig, ax = plt.subplots(4, 3)
fig.set_size_inches(7, 5)
for a, k in zip(ax.ravel(), query):
  a.hist(cytotoxic_t.loc[:,k], bins=np.arange(cytotoxic_t.loc[:,k].max() + 1), color='k')
  a.text(x=.95, y=.95,
         s=f"$\Delta KL$={llik_diff.loc[k]:.1g}",
         horizontalalignment='right',
         verticalalignment='top',
         transform=a.transAxes)
  a.set_title(gene_info.loc[k, 'name'] if k in gene_info.index else k)
for y in range(ax.shape[0]):
  ax[y][0].set_ylabel('Num cells')
for x in range(ax.shape[1]):
  ax[-1][x].set_xlabel('Num mols')
fig.tight_layout()

cytotoxic-t-npmle-examples.png

Make sure our choice of discretization for unimodal is acceptable on RPL34.

k = 'ENSG00000109475'
x = cytotoxic_t.loc[:,k]
s = cytotoxic_t.sum(axis=1)

plt.clf()
fig, ax = plt.subplots(2, 1)
fig.set_size_inches(6, 4)
ax[0].hist(x, bins=np.arange(x.max() + 1), color='k')
ax[0].set_xlabel('Num mols')
ax[0].set_ylabel('Num cells')

for c, k in zip(plt.get_cmap('Dark2').colors, ['Gamma', 'ZIG', 'Unimodal', 'ZIEF', 'NPMLE']):
  ax[1].plot(*getattr(scmodes.deconvolve, f'fit_{k.lower()}')(x, s), color=c, lw=1, label=k)
ax[1].set_xlabel('Latent gene expression')
ax[1].set_ylabel('CDF')
ax[1].legend(frameon=False)
fig.tight_layout()

cytotoxic-t-rpl34.png

Mengyin Lu previously reported CCL5 could possibly have bimodal expression. Look at the fitted distributions for CCL5.

k = 'ENSG00000161570'
x = cytotoxic_t.loc[:,k]
s = cytotoxic_t.sum(axis=1)

plt.clf()
fig, ax = plt.subplots(2, 1)
fig.set_size_inches(6, 4)
ax[0].hist(x, bins=np.arange(x.max() + 1), color='k')
ax[0].set_xlabel('Num mols')
ax[0].set_ylabel('Num cells')

for c, k in zip(plt.get_cmap('Dark2').colors, ['Gamma', 'ZIG', 'Unimodal', 'ZIEF', 'NPMLE']):
  ax[1].plot(*getattr(scmodes.deconvolve, f'fit_{k.lower()}')(x, s), color=c, lw=1, label=k)
ax[1].set_xlabel('Latent gene expression')
ax[1].set_ylabel('CDF')
ax[1].legend(frameon=False)
fig.tight_layout()

cytotoxic-t-ccl5.png

Look at iPSCs.

ipsc_counts = scmodes.dataset.ipsc('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/', n=100, return_df=True)
llik_diff = benchmark['ipsc']['npmle'] - benchmark['ipsc']['unimodal']
query = llik_diff[llik_diff > 0].sort_values(ascending=False).head(n=12).index
plt.clf()
fig, ax = plt.subplots(4, 3)
fig.set_size_inches(7, 5)
for a, k in zip(ax.ravel(), query):
  h = a.hist(ipsc_counts.loc[:,k], bins=np.arange(ipsc_counts.loc[:,k].max() + 1), color='k')
  a.text(x=.95, y=.95,
         s=f"$\Delta KL$={llik_diff.loc[k]:.1g}",
         horizontalalignment='right',
         verticalalignment='top',
         transform=a.transAxes)
  a.set_title(gene_info.loc[k, 'name'])
for y in range(ax.shape[0]):
  ax[y][0].set_ylabel('Num cells')
for x in range(ax.shape[1]):
  ax[-1][x].set_xlabel('Num mols')
fig.tight_layout()

ipsc-unimodal-examples.png

Compare against Gamma.

llik_diff = benchmark['ipsc']['npmle'] - benchmark['ipsc']['gamma']
query = llik_diff[llik_diff > 0].sort_values(ascending=False).head(n=12).index
plt.clf()
fig, ax = plt.subplots(4, 3)
fig.set_size_inches(7, 5)
for a, k in zip(ax.ravel(), query):
  a.hist(ipsc_counts.loc[:,k], bins=np.arange(ipsc_counts.loc[:,k].max() + 1), color='k')
  a.text(x=.95, y=.95,
         s=f"$\Delta KL$={llik_diff.loc[k]:.1g}",
         horizontalalignment='right',
         verticalalignment='top',
         transform=a.transAxes)
  a.set_title(gene_info.loc[k, 'name'])
for y in range(ax.shape[0]):
  ax[y][0].set_ylabel('Num cells')
for x in range(ax.shape[1]):
  ax[-1][x].set_xlabel('Num mols')
fig.tight_layout()

ipsc-npmle-examples.png

Look at the fitted distributions for SLC25A5.

k = 'ENSG00000005022'
x = ipsc_counts.loc[:,k]
s = ipsc_counts.sum(axis=1)

plt.clf()
fig, ax = plt.subplots(2, 1)
fig.set_size_inches(6, 4)
ax[0].hist(x, bins=np.arange(x.max() + 1), color='k')
ax[0].set_xlabel('Num mols')
ax[0].set_ylabel('Num cells')

for c, k in zip(plt.get_cmap('Dark2').colors, ['Gamma', 'ZIG', 'Unimodal', 'ZIEF', 'NPMLE']):
  ax[1].plot(*getattr(scmodes.deconvolve, f'fit_{k.lower()}')(x, s), color=c, lw=1, label=k)
ax[1].set_xlabel('Latent gene expression')
ax[1].set_ylabel('CDF')
ax[1].legend(frameon=False)
fig.tight_layout()

ipsc-slc25a5.png

Conclusions:

  • When NPMLE does better than Gamma, unimodal does comparably well. The genes where this happens quite clearly exhibit unimodal gene expression.

Investigate Gamma vs. Point-Gamma in high-depth data

Look at the full benchmark results for NB vs. ZINB (only the GPU implementation is suitable for this).

ipsc_res = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-modes/data/deconv-generalization/ipsc-gpu.txt.gz', index_col=0, sep='\t')
plt.clf()
plt.gcf().set_size_inches(3, 3)
h = plt.hist(ipsc_res['zinb'] - ipsc_res['nb'], color='k', bins=np.arange(-10, 40, 2))
plt.xlabel('Diff val set log lik from NB')
_ = plt.ylabel('Number of genes')

homogeneous-nb-zinb-benchmark.png

Print the histogram for inspection.

np.vstack((h[1][:-1], h[0])).T.astype(int)
array([[ -10,    0],
[  -8,    0],
[  -6,    0],
[  -4,    0],
[  -2, 4977],
[   0, 4973],
[   2,    2],
[   4,    3],
[   6,    0],
[   8,    0],
[  10,    0],
[  12,    0],
[  14,    1],
[  16,    0],
[  18,    0],
[  20,    0],
[  22,    0],
[  24,    0],
[  26,    0],
[  28,    1],
[  30,    0],
[  32,    0],
[  34,    0],
[  36,    0]])

Try to find a case in the full iPSC data where point-Gamma (ZINB) is required, by performing likelihood ratio tests against Gamma (NB). Use Bonferroni correction to control FWER.

lrt = st.chi2(1).sf(-2 * (ipsc_res['gamma'] - ipsc_res['point_gamma']))
ipsc_res[lrt < .05 / lrt.shape[0]]
gamma  point_gamma
gene
ENSG00000112530 -2017.717750 -1987.894352
ENSG00000129824 -1803.651932 -1789.106410

The two genes are PACRG and RPS4Y1. Look at the fitted distributions for these genes.

ipsc_counts = scmodes.dataset.ipsc(prefix='/project2/mstephens/aksarkar/projects/singlecell-qtl/data/',
                                   query=list(ipsc_res[lrt < .05 / lrt.shape[0]].index), return_df=True)
annotation = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-annotation.txt', sep='\t')
keep_samples = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/quality-single-cells.txt', index_col=0, header=None, sep='\t')
s = annotation.loc[keep_samples.values.ravel(), 'mol_hs']
k = 'ENSG00000112530'
x = ipsc_counts.loc[:,k].values

plt.clf()
fig, ax = plt.subplots(2, 1)
fig.set_size_inches(6, 4)
ax[0].hist(x, bins=np.arange(x.max() + 1), color='k')
ax[0].set_title(gene_info.loc[k, 'name'])
ax[0].set_xlabel('Num mols')
ax[0].set_ylabel('Num cells')

for c, k in zip(plt.get_cmap('Dark2').colors, ['Gamma', 'ZIG', 'Unimodal', 'ZIEF', 'NPMLE']):
  ax[1].plot(*getattr(scmodes.deconvolve, f'fit_{k.lower()}')(x, s), color=c, lw=1, label=k)
ax[1].set_xlabel('Latent gene expression')
ax[1].set_ylabel('CDF')
ax[1].legend(frameon=False)
fig.tight_layout()

ipsc-pacrg.png

PACRG (Parkin coregulated gene) shares a promoter with PKRG (PARK2; Parkin). PARKIN is involved in a signaling pathway for mitochondria damage, and mitochrondrial damage appears to be relevant to neurodegenerative disease etiology (Alzheimer's disease and Parkinson's disease). Mitochrondrial damage could explain the bimodal distribution of PACRG in the iPSC data: the reprogramming protocol could induce oxidative stress on some of the cells, resulting in damage.

k = 'ENSG00000129824'
x = ipsc_counts.loc[:,k].values

plt.clf()
fig, ax = plt.subplots(2, 1)
fig.set_size_inches(6, 4)
ax[0].hist(x, bins=np.arange(x.max() + 1), color='k')
ax[0].set_title(gene_info.loc[k, 'name'])
ax[0].set_xlabel('Num mols')
ax[0].set_ylabel('Num cells')

for c, k in zip(plt.get_cmap('Dark2').colors, ['Gamma', 'ZIG', 'Unimodal', 'ZIEF', 'NPMLE']):
  ax[1].plot(*getattr(scmodes.deconvolve, f'fit_{k.lower()}')(x, s), color=c, lw=1, label=k)
ax[1].set_xlabel('Latent gene expression')
ax[1].set_ylabel('CDF')
ax[1].legend(frameon=False)
fig.tight_layout()

ipsc-rps4y1.png

RPS4Y1 (Ribosomal protein S4, Y-linked) is on the Y chromosome, and therefore we should expect it to only be expressed in males. The homologous gene on the X chromosome (RPS4X) has a different coding sequence. Naively, sex-linkage should explain why this gene exhibits bimodal gene expression.

curl -OL 'http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/working/20130606_sample_info/20130606_sample_info.txt'
donor_info = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-modes/data/20130606_sample_info.txt', sep='\t')

Count how many cells come from females.

prefix = '/project2/mstephens/aksarkar/projects/singlecell-qtl/data/'
annotations = pd.read_csv(f'{prefix}/scqtl-annotation.txt', sep='\t')
keep_samples = pd.read_csv(f'{prefix}/quality-single-cells.txt', sep='\t', index_col=0, header=None)
annotations = annotations.loc[keep_samples.values.ravel()]
annotations.merge(donor_info, left_on='chip_id', right_on='Sample')['Gender'].value_counts()
female    3109
male      2369
Name: Gender, dtype: int64

Look at the fitted ZINB distribution per-individual for this gene.

ipsc_log_mu = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/density-estimation/design1/zi2-log-mu.txt.gz', sep=' ', index_col=0)
ipsc_log_phi = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/density-estimation/design1/zi2-log-phi.txt.gz', sep=' ', index_col=0)
ipsc_logodds = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/density-estimation/design1/zi2-logodds.txt.gz', sep=' ', index_col=0)
def point_gamma_cdf(grid, log_mu, log_phi, logodds):
  res = st.gamma(a=np.exp(-log_phi), scale=np.exp(log_mu + log_phi)).cdf(grid)
  res *= sp.expit(-logodds)
  res += sp.expit(logodds)
  return res

lam = ipsc_counts.values / annotations['mol_hs'].values.reshape(-1, 1)
grid = np.linspace(lam.min(), lam.max(), 1000)
cdfs = np.array([[point_gamma_cdf(grid,
                                  ipsc_log_mu.loc[j, k],
                                  ipsc_log_phi.loc[j, k],
                                  ipsc_logodds.loc[j, k])
                  for k in ipsc_log_mu]
                 for j in ipsc_counts.columns
                 if j != list(ipsc_log_mu.columns).index('NA18498')])
plt.clf()
fig, ax = plt.subplots(2, 1, sharex=True)
fig.set_size_inches(6, 4)
for a, F, k in zip(ax, cdfs, ipsc_counts.columns):
  for f in F:
    a.set_xlim(0, 2e-4)
    a.set_xticks(np.linspace(0, 2e-4, 3))
    a.plot(grid, f, c='k', lw=1, alpha=0.2)
  a.set_title(gene_info.loc[k, 'name'])
  a.set_ylabel('CDF')
ax[-1].set_xlabel('Latent gene expression')
fig.tight_layout()

ipsc-point-gamma-cdf-pacrg.png

Plot the estimated RPS4Y1 latent gene expression, stratified by donor sex.

colors = {k: 'r' if v == 'male' else 'b' for k, v in
          donor_info.set_index('Sample')
          .filter(items=list(ipsc_log_mu.columns), axis=0)
          ['Gender']
          .iteritems()}
plt.clf()
plt.gcf().set_size_inches(3, 2)
for f, k in zip(cdfs[1], ipsc_log_mu.columns):
  plt.xlim(0, 2e-4)
  plt.xticks(np.linspace(0, 2e-4, 3))
  plt.plot(grid, f, lw=1, alpha=0.2, c=colors.get(k, 'k'))
  dummy = [plt.Line2D([0], [0], c='r'), plt.Line2D([0], [0], c='b')]
  plt.legend(dummy, ['Male', 'Female'], frameon=False)
  plt.title('RPS4Y1')
  plt.ylabel('CDF')
  plt.xlabel('Latent gene expression')

ipsc-point-gamma-cdf-rps4y1.png

Synthetic mixtures

Read the results.

benchmark = {
  k: g.reset_index(level=0, drop=True) for k, g in
  (pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-modes/data/deconv-generalization/synthetic-mix-kl.txt.gz',
               sep='\t', index_col=[0, 1])
   .groupby(level=0))}

Read the data.

t_cells = scmodes.dataset.read_10x('/project2/mstephens/aksarkar/projects/singlecell-ideas/data/10xgenomics/cytotoxic_t/filtered_matrices_mex/hg19/', min_detect=0, return_df=True)
b_cells = scmodes.dataset.read_10x('/project2/mstephens/aksarkar/projects/singlecell-ideas/data/10xgenomics/b_cells/filtered_matrices_mex/hg19/', min_detect=0, return_df=True)
mix1, y1 = scmodes.dataset.synthetic_mix(t_cells, b_cells, min_detect=0.25)

cyto = scmodes.dataset.read_10x(prefix='/project2/mstephens/aksarkar/projects/singlecell-ideas/data/10xgenomics/cytotoxic_t/filtered_matrices_mex/hg19', return_df=True, min_detect=0)
naive = scmodes.dataset.read_10x(prefix='/project2/mstephens/aksarkar/projects/singlecell-ideas/data/10xgenomics/naive_cytotoxic/filtered_matrices_mex/hg19', return_df=True, min_detect=0)
mix2, y2 = scmodes.dataset.synthetic_mix(cyto, naive, min_detect=0.25)

Look at T cell/B cell mix.

llik_diff = benchmark['cd8-cd19-mix']['npmle'] - benchmark['cd8-cd19-mix']['unimodal']
query = benchmark['cd8-cd19-mix'].loc[llik_diff[llik_diff > 0].sort_values(ascending=False).head(n=12).index].index.astype(int)
plt.clf()
fig, ax = plt.subplots(3, 4)
fig.set_size_inches(8, 5)
for a, k in zip(ax.ravel(), query):
  h = a.hist(mix1.loc[y1 == 1,mix1.columns[k]], bins=np.arange(mix1.iloc[:,k].max() + 1), color='k', alpha=0.6, label='T cells')
  a.hist(mix1.loc[y1 == 0,mix1.columns[k]], bins=np.arange(mix1.iloc[:,k].max() + 1), color='r', alpha=0.6, label='B cells')
  a.text(x=.95, y=.95,
         s=f"$\Delta KL$={llik_diff.iloc[k]:.1g}",
         horizontalalignment='right',
         verticalalignment='top',
         transform=a.transAxes)
  a.set_title(gene_info.loc[mix1.columns[k], 'name'] if mix1.columns[k] in gene_info.index else mix1.columns[k])
handles, labels = a.get_legend_handles_labels()
fig.legend(handles, labels, frameon=False, loc='center left', bbox_to_anchor=(1, 0.5))
for a in ax:
  a[0].set_ylabel('Num cells')
for a in ax.T:
  a[-1].set_xlabel('Num mols')
fig.tight_layout()

cd8-cd19-mix-npmle-examples.png

llik_diff = benchmark['cd8-cd19-mix']['npmle'] - benchmark['cd8-cd19-mix']['unimodal']
query = benchmark['cd8-cd19-mix'].loc[llik_diff[llik_diff > 0].sort_values(ascending=False).head(n=4).index].index.astype(int)
plt.clf()
fig, ax = plt.subplots(2, 2)
fig.set_size_inches(4, 4)
for a, k in zip(ax.ravel(), query):
  h = a.hist(mix1.loc[y1 == 1,mix1.columns[k]], bins=np.arange(mix1.iloc[:,k].max() + 1), color='k', alpha=0.6, label='T cells')
  a.hist(mix1.loc[y1 == 0,mix1.columns[k]], bins=np.arange(mix1.iloc[:,k].max() + 1), color='r', alpha=0.6, label='B cells')
  a.text(x=.95, y=.95,
         s=f"$\Delta l$={llik_diff.iloc[k]:.1g}",
         horizontalalignment='right',
         verticalalignment='top',
         transform=a.transAxes)
  a.set_title(gene_info.loc[mix1.columns[k], 'name'] if mix1.columns[k] in gene_info.index else mix1.columns[k])
ax[0, 0].legend(frameon=False, loc='center right')
for a in ax:
  a[0].set_ylabel('Num cells')
for a in ax.T:
  a[-1].set_xlabel('Num molecules')
fig.tight_layout()

cd8-cd19-mlcb-examples.png

Look cytotoxic/naive T cell mix.

llik_diff = benchmark['cyto-naive-t-mix']['npmle'] - benchmark['cyto-naive-t-mix']['unimodal']
query = benchmark['cyto-naive-t-mix'].loc[llik_diff[llik_diff > 0].sort_values(ascending=False).head(n=12).index].index
plt.clf()
fig, ax = plt.subplots(3, 4)
fig.set_size_inches(8, 5)
for a, k in zip(ax.ravel(), query):
  a.hist(mix2.loc[y1 == 1,k], bins=np.arange(mix2.loc[:,k].max() + 1), alpha=0.6, color='k', label='Cytotoxic T')
  a.hist(mix2.loc[y1 == 0,k], bins=np.arange(mix2.loc[:,k].max() + 1), alpha=0.6, color='r', label='Naive T')
  a.text(x=.95, y=.95,
         s=f"$\Delta KL$={llik_diff.loc[k]:.1g}",
         horizontalalignment='right',
         verticalalignment='top',
         transform=a.transAxes)
  a.set_title(gene_info.loc[k, 'name'] if k in gene_info.index else k)
handles, labels = a.get_legend_handles_labels()
fig.legend(handles, labels, frameon=False, loc='center left', bbox_to_anchor=(1, 0.5))
for a in ax:
  a[0].set_ylabel('Num cells')
for a in ax.T:
  a[-1].set_xlabel('Num mols')
fig.tight_layout()

cyto-naive-t-mix-npmle-examples.png

MALAT1 (Metastasis-associated lung adenocarcinoma transcript 1) is a lincRNA which appears to show tri-modal gene expression in this mixture. It is involved in T cell activation, through interaction with NF-κB (Zhao et al. 2016).

Heterogeneous populations

Read the results.

benchmark = {
  k: g.reset_index(level=0, drop=True) for k, g in
  (pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-modes/data/deconv-generalization/heterogeneous-kl.txt.gz',
               sep='\t', index_col=[0, 1])
   .groupby(level=0))}

Cortex

Fix the cortex count matrix (seems to break the parser in pandas).

import scmodes
work = scmodes.dataset.cortex('/project2/mstephens/aksarkar/projects/singlecell-ideas/data/zeisel-2015/GSE60361_C1-3005-Expression.txt.gz', return_df=True)
work.to_csv('/project2/mstephens/aksarkar/projects/singlecell-ideas/data/zeisel-2015/zeisel-2015.txt.gz', compression='gzip', sep='\t')
sbatch --partition=broadwl --mem=16G
#!/bin/bash
source activate scmodes
python <<EOF
<<fix-cortex>>
EOF
cortex_counts = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-ideas/data/zeisel-2015/zeisel-2015.txt.gz', sep='\t', index_col=0)

Look at the fitted distribution for the genes where NPMLE does worst.

llik_diff = benchmark['cortex']['npmle'] - benchmark['cortex']['gamma']
llik_diff.sort_values().head(n=12)
Ttr           -3.136362
Hbb-bs        -3.032071
Hba-a2_loc1   -2.053220
Hba-a2_loc2   -1.985382
Sst           -1.842792
Apoe          -1.414460
Npy           -1.403264
Hbb-b2        -1.365434
Ccl3          -1.226098
Mgp           -1.198429
Acta2         -1.139285
Ptgds         -1.132885
dtype: float64
k = 'Ttr'
x = cortex_counts.loc[:,k].values
s = cortex_counts.sum(axis=1).values

plt.clf()
fig, ax = plt.subplots(2, 1)
fig.set_size_inches(6, 4)
ax[0].hist(x, bins=np.arange(0, x.max() + 1, 100), color='k')
ax[0].set_title(k.upper())
ax[0].set_xlabel('Num mols')
ax[0].set_ylabel('Num cells')

for c, k in zip(plt.get_cmap('Dark2').colors, ['Gamma', 'ZIG', 'Unimodal', 'ZIEF', 'NPMLE']):
  ax[1].plot(*getattr(scmodes.deconvolve, f'fit_{k.lower()}')(x, s), color=c, lw=1, label=k)
ax[1].set_xlabel('Latent gene expression')
ax[1].set_ylabel('CDF')
ax[1].legend(frameon=False)
fig.tight_layout()

cortex-ttr.png

Look at the distribution of latent gene expression values (naively estimated) for TTR.

pd.Series((x / s)[x > 0]).describe()
count    90.000000
mean      0.059269
std       0.152847
min       0.000055
25%       0.000368
50%       0.003031
75%       0.008099
max       0.610003
dtype: float64

For these genes, more than 90% of the samples have observed count 0, so the optimal model appears to predict 0 for all samples.

Look at genes where ZIG does better than Gamma.

llik_diff = benchmark['cortex']['point_gamma'] - benchmark['cortex']['gamma']
query = benchmark['cortex'].loc[llik_diff[llik_diff > 0].sort_values(ascending=False).head(n=12).index].index
plt.clf()
fig, ax = plt.subplots(4, 3)
fig.set_size_inches(7, 5)
for a, k in zip(ax.ravel(), query):
  a.hist(cortex_counts.loc[:,k], bins=np.arange(cortex_counts.loc[:,k].max() + 1), color='k')
  a.text(x=.95, y=.95,
         s=f"$\Delta KL$={llik_diff.loc[k]:.1f}",
         horizontalalignment='right',
         verticalalignment='top',
         transform=a.transAxes)
  a.set_title(k.upper())
for y in range(ax.shape[0]):
  ax[y][0].set_ylabel('Num cells')
for x in range(ax.shape[1]):
  ax[-1][x].set_xlabel('Num mols')
fig.tight_layout()

cortex-zig-examples.png

Look at genes where unimodal shows greatest performance gain.

llik_diff = benchmark['cortex']['unimodal'] - benchmark['cortex']['gamma']
query = benchmark['cortex'].loc[llik_diff[llik_diff > 0].sort_values(ascending=False).head(n=12).index].index
plt.clf()
fig, ax = plt.subplots(4, 3)
fig.set_size_inches(7, 5)
for a, k in zip(ax.ravel(), query):
  a.hist(cortex_counts.loc[:,k], bins=np.arange(cortex_counts.loc[:,k].max() + 1), color='k')
  a.text(x=.95, y=.95,
         s=f"$\Delta KL$={llik_diff.loc[k]:.1f}",
         horizontalalignment='right',
         verticalalignment='top',
         transform=a.transAxes)
  a.set_title(k.upper())
for y in range(ax.shape[0]):
  ax[y][0].set_ylabel('Num cells')
for x in range(ax.shape[1]):
  ax[-1][x].set_xlabel('Num mols')
fig.tight_layout()

cortex-unimodal-examples.png

Show genes where NPMLE shows greatest performance gain.

llik_diff = benchmark['cortex']['npmle'] - benchmark['cortex']['gamma']
query = benchmark['cortex'].loc[llik_diff[llik_diff > 0].sort_values(ascending=False).head(n=12).index].index
plt.clf()
fig, ax = plt.subplots(4, 3)
fig.set_size_inches(7, 5)
for a, k in zip(ax.ravel(), query):
  a.hist(cortex_counts.loc[:,k], bins=np.arange(cortex_counts.loc[:,k].max() + 1), color='k')
  a.text(x=.95, y=.95,
         s=f"$\Delta KL$={llik_diff.loc[k]:.1f}",
         horizontalalignment='right',
         verticalalignment='top',
         transform=a.transAxes)
  a.set_title(k.upper())
for y in range(ax.shape[0]):
  ax[y][0].set_ylabel('Num cells')
for x in range(ax.shape[1]):
  ax[-1][x].set_xlabel('Num mols')
fig.tight_layout()

cortex-npmle-examples.png

Stratify the gene expression of these genes by inferred cell type.

curl -sLO "https://storage.googleapis.com/linnarsson-lab-www-blobs/blobs/cortex/expression_mRNA_17-Aug-2014.txt"
gzip expression_mRNA_17-Aug-2014.txt
cortex_celltypes = (pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-ideas/data/zeisel-2015/expression_mRNA_17-Aug-2014.txt.gz', sep='\t', header=None, index_col=0, nrows=10)
                    .reset_index(drop=True)
                    .set_index(1))
plt.clf()
fig, ax = plt.subplots(3, 3)
fig.set_size_inches(6, 6)
for a, k in zip(ax.ravel(), set(cortex_celltypes.loc['level1class'])):
  x = cortex_counts.loc[(cortex_celltypes.loc['level1class'] == k).values, 'Gpm6a']
  a.hist(x, bins=np.arange(x.max() + 1), color='k')
  a.set_title(k)
for a in ax:
  a[0].set_ylabel('Num cells')
for a in ax.T:
  a[-1].set_xlabel('Num mols')
fig.tight_layout()

cortex-gpm6a.png

plt.clf()
fig, ax = plt.subplots(3, 3)
fig.set_size_inches(6, 6)
for a, k in zip(ax.ravel(), set(cortex_celltypes.loc['level1class'])):
  x = cortex_counts.loc[(cortex_celltypes.loc['level1class'] == k).values, 'Scn2a1']
  a.hist(x, bins=np.arange(x.max() + 1), color='k')
  a.set_title(k)
for a in ax:
  a[0].set_ylabel('Num cells')
for a in ax.T:
  a[-1].set_xlabel('Num mols')
fig.tight_layout()

cortex-scn2a.png

PBMCs

Read the PBMC data.

pbmcs_68k_counts = scmodes.dataset.read_10x('/project2/mstephens/aksarkar/projects/singlecell-ideas/data/10xgenomics/fresh_68k_pbmc_donor_a/filtered_matrices_mex/hg19/', return_df=True)
llik_diff = benchmark['pbmcs_68k']['npmle'] - benchmark['pbmcs_68k']['unimodal']
query = benchmark['pbmcs_68k'].loc[llik_diff[llik_diff > 0].sort_values(ascending=False).head(n=12).index].index
plt.clf()
fig, ax = plt.subplots(4, 3)
fig.set_size_inches(7, 5)
for a, k in zip(ax.ravel(), query):
  a.hist(pbmcs_68k_counts.loc[:,k], bins=np.arange(pbmcs_68k_counts.loc[:,k].max() + 1), color='k')
  a.text(x=.95, y=.95,
         s=f"$\Delta KL$={llik_diff.loc[k]:.2f}",
         horizontalalignment='right',
         verticalalignment='top',
         transform=a.transAxes)
  a.set_title(gene_info.loc[k, 'name'].upper())
for y in range(ax.shape[0]):
  ax[y][0].set_ylabel('Num cells')
for x in range(ax.shape[1]):
  ax[-1][x].set_xlabel('Num mols')
fig.tight_layout()

pbmc-npmle-examples.png

Author: Abhishek Sarkar

Created: 2019-09-08 Sun 22:16

Validate