MLCB figures

Table of Contents

Introduction

Draw examples for MLCB submission.

Setup

import numpy as np
import pandas as pd
import scipy.special as sp
import scipy.stats as st
import scmodes
import sqlite3
%matplotlib inline
%config InlineBackend.figure_formats = set(['retina'])
import matplotlib.pyplot as plt
plt.rcParams['figure.facecolor'] = 'w'
plt.rcParams['font.family'] = 'Nimbus Sans'

Results

MLCB plots

Plot the iPSC examples.

ipsc_res = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-modes/data/deconv-generalization/ipsc-gpu.txt.gz', index_col=0, sep='\t')
lrt = st.chi2(1).sf(-2 * (ipsc_res['gamma'] - ipsc_res['point_gamma']))
query = list(ipsc_res[lrt < .05 / lrt.shape[0]].index)

# SKP1: top eQTL
query.append('ENSG00000113558')
ipsc_counts = scmodes.dataset.ipsc(prefix='/project2/mstephens/aksarkar/projects/singlecell-qtl/data/',
                                   query=query, return_df=True)
annotation = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-annotation.txt', sep='\t')
keep_samples = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/quality-single-cells.txt', index_col=0, header=None, sep='\t')
annotation = annotation.loc[keep_samples.values.ravel()]
s = annotation['mol_hs']
donor_info = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-modes/data/20130606_sample_info.txt', sep='\t')
ipsc_log_mu = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/density-estimation/design1/zi2-log-mu.txt.gz', sep=' ', index_col=0)
ipsc_log_phi = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/density-estimation/design1/zi2-log-phi.txt.gz', sep=' ', index_col=0)
ipsc_logodds = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/density-estimation/design1/zi2-logodds.txt.gz', sep=' ', index_col=0)
del ipsc_log_mu['NA18498']
def point_gamma_cdf(grid, log_mu, log_phi, logodds):
  res = st.gamma(a=np.exp(-log_phi), scale=np.exp(log_mu + log_phi)).cdf(grid)
  res *= sp.expit(-logodds)
  res += sp.expit(logodds)
  return res

lam = ipsc_counts.values / s.values.reshape(-1, 1)
grid = np.linspace(lam.min(), lam.max(), 1000)
cdfs = np.array([[point_gamma_cdf(grid,
                                  ipsc_log_mu.loc[j, k],
                                  ipsc_log_phi.loc[j, k],
                                  ipsc_logodds.loc[j, k])
                  for k in ipsc_log_mu]
                 for j in ipsc_counts.columns])
gene_info = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-genes.txt.gz', sep='\t', index_col=0)
colors = {k: 'r' if v == 'male' else 'b' for k, v in
          donor_info.set_index('Sample')
          .filter(items=list(ipsc_log_mu.columns), axis=0)
          ['Gender']
          .iteritems()}

k = 'ENSG00000129824'
x = ipsc_counts.loc[:,k].values
plt.clf()
fig, ax = plt.subplots(2, 1)
fig.set_size_inches(4, 4)

ax[0].hist(x, bins=np.arange(x.max() + 1), color='k')
ax[0].set_title(gene_info.loc[k, 'name'])
ax[0].set_xlabel('Num mols')
ax[0].set_ylabel('Num cells')

for f, k in zip(cdfs[2], ipsc_log_mu.columns):
  ax[1].plot(grid, f, lw=1, alpha=0.2, c=colors.get(k, 'k'))
ax[1].set_xlim(0, 2e-4)
ax[1].set_xticks(np.linspace(0, 2e-4, 3))
dummy = [plt.Line2D([0], [0], c='r'), plt.Line2D([0], [0], c='b')]
ax[1].legend(dummy, ['Male', 'Female'], title='Sex', frameon=False)
ax[1].set_ylabel('CDF')
ax[1].set_xlabel('Latent gene expression')

fig.tight_layout()

mlcb-rps4y1.png

mean_qtl_stats = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-mapping/zinb/mean.txt.gz', sep=' ', index_col=0)
with sqlite3.connect('/project2/mstephens/aksarkar/projects/singlecell-qtl/browser/browser.db') as conn:
  geno = pd.read_sql('select * from mean_qtl_geno where gene = ?', con=conn, params=('ENSG00000113558',)).set_index('ind')['value']
cm = plt.get_cmap('Dark2')
k = 'ENSG00000113558'
x = ipsc_counts.loc[:,k].values
plt.clf()
fig, ax = plt.subplots(2, 1)
fig.set_size_inches(4, 4)

ax[0].hist(x, bins=np.arange(x.max() + 1), color='k')
ax[0].set_title(gene_info.loc[k, 'name'])
ax[0].set_xlabel('Num mols')
ax[0].set_ylabel('Num cells')

for f, k in zip(cdfs[1], ipsc_log_mu.columns):
  ax[1].plot(grid, f, lw=1, alpha=0.3, c=cm(int(geno[k])))
dummy = [plt.Line2D([0], [0], c=cm(i)) for i in range(3)]
# https://www.ncbi.nlm.nih.gov/snp/?term=rs13356194
ax[1].legend(dummy, ['CC', 'CT', 'TT'], title=mean_qtl_stats.loc['ENSG00000113558', 'id'].split('.')[0], frameon=False)
ax[1].set_ylabel('CDF')
ax[1].set_xlabel('Latent gene expression')

fig.tight_layout()

mlcb-skp1.png

Vector format

%config InlineBackend.figure_formats = set(['svg'])
plt.clf()
fig, ax = plt.subplots(1, len(results), sharey=True)
fig.set_size_inches(8, 2)
for a, k, t in zip(ax, results, titles):
  llik = results[k]['validation']
  for x, m in enumerate(llik):
    a.scatter(x + np.random.normal(scale=0.1, size=llik.shape[0]), llik[m].values, s=1, c='k', zorder=3)
  a.grid(c='0.8', lw=1, axis='x')
  a.set_xlim(-0.5, 2.5)
  a.set_xticks(np.arange(3))
  a.set_xticklabels([m.upper() for m in methods], rotation=90)
  a.set_title(t)
ax[0].set_ylabel('Validation log lik')
fig.tight_layout()

Sorry, your browser does not support SVG.

k = 'ENSG00000129824'
x = ipsc_counts.loc[:,k].values
plt.clf()
fig, ax = plt.subplots(2, 1)
fig.set_size_inches(4, 4)

ax[0].hist(x, bins=np.arange(x.max() + 1), color='k')
ax[0].set_title(gene_info.loc[k, 'name'])
ax[0].set_xlabel('Num mols')
ax[0].set_ylabel('Num cells')

for f, k in zip(cdfs[2], ipsc_log_mu.columns):
  ax[1].plot(grid, f, lw=1, alpha=0.2, c=colors.get(k, 'k'))
ax[1].set_xlim(0, 2e-4)
ax[1].set_xticks(np.linspace(0, 2e-4, 3))
dummy = [plt.Line2D([0], [0], c='r'), plt.Line2D([0], [0], c='b')]
ax[1].legend(dummy, ['Male', 'Female'], title='Sex', frameon=False)
ax[1].set_ylabel('CDF')
ax[1].set_xlabel('Latent gene expression')

fig.tight_layout()

Sorry, your browser does not support SVG.

cm = plt.get_cmap('Dark2')
k = 'ENSG00000113558'
x = ipsc_counts.loc[:,k].values
plt.clf()
fig, ax = plt.subplots(2, 1)
fig.set_size_inches(4, 4)

ax[0].hist(x, bins=np.arange(x.max() + 1), color='k')
ax[0].set_title(gene_info.loc[k, 'name'])
ax[0].set_xlabel('Num mols')
ax[0].set_ylabel('Num cells')

for f, k in zip(cdfs[1], ipsc_log_mu.columns):
  ax[1].plot(grid, f, lw=1, alpha=0.3, c=cm(int(geno[k])))
dummy = [plt.Line2D([0], [0], c=cm(i)) for i in range(3)]
# https://www.ncbi.nlm.nih.gov/snp/?term=rs13356194
ax[1].legend(dummy, ['CC', 'CT', 'TT'], title=mean_qtl_stats.loc['ENSG00000113558', 'id'].split('.')[0], frameon=False)
ax[1].set_ylabel('CDF')
ax[1].set_xlabel('Latent gene expression')

fig.tight_layout()

Sorry, your browser does not support SVG.

cytotoxic_t = scmodes.dataset.read_10x('/project2/mstephens/aksarkar/projects/singlecell-ideas/data/10xgenomics/cytotoxic_t/filtered_matrices_mex/hg19/', return_df=True)
k = 'ENSG00000109475'
x = cytotoxic_t.loc[:,k]
s = cytotoxic_t.sum(axis=1)

plt.clf()
fig, ax = plt.subplots(2, 1)
fig.set_size_inches(4, 4)
ax[0].hist(x, bins=np.arange(x.max() + 1), color='k')
ax[0].set_title(gene_info.loc[k, 'name'])
ax[0].set_xlabel('Num mols')
ax[0].set_ylabel('Num cells')

for c, k in zip(plt.get_cmap('Dark2').colors, ['Gamma', 'ZIG', 'Unimodal', 'ZIEF', 'NPMLE']):
  ax[1].plot(*getattr(scmodes.deconvolve, f'fit_{k.lower()}')(x, s), color=c, lw=1, label=k)
ax[1].set_xlabel('Latent gene expression')
ax[1].set_ylabel('CDF')
ax[1].legend(frameon=False)
fig.tight_layout()

Sorry, your browser does not support SVG.

Poster figures

%config InlineBackend.figure_formats = set(['svg'])
data = ['cytotoxic_t', 'b_cells', 'naive_t', 'cytotoxic_t-b_cells', 'cytotoxic_t-naive_t', 'pbmcs_68k']
methods = ['nmf', 'wglmpca', 'pvae']
titles = ['B cell', 'Cytotoxic T', 'Naive T', 'B cell/T cell', 'Naive/cytotoxic T', 'PBMC']
poisthin_res = pd.concat(
  {k: pd.concat([pd.read_csv(f'/project2/mstephens/aksarkar/projects/singlecell-modes/data/lra-generalization/{k}-{m}.txt.gz', header=[0, 1], index_col=0, sep='\t') for m in methods if not (k == 'pbmcs_68k' and m == 'wglmpca')], axis=1) for k in data})

plt.clf()
fig, ax = plt.subplots(2, 3, sharex=True)
fig.set_size_inches(5, 4)
for a, k, t in zip(ax.ravel(), data, titles):
  llik = poisthin_res.loc[k]['validation']
  for x, m in enumerate(llik):
    a.scatter(x + np.random.normal(scale=0.1, size=llik.shape[0]), llik[m].values, s=1, c='k', zorder=3)
  a.grid(c='0.8', lw=1, axis='x')
  a.set_xlim(-0.5, 2.5)
  a.set_xticks(np.arange(3))
  a.set_title(t)
for a in ax[-1]:
  a.set_xlabel('Method')
  a.set_xticklabels([m.upper() for m in llik.columns], rotation=90)
for a in ax:
  a[0].set_ylabel('Validation log lik')
fig.tight_layout()

Sorry, your browser does not support SVG.

data = ['cytotoxic_t', 'b_cells', 'ipsc', 'cytotoxic_t-b_cells', 'cytotoxic_t-naive_t', 'pbmcs_68k']
methods = ['oracle', 'ebpm_point', 'wnmf', 'wglmpca']
imputation_res = (pd.concat({(d, m): pd.read_csv(f'/project2/mstephens/aksarkar/projects/singlecell-modes/data/imputation/{d}-{m}.txt.gz', sep='\t', index_col=0)
                                 for d in data for m in methods if d != 'naive_t'})
                  .reset_index(level=0)
                  .reset_index(drop=True)
                  .rename({'level_0': 'dataset'}, axis=1))

plt.clf()
fig, ax = plt.subplots(2, 3, sharex=True)
fig.set_size_inches(5, 4)
for a, d, t in zip(ax.ravel(), data, ['T cells', 'B cells', 'iPSC', 'T cell/B cell', 'Cytotoxic/naive T', 'PBMC']):
  g = imputation_res[imputation_res['dataset'] == d]
  base = g.loc[g['method'] == 'ebpm_point']
  for x, m in enumerate(methods[2:]):
    y = g.loc[g['method'] == m]
    a.scatter(x + np.random.normal(size=y.shape[0], scale=0.1),
              1e5 * (base['loss'].values - y['loss'].values), s=2, c='k', zorder=4)
  a.axhline(y=0, c='r', lw=1, ls=':')
  a.set_title(t)
  a.grid(c='0.8', lw=1, axis='x')
for a in ax[-1]:
  a.set_xticks(np.arange(2))
  a.set_xticklabels([m.upper() for m in methods[2:]], rotation=90)
  a.set_xlabel('Method')
a = fig.add_subplot(111, frameon=False)
a.tick_params(labelcolor='none', top='off', bottom='off', left='off', right='off')
a.set_ylabel('Improvement in Poisson loss\nover mean imputation')
fig.tight_layout()

Sorry, your browser does not support SVG.

gof_res = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-modes/data/gof/gof.txt.gz', sep='\t', index_col=0)
plt.clf()
fig, ax = plt.subplots(1, 6, sharey=True)
fig.set_size_inches(8, 2)
for a, t, (k, g) in zip(ax.ravel(),
                        ['T cell', 'B cell', 'iPSC', 'T cell/B cell', 'Cytotoxic/naive T', 'PBMC'],
                        gof_res.groupby('dataset')):
  a.hist(g.loc[g['method'] == 'gamma', 'p'], bins=np.linspace(0, 1, 11), color='0.7', density=True)
  a.axhline(y=1, c='k', ls=':', lw=1)
  a.set_xlim([0, 1])
  a.set_title(t)
ax[0].set_ylabel('Density')
for a in ax:
  a.set_xlabel('$p$-value')
fig.tight_layout()

Sorry, your browser does not support SVG.

Author: Abhishek Sarkar

Created: 2020-02-13 Thu 21:27

Validate