Expression variation in 10X v3 PBMC data

Table of Contents

Introduction

We previously found no evidence of multi-modal gene expression in scRNA-seq of 68K PBMCs and 540K immune cells. Florian Wagner suggested this could be because the noise introduced by the experiments was too high, and that a more sensitive experiment would reveal multi-modal patterns. As a specific example, he analyzed scRNA-seq of 10K PBMCs generated by the 10X platform, version 3. Here, we repeat our systematic analysis on this data set.

Setup

Submitted batch job 64851566

import anndata
import numpy as np
import pandas as pd
import scanpy
import scipy.io as si
import scipy.sparse as ss
import scmodes
import scmodes.ebpm.sgd
%matplotlib inline
%config InlineBackend.figure_formats = set(['retina'])
import matplotlib.pyplot as plt
plt.rcParams['figure.facecolor'] = 'w'
plt.rcParams['font.family'] = 'Nimbus Sans'

Data

Read the count matrix.

x = (si.mmread('/project2/mstephens/aksarkar/projects/singlecell-ideas/data/10xgenomics/pbmc_10k_v3/filtered_feature_bc_matrix/matrix.mtx.gz')
     .tocsr())
x.shape
(33538, 11769)

Read the metadata.

barcodes = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-ideas/data/10xgenomics/pbmc_10k_v3/filtered_feature_bc_matrix/barcodes.tsv.gz', header=None, sep='\t')
barcodes.columns = ['barcode']
genes = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-ideas/data/10xgenomics/pbmc_10k_v3/filtered_feature_bc_matrix/features.tsv.gz', header=None, sep='\t')
genes.columns = ['ensg', 'name', 'type']
data = anndata.AnnData(x.T, obs=barcodes, var=genes)

Keep genes with a non-zero observation in at least 1 cell.

scanpy.pp.filter_genes(data, min_cells=1)
data.shape
(11769, 23036)

Write out the filtered data.

data.write('/scratch/midway2/aksarkar/modes/10k_pbmc_v3.h5ad')

Read the filtered data.

data = anndata.read_h5ad('/scratch/midway2/aksarkar/modes/10k_pbmc_v3.h5ad')

Results

Gamma assumption

Fit a Gamma distribution to expression variation at each gene.

gamma_res = scmodes.ebpm.sgd.ebpm_gamma(data.X, batch_size=128, lr=5e-2, max_epochs=40, verbose=True)
gamma_res = pd.DataFrame(np.vstack(gamma_res[:-1]).T, columns=['log_mu', 'neg_log_phi'])

Write out the results.

gamma_res.to_csv('/scratch/midway2/aksarkar/modes/10k_pbmc_v3-gamma.txt.gz', sep='\t')

Test GOF at each gene.

s = data.X.sum(axis=1).A.ravel()
gamma_gof = []
for j, (gene, (log_mu, neg_log_phi)) in enumerate(gamma_res.iterrows()):
  d, p = scmodes.benchmark.gof._gof(
    x=data.X[:,j].A.ravel(),
    cdf=scmodes.benchmark.gof._zig_cdf,
    pmf=scmodes.benchmark.gof._zig_pmf,
    size=s,
    log_mu=log_mu,
    log_phi=-neg_log_phi)
  gamma_gof.append([gene, d, p])
gamma_gof = pd.DataFrame(gamma_gof, columns=['gene', 'stat', 'p'])

Write the GOF results.

gamma_gof.to_csv('/project2/mstephens/aksarkar/projects/singlecell-modes/data/gof/10k_pbmc_v3-gamma.txt.gz', sep='\t')

Read the GOF results.

gamma_gof = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-modes/data/gof/10k_pbmc_v3-gamma.txt.gz', index_col=0, sep='\t')

Plot the histogram of GOF \(p\)-values.

plt.clf()
plt.gcf().set_size_inches(2, 2)
plt.hist(gamma_gof['p'].values.ravel(), bins=np.linspace(0, 1, 11), color='0.7', density=True)
plt.axhline(y=1, lw=1, ls=':', c='k')
plt.xlim(0, 1)
plt.xlabel('$p$-value')
plt.ylabel('Density')
plt.tight_layout()

gamma-gof-hist.png

Report the fraction of genes which significantly depart from Gamma (Bonferroni-corrected \(p < 0.05\)).

sig = gamma_gof[gamma_gof['p'] < 0.05 / gamma_gof.shape[0]]
sig.shape[0] / gamma_gof.shape[0]
0.046188574405278696

Point-Gamma assumption

Fit a point-Gamma assumption distribution to expression variation.

gamma_res = pd.read_csv('/scratch/midway2/aksarkar/modes/10k_pbmc_v3-gamma.txt.gz', sep='\t', index_col=0)
point_gamma_res = scmodes.ebpm.sgd.ebpm_point_gamma(
  data.X,
  init=(gamma_res['log_mu'].values.reshape(1, -1),
        gamma_res['neg_log_phi'].values.reshape(1, -1)),
  batch_size=128,
  lr=5e-2,
  max_epochs=20,
  verbose=True)
point_gamma_res = pd.DataFrame(np.vstack(point_gamma_res[:-1]).T, columns=['log_mu', 'neg_log_phi', 'logodds'])

Write out the results.

point_gamma_res.to_csv('/scratch/midway2/aksarkar/modes/10k_pbmc_v3-point-gamma.txt.gz')

Test for GOF.

point_gamma_gof = []
for j, (gene, (log_mu, neg_log_phi, logodds)) in enumerate(point_gamma_res.iterrows()):
  d, p = scmodes.benchmark.gof._gof(
    x=data.X[:,j].A.ravel(),
    cdf=scmodes.benchmark.gof._zig_cdf,
    pmf=scmodes.benchmark.gof._zig_pmf,
    size=s,
    log_mu=log_mu,
    log_phi=-neg_log_phi,
    logodds=logodds)
  point_gamma_gof.append([gene, d, p])
point_gamma_gof = pd.DataFrame(point_gamma_gof, columns=['gene', 'stat', 'p'])

Write out the GOF results.

point_gamma_gof.to_csv('/project2/mstephens/aksarkar/projects/singlecell-modes/data/gof/10k_pbmc_v3-point-gamma.txt.gz')
plt.clf()
plt.gcf().set_size_inches(2, 2)
plt.hist(point_gamma_gof['p'].values.ravel(), bins=np.linspace(0, 1, 11), color='0.7', density=True)
plt.axhline(y=1, lw=1, ls=':', c='k')
plt.xlim(0, 1)
plt.xlabel('$p$-value')
plt.ylabel('Density')
plt.tight_layout()

point-gamma-got-hist.png

Report the fraction of genes which significantly depart from Gamma (Bonferroni-corrected \(p < 0.05\)).

sig = point_gamma_gof[point_gamma_gof['p'] < 0.05 / point_gamma_gof.shape[0]]
sig.shape[0] / point_gamma_gof.shape[0]
0.04575447126237194

Unimodal assumption

Extract all genes which departed from a Gamma assumption on expression variation.

query = data[:,gamma_gof['p'] < 0.05 / gamma_gof.shape[0]]
query.shape
(11769, 1064)

Write out the data.

query.write('/scratch/midway2/aksarkar/modes/unimodal-data/10k_pbmcs_v3.h5ad')

Read the data.

query = anndata.read_h5ad('/scratch/midway2/aksarkar/modes/unimodal-data/10k_pbmcs_v3.h5ad')

Test for goodness of fit of UA.

sbatch --partition=broadwl -n1 -c28 --exclusive --time=24:00:00 --job-name=gof
#!/bin/bash
source activate scmodes
python <<EOF
import anndata
import multiprocessing as mp
import pandas as pd
import scmodes
with mp.Pool() as pool:
  data = anndata.read_h5ad('/scratch/midway2/aksarkar/modes/unimodal-data/10k_pbmcs_v3.h5ad')
  x = pd.DataFrame(data.X.A, index=data.obs['barcode'], columns=data.var['name'])
  unimodal_gof_res = scmodes.benchmark.evaluate_gof(x, methods=['unimodal'], pool=pool)
  unimodal_gof_res.to_csv('/project2/mstephens/aksarkar/projects/singlecell-modes/data/gof/10k_pbmcs_v3-unimodal.txt.gz', sep='\t')
EOF

Read the results.

unimodal_gof_res = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-modes/data/gof/10k_pbmcs_v3-unimodal.txt.gz', index_col=0, sep='\t')

Plot the histogram of GOF \(p\)-values.

plt.clf()
plt.gcf().set_size_inches(2, 2)
plt.hist(unimodal_gof_res['p'].values.ravel(), bins=np.linspace(0, 1, 11), color='0.7', density=True)
plt.axhline(y=1, lw=1, ls=':', c='k')
plt.xlim(0, 1)
plt.xlabel('$p$-value')
plt.ylabel('Density')
plt.tight_layout()

unimodal-gof-hist.png

Report the fraction of genes which significantly depart from UA (Bonferroni-corrected \(p < 0.05\)).

sig = unimodal_gof_res[unimodal_gof_res['p'] < 0.05 / unimodal_gof_res.shape[0]]
sig.shape[0] / unimodal_gof_res.shape[0]
0.1156015037593985

Plot the top genes (by diagnostic test statistic) which significantly depart from UA.

plt.clf()
fig, ax = plt.subplots(2, 4)
fig.set_size_inches(8, 4)
for a, (j, row) in zip(ax.ravel(), sig.sort_values('p').iterrows()):
  xj = data.X[:,j].A.ravel()
  a.hist(xj, bins=np.arange(xj.max() + 2), color='k')
  a.set_title(row['gene'])
for a in ax:
  a[0].set_ylabel('Num cells')
for a in ax.T:
  a[-1].set_xlabel('Num mols')
fig.tight_layout()

unimodal-ex.png

Plot the genes with highest average observed count which significantly depart from UA.

plt.clf()
fig, ax = plt.subplots(2, 4)
fig.set_size_inches(8, 4)
for a, j in zip(ax.ravel(), np.argsort(-query.X.mean(axis=0).A.ravel())):
  x_j = query.X[:,j].A.ravel()
  a.hist(x_j, bins=np.arange(x_j.max() + 2), color='k')
  a.set_title(query.var.iloc[j]['name'])
for a in ax:
  a[0].set_ylabel('Num cells')
for a in ax.T:
  a[-1].set_xlabel('Num mols')
fig.tight_layout()

unimodal-ex2.png

Author: Abhishek Sarkar

Created: 2020-01-09 Thu 11:28

Validate