Expression variation in human retina cells

Table of Contents

Introduction

Lukowski et al. 2019 generated a cell atlas of the human retina. Here, we study the prevalence of multi-modal expression variation in this data.

Setup

import anndata
import numpy as np
import pandas as pd
import scanpy as sc
import scipy.io as si
import scipy.sparse as ss
import scmodes
import scmodes.benchmark.gof
import scmodes.ebpm.sgd
import torch
%matplotlib inline
%config InlineBackend.figure_formats = set(['retina'])
import matplotlib.pyplot as plt
plt.rcParams['figure.facecolor'] = 'w'
plt.rcParams['font.family'] = 'Nimbus Sans'

Data

Read the data, and retain genes which have non-zero observations in at least 1% of cells.

prefix = '/project2/mstephens/aksarkar/projects/singlecell-ideas/data/human-cell-atlas/adult-retina/7f5e7a85-a45c-4535-90bf-0ef930a0040b.mtx'
counts = si.mmread(f'{prefix}/matrix.mtx.gz').tocsr()
# Important: the HCA metadata has a header, which breaks scmodes.dataset.read_10x
samples = pd.read_csv(f'{prefix}/cells.tsv.gz', sep='\t')
genes = pd.read_csv(f'{prefix}/genes.tsv.gz', sep='\t')
x = anndata.AnnData(counts.T, obs=samples, var=genes)
sc.pp.filter_genes(x, min_cells=.01 * x.shape[0])
x
AnnData object with n_obs × n_vars = 41350 × 11446
obs: 'cellkey', 'genes_detected', 'file_uuid', 'file_version', 'total_umis', 'emptydrops_is_cell', 'barcode', 'cell_suspension.provenance.document_id', 'specimen_from_organism.provenance.document_id', 'derived_organ_ontology', 'derived_organ_label', 'derived_organ_parts_ontology', 'derived_organ_parts_label', 'cell_suspension.genus_species.ontology', 'cell_suspension.genus_species.ontology_label', 'donor_organism.provenance.document_id', 'donor_organism.human_specific.ethnicity.ontology', 'donor_organism.human_specific.ethnicity.ontology_label', 'donor_organism.diseases.ontology', 'donor_organism.diseases.ontology_label', 'donor_organism.development_stage.ontology', 'donor_organism.development_stage.ontology_label', 'donor_organism.sex', 'donor_organism.is_living', 'specimen_from_organism.organ.ontology', 'specimen_from_organism.organ.ontology_label', 'specimen_from_organism.organ_parts.ontology', 'specimen_from_organism.organ_parts.ontology_label', 'library_preparation_protocol.provenance.document_id', 'library_preparation_protocol.input_nucleic_acid_molecule.ontology', 'library_preparation_protocol.input_nucleic_acid_molecule.ontology_label', 'library_preparation_protocol.library_construction_method.ontology', 'library_preparation_protocol.library_construction_method.ontology_label', 'library_preparation_protocol.end_bias', 'library_preparation_protocol.strand', 'project.provenance.document_id', 'project.project_core.project_short_name', 'project.project_core.project_title', 'analysis_protocol.provenance.document_id', 'dss_bundle_fqid', 'bundle_uuid', 'bundle_version', 'analysis_protocol.protocol_core.protocol_id', 'analysis_working_group_approval_status'
var: 'featurekey', 'featurename', 'featuretype', 'chromosome', 'featurestart', 'featureend', 'isgene', 'genus_species', 'n_cells'

Write the h5ad out.

x.write('/project2/mstephens/aksarkar/projects/singlecell-ideas/data/human-cell-atlas/adult-retina/adult-retina.h5ad')

Report the number of samples per donor.

x.obs['donor_organism.provenance.document_id'].value_counts()
427c0a62-9baf-42ab-a3a3-f48d10544280    21285
b8049daa-7458-47bf-8ec2-3f5c56d2cb34    16164
66b7152c-8e94-412b-b0e4-f8fbf554df5a     3901
Name: donor_organism.provenance.document_id, dtype: int64

Read the h5ad.

x = anndata.read_h5ad('/project2/mstephens/aksarkar/projects/singlecell-ideas/data/human-cell-atlas/adult-retina/adult-retina.h5ad')

Results

Gamma assumption

Fit a Gamma distribution to expression variation at each gene within the donor with maximum number of cells assayed.

query = x.obs['donor_organism.provenance.document_id'] == '427c0a62-9baf-42ab-a3a3-f48d10544280'
res = scmodes.ebpm.sgd.ebpm_gamma(x[query].X, batch_size=64, max_epochs=20, verbose=True)

Test GOF at each gene.

s = x[query].X.sum(axis=1).A.ravel()
gof_res = []
for j, (log_mu, neg_log_phi) in enumerate(np.vstack(res[:-1]).T):
  d, p = scmodes.benchmark.gof._gof(x=x[query,j].X.A.ravel(),
                                    cdf=scmodes.benchmark.gof._zig_cdf,
                                    pmf=scmodes.benchmark.gof._zig_pmf,
                                    size=s,
                                    log_mu=log_mu,
                                    log_phi=-neg_log_phi)
  gof_res.append([d, p])
gof_res = pd.DataFrame(gof_res, columns=['stat', 'p'], index=x.var['featurename']).reset_index()
gof_res.to_csv('/project2/mstephens/aksarkar/projects/singlecell-modes/data/gof/adult-retina-gamma.txt.gz', sep='\t', compression='gzip')

Plot the histogram of GOF \(p\)-values.

plt.clf()
plt.gcf().set_size_inches(2, 2)
plt.hist(gof_res['p'], bins=np.linspace(0, 1, 11), color='0.7', density=True)
plt.axhline(y=1, lw=1, ls=':', c='k')
plt.xlim(0, 1)
plt.xlabel('$p$-value')
plt.ylabel('Density')
plt.tight_layout()

gamma-gof-hist.png

Report the fraction of genes which significantly depart from Gamma (Bonferroni-corrected \(p < 0.05\)).

sig = gof_res.loc[gof_res['p'] < 0.05 / gof_res.shape[0]]
sig.shape[0] / gof_res.shape[0]
0.020094356106936922

Point-Gamma assumption

Fit a point-Gamma distribution to expression variation at each gene.

res = scmodes.ebpm.sgd.ebpm_point_gamma(x[query].X, max_epochs=20, verbose=True)

Test GOF at each gene.

s = x[query].X.sum(axis=1).A.ravel()
result = []
for j, (log_mu, neg_log_phi, logodds) in enumerate(np.vstack(res[:3]).T):
  d, p = scmodes.benchmark.gof._gof(x=x[query,j].X.A.ravel(),
                                    cdf=scmodes.benchmark.gof._zig_cdf,
                                    pmf=scmodes.benchmark.gof._zig_pmf,
                                    size=s,
                                    log_mu=log_mu,
                                    log_phi=-neg_log_phi,
                                    logodds=logodds)
  result.append([d, p])
result = pd.DataFrame(result, columns=['stat', 'p'], index=x.var['featurename']).reset_index()
result.to_csv('/project2/mstephens/aksarkar/projects/singlecell-modes/data/gof/retina-point-gamma.txt.gz', sep='\t', compression='gzip')

Plot the histogram of GOF \(p\)-values.

plt.clf()
plt.gcf().set_size_inches(2, 2)
plt.hist(result['p'], bins=np.linspace(0, 1, 11), color='0.7', density=True)
plt.axhline(y=1, lw=1, ls=':', c='k')
plt.xlim(0, 1)
plt.xlabel('$p$-value')
plt.ylabel('Density')
plt.tight_layout()

point-gamma-gof-hist.png

Report the fraction of genes which significantly depart from point-Gamma (Bonferroni-corrected \(p < 0.05\)).

sig = result.loc[result['p'] < 0.05 / result.shape[0]]
sig.shape[0] / result.shape[0]
0.01712388607373755

Write out the significant genes.

x.obs['size'] = x.X.sum(axis=1)
x[query,sig.index].write('/scratch/midway2/aksarkar/modes/unimodal-data/retina.h5ad')

Unimodal assumption

Fit a unimodal distribution to expression variation at each gene which departed significantly from a point-Gamma assumption.

sbatch --partition=broadwl -n1 -c28 --exclusive --time=12:00:00 --job-name=gof
#!/bin/bash
source activate scmodes
python <<EOF
import anndata
import multiprocessing as mp
import pandas as pd
import scmodes
with mp.Pool(maxtasksperchild=20) as pool:
  x = anndata.read('/scratch/midway2/aksarkar/modes/unimodal-data/retina.h5ad')
  res = scmodes.benchmark.evaluate_gof(pd.DataFrame(x.X.A), s=x.obs['size'], pool=pool, 
                                       methods=['unimodal'])
  res.index = x.var['featurename']
  res.to_csv(f'/project2/mstephens/aksarkar/projects/singlecell-modes/data/gof/retina-unimodal.txt.gz', compression='gzip', sep='\t')
EOF

Read the results.

unimodal_res = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-modes/data/gof/retina-unimodal.txt.gz', sep='\t', index_col=0)

Plot the histogram of GOF \(p\)-values.

plt.clf()
plt.gcf().set_size_inches(2, 2)
plt.hist(unimodal_res['p'], bins=np.linspace(0, 1, 11), color='0.7', density=True)
plt.axhline(y=1, lw=1, ls=':', c='k')
plt.xlim(0, 1)
plt.xlabel('$p$-value')
plt.ylabel('Density')
plt.tight_layout()

unimodal-gof-hist.png

Report the number and fraction of genes which depart from a unimodal assumption.

sig = unimodal_res.loc[unimodal_res['p'] < .05 / unimodal_res.shape[0]]
sig.shape[0], sig.shape[0] / x.shape[1]
(124, 0.010833478944609471)

Author: Abhishek Sarkar

Created: 2020-07-21 Tue 11:35

Validate