Marker genes in in silico mixtures

Table of Contents

Introduction

We previously found that almost no genes depart from a unimodal assumption on expression variation, even for heterogeneous tissues. Here, we investigate this question on in silico mixtures of sorted immune cells.

Setup

import anndata
import numpy as np
import pandas as pd
import scanpy
import scipy.io
import scipy.sparse as ss
import scmodes
import sqlite3
# Notebook plotting setup: draw figures inline and request the 'retina'
# figure format for inline output.
%matplotlib inline
%config InlineBackend.figure_formats = set(['retina'])
import matplotlib.pyplot as plt
# White figure background.
plt.rcParams['figure.facecolor'] = 'w'
# Font family used for figure text.
plt.rcParams['font.family'] = 'Nimbus Sans'

Methods

Data

Load the sorted cells.

def _read_10x(prefix):
  """Load one 10x Genomics matrix directory into an AnnData object.

  Reads `matrix.mtx.gz`, `barcodes.tsv.gz` and `genes.tsv.gz` from the
  directory `prefix`. The count matrix is transposed before being handed to
  AnnData, so rows of the result are annotated by the barcode table (`obs`)
  and columns by the gene table (`var`). Both tables are read without a
  header row, so their columns are labelled 0, 1, ...
  """
  # Both annotation files are headerless, tab-separated tables.
  tsv_opts = dict(sep='\t', header=None)
  matrix = scipy.io.mmread(f'{prefix}/matrix.mtx.gz').tocsr()
  barcodes = pd.read_csv(f'{prefix}/barcodes.tsv.gz', **tsv_opts)
  features = pd.read_csv(f'{prefix}/genes.tsv.gz', **tsv_opts)
  return anndata.AnnData(matrix.T, obs=barcodes, var=features, filemode='memory')

# Names of the sorted-cell data sets; each one is a sub-directory of the
# 10xgenomics data directory read below.
cell_types = ['b_cells', 'cd14_monocytes', 'cd34', 'cd4_t_helper', 'cd56_nk',
         'cytotoxic_t', 'memory_t', 'naive_cytotoxic', 'naive_t', 'regulatory_t']

# One AnnData per cell type, keyed by cell type name (insertion order follows
# cell_types).
data = {k: _read_10x(f'/project2/mstephens/aksarkar/projects/singlecell-ideas/data/10xgenomics/{k}/filtered_matrices_mex/hg19/') for k in cell_types}
# TODO: anndata.concatenate is broken?
# Build the in silico mixture by hand instead: stack the per-cell-type obs
# tables, turning the concat key into a 'cell_type' column and the headerless
# column 0 into 'barcode'.
mix_obs = (pd.concat([data[k].obs for k in data], keys=data.keys())
           .reset_index(level=0)
           .rename({'level_0': 'cell_type', 0: 'barcode'}, axis=1))
# Gene annotations are taken from the B cell data set only.
# NOTE(review): this assumes every data set lists the same genes in the same
# order -- confirm.
mix_var = data['b_cells'].var.rename({0: 'gene', 1: 'name'}, axis=1)
# Important: we need CSR for vstack, but CSC to subset by gene downstream
# NOTE(review): _read_10x passes the transpose of a CSR matrix to AnnData, so
# the blocks stacked here may not actually be CSR -- confirm.
mix = anndata.AnnData(ss.vstack([data[k].X for k in data]).tocsc(), var=mix_var, obs=mix_obs)

Marker genes

Get marker genes from Thermo Fisher.

# Marker names, one list entry per marker; the trailing comment records the
# population (and expression level, where given) each marker is associated
# with.
# NOTE(review): these look like CD antigen names rather than the gene symbols
# matched against mix.var['name'] elsewhere in this file -- confirm before
# using them to subset genes.
markers = [
  'CD11B',   # DC pan
  'CD127',    # CD4 pan
  'CD13',    # DC pan
  'CD134',    # CD4 pan
  'CD137',    # CD4 pan
  'CD152',    # CD4 pan
  'CD154',    # CD4 pan
  'CD19',    # B cell pan
  'CD2',    # CD4 pan
  'CD20',    # B cell pan
  'CD22',    # B cell pan
  'CD25',    #  (HIGH) CD4 pan
  'CD27',    # CD4 pan
  'CD272',    # CD4 pan
  'CD279',    # CD4 pan
  'CD28',    # CD4 pan
  'CD3',    # CD4 pan
  'CD33',    # DC pan
  'CD4',    # CD4 pan
  'CD45RA',    # NAIVE
  'CD45RO',    # MEMORY
  'CD5',    # CD4 pan
  'CD62L',    # low: EFFECTOR high: MEMORY/NAIVE
  'CD69',    # HIGH
  'CD7',    # CD4 pan
  # NOTE(review): the trailing 'h' may be a transcription artifact -- confirm.
  'CD70h',    # B cell pan
  # The commas after the next two entries are required: without them Python
  # concatenates adjacent string literals into one bogus marker name.
  'CD79A',    # B cell pan
  'CD79B',    # B cell pan
  'CD80',    # DC pan
  'CD83',    # DC pan
  'CD86',    # DC pan
]

Results

Examples

Look at CD4.

# For each sorted cell type, plot how many cells contain each observed number
# of CD4 molecules.
idx = mix.var[mix.var['name'] == 'CD4'].index
# Despite its name, n_bins is the largest CD4 count observed in any cell of
# the mixture.
n_bins = mix.X[:,idx].max()
cm = plt.get_cmap('Paired')
plt.clf()
plt.gcf().set_size_inches(4, 3)
# NOTE(review): matplotlib 3.3 renamed `linthreshy` to `linthresh` and later
# releases reject the old spelling -- update this keyword to match the
# installed matplotlib.
plt.yscale('symlog', linthreshy=10)
for i, k in enumerate(data):
  # Use edges 0, 1, ..., n_bins + 1 so every integer count gets its own bin.
  # np.histogram closes the last bin on the right, so stopping the edges at
  # n_bins would merge cells with n_bins molecules into the n_bins - 1 bin.
  bins, edges = np.histogram(data[k].X[:,idx].toarray().ravel(), bins=np.arange(n_bins + 2))
  # `color` alone sets the line colour; also passing its alias `c` is
  # redundant and is rejected by recent matplotlib.
  plt.plot(edges[:-1], bins, color=cm(i), lw=1, label=k)
plt.axhline(y=0, lw=1, ls=':', c='k')
plt.title('CD4')
plt.legend(frameon=False, loc='center left', bbox_to_anchor=(1, .5))
plt.xlabel('Number of molecules')
plt.ylabel('Number of cells')
plt.tight_layout()

cd4.png

Look at CD8A.

# For each sorted cell type, plot how many cells contain each observed number
# of CD8A molecules.
idx = mix.var[mix.var['name'] == 'CD8A'].index
# Despite its name, n_bins is the largest CD8A count observed in any cell of
# the mixture.
n_bins = mix.X[:,idx].max()
cm = plt.get_cmap('Paired')
plt.clf()
plt.gcf().set_size_inches(4, 3)
# NOTE(review): matplotlib 3.3 renamed `linthreshy` to `linthresh` and later
# releases reject the old spelling -- update this keyword to match the
# installed matplotlib.
plt.yscale('symlog', linthreshy=10)
for i, k in enumerate(data):
  # Use edges 0, 1, ..., n_bins + 1 so every integer count gets its own bin.
  # np.histogram closes the last bin on the right, so stopping the edges at
  # n_bins would merge cells with n_bins molecules into the n_bins - 1 bin.
  bins, edges = np.histogram(data[k].X[:,idx].toarray().ravel(), bins=np.arange(n_bins + 2))
  # `color` alone sets the line colour; also passing its alias `c` is
  # redundant and is rejected by recent matplotlib.
  plt.plot(edges[:-1], bins, color=cm(i), lw=1, label=k)
plt.axhline(y=0, lw=1, ls=':', c='k')
plt.title('CD8')
plt.legend(frameon=False, loc='center left', bbox_to_anchor=(1, .5))
plt.xlabel('Number of molecules')
plt.ylabel('Number of cells')
plt.tight_layout()

cd8a.png

Browser

Pick out an interesting set of genes (superset of markers).

# Write per-cell-type count histograms to the `markers` table of the browser
# database, for every gene whose name starts with 'CD' or 'HLA'. The table has
# columns n_mols, n_cells, cell_type and name.
#
# Select genes by the 'name' column directly rather than by position within
# each row; na=False treats a missing name as "no match".
idx = mix.var.loc[mix.var['name'].str.startswith('CD', na=False) | mix.var['name'].str.startswith('HLA', na=False)].index
# `with conn:` only commits or rolls back the transaction; it does not close
# the connection, so close it explicitly once both steps are done.
conn = sqlite3.connect('/project2/mstephens/aksarkar/projects/singlecell-modes/browser/browser.db')
try:
  with conn:
    # Rebuild the table from scratch on every run.
    conn.execute('drop table if exists markers;')
    for j in idx:
      print(f'Processsing gene {j}')
      # Despite its name, n_bins is the largest count observed for this gene.
      n_bins = mix.X[:,j].toarray().ravel().max()
      # Skip genes that never exceed one molecule in any cell.
      if n_bins <= 1:
        continue
      for k in data:
        # Use edges 0, 1, ..., n_bins + 1 so every integer count gets its own
        # bin. np.histogram closes the last bin on the right, so stopping the
        # edges at n_bins would merge cells with n_bins molecules into the
        # n_bins - 1 bin.
        bins, edges = np.histogram(data[k].X[:,j].toarray().ravel(), bins=np.arange(n_bins + 2))
        t = pd.DataFrame({'n_mols': edges[:-1], 'n_cells': bins})
        t['cell_type'] = k
        t['name'] = mix.var.loc[j, 'name']
        t.to_sql(name='markers', con=conn, if_exists='append', index=False)
  # Create the lookup index in its own transaction, after the table contents
  # have been committed.
  with conn:
    conn.execute('create index idx_markers on markers(name, cell_type);')
finally:
  conn.close()

Author: Abhishek Sarkar

Created: 2019-12-13 Fri 23:59

Validate