Manuscript figures
Setup
import anndata import numpy as np import os import pandas as pd import scanpy as sc import scmodes import scipy.stats as st import scipy.special as sp import sqlite3
import rpy2.robjects.packages import rpy2.robjects.pandas2ri rpy2.robjects.pandas2ri.activate() ashr = rpy2.robjects.packages.importr('ashr')
%matplotlib inline %config InlineBackend.figure_formats = set(['svg'])
import matplotlib.pyplot as plt plt.rcParams['figure.facecolor'] = 'w' plt.rcParams['font.family'] = 'Nimbus Sans'
Figure 1
Preprint
Draw an example of our approach to testing GOF using real data.
gene_info = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-genes.txt.gz', sep='\t', index_col=0)
cytotoxic_t = scmodes.dataset.read_10x('/project2/mstephens/aksarkar/projects/singlecell-ideas/data/10xgenomics/cytotoxic_t/filtered_matrices_mex/hg19/', return_df=True)
gene = 'ENSG00000109475' x = cytotoxic_t.loc[:,gene] s = cytotoxic_t.sum(axis=1) y = np.arange(x.max() + 1)
rpp = dict() pmf = dict() np.random.seed(1)
gamma_res = scmodes.ebpm.ebpm_gamma(x, s) rpp['Gamma'] = scmodes.benchmark.gof._rpp( scmodes.benchmark.gof._zig_cdf(x - 1, size=s, log_mu=gamma_res[0], log_phi=-gamma_res[1]), scmodes.benchmark.gof._zig_pmf(x, size=s, log_mu=gamma_res[0], log_phi=-gamma_res[1])) pmf['Gamma'] = np.array([scmodes.benchmark.gof._zig_pmf(k, size=s, log_mu=gamma_res[0], log_phi=-gamma_res[1]).mean() for k in y])
point_gamma_res = scmodes.ebpm.ebpm_point_gamma(x, s) rpp['ZIG'] = scmodes.benchmark.gof._rpp( scmodes.benchmark.gof._zig_cdf(x - 1, size=s, log_mu=point_gamma_res[0], log_phi=-point_gamma_res[1], logodds=point_gamma_res[2]), scmodes.benchmark.gof._zig_pmf(x, size=s, log_mu=point_gamma_res[0], log_phi=-point_gamma_res[1], logodds=point_gamma_res[2])) pmf['ZIG'] = np.array([scmodes.benchmark.gof._zig_pmf(k, size=s, log_mu=point_gamma_res[0], log_phi=-point_gamma_res[1], logodds=point_gamma_res[2]).mean() for k in y])
zief_res = scmodes.ebpm.ebpm_point_expfam(x, s) rpp['ZIEF'] = scmodes.benchmark.gof._rpp( scmodes.benchmark.gof._point_expfam_cdf(x.values.ravel() - 1, size=s, res=zief_res), scmodes.benchmark.gof._point_expfam_pmf(x.values.ravel(), size=s, res=zief_res)) # We need np.full here because _point_expfam_pmf does not broadcast pmf['ZIEF'] = np.array([scmodes.benchmark.gof._point_expfam_pmf(np.full(x.shape, k), size=s, res=zief_res).mean() for k in y])
unimodal_res = scmodes.ebpm.ebpm_unimodal(x, s) rpp['Unimodal'] = scmodes.benchmark.gof._rpp( scmodes.benchmark.gof._ash_cdf(x - 1, s=s, fit=unimodal_res), scmodes.benchmark.gof._ash_pmf(x, s=s, fit=unimodal_res)) # It is simpler to compute this here than to mess with the ash_data object g = np.array(unimodal_res.rx2('fitted_g')) a = np.fmin(g[1], g[2]) b = np.fmax(g[1], g[2]) comp_dens_conv = np.array([((st.gamma(a=k + 1, scale=1 / s.values.reshape(-1, 1)).cdf(b.reshape(1, -1)) - st.gamma(a=k + 1, scale=1 / s.values.reshape(-1, 1)).cdf(a.reshape(1, -1))) / np.outer(s, b - a)).mean(axis=0) for k in y]) comp_dens_conv[:,0] = st.poisson(mu=s.values.reshape(-1, 1) * b[0]).pmf(y).mean(axis=0) pmf['Unimodal'] = comp_dens_conv @ g[0]
npmle_res = scmodes.ebpm.ebpm_npmle(x, s) rpp['NPMLE'] = scmodes.benchmark.gof._rpp( scmodes.benchmark.gof._ash_cdf(x - 1, s=s, fit=npmle_res), scmodes.benchmark.gof._ash_pmf(x, s=s, fit=npmle_res)) g = np.array(npmle_res.rx2('fitted_g')) a = np.fmin(g[1], g[2]) b = np.fmax(g[1], g[2]) comp_dens_conv = np.array([((st.gamma(a=k + 1, scale=1 / s.values.reshape(-1, 1)).cdf(b.reshape(1, -1)) - st.gamma(a=k + 1, scale=1 / s.values.reshape(-1, 1)).cdf(a.reshape(1, -1))) / np.outer(s, b - a)).mean(axis=0) for k in y]) pmf['NPMLE'] = comp_dens_conv @ g[0]
plt.clf() fig, ax = plt.subplots(2, 1) fig.set_size_inches(4, 4) ax[0].hist(x, bins=y, color='k', label=None) for c, k in zip(plt.get_cmap('Dark2').colors, ['Gamma', 'ZIG', 'Unimodal', 'ZIEF', 'NPMLE']): ax[0].plot(y + .5, x.shape[0] * pmf[k], color=c, lw=1, label=k) ax[0].legend(frameon=False) ax[0].set_title(gene_info.loc[gene, 'name']) ax[0].set_xlabel('Number of molecules') ax[0].set_ylabel('Number of cells') for c, k in zip(plt.get_cmap('Dark2').colors, ['Gamma', 'ZIG', 'Unimodal', 'ZIEF', 'NPMLE']): ax[1].plot(*getattr(scmodes.deconvolve, f'fit_{k.lower()}')(x, s), color=c, lw=1, label=k) ax[1].set_xlabel('Latent gene expression') ax[1].set_ylabel('CDF') ax[1].legend(frameon=False) fig.tight_layout()
plt.clf() plt.gcf().set_size_inches(3, 3) grid = np.linspace(0, 1, x.shape[0] + 1)[1:] plt.xscale('log') plt.yscale('log') for c, k in zip(plt.get_cmap('Dark2').colors, ['Gamma', 'ZIG', 'Unimodal', 'ZIEF', 'NPMLE']): if k in rpp: plt.plot(grid, np.sort(rpp[k]), color=c, lw=1, marker=None, label=k) plt.plot([1e-5, 1], [1e-5, 1], lw=1, ls=':', c='0.5') plt.legend(frameon=False) plt.xlim(1e-5, 1) plt.ylim(1e-5, 1) plt.xlabel('Theoretical quantiles') plt.ylabel('Randomized quantiles') plt.tight_layout()
Draw the histogram of GOF test \(p\)-values.
gof_res = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-modes/data/gof/gof.txt.gz', sep='\t', index_col=0) plt.clf() fig, ax = plt.subplots(1, 6, sharey=True) fig.set_size_inches(8, 2) for a, t, (k, g) in zip(ax.ravel(), ['T cell', 'B cell', 'iPSC', 'T cell/B cell', 'Cytotoxic/naive T', 'PBMC'], gof_res.groupby('dataset')): a.hist(g.loc[g['method'] == 'gamma', 'p'], bins=np.linspace(0, 1, 11), color='0.7', density=True) a.axhline(y=1, c='k', ls=':', lw=1) a.set_xlim([0, 1]) a.set_title(t) ax[0].set_ylabel('Density') for a in ax: a.set_xlabel('$p$-value') fig.tight_layout()
Draw examples of multi-modal gene expression in iPSCs
ipsc_res = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-modes/data/deconv-generalization/ipsc-gpu.txt.gz', index_col=0, sep='\t') lrt = st.chi2(1).sf(-2 * (ipsc_res['gamma'] - ipsc_res['point_gamma'])) query = list(ipsc_res[lrt < .05 / lrt.shape[0]].index) # SKP1: top eQTL query.append('ENSG00000113558') ipsc_counts = scmodes.dataset.ipsc(prefix='/project2/mstephens/aksarkar/projects/singlecell-qtl/data/', query=query, return_df=True)
annotation = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-annotation.txt', sep='\t') keep_samples = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/quality-single-cells.txt', index_col=0, header=None, sep='\t') annotation = annotation.loc[keep_samples.values.ravel()] s = annotation['mol_hs'] donor_info = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-modes/data/20130606_sample_info.txt', sep='\t')
ipsc_log_mu = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/density-estimation/design1/zi2-log-mu.txt.gz', sep=' ', index_col=0) ipsc_log_phi = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/density-estimation/design1/zi2-log-phi.txt.gz', sep=' ', index_col=0) ipsc_logodds = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/density-estimation/design1/zi2-logodds.txt.gz', sep=' ', index_col=0) del ipsc_log_mu['NA18498']
def point_gamma_cdf(grid, log_mu, log_phi, logodds): res = st.gamma(a=np.exp(-log_phi), scale=np.exp(log_mu + log_phi)).cdf(grid) res *= sp.expit(-logodds) res += sp.expit(logodds) return res lam = ipsc_counts.values / s.values.reshape(-1, 1) grid = np.linspace(lam.min(), lam.max(), 1000) cdfs = np.array([[point_gamma_cdf(grid, ipsc_log_mu.loc[j, k], ipsc_log_phi.loc[j, k], ipsc_logodds.loc[j, k]) for k in ipsc_log_mu] for j in ipsc_counts.columns])
colors = {k: 'r' if v == 'male' else 'b' for k, v in donor_info.set_index('Sample') .filter(items=list(ipsc_log_mu.columns), axis=0) ['Gender'] .iteritems()}
k = 'ENSG00000129824' x = ipsc_counts.loc[:,k].values plt.clf() fig, ax = plt.subplots(2, 1) fig.set_size_inches(4, 4) ax[0].hist(x, bins=np.arange(x.max() + 1), color='k') ax[0].set_title(gene_info.loc[k, 'name']) ax[0].set_xlabel('Number of molecules') ax[0].set_ylabel('Number of cells') for f, k in zip(cdfs[2], ipsc_log_mu.columns): ax[1].plot(grid, f, lw=1, alpha=0.2, c=colors.get(k, 'k')) ax[1].set_xlim(0, 2e-4) ax[1].set_xticks(np.linspace(0, 2e-4, 3)) dummy = [plt.Line2D([0], [0], c='r'), plt.Line2D([0], [0], c='b')] ax[1].legend(dummy, ['Male', 'Female'], title='Sex', frameon=False) ax[1].set_ylabel('CDF') ax[1].set_xlabel('Latent gene expression') fig.tight_layout()
Include a gene with an eQTL to illustrate how marginally “multi-modal” might not be the correct question to ask.
mean_qtl_stats = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-qtl/data/scqtl-mapping/zinb/mean.txt.gz', sep=' ', index_col=0)
with sqlite3.connect('/project2/mstephens/aksarkar/projects/singlecell-qtl/browser/browser.db') as conn: geno = pd.read_sql('select * from mean_qtl_geno where gene = ?', con=conn, params=('ENSG00000113558',)).set_index('ind')['value']
cm = plt.get_cmap('Dark2') k = 'ENSG00000113558' x = ipsc_counts.loc[:,k].values plt.clf() fig, ax = plt.subplots(2, 1) fig.set_size_inches(4, 4) ax[0].hist(x, bins=np.arange(x.max() + 1), color='k') ax[0].set_title(gene_info.loc[k, 'name']) ax[0].set_xlabel('Number of molecules') ax[0].set_ylabel('Number of cells') for f, k in zip(cdfs[1], ipsc_log_mu.columns): ax[1].plot(grid, f, lw=1, alpha=0.3, c=cm(int(geno[k]))) dummy = [plt.Line2D([0], [0], c=cm(i)) for i in range(3)] # https://www.ncbi.nlm.nih.gov/snp/?term=rs13356194 ax[1].legend(dummy, ['CC', 'CT', 'TT'], title=mean_qtl_stats.loc['ENSG00000113558', 'id'].split('.')[0], frameon=False) ax[1].set_ylabel('CDF') ax[1].set_xlabel('Latent gene expression') fig.tight_layout()
Draw examples of bimodal genes detected in an in silico mixture.
def _mix_10x(k1, k2, min_detect=0.01, return_y=False): x1 = scmodes.dataset.read_10x(f'/project2/mstephens/aksarkar/projects/singlecell-ideas/data/10xgenomics/{k1}/filtered_matrices_mex/hg19/', return_df=True, min_detect=0) x2 = scmodes.dataset.read_10x(f'/project2/mstephens/aksarkar/projects/singlecell-ideas/data/10xgenomics/{k2}/filtered_matrices_mex/hg19/', return_df=True, min_detect=0) x, y = scmodes.dataset.synthetic_mix(x1, x2, min_detect=min_detect) if return_y: return x, y else: return x def _cd8_cd19_mix(**kwargs): return _mix_10x('cytotoxic_t', 'b_cells', **kwargs) mix_x, mix_y = _cd8_cd19_mix(return_y=True)
gof_res = pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-modes/data/gof/gof.txt.gz', sep='\t', index_col=0)
sig0 = gof_res[gof_res['method'] == 'gamma'].groupby('dataset').apply(lambda x: x.loc[x['p'] < 0.05 / x.shape[0]]).reset_index(drop=True) nsig1 = gof_res[gof_res['method'] == 'zig'].groupby('dataset').apply(lambda x: x.loc[x['p'] >= 0.05 / x.shape[0]]).reset_index(drop=True) examples = sig0.merge(nsig1, on=['dataset', 'gene'], suffixes=['_gamma', '_zig']) del examples['method_gamma'] del examples['method_zig']
plt.clf() fig, ax = plt.subplots(2, 2) fig.set_size_inches(4, 4) for a, k in zip(ax.ravel(), examples.sort_values('p_gamma')['gene'].head(n=4)): for i, (l, c) in enumerate(zip(['B cell', 'T cell'], ['r', 'k'])): x = mix_x.loc[mix_y == i,k] a.hist(x, bins=np.arange(x.max() + 1), color=c, label=l, alpha=0.5) a.set_title(gene_info.loc[k, 'name']) for a in ax: a[0].set_ylabel('Number of cells') for a in ax.T: a[-1].set_xlabel('Number of molecules') ax[0,0].legend(frameon=False) fig.tight_layout()
Revision
Pick a better example gene.
k = 'ENSG00000129824' x = ipsc_counts.loc[:,k].values y = np.arange(x.max() + 1) grid = np.linspace(0, (x / s).max(), 1000) pmf = dict() cdf = dict() gamma_res = scmodes.ebpm.ebpm_gamma(x, s) pmf['Gamma'] = np.array([scmodes.benchmark.gof._zig_pmf(k, size=s, log_mu=gamma_res[0], log_phi=-gamma_res[1]).mean() for k in y]) cdf['Gamma'] = st.gamma(a=np.exp(gamma_res[1]), scale=np.exp(gamma_res[0] - gamma_res[1])).cdf(grid) point_gamma_res = scmodes.ebpm.ebpm_point_gamma(x, s) cdf['Point-Gamma'] = sp.expit(point_gamma_res[2]) + sp.expit(-point_gamma_res[2]) * st.gamma(a=np.exp(point_gamma_res[1]), scale=np.exp(point_gamma_res[0] - point_gamma_res[1])).cdf(grid) pmf['Point-Gamma'] = np.array([scmodes.benchmark.gof._zig_pmf(k, size=s, log_mu=point_gamma_res[0], log_phi=-point_gamma_res[1], logodds=point_gamma_res[2]).mean() for k in y]) unimodal_res = scmodes.ebpm.ebpm_unimodal(x, s) g = np.array(unimodal_res.rx2('fitted_g')) g = g[:,g[0] > 1e-8] a = np.fmin(g[1], g[2]) b = np.fmax(g[1], g[2]) comp_dens_conv = np.array([((st.gamma(a=k + 1, scale=1 / s.values.reshape(-1, 1)).cdf(b.reshape(1, -1)) - st.gamma(a=k + 1, scale=1 / s.values.reshape(-1, 1)).cdf(a.reshape(1, -1))) / np.outer(s, b - a)).mean(axis=0) for k in y]) comp_dens_conv[:,0] = st.poisson(mu=s.values.reshape(-1, 1) * b[0]).pmf(y).mean(axis=0) pmf['Unimodal'] = comp_dens_conv @ g[0] cdf['Unimodal'] = ashr.cdf_ash(unimodal_res, grid).rx2('y').ravel() npmle_res = scmodes.ebpm.ebpm_npmle(x, s, K=512, max_grid_updates=5, tol=1e-5) g = np.array(npmle_res.rx2('fitted_g')) g = g[:,g[0] > 1e-8] a = np.fmin(g[1], g[2]) b = np.fmax(g[1], g[2]) comp_dens_conv = np.array([((st.gamma(a=k + 1, scale=1 / s.values.reshape(-1, 1)).cdf(b.reshape(1, -1)) - st.gamma(a=k + 1, scale=1 / s.values.reshape(-1, 1)).cdf(a.reshape(1, -1))) / np.outer(s, b - a)).mean(axis=0) for k in y]) pmf['Non-parametric'] = comp_dens_conv @ g[0] cdf['Non-parametric'] = ashr.cdf_ash(npmle_res, grid).rx2('y').ravel()
cm = plt.get_cmap('Dark2') plt.clf() fig, ax = plt.subplots(1, 2) fig.set_size_inches(7.5, 2) ax[0].hist(x, bins=y, color='k', label=None) for c, k in zip(cm.colors, pmf): ax[0].plot(y + .5, x.shape[0] * pmf[k], color=c, lw=1, label=k) ax[0].legend(frameon=False) ax[0].set_title(gene_info.loc['ENSG00000129824', 'name']) ax[0].set_xlabel('Number of molecules') ax[0].set_ylabel('Number of cells') for c, k in zip(cm.colors, cdf): ax[1].plot(grid, cdf[k], color=c, lw=1, label=k) ax[1].set_title(gene_info.loc['ENSG00000129824', 'name']) ax[1].set_xlabel('Latent gene expression') ax[1].set_ylabel('CDF') ax[1].legend(frameon=False) fig.tight_layout()
Plot the NB measurement model results.
fits = dict() for k in control: fits[k] = np.load(f'/project2/mstephens/aksarkar/projects/singlecell-modes/data/ebnbm/nb-heuristic-{k}.npy') labels = ['Chromium (1)', 'Chromium (2)', 'Drop-Seq', 'GemCode', 'InDrops'] keys = ['chromium1', 'chromium2', 'dropseq', 'gemcode', 'indrops'] cm = plt.get_cmap('Dark2') plt.clf() plt.gcf().set_size_inches(5.5, 2.5) plt.xscale('log') grid = np.logspace(-3, -2, 100) for i, (k, l) in enumerate(zip(keys, labels)): temp = fits[k][:,:,-1].sum(axis=1) / np.log(10) diff = temp - temp.max() plt.plot(grid, diff, lw=1, label=l, c=cm(i + 3)) plt.ylim(-10, 0) plt.axhline(y=-1, lw=1, ls=':', c='k') plt.legend(title='Dataset', frameon=False, loc='center left', bbox_to_anchor=(1, .5), ncol=2) plt.xlabel(r'Measurement dispersion') plt.ylabel('Difference log$_{10}$ likelihood\nfrom best') plt.tight_layout()
Plot the model comparison results.
thresh = 10 query1 = compare(llik[llik['dataset'].isin(control)], baseline='gamma', thresh=thresh, agg='mean') keys = ['chromium1', 'chromium2', 'dropseq', 'gemcode', 'indrops'] labels = ['Chromium (1)', 'Chromium (2)', 'Drop-Seq', 'GemCode', 'InDrops'] plt.clf() plt.gcf().set_size_inches(2.5, 3) plt.bar(range(query1.shape[0]), query1.loc[keys, 'point_gamma'], color='k') plt.xticks(range(query1.shape[0]), labels, rotation=90) plt.title('Point-Gamma vs.\nGamma') plt.ylabel(f'Fraction of genes with\nlikelihood ratio > {thresh}') plt.tight_layout()
thresh = 100 keys = ['b_cells', 'cytotoxic_t', 'ipsc', 'cytotoxic_t-b_cells', 'cytotoxic_t-naive_t', 'kidney', 'pbmc_10k_v3', 'brain', 'retina'] labels = ['B cell', 'T cell', 'iPSC', 'T cell/B cell', 'Cyto/naive T', 'Kidney', 'PBMC', 'Brain', 'Retina'] query1 = compare(llik[llik['dataset'].isin(keys)], baseline='gamma', thresh=thresh, agg='mean') query2 = compare(llik[llik['dataset'].isin(keys)], baseline='unimodal', thresh=thresh, agg='mean') plt.clf() fig, ax = plt.subplots(1, 3) fig.set_size_inches(7.5, 3) ax[0].bar(range(query1.shape[0]), query1.loc[keys, 'point_gamma'], color='k') ax[0].set_title('Point-Gamma vs.\nGamma') ax[1].bar(range(query1.shape[0]), query1.loc[keys, 'unimodal'], color='k') ax[1].set_title('Unimodal vs.\nGamma') ax[2].bar(range(query2.shape[0]), query2.loc[keys, 'npmle'], color='k') ax[2].set_title('Non-parametric vs.\nUnimodal') for a in ax: a.set_xticks(range(query2.shape[0])) a.set_xticklabels(labels, rotation=90) a.set_xlabel('Dataset') a.set_ylabel(f'Fraction of genes with\nlikelihood ratio > {thresh}') fig.tight_layout()
thresh = 10 keys = ['b_cells', 'cytotoxic_t', 'ipsc', 'cytotoxic_t-b_cells', 'cytotoxic_t-naive_t', 'kidney', 'pbmc_10k_v3', 'brain', 'retina'] labels = ['B cell', 'T cell', 'iPSC', 'T cell/B cell', 'Cyto/naive T', 'Kidney', 'PBMC', 'Brain', 'Retina'] query1 = compare(llik[llik['dataset'].isin(keys)], baseline='gamma', thresh=thresh, agg='mean') plt.clf() plt.gcf().set_size_inches(2.5, 3) plt.bar(range(query1.shape[0]), query1.loc[keys, 'point_gamma'], color='k') plt.title('Point-Gamma vs.\nGamma') plt.xticks(range(query1.shape[0]), labels, rotation=90) plt.xlabel('Dataset') plt.ylabel(f'Fraction of genes with\nlikelihood ratio > {thresh}') plt.tight_layout()
Plot an example of unimodal over Gamma.
dat = data['pbmc_10k_v3']() pmf = dict() cdf = dict() gene = 'ENSG00000163736' x = dat[:,dat.var['ensg'] == gene].X.A.ravel() s = dat.obs['size'] y = np.arange(x.max() + 1) lam = x / s grid = np.logspace(-11, -3, 1000) print(f'fitting {gene} (Gamma)') temp = scmodes.ebpm.ebpm_gamma(x, s, tol=1e-5, extrapolate=True) cdf['Gamma'] = st.gamma(a=np.exp(temp[1]), scale=np.exp(temp[0] - temp[1])).cdf(grid) pmf['Gamma'] = np.array([scmodes.benchmark.gof._zig_pmf(k, size=s, log_mu=temp[0], log_phi=-temp[1]).mean() for k in y]) print(f'fitting {gene} (Point-Gamma)') temp = scmodes.ebpm.ebpm_point_gamma(x, s, tol=1e-5, extrapolate=True) cdf['Point-Gamma'] = sp.expit(temp[2]) + sp.expit(-temp[2]) * st.gamma(a=np.exp(temp[1]), scale=np.exp(temp[0] - temp[1])).cdf(grid) pmf['Point-Gamma'] = np.array([scmodes.benchmark.gof._zig_pmf(k, size=s, log_mu=temp[0], log_phi=-temp[1], logodds=temp[2]).mean() for k in y]) print(f'fitting {gene} (Unimodal)') unimodal_res = scmodes.ebpm.ebpm_unimodal(x, s, mixcompdist='halfuniform') g = np.array(unimodal_res.rx2('fitted_g')) a = np.fmin(g[1], g[2]) b = np.fmax(g[1], g[2]) comp_dens_conv = np.array([((st.gamma(a=k + 1, scale=1 / s.values.reshape(-1, 1)).cdf(b.reshape(1, -1)) - st.gamma(a=k + 1, scale=1 / s.values.reshape(-1, 1)).cdf(a.reshape(1, -1))) / np.outer(s, b - a)).mean(axis=0) for k in y]) comp_dens_conv[:,0] = st.poisson(mu=s.values.reshape(-1, 1) * b[0]).pmf(y).mean(axis=0) pmf['Unimodal'] = comp_dens_conv @ g[0] cdf['Unimodal'] = ashr.cdf_ash(unimodal_res, grid).rx2('y').ravel()
cm = plt.get_cmap('Dark2') plt.clf() fig, ax = plt.subplots(2, 1) fig.set_size_inches(3.75, 4) ax[0].hist(x, bins=y, color='k') ax[1].hist(x, bins=y, color='0.7') for i, (k, ls) in enumerate(zip(pmf, ['-', (0, (3, 3)), '-'])): ax[1].plot(y + 0.5, x.shape[0] * pmf[k], lw=1, ls=ls, c=cm(i), label=k) ax[0].set_xlabel('Number of molecules') ax[0].set_ylabel('Number of cells') ax[0].set_title(dat.var.loc[dat.var['ensg'] == gene, 'name'][0]) ax[1].legend(frameon=False) ax[1].set_xlabel('Number of molecules') ax[1].set_ylabel('Number of cells') ax[1].set_ylim(0, 10) ax[1].set_yticks([0, 5, 10]) fig.tight_layout()
Figure 2
Plot the Poisson thinning benchmark.
pois_thin_res = (pd.read_csv('/project2/mstephens/aksarkar/projects/singlecell-modes/data/lra-generalization/lra-generalization.txt.gz', sep='\t', header=[0, 1], index_col=[0, 1, 2]) .xs(8, level=1) ['validation'] [['nmf', 'glmpca', 'pvae', 'nbvae']]) del pois_thin_res['nbvae'] pois_thin_res = pois_thin_res.drop('liver-caudate-lobe', level=0) titles = ['T cell', 'B cell', 'iPSC', 'T cell/\nB cell', 'Cytotoxic/\nnaive T', 'Kidney', 'PBMC', 'Brain', 'Retina']
plt.clf() fig, ax = plt.subplots(1, len(titles), sharey=True) fig.set_size_inches(8, 2.5) for a, (k, g), t in zip(ax.ravel(), pois_thin_res.groupby(level=0, sort=False), titles): for x, m in enumerate(g.columns): a.scatter(x + np.random.normal(scale=0.1, size=g.shape[0]), g[m].values, s=4, c='k', zorder=3) a.grid(c='0.8', lw=1, axis='x') a.set_xlim(-0.5, 2.5) a.set_xticks(np.arange(3)) a.set_xticklabels([m.upper() for m in g.columns], rotation=90) a.set_xlabel('Method') a.set_title(t) ax[0].set_ylabel('Log likelihood of\nheld-out molecules') fig.tight_layout()
Plot the distribution of log likelihood differences.
train = np.load('/scratch/midway2/aksarkar/modes/fitted-values/b_cells-train.npy') test = np.load('/scratch/midway2/aksarkar/modes/fitted-values/b_cells-test.npy') lam0 = np.load('/scratch/midway2/aksarkar/modes/fitted-values/b_cells-nmf-lam.npy') lam1 = np.load('/scratch/midway2/aksarkar/modes/fitted-values/b_cells-glmpca-lam.npy') lam2 = np.load('/scratch/midway2/aksarkar/modes/fitted-values/b_cells-pvae-lam.npy')
s = (test.sum(axis=1) / train.sum(axis=1)).reshape(-1, 1) llik0 = np.ma.masked_invalid(st.poisson(s * lam0).logpmf(test)) llik1 = st.poisson(s * lam1).logpmf(test) llik2 = st.poisson(s * lam2).logpmf(test)
%matplotlib agg plt.clf() fig, ax = plt.subplots(1, 2, sharex=True, sharey=True) fig.set_size_inches(8, 2) grid = np.arange(test.max()) for i in grid: ax[0].boxplot((llik1 - llik0)[test == i].ravel(), positions=[i], widths=[0.5], medianprops={'color': 'k'}, flierprops={'marker': '.', 'markersize': 2}) ax[0].axhline(y=0, ls=':', lw=1, c='r') ax[0].set_xlabel('Observation in validation set') ax[0].set_ylabel('Improvement log lik\nover NMF') ax[0].set_title('GLMPCA') for i in grid: ax[1].boxplot((llik2 - llik0)[test == i].ravel(), positions=[i], widths=[0.5], medianprops={'color': 'k'}, flierprops={'marker': '.', 'markersize': 2}) ax[1].axhline(y=0, ls=':', lw=1, c='r') ax[1].set_title('PVAE') ax[1].set_xticks(grid[::5]) ax[1].set_xticklabels(grid[::5].astype(int)) ax[1].set_xlim(-.5, grid.max() - .5) ax[1].set_xlabel('Observation in validation set') fig.tight_layout() plt.savefig('/project2/mstephens/aksarkar/projects/singlecell-modes/org/figure/figures.org/b_cells-llik-diff.png', dpi=300, format='png')
Plot the fitted values against each other.
%matplotlib agg plt.clf() fig, ax = plt.subplots(1, 6, sharey=True) fig.set_size_inches(8, 2) lim = [0, 20] lam0 = np.load('/scratch/midway2/aksarkar/modes/fitted-values/b_cells-nmf-lam.npy') lam1 = np.load('/scratch/midway2/aksarkar/modes/fitted-values/b_cells-glmpca-lam.npy') lam2 = np.load('/scratch/midway2/aksarkar/modes/fitted-values/b_cells-pvae-lam.npy') ax[0].scatter(np.sqrt(lam1.ravel()), np.sqrt(lam0.ravel()), s=1, c='k', alpha=0.1) ax[0].plot(lim, lim, lw=1, ls=':', c='r') ax[0].set_xlabel(r'GLMPCA $\sqrt{\hat\lambda}$') ax[0].set_ylabel(r'NMF $\sqrt{\hat\lambda}$') ax[0].set_title('B cell') ax[1].scatter(np.sqrt(lam2.ravel()), np.sqrt(lam0.ravel()), s=1, c='k', alpha=0.1) ax[1].plot(lim, lim, lw=1, ls=':', c='r') ax[1].set_xlabel(r'PVAE $\sqrt{\hat\lambda}$') ax[1].set_title('B cell') lam0 = np.load('/scratch/midway2/aksarkar/modes/fitted-values/cytotoxic_t-b_cells-nmf-lam.npy') lam1 = np.load('/scratch/midway2/aksarkar/modes/fitted-values/cytotoxic_t-b_cells-glmpca-lam.npy') lam2 = np.load('/scratch/midway2/aksarkar/modes/fitted-values/cytotoxic_t-b_cells-pvae-lam.npy') ax[2].scatter(np.sqrt(lam1.ravel()), np.sqrt(lam0.ravel()), s=1, c='k', alpha=0.1) ax[2].plot(lim, lim, lw=1, ls=':', c='r') ax[2].set_xlabel(r'GLMPCA $\sqrt{\hat\lambda}$') ax[2].set_title('T cell/B cell') ax[3].scatter(np.sqrt(lam2.ravel()), np.sqrt(lam0.ravel()), s=1, c='k', alpha=0.1) ax[3].plot(lim, lim, lw=1, ls=':', c='r') ax[3].set_xlabel(r'PVAE $\sqrt{\hat\lambda}$') ax[3].set_title('T cell/B cell') lam0 = np.load('/scratch/midway2/aksarkar/modes/fitted-values/pbmcs_10k_v3-nmf-lam.npy') lam1 = np.load('/scratch/midway2/aksarkar/modes/fitted-values/pbmcs_10k_v3-glmpca-lam.npy') lam2 = np.load('/scratch/midway2/aksarkar/modes/fitted-values/pbmcs_10k_v3-pvae-lam.npy') ax[4].scatter(np.sqrt(lam1.ravel()), np.sqrt(lam0.ravel()), s=1, c='k', alpha=0.1) ax[4].plot(lim, lim, lw=1, ls=':', c='r') ax[4].set_xlabel(r'GLMPCA $\sqrt{\hat\lambda}$') ax[4].set_title('PBMC') ax[5].scatter(np.sqrt(lam2.ravel()), np.sqrt(lam0.ravel()), s=1, c='k', alpha=0.1) ax[5].plot(lim, lim, lw=1, ls=':', c='r') ax[5].set_xlabel(r'PVAE $\sqrt{\hat\lambda}$') ax[5].set_title('PBMC') fig.tight_layout() plt.savefig('/project2/mstephens/aksarkar/projects/singlecell-modes/org/figure/figures.org/fitted-values.png', dpi=300, format='png')