Hierarchical PMF
Introduction
Consider the model \( \DeclareMathOperator\E{E} \DeclareMathOperator\Gam{Gamma} \DeclareMathOperator\KL{\mathcal{KL}} \DeclareMathOperator\Mult{Multinomial} \DeclareMathOperator\Pois{Poisson} \newcommand\mf{\mathbf{F}} \newcommand\ml{\mathbf{L}} \newcommand\mx{\mathbf{X}} \newcommand\va{\mathbf{a}} \newcommand\vb{\mathbf{b}} \newcommand\vs{\mathbf{s}} \newcommand\vl{\mathbf{l}} \newcommand\vf{\mathbf{f}} \)
\begin{align} x_{ij} \mid \lambda_{ij} &\sim \Pois(\lambda_{ij})\\ \lambda_{ij} &= (\ml\mf')_{ij}\\ l_{ik} &\sim \Gam(a_{lk}, b_{lk})\\ f_{jk} &\sim \Gam(a_{fk}, b_{fk}), \end{align}where the Gamma distributions are parameterized by shape and rate. Levitin et al. 2019 considered a variant of this model for scRNA-seq data. One can use VBEM to estimate the posterior \(p(\ml, \mf \mid \mx, \va_l, \vb_l, \va_f, \vb_f)\) (Cemgil 2009). Here, we investigate alternative methods that do not rely on EM.
Setup
import numpy as np
import pandas as pd
import scipy.stats as st
import torch
import rpy2.robjects.packages
import rpy2.robjects.pandas2ri

rpy2.robjects.pandas2ri.activate()
%matplotlib inline
%config InlineBackend.figure_formats = set(['retina'])
import matplotlib.pyplot as plt
plt.rcParams['figure.facecolor'] = 'w'
plt.rcParams['font.family'] = 'Nimbus Sans'
Methods
VBEM
Consider the augmented model (Cemgil 2009)
\begin{align} x_{ij} &= \textstyle\sum_k z_{ijk}\\ z_{ijk} \mid l_{ik}, f_{jk} &\sim \Pois(l_{ik} f_{jk})\\ l_{ik} &\sim \Gam(a_{lk}, b_{lk})\\ f_{jk} &\sim \Gam(a_{fk}, b_{fk}). \end{align}VBEM updates for the variational parameters are analytic
\begin{align} q(z_{ij1}, \ldots, z_{ijK}) &= \Mult(x_{ij}, \pi_{ij1}, \ldots, \pi_{ijK})\\ \pi_{ijk} &\propto \exp(\E[\ln l_{ik}] + \E[\ln f_{jk}])\\ q(l_{ik}) &= \Gam(\textstyle \sum_j \E[z_{ijk}] + a_{lk}, \sum_j \E[f_{jk}] + b_{lk})\\ &\triangleq \Gam(\alpha_{lik}, \beta_{lk})\\ q(f_{jk}) &= \Gam(\textstyle \sum_i \E[z_{ijk}] + a_{fk}, \sum_i \E[l_{ik}] + b_{fk})\\ &\triangleq \Gam(\alpha_{fjk}, \beta_{fk}), \end{align}where the expectations
\begin{align} \E[z_{ijk}] &= x_{ij} \pi_{ijk}\\ \E[l_{ik}] &= \alpha_{lik} / \beta_{lk}\\ \E[\ln l_{ik}] &= \psi(\alpha_{lik}) - \ln \beta_{lk}\\ \E[f_{jk}] &= \alpha_{fjk} / \beta_{fk}\\ \E[\ln f_{jk}] &= \psi(\alpha_{fjk}) - \ln \beta_{fk}, \end{align}and \(\psi\) denotes the digamma function. The ELBO
\begin{multline} \ell = \sum_{i, j} \left[ \sum_k \Big[ \E[z_{ijk}] (\E[\ln l_{ik}] + \E[\ln f_{jk}] - \ln \pi_{ijk}) - \E[l_{ik}]\E[f_{jk}] \Big] - \ln\Gamma(x_{ij} + 1)\right]\\ + \sum_{i, k} \left[ (a_{lk} - \alpha_{lik}) \E[\ln l_{ik}] - (b_{lk} - \beta_{lk}) \E[l_{ik}] + a_{lk} \ln b_{lk} - \alpha_{lik} \ln \beta_{lk} - \ln\Gamma(a_{lk}) + \ln\Gamma(\alpha_{lik})\right]\\ + \sum_{j, k} \left[ (a_{fk} - \alpha_{fjk}) \E[\ln f_{jk}] - (b_{fk} - \beta_{fk}) \E[f_{jk}] + a_{fk} \ln b_{fk} - \alpha_{fjk} \ln \beta_{fk} - \ln\Gamma(a_{fk}) + \ln\Gamma(\alpha_{fjk})\right]. \end{multline}Letting \(t_{ij} \triangleq \sum_k \exp(\E[\ln l_{ik}] + \E[\ln f_{jk}])\), and plugging in \(\E[z_{ijk}], \pi_{ijk}\),
\begin{gather} \sum_{i, j, k} \E[z_{ijk}] (\E[\ln l_{ik}] + \E[\ln f_{jk}] - \ln \pi_{ijk}) = \sum_{i, j} x_{ij} \ln t_{ij}\\ \sum_i \E[z_{ijk}] = \exp(\E[\ln f_{jk}]) \sum_i \frac{x_{ij}}{t_{ij}} \exp(\E[\ln l_{ik}])\\ \sum_j \E[z_{ijk}] = \exp(\E[\ln l_{ik}]) \sum_j \frac{x_{ij}}{t_{ij}} \exp(\E[\ln f_{jk}]). \end{gather}Updates for \(b_{lk}, b_{fk}\) are analytic:
\begin{align} \frac{\partial \ell}{\partial b_{lk}} &= \sum_{i} \left[\frac{a_{lk}}{b_{lk}} - \E[l_{ik}]\right] = 0\\ b_{lk} &:= \frac{n a_{lk}}{\sum_{i} \E[l_{ik}]}\\ \frac{\partial \ell}{\partial b_{fk}} &= \sum_{j} \left[\frac{a_{fk}}{b_{fk}} - \E[f_{jk}]\right] = 0\\ b_{fk} &:= \frac{p a_{fk}}{\sum_{j} \E[f_{jk}]}, \end{align}as are gradients for \(a_{lk}, a_{fk}\):
\begin{align} \frac{\partial \ell}{\partial a_{lk}} &= \sum_{i} \Big[\E[\ln l_{ik}] + \ln b_{lk} - \psi(a_{lk})\Big]\\ \frac{\partial^2 \ell}{\partial a_{lk}^2} &= -n \psi^{(1)}(a_{lk})\\ \frac{\partial \ell}{\partial a_{fk}} &= \sum_{j} \Big[\E[\ln f_{jk}] + \ln b_{fk} - \psi(a_{fk})\Big]\\ \frac{\partial^2 \ell}{\partial a_{fk}^2} &= -p \psi^{(1)}(a_{fk}), \end{align}where \(\psi^{(1)}\) denotes the trigamma function.
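To make the updates concrete, here is a minimal numpy sketch of a single VBEM sweep. This is a sketch only: the array shapes, the helper moments, and the damping factor step are assumptions that mirror, but are not part of, the PyTorch implementation below.

import numpy as np
from scipy.special import digamma, polygamma

def moments(alpha, beta):
    """Posterior mean and posterior mean log of Gamma(alpha, beta) (shape/rate)"""
    return alpha / beta, digamma(alpha) - np.log(beta)

def vbem_sweep(x, alpha_l, beta_l, alpha_f, beta_f, a_l, b_l, a_f, b_f, step=1e-3):
    """One sweep of coordinate updates. x is (n, p); alpha_l is (n, k); alpha_f is (p, k);
    beta_l, beta_f, a_l, b_l, a_f, b_f are (1, k)."""
    n, p = x.shape
    # Update q(l_ik)
    pm_l, ln_l = moments(alpha_l, beta_l)
    pm_f, ln_f = moments(alpha_f, beta_f)
    t = x / (np.exp(ln_l) @ np.exp(ln_f).T)
    alpha_l = np.exp(ln_l) * (t @ np.exp(ln_f)) + a_l
    beta_l = pm_f.sum(axis=0, keepdims=True) + b_l
    # Update q(f_jk)
    pm_l, ln_l = moments(alpha_l, beta_l)
    t = x / (np.exp(ln_l) @ np.exp(ln_f).T)
    alpha_f = np.exp(ln_f) * (t.T @ np.exp(ln_l)) + a_f
    beta_f = pm_l.sum(axis=0, keepdims=True) + b_f
    # Empirical Bayes updates: damped Newton step for the shapes, analytic update for the rates
    pm_l, ln_l = moments(alpha_l, beta_l)
    pm_f, ln_f = moments(alpha_f, beta_f)
    a_l = a_l + step * (ln_l + np.log(b_l) - digamma(a_l)).sum(axis=0, keepdims=True) / (n * polygamma(1, a_l))
    b_l = n * a_l / pm_l.sum(axis=0, keepdims=True)
    a_f = a_f + step * (ln_f + np.log(b_f) - digamma(a_f)).sum(axis=0, keepdims=True) / (p * polygamma(1, a_f))
    b_f = p * a_f / pm_f.sum(axis=0, keepdims=True)
    return alpha_l, beta_l, alpha_f, beta_f, a_l, b_l, a_f, b_f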
class PMFVBEM(torch.nn.Module):
    def __init__(self, n, p, k):
        super().__init__()
        self.alpha_l = torch.nn.Parameter(torch.exp(torch.randn([n, k])))
        self.beta_l = torch.nn.Parameter(torch.exp(torch.randn([1, k])))
        self.alpha_f = torch.nn.Parameter(torch.exp(torch.randn([p, k])))
        self.beta_f = torch.nn.Parameter(torch.exp(torch.randn([1, k])))
        self.a_l = torch.nn.Parameter(torch.ones([1, k]))
        self.b_l = torch.nn.Parameter(torch.ones([1, k]))
        self.a_f = torch.nn.Parameter(torch.ones([1, k]))
        self.b_f = torch.nn.Parameter(torch.ones([1, k]))

    @torch.no_grad()
    def elbo(self, x):
        """ELBO wrt q(l_{ik}) q(f_{jk}), i.e. integrating out z_{ijk}"""
        q_l = torch.distributions.Gamma(self.alpha_l, self.beta_l)
        q_f = torch.distributions.Gamma(self.alpha_f, self.beta_f)
        kl_l = torch.distributions.kl.kl_divergence(q_l, torch.distributions.Gamma(self.a_l, self.b_l)).sum()
        kl_f = torch.distributions.kl.kl_divergence(q_f, torch.distributions.Gamma(self.a_f, self.b_f)).sum()
        # TODO: multiple samples?
        l = q_l.rsample()
        f = q_f.rsample()
        err = torch.distributions.Poisson(l @ f.T).log_prob(x).sum()
        elbo = err - kl_l - kl_f
        return elbo

    def _pm(self, x):
        pm_l = self.alpha_l / self.beta_l
        pm_ln_l = torch.digamma(self.alpha_l) - torch.log(self.beta_l)
        pm_f = self.alpha_f / self.beta_f
        pm_ln_f = torch.digamma(self.alpha_f) - torch.log(self.beta_f)
        t = x / (torch.exp(pm_ln_l) @ torch.exp(pm_ln_f).T)
        return pm_l, pm_ln_l, pm_f, pm_ln_f, t

    def elbo_z(self, x):
        """ELBO wrt q(l_{ik}) q(f_{jk}) q(z_{ijk})"""
        pm_l, pm_ln_l, pm_f, pm_ln_f, _ = self._pm(x)
        ret = (x * torch.log(torch.exp(pm_ln_l) @ torch.exp(pm_ln_f).T) - pm_l @ pm_f.T - torch.lgamma(x + 1)).sum()
        ret += ((self.a_l - self.alpha_l) * pm_ln_l - (self.b_l - self.beta_l) * pm_l + self.a_l * torch.log(self.b_l) - self.alpha_l * torch.log(self.beta_l) - torch.lgamma(self.a_l) + torch.lgamma(self.alpha_l)).sum()
        ret += ((self.a_f - self.alpha_f) * pm_ln_f - (self.b_f - self.beta_f) * pm_f + self.a_f * torch.log(self.b_f) - self.alpha_f * torch.log(self.beta_f) - torch.lgamma(self.a_f) + torch.lgamma(self.alpha_f)).sum()
        assert torch.isfinite(ret)
        assert ret <= 0
        return ret

    @torch.no_grad()
    def fit(self, x, num_epochs, eps=1e-15, step=1):
        self.trace = []
        self.trace2 = []
        for i in range(num_epochs):
            self.trace.append(-self.elbo(x).cpu().numpy())
            self.trace2.append(-self.elbo_z(x).cpu().numpy())
            obj = self.elbo_z(x).cpu().numpy()
            # Cemgil 2009, Alg. 1
            # Important: this optimizes elbo_z, not elbo
            pm_l, pm_ln_l, pm_f, pm_ln_f, t = self._pm(x)
            self.alpha_l.data = torch.exp(pm_ln_l) * (t @ torch.exp(pm_ln_f)) + self.a_l
            self.beta_l.data = pm_f.sum(dim=0) + self.b_l
            pm_l, pm_ln_l, pm_f, pm_ln_f, t = self._pm(x)
            self.alpha_f.data = torch.exp(pm_ln_f) * (torch.exp(pm_ln_l).T @ t).T + self.a_f
            self.beta_f.data = pm_l.sum(dim=0) + self.b_f
            pm_l, pm_ln_l, pm_f, pm_ln_f, _ = self._pm(x)
            # Damped Newton-like updates for the shapes; step absorbs the 1/n (1/p) factor
            # in the exact Newton step
            self.a_l.data += step * (pm_ln_l + torch.log(self.b_l) - torch.digamma(self.a_l)).sum(dim=0) / torch.polygamma(1, self.a_l)
            self.a_l.data.clamp_(min=eps)
            self.b_l.data = pm_l.shape[0] * self.a_l / pm_l.sum(dim=0)
            self.a_f.data += step * (pm_ln_f + torch.log(self.b_f) - torch.digamma(self.a_f)).sum(dim=0) / torch.polygamma(1, self.a_f)
            self.a_f.data.clamp_(min=eps)
            self.b_f.data = pm_f.shape[0] * self.a_f / pm_f.sum(dim=0)
        return self
Newton-Raphson
Alternatively, one could plug in \(\E[z_{ijk}], \pi_{ijk}\) and then optimize the ELBO with respect to \(\alpha_{lik}, \beta_{lk}, \alpha_{fjk}, \beta_{fk}, a_{lk}, b_{lk}, a_{fk}, b_{fk}\) using one-dimensional Newton-Raphson updates. To prototype this approach, we use automatic differentiation to get the first and second derivatives.
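To illustrate the mechanics on a toy objective (not part of the model; the function and values here are only for illustration), torch.autograd.grad with create_graph=True returns a first derivative that can itself be differentiated, which is all the Newton-Raphson update needs.

import torch

theta = torch.tensor(2., requires_grad=True)
obj = theta ** 3
# First derivative, kept in the graph so it can be differentiated again
d, = torch.autograd.grad(obj, theta, create_graph=True)
# Second derivative
d2, = torch.autograd.grad(d, theta)
assert torch.isclose(d, torch.tensor(12.))   # 3 * theta ** 2
assert torch.isclose(d2, torch.tensor(12.))  # 6 * theta
# A one-dimensional Newton-Raphson update is then theta := theta - d / d2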
class PMFN(torch.nn.Module):
    def __init__(self, n, p, k):
        super().__init__()
        self.alpha_l = torch.nn.Parameter(torch.exp(torch.randn([n, k])))
        self.beta_l = torch.nn.Parameter(torch.exp(torch.randn([1, k])))
        self.alpha_f = torch.nn.Parameter(torch.exp(torch.randn([p, k])))
        self.beta_f = torch.nn.Parameter(torch.exp(torch.randn([1, k])))
        self.a_l = torch.nn.Parameter(torch.ones([1, k]))
        self.b_l = torch.nn.Parameter(torch.ones([1, k]))
        self.a_f = torch.nn.Parameter(torch.ones([1, k]))
        self.b_f = torch.nn.Parameter(torch.ones([1, k]))

    @torch.no_grad()
    def elbo(self, x):
        q_l = torch.distributions.Gamma(self.alpha_l, self.beta_l)
        q_f = torch.distributions.Gamma(self.alpha_f, self.beta_f)
        kl_l = torch.distributions.kl.kl_divergence(q_l, torch.distributions.Gamma(self.a_l, self.b_l)).sum()
        kl_f = torch.distributions.kl.kl_divergence(q_f, torch.distributions.Gamma(self.a_f, self.b_f)).sum()
        # TODO: multiple samples?
        l = q_l.rsample()
        f = q_f.rsample()
        err = torch.distributions.Poisson(l @ f.T).log_prob(x).sum()
        elbo = err - kl_l - kl_f
        return elbo

    def elbo_z(self, x):
        pm_l = self.alpha_l / self.beta_l
        exp_pm_ln_l = torch.exp(torch.digamma(self.alpha_l) - torch.log(self.beta_l))
        pm_f = self.alpha_f / self.beta_f
        exp_pm_ln_f = torch.exp(torch.digamma(self.alpha_f) - torch.log(self.beta_f))
        temp = exp_pm_ln_l @ exp_pm_ln_f.T
        ret = (x * torch.log(temp) - pm_l @ pm_f.T - torch.lgamma(x + 1)).sum()
        ret += ((self.a_l - self.alpha_l) * torch.log(exp_pm_ln_l) - (self.b_l - self.beta_l) * pm_l + self.a_l * torch.log(self.b_l) - self.alpha_l * torch.log(self.beta_l) - torch.lgamma(self.a_l) + torch.lgamma(self.alpha_l)).sum()
        ret += ((self.a_f - self.alpha_f) * torch.log(exp_pm_ln_f) - (self.b_f - self.beta_f) * pm_f + self.a_f * torch.log(self.b_f) - self.alpha_f * torch.log(self.beta_f) - torch.lgamma(self.a_f) + torch.lgamma(self.alpha_f)).sum()
        assert not torch.isnan(ret)
        assert torch.isfinite(ret)
        assert ret <= 0
        return ret

    def _update(self, x, par, step=1, eps=1e-15):
        """Newton-Raphson update"""
        for i in range(par.shape[0]):
            for k in range(par.shape[1]):
                # Important: this optimizes elbo_z, not elbo
                d = torch.autograd.grad(self.elbo_z(x), par, create_graph=True)[0]
                d2 = torch.autograd.grad(d[i, k], par)[0]
                with torch.no_grad():
                    par.data[i, k] -= step * d[i, k] / (d2[i, k] + eps)
                    par.data[i, k] = torch.clamp(par.data[i, k], min=eps)

    def fit(self, x, num_epochs, step=1):
        self.trace = []
        self.trace2 = []
        for t in range(num_epochs):
            # TODO: update order probably matters
            for par in self.parameters():
                if par.requires_grad:
                    self._update(x, par, step=step)
            self.trace.append(-self.elbo(x).cpu().numpy())
            with torch.no_grad():
                self.trace2.append(-self.elbo_z(x).cpu().numpy())
        return self
Pathwise gradient
In the original model, the ELBO
\begin{equation} \ell = \sum_{i,j} \E[\ln p(x_{ij} \mid \ml, \mf)] - \KL(q(\ml) \Vert p(\ml)) - \KL(q(\mf) \Vert p(\mf)). \end{equation}The KL terms are analytic; however, the first expectation is not (unlike for the approach described above, which made a variational approximation on \(z\)). In this case, one can still optimize the ELBO using the pathwise gradient (reviewed by Mohamed et al. 2020) and gradient descent. Briefly,
\begin{align} \nabla \textstyle \sum_{i,j} \E[\ln p(x_{ij} \mid \ml, \mf)] &\approx \frac{1}{S} \sum_{s=1}^S \sum_{i,j} \nabla \ln p(x_{ij} \mid \ml^{(s)}, \mf^{(s)}), \quad \ml^{(s)} \sim q(\ml), \mf^{(s)} \sim q(\mf)\\ &= \frac{1}{S} \sum_{s=1}^S \sum_{i,j} \nabla \ln p(x_{ij} \mid h_{\ml}(\epsilon_{\ml}^{(s)}), h_{\mf}(\epsilon_{\mf}^{(s)})), \quad \epsilon_{\ml}^{(s)} \sim \Gam(1, 1), \epsilon_{\mf}^{(s)} \sim \Gam(1, 1), \end{align}where \(\epsilon_{\ml}, \epsilon_{\mf}\) are matrices with elements following a standard Gamma distribution and \(h_{\ml}, h_{\mf}\) are differentiable, element-wise functions of the sampled elements and the variational parameters. The difference between the two lower bounds is
\begin{equation} \sum_{i, j} x_{ij} \Big[ \E[\ln(\textstyle \sum_k l_{ik} f_{jk})] - \ln (\sum_k \exp(\E[\ln l_{ik}] + \E[\ln f_{jk}])) \Big], \end{equation}which is non-negative, since \(x_{ij} \geq 0\) and log-sum-exp is convex (Jensen's inequality). It is not clear that the local optima of the two lower bounds coincide.
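As a quick numerical sanity check (a sketch; the dimensions, seed, and number of Monte Carlo samples are arbitrary choices), draw random variational parameters, estimate \(\E[\ln (\ml\mf')_{ij}]\) by Monte Carlo, and compare against the plug-in term.

import numpy as np
from scipy.special import digamma

rng = np.random.default_rng(0)
n, p, k, n_samples = 5, 7, 3, 10000
alpha_l, beta_l = rng.gamma(2, size=(n, k)), rng.gamma(2, size=(n, k))
alpha_f, beta_f = rng.gamma(2, size=(p, k)), rng.gamma(2, size=(p, k))
# Monte Carlo estimate of E[ln (L F')_{ij}] under q
l = rng.gamma(alpha_l, 1 / beta_l, size=(n_samples, n, k))
f = rng.gamma(alpha_f, 1 / beta_f, size=(n_samples, p, k))
mc = np.log(np.einsum('sik,sjk->sij', l, f)).mean(axis=0)
# Plug-in term ln(sum_k exp(E[ln l_ik] + E[ln f_jk]))
plugin = np.log(np.exp(digamma(alpha_l) - np.log(beta_l)) @ np.exp(digamma(alpha_f) - np.log(beta_f)).T)
# Up to Monte Carlo error, every entry should be non-negative
print((mc - plugin).min())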
class PMFGD(torch.nn.Module):
    def __init__(self, n, p, k):
        super().__init__()
        self.alpha_l = torch.nn.Parameter(torch.exp(torch.randn([n, k])))
        self.beta_l = torch.nn.Parameter(torch.exp(torch.randn([1, k])))
        self.alpha_f = torch.nn.Parameter(torch.exp(torch.randn([p, k])))
        self.beta_f = torch.nn.Parameter(torch.exp(torch.randn([1, k])))
        self.a_l = torch.nn.Parameter(torch.ones([1, k]))
        self.b_l = torch.nn.Parameter(torch.ones([1, k]))
        self.a_f = torch.nn.Parameter(torch.ones([1, k]))
        self.b_f = torch.nn.Parameter(torch.ones([1, k]))

    def forward(self, x, n_samples):
        q_l = torch.distributions.Gamma(self.alpha_l, self.beta_l)
        q_f = torch.distributions.Gamma(self.alpha_f, self.beta_f)
        kl_l = torch.distributions.kl.kl_divergence(q_l, torch.distributions.Gamma(self.a_l, self.b_l)).sum()
        kl_f = torch.distributions.kl.kl_divergence(q_f, torch.distributions.Gamma(self.a_f, self.b_f)).sum()
        l = q_l.rsample(n_samples)
        f = q_f.rsample(n_samples)
        lam = torch.einsum('lik,ljk->lij', l, f)
        err = torch.distributions.Poisson(lam).log_prob(x.unsqueeze(0)).mean(dim=0).sum()
        elbo = err - kl_l - kl_f
        return -elbo

    def fit(self, x, num_epochs, n_samples=1, **kwargs):
        self.trace = []
        n_samples = torch.Size([n_samples])
        opt = torch.optim.Adam(self.parameters(), **kwargs)
        for i in range(num_epochs):
            opt.zero_grad()
            loss = self.forward(x, n_samples)
            self.trace.append(loss.detach().cpu().numpy())
            loss.backward()
            opt.step()
        return self
Alternating Poisson Regression
Carbonetto et al. 2021 propose Alternating Poisson Regression as a general algorithmic framework for fitting PMF. The essence of the approach is that the objective function (for MLE) can be written
\begin{align} \ell &= \sum_i \ell_i\\ \ell_i &= \sum_j \left[ x_{ij} \ln (\vl_i' \vf_j) - \vl_i' \vf_j - \ln\Gamma(x_{ij} + 1) \right], \end{align}which corresponds to estimating the \(\vl_i\) by fitting \(n\) Poisson regression subproblems, each of the form (fixing \(i\))
\begin{align} x_j \sim \Pois(\vl' \vf_j) \end{align}by maximum likelihood. By symmetry, one can also estimate \(\vf_j\) by fitting \(p\) regression subproblems. Here, we show that HPMF can be similarly decomposed into problems of the form
\begin{align} x_{j} &\sim \Pois(\vl' \vf_j)\\ l_{k} &\sim \Gam(a_k, b_k). \end{align}Fixing \(i\), consider the augmented model
\begin{align} x_{j} &= \textstyle\sum_k z_{jk}\\ z_{jk} &\sim \Pois(l_{k} f_{jk})\\ l_{k} &\sim \Gam(a_k, b_k). \end{align}The log joint probability
\begin{equation} \ln p = \sum_{j, k} \left[z_{jk} \ln (l_k f_{jk}) - l_k f_{jk}\right] + \sum_k \left[a_k \ln b_k + (a_k - 1) \ln l_k - l_k b_k - \ln\Gamma(a_k)\right] + \mathrm{const}, \end{equation}yielding coordinate updates
\begin{align} q(z_{j1}, \ldots, z_{jK}) &= \Mult(x_j; \pi_{j1}, \ldots, \pi_{jK})\\ \pi_{jk} &\propto \exp\left(\ln f_{jk} + \E[\ln l_k]\right)\\ q(l_k) &= \textstyle\Gam(\underbrace{\sum_j \E[z_{jk}] + a_k}_{\triangleq \alpha_k}, \underbrace{\sum_j f_{jk} + b_k}_{\triangleq \beta_k}). \end{align}The ELBO
\begin{multline} \ell_i = \sum_{j} \left[ \sum_k \Big[ \E[z_{jk}] (\ln f_{jk} + \E[\ln l_k] - \ln \pi_{jk}) - \E[l_k] f_{jk} \Big] - \ln\Gamma(x_j + 1)\right]\\ + \sum_k \left[(a_k - \alpha_k) \E[\ln l_k] - (b_k - \beta_k) \E[l_k] + a_k \ln b_k - \alpha_k \ln \beta_k - \ln\Gamma(a_k) + \ln\Gamma(\alpha_k)\right], \end{multline}which collects exactly the terms of the HPMF ELBO above that involve \(i\). Note that one cannot update \(a_k, b_k\) within this subproblem, since they are shared across subproblems.
Using the same argument as above for the full HPMF model, one can plug in \(\E[z_{jk}]\) and \(\pi_{jk}\), yielding
\begin{gather} \sum_{j, k} \E[z_{jk}] (\ln f_{jk} + \E[\ln l_k] - \ln \pi_{jk}) = \sum_j x_j \ln t_j\\ \sum_j \E[z_{jk}] = \exp(\E[\ln l_k]) \sum_j \frac{x_j f_{jk}}{t_j}, \end{gather}where \(t_j \triangleq \sum_k \exp(\ln f_{jk} + \E[\ln l_k])\).
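To make the subproblem concrete, here is a minimal numpy sketch of coordinate ascent for a single row (fixing \(i\)). The initialization and number of iterations are arbitrary choices, and \(\mf\) and the prior are taken as fixed inputs.

import numpy as np
from scipy.special import digamma

def fit_row(x, f, a, b, num_iters=100):
    """Coordinate ascent for q(l_k) = Gamma(alpha_k, beta_k), given a row of counts x (p,),
    fixed factors f (p, k), and prior shape/rate a, b (k,)."""
    alpha = np.ones(f.shape[1])
    # beta does not depend on q(z), so it only needs to be computed once
    beta = f.sum(axis=0) + b
    for _ in range(num_iters):
        exp_ln_l = np.exp(digamma(alpha) - np.log(beta))
        t = f @ exp_ln_l                         # t_j = sum_k f_jk exp(E[ln l_k])
        alpha = exp_ln_l * (f.T @ (x / t)) + a   # sum_j E[z_jk] + a_k
    return alpha, beta

A full cycle of the alternating scheme would apply this update to every row of \(\ml\) and, by symmetry, to the analogous column subproblems for \(\mf\), with \(a_k, b_k\) updated afterwards across subproblems.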
Results
Simulated example
Simulate from the model.
rng = np.random.default_rng(1)
n = 200
p = 300
K = 3
a_l = 1
b_l = 1
a_f = 1
b_f = 1
l = rng.gamma(a_l, b_l, size=(n, K))
f = rng.gamma(a_f, b_f, size=(p, K))
x = rng.poisson(l @ f.T)
xt = torch.tensor(x, dtype=torch.float, device='cuda')
To initialize all methods, run 50 epochs of VBEM. Compare VBEM and Newton-Raphson (which optimize the ELBO of the augmented model, approximating \(q(z_{ijk})\)) against gradient descent (which optimizes the ELBO of the original model, integrating out \(z_{ijk}\)). For each method, trace the Monte Carlo estimate of the ELBO of the original model.
# Important: VBEM is randomly initialized
print('fitting initial model by VBEM')
torch.manual_seed(1)
m0 = PMFVBEM(n, p, K).cuda().fit(xt, num_epochs=50, step=1e-3)
models = dict()
# Continue VBEM updates
print('fitting by VBEM')
models['VBEM'] = PMFVBEM(n, p, K).cuda()
models['VBEM'].load_state_dict(m0.state_dict())
models['VBEM'].fit(xt, num_epochs=200, step=1e-3)
# Important: GD is stochastic
print('fitting by GD (1)')
torch.manual_seed(2)
models['GD (1)'] = PMFGD(n, p, K).cuda()
models['GD (1)'].load_state_dict(m0.state_dict())
models['GD (1)'].fit(xt, num_epochs=60000, lr=5e-2, eps=1e-2)
print('fitting by GD (10)')
torch.manual_seed(2)
models['GD (10)'] = PMFGD(n, p, K).cuda()
models['GD (10)'].load_state_dict(m0.state_dict())
models['GD (10)'].fit(xt, num_epochs=60000, n_samples=10, lr=5e-2, eps=1e-2)
print('fitting by Newton-Raphson')
models['Newton'] = PMFN(n, p, K).cuda()
models['Newton'].load_state_dict(m0.state_dict())
models['Newton'].fit(xt, num_epochs=100, step=0.5)
PMFN()
Write out the fitted models.
torch.save({k: models[k].state_dict() for k in models}, '/scratch/midway2/aksarkar/singlecell/hpmf-sim-ex.pt')
Look at the ELBO achieved by the algorithms.
cm = plt.get_cmap('Dark2')
plt.clf()
fig, ax = plt.subplots(1, 3, sharey=True)
fig.set_size_inches(7.5, 2.5)
for i, name in enumerate(models):
    temp = getattr(models[name], 'trace2', None)
    if temp:
        ax[0].plot(temp, c=cm(i), lw=1, marker=None, label=name)
ax[0].set_title('Approximating $q(z_{ijk})$')
for i, name in enumerate(models):
    mu = pd.Series(models[name].trace).ewm(alpha=0.1).mean()
    sd = pd.Series(models[name].trace).ewm(alpha=0.1).std()
    for a in ax[1:]:
        a.plot(mu, c=cm(i), lw=1, marker=None, label=name)
        a.fill_between(np.arange(len(mu)), mu - sd, mu + sd, color=cm(i), alpha=0.1)
        a.set_title('Integrating out $z_{ijk}$')
ax[-1].set_xlim(0, 100)
ax[-1].legend(frameon=False, loc='center left', bbox_to_anchor=(1, .5))
for a in ax:
    a.set_xlabel('Epoch (after initialization)')
ax[0].set_ylabel('Negative ELBO')
fig.tight_layout()
Report the final (approximate) ELBO.
{name: np.array(models[name].trace[-10:]).mean() for name in models}
{'VBEM': 104990.25, 'GD (1)': 104915.23, 'GD (10)': 104877.33, 'Newton': 104974.35}
Compare the fitted values of each approach to the observed values.
plt.clf()
fig, ax = plt.subplots(1, 4, sharey=True)
fig.set_size_inches(7, 3)
for a, name in zip(ax, models):
    with torch.no_grad():
        pm = ((models[name].alpha_l / models[name].beta_l) @ (models[name].alpha_f / models[name].beta_f).T).cpu()
    a.scatter(pm.ravel(), x.ravel(), s=1, alpha=0.1, c='k')
lim = [0, 60]
for a, name in zip(ax, models):
    a.plot(lim, lim, c='r', lw=1, ls=':')
    a.set_xlabel('Fitted value')
    a.set_title(name)
ax[0].set_ylabel('True value')
fig.tight_layout()
Compare the estimated factors to the ground truth.
plt.clf()
fig, ax = plt.subplots(4, 3, sharex=True, sharey=True)
fig.set_size_inches(6, 8)
for i, (row, name) in enumerate(zip(ax, models)):
    with torch.no_grad():
        pm = (models[name].alpha_f / models[name].beta_f).cpu().numpy()
    order = np.argmax(np.corrcoef(f, pm, rowvar=False)[K:,:K], axis=0)
    for k, a in enumerate(row):
        a.scatter(pm[:,order[k]], f[:,k], s=1, c='k', alpha=0.1)
        a.set_title(name)
    row[0].set_ylabel('True factor')
for k, a in enumerate(ax.T):
    a[-1].set_xlabel(f'Estimated factor {k + 1}')
fig.tight_layout()
Look at the estimated priors.
cm = plt.get_cmap('Dark2')
plt.clf()
fig, ax = plt.subplots(1, 2)
fig.set_size_inches(5, 2.5)
grid = np.linspace(0, 20, 1000)
for a, k in zip(ax, ('l', 'f')):
    with torch.no_grad():
        for j in range(K):
            g = st.gamma(a=getattr(models['VBEM'], f'a_{k}').cpu()[0,j], scale=1 / getattr(models['VBEM'], f'b_{k}').cpu()[0,j])
            a.plot(grid, g.pdf(grid), lw=1, c=cm(j), label=f'k={j}')
    a.legend(frameon=False)
ax[0].set_ylabel('Density')
ax[0].set_xlabel('Loading')
ax[1].set_xlabel('Factor')
fig.tight_layout()
Compare the estimated loadings to the true loadings.
plt.clf()
fig, ax = plt.subplots(4, 3, sharey=True, sharex=False)
fig.set_size_inches(6, 8)
for i, (row, name) in enumerate(zip(ax, models)):
    with torch.no_grad():
        pm = (models[name].alpha_l / models[name].beta_l).cpu().numpy()
    order = np.argmax(np.corrcoef(l, pm, rowvar=False)[K:,:K], axis=0)
    for k, a in enumerate(row):
        a.scatter(pm[:,order[k]], l[:,k], s=1, c='k', alpha=0.1)
        a.set_title(name)
    row[0].set_ylabel('True loadings')
for a in ax.T:
    a[-1].set_xlabel('Estimated loadings')
fig.tight_layout()
Correlated factors example
Simulate correlated factors by exponentiating draws from a multivariate Gaussian.
rng = np.random.default_rng(1)
n = 200
p = 300
K = 3
a_l = 1
b_l = 1
l = rng.gamma(a_l, b_l, size=(n, K))
cov = np.eye(K)
cov[1,2] = cov[2,1] = 0.6
f = np.exp(rng.multivariate_normal(mean=np.zeros(K), cov=cov, size=(p,)))
x = rng.poisson(l @ f.T)
xt = torch.tensor(x, dtype=torch.float, device='cuda')
To initialize all methods, run 50 epochs of VBEM. Compare VBEM, Newton-Raphson, and gradient descent, tracing the Monte Carlo estimate of the ELBO of the original model (integrating out \(z_{ijk}\)).
# Important: VBEM is randomly initialized
print('fitting initial model by VBEM')
torch.manual_seed(1)
m0 = PMFVBEM(n, p, K).cuda().fit(xt, num_epochs=50, step=1e-3)
models = dict()
# Continue VBEM updates
print('fitting by VBEM')
models['VBEM'] = PMFVBEM(n, p, K).cuda()
models['VBEM'].load_state_dict(m0.state_dict())
models['VBEM'].fit(xt, num_epochs=1000, step=1e-3)
# Important: GD is stochastic
print('fitting by GD')
torch.manual_seed(2)
models['GD'] = PMFGD(n, p, K).cuda()
models['GD'].load_state_dict(m0.state_dict())
models['GD'].fit(xt, num_epochs=60000, lr=5e-2, eps=1e-2)
print('fitting by Newton-Raphson')
models['Newton'] = PMFN(n, p, K).cuda()
models['Newton'].load_state_dict(m0.state_dict())
models['Newton'].fit(xt, num_epochs=200, step=0.5)
PMFN()
Look at the ELBO achieved by the algorithms.
cm = plt.get_cmap('Dark2')
plt.clf()
fig, ax = plt.subplots(1, 3)
fig.set_size_inches(7.5, 2.5)
for i, name in enumerate(models):
    temp = getattr(models[name], 'trace2', None)
    if temp:
        ax[0].plot(temp, c=cm(i), lw=1, marker=None, label=name)
ax[0].set_title('Approximating $q(z_{ijk})$')
for i, name in enumerate(models):
    mu = pd.Series(models[name].trace).ewm(alpha=0.1).mean()
    sd = pd.Series(models[name].trace).ewm(alpha=0.1).std()
    for a in ax[1:]:
        a.plot(mu, c=cm(i), lw=1, marker=None, label=name)
        a.fill_between(np.arange(len(mu)), mu - sd, mu + sd, color=cm(i), alpha=0.1)
        a.set_title('Integrating out $z_{ijk}$')
ax[-1].set_xlim(0, 1000)
ax[-1].legend(frameon=False, loc='center left', bbox_to_anchor=(1, .5))
for a in ax:
    a.set_xlabel('Epoch (after initialization)')
ax[0].set_ylabel('Negative ELBO')
fig.tight_layout()
Report the final (approximate) ELBO.
{name: np.array(models[name].trace[-10:]).mean() for name in models}
{'VBEM': 118721.9, 'GD': 118704.336, 'Newton': 118723.39}
Compare the fitted values of each approach to the observed values.
plt.clf()
fig, ax = plt.subplots(1, 3, sharey=True)
fig.set_size_inches(7, 3)
for a, name in zip(ax, models):
    with torch.no_grad():
        pm = ((models[name].alpha_l / models[name].beta_l) @ (models[name].alpha_f / models[name].beta_f).T).cpu()
    a.scatter(pm.ravel(), x.ravel(), s=1, alpha=0.1, c='k')
lim = [0, 230]
for a, name in zip(ax, models):
    a.plot(lim, lim, c='r', lw=1, ls=':')
    a.set_xlabel('Fitted value')
    a.set_title(name)
ax[0].set_ylabel('True value')
fig.tight_layout()
Compare the estimated factors to the ground truth.
plt.clf()
fig, ax = plt.subplots(3, 3, sharey=True, sharex=False)
fig.set_size_inches(6, 6)
for i, (row, name) in enumerate(zip(ax, models)):
    with torch.no_grad():
        pm = (models[name].alpha_f / models[name].beta_f).cpu().numpy()
    order = np.argmax(np.corrcoef(f, pm, rowvar=False)[K:,:K], axis=0)
    for k, a in enumerate(row):
        a.scatter(pm[:,order[k]], f[:,k], s=1, c='k', alpha=0.1)
        a.set_title(name)
    row[0].set_ylabel('True factor')
for a in ax.T:
    a[-1].set_xlabel('Estimated factor')
fig.tight_layout()
fastTopics example
Peter Carbonetto demonstrated that EM for NMF can fail to make progress, with consequences for, e.g., VBEM for LDA.
names = rpy2.robjects.r['load']('/scratch/midway2/aksarkar/singlecell/smallsimb.RData')
dat = {name: rpy2.robjects.r[name] for name in names}
n, p = dat['X'].shape
_, K = dat['L0'].shape
xt = torch.tensor(dat['X'], dtype=torch.float, device='cuda')
Initialize \(\alpha_{lik}, \alpha_{fjk}\) at the given initialization \(\ml_0, \mf_0\) and fix \(\beta_{lk} = 1, \beta_{fk} = 1\).
m0 = PMFVBEM(n, p, K).cuda()
eps = 0.1
m0.alpha_l.data = torch.clamp(torch.tensor(dat['L0']), min=eps)
m0.beta_l.data = torch.ones([1, K])
m0.alpha_f.data = torch.clamp(torch.tensor(dat['F0']), min=eps)
m0.beta_f.data = torch.ones([1, K])
models = dict()
print('fitting by VBEM')
models['VBEM'] = PMFVBEM(n, p, K).cuda()
models['VBEM'].load_state_dict(m0.state_dict())
models['VBEM'].fit(xt, num_epochs=500, step=1e-3)
PMFVBEM()
print('fitting by Newton-Raphson')
models['Newton'] = PMFN(n, p, K).cuda()
models['Newton'].load_state_dict(m0.state_dict())
models['Newton'].fit(xt, num_epochs=50, step=1e-6)
PMFN()
print('fitting by GD')
# Important: GD is stochastic
torch.manual_seed(2)
models['GD'] = PMFGD(n, p, K).cuda()
models['GD'].load_state_dict(m0.state_dict())
models['GD'].fit(xt, num_epochs=60000, n_samples=10, lr=1e-4, eps=1e-2)
PMFGD()
Look at the ELBO achieved by the algorithms.
cm = plt.get_cmap('Dark2')
plt.clf()
fig, ax = plt.subplots(1, 3)
fig.set_size_inches(7.5, 2.5)
for i, name in enumerate(models):
    temp = getattr(models[name], 'trace2', None)
    if temp:
        ax[0].plot(temp, c=cm(i), lw=1, marker=None, label=name)
ax[0].set_title('Approximating $q(z_{ijk})$')
for i, name in enumerate(models):
    mu = pd.Series(models[name].trace).ewm(alpha=0.1).mean()
    sd = pd.Series(models[name].trace).ewm(alpha=0.1).std()
    for a in ax[1:]:
        a.plot(mu, c=cm(i), lw=1, marker=None, label=name)
        a.fill_between(np.arange(len(mu)), mu - sd, mu + sd, color=cm(i), alpha=0.1)
        a.set_title('Integrating out $z_{ijk}$')
ax[-1].set_xlim(0, 500)
ax[-1].legend(frameon=False, loc='center left', bbox_to_anchor=(1, .5))
for a in ax:
    a.set_xlabel('Epoch (after initialization)')
ax[0].set_ylabel('Negative ELBO')
fig.tight_layout()
Report the final (approximate) ELBO.
{name: np.array(models[name].trace[-10:]).mean() for name in models}
{'VBEM': 42475.727, 'Newton': 139408.52, 'GD': 63879.742}
Compare the fitted values of each approach to the observed values.
plt.clf()
fig, ax = plt.subplots(1, 3, sharey=True)
fig.set_size_inches(7, 3)
for a, name in zip(ax, models):
    with torch.no_grad():
        pm = ((models[name].alpha_l / models[name].beta_l) @ (models[name].alpha_f / models[name].beta_f).T).cpu()
    a.scatter(pm.ravel(), dat['X'].ravel(), s=1, alpha=0.1, c='k')
lim = [0, 1225]
for a, name in zip(ax, models):
    a.plot(lim, lim, c='r', lw=1, ls=':')
    a.set_xlabel('Fitted value')
    a.set_title(name)
ax[0].set_ylabel('True value')
fig.tight_layout()
As an alternative initialization, run 50 epochs of VBEM. Compare VBEM, Newton-Raphson, and gradient descent, tracing the Monte Carlo estimate of the ELBO of the original model (integrating out \(z_{ijk}\)).
# Important: VBEM is randomly initialized
print('fitting initial model by VBEM')
torch.manual_seed(1)
m0 = PMFVBEM(n, p, K).cuda().fit(xt, num_epochs=50, step=1e-3)
models = dict()
# Continue VBEM updates
print('fitting by VBEM')
models['VBEM'] = PMFVBEM(n, p, K).cuda()
models['VBEM'].load_state_dict(m0.state_dict())
models['VBEM'].fit(xt, num_epochs=1000, step=1e-3)
print('fitting by Newton-Raphson')
models['Newton'] = PMFN(n, p, K).cuda()
models['Newton'].load_state_dict(m0.state_dict())
models['Newton'].fit(xt, num_epochs=200, step=1e-3)
# Important: GD is stochastic
print('fitting by GD')
torch.manual_seed(2)
models['GD'] = PMFGD(n, p, K).cuda()
models['GD'].load_state_dict(m0.state_dict())
models['GD'].fit(xt, num_epochs=60000, n_samples=10, lr=1e-3, eps=1e-2)
PMFGD()
Look at the ELBO achieved by the algorithms.
cm = plt.get_cmap('Dark2')
plt.clf()
fig, ax = plt.subplots(1, 3)
fig.set_size_inches(7.5, 2.5)
for i, name in enumerate(models):
    temp = getattr(models[name], 'trace2', None)
    if temp:
        ax[0].plot(temp, c=cm(i), lw=1, marker=None, label=name)
ax[0].set_title('Approximating $q(z_{ijk})$')
for i, name in enumerate(models):
    mu = pd.Series(models[name].trace).ewm(alpha=0.1).mean()
    sd = pd.Series(models[name].trace).ewm(alpha=0.1).std()
    for a in ax[1:]:
        a.plot(mu, c=cm(i), lw=1, marker=None, label=name)
        a.fill_between(np.arange(len(mu)), mu - sd, mu + sd, color=cm(i), alpha=0.1)
        a.set_title('Integrating out $z_{ijk}$')
ax[-1].set_xlim(0, 1000)
ax[-1].legend(frameon=False, loc='center left', bbox_to_anchor=(1, .5))
for a in ax:
    a.set_xlabel('Epoch (after initialization)')
ax[0].set_ylabel('Negative ELBO')
fig.tight_layout()
Report the final (approximate) ELBO.
{name: np.array(models[name].trace[-10:]).mean() for name in models}
{'VBEM': 35019.47, 'Newton': 35052.777, 'GD': 34970.6}
Compare the fitted values of each approach to the observed values.
plt.clf()
fig, ax = plt.subplots(1, 3, sharey=True)
fig.set_size_inches(7, 3)
for a, name in zip(ax, models):
    with torch.no_grad():
        pm = ((models[name].alpha_l / models[name].beta_l) @ (models[name].alpha_f / models[name].beta_f).T).cpu()
    a.scatter(pm.ravel(), dat['X'].ravel(), s=1, alpha=0.1, c='k')
lim = [0, 1225]
for a, name in zip(ax, models):
    a.plot(lim, lim, c='r', lw=1, ls=':')
    a.set_xlabel('Fitted value')
    a.set_title(name)
ax[0].set_ylabel('True value')
fig.tight_layout()