diff --git a/pymc3/step_methods/hmc/quadpotential.py b/pymc3/step_methods/hmc/quadpotential.py index 9ad1834ae2..65ae08d969 100644 --- a/pymc3/step_methods/hmc/quadpotential.py +++ b/pymc3/step_methods/hmc/quadpotential.py @@ -1,3 +1,4 @@ +import warnings import numpy as np from numpy.random import normal import scipy.linalg @@ -8,7 +9,8 @@ __all__ = ['quad_potential', 'QuadPotentialDiag', 'QuadPotentialFull', - 'QuadPotentialFullInv', 'QuadPotentialDiagAdapt', 'isquadpotential'] + 'QuadPotentialFullInv', 'QuadPotentialDiagAdapt', + 'QuadPotentialFullAdapt', 'isquadpotential'] def quad_potential(C, is_cov): @@ -416,7 +418,7 @@ def velocity_energy(self, x, v_out): class QuadPotentialFull(QuadPotential): """Basic QuadPotential object for Hamiltonian calculations.""" - def __init__(self, A, dtype=None): + def __init__(self, cov, dtype=None): """Compute the lower cholesky decomposition of the potential. Parameters @@ -427,32 +429,189 @@ def __init__(self, A, dtype=None): if dtype is None: dtype = theano.config.floatX self.dtype = dtype - self.A = A.astype(self.dtype) - self.L = scipy.linalg.cholesky(A, lower=True) + self._cov = np.array(cov, dtype=self.dtype, copy=True) + self._chol = scipy.linalg.cholesky(self._cov, lower=True) + self._n = len(self._cov) def velocity(self, x, out=None): """Compute the current velocity at a position in parameter space.""" - return np.dot(self.A, x, out=out) + return np.dot(self._cov, x, out=out) def random(self): """Draw random value from QuadPotential.""" - n = floatX(normal(size=self.L.shape[0])) - return scipy.linalg.solve_triangular(self.L.T, n) + vals = np.random.normal(size=self._n).astype(self.dtype) + return scipy.linalg.solve_triangular(self._chol.T, vals, + overwrite_b=True) def energy(self, x, velocity=None): """Compute kinetic energy at a position in parameter space.""" if velocity is None: velocity = self.velocity(x) - return .5 * x.dot(velocity) + return 0.5 * np.dot(x, velocity) def velocity_energy(self, x, v_out): """Compute velocity and return kinetic energy at a position in parameter space.""" self.velocity(x, out=v_out) - return 0.5 * np.dot(x, v_out) + return self.energy(x, v_out) __call__ = random +class QuadPotentialFullAdapt(QuadPotentialFull): + """Adapt a dense mass matrix using the sample covariances + + If the parameter ``doubling`` is true, the adaptation window is doubled + every time it is passed. This can lead to better convergence of the mass + matrix estimation. + + """ + def __init__( + self, + n, + initial_mean, + initial_cov=None, + initial_weight=0, + adaptation_window=101, + update_window=1, + doubling=True, + dtype=None, + ): + warnings.warn("QuadPotentialFullAdapt is an experimental feature") + + if initial_cov is not None and initial_cov.ndim != 2: + raise ValueError("Initial covariance must be two-dimensional.") + if initial_mean.ndim != 1: + raise ValueError("Initial mean must be one-dimensional.") + if initial_cov is not None and initial_cov.shape != (n, n): + raise ValueError( + "Wrong shape for initial_cov: expected %s got %s" + % (n, initial_cov.shape) + ) + if len(initial_mean) != n: + raise ValueError( + "Wrong shape for initial_mean: expected %s got %s" + % (n, len(initial_mean)) + ) + + if dtype is None: + dtype = theano.config.floatX + + if initial_cov is None: + initial_cov = np.eye(n, dtype=dtype) + initial_weight = 1 + + self.dtype = dtype + self._n = n + self._cov = np.array(initial_cov, dtype=self.dtype, copy=True) + self._chol = scipy.linalg.cholesky(self._cov, lower=True) + self._chol_error = None + self._foreground_cov = _WeightedCovariance( + self._n, initial_mean, initial_cov, initial_weight, self.dtype + ) + self._background_cov = _WeightedCovariance(self._n, dtype=self.dtype) + self._n_samples = 0 + + self._doubling = doubling + self._adaptation_window = int(adaptation_window) + self._update_window = int(update_window) + self._previous_update = 0 + + def _update_from_weightvar(self, weightvar): + weightvar.current_covariance(out=self._cov) + try: + self._chol = scipy.linalg.cholesky(self._cov, lower=True) + except (scipy.linalg.LinAlgError, ValueError) as error: + self._chol_error = error + + def update(self, sample, grad, tune): + if not tune: + return + + # Steps since previous update + delta = self._n_samples - self._previous_update + + self._foreground_cov.add_sample(sample, weight=1) + self._background_cov.add_sample(sample, weight=1) + + # Update the covariance matrix and recompute the Cholesky factorization + # every "update_window" steps + if (delta + 1) % self._update_window == 0: + self._update_from_weightvar(self._foreground_cov) + + # Reset the background covariance if we are at the end of the adaptation window. + if delta >= self._adaptation_window: + self._foreground_cov = self._background_cov + self._background_cov = _WeightedCovariance( + self._n, dtype=self.dtype + ) + + self._previous_update = self._n_samples + if self._doubling: + self._adaptation_window *= 2 + + self._n_samples += 1 + + def raise_ok(self, vmap): + if self._chol_error is not None: + raise ValueError("{0}".format(self._chol_error)) + + +class _WeightedCovariance: + """Online algorithm for computing mean and covariance + + This implements the `Welford's algorithm + `_ based + on the implementation in `the Stan math library + `_. + + """ + + def __init__( + self, + nelem, + initial_mean=None, + initial_covariance=None, + initial_weight=0, + dtype="d", + ): + self._dtype = dtype + self.n_samples = float(initial_weight) + if initial_mean is None: + self.mean = np.zeros(nelem, dtype="d") + else: + self.mean = np.array(initial_mean, dtype="d", copy=True) + if initial_covariance is None: + self.raw_cov = np.eye(nelem, dtype="d") + else: + self.raw_cov = np.array(initial_covariance, dtype="d", copy=True) + + self.raw_cov[:] *= self.n_samples + + if self.raw_cov.shape != (nelem, nelem): + raise ValueError("Invalid shape for initial covariance.") + if self.mean.shape != (nelem,): + raise ValueError("Invalid shape for initial mean.") + + def add_sample(self, x, weight): + x = np.asarray(x) + self.n_samples += 1 + old_diff = x - self.mean + self.mean[:] += old_diff / self.n_samples + new_diff = x - self.mean + self.raw_cov[:] += weight * new_diff[:, None] * old_diff[None, :] + + def current_covariance(self, out=None): + if self.n_samples == 0: + raise ValueError("Can not compute covariance without samples.") + if out is not None: + return np.divide(self.raw_cov, self.n_samples - 1, out=out) + else: + return (self.raw_cov / (self.n_samples - 1)).astype(self._dtype) + + def current_mean(self): + return np.array(self.mean, dtype=self._dtype) + + try: import sksparse.cholmod as cholmod chol_available = True diff --git a/pymc3/tests/test_quadpotential.py b/pymc3/tests/test_quadpotential.py index 0b41c7547f..a22ece3f09 100644 --- a/pymc3/tests/test_quadpotential.py +++ b/pymc3/tests/test_quadpotential.py @@ -38,16 +38,16 @@ def test_equal_diag(): x = floatX(np.random.randn(5)) pots = [ quadpotential.quad_potential(diag, False), - quadpotential.quad_potential(1. / diag, True), + quadpotential.quad_potential(1.0 / diag, True), quadpotential.quad_potential(np.diag(diag), False), - quadpotential.quad_potential(np.diag(1. / diag), True), + quadpotential.quad_potential(np.diag(1.0 / diag), True), ] if quadpotential.chol_available: - diag_ = scipy.sparse.csc_matrix(np.diag(1. / diag)) + diag_ = scipy.sparse.csc_matrix(np.diag(1.0 / diag)) pots.append(quadpotential.quad_potential(diag_, True)) - v = np.diag(1. / diag).dot(x) - e = x.dot(np.diag(1. / diag).dot(x)) / 2 + v = np.diag(1.0 / diag).dot(x) + e = x.dot(np.diag(1.0 / diag).dot(x)) / 2 for pot in pots: v_ = pot.velocity(x) e_ = pot.energy(x) @@ -85,9 +85,9 @@ def test_random_diag(): np.random.seed(42) pots = [ quadpotential.quad_potential(d, True), - quadpotential.quad_potential(1./d, False), + quadpotential.quad_potential(1.0 / d, False), quadpotential.quad_potential(np.diag(d), True), - quadpotential.quad_potential(np.diag(1./d), False), + quadpotential.quad_potential(np.diag(1.0 / d), False), ] if quadpotential.chol_available: d_ = scipy.sparse.csc_matrix(np.diag(d)) @@ -95,7 +95,7 @@ def test_random_diag(): pots.append(pot) for pot in pots: vals = np.array([pot.random() for _ in range(1000)]) - npt.assert_allclose(vals.std(0), np.sqrt(1./d), atol=0.1) + npt.assert_allclose(vals.std(0), np.sqrt(1.0 / d), atol=0.1) def test_random_dense(): @@ -112,7 +112,9 @@ def test_random_dense(): quadpotential.QuadPotentialFullInv(inv), ] if quadpotential.chol_available: - pot = quadpotential.QuadPotential_Sparse(scipy.sparse.csc_matrix(cov)) + pot = quadpotential.QuadPotential_Sparse( + scipy.sparse.csc_matrix(cov) + ) pots.append(pot) for pot in pots: cov_ = np.cov(np.array([pot.random() for _ in range(1000)]).T) @@ -137,3 +139,133 @@ def energy(self, x, velocity=None): step = pymc3.NUTS(potential=pot) pymc3.sample(10, init=None, step=step, chains=1) assert called + + +def test_weighted_covariance(ndim=10, seed=5432): + np.random.seed(seed) + + L = np.random.randn(ndim, ndim) + L[np.triu_indices_from(L, 1)] = 0.0 + L[np.diag_indices_from(L)] = np.exp(L[np.diag_indices_from(L)]) + cov = np.dot(L, L.T) + mean = np.random.randn(ndim) + + samples = np.random.multivariate_normal(mean, cov, size=100) + mu_est0 = np.mean(samples, axis=0) + cov_est0 = np.cov(samples, rowvar=0) + + est = quadpotential._WeightedCovariance(ndim) + for sample in samples: + est.add_sample(sample, 1) + mu_est = est.current_mean() + cov_est = est.current_covariance() + + assert np.allclose(mu_est, mu_est0) + assert np.allclose(cov_est, cov_est0) + + # Make sure that the weighted estimate also works + est2 = quadpotential._WeightedCovariance( + ndim, + np.mean(samples[:10], axis=0), + np.cov(samples[:10], rowvar=0, bias=True), + 10, + ) + for sample in samples[10:]: + est2.add_sample(sample, 1) + mu_est2 = est2.current_mean() + cov_est2 = est2.current_covariance() + + assert np.allclose(mu_est2, mu_est0) + assert np.allclose(cov_est2, cov_est0) + + +def test_full_adapt_sample_p(seed=4566): + # ref: https://github.com/stan-dev/stan/pull/2672 + np.random.seed(seed) + m = np.array([[3.0, -2.0], [-2.0, 4.0]]) + m_inv = np.linalg.inv(m) + + var = np.array( + [ + [2 * m[0, 0], m[1, 0] * m[1, 0] + m[1, 1] * m[0, 0]], + [m[0, 1] * m[0, 1] + m[1, 1] * m[0, 0], 2 * m[1, 1]], + ] + ) + + n_samples = 1000 + pot = quadpotential.QuadPotentialFullAdapt(2, np.zeros(2), m_inv, 1) + samples = [pot.random() for n in range(n_samples)] + sample_cov = np.cov(samples, rowvar=0) + + # Covariance matrix within 5 sigma of expected value + # (comes from a Wishart distribution) + assert np.all(np.abs(m - sample_cov) < 5 * np.sqrt(var / n_samples)) + + +def test_full_adapt_update_window(seed=1123): + np.random.seed(seed) + init_cov = np.array([[1.0, 0.02], [0.02, 0.8]]) + pot = quadpotential.QuadPotentialFullAdapt( + 2, np.zeros(2), init_cov, 1, update_window=50 + ) + assert np.allclose(pot._cov, init_cov) + for i in range(49): + pot.update(np.random.randn(2), None, True) + assert np.allclose(pot._cov, init_cov) + pot.update(np.random.randn(2), None, True) + assert not np.allclose(pot._cov, init_cov) + + +def test_full_adapt_adaptation_window(seed=8978): + np.random.seed(seed) + window = 10 + pot = quadpotential.QuadPotentialFullAdapt( + 2, np.zeros(2), np.eye(2), 1, adaptation_window=window + ) + for i in range(window + 1): + pot.update(np.random.randn(2), None, True) + assert pot._previous_update == window + assert pot._adaptation_window == window * 2 + + pot = quadpotential.QuadPotentialFullAdapt( + 2, np.zeros(2), np.eye(2), 1, adaptation_window=window, doubling=False + ) + for i in range(window + 1): + pot.update(np.random.randn(2), None, True) + assert pot._previous_update == window + assert pot._adaptation_window == window + + +def test_full_adapt_not_invertible(): + window = 10 + pot = quadpotential.QuadPotentialFullAdapt( + 2, np.zeros(2), np.eye(2), 0, adaptation_window=window + ) + for i in range(window + 1): + pot.update(np.ones(2), None, True) + with pytest.raises(ValueError): + pot.raise_ok(None) + + +def test_full_adapt_warn(): + with pytest.warns(UserWarning): + quadpotential.QuadPotentialFullAdapt(2, np.zeros(2), np.eye(2), 0) + + +def test_full_adapt_sampling(seed=289586): + np.random.seed(seed) + + L = np.random.randn(5, 5) + L[np.diag_indices_from(L)] = np.exp(L[np.diag_indices_from(L)]) + L[np.triu_indices_from(L, 1)] = 0.0 + + with pymc3.Model() as model: + pymc3.MvNormal("a", mu=np.zeros(len(L)), chol=L, shape=len(L)) + + pot = quadpotential.QuadPotentialFullAdapt( + model.ndim, np.zeros(model.ndim) + ) + step = pymc3.NUTS(model=model, potential=pot) + pymc3.sample( + draws=10, tune=1000, random_seed=seed, step=step, cores=1, chains=1 + )