diff --git a/pymc3/distributions/distribution.py b/pymc3/distributions/distribution.py
index c6d45bb0f1..b1971c7922 100644
--- a/pymc3/distributions/distribution.py
+++ b/pymc3/distributions/distribution.py
@@ -91,6 +91,28 @@ def _repr_latex_(self, name=None, dist=None):
         """Magic method name for IPython to use for LaTeX formatting."""
         return None
 
+    def logp_nojac(self, *args, **kwargs):
+        """Return the logp, but do not include a jacobian term for transforms.
+
+        If we use different parametrizations for the same distribution, we
+        need to add the determinant of the jacobian of the transformation
+        to make sure the densities still describe the same distribution.
+        However, MAP estimates are not invariant with respect to the
+        parametrization, so we need to exclude the jacobian terms in this case.
+
+        This function should be overridden in subclasses for transformed
+        distributions.
+        """
+        return self.logp(*args, **kwargs)
+
+    def logp_sum(self, *args, **kwargs):
+        """Return the sum of the logp values for the given observations.
+
+        Subclasses can use this to improve the speed of logp evaluations
+        if only the sum of the logp values is needed.
+        """
+        return tt.sum(self.logp(*args, **kwargs))
+
     __latex__ = _repr_latex_
 
 
diff --git a/pymc3/distributions/transforms.py b/pymc3/distributions/transforms.py
index c813c44235..443fdb1c10 100644
--- a/pymc3/distributions/transforms.py
+++ b/pymc3/distributions/transforms.py
@@ -80,6 +80,9 @@ def logp(self, x):
         return (self.dist.logp(self.transform_used.backward(x)) +
                 self.transform_used.jacobian_det(x))
 
+    def logp_nojac(self, x):
+        return self.dist.logp(self.transform_used.backward(x))
+
 
 transform = Transform
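Note (illustration, not part of the patch): a minimal sketch of how `logp` and `logp_nojac` differ for a transformed variable. It relies on pymc3 internals at this revision — free variables live in `model.free_RVs`, `HalfNormal` is log-transformed by default, and the log transform's `jacobian_det(x)` equals `x` — so read it as an assumption-laden example rather than tested code.

    import numpy as np
    import pymc3 as pm

    with pm.Model() as m:
        pm.HalfNormal('sd', sd=1.)

    rv = m.free_RVs[0]        # the log-transformed free variable `sd_log__`
    dist = rv.distribution    # a TransformedDistribution instance
    x = np.array(0.5)

    # logp adds the jacobian determinant of the backward transform (exp),
    # which for the log transform is just x, so the values differ by 0.5.
    print(dist.logp(rv).eval({rv: x}))
    print(dist.logp_nojac(rv).eval({rv: x}))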
diff --git a/pymc3/model.py b/pymc3/model.py
index 01ac38ddc9..49156d83ab 100644
--- a/pymc3/model.py
+++ b/pymc3/model.py
@@ -169,6 +169,18 @@ def d2logp(self, vars=None):
         """Compiled log probability density hessian function"""
         return self.model.fn(hessian(self.logpt, vars))
 
+    @property
+    def logp_nojac(self):
+        return self.model.fn(self.logp_nojact)
+
+    def dlogp_nojac(self, vars=None):
+        """Compiled log density gradient function, without jacobian terms."""
+        return self.model.fn(gradient(self.logp_nojact, vars))
+
+    def d2logp_nojac(self, vars=None):
+        """Compiled log density hessian function, without jacobian terms."""
+        return self.model.fn(hessian(self.logp_nojact, vars))
+
     @property
     def fastlogp(self):
         """Compiled log probability density function"""
@@ -182,13 +194,36 @@ def fastd2logp(self, vars=None):
         """Compiled log probability density hessian function"""
         return self.model.fastfn(hessian(self.logpt, vars))
 
+    @property
+    def fastlogp_nojac(self):
+        return self.model.fastfn(self.logp_nojact)
+
+    def fastdlogp_nojac(self, vars=None):
+        """Compiled log density gradient function, without jacobian terms."""
+        return self.model.fastfn(gradient(self.logp_nojact, vars))
+
+    def fastd2logp_nojac(self, vars=None):
+        """Compiled log density hessian function, without jacobian terms."""
+        return self.model.fastfn(hessian(self.logp_nojact, vars))
+
     @property
     def logpt(self):
         """Theano scalar of log-probability of the model"""
         if getattr(self, 'total_size', None) is not None:
-            logp = tt.sum(self.logp_elemwiset) * self.scaling
+            logp = self.logp_sum_unscaledt * self.scaling
+        else:
+            logp = self.logp_sum_unscaledt
+        if self.name is not None:
+            logp.name = '__logp_%s' % self.name
+        return logp
+
+    @property
+    def logp_nojact(self):
+        """Theano scalar of log-probability, excluding jacobian terms."""
+        if getattr(self, 'total_size', None) is not None:
+            logp = tt.sum(self.logp_nojac_unscaledt) * self.scaling
         else:
-            logp = tt.sum(self.logp_elemwiset)
+            logp = tt.sum(self.logp_nojac_unscaledt)
         if self.name is not None:
             logp.name = '__logp_%s' % self.name
         return logp
@@ -623,9 +658,26 @@ def logp_dlogp_function(self, grad_vars=None, **kwargs):
     def logpt(self):
         """Theano scalar of log-probability of the model"""
         with self:
-            factors = [var.logpt for var in self.basic_RVs] + self.potentials
-            logp = tt.add(*map(tt.sum, factors))
-            logp.name = '__logp'
+            factors = [var.logpt for var in self.basic_RVs]
+            logp_factors = tt.sum(factors)
+            logp_potentials = tt.sum([tt.sum(pot) for pot in self.potentials])
+            logp = logp_factors + logp_potentials
+            if self.name:
+                logp.name = '__logp_%s' % self.name
+            else:
+                logp.name = '__logp'
+            return logp
+
+    @property
+    def logp_nojact(self):
+        """Theano scalar of log-probability of the model, excluding jacobian terms."""
+        with self:
+            factors = [var.logp_nojact for var in self.basic_RVs] + self.potentials
+            logp = tt.sum([tt.sum(factor) for factor in factors])
+            if self.name:
+                logp.name = '__logp_nojac_%s' % self.name
+            else:
+                logp.name = '__logp_nojac'
             return logp
 
     @property
@@ -634,7 +686,7 @@ def varlogpt(self):
         (excluding deterministic)."""
         with self:
             factors = [var.logpt for var in self.vars]
-            return tt.add(*map(tt.sum, factors))
+            return tt.sum(factors)
 
     @property
     def vars(self):
@@ -1066,6 +1118,10 @@ def __init__(self, type=None, owner=None, index=None, name=None,
             self.tag.test_value = np.ones(
                 distribution.shape, distribution.dtype) * distribution.default()
             self.logp_elemwiset = distribution.logp(self)
+            # The logp might need scaling in minibatches.
+            # This is done in `Factor`.
+            self.logp_sum_unscaledt = distribution.logp_sum(self)
+            self.logp_nojac_unscaledt = distribution.logp_nojac(self)
             self.total_size = total_size
             self.model = model
             self.scaling = _get_scaling(total_size, self.shape, self.ndim)
@@ -1169,6 +1225,10 @@ def __init__(self, type=None, owner=None, index=None, name=None, data=None,
             self.missing_values = data.missing_values
 
             self.logp_elemwiset = distribution.logp(data)
+            # The logp might need scaling in minibatches.
+            # This is done in `Factor`.
+            self.logp_sum_unscaledt = distribution.logp_sum(data)
+            self.logp_nojac_unscaledt = distribution.logp_nojac(data)
             self.total_size = total_size
             self.model = model
             self.distribution = distribution
@@ -1220,6 +1280,10 @@ def __init__(self, name, data, distribution, total_size=None, model=None):
         self.missing_values = [datum.missing_values for datum in self.data.values()
                                if datum.missing_values is not None]
         self.logp_elemwiset = distribution.logp(**self.data)
+        # The logp might need scaling in minibatches.
+        # This is done in `Factor`.
+        self.logp_sum_unscaledt = distribution.logp_sum(**self.data)
+        self.logp_nojac_unscaledt = distribution.logp_nojac(**self.data)
         self.total_size = total_size
         self.model = model
         self.distribution = distribution
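The model-level accessors mirror the same split. A short usage sketch (illustration only; `fastlogp` and `test_point` are existing pymc3 API, `fastlogp_nojac` is added by this patch):

    import numpy as np
    import pymc3 as pm

    with pm.Model() as model:
        sd = pm.HalfNormal('sd', sd=1.)
        pm.Normal('y', mu=0., sd=sd, observed=np.random.randn(100))

    point = model.test_point
    print(model.fastlogp(point))        # includes the jacobian term for sd
    print(model.fastlogp_nojac(point))  # leaves it out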
diff --git a/pymc3/tuning/starting.py b/pymc3/tuning/starting.py
index 78a7d64f64..7b4148549b 100644
--- a/pymc3/tuning/starting.py
+++ b/pymc3/tuning/starting.py
@@ -18,8 +18,10 @@
 
 __all__ = ['find_MAP']
 
+
 def find_MAP(start=None, vars=None, fmin=None,
-             return_raw=False, model=None, live_disp=False, callback=None, *args, **kwargs):
+             return_raw=False, model=None, live_disp=False, callback=None,
+             *args, **kwargs):
     """
     Sets state to the local maximum a posteriori point given a model.
     Current default of fmin_Hessian does not deal well with optimizing close
@@ -69,7 +71,7 @@ def find_MAP(start=None, vars=None, fmin=None,
     except AttributeError:
         gradient_avail = False
 
-    if disc_vars or not gradient_avail :
+    if disc_vars or not gradient_avail:
         pm._log.warning("Warning: gradient not available." +
                         "(E.g. vars contains discrete variables). MAP " +
                         "estimates may not be accurate for the default " +
@@ -88,13 +90,13 @@ def find_MAP(start=None, vars=None, fmin=None,
     start = Point(start, model=model)
     bij = DictToArrayBijection(ArrayOrdering(vars), start)
 
-    logp = bij.mapf(model.fastlogp)
+    logp = bij.mapf(model.fastlogp_nojac)
     def logp_o(point):
         return nan_to_high(-logp(point))
 
     # Check to see if minimization function actually uses the gradient
     if 'fprime' in getargspec(fmin).args:
-        dlogp = bij.mapf(model.fastdlogp(vars))
+        dlogp = bij.mapf(model.fastdlogp_nojac(vars))
         def grad_logp_o(point):
             return nan_to_num(-dlogp(point))
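Since `find_MAP` now optimizes `fastlogp_nojac`, the returned point maximizes the density in the original parametrization rather than in the transformed space. A usage sketch (the seed and model are illustrative; the exact keys of the returned dict depend on the pymc3 version):

    import numpy as np
    import pymc3 as pm

    np.random.seed(123)

    with pm.Model():
        sd = pm.HalfNormal('sd', sd=1.)
        pm.Normal('y', mu=0., sd=sd, observed=np.random.randn(100))
        # Maximizes the density without jacobian terms, so the result is
        # the mode of the posterior for sd itself, not for log(sd).
        map_est = pm.find_MAP()

    print(map_est)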