diff --git a/pymc3/__init__.py b/pymc3/__init__.py
index 291a841471..215f867ee8 100644
--- a/pymc3/__init__.py
+++ b/pymc3/__init__.py
@@ -18,8 +18,6 @@
 from .exceptions import *
 from . import sampling
 
-from .debug import *
-
 from .diagnostics import *
 from .backends.tracetab import *
 
diff --git a/pymc3/debug.py b/pymc3/debug.py
deleted file mode 100644
index 1821bd194a..0000000000
--- a/pymc3/debug.py
+++ /dev/null
@@ -1,24 +0,0 @@
-from .blocking import DictToVarBijection
-
-# TODO I could not locate this function used anywhere in the code base
-# do we need it?
-
-
-def eval_univariate(f, var, idx, point, x):
-    """
-    Evaluate a function as a at a specific point and only varying values at one index.
-
-    Useful for debugging misspecified likelihoods.
-
-    Parameters
-    ----------
-
-    f : function : dict -> val
-    var : variable
-    idx : index into variable
-    point : point at which to center
-    x : array points at which to evaluate x
-
-    """
-    bij = DictToVarBijection(var, idx, point)
-    return list(map(bij.mapf(f), x))
diff --git a/pymc3/distributions/continuous.py b/pymc3/distributions/continuous.py
index ee67a89e68..49f015bd4a 100644
--- a/pymc3/distributions/continuous.py
+++ b/pymc3/distributions/continuous.py
@@ -201,7 +201,7 @@ def logp(self, value):
         return tt.zeros_like(value)
 
     def _repr_latex_(self, name=None, dist=None):
-        return r'${} \sim \text{Flat}()$'.format(name)
+        return r'${} \sim \text{{Flat}}()$'.format(name)
 
 
 class HalfFlat(PositiveContinuous):
@@ -218,7 +218,7 @@ def logp(self, value):
         return bound(tt.zeros_like(value), value > 0)
 
     def _repr_latex_(self, name=None, dist=None):
-        return r'${} \sim \text{{HalfFlat}()$'.format(name)
+        return r'${} \sim \text{{HalfFlat}}()$'.format(name)
 
 
 class Normal(Continuous):
diff --git a/pymc3/distributions/discrete.py b/pymc3/distributions/discrete.py
index 11318ee090..6241a69fed 100644
--- a/pymc3/distributions/discrete.py
+++ b/pymc3/distributions/discrete.py
@@ -632,11 +632,13 @@ def logp(self, value):
         psi = self.psi
         theta = self.theta
 
-        logp_val = tt.switch(tt.gt(value, 0),
-                     tt.log(psi) + self.pois.logp(value),
-                     logaddexp(tt.log1p(-psi), tt.log(psi) - theta))
+        logp_val = tt.switch(
+            tt.gt(value, 0),
+            tt.log(psi) + self.pois.logp(value),
+            logaddexp(tt.log1p(-psi), tt.log(psi) - theta))
 
-        return bound(logp_val,
+        return bound(
+            logp_val,
             0 <= value,
             0 <= psi, psi <= 1,
             0 <= theta)
@@ -700,11 +702,13 @@ def logp(self, value):
         p = self.p
         n = self.n
 
-        logp_val = tt.switch(tt.gt(value, 0),
-                 tt.log(psi) + self.bin.logp(value),
-                 logaddexp(tt.log1p(-psi), tt.log(psi) + n * tt.log1p(-p)))
+        logp_val = tt.switch(
+            tt.gt(value, 0),
+            tt.log(psi) + self.bin.logp(value),
+            logaddexp(tt.log1p(-psi), tt.log(psi) + n * tt.log1p(-p)))
 
-        return bound(logp_val,
+        return bound(
+            logp_val,
             0 <= value, value <= n,
             0 <= psi, psi <= 1,
             0 <= p, p <= 1)
@@ -715,10 +719,14 @@ def _repr_latex_(self, name=None, dist=None):
         n = dist.n
         p = dist.p
         psi = dist.psi
-        return r'${} \sim \text{{ZeroInflatedBinomial}}(\mathit{{n}}={}, \mathit{{p}}={}, \mathit{{psi}}={})$'.format(name,
-                                                get_variable_name(n),
-                                                get_variable_name(p),
-                                                get_variable_name(psi))
+
+        name_n = get_variable_name(n)
+        name_p = get_variable_name(p)
+        name_psi = get_variable_name(psi)
+        return (r'${} \sim \text{{ZeroInflatedBinomial}}'
+                r'(\mathit{{n}}={}, \mathit{{p}}={}, '
+                r'\mathit{{psi}}={})$'
+                .format(name, name_n, name_p, name_psi))
 
 
 class ZeroInflatedNegativeBinomial(Discrete):
@@ -731,10 +739,18 @@ class ZeroInflatedNegativeBinomial(Discrete):
 
     .. math::
 
-       f(x \mid \psi, \mu, \alpha) = \left\{ \begin{array}{l}
-            (1-\psi) + \psi \left (\frac{\alpha}{\alpha+\mu} \right) ^\alpha, \text{if } x = 0 \\
-            \psi \frac{\Gamma(x+\alpha)}{x! \Gamma(\alpha)} \left (\frac{\alpha}{\mu+\alpha} \right)^\alpha \left( \frac{\mu}{\mu+\alpha} \right)^x, \text{if } x=1,2,3,\ldots
-            \end{array} \right.
+       f(x \mid \psi, \mu, \alpha) = \left\{
+         \begin{array}{l}
+           (1-\psi) + \psi \left (
+             \frac{\alpha}{\alpha+\mu}
+           \right) ^\alpha, \text{if } x = 0 \\
+           \psi \frac{\Gamma(x+\alpha)}{x! \Gamma(\alpha)} \left (
+             \frac{\alpha}{\mu+\alpha}
+           \right)^\alpha \left(
+             \frac{\mu}{\mu+\alpha}
+           \right)^x, \text{if } x=1,2,3,\ldots
+         \end{array}
+       \right.
 
     ========  ==========================
     Support   :math:`x \in \mathbb{N}_0`
@@ -776,14 +792,21 @@ def logp(self, value):
         mu = self.mu
         psi = self.psi
 
-        logp_val = tt.switch(tt.gt(value, 0),
-                     tt.log(psi) + self.nb.logp(value),
-                     logaddexp(tt.log1p(-psi), tt.log(psi) + alpha * (tt.log(alpha) - tt.log(alpha + mu))))
+        logp_other = tt.log(psi) + self.nb.logp(value)
+        logp_0 = logaddexp(
+            tt.log1p(-psi),
+            tt.log(psi) + alpha * (tt.log(alpha) - tt.log(alpha + mu)))
+
+        logp_val = tt.switch(
+            tt.gt(value, 0),
+            logp_other,
+            logp_0)
 
-        return bound(logp_val,
-                    0 <= value,
-                    0 <= psi, psi <= 1,
-                    mu > 0, alpha > 0)
+        return bound(
+            logp_val,
+            0 <= value,
+            0 <= psi, psi <= 1,
+            mu > 0, alpha > 0)
 
     def _repr_latex_(self, name=None, dist=None):
         if dist is None:
@@ -791,7 +814,11 @@ def _repr_latex_(self, name=None, dist=None):
         mu = dist.mu
         alpha = dist.alpha
         psi = dist.psi
-        return r'${} \sim \text{{ZeroInflatedNegativeBinomial}}(\mathit{{mu}}={}, \mathit{{alpha}}={}, \mathit{{psi}}={})$'.format(name,
-                                                get_variable_name(mu),
-                                                get_variable_name(alpha),
-                                                get_variable_name(psi))
+
+        name_mu = get_variable_name(mu)
+        name_alpha = get_variable_name(alpha)
+        name_psi = get_variable_name(psi)
+        return (r'${} \sim \text{{ZeroInflatedNegativeBinomial}}'
+                r'(\mathit{{mu}}={}, \mathit{{alpha}}={}, '
+                r'\mathit{{psi}}={})$'
+                .format(name, name_mu, name_alpha, name_psi))
diff --git a/pymc3/distributions/distribution.py b/pymc3/distributions/distribution.py
index c6d45bb0f1..b1971c7922 100644
--- a/pymc3/distributions/distribution.py
+++ b/pymc3/distributions/distribution.py
@@ -91,6 +91,28 @@ def _repr_latex_(self, name=None, dist=None):
         """Magic method name for IPython to use for LaTeX formatting."""
         return None
 
+    def logp_nojac(self, *args, **kwargs):
+        """Return the logp, but do not include a jacobian term for transforms.
+
+        If we use different parametrizations for the same distribution, we
+        need to add the determinant of the jacobian of the transformation
+        to make sure the densities still describe the same distribution.
+        However, MAP estimates are not invariant with respect to the
+        parametrization, we need to exclude the jacobian terms in this case.
+
+        This function should be overwritten in base classes for transformed
+        distributions.
+        """
+        return self.logp(*args, **kwargs)
+
+    def logp_sum(self, *args, **kwargs):
+        """Return the sum of the logp values for the given observations.
+
+        Subclasses can use this to improve the speed of logp evaluations
+        if only the sum of the logp values is needed.
+        """
+        return tt.sum(self.logp(*args, **kwargs))
+
     __latex__ = _repr_latex_
 
 
diff --git a/pymc3/distributions/transforms.py b/pymc3/distributions/transforms.py
index c813c44235..443fdb1c10 100644
--- a/pymc3/distributions/transforms.py
+++ b/pymc3/distributions/transforms.py
@@ -80,6 +80,9 @@ def logp(self, x):
         return (self.dist.logp(self.transform_used.backward(x)) +
                 self.transform_used.jacobian_det(x))
 
+    def logp_nojac(self, x):
+        return self.dist.logp(self.transform_used.backward(x))
+
 transform = Transform
 
 
diff --git a/pymc3/model.py b/pymc3/model.py
index 01ac38ddc9..92eaa4604e 100644
--- a/pymc3/model.py
+++ b/pymc3/model.py
@@ -152,6 +152,9 @@ class Factor(object):
     """Common functionality for objects with a log probability density
     associated with them.
     """
+    def __init__(self, *args, **kwargs):
+        super(Factor, self).__init__(*args, **kwargs)
+
     @property
     def logp(self):
         """Compiled log probability density function"""
@@ -169,6 +172,18 @@ def d2logp(self, vars=None):
         """Compiled log probability density hessian function"""
         return self.model.fn(hessian(self.logpt, vars))
 
+    @property
+    def logp_nojac(self):
+        return self.model.fn(self.logp_nojact)
+
+    def dlogp_nojac(self, vars=None):
+        """Compiled log density gradient function, without jacobian terms."""
+        return self.model.fn(gradient(self.logp_nojact, vars))
+
+    def d2logp_nojac(self, vars=None):
+        """Compiled log density hessian function, without jacobian terms."""
+        return self.model.fn(hessian(self.logp_nojact, vars))
+
     @property
     def fastlogp(self):
         """Compiled log probability density function"""
@@ -182,13 +197,36 @@ def fastd2logp(self, vars=None):
         """Compiled log probability density hessian function"""
         return self.model.fastfn(hessian(self.logpt, vars))
 
+    @property
+    def fastlogp_nojac(self):
+        return self.model.fastfn(self.logp_nojact)
+
+    def fastdlogp_nojac(self, vars=None):
+        """Compiled log density gradient function, without jacobian terms."""
+        return self.model.fastfn(gradient(self.logp_nojact, vars))
+
+    def fastd2logp_nojac(self, vars=None):
+        """Compiled log density hessian function, without jacobian terms."""
+        return self.model.fastfn(hessian(self.logp_nojact, vars))
+
     @property
     def logpt(self):
         """Theano scalar of log-probability of the model"""
         if getattr(self, 'total_size', None) is not None:
-            logp = tt.sum(self.logp_elemwiset) * self.scaling
+            logp = self.logp_sum_unscaledt * self.scaling
         else:
-            logp = tt.sum(self.logp_elemwiset)
+            logp = self.logp_sum_unscaledt
+        if self.name is not None:
+            logp.name = '__logp_%s' % self.name
+        return logp
+
+    @property
+    def logp_nojact(self):
+        """Theano scalar of log-probability, excluding jacobian terms."""
+        if getattr(self, 'total_size', None) is not None:
+            logp = tt.sum(self.logp_nojac_unscaledt) * self.scaling
+        else:
+            logp = tt.sum(self.logp_nojac_unscaledt)
         if self.name is not None:
             logp.name = '__logp_%s' % self.name
         return logp
@@ -623,9 +661,26 @@ def logp_dlogp_function(self, grad_vars=None, **kwargs):
     def logpt(self):
         """Theano scalar of log-probability of the model"""
         with self:
-            factors = [var.logpt for var in self.basic_RVs] + self.potentials
-            logp = tt.add(*map(tt.sum, factors))
-            logp.name = '__logp'
+            factors = [var.logpt for var in self.basic_RVs]
+            logp_factors = tt.sum(factors)
+            logp_potentials = tt.sum([tt.sum(pot) for pot in self.potentials])
+            logp = logp_factors + logp_potentials
+            if self.name:
+                logp.name = '__logp_%s' % self.name
+            else:
+                logp.name = '__logp'
+            return logp
+
+    @property
+    def logp_nojact(self):
+        """Theano scalar of log-probability of the model"""
+        with self:
+            factors = [var.logp_nojact for var in self.basic_RVs] + self.potentials
+            logp = tt.sum([tt.sum(factor) for factor in factors])
+            if self.name:
+                logp.name = '__logp_nojac_%s' % self.name
+            else:
+                logp.name = '__logp_nojac'
             return logp
 
     @property
@@ -634,7 +689,7 @@ def varlogpt(self):
            (excluding deterministic)."""
         with self:
             factors = [var.logpt for var in self.vars]
-            return tt.add(*map(tt.sum, factors))
+            return tt.sum(factors)
 
     @property
     def vars(self):
@@ -1066,6 +1121,10 @@ def __init__(self, type=None, owner=None, index=None, name=None,
             self.tag.test_value = np.ones(
                 distribution.shape, distribution.dtype) * distribution.default()
             self.logp_elemwiset = distribution.logp(self)
+            # The logp might need scaling in minibatches.
+            # This is done in `Factor`.
+            self.logp_sum_unscaledt = distribution.logp_sum(self)
+            self.logp_nojac_unscaledt = distribution.logp_nojac(self)
             self.total_size = total_size
             self.model = model
             self.scaling = _get_scaling(total_size, self.shape, self.ndim)
@@ -1169,6 +1228,10 @@ def __init__(self, type=None, owner=None, index=None, name=None, data=None,
 
             self.missing_values = data.missing_values
             self.logp_elemwiset = distribution.logp(data)
+            # The logp might need scaling in minibatches.
+            # This is done in `Factor`.
+            self.logp_sum_unscaledt = distribution.logp_sum(data)
+            self.logp_nojac_unscaledt = distribution.logp_nojac(data)
             self.total_size = total_size
             self.model = model
             self.distribution = distribution
@@ -1220,6 +1283,10 @@ def __init__(self, name, data, distribution, total_size=None, model=None):
         self.missing_values = [datum.missing_values for datum in self.data.values()
                                if datum.missing_values is not None]
         self.logp_elemwiset = distribution.logp(**self.data)
+        # The logp might need scaling in minibatches.
+        # This is done in `Factor`.
+        self.logp_sum_unscaledt = distribution.logp_sum(**self.data)
+        self.logp_nojac_unscaledt = distribution.logp_nojac(**self.data)
         self.total_size = total_size
         self.model = model
         self.distribution = distribution
diff --git a/pymc3/stats.py b/pymc3/stats.py
index 58d3287d5e..7bc212f318 100644
--- a/pymc3/stats.py
+++ b/pymc3/stats.py
@@ -14,6 +14,7 @@
 from scipy.misc import logsumexp
 from scipy.stats import dirichlet
 from scipy.stats.distributions import pareto
+from scipy.optimize import minimize
 
 from .backends import tracetab as ttab
 
@@ -340,7 +341,7 @@ def bpic(trace, model=None):
     return 3 * mean_deviance - 2 * deviance_at_mean
 
 
-def compare(traces, models, ic='WAIC', bootstrap=True, b_samples=1000,
+def compare(traces, models, ic='WAIC', method='stacking', b_samples=1000,
             alpha=1, seed=None):
     """Compare models based on the widely available information criterion (WAIC)
     or leave-one-out (LOO) cross-validation.
@@ -355,19 +356,28 @@ def compare(traces, models, ic='WAIC', bootstrap=True, b_samples=1000,
     ic : string
         Information Criterion (WAIC or LOO) used to compare models.
         Default WAIC.
-    bootstrap : boolean
-        If True a Bayesian bootstrap will be used to compute the weights and
-        the standard error of the IC estimate (SE).
+    method : str
+        Method used to estimate the weights for each model. Available options
+        are:
+            - 'stacking' : (default) stacking of predictive distributions.
+            - 'BB-pseudo-BMA' : pseudo-Bayesian Model averaging using Akaike-type
+        weighting. The weights are stabilized using the Bayesian bootstrap
+            - 'pseudo-BMA': pseudo-Bayesian Model averaging using Akaike-type
+        weighting, without Bootstrap stabilization (not recommended)
+
+        For more information read https://arxiv.org/abs/1704.02030
     b_samples: int
-        Number of samples taken by the Bayesian bootstrap estimation
+        Number of samples taken by the Bayesian bootstrap estimation. Only
+        useful when method = 'BB-pseudo-BMA'.
     alpha : float
         The shape parameter in the Dirichlet distribution used for the
-        Bayesian bootstrap. When alpha=1 (default), the distribution is uniform
-        on the simplex. A smaller alpha will keeps the final weights
-        more away from 0 and 1.
+        Bayesian bootstrap. Only useful when method = 'BB-pseudo-BMA'. When
+        alpha=1 (default), the distribution is uniform on the simplex. A
+        smaller alpha will keeps the final weights more away from 0 and 1.
     seed : int or np.random.RandomState instance
-           If int or RandomState, use it for seeding Bayesian bootstrap.
-           Default None the global np.random state is used.
+           If int or RandomState, use it for seeding Bayesian bootstrap. Only
+           useful when method = 'BB-pseudo-BMA'. Default None the global
+           np.random state is used.
 
     Returns
     -------
@@ -380,13 +390,13 @@ def compare(traces, models, ic='WAIC', bootstrap=True, b_samples=1000,
     dIC : Relative difference between each IC (WAIC or LOO)
     and the lowest IC (WAIC or LOO).
         It's always 0 for the top-ranked model.
-    weight: Akaike-like weights for each model.
+    weight: Relative weight for each model.
         This can be loosely interpreted as the probability of each model
         (among the compared model) given the data. By default the uncertainty 
         in the weights estimation is considered using Bayesian bootstrap.
     SE : Standard error of the IC estimate.
-        By default these values are estimated using Bayesian bootstrap (best
-        option) or, if bootstrap=False, using a Gaussian approximation
+        If method = BB-pseudo-BMA these values are estimated using Bayesian
+        bootstrap.
     dSE : Standard error of the difference in IC between each model and
     the top-ranked model.
         It's always 0 for the top-ranked model.
@@ -409,6 +419,14 @@ def compare(traces, models, ic='WAIC', bootstrap=True, b_samples=1000,
         raise NotImplementedError(
             'The information criterion {} is not supported.'.format(ic))
 
+    if len(set([len(m.observed_RVs) for m in models])) != 1:
+        raise ValueError(
+            'The Observed RVs should be the same across all models')
+
+    if method not in ['stacking', 'BB-pseudo-BMA', 'pseudo-BMA']:
+        raise NotImplementedError(
+            'The method to compute weights {} is not supported.'.format(method))
+
     warns = np.zeros(len(models))
 
     c = 0
@@ -425,45 +443,95 @@ def add_warns(*args):
 
     ics.sort(key=lambda x: x[1][0])
 
-    if bootstrap:
-        N = len(ics[0][1][3])
-
-        ic_i = np.zeros((len(ics), N))
-        for i in range(len(ics)):
-            ic_i[i] = ics[i][1][3] * N
+    if method == 'stacking':
+        N, K, ic_i = _ic_matrix(ics)
+        exp_ic_i = np.exp(-0.5 * ic_i)
+        Km = K - 1
+
+        def w_fuller(w):
+            return np.concatenate((w, 1. - np.sum(w, keepdims=True)))
+
+        def log_score(w):
+            w_full = w_fuller(w)
+            score = 0.
+            for i in range(N):
+                score += np.log(np.dot(exp_ic_i[i], w_full))
+            return -score
+
+        def gradient(w):
+            w_full = w_fuller(w)
+            grad = np.zeros(Km)
+            for k in range(Km):
+                for i in range(N):
+                    grad[k] += (exp_ic_i[i, k] - exp_ic_i[i, Km]) / \
+                        np.dot(exp_ic_i[i], w_full)
+            return -grad
+
+        theta = np.full(Km, 1. / K)
+        bounds = [(0., 1.) for i in range(Km)]
+        constraints = [{'type': 'ineq', 'fun': lambda x: -np.sum(x) + 1.},
+                       {'type': 'ineq', 'fun': lambda x: np.sum(x)}]
+
+        w = minimize(fun=log_score,
+                     x0=theta,
+                     jac=gradient,
+                     bounds=bounds,
+                     constraints=constraints)
+
+        weights = w_fuller(w['x'])
+        ses = [res[1] for _, res in ics]
+
+    elif method == 'BB-pseudo-BMA':
+        N, K, ic_i = _ic_matrix(ics)
+        ic_i = ic_i * N
 
         b_weighting = dirichlet.rvs(alpha=[alpha] * N, size=b_samples,
                                     random_state=seed)
-        weights = np.zeros((b_samples, len(ics)))
-        z_bs = np.zeros((b_samples, len(ics)))
+        weights = np.zeros((b_samples, K))
+        z_bs = np.zeros_like(weights)
         for i in range(b_samples):
-            z_b = np.dot(ic_i, b_weighting[i])
+            z_b = np.dot(b_weighting[i], ic_i)
             u_weights = np.exp(-0.5 * (z_b - np.min(z_b)))
             z_bs[i] = z_b
             weights[i] = u_weights / np.sum(u_weights)
 
-        weights_mean = weights.mean(0)
-        se = z_bs.std(0)
-        for i, (idx, res) in enumerate(ics):
-            diff = res[3] - ics[0][1][3]
-            d_ic = np.sum(diff)
-            d_se = np.sqrt(len(diff) * np.var(diff))
-            df_comp.at[idx] = (res[0], res[2], d_ic, weights_mean[i],
-                               se[i], d_se, warns[idx])
+        weights = weights.mean(0)
+        ses = z_bs.std(0)
 
-    else:
+    elif method == 'pseudo-BMA':
         min_ic = ics[0][1][0]
         Z = np.sum([np.exp(-0.5 * (x[1][0] - min_ic)) for x in ics])
+        weights = []
+        ses = []
+        for _, res in ics:
+            weights.append(np.exp(-0.5 * (res[0] - min_ic)) / Z)
+            ses.append(res[1])
 
-        for idx, res in ics:
+    if np.any(weights):
+        for i, (idx, res) in enumerate(ics):
             diff = res[3] - ics[0][1][3]
             d_ic = np.sum(diff)
             d_se = np.sqrt(len(diff) * np.var(diff))
-            weight = np.exp(-0.5 * (res[0] - min_ic)) / Z
-            df_comp.at[idx] = (res[0], res[2], d_ic, weight, res[1],
-                               d_se, warns[idx])
+            se = ses[i]
+            weight = weights[i]
+            df_comp.at[idx] = (res[0], res[2], d_ic, weight, se, d_se,
+                               warns[idx])
+
+        return df_comp.sort_values(by=ic)
+
+
+def _ic_matrix(ics):
+    """Store the previously computed pointwise predictive accuracy values (ics)
+    in a 2D matrix array.
+    """
+    N = len(ics[0][1][3])
+    K = len(ics)
+
+    ic_i = np.zeros((N, K))
+    for i in range(K):
+        ic_i[:, i] = ics[i][1][3]
 
-    return df_comp.sort_values(by=ic)
+    return N, K, ic_i
 
 
 def make_indices(dimensions):
diff --git a/pymc3/tests/test_distributions.py b/pymc3/tests/test_distributions.py
index 69dc7f3f0b..6e1e122c0c 100644
--- a/pymc3/tests/test_distributions.py
+++ b/pymc3/tests/test_distributions.py
@@ -378,7 +378,7 @@ def check_logp(self, model, value, domain, paramdomains, logp_reference, decimal
 
     def check_int_to_1(self, model, value, domain, paramdomains):
         pdf = model.fastfn(exp(model.logpt))
-        for pt in product(paramdomains, n_samples=100):
+        for pt in product(paramdomains, n_samples=10):
             pt = Point(pt, value=value.tag.test_value, model=model)
             bij = DictToVarBijection(value, (), pt)
             pdfx = bij.mapf(pdf)
diff --git a/pymc3/tests/test_stats.py b/pymc3/tests/test_stats.py
index 01c392460e..00150b05c5 100644
--- a/pymc3/tests/test_stats.py
+++ b/pymc3/tests/test_stats.py
@@ -47,6 +47,55 @@ def test_log_post_trace():
     npt.assert_allclose(logp, -0.5 * np.log(2 * np.pi), atol=1e-7)
 
 
+def test_compare():
+    np.random.seed(42)
+    x_obs = np.random.normal(0, 1, size=100)
+
+    with pm.Model() as model0:
+        mu = pm.Normal('mu', 0, 1)
+        x = pm.Normal('x', mu=mu, sd=1, observed=x_obs)
+        trace0 = pm.sample(1000)
+
+    with pm.Model() as model1:
+        mu = pm.Normal('mu', 0, 1)
+        x = pm.Normal('x', mu=mu, sd=0.8, observed=x_obs)
+        trace1 = pm.sample(1000)
+
+    with pm.Model() as model2:
+        mu = pm.Normal('mu', 0, 1)
+        x = pm.StudentT('x', nu=1, mu=mu, lam=1, observed=x_obs)
+        trace2 = pm.sample(1000)
+
+    traces = [trace0] * 2
+    models = [model0] * 2
+
+    w_st = pm.compare(traces, models, method='stacking')['weight']
+    w_bb_bma = pm.compare(traces, models, method='BB-pseudo-BMA')['weight']
+    w_bma = pm.compare(traces, models, method='pseudo-BMA')['weight']
+
+    assert_almost_equal(w_st[0], w_st[1])
+    assert_almost_equal(w_bb_bma[0], w_bb_bma[1])
+    assert_almost_equal(w_bma[0], w_bma[1])
+
+    assert_almost_equal(np.sum(w_st), 1.)
+    assert_almost_equal(np.sum(w_bb_bma), 1.)
+    assert_almost_equal(np.sum(w_bma), 1.)
+
+    traces = [trace0, trace1, trace2]
+    models = [model0, model1, model2]
+    w_st = pm.compare(traces, models, method='stacking')['weight']
+    w_bb_bma = pm.compare(traces, models, method='BB-pseudo-BMA')['weight']
+    w_bma = pm.compare(traces, models, method='pseudo-BMA')['weight']
+
+    assert(w_st[0] > w_st[1] > w_st[2])
+    assert(w_bb_bma[0] > w_bb_bma[1] > w_bb_bma[2])
+    assert(w_bma[0] > w_bma[1] > w_bma[2])
+
+    assert_almost_equal(np.sum(w_st), 1.)
+    assert_almost_equal(np.sum(w_st), 1.)
+    assert_almost_equal(np.sum(w_st), 1.)
+
+
 class TestStats(SeededTest):
     @classmethod
     def setup_class(cls):
diff --git a/pymc3/tuning/starting.py b/pymc3/tuning/starting.py
index 224e7406f5..9b75ab1d73 100644
--- a/pymc3/tuning/starting.py
+++ b/pymc3/tuning/starting.py
@@ -94,10 +94,20 @@ def find_MAP(start=None, vars=None, method=None, progressbar=True, return_raw=Fa
     logp_func = bij.mapf(model.fastlogp)
     x0 = bij.map(start)
 
+
+    logp = bij.mapf(model.fastlogp_nojac)
+    def logp_o(point):
+        return nan_to_high(-logp(point))
+
+    # Check to see if minimization function actually uses the gradient
     if method in ["CG", "BFGS", "Newton-CG", "L-BFGS-B", "TNC",
                   "SLSQP", "dogleg", "trust-ncg"]:
-        dlogp_func = bij.mapf(model.fastdlogp(vars))
-        cost_func = CostFuncWrapper(maxeval, progressbar, logp_func, dlogp_func)
+
+        dlogp = bij.mapf(model.fastdlogp_nojac(vars))
+        def grad_logp_o(point):
+            return nan_to_num(-dlogp(point))
+
+        cost_func = CostFuncWrapper(maxeval, progressbar, logp, dlogp)
         compute_gradient = True
     else:
         cost_func = CostFuncWrapper(maxeval, progressbar, logp_func)
@@ -229,4 +239,3 @@ def update_progress_desc(self, neg_value, grad=None):
         else:
             norm_grad = np.linalg.norm(grad)
             self.progress.set_description(self.desc.format(neg_value, norm_grad))
-