From b0025b03c0fee44d0ab5edc07add722c2723ed2e Mon Sep 17 00:00:00 2001
From: aloctavodia
Date: Thu, 5 Aug 2021 11:00:41 +0300
Subject: [PATCH 01/26] forward-porting from unreleased v3 plus generalization

---
 pymc3/distributions/bart.py  | 100 ++++++++++++++++++++++++++---------
 pymc3/distributions/tree.py  |  10 ++--
 pymc3/step_methods/pgbart.py |  89 +++++++++++++++----------------
 3 files changed, 124 insertions(+), 75 deletions(-)

diff --git a/pymc3/distributions/bart.py b/pymc3/distributions/bart.py
index 6d4256c323..b9d75c6cda 100644
--- a/pymc3/distributions/bart.py
+++ b/pymc3/distributions/bart.py
@@ -15,6 +15,7 @@
 import numpy as np
 
 from pandas import DataFrame, Series
+from scipy.special import expit
 
 from pymc3.distributions.distribution import NoDistribution
 from pymc3.distributions.tree import LeafNode, SplitNode, Tree
@@ -23,11 +24,23 @@
 
 
 class BaseBART(NoDistribution):
-    def __init__(self, X, Y, m=200, alpha=0.25, split_prior=None, *args, **kwargs):
-
+    def __init__(
+        self,
+        X,
+        Y,
+        m=50,
+        alpha=0.25,
+        k=2,
+        split_prior=None,
+        jitter=False,
+        *args,
+        **kwargs,
+    ):
+
+        self.jitter = jitter
         self.X, self.Y, self.missing_data = self.preprocess_XY(X, Y)
 
-        super().__init__(shape=X.shape[0], dtype="float64", initval=0, *args, **kwargs)
+        super().__init__(shape=X.shape[0], dtype="float64", testval=self.Y.mean(), *args, **kwargs)
 
         if self.X.ndim != 2:
             raise ValueError("The design matrix X must have two dimensions")
@@ -48,16 +61,26 @@ def __init__(self, X, Y, m=200, alpha=0.25, split_prior=None, *args, **kwargs):
             "The value for the alpha parameter for the tree structure "
             "must be in the interval (0, 1)"
         )
+        self.m = m
+        self.alpha = alpha
+
+        self.init_mean = self.Y.mean()
+        # if data is binary
+        if np.all(np.unique(self.Y) == [0, 1]):
+            self.mu_std = 6 / (k * self.m ** 0.5)
+        # maybe we need to check for count data
+        else:
+            self.mu_std = self.Y.std() / (k * self.m ** 0.5)
 
         self.num_observations = X.shape[0]
        self.num_variates = X.shape[1]
        self.available_predictors = list(range(self.num_variates))
        self.ssv = SampleSplittingVariable(split_prior, self.num_variates)
-        self.m = m
-        self.alpha = alpha
+        self.initial_value_leaf_nodes = self.init_mean / self.m
        self.trees = self.init_list_of_trees()
        self.all_trees = []
        self.mean = fast_mean()
+        self.normal = NormalSampler()
        self.prior_prob_leaf_node = compute_prior_probability(alpha)

    def preprocess_XY(self, X, Y):
@@ -66,11 +89,13 @@ def __init__(self, X, Y, m=200, alpha=0.25, split_prior=None, *args, **kwargs):
        if isinstance(X, (Series, DataFrame)):
            X = X.to_numpy()
        missing_data = np.any(np.isnan(X))
-        X = np.random.normal(X, np.std(X, 0) / 100)
+        if self.jitter:
+            X = np.random.normal(X, np.nanstd(X, 0) / 100000)
+        Y = Y.astype(float)
        return X, Y, missing_data

    def init_list_of_trees(self):
-        initial_value_leaf_nodes = self.Y.mean() / self.m
+        initial_value_leaf_nodes = self.initial_value_leaf_nodes
        initial_idx_data_points_leaf_nodes = np.array(range(self.num_observations), dtype="int32")
        list_of_trees = []
        for i in range(self.m):
@@ -84,7 +109,7 @@ def init_list_of_trees(self):
        # Diff trick to speed computation of residuals. From Section 3.1 of Kapelner, A and Bleich, J.
        # bartMachine: A Powerful Tool for Machine Learning in R. ArXiv e-prints, 2013
        # The sum_trees_output will contain the sum of the predicted output for all trees.
        # When R_j is needed we subtract the current predicted output for tree T_j.
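        # A sketch of the bookkeeping this trick implies (illustrative only;
        # the names below are hypothetical, not identifiers from this patch):
        #
        #     sum_trees_output = sum(t.predict_output() for t in trees)
        #     # residual seen by tree T_j, without re-summing the other m - 1 trees:
        #     R_j = Y - (sum_trees_output - trees[j].predict_output())
        #     # after tree T_j is replaced, refresh the running sum in O(n):
        #     sum_trees_output += new_pred_j - old_pred_j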
-        self.sum_trees_output = np.full_like(self.Y, self.Y.mean())
+        self.sum_trees_output = np.full_like(self.Y, self.init_mean)
 
         return list_of_trees
 
@@ -153,20 +178,13 @@ def get_new_idx_data_points(self, current_split_node, idx_data_points):
 
         return left_node_idx_data_points, right_node_idx_data_points
 
-    def get_residuals(self):
-        """Compute the residuals."""
-        R_j = self.Y - self.sum_trees_output
-        return R_j
-
-    def get_residuals_loo(self, tree):
-        """Compute the residuals without leaving the passed tree out."""
-        R_j = self.Y - (self.sum_trees_output - tree.predict_output(self.num_observations))
-        return R_j
-
     def draw_leaf_value(self, idx_data_points):
         """Draw the residual mean."""
-        R_j = self.get_residuals()[idx_data_points]
-        draw = self.mean(R_j)
+        Y_pred_idx = self.sum_trees_output[idx_data_points]
+        mu_mean = self.mean(Y_pred_idx) / self.m
+        # add cache of precomputed normal values
+        draw = self.normal.random() * self.mu_std + mu_mean
+
         return draw
 
     def predict(self, X_new):
@@ -174,7 +192,6 @@ def predict(self, X_new):
         trees = self.all_trees
         num_observations = X_new.shape[0]
         pred = np.zeros((len(trees), num_observations))
-        np.random.randint(len(trees))
         for draw, trees_to_sum in enumerate(trees):
             new_Y = np.zeros(num_observations)
             for tree in trees_to_sum:
@@ -232,6 +249,20 @@ def discrete_uniform_sampler(upper_value):
     return int(np.random.random() * upper_value)
 
 
+class NormalSampler:
+    def __init__(self):
+        self.size = 1000
+        self.cache = []
+
+    def random(self):
+        if not self.cache:
+            self.update()
+        return self.cache.pop()
+
+    def update(self):
+        self.cache = np.random.normal(loc=0.0, scale=1, size=self.size).tolist()
+
+
 class SampleSplittingVariable:
     def __init__(self, prior, num_variates):
         self.prior = prior
@@ -272,13 +303,32 @@ class BART(BaseBART):
     m : int
         Number of trees
     alpha : float
-        Control the prior probability over the depth of the trees. Must be in the interval (0, 1),
-        altought it is recomenned to be in the interval (0, 0.5].
+        Control the prior probability over the depth of the trees. Even when it can take values in
+        the interval (0, 1), it is recommended to be in the interval (0, 0.5].
+    k : int
+        Scale parameter for the values of the leaf nodes. Recommended to be between 1 and 3.
     split_prior : array-like
         Each element of split_prior should be in the [0, 1] interval and the elements should sum
         to 1. Otherwise they will be normalized.
         Defaults to None, all variable have the same a prior probability
+    jitter : bool
+        Whether to jitter the X values or not. Defaults to False. When values of X are repeated,
+        jittering X has the effect of increasing the number of effective splitting variables,
+        otherwise it does not have any effect.
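+
+    Example
+    -------
+    An illustrative sketch only; the surrounding model and variable names here
+    are hypothetical, not part of this changeset:
+
+    .. code-block:: python
+
+        with pm.Model() as model:
+            mu = pm.BART("mu", X, Y, m=50)
+            sigma = pm.HalfNormal("sigma", 1)
+            y = pm.Normal("y", mu, sigma, observed=Y)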
""" - def __init__(self, X, Y, m=200, alpha=0.25, split_prior=None): - super().__init__(X, Y, m, alpha, split_prior) + def __init__(self, X, Y, m=50, alpha=0.25, k=2, split_prior=None, jitter=False): + super().__init__(X, Y, m, alpha, k, split_prior, jitter) + + def _str_repr(self, name=None, dist=None, formatting="plain"): + if dist is None: + dist = self + X = (type(self.X),) + Y = (type(self.Y),) + alpha = self.alpha + m = self.m + + if "latex" in formatting: + return f"$\\text{{{name}}} \\sim \\text{{BART}}(\\text{{alpha = }}\\text{{{alpha}}}, \\text{{m = }}\\text{{{m}}})$" + else: + return f"{name} ~ BART(alpha = {alpha}, m = {m})" diff --git a/pymc3/distributions/tree.py b/pymc3/distributions/tree.py index 8e84bd9a7c..12f7dc6285 100644 --- a/pymc3/distributions/tree.py +++ b/pymc3/distributions/tree.py @@ -45,12 +45,13 @@ class Tree: tree_id : int, optional """ - def __init__(self, tree_id=0): + def __init__(self, tree_id=0, num_observations=0): self.tree_structure = {} self.num_nodes = 0 self.idx_leaf_nodes = [] self.idx_prunable_split_nodes = [] self.tree_id = tree_id + self.num_observations = num_observations def __getitem__(self, index): return self.get_node(index) @@ -77,11 +78,12 @@ def delete_node(self, index): del self.tree_structure[index] self.num_nodes -= 1 - def predict_output(self, num_observations): - output = np.zeros(num_observations) + def predict_output(self): + output = np.zeros(self.num_observations) for node_index in self.idx_leaf_nodes: current_node = self.get_node(node_index) output[current_node.idx_data_points] = current_node.value + return output def predict_out_of_sample(self, x): @@ -163,7 +165,7 @@ def init_tree(tree_id, leaf_node_value, idx_data_points): ------- """ - new_tree = Tree(tree_id) + new_tree = Tree(tree_id, len(idx_data_points)) new_tree[0] = LeafNode(index=0, value=leaf_node_value, idx_data_points=idx_data_points) return new_tree diff --git a/pymc3/step_methods/pgbart.py b/pymc3/step_methods/pgbart.py index b3b00bfa52..321c18fcc7 100644 --- a/pymc3/step_methods/pgbart.py +++ b/pymc3/step_methods/pgbart.py @@ -15,14 +15,15 @@ import logging import numpy as np +from scipy.stats import triang -from aesara import function as aesara_function +from theano import function as theano_function -from pymc3.aesaraf import inputvars, join_nonshared_inputs, make_shared_replacements from pymc3.distributions import BART from pymc3.distributions.tree import Tree from pymc3.model import modelcontext from pymc3.step_methods.arraystep import ArrayStepShared, Competence +from pymc3.aesaraf import inputvars, join_nonshared_inputs, make_shared_replacements _log = logging.getLogger("pymc3") @@ -59,7 +60,6 @@ class PGBART(ArrayStepShared): def __init__(self, vars=None, num_particles=10, max_stages=5000, chunk="auto", model=None): _log.warning("The BART model is experimental. 
Use with caution.") model = modelcontext(model) - initial_values = model.initial_point vars = inputvars(vars) self.bart = vars[0].distribution @@ -76,27 +76,27 @@ def __init__(self, vars=None, num_particles=10, max_stages=5000, chunk="auto", m self.log_num_particles = np.log(num_particles) self.indices = list(range(1, num_particles)) self.max_stages = max_stages + + shared = make_shared_replacements(vars, model) + self.likelihood_logp = logp([model.datalogpt], vars, shared) + self.init_leaf_nodes = self.bart.initial_value_leaf_nodes + self.init_likelihood = self.likelihood_logp(self.bart.sum_trees_output) + self.init_log_weight = self.init_likelihood - self.log_num_particles self.old_trees_particles_list = [] for i in range(self.bart.m): - p = ParticleTree(self.bart.trees[i], self.bart.prior_prob_leaf_node) + p = ParticleTree( + self.bart.trees[i], + self.bart.prior_prob_leaf_node, + self.init_log_weight, + self.init_likelihood, + ) self.old_trees_particles_list.append(p) - - shared = make_shared_replacements(initial_values, vars, model) - self.likelihood_logp = logp(initial_values, [model.datalogpt], vars, shared) super().__init__(vars, shared) def astep(self, _): bart = self.bart - num_observations = bart.num_observations - variable_inclusion = np.zeros(bart.num_variates, dtype="int") - # For the tunning phase we restrict max_stages to a low number, otherwise it is almost sure - # we will reach max_stages given that our first set of m trees is not good at all. - # Can set max_stages as a function of the number of variables/dimensions? - if self.tune: - max_stages = 5 - else: - max_stages = self.max_stages + variable_inclusion = np.zeros(bart.num_variates, dtype="int") if self.idx == bart.m: self.idx = 0 @@ -106,12 +106,12 @@ def astep(self, _): break self.idx += 1 tree = bart.trees[idx] - R_j = bart.get_residuals_loo(tree) + sum_trees_output_noi = bart.sum_trees_output - tree.predict_output() # Generate an initial set of SMC particles # at the end of the algorithm we return one of these particles as the new tree - particles = self.init_particles(tree.tree_id, R_j, num_observations) + particles = self.init_particles(tree.tree_id) - for t in range(1, max_stages): + for t in range(1, self.max_stages): # Get old particle at stage t particles[0] = self.get_old_tree_particle(tree.tree_id, t) # sample each particle (try to grow each tree) @@ -119,16 +119,18 @@ def astep(self, _): particles[c].sample_tree_sequential(bart) # Update weights. 
Since the prior is used as the proposal,the weights # are updated additively as the ratio of the new and old log_likelihoods - for p_idx, p in enumerate(particles): - new_likelihood = self.likelihood_logp(p.tree.predict_output(num_observations)) + for p in particles: + new_likelihood = self.likelihood_logp( + sum_trees_output_noi + p.tree.predict_output() + ) p.log_weight += new_likelihood - p.old_likelihood_logp p.old_likelihood_logp = new_likelihood # Normalize weights W, normalized_weights = self.normalize(particles) - # Resample all but first particle re_n_w = normalized_weights[1:] / normalized_weights[1:].sum() + new_indices = np.random.choice(self.indices, size=len(self.indices), p=re_n_w) particles[1:] = particles[new_indices] @@ -149,8 +151,7 @@ def astep(self, _): new_tree = np.random.choice(particles, p=normalized_weights) self.old_trees_particles_list[tree.tree_id] = new_tree bart.trees[idx] = new_tree.tree - new_prediction = new_tree.tree.predict_output(num_observations) - bart.sum_trees_output = bart.Y - R_j + new_prediction + bart.sum_trees_output = sum_trees_output_noi + new_tree.tree.predict_output() if not self.tune: self.iter += 1 @@ -162,7 +163,6 @@ def astep(self, _): variable_inclusion[index] += 1 stats = {"variable_inclusion": variable_inclusion} - return bart.sum_trees_output, [stats] @staticmethod @@ -170,8 +170,7 @@ def competence(var, has_grad): """ PGBART is only suitable for BART distributions """ - dist = getattr(var.owner, "op", None) - if isinstance(dist, BART): + if isinstance(var.distribution, BART): return Competence.IDEAL return Competence.INCOMPATIBLE @@ -196,30 +195,27 @@ def get_old_tree_particle(self, tree_id, t): old_tree_particle.set_particle_to_step(t) return old_tree_particle - def init_particles(self, tree_id, R_j, num_observations): + def init_particles(self, tree_id): """ Initialize particles """ - # The first particle is from the tree we are trying to replace prev_tree = self.get_old_tree_particle(tree_id, 0) - likelihood = self.likelihood_logp(prev_tree.tree.predict_output(num_observations)) + likelihood = self.likelihood_logp(prev_tree.tree.predict_output()) prev_tree.old_likelihood_logp = likelihood prev_tree.log_weight = likelihood - self.log_num_particles particles = [prev_tree] - # The rest of the particles are identically initialized - initial_value_leaf_nodes = R_j.mean() - initial_idx_data_points_leaf_nodes = np.arange(num_observations, dtype="int32") + initial_idx_data_points_leaf_nodes = np.arange(self.bart.num_observations, dtype="int32") new_tree = Tree.init_tree( tree_id=tree_id, - leaf_node_value=initial_value_leaf_nodes, + leaf_node_value=self.init_leaf_nodes, idx_data_points=initial_idx_data_points_leaf_nodes, ) - likelihood_logp = self.likelihood_logp(new_tree.predict_output(num_observations)) - log_weight = likelihood_logp - self.log_num_particles - for i in range(1, self.num_particles): + + prior_prob = self.bart.prior_prob_leaf_node + for _ in range(1, self.num_particles): particles.append( - ParticleTree(new_tree, self.bart.prior_prob_leaf_node, log_weight, likelihood_logp) + ParticleTree(new_tree, prior_prob, self.init_log_weight, self.init_likelihood) ) return np.array(particles) @@ -239,10 +235,10 @@ class ParticleTree: def __init__(self, tree, prior_prob_leaf_node, log_weight=0, likelihood=0): self.tree = tree.copy() # keeps the tree that we care at the moment - self.expansion_nodes = tree.idx_leaf_nodes.copy() # This should be the array [0] + self.expansion_nodes = [0] self.tree_history = [self.tree] 
self.expansion_nodes_history = [self.expansion_nodes] - self.log_weight = 0 + self.log_weight = log_weight self.prior_prob_leaf_node = prior_prob_leaf_node self.old_likelihood_logp = likelihood self.used_variates = [] @@ -255,7 +251,8 @@ def sample_tree_sequential(self, bart): if prob_leaf < np.random.random(): grow_successful, index_selected_predictor = bart.grow_tree( - self.tree, index_leaf_node + self.tree, + index_leaf_node, ) if grow_successful: # Add new leaf nodes indexes @@ -275,8 +272,8 @@ def set_particle_to_step(self, t): self.expansion_nodes = self.expansion_nodes_history[t] -def logp(point, out_vars, vars, shared): - """Compile Aesara function of the model and the input and output variables. +def logp(out_vars, vars, shared): + """Compile Theano function of the model and the input and output variables. Parameters ---------- @@ -285,9 +282,9 @@ def logp(point, out_vars, vars, shared): vars: List containing :class:`pymc3.Distribution` for the input variables shared: List - containing :class:`aesara.tensor.Tensor` for depended shared data + containing :class:`theano.tensor.Tensor` for depended shared data """ - out_list, inarray0 = join_nonshared_inputs(point, out_vars, vars, shared) - f = aesara_function([inarray0], out_list[0]) + out_list, inarray0 = join_nonshared_inputs(out_vars, vars, shared) + f = theano_function([inarray0], out_list[0]) f.trust_input = True return f From 4224969c3634aa7c45d8df8cf1eca6cd2f84463c Mon Sep 17 00:00:00 2001 From: aloctavodia Date: Fri, 6 Aug 2021 13:55:48 +0300 Subject: [PATCH 02/26] aesarize --- pymc3/distributions/bart.py | 375 ++++++++------------------------- pymc3/sampling.py | 14 +- pymc3/step_methods/hmc/nuts.py | 5 +- pymc3/step_methods/pgbart.py | 332 +++++++++++++++++++++++++---- 4 files changed, 389 insertions(+), 337 deletions(-) diff --git a/pymc3/distributions/bart.py b/pymc3/distributions/bart.py index b9d75c6cda..775f4ff550 100644 --- a/pymc3/distributions/bart.py +++ b/pymc3/distributions/bart.py @@ -14,40 +14,81 @@ import numpy as np -from pandas import DataFrame, Series -from scipy.special import expit - +from aesara.tensor.random.op import RandomVariable, default_shape_from_params from pymc3.distributions.distribution import NoDistribution -from pymc3.distributions.tree import LeafNode, SplitNode, Tree + __all__ = ["BART"] -class BaseBART(NoDistribution): - def __init__( - self, +class BARTRV(RandomVariable): + name = "bart" + ndim_supp = 1 + ndims_params = [2, 1, 0, 0, 0] + dtype = "floatX" + _print_name = ("BART", "\\operatorname{BART}") + all_trees = None + + def _shape_from_params(self, dist_params, rep_param_idx=1, param_shapes=None): + return default_shape_from_params(self.ndim_supp, dist_params, rep_param_idx, param_shapes) + + def __new__(cls, *args, **kwargs): + return super().__new__(cls) + + @classmethod + def rng_fn(cls, X_new=None, *args, **kwargs): + # in old version X_new could be passed by the user to get out-of-sample predictions + + all_trees = cls.all_trees + if all_trees: + pred = 0 + idx = np.random.randint(len(all_trees)) + trees = all_trees[idx] + if X_new is None: + for tree in trees: + pred += tree.predict_output() + else: + for tree in trees: + pred += np.array([tree.predict_out_of_sample(x) for x in X_new]) + return pred + # XXX check why I did not make this random + # pred = np.zeros((len(trees), num_observations)) + # for draw, trees_to_sum in enumerate(trees): + # new_Y = np.zeros(num_observations) + # for tree in trees_to_sum: + # new_Y += [tree.predict_out_of_sample(x) for x in X_new] + # 
pred[draw] = new_Y + # return pred + else: + return np.full_like(cls.Y, cls.Y.mean()) + + +bart = BARTRV() + + +class BART(NoDistribution): + """Improper flat prior over the positive reals.""" + + def __new__( + cls, + name, X, Y, m=50, alpha=0.25, k=2, - split_prior=None, - jitter=False, - *args, + ndim_supp=1, + ndims_params=[2, 1, 0, 0, 0], + dtype="floatX", **kwargs, ): - self.jitter = jitter - self.X, self.Y, self.missing_data = self.preprocess_XY(X, Y) - - super().__init__(shape=X.shape[0], dtype="float64", testval=self.Y.mean(), *args, **kwargs) - - if self.X.ndim != 2: + if X.ndim != 2: raise ValueError("The design matrix X must have two dimensions") - if self.Y.ndim != 1: + if Y.ndim != 1: raise ValueError("The response matrix Y must have one dimension") - if self.X.shape[0] != self.Y.shape[0]: + if X.shape[0] != Y.shape[0]: raise ValueError( "The design matrix X and the response matrix Y must have the same number of elements" ) @@ -61,274 +102,32 @@ def __init__( "The value for the alpha parameter for the tree structure " "must be in the interval (0, 1)" ) - self.m = m - self.alpha = alpha - - self.init_mean = self.Y.mean() - # if data is binary - if np.all(np.unique(self.Y) == [0, 1]): - self.mu_std = 6 / (k * self.m ** 0.5) - # maybe we need to check for count data - else: - self.mu_std = self.Y.std() / (k * self.m ** 0.5) - - self.num_observations = X.shape[0] - self.num_variates = X.shape[1] - self.available_predictors = list(range(self.num_variates)) - self.ssv = SampleSplittingVariable(split_prior, self.num_variates) - self.initial_value_leaf_nodes = self.init_mean / self.m - self.trees = self.init_list_of_trees() - self.all_trees = [] - self.mean = fast_mean() - self.normal = NormalSampler() - self.prior_prob_leaf_node = compute_prior_probability(alpha) - - def preprocess_XY(self, X, Y): - if isinstance(Y, (Series, DataFrame)): - Y = Y.to_numpy() - if isinstance(X, (Series, DataFrame)): - X = X.to_numpy() - missing_data = np.any(np.isnan(X)) - if self.jitter: - X = np.random.normal(X, np.nanstd(X, 0) / 100000) - Y = Y.astype(float) - return X, Y, missing_data - - def init_list_of_trees(self): - initial_value_leaf_nodes = self.initial_value_leaf_nodes - initial_idx_data_points_leaf_nodes = np.array(range(self.num_observations), dtype="int32") - list_of_trees = [] - for i in range(self.m): - new_tree = Tree.init_tree( - tree_id=i, - leaf_node_value=initial_value_leaf_nodes, - idx_data_points=initial_idx_data_points_leaf_nodes, - ) - list_of_trees.append(new_tree) - # Diff trick to speed computation of residuals. From Section 3.1 of Kapelner, A and Bleich, J. - # bartMachine: A Powerful Tool for Machine Learning in R. ArXiv e-prints, 2013 - # The sum_trees_output will contain the sum of the predicted output for all trees. - # When R_j is needed we subtract the current predicted output for tree T_j. - self.sum_trees_output = np.full_like(self.Y, self.init_mean) - - return list_of_trees - - def __iter__(self): - return iter(self.trees) - - def __repr_latex(self): - raise NotImplementedError - - def get_available_splitting_rules(self, idx_data_points_split_node, idx_split_variable): - x_j = self.X[idx_data_points_split_node, idx_split_variable] - if self.missing_data: - x_j = x_j[~np.isnan(x_j)] - values = np.unique(x_j) - # The last value is never available as it would leave the right subtree empty. 
- return values[:-1] - - def grow_tree(self, tree, index_leaf_node): - current_node = tree.get_node(index_leaf_node) - - index_selected_predictor = self.ssv.rvs() - selected_predictor = self.available_predictors[index_selected_predictor] - available_splitting_rules = self.get_available_splitting_rules( - current_node.idx_data_points, selected_predictor - ) - # This can be unsuccessful when there are not available splitting rules - if available_splitting_rules.size == 0: - return False, None - - index_selected_splitting_rule = discrete_uniform_sampler(len(available_splitting_rules)) - selected_splitting_rule = available_splitting_rules[index_selected_splitting_rule] - new_split_node = SplitNode( - index=index_leaf_node, - idx_split_variable=selected_predictor, - split_value=selected_splitting_rule, - ) - - left_node_idx_data_points, right_node_idx_data_points = self.get_new_idx_data_points( - new_split_node, current_node.idx_data_points - ) - - left_node_value = self.draw_leaf_value(left_node_idx_data_points) - right_node_value = self.draw_leaf_value(right_node_idx_data_points) - - new_left_node = LeafNode( - index=current_node.get_idx_left_child(), - value=left_node_value, - idx_data_points=left_node_idx_data_points, - ) - new_right_node = LeafNode( - index=current_node.get_idx_right_child(), - value=right_node_value, - idx_data_points=right_node_idx_data_points, - ) - tree.grow_tree(index_leaf_node, new_split_node, new_left_node, new_right_node) - - return True, index_selected_predictor - - def get_new_idx_data_points(self, current_split_node, idx_data_points): - idx_split_variable = current_split_node.idx_split_variable - split_value = current_split_node.split_value - - left_idx = self.X[idx_data_points, idx_split_variable] <= split_value - left_node_idx_data_points = idx_data_points[left_idx] - right_node_idx_data_points = idx_data_points[~left_idx] - - return left_node_idx_data_points, right_node_idx_data_points - def draw_leaf_value(self, idx_data_points): - """Draw the residual mean.""" - Y_pred_idx = self.sum_trees_output[idx_data_points] - mu_mean = self.mean(Y_pred_idx) / self.m - # add cache of precomputed normal values - draw = self.normal.random() * self.mu_std + mu_mean - - return draw - - def predict(self, X_new): - """Compute out of sample predictions evaluated at X_new""" - trees = self.all_trees - num_observations = X_new.shape[0] - pred = np.zeros((len(trees), num_observations)) - for draw, trees_to_sum in enumerate(trees): - new_Y = np.zeros(num_observations) - for tree in trees_to_sum: - new_Y += [tree.predict_out_of_sample(x) for x in X_new] - pred[draw] = new_Y - return pred - - -def compute_prior_probability(alpha): - """ - Calculate the probability of the node being a LeafNode (1 - p(being SplitNode)). - Taken from equation 19 in [Rockova2018]. - - Parameters - ---------- - alpha : float - - Returns - ------- - list with probabilities for leaf nodes - - References - ---------- - .. [Rockova2018] Veronika Rockova, Enakshi Saha (2018). On the theory of BART. 
- arXiv, `link `__ - """ - prior_leaf_prob = [0] - depth = 1 - while prior_leaf_prob[-1] < 1: - prior_leaf_prob.append(1 - alpha ** depth) - depth += 1 - return prior_leaf_prob - - -def fast_mean(): - """If available use Numba to speed up the computation of the mean.""" - try: - from numba import jit - except ImportError: - return np.mean - - @jit - def mean(a): - count = a.shape[0] - suma = 0 - for i in range(count): - suma += a[i] - return suma / count - - return mean - - -def discrete_uniform_sampler(upper_value): - """Draw from the uniform distribution with bounds [0, upper_value).""" - return int(np.random.random() * upper_value) - - -class NormalSampler: - def __init__(self): - self.size = 1000 - self.cache = [] - - def random(self): - if not self.cache: - self.update() - return self.cache.pop() - - def update(self): - self.cache = np.random.normal(loc=0.0, scale=1, size=self.size).tolist() - - -class SampleSplittingVariable: - def __init__(self, prior, num_variates): - self.prior = prior - self.num_variates = num_variates - - if self.prior is not None: - self.prior = np.asarray(self.prior) - self.prior = self.prior / self.prior.sum() - if self.prior.size != self.num_variates: - raise ValueError( - f"The size of split_prior ({self.prior.size}) should be the " - f"same as the number of covariates ({self.num_variates})" - ) - self.enu = list(enumerate(np.cumsum(self.prior))) - - def rvs(self): - if self.prior is None: - return int(np.random.random() * self.num_variates) - else: - r = np.random.random() - for i, v in self.enu: - if r <= v: - return i - - -class BART(BaseBART): - """ - BART distribution. - - Distribution representing a sum over trees - - Parameters - ---------- - X : array-like - The design matrix. - Y : array-like - The response vector. - m : int - Number of trees - alpha : float - Control the prior probability over the depth of the trees. Even when it can takes values in - the interval (0, 1), it is recommended to be in the interval (0, 0.5]. - k : int - LALALA. Recomended to be between 1 and 3 - split_prior : array-like - Each element of split_prior should be in the [0, 1] interval and the elements should sum - to 1. Otherwise they will be normalized. - Defaults to None, all variable have the same a prior probability - jitter : bool - Whether to jitter the X values or not. Defaults to False. When values of X are repeated, - jittering X has the effect of increasing the number of effective spliting variables, - otherwise it does not have any effect. 
- """ - - def __init__(self, X, Y, m=50, alpha=0.25, k=2, split_prior=None, jitter=False): - super().__init__(X, Y, m, alpha, k, split_prior, jitter) - - def _str_repr(self, name=None, dist=None, formatting="plain"): - if dist is None: - dist = self - X = (type(self.X),) - Y = (type(self.Y),) - alpha = self.alpha - m = self.m - - if "latex" in formatting: - return f"$\\text{{{name}}} \\sim \\text{{BART}}(\\text{{alpha = }}\\text{{{alpha}}}, \\text{{m = }}\\text{{{m}}})$" - else: - return f"{name} ~ BART(alpha = {alpha}, m = {m})" + cls.all_trees = [] + bart_op = type( + f"BART_{name}", + (BARTRV,), + dict( + name="BART", + all_trees=cls.all_trees, + ndim_supp=ndim_supp, + ndims_params=ndims_params, + dtype=dtype, + inplace=False, + X=X, + Y=Y, + m=m, + alpha=alpha, + k=k, + ), + )() + + NoDistribution.register(BARTRV) + + cls.rv_op = bart_op + params = [X, Y, m, alpha, k] + return super().__new__(cls, name, *params, **kwargs) + + @classmethod + def dist(cls, *params, **kwargs): + return super().dist(params, **kwargs) diff --git a/pymc3/sampling.py b/pymc3/sampling.py index f032ad1fda..0a8f97e8e9 100644 --- a/pymc3/sampling.py +++ b/pymc3/sampling.py @@ -42,6 +42,7 @@ from pymc3.backends.base import BaseTrace, MultiTrace from pymc3.backends.ndarray import NDArray from pymc3.blocking import DictToArrayBijection +from pymc3.distributions.bart import BARTRV from pymc3.exceptions import IncorrectArgumentsError, SamplingError from pymc3.model import Model, Point, modelcontext from pymc3.parallel_sampling import Draw, _cpu_count @@ -232,13 +233,14 @@ def _print_step_hierarchy(s: Step, level=0) -> None: _log.info(">" * level + f"{s.__class__.__name__}: [{varnames}]") -def all_continuous(vars): +def all_continuous(vars, model): """Check that vars not include discrete variables or BART variables, excepting observed RVs.""" vars_ = [var for var in vars if not (var.owner and hasattr(var.tag, "observations"))] + if any( [ - (var.dtype in discrete_types or (var.owner and isinstance(var.owner.op, pm.BART))) + (var.dtype in discrete_types or isinstance(model.values_to_rvs[var].owner.op, BARTRV)) for var in vars_ ] ): @@ -499,7 +501,7 @@ def sample( draws += tune - if step is None and init is not None and all_continuous(model.value_vars): + if step is None and init is not None and all_continuous(model.value_vars, model): try: # By default, try to use NUTS _log.info("Auto-assigning NUTS sampler...") @@ -634,10 +636,6 @@ def sample( trace.report._n_draws = n_draws trace.report._t_sampling = t_sampling - if "variable_inclusion" in trace.stat_names: - variable_inclusion = np.stack(trace.get_sampler_stats("variable_inclusion")).mean(0) - trace.report.variable_importance = variable_inclusion / variable_inclusion.sum() - n_chains = len(trace.chains) _log.info( f'Sampling {n_chains} chain{"s" if n_chains > 1 else ""} for {n_tune:_d} tune and {n_draws:_d} draw iterations ' @@ -2128,7 +2126,7 @@ def init_nuts( vars = kwargs.get("vars", model.value_vars) if set(vars) != set(model.value_vars): raise ValueError("Must use init_nuts on all variables of a model.") - if not all_continuous(vars): + if not all_continuous(vars, model): raise ValueError("init_nuts can only be used for models with only " "continuous variables.") if not isinstance(init, str): diff --git a/pymc3/step_methods/hmc/nuts.py b/pymc3/step_methods/hmc/nuts.py index dca02a6c9b..267c20659f 100644 --- a/pymc3/step_methods/hmc/nuts.py +++ b/pymc3/step_methods/hmc/nuts.py @@ -18,7 +18,7 @@ from pymc3.aesaraf import floatX from pymc3.backends.report import 
SamplerWarning, WarningType -from pymc3.distributions import BART +from pymc3.distributions.bart import BARTRV from pymc3.math import logbern, logdiffexp_numpy from pymc3.step_methods.arraystep import Competence from pymc3.step_methods.hmc.base_hmc import BaseHMC, DivergenceInfo, HMCStepData @@ -198,8 +198,9 @@ def _hamiltonian_step(self, start, p0, step_size): @staticmethod def competence(var, has_grad): """Check how appropriate this class is for sampling a random variable.""" + dist = getattr(var.owner, "op", None) - if var.dtype in continuous_types and has_grad and not isinstance(dist, BART): + if var.dtype in continuous_types and has_grad and not isinstance(dist, BARTRV): return Competence.IDEAL return Competence.INCOMPATIBLE diff --git a/pymc3/step_methods/pgbart.py b/pymc3/step_methods/pgbart.py index 321c18fcc7..eeb91884f1 100644 --- a/pymc3/step_methods/pgbart.py +++ b/pymc3/step_methods/pgbart.py @@ -15,15 +15,17 @@ import logging import numpy as np -from scipy.stats import triang +from pandas import DataFrame, Series -from theano import function as theano_function +from aesara import function as aesara_function -from pymc3.distributions import BART -from pymc3.distributions.tree import Tree +from pymc3.aesaraf import inputvars, join_nonshared_inputs, make_shared_replacements +from pymc3.distributions.bart import BARTRV +from pymc3.distributions.tree import LeafNode, SplitNode, Tree from pymc3.model import modelcontext from pymc3.step_methods.arraystep import ArrayStepShared, Competence -from pymc3.aesaraf import inputvars, join_nonshared_inputs, make_shared_replacements +from pymc3.blocking import RaveledVars + _log = logging.getLogger("pymc3") @@ -60,8 +62,35 @@ class PGBART(ArrayStepShared): def __init__(self, vars=None, num_particles=10, max_stages=5000, chunk="auto", model=None): _log.warning("The BART model is experimental. 
Use with caution.") model = modelcontext(model) - vars = inputvars(vars) - self.bart = vars[0].distribution + initial_values = model.initial_point + value_bart = inputvars(vars)[0] + self.bart = model.values_to_rvs[value_bart].owner.op + + self.X, self.Y, self.missing_data = preprocess_XY(self.bart.X, self.bart.Y) + self.m = self.bart.m + self.alpha = self.bart.alpha + self.k = self.bart.k + + self.init_mean = self.Y.mean() + # if data is binary + if np.all(np.unique(self.Y) == [0, 1]): + self.mu_std = 6 / (self.k * self.m ** 0.5) + # maybe we need to check for count data + else: + self.mu_std = self.Y.std() / (self.k * self.m ** 0.5) + + self.num_observations = self.X.shape[0] + self.num_variates = self.X.shape[1] + self.available_predictors = list(range(self.num_variates)) + self.ssv = SampleSplittingVariable(None, self.num_variates) + self.initial_value_leaf_nodes = self.init_mean / self.m + + self.trees, self.sum_trees_output = init_list_of_trees( + self.initial_value_leaf_nodes, self.num_observations, self.m, self.Y, self.init_mean + ) + self.mean = fast_mean() + self.normal = NormalSampler() + self.prior_prob_leaf_node = compute_prior_probability(self.alpha) self.tune = True self.idx = 0 @@ -70,43 +99,43 @@ def __init__(self, vars=None, num_particles=10, max_stages=5000, chunk="auto", m self.chunk = chunk if chunk == "auto": - self.chunk = max(1, int(self.bart.m * 0.1)) - self.bart.chunk = self.chunk + self.chunk = max(1, int(self.m * 0.1)) + # self.bart.chunk = self.chunk self.num_particles = num_particles self.log_num_particles = np.log(num_particles) self.indices = list(range(1, num_particles)) self.max_stages = max_stages - shared = make_shared_replacements(vars, model) - self.likelihood_logp = logp([model.datalogpt], vars, shared) - self.init_leaf_nodes = self.bart.initial_value_leaf_nodes - self.init_likelihood = self.likelihood_logp(self.bart.sum_trees_output) + shared = make_shared_replacements(initial_values, vars, model) + self.likelihood_logp = logp(initial_values, [model.datalogpt], vars, shared) + self.init_leaf_nodes = self.initial_value_leaf_nodes + self.init_likelihood = self.likelihood_logp(self.sum_trees_output) self.init_log_weight = self.init_likelihood - self.log_num_particles self.old_trees_particles_list = [] - for i in range(self.bart.m): + for i in range(self.m): p = ParticleTree( - self.bart.trees[i], - self.bart.prior_prob_leaf_node, + self.trees[i], + self.prior_prob_leaf_node, self.init_log_weight, self.init_likelihood, ) self.old_trees_particles_list.append(p) super().__init__(vars, shared) - def astep(self, _): - bart = self.bart + def astep(self, q): + point_map_info = q.point_map_info - variable_inclusion = np.zeros(bart.num_variates, dtype="int") + variable_inclusion = np.zeros(self.num_variates, dtype="int") - if self.idx == bart.m: + if self.idx == self.m: self.idx = 0 for idx in range(self.idx, self.idx + self.chunk): - if idx >= bart.m: + if idx >= self.m: break self.idx += 1 - tree = bart.trees[idx] - sum_trees_output_noi = bart.sum_trees_output - tree.predict_output() + tree = self.trees[idx] + sum_trees_output_noi = self.sum_trees_output - tree.predict_output() # Generate an initial set of SMC particles # at the end of the algorithm we return one of these particles as the new tree particles = self.init_particles(tree.tree_id) @@ -116,7 +145,17 @@ def astep(self, _): particles[0] = self.get_old_tree_particle(tree.tree_id, t) # sample each particle (try to grow each tree) for c in range(1, self.num_particles): - 
particles[c].sample_tree_sequential(bart) + particles[c].sample_tree_sequential( + self.ssv, + self.available_predictors, + self.X, + self.missing_data, + self.sum_trees_output, + self.mean, + self.m, + self.normal, + self.mu_std, + ) # Update weights. Since the prior is used as the proposal,the weights # are updated additively as the ratio of the new and old log_likelihoods for p in particles: @@ -150,27 +189,31 @@ def astep(self, _): # Get the new tree and update new_tree = np.random.choice(particles, p=normalized_weights) self.old_trees_particles_list[tree.tree_id] = new_tree - bart.trees[idx] = new_tree.tree - bart.sum_trees_output = sum_trees_output_noi + new_tree.tree.predict_output() + self.trees[idx] = new_tree.tree + self.sum_trees_output = sum_trees_output_noi + new_tree.tree.predict_output() if not self.tune: self.iter += 1 self.sum_trees.append(new_tree.tree) - if not self.iter % bart.m: - bart.all_trees.append(self.sum_trees) + if not self.iter % self.m: + # update the all_trees variable in BARTRV to be used in the rng_fn method + # fails for chains > 1 + self.bart.all_trees.append(self.sum_trees) self.sum_trees = [] for index in new_tree.used_variates: variable_inclusion[index] += 1 stats = {"variable_inclusion": variable_inclusion} - return bart.sum_trees_output, [stats] + sum_trees_output = RaveledVars(self.sum_trees_output, point_map_info) + return sum_trees_output, [stats] @staticmethod def competence(var, has_grad): """ PGBART is only suitable for BART distributions """ - if isinstance(var.distribution, BART): + dist = getattr(var.owner, "op", None) + if isinstance(dist, BARTRV): return Competence.IDEAL return Competence.INCOMPATIBLE @@ -205,14 +248,14 @@ def init_particles(self, tree_id): prev_tree.log_weight = likelihood - self.log_num_particles particles = [prev_tree] - initial_idx_data_points_leaf_nodes = np.arange(self.bart.num_observations, dtype="int32") + initial_idx_data_points_leaf_nodes = np.arange(self.num_observations, dtype="int32") new_tree = Tree.init_tree( tree_id=tree_id, leaf_node_value=self.init_leaf_nodes, idx_data_points=initial_idx_data_points_leaf_nodes, ) - prior_prob = self.bart.prior_prob_leaf_node + prior_prob = self.prior_prob_leaf_node for _ in range(1, self.num_particles): particles.append( ParticleTree(new_tree, prior_prob, self.init_log_weight, self.init_likelihood) @@ -243,16 +286,27 @@ def __init__(self, tree, prior_prob_leaf_node, log_weight=0, likelihood=0): self.old_likelihood_logp = likelihood self.used_variates = [] - def sample_tree_sequential(self, bart): + def sample_tree_sequential( + self, ssv, available_predictors, X, missing_data, sum_trees_output, mean, m, normal, mu_std + ): if self.expansion_nodes: index_leaf_node = self.expansion_nodes.pop(0) # Probability that this node will remain a leaf node prob_leaf = self.prior_prob_leaf_node[self.tree[index_leaf_node].depth] if prob_leaf < np.random.random(): - grow_successful, index_selected_predictor = bart.grow_tree( + grow_successful, index_selected_predictor = grow_tree( self.tree, index_leaf_node, + ssv, + available_predictors, + X, + missing_data, + sum_trees_output, + mean, + m, + normal, + mu_std, ) if grow_successful: # Add new leaf nodes indexes @@ -272,8 +326,208 @@ def set_particle_to_step(self, t): self.expansion_nodes = self.expansion_nodes_history[t] -def logp(out_vars, vars, shared): - """Compile Theano function of the model and the input and output variables. 
+def preprocess_XY(X, Y): + if isinstance(Y, (Series, DataFrame)): + Y = Y.to_numpy() + if isinstance(X, (Series, DataFrame)): + X = X.to_numpy() + missing_data = np.any(np.isnan(X)) + Y = Y.astype(float) + return X, Y, missing_data + + +class SampleSplittingVariable: + def __init__(self, prior, num_variates): + self.prior = prior + self.num_variates = num_variates + + if self.prior is not None: + self.prior = np.asarray(self.prior) + self.prior = self.prior / self.prior.sum() + if self.prior.size != self.num_variates: + raise ValueError( + f"The size of split_prior ({self.prior.size}) should be the " + f"same as the number of covariates ({self.num_variates})" + ) + self.enu = list(enumerate(np.cumsum(self.prior))) + + def rvs(self): + if self.prior is None: + return int(np.random.random() * self.num_variates) + else: + r = np.random.random() + for i, v in self.enu: + if r <= v: + return i + + +def init_list_of_trees(initial_value_leaf_nodes, num_observations, m, Y, init_mean): + initial_value_leaf_nodes = initial_value_leaf_nodes + initial_idx_data_points_leaf_nodes = np.array(range(num_observations), dtype="int32") + list_of_trees = [] + for i in range(m): + new_tree = Tree.init_tree( + tree_id=i, + leaf_node_value=initial_value_leaf_nodes, + idx_data_points=initial_idx_data_points_leaf_nodes, + ) + list_of_trees.append(new_tree) + sum_trees_output = np.full_like(Y, init_mean) + + return list_of_trees, sum_trees_output + + +def compute_prior_probability(alpha): + """ + Calculate the probability of the node being a LeafNode (1 - p(being SplitNode)). + Taken from equation 19 in [Rockova2018]. + + Parameters + ---------- + alpha : float + + Returns + ------- + list with probabilities for leaf nodes + + References + ---------- + .. [Rockova2018] Veronika Rockova, Enakshi Saha (2018). On the theory of BART. 
+ arXiv, `link `__ + """ + prior_leaf_prob = [0] + depth = 1 + while prior_leaf_prob[-1] < 1: + prior_leaf_prob.append(1 - alpha ** depth) + depth += 1 + return prior_leaf_prob + + +def grow_tree( + tree, + index_leaf_node, + ssv, + available_predictors, + X, + missing_data, + sum_trees_output, + mean, + m, + normal, + mu_std, +): + current_node = tree.get_node(index_leaf_node) + + index_selected_predictor = ssv.rvs() + selected_predictor = available_predictors[index_selected_predictor] + available_splitting_rules = get_available_splitting_rules( + X[current_node.idx_data_points, selected_predictor], + missing_data, + ) + # This can be unsuccessful when there are not available splitting rules + if available_splitting_rules.size == 0: + return False, None + + index_selected_splitting_rule = discrete_uniform_sampler(len(available_splitting_rules)) + selected_splitting_rule = available_splitting_rules[index_selected_splitting_rule] + new_split_node = SplitNode( + index=index_leaf_node, + idx_split_variable=selected_predictor, + split_value=selected_splitting_rule, + ) + + left_node_idx_data_points, right_node_idx_data_points = get_new_idx_data_points( + new_split_node, current_node.idx_data_points, X + ) + + left_node_value = draw_leaf_value( + sum_trees_output[left_node_idx_data_points], mean, m, normal, mu_std + ) + right_node_value = draw_leaf_value( + sum_trees_output[right_node_idx_data_points], mean, m, normal, mu_std + ) + + new_left_node = LeafNode( + index=current_node.get_idx_left_child(), + value=left_node_value, + idx_data_points=left_node_idx_data_points, + ) + new_right_node = LeafNode( + index=current_node.get_idx_right_child(), + value=right_node_value, + idx_data_points=right_node_idx_data_points, + ) + tree.grow_tree(index_leaf_node, new_split_node, new_left_node, new_right_node) + + return True, index_selected_predictor + + +def get_available_splitting_rules(X_j, missing_data): + if missing_data: + X_j = X_j[~np.isnan(X_j)] + values = np.unique(X_j) + # The last value is never available as it would leave the right subtree empty. 
+ return values[:-1] + + +def get_new_idx_data_points(current_split_node, idx_data_points, X): + idx_split_variable = current_split_node.idx_split_variable + split_value = current_split_node.split_value + + left_idx = X[idx_data_points, idx_split_variable] <= split_value + left_node_idx_data_points = idx_data_points[left_idx] + right_node_idx_data_points = idx_data_points[~left_idx] + + return left_node_idx_data_points, right_node_idx_data_points + + +def draw_leaf_value(sum_trees_output_idx, mean, m, normal, mu_std): + """Draw Gaussian distributed leaf values""" + mu_mean = mean(sum_trees_output_idx) / m + # add cache of precomputed normal values + draw = normal.random() * mu_std + mu_mean + return draw + + +def fast_mean(): + """If available use Numba to speed up the computation of the mean.""" + try: + from numba import jit + except ImportError: + return np.mean + + @jit + def mean(a): + count = a.shape[0] + suma = 0 + for i in range(count): + suma += a[i] + return suma / count + + return mean + + +def discrete_uniform_sampler(upper_value): + """Draw from the uniform distribution with bounds [0, upper_value).""" + return int(np.random.random() * upper_value) + + +class NormalSampler: + def __init__(self): + self.size = 1000 + self.cache = [] + + def random(self): + if not self.cache: + self.update() + return self.cache.pop() + + def update(self): + self.cache = np.random.normal(loc=0.0, scale=1, size=self.size).tolist() + + +def logp(point, out_vars, vars, shared): + """Compile Aesara function of the model and the input and output variables. Parameters ---------- @@ -282,9 +536,9 @@ def logp(out_vars, vars, shared): vars: List containing :class:`pymc3.Distribution` for the input variables shared: List - containing :class:`theano.tensor.Tensor` for depended shared data + containing :class:`aesara.tensor.Tensor` for depended shared data """ - out_list, inarray0 = join_nonshared_inputs(out_vars, vars, shared) - f = theano_function([inarray0], out_list[0]) + out_list, inarray0 = join_nonshared_inputs(point, out_vars, vars, shared) + f = aesara_function([inarray0], out_list[0]) f.trust_input = True return f From 750ca75b597e6fd43c220c72c08a2386e29ce9f6 Mon Sep 17 00:00:00 2001 From: aloctavodia Date: Sat, 7 Aug 2021 19:06:42 +0300 Subject: [PATCH 03/26] improve docstrings --- pymc3/distributions/bart.py | 48 +++++++++++++++++++++++++++++------- pymc3/distributions/tree.py | 13 +++++++--- pymc3/step_methods/pgbart.py | 10 ++++---- 3 files changed, 54 insertions(+), 17 deletions(-) diff --git a/pymc3/distributions/bart.py b/pymc3/distributions/bart.py index 775f4ff550..b2990b4f0b 100644 --- a/pymc3/distributions/bart.py +++ b/pymc3/distributions/bart.py @@ -22,9 +22,13 @@ class BARTRV(RandomVariable): + """ + Base class for BART + """ + name = "bart" ndim_supp = 1 - ndims_params = [2, 1, 0, 0, 0] + ndims_params = [2, 1, 0, 0, 0, 1] dtype = "floatX" _print_name = ("BART", "\\operatorname{BART}") all_trees = None @@ -37,8 +41,6 @@ def __new__(cls, *args, **kwargs): @classmethod def rng_fn(cls, X_new=None, *args, **kwargs): - # in old version X_new could be passed by the user to get out-of-sample predictions - all_trees = cls.all_trees if all_trees: pred = 0 @@ -67,7 +69,30 @@ def rng_fn(cls, X_new=None, *args, **kwargs): class BART(NoDistribution): - """Improper flat prior over the positive reals.""" + """ + BART distribution. + + Distribution representing a sum over trees + + Parameters + ---------- + X : array-like + The covariate matrix. + Y : array-like + The response vector. 
+ m : int + Number of trees + alpha : float + Control the prior probability over the depth of the trees. Even when it can takes values in + the interval (0, 1), it is recommended to be in the interval (0, 0.5]. + k : int + Scale parameter for the values of the leaf nodes. Defaults to 2. Recomended to be between 1 + and 3. + split_prior : array-like + Each element of split_prior should be in the [0, 1] interval and the elements should sum + to 1. Otherwise they will be normalized. + Defaults to None, i.e. all covariate have the same prior probability to be selected. + """ def __new__( cls, @@ -77,23 +102,24 @@ def __new__( m=50, alpha=0.25, k=2, + split_prior=None, ndim_supp=1, - ndims_params=[2, 1, 0, 0, 0], + ndims_params=[2, 1, 0, 0, 0, 1], dtype="floatX", **kwargs, ): if X.ndim != 2: - raise ValueError("The design matrix X must have two dimensions") + raise ValueError("The covariate matrix X must have two dimensions") if Y.ndim != 1: - raise ValueError("The response matrix Y must have one dimension") + raise ValueError("The response vector Y must have one dimension") if X.shape[0] != Y.shape[0]: raise ValueError( - "The design matrix X and the response matrix Y must have the same number of elements" + "The covariate matrix X and the response vector Y must have the same number of elements" ) if not isinstance(m, int): - raise ValueError("The number of trees m type must be int") + raise ValueError("The number of trees m type must be an integer") if m < 1: raise ValueError("The number of trees m must be greater than zero") @@ -103,6 +129,8 @@ def __new__( "must be in the interval (0, 1)" ) + if split_prior is None: + split_prior = np.ones(X.shape[1]) cls.all_trees = [] bart_op = type( f"BART_{name}", @@ -114,11 +142,13 @@ def __new__( ndims_params=ndims_params, dtype=dtype, inplace=False, + initval=Y.mean(), X=X, Y=Y, m=m, alpha=alpha, k=k, + split_prior=split_prior, ), )() diff --git a/pymc3/distributions/tree.py b/pymc3/distributions/tree.py index 12f7dc6285..40f0436113 100644 --- a/pymc3/distributions/tree.py +++ b/pymc3/distributions/tree.py @@ -21,15 +21,18 @@ class Tree: """Full binary tree + A full binary tree is a tree where each node has exactly zero or two children. This structure is used as the basic component of the Bayesian Additive Regression Tree (BART) + Attributes ---------- tree_structure : dict - A dictionary that represents the nodes stored in breadth-first order, based in the array method - for storing binary trees (https://en.wikipedia.org/wiki/Binary_tree#Arrays). + A dictionary that represents the nodes stored in breadth-first order, based in the array + method for storing binary trees (https://en.wikipedia.org/wiki/Binary_tree#Arrays). The dictionary's keys are integers that represent the nodes position. - The dictionary's values are objects of type SplitNode or LeafNode that represent the nodes of the tree itself. + The dictionary's values are objects of type SplitNode or LeafNode that represent the nodes + of the tree itself. num_nodes : int Total number of nodes. idx_leaf_nodes : list @@ -39,10 +42,14 @@ class Tree: its children are leaf nodes. tree_id : int Identifier used to get the previous tree in the ParticleGibbs algorithm used in BART. + num_observations : int + Number of observations used to fit BART. 
+ Parameters ---------- tree_id : int, optional + num_observations : int, optional """ def __init__(self, tree_id=0, num_observations=0): diff --git a/pymc3/step_methods/pgbart.py b/pymc3/step_methods/pgbart.py index eeb91884f1..c190c8c8ab 100644 --- a/pymc3/step_methods/pgbart.py +++ b/pymc3/step_methods/pgbart.py @@ -70,6 +70,7 @@ def __init__(self, vars=None, num_particles=10, max_stages=5000, chunk="auto", m self.m = self.bart.m self.alpha = self.bart.alpha self.k = self.bart.k + self.split_prior = self.bart.split_prior self.init_mean = self.Y.mean() # if data is binary @@ -82,7 +83,7 @@ def __init__(self, vars=None, num_particles=10, max_stages=5000, chunk="auto", m self.num_observations = self.X.shape[0] self.num_variates = self.X.shape[1] self.available_predictors = list(range(self.num_variates)) - self.ssv = SampleSplittingVariable(None, self.num_variates) + self.ssv = SampleSplittingVariable(self.split_prior, self.num_variates) self.initial_value_leaf_nodes = self.init_mean / self.m self.trees, self.sum_trees_output = init_list_of_trees( @@ -100,7 +101,6 @@ def __init__(self, vars=None, num_particles=10, max_stages=5000, chunk="auto", m if chunk == "auto": self.chunk = max(1, int(self.m * 0.1)) - # self.bart.chunk = self.chunk self.num_particles = num_particles self.log_num_particles = np.log(num_particles) self.indices = list(range(1, num_particles)) @@ -196,8 +196,8 @@ def astep(self, q): self.iter += 1 self.sum_trees.append(new_tree.tree) if not self.iter % self.m: - # update the all_trees variable in BARTRV to be used in the rng_fn method - # fails for chains > 1 + # XXX update the all_trees variable in BARTRV to be used in the rng_fn method + # this fails for chains > 1 as the variable is not shared between proccesses self.bart.all_trees.append(self.sum_trees) self.sum_trees = [] for index in new_tree.used_variates: @@ -265,7 +265,7 @@ def init_particles(self, tree_id): def resample(self, particles, weights): """ - resample a set of particles given its weights + Resample a set of particles given its weights """ particles = np.random.choice(particles, size=len(particles), p=weights) return particles From f9a0937ff1339490ae153d902c37e3b5a6dcfaf4 Mon Sep 17 00:00:00 2001 From: aloctavodia Date: Mon, 9 Aug 2021 08:29:09 +0300 Subject: [PATCH 04/26] small fix docstring and variable names --- pymc3/distributions/bart.py | 31 ++++++------------------------- pymc3/distributions/tree.py | 4 ++-- pymc3/step_methods/pgbart.py | 24 ++++++++++++++---------- 3 files changed, 22 insertions(+), 37 deletions(-) diff --git a/pymc3/distributions/bart.py b/pymc3/distributions/bart.py index b2990b4f0b..c4ed4d19c0 100644 --- a/pymc3/distributions/bart.py +++ b/pymc3/distributions/bart.py @@ -70,7 +70,7 @@ def rng_fn(cls, X_new=None, *args, **kwargs): class BART(NoDistribution): """ - BART distribution. + Bayesian Additive Regression Tree distribution. Distribution representing a sum over trees @@ -85,13 +85,13 @@ class BART(NoDistribution): alpha : float Control the prior probability over the depth of the trees. Even when it can takes values in the interval (0, 1), it is recommended to be in the interval (0, 0.5]. - k : int + k : float Scale parameter for the values of the leaf nodes. Defaults to 2. Recomended to be between 1 and 3. split_prior : array-like - Each element of split_prior should be in the [0, 1] interval and the elements should sum - to 1. Otherwise they will be normalized. - Defaults to None, i.e. all covariate have the same prior probability to be selected. 
+ Each element of split_prior should be in the [0, 1] interval and the elements should sum to + 1. Otherwise they will be normalized. + Defaults to None, i.e. all covariates have the same prior probability to be selected. """ def __new__( @@ -109,29 +109,10 @@ def __new__( **kwargs, ): - if X.ndim != 2: - raise ValueError("The covariate matrix X must have two dimensions") - - if Y.ndim != 1: - raise ValueError("The response vector Y must have one dimension") - if X.shape[0] != Y.shape[0]: - raise ValueError( - "The covariate matrix X and the response vector Y must have the same number of elements" - ) - if not isinstance(m, int): - raise ValueError("The number of trees m type must be an integer") - if m < 1: - raise ValueError("The number of trees m must be greater than zero") - - if alpha <= 0 or 1 <= alpha: - raise ValueError( - "The value for the alpha parameter for the tree structure " - "must be in the interval (0, 1)" - ) - if split_prior is None: split_prior = np.ones(X.shape[1]) cls.all_trees = [] + bart_op = type( f"BART_{name}", (BARTRV,), diff --git a/pymc3/distributions/tree.py b/pymc3/distributions/tree.py index 40f0436113..8a5160d0ad 100644 --- a/pymc3/distributions/tree.py +++ b/pymc3/distributions/tree.py @@ -38,8 +38,8 @@ class Tree: idx_leaf_nodes : list List with the index of the leaf nodes of the tree. idx_prunable_split_nodes : list - List with the index of the prunable splitting nodes of the tree. A splitting node is prunable if both - its children are leaf nodes. + List with the index of the prunable splitting nodes of the tree. A splitting node is + prunable if both its children are leaf nodes. tree_id : int Identifier used to get the previous tree in the ParticleGibbs algorithm used in BART. num_observations : int diff --git a/pymc3/step_methods/pgbart.py b/pymc3/step_methods/pgbart.py index c190c8c8ab..92d8227642 100644 --- a/pymc3/step_methods/pgbart.py +++ b/pymc3/step_methods/pgbart.py @@ -60,7 +60,7 @@ class PGBART(ArrayStepShared): stats_dtypes = [{"variable_inclusion": np.ndarray}] def __init__(self, vars=None, num_particles=10, max_stages=5000, chunk="auto", model=None): - _log.warning("The BART model is experimental. Use with caution.") + _log.warning("BART is experimental. 
Use with caution.") model = modelcontext(model) initial_values = model.initial_point value_bart = inputvars(vars)[0] @@ -166,18 +166,18 @@ def astep(self, q): p.old_likelihood_logp = new_likelihood # Normalize weights - W, normalized_weights = self.normalize(particles) + W_t, normalized_weights = self.normalize(particles) + + # Set the new weights + for p in particles: + p.log_weight = W_t + # Resample all but first particle re_n_w = normalized_weights[1:] / normalized_weights[1:].sum() new_indices = np.random.choice(self.indices, size=len(self.indices), p=re_n_w) particles[1:] = particles[new_indices] - # Set the new weights - w_t = W - self.log_num_particles - for p in particles: - p.log_weight = w_t - # Check if particles can keep growing, otherwise stop iterating non_available_nodes_for_expansion = np.ones(self.num_particles - 1) for c in range(1, self.num_particles): @@ -219,19 +219,19 @@ def competence(var, has_grad): def normalize(self, particles): """ - Use logsumexp trick to get W and softmax to get normalized_weights + Use logsumexp trick to get W_t and softmax to get normalized_weights """ log_w = np.array([p.log_weight for p in particles]) log_w_max = log_w.max() log_w_ = log_w - log_w_max w_ = np.exp(log_w_) w_sum = w_.sum() - W = log_w_max + np.log(w_sum) + W_t = log_w_max + np.log(w_sum) - self.log_num_particles normalized_weights = w_ / w_sum # stabilize weights to avoid assigning exactly zero probability to a particle normalized_weights += 1e-12 - return W, normalized_weights + return W_t, normalized_weights def get_old_tree_particle(self, tree_id, t): old_tree_particle = self.old_trees_particles_list[tree_id] @@ -513,6 +513,10 @@ def discrete_uniform_sampler(upper_value): class NormalSampler: + """ + Cache samples from a standard normal distribution + """ + def __init__(self): self.size = 1000 self.cache = [] From 05ce929085946ee4b7f1f317a64072f1e95855c2 Mon Sep 17 00:00:00 2001 From: aloctavodia Date: Mon, 9 Aug 2021 19:38:43 +0300 Subject: [PATCH 05/26] fix format variable importance --- pymc3/sampling.py | 9 +++++++++ pymc3/step_methods/pgbart.py | 5 +++-- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/pymc3/sampling.py b/pymc3/sampling.py index 0a8f97e8e9..6c57ac0ced 100644 --- a/pymc3/sampling.py +++ b/pymc3/sampling.py @@ -636,6 +636,15 @@ def sample( trace.report._n_draws = n_draws trace.report._t_sampling = t_sampling + if "variable_inclusion" in trace.stat_names: + for strace in trace._straces.values(): + for stat in strace._stats: + if "variable_inclusion" in stat: + if trace.nchains > 1: + stat["variable_inclusion"] = np.vstack(stat["variable_inclusion"]) + else: + stat["variable_inclusion"] = [np.vstack(stat["variable_inclusion"])] + n_chains = len(trace.chains) _log.info( f'Sampling {n_chains} chain{"s" if n_chains > 1 else ""} for {n_tune:_d} tune and {n_draws:_d} draw iterations ' diff --git a/pymc3/step_methods/pgbart.py b/pymc3/step_methods/pgbart.py index 92d8227642..3b733863ae 100644 --- a/pymc3/step_methods/pgbart.py +++ b/pymc3/step_methods/pgbart.py @@ -13,6 +13,7 @@ # limitations under the License. 
import logging +from typing import Any, Dict, List, Tuple import numpy as np from pandas import DataFrame, Series @@ -99,7 +100,7 @@ def __init__(self, vars=None, num_particles=10, max_stages=5000, chunk="auto", m self.sum_trees = [] self.chunk = chunk - if chunk == "auto": + if self.chunk == "auto": self.chunk = max(1, int(self.m * 0.1)) self.num_particles = num_particles self.log_num_particles = np.log(num_particles) @@ -122,7 +123,7 @@ def __init__(self, vars=None, num_particles=10, max_stages=5000, chunk="auto", m self.old_trees_particles_list.append(p) super().__init__(vars, shared) - def astep(self, q): + def astep(self, q: RaveledVars) -> Tuple[RaveledVars, List[Dict[str, Any]]]: point_map_info = q.point_map_info variable_inclusion = np.zeros(self.num_variates, dtype="int") From 412845d64bbd528d412edb0ebd2bdbaa432ca363 Mon Sep 17 00:00:00 2001 From: aloctavodia Date: Tue, 10 Aug 2021 11:50:01 +0300 Subject: [PATCH 06/26] fix broadcasting issue and other minor fixes --- pymc3/distributions/bart.py | 6 +++--- pymc3/step_methods/hmc/base_hmc.py | 1 + pymc3/step_methods/pgbart.py | 18 +++++++++++------- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/pymc3/distributions/bart.py b/pymc3/distributions/bart.py index c4ed4d19c0..624c24b72f 100644 --- a/pymc3/distributions/bart.py +++ b/pymc3/distributions/bart.py @@ -40,11 +40,11 @@ def __new__(cls, *args, **kwargs): return super().__new__(cls) @classmethod - def rng_fn(cls, X_new=None, *args, **kwargs): + def rng_fn(cls, rng, X_new=None, *args, **kwargs): all_trees = cls.all_trees if all_trees: pred = 0 - idx = np.random.randint(len(all_trees)) + idx = rng.randint(len(all_trees)) trees = all_trees[idx] if X_new is None: for tree in trees: @@ -111,7 +111,7 @@ def __new__( if split_prior is None: split_prior = np.ones(X.shape[1]) - cls.all_trees = [] + cls.all_trees = all_trees bart_op = type( f"BART_{name}", diff --git a/pymc3/step_methods/hmc/base_hmc.py b/pymc3/step_methods/hmc/base_hmc.py index c5e9603a90..aaaaa9f4b2 100644 --- a/pymc3/step_methods/hmc/base_hmc.py +++ b/pymc3/step_methods/hmc/base_hmc.py @@ -101,6 +101,7 @@ def __init__( # XXX: If the dimensions of these terms change, the step size # dimension-scaling should change as well, no? 
test_point = self._model.initial_point
+
         nuts_vars = [test_point[v.name] for v in vars]
         size = sum(v.size for v in nuts_vars)
diff --git a/pymc3/step_methods/pgbart.py b/pymc3/step_methods/pgbart.py
index 3b733863ae..87acb98326 100644
--- a/pymc3/step_methods/pgbart.py
+++ b/pymc3/step_methods/pgbart.py
@@ -87,7 +87,7 @@ def __init__(self, vars=None, num_particles=10, max_stages=5000, chunk="auto", m
         self.ssv = SampleSplittingVariable(self.split_prior, self.num_variates)

         self.initial_value_leaf_nodes = self.init_mean / self.m
-        self.trees, self.sum_trees_output = init_list_of_trees(
+        self.trees, sum_trees_output = init_list_of_trees(
             self.initial_value_leaf_nodes, self.num_observations, self.m, self.Y, self.init_mean
         )
         self.mean = fast_mean()
@@ -110,7 +110,7 @@ def __init__(self, vars=None, num_particles=10, max_stages=5000, chunk="auto", m
         shared = make_shared_replacements(initial_values, vars, model)
         self.likelihood_logp = logp(initial_values, [model.datalogpt], vars, shared)
         self.init_leaf_nodes = self.initial_value_leaf_nodes
-        self.init_likelihood = self.likelihood_logp(self.sum_trees_output)
+        self.init_likelihood = self.likelihood_logp(sum_trees_output)
         self.init_log_weight = self.init_likelihood - self.log_num_particles
         self.old_trees_particles_list = []
         for i in range(self.m):
@@ -125,6 +126,7 @@ def __init__(self, vars=None, num_particles=10, max_stages=5000, chunk="auto", m
             self.old_trees_particles_list.append(p)
         super().__init__(vars, shared)

     def astep(self, q: RaveledVars) -> Tuple[RaveledVars, List[Dict[str, Any]]]:
         point_map_info = q.point_map_info
+        sum_trees_output = q.data

         variable_inclusion = np.zeros(self.num_variates, dtype="int")

@@ -136,7 +137,7 @@ def astep(self, q: RaveledVars) -> Tuple[RaveledVars, List[Dict[str, Any]]]:
                 break
             self.idx += 1
             tree = self.trees[idx]
-            sum_trees_output_noi = self.sum_trees_output - tree.predict_output()
+            sum_trees_output_noi = sum_trees_output - tree.predict_output()
             # Generate an initial set of SMC particles
             # at the end of the algorithm we return one of these particles as the new tree
             particles = self.init_particles(tree.tree_id)
@@ -151,7 +152,7 @@ def astep(self, q: RaveledVars) -> Tuple[RaveledVars, List[Dict[str, Any]]]:
                     self.available_predictors,
                     self.X,
                     self.missing_data,
-                    self.sum_trees_output,
+                    sum_trees_output,
                     self.mean,
                     self.m,
                     self.normal,
@@ -191,7 +192,7 @@ def astep(self, q: RaveledVars) -> Tuple[RaveledVars, List[Dict[str, Any]]]:
             new_tree = np.random.choice(particles, p=normalized_weights)
             self.old_trees_particles_list[tree.tree_id] = new_tree
             self.trees[idx] = new_tree.tree
-            self.sum_trees_output = sum_trees_output_noi + new_tree.tree.predict_output()
+            sum_trees_output = sum_trees_output_noi + new_tree.tree.predict_output()

             if not self.tune:
                 self.iter += 1
@@ -205,7 +206,7 @@ def astep(self, q: RaveledVars) -> Tuple[RaveledVars, List[Dict[str, Any]]]:
             variable_inclusion[index] += 1

         stats = {"variable_inclusion": variable_inclusion}
-        sum_trees_output = RaveledVars(self.sum_trees_output, point_map_info)
+        sum_trees_output = RaveledVars(sum_trees_output, point_map_info)
         return sum_trees_output, [stats]

     @staticmethod
@@ -510,7 +511,10 @@


 def discrete_uniform_sampler(upper_value):
-    """Draw from the uniform distribution with bounds [0, upper_value)."""
+    """Draw from the uniform distribution with bounds [0, upper_value).
+
+    This is the same as np.random.randint(upper_value) but faster. 
+ """ return int(np.random.random() * upper_value) From 139821c21d97f8add53bac2bfb94f7b5089e2e27 Mon Sep 17 00:00:00 2001 From: aloctavodia Date: Tue, 10 Aug 2021 12:36:15 +0300 Subject: [PATCH 07/26] add test and fix pylint --- pymc3/distributions/bart.py | 12 +++--------- pymc3/step_methods/pgbart.py | 7 ++++--- pymc3/tests/test_sampling.py | 18 +++++++++++------- 3 files changed, 18 insertions(+), 19 deletions(-) diff --git a/pymc3/distributions/bart.py b/pymc3/distributions/bart.py index 624c24b72f..7d483d9302 100644 --- a/pymc3/distributions/bart.py +++ b/pymc3/distributions/bart.py @@ -15,8 +15,8 @@ import numpy as np from aesara.tensor.random.op import RandomVariable, default_shape_from_params -from pymc3.distributions.distribution import NoDistribution +from pymc3.distributions.distribution import NoDistribution __all__ = ["BART"] @@ -26,7 +26,7 @@ class BARTRV(RandomVariable): Base class for BART """ - name = "bart" + name = "BART" ndim_supp = 1 ndims_params = [2, 1, 0, 0, 0, 1] dtype = "floatX" @@ -103,15 +103,12 @@ def __new__( alpha=0.25, k=2, split_prior=None, - ndim_supp=1, - ndims_params=[2, 1, 0, 0, 0, 1], - dtype="floatX", **kwargs, ): if split_prior is None: split_prior = np.ones(X.shape[1]) - cls.all_trees = all_trees + cls.all_trees = [] bart_op = type( f"BART_{name}", @@ -119,9 +116,6 @@ def __new__( dict( name="BART", all_trees=cls.all_trees, - ndim_supp=ndim_supp, - ndims_params=ndims_params, - dtype=dtype, inplace=False, initval=Y.mean(), X=X, diff --git a/pymc3/step_methods/pgbart.py b/pymc3/step_methods/pgbart.py index 87acb98326..911008f60a 100644 --- a/pymc3/step_methods/pgbart.py +++ b/pymc3/step_methods/pgbart.py @@ -13,20 +13,20 @@ # limitations under the License. import logging + from typing import Any, Dict, List, Tuple import numpy as np -from pandas import DataFrame, Series from aesara import function as aesara_function +from pandas import DataFrame, Series from pymc3.aesaraf import inputvars, join_nonshared_inputs, make_shared_replacements +from pymc3.blocking import RaveledVars from pymc3.distributions.bart import BARTRV from pymc3.distributions.tree import LeafNode, SplitNode, Tree from pymc3.model import modelcontext from pymc3.step_methods.arraystep import ArrayStepShared, Competence -from pymc3.blocking import RaveledVars - _log = logging.getLogger("pymc3") @@ -66,6 +66,7 @@ def __init__(self, vars=None, num_particles=10, max_stages=5000, chunk="auto", m initial_values = model.initial_point value_bart = inputvars(vars)[0] self.bart = model.values_to_rvs[value_bart].owner.op + self.bart.all_trees = [] self.X, self.Y, self.missing_data = preprocess_XY(self.bart.X, self.bart.Y) self.m = self.bart.m diff --git a/pymc3/tests/test_sampling.py b/pymc3/tests/test_sampling.py index e09ce70b90..4fd37cf70d 100644 --- a/pymc3/tests/test_sampling.py +++ b/pymc3/tests/test_sampling.py @@ -174,20 +174,24 @@ def test_trace_report(self, step_cls, discard): assert trace.report.n_draws == 100 assert isinstance(trace.report.t_sampling, float) - @pytest.mark.xfail(reason="BART not refactored for v4") - def test_trace_report_bart(self): + def test_bart_vi(self): X = np.random.normal(0, 1, size=(3, 250)).T Y = np.random.normal(0, 1, size=250) X[:, 0] = np.random.normal(Y, 0.1) with pm.Model() as model: - mu = pm.BART("mu", X, Y, m=20) + mu = pm.BART("mu", X, Y, m=10) sigma = pm.HalfNormal("sigma", 1) y = pm.Normal("y", mu, sigma, observed=Y) - trace = pm.sample(500, tune=100, random_seed=3415, return_inferencedata=False) - var_imp = trace.report.variable_importance - 
assert var_imp[0] > var_imp[1:].sum() - npt.assert_almost_equal(var_imp.sum(), 1) + idata = pm.sample(random_seed=3415, chains=1) + var_imp = ( + idata.sample_stats["variable_inclusion"] + .stack(samples=("chain", "draw")) + .mean("samples") + ) + var_imp /= var_imp.sum() + assert var_imp[0] > var_imp[1:].sum() + npt.assert_almost_equal(var_imp.sum(), 1) def test_return_inferencedata(self): with self.model: From f0037c15a90b91a2af221d4ce3f4531fa8e78f8d Mon Sep 17 00:00:00 2001 From: aloctavodia Date: Wed, 11 Aug 2021 08:24:25 +0300 Subject: [PATCH 08/26] fix float32 --- pymc3/distributions/tree.py | 3 ++- pymc3/step_methods/pgbart.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/pymc3/distributions/tree.py b/pymc3/distributions/tree.py index 8a5160d0ad..31ed47b530 100644 --- a/pymc3/distributions/tree.py +++ b/pymc3/distributions/tree.py @@ -16,6 +16,7 @@ from copy import deepcopy +import aesara import numpy as np @@ -91,7 +92,7 @@ def predict_output(self): current_node = self.get_node(node_index) output[current_node.idx_data_points] = current_node.value - return output + return output.astype(aesara.config.floatX) def predict_out_of_sample(self, x): """ diff --git a/pymc3/step_methods/pgbart.py b/pymc3/step_methods/pgbart.py index 911008f60a..ccb493b1cd 100644 --- a/pymc3/step_methods/pgbart.py +++ b/pymc3/step_methods/pgbart.py @@ -16,6 +16,7 @@ from typing import Any, Dict, List, Tuple +import aesara import numpy as np from aesara import function as aesara_function @@ -375,7 +376,7 @@ def init_list_of_trees(initial_value_leaf_nodes, num_observations, m, Y, init_me idx_data_points=initial_idx_data_points_leaf_nodes, ) list_of_trees.append(new_tree) - sum_trees_output = np.full_like(Y, init_mean) + sum_trees_output = np.full_like(Y, init_mean).astype(aesara.config.floatX) return list_of_trees, sum_trees_output From afff2beec66dffce828fc431b12942658d6009d5 Mon Sep 17 00:00:00 2001 From: aloctavodia Date: Wed, 11 Aug 2021 15:25:46 +0300 Subject: [PATCH 09/26] sample splitting variables non-uniformly --- pymc3/distributions/bart.py | 2 -- pymc3/step_methods/pgbart.py | 39 +++++++++++++++++------------------- 2 files changed, 18 insertions(+), 23 deletions(-) diff --git a/pymc3/distributions/bart.py b/pymc3/distributions/bart.py index 7d483d9302..f657b9a021 100644 --- a/pymc3/distributions/bart.py +++ b/pymc3/distributions/bart.py @@ -106,8 +106,6 @@ def __new__( **kwargs, ): - if split_prior is None: - split_prior = np.ones(X.shape[1]) cls.all_trees = [] bart_op = type( diff --git a/pymc3/step_methods/pgbart.py b/pymc3/step_methods/pgbart.py index ccb493b1cd..2979c7ca86 100644 --- a/pymc3/step_methods/pgbart.py +++ b/pymc3/step_methods/pgbart.py @@ -74,6 +74,8 @@ def __init__(self, vars=None, num_particles=10, max_stages=5000, chunk="auto", m self.alpha = self.bart.alpha self.k = self.bart.k self.split_prior = self.bart.split_prior + if self.split_prior is None: + self.split_prior = np.ones(self.X.shape[1]) self.init_mean = self.Y.mean() # if data is binary @@ -86,7 +88,6 @@ def __init__(self, vars=None, num_particles=10, max_stages=5000, chunk="auto", m self.num_observations = self.X.shape[0] self.num_variates = self.X.shape[1] self.available_predictors = list(range(self.num_variates)) - self.ssv = SampleSplittingVariable(self.split_prior, self.num_variates) self.initial_value_leaf_nodes = self.init_mean / self.m self.trees, sum_trees_output = init_list_of_trees( @@ -130,6 +131,7 @@ def astep(self, q: RaveledVars) -> Tuple[RaveledVars, List[Dict[str, Any]]]: 
sum_trees_output = q.data

         variable_inclusion = np.zeros(self.num_variates, dtype="int")
+        self.ssv = SampleSplittingVariable(self.split_prior)

         if self.idx == self.m:
             self.idx = 0
@@ -195,6 +197,8 @@ def astep(self, q: RaveledVars) -> Tuple[RaveledVars, List[Dict[str, Any]]]:
             self.old_trees_particles_list[tree.tree_id] = new_tree
             self.trees[idx] = new_tree.tree
             sum_trees_output = sum_trees_output_noi + new_tree.tree.predict_output()
+            for index in new_tree.used_variates:
+                self.split_prior[index] += 1

             if not self.tune:
                 self.iter += 1
@@ -341,28 +345,21 @@


 class SampleSplittingVariable:
-    def __init__(self, prior, num_variates):
-        self.prior = prior
-        self.num_variates = num_variates
-
-        if self.prior is not None:
-            self.prior = np.asarray(self.prior)
-            self.prior = self.prior / self.prior.sum()
-            if self.prior.size != self.num_variates:
-                raise ValueError(
-                    f"The size of split_prior ({self.prior.size}) should be the "
-                    f"same as the number of covariates ({self.num_variates})"
-                )
-            self.enu = list(enumerate(np.cumsum(self.prior)))
+    def __init__(self, alpha_prior):
+        """
+        Sample splitting variables proportional to `alpha_prior`.
+
+        This is equivalent to sampling weights from a Dirichlet distribution with the
+        `alpha_prior` parameter and then using those weights to sample from the available
+        splitting variables. This enforces sparsity.
+        """
+        self.enu = list(enumerate(np.cumsum(alpha_prior / alpha_prior.sum())))

     def rvs(self):
-        if self.prior is None:
-            return int(np.random.random() * self.num_variates)
-        else:
-            r = np.random.random()
-            for i, v in self.enu:
-                if r <= v:
-                    return i
+        r = np.random.random()
+        for i, v in self.enu:
+            if r <= v:
+                return i

From f4b7b139af9312586c719ba20a37f2d971022f48 Mon Sep 17 00:00:00 2001
From: aloctavodia
Date: Thu, 12 Aug 2021 08:51:42 +0300
Subject: [PATCH 10/26] remove xfail

---
 pymc3/tests/test_ode.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/pymc3/tests/test_ode.py b/pymc3/tests/test_ode.py
index 3ce7ed473a..2bb143bc3c 100644
--- a/pymc3/tests/test_ode.py
+++ b/pymc3/tests/test_ode.py
@@ -301,9 +301,6 @@ def system(y, t, p):
         assert idata.posterior["y0"].shape == (1, 100)
         assert idata.posterior["sigma"].shape == (1, 100)

-    @pytest.mark.xfail(
-        condition=IS_WINDOWS, reason="https://github.com/pymc-devs/aesara/issues/390"
-    )
     def test_scalar_ode_2_param(self):
         """Test running model for a scalar ODE with 2 parameters"""

From 45381cab788963e728aead55ebd0fa60df0acae0 Mon Sep 17 00:00:00 2001
From: aloctavodia
Date: Thu, 12 Aug 2021 08:54:34 +0300
Subject: [PATCH 11/26] add back xfail on windows

---
 pymc3/tests/test_ode.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pymc3/tests/test_ode.py b/pymc3/tests/test_ode.py
index 2bb143bc3c..3ce7ed473a 100644
--- a/pymc3/tests/test_ode.py
+++ b/pymc3/tests/test_ode.py
@@ -301,6 +301,9 @@ def system(y, t, p):
         assert idata.posterior["y0"].shape == (1, 100)
         assert idata.posterior["sigma"].shape == (1, 100)

+    @pytest.mark.xfail(
+        condition=IS_WINDOWS, reason="https://github.com/pymc-devs/aesara/issues/390"
+    )
     def test_scalar_ode_2_param(self):
         """Test running model for a scalar ODE with 2 parameters"""

From a07f050e3dae454e744d240fc63ae8707081c873 Mon Sep 17 00:00:00 2001
From: aloctavodia
Date: Thu, 12 Aug 2021 09:19:53 +0300
Subject: [PATCH 12/26] add back xfail on windows and for float32

---
 pymc3/tests/test_parallel_sampling.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git 
a/pymc3/tests/test_parallel_sampling.py b/pymc3/tests/test_parallel_sampling.py index 8bdc3ca1dc..68810acb4f 100644 --- a/pymc3/tests/test_parallel_sampling.py +++ b/pymc3/tests/test_parallel_sampling.py @@ -170,6 +170,10 @@ def test_explicit_sample(mp_start_method): proc.join() +@pytest.mark.xfail( + condition=(IS_WINDOWS or IS_FLOAT32), + reason="Possibly the same issue described in https://github.com/pymc-devs/pymc3/pull/4701", +) def test_iterator(): with pm.Model() as model: a = pm.Normal("a", shape=1) From 35bc056898d3f568d1c2ef313cc78f4615748386 Mon Sep 17 00:00:00 2001 From: aloctavodia Date: Thu, 12 Aug 2021 10:03:58 +0300 Subject: [PATCH 13/26] fix test --- pymc3/tests/test_parallel_sampling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pymc3/tests/test_parallel_sampling.py b/pymc3/tests/test_parallel_sampling.py index 68810acb4f..fcbad3d577 100644 --- a/pymc3/tests/test_parallel_sampling.py +++ b/pymc3/tests/test_parallel_sampling.py @@ -171,7 +171,7 @@ def test_explicit_sample(mp_start_method): @pytest.mark.xfail( - condition=(IS_WINDOWS or IS_FLOAT32), + condition=(IS_FLOAT32), reason="Possibly the same issue described in https://github.com/pymc-devs/pymc3/pull/4701", ) def test_iterator(): From 8f498fc157900024fe3d20cb8a2733c5f5cdf612 Mon Sep 17 00:00:00 2001 From: aloctavodia Date: Fri, 13 Aug 2021 15:56:12 +0300 Subject: [PATCH 14/26] clean rnd --- pymc3/distributions/bart.py | 18 +++++++----------- pymc3/step_methods/pgbart.py | 1 - 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/pymc3/distributions/bart.py b/pymc3/distributions/bart.py index f657b9a021..d7694fe661 100644 --- a/pymc3/distributions/bart.py +++ b/pymc3/distributions/bart.py @@ -40,27 +40,23 @@ def __new__(cls, *args, **kwargs): return super().__new__(cls) @classmethod - def rng_fn(cls, rng, X_new=None, *args, **kwargs): + def rng_fn(cls, rng=np.random.default_rng(), X_new=None, *args, **kwargs): all_trees = cls.all_trees if all_trees: - pred = 0 - idx = rng.randint(len(all_trees)) + # this should be rng.integers() but when sampling from the prior/posterior predictive + # I get 'numpy.random.mtrand.RandomState' object has no attribute 'integers' + # So I guess those functions need to be updated + idx = np.random.randint(len(all_trees)) trees = all_trees[idx] if X_new is None: + pred = np.zeros(trees[0].num_observations) for tree in trees: pred += tree.predict_output() else: + pred = np.zeros(X_new.shape[0]) for tree in trees: pred += np.array([tree.predict_out_of_sample(x) for x in X_new]) return pred - # XXX check why I did not make this random - # pred = np.zeros((len(trees), num_observations)) - # for draw, trees_to_sum in enumerate(trees): - # new_Y = np.zeros(num_observations) - # for tree in trees_to_sum: - # new_Y += [tree.predict_out_of_sample(x) for x in X_new] - # pred[draw] = new_Y - # return pred else: return np.full_like(cls.Y, cls.Y.mean()) diff --git a/pymc3/step_methods/pgbart.py b/pymc3/step_methods/pgbart.py index 2979c7ca86..16e1fcd0bc 100644 --- a/pymc3/step_methods/pgbart.py +++ b/pymc3/step_methods/pgbart.py @@ -67,7 +67,6 @@ def __init__(self, vars=None, num_particles=10, max_stages=5000, chunk="auto", m initial_values = model.initial_point value_bart = inputvars(vars)[0] self.bart = model.values_to_rvs[value_bart].owner.op - self.bart.all_trees = [] self.X, self.Y, self.missing_data = preprocess_XY(self.bart.X, self.bart.Y) self.m = self.bart.m From 0bfaeba8c65f2b27d39b5a048fcc30f895639fc4 Mon Sep 17 00:00:00 2001 From: aloctavodia Date: Tue, 17 
Aug 2021 13:02:22 +0300 Subject: [PATCH 15/26] add size argument and check for NoDistribution --- pymc3/distributions/bart.py | 40 ++++++++++++++++++++++--------------- pymc3/sampling.py | 7 +++++-- 2 files changed, 29 insertions(+), 18 deletions(-) diff --git a/pymc3/distributions/bart.py b/pymc3/distributions/bart.py index d7694fe661..03e6d587bd 100644 --- a/pymc3/distributions/bart.py +++ b/pymc3/distributions/bart.py @@ -36,27 +36,35 @@ class BARTRV(RandomVariable): def _shape_from_params(self, dist_params, rep_param_idx=1, param_shapes=None): return default_shape_from_params(self.ndim_supp, dist_params, rep_param_idx, param_shapes) - def __new__(cls, *args, **kwargs): - return super().__new__(cls) - @classmethod - def rng_fn(cls, rng=np.random.default_rng(), X_new=None, *args, **kwargs): + def rng_fn(cls, rng=np.random.default_rng(), *args, **kwargs): + size = kwargs.pop("size", None) + X_new = kwargs.pop("X_new", None) all_trees = cls.all_trees if all_trees: - # this should be rng.integers() but when sampling from the prior/posterior predictive - # I get 'numpy.random.mtrand.RandomState' object has no attribute 'integers' - # So I guess those functions need to be updated - idx = np.random.randint(len(all_trees)) - trees = all_trees[idx] + + if size is None: + size = () + elif isinstance(size, int): + size = [size] + + flatten_size = 1 + for s in size: + flatten_size *= s + + idx = rng.randint(len(all_trees), size=flatten_size) + if X_new is None: - pred = np.zeros(trees[0].num_observations) - for tree in trees: - pred += tree.predict_output() + pred = np.zeros((flatten_size, all_trees[0][0].num_observations)) + for ind, p in enumerate(pred): + for tree in all_trees[idx[ind]]: + p += tree.predict_output() else: - pred = np.zeros(X_new.shape[0]) - for tree in trees: - pred += np.array([tree.predict_out_of_sample(x) for x in X_new]) - return pred + pred = np.zeros((flatten_size, X_new.shape[0])) + for ind, p in enumerate(pred): + for tree in all_trees[idx[ind]]: + p += np.array([tree.predict_out_of_sample(x) for x in X_new]) + return pred.reshape((*size, -1)) else: return np.full_like(cls.Y, cls.Y.mean()) diff --git a/pymc3/sampling.py b/pymc3/sampling.py index 6c57ac0ced..e8131ce642 100644 --- a/pymc3/sampling.py +++ b/pymc3/sampling.py @@ -42,7 +42,7 @@ from pymc3.backends.base import BaseTrace, MultiTrace from pymc3.backends.ndarray import NDArray from pymc3.blocking import DictToArrayBijection -from pymc3.distributions.bart import BARTRV +from pymc3.distributions import NoDistribution from pymc3.exceptions import IncorrectArgumentsError, SamplingError from pymc3.model import Model, Point, modelcontext from pymc3.parallel_sampling import Draw, _cpu_count @@ -240,7 +240,10 @@ def all_continuous(vars, model): if any( [ - (var.dtype in discrete_types or isinstance(model.values_to_rvs[var].owner.op, BARTRV)) + ( + var.dtype in discrete_types + or isinstance(model.values_to_rvs[var].owner.op, NoDistribution) + ) for var in vars_ ] ): From c7da202b86d1fad728b279b06ab67d4365739c7b Mon Sep 17 00:00:00 2001 From: aloctavodia Date: Tue, 17 Aug 2021 23:05:19 +0300 Subject: [PATCH 16/26] stop updating split_prior after tuning --- pymc3/step_methods/pgbart.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pymc3/step_methods/pgbart.py b/pymc3/step_methods/pgbart.py index 16e1fcd0bc..b575c74e1c 100644 --- a/pymc3/step_methods/pgbart.py +++ b/pymc3/step_methods/pgbart.py @@ -196,10 +196,11 @@ def astep(self, q: RaveledVars) -> Tuple[RaveledVars, List[Dict[str, Any]]]: 
self.old_trees_particles_list[tree.tree_id] = new_tree self.trees[idx] = new_tree.tree sum_trees_output = sum_trees_output_noi + new_tree.tree.predict_output() - for index in new_tree.used_variates: - self.split_prior[index] += 1 - if not self.tune: + if self.tune: + for index in new_tree.used_variates: + self.split_prior[index] += 1 + else: self.iter += 1 self.sum_trees.append(new_tree.tree) if not self.iter % self.m: From 4985e555efcab3248376b9fc2cda4971e2fab560 Mon Sep 17 00:00:00 2001 From: aloctavodia Date: Wed, 18 Aug 2021 18:33:50 +0300 Subject: [PATCH 17/26] clean code and small speed-up --- pymc3/step_methods/pgbart.py | 133 +++++++++++++++-------------------- 1 file changed, 58 insertions(+), 75 deletions(-) diff --git a/pymc3/step_methods/pgbart.py b/pymc3/step_methods/pgbart.py index b575c74e1c..0aec9d7ac5 100644 --- a/pymc3/step_methods/pgbart.py +++ b/pymc3/step_methods/pgbart.py @@ -87,10 +87,12 @@ def __init__(self, vars=None, num_particles=10, max_stages=5000, chunk="auto", m self.num_observations = self.X.shape[0] self.num_variates = self.X.shape[1] self.available_predictors = list(range(self.num_variates)) - self.initial_value_leaf_nodes = self.init_mean / self.m - self.trees, sum_trees_output = init_list_of_trees( - self.initial_value_leaf_nodes, self.num_observations, self.m, self.Y, self.init_mean + sum_trees_output = np.full_like(self.Y, self.init_mean).astype(aesara.config.floatX) + self.a_tree = Tree.init_tree( + tree_id=0, + leaf_node_value=self.init_mean / self.m, + idx_data_points=np.arange(self.num_observations, dtype="int32"), ) self.mean = fast_mean() self.normal = NormalSampler() @@ -111,18 +113,18 @@ def __init__(self, vars=None, num_particles=10, max_stages=5000, chunk="auto", m shared = make_shared_replacements(initial_values, vars, model) self.likelihood_logp = logp(initial_values, [model.datalogpt], vars, shared) - self.init_leaf_nodes = self.initial_value_leaf_nodes self.init_likelihood = self.likelihood_logp(sum_trees_output) self.init_log_weight = self.init_likelihood - self.log_num_particles - self.old_trees_particles_list = [] + self.all_particles = [] for i in range(self.m): + self.a_tree.tree_id = i p = ParticleTree( - self.trees[i], + self.a_tree, self.prior_prob_leaf_node, self.init_log_weight, self.init_likelihood, ) - self.old_trees_particles_list.append(p) + self.all_particles.append(p) super().__init__(vars, shared) def astep(self, q: RaveledVars) -> Tuple[RaveledVars, List[Dict[str, Any]]]: @@ -139,7 +141,7 @@ def astep(self, q: RaveledVars) -> Tuple[RaveledVars, List[Dict[str, Any]]]: if idx >= self.m: break self.idx += 1 - tree = self.trees[idx] + tree = self.all_particles[idx].tree sum_trees_output_noi = sum_trees_output - tree.predict_output() # Generate an initial set of SMC particles # at the end of the algorithm we return one of these particles as the new tree @@ -192,23 +194,24 @@ def astep(self, q: RaveledVars) -> Tuple[RaveledVars, List[Dict[str, Any]]]: break # Get the new tree and update - new_tree = np.random.choice(particles, p=normalized_weights) - self.old_trees_particles_list[tree.tree_id] = new_tree - self.trees[idx] = new_tree.tree - sum_trees_output = sum_trees_output_noi + new_tree.tree.predict_output() + new_particle = np.random.choice(particles, p=normalized_weights) + new_tree = new_particle.tree + new_particle.log_weight = new_particle.old_likelihood_logp - self.log_num_particles + self.all_particles[tree.tree_id] = new_particle + sum_trees_output = sum_trees_output_noi + new_tree.predict_output() if self.tune: - 
for index in new_tree.used_variates:
+                for index in new_particle.used_variates:
                     self.split_prior[index] += 1
             else:
                 self.iter += 1
-                self.sum_trees.append(new_tree.tree)
+                self.sum_trees.append(new_tree)
                 if not self.iter % self.m:
                     # XXX update the all_trees variable in BARTRV to be used in the rng_fn method
                     # this fails for chains > 1 as the variable is not shared between processes
                     self.bart.all_trees.append(self.sum_trees)
                     self.sum_trees = []
-            for index in new_tree.used_variates:
+            for index in new_particle.used_variates:
                 variable_inclusion[index] += 1

         stats = {"variable_inclusion": variable_inclusion}
@@ -242,7 +245,7 @@ def normalize(self, particles):
         return W_t, normalized_weights

     def get_old_tree_particle(self, tree_id, t):
-        old_tree_particle = self.old_trees_particles_list[tree_id]
+        old_tree_particle = self.all_particles[tree_id]
         old_tree_particle.set_particle_to_step(t)
         return old_tree_particle

@@ -250,34 +253,21 @@ def init_particles(self, tree_id):
         """
        Initialize particles
         """
-        prev_tree = self.get_old_tree_particle(tree_id, 0)
-        likelihood = self.likelihood_logp(prev_tree.tree.predict_output())
-        prev_tree.old_likelihood_logp = likelihood
-        prev_tree.log_weight = likelihood - self.log_num_particles
-        particles = [prev_tree]
-
-        initial_idx_data_points_leaf_nodes = np.arange(self.num_observations, dtype="int32")
-        new_tree = Tree.init_tree(
-            tree_id=tree_id,
-            leaf_node_value=self.init_leaf_nodes,
-            idx_data_points=initial_idx_data_points_leaf_nodes,
-        )
+        particles = [self.get_old_tree_particle(tree_id, 0)]

-        prior_prob = self.prior_prob_leaf_node
         for _ in range(1, self.num_particles):
+            self.a_tree.tree_id = tree_id
             particles.append(
-                ParticleTree(new_tree, prior_prob, self.init_log_weight, self.init_likelihood)
+                ParticleTree(
+                    self.a_tree,
+                    self.prior_prob_leaf_node,
+                    self.init_log_weight,
+                    self.init_likelihood,
+                )
             )

         return np.array(particles)

-    def resample(self, particles, weights):
-        """
-        Resample a set of particles given its weights
-        """
-        particles = np.random.choice(particles, size=len(particles), p=weights)
-        return particles
-

 class ParticleTree:
     """
@@ -295,7 +285,16 @@ def __init__(self, tree, prior_prob_leaf_node, log_weight=0, likelihood=0):
         self.used_variates = []

     def sample_tree_sequential(
-        self, ssv, available_predictors, X, missing_data, sum_trees_output, mean, m, normal, mu_std
+        self,
+        ssv,
+        available_predictors,
+        X,
+        missing_data,
+        sum_trees_output,
+        mean,
+        m,
+        normal,
+        mu_std,
     ):
         if self.expansion_nodes:
             index_leaf_node = self.expansion_nodes.pop(0)
@@ -303,7 +302,7 @@
             prob_leaf = self.prior_prob_leaf_node[self.tree[index_leaf_node].depth]

             if prob_leaf < np.random.random():
-                grow_successful, index_selected_predictor = grow_tree(
+                index_selected_predictor = grow_tree(
                     self.tree,
                     index_leaf_node,
                     ssv,
@@ -316,8 +315,7 @@
                     normal,
                     mu_std,
                 )
-                if grow_successful:
-                    # Add new leaf nodes indexes
+                if index_selected_predictor is not None:
                     new_indexes = self.tree.idx_leaf_nodes[-2:]
                     self.expansion_nodes.extend(new_indexes)
                     self.used_variates.append(index_selected_predictor)

         self.tree_history.append(self.tree)
         self.expansion_nodes_history.append(self.expansion_nodes)
@@ -362,22 +360,6 @@ def rvs(self):
                 return i


-def init_list_of_trees(initial_value_leaf_nodes, num_observations, m, Y, init_mean):
-    initial_value_leaf_nodes = initial_value_leaf_nodes
-    initial_idx_data_points_leaf_nodes = np.array(range(num_observations), dtype="int32")
-    list_of_trees = []
-    for i in range(m):
-        new_tree = Tree.init_tree(
-            tree_id=i,
-            leaf_node_value=initial_value_leaf_nodes,
-
idx_data_points=initial_idx_data_points_leaf_nodes, - ) - list_of_trees.append(new_tree) - sum_trees_output = np.full_like(Y, init_mean).astype(aesara.config.floatX) - - return list_of_trees, sum_trees_output - - def compute_prior_probability(alpha): """ Calculate the probability of the node being a LeafNode (1 - p(being SplitNode)). @@ -421,16 +403,17 @@ def grow_tree( index_selected_predictor = ssv.rvs() selected_predictor = available_predictors[index_selected_predictor] - available_splitting_rules = get_available_splitting_rules( - X[current_node.idx_data_points, selected_predictor], - missing_data, - ) - # This can be unsuccessful when there are not available splitting rules - if available_splitting_rules.size == 0: - return False, None + available_splitting_values = X[current_node.idx_data_points, selected_predictor] + if missing_data: + available_splitting_values = available_splitting_values[ + ~np.isnan(available_splitting_values) + ] + + if available_splitting_values.size == 0: + return None - index_selected_splitting_rule = discrete_uniform_sampler(len(available_splitting_rules)) - selected_splitting_rule = available_splitting_rules[index_selected_splitting_rule] + idx_selected_splitting_values = discrete_uniform_sampler(len(available_splitting_values)) + selected_splitting_rule = available_splitting_values[idx_selected_splitting_values] new_split_node = SplitNode( index=index_leaf_node, idx_split_variable=selected_predictor, @@ -460,15 +443,13 @@ def grow_tree( ) tree.grow_tree(index_leaf_node, new_split_node, new_left_node, new_right_node) - return True, index_selected_predictor + return index_selected_predictor -def get_available_splitting_rules(X_j, missing_data): +def get_available_splitting_values(X_j, missing_data): if missing_data: X_j = X_j[~np.isnan(X_j)] - values = np.unique(X_j) - # The last value is never available as it would leave the right subtree empty. 
- return values[:-1] + return X_j def get_new_idx_data_points(current_split_node, idx_data_points, X): @@ -484,10 +465,12 @@ def get_new_idx_data_points(current_split_node, idx_data_points, X): def draw_leaf_value(sum_trees_output_idx, mean, m, normal, mu_std): """Draw Gaussian distributed leaf values""" - mu_mean = mean(sum_trees_output_idx) / m - # add cache of precomputed normal values - draw = normal.random() * mu_std + mu_mean - return draw + if sum_trees_output_idx.size == 0: + return 0 + else: + mu_mean = mean(sum_trees_output_idx) / m + draw = normal.random() * mu_std + mu_mean + return draw def fast_mean(): From 51b2c4d06be6f57946fd173c640e040f544ab226 Mon Sep 17 00:00:00 2001 From: aloctavodia Date: Wed, 18 Aug 2021 18:36:33 +0300 Subject: [PATCH 18/26] clean code and small speed-up --- pymc3/step_methods/pgbart.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pymc3/step_methods/pgbart.py b/pymc3/step_methods/pgbart.py index 0aec9d7ac5..fab674b20a 100644 --- a/pymc3/step_methods/pgbart.py +++ b/pymc3/step_methods/pgbart.py @@ -446,12 +446,6 @@ def grow_tree( return index_selected_predictor -def get_available_splitting_values(X_j, missing_data): - if missing_data: - X_j = X_j[~np.isnan(X_j)] - return X_j - - def get_new_idx_data_points(current_split_node, idx_data_points, X): idx_split_variable = current_split_node.idx_split_variable split_value = current_split_node.split_value From 14d212841514fa4241ce3a15e3641ed8c53f0ef6 Mon Sep 17 00:00:00 2001 From: aloctavodia Date: Thu, 19 Aug 2021 15:55:52 +0300 Subject: [PATCH 19/26] revert xfail --- pymc3/tests/test_parallel_sampling.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pymc3/tests/test_parallel_sampling.py b/pymc3/tests/test_parallel_sampling.py index fcbad3d577..8bdc3ca1dc 100644 --- a/pymc3/tests/test_parallel_sampling.py +++ b/pymc3/tests/test_parallel_sampling.py @@ -170,10 +170,6 @@ def test_explicit_sample(mp_start_method): proc.join() -@pytest.mark.xfail( - condition=(IS_FLOAT32), - reason="Possibly the same issue described in https://github.com/pymc-devs/pymc3/pull/4701", -) def test_iterator(): with pm.Model() as model: a = pm.Normal("a", shape=1) From d1982dcba5b0c93a961cf4f3a7b920bf214b14c2 Mon Sep 17 00:00:00 2001 From: aloctavodia Date: Thu, 19 Aug 2021 16:32:54 +0300 Subject: [PATCH 20/26] add tests --- pymc3/tests/test_bart.py | 69 ++++++++++++++++++++++++++++++++++++ pymc3/tests/test_sampling.py | 19 ---------- 2 files changed, 69 insertions(+), 19 deletions(-) create mode 100644 pymc3/tests/test_bart.py diff --git a/pymc3/tests/test_bart.py b/pymc3/tests/test_bart.py new file mode 100644 index 0000000000..17d0e78f25 --- /dev/null +++ b/pymc3/tests/test_bart.py @@ -0,0 +1,69 @@ +import numpy as np + +import pymc3 as pm + + +def test_split_node(): + split_node = pm.distributions.tree.SplitNode(index=5, idx_split_variable=2, split_value=3.0) + assert split_node.index == 5 + assert split_node.idx_split_variable == 2 + assert split_node.split_value == 3.0 + assert split_node.depth == 2 + assert split_node.get_idx_parent_node() == 2 + assert split_node.get_idx_left_child() == 11 + assert split_node.get_idx_right_child() == 12 + + +def test_leaf_node(): + leaf_node = pm.distributions.tree.LeafNode(index=5, value=3.14, idx_data_points=[1, 2, 3]) + assert leaf_node.index == 5 + assert np.array_equal(leaf_node.idx_data_points, [1, 2, 3]) + assert leaf_node.value == 3.14 + assert leaf_node.get_idx_parent_node() == 2 + assert leaf_node.get_idx_left_child() == 11 + assert leaf_node.get_idx_right_child() == 
12 + + +def test_model(): + X = np.linspace(7, 15, 100) + Y = np.sin(np.random.normal(X, 0.2)) + 3 + X = X[:, None] + + with pm.Model() as model: + sigma = pm.HalfNormal("sigma", 1) + mu = pm.BART("mu", X, Y, m=50) + y = pm.Normal("y", mu, sigma, observed=Y) + idata = pm.sample() + mean = idata.posterior["mu"].stack(samples=("chain", "draw")).mean("samples") + + np.testing.assert_allclose(mean, Y, 0.5) + + Y = np.repeat([0, 1], 50) + with pm.Model() as model: + mu_ = pm.BART("mu_", X, Y, m=50) + mu = pm.Deterministic("mu", pm.math.invlogit(mu_)) + y = pm.Bernoulli("y", mu, observed=Y) + idata = pm.sample() + mean = idata.posterior["mu"].stack(samples=("chain", "draw")).mean("samples") + + np.testing.assert_allclose(mean, Y, atol=0.5) + + +def test_bart_vi(): + X = np.random.normal(0, 1, size=(3, 250)).T + Y = np.random.normal(0, 1, size=250) + X[:, 0] = np.random.normal(Y, 0.1) + + with pm.Model() as model: + mu = pm.BART("mu", X, Y, m=10) + sigma = pm.HalfNormal("sigma", 1) + y = pm.Normal("y", mu, sigma, observed=Y) + idata = pm.sample(random_seed=3415, chains=1) + var_imp = ( + idata.sample_stats["variable_inclusion"] + .stack(samples=("chain", "draw")) + .mean("samples") + ) + var_imp /= var_imp.sum() + assert var_imp[0] > var_imp[1:].sum() + np.testing.assert_almost_equal(var_imp.sum(), 1) diff --git a/pymc3/tests/test_sampling.py b/pymc3/tests/test_sampling.py index 4fd37cf70d..ffa20bfd68 100644 --- a/pymc3/tests/test_sampling.py +++ b/pymc3/tests/test_sampling.py @@ -174,25 +174,6 @@ def test_trace_report(self, step_cls, discard): assert trace.report.n_draws == 100 assert isinstance(trace.report.t_sampling, float) - def test_bart_vi(self): - X = np.random.normal(0, 1, size=(3, 250)).T - Y = np.random.normal(0, 1, size=250) - X[:, 0] = np.random.normal(Y, 0.1) - - with pm.Model() as model: - mu = pm.BART("mu", X, Y, m=10) - sigma = pm.HalfNormal("sigma", 1) - y = pm.Normal("y", mu, sigma, observed=Y) - idata = pm.sample(random_seed=3415, chains=1) - var_imp = ( - idata.sample_stats["variable_inclusion"] - .stack(samples=("chain", "draw")) - .mean("samples") - ) - var_imp /= var_imp.sum() - assert var_imp[0] > var_imp[1:].sum() - npt.assert_almost_equal(var_imp.sum(), 1) - def test_return_inferencedata(self): with self.model: kwargs = dict(draws=100, tune=50, cores=1, chains=2, step=pm.Metropolis()) From c1c3a0b3574240b5dc5b8ff2f865b29bd1741342 Mon Sep 17 00:00:00 2001 From: aloctavodia Date: Thu, 19 Aug 2021 16:53:45 +0300 Subject: [PATCH 21/26] fix number of chains --- pymc3/tests/test_bart.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pymc3/tests/test_bart.py b/pymc3/tests/test_bart.py index 17d0e78f25..cf697fa2b7 100644 --- a/pymc3/tests/test_bart.py +++ b/pymc3/tests/test_bart.py @@ -33,7 +33,7 @@ def test_model(): sigma = pm.HalfNormal("sigma", 1) mu = pm.BART("mu", X, Y, m=50) y = pm.Normal("y", mu, sigma, observed=Y) - idata = pm.sample() + idata = pm.sample(chains=4) mean = idata.posterior["mu"].stack(samples=("chain", "draw")).mean("samples") np.testing.assert_allclose(mean, Y, 0.5) @@ -43,7 +43,7 @@ def test_model(): mu_ = pm.BART("mu_", X, Y, m=50) mu = pm.Deterministic("mu", pm.math.invlogit(mu_)) y = pm.Bernoulli("y", mu, observed=Y) - idata = pm.sample() + idata = pm.sample(chains=4) mean = idata.posterior["mu"].stack(samples=("chain", "draw")).mean("samples") np.testing.assert_allclose(mean, Y, atol=0.5) From 9ea259ad7d14649b5f143fbe97600332d949a7a3 Mon Sep 17 00:00:00 2001 From: aloctavodia Date: Thu, 19 Aug 2021 17:17:32 +0300 Subject: 
[PATCH 22/26] revert test --- pymc3/tests/test_bart.py | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/pymc3/tests/test_bart.py b/pymc3/tests/test_bart.py index cf697fa2b7..d7567342b0 100644 --- a/pymc3/tests/test_bart.py +++ b/pymc3/tests/test_bart.py @@ -24,31 +24,6 @@ def test_leaf_node(): assert leaf_node.get_idx_right_child() == 12 -def test_model(): - X = np.linspace(7, 15, 100) - Y = np.sin(np.random.normal(X, 0.2)) + 3 - X = X[:, None] - - with pm.Model() as model: - sigma = pm.HalfNormal("sigma", 1) - mu = pm.BART("mu", X, Y, m=50) - y = pm.Normal("y", mu, sigma, observed=Y) - idata = pm.sample(chains=4) - mean = idata.posterior["mu"].stack(samples=("chain", "draw")).mean("samples") - - np.testing.assert_allclose(mean, Y, 0.5) - - Y = np.repeat([0, 1], 50) - with pm.Model() as model: - mu_ = pm.BART("mu_", X, Y, m=50) - mu = pm.Deterministic("mu", pm.math.invlogit(mu_)) - y = pm.Bernoulli("y", mu, observed=Y) - idata = pm.sample(chains=4) - mean = idata.posterior["mu"].stack(samples=("chain", "draw")).mean("samples") - - np.testing.assert_allclose(mean, Y, atol=0.5) - - def test_bart_vi(): X = np.random.normal(0, 1, size=(3, 250)).T Y = np.random.normal(0, 1, size=250) From 6cebc1055d541f4aa3538fa18d6b522250c35707 Mon Sep 17 00:00:00 2001 From: aloctavodia Date: Mon, 23 Aug 2021 15:03:02 +0300 Subject: [PATCH 23/26] clean code, refactor and small speed-up --- pymc3/step_methods/pgbart.py | 88 +++++++++++++++++++----------------- 1 file changed, 47 insertions(+), 41 deletions(-) diff --git a/pymc3/step_methods/pgbart.py b/pymc3/step_methods/pgbart.py index fab674b20a..351f1ae8a2 100644 --- a/pymc3/step_methods/pgbart.py +++ b/pymc3/step_methods/pgbart.py @@ -61,7 +61,7 @@ class PGBART(ArrayStepShared): generates_stats = True stats_dtypes = [{"variable_inclusion": np.ndarray}] - def __init__(self, vars=None, num_particles=10, max_stages=5000, chunk="auto", model=None): + def __init__(self, vars=None, num_particles=10, max_stages=100, chunk="auto", model=None): _log.warning("BART is experimental. 
Use with caution.") model = modelcontext(model) initial_values = model.initial_point @@ -78,7 +78,8 @@ def __init__(self, vars=None, num_particles=10, max_stages=5000, chunk="auto", m self.init_mean = self.Y.mean() # if data is binary - if np.all(np.unique(self.Y) == [0, 1]): + Y_unique = np.unique(self.Y) + if Y_unique.size == 2 and np.all(Y_unique == [0, 1]): self.mu_std = 6 / (self.k * self.m ** 0.5) # maybe we need to check for count data else: @@ -97,6 +98,7 @@ def __init__(self, vars=None, num_particles=10, max_stages=5000, chunk="auto", m self.mean = fast_mean() self.normal = NormalSampler() self.prior_prob_leaf_node = compute_prior_probability(self.alpha) + self.ssv = SampleSplittingVariable(self.split_prior) self.tune = True self.idx = 0 @@ -120,7 +122,6 @@ def __init__(self, vars=None, num_particles=10, max_stages=5000, chunk="auto", m self.a_tree.tree_id = i p = ParticleTree( self.a_tree, - self.prior_prob_leaf_node, self.init_log_weight, self.init_likelihood, ) @@ -132,7 +133,6 @@ def astep(self, q: RaveledVars) -> Tuple[RaveledVars, List[Dict[str, Any]]]: sum_trees_output = q.data variable_inclusion = np.zeros(self.num_variates, dtype="int") - self.ssv = SampleSplittingVariable(self.split_prior) if self.idx == self.m: self.idx = 0 @@ -140,21 +140,24 @@ def astep(self, q: RaveledVars) -> Tuple[RaveledVars, List[Dict[str, Any]]]: for idx in range(self.idx, self.idx + self.chunk): if idx >= self.m: break - self.idx += 1 tree = self.all_particles[idx].tree sum_trees_output_noi = sum_trees_output - tree.predict_output() + self.idx += 1 # Generate an initial set of SMC particles # at the end of the algorithm we return one of these particles as the new tree particles = self.init_particles(tree.tree_id) - for t in range(1, self.max_stages): + for t in range(self.max_stages): # Get old particle at stage t - particles[0] = self.get_old_tree_particle(tree.tree_id, t) + if t > 0: + particles[0] = self.get_old_tree_particle(tree.tree_id, t) # sample each particle (try to grow each tree) - for c in range(1, self.num_particles): - particles[c].sample_tree_sequential( + compute_logp = [True] + for p in particles[1:]: + clp = p.sample_tree_sequential( self.ssv, self.available_predictors, + self.prior_prob_leaf_node, self.X, self.missing_data, sum_trees_output, @@ -163,34 +166,34 @@ def astep(self, q: RaveledVars) -> Tuple[RaveledVars, List[Dict[str, Any]]]: self.normal, self.mu_std, ) + compute_logp.append(clp) # Update weights. 
Since the prior is used as the proposal, the weights
             # are updated additively as the difference between the new and old log_likelihoods
-            for p in particles:
-                new_likelihood = self.likelihood_logp(
-                    sum_trees_output_noi + p.tree.predict_output()
-                )
-                p.log_weight += new_likelihood - p.old_likelihood_logp
-                p.old_likelihood_logp = new_likelihood
+            for clp, p in zip(compute_logp, particles):
+                if clp:  # Compute the likelihood when p has changed from the previous iteration
+                    new_likelihood = self.likelihood_logp(
+                        sum_trees_output_noi + p.tree.predict_output()
+                    )
+                    p.log_weight += new_likelihood - p.old_likelihood_logp
+                    p.old_likelihood_logp = new_likelihood
             # Normalize weights
             W_t, normalized_weights = self.normalize(particles)

             # Resample all but first particle
             re_n_w = normalized_weights[1:] / normalized_weights[1:].sum()
-
             new_indices = np.random.choice(self.indices, size=len(self.indices), p=re_n_w)
             particles[1:] = particles[new_indices]

             # Set the new weights
             for p in particles:
                 p.log_weight = W_t

             # Check if particles can keep growing, otherwise stop iterating
-            non_available_nodes_for_expansion = np.ones(self.num_particles - 1)
-            for c in range(1, self.num_particles):
-                if len(particles[c].expansion_nodes) != 0:
-                    non_available_nodes_for_expansion[c - 1] = 0
-            if np.all(non_available_nodes_for_expansion):
+            non_available_nodes_for_expansion = []
+            for p in particles[1:]:
+                if p.expansion_nodes:
+                    non_available_nodes_for_expansion.append(0)
+            if all(non_available_nodes_for_expansion):
                 break

             # Get the new tree and update
@@ -204,6 +207,7 @@ def astep(self, q: RaveledVars) -> Tuple[RaveledVars, List[Dict[str, Any]]]:
             if self.tune:
                 for index in new_particle.used_variates:
                     self.split_prior[index] += 1
+                self.ssv = SampleSplittingVariable(self.split_prior)
             else:
                 self.iter += 1
@@ -253,14 +257,16 @@ def init_particles(self, tree_id):
         """
        Initialize particles
         """
-        particles = [self.get_old_tree_particle(tree_id, 0)]
+        p = self.get_old_tree_particle(tree_id, 0)
+        p.log_weight = self.init_log_weight
+        p.old_likelihood_logp = self.init_likelihood
+        particles = [p]

-        for _ in range(1, self.num_particles):
+        for _ in self.indices:
             self.a_tree.tree_id = tree_id
             particles.append(
                 ParticleTree(
                     self.a_tree,
-                    self.prior_prob_leaf_node,
                     self.init_log_weight,
                     self.init_likelihood,
                 )
             )

         return np.array(particles)
@@ -274,13 +280,12 @@ class ParticleTree:
     """
     Particle tree
     """

-    def __init__(self, tree, prior_prob_leaf_node, log_weight=0, likelihood=0):
+    def __init__(self, tree, log_weight, likelihood):
         self.tree = tree.copy()  # keeps the tree we care about at the moment
         self.expansion_nodes = [0]
         self.tree_history = [self.tree]
         self.expansion_nodes_history = [self.expansion_nodes]
         self.log_weight = log_weight
-        self.prior_prob_leaf_node = prior_prob_leaf_node
         self.old_likelihood_logp = likelihood
         self.used_variates = []

     def sample_tree_sequential(
         self,
         ssv,
         available_predictors,
+        prior_prob_leaf_node,
         X,
         missing_data,
         sum_trees_output,
         mean,
         m,
         normal,
         mu_std,
     ):
+        clp = False
         if self.expansion_nodes:
             index_leaf_node = self.expansion_nodes.pop(0)
             # Probability that this node will remain a leaf node
-            prob_leaf = self.prior_prob_leaf_node[self.tree[index_leaf_node].depth]
+            prob_leaf = prior_prob_leaf_node[self.tree[index_leaf_node].depth]

             if prob_leaf < np.random.random():
-                index_selected_predictor = grow_tree(
+                clp, index_selected_predictor = grow_tree(
                     self.tree,
index_leaf_node, ssv, @@ -315,21 +322,20 @@ def sample_tree_sequential( normal, mu_std, ) - if index_selected_predictor is not None: + if clp: new_indexes = self.tree.idx_leaf_nodes[-2:] self.expansion_nodes.extend(new_indexes) self.used_variates.append(index_selected_predictor) self.tree_history.append(self.tree) self.expansion_nodes_history.append(self.expansion_nodes) + return clp def set_particle_to_step(self, t): if len(self.tree_history) <= t: - self.tree = self.tree_history[-1] - self.expansion_nodes = self.expansion_nodes_history[-1] - else: - self.tree = self.tree_history[t] - self.expansion_nodes = self.expansion_nodes_history[t] + t = -1 + self.tree = self.tree_history[t] + self.expansion_nodes = self.expansion_nodes_history[t] def preprocess_XY(X, Y): @@ -410,7 +416,7 @@ def grow_tree( ] if available_splitting_values.size == 0: - return None + return False, None idx_selected_splitting_values = discrete_uniform_sampler(len(available_splitting_values)) selected_splitting_rule = available_splitting_values[idx_selected_splitting_values] @@ -443,7 +449,7 @@ def grow_tree( ) tree.grow_tree(index_leaf_node, new_split_node, new_left_node, new_right_node) - return index_selected_predictor + return True, index_selected_predictor def get_new_idx_data_points(current_split_node, idx_data_points, X): From efa8dc6cf78f975fe31c68c1df55cb7bfbaa46ab Mon Sep 17 00:00:00 2001 From: aloctavodia Date: Fri, 3 Sep 2021 14:38:20 +0300 Subject: [PATCH 24/26] test random --- pymc3/tests/test_bart.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/pymc3/tests/test_bart.py b/pymc3/tests/test_bart.py index d7567342b0..2dbeda89ef 100644 --- a/pymc3/tests/test_bart.py +++ b/pymc3/tests/test_bart.py @@ -1,5 +1,7 @@ import numpy as np +from numpy.random import RandomState + import pymc3 as pm @@ -33,7 +35,7 @@ def test_bart_vi(): mu = pm.BART("mu", X, Y, m=10) sigma = pm.HalfNormal("sigma", 1) y = pm.Normal("y", mu, sigma, observed=Y) - idata = pm.sample(random_seed=3415, chains=1) + idata = pm.sample(random_seed=3415) var_imp = ( idata.sample_stats["variable_inclusion"] .stack(samples=("chain", "draw")) @@ -42,3 +44,23 @@ def test_bart_vi(): var_imp /= var_imp.sum() assert var_imp[0] > var_imp[1:].sum() np.testing.assert_almost_equal(var_imp.sum(), 1) + + +def test_bart_random(): + X = np.random.normal(0, 1, size=(2, 50)).T + Y = np.random.normal(0, 1, size=50) + + with pm.Model() as model: + mu = pm.BART("mu", X, Y, m=10) + sigma = pm.HalfNormal("sigma", 1) + y = pm.Normal("y", mu, sigma, observed=Y) + idata = pm.sample(random_seed=3415, chains=1) + + rng = RandomState(12345) + pred_all = mu.owner.op.rng_fn(rng, size=2) + rng = RandomState(12345) + pred_first = mu.owner.op.rng_fn(rng, X_new=X[:10]) + + assert np.all(pred_first == pred_all[0, :10]) + assert pred_all.shape == (2, 50) + assert pred_first.shape == (10,) From 64496cd882193a3b53f7b669cbcf3b587cb56763 Mon Sep 17 00:00:00 2001 From: aloctavodia Date: Fri, 3 Sep 2021 14:55:24 +0300 Subject: [PATCH 25/26] test random --- pymc3/tests/test_bart.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pymc3/tests/test_bart.py b/pymc3/tests/test_bart.py index 2dbeda89ef..0a32de4d11 100644 --- a/pymc3/tests/test_bart.py +++ b/pymc3/tests/test_bart.py @@ -1,6 +1,7 @@ import numpy as np from numpy.random import RandomState +from numpy.testing import assert_almost_equal import pymc3 as pm @@ -61,6 +62,6 @@ def test_bart_random(): rng = RandomState(12345) pred_first = mu.owner.op.rng_fn(rng, X_new=X[:10]) 
- assert np.all(pred_first == pred_all[0, :10]) + assert_almost_equal(pred_first, pred_all[0, :10], decimal=4) assert pred_all.shape == (2, 50) assert pred_first.shape == (10,) From d67c9a37a9f1fcda6651e8a242eeb4c22ae80a30 Mon Sep 17 00:00:00 2001 From: aloctavodia Date: Fri, 3 Sep 2021 18:36:42 +0300 Subject: [PATCH 26/26] add missing data test --- pymc3/tests/test_bart.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pymc3/tests/test_bart.py b/pymc3/tests/test_bart.py index 0a32de4d11..5d221633a4 100644 --- a/pymc3/tests/test_bart.py +++ b/pymc3/tests/test_bart.py @@ -65,3 +65,15 @@ def test_bart_random(): assert_almost_equal(pred_first, pred_all[0, :10], decimal=4) assert pred_all.shape == (2, 50) assert pred_first.shape == (10,) + + +def test_missing_data(): + X = np.random.normal(0, 1, size=(2, 50)).T + Y = np.random.normal(0, 1, size=50) + X[10:20, 0] = np.nan + + with pm.Model() as model: + mu = pm.BART("mu", X, Y, m=10) + sigma = pm.HalfNormal("sigma", 1) + y = pm.Normal("y", mu, sigma, observed=Y) + idata = pm.sample(random_seed=3415)
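
Appendix (reviewer sketches, not part of the patch series). The splitting-variable sampler introduced in PATCH 09 and simplified in PATCH 17 is compact enough to exercise in isolation. A minimal sketch, assuming only NumPy: the class body is copied from the patch, while the import and the frequency check are illustrative additions.

import numpy as np

class SampleSplittingVariable:
    def __init__(self, alpha_prior):
        # Cumulative normalized weights, precomputed once per astep call.
        self.enu = list(enumerate(np.cumsum(alpha_prior / alpha_prior.sum())))

    def rvs(self):
        # Inverse-CDF sampling: return the first index whose cumulative
        # weight reaches the uniform draw.
        r = np.random.random()
        for i, v in self.enu:
            if r <= v:
                return i

ssv = SampleSplittingVariable(np.array([1.0, 2.0, 7.0]))
draws = [ssv.rvs() for _ in range(100_000)]
print(np.bincount(draws) / len(draws))  # approximately [0.1, 0.2, 0.7]

Because the counts in split_prior grow during tuning, the normalized weights concentrate on the covariates that trees actually use, which is the sparsity-inducing behavior the docstring describes.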
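The weight normalization in PGBART.normalize (PATCH 04 onward) can also be checked standalone. In this sketch the particle log weights are plain floats rather than ParticleTree attributes, and log_num_particles is passed explicitly; those are the only deviations from the method. Subtracting the maximum before exponentiating is the logsumexp trick the docstring mentions, and it keeps the softmax finite even for very negative log likelihoods.

import numpy as np

def normalize(log_w, log_num_particles):
    # logsumexp: factor out the largest term so np.exp cannot overflow
    # and the largest weight exponentiates to exactly 1.0
    log_w = np.asarray(log_w)
    log_w_max = log_w.max()
    w_ = np.exp(log_w - log_w_max)
    w_sum = w_.sum()
    W_t = log_w_max + np.log(w_sum) - log_num_particles
    normalized_weights = w_ / w_sum
    # stabilize weights to avoid assigning exactly zero probability to a particle
    normalized_weights += 1e-12
    return W_t, normalized_weights

W_t, weights = normalize([-1000.0, -1002.0, -1005.0], np.log(3))
print(W_t, weights)  # finite values; a naive softmax of these inputs underflows to 0/0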
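The docstring corrected in PATCH 06 claims that int(np.random.random() * upper_value) matches np.random.randint(upper_value) but is faster. The function below is verbatim from the patch; the timing harness is illustrative, and the exact speedup depends on the NumPy version and machine.

import timeit
import numpy as np

def discrete_uniform_sampler(upper_value):
    """Draw from the uniform distribution with bounds [0, upper_value)."""
    return int(np.random.random() * upper_value)

n = 200_000
t_plain = timeit.timeit(lambda: discrete_uniform_sampler(10), number=n)
t_randint = timeit.timeit(lambda: np.random.randint(10), number=n)
# randint pays per-call bounds and dtype handling, so the plain version
# is usually a few times faster for scalar draws
print(f"plain: {t_plain:.2f}s  randint: {t_randint:.2f}s")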
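Finally, the variable-importance reduction asserted in test_bart_vi (PATCHES 20 and 24) is ordinary array arithmetic once the variable_inclusion stats are out of the trace. The toy counts below stand in for idata.sample_stats["variable_inclusion"]; the reduction itself mirrors the test.

import numpy as np

# One count vector per posterior draw, as recorded by the PGBART step sampler.
variable_inclusion = np.array([[8, 1, 1], [9, 0, 1], [7, 2, 1]])

var_imp = variable_inclusion.mean(axis=0)  # average counts over draws
var_imp = var_imp / var_imp.sum()          # normalize to proportions
print(var_imp)  # the first covariate dominates, as the test asserts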