Speed-up Single Leaf Scoring (#13)
* Speed-up Single Leaf Scoring

Also updates tests and docs.
reidjohnson authored Oct 7, 2023
1 parent 3f110ad · commit b9a764d
Showing 3 changed files with 48 additions and 13 deletions.
docs/user_guide.rst (2 changes: 1 addition & 1 deletion)
@@ -49,7 +49,7 @@ Let's fit a quantile forest on a simple regression dataset::
     >>> reg.fit(X_train, y_train)
     RandomForestQuantileRegressor(...)
 
-During model initialization, the parameter `max_samples_leaf` can be specified, which determines the maximum number of samples per leaf node to retain. If `max_samples_leaf` is smaller than the number of samples in a given leaf node, then a subset of values are randomly selected. By default, the model retains one randomly selected sample per leaf node (`max_samples_leaf = 1`); all samples can be retained by specifying `max_samples_leaf = None`. Note that the number of retained samples can materially impact the size of the model object.
+During model initialization, the parameter `max_samples_leaf` can be specified, which determines the maximum number of samples per leaf node to retain. If `max_samples_leaf` is smaller than the number of samples in a given leaf node, then a subset of values are randomly selected. By default, the model retains one randomly selected sample per leaf node (`max_samples_leaf = 1`), which enables the use of optimizations at prediction time that are not available when a variable number of samples may be retained per leaf. All samples can be retained by specifying `max_samples_leaf = None`. Note that the number of retained samples can materially impact the size of the model object.
 
 A notable advantage of quantile forests is that they can be fit once, while arbitrary quantiles can be estimated at prediction time. Accordingly, since the quantiles can be specified at prediction time, the model accepts an optional parameter during the call to the `predict` method, which can be a float or list of floats that specify the empirical quantiles to return::
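
Note: to make the amended guidance concrete, a minimal usage sketch follows. The toy dataset and variable names are illustrative, not taken from the repository's docs; only `max_samples_leaf` and the `quantiles` argument to `predict` come from the changes above.

    from quantile_forest import RandomForestQuantileRegressor
    from sklearn.datasets import make_regression
    from sklearn.model_selection import train_test_split

    # Illustrative data; any regression dataset works here.
    X, y = make_regression(n_samples=500, n_features=4, noise=1.0, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # Default: one randomly retained sample per leaf (max_samples_leaf=1),
    # the setting that enables the prediction-time fast path.
    reg = RandomForestQuantileRegressor(max_samples_leaf=1, random_state=0)
    reg.fit(X_train, y_train)

    # Quantiles are specified at prediction time, after a single fit.
    y_median = reg.predict(X_test, quantiles=0.5)
    y_bands = reg.predict(X_test, quantiles=[0.1, 0.5, 0.9])

    # Retaining all samples per leaf uses the general code path and
    # increases the size of the model object.
    reg_full = RandomForestQuantileRegressor(max_samples_leaf=None, random_state=0)
    reg_full.fit(X_train, y_train)
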
quantile_forest/_quantile_forest.py (39 changes: 30 additions & 9 deletions)
@@ -477,15 +477,36 @@ def predict(
         X_leaves = self.apply(X)
         X_indices = None
 
-        y_pred = self.forest_.predict(
-            quantiles,
-            X_leaves,
-            X_indices,
-            interpolation,
-            weighted_leaves,
-            weighted_quantile,
-            aggregate_leaves_first,
-        )
+        if self.max_samples_leaf == 1:  # optimize for single-sample-per-leaf performance
+            leaf_values = np.empty((len(X), self.n_estimators))
+            y_train_leaves = np.asarray(self.forest_.y_train_leaves)
+            y_train = np.asarray(self.forest_.y_train)
+            for tree in range(self.n_estimators):
+                if X_indices is None:
+                    train_indices = y_train_leaves[tree, X_leaves[:, tree], 0]
+                else:
+                    unsampled_indices = X_indices[:, tree] == 1
+                    unsampled_leaves = X_leaves[unsampled_indices, tree]
+                    train_indices = np.zeros(len(X), dtype=int)
+                    train_indices[unsampled_indices] = y_train_leaves[tree, unsampled_leaves, 0]
+                leaf_values[:, tree] = y_train[train_indices - 1]
+                leaf_values[train_indices == 0, tree] = np.nan
+            if len(quantiles) == 1 and quantiles[0] == -1:  # calculate mean
+                func = np.mean if X_indices is None else np.nanmean
+                y_pred = np.expand_dims(func(leaf_values, axis=1), axis=1)
+            else:  # calculate quantiles
+                func = np.quantile if X_indices is None else np.nanquantile
+                y_pred = func(leaf_values, quantiles, axis=1).T
+        else:
+            y_pred = self.forest_.predict(
+                quantiles,
+                X_leaves,
+                X_indices,
+                interpolation,
+                weighted_leaves,
+                weighted_quantile,
+                aggregate_leaves_first,
+            )
 
         if y_pred.shape[1] == 1:
             y_pred = np.squeeze(y_pred, axis=1)
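
Note: the new branch relies on an indexing convention that is only implicit in the diff. The `train_indices - 1` shift and the `train_indices == 0` masking suggest that `y_train_leaves[tree, leaf, 0]` holds a 1-based index into `y_train`, with 0 meaning "no retained sample". A self-contained sketch of that mechanism, using made-up toy arrays:

    import numpy as np

    # Assumed convention (inferred from the diff): y_train_leaves[tree, leaf, 0]
    # is a 1-based index into y_train; 0 marks a leaf with no retained sample.
    y_train = np.array([10.0, 20.0, 30.0, 40.0])
    y_train_leaves = np.array(
        [
            [[0], [1], [3]],  # tree 0: leaf 0 is empty
            [[2], [0], [4]],  # tree 1: leaf 1 is empty
        ]
    )
    X_leaves = np.array([[0, 0], [2, 2]])  # leaf assignment per (test row, tree)

    n_samples, n_trees = X_leaves.shape
    leaf_values = np.empty((n_samples, n_trees))
    for tree in range(n_trees):
        train_indices = y_train_leaves[tree, X_leaves[:, tree], 0]
        leaf_values[:, tree] = y_train[train_indices - 1]  # undo the 1-based shift
        leaf_values[train_indices == 0, tree] = np.nan  # mask empty leaves

    # NaN-aware aggregation skips trees whose leaf had no retained sample.
    print(np.nanquantile(leaf_values, [0.5], axis=1).T)  # [[20.], [35.]]
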
quantile_forest/tests/test_quantile_forest.py (20 changes: 17 additions & 3 deletions)
@@ -201,7 +201,12 @@ def check_predict_quantiles_toy(name):
     if name == "RandomForestQuantileRegressor":
         for oob_score in [False, True]:
             # Check weighted and unweighted leaves.
-            est = ForestRegressor(n_estimators=20, max_depth=1, random_state=0)
+            est = ForestRegressor(
+                n_estimators=20,
+                max_depth=1,
+                max_samples_leaf=None,
+                random_state=0,
+            )
             est.fit(X, y)
             y_pred1 = est.predict(
                 X,
@@ -727,6 +732,7 @@ def test_oob_samples_duplicates(name):

 def check_predict_oob(
     name,
+    max_samples_leaf,
     quantiles,
     weighted_quantile,
     aggregate_leaves_first,
@@ -737,7 +743,13 @@

     ForestRegressor = FOREST_REGRESSORS[name]
 
-    est = ForestRegressor(n_estimators=20, bootstrap=True, oob_score=True, random_state=0)
+    est = ForestRegressor(
+        n_estimators=20,
+        max_samples_leaf=max_samples_leaf,
+        bootstrap=True,
+        oob_score=True,
+        random_state=0,
+    )
     est.fit(X, y)
 
     n_quantiles = None
@@ -907,16 +919,18 @@ def check_predict_oob(


 @pytest.mark.parametrize("name", FOREST_REGRESSORS)
+@pytest.mark.parametrize("max_samples_leaf", [None, 1])
 @pytest.mark.parametrize("quantiles", [None, "mean", 0.5, [0.2, 0.5, 0.8]])
 @pytest.mark.parametrize("weighted_quantile", [True, False])
 @pytest.mark.parametrize("aggregate_leaves_first", [True, False])
 def test_predict_oob(
     name,
+    max_samples_leaf,
     quantiles,
     weighted_quantile,
     aggregate_leaves_first,
 ):
-    check_predict_oob(name, quantiles, weighted_quantile, aggregate_leaves_first)
+    check_predict_oob(name, max_samples_leaf, quantiles, weighted_quantile, aggregate_leaves_first)
 
 
 def check_quantile_ranks_oob(name):
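
Note: the new `max_samples_leaf` parametrization doubles the OOB test matrix so that both the general path and the single-leaf fast path are exercised. A hypothetical spot-check in the same spirit (the diabetes dataset is illustrative, and the `oob_score` keyword to `predict` is assumed from the surrounding OOB tests rather than shown in this diff):

    from sklearn.datasets import load_diabetes
    from quantile_forest import RandomForestQuantileRegressor

    X, y = load_diabetes(return_X_y=True)

    for max_samples_leaf in [None, 1]:  # mirrors the new test parametrization
        est = RandomForestQuantileRegressor(
            n_estimators=20,
            max_samples_leaf=max_samples_leaf,
            bootstrap=True,
            oob_score=True,
            random_state=0,
        )
        est.fit(X, y)
        # OOB predictions should work on both code paths.
        y_pred = est.predict(X, quantiles=0.5, oob_score=True)
        assert len(y_pred) == len(X)
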
