Speed-up Single Leaf Scoring (#13)
* Speed-up Single Leaf Scoring

Also updates tests and docs.
reidjohnson authored Oct 7, 2023
1 parent 3f110ad · commit b9a764d
Showing 3 changed files with 48 additions and 13 deletions.
docs/user_guide.rst (2 changes: 1 addition & 1 deletion)
@@ -49,7 +49,7 @@ Let's fit a quantile forest on a simple regression dataset::
     >>> reg.fit(X_train, y_train)
     RandomForestQuantileRegressor(...)
 
-During model initialization, the parameter `max_samples_leaf` can be specified, which determines the maximum number of samples per leaf node to retain. If `max_samples_leaf` is smaller than the number of samples in a given leaf node, then a subset of values are randomly selected. By default, the model retains one randomly selected sample per leaf node (`max_samples_leaf = 1`); all samples can be retained by specifying `max_samples_leaf = None`. Note that the number of retained samples can materially impact the size of the model object.
+During model initialization, the parameter `max_samples_leaf` can be specified, which determines the maximum number of samples per leaf node to retain. If `max_samples_leaf` is smaller than the number of samples in a given leaf node, then a subset of values are randomly selected. By default, the model retains one randomly selected sample per leaf node (`max_samples_leaf = 1`), which enables the use of optimizations at prediction time that are not available when a variable number of samples may be retained per leaf. All samples can be retained by specifying `max_samples_leaf = None`. Note that the number of retained samples can materially impact the size of the model object.
 
 A notable advantage of quantile forests is that they can be fit once, while arbitrary quantiles can be estimated at prediction time. Accordingly, since the quantiles can be specified at prediction time, the model accepts an optional parameter during the call to the `predict` method, which can be a float or list of floats that specify the empirical quantiles to return::
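
Note: to make the amended guidance concrete, a minimal usage sketch follows. The toy dataset and variable names are illustrative, not taken from the repository's docs; only `max_samples_leaf` and the `quantiles` argument to `predict` come from the changes above.

    from quantile_forest import RandomForestQuantileRegressor
    from sklearn.datasets import make_regression
    from sklearn.model_selection import train_test_split

    # Illustrative data; any regression dataset works here.
    X, y = make_regression(n_samples=500, n_features=4, noise=1.0, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # Default: one randomly retained sample per leaf (max_samples_leaf=1),
    # the setting that enables the prediction-time fast path.
    reg = RandomForestQuantileRegressor(max_samples_leaf=1, random_state=0)
    reg.fit(X_train, y_train)

    # Quantiles are specified at prediction time, after a single fit.
    y_median = reg.predict(X_test, quantiles=0.5)
    y_bands = reg.predict(X_test, quantiles=[0.1, 0.5, 0.9])

    # Retaining all samples per leaf uses the general code path and
    # increases the size of the model object.
    reg_full = RandomForestQuantileRegressor(max_samples_leaf=None, random_state=0)
    reg_full.fit(X_train, y_train)
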
quantile_forest/_quantile_forest.py (39 changes: 30 additions & 9 deletions)
@@ -477,15 +477,36 @@ def predict(
         X_leaves = self.apply(X)
         X_indices = None
 
-        y_pred = self.forest_.predict(
-            quantiles,
-            X_leaves,
-            X_indices,
-            interpolation,
-            weighted_leaves,
-            weighted_quantile,
-            aggregate_leaves_first,
-        )
+        if self.max_samples_leaf == 1:  # optimize for single-sample-per-leaf performance
+            leaf_values = np.empty((len(X), self.n_estimators))
+            y_train_leaves = np.asarray(self.forest_.y_train_leaves)
+            y_train = np.asarray(self.forest_.y_train)
+            for tree in range(self.n_estimators):
+                if X_indices is None:
+                    train_indices = y_train_leaves[tree, X_leaves[:, tree], 0]
+                else:
+                    unsampled_indices = X_indices[:, tree] == 1
+                    unsampled_leaves = X_leaves[unsampled_indices, tree]
+                    train_indices = np.zeros(len(X), dtype=int)
+                    train_indices[unsampled_indices] = y_train_leaves[tree, unsampled_leaves, 0]
+                leaf_values[:, tree] = y_train[train_indices - 1]
+                leaf_values[train_indices == 0, tree] = np.nan
+            if len(quantiles) == 1 and quantiles[0] == -1:  # calculate mean
+                func = np.mean if X_indices is None else np.nanmean
+                y_pred = np.expand_dims(func(leaf_values, axis=1), axis=1)
+            else:  # calculate quantiles
+                func = np.quantile if X_indices is None else np.nanquantile
+                y_pred = func(leaf_values, quantiles, axis=1).T
+        else:
+            y_pred = self.forest_.predict(
+                quantiles,
+                X_leaves,
+                X_indices,
+                interpolation,
+                weighted_leaves,
+                weighted_quantile,
+                aggregate_leaves_first,
+            )
 
         if y_pred.shape[1] == 1:
             y_pred = np.squeeze(y_pred, axis=1)
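
Note: the new branch relies on an indexing convention that is only implicit in the diff. The `train_indices - 1` shift and the `train_indices == 0` masking suggest that `y_train_leaves[tree, leaf, 0]` holds a 1-based index into `y_train`, with 0 meaning "no retained sample". A self-contained sketch of that mechanism, using made-up toy arrays:

    import numpy as np

    # Assumed convention (inferred from the diff): y_train_leaves[tree, leaf, 0]
    # is a 1-based index into y_train; 0 marks a leaf with no retained sample.
    y_train = np.array([10.0, 20.0, 30.0, 40.0])
    y_train_leaves = np.array(
        [
            [[0], [1], [3]],  # tree 0: leaf 0 is empty
            [[2], [0], [4]],  # tree 1: leaf 1 is empty
        ]
    )
    X_leaves = np.array([[0, 0], [2, 2]])  # leaf assignment per (test row, tree)

    n_samples, n_trees = X_leaves.shape
    leaf_values = np.empty((n_samples, n_trees))
    for tree in range(n_trees):
        train_indices = y_train_leaves[tree, X_leaves[:, tree], 0]
        leaf_values[:, tree] = y_train[train_indices - 1]  # undo the 1-based shift
        leaf_values[train_indices == 0, tree] = np.nan  # mask empty leaves

    # NaN-aware aggregation skips trees whose leaf had no retained sample.
    print(np.nanquantile(leaf_values, [0.5], axis=1).T)  # [[20.], [35.]]
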
quantile_forest/tests/test_quantile_forest.py (20 changes: 17 additions & 3 deletions)
@@ -201,7 +201,12 @@ def check_predict_quantiles_toy(name):
     if name == "RandomForestQuantileRegressor":
         for oob_score in [False, True]:
             # Check weighted and unweighted leaves.
-            est = ForestRegressor(n_estimators=20, max_depth=1, random_state=0)
+            est = ForestRegressor(
+                n_estimators=20,
+                max_depth=1,
+                max_samples_leaf=None,
+                random_state=0,
+            )
             est.fit(X, y)
             y_pred1 = est.predict(
                 X,
@@ -727,6 +732,7 @@ def test_oob_samples_duplicates(name):

 def check_predict_oob(
     name,
+    max_samples_leaf,
     quantiles,
     weighted_quantile,
     aggregate_leaves_first,
@@ -737,7 +743,13 @@

     ForestRegressor = FOREST_REGRESSORS[name]
 
-    est = ForestRegressor(n_estimators=20, bootstrap=True, oob_score=True, random_state=0)
+    est = ForestRegressor(
+        n_estimators=20,
+        max_samples_leaf=max_samples_leaf,
+        bootstrap=True,
+        oob_score=True,
+        random_state=0,
+    )
     est.fit(X, y)
 
     n_quantiles = None
@@ -907,16 +919,18 @@ def check_predict_oob(


 @pytest.mark.parametrize("name", FOREST_REGRESSORS)
+@pytest.mark.parametrize("max_samples_leaf", [None, 1])
 @pytest.mark.parametrize("quantiles", [None, "mean", 0.5, [0.2, 0.5, 0.8]])
 @pytest.mark.parametrize("weighted_quantile", [True, False])
 @pytest.mark.parametrize("aggregate_leaves_first", [True, False])
 def test_predict_oob(
     name,
+    max_samples_leaf,
     quantiles,
     weighted_quantile,
     aggregate_leaves_first,
 ):
-    check_predict_oob(name, quantiles, weighted_quantile, aggregate_leaves_first)
+    check_predict_oob(name, max_samples_leaf, quantiles, weighted_quantile, aggregate_leaves_first)
 
 
 def check_quantile_ranks_oob(name):
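
Note: the new `max_samples_leaf` parametrization doubles the OOB test matrix so that both the general path and the single-leaf fast path are exercised. A hypothetical spot-check in the same spirit (the diabetes dataset is illustrative, and the `oob_score` keyword to `predict` is assumed from the surrounding OOB tests rather than shown in this diff):

    from sklearn.datasets import load_diabetes
    from quantile_forest import RandomForestQuantileRegressor

    X, y = load_diabetes(return_X_y=True)

    for max_samples_leaf in [None, 1]:  # mirrors the new test parametrization
        est = RandomForestQuantileRegressor(
            n_estimators=20,
            max_samples_leaf=max_samples_leaf,
            bootstrap=True,
            oob_score=True,
            random_state=0,
        )
        est.fit(X, y)
        # OOB predictions should work on both code paths.
        y_pred = est.predict(X, quantiles=0.5, oob_score=True)
        assert len(y_pred) == len(X)
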
