Fix multi-target quantile ranks (#84)
reidjohnson authored Sep 4, 2024
1 parent 4df81ae commit 087d8e9
Showing 3 changed files with 45 additions and 24 deletions.
2 changes: 1 addition & 1 deletion quantile_forest/_quantile_forest.py
@@ -879,7 +879,7 @@ def quantile_ranks(
             y = np.expand_dims(y, axis=1)

         y_ranks = self.forest_.quantile_ranks(
-            y.astype(np.float64).T,
+            y.astype(np.float64),
             X_leaves,
             X_indices,
             kind,
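The one-line change above removes the transpose, so the scores reach the Cython backend with samples on axis 0 and outputs on axis 1, which is how the updated quantile_ranks reads the array shape. A minimal NumPy sketch of that shape convention (the values are illustrative and not part of the commit):

import numpy as np

# Illustrative single-output scores; a 1-D y is expanded to 2-D first,
# as in the surrounding quantile_ranks wrapper shown above.
y = np.asarray([3.1, 2.7, 5.0])
if y.ndim == 1:
    y = np.expand_dims(y, axis=1)  # shape (n_samples, 1)

# With the transpose removed, the backend reads both dimensions directly:
n_samples, n_outputs = y.shape     # 3 samples, 1 output
print(n_samples, n_outputs)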
47 changes: 30 additions & 17 deletions quantile_forest/_quantile_forest_fast.pyx
@@ -481,8 +481,10 @@ cdef class QuantileForest:
     Attributes
     ----------
-    y_train : array-like of shape (n_samples, n_outputs)
-        Training target values. Assumes values are sorted in ascending order.
+    y_train : array-like of shape (n_outputs, n_samples)
+        Training target values. Assumes that the values are sorted in
+        ascending order for each output. The outputs are assigned to the first
+        dimension (n_outputs) of the array for ease of indexing.
     y_train_leaves : array-like of shape \
         (n_estimators, n_leaves, n_outputs, n_indices)
@@ -600,9 +602,11 @@ cdef class QuantileForest:
         preds : array-like of shape (n_samples, n_outputs, n_quantiles)
             Quantiles or means for samples as floats.
         """
-        cdef intp_t n_quantiles, n_samples, n_trees, n_outputs, n_train
+        cdef intp_t n_samples, n_outputs, n_quantiles
+        cdef intp_t n_trees, n_train, max_idx
         cdef intp_t i, j, k, l
         cdef bint use_mean
+        cdef list[char*] interpolations
         cdef vector[double] leaf_samples
         cdef vector[double] leaf_weights
         cdef vector[vector[intp_t]] train_indices
@@ -617,11 +621,11 @@ cdef class QuantileForest:
         cdef cnp.ndarray[float64_t, ndim=3] preds
         cdef double[:, :, :] preds_view

-        n_quantiles = len(quantiles)
         n_samples = X_leaves.shape[0]
-        n_trees = X_leaves.shape[1]
-
         n_outputs = self.y_train.size()
+        n_quantiles = len(quantiles)
+
+        n_trees = X_leaves.shape[1]
         n_train = self.y_train[0].size()
         max_idx = self.y_train_leaves.shape[3]

@@ -641,8 +645,8 @@ cdef class QuantileForest:
                     "must be None."
                 )

-        interps = [b"linear", b"lower", b"higher", b"midpoint", b"nearest"]
-        if interpolation not in interps:
+        interpolations = [b"linear", b"lower", b"higher", b"midpoint", b"nearest"]
+        if interpolation not in interpolations:
             raise ValueError(f"Invalid interpolation method {interpolation}.")

         # Initialize NumPy array with NaN values and get view for nogil.
@@ -815,8 +819,10 @@ cdef class QuantileForest:
         ranks : array-like of shape (n_samples, n_outputs)
             Quantiles ranks in range [0, 1] for samples as floats.
         """
-        cdef intp_t n_samples, n_trees, n_outputs
+        cdef intp_t n_samples, n_outputs
+        cdef intp_t n_trees, max_idx
         cdef intp_t i, j
+        cdef list[char*] kinds
         cdef vector[double] leaf_samples
         cdef vector[vector[intp_t]] train_indices
         cdef intp_t idx, train_idx
@@ -825,11 +831,16 @@ cdef class QuantileForest:
         cdef cnp.ndarray[float64_t, ndim=2] ranks
         cdef double[:, :] ranks_view

-        n_outputs = y_scores.shape[0]
+        n_samples = y_scores.shape[0]
+        n_outputs = y_scores.shape[1]

-        n_samples = X_leaves.shape[0]
-        n_trees = X_leaves.shape[1]
+        if n_outputs != <intp_t>self.y_train.size():
+            raise ValueError(
+                f"Number of target outputs in training data ({self.y_train.size()}) does not "
+                f"match the number of outputs in test data being scored ({n_outputs})."
+            )

+        n_trees = X_leaves.shape[1]
         max_idx = self.y_train_leaves.shape[3]

         if X_indices is not None:
@@ -856,7 +867,6 @@ cdef class QuantileForest:
                 for j in range(n_outputs):
                     for k in range(<intp_t>(train_indices.size())):
                         train_indices[k].clear()
-                    leaf_samples.clear()
                     leaf_preds.clear()

                     # Accumulate training indices across leaves for each tree.
@@ -875,13 +885,15 @@ cdef class QuantileForest:
                         if train_indices[k].size() == 0:
                             continue

+                        leaf_samples.clear()
+
                         # Get training target values associated with indices.
                         for train_idx in train_indices[k]:
                             if train_idx != 0:
                                 leaf_samples.push_back(self.y_train[j][train_idx - 1])

                         # Calculate rank.
-                        pred = calc_quantile_rank(leaf_samples, y_scores[j, i], kind=kind)
+                        pred = calc_quantile_rank(leaf_samples, y_scores[i, j], kind=kind)
                         if pred != -1:
                             leaf_preds.push_back(pred)

@@ -931,19 +943,20 @@ cdef class QuantileForest:
         proximities : list of dicts
             Dicts mapping sample indices to proximity counts.
         """
-        cdef vector[map[intp_t, intp_t]] proximities
-        cdef intp_t n_samples, n_trees, n_train
+        cdef intp_t n_samples
+        cdef intp_t n_trees, n_train, max_idx
         cdef intp_t i, j
         cdef vector[intp_t] train_indices
         cdef vector[int] leaf_weights
         cdef intp_t train_idx
         cdef int cutoff, train_wgt
         cdef priority_queue[pair[int, intp_t]] queue
         cdef pair[int, intp_t] entry
+        cdef vector[map[intp_t, intp_t]] proximities

         n_samples = X_leaves.shape[0]
-        n_trees = X_leaves.shape[1]

+        n_trees = X_leaves.shape[1]
         n_train = self.y_train[0].size()
         max_idx = self.y_train_leaves.shape[3]

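For context on what the ranks computed above represent: calc_quantile_rank is not shown in this diff, so the sketch below is only a rough stand-in that assumes scipy.stats.percentileofscore-style semantics scaled to [0, 1], with -1 as the empty-leaf sentinel checked by the "if pred != -1" branch; the library's actual Cython helper may differ in details.

import numpy as np

def quantile_rank(samples, score, kind="rank"):
    # Rank of `score` within `samples`, scaled to [0, 1]; returns -1.0 for an
    # empty sample set, mirroring the sentinel checked in the diff above.
    a = np.asarray(samples, dtype=float)
    n = a.size
    if n == 0:
        return -1.0
    left = np.count_nonzero(a < score)    # values strictly below the score
    right = np.count_nonzero(a <= score)  # values below or equal to the score
    if kind == "rank":
        return (left + right + (1 if right > left else 0)) / (2.0 * n)
    if kind == "weak":
        return right / n
    if kind == "strict":
        return left / n
    if kind == "mean":
        return (left + right) / (2.0 * n)
    raise ValueError(f"Invalid kind {kind}.")

# Example: with leaf samples [1, 2, 3, 4] and a score of 3,
# quantile_rank(..., kind="rank") gives 0.75.
print(quantile_rank([1, 2, 3, 4], 3, kind="rank"))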
20 changes: 14 additions & 6 deletions quantile_forest/tests/test_quantile_forest.py
@@ -622,7 +622,7 @@ def check_quantile_ranks_toy(name):

     kwargs = {"aggregate_leaves_first": False}

-    expected = [0.6875, 0.6875, 0.4375, 0.9375, 0.875, 0.875]
+    expected = [0.75, 0.75, 0.5, 1.0, 1.0, 1.0]
     y_ranks = est.quantile_ranks(X, y, kind="rank", **kwargs)
     assert_allclose(y_ranks, expected)

@@ -656,13 +656,10 @@ def check_quantile_ranks(name):
     X_train = x1.reshape(-1, 1)
     y_train = np.squeeze(x1 * 2 + e1)

-    x2 = np.random.choice(np.arange(0, 101), size=4)
+    x2 = np.random.choice(np.arange(0, 101), size=10)

     X_test = x2.reshape(-1, 1)
-    y_test = np.squeeze(X_test * 2)
-
-    y_train = y_train.astype(np.float64)
-    y_test = y_test.astype(np.float64)
+    y_test = np.squeeze(X_test * 2 + e1)

     est = ForestRegressor(n_estimators=10, random_state=0)

@@ -673,6 +670,17 @@ def check_quantile_ranks(name):
     assert np.all(y_ranks >= 0)
     assert np.all(y_ranks <= 1)

+    # Check predicted ranks on multi-target data.
+    y_train = np.stack([np.squeeze(x1), np.squeeze(x1 * 2)], axis=1)
+    y_test = np.stack([np.squeeze(X_test - e1), np.squeeze(X_test * 2 + e1)], axis=1)
+
+    est.fit(X_train, y_train)
+    y_ranks = est.quantile_ranks(X_test, y_test)
+
+    assert y_ranks.shape == (X_test.shape[0], 2)
+    assert np.all(y_ranks >= 0)
+    assert np.all(y_ranks <= 1)
+

 @pytest.mark.parametrize("name", FOREST_REGRESSORS)
 def test_quantile_ranks(name):
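The new test above exercises the multi-target path end to end. For reference, a minimal usage sketch of the same behavior, assuming the package's RandomForestQuantileRegressor; the data and estimator settings are illustrative rather than taken from the test suite:

import numpy as np
from quantile_forest import RandomForestQuantileRegressor

rng = np.random.default_rng(0)
X = rng.uniform(0, 100, size=(100, 1))
y = np.stack([X.ravel(), X.ravel() * 2], axis=1)  # two target outputs

est = RandomForestQuantileRegressor(n_estimators=10, random_state=0)
est.fit(X, y)

# One rank in [0, 1] per test sample and per output.
ranks = est.quantile_ranks(X, y)
assert ranks.shape == (X.shape[0], 2)
assert np.all((ranks >= 0) & (ranks <= 1))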
