diff --git a/quantile_forest/_quantile_forest.py b/quantile_forest/_quantile_forest.py index da05b82..208b279 100755 --- a/quantile_forest/_quantile_forest.py +++ b/quantile_forest/_quantile_forest.py @@ -879,7 +879,7 @@ def quantile_ranks( y = np.expand_dims(y, axis=1) y_ranks = self.forest_.quantile_ranks( - y.astype(np.float64).T, + y.astype(np.float64), X_leaves, X_indices, kind, diff --git a/quantile_forest/_quantile_forest_fast.pyx b/quantile_forest/_quantile_forest_fast.pyx index 181b4d9..e2928bf 100755 --- a/quantile_forest/_quantile_forest_fast.pyx +++ b/quantile_forest/_quantile_forest_fast.pyx @@ -481,8 +481,10 @@ cdef class QuantileForest: Attributes ---------- - y_train : array-like of shape (n_samples, n_outputs) - Training target values. Assumes values are sorted in ascending order. + y_train : array-like of shape (n_outputs, n_samples) + Training target values. Assumes that the values are sorted in + ascending order for each output. The outputs are assigned to the first + dimension (n_outputs) of the array for ease of indexing. y_train_leaves : array-like of shape \ (n_estimators, n_leaves, n_outputs, n_indices) @@ -600,9 +602,11 @@ cdef class QuantileForest: preds : array-like of shape (n_samples, n_outputs, n_quantiles) Quantiles or means for samples as floats. """ - cdef intp_t n_quantiles, n_samples, n_trees, n_outputs, n_train + cdef intp_t n_samples, n_outputs, n_quantiles + cdef intp_t n_trees, n_train, max_idx cdef intp_t i, j, k, l cdef bint use_mean + cdef list[char*] interpolations cdef vector[double] leaf_samples cdef vector[double] leaf_weights cdef vector[vector[intp_t]] train_indices @@ -617,11 +621,11 @@ cdef class QuantileForest: cdef cnp.ndarray[float64_t, ndim=3] preds cdef double[:, :, :] preds_view - n_quantiles = len(quantiles) n_samples = X_leaves.shape[0] - n_trees = X_leaves.shape[1] - n_outputs = self.y_train.size() + n_quantiles = len(quantiles) + + n_trees = X_leaves.shape[1] n_train = self.y_train[0].size() max_idx = self.y_train_leaves.shape[3] @@ -641,8 +645,8 @@ cdef class QuantileForest: "must be None." ) - interps = [b"linear", b"lower", b"higher", b"midpoint", b"nearest"] - if interpolation not in interps: + interpolations = [b"linear", b"lower", b"higher", b"midpoint", b"nearest"] + if interpolation not in interpolations: raise ValueError(f"Invalid interpolation method {interpolation}.") # Initialize NumPy array with NaN values and get view for nogil. @@ -815,8 +819,10 @@ cdef class QuantileForest: ranks : array-like of shape (n_samples, n_outputs) Quantiles ranks in range [0, 1] for samples as floats. """ - cdef intp_t n_samples, n_trees, n_outputs + cdef intp_t n_samples, n_outputs + cdef intp_t n_trees, max_idx cdef intp_t i, j + cdef list[char*] kinds cdef vector[double] leaf_samples cdef vector[vector[intp_t]] train_indices cdef intp_t idx, train_idx @@ -825,11 +831,16 @@ cdef class QuantileForest: cdef cnp.ndarray[float64_t, ndim=2] ranks cdef double[:, :] ranks_view - n_outputs = y_scores.shape[0] + n_samples = y_scores.shape[0] + n_outputs = y_scores.shape[1] - n_samples = X_leaves.shape[0] - n_trees = X_leaves.shape[1] + if n_outputs != self.y_train.size(): + raise ValueError( + f"Number of target outputs in training data ({self.y_train.size()}) does not " + f"match the number of outputs in test data being scored ({n_outputs})." + ) + n_trees = X_leaves.shape[1] max_idx = self.y_train_leaves.shape[3] if X_indices is not None: @@ -856,7 +867,6 @@ cdef class QuantileForest: for j in range(n_outputs): for k in range((train_indices.size())): train_indices[k].clear() - leaf_samples.clear() leaf_preds.clear() # Accumulate training indices across leaves for each tree. @@ -875,13 +885,15 @@ cdef class QuantileForest: if train_indices[k].size() == 0: continue + leaf_samples.clear() + # Get training target values associated with indices. for train_idx in train_indices[k]: if train_idx != 0: leaf_samples.push_back(self.y_train[j][train_idx - 1]) # Calculate rank. - pred = calc_quantile_rank(leaf_samples, y_scores[j, i], kind=kind) + pred = calc_quantile_rank(leaf_samples, y_scores[i, j], kind=kind) if pred != -1: leaf_preds.push_back(pred) @@ -931,8 +943,8 @@ cdef class QuantileForest: proximities : list of dicts Dicts mapping sample indices to proximity counts. """ - cdef vector[map[intp_t, intp_t]] proximities - cdef intp_t n_samples, n_trees, n_train + cdef intp_t n_samples + cdef intp_t n_trees, n_train, max_idx cdef intp_t i, j cdef vector[intp_t] train_indices cdef vector[int] leaf_weights @@ -940,10 +952,11 @@ cdef class QuantileForest: cdef int cutoff, train_wgt cdef priority_queue[pair[int, intp_t]] queue cdef pair[int, intp_t] entry + cdef vector[map[intp_t, intp_t]] proximities n_samples = X_leaves.shape[0] - n_trees = X_leaves.shape[1] + n_trees = X_leaves.shape[1] n_train = self.y_train[0].size() max_idx = self.y_train_leaves.shape[3] diff --git a/quantile_forest/tests/test_quantile_forest.py b/quantile_forest/tests/test_quantile_forest.py index c2b508b..eaac0c3 100755 --- a/quantile_forest/tests/test_quantile_forest.py +++ b/quantile_forest/tests/test_quantile_forest.py @@ -622,7 +622,7 @@ def check_quantile_ranks_toy(name): kwargs = {"aggregate_leaves_first": False} - expected = [0.6875, 0.6875, 0.4375, 0.9375, 0.875, 0.875] + expected = [0.75, 0.75, 0.5, 1.0, 1.0, 1.0] y_ranks = est.quantile_ranks(X, y, kind="rank", **kwargs) assert_allclose(y_ranks, expected) @@ -656,13 +656,10 @@ def check_quantile_ranks(name): X_train = x1.reshape(-1, 1) y_train = np.squeeze(x1 * 2 + e1) - x2 = np.random.choice(np.arange(0, 101), size=4) + x2 = np.random.choice(np.arange(0, 101), size=10) X_test = x2.reshape(-1, 1) - y_test = np.squeeze(X_test * 2) - - y_train = y_train.astype(np.float64) - y_test = y_test.astype(np.float64) + y_test = np.squeeze(X_test * 2 + e1) est = ForestRegressor(n_estimators=10, random_state=0) @@ -673,6 +670,17 @@ def check_quantile_ranks(name): assert np.all(y_ranks >= 0) assert np.all(y_ranks <= 1) + # Check predicted ranks on multi-target data. + y_train = np.stack([np.squeeze(x1), np.squeeze(x1 * 2)], axis=1) + y_test = np.stack([np.squeeze(X_test - e1), np.squeeze(X_test * 2 + e1)], axis=1) + + est.fit(X_train, y_train) + y_ranks = est.quantile_ranks(X_test, y_test) + + assert y_ranks.shape == (X_test.shape[0], 2) + assert np.all(y_ranks >= 0) + assert np.all(y_ranks <= 1) + @pytest.mark.parametrize("name", FOREST_REGRESSORS) def test_quantile_ranks(name):