Fix multi-target quantile ranks (#84)
reidjohnson authored Sep 4, 2024
1 parent 4df81ae commit 087d8e9
Showing 3 changed files with 45 additions and 24 deletions.
2 changes: 1 addition & 1 deletion quantile_forest/_quantile_forest.py
@@ -879,7 +879,7 @@ def quantile_ranks(
             y = np.expand_dims(y, axis=1)

         y_ranks = self.forest_.quantile_ranks(
-            y.astype(np.float64).T,
+            y.astype(np.float64),
             X_leaves,
             X_indices,
             kind,
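The one-line change above removes the transpose, so the scores reach the Cython backend with samples on axis 0 and outputs on axis 1, which is how the updated quantile_ranks reads the array shape. A minimal NumPy sketch of that shape convention (the values are illustrative and not part of the commit):

import numpy as np

# Illustrative single-output scores; a 1-D y is expanded to 2-D first,
# as in the surrounding quantile_ranks wrapper shown above.
y = np.asarray([3.1, 2.7, 5.0])
if y.ndim == 1:
    y = np.expand_dims(y, axis=1)  # shape (n_samples, 1)

# With the transpose removed, the backend reads both dimensions directly:
n_samples, n_outputs = y.shape     # 3 samples, 1 output
print(n_samples, n_outputs)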
47 changes: 30 additions & 17 deletions quantile_forest/_quantile_forest_fast.pyx
@@ -481,8 +481,10 @@ cdef class QuantileForest:
     Attributes
     ----------
-    y_train : array-like of shape (n_samples, n_outputs)
-        Training target values. Assumes values are sorted in ascending order.
+    y_train : array-like of shape (n_outputs, n_samples)
+        Training target values. Assumes that the values are sorted in
+        ascending order for each output. The outputs are assigned to the first
+        dimension (n_outputs) of the array for ease of indexing.
     y_train_leaves : array-like of shape \
         (n_estimators, n_leaves, n_outputs, n_indices)
@@ -600,9 +602,11 @@ cdef class QuantileForest:
         preds : array-like of shape (n_samples, n_outputs, n_quantiles)
             Quantiles or means for samples as floats.
         """
-        cdef intp_t n_quantiles, n_samples, n_trees, n_outputs, n_train
+        cdef intp_t n_samples, n_outputs, n_quantiles
+        cdef intp_t n_trees, n_train, max_idx
         cdef intp_t i, j, k, l
         cdef bint use_mean
+        cdef list[char*] interpolations
         cdef vector[double] leaf_samples
         cdef vector[double] leaf_weights
         cdef vector[vector[intp_t]] train_indices
@@ -617,11 +621,11 @@ cdef class QuantileForest:
         cdef cnp.ndarray[float64_t, ndim=3] preds
         cdef double[:, :, :] preds_view

-        n_quantiles = len(quantiles)
         n_samples = X_leaves.shape[0]
-        n_trees = X_leaves.shape[1]
-
         n_outputs = self.y_train.size()
+        n_quantiles = len(quantiles)
+
+        n_trees = X_leaves.shape[1]
         n_train = self.y_train[0].size()
         max_idx = self.y_train_leaves.shape[3]

@@ -641,8 +645,8 @@ cdef class QuantileForest:
                     "must be None."
                 )

-        interps = [b"linear", b"lower", b"higher", b"midpoint", b"nearest"]
-        if interpolation not in interps:
+        interpolations = [b"linear", b"lower", b"higher", b"midpoint", b"nearest"]
+        if interpolation not in interpolations:
             raise ValueError(f"Invalid interpolation method {interpolation}.")

         # Initialize NumPy array with NaN values and get view for nogil.
@@ -815,8 +819,10 @@ cdef class QuantileForest:
         ranks : array-like of shape (n_samples, n_outputs)
             Quantiles ranks in range [0, 1] for samples as floats.
         """
-        cdef intp_t n_samples, n_trees, n_outputs
+        cdef intp_t n_samples, n_outputs
+        cdef intp_t n_trees, max_idx
         cdef intp_t i, j
+        cdef list[char*] kinds
         cdef vector[double] leaf_samples
         cdef vector[vector[intp_t]] train_indices
         cdef intp_t idx, train_idx
@@ -825,11 +831,16 @@ cdef class QuantileForest:
         cdef cnp.ndarray[float64_t, ndim=2] ranks
         cdef double[:, :] ranks_view

-        n_outputs = y_scores.shape[0]
+        n_samples = y_scores.shape[0]
+        n_outputs = y_scores.shape[1]

-        n_samples = X_leaves.shape[0]
-        n_trees = X_leaves.shape[1]
+        if n_outputs != <intp_t>self.y_train.size():
+            raise ValueError(
+                f"Number of target outputs in training data ({self.y_train.size()}) does not "
+                f"match the number of outputs in test data being scored ({n_outputs})."
+            )

+        n_trees = X_leaves.shape[1]
         max_idx = self.y_train_leaves.shape[3]

         if X_indices is not None:
@@ -856,7 +867,6 @@ cdef class QuantileForest:
                 for j in range(n_outputs):
                     for k in range(<intp_t>(train_indices.size())):
                         train_indices[k].clear()
-                    leaf_samples.clear()
                     leaf_preds.clear()

                     # Accumulate training indices across leaves for each tree.
@@ -875,13 +885,15 @@ cdef class QuantileForest:
                         if train_indices[k].size() == 0:
                             continue

+                        leaf_samples.clear()
+
                         # Get training target values associated with indices.
                         for train_idx in train_indices[k]:
                             if train_idx != 0:
                                 leaf_samples.push_back(self.y_train[j][train_idx - 1])

                         # Calculate rank.
-                        pred = calc_quantile_rank(leaf_samples, y_scores[j, i], kind=kind)
+                        pred = calc_quantile_rank(leaf_samples, y_scores[i, j], kind=kind)
                         if pred != -1:
                             leaf_preds.push_back(pred)

@@ -931,19 +943,20 @@ cdef class QuantileForest:
         proximities : list of dicts
             Dicts mapping sample indices to proximity counts.
         """
-        cdef vector[map[intp_t, intp_t]] proximities
-        cdef intp_t n_samples, n_trees, n_train
+        cdef intp_t n_samples
+        cdef intp_t n_trees, n_train, max_idx
         cdef intp_t i, j
         cdef vector[intp_t] train_indices
         cdef vector[int] leaf_weights
         cdef intp_t train_idx
         cdef int cutoff, train_wgt
         cdef priority_queue[pair[int, intp_t]] queue
         cdef pair[int, intp_t] entry
+        cdef vector[map[intp_t, intp_t]] proximities

         n_samples = X_leaves.shape[0]
-        n_trees = X_leaves.shape[1]

+        n_trees = X_leaves.shape[1]
         n_train = self.y_train[0].size()
         max_idx = self.y_train_leaves.shape[3]

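For context on what the ranks computed above represent: calc_quantile_rank is not shown in this diff, so the sketch below is only a rough stand-in that assumes scipy.stats.percentileofscore-style semantics scaled to [0, 1], with -1 as the empty-leaf sentinel checked by the "if pred != -1" branch; the library's actual Cython helper may differ in details.

import numpy as np

def quantile_rank(samples, score, kind="rank"):
    # Rank of `score` within `samples`, scaled to [0, 1]; returns -1.0 for an
    # empty sample set, mirroring the sentinel checked in the diff above.
    a = np.asarray(samples, dtype=float)
    n = a.size
    if n == 0:
        return -1.0
    left = np.count_nonzero(a < score)    # values strictly below the score
    right = np.count_nonzero(a <= score)  # values below or equal to the score
    if kind == "rank":
        return (left + right + (1 if right > left else 0)) / (2.0 * n)
    if kind == "weak":
        return right / n
    if kind == "strict":
        return left / n
    if kind == "mean":
        return (left + right) / (2.0 * n)
    raise ValueError(f"Invalid kind {kind}.")

# Example: with leaf samples [1, 2, 3, 4] and a score of 3,
# quantile_rank(..., kind="rank") gives 0.75.
print(quantile_rank([1, 2, 3, 4], 3, kind="rank"))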
20 changes: 14 additions & 6 deletions quantile_forest/tests/test_quantile_forest.py
@@ -622,7 +622,7 @@ def check_quantile_ranks_toy(name):

     kwargs = {"aggregate_leaves_first": False}

-    expected = [0.6875, 0.6875, 0.4375, 0.9375, 0.875, 0.875]
+    expected = [0.75, 0.75, 0.5, 1.0, 1.0, 1.0]
     y_ranks = est.quantile_ranks(X, y, kind="rank", **kwargs)
     assert_allclose(y_ranks, expected)

@@ -656,13 +656,10 @@ def check_quantile_ranks(name):
     X_train = x1.reshape(-1, 1)
     y_train = np.squeeze(x1 * 2 + e1)

-    x2 = np.random.choice(np.arange(0, 101), size=4)
+    x2 = np.random.choice(np.arange(0, 101), size=10)

     X_test = x2.reshape(-1, 1)
-    y_test = np.squeeze(X_test * 2)
-
-    y_train = y_train.astype(np.float64)
-    y_test = y_test.astype(np.float64)
+    y_test = np.squeeze(X_test * 2 + e1)

     est = ForestRegressor(n_estimators=10, random_state=0)

@@ -673,6 +670,17 @@ def check_quantile_ranks(name):
     assert np.all(y_ranks >= 0)
     assert np.all(y_ranks <= 1)

+    # Check predicted ranks on multi-target data.
+    y_train = np.stack([np.squeeze(x1), np.squeeze(x1 * 2)], axis=1)
+    y_test = np.stack([np.squeeze(X_test - e1), np.squeeze(X_test * 2 + e1)], axis=1)
+
+    est.fit(X_train, y_train)
+    y_ranks = est.quantile_ranks(X_test, y_test)
+
+    assert y_ranks.shape == (X_test.shape[0], 2)
+    assert np.all(y_ranks >= 0)
+    assert np.all(y_ranks <= 1)
+

 @pytest.mark.parametrize("name", FOREST_REGRESSORS)
 def test_quantile_ranks(name):
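The new test above exercises the multi-target path end to end. For reference, a minimal usage sketch of the same behavior, assuming the package's RandomForestQuantileRegressor; the data and estimator settings are illustrative rather than taken from the test suite:

import numpy as np
from quantile_forest import RandomForestQuantileRegressor

rng = np.random.default_rng(0)
X = rng.uniform(0, 100, size=(100, 1))
y = np.stack([X.ravel(), X.ravel() * 2], axis=1)  # two target outputs

est = RandomForestQuantileRegressor(n_estimators=10, random_state=0)
est.fit(X, y)

# One rank in [0, 1] per test sample and per output.
ranks = est.quantile_ranks(X, y)
assert ranks.shape == (X.shape[0], 2)
assert np.all((ranks >= 0) & (ranks <= 1))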
