
Commit 550121b

Provide workaround for cupy.percentile bug (#3315)
Ensure that the 100th quantile value returned by cupy.percentile is the maximum of the input array rather than (possibly) NaN, due to cupy/cupy#4451. This eliminates an intermittent failure observed in tests of KBinsDiscretizer, which makes use of cupy.percentile. Note that this alters the included sklearn code and should be reverted once the upstream cupy issue is resolved. Also resolves the failure due to a ValueError described in #2933.

Authors:
  - William Hicks <[email protected]>

Approvers:
  - Dante Gama Dessavre
  - Victor Lafargue

URL: #3315
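For illustration, here is a minimal sketch of the workaround pattern this commit applies. It uses NumPy rather than CuPy since the NaN issue is specific to cupy.percentile; the sample values and n_bins are made up, and col_max mirrors the variable name used in the patched code.

```python
# Minimal sketch of the workaround, using NumPy for illustration only:
# the intermittent NaN result is specific to cupy.percentile
# (https://github.com/cupy/cupy/issues/4451). Sample data is made up.
import numpy as np

column = np.array([0.1, 0.5, 1.2, 2.9, 3.7])  # one feature column
n_bins = 4

# Compute bin edges from evenly spaced quantiles, as in the
# 'quantile' strategy of KBinsDiscretizer.
quantiles = np.linspace(0, 100, n_bins + 1)
bin_edges = np.asarray(np.percentile(column, quantiles))

# Workaround: the 100th percentile is by definition the column maximum,
# so pin the last edge to it instead of trusting the percentile result.
col_max = column.max()
bin_edges[-1] = col_max

print(bin_edges)  # last edge is guaranteed to equal column.max()
```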
1 parent: 2316937

2 files changed: +4 -1 lines

python/cuml/_thirdparty/sklearn/preprocessing/_discretization.py (+4)
@@ -199,6 +199,10 @@ def fit(self, X, y=None):
         elif self.strategy == 'quantile':
             quantiles = np.linspace(0, 100, n_bins[jj] + 1)
             bin_edges[jj] = np.asarray(np.percentile(column, quantiles))
+            # Workaround for https://github.com/cupy/cupy/issues/4451
+            # This should be removed as soon as a fix is available in cupy
+            # in order to limit alterations in the included sklearn code
+            bin_edges[jj][-1] = col_max

         elif self.strategy == 'kmeans':
             # Deterministic initialization with uniform spacing

python/cuml/test/test_preprocessing.py (-1)
@@ -556,7 +556,6 @@ def test_robust_scale_sparse(sparse_clf_dataset,  # noqa: F811
 @pytest.mark.parametrize("n_bins", [5, 20])
 @pytest.mark.parametrize("encode", ['ordinal', 'onehot-dense', 'onehot'])
 @pytest.mark.parametrize("strategy", ['uniform', 'quantile', 'kmeans'])
-@pytest.mark.xfail(strict=False)
 def test_kbinsdiscretizer(blobs_dataset, n_bins,  # noqa: F811
                           encode, strategy):
     X_np, X = blobs_dataset
