diff --git a/doc/whats_new.rst b/doc/whats_new.rst index ba3d4d584..80314925d 100644 --- a/doc/whats_new.rst +++ b/doc/whats_new.rst @@ -4,10 +4,10 @@ Release history =============== -.. include:: whats_new/v0.0.4.rst +.. include:: whats_new/v0.4.rst -.. include:: whats_new/v0.0.3.rst +.. include:: whats_new/v0.3.rst -.. include:: whats_new/v0.0.2.rst +.. include:: whats_new/v0.2.rst -.. include:: whats_new/v0.0.1.rst +.. include:: whats_new/v0.1.rst diff --git a/doc/whats_new/v0.0.1.rst b/doc/whats_new/v0.1.rst similarity index 100% rename from doc/whats_new/v0.0.1.rst rename to doc/whats_new/v0.1.rst diff --git a/doc/whats_new/v0.0.2.rst b/doc/whats_new/v0.2.rst similarity index 100% rename from doc/whats_new/v0.0.2.rst rename to doc/whats_new/v0.2.rst diff --git a/doc/whats_new/v0.0.3.rst b/doc/whats_new/v0.3.rst similarity index 100% rename from doc/whats_new/v0.0.3.rst rename to doc/whats_new/v0.3.rst diff --git a/doc/whats_new/v0.0.4.rst b/doc/whats_new/v0.4.rst similarity index 97% rename from doc/whats_new/v0.0.4.rst rename to doc/whats_new/v0.4.rst index ec7dfcaab..dd011da2e 100644 --- a/doc/whats_new/v0.0.4.rst +++ b/doc/whats_new/v0.4.rst @@ -14,6 +14,10 @@ Bug fixes deviation. By :user:`Guillaume Lemaitre ` in :issue:`491`. +- Raise an error when passing target which is not supported, i.e. regression + target or multilabel targets. Imbalanced-learn does not support this case. + By :user:`Guillaume Lemaitre ` in :issue:`490`. + Version 0.4 =========== diff --git a/imblearn/base.py b/imblearn/base.py index f059941d9..0a07cc065 100644 --- a/imblearn/base.py +++ b/imblearn/base.py @@ -15,6 +15,7 @@ from sklearn.externals import six from sklearn.preprocessing import label_binarize from sklearn.utils import check_X_y +from sklearn.utils.multiclass import check_classification_targets from .utils import check_sampling_strategy, check_target_type from .utils.deprecation import deprecate_parameter @@ -77,6 +78,7 @@ def fit_resample(self, X, y): """ self._deprecate_ratio() + check_classification_targets(y) X, y, binarize_y = self._check_X_y(X, y) self.sampling_strategy_ = check_sampling_strategy( diff --git a/imblearn/over_sampling/tests/test_smote_nc.py b/imblearn/over_sampling/tests/test_smote_nc.py index 9dfdc67a4..65598523b 100644 --- a/imblearn/over_sampling/tests/test_smote_nc.py +++ b/imblearn/over_sampling/tests/test_smote_nc.py @@ -127,9 +127,12 @@ def test_smotenc_check_target_type(): y = np.linspace(0, 1, 30) smote = SMOTENC(categorical_features=categorical_features, random_state=0) - with pytest.warns(UserWarning, match='should be of types'): + with pytest.raises(ValueError, match="Unknown label type: 'continuous'"): + smote.fit_resample(X, y) + rng = np.random.RandomState(42) + y = rng.randint(2, size=(20, 3)) + with pytest.raises(ValueError, match="'y' should encode the multiclass"): smote.fit_resample(X, y) - def test_smotenc_samplers_one_label(): X, _, categorical_features = data_heterogneous_unordered() diff --git a/imblearn/utils/_validation.py b/imblearn/utils/_validation.py index e4fc7224d..d00db9c63 100644 --- a/imblearn/utils/_validation.py +++ b/imblearn/utils/_validation.py @@ -86,17 +86,14 @@ def check_target_type(y, indicate_one_vs_all=False): """ type_y = type_of_target(y) - if type_y not in TARGET_KIND: - # FIXME: perfectly we should raise an error but the sklearn API does - # not allow for it - warnings.warn("'y' should be of types {} only. Got {} instead.".format( - TARGET_KIND, type_of_target(y))) - - if indicate_one_vs_all: - return (y.argmax(axis=1) if type_y == 'multilabel-indicator' else y, - type_y == 'multilabel-indicator') - else: - return y.argmax(axis=1) if type_y == 'multilabel-indicator' else y + if type_y == 'multilabel-indicator': + if np.any(y.sum(axis=1) > 1): + raise ValueError( + "When 'y' corresponds to '{}', 'y' should encode the " + "multiclass (a single 1 by row).".format(type_y)) + y = y.argmax(axis=1) + + return (y, type_y == 'multilabel-indicator') if indicate_one_vs_all else y def _sampling_strategy_all(y, sampling_type): diff --git a/imblearn/utils/estimator_checks.py b/imblearn/utils/estimator_checks.py index fa21e5c1f..7d08f3313 100644 --- a/imblearn/utils/estimator_checks.py +++ b/imblearn/utils/estimator_checks.py @@ -33,8 +33,6 @@ from imblearn.over_sampling import SMOTE from imblearn.under_sampling import NearMiss, ClusterCentroids -from imblearn.utils.testing import warns - DONT_SUPPORT_RATIO = ['SVMSMOTE', 'BorderlineSMOTE'] SUPPORT_STRING = ['RandomUnderSampler', 'RandomOverSampler'] HAVE_SAMPLE_INDICES = [ @@ -54,7 +52,6 @@ def monkey_patch_check_dtype_object(name, estimator_orig): X = rng.rand(40, 10).astype(object) y = np.array([0] * 10 + [1] * 30, dtype=np.int) estimator = clone(estimator_orig) - estimator.fit(X, y) try: @@ -123,14 +120,20 @@ def check_estimator(Estimator, run_sampler_tests=True): def check_target_type(name, Estimator): + # should raise warning if the target is continuous (we cannot raise error) X = np.random.random((20, 2)) y = np.linspace(0, 1, 20) estimator = Estimator() # FIXME: in 0.6 set the random_state for all if name not in DONT_HAVE_RANDOM_STATE: set_random_state(estimator) - with warns(UserWarning, match='should be of types'): - estimator.fit(X, y) + with pytest.raises(ValueError, match="Unknown label type: 'continuous'"): + estimator.fit_resample(X, y) + # if the target is multilabel then we should raise an error + rng = np.random.RandomState(42) + y = rng.randint(2, size=(20, 3)) + with pytest.raises(ValueError, match="'y' should encode the multiclass"): + estimator.fit_resample(X, y) def check_samplers_one_label(name, Sampler): @@ -139,7 +142,7 @@ def check_samplers_one_label(name, Sampler): X = np.random.random((20, 2)) y = np.zeros(20) try: - sampler.fit(X, y) + sampler.fit_resample(X, y) except ValueError as e: if 'class' not in repr(e): print(error_string_fit, Sampler, e) @@ -157,7 +160,7 @@ def check_samplers_fit(name, Sampler): sampler = Sampler() X = np.random.random((30, 2)) y = np.array([1] * 20 + [0] * 10) - sampler.fit(X, y) + sampler.fit_resample(X, y) assert hasattr(sampler, 'sampling_strategy_'), \ "No fitted attribute sampling_strategy_" @@ -165,7 +168,7 @@ def check_samplers_fit(name, Sampler): def check_samplers_fit_resample(name, Sampler): sampler = Sampler() X, y = make_classification(n_samples=1000, n_classes=3, n_informative=4, - weights=[0.2, 0.3, 0.5], random_state=0) + weights=[0.2, 0.3, 0.5], random_state=0) target_stats = Counter(y) X_res, y_res = sampler.fit_resample(X, y) if isinstance(sampler, BaseOverSampler): diff --git a/imblearn/utils/tests/test_estimator_checks.py b/imblearn/utils/tests/test_estimator_checks.py index 134ee79c2..06df4d973 100644 --- a/imblearn/utils/tests/test_estimator_checks.py +++ b/imblearn/utils/tests/test_estimator_checks.py @@ -3,6 +3,7 @@ from sklearn.base import BaseEstimator from sklearn.utils import check_X_y +from sklearn.utils.multiclass import check_classification_targets from imblearn.base import BaseSampler from imblearn.utils.estimator_checks import check_estimator @@ -17,6 +18,8 @@ def fit(self, X, y): return self def fit_resample(self, X, y): + check_classification_targets(y) + self.fit(X, y) return X, y @@ -27,36 +30,15 @@ def fit(self, X, y): X, y = check_X_y(X, y, accept_sparse=True) return self - def fit_resample(self, X, y): - self.fit(X, y) - return X, y - class NoAcceptingSparseSampler(BaseBadSampler): """Sampler which does not accept sparse matrix.""" def fit(self, X, y): - X, y = check_X_y(X, y, accept_sparse=False) - y, _ = check_target_type(y, indicate_one_vs_all=True) - self.sampling_strategy_ = 'sampling_strategy_' - return self - - def fit_resample(self, X, y): - self.fit(X, y) - return X, y - - -class NotTransformingTargetOvR(BaseBadSampler): - """Sampler which does not transform OvR enconding.""" - def fit(self, X, y): - X, y = check_X_y(X, y, accept_sparse=True) y, _ = check_target_type(y, indicate_one_vs_all=True) + X, y = check_X_y(X, y, accept_sparse=False) self.sampling_strategy_ = 'sampling_strategy_' return self - def fit_resample(self, X, y): - self.fit(X, y) - return X, y - class NotPreservingDtypeSampler(BaseSampler): _sampling_type = 'bypass' @@ -72,7 +54,6 @@ def _fit_resample(self, X, y): [(BaseBadSampler, AssertionError, "TypeError not raised by fit"), (NotFittedSampler, AssertionError, "No fitted attribute"), (NoAcceptingSparseSampler, TypeError, "A sparse matrix was passed"), - (NotTransformingTargetOvR, ValueError, "bad input shape"), (NotPreservingDtypeSampler, AssertionError, "X dytype is not preserved")] ) def test_check_estimator(Estimator, err_type, err_msg): diff --git a/imblearn/utils/tests/test_validation.py b/imblearn/utils/tests/test_validation.py index 773896e5e..2d34b2e9c 100644 --- a/imblearn/utils/tests/test_validation.py +++ b/imblearn/utils/tests/test_validation.py @@ -60,12 +60,6 @@ def test_check_target_type_ova(target, output_target, is_ova): assert binarize_target == is_ova -def test_check_target_warning(): - target = np.arange(4).reshape((2, 2)) - with pytest.warns(UserWarning, match='should be of types'): - check_target_type(target) - - def test_check_sampling_strategy_warning(): msg = 'dict for cleaning methods is deprecated' with pytest.warns(DeprecationWarning, match=msg):