diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/VarianceThreshold.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/VarianceThreshold.py
new file mode 100644
index 000000000..e5e71ea1e
--- /dev/null
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/VarianceThreshold.py
@@ -0,0 +1,44 @@
+from typing import Any, Dict, Optional, Union
+
+import numpy as np
+
+from sklearn.feature_selection import VarianceThreshold as SklearnVarianceThreshold
+
+from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.base_tabular_preprocessing import \
+    autoPyTorchTabularPreprocessingComponent
+
+
+class VarianceThreshold(autoPyTorchTabularPreprocessingComponent):
+    """
+    Removes features that have the same value in the training data.
+    """
+    def __init__(self, random_state: Optional[np.random.RandomState] = None):
+        super().__init__()
+
+    def fit(self, X: Dict[str, Any], y: Optional[Any] = None) -> 'VarianceThreshold':
+
+        self.check_requirements(X, y)
+
+        self.preprocessor['numerical'] = SklearnVarianceThreshold(
+            threshold=0.0
+        )
+        return self
+
+    def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
+        if self.preprocessor['numerical'] is None:
+            raise ValueError("cannot call transform on {} without fitting first."
+                             .format(self.__class__.__name__))
+        X.update({'variance_threshold': self.preprocessor})
+        return X
+
+    @staticmethod
+    def get_properties(
+        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
+    ) -> Dict[str, Union[str, bool]]:
+
+        return {
+            'shortname': 'Variance Threshold',
+            'name': 'Variance Threshold (constant feature removal)',
+            'handles_sparse': True,
+        }
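
Note: a minimal sketch of how this component is exercised through autoPyTorch's fit-dictionary protocol, mirroring the new unit test at the end of this diff. The array values and the minimal 'dataset_properties' dict are illustrative only; in a real run the pipeline assembles this dictionary itself.

    import numpy as np

    from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.variance_thresholding. \
        VarianceThreshold import VarianceThreshold

    # Illustrative fit dictionary: the third numerical column is constant on the training split.
    X = {
        'X_train': np.array([[1., 2., 1.],
                             [4., 5., 1.],
                             [11., 12., 1.]]),
        'dataset_properties': {'categorical_columns': [], 'numerical_columns': [0, 1, 2]},
    }

    component = VarianceThreshold().fit(X)
    X = component.transform(X)

    # transform() only registers a not-yet-fitted sklearn VarianceThreshold under
    # X['variance_threshold']['numerical']; it is fitted and applied to the numerical
    # columns further down the pipeline (the new unit test below does this explicitly
    # with a ColumnTransformer).
    print(X['variance_threshold']['numerical'])
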
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/__init__.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/variance_thresholding/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/autoPyTorch/pipeline/tabular_classification.py b/autoPyTorch/pipeline/tabular_classification.py
index b95de512e..92dc764bb 100644
--- a/autoPyTorch/pipeline/tabular_classification.py
+++ b/autoPyTorch/pipeline/tabular_classification.py
@@ -27,6 +27,8 @@
 )
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling import ScalerChoice
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.variance_thresholding. \
+    VarianceThreshold import VarianceThreshold
 from autoPyTorch.pipeline.components.setup.early_preprocessor.EarlyPreprocessing import EarlyPreprocessing
 from autoPyTorch.pipeline.components.setup.lr_scheduler import SchedulerChoice
 from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent
@@ -307,6 +309,7 @@ def _get_pipeline_steps(

         steps.extend([
             ("imputer", SimpleImputer(random_state=self.random_state)),
+            ("variance_threshold", VarianceThreshold(random_state=self.random_state)),
             ("encoder", EncoderChoice(default_dataset_properties, random_state=self.random_state)),
             ("scaler", ScalerChoice(default_dataset_properties, random_state=self.random_state)),
             ("feature_preprocessor", FeatureProprocessorChoice(default_dataset_properties,
diff --git a/autoPyTorch/pipeline/tabular_regression.py b/autoPyTorch/pipeline/tabular_regression.py
index 57d0126d0..daee7f74a 100644
--- a/autoPyTorch/pipeline/tabular_regression.py
+++ b/autoPyTorch/pipeline/tabular_regression.py
@@ -27,6 +27,8 @@
 )
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling import ScalerChoice
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.variance_thresholding. \
+    VarianceThreshold import VarianceThreshold
 from autoPyTorch.pipeline.components.setup.early_preprocessor.EarlyPreprocessing import EarlyPreprocessing
 from autoPyTorch.pipeline.components.setup.lr_scheduler import SchedulerChoice
 from autoPyTorch.pipeline.components.setup.network.base_network import NetworkComponent
@@ -257,6 +259,7 @@ def _get_pipeline_steps(

         steps.extend([
             ("imputer", SimpleImputer(random_state=self.random_state)),
+            ("variance_threshold", VarianceThreshold(random_state=self.random_state)),
             ("encoder", EncoderChoice(default_dataset_properties, random_state=self.random_state)),
             ("scaler", ScalerChoice(default_dataset_properties, random_state=self.random_state)),
             ("feature_preprocessor", FeatureProprocessorChoice(default_dataset_properties,
diff --git a/test/test_pipeline/components/preprocessing/base.py b/test/test_pipeline/components/preprocessing/base.py
index ac16e286a..35f6ed271 100644
--- a/test/test_pipeline/components/preprocessing/base.py
+++ b/test/test_pipeline/components/preprocessing/base.py
@@ -6,6 +6,8 @@
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding import EncoderChoice
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling import ScalerChoice
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.variance_thresholding. \
+    VarianceThreshold import VarianceThreshold
 from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline


@@ -28,6 +30,7 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]],

         steps.extend([
             ("imputer", SimpleImputer()),
+            ("variance_threshold", VarianceThreshold()),
             ("encoder", EncoderChoice(default_dataset_properties)),
             ("scaler", ScalerChoice(default_dataset_properties)),
             ("tabular_transformer", TabularColumnTransformer()),
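
Note: for orientation, the tabular preprocessing order that results from the _get_pipeline_steps changes above in both pipelines, written out as a plain list (step names exactly as registered in the diffs; later network and training steps are omitted):

    # The new step sits directly after imputation and before encoding/scaling:
    tabular_preprocessing_order = [
        "imputer",               # SimpleImputer
        "variance_threshold",    # VarianceThreshold (added in this diff)
        "encoder",               # EncoderChoice
        "scaler",                # ScalerChoice
        "feature_preprocessor",  # FeatureProprocessorChoice
    ]
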
diff --git a/test/test_pipeline/components/preprocessing/test_variance_thresholding.py b/test/test_pipeline/components/preprocessing/test_variance_thresholding.py
new file mode 100644
index 000000000..3f22835b3
--- /dev/null
+++ b/test/test_pipeline/components/preprocessing/test_variance_thresholding.py
@@ -0,0 +1,49 @@
+import numpy as np
+from numpy.testing import assert_array_equal
+
+
+from sklearn.base import BaseEstimator
+from sklearn.compose import make_column_transformer
+
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.variance_thresholding. \
+    VarianceThreshold import VarianceThreshold
+
+
+def test_variance_threshold():
+    data = np.array([[1, 2, 1],
+                     [7, 8, 9],
+                     [4, 5, 1],
+                     [11, 12, 1],
+                     [17, 18, 19],
+                     [14, 15, 16]])
+    numerical_columns = [0, 1, 2]
+    train_indices = np.array([0, 2, 3])
+    test_indices = np.array([1, 4, 5])
+    dataset_properties = {
+        'categorical_columns': [],
+        'numerical_columns': numerical_columns,
+    }
+    X = {
+        'X_train': data[train_indices],
+        'dataset_properties': dataset_properties
+    }
+    component = VarianceThreshold()
+
+    component = component.fit(X)
+    X = component.transform(X)
+    variance_threshold = X['variance_threshold']['numerical']
+
+    # check if the fit dictionary X is modified as expected
+    assert isinstance(X['variance_threshold'], dict)
+    assert isinstance(variance_threshold, BaseEstimator)
+
+    # make a column transformer with the returned variance threshold to fit on the data
+    column_transformer = make_column_transformer((variance_threshold,
+                                                  X['dataset_properties']['numerical_columns']),
+                                                 remainder='passthrough')
+    column_transformer = column_transformer.fit(X['X_train'])
+    transformed = column_transformer.transform(data[test_indices])
+
+    assert_array_equal(transformed, np.array([[7, 8],
+                                              [17, 18],
+                                              [14, 15]]))
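
Note: the component hard-codes threshold=0.0 and advertises 'handles_sparse': True. The former is passed straight to sklearn's estimator; the latter presumably leans on sklearn's sparse support. A short, self-contained sketch of that underlying behaviour (sklearn and scipy only, no autoPyTorch involved):

    import numpy as np
    from scipy import sparse
    from sklearn.feature_selection import VarianceThreshold as SklearnVarianceThreshold

    # threshold=0.0 removes exactly the columns that are constant on the training data.
    X_train = np.array([[1., 2., 1.],
                        [4., 5., 1.],
                        [11., 12., 1.]])
    selector = SklearnVarianceThreshold(threshold=0.0).fit(X_train)
    print(selector.get_support())  # [ True  True False] -> the constant third column is dropped

    # The sklearn estimator also accepts scipy.sparse input, which is presumably what
    # lets the component advertise 'handles_sparse': True.
    print(selector.transform(sparse.csr_matrix(X_train)).toarray())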