diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml
index 3f398c464..bacf905e7 100644
--- a/.github/workflows/pytest.yml
+++ b/.github/workflows/pytest.yml
@@ -29,7 +29,7 @@ jobs:
     - name: Run tests
       run: |
         if [ ${{ matrix.code-cov }} ]; then codecov='--cov=autoPyTorch --cov-report=xml'; fi
-        python -m pytest -n 2 --timeout=600 --timeout-method=thread --dist load test -sv $codecov
+        python -m pytest --durations=20 --timeout=300 --timeout-method=thread -v $codecov test
     - name: Check for files left behind by test
       if: ${{ always() }}
       run: |
diff --git a/autoPyTorch/datasets/base_dataset.py b/autoPyTorch/datasets/base_dataset.py
index 565ffd4f3..51a7a8e38 100644
--- a/autoPyTorch/datasets/base_dataset.py
+++ b/autoPyTorch/datasets/base_dataset.py
@@ -48,6 +48,7 @@ class TransformSubset(Subset):
     We achieve so by adding a train flag to the pytorch subset
     """
+
     def __init__(self, dataset: Dataset, indices: Sequence[int], train: bool) -> None:
        self.dataset = dataset
        self.indices = indices
@@ -371,3 +372,11 @@ def get_dataset_properties(self, dataset_requirements: List[FitRequirement]) ->
             'num_classes': self.num_classes,
         })
         return dataset_properties
+
+    def get_required_dataset_info(self) -> Dict[str, Any]:
+        """
+        Returns a dictionary containing the dataset properties required to instantiate a pipeline.
+        """
+        info = {'output_type': self.output_type,
+                'issparse': self.issparse}
+        return info
diff --git a/autoPyTorch/datasets/tabular_dataset.py b/autoPyTorch/datasets/tabular_dataset.py
index ab75ce3f8..dbaa3a260 100644
--- a/autoPyTorch/datasets/tabular_dataset.py
+++ b/autoPyTorch/datasets/tabular_dataset.py
@@ -104,7 +104,6 @@ def __init__(self, X: Union[np.ndarray, pd.DataFrame],
         # rather to have a performance through time on the test data
         if X_test is not None:
             X_test, self._test_data_types, _, _, _ = self.interpret_columns(X_test)
-
             # Some quality checks on the data
             if self.data_types != self._test_data_types:
                 raise ValueError(f"The train data inferred types {self.data_types} are "
@@ -205,8 +204,7 @@ def interpret_columns(self,

         return data, data_types, nan_mask, itovs, vtois

-    def infer_dataset_properties(self, X: Any) \
-            -> Tuple[List[int], List[int], List[object], int]:
+    def infer_dataset_properties(self, X: Any) -> Tuple[List[int], List[int], List[object], int]:
         """
         Infers the properties of the dataset like
         categorical_columns, numerical_columns, categories, num_features
@@ -225,5 +223,17 @@
                 numerical_columns.append(i)
         categories = [np.unique(X.iloc[:, a]).tolist() for a in categorical_columns]
         num_features = X.shape[1]
-        return categorical_columns, numerical_columns, categories, num_features
+        return categorical_columns, numerical_columns, categories, num_features
+
+    def get_required_dataset_info(self) -> Dict[str, Any]:
+        """
+        Returns a dictionary containing the dataset properties required to instantiate a pipeline.
+        """
+        info = super().get_required_dataset_info()
+        info.update({
+            'numerical_columns': self.numerical_columns,
+            'categorical_columns': self.categorical_columns,
+            'task_type': self.task_type
+        })
+        return info
diff --git a/autoPyTorch/evaluation/abstract_evaluator.py b/autoPyTorch/evaluation/abstract_evaluator.py
index 65d252852..aeb89464d 100644
--- a/autoPyTorch/evaluation/abstract_evaluator.py
+++ b/autoPyTorch/evaluation/abstract_evaluator.py
@@ -32,8 +32,7 @@
 from autoPyTorch.datasets.base_dataset import BaseDataset
 from autoPyTorch.datasets.tabular_dataset import TabularDataset
 from autoPyTorch.evaluation.utils import (
-    convert_multioutput_multiclass_to_multilabel,
-    subsampler
+    convert_multioutput_multiclass_to_multilabel
 )
 from autoPyTorch.pipeline.base_pipeline import BasePipeline
 from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric
@@ -42,6 +41,7 @@
     get_metrics,
 )
 from autoPyTorch.utils.backend import Backend
+from autoPyTorch.utils.common import subsampler
 from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates
 from autoPyTorch.utils.logging_ import PicklableClientLogger, get_named_client_logger
 from autoPyTorch.utils.pipeline import get_dataset_requirements
diff --git a/autoPyTorch/evaluation/train_evaluator.py b/autoPyTorch/evaluation/train_evaluator.py
index 16b05a30e..3d3887ee5 100644
--- a/autoPyTorch/evaluation/train_evaluator.py
+++ b/autoPyTorch/evaluation/train_evaluator.py
@@ -17,9 +17,9 @@
     AbstractEvaluator,
     fit_and_suppress_warnings
 )
-from autoPyTorch.evaluation.utils import subsampler
 from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric
 from autoPyTorch.utils.backend import Backend
+from autoPyTorch.utils.common import subsampler
 from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates

 __all__ = ['TrainEvaluator', 'eval_function']
diff --git a/autoPyTorch/evaluation/utils.py b/autoPyTorch/evaluation/utils.py
index d783413ca..f7cefd100 100644
--- a/autoPyTorch/evaluation/utils.py
+++ b/autoPyTorch/evaluation/utils.py
@@ -4,8 +4,6 @@

 import numpy as np

-import pandas as pd
-
 from smac.runhistory.runhistory import RunValue

 __all__ = [
@@ -16,12 +14,6 @@
 ]


-def subsampler(data: Union[np.ndarray, pd.DataFrame],
-               x: Union[np.ndarray, List[int]]
-               ) -> Union[np.ndarray, pd.DataFrame]:
-    return data[x] if isinstance(data, np.ndarray) else data.iloc[x]
-
-
 def read_queue(queue_: Queue) -> List[RunValue]:
     stack: List[RunValue] = []
     while True:
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py
index e77c65be2..24491af44 100644
--- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/TabularColumnTransformer.py
@@ -11,7 +11,7 @@
     autoPyTorchTabularPreprocessingComponent
 )
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.utils import get_tabular_preprocessers
-from autoPyTorch.utils.common import FitRequirement
+from autoPyTorch.utils.common import FitRequirement, subsampler


 class TabularColumnTransformer(autoPyTorchTabularPreprocessingComponent):
@@ -48,7 +48,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer":
             "TabularColumnTransformer": an instance of self
         """
         self.check_requirements(X, y)
-
         numerical_pipeline = 'drop'
         categorical_pipeline = 'drop'

@@ -67,11 +66,11 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "TabularColumnTransformer":
         # Where to get the data -- Prioritize X_train if any else
         # get from backend
         if 'X_train' in X:
-            X_train = X['X_train']
+            X_train = subsampler(X['X_train'], X['train_indices'])
         else:
             X_train = X['backend'].load_datamanager().train_tensors[0]

-        self.preprocessor.fit(X_train)
+        self.preprocessor.fit(X_train)
         return self

     def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
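The `subsampler` helper that moved from `evaluation/utils.py` into `utils/common.py` (see the `autoPyTorch/utils/common.py` hunk further below) is what `TabularColumnTransformer.fit` now uses to fit the preprocessor on the training split only. A minimal sketch of its behaviour, with made-up toy data:

# Illustrative only: subsampler picks rows positionally, using plain
# indexing for ndarrays/sparse matrices and .iloc for DataFrames.
import numpy as np
import pandas as pd

from autoPyTorch.utils.common import subsampler

X_np = np.arange(12).reshape(4, 3)
X_df = pd.DataFrame(X_np)
train_indices = [0, 2]

assert subsampler(X_np, train_indices).shape == (2, 3)
assert subsampler(X_df, train_indices).shape == (2, 3)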
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder_choice.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder_choice.py
index 019861c92..7be7c94a2 100644
--- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder_choice.py
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/encoding/base_encoder_choice.py
@@ -78,6 +78,9 @@ def get_hyperparameter_search_space(self,
         # add only no encoder to choice hyperparameters in case the dataset is only numerical
         if len(dataset_properties['categorical_columns']) == 0:
             default = 'NoEncoder'
+            if include is not None and default not in include:
+                raise ValueError("Provided {} in include, however, the dataset "
+                                 "is incompatible with it".format(include))
             preprocessor = CSH.CategoricalHyperparameter('__choice__',
                                                          ['NoEncoder'],
                                                          default_value=default)
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/KernelPCA.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/KernelPCA.py
new file mode 100644
index 000000000..4fb77b90f
--- /dev/null
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/KernelPCA.py
@@ -0,0 +1,104 @@
+from math import ceil, floor
+from typing import Any, Dict, Optional, Tuple, Union
+
+from ConfigSpace.conditions import EqualsCondition, InCondition
+from ConfigSpace.configuration_space import ConfigurationSpace
+from ConfigSpace.hyperparameters import (
+    CategoricalHyperparameter,
+    UniformFloatHyperparameter,
+    UniformIntegerHyperparameter,
+)
+
+import numpy as np
+
+import sklearn.decomposition
+from sklearn.base import BaseEstimator
+
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing.\
+    base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent
+from autoPyTorch.utils.common import FitRequirement
+
+
+class KernelPCA(autoPyTorchFeaturePreprocessingComponent):
+    def __init__(self, n_components: int = 10,
+                 kernel: str = 'rbf', degree: int = 3,
+                 gamma: float = 0.01, coef0: float = 0.0,
+                 random_state: Optional[Union[int, np.random.RandomState]] = None
+                 ) -> None:
+        self.n_components = n_components
+        self.kernel = kernel
+        self.degree = degree
+        self.gamma = gamma
+        self.coef0 = coef0
+        self.random_state = random_state
+        super().__init__()
+
+        self.add_fit_requirements([
+            FitRequirement('issparse', (bool,), user_defined=True, dataset_property=True)])
+
+    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:
+
+        self.preprocessor['numerical'] = sklearn.decomposition.KernelPCA(
+            n_components=self.n_components, kernel=self.kernel,
+            degree=self.degree, gamma=self.gamma, coef0=self.coef0,
+            remove_zero_eig=True, random_state=self.random_state)
+
+        return self
+
+    @staticmethod
+    def get_hyperparameter_search_space(
+        dataset_properties: Optional[Dict[str, str]] = None,
+        n_components: Tuple[Tuple, float] = ((0.5, 0.9), 0.5),
+        kernel: Tuple[Tuple, str] = (('poly', 'rbf', 'sigmoid', 'cosine'), 'rbf'),
+        gamma: Tuple[Tuple, float, bool] = ((3.0517578125e-05, 8), 0.01, True),
+        degree: Tuple[Tuple, int] = ((2, 5), 3),
+        coef0: Tuple[Tuple, float] = ((-1, 1), 0)
+    ) -> ConfigurationSpace:
+
+        if dataset_properties is not None:
+            n_features = len(dataset_properties['numerical_columns'])
+            n_components = ((floor(n_components[0][0] * n_features), ceil(n_components[0][1] * n_features)),
+                            ceil(n_components[1] * n_features))
+        else:
+            n_components = ((10, 2000), 100)
+
+        n_components = UniformIntegerHyperparameter(
+            "n_components", lower=n_components[0][0], upper=n_components[0][1], default_value=n_components[1])
+        kernel_hp = CategoricalHyperparameter('kernel', choices=kernel[0], default_value=kernel[1])
+        gamma = UniformFloatHyperparameter(
+            "gamma",
+            lower=gamma[0][0], upper=gamma[0][1],
+            log=gamma[2],
+            default_value=gamma[1],
+        )
+        coef0 = UniformFloatHyperparameter("coef0", lower=coef0[0][0], upper=coef0[0][1], default_value=coef0[1])
+        cs = ConfigurationSpace()
+        cs.add_hyperparameters([n_components, kernel_hp, gamma, coef0])
+
+        if "poly" in kernel_hp.choices:
+            degree = UniformIntegerHyperparameter('degree', lower=degree[0][0], upper=degree[0][1],
+                                                  default_value=degree[1])
+            cs.add_hyperparameters([degree])
+            degree_depends_on_poly = EqualsCondition(degree, kernel_hp, "poly")
+            cs.add_conditions([degree_depends_on_poly])
+        kernels = []
+        if "sigmoid" in kernel_hp.choices:
+            kernels.append("sigmoid")
+        if "poly" in kernel_hp.choices:
+            kernels.append("poly")
+        coef0_condition = InCondition(coef0, kernel_hp, kernels)
+        kernels = []
+        if "rbf" in kernel_hp.choices:
+            kernels.append("rbf")
+        if "poly" in kernel_hp.choices:
+            kernels.append("poly")
+        gamma_condition = InCondition(gamma, kernel_hp, kernels)
+        cs.add_conditions([coef0_condition, gamma_condition])
+        return cs
+
+    @staticmethod
+    def get_properties(dataset_properties: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
+        return {'shortname': 'KernelPCA',
+                'name': 'Kernel Principal Component Analysis',
+                'handles_sparse': True
+                }
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/NoFeaturePreprocessor.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/NoFeaturePreprocessor.py
new file mode 100644
index 000000000..85e11973d
--- /dev/null
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/NoFeaturePreprocessor.py
@@ -0,0 +1,52 @@
+from typing import Any, Dict, Optional, Union
+
+import numpy as np
+
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing.\
+    base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent
+
+
+class NoFeaturePreprocessor(autoPyTorchFeaturePreprocessingComponent):
+    """
+    Don't perform feature preprocessing on numerical features
+    """
+    def __init__(self,
+                 random_state: Optional[Union[np.random.RandomState, int]] = None
+                 ):
+        super().__init__()
+        self.random_state = random_state
+
+    def fit(self, X: Dict[str, Any], y: Any = None) -> autoPyTorchFeaturePreprocessingComponent:
+        """
+        The fit function calls the fit function of the underlying model
+        and returns the transformed array.
+        Args:
+            X (np.ndarray): input features
+            y (Optional[np.ndarray]): input labels
+
+        Returns:
+            instance of self
+        """
+        self.check_requirements(X, y)
+
+        return self
+
+    def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Adds the preprocessor into the 'X' dictionary and returns it.
+        Args:
+            X (Dict[str, Any]): 'X' dictionary
+
+        Returns:
+            (Dict[str, Any]): the updated 'X' dictionary
+        """
+        X.update({'feature_preprocessor': self.preprocessor})
+        return X
+
+    @staticmethod
+    def get_properties(dataset_properties: Optional[Dict[str, Any]] = None) -> Dict[str, Union[str, bool]]:
+        return {
+            'shortname': 'NoFeaturePreprocessing',
+            'name': 'No Feature Preprocessing',
+            'handles_sparse': True
+        }
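In the search-space methods above, every hyperparameter argument is a `(value_range, default)` tuple, and for `n_components` the range is expressed as a fraction of the number of numerical columns. A hedged sketch of how the resulting space can be sampled (the 20-column `dataset_properties` is made up):

# Illustrative only: with 20 numerical columns the fractional range
# (0.5, 0.9) for n_components becomes the integer range (10, 18).
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing.KernelPCA import (
    KernelPCA,
)

dataset_properties = {'numerical_columns': list(range(20))}
cs = KernelPCA.get_hyperparameter_search_space(dataset_properties)
config = cs.sample_configuration()
preprocessor = KernelPCA(**config.get_dictionary())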
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/Nystroem.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/Nystroem.py
new file mode 100644
index 000000000..93be983e9
--- /dev/null
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/Nystroem.py
@@ -0,0 +1,102 @@
+from math import ceil, floor
+from typing import Any, Dict, Optional, Tuple, Union
+
+from ConfigSpace.conditions import EqualsCondition, InCondition
+from ConfigSpace.configuration_space import ConfigurationSpace
+from ConfigSpace.hyperparameters import (
+    CategoricalHyperparameter,
+    UniformFloatHyperparameter,
+    UniformIntegerHyperparameter,
+)
+
+import numpy as np
+
+import sklearn.kernel_approximation
+from sklearn.base import BaseEstimator
+
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \
+    base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent
+
+
+class Nystroem(autoPyTorchFeaturePreprocessingComponent):
+    def __init__(self, n_components: int = 10,
+                 kernel: str = 'rbf', degree: int = 3,
+                 gamma: float = 0.01, coef0: float = 0.0,
+                 random_state: Optional[Union[int, np.random.RandomState]] = None
+                 ) -> None:
+        self.n_components = n_components
+        self.kernel = kernel
+        self.degree = degree
+        self.gamma = gamma
+        self.coef0 = coef0
+        self.random_state = random_state
+        super().__init__()
+
+    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:
+
+        self.preprocessor['numerical'] = sklearn.kernel_approximation.Nystroem(
+            n_components=self.n_components, kernel=self.kernel,
+            degree=self.degree, gamma=self.gamma, coef0=self.coef0,
+            random_state=self.random_state)
+
+        return self
+
+    @staticmethod
+    def get_hyperparameter_search_space(
+        dataset_properties: Optional[Dict[str, str]] = None,
+        n_components: Tuple[Tuple, float, bool] = ((0.5, 0.9), 0.5, True),
+        kernel: Tuple[Tuple, str] = (('poly', 'rbf', 'sigmoid', 'cosine'), 'rbf'),
+        gamma: Tuple[Tuple, float, bool] = ((3.0517578125e-05, 8), 0.01, True),
+        degree: Tuple[Tuple, int] = ((2, 5), 3),
+        coef0: Tuple[Tuple, float] = ((-1, 1), 0)
+    ) -> ConfigurationSpace:
+
+        if dataset_properties is not None:
+            n_features = len(dataset_properties['numerical_columns'])
+            # if numerical features are 1, set log to False
+            if n_features == 1:
+                log = False
+            else:
+                log = n_components[2]
+            n_components = ((floor(n_components[0][0] * n_features), ceil(n_components[0][1] * n_features)),
+                            ceil(n_components[1] * n_features), log)
+        else:
+            n_components = ((10, 2000), 100, True)
+
+        n_components = UniformIntegerHyperparameter(
+            "n_components", lower=n_components[0][0], upper=n_components[0][1],
+            default_value=n_components[1], log=n_components[2])
+        kernel_hp = CategoricalHyperparameter('kernel', choices=kernel[0], default_value=kernel[1])
+        gamma = UniformFloatHyperparameter(
+            "gamma",
+            lower=gamma[0][0], upper=gamma[0][1],
+            log=gamma[2],
+            default_value=gamma[1],
+        )
+        degree = UniformIntegerHyperparameter('degree', lower=degree[0][0],
+                                              upper=degree[0][1], default_value=degree[1])
+        coef0 = UniformFloatHyperparameter("coef0", lower=coef0[0][0], upper=coef0[0][1], default_value=coef0[1])
+        cs = ConfigurationSpace()
+        cs.add_hyperparameters([n_components, kernel_hp, degree, gamma, coef0])
+
+        degree_depends_on_poly = EqualsCondition(degree, kernel_hp, "poly")
+        kernels = []
+        if "sigmoid" in kernel_hp.choices:
+            kernels.append("sigmoid")
+        if "poly" in kernel_hp.choices:
+            kernels.append("poly")
+        coef0_condition = InCondition(coef0, kernel_hp, kernels)
+        kernels = []
+        if "rbf" in kernel_hp.choices:
+            kernels.append("rbf")
+        if "poly" in kernel_hp.choices:
+            kernels.append("poly")
+        gamma_condition = InCondition(gamma, kernel_hp, kernels)
+        cs.add_conditions([degree_depends_on_poly, coef0_condition, gamma_condition])
+        return cs
+
+    @staticmethod
+    def get_properties(dataset_properties: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
+        return {'shortname': 'Nystroem',
+                'name': 'Nystroem kernel approximation',
+                'handles_sparse': True
+                }
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/PolynomialFeatures.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/PolynomialFeatures.py
new file mode 100644
index 000000000..9f542acd0
--- /dev/null
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/PolynomialFeatures.py
@@ -0,0 +1,61 @@
+from typing import Any, Dict, Optional, Tuple, Union
+
+from ConfigSpace.configuration_space import ConfigurationSpace
+from ConfigSpace.hyperparameters import (
+    CategoricalHyperparameter,
+    UniformIntegerHyperparameter,
+)
+
+import numpy as np
+
+import sklearn.preprocessing
+from sklearn.base import BaseEstimator
+
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing.\
+    base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent
+
+
+class PolynomialFeatures(autoPyTorchFeaturePreprocessingComponent):
+    def __init__(self, degree: int = 2, interaction_only: bool = False,
+                 include_bias: bool = False,
+                 random_state: Optional[Union[int, np.random.RandomState]] = None):
+        self.degree = degree
+        self.interaction_only = interaction_only
+        self.include_bias = include_bias
+
+        self.random_state = random_state
+        super().__init__()
+
+    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:
+
+        self.preprocessor['numerical'] = sklearn.preprocessing.PolynomialFeatures(
+            degree=self.degree, interaction_only=self.interaction_only,
+            include_bias=self.include_bias)
+        return self
+
+    @staticmethod
+    def get_properties(dataset_properties: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
+        return {'shortname': 'PolynomialFeatures',
+                'name': 'PolynomialFeatures',
+                'handles_sparse': True}
+
+    @staticmethod
+    def get_hyperparameter_search_space(
+        dataset_properties: Optional[Dict[str, str]] = None,
+        degree: Tuple[Tuple, int] = ((2, 3), 2),
+        interaction_only: Tuple[Tuple, bool] = ((True, False), False),
+        include_bias: Tuple[Tuple, bool] = ((True, False), False)
+    ) -> ConfigurationSpace:
+
+        degree = UniformIntegerHyperparameter("degree", lower=degree[0][0], upper=degree[0][1], default_value=degree[1])
+        interaction_only = CategoricalHyperparameter("interaction_only",
+                                                     choices=interaction_only[0],
+                                                     default_value=interaction_only[1])
+        include_bias = CategoricalHyperparameter("include_bias",
+                                                 choices=include_bias[0],
+                                                 default_value=include_bias[1])
+
+        cs = ConfigurationSpace()
+        cs.add_hyperparameters([degree, interaction_only, include_bias])
+
+        return cs
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/PowerTransformer.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/PowerTransformer.py
new file mode 100644
index 000000000..c02606c3d
--- /dev/null
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/PowerTransformer.py
@@ -0,0 +1,49 @@
+from typing import Any, Dict, Optional, Tuple, Union
+
+from ConfigSpace.configuration_space import ConfigurationSpace
+from ConfigSpace.hyperparameters import (
+    CategoricalHyperparameter,
+)
+
+import numpy as np
+
+import sklearn.preprocessing
+from sklearn.base import BaseEstimator
+
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \
+    base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent
+
+
+class PowerTransformer(autoPyTorchFeaturePreprocessingComponent):
+    def __init__(self, standardize: bool = True,
+                 random_state: Optional[Union[int, np.random.RandomState]] = None):
+        self.standardize = standardize
+
+        self.random_state = random_state
+        super().__init__()
+
+    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:
+        self.preprocessor['numerical'] = sklearn.preprocessing.PowerTransformer(method="yeo-johnson",
+                                                                                standardize=self.standardize,
+                                                                                copy=False)
+        return self
+
+    @staticmethod
+    def get_properties(dataset_properties: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
+        return {'shortname': 'PowerTransformer',
+                'name': 'Power Transformer',
+                'handles_sparse': True}
+
+    @staticmethod
+    def get_hyperparameter_search_space(
+        dataset_properties: Optional[Dict[str, str]] = None,
+        standardize: Tuple[Tuple, bool] = ((True, False), True)
+    ) -> ConfigurationSpace:
+        standardize = CategoricalHyperparameter("standardize",
+                                                choices=standardize[0],
+                                                default_value=standardize[1])
+
+        cs = ConfigurationSpace()
+        cs.add_hyperparameters([standardize])
+
+        return cs
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/RandomKitchenSinks.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/RandomKitchenSinks.py
new file mode 100644
index 000000000..8f03e1880
--- /dev/null
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/RandomKitchenSinks.py
@@ -0,0 +1,74 @@
+from math import ceil, floor
+from typing import Any, Dict, Optional, Tuple, Union
+
+from ConfigSpace.configuration_space import ConfigurationSpace
+from ConfigSpace.hyperparameters import (
+    UniformFloatHyperparameter,
+    UniformIntegerHyperparameter,
+)
+
+import numpy as np
+
+import sklearn.kernel_approximation
+from sklearn.base import BaseEstimator
+
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \
+    base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent
+
+
+class RandomKitchenSinks(autoPyTorchFeaturePreprocessingComponent):
+    def __init__(self, n_components: int = 100,
+                 gamma: float = 1.0,
+                 random_state: Optional[Union[int, np.random.RandomState]] = None
+                 ) -> None:
+        self.n_components = n_components
+        self.gamma = gamma
+        self.random_state = random_state
+        super().__init__()
+
+    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:
+
+        self.preprocessor['numerical'] = sklearn.kernel_approximation.RBFSampler(
+            self.gamma, self.n_components, self.random_state)
+        return self
+
+    @staticmethod
+    def get_hyperparameter_search_space(
+        dataset_properties: Optional[Dict[str, str]] = None,
+        n_components: Tuple[Tuple, float, bool] = ((0.5, 0.9), 0.5, True),
+        gamma: Tuple[Tuple, float, bool] = ((3.0517578125e-05, 8), 1.0, True),
+        degree: Tuple[Tuple, int] = ((2, 5), 3),
+        coef0: Tuple[Tuple, float] = ((-1, 1), 0)
+    ) -> ConfigurationSpace:
+
+        if dataset_properties is not None:
+            n_features = len(dataset_properties['numerical_columns'])
+            # if numerical features are 1, set log to False
+            if n_features == 1:
+                log = False
+            else:
+                log = n_components[2]
+            n_components = ((floor(n_components[0][0] * n_features), ceil(n_components[0][1] * n_features)),
+                            ceil(n_components[1] * n_features), log)
+        else:
+            n_components = ((10, 2000), 100, True)
+
+        n_components = UniformIntegerHyperparameter(
+            "n_components", lower=n_components[0][0], upper=n_components[0][1],
+            default_value=n_components[1], log=n_components[2])
+        gamma = UniformFloatHyperparameter(
+            "gamma",
+            lower=gamma[0][0], upper=gamma[0][1],
+            log=gamma[2],
+            default_value=gamma[1],
+        )
+        cs = ConfigurationSpace()
+        cs.add_hyperparameters([n_components, gamma])
+        return cs
+
+    @staticmethod
+    def get_properties(dataset_properties: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
+        return {'shortname': 'KitchenSink',
+                'name': 'Random Kitchen Sinks',
+                'handles_sparse': True
+                }
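Nystroem and RandomKitchenSinks share the fraction-to-absolute resolution of `n_components`, including the guard that disables the log scale for a single numerical column (the scaled bounds collapse, so a log-scaled integer range is no longer well defined). The same logic, pulled out as a standalone sketch:

# Illustrative only: mirrors the n_components handling in the two
# components above, outside of any class.
from math import ceil, floor
from typing import Optional, Tuple


def resolve_n_components(n_components: Tuple[Tuple, float, bool],
                         n_features: Optional[int]) -> Tuple[Tuple, int, bool]:
    if n_features is None:
        # no dataset information: fall back to a fixed absolute range
        return (10, 2000), 100, True
    # with one feature the scaled bounds collapse; disable the log scale
    log = n_components[2] if n_features > 1 else False
    return ((floor(n_components[0][0] * n_features),
             ceil(n_components[0][1] * n_features)),
            ceil(n_components[1] * n_features), log)


assert resolve_n_components(((0.5, 0.9), 0.5, True), n_features=10) == ((5, 9), 5, True)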
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/TruncatedSVD.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/TruncatedSVD.py
new file mode 100644
index 000000000..558fdb4de
--- /dev/null
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/TruncatedSVD.py
@@ -0,0 +1,55 @@
+from math import floor
+from typing import Any, Dict, Optional, Tuple, Union
+
+from ConfigSpace.configuration_space import ConfigurationSpace
+from ConfigSpace.hyperparameters import (
+    UniformIntegerHyperparameter,
+)
+
+import numpy as np
+
+import sklearn.decomposition
+from sklearn.base import BaseEstimator
+
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing\
+    .base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent
+
+
+class TruncatedSVD(autoPyTorchFeaturePreprocessingComponent):
+    def __init__(self, target_dim: int = 128,
+                 random_state: Optional[Union[int, np.random.RandomState]] = None):
+        self.target_dim = target_dim
+
+        self.random_state = random_state
+        super().__init__()
+
+    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:
+
+        self.preprocessor['numerical'] = sklearn.decomposition.TruncatedSVD(self.target_dim, algorithm="randomized")
+
+        return self
+
+    @staticmethod
+    def get_properties(dataset_properties: Optional[Dict[str, str]] = None) -> Dict[str, Any]:
+        return {'shortname': 'TruncSVD',
+                'name': 'Truncated Singular Value Decomposition',
+                'handles_sparse': True}
+
+    @staticmethod
+    def get_hyperparameter_search_space(
+        dataset_properties: Optional[Dict[str, str]] = None,
+        target_dim: Tuple[Tuple, float] = ((0.5, 0.9), 0.5),
+    ) -> ConfigurationSpace:
+        cs = ConfigurationSpace()
+
+        if dataset_properties is not None:
+            n_features = len(dataset_properties['numerical_columns'])
+            target_dim = ((floor(target_dim[0][0] * n_features), floor(target_dim[0][1] * n_features)),
+                          floor(target_dim[1] * n_features))
+        else:
+            target_dim = ((10, 256), 128)
+
+        target_dim = UniformIntegerHyperparameter("target_dim", lower=target_dim[0][0],
+                                                  upper=target_dim[0][1], default_value=target_dim[1])
+        cs.add_hyperparameters([target_dim])
+
+        return cs
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/__init__.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/base_feature_preprocessor.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/base_feature_preprocessor.py
new file mode 100644
index 000000000..8c85bbf30
--- /dev/null
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/base_feature_preprocessor.py
@@ -0,0 +1,27 @@
+from typing import Any, Dict, List
+
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.base_tabular_preprocessing import (
+    autoPyTorchTabularPreprocessingComponent
+)
+
+
+class autoPyTorchFeaturePreprocessingComponent(autoPyTorchTabularPreprocessingComponent):
+    _required_properties: List[str] = ['handles_sparse']
+
+    def __init__(self) -> None:
+        super().__init__()
+
+    def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Adds the fitted feature preprocessor into the 'X' dictionary and returns it.
+        Args:
+            X (Dict[str, Any]): 'X' dictionary
+
+        Returns:
+            (Dict[str, Any]): the updated 'X' dictionary
+        """
+        if self.preprocessor['numerical'] is None and self.preprocessor['categorical'] is None:
+            raise AttributeError("{} can't transform without fitting first"
+                                 .format(self.__class__.__name__))
+        X.update({'feature_preprocessor': self.preprocessor})
+        return X
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/base_feature_preprocessor_choice.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/base_feature_preprocessor_choice.py
new file mode 100644
index 000000000..56af71877
--- /dev/null
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/feature_preprocessing/base_feature_preprocessor_choice.py
@@ -0,0 +1,116 @@
+import os
+from collections import OrderedDict
+from typing import Any, Dict, List, Optional
+
+import ConfigSpace.hyperparameters as CSH
+from ConfigSpace.configuration_space import ConfigurationSpace
+
+from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice
+from autoPyTorch.pipeline.components.base_component import (
+    ThirdPartyComponents,
+    autoPyTorchComponent,
+    find_components,
+)
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \
+    base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent
+
+preprocessing_directory = os.path.split(__file__)[0]
+_preprocessors = find_components(__package__,
+                                 preprocessing_directory,
+                                 autoPyTorchFeaturePreprocessingComponent)
+_addons = ThirdPartyComponents(autoPyTorchFeaturePreprocessingComponent)
+
+
+def add_feature_preprocessor(feature_preprocessor: autoPyTorchFeaturePreprocessingComponent) -> None:
+    _addons.add_component(feature_preprocessor)
+
+
+class FeatureProprocessorChoice(autoPyTorchChoice):
+    """
+    Allows for dynamically choosing feature_preprocessor component at runtime
+    """
+
+    def get_components(self) -> Dict[str, autoPyTorchComponent]:
+        """Returns the available feature_preprocessor components
+
+        Args:
+            None
+
+        Returns:
+            Dict[str, autoPyTorchComponent]: all feature preprocessor components available
+                as choices for preprocessing the numerical columns
+        """
+        components: Dict = OrderedDict()
+        components.update(_preprocessors)
+        components.update(_addons.components)
+        return components
+
+    def get_hyperparameter_search_space(self,
+                                        dataset_properties: Optional[Dict[str, Any]] = None,
+                                        default: Optional[str] = None,
+                                        include: Optional[List[str]] = None,
+                                        exclude: Optional[List[str]] = None) -> ConfigurationSpace:
+        cs = ConfigurationSpace()
+
+        if dataset_properties is None:
+            dataset_properties = dict()
+
+        dataset_properties = {**self.dataset_properties, **dataset_properties}
+
+        available_ = self.get_available_components(dataset_properties=dataset_properties,
+                                                   include=include,
+                                                   exclude=exclude)
+
+        if len(available_) == 0:
+            raise ValueError("no feature preprocessors found, please add a feature preprocessor")
+
+        if default is None:
+            defaults = ['NoFeaturePreprocessor',
+                        'FastICA',
+                        'KernelPCA',
+                        'RandomKitchenSinks',
+                        'Nystroem',
+                        'PolynomialFeatures',
+                        'PowerTransformer',
+                        'TruncatedSVD',
+                        ]
+            for default_ in defaults:
+                if default_ in available_:
+                    if include is not None and default_ not in include:
+                        continue
+                    if exclude is not None and default_ in exclude:
+                        continue
+                    default = default_
+                    break
+
+        # add only no feature preprocessor to choice hyperparameters in case the dataset is only categorical
+        if len(dataset_properties['numerical_columns']) == 0:
+            default = 'NoFeaturePreprocessor'
+            if include is not None and default not in include:
+                raise ValueError("Provided {} in include, however, "
+                                 "the dataset is incompatible with it".format(include))
+            preprocessor = CSH.CategoricalHyperparameter('__choice__',
+                                                         ['NoFeaturePreprocessor'],
+                                                         default_value=default)
+        else:
+            # Truncated SVD requires n_features > n_components
+            if len(dataset_properties['numerical_columns']) == 1:
+                del available_['TruncatedSVD']
+            preprocessor = CSH.CategoricalHyperparameter('__choice__',
+                                                         list(available_.keys()),
+                                                         default_value=default)
+
+        cs.add_hyperparameter(preprocessor)
+
+        # add only child hyperparameters of the feature preprocessor choices
+        for name in preprocessor.choices:
+            updates = self._get_search_space_updates(prefix=name)
+            config_space = available_[name].get_hyperparameter_search_space(dataset_properties,  # type:ignore
+                                                                            **updates)
+            parent_hyperparameter = {'parent': preprocessor, 'value': name}
+            cs.add_configuration_space(name, config_space,
+                                       parent_hyperparameter=parent_hyperparameter)
+
+        self.configuration_space = cs
+        self.dataset_properties = dataset_properties
+        return cs
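As with encoders and scalers, third-party components can be registered through `add_feature_preprocessor` and are then picked up by `FeatureProprocessorChoice.get_components`. A hedged sketch; `MyPreprocessor` is hypothetical and must implement the full component interface (fit, get_properties, get_hyperparameter_search_space) before registration:

# Illustrative only: MyPreprocessor is a hypothetical component.
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \
    base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \
    base_feature_preprocessor_choice import add_feature_preprocessor


class MyPreprocessor(autoPyTorchFeaturePreprocessingComponent):
    ...  # fit, get_properties and get_hyperparameter_search_space go here


add_feature_preprocessor(MyPreprocessor)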
diff --git a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/base_scaler_choice.py b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/base_scaler_choice.py
index 718c80d39..6fdcc47bc 100644
--- a/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/base_scaler_choice.py
+++ b/autoPyTorch/pipeline/components/preprocessing/tabular_preprocessing/scaling/base_scaler_choice.py
@@ -74,6 +74,9 @@ def get_hyperparameter_search_space(self,
         # add only no scaler to choice hyperparameters in case the dataset is only categorical
         if len(dataset_properties['numerical_columns']) == 0:
             default = 'NoScaler'
+            if include is not None and default not in include:
+                raise ValueError("Provided {} in include, however, "
+                                 "the dataset is incompatible with it".format(include))
             preprocessor = CSH.CategoricalHyperparameter('__choice__',
                                                          ['NoScaler'],
                                                          default_value=default)
diff --git a/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py b/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py
index 5d6def24a..6053bdf6d 100644
--- a/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py
+++ b/autoPyTorch/pipeline/components/setup/early_preprocessor/EarlyPreprocessing.py
@@ -31,7 +31,6 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> "EarlyPreprocessing":

     def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
         transforms = get_preprocess_transforms(X)
-
         if X['dataset_properties']['is_small_preprocess']:
             if 'X_train' in X:
                 X_train = X['X_train']
diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py
index d355005e8..2557e92b8 100644
--- a/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py
+++ b/autoPyTorch/pipeline/components/setup/network_backbone/base_network_backbone.py
@@ -1,5 +1,11 @@
 from abc import abstractmethod
-from typing import Any, Dict, Tuple
+from typing import Any, Dict, Iterable, Optional, Tuple
+
+import numpy as np
+
+import pandas as pd
+
+from scipy.sparse import csr_matrix

 import torch
 from torch import nn
@@ -8,6 +14,7 @@
 from autoPyTorch.pipeline.components.base_component import (
     autoPyTorchComponent,
 )
+from autoPyTorch.utils.common import FitRequirement


 class NetworkBackboneComponent(autoPyTorchComponent):
@@ -19,8 +26,15 @@ class NetworkBackboneComponent(autoPyTorchComponent):
     def __init__(self,
                  **kwargs: Any):
         super().__init__()
+        self.add_fit_requirements([
+            FitRequirement('is_small_preprocess', (bool,), user_defined=True, dataset_property=True),
+            FitRequirement('X_train', (np.ndarray, pd.DataFrame, csr_matrix), user_defined=True,
+                           dataset_property=False),
+            FitRequirement('input_shape', (Iterable,), user_defined=True, dataset_property=True),
+            FitRequirement('tabular_transformer', (BaseEstimator,), user_defined=False, dataset_property=False)])
         self.backbone: nn.Module = None
         self.config = kwargs
+        self.input_shape: Optional[Iterable] = None

     def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:
         """
@@ -32,8 +46,17 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:
         Returns:
             Self
         """
+        self.check_requirements(X, y)
+        X_train = X['X_train']

-        input_shape = X['X_train'].shape[1:]
+        if X["dataset_properties"]["is_small_preprocess"]:
+            input_shape = X_train.shape[1:]
+        else:
+            # get input shape by transforming the first element of the training set
+            column_transformer = X['tabular_transformer'].preprocessor
+            input_shape = column_transformer.transform(X_train[:1]).shape[1:]
+
+        self.input_shape = input_shape

         self.backbone = self.build_backbone(
             input_shape=input_shape,
@@ -42,13 +65,15 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:

     def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
         """
-        Adds the network head into the fit dictionary 'X' and returns it.
-
+        Adds the network backbone into the fit dictionary 'X' and returns it.
+        Also updates the input shape, as from this point on only the shape of
+        the transformed dataset is used.
         Args:
             X (Dict[str, Any]): 'X' dictionary

         Returns:
             (Dict[str, Any]): the updated 'X' dictionary
         """
+        X['dataset_properties'].update({'input_shape': self.input_shape})
         X.update({'network_backbone': self.backbone})
         return X
diff --git a/autoPyTorch/pipeline/components/setup/network_backbone/utils.py b/autoPyTorch/pipeline/components/setup/network_backbone/utils.py
index 45a96a362..c20ca5ed2 100644
--- a/autoPyTorch/pipeline/components/setup/network_backbone/utils.py
+++ b/autoPyTorch/pipeline/components/setup/network_backbone/utils.py
@@ -50,7 +50,7 @@ def forward(
     def backward(ctx: typing.Any,
                  grad_output: torch.Tensor
                  ) -> typing.Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-        x1, x2, alpha, beta = ctx.saved_variables
+        x1, x2, alpha, beta = ctx.saved_tensors
         grad_x1 = grad_x2 = grad_alpha = grad_beta = None

         if ctx.needs_input_grad[0]:
@@ -81,7 +81,7 @@ def forward(ctx: typing.Any,
     def backward(ctx: typing.Any,
                  grad_output: torch.Tensor
                  ) -> typing.Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
-        x, alpha, beta, bl = ctx.saved_variables
+        x, alpha, beta, bl = ctx.saved_tensors
         grad_x = grad_alpha = grad_beta = grad_bl = None

         if ctx.needs_input_grad[0]:
diff --git a/autoPyTorch/pipeline/components/setup/network_head/base_network_head.py b/autoPyTorch/pipeline/components/setup/network_head/base_network_head.py
index be2a9c7dc..ced7630fa 100644
--- a/autoPyTorch/pipeline/components/setup/network_head/base_network_head.py
+++ b/autoPyTorch/pipeline/components/setup/network_head/base_network_head.py
@@ -1,11 +1,12 @@
 from abc import abstractmethod
-from typing import Any, Dict, Tuple
+from typing import Any, Dict, Iterable, Tuple

 import torch.nn as nn

 from autoPyTorch.constants import CLASSIFICATION_TASKS, STRING_TO_TASK_TYPES
 from autoPyTorch.pipeline.components.base_component import BaseEstimator, autoPyTorchComponent
 from autoPyTorch.pipeline.components.setup.network_backbone.utils import get_output_shape
+from autoPyTorch.utils.common import FitRequirement


 class NetworkHeadComponent(autoPyTorchComponent):
@@ -17,6 +18,12 @@ class NetworkHeadComponent(autoPyTorchComponent):
     def __init__(self,
                  **kwargs: Any):
         super().__init__()
+        self.add_fit_requirements([
+            FitRequirement('input_shape', (Iterable,), user_defined=True, dataset_property=True),
+            FitRequirement('num_classes', (int,), user_defined=True, dataset_property=True),
+            FitRequirement('task_type', (str,), user_defined=True, dataset_property=True),
+            FitRequirement('output_shape', (Iterable, int), user_defined=True, dataset_property=True),
+        ])
         self.head: nn.Module = None
         self.config = kwargs

@@ -30,7 +37,7 @@ def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:
         Returns:
             Self
         """
-        input_shape = X['X_train'].shape[1:]
+        input_shape = X['dataset_properties']['input_shape']
         output_shape = (X['dataset_properties']['num_classes'],) if \
             STRING_TO_TASK_TYPES[X['dataset_properties']['task_type']] in \
             CLASSIFICATION_TASKS else X['dataset_properties']['output_shape']
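The backbone now asks the fitted column transformer for the width of the transformed data whenever the dataset was too large to preprocess up front; transforming a single row is enough, since only `shape[1:]` is read. A sketch of the two paths in `NetworkBackboneComponent.fit`:

# Illustrative only: mirrors the input-shape logic added above.
def infer_input_shape(X):
    X_train = X['X_train']
    if X['dataset_properties']['is_small_preprocess']:
        # data was already transformed in memory: read the shape directly
        return X_train.shape[1:]
    # otherwise transform one row with the fitted TabularColumnTransformer
    column_transformer = X['tabular_transformer'].preprocessor
    return column_transformer.transform(X_train[:1]).shape[1:]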
diff --git a/autoPyTorch/pipeline/components/training/losses.py b/autoPyTorch/pipeline/components/training/losses.py
index 705246310..84d914f1c 100644
--- a/autoPyTorch/pipeline/components/training/losses.py
+++ b/autoPyTorch/pipeline/components/training/losses.py
@@ -47,7 +47,7 @@ def get_supported_losses(task: int, output_type: int) -> Dict[str, Type[Loss]]:
     return supported_losses


-def get_loss_instance(dataset_properties: Dict[str, Any], name: Optional[str] = None) -> Loss:
+def get_loss_instance(dataset_properties: Dict[str, Any], name: Optional[str] = None) -> Type[Loss]:
     assert 'task_type' in dataset_properties, \
         "Expected dataset_properties to have task_type got {}".format(dataset_properties.keys())
     assert 'output_type' in dataset_properties, \
@@ -68,4 +68,4 @@ def get_loss_instance(dataset_properties: Dict[str, Any], name: Optional[str] =
     else:
         loss = get_default(task)

-    return loss()
+    return loss
diff --git a/autoPyTorch/pipeline/components/training/trainer/MixUpTrainer.py b/autoPyTorch/pipeline/components/training/trainer/MixUpTrainer.py
index a49e17682..5bcf0f861 100644
--- a/autoPyTorch/pipeline/components/training/trainer/MixUpTrainer.py
+++ b/autoPyTorch/pipeline/components/training/trainer/MixUpTrainer.py
@@ -2,6 +2,7 @@

 from ConfigSpace.configuration_space import ConfigurationSpace
 from ConfigSpace.hyperparameters import (
+    CategoricalHyperparameter,
     UniformFloatHyperparameter,
 )

@@ -9,11 +10,13 @@

 import torch

+from autoPyTorch.constants import CLASSIFICATION_TASKS, STRING_TO_TASK_TYPES
 from autoPyTorch.pipeline.components.training.trainer.base_trainer import BaseTrainerComponent


 class MixUpTrainer(BaseTrainerComponent):
-    def __init__(self, alpha: float, random_state: typing.Optional[np.random.RandomState] = None):
+    def __init__(self, alpha: float, weighted_loss: bool = False,
+                 random_state: typing.Optional[np.random.RandomState] = None):
         """
         This class handles the training of a network for a single given epoch.
@@ -22,6 +25,7 @@ def __init__(self, alpha: float, random_state: typing.Optional[np.random.RandomS

         """
         super().__init__(random_state=random_state)
+        self.weighted_loss = weighted_loss
         self.alpha = alpha

     def data_preparation(self, X: np.ndarray, y: np.ndarray,
@@ -62,10 +66,16 @@ def get_properties(dataset_properties: typing.Optional[typing.Dict[str, typing.A

     @staticmethod
     def get_hyperparameter_search_space(dataset_properties: typing.Optional[typing.Dict] = None,
-                                        alpha: typing.Tuple[typing.Tuple[float, float], float] = ((0, 1), 0.2)
+                                        alpha: typing.Tuple[typing.Tuple[float, float], float] = ((0, 1), 0.2),
+                                        weighted_loss: typing.Tuple[typing.Tuple, bool] = ((True, False), True)
                                         ) -> ConfigurationSpace:
         alpha = UniformFloatHyperparameter(
             "alpha", alpha[0][0], alpha[0][1], default_value=alpha[1])
+        weighted_loss = CategoricalHyperparameter("weighted_loss", choices=weighted_loss[0],
+                                                  default_value=weighted_loss[1])
         cs = ConfigurationSpace()
         cs.add_hyperparameters([alpha])
+        if dataset_properties is not None:
+            if STRING_TO_TASK_TYPES[dataset_properties['task_type']] in CLASSIFICATION_TASKS:
+                cs.add_hyperparameters([weighted_loss])
         return cs
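`get_loss_instance` now hands back the loss class instead of an instance, so the trainer can construct it later with class-weight arguments (see `BaseTrainerComponent.prepare` below). A minimal sketch of the new calling convention; the returned class depends on the task and output type:

# Illustrative only: the loss is instantiated by the caller, which makes
# it possible to pass weight arguments at construction time.
from autoPyTorch.pipeline.components.training.losses import get_loss_instance

dataset_properties = {'task_type': 'tabular_classification', 'output_type': 'binary'}
loss_class = get_loss_instance(dataset_properties)
criterion = loss_class()  # instantiated by the caller, e.g. with weights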
diff --git a/autoPyTorch/pipeline/components/training/trainer/StandardTrainer.py b/autoPyTorch/pipeline/components/training/trainer/StandardTrainer.py
index 454d4c625..dbd190c59 100644
--- a/autoPyTorch/pipeline/components/training/trainer/StandardTrainer.py
+++ b/autoPyTorch/pipeline/components/training/trainer/StandardTrainer.py
@@ -1,13 +1,27 @@
 import typing

 from ConfigSpace.configuration_space import ConfigurationSpace
+from ConfigSpace.hyperparameters import CategoricalHyperparameter

 import numpy as np

+from autoPyTorch.constants import CLASSIFICATION_TASKS, STRING_TO_TASK_TYPES
 from autoPyTorch.pipeline.components.training.trainer.base_trainer import BaseTrainerComponent


 class StandardTrainer(BaseTrainerComponent):
+    def __init__(self, weighted_loss: bool = False,
+                 random_state: typing.Optional[np.random.RandomState] = None):
+        """
+        This class handles the training of a network for a single given epoch.
+
+        Args:
+            weighted_loss (bool): whether to use weighted loss
+
+        """
+        super().__init__(random_state=random_state)
+        self.weighted_loss = weighted_loss
+
     def data_preparation(self, X: np.ndarray, y: np.ndarray,
                          ) -> typing.Tuple[np.ndarray, typing.Dict[str, np.ndarray]]:
         """
@@ -40,7 +54,13 @@ def get_properties(dataset_properties: typing.Optional[typing.Dict[str, typing.A

     @staticmethod
     def get_hyperparameter_search_space(dataset_properties: typing.Optional[typing.Dict] = None,
-                                        **kwargs: typing.Any
+                                        weighted_loss: typing.Tuple[typing.Tuple, bool] = ((True, False), True)
                                         ) -> ConfigurationSpace:
+        weighted_loss = CategoricalHyperparameter("weighted_loss", choices=weighted_loss[0],
+                                                  default_value=weighted_loss[1])
         cs = ConfigurationSpace()
+        if dataset_properties is not None:
+            if STRING_TO_TASK_TYPES[dataset_properties['task_type']] in CLASSIFICATION_TASKS:
+                cs.add_hyperparameters([weighted_loss])
+
         return cs
diff --git a/autoPyTorch/pipeline/components/training/trainer/base_trainer.py b/autoPyTorch/pipeline/components/training/trainer/base_trainer.py
index 64e30ae45..69665007e 100644
--- a/autoPyTorch/pipeline/components/training/trainer/base_trainer.py
+++ b/autoPyTorch/pipeline/components/training/trainer/base_trainer.py
@@ -3,14 +3,18 @@

 import numpy as np

+import pandas as pd
+
 import torch
 from torch.autograd import Variable
 from torch.optim import Optimizer
 from torch.optim.lr_scheduler import _LRScheduler
 from torch.utils.tensorboard.writer import SummaryWriter

+from autoPyTorch.constants import BINARY
 from autoPyTorch.pipeline.components.training.base_training import autoPyTorchTrainingComponent
 from autoPyTorch.pipeline.components.training.metrics.utils import calculate_score
+from autoPyTorch.utils.implementations import get_loss_weight_strategy
 from autoPyTorch.utils.logging_ import PicklableClientLogger

@@ -165,6 +169,7 @@ class BaseTrainerComponent(autoPyTorchTrainingComponent):
     def __init__(self, random_state: Optional[Union[np.random.RandomState, int]] = None) -> None:
         super().__init__()
         self.random_state = random_state
+        self.weighted_loss: bool = False

     def prepare(
         self,
@@ -176,7 +181,9 @@ def prepare(
         device: torch.device,
         metrics_during_training: bool,
         scheduler: _LRScheduler,
-        task_type: int
+        task_type: int,
+        output_type: int,
+        labels: Union[np.ndarray, torch.Tensor, pd.DataFrame]
     ) -> None:

         # Save the device to be used
@@ -185,8 +192,20 @@ def prepare(
         # Setup the metrics
         self.metrics = metrics

+        # Weights for the loss function
+        weights = None
+        kwargs = {}
+        if self.weighted_loss:
+            weights = self.get_class_weights(output_type, labels)
+            if output_type == BINARY:
+                kwargs['pos_weight'] = weights
+            else:
+                kwargs['weight'] = weights
+
+        criterion = criterion(**kwargs) if weights is not None else criterion()
+
         # Setup the loss function
-        self.criterion = criterion.to(device)
+        self.criterion = criterion

         # setup the model
         self.model = model.to(device)
@@ -245,7 +264,6 @@ def train_epoch(self, train_loader: torch.utils.data.DataLoader, epoch: int,
         targets_data = list()

         for step, (data, targets) in enumerate(train_loader):
-
             if self.budget_tracker.is_max_time_reached():
                 logger.info("Stopping training as max time reached")
                 break
@@ -358,6 +376,14 @@ def compute_metrics(self, outputs_data: np.ndarray, targets_data: np.ndarray
         targets_data = torch.cat(targets_data, dim=0)
         return calculate_score(targets_data, outputs_data, self.task_type, self.metrics)

+    def get_class_weights(self, output_type: int, labels: Union[np.ndarray, torch.Tensor, pd.DataFrame]
+                          ) -> np.ndarray:
+        strategy = get_loss_weight_strategy(output_type)
+        weights = strategy(y=labels)
+        weights = torch.from_numpy(weights)
+        weights = weights.type(torch.FloatTensor).to(self.device)
+        return weights
+
     def data_preparation(self, X: np.ndarray, y: np.ndarray,
                          ) -> Tuple[np.ndarray, Dict[str, np.ndarray]]:
         """
diff --git a/autoPyTorch/pipeline/components/training/trainer/base_trainer_choice.py b/autoPyTorch/pipeline/components/training/trainer/base_trainer_choice.py
index 92243311c..e6a630fb6 100755
--- a/autoPyTorch/pipeline/components/training/trainer/base_trainer_choice.py
+++ b/autoPyTorch/pipeline/components/training/trainer/base_trainer_choice.py
@@ -19,7 +19,7 @@
 from torch.optim.lr_scheduler import _LRScheduler
 from torch.utils.tensorboard.writer import SummaryWriter

-from autoPyTorch.constants import STRING_TO_TASK_TYPES
+from autoPyTorch.constants import STRING_TO_OUTPUT_TYPES, STRING_TO_TASK_TYPES
 from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice
 from autoPyTorch.pipeline.components.base_component import (
     ThirdPartyComponents,
@@ -56,6 +56,7 @@ class TrainerChoice(autoPyTorchChoice):
     epoch happens, that is, how batches of data are fed and used to
     train the network.
     """
+
     def __init__(self,
                  dataset_properties: Dict[str, Any],
                  random_state: Optional[np.random.RandomState] = None
@@ -97,11 +98,11 @@ def get_components(self) -> Dict[str, autoPyTorchComponent]:
         return components

     def get_hyperparameter_search_space(
-            self,
-            dataset_properties: Optional[Dict[str, str]] = None,
-            default: Optional[str] = None,
-            include: Optional[List[str]] = None,
-            exclude: Optional[List[str]] = None,
+        self,
+        dataset_properties: Optional[Dict[str, str]] = None,
+        default: Optional[str] = None,
+        include: Optional[List[str]] = None,
+        exclude: Optional[List[str]] = None,
     ) -> ConfigurationSpace:
         """Returns the configuration space of the current chosen components

@@ -121,6 +122,8 @@ def get_hyperparameter_search_space(
         if dataset_properties is None:
             dataset_properties = {}

+        dataset_properties = {**self.dataset_properties, **dataset_properties}
+
         # Compile a list of legal trainers for this problem
         available_trainers = self.get_available_components(
             dataset_properties=dataset_properties,
@@ -270,7 +273,9 @@ def _fit(self, X: Dict[str, Any], y: Any = None, **kwargs: Any) -> torch.nn.Modu
             device=self.get_device(X),
             metrics_during_training=X['metrics_during_training'],
             scheduler=X['lr_scheduler'],
-            task_type=STRING_TO_TASK_TYPES[X['dataset_properties']['task_type']]
+            task_type=STRING_TO_TASK_TYPES[X['dataset_properties']['task_type']],
+            output_type=STRING_TO_OUTPUT_TYPES[X['dataset_properties']['output_type']],
+            labels=X['y_train'][X['backend'].load_datamanager().splits[X['split_id']][0]]
         )
         total_parameter_count, trainable_parameter_count = self.count_parameters(X['network'])
         self.run_summary = RunSummary(
@@ -490,6 +495,10 @@ def check_requirements(self, X: Dict[str, Any], y: Any = None) -> None:
                     config_option
                 ))

+        # For early stopping, we need to know the patience
+        if 'early_stopping' not in X:
+            raise ValueError('To fit a Trainer, expected fit dictionary to have early_stopping')
+
     def get_device(self, X: Dict[str, Any]) -> torch.device:
         """
         Returns the device to do torch operations
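When `weighted_loss` is enabled, `prepare` passes the computed weights into the loss constructor: binary outputs use `pos_weight` (the BCE convention), everything else the generic `weight` argument. The same construction outside the trainer, with made-up weights:

# Illustrative only: the two keyword conventions used by prepare().
import torch

pos_weight = torch.tensor([3.0])  # e.g. three negatives per positive
binary_criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)

class_weights = torch.tensor([0.5, 2.0])
multiclass_criterion = torch.nn.CrossEntropyLoss(weight=class_weights)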
diff --git a/autoPyTorch/pipeline/tabular_classification.py b/autoPyTorch/pipeline/tabular_classification.py
index 5059f3536..3540d9660 100644
--- a/autoPyTorch/pipeline/tabular_classification.py
+++ b/autoPyTorch/pipeline/tabular_classification.py
@@ -16,6 +16,8 @@
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding.base_encoder_choice import (
     EncoderChoice
 )
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \
+    base_feature_preprocessor_choice import FeatureProprocessorChoice
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler_choice import ScalerChoice
 from autoPyTorch.pipeline.components.setup.early_preprocessor.EarlyPreprocessing import EarlyPreprocessing
@@ -59,25 +61,25 @@ class TabularClassificationPipeline(ClassifierMixin, BasePipeline):
     """

     def __init__(
-            self,
-            config: Optional[Configuration] = None,
-            steps: Optional[List[Tuple[str, autoPyTorchChoice]]] = None,
-            dataset_properties: Optional[Dict[str, Any]] = None,
-            include: Optional[Dict[str, Any]] = None,
-            exclude: Optional[Dict[str, Any]] = None,
-            random_state: Optional[np.random.RandomState] = None,
-            init_params: Optional[Dict[str, Any]] = None,
-            search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None
+        self,
+        config: Optional[Configuration] = None,
+        steps: Optional[List[Tuple[str, autoPyTorchChoice]]] = None,
+        dataset_properties: Optional[Dict[str, Any]] = None,
+        include: Optional[Dict[str, Any]] = None,
+        exclude: Optional[Dict[str, Any]] = None,
+        random_state: Optional[np.random.RandomState] = None,
+        init_params: Optional[Dict[str, Any]] = None,
+        search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None
     ):
         super().__init__(
             config, steps, dataset_properties, include, exclude,
             random_state, init_params, search_space_updates)

     def fit_transformer(
-            self,
-            X: np.ndarray,
-            y: np.ndarray,
-            fit_params: Optional[Dict[str, Any]] = None
+        self,
+        X: np.ndarray,
+        y: np.ndarray,
+        fit_params: Optional[Dict[str, Any]] = None
     ) -> Tuple[np.ndarray, Optional[Dict[str, Any]]]:
         """Fits the pipeline given a training (X,y) pair
@@ -239,6 +241,7 @@ def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]],
             ("imputer", SimpleImputer()),
             ("encoder", EncoderChoice(default_dataset_properties)),
             ("scaler", ScalerChoice(default_dataset_properties)),
+            ("feature_preprocessor", FeatureProprocessorChoice(default_dataset_properties)),
             ("tabular_transformer", TabularColumnTransformer()),
             ("preprocessing", EarlyPreprocessing()),
             ("network_backbone", NetworkBackboneChoice(default_dataset_properties)),
diff --git a/autoPyTorch/utils/common.py b/autoPyTorch/utils/common.py
index 3143ced11..88af11531 100644
--- a/autoPyTorch/utils/common.py
+++ b/autoPyTorch/utils/common.py
@@ -133,3 +133,9 @@ def hash_array_or_matrix(X: Union[np.ndarray, pd.DataFrame]) -> str:
         hash = m.hexdigest()

     return hash
+
+
+def subsampler(data: Union[np.ndarray, pd.DataFrame, scipy.sparse.csr_matrix],
+               x: Union[np.ndarray, List[int]]
+               ) -> Union[np.ndarray, pd.DataFrame, scipy.sparse.csr_matrix]:
+    return data[x] if isinstance(data, (np.ndarray, scipy.sparse.csr_matrix)) else data.iloc[x]
diff --git a/autoPyTorch/utils/implementations.py b/autoPyTorch/utils/implementations.py
new file mode 100644
index 000000000..15f1758e1
--- /dev/null
+++ b/autoPyTorch/utils/implementations.py
@@ -0,0 +1,48 @@
+from typing import Callable, Union
+
+import numpy as np
+
+import torch
+
+from autoPyTorch.constants import BINARY
+
+
+def get_loss_weight_strategy(output_type: int) -> Callable:
+    if output_type == BINARY:
+        return LossWeightStrategyWeightedBinary()
+    else:
+        return LossWeightStrategyWeighted()
+
+
+class LossWeightStrategyWeighted():
+    def __call__(self, y: Union[np.ndarray, torch.Tensor]) -> np.ndarray:
+        if isinstance(y, torch.Tensor):
+            y = y.detach().cpu().numpy() if y.is_cuda else y.numpy()
+        if isinstance(y[0], str):
+            y = y.astype('float64')
+        counts = np.sum(y, axis=0)
+        total_weight = y.shape[0]
+
+        if len(y.shape) > 1:
+            weight_per_class = total_weight / y.shape[1]
+            weights = (np.ones(y.shape[1]) * weight_per_class) / np.maximum(counts, 1)
+        else:
+            classes, counts = np.unique(y, axis=0, return_counts=True)
+            classes, counts = classes[::-1], counts[::-1]
+            weight_per_class = total_weight / classes.shape[0]
+            weights = (np.ones(classes.shape[0]) * weight_per_class) / counts
+
+        return weights
+
+
+class LossWeightStrategyWeightedBinary():
+    def __call__(self, y: Union[np.ndarray, torch.Tensor]) -> np.ndarray:
+        if isinstance(y, torch.Tensor):
+            y = y.detach().cpu().numpy() if y.is_cuda else y.numpy()
+        if isinstance(y[0], str):
+            y = y.astype('float64')
+        counts_one = np.sum(y, axis=0)
+        counts_zero = y.shape[0] - counts_one
+        weights = counts_zero / np.maximum(counts_one, 1)
+
+        return np.array(weights)
diff --git a/requirements.txt b/requirements.txt
index 366837cd6..cced8bcf4 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,8 @@
 pandas
 torch
 torchvision
+tensorboard
 scikit-learn>=0.22.0,<0.23
-torchvision
-pytorch-lightning
 numpy
 scipy
 lockfile
@@ -16,4 +15,4 @@ dask
 distributed>=2.2.0
 catboost
 lightgbm
-
+flaky
diff --git a/setup.py b/setup.py
index 07ab30a8c..c496a48c1 100755
--- a/setup.py
+++ b/setup.py
@@ -11,7 +11,7 @@
 # noinspection PyInterpreter
 setuptools.setup(
     name="autoPyTorch",
-    version="0.0.2",
+    version="0.0.3",
     author="AutoML Freiburg",
     author_email="zimmerl@informatik.uni-freiburg.de",
     description=("Auto-PyTorch searches neural architectures using BO-HB"),
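A worked example of the two weighting strategies on a small, imbalanced label vector (the numbers are illustrative):

# Illustrative only: expected weights for an 8-sample binary problem.
import numpy as np

from autoPyTorch.utils.implementations import (
    LossWeightStrategyWeighted,
    LossWeightStrategyWeightedBinary,
)

y = np.array([0, 0, 0, 0, 0, 0, 1, 1])  # six zeros, two ones

# pos_weight for BCEWithLogitsLoss: counts_zero / counts_one = 6 / 2 = 3.0
LossWeightStrategyWeightedBinary()(y=y)

# per-class weights: (n_samples / n_classes) divided by each class count
LossWeightStrategyWeighted()(y=y)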
diff --git a/requirements.txt b/requirements.txt
index 366837cd6..cced8bcf4 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,8 @@
 pandas
 torch
 torchvision
+tensorboard
 scikit-learn>=0.22.0,<0.23
-torchvision
-pytorch-lightning
 numpy
 scipy
 lockfile
@@ -16,4 +15,4 @@ dask
 distributed>=2.2.0
 catboost
 lightgbm
-
+flaky
diff --git a/setup.py b/setup.py
index 07ab30a8c..c496a48c1 100755
--- a/setup.py
+++ b/setup.py
@@ -11,7 +11,7 @@
 # noinspection PyInterpreter
 setuptools.setup(
     name="autoPyTorch",
-    version="0.0.2",
+    version="0.0.3",
     author="AutoML Freiburg",
     author_email="zimmerl@informatik.uni-freiburg.de",
     description=("Auto-PyTorch searches neural architectures using BO-HB"),
diff --git a/test/conftest.py b/test/conftest.py
index 8ef3cc28f..d16d40546 100644
--- a/test/conftest.py
+++ b/test/conftest.py
@@ -156,30 +156,29 @@ def fit_dictionary(request):
 def fit_dictionary_numerical_only(backend):
     X, y = make_classification(
         n_samples=200,
-        n_features=4,
-        n_informative=3,
-        n_redundant=1,
+        n_features=10,
+        n_informative=6,
+        n_redundant=4,
         n_repeated=0,
         n_classes=2,
         n_clusters_per_class=2,
         shuffle=True,
         random_state=0
     )
+    X = X.astype('float64')
     datamanager = TabularDataset(
         X=X, Y=y,
         X_test=X, Y_test=y,
     )
-    info = {'task_type': datamanager.task_type,
-            'output_type': datamanager.output_type,
-            'issparse': datamanager.issparse,
-            'numerical_columns': datamanager.numerical_columns,
-            'categorical_columns': datamanager.categorical_columns}
+    info = datamanager.get_required_dataset_info()
     dataset_properties = datamanager.get_dataset_properties(get_dataset_requirements(info))
     fit_dictionary = {
         'X_train': X,
         'y_train': y,
+        'train_indices': datamanager.splits[0][0],
+        'val_indices': datamanager.splits[0][1],
         'dataset_properties': dataset_properties,
         'num_run': np.random.randint(50),
         'device': 'cpu',
@@ -209,16 +208,14 @@ def fit_dictionary_categorical_only(backend):
         X=X, Y=y,
         X_test=X, Y_test=y,
     )
-    info = {'task_type': datamanager.task_type,
-            'output_type': datamanager.output_type,
-            'issparse': datamanager.issparse,
-            'numerical_columns': datamanager.numerical_columns,
-            'categorical_columns': datamanager.categorical_columns}
+    info = datamanager.get_required_dataset_info()
     dataset_properties = datamanager.get_dataset_properties(get_dataset_requirements(info))
     fit_dictionary = {
         'X_train': X,
         'y_train': y,
+        'train_indices': datamanager.splits[0][0],
+        'val_indices': datamanager.splits[0][1],
         'dataset_properties': dataset_properties,
         'num_run': np.random.randint(50),
         'device': 'cpu',
@@ -250,17 +247,15 @@ def fit_dictionary_num_and_categorical(backend):
         X=X, Y=y,
         X_test=X, Y_test=y,
     )
-    info = {'task_type': datamanager.task_type,
-            'output_type': datamanager.output_type,
-            'issparse': datamanager.issparse,
-            'numerical_columns': datamanager.numerical_columns,
-            'categorical_columns': datamanager.categorical_columns}
+    info = datamanager.get_required_dataset_info()
     dataset_properties = datamanager.get_dataset_properties(get_dataset_requirements(info))
     fit_dictionary = {
         'X_train': X,
         'y_train': y,
+        'train_indices': datamanager.splits[0][0],
+        'val_indices': datamanager.splits[0][1],
         'dataset_properties': dataset_properties,
         'num_run': np.random.randint(50),
         'device': 'cpu',
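The fixtures now carry the full training data plus split indices, and components slice what they need during fit; a minimal sketch of that contract (illustrative values, not from the diff):

    import numpy as np

    fit_dictionary = {
        'X_train': np.random.random((10, 3)),
        'train_indices': np.arange(5),       # cf. datamanager.splits[0][0]
        'val_indices': np.arange(5, 10),     # cf. datamanager.splits[0][1]
    }
    # what a preprocessing component fits on after subsampling
    X_fit = fit_dictionary['X_train'][fit_dictionary['train_indices']]
    assert X_fit.shape == (5, 3)
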
diff --git a/test/test_datasets/test_tabular_dataset.py b/test/test_datasets/test_tabular_dataset.py
index dfc72be77..6d5dacd8d 100644
--- a/test/test_datasets/test_tabular_dataset.py
+++ b/test/test_datasets/test_tabular_dataset.py
@@ -98,11 +98,7 @@ def test_get_dataset_properties(self):
         backend.save_datamanager(datamanager)
         datamanager = backend.load_datamanager()

-        info = {'task_type': datamanager.task_type,
-                'output_type': datamanager.output_type,
-                'issparse': datamanager.issparse,
-                'numerical_columns': datamanager.numerical_columns,
-                'categorical_columns': datamanager.categorical_columns}
+        info = datamanager.get_required_dataset_info()
         dataset_requirements = get_dataset_requirements(info)
         dataset_properties = datamanager.get_dataset_properties(dataset_requirements)
diff --git a/test/test_pipeline/components/base.py b/test/test_pipeline/components/base.py
index 120fa9fcd..6ad3ad824 100644
--- a/test/test_pipeline/components/base.py
+++ b/test/test_pipeline/components/base.py
@@ -1,13 +1,22 @@
 import logging
 import unittest
+from typing import Any, Dict, List, Optional, Tuple

 from sklearn.datasets import make_classification

 import torch

-from autoPyTorch.constants import STRING_TO_TASK_TYPES
+from autoPyTorch.constants import STRING_TO_OUTPUT_TYPES, STRING_TO_TASK_TYPES
+from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.TabularColumnTransformer import \
+    TabularColumnTransformer
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding.base_encoder_choice import \
+    EncoderChoice
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler_choice import ScalerChoice
 from autoPyTorch.pipeline.components.training.metrics.utils import get_metrics
 from autoPyTorch.pipeline.components.training.trainer.base_trainer import BudgetTracker
+from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline


 class BaseTraining(unittest.TestCase):
@@ -40,7 +49,7 @@ def setUp(self):
         layers.append(torch.nn.Sigmoid())
         layers.append(torch.nn.Linear(4, 2))
         self.model = torch.nn.Sequential(*layers)
-        self.criterion = torch.nn.CrossEntropyLoss()
+        self.criterion = torch.nn.CrossEntropyLoss
         self.optimizer = torch.optim.SGD(self.model.parameters(), lr=0.01)
         self.device = torch.device('cpu')
         self.logger = logging.getLogger('test')
@@ -51,9 +60,13 @@ def setUp(self):
             max_epochs=self.epochs,
         )
         self.task_type = STRING_TO_TASK_TYPES[self.dataset_properties['task_type']]
+        self.output_type = STRING_TO_OUTPUT_TYPES[self.dataset_properties['output_type']]

     def _overfit_model(self):
         self.model.train()
+        # initialise the criterion as it is
+        # not being done in __init__
+        self.criterion = self.criterion()
         for epoch in range(self.epochs):
             total_loss = 0
             for x, y in self.loader:
@@ -67,3 +80,29 @@ def _overfit_model(self):
                 # Backward pass
                 loss.backward()
                 self.optimizer.step()
+
+
+class TabularPipeline(TabularClassificationPipeline):
+    def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]],
+                            ) -> List[Tuple[str, autoPyTorchChoice]]:
+        """
+        Defines what steps a pipeline should follow.
+        The step itself has choices given via autoPyTorchChoice.
+
+        Returns:
+            List[Tuple[str, autoPyTorchChoice]]: list of steps sequentially exercised
+                by the pipeline.
+        """
+        steps = []  # type: List[Tuple[str, autoPyTorchChoice]]
+
+        default_dataset_properties = {'target_type': 'tabular_classification'}
+        if dataset_properties is not None:
+            default_dataset_properties.update(dataset_properties)
+
+        steps.extend([
+            ("imputer", SimpleImputer()),
+            ("encoder", EncoderChoice(default_dataset_properties)),
+            ("scaler", ScalerChoice(default_dataset_properties)),
+            ("tabular_transformer", TabularColumnTransformer()),
+        ])
+        return steps
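The switch above from a loss instance to the class object is what allows weights to be injected at instantiation time; a minimal sketch (the weight values are illustrative, not from the diff):

    import torch

    criterion_cls = torch.nn.CrossEntropyLoss                   # kept as a class in setUp
    criterion = criterion_cls(weight=torch.tensor([1.0, 3.0]))  # instantiated later, optionally weighted
    logits = torch.randn(4, 2)
    targets = torch.randint(0, 2, (4,))
    loss = criterion(logits, targets)
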
diff --git a/test/test_pipeline/components/test_feature_preprocessor.py b/test/test_pipeline/components/test_feature_preprocessor.py
new file mode 100644
index 000000000..a812929e9
--- /dev/null
+++ b/test/test_pipeline/components/test_feature_preprocessor.py
@@ -0,0 +1,76 @@
+import numpy as np
+
+import pytest
+
+from sklearn.base import BaseEstimator
+from sklearn.compose import make_column_transformer
+
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing.\
+    NoFeaturePreprocessor import NoFeaturePreprocessor
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing.\
+    base_feature_preprocessor_choice import FeatureProprocessorChoice
+from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline
+
+
+@pytest.fixture(params=['TruncatedSVD', 'PolynomialFeatures', 'PowerTransformer',
+                        'Nystroem', 'KernelPCA', 'RandomKitchenSinks'])
+def preprocessor(request):
+    return request.param
+
+
+@pytest.mark.parametrize("fit_dictionary", ['fit_dictionary_numerical_only',
+                                            'fit_dictionary_num_and_categorical'], indirect=True)
+class TestFeaturePreprocessors:
+
+    def test_feature_preprocessor(self, fit_dictionary, preprocessor):
+        preprocessor = FeatureProprocessorChoice(
+            dataset_properties=fit_dictionary['dataset_properties']
+        ).get_components()[preprocessor]()
+        configuration = preprocessor.\
+            get_hyperparameter_search_space(dataset_properties=fit_dictionary["dataset_properties"]) \
+            .get_default_configuration().get_dictionary()
+        preprocessor = preprocessor.set_params(**configuration)
+        preprocessor.fit(fit_dictionary)
+        X = preprocessor.transform(fit_dictionary)
+        sklearn_preprocessor = X['feature_preprocessor']['numerical']
+
+        # check if the fit dictionary X is modified as expected
+        assert isinstance(X['feature_preprocessor'], dict)
+        if isinstance(preprocessor, NoFeaturePreprocessor):
+            assert sklearn_preprocessor is None, sklearn_preprocessor
+            pytest.skip("Tests not relevant for {}".format(preprocessor.__class__.__name__))
+        assert isinstance(sklearn_preprocessor, BaseEstimator)
+        assert (X['feature_preprocessor']['categorical']) is None
+
+        # make column transformer with returned encoder to fit on data
+        column_transformer = make_column_transformer((sklearn_preprocessor,
+                                                      X['dataset_properties']['numerical_columns']),
+                                                     remainder='passthrough')
+        column_transformer.fit(X['X_train'])
+
+        transformed = column_transformer.transform(X['X_train'])
+        assert isinstance(transformed, np.ndarray)
+
+    def test_pipeline_fit_include(self, fit_dictionary, preprocessor):
+        """
+        This test ensures that a tabular classification
+        pipeline can be fit with all preprocessors
+        in the include
+        """
+
+        fit_dictionary['epochs'] = 1
+
+        pipeline = TabularClassificationPipeline(
+            dataset_properties=fit_dictionary['dataset_properties'],
+            include={'feature_preprocessor': [preprocessor]})
+        cs = pipeline.get_hyperparameter_search_space()
+        config = cs.sample_configuration()
+        pipeline.set_hyperparameters(config)
+        pipeline.fit(fit_dictionary)
+
+        # To make sure we fitted the model, there should be a
+        # run summary object with accuracy
+        run_summary = pipeline.named_steps['trainer'].run_summary
+        assert run_summary is not None
+
+        assert preprocessor == pipeline.named_steps['feature_preprocessor'].choice.__class__.__name__
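TestFeaturePreprocessors above leans on pytest's indirect parametrization: each string in the parametrize list is routed through the fixture of the same name. A self-contained sketch of the mechanism (toy fixture, not from the PR):

    import pytest

    @pytest.fixture
    def fit_dictionary(request):
        # request.param carries the string from the parametrize list
        return {'variant': request.param}

    @pytest.mark.parametrize("fit_dictionary", ['numerical_only'], indirect=True)
    def test_variant(fit_dictionary):
        assert fit_dictionary['variant'] == 'numerical_only'
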
diff --git a/test/test_pipeline/components/test_feature_preprocessor_choice.py b/test/test_pipeline/components/test_feature_preprocessor_choice.py
new file mode 100644
index 000000000..52d55c6df
--- /dev/null
+++ b/test/test_pipeline/components/test_feature_preprocessor_choice.py
@@ -0,0 +1,52 @@
+import copy
+import unittest
+
+from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing.\
+    base_feature_preprocessor_choice import FeatureProprocessorChoice
+
+
+class TestFeaturePreprocessorChoice(unittest.TestCase):
+    def test_get_set_config_space(self):
+        """Make sure that we can setup a valid choice in the feature preprocessor
+        choice"""
+        dataset_properties = {'numerical_columns': list(range(4)), 'categorical_columns': [5]}
+        feature_preprocessor_choice = FeatureProprocessorChoice(dataset_properties)
+        cs = feature_preprocessor_choice.get_hyperparameter_search_space()
+
+        # Make sure that all hyperparameters are part of the search space
+        self.assertListEqual(
+            sorted(cs.get_hyperparameter('__choice__').choices),
+            sorted(list(feature_preprocessor_choice.get_components().keys()))
+        )
+
+        # Make sure we can properly set some random configs
+        # Whereas just one iteration will make sure the algorithm works,
+        # doing five iterations increases the confidence. We will be able to
+        # catch component specific crashes
+        for i in range(5):
+            config = cs.sample_configuration()
+            config_dict = copy.deepcopy(config.get_dictionary())
+            feature_preprocessor_choice.set_hyperparameters(config)
+
+            self.assertEqual(feature_preprocessor_choice.choice.__class__,
+                             feature_preprocessor_choice.get_components()[config_dict['__choice__']])
+
+            # Then check the choice configuration
+            selected_choice = config_dict.pop('__choice__', None)
+            for key, value in config_dict.items():
+                # Remove the selected_choice string from the parameter
+                # so we can query in the object for it
+                key = key.replace(selected_choice + ':', '')
+                self.assertIn(key, vars(feature_preprocessor_choice.choice))
+                self.assertEqual(value, feature_preprocessor_choice.choice.__dict__[key])
+
+    def test_only_categorical(self):
+        dataset_properties = {'numerical_columns': [], 'categorical_columns': list(range(4))}
+
+        chooser = FeatureProprocessorChoice(dataset_properties)
+        configspace = chooser.get_hyperparameter_search_space().sample_configuration().get_dictionary()
+        self.assertEqual(configspace['__choice__'], 'NoFeaturePreprocessor')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/test_pipeline/components/test_setup_networks.py b/test/test_pipeline/components/test_setup_networks.py
index 2e3c07ccc..46debb0c5 100644
--- a/test/test_pipeline/components/test_setup_networks.py
+++ b/test/test_pipeline/components/test_setup_networks.py
@@ -1,3 +1,5 @@
+import flaky
+
 import pytest

 import torch
@@ -15,6 +17,7 @@ def head(request):
     return request.param


+@flaky.flaky(max_runs=3)
 @pytest.mark.parametrize("fit_dictionary", ['fit_dictionary_numerical_only',
                                             'fit_dictionary_categorical_only',
                                             'fit_dictionary_num_and_categorical'], indirect=True)
diff --git a/test/test_pipeline/components/test_tabular_column_transformer.py b/test/test_pipeline/components/test_tabular_column_transformer.py
index 08d891a14..5eae26f69 100644
--- a/test/test_pipeline/components/test_tabular_column_transformer.py
+++ b/test/test_pipeline/components/test_tabular_column_transformer.py
@@ -1,115 +1,52 @@
-import unittest
-from typing import Any, Dict, List, Optional, Tuple
+from test.test_pipeline.components.base import TabularPipeline

 import numpy as np

+import pytest
+
 from scipy.sparse import csr_matrix

 from sklearn.compose import ColumnTransformer

-from autoPyTorch.pipeline.components.base_choice import autoPyTorchChoice
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.TabularColumnTransformer import (
     TabularColumnTransformer
 )
-from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.encoding.base_encoder_choice import (
-    EncoderChoice
-)
-from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.imputation.SimpleImputer import SimpleImputer
-from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler_choice import ScalerChoice
-from autoPyTorch.pipeline.tabular_classification import TabularClassificationPipeline
-
-
-class TabularPipeline(TabularClassificationPipeline):
-    def _get_pipeline_steps(self, dataset_properties: Optional[Dict[str, Any]],
-                            ) -> List[Tuple[str, autoPyTorchChoice]]:
-        """
-        Defines what steps a pipeline should follow.
-        The step itself has choices given via autoPyTorchChoice.
-
-        Returns:
-            List[Tuple[str, autoPyTorchChoice]]: list of steps sequentially exercised
-                by the pipeline.
-        """
-        steps = []  # type: List[Tuple[str, autoPyTorchChoice]]
-
-        default_dataset_properties = {'target_type': 'tabular_classification'}
-        if dataset_properties is not None:
-            default_dataset_properties.update(dataset_properties)
-        steps.extend([
-            ("imputer", SimpleImputer()),
-            ("encoder", EncoderChoice(default_dataset_properties)),
-            ("scaler", ScalerChoice(default_dataset_properties)),
-            ("tabular_transformer", TabularColumnTransformer()),
-        ])
-        return steps

+@pytest.mark.parametrize("fit_dictionary", ['fit_dictionary_numerical_only',
+                                            'fit_dictionary_categorical_only',
+                                            'fit_dictionary_num_and_categorical'], indirect=True)
+class TestTabularTransformer:
+    def test_tabular_preprocess(self, fit_dictionary):

-class TabularTransformerTest(unittest.TestCase):
-
-    def test_tabular_preprocess_only_numerical(self):
-        dataset_properties = dict(numerical_columns=list(range(15)),
-                                  categorical_columns=[],
-                                  categories=[],
-                                  num_features=15,
-                                  num_classes=2,
-                                  issparse=False)
-        X = dict(X_train=np.random.random((10, 15)),
-                 is_small_preprocess=True,
-                 dataset_properties=dataset_properties
-                 )
-
-        pipeline = TabularPipeline(dataset_properties=dataset_properties)
-        pipeline = pipeline.fit(X)
-        X = pipeline.transform(X)
+        pipeline = TabularPipeline(dataset_properties=fit_dictionary['dataset_properties'])
+        pipeline = pipeline.fit(fit_dictionary)
+        X = pipeline.transform(fit_dictionary)
         column_transformer = X['tabular_transformer']

         # check if transformer was added to fit dictionary
-        self.assertIn('tabular_transformer', X.keys())
+        assert 'tabular_transformer' in X.keys()

         # check if transformer is of expected type
         # In this case we expect the tabular transformer not the actual column transformer
         # as the latter is not callable and runs into error in the compose transform
-        self.assertIsInstance(column_transformer, TabularColumnTransformer)
+        assert isinstance(column_transformer, TabularColumnTransformer)

         data = column_transformer.preprocessor.fit_transform(X['X_train'])
-        self.assertIsInstance(data, np.ndarray)
+        assert isinstance(data, np.ndarray)

-    def test_tabular_preprocess_only_categorical(self):
-        dataset_properties = dict(numerical_columns=[],
-                                  categorical_columns=list(range(2)),
-                                  categories=[['male', 'female'], ['germany']],
-                                  num_features=15,
-                                  num_classes=2,
-                                  issparse=False)
-        X = dict(X_train=np.array([['male', 'germany'],
-                                   ['female', 'germany'],
-                                   ['male', 'germany']], dtype=object),
-                 dataset_properties=dataset_properties
-                 )
-        pipeline = TabularPipeline(dataset_properties=dataset_properties)
-        pipeline = pipeline.fit(X)
-        X = pipeline.transform(X)
-        column_transformer = X['tabular_transformer']
-
-        # check if transformer was added to fit dictionary
-        self.assertIn('tabular_transformer', X.keys())
-        # check if transformer is of expected type
-        self.assertIsInstance(column_transformer, TabularColumnTransformer)
-
-        data = column_transformer.preprocessor.fit_transform(X['X_train'])
-        self.assertIsInstance(data, np.ndarray)
-
-    def test_sparse_data(self):
+    def test_sparse_data(self, fit_dictionary):
         X = np.random.binomial(1, 0.1, (100, 2000))
         sparse_X = csr_matrix(X)
         numerical_columns = list(range(2000))
         categorical_columns = []
         train_indices = np.array(range(50))
-        dataset_properties = dict(numerical_columns=numerical_columns, categorical_columns=categorical_columns,
+        dataset_properties = dict(numerical_columns=numerical_columns,
+                                  categorical_columns=categorical_columns,
                                   categories=[],
                                   issparse=True)
         X = {
-            'X_train': sparse_X[train_indices],
+            'X_train': sparse_X,
+            'train_indices': train_indices,
             'dataset_properties': dataset_properties
         }

@@ -120,13 +57,9 @@ def test_sparse_data(self):
         column_transformer = X['tabular_transformer']

         # check if transformer was added to fit dictionary
-        self.assertIn('tabular_transformer', X.keys())
+        assert 'tabular_transformer' in X.keys()

         # check if transformer is of expected type
-        self.assertIsInstance(column_transformer.preprocessor, ColumnTransformer)
+        assert isinstance(column_transformer.preprocessor, ColumnTransformer)

         data = column_transformer.preprocessor.fit_transform(X['X_train'])
-        self.assertIsInstance(data, csr_matrix)
-
-
-if __name__ == '__main__':
-    unittest.main()
+        assert isinstance(data, csr_matrix)
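Note for the sparse test above: the full csr_matrix is now handed over together with train_indices instead of a pre-sliced matrix. For reference, csr matrices support positional row selection, which is what such slicing relies on (standalone sketch, not from the diff):

    import numpy as np
    from scipy.sparse import csr_matrix

    sparse_X = csr_matrix(np.random.binomial(1, 0.1, (100, 20)))
    train_indices = np.arange(50)
    assert sparse_X[train_indices].shape == (50, 20)  # rows selected by position
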
diff --git a/test/test_pipeline/components/test_training.py b/test/test_pipeline/components/test_training.py
index ec745d613..4e8e9b0ca 100644
--- a/test/test_pipeline/components/test_training.py
+++ b/test/test_pipeline/components/test_training.py
@@ -139,7 +139,9 @@ def test_evaluate(self):
             device=self.device,
             metrics_during_training=True,
             scheduler=None,
-            task_type=self.task_type
+            task_type=self.task_type,
+            output_type=self.output_type,
+            labels=self.y
         )

         prev_loss, prev_metrics = trainer.evaluate(self.loader, epoch=1, writer=None)
@@ -173,7 +175,9 @@ def test_epoch_training(self):
             optimizer=self.optimizer,
             device=self.device,
             metrics_during_training=True,
-            task_type=self.task_type
+            task_type=self.task_type,
+            output_type=self.output_type,
+            labels=self.y
         )

         # Train the model
@@ -205,7 +209,9 @@ def test_epoch_training(self):
             optimizer=self.optimizer,
             device=self.device,
             metrics_during_training=True,
-            task_type=self.task_type
+            task_type=self.task_type,
+            output_type=self.output_type,
+            labels=self.y
         )

         # Train the model
@@ -264,7 +270,7 @@ def test_every_trainer_is_valid(self):

     def test_get_set_config_space(self):
         """Make sure that we can setup a valid choice in the trainer choice"""
-        trainer_choice = TrainerChoice(dataset_properties={})
+        trainer_choice = TrainerChoice(dataset_properties={'task_type': 'tabular_classification'})
         cs = trainer_choice.get_hyperparameter_search_space()

         # Make sure that all hyperparameters are part of the search space
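The test_losses.py changes below assume that get_loss_instance now returns the loss class rather than an instance, leaving it to the caller to decide whether to pass weights; a minimal sketch of that calling convention (weights are illustrative):

    import torch

    from autoPyTorch.pipeline.components.training.losses import get_loss_instance

    dataset_properties = {'task_type': 'tabular_classification', 'output_type': 'multiclass'}
    loss_cls = get_loss_instance(dataset_properties)          # the class, e.g. torch.nn.CrossEntropyLoss
    criterion = loss_cls(weight=torch.tensor([0.25, 0.75]))   # instantiate, optionally with class weights
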
diff --git a/test/test_pipeline/test_losses.py b/test/test_pipeline/test_losses.py
index 7cb744a29..ca3438d58 100644
--- a/test/test_pipeline/test_losses.py
+++ b/test/test_pipeline/test_losses.py
@@ -3,7 +3,9 @@
 import torch
 from torch import nn

+from autoPyTorch.constants import STRING_TO_OUTPUT_TYPES
 from autoPyTorch.pipeline.components.training.losses import get_loss_instance
+from autoPyTorch.utils.implementations import get_loss_weight_strategy


 @pytest.mark.parametrize('output_type', ['multiclass',
@@ -13,7 +15,7 @@
 def test_get_no_name(output_type):
     dataset_properties = {'task_type': 'tabular_classification', 'output_type': output_type}
     loss = get_loss_instance(dataset_properties)
-    assert isinstance(loss, nn.Module)
+    assert isinstance(loss(), nn.Module)


 @pytest.mark.parametrize('output_type_name', [('multiclass', 'CrossEntropyLoss'),
@@ -21,7 +23,7 @@
 def test_get_name(output_type_name):
     output_type, name = output_type_name
     dataset_properties = {'task_type': 'tabular_classification', 'output_type': output_type}
-    loss = get_loss_instance(dataset_properties, name)
+    loss = get_loss_instance(dataset_properties, name)()
     assert isinstance(loss, nn.Module)
     assert str(loss) == f"{name}()"

@@ -33,7 +35,8 @@ def test_get_name_error():
         get_loss_instance(dataset_properties, name)


-def test_losses():
+@pytest.mark.parametrize('weighted', [True, False])
+def test_losses(weighted):
     list_properties = [{'task_type': 'tabular_classification', 'output_type': 'multiclass'},
                        {'task_type': 'tabular_classification', 'output_type': 'binary'},
                        {'task_type': 'tabular_regression', 'output_type': 'continuous'}]
@@ -41,7 +44,17 @@
     list_predictions = [pred_cross_entropy, torch.empty(4).random_(2), torch.randn(4)]
     list_names = [None, 'BCEWithLogitsLoss', None]
     list_targets = [torch.empty(4, dtype=torch.long).random_(4), torch.empty(4).random_(2), torch.randn(4)]
-    for dataset_properties, pred, target, name in zip(list_properties, list_predictions, list_targets, list_names):
+    labels = [torch.empty(20, dtype=torch.long).random_(4), torch.empty(12, dtype=torch.long).random_(2), None]
+    for dataset_properties, pred, target, name, label in zip(list_properties, list_predictions,
+                                                             list_targets, list_names, labels):
         loss = get_loss_instance(dataset_properties=dataset_properties, name=name)
+        weights = None
+        if bool(weighted) and 'classification' in dataset_properties['task_type']:
+            strategy = get_loss_weight_strategy(output_type=STRING_TO_OUTPUT_TYPES[dataset_properties['output_type']])
+            weights = strategy(y=label)
+            weights = torch.from_numpy(weights)
+            weights = weights.type(torch.FloatTensor)
+            kwargs = {'pos_weight': weights} if 'binary' in dataset_properties['output_type'] else {'weight': weights}
+        loss = loss() if weights is None else loss(**kwargs)
         score = loss(pred, target)
         assert isinstance(score, torch.Tensor)
diff --git a/test/test_pipeline/test_tabular_classification.py b/test/test_pipeline/test_tabular_classification.py
index d5cc9acee..e7ae68012 100644
--- a/test/test_pipeline/test_tabular_classification.py
+++ b/test/test_pipeline/test_tabular_classification.py
@@ -146,11 +146,9 @@ def test_default_configuration(self, fit_dictionary, is_small_preprocess):
         """Makes sure that when no config is set, we can trust the
        default configuration from the space"""

-        fit_dictionary['is_small_preprocess'] = is_small_preprocess
-
+        fit_dictionary['dataset_properties']['is_small_preprocess'] = is_small_preprocess
         pipeline = TabularClassificationPipeline(
             dataset_properties=fit_dictionary['dataset_properties'])
-
         pipeline.fit(fit_dictionary)

     def test_remove_key_check_requirements(self, fit_dictionary):