-
Notifications
You must be signed in to change notification settings - Fork 302
[ADD] feature preprocessors from autosklearn #378
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
nabenabe0928
merged 15 commits into
automl:development
from
ravinkohli:add_feature_preprocessors
Mar 3, 2022
Merged
Changes from all commits
Commits
Show all changes
15 commits
Select commit
Hold shift + click to select a range
6799fab
in progress
ravinkohli ab8333f
add remaining preprocessors
ravinkohli 46c4551
fix flake and mypy after rebase
ravinkohli a6ba41f
Fix tests and add documentation
ravinkohli f356163
fix tests bug
ravinkohli 370e4b3
fix bug in tests
ravinkohli 45bc440
fix bug where search space updates were not honoured
ravinkohli 04664f5
handle check for score func in feature preprocessors
ravinkohli 70360d5
address comments from shuhei
ravinkohli 0fb2322
apply suggestions from code review
ravinkohli 0844751
add documentation for feature preprocessors with percent to int value…
ravinkohli cab7276
fix tests
ravinkohli 9b0dc20
fix tests
ravinkohli 641ae81
address comments from shuhei
ravinkohli 9518ba0
fix tests which fail due to scaler
ravinkohli File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
172 changes: 172 additions & 0 deletions
172
...ssing/tabular_preprocessing/feature_preprocessing/ExtraTreesPreprocessorClassification.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,172 @@ | ||
| from typing import Any, Dict, Optional, Union | ||
|
|
||
| from ConfigSpace.configuration_space import ConfigurationSpace | ||
| from ConfigSpace.hyperparameters import ( | ||
| CategoricalHyperparameter, | ||
| UniformFloatHyperparameter, | ||
| UniformIntegerHyperparameter, | ||
| ) | ||
|
|
||
| import numpy as np | ||
|
|
||
| from sklearn.base import BaseEstimator | ||
| from sklearn.ensemble import ExtraTreesClassifier | ||
| from sklearn.feature_selection import SelectFromModel | ||
|
|
||
| from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType | ||
| from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \ | ||
| base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent | ||
| from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \ | ||
| utils import NoneType_ | ||
| from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter, check_none | ||
|
|
||
|
|
||
| CRITERION_CHOICES = ("gini", "entropy") | ||
|
|
||
|
|
||
class ExtraTreesPreprocessorClassification(autoPyTorchFeaturePreprocessingComponent):
    """
    Select features based on importance weights calculated using extra trees.

    An ``ExtraTreesClassifier`` is fitted on the numerical features and wrapped
    in ``SelectFromModel(threshold='mean')``, so only features whose importance
    is at least the mean importance are kept.

    Args:
        bootstrap (bool): whether bootstrap samples are used when building trees.
        n_estimators (int): number of trees in the forest.
        criterion (str): split-quality function; must be one of CRITERION_CHOICES.
        max_features (float): fraction of features considered at each split
            (forwarded unchanged to the sklearn estimator).
        max_depth (Union[int, NoneType_]): maximum tree depth; the strings
            'none'/'None' (or None) mean unbounded.
        min_samples_split (int): minimum samples required to split a node.
        min_samples_leaf (int): minimum samples required at a leaf node.
        min_weight_fraction_leaf (float): minimum weighted fraction of samples
            required at a leaf.
        max_leaf_nodes (Union[int, NoneType_]): maximum number of leaf nodes;
            'none'/'None'/None means unbounded.
        min_impurity_decrease (float): minimum impurity decrease required for a split.
        oob_score (bool): whether to use out-of-bag samples to estimate accuracy.
        verbose (int): verbosity of the underlying estimator.
        random_state (Optional[np.random.RandomState]): seed for reproducibility.

    Raises:
        ValueError: if ``criterion`` is not one of CRITERION_CHOICES.
    """
    def __init__(self, bootstrap: bool = True, n_estimators: int = 10,
                 criterion: str = "gini", max_features: float = 0.5,
                 max_depth: Union[int, NoneType_] = 5, min_samples_split: int = 2,
                 min_samples_leaf: int = 1, min_weight_fraction_leaf: float = 0,
                 max_leaf_nodes: Union[int, NoneType_] = "none",
                 min_impurity_decrease: float = 0, oob_score: bool = False,
                 verbose: int = 0,
                 random_state: Optional[np.random.RandomState] = None):
        self.bootstrap = bootstrap
        self.n_estimators = n_estimators
        # Validate eagerly so a bad configuration fails at construction time,
        # not deep inside sklearn during fit.
        if criterion not in CRITERION_CHOICES:
            raise ValueError(f"`criterion` of {self.__class__.__name__} "
                             f"must be in {CRITERION_CHOICES}, but got: {criterion}")
        self.criterion = criterion
        self.max_features = max_features
        self.min_impurity_decrease = min_impurity_decrease
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.max_leaf_nodes = max_leaf_nodes
        self.oob_score = oob_score
        self.verbose = verbose

        super().__init__(random_state=random_state)

    def get_components_kwargs(self) -> Dict[str, Any]:
        """
        returns keyword arguments required by the feature preprocessor

        Returns:
            Dict[str, Any]: kwargs
        """
        return dict(
            bootstrap=self.bootstrap,
            n_estimators=self.n_estimators,
            criterion=self.criterion,
            max_features=self.max_features,
            min_impurity_decrease=self.min_impurity_decrease,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            min_weight_fraction_leaf=self.min_weight_fraction_leaf,
            max_leaf_nodes=self.max_leaf_nodes,
            oob_score=self.oob_score,
            verbose=self.verbose,
            random_state=self.random_state,
        )

    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:
        """
        Builds the importance-based feature selector and stores it in
        ``self.preprocessor['numerical']``.

        Normalises the 'none'-style string sentinels for ``max_leaf_nodes`` and
        ``max_depth`` to ``None`` before handing them to sklearn.

        Args:
            X (Dict[str, Any]): fit dictionary (pipeline state).
            y (Any): unused; present for sklearn API compatibility.

        Returns:
            BaseEstimator: self

        Raises:
            ValueError: if ``max_leaf_nodes`` or ``max_depth`` is neither a
                'none' sentinel nor an integer.
        """
        if check_none(self.max_leaf_nodes):
            self.max_leaf_nodes = None
        elif isinstance(self.max_leaf_nodes, int):
            self.max_leaf_nodes = int(self.max_leaf_nodes)
        else:
            raise ValueError(f"Expected `max_leaf_nodes` to be either "
                             f"in ('None', 'none', None) or an integer, got {self.max_leaf_nodes}")

        if check_none(self.max_depth):
            self.max_depth = None
        elif isinstance(self.max_depth, int):
            self.max_depth = int(self.max_depth)
        else:
            raise ValueError(f"Expected `max_depth` to be either "
                             f"in ('None', 'none', None) or an integer, got {self.max_depth}")

        # TODO: add class_weights
        estimator = ExtraTreesClassifier(**self.get_components_kwargs())

        # prefit=False: SelectFromModel fits the estimator itself when the
        # pipeline calls fit/transform on the preprocessor.
        self.preprocessor['numerical'] = SelectFromModel(estimator=estimator,
                                                         threshold='mean',
                                                         prefit=False)
        return self

    @staticmethod
    def get_hyperparameter_search_space(
        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
        bootstrap: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='bootstrap',
                                                                         value_range=(True, False),
                                                                         default_value=True,
                                                                         ),
        n_estimators: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='n_estimators',
                                                                            value_range=(10, 100),
                                                                            default_value=10,
                                                                            ),
        max_depth: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='max_depth',
                                                                         value_range=("none",),
                                                                         default_value="none",
                                                                         ),
        max_features: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='max_features',
                                                                            value_range=(0, 1),
                                                                            default_value=0.5,
                                                                            ),
        min_impurity_decrease: HyperparameterSearchSpace = HyperparameterSearchSpace(
            hyperparameter='min_impurity_decrease',
            value_range=(0,),
            default_value=0),
        criterion: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='criterion',
                                                                         value_range=CRITERION_CHOICES,
                                                                         default_value="gini",
                                                                         ),
        min_samples_split: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='min_samples_split',
                                                                                 value_range=(2, 20),
                                                                                 default_value=2,
                                                                                 ),
        min_samples_leaf: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='min_samples_leaf',
                                                                                value_range=(1, 20),
                                                                                default_value=1,
                                                                                ),
        min_weight_fraction_leaf: HyperparameterSearchSpace = HyperparameterSearchSpace(
            hyperparameter='min_weight_fraction_leaf',
            value_range=(0,),
            default_value=0),
        max_leaf_nodes: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='max_leaf_nodes',
                                                                              value_range=("none",),
                                                                              default_value="none",
                                                                              ),
    ) -> ConfigurationSpace:
        """
        Returns the ConfigurationSpace for this component, built from the given
        (possibly user-updated) hyperparameter search-space definitions.
        """
        cs = ConfigurationSpace()
        add_hyperparameter(cs, bootstrap, CategoricalHyperparameter)
        add_hyperparameter(cs, n_estimators, UniformIntegerHyperparameter)
        add_hyperparameter(cs, max_features, UniformFloatHyperparameter)
        add_hyperparameter(cs, min_impurity_decrease, UniformFloatHyperparameter)
        add_hyperparameter(cs, criterion, CategoricalHyperparameter)
        # NOTE(review): max_depth/max_leaf_nodes default to the constant "none"
        # range but are added as integer hyperparameters so user search-space
        # updates with integer ranges are honoured.
        add_hyperparameter(cs, max_depth, UniformIntegerHyperparameter)
        add_hyperparameter(cs, min_samples_split, UniformIntegerHyperparameter)
        add_hyperparameter(cs, min_samples_leaf, UniformIntegerHyperparameter)
        add_hyperparameter(cs, min_weight_fraction_leaf, UniformFloatHyperparameter)
        add_hyperparameter(cs, max_leaf_nodes, UniformIntegerHyperparameter)

        return cs

    @staticmethod
    def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]:
        """Static component properties used by the pipeline for component selection."""
        return {'shortname': 'ETC',
                'name': 'Extra Trees Classifier Preprocessing',
                'handles_sparse': True,
                'handles_regression': False,
                'handles_classification': True
                }
175 changes: 175 additions & 0 deletions
175
...rocessing/tabular_preprocessing/feature_preprocessing/ExtraTreesPreprocessorRegression.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,175 @@ | ||
| from typing import Any, Dict, List, Optional, Union | ||
|
|
||
| from ConfigSpace.configuration_space import ConfigurationSpace | ||
| from ConfigSpace.hyperparameters import ( | ||
| CategoricalHyperparameter, | ||
| UniformFloatHyperparameter, | ||
| UniformIntegerHyperparameter, | ||
| ) | ||
|
|
||
| import numpy as np | ||
|
|
||
| from sklearn.base import BaseEstimator | ||
| from sklearn.ensemble import ExtraTreesRegressor | ||
| from sklearn.feature_selection import SelectFromModel | ||
|
|
||
| from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType | ||
| from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \ | ||
| base_feature_preprocessor import autoPyTorchFeaturePreprocessingComponent | ||
| from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.feature_preprocessing. \ | ||
| utils import NoneType_ | ||
| from autoPyTorch.utils.common import FitRequirement, HyperparameterSearchSpace, add_hyperparameter, check_none | ||
|
|
||
|
|
||
| CRITERION_CHOICES = ('mse', 'friedman_mse', 'mae') | ||
|
|
||
|
|
||
class ExtraTreesPreprocessorRegression(autoPyTorchFeaturePreprocessingComponent):
    """
    Selects features based on importance weights using extra trees.

    An ``ExtraTreesRegressor`` is fitted on the numerical features and wrapped
    in ``SelectFromModel(threshold='mean')``, keeping only features whose
    importance is at least the mean importance.

    Args:
        bootstrap (bool): whether bootstrap samples are used when building trees.
        n_estimators (int): number of trees in the forest.
        criterion (str): split-quality function; must be one of CRITERION_CHOICES.
        max_features (float): fraction used to derive the integer number of
            features considered at each split (see ``fit``).
        max_depth (Union[int, NoneType_]): maximum tree depth; the strings
            'none'/'None' (or None) mean unbounded.
        min_samples_split (int): minimum samples required to split a node.
        min_samples_leaf (int): minimum samples required at a leaf node.
        min_weight_fraction_leaf (float): minimum weighted fraction of samples
            required at a leaf.
        max_leaf_nodes (Union[int, NoneType_]): maximum number of leaf nodes;
            'none'/'None'/None means unbounded.
        oob_score (bool): whether to use out-of-bag samples to estimate the score.
        verbose (int): verbosity of the underlying estimator.
        random_state (Optional[np.random.RandomState]): seed for reproducibility.

    Raises:
        ValueError: if ``criterion`` is not one of CRITERION_CHOICES.
    """
    def __init__(self, bootstrap: bool = True, n_estimators: int = 10,
                 criterion: str = "mse", max_features: float = 1,
                 max_depth: Union[int, NoneType_] = 5, min_samples_split: int = 2,
                 min_samples_leaf: int = 1, min_weight_fraction_leaf: float = 0,
                 max_leaf_nodes: Union[int, NoneType_] = "none",
                 oob_score: bool = False, verbose: int = 0,
                 random_state: Optional[np.random.RandomState] = None):
        self.bootstrap = bootstrap
        self.n_estimators = n_estimators
        # Validate eagerly so a bad configuration fails at construction time,
        # not deep inside sklearn during fit.
        if criterion not in CRITERION_CHOICES:
            raise ValueError(f"`criterion` of {self.__class__.__name__} "
                             f"must be in {CRITERION_CHOICES}, but got: {criterion}")
        self.criterion = criterion
        self.max_features = max_features
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_fraction_leaf = min_weight_fraction_leaf
        self.max_leaf_nodes = max_leaf_nodes
        self.oob_score = oob_score
        self.verbose = verbose

        super().__init__(random_state=random_state)

        # fit() reads X['dataset_properties']['numerical_columns'], so declare
        # it as a fit requirement for check_requirements().
        self.add_fit_requirements([
            FitRequirement('numerical_columns', (List,), user_defined=True, dataset_property=True)])

    def get_components_kwargs(self) -> Dict[str, Any]:
        """
        returns keyword arguments required by the feature preprocessor

        Returns:
            Dict[str, Any]: kwargs
        """
        return dict(
            bootstrap=self.bootstrap,
            n_estimators=self.n_estimators,
            criterion=self.criterion,
            max_features=self.max_features,
            max_depth=self.max_depth,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            min_weight_fraction_leaf=self.min_weight_fraction_leaf,
            max_leaf_nodes=self.max_leaf_nodes,
            oob_score=self.oob_score,
            verbose=self.verbose,
            random_state=self.random_state,
        )

    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:
        """
        Builds the importance-based feature selector and stores it in
        ``self.preprocessor['numerical']``.

        Normalises the 'none'-style string sentinels for ``max_leaf_nodes`` and
        ``max_depth`` to ``None``, and converts the ``max_features`` fraction
        into an integer feature count (at least 1, at most half of the
        numerical features) that is passed to the estimator.

        Args:
            X (Dict[str, Any]): fit dictionary; must provide
                ``X['dataset_properties']['numerical_columns']``.
            y (Any): unused; present for sklearn API compatibility.

        Returns:
            BaseEstimator: self

        Raises:
            ValueError: if ``max_leaf_nodes`` or ``max_depth`` is neither a
                'none' sentinel nor an integer.
        """
        self.check_requirements(X, y)

        if check_none(self.max_leaf_nodes):
            self.max_leaf_nodes = None
        elif isinstance(self.max_leaf_nodes, int):
            self.max_leaf_nodes = int(self.max_leaf_nodes)
        else:
            raise ValueError(f"Expected `max_leaf_nodes` to be either "
                             f"in ('None', 'none', None) or an integer, got {self.max_leaf_nodes}")

        if check_none(self.max_depth):
            self.max_depth = None
        elif isinstance(self.max_depth, int):
            self.max_depth = int(self.max_depth)
        else:
            raise ValueError(f"Expected `max_depth` to be either "
                             f"in ('None', 'none', None) or an integer, got {self.max_depth}")

        num_features = len(X['dataset_properties']['numerical_columns'])
        max_features = int(
            float(self.max_features) * (np.log(num_features) + 1))
        # Use at most half of the features
        max_features = max(1, min(int(num_features / 2), max_features))

        # BUG FIX: the computed integer `max_features` was previously discarded
        # and the raw fraction `self.max_features` reached the estimator via
        # get_components_kwargs(); override it with the derived count.
        estimator_kwargs = self.get_components_kwargs()
        estimator_kwargs['max_features'] = max_features

        # TODO: add class_weights
        estimator = ExtraTreesRegressor(**estimator_kwargs)

        # prefit=False: SelectFromModel fits the estimator itself when the
        # pipeline calls fit/transform on the preprocessor.
        self.preprocessor['numerical'] = SelectFromModel(estimator=estimator,
                                                         threshold='mean',
                                                         prefit=False)
        return self

    @staticmethod
    def get_hyperparameter_search_space(
        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
        bootstrap: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='bootstrap',
                                                                         value_range=(True, False),
                                                                         default_value=True,
                                                                         ),
        n_estimators: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='n_estimators',
                                                                            value_range=(100,),
                                                                            default_value=100,
                                                                            ),
        max_depth: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='max_depth',
                                                                         value_range=("none",),
                                                                         default_value="none",
                                                                         ),
        max_features: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='max_features',
                                                                            value_range=(0.1, 1),
                                                                            default_value=1,
                                                                            ),
        criterion: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='criterion',
                                                                         value_range=CRITERION_CHOICES,
                                                                         default_value="mse",
                                                                         ),
        min_samples_split: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='min_samples_split',
                                                                                 value_range=(2, 20),
                                                                                 default_value=2,
                                                                                 ),
        min_samples_leaf: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='min_samples_leaf',
                                                                                value_range=(1, 20),
                                                                                default_value=1,
                                                                                ),
        min_weight_fraction_leaf: HyperparameterSearchSpace = HyperparameterSearchSpace(
            hyperparameter='min_weight_fraction_leaf',
            value_range=(0,),
            default_value=0),
        max_leaf_nodes: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter='max_leaf_nodes',
                                                                              value_range=("none",),
                                                                              default_value="none",
                                                                              ),
    ) -> ConfigurationSpace:
        """
        Returns the ConfigurationSpace for this component, built from the given
        (possibly user-updated) hyperparameter search-space definitions.
        """
        cs = ConfigurationSpace()
        add_hyperparameter(cs, bootstrap, CategoricalHyperparameter)
        add_hyperparameter(cs, n_estimators, UniformIntegerHyperparameter)
        add_hyperparameter(cs, max_features, UniformFloatHyperparameter)
        add_hyperparameter(cs, criterion, CategoricalHyperparameter)
        # NOTE(review): max_depth/max_leaf_nodes default to the constant "none"
        # range but are added as integer hyperparameters so user search-space
        # updates with integer ranges are honoured.
        add_hyperparameter(cs, max_depth, UniformIntegerHyperparameter)
        add_hyperparameter(cs, min_samples_split, UniformIntegerHyperparameter)
        add_hyperparameter(cs, min_samples_leaf, UniformIntegerHyperparameter)
        add_hyperparameter(cs, min_weight_fraction_leaf, UniformFloatHyperparameter)
        add_hyperparameter(cs, max_leaf_nodes, UniformIntegerHyperparameter)

        return cs

    @staticmethod
    def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None) -> Dict[str, Any]:
        """Static component properties used by the pipeline for component selection."""
        return {'shortname': 'ETR',
                'name': 'Extra Trees Regressor Preprocessing',
                'handles_sparse': True,
                'handles_regression': True,
                'handles_classification': False
                }
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.