Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from typing import Any, Dict, Optional, Union

import numpy as np

from sklearn.preprocessing import PowerTransformer as SklearnPowerTransformer

from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler import BaseScaler


class PowerTransformer(BaseScaler):
    """
    Scaler component applying a power transform to the numerical columns.

    Uses sklearn's `yeo-johnson` method to map the data as close to a
    Gaussian distribution as possible (reducing variance and skewness);
    the output is additionally standardised to zero mean and unit variance.
    """

    def __init__(self,
                 random_state: Optional[Union[np.random.RandomState, int]] = None):
        super().__init__()
        # Stored for interface consistency with the other scaler components;
        # the sklearn PowerTransformer itself is deterministic.
        self.random_state = random_state

    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler:
        """Validate the fit dictionary and register the sklearn transformer.

        NOTE(review): the transformer is only instantiated here — the actual
        fitting happens later in the column transformer that consumes
        ``self.preprocessor``; presumably that is the component contract.
        """
        self.check_requirements(X, y)

        # copy=False: transform in place rather than duplicating the data.
        self.preprocessor['numerical'] = SklearnPowerTransformer(copy=False)
        return self

    @staticmethod
    def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
                       ) -> Dict[str, Union[str, bool]]:
        """Describe this component; sparse input is not supported."""
        return {
            'shortname': 'PowerTransformer',
            'name': 'PowerTransformer',
            'handles_sparse': False
        }
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
from typing import Any, Dict, Optional, Union

from ConfigSpace.configuration_space import ConfigurationSpace
from ConfigSpace.hyperparameters import (
CategoricalHyperparameter,
UniformIntegerHyperparameter
)

import numpy as np

from sklearn.preprocessing import QuantileTransformer as SklearnQuantileTransformer

from autoPyTorch.datasets.base_dataset import BaseDatasetPropertiesType
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.base_scaler import BaseScaler
from autoPyTorch.utils.common import HyperparameterSearchSpace, add_hyperparameter


class QuantileTransformer(BaseScaler):
    """
    Transforms the features to follow a uniform or a normal distribution
    using quantiles information. Rank-based, hence robust to outliers.
    """
    def __init__(
        self,
        n_quantiles: int = 1000,
        output_distribution: str = "normal",
        random_state: Optional[Union[np.random.RandomState, int]] = None
    ):
        super().__init__()
        self.random_state = random_state
        self.n_quantiles = n_quantiles
        self.output_distribution = output_distribution

    def fit(self, X: Dict[str, Any], y: Any = None) -> BaseScaler:
        """Validate the fit dictionary and register the sklearn transformer.

        The transformer is only instantiated here; fitting happens later in
        the column transformer that consumes ``self.preprocessor``.
        """
        self.check_requirements(X, y)

        self.preprocessor['numerical'] = SklearnQuantileTransformer(
            n_quantiles=self.n_quantiles,
            output_distribution=self.output_distribution,
            # Bug fix: forward the component's random state. sklearn's
            # QuantileTransformer uses it to subsample rows when estimating
            # the quantiles; previously it was stored but never passed on,
            # making large-data fits non-reproducible.
            random_state=self.random_state,
            copy=False
        )
        return self

    @staticmethod
    def get_hyperparameter_search_space(
        dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None,
        n_quantiles: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="n_quantiles",
                                                                           value_range=(10, 2000),
                                                                           default_value=1000,
                                                                           ),
        output_distribution: HyperparameterSearchSpace = HyperparameterSearchSpace(hyperparameter="output_distribution",
                                                                                   value_range=("uniform", "normal"),
                                                                                   default_value="normal",
                                                                                   )
    ) -> ConfigurationSpace:
        """Build the configuration space for this component."""
        cs = ConfigurationSpace()

        # TODO parametrize like the Random Forest as n_quantiles = n_features^param
        add_hyperparameter(cs, n_quantiles, UniformIntegerHyperparameter)
        add_hyperparameter(cs, output_distribution, CategoricalHyperparameter)

        return cs

    @staticmethod
    def get_properties(dataset_properties: Optional[Dict[str, BaseDatasetPropertiesType]] = None
                       ) -> Dict[str, Union[str, bool]]:
        """Describe this component; sparse input is not supported."""
        return {
            'shortname': 'QuantileTransformer',
            'name': 'QuantileTransformer',
            'handles_sparse': False
        }
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,14 @@ def get_hyperparameter_search_space(self,
raise ValueError("no scalers found, please add a scaler")

if default is None:
defaults = ['StandardScaler', 'Normalizer', 'MinMaxScaler', 'NoScaler']
defaults = [
'StandardScaler',
'Normalizer',
'MinMaxScaler',
'PowerTransformer',
'QuantileTransformer',
'NoScaler'
]
for default_ in defaults:
if default_ in available_scalers:
default = default_
Expand Down
124 changes: 124 additions & 0 deletions test/test_pipeline/components/preprocessing/test_scalers.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.MinMaxScaler import MinMaxScaler
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.NoScaler import NoScaler
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.Normalizer import Normalizer
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.PowerTransformer import \
PowerTransformer
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.QuantileTransformer import \
QuantileTransformer
from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.scaling.StandardScaler import StandardScaler


Expand Down Expand Up @@ -239,3 +243,123 @@ def test_none_scaler(self):
self.assertIsInstance(X['scaler'], dict)
self.assertIsNone(X['scaler']['categorical'])
self.assertIsNone(X['scaler']['numerical'])


def test_power_transformer():
    """The PowerTransformer component produces the expected yeo-johnson output."""
    data = np.array([[1, 2, 3],
                     [7, 8, 9],
                     [4, 5, 6],
                     [11, 12, 13],
                     [17, 18, 19],
                     [14, 15, 16]])
    train_indices = np.array([0, 2, 5])
    test_indices = np.array([1, 4, 3])
    dataset_properties = {'categorical_columns': [],
                          'numerical_columns': [0, 1, 2],
                          'issparse': False}
    X = {
        'X_train': data[train_indices],
        'dataset_properties': dataset_properties
    }

    component = PowerTransformer().fit(X)
    X = component.transform(X)
    scaler = X['scaler']['numerical']

    # The fit dictionary must expose a fitted numerical scaler only.
    assert isinstance(X['scaler'], dict)
    assert isinstance(scaler, BaseEstimator)
    assert X['scaler']['categorical'] is None

    # Fit the returned scaler on the train split and transform the test split.
    column_transformer = make_column_transformer(
        (scaler, X['dataset_properties']['numerical_columns']),
        remainder='passthrough')
    column_transformer = column_transformer.fit(X['X_train'])
    transformed = column_transformer.transform(data[test_indices])

    expected = np.array([[0.531648, 0.522782, 0.515394],
                         [1.435794, 1.451064, 1.461685],
                         [0.993609, 1.001055, 1.005734]])
    assert_allclose(transformed, expected, rtol=1e-06)


class TestQuantileTransformer():
    """Tests for the QuantileTransformer scaling component."""

    @staticmethod
    def _transform_test_split(output_distribution):
        """Fit the component on the train rows and return the transformed test rows.

        Also asserts that the fit dictionary is populated as expected.
        """
        data = np.array([[1, 2, 3],
                         [7, 8, 9],
                         [4, 5, 6],
                         [11, 12, 13],
                         [17, 18, 19],
                         [14, 15, 16]])
        train_indices = np.array([0, 2, 5])
        test_indices = np.array([1, 4, 3])
        dataset_properties = {'categorical_columns': [],
                              'numerical_columns': [0, 1, 2],
                              'issparse': False}
        X = {
            'X_train': data[train_indices],
            'dataset_properties': dataset_properties
        }

        component = QuantileTransformer(output_distribution=output_distribution)
        X = component.fit(X).transform(X)
        scaler = X['scaler']['numerical']

        # check if the fit dictionary X is modified as expected
        assert isinstance(X['scaler'], dict)
        assert isinstance(scaler, BaseEstimator)
        assert X['scaler']['categorical'] is None

        # make column transformer with returned encoder to fit on data
        column_transformer = make_column_transformer(
            (scaler, X['dataset_properties']['numerical_columns']),
            remainder='passthrough')
        column_transformer = column_transformer.fit(X['X_train'])
        return column_transformer.transform(data[test_indices])

    def test_quantile_transformer_uniform(self):
        transformed = self._transform_test_split('uniform')
        assert_allclose(transformed,
                        np.array([[0.65, 0.65, 0.65],
                                  [1, 1, 1],
                                  [0.85, 0.85, 0.85]]),
                        rtol=1e-06)

    def test_quantile_transformer_normal(self):
        transformed = self._transform_test_split('normal')
        assert_allclose(transformed,
                        np.array([[0.38532, 0.38532, 0.38532],
                                  [5.199338, 5.199338, 5.199338],
                                  [1.036433, 1.036433, 1.036433]]),
                        rtol=1e-05)