@@ -23,15 +23,14 @@ class KernelPCA(autoPyTorchFeaturePreprocessingComponent):
     def __init__(self, n_components: int = 10,
                  kernel: str = 'rbf', degree: int = 3,
                  gamma: float = 0.01, coef0: float = 0.0,
-                 random_state: Optional[Union[int, np.random.RandomState]] = None
+                 random_state: Optional[np.random.RandomState] = None
                  ) -> None:
         self.n_components = n_components
         self.kernel = kernel
         self.degree = degree
         self.gamma = gamma
         self.coef0 = coef0
-        self.random_state = random_state
-        super().__init__()
+        super().__init__(random_state=random_state)

         self.add_fit_requirements([
             FitRequirement('issparse', (bool,), user_defined=True, dataset_property=True)])
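
The five wrappers below receive the same two-line treatment: the annotation stops advertising a bare int seed, and ownership of self.random_state moves into the base class. The narrowing only affects the autoPyTorch-side signature; the wrapped scikit-learn estimators still accept an int, a RandomState instance, or None. A standalone sketch against scikit-learn alone (toy data and hyperparameters chosen for illustration):

    import numpy as np
    from sklearn.decomposition import KernelPCA as SkKernelPCA

    X = np.random.RandomState(0).rand(20, 4)
    # scikit-learn takes a RandomState instance directly, so the wrapper can
    # forward whatever its base class stored in self.random_state.
    transformer = SkKernelPCA(n_components=2, kernel='rbf', gamma=0.01,
                              random_state=np.random.RandomState(3))
    print(transformer.fit_transform(X).shape)  # (20, 2)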

@@ -23,15 +23,14 @@ class Nystroem(autoPyTorchFeaturePreprocessingComponent):
     def __init__(self, n_components: int = 10,
                  kernel: str = 'rbf', degree: int = 3,
                  gamma: float = 0.01, coef0: float = 0.0,
-                 random_state: Optional[Union[int, np.random.RandomState]] = None
+                 random_state: Optional[np.random.RandomState] = None
                  ) -> None:
         self.n_components = n_components
         self.kernel = kernel
         self.degree = degree
         self.gamma = gamma
         self.coef0 = coef0
-        self.random_state = random_state
-        super().__init__()
+        super().__init__(random_state=random_state)

     def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:

@@ -19,13 +19,12 @@
 class PolynomialFeatures(autoPyTorchFeaturePreprocessingComponent):
     def __init__(self, degree: int = 2, interaction_only: bool = False,
                  include_bias: bool = False,
-                 random_state: Optional[Union[int, np.random.RandomState]] = None):
+                 random_state: Optional[np.random.RandomState] = None):
         self.degree = degree
         self.interaction_only = interaction_only
         self.include_bias = include_bias

-        self.random_state = random_state
-        super().__init__()
+        super().__init__(random_state=random_state)

     def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:
         self.preprocessor['numerical'] = sklearn.preprocessing.PolynomialFeatures(
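
scikit-learn's PolynomialFeatures is deterministic, so the random_state parameter on this wrapper seems to exist purely to keep the component signatures uniform. For reference, a quick illustrative sketch of what the wrapped estimator produces with the defaults above (degree=2, no bias column):

    import numpy as np
    from sklearn.preprocessing import PolynomialFeatures as SkPolynomialFeatures

    X = np.arange(6, dtype=float).reshape(3, 2)
    # With degree=2 and include_bias=False the output columns are
    # x0, x1, x0^2, x0*x1, x1^2; e.g. row [2, 3] -> [2, 3, 4, 6, 9].
    print(SkPolynomialFeatures(degree=2, include_bias=False).fit_transform(X))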

@@ -17,11 +17,10 @@

 class PowerTransformer(autoPyTorchFeaturePreprocessingComponent):
     def __init__(self, standardize: bool = True,
-                 random_state: Optional[Union[int, np.random.RandomState]] = None):
+                 random_state: Optional[np.random.RandomState] = None):
         self.standardize = standardize

-        self.random_state = random_state
-        super().__init__()
+        super().__init__(random_state=random_state)

     def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:
         self.preprocessor['numerical'] = sklearn.preprocessing.PowerTransformer(method="yeo-johnson",

@@ -20,12 +20,11 @@
 class RandomKitchenSinks(autoPyTorchFeaturePreprocessingComponent):
     def __init__(self, n_components: int = 100,
                  gamma: float = 1.0,
-                 random_state: Optional[Union[int, np.random.RandomState]] = None
+                 random_state: Optional[np.random.RandomState] = None
                  ) -> None:
         self.n_components = n_components
         self.gamma = gamma
-        self.random_state = random_state
-        super().__init__()
+        super().__init__(random_state=random_state)

     def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:

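RandomKitchenSinks is a component where the random state genuinely matters: the estimator it wraps (presumably scikit-learn's RBFSampler, i.e. random Fourier features for the RBF kernel) draws its projection weights from it. A minimal sketch with assumed toy data:

    import numpy as np
    from sklearn.kernel_approximation import RBFSampler

    X = np.random.RandomState(0).rand(10, 4)
    # The fitted random weights depend entirely on random_state, so two
    # samplers seeded identically produce identical feature maps.
    features = RBFSampler(n_components=100, gamma=1.0,
                          random_state=np.random.RandomState(1)).fit_transform(X)
    print(features.shape)  # (10, 100)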

@@ -18,11 +18,10 @@

 class TruncatedSVD(autoPyTorchFeaturePreprocessingComponent):
     def __init__(self, target_dim: int = 128,
-                 random_state: Optional[Union[int, np.random.RandomState]] = None):
+                 random_state: Optional[np.random.RandomState] = None):
         self.target_dim = target_dim

-        self.random_state = random_state
-        super().__init__()
+        super().__init__(random_state=random_state)

     def fit(self, X: Dict[str, Any], y: Any = None) -> BaseEstimator:

@@ -1,4 +1,8 @@
-from typing import Any, Dict, List
+from typing import Any, Dict, List, Optional

+import numpy as np
+
+from sklearn.utils import check_random_state
+
 from autoPyTorch.pipeline.components.preprocessing.tabular_preprocessing.base_tabular_preprocessing import (
     autoPyTorchTabularPreprocessingComponent
@@ -8,7 +12,13 @@
 class autoPyTorchFeaturePreprocessingComponent(autoPyTorchTabularPreprocessingComponent):
     _required_properties: List[str] = ['handles_sparse']

-    def __init__(self) -> None:
+    def __init__(self, random_state: Optional[np.random.RandomState] = None) -> None:
+        if random_state is None:
+            # Trainer components need a random state for
+            # sampling -- for example in MixUp training
+            self.random_state = check_random_state(1)
+        else:
+            self.random_state = random_state
         super().__init__()

     def transform(self, X: Dict[str, Any]) -> Dict[str, Any]:
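
This base-class change is what lets every subclass above drop its local self.random_state assignment. Below is a self-contained sketch of the pattern (illustrative class names, not the actual autoPyTorch code). scikit-learn's check_random_state passes a RandomState through unchanged, wraps an int seed in a fresh RandomState, and returns the global NumPy state for None; here it is only called on the None branch, with a fixed seed of 1:

    from typing import Optional

    import numpy as np
    from sklearn.utils import check_random_state

    class Base:
        # Single owner of random_state, as in autoPyTorchFeaturePreprocessingComponent.
        def __init__(self, random_state: Optional[np.random.RandomState] = None) -> None:
            # None falls back to a fixed seed, keeping runs reproducible by default.
            self.random_state = check_random_state(1) if random_state is None else random_state

    class Component(Base):
        # Mirrors the preprocessors above: no local random_state bookkeeping.
        def __init__(self, n_components: int = 10,
                     random_state: Optional[np.random.RandomState] = None) -> None:
            self.n_components = n_components
            super().__init__(random_state=random_state)

    c = Component()  # no seed supplied
    assert isinstance(c.random_state, np.random.RandomState)
    print(c.random_state.randint(100))  # same value on every run: the seed is fixed

Defaulting to check_random_state(1) rather than leaving None in place trades flexibility for reproducibility: any two components constructed without an explicit state will draw identical random streams.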

test/conftest.py (22 additions, 14 deletions)

@@ -25,6 +25,9 @@
 from autoPyTorch.utils.pipeline import get_dataset_requirements


+N_SAMPLES = 200
+
+
 @pytest.fixture(scope="session")
 def callattr_ahead_of_alltests(request):
     """
@@ -191,7 +194,7 @@ def session_run_at_end():
 def get_tabular_data(task):
     if task == "classification_numerical_only":
         X, y = make_classification(
-            n_samples=200,
+            n_samples=N_SAMPLES,
             n_features=4,
             n_informative=3,
             n_redundant=1,
@@ -207,18 +210,18 @@ def get_tabular_data(task):
         X, y = fetch_openml(data_id=40981, return_X_y=True, as_frame=True)
         categorical_columns = [column for column in X.columns if X[column].dtype.name == 'category']
         X = X[categorical_columns]
-        X = X.iloc[0:200]
-        y = y.iloc[0:200]
+        X = X.iloc[0:N_SAMPLES]
+        y = y.iloc[0:N_SAMPLES]
         validator = TabularInputValidator(is_classification=True).fit(X.copy(), y.copy())

     elif task == "classification_numerical_and_categorical":
         X, y = fetch_openml(data_id=40981, return_X_y=True, as_frame=True)
-        X = X.iloc[0:200]
-        y = y.iloc[0:200]
+        X = X.iloc[0:N_SAMPLES]
+        y = y.iloc[0:N_SAMPLES]
         validator = TabularInputValidator(is_classification=True).fit(X.copy(), y.copy())

     elif task == "regression_numerical_only":
-        X, y = make_regression(n_samples=200,
+        X, y = make_regression(n_samples=N_SAMPLES,
                                n_features=4,
                                n_informative=3,
                                n_targets=1,
@@ -240,8 +243,8 @@ def get_tabular_data(task):
         else:
             X[column] = X[column].fillna(0)

-        X = X.iloc[0:200]
-        y = y.iloc[0:200]
+        X = X.iloc[0:N_SAMPLES]
+        y = y.iloc[0:N_SAMPLES]
         y = (y - y.mean()) / y.std()
         validator = TabularInputValidator(is_classification=False).fit(X.copy(), y.copy())

@@ -256,8 +259,8 @@ def get_tabular_data(task):
         else:
             X[column] = X[column].fillna(0)

-        X = X.iloc[0:200]
-        y = y.iloc[0:200]
+        X = X.iloc[0:N_SAMPLES]
+        y = y.iloc[0:N_SAMPLES]
         y = (y - y.mean()) / y.std()
         validator = TabularInputValidator(is_classification=False).fit(X.copy(), y.copy())
     elif task == 'iris':
@@ -288,7 +291,7 @@ def get_fit_dictionary(X, y, validator, backend):
         'num_run': np.random.randint(50),
         'device': 'cpu',
         'budget_type': 'epochs',
-        'epochs': 100,
+        'epochs': 5,
         'torch_num_threads': 1,
         'early_stopping': 10,
         'working_dir': '/tmp',
@@ -326,7 +329,7 @@ def dataset(request):
 @pytest.fixture
 def dataset_traditional_classifier_num_only():
     X, y = make_classification(
-        n_samples=200,
+        n_samples=N_SAMPLES,
         n_features=4,
         n_informative=3,
         n_redundant=1,
@@ -344,15 +347,15 @@ def dataset_traditional_classifier_categorical_only():
     X, y = fetch_openml(data_id=40981, return_X_y=True, as_frame=True)
     categorical_columns = [column for column in X.columns if X[column].dtype.name == 'category']
     X = X[categorical_columns]
-    X, y = X[:200].to_numpy(), y[:200].to_numpy().astype(np.int)
+    X, y = X[:N_SAMPLES].to_numpy(), y[:N_SAMPLES].to_numpy().astype(np.int)
     return X, y


 @pytest.fixture
 def dataset_traditional_classifier_num_categorical():
     X, y = fetch_openml(data_id=40981, return_X_y=True, as_frame=True)
     y = y.astype(np.int)
-    X, y = X[:200].to_numpy(), y[:200].to_numpy().astype(np.int)
+    X, y = X[:N_SAMPLES].to_numpy(), y[:N_SAMPLES].to_numpy().astype(np.int)
     return X, y

@@ -456,3 +459,8 @@ def loss_mse():
 @pytest.fixture
 def loss_details(request):
     return request.getfixturevalue(request.param)
+
+
+@pytest.fixture
+def n_samples():
+    return N_SAMPLES
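
With the constant also exposed as a fixture, tests can reference the dataset size by name rather than hard-coding 200. A hypothetical consumer (not part of this diff; pytest injects fixtures by matching parameter names):

    # test_sizes.py -- hypothetical example module
    def test_n_samples_matches_conftest_constant(n_samples):
        # pytest resolves n_samples from the conftest.py fixture above.
        assert n_samples == 200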