
Commit bbef3e4

Merge pull request #137 from perib/new_search_space_def
New search space def
2 parents fedc90d + 5425ee4 commit bbef3e4

6 files changed: +47 -11 lines changed


tpot2/builtin_modules/__init__.py

+1 -1

@@ -3,7 +3,7 @@
 from .column_one_hot_encoder import ColumnOneHotEncoder
 from .arithmetictransformer import ArithmeticTransformer
 from .arithmetictransformer import AddTransformer, mul_neg_1_Transformer, MulTransformer, SafeReciprocalTransformer, EQTransformer, NETransformer, GETransformer, GTTransformer, LETransformer, LTTransformer, MinTransformer, MaxTransformer, ZeroTransformer, OneTransformer, NTransformer
-from .passthrough import Passthrough
+from .passthrough import Passthrough, SkipTransformer
 from .imputer import ColumnSimpleImputer
 from .estimatortransformer import EstimatorTransformer
 from .passkbinsdiscretizer import PassKBinsDiscretizer

tpot2/builtin_modules/passthrough.py

+12 -0

@@ -1,4 +1,5 @@
 from sklearn.base import BaseEstimator, TransformerMixin
+import numpy as np
 
 class Passthrough(TransformerMixin,BaseEstimator):
 
@@ -7,3 +8,14 @@ def fit(self, X=None, y=None):
 
     def transform(self, X):
         return X
+
+
+class SkipTransformer(TransformerMixin,BaseEstimator):
+
+    def fit(self, X=None, y=None):
+        return self
+
+    def transform(self, X):
+        #empty array of same shape as X
+        return np.array([]).reshape(X.shape[0],0)
+

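The new SkipTransformer is the counterpart to Passthrough: instead of forwarding its input unchanged, its transform keeps the rows but drops every column. A minimal sketch of how it behaves, assuming the package is installed from this branch so the class is importable:

import numpy as np
from tpot2.builtin_modules import SkipTransformer

X = np.random.rand(5, 3)
skip = SkipTransformer()
# fit is a no-op; transform returns an array with the same number of rows and zero columns
print(skip.fit_transform(X).shape)  # (5, 0)

In a feature union this lets one branch contribute no features at all, effectively allowing the optimizer to switch a step off.
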
tpot2/config/get_configspace.py

+9 -4

@@ -27,7 +27,7 @@
 from tpot2.builtin_modules import AddTransformer, mul_neg_1_Transformer, MulTransformer, SafeReciprocalTransformer, EQTransformer, NETransformer, GETransformer, GTTransformer, LETransformer, LTTransformer, MinTransformer, MaxTransformer, ZeroTransformer, OneTransformer, NTransformer
 from tpot2.builtin_modules.genetic_encoders import DominantEncoder, RecessiveEncoder, HeterosisEncoder, UnderDominanceEncoder, OverDominanceEncoder
 from tpot2.builtin_modules import ZeroCount, ColumnOneHotEncoder, PassKBinsDiscretizer
-from tpot2.builtin_modules import Passthrough
+from tpot2.builtin_modules import Passthrough, SkipTransformer
 from sklearn.linear_model import SGDClassifier, LogisticRegression, SGDRegressor, Ridge, Lasso, ElasticNet, Lars, LassoLars, LassoLarsCV, RidgeCV, ElasticNetCV, PassiveAggressiveClassifier, ARDRegression
 from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, ExtraTreesRegressor, ExtraTreesClassifier, AdaBoostRegressor, AdaBoostClassifier, GradientBoostingRegressor,RandomForestRegressor, BaggingRegressor, ExtraTreesRegressor, HistGradientBoostingClassifier, HistGradientBoostingRegressor
 from sklearn.neural_network import MLPClassifier, MLPRegressor

@@ -45,7 +45,7 @@
 from sklearn.feature_selection import f_classif, f_regression #TODO create a selectomixin using these?
 from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
 from sklearn.gaussian_process import GaussianProcessRegressor, GaussianProcessClassifier
-
+from sklearn.impute import SimpleImputer
 
 all_methods = [SGDClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, MLPClassifier, DecisionTreeClassifier, XGBClassifier, KNeighborsClassifier, SVC, LogisticRegression, LGBMClassifier, LinearSVC, GaussianNB, BernoulliNB, MultinomialNB, ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor, DecisionTreeRegressor, KNeighborsRegressor, XGBRegressor, ZeroCount, ColumnOneHotEncoder, Binarizer, FastICA, FeatureAgglomeration, MaxAbsScaler, MinMaxScaler, Normalizer, Nystroem, PCA, PolynomialFeatures, RBFSampler, RobustScaler, StandardScaler, SelectFwe, SelectPercentile, VarianceThreshold, SGDRegressor, Ridge, Lasso, ElasticNet, Lars, LassoLars, LassoLarsCV, RidgeCV, SVR, LinearSVR, AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor, BaggingRegressor, ExtraTreesRegressor, DecisionTreeRegressor, KNeighborsRegressor, ElasticNetCV,
                 AdaBoostClassifier,MLPRegressor,

@@ -54,8 +54,9 @@
                 PowerTransformer, QuantileTransformer,ARDRegression, QuadraticDiscriminantAnalysis, PassiveAggressiveClassifier, LinearDiscriminantAnalysis,
                 DominantEncoder, RecessiveEncoder, HeterosisEncoder, UnderDominanceEncoder, OverDominanceEncoder,
                 GaussianProcessClassifier, BaggingClassifier,LGBMRegressor,
-                Passthrough,
+                Passthrough,SkipTransformer,
                 PassKBinsDiscretizer,
+                SimpleImputer,
                 ]
 
 

@@ -123,7 +124,7 @@
     "all_transformers" : ["transformers", "scalers"],
 
     "arithmatic": ["AddTransformer", "mul_neg_1_Transformer", "MulTransformer", "SafeReciprocalTransformer", "EQTransformer", "NETransformer", "GETransformer", "GTTransformer", "LETransformer", "LTTransformer", "MinTransformer", "MaxTransformer"],
-    "imputers": [],
+    "imputers": ["SimpleImputer"],
     "skrebate": ["ReliefF", "SURF", "SURFstar", "MultiSURF"],
     "genetic_encoders": ["DominantEncoder", "RecessiveEncoder", "HeterosisEncoder", "UnderDominanceEncoder", "OverDominanceEncoder"],
 

@@ -135,6 +136,8 @@
 
 def get_configspace(name, n_classes=3, n_samples=1000, n_features=100, random_state=None):
     match name:
+        case "SimpleImputer":
+            return imputers.simple_imputer_cs
 
         #autoqtl_builtins.py
         case "FeatureEncodingFrequencySelector":

@@ -152,6 +155,8 @@ def get_configspace(name, n_classes=3, n_samples=1000, n_features=100, random_st
 
         case "Passthrough":
            return {}
+        case "SkipTransformer":
+            return {}
 
         #classifiers.py
         case "LinearDiscriminantAnalysis":

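A hedged sketch of what the new dispatch cases return (the import path below mirrors the file location above and is an assumption about how the module is exposed): "SimpleImputer" now resolves to the ConfigurationSpace defined in tpot2/config/imputers.py, while "Passthrough" and the new "SkipTransformer" have nothing to tune and return an empty dict.

from tpot2.config.get_configspace import get_configspace  # assumed import path

cs = get_configspace("SimpleImputer")       # ConfigSpace.ConfigurationSpace with strategy/add_indicator
print(cs.sample_configuration())            # e.g. strategy='median', add_indicator=False
print(get_configspace("SkipTransformer"))   # {} -- no hyperparameters
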
tpot2/config/imputers.py

+1 -1

@@ -1,7 +1,7 @@
 from ConfigSpace import ConfigurationSpace
 from ConfigSpace import ConfigurationSpace, Integer, Float, Categorical, Normal
 
-simple_imputer = ConfigurationSpace(
+simple_imputer_cs = ConfigurationSpace(
     space = {
         'strategy' : Categorical('strategy', ['mean','median', 'most_frequent', ]),
         'add_indicator' : Categorical('add_indicator', [True, False]),

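A minimal sketch of turning the renamed simple_imputer_cs into an estimator; it covers only the two hyperparameters defined above, and converting the sampled Configuration with dict() assumes a ConfigSpace version where Configuration behaves like a mapping.

from sklearn.impute import SimpleImputer
from tpot2.config.imputers import simple_imputer_cs

config = simple_imputer_cs.sample_configuration()
params = dict(config)              # e.g. {'strategy': 'mean', 'add_indicator': True}
imputer = SimpleImputer(**params)
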
tpot2/search_spaces/pipelines/sequential.py

+23 -4

@@ -12,9 +12,10 @@ class SequentialPipelineIndividual(SklearnIndividual):
     # takes in a list of search spaces. each space is a list of SklearnIndividualGenerators.
     # will produce a pipeline of Sequential length. Each step in the pipeline will correspond to the the search space provided in the same index.
 
-    def __init__(self, search_spaces : List[SklearnIndividualGenerator], rng=None) -> None:
+    def __init__(self, search_spaces : List[SklearnIndividualGenerator], memory=None, rng=None) -> None:
         super().__init__()
         self.search_spaces = search_spaces
+        self.memory = memory
         self.pipeline = []
 
         for space in self.search_spaces:

@@ -25,6 +26,14 @@ def __init__(self, search_spaces : List[SklearnIndividualGenerator], rng=None) -
     #TODO, mutate all steps or just one?
     def mutate(self, rng=None):
         rng = np.random.default_rng()
+
+        # mutated = False
+        # for step in self.pipeline:
+        #     if rng.random() < 0.5:
+        #         if step.mutate(rng):
+        #             mutated = True
+        # return mutated
+
         step = rng.choice(self.pipeline)
         return step.mutate(rng)
 

@@ -102,6 +111,15 @@ def _crossover_swap_segment(self, other, rng):
     def _crossover_inner_step(self, other, rng):
         rng = np.random.default_rng()
 
+        # crossover_success = False
+        # for idx in range(len(self.pipeline)):
+        #     if rng.random() < 0.5:
+        #         if self.pipeline[idx].crossover(other.pipeline[idx], rng):
+        #             crossover_success = True
+
+        # return crossover_success
+
+
         crossover_success = False
         for idx in range(len(self.pipeline)):
             if rng.random() < 0.5:

@@ -111,7 +129,7 @@ def _crossover_inner_step(self, other, rng):
         return crossover_success
 
     def export_pipeline(self):
-        return sklearn.pipeline.make_pipeline(*[step.export_pipeline() for step in self.pipeline])
+        return sklearn.pipeline.make_pipeline(*[step.export_pipeline() for step in self.pipeline], memory=self.memory)
 
     def unique_id(self):
         l = [step.unique_id() for step in self.pipeline]

@@ -122,12 +140,13 @@ def unique_id(self):
 
 
 class SequentialPipeline(SklearnIndividualGenerator):
-    def __init__(self, search_spaces : List[SklearnIndividualGenerator] ) -> None:
+    def __init__(self, search_spaces : List[SklearnIndividualGenerator], memory=None ) -> None:
         """
         Takes in a list of search spaces. will produce a pipeline of Sequential length. Each step in the pipeline will correspond to the the search space provided in the same index.
         """
 
         self.search_spaces = search_spaces
+        self.memory = memory
 
     def generate(self, rng=None):
-        return SequentialPipelineIndividual(self.search_spaces, rng=rng)
+        return SequentialPipelineIndividual(self.search_spaces, memory=self.memory, rng=rng)

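The new memory argument is simply forwarded to sklearn.pipeline.make_pipeline when an individual is exported. A short sketch of the underlying scikit-learn behavior (the cache directory name is a hypothetical example): with memory set, fitted transformers are cached with joblib and reused across repeated fits instead of being refit.

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# memory may be a directory path or a joblib.Memory instance;
# transformer fits are cached there across repeated pipeline fits
pipe = make_pipeline(StandardScaler(), LogisticRegression(), memory="./.tpot2_cache")
pipe.fit([[0.0], [1.0], [2.0], [3.0]], [0, 0, 1, 1])
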
tpot2/search_spaces/pipelines/union.py

+1 -1

@@ -34,7 +34,7 @@ def _crossover(self, other, rng=None):
         #swap a random step in the pipeline with the corresponding step in the other pipeline
         rng = np.random.default_rng()
 
-        cx_funcs = [self._crossover_swap_random_steps, self._crossover_inner_step]
+        cx_funcs = [self._crossover_inner_step]
         rng.shuffle(cx_funcs)
         for cx_func in cx_funcs:
             if cx_func(other, rng):

0 commit comments
