
Commit 9db1e33

Merge pull request #145 from perib/dev
Dev

2 parents ed95419 + db614d8

13 files changed: +2,165 −30 lines

.github/workflows/docs.yml (+1)

@@ -22,6 +22,7 @@ jobs:
           pip install --upgrade pip
           pip install .
           pip install -r docs/requirements_docs.txt
+          pip install mkdocstrings[python] griffe

       # - name: Convert notebooks to HTML
       #   # if: ${{ github.event_name == 'push' && contains(github.event.head_commit.modified, 'Tutorial/') && contains(github.event.head_commit.modified, '.ipynb') }}
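The extra install step pulls in mkdocstrings' Python handler and griffe, presumably so the MkDocs build can render API-reference pages from docstrings on top of what docs/requirements_docs.txt already provides.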

Tutorial/amltk_search_space_parser_example.ipynb (+1,897)

Large diffs are not rendered by default.

setup.py (+2 −1)

@@ -53,7 +53,8 @@ def calculate_version():
     extras_require={
         'skrebate': ['skrebate>=0.3.4'],
         'mdr': ['scikit-mdr>=0.4.4'],
-        'sklearnex' : ['scikit-learn-intelex>=2023.2.1']
+        'sklearnex' : ['scikit-learn-intelex>=2023.2.1'],
+        'amltk' : ['amltk>=1.12.1'],
     },
     classifiers=[
         'Intended Audience :: Science/Research',
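With the new extra in place, the optional amltk dependency can be pulled in at install time with the usual extras syntax, e.g. pip install tpot2[amltk].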

tpot2/__init__.py (+1 −1)

@@ -8,9 +8,9 @@
 from .population import Population

 from . import builtin_modules
-from . import utils
 from . import config
 from . import search_spaces
+from . import utils
 from . import evolvers
 from . import objectives
 from . import selectors
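Reordering the import so that utils comes after search_spaces is presumably to avoid a circular import: tpot2.utils now tries to load the amltk parser (see tpot2/utils/__init__.py below), which in turn needs the search-space classes.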

tpot2/builtin_modules/feature_set_selector.py (+3)

@@ -92,6 +92,9 @@ def fit(self, X, y=None):

     # def transform(self, X):

+    def _get_tags(self):
+        tags = {"allow_nan": True, "requires_y": False}
+        return tags

     def _get_support_mask(self):
         """

tpot2/config/classifiers.py (+1 −1)

@@ -535,7 +535,7 @@ def MLPClassifier_hyperparameter_parser(params):
 def get_GaussianProcessClassifier_ConfigurationSpace(n_features, random_state):
     space = {
         'n_features': n_features,
-        'alpha': Float("alpha", bounds=(1e-14, 1.0), log=True),
+        'alpha': Float("alpha", bounds=(1e-10, 1.0), log=True),
         'thetaL': Float("thetaL", bounds=(1e-10, 1e-3), log=True),
         'thetaU': Float("thetaU", bounds=(1.0, 100000), log=True),
     }

tpot2/config/get_configspace.py (+16 −16)

@@ -406,12 +406,12 @@ def get_configspace(name, n_classes=3, n_samples=1000, n_features=100, random_st
         raise ValueError(f"Could not find configspace for {name}")


-def get_search_space(name, n_classes=3, n_samples=100, n_features=100, random_state=None, return_choice_pipeline=True):
+def get_search_space(name, n_classes=3, n_samples=100, n_features=100, random_state=None, return_choice_pipeline=True, base_node=EstimatorNode):


     #if list of names, return a list of EstimatorNodes
     if isinstance(name, list) or isinstance(name, np.ndarray):
-        search_spaces = [get_search_space(n, n_classes=n_classes, n_samples=n_samples, n_features=n_features, random_state=random_state, return_choice_pipeline=False) for n in name]
+        search_spaces = [get_search_space(n, n_classes=n_classes, n_samples=n_samples, n_features=n_features, random_state=random_state, return_choice_pipeline=False, base_node=base_node) for n in name]
         #remove Nones
         search_spaces = [s for s in search_spaces if s is not None]

@@ -422,12 +422,12 @@ def get_search_space(name, n_classes=3, n_samples=100, n_features=100, random_st

     if name in GROUPNAMES:
         name_list = GROUPNAMES[name]
-        return get_search_space(name_list, n_classes=n_classes, n_samples=n_samples, n_features=n_features, random_state=random_state, return_choice_pipeline=return_choice_pipeline)
+        return get_search_space(name_list, n_classes=n_classes, n_samples=n_samples, n_features=n_features, random_state=random_state, return_choice_pipeline=return_choice_pipeline, base_node=base_node)

-    return get_node(name, n_classes=n_classes, n_samples=n_samples, n_features=n_features, random_state=random_state)
+    return get_node(name, n_classes=n_classes, n_samples=n_samples, n_features=n_features, random_state=random_state, base_node=base_node)


-def get_node(name, n_classes=3, n_samples=100, n_features=100, random_state=None):
+def get_node(name, n_classes=3, n_samples=100, n_features=100, random_state=None, base_node=EstimatorNode):

     #these are wrappers that take in another estimator as a parameter
     # TODO Add AdaBoostRegressor, AdaBoostClassifier as wrappers? wrap a decision tree with different params?

@@ -461,39 +461,39 @@ def get_node(name, n_classes=3, n_samples=100, n_features=100, random_state=None
         return EstimatorNode(STRING_TO_CLASS[name], configspace, hyperparameter_parser=imputers.IterativeImputer_hyperparameter_parser)
     if name == "RobustScaler":
         configspace = get_configspace(name, n_classes=n_classes, n_samples=n_samples, random_state=random_state)
-        return EstimatorNode(STRING_TO_CLASS[name], configspace, hyperparameter_parser=transformers.robust_scaler_hyperparameter_parser)
+        return base_node(STRING_TO_CLASS[name], configspace, hyperparameter_parser=transformers.robust_scaler_hyperparameter_parser)
     if name == "GradientBoostingClassifier":
         configspace = get_configspace(name, n_classes=n_classes, n_samples=n_samples, random_state=random_state)
-        return EstimatorNode(STRING_TO_CLASS[name], configspace, hyperparameter_parser=classifiers.GradientBoostingClassifier_hyperparameter_parser)
+        return base_node(STRING_TO_CLASS[name], configspace, hyperparameter_parser=classifiers.GradientBoostingClassifier_hyperparameter_parser)
     if name == "HistGradientBoostingClassifier":
         configspace = get_configspace(name, n_classes=n_classes, n_samples=n_samples, random_state=random_state)
-        return EstimatorNode(STRING_TO_CLASS[name], configspace, hyperparameter_parser=classifiers.HistGradientBoostingClassifier_hyperparameter_parser)
+        return base_node(STRING_TO_CLASS[name], configspace, hyperparameter_parser=classifiers.HistGradientBoostingClassifier_hyperparameter_parser)
     if name == "GradientBoostingRegressor":
         configspace = get_configspace(name, n_classes=n_classes, n_samples=n_samples, random_state=random_state)
-        return EstimatorNode(STRING_TO_CLASS[name], configspace, hyperparameter_parser=regressors.GradientBoostingRegressor_hyperparameter_parser)
+        return base_node(STRING_TO_CLASS[name], configspace, hyperparameter_parser=regressors.GradientBoostingRegressor_hyperparameter_parser)
     if name == "HistGradientBoostingRegressor":
         configspace = get_configspace(name, n_classes=n_classes, n_samples=n_samples, random_state=random_state)
-        return EstimatorNode(STRING_TO_CLASS[name], configspace, hyperparameter_parser=regressors.HistGradientBoostingRegressor_hyperparameter_parser)
+        return base_node(STRING_TO_CLASS[name], configspace, hyperparameter_parser=regressors.HistGradientBoostingRegressor_hyperparameter_parser)
     if name == "MLPClassifier":
         configspace = get_configspace(name, n_classes=n_classes, n_samples=n_samples, random_state=random_state)
-        return EstimatorNode(STRING_TO_CLASS[name], configspace, hyperparameter_parser=classifiers.MLPClassifier_hyperparameter_parser)
+        return base_node(STRING_TO_CLASS[name], configspace, hyperparameter_parser=classifiers.MLPClassifier_hyperparameter_parser)
     if name == "MLPRegressor":
         configspace = get_configspace(name, n_classes=n_classes, n_samples=n_samples, random_state=random_state)
-        return EstimatorNode(STRING_TO_CLASS[name], configspace, hyperparameter_parser=regressors.MLPRegressor_hyperparameter_parser)
+        return base_node(STRING_TO_CLASS[name], configspace, hyperparameter_parser=regressors.MLPRegressor_hyperparameter_parser)
     if name == "GaussianProcessRegressor":
         configspace = get_configspace(name, n_classes=n_classes, n_samples=n_samples, random_state=random_state)
-        return EstimatorNode(STRING_TO_CLASS[name], configspace, hyperparameter_parser=regressors.GaussianProcessRegressor_hyperparameter_parser)
+        return base_node(STRING_TO_CLASS[name], configspace, hyperparameter_parser=regressors.GaussianProcessRegressor_hyperparameter_parser)
     if name == "GaussianProcessClassifier":
         configspace = get_configspace(name, n_classes=n_classes, n_samples=n_samples, random_state=random_state)
-        return EstimatorNode(STRING_TO_CLASS[name], configspace, hyperparameter_parser=classifiers.GaussianProcessClassifier_hyperparameter_parser)
+        return base_node(STRING_TO_CLASS[name], configspace, hyperparameter_parser=classifiers.GaussianProcessClassifier_hyperparameter_parser)
     if name == "FeatureAgglomeration":
         configspace = get_configspace(name, n_features=n_features)
-        return EstimatorNode(STRING_TO_CLASS[name], configspace, hyperparameter_parser=transformers.FeatureAgglomeration_hyperparameter_parser)
+        return base_node(STRING_TO_CLASS[name], configspace, hyperparameter_parser=transformers.FeatureAgglomeration_hyperparameter_parser)

     configspace = get_configspace(name, n_classes=n_classes, n_samples=n_samples, n_features=n_features, random_state=random_state)
     if configspace is None:
         #raise warning
         warnings.warn(f"Could not find configspace for {name}")
         return None

-    return EstimatorNode(STRING_TO_CLASS[name], configspace)
+    return base_node(STRING_TO_CLASS[name], configspace)
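A hedged sketch of what the new base_node hook enables: any factory with EstimatorNode's call signature can be swapped in, for example the experimental gradual node added later in this commit (the import paths below are assumed from this commit's layout):

from tpot2.config import get_search_space  # assumed re-export of get_configspace.get_search_space
from tpot2.search_spaces.nodes.estimator_node_gradual import EstimatorNode_gradual  # assumed module path

# every node in the returned search space is now built by EstimatorNode_gradual
# instead of the default EstimatorNode
space = get_search_space("RandomForestClassifier", n_classes=2,
                         base_node=EstimatorNode_gradual)
ind = space.generate(rng=0)
print(ind.export_pipeline())  # a RandomForestClassifier with sampled hyperparameters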

tpot2/config/regressors.py (+1 −1)

@@ -354,7 +354,7 @@ def get_ExtraTreesRegressor_ConfigurationSpace(random_state):
 def get_GaussianProcessRegressor_ConfigurationSpace(n_features, random_state):
     space = {
         'n_features': n_features,
-        'alpha': Float("alpha", bounds=(1e-14, 1.0), log=True),
+        'alpha': Float("alpha", bounds=(1e-10, 1.0), log=True),
         'thetaL': Float("thetaL", bounds=(1e-10, 1e-3), log=True),
         'thetaU': Float("thetaU", bounds=(1.0, 100000), log=True),
     }
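Both GaussianProcess configuration spaces get the same tightened floor for alpha. Because the hyperparameter is log-scaled, values are drawn log-uniformly, so raising the lower bound from 1e-14 to 1e-10 simply cuts off the numerically fragile tail; a small sketch with the public ConfigSpace API:

from ConfigSpace import ConfigurationSpace, Float

cs = ConfigurationSpace()
cs.add_hyperparameters([Float("alpha", bounds=(1e-10, 1.0), log=True)])

sample = cs.sample_configuration()
assert sample["alpha"] >= 1e-10  # the old bound allowed values down to 1e-14
print(sample["alpha"])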

tpot2/search_spaces/base.py (−8)

@@ -1,18 +1,10 @@
 import tpot2
-import numpy as np
-import pandas as pd
 import sklearn
-from tpot2 import config
-from typing import Generator, List, Tuple, Union
-import random
 from sklearn.base import BaseEstimator
 import sklearn
 import networkx as nx
 from . import graph_utils
 from typing import final
-from abc import ABC, abstractmethod
-
-

tpot2/search_spaces/nodes/estimator_node_gradual.py (+146, new file)
@@ -0,0 +1,146 @@
+# try https://automl.github.io/ConfigSpace/main/api/hyperparameters.html
+
+import numpy as np
+from tpot2.search_spaces.base import SklearnIndividual, SklearnIndividualGenerator
+from ConfigSpace import ConfigurationSpace
+from typing import final
+import ConfigSpace
+
+
+NONE_SPECIAL_STRING = "<NONE>"
+TRUE_SPECIAL_STRING = "<TRUE>"
+FALSE_SPECIAL_STRING = "<FALSE>"
+
+
+def default_hyperparameter_parser(params:dict) -> dict:
+    return params
+
+
+# NOTE: This is not the default, currently experimental
+class EstimatorNodeIndividual_gradual(SklearnIndividual):
+    """
+    Note that ConfigurationSpace does not support None as a parameter. Instead, use the special string "<NONE>". TPOT will automatically replace instances of this string with the Python None.
+
+    Parameters
+    ----------
+    method : type
+        The class of the estimator to be used
+
+    space : ConfigurationSpace|dict
+        The hyperparameter space to be used. If a dict is passed, hyperparameters are fixed and not learned.
+
+    """
+    def __init__(self, method: type,
+                 space: ConfigurationSpace|dict, #TODO If a dict is passed, hyperparameters are fixed and not learned. Is this confusing? Should we make a second node type?
+                 hyperparameter_parser: callable = None,
+                 rng=None) -> None:
+        super().__init__()
+        self.method = method
+        self.space = space
+
+        if hyperparameter_parser is None:
+            self.hyperparameter_parser = default_hyperparameter_parser
+        else:
+            self.hyperparameter_parser = hyperparameter_parser
+
+        if isinstance(space, dict):
+            self.hyperparameters = space
+        else:
+            rng = np.random.default_rng(rng)
+            self.space.seed(rng.integers(0, 2**32))
+            self.hyperparameters = dict(self.space.sample_configuration())
+
+        self.check_hyperparameters_for_None()
+
+    def mutate(self, rng=None):
+        if isinstance(self.space, dict):
+            return False
+        self.hyperparameters = gradual_hyperparameter_update(params=self.hyperparameters, configspace=self.space, rng=rng)
+        self.check_hyperparameters_for_None()
+        return True
+
+    def crossover(self, other, rng=None):
+        if isinstance(self.space, dict):
+            return False
+
+        rng = np.random.default_rng(rng)
+        if self.method != other.method:
+            return False
+
+        #loop through hyperparameters, randomly swap items in self.hyperparameters with items in other.hyperparameters
+        for hyperparameter in self.space:
+            if rng.choice([True, False]):
+                if hyperparameter in other.hyperparameters:
+                    self.hyperparameters[hyperparameter] = other.hyperparameters[hyperparameter]
+
+        self.check_hyperparameters_for_None()
+
+        return True
+
+    def check_hyperparameters_for_None(self):
+        for key, value in self.hyperparameters.items():
+            #if string
+            if isinstance(value, str):
+                if value == NONE_SPECIAL_STRING:
+                    self.hyperparameters[key] = None
+                elif value == TRUE_SPECIAL_STRING:
+                    self.hyperparameters[key] = True
+                elif value == FALSE_SPECIAL_STRING:
+                    self.hyperparameters[key] = False
+
+    @final #this method should not be overridden, instead override hyperparameter_parser
+    def export_pipeline(self, **kwargs):
+        return self.method(**self.hyperparameter_parser(self.hyperparameters))
+
+    def unique_id(self):
+        #return a dictionary of the method and the hyperparameters
+        method_str = self.method.__name__
+        params = list(self.hyperparameters.keys())
+        params = sorted(params)
+
+        id_str = f"{method_str}({', '.join([f'{param}={self.hyperparameters[param]}' for param in params])})"
+
+        return id_str
+
+def gradual_hyperparameter_update(params:dict, configspace:ConfigurationSpace, rng=None):
+    rng = np.random.default_rng(rng)
+    configspace.seed(rng.integers(0, 2**32))
+    new_params = dict(configspace.sample_configuration())
+    for param in list(new_params.keys()):
+        #if parameter is float, multiply by normal distribution
+        if param not in params:
+            continue
+        try:
+            if issubclass(type(configspace[param]), ConfigSpace.hyperparameters.hyperparameter.FloatHyperparameter):
+
+                if configspace[param].log:
+                    new_params[param] = params[param] * rng.lognormal(0, 1)
+                else:
+                    new_params[param] = params[param] + rng.normal(0, .1) * (configspace[param].upper - configspace[param].lower)
+                # if check if above or below min and cap
+                if new_params[param] < configspace[param].lower:
+                    new_params[param] = configspace[param].lower
+                elif new_params[param] > configspace[param].upper:
+                    new_params[param] = configspace[param].upper
+            #if parameter is integer, add normal distribution
+            elif issubclass(type(configspace[param]), ConfigSpace.hyperparameters.hyperparameter.IntegerHyperparameter):
+                new_params[param] = params[param] * np.random.normal(0, 1)
+                # if check if above or below min and cap
+                if new_params[param] < configspace[param].lower:
+                    new_params[param] = configspace[param].lower
+                elif new_params[param] > configspace[param].upper:
+                    new_params[param] = configspace[param].upper
+                new_params[param] = int(new_params[param])
+        except:
+            pass
+
+    return new_params
+
+class EstimatorNode_gradual(SklearnIndividualGenerator):
+    def __init__(self, method, space, hyperparameter_parser=default_hyperparameter_parser):
+        self.method = method
+        self.space = space
+        self.hyperparameter_parser = hyperparameter_parser
+
+    def generate(self, rng=None):
+        return EstimatorNodeIndividual_gradual(self.method, self.space, hyperparameter_parser=self.hyperparameter_parser, rng=rng)
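A hedged usage sketch of the new experimental node: unlike a fresh resample, mutation nudges floats multiplicatively (log-scale) or by a small additive normal step, capped at the bounds. The module path is an assumption based on this commit's layout:

from ConfigSpace import ConfigurationSpace, Float, Integer
from sklearn.tree import DecisionTreeClassifier
from tpot2.search_spaces.nodes.estimator_node_gradual import EstimatorNode_gradual  # assumed path

space = ConfigurationSpace()
space.add_hyperparameters([
    Integer("max_depth", bounds=(1, 20)),
    Float("min_weight_fraction_leaf", bounds=(0.0, 0.5)),
])

node = EstimatorNode_gradual(DecisionTreeClassifier, space)
ind = node.generate(rng=0)
before = dict(ind.hyperparameters)
ind.mutate(rng=1)  # gradual_hyperparameter_update drifts each value, capped at its bounds
print(before, "->", ind.hyperparameters)
print(ind.export_pipeline())  # DecisionTreeClassifier built from the mutated values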

tpot2/search_spaces/nodes/genetic_feature_selection.py (+15 −1)

@@ -15,15 +15,29 @@
 class MaskSelector(BaseEstimator, SelectorMixin):
     """Select predefined feature subsets."""

-    def __init__(self, mask):
+    def __init__(self, mask, set_output_transform=None):
         self.mask = mask
+        self.set_output_transform = set_output_transform
+        if set_output_transform is not None:
+            self.set_output(transform=set_output_transform)

     def fit(self, X, y=None):
+        self.n_features_in_ = X.shape[1]
+        if isinstance(X, pd.DataFrame):
+            self.feature_names_in_ = X.columns
+        # self.set_output(transform="pandas")
+        self.is_fitted_ = True #so sklearn knows it's fitted
         return self

+    def _get_tags(self):
+        tags = {"allow_nan": True, "requires_y": False}
+        return tags
+
     def _get_support_mask(self):
         return np.array(self.mask)

+    def get_feature_names_out(self, input_features=None):
+        return self.feature_names_in_[self.get_support()]

 class GeneticFeatureSelectorIndividual(SklearnIndividual):
     def __init__( self,
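A quick sketch of the updated MaskSelector on a DataFrame; this relies on scikit-learn's set_output API (>= 1.2), and the import path is assumed from this file:

import pandas as pd
from tpot2.search_spaces.nodes.genetic_feature_selection import MaskSelector  # assumed path

X = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]})
sel = MaskSelector(mask=[True, False, True], set_output_transform="pandas")

out = sel.fit_transform(X)                # a DataFrame, thanks to set_output
print(list(sel.get_feature_names_out()))  # ['a', 'c'] via feature_names_in_
print(out.columns.tolist())               # ['a', 'c']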

tpot2/utils/__init__.py (+10 −1)

@@ -1,2 +1,11 @@
 from . import eval_utils
-from .utils import *
+from .utils import *
+
+# If amltk is installed, import the parser
+try:
+    from .amltk_parser import tpot2_parser
+except ImportError:
+    # Handle the case when amltk is not installed
+    pass
+    # print("amltk is not installed. Please install it to use tpot2_parser.")
+    # Optional: raise an exception or provide alternative functionality
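Downstream code can then feature-detect the optional parser instead of hard-requiring the amltk extra; a minimal sketch:

# tpot2_parser is only re-exported from tpot2.utils when amltk is installed
try:
    from tpot2.utils import tpot2_parser
except ImportError:
    tpot2_parser = None

if tpot2_parser is None:
    print("amltk extra not installed; skipping amltk -> TPOT2 search space conversion")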
