Merge pull request #127 from perib/new_search_space_def
added unique_id, fixed issue where graphpipeline not correctly identi…
perib authored Apr 19, 2024
2 parents b2a00ed + d61cd29 commit 5448a43
Showing 15 changed files with 626 additions and 736 deletions.
1,101 changes: 532 additions & 569 deletions Tutorial/2_Search_Spaces.ipynb

Large diffs are not rendered by default.

8 changes: 6 additions & 2 deletions tpot2/config/classifiers.py
@@ -307,7 +307,7 @@ def get_LinearDiscriminantAnalysis_ConfigurationSpace():

#### Gradient Boosting Classifiers

def get_GradientBoostingClassifier_ConfigurationSpace(n_features, random_state):
def get_GradientBoostingClassifier_ConfigurationSpace(n_classes, n_features, random_state):
early_stop = Categorical("early_stop", ["off", "valid", "train"])
n_iter_no_change = Integer("n_iter_no_change",bounds=(1,20))
validation_fraction = Float("validation_fraction", bounds=(0.01, 0.4))
@@ -316,7 +316,6 @@ def get_GradientBoostingClassifier_ConfigurationSpace(n_features, random_state):
validation_fraction_cond = EqualsCondition(validation_fraction, early_stop, "valid")

space = {
'loss': Categorical("loss", ['log_loss', 'exponential']),
'learning_rate': Float("learning_rate", bounds=(1e-3, 1), log=True),
'min_samples_leaf': Integer("min_samples_leaf", bounds=(1, 200)),
'min_samples_split': Integer("min_samples_split", bounds=(2, 20)),
@@ -327,6 +326,11 @@ def get_GradientBoostingClassifier_ConfigurationSpace(n_features, random_state):
'tol': 1e-4,
}

if n_classes == 2:
space['loss']= Categorical("loss", ['log_loss', 'exponential'])
else:
space['loss'] = "log_loss"

if random_state is not None: #This is required because configspace doesn't allow None as a value
space['random_state'] = random_state

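Note on the change above: scikit-learn's GradientBoostingClassifier accepts loss='exponential' (AdaBoost-style boosting) only for binary targets; fitting it on three or more classes raises a ValueError, which is why the space now gates the loss choice on n_classes. A minimal sketch of the failure this avoids (illustrative, not part of the diff):

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier

# Three classes: sampling loss='exponential' here would break the fit.
X, y = make_classification(n_samples=60, n_classes=3, n_informative=4,
                           random_state=0)
try:
    GradientBoostingClassifier(loss="exponential").fit(X, y)
except ValueError as err:
    print(err)  # sklearn rejects the exponential loss for multiclass y
```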
2 changes: 1 addition & 1 deletion tpot2/config/get_configspace.py
@@ -223,7 +223,7 @@ def get_configspace(name, n_classes=3, n_samples=100, n_features=100, random_sta
case "RandomForestClassifier":
return classifiers.get_RandomForestClassifier_ConfigurationSpace(n_features=n_features, random_state=random_state)
case "GradientBoostingClassifier":
return classifiers.get_GradientBoostingClassifier_ConfigurationSpace(n_features=n_features, random_state=random_state)
return classifiers.get_GradientBoostingClassifier_ConfigurationSpace(n_classes=n_classes, n_features=n_features, random_state=random_state)
case "HistGradientBoostingClassifier":
return classifiers.get_HistGradientBoostingClassifier_ConfigurationSpace(n_features=n_features, random_state=random_state)
case "XGBClassifier":
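With n_classes threaded through the dispatcher, a binary dataset can still sample the exponential loss while multiclass datasets get a fixed 'log_loss'. A hedged usage sketch (assuming get_configspace is importable from tpot2.config.get_configspace with these keywords):

```python
from tpot2.config.get_configspace import get_configspace  # assumed import path

# Binary problem: the sampled configuration may include loss='exponential'.
space = get_configspace("GradientBoostingClassifier",
                        n_classes=2, n_features=20, random_state=1)
print(dict(space.sample_configuration()))
```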
6 changes: 4 additions & 2 deletions tpot2/graphsklearn.py
@@ -11,6 +11,8 @@
from sklearn.utils.validation import check_memory
from sklearn.preprocessing import LabelEncoder

from sklearn.base import is_classifier, is_regressor

#labels - str
#attributes - "instance" -> instance of the type

@@ -128,7 +130,7 @@ def fit_sklearn_digraph(graph: nx.DiGraph,
# instance.fit(this_X, y)


if issubclass(type(instance), sklearn.base.RegressorMixin) or issubclass(type(instance), sklearn.base.ClassifierMixin):
if is_classifier(instance) or is_regressor(instance):
transformed, instance = estimator_fit_transform_override_cross_val_predict_cached(instance, this_X, y, cv=cross_val_predict_cv, method=method)
else:
transformed, instance = fit_transform_one_cached(instance, this_X, y)#instance.fit_transform(this_X,y)
@@ -168,7 +170,7 @@ def transform_sklearn_digraph(graph: nx.DiGraph,
else:
this_X = np.hstack([transformed_steps[child] for child in get_ordered_successors(graph, node)])

if issubclass(type(instance), sklearn.base.RegressorMixin) or issubclass(type(instance), sklearn.base.ClassifierMixin):
if is_classifier(instance) or is_regressor(instance):
this_method = _method_name(instance.__class__.__name__, instance, method)
transformed = getattr(instance, this_method)(this_X)
else:
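The switch from issubclass checks to is_classifier/is_regressor is more than cosmetic: the helpers test scikit-learn's estimator-type tag rather than the class hierarchy, so duck-typed or wrapped estimators are routed to the cross_val_predict path correctly even when they don't inherit the mixins. An illustrative sketch (class name made up; behavior matches scikit-learn versions contemporary with this commit):

```python
from sklearn.base import BaseEstimator, ClassifierMixin, is_classifier

class DuckClassifier(BaseEstimator):
    # Tagged as a classifier without inheriting ClassifierMixin.
    _estimator_type = "classifier"

print(is_classifier(DuckClassifier()))              # True
print(issubclass(DuckClassifier, ClassifierMixin))  # False
```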
4 changes: 0 additions & 4 deletions tpot2/search_spaces/base.py
@@ -40,10 +40,6 @@ def generate(self, rng=None) -> SklearnIndividual:
pass






def flatten_graphpipeline(est):
flattened_full_graph = est.graph.copy()

2 changes: 1 addition & 1 deletion tpot2/search_spaces/nodes/estimator_node.py
@@ -95,7 +95,7 @@ def export_pipeline(self, **kwargs):

def unique_id(self):
#return a tuple of the method and a canonical (sorted) string of the parsed hyperparameters
return (self.method, self.hyperparameters)
return (self.method, str(tuple(sorted(list(self.hyperparameter_parser(self.hyperparameters).items())))))

class EstimatorNode(SklearnIndividualGenerator):
def __init__(self, method, space, hyperparameter_parser=default_hyperparameter_parser):
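The rewritten unique_id makes the identifier order-insensitive and hashable: two dicts with the same items in different insertion order stringify differently, and dicts themselves can't serve as dict keys, so the parsed hyperparameters are sorted and stringified first. A small sketch of the idea (helper name is illustrative):

```python
def canonical_id(method, params):
    # Sorting removes insertion-order effects; stringifying makes the
    # result safe to embed in a hashable tuple.
    return (method, str(tuple(sorted(params.items()))))

a = canonical_id("KNeighborsClassifier", {"n_neighbors": 5, "weights": "uniform"})
b = canonical_id("KNeighborsClassifier", {"weights": "uniform", "n_neighbors": 5})
assert a == b and hash(a) == hash(b)
```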
2 changes: 1 addition & 1 deletion tpot2/search_spaces/nodes/estimator_node_simple.py
@@ -12,7 +12,7 @@ class EstimatorNodeIndividual(SklearnIndividual):
def __init__(self, method, space ) -> None:
super().__init__()
self.method = method
self.space = space
self.space = space #a dictionary mapping hyperparameter names to their spaces: a list means a categorical choice, a tuple means a continuous range, and a single value means the hyperparameter is fixed

self._mutate_hyperparameters()

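For reference, a space following the convention described in the new comment might look like this (hyperparameter names are illustrative):

```python
space = {
    "criterion": ["gini", "entropy"],  # list   -> categorical choice
    "max_depth": (1, 20),              # tuple  -> range to sample from
    "n_jobs": 1,                       # scalar -> fixed value
}
```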
2 changes: 1 addition & 1 deletion tpot2/search_spaces/nodes/fss_node.py
@@ -60,7 +60,7 @@ def export_pipeline(self):


def unique_id(self):
return self.selected_subset_name
return ("FSS", self.selected_subset_name)


class FSSNode(SklearnIndividualGenerator):
2 changes: 1 addition & 1 deletion tpot2/search_spaces/nodes/genetic_feature_selection.py
@@ -146,7 +146,7 @@ def export_pipeline(self):


def unique_id(self):
return self.mask
return tuple(self.mask)


class GeneticFeatureSelectorNode(SklearnIndividualGenerator):
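Converting the mask to a tuple is what makes the id usable as a dict or set key: lists and NumPy arrays are unhashable, tuples are not. A quick illustration:

```python
import numpy as np

mask = np.array([True, False, True])
try:
    hash(mask)
except TypeError as err:
    print(err)            # numpy arrays are unhashable
print(hash(tuple(mask)))  # the same values as a tuple hash fine
```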
65 changes: 57 additions & 8 deletions tpot2/search_spaces/pipelines/graph.py
@@ -10,6 +10,7 @@
from ..nodes.estimator_node import EstimatorNodeIndividual
from typing import Union, Callable
import sklearn
from functools import partial

class GraphPipelineIndividual(SklearnIndividual):
"""
@@ -100,10 +101,10 @@ def __init__(

self.merge_duplicated_nodes_toggle = True

self.graphkey = None

def mutate(self, rng=None):
rng = np.random.default_rng(rng)
self.key = None

rng.shuffle(self.mutate_methods_list)
for mutate_method in self.mutate_methods_list:
@@ -130,7 +131,8 @@ def mutate(self, rng=None):
print('something went wrong with ', mutate_method)
except:
pass


self.graphkey = None
return True

return False
@@ -335,6 +337,9 @@ def crossover(self, ind2, rng=None):
print('something went wrong with ', crossover_method)
except:
pass

if finished:
self.graphkey = None

return finished

@@ -661,12 +666,17 @@ def plot(self):
plt.show()






def unique_id(self):
return self
if self.graphkey is None:
#copy self.graph
new_graph = self.graph.copy()
for n in new_graph.nodes:
new_graph.nodes[n]['label'] = n.unique_id()

new_graph = nx.convert_node_labels_to_integers(new_graph)
self.graphkey = GraphKey(new_graph)

return self.graphkey


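The new unique_id builds (and caches in self.graphkey, invalidated after a successful mutate or crossover) a canonical copy of the graph: each node object is replaced by an integer and its label attribute is set to that node's own unique_id, so the identity depends only on structure plus per-node ids. A sketch of the relabeling step (node names are illustrative):

```python
import networkx as nx

g = nx.DiGraph()
g.add_edge("nodeA", "nodeB")
nx.set_node_attributes(g, {"nodeA": "scaler", "nodeB": "svc"}, "label")

# Node attributes survive the relabeling; the arbitrary node objects don't.
g_int = nx.convert_node_labels_to_integers(g)
print(list(g_int.nodes(data=True)))  # [(0, {'label': 'scaler'}), (1, {'label': 'svc'})]
```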
class GraphPipeline(SklearnIndividualGenerator):
@@ -753,4 +763,43 @@ def generate(self, rng=None):
func = rng.choice(starting_ops)
func(rng=rng)

return ind
return ind





class GraphKey():
'''
A class that can be used as a key for a graph.
Parameters
----------
graph : (nx.Graph)
The graph to use as a key. Node Attributes are used for the hash.
matched_label : (str)
The node attribute to consider for the hash.
'''

def __init__(self, graph, matched_label='label') -> None:#['hyperparameters', 'method_class']) -> None:


self.graph = graph
self.matched_label = matched_label
self.node_match = partial(node_match, matched_labels=[matched_label])
self.key = int(nx.weisfeiler_lehman_graph_hash(self.graph, node_attr=self.matched_label),16) #hash(tuple(sorted([val for (node, val) in self.graph.degree()])))


#If the hashes differ, the graphs are definitely different
# https://arxiv.org/pdf/2002.06653.pdf
def __hash__(self) -> int:

return self.key

#If the hashes match, use __eq__ (an isomorphism check) to confirm whether the graphs are actually the same
def __eq__(self, other):
return nx.is_isomorphic(self.graph, other.graph, node_match=self.node_match)

def node_match(n1,n2, matched_labels):
return all( [ n1[m] == n2[m] for m in matched_labels])

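The two halves of GraphKey cooperate: the Weisfeiler-Lehman hash is a fast prefilter (per the linked paper, different hashes guarantee non-isomorphic graphs), and __eq__ runs a full isomorphism check to rule out hash collisions. Since isomorphic graphs always share a WL hash, the Python hash/eq contract holds and GraphKey works as a dict key. A usage sketch, assuming GraphKey is imported from this module:

```python
import networkx as nx
# from tpot2.search_spaces.pipelines.graph import GraphKey  # assumed import path

g1 = nx.DiGraph()
g1.add_edge("a", "b")
nx.set_node_attributes(g1, {"a": "scaler", "b": "svc"}, "label")
g2 = nx.relabel_nodes(g1, {"a": 0, "b": 1})  # same pipeline, renamed nodes

seen = {GraphKey(g1): "first"}
print(GraphKey(g2) in seen)  # True: equal WL hash and isomorphic
```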
2 changes: 1 addition & 1 deletion tpot2/search_spaces/pipelines/sequential.py
@@ -47,7 +47,7 @@ def export_pipeline(self):
return sklearn.pipeline.make_pipeline(*[step.export_pipeline() for step in self.pipeline])

def unique_id(self):
return self
return tuple([step.unique_id() for step in self.pipeline])


class SequentialPipeline(SklearnIndividualGenerator):
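Replacing return self with a tuple of per-step ids gives sequential pipelines structural identity: tuples hash and compare element-wise, so two independently generated pipelines with identical steps now collapse to one id. A tiny sketch with made-up step ids:

```python
id_a = (("StandardScaler", "()"), ("SVC", "(('C', 1.0),)"))
id_b = (("StandardScaler", "()"), ("SVC", "(('C', 1.0),)"))
assert id_a == id_b and hash(id_a) == hash(id_b)
```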
21 changes: 16 additions & 5 deletions tpot2/search_spaces/pipelines/wrapper.py
@@ -13,7 +13,8 @@ class WrapperPipelineIndividual(SklearnIndividual):
def __init__(self,
nodegen: SklearnIndividualGenerator,
method: type,
space: ConfigurationSpace,
space: ConfigurationSpace,
hyperparameter_parser: callable = None,
rng=None) -> None:


@@ -30,7 +31,7 @@ def __init__(self,
self.space.seed(rng.integers(0, 2**32))
self.hyperparameters = dict(self.space.sample_configuration())


self.hyperparameters_parser = hyperparameter_parser


def mutate(self, rng=None):
@@ -53,14 +54,24 @@ def crossover(self, other, rng=None):
return self.node.crossover(other.node, rng)

def export_pipeline(self):

if self.hyperparameters_parser is not None:
final_params = self.hyperparameters_parser(self.hyperparameters)
else:
final_params = self.hyperparameters

est = self.node.export_pipeline()
wrapped_est = self.method(est, **self.hyperparameters)
wrapped_est = self.method(est, **final_params)
return wrapped_est


def unique_id(self):
return self.node.unique_id()

if self.hyperparameters_parser is not None:
final_params = self.hyperparameters_parser(self.hyperparameters)
else:
final_params = self.hyperparameters

return (self.method, str(tuple(sorted(list(final_params.items())))) ,self.node.unique_id())


class WrapperPipeline(SklearnIndividualGenerator):
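The new hyperparameter_parser hook lets a wrapper translate raw sampled values into constructor kwargs before method(est, **final_params) runs, mirroring the parser EstimatorNode already supports; the parsed values also feed unique_id so identical wrapped pipelines deduplicate. A hypothetical parser (flag and key names are made up):

```python
from sklearn.calibration import CalibratedClassifierCV
from sklearn.tree import DecisionTreeClassifier

def parser(params):
    # Illustrative translation: rename a sampled key to the kwarg the
    # wrapper's constructor actually expects.
    return {"method": params["calibration_method"]}

sampled = {"calibration_method": "sigmoid"}  # as if drawn from ConfigSpace
wrapped = CalibratedClassifierCV(DecisionTreeClassifier(), **parser(sampled))
```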
8 changes: 4 additions & 4 deletions tpot2/tests/test_estimators.py
@@ -20,7 +20,7 @@ def tpot_estimator():
search_space = tpot2.search_spaces.pipelines.GraphPipeline(
root_search_space= tpot2.config.get_search_space("classifiers", n_samples=n_samples, n_features=n_features, n_classes=n_classes),
leaf_search_space = None,
inner_search_space = tpot2.config.get_search_space(["selectors","transformers","classifiers"],n_samples=n_samples, n_features=n_features, n_classes=n_classes),
inner_search_space = tpot2.config.get_search_space(["selectors","transformers"],n_samples=n_samples, n_features=n_features, n_classes=n_classes),
max_size = 10,
)
return tpot2.TPOTEstimator(
@@ -39,11 +39,11 @@ def tpot_estimator():

@pytest.fixture
def tpot_classifier():
return tpot2.tpot_estimator.templates.TPOTClassifier(max_time_seconds=10,verbose=3)
return tpot2.tpot_estimator.templates.TPOTClassifier(max_time_seconds=10,verbose=0)

@pytest.fixture
def tpot_regressor():
return tpot2.tpot_estimator.templates.TPOTRegressor(max_time_seconds=10,verbose=3)
return tpot2.tpot_estimator.templates.TPOTRegressor(max_time_seconds=10,verbose=0)



@@ -116,7 +116,7 @@ def test_tpot_regressor_fit(tpot_regressor):

scorer = sklearn.metrics.get_scorer('neg_mean_squared_error')
X, y = sklearn.datasets.load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, train_size=0.75, test_size=0.25)
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, train_size=0.05, test_size=0.95)
tpot_regressor.fit(X_train, y_train)
assert tpot_regressor.fitted_pipeline_ is not None

4 changes: 0 additions & 4 deletions tpot2/tests/test_hello_world.py
@@ -17,10 +17,6 @@ def test_hello_world(test_input, expected):
assert test_input is expected


@pytest.mark.xfail(reason="Not yet implemented")
def test_divide_by_zero():
assert 1 / 0 == 1


def test_print(capture_stdout):
print("Hello World")