Merge pull request #127 from perib/new_search_space_def
added unique_id, fixed issue where graphpipeline not correctly identi…
perib authored Apr 19, 2024
2 parents b2a00ed + d61cd29 commit 5448a43
Showing 15 changed files with 626 additions and 736 deletions.
1,101 changes: 532 additions & 569 deletions Tutorial/2_Search_Spaces.ipynb

Large diffs are not rendered by default.

8 changes: 6 additions & 2 deletions tpot2/config/classifiers.py
@@ -307,7 +307,7 @@ def get_LinearDiscriminantAnalysis_ConfigurationSpace():

#### Gradient Boosting Classifiers

def get_GradientBoostingClassifier_ConfigurationSpace(n_features, random_state):
def get_GradientBoostingClassifier_ConfigurationSpace(n_classes, n_features, random_state):
early_stop = Categorical("early_stop", ["off", "valid", "train"])
n_iter_no_change = Integer("n_iter_no_change",bounds=(1,20))
validation_fraction = Float("validation_fraction", bounds=(0.01, 0.4))
@@ -316,7 +316,6 @@ def get_GradientBoostingClassifier_ConfigurationSpace(n_features, random_state):
validation_fraction_cond = EqualsCondition(validation_fraction, early_stop, "valid")

space = {
'loss': Categorical("loss", ['log_loss', 'exponential']),
'learning_rate': Float("learning_rate", bounds=(1e-3, 1), log=True),
'min_samples_leaf': Integer("min_samples_leaf", bounds=(1, 200)),
'min_samples_split': Integer("min_samples_split", bounds=(2, 20)),
@@ -327,6 +326,11 @@ def get_GradientBoostingClassifier_ConfigurationSpace(n_features, random_state):
'tol': 1e-4,
}

if n_classes == 2:
space['loss']= Categorical("loss", ['log_loss', 'exponential'])
else:
space['loss'] = "log_loss"

if random_state is not None: #This is required because configspace doesn't allow None as a value
space['random_state'] = random_state

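Note on the change above: scikit-learn's GradientBoostingClassifier accepts loss='exponential' (AdaBoost-style boosting) only for binary targets; fitting it on three or more classes raises a ValueError, which is why the space now gates the loss choice on n_classes. A minimal sketch of the failure this avoids (illustrative, not part of the diff):

```python
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier

# Three classes: sampling loss='exponential' here would break the fit.
X, y = make_classification(n_samples=60, n_classes=3, n_informative=4,
                           random_state=0)
try:
    GradientBoostingClassifier(loss="exponential").fit(X, y)
except ValueError as err:
    print(err)  # sklearn rejects the exponential loss for multiclass y
```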
2 changes: 1 addition & 1 deletion tpot2/config/get_configspace.py
@@ -223,7 +223,7 @@ def get_configspace(name, n_classes=3, n_samples=100, n_features=100, random_sta
case "RandomForestClassifier":
return classifiers.get_RandomForestClassifier_ConfigurationSpace(n_features=n_features, random_state=random_state)
case "GradientBoostingClassifier":
return classifiers.get_GradientBoostingClassifier_ConfigurationSpace(n_features=n_features, random_state=random_state)
return classifiers.get_GradientBoostingClassifier_ConfigurationSpace(n_classes=n_classes, n_features=n_features, random_state=random_state)
case "HistGradientBoostingClassifier":
return classifiers.get_HistGradientBoostingClassifier_ConfigurationSpace(n_features=n_features, random_state=random_state)
case "XGBClassifier":
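With n_classes threaded through the dispatcher, a binary dataset can still sample the exponential loss while multiclass datasets get a fixed 'log_loss'. A hedged usage sketch (assuming get_configspace is importable from tpot2.config.get_configspace with these keywords):

```python
from tpot2.config.get_configspace import get_configspace  # assumed import path

# Binary problem: the sampled configuration may include loss='exponential'.
space = get_configspace("GradientBoostingClassifier",
                        n_classes=2, n_features=20, random_state=1)
print(dict(space.sample_configuration()))
```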
6 changes: 4 additions & 2 deletions tpot2/graphsklearn.py
@@ -11,6 +11,8 @@
from sklearn.utils.validation import check_memory
from sklearn.preprocessing import LabelEncoder

from sklearn.base import is_classifier, is_regressor

#labels - str
#attributes - "instance" -> instance of the type

@@ -128,7 +130,7 @@ def fit_sklearn_digraph(graph: nx.DiGraph,
# instance.fit(this_X, y)


if issubclass(type(instance), sklearn.base.RegressorMixin) or issubclass(type(instance), sklearn.base.ClassifierMixin):
if is_classifier(instance) or is_regressor(instance):
transformed, instance = estimator_fit_transform_override_cross_val_predict_cached(instance, this_X, y, cv=cross_val_predict_cv, method=method)
else:
transformed, instance = fit_transform_one_cached(instance, this_X, y)#instance.fit_transform(this_X,y)
@@ -168,7 +170,7 @@ def transform_sklearn_digraph(graph: nx.DiGraph,
else:
this_X = np.hstack([transformed_steps[child] for child in get_ordered_successors(graph, node)])

if issubclass(type(instance), sklearn.base.RegressorMixin) or issubclass(type(instance), sklearn.base.ClassifierMixin):
if is_classifier(instance) or is_regressor(instance):
this_method = _method_name(instance.__class__.__name__, instance, method)
transformed = getattr(instance, this_method)(this_X)
else:
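The switch from issubclass checks to is_classifier/is_regressor is more than cosmetic: the helpers test scikit-learn's estimator-type tag rather than the class hierarchy, so duck-typed or wrapped estimators are routed to the cross_val_predict path correctly even when they don't inherit the mixins. An illustrative sketch (class name made up; behavior matches scikit-learn versions contemporary with this commit):

```python
from sklearn.base import BaseEstimator, ClassifierMixin, is_classifier

class DuckClassifier(BaseEstimator):
    # Tagged as a classifier without inheriting ClassifierMixin.
    _estimator_type = "classifier"

print(is_classifier(DuckClassifier()))              # True
print(issubclass(DuckClassifier, ClassifierMixin))  # False
```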
4 changes: 0 additions & 4 deletions tpot2/search_spaces/base.py
@@ -40,10 +40,6 @@ def generate(self, rng=None) -> SklearnIndividual:
pass






def flatten_graphpipeline(est):
flattened_full_graph = est.graph.copy()

2 changes: 1 addition & 1 deletion tpot2/search_spaces/nodes/estimator_node.py
@@ -95,7 +95,7 @@ def export_pipeline(self, **kwargs):

def unique_id(self):
#return a tuple of the method and a canonical (sorted) string of the parsed hyperparameters
return (self.method, self.hyperparameters)
return (self.method, str(tuple(sorted(list(self.hyperparameter_parser(self.hyperparameters).items())))))

class EstimatorNode(SklearnIndividualGenerator):
def __init__(self, method, space, hyperparameter_parser=default_hyperparameter_parser):
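The rewritten unique_id makes the identifier order-insensitive and hashable: two dicts with the same items in different insertion order stringify differently, and dicts themselves can't serve as dict keys, so the parsed hyperparameters are sorted and stringified first. A small sketch of the idea (helper name is illustrative):

```python
def canonical_id(method, params):
    # Sorting removes insertion-order effects; stringifying makes the
    # result safe to embed in a hashable tuple.
    return (method, str(tuple(sorted(params.items()))))

a = canonical_id("KNeighborsClassifier", {"n_neighbors": 5, "weights": "uniform"})
b = canonical_id("KNeighborsClassifier", {"weights": "uniform", "n_neighbors": 5})
assert a == b and hash(a) == hash(b)
```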
2 changes: 1 addition & 1 deletion tpot2/search_spaces/nodes/estimator_node_simple.py
@@ -12,7 +12,7 @@ class EstimatorNodeIndividual(SklearnIndividual):
def __init__(self, method, space ) -> None:
super().__init__()
self.method = method
self.space = space
self.space = space #a dictionary mapping hyperparameter names to their spaces: a list means a categorical choice, a tuple means a continuous range, and a single value means the hyperparameter is fixed

self._mutate_hyperparameters()

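For reference, a space following the convention described in the new comment might look like this (hyperparameter names are illustrative):

```python
space = {
    "criterion": ["gini", "entropy"],  # list   -> categorical choice
    "max_depth": (1, 20),              # tuple  -> range to sample from
    "n_jobs": 1,                       # scalar -> fixed value
}
```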
2 changes: 1 addition & 1 deletion tpot2/search_spaces/nodes/fss_node.py
@@ -60,7 +60,7 @@ def export_pipeline(self):


def unique_id(self):
return self.selected_subset_name
return ("FSS", self.selected_subset_name)


class FSSNode(SklearnIndividualGenerator):
2 changes: 1 addition & 1 deletion tpot2/search_spaces/nodes/genetic_feature_selection.py
@@ -146,7 +146,7 @@ def export_pipeline(self):


def unique_id(self):
return self.mask
return tuple(self.mask)


class GeneticFeatureSelectorNode(SklearnIndividualGenerator):
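Converting the mask to a tuple is what makes the id usable as a dict or set key: lists and NumPy arrays are unhashable, tuples are not. A quick illustration:

```python
import numpy as np

mask = np.array([True, False, True])
try:
    hash(mask)
except TypeError as err:
    print(err)            # numpy arrays are unhashable
print(hash(tuple(mask)))  # the same values as a tuple hash fine
```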
65 changes: 57 additions & 8 deletions tpot2/search_spaces/pipelines/graph.py
@@ -10,6 +10,7 @@
from ..nodes.estimator_node import EstimatorNodeIndividual
from typing import Union, Callable
import sklearn
from functools import partial

class GraphPipelineIndividual(SklearnIndividual):
"""
@@ -100,10 +101,10 @@ def __init__(

self.merge_duplicated_nodes_toggle = True

self.graphkey = None

def mutate(self, rng=None):
rng = np.random.default_rng(rng)
self.key = None

rng.shuffle(self.mutate_methods_list)
for mutate_method in self.mutate_methods_list:
@@ -130,7 +131,8 @@ def mutate(self, rng=None):
print('something went wrong with ', mutate_method)
except:
pass


self.graphkey = None
return True

return False
@@ -335,6 +337,9 @@ def crossover(self, ind2, rng=None):
print('something went wrong with ', crossover_method)
except:
pass

if finished:
self.graphkey = None

return finished

@@ -661,12 +666,17 @@ def plot(self):
plt.show()






def unique_id(self):
return self
if self.graphkey is None:
#copy self.graph
new_graph = self.graph.copy()
for n in new_graph.nodes:
new_graph.nodes[n]['label'] = n.unique_id()

new_graph = nx.convert_node_labels_to_integers(new_graph)
self.graphkey = GraphKey(new_graph)

return self.graphkey


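The new unique_id builds (and caches in self.graphkey, invalidated after a successful mutate or crossover) a canonical copy of the graph: each node object is replaced by an integer and its label attribute is set to that node's own unique_id, so the identity depends only on structure plus per-node ids. A sketch of the relabeling step (node names are illustrative):

```python
import networkx as nx

g = nx.DiGraph()
g.add_edge("nodeA", "nodeB")
nx.set_node_attributes(g, {"nodeA": "scaler", "nodeB": "svc"}, "label")

# Node attributes survive the relabeling; the arbitrary node objects don't.
g_int = nx.convert_node_labels_to_integers(g)
print(list(g_int.nodes(data=True)))  # [(0, {'label': 'scaler'}), (1, {'label': 'svc'})]
```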
class GraphPipeline(SklearnIndividualGenerator):
@@ -753,4 +763,43 @@ def generate(self, rng=None):
func = rng.choice(starting_ops)
func(rng=rng)

return ind
return ind





class GraphKey():
'''
A class that can be used as a key for a graph.
Parameters
----------
graph : (nx.Graph)
The graph to use as a key. Node Attributes are used for the hash.
matched_label : (str)
The node attribute to consider for the hash.
'''

def __init__(self, graph, matched_label='label') -> None:#['hyperparameters', 'method_class']) -> None:


self.graph = graph
self.matched_label = matched_label
self.node_match = partial(node_match, matched_labels=[matched_label])
self.key = int(nx.weisfeiler_lehman_graph_hash(self.graph, node_attr=self.matched_label),16) #hash(tuple(sorted([val for (node, val) in self.graph.degree()])))


#If the hashes differ, the graphs are definitely different
# https://arxiv.org/pdf/2002.06653.pdf
def __hash__(self) -> int:

return self.key

#If the hashes match, use __eq__ (an isomorphism check) to confirm whether the graphs are actually the same
def __eq__(self, other):
return nx.is_isomorphic(self.graph, other.graph, node_match=self.node_match)

def node_match(n1,n2, matched_labels):
return all( [ n1[m] == n2[m] for m in matched_labels])

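The two halves of GraphKey cooperate: the Weisfeiler-Lehman hash is a fast prefilter (per the linked paper, different hashes guarantee non-isomorphic graphs), and __eq__ runs a full isomorphism check to rule out hash collisions. Since isomorphic graphs always share a WL hash, the Python hash/eq contract holds and GraphKey works as a dict key. A usage sketch, assuming GraphKey is imported from this module:

```python
import networkx as nx
# from tpot2.search_spaces.pipelines.graph import GraphKey  # assumed import path

g1 = nx.DiGraph()
g1.add_edge("a", "b")
nx.set_node_attributes(g1, {"a": "scaler", "b": "svc"}, "label")
g2 = nx.relabel_nodes(g1, {"a": 0, "b": 1})  # same pipeline, renamed nodes

seen = {GraphKey(g1): "first"}
print(GraphKey(g2) in seen)  # True: equal WL hash and isomorphic
```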
2 changes: 1 addition & 1 deletion tpot2/search_spaces/pipelines/sequential.py
@@ -47,7 +47,7 @@ def export_pipeline(self):
return sklearn.pipeline.make_pipeline(*[step.export_pipeline() for step in self.pipeline])

def unique_id(self):
return self
return tuple([step.unique_id() for step in self.pipeline])


class SequentialPipeline(SklearnIndividualGenerator):
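Replacing return self with a tuple of per-step ids gives sequential pipelines structural identity: tuples hash and compare element-wise, so two independently generated pipelines with identical steps now collapse to one id. A tiny sketch with made-up step ids:

```python
id_a = (("StandardScaler", "()"), ("SVC", "(('C', 1.0),)"))
id_b = (("StandardScaler", "()"), ("SVC", "(('C', 1.0),)"))
assert id_a == id_b and hash(id_a) == hash(id_b)
```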
21 changes: 16 additions & 5 deletions tpot2/search_spaces/pipelines/wrapper.py
@@ -13,7 +13,8 @@ class WrapperPipelineIndividual(SklearnIndividual):
def __init__(self,
nodegen: SklearnIndividualGenerator,
method: type,
space: ConfigurationSpace,
space: ConfigurationSpace,
hyperparameter_parser: callable = None,
rng=None) -> None:


@@ -30,7 +31,7 @@ def __init__(self,
self.space.seed(rng.integers(0, 2**32))
self.hyperparameters = dict(self.space.sample_configuration())


self.hyperparameters_parser = hyperparameter_parser


def mutate(self, rng=None):
@@ -53,14 +54,24 @@ def crossover(self, other, rng=None):
return self.node.crossover(other.node, rng)

def export_pipeline(self):

if self.hyperparameters_parser is not None:
final_params = self.hyperparameters_parser(self.hyperparameters)
else:
final_params = self.hyperparameters

est = self.node.export_pipeline()
wrapped_est = self.method(est, **self.hyperparameters)
wrapped_est = self.method(est, **final_params)
return wrapped_est


def unique_id(self):
return self.node.unique_id()

if self.hyperparameters_parser is not None:
final_params = self.hyperparameters_parser(self.hyperparameters)
else:
final_params = self.hyperparameters

return (self.method, str(tuple(sorted(list(final_params.items())))) ,self.node.unique_id())


class WrapperPipeline(SklearnIndividualGenerator):
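The new hyperparameter_parser hook lets a wrapper translate raw sampled values into constructor kwargs before method(est, **final_params) runs, mirroring the parser EstimatorNode already supports; the parsed values also feed unique_id so identical wrapped pipelines deduplicate. A hypothetical parser (flag and key names are made up):

```python
from sklearn.calibration import CalibratedClassifierCV
from sklearn.tree import DecisionTreeClassifier

def parser(params):
    # Illustrative translation: rename a sampled key to the kwarg the
    # wrapper's constructor actually expects.
    return {"method": params["calibration_method"]}

sampled = {"calibration_method": "sigmoid"}  # as if drawn from ConfigSpace
wrapped = CalibratedClassifierCV(DecisionTreeClassifier(), **parser(sampled))
```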
8 changes: 4 additions & 4 deletions tpot2/tests/test_estimators.py
@@ -20,7 +20,7 @@ def tpot_estimator():
search_space = tpot2.search_spaces.pipelines.GraphPipeline(
root_search_space= tpot2.config.get_search_space("classifiers", n_samples=n_samples, n_features=n_features, n_classes=n_classes),
leaf_search_space = None,
inner_search_space = tpot2.config.get_search_space(["selectors","transformers","classifiers"],n_samples=n_samples, n_features=n_features, n_classes=n_classes),
inner_search_space = tpot2.config.get_search_space(["selectors","transformers"],n_samples=n_samples, n_features=n_features, n_classes=n_classes),
max_size = 10,
)
return tpot2.TPOTEstimator(
@@ -39,11 +39,11 @@ def tpot_estimator():

@pytest.fixture
def tpot_classifier():
return tpot2.tpot_estimator.templates.TPOTClassifier(max_time_seconds=10,verbose=3)
return tpot2.tpot_estimator.templates.TPOTClassifier(max_time_seconds=10,verbose=0)

@pytest.fixture
def tpot_regressor():
return tpot2.tpot_estimator.templates.TPOTRegressor(max_time_seconds=10,verbose=3)
return tpot2.tpot_estimator.templates.TPOTRegressor(max_time_seconds=10,verbose=0)



@@ -116,7 +116,7 @@ def test_tpot_regressor_fit(tpot_regressor):

scorer = sklearn.metrics.get_scorer('neg_mean_squared_error')
X, y = sklearn.datasets.load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, train_size=0.75, test_size=0.25)
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, train_size=0.05, test_size=0.95)
tpot_regressor.fit(X_train, y_train)
assert tpot_regressor.fitted_pipeline_ is not None

4 changes: 0 additions & 4 deletions tpot2/tests/test_hello_world.py
@@ -17,10 +17,6 @@ def test_hello_world(test_input, expected):
assert test_input is expected


@pytest.mark.xfail(reason="Not yet implemented")
def test_divide_by_zero():
assert 1 / 0 == 1


def test_print(capture_stdout):
print("Hello World")