Skip to content

Commit 944699a

Browse files
authored
Merge pull request #148 from EpistasisLab/dev
Dev
2 parents 96ef8bb + 46f42bb commit 944699a

13 files changed

+2293
-66
lines changed

Tutorial/amltk_search_space_parser_example.ipynb

+1,897
Large diffs are not rendered by default.

setup.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,8 @@ def calculate_version():
5353
extras_require={
5454
'skrebate': ['skrebate>=0.3.4'],
5555
'mdr': ['scikit-mdr>=0.4.4'],
56-
'sklearnex' : ['scikit-learn-intelex>=2023.2.1']
56+
'sklearnex' : ['scikit-learn-intelex>=2023.2.1'],
57+
'amltk' : ['amltk>=1.12.1'],
5758
},
5859
classifiers=[
5960
'Intended Audience :: Science/Research',

tpot2/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,9 @@
88
from .population import Population
99

1010
from . import builtin_modules
11-
from . import utils
1211
from . import config
1312
from . import search_spaces
13+
from . import utils
1414
from . import evolvers
1515
from . import objectives
1616
from . import selectors

tpot2/config/classifiers.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -535,7 +535,7 @@ def MLPClassifier_hyperparameter_parser(params):
535535
def get_GaussianProcessClassifier_ConfigurationSpace(n_features, random_state):
536536
space = {
537537
'n_features': n_features,
538-
'alpha': Float("alpha", bounds=(1e-14, 1.0), log=True),
538+
'alpha': Float("alpha", bounds=(1e-10, 1.0), log=True),
539539
'thetaL': Float("thetaL", bounds=(1e-10, 1e-3), log=True),
540540
'thetaU': Float("thetaU", bounds=(1.0, 100000), log=True),
541541
}

tpot2/config/get_configspace.py

+36-22
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,8 @@
4545
from sklearn.feature_selection import f_classif, f_regression #TODO create a selectomixin using these?
4646
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
4747
from sklearn.gaussian_process import GaussianProcessRegressor, GaussianProcessClassifier
48-
from sklearn.impute import SimpleImputer
48+
from sklearn.experimental import enable_iterative_imputer
49+
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
4950

5051
all_methods = [SGDClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, MLPClassifier, DecisionTreeClassifier, XGBClassifier, KNeighborsClassifier, SVC, LogisticRegression, LGBMClassifier, LinearSVC, GaussianNB, BernoulliNB, MultinomialNB, ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor, DecisionTreeRegressor, KNeighborsRegressor, XGBRegressor, ZeroCount, ColumnOneHotEncoder, Binarizer, FastICA, FeatureAgglomeration, MaxAbsScaler, MinMaxScaler, Normalizer, Nystroem, PCA, PolynomialFeatures, RBFSampler, RobustScaler, StandardScaler, SelectFwe, SelectPercentile, VarianceThreshold, SGDRegressor, Ridge, Lasso, ElasticNet, Lars, LassoLars, LassoLarsCV, RidgeCV, SVR, LinearSVR, AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor, BaggingRegressor, ExtraTreesRegressor, DecisionTreeRegressor, KNeighborsRegressor, ElasticNetCV,
5152
AdaBoostClassifier,MLPRegressor,
@@ -56,7 +57,7 @@
5657
GaussianProcessClassifier, BaggingClassifier,LGBMRegressor,
5758
Passthrough,SkipTransformer,
5859
PassKBinsDiscretizer,
59-
SimpleImputer,
60+
SimpleImputer, IterativeImputer, KNNImputer
6061
]
6162

6263

@@ -124,7 +125,7 @@
124125
"all_transformers" : ["transformers", "scalers"],
125126

126127
"arithmatic": ["AddTransformer", "mul_neg_1_Transformer", "MulTransformer", "SafeReciprocalTransformer", "EQTransformer", "NETransformer", "GETransformer", "GTTransformer", "LETransformer", "LTTransformer", "MinTransformer", "MaxTransformer"],
127-
"imputers": ["SimpleImputer"],
128+
"imputers": ["SimpleImputer", "IterativeImputer", "KNNImputer"],
128129
"skrebate": ["ReliefF", "SURF", "SURFstar", "MultiSURF"],
129130
"genetic_encoders": ["DominantEncoder", "RecessiveEncoder", "HeterosisEncoder", "UnderDominanceEncoder", "OverDominanceEncoder"],
130131

@@ -136,8 +137,6 @@
136137

137138
def get_configspace(name, n_classes=3, n_samples=1000, n_features=100, random_state=None):
138139
match name:
139-
case "SimpleImputer":
140-
return imputers.simple_imputer_cs
141140

142141
#autoqtl_builtins.py
143142
case "FeatureEncodingFrequencySelector":
@@ -352,6 +351,12 @@ def get_configspace(name, n_classes=3, n_samples=1000, n_features=100, random_st
352351
)
353352

354353
#imputers.py
354+
case "SimpleImputer":
355+
return imputers.simple_imputer_cs
356+
case "IterativeImputer":
357+
return imputers.get_IterativeImputer_config_space(n_features=n_features, random_state=random_state)
358+
case "KNNImputer":
359+
return imputers.get_KNNImputer_config_space(n_samples=n_samples)
355360

356361
#mdr_configs.py
357362
case "MDR":
@@ -401,12 +406,12 @@ def get_configspace(name, n_classes=3, n_samples=1000, n_features=100, random_st
401406
raise ValueError(f"Could not find configspace for {name}")
402407

403408

404-
def get_search_space(name, n_classes=3, n_samples=100, n_features=100, random_state=None, return_choice_pipeline=True):
409+
def get_search_space(name, n_classes=3, n_samples=100, n_features=100, random_state=None, return_choice_pipeline=True, base_node=EstimatorNode):
405410

406411

407412
#if list of names, return a list of EstimatorNodes
408413
if isinstance(name, list) or isinstance(name, np.ndarray):
409-
search_spaces = [get_search_space(n, n_classes=n_classes, n_samples=n_samples, n_features=n_features, random_state=random_state, return_choice_pipeline=False) for n in name]
414+
search_spaces = [get_search_space(n, n_classes=n_classes, n_samples=n_samples, n_features=n_features, random_state=random_state, return_choice_pipeline=False, base_node=base_node) for n in name]
410415
#remove Nones
411416
search_spaces = [s for s in search_spaces if s is not None]
412417

@@ -417,12 +422,12 @@ def get_search_space(name, n_classes=3, n_samples=100, n_features=100, random_st
417422

418423
if name in GROUPNAMES:
419424
name_list = GROUPNAMES[name]
420-
return get_search_space(name_list, n_classes=n_classes, n_samples=n_samples, n_features=n_features, random_state=random_state, return_choice_pipeline=return_choice_pipeline)
425+
return get_search_space(name_list, n_classes=n_classes, n_samples=n_samples, n_features=n_features, random_state=random_state, return_choice_pipeline=return_choice_pipeline, base_node=base_node)
421426

422-
return get_node(name, n_classes=n_classes, n_samples=n_samples, n_features=n_features, random_state=random_state)
427+
return get_node(name, n_classes=n_classes, n_samples=n_samples, n_features=n_features, random_state=random_state, base_node=base_node)
423428

424429

425-
def get_node(name, n_classes=3, n_samples=100, n_features=100, random_state=None):
430+
def get_node(name, n_classes=3, n_samples=100, n_features=100, random_state=None, base_node=EstimatorNode):
426431

427432
#these are wrappers that take in another estimator as a parameter
428433
# TODO Add AdaBoostRegressor, AdaBoostClassifier as wrappers? wrap a decision tree with different params?
@@ -443,43 +448,52 @@ def get_node(name, n_classes=3, n_samples=100, n_features=100, random_state=None
443448
sfm_sp = get_configspace(name="SelectFromModel", n_classes=n_classes, n_samples=n_samples, random_state=random_state)
444449
ext = get_node("ExtraTreesRegressor", n_classes=n_classes, n_samples=n_samples, random_state=random_state)
445450
return WrapperPipeline(estimator_search_space=ext, method=SelectFromModel, space=sfm_sp)
446-
451+
# TODO Add IterativeImputer with more estimator methods
452+
'''
453+
if name == "IterativeImputer_learnedestimators":
454+
iterative_sp = get_configspace(name="IterativeImputer", n_classes=n_classes, n_samples=n_samples, random_state=random_state)
455+
regressor_searchspace = get_search_space(["LinearRegression", ..], n_classes=n_classes, n_samples=n_samples, random_state=random_state)
456+
return WrapperPipeline(estimator_search_space=regressor_searchspace, method=IterativeImputer, space=iterative_sp)
457+
'''
447458
#these are nodes that have special search spaces which require custom parsing of the hyperparameters
459+
if name == "IterativeImputer":
460+
configspace = get_configspace(name, n_classes=n_classes, n_samples=n_samples, random_state=random_state)
461+
return EstimatorNode(STRING_TO_CLASS[name], configspace, hyperparameter_parser=imputers.IterativeImputer_hyperparameter_parser)
448462
if name == "RobustScaler":
449463
configspace = get_configspace(name, n_classes=n_classes, n_samples=n_samples, random_state=random_state)
450-
return EstimatorNode(STRING_TO_CLASS[name], configspace, hyperparameter_parser=transformers.robust_scaler_hyperparameter_parser)
464+
return base_node(STRING_TO_CLASS[name], configspace, hyperparameter_parser=transformers.robust_scaler_hyperparameter_parser)
451465
if name == "GradientBoostingClassifier":
452466
configspace = get_configspace(name, n_classes=n_classes, n_samples=n_samples, random_state=random_state)
453-
return EstimatorNode(STRING_TO_CLASS[name], configspace, hyperparameter_parser=classifiers.GradientBoostingClassifier_hyperparameter_parser)
467+
return base_node(STRING_TO_CLASS[name], configspace, hyperparameter_parser=classifiers.GradientBoostingClassifier_hyperparameter_parser)
454468
if name == "HistGradientBoostingClassifier":
455469
configspace = get_configspace(name, n_classes=n_classes, n_samples=n_samples, random_state=random_state)
456-
return EstimatorNode(STRING_TO_CLASS[name], configspace, hyperparameter_parser=classifiers.HistGradientBoostingClassifier_hyperparameter_parser)
470+
return base_node(STRING_TO_CLASS[name], configspace, hyperparameter_parser=classifiers.HistGradientBoostingClassifier_hyperparameter_parser)
457471
if name == "GradientBoostingRegressor":
458472
configspace = get_configspace(name, n_classes=n_classes, n_samples=n_samples, random_state=random_state)
459-
return EstimatorNode(STRING_TO_CLASS[name], configspace, hyperparameter_parser=regressors.GradientBoostingRegressor_hyperparameter_parser)
473+
return base_node(STRING_TO_CLASS[name], configspace, hyperparameter_parser=regressors.GradientBoostingRegressor_hyperparameter_parser)
460474
if name == "HistGradientBoostingRegressor":
461475
configspace = get_configspace(name, n_classes=n_classes, n_samples=n_samples, random_state=random_state)
462-
return EstimatorNode(STRING_TO_CLASS[name], configspace, hyperparameter_parser=regressors.HistGradientBoostingRegressor_hyperparameter_parser)
476+
return base_node(STRING_TO_CLASS[name], configspace, hyperparameter_parser=regressors.HistGradientBoostingRegressor_hyperparameter_parser)
463477
if name == "MLPClassifier":
464478
configspace = get_configspace(name, n_classes=n_classes, n_samples=n_samples, random_state=random_state)
465-
return EstimatorNode(STRING_TO_CLASS[name], configspace, hyperparameter_parser=classifiers.MLPClassifier_hyperparameter_parser)
479+
return base_node(STRING_TO_CLASS[name], configspace, hyperparameter_parser=classifiers.MLPClassifier_hyperparameter_parser)
466480
if name == "MLPRegressor":
467481
configspace = get_configspace(name, n_classes=n_classes, n_samples=n_samples, random_state=random_state)
468-
return EstimatorNode(STRING_TO_CLASS[name], configspace, hyperparameter_parser=regressors.MLPRegressor_hyperparameter_parser)
482+
return base_node(STRING_TO_CLASS[name], configspace, hyperparameter_parser=regressors.MLPRegressor_hyperparameter_parser)
469483
if name == "GaussianProcessRegressor":
470484
configspace = get_configspace(name, n_classes=n_classes, n_samples=n_samples, random_state=random_state)
471-
return EstimatorNode(STRING_TO_CLASS[name], configspace, hyperparameter_parser=regressors.GaussianProcessRegressor_hyperparameter_parser)
485+
return base_node(STRING_TO_CLASS[name], configspace, hyperparameter_parser=regressors.GaussianProcessRegressor_hyperparameter_parser)
472486
if name == "GaussianProcessClassifier":
473487
configspace = get_configspace(name, n_classes=n_classes, n_samples=n_samples, random_state=random_state)
474-
return EstimatorNode(STRING_TO_CLASS[name], configspace, hyperparameter_parser=classifiers.GaussianProcessClassifier_hyperparameter_parser)
488+
return base_node(STRING_TO_CLASS[name], configspace, hyperparameter_parser=classifiers.GaussianProcessClassifier_hyperparameter_parser)
475489
if name == "FeatureAgglomeration":
476490
configspace = get_configspace(name, n_features=n_features)
477-
return EstimatorNode(STRING_TO_CLASS[name], configspace, hyperparameter_parser=transformers.FeatureAgglomeration_hyperparameter_parser)
491+
return base_node(STRING_TO_CLASS[name], configspace, hyperparameter_parser=transformers.FeatureAgglomeration_hyperparameter_parser)
478492

479493
configspace = get_configspace(name, n_classes=n_classes, n_samples=n_samples, n_features=n_features, random_state=random_state)
480494
if configspace is None:
481495
#raise warning
482496
warnings.warn(f"Could not find configspace for {name}")
483497
return None
484498

485-
return EstimatorNode(STRING_TO_CLASS[name], configspace)
499+
return base_node(STRING_TO_CLASS[name], configspace)

tpot2/config/imputers.py

+74-3
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,80 @@
1+
import sklearn
2+
import sklearn.ensemble
3+
import sklearn.linear_model
4+
import sklearn.neighbors
15
from ConfigSpace import ConfigurationSpace
26
from ConfigSpace import ConfigurationSpace, Integer, Float, Categorical, Normal
7+
from ConfigSpace import EqualsCondition
8+
39

410
# Search space for SimpleImputer: only the imputation strategy is tuned.
# 'add_indicator' was removed on purpose -- it appends a mask next to the
# rest of the data and can cause errors. (gk)
simple_imputer_cs = ConfigurationSpace(
    space={
        'strategy': Categorical(
            'strategy',
            ['mean', 'median', 'most_frequent', 'constant'],
        ),
    },
)
20+
21+
def get_IterativeImputer_config_space(n_features, random_state):
    """Build the ConfigurationSpace used to tune an IterativeImputer.

    Parameters
    ----------
    n_features
        Upper bound of the ``n_nearest_features`` integer range (the lower
        bound is 1).
    random_state
        When not None, stored in the space under ``'random_state'``.
        When None, the key is omitted entirely.

    Returns
    -------
    ConfigurationSpace
        Space containing ``initial_strategy``, ``n_nearest_features``,
        ``imputation_order``, ``estimator`` and ``sample_posterior``
        (plus ``random_state`` when given).
    """
    hyperparameters = {
        'initial_strategy': Categorical(
            'initial_strategy',
            ['mean', 'median', 'most_frequent', 'constant'],
        ),
        'n_nearest_features': Integer(
            'n_nearest_features',
            bounds=(1, n_features),
        ),
        'imputation_order': Categorical(
            'imputation_order',
            ['ascending', 'descending', 'roman', 'arabic', 'random'],
        ),
    }

    # ConfigSpace doesn't allow None as a value, so the entry is only
    # added when a random_state was actually supplied.
    if random_state is not None:
        hyperparameters['random_state'] = random_state

    config_space = ConfigurationSpace(space=hyperparameters)

    estimator_choice = Categorical('estimator', ['Bayesian', 'RFR', 'Ridge', 'KNN'])
    posterior_flag = Categorical('sample_posterior', [True, False])
    config_space.add_hyperparameters([estimator_choice, posterior_flag])

    # sample_posterior is conditional on the 'Bayesian' estimator being chosen.
    config_space.add_conditions(
        [EqualsCondition(posterior_flag, estimator_choice, 'Bayesian')]
    )
    return config_space
44+
45+
def get_KNNImputer_config_space(n_samples):
    """Return the ConfigurationSpace used to tune a KNNImputer.

    ``n_neighbors`` is an integer from 1 to ``max(n_samples, 100)`` and
    ``weights`` is either ``'uniform'`` or ``'distance'``.
    """
    # NOTE(review): with max() the upper bound is never below 100 and can
    # exceed n_samples on small datasets -- confirm that
    # min(n_samples, 100) was not the intended cap.
    neighbor_cap = max(n_samples, 100)

    return ConfigurationSpace(
        space={
            'n_neighbors': Integer('n_neighbors', bounds=(1, neighbor_cap)),
            'weights': Categorical('weights', ['uniform', 'distance']),
        }
    )
54+
55+
def IterativeImputer_hyperparameter_parser(params):
56+
est = params['estimator']
57+
match est:
58+
case 'Bayesian':
59+
estimator = sklearn.linear_model.BayesianRidge()
60+
case 'RFR':
61+
estimator = sklearn.ensemble.RandomForestRegressor()
62+
case 'Ridge':
63+
estimator = sklearn.linear_model.Ridge()
64+
case 'KNN':
65+
estimator = sklearn.neighbors.KNeighborsRegressor()
66+
67+
final_params = {
68+
'estimator' : estimator,
69+
'initial_strategy' : params['initial_strategy'],
70+
'n_nearest_features' : params['n_nearest_features'],
71+
'imputation_order' : params['imputation_order'],
72+
}
73+
74+
if 'sample_posterior' in params:
75+
final_params['sample_posterior'] = params['sample_posterior']
76+
77+
if 'random_state' in params:
78+
final_params['random_state'] = params['random_state']
79+
80+
return final_params

tpot2/config/regressors.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -354,7 +354,7 @@ def get_ExtraTreesRegressor_ConfigurationSpace(random_state):
354354
def get_GaussianProcessRegressor_ConfigurationSpace(n_features, random_state):
355355
space = {
356356
'n_features': n_features,
357-
'alpha': Float("alpha", bounds=(1e-14, 1.0), log=True),
357+
'alpha': Float("alpha", bounds=(1e-10, 1.0), log=True),
358358
'thetaL': Float("thetaL", bounds=(1e-10, 1e-3), log=True),
359359
'thetaU': Float("thetaU", bounds=(1.0, 100000), log=True),
360360
}

tpot2/search_spaces/base.py

-8
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,10 @@
11
import tpot2
2-
import numpy as np
3-
import pandas as pd
42
import sklearn
5-
from tpot2 import config
6-
from typing import Generator, List, Tuple, Union
7-
import random
83
from sklearn.base import BaseEstimator
94
import sklearn
105
import networkx as nx
116
from . import graph_utils
127
from typing import final
13-
from abc import ABC, abstractmethod
14-
15-
168

179

1810

0 commit comments

Comments
 (0)