Commit fedc90d

Merge pull request #136 from perib/new_search_space_def
New search space def
2 parents 378149f + 8056594 commit fedc90d

16 files changed (+244, -65 lines)

tpot2/builtin_modules/__init__.py (+2 -1)

@@ -5,4 +5,5 @@
 from .arithmetictransformer import AddTransformer, mul_neg_1_Transformer, MulTransformer, SafeReciprocalTransformer, EQTransformer, NETransformer, GETransformer, GTTransformer, LETransformer, LTTransformer, MinTransformer, MaxTransformer, ZeroTransformer, OneTransformer, NTransformer
 from .passthrough import Passthrough
 from .imputer import ColumnSimpleImputer
-from .estimatortransformer import EstimatorTransformer
+from .estimatortransformer import EstimatorTransformer
+from .passkbinsdiscretizer import PassKBinsDiscretizer

tpot2/builtin_modules/column_one_hot_encoder.py (+1 -1)

@@ -37,7 +37,7 @@ def _X_selected(X, selected):
 class ColumnOneHotEncoder(BaseEstimator, TransformerMixin):


-    def __init__(self, columns='auto', drop=None, handle_unknown='error', sparse_output=False, min_frequency=None,max_categories=None):
+    def __init__(self, columns='auto', drop=None, handle_unknown='infrequent_if_exist', sparse_output=False, min_frequency=None,max_categories=None):
         '''

         Parameters
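The handle_unknown default above moves from 'error' to 'infrequent_if_exist', so categories first seen at transform time no longer raise. A minimal sketch of the underlying scikit-learn behavior (plain OneHotEncoder rather than the TPOT2 wrapper, assuming a scikit-learn recent enough for these arguments):

from sklearn.preprocessing import OneHotEncoder

# Fit on two known categories, then transform an unseen one.
enc = OneHotEncoder(handle_unknown='infrequent_if_exist', sparse_output=False)
enc.fit([['a'], ['b']])
print(enc.transform([['c']]))  # no error: the unseen category maps to the infrequent/all-zero encoding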
tpot2/builtin_modules/passkbinsdiscretizer.py (new file, +43; path per the import added to __init__.py above)

@@ -0,0 +1,43 @@
+import pandas as pd
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.compose import ColumnTransformer
+from sklearn.preprocessing import KBinsDiscretizer
+import numpy as np
+
+def select_features(X, min_unique=10,):
+
+    if isinstance(X, pd.DataFrame):
+        return [col for col in X.columns if len(X[col].unique()) > min_unique]
+    else:
+        return [i for i in range(X.shape[1]) if len(np.unique(X[:, i])) > min_unique]
+
+class PassKBinsDiscretizer(BaseEstimator, TransformerMixin):
+    """
+    Same as sklearn.preprocessing.KBinsDiscretizer, but passes through columns that are not discretized due to having fewer than n_bins unique values instead of ignoring them.
+    """
+    def __init__(self, n_bins=5, encode='onehot-dense', strategy='quantile', subsample='warn', random_state=None):
+        self.n_bins = n_bins
+        self.encode = encode
+        self.strategy = strategy
+        self.subsample = subsample
+        self.random_state = random_state
+
+    def fit(self, X, y=None):
+        # Identify columns with more than n unique values
+        # Create a ColumnTransformer to select and discretize the chosen columns
+        self.selected_columns_ = select_features(X, min_unique=10)
+        if isinstance(X, pd.DataFrame):
+            self.not_selected_columns_ = [col for col in X.columns if col not in self.selected_columns_]
+        else:
+            self.not_selected_columns_ = [i for i in range(X.shape[1]) if i not in self.selected_columns_]
+
+        enc = KBinsDiscretizer(n_bins=self.n_bins, encode=self.encode, strategy=self.strategy, subsample=self.subsample, random_state=self.random_state)
+        self.transformer = ColumnTransformer([
+            ('discretizer', enc, self.selected_columns_),
+            ('passthrough', 'passthrough', self.not_selected_columns_)
+        ])
+        self.transformer.fit(X)
+        return self
+
+    def transform(self, X):
+        return self.transformer.transform(X)
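A minimal usage sketch of the new transformer (illustration only, not part of the commit; assumes a scikit-learn version whose KBinsDiscretizer accepts the subsample argument): columns with more than 10 unique values are binned and one-hot encoded, while low-cardinality columns pass through unchanged.

import numpy as np
import pandas as pd
from tpot2.builtin_modules import PassKBinsDiscretizer

rng = np.random.default_rng(0)
X = pd.DataFrame({
    "continuous": rng.normal(size=100),      # >10 unique values -> discretized into 5 one-hot bins
    "binary": rng.integers(0, 2, size=100),  # <=10 unique values -> passed through unchanged
})

disc = PassKBinsDiscretizer(n_bins=5, encode="onehot-dense", strategy="quantile")
Xt = disc.fit(X).transform(X)
print(Xt.shape)  # expected (100, 6): five bin columns plus the untouched binary column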

tpot2/config/classifiers.py (+30 -6)

@@ -386,10 +386,18 @@ def GradientBoostingClassifier_hyperparameter_parser(params):
         final_params['n_iter_no_change'] = None
         final_params['validation_fraction'] = None
     elif params['early_stop'] == 'valid':
-        final_params['n_iter_no_change'] = params['n_iter_no_change']
-        final_params['validation_fraction'] = params['validation_fraction']
+        #this is required because in crossover, its possible that n_iter_no_change is not in the params
+        if 'n_iter_no_change' not in params:
+            final_params['n_iter_no_change'] = 10
+        else:
+            final_params['n_iter_no_change'] = params['n_iter_no_change']
+        if 'validation_fraction' not in params:
+            final_params['validation_fraction'] = 0.1
+        else:
+            final_params['validation_fraction'] = params['validation_fraction']
     elif params['early_stop'] == 'train':
-        final_params['n_iter_no_change'] = params['n_iter_no_change']
+        if 'n_iter_no_change' not in params:
+            final_params['n_iter_no_change'] = 10
         final_params['validation_fraction'] = None


@@ -445,16 +453,32 @@ def HistGradientBoostingClassifier_hyperparameter_parser(params):
     final_params['random_state'] = params['random_state']


+
     if params['early_stop'] == 'off':
         # final_params['n_iter_no_change'] = 0
         final_params['validation_fraction'] = None
         final_params['early_stopping'] = False
     elif params['early_stop'] == 'valid':
-        final_params['n_iter_no_change'] = params['n_iter_no_change']
-        final_params['validation_fraction'] = params['validation_fraction']
+
+        #this is required because in crossover, its possible that n_iter_no_change is not in the params
+        if 'n_iter_no_change' not in params:
+            final_params['n_iter_no_change'] = 10
+        else:
+            final_params['n_iter_no_change'] = params['n_iter_no_change']
+        if 'validation_fraction' not in params:
+            final_params['validation_fraction'] = 0.1
+        else:
+            final_params['validation_fraction'] = params['validation_fraction']
+
         final_params['early_stopping'] = True
     elif params['early_stop'] == 'train':
-        final_params['n_iter_no_change'] = params['n_iter_no_change']
+
+        if 'n_iter_no_change' not in params:
+            final_params['n_iter_no_change'] = 10
+        else:
+            final_params['n_iter_no_change'] = params['n_iter_no_change']
+
+
         final_params['validation_fraction'] = None
         final_params['early_stopping'] = True
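The added branches guard against hyperparameter dictionaries produced during crossover that are missing n_iter_no_change or validation_fraction, substituting the defaults 10 and 0.1. An equivalent, more compact way to express the same fallback as the 'valid' branches (illustration only, not what the commit uses); note that in the 'train' branch of GradientBoostingClassifier_hyperparameter_parser above, the diff only assigns the default when the key is absent:

final_params['n_iter_no_change'] = params.get('n_iter_no_change', 10)
final_params['validation_fraction'] = params.get('validation_fraction', 0.1)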

tpot2/config/get_configspace.py (+5 -2)

@@ -26,7 +26,7 @@
 from tpot2.builtin_modules import genetic_encoders, feature_encoding_frequency_selector
 from tpot2.builtin_modules import AddTransformer, mul_neg_1_Transformer, MulTransformer, SafeReciprocalTransformer, EQTransformer, NETransformer, GETransformer, GTTransformer, LETransformer, LTTransformer, MinTransformer, MaxTransformer, ZeroTransformer, OneTransformer, NTransformer
 from tpot2.builtin_modules.genetic_encoders import DominantEncoder, RecessiveEncoder, HeterosisEncoder, UnderDominanceEncoder, OverDominanceEncoder
-from tpot2.builtin_modules import ZeroCount, ColumnOneHotEncoder
+from tpot2.builtin_modules import ZeroCount, ColumnOneHotEncoder, PassKBinsDiscretizer
 from tpot2.builtin_modules import Passthrough
 from sklearn.linear_model import SGDClassifier, LogisticRegression, SGDRegressor, Ridge, Lasso, ElasticNet, Lars, LassoLars, LassoLarsCV, RidgeCV, ElasticNetCV, PassiveAggressiveClassifier, ARDRegression
 from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, ExtraTreesRegressor, ExtraTreesClassifier, AdaBoostRegressor, AdaBoostClassifier, GradientBoostingRegressor,RandomForestRegressor, BaggingRegressor, ExtraTreesRegressor, HistGradientBoostingClassifier, HistGradientBoostingRegressor

@@ -55,6 +55,7 @@
     DominantEncoder, RecessiveEncoder, HeterosisEncoder, UnderDominanceEncoder, OverDominanceEncoder,
     GaussianProcessClassifier, BaggingClassifier,LGBMRegressor,
     Passthrough,
+    PassKBinsDiscretizer,
 ]


@@ -117,7 +118,7 @@
     "regressors" : ["LGBMRegressor", 'AdaBoostRegressor', "ARDRegression", 'DecisionTreeRegressor', 'ExtraTreesRegressor', 'HistGradientBoostingRegressor', 'KNeighborsRegressor', 'LinearSVR', "MLPRegressor", 'RandomForestRegressor', 'SGDRegressor', 'SVR', 'XGBRegressor'],


-    "transformers": ["Binarizer", "PCA", "ZeroCount", "ColumnOneHotEncoder", "FastICA", "FeatureAgglomeration", "Nystroem", "RBFSampler", "QuantileTransformer", "PowerTransformer"],
+    "transformers": ["PassKBinsDiscretizer", "Binarizer", "PCA", "ZeroCount", "ColumnOneHotEncoder", "FastICA", "FeatureAgglomeration", "Nystroem", "RBFSampler", "QuantileTransformer", "PowerTransformer"],
     "scalers": ["MinMaxScaler", "RobustScaler", "StandardScaler", "MaxAbsScaler", "Normalizer", ],
     "all_transformers" : ["transformers", "scalers"],

@@ -291,6 +292,8 @@ def get_configspace(name, n_classes=3, n_samples=1000, n_features=100, random_st
             return transformers.PolynomialFeatures_configspace
         case "StandardScaler":
             return {}
+        case "PassKBinsDiscretizer":
+            return transformers.get_passkbinsdiscretizer_configspace(random_state=random_state)

         #selectors.py
         case "SelectFwe":

tpot2/config/regressors.py (+25 -6)

@@ -436,10 +436,20 @@ def GradientBoostingRegressor_hyperparameter_parser(params):
         final_params['n_iter_no_change'] = None
         final_params['validation_fraction'] = None
     elif params['early_stop'] == 'valid':
-        final_params['n_iter_no_change'] = params['n_iter_no_change']
-        final_params['validation_fraction'] = params['validation_fraction']
+        #this is required because in crossover, its possible that n_iter_no_change is not in the params
+        if 'n_iter_no_change' not in params:
+            final_params['n_iter_no_change'] = 10
+        else:
+            final_params['n_iter_no_change'] = params['n_iter_no_change']
+        if 'validation_fraction' not in params:
+            final_params['validation_fraction'] = 0.1
+        else:
+            final_params['validation_fraction'] = params['validation_fraction']
     elif params['early_stop'] == 'train':
-        final_params['n_iter_no_change'] = params['n_iter_no_change']
+        if 'n_iter_no_change' not in params:
+            final_params['n_iter_no_change'] = 10
+        else:
+            final_params['n_iter_no_change'] = params['n_iter_no_change']
         final_params['validation_fraction'] = None


@@ -498,11 +508,20 @@ def HistGradientBoostingRegressor_hyperparameter_parser(params):
         # final_params['validation_fraction'] = None
         final_params['early_stopping'] = False
     elif params['early_stop'] == 'valid':
-        final_params['n_iter_no_change'] = params['n_iter_no_change']
-        final_params['validation_fraction'] = params['validation_fraction']
+        if 'n_iter_no_change' not in params:
+            final_params['n_iter_no_change'] = 10
+        else:
+            final_params['n_iter_no_change'] = params['n_iter_no_change']
+        if 'validation_fraction' not in params:
+            final_params['validation_fraction'] = 0.1
+        else:
+            final_params['validation_fraction'] = params['validation_fraction']
         final_params['early_stopping'] = True
     elif params['early_stop'] == 'train':
-        final_params['n_iter_no_change'] = params['n_iter_no_change']
+        if 'n_iter_no_change' not in params:
+            final_params['n_iter_no_change'] = 10
+        else:
+            final_params['n_iter_no_change'] = params['n_iter_no_change']
         final_params['validation_fraction'] = None
         final_params['early_stopping'] = True

tpot2/config/transformers.py (+20 -1)

@@ -124,6 +124,22 @@ def get_QuantileTransformer_configspace(random_state=None):
     )


+def get_passkbinsdiscretizer_configspace(random_state=None):
+    space = {
+        'n_bins': Integer('n_bins', bounds=(3, 100)),
+        'encode': 'onehot-dense',
+        'strategy': Categorical('strategy', ['uniform', 'quantile', 'kmeans']),
+        # 'subsample': Categorical('subsample', ['auto', 'warn', 'ignore']),
+    }
+
+    if random_state is not None: #This is required because configspace doesn't allow None as a value
+        space['random_state'] = random_state
+
+    return ConfigurationSpace(
+        space = space
+    )
+

 ### ROBUST SCALER

@@ -133,4 +149,7 @@ def get_QuantileTransformer_configspace(random_state=None):
     })

 def robust_scaler_hyperparameter_parser(params):
-    return {"quantile_range": (params["q_min"], params["q_max"])}
+    return {"quantile_range": (params["q_min"], params["q_max"])}
+
+
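For context, a sketch of how a configuration sampled from this space maps back onto the transformer's constructor (illustration only; the import paths are assumptions, and converting a sampled Configuration to a dict varies with the ConfigSpace version):

from tpot2.builtin_modules import PassKBinsDiscretizer
from tpot2.config.transformers import get_passkbinsdiscretizer_configspace  # assumed import path

cs = get_passkbinsdiscretizer_configspace(random_state=42)
conf = cs.sample_configuration()
params = dict(conf)  # or conf.get_dictionary() on older ConfigSpace releases
est = PassKBinsDiscretizer(**params)  # n_bins, encode, strategy, random_state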

tpot2/evolvers/base_evolver.py (+3 -2)

@@ -498,7 +498,7 @@ def optimize(self, generations=None):
             self._client.close()
             self._cluster.close()

-        tpot2.utils.get_pareto_frontier(self.population.evaluated_individuals, column_names=self.objective_names, weights=self.objective_function_weights, invalid_values=["TIMEOUT","INVALID"])
+        tpot2.utils.get_pareto_frontier(self.population.evaluated_individuals, column_names=self.objective_names, weights=self.objective_function_weights)

     def step(self,):
         if self.population_size_list is not None:

@@ -624,7 +624,7 @@ def evaluate_population_full(self, budget=None):
             parallel_timeout = 10

         #scores = tpot2.utils.eval_utils.parallel_eval_objective_list(individuals_to_evaluate, self.objective_functions, self.n_jobs, verbose=self.verbose, timeout=self.max_eval_time_seconds, budget=budget, n_expected_columns=len(self.objective_names), client=self._client, parallel_timeout=parallel_timeout, **self.objective_kwargs)
-        scores, start_times, end_times, eval_errors = tpot2.utils.eval_utils.parallel_eval_objective_list2(individuals_to_evaluate, self.objective_functions, verbose=self.verbose, max_eval_time_seconds=self.max_eval_time_seconds, budget=budget, n_expected_columns=len(self.objective_names), client=self._client, **self.objective_kwargs)
+        scores, start_times, end_times, eval_errors = tpot2.utils.eval_utils.parallel_eval_objective_list2(individuals_to_evaluate, self.objective_functions, verbose=self.verbose, max_eval_time_seconds=self.max_eval_time_seconds, budget=budget, n_expected_columns=len(self.objective_names), client=self._client, scheduled_timeout_time=self.scheduled_timeout_time, **self.objective_kwargs)

         self.population.update_column(individuals_to_evaluate, column_names=self.objective_names, data=scores)
         if budget is not None:

@@ -705,6 +705,7 @@ def evaluate_population_selection_early_stop(self,survival_counts, thresholds=No
                 generation = self.generation,
                 n_expected_columns=len(self.objective_names),
                 client=self._client,
+                scheduled_timeout_time=self.scheduled_timeout_time,
                 **self.objective_kwargs,
                 )

tpot2/evolvers/steady_state_evolver.py (+6 -6)

@@ -342,7 +342,7 @@ def optimize(self):
             ###############################
             if self.verbose >= 3:
                 sign = np.sign(self.objective_function_weights)
-                valid_df = self.population.evaluated_individuals[~self.population.evaluated_individuals[self.objective_names].isin(["TIMEOUT","INVALID"]).any(axis=1)][self.objective_names]*sign
+                valid_df = self.population.evaluated_individuals[~self.population.evaluated_individuals[["Eval Error"]].isin(["TIMEOUT","INVALID"]).any(axis=1)][self.objective_names]*sign
                 cur_best_scores = valid_df.max(axis=0)*sign
                 cur_best_scores = cur_best_scores.to_numpy()
                 for i, obj in enumerate(self.objective_names):

@@ -353,7 +353,7 @@ def optimize(self):
                 #get sign of objective_function_weights
                 sign = np.sign(self.objective_function_weights)
                 #get best score for each objective
-                valid_df = self.population.evaluated_individuals[~self.population.evaluated_individuals[self.objective_names].isin(["TIMEOUT","INVALID"]).any(axis=1)][self.objective_names]*sign
+                valid_df = self.population.evaluated_individuals[~self.population.evaluated_individuals[["Eval Error"]].isin(["TIMEOUT","INVALID"]).any(axis=1)][self.objective_names]*sign
                 cur_best_scores = valid_df.max(axis=0)
                 cur_best_scores = cur_best_scores.to_numpy()
                 #cur_best_scores = self.population.get_column(self.population.population, column_names=self.objective_names).max(axis=0)*sign #TODO this assumes the current population is the best

@@ -499,7 +499,7 @@ def optimize(self):
             elif len(submitted_futures) < self.max_queue_size:

                 initial_population = self.population.evaluated_individuals.iloc[:self.initial_population_size*3]
-                invalid_initial_population = initial_population[initial_population[self.objective_names].isin(["TIMEOUT","INVALID"]).any(axis=1)]
+                invalid_initial_population = initial_population[initial_population[["Eval Error"]].isin(["TIMEOUT","INVALID"]).any(axis=1)]
                 if len(invalid_initial_population) >= self.initial_population_size*3: #if all individuals in the 3*initial population are invalid
                     raise Exception("No individuals could be evaluated in the initial population. This may indicate a bug in the configuration, included models, or objective functions. Set verbose>=4 to see the errors that caused individuals to fail.")

@@ -540,8 +540,8 @@ def optimize(self):
         # Step 7: Cleanup
         ###############################

-        self.population.remove_invalid_from_population(column_names=self.objective_names, invalid_value="INVALID")
-        self.population.remove_invalid_from_population(column_names=self.objective_names, invalid_value="TIMEOUT")
+        self.population.remove_invalid_from_population(column_names="Eval Error", invalid_value="INVALID")
+        self.population.remove_invalid_from_population(column_names="Eval Error", invalid_value="TIMEOUT")


         #done, cleanup futures

@@ -556,7 +556,7 @@ def optimize(self):
             self._client.close()
             self._cluster.close()

-        tpot2.utils.get_pareto_frontier(self.population.evaluated_individuals, column_names=self.objective_names, weights=self.objective_function_weights, invalid_values=["TIMEOUT","INVALID"])
+        tpot2.utils.get_pareto_frontier(self.population.evaluated_individuals, column_names=self.objective_names, weights=self.objective_function_weights)
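These edits key all validity checks off the dedicated "Eval Error" column rather than scanning the objective columns for "TIMEOUT"/"INVALID" strings, which keeps the objective columns numeric. A small standalone pandas sketch of the same filter expression (hypothetical data, not taken from TPOT2):

import pandas as pd

df = pd.DataFrame({
    "roc_auc":    [0.91, None, 0.88],
    "complexity": [12,   None, 30],
    "Eval Error": [None, "TIMEOUT", None],
})

valid_df = df[~df[["Eval Error"]].isin(["TIMEOUT", "INVALID"]).any(axis=1)]
print(valid_df[["roc_auc", "complexity"]].max(axis=0))  # best value per objective, errored rows excluded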

tpot2/search_spaces/nodes/genetic_feature_selection.py (+5)

@@ -50,6 +50,11 @@ def __init__( self,
         else:
             self.mask = mask

+        # check if there are no features selected, if so select one
+        if sum(self.mask) == 0:
+            index = rng.choice(len(self.mask))
+            self.mask[index] = True
+
         self.mutation_list = [self._mutate_add, self._mutate_remove]
         self.crossover_list = [self._crossover_swap]

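The new guard keeps the genetic feature-selection node from starting with an empty mask; an all-False mask would select zero features and break any downstream step. A standalone sketch of the guard's effect on a hypothetical mask (not taken from the repo):

import numpy as np

rng = np.random.default_rng(0)
mask = np.zeros(8, dtype=bool)  # no features selected yet

if sum(mask) == 0:
    index = rng.choice(len(mask))
    mask[index] = True

print(mask.sum())  # 1 -> at least one feature is always selected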