New search space def #136

Merged: 7 commits, May 24, 2024
3 changes: 2 additions & 1 deletion tpot2/builtin_modules/__init__.py
@@ -5,4 +5,5 @@
from .arithmetictransformer import AddTransformer, mul_neg_1_Transformer, MulTransformer, SafeReciprocalTransformer, EQTransformer, NETransformer, GETransformer, GTTransformer, LETransformer, LTTransformer, MinTransformer, MaxTransformer, ZeroTransformer, OneTransformer, NTransformer
from .passthrough import Passthrough
from .imputer import ColumnSimpleImputer
from .estimatortransformer import EstimatorTransformer
from .estimatortransformer import EstimatorTransformer
from .passkbinsdiscretizer import PassKBinsDiscretizer
2 changes: 1 addition & 1 deletion tpot2/builtin_modules/column_one_hot_encoder.py
@@ -37,7 +37,7 @@ def _X_selected(X, selected):
class ColumnOneHotEncoder(BaseEstimator, TransformerMixin):


def __init__(self, columns='auto', drop=None, handle_unknown='error', sparse_output=False, min_frequency=None,max_categories=None):
def __init__(self, columns='auto', drop=None, handle_unknown='infrequent_if_exist', sparse_output=False, min_frequency=None,max_categories=None):
'''

Parameters
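Note on the default change above: with sklearn's OneHotEncoder, handle_unknown='infrequent_if_exist' maps categories unseen during fit to the infrequent bucket when one exists and otherwise encodes them as all zeros, while the old default of 'error' raised a ValueError. A minimal sketch of that behavior (sklearn >= 1.2 assumed, since it uses sparse_output):

import numpy as np
from sklearn.preprocessing import OneHotEncoder

# fit sees only 'a' and 'b'; 'c' is unknown at transform time
enc = OneHotEncoder(handle_unknown='infrequent_if_exist', sparse_output=False)
enc.fit(np.array([['a'], ['b'], ['a']]))
print(enc.transform(np.array([['c']])))  # [[0. 0.]] rather than a ValueError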
43 changes: 43 additions & 0 deletions tpot2/builtin_modules/passkbinsdiscretizer.py
@@ -0,0 +1,43 @@
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import KBinsDiscretizer
import numpy as np

def select_features(X, min_unique=10,):

if isinstance(X, pd.DataFrame):
return [col for col in X.columns if len(X[col].unique()) > min_unique]
else:
return [i for i in range(X.shape[1]) if len(np.unique(X[:, i])) > min_unique]

class PassKBinsDiscretizer(BaseEstimator, TransformerMixin):
"""
Same as sklearn.preprocessing.KBinsDiscretizer, but columns with too few unique values to discretize are passed through unchanged instead of being dropped.
"""
def __init__(self, n_bins=5, encode='onehot-dense', strategy='quantile', subsample='warn', random_state=None):
self.n_bins = n_bins
self.encode = encode
self.strategy = strategy
self.subsample = subsample
self.random_state = random_state

def fit(self, X, y=None):
# Identify columns with more than n unique values
# Create a ColumnTransformer to select and discretize the chosen columns
self.selected_columns_ = select_features(X, min_unique=10)
if isinstance(X, pd.DataFrame):
self.not_selected_columns_ = [col for col in X.columns if col not in self.selected_columns_]
else:
self.not_selected_columns_ = [i for i in range(X.shape[1]) if i not in self.selected_columns_]

enc = KBinsDiscretizer(n_bins=self.n_bins, encode=self.encode, strategy=self.strategy, subsample=self.subsample, random_state=self.random_state)
self.transformer = ColumnTransformer([
('discretizer', enc, self.selected_columns_),
('passthrough', 'passthrough', self.not_selected_columns_)
])
self.transformer.fit(X)
return self

def transform(self, X):
return self.transformer.transform(X)
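For illustration, a usage sketch of the new transformer (data hypothetical; note that fit hardcodes min_unique=10 when picking columns, and subsample is set explicitly here because newer sklearn releases reject the 'warn' sentinel default):

import numpy as np
import pandas as pd
from tpot2.builtin_modules import PassKBinsDiscretizer

rng = np.random.default_rng(0)
X = pd.DataFrame({
    'continuous': rng.normal(size=100),      # ~100 unique values: discretized
    'binary': rng.integers(0, 2, size=100),  # 2 unique values: passed through
})
kbd = PassKBinsDiscretizer(n_bins=5, encode='onehot-dense', subsample=200_000)
Xt = kbd.fit_transform(X)
print(Xt.shape)  # (100, 6): five one-hot bins plus the untouched binary column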
36 changes: 30 additions & 6 deletions tpot2/config/classifiers.py
@@ -386,10 +386,18 @@ def GradientBoostingClassifier_hyperparameter_parser(params):
final_params['n_iter_no_change'] = None
final_params['validation_fraction'] = None
elif params['early_stop'] == 'valid':
final_params['n_iter_no_change'] = params['n_iter_no_change']
final_params['validation_fraction'] = params['validation_fraction']
# this is required because, in crossover, it's possible that n_iter_no_change is not in the params
if 'n_iter_no_change' not in params:
final_params['n_iter_no_change'] = 10
else:
final_params['n_iter_no_change'] = params['n_iter_no_change']
if 'validation_fraction' not in params:
final_params['validation_fraction'] = 0.1
else:
final_params['validation_fraction'] = params['validation_fraction']
elif params['early_stop'] == 'train':
final_params['n_iter_no_change'] = params['n_iter_no_change']
if 'n_iter_no_change' not in params:
final_params['n_iter_no_change'] = 10
final_params['validation_fraction'] = None
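The guards added above handle parameter dicts that are missing n_iter_no_change or validation_fraction, which (per the inline comment) can happen after crossover since those hyperparameters are conditional on early_stop. A minimal equivalent using dict.get, with the same fallbacks (10 and 0.1) chosen in this PR:

params = {'early_stop': 'valid'}  # conditional keys lost during crossover
final_params = {}
if params['early_stop'] == 'valid':
    final_params['n_iter_no_change'] = params.get('n_iter_no_change', 10)
    final_params['validation_fraction'] = params.get('validation_fraction', 0.1)
print(final_params)  # {'n_iter_no_change': 10, 'validation_fraction': 0.1}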


@@ -445,16 +453,32 @@ def HistGradientBoostingClassifier_hyperparameter_parser(params):
final_params['random_state'] = params['random_state']



if params['early_stop'] == 'off':
# final_params['n_iter_no_change'] = 0
final_params['validation_fraction'] = None
final_params['early_stopping'] = False
elif params['early_stop'] == 'valid':
final_params['n_iter_no_change'] = params['n_iter_no_change']
final_params['validation_fraction'] = params['validation_fraction']

# this is required because, in crossover, it's possible that n_iter_no_change is not in the params
if 'n_iter_no_change' not in params:
final_params['n_iter_no_change'] = 10
else:
final_params['n_iter_no_change'] = params['n_iter_no_change']
if 'validation_fraction' not in params:
final_params['validation_fraction'] = 0.1
else:
final_params['validation_fraction'] = params['validation_fraction']

final_params['early_stopping'] = True
elif params['early_stop'] == 'train':
final_params['n_iter_no_change'] = params['n_iter_no_change']

if 'n_iter_no_change' not in params:
final_params['n_iter_no_change'] = 10
else:
final_params['n_iter_no_change'] = params['n_iter_no_change']


final_params['validation_fraction'] = None
final_params['early_stopping'] = True

7 changes: 5 additions & 2 deletions tpot2/config/get_configspace.py
@@ -26,7 +26,7 @@
from tpot2.builtin_modules import genetic_encoders, feature_encoding_frequency_selector
from tpot2.builtin_modules import AddTransformer, mul_neg_1_Transformer, MulTransformer, SafeReciprocalTransformer, EQTransformer, NETransformer, GETransformer, GTTransformer, LETransformer, LTTransformer, MinTransformer, MaxTransformer, ZeroTransformer, OneTransformer, NTransformer
from tpot2.builtin_modules.genetic_encoders import DominantEncoder, RecessiveEncoder, HeterosisEncoder, UnderDominanceEncoder, OverDominanceEncoder
from tpot2.builtin_modules import ZeroCount, ColumnOneHotEncoder
from tpot2.builtin_modules import ZeroCount, ColumnOneHotEncoder, PassKBinsDiscretizer
from tpot2.builtin_modules import Passthrough
from sklearn.linear_model import SGDClassifier, LogisticRegression, SGDRegressor, Ridge, Lasso, ElasticNet, Lars, LassoLars, LassoLarsCV, RidgeCV, ElasticNetCV, PassiveAggressiveClassifier, ARDRegression
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, ExtraTreesRegressor, ExtraTreesClassifier, AdaBoostRegressor, AdaBoostClassifier, GradientBoostingRegressor,RandomForestRegressor, BaggingRegressor, ExtraTreesRegressor, HistGradientBoostingClassifier, HistGradientBoostingRegressor
@@ -55,6 +55,7 @@
DominantEncoder, RecessiveEncoder, HeterosisEncoder, UnderDominanceEncoder, OverDominanceEncoder,
GaussianProcessClassifier, BaggingClassifier,LGBMRegressor,
Passthrough,
PassKBinsDiscretizer,
]


@@ -117,7 +118,7 @@
"regressors" : ["LGBMRegressor", 'AdaBoostRegressor', "ARDRegression", 'DecisionTreeRegressor', 'ExtraTreesRegressor', 'HistGradientBoostingRegressor', 'KNeighborsRegressor', 'LinearSVR', "MLPRegressor", 'RandomForestRegressor', 'SGDRegressor', 'SVR', 'XGBRegressor'],


"transformers": ["Binarizer", "PCA", "ZeroCount", "ColumnOneHotEncoder", "FastICA", "FeatureAgglomeration", "Nystroem", "RBFSampler", "QuantileTransformer", "PowerTransformer"],
"transformers": ["PassKBinsDiscretizer", "Binarizer", "PCA", "ZeroCount", "ColumnOneHotEncoder", "FastICA", "FeatureAgglomeration", "Nystroem", "RBFSampler", "QuantileTransformer", "PowerTransformer"],
"scalers": ["MinMaxScaler", "RobustScaler", "StandardScaler", "MaxAbsScaler", "Normalizer", ],
"all_transformers" : ["transformers", "scalers"],

@@ -291,6 +292,8 @@ def get_configspace(name, n_classes=3, n_samples=1000, n_features=100, random_st
return transformers.PolynomialFeatures_configspace
case "StandardScaler":
return {}
case "PassKBinsDiscretizer":
return transformers.get_passkbinsdiscretizer_configspace(random_state=random_state)

#selectors.py
case "SelectFwe":
31 changes: 25 additions & 6 deletions tpot2/config/regressors.py
@@ -436,10 +436,20 @@ def GradientBoostingRegressor_hyperparameter_parser(params):
final_params['n_iter_no_change'] = None
final_params['validation_fraction'] = None
elif params['early_stop'] == 'valid':
final_params['n_iter_no_change'] = params['n_iter_no_change']
final_params['validation_fraction'] = params['validation_fraction']
# this is required because, in crossover, it's possible that n_iter_no_change is not in the params
if 'n_iter_no_change' not in params:
final_params['n_iter_no_change'] = 10
else:
final_params['n_iter_no_change'] = params['n_iter_no_change']
if 'validation_fraction' not in params:
final_params['validation_fraction'] = 0.1
else:
final_params['validation_fraction'] = params['validation_fraction']
elif params['early_stop'] == 'train':
final_params['n_iter_no_change'] = params['n_iter_no_change']
if 'n_iter_no_change' not in params:
final_params['n_iter_no_change'] = 10
else:
final_params['n_iter_no_change'] = params['n_iter_no_change']
final_params['validation_fraction'] = None


@@ -498,11 +508,20 @@ def HistGradientBoostingRegressor_hyperparameter_parser(params):
# final_params['validation_fraction'] = None
final_params['early_stopping'] = False
elif params['early_stop'] == 'valid':
final_params['n_iter_no_change'] = params['n_iter_no_change']
final_params['validation_fraction'] = params['validation_fraction']
if 'n_iter_no_change' not in params:
final_params['n_iter_no_change'] = 10
else:
final_params['n_iter_no_change'] = params['n_iter_no_change']
if 'validation_fraction' not in params:
final_params['validation_fraction'] = 0.1
else:
final_params['validation_fraction'] = params['validation_fraction']
final_params['early_stopping'] = True
elif params['early_stop'] == 'train':
final_params['n_iter_no_change'] = params['n_iter_no_change']
if 'n_iter_no_change' not in params:
final_params['n_iter_no_change'] = 10
else:
final_params['n_iter_no_change'] = params['n_iter_no_change']
final_params['validation_fraction'] = None
final_params['early_stopping'] = True

21 changes: 20 additions & 1 deletion tpot2/config/transformers.py
@@ -124,6 +124,22 @@ def get_QuantileTransformer_configspace(random_state=None):
)


def get_passkbinsdiscretizer_configspace(random_state=None):
space = {
'n_bins': Integer('n_bins', bounds=(3, 100)),
'encode': 'onehot-dense',
'strategy': Categorical('strategy', ['uniform', 'quantile', 'kmeans']),
# 'subsample': Categorical('subsample', ['auto', 'warn', 'ignore']),
}

if random_state is not None: #This is required because configspace doesn't allow None as a value
space['random_state'] = random_state

return ConfigurationSpace(
space = space

)
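A usage sketch of the space above, assuming the ConfigSpace package (a sampled Configuration behaves as a mapping in recent releases); get_configspace("PassKBinsDiscretizer", ...) in get_configspace.py dispatches to this function:

cs = get_passkbinsdiscretizer_configspace(random_state=42)
params = dict(cs.sample_configuration())
# hypothetical sample: {'encode': 'onehot-dense', 'n_bins': 57,
#                       'random_state': 42, 'strategy': 'kmeans'}
print(params)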


### ROBUST SCALER

@@ -133,4 +149,7 @@ def get_QuantileTransformer_configspace(random_state=None):
})

def robust_scaler_hyperparameter_parser(params):
return {"quantile_range": (params["q_min"], params["q_max"])}
return {"quantile_range": (params["q_min"], params["q_max"])}



5 changes: 3 additions & 2 deletions tpot2/evolvers/base_evolver.py
@@ -498,7 +498,7 @@ def optimize(self, generations=None):
self._client.close()
self._cluster.close()

tpot2.utils.get_pareto_frontier(self.population.evaluated_individuals, column_names=self.objective_names, weights=self.objective_function_weights, invalid_values=["TIMEOUT","INVALID"])
tpot2.utils.get_pareto_frontier(self.population.evaluated_individuals, column_names=self.objective_names, weights=self.objective_function_weights)

def step(self,):
if self.population_size_list is not None:
@@ -624,7 +624,7 @@ def evaluate_population_full(self, budget=None):
parallel_timeout = 10

#scores = tpot2.utils.eval_utils.parallel_eval_objective_list(individuals_to_evaluate, self.objective_functions, self.n_jobs, verbose=self.verbose, timeout=self.max_eval_time_seconds, budget=budget, n_expected_columns=len(self.objective_names), client=self._client, parallel_timeout=parallel_timeout, **self.objective_kwargs)
scores, start_times, end_times, eval_errors = tpot2.utils.eval_utils.parallel_eval_objective_list2(individuals_to_evaluate, self.objective_functions, verbose=self.verbose, max_eval_time_seconds=self.max_eval_time_seconds, budget=budget, n_expected_columns=len(self.objective_names), client=self._client, **self.objective_kwargs)
scores, start_times, end_times, eval_errors = tpot2.utils.eval_utils.parallel_eval_objective_list2(individuals_to_evaluate, self.objective_functions, verbose=self.verbose, max_eval_time_seconds=self.max_eval_time_seconds, budget=budget, n_expected_columns=len(self.objective_names), client=self._client, scheduled_timeout_time=self.scheduled_timeout_time, **self.objective_kwargs)

self.population.update_column(individuals_to_evaluate, column_names=self.objective_names, data=scores)
if budget is not None:
@@ -705,6 +705,7 @@ def evaluate_population_selection_early_stop(self,survival_counts, thresholds=No
generation = self.generation,
n_expected_columns=len(self.objective_names),
client=self._client,
scheduled_timeout_time=self.scheduled_timeout_time,
**self.objective_kwargs,
)

12 changes: 6 additions & 6 deletions tpot2/evolvers/steady_state_evolver.py
@@ -342,7 +342,7 @@ def optimize(self):
###############################
if self.verbose >= 3:
sign = np.sign(self.objective_function_weights)
valid_df = self.population.evaluated_individuals[~self.population.evaluated_individuals[self.objective_names].isin(["TIMEOUT","INVALID"]).any(axis=1)][self.objective_names]*sign
valid_df = self.population.evaluated_individuals[~self.population.evaluated_individuals[["Eval Error"]].isin(["TIMEOUT","INVALID"]).any(axis=1)][self.objective_names]*sign
cur_best_scores = valid_df.max(axis=0)*sign
cur_best_scores = cur_best_scores.to_numpy()
for i, obj in enumerate(self.objective_names):
@@ -353,7 +353,7 @@
#get sign of objective_function_weights
sign = np.sign(self.objective_function_weights)
#get best score for each objective
valid_df = self.population.evaluated_individuals[~self.population.evaluated_individuals[self.objective_names].isin(["TIMEOUT","INVALID"]).any(axis=1)][self.objective_names]*sign
valid_df = self.population.evaluated_individuals[~self.population.evaluated_individuals[["Eval Error"]].isin(["TIMEOUT","INVALID"]).any(axis=1)][self.objective_names]*sign
cur_best_scores = valid_df.max(axis=0)
cur_best_scores = cur_best_scores.to_numpy()
#cur_best_scores = self.population.get_column(self.population.population, column_names=self.objective_names).max(axis=0)*sign #TODO this assumes the current population is the best
@@ -499,7 +499,7 @@ def optimize(self):
elif len(submitted_futures) < self.max_queue_size:

initial_population = self.population.evaluated_individuals.iloc[:self.initial_population_size*3]
invalid_initial_population = initial_population[initial_population[self.objective_names].isin(["TIMEOUT","INVALID"]).any(axis=1)]
invalid_initial_population = initial_population[initial_population[["Eval Error"]].isin(["TIMEOUT","INVALID"]).any(axis=1)]
if len(invalid_initial_population) >= self.initial_population_size*3: #if all individuals in the 3*initial population are invalid
raise Exception("No individuals could be evaluated in the initial population. This may indicate a bug in the configuration, included models, or objective functions. Set verbose>=4 to see the errors that caused individuals to fail.")

@@ -540,8 +540,8 @@
# Step 7: Cleanup
###############################

self.population.remove_invalid_from_population(column_names=self.objective_names, invalid_value="INVALID")
self.population.remove_invalid_from_population(column_names=self.objective_names, invalid_value="TIMEOUT")
self.population.remove_invalid_from_population(column_names="Eval Error", invalid_value="INVALID")
self.population.remove_invalid_from_population(column_names="Eval Error", invalid_value="TIMEOUT")


#done, cleanup futures
@@ -556,7 +556,7 @@ def optimize(self):
self._client.close()
self._cluster.close()

tpot2.utils.get_pareto_frontier(self.population.evaluated_individuals, column_names=self.objective_names, weights=self.objective_function_weights, invalid_values=["TIMEOUT","INVALID"])
tpot2.utils.get_pareto_frontier(self.population.evaluated_individuals, column_names=self.objective_names, weights=self.objective_function_weights)
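The filter changes in this file key invalid individuals off the dedicated "Eval Error" column rather than scanning the objective columns for sentinel strings. A minimal pandas sketch of the new predicate (toy data; column names as in the diff):

import numpy as np
import pandas as pd

evaluated = pd.DataFrame({
    'roc_auc': [0.91, np.nan, 0.88],
    'Eval Error': [None, 'TIMEOUT', None],
})
# keep rows whose 'Eval Error' is neither 'TIMEOUT' nor 'INVALID'
valid = evaluated[~evaluated[['Eval Error']].isin(['TIMEOUT', 'INVALID']).any(axis=1)]
print(valid)  # rows 0 and 2 survive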



5 changes: 5 additions & 0 deletions tpot2/search_spaces/nodes/genetic_feature_selection.py
@@ -50,6 +50,11 @@ def __init__( self,
else:
self.mask = mask

# check if there are no features selected, if so select one
if sum(self.mask) == 0:
index = rng.choice(len(self.mask))
self.mask[index] = True

self.mutation_list = [self._mutate_add, self._mutate_remove]
self.crossover_list = [self._crossover_swap]
