From 21a56ec082eb568c6ed6dcc1b804e11a2f474542 Mon Sep 17 00:00:00 2001 From: Randy Olson Date: Thu, 1 Jun 2017 21:42:28 -0400 Subject: [PATCH] Hacky demo of TPOTEnsemble --- tpot/base.py | 37 +++++++++++-------------------------- tpot/gp_deap.py | 21 ++++++++++++++++----- 2 files changed, 27 insertions(+), 31 deletions(-) diff --git a/tpot/base.py b/tpot/base.py index d53a9c57..f5ba9fa1 100644 --- a/tpot/base.py +++ b/tpot/base.py @@ -42,6 +42,7 @@ from sklearn.preprocessing import FunctionTransformer, Imputer from sklearn.model_selection import train_test_split from sklearn.metrics.scorer import make_scorer +from sklearn.ensemble import VotingClassifier from update_checker import update_check @@ -201,6 +202,7 @@ def __init__(self, generations=100, population_size=100, offspring_size=None, self._pareto_front = None self._optimized_pipeline = None + self._pipeline_ensemble_list = None self.fitted_pipeline_ = None self._fitted_imputer = None self._pop = None @@ -514,7 +516,7 @@ def pareto_eq(ind1, ind2): try: with warnings.catch_warnings(): warnings.simplefilter('ignore') - pop, _ = eaMuPlusLambda( + pop, _, self._pipeline_ensemble_list = eaMuPlusLambda( population=pop, toolbox=self._toolbox, mu=self.population_size, @@ -549,11 +551,9 @@ def pareto_eq(ind1, ind2): # Store the pipeline with the highest internal testing score if self._pareto_front: - self._update_top_pipeline() - # It won't raise error for a small test like in a unit test because a few pipeline sometimes # may fail due to the training data does not fit the operator's requirement. - if not self._optimized_pipeline: + if not self._pipeline_ensemble_list: print('There was an error in the TPOT optimization ' 'process. This could be because the data was ' 'not formatted properly, or because data for ' @@ -561,7 +561,7 @@ def pareto_eq(ind1, ind2): 'TPOTClassifier object. Please make sure you ' 'passed the data to TPOT correctly.') else: - self.fitted_pipeline_ = self._toolbox.compile(expr=self._optimized_pipeline) + self.fitted_pipeline_ = VotingClassifier(estimators=self._pipeline_ensemble_list) with warnings.catch_warnings(): warnings.simplefilter('ignore') @@ -571,7 +571,7 @@ def pareto_eq(ind1, ind2): # Add an extra line of spacing if the progress bar was used if self.verbosity >= 2: print('') - print('Best pipeline: {}'.format(self._optimized_pipeline)) + print('Best pipeline: {}'.format(self._pipeline_ensemble_list)) # Store and fit the entire Pareto front as fitted models for convenience self.pareto_front_fitted_pipelines_ = {} @@ -589,15 +589,6 @@ def pareto_eq(ind1, ind2): raise return self - def _update_top_pipeline(self): - """Helper function to update the _optimized_pipeline field.""" - if self._pareto_front: - top_score = -float('inf') - for pipeline, pipeline_scores in zip(self._pareto_front.items, reversed(self._pareto_front.keys)): - if pipeline_scores.wvalues[1] > top_score: - self._optimized_pipeline = pipeline - top_score = pipeline_scores.wvalues[1] - def predict(self, features): """Use the optimized pipeline to predict the target for a feature set. @@ -808,7 +799,7 @@ def _set_param_recursive(self, pipeline_steps, parameter, value): setattr(obj, parameter, value) - def _evaluate_individuals(self, individuals, features, target, sample_weight=None, groups=None): + def _evaluate_individuals(self, individuals, features, target, sample_weight=None, groups=None, pipeline_ensemble_list=None): """Determine the fit of the provided individuals. Parameters @@ -856,21 +847,11 @@ def _evaluate_individuals(self, individuals, features, target, sample_weight=Non fitnesses_dict[indidx] = (5000., -float('inf')) if not self._pbar.disable: self._pbar.update(1) - # Check if the individual was evaluated before - elif individual_str in self.evaluated_individuals_: - # Get fitness score from previous evaluation - fitnesses_dict[indidx] = self.evaluated_individuals_[individual_str] - if self.verbosity > 2: - self._pbar.write('Pipeline encountered that has previously been evaluated during the ' - 'optimization process. Using the score from the previous evaluation.') - if not self._pbar.disable: - self._pbar.update(1) else: try: # Transform the tree expression into an sklearn pipeline sklearn_pipeline = self._toolbox.compile(expr=individual) - # Fix random state when the operator allows and build sample weight dictionary self._set_param_recursive(sklearn_pipeline.steps, 'random_state', 42) @@ -882,6 +863,10 @@ def _evaluate_individuals(self, individuals, features, target, sample_weight=Non if not self._pbar.disable: self._pbar.update(1) continue + + if pipeline_ensemble_list is not None: + sklearn_pipeline = VotingClassifier(estimators=pipeline_ensemble_list + [('evaluate', sklearn_pipeline)]) + eval_individuals_str.append(individual_str) operator_count_list.append(operator_count) sklearn_pipeline_list.append(sklearn_pipeline) diff --git a/tpot/gp_deap.py b/tpot/gp_deap.py index 4c31228d..45935115 100644 --- a/tpot/gp_deap.py +++ b/tpot/gp_deap.py @@ -161,6 +161,8 @@ def eaMuPlusLambda(population, toolbox, mu, lambda_, cxpb, mutpb, ngen, pbar, record = stats.compile(population) if stats is not None else {} logbook.record(gen=0, nevals=len(invalid_ind), **record) + + pipeline_ensemble_list = [] # Begin the generational process for gen in range(1, ngen + 1): @@ -169,7 +171,7 @@ def eaMuPlusLambda(population, toolbox, mu, lambda_, cxpb, mutpb, ngen, pbar, offspring = varOr(population, toolbox, lambda_, cxpb, mutpb) # Evaluate the individuals with an invalid fitness - invalid_ind = [ind for ind in offspring if not ind.fitness.valid] + invalid_ind = offspring # update pbar for valid_ind if not pbar.disable: @@ -177,9 +179,18 @@ def eaMuPlusLambda(population, toolbox, mu, lambda_, cxpb, mutpb, ngen, pbar, if not (max_time_mins is None) and pbar.n >= pbar.total: pbar.total += lambda_ - fitnesses = toolbox.evaluate(invalid_ind) + fitnesses = toolbox.evaluate(invalid_ind, pipeline_ensemble_list=pipeline_ensemble_list) + best_gen_ind = None + best_gen_fitness = -float('inf') for ind, fit in zip(invalid_ind, fitnesses): ind.fitness.values = fit + if fit[1] > best_gen_fitness: + best_gen_fitness = fit[1] + best_gen_ind = ind + + best_gen_ind_sklearn_pipeline = toolbox.compile(expr=best_gen_ind) + best_gen_ind_ensemble_entry = ('pipeline{}'.format(gen), best_gen_ind_sklearn_pipeline) + pipeline_ensemble_list.append(best_gen_ind_ensemble_entry) # Update the hall of fame with the generated individuals if halloffame is not None: @@ -211,7 +222,7 @@ def eaMuPlusLambda(population, toolbox, mu, lambda_, cxpb, mutpb, ngen, pbar, record = stats.compile(population) if stats is not None else {} logbook.record(gen=gen, nevals=len(invalid_ind), **record) - return population, logbook + return population, logbook, pipeline_ensemble_list def cxOnePoint(ind1, ind2): @@ -351,7 +362,7 @@ def _wrapped_cross_val_score(sklearn_pipeline, features, target, cv, scoring_function, sample_weight, max_eval_time_mins, groups): max_time_seconds = max(int(max_eval_time_mins * 60), 1) - sample_weight_dict = set_sample_weight(sklearn_pipeline.steps, sample_weight) + #sample_weight_dict = set_sample_weight(sklearn_pipeline.steps, sample_weight) # build a job for cross_val_score tmp_it = Interruptable_cross_val_score( clone(sklearn_pipeline), @@ -361,7 +372,7 @@ def _wrapped_cross_val_score(sklearn_pipeline, features, target, cv=cv, n_jobs=1, verbose=0, - fit_params=sample_weight_dict, + #fit_params=sample_weight_dict, groups=groups ) tmp_it.start()