Skip to content

Commit bfb019e

Browse files
authored
Merge pull request #110 from perib/dev
preprocessing fix
2 parents 8eec352 + a0095e5 commit bfb019e

File tree

2 files changed

+56
-32
lines changed

2 files changed

+56
-32
lines changed

tpot2/tpot_estimator/estimator.py

+28-16
Original file line numberDiff line numberDiff line change
@@ -619,23 +619,35 @@ def fit(self, X, y):
619619
if self.preprocessing:
620620
#X = pd.DataFrame(X)
621621

622-
#TODO: check if there are missing values in X before imputation. If not, don't include imputation in pipeline. Check if there are categorical columns. If not, don't include one hot encoding in pipeline
623-
if isinstance(X, pd.DataFrame): #pandas dataframe
624-
if self.categorical_features is not None:
625-
X[self.categorical_features] = X[self.categorical_features].astype(object)
626-
self._preprocessing_pipeline = sklearn.pipeline.make_pipeline(tpot2.builtin_modules.ColumnSimpleImputer("categorical", strategy='most_frequent'), #impute categorical columns
627-
tpot2.builtin_modules.ColumnSimpleImputer("numeric", strategy='mean'), #impute numeric columns
628-
tpot2.builtin_modules.ColumnOneHotEncoder("categorical", min_frequency=0.0001)) #one hot encode categorical columns
629-
X = self._preprocessing_pipeline.fit_transform(X)
630-
else:
631-
if self.categorical_features is not None: #numpy array and categorical columns specified
632-
self._preprocessing_pipeline = sklearn.pipeline.make_pipeline(tpot2.builtin_modules.ColumnSimpleImputer(self.categorical_features, strategy='most_frequent'), #impute categorical columns
633-
tpot2.builtin_modules.ColumnSimpleImputer("all", strategy='mean'), #impute remaining numeric columns
634-
tpot2.builtin_modules.ColumnOneHotEncoder(self.categorical_features, min_frequency=0.0001)) #one hot encode categorical columns
635-
else: #numpy array and no categorical columns specified, just do imputation
636-
self._preprocessing_pipeline = sklearn.pipeline.make_pipeline(tpot2.builtin_modules.ColumnSimpleImputer("all", strategy='mean'))
637-
622+
if not isinstance(self.preprocessing, bool) and isinstance(self.preprocessing, sklearn.base.BaseEstimator):
623+
self._preprocessing_pipeline = self.preprocessing
638624

625+
#TODO: check if there are missing values in X before imputation. If not, don't include imputation in pipeline. Check if there are categorical columns. If not, don't include one hot encoding in pipeline
626+
else: #if self.preprocessing is True or not a sklearn estimator
627+
628+
pipeline_steps = []
629+
630+
if self.categorical_features is not None: #if categorical features are specified, use those
631+
pipeline_steps.append(("impute_categorical", tpot2.builtin_modules.ColumnSimpleImputer(self.categorical_features, strategy='most_frequent')))
632+
pipeline_steps.append(("impute_numeric", tpot2.builtin_modules.ColumnSimpleImputer("numeric", strategy='mean')))
633+
pipeline_steps.append(("impute_categorical", tpot2.builtin_modules.ColumnOneHotEncoder(self.categorical_features, strategy='most_frequent')))
634+
635+
else:
636+
if isinstance(X, pd.DataFrame):
637+
categorical_columns = X.select_dtypes(include=['object']).columns
638+
if len(categorical_columns) > 0:
639+
pipeline_steps.append(("impute_categorical", tpot2.builtin_modules.ColumnSimpleImputer("categorical", strategy='most_frequent')))
640+
pipeline_steps.append(("impute_numeric", tpot2.builtin_modules.ColumnSimpleImputer("numeric", strategy='mean')))
641+
pipeline_steps.append(("impute_categorical", tpot2.builtin_modules.ColumnOneHotEncoder("categorical", strategy='most_frequent')))
642+
else:
643+
pipeline_steps.append(("impute_numeric", tpot2.builtin_modules.ColumnSimpleImputer("all", strategy='mean')))
644+
else:
645+
pipeline_steps.append(("impute_numeric", tpot2.builtin_modules.ColumnSimpleImputer("all", strategy='mean')))
646+
647+
self._preprocessing_pipeline = sklearn.pipeline.Pipeline(pipeline_steps)
648+
649+
X = self._preprocessing_pipeline.fit_transform(X, y)
650+
639651
else:
640652
self._preprocessing_pipeline = None
641653

tpot2/tpot_estimator/steady_state_estimator.py

+28-16
Original file line numberDiff line numberDiff line change
@@ -605,23 +605,35 @@ def fit(self, X, y):
605605
if self.preprocessing:
606606
#X = pd.DataFrame(X)
607607

608-
#TODO: check if there are missing values in X before imputation. If not, don't include imputation in pipeline. Check if there are categorical columns. If not, don't include one hot encoding in pipeline
609-
if isinstance(X, pd.DataFrame): #pandas dataframe
610-
if self.categorical_features is not None:
611-
X[self.categorical_features] = X[self.categorical_features].astype(object)
612-
self._preprocessing_pipeline = sklearn.pipeline.make_pipeline(tpot2.builtin_modules.ColumnSimpleImputer("categorical", strategy='most_frequent'), #impute categorical columns
613-
tpot2.builtin_modules.ColumnSimpleImputer("numeric", strategy='mean'), #impute numeric columns
614-
tpot2.builtin_modules.ColumnOneHotEncoder("categorical", min_frequency=0.0001)) #one hot encode categorical columns
615-
X = self._preprocessing_pipeline.fit_transform(X)
616-
else:
617-
if self.categorical_features is not None: #numpy array and categorical columns specified
618-
self._preprocessing_pipeline = sklearn.pipeline.make_pipeline(tpot2.builtin_modules.ColumnSimpleImputer(self.categorical_features, strategy='most_frequent'), #impute categorical columns
619-
tpot2.builtin_modules.ColumnSimpleImputer("all", strategy='mean'), #impute remaining numeric columns
620-
tpot2.builtin_modules.ColumnOneHotEncoder(self.categorical_features, min_frequency=0.0001)) #one hot encode categorical columns
621-
else: #numpy array and no categorical columns specified, just do imputation
622-
self._preprocessing_pipeline = sklearn.pipeline.make_pipeline(tpot2.builtin_modules.ColumnSimpleImputer("all", strategy='mean'))
623-
608+
if not isinstance(self.preprocessing, bool) and isinstance(self.preprocessing, sklearn.base.BaseEstimator):
609+
self._preprocessing_pipeline = self.preprocessing
624610

611+
#TODO: check if there are missing values in X before imputation. If not, don't include imputation in pipeline. Check if there are categorical columns. If not, don't include one hot encoding in pipeline
612+
else: #if self.preprocessing is True or not a sklearn estimator
613+
614+
pipeline_steps = []
615+
616+
if self.categorical_features is not None: #if categorical features are specified, use those
617+
pipeline_steps.append(("impute_categorical", tpot2.builtin_modules.ColumnSimpleImputer(self.categorical_features, strategy='most_frequent')))
618+
pipeline_steps.append(("impute_numeric", tpot2.builtin_modules.ColumnSimpleImputer("numeric", strategy='mean')))
619+
pipeline_steps.append(("impute_categorical", tpot2.builtin_modules.ColumnOneHotEncoder(self.categorical_features, strategy='most_frequent')))
620+
621+
else:
622+
if isinstance(X, pd.DataFrame):
623+
categorical_columns = X.select_dtypes(include=['object']).columns
624+
if len(categorical_columns) > 0:
625+
pipeline_steps.append(("impute_categorical", tpot2.builtin_modules.ColumnSimpleImputer("categorical", strategy='most_frequent')))
626+
pipeline_steps.append(("impute_numeric", tpot2.builtin_modules.ColumnSimpleImputer("numeric", strategy='mean')))
627+
pipeline_steps.append(("impute_categorical", tpot2.builtin_modules.ColumnOneHotEncoder("categorical", strategy='most_frequent')))
628+
else:
629+
pipeline_steps.append(("impute_numeric", tpot2.builtin_modules.ColumnSimpleImputer("all", strategy='mean')))
630+
else:
631+
pipeline_steps.append(("impute_numeric", tpot2.builtin_modules.ColumnSimpleImputer("all", strategy='mean')))
632+
633+
self._preprocessing_pipeline = sklearn.pipeline.Pipeline(pipeline_steps)
634+
635+
X = self._preprocessing_pipeline.fit_transform(X, y)
636+
625637
else:
626638
self._preprocessing_pipeline = None
627639

0 commit comments

Comments
 (0)