Skip to content

Commit ef2a9a1

Browse files
authored
Merge pull request #120 from perib/dev
Dev
2 parents 14922f6 + 4681389 commit ef2a9a1

File tree

4 files changed

+26
-12
lines changed

4 files changed

+26
-12
lines changed

tpot2/builtin_modules/column_one_hot_encoder.py

+22-8
Original file line number | Diff line number | Diff line change
@@ -11,16 +11,13 @@
1111

1212

1313

14-
def auto_select_categorical_features(X):
14+
def auto_select_categorical_features(X, min_unique=10,):
1515

16-
if not isinstance(X, pd.DataFrame):
17-
return []
18-
19-
feature_mask = []
20-
for column in X.columns:
21-
feature_mask.append(not is_numeric_dtype(X[column]))
16+
if isinstance(X, pd.DataFrame):
17+
return [col for col in X.columns if len(X[col].unique()) < min_unique]
18+
else:
19+
return [i for i in range(X.shape[1]) if len(np.unique(X[:, i])) < min_unique]
2220

23-
return feature_mask
2421

2522

2623
def _X_selected(X, selected):
@@ -41,6 +38,21 @@ class ColumnOneHotEncoder(BaseEstimator, TransformerMixin):
4138

4239

4340
def __init__(self, columns='auto', drop=None, handle_unknown='error', sparse_output=False, min_frequency=None,max_categories=None):
41+
'''
42+
43+
Parameters
44+
----------
45+
46+
columns : str, list, default='auto'
47+
- 'auto' : Automatically select categorical features based on columns with less than 10 unique values
48+
- 'categorical' : Automatically select categorical features
49+
- 'numeric' : Automatically select numeric features
50+
- 'all' : Select all features
51+
- list : A list of columns to select
52+
53+
drop, handle_unknown, sparse_output, min_frequency, max_categories : see sklearn.preprocessing.OneHotEncoder
54+
55+
'''
4456

4557
self.columns = columns
4658
self.drop = drop
@@ -73,6 +85,8 @@ def fit(self, X, y=None):
7385
self.columns_ = list(X.select_dtypes(exclude='number').columns)
7486
elif self.columns == "numeric":
7587
self.columns_ = [col for col in X.columns if is_numeric_dtype(X[col])]
88+
elif self.columns == "auto":
89+
self.columns_ = auto_select_categorical_features(X)
7690
elif self.columns == "all":
7791
if isinstance(X, pd.DataFrame):
7892
self.columns_ = X.columns

tpot2/config/transformers.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
from functools import partial
22
import numpy as np
33

4-
from tpot2.builtin_modules import ZeroCount, OneHotEncoder
4+
from tpot2.builtin_modules import ZeroCount, OneHotEncoder, ColumnOneHotEncoder
55
from sklearn.preprocessing import Binarizer
66
from sklearn.decomposition import FastICA
77
from sklearn.cluster import FeatureAgglomeration
@@ -99,5 +99,5 @@ def make_transformer_config_dictionary(random_state=None, n_features=10):
9999
RobustScaler: {},
100100
StandardScaler: {},
101101
ZeroCount: params_tpot_builtins_ZeroCount,
102-
OneHotEncoder: params_tpot_builtins_OneHotEncoder,
102+
ColumnOneHotEncoder: params_tpot_builtins_OneHotEncoder,
103103
}

tpot2/individual_representations/graph_pipeline_individual/templates.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -54,7 +54,7 @@ def estimator_graph_individual_generator(
5454
starting_ops = []
5555
if inner_config_dict is not None:
5656
starting_ops.append(ind._mutate_insert_inner_node)
57-
if leaf_config_dict is not None:
57+
if leaf_config_dict is not None or inner_config_dict is not None:
5858
starting_ops.append(ind._mutate_insert_leaf)
5959
n_nodes -= 1
6060

tpot2/objectives/complexity.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -142,7 +142,7 @@ def MultinomialNB_Complexity(model):
142142

143143
def calculate_model_complexity(est):
144144
if isinstance(est, sklearn.pipeline.Pipeline) or isinstance(est, sklearn.pipeline.FeatureUnion):
145-
return sum(calculate_model_complexity(estimator) for estimator in est.steps)
145+
return sum(calculate_model_complexity(estimator) for _,estimator in est.steps)
146146
if isinstance(est, GraphPipeline):
147147
return sum(calculate_model_complexity(est.graph.nodes[node]['instance']) for node in est.graph.nodes)
148148

0 commit comments

Comments
 (0)