@@ -11,16 +11,13 @@



-def auto_select_categorical_features(X):
+def auto_select_categorical_features(X, min_unique=10,):

-    if not isinstance(X, pd.DataFrame):
-        return []
-
-    feature_mask = []
-    for column in X.columns:
-        feature_mask.append(not is_numeric_dtype(X[column]))
+    if isinstance(X, pd.DataFrame):
+        return [col for col in X.columns if len(X[col].unique()) < min_unique]
+    else:
+        return [i for i in range(X.shape[1]) if len(np.unique(X[:, i])) < min_unique]

-    return feature_mask


def _X_selected(X, selected):
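For reference, a standalone sketch of the new selection heuristic: the function body is copied from the diff, while the toy DataFrame and the expected outputs in the comments are illustrative assumptions, not part of the change.

```python
import numpy as np
import pandas as pd

# Copied from the diff above; `min_unique` defaults to 10 in the change.
def auto_select_categorical_features(X, min_unique=10):
    # Columns (or ndarray column indices) with fewer than `min_unique`
    # distinct values are treated as categorical.
    if isinstance(X, pd.DataFrame):
        return [col for col in X.columns if len(X[col].unique()) < min_unique]
    else:
        return [i for i in range(X.shape[1]) if len(np.unique(X[:, i])) < min_unique]

# Hypothetical toy data, not part of the change.
df = pd.DataFrame({
    "color": ["red", "blue", "green"] * 4,        # 3 unique values  -> selected
    "height": np.linspace(150.0, 190.0, num=12),  # 12 unique values -> not selected
})
print(auto_select_categorical_features(df))             # ['color']
print(auto_select_categorical_features(df.to_numpy()))  # [0]
```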
@@ -41,6 +38,21 @@ class ColumnOneHotEncoder(BaseEstimator, TransformerMixin):


    def __init__(self, columns='auto', drop=None, handle_unknown='error', sparse_output=False, min_frequency=None, max_categories=None):
+        '''
+
+        Parameters
+        ----------
+
+        columns : str or list, default='auto'
+            - 'auto' : Automatically select categorical features based on columns with fewer than 10 unique values
+            - 'categorical' : Automatically select categorical features
+            - 'numeric' : Automatically select numeric features
+            - 'all' : Select all features
+            - list : A list of columns to select
+
+        drop, handle_unknown, sparse_output, min_frequency, max_categories : see sklearn.preprocessing.OneHotEncoder
+
+        '''

        self.columns = columns
        self.drop = drop
@@ -73,6 +85,8 @@ def fit(self, X, y=None):
            self.columns_ = list(X.select_dtypes(exclude='number').columns)
        elif self.columns == "numeric":
            self.columns_ = [col for col in X.columns if is_numeric_dtype(X[col])]
+        elif self.columns == "auto":
+            self.columns_ = auto_select_categorical_features(X)
        elif self.columns == "all":
            if isinstance(X, pd.DataFrame):
                self.columns_ = X.columns
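And a hedged usage sketch of the new columns='auto' path: the import path below is an assumption (the diff does not name the module), and non-selected columns are presumably passed through via the _X_selected helper in the same file.

```python
import numpy as np
import pandas as pd

# Assumed import path -- the diff does not show the module name; adjust as needed.
from tpot2.builtin_modules import ColumnOneHotEncoder

df = pd.DataFrame({
    "color": ["red", "blue", "green"] * 4,        # <10 unique values  -> one-hot encoded
    "height": np.linspace(150.0, 190.0, num=12),  # >=10 unique values -> left as-is
})

# With columns='auto', auto_select_categorical_features picks only 'color',
# so only that column is expanded by the underlying OneHotEncoder.
encoder = ColumnOneHotEncoder(columns='auto')
print(encoder.fit_transform(df))
```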