diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 8e650702..bff0d698 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -18,9 +18,9 @@ Each individual change should have a link to the pull request after the descript 1.1.2 (2024-02-07) ------------------ - Added ^^^^^ +- Update OneHotEncodingTransformer to default to returning int8 columns `#175 <https://github.com/lvgig/tubular/pull/175>`_ - Updated NullIndicator to return int8 columns `#173 https://github.com/lvgig/tubular/pull/173`_ 1.1.1 (2024-01-18) diff --git a/tests/nominal/test_OneHotEncodingTransformer.py b/tests/nominal/test_OneHotEncodingTransformer.py index 44388f67..a057877b 100644 --- a/tests/nominal/test_OneHotEncodingTransformer.py +++ b/tests/nominal/test_OneHotEncodingTransformer.py @@ -1,3 +1,4 @@ +import numpy as np import pandas as pd import pytest import sklearn @@ -49,7 +50,11 @@ def test_one_hot_encoder_init_called(self, mocker): Again not using ta.functions.assert_function_call for this as it does not handle self being passed to OneHotEncoder.init """ - expected_keyword_args = {"sparse": False, "handle_unknown": "ignore"} + expected_keyword_args = { + "sparse": False, + "handle_unknown": "ignore", + "dtype": np.int8, + } mocker.patch("sklearn.preprocessing.OneHotEncoder.__init__") @@ -435,12 +440,20 @@ def test_expected_output(self, df_test, expected): Also tests that OneHotEncodingTransformer.transform does not modify unrelated columns. 
""" # transformer is fit on the whole dataset separately from the input df to work with the decorators + columns = ["b"] df_train = d.create_df_7() - x = OneHotEncodingTransformer(columns="b") + x = OneHotEncodingTransformer(columns=columns) x.fit(df_train) df_transformed = x.transform(df_test) + for col in [ + column + f"_{value}" + for column in columns + for value in df_train[column].unique().tolist() + ]: + expected[col] = expected[col].astype(np.int8) + ta.equality.assert_frame_equal_msg( expected=expected, actual=df_transformed, @@ -513,11 +526,19 @@ def test_unseen_categories_encoded_as_all_zeroes(self, df_test, expected): """Test OneHotEncodingTransformer.transform encodes unseen categories correctly (all 0s).""" # transformer is fit on the whole dataset separately from the input df to work with the decorators df_train = d.create_df_7() - x = OneHotEncodingTransformer(columns=["a", "b", "c"], verbose=False) + columns = ["a", "b", "c"] + x = OneHotEncodingTransformer(columns=columns, verbose=False) x.fit(df_train) df_transformed = x.transform(df_test) + for col in [ + column + f"_{value}" + for column in columns + for value in df_train[column].unique().tolist() + ]: + expected[col] = expected[col].astype(np.int8) + ta.equality.assert_equal_dispatch( expected=expected, actual=df_transformed, diff --git a/tubular/nominal.py b/tubular/nominal.py index 93229a5d..2e57ba8e 100644 --- a/tubular/nominal.py +++ b/tubular/nominal.py @@ -1060,6 +1060,7 @@ def __init__( drop_original: bool = False, copy: bool = True, verbose: bool = False, + dtype: np.int8 = np.int8, **kwargs: dict[str, bool], ) -> None: BaseNominalTransformer.__init__( @@ -1070,7 +1071,13 @@ def __init__( ) # Set attributes for scikit-learn'S OneHotEncoder - OneHotEncoder.__init__(self, sparse=False, handle_unknown="ignore", **kwargs) + OneHotEncoder.__init__( + self, + sparse=False, + handle_unknown="ignore", + dtype=dtype, + **kwargs, + ) # Set other class attrributes self.separator = separator