From 36d36072f269764369dce04b6b5b667f442461df Mon Sep 17 00:00:00 2001 From: limlam96 <103185696+limlam96@users.noreply.github.com> Date: Fri, 2 Feb 2024 11:20:55 +0000 Subject: [PATCH 1/4] edited OneHotEncoder to default to int8 output --- .../nominal/test_OneHotEncodingTransformer.py | 27 ++++++++++++++++--- tubular/nominal.py | 9 ++++++- 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/tests/nominal/test_OneHotEncodingTransformer.py b/tests/nominal/test_OneHotEncodingTransformer.py index 44388f67..a057877b 100644 --- a/tests/nominal/test_OneHotEncodingTransformer.py +++ b/tests/nominal/test_OneHotEncodingTransformer.py @@ -1,3 +1,4 @@ +import numpy as np import pandas as pd import pytest import sklearn @@ -49,7 +50,11 @@ def test_one_hot_encoder_init_called(self, mocker): Again not using ta.functions.assert_function_call for this as it does not handle self being passed to OneHotEncoder.init """ - expected_keyword_args = {"sparse": False, "handle_unknown": "ignore"} + expected_keyword_args = { + "sparse": False, + "handle_unknown": "ignore", + "dtype": np.int8, + } mocker.patch("sklearn.preprocessing.OneHotEncoder.__init__") @@ -435,12 +440,20 @@ def test_expected_output(self, df_test, expected): Also tests that OneHotEncodingTransformer.transform does not modify unrelated columns. 
""" # transformer is fit on the whole dataset separately from the input df to work with the decorators + columns = ["b"] df_train = d.create_df_7() - x = OneHotEncodingTransformer(columns="b") + x = OneHotEncodingTransformer(columns=columns) x.fit(df_train) df_transformed = x.transform(df_test) + for col in [ + column + f"_{value}" + for column in columns + for value in df_train[column].unique().tolist() + ]: + expected[col] = expected[col].astype(np.int8) + ta.equality.assert_frame_equal_msg( expected=expected, actual=df_transformed, @@ -513,11 +526,19 @@ def test_unseen_categories_encoded_as_all_zeroes(self, df_test, expected): """Test OneHotEncodingTransformer.transform encodes unseen categories correctly (all 0s).""" # transformer is fit on the whole dataset separately from the input df to work with the decorators df_train = d.create_df_7() - x = OneHotEncodingTransformer(columns=["a", "b", "c"], verbose=False) + columns = ["a", "b", "c"] + x = OneHotEncodingTransformer(columns=columns, verbose=False) x.fit(df_train) df_transformed = x.transform(df_test) + for col in [ + column + f"_{value}" + for column in columns + for value in df_train[column].unique().tolist() + ]: + expected[col] = expected[col].astype(np.int8) + ta.equality.assert_equal_dispatch( expected=expected, actual=df_transformed, diff --git a/tubular/nominal.py b/tubular/nominal.py index 93229a5d..2e57ba8e 100644 --- a/tubular/nominal.py +++ b/tubular/nominal.py @@ -1060,6 +1060,7 @@ def __init__( drop_original: bool = False, copy: bool = True, verbose: bool = False, + dtype: np.int8 = np.int8, **kwargs: dict[str, bool], ) -> None: BaseNominalTransformer.__init__( @@ -1070,7 +1071,13 @@ def __init__( ) # Set attributes for scikit-learn'S OneHotEncoder - OneHotEncoder.__init__(self, sparse=False, handle_unknown="ignore", **kwargs) + OneHotEncoder.__init__( + self, + sparse=False, + handle_unknown="ignore", + dtype=dtype, + **kwargs, + ) # Set other class attrributes self.separator = separator From 
c82c5b07426dcb2daf05f13b875e36a79e1931dc Mon Sep 17 00:00:00 2001 From: limlam96 <103185696+limlam96@users.noreply.github.com> Date: Mon, 5 Feb 2024 10:51:12 +0000 Subject: [PATCH 2/4] updated version and changelog --- CHANGELOG.rst | 6 ++++++ tubular/_version.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 19d5fe8e..4b988e6a 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -16,6 +16,12 @@ Subsections for each version can be one of the following; Each individual change should have a link to the pull request after the description of the change. +1.1.2 (2024-02-7) +------------------ +Added +^^^^^ +- Update OneHotEncodingTransformer to default to returning int8 columns + 1.1.1 (2024-01-18) ------------------ diff --git a/tubular/_version.py b/tubular/_version.py index a82b376d..72f26f59 100644 --- a/tubular/_version.py +++ b/tubular/_version.py @@ -1 +1 @@ -__version__ = "1.1.1" +__version__ = "1.1.2" From 2b4e96e20a24a86deb8e02a2f9549103bc07dbde Mon Sep 17 00:00:00 2001 From: limlam96 <103185696+limlam96@users.noreply.github.com> Date: Mon, 5 Feb 2024 10:54:00 +0000 Subject: [PATCH 3/4] added PR to changelog --- CHANGELOG.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 4b988e6a..9b020c9d 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -20,7 +20,7 @@ Each individual change should have a link to the pull request after the descript ------------------ Added ^^^^^ -- Update OneHotEncodingTransformer to default to returning int8 columns +- Update OneHotEncodingTransformer to default to returning int8 columns https://github.com/lvgig/tubular/pull/175 1.1.1 (2024-01-18) ------------------ From 72cd91f670a7547dd1384cb3b9efa06f7cced9fb Mon Sep 17 00:00:00 2001 From: limlam96 <103185696+limlam96@users.noreply.github.com> Date: Mon, 5 Feb 2024 11:33:48 +0000 Subject: [PATCH 4/4] updated changelog format --- CHANGELOG.rst | 5 +++-- 1 file changed, 3 
insertions(+), 2 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 9b020c9d..f7e4ad32 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -16,11 +16,12 @@ Subsections for each version can be one of the following; Each individual change should have a link to the pull request after the description of the change. -1.1.2 (2024-02-7) +1.1.2 (2024-02-07) ------------------ Added ^^^^^ -- Update OneHotEncodingTransformer to default to returning int8 columns https://github.com/lvgig/tubular/pull/175 +- Update OneHotEncodingTransformer to default to returning int8 columns `#175 <https://github.com/lvgig/tubular/pull/175>`_ + 1.1.1 (2024-01-18) ------------------