Skip to content

Commit

Permalink
Merge pull request #175 from lvgig/feature/one_hot_encoder_default_int8
Browse files Browse the repository at this point in the history
Feature/one hot encoder default int8
  • Loading branch information
davidhopkinson26 authored Feb 6, 2024
2 parents fb205d2 + 495c635 commit cebe0c0
Show file tree
Hide file tree
Showing 3 changed files with 33 additions and 5 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@ Each individual change should have a link to the pull request after the descript

1.1.2 (2024-02-07)
------------------

Added
^^^^^
- Update OneHotEncodingTransformer to default to returning int8 columns `#175 <https://github.com/lvgig/tubular/pull/175>`_
- Updated NullIndicator to return int8 columns `#173 <https://github.com/lvgig/tubular/pull/173>`_

1.1.1 (2024-01-18)
Expand Down
27 changes: 24 additions & 3 deletions tests/nominal/test_OneHotEncodingTransformer.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import numpy as np
import pandas as pd
import pytest
import sklearn
Expand Down Expand Up @@ -49,7 +50,11 @@ def test_one_hot_encoder_init_called(self, mocker):
Again not using ta.functions.assert_function_call for this as it does not handle self being passed to OneHotEncoder.init
"""
expected_keyword_args = {"sparse": False, "handle_unknown": "ignore"}
expected_keyword_args = {
"sparse": False,
"handle_unknown": "ignore",
"dtype": np.int8,
}

mocker.patch("sklearn.preprocessing.OneHotEncoder.__init__")

Expand Down Expand Up @@ -435,12 +440,20 @@ def test_expected_output(self, df_test, expected):
Also tests that OneHotEncodingTransformer.transform does not modify unrelated columns.
"""
# transformer is fit on the whole dataset separately from the input df to work with the decorators
columns = ["b"]
df_train = d.create_df_7()
x = OneHotEncodingTransformer(columns="b")
x = OneHotEncodingTransformer(columns=columns)
x.fit(df_train)

df_transformed = x.transform(df_test)

for col in [
column + f"_{value}"
for column in columns
for value in df_train[column].unique().tolist()
]:
expected[col] = expected[col].astype(np.int8)

ta.equality.assert_frame_equal_msg(
expected=expected,
actual=df_transformed,
Expand Down Expand Up @@ -513,11 +526,19 @@ def test_unseen_categories_encoded_as_all_zeroes(self, df_test, expected):
"""Test OneHotEncodingTransformer.transform encodes unseen categories correctly (all 0s)."""
# transformer is fit on the whole dataset separately from the input df to work with the decorators
df_train = d.create_df_7()
x = OneHotEncodingTransformer(columns=["a", "b", "c"], verbose=False)
columns = ["a", "b", "c"]
x = OneHotEncodingTransformer(columns=columns, verbose=False)
x.fit(df_train)

df_transformed = x.transform(df_test)

for col in [
column + f"_{value}"
for column in columns
for value in df_train[column].unique().tolist()
]:
expected[col] = expected[col].astype(np.int8)

ta.equality.assert_equal_dispatch(
expected=expected,
actual=df_transformed,
Expand Down
9 changes: 8 additions & 1 deletion tubular/nominal.py
Original file line number Diff line number Diff line change
Expand Up @@ -1060,6 +1060,7 @@ def __init__(
drop_original: bool = False,
copy: bool = True,
verbose: bool = False,
dtype: np.int8 = np.int8,
**kwargs: dict[str, bool],
) -> None:
BaseNominalTransformer.__init__(
Expand All @@ -1070,7 +1071,13 @@ def __init__(
)

# Set attributes for scikit-learn's OneHotEncoder
OneHotEncoder.__init__(self, sparse=False, handle_unknown="ignore", **kwargs)
OneHotEncoder.__init__(
self,
sparse=False,
handle_unknown="ignore",
dtype=dtype,
**kwargs,
)

# Set other class attributes
self.separator = separator
Expand Down

0 comments on commit cebe0c0

Please sign in to comment.