Skip to content

Commit

Permalink
Merge pull request #177 from lvgig/feature/rare_encoder_forget_rare_cats
Browse files Browse the repository at this point in the history
edited GroupRareLevelsTransformer to forget rare categories when work…
  • Loading branch information
limlam96 authored Feb 8, 2024
2 parents 81003b9 + 66034ae commit 8efb2af
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 10 deletions.
7 changes: 6 additions & 1 deletion CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,18 @@ Subsections for each version can be one of the following;

Each individual change should have a link to the pull request after the description of the change.

1.2.1 (2024-02-08)
------------------
Added
^^^^^
- Updated GroupRareLevelsTransformer so that when working with category dtypes it forgets categories encoded as rare (this is wanted behaviour as these categories are no longer present in the data) `#177 <https://github.com/lvgig/tubular/pull/177>`_

1.2.0 (2024-02-06)
------------------
Added
^^^^^
- Update OneHotEncodingTransformer to default to returning int8 columns `#175 <https://github.com/lvgig/tubular/pull/175>`_
- Updated NullIndicator to return int8 columns `#173 <https://github.com/lvgig/tubular/pull/173>`_
- Update OneHotEncodingTransformer to default to returning int8 columns `#175 <https://github.com/lvgig/tubular/pull/175>`_
- Updated MeanResponseTransformer to coerce return to float (useful behaviour for category type features) `#174 <https://github.com/lvgig/tubular/pull/174>`_

1.1.1 (2024-01-18)
Expand Down
35 changes: 30 additions & 5 deletions tests/nominal/test_GroupRareLevelsTransformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ def expected_df_1():
df["c"] = pd.Series(
["a", "a", "c", "c", "e", "e", "rare", "rare", "rare", "rare"],
dtype=pd.CategoricalDtype(
categories=["a", "c", "e", "f", "g", "h", "rare"],
categories=["a", "c", "e", "rare"],
ordered=False,
),
)
Expand Down Expand Up @@ -361,19 +361,19 @@ def test_expected_output_no_weight_single_row_na_category_column(self):
one_row_df = pd.DataFrame({"b": [np.nan], "c": [np.NaN]})
one_row_df["c"] = one_row_df["c"].astype("category")

# add rare as a category in dataframe
one_row_df["c"] = one_row_df["c"].cat.add_categories("rare")

x = GroupRareLevelsTransformer(columns=["b", "c"], cut_off_percent=0.2)

# set the mapping dict directly rather than fitting x on df so test works with decorators
x.non_rare_levels = {"b": ["a", np.NaN], "c": ["e", "c", "a", np.NaN]}

one_row_df_transformed = x.transform(one_row_df)

expected_df = one_row_df.copy()
expected_df["c"] = expected_df["c"].cat.add_categories(x.rare_level_name)

ta.equality.assert_frame_equal_msg(
actual=one_row_df_transformed,
expected=one_row_df,
expected=expected_df,
msg_tag="Unexpected values in GroupRareLevelsTransformer.transform",
)

Expand Down Expand Up @@ -434,3 +434,28 @@ def test_expected_output_unseen_levels_not_encoded(self):
actual=list(df_transformed["b"]),
msg="Unseen levels are not left unchanged when unseen_levels_to_rare is set to false",
)

def test_rare_categories_forgotten(self):
    """Check that, for a category dtype column, levels encoded as rare are dropped from the column's categories."""

    column = "c"

    # levels this test expects to be grouped as rare and therefore
    # removed from the categorical dtype of the transformed column
    expected_removed_cats = ["c", "b"]

    df = d.create_df_8()

    x = GroupRareLevelsTransformer(columns=column, cut_off_percent=0.25)

    x.fit(df)

    output_df = x.transform(df)

    output_categories = output_df[column].dtype.categories

    # NOTE(review): the message interpolates x.classname without calling it;
    # if classname is a method rather than an attribute this will not render
    # the class name - confirm against the transformer base class.
    for cat in expected_removed_cats:
        assert (
            cat not in output_categories
        ), f"{x.classname} output columns should forget rare encoded categories, expected {cat} to be forgotten from column {column}"
2 changes: 1 addition & 1 deletion tubular/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "1.2.0"
__version__ = "1.2.1"
18 changes: 15 additions & 3 deletions tubular/nominal.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,19 +445,31 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame:
# for categorical dtypes have to set new category for the impute values first
# and convert back to the categorical type, otherwise it will convert to object
if "category" in X[c].dtype.name:

categories_before = X[c].dtype.categories

if self.rare_level_name not in X[c].cat.categories:
X[c] = X[c].cat.add_categories(self.rare_level_name)

dtype_before = X[c].dtype

X[c] = pd.Series(
data=np.where(
X[c].isin(self.non_rare_levels[c]),
X[c],
self.rare_level_name,
),
index=X.index,
).astype(dtype_before)
)

remaining_categories = [
category
for category in categories_before
if category in self.non_rare_levels[c]
]

X[c] = pd.Categorical(
X[c],
categories=remaining_categories + [self.rare_level_name],
)

else:
# using np.where converts np.NaN to str value if only one row of data frame is passed
Expand Down

0 comments on commit 8efb2af

Please sign in to comment.