Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

edited MeanResponseTransformer to convert category to float, fixed tests #174

Merged
merged 14 commits into from
Feb 6, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
140 changes: 110 additions & 30 deletions tests/nominal/test_MeanResponseTransformer.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from itertools import product

import numpy as np
import pandas as pd
import pytest
Expand All @@ -11,13 +13,26 @@

@pytest.fixture()
def learnt_mapping_dict():
return {
full_dict = {}

b_dict = {
"b": {"a": 1.0, "b": 2.0, "c": 3.0, "d": 4.0, "e": 5.0, "f": 6.0},
"b_blue": {"a": 1.0, "b": 1.0, "c": 0.0, "d": 0.0, "e": 0.0, "f": 0.0},
"b_yellow": {"a": 0.0, "b": 0.0, "c": 1.0, "d": 1.0, "e": 0.0, "f": 0.0},
"b_green": {"a": 0.0, "b": 0.0, "c": 0.0, "d": 0.0, "e": 1.0, "f": 1.0},
}

# c matches b, but is categorical
c_dict = {
"c" + suffix: b_dict["b" + suffix]
for suffix in ["", "_blue", "_yellow", "_green"]
}

full_dict.update(b_dict)
full_dict.update(c_dict)

return full_dict


@pytest.fixture()
def learnt_unseen_levels_encoding_dict_mean():
Expand Down Expand Up @@ -199,10 +214,13 @@ def test_output1(self):
assert_series_equal(expected, output)

def test_output2(self):
"Test output of method."
"Test output of method - for category dtype"
x = MeanResponseTransformer(columns="a", prior=0)

x.fit(X=pd.DataFrame({"a": [1, 2]}), y=pd.Series([2, 3]))
df = pd.DataFrame({"a": ["a", "b"]})
df["a"] = df["a"].astype("category")

x.fit(X=df, y=pd.Series([2, 3]))

expected1 = (1 * 1) / (1)

Expand Down Expand Up @@ -354,8 +372,9 @@ def test_correct_mappings_stored(
):
"Test that the mapping dictionary created in fit has the correct keys and values."
df = d.create_MeanResponseTransformer_test_df()
columns = ["b", "c"]
x = MeanResponseTransformer(
columns=["b"],
columns=columns,
level=level,
unseen_level_handling=unseen_level_handling,
)
Expand All @@ -364,15 +383,16 @@ def test_correct_mappings_stored(
if level:
if level == "all":
assert set(x.mapped_columns) == {
"b_blue",
"b_yellow",
"b_green",
prefix + "_" + suffix
for prefix, suffix in product(
columns,
df[target_column].unique().tolist(),
)
}, "Stored mapped columns are not as expected"

else:
assert set(x.mapped_columns) == {
"b_blue",
"b_yellow",
prefix + "_" + suffix for prefix, suffix in product(columns, level)
}, "Stored mapped columns are not as expected"

for column in x.mapped_columns:
Expand All @@ -381,7 +401,7 @@ def test_correct_mappings_stored(
assert actual == expected

else:
assert x.columns == ["b"], "Columns attribute changed in fit"
assert x.columns == columns, "Columns attribute changed in fit"

for column in x.columns:
actual = x.mappings[column]
Expand Down Expand Up @@ -1024,7 +1044,6 @@ def test_check_is_fitted_called(self, mocker):
x.transform(df)

def test_not_dataframe_error_raised(self):

df = d.create_MeanResponseTransformer_test_df()

x = MeanResponseTransformer(columns="b")
Expand Down Expand Up @@ -1086,7 +1105,8 @@ def test_learnt_values_not_modified(self):
)
def test_expected_output_binary_response(self, df, expected):
"""Test that the output is expected from transform with a binary response."""
x = MeanResponseTransformer(columns=["b", "d", "f"])
columns = ["b", "d", "f"]
x = MeanResponseTransformer(columns=columns)

# set the mappings dict directly rather than fitting x on df so test works with helpers
x.mappings = {
Expand All @@ -1097,6 +1117,9 @@ def test_expected_output_binary_response(self, df, expected):

df_transformed = x.transform(df)

for col in columns:
expected[col] = expected[col].astype(float)

ta.equality.assert_frame_equal_msg(
actual=df_transformed,
expected=expected,
Expand All @@ -1112,17 +1135,30 @@ def test_expected_output_binary_response(self, df, expected):
)
def test_expected_output_one_multi_level(self, df, expected):
"""Test that the output is expected from transform with a multi-level response and one level selected."""
x = MeanResponseTransformer(columns=["b", "f"], level=["blue"])
columns = ["b", "f"]
level = ["blue"]
x = MeanResponseTransformer(columns=columns, level=level)

for col in [
prefix + "_" + suffix for prefix, suffix in product(columns, level)
]:
expected[col] = expected[col].astype(float)

# set the mappings dict directly rather than fitting x on df so test works with helpers
x.mappings = {
"b_blue": {"a": 1, "b": 1, "c": 0, "d": 0, "e": 0, "f": 0},
"f_blue": {False: 2 / 3, True: 0},
}
x.response_levels = ["blue"]
x.response_levels = level
x.mapped_columns = list(x.mappings.keys())
df_transformed = x.transform(df)

for col in [
prefix + "_" + suffix
for prefix, suffix in product(columns, x.response_levels)
]:
expected[col] = expected[col].astype(float)

ta.equality.assert_frame_equal_msg(
actual=df_transformed,
expected=expected,
Expand All @@ -1139,7 +1175,8 @@ def test_expected_output_one_multi_level(self, df, expected):
)
def test_expected_output_all_levels(self, df, expected):
"""Test that the output is expected from transform for a multi-level response and all levels selected."""
x = MeanResponseTransformer(columns=["b", "f"], level="all")
columns = ["b", "f"]
x = MeanResponseTransformer(columns=columns, level="all")

# set the mappings dict directly rather than fitting x on df so test works with helpers
x.mappings = {
Expand All @@ -1155,8 +1192,11 @@ def test_expected_output_all_levels(self, df, expected):
x.mapped_columns = list(x.mappings.keys())
df_transformed = x.transform(df)

print(df_transformed)
print(expected)
for col in [
prefix + "_" + suffix
for prefix, suffix in product(columns, x.response_levels)
]:
expected[col] = expected[col].astype(float)

ta.equality.assert_frame_equal_msg(
actual=df_transformed,
Expand Down Expand Up @@ -1194,15 +1234,20 @@ def test_expected_output_sigle_level_response_unseen_levels_mean(
"""Test that the output is expected from transform with a single level response with unseen levels in data with
unseen_level_handling set to 'Mean'.
"""
columns = ["b", "d", "f"]
target = "a"
x = MeanResponseTransformer(
columns=["b", "d", "f"],
columns=columns,
unseen_level_handling="Mean",
)

initial_df = d.create_MeanResponseTransformer_test_df()
x.fit(initial_df, initial_df["a"])
x.fit(initial_df, initial_df[target])
df_transformed = x.transform(df)

for col in columns:
expected[col] = expected[col].astype(float)

ta.equality.assert_frame_equal_msg(
actual=df_transformed,
expected=expected,
Expand All @@ -1224,15 +1269,20 @@ def test_expected_output_sigle_level_response_unseen_levels_median(
"""Test that the output is expected from transform with a single level response with unseen levels in data
with unseen_level_handling set to 'Median'.
"""
columns = ["b", "d", "f"]
target = "a"
x = MeanResponseTransformer(
columns=["b", "d", "f"],
columns=columns,
unseen_level_handling="Median",
)

initial_df = d.create_MeanResponseTransformer_test_df()
x.fit(initial_df, initial_df["a"])
x.fit(initial_df, initial_df[target])
df_transformed = x.transform(df)

for col in columns:
expected[col] = expected[col].astype(float)

ta.equality.assert_frame_equal_msg(
actual=df_transformed,
expected=expected,
Expand All @@ -1254,15 +1304,20 @@ def test_expected_output_sigle_level_response_unseen_levels_lowest(
"""Test that the output is expected from transform with a single level response with unseen levels in data
with unseen_level_handling set to 'Lowest'.
"""
columns = ["b", "d", "f"]
target = "a"
x = MeanResponseTransformer(
columns=["b", "d", "f"],
columns=columns,
unseen_level_handling="Lowest",
)

initial_df = d.create_MeanResponseTransformer_test_df()
x.fit(initial_df, initial_df["a"])
x.fit(initial_df, initial_df[target])
df_transformed = x.transform(df)

for col in columns:
expected[col] = expected[col].astype(float)

ta.equality.assert_frame_equal_msg(
actual=df_transformed,
expected=expected,
Expand All @@ -1284,15 +1339,20 @@ def test_expected_output_sigle_level_response_unseen_levels_highest(
"""Test that the output is expected from transform with a single level response with unseen levels in data
with unseen_level_handling set to 'Highest'.
"""
columns = ["b", "d", "f"]
target = "a"
x = MeanResponseTransformer(
columns=["b", "d", "f"],
unseen_level_handling="Highest",
)

initial_df = d.create_MeanResponseTransformer_test_df()
x.fit(initial_df, initial_df["a"])
x.fit(initial_df, initial_df[target])
df_transformed = x.transform(df)

for col in columns:
expected[col] = expected[col].astype(float)

ta.equality.assert_frame_equal_msg(
actual=df_transformed,
expected=expected,
Expand All @@ -1314,12 +1374,17 @@ def test_expected_output_sigle_level_response_unseen_levels_arbitrary(
"""Test that the output is expected from transform with a single level response with unseen levels in data
with unseen_level_handling set to an arbitrary int/float value.
"""
x = MeanResponseTransformer(columns=["b", "d", "f"], unseen_level_handling=21.6)
columns = ["b", "d", "f"]
target = "a"
x = MeanResponseTransformer(columns=columns, unseen_level_handling=21.6)

initial_df = d.create_MeanResponseTransformer_test_df()
x.fit(initial_df, initial_df["a"])
x.fit(initial_df, initial_df[target])
df_transformed = x.transform(df)

for col in columns:
expected[col] = expected[col].astype(float)

ta.equality.assert_frame_equal_msg(
actual=df_transformed,
expected=expected,
Expand All @@ -1335,12 +1400,19 @@ def test_expected_output_sigle_level_response_unseen_levels_arbitrary(
)
def test_expected_output_one_multi_level_unseen_levels(self, df, expected):
"""Test that the output is expected from transform with a multi-level response and unseen levels and one level selected."""
columns = ["b", "f"]
level = ["blue"]
x = MeanResponseTransformer(
columns=["b", "f"],
level=["blue"],
columns=columns,
level=level,
unseen_level_handling="Mean",
)

for col in [
prefix + "_" + suffix for prefix, suffix in product(columns, level)
]:
expected[col] = expected[col].astype(float)

initial_df = d.create_MeanResponseTransformer_test_df()
x.fit(initial_df, initial_df["multi_level_response"])
df_transformed = x.transform(df)
Expand All @@ -1360,16 +1432,24 @@ def test_expected_output_one_multi_level_unseen_levels(self, df, expected):
)
def test_expected_output_all_multi_level_unseen_levels(self, df, expected):
"""Test that the output is expected from transform with a multi-level response and unseen levels and all level selected."""
columns = ["b", "f"]
target = "multi_level_response"
x = MeanResponseTransformer(
columns=["b", "f"],
columns=columns,
level="all",
unseen_level_handling="Highest",
)

initial_df = d.create_MeanResponseTransformer_test_df()
x.fit(initial_df, initial_df["multi_level_response"])
x.fit(initial_df, initial_df[target])
df_transformed = x.transform(df)

for col in [
prefix + "_" + suffix
for prefix, suffix in product(columns, initial_df[target].unique().tolist())
]:
expected[col] = expected[col].astype(float)

ta.equality.assert_frame_equal_msg(
actual=df_transformed,
expected=expected,
Expand Down
37 changes: 14 additions & 23 deletions tubular/nominal.py
Original file line number Diff line number Diff line change
Expand Up @@ -766,33 +766,24 @@ def fit(self, X: pd.DataFrame, y: pd.Series) -> pd.DataFrame:
self._fit_binary_response(X, y, self.columns)
self.encoded_feature_columns = self.columns

if self.unseen_level_handling == "Mean":
if isinstance(self.unseen_level_handling, (int, float)):
for c in self.encoded_feature_columns:
self.unseen_levels_encoding_dict[c] = (
X_temp[c].map(self.mappings[c]).mean()
)

elif self.unseen_level_handling == "Median":
self.unseen_levels_encoding_dict[c] = float(self.unseen_level_handling)
else:
for c in self.encoded_feature_columns:
self.unseen_levels_encoding_dict[c] = (
X_temp[c].map(self.mappings[c]).median()
)
X_temp[c] = X_temp[c].map(self.mappings[c]).astype(float)

elif self.unseen_level_handling == "Lowest":
for c in self.encoded_feature_columns:
self.unseen_levels_encoding_dict[c] = (
X_temp[c].map(self.mappings[c]).min()
)
if self.unseen_level_handling == "Mean":
self.unseen_levels_encoding_dict[c] = X_temp[c].mean()

elif self.unseen_level_handling == "Highest":
for c in self.encoded_feature_columns:
self.unseen_levels_encoding_dict[c] = (
X_temp[c].map(self.mappings[c]).max()
)
if self.unseen_level_handling == "Median":
self.unseen_levels_encoding_dict[c] = X_temp[c].median()

elif isinstance(self.unseen_level_handling, (int, float)):
for c in self.encoded_feature_columns:
self.unseen_levels_encoding_dict[c] = float(self.unseen_level_handling)
if self.unseen_level_handling == "Lowest":
self.unseen_levels_encoding_dict[c] = X_temp[c].min()

if self.unseen_level_handling == "Highest":
self.unseen_levels_encoding_dict[c] = X_temp[c].max()

return self

Expand All @@ -810,7 +801,7 @@ def map_imputation_values(self, X: pd.DataFrame) -> pd.DataFrame:
input dataframe with mappings applied
"""
for c in self.columns:
X[c] = X[c].map(self.mappings[c])
X[c] = X[c].map(self.mappings[c]).astype(float)

return X

Expand Down
Loading