From d00901e1faf354d2480297dedfc57994beec451a Mon Sep 17 00:00:00 2001 From: limlam96 <103185696+limlam96@users.noreply.github.com> Date: Fri, 2 Feb 2024 10:54:55 +0000 Subject: [PATCH 01/12] edited MeanResponseTransformer to convert category to float, fixed tests --- tests/nominal/test_MeanResponseTransformer.py | 140 ++++++++++++++---- tubular/nominal.py | 37 ++--- 2 files changed, 124 insertions(+), 53 deletions(-) diff --git a/tests/nominal/test_MeanResponseTransformer.py b/tests/nominal/test_MeanResponseTransformer.py index 287cede0..c010662a 100644 --- a/tests/nominal/test_MeanResponseTransformer.py +++ b/tests/nominal/test_MeanResponseTransformer.py @@ -1,3 +1,5 @@ +from itertools import product + import numpy as np import pandas as pd import pytest @@ -11,13 +13,26 @@ @pytest.fixture() def learnt_mapping_dict(): - return { + full_dict = {} + + b_dict = { "b": {"a": 1.0, "b": 2.0, "c": 3.0, "d": 4.0, "e": 5.0, "f": 6.0}, "b_blue": {"a": 1.0, "b": 1.0, "c": 0.0, "d": 0.0, "e": 0.0, "f": 0.0}, "b_yellow": {"a": 0.0, "b": 0.0, "c": 1.0, "d": 1.0, "e": 0.0, "f": 0.0}, "b_green": {"a": 0.0, "b": 0.0, "c": 0.0, "d": 0.0, "e": 1.0, "f": 1.0}, } + # c matches b, but is categorical + c_dict = { + "c" + suffix: b_dict["b" + suffix] + for suffix in ["", "_blue", "_yellow", "_green"] + } + + full_dict.update(b_dict) + full_dict.update(c_dict) + + return full_dict + @pytest.fixture() def learnt_unseen_levels_encoding_dict_mean(): @@ -199,10 +214,13 @@ def test_output1(self): assert_series_equal(expected, output) def test_output2(self): - "Test output of method." 
+ "Test output of method - for category dtype" x = MeanResponseTransformer(columns="a", prior=0) - x.fit(X=pd.DataFrame({"a": [1, 2]}), y=pd.Series([2, 3])) + df = pd.DataFrame({"a": ["a", "b"]}) + df["a"] = df["a"].astype("category") + + x.fit(X=df, y=pd.Series([2, 3])) expected1 = (1 * 1) / (1) @@ -354,8 +372,9 @@ def test_correct_mappings_stored( ): "Test that the mapping dictionary created in fit has the correct keys and values." df = d.create_MeanResponseTransformer_test_df() + columns = ["b", "c"] x = MeanResponseTransformer( - columns=["b"], + columns=columns, level=level, unseen_level_handling=unseen_level_handling, ) @@ -364,15 +383,16 @@ def test_correct_mappings_stored( if level: if level == "all": assert set(x.mapped_columns) == { - "b_blue", - "b_yellow", - "b_green", + prefix + "_" + suffix + for prefix, suffix in product( + columns, + df[target_column].unique().tolist(), + ) }, "Stored mapped columns are not as expected" else: assert set(x.mapped_columns) == { - "b_blue", - "b_yellow", + prefix + "_" + suffix for prefix, suffix in product(columns, level) }, "Stored mapped columns are not as expected" for column in x.mapped_columns: @@ -381,7 +401,7 @@ def test_correct_mappings_stored( assert actual == expected else: - assert x.columns == ["b"], "Columns attribute changed in fit" + assert x.columns == columns, "Columns attribute changed in fit" for column in x.columns: actual = x.mappings[column] @@ -1024,7 +1044,6 @@ def test_check_is_fitted_called(self, mocker): x.transform(df) def test_not_dataframe_error_raised(self): - df = d.create_MeanResponseTransformer_test_df() x = MeanResponseTransformer(columns="b") @@ -1086,7 +1105,8 @@ def test_learnt_values_not_modified(self): ) def test_expected_output_binary_response(self, df, expected): """Test that the output is expected from transform with a binary response.""" - x = MeanResponseTransformer(columns=["b", "d", "f"]) + columns = ["b", "d", "f"] + x = MeanResponseTransformer(columns=columns) # set the 
impute values dict directly rather than fitting x on df so test works with helpers x.mappings = { @@ -1097,6 +1117,9 @@ def test_expected_output_binary_response(self, df, expected): df_transformed = x.transform(df) + for col in columns: + expected[col] = expected[col].astype(float) + ta.equality.assert_frame_equal_msg( actual=df_transformed, expected=expected, @@ -1112,17 +1135,30 @@ def test_expected_output_binary_response(self, df, expected): ) def test_expected_output_one_multi_level(self, df, expected): """Test that the output is expected from transform with a multi-level response and one level selected.""" - x = MeanResponseTransformer(columns=["b", "f"], level=["blue"]) + columns = ["b", "f"] + level = ["blue"] + x = MeanResponseTransformer(columns=columns, level=level) + + for col in [ + prefix + "_" + suffix for prefix, suffix in product(columns, level) + ]: + expected[col] = expected[col].astype(float) # set the impute values dict directly rather than fitting x on df so test works with helpers x.mappings = { "b_blue": {"a": 1, "b": 1, "c": 0, "d": 0, "e": 0, "f": 0}, "f_blue": {False: 2 / 3, True: 0}, } - x.response_levels = ["blue"] + x.response_levels = level x.mapped_columns = list(x.mappings.keys()) df_transformed = x.transform(df) + for col in [ + prefix + "_" + suffix + for prefix, suffix in product(columns, x.response_levels) + ]: + expected[col] = expected[col].astype(float) + ta.equality.assert_frame_equal_msg( actual=df_transformed, expected=expected, @@ -1139,7 +1175,8 @@ def test_expected_output_one_multi_level(self, df, expected): ) def test_expected_output_all_levels(self, df, expected): """Test that the output is expected from transform for a multi-level response and all levels selected.""" - x = MeanResponseTransformer(columns=["b", "f"], level="all") + columns = ["b", "f"] + x = MeanResponseTransformer(columns=columns, level="all") # set the impute values dict directly rather than fitting x on df so test works with helpers x.mappings = { 
@@ -1155,8 +1192,11 @@ def test_expected_output_all_levels(self, df, expected): x.mapped_columns = list(x.mappings.keys()) df_transformed = x.transform(df) - print(df_transformed) - print(expected) + for col in [ + prefix + "_" + suffix + for prefix, suffix in product(columns, x.response_levels) + ]: + expected[col] = expected[col].astype(float) ta.equality.assert_frame_equal_msg( actual=df_transformed, @@ -1194,15 +1234,20 @@ def test_expected_output_sigle_level_response_unseen_levels_mean( """Test that the output is expected from transform with a single level response with unseen levels in data with unseen_level_handling set to 'Mean'. """ + columns = ["b", "d", "f"] + target = "a" x = MeanResponseTransformer( - columns=["b", "d", "f"], + columns=columns, unseen_level_handling="Mean", ) initial_df = d.create_MeanResponseTransformer_test_df() - x.fit(initial_df, initial_df["a"]) + x.fit(initial_df, initial_df[target]) df_transformed = x.transform(df) + for col in columns: + expected[col] = expected[col].astype(float) + ta.equality.assert_frame_equal_msg( actual=df_transformed, expected=expected, @@ -1224,15 +1269,20 @@ def test_expected_output_sigle_level_response_unseen_levels_median( """Test that the output is expected from transform with a single level response with unseen levels in data with unseen_level_handling set to 'Median'. 
""" + columns = ["b", "d", "f"] + target = "a" x = MeanResponseTransformer( - columns=["b", "d", "f"], + columns=columns, unseen_level_handling="Median", ) initial_df = d.create_MeanResponseTransformer_test_df() - x.fit(initial_df, initial_df["a"]) + x.fit(initial_df, initial_df[target]) df_transformed = x.transform(df) + for col in columns: + expected[col] = expected[col].astype(float) + ta.equality.assert_frame_equal_msg( actual=df_transformed, expected=expected, @@ -1254,15 +1304,20 @@ def test_expected_output_sigle_level_response_unseen_levels_lowest( """Test that the output is expected from transform with a single level response with unseen levels in data with unseen_level_handling set to 'Lowest'. """ + columns = ["b", "d", "f"] + target = "a" x = MeanResponseTransformer( - columns=["b", "d", "f"], + columns=columns, unseen_level_handling="Lowest", ) initial_df = d.create_MeanResponseTransformer_test_df() - x.fit(initial_df, initial_df["a"]) + x.fit(initial_df, initial_df[target]) df_transformed = x.transform(df) + for col in columns: + expected[col] = expected[col].astype(float) + ta.equality.assert_frame_equal_msg( actual=df_transformed, expected=expected, @@ -1284,15 +1339,20 @@ def test_expected_output_sigle_level_response_unseen_levels_highest( """Test that the output is expected from transform with a single level response with unseen levels in data with unseen_level_handling set to 'Highest'. 
""" + columns = ["b", "d", "f"] + target = "a" x = MeanResponseTransformer( columns=["b", "d", "f"], unseen_level_handling="Highest", ) initial_df = d.create_MeanResponseTransformer_test_df() - x.fit(initial_df, initial_df["a"]) + x.fit(initial_df, initial_df[target]) df_transformed = x.transform(df) + for col in columns: + expected[col] = expected[col].astype(float) + ta.equality.assert_frame_equal_msg( actual=df_transformed, expected=expected, @@ -1314,12 +1374,17 @@ def test_expected_output_sigle_level_response_unseen_levels_arbitrary( """Test that the output is expected from transform with a single level response with unseen levels in data with unseen_level_handling set to an arbitrary int/float value'. """ - x = MeanResponseTransformer(columns=["b", "d", "f"], unseen_level_handling=21.6) + columns = ["b", "d", "f"] + target = "a" + x = MeanResponseTransformer(columns=columns, unseen_level_handling=21.6) initial_df = d.create_MeanResponseTransformer_test_df() - x.fit(initial_df, initial_df["a"]) + x.fit(initial_df, initial_df[target]) df_transformed = x.transform(df) + for col in columns: + expected[col] = expected[col].astype(float) + ta.equality.assert_frame_equal_msg( actual=df_transformed, expected=expected, @@ -1335,12 +1400,19 @@ def test_expected_output_sigle_level_response_unseen_levels_arbitrary( ) def test_expected_output_one_multi_level_unseen_levels(self, df, expected): """Test that the output is expected from transform with a multi-level response and unseen levels and one level selected.""" + columns = ["b", "f"] + level = ["blue"] x = MeanResponseTransformer( - columns=["b", "f"], - level=["blue"], + columns=columns, + level=level, unseen_level_handling="Mean", ) + for col in [ + prefix + "_" + suffix for prefix, suffix in product(columns, level) + ]: + expected[col] = expected[col].astype(float) + initial_df = d.create_MeanResponseTransformer_test_df() x.fit(initial_df, initial_df["multi_level_response"]) df_transformed = x.transform(df) @@ 
-1360,16 +1432,24 @@ def test_expected_output_one_multi_level_unseen_levels(self, df, expected): ) def test_expected_output_all_multi_level_unseen_levels(self, df, expected): """Test that the output is expected from transform with a multi-level response and unseen levels and all level selected.""" + columns = ["b", "f"] + target = "multi_level_response" x = MeanResponseTransformer( - columns=["b", "f"], + columns=columns, level="all", unseen_level_handling="Highest", ) initial_df = d.create_MeanResponseTransformer_test_df() - x.fit(initial_df, initial_df["multi_level_response"]) + x.fit(initial_df, initial_df[target]) df_transformed = x.transform(df) + for col in [ + prefix + "_" + suffix + for prefix, suffix in product(columns, initial_df[target].unique().tolist()) + ]: + expected[col] = expected[col].astype(float) + ta.equality.assert_frame_equal_msg( actual=df_transformed, expected=expected, diff --git a/tubular/nominal.py b/tubular/nominal.py index 93229a5d..b4b4ce66 100644 --- a/tubular/nominal.py +++ b/tubular/nominal.py @@ -766,33 +766,24 @@ def fit(self, X: pd.DataFrame, y: pd.Series) -> pd.DataFrame: self._fit_binary_response(X, y, self.columns) self.encoded_feature_columns = self.columns - if self.unseen_level_handling == "Mean": + if isinstance(self.unseen_level_handling, (int, float)): for c in self.encoded_feature_columns: - self.unseen_levels_encoding_dict[c] = ( - X_temp[c].map(self.mappings[c]).mean() - ) - - elif self.unseen_level_handling == "Median": + self.unseen_levels_encoding_dict[c] = float(self.unseen_level_handling) + else: for c in self.encoded_feature_columns: - self.unseen_levels_encoding_dict[c] = ( - X_temp[c].map(self.mappings[c]).median() - ) + X_temp[c] = X_temp[c].map(self.mappings[c]).astype(float) - elif self.unseen_level_handling == "Lowest": - for c in self.encoded_feature_columns: - self.unseen_levels_encoding_dict[c] = ( - X_temp[c].map(self.mappings[c]).min() - ) + if self.unseen_level_handling == "Mean": + 
self.unseen_levels_encoding_dict[c] = X_temp[c].mean() - elif self.unseen_level_handling == "Highest": - for c in self.encoded_feature_columns: - self.unseen_levels_encoding_dict[c] = ( - X_temp[c].map(self.mappings[c]).max() - ) + if self.unseen_level_handling == "Median": + self.unseen_levels_encoding_dict[c] = X_temp[c].median() - elif isinstance(self.unseen_level_handling, (int, float)): - for c in self.encoded_feature_columns: - self.unseen_levels_encoding_dict[c] = float(self.unseen_level_handling) + if self.unseen_level_handling == "Lowest": + self.unseen_levels_encoding_dict[c] = X_temp[c].min() + + if self.unseen_level_handling == "Highest": + self.unseen_levels_encoding_dict[c] = X_temp[c].max() return self @@ -810,7 +801,7 @@ def map_imputation_values(self, X: pd.DataFrame) -> pd.DataFrame: input dataframe with mappings applied """ for c in self.columns: - X[c] = X[c].map(self.mappings[c]) + X[c] = X[c].map(self.mappings[c]).astype(float) return X From 4aaa91a0c2ed0292125cacd29df747f9bec13fc7 Mon Sep 17 00:00:00 2001 From: limlam96 <103185696+limlam96@users.noreply.github.com> Date: Mon, 5 Feb 2024 10:55:36 +0000 Subject: [PATCH 02/12] updated version and changelog --- CHANGELOG.rst | 7 +++++++ tubular/_version.py | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 19d5fe8e..f432f20f 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -16,6 +16,13 @@ Subsections for each version can be one of the following; Each individual change should have a link to the pull request after the description of the change. 
+1.1.2 (2024-02-07) +------------------ + +Added +^^^^^ +- Updated MeanResponseTransformer to coerce return to float (useful behaviour for category type features) https://github.com/lvgig/tubular/pull/174 + 1.1.1 (2024-01-18) ------------------ diff --git a/tubular/_version.py b/tubular/_version.py index a82b376d..72f26f59 100644 --- a/tubular/_version.py +++ b/tubular/_version.py @@ -1 +1 @@ -__version__ = "1.1.1" +__version__ = "1.1.2" From 944393e2ce85f329b7303e9e4376442ecb5c6bbb Mon Sep 17 00:00:00 2001 From: limlam96 <103185696+limlam96@users.noreply.github.com> Date: Mon, 5 Feb 2024 11:32:43 +0000 Subject: [PATCH 03/12] updated change log format --- CHANGELOG.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index f432f20f..63a8e9f4 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -21,7 +21,7 @@ Each individual change should have a link to the pull request after the descript Added ^^^^^ -- Updated MeanResponseTransformer to coerce return to float (useful behaviour for category type features) https://github.com/lvgig/tubular/pull/174 +- Updated MeanResponseTransformer to coerce return to float (useful behaviour for category type features) `#174 <https://github.com/lvgig/tubular/pull/174>`_ 1.1.1 (2024-01-18) ------------------ From 46f02c7eef8cd3bfe83ff27306d25353b05bf75e Mon Sep 17 00:00:00 2001 From: limlam96 <103185696+limlam96@users.noreply.github.com> Date: Mon, 5 Feb 2024 15:15:42 +0000 Subject: [PATCH 04/12] tidied definition of 'expected_created_cols' in MeanResponseTransformer tests --- tests/nominal/test_MeanResponseTransformer.py | 53 +++++++++++-------- 1 file changed, 32 insertions(+), 21 deletions(-) diff --git a/tests/nominal/test_MeanResponseTransformer.py b/tests/nominal/test_MeanResponseTransformer.py index c010662a..08e87ad8 100644 --- a/tests/nominal/test_MeanResponseTransformer.py +++ b/tests/nominal/test_MeanResponseTransformer.py @@ -382,18 +382,24 @@ def test_correct_mappings_stored( if level: if level == "all": - assert
set(x.mapped_columns) == { + expected_created_cols = { prefix + "_" + suffix for prefix, suffix in product( columns, df[target_column].unique().tolist(), ) - }, "Stored mapped columns are not as expected" + } + assert ( + set(x.mapped_columns) == expected_created_cols + ), "Stored mapped columns are not as expected" else: - assert set(x.mapped_columns) == { + expected_created_cols = { prefix + "_" + suffix for prefix, suffix in product(columns, level) - }, "Stored mapped columns are not as expected" + } + assert ( + set(x.mapped_columns) == expected_created_cols + ), "Stored mapped columns are not as expected" for column in x.mapped_columns: actual = x.mappings[column] @@ -1137,11 +1143,12 @@ def test_expected_output_one_multi_level(self, df, expected): """Test that the output is expected from transform with a multi-level response and one level selected.""" columns = ["b", "f"] level = ["blue"] + expected_created_cols = [ + prefix + "_" + suffix for prefix, suffix in product(columns, level) + ] x = MeanResponseTransformer(columns=columns, level=level) - for col in [ - prefix + "_" + suffix for prefix, suffix in product(columns, level) - ]: + for col in expected_created_cols: expected[col] = expected[col].astype(float) # set the impute values dict directly rather than fitting x on df so test works with helpers @@ -1152,11 +1159,12 @@ def test_expected_output_one_multi_level(self, df, expected): x.response_levels = level x.mapped_columns = list(x.mappings.keys()) df_transformed = x.transform(df) - - for col in [ + new_expected_created_cols = [ prefix + "_" + suffix for prefix, suffix in product(columns, x.response_levels) - ]: + ] + + for col in new_expected_created_cols: expected[col] = expected[col].astype(float) ta.equality.assert_frame_equal_msg( @@ -1191,11 +1199,12 @@ def test_expected_output_all_levels(self, df, expected): x.response_levels = ["blue", "green", "yellow"] x.mapped_columns = list(x.mappings.keys()) df_transformed = x.transform(df) - - for col in 
[ + expected_created_cols = [ prefix + "_" + suffix for prefix, suffix in product(columns, x.response_levels) - ]: + ] + + for col in expected_created_cols: expected[col] = expected[col].astype(float) ta.equality.assert_frame_equal_msg( @@ -1402,15 +1411,16 @@ def test_expected_output_one_multi_level_unseen_levels(self, df, expected): """Test that the output is expected from transform with a multi-level response and unseen levels and one level selected.""" columns = ["b", "f"] level = ["blue"] + expected_created_cols = [ + prefix + "_" + suffix for prefix, suffix in product(columns, level) + ] x = MeanResponseTransformer( columns=columns, level=level, unseen_level_handling="Mean", ) - for col in [ - prefix + "_" + suffix for prefix, suffix in product(columns, level) - ]: + for col in expected_created_cols: expected[col] = expected[col].astype(float) initial_df = d.create_MeanResponseTransformer_test_df() @@ -1434,20 +1444,21 @@ def test_expected_output_all_multi_level_unseen_levels(self, df, expected): """Test that the output is expected from transform with a multi-level response and unseen levels and all level selected.""" columns = ["b", "f"] target = "multi_level_response" + initial_df = d.create_MeanResponseTransformer_test_df() + expected_created_cols = [ + prefix + "_" + suffix + for prefix, suffix in product(columns, initial_df[target].unique().tolist()) + ] x = MeanResponseTransformer( columns=columns, level="all", unseen_level_handling="Highest", ) - initial_df = d.create_MeanResponseTransformer_test_df() x.fit(initial_df, initial_df[target]) df_transformed = x.transform(df) - for col in [ - prefix + "_" + suffix - for prefix, suffix in product(columns, initial_df[target].unique().tolist()) - ]: + for col in expected_created_cols: expected[col] = expected[col].astype(float) ta.equality.assert_frame_equal_msg( From d83e552b3724bce883b33ac71b906f2426e6feb7 Mon Sep 17 00:00:00 2001 From: limlam96 <103185696+limlam96@users.noreply.github.com> Date: Tue, 6 Feb 
2024 09:20:58 +0000 Subject: [PATCH 05/12] PR comments - parametrised MRE prior encoding test to try both cat and object --- tests/nominal/test_MeanResponseTransformer.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/nominal/test_MeanResponseTransformer.py b/tests/nominal/test_MeanResponseTransformer.py index 08e87ad8..223a6d1b 100644 --- a/tests/nominal/test_MeanResponseTransformer.py +++ b/tests/nominal/test_MeanResponseTransformer.py @@ -213,12 +213,16 @@ def test_output1(self): assert_series_equal(expected, output) - def test_output2(self): - "Test output of method - for category dtype" + @pytest.mark.parametrize( + "dtype", + ["object", "category"], + ) + def test_output2(self, dtype): + "Test output of method - for category and object dtypes" x = MeanResponseTransformer(columns="a", prior=0) df = pd.DataFrame({"a": ["a", "b"]}) - df["a"] = df["a"].astype("category") + df["a"] = df["a"].astype(dtype) x.fit(X=df, y=pd.Series([2, 3])) From 90a6e209721105d80c195fdcf9acd40d22741110 Mon Sep 17 00:00:00 2001 From: limlam96 <103185696+limlam96@users.noreply.github.com> Date: Tue, 6 Feb 2024 11:03:01 +0000 Subject: [PATCH 06/12] edited MRE transformer to have option to control return type --- tests/nominal/test_MeanResponseTransformer.py | 221 +++++++++++++----- tubular/nominal.py | 32 ++- 2 files changed, 197 insertions(+), 56 deletions(-) diff --git a/tests/nominal/test_MeanResponseTransformer.py b/tests/nominal/test_MeanResponseTransformer.py index 223a6d1b..ac86770a 100644 --- a/tests/nominal/test_MeanResponseTransformer.py +++ b/tests/nominal/test_MeanResponseTransformer.py @@ -36,52 +36,71 @@ def learnt_mapping_dict(): @pytest.fixture() def learnt_unseen_levels_encoding_dict_mean(): - return { + return_dict = { "b": (1.0 + 2.0 + 3.0 + 4.0 + 5.0 + 6.0) / 6, "b_blue": (1.0 + 1.0 + 0.0 + 0.0 + 0.0 + 0.0) / 6, "b_yellow": (0.0 + 0.0 + 1.0 + 1.0 + 0.0 + 0.0) / 6, "b_green": (0.0 + 0.0 + 0.0 + 0.0 + 1.0 + 1.0) / 6, } + for key in 
return_dict: + return_dict[key] = np.float32(return_dict[key]) + return return_dict + @pytest.fixture() def learnt_unseen_levels_encoding_dict_median(): - return { + return_dict = { "b": (3.0 + 4.0) / 2, "b_blue": (0.0 + 0.0) / 2, "b_yellow": (0.0 + 0.0) / 2, "b_green": (0.0 + 0.0) / 2, } + for key in return_dict: + return_dict[key] = np.float32(return_dict[key]) + return return_dict + @pytest.fixture() def learnt_unseen_levels_encoding_dict_highest(): - return { + return_dict = { "b": 6.0, "b_blue": 1.0, "b_yellow": 1.0, "b_green": 1.0, } + for key in return_dict: + return_dict[key] = np.float32(return_dict[key]) + return return_dict @pytest.fixture() def learnt_unseen_levels_encoding_dict_lowest(): - return { + + return_dict = { "b": 1.0, "b_blue": 0.0, "b_yellow": 0.0, "b_green": 0.0, } + for key in return_dict: + return_dict[key] = np.float32(return_dict[key]) + return return_dict + @pytest.fixture() def learnt_unseen_levels_encoding_dict_arbitrary(): - return { + return_dict = { "b": 22.0, "b_blue": 22.0, "b_yellow": 22.0, "b_green": 22.0, } + for key in return_dict: + return_dict[key] = np.float32(return_dict[key]) + return return_dict class TestInit: @@ -151,6 +170,14 @@ def test_unseen_level_handling_incorrect_value_error(self): ): MeanResponseTransformer(unseen_level_handling="AAA") + def test_return_type_handling_incorrect_value_error(self): + """Test that an exception is raised if return_type is an incorrect value.""" + with pytest.raises( + ValueError, + match="return_type should be one of: 'float64', 'float32'", + ): + MeanResponseTransformer(return_type="int") + def test_values_passed_in_init_set_to_attribute(self): """Test that the values passed in init are saved in an attribute of the same name.""" x = MeanResponseTransformer( @@ -158,6 +185,7 @@ def test_values_passed_in_init_set_to_attribute(self): prior=1, level="any", unseen_level_handling="Mean", + return_type="float32", ) ta.classes.test_object_attributes( @@ -167,6 +195,8 @@ def 
test_values_passed_in_init_set_to_attribute(self): "prior": 1, "level": "any", "unseen_level_handling": "Mean", + "return_type": "float32", + "cast_method": np.float32, }, msg="Attributes for MeanResponseTransformer set in init", ) @@ -214,12 +244,17 @@ def test_output1(self): assert_series_equal(expected, output) @pytest.mark.parametrize( - "dtype", - ["object", "category"], + ("dtype", "return_type"), + [ + ("object", "float64"), + ("object", "float32"), + ("category", "float64"), + ("category", "float32"), + ], ) - def test_output2(self, dtype): + def test_output2(self, dtype, return_type): "Test output of method - for category and object dtypes" - x = MeanResponseTransformer(columns="a", prior=0) + x = MeanResponseTransformer(columns="a", prior=0, return_type=return_type) df = pd.DataFrame({"a": ["a", "b"]}) df["a"] = df["a"].astype(dtype) @@ -518,14 +553,22 @@ def test_learnt_values(self): x._fit_binary_response(df, df["a"], x.columns) + expected_mappings = { + "b": {"a": 1.0, "b": 2.0, "c": 3.0, "d": 4.0, "e": 5.0, "f": 6.0}, + "d": {1: 1.0, 2: 2.0, 3: 3.0, 4: 4.0, 5: 5.0, 6: 6.0}, + "f": {False: 2.0, True: 5.0}, + } + + for key in expected_mappings: + for value in expected_mappings[key]: + expected_mappings[key][value] = x.cast_method( + expected_mappings[key][value], + ) + ta.classes.test_object_attributes( obj=x, expected_attributes={ - "mappings": { - "b": {"a": 1.0, "b": 2.0, "c": 3.0, "d": 4.0, "e": 5.0, "f": 6.0}, - "d": {1: 1.0, 2: 2.0, 3: 3.0, 4: 4.0, 5: 5.0, 6: 6.0}, - "f": {False: 2.0, True: 5.0}, - }, + "mappings": expected_mappings, "global_mean": np.float64(3.5), }, msg="mappings attribute", @@ -541,28 +584,35 @@ def test_learnt_values_prior_no_weight(self): x._fit_binary_response(df, df["a"], x.columns) + expected_mappings = { + "b": { + "a": 37 / 12, + "b": 13 / 4, + "c": 41 / 12, + "d": 43 / 12, + "e": 15 / 4, + "f": 47 / 12, + }, + "d": { + 1: 37 / 12, + 2: 13 / 4, + 3: 41 / 12, + 4: 43 / 12, + 5: 15 / 4, + 6: 47 / 12, + }, + "f": {False: 47 
/ 16, True: 65 / 16}, + } + for key in expected_mappings: + for value in expected_mappings[key]: + expected_mappings[key][value] = x.cast_method( + expected_mappings[key][value], + ) + ta.classes.test_object_attributes( obj=x, expected_attributes={ - "mappings": { - "b": { - "a": 37 / 12, - "b": 13 / 4, - "c": 41 / 12, - "d": 43 / 12, - "e": 15 / 4, - "f": 47 / 12, - }, - "d": { - 1: 37 / 12, - 2: 13 / 4, - 3: 41 / 12, - 4: 43 / 12, - 5: 15 / 4, - 6: 47 / 12, - }, - "f": {False: 47 / 16, True: 65 / 16}, - }, + "mappings": expected_mappings, "global_mean": np.float64(3.5), }, msg="mappings attribute", @@ -578,14 +628,22 @@ def test_learnt_values_no_prior_weight(self): x._fit_binary_response(df, df["a"], x.columns) + expected_mappings = { + "b": {"a": 1.0, "b": 2.0, "c": 3.0, "d": 4.0, "e": 5.0, "f": 6.0}, + "d": {1: 1.0, 2: 2.0, 3: 3.0, 4: 4.0, 5: 5.0, 6: 6.0}, + "f": {False: 14 / 6, True: 77 / 15}, + } + + for key in expected_mappings: + for value in expected_mappings[key]: + expected_mappings[key][value] = x.cast_method( + expected_mappings[key][value], + ) + ta.classes.test_object_attributes( obj=x, expected_attributes={ - "mappings": { - "b": {"a": 1.0, "b": 2.0, "c": 3.0, "d": 4.0, "e": 5.0, "f": 6.0}, - "d": {1: 1.0, 2: 2.0, 3: 3.0, 4: 4.0, 5: 5.0, 6: 6.0}, - "f": {False: 14 / 6, True: 77 / 15}, - }, + "mappings": expected_mappings, }, msg="mappings attribute", ) @@ -606,13 +664,20 @@ def test_learnt_values_prior_weight(self): x._fit_binary_response(df, df["a"], x.columns) + expected_mappings = { + "d": {1: 7 / 2, 2: 11 / 3, 3: 23 / 6, 4: 4.0, 5: 30 / 7, 6: 32 / 7}, + "f": {False: 13 / 4, True: 50 / 11}, + } + for key in expected_mappings: + for value in expected_mappings[key]: + expected_mappings[key][value] = x.cast_method( + expected_mappings[key][value], + ) + ta.classes.test_object_attributes( obj=x, expected_attributes={ - "mappings": { - "d": {1: 7 / 2, 2: 11 / 3, 3: 23 / 6, 4: 4.0, 5: 30 / 7, 6: 32 / 7}, - "f": {False: 13 / 4, True: 50 / 11}, - }, + 
"mappings": expected_mappings, "global_mean": np.float64(4.0), }, msg="mappings attribute", @@ -1128,7 +1193,7 @@ def test_expected_output_binary_response(self, df, expected): df_transformed = x.transform(df) for col in columns: - expected[col] = expected[col].astype(float) + expected[col] = expected[col].astype(x.return_type) ta.equality.assert_frame_equal_msg( actual=df_transformed, @@ -1153,7 +1218,7 @@ def test_expected_output_one_multi_level(self, df, expected): x = MeanResponseTransformer(columns=columns, level=level) for col in expected_created_cols: - expected[col] = expected[col].astype(float) + expected[col] = expected[col].astype(x.return_type) # set the impute values dict directly rather than fitting x on df so test works with helpers x.mappings = { @@ -1169,7 +1234,7 @@ def test_expected_output_one_multi_level(self, df, expected): ] for col in new_expected_created_cols: - expected[col] = expected[col].astype(float) + expected[col] = expected[col].astype(x.return_type) ta.equality.assert_frame_equal_msg( actual=df_transformed, @@ -1209,7 +1274,7 @@ def test_expected_output_all_levels(self, df, expected): ] for col in expected_created_cols: - expected[col] = expected[col].astype(float) + expected[col] = expected[col].astype(x.return_type) ta.equality.assert_frame_equal_msg( actual=df_transformed, @@ -1259,7 +1324,7 @@ def test_expected_output_sigle_level_response_unseen_levels_mean( df_transformed = x.transform(df) for col in columns: - expected[col] = expected[col].astype(float) + expected[col] = expected[col].astype(x.return_type) ta.equality.assert_frame_equal_msg( actual=df_transformed, @@ -1294,7 +1359,7 @@ def test_expected_output_sigle_level_response_unseen_levels_median( df_transformed = x.transform(df) for col in columns: - expected[col] = expected[col].astype(float) + expected[col] = expected[col].astype(x.return_type) ta.equality.assert_frame_equal_msg( actual=df_transformed, @@ -1329,7 +1394,7 @@ def 
test_expected_output_sigle_level_response_unseen_levels_lowest( df_transformed = x.transform(df) for col in columns: - expected[col] = expected[col].astype(float) + expected[col] = expected[col].astype(x.return_type) ta.equality.assert_frame_equal_msg( actual=df_transformed, @@ -1364,7 +1429,7 @@ def test_expected_output_sigle_level_response_unseen_levels_highest( df_transformed = x.transform(df) for col in columns: - expected[col] = expected[col].astype(float) + expected[col] = expected[col].astype(x.return_type) ta.equality.assert_frame_equal_msg( actual=df_transformed, @@ -1396,7 +1461,7 @@ def test_expected_output_sigle_level_response_unseen_levels_arbitrary( df_transformed = x.transform(df) for col in columns: - expected[col] = expected[col].astype(float) + expected[col] = expected[col].astype(x.return_type) ta.equality.assert_frame_equal_msg( actual=df_transformed, @@ -1425,7 +1490,7 @@ def test_expected_output_one_multi_level_unseen_levels(self, df, expected): ) for col in expected_created_cols: - expected[col] = expected[col].astype(float) + expected[col] = expected[col].astype(x.return_type) initial_df = d.create_MeanResponseTransformer_test_df() x.fit(initial_df, initial_df["multi_level_response"]) @@ -1463,7 +1528,7 @@ def test_expected_output_all_multi_level_unseen_levels(self, df, expected): df_transformed = x.transform(df) for col in expected_created_cols: - expected[col] = expected[col].astype(float) + expected[col] = expected[col].astype(x.return_type) ta.equality.assert_frame_equal_msg( actual=df_transformed, @@ -1486,3 +1551,53 @@ def test_nulls_introduced_in_transform_error(self): match="MeanResponseTransformer: nulls would be introduced into column b from levels not present in mapping", ): x.transform(df) + + @pytest.mark.parametrize( + "prior, level, target, unseen_level_handling", + [ + (5, "all", "c", "Mean"), + (100, ["a", "b"], "c", "Lowest"), + (1, None, "a", "Highest"), + (0, None, "a", "Median"), + ], + ) + def 
test_return_type_can_be_changed( + self, + prior, + level, + target, + unseen_level_handling, + ): + "Test that output return types are controlled by return_type param, this defaults to float32 so test float64 here" + + df = d.create_MeanResponseTransformer_test_df() + + columns = ["b", "d", "f"] + x = MeanResponseTransformer( + columns=columns, + return_type="float64", + prior=prior, + unseen_level_handling=unseen_level_handling, + level=level, + ) + + x.fit(df, df[target]) + + output_df = x.transform(df) + + if target == "c": + actual_levels = df[target].unique().tolist() if level == "all" else level + expected_created_cols = [ + prefix + "_" + suffix + for prefix, suffix in product(columns, actual_levels) + ] + + else: + expected_created_cols = columns + + for col in expected_created_cols: + expected_type = x.return_type + actual_type = output_df[col].dtype.name + assert ( + actual_type == expected_type + ), f"{x.classname} should output columns with type determine by the return_type param, expected {expected_type} but got {actual_type}" diff --git a/tubular/nominal.py b/tubular/nominal.py index b4b4ce66..160d19a5 100644 --- a/tubular/nominal.py +++ b/tubular/nominal.py @@ -2,6 +2,7 @@ from __future__ import annotations import warnings +from typing import Literal import numpy as np import pandas as pd @@ -513,6 +514,9 @@ class MeanResponseTransformer(BaseNominalTransformer): in order to encode unseen levels in each categorical column with the mean, median etc. of each column. One can also pass an arbitrary int/float value to use for encoding unseen levels. + return_type: Literal['float32', 'float64'] + What type to cast return column as, consider exploring float32 to save memory. Defaults to float32. + **kwargs Arbitrary keyword arguments passed onto BaseTransformer.init method. 
@@ -551,6 +555,11 @@ class MeanResponseTransformer(BaseNominalTransformer): unseen_levels_encoding_dict: dict Dict containing the values (based on chosen unseen_level_handling) derived from the encoded columns to use when handling unseen levels in data passed to transform method. + return_type: Literal['float32', 'float64'] + What type to cast return column as. Defaults to float32. + + cast_method: Literal[np.float32, np.float64] + Store the casting method associated with return_type """ @@ -561,6 +570,7 @@ def __init__( prior: int = 0, level: str | list | None = None, unseen_level_handling: str | int | float | None = None, + return_type: Literal["float32", "float64"] = "float32", **kwargs: dict[str, bool], ) -> None: if weights_column is not None and type(weights_column) is not str: @@ -586,10 +596,19 @@ def __init__( msg = f"{self.classname()}: unseen_level_handling should be the option: Mean, Median, Lowest, Highest or an arbitrary int/float value" raise ValueError(msg) + if return_type not in ["float64", "float32"]: + msg = f"{self.classname()}: return_type should be one of: 'float64', 'float32'" + raise ValueError(msg) + self.weights_column = weights_column self.prior = prior self.level = level self.unseen_level_handling = unseen_level_handling + self.return_type = return_type + if return_type == "float64": + self.cast_method = np.float64 + else: + self.cast_method = np.float32 # TODO: set default prior to None and refactor to only use prior regularisation when it is set? BaseNominalTransformer.__init__(self, columns=columns, **kwargs) @@ -696,6 +715,10 @@ def _fit_binary_response( group_weight, ).to_dict() + # to_dict changes types + for key in self.mappings[c]: + self.mappings[c][key] = self.cast_method(self.mappings[c][key]) + def fit(self, X: pd.DataFrame, y: pd.Series) -> pd.DataFrame: """Identify mapping of categorical levels to mean response values.
@@ -768,10 +791,13 @@ def fit(self, X: pd.DataFrame, y: pd.Series) -> pd.DataFrame: if isinstance(self.unseen_level_handling, (int, float)): for c in self.encoded_feature_columns: - self.unseen_levels_encoding_dict[c] = float(self.unseen_level_handling) + self.unseen_levels_encoding_dict[c] = self.cast_method( + self.unseen_level_handling, + ) + else: for c in self.encoded_feature_columns: - X_temp[c] = X_temp[c].map(self.mappings[c]).astype(float) + X_temp[c] = X_temp[c].map(self.mappings[c]).astype(self.return_type) if self.unseen_level_handling == "Mean": self.unseen_levels_encoding_dict[c] = X_temp[c].mean() @@ -801,7 +827,7 @@ def map_imputation_values(self, X: pd.DataFrame) -> pd.DataFrame: input dataframe with mappings applied """ for c in self.columns: - X[c] = X[c].map(self.mappings[c]).astype(float) + X[c] = X[c].map(self.mappings[c]).astype(self.return_type) return X From 0b3bb28a28058970de90b0004c57e3f641d48e25 Mon Sep 17 00:00:00 2001 From: limlam96 <103185696+limlam96@users.noreply.github.com> Date: Tue, 6 Feb 2024 11:09:44 +0000 Subject: [PATCH 07/12] added commentary in MeanResponseTransformer tests --- tests/nominal/test_MeanResponseTransformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/nominal/test_MeanResponseTransformer.py b/tests/nominal/test_MeanResponseTransformer.py index ac86770a..3762ba86 100644 --- a/tests/nominal/test_MeanResponseTransformer.py +++ b/tests/nominal/test_MeanResponseTransformer.py @@ -22,7 +22,7 @@ def learnt_mapping_dict(): "b_green": {"a": 0.0, "b": 0.0, "c": 0.0, "d": 0.0, "e": 1.0, "f": 1.0}, } - # c matches b, but is categorical + # c matches b, but is categorical (see test_data.create_MeanResponseTransformer_test_df) c_dict = { "c" + suffix: b_dict["b" + suffix] for suffix in ["", "_blue", "_yellow", "_green"] From e6b81c21b48d71fde109fe8340563be979c4a68e Mon Sep 17 00:00:00 2001 From: limlam96 <103185696+limlam96@users.noreply.github.com> Date: Tue, 6 Feb 2024 11:10:58 +0000 Subject: 
[PATCH 08/12] updated version and changelog --- CHANGELOG.rst | 2 +- tubular/_version.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 63a8e9f4..d140790c 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -23,7 +23,7 @@ Added ^^^^^ - Updated MeanResponseTransformer to coerce return to float (useful behaviour for category type features) `#174 `_ -1.1.1 (2024-01-18) +1.2.0 (2024-01-18) ------------------ Added diff --git a/tubular/_version.py b/tubular/_version.py index 72f26f59..c68196d1 100644 --- a/tubular/_version.py +++ b/tubular/_version.py @@ -1 +1 @@ -__version__ = "1.1.2" +__version__ = "1.2.0" From 130fbe01bce55c3292d8a12e98dbc424c87f4fe0 Mon Sep 17 00:00:00 2001 From: limlam96 <103185696+limlam96@users.noreply.github.com> Date: Tue, 6 Feb 2024 11:23:33 +0000 Subject: [PATCH 09/12] split out large MeanResponseTransformer test --- tests/nominal/test_MeanResponseTransformer.py | 81 ++++++++++++------- 1 file changed, 50 insertions(+), 31 deletions(-) diff --git a/tests/nominal/test_MeanResponseTransformer.py b/tests/nominal/test_MeanResponseTransformer.py index 3762ba86..36783fa6 100644 --- a/tests/nominal/test_MeanResponseTransformer.py +++ b/tests/nominal/test_MeanResponseTransformer.py @@ -398,11 +398,10 @@ def test_response_column_nulls_error( ("level", "target_column", "unseen_level_handling"), [ (None, "a", "Mean"), - ("all", "multi_level_response", 32), - (["yellow", "blue"], "multi_level_response", "Highest"), + (None, "a", "Lowest"), ], ) - def test_correct_mappings_stored( + def test_correct_mappings_stored_numeric_response( self, learnt_mapping_dict, level, @@ -419,39 +418,59 @@ def test_correct_mappings_stored( ) x.fit(df, df[target_column]) - if level: - if level == "all": - expected_created_cols = { - prefix + "_" + suffix - for prefix, suffix in product( - columns, - df[target_column].unique().tolist(), - ) - } - assert ( - set(x.mapped_columns) == expected_created_cols - ), "Stored 
mapped columns are not as expected" + assert x.columns == columns, "Columns attribute changed in fit" - else: - expected_created_cols = { - prefix + "_" + suffix for prefix, suffix in product(columns, level) - } - assert ( - set(x.mapped_columns) == expected_created_cols - ), "Stored mapped columns are not as expected" + for column in x.columns: + actual = x.mappings[column] + expected = learnt_mapping_dict[column] + assert actual == expected - for column in x.mapped_columns: - actual = x.mappings[column] - expected = learnt_mapping_dict[column] - assert actual == expected + @pytest.mark.parametrize( + ("level", "target_column", "unseen_level_handling"), + [ + (["blue"], "multi_level_response", "Median"), + ("all", "multi_level_response", 32), + (["yellow", "blue"], "multi_level_response", "Highest"), + ], + ) + def test_correct_mappings_stored_categorical_response( + self, + learnt_mapping_dict, + level, + target_column, + unseen_level_handling, + ): + "Test that the mapping dictionary created in fit has the correct keys and values." 
+ df = d.create_MeanResponseTransformer_test_df() + columns = ["b", "c"] + x = MeanResponseTransformer( + columns=columns, + level=level, + unseen_level_handling=unseen_level_handling, + ) + x.fit(df, df[target_column]) + + if level == "all": + expected_created_cols = { + prefix + "_" + suffix + for prefix, suffix in product( + columns, + df[target_column].unique().tolist(), + ) + } else: - assert x.columns == columns, "Columns attribute changed in fit" + expected_created_cols = { + prefix + "_" + suffix for prefix, suffix in product(columns, level) + } + assert ( + set(x.mapped_columns) == expected_created_cols + ), "Stored mapped columns are not as expected" - for column in x.columns: - actual = x.mappings[column] - expected = learnt_mapping_dict[column] - assert actual == expected + for column in x.mapped_columns: + actual = x.mappings[column] + expected = learnt_mapping_dict[column] + assert actual == expected @pytest.mark.parametrize( ("level", "target_column", "unseen_level_handling"), From c1831589ec874ee48b9e06722cfdd13a57cc9843 Mon Sep 17 00:00:00 2001 From: limlam96 <103185696+limlam96@users.noreply.github.com> Date: Tue, 6 Feb 2024 11:29:44 +0000 Subject: [PATCH 10/12] edited MeanResponseEncoder prior test --- tests/nominal/test_MeanResponseTransformer.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/tests/nominal/test_MeanResponseTransformer.py b/tests/nominal/test_MeanResponseTransformer.py index 36783fa6..bfc065b9 100644 --- a/tests/nominal/test_MeanResponseTransformer.py +++ b/tests/nominal/test_MeanResponseTransformer.py @@ -244,17 +244,12 @@ def test_output1(self): assert_series_equal(expected, output) @pytest.mark.parametrize( - ("dtype", "return_type"), - [ - ("object", "float64"), - ("object", "float32"), - ("category", "float64"), - ("category", "float32"), - ], + "dtype", + ["object", "category"], ) - def test_output2(self, dtype, return_type): + def test_output2(self, dtype): "Test output of method - for 
category and object dtypes" - x = MeanResponseTransformer(columns="a", prior=0, return_type=return_type) + x = MeanResponseTransformer(columns="a", prior=0) df = pd.DataFrame({"a": ["a", "b"]}) df["a"] = df["a"].astype(dtype) From 5a5815546e34348b32cb7d605294dbd9ee979c78 Mon Sep 17 00:00:00 2001 From: limlam96 <103185696+limlam96@users.noreply.github.com> Date: Tue, 6 Feb 2024 11:36:36 +0000 Subject: [PATCH 11/12] fixed changelog --- CHANGELOG.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index ce59251b..3dd38060 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -16,7 +16,7 @@ Subsections for each version can be one of the following; Each individual change should have a link to the pull request after the description of the change. -1.1.2 (2024-02-07) +1.1.2 (2024-02-06) ------------------ Added @@ -25,6 +25,8 @@ Added - Update OneHotEncodingTransformer to default to returning int8 columns `#175 `_ - Updated MeanResponseTransformer to coerce return to float (useful behaviour for category type features) `#174 `_ +1.1.1 (2024-01-18) +------------------ Added ^^^^^ From b10649399f759695c69f7b9cf602ef825c486c67 Mon Sep 17 00:00:00 2001 From: limlam96 <103185696+limlam96@users.noreply.github.com> Date: Tue, 6 Feb 2024 11:37:49 +0000 Subject: [PATCH 12/12] fixed changelog --- CHANGELOG.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 3dd38060..1e0ea489 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -16,7 +16,7 @@ Subsections for each version can be one of the following; Each individual change should have a link to the pull request after the description of the change. -1.1.2 (2024-02-06) +1.2.0 (2024-02-06) ------------------ Added