diff --git a/CHANGELOG.rst b/CHANGELOG.rst index bff0d698..da22a368 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -16,12 +16,14 @@ Subsections for each version can be one of the following; Each individual change should have a link to the pull request after the description of the change. -1.1.2 (2024-02-07) +1.2.0 (2024-02-06) ------------------ Added ^^^^^ - Update OneHotEncodingTransformer to default to returning int8 columns `#175 <https://github.com/lvgig/tubular/pull/175>`_ - Updated NullIndicator to return int8 columns `#173 https://github.com/lvgig/tubular/pull/173`_ +- Update OneHotEncodingTransformer to default to returning int8 columns `#175 <https://github.com/lvgig/tubular/pull/175>`_ +- Updated MeanResponseTransformer to coerce return to float (useful behaviour for category type features) `#174 <https://github.com/lvgig/tubular/pull/174>`_ 1.1.1 (2024-01-18) ------------------ diff --git a/tests/nominal/test_MeanResponseTransformer.py b/tests/nominal/test_MeanResponseTransformer.py index 287cede0..bfc065b9 100644 --- a/tests/nominal/test_MeanResponseTransformer.py +++ b/tests/nominal/test_MeanResponseTransformer.py @@ -1,3 +1,5 @@ +from itertools import product + import numpy as np import pandas as pd import pytest @@ -11,62 +13,94 @@ @pytest.fixture() def learnt_mapping_dict(): - return { + full_dict = {} + + b_dict = { "b": {"a": 1.0, "b": 2.0, "c": 3.0, "d": 4.0, "e": 5.0, "f": 6.0}, "b_blue": {"a": 1.0, "b": 1.0, "c": 0.0, "d": 0.0, "e": 0.0, "f": 0.0}, "b_yellow": {"a": 0.0, "b": 0.0, "c": 1.0, "d": 1.0, "e": 0.0, "f": 0.0}, "b_green": {"a": 0.0, "b": 0.0, "c": 0.0, "d": 0.0, "e": 1.0, "f": 1.0}, } + # c matches b, but is categorical (see test_data.create_MeanResponseTransformer_test_df) + c_dict = { + "c" + suffix: b_dict["b" + suffix] + for suffix in ["", "_blue", "_yellow", "_green"] + } + + full_dict.update(b_dict) + full_dict.update(c_dict) + + return full_dict + @pytest.fixture() def learnt_unseen_levels_encoding_dict_mean(): - return { + return_dict = { "b": (1.0 + 2.0 + 3.0 + 4.0 + 5.0 + 6.0) / 6, "b_blue": (1.0 + 1.0 + 0.0 + 0.0 + 0.0 + 0.0) / 6, "b_yellow":
(0.0 + 0.0 + 1.0 + 1.0 + 0.0 + 0.0) / 6, "b_green": (0.0 + 0.0 + 0.0 + 0.0 + 1.0 + 1.0) / 6, } + for key in return_dict: + return_dict[key] = np.float32(return_dict[key]) + return return_dict + @pytest.fixture() def learnt_unseen_levels_encoding_dict_median(): - return { + return_dict = { "b": (3.0 + 4.0) / 2, "b_blue": (0.0 + 0.0) / 2, "b_yellow": (0.0 + 0.0) / 2, "b_green": (0.0 + 0.0) / 2, } + for key in return_dict: + return_dict[key] = np.float32(return_dict[key]) + return return_dict + @pytest.fixture() def learnt_unseen_levels_encoding_dict_highest(): - return { + return_dict = { "b": 6.0, "b_blue": 1.0, "b_yellow": 1.0, "b_green": 1.0, } + for key in return_dict: + return_dict[key] = np.float32(return_dict[key]) + return return_dict @pytest.fixture() def learnt_unseen_levels_encoding_dict_lowest(): - return { + + return_dict = { "b": 1.0, "b_blue": 0.0, "b_yellow": 0.0, "b_green": 0.0, } + for key in return_dict: + return_dict[key] = np.float32(return_dict[key]) + return return_dict + @pytest.fixture() def learnt_unseen_levels_encoding_dict_arbitrary(): - return { + return_dict = { "b": 22.0, "b_blue": 22.0, "b_yellow": 22.0, "b_green": 22.0, } + for key in return_dict: + return_dict[key] = np.float32(return_dict[key]) + return return_dict class TestInit: @@ -136,6 +170,14 @@ def test_unseen_level_handling_incorrect_value_error(self): ): MeanResponseTransformer(unseen_level_handling="AAA") + def test_return_type_handling_incorrect_value_error(self): + """Test that an exception is raised if return_type is an incorrect value.""" + with pytest.raises( + ValueError, + match="return_type should be one of: 'float64', 'float32'", + ): + MeanResponseTransformer(return_type="int") + def test_values_passed_in_init_set_to_attribute(self): """Test that the values passed in init are saved in an attribute of the same name.""" x = MeanResponseTransformer( @@ -143,6 +185,7 @@ def test_values_passed_in_init_set_to_attribute(self): prior=1, level="any", 
unseen_level_handling="Mean", + return_type="float32", ) ta.classes.test_object_attributes( @@ -152,6 +195,8 @@ def test_values_passed_in_init_set_to_attribute(self): "prior": 1, "level": "any", "unseen_level_handling": "Mean", + "return_type": "float32", + "cast_method": np.float32, }, msg="Attributes for MeanResponseTransformer set in init", ) @@ -198,11 +243,18 @@ def test_output1(self): assert_series_equal(expected, output) - def test_output2(self): - "Test output of method." + @pytest.mark.parametrize( + "dtype", + ["object", "category"], + ) + def test_output2(self, dtype): + "Test output of method - for category and object dtypes" x = MeanResponseTransformer(columns="a", prior=0) - x.fit(X=pd.DataFrame({"a": [1, 2]}), y=pd.Series([2, 3])) + df = pd.DataFrame({"a": ["a", "b"]}) + df["a"] = df["a"].astype(dtype) + + x.fit(X=df, y=pd.Series([2, 3])) expected1 = (1 * 1) / (1) @@ -341,11 +393,10 @@ def test_response_column_nulls_error( ("level", "target_column", "unseen_level_handling"), [ (None, "a", "Mean"), - ("all", "multi_level_response", 32), - (["yellow", "blue"], "multi_level_response", "Highest"), + (None, "a", "Lowest"), ], ) - def test_correct_mappings_stored( + def test_correct_mappings_stored_numeric_response( self, learnt_mapping_dict, level, @@ -354,39 +405,67 @@ def test_correct_mappings_stored( ): "Test that the mapping dictionary created in fit has the correct keys and values." 
df = d.create_MeanResponseTransformer_test_df() + columns = ["b", "c"] x = MeanResponseTransformer( - columns=["b"], + columns=columns, level=level, unseen_level_handling=unseen_level_handling, ) x.fit(df, df[target_column]) - if level: - if level == "all": - assert set(x.mapped_columns) == { - "b_blue", - "b_yellow", - "b_green", - }, "Stored mapped columns are not as expected" + assert x.columns == columns, "Columns attribute changed in fit" - else: - assert set(x.mapped_columns) == { - "b_blue", - "b_yellow", - }, "Stored mapped columns are not as expected" + for column in x.columns: + actual = x.mappings[column] + expected = learnt_mapping_dict[column] + assert actual == expected - for column in x.mapped_columns: - actual = x.mappings[column] - expected = learnt_mapping_dict[column] - assert actual == expected + @pytest.mark.parametrize( + ("level", "target_column", "unseen_level_handling"), + [ + (["blue"], "multi_level_response", "Median"), + ("all", "multi_level_response", 32), + (["yellow", "blue"], "multi_level_response", "Highest"), + ], + ) + def test_correct_mappings_stored_categorical_response( + self, + learnt_mapping_dict, + level, + target_column, + unseen_level_handling, + ): + "Test that the mapping dictionary created in fit has the correct keys and values." 
+ df = d.create_MeanResponseTransformer_test_df() + columns = ["b", "c"] + x = MeanResponseTransformer( + columns=columns, + level=level, + unseen_level_handling=unseen_level_handling, + ) + x.fit(df, df[target_column]) + + if level == "all": + expected_created_cols = { + prefix + "_" + suffix + for prefix, suffix in product( + columns, + df[target_column].unique().tolist(), + ) + } else: - assert x.columns == ["b"], "Columns attribute changed in fit" + expected_created_cols = { + prefix + "_" + suffix for prefix, suffix in product(columns, level) + } + assert ( + set(x.mapped_columns) == expected_created_cols + ), "Stored mapped columns are not as expected" - for column in x.columns: - actual = x.mappings[column] - expected = learnt_mapping_dict[column] - assert actual == expected + for column in x.mapped_columns: + actual = x.mappings[column] + expected = learnt_mapping_dict[column] + assert actual == expected @pytest.mark.parametrize( ("level", "target_column", "unseen_level_handling"), @@ -488,14 +567,22 @@ def test_learnt_values(self): x._fit_binary_response(df, df["a"], x.columns) + expected_mappings = { + "b": {"a": 1.0, "b": 2.0, "c": 3.0, "d": 4.0, "e": 5.0, "f": 6.0}, + "d": {1: 1.0, 2: 2.0, 3: 3.0, 4: 4.0, 5: 5.0, 6: 6.0}, + "f": {False: 2.0, True: 5.0}, + } + + for key in expected_mappings: + for value in expected_mappings[key]: + expected_mappings[key][value] = x.cast_method( + expected_mappings[key][value], + ) + ta.classes.test_object_attributes( obj=x, expected_attributes={ - "mappings": { - "b": {"a": 1.0, "b": 2.0, "c": 3.0, "d": 4.0, "e": 5.0, "f": 6.0}, - "d": {1: 1.0, 2: 2.0, 3: 3.0, 4: 4.0, 5: 5.0, 6: 6.0}, - "f": {False: 2.0, True: 5.0}, - }, + "mappings": expected_mappings, "global_mean": np.float64(3.5), }, msg="mappings attribute", @@ -511,28 +598,35 @@ def test_learnt_values_prior_no_weight(self): x._fit_binary_response(df, df["a"], x.columns) + expected_mappings = { + "b": { + "a": 37 / 12, + "b": 13 / 4, + "c": 41 / 12, + "d": 43 / 12, 
+ "e": 15 / 4, + "f": 47 / 12, + }, + "d": { + 1: 37 / 12, + 2: 13 / 4, + 3: 41 / 12, + 4: 43 / 12, + 5: 15 / 4, + 6: 47 / 12, + }, + "f": {False: 47 / 16, True: 65 / 16}, + } + for key in expected_mappings: + for value in expected_mappings[key]: + expected_mappings[key][value] = x.cast_method( + expected_mappings[key][value], + ) + ta.classes.test_object_attributes( obj=x, expected_attributes={ - "mappings": { - "b": { - "a": 37 / 12, - "b": 13 / 4, - "c": 41 / 12, - "d": 43 / 12, - "e": 15 / 4, - "f": 47 / 12, - }, - "d": { - 1: 37 / 12, - 2: 13 / 4, - 3: 41 / 12, - 4: 43 / 12, - 5: 15 / 4, - 6: 47 / 12, - }, - "f": {False: 47 / 16, True: 65 / 16}, - }, + "mappings": expected_mappings, "global_mean": np.float64(3.5), }, msg="mappings attribute", @@ -548,14 +642,22 @@ def test_learnt_values_no_prior_weight(self): x._fit_binary_response(df, df["a"], x.columns) + expected_mappings = { + "b": {"a": 1.0, "b": 2.0, "c": 3.0, "d": 4.0, "e": 5.0, "f": 6.0}, + "d": {1: 1.0, 2: 2.0, 3: 3.0, 4: 4.0, 5: 5.0, 6: 6.0}, + "f": {False: 14 / 6, True: 77 / 15}, + } + + for key in expected_mappings: + for value in expected_mappings[key]: + expected_mappings[key][value] = x.cast_method( + expected_mappings[key][value], + ) + ta.classes.test_object_attributes( obj=x, expected_attributes={ - "mappings": { - "b": {"a": 1.0, "b": 2.0, "c": 3.0, "d": 4.0, "e": 5.0, "f": 6.0}, - "d": {1: 1.0, 2: 2.0, 3: 3.0, 4: 4.0, 5: 5.0, 6: 6.0}, - "f": {False: 14 / 6, True: 77 / 15}, - }, + "mappings": expected_mappings, }, msg="mappings attribute", ) @@ -576,13 +678,20 @@ def test_learnt_values_prior_weight(self): x._fit_binary_response(df, df["a"], x.columns) + expected_mappings = { + "d": {1: 7 / 2, 2: 11 / 3, 3: 23 / 6, 4: 4.0, 5: 30 / 7, 6: 32 / 7}, + "f": {False: 13 / 4, True: 50 / 11}, + } + for key in expected_mappings: + for value in expected_mappings[key]: + expected_mappings[key][value] = x.cast_method( + expected_mappings[key][value], + ) + ta.classes.test_object_attributes( obj=x, 
expected_attributes={ - "mappings": { - "d": {1: 7 / 2, 2: 11 / 3, 3: 23 / 6, 4: 4.0, 5: 30 / 7, 6: 32 / 7}, - "f": {False: 13 / 4, True: 50 / 11}, - }, + "mappings": expected_mappings, "global_mean": np.float64(4.0), }, msg="mappings attribute", @@ -1024,7 +1133,6 @@ def test_check_is_fitted_called(self, mocker): x.transform(df) def test_not_dataframe_error_raised(self): - df = d.create_MeanResponseTransformer_test_df() x = MeanResponseTransformer(columns="b") @@ -1086,7 +1194,8 @@ def test_learnt_values_not_modified(self): ) def test_expected_output_binary_response(self, df, expected): """Test that the output is expected from transform with a binary response.""" - x = MeanResponseTransformer(columns=["b", "d", "f"]) + columns = ["b", "d", "f"] + x = MeanResponseTransformer(columns=columns) # set the impute values dict directly rather than fitting x on df so test works with helpers x.mappings = { @@ -1097,6 +1206,9 @@ def test_expected_output_binary_response(self, df, expected): df_transformed = x.transform(df) + for col in columns: + expected[col] = expected[col].astype(x.return_type) + ta.equality.assert_frame_equal_msg( actual=df_transformed, expected=expected, @@ -1112,16 +1224,31 @@ def test_expected_output_binary_response(self, df, expected): ) def test_expected_output_one_multi_level(self, df, expected): """Test that the output is expected from transform with a multi-level response and one level selected.""" - x = MeanResponseTransformer(columns=["b", "f"], level=["blue"]) + columns = ["b", "f"] + level = ["blue"] + expected_created_cols = [ + prefix + "_" + suffix for prefix, suffix in product(columns, level) + ] + x = MeanResponseTransformer(columns=columns, level=level) + + for col in expected_created_cols: + expected[col] = expected[col].astype(x.return_type) # set the impute values dict directly rather than fitting x on df so test works with helpers x.mappings = { "b_blue": {"a": 1, "b": 1, "c": 0, "d": 0, "e": 0, "f": 0}, "f_blue": {False: 2 / 3, 
True: 0}, } - x.response_levels = ["blue"] + x.response_levels = level x.mapped_columns = list(x.mappings.keys()) df_transformed = x.transform(df) + new_expected_created_cols = [ + prefix + "_" + suffix + for prefix, suffix in product(columns, x.response_levels) + ] + + for col in new_expected_created_cols: + expected[col] = expected[col].astype(x.return_type) ta.equality.assert_frame_equal_msg( actual=df_transformed, @@ -1139,7 +1266,8 @@ def test_expected_output_one_multi_level(self, df, expected): ) def test_expected_output_all_levels(self, df, expected): """Test that the output is expected from transform for a multi-level response and all levels selected.""" - x = MeanResponseTransformer(columns=["b", "f"], level="all") + columns = ["b", "f"] + x = MeanResponseTransformer(columns=columns, level="all") # set the impute values dict directly rather than fitting x on df so test works with helpers x.mappings = { @@ -1154,9 +1282,13 @@ def test_expected_output_all_levels(self, df, expected): x.response_levels = ["blue", "green", "yellow"] x.mapped_columns = list(x.mappings.keys()) df_transformed = x.transform(df) + expected_created_cols = [ + prefix + "_" + suffix + for prefix, suffix in product(columns, x.response_levels) + ] - print(df_transformed) - print(expected) + for col in expected_created_cols: + expected[col] = expected[col].astype(x.return_type) ta.equality.assert_frame_equal_msg( actual=df_transformed, @@ -1194,15 +1326,20 @@ def test_expected_output_sigle_level_response_unseen_levels_mean( """Test that the output is expected from transform with a single level response with unseen levels in data with unseen_level_handling set to 'Mean'. 
""" + columns = ["b", "d", "f"] + target = "a" x = MeanResponseTransformer( - columns=["b", "d", "f"], + columns=columns, unseen_level_handling="Mean", ) initial_df = d.create_MeanResponseTransformer_test_df() - x.fit(initial_df, initial_df["a"]) + x.fit(initial_df, initial_df[target]) df_transformed = x.transform(df) + for col in columns: + expected[col] = expected[col].astype(x.return_type) + ta.equality.assert_frame_equal_msg( actual=df_transformed, expected=expected, @@ -1224,15 +1361,20 @@ def test_expected_output_sigle_level_response_unseen_levels_median( """Test that the output is expected from transform with a single level response with unseen levels in data with unseen_level_handling set to 'Median'. """ + columns = ["b", "d", "f"] + target = "a" x = MeanResponseTransformer( - columns=["b", "d", "f"], + columns=columns, unseen_level_handling="Median", ) initial_df = d.create_MeanResponseTransformer_test_df() - x.fit(initial_df, initial_df["a"]) + x.fit(initial_df, initial_df[target]) df_transformed = x.transform(df) + for col in columns: + expected[col] = expected[col].astype(x.return_type) + ta.equality.assert_frame_equal_msg( actual=df_transformed, expected=expected, @@ -1254,15 +1396,20 @@ def test_expected_output_sigle_level_response_unseen_levels_lowest( """Test that the output is expected from transform with a single level response with unseen levels in data with unseen_level_handling set to 'Lowest'. 
""" + columns = ["b", "d", "f"] + target = "a" x = MeanResponseTransformer( - columns=["b", "d", "f"], + columns=columns, unseen_level_handling="Lowest", ) initial_df = d.create_MeanResponseTransformer_test_df() - x.fit(initial_df, initial_df["a"]) + x.fit(initial_df, initial_df[target]) df_transformed = x.transform(df) + for col in columns: + expected[col] = expected[col].astype(x.return_type) + ta.equality.assert_frame_equal_msg( actual=df_transformed, expected=expected, @@ -1284,15 +1431,20 @@ def test_expected_output_sigle_level_response_unseen_levels_highest( """Test that the output is expected from transform with a single level response with unseen levels in data with unseen_level_handling set to 'Highest'. """ + columns = ["b", "d", "f"] + target = "a" x = MeanResponseTransformer( columns=["b", "d", "f"], unseen_level_handling="Highest", ) initial_df = d.create_MeanResponseTransformer_test_df() - x.fit(initial_df, initial_df["a"]) + x.fit(initial_df, initial_df[target]) df_transformed = x.transform(df) + for col in columns: + expected[col] = expected[col].astype(x.return_type) + ta.equality.assert_frame_equal_msg( actual=df_transformed, expected=expected, @@ -1314,12 +1466,17 @@ def test_expected_output_sigle_level_response_unseen_levels_arbitrary( """Test that the output is expected from transform with a single level response with unseen levels in data with unseen_level_handling set to an arbitrary int/float value'. 
""" - x = MeanResponseTransformer(columns=["b", "d", "f"], unseen_level_handling=21.6) + columns = ["b", "d", "f"] + target = "a" + x = MeanResponseTransformer(columns=columns, unseen_level_handling=21.6) initial_df = d.create_MeanResponseTransformer_test_df() - x.fit(initial_df, initial_df["a"]) + x.fit(initial_df, initial_df[target]) df_transformed = x.transform(df) + for col in columns: + expected[col] = expected[col].astype(x.return_type) + ta.equality.assert_frame_equal_msg( actual=df_transformed, expected=expected, @@ -1335,12 +1492,20 @@ def test_expected_output_sigle_level_response_unseen_levels_arbitrary( ) def test_expected_output_one_multi_level_unseen_levels(self, df, expected): """Test that the output is expected from transform with a multi-level response and unseen levels and one level selected.""" + columns = ["b", "f"] + level = ["blue"] + expected_created_cols = [ + prefix + "_" + suffix for prefix, suffix in product(columns, level) + ] x = MeanResponseTransformer( - columns=["b", "f"], - level=["blue"], + columns=columns, + level=level, unseen_level_handling="Mean", ) + for col in expected_created_cols: + expected[col] = expected[col].astype(x.return_type) + initial_df = d.create_MeanResponseTransformer_test_df() x.fit(initial_df, initial_df["multi_level_response"]) df_transformed = x.transform(df) @@ -1360,16 +1525,25 @@ def test_expected_output_one_multi_level_unseen_levels(self, df, expected): ) def test_expected_output_all_multi_level_unseen_levels(self, df, expected): """Test that the output is expected from transform with a multi-level response and unseen levels and all level selected.""" + columns = ["b", "f"] + target = "multi_level_response" + initial_df = d.create_MeanResponseTransformer_test_df() + expected_created_cols = [ + prefix + "_" + suffix + for prefix, suffix in product(columns, initial_df[target].unique().tolist()) + ] x = MeanResponseTransformer( - columns=["b", "f"], + columns=columns, level="all", 
unseen_level_handling="Highest", ) - initial_df = d.create_MeanResponseTransformer_test_df() - x.fit(initial_df, initial_df["multi_level_response"]) + x.fit(initial_df, initial_df[target]) df_transformed = x.transform(df) + for col in expected_created_cols: + expected[col] = expected[col].astype(x.return_type) + ta.equality.assert_frame_equal_msg( actual=df_transformed, expected=expected, @@ -1391,3 +1565,53 @@ def test_nulls_introduced_in_transform_error(self): match="MeanResponseTransformer: nulls would be introduced into column b from levels not present in mapping", ): x.transform(df) + + @pytest.mark.parametrize( + "prior, level, target, unseen_level_handling", + [ + (5, "all", "c", "Mean"), + (100, ["a", "b"], "c", "Lowest"), + (1, None, "a", "Highest"), + (0, None, "a", "Median"), + ], + ) + def test_return_type_can_be_changed( + self, + prior, + level, + target, + unseen_level_handling, + ): + "Test that output return types are controlled by return_type param, this defaults to float32 so test float64 here" + + df = d.create_MeanResponseTransformer_test_df() + + columns = ["b", "d", "f"] + x = MeanResponseTransformer( + columns=columns, + return_type="float64", + prior=prior, + unseen_level_handling=unseen_level_handling, + level=level, + ) + + x.fit(df, df[target]) + + output_df = x.transform(df) + + if target == "c": + actual_levels = df[target].unique().tolist() if level == "all" else level + expected_created_cols = [ + prefix + "_" + suffix + for prefix, suffix in product(columns, actual_levels) + ] + + else: + expected_created_cols = columns + + for col in expected_created_cols: + expected_type = x.return_type + actual_type = output_df[col].dtype.name + assert ( + actual_type == expected_type + ), f"{x.classname} should output columns with type determine by the return_type param, expected {expected_type} but got {actual_type}" diff --git a/tubular/_version.py b/tubular/_version.py index 72f26f59..c68196d1 100644 --- a/tubular/_version.py +++ 
b/tubular/_version.py @@ -1 +1 @@ -__version__ = "1.1.2" +__version__ = "1.2.0" diff --git a/tubular/nominal.py b/tubular/nominal.py index 2e57ba8e..cd5e8229 100644 --- a/tubular/nominal.py +++ b/tubular/nominal.py @@ -2,6 +2,7 @@ from __future__ import annotations import warnings +from typing import Literal import numpy as np import pandas as pd @@ -513,6 +514,9 @@ class MeanResponseTransformer(BaseNominalTransformer): in order to encode unseen levels in each categorical column with the mean, median etc. of each column. One can also pass an arbitrary int/float value to use for encoding unseen levels. + return_type: Literal['float32', 'float64'] + What type to cast return column as, consider exploring float32 to save memory. Defaults to float32. + **kwargs Arbitrary keyword arguments passed onto BaseTransformer.init method. @@ -551,6 +555,11 @@ class MeanResponseTransformer(BaseNominalTransformer): unseen_levels_encoding_dict: dict Dict containing the values (based on chosen unseen_level_handling) derived from the encoded columns to use when handling unseen levels in data passed to transform method. + return_type: Literal['float32', 'float64'] + What type to cast return column as. Defaults to float32. 
+ + cast_method: Literal[np.float32, np.float64] + Store the casting method associated with return_type """ @@ -561,6 +570,7 @@ def __init__( prior: int = 0, level: str | list | None = None, unseen_level_handling: str | int | float | None = None, + return_type: Literal["float32", "float64"] = "float32", **kwargs: dict[str, bool], ) -> None: if weights_column is not None and type(weights_column) is not str: @@ -586,10 +596,19 @@ def __init__( msg = f"{self.classname()}: unseen_level_handling should be the option: Mean, Median, Lowest, Highest or an arbitrary int/float value" raise ValueError(msg) + if return_type not in ["float64", "float32"]: + msg = f"{self.classname()}: return_type should be one of: 'float64', 'float32'" + raise ValueError(msg) + self.weights_column = weights_column self.prior = prior self.level = level self.unseen_level_handling = unseen_level_handling + self.return_type = return_type + if return_type == "float64": + self.cast_method = np.float64 + else: + self.cast_method = np.float32 # TODO: set default prior to None and refactor to only use prior regularisation when it is set? BaseNominalTransformer.__init__(self, columns=columns, **kwargs) @@ -696,6 +715,10 @@ def _fit_binary_response( group_weight, ).to_dict() + # to_dict changes types + for key in self.mappings[c]: + self.mappings[c][key] = self.cast_method(self.mappings[c][key]) + def fit(self, X: pd.DataFrame, y: pd.Series) -> pd.DataFrame: """Identify mapping of categorical levels to mean response values.
@@ -766,33 +789,27 @@ def fit(self, X: pd.DataFrame, y: pd.Series) -> pd.DataFrame: self._fit_binary_response(X, y, self.columns) self.encoded_feature_columns = self.columns - if self.unseen_level_handling == "Mean": + if isinstance(self.unseen_level_handling, (int, float)): for c in self.encoded_feature_columns: - self.unseen_levels_encoding_dict[c] = ( - X_temp[c].map(self.mappings[c]).mean() + self.unseen_levels_encoding_dict[c] = self.cast_method( + self.unseen_level_handling, ) - elif self.unseen_level_handling == "Median": + else: for c in self.encoded_feature_columns: - self.unseen_levels_encoding_dict[c] = ( - X_temp[c].map(self.mappings[c]).median() - ) + X_temp[c] = X_temp[c].map(self.mappings[c]).astype(self.return_type) - elif self.unseen_level_handling == "Lowest": - for c in self.encoded_feature_columns: - self.unseen_levels_encoding_dict[c] = ( - X_temp[c].map(self.mappings[c]).min() - ) + if self.unseen_level_handling == "Mean": + self.unseen_levels_encoding_dict[c] = X_temp[c].mean() - elif self.unseen_level_handling == "Highest": - for c in self.encoded_feature_columns: - self.unseen_levels_encoding_dict[c] = ( - X_temp[c].map(self.mappings[c]).max() - ) + if self.unseen_level_handling == "Median": + self.unseen_levels_encoding_dict[c] = X_temp[c].median() - elif isinstance(self.unseen_level_handling, (int, float)): - for c in self.encoded_feature_columns: - self.unseen_levels_encoding_dict[c] = float(self.unseen_level_handling) + if self.unseen_level_handling == "Lowest": + self.unseen_levels_encoding_dict[c] = X_temp[c].min() + + if self.unseen_level_handling == "Highest": + self.unseen_levels_encoding_dict[c] = X_temp[c].max() return self @@ -810,7 +827,7 @@ def map_imputation_values(self, X: pd.DataFrame) -> pd.DataFrame: input dataframe with mappings applied """ for c in self.columns: - X[c] = X[c].map(self.mappings[c]) + X[c] = X[c].map(self.mappings[c]).astype(self.return_type) return X