From 7a505fd26270d4ea9474dc1db379663a98b099e4 Mon Sep 17 00:00:00 2001 From: davidhopkinson26 <89029024+davidhopkinson26@users.noreply.github.com> Date: Mon, 15 Jul 2024 15:59:29 +0100 Subject: [PATCH 01/32] Update CONTRIBUTING.rst --- CONTRIBUTING.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 1aeb2071..52d6ab0b 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -15,8 +15,9 @@ We have some general requirements for all contributions then specific requiremen Set up development environment ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +For External contributors, first create your own fork of this repo. -First clone the repository; +Then clone the fork (or this repository if internal); .. code:: From 96563f1a62cd0d112d716a0f7aecfa58c7048d77 Mon Sep 17 00:00:00 2001 From: davidhopkinson26 <89029024+davidhopkinson26@users.noreply.github.com> Date: Thu, 18 Jul 2024 14:13:21 +0000 Subject: [PATCH 02/32] :construction: basic skeleton of base deat transformer test modules --- tests/dates/test_BaseDateTransformer.py | 46 +++++++++++++++++++ .../test_BaseTwocolumnDateTransformer.py | 46 +++++++++++++++++++ 2 files changed, 92 insertions(+) create mode 100644 tests/dates/test_BaseDateTransformer.py create mode 100644 tests/dates/test_BaseTwocolumnDateTransformer.py diff --git a/tests/dates/test_BaseDateTransformer.py b/tests/dates/test_BaseDateTransformer.py new file mode 100644 index 00000000..7abbdf94 --- /dev/null +++ b/tests/dates/test_BaseDateTransformer.py @@ -0,0 +1,46 @@ +from tests.base_tests import ( + ColumnStrListInitTests, + GenericFitTests, + GenericTransformTests, + OtherBaseBehaviourTests, +) + + +class GenericDatesTransformTests: + """Generic tests for Dates Transformers""" + + +class TestInit(ColumnStrListInitTests): + """Generic tests for transformer.init().""" + + @classmethod + def setup_class(cls): + cls.transformer_name = "BaseDateTransformer" + + +class TestFit(GenericFitTests): + """Generic tests for transformer.fit()""" + + @classmethod + def setup_class(cls): + cls.transformer_name = "BaseDateTransformer" + + +class TestTransform(GenericTransformTests, GenericDatesTransformTests): + """Tests for BaseDateTransformer.transform.""" + + @classmethod + def setup_class(cls): + cls.transformer_name = "BaseDateTransformer" + + +class TestOtherBaseBehaviour(OtherBaseBehaviourTests): + """ + Class to run tests for BaseTransformerBehaviour outside the three standard methods. + + May need to overwite specific tests in this class if the tested transformer modifies this behaviour. + """ + + @classmethod + def setup_class(cls): + cls.transformer_name = "BaseDateTransformer" diff --git a/tests/dates/test_BaseTwocolumnDateTransformer.py b/tests/dates/test_BaseTwocolumnDateTransformer.py new file mode 100644 index 00000000..cf5d62dd --- /dev/null +++ b/tests/dates/test_BaseTwocolumnDateTransformer.py @@ -0,0 +1,46 @@ +from tests.base_tests import ( + ColumnStrListInitTests, + GenericFitTests, + GenericTransformTests, + OtherBaseBehaviourTests, +) + + +class GenericTwoColumnDatesTransformTests: + """Generic tests for Dates Transformers""" + + +class TestInit(ColumnStrListInitTests): + """Generic tests for transformer.init().""" + + @classmethod + def setup_class(cls): + cls.transformer_name = "BaseDateTwoColumnTransformer" + + +class TestFit(GenericFitTests): + """Generic tests for transformer.fit()""" + + @classmethod + def setup_class(cls): + cls.transformer_name = "BaseDateTwoColumnTransformer" + + +class TestTransform(GenericTransformTests, GenericTwoColumnDatesTransformTests): + """Tests for BaseTwoColumnDateTransformer.transform.""" + + @classmethod + def setup_class(cls): + cls.transformer_name = "BaseDateTwoColumnTransformer" + + +class TestOtherBaseBehaviour(OtherBaseBehaviourTests): + """ + Class to run tests for BaseTransformerBehaviour outside the three standard methods. + + May need to overwite specific tests in this class if the tested transformer modifies this behaviour. + """ + + @classmethod + def setup_class(cls): + cls.transformer_name = "BaseDateTwoColumnTransformer" From 58b81a649e2e92c3825ffe2657f309cdf1c196b4 Mon Sep 17 00:00:00 2001 From: davidhopkinson26 <89029024+davidhopkinson26@users.noreply.github.com> Date: Thu, 18 Jul 2024 15:17:53 +0000 Subject: [PATCH 03/32] :construction: added tests to BaseDate test modules --- tests/dates/test_BaseDateTransformer.py | 33 +++++++++++++++++ .../test_BaseTwocolumnDateTransformer.py | 35 +++++++++++++++---- 2 files changed, 62 insertions(+), 6 deletions(-) diff --git a/tests/dates/test_BaseDateTransformer.py b/tests/dates/test_BaseDateTransformer.py index 7abbdf94..753631ae 100644 --- a/tests/dates/test_BaseDateTransformer.py +++ b/tests/dates/test_BaseDateTransformer.py @@ -1,3 +1,7 @@ +import pandas as pd +import pytest + +import tests.test_data as d from tests.base_tests import ( ColumnStrListInitTests, GenericFitTests, @@ -9,6 +13,35 @@ class GenericDatesTransformTests: """Generic tests for Dates Transformers""" + def test_mismatched_datetypes_error( + self, + columns, + datetime_col, + date_col, + initialized_transformers, + ): + "Test that transform raises an error if one column is a date and one is datetime" + + x = initialized_transformers(self.transformer_name) + + df = d.create_date_diff_different_dtypes() + # types don't seem to come out of the above function as expected, hard enforce + for col in ["date_col_1", "date_col_2"]: + df[col] = pd.to_datetime(df[col]).dt.date + + for col in ["datetime_col_1", "datetime_col_2"]: + df[col] = pd.to_datetime(df[col]) + + present_types = ( + {"datetime64", "date"} if datetime_col == 0 else {"date", "datetime64"} + ) + msg = rf"Columns fed to datetime transformers should be \['datetime64', 'date'\] and have consistent types, but found {present_types}. Please use ToDatetimeTransformer to standardise" + with pytest.raises( + TypeError, + match=msg, + ): + x.transform(df) + class TestInit(ColumnStrListInitTests): """Generic tests for transformer.init().""" diff --git a/tests/dates/test_BaseTwocolumnDateTransformer.py b/tests/dates/test_BaseTwocolumnDateTransformer.py index cf5d62dd..c40f72dc 100644 --- a/tests/dates/test_BaseTwocolumnDateTransformer.py +++ b/tests/dates/test_BaseTwocolumnDateTransformer.py @@ -1,16 +1,39 @@ -from tests.base_tests import ( +import pytest + +from tests.dates.test_BaseDateTransformer import ( ColumnStrListInitTests, + GenericDatesTransformTests, GenericFitTests, - GenericTransformTests, OtherBaseBehaviourTests, ) -class GenericTwoColumnDatesTransformTests: - """Generic tests for Dates Transformers""" +class GenericTwoColumnDatesInitTests: + """Generic tests for Init Dates Transformers which take two columns""" + + @pytest.mark.parametrize("columns", [["a"], ["a", "b", "c"]]) + def test_not_two_columns_error( + self, + uninitialized_transformers, + minimal_attribute_dict, + columns, + ): + """ "test that two correct error raised when passed more or less than two columns""" + + transformer = uninitialized_transformers[self.transformer_name] + init_args = minimal_attribute_dict[self.transformer_name] + init_args[columns] = columns + + msg = f"{self.classname()}: This transformer works with two columns only" + + with pytest.raises( + ValueError, + match=msg, + ): + transformer(init_args) -class TestInit(ColumnStrListInitTests): +class TestInit(ColumnStrListInitTests, GenericTwoColumnDatesInitTests): """Generic tests for transformer.init().""" @classmethod @@ -26,7 +49,7 @@ def setup_class(cls): cls.transformer_name = "BaseDateTwoColumnTransformer" -class TestTransform(GenericTransformTests, GenericTwoColumnDatesTransformTests): +class TestTransform(GenericDatesTransformTests): """Tests for BaseTwoColumnDateTransformer.transform.""" @classmethod From bdd0b4fd9c9fddf621e0bdc79ff63013aa13eba8 Mon Sep 17 00:00:00 2001 From: davidhopkinson26 <89029024+davidhopkinson26@users.noreply.github.com> Date: Thu, 18 Jul 2024 15:59:32 +0000 Subject: [PATCH 04/32] :white_check_mark: fixed test_mismatched_datetypes_error --- tests/dates/test_BaseDateTransformer.py | 78 +++++++++++++++++++++++-- 1 file changed, 74 insertions(+), 4 deletions(-) diff --git a/tests/dates/test_BaseDateTransformer.py b/tests/dates/test_BaseDateTransformer.py index 753631ae..b80ea5a5 100644 --- a/tests/dates/test_BaseDateTransformer.py +++ b/tests/dates/test_BaseDateTransformer.py @@ -1,7 +1,8 @@ +import datetime + import pandas as pd import pytest -import tests.test_data as d from tests.base_tests import ( ColumnStrListInitTests, GenericFitTests, @@ -10,21 +11,88 @@ ) +def create_date_diff_different_dtypes(): + """Dataframe with different datetime formats""" + return pd.DataFrame( + { + "date_col_1": [ + datetime.date(1993, 9, 27), + datetime.date(2000, 3, 19), + datetime.date(2018, 11, 10), + datetime.date(2018, 10, 10), + datetime.date(2018, 10, 10), + datetime.date(2018, 10, 10), + datetime.date(2018, 12, 10), + datetime.date( + 1985, + 7, + 23, + ), + ], + "date_col_2": [ + datetime.date(2020, 5, 1), + datetime.date(2019, 12, 25), + datetime.date(2018, 11, 10), + datetime.date(2018, 11, 10), + datetime.date(2018, 9, 10), + datetime.date(2015, 11, 10), + datetime.date(2015, 11, 10), + datetime.date(2015, 7, 23), + ], + "datetime_col_1": [ + datetime.datetime(1993, 9, 27, tzinfo=datetime.timezone.utc), + datetime.datetime(2000, 3, 19, tzinfo=datetime.timezone.utc), + datetime.datetime(2018, 11, 10, tzinfo=datetime.timezone.utc), + datetime.datetime(2018, 10, 10, tzinfo=datetime.timezone.utc), + datetime.datetime(2018, 10, 10, tzinfo=datetime.timezone.utc), + datetime.datetime(2018, 10, 10, tzinfo=datetime.timezone.utc), + datetime.datetime(2018, 12, 10, tzinfo=datetime.timezone.utc), + datetime.datetime( + 1985, + 7, + 23, + tzinfo=datetime.timezone.utc, + ), + ], + "datetime_col_2": [ + datetime.datetime(2020, 5, 1, tzinfo=datetime.timezone.utc), + datetime.datetime(2019, 12, 25, tzinfo=datetime.timezone.utc), + datetime.datetime(2018, 11, 10, tzinfo=datetime.timezone.utc), + datetime.datetime(2018, 11, 10, tzinfo=datetime.timezone.utc), + datetime.datetime(2018, 9, 10, tzinfo=datetime.timezone.utc), + datetime.datetime(2015, 11, 10, tzinfo=datetime.timezone.utc), + datetime.datetime(2015, 11, 10, tzinfo=datetime.timezone.utc), + datetime.datetime(2015, 7, 23, tzinfo=datetime.timezone.utc), + ], + }, + ) + + class GenericDatesTransformTests: """Generic tests for Dates Transformers""" + @pytest.mark.parametrize( + ("columns, datetime_col, date_col"), + [ + (["date_col_1", "datetime_col_2"], 1, 0), + (["datetime_col_1", "date_col_2"], 0, 1), + ], + ) def test_mismatched_datetypes_error( self, columns, datetime_col, date_col, - initialized_transformers, + uninitialized_transformers, ): "Test that transform raises an error if one column is a date and one is datetime" - x = initialized_transformers(self.transformer_name) + x = uninitialized_transformers[self.transformer_name]( + columns=columns, + new_column_name="c", + ) - df = d.create_date_diff_different_dtypes() + df = create_date_diff_different_dtypes() # types don't seem to come out of the above function as expected, hard enforce for col in ["date_col_1", "date_col_2"]: df[col] = pd.to_datetime(df[col]).dt.date @@ -35,6 +103,8 @@ def test_mismatched_datetypes_error( present_types = ( {"datetime64", "date"} if datetime_col == 0 else {"date", "datetime64"} ) + print("***********************************************************") + print(df) msg = rf"Columns fed to datetime transformers should be \['datetime64', 'date'\] and have consistent types, but found {present_types}. Please use ToDatetimeTransformer to standardise" with pytest.raises( TypeError, From 5fcd093f6a29af4ee694869faa59009c0b2c31a1 Mon Sep 17 00:00:00 2001 From: davidhopkinson26 <89029024+davidhopkinson26@users.noreply.github.com> Date: Thu, 18 Jul 2024 16:00:49 +0000 Subject: [PATCH 05/32] :construction: fixes to uninitialized_transformers call --- tests/dates/test_BaseTwocolumnDateTransformer.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/dates/test_BaseTwocolumnDateTransformer.py b/tests/dates/test_BaseTwocolumnDateTransformer.py index c40f72dc..d28ff4bd 100644 --- a/tests/dates/test_BaseTwocolumnDateTransformer.py +++ b/tests/dates/test_BaseTwocolumnDateTransformer.py @@ -20,17 +20,16 @@ def test_not_two_columns_error( ): """ "test that two correct error raised when passed more or less than two columns""" - transformer = uninitialized_transformers[self.transformer_name] init_args = minimal_attribute_dict[self.transformer_name] init_args[columns] = columns - msg = f"{self.classname()}: This transformer works with two columns only" + msg = f"{self.transformer_name}: This transformer works with two columns only" with pytest.raises( ValueError, match=msg, ): - transformer(init_args) + uninitialized_transformers[self.transformer_name](init_args) class TestInit(ColumnStrListInitTests, GenericTwoColumnDatesInitTests): From bc4bf16ab121b7b54165ef6612da140138cfae4b Mon Sep 17 00:00:00 2001 From: Sam Forward <122817027+Chip2916@users.noreply.github.com> Date: Fri, 19 Jul 2024 09:45:27 +0000 Subject: [PATCH 06/32] Updated NMI init tests --- .../test_NearestMeanResponseImputer.py | 519 +++++++++--------- 1 file changed, 255 insertions(+), 264 deletions(-) diff --git a/tests/imputers/test_NearestMeanResponseImputer.py b/tests/imputers/test_NearestMeanResponseImputer.py index 3f0abbfd..fdd29be9 100644 --- a/tests/imputers/test_NearestMeanResponseImputer.py +++ b/tests/imputers/test_NearestMeanResponseImputer.py @@ -1,11 +1,12 @@ import numpy as np import pandas as pd -import pytest -import test_aide as ta -import tests.test_data as d -import tubular -from tubular.imputers import NearestMeanResponseImputer +# import pytest +# import test_aide as ta +# import tests.test_data as d +from tests.base_tests import ( + ColumnStrListInitTests, +) # Dataframe used exclusively in this testing script @@ -24,297 +25,287 @@ def create_NearestMeanResponseImputer_test_df(): ) -class TestInit: - """Tests for NearestMeanResponseImputer.__init__.""" +class TestInit(ColumnStrListInitTests): + """Generic tests for transformer.init().""" - def test_super_init_called(self, mocker): - """Test that init calls BaseTransformer.init.""" - expected_call_args = { - 0: {"args": (), "kwargs": {"columns": None, "verbose": True}}, - } + @classmethod + def setup_class(cls): + cls.transformer_name = "NearestMeanResponseImputer" - with ta.functions.assert_function_call( - mocker, - tubular.base.BaseTransformer, - "__init__", - expected_call_args, - ): - NearestMeanResponseImputer(columns=None, verbose=True) +# class TestFit: +# """Tests for NearestMeanResponseImputer.fit.""" -class TestFit: - """Tests for NearestMeanResponseImputer.fit.""" - - def test_super_fit_called(self, mocker): - """Test that fit calls BaseTransformer.fit.""" - df = create_NearestMeanResponseImputer_test_df() - - x = NearestMeanResponseImputer(columns=["a", "b"]) +# def test_super_fit_called(self, mocker): +# """Test that fit calls BaseTransformer.fit.""" +# df = create_NearestMeanResponseImputer_test_df() - expected_call_args = { - 0: { - "args": ( - create_NearestMeanResponseImputer_test_df(), - create_NearestMeanResponseImputer_test_df()["c"], - ), - "kwargs": {}, - }, - } +# x = NearestMeanResponseImputer(columns=["a", "b"]) - with ta.functions.assert_function_call( - mocker, - tubular.base.BaseTransformer, - "fit", - expected_call_args, - ): - x.fit(df, df["c"]) +# expected_call_args = { +# 0: { +# "args": ( +# create_NearestMeanResponseImputer_test_df(), +# create_NearestMeanResponseImputer_test_df()["c"], +# ), +# "kwargs": {}, +# }, +# } - def test_null_values_in_response_error(self): - """Test an error is raised if the response column contains null entries.""" - df = d.create_df_3() +# with ta.functions.assert_function_call( +# mocker, +# tubular.base.BaseTransformer, +# "fit", +# expected_call_args, +# ): +# x.fit(df, df["c"]) - x = NearestMeanResponseImputer(columns=["a", "b"]) +# def test_null_values_in_response_error(self): +# """Test an error is raised if the response column contains null entries.""" +# df = d.create_df_3() - with pytest.raises( - ValueError, - match="NearestMeanResponseImputer: y has 1 null values", - ): - x.fit(df, df["c"]) - - def test_columns_with_no_nulls_error(self): - """Test an error is raised if a non-response column contains no nulls.""" - df = pd.DataFrame( - {"a": [1, 2, 3, 4, 5], "b": [5, 4, 3, 2, 1], "c": [3, 2, 1, 4, 5]}, - ) +# x = NearestMeanResponseImputer(columns=["a", "b"]) - x = NearestMeanResponseImputer(columns=["a", "b"]) +# with pytest.raises( +# ValueError, +# match="NearestMeanResponseImputer: y has 1 null values", +# ): +# x.fit(df, df["c"]) - with pytest.raises( - ValueError, - match="NearestMeanResponseImputer: Column a has no missing values, cannot use this transformer.", - ): - x.fit(df, df["c"]) +# def test_columns_with_no_nulls_error(self): +# """Test an error is raised if a non-response column contains no nulls.""" +# df = pd.DataFrame( +# {"a": [1, 2, 3, 4, 5], "b": [5, 4, 3, 2, 1], "c": [3, 2, 1, 4, 5]}, +# ) - def test_fit_returns_self(self): - """Test fit returns self?.""" - df = create_NearestMeanResponseImputer_test_df() +# x = NearestMeanResponseImputer(columns=["a", "b"]) - x = NearestMeanResponseImputer(columns=["a", "b"]) +# with pytest.raises( +# ValueError, +# match="NearestMeanResponseImputer: Column a has no missing values, cannot use this transformer.", +# ): +# x.fit(df, df["c"]) - x_fitted = x.fit(df, df["c"]) +# def test_fit_returns_self(self): +# """Test fit returns self?.""" +# df = create_NearestMeanResponseImputer_test_df() - assert ( - x_fitted is x - ), "Returned value from NearestMeanResponseImputer.fit not as expected." +# x = NearestMeanResponseImputer(columns=["a", "b"]) - def test_fit_not_changing_data(self): - """Test fit does not change X.""" - df = create_NearestMeanResponseImputer_test_df() +# x_fitted = x.fit(df, df["c"]) - x = NearestMeanResponseImputer(columns=["a", "b"]) +# assert ( +# x_fitted is x +# ), "Returned value from NearestMeanResponseImputer.fit not as expected." - x.fit(df, df["c"]) +# def test_fit_not_changing_data(self): +# """Test fit does not change X.""" +# df = create_NearestMeanResponseImputer_test_df() - ta.equality.assert_equal_dispatch( - expected=create_NearestMeanResponseImputer_test_df(), - actual=df, - msg="Check X not changing during fit", - ) +# x = NearestMeanResponseImputer(columns=["a", "b"]) - def test_learnt_values(self): - """Test that the nearest response values learnt during fit are expected.""" - df = create_NearestMeanResponseImputer_test_df() +# x.fit(df, df["c"]) - x = NearestMeanResponseImputer(columns=["a", "b"]) +# ta.equality.assert_equal_dispatch( +# expected=create_NearestMeanResponseImputer_test_df(), +# actual=df, +# msg="Check X not changing during fit", +# ) - x.fit(df, df["c"]) +# def test_learnt_values(self): +# """Test that the nearest response values learnt during fit are expected.""" +# df = create_NearestMeanResponseImputer_test_df() - ta.classes.test_object_attributes( - obj=x, - expected_attributes={ - "impute_values_": {"a": np.float64(2), "b": np.float64(3)}, - }, - msg="impute_values_ attribute", - ) +# x = NearestMeanResponseImputer(columns=["a", "b"]) - def test_learnt_values2(self): - """Test that the nearest mean response values learnt during fit are expected.""" - df = pd.DataFrame( - { - "a": [1, 1, np.nan, np.nan, 3, 5], - "b": [np.nan, np.nan, 1, 3, 3, 4], - "c": [2, 3, 2, 1, 4, 1], - }, - ) - - x = NearestMeanResponseImputer(columns=["a", "b"]) +# x.fit(df, df["c"]) - x.fit(df, df["c"]) - - ta.classes.test_object_attributes( - obj=x, - expected_attributes={ - "impute_values_": {"a": np.float64(5), "b": np.float64(3)}, - }, - msg="impute_values_ attribute", - ) +# ta.classes.test_object_attributes( +# obj=x, +# expected_attributes={ +# "impute_values_": {"a": np.float64(2), "b": np.float64(3)}, +# }, +# msg="impute_values_ attribute", +# ) +# def test_learnt_values2(self): +# """Test that the nearest mean response values learnt during fit are expected.""" +# df = pd.DataFrame( +# { +# "a": [1, 1, np.nan, np.nan, 3, 5], +# "b": [np.nan, np.nan, 1, 3, 3, 4], +# "c": [2, 3, 2, 1, 4, 1], +# }, +# ) + +# x = NearestMeanResponseImputer(columns=["a", "b"]) -class TestTransform: - """Tests for NearestMeanResponseImputer.transform.""" +# x.fit(df, df["c"]) + +# ta.classes.test_object_attributes( +# obj=x, +# expected_attributes={ +# "impute_values_": {"a": np.float64(5), "b": np.float64(3)}, +# }, +# msg="impute_values_ attribute", +# ) + + +# class TestTransform: +# """Tests for NearestMeanResponseImputer.transform.""" + +# def expected_df_1(): +# """Expected output for .""" +# df = pd.DataFrame( +# {"a": [1, 1, 2, 3, 3, 2], "b": [3, 3, 1, 3, 3, 4], "c": [2, 3, 2, 1, 4, 1]}, +# ) - def expected_df_1(): - """Expected output for .""" - df = pd.DataFrame( - {"a": [1, 1, 2, 3, 3, 2], "b": [3, 3, 1, 3, 3, 4], "c": [2, 3, 2, 1, 4, 1]}, - ) +# df[["a", "b"]] = df[["a", "b"]].astype("float64") - df[["a", "b"]] = df[["a", "b"]].astype("float64") +# return df - return df +# def expected_df_2(): +# """Expected output for .""" +# df = pd.DataFrame( +# { +# "a": [1, 1, 2, 3, 3, 2], +# "b": [np.nan, np.nan, 1, 3, 3, 4], +# "c": [2, 3, 2, 1, 4, 1], +# }, +# ) + +# df["a"] = df["a"].astype("float64") + +# return df + +# def expected_df_3(): +# """Expected output for .""" +# df = pd.DataFrame({"a": [2, 3, 4, 1, 4, 2]}) + +# df["a"] = df["a"].astype("float64") + +# return df + +# def test_check_is_fitted_called(self, mocker): +# """Test that BaseTransformer check_is_fitted called.""" +# df = create_NearestMeanResponseImputer_test_df() + +# x = NearestMeanResponseImputer(columns=["a", "b"]) + +# x.fit(df, df["c"]) + +# expected_call_args = {0: {"args": (["impute_values_"],), "kwargs": {}}} + +# with ta.functions.assert_function_call( +# mocker, +# tubular.base.BaseTransformer, +# "check_is_fitted", +# expected_call_args, +# ): +# x.transform(df) - def expected_df_2(): - """Expected output for .""" - df = pd.DataFrame( - { - "a": [1, 1, 2, 3, 3, 2], - "b": [np.nan, np.nan, 1, 3, 3, 4], - "c": [2, 3, 2, 1, 4, 1], - }, - ) +# def test_super_transform_called(self, mocker): +# """Test that BaseTransformer.transform called.""" +# df = create_NearestMeanResponseImputer_test_df() + +# x = NearestMeanResponseImputer(columns=["a", "b"]) + +# x.fit(df, df["c"]) + +# expected_call_args = { +# 0: {"args": (create_NearestMeanResponseImputer_test_df(),), "kwargs": {}}, +# } + +# with ta.functions.assert_function_call( +# mocker, +# tubular.base.BaseTransformer, +# "transform", +# expected_call_args, +# ): +# x.transform(df) + +# @pytest.mark.parametrize( +# ("df", "expected"), +# ta.pandas.adjusted_dataframe_params( +# create_NearestMeanResponseImputer_test_df(), +# expected_df_1(), +# ), +# ) +# def test_nulls_imputed_correctly(self, df, expected): +# """Test missing values are filled with the correct values.""" +# x = NearestMeanResponseImputer(columns=["a", "b"]) - df["a"] = df["a"].astype("float64") +# # set the impute values dict directly rather than fitting x on df so test works with helpers +# x.impute_values_ = {"a": 2.0, "b": 3.0} - return df +# df_transformed = x.transform(df) - def expected_df_3(): - """Expected output for .""" - df = pd.DataFrame({"a": [2, 3, 4, 1, 4, 2]}) +# ta.equality.assert_equal_dispatch( +# expected=expected, +# actual=df_transformed, +# msg="Check nulls filled correctly in transform", +# ) + +# @pytest.mark.parametrize( +# ("df", "expected"), +# ta.pandas.adjusted_dataframe_params( +# create_NearestMeanResponseImputer_test_df(), +# expected_df_2(), +# ), +# ) +# def test_nulls_imputed_correctly2(self, df, expected): +# """Test missing values are filled with the correct values - and unrelated columns are unchanged.""" +# x = NearestMeanResponseImputer(columns="a") + +# # set the impute values dict directly rather than fitting x on df so test works with helpers +# x.impute_values_ = {"a": 2.0} - df["a"] = df["a"].astype("float64") - - return df - - def test_check_is_fitted_called(self, mocker): - """Test that BaseTransformer check_is_fitted called.""" - df = create_NearestMeanResponseImputer_test_df() - - x = NearestMeanResponseImputer(columns=["a", "b"]) - - x.fit(df, df["c"]) - - expected_call_args = {0: {"args": (["impute_values_"],), "kwargs": {}}} - - with ta.functions.assert_function_call( - mocker, - tubular.base.BaseTransformer, - "check_is_fitted", - expected_call_args, - ): - x.transform(df) - - def test_super_transform_called(self, mocker): - """Test that BaseTransformer.transform called.""" - df = create_NearestMeanResponseImputer_test_df() - - x = NearestMeanResponseImputer(columns=["a", "b"]) - - x.fit(df, df["c"]) - - expected_call_args = { - 0: {"args": (create_NearestMeanResponseImputer_test_df(),), "kwargs": {}}, - } - - with ta.functions.assert_function_call( - mocker, - tubular.base.BaseTransformer, - "transform", - expected_call_args, - ): - x.transform(df) - - @pytest.mark.parametrize( - ("df", "expected"), - ta.pandas.adjusted_dataframe_params( - create_NearestMeanResponseImputer_test_df(), - expected_df_1(), - ), - ) - def test_nulls_imputed_correctly(self, df, expected): - """Test missing values are filled with the correct values.""" - x = NearestMeanResponseImputer(columns=["a", "b"]) - - # set the impute values dict directly rather than fitting x on df so test works with helpers - x.impute_values_ = {"a": 2.0, "b": 3.0} - - df_transformed = x.transform(df) - - ta.equality.assert_equal_dispatch( - expected=expected, - actual=df_transformed, - msg="Check nulls filled correctly in transform", - ) - - @pytest.mark.parametrize( - ("df", "expected"), - ta.pandas.adjusted_dataframe_params( - create_NearestMeanResponseImputer_test_df(), - expected_df_2(), - ), - ) - def test_nulls_imputed_correctly2(self, df, expected): - """Test missing values are filled with the correct values - and unrelated columns are unchanged.""" - x = NearestMeanResponseImputer(columns="a") - - # set the impute values dict directly rather than fitting x on df so test works with helpers - x.impute_values_ = {"a": 2.0} - - df_transformed = x.transform(df) - - ta.equality.assert_equal_dispatch( - expected=expected, - actual=df_transformed, - msg="Check nulls filled correctly in transform", - ) - - @pytest.mark.parametrize( - ("df", "expected"), - ta.pandas.adjusted_dataframe_params( - pd.DataFrame({"a": [np.nan, 3, 4, 1, 4, np.nan]}), - expected_df_3(), - ), - ) - def test_nulls_imputed_correctly3(self, df, expected): - """Test missing values are filled with the correct values - with median value from separate dataframe.""" - x = NearestMeanResponseImputer(columns="a") - - # set the impute values dict directly rather than fitting x on df so test works with helpers - x.impute_values_ = {"a": 2.0} - - df_transformed = x.transform(df) - - ta.equality.assert_equal_dispatch( - expected=expected, - actual=df_transformed, - msg="Check nulls filled correctly in transform", - ) - - def test_learnt_values_not_modified(self): - """Test that the impute_values_ from fit are not changed in transform.""" - df = create_NearestMeanResponseImputer_test_df() - - x = NearestMeanResponseImputer(columns=["a", "b"]) - - x.fit(df, df["c"]) - - x2 = NearestMeanResponseImputer(columns=["a", "b"]) - - x2.fit(df, df["c"]) - - x2.transform(df) - - ta.equality.assert_equal_dispatch( - expected=x.impute_values_, - actual=x2.impute_values_, - msg="Impute values not changed in transform", - ) +# df_transformed = x.transform(df) + +# ta.equality.assert_equal_dispatch( +# expected=expected, +# actual=df_transformed, +# msg="Check nulls filled correctly in transform", +# ) + +# @pytest.mark.parametrize( +# ("df", "expected"), +# ta.pandas.adjusted_dataframe_params( +# pd.DataFrame({"a": [np.nan, 3, 4, 1, 4, np.nan]}), +# expected_df_3(), +# ), +# ) +# def test_nulls_imputed_correctly3(self, df, expected): +# """Test missing values are filled with the correct values - with median value from separate dataframe.""" +# x = NearestMeanResponseImputer(columns="a") + +# # set the impute values dict directly rather than fitting x on df so test works with helpers +# x.impute_values_ = {"a": 2.0} + +# df_transformed = x.transform(df) + +# ta.equality.assert_equal_dispatch( +# expected=expected, +# actual=df_transformed, +# msg="Check nulls filled correctly in transform", +# ) + +# def test_learnt_values_not_modified(self): +# """Test that the impute_values_ from fit are not changed in transform.""" +# df = create_NearestMeanResponseImputer_test_df() + +# x = NearestMeanResponseImputer(columns=["a", "b"]) + +# x.fit(df, df["c"]) + +# x2 = NearestMeanResponseImputer(columns=["a", "b"]) + +# x2.fit(df, df["c"]) + +# x2.transform(df) + +# ta.equality.assert_equal_dispatch( +# expected=x.impute_values_, +# actual=x2.impute_values_, +# msg="Impute values not changed in transform", +# ) From 397c6c76acdb00ebde04d03f15c847546b07019c Mon Sep 17 00:00:00 2001 From: Sam Forward <122817027+Chip2916@users.noreply.github.com> Date: Fri, 19 Jul 2024 10:23:30 +0000 Subject: [PATCH 07/32] Unblock base generic tests for NMI --- tests/conftest.py | 9 + .../test_NearestMeanResponseImputer.py | 205 ++++++++++-------- tests/test_data.py | 13 ++ tubular/imputers.py | 4 + 4 files changed, 138 insertions(+), 93 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index e75eec67..931a3f9b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -13,6 +13,7 @@ from tests.test_data import ( create_is_between_dates_df_1, create_numeric_df_1, + create_numeric_df_2, create_object_df, ) @@ -293,6 +294,7 @@ def minimal_dataframe_lookup() -> dict[str, pd.DataFrame]: """ num_df = create_numeric_df_1() + nan_df = create_numeric_df_2() object_df = create_object_df() date_df = create_is_between_dates_df_1() @@ -324,6 +326,13 @@ def minimal_dataframe_lookup() -> dict[str, pd.DataFrame]: for transformer in other_num_transformers: min_df_dict[transformer] = num_df + # Some transformers require missing values to work + other_nan_transformers = [ + "NearestMeanResponseImputer", + ] + for transformer in other_nan_transformers: + min_df_dict[transformer] = nan_df + return min_df_dict diff --git a/tests/imputers/test_NearestMeanResponseImputer.py b/tests/imputers/test_NearestMeanResponseImputer.py index fdd29be9..48c6b6f8 100644 --- a/tests/imputers/test_NearestMeanResponseImputer.py +++ b/tests/imputers/test_NearestMeanResponseImputer.py @@ -1,11 +1,13 @@ +import copy + import numpy as np import pandas as pd +import pytest +import test_aide as ta -# import pytest -# import test_aide as ta -# import tests.test_data as d from tests.base_tests import ( ColumnStrListInitTests, + GenericFitTests, ) @@ -33,122 +35,139 @@ def setup_class(cls): cls.transformer_name = "NearestMeanResponseImputer" -# class TestFit: -# """Tests for NearestMeanResponseImputer.fit.""" +class TestFit(GenericFitTests): + """Generic tests for transformer.fit()""" -# def test_super_fit_called(self, mocker): -# """Test that fit calls BaseTransformer.fit.""" -# df = create_NearestMeanResponseImputer_test_df() + @classmethod + def setup_class(cls): + cls.transformer_name = "NearestMeanResponseImputer" -# x = NearestMeanResponseImputer(columns=["a", "b"]) + def test_fit_passed_series( + self, + initialized_transformers, + minimal_dataframe_lookup, + ): + """Test fit is passed a series as y argument.""" -# expected_call_args = { -# 0: { -# "args": ( -# create_NearestMeanResponseImputer_test_df(), -# create_NearestMeanResponseImputer_test_df()["c"], -# ), -# "kwargs": {}, -# }, -# } + df = minimal_dataframe_lookup[self.transformer_name] -# with ta.functions.assert_function_call( -# mocker, -# tubular.base.BaseTransformer, -# "fit", -# expected_call_args, -# ): -# x.fit(df, df["c"]) + x = initialized_transformers[self.transformer_name] -# def test_null_values_in_response_error(self): -# """Test an error is raised if the response column contains null entries.""" -# df = d.create_df_3() + with pytest.raises( + TypeError, + match="unexpected type for y, should be a pd.Series", + ): + x.fit(df)(columns=["c"], separator=333) -# x = NearestMeanResponseImputer(columns=["a", "b"]) + def test_fit_not_changing_data( + self, + initialized_transformers, + minimal_dataframe_lookup, + ): + """Test fit does not change X.""" -# with pytest.raises( -# ValueError, -# match="NearestMeanResponseImputer: y has 1 null values", -# ): -# x.fit(df, df["c"]) + df = minimal_dataframe_lookup[self.transformer_name] + original_df = copy.deepcopy(df) -# def test_columns_with_no_nulls_error(self): -# """Test an error is raised if a non-response column contains no nulls.""" -# df = pd.DataFrame( -# {"a": [1, 2, 3, 4, 5], "b": [5, 4, 3, 2, 1], "c": [3, 2, 1, 4, 5]}, -# ) + x = initialized_transformers[self.transformer_name] -# x = NearestMeanResponseImputer(columns=["a", "b"]) + x.fit(df, df["c"]) -# with pytest.raises( -# ValueError, -# match="NearestMeanResponseImputer: Column a has no missing values, cannot use this transformer.", -# ): -# x.fit(df, df["c"]) + ta.equality.assert_equal_dispatch( + expected=original_df, + actual=df, + msg="Check X not changing during fit", + ) -# def test_fit_returns_self(self): -# """Test fit returns self?.""" -# df = create_NearestMeanResponseImputer_test_df() + # def test_null_values_in_response_error(self): + # """Test an error is raised if the response column contains null entries.""" + # df = d.create_df_3() -# x = NearestMeanResponseImputer(columns=["a", "b"]) + # x = NearestMeanResponseImputer(columns=["b"]) -# x_fitted = x.fit(df, df["c"]) + # with pytest.raises( + # ValueError, + # match="NearestMeanResponseImputer: y has 1 null values", + # ): + # x.fit(df, df["c"]) -# assert ( -# x_fitted is x -# ), "Returned value from NearestMeanResponseImputer.fit not as expected." + # def test_columns_with_no_nulls_error(self): + # """Test an error is raised if a non-response column contains no nulls.""" + # df = pd.DataFrame( + # {"a": [1, 2, 3, 4, 5], "b": [5, 4, 3, 2, 1], "c": [3, 2, 1, 4, 5]}, + # ) -# def test_fit_not_changing_data(self): -# """Test fit does not change X.""" -# df = create_NearestMeanResponseImputer_test_df() + # x = NearestMeanResponseImputer(columns=["a", "b"]) -# x = NearestMeanResponseImputer(columns=["a", "b"]) + # with pytest.raises( + # ValueError, + # match="NearestMeanResponseImputer: Column a has no missing values, cannot use this transformer.", + # ): + # x.fit(df, df["c"]) -# x.fit(df, df["c"]) + # def test_fit_returns_self(self): + # """Test fit returns self?.""" + # df = create_NearestMeanResponseImputer_test_df() -# ta.equality.assert_equal_dispatch( -# expected=create_NearestMeanResponseImputer_test_df(), -# actual=df, -# msg="Check X not changing during fit", -# ) + # x = NearestMeanResponseImputer(columns=["a", "b"]) -# def test_learnt_values(self): -# """Test that the nearest response values learnt during fit are expected.""" -# df = create_NearestMeanResponseImputer_test_df() + # x_fitted = x.fit(df, df["c"]) -# x = NearestMeanResponseImputer(columns=["a", "b"]) + # assert ( + # x_fitted is x + # ), "Returned value from NearestMeanResponseImputer.fit not as expected." -# x.fit(df, df["c"]) + # def test_fit_not_changing_data(self): + # """Test fit does not change X.""" + # df = create_NearestMeanResponseImputer_test_df() -# ta.classes.test_object_attributes( -# obj=x, -# expected_attributes={ -# "impute_values_": {"a": np.float64(2), "b": np.float64(3)}, -# }, -# msg="impute_values_ attribute", -# ) + # x = NearestMeanResponseImputer(columns=["a", "b"]) -# def test_learnt_values2(self): -# """Test that the nearest mean response values learnt during fit are expected.""" -# df = pd.DataFrame( -# { -# "a": [1, 1, np.nan, np.nan, 3, 5], -# "b": [np.nan, np.nan, 1, 3, 3, 4], -# "c": [2, 3, 2, 1, 4, 1], -# }, -# ) + # x.fit(df, df["c"]) -# x = NearestMeanResponseImputer(columns=["a", "b"]) + # ta.equality.assert_equal_dispatch( + # expected=create_NearestMeanResponseImputer_test_df(), + # actual=df, + # msg="Check X not changing during fit", + # ) -# x.fit(df, df["c"]) + # def test_learnt_values(self): + # """Test that the nearest response values learnt during fit are expected.""" + # df = create_NearestMeanResponseImputer_test_df() -# ta.classes.test_object_attributes( -# obj=x, -# expected_attributes={ -# "impute_values_": {"a": np.float64(5), "b": np.float64(3)}, -# }, -# msg="impute_values_ attribute", -# ) + # x = NearestMeanResponseImputer(columns=["a", "b"]) + + # x.fit(df, df["c"]) + + # ta.classes.test_object_attributes( + # obj=x, + # expected_attributes={ + # "impute_values_": {"a": np.float64(2), "b": np.float64(3)}, + # }, + # msg="impute_values_ attribute", + # ) + + # def test_learnt_values2(self): + # """Test that the nearest mean response values learnt during fit are expected.""" + # df = pd.DataFrame( + # { + # "a": [1, 1, np.nan, np.nan, 3, 5], + # "b": [np.nan, np.nan, 1, 3, 3, 4], + # "c": [2, 3, 2, 1, 4, 1], + # }, + # ) + + # x = NearestMeanResponseImputer(columns=["a", "b"]) + + # x.fit(df, df["c"]) + + # ta.classes.test_object_attributes( + # obj=x, + # expected_attributes={ + # "impute_values_": {"a": np.float64(5), "b": np.float64(3)}, + # }, + # msg="impute_values_ attribute", + # ) # class TestTransform: diff --git a/tests/test_data.py b/tests/test_data.py index f284503d..880e25e2 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -34,6 +34,19 @@ def create_numeric_df_1(): ) +def create_numeric_df_2(): + """Example with numeric dataframe that includes missings.""" + return pd.DataFrame( + { + "a": [34.48, 21.71, np.nan, 1.08, 32.93, 4.74, 2.76, 75.7, np.nan, 61.31], + "b": [12.03, 20.32, 24.12, 24.18, 68.99, 0.0, 0.0, 59.46, 11.02, 60.68], + "c": [17.06, 12.25, 19.15, 29.73, 1.98, 8.23, 15.22, 20.59, 3.82, 39.73], + "d": [25.94, 70.22, 72.94, 64.55, 0.41, 13.62, 30.22, 4.6, 67.13, 10.38], + "e": [94.3, 4.18, 51.7, 16.63, 2.6, 16.57, 3.51, 30.79, 66.19, 25.44], + }, + ) + + def create_object_df(): """Example with object columns - c is numeric target""" return pd.DataFrame( diff --git a/tubular/imputers.py b/tubular/imputers.py index d2f7f0fa..929ea393 100644 --- a/tubular/imputers.py +++ b/tubular/imputers.py @@ -413,6 +413,10 @@ def fit(self, X: pd.DataFrame, y: pd.Series | None = None) -> pd.DataFrame: """ super().fit(X, y) + if not isinstance(y, pd.Series): + msg = "unexpected type for y, should be a pd.Series" + raise TypeError(msg) + n_nulls = y.isna().sum() if n_nulls > 0: From edb25dc779edfccdcf531bdc0f3fdb4288073413 Mon Sep 17 00:00:00 2001 From: Sam Forward <122817027+Chip2916@users.noreply.github.com> Date: Fri, 19 Jul 2024 10:32:47 +0000 Subject: [PATCH 08/32] Unblock all NMI fit tests --- .../test_NearestMeanResponseImputer.py | 136 ++++++++---------- 1 file changed, 60 insertions(+), 76 deletions(-) diff --git a/tests/imputers/test_NearestMeanResponseImputer.py b/tests/imputers/test_NearestMeanResponseImputer.py index 48c6b6f8..1cc919c6 100644 --- a/tests/imputers/test_NearestMeanResponseImputer.py +++ b/tests/imputers/test_NearestMeanResponseImputer.py @@ -5,13 +5,15 @@ import pytest import test_aide as ta +import tests.test_data as d from tests.base_tests import ( ColumnStrListInitTests, GenericFitTests, ) +from tubular.imputers import NearestMeanResponseImputer -# Dataframe used exclusively in this testing script +# Dataframes used exclusively in this testing script def create_NearestMeanResponseImputer_test_df(): """Create DataFrame to use in NearestMeanResponseImputer tests. @@ -27,6 +29,21 @@ def create_NearestMeanResponseImputer_test_df(): ) +def create_NearestMeanResponseImputer_test_df_2(): + """Create second DataFrame to use in NearestMeanResponseImputer tests. + + DataFrame column c is the response, the other columns are numerical columns containing null entries. + + """ + return pd.DataFrame( + { + "a": [1, 1, np.nan, np.nan, 3, 5], + "b": [np.nan, np.nan, 1, 3, 3, 4], + "c": [2, 3, 2, 1, 4, 1], + }, + ) + + class TestInit(ColumnStrListInitTests): """Generic tests for transformer.init().""" @@ -79,95 +96,62 @@ def test_fit_not_changing_data( msg="Check X not changing during fit", ) - # def test_null_values_in_response_error(self): - # """Test an error is raised if the response column contains null entries.""" - # df = d.create_df_3() - - # x = NearestMeanResponseImputer(columns=["b"]) - - # with pytest.raises( - # ValueError, - # match="NearestMeanResponseImputer: y has 1 null values", - # ): - # x.fit(df, df["c"]) + def test_null_values_in_response_error(self): + """Test an error is raised if the response column contains null entries.""" + df = d.create_df_3() - # def test_columns_with_no_nulls_error(self): - # """Test an error is raised if a non-response column contains no nulls.""" - # df = pd.DataFrame( - # {"a": [1, 2, 3, 4, 5], "b": [5, 4, 3, 2, 1], "c": [3, 2, 1, 4, 5]}, - # ) + x = NearestMeanResponseImputer(columns=["b"]) - # x = NearestMeanResponseImputer(columns=["a", "b"]) - - # with pytest.raises( - # ValueError, - # match="NearestMeanResponseImputer: Column a has no missing values, cannot use this transformer.", - # ): - # x.fit(df, df["c"]) - - # def test_fit_returns_self(self): - # """Test fit returns self?.""" - # df = create_NearestMeanResponseImputer_test_df() - - # x = NearestMeanResponseImputer(columns=["a", "b"]) - - # x_fitted = x.fit(df, df["c"]) - - # assert ( - # x_fitted is x - # ), "Returned value from NearestMeanResponseImputer.fit not as expected." + with pytest.raises( + ValueError, + match="NearestMeanResponseImputer: y has 1 null values", + ): + x.fit(df, df["c"]) - # def test_fit_not_changing_data(self): - # """Test fit does not change X.""" - # df = create_NearestMeanResponseImputer_test_df() + def test_columns_with_no_nulls_error(self): + """Test an error is raised if a non-response column contains no nulls.""" + df = d.create_numeric_df_1() - # x = NearestMeanResponseImputer(columns=["a", "b"]) + x = NearestMeanResponseImputer(columns=["a", "b"]) - # x.fit(df, df["c"]) + with pytest.raises( + ValueError, + match="NearestMeanResponseImputer: Column a has no missing values, cannot use this transformer.", + ): + x.fit(df, df["c"]) - # ta.equality.assert_equal_dispatch( - # expected=create_NearestMeanResponseImputer_test_df(), - # actual=df, - # msg="Check X not changing during fit", - # ) + def test_learnt_values(self): + """Test that the nearest response values learnt during fit are expected.""" + df = create_NearestMeanResponseImputer_test_df() - # def test_learnt_values(self): - # """Test that the nearest response values learnt during fit are expected.""" - # df = create_NearestMeanResponseImputer_test_df() + x = NearestMeanResponseImputer(columns=["a", "b"]) - # x = NearestMeanResponseImputer(columns=["a", "b"]) + x.fit(df, df["c"]) - # x.fit(df, df["c"]) + ta.classes.test_object_attributes( + obj=x, + expected_attributes={ + "impute_values_": {"a": np.float64(2), "b": np.float64(3)}, + }, + msg="impute_values_ attribute", + ) - # ta.classes.test_object_attributes( - # obj=x, - # expected_attributes={ - # "impute_values_": {"a": np.float64(2), "b": np.float64(3)}, - # }, - # msg="impute_values_ attribute", - # ) + def test_learnt_values2(self): + """Test that the nearest mean response values learnt during fit are expected.""" - # def test_learnt_values2(self): - # """Test that the nearest mean response values learnt during fit are expected.""" - # df = pd.DataFrame( - # { - # "a": [1, 1, np.nan, np.nan, 3, 5], - # "b": [np.nan, np.nan, 1, 3, 3, 4], - # "c": [2, 3, 2, 1, 4, 1], - # }, - # ) + df = create_NearestMeanResponseImputer_test_df_2() - # x = NearestMeanResponseImputer(columns=["a", "b"]) + x = NearestMeanResponseImputer(columns=["a", "b"]) - # x.fit(df, df["c"]) + x.fit(df, df["c"]) - # ta.classes.test_object_attributes( - # obj=x, - # expected_attributes={ - # "impute_values_": {"a": np.float64(5), "b": np.float64(3)}, - # }, - # msg="impute_values_ attribute", - # ) + ta.classes.test_object_attributes( + obj=x, + expected_attributes={ + "impute_values_": {"a": np.float64(5), "b": np.float64(3)}, + }, + msg="impute_values_ attribute", + ) # class TestTransform: From cffc8576ce8163bd4908896d58bf857c18f86b5a Mon Sep 17 00:00:00 2001 From: Sam Forward <122817027+Chip2916@users.noreply.github.com> Date: Fri, 19 Jul 2024 12:25:58 +0000 Subject: [PATCH 09/32] Brough NMI transform tests in line with new framework --- CHANGELOG.rst | 1 + .../test_NearestMeanResponseImputer.py | 169 ++---------------- 2 files changed, 13 insertions(+), 157 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 700d4b5e..dd7b7242 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -67,6 +67,7 @@ Changed - Refactored OHE transformer tests to align with new testing framework. - Moved fixtures relating only to a single test out of conftest and into testing script where utilised. - !!!Introduced dependency on Sklearn's OneHotEncoder by adding test to check OHE transformer (which we are calling from within our OHE wrapper) is fit before transform +- Refactored NearestMeanResponseImputer in line with new testing framework. Removed diff --git a/tests/imputers/test_NearestMeanResponseImputer.py b/tests/imputers/test_NearestMeanResponseImputer.py index 1cc919c6..a1a402ba 100644 --- a/tests/imputers/test_NearestMeanResponseImputer.py +++ b/tests/imputers/test_NearestMeanResponseImputer.py @@ -9,6 +9,10 @@ from tests.base_tests import ( ColumnStrListInitTests, GenericFitTests, + GenericTransformTests, +) +from tests.imputers.test_BaseImputer import ( + GenericImputerTransformTests, ) from tubular.imputers import NearestMeanResponseImputer @@ -154,161 +158,12 @@ def test_learnt_values2(self): ) -# class TestTransform: -# """Tests for NearestMeanResponseImputer.transform.""" - -# def expected_df_1(): -# """Expected output for .""" -# df = pd.DataFrame( -# {"a": [1, 1, 2, 3, 3, 2], "b": [3, 3, 1, 3, 3, 4], "c": [2, 3, 2, 1, 4, 1]}, -# ) - -# df[["a", "b"]] = df[["a", "b"]].astype("float64") - -# return df - -# def expected_df_2(): -# """Expected output for .""" -# df = pd.DataFrame( -# { -# "a": [1, 1, 2, 3, 3, 2], -# "b": [np.nan, np.nan, 1, 3, 3, 4], -# "c": [2, 3, 2, 1, 4, 1], -# }, -# ) - -# df["a"] = df["a"].astype("float64") - -# return df - -# def expected_df_3(): -# """Expected output for .""" -# df = pd.DataFrame({"a": [2, 3, 4, 1, 4, 2]}) - -# df["a"] = df["a"].astype("float64") - -# return df - -# def test_check_is_fitted_called(self, mocker): -# """Test that BaseTransformer check_is_fitted called.""" -# df = create_NearestMeanResponseImputer_test_df() - -# x = NearestMeanResponseImputer(columns=["a", "b"]) - -# x.fit(df, df["c"]) - -# expected_call_args = {0: {"args": (["impute_values_"],), "kwargs": {}}} - -# with ta.functions.assert_function_call( -# mocker, -# tubular.base.BaseTransformer, -# "check_is_fitted", -# expected_call_args, -# ): -# x.transform(df) - -# def test_super_transform_called(self, mocker): -# """Test that BaseTransformer.transform called.""" -# df = create_NearestMeanResponseImputer_test_df() - -# x = NearestMeanResponseImputer(columns=["a", "b"]) - -# x.fit(df, df["c"]) +class TestTransform( + GenericTransformTests, + GenericImputerTransformTests, +): + """Tests for transformer.transform.""" -# expected_call_args = { -# 0: {"args": (create_NearestMeanResponseImputer_test_df(),), "kwargs": {}}, -# } - -# with ta.functions.assert_function_call( -# mocker, -# tubular.base.BaseTransformer, -# "transform", -# expected_call_args, -# ): -# x.transform(df) - -# @pytest.mark.parametrize( -# ("df", "expected"), -# ta.pandas.adjusted_dataframe_params( -# create_NearestMeanResponseImputer_test_df(), -# expected_df_1(), -# ), -# ) -# def test_nulls_imputed_correctly(self, df, expected): -# """Test missing values are filled with the correct values.""" -# x = NearestMeanResponseImputer(columns=["a", "b"]) - -# # set the impute values dict directly rather than fitting x on df so test works with helpers -# x.impute_values_ = {"a": 2.0, "b": 3.0} - -# df_transformed = x.transform(df) - -# ta.equality.assert_equal_dispatch( -# expected=expected, -# actual=df_transformed, -# msg="Check nulls filled correctly in transform", -# ) - -# @pytest.mark.parametrize( -# ("df", "expected"), -# ta.pandas.adjusted_dataframe_params( -# create_NearestMeanResponseImputer_test_df(), -# expected_df_2(), -# ), -# ) -# def test_nulls_imputed_correctly2(self, df, expected): -# """Test missing values are filled with the correct values - and unrelated columns are unchanged.""" -# x = NearestMeanResponseImputer(columns="a") - -# # set the impute values dict directly rather than fitting x on df so test works with helpers -# x.impute_values_ = {"a": 2.0} - -# df_transformed = x.transform(df) - -# ta.equality.assert_equal_dispatch( -# expected=expected, -# actual=df_transformed, -# msg="Check nulls filled correctly in transform", -# ) - -# @pytest.mark.parametrize( -# ("df", "expected"), -# ta.pandas.adjusted_dataframe_params( -# pd.DataFrame({"a": [np.nan, 3, 4, 1, 4, np.nan]}), -# expected_df_3(), -# ), -# ) -# def test_nulls_imputed_correctly3(self, df, expected): -# """Test missing values are filled with the correct values - with median value from separate dataframe.""" -# x = NearestMeanResponseImputer(columns="a") - -# # set the impute values dict directly rather than fitting x on df so test works with helpers -# x.impute_values_ = {"a": 2.0} - -# df_transformed = x.transform(df) - -# ta.equality.assert_equal_dispatch( -# expected=expected, -# actual=df_transformed, -# msg="Check nulls filled correctly in transform", -# ) - -# def test_learnt_values_not_modified(self): -# """Test that the impute_values_ from fit are not changed in transform.""" -# df = create_NearestMeanResponseImputer_test_df() - -# x = NearestMeanResponseImputer(columns=["a", "b"]) - -# x.fit(df, df["c"]) - -# x2 = NearestMeanResponseImputer(columns=["a", "b"]) - -# x2.fit(df, df["c"]) - -# x2.transform(df) - -# ta.equality.assert_equal_dispatch( -# expected=x.impute_values_, -# actual=x2.impute_values_, -# msg="Impute values not changed in transform", -# ) + @classmethod + def setup_class(cls): + cls.transformer_name = "NearestMeanResponseImputer" From 1441ed28f1b074aa0314149b4306088e2787fca4 Mon Sep 17 00:00:00 2001 From: Sam Forward <122817027+Chip2916@users.noreply.github.com> Date: Fri, 19 Jul 2024 14:42:40 +0000 Subject: [PATCH 10/32] Removed duplication of test df creation --- .../test_NearestMeanResponseImputer.py | 51 +------------------ tests/test_data.py | 8 ++- 2 files changed, 4 insertions(+), 55 deletions(-) diff --git a/tests/imputers/test_NearestMeanResponseImputer.py b/tests/imputers/test_NearestMeanResponseImputer.py index a1a402ba..a400564b 100644 --- a/tests/imputers/test_NearestMeanResponseImputer.py +++ b/tests/imputers/test_NearestMeanResponseImputer.py @@ -1,7 +1,6 @@ import copy import numpy as np -import pandas as pd import pytest import test_aide as ta @@ -17,37 +16,6 @@ from tubular.imputers import NearestMeanResponseImputer -# Dataframes used exclusively in this testing script -def create_NearestMeanResponseImputer_test_df(): - """Create DataFrame to use in NearestMeanResponseImputer tests. - - DataFrame column c is the response, the other columns are numerical columns containing null entries. - - """ - return pd.DataFrame( - { - "a": [1, 1, 2, 3, 3, np.nan], - "b": [np.nan, np.nan, 1, 3, 3, 4], - "c": [2, 3, 2, 1, 4, 1], - }, - ) - - -def create_NearestMeanResponseImputer_test_df_2(): - """Create second DataFrame to use in NearestMeanResponseImputer tests. - - DataFrame column c is the response, the other columns are numerical columns containing null entries. - - """ - return pd.DataFrame( - { - "a": [1, 1, np.nan, np.nan, 3, 5], - "b": [np.nan, np.nan, 1, 3, 3, 4], - "c": [2, 3, 2, 1, 4, 1], - }, - ) - - class TestInit(ColumnStrListInitTests): """Generic tests for transformer.init().""" @@ -126,7 +94,7 @@ def test_columns_with_no_nulls_error(self): def test_learnt_values(self): """Test that the nearest response values learnt during fit are expected.""" - df = create_NearestMeanResponseImputer_test_df() + df = d.create_numeric_df_2() x = NearestMeanResponseImputer(columns=["a", "b"]) @@ -140,23 +108,6 @@ def test_learnt_values(self): msg="impute_values_ attribute", ) - def test_learnt_values2(self): - """Test that the nearest mean response values learnt during fit are expected.""" - - df = create_NearestMeanResponseImputer_test_df_2() - - x = NearestMeanResponseImputer(columns=["a", "b"]) - - x.fit(df, df["c"]) - - ta.classes.test_object_attributes( - obj=x, - expected_attributes={ - "impute_values_": {"a": np.float64(5), "b": np.float64(3)}, - }, - msg="impute_values_ attribute", - ) - class TestTransform( GenericTransformTests, diff --git a/tests/test_data.py b/tests/test_data.py index 880e25e2..b1332f42 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -38,11 +38,9 @@ def create_numeric_df_2(): """Example with numeric dataframe that includes missings.""" return pd.DataFrame( { - "a": [34.48, 21.71, np.nan, 1.08, 32.93, 4.74, 2.76, 75.7, np.nan, 61.31], - "b": [12.03, 20.32, 24.12, 24.18, 68.99, 0.0, 0.0, 59.46, 11.02, 60.68], - "c": [17.06, 12.25, 19.15, 29.73, 1.98, 8.23, 15.22, 20.59, 3.82, 39.73], - "d": [25.94, 70.22, 72.94, 64.55, 0.41, 13.62, 30.22, 4.6, 67.13, 10.38], - "e": [94.3, 4.18, 51.7, 16.63, 2.6, 16.57, 3.51, 30.79, 66.19, 25.44], + "a": [1, 1, 2, 3, 3, np.nan], + "b": [np.nan, np.nan, 1, 3, 3, 4], + "c": [2, 3, 2, 1, 4, 1], }, ) From e5d0db7cc6c48611413c25831855ca9d10fe5c6a Mon Sep 17 00:00:00 2001 From: limlam96 <103185696+limlam96@users.noreply.github.com> Date: Fri, 19 Jul 2024 15:41:01 +0000 Subject: [PATCH 11/32] reworked tests for InteractionTransformer --- CHANGELOG.rst | 10 +- tests/numeric/test_InteractionTransformer.py | 97 +++----------------- tubular/numeric.py | 2 +- 3 files changed, 23 insertions(+), 86 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index dd7b7242..37bad0fe 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -16,7 +16,15 @@ Subsections for each version can be one of the following; Each individual change should have a link to the pull request after the description of the change. -1.3.1 (unreleased) +1.3.2 (unreleased) +------------------ + +Changed +^^^^^^^ + +- Refactored tests for InteractionTransformer + +1.3.1 (2024-07-17) ------------------ Changed diff --git a/tests/numeric/test_InteractionTransformer.py b/tests/numeric/test_InteractionTransformer.py index a84dd137..e3259315 100644 --- a/tests/numeric/test_InteractionTransformer.py +++ b/tests/numeric/test_InteractionTransformer.py @@ -1,66 +1,25 @@ -import contextlib -import re - import numpy as np import pandas as pd import pytest import test_aide as ta import tests.test_data as d -import tubular +from tests.numeric.test_BaseNumericTransformer import ( + BaseNumericTransformerInitTests, + BaseNumericTransformerTransformTests, +) from tubular.numeric import InteractionTransformer -class TestInit: +class TestInit(BaseNumericTransformerInitTests): """Tests for InteractionTransformer.init().""" - def test_super_init_called(self, mocker): - """Test that init calls BaseTransformer.init.""" - expected_call_args = { - 0: { - "args": (), - "kwargs": {"columns": ["b", "c"], "verbose": True}, - }, - } - - with ta.functions.assert_function_call( - mocker, - tubular.base.BaseTransformer, - "__init__", - expected_call_args, - ), contextlib.suppress(AttributeError): - InteractionTransformer( - columns=["b", "c"], - min_degree=2, - max_degree=2, - verbose=True, - ) + @classmethod + def setup_class(cls): + cls.transformer_name = "InteractionTransformer" def test_invalid_input_type_errors(self): """Test that an exceptions are raised for invalid input types.""" - with pytest.raises( - TypeError, - match=re.escape( - "InteractionTransformer: columns must be a string or list with the columns to be pre-processed (if specified)", - ), - ): - InteractionTransformer( - columns=3.2, - min_degree=2, - max_degree=2, - ) - - with pytest.raises( - TypeError, - match=re.escape( - "InteractionTransformer: each element of columns should be a single (string) column name", - ), - ): - InteractionTransformer( - columns=["A", "B", 4], - min_degree=2, - max_degree=2, - ) with pytest.raises( TypeError, @@ -121,28 +80,14 @@ def test_invalid_input_value_errors(self): max_degree=4, ) - def test_attributes_set(self): - """Test that the values passed for columns, degrees are saved to attributes on the object.""" - x = InteractionTransformer( - columns=["A", "B", "C"], - min_degree=2, - max_degree=3, - ) - - ta.classes.test_object_attributes( - obj=x, - expected_attributes={ - "columns": ["A", "B", "C"], - "min_degree": 2, - "max_degree": 3, - }, - msg="Attributes for InteractionTransformer set in init", - ) - -class TestTransform: +class TestTransform(BaseNumericTransformerTransformTests): """Tests for InteractionTransformer.transform().""" + @classmethod + def setup_class(cls): + cls.transformer_name = "InteractionTransformer" + def expected_df_1(): """Expected output of test_expected_output_default_assignment.""" return pd.DataFrame( @@ -183,22 +128,6 @@ def expected_df_2(): }, ) - def test_super_transform_called(self, mocker): - """Test that BaseTransformer.transform called.""" - df = d.create_df_3() - - x = InteractionTransformer(columns=["b", "c"]) - - expected_call_args = {0: {"args": (df.copy(),), "kwargs": {}}} - - with ta.functions.assert_function_call( - mocker, - tubular.base.BaseTransformer, - "transform", - expected_call_args, - ): - x.transform(df) - @pytest.mark.parametrize( ("df", "expected"), ta.pandas.adjusted_dataframe_params(d.create_df_3(), expected_df_1()), diff --git a/tubular/numeric.py b/tubular/numeric.py index 2d4229c7..0c2184f1 100644 --- a/tubular/numeric.py +++ b/tubular/numeric.py @@ -558,7 +558,7 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: return X -class InteractionTransformer(BaseTransformer): +class InteractionTransformer(BaseNumericTransformer): """Transformer that generates interaction features. Transformer generates a new column for all combinations from the selected columns up to the maximum degree provided. (For sklearn version higher than 1.0.0>, only interaction of a degree higher or equal to the minimum From 3b39f180ace98659b52d588bc829fe564d88ef76 Mon Sep 17 00:00:00 2001 From: davidhopkinson26 <89029024+davidhopkinson26@users.noreply.github.com> Date: Thu, 25 Jul 2024 09:37:22 +0000 Subject: [PATCH 12/32] :white_check_mark: fixes test_not_two_columns_error --- tests/dates/test_BaseTwocolumnDateTransformer.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/dates/test_BaseTwocolumnDateTransformer.py b/tests/dates/test_BaseTwocolumnDateTransformer.py index d28ff4bd..f36b5055 100644 --- a/tests/dates/test_BaseTwocolumnDateTransformer.py +++ b/tests/dates/test_BaseTwocolumnDateTransformer.py @@ -11,7 +11,7 @@ class GenericTwoColumnDatesInitTests: """Generic tests for Init Dates Transformers which take two columns""" - @pytest.mark.parametrize("columns", [["a"], ["a", "b", "c"]]) + @pytest.mark.parametrize("columns", ["a", ["a", "b", "c"]]) def test_not_two_columns_error( self, uninitialized_transformers, @@ -21,7 +21,9 @@ def test_not_two_columns_error( """ "test that two correct error raised when passed more or less than two columns""" init_args = minimal_attribute_dict[self.transformer_name] - init_args[columns] = columns + print(init_args) + init_args["columns"] = columns + print(init_args) msg = f"{self.transformer_name}: This transformer works with two columns only" @@ -29,7 +31,7 @@ def test_not_two_columns_error( ValueError, match=msg, ): - uninitialized_transformers[self.transformer_name](init_args) + uninitialized_transformers[self.transformer_name](*init_args) class TestInit(ColumnStrListInitTests, GenericTwoColumnDatesInitTests): From e12748092c48d4ab84407de31d3cb027666ce9dc Mon Sep 17 00:00:00 2001 From: davidhopkinson26 <89029024+davidhopkinson26@users.noreply.github.com> Date: Thu, 25 Jul 2024 09:38:47 +0000 Subject: [PATCH 13/32] :fire: delete some debugging lines --- tests/dates/test_BaseDateTransformer.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/dates/test_BaseDateTransformer.py b/tests/dates/test_BaseDateTransformer.py index b80ea5a5..759fcdf3 100644 --- a/tests/dates/test_BaseDateTransformer.py +++ b/tests/dates/test_BaseDateTransformer.py @@ -103,8 +103,6 @@ def test_mismatched_datetypes_error( present_types = ( {"datetime64", "date"} if datetime_col == 0 else {"date", "datetime64"} ) - print("***********************************************************") - print(df) msg = rf"Columns fed to datetime transformers should be \['datetime64', 'date'\] and have consistent types, but found {present_types}. Please use ToDatetimeTransformer to standardise" with pytest.raises( TypeError, From 307211ff0a10baccbf4fba879f56aa3d473a7859 Mon Sep 17 00:00:00 2001 From: davidhopkinson26 <89029024+davidhopkinson26@users.noreply.github.com> Date: Thu, 25 Jul 2024 10:29:01 +0000 Subject: [PATCH 14/32] :construction: refactored to use DateTransformerMixin in dates.py and mix this into the BaseDateTransformer and BaseDateTwoColumnTransformer, reusing logic from BaseTwoColumnTransformer --- tests/dates/test_BaseDateTransformer.py | 4 +- .../test_BaseTwocolumnDateTransformer.py | 41 +++----------- tubular/dates.py | 54 +++---------------- 3 files changed, 16 insertions(+), 83 deletions(-) diff --git a/tests/dates/test_BaseDateTransformer.py b/tests/dates/test_BaseDateTransformer.py index 759fcdf3..f46a8fee 100644 --- a/tests/dates/test_BaseDateTransformer.py +++ b/tests/dates/test_BaseDateTransformer.py @@ -68,7 +68,7 @@ def create_date_diff_different_dtypes(): ) -class GenericDatesTransformTests: +class DatesTransformMixinTests: """Generic tests for Dates Transformers""" @pytest.mark.parametrize( @@ -127,7 +127,7 @@ def setup_class(cls): cls.transformer_name = "BaseDateTransformer" -class TestTransform(GenericTransformTests, GenericDatesTransformTests): +class TestTransform(GenericTransformTests, DatesTransformMixinTests): """Tests for BaseDateTransformer.transform.""" @classmethod diff --git a/tests/dates/test_BaseTwocolumnDateTransformer.py b/tests/dates/test_BaseTwocolumnDateTransformer.py index f36b5055..f0731659 100644 --- a/tests/dates/test_BaseTwocolumnDateTransformer.py +++ b/tests/dates/test_BaseTwocolumnDateTransformer.py @@ -1,40 +1,15 @@ -import pytest - -from tests.dates.test_BaseDateTransformer import ( - ColumnStrListInitTests, - GenericDatesTransformTests, +from tests.base_tests import ( GenericFitTests, + GenericTransformTests, OtherBaseBehaviourTests, + TwoColumnListInitTests, +) +from tests.dates.test_BaseDateTransformer import ( + DatesTransformMixinTests, ) -class GenericTwoColumnDatesInitTests: - """Generic tests for Init Dates Transformers which take two columns""" - - @pytest.mark.parametrize("columns", ["a", ["a", "b", "c"]]) - def test_not_two_columns_error( - self, - uninitialized_transformers, - minimal_attribute_dict, - columns, - ): - """ "test that two correct error raised when passed more or less than two columns""" - - init_args = minimal_attribute_dict[self.transformer_name] - print(init_args) - init_args["columns"] = columns - print(init_args) - - msg = f"{self.transformer_name}: This transformer works with two columns only" - - with pytest.raises( - ValueError, - match=msg, - ): - uninitialized_transformers[self.transformer_name](*init_args) - - -class TestInit(ColumnStrListInitTests, GenericTwoColumnDatesInitTests): +class TestInit(TwoColumnListInitTests): """Generic tests for transformer.init().""" @classmethod @@ -50,7 +25,7 @@ def setup_class(cls): cls.transformer_name = "BaseDateTwoColumnTransformer" -class TestTransform(GenericDatesTransformTests): +class TestTransform(GenericTransformTests, DatesTransformMixinTests): """Tests for BaseTwoColumnDateTransformer.transform.""" @classmethod diff --git a/tubular/dates.py b/tubular/dates.py index fce4dc12..dc8b9b2b 100644 --- a/tubular/dates.py +++ b/tubular/dates.py @@ -8,10 +8,10 @@ import numpy as np import pandas as pd -from tubular.base import BaseTransformer +from tubular.base import BaseTransformer, BaseTwoColumnTransformer -class BaseDateTransformer(BaseTransformer): +class DateTransformerMixin: """ Extends BaseTransformer for datetime scenarios. @@ -141,54 +141,12 @@ def transform( return X -class BaseDateTwoColumnTransformer(BaseDateTransformer): - """ - Extends BaseDateTransformer for 2 column scenarios - - Parameters - ---------- - columns : List[str] - List of 2 columns. First column will be subtracted from second. - - new_column_name : str - Name for the new year column. - - drop_original : bool - Flag for whether to drop the original columns. - - **kwargs - Arbitrary keyword arguments passed onto BaseTransformer.init method. - - Attributes - ---------- - columns : List[str] - List of 2 columns. First column will be subtracted from second. - - new_column_name : str - Name for the new year column. - - drop_original : bool - Flag for whether to drop the original columns. +class BaseDateTransformer(BaseTransformer, DateTransformerMixin): + pass - """ - def __init__( - self, - columns: list, - new_column_name: str, - drop_original: bool = False, - **kwargs: dict[str, bool], - ) -> None: - super().__init__( - columns=columns, - new_column_name=new_column_name, - drop_original=drop_original, - **kwargs, - ) - - if len(columns) != 2: - msg = f"{self.classname()}: This transformer works with two columns only" - raise ValueError(msg) +class BaseDateTwoColumnTransformer(BaseTwoColumnTransformer, DateTransformerMixin): + pass class DateDiffLeapYearTransformer(BaseDateTwoColumnTransformer): From 73d5bc5cb23ab39f566805125dd9e8f4c7cc56c4 Mon Sep 17 00:00:00 2001 From: davidhopkinson26 <89029024+davidhopkinson26@users.noreply.github.com> Date: Thu, 25 Jul 2024 10:42:52 +0000 Subject: [PATCH 15/32] Revert ":construction: refactored to use DateTransformerMixin in dates.py and mix this into the BaseDateTransformer and BaseDateTwoColumnTransformer, reusing logic from BaseTwoColumnTransformer" This reverts commit 307211ff0a10baccbf4fba879f56aa3d473a7859. --- tests/dates/test_BaseDateTransformer.py | 4 +- .../test_BaseTwocolumnDateTransformer.py | 41 +++++++++++--- tubular/dates.py | 54 ++++++++++++++++--- 3 files changed, 83 insertions(+), 16 deletions(-) diff --git a/tests/dates/test_BaseDateTransformer.py b/tests/dates/test_BaseDateTransformer.py index f46a8fee..759fcdf3 100644 --- a/tests/dates/test_BaseDateTransformer.py +++ b/tests/dates/test_BaseDateTransformer.py @@ -68,7 +68,7 @@ def create_date_diff_different_dtypes(): ) -class DatesTransformMixinTests: +class GenericDatesTransformTests: """Generic tests for Dates Transformers""" @pytest.mark.parametrize( @@ -127,7 +127,7 @@ def setup_class(cls): cls.transformer_name = "BaseDateTransformer" -class TestTransform(GenericTransformTests, DatesTransformMixinTests): +class TestTransform(GenericTransformTests, GenericDatesTransformTests): """Tests for BaseDateTransformer.transform.""" @classmethod diff --git a/tests/dates/test_BaseTwocolumnDateTransformer.py b/tests/dates/test_BaseTwocolumnDateTransformer.py index f0731659..f36b5055 100644 --- a/tests/dates/test_BaseTwocolumnDateTransformer.py +++ b/tests/dates/test_BaseTwocolumnDateTransformer.py @@ -1,15 +1,40 @@ -from tests.base_tests import ( +import pytest + +from tests.dates.test_BaseDateTransformer import ( + ColumnStrListInitTests, + GenericDatesTransformTests, GenericFitTests, - GenericTransformTests, OtherBaseBehaviourTests, - TwoColumnListInitTests, -) -from tests.dates.test_BaseDateTransformer import ( - DatesTransformMixinTests, ) -class TestInit(TwoColumnListInitTests): +class GenericTwoColumnDatesInitTests: + """Generic tests for Init Dates Transformers which take two columns""" + + @pytest.mark.parametrize("columns", ["a", ["a", "b", "c"]]) + def test_not_two_columns_error( + self, + uninitialized_transformers, + minimal_attribute_dict, + columns, + ): + """ "test that two correct error raised when passed more or less than two columns""" + + init_args = minimal_attribute_dict[self.transformer_name] + print(init_args) + init_args["columns"] = columns + print(init_args) + + msg = f"{self.transformer_name}: This transformer works with two columns only" + + with pytest.raises( + ValueError, + match=msg, + ): + uninitialized_transformers[self.transformer_name](*init_args) + + +class TestInit(ColumnStrListInitTests, GenericTwoColumnDatesInitTests): """Generic tests for transformer.init().""" @classmethod @@ -25,7 +50,7 @@ def setup_class(cls): cls.transformer_name = "BaseDateTwoColumnTransformer" -class TestTransform(GenericTransformTests, DatesTransformMixinTests): +class TestTransform(GenericDatesTransformTests): """Tests for BaseTwoColumnDateTransformer.transform.""" @classmethod diff --git a/tubular/dates.py b/tubular/dates.py index dc8b9b2b..fce4dc12 100644 --- a/tubular/dates.py +++ b/tubular/dates.py @@ -8,10 +8,10 @@ import numpy as np import pandas as pd -from tubular.base import BaseTransformer, BaseTwoColumnTransformer +from tubular.base import BaseTransformer -class DateTransformerMixin: +class BaseDateTransformer(BaseTransformer): """ Extends BaseTransformer for datetime scenarios. @@ -141,12 +141,54 @@ def transform( return X -class BaseDateTransformer(BaseTransformer, DateTransformerMixin): - pass +class BaseDateTwoColumnTransformer(BaseDateTransformer): + """ + Extends BaseDateTransformer for 2 column scenarios + + Parameters + ---------- + columns : List[str] + List of 2 columns. First column will be subtracted from second. + + new_column_name : str + Name for the new year column. + + drop_original : bool + Flag for whether to drop the original columns. + + **kwargs + Arbitrary keyword arguments passed onto BaseTransformer.init method. + + Attributes + ---------- + columns : List[str] + List of 2 columns. First column will be subtracted from second. + + new_column_name : str + Name for the new year column. + + drop_original : bool + Flag for whether to drop the original columns. + """ -class BaseDateTwoColumnTransformer(BaseTwoColumnTransformer, DateTransformerMixin): - pass + def __init__( + self, + columns: list, + new_column_name: str, + drop_original: bool = False, + **kwargs: dict[str, bool], + ) -> None: + super().__init__( + columns=columns, + new_column_name=new_column_name, + drop_original=drop_original, + **kwargs, + ) + + if len(columns) != 2: + msg = f"{self.classname()}: This transformer works with two columns only" + raise ValueError(msg) class DateDiffLeapYearTransformer(BaseDateTwoColumnTransformer): From 32614c3b10e366aaa8b57b27b8b70eb9d7039b5e Mon Sep 17 00:00:00 2001 From: davidhopkinson26 <89029024+davidhopkinson26@users.noreply.github.com> Date: Thu, 25 Jul 2024 12:35:17 +0000 Subject: [PATCH 16/32] :construction: refactored new_col_name to new_column_name in base and somparison modules for consitency --- tests/conftest.py | 4 ++-- tubular/base.py | 6 +++--- tubular/comparison.py | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index e75eec67..39751970 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -138,7 +138,7 @@ def minimal_attribute_dict(): }, "BaseTwoColumnTransformer": { "columns": ["a", "b"], - "new_col_name": "c", + "new_column_name": "c", }, "BetweenDatesTransformer": { "new_column_name": "c", @@ -194,7 +194,7 @@ def minimal_attribute_dict(): }, "EqualityChecker": { "columns": ["a", "b"], - "new_col_name": "c", + "new_column_name": "c", "drop_original": True, }, "GroupRareLevelsTransformer": { diff --git a/tubular/base.py b/tubular/base.py index 7e9dbd57..6ad4237e 100644 --- a/tubular/base.py +++ b/tubular/base.py @@ -264,7 +264,7 @@ class BaseTwoColumnTransformer(BaseTransformer): def __init__( self, columns: list[str], - new_col_name: str, + new_column_name: str, **kwargs: dict[str, bool], ) -> None: super().__init__(columns=columns, **kwargs) @@ -277,11 +277,11 @@ def __init__( msg = f"{self.classname()}: This transformer works with two columns only" raise ValueError(msg) - if not (isinstance(new_col_name, str)): + if not (isinstance(new_column_name, str)): msg = f"{self.classname()}: new_col_name should be str" raise TypeError(msg) - self.new_col_name = new_col_name + self.new_column_name = new_column_name class DataFrameMethodTransformer(BaseDropOriginalMixin, BaseTransformer): diff --git a/tubular/comparison.py b/tubular/comparison.py index 43a79cc8..5344e461 100644 --- a/tubular/comparison.py +++ b/tubular/comparison.py @@ -28,11 +28,11 @@ class EqualityChecker(BaseDropOriginalMixin, BaseTwoColumnTransformer): def __init__( self, columns: list, - new_col_name: str, + new_column_name: str, drop_original: bool = False, **kwargs: dict[str, bool], ) -> None: - super().__init__(columns=columns, new_col_name=new_col_name, **kwargs) + super().__init__(columns=columns, new_col_name=new_column_name, **kwargs) BaseDropOriginalMixin.set_drop_original_column(self, drop_original) From b98d9612bbbb521fc6f8293d80f9128bd3392344 Mon Sep 17 00:00:00 2001 From: davidhopkinson26 <89029024+davidhopkinson26@users.noreply.github.com> Date: Thu, 25 Jul 2024 12:37:05 +0000 Subject: [PATCH 17/32] Revert ":construction: refactored new_col_name to new_column_name in base and somparison modules for consitency" This reverts commit 32614c3b10e366aaa8b57b27b8b70eb9d7039b5e. --- tests/conftest.py | 4 ++-- tubular/base.py | 6 +++--- tubular/comparison.py | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 39751970..e75eec67 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -138,7 +138,7 @@ def minimal_attribute_dict(): }, "BaseTwoColumnTransformer": { "columns": ["a", "b"], - "new_column_name": "c", + "new_col_name": "c", }, "BetweenDatesTransformer": { "new_column_name": "c", @@ -194,7 +194,7 @@ def minimal_attribute_dict(): }, "EqualityChecker": { "columns": ["a", "b"], - "new_column_name": "c", + "new_col_name": "c", "drop_original": True, }, "GroupRareLevelsTransformer": { diff --git a/tubular/base.py b/tubular/base.py index 6ad4237e..7e9dbd57 100644 --- a/tubular/base.py +++ b/tubular/base.py @@ -264,7 +264,7 @@ class BaseTwoColumnTransformer(BaseTransformer): def __init__( self, columns: list[str], - new_column_name: str, + new_col_name: str, **kwargs: dict[str, bool], ) -> None: super().__init__(columns=columns, **kwargs) @@ -277,11 +277,11 @@ def __init__( msg = f"{self.classname()}: This transformer works with two columns only" raise ValueError(msg) - if not (isinstance(new_column_name, str)): + if not (isinstance(new_col_name, str)): msg = f"{self.classname()}: new_col_name should be str" raise TypeError(msg) - self.new_column_name = new_column_name + self.new_col_name = new_col_name class DataFrameMethodTransformer(BaseDropOriginalMixin, BaseTransformer): diff --git a/tubular/comparison.py b/tubular/comparison.py index 5344e461..43a79cc8 100644 --- a/tubular/comparison.py +++ b/tubular/comparison.py @@ -28,11 +28,11 @@ class EqualityChecker(BaseDropOriginalMixin, BaseTwoColumnTransformer): def __init__( self, columns: list, - new_column_name: str, + new_col_name: str, drop_original: bool = False, **kwargs: dict[str, bool], ) -> None: - super().__init__(columns=columns, new_col_name=new_column_name, **kwargs) + super().__init__(columns=columns, new_col_name=new_col_name, **kwargs) BaseDropOriginalMixin.set_drop_original_column(self, drop_original) From 40ecfda8aa413000a0ff444cc96e20efb7f9ae22 Mon Sep 17 00:00:00 2001 From: davidhopkinson26 <89029024+davidhopkinson26@users.noreply.github.com> Date: Thu, 25 Jul 2024 12:44:08 +0000 Subject: [PATCH 18/32] :construction: renamed new_col_name to new_column_name for consistency --- tests/base_tests.py | 4 ++-- tests/comparison/test_EqualityChecker.py | 4 ++-- tests/conftest.py | 4 ++-- tubular/base.py | 10 +++++----- tubular/comparison.py | 8 ++++---- 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/tests/base_tests.py b/tests/base_tests.py index a2fa93f4..a09b4012 100644 --- a/tests/base_tests.py +++ b/tests/base_tests.py @@ -227,12 +227,12 @@ def test_new_column_name_type_error( """Test an error is raised if any type other than str passed to new_column_name""" args = minimal_attribute_dict[self.transformer_name].copy() - args["new_col_name"] = new_column_type + args["new_column_name"] = new_column_type with pytest.raises( TypeError, match=re.escape( - f"{self.transformer_name}: new_col_name should be str", + f"{self.transformer_name}: new_column_name should be str", ), ): uninitialized_transformers[self.transformer_name](**args) diff --git a/tests/comparison/test_EqualityChecker.py b/tests/comparison/test_EqualityChecker.py index 1effda08..c5fb8ab7 100644 --- a/tests/comparison/test_EqualityChecker.py +++ b/tests/comparison/test_EqualityChecker.py @@ -48,7 +48,7 @@ def test_expected_output(self, test_dataframe): example_transformer = EqualityChecker( columns=["b", "c"], - new_col_name="bool_logic", + new_column_name="bool_logic", ) actual = example_transformer.transform(test_dataframe) @@ -73,7 +73,7 @@ def test_expected_output_dropped(self, test_dataframe): example_transformer = EqualityChecker( columns=["b", "c"], - new_col_name="bool_logic", + new_column_name="bool_logic", drop_original=True, ) actual = example_transformer.transform(test_dataframe) diff --git a/tests/conftest.py b/tests/conftest.py index e75eec67..39751970 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -138,7 +138,7 @@ def minimal_attribute_dict(): }, "BaseTwoColumnTransformer": { "columns": ["a", "b"], - "new_col_name": "c", + "new_column_name": "c", }, "BetweenDatesTransformer": { "new_column_name": "c", @@ -194,7 +194,7 @@ def minimal_attribute_dict(): }, "EqualityChecker": { "columns": ["a", "b"], - "new_col_name": "c", + "new_column_name": "c", "drop_original": True, }, "GroupRareLevelsTransformer": { diff --git a/tubular/base.py b/tubular/base.py index 7e9dbd57..65e23e7a 100644 --- a/tubular/base.py +++ b/tubular/base.py @@ -253,7 +253,7 @@ class BaseTwoColumnTransformer(BaseTransformer): columns : list Column pair to apply the transformer to, must be list, cannot be None - new_col_name : str + new_column_name : str Name of new column being created, must be str, cannot be None **kwargs @@ -264,7 +264,7 @@ class BaseTwoColumnTransformer(BaseTransformer): def __init__( self, columns: list[str], - new_col_name: str, + new_column_name: str, **kwargs: dict[str, bool], ) -> None: super().__init__(columns=columns, **kwargs) @@ -277,11 +277,11 @@ def __init__( msg = f"{self.classname()}: This transformer works with two columns only" raise ValueError(msg) - if not (isinstance(new_col_name, str)): - msg = f"{self.classname()}: new_col_name should be str" + if not (isinstance(new_column_name, str)): + msg = f"{self.classname()}: new_column_name should be str" raise TypeError(msg) - self.new_col_name = new_col_name + self.new_column_name = new_column_name class DataFrameMethodTransformer(BaseDropOriginalMixin, BaseTransformer): diff --git a/tubular/comparison.py b/tubular/comparison.py index 43a79cc8..4b25c546 100644 --- a/tubular/comparison.py +++ b/tubular/comparison.py @@ -14,7 +14,7 @@ class EqualityChecker(BaseDropOriginalMixin, BaseTwoColumnTransformer): columns: list List containing names of the two columns to check. - new_col_name: string + new_column_name: string string containing the name of the new column. drop_original: boolean = False @@ -28,11 +28,11 @@ class EqualityChecker(BaseDropOriginalMixin, BaseTwoColumnTransformer): def __init__( self, columns: list, - new_col_name: str, + new_column_name: str, drop_original: bool = False, **kwargs: dict[str, bool], ) -> None: - super().__init__(columns=columns, new_col_name=new_col_name, **kwargs) + super().__init__(columns=columns, new_column_name=new_column_name, **kwargs) BaseDropOriginalMixin.set_drop_original_column(self, drop_original) @@ -53,7 +53,7 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: """ X = super().transform(X) - X[self.new_col_name] = X[self.columns[0]] == X[self.columns[1]] + X[self.new_column_name] = X[self.columns[0]] == X[self.columns[1]] # Drop original columns if self.drop_original is True BaseDropOriginalMixin.drop_original_column( From e993abdeb2085b40efeb3426ffb934be59f14342 Mon Sep 17 00:00:00 2001 From: davidhopkinson26 <89029024+davidhopkinson26@users.noreply.github.com> Date: Thu, 25 Jul 2024 13:58:01 +0000 Subject: [PATCH 19/32] :construction: restructured baseDateTransformer and BaseTwoColumnDateTransformer to inherit from DateTransformerMixin and BaseTransformer/ BaseTwoColumnTransformer --- tubular/dates.py | 141 +++++++++++++++++------------------------------ 1 file changed, 50 insertions(+), 91 deletions(-) diff --git a/tubular/dates.py b/tubular/dates.py index fce4dc12..1b00efc4 100644 --- a/tubular/dates.py +++ b/tubular/dates.py @@ -8,60 +8,10 @@ import numpy as np import pandas as pd -from tubular.base import BaseTransformer +from tubular.base import BaseTransformer, BaseTwoColumnTransformer -class BaseDateTransformer(BaseTransformer): - """ - Extends BaseTransformer for datetime scenarios. - - Parameters - ---------- - columns : List[str] - List of 2 columns. First column will be subtracted from second. - - new_column_name : str - Name for the new year column. - - drop_original : bool - Flag for whether to drop the original columns. - - **kwargs - Arbitrary keyword arguments passed onto BaseTransformer.init method. - - Attributes - ---------- - columns : List[str] - List of 2 columns. First column will be subtracted from second. - - new_column_name : str - Name for the new year column. - - drop_original : bool - Flag for whether to drop the original columns. - - """ - - def __init__( - self, - columns: list, - new_column_name: str, - drop_original: bool = False, - **kwargs: dict[str, bool], - ) -> None: - super().__init__(columns=columns, **kwargs) - - if not (isinstance(new_column_name, str)): - msg = f"{self.classname()}: new_column_name should be str" - raise TypeError(msg) - - if not (isinstance(drop_original, bool)): - msg = f"{self.classname()}: drop_original should be bool" - raise TypeError(msg) - - self.new_column_name = new_column_name - self.drop_original = drop_original - +class DateTransformerMixin: def check_columns_are_date_or_datetime( self, X: pd.DataFrame, @@ -112,6 +62,24 @@ def check_columns_are_date_or_datetime( msg, ) + +class BaseDateTransformer(DateTransformerMixin, BaseTransformer): + """ + Extends BaseTransformer for datetime scenarios. + + """ + + def __init__( + self, + columns: list[str], + new_column_name: str | None = None, + drop_original: bool = False, + **kwargs: dict[str, bool], + ) -> None: + super().__init__(columns=columns, **kwargs) + self.new_column_name = new_column_name + self.drop_original = drop_original + def transform( self, X: pd.DataFrame, @@ -141,54 +109,45 @@ def transform( return X -class BaseDateTwoColumnTransformer(BaseDateTransformer): - """ - Extends BaseDateTransformer for 2 column scenarios - - Parameters - ---------- - columns : List[str] - List of 2 columns. First column will be subtracted from second. +class BaseDateTwoColumnTransformer(DateTransformerMixin, BaseTwoColumnTransformer): + def __init__( + self, + columns: list[str], + new_column_name: str | None = None, + drop_original: bool = False, + **kwargs: dict[str, bool], + ) -> None: + super().__init__(columns=columns, new_column_name=new_column_name, **kwargs) - new_column_name : str - Name for the new year column. + self.drop_original = drop_original - drop_original : bool - Flag for whether to drop the original columns. + def transform( + self, + X: pd.DataFrame, + datetime_only: bool = False, + ) -> pd.DataFrame: + """Base transform method, calls parent transform and validates data. - **kwargs - Arbitrary keyword arguments passed onto BaseTransformer.init method. + Parameters + ---------- + X : pd.DataFrame + Data containing self.columns - Attributes - ---------- - columns : List[str] - List of 2 columns. First column will be subtracted from second. + datetime_only: bool + Indicates whether ONLY datetime types are accepted - new_column_name : str - Name for the new year column. + Returns + ------- + X : pd.DataFrame + Validated data - drop_original : bool - Flag for whether to drop the original columns. + """ - """ + X = super().transform(X) - def __init__( - self, - columns: list, - new_column_name: str, - drop_original: bool = False, - **kwargs: dict[str, bool], - ) -> None: - super().__init__( - columns=columns, - new_column_name=new_column_name, - drop_original=drop_original, - **kwargs, - ) + self.check_columns_are_date_or_datetime(X, datetime_only=datetime_only) - if len(columns) != 2: - msg = f"{self.classname()}: This transformer works with two columns only" - raise ValueError(msg) + return X class DateDiffLeapYearTransformer(BaseDateTwoColumnTransformer): From 9b217cf69169b0db83a27accbe4115e8b0ea997e Mon Sep 17 00:00:00 2001 From: davidhopkinson26 <89029024+davidhopkinson26@users.noreply.github.com> Date: Thu, 25 Jul 2024 13:58:49 +0000 Subject: [PATCH 20/32] :construction: marked some tests which will need redeveloping with xfail for now --- tests/dates/test_DateDiffLeapYearTransformer.py | 1 + tests/dates/test_SeriesDtMethodTransformer.py | 1 + tests/dates/test_ToDatetimeTransformer.py | 1 + 3 files changed, 3 insertions(+) diff --git a/tests/dates/test_DateDiffLeapYearTransformer.py b/tests/dates/test_DateDiffLeapYearTransformer.py index 3076b9d6..4b0c48fe 100644 --- a/tests/dates/test_DateDiffLeapYearTransformer.py +++ b/tests/dates/test_DateDiffLeapYearTransformer.py @@ -74,6 +74,7 @@ def test_new_column_name_type_error(self): drop_original=True, ) + @pytest.mark.xfail def test_drop_original_type_error(self): """Test that an exception is raised if drop_original is not a bool.""" with pytest.raises( diff --git a/tests/dates/test_SeriesDtMethodTransformer.py b/tests/dates/test_SeriesDtMethodTransformer.py index 9f824a2a..804a3257 100644 --- a/tests/dates/test_SeriesDtMethodTransformer.py +++ b/tests/dates/test_SeriesDtMethodTransformer.py @@ -9,6 +9,7 @@ class TestInit: """Tests for SeriesDtMethodTransformer.init().""" + @pytest.mark.xfail def test_invalid_input_type_errors(self): """Test that an exceptions are raised for invalid input types.""" with pytest.raises( diff --git a/tests/dates/test_ToDatetimeTransformer.py b/tests/dates/test_ToDatetimeTransformer.py index c7ca0940..a15f389c 100644 --- a/tests/dates/test_ToDatetimeTransformer.py +++ b/tests/dates/test_ToDatetimeTransformer.py @@ -22,6 +22,7 @@ def test_column_type_error(self): new_column_name="a", ) + @pytest.mark.xfail def test_new_column_name_type_error(self): """Test that an exception is raised if new_column_name is not a str.""" with pytest.raises( From a032ba269ebed58478835901ef750b2c87d1bbcc Mon Sep 17 00:00:00 2001 From: davidhopkinson26 <89029024+davidhopkinson26@users.noreply.github.com> Date: Thu, 25 Jul 2024 14:07:20 +0000 Subject: [PATCH 21/32] :art: changed to use tests from TwoColumnListInitTests and made Date sepcific test class a mixin --- tests/dates/test_BaseDateTransformer.py | 4 +- .../test_BaseTwocolumnDateTransformer.py | 39 +++---------------- 2 files changed, 8 insertions(+), 35 deletions(-) diff --git a/tests/dates/test_BaseDateTransformer.py b/tests/dates/test_BaseDateTransformer.py index 759fcdf3..d130e102 100644 --- a/tests/dates/test_BaseDateTransformer.py +++ b/tests/dates/test_BaseDateTransformer.py @@ -68,7 +68,7 @@ def create_date_diff_different_dtypes(): ) -class GenericDatesTransformTests: +class DatesMixinTransformTests: """Generic tests for Dates Transformers""" @pytest.mark.parametrize( @@ -127,7 +127,7 @@ def setup_class(cls): cls.transformer_name = "BaseDateTransformer" -class TestTransform(GenericTransformTests, GenericDatesTransformTests): +class TestTransform(GenericTransformTests, DatesMixinTransformTests): """Tests for BaseDateTransformer.transform.""" @classmethod diff --git a/tests/dates/test_BaseTwocolumnDateTransformer.py b/tests/dates/test_BaseTwocolumnDateTransformer.py index f36b5055..c854df4f 100644 --- a/tests/dates/test_BaseTwocolumnDateTransformer.py +++ b/tests/dates/test_BaseTwocolumnDateTransformer.py @@ -1,40 +1,13 @@ -import pytest - -from tests.dates.test_BaseDateTransformer import ( - ColumnStrListInitTests, - GenericDatesTransformTests, +from tests.base_tests import ( GenericFitTests, + GenericTransformTests, OtherBaseBehaviourTests, + TwoColumnListInitTests, ) +from tests.dates.test_BaseDateTransformer import DatesMixinTransformTests -class GenericTwoColumnDatesInitTests: - """Generic tests for Init Dates Transformers which take two columns""" - - @pytest.mark.parametrize("columns", ["a", ["a", "b", "c"]]) - def test_not_two_columns_error( - self, - uninitialized_transformers, - minimal_attribute_dict, - columns, - ): - """ "test that two correct error raised when passed more or less than two columns""" - - init_args = minimal_attribute_dict[self.transformer_name] - print(init_args) - init_args["columns"] = columns - print(init_args) - - msg = f"{self.transformer_name}: This transformer works with two columns only" - - with pytest.raises( - ValueError, - match=msg, - ): - uninitialized_transformers[self.transformer_name](*init_args) - - -class TestInit(ColumnStrListInitTests, GenericTwoColumnDatesInitTests): +class TestInit(TwoColumnListInitTests): """Generic tests for transformer.init().""" @classmethod @@ -50,7 +23,7 @@ def setup_class(cls): cls.transformer_name = "BaseDateTwoColumnTransformer" -class TestTransform(GenericDatesTransformTests): +class TestTransform(GenericTransformTests, DatesMixinTransformTests): """Tests for BaseTwoColumnDateTransformer.transform.""" @classmethod From ab1bec818749a2944490b3bea03e1e3a43f41daa Mon Sep 17 00:00:00 2001 From: davidhopkinson26 <89029024+davidhopkinson26@users.noreply.github.com> Date: Thu, 25 Jul 2024 14:11:43 +0000 Subject: [PATCH 22/32] :art: changed name of BaseDropOriginalMixin to DropOriginalMixin --- tubular/base.py | 8 ++++---- tubular/comparison.py | 8 ++++---- tubular/mixins.py | 2 +- tubular/nominal.py | 8 ++++---- tubular/numeric.py | 4 ++-- 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/tubular/base.py b/tubular/base.py index 65e23e7a..101d0e04 100644 --- a/tubular/base.py +++ b/tubular/base.py @@ -11,7 +11,7 @@ from sklearn.utils.validation import check_is_fitted from tubular._version import __version__ -from tubular.mixins import BaseDropOriginalMixin +from tubular.mixins import DropOriginalMixin pd.options.mode.copy_on_write = True @@ -284,7 +284,7 @@ def __init__( self.new_column_name = new_column_name -class DataFrameMethodTransformer(BaseDropOriginalMixin, BaseTransformer): +class DataFrameMethodTransformer(DropOriginalMixin, BaseTransformer): """Tranformer that applies a pandas.DataFrame method. @@ -374,7 +374,7 @@ def __init__( self.pd_method_name = pd_method_name self.pd_method_kwargs = pd_method_kwargs - BaseDropOriginalMixin.set_drop_original_column(self, drop_original) + DropOriginalMixin.set_drop_original_column(self, drop_original) try: df = pd.DataFrame() @@ -409,7 +409,7 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: ) # Drop original columns if self.drop_original is True - BaseDropOriginalMixin.drop_original_column( + DropOriginalMixin.drop_original_column( self, X, self.drop_original, diff --git a/tubular/comparison.py b/tubular/comparison.py index 4b25c546..0eff8eaa 100644 --- a/tubular/comparison.py +++ b/tubular/comparison.py @@ -3,10 +3,10 @@ import pandas as pd # noqa: TCH002 from tubular.base import BaseTwoColumnTransformer -from tubular.mixins import BaseDropOriginalMixin +from tubular.mixins import DropOriginalMixin -class EqualityChecker(BaseDropOriginalMixin, BaseTwoColumnTransformer): +class EqualityChecker(DropOriginalMixin, BaseTwoColumnTransformer): """Transformer to check if two columns are equal. Parameters @@ -34,7 +34,7 @@ def __init__( ) -> None: super().__init__(columns=columns, new_column_name=new_column_name, **kwargs) - BaseDropOriginalMixin.set_drop_original_column(self, drop_original) + DropOriginalMixin.set_drop_original_column(self, drop_original) def transform(self, X: pd.DataFrame) -> pd.DataFrame: """Create a column which is populated by the boolean @@ -56,7 +56,7 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: X[self.new_column_name] = X[self.columns[0]] == X[self.columns[1]] # Drop original columns if self.drop_original is True - BaseDropOriginalMixin.drop_original_column( + DropOriginalMixin.drop_original_column( self, X, self.drop_original, diff --git a/tubular/mixins.py b/tubular/mixins.py index 1fab0458..ede232ca 100644 --- a/tubular/mixins.py +++ b/tubular/mixins.py @@ -4,7 +4,7 @@ import pandas as pd -class BaseDropOriginalMixin: +class DropOriginalMixin: """Mixin class to validate and apply 'drop_original' argument used by various transformers. Transformer deletes transformer input columns depending on boolean argument. diff --git a/tubular/nominal.py b/tubular/nominal.py index f4e0ffe3..91c0844c 100644 --- a/tubular/nominal.py +++ b/tubular/nominal.py @@ -10,7 +10,7 @@ from tubular.base import BaseTransformer from tubular.mapping import BaseMappingTransformMixin -from tubular.mixins import BaseDropOriginalMixin, WeightColumnMixin +from tubular.mixins import DropOriginalMixin, WeightColumnMixin class BaseNominalTransformer(BaseTransformer): @@ -1036,7 +1036,7 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: return BaseMappingTransformMixin.transform(self, X) -class OneHotEncodingTransformer(BaseDropOriginalMixin, BaseTransformer, OneHotEncoder): +class OneHotEncodingTransformer(DropOriginalMixin, BaseTransformer, OneHotEncoder): """Transformer to convert cetegorical variables into dummy columns. Extends the sklearn OneHotEncoder class to provide easy renaming of dummy columns. @@ -1108,7 +1108,7 @@ def __init__( self.separator = separator - BaseDropOriginalMixin.set_drop_original_column(self, drop_original) + DropOriginalMixin.set_drop_original_column(self, drop_original) def fit(self, X: pd.DataFrame, y: pd.Series | None = None) -> pd.DataFrame: """Gets list of levels for each column to be transformed. This defines which dummy columns @@ -1254,7 +1254,7 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: ) # Drop original columns if self.drop_original is True - BaseDropOriginalMixin.drop_original_column( + DropOriginalMixin.drop_original_column( self, X, self.drop_original, diff --git a/tubular/numeric.py b/tubular/numeric.py index 2d4229c7..83c1cd5b 100644 --- a/tubular/numeric.py +++ b/tubular/numeric.py @@ -13,7 +13,7 @@ ) from tubular.base import BaseTransformer, DataFrameMethodTransformer -from tubular.mixins import BaseDropOriginalMixin +from tubular.mixins import DropOriginalMixin class BaseNumericTransformer(BaseTransformer): @@ -107,7 +107,7 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: return X -class LogTransformer(BaseNumericTransformer, BaseDropOriginalMixin): +class LogTransformer(BaseNumericTransformer, DropOriginalMixin): """Transformer to apply log transformation. Transformer has the option to add 1 to the columns to log and drop the From 9d1db6c7a98ed83e79970ed23ebe27e102acf18b Mon Sep 17 00:00:00 2001 From: davidhopkinson26 <89029024+davidhopkinson26@users.noreply.github.com> Date: Thu, 25 Jul 2024 14:16:33 +0000 Subject: [PATCH 23/32] :construction: added drop original mixin to BaseDateTransformers. Actual dropping still needs to be added to child classes as currently drop original doesn't actually do anything yet --- tubular/dates.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/tubular/dates.py b/tubular/dates.py index 1b00efc4..3bb1f193 100644 --- a/tubular/dates.py +++ b/tubular/dates.py @@ -9,6 +9,7 @@ import pandas as pd from tubular.base import BaseTransformer, BaseTwoColumnTransformer +from tubular.mixins import DropOriginalMixin class DateTransformerMixin: @@ -63,7 +64,7 @@ def check_columns_are_date_or_datetime( ) -class BaseDateTransformer(DateTransformerMixin, BaseTransformer): +class BaseDateTransformer(DateTransformerMixin, DropOriginalMixin, BaseTransformer): """ Extends BaseTransformer for datetime scenarios. @@ -77,8 +78,10 @@ def __init__( **kwargs: dict[str, bool], ) -> None: super().__init__(columns=columns, **kwargs) + + DropOriginalMixin.set_drop_original_column(self, drop_original) + self.new_column_name = new_column_name - self.drop_original = drop_original def transform( self, @@ -109,7 +112,11 @@ def transform( return X -class BaseDateTwoColumnTransformer(DateTransformerMixin, BaseTwoColumnTransformer): +class BaseDateTwoColumnTransformer( + DateTransformerMixin, + DropOriginalMixin, + BaseTwoColumnTransformer, +): def __init__( self, columns: list[str], @@ -119,7 +126,7 @@ def __init__( ) -> None: super().__init__(columns=columns, new_column_name=new_column_name, **kwargs) - self.drop_original = drop_original + DropOriginalMixin.set_drop_original_column(self, drop_original) def transform( self, From 1c35973e5f70c76d1a2a4123a249e248cec140c9 Mon Sep 17 00:00:00 2001 From: davidhopkinson26 <89029024+davidhopkinson26@users.noreply.github.com> Date: Thu, 25 Jul 2024 14:38:47 +0000 Subject: [PATCH 24/32] :construction: moved new_column_name handling to NewColumnNameMixin --- tests/base_tests.py | 53 ++++++++++++++---------- tests/comparison/test_EqualityChecker.py | 7 +++- tests/conftest.py | 1 - tubular/base.py | 10 ----- tubular/comparison.py | 7 ++-- tubular/dates.py | 4 +- tubular/mixins.py | 11 +++++ 7 files changed, 54 insertions(+), 39 deletions(-) diff --git a/tests/base_tests.py b/tests/base_tests.py index a09b4012..81e4bc4c 100644 --- a/tests/base_tests.py +++ b/tests/base_tests.py @@ -144,6 +144,36 @@ def test_drop_column_arg_errors( uninitialized_transformers[self.transformer_name](**args) +class NewColumnNameInitMixintests: + """ + Tests for BaseTransformer.init() behaviour specific to when a transformer accepts a "new_column_name" column. + Note this deliberately avoids starting with "Tests" so that the tests are not run on import. + """ + + @pytest.mark.parametrize( + "new_column_type", + [1, True, {"a": 1}, [1, 2], None, np.inf, np.nan], + ) + def test_new_column_name_type_error( + self, + new_column_type, + minimal_attribute_dict, + uninitialized_transformers, + ): + """Test an error is raised if any type other than str passed to new_column_name""" + + args = minimal_attribute_dict[self.transformer_name].copy() + args["new_column_name"] = new_column_type + + with pytest.raises( + TypeError, + match=re.escape( + f"{self.transformer_name}: new_column_name should be str", + ), + ): + uninitialized_transformers[self.transformer_name](**args) + + class WeightColumnInitMixinTests: """ Tests for BaseTransformer.init() behaviour specific to when a transformer takes accepts a weight column. @@ -214,29 +244,6 @@ def test_list_length_error( ): uninitialized_transformers[self.transformer_name](**args) - @pytest.mark.parametrize( - "new_column_type", - [1, True, {"a": 1}, [1, 2], None, np.inf, np.nan], - ) - def test_new_column_name_type_error( - self, - new_column_type, - minimal_attribute_dict, - uninitialized_transformers, - ): - """Test an error is raised if any type other than str passed to new_column_name""" - - args = minimal_attribute_dict[self.transformer_name].copy() - args["new_column_name"] = new_column_type - - with pytest.raises( - TypeError, - match=re.escape( - f"{self.transformer_name}: new_column_name should be str", - ), - ): - uninitialized_transformers[self.transformer_name](**args) - class GenericFitTests: """ diff --git a/tests/comparison/test_EqualityChecker.py b/tests/comparison/test_EqualityChecker.py index c5fb8ab7..8bb028d3 100644 --- a/tests/comparison/test_EqualityChecker.py +++ b/tests/comparison/test_EqualityChecker.py @@ -6,13 +6,18 @@ DropOriginalInitMixinTests, GenericFitTests, GenericTransformTests, + NewColumnNameInitMixintests, OtherBaseBehaviourTests, TwoColumnListInitTests, ) from tubular.comparison import EqualityChecker -class TestInit(DropOriginalInitMixinTests, TwoColumnListInitTests): +class TestInit( + DropOriginalInitMixinTests, + NewColumnNameInitMixintests, + TwoColumnListInitTests, +): """Generic tests for transformer.init().""" @classmethod diff --git a/tests/conftest.py b/tests/conftest.py index 39751970..11dc94c9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -138,7 +138,6 @@ def minimal_attribute_dict(): }, "BaseTwoColumnTransformer": { "columns": ["a", "b"], - "new_column_name": "c", }, "BetweenDatesTransformer": { "new_column_name": "c", diff --git a/tubular/base.py b/tubular/base.py index 101d0e04..bc94aa3d 100644 --- a/tubular/base.py +++ b/tubular/base.py @@ -253,9 +253,6 @@ class BaseTwoColumnTransformer(BaseTransformer): columns : list Column pair to apply the transformer to, must be list, cannot be None - new_column_name : str - Name of new column being created, must be str, cannot be None - **kwargs Arbitrary keyword arguments passed onto BaseTransformer.__init__(). @@ -264,7 +261,6 @@ class BaseTwoColumnTransformer(BaseTransformer): def __init__( self, columns: list[str], - new_column_name: str, **kwargs: dict[str, bool], ) -> None: super().__init__(columns=columns, **kwargs) @@ -277,12 +273,6 @@ def __init__( msg = f"{self.classname()}: This transformer works with two columns only" raise ValueError(msg) - if not (isinstance(new_column_name, str)): - msg = f"{self.classname()}: new_column_name should be str" - raise TypeError(msg) - - self.new_column_name = new_column_name - class DataFrameMethodTransformer(DropOriginalMixin, BaseTransformer): diff --git a/tubular/comparison.py b/tubular/comparison.py index 0eff8eaa..55ebce8e 100644 --- a/tubular/comparison.py +++ b/tubular/comparison.py @@ -3,10 +3,10 @@ import pandas as pd # noqa: TCH002 from tubular.base import BaseTwoColumnTransformer -from tubular.mixins import DropOriginalMixin +from tubular.mixins import DropOriginalMixin, NewColumnNameMixin -class EqualityChecker(DropOriginalMixin, BaseTwoColumnTransformer): +class EqualityChecker(DropOriginalMixin, NewColumnNameMixin, BaseTwoColumnTransformer): """Transformer to check if two columns are equal. Parameters @@ -32,9 +32,10 @@ def __init__( drop_original: bool = False, **kwargs: dict[str, bool], ) -> None: - super().__init__(columns=columns, new_column_name=new_column_name, **kwargs) + super().__init__(columns=columns, **kwargs) DropOriginalMixin.set_drop_original_column(self, drop_original) + NewColumnNameMixin.check_and_set_new_column_name(self, new_column_name) def transform(self, X: pd.DataFrame) -> pd.DataFrame: """Create a column which is populated by the boolean diff --git a/tubular/dates.py b/tubular/dates.py index 3bb1f193..40a967e6 100644 --- a/tubular/dates.py +++ b/tubular/dates.py @@ -124,10 +124,12 @@ def __init__( drop_original: bool = False, **kwargs: dict[str, bool], ) -> None: - super().__init__(columns=columns, new_column_name=new_column_name, **kwargs) + super().__init__(columns=columns, **kwargs) DropOriginalMixin.set_drop_original_column(self, drop_original) + self.new_column_name = new_column_name + def transform( self, X: pd.DataFrame, diff --git a/tubular/mixins.py b/tubular/mixins.py index ede232ca..8610c420 100644 --- a/tubular/mixins.py +++ b/tubular/mixins.py @@ -60,6 +60,17 @@ def drop_original_column( return X +class NewColumnNameMixin: + """Helper to validate and set""" + + def check_and_set_new_column_name(self, new_column_name: str) -> None: + if not (isinstance(new_column_name, str)): + msg = f"{self.classname()}: new_column_name should be str" + raise TypeError(msg) + + self.new_column_name = new_column_name + + class WeightColumnMixin: def check_weights_column(self, X: pd.DataFrame, weights_column: str) -> None: """Helper method for validating weights column in dataframe. From cb204116c5127a648580b74b6a5e04cb281547bc Mon Sep 17 00:00:00 2001 From: davidhopkinson26 <89029024+davidhopkinson26@users.noreply.github.com> Date: Thu, 25 Jul 2024 14:44:40 +0000 Subject: [PATCH 25/32] :art: BaseDateTransformer classes now use NewColumnNameMixin and tests now use NewColumnNameInitMixintests, DropOriginalInitMixinTests --- tests/dates/test_BaseDateTransformer.py | 8 +++++++- tests/dates/test_BaseTwocolumnDateTransformer.py | 8 +++++++- tests/dates/test_DateDiffLeapYearTransformer.py | 1 - tests/dates/test_SeriesDtMethodTransformer.py | 1 - tests/dates/test_ToDatetimeTransformer.py | 1 - tubular/dates.py | 8 +++----- 6 files changed, 17 insertions(+), 10 deletions(-) diff --git a/tests/dates/test_BaseDateTransformer.py b/tests/dates/test_BaseDateTransformer.py index d130e102..0ab1d4ba 100644 --- a/tests/dates/test_BaseDateTransformer.py +++ b/tests/dates/test_BaseDateTransformer.py @@ -5,8 +5,10 @@ from tests.base_tests import ( ColumnStrListInitTests, + DropOriginalInitMixinTests, GenericFitTests, GenericTransformTests, + NewColumnNameInitMixintests, OtherBaseBehaviourTests, ) @@ -111,7 +113,11 @@ def test_mismatched_datetypes_error( x.transform(df) -class TestInit(ColumnStrListInitTests): +class TestInit( + NewColumnNameInitMixintests, + DropOriginalInitMixinTests, + ColumnStrListInitTests, +): """Generic tests for transformer.init().""" @classmethod diff --git a/tests/dates/test_BaseTwocolumnDateTransformer.py b/tests/dates/test_BaseTwocolumnDateTransformer.py index c854df4f..cd9cf946 100644 --- a/tests/dates/test_BaseTwocolumnDateTransformer.py +++ b/tests/dates/test_BaseTwocolumnDateTransformer.py @@ -1,13 +1,19 @@ from tests.base_tests import ( + DropOriginalInitMixinTests, GenericFitTests, GenericTransformTests, + NewColumnNameInitMixintests, OtherBaseBehaviourTests, TwoColumnListInitTests, ) from tests.dates.test_BaseDateTransformer import DatesMixinTransformTests -class TestInit(TwoColumnListInitTests): +class TestInit( + NewColumnNameInitMixintests, + DropOriginalInitMixinTests, + TwoColumnListInitTests, +): """Generic tests for transformer.init().""" @classmethod diff --git a/tests/dates/test_DateDiffLeapYearTransformer.py b/tests/dates/test_DateDiffLeapYearTransformer.py index 4b0c48fe..3076b9d6 100644 --- a/tests/dates/test_DateDiffLeapYearTransformer.py +++ b/tests/dates/test_DateDiffLeapYearTransformer.py @@ -74,7 +74,6 @@ def test_new_column_name_type_error(self): drop_original=True, ) - @pytest.mark.xfail def test_drop_original_type_error(self): """Test that an exception is raised if drop_original is not a bool.""" with pytest.raises( diff --git a/tests/dates/test_SeriesDtMethodTransformer.py b/tests/dates/test_SeriesDtMethodTransformer.py index 804a3257..9f824a2a 100644 --- a/tests/dates/test_SeriesDtMethodTransformer.py +++ b/tests/dates/test_SeriesDtMethodTransformer.py @@ -9,7 +9,6 @@ class TestInit: """Tests for SeriesDtMethodTransformer.init().""" - @pytest.mark.xfail def test_invalid_input_type_errors(self): """Test that an exceptions are raised for invalid input types.""" with pytest.raises( diff --git a/tests/dates/test_ToDatetimeTransformer.py b/tests/dates/test_ToDatetimeTransformer.py index a15f389c..c7ca0940 100644 --- a/tests/dates/test_ToDatetimeTransformer.py +++ b/tests/dates/test_ToDatetimeTransformer.py @@ -22,7 +22,6 @@ def test_column_type_error(self): new_column_name="a", ) - @pytest.mark.xfail def test_new_column_name_type_error(self): """Test that an exception is raised if new_column_name is not a str.""" with pytest.raises( diff --git a/tubular/dates.py b/tubular/dates.py index 40a967e6..5823939f 100644 --- a/tubular/dates.py +++ b/tubular/dates.py @@ -9,7 +9,7 @@ import pandas as pd from tubular.base import BaseTransformer, BaseTwoColumnTransformer -from tubular.mixins import DropOriginalMixin +from tubular.mixins import DropOriginalMixin, NewColumnNameMixin class DateTransformerMixin: @@ -80,8 +80,7 @@ def __init__( super().__init__(columns=columns, **kwargs) DropOriginalMixin.set_drop_original_column(self, drop_original) - - self.new_column_name = new_column_name + NewColumnNameMixin.check_and_set_new_column_name(self, new_column_name) def transform( self, @@ -127,8 +126,7 @@ def __init__( super().__init__(columns=columns, **kwargs) DropOriginalMixin.set_drop_original_column(self, drop_original) - - self.new_column_name = new_column_name + NewColumnNameMixin.check_and_set_new_column_name(self, new_column_name) def transform( self, From cfd83dc26d3aaaffc9fbde2d5716d7cf8bea163e Mon Sep 17 00:00:00 2001 From: davidhopkinson26 <89029024+davidhopkinson26@users.noreply.github.com> Date: Thu, 25 Jul 2024 14:59:39 +0000 Subject: [PATCH 26/32] :memo: updated docstrings --- tubular/dates.py | 37 ++++++++++++++++++++++++++++++++++++- tubular/mixins.py | 2 +- 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/tubular/dates.py b/tubular/dates.py index 5823939f..6849700b 100644 --- a/tubular/dates.py +++ b/tubular/dates.py @@ -13,6 +13,8 @@ class DateTransformerMixin: + """Class to contain methods specific to datetime transformers""" + def check_columns_are_date_or_datetime( self, X: pd.DataFrame, @@ -66,8 +68,21 @@ def check_columns_are_date_or_datetime( class BaseDateTransformer(DateTransformerMixin, DropOriginalMixin, BaseTransformer): """ - Extends BaseTransformer for datetime scenarios. + Extends BaseTransformer for datetime scenarios + + Parameters + ---------- + columns : List[str] + List of 2 columns. First column will be subtracted from second. + + new_column_name : str + Name for the new year column. + + drop_original : bool + Flag for whether to drop the original columns. + **kwargs + Arbitrary keyword arguments passed onto BaseTransformer.init method. """ def __init__( @@ -116,6 +131,26 @@ class BaseDateTwoColumnTransformer( DropOriginalMixin, BaseTwoColumnTransformer, ): + + """Extends BaseTwoColumnTransformer for datetime scenarios + + Parameters + ---------- + columns : list + Either a list of str values or a string giving which columns in a input pandas.DataFrame the transformer + will be applied to. + + new_column_name : str + Name for the new year column. + + drop_original : bool + Flag for whether to drop the original columns. + + **kwargs + Arbitrary keyword arguments passed onto BaseTransformer.init method. + + """ + def __init__( self, columns: list[str], diff --git a/tubular/mixins.py b/tubular/mixins.py index 8610c420..03211a01 100644 --- a/tubular/mixins.py +++ b/tubular/mixins.py @@ -61,7 +61,7 @@ def drop_original_column( class NewColumnNameMixin: - """Helper to validate and set""" + """Helper to validate and set new_column_name attribute""" def check_and_set_new_column_name(self, new_column_name: str) -> None: if not (isinstance(new_column_name, str)): From 17ab554344c60077a1d0632374c1ae70708f7881 Mon Sep 17 00:00:00 2001 From: davidhopkinson26 <89029024+davidhopkinson26@users.noreply.github.com> Date: Thu, 25 Jul 2024 15:05:09 +0000 Subject: [PATCH 27/32] :memo: update changelog --- CHANGELOG.rst | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index dd7b7242..3b7033e2 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -16,7 +16,19 @@ Subsections for each version can be one of the following; Each individual change should have a link to the pull request after the description of the change. -1.3.1 (unreleased) +1.3.2 (unreleased) +------------------ + +Changed +^^^^^^^ +- Refactored BaseDateTransformer, BaseDateTwoColumnTransformer and associated testing `#280 `_ + +Added +^^^^^ + +- mixin class to handle new_column_name arguments `#280 `_ + +1.3.1 (2024-07-18) ------------------ Changed From 980e304a5f121d4a91355bcc04a1de55476e33a9 Mon Sep 17 00:00:00 2001 From: davidhopkinson26 <89029024+davidhopkinson26@users.noreply.github.com> Date: Thu, 25 Jul 2024 15:05:42 +0000 Subject: [PATCH 28/32] :memo: update changelog --- CHANGELOG.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 3b7033e2..f7b03ffe 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -21,12 +21,12 @@ Each individual change should have a link to the pull request after the descript Changed ^^^^^^^ -- Refactored BaseDateTransformer, BaseDateTwoColumnTransformer and associated testing `#280 `_ +- Refactored BaseDateTransformer, BaseDateTwoColumnTransformer and associated testing `#273 `_ Added ^^^^^ -- mixin class to handle new_column_name arguments `#280 `_ +- mixin class to handle new_column_name arguments `#273 `_ 1.3.1 (2024-07-18) ------------------ From b507634342fcd935827cddd6dc0b36777178b643 Mon Sep 17 00:00:00 2001 From: davidhopkinson26 <89029024+davidhopkinson26@users.noreply.github.com> Date: Thu, 25 Jul 2024 15:27:31 +0000 Subject: [PATCH 29/32] :art: moved logic from BaseTwoColumnTransformer into a mixin and reduced duplication in dates.py --- CHANGELOG.rst | 6 +-- tests/base/test_BaseTwoColumnTransformer.py | 40 ----------------- tests/conftest.py | 3 -- tubular/base.py | 33 -------------- tubular/comparison.py | 12 +++-- tubular/dates.py | 49 +++++---------------- tubular/mixins.py | 13 ++++++ 7 files changed, 35 insertions(+), 121 deletions(-) delete mode 100644 tests/base/test_BaseTwoColumnTransformer.py diff --git a/CHANGELOG.rst b/CHANGELOG.rst index f7b03ffe..db310316 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -22,11 +22,7 @@ Each individual change should have a link to the pull request after the descript Changed ^^^^^^^ - Refactored BaseDateTransformer, BaseDateTwoColumnTransformer and associated testing `#273 `_ - -Added -^^^^^ - -- mixin class to handle new_column_name arguments `#273 `_ +- BaseTwoColumnTransformer removed in favour of mixin classes TwoColumnMixin and NewColumnNameMixin to handle validation of two columns and new_column_name arguments `#273 `_ 1.3.1 (2024-07-18) ------------------ diff --git a/tests/base/test_BaseTwoColumnTransformer.py b/tests/base/test_BaseTwoColumnTransformer.py deleted file mode 100644 index 2cf75888..00000000 --- a/tests/base/test_BaseTwoColumnTransformer.py +++ /dev/null @@ -1,40 +0,0 @@ -from tests.base_tests import ( - GenericFitTests, - GenericTransformTests, - OtherBaseBehaviourTests, - TwoColumnListInitTests, -) - - -class TestInit(TwoColumnListInitTests): - """Generic tests for transformer.init().""" - - @classmethod - def setup_class(cls): - cls.transformer_name = "BaseTwoColumnTransformer" - - -class TestFit(GenericFitTests): - """Generic tests for transformer.fit()""" - - @classmethod - def setup_class(cls): - cls.transformer_name = "BaseTwoColumnTransformer" - - -class TestTransform(GenericTransformTests): - @classmethod - def setup_class(cls): - cls.transformer_name = "BaseTwoColumnTransformer" - - -class TestOtherBaseBehaviour(OtherBaseBehaviourTests): - """ - Class to run tests for BaseTransformerBehaviour outside the three standard methods. - - May need to overwite specific tests in this class if the tested transformer modifies this behaviour. - """ - - @classmethod - def setup_class(cls): - cls.transformer_name = "BaseTwoColumnTransformer" diff --git a/tests/conftest.py b/tests/conftest.py index 0e2b0212..d94686ec 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -137,9 +137,6 @@ def minimal_attribute_dict(): "BaseTransformer": { "columns": ["a"], }, - "BaseTwoColumnTransformer": { - "columns": ["a", "b"], - }, "BetweenDatesTransformer": { "new_column_name": "c", "columns": ["a", "c", "b"], diff --git a/tubular/base.py b/tubular/base.py index bc94aa3d..103edf81 100644 --- a/tubular/base.py +++ b/tubular/base.py @@ -241,39 +241,6 @@ def columns_check(self, X: pd.DataFrame) -> None: raise ValueError(f"{self.classname()}: variable " + c + " is not in X") -class BaseTwoColumnTransformer(BaseTransformer): - """Transformer that takes a list of two columns as an argument, as well as new_column_name - - Inherits from BaseTransformer, all current transformers that use this argument also output a new column - Inherits fit and transform methods from BaseTransformer (required by sklearn transformers), simple input checking - and functionality to copy X prior to transform. - - Parameters - ---------- - columns : list - Column pair to apply the transformer to, must be list, cannot be None - - **kwargs - Arbitrary keyword arguments passed onto BaseTransformer.__init__(). - - """ - - def __init__( - self, - columns: list[str], - **kwargs: dict[str, bool], - ) -> None: - super().__init__(columns=columns, **kwargs) - - if not (isinstance(columns, list)): - msg = f"{self.classname()}: columns should be list" - raise TypeError(msg) - - if len(columns) != 2: - msg = f"{self.classname()}: This transformer works with two columns only" - raise ValueError(msg) - - class DataFrameMethodTransformer(DropOriginalMixin, BaseTransformer): """Tranformer that applies a pandas.DataFrame method. diff --git a/tubular/comparison.py b/tubular/comparison.py index 55ebce8e..c334ffd0 100644 --- a/tubular/comparison.py +++ b/tubular/comparison.py @@ -2,11 +2,16 @@ import pandas as pd # noqa: TCH002 -from tubular.base import BaseTwoColumnTransformer -from tubular.mixins import DropOriginalMixin, NewColumnNameMixin +from tubular.base import BaseTransformer +from tubular.mixins import DropOriginalMixin, NewColumnNameMixin, TwoColumnMixin -class EqualityChecker(DropOriginalMixin, NewColumnNameMixin, BaseTwoColumnTransformer): +class EqualityChecker( + DropOriginalMixin, + NewColumnNameMixin, + TwoColumnMixin, + BaseTransformer, +): """Transformer to check if two columns are equal. Parameters @@ -34,6 +39,7 @@ def __init__( ) -> None: super().__init__(columns=columns, **kwargs) + TwoColumnMixin.check_two_columns(self, columns) DropOriginalMixin.set_drop_original_column(self, drop_original) NewColumnNameMixin.check_and_set_new_column_name(self, new_column_name) diff --git a/tubular/dates.py b/tubular/dates.py index 6849700b..ffbd66b8 100644 --- a/tubular/dates.py +++ b/tubular/dates.py @@ -8,8 +8,8 @@ import numpy as np import pandas as pd -from tubular.base import BaseTransformer, BaseTwoColumnTransformer -from tubular.mixins import DropOriginalMixin, NewColumnNameMixin +from tubular.base import BaseTransformer +from tubular.mixins import DropOriginalMixin, NewColumnNameMixin, TwoColumnMixin class DateTransformerMixin: @@ -127,12 +127,11 @@ def transform( class BaseDateTwoColumnTransformer( - DateTransformerMixin, - DropOriginalMixin, - BaseTwoColumnTransformer, + TwoColumnMixin, + BaseDateTransformer, ): - """Extends BaseTwoColumnTransformer for datetime scenarios + """Extends BaseDateTransformer for transformers which accept exactly two columns Parameters ---------- @@ -158,38 +157,14 @@ def __init__( drop_original: bool = False, **kwargs: dict[str, bool], ) -> None: - super().__init__(columns=columns, **kwargs) - - DropOriginalMixin.set_drop_original_column(self, drop_original) - NewColumnNameMixin.check_and_set_new_column_name(self, new_column_name) - - def transform( - self, - X: pd.DataFrame, - datetime_only: bool = False, - ) -> pd.DataFrame: - """Base transform method, calls parent transform and validates data. - - Parameters - ---------- - X : pd.DataFrame - Data containing self.columns - - datetime_only: bool - Indicates whether ONLY datetime types are accepted - - Returns - ------- - X : pd.DataFrame - Validated data - - """ - - X = super().transform(X) - - self.check_columns_are_date_or_datetime(X, datetime_only=datetime_only) + super().__init__( + columns=columns, + new_column_name=new_column_name, + drop_original=drop_original, + **kwargs, + ) - return X + TwoColumnMixin.check_two_columns(self, columns) class DateDiffLeapYearTransformer(BaseDateTwoColumnTransformer): diff --git a/tubular/mixins.py b/tubular/mixins.py index 03211a01..1f0c1a91 100644 --- a/tubular/mixins.py +++ b/tubular/mixins.py @@ -71,6 +71,19 @@ def check_and_set_new_column_name(self, new_column_name: str) -> None: self.new_column_name = new_column_name +class TwoColumnMixin: + """helper to validate columns when exactly two columns are required""" + + def check_two_columns(self, columns: list[str] | str) -> None: + if not (isinstance(columns, list)): + msg = f"{self.classname()}: columns should be list" + raise TypeError(msg) + + if len(columns) != 2: + msg = f"{self.classname()}: This transformer works with two columns only" + raise ValueError(msg) + + class WeightColumnMixin: def check_weights_column(self, X: pd.DataFrame, weights_column: str) -> None: """Helper method for validating weights column in dataframe. From c1ae8b926ac134d0b6ac9e2d43cf91ef1ce861c2 Mon Sep 17 00:00:00 2001 From: davidhopkinson26 <89029024+davidhopkinson26@users.noreply.github.com> Date: Thu, 25 Jul 2024 15:30:29 +0000 Subject: [PATCH 30/32] :art: removed DateTransformermixin in favour of the logic being in BaseDateTransformer --- tubular/dates.py | 64 +++++++++++++++++++++++------------------------- 1 file changed, 30 insertions(+), 34 deletions(-) diff --git a/tubular/dates.py b/tubular/dates.py index ffbd66b8..e2622813 100644 --- a/tubular/dates.py +++ b/tubular/dates.py @@ -12,8 +12,36 @@ from tubular.mixins import DropOriginalMixin, NewColumnNameMixin, TwoColumnMixin -class DateTransformerMixin: - """Class to contain methods specific to datetime transformers""" +class BaseDateTransformer(DropOriginalMixin, BaseTransformer): + """ + Extends BaseTransformer for datetime scenarios + + Parameters + ---------- + columns : List[str] + List of 2 columns. First column will be subtracted from second. + + new_column_name : str + Name for the new year column. + + drop_original : bool + Flag for whether to drop the original columns. + + **kwargs + Arbitrary keyword arguments passed onto BaseTransformer.init method. + """ + + def __init__( + self, + columns: list[str], + new_column_name: str | None = None, + drop_original: bool = False, + **kwargs: dict[str, bool], + ) -> None: + super().__init__(columns=columns, **kwargs) + + DropOriginalMixin.set_drop_original_column(self, drop_original) + NewColumnNameMixin.check_and_set_new_column_name(self, new_column_name) def check_columns_are_date_or_datetime( self, @@ -65,38 +93,6 @@ def check_columns_are_date_or_datetime( msg, ) - -class BaseDateTransformer(DateTransformerMixin, DropOriginalMixin, BaseTransformer): - """ - Extends BaseTransformer for datetime scenarios - - Parameters - ---------- - columns : List[str] - List of 2 columns. First column will be subtracted from second. - - new_column_name : str - Name for the new year column. - - drop_original : bool - Flag for whether to drop the original columns. - - **kwargs - Arbitrary keyword arguments passed onto BaseTransformer.init method. - """ - - def __init__( - self, - columns: list[str], - new_column_name: str | None = None, - drop_original: bool = False, - **kwargs: dict[str, bool], - ) -> None: - super().__init__(columns=columns, **kwargs) - - DropOriginalMixin.set_drop_original_column(self, drop_original) - NewColumnNameMixin.check_and_set_new_column_name(self, new_column_name) - def transform( self, X: pd.DataFrame, From 03fa239e6e5994ac020dc8337a81d93e194de311 Mon Sep 17 00:00:00 2001 From: davidhopkinson26 <89029024+davidhopkinson26@users.noreply.github.com> Date: Mon, 29 Jul 2024 11:08:40 +0000 Subject: [PATCH 31/32] :art: changed call to mixin methods to call from self --- tubular/comparison.py | 6 +++--- tubular/dates.py | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tubular/comparison.py b/tubular/comparison.py index c334ffd0..2d3cab7b 100644 --- a/tubular/comparison.py +++ b/tubular/comparison.py @@ -39,9 +39,9 @@ def __init__( ) -> None: super().__init__(columns=columns, **kwargs) - TwoColumnMixin.check_two_columns(self, columns) - DropOriginalMixin.set_drop_original_column(self, drop_original) - NewColumnNameMixin.check_and_set_new_column_name(self, new_column_name) + self.check_two_columns(columns) + self.set_drop_original_column(drop_original) + self.check_and_set_new_column_name(new_column_name) def transform(self, X: pd.DataFrame) -> pd.DataFrame: """Create a column which is populated by the boolean diff --git a/tubular/dates.py b/tubular/dates.py index e2622813..4b1c23df 100644 --- a/tubular/dates.py +++ b/tubular/dates.py @@ -12,7 +12,7 @@ from tubular.mixins import DropOriginalMixin, NewColumnNameMixin, TwoColumnMixin -class BaseDateTransformer(DropOriginalMixin, BaseTransformer): +class BaseDateTransformer(NewColumnNameMixin, DropOriginalMixin, BaseTransformer): """ Extends BaseTransformer for datetime scenarios @@ -40,8 +40,8 @@ def __init__( ) -> None: super().__init__(columns=columns, **kwargs) - DropOriginalMixin.set_drop_original_column(self, drop_original) - NewColumnNameMixin.check_and_set_new_column_name(self, new_column_name) + self.set_drop_original_column(drop_original) + self.check_and_set_new_column_name(new_column_name) def check_columns_are_date_or_datetime( self, @@ -160,7 +160,7 @@ def __init__( **kwargs, ) - TwoColumnMixin.check_two_columns(self, columns) + self.check_two_columns(columns) class DateDiffLeapYearTransformer(BaseDateTwoColumnTransformer): From d5984ef5b268f6a5019ffff3f6046f8bf60db883 Mon Sep 17 00:00:00 2001 From: davidhopkinson26 <89029024+davidhopkinson26@users.noreply.github.com> Date: Mon, 29 Jul 2024 11:16:01 +0000 Subject: [PATCH 32/32] :art: removed str from check_two_columns typehints --- tubular/mixins.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tubular/mixins.py b/tubular/mixins.py index 1f0c1a91..909e9bfc 100644 --- a/tubular/mixins.py +++ b/tubular/mixins.py @@ -74,7 +74,7 @@ def check_and_set_new_column_name(self, new_column_name: str) -> None: class TwoColumnMixin: """helper to validate columns when exactly two columns are required""" - def check_two_columns(self, columns: list[str] | str) -> None: + def check_two_columns(self, columns: list[str]) -> None: if not (isinstance(columns, list)): msg = f"{self.classname()}: columns should be list" raise TypeError(msg)