diff --git a/src/python/nimbusml/internal/core/base_pipeline_item.py b/src/python/nimbusml/internal/core/base_pipeline_item.py index cfd1aee9..8700da1a 100644 --- a/src/python/nimbusml/internal/core/base_pipeline_item.py +++ b/src/python/nimbusml/internal/core/base_pipeline_item.py @@ -956,8 +956,10 @@ def _steal_io(self, node): """ if hasattr(node, '_columns') and node._columns is not None: self << node._columns - setattr(node, node._attr_input, - getattr(node, node._attr_output)) + + if hasattr(node, '_attr_output'): + setattr(node, node._attr_input, + getattr(node, node._attr_output)) else: # No columns specified. The user plans to fit the pipeline as # fit(X, y). diff --git a/src/python/nimbusml/preprocessing/missing_values/filter.py b/src/python/nimbusml/preprocessing/missing_values/filter.py index 18435c13..ccdd272d 100644 --- a/src/python/nimbusml/preprocessing/missing_values/filter.py +++ b/src/python/nimbusml/preprocessing/missing_values/filter.py @@ -77,3 +77,13 @@ def get_params(self, deep=False): Get the parameters for this operator. """ return core.get_params(self) + + def _nodes_with_presteps(self): + """ + Inserts preprocessing before this one. + """ + from ..schema import TypeConverter + return [ + TypeConverter( + result_type='R4')._steal_io(self), + self] diff --git a/src/python/nimbusml/preprocessing/missing_values/handler.py b/src/python/nimbusml/preprocessing/missing_values/handler.py index d390eb4a..1a1fac0a 100644 --- a/src/python/nimbusml/preprocessing/missing_values/handler.py +++ b/src/python/nimbusml/preprocessing/missing_values/handler.py @@ -106,3 +106,13 @@ def get_params(self, deep=False): Get the parameters for this operator. """ return core.get_params(self) + + def _nodes_with_presteps(self): + """ + Inserts preprocessing before this one. + """ + from ..schema import TypeConverter + return [ + TypeConverter( + result_type='R4')._steal_io(self), + self] diff --git a/src/python/nimbusml/preprocessing/missing_values/indicator.py b/src/python/nimbusml/preprocessing/missing_values/indicator.py index fdcfecc9..8c6a26a4 100644 --- a/src/python/nimbusml/preprocessing/missing_values/indicator.py +++ b/src/python/nimbusml/preprocessing/missing_values/indicator.py @@ -79,3 +79,13 @@ def get_params(self, deep=False): Get the parameters for this operator. """ return core.get_params(self) + + def _nodes_with_presteps(self): + """ + Inserts preprocessing before this one. + """ + from ..schema import TypeConverter + return [ + TypeConverter( + result_type='R4')._steal_io(self), + self] diff --git a/src/python/nimbusml/tests/preprocessing/missing_values/test_data_with_missing.py b/src/python/nimbusml/tests/preprocessing/missing_values/test_data_with_missing.py index 9b072af4..fb0bdc79 100644 --- a/src/python/nimbusml/tests/preprocessing/missing_values/test_data_with_missing.py +++ b/src/python/nimbusml/tests/preprocessing/missing_values/test_data_with_missing.py @@ -9,7 +9,7 @@ from math import isnan from nimbusml import Pipeline from nimbusml.linear_model import FastLinearRegressor -from nimbusml.preprocessing.missing_values import Filter, Handler +from nimbusml.preprocessing.missing_values import Filter, Handler, Indicator from pandas import DataFrame from sklearn.utils.testing import assert_equal, assert_true, \ assert_allclose @@ -75,6 +75,90 @@ def test_input_types(self): res['Score'].values, [ 4.965541, 0.519701, 4.992831, 3.877400, 5.020121], rtol=1e-4) + def test_input_conversion_to_float(self): + data={'f0': [0, 1, 2, 3], + 'f1': [1, 2, 3, 4], + 'f2': [1, 2, 3, 4], + 'f3': [1, 2, 3, 4], + 'f4': ['2', '3', '4', '5'], + 'f5': [4, 5, np.nan, 9]} + + data = DataFrame(data).astype({ + 'f0': np.int8, + 'f1': np.int16, + 'f2': np.int32, + 'f3': np.int64, + 'f4': str, + 'f5': np.float64}) + + # Check Indicator + xf = Indicator() + result = xf.fit_transform(data) + + assert_equal(result.loc[2, 'f5'], True) + result.loc[2, 'f5'] = False + result = ~result + self.assertTrue(result.all(axis=None)) + + # Check Filter + xf = Filter() + result = xf.fit_transform(data) + assert_equal(len(result), 3) + assert_equal(result.loc[2, 'f5'], 9.0) + + # Check Handler + xf = Handler(replace_with='Mean') + result = xf.fit_transform(data) + assert_equal(len(result), 4) + assert_equal(result.loc[2, 'f5.f5'], 6.0) + assert_equal(result.loc[2, 'f5.IsMissing.f5'], 1.0) + + def test_input_conversion_to_float_retains_other_column_types(self): + data={'f0': [0, 1, 2, 3], + 'f1': ['2', '3', '4', '5'], + 'f2': [4, 5, np.nan, 9]} + + data = DataFrame(data).astype({ + 'f0': np.int32, + 'f1': str, + 'f2': np.float64}) + + # Check Indicator + xf = Indicator(columns={'f2.ind': 'f2'}) + result = xf.fit_transform(data) + assert_equal(result.dtypes['f0'], np.int32) + assert_equal(result.dtypes['f1'], np.object) + assert_equal(result.dtypes['f2'], np.float64) + assert_equal(result.dtypes['f2.ind'], np.bool) + assert_equal(result.loc[2, 'f2.ind'], True) + assert_equal(len(result), 4) + + # Check Filter + xf = Filter(columns=['f2']) + result = xf.fit_transform(data) + assert_equal(len(result), 3) + assert_equal(result.loc[2, 'f2'], 9.0) + assert_equal(result.dtypes['f0'], np.int32) + assert_equal(result.dtypes['f1'], np.object) + assert_equal(result.dtypes['f2'], np.float32) + + xf = Filter(columns=['f1']) + result = xf.fit_transform(data) + assert_equal(len(result), 4) + assert_equal(result.loc[3, 'f2'], 9.0) + assert_equal(result.dtypes['f0'], np.int32) + assert_equal(result.dtypes['f1'], np.float32) + assert_equal(result.dtypes['f2'], np.float64) + + # Check Handler + xf = Handler(columns=['f2'], replace_with='Mean') + result = xf.fit_transform(data) + assert_equal(len(result), 4) + assert_equal(result.loc[2, 'f2.f2'], 6.0) + assert_equal(result.dtypes['f0'], np.int32) + assert_equal(result.dtypes['f1'], np.object) + assert_equal(result.dtypes['f2.f2'], np.float32) + if __name__ == '__main__': unittest.main() diff --git a/src/python/nimbusml/tests/utils/test_exports.py b/src/python/nimbusml/tests/utils/test_exports.py index 96d1ddfa..26800725 100644 --- a/src/python/nimbusml/tests/utils/test_exports.py +++ b/src/python/nimbusml/tests/utils/test_exports.py @@ -492,6 +492,17 @@ def test_get_fit_info_fastl(self): 'Month', 'Day'], 'type': 'start'}, + {'inputs': ['Ozone'], + 'name': 'TypeConverter', + 'outputs': ['Ozone'], + 'schema_after': ['Unnamed0', + 'Ozone', + 'Solar_R', + 'Wind', + 'Temp', + 'Month', + 'Day'], + 'type': 'transform'}, {'inputs': ['Ozone'], 'name': 'Filter', 'outputs': ['Ozone'], @@ -506,7 +517,7 @@ def test_get_fit_info_fastl(self): for el in info[0]: if 'operator' in el: del el['operator'] - self.assertEqual(exp, info[0][:2]) + self.assertEqual(exp, info[0][:3]) def test_word_embedding(self): diff --git a/src/python/tools/compiler_utils.py b/src/python/tools/compiler_utils.py index c64f5af3..7771ed6c 100644 --- a/src/python/tools/compiler_utils.py +++ b/src/python/tools/compiler_utils.py @@ -129,6 +129,9 @@ def _nodes_with_presteps(self): 'MeanVarianceScaler': int_to_r4_converter, 'LogMeanVarianceScaler': int_to_r4_converter, 'Binner': int_to_r4_converter, + 'Filter': int_to_r4_converter, + 'Handler': int_to_r4_converter, + 'Indicator': int_to_r4_converter, # 'SupervisedBinner': int_to_r4_converter, # not exist in nimbusml 'IidSpikeDetector': timeseries_to_r4_converter,