Fix bug in Pipeline.transform() #294

ganik · 2019-10-03T11:56:58Z

tuple [](start = 35, length = 5)

When could y be a tuple? How is y_temp used now? #Resolved

y_temp is not used at all in this method.

In reply to: 330999871 [](ancestors = 330999871)

ganik · 2019-10-04T22:29:02Z

_preprocess_X_y [](start = 43, length = 15)

Do you need to call this at all? #Resolved

-Original file line number
+Diff line change
@@ Expand Up / @@ -40,9 +40,16 @@ @@
         [PR#232](https://github.com/microsoft/NimbusML/pull/232)
         Enable passing python executable to dataprep package, so dataprep can execute python transformations
+    - **Fixed `Pipeline.transform()` failing in a transform-only `Pipeline` when the y column is provided**
+        [PR#294](https://github.com/microsoft/NimbusML/pull/294)
+        Enable calling `.transform()` on a `Pipeline` containing only transforms when the y column is provided
     ## **Breaking Changes**
-    None.
+    - **Removed `y` parameter from `Pipeline.transform()`**
+        [PR#294](https://github.com/microsoft/NimbusML/pull/294)
+        Removed `y` parameter from `Pipeline.transform()` as it is neither needed nor used for transforming data with a fitted `Pipeline`.
     ## **Enhancements**
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up / @@ -686,6 +686,7 @@ @@
         <Compile Include="nimbusml\tests\pipeline\test_pipeline_split_models.py" />
         <Compile Include="nimbusml\tests\pipeline\test_pipeline_combining.py" />
         <Compile Include="nimbusml\tests\pipeline\test_pipeline_subclassing.py" />
+        <Compile Include="nimbusml\tests\pipeline\test_pipeline_transform_method.py" />
         <Compile Include="nimbusml\tests\preprocessing\normalization\test_lpscaler.py" />
         <Compile Include="nimbusml\tests\preprocessing\normalization\test_meanvariancescaler.py" />
         <Compile Include="nimbusml\tests\preprocessing\schema\test_prefixcolumnconcatenator.py" />
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up / @@ -2,9 +2,8 @@ @@
     # LightLda: cluster topics
     import pandas
     from nimbusml import Pipeline
-    from nimbusml.feature_extraction.text import LightLda
-    from nimbusml.feature_extraction.text import NGramFeaturizer
-    from nimbusml.internal.entrypoints._ngramextractor_ngram import n_gram
+    from nimbusml.feature_extraction.text import LightLda, NGramFeaturizer
+    from nimbusml.feature_extraction.text.extractor import Ngram
     # create the data
     topics = pandas.DataFrame(data=dict(review=[
@@ Expand All / @@ -19,7 +18,7 @@ @@
     # there are three main topics in our data. set num_topic=3
     # and see if LightLDA vectors for topics look similar
-    pipeline = Pipeline([NGramFeaturizer(word_feature_extractor=n_gram(
+    pipeline = Pipeline([NGramFeaturizer(word_feature_extractor=Ngram(
     ), vector_normalizer='None') << 'review', LightLda(num_topic=3)])
     y = pipeline.fit_transform(topics)
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up / @@ -2,7 +2,7 @@ @@
     # Example with TextTransform and LogisticRegressionBinaryClassifier
     import pandas
     from nimbusml.feature_extraction.text import NGramFeaturizer
-    from nimbusml.internal.entrypoints._ngramextractor_ngram import n_gram
+    from nimbusml.feature_extraction.text.extractor import Ngram
     from nimbusml.linear_model import LogisticRegressionBinaryClassifier
     train_reviews = pandas.DataFrame(
@@ Expand Down Expand Up / @@ -77,7 +77,7 @@ @@
     y = train_reviews['like']
     X = train_reviews.loc[:, train_reviews.columns != 'like']
-    ngram = NGramFeaturizer(word_feature_extractor=n_gram()) << 'review'
+    ngram = NGramFeaturizer(word_feature_extractor=Ngram()) << 'review'
     X = ngram.fit_transform(X)
     # view the transformed numerical values and column names
@@ Expand Down @@

-Original file line number
+Diff line change
@@ Expand Up / @@ -2254,7 +2254,6 @@ def test( @@
         def transform(
                 self,
                 X,
-                y=None,
                 verbose=0,
                 as_binary_data_stream=False,
                 **params):
@@ Expand All / @@ -2275,18 +2274,7 @@ def transform( @@
                     "Model is not fitted. Train or load a model before test("
                     ").")
-            if y is not None:
-                if len(self.steps) > 0:
-                    last_node = self.last_node
-                    if last_node.type == 'transform':
-                        raise ValueError(
-                            "Pipeline needs a trainer as last step for test()")
-            X, y_temp, columns_renamed, feature_columns, label_column, \
-                schema, weights, weight_column = self._preprocess_X_y(X, y)
-            if not isinstance(y, (str, tuple)):
-                y = y_temp
+            X, _, _, _, _, schema, _, _ = self._preprocess_X_y(X)
             all_nodes = []
@@ Expand Down @@

Provide feedback

-Original file line number
+Diff line change
@@ -0,0 +1,26 @@
+    # --------------------------------------------------------------------------------------------
+    # Copyright (c) Microsoft Corporation. All rights reserved.
+    # Licensed under the MIT License.
+    # --------------------------------------------------------------------------------------------
+    import unittest
+    import pandas
+    from nimbusml import Pipeline, FileDataStream
+    from nimbusml.datasets import get_dataset
+    from nimbusml.feature_extraction.text import NGramFeaturizer
+    path = get_dataset("wiki_detox_train").as_filepath()
+    data = FileDataStream.read_csv(path, sep='\t')
+    df = data.to_df().head()
+    X = df['SentimentText']
+    class TestPipelineTransformMethod(unittest.TestCase):
+        def test_transform_only_pipeline_transform_method(self):
+            p = Pipeline([NGramFeaturizer(char_feature_extractor=None) << 'SentimentText'])
+            p.fit(X)
+            xf = p.transform(X)
+            assert 'SentimentText.==rude==' in xf.columns
+    if __name__ == '__main__':
+        unittest.main()