From 2211823059ccde3cd77270eed6d9619cb110918a Mon Sep 17 00:00:00 2001
From: Antonio Velazquez <anvelazq@microsoft.com>
Date: Thu, 12 Sep 2019 15:29:20 -0700
Subject: [PATCH 1/3] Fixed issue with TextFeaturizer when no inputColumnName
 is provided. Added Tests. Fixed a minor mistake in documentation.

---
 .../Text/TextCatalog.cs                       |  4 +-
 .../Transformers/TextFeaturizerTests.cs       | 90 +++++++++++++++++++
 2 files changed, 92 insertions(+), 2 deletions(-)
diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs
index 637a53f444..91615cf687 100644
--- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs
+++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs
@@ -39,7 +39,7 @@ public static TextFeaturizingEstimator FeaturizeText(this TransformsCatalog.Text
                 outputColumnName, inputColumnName);
 
         /// <summary>
-        ///  Create a <see cref="TextFeaturizingEstimator"/>, which transforms a text column into featurized float array that represents normalized counts of n-grams and char-grams.
+        ///  Create a <see cref="TextFeaturizingEstimator"/>, which transforms a text column into a featurized vector of <see cref="System.Single"/> that represents normalized counts of n-grams and char-grams.
         /// </summary>
         /// <remarks>This transform can operate over several columns.</remarks>
         /// <param name="catalog">The text-related transform's catalog.</param>
@@ -62,7 +62,7 @@ public static TextFeaturizingEstimator FeaturizeText(this TransformsCatalog.Text
             TextFeaturizingEstimator.Options options,
             params string[] inputColumnNames)
             => new TextFeaturizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(),
-                outputColumnName, inputColumnNames, options);
+                outputColumnName, inputColumnNames == null || inputColumnNames.Length == 0 ? new[] { outputColumnName } : inputColumnNames, options);
 
         /// <summary>
         /// Create a <see cref="TokenizingByCharactersEstimator"/>, which tokenizes by splitting text into sequences of characters
diff --git a/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs b/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs
index c648937e9c..14583a9660 100644
--- a/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs
+++ b/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs
@@ -33,6 +33,12 @@ private class TestClass
             public float[] Features = null;
         }
 
+        private class TestClass2
+        {
+            public string Features;
+            public string[] OutputTokens;
+        }
+
         [Fact]
         public void TextFeaturizerWithPredefinedStopWordRemoverTest()
         {
@@ -80,6 +86,90 @@ public void TextFeaturizerWithWordFeatureExtractorTest()
             Assert.Equal(expected, prediction.Features);
         }
 
+        [Fact]
+        public void TextFeaturizerWithWordFeatureExtractorTestWithNullInputNames()
+        {
+            var data = new[] { new TestClass2() { Features = "This is some text in english", OutputTokens=null},
+                               new TestClass2() { Features = "This is another example", OutputTokens=null } };
+            var dataView = ML.Data.LoadFromEnumerable(data);
+
+            var options = new TextFeaturizingEstimator.Options()
+            {
+                WordFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 1 },
+                CharFeatureExtractor = null,
+                Norm = TextFeaturizingEstimator.NormFunction.None,
+                OutputTokensColumnName = "OutputTokens"
+            };
+
+            var pipeline = ML.Transforms.Text.FeaturizeText("Features", options, null);
+            dataView = pipeline.Fit(dataView).Transform(dataView);
+
+            VBuffer<float> features = default;
+            float[][] transformed = { null, null };
+
+            var expected = new float[][] {
+                new float[] { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f },
+                new float[] { 1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 1.0f }
+            };
+
+            using (var cursor = dataView.GetRowCursor(dataView.Schema))
+            {
+                var i = 0;
+                while (cursor.MoveNext())
+                {
+                    var featureGetter = cursor.GetGetter<VBuffer<float>>(cursor.Schema["Features"]);
+                    featureGetter(ref features);
+                    transformed[i] = features.DenseValues().ToArray();
+                    i++;
+                }
+            }
+
+            Assert.Equal(expected[0], transformed[0]);
+            Assert.Equal(expected[1], transformed[1]);
+        }
+
+        [Fact]
+        public void TextFeaturizerWithWordFeatureExtractorTestWithNoInputNames()
+        {
+            var data = new[] { new TestClass2() { Features = "This is some text in english", OutputTokens=null},
+                               new TestClass2() { Features = "This is another example", OutputTokens=null } };
+            var dataView = ML.Data.LoadFromEnumerable(data);
+
+            var options = new TextFeaturizingEstimator.Options()
+            {
+                WordFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 1 },
+                CharFeatureExtractor = null,
+                Norm = TextFeaturizingEstimator.NormFunction.None,
+                OutputTokensColumnName = "OutputTokens"
+            };
+
+            var pipeline = ML.Transforms.Text.FeaturizeText("Features", options);
+            dataView = pipeline.Fit(dataView).Transform(dataView);
+
+            VBuffer<float> features = default;
+            float[][] transformed = { null, null };
+
+            var expected = new float[][] {
+                new float[] { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f },
+                new float[] { 1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 1.0f }
+            };
+
+            using (var cursor = dataView.GetRowCursor(dataView.Schema))
+            {
+                var i = 0;
+                while (cursor.MoveNext())
+                {
+                    var featureGetter = cursor.GetGetter<VBuffer<float>>(cursor.Schema["Features"]);
+                    featureGetter(ref features);
+                    transformed[i] = features.DenseValues().ToArray();
+                    i++;
+                }
+            }
+
+            Assert.Equal(expected[0], transformed[0]);
+            Assert.Equal(expected[1], transformed[1]);
+        }
+
         [Fact]
         public void TextFeaturizerWithCharFeatureExtractorTest()
         {

From 05deae1bffd19c65b2aa56883349405cda39ecbf Mon Sep 17 00:00:00 2001
From: Antonio Velazquez <anvelazq@microsoft.com>
Date: Thu, 12 Sep 2019 16:41:54 -0700
Subject: [PATCH 2/3] Added case when inputColumnName is an empty string

---
 .../Text/TextCatalog.cs                       |  5 ++-
 .../Transformers/TextFeaturizerTests.cs       | 43 +++++++++++++++++++
 2 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs
index 91615cf687..cfe9e31bd9 100644
--- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs
+++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs
@@ -62,7 +62,10 @@ public static TextFeaturizingEstimator FeaturizeText(this TransformsCatalog.Text
             TextFeaturizingEstimator.Options options,
             params string[] inputColumnNames)
             => new TextFeaturizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(),
-                outputColumnName, inputColumnNames == null || inputColumnNames.Length == 0 ? new[] { outputColumnName } : inputColumnNames, options);
+                outputColumnName,
+                inputColumnNames == null || inputColumnNames.Length == 0 || (inputColumnNames.Length == 1 && inputColumnNames[0] == "")  ?
+                    new[] { outputColumnName } : inputColumnNames,
+                options);
 
         /// <summary>
         /// Create a <see cref="TokenizingByCharactersEstimator"/>, which tokenizes by splitting text into sequences of characters
diff --git a/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs b/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs
index 14583a9660..cbe172cd9c 100644
--- a/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs
+++ b/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs
@@ -128,6 +128,49 @@ public void TextFeaturizerWithWordFeatureExtractorTestWithNullInputNames()
             Assert.Equal(expected[1], transformed[1]);
         }
 
+        [Fact]
+        public void TextFeaturizerWithWordFeatureExtractorTestWithEmptyInputName()
+        {
+            var data = new[] { new TestClass2() { Features = "This is some text in english", OutputTokens=null},
+                               new TestClass2() { Features = "This is another example", OutputTokens=null } };
+            var dataView = ML.Data.LoadFromEnumerable(data);
+
+            var options = new TextFeaturizingEstimator.Options()
+            {
+                WordFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 1 },
+                CharFeatureExtractor = null,
+                Norm = TextFeaturizingEstimator.NormFunction.None,
+                OutputTokensColumnName = "OutputTokens"
+            };
+
+            var pipeline = ML.Transforms.Text.FeaturizeText("Features", options, "");
+            dataView = pipeline.Fit(dataView).Transform(dataView);
+
+            VBuffer<float> features = default;
+            float[][] transformed = { null, null };
+
+            var expected = new float[][] {
+                new float[] { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f },
+                new float[] { 1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 1.0f }
+            };
+
+            using (var cursor = dataView.GetRowCursor(dataView.Schema))
+            {
+                var i = 0;
+                while (cursor.MoveNext())
+                {
+                    var featureGetter = cursor.GetGetter<VBuffer<float>>(cursor.Schema["Features"]);
+                    featureGetter(ref features);
+                    transformed[i] = features.DenseValues().ToArray();
+                    i++;
+                }
+            }
+
+            Assert.Equal(expected[0], transformed[0]);
+            Assert.Equal(expected[1], transformed[1]);
+        }
+
+
         [Fact]
         public void TextFeaturizerWithWordFeatureExtractorTestWithNoInputNames()
         {

From eb1433cf0159aeb24890fa7f1bc7098816131de8 Mon Sep 17 00:00:00 2001
From: Antonio Velazquez <anvelazq@microsoft.com>
Date: Wed, 18 Sep 2019 09:31:38 -0700
Subject: [PATCH 3/3] Removed case when inputColumnNames is an empty string.
 Simplified check of inputColumnNames using Utils.Size()

---
 .../Text/TextCatalog.cs                       |  5 +--
 .../Transformers/TextFeaturizerTests.cs       | 45 +------------------
 2 files changed, 3 insertions(+), 47 deletions(-)

diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs
index cfe9e31bd9..7b4b554b7c 100644
--- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs
+++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs
@@ -3,6 +3,7 @@
 // See the LICENSE file in the project root for more information.
 
 using Microsoft.ML.Data;
+using Microsoft.ML.Internal.Utilities;
 using Microsoft.ML.Runtime;
 using Microsoft.ML.Transforms.Text;
 
@@ -62,9 +63,7 @@ public static TextFeaturizingEstimator FeaturizeText(this TransformsCatalog.Text
             TextFeaturizingEstimator.Options options,
             params string[] inputColumnNames)
             => new TextFeaturizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(),
-                outputColumnName,
-                inputColumnNames == null || inputColumnNames.Length == 0 || (inputColumnNames.Length == 1 && inputColumnNames[0] == "")  ?
-                    new[] { outputColumnName } : inputColumnNames,
+                outputColumnName, Utils.Size(inputColumnNames) == 0  ? new[] { outputColumnName } : inputColumnNames,
                 options);
 
         /// <summary>
diff --git a/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs b/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs
index cbe172cd9c..24d2c671f6 100644
--- a/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs
+++ b/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs
@@ -87,7 +87,7 @@ public void TextFeaturizerWithWordFeatureExtractorTest()
         }
 
         [Fact]
-        public void TextFeaturizerWithWordFeatureExtractorTestWithNullInputNames()
+        public void TextFeaturizerWithWordFeatureExtractorWithNullInputNamesTest()
         {
             var data = new[] { new TestClass2() { Features = "This is some text in english", OutputTokens=null},
                                new TestClass2() { Features = "This is another example", OutputTokens=null } };
@@ -128,49 +128,6 @@ public void TextFeaturizerWithWordFeatureExtractorTestWithNullInputNames()
             Assert.Equal(expected[1], transformed[1]);
         }
 
-        [Fact]
-        public void TextFeaturizerWithWordFeatureExtractorTestWithEmptyInputName()
-        {
-            var data = new[] { new TestClass2() { Features = "This is some text in english", OutputTokens=null},
-                               new TestClass2() { Features = "This is another example", OutputTokens=null } };
-            var dataView = ML.Data.LoadFromEnumerable(data);
-
-            var options = new TextFeaturizingEstimator.Options()
-            {
-                WordFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 1 },
-                CharFeatureExtractor = null,
-                Norm = TextFeaturizingEstimator.NormFunction.None,
-                OutputTokensColumnName = "OutputTokens"
-            };
-
-            var pipeline = ML.Transforms.Text.FeaturizeText("Features", options, "");
-            dataView = pipeline.Fit(dataView).Transform(dataView);
-
-            VBuffer<float> features = default;
-            float[][] transformed = { null, null };
-
-            var expected = new float[][] {
-                new float[] { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f },
-                new float[] { 1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 1.0f }
-            };
-
-            using (var cursor = dataView.GetRowCursor(dataView.Schema))
-            {
-                var i = 0;
-                while (cursor.MoveNext())
-                {
-                    var featureGetter = cursor.GetGetter<VBuffer<float>>(cursor.Schema["Features"]);
-                    featureGetter(ref features);
-                    transformed[i] = features.DenseValues().ToArray();
-                    i++;
-                }
-            }
-
-            Assert.Equal(expected[0], transformed[0]);
-            Assert.Equal(expected[1], transformed[1]);
-        }
-
-
         [Fact]
         public void TextFeaturizerWithWordFeatureExtractorTestWithNoInputNames()
         {