From 2211823059ccde3cd77270eed6d9619cb110918a Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Thu, 12 Sep 2019 15:29:20 -0700 Subject: [PATCH 1/3] Fixed issue with TextFeaturizer when no inputColumnName is provided. Added Tests. Fixed a minor mistake in documentation. --- .../Text/TextCatalog.cs | 4 +- .../Transformers/TextFeaturizerTests.cs | 90 +++++++++++++++++++ 2 files changed, 92 insertions(+), 2 deletions(-) diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs index 637a53f444..91615cf687 100644 --- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs +++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs @@ -39,7 +39,7 @@ public static TextFeaturizingEstimator FeaturizeText(this TransformsCatalog.Text outputColumnName, inputColumnName); /// - /// Create a , which transforms a text column into featurized float array that represents normalized counts of n-grams and char-grams. + /// Create a , which transforms a text column into featurized vector of that represents normalized counts of n-grams and char-grams. /// /// This transform can operate over several columns. /// The text-related transform's catalog. @@ -62,7 +62,7 @@ public static TextFeaturizingEstimator FeaturizeText(this TransformsCatalog.Text TextFeaturizingEstimator.Options options, params string[] inputColumnNames) => new TextFeaturizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), - outputColumnName, inputColumnNames, options); + outputColumnName, inputColumnNames == null || inputColumnNames.Length == 0 ? new[] { outputColumnName } : inputColumnNames, options); /// /// Create a , which tokenizes by splitting text into sequences of characters diff --git a/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs b/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs index c648937e9c..14583a9660 100644 --- a/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs @@ -33,6 +33,12 @@ private class TestClass public float[] Features = null; } + private class TestClass2 + { + public string Features; + public string[] OutputTokens; + } + [Fact] public void TextFeaturizerWithPredefinedStopWordRemoverTest() { @@ -80,6 +86,90 @@ public void TextFeaturizerWithWordFeatureExtractorTest() Assert.Equal(expected, prediction.Features); } + [Fact] + public void TextFeaturizerWithWordFeatureExtractorTestWithNullInputNames() + { + var data = new[] { new TestClass2() { Features = "This is some text in english", OutputTokens=null}, + new TestClass2() { Features = "This is another example", OutputTokens=null } }; + var dataView = ML.Data.LoadFromEnumerable(data); + + var options = new TextFeaturizingEstimator.Options() + { + WordFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 1 }, + CharFeatureExtractor = null, + Norm = TextFeaturizingEstimator.NormFunction.None, + OutputTokensColumnName = "OutputTokens" + }; + + var pipeline = ML.Transforms.Text.FeaturizeText("Features", options, null); + dataView = pipeline.Fit(dataView).Transform(dataView); + + VBuffer features = default; + float[][] transformed = { null, null }; + + var expected = new float[][] { + new float[] { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f }, + new float[] { 1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 1.0f } + }; + + using (var cursor = dataView.GetRowCursor(dataView.Schema)) + { + var i = 0; + while (cursor.MoveNext()) + { + var featureGetter = cursor.GetGetter>(cursor.Schema["Features"]); + featureGetter(ref features); + transformed[i] = features.DenseValues().ToArray(); + i++; + } + } + + Assert.Equal(expected[0], transformed[0]); + Assert.Equal(expected[1], transformed[1]); + } + + [Fact] + public void TextFeaturizerWithWordFeatureExtractorTestWithNoInputNames() + { + var data = new[] { new TestClass2() { Features = "This is some text in english", OutputTokens=null}, + new TestClass2() { Features = "This is another example", OutputTokens=null } }; + var dataView = ML.Data.LoadFromEnumerable(data); + + var options = new TextFeaturizingEstimator.Options() + { + WordFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 1 }, + CharFeatureExtractor = null, + Norm = TextFeaturizingEstimator.NormFunction.None, + OutputTokensColumnName = "OutputTokens" + }; + + var pipeline = ML.Transforms.Text.FeaturizeText("Features", options); + dataView = pipeline.Fit(dataView).Transform(dataView); + + VBuffer features = default; + float[][] transformed = { null, null }; + + var expected = new float[][] { + new float[] { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f }, + new float[] { 1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 1.0f } + }; + + using (var cursor = dataView.GetRowCursor(dataView.Schema)) + { + var i = 0; + while (cursor.MoveNext()) + { + var featureGetter = cursor.GetGetter>(cursor.Schema["Features"]); + featureGetter(ref features); + transformed[i] = features.DenseValues().ToArray(); + i++; + } + } + + Assert.Equal(expected[0], transformed[0]); + Assert.Equal(expected[1], transformed[1]); + } + [Fact] public void TextFeaturizerWithCharFeatureExtractorTest() { From 05deae1bffd19c65b2aa56883349405cda39ecbf Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Thu, 12 Sep 2019 16:41:54 -0700 Subject: [PATCH 2/3] Added case when inputColumnName is an empty string --- .../Text/TextCatalog.cs | 5 ++- .../Transformers/TextFeaturizerTests.cs | 43 +++++++++++++++++++ 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs index 91615cf687..cfe9e31bd9 100644 --- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs +++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs @@ -62,7 +62,10 @@ public static TextFeaturizingEstimator FeaturizeText(this TransformsCatalog.Text TextFeaturizingEstimator.Options options, params string[] inputColumnNames) => new TextFeaturizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), - outputColumnName, inputColumnNames == null || inputColumnNames.Length == 0 ? new[] { outputColumnName } : inputColumnNames, options); + outputColumnName, + inputColumnNames == null || inputColumnNames.Length == 0 || (inputColumnNames.Length == 1 && inputColumnNames[0] == "") ? + new[] { outputColumnName } : inputColumnNames, + options); /// /// Create a , which tokenizes by splitting text into sequences of characters diff --git a/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs b/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs index 14583a9660..cbe172cd9c 100644 --- a/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs @@ -128,6 +128,49 @@ public void TextFeaturizerWithWordFeatureExtractorTestWithNullInputNames() Assert.Equal(expected[1], transformed[1]); } + [Fact] + public void TextFeaturizerWithWordFeatureExtractorTestWithEmptyInputName() + { + var data = new[] { new TestClass2() { Features = "This is some text in english", OutputTokens=null}, + new TestClass2() { Features = "This is another example", OutputTokens=null } }; + var dataView = ML.Data.LoadFromEnumerable(data); + + var options = new TextFeaturizingEstimator.Options() + { + WordFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 1 }, + CharFeatureExtractor = null, + Norm = TextFeaturizingEstimator.NormFunction.None, + OutputTokensColumnName = "OutputTokens" + }; + + var pipeline = ML.Transforms.Text.FeaturizeText("Features", options, ""); + dataView = pipeline.Fit(dataView).Transform(dataView); + + VBuffer features = default; + float[][] transformed = { null, null }; + + var expected = new float[][] { + new float[] { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f }, + new float[] { 1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 1.0f } + }; + + using (var cursor = dataView.GetRowCursor(dataView.Schema)) + { + var i = 0; + while (cursor.MoveNext()) + { + var featureGetter = cursor.GetGetter>(cursor.Schema["Features"]); + featureGetter(ref features); + transformed[i] = features.DenseValues().ToArray(); + i++; + } + } + + Assert.Equal(expected[0], transformed[0]); + Assert.Equal(expected[1], transformed[1]); + } + + [Fact] public void TextFeaturizerWithWordFeatureExtractorTestWithNoInputNames() { From eb1433cf0159aeb24890fa7f1bc7098816131de8 Mon Sep 17 00:00:00 2001 From: Antonio Velazquez Date: Wed, 18 Sep 2019 09:31:38 -0700 Subject: [PATCH 3/3] Removed case when inputColumnNames is an empty string. Simplified check of inputColumnNames using Utils.Size() --- .../Text/TextCatalog.cs | 5 +-- .../Transformers/TextFeaturizerTests.cs | 45 +------------------ 2 files changed, 3 insertions(+), 47 deletions(-) diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs index cfe9e31bd9..7b4b554b7c 100644 --- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs +++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs @@ -3,6 +3,7 @@ // See the LICENSE file in the project root for more information. using Microsoft.ML.Data; +using Microsoft.ML.Internal.Utilities; using Microsoft.ML.Runtime; using Microsoft.ML.Transforms.Text; @@ -62,9 +63,7 @@ public static TextFeaturizingEstimator FeaturizeText(this TransformsCatalog.Text TextFeaturizingEstimator.Options options, params string[] inputColumnNames) => new TextFeaturizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), - outputColumnName, - inputColumnNames == null || inputColumnNames.Length == 0 || (inputColumnNames.Length == 1 && inputColumnNames[0] == "") ? - new[] { outputColumnName } : inputColumnNames, + outputColumnName, Utils.Size(inputColumnNames) == 0 ? new[] { outputColumnName } : inputColumnNames, options); /// diff --git a/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs b/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs index cbe172cd9c..24d2c671f6 100644 --- a/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs @@ -87,7 +87,7 @@ public void TextFeaturizerWithWordFeatureExtractorTest() } [Fact] - public void TextFeaturizerWithWordFeatureExtractorTestWithNullInputNames() + public void TextFeaturizerWithWordFeatureExtractorWithNullInputNamesTest() { var data = new[] { new TestClass2() { Features = "This is some text in english", OutputTokens=null}, new TestClass2() { Features = "This is another example", OutputTokens=null } }; @@ -128,49 +128,6 @@ public void TextFeaturizerWithWordFeatureExtractorTestWithNullInputNames() Assert.Equal(expected[1], transformed[1]); } - [Fact] - public void TextFeaturizerWithWordFeatureExtractorTestWithEmptyInputName() - { - var data = new[] { new TestClass2() { Features = "This is some text in english", OutputTokens=null}, - new TestClass2() { Features = "This is another example", OutputTokens=null } }; - var dataView = ML.Data.LoadFromEnumerable(data); - - var options = new TextFeaturizingEstimator.Options() - { - WordFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 1 }, - CharFeatureExtractor = null, - Norm = TextFeaturizingEstimator.NormFunction.None, - OutputTokensColumnName = "OutputTokens" - }; - - var pipeline = ML.Transforms.Text.FeaturizeText("Features", options, ""); - dataView = pipeline.Fit(dataView).Transform(dataView); - - VBuffer features = default; - float[][] transformed = { null, null }; - - var expected = new float[][] { - new float[] { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f, 0.0f }, - new float[] { 1.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 1.0f } - }; - - using (var cursor = dataView.GetRowCursor(dataView.Schema)) - { - var i = 0; - while (cursor.MoveNext()) - { - var featureGetter = cursor.GetGetter>(cursor.Schema["Features"]); - featureGetter(ref features); - transformed[i] = features.DenseValues().ToArray(); - i++; - } - } - - Assert.Equal(expected[0], transformed[0]); - Assert.Equal(expected[1], transformed[1]); - } - - [Fact] public void TextFeaturizerWithWordFeatureExtractorTestWithNoInputNames() {