Skip to content
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 12 additions & 3 deletions src/Microsoft.ML.Core/Utilities/NormStr.cs
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ public NormStr Get(string str, bool add = false)
return add ? AddCore(str.AsMemory(), hash) : null;
}

public NormStr Get(ReadOnlyMemory<char> str, bool add = false)
public NormStr Get(ReadOnlyMemory<char> str, bool add = false, bool duplicateStr = true)
{
AssertValid();

Expand All @@ -136,6 +136,15 @@ public NormStr Get(ReadOnlyMemory<char> str, bool add = false)
}
Contracts.Assert(ins == -1);

if (duplicateStr)
{
// To avoid the case where 'str' actually stores a string with the
// content of a whole row in the dataset, a new 'str' is created
// See issue https://github.com/dotnet/machinelearning/issues/4571
// and PR https://github.com/dotnet/machinelearning/pull/4576
return add ? AddCore(str.ToString().AsMemory(), hash) : null;
}

return add ? AddCore(str, hash) : null;
}

Expand All @@ -147,9 +156,9 @@ public NormStr Add(string str)
return Get(str, true);
}

public NormStr Add(ReadOnlyMemory<char> str)
public NormStr Add(ReadOnlyMemory<char> str, bool duplicateStr = true)
{
return Get(str, true);
return Get(str, true, duplicateStr);
}

/// <summary>
Expand Down
139 changes: 139 additions & 0 deletions test/Microsoft.ML.Benchmarks/FeaturizeTextBench.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
// Licensed to the .NET Foundation under one or more agreements.

@antoniovs1029 antoniovs1029 Dec 16, 2019

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@sharwell Please, review this benchmark test and tell me if this was what you had in mind when you requested to add one

// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.IO;
using System.Collections.Generic;
using System.Linq;
using Microsoft.ML.Data;
using BenchmarkDotNet.Attributes;
using Microsoft.ML.Transforms.Text;
using Xunit;

namespace Microsoft.ML.Benchmarks
{
[Config(typeof(TrainConfig))]
public class FeaturizeTextBench
{
private MLContext mlContext;
private IDataView dataset;
private static int numColumns = 1000;
private static int numRows = 300;
private static int maxWordLength = 15;

[GlobalSetup]
public void SetupData()
{
Path.GetTempFileName();
mlContext = new MLContext(seed: 1);
var path = Path.GetTempFileName();
Console.WriteLine($"Created dataset in temporary file:\n{path}\n");
path = CreateRandomFile(path);

var columns = new List<TextLoader.Column>();
for(int i = 0; i < numColumns; i++)
{
columns.Add(new TextLoader.Column($"Column{i}", DataKind.String, i));
}

var textLoader = mlContext.Data.CreateTextLoader(new TextLoader.Options()
{
Columns = columns.ToArray(),
HasHeader = false,
Separators = new char[] { ',' }
});

dataset = textLoader.Load(path);
}

[Benchmark]
public ITransformer TrainFeaturizeText()

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

💡 Consider adding the following:

  • A benchmark that sets up the cache, and then requests the same string from it a few times using both a string and a ReadOnlyMemory<char> that is a substring of another longer string. Both cases should reveal no allocations for the lookup.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

💡 Can you include a sample output from this test as a comment in the pull request, just so we have a point-in-time reference?

@antoniovs1029 antoniovs1029 Dec 17, 2019

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure what the sample output should look like, since in this sample I am actually not generating an output, only a transformer that featurizes the text.

I could add a step where I actually use the transformer to transform some input data (perhaps a single row of the dataset). But then again, the output of the featurizer would be a big vector of floats representing random strings.

So I wouldn't think it's worth it to do this.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

❔ Is there any meaningful way to vary the "size" of this test, such that we can see how increases to the size impact the performance?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, varying the number of rows generated for the dataset is the first thing I can think of. Would you recommend adding more benchmarks for different sizes?

{
var textColumns = new List<string>();
for (int i = 0; i < 20; i++) // Only load first 20 columns
{
textColumns.Add($"Column{i}");
}

var featurizers = new List<TextFeaturizingEstimator>();
foreach (var textColumn in textColumns)
{
var featurizer = mlContext.Transforms.Text.FeaturizeText(textColumn, new TextFeaturizingEstimator.Options()
{
CharFeatureExtractor = null,
WordFeatureExtractor = new WordBagEstimator.Options()
{
NgramLength = 2,
MaximumNgramsCount = new int[] { 200000 }
}
});
featurizers.Add(featurizer);
}

IEstimator<ITransformer> pipeline = featurizers.First();
foreach (var featurizer in featurizers.Skip(1))
{
pipeline = pipeline.Append(featurizer);
}

var model = pipeline.Fit(dataset);
var memoryUsage = GC.GetTotalMemory(true);
Console.WriteLine($"Memory Used: {memoryUsage/1000000:0,0.00}MB");
Assert.True(memoryUsage < 240000000, $"This benchmark should use less than 240MB of memory, but it's using {memoryUsage/1000000:0,0.00}MB"); // Memory usage should be less than 1GB after PR https://github.com/dotnet/machinelearning/pull/4576

return model;
}

public static string CreateRandomFile(string path)
{
// Create file with random strings
// to use as dataset of the benchmark

Random random = new Random(1);

using (StreamWriter file = new StreamWriter(path))
{
for(int i = 0; i < numRows; i++)
file.WriteLine(CreateRandomLine(numColumns, random));
}
return path;
}

public static string CreateRandomLine(int columns, Random random)
{
var lineSB = new System.Text.StringBuilder();
for(int i = 0; i < columns; i++)
{
lineSB.Append(CreateRandomColumn(random, random.Next(100)));
lineSB.Append(",");
}
return lineSB.ToString();
}

public static string CreateRandomColumn(Random random, int numwords)
{
const string characters =
"01234567890" +
"abcdefghijklmnopqrstuvwxyz" +
"ABCDEFGHIJKLMNOPQRSTUVWXYZ";

var columnSB = new System.Text.StringBuilder();
int wordLength;

for(int i = 0; i < numwords; i++)
{
wordLength = random.Next(1, maxWordLength);
for(int j = 0; j < wordLength; j++)
columnSB.Append(characters[random.Next(characters.Length)]);

columnSB.Append(" ");
}

if (random.Next(2) == 0) // sometimes return the column as lowercase
return columnSB.ToString().ToLower();

return columnSB.ToString();
}
}
}