Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/Microsoft.ML.Transforms/Text/TextCatalog.cs
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,7 @@ public static CustomStopWordsRemovingEstimator RemoveStopWords(this TransformsCa
=> new CustomStopWordsRemovingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnName, stopwords);

/// <summary>
/// Create a <see cref="WordHashBagEstimator"/>, which maps the column specified in <paramref name="inputColumnName"/>
/// Create a <see cref="WordBagEstimator"/>, which maps the column specified in <paramref name="inputColumnName"/>
/// to a vector of n-gram counts in a new column named <paramref name="outputColumnName"/>.
/// </summary>
/// <remarks>
Expand Down Expand Up @@ -363,7 +363,7 @@ public static WordBagEstimator ProduceWordBags(this TransformsCatalog.TextTransf
outputColumnName, inputColumnName, ngramLength, skipLength, useAllLengths, maximumNgramsCount, weighting);

/// <summary>
/// Create a <see cref="WordHashBagEstimator"/>, which maps the multiple columns specified in <paramref name="inputColumnNames"/>
/// Create a <see cref="WordBagEstimator"/>, which maps the multiple columns specified in <paramref name="inputColumnNames"/>
/// to a vector of n-gram counts in a new column named <paramref name="outputColumnName"/>.
/// </summary>
/// <remarks>
Expand Down
8 changes: 4 additions & 4 deletions src/Microsoft.ML.Transforms/Text/WordTokenizing.cs
Original file line number Diff line number Diff line change
Expand Up @@ -415,10 +415,10 @@ public void SaveAsOnnx(OnnxContext ctx)
string[] separators = column.SeparatorsArray.Select(c => c.ToString()).ToArray();
tokenizerNode.AddAttribute("separators", separators);

opType = "Squeeze";
var squeezeOutput = ctx.AddIntermediateVariable(_type, column.Name);
var squeezeNode = ctx.CreateNode(opType, intermediateVar, squeezeOutput, ctx.GetNodeName(opType), "");
squeezeNode.AddAttribute("axes", new long[] { 1 });
opType = "Reshape";
var shape = ctx.AddInitializer(new long[] { 1, -1 }, new long[] { 2 }, "Shape");
var reshapeOutput = ctx.AddIntermediateVariable(new VectorDataViewType(TextDataViewType.Instance, 1), column.Name);
var reshapeNode = ctx.CreateNode(opType, new[] { intermediateVar, shape }, new[] { reshapeOutput }, ctx.GetNodeName(opType), "");
}
}
}
Expand Down
21 changes: 12 additions & 9 deletions test/Microsoft.ML.Tests/OnnxConversionTest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1310,11 +1310,11 @@ public void NgramOnnxConversionTest(
IEstimator<ITransformer>[] pipelines =
{
mlContext.Transforms.Text.TokenizeIntoWords("Tokens", "Text", new[] { ' ' })
.Append(mlContext.Transforms.Conversion.MapValueToKey("Tokens"))
.Append(mlContext.Transforms.Text.ProduceNgrams("NGrams", "Tokens",
ngramLength: ngramLength,
useAllLengths: useAllLength,
weighting: weighting)),
.Append(mlContext.Transforms.Conversion.MapValueToKey("Tokens"))

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any particular reason the formatting was changed? Usually there is a small offset to indicate line continuation.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, I noticed that line 1321 was not aligned with the rest. I'll add the offsets back, though, and just align that line.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: It looks like the offset for line 1313 is still wrong. It seems to be left-aligned with the previous line.

.Append(mlContext.Transforms.Text.ProduceNgrams("NGrams", "Tokens",
ngramLength: ngramLength,
useAllLengths: useAllLength,
weighting: weighting)),

mlContext.Transforms.Text.TokenizeIntoCharactersAsKeys("Tokens", "Text")
.Append(mlContext.Transforms.Text.ProduceNgrams("NGrams", "Tokens",
Expand All @@ -1323,9 +1323,12 @@ public void NgramOnnxConversionTest(
weighting: weighting)),

mlContext.Transforms.Text.ProduceWordBags("Tokens", "Text",
ngramLength: ngramLength,
useAllLengths: useAllLength,
weighting: weighting)
ngramLength: ngramLength,
useAllLengths: useAllLength,
weighting: weighting),

mlContext.Transforms.Text.TokenizeIntoWords("Tokens0", "Text")
.Append(mlContext.Transforms.Text.ProduceWordBags("Tokens", "Tokens0"))
};

for (int i = 0; i < pipelines.Length; i++)
Expand All @@ -1346,7 +1349,7 @@ public void NgramOnnxConversionTest(
var onnxEstimator = mlContext.Transforms.ApplyOnnxModel(onnxFilePath, gpuDeviceId: _gpuDeviceId, fallbackToCpu: _fallbackToCpu);
var onnxTransformer = onnxEstimator.Fit(dataView);
var onnxResult = onnxTransformer.Transform(dataView);
var columnName = i == pipelines.Length - 1 ? "Tokens" : "NGrams";
var columnName = i >= pipelines.Length - 2 ? "Tokens" : "NGrams";
CompareResults(columnName, columnName, transformedData, onnxResult, 3);

VBuffer<ReadOnlyMemory<char>> mlNetSlots = default;
Expand Down