diff --git a/src/Microsoft.ML.OnnxConverter/SaveOnnxCommand.cs b/src/Microsoft.ML.OnnxConverter/SaveOnnxCommand.cs index a2aa961b22..f668d3222f 100644 --- a/src/Microsoft.ML.OnnxConverter/SaveOnnxCommand.cs +++ b/src/Microsoft.ML.OnnxConverter/SaveOnnxCommand.cs @@ -2,8 +2,10 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. +using System; using System.Collections.Generic; using System.IO; +using System.Linq; using Google.Protobuf; using Microsoft.ML; using Microsoft.ML.Command; @@ -188,7 +190,9 @@ internal static ModelProto ConvertTransformListToOnnxModel(OnnxContextImpl ctx, if (outputData.Schema[i].IsHidden) continue; - var idataviewColumnName = outputData.Schema[i].Name; + var column = outputData.Schema[i]; + + var idataviewColumnName = column.Name; // Since the last IDataView also contains columns of the initial IDataView, last IDataView's columns found in // _inputToDrop should be removed too. @@ -204,11 +208,39 @@ internal static ModelProto ConvertTransformListToOnnxModel(OnnxContextImpl ctx, var trueVariableName = ctx.AddIntermediateVariable(null, idataviewColumnName + ".output", true); ctx.CreateNode("Identity", variableName, trueVariableName, ctx.GetNodeName("Identity"), ""); ctx.AddOutputVariable(outputData.Schema[i].Type, trueVariableName); + + if (column.HasSlotNames()) + AddSlotNames(ctx, column); } + // Add metadata graph outputs + return ctx.MakeModel(); } + private static void AddSlotNames(OnnxContextImpl ctx, DataViewSchema.Column column) + { + VBuffer> slotNames = default; + column.GetSlotNames(ref slotNames); + IEnumerable slotNamesAsStrings = slotNames.DenseValues().Select(name => name.ToString()); + + string opType = "LabelEncoder"; + string labelEncoderInputName = $"mlnet.{column.Name}.unusedInput"; + string labelEncoderOutputName = $"mlnet.{column.Name}.unusedOutput"; + string labelEncoderNodeName = $"mlnet.{column.Name}.SlotNames"; + + string[] oneVals = new string[] { "one" }; + long[] dims = new long[] { 1, 1 }; + var one = ctx.AddInitializer(oneVals, dims, labelEncoderNodeName); + + var labelEncoderOutput = ctx.AddIntermediateVariable(NumberDataViewType.Int64, labelEncoderOutputName, true); + var node = ctx.CreateNode(opType, one, labelEncoderOutput, labelEncoderNodeName); + node.AddAttribute("keys_strings", slotNamesAsStrings); + node.AddAttribute("values_int64s", Enumerable.Range(0, slotNames.Length).Select(x => (long)x)); + + ctx.AddOutputVariable(NumberDataViewType.Int64, labelEncoderOutput); + } + private void Run(IChannel ch) { ILegacyDataLoader loader = null; diff --git a/src/Microsoft.ML.OnnxTransformer/OnnxTransform.cs b/src/Microsoft.ML.OnnxTransformer/OnnxTransform.cs index fd213845b4..f57ad26f67 100644 --- a/src/Microsoft.ML.OnnxTransformer/OnnxTransform.cs +++ b/src/Microsoft.ML.OnnxTransformer/OnnxTransform.cs @@ -15,6 +15,7 @@ using Microsoft.ML.OnnxRuntime; using Microsoft.ML.Runtime; using Microsoft.ML.Transforms.Onnx; +using static Microsoft.ML.Model.OnnxConverter.OnnxCSharpToProtoWrapper; using OnnxShape = System.Collections.Generic.List; [assembly: LoadableClass(OnnxTransformer.Summary, typeof(IDataTransform), typeof(OnnxTransformer), @@ -416,11 +417,40 @@ protected override DataViewSchema.DetachedColumn[] GetOutputColumnsCore() { var onnxOutputName = _parent.Outputs[i]; var columnName = onnxOutputName.EndsWith(stdSuffix) ? onnxOutputName.Replace(stdSuffix, "") : onnxOutputName; - info[i] = new DataViewSchema.DetachedColumn(columnName, _parent.OutputTypes[i], null); + + var builder = new DataViewSchema.Annotations.Builder(); + AddSlotNames(columnName, builder); + + info[i] = new DataViewSchema.DetachedColumn(columnName, _parent.OutputTypes[i], builder.ToAnnotations()); } return info; } + private void AddSlotNames(string columnName, DataViewSchema.Annotations.Builder builder) + { + var graph = _parent.Model.Graph; + var nodes = graph.Node; + + var slotNamesNodeName = $"mlnet.{columnName}.SlotNames"; + var slotsNode = nodes.FirstOrDefault(node => node.Name == slotNamesNodeName); + var slotsAttr = slotsNode?.Attribute.FirstOrDefault(attr => attr.Name == "keys_strings"); + if (slotsAttr == null) + return; + + int count = slotsAttr.Strings.Count(); + ValueGetter>> getter = (ref VBuffer> dst) => + { + var dstEditor = VBufferEditor.Create(ref dst, count); + for (int i = 0; i < count; i++) + { + dstEditor.Values[i] = slotsAttr.Strings[i].ToString(Encoding.UTF8).AsMemory(); + } + dst = dstEditor.Commit(); + }; + + builder.AddSlotNames(count, getter); + } + private protected override Func GetDependenciesCore(Func activeOutput) { return col => Enumerable.Range(0, _parent.Outputs.Length).Any(i => activeOutput(i)) && _inputColIndices.Any(i => i == col); diff --git a/src/Microsoft.ML.OnnxTransformer/OnnxUtils.cs b/src/Microsoft.ML.OnnxTransformer/OnnxUtils.cs index 525a8c46fc..e51e1091a0 100644 --- a/src/Microsoft.ML.OnnxTransformer/OnnxUtils.cs +++ b/src/Microsoft.ML.OnnxTransformer/OnnxUtils.cs @@ -11,6 +11,7 @@ using Microsoft.ML.OnnxRuntime; using Microsoft.ML.OnnxRuntime.Tensors; using Microsoft.ML.Runtime; +using static Microsoft.ML.Model.OnnxConverter.OnnxCSharpToProtoWrapper; using OnnxShape = System.Collections.Generic.List; namespace Microsoft.ML.Transforms.Onnx @@ -157,6 +158,8 @@ public OnnxVariableInfo(string name, OnnxShape shape, Type typeInOnnxRuntime, Da /// internal OnnxModelInfo ModelInfo { get; } + internal GraphProto Graph { get; } + /// /// Constructs OnnxModel object from file. /// @@ -217,6 +220,8 @@ public OnnxModel(string modelFile, int? gpuDeviceId = null, bool fallbackToCpu = // Create a view to the used ONNX model from ONNXRuntime's perspective. ModelInfo = new OnnxModelInfo(inputInfos, outputInfos, overrideableInitializers); + + Graph = model.Graph; } private List GetOnnxVariablesFromMetadata(IReadOnlyDictionary nodeMetadata, @@ -233,6 +238,10 @@ private List GetOnnxVariablesFromMetadata(IReadOnlyDictionary< var dataViewType = typePool[name]; var caster = casterPool?[name]; + if (name.StartsWith("mlnet.") && + (name.EndsWith(".unusedInput") || name.EndsWith(".unusedOutput"))) + continue; + OnnxVariableInfo info = null; if (shapeDictionary != null && shapeDictionary.ContainsKey(name)) { diff --git a/src/Microsoft.ML.Transforms/Text/NgramTransform.cs b/src/Microsoft.ML.Transforms/Text/NgramTransform.cs index 298b75a30f..6540deebd3 100644 --- a/src/Microsoft.ML.Transforms/Text/NgramTransform.cs +++ b/src/Microsoft.ML.Transforms/Text/NgramTransform.cs @@ -770,9 +770,6 @@ public void SaveAsOnnx(OnnxContext ctx) private void SaveAsOnnxCore(OnnxContext ctx, int iinfo, string srcVariableName, string dstVariableName) { - VBuffer> slotNames = default; - GetSlotNames(iinfo, 0, ref slotNames); - var transformInfo = _parent._transformInfos[iinfo]; // TfIdfVectorizer accepts strings, int32 and int64 tensors. diff --git a/test/BaselineOutput/Common/Onnx/BinaryClassification/BreastCancer/OneHotBagPipeline.txt b/test/BaselineOutput/Common/Onnx/BinaryClassification/BreastCancer/OneHotBagPipeline.txt index 93550ceda7..cf6eab6e9d 100644 --- a/test/BaselineOutput/Common/Onnx/BinaryClassification/BreastCancer/OneHotBagPipeline.txt +++ b/test/BaselineOutput/Common/Onnx/BinaryClassification/BreastCancer/OneHotBagPipeline.txt @@ -324,8 +324,8 @@ { "name": "target_weights", "floats": [ - 0.50476193, - -0.97911227 + 0.504761934, + -0.979112267 ], "type": "FLOATS" } @@ -428,6 +428,51 @@ "name": "Identity1", "opType": "Identity" }, + { + "input": [ + "mlnet.F2.SlotNames" + ], + "output": [ + "mlnet.F2.unusedOutput" + ], + "name": "mlnet.F2.SlotNames", + "opType": "LabelEncoder", + "attribute": [ + { + "name": "keys_strings", + "strings": [ + "NA==", + "MQ==", + "OA==", + "MTA=", + "Mg==", + "Mw==", + "Nw==", + "NQ==", + "Ng==", + "OQ==" + ], + "type": "STRINGS" + }, + { + "name": "values_int64s", + "ints": [ + "0", + "1", + "2", + "3", + "4", + "5", + "6", + "7", + "8", + "9" + ], + "type": "INTS" + } + ], + "domain": "ai.onnx.ml" + }, { "input": [ "Features" @@ -438,6 +483,53 @@ "name": "Identity2", "opType": "Identity" }, + { + "input": [ + "mlnet.Features.SlotNames" + ], + "output": [ + "mlnet.Features.unusedOutput" + ], + "name": "mlnet.Features.SlotNames", + "opType": "LabelEncoder", + "attribute": [ + { + "name": "keys_strings", + "strings": [ + "RjE=", + "RjIuNA==", + "RjIuMQ==", + "RjIuOA==", + "RjIuMTA=", + "RjIuMg==", + "RjIuMw==", + "RjIuNw==", + "RjIuNQ==", + "RjIuNg==", + "RjIuOQ==" + ], + "type": "STRINGS" + }, + { + "name": "values_int64s", + "ints": [ + "0", + "1", + "2", + "3", + "4", + "5", + "6", + "7", + "8", + "9", + "10" + ], + "type": "INTS" + } + ], + "domain": "ai.onnx.ml" + }, { "input": [ "PredictedLabel" @@ -484,6 +576,28 @@ 0 ], "name": "Offset" + }, + { + "dims": [ + "1", + "1" + ], + "dataType": 8, + "stringData": [ + "b25l" + ], + "name": "mlnet.F2.SlotNames" + }, + { + "dims": [ + "1", + "1" + ], + "dataType": 8, + "stringData": [ + "b25l" + ], + "name": "mlnet.Features.SlotNames" } ], "input": [ @@ -597,6 +711,24 @@ } } }, + { + "name": "mlnet.F2.unusedOutput", + "type": { + "tensorType": { + "elemType": 7, + "shape": { + "dim": [ + { + "dimValue": "-1" + }, + { + "dimValue": "1" + } + ] + } + } + } + }, { "name": "Features.output", "type": { @@ -615,6 +747,24 @@ } } }, + { + "name": "mlnet.Features.unusedOutput", + "type": { + "tensorType": { + "elemType": 7, + "shape": { + "dim": [ + { + "dimValue": "-1" + }, + { + "dimValue": "1" + } + ] + } + } + } + }, { "name": "PredictedLabel.output", "type": { diff --git a/test/BaselineOutput/Common/Onnx/MultiClassClassification/BreastCancer/MultiClassificationLogisticRegressionSaveModelToOnnxTest.txt b/test/BaselineOutput/Common/Onnx/MultiClassClassification/BreastCancer/MultiClassificationLogisticRegressionSaveModelToOnnxTest.txt index ab04e5921f..14057b9746 100644 --- a/test/BaselineOutput/Common/Onnx/MultiClassClassification/BreastCancer/MultiClassificationLogisticRegressionSaveModelToOnnxTest.txt +++ b/test/BaselineOutput/Common/Onnx/MultiClassClassification/BreastCancer/MultiClassificationLogisticRegressionSaveModelToOnnxTest.txt @@ -322,6 +322,51 @@ ], "name": "Identity2", "opType": "Identity" + }, + { + "input": [ + "mlnet.Score.SlotNames" + ], + "output": [ + "mlnet.Score.unusedOutput" + ], + "name": "mlnet.Score.SlotNames", + "opType": "LabelEncoder", + "attribute": [ + { + "name": "keys_strings", + "strings": [ + "NQ==", + "Mw==", + "Ng==", + "NA==", + "OA==", + "MQ==", + "Mg==", + "Nw==", + "MTA=", + "OQ==" + ], + "type": "STRINGS" + }, + { + "name": "values_int64s", + "ints": [ + "0", + "1", + "2", + "3", + "4", + "5", + "6", + "7", + "8", + "9" + ], + "type": "INTS" + } + ], + "domain": "ai.onnx.ml" } ], "name": "model", @@ -336,6 +381,17 @@ "1" ], "name": "ShapeVar" + }, + { + "dims": [ + "1", + "1" + ], + "dataType": 8, + "stringData": [ + "b25l" + ], + "name": "mlnet.Score.SlotNames" } ], "input": [ @@ -448,6 +504,24 @@ } } } + }, + { + "name": "mlnet.Score.unusedOutput", + "type": { + "tensorType": { + "elemType": 7, + "shape": { + "dim": [ + { + "dimValue": "-1" + }, + { + "dimValue": "1" + } + ] + } + } + } } ], "valueInfo": [ diff --git a/test/Microsoft.ML.Tests/OnnxConversionTest.cs b/test/Microsoft.ML.Tests/OnnxConversionTest.cs index 8d842a65fb..cbf24b5858 100644 --- a/test/Microsoft.ML.Tests/OnnxConversionTest.cs +++ b/test/Microsoft.ML.Tests/OnnxConversionTest.cs @@ -1272,8 +1272,10 @@ public void NgramOnnxConversionTest( var onnxModel = mlContext.Model.ConvertToOnnxProtobuf(model, dataView); var onnxFilename = $"Ngram-{i}-{ngramLength}-{useAllLength}-{weighting}.onnx"; + var txtFilename = $"Ngram-{i}-{ngramLength}-{useAllLength}-{weighting}.txt"; var onnxFilePath = GetOutputPath(onnxFilename); - SaveOnnxModel(onnxModel, onnxFilePath, null); + var txtFilePath = GetOutputPath(txtFilename); + SaveOnnxModel(onnxModel, onnxFilePath, txtFilePath); if (IsOnnxRuntimeSupported()) { @@ -1282,6 +1284,16 @@ public void NgramOnnxConversionTest( var onnxResult = onnxTransformer.Transform(dataView); var columnName = i == pipelines.Length - 1 ? "Tokens" : "NGrams"; CompareSelectedColumns(columnName, columnName, transformedData, onnxResult, 3); + + VBuffer> mlNetSlots = default; + VBuffer> onnxSlots= default; + transformedData.Schema[columnName].GetSlotNames(ref mlNetSlots); + onnxResult.Schema[columnName].GetSlotNames(ref onnxSlots); + Assert.Equal(mlNetSlots.Length, onnxSlots.Length); + var mlNetSlotNames = mlNetSlots.DenseValues().ToList(); + var onnxSlotNames = onnxSlots.DenseValues().ToList(); + for (int j = 0; j < mlNetSlots.Length; j++) + Assert.Equal(mlNetSlotNames[j].ToString(), onnxSlotNames[j].ToString()); } } Done();