diff --git a/src/DotNetBridge/Bridge.cs b/src/DotNetBridge/Bridge.cs index 0aeb27a6..a7954355 100644 --- a/src/DotNetBridge/Bridge.cs +++ b/src/DotNetBridge/Bridge.cs @@ -128,51 +128,51 @@ public unsafe static partial class Bridge // For setting bool values to NativeBridge. [UnmanagedFunctionPointer(CallingConvention.StdCall)] - private unsafe delegate void BLSetter(EnvironmentBlock* penv, int col, long index, byte value); + private unsafe delegate void BLSetter(EnvironmentBlock* penv, int col, long m, long n, byte value); // For setting float values to NativeBridge. [UnmanagedFunctionPointer(CallingConvention.StdCall)] - private unsafe delegate void R4Setter(EnvironmentBlock* penv, int col, long index, float value); + private unsafe delegate void R4Setter(EnvironmentBlock* penv, int col, long m, long n, float value); // For setting double values to NativeBridge. [UnmanagedFunctionPointer(CallingConvention.StdCall)] - private unsafe delegate void R8Setter(EnvironmentBlock* penv, int col, long index, double value); + private unsafe delegate void R8Setter(EnvironmentBlock* penv, int col, long m, long n, double value); // For setting I1 values to NativeBridge. [UnmanagedFunctionPointer(CallingConvention.StdCall)] - private unsafe delegate void I1Setter(EnvironmentBlock* penv, int col, long index, sbyte value); + private unsafe delegate void I1Setter(EnvironmentBlock* penv, int col, long m, long n, sbyte value); // For setting I2 values to NativeBridge. [UnmanagedFunctionPointer(CallingConvention.StdCall)] - private unsafe delegate void I2Setter(EnvironmentBlock* penv, int col, long index, short value); + private unsafe delegate void I2Setter(EnvironmentBlock* penv, int col, long m, long n, short value); // For setting I4 values to NativeBridge. [UnmanagedFunctionPointer(CallingConvention.StdCall)] - private unsafe delegate void I4Setter(EnvironmentBlock* penv, int col, long index, int value); + private unsafe delegate void I4Setter(EnvironmentBlock* penv, int col, long m, long n, int value); // For setting I8 values to NativeBridge. [UnmanagedFunctionPointer(CallingConvention.StdCall)] - private unsafe delegate void I8Setter(EnvironmentBlock* penv, int col, long index, long value); + private unsafe delegate void I8Setter(EnvironmentBlock* penv, int col, long m, long n, long value); // For setting U1 values to NativeBridge. [UnmanagedFunctionPointer(CallingConvention.StdCall)] - private unsafe delegate void U1Setter(EnvironmentBlock* penv, int col, long index, byte value); + private unsafe delegate void U1Setter(EnvironmentBlock* penv, int col, long m, long n, byte value); // For setting U2 values to NativeBridge. [UnmanagedFunctionPointer(CallingConvention.StdCall)] - private unsafe delegate void U2Setter(EnvironmentBlock* penv, int col, long index, ushort value); + private unsafe delegate void U2Setter(EnvironmentBlock* penv, int col, long m, long n, ushort value); // For setting U4 values to NativeBridge. [UnmanagedFunctionPointer(CallingConvention.StdCall)] - private unsafe delegate void U4Setter(EnvironmentBlock* penv, int col, long index, uint value); + private unsafe delegate void U4Setter(EnvironmentBlock* penv, int col, long m, long n, uint value); // For setting U8 values to NativeBridge. [UnmanagedFunctionPointer(CallingConvention.StdCall)] - private unsafe delegate void U8Setter(EnvironmentBlock* penv, int col, long index, ulong value); + private unsafe delegate void U8Setter(EnvironmentBlock* penv, int col, long m, long n, ulong value); // For setting string values, to a generic pointer and index. [UnmanagedFunctionPointer(CallingConvention.StdCall)] - private unsafe delegate void TXSetter(EnvironmentBlock* penv, int col, long index, sbyte* pch, int cch); + private unsafe delegate void TXSetter(EnvironmentBlock* penv, int col, long m, long n, sbyte* pch, int cch); // For setting string key values, to a generic pointer and index. [UnmanagedFunctionPointer(CallingConvention.StdCall)] diff --git a/src/DotNetBridge/Entrypoints.cs b/src/DotNetBridge/Entrypoints.cs index 1898bb73..039247fe 100644 --- a/src/DotNetBridge/Entrypoints.cs +++ b/src/DotNetBridge/Entrypoints.cs @@ -12,6 +12,9 @@ [assembly: LoadableClass(typeof(void), typeof(DotNetBridgeEntrypoints), null, typeof(SignatureEntryPointModule), "DotNetBridgeEntrypoints")] +[assembly: LoadableClass(VariableColumnTransform.Summary, typeof(VariableColumnTransform), null, typeof(SignatureLoadDataTransform), + "", VariableColumnTransform.LoaderSignature)] + namespace Microsoft.ML.DotNetBridge { internal static class DotNetBridgeEntrypoints @@ -72,5 +75,17 @@ public static ModelSchemaOutput GetSchema(IHostEnvironment env, TransformModelIn return new ModelSchemaOutput { Schema = new EmptyDataView(host, input.Model.OutputSchema) }; } + + [TlcModule.EntryPoint(Name = "Transforms.VariableColumnTransform", Desc = VariableColumnTransform.Summary, + UserName = "Variable Column Creator", ShortName = "Variable Column Creator")] + public static CommonOutputs.TransformOutput CreateVariableColumn(IHostEnvironment env, VariableColumnTransform.Options inputOptions) + { + Contracts.CheckValue(env, nameof(env)); + var host = env.Register("VariableColumnCreator"); + EntryPointUtils.CheckInputArgs(host, inputOptions); + + var xf = VariableColumnTransform.Create(env, inputOptions, inputOptions.Data); + return new CommonOutputs.TransformOutput { Model = new TransformModelImpl(env, xf, inputOptions.Data), OutputData = xf }; + } } } diff --git a/src/DotNetBridge/NativeDataInterop.cs b/src/DotNetBridge/NativeDataInterop.cs index ab2ff993..b7f1a762 100644 --- a/src/DotNetBridge/NativeDataInterop.cs +++ b/src/DotNetBridge/NativeDataInterop.cs @@ -77,6 +77,12 @@ private struct DataViewBlock // key types. Zero means unbounded, -1 means not a key type. [FieldOffset(0x20)] public int* keyCards; + + // The number of values in each row of a column. + // A value count of 0 means that each row of the + // column is variable length. + [FieldOffset(0x28)] + public byte* valueCounts; } private struct ColumnMetadataInfo @@ -114,6 +120,7 @@ private static unsafe void SendViewToNativeAsDataFrame(IChannel ch, EnvironmentB var nameUtf8Bytes = new ValueListBuilder(UTF8_BUFFER_SIZE); var nameIndices = new ValueListBuilder(INDICES_BUFFER_SIZE); var expandCols = new HashSet(1000); + var valueCounts = new List(1000); for (int col = 0; col < schema.Count; col++) { @@ -127,11 +134,7 @@ private static unsafe void SendViewToNativeAsDataFrame(IChannel ch, EnvironmentB var kind = itemType.GetRawKind(); int keyCard; - if (fullType.GetValueCount() == 0) - { - throw ch.ExceptNotSupp("Column has variable length vector: " + - name + ". Not supported in python. Drop column before sending to Python"); - } + byte valueCount = (fullType.GetValueCount() == 0) ? (byte)0 : (byte)1; if (itemType is KeyDataViewType) { @@ -224,6 +227,7 @@ private static unsafe void SendViewToNativeAsDataFrame(IChannel ch, EnvironmentB { kindList.Append(kind); keyCardList.Append(keyCard); + valueCounts.Add(valueCount); } } @@ -234,11 +238,13 @@ private static unsafe void SendViewToNativeAsDataFrame(IChannel ch, EnvironmentB var keyCards = keyCardList.AsSpan(); var nameBytes = nameUtf8Bytes.AsSpan(); var names = new byte*[nameIndices.Length]; + var valueCountsBytes = valueCounts.ToArray(); fixed (InternalDataKind* prgkind = kinds) fixed (byte* prgbNames = nameBytes) fixed (byte** prgname = names) fixed (int* prgkeyCard = keyCards) + fixed (byte* prgbValueCount = valueCountsBytes) { for (int iid = 0; iid < names.Length; iid++) names[iid] = prgbNames + nameIndices[iid]; @@ -249,6 +255,7 @@ private static unsafe void SendViewToNativeAsDataFrame(IChannel ch, EnvironmentB block.names = (sbyte**)prgname; block.kinds = prgkind; block.keyCards = prgkeyCard; + block.valueCounts = prgbValueCount; dataSink(penv, &block, out var setters, out var keyValueSetter); @@ -290,7 +297,12 @@ private static unsafe void SendViewToNativeAsDataFrame(IChannel ch, EnvironmentB } } fillers[i] = BufferFillerBase.Create(penv, cursor, pyColumn, colIndices[i], prgkind[pyColumn], type, setters[pyColumn]); - pyColumn += type is VectorDataViewType ? type.GetVectorSize() : 1; + + if ((type is VectorDataViewType) && (type.GetVectorSize() > 0)) + { + pyColumn += type.GetVectorSize(); + } + else pyColumn++; } for (int crow = 0; ; crow++) { @@ -387,13 +399,17 @@ private static unsafe void SendViewToNativeAsCsr(IChannel ch, EnvironmentBlock* InternalDataKind.I4, InternalDataKind.I4}; + var valueCounts = new List { 1, 1, 1, 1 }; + var kinds = kindList.ToArray(); var nameBytes = nameUtf8Bytes.AsSpan(); var names = new byte*[nameIndices.Length]; + var valueCountsBytes = valueCounts.ToArray(); fixed (InternalDataKind* prgkind = kinds) fixed (byte* prgbNames = nameBytes) fixed (byte** prgname = names) + fixed (byte* prgbValueCount = valueCountsBytes) { for (int iid = 0; iid < names.Length; iid++) names[iid] = prgbNames + nameIndices[iid]; @@ -404,6 +420,7 @@ private static unsafe void SendViewToNativeAsCsr(IChannel ch, EnvironmentBlock* block.names = (sbyte**)prgname; block.kinds = prgkind; block.keyCards = null; + block.valueCounts = prgbValueCount; dataSink(penv, &block, out var setters, out var keyValueSetter); @@ -476,7 +493,7 @@ private static void AddUniqueName( private abstract unsafe class BufferFillerBase { - public delegate void ValuePoker(T value, int col, long index); + public delegate void ValuePoker(T value, int col, long m, long n); protected readonly int _colIndex; protected readonly DataViewRow _input; @@ -500,23 +517,23 @@ public static BufferFillerBase Create(EnvironmentBlock* penv, DataViewRow input, case InternalDataKind.U1: var fnI1 = MarshalDelegate(setter); ValuePoker pokeU1 = - (byte value, int col, long index) => fnI1(penv, col, index, value > keyMax ? (sbyte)-1 : (sbyte)(value - 1)); + (byte value, int col, long m, long n) => fnI1(penv, col, m, n, value > keyMax ? (sbyte)-1 : (sbyte)(value - 1)); return new Impl(input, pyCol, idvCol, type, pokeU1); case InternalDataKind.U2: var fnI2 = MarshalDelegate(setter); ValuePoker pokeU2 = - (ushort value, int col, long index) => fnI2(penv, col, index, value > keyMax ? (short)-1 : (short)(value - 1)); + (ushort value, int col, long m, long n) => fnI2(penv, col, m, n, value > keyMax ? (short)-1 : (short)(value - 1)); return new Impl(input, pyCol, idvCol, type, pokeU2); case InternalDataKind.U4: var fnI4 = MarshalDelegate(setter); ValuePoker pokeU4 = - (uint value, int col, long index) => fnI4(penv, col, index, value > keyMax ? -1 : (int)(value - 1)); + (uint value, int col, long m, long n) => fnI4(penv, col, m, n, value > keyMax ? -1 : (int)(value - 1)); return new Impl(input, pyCol, idvCol, type, pokeU4); case InternalDataKind.U8: // We convert U8 key types with key names to I4. fnI4 = MarshalDelegate(setter); ValuePoker pokeU8 = - (ulong value, int col, long index) => fnI4(penv, col, index, value > keyMax ? -1 : (int)(value - 1)); + (ulong value, int col, long m, long n) => fnI4(penv, col, m, n, value > keyMax ? -1 : (int)(value - 1)); return new Impl(input, pyCol, idvCol, type, pokeU8); } } @@ -528,23 +545,23 @@ public static BufferFillerBase Create(EnvironmentBlock* penv, DataViewRow input, case InternalDataKind.U1: var fnI1 = MarshalDelegate(setter); ValuePoker pokeU1 = - (byte value, int col, long index) => fnI1(penv, col, index, (sbyte)(value - 1)); + (byte value, int col, long m, long n) => fnI1(penv, col, m, n, (sbyte)(value - 1)); return new Impl(input, pyCol, idvCol, type, pokeU1); case InternalDataKind.U2: var fnI2 = MarshalDelegate(setter); ValuePoker pokeU2 = - (ushort value, int col, long index) => fnI2(penv, col, index, (short)(value - 1)); + (ushort value, int col, long m, long n) => fnI2(penv, col, m, n, (short)(value - 1)); return new Impl(input, pyCol, idvCol, type, pokeU2); case InternalDataKind.U4: var fnI4 = MarshalDelegate(setter); ValuePoker pokeU4 = - (uint value, int col, long index) => fnI4(penv, col, index, (int)(value - 1)); + (uint value, int col, long m, long n) => fnI4(penv, col, m, n, (int)(value - 1)); return new Impl(input, pyCol, idvCol, type, pokeU4); case InternalDataKind.U8: // We convert U8 key types with key names to I4. fnI4 = MarshalDelegate(setter); ValuePoker pokeU8 = - (ulong value, int col, long index) => fnI4(penv, col, index, (int)(value - 1)); + (ulong value, int col, long m, long n) => fnI4(penv, col, m, n, (int)(value - 1)); return new Impl(input, pyCol, idvCol, type, pokeU8); } } @@ -555,70 +572,70 @@ public static BufferFillerBase Create(EnvironmentBlock* penv, DataViewRow input, case InternalDataKind.R4: var fnR4 = MarshalDelegate(setter); ValuePoker pokeR4 = - (float value, int col, long index) => fnR4(penv, col, index, value); + (float value, int col, long m, long n) => fnR4(penv, col, m, n, value); return new Impl(input, pyCol, idvCol, type, pokeR4); case InternalDataKind.R8: var fnR8 = MarshalDelegate(setter); ValuePoker pokeR8 = - (double value, int col, long index) => fnR8(penv, col, index, value); + (double value, int col, long m, long n) => fnR8(penv, col, m, n, value); return new Impl(input, pyCol, idvCol, type, pokeR8); case InternalDataKind.BL: var fnBl = MarshalDelegate(setter); ValuePoker pokeBl = - (bool value, int col, long index) => fnBl(penv, col, index, !value ? (byte)0 : value ? (byte)1 : (byte)0xFF); + (bool value, int col, long m, long n) => fnBl(penv, col, m, n, !value ? (byte)0 : value ? (byte)1 : (byte)0xFF); return new Impl(input, pyCol, idvCol, type, pokeBl); case InternalDataKind.I1: var fnI1 = MarshalDelegate(setter); ValuePoker pokeI1 = - (sbyte value, int col, long index) => fnI1(penv, col, index, value); + (sbyte value, int col, long m, long n) => fnI1(penv, col, m, n, value); return new Impl(input, pyCol, idvCol, type, pokeI1); case InternalDataKind.I2: var fnI2 = MarshalDelegate(setter); ValuePoker pokeI2 = - (short value, int col, long index) => fnI2(penv, col, index, value); + (short value, int col, long m, long n) => fnI2(penv, col, m, n, value); return new Impl(input, pyCol, idvCol, type, pokeI2); case InternalDataKind.I4: var fnI4 = MarshalDelegate(setter); ValuePoker pokeI4 = - (int value, int col, long index) => fnI4(penv, col, index, value); + (int value, int col, long m, long n) => fnI4(penv, col, m, n, value); return new Impl(input, pyCol, idvCol, type, pokeI4); case InternalDataKind.I8: var fnI8 = MarshalDelegate(setter); ValuePoker pokeI8 = - (long value, int col, long index) => fnI8(penv, col, index, value); + (long value, int col, long m, long n) => fnI8(penv, col, m, n, value); return new Impl(input, pyCol, idvCol, type, pokeI8); case InternalDataKind.U1: var fnU1 = MarshalDelegate(setter); ValuePoker pokeU1 = - (byte value, int col, long index) => fnU1(penv, col, index, value); + (byte value, int col, long m, long n) => fnU1(penv, col, m, n, value); return new Impl(input, pyCol, idvCol, type, pokeU1); case InternalDataKind.U2: var fnU2 = MarshalDelegate(setter); ValuePoker pokeU2 = - (ushort value, int col, long index) => fnU2(penv, col, index, value); + (ushort value, int col, long m, long n) => fnU2(penv, col, m, n, value); return new Impl(input, pyCol, idvCol, type, pokeU2); case InternalDataKind.U4: var fnU4 = MarshalDelegate(setter); ValuePoker pokeU4 = - (uint value, int col, long index) => fnU4(penv, col, index, value); + (uint value, int col, long m, long n) => fnU4(penv, col, m, n, value); return new Impl(input, pyCol, idvCol, type, pokeU4); case InternalDataKind.U8: var fnU8 = MarshalDelegate(setter); ValuePoker pokeU8 = - (ulong value, int col, long index) => fnU8(penv, col, index, value); + (ulong value, int col, long m, long n) => fnU8(penv, col, m, n, value); return new Impl(input, pyCol, idvCol, type, pokeU8); case InternalDataKind.TX: var fnTX = MarshalDelegate(setter); ValuePoker> pokeTX = - (ReadOnlyMemory value, int col, long index) => + (ReadOnlyMemory value, int col, long m, long n) => { if (value.IsEmpty) - fnTX(penv, col, index, null, 0); + fnTX(penv, col, m, n, null, 0); else { byte[] bt = Encoding.UTF8.GetBytes(value.ToString()); fixed (byte* pt = bt) - fnTX(penv, col, index, (sbyte*)pt, bt.Length); + fnTX(penv, col, m, n, (sbyte*)pt, bt.Length); } }; return new Impl>(input, pyCol, idvCol, type, pokeTX); @@ -639,6 +656,7 @@ private sealed class Impl : BufferFillerBase private VBuffer _buffer; private readonly ValueGetter _get; private readonly ValuePoker _poker; + private readonly bool _isVarLength; public Impl(DataViewRow input, int pyColIndex, int idvColIndex, DataViewType type, ValuePoker poker) : base(input, pyColIndex) @@ -652,6 +670,7 @@ public Impl(DataViewRow input, int pyColIndex, int idvColIndex, DataViewType typ _get = RowCursorUtils.GetGetterAs(type, input, idvColIndex); _poker = poker; + _isVarLength = (type.GetValueCount() == 0); } public override void Set() { @@ -662,7 +681,9 @@ public override void Set() { for (int i = 0; i < _buffer.Length; i++) { - _poker(_buffer.GetValues()[i], _colIndex + i, _input.Position); + if (_isVarLength) + _poker(_buffer.GetValues()[i], _colIndex, _input.Position, i); + else _poker(_buffer.GetValues()[i], _colIndex + i, _input.Position, 0); } } else @@ -677,7 +698,10 @@ public override void Set() TSrc val = default(TSrc); if (ii < values.Length && indices[ii] == i) val = values[ii]; - _poker(val, _colIndex + i, _input.Position); + + if (_isVarLength) + _poker(val, _colIndex, _input.Position, i); + else _poker(val, _colIndex + i, _input.Position, 0); } } } @@ -685,7 +709,7 @@ public override void Set() { TSrc value = default(TSrc); _get(ref value); - _poker(value, _colIndex, _input.Position); + _poker(value, _colIndex, _input.Position, 0); } } } @@ -734,20 +758,20 @@ public CsrData(EnvironmentBlock* penv, void** setters, InternalDataKind outputDa _indptrSetter = MarshalDelegate(setters[IndPtrCol]); _shapeSetter = MarshalDelegate(setters[ShapeCol]); - _indptrSetter(_penv, IndPtrCol, 0, 0); + _indptrSetter(_penv, IndPtrCol, 0, 0, 0); } public void AppendR4(float value, int col) { - _r4DataSetter(_penv, DataCol, _index, value); - _indicesSetter(_penv, IndicesCol, _index, col); + _r4DataSetter(_penv, DataCol, _index, 0, value); + _indicesSetter(_penv, IndicesCol, _index, 0, col); _index++; } public void AppendR8(double value, int col) { - _r8DataSetter(_penv, DataCol, _index, value); - _indicesSetter(_penv, IndicesCol, _index, col); + _r8DataSetter(_penv, DataCol, _index, 0, value); + _indicesSetter(_penv, IndicesCol, _index, 0, col); _index++; } @@ -756,13 +780,13 @@ public void IncrementRow() col = 0; _row++; - _indptrSetter(_penv, IndPtrCol, _row, _index); + _indptrSetter(_penv, IndPtrCol, _row, 0, _index); } public void SetShape(int m, int n) { - _shapeSetter(_penv, ShapeCol, 0, m); - _shapeSetter(_penv, ShapeCol, 1, n); + _shapeSetter(_penv, ShapeCol, 0, 0, m); + _shapeSetter(_penv, ShapeCol, 1, 0, n); } } diff --git a/src/DotNetBridge/transforms/VariableColumnTransform.cs b/src/DotNetBridge/transforms/VariableColumnTransform.cs new file mode 100644 index 00000000..ea9ecafb --- /dev/null +++ b/src/DotNetBridge/transforms/VariableColumnTransform.cs @@ -0,0 +1,337 @@ +//------------------------------------------------------------------------------ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. +//------------------------------------------------------------------------------ + +using System; +using System.Collections.Generic; +using Microsoft.ML; +using Microsoft.ML.CommandLine; +using Microsoft.ML.Data; +using Microsoft.ML.Runtime; +using Microsoft.ML.Transforms; +using Microsoft.ML.Internal.Utilities; + + +namespace Microsoft.ML.DotNetBridge +{ + using BitArray = System.Collections.BitArray; + + /// + /// A transform that combines the specified input columns + /// in to a single variable length vectorized column and + /// passes the rest of the columns through unchanged. + /// + [BestFriend] + internal sealed class VariableColumnTransform : IDataTransform, IRowToRowMapper + { + public class Options : TransformInputBase + { + [Argument(ArgumentType.Multiple, HelpText = "Features", SortOrder = 2)] + public string[] Features; + + [Argument(ArgumentType.Multiple, HelpText = "Length Column Name", SortOrder = 2)] + public string LengthColumnName; + } + + private sealed class Bindings + { + public readonly List outputToInputMap; + public readonly List vectorToInputMap; + public int outputColumn; + public int lengthColumn; + + public Bindings() + { + outputToInputMap = new List(); + vectorToInputMap = new List(); + outputColumn = -1; + lengthColumn = -1; + } + } + + private readonly IHost _host; + private readonly Bindings _bindings; + private readonly HashSet _columnNames; + + public IDataView Source { get; } + + DataViewSchema IRowToRowMapper.InputSchema => Source.Schema; + + private VariableColumnTransform(IHostEnvironment env, IDataView input, string[] features, string lengthColumnName) + { + Contracts.CheckValue(env, nameof(env)); + + Source = input; + _host = env.Register(RegistrationName); + _bindings = new Bindings(); + + _columnNames = (features == null) ? new HashSet() : + new HashSet(features); + + OutputSchema = ProcessInputSchema(input.Schema, lengthColumnName); + } + + internal const string Summary = "Combines the specified input columns in to a single variable length vectorized column."; + + public const string LoaderSignature = "VariableColumnTransform"; + + private static VersionInfo GetVersionInfo() + { + return new VersionInfo( + modelSignature: "VARLENCL", + verWrittenCur: 0x00010001, // Initial + verReadableCur: 0x00010001, + verWeCanReadBack: 0x00010001, + loaderSignature: LoaderSignature, + loaderAssemblyName: typeof(VariableColumnTransform).Assembly.FullName); + } + + internal static string RegistrationName = "VariableColumnTransform"; + + public static VariableColumnTransform Create(IHostEnvironment env, Options options, IDataView input) + { + return new VariableColumnTransform(env, input, options.Features, options.LengthColumnName); + } + + public static VariableColumnTransform Create(IHostEnvironment env, ModelLoadContext ctx, IDataView input) + { + Contracts.CheckValue(env, nameof(env)); + var h = env.Register(RegistrationName); + h.CheckValue(ctx, nameof(ctx)); + h.CheckValue(input, nameof(input)); + ctx.CheckAtModel(GetVersionInfo()); + return h.Apply("Loading Model", ch => new VariableColumnTransform(h, ctx, input)); + } + + private VariableColumnTransform(IHost host, ModelLoadContext ctx, IDataView input) + { + Contracts.AssertValue(host, nameof(host)); + host.CheckValue(input, nameof(input)); + + Source = input; + _host = host; + + // TODO: fill this in + } + + void ICanSaveModel.Save(ModelSaveContext ctx) + { + _host.CheckValue(ctx, nameof(ctx)); + ctx.CheckAtModel(); + ctx.SetVersionInfo(GetVersionInfo()); + + // TODO: fill this in + } + + public bool CanShuffle => Source.CanShuffle; + + DataViewSchema IDataView.Schema => OutputSchema; + public DataViewSchema OutputSchema { get; } + + private DataViewSchema ProcessInputSchema(DataViewSchema inputSchema, string lengthColumnName) + { + var builder = new DataViewSchema.Builder(); + for (int i = 0; i < inputSchema.Count; i++) + { + var name = inputSchema[i].Name; + + if (_columnNames.Contains(name)) + { + _bindings.vectorToInputMap.Add(i); + } + else if (name == lengthColumnName) + { + _bindings.lengthColumn = i; + } + else + { + builder.AddColumn(name, inputSchema[i].Type); + _bindings.outputToInputMap.Add(i); + } + } + + if (_bindings.vectorToInputMap.Count > 0) + { + var type = inputSchema[_bindings.vectorToInputMap[0]].Type as PrimitiveDataViewType; + + for (int i = 1; i < _bindings.vectorToInputMap.Count; i++) + { + var nextType = inputSchema[_bindings.vectorToInputMap[i]].Type as PrimitiveDataViewType; + if (!nextType.Equals(type)) + { + throw Contracts.Except("Input data types of the columns to vectorize must " + + "all be of the same type. Found {0} and {1}.", + type.ToString(), + nextType.ToString()); + } + } + + var outputColumnType = new VectorDataViewType(type, 0); + var outputColumnName = inputSchema[_bindings.vectorToInputMap[0]].Name; + builder.AddColumn(outputColumnName, outputColumnType); + + _bindings.outputColumn = _bindings.outputToInputMap.Count; + } + + return builder.ToSchema(); + } + + public long? GetRowCount() + { + return Source.GetRowCount(); + } + + public DataViewRowCursor GetRowCursor(IEnumerable columnsNeeded, Random rand = null) + { + var predicate = RowCursorUtils.FromColumnsToPredicate(columnsNeeded, OutputSchema); + + _host.CheckValueOrNull(rand); + return new Cursor(_host, this, _bindings, predicate, rand); + } + + public DataViewRowCursor[] GetRowCursorSet(IEnumerable columnsNeeded, int n, Random rand = null) + { + var predicate = RowCursorUtils.FromColumnsToPredicate(columnsNeeded, OutputSchema); + + _host.CheckValueOrNull(rand); + return new DataViewRowCursor[] { new Cursor(_host, this, _bindings, predicate, rand) }; + } + + private sealed class Cursor : RootCursorBase + { + private readonly IDataTransform _view; + private readonly BitArray _active; + private readonly Bindings _bindings; + private readonly DataViewRowCursor _cursor; + + public override DataViewSchema Schema => _view.Schema; + + public override long Batch + { + get { return 0; } + } + + public Cursor(IChannelProvider provider, IDataTransform view, Bindings bindings, Func predicate, Random rand) + : base(provider) + { + Ch.AssertValue(view); + Ch.AssertValueOrNull(rand); + Ch.Assert(view.Schema.Count >= 0); + + _view = view; + _bindings = bindings; + _cursor = view.Source.GetRowCursorForAllColumns(); + _active = new BitArray(view.Schema.Count); + + if (predicate == null) _active.SetAll(true); + else + { + for (int i = 0; i < view.Schema.Count; ++i) + _active[i] = predicate(i); + } + } + + public override ValueGetter GetIdGetter() + { + return (ref DataViewRowId val) => + { + Ch.Check(IsGood, RowCursorUtils.FetchValueStateError); + val = new DataViewRowId((ulong)Position, 0); + }; + } + + public override bool IsColumnActive(DataViewSchema.Column column) + { + Ch.Check(column.Index < Schema.Count); + return _active[column.Index]; + } + + private Delegate MakeVarLengthVectorGetter(DataViewRow input) + { + var srcGetters = new ValueGetter[_bindings.vectorToInputMap.Count]; + ValueGetter lengthGetter = null; + + for (int i = 0; i < _bindings.vectorToInputMap.Count; i++) + { + var column = input.Schema[_bindings.vectorToInputMap[i]]; + srcGetters[i] = input.GetGetter(column); + } + + if (_bindings.lengthColumn >= 0) + { + var column = input.Schema[_bindings.lengthColumn]; + lengthGetter = input.GetGetter(column); + } + + T tmp = default(T); + ValueGetter> result = (ref VBuffer dst) => + { + int length = _bindings.vectorToInputMap.Count; + if (lengthGetter != null) + { + long expectedLength = length; + lengthGetter(ref expectedLength); + + if ((expectedLength >= 0) && (expectedLength < length)) + { + length = (int)expectedLength; + } + } + + var editor = VBufferEditor.Create(ref dst, length); + + for (int i = 0; i < length; i++) + { + srcGetters[i](ref tmp); + editor.Values[i] = tmp; + } + + dst = editor.Commit(); + }; + return result; + } + + /// + /// Returns a value getter delegate to fetch the value of column with the given columnIndex, from the row. + /// This throws if the column is not active in this row, or if the type + /// differs from this column's type. + /// + /// is the column's content type. + /// is the output column whose getter should be returned. + public override ValueGetter GetGetter(DataViewSchema.Column column) + { + if (column.Index == _bindings.outputColumn) + { + VectorDataViewType columnType = column.Type as VectorDataViewType; + Delegate getter = Utils.MarshalInvoke(MakeVarLengthVectorGetter, columnType.ItemType.RawType, _cursor); + return getter as ValueGetter; + } + else + { + int inputIndex = _bindings.outputToInputMap[column.Index]; + return _cursor.GetGetter(_cursor.Schema[inputIndex]); + } + } + + protected override bool MoveNextCore() + { + return _cursor.MoveNext(); + } + } + + /// + /// Given a set of columns, return the input columns that are needed to generate those output columns. + /// + IEnumerable IRowToRowMapper.GetDependencies(IEnumerable dependingColumns) + => dependingColumns; + + DataViewRow IRowToRowMapper.GetRow(DataViewRow input, IEnumerable activeColumns) + { + Contracts.CheckValue(input, nameof(input)); + Contracts.CheckValue(activeColumns, nameof(activeColumns)); + Contracts.CheckParam(input.Schema == Source.Schema, nameof(input), "Schema of input row must be the same as the schema the mapper is bound to"); + return input; + } + } +} \ No newline at end of file diff --git a/src/NativeBridge/DataViewInterop.h b/src/NativeBridge/DataViewInterop.h index 00faca4b..01dc21fc 100644 --- a/src/NativeBridge/DataViewInterop.h +++ b/src/NativeBridge/DataViewInterop.h @@ -522,6 +522,10 @@ struct DataViewBlock // Column key type cardinalities. Only contains the values for the columns that have // key names. const int *keyCards; + // The number of values in each row of a column. + // A value count of 0 means that each row of the + // column is variable length. + const BYTE *valueCounts; }; enum ML_PY_TYPE_MAP_ENUM { diff --git a/src/NativeBridge/ManagedInterop.cpp b/src/NativeBridge/ManagedInterop.cpp index 7c6a6fcb..46062fb5 100644 --- a/src/NativeBridge/ManagedInterop.cpp +++ b/src/NativeBridge/ManagedInterop.cpp @@ -6,63 +6,16 @@ #include "DataViewInterop.h" #include "ManagedInterop.h" -inline void destroyManagerCObject(PyObject* obj) { - auto* b = static_cast(PyCapsule_GetPointer(obj, NULL)); - if (b) { delete b; } -} - -#define SetDict2(cpptype, nptype); \ - {\ - PythonObject* col = dynamic_cast*>(column);\ - auto shrd = col->GetData();\ - auto* data = shrd->data();\ - bp::handle<> h(::PyCapsule_New((void*)column, NULL, (PyCapsule_Destructor)&destroyManagerCObject));\ - dict[_names[i]] = np::from_data(\ - data,\ - np::dtype::get_builtin(),\ - bp::make_tuple(shrd->size()),\ - bp::make_tuple(sizeof(nptype)), bp::object(h));\ - } - -#define SetDict1(type) SetDict2(type, type) -#define SetDictAndKeys(type, i); \ - {\ - PythonObject* col = dynamic_cast*>(column);\ - auto shrd = col->GetData();\ - auto* data = shrd->data();\ - bp::handle<> h(::PyCapsule_New((void*)column, NULL, (PyCapsule_Destructor)&destroyManagerCObject));\ - np::ndarray npdata = np::from_data(\ - data,\ - np::dtype::get_builtin(),\ - bp::make_tuple(shrd->size()),\ - bp::make_tuple(sizeof(float)), bp::object(h));\ - if (keyNames == nullptr)\ - {\ - dict[_names[i]] = npdata;\ - }\ - else\ - {\ - dict[_names[i]] = bp::dict();\ - dict[_names[i]]["..Data"] = npdata;\ - auto shrd = keyNames->GetData();\ - bp::list list;\ - for (int j = 0; j < shrd->size(); j++)\ - {\ - bp::object obj;\ - const std::string& value = shrd->at(j);\ - if (!value.empty())\ - {\ - obj = bp::object(value);\ - }\ - list.append(obj);\ - }\ - dict[_names[i]]["..KeyValues"] = list;\ - }\ - }\ +#define AddToDict(type); \ + {\ + PyColumn* col = dynamic_cast*>(column);\ + col->AddToDict(dict, _names[i], keyNames, maxRows);\ + }\ #define STATIC + EnvironmentBlock::~EnvironmentBlock() { // Everything (except data buffers) that we might have exposed to managed code, @@ -112,7 +65,7 @@ void EnvironmentBlock::DataSinkCore(const DataViewBlock * pdata) for (int i = 0; i < pdata->ccol; i++) { BYTE kind = pdata->kinds[i]; - _columns.push_back(PythonObjectBase::CreateObject(kind, pdata->crow, 1)); + _columns.push_back(PyColumnBase::Create(kind, pdata->crow, pdata->valueCounts[i])); switch (kind) { @@ -162,7 +115,7 @@ void EnvironmentBlock::DataSinkCore(const DataViewBlock * pdata) if (pdata->keyCards && (pdata->keyCards[i] >= 0)) { _columnToKeyMap.insert(i); - _vKeyValues.push_back(new PythonObject(TX, pdata->keyCards[i], 1)); + _vKeyValues.push_back(new PyColumnSingle(TX, pdata->keyCards[i])); } _names.push_back(pdata->names[i]); @@ -230,20 +183,27 @@ bp::dict EnvironmentBlock::GetData() return bp::dict(); } + size_t maxRows = 0; + for (size_t i = 0; i < _columns.size(); i++) + { + size_t numRows = _columns[i]->GetNumRows(); + if (numRows > maxRows) maxRows = numRows; + } + CxInt64 numKeys = 0; bp::dict dict = bp::dict(); for (size_t i = 0; i < _columns.size(); i++) { - PythonObjectBase* column = _columns[i]; - PythonObject* keyNames = nullptr; + PyColumnBase* column = _columns[i]; + const std::vector* keyNames = nullptr; if (_columnToKeyMap.find(i) != _columnToKeyMap.end()) - keyNames = _vKeyValues[numKeys++]; + keyNames = _vKeyValues[numKeys++]->GetData(); signed char kind = column->GetKind(); switch (kind) { case -1: { - PythonObject* col = dynamic_cast*>(column); + PyColumnSingle* col = dynamic_cast*>(column); auto shrd = col->GetData(); bp::list list; for (size_t i = 0; i < shrd->size(); i++) @@ -263,57 +223,42 @@ bp::dict EnvironmentBlock::GetData() } break; case BL: - SetDict2(signed char, bool); + AddToDict(signed char); break; case I1: - SetDictAndKeys(signed char, i); + AddToDict(signed char); break; case I2: - SetDictAndKeys(signed short, i); + AddToDict(signed short); break; case I4: - SetDictAndKeys(signed int, i); + AddToDict(signed int); break; case I8: - SetDict1(CxInt64); + AddToDict(CxInt64); break; case U1: - SetDict1(unsigned char); + AddToDict(unsigned char); break; case U2: - SetDict1(unsigned short); + AddToDict(unsigned short); break; case U4: - SetDict1(unsigned int); + AddToDict(unsigned int); break; case U8: - SetDict1(CxUInt64); + AddToDict(CxUInt64); break; case R4: - SetDict1(float); + AddToDict(float); break; case R8: - SetDict1(double); + AddToDict(double); break; case TX: - { - PythonObject* col = dynamic_cast*>(column); - auto shrd = col->GetData(); - bp::list list; - for (size_t i = 0; i < shrd->size(); i++) - { - bp::object obj; - const std::string& value = shrd->at(i); - if (!value.empty()) - { - obj = bp::object(value); - } - list.append(obj); - } - dict[_names[i]] = list; + AddToDict(std::string); delete column; - } - break; + break; case TS: case DT: case DZ: diff --git a/src/NativeBridge/ManagedInterop.h b/src/NativeBridge/ManagedInterop.h index 519c464f..485a59cc 100644 --- a/src/NativeBridge/ManagedInterop.h +++ b/src/NativeBridge/ManagedInterop.h @@ -104,90 +104,90 @@ class CLASS_ALIGN EnvironmentBlock // Column names. std::vector _names; - std::vector _columns; + std::vector _columns; // Set of all key column indexes. std::unordered_set _columnToKeyMap; - std::vector*> _vKeyValues; + std::vector*> _vKeyValues; - static MANAGED_CALLBACK(void) SetR4(EnvironmentBlock *env, int col, long index, float value) + static MANAGED_CALLBACK(void) SetR4(EnvironmentBlock *env, int col, long m, long n, float value) { - PythonObject* colObject = dynamic_cast*>(env->_columns[col]); + PyColumn* colObject = dynamic_cast*>(env->_columns[col]); assert(colObject != nullptr); - colObject->SetAt(index, 0, value); + colObject->SetAt(m, n, value); } - static MANAGED_CALLBACK(void) SetR8(EnvironmentBlock *env, int col, long index, double value) + static MANAGED_CALLBACK(void) SetR8(EnvironmentBlock *env, int col, long m, long n, double value) { - PythonObject* colObject = dynamic_cast*>(env->_columns[col]); + PyColumn* colObject = dynamic_cast*>(env->_columns[col]); assert(colObject != nullptr); - colObject->SetAt(index, 0, value); + colObject->SetAt(m, n, value); } - static MANAGED_CALLBACK(void) SetBL(EnvironmentBlock *env, int col, long index, signed char value) + static MANAGED_CALLBACK(void) SetBL(EnvironmentBlock *env, int col, long m, long n, signed char value) { - PythonObject* colObject = dynamic_cast*>(env->_columns[col]); + PyColumn* colObject = dynamic_cast*>(env->_columns[col]); assert(colObject != nullptr); - colObject->SetAt(index, 0, value); + colObject->SetAt(m, n, value); if (value < 0) env->_columns[col]->SetKind(-1); } - static MANAGED_CALLBACK(void) SetI1(EnvironmentBlock *env, int col, long index, signed char value) + static MANAGED_CALLBACK(void) SetI1(EnvironmentBlock *env, int col, long m, long n, signed char value) { - PythonObject* colObject = dynamic_cast*>(env->_columns[col]); + PyColumn* colObject = dynamic_cast*>(env->_columns[col]); assert(colObject != nullptr); - colObject->SetAt(index, 0, value); + colObject->SetAt(m, n, value); } - static MANAGED_CALLBACK(void) SetI2(EnvironmentBlock *env, int col, long index, short value) + static MANAGED_CALLBACK(void) SetI2(EnvironmentBlock *env, int col, long m, long n, short value) { - PythonObject* colObject = dynamic_cast*>(env->_columns[col]); + PyColumn* colObject = dynamic_cast*>(env->_columns[col]); assert(colObject != nullptr); - colObject->SetAt(index, 0, value); + colObject->SetAt(m, n, value); } - static MANAGED_CALLBACK(void) SetI4(EnvironmentBlock *env, int col, long index, int value) + static MANAGED_CALLBACK(void) SetI4(EnvironmentBlock *env, int col, long m, long n, int value) { - PythonObject* colObject = dynamic_cast*>(env->_columns[col]); + PyColumn* colObject = dynamic_cast*>(env->_columns[col]); assert(colObject != nullptr); - colObject->SetAt(index, 0, value); + colObject->SetAt(m, n, value); } - static MANAGED_CALLBACK(void) SetI8(EnvironmentBlock *env, int col, long index, CxInt64 value) + static MANAGED_CALLBACK(void) SetI8(EnvironmentBlock *env, int col, long m, long n, CxInt64 value) { - PythonObject* colObject = dynamic_cast*>(env->_columns[col]); + PyColumn* colObject = dynamic_cast*>(env->_columns[col]); assert(colObject != nullptr); - colObject->SetAt(index, 0, value); + colObject->SetAt(m, n, value); } - static MANAGED_CALLBACK(void) SetU1(EnvironmentBlock *env, int col, long index, unsigned char value) + static MANAGED_CALLBACK(void) SetU1(EnvironmentBlock *env, int col, long m, long n, unsigned char value) { - PythonObject* colObject = dynamic_cast*>(env->_columns[col]); + PyColumn* colObject = dynamic_cast*>(env->_columns[col]); assert(colObject != nullptr); - colObject->SetAt(index, 0, value); + colObject->SetAt(m, n, value); } - static MANAGED_CALLBACK(void) SetU2(EnvironmentBlock *env, int col, long index, unsigned short value) + static MANAGED_CALLBACK(void) SetU2(EnvironmentBlock *env, int col, long m, long n, unsigned short value) { - PythonObject* colObject = dynamic_cast*>(env->_columns[col]); + PyColumn* colObject = dynamic_cast*>(env->_columns[col]); assert(colObject != nullptr); - colObject->SetAt(index, 0, value); + colObject->SetAt(m, n, value); } - static MANAGED_CALLBACK(void) SetU4(EnvironmentBlock *env, int col, long index, unsigned int value) + static MANAGED_CALLBACK(void) SetU4(EnvironmentBlock *env, int col, long m, long n, unsigned int value) { - PythonObject* colObject = dynamic_cast*>(env->_columns[col]); + PyColumn* colObject = dynamic_cast*>(env->_columns[col]); assert(colObject != nullptr); - colObject->SetAt(index, 0, value); + colObject->SetAt(m, n, value); } - static MANAGED_CALLBACK(void) SetU8(EnvironmentBlock *env, int col, long index, CxUInt64 value) + static MANAGED_CALLBACK(void) SetU8(EnvironmentBlock *env, int col, long m, long n, CxUInt64 value) { - PythonObject* colObject = dynamic_cast*>(env->_columns[col]); + PyColumn* colObject = dynamic_cast*>(env->_columns[col]); assert(colObject != nullptr); - colObject->SetAt(index, 0, value); + colObject->SetAt(m, n, value); } - static MANAGED_CALLBACK(void) SetTX(EnvironmentBlock *env, int col, long index, char* value, long length) + static MANAGED_CALLBACK(void) SetTX(EnvironmentBlock *env, int col, long m, long n, char* value, long length) { - PythonObject* colObject = dynamic_cast*>(env->_columns[col]); + PyColumn* colObject = dynamic_cast*>(env->_columns[col]); assert(colObject != nullptr); - colObject->SetAt(index, 0, std::string(value, length)); + colObject->SetAt(m, n, std::string(value, length)); } static MANAGED_CALLBACK(void) SetKeyValue(EnvironmentBlock *env, int keyColumnIndex, int keyCode, char* value, long length) { assert(keyColumnIndex < env->_vKeyValues.size()); - PythonObject* keyNamesObject = env->_vKeyValues[keyColumnIndex]; + PyColumn* keyNamesObject = env->_vKeyValues[keyColumnIndex]; keyNamesObject->SetAt(keyCode, 0, std::string(value, length)); } }; diff --git a/src/NativeBridge/PythonInterop.cpp b/src/NativeBridge/PythonInterop.cpp index be2f0d79..46586fa7 100644 --- a/src/NativeBridge/PythonInterop.cpp +++ b/src/NativeBridge/PythonInterop.cpp @@ -1,55 +1,325 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT license. +#include #include "stdafx.h" #include "PythonInterop.h" -PythonObjectBase::PythonObjectBase(const int& kind) + +inline void destroyManagerCObject(PyObject* obj) { + auto* b = static_cast(PyCapsule_GetPointer(obj, NULL)); + if (b) { delete b; } +} + + +PyColumnBase::PyColumnBase(const int& kind) { _kind = kind; } -PythonObjectBase::~PythonObjectBase() +PyColumnBase::~PyColumnBase() { } -PythonObjectBase::creation_map* PythonObjectBase::m_pCreationMap = PythonObjectBase::CreateMap(); +PyColumnBase::creation_map* PyColumnBase::m_pSingleCreationMap = PyColumnBase::CreateSingleMap(); +PyColumnBase::creation_map* PyColumnBase::m_pVariableCreationMap = PyColumnBase::CreateVariableMap(); -PythonObjectBase::creation_map* PythonObjectBase::CreateMap() +PyColumnBase::creation_map* PyColumnBase::CreateSingleMap() { - PythonObjectBase::creation_map* map = new PythonObjectBase::creation_map(); + PyColumnBase::creation_map* map = new PyColumnBase::creation_map(); - map->insert(creation_map_entry(BL, CreateObject)); - map->insert(creation_map_entry(I1, CreateObject)); - map->insert(creation_map_entry(I2, CreateObject)); - map->insert(creation_map_entry(I4, CreateObject)); - map->insert(creation_map_entry(I8, CreateObject)); - map->insert(creation_map_entry(U1, CreateObject)); - map->insert(creation_map_entry(U2, CreateObject)); - map->insert(creation_map_entry(U4, CreateObject)); - map->insert(creation_map_entry(U8, CreateObject)); - map->insert(creation_map_entry(R4, CreateObject)); - map->insert(creation_map_entry(R8, CreateObject)); - map->insert(creation_map_entry(TX, CreateObject)); + map->insert(creation_map_entry(BL, CreateSingle)); + map->insert(creation_map_entry(I1, CreateSingle)); + map->insert(creation_map_entry(I2, CreateSingle)); + map->insert(creation_map_entry(I4, CreateSingle)); + map->insert(creation_map_entry(I8, CreateSingle)); + map->insert(creation_map_entry(U1, CreateSingle)); + map->insert(creation_map_entry(U2, CreateSingle)); + map->insert(creation_map_entry(U4, CreateSingle)); + map->insert(creation_map_entry(U8, CreateSingle)); + map->insert(creation_map_entry(R4, CreateSingle)); + map->insert(creation_map_entry(R8, CreateSingle)); + map->insert(creation_map_entry(TX, CreateSingle)); return map; } -PythonObjectBase* PythonObjectBase::CreateObject(const int& kind, size_t numRows, size_t numCols) +PyColumnBase::creation_map* PyColumnBase::CreateVariableMap() { - creation_map::iterator found = m_pCreationMap->find(kind); + PyColumnBase::creation_map* map = new PyColumnBase::creation_map(); + + map->insert(creation_map_entry(BL, CreateVariable)); + map->insert(creation_map_entry(I1, CreateVariable)); + map->insert(creation_map_entry(I2, CreateVariable)); + map->insert(creation_map_entry(I4, CreateVariable)); + map->insert(creation_map_entry(I8, CreateVariable)); + map->insert(creation_map_entry(U1, CreateVariable)); + map->insert(creation_map_entry(U2, CreateVariable)); + map->insert(creation_map_entry(U4, CreateVariable)); + map->insert(creation_map_entry(U8, CreateVariable)); + map->insert(creation_map_entry(R4, CreateVariable)); + map->insert(creation_map_entry(R8, CreateVariable)); + map->insert(creation_map_entry(TX, CreateVariable)); + return map; +} - if (found == m_pCreationMap->end()) +PyColumnBase* PyColumnBase::Create(const int& kind, size_t numRows, size_t numCols) +{ + if (numCols == 0) + { + creation_map::iterator found = m_pVariableCreationMap->find(kind); + if (found != m_pVariableCreationMap->end()) + return found->second(kind, numRows); + } + else { - std::stringstream message; - message << "Columns of kind " << kind << " are not supported."; - throw std::invalid_argument(message.str().c_str()); + creation_map::iterator found = m_pSingleCreationMap->find(kind); + if (found != m_pSingleCreationMap->end()) + return found->second(kind, numRows); } - return found->second(kind, numRows, numCols); + std::stringstream message; + message << "Columns of kind " << kind << " are not supported."; + throw std::invalid_argument(message.str().c_str()); +} + +template PyColumnBase* PyColumnBase::CreateSingle(const int& kind, size_t nRows) +{ + return new PyColumnSingle(kind, nRows); +} + +template PyColumnBase* PyColumnBase::CreateVariable(const int& kind, size_t nRows) +{ + return new PyColumnVariable(kind, nRows); +} + +template +void PyColumnSingle::AddToDict(bp::dict& dict, + const std::string& name, + const std::vector* keyNames, + const size_t expectedRows) +{ + auto* data = _pData->data(); + + switch (this->_kind) + { + case DataKind::BL: + { + bp::handle<> h(::PyCapsule_New((void*)this, NULL, (PyCapsule_Destructor)&destroyManagerCObject)); + dict[name] = np::from_data( + data, + np::dtype::get_builtin(), + bp::make_tuple(_pData->size()), + bp::make_tuple(sizeof(bool)), bp::object(h)); + } + break; + case DataKind::I1: + case DataKind::I2: + case DataKind::I4: + { + bp::handle<> h(::PyCapsule_New((void*)this, NULL, (PyCapsule_Destructor)&destroyManagerCObject)); + np::ndarray npdata = np::from_data( + data, + np::dtype::get_builtin(), + bp::make_tuple(_pData->size()), + bp::make_tuple(sizeof(float)), bp::object(h)); + if (keyNames == nullptr) + { + dict[name] = npdata; + } + else + { + dict[name] = bp::dict(); + dict[name]["..Data"] = npdata; + bp::list list; + for (int j = 0; j < keyNames->size(); j++) + { + bp::object obj; + const std::string& value = keyNames->at(j); + if (!value.empty()) + { + obj = bp::object(value); + } + list.append(obj); + } + dict[name]["..KeyValues"] = list; + } + } + break; + case DataKind::I8: + case DataKind::U1: + case DataKind::U2: + case DataKind::U4: + case DataKind::U8: + case DataKind::R4: + case DataKind::R8: + { + bp::handle<> h(::PyCapsule_New((void*)this, NULL, (PyCapsule_Destructor)&destroyManagerCObject)); + dict[name] = np::from_data( + data, + np::dtype::get_builtin(), + bp::make_tuple(_pData->size()), + bp::make_tuple(sizeof(T)), bp::object(h)); + } + break; + } } -template PythonObjectBase* PythonObjectBase::CreateObject(const int& kind, size_t nRows, size_t nColumns) +template <> +void PyColumnSingle::AddToDict(bp::dict& dict, + const std::string& name, + const std::vector* keyNames, + const size_t expectedRows) { - return new PythonObject(kind, nRows, nColumns); + bp::list list; + for (size_t i = 0; i < _pData->size(); i++) + { + bp::object obj; + const std::string& value = _pData->at(i); + if (!value.empty()) + { + obj = bp::object(value); + } + list.append(obj); + } + dict[name] = list; } +template +void PyColumnVariable::SetAt(size_t nRow, size_t nCol, const T& value) +{ + if ((nRow + 1) > _numRows) _numRows = nRow + 1; + + /* + * Make sure there are enough columns for the request. + */ + for (size_t i = _data.size(); i <= nCol; i++) + { + _data.push_back(new std::vector()); + } + + std::vector* pColData = _data[nCol]; + + /* + * Fill in any missing row values. + */ + for (size_t i = pColData->size(); i < nRow; i++) + { + pColData->push_back(GetMissingValue()); + } + + pColData->push_back(GetConvertedValue(value)); +} + +/* + * Note: an instance of this object should not be used + * and should be considered invalid after the first time + * this method has been called. + */ +template +void PyColumnVariable::Deleter(PyObject* obj) +{ + auto* deleteData = static_cast::DeleteData*>(PyCapsule_GetPointer(obj, NULL)); + + PyColumnVariable* instance = deleteData->instance; + size_t column = deleteData->column; + + std::vector* data = instance->_data[column]; + if (data != nullptr) + { + instance->_data[column] = nullptr; + instance->_numDeletedColumns++; + delete data; + + if (instance->_numDeletedColumns == instance->_data.size()) + { + delete instance; + } + } +} + +template +void PyColumnVariable::AddToDict(bp::dict& dict, + const std::string& name, + const std::vector* keyNames, + const size_t expectedRows) +{ + size_t numRows = (expectedRows > _numRows) ? expectedRows : _numRows; + size_t numCols = _data.size(); + + if (numCols == 0) + { + /* + * If there were no values set then create a + * column so it can be filled with missing values. + */ + _data.push_back(new std::vector()); + numCols = 1; + } + + const std::string colNameBase = name + "."; + int maxDigits = (int)ceil(log10(numCols)); + if (maxDigits == 0) maxDigits = 1; + + for (size_t i = 0; i < numCols; i++) + { + std::vector* pColData = _data[i]; + + /* + * Make sure all the columns are the same length. + */ + for (size_t j = pColData->size(); j < numRows; j++) + { + pColData->push_back(GetMissingValue()); + } + + std::string colName = std::to_string(i); + colName = std::string(maxDigits - colName.length(), '0') + colName; + colName = colNameBase + colName; + + AddColumnToDict(dict, colName, i); + } +} + +template +void PyColumnVariable::AddColumnToDict(bp::dict& dict, + const std::string& name, + size_t index) +{ + auto* data = _data[index]->data(); + + DeleteData* deleteData = new DeleteData(); + deleteData->instance = this; + deleteData->column = index; + + bp::handle<> h(::PyCapsule_New((void*)deleteData, NULL, (PyCapsule_Destructor)&Deleter)); + dict[name] = np::from_data( + data, + np::dtype::get_builtin(), + bp::make_tuple(_data[index]->size()), + bp::make_tuple(sizeof(T2)), bp::object(h)); +} + +template<> +void PyColumnVariable::AddColumnToDict(bp::dict& dict, + const std::string& name, + size_t index) +{ + bp::list list; + std::vector* pColData = _data[index]; + size_t numRows = pColData->size(); + + for (size_t i = 0; i < numRows; i++) + { + bp::object obj; + NullableString value = pColData->at(i); + + if (value) + { + obj = bp::object(*value); + } + + list.append(obj); + } + + dict[name] = list; +} diff --git a/src/NativeBridge/PythonInterop.h b/src/NativeBridge/PythonInterop.h index a0fbce58..8929ae39 100644 --- a/src/NativeBridge/PythonInterop.h +++ b/src/NativeBridge/PythonInterop.h @@ -3,6 +3,9 @@ #pragma once #include +#include +#include + // Taken from ML.NET source code. These values should be stable. enum DataKind @@ -24,85 +27,204 @@ enum DataKind DZ = 15, }; -class PythonObjectBase +class PyColumnBase { private: - typedef std::map creation_map; - typedef std::pair creation_map_entry; + typedef std::map creation_map; + typedef std::pair creation_map_entry; + + static creation_map* m_pSingleCreationMap; + static creation_map* CreateSingleMap(); - static creation_map* m_pCreationMap; - static creation_map* CreateMap(); + static creation_map* m_pVariableCreationMap; + static creation_map* CreateVariableMap(); - template static PythonObjectBase* CreateObject(const int& name, size_t nRows, size_t nColumns); + template static PyColumnBase* CreateSingle(const int& kind, size_t nRows); + template static PyColumnBase* CreateVariable(const int& kind, size_t nRows); protected: int _kind; public: - PythonObjectBase(const int& kind); - static PythonObjectBase* CreateObject(const int& kind, size_t numRows, size_t numCols); - const int& GetKind() const; - void SetKind(int kind); - virtual ~PythonObjectBase(); + static PyColumnBase* Create(const int& kind, size_t numRows, size_t numCols); + + PyColumnBase(const int& kind); + virtual ~PyColumnBase(); + + const int& GetKind() const { return _kind; } + void SetKind(int kind) { _kind = kind; } + + virtual size_t GetNumRows() = 0; + virtual size_t GetNumCols() = 0; }; -inline const int& PythonObjectBase::GetKind() const -{ - return _kind; -} -inline void PythonObjectBase::SetKind(int kind) +/* + * Template typed abstract base class which provides + * the required interface for all derived classes. + */ +template +class PyColumn : public PyColumnBase { - _kind = kind; -} +public: + PyColumn(const int& kind) : PyColumnBase(kind) {} + virtual ~PyColumn() {} + virtual void SetAt(size_t nRow, size_t nCol, const T& value) = 0; + virtual void AddToDict(bp::dict& dict, + const std::string& name, + const std::vector* keyNames, + const size_t expectedRows) = 0; +}; +/* + * Handles the single value case. + */ template -class PythonObject : public PythonObjectBase +class PyColumnSingle : public PyColumn { protected: std::vector* _pData; - size_t _numRows; - size_t _numCols; - public: - PythonObject(const int& kind, size_t numRows = 1, size_t numCols = 1); - virtual ~PythonObject(); - void SetAt(size_t nRow, size_t nCol, const T& value); - const std::vector* GetData() const; + PyColumnSingle(const int& kind, size_t numRows = 0); + virtual ~PyColumnSingle(); + virtual void SetAt(size_t nRow, size_t nCol, const T& value); + virtual void AddToDict(bp::dict& dict, + const std::string& name, + const std::vector* keyNames, + const size_t expectedRows); + virtual size_t GetNumRows(); + virtual size_t GetNumCols(); + const std::vector* GetData() const { return _pData; } }; template -inline PythonObject::PythonObject(const int& kind, size_t numRows, size_t numCols) - : PythonObjectBase(kind) +inline PyColumnSingle::PyColumnSingle(const int& kind, size_t numRows) + : PyColumn(kind) { - _numRows = numRows; - _numCols = numCols; - _pData = new std::vector(); - if (_numRows > 0) { - _pData->reserve(_numRows*_numCols); + if (numRows > 0) { + _pData->reserve(numRows); } } template -inline PythonObject::~PythonObject() +inline PyColumnSingle::~PyColumnSingle() { delete _pData; } template -inline void PythonObject::SetAt(size_t nRow, size_t nCol, const T& value) +inline void PyColumnSingle::SetAt(size_t nRow, size_t nCol, const T& value) +{ + if (_pData->size() <= nRow) + _pData->resize(nRow + 1); + _pData->at(nRow) = value; +} + +template +inline size_t PyColumnSingle::GetNumRows() { - size_t index = nRow * _numCols + nCol; - if (_pData->size() <= index) - _pData->resize(index + 1); - _pData->at(index) = value; + return _pData->size(); } template -inline const std::vector* PythonObject::GetData() const +inline size_t PyColumnSingle::GetNumCols() +{ + return 1; +} + + +typedef boost::optional NullableString; + +/* + * Handles the variable value case. + */ +template +class PyColumnVariable : public PyColumn { - return _pData; -} \ No newline at end of file +private: + std::vector*> _data; + + size_t _numRows; + size_t _numDeletedColumns; + +public: + PyColumnVariable(const int& kind, size_t numRows = 0); + virtual ~PyColumnVariable(); + virtual void SetAt(size_t nRow, size_t nCol, const T& value); + virtual void AddToDict(bp::dict& dict, + const std::string& name, + const std::vector* keyNames, + const size_t expectedRows); + virtual size_t GetNumRows(); + virtual size_t GetNumCols(); + + T2 GetMissingValue(); + T2 GetConvertedValue(const T& value); + + void AddColumnToDict(bp::dict& dict, const std::string& name, size_t index); + +public: + typedef struct + { + PyColumnVariable* instance; + size_t column; + } DeleteData; + + static void Deleter(PyObject* obj); +}; + +template +inline PyColumnVariable::PyColumnVariable(const int& kind, size_t numRows) + : PyColumn(kind), + _numRows(numRows), + _numDeletedColumns(0) +{ +} + +template +inline PyColumnVariable::~PyColumnVariable() +{ + for (unsigned int i = 0; i < _data.size(); i++) + { + if (_data[i] != nullptr) delete _data[i]; + } +} + +template +inline size_t PyColumnVariable::GetNumRows() +{ + return _numRows; +} + +template +inline size_t PyColumnVariable::GetNumCols() +{ + return _data.size(); +} + +template +inline T2 PyColumnVariable::GetMissingValue() +{ + return NAN; +} + +template +inline T2 PyColumnVariable::GetConvertedValue(const T& value) +{ + return (T2)value; +} + +template <> +inline NullableString PyColumnVariable::GetMissingValue() +{ + return boost::none; +} + +template <> +inline NullableString PyColumnVariable::GetConvertedValue(const std::string& value) +{ + return value; +} diff --git a/src/python/nimbusml.pyproj b/src/python/nimbusml.pyproj index 6655e1db..eeedbdad 100644 --- a/src/python/nimbusml.pyproj +++ b/src/python/nimbusml.pyproj @@ -429,6 +429,7 @@ + @@ -685,6 +686,7 @@ + diff --git a/src/python/nimbusml/internal/entrypoints/transforms_variablecolumn.py b/src/python/nimbusml/internal/entrypoints/transforms_variablecolumn.py new file mode 100644 index 00000000..16fca0ad --- /dev/null +++ b/src/python/nimbusml/internal/entrypoints/transforms_variablecolumn.py @@ -0,0 +1,69 @@ +""" +Transforms.VariableColumnTransform +""" + + +from ..utils.entrypoints import EntryPoint +from ..utils.utils import try_set, unlist + + +def transforms_variablecolumn( + data, + output_data=None, + model=None, + features=None, + length_column_name=None, + **params): + """ + **Description** + Combines the specified input columns in to a + single variable length vectorized column. + + :param data: Input dataset (inputs). + :param output_data: Transformed dataset (outputs). + :param model: Transform model (outputs). + """ + + entrypoint_name = 'Transforms.VariableColumnTransform' + inputs = {} + outputs = {} + + if data is not None: + inputs['Data'] = try_set( + obj=data, + none_acceptable=False, + is_of_type=str) + if features is not None: + inputs['Features'] = try_set( + obj=features, + none_acceptable=True, + is_of_type=list, + is_column=True) + if length_column_name is not None: + inputs['LengthColumnName'] = try_set( + obj=length_column_name, + none_acceptable=True, + is_of_type=str) + if output_data is not None: + outputs['OutputData'] = try_set( + obj=output_data, + none_acceptable=False, + is_of_type=str) + if model is not None: + outputs['Model'] = try_set( + obj=model, + none_acceptable=False, + is_of_type=str) + + input_variables = { + x for x in unlist(inputs.values()) + if isinstance(x, str) and x.startswith("$")} + output_variables = { + x for x in unlist(outputs.values()) + if isinstance(x, str) and x.startswith("$")} + + entrypoint = EntryPoint( + name=entrypoint_name, inputs=inputs, outputs=outputs, + input_variables=input_variables, + output_variables=output_variables) + return entrypoint diff --git a/src/python/nimbusml/tests/test_variable_column.py b/src/python/nimbusml/tests/test_variable_column.py new file mode 100644 index 00000000..6c1fc8bd --- /dev/null +++ b/src/python/nimbusml/tests/test_variable_column.py @@ -0,0 +1,196 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- + +import unittest + +import numpy as np +import pandas as pd +from nimbusml import Pipeline +from nimbusml.internal.entrypoints.transforms_variablecolumn import transforms_variablecolumn +from nimbusml.internal.utils.entrypoints import Graph, DataOutputFormat + + +class TestVariableColumn(unittest.TestCase): + + def to_variable_column(self, input, features=None, length_column_name=None): + node = transforms_variablecolumn(data='$data', + output_data='$output_data', + features=features, + length_column_name=length_column_name) + + graph_nodes = [node] + graph = Graph(dict(data=''), + dict(output_data=''), + DataOutputFormat.DF, + *(graph_nodes)) + + (out_model, out_data, out_metrics, _) = graph.run(verbose=True, X=input) + return out_data + + def test_nonvariable_columns_are_returned_unchanged(self): + train_data = {'c1': [2, 3, 4, 5], + 'c2': [3, 4, 5, 6], + 'c3': [4, 5, 6, 7], + 'c4': [0, 1, 2, 1]} + train_df = pd.DataFrame(train_data).astype({'c1': np.float64, + 'c2': np.float64}) + + result = self.to_variable_column(train_df, ['c1', 'c2']) + + self.assertTrue(result.loc[:, 'c3'].equals(train_df.loc[:, 'c3'])) + self.assertTrue(result.loc[:, 'c4'].equals(train_df.loc[:, 'c4'])) + + def test_variable_columns_of_same_length_do_not_add_nans(self): + train_data = {'c1': [2, 3, 4, 5], + 'c2': [3, 4, 5, 6], + 'c3': [4, 5, 6, 7]} + train_df = pd.DataFrame(train_data).astype({'c1': np.float64, + 'c2': np.float64}) + + result = self.to_variable_column(train_df, ['c1', 'c2']) + + self.assertTrue(result.loc[:, 'c1.0'].equals(train_df.loc[:, 'c1'])) + self.assertTrue(result.loc[:, 'c1.1'].equals(train_df.loc[:, 'c2'])) + + def test_variable_columns_with_different_lengths_return_nans(self): + train_data = {'c1': [2, 3, 4, 5], + 'c2': [3, 4, 5, 6], + 'c3': [4, 5, 6, 7], + 'c4': [0, 1, 2, 1]} + train_df = pd.DataFrame(train_data).astype({'c1': np.float64, + 'c2': np.float64}) + + result = self.to_variable_column(train_df, ['c1', 'c2'], 'c4') + + expectedC1 = pd.Series([np.nan, 3, 4, 5]).astype(np.float64) + expectedC2 = pd.Series([np.nan, np.nan, 5, np.nan]).astype(np.float64) + + self.assertTrue(result.loc[:, 'c1.0'].equals(expectedC1)) + self.assertTrue(result.loc[:, 'c1.1'].equals(expectedC2)) + + def test_variable_columns_with_different_lengths_return_nans_when_no_other_columns_are_present(self): + train_data = {'c1': [2, 3, 4, 5], + 'c2': [3, 4, 5, 6], + 'c3': [0, 1, 2, 1]} + train_df = pd.DataFrame(train_data).astype({'c1': np.float64, + 'c2': np.float64}) + + result = self.to_variable_column(train_df, ['c1', 'c2'], 'c3') + + expectedC1 = pd.Series([np.nan, 3, 4, 5]).astype(np.float64) + expectedC2 = pd.Series([np.nan, np.nan, 5, np.nan]).astype(np.float64) + + self.assertEqual(len(result.columns), 2) + self.assertTrue(result.loc[:, 'c1.0'].equals(expectedC1)) + self.assertTrue(result.loc[:, 'c1.1'].equals(expectedC2)) + + def test_variable_columns_are_converted_to_float32(self): + """ + There are no integer nans so values that can be + converted to float32 are converted to support nans. + There is nullable integer type support in pandas but + it is currently marked as experimental and the docs + state that the api may change in the future. See + https://pandas.pydata.org/pandas-docs/stable/user_guide/integer_na.html + """ + types = [np.int8, np.int16, np.uint8, np.uint16, np.float32] + + for type in types: + train_data = {'c1': [2, 3, 4, 5], 'c2': [3, 4, 5, 6]} + train_df = pd.DataFrame(train_data).astype(type); + + result = self.to_variable_column(train_df, ['c1', 'c2']) + + self.assertEqual(str(result.dtypes[0]), 'float32') + self.assertEqual(str(result.dtypes[1]), 'float32') + + def test_variable_columns_are_converted_to_float64(self): + """ + There are no integer nans so values that can be + converted to float64 are converted to support nans. + There is nullable integer type support in pandas but + it is currently marked as experimental and the docs + state that the api may change in the future. See + https://pandas.pydata.org/pandas-docs/stable/user_guide/integer_na.html + """ + types = [np.int32, np.uint32, np.int64, np.uint64, np.float64] + + for type in types: + train_data = {'c1': [2, 3, 4, 5], 'c2': [3, 4, 5, 6]} + train_df = pd.DataFrame(train_data).astype(type); + + result = self.to_variable_column(train_df, ['c1', 'c2']) + + self.assertEqual(str(result.dtypes[0]), 'float64') + self.assertEqual(str(result.dtypes[1]), 'float64') + + def test_column_with_all_vector_lengths_of_zero_returns_one_column_filled_with_nans(self): + train_data = {'c1': [2, 3, 4, 5], + 'c2': [3, 4, 5, 6], + 'c3': [0, 0, 0, 0]} + train_df = pd.DataFrame(train_data).astype({'c1': np.float64, + 'c2': np.float64}) + + result = self.to_variable_column(train_df, ['c1', 'c2'], 'c3') + + expectedC1 = pd.Series([np.nan, np.nan, np.nan, np.nan]).astype(np.float64) + + self.assertEqual(len(result.columns), 1) + self.assertTrue(result.loc[:, 'c1.0'].equals(expectedC1)) + + def test_variable_column_conversion_leaves_nans_untouched_if_they_already_exist_in_the_input(self): + train_data = {'c1': [2, 3, np.nan, 5], + 'c2': [3, np.nan, 5, 6], + 'c3': [2, 2, 2, 1]} + train_df = pd.DataFrame(train_data).astype({'c1': np.float64, + 'c2': np.float64}) + + result = self.to_variable_column(train_df, ['c1', 'c2'], 'c3') + + expectedC1 = pd.Series([2, 3, np.nan, 5]).astype(np.float64) + expectedC2 = pd.Series([3, np.nan, 5, np.nan]).astype(np.float64) + + self.assertEqual(len(result.columns), 2) + self.assertTrue(result.loc[:, 'c1.0'].equals(expectedC1)) + self.assertTrue(result.loc[:, 'c1.1'].equals(expectedC2)) + + def test_column_names_are_zero_padded(self): + numColsToVerify = [1, 2, 10, 11, 100, 101] + + for numCols in numColsToVerify: + inputColNames = ['c' + str(i) for i in range(numCols)] + train_data = {k: [2,3,4,5] for k in inputColNames} + train_df = pd.DataFrame(train_data).astype(np.float32); + + result = self.to_variable_column(train_df, inputColNames) + + maxDigits = len(inputColNames[-1]) - 1 + expectedColNames = ['c0.' + str(i).zfill(maxDigits) for i in range(numCols)] + + self.assertTrue(all(result.columns == expectedColNames)) + + def test_variable_column_of_type_string(self): + train_data = {'c1': ['a', 'b', '', 'd'], + 'c2': ['e', 'f', 'g', 'h'], + 'c3': [0, 1, 2, 1]} + train_df = pd.DataFrame(train_data) + + result = self.to_variable_column(train_df, ['c1', 'c2'], 'c3') + + self.assertEqual(result.loc[0, 'c1.0'], None) + self.assertEqual(result.loc[1, 'c1.0'], 'b') + self.assertEqual(result.loc[2, 'c1.0'], '') + self.assertEqual(result.loc[3, 'c1.0'], 'd') + + self.assertNotEqual(result.loc[2, 'c1.0'], None) + + self.assertEqual(result.loc[0, 'c1.1'], None) + self.assertEqual(result.loc[1, 'c1.1'], None) + self.assertEqual(result.loc[2, 'c1.1'], 'g') + self.assertEqual(result.loc[3, 'c1.1'], None) + + +if __name__ == '__main__': + unittest.main()