diff --git a/README.md b/README.md index 8551dafc..1ec683ab 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # NimbusML -`nimbusml` is a Python module that provides experimental Python bindings for [ML.NET](https://github.com/dotnet/machinelearning). +`nimbusml` is a Python module that provides Python bindings for [ML.NET](https://github.com/dotnet/machinelearning). ML.NET was originally developed in Microsoft Research and is used across many product groups in Microsoft like Windows, Bing, PowerPoint, Excel and others. `nimbusml` was built to enable data science teams that are more familiar with Python to take advantage of ML.NET's functionality and performance. diff --git a/build.cmd b/build.cmd index 8b6bf29b..8ed5005d 100644 --- a/build.cmd +++ b/build.cmd @@ -311,6 +311,14 @@ copy "%BuildOutputDir%%Configuration%\pybridge.pyd" "%__currentScriptDir%src\py if %PythonVersion% == 2.7 ( copy "%BuildOutputDir%%Configuration%\Platform\win-x64\publish\*.dll" "%__currentScriptDir%src\python\nimbusml\internal\libs\" + :: remove dataprep dlls as its not supported in python 2.7 + del "%__currentScriptDir%src\python\nimbusml\internal\libs\Microsoft.DPrep.*" + del "%__currentScriptDir%src\python\nimbusml\internal\libs\Microsoft.Data.*" + del "%__currentScriptDir%src\python\nimbusml\internal\libs\Microsoft.ProgramSynthesis.*" + del "%__currentScriptDir%src\python\nimbusml\internal\libs\Microsoft.DataPrep.dll" + del "%__currentScriptDir%src\python\nimbusml\internal\libs\ExcelDataReader.dll" + del "%__currentScriptDir%src\python\nimbusml\internal\libs\Microsoft.WindowsAzure.Storage.dll" + del "%__currentScriptDir%src\python\nimbusml\internal\libs\Microsoft.Workbench.Messaging.SDK.dll" ) else ( for /F "tokens=*" %%A in (build/libs_win.txt) do copy "%BuildOutputDir%%Configuration%\Platform\win-x64\publish\%%A" "%__currentScriptDir%src\python\nimbusml\internal\libs\" ) @@ -340,10 +348,15 @@ if "%InstallPythonPackages%" == "True" ( echo "#################################" echo "Installing python packages ... " echo "#################################" + call "%PythonExe%" -m pip install --upgrade pip call "%PythonExe%" -m pip install --upgrade nose pytest graphviz imageio pytest-cov "jupyter_client>=4.4.0" "nbconvert>=4.2.0" - if %PythonVersion% == 2.7 ( call "%PythonExe%" -m pip install --upgrade pyzmq ) - :: Run azureml-dataprep tests only in pyhon 3.7 as its an optional dependency - if %PythonVersion% == 3.7 ( call "%PythonExe%" -m pip install --upgrade azureml-dataprep ) + + if %PythonVersion% == 2.7 ( + call "%PythonExe%" -m pip install --upgrade pyzmq + ) else ( + call "%PythonExe%" -m pip install --upgrade "azureml-dataprep>=1.1.12" + ) + call "%PythonExe%" -m pip install --upgrade "%__currentScriptDir%target\%WheelFile%" call "%PythonExe%" -m pip install "scikit-learn==0.19.2" ) diff --git a/build.sh b/build.sh index b1300c3f..6d5221c9 100755 --- a/build.sh +++ b/build.sh @@ -52,7 +52,7 @@ while [ "$1" != "" ]; do __runTests=true __installPythonPackages=true ;; - --installPythonPackages) + --installpythonpackages) __installPythonPackages=true ;; --includeextendedtests) @@ -219,6 +219,19 @@ then ext=*.dylib fi cp "${BuildOutputDir}/${__configuration}/Platform/${PublishDir}"/publish/${ext} "${__currentScriptDir}/src/python/nimbusml/internal/libs/" + # Obtain "libtensorflow_framework.so.1", which is the upgraded version of "libtensorflow.so". This is required for tests TensorFlowScorer.py to pass in Linux distros with Python 2.7 + if [ ! 
"$(uname -s)" = "Darwin" ] + then + cp "${BuildOutputDir}/${__configuration}/Platform/${PublishDir}"/publish/libtensorflow_framework.so.1 "${__currentScriptDir}/src/python/nimbusml/internal/libs/" + fi + # remove dataprep dlls as its not supported in python 2.7 + rm -f "${__currentScriptDir}/src/python/nimbusml/internal/libs/Microsoft.DPrep.*" + rm -f "${__currentScriptDir}/src/python/nimbusml/internal/libs/Microsoft.Data.*" + rm -f "${__currentScriptDir}/src/python/nimbusml/internal/libs/Microsoft.ProgramSynthesis.*" + rm -f "${__currentScriptDir}/src/python/nimbusml/internal/libs/Microsoft.DataPrep.dll" + rm -f "${__currentScriptDir}/src/python/nimbusml/internal/libs/ExcelDataReader.dll" + rm -f "${__currentScriptDir}/src/python/nimbusml/internal/libs/Microsoft.WindowsAzure.Storage.dll" + rm -f "${__currentScriptDir}/src/python/nimbusml/internal/libs/Microsoft.Workbench.Messaging.SDK.dll" else libs_txt=libs_linux.txt if [ "$(uname -s)" = "Darwin" ] @@ -271,13 +284,14 @@ then if [ ${PythonVersion} = 2.7 ] then "${PythonExe}" -m pip install --upgrade pyzmq - elif [ ${PythonVersion} = 3.6 ] && [ "$(uname -s)" = "Darwin" ] - then - "${PythonExe}" -m pip install --upgrade pytest-remotedata - elif [ ${PythonVersion} = 3.7 ] - then - "${PythonExe}" -m pip install --upgrade azureml-dataprep - fi + else + if [ ${PythonVersion} = 3.6 ] && [ "$(uname -s)" = "Darwin" ] + then + "${PythonExe}" -m pip install --upgrade pytest-remotedata + fi + + "${PythonExe}" -m pip install --upgrade "azureml-dataprep>=1.1.12" + fi "${PythonExe}" -m pip install --upgrade "${Wheel}" "${PythonExe}" -m pip install "scikit-learn==0.19.2" fi diff --git a/build/libs_linux.txt b/build/libs_linux.txt index d53a5a84..6ce4cbed 100644 --- a/build/libs_linux.txt +++ b/build/libs_linux.txt @@ -12,6 +12,4 @@ libonnxruntime.so System.Drawing.Common.dll TensorFlow.NET.dll NumSharp.Core.dll -Microsoft.DataPrep.dll -Microsoft.DPrep.* Microsoft.ML.* diff --git a/build/libs_mac.txt b/build/libs_mac.txt index de7e27b3..85544169 100644 --- a/build/libs_mac.txt +++ b/build/libs_mac.txt @@ -12,6 +12,4 @@ libtensorflow_framework.1.dylib System.Drawing.Common.dll TensorFlow.NET.dll NumSharp.Core.dll -Microsoft.DataPrep.dll -Microsoft.DPrep.* Microsoft.ML.* diff --git a/build/libs_win.txt b/build/libs_win.txt index 62c1bab0..7ef9cca7 100644 --- a/build/libs_win.txt +++ b/build/libs_win.txt @@ -12,6 +12,4 @@ tensorflow.dll TensorFlow.NET.dll NumSharp.Core.dll System.Drawing.Common.dll -Microsoft.DataPrep.dll -Microsoft.DPrep.* Microsoft.ML.* diff --git a/docs/release-notes/release-1.4.0.md b/docs/release-notes/release-1.4.0.md new file mode 100644 index 00000000..1c30e978 --- /dev/null +++ b/docs/release-notes/release-1.4.0.md @@ -0,0 +1,57 @@ +# [NimbusML](https://docs.microsoft.com/en-us/nimbusml/overview) 1.4.0 + +## **New Features** + +- **Add initial implementation of DatasetTransformer.** + + [PR#240](https://github.com/microsoft/NimbusML/pull/240) + This transform allows a fitted transformer based model to be inserted + in to another `Pipeline`. + + ```python + Pipeline([ + DatasetTransformer(transform_model=transform_pipeline.model), + OnlineGradientDescentRegressor(label='c2', feature=['c1']) + ]) + ``` + +## **Bug Fixes** + +- **Fixed `classes_` attribute when no `y` input specified ** + + [PR#218](https://github.com/microsoft/NimbusML/pull/218) + Fix a bug with the classes_ attribute when no y input is specified during fitting. 
+  This addresses [issue 216](https://github.com/microsoft/NimbusML/issues/216).
+
+- **Added missing NumSharp.Core.dll.**
+
+  [PR#220](https://github.com/microsoft/NimbusML/pull/220)
+  Fixed a bug that prevented running TensorFlowScorer.
+  This addresses [issue 219](https://github.com/microsoft/NimbusML/issues/219).
+
+- **Enabled scoring of ML.NET models saved with the new TransformerChain format.**
+
+  [PR#230](https://github.com/microsoft/NimbusML/pull/230)
+  Fixed an error when loading a model that was saved with mlnet auto-train.
+  This addresses [issue 201](https://github.com/microsoft/NimbusML/issues/201).
+
+- **Pass the Python executable path to the DataPrep package.**
+
+  [PR#232](https://github.com/microsoft/NimbusML/pull/232)
+  Enables passing the Python executable to the DataPrep package, so DataPrep can execute Python transformations.
+
+## **Breaking Changes**
+
+None.
+
+## **Enhancements**
+
+None.
+
+## **Documentation and Samples**
+
+None.
+
+## **Remarks**
+
+None.
diff --git a/release-next.md b/release-next.md
index 68bfa7ef..031f060f 100644
--- a/release-next.md
+++ b/release-next.md
@@ -2,15 +2,91 @@
 ## **New Features**
 
-None.
+- **Initial implementation of `csr_matrix` output support.**
+
+  [PR#250](https://github.com/microsoft/NimbusML/pull/250)
+  Add support for data output in `scipy.sparse.csr_matrix` format.
+
+  ```python
+  xf = OneHotVectorizer(columns={'c0':'c0', 'c1':'c1'})
+  xf.fit(train_df)
+  result = xf.transform(train_df, as_csr=True)
+  ```
+
+- **Permutation Feature Importance for model interpretability.**
+
+  [PR#279](https://github.com/microsoft/NimbusML/pull/279)
+  Adds a `permutation_feature_importance()` method to `Pipeline` and
+  predictor estimators, enabling evaluation of model-wide feature
+  importances on any dataset with the same schema as the dataset used
+  to fit the `Pipeline`.
+
+  ```python
+  pipe = Pipeline([
+      LogisticRegressionBinaryClassifier(label='label', feature=['feature'])
+  ])
+  pipe.fit(data)
+  pipe.permutation_feature_importance(data)
+  ```
+
+- **Initial implementation of DateTime input and output column support.**
+
+  [PR#290](https://github.com/microsoft/NimbusML/pull/290)
+  Add initial support for input and output of Pandas DateTime columns.
+
+- **Initial implementation of LpScaler.**
+
+  [PR#253](https://github.com/microsoft/NimbusML/pull/253)
+  Normalize vectors (rows) individually by rescaling them to unit norm (L2, L1 or LInf).
+  Performs the following operation on a vector X: Y = (X - M) / D, where M is the mean and D
+  is either the L2, L1, or LInf norm.
+
+- **Add support for variable length vector output.**
+
+  [PR#267](https://github.com/microsoft/NimbusML/pull/267)
+  Support output of columns returned from ML.NET which contain variable-length vectors.
+
+- **Save `predictor_model` when pickling a `Pipeline`.**
+
+  [PR#295](https://github.com/microsoft/NimbusML/pull/295)
+
+- **Initial implementation of the WordTokenizer transform.**
+
+  [PR#296](https://github.com/microsoft/NimbusML/pull/296)
+
+- **Add support for summary output from tree-based predictors.**
+
+  [PR#298](https://github.com/microsoft/NimbusML/pull/298)
 
 ## **Bug Fixes**
 
-None.
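The `predictor_model` pickling change listed under New Features above (PR#295) has no example in these notes. The sketch below is illustrative only: it assumes the standard `nimbusml` import paths and uses a made-up toy DataFrame; `Pipeline` and `OnlineGradientDescentRegressor` are the same classes used in the DatasetTransformer example earlier in this patch.

```python
import pickle
import pandas as pd
from nimbusml import Pipeline
from nimbusml.linear_model import OnlineGradientDescentRegressor  # import path assumed

# Toy data: 'c1' is the feature column, 'c2' the regression label (illustrative only).
train_df = pd.DataFrame({'c1': [1.0, 2.0, 3.0, 4.0], 'c2': [2.0, 4.0, 6.0, 8.0]})

pipe = Pipeline([OnlineGradientDescentRegressor(label='c2', feature=['c1'])])
pipe.fit(train_df)

# With PR#295 the fitted predictor_model is included in the pickle,
# so the restored pipeline can predict without refitting.
restored = pickle.loads(pickle.dumps(pipe))
print(restored.predict(train_df))
```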
+- **Fixed `Pipeline.transform()` failing in a transform-only `Pipeline` when a y column is provided.**
+
+  [PR#294](https://github.com/microsoft/NimbusML/pull/294)
+  Enables calling `.transform()` on a `Pipeline` containing only transforms when the y column is provided.
+
+- **Fixed an issue when using `predict_proba` or `decision_function` with combined models.**
+
+  [PR#272](https://github.com/microsoft/NimbusML/pull/272)
+
+- **Fixed `Pipeline._extract_classes_from_headers` not checking for valid steps.**
+
+  [PR#292](https://github.com/microsoft/NimbusML/pull/292)
+
+- **Fixed BinaryDataStream not being accepted as input to a transformer.**
+
+  [PR#307](https://github.com/microsoft/NimbusML/pull/307)
+
+- **Fixed the casing of the installPythonPackages build.sh argument.**
+
+  [PR#256](https://github.com/microsoft/NimbusML/pull/256)
 
 ## **Breaking Changes**
 
-None.
+- **Removed the `y` parameter from `Pipeline.transform()`.**
+
+  [PR#294](https://github.com/microsoft/NimbusML/pull/294)
+  Removed the `y` parameter from `Pipeline.transform()`, as it is neither needed nor used when transforming data with a fitted `Pipeline`.
 
 ## **Enhancements**
diff --git a/src/DotNetBridge/Bridge.cs b/src/DotNetBridge/Bridge.cs
index 40220cc8..a7954355 100644
--- a/src/DotNetBridge/Bridge.cs
+++ b/src/DotNetBridge/Bridge.cs
@@ -7,10 +7,8 @@ using System.Runtime.InteropServices;
 using System.Text;
 using System.Threading;
-using Microsoft.ML;
 using Microsoft.ML.Data;
 using Microsoft.ML.EntryPoints;
-using Microsoft.ML.Model.OnnxConverter;
 using Microsoft.ML.Runtime;
 using Microsoft.ML.Trainers;
 using Microsoft.ML.Trainers.Ensemble;
@@ -19,7 +17,7 @@ using Microsoft.ML.Transforms;
 using Microsoft.ML.Transforms.TimeSeries;
 
-namespace Microsoft.MachineLearning.DotNetBridge
+namespace Microsoft.ML.DotNetBridge
 {
     ///
     /// The main entry point from native code. Note that GC / lifetime issues are critical to get correct.
     ///
@@ -130,51 +128,51 @@ public unsafe static partial class Bridge
         // For setting bool values to NativeBridge.
         [UnmanagedFunctionPointer(CallingConvention.StdCall)]
-        private unsafe delegate void BLSetter(EnvironmentBlock* penv, int col, long index, byte value);
+        private unsafe delegate void BLSetter(EnvironmentBlock* penv, int col, long m, long n, byte value);
 
         // For setting float values to NativeBridge.
         [UnmanagedFunctionPointer(CallingConvention.StdCall)]
-        private unsafe delegate void R4Setter(EnvironmentBlock* penv, int col, long index, float value);
+        private unsafe delegate void R4Setter(EnvironmentBlock* penv, int col, long m, long n, float value);
 
         // For setting double values to NativeBridge.
         [UnmanagedFunctionPointer(CallingConvention.StdCall)]
-        private unsafe delegate void R8Setter(EnvironmentBlock* penv, int col, long index, double value);
+        private unsafe delegate void R8Setter(EnvironmentBlock* penv, int col, long m, long n, double value);
 
         // For setting I1 values to NativeBridge.
         [UnmanagedFunctionPointer(CallingConvention.StdCall)]
-        private unsafe delegate void I1Setter(EnvironmentBlock* penv, int col, long index, sbyte value);
+        private unsafe delegate void I1Setter(EnvironmentBlock* penv, int col, long m, long n, sbyte value);
 
         // For setting I2 values to NativeBridge.
         [UnmanagedFunctionPointer(CallingConvention.StdCall)]
-        private unsafe delegate void I2Setter(EnvironmentBlock* penv, int col, long index, short value);
+        private unsafe delegate void I2Setter(EnvironmentBlock* penv, int col, long m, long n, short value);
 
         // For setting I4 values to NativeBridge.
[UnmanagedFunctionPointer(CallingConvention.StdCall)] - private unsafe delegate void I4Setter(EnvironmentBlock* penv, int col, long index, int value); + private unsafe delegate void I4Setter(EnvironmentBlock* penv, int col, long m, long n, int value); // For setting I8 values to NativeBridge. [UnmanagedFunctionPointer(CallingConvention.StdCall)] - private unsafe delegate void I8Setter(EnvironmentBlock* penv, int col, long index, long value); + private unsafe delegate void I8Setter(EnvironmentBlock* penv, int col, long m, long n, long value); // For setting U1 values to NativeBridge. [UnmanagedFunctionPointer(CallingConvention.StdCall)] - private unsafe delegate void U1Setter(EnvironmentBlock* penv, int col, long index, byte value); + private unsafe delegate void U1Setter(EnvironmentBlock* penv, int col, long m, long n, byte value); // For setting U2 values to NativeBridge. [UnmanagedFunctionPointer(CallingConvention.StdCall)] - private unsafe delegate void U2Setter(EnvironmentBlock* penv, int col, long index, ushort value); + private unsafe delegate void U2Setter(EnvironmentBlock* penv, int col, long m, long n, ushort value); // For setting U4 values to NativeBridge. [UnmanagedFunctionPointer(CallingConvention.StdCall)] - private unsafe delegate void U4Setter(EnvironmentBlock* penv, int col, long index, uint value); + private unsafe delegate void U4Setter(EnvironmentBlock* penv, int col, long m, long n, uint value); // For setting U8 values to NativeBridge. [UnmanagedFunctionPointer(CallingConvention.StdCall)] - private unsafe delegate void U8Setter(EnvironmentBlock* penv, int col, long index, ulong value); + private unsafe delegate void U8Setter(EnvironmentBlock* penv, int col, long m, long n, ulong value); // For setting string values, to a generic pointer and index. [UnmanagedFunctionPointer(CallingConvention.StdCall)] - private unsafe delegate void TXSetter(EnvironmentBlock* penv, int col, long index, sbyte* pch, int cch); + private unsafe delegate void TXSetter(EnvironmentBlock* penv, int col, long m, long n, sbyte* pch, int cch); // For setting string key values, to a generic pointer and index. [UnmanagedFunctionPointer(CallingConvention.StdCall)] @@ -186,12 +184,6 @@ private enum FnId Generic = 2, } -#if !CORECLR - // The hosting code invokes this to get a specific entry point. - [UnmanagedFunctionPointer(CallingConvention.StdCall)] - private delegate IntPtr NativeFnGetter(FnId id); -#endif - #region Callbacks to native // Call back to provide messages to native code. @@ -236,8 +228,9 @@ private struct EnvironmentBlock [FieldOffset(0x18)] public readonly void* modelSink; + //Max slots to return for vector valued columns(<=0 to return all). [FieldOffset(0x20)] - public readonly int maxThreadsAllowed; + public readonly int maxSlots; // Call back to provide cancel flag. 
[FieldOffset(0x28)] @@ -252,41 +245,14 @@ private struct EnvironmentBlock [UnmanagedFunctionPointer(CallingConvention.StdCall)] private unsafe delegate int NativeGeneric(EnvironmentBlock* penv, sbyte* psz, int cdata, DataSourceBlock** ppdata); -#if !CORECLR - private static NativeFnGetter FnGetter; -#endif private static NativeGeneric FnGeneric; private static TDel MarshalDelegate(void* pv) { Contracts.Assert(typeof(TDel).IsSubclassOf(typeof(Delegate))); Contracts.Assert(pv != null); -#if CORECLR return Marshal.GetDelegateForFunctionPointer((IntPtr)pv); -#else - return (TDel)(object)Marshal.GetDelegateForFunctionPointer((IntPtr)pv, typeof(TDel)); -#endif - } - -#if !CORECLR - /// - /// This is the bootstrapping entry point. It's labeled private but is actually invoked from the native - /// code to poke the address of the FnGetter callback into the address encoded in the string parameter. - /// This odd way of doing things is because the most convenient way to call an initial managed method - /// imposes the signature of Func{string, int}, which doesn't allow us to return a function adress. - /// - private static unsafe int GetFnGetterCallback(string addr) - { - if (FnGetter == null) - Interlocked.CompareExchange(ref FnGetter, (NativeFnGetter)GetFn, null); - long a = long.Parse(addr); - IntPtr* p = null; - IntPtr** pp = &p; - *(long*)pp = a; - *p = Marshal.GetFunctionPointerForDelegate(FnGetter); - return 1; } -#endif /// /// This is the main FnGetter function. Given an FnId value, it returns a native-callable @@ -334,6 +300,7 @@ private static unsafe int GenericExec(EnvironmentBlock* penv, sbyte* psz, int cd //env.ComponentCatalog.RegisterAssembly(typeof(TimeSeriesProcessingEntryPoints).Assembly); //env.ComponentCatalog.RegisterAssembly(typeof(ParquetLoader).Assembly); env.ComponentCatalog.RegisterAssembly(typeof(SsaChangePointDetector).Assembly); + env.ComponentCatalog.RegisterAssembly(typeof(DotNetBridgeEntrypoints).Assembly); using (var ch = host.Start("Executing")) { @@ -397,7 +364,7 @@ private static unsafe int GenericExec(EnvironmentBlock* penv, sbyte* psz, int cd // Wrap the data sets. ch.Trace("Wrapping native data sources"); ch.Trace("Executing"); - ExecCore(penv, host, ch, graph, cdata, ppdata); + RunGraphCore(penv, host, graph, cdata, ppdata); } catch (Exception e) { @@ -420,24 +387,6 @@ private static unsafe int GenericExec(EnvironmentBlock* penv, sbyte* psz, int cd return 0; } - private static void CheckModel(IHost host, byte** ppModelBin, long* pllModelBinLen, int i) - { - host.CheckParam( - ppModelBin != null && ppModelBin[i] != null - && pllModelBinLen != null && pllModelBinLen[i] > 0, "pModelBin", "Model is missing"); - } - - private static void ExecCore(EnvironmentBlock* penv, IHost host, IChannel ch, string graph, int cdata, DataSourceBlock** ppdata) - { - Contracts.AssertValue(ch); - ch.AssertValue(host); - ch.AssertNonEmpty(graph); - ch.Assert(cdata >= 0); - ch.Assert(ppdata != null || cdata == 0); - - RunGraphCore(penv, host, graph, cdata, ppdata); - } - /// /// Convert UTF8 bytes with known length to ROM. Negative length unsupported. 
/// @@ -483,25 +432,7 @@ internal static string BytesToString(sbyte* psz) if (cch == 0) return null; -#if CORECLR - return Encoding.UTF8.GetString((byte*)psz, cch); -#else - if (cch <= 0) - return ""; - - var decoder = Encoding.UTF8.GetDecoder(); - var chars = new char[decoder.GetCharCount((byte*)psz, cch, true)]; - int bytesUsed; - int charsUsed; - bool complete; - fixed (char* pchars = chars) - decoder.Convert((byte*)psz, cch, pchars, chars.Length, true, out bytesUsed, out charsUsed, out complete); - Contracts.Assert(bytesUsed == cch); - Contracts.Assert(charsUsed == chars.Length); - Contracts.Assert(complete); - return new string(chars); -#endif } /// diff --git a/src/DotNetBridge/DotNetBridge.csproj b/src/DotNetBridge/DotNetBridge.csproj index dd0e2c5d..822db6aa 100644 --- a/src/DotNetBridge/DotNetBridge.csproj +++ b/src/DotNetBridge/DotNetBridge.csproj @@ -16,6 +16,7 @@ https://github.com/Microsoft/NimbusML true DotNetBridge.snk + latest @@ -31,19 +32,19 @@ all runtime; build; native; contentfiles; analyzers - - - - - - - - - - - + + + + + + + + + + + - + diff --git a/src/DotNetBridge/Entrypoints.cs b/src/DotNetBridge/Entrypoints.cs new file mode 100644 index 00000000..9be84e67 --- /dev/null +++ b/src/DotNetBridge/Entrypoints.cs @@ -0,0 +1,182 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Text; +using Microsoft.ML; +using Microsoft.ML.CommandLine; +using Microsoft.ML.DotNetBridge; +using Microsoft.ML.Data; +using Microsoft.ML.EntryPoints; +using Microsoft.ML.Runtime; +using Microsoft.ML.Transforms; + +[assembly: LoadableClass(typeof(void), typeof(DotNetBridgeEntrypoints), null, typeof(SignatureEntryPointModule), "DotNetBridgeEntrypoints")] + +[assembly: LoadableClass(VariableColumnTransform.Summary, typeof(VariableColumnTransform), null, typeof(SignatureLoadDataTransform), + "", VariableColumnTransform.LoaderSignature)] + +namespace Microsoft.ML.DotNetBridge +{ + internal static class DotNetBridgeEntrypoints + { + [TlcModule.EntryPoint(Name = "Transforms.PrefixColumnConcatenator", Desc = ColumnConcatenatingTransformer.Summary, + UserName = ColumnConcatenatingTransformer.UserName, ShortName = ColumnConcatenatingTransformer.LoadName)] + public static CommonOutputs.TransformOutput ConcatColumns(IHostEnvironment env, ColumnCopyingTransformer.Options input) + { + Contracts.CheckValue(env, nameof(env)); + var host = env.Register("PrefixConcatColumns"); + host.CheckValue(input, nameof(input)); + EntryPointUtils.CheckInputArgs(host, input); + + // Get all column names with preserving order. 
+ var colNames = new List(input.Data.Schema.Count); + for (int i = 0; i < input.Data.Schema.Count; i++) + colNames.Add(input.Data.Schema[i].Name); + + // Iterate throuh input options, find matching source columns, create new input options + var inputOptions = new ColumnConcatenatingTransformer.Options() { Data = input.Data }; + var columns = new List(input.Columns.Length); + foreach (var col in input.Columns) + { + var newCol = new ColumnConcatenatingTransformer.Column(); + newCol.Name = col.Name; + var prefix = col.Source; + newCol.Source = colNames.Where(x => x.StartsWith(prefix, StringComparison.InvariantCulture)).ToArray(); + if (newCol.Source.Length == 0) + throw new ArgumentOutOfRangeException("No matching columns found for prefix: " + prefix); + + columns.Add(newCol); + } + inputOptions.Columns = columns.ToArray(); + + var xf = ColumnConcatenatingTransformer.Create(env, inputOptions, inputOptions.Data); + return new CommonOutputs.TransformOutput { Model = new TransformModelImpl(env, xf, inputOptions.Data), OutputData = xf }; + } + + public sealed class TransformModelInput + { + [Argument(ArgumentType.Required, HelpText = "The transform model.", SortOrder = 1)] + public TransformModel Model; + } + + public sealed class ModelSchemaOutput + { + [TlcModule.Output(Desc = "The model schema", SortOrder = 1)] + public IDataView Schema; + } + + [TlcModule.EntryPoint(Name = "Models.Schema", Desc = "Retrieve output model schema")] + public static ModelSchemaOutput GetSchema(IHostEnvironment env, TransformModelInput input) + { + Contracts.CheckValue(env, nameof(env)); + var host = env.Register("GetSchema"); + host.CheckValue(input, nameof(input)); + EntryPointUtils.CheckInputArgs(host, input); + + return new ModelSchemaOutput { Schema = new EmptyDataView(host, input.Model.OutputSchema) }; + } + + [TlcModule.EntryPoint(Name = "Transforms.VariableColumnTransform", Desc = VariableColumnTransform.Summary, + UserName = "Variable Column Creator", ShortName = "Variable Column Creator")] + public static CommonOutputs.TransformOutput CreateVariableColumn(IHostEnvironment env, VariableColumnTransform.Options inputOptions) + { + Contracts.CheckValue(env, nameof(env)); + var host = env.Register("VariableColumnCreator"); + EntryPointUtils.CheckInputArgs(host, inputOptions); + + var xf = VariableColumnTransform.Create(env, inputOptions, inputOptions.Data); + return new CommonOutputs.TransformOutput { Model = new TransformModelImpl(env, xf, inputOptions.Data), OutputData = xf }; + } + + public sealed class ScoringTransformInput + { + [Argument(ArgumentType.Required, HelpText = "The dataset to be scored", SortOrder = 1)] + public IDataView Data; + + [Argument(ArgumentType.Required, HelpText = "The predictor model to apply to data", SortOrder = 2)] + public PredictorModel PredictorModel; + + [Argument(ArgumentType.AtMostOnce, HelpText = "Suffix to append to the score columns", SortOrder = 3)] + public string Suffix; + } + + public sealed class ScoringTransformOutput + { + [TlcModule.Output(Desc = "The scored dataset", SortOrder = 1)] + public IDataView ScoredData; + + [TlcModule.Output(Desc = "The scoring transform", SortOrder = 2)] + public TransformModel ScoringTransform; + } + + private static bool AreSchemasCompatible(DataViewSchema schema1, DataViewSchema schema2) + { + if (schema1 == null) + return schema2 == null; + if (schema2 == null) + return schema1 == null; + if (schema1.Count != schema2.Count) + return false; + + for (int i = 0; i < schema1.Count; i++) + { + if(schema1[i].Type != schema2[i].Type) + 
return false; + } + + return true; + } + + [TlcModule.EntryPoint(Name = "Transforms.DatasetScorerEx", Desc = "Score a dataset with a predictor model")] + public static ScoringTransformOutput Score(IHostEnvironment env, ScoringTransformInput input) + { + Contracts.CheckValue(env, nameof(env)); + var host = env.Register("ScoreModel"); + host.CheckValue(input, nameof(input)); + EntryPointUtils.CheckInputArgs(host, input); + + RoleMappedData data; + IPredictor predictor; + var inputData = input.Data; + try + { + input.PredictorModel.PrepareData(host, inputData, out data, out predictor); + } + catch (Exception) + { + // this can happen in csr_matrix case, try to use only trainer model. + host.Assert(inputData.Schema.Count == 1); + var inputColumnName = inputData.Schema[0].Name; + var trainingSchema = input.PredictorModel.GetTrainingSchema(host); + // get feature vector item type. + var trainingFeatureColumn = (DataViewSchema.Column)trainingSchema.Feature; + var requiredType = trainingFeatureColumn.Type.GetItemType().RawType; + var featuresColumnName = trainingFeatureColumn.Name; + predictor = input.PredictorModel.Predictor; + var xf = new TypeConvertingTransformer(host, + new TypeConvertingEstimator.ColumnOptions(featuresColumnName, requiredType, inputColumnName)).Transform(inputData); + data = new RoleMappedData(xf, null, featuresColumnName); + } + + IDataView scoredPipe; + using (var ch = host.Start("Creating scoring pipeline")) + { + ch.Trace("Creating pipeline"); + var bindable = ScoreUtils.GetSchemaBindableMapper(host, predictor); + ch.AssertValue(bindable); + + var mapper = bindable.Bind(host, data.Schema); + var scorer = ScoreUtils.GetScorerComponent(host, mapper, input.Suffix); + scoredPipe = scorer.CreateComponent(host, data.Data, mapper, input.PredictorModel.GetTrainingSchema(host)); + } + + return + new ScoringTransformOutput + { + ScoredData = scoredPipe, + ScoringTransform = new TransformModelImpl(host, scoredPipe, inputData) + }; + + } + } +} diff --git a/src/DotNetBridge/MessageValidator.cs b/src/DotNetBridge/MessageValidator.cs index 2aa78c27..4243a45d 100644 --- a/src/DotNetBridge/MessageValidator.cs +++ b/src/DotNetBridge/MessageValidator.cs @@ -7,7 +7,7 @@ using System.Globalization; using Microsoft.ML.Runtime; -namespace Microsoft.MachineLearning.DotNetBridge +namespace Microsoft.ML.DotNetBridge { /// /// This is a temporary solution to validate the messages from ML.NET to nimbusml. diff --git a/src/DotNetBridge/NativeDataInterop.cs b/src/DotNetBridge/NativeDataInterop.cs index c9b70526..461beb3c 100644 --- a/src/DotNetBridge/NativeDataInterop.cs +++ b/src/DotNetBridge/NativeDataInterop.cs @@ -9,14 +9,17 @@ using System.Globalization; using System.Runtime.InteropServices; using System.Text; -using Microsoft.ML; using Microsoft.ML.Data; using Microsoft.ML.Runtime; +using System.Buffers; -namespace Microsoft.MachineLearning.DotNetBridge +namespace Microsoft.ML.DotNetBridge { public unsafe static partial class Bridge { + const int UTF8_BUFFER_SIZE = 10 * 1024 * 1024; // 10 MB + const int INDICES_BUFFER_SIZE = 1024 * 1024; // 1 Mln + /// /// This is provided by the native code and represents a native data source. It provides schema /// information and call backs for iteration. 
@@ -30,20 +33,17 @@ private struct DataSourceBlock [FieldOffset(0x08)] public readonly long crow; [FieldOffset(0x10)] - public readonly long* ids; - [FieldOffset(0x18)] public readonly sbyte** names; - [FieldOffset(0x20)] + [FieldOffset(0x18)] public readonly InternalDataKind* kinds; - [FieldOffset(0x28)] + [FieldOffset(0x20)] public readonly long* keyCards; - [FieldOffset(0x30)] + [FieldOffset(0x28)] public readonly long* vecCards; - [FieldOffset(0x38)] - public readonly void** getters; - + [FieldOffset(0x30)] // Call back pointers. - [FieldOffset(0x40)] + public readonly void** getters; + [FieldOffset(0x38)] public readonly void* labelsGetter; #pragma warning restore 649 // never assigned } @@ -77,6 +77,12 @@ private struct DataViewBlock // key types. Zero means unbounded, -1 means not a key type. [FieldOffset(0x20)] public int* keyCards; + + // The number of values in each row of a column. + // A value count of 0 means that each row of the + // column is variable length. + [FieldOffset(0x28)] + public byte* valueCounts; } private struct ColumnMetadataInfo @@ -93,7 +99,7 @@ public ColumnMetadataInfo(bool expand, string[] slotNames, Dictionary infos = null) + private static unsafe void SendViewToNativeAsDataFrame(IChannel ch, EnvironmentBlock* penv, IDataView view, Dictionary infos = null) { Contracts.AssertValue(ch); Contracts.Assert(penv != null); @@ -108,14 +114,13 @@ private static unsafe void SendViewToNative(IChannel ch, EnvironmentBlock* penv, var dataSink = MarshalDelegate(penv->dataSink); var schema = view.Schema; - var colIndices = new List(); - var kindList = new List(); - var keyCardList = new List(); - var nameUtf8Bytes = new List(); - var nameIndices = new List(); - - var expandCols = new HashSet(); - var allNames = new HashSet(); + var colIndices = new List(1000); + var kindList = new ValueListBuilder(INDICES_BUFFER_SIZE); + var keyCardList = new ValueListBuilder(INDICES_BUFFER_SIZE); + var nameUtf8Bytes = new ValueListBuilder(UTF8_BUFFER_SIZE); + var nameIndices = new ValueListBuilder(INDICES_BUFFER_SIZE); + var expandCols = new HashSet(1000); + var valueCounts = new List(1000); for (int col = 0; col < schema.Count; col++) { @@ -129,11 +134,7 @@ private static unsafe void SendViewToNative(IChannel ch, EnvironmentBlock* penv, var kind = itemType.GetRawKind(); int keyCard; - if (fullType.GetValueCount() == 0) - { - throw ch.ExceptNotSupp("Column has variable length vector: " + - name + ". Not supported in python. Drop column before sending to Python"); - } + byte valueCount = (fullType.GetValueCount() == 0) ? (byte)0 : (byte)1; if (itemType is KeyDataViewType) { @@ -181,6 +182,7 @@ private static unsafe void SendViewToNative(IChannel ch, EnvironmentBlock* penv, case InternalDataKind.R8: case InternalDataKind.BL: case InternalDataKind.TX: + case InternalDataKind.DT: break; } keyCard = -1; @@ -201,63 +203,60 @@ private static unsafe void SendViewToNative(IChannel ch, EnvironmentBlock* penv, { Contracts.Assert(info.SlotNames.Length == nSlots); for (int i = 0; i < nSlots; i++) - AddUniqueName(info.SlotNames[i], allNames, nameIndices, nameUtf8Bytes); + AddUniqueName(info.SlotNames[i], ref nameIndices, ref nameUtf8Bytes); } else if (schema[col].HasSlotNames(nSlots)) { var romNames = default(VBuffer>); schema[col].Annotations.GetValue(AnnotationUtils.Kinds.SlotNames, ref romNames); - foreach (var kvp in romNames.Items(true)) - { - // REVIEW: Add the proper number of zeros to the slot index to make them sort in the right order. - var slotName = name + "." + - (!kvp.Value.IsEmpty ? 
kvp.Value.ToString() : kvp.Key.ToString(CultureInfo.InvariantCulture)); - AddUniqueName(slotName, allNames, nameIndices, nameUtf8Bytes); - } + AddUniqueName(name, romNames, ref nameIndices, ref nameUtf8Bytes); } else { for (int i = 0; i < nSlots; i++) - AddUniqueName(name + "." + i, allNames, nameIndices, nameUtf8Bytes); + AddUniqueName(name + "." + i, ref nameIndices, ref nameUtf8Bytes); } } else { nSlots = 1; - AddUniqueName(name, allNames, nameIndices, nameUtf8Bytes); + AddUniqueName(name, ref nameIndices, ref nameUtf8Bytes); } colIndices.Add(col); for (int i = 0; i < nSlots; i++) { - kindList.Add(kind); - keyCardList.Add(keyCard); + kindList.Append(kind); + keyCardList.Append(keyCard); + valueCounts.Add(valueCount); } } - ch.Assert(allNames.Count == kindList.Count); - ch.Assert(allNames.Count == keyCardList.Count); - ch.Assert(allNames.Count == nameIndices.Count); + ch.Assert(kindList.Length == keyCardList.Length); + ch.Assert(kindList.Length == nameIndices.Length); - var kinds = kindList.ToArray(); - var keyCards = keyCardList.ToArray(); - var nameBytes = nameUtf8Bytes.ToArray(); - var names = new byte*[allNames.Count]; + var kinds = kindList.AsSpan(); + var keyCards = keyCardList.AsSpan(); + var nameBytes = nameUtf8Bytes.AsSpan(); + var names = new byte*[nameIndices.Length]; + var valueCountsBytes = valueCounts.ToArray(); fixed (InternalDataKind* prgkind = kinds) fixed (byte* prgbNames = nameBytes) fixed (byte** prgname = names) fixed (int* prgkeyCard = keyCards) + fixed (byte* prgbValueCount = valueCountsBytes) { for (int iid = 0; iid < names.Length; iid++) names[iid] = prgbNames + nameIndices[iid]; DataViewBlock block; - block.ccol = allNames.Count; + block.ccol = nameIndices.Length; block.crow = view.GetRowCount() ?? 0; block.names = (sbyte**)prgname; block.kinds = prgkind; block.keyCards = prgkeyCard; + block.valueCounts = prgbValueCount; dataSink(penv, &block, out var setters, out var keyValueSetter); @@ -298,8 +297,13 @@ private static unsafe void SendViewToNative(IChannel ch, EnvironmentBlock* penv, keyIndex++; } } - fillers[i] = BufferFillerBase.Create(penv, cursor, pyColumn, colIndices[i], kinds[pyColumn], type, setters[pyColumn]); - pyColumn += type is VectorDataViewType ? type.GetVectorSize() : 1; + fillers[i] = BufferFillerBase.Create(penv, cursor, pyColumn, colIndices[i], prgkind[pyColumn], type, setters[pyColumn]); + + if ((type is VectorDataViewType) && (type.GetVectorSize() > 0)) + { + pyColumn += type.GetVectorSize(); + } + else pyColumn++; } for (int crow = 0; ; crow++) { @@ -317,23 +321,180 @@ private static unsafe void SendViewToNative(IChannel ch, EnvironmentBlock* penv, } } - private static string AddUniqueName(string name, HashSet allNames, List nameIndices, List nameUtf8Bytes) + private static unsafe void SendViewToNativeAsCsr(IChannel ch, EnvironmentBlock* penv, IDataView view) + { + Contracts.AssertValue(ch); + Contracts.Assert(penv != null); + Contracts.AssertValue(view); + if (penv->dataSink == null) + { + // Environment doesn't want any data! 
+ return; + } + + var dataSink = MarshalDelegate(penv->dataSink); + + var schema = view.Schema; + var colIndices = new List(); + var outputDataKind = InternalDataKind.R4; + + int numOutputRows = 0; + int numOutputCols = 0; + + for (int col = 0; col < schema.Count; col++) + { + if (schema[col].IsHidden) + continue; + + var fullType = schema[col].Type; + var itemType = fullType.GetItemType(); + int valueCount = fullType.GetValueCount(); + + if (valueCount == 0) + { + throw ch.ExceptNotSupp("Column has variable length vector: " + + schema[col].Name + ". Not supported in python. Drop column before sending to Python"); + } + + if (itemType.IsStandardScalar()) + { + switch (itemType.GetRawKind()) + { + default: + throw Contracts.Except("Data type {0} not supported", itemType.GetRawKind()); + + case InternalDataKind.I1: + case InternalDataKind.I2: + case InternalDataKind.U1: + case InternalDataKind.U2: + case InternalDataKind.R4: + break; + + case InternalDataKind.I4: + case InternalDataKind.U4: + case InternalDataKind.I8: + case InternalDataKind.R8: + outputDataKind = InternalDataKind.R8; + break; + } + } + else + { + throw Contracts.Except("Data type {0} not supported", itemType.GetRawKind()); + } + + colIndices.Add(col); + numOutputCols += valueCount; + } + + var nameIndices = new ValueListBuilder(10); + var nameUtf8Bytes = new ValueListBuilder(100); + + AddUniqueName("data", ref nameIndices, ref nameUtf8Bytes); + AddUniqueName("indices", ref nameIndices, ref nameUtf8Bytes); + AddUniqueName("indptr", ref nameIndices, ref nameUtf8Bytes); + AddUniqueName("shape", ref nameIndices, ref nameUtf8Bytes); + + var kindList = new List {outputDataKind, + InternalDataKind.I4, + InternalDataKind.I4, + InternalDataKind.I4}; + + var valueCounts = new List { 1, 1, 1, 1 }; + + var kinds = kindList.ToArray(); + var nameBytes = nameUtf8Bytes.AsSpan(); + var names = new byte*[nameIndices.Length]; + var valueCountsBytes = valueCounts.ToArray(); + + fixed (InternalDataKind* prgkind = kinds) + fixed (byte* prgbNames = nameBytes) + fixed (byte** prgname = names) + fixed (byte* prgbValueCount = valueCountsBytes) + { + for (int iid = 0; iid < names.Length; iid++) + names[iid] = prgbNames + nameIndices[iid]; + + DataViewBlock block; + block.ccol = nameIndices.Length; + block.crow = view.GetRowCount() ?? 
0; + block.names = (sbyte**)prgname; + block.kinds = prgkind; + block.keyCards = null; + block.valueCounts = prgbValueCount; + + dataSink(penv, &block, out var setters, out var keyValueSetter); + + if (setters == null) return; + + using (var cursor = view.GetRowCursor(view.Schema.Where(col => colIndices.Contains(col.Index)))) + { + CsrData csrData = new CsrData(penv, setters, outputDataKind); + var fillers = new CsrFillerBase[colIndices.Count]; + + for (int i = 0; i < colIndices.Count; i++) + { + var type = schema[colIndices[i]].Type; + fillers[i] = CsrFillerBase.Create(penv, cursor, colIndices[i], type, outputDataKind, csrData); + } + + for (;; numOutputRows++) + { + if (!cursor.MoveNext()) break; + + for (int i = 0; i < fillers.Length; i++) + { + fillers[i].Set(); + } + + csrData.IncrementRow(); + } + + csrData.SetShape(numOutputRows, numOutputCols); + } + } + } + + private static void AddUniqueName(string name, + ref ValueListBuilder nameIndices, + ref ValueListBuilder utf8Names) + { + if (utf8Names.Capacity - utf8Names.Length < name.Length * 2 + 2) + utf8Names.Grow(); + + nameIndices.Append(utf8Names.Length); + var bytesNumber = Encoding.UTF8.GetBytes(name, 0, name.Length, utf8Names.Buffer, utf8Names.Length); + utf8Names.Length += bytesNumber; + utf8Names.Append(0); + } + + private static void AddUniqueName( + string columnName, + VBuffer> slotNames, + ref ValueListBuilder nameIndices, + ref ValueListBuilder utf8Names) { - string newName = name; - int i = 1; - while (!allNames.Add(newName)) - newName = string.Format(CultureInfo.InvariantCulture, "{0}_{1}", name, i++); - // REVIEW: Column names should not be affected by the slot names. They should always win against slot names. - byte[] bNewName = Encoding.UTF8.GetBytes(newName); - nameIndices.Add(nameUtf8Bytes.Count); - nameUtf8Bytes.AddRange(bNewName); - nameUtf8Bytes.Add(0); - return newName; + var columnNameBytes = Encoding.UTF8.GetBytes(columnName); + var dotBytes = Encoding.UTF8.GetBytes("."); + + foreach (var kvp in slotNames.Items(true)) + { + // REVIEW: Add the proper number of zeros to the slot index to make them sort in the right order. + var slotName = (!kvp.Value.IsEmpty ? kvp.Value.ToString() : kvp.Key.ToString(CultureInfo.InvariantCulture)); + if (utf8Names.Capacity - utf8Names.Length < slotName.Length * 2 + columnNameBytes.Length + dotBytes.Length) + utf8Names.Grow(); + nameIndices.Append(utf8Names.Length); + utf8Names.AppendRange(columnNameBytes); + utf8Names.AppendRange(dotBytes); + var bytesNumber = Encoding.UTF8.GetBytes(slotName, 0, slotName.Length, utf8Names.Buffer, utf8Names.Length); + utf8Names.Length += bytesNumber; + utf8Names.Append(0); + } } private abstract unsafe class BufferFillerBase { - public delegate void ValuePoker(T value, int col, long index); + public delegate void ValuePoker(T value, int col, long m, long n); protected readonly int _colIndex; protected readonly DataViewRow _input; @@ -357,23 +518,23 @@ public static BufferFillerBase Create(EnvironmentBlock* penv, DataViewRow input, case InternalDataKind.U1: var fnI1 = MarshalDelegate(setter); ValuePoker pokeU1 = - (byte value, int col, long index) => fnI1(penv, col, index, value > keyMax ? (sbyte)-1 : (sbyte)(value - 1)); + (byte value, int col, long m, long n) => fnI1(penv, col, m, n, value > keyMax ? (sbyte)-1 : (sbyte)(value - 1)); return new Impl(input, pyCol, idvCol, type, pokeU1); case InternalDataKind.U2: var fnI2 = MarshalDelegate(setter); ValuePoker pokeU2 = - (ushort value, int col, long index) => fnI2(penv, col, index, value > keyMax ? 
(short)-1 : (short)(value - 1)); + (ushort value, int col, long m, long n) => fnI2(penv, col, m, n, value > keyMax ? (short)-1 : (short)(value - 1)); return new Impl(input, pyCol, idvCol, type, pokeU2); case InternalDataKind.U4: var fnI4 = MarshalDelegate(setter); ValuePoker pokeU4 = - (uint value, int col, long index) => fnI4(penv, col, index, value > keyMax ? -1 : (int)(value - 1)); + (uint value, int col, long m, long n) => fnI4(penv, col, m, n, value > keyMax ? -1 : (int)(value - 1)); return new Impl(input, pyCol, idvCol, type, pokeU4); case InternalDataKind.U8: // We convert U8 key types with key names to I4. fnI4 = MarshalDelegate(setter); ValuePoker pokeU8 = - (ulong value, int col, long index) => fnI4(penv, col, index, value > keyMax ? -1 : (int)(value - 1)); + (ulong value, int col, long m, long n) => fnI4(penv, col, m, n, value > keyMax ? -1 : (int)(value - 1)); return new Impl(input, pyCol, idvCol, type, pokeU8); } } @@ -385,23 +546,23 @@ public static BufferFillerBase Create(EnvironmentBlock* penv, DataViewRow input, case InternalDataKind.U1: var fnI1 = MarshalDelegate(setter); ValuePoker pokeU1 = - (byte value, int col, long index) => fnI1(penv, col, index, (sbyte)(value - 1)); + (byte value, int col, long m, long n) => fnI1(penv, col, m, n, (sbyte)(value - 1)); return new Impl(input, pyCol, idvCol, type, pokeU1); case InternalDataKind.U2: var fnI2 = MarshalDelegate(setter); ValuePoker pokeU2 = - (ushort value, int col, long index) => fnI2(penv, col, index, (short)(value - 1)); + (ushort value, int col, long m, long n) => fnI2(penv, col, m, n, (short)(value - 1)); return new Impl(input, pyCol, idvCol, type, pokeU2); case InternalDataKind.U4: var fnI4 = MarshalDelegate(setter); ValuePoker pokeU4 = - (uint value, int col, long index) => fnI4(penv, col, index, (int)(value - 1)); + (uint value, int col, long m, long n) => fnI4(penv, col, m, n, (int)(value - 1)); return new Impl(input, pyCol, idvCol, type, pokeU4); case InternalDataKind.U8: // We convert U8 key types with key names to I4. fnI4 = MarshalDelegate(setter); ValuePoker pokeU8 = - (ulong value, int col, long index) => fnI4(penv, col, index, (int)(value - 1)); + (ulong value, int col, long m, long n) => fnI4(penv, col, m, n, (int)(value - 1)); return new Impl(input, pyCol, idvCol, type, pokeU8); } } @@ -412,70 +573,81 @@ public static BufferFillerBase Create(EnvironmentBlock* penv, DataViewRow input, case InternalDataKind.R4: var fnR4 = MarshalDelegate(setter); ValuePoker pokeR4 = - (float value, int col, long index) => fnR4(penv, col, index, value); + (float value, int col, long m, long n) => fnR4(penv, col, m, n, value); return new Impl(input, pyCol, idvCol, type, pokeR4); case InternalDataKind.R8: var fnR8 = MarshalDelegate(setter); ValuePoker pokeR8 = - (double value, int col, long index) => fnR8(penv, col, index, value); + (double value, int col, long m, long n) => fnR8(penv, col, m, n, value); return new Impl(input, pyCol, idvCol, type, pokeR8); case InternalDataKind.BL: var fnBl = MarshalDelegate(setter); ValuePoker pokeBl = - (bool value, int col, long index) => fnBl(penv, col, index, !value ? (byte)0 : value ? (byte)1 : (byte)0xFF); + (bool value, int col, long m, long n) => fnBl(penv, col, m, n, !value ? (byte)0 : value ? 
(byte)1 : (byte)0xFF); return new Impl(input, pyCol, idvCol, type, pokeBl); case InternalDataKind.I1: var fnI1 = MarshalDelegate(setter); ValuePoker pokeI1 = - (sbyte value, int col, long index) => fnI1(penv, col, index, value); + (sbyte value, int col, long m, long n) => fnI1(penv, col, m, n, value); return new Impl(input, pyCol, idvCol, type, pokeI1); case InternalDataKind.I2: var fnI2 = MarshalDelegate(setter); ValuePoker pokeI2 = - (short value, int col, long index) => fnI2(penv, col, index, value); + (short value, int col, long m, long n) => fnI2(penv, col, m, n, value); return new Impl(input, pyCol, idvCol, type, pokeI2); case InternalDataKind.I4: var fnI4 = MarshalDelegate(setter); ValuePoker pokeI4 = - (int value, int col, long index) => fnI4(penv, col, index, value); + (int value, int col, long m, long n) => fnI4(penv, col, m, n, value); return new Impl(input, pyCol, idvCol, type, pokeI4); case InternalDataKind.I8: var fnI8 = MarshalDelegate(setter); ValuePoker pokeI8 = - (long value, int col, long index) => fnI8(penv, col, index, value); + (long value, int col, long m, long n) => fnI8(penv, col, m, n, value); return new Impl(input, pyCol, idvCol, type, pokeI8); case InternalDataKind.U1: var fnU1 = MarshalDelegate(setter); ValuePoker pokeU1 = - (byte value, int col, long index) => fnU1(penv, col, index, value); + (byte value, int col, long m, long n) => fnU1(penv, col, m, n, value); return new Impl(input, pyCol, idvCol, type, pokeU1); case InternalDataKind.U2: var fnU2 = MarshalDelegate(setter); ValuePoker pokeU2 = - (ushort value, int col, long index) => fnU2(penv, col, index, value); + (ushort value, int col, long m, long n) => fnU2(penv, col, m, n, value); return new Impl(input, pyCol, idvCol, type, pokeU2); case InternalDataKind.U4: var fnU4 = MarshalDelegate(setter); ValuePoker pokeU4 = - (uint value, int col, long index) => fnU4(penv, col, index, value); + (uint value, int col, long m, long n) => fnU4(penv, col, m, n, value); return new Impl(input, pyCol, idvCol, type, pokeU4); case InternalDataKind.U8: var fnU8 = MarshalDelegate(setter); ValuePoker pokeU8 = - (ulong value, int col, long index) => fnU8(penv, col, index, value); + (ulong value, int col, long m, long n) => fnU8(penv, col, m, n, value); return new Impl(input, pyCol, idvCol, type, pokeU8); + case InternalDataKind.DT: + var fnDT = MarshalDelegate(setter); + ValuePoker pokeDT = + (DateTime value, int col, long m, long n) => + { + DateTimeOffset dto = (value.Kind == DateTimeKind.Unspecified) ? 
+ new DateTimeOffset(value, TimeSpan.Zero) : + new DateTimeOffset(value); + fnDT(penv, col, m, n, dto.ToUnixTimeMilliseconds()); + }; + return new Impl(input, pyCol, idvCol, type, pokeDT); case InternalDataKind.TX: var fnTX = MarshalDelegate(setter); ValuePoker> pokeTX = - (ReadOnlyMemory value, int col, long index) => + (ReadOnlyMemory value, int col, long m, long n) => { if (value.IsEmpty) - fnTX(penv, col, index, null, 0); + fnTX(penv, col, m, n, null, 0); else { byte[] bt = Encoding.UTF8.GetBytes(value.ToString()); fixed (byte* pt = bt) - fnTX(penv, col, index, (sbyte*)pt, bt.Length); + fnTX(penv, col, m, n, (sbyte*)pt, bt.Length); } }; return new Impl>(input, pyCol, idvCol, type, pokeTX); @@ -496,6 +668,7 @@ private sealed class Impl : BufferFillerBase private VBuffer _buffer; private readonly ValueGetter _get; private readonly ValuePoker _poker; + private readonly bool _isVarLength; public Impl(DataViewRow input, int pyColIndex, int idvColIndex, DataViewType type, ValuePoker poker) : base(input, pyColIndex) @@ -509,6 +682,7 @@ public Impl(DataViewRow input, int pyColIndex, int idvColIndex, DataViewType typ _get = RowCursorUtils.GetGetterAs(type, input, idvColIndex); _poker = poker; + _isVarLength = (type.GetValueCount() == 0); } public override void Set() { @@ -519,7 +693,9 @@ public override void Set() { for (int i = 0; i < _buffer.Length; i++) { - _poker(_buffer.GetValues()[i], _colIndex + i, _input.Position); + if (_isVarLength) + _poker(_buffer.GetValues()[i], _colIndex, _input.Position, i); + else _poker(_buffer.GetValues()[i], _colIndex + i, _input.Position, 0); } } else @@ -534,7 +710,10 @@ public override void Set() TSrc val = default(TSrc); if (ii < values.Length && indices[ii] == i) val = values[ii]; - _poker(val, _colIndex + i, _input.Position); + + if (_isVarLength) + _poker(val, _colIndex, _input.Position, i); + else _poker(val, _colIndex + i, _input.Position, 0); } } } @@ -542,7 +721,239 @@ public override void Set() { TSrc value = default(TSrc); _get(ref value); - _poker(value, _colIndex, _input.Position); + _poker(value, _colIndex, _input.Position, 0); + } + } + } + } + + private unsafe class CsrData + { + private const int DataCol = 0; + private const int IndicesCol = 1; + private const int IndPtrCol = 2; + private const int ShapeCol = 3; + + private readonly R4Setter _r4DataSetter; + private readonly R8Setter _r8DataSetter; + private readonly I4Setter _indicesSetter; + private readonly I4Setter _indptrSetter; + private readonly I4Setter _shapeSetter; + + public int col; + + private int _row; + private int _index; + + private EnvironmentBlock* _penv; + + public CsrData(EnvironmentBlock* penv, void** setters, InternalDataKind outputDataKind) + { + col = 0; + + _row = 0; + _index = 0; + _penv = penv; + + if (outputDataKind == InternalDataKind.R4) + { + _r4DataSetter = MarshalDelegate(setters[DataCol]); + _r8DataSetter = null; + } + else if(outputDataKind == InternalDataKind.R8) + { + _r4DataSetter = null; + _r8DataSetter = MarshalDelegate(setters[DataCol]); + } + + _indicesSetter = MarshalDelegate(setters[IndicesCol]); + _indptrSetter = MarshalDelegate(setters[IndPtrCol]); + _shapeSetter = MarshalDelegate(setters[ShapeCol]); + + _indptrSetter(_penv, IndPtrCol, 0, 0, 0); + } + + public void AppendR4(float value, int col) + { + _r4DataSetter(_penv, DataCol, _index, 0, value); + _indicesSetter(_penv, IndicesCol, _index, 0, col); + _index++; + } + + public void AppendR8(double value, int col) + { + _r8DataSetter(_penv, DataCol, _index, 0, value); + _indicesSetter(_penv, 
IndicesCol, _index, 0, col); + _index++; + } + + public void IncrementRow() + { + col = 0; + _row++; + + _indptrSetter(_penv, IndPtrCol, _row, 0, _index); + } + + public void SetShape(int m, int n) + { + _shapeSetter(_penv, ShapeCol, 0, 0, m); + _shapeSetter(_penv, ShapeCol, 1, 0, n); + } + } + + private abstract unsafe class CsrFillerBase + { + public delegate void DataAppender(T value, int col); + + protected CsrFillerBase() {} + + public static CsrFillerBase Create(EnvironmentBlock* penv, + DataViewRow input, + int idvCol, + DataViewType idvColType, + InternalDataKind outputDataKind, + CsrData csrData) + { + if (outputDataKind == InternalDataKind.R4) + { + switch (idvColType.GetItemType().GetRawKind()) + { + case InternalDataKind.I1: + DataAppender appendI1 = (sbyte val, int i) => csrData.AppendR4((float)val, i); + return new CsrFiller(input, idvCol, idvColType, appendI1, csrData); + case InternalDataKind.I2: + DataAppender appendI2 = (short val, int i) => csrData.AppendR4((float)val, i); + return new CsrFiller(input, idvCol, idvColType, appendI2, csrData); + case InternalDataKind.U1: + DataAppender appendU1 = (byte val, int i) => csrData.AppendR4((float)val, i); + return new CsrFiller(input, idvCol, idvColType, appendU1, csrData); + case InternalDataKind.U2: + DataAppender appendU2 = (ushort val, int i) => csrData.AppendR4((float)val, i); + return new CsrFiller(input, idvCol, idvColType, appendU2, csrData); + case InternalDataKind.R4: + DataAppender appendR4 = (float val, int i) => csrData.AppendR4((float)val, i); + return new CsrFiller(input, idvCol, idvColType, appendR4, csrData); + default: + throw Contracts.Except("Source data type not supported"); + } + } + else if (outputDataKind == InternalDataKind.R8) + { + switch (idvColType.GetItemType().GetRawKind()) + { + case InternalDataKind.I1: + DataAppender appendI1 = (sbyte val, int i) => csrData.AppendR8((double)val, i); + return new CsrFiller(input, idvCol, idvColType, appendI1, csrData); + case InternalDataKind.I2: + DataAppender appendI2 = (short val, int i) => csrData.AppendR8((double)val, i); + return new CsrFiller(input, idvCol, idvColType, appendI2, csrData); + case InternalDataKind.I4: + DataAppender appendI4 = (int val, int i) => csrData.AppendR8((double)val, i); + return new CsrFiller(input, idvCol, idvColType, appendI4, csrData); + case InternalDataKind.U1: + DataAppender appendU1 = (byte val, int i) => csrData.AppendR8((double)val, i); + return new CsrFiller(input, idvCol, idvColType, appendU1, csrData); + case InternalDataKind.U2: + DataAppender appendU2 = (ushort val, int i) => csrData.AppendR8((double)val, i); + return new CsrFiller(input, idvCol, idvColType, appendU2, csrData); + case InternalDataKind.U4: + DataAppender appendU4 = (uint val, int i) => csrData.AppendR8((double)val, i); + return new CsrFiller(input, idvCol, idvColType, appendU4, csrData); + case InternalDataKind.R4: + DataAppender appendR4 = (float val, int i) => csrData.AppendR8((double)val, i); + return new CsrFiller(input, idvCol, idvColType, appendR4, csrData); + case InternalDataKind.I8: + DataAppender appendI8 = (long val, int i) => csrData.AppendR8((double)val, i); + return new CsrFiller(input, idvCol, idvColType, appendI8, csrData); + case InternalDataKind.R8: + DataAppender appendR8 = (double val, int i) => csrData.AppendR8((double)val, i); + return new CsrFiller(input, idvCol, idvColType, appendR8, csrData); + default: + throw Contracts.Except("Source data type not supported"); + } + } + + throw Contracts.Except("Target data type not 
supported."); + } + + public abstract void Set(); + + private sealed class CsrFiller : CsrFillerBase + { + private readonly ValueGetter> _getVec; + private readonly ValueGetter _get; + private VBuffer _buffer; + + private CsrData _csrData; + private readonly DataAppender _dataAppender; + + private readonly IEqualityComparer comparer = EqualityComparer.Default; + + public CsrFiller(DataViewRow input, + int idvColIndex, + DataViewType type, + DataAppender dataAppender, + CsrData csrData) + : base() + { + Contracts.AssertValue(input); + Contracts.Assert(0 <= idvColIndex && idvColIndex < input.Schema.Count); + + if (type is VectorDataViewType) + _getVec = RowCursorUtils.GetVecGetterAs((PrimitiveDataViewType)type.GetItemType(), input, idvColIndex); + else + _get = RowCursorUtils.GetGetterAs(type, input, idvColIndex); + + _csrData = csrData; + _dataAppender = dataAppender; + } + + public bool IsDefault(TSrc t) + { + return comparer.Equals(t, default(TSrc)); + } + + public override void Set() + { + if (_getVec != null) + { + _getVec(ref _buffer); + if (_buffer.IsDense) + { + var values = _buffer.GetValues(); + + for (int i = 0; i < values.Length; i++) + { + if (!IsDefault(values[i])) + _dataAppender(values[i], _csrData.col); + + _csrData.col++; + } + } + else + { + var values = _buffer.GetValues(); + var indices = _buffer.GetIndices(); + + for (int i = 0; i < values.Length; i++) + { + if (!IsDefault(values[i])) + _dataAppender(values[i], _csrData.col + indices[i]); + } + + _csrData.col += _buffer.Length; + } + } + else + { + TSrc value = default(TSrc); + _get(ref value); + + if (!IsDefault(value)) + _dataAppender(value, _csrData.col); + + _csrData.col++; } } } diff --git a/src/DotNetBridge/NativeDataView.cs b/src/DotNetBridge/NativeDataView.cs index 09796203..1a829889 100644 --- a/src/DotNetBridge/NativeDataView.cs +++ b/src/DotNetBridge/NativeDataView.cs @@ -8,13 +8,12 @@ using System.Collections.Concurrent; using System.Linq; using System.Threading; -using Microsoft.ML; using Microsoft.ML.Data; using Microsoft.ML.Internal.Utilities; using System.Threading.Tasks; using Microsoft.ML.Runtime; -namespace Microsoft.MachineLearning.DotNetBridge +namespace Microsoft.ML.DotNetBridge { public unsafe static partial class Bridge { @@ -143,6 +142,10 @@ public NativeDataView(IHostEnvironment env, DataSourceBlock* pdata) case InternalDataKind.Text: columns.Add(new TextColumn(pdata, pdata->getters[c], c, name)); break; + case InternalDataKind.DT: + if (pdata->vecCards[c] == -1) + columns.Add(new DateTimeColumn(pdata, pdata->getters[c], c, name)); + break; } } @@ -867,6 +870,31 @@ public override void Dispose() } } + private sealed class DateTimeColumn : Column + { + private I8Getter _getter; + + public DateTimeColumn(DataSourceBlock* data, void* getter, int colIndex, string name) + : base(data, colIndex, name, DateTimeDataViewType.Instance) + { + _getter = MarshalDelegate(getter); + } + + public override void CopyOut(long index, Batch batch, ref DateTime value) + { + Contracts.Check(Data != null, AlreadyDisposed); + Contracts.Assert(0 <= index); + _getter(Data, ColIndex, index, out var val); + value = DateTimeOffset.FromUnixTimeMilliseconds(val).UtcDateTime; + } + + public override void Dispose() + { + _getter = null; + base.Dispose(); + } + } + private sealed class TextColumn : Column> { private TXGetter _getter; diff --git a/src/DotNetBridge/RmlEnvironment.cs b/src/DotNetBridge/RmlEnvironment.cs index d2e861fe..1bcc0f50 100644 --- a/src/DotNetBridge/RmlEnvironment.cs +++ b/src/DotNetBridge/RmlEnvironment.cs 
@@ -8,7 +8,7 @@ using Microsoft.ML; using Microsoft.ML.Runtime; -namespace Microsoft.MachineLearning.DotNetBridge +namespace Microsoft.ML.DotNetBridge { internal class RmlEnvironment : HostEnvironmentBase { @@ -55,7 +55,6 @@ protected override IHost RegisterCore(HostEnvironmentBase source public RmlEnvironment(Bridge.CheckCancelled checkDelegate, int? seed = null, bool verbose = false) : this(RandomUtils.Create(seed), verbose) { - CheckCancelled = checkDelegate; } diff --git a/src/DotNetBridge/RunGraph.cs b/src/DotNetBridge/RunGraph.cs index c9d668fa..55f02795 100644 --- a/src/DotNetBridge/RunGraph.cs +++ b/src/DotNetBridge/RunGraph.cs @@ -9,8 +9,6 @@ using System.IO; using System.Linq; using Microsoft.DataPrep.Common; -using Microsoft.ML; -using Microsoft.ML.CommandLine; using Microsoft.ML.Data; using Microsoft.ML.Data.IO; using Microsoft.ML.EntryPoints; @@ -20,36 +18,15 @@ using Newtonsoft.Json; using Newtonsoft.Json.Linq; -namespace Microsoft.MachineLearning.DotNetBridge +namespace Microsoft.ML.DotNetBridge { public unsafe static partial class Bridge { // std:null specifier in a graph, used to redirect output to std::null const string STDNULL = ""; - private sealed class RunGraphArgs - { -#pragma warning disable 649 // never assigned - [Argument(ArgumentType.AtMostOnce)] - public string graph; - - [Argument(ArgumentType.LastOccurenceWins, HelpText = "Desired degree of parallelism in the data pipeline", ShortName = "conc")] - public int? parallel; - - [Argument(ArgumentType.AtMostOnce, HelpText = "Random seed", ShortName = "seed")] - public int? randomSeed; - - [Argument(ArgumentType.AtMostOnce, ShortName = "lab")] - public string labelColumn; //not used - - [Argument(ArgumentType.Multiple, ShortName = "feat")] - public string[] featureColumn; //not used - - [Argument(ArgumentType.AtMostOnce, HelpText = "Max slots to return for vector valued columns (<=0 to return all)")] - public int maxSlots = -1; - -#pragma warning restore 649 // never assigned - } + // graph output format specifier, used to output to a sparse csr matrix + const string CSR_MATRIX = ""; private static void SaveIdvToFile(IDataView idv, string path, IHost host) { @@ -58,7 +35,15 @@ private static void SaveIdvToFile(IDataView idv, string path, IHost host) var extension = Path.GetExtension(path); IDataSaver saver; if (extension != ".csv" && extension != ".tsv" && extension != ".txt") + { saver = new BinarySaver(host, new BinarySaver.Arguments()); + + var schemaFilePath = Path.GetDirectoryName(path) + + Path.DirectorySeparatorChar + + Path.GetFileNameWithoutExtension(path) + + ".schema"; + SaveIdvSchemaToFile(idv, schemaFilePath, host); + } else { var saverArgs = new TextSaver.Arguments @@ -80,6 +65,25 @@ private static void SaveIdvToFile(IDataView idv, string path, IHost host) } } + private static void SaveIdvSchemaToFile(IDataView idv, string path, IHost host) + { + var emptyDataView = new EmptyDataView(host, idv.Schema); + var saverArgs = new TextSaver.Arguments + { + OutputHeader = false, + OutputSchema = true, + Dense = true + }; + IDataSaver saver = new TextSaver(host, saverArgs); + + using (var fs = File.OpenWrite(path)) + { + saver.SaveData(fs, emptyDataView, Utils.GetIdentityPermutation(emptyDataView.Schema.Count) + .Where(x => !emptyDataView.Schema[x].IsHidden && saver.IsColumnSavable(emptyDataView.Schema[x].Type)) + .ToArray()); + } + } + private static void SavePredictorModelToFile(PredictorModel model, string path, IHost host) { using (var fs = File.OpenWrite(path)) @@ -90,19 +94,11 @@ private static void 
RunGraphCore(EnvironmentBlock* penv, IHostEnvironment env, s { Contracts.AssertValue(env); - var args = new RunGraphArgs(); - string err = null; - if (!CmdParser.ParseArguments(env, graphStr, args, e => err = err ?? e)) - throw env.Except(err); - - int? maxThreadsAllowed = Math.Min(args.parallel > 0 ? args.parallel.Value : penv->maxThreadsAllowed, penv->maxThreadsAllowed); - maxThreadsAllowed = penv->maxThreadsAllowed > 0 ? maxThreadsAllowed : args.parallel; - var host = env.Register("RunGraph", args.randomSeed, null); - + var host = env.Register("RunGraph", penv->seed, null); JObject graph; try { - graph = JObject.Parse(args.graph); + graph = JObject.Parse(graphStr); } catch (JsonReaderException ex) { @@ -148,10 +144,7 @@ private static void RunGraphCore(EnvironmentBlock* penv, IHostEnvironment env, s if (extension == ".txt") dv = TextLoader.LoadFile(host, new TextLoader.Options(), new MultiFileSource(path)); else if (extension == ".dprep") - { - DPrepSettings.Instance.PythonPath = BytesToString(penv->pythonPath); - dv = DataFlow.FromDPrepFile(path).ToDataView(); - } + dv = LoadDprepFile(BytesToString(penv->pythonPath), path); else dv = new BinaryLoader(host, new BinaryLoader.Arguments(), path); } @@ -218,14 +211,18 @@ private static void RunGraphCore(EnvironmentBlock* penv, IHostEnvironment env, s throw host.ExceptNotSupp("File handle outputs not yet supported."); case TlcModule.DataKind.DataView: var idv = runner.GetOutput(varName); - if (!string.IsNullOrWhiteSpace(path)) + if (path == CSR_MATRIX) + { + SendViewToNativeAsCsr(ch, penv, idv); + } + else if (!string.IsNullOrWhiteSpace(path)) { SaveIdvToFile(idv, path, host); } else { - var infos = ProcessColumns(ref idv, args.maxSlots, host); - SendViewToNative(ch, penv, idv, infos); + var infos = ProcessColumns(ref idv, penv->maxSlots, host); + SendViewToNativeAsDataFrame(ch, penv, idv, infos); } break; case TlcModule.DataKind.PredictorModel: @@ -286,6 +283,12 @@ private static void RunGraphCore(EnvironmentBlock* penv, IHostEnvironment env, s } } + private static IDataView LoadDprepFile(string pythonPath, string path) + { + DPrepSettings.Instance.PythonPath = pythonPath; + return DataFlow.FromDPrepFile(path).ToDataView(); + } + private static Dictionary ProcessColumns(ref IDataView view, int maxSlots, IHostEnvironment env) { Dictionary result = null; diff --git a/src/DotNetBridge/ValueListBuilder.cs b/src/DotNetBridge/ValueListBuilder.cs new file mode 100644 index 00000000..418cd673 --- /dev/null +++ b/src/DotNetBridge/ValueListBuilder.cs @@ -0,0 +1,118 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. 
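A usage sketch for the pooled `ValueListBuilder<T>` defined below (illustrative only and not part of this patch; the type is an internal ref struct, so calling it like this from outside the assembly is hypothetical):

```csharp
using System;

static class ValueListBuilderSketch
{
    static void Main()
    {
        // Start on a small stack buffer; the builder switches to a rented
        // ArrayPool<int> array once more than 16 items are appended.
        Span<int> scratch = stackalloc int[16];
        var builder = new ValueListBuilder<int>(scratch);

        for (int i = 0; i < 100; i++)
            builder.Append(i);

        ReadOnlySpan<int> view = builder.AsSpan();
        Console.WriteLine(view.Length);   // 100

        builder.Dispose();                // returns the rented array to ArrayPool<int>.Shared
    }
}
```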
+ +using System; +using System.Buffers; +using System.Diagnostics; +using System.Runtime.CompilerServices; + +namespace Microsoft.ML.DotNetBridge +{ + internal ref struct ValueListBuilder + { + private Span _span; + private T[] _arrayFromPool; + private int _pos; + + public ValueListBuilder(Span initialSpan) + { + _span = initialSpan; + _arrayFromPool = null; + _pos = 0; + } + + public ValueListBuilder(int initialSize = 1024) + { + _arrayFromPool = ArrayPool.Shared.Rent(initialSize); + _span = _arrayFromPool; + _pos = 0; + } + + public int Length + { + get => _pos; + set + { + Debug.Assert(value >= 0); + Debug.Assert(value <= _span.Length); + _pos = value; + } + } + + public int Capacity + { + get => _span.Length; + } + + public ref T this[int index] + { + get + { + Debug.Assert(index < _pos); + return ref _span[index]; + } + } + + public T[] Buffer + { + get => _arrayFromPool; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void Append(T item) + { + int pos = _pos; + if (pos >= _span.Length) + Grow(); + + _span[pos] = item; + _pos = pos + 1; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void AppendRange(T[] items) + { + int pos = _pos; + while(pos + items.Length >= _span.Length) + Grow(); + + foreach (T item in items) + { + _span[pos] = item; + _pos = pos + 1; + pos++; + } + } + + public ReadOnlySpan AsSpan() + { + return _span.Slice(0, _pos); + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void Dispose() + { + if (_arrayFromPool != null) + { + ArrayPool.Shared.Return(_arrayFromPool); + _arrayFromPool = null; + } + } + + public void Grow() + { + T[] array = ArrayPool.Shared.Rent(_span.Length * 2); + + bool success = _span.TryCopyTo(array); + Debug.Assert(success); + + T[] toReturn = _arrayFromPool; + _span = _arrayFromPool = array; + if (toReturn != null) + { + ArrayPool.Shared.Return(toReturn); + } + } + } +} \ No newline at end of file diff --git a/src/DotNetBridge/transforms/VariableColumnTransform.cs b/src/DotNetBridge/transforms/VariableColumnTransform.cs new file mode 100644 index 00000000..ea9ecafb --- /dev/null +++ b/src/DotNetBridge/transforms/VariableColumnTransform.cs @@ -0,0 +1,337 @@ +//------------------------------------------------------------------------------ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. +//------------------------------------------------------------------------------ + +using System; +using System.Collections.Generic; +using Microsoft.ML; +using Microsoft.ML.CommandLine; +using Microsoft.ML.Data; +using Microsoft.ML.Runtime; +using Microsoft.ML.Transforms; +using Microsoft.ML.Internal.Utilities; + + +namespace Microsoft.ML.DotNetBridge +{ + using BitArray = System.Collections.BitArray; + + /// + /// A transform that combines the specified input columns + /// in to a single variable length vectorized column and + /// passes the rest of the columns through unchanged. 
+ /// + [BestFriend] + internal sealed class VariableColumnTransform : IDataTransform, IRowToRowMapper + { + public class Options : TransformInputBase + { + [Argument(ArgumentType.Multiple, HelpText = "Features", SortOrder = 2)] + public string[] Features; + + [Argument(ArgumentType.Multiple, HelpText = "Length Column Name", SortOrder = 2)] + public string LengthColumnName; + } + + private sealed class Bindings + { + public readonly List outputToInputMap; + public readonly List vectorToInputMap; + public int outputColumn; + public int lengthColumn; + + public Bindings() + { + outputToInputMap = new List(); + vectorToInputMap = new List(); + outputColumn = -1; + lengthColumn = -1; + } + } + + private readonly IHost _host; + private readonly Bindings _bindings; + private readonly HashSet _columnNames; + + public IDataView Source { get; } + + DataViewSchema IRowToRowMapper.InputSchema => Source.Schema; + + private VariableColumnTransform(IHostEnvironment env, IDataView input, string[] features, string lengthColumnName) + { + Contracts.CheckValue(env, nameof(env)); + + Source = input; + _host = env.Register(RegistrationName); + _bindings = new Bindings(); + + _columnNames = (features == null) ? new HashSet() : + new HashSet(features); + + OutputSchema = ProcessInputSchema(input.Schema, lengthColumnName); + } + + internal const string Summary = "Combines the specified input columns in to a single variable length vectorized column."; + + public const string LoaderSignature = "VariableColumnTransform"; + + private static VersionInfo GetVersionInfo() + { + return new VersionInfo( + modelSignature: "VARLENCL", + verWrittenCur: 0x00010001, // Initial + verReadableCur: 0x00010001, + verWeCanReadBack: 0x00010001, + loaderSignature: LoaderSignature, + loaderAssemblyName: typeof(VariableColumnTransform).Assembly.FullName); + } + + internal static string RegistrationName = "VariableColumnTransform"; + + public static VariableColumnTransform Create(IHostEnvironment env, Options options, IDataView input) + { + return new VariableColumnTransform(env, input, options.Features, options.LengthColumnName); + } + + public static VariableColumnTransform Create(IHostEnvironment env, ModelLoadContext ctx, IDataView input) + { + Contracts.CheckValue(env, nameof(env)); + var h = env.Register(RegistrationName); + h.CheckValue(ctx, nameof(ctx)); + h.CheckValue(input, nameof(input)); + ctx.CheckAtModel(GetVersionInfo()); + return h.Apply("Loading Model", ch => new VariableColumnTransform(h, ctx, input)); + } + + private VariableColumnTransform(IHost host, ModelLoadContext ctx, IDataView input) + { + Contracts.AssertValue(host, nameof(host)); + host.CheckValue(input, nameof(input)); + + Source = input; + _host = host; + + // TODO: fill this in + } + + void ICanSaveModel.Save(ModelSaveContext ctx) + { + _host.CheckValue(ctx, nameof(ctx)); + ctx.CheckAtModel(); + ctx.SetVersionInfo(GetVersionInfo()); + + // TODO: fill this in + } + + public bool CanShuffle => Source.CanShuffle; + + DataViewSchema IDataView.Schema => OutputSchema; + public DataViewSchema OutputSchema { get; } + + private DataViewSchema ProcessInputSchema(DataViewSchema inputSchema, string lengthColumnName) + { + var builder = new DataViewSchema.Builder(); + for (int i = 0; i < inputSchema.Count; i++) + { + var name = inputSchema[i].Name; + + if (_columnNames.Contains(name)) + { + _bindings.vectorToInputMap.Add(i); + } + else if (name == lengthColumnName) + { + _bindings.lengthColumn = i; + } + else + { + builder.AddColumn(name, inputSchema[i].Type); + 
_bindings.outputToInputMap.Add(i); + } + } + + if (_bindings.vectorToInputMap.Count > 0) + { + var type = inputSchema[_bindings.vectorToInputMap[0]].Type as PrimitiveDataViewType; + + for (int i = 1; i < _bindings.vectorToInputMap.Count; i++) + { + var nextType = inputSchema[_bindings.vectorToInputMap[i]].Type as PrimitiveDataViewType; + if (!nextType.Equals(type)) + { + throw Contracts.Except("Input data types of the columns to vectorize must " + + "all be of the same type. Found {0} and {1}.", + type.ToString(), + nextType.ToString()); + } + } + + var outputColumnType = new VectorDataViewType(type, 0); + var outputColumnName = inputSchema[_bindings.vectorToInputMap[0]].Name; + builder.AddColumn(outputColumnName, outputColumnType); + + _bindings.outputColumn = _bindings.outputToInputMap.Count; + } + + return builder.ToSchema(); + } + + public long? GetRowCount() + { + return Source.GetRowCount(); + } + + public DataViewRowCursor GetRowCursor(IEnumerable columnsNeeded, Random rand = null) + { + var predicate = RowCursorUtils.FromColumnsToPredicate(columnsNeeded, OutputSchema); + + _host.CheckValueOrNull(rand); + return new Cursor(_host, this, _bindings, predicate, rand); + } + + public DataViewRowCursor[] GetRowCursorSet(IEnumerable columnsNeeded, int n, Random rand = null) + { + var predicate = RowCursorUtils.FromColumnsToPredicate(columnsNeeded, OutputSchema); + + _host.CheckValueOrNull(rand); + return new DataViewRowCursor[] { new Cursor(_host, this, _bindings, predicate, rand) }; + } + + private sealed class Cursor : RootCursorBase + { + private readonly IDataTransform _view; + private readonly BitArray _active; + private readonly Bindings _bindings; + private readonly DataViewRowCursor _cursor; + + public override DataViewSchema Schema => _view.Schema; + + public override long Batch + { + get { return 0; } + } + + public Cursor(IChannelProvider provider, IDataTransform view, Bindings bindings, Func predicate, Random rand) + : base(provider) + { + Ch.AssertValue(view); + Ch.AssertValueOrNull(rand); + Ch.Assert(view.Schema.Count >= 0); + + _view = view; + _bindings = bindings; + _cursor = view.Source.GetRowCursorForAllColumns(); + _active = new BitArray(view.Schema.Count); + + if (predicate == null) _active.SetAll(true); + else + { + for (int i = 0; i < view.Schema.Count; ++i) + _active[i] = predicate(i); + } + } + + public override ValueGetter GetIdGetter() + { + return (ref DataViewRowId val) => + { + Ch.Check(IsGood, RowCursorUtils.FetchValueStateError); + val = new DataViewRowId((ulong)Position, 0); + }; + } + + public override bool IsColumnActive(DataViewSchema.Column column) + { + Ch.Check(column.Index < Schema.Count); + return _active[column.Index]; + } + + private Delegate MakeVarLengthVectorGetter(DataViewRow input) + { + var srcGetters = new ValueGetter[_bindings.vectorToInputMap.Count]; + ValueGetter lengthGetter = null; + + for (int i = 0; i < _bindings.vectorToInputMap.Count; i++) + { + var column = input.Schema[_bindings.vectorToInputMap[i]]; + srcGetters[i] = input.GetGetter(column); + } + + if (_bindings.lengthColumn >= 0) + { + var column = input.Schema[_bindings.lengthColumn]; + lengthGetter = input.GetGetter(column); + } + + T tmp = default(T); + ValueGetter> result = (ref VBuffer dst) => + { + int length = _bindings.vectorToInputMap.Count; + if (lengthGetter != null) + { + long expectedLength = length; + lengthGetter(ref expectedLength); + + if ((expectedLength >= 0) && (expectedLength < length)) + { + length = (int)expectedLength; + } + } + + var editor = 
VBufferEditor.Create(ref dst, length); + + for (int i = 0; i < length; i++) + { + srcGetters[i](ref tmp); + editor.Values[i] = tmp; + } + + dst = editor.Commit(); + }; + return result; + } + + /// + /// Returns a value getter delegate to fetch the value of column with the given columnIndex, from the row. + /// This throws if the column is not active in this row, or if the type + /// differs from this column's type. + /// + /// is the column's content type. + /// is the output column whose getter should be returned. + public override ValueGetter GetGetter(DataViewSchema.Column column) + { + if (column.Index == _bindings.outputColumn) + { + VectorDataViewType columnType = column.Type as VectorDataViewType; + Delegate getter = Utils.MarshalInvoke(MakeVarLengthVectorGetter, columnType.ItemType.RawType, _cursor); + return getter as ValueGetter; + } + else + { + int inputIndex = _bindings.outputToInputMap[column.Index]; + return _cursor.GetGetter(_cursor.Schema[inputIndex]); + } + } + + protected override bool MoveNextCore() + { + return _cursor.MoveNext(); + } + } + + /// + /// Given a set of columns, return the input columns that are needed to generate those output columns. + /// + IEnumerable IRowToRowMapper.GetDependencies(IEnumerable dependingColumns) + => dependingColumns; + + DataViewRow IRowToRowMapper.GetRow(DataViewRow input, IEnumerable activeColumns) + { + Contracts.CheckValue(input, nameof(input)); + Contracts.CheckValue(activeColumns, nameof(activeColumns)); + Contracts.CheckParam(input.Schema == Source.Schema, nameof(input), "Schema of input row must be the same as the schema the mapper is bound to"); + return input; + } + } +} \ No newline at end of file diff --git a/src/NativeBridge/DataViewInterop.cpp b/src/NativeBridge/DataViewInterop.cpp index dd349012..d681df1f 100644 --- a/src/NativeBridge/DataViewInterop.cpp +++ b/src/NativeBridge/DataViewInterop.cpp @@ -7,317 +7,318 @@ DataSourceBlock::DataSourceBlock(bp::dict& data) { - // Assert that this class doesn't have a vtable. - assert(offsetof(DataSourceBlock, ccol) == 0); + // Assert that this class doesn't have a vtable. 
+ assert(offsetof(DataSourceBlock, ccol) == 0); - CxInt64 llTotalNumRows = -1; - assert(data.contains(PYTHON_DATA_KEY_INFO)); - bp::dict varInfo = bp::extract(data[PYTHON_DATA_KEY_INFO]); + CxInt64 llTotalNumRows = -1; + assert(data.contains(PYTHON_DATA_KEY_INFO)); + bp::dict varInfo = bp::extract(data[PYTHON_DATA_KEY_INFO]); - assert(data.contains(PYTHON_DATA_COL_TYPES)); - bp::list colTypes = bp::extract(data[PYTHON_DATA_COL_TYPES]); + assert(data.contains(PYTHON_DATA_COL_TYPES)); + bp::list colTypes = bp::extract(data[PYTHON_DATA_COL_TYPES]); - bp::stl_input_iterator keys(data.keys()), end1; - bp::stl_input_iterator values(data.values()); - CxInt64 dataframeColCount = -1; - for (; keys != end1; keys++) - { - bp::object key = *keys; - char* name = bp::extract(key); - bp::object value = *values++; - if (strcmp(name, PYTHON_DATA_KEY_INFO) == 0 || strcmp(name, PYTHON_DATA_COL_TYPES) == 0) - continue; + bp::stl_input_iterator keys(data.keys()), end1; + bp::stl_input_iterator values(data.values()); + CxInt64 dataframeColCount = -1; + for (; keys != end1; keys++) + { + bp::object key = *keys; + char* name = bp::extract(key); + bp::object value = *values++; + if (strcmp(name, PYTHON_DATA_KEY_INFO) == 0 || strcmp(name, PYTHON_DATA_COL_TYPES) == 0) + continue; - // now it should be a column names - std::string colName = bp::extract(key); - dataframeColCount++; - auto tp = bp::extract(colTypes[dataframeColCount]); - ML_PY_TYPE_MAP_ENUM colType = static_cast(tp[0]); + // now it should be a column names + std::string colName = bp::extract(key); + dataframeColCount++; + auto tp = bp::extract(colTypes[dataframeColCount]); + ML_PY_TYPE_MAP_ENUM colType = static_cast(tp[0]); - BYTE kind; - void *pgetter; - bool isKey = false; - bool isNumeric = false; - bool isText = false; - CxInt64 vecCard = -1; - // Numeric or bool values. - if (bp::extract(value).check()) - { - isNumeric = true; - np::ndarray val = bp::extract(value); - switch (colType) - { - case (ML_PY_BOOL): - kind = BL; - pgetter = (void*)&GetBL; - break; - case (ML_PY_BOOL64): - kind = BL; - pgetter = (void*)&GetBL64; - break; - case (ML_PY_UINT8): - kind = U1; - pgetter = (void*)&GetU1; - break; - case (ML_PY_UINT16): - kind = U2; - pgetter = (void*)&GetU2; - break; - case (ML_PY_UINT32): - kind = U4; - pgetter = (void*)&GetU4; - break; - case (ML_PY_UINT64): - kind = U8; - pgetter = (void*)&GetU8; - break; - case (ML_PY_INT8): - kind = I1; - pgetter = (void*)&GetI1; - break; - case (ML_PY_INT16): - kind = I2; - pgetter = (void*)&GetI2; - break; - case (ML_PY_INT32): - kind = I4; - pgetter = (void*)&GetI4; - break; - case (ML_PY_INT64): - kind = I8; - pgetter = (void*)&GetI8; - break; - case (ML_PY_FLOAT16): - // What to do with numpy.float16 ? - throw std::invalid_argument("numpy.float16 data type is not supported"); - case (ML_PY_FLOAT32): - kind = R4; - pgetter = (void*)&GetR4; - break; - case (ML_PY_FLOAT64): - kind = R8; - pgetter = (void*)&GetR8; - break; - default: - throw std::invalid_argument("column " + colName + " has unsupported type"); - } - const char *data = val.get_data(); - this->_vdata.push_back(data); + BYTE kind; + void *pgetter; + bool isKey = false; + bool isNumeric = false; + bool isText = false; + CxInt64 vecCard = -1; + // Numeric or bool values. 
+ if (bp::extract(value).check()) + { + isNumeric = true; + np::ndarray val = bp::extract(value); + switch (colType) + { + case (ML_PY_BOOL): + kind = BL; + pgetter = (void*)&GetBL; + break; + case (ML_PY_BOOL64): + kind = BL; + pgetter = (void*)&GetBL64; + break; + case (ML_PY_UINT8): + kind = U1; + pgetter = (void*)&GetU1; + break; + case (ML_PY_UINT16): + kind = U2; + pgetter = (void*)&GetU2; + break; + case (ML_PY_UINT32): + kind = U4; + pgetter = (void*)&GetU4; + break; + case (ML_PY_UINT64): + kind = U8; + pgetter = (void*)&GetU8; + break; + case (ML_PY_INT8): + kind = I1; + pgetter = (void*)&GetI1; + break; + case (ML_PY_INT16): + kind = I2; + pgetter = (void*)&GetI2; + break; + case (ML_PY_INT32): + kind = I4; + pgetter = (void*)&GetI4; + break; + case (ML_PY_INT64): + kind = I8; + pgetter = (void*)&GetI8; + break; + case (ML_PY_FLOAT16): + // What to do with numpy.float16 ? + throw std::invalid_argument("numpy.float16 data type is not supported"); + case (ML_PY_FLOAT32): + kind = R4; + pgetter = (void*)&GetR4; + break; + case (ML_PY_FLOAT64): + kind = R8; + pgetter = (void*)&GetR8; + break; + case (ML_PY_DATETIME): + kind = DT; + pgetter = (void*)&GetI8; + break; + default: + throw std::invalid_argument("column " + colName + " has unsupported type"); + } + const char *data = val.get_data(); + this->_vdata.push_back(data); - assert(this->_mpnum.size() == dataframeColCount); - this->_mpnum.push_back(_vdata.size() - 1); - if (llTotalNumRows == -1) - llTotalNumRows = val.shape(0); - else - assert(llTotalNumRows == val.shape(0)); - } - // Text or key values. - else if (bp::extract(value).check()) - { - bp::list list = bp::extract(value); + assert(this->_mpnum.size() == dataframeColCount); + this->_mpnum.push_back(_vdata.size() - 1); + if (llTotalNumRows == -1) + llTotalNumRows = val.shape(0); + else + assert(llTotalNumRows == val.shape(0)); + } + // Text or key values. + else if (bp::extract(value).check()) + { + bp::list list = bp::extract(value); - // Key values. - switch (colType) - { - case (ML_PY_CAT): - if (varInfo.contains(colName)) - { - isKey = true; - assert(bp::extract(varInfo[colName]).check()); - bp::list keyNames = bp::extract(varInfo[colName]); + // Key values. + switch (colType) + { + case (ML_PY_CAT): + if (varInfo.contains(colName)) + { + isKey = true; + assert(bp::extract(varInfo[colName]).check()); + bp::list keyNames = bp::extract(varInfo[colName]); - kind = U4; - pgetter = (void*)GetKeyInt; + kind = U4; + pgetter = (void*)GetKeyInt; - // TODO: Handle vectors. - this->_vkeyCard.push_back(len(keyNames)); - //this->_vvecCard.push_back(vecCard); - this->_vkeydata.push_back(list); - this->_vkeynames.push_back(keyNames); + // TODO: Handle vectors. + this->_vkeyCard.push_back(len(keyNames)); + //this->_vvecCard.push_back(vecCard); + this->_vkeydata.push_back(list); + this->_vkeynames.push_back(keyNames); - assert(this->_mpkey.size() == dataframeColCount); - this->_mpkey.push_back(_vkeydata.size() - 1); - if (llTotalNumRows == -1) - llTotalNumRows = len(list); - else - assert(llTotalNumRows == len(list)); - } - else - continue; - break; - // Text values. 
- case (ML_PY_TEXT): - case (ML_PY_UNICODE): - isText = true; - kind = TX; - if (colType == ML_PY_TEXT) - pgetter = (void*)GetTX; - else // colType is "unicode" - // in python 2.7 strings can be passed as unicode bytestring (NOT the same as UTF8 encoded strings) - pgetter = (void*)GetUnicodeTX; + assert(this->_mpkey.size() == dataframeColCount); + this->_mpkey.push_back(_vkeydata.size() - 1); + if (llTotalNumRows == -1) + llTotalNumRows = len(list); + else + assert(llTotalNumRows == len(list)); + } + else + continue; + break; + // Text values. + case (ML_PY_TEXT): + case (ML_PY_UNICODE): + isText = true; + kind = TX; + if (colType == ML_PY_TEXT) + pgetter = (void*)GetTX; + else // colType is "unicode" + // in python 2.7 strings can be passed as unicode bytestring (NOT the same as UTF8 encoded strings) + pgetter = (void*)GetUnicodeTX; - // TODO: Handle vectors. - //this->_vvecCard.push_back(vecCard); - this->_vtextdata.push_back(list); + // TODO: Handle vectors. + //this->_vvecCard.push_back(vecCard); + this->_vtextdata.push_back(list); - assert(this->_mptxt.size() == dataframeColCount); - this->_mptxt.push_back(_vtextdata.size() - 1); - if (llTotalNumRows == -1) - llTotalNumRows = len(list); - else - assert(llTotalNumRows == len(list)); - break; - default: - throw std::invalid_argument("column " + colName + " has unsupported type"); - } - } - // A sparse vector. - else if (bp::extract(value).check()) - { - bp::dict sparse = bp::extract(value); - np::ndarray indices = bp::extract(sparse["indices"]); - _sparseIndices = (int*)indices.get_data(); - np::ndarray indptr = bp::extract(sparse["indptr"]); - _indPtr = (int*)indptr.get_data(); + assert(this->_mptxt.size() == dataframeColCount); + this->_mptxt.push_back(_vtextdata.size() - 1); + if (llTotalNumRows == -1) + llTotalNumRows = len(list); + else + assert(llTotalNumRows == len(list)); + break; + default: + throw std::invalid_argument("column " + colName + " has unsupported type"); + } + } + // A sparse vector. 
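The sparse branch below consumes a scipy CSR dict with `values`, `indices`, `indptr`, and `colCount` entries; the `Get*Vector` callbacks later in DataViewInterop.h slice one row at a time as `size = indptr[row + 1] - indptr[row]`. An illustrative sketch of that slicing (not part of this patch):

```csharp
using System;

static class CsrRowSketch
{
    // The non-zeros of a row are stored contiguously in values/indices,
    // starting at indPtr[row] and ending before indPtr[row + 1].
    static void PrintCsrRow(float[] values, int[] indices, int[] indPtr, int row)
    {
        int start = indPtr[row];
        int size = indPtr[row + 1] - start;
        for (int i = 0; i < size; i++)
            Console.WriteLine($"col {indices[start + i]} = {values[start + i]}");
    }

    static void Main()
    {
        // The 2x4 matrix [[0, 5, 0, 0], [7, 0, 0, 9]] in CSR form.
        var values = new float[] { 5, 7, 9 };
        var indices = new[] { 1, 0, 3 };
        var indPtr = new[] { 0, 1, 3 };
        PrintCsrRow(values, indices, indPtr, 1);   // prints "col 0 = 7" and "col 3 = 9"
    }
}
```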
+ else if (bp::extract(value).check()) + { + bp::dict sparse = bp::extract(value); + np::ndarray indices = bp::extract(sparse["indices"]); + _sparseIndices = (int*)indices.get_data(); + np::ndarray indptr = bp::extract(sparse["indptr"]); + _indPtr = (int*)indptr.get_data(); - np::ndarray values = bp::extract(sparse["values"]); - _sparseValues = values.get_data(); - switch (colType) - { - case (ML_PY_BOOL): - kind = BL; - pgetter = (void*)&GetBLVector; - break; - case (ML_PY_UINT8): - kind = U1; - pgetter = (void*)&GetU1Vector; - break; - case (ML_PY_UINT16): - kind = U2; - pgetter = (void*)&GetU2Vector; - break; - case (ML_PY_UINT32): - kind = U4; - pgetter = (void*)&GetU4Vector; - break; - case (ML_PY_UINT64): - kind = U8; - pgetter = (void*)&GetU8Vector; - break; - case (ML_PY_INT8): - kind = I1; - pgetter = (void*)&GetI1Vector; - break; - case (ML_PY_INT16): - kind = I2; - pgetter = (void*)&GetI2Vector; - break; - case (ML_PY_INT32): - kind = I4; - pgetter = (void*)&GetI4Vector; - break; - case (ML_PY_INT64): - kind = I8; - pgetter = (void*)&GetI8Vector; - break; - case (ML_PY_FLOAT16): - throw std::invalid_argument("numpy.float16 data type is not supported in sparse data"); - case (ML_PY_FLOAT32): - kind = R4; - pgetter = (void*)&GetR4Vector; - break; - case (ML_PY_FLOAT64): - kind = R8; - pgetter = (void*)&GetR8Vector; - break; - default: - throw std::invalid_argument("column " + colName + " has unsupported type"); - } - vecCard = bp::extract(sparse["colCount"]); - name = (char*)"Data"; + np::ndarray values = bp::extract(sparse["values"]); + _sparseValues = values.get_data(); + switch (colType) + { + case (ML_PY_BOOL): + kind = BL; + pgetter = (void*)&GetBLVector; + break; + case (ML_PY_UINT8): + kind = U1; + pgetter = (void*)&GetU1Vector; + break; + case (ML_PY_UINT16): + kind = U2; + pgetter = (void*)&GetU2Vector; + break; + case (ML_PY_UINT32): + kind = U4; + pgetter = (void*)&GetU4Vector; + break; + case (ML_PY_UINT64): + kind = U8; + pgetter = (void*)&GetU8Vector; + break; + case (ML_PY_INT8): + kind = I1; + pgetter = (void*)&GetI1Vector; + break; + case (ML_PY_INT16): + kind = I2; + pgetter = (void*)&GetI2Vector; + break; + case (ML_PY_INT32): + kind = I4; + pgetter = (void*)&GetI4Vector; + break; + case (ML_PY_INT64): + kind = I8; + pgetter = (void*)&GetI8Vector; + break; + case (ML_PY_FLOAT16): + throw std::invalid_argument("numpy.float16 data type is not supported in sparse data"); + case (ML_PY_FLOAT32): + kind = R4; + pgetter = (void*)&GetR4Vector; + break; + case (ML_PY_FLOAT64): + kind = R8; + pgetter = (void*)&GetR8Vector; + break; + default: + throw std::invalid_argument("column " + colName + " has unsupported type"); + } + vecCard = bp::extract(sparse["colCount"]); + name = (char*)"Data"; - if (llTotalNumRows == -1) - llTotalNumRows = len(indptr) - 1; - else - assert(llTotalNumRows == len(indptr) - 1); - } - else - throw std::invalid_argument("unsupported data type provided"); + if (llTotalNumRows == -1) + llTotalNumRows = len(indptr) - 1; + else + assert(llTotalNumRows == len(indptr) - 1); + } + else + throw std::invalid_argument("unsupported data type provided"); - this->_vgetter.push_back(pgetter); - this->_vname.push_back(name); - this->_vkind.push_back(kind); - _vvecCard.push_back(vecCard); + this->_vgetter.push_back(pgetter); + this->_vname.push_back(name); + this->_vkind.push_back(kind); + _vvecCard.push_back(vecCard); - if (!isNumeric) - { - assert(this->_mpnum.size() == dataframeColCount); - this->_mpnum.push_back(-1); - } - if (!isKey) - { - 
assert(this->_mpkey.size() == dataframeColCount); - this->_mpkey.push_back(-1); - this->_vkeyCard.push_back(-1); - } - if (!isText) - { - assert(this->_mptxt.size() == dataframeColCount); - this->_mptxt.push_back(-1); - } - } + if (!isNumeric) + { + assert(this->_mpnum.size() == dataframeColCount); + this->_mpnum.push_back(-1); + } + if (!isKey) + { + assert(this->_mpkey.size() == dataframeColCount); + this->_mpkey.push_back(-1); + this->_vkeyCard.push_back(-1); + } + if (!isText) + { + assert(this->_mptxt.size() == dataframeColCount); + this->_mptxt.push_back(-1); + } + } - assert(_vname.size() <= (size_t)(dataframeColCount + 1)); + assert(_vname.size() <= (size_t)(dataframeColCount + 1)); - this->crow = llTotalNumRows; - this->ccol = this->_vname.size(); - this->getLabels = &GetKeyNames; + this->crow = llTotalNumRows; + this->ccol = this->_vname.size(); + this->getLabels = &GetKeyNames; - assert(this->ccol == this->_vkind.size()); - assert(this->ccol == this->_vkeyCard.size()); - assert(this->ccol == this->_vgetter.size()); + assert(this->ccol == this->_vkind.size()); + assert(this->ccol == this->_vkeyCard.size()); + assert(this->ccol == this->_vgetter.size()); - // This is used in Revo, but seems to not be needed here. - this->ids = nullptr; - - if (this->ccol > 0) - { - this->names = &this->_vname[0]; - this->kinds = &this->_vkind[0]; - this->keyCards = &this->_vkeyCard[0]; - this->vecCards = &this->_vvecCard[0]; - this->getters = &this->_vgetter[0]; - } - else - { - this->names = nullptr; - this->kinds = nullptr; - this->keyCards = nullptr; - this->vecCards = nullptr; - this->getters = nullptr; - } + if (this->ccol > 0) + { + this->names = &this->_vname[0]; + this->kinds = &this->_vkind[0]; + this->keyCards = &this->_vkeyCard[0]; + this->vecCards = &this->_vvecCard[0]; + this->getters = &this->_vgetter[0]; + } + else + { + this->names = nullptr; + this->kinds = nullptr; + this->keyCards = nullptr; + this->vecCards = nullptr; + this->getters = nullptr; + } } DataSourceBlock::~DataSourceBlock() { #if _MSC_VER - for (std::vector::iterator it = this->_vtextdata_cache.begin(); it != this->_vtextdata_cache.end(); ++it) { - char* tmp = *it; - if (tmp != NULL) - free(tmp); - } + for (std::vector::iterator it = this->_vtextdata_cache.begin(); it != this->_vtextdata_cache.end(); ++it) { + char* tmp = *it; + if (tmp != NULL) + free(tmp); + } #endif - FillDead(this->ccol); - FillDead(this->crow); + FillDead(this->ccol); + FillDead(this->crow); - FillDead(this->names); - FillDead(this->kinds); - FillDead(this->keyCards); - FillDead(this->vecCards); - FillDead(this->getters); - FillDead(this->getLabels); + FillDead(this->names); + FillDead(this->kinds); + FillDead(this->keyCards); + FillDead(this->vecCards); + FillDead(this->getters); + FillDead(this->getLabels); } diff --git a/src/NativeBridge/DataViewInterop.h b/src/NativeBridge/DataViewInterop.h index 0f3011fa..c764b285 100644 --- a/src/NativeBridge/DataViewInterop.h +++ b/src/NativeBridge/DataViewInterop.h @@ -25,482 +25,480 @@ using namespace boost::python; // REVIEW: Need to figure out proper story for multi-threaded execution. class DataSourceBlock { - // Fields that are visible to managed code come first and do not start with an underscore. - // Fields that are only visible to this code start with an underscore. + // Fields that are visible to managed code come first and do not start with an underscore. + // Fields that are only visible to this code start with an underscore. private: - // *** These fields are known by managed code. 
It is critical that this struct not have a vtable. - // It is also critical that the layout of this prefix NOT vary from release to release or build to build. - - // Number of columns. - CxInt64 ccol; - // Total number of rows. Zero for unknown. - CxInt64 crow; - - // Column ids. - const CxInt64 *ids; - // Column names. - const char **names; - // Column data kinds. - const BYTE *kinds; - // Column key type cardinalities. Zero for unbounded, -1 for non-key-types. - const CxInt64 *keyCards; - // Column vector type cardinalities. Zero for variable size, -1 for non-vector-types. - const CxInt64 *vecCards; - // The call back item getter function pointers. Currently only used for string - // values (nullptr for others). For strings these are GETSTR function pointers. - const void **getters; - - // Call back function for getting labels. - GETLABELS getLabels; + // *** These fields are known by managed code. It is critical that this struct not have a vtable. + // It is also critical that the layout of this prefix NOT vary from release to release or build to build. + + // Number of columns. + CxInt64 ccol; + // Total number of rows. Zero for unknown. + CxInt64 crow; + + // Column names. + const char **names; + // Column data kinds. + const BYTE *kinds; + // Column key type cardinalities. Zero for unbounded, -1 for non-key-types. + const CxInt64 *keyCards; + // Column vector type cardinalities. Zero for variable size, -1 for non-vector-types. + const CxInt64 *vecCards; + // The call back item getter function pointers. Currently only used for string + // values (nullptr for others). For strings these are GETSTR function pointers. + const void **getters; + + // Call back function for getting labels. + GETLABELS getLabels; private: - // *** Stuff below here is not known by the managed code. - - std::vector _mpnum; - std::vector _mptxt; - std::vector _mpkey; - - // The vectors below here are parallel. - - // Column names. - std::vector _vname; - // Column DataKind values. - std::vector _vkind; - // Column key type cardinalities. Zero for unbounded, -1 for non-key-types. - std::vector _vkeyCard; - // Column vector type cardinalities. Zero for variable size, -1 for non-vector-types. - std::vector _vvecCard; - // Data getters for the columns (null for non-text columns). - std::vector _vgetter; - - std::vector _vdata; - std::vector _vtextdata; - std::vector _vtextdata_cache; - std::vector _vkeydata; - std::vector _vkeynames; - - // Stores the sparse data. - // REVIEW: need better documentatoin here - is this a pointer, or buffer ? If buffer, why this is not a vector ? Where do we store type of values ? What is indptr ? - void* _sparseValues; - int* _sparseIndices; - int* _indPtr; + // *** Stuff below here is not known by the managed code. + + std::vector _mpnum; + std::vector _mptxt; + std::vector _mpkey; + + // The vectors below here are parallel. + + // Column names. + std::vector _vname; + // Column DataKind values. + std::vector _vkind; + // Column key type cardinalities. Zero for unbounded, -1 for non-key-types. + std::vector _vkeyCard; + // Column vector type cardinalities. Zero for variable size, -1 for non-vector-types. + std::vector _vvecCard; + // Data getters for the columns (null for non-text columns). + std::vector _vgetter; + + std::vector _vdata; + std::vector _vtextdata; + std::vector _vtextdata_cache; + std::vector _vkeydata; + std::vector _vkeynames; + + // Stores the sparse data. + // REVIEW: need better documentatoin here - is this a pointer, or buffer ? 
If buffer, why this is not a vector ? Where do we store type of values ? What is indptr ? + void* _sparseValues; + int* _sparseIndices; + int* _indPtr; public: - DataSourceBlock(bp::dict& data); - ~DataSourceBlock(); + DataSourceBlock(bp::dict& data); + ~DataSourceBlock(); private: - bp::object SelectItemForType(bp::list& container) - { - auto length = len(container); - - for (auto index = 0; index < length; index++) - { - bp::object item = container[index]; - - if (!item.is_none()) - { - return item; - } - } - - return bp::object(); - } - - // Callback methods. These are only needed from managed code via the embedded function pointers above, - // so can be private. - static MANAGED_CALLBACK(void) GetBL(DataSourceBlock *pdata, int col, long index, /*out*/ signed char &dst) - { - CxInt64 numCol = pdata->_mpnum[col]; - assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); - const signed char *charData = reinterpret_cast(pdata->_vdata[numCol]); - dst = charData[index]; - } - static MANAGED_CALLBACK(void) GetBL64(DataSourceBlock *pdata, int col, long index, /*out*/ signed char &dst) - { - CxInt64 numCol = pdata->_mpnum[col]; - assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); - const double *charData = reinterpret_cast(pdata->_vdata[numCol]); - if (boost::math::isnan(charData[index])) - dst = -1; - else - dst = (signed char)charData[index]; - } - static MANAGED_CALLBACK(void) GetU1(DataSourceBlock *pdata, int col, long index, /*out*/ unsigned char &dst) - { - CxInt64 numCol = pdata->_mpnum[col]; - assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); - const unsigned char *charData = reinterpret_cast(pdata->_vdata[numCol]); - dst = charData[index]; - } - static MANAGED_CALLBACK(void) GetU2(DataSourceBlock *pdata, int col, long index, /*out*/ unsigned short &dst) - { - CxInt64 numCol = pdata->_mpnum[col]; - assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); - const unsigned short *shortData = reinterpret_cast(pdata->_vdata[numCol]); - dst = shortData[index]; - } - static MANAGED_CALLBACK(void) GetU4(DataSourceBlock *pdata, int col, long index, /*out*/ unsigned int &dst) - { - CxInt64 numCol = pdata->_mpnum[col]; - assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); - const unsigned int *intData = reinterpret_cast(pdata->_vdata[numCol]); - dst = intData[index]; - } - static MANAGED_CALLBACK(void) GetU8(DataSourceBlock *pdata, int col, long index, /*out*/ CxUInt64 &dst) - { - CxInt64 numCol = pdata->_mpnum[col]; - assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); - const CxUInt64 *longData = reinterpret_cast(pdata->_vdata[numCol]); - dst = longData[index]; - } - static MANAGED_CALLBACK(void) GetI1(DataSourceBlock *pdata, int col, long index, /*out*/ signed char &dst) - { - CxInt64 numCol = pdata->_mpnum[col]; - assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); - const signed char *charData = reinterpret_cast(pdata->_vdata[numCol]); - dst = charData[index]; - } - static MANAGED_CALLBACK(void) GetI2(DataSourceBlock *pdata, int col, long index, /*out*/ short &dst) - { - CxInt64 numCol = pdata->_mpnum[col]; - assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); - const short *shortData = reinterpret_cast(pdata->_vdata[numCol]); - dst = shortData[index]; - } - static MANAGED_CALLBACK(void) GetI4(DataSourceBlock *pdata, int col, long index, /*out*/ int &dst) - { - CxInt64 numCol = pdata->_mpnum[col]; - assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); - const int *intData = 
reinterpret_cast(pdata->_vdata[numCol]); - dst = intData[index]; - } - static MANAGED_CALLBACK(void) GetI8(DataSourceBlock *pdata, int col, long index, /*out*/ CxInt64 &dst) - { - CxInt64 numCol = pdata->_mpnum[col]; - assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); - const CxInt64 *longData = reinterpret_cast(pdata->_vdata[numCol]); - dst = longData[index]; - } - static MANAGED_CALLBACK(void) GetR4(DataSourceBlock *pdata, int col, long index, /*out*/ float &dst) - { - CxInt64 numCol = pdata->_mpnum[col]; - assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); - const float *floatData = reinterpret_cast(pdata->_vdata[numCol]); - dst = floatData[index]; - } - static MANAGED_CALLBACK(void) GetR8(DataSourceBlock *pdata, int col, long index, /*out*/ double &dst) - { - CxInt64 numCol = pdata->_mpnum[col]; - assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); - const double *doubleData = reinterpret_cast(pdata->_vdata[numCol]); - dst = doubleData[index]; - } - - // Call back from C# to map from data buffer and index to char* and convert to UTF16. - static MANAGED_CALLBACK(void) GetTX(DataSourceBlock *pdata, int col, long index, const/*out*/ char*& pch, /*out*/int32_t &size, /*out*/int32_t &missing) - { - CxInt64 txCol = pdata->_mptxt[col]; - assert(0 <= txCol && txCol < (CxInt64)pdata->_vtextdata.size()); - bp::object s = pdata->_vtextdata[txCol][index]; - - if (bp::extract(s).check()) - { - size = -1; - missing = -1; - pch = bp::extract(s); - if (s.is_none()) - { - size = 0; - pch = 0; - } - else - { + bp::object SelectItemForType(bp::list& container) + { + auto length = len(container); + + for (auto index = 0; index < length; index++) + { + bp::object item = container[index]; + + if (!item.is_none()) + { + return item; + } + } + + return bp::object(); + } + + // Callback methods. These are only needed from managed code via the embedded function pointers above, + // so can be private. 
+ static MANAGED_CALLBACK(void) GetBL(DataSourceBlock *pdata, int col, long index, /*out*/ signed char &dst) + { + CxInt64 numCol = pdata->_mpnum[col]; + assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); + const signed char *charData = reinterpret_cast(pdata->_vdata[numCol]); + dst = charData[index]; + } + static MANAGED_CALLBACK(void) GetBL64(DataSourceBlock *pdata, int col, long index, /*out*/ signed char &dst) + { + CxInt64 numCol = pdata->_mpnum[col]; + assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); + const double *charData = reinterpret_cast(pdata->_vdata[numCol]); + if (boost::math::isnan(charData[index])) + dst = -1; + else + dst = (signed char)charData[index]; + } + static MANAGED_CALLBACK(void) GetU1(DataSourceBlock *pdata, int col, long index, /*out*/ unsigned char &dst) + { + CxInt64 numCol = pdata->_mpnum[col]; + assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); + const unsigned char *charData = reinterpret_cast(pdata->_vdata[numCol]); + dst = charData[index]; + } + static MANAGED_CALLBACK(void) GetU2(DataSourceBlock *pdata, int col, long index, /*out*/ unsigned short &dst) + { + CxInt64 numCol = pdata->_mpnum[col]; + assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); + const unsigned short *shortData = reinterpret_cast(pdata->_vdata[numCol]); + dst = shortData[index]; + } + static MANAGED_CALLBACK(void) GetU4(DataSourceBlock *pdata, int col, long index, /*out*/ unsigned int &dst) + { + CxInt64 numCol = pdata->_mpnum[col]; + assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); + const unsigned int *intData = reinterpret_cast(pdata->_vdata[numCol]); + dst = intData[index]; + } + static MANAGED_CALLBACK(void) GetU8(DataSourceBlock *pdata, int col, long index, /*out*/ CxUInt64 &dst) + { + CxInt64 numCol = pdata->_mpnum[col]; + assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); + const CxUInt64 *longData = reinterpret_cast(pdata->_vdata[numCol]); + dst = longData[index]; + } + static MANAGED_CALLBACK(void) GetI1(DataSourceBlock *pdata, int col, long index, /*out*/ signed char &dst) + { + CxInt64 numCol = pdata->_mpnum[col]; + assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); + const signed char *charData = reinterpret_cast(pdata->_vdata[numCol]); + dst = charData[index]; + } + static MANAGED_CALLBACK(void) GetI2(DataSourceBlock *pdata, int col, long index, /*out*/ short &dst) + { + CxInt64 numCol = pdata->_mpnum[col]; + assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); + const short *shortData = reinterpret_cast(pdata->_vdata[numCol]); + dst = shortData[index]; + } + static MANAGED_CALLBACK(void) GetI4(DataSourceBlock *pdata, int col, long index, /*out*/ int &dst) + { + CxInt64 numCol = pdata->_mpnum[col]; + assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); + const int *intData = reinterpret_cast(pdata->_vdata[numCol]); + dst = intData[index]; + } + static MANAGED_CALLBACK(void) GetI8(DataSourceBlock *pdata, int col, long index, /*out*/ CxInt64 &dst) + { + CxInt64 numCol = pdata->_mpnum[col]; + assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); + const CxInt64 *longData = reinterpret_cast(pdata->_vdata[numCol]); + dst = longData[index]; + } + static MANAGED_CALLBACK(void) GetR4(DataSourceBlock *pdata, int col, long index, /*out*/ float &dst) + { + CxInt64 numCol = pdata->_mpnum[col]; + assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); + const float *floatData = reinterpret_cast(pdata->_vdata[numCol]); + dst = floatData[index]; + } + static 
MANAGED_CALLBACK(void) GetR8(DataSourceBlock *pdata, int col, long index, /*out*/ double &dst) + { + CxInt64 numCol = pdata->_mpnum[col]; + assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); + const double *doubleData = reinterpret_cast(pdata->_vdata[numCol]); + dst = doubleData[index]; + } + + // Call back from C# to map from data buffer and index to char* and convert to UTF16. + static MANAGED_CALLBACK(void) GetTX(DataSourceBlock *pdata, int col, long index, const/*out*/ char*& pch, /*out*/int32_t &size, /*out*/int32_t &missing) + { + CxInt64 txCol = pdata->_mptxt[col]; + assert(0 <= txCol && txCol < (CxInt64)pdata->_vtextdata.size()); + bp::object s = pdata->_vtextdata[txCol][index]; + + if (bp::extract(s).check()) + { + size = -1; + missing = -1; + pch = bp::extract(s); + if (s.is_none()) + { + size = 0; + pch = 0; + } + else + { #if _MSC_VER - Utf8ToUtf16le(pch, pch, size); + Utf8ToUtf16le(pch, pch, size); #endif - pdata->_vtextdata_cache.push_back((char*)pch); - } - } - else - { - // Missing values in Python are float.NaN. - assert(bp::extract(s).check()); - missing = 1; - } - } - - // The method below executes in python 2.7 only! + pdata->_vtextdata_cache.push_back((char*)pch); + } + } + else + { + // Missing values in Python are float.NaN. + assert(bp::extract(s).check()); + missing = 1; + } + } + + // The method below executes in python 2.7 only! // Call back from C# to get text data in UTF16 from unicode bytestring static MANAGED_CALLBACK(void) GetUnicodeTX(DataSourceBlock *pdata, int col, long index, const/*out*/ char*& pch, /*out*/int32_t &size, /*out*/int32_t &missing) - { + { CxInt64 txCol = pdata->_mptxt[col]; assert(0 <= txCol && txCol < (CxInt64)pdata->_vtextdata.size()); auto s = pdata->_vtextdata[txCol][index]; - if (bp::extract(str(s).encode("utf_8")).check()) - { - missing = -1; - pch = bp::extract(str(s).encode("utf_8")); + if (bp::extract(str(s).encode("utf_8")).check()) + { + missing = -1; + pch = bp::extract(str(s).encode("utf_8")); #if _MSC_VER Utf8ToUtf16le(pch, pch, size); #endif - pdata->_vtextdata_cache.push_back((char*)pch); - } - else - { - // Missing values in Python are float.NaN. - assert(bp::extract(s).check()); - missing = 1; - } + pdata->_vtextdata_cache.push_back((char*)pch); + } + else + { + // Missing values in Python are float.NaN. + assert(bp::extract(s).check()); + missing = 1; + } } #if _MSC_VER - static void Utf8ToUtf16le(const char* utf8Str, const/*out*/ char*& pch, /*out*/int &size) - { - // Allocate the utf16 string buffer. - size = MultiByteToWideChar(CP_UTF8, 0, utf8Str, -1, NULL, 0); - if (size == 0) - { - pch = 0; - return; - } - - wchar_t* utf16Str = new wchar_t[size]; - - try - { - // Convert the utf8 string. - MultiByteToWideChar(CP_UTF8, 0, utf8Str, -1, utf16Str, size); - } - catch (...) - { - // On exception clean up and re-throw. - if (utf16Str) delete[] utf16Str; - throw; - } - - // size includes a NULL character at the end, discount it - assert(utf16Str[size - 1] == L'\0'); - size -= 1; - pch = (char*)utf16Str; - } + static void Utf8ToUtf16le(const char* utf8Str, const/*out*/ char*& pch, /*out*/int &size) + { + // Allocate the utf16 string buffer. + size = MultiByteToWideChar(CP_UTF8, 0, utf8Str, -1, NULL, 0); + if (size == 0) + { + pch = 0; + return; + } + + wchar_t* utf16Str = new wchar_t[size]; + + try + { + // Convert the utf8 string. + MultiByteToWideChar(CP_UTF8, 0, utf8Str, -1, utf16Str, size); + } + catch (...) + { + // On exception clean up and re-throw. 
+ if (utf16Str) delete[] utf16Str; + throw; + } + + // size includes a NULL character at the end, discount it + assert(utf16Str[size - 1] == L'\0'); + size -= 1; + pch = (char*)utf16Str; + } #endif - static MANAGED_CALLBACK(void) GetKeyInt(DataSourceBlock *pdata, int col, long index, /*out*/ int& dst) - { - CxInt64 keyCol = pdata->_mpkey[col]; - assert(0 <= keyCol && keyCol < (CxInt64)pdata->_vkeydata.size()); - - auto & list = pdata->_vkeydata[keyCol]; - bp::object obj = pdata->SelectItemForType(list); - assert(strcmp(obj.ptr()->ob_type->tp_name, "int") == 0); - dst = bp::extract(list[index]); - } - - // Callback function for getting labels for key-type columns. Returns success. - static MANAGED_CALLBACK(bool) GetKeyNames(DataSourceBlock *pdata, int col, int count, const char **buffer) - { - if (count <= 0 || buffer == nullptr) - { - // Invalid count or buffer, don't zero out buffer returning. - assert(false); - return false; - } - if (pdata == nullptr) - { - // Invalid pdata. - return OnGetLabelsFailure(count, buffer); - } - if (0 > col || (size_t)col >= pdata->_mpkey.size()) - { - // Invalid column id. - return OnGetLabelsFailure(count, buffer); - } - if (pdata->_vkeyCard[col] != count) - { - // Column is not a key type. - return OnGetLabelsFailure(count, buffer); - } - - CxInt64 keyCol = pdata->_mpkey[col]; - bp::list & names = pdata->_vkeynames[keyCol]; - if (len(names) != count) - { - // No labels for this column. This is not a logic error. - return OnGetLabelsFailure(count, buffer); - } - - for (int i = 0; i < count; ++i, ++buffer) - *buffer = bp::extract(names[i]); - return true; - } - - static bool OnGetLabelsFailure(int count, const char **buffer) - { - assert(false); - for (int i = 0; i < count; i++) - buffer[i] = nullptr; - return false; - } - - // Same method has two modes: if "inquire" is true, it returns the number of indices/values needed for the current row. - // If "inquire" is false, it assumes that indices/values are big enough, and fills them in for the current row. 
- static MANAGED_CALLBACK(void) GetBLVector(DataSourceBlock *pdata, int col, CxInt64 index, int* indices, unsigned char* values, bool inquire, /*out*/ int &size) - { - size = pdata->_indPtr[index + 1] - pdata->_indPtr[index]; - if (inquire) - return; - - const unsigned char *boolData = reinterpret_cast(pdata->_sparseValues); - for (int i = 0; i < size; i++) - { - indices[i] = pdata->_sparseIndices[pdata->_indPtr[index] + i]; - values[i] = boolData[pdata->_indPtr[index] + i]; - } - } - - static MANAGED_CALLBACK(void) GetU1Vector(DataSourceBlock *pdata, int col, CxInt64 index, int* indices, unsigned char* values, bool inquire, /*out*/ int &size) - { - size = pdata->_indPtr[index + 1] - pdata->_indPtr[index]; - if (inquire) - return; - - const unsigned char *int8Data = reinterpret_cast(pdata->_sparseValues); - for (int i = 0; i < size; i++) - { - indices[i] = pdata->_sparseIndices[pdata->_indPtr[index] + i]; - values[i] = int8Data[pdata->_indPtr[index] + i]; - } - } - - static MANAGED_CALLBACK(void) GetU2Vector(DataSourceBlock *pdata, int col, CxInt64 index, int* indices, unsigned short* values, bool inquire, /*out*/ int &size) - { - size = pdata->_indPtr[index + 1] - pdata->_indPtr[index]; - if (inquire) - return; - - const unsigned short *int16Data = reinterpret_cast(pdata->_sparseValues); - for (int i = 0; i < size; i++) - { - indices[i] = pdata->_sparseIndices[pdata->_indPtr[index] + i]; - values[i] = int16Data[pdata->_indPtr[index] + i]; - } - } - - static MANAGED_CALLBACK(void) GetU4Vector(DataSourceBlock *pdata, int col, CxInt64 index, int* indices, unsigned int* values, bool inquire, /*out*/ int &size) - { - size = pdata->_indPtr[index + 1] - pdata->_indPtr[index]; - if (inquire) - return; - - const unsigned int *int32Data = reinterpret_cast(pdata->_sparseValues); - for (int i = 0; i < size; i++) - { - indices[i] = pdata->_sparseIndices[pdata->_indPtr[index] + i]; - values[i] = int32Data[pdata->_indPtr[index] + i]; - } - } - - static MANAGED_CALLBACK(void) GetU8Vector(DataSourceBlock *pdata, int col, CxInt64 index, int* indices, CxUInt64* values, bool inquire, /*out*/ int &size) - { - size = pdata->_indPtr[index + 1] - pdata->_indPtr[index]; - if (inquire) - return; - - const unsigned long *int64Data = reinterpret_cast(pdata->_sparseValues); - for (int i = 0; i < size; i++) - { - indices[i] = pdata->_sparseIndices[pdata->_indPtr[index] + i]; - values[i] = int64Data[pdata->_indPtr[index] + i]; - } - } - - static MANAGED_CALLBACK(void) GetI1Vector(DataSourceBlock *pdata, int col, CxInt64 index, int* indices, signed char* values, bool inquire, /*out*/ int &size) - { - size = pdata->_indPtr[index + 1] - pdata->_indPtr[index]; - if (inquire) - return; - - const signed char *int8Data = reinterpret_cast(pdata->_sparseValues); - for (int i = 0; i < size; i++) - { - indices[i] = pdata->_sparseIndices[pdata->_indPtr[index] + i]; - values[i] = int8Data[pdata->_indPtr[index] + i]; - } - } - - static MANAGED_CALLBACK(void) GetI2Vector(DataSourceBlock *pdata, int col, CxInt64 index, int* indices, short* values, bool inquire, /*out*/ int &size) - { - size = pdata->_indPtr[index + 1] - pdata->_indPtr[index]; - if (inquire) - return; - - const short *int16Data = reinterpret_cast(pdata->_sparseValues); - for (int i = 0; i < size; i++) - { - indices[i] = pdata->_sparseIndices[pdata->_indPtr[index] + i]; - values[i] = int16Data[pdata->_indPtr[index] + i]; - } - } - - static MANAGED_CALLBACK(void) GetI4Vector(DataSourceBlock *pdata, int col, CxInt64 index, int* indices, int* values, bool inquire, /*out*/ 
int &size) - { - size = pdata->_indPtr[index + 1] - pdata->_indPtr[index]; - if (inquire) - return; - - const int *int32Data = reinterpret_cast(pdata->_sparseValues); - for (int i = 0; i < size; i++) - { - indices[i] = pdata->_sparseIndices[pdata->_indPtr[index] + i]; - values[i] = int32Data[pdata->_indPtr[index] + i]; - } - } - - static MANAGED_CALLBACK(void) GetI8Vector(DataSourceBlock *pdata, int col, CxInt64 index, int* indices, CxInt64* values, bool inquire, /*out*/ int &size) - { - size = pdata->_indPtr[index + 1] - pdata->_indPtr[index]; - if (inquire) - return; - - const CxInt64 *int64Data = reinterpret_cast(pdata->_sparseValues); - for (int i = 0; i < size; i++) - { - indices[i] = pdata->_sparseIndices[pdata->_indPtr[index] + i]; - values[i] = int64Data[pdata->_indPtr[index] + i]; - } - } - - static MANAGED_CALLBACK(void) GetR4Vector(DataSourceBlock *pdata, int col, CxInt64 index, int* indices, float* values, bool inquire, /*out*/ int &size) - { - size = pdata->_indPtr[index + 1] - pdata->_indPtr[index]; - if (inquire) - return; - - const float *floatData = reinterpret_cast(pdata->_sparseValues); - for (int i = 0; i < size; i++) - { - indices[i] = pdata->_sparseIndices[pdata->_indPtr[index] + i]; - values[i] = floatData[pdata->_indPtr[index] + i]; - } - } - - static MANAGED_CALLBACK(void) GetR8Vector(DataSourceBlock *pdata, int col, CxInt64 index, int* indices, double* values, bool inquire, /*out*/ int &size) - { - size = pdata->_indPtr[index + 1] - pdata->_indPtr[index]; - if (inquire) - return; - - const double *doubleData = reinterpret_cast(pdata->_sparseValues); - for (int i = 0; i < size; i++) - { - indices[i] = pdata->_sparseIndices[pdata->_indPtr[index] + i]; - values[i] = doubleData[pdata->_indPtr[index] + i]; - } - } + static MANAGED_CALLBACK(void) GetKeyInt(DataSourceBlock *pdata, int col, long index, /*out*/ int& dst) + { + CxInt64 keyCol = pdata->_mpkey[col]; + assert(0 <= keyCol && keyCol < (CxInt64)pdata->_vkeydata.size()); + + auto & list = pdata->_vkeydata[keyCol]; + bp::object obj = pdata->SelectItemForType(list); + assert(strcmp(obj.ptr()->ob_type->tp_name, "int") == 0); + dst = bp::extract(list[index]); + } + + // Callback function for getting labels for key-type columns. Returns success. + static MANAGED_CALLBACK(bool) GetKeyNames(DataSourceBlock *pdata, int col, int count, const char **buffer) + { + if (count <= 0 || buffer == nullptr) + { + // Invalid count or buffer, don't zero out buffer returning. + assert(false); + return false; + } + if (pdata == nullptr) + { + // Invalid pdata. + return OnGetLabelsFailure(count, buffer); + } + if (0 > col || (size_t)col >= pdata->_mpkey.size()) + { + // Invalid column id. + return OnGetLabelsFailure(count, buffer); + } + if (pdata->_vkeyCard[col] != count) + { + // Column is not a key type. + return OnGetLabelsFailure(count, buffer); + } + + CxInt64 keyCol = pdata->_mpkey[col]; + bp::list & names = pdata->_vkeynames[keyCol]; + if (len(names) != count) + { + // No labels for this column. This is not a logic error. + return OnGetLabelsFailure(count, buffer); + } + + for (int i = 0; i < count; ++i, ++buffer) + *buffer = bp::extract(names[i]); + return true; + } + + static bool OnGetLabelsFailure(int count, const char **buffer) + { + assert(false); + for (int i = 0; i < count; i++) + buffer[i] = nullptr; + return false; + } + + // Same method has two modes: if "inquire" is true, it returns the number of indices/values needed for the current row. 
+ // If "inquire" is false, it assumes that indices/values are big enough, and fills them in for the current row. + static MANAGED_CALLBACK(void) GetBLVector(DataSourceBlock *pdata, int col, CxInt64 index, int* indices, unsigned char* values, bool inquire, /*out*/ int &size) + { + size = pdata->_indPtr[index + 1] - pdata->_indPtr[index]; + if (inquire) + return; + + const unsigned char *boolData = reinterpret_cast(pdata->_sparseValues); + for (int i = 0; i < size; i++) + { + indices[i] = pdata->_sparseIndices[pdata->_indPtr[index] + i]; + values[i] = boolData[pdata->_indPtr[index] + i]; + } + } + + static MANAGED_CALLBACK(void) GetU1Vector(DataSourceBlock *pdata, int col, CxInt64 index, int* indices, unsigned char* values, bool inquire, /*out*/ int &size) + { + size = pdata->_indPtr[index + 1] - pdata->_indPtr[index]; + if (inquire) + return; + + const unsigned char *int8Data = reinterpret_cast(pdata->_sparseValues); + for (int i = 0; i < size; i++) + { + indices[i] = pdata->_sparseIndices[pdata->_indPtr[index] + i]; + values[i] = int8Data[pdata->_indPtr[index] + i]; + } + } + + static MANAGED_CALLBACK(void) GetU2Vector(DataSourceBlock *pdata, int col, CxInt64 index, int* indices, unsigned short* values, bool inquire, /*out*/ int &size) + { + size = pdata->_indPtr[index + 1] - pdata->_indPtr[index]; + if (inquire) + return; + + const unsigned short *int16Data = reinterpret_cast(pdata->_sparseValues); + for (int i = 0; i < size; i++) + { + indices[i] = pdata->_sparseIndices[pdata->_indPtr[index] + i]; + values[i] = int16Data[pdata->_indPtr[index] + i]; + } + } + + static MANAGED_CALLBACK(void) GetU4Vector(DataSourceBlock *pdata, int col, CxInt64 index, int* indices, unsigned int* values, bool inquire, /*out*/ int &size) + { + size = pdata->_indPtr[index + 1] - pdata->_indPtr[index]; + if (inquire) + return; + + const unsigned int *int32Data = reinterpret_cast(pdata->_sparseValues); + for (int i = 0; i < size; i++) + { + indices[i] = pdata->_sparseIndices[pdata->_indPtr[index] + i]; + values[i] = int32Data[pdata->_indPtr[index] + i]; + } + } + + static MANAGED_CALLBACK(void) GetU8Vector(DataSourceBlock *pdata, int col, CxInt64 index, int* indices, CxUInt64* values, bool inquire, /*out*/ int &size) + { + size = pdata->_indPtr[index + 1] - pdata->_indPtr[index]; + if (inquire) + return; + + const unsigned long *int64Data = reinterpret_cast(pdata->_sparseValues); + for (int i = 0; i < size; i++) + { + indices[i] = pdata->_sparseIndices[pdata->_indPtr[index] + i]; + values[i] = int64Data[pdata->_indPtr[index] + i]; + } + } + + static MANAGED_CALLBACK(void) GetI1Vector(DataSourceBlock *pdata, int col, CxInt64 index, int* indices, signed char* values, bool inquire, /*out*/ int &size) + { + size = pdata->_indPtr[index + 1] - pdata->_indPtr[index]; + if (inquire) + return; + + const signed char *int8Data = reinterpret_cast(pdata->_sparseValues); + for (int i = 0; i < size; i++) + { + indices[i] = pdata->_sparseIndices[pdata->_indPtr[index] + i]; + values[i] = int8Data[pdata->_indPtr[index] + i]; + } + } + + static MANAGED_CALLBACK(void) GetI2Vector(DataSourceBlock *pdata, int col, CxInt64 index, int* indices, short* values, bool inquire, /*out*/ int &size) + { + size = pdata->_indPtr[index + 1] - pdata->_indPtr[index]; + if (inquire) + return; + + const short *int16Data = reinterpret_cast(pdata->_sparseValues); + for (int i = 0; i < size; i++) + { + indices[i] = pdata->_sparseIndices[pdata->_indPtr[index] + i]; + values[i] = int16Data[pdata->_indPtr[index] + i]; + } + } + + static 
MANAGED_CALLBACK(void) GetI4Vector(DataSourceBlock *pdata, int col, CxInt64 index, int* indices, int* values, bool inquire, /*out*/ int &size) + { + size = pdata->_indPtr[index + 1] - pdata->_indPtr[index]; + if (inquire) + return; + + const int *int32Data = reinterpret_cast(pdata->_sparseValues); + for (int i = 0; i < size; i++) + { + indices[i] = pdata->_sparseIndices[pdata->_indPtr[index] + i]; + values[i] = int32Data[pdata->_indPtr[index] + i]; + } + } + + static MANAGED_CALLBACK(void) GetI8Vector(DataSourceBlock *pdata, int col, CxInt64 index, int* indices, CxInt64* values, bool inquire, /*out*/ int &size) + { + size = pdata->_indPtr[index + 1] - pdata->_indPtr[index]; + if (inquire) + return; + + const CxInt64 *int64Data = reinterpret_cast(pdata->_sparseValues); + for (int i = 0; i < size; i++) + { + indices[i] = pdata->_sparseIndices[pdata->_indPtr[index] + i]; + values[i] = int64Data[pdata->_indPtr[index] + i]; + } + } + + static MANAGED_CALLBACK(void) GetR4Vector(DataSourceBlock *pdata, int col, CxInt64 index, int* indices, float* values, bool inquire, /*out*/ int &size) + { + size = pdata->_indPtr[index + 1] - pdata->_indPtr[index]; + if (inquire) + return; + + const float *floatData = reinterpret_cast(pdata->_sparseValues); + for (int i = 0; i < size; i++) + { + indices[i] = pdata->_sparseIndices[pdata->_indPtr[index] + i]; + values[i] = floatData[pdata->_indPtr[index] + i]; + } + } + + static MANAGED_CALLBACK(void) GetR8Vector(DataSourceBlock *pdata, int col, CxInt64 index, int* indices, double* values, bool inquire, /*out*/ int &size) + { + size = pdata->_indPtr[index + 1] - pdata->_indPtr[index]; + if (inquire) + return; + + const double *doubleData = reinterpret_cast(pdata->_sparseValues); + for (int i = 0; i < size; i++) + { + indices[i] = pdata->_sparseIndices[pdata->_indPtr[index] + i]; + values[i] = doubleData[pdata->_indPtr[index] + i]; + } + } }; // A native wrapper around a managed IDataView for receiving data back from managed code. @@ -508,40 +506,45 @@ class DataSourceBlock // This is filled in by managed code and referenced by native code. struct DataViewBlock { - // *** These fields are shared from managed code. It is critical that this struct not have a vtable. - // It is also critical that the layout of this NOT vary from release to release or build to build. - // The managed code assumes that CxInt64 occupies 8 bytes, and each pointer occupies 8 bytes. - - // Number of columns. - CxInt64 ccol; - // Total number of rows. Zero for unknown. - CxInt64 crow; - - // Column names. - const char **names; - // Column data kinds. - const BYTE *kinds; - // Column key type cardinalities. Only contains the values for the columns that have - // key names. - const int *keyCards; + // *** These fields are shared from managed code. It is critical that this struct not have a vtable. + // It is also critical that the layout of this NOT vary from release to release or build to build. + // The managed code assumes that CxInt64 occupies 8 bytes, and each pointer occupies 8 bytes. + + // Number of columns. + CxInt64 ccol; + // Total number of rows. Zero for unknown. + CxInt64 crow; + + // Column names. + const char **names; + // Column data kinds. + const BYTE *kinds; + // Column key type cardinalities. Only contains the values for the columns that have + // key names. + const int *keyCards; + // The number of values in each row of a column. + // A value count of 0 means that each row of the + // column is variable length. 
+ const BYTE *valueCounts; }; enum ML_PY_TYPE_MAP_ENUM { - ML_PY_BOOL = '?', - ML_PY_BOOL64 = '!', - ML_PY_UINT8 = 'B', - ML_PY_UINT16 = 'H', - ML_PY_UINT32 = 'I', - ML_PY_UINT64 = 'Q', - ML_PY_INT8 = 'b', - ML_PY_INT16 = 'h', - ML_PY_INT32 = 'i', - ML_PY_INT64 = 'q', - ML_PY_FLOAT16 = 'e', - ML_PY_FLOAT32 = 'f', - ML_PY_FLOAT64 = 'd', - ML_PY_CAT = 'c', - ML_PY_TEXT = 't', - ML_PY_UNICODE = 'u', - ML_PY_UNSUPPORTED = 'x' + ML_PY_BOOL = '?', + ML_PY_BOOL64 = '!', + ML_PY_UINT8 = 'B', + ML_PY_UINT16 = 'H', + ML_PY_UINT32 = 'I', + ML_PY_UINT64 = 'Q', + ML_PY_INT8 = 'b', + ML_PY_INT16 = 'h', + ML_PY_INT32 = 'i', + ML_PY_INT64 = 'q', + ML_PY_FLOAT16 = 'e', + ML_PY_FLOAT32 = 'f', + ML_PY_FLOAT64 = 'd', + ML_PY_CAT = 'c', + ML_PY_TEXT = 't', + ML_PY_UNICODE = 'u', + ML_PY_DATETIME = 'z', + ML_PY_UNSUPPORTED = 'x' }; diff --git a/src/NativeBridge/ManagedInterop.cpp b/src/NativeBridge/ManagedInterop.cpp index bca89755..6a80000d 100644 --- a/src/NativeBridge/ManagedInterop.cpp +++ b/src/NativeBridge/ManagedInterop.cpp @@ -6,322 +6,267 @@ #include "DataViewInterop.h" #include "ManagedInterop.h" -inline void destroyManagerCObject(PyObject* obj) { - auto* b = static_cast(PyCapsule_GetPointer(obj, NULL)); - if (b) { delete b; } -} - -#define SetDict2(cpptype, nptype); \ - {\ - PythonObject* col = dynamic_cast*>(column);\ - auto shrd = col->GetData();\ - auto* data = shrd->data();\ - bp::handle<> h(::PyCapsule_New((void*)column, NULL, (PyCapsule_Destructor)&destroyManagerCObject));\ - dict[_names[i]] = np::from_data(\ - data,\ - np::dtype::get_builtin(),\ - bp::make_tuple(shrd->size()),\ - bp::make_tuple(sizeof(nptype)), bp::object(h));\ - } - -#define SetDict1(type) SetDict2(type, type) -#define SetDictAndKeys(type, i); \ - {\ - PythonObject* col = dynamic_cast*>(column);\ - auto shrd = col->GetData();\ - auto* data = shrd->data();\ - bp::handle<> h(::PyCapsule_New((void*)column, NULL, (PyCapsule_Destructor)&destroyManagerCObject));\ - np::ndarray npdata = np::from_data(\ - data,\ - np::dtype::get_builtin(),\ - bp::make_tuple(shrd->size()),\ - bp::make_tuple(sizeof(float)), bp::object(h));\ - if (keyNames == nullptr)\ - {\ - dict[_names[i]] = npdata;\ - }\ - else\ - {\ - dict[_names[i]] = bp::dict();\ - dict[_names[i]]["..Data"] = npdata;\ - auto shrd = keyNames->GetData();\ - bp::list list;\ - for (int j = 0; j < shrd->size(); j++)\ - {\ - bp::object obj;\ - const std::string& value = shrd->at(j);\ - if (!value.empty())\ - {\ - obj = bp::object(value);\ - }\ - list.append(obj);\ - }\ - dict[_names[i]]["..KeyValues"] = list;\ - }\ - }\ +#define AddToDict(type); \ + {\ + PyColumn* col = dynamic_cast*>(column);\ + col->AddToDict(dict, _names[i], keyNames, maxRows);\ + }\ #define STATIC + EnvironmentBlock::~EnvironmentBlock() { - // Everything (except data buffers) that we might have exposed to managed code, - // fill with dead values. - FillDead(this->verbosity); - FillDead(this->seed); - FillDead(this->messageSink); - FillDead(this->modelSink); - FillDead(this->checkCancel); + // Everything (except data buffers) that we might have exposed to managed code, + // fill with dead values. 
+ FillDead(this->verbosity); + FillDead(this->seed); + FillDead(this->maxSlots); + FillDead(this->messageSink); + FillDead(this->modelSink); + FillDead(this->checkCancel); - for (size_t i = 0; i < _vset.size(); i++) - FillDead(_vset[i]); + for (size_t i = 0; i < _vset.size(); i++) + FillDead(_vset[i]); } -EnvironmentBlock::EnvironmentBlock(int verbosity, int maxThreadsAllowed, int seed, const char* pythonPath) +EnvironmentBlock::EnvironmentBlock(int verbosity, int maxSlots, int seed, const char* pythonPath) { - // Assert that this class doesn't have a vtable. - assert(offsetof(EnvironmentBlock, verbosity) == 0); + // Assert that this class doesn't have a vtable. + assert(offsetof(EnvironmentBlock, verbosity) == 0); - this->_errCode = PyErrorCode_NoError; - this->verbosity = verbosity; - this->maxThreadsAllowed = maxThreadsAllowed; - this->seed = seed; - this->pythonPath = pythonPath; - this->_kindMask = (1 << Warning) | (1 << Error); - if (verbosity > 0) - this->_kindMask |= (1 << Info); - if (this->verbosity > 3) - this->_kindMask |= (1 << Trace); - this->dataSink = &DataSink; - this->messageSink = &MessageSink; - this->modelSink = &ModelSink; - this->checkCancel = &CheckCancel; + this->verbosity = verbosity; + this->maxSlots = maxSlots; + this->seed = seed; + this->pythonPath = pythonPath; + this->_kindMask = (1 << Warning) | (1 << Error); + if (verbosity > 0) + this->_kindMask |= (1 << Info); + if (this->verbosity > 3) + this->_kindMask |= (1 << Trace); + this->dataSink = &DataSink; + this->messageSink = &MessageSink; + this->modelSink = &ModelSink; + this->checkCancel = &CheckCancel; } STATIC MANAGED_CALLBACK(void) EnvironmentBlock::DataSink(EnvironmentBlock *penv, const DataViewBlock *pdata, void **&setters, void *&keyValueSetter) { - penv->DataSinkCore(pdata); - setters = &penv->_vset[0]; - keyValueSetter = (void *)&SetKeyValue; + penv->DataSinkCore(pdata); + setters = &penv->_vset[0]; + keyValueSetter = (void *)&SetKeyValue; } void EnvironmentBlock::DataSinkCore(const DataViewBlock * pdata) { - assert(pdata != nullptr); + assert(pdata != nullptr); - // Create a data set. 
- CxInt64 numKeys = 0; - for (int i = 0; i < pdata->ccol; i++) - { - BYTE kind = pdata->kinds[i]; - _columns.push_back(PythonObjectBase::CreateObject(kind, pdata->crow, 1)); + for (int i = 0; i < pdata->ccol; i++) + { + BYTE kind = pdata->kinds[i]; + _columns.push_back(PyColumnBase::Create(kind, pdata->crow, pdata->valueCounts[i])); - switch (kind) - { - case BL: - _vset.push_back((void*)&SetBL); - break; - case I1: - _vset.push_back((void*)&SetI1); - break; - case I2: - _vset.push_back((void*)&SetI2); - break; - case I4: - _vset.push_back((void*)&SetI4); - break; - case I8: - _vset.push_back((void*)&SetI8); - break; - case U1: - _vset.push_back((void*)&SetU1); - break; - case U2: - _vset.push_back((void*)&SetU2); - break; - case U4: - _vset.push_back((void*)&SetU4); - break; - case U8: - _vset.push_back((void*)&SetU8); - break; - case R4: - _vset.push_back((void*)&SetR4); - break; - case R8: - _vset.push_back((void*)&SetR8); - break; - case TX: - _vset.push_back((void*)&SetTX); - break; - case TS: // tbd - case DT: // tbd - case DZ: // tbd - default: - throw std::invalid_argument("data type is not supported " + std::to_string(kind)); - } + switch (kind) + { + case BL: + _vset.push_back((void*)&SetBL); + break; + case I1: + _vset.push_back((void*)&SetI1); + break; + case I2: + _vset.push_back((void*)&SetI2); + break; + case I4: + _vset.push_back((void*)&SetI4); + break; + case DT: + case I8: + _vset.push_back((void*)&SetI8); + break; + case U1: + _vset.push_back((void*)&SetU1); + break; + case U2: + _vset.push_back((void*)&SetU2); + break; + case U4: + _vset.push_back((void*)&SetU4); + break; + case U8: + _vset.push_back((void*)&SetU8); + break; + case R4: + _vset.push_back((void*)&SetR4); + break; + case R8: + _vset.push_back((void*)&SetR8); + break; + case TX: + _vset.push_back((void*)&SetTX); + break; + case TS: // tbd + case DZ: // tbd + default: + throw std::invalid_argument("data type is not supported " + std::to_string(kind)); + } - if (pdata->keyCards[i] >= 0) - { - _vKeyValues.push_back(new PythonObject(TX, pdata->keyCards[i], 1)); - _columnToKeyMap.push_back(numKeys++); - } - else - _columnToKeyMap.push_back(-1); + if (pdata->keyCards && (pdata->keyCards[i] >= 0)) + { + _columnToKeyMap.insert(i); + _vKeyValues.push_back(new PyColumnSingle(TX, pdata->keyCards[i])); + } - _names.push_back(pdata->names[i]); - } + _names.push_back(pdata->names[i]); + } } STATIC MANAGED_CALLBACK(void) EnvironmentBlock::ModelSink(EnvironmentBlock * env, - const unsigned char * pBinaryModel, size_t iModelLen) + const unsigned char * pBinaryModel, size_t iModelLen) { } STATIC MANAGED_CALLBACK(void) EnvironmentBlock::MessageSink(EnvironmentBlock * env, MessageKind kind, - const char * sender, const char * message) + const char * sender, const char * message) { - bool bShowMessage = (env->_kindMask >> kind) & 1; - string sMessage(message); - string sSender(sender); + bool bShowMessage = (env->_kindMask >> kind) & 1; + string sMessage(message); + string sSender(sender); - if (bShowMessage) - { - CX_TraceIn("MessageSink"); - string sMessage = std::string(message); - string sSender = std::string(sender); + if (bShowMessage) + { + CX_TraceIn("MessageSink"); + string sMessage = std::string(message); + string sSender = std::string(sender); - switch (kind) - { - default: - case Info: - sMessage = sMessage + "\n"; - break; - case Warning: - sMessage = "Warning: " + sMessage + "\n"; - break; - case Trace: - sMessage = sSender + ": " + sMessage + "\n"; - break; - case Error: // We will throw the error when 
ConnectToMlNet returns - sMessage = "Error: " + sMessage; - break; - } + switch (kind) + { + default: + case Info: + sMessage = sMessage + "\n"; + break; + case Warning: + sMessage = "Warning: " + sMessage + "\n"; + break; + case Trace: + sMessage = sSender + ": " + sMessage + "\n"; + break; + case Error: // We will throw the error when ConnectToMlNet returns + sMessage = "Error: " + sMessage; + env->_errMessage = sMessage; + break; + } - // Redirect message to Python streams - PyObject *sys = PyImport_ImportModule("sys"); - PyObject *pystream = PyObject_GetAttrString(sys, (kind == Error) ? "stderr" : "stdout"); - PyObject_CallMethod(pystream, "write", "s", sMessage.c_str()); - PyObject_CallMethod(pystream, "flush", NULL); - Py_XDECREF(pystream); - Py_XDECREF(sys); + // Redirect message to Python streams + PyObject *sys = PyImport_ImportModule("sys"); + PyObject *pystream = PyObject_GetAttrString(sys, (kind == Error) ? "stderr" : "stdout"); + PyObject_CallMethod(pystream, "write", "s", sMessage.c_str()); + PyObject_CallMethod(pystream, "flush", NULL); + Py_XDECREF(pystream); + Py_XDECREF(sys); - CX_TraceOut("MessageSink"); - } + CX_TraceOut("MessageSink"); + } } STATIC MANAGED_CALLBACK(bool) EnvironmentBlock::CheckCancel() { - return false; + return false; } bp::dict EnvironmentBlock::GetData() { - if (_names.size() == 0) - { - return bp::dict(); - } + if (_columns.size() == 0) + { + return bp::dict(); + } + + size_t maxRows = 0; + for (size_t i = 0; i < _columns.size(); i++) + { + size_t numRows = _columns[i]->GetNumRows(); + if (numRows > maxRows) maxRows = numRows; + } - bp::dict dict = bp::dict(); - for (size_t i = 0; i < _names.size(); i++) - { - PythonObjectBase* column = _columns[i]; - PythonObject* keyNames = nullptr; - if (_columnToKeyMap[i] >= 0) - keyNames = _vKeyValues[_columnToKeyMap[i]]; + CxInt64 numKeys = 0; + bp::dict dict = bp::dict(); + for (size_t i = 0; i < _columns.size(); i++) + { + PyColumnBase* column = _columns[i]; + const std::vector* keyNames = nullptr; + if (_columnToKeyMap.find(i) != _columnToKeyMap.end()) + keyNames = _vKeyValues[numKeys++]->GetData(); - signed char kind = column->GetKind(); - switch (kind) { - case -1: - { - PythonObject* col = dynamic_cast*>(column); - auto shrd = col->GetData(); - bp::list list; - for (size_t i = 0; i < shrd->size(); i++) - { - bp::object obj; - signed char value = shrd->at(i); - if (value < 0) - obj = bp::object(NAN); - else if (value == 0) - obj = bp::object(false); - else - obj = bp::object(true); + signed char kind = column->GetKind(); + switch (kind) { + case -1: + { + PyColumnSingle* col = dynamic_cast*>(column); + auto shrd = col->GetData(); + bp::list list; + for (size_t i = 0; i < shrd->size(); i++) + { + bp::object obj; + signed char value = shrd->at(i); + if (value < 0) + obj = bp::object(NAN); + else if (value == 0) + obj = bp::object(false); + else + obj = bp::object(true); - list.append(obj); - } - dict[_names[i]] = list; - } - break; - case BL: - SetDict2(signed char, bool); - break; - case I1: - SetDictAndKeys(signed char, i); - break; - case I2: - SetDictAndKeys(signed short, i); - break; - case I4: - SetDictAndKeys(signed int, i); - break; - case I8: - SetDict1(CxInt64); - break; - case U1: - SetDict1(unsigned char); - break; - case U2: - SetDict1(unsigned short); - break; - case U4: - SetDict1(unsigned int); - break; - case U8: - SetDict1(CxUInt64); - break; - case R4: - SetDict1(float); - break; - case R8: - SetDict1(double); - break; - case TX: - { - PythonObject* col = dynamic_cast*>(column); - auto 
shrd = col->GetData(); - bp::list list; - for (size_t i = 0; i < shrd->size(); i++) - { - bp::object obj; - const std::string& value = shrd->at(i); - if (!value.empty()) - { - obj = bp::object(value); - } - list.append(obj); - } - dict[_names[i]] = list; - delete column; - } - break; - case TS: - case DT: - case DZ: - default: - throw std::invalid_argument("data type is not supported " + std::to_string(kind)); - } - } - return dict; + list.append(obj); + } + dict[_names[i]] = list; + } + break; + case BL: + AddToDict(signed char); + break; + case I1: + AddToDict(signed char); + break; + case I2: + AddToDict(signed short); + break; + case I4: + AddToDict(signed int); + break; + case I8: + AddToDict(CxInt64); + break; + case U1: + AddToDict(unsigned char); + break; + case U2: + AddToDict(unsigned short); + break; + case U4: + AddToDict(unsigned int); + break; + case U8: + AddToDict(CxUInt64); + break; + case R4: + AddToDict(float); + break; + case R8: + AddToDict(double); + break; + case TX: + AddToDict(std::string); + delete column; + break; + case DT: + AddToDict(CxInt64); + break; + case TS: + case DZ: + default: + throw std::invalid_argument("data type is not supported " + std::to_string(kind)); + } + } + return dict; } diff --git a/src/NativeBridge/ManagedInterop.h b/src/NativeBridge/ManagedInterop.h index 5d9582a3..485a59cc 100644 --- a/src/NativeBridge/ManagedInterop.h +++ b/src/NativeBridge/ManagedInterop.h @@ -4,6 +4,7 @@ using namespace std; #include "stdafx.h" #include "PythonInterop.h" +#include #define CX_TraceOut(...) #define CX_TraceIn(...) @@ -15,32 +16,25 @@ struct DataViewBlock; // WARNING: These values are defined by the ML.NET code so should not be changed! enum MessageKind { - Trace = 0, - Info = 1, - Warning = 2, - Error = 3 -}; - -// These are only used locally -enum PyErrorCode -{ - PyErrorCode_NoError = 0, - PyErrorCode_Failure = 1 + Trace = 0, + Info = 1, + Warning = 2, + Error = 3 }; // REVIEW: the exceptions thrown in the callbacks will not be caught by BxlServer on Linux. // On Linux, CoreCLR will ignore previous stack frames, i.e., those before entering the managed code. typedef MANAGED_CALLBACK_PTR(void, MODELSINK) (EnvironmentBlock * env, - const unsigned char * binaryModel, size_t modelLen); + const unsigned char * binaryModel, size_t modelLen); typedef MANAGED_CALLBACK_PTR(void, MESSAGESINK)(EnvironmentBlock *penv, MessageKind kind, - const char * sender, const char * message); + const char * sender, const char * message); typedef MANAGED_CALLBACK_PTR(void, DATASINK)(EnvironmentBlock *penv, const DataViewBlock *pdata, - // Outputs: - // * setters: item setter function pointers. - // keyValueSetter: setter for key values. - void **& setters, void *& keyValueSetter); + // Outputs: + // * setters: item setter function pointers. + // keyValueSetter: setter for key values. + void **& setters, void *& keyValueSetter); // Callback function for getting cancel flag. typedef MANAGED_CALLBACK_PTR(bool, CHECKCANCEL)(); @@ -52,152 +46,150 @@ typedef MANAGED_CALLBACK_PTR(void, SETSTR)(void *pv, CxInt64 index, const char * // As such, it is critical that this class NOT have a vtable, so virtual functions are illegal! class CLASS_ALIGN EnvironmentBlock { - // Fields that are visible to managed code come first and do not start with an underscore. - // Fields that are only visible to this code start with an underscore. + // Fields that are visible to managed code come first and do not start with an underscore. 
+ // Fields that are only visible to this code start with an underscore. private: - // *** These fields are known by managed code. It is critical that this struct not have a vtable. - // It is also critical that the layout of this prefix NOT vary from release to release or build to build. - // The managed code assumes that each pointer occupies 8 bytes. + // *** These fields are known by managed code. It is critical that this struct not have a vtable. + // It is also critical that the layout of this prefix NOT vary from release to release or build to build. + // The managed code assumes that each pointer occupies 8 bytes. - // Indicates a verbosity level. Zero means default (minimal). Larger generally means more information. - int verbosity; + // Indicates a verbosity level. Zero means default (minimal). Larger generally means more information. + int verbosity; - // The random seed. - int seed; + // The random seed. + int seed; - // The message sink. - MESSAGESINK messageSink; + // The message sink. + MESSAGESINK messageSink; - // The data sink. - DATASINK dataSink; + // The data sink. + DATASINK dataSink; - // The model sink. - MODELSINK modelSink; + // The model sink. + MODELSINK modelSink; - // Indicates max threads allowed. Less than one means default (maximal). - int maxThreadsAllowed; + // Max slots to return for vector valued columns(<=0 to return all). + int maxSlots; - // Check cancellation flag. - CHECKCANCEL checkCancel; + // Check cancellation flag. + CHECKCANCEL checkCancel; - // Path to python executable - const char* pythonPath; + // Path to python executable + const char* pythonPath; public: - EnvironmentBlock(int verbosity = 0, int maxThreadsAllowed = 0, int seed = 42, const char* pythonPath = NULL); - ~EnvironmentBlock(); - PyErrorCode GetErrorCode() { return _errCode; } - std::string GetErrorMessage() { return _errMessage; } - bp::dict GetData(); + EnvironmentBlock(int verbosity = 0, int maxSlots = -1, int seed = 42, const char* pythonPath = NULL); + ~EnvironmentBlock(); + std::string GetErrorMessage() { return _errMessage; } + bp::dict GetData(); private: - static MANAGED_CALLBACK(void) DataSink(EnvironmentBlock *penv, const DataViewBlock *pdata, void **&setters, void *&keyValueSetter); - static MANAGED_CALLBACK(void) MessageSink(EnvironmentBlock *penv, MessageKind kind, const char *sender, const char *message); - static MANAGED_CALLBACK(void) ModelSink(EnvironmentBlock *penv, const unsigned char *pBinaryModel, size_t iModelLen); - static MANAGED_CALLBACK(bool) CheckCancel(); + static MANAGED_CALLBACK(void) DataSink(EnvironmentBlock *penv, const DataViewBlock *pdata, void **&setters, void *&keyValueSetter); + static MANAGED_CALLBACK(void) MessageSink(EnvironmentBlock *penv, MessageKind kind, const char *sender, const char *message); + static MANAGED_CALLBACK(void) ModelSink(EnvironmentBlock *penv, const unsigned char *pBinaryModel, size_t iModelLen); + static MANAGED_CALLBACK(bool) CheckCancel(); private: - void DataSinkCore(const DataViewBlock * pdata); + void DataSinkCore(const DataViewBlock * pdata); private: - // This has a bit set for each kind of message that is desired. - int _kindMask; - // Fields used by the data callbacks. These keep the appropriate memory alive during the data operations. 
- int _irowBase; - int _crowWant; - std::vector _vset; - PyErrorCode _errCode; - std::string _errMessage; - - std::vector _names; - std::vector _columns; - // Maps between the column index, and the index in _vKeyValues containing the key names, or -1 if - // there are no key names. - std::vector _columnToKeyMap; - - std::vector*> _vKeyValues; - - static MANAGED_CALLBACK(void) SetR4(EnvironmentBlock *env, int col, long index, float value) - { - PythonObject* colObject = dynamic_cast*>(env->_columns[col]); - assert(colObject != nullptr); - colObject->SetAt(index, 0, value); - } - static MANAGED_CALLBACK(void) SetR8(EnvironmentBlock *env, int col, long index, double value) - { - PythonObject* colObject = dynamic_cast*>(env->_columns[col]); - assert(colObject != nullptr); - colObject->SetAt(index, 0, value); - } - static MANAGED_CALLBACK(void) SetBL(EnvironmentBlock *env, int col, long index, signed char value) - { - PythonObject* colObject = dynamic_cast*>(env->_columns[col]); - assert(colObject != nullptr); - colObject->SetAt(index, 0, value); - if (value < 0) - env->_columns[col]->SetKind(-1); - } - static MANAGED_CALLBACK(void) SetI1(EnvironmentBlock *env, int col, long index, signed char value) - { - PythonObject* colObject = dynamic_cast*>(env->_columns[col]); - assert(colObject != nullptr); - colObject->SetAt(index, 0, value); - } - static MANAGED_CALLBACK(void) SetI2(EnvironmentBlock *env, int col, long index, short value) - { - PythonObject* colObject = dynamic_cast*>(env->_columns[col]); - assert(colObject != nullptr); - colObject->SetAt(index, 0, value); - } - static MANAGED_CALLBACK(void) SetI4(EnvironmentBlock *env, int col, long index, int value) - { - PythonObject* colObject = dynamic_cast*>(env->_columns[col]); - assert(colObject != nullptr); - colObject->SetAt(index, 0, value); - } - static MANAGED_CALLBACK(void) SetI8(EnvironmentBlock *env, int col, long index, CxInt64 value) - { - PythonObject* colObject = dynamic_cast*>(env->_columns[col]); - assert(colObject != nullptr); - colObject->SetAt(index, 0, value); - } - static MANAGED_CALLBACK(void) SetU1(EnvironmentBlock *env, int col, long index, unsigned char value) - { - PythonObject* colObject = dynamic_cast*>(env->_columns[col]); - assert(colObject != nullptr); - colObject->SetAt(index, 0, value); - } - static MANAGED_CALLBACK(void) SetU2(EnvironmentBlock *env, int col, long index, unsigned short value) - { - PythonObject* colObject = dynamic_cast*>(env->_columns[col]); - assert(colObject != nullptr); - colObject->SetAt(index, 0, value); - } - static MANAGED_CALLBACK(void) SetU4(EnvironmentBlock *env, int col, long index, unsigned int value) - { - PythonObject* colObject = dynamic_cast*>(env->_columns[col]); - assert(colObject != nullptr); - colObject->SetAt(index, 0, value); - } - static MANAGED_CALLBACK(void) SetU8(EnvironmentBlock *env, int col, long index, CxUInt64 value) - { - PythonObject* colObject = dynamic_cast*>(env->_columns[col]); - assert(colObject != nullptr); - colObject->SetAt(index, 0, value); - } - static MANAGED_CALLBACK(void) SetTX(EnvironmentBlock *env, int col, long index, char* value, long length) - { - PythonObject* colObject = dynamic_cast*>(env->_columns[col]); - assert(colObject != nullptr); - colObject->SetAt(index, 0, std::string(value, length)); - } - static MANAGED_CALLBACK(void) SetKeyValue(EnvironmentBlock *env, int keyColumnIndex, int keyCode, char* value, long length) - { - assert(keyColumnIndex < env->_vKeyValues.size()); - PythonObject* keyNamesObject = env->_vKeyValues[keyColumnIndex]; 
- keyNamesObject->SetAt(keyCode, 0, std::string(value, length)); - } + // This has a bit set for each kind of message that is desired. + int _kindMask; + // Fields used by the data callbacks. These keep the appropriate memory alive during the data operations. + int _irowBase; + int _crowWant; + std::vector _vset; + std::string _errMessage; + + // Column names. + std::vector _names; + std::vector _columns; + + // Set of all key column indexes. + std::unordered_set _columnToKeyMap; + std::vector*> _vKeyValues; + + static MANAGED_CALLBACK(void) SetR4(EnvironmentBlock *env, int col, long m, long n, float value) + { + PyColumn* colObject = dynamic_cast*>(env->_columns[col]); + assert(colObject != nullptr); + colObject->SetAt(m, n, value); + } + static MANAGED_CALLBACK(void) SetR8(EnvironmentBlock *env, int col, long m, long n, double value) + { + PyColumn* colObject = dynamic_cast*>(env->_columns[col]); + assert(colObject != nullptr); + colObject->SetAt(m, n, value); + } + static MANAGED_CALLBACK(void) SetBL(EnvironmentBlock *env, int col, long m, long n, signed char value) + { + PyColumn* colObject = dynamic_cast*>(env->_columns[col]); + assert(colObject != nullptr); + colObject->SetAt(m, n, value); + if (value < 0) + env->_columns[col]->SetKind(-1); + } + static MANAGED_CALLBACK(void) SetI1(EnvironmentBlock *env, int col, long m, long n, signed char value) + { + PyColumn* colObject = dynamic_cast*>(env->_columns[col]); + assert(colObject != nullptr); + colObject->SetAt(m, n, value); + } + static MANAGED_CALLBACK(void) SetI2(EnvironmentBlock *env, int col, long m, long n, short value) + { + PyColumn* colObject = dynamic_cast*>(env->_columns[col]); + assert(colObject != nullptr); + colObject->SetAt(m, n, value); + } + static MANAGED_CALLBACK(void) SetI4(EnvironmentBlock *env, int col, long m, long n, int value) + { + PyColumn* colObject = dynamic_cast*>(env->_columns[col]); + assert(colObject != nullptr); + colObject->SetAt(m, n, value); + } + static MANAGED_CALLBACK(void) SetI8(EnvironmentBlock *env, int col, long m, long n, CxInt64 value) + { + PyColumn* colObject = dynamic_cast*>(env->_columns[col]); + assert(colObject != nullptr); + colObject->SetAt(m, n, value); + } + static MANAGED_CALLBACK(void) SetU1(EnvironmentBlock *env, int col, long m, long n, unsigned char value) + { + PyColumn* colObject = dynamic_cast*>(env->_columns[col]); + assert(colObject != nullptr); + colObject->SetAt(m, n, value); + } + static MANAGED_CALLBACK(void) SetU2(EnvironmentBlock *env, int col, long m, long n, unsigned short value) + { + PyColumn* colObject = dynamic_cast*>(env->_columns[col]); + assert(colObject != nullptr); + colObject->SetAt(m, n, value); + } + static MANAGED_CALLBACK(void) SetU4(EnvironmentBlock *env, int col, long m, long n, unsigned int value) + { + PyColumn* colObject = dynamic_cast*>(env->_columns[col]); + assert(colObject != nullptr); + colObject->SetAt(m, n, value); + } + static MANAGED_CALLBACK(void) SetU8(EnvironmentBlock *env, int col, long m, long n, CxUInt64 value) + { + PyColumn* colObject = dynamic_cast*>(env->_columns[col]); + assert(colObject != nullptr); + colObject->SetAt(m, n, value); + } + static MANAGED_CALLBACK(void) SetTX(EnvironmentBlock *env, int col, long m, long n, char* value, long length) + { + PyColumn* colObject = dynamic_cast*>(env->_columns[col]); + assert(colObject != nullptr); + colObject->SetAt(m, n, std::string(value, length)); + } + static MANAGED_CALLBACK(void) SetKeyValue(EnvironmentBlock *env, int keyColumnIndex, int keyCode, char* value, long length) + 
{ + assert(keyColumnIndex < env->_vKeyValues.size()); + PyColumn* keyNamesObject = env->_vKeyValues[keyColumnIndex]; + keyNamesObject->SetAt(keyCode, 0, std::string(value, length)); + } }; @@ -208,32 +200,32 @@ class CLASS_ALIGN EnvironmentBlock inline void FillDead(int& x) { - assert(sizeof(int) == 4); - x = BAD_QUAD; + assert(sizeof(int) == 4); + x = BAD_QUAD; } inline void FillDead(CxInt64& x) { - assert(sizeof(CxInt64) == 8); - assert(sizeof(int) == 4); - ((int *)&x)[0] = BAD_QUAD; - ((int *)&x)[1] = BAD_QUAD; + assert(sizeof(CxInt64) == 8); + assert(sizeof(int) == 4); + ((int *)&x)[0] = BAD_QUAD; + ((int *)&x)[1] = BAD_QUAD; } template inline void FillDead(T*& x) { - assert(sizeof(T*) == 8); - assert(sizeof(int) == 4); - ((int *)&x)[0] = BAD_QUAD; - ((int *)&x)[1] = BAD_QUAD; + assert(sizeof(T*) == 8); + assert(sizeof(int) == 4); + ((int *)&x)[0] = BAD_QUAD; + ((int *)&x)[1] = BAD_QUAD; } struct MlNetExecutionError : std::exception { - MlNetExecutionError(const char *message) : msg_(message) { } - virtual char const *what() const noexcept { return msg_.c_str(); } + MlNetExecutionError(const char *message) : msg_(message) { } + virtual char const *what() const noexcept { return msg_.c_str(); } private: - std::string msg_; + std::string msg_; }; diff --git a/src/NativeBridge/PythonInterop.cpp b/src/NativeBridge/PythonInterop.cpp index c0a833b7..f2d4ee87 100644 --- a/src/NativeBridge/PythonInterop.cpp +++ b/src/NativeBridge/PythonInterop.cpp @@ -1,55 +1,339 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT license. +#include #include "stdafx.h" #include "PythonInterop.h" -PythonObjectBase::PythonObjectBase(const int& kind) + +inline void destroyManagerCObject(PyObject* obj) { + auto* b = static_cast(PyCapsule_GetPointer(obj, NULL)); + if (b) { delete b; } +} + + +PyColumnBase::PyColumnBase(const int& kind) { - _kind = kind; + _kind = kind; } -PythonObjectBase::~PythonObjectBase() +PyColumnBase::~PyColumnBase() { } -PythonObjectBase::creation_map* PythonObjectBase::m_pCreationMap = PythonObjectBase::CreateMap(); +PyColumnBase::creation_map* PyColumnBase::m_pSingleCreationMap = PyColumnBase::CreateSingleMap(); +PyColumnBase::creation_map* PyColumnBase::m_pVariableCreationMap = PyColumnBase::CreateVariableMap(); + +PyColumnBase::creation_map* PyColumnBase::CreateSingleMap() +{ + PyColumnBase::creation_map* map = new PyColumnBase::creation_map(); + + map->insert(creation_map_entry(BL, CreateSingle)); + map->insert(creation_map_entry(I1, CreateSingle)); + map->insert(creation_map_entry(I2, CreateSingle)); + map->insert(creation_map_entry(I4, CreateSingle)); + map->insert(creation_map_entry(I8, CreateSingle)); + map->insert(creation_map_entry(U1, CreateSingle)); + map->insert(creation_map_entry(U2, CreateSingle)); + map->insert(creation_map_entry(U4, CreateSingle)); + map->insert(creation_map_entry(U8, CreateSingle)); + map->insert(creation_map_entry(R4, CreateSingle)); + map->insert(creation_map_entry(R8, CreateSingle)); + map->insert(creation_map_entry(TX, CreateSingle)); + map->insert(creation_map_entry(DT, CreateSingle)); + return map; +} + +PyColumnBase::creation_map* PyColumnBase::CreateVariableMap() +{ + PyColumnBase::creation_map* map = new PyColumnBase::creation_map(); + + map->insert(creation_map_entry(BL, CreateVariable)); + map->insert(creation_map_entry(I1, CreateVariable)); + map->insert(creation_map_entry(I2, CreateVariable)); + map->insert(creation_map_entry(I4, CreateVariable)); + map->insert(creation_map_entry(I8, CreateVariable)); + 
map->insert(creation_map_entry(U1, CreateVariable)); + map->insert(creation_map_entry(U2, CreateVariable)); + map->insert(creation_map_entry(U4, CreateVariable)); + map->insert(creation_map_entry(U8, CreateVariable)); + map->insert(creation_map_entry(R4, CreateVariable)); + map->insert(creation_map_entry(R8, CreateVariable)); + map->insert(creation_map_entry(TX, CreateVariable)); + return map; +} + +PyColumnBase* PyColumnBase::Create(const int& kind, size_t numRows, size_t numCols) +{ + if (numCols == 0) + { + creation_map::iterator found = m_pVariableCreationMap->find(kind); + if (found != m_pVariableCreationMap->end()) + return found->second(kind, numRows); + } + else + { + creation_map::iterator found = m_pSingleCreationMap->find(kind); + if (found != m_pSingleCreationMap->end()) + return found->second(kind, numRows); + } + + std::stringstream message; + message << "Columns of kind " << kind << " are not supported."; + throw std::invalid_argument(message.str().c_str()); +} -PythonObjectBase::creation_map* PythonObjectBase::CreateMap() +template PyColumnBase* PyColumnBase::CreateSingle(const int& kind, size_t nRows) { - PythonObjectBase::creation_map* map = new PythonObjectBase::creation_map(); + return new PyColumnSingle(kind, nRows); +} - map->insert(creation_map_entry(BL, CreateObject)); - map->insert(creation_map_entry(I1, CreateObject)); - map->insert(creation_map_entry(I2, CreateObject)); - map->insert(creation_map_entry(I4, CreateObject)); - map->insert(creation_map_entry(I8, CreateObject)); - map->insert(creation_map_entry(U1, CreateObject)); - map->insert(creation_map_entry(U2, CreateObject)); - map->insert(creation_map_entry(U4, CreateObject)); - map->insert(creation_map_entry(U8, CreateObject)); - map->insert(creation_map_entry(R4, CreateObject)); - map->insert(creation_map_entry(R8, CreateObject)); - map->insert(creation_map_entry(TX, CreateObject)); - return map; +template PyColumnBase* PyColumnBase::CreateVariable(const int& kind, size_t nRows) +{ + return new PyColumnVariable(kind, nRows); } -PythonObjectBase* PythonObjectBase::CreateObject(const int& kind, size_t numRows, size_t numCols) +template +void PyColumnSingle::AddToDict(bp::dict& dict, + const std::string& name, + const std::vector* keyNames, + const size_t expectedRows) { - creation_map::iterator found = m_pCreationMap->find(kind); + auto* data = _pData->data(); - if (found == m_pCreationMap->end()) - { - std::stringstream message; - message << "Columns of kind " << kind << " are not supported."; - throw std::invalid_argument(message.str().c_str()); - } + switch (this->_kind) + { + case DataKind::BL: + { + bp::handle<> h(::PyCapsule_New((void*)this, NULL, (PyCapsule_Destructor)&destroyManagerCObject)); + dict[name] = np::from_data( + data, + np::dtype::get_builtin(), + bp::make_tuple(_pData->size()), + bp::make_tuple(sizeof(bool)), bp::object(h)); + } + break; + case DataKind::I1: + case DataKind::I2: + case DataKind::I4: + { + bp::handle<> h(::PyCapsule_New((void*)this, NULL, (PyCapsule_Destructor)&destroyManagerCObject)); + np::ndarray npdata = np::from_data( + data, + np::dtype::get_builtin(), + bp::make_tuple(_pData->size()), + bp::make_tuple(sizeof(float)), bp::object(h)); + if (keyNames == nullptr) + { + dict[name] = npdata; + } + else + { + dict[name] = bp::dict(); + dict[name]["..Data"] = npdata; + bp::list list; + for (int j = 0; j < keyNames->size(); j++) + { + bp::object obj; + const std::string& value = keyNames->at(j); + if (!value.empty()) + { + obj = bp::object(value); + } + list.append(obj); + } 
+ dict[name]["..KeyValues"] = list; + } + } + break; + case DataKind::I8: + case DataKind::U1: + case DataKind::U2: + case DataKind::U4: + case DataKind::U8: + case DataKind::R4: + case DataKind::R8: + { + bp::handle<> h(::PyCapsule_New((void*)this, NULL, (PyCapsule_Destructor)&destroyManagerCObject)); + dict[name] = np::from_data( + data, + np::dtype::get_builtin(), + bp::make_tuple(_pData->size()), + bp::make_tuple(sizeof(T)), bp::object(h)); + } + break; + case DataKind::DT: + { + bp::handle<> h(::PyCapsule_New((void*)this, NULL, (PyCapsule_Destructor)&destroyManagerCObject)); + np::ndarray npdata = np::from_data( + data, + np::dtype::get_builtin(), + bp::make_tuple(_pData->size()), + bp::make_tuple(sizeof(T)), bp::object(h)); - return found->second(kind, numRows, numCols); + dict[name] = bp::dict(); + dict[name]["..DateTime"] = npdata; + } + break; + } } -template PythonObjectBase* PythonObjectBase::CreateObject(const int& kind, size_t nRows, size_t nColumns) +template <> +void PyColumnSingle::AddToDict(bp::dict& dict, + const std::string& name, + const std::vector* keyNames, + const size_t expectedRows) { - return new PythonObject(kind, nRows, nColumns); + bp::list list; + for (size_t i = 0; i < _pData->size(); i++) + { + bp::object obj; + const std::string& value = _pData->at(i); + if (!value.empty()) + { + obj = bp::object(value); + } + list.append(obj); + } + dict[name] = list; } +template +void PyColumnVariable::SetAt(size_t nRow, size_t nCol, const T& value) +{ + if ((nRow + 1) > _numRows) _numRows = nRow + 1; + + /* + * Make sure there are enough columns for the request. + */ + for (size_t i = _data.size(); i <= nCol; i++) + { + _data.push_back(new std::vector()); + } + + std::vector* pColData = _data[nCol]; + + /* + * Fill in any missing row values. + */ + for (size_t i = pColData->size(); i < nRow; i++) + { + pColData->push_back(GetMissingValue()); + } + + pColData->push_back(GetConvertedValue(value)); +} + +/* + * Note: an instance of this object should not be used + * and should be considered invalid after the first time + * this method has been called. + */ +template +void PyColumnVariable::Deleter(PyObject* obj) +{ + auto* deleteData = static_cast::DeleteData*>(PyCapsule_GetPointer(obj, NULL)); + + PyColumnVariable* instance = deleteData->instance; + size_t column = deleteData->column; + + std::vector* data = instance->_data[column]; + if (data != nullptr) + { + instance->_data[column] = nullptr; + instance->_numDeletedColumns++; + delete data; + + if (instance->_numDeletedColumns == instance->_data.size()) + { + delete instance; + } + } +} + +template +void PyColumnVariable::AddToDict(bp::dict& dict, + const std::string& name, + const std::vector* keyNames, + const size_t expectedRows) +{ + size_t numRows = (expectedRows > _numRows) ? expectedRows : _numRows; + size_t numCols = _data.size(); + + if (numCols == 0) + { + /* + * If there were no values set then create a + * column so it can be filled with missing values. + */ + _data.push_back(new std::vector()); + numCols = 1; + } + + const std::string colNameBase = name + "."; + int maxDigits = (int)ceil(log10(numCols)); + if (maxDigits == 0) maxDigits = 1; + + for (size_t i = 0; i < numCols; i++) + { + std::vector* pColData = _data[i]; + + /* + * Make sure all the columns are the same length. 
+ */ + for (size_t j = pColData->size(); j < numRows; j++) + { + pColData->push_back(GetMissingValue()); + } + + std::string colName = std::to_string(i); + colName = std::string(maxDigits - colName.length(), '0') + colName; + colName = colNameBase + colName; + + AddColumnToDict(dict, colName, i); + } +} + +template +void PyColumnVariable::AddColumnToDict(bp::dict& dict, + const std::string& name, + size_t index) +{ + auto* data = _data[index]->data(); + + DeleteData* deleteData = new DeleteData(); + deleteData->instance = this; + deleteData->column = index; + + bp::handle<> h(::PyCapsule_New((void*)deleteData, NULL, (PyCapsule_Destructor)&Deleter)); + dict[name] = np::from_data( + data, + np::dtype::get_builtin(), + bp::make_tuple(_data[index]->size()), + bp::make_tuple(sizeof(T2)), bp::object(h)); +} + +template<> +void PyColumnVariable::AddColumnToDict(bp::dict& dict, + const std::string& name, + size_t index) +{ + bp::list list; + std::vector* pColData = _data[index]; + size_t numRows = pColData->size(); + + for (size_t i = 0; i < numRows; i++) + { + bp::object obj; + NullableString value = pColData->at(i); + + if (value) + { + obj = bp::object(*value); + } + + list.append(obj); + } + + dict[name] = list; +} diff --git a/src/NativeBridge/PythonInterop.h b/src/NativeBridge/PythonInterop.h index 9654476a..8929ae39 100644 --- a/src/NativeBridge/PythonInterop.h +++ b/src/NativeBridge/PythonInterop.h @@ -2,107 +2,229 @@ // Licensed under the MIT license. #pragma once - #include +#include +#include + // Taken from ML.NET source code. These values should be stable. enum DataKind { - I1 = 1, - U1 = 2, - I2 = 3, - U2 = 4, - I4 = 5, - U4 = 6, - I8 = 7, - U8 = 8, - R4 = 9, - R8 = 10, - TX = 11, - BL = 12, - TS = 13, - DT = 14, - DZ = 15, + I1 = 1, + U1 = 2, + I2 = 3, + U2 = 4, + I4 = 5, + U4 = 6, + I8 = 7, + U8 = 8, + R4 = 9, + R8 = 10, + TX = 11, + BL = 12, + TS = 13, + DT = 14, + DZ = 15, }; -class PythonObjectBase +class PyColumnBase { private: - typedef std::map creation_map; - typedef std::pair creation_map_entry; + typedef std::map creation_map; + typedef std::pair creation_map_entry; - static creation_map* m_pCreationMap; - static creation_map* CreateMap(); + static creation_map* m_pSingleCreationMap; + static creation_map* CreateSingleMap(); - template static PythonObjectBase* CreateObject(const int& name, size_t nRows, size_t nColumns); + static creation_map* m_pVariableCreationMap; + static creation_map* CreateVariableMap(); + + template static PyColumnBase* CreateSingle(const int& kind, size_t nRows); + template static PyColumnBase* CreateVariable(const int& kind, size_t nRows); protected: - int _kind; + int _kind; public: - PythonObjectBase(const int& kind); - static PythonObjectBase* CreateObject(const int& kind, size_t numRows, size_t numCols); - const int& GetKind() const; - void SetKind(int kind); - virtual ~PythonObjectBase(); + static PyColumnBase* Create(const int& kind, size_t numRows, size_t numCols); + + PyColumnBase(const int& kind); + virtual ~PyColumnBase(); + + const int& GetKind() const { return _kind; } + void SetKind(int kind) { _kind = kind; } + + virtual size_t GetNumRows() = 0; + virtual size_t GetNumCols() = 0; }; -inline const int& PythonObjectBase::GetKind() const -{ - return _kind; -} -inline void PythonObjectBase::SetKind(int kind) +/* + * Template typed abstract base class which provides + * the required interface for all derived classes. 
+ */ +template +class PyColumn : public PyColumnBase { - _kind = kind; -} +public: + PyColumn(const int& kind) : PyColumnBase(kind) {} + virtual ~PyColumn() {} + virtual void SetAt(size_t nRow, size_t nCol, const T& value) = 0; + virtual void AddToDict(bp::dict& dict, + const std::string& name, + const std::vector* keyNames, + const size_t expectedRows) = 0; +}; +/* + * Handles the single value case. + */ template -class PythonObject : public PythonObjectBase +class PyColumnSingle : public PyColumn { protected: - std::vector* _pData; - - size_t _numRows; - size_t _numCols; + std::vector* _pData; public: - PythonObject(const int& kind, size_t numRows = 1, size_t numCols = 1); - virtual ~PythonObject(); - void SetAt(size_t nRow, size_t nCol, const T& value); - const std::vector* GetData() const; + PyColumnSingle(const int& kind, size_t numRows = 0); + virtual ~PyColumnSingle(); + virtual void SetAt(size_t nRow, size_t nCol, const T& value); + virtual void AddToDict(bp::dict& dict, + const std::string& name, + const std::vector* keyNames, + const size_t expectedRows); + virtual size_t GetNumRows(); + virtual size_t GetNumCols(); + const std::vector* GetData() const { return _pData; } }; template -inline PythonObject::PythonObject(const int& kind, size_t numRows, size_t numCols) - : PythonObjectBase(kind) +inline PyColumnSingle::PyColumnSingle(const int& kind, size_t numRows) + : PyColumn(kind) { - _numRows = numRows; - _numCols = numCols; + _pData = new std::vector(); + if (numRows > 0) { + _pData->reserve(numRows); + } +} - _pData = new std::vector(); - if (_numRows > 0) - _pData->reserve(_numRows*_numCols); +template +inline PyColumnSingle::~PyColumnSingle() +{ + delete _pData; } template -inline PythonObject::~PythonObject() +inline void PyColumnSingle::SetAt(size_t nRow, size_t nCol, const T& value) { - delete _pData; + if (_pData->size() <= nRow) + _pData->resize(nRow + 1); + _pData->at(nRow) = value; } template -inline void PythonObject::SetAt(size_t nRow, size_t nCol, const T& value) +inline size_t PyColumnSingle::GetNumRows() { - size_t index = nRow*_numCols + nCol; - if (_pData->size() <= index) - _pData->resize(index + 1); - _pData->at(index) = value; + return _pData->size(); } template -inline const std::vector* PythonObject::GetData() const +inline size_t PyColumnSingle::GetNumCols() +{ + return 1; +} + + +typedef boost::optional NullableString; + +/* + * Handles the variable value case. 
+ */ +template +class PyColumnVariable : public PyColumn { - return _pData; -} \ No newline at end of file +private: + std::vector*> _data; + + size_t _numRows; + size_t _numDeletedColumns; + +public: + PyColumnVariable(const int& kind, size_t numRows = 0); + virtual ~PyColumnVariable(); + virtual void SetAt(size_t nRow, size_t nCol, const T& value); + virtual void AddToDict(bp::dict& dict, + const std::string& name, + const std::vector* keyNames, + const size_t expectedRows); + virtual size_t GetNumRows(); + virtual size_t GetNumCols(); + + T2 GetMissingValue(); + T2 GetConvertedValue(const T& value); + + void AddColumnToDict(bp::dict& dict, const std::string& name, size_t index); + +public: + typedef struct + { + PyColumnVariable* instance; + size_t column; + } DeleteData; + + static void Deleter(PyObject* obj); +}; + +template +inline PyColumnVariable::PyColumnVariable(const int& kind, size_t numRows) + : PyColumn(kind), + _numRows(numRows), + _numDeletedColumns(0) +{ +} + +template +inline PyColumnVariable::~PyColumnVariable() +{ + for (unsigned int i = 0; i < _data.size(); i++) + { + if (_data[i] != nullptr) delete _data[i]; + } +} + +template +inline size_t PyColumnVariable::GetNumRows() +{ + return _numRows; +} + +template +inline size_t PyColumnVariable::GetNumCols() +{ + return _data.size(); +} + +template +inline T2 PyColumnVariable::GetMissingValue() +{ + return NAN; +} + +template +inline T2 PyColumnVariable::GetConvertedValue(const T& value) +{ + return (T2)value; +} + +template <> +inline NullableString PyColumnVariable::GetMissingValue() +{ + return boost::none; +} + +template <> +inline NullableString PyColumnVariable::GetConvertedValue(const std::string& value) +{ + return value; +} diff --git a/src/NativeBridge/UnixInterface.h b/src/NativeBridge/UnixInterface.h index 0a7c1155..bb2c7fd5 100644 --- a/src/NativeBridge/UnixInterface.h +++ b/src/NativeBridge/UnixInterface.h @@ -24,7 +24,7 @@ #define CORECLR_SHUTDOWN "coreclr_shutdown" #define DOTNETBRIDGE "DotNetBridge" -#define DOTNETBRIDGE_FQDN "Microsoft.MachineLearning.DotNetBridge.Bridge" +#define DOTNETBRIDGE_FQDN "Microsoft.ML.DotNetBridge.Bridge" #define GET_FN "GetFn" @@ -151,7 +151,7 @@ class UnixMlNetInterface std::string libsroot(mlnetpath); std::string coreclrdir(coreclrpath); - if (strlen(dpreppath) == 0) + if (strlen(dpreppath) == 0) { dpreppath = mlnetpath; } diff --git a/src/NativeBridge/WinInterface.h b/src/NativeBridge/WinInterface.h index 4f5238db..3548b578 100644 --- a/src/NativeBridge/WinInterface.h +++ b/src/NativeBridge/WinInterface.h @@ -302,7 +302,7 @@ class WinMlNetInterface HRESULT hr = host->CreateDelegate( _domainId, W("DotNetBridge"), - W("Microsoft.MachineLearning.DotNetBridge.Bridge"), + W("Microsoft.ML.DotNetBridge.Bridge"), W("GetFn"), &getter); if (FAILED(hr)) diff --git a/src/NativeBridge/dllmain.cpp b/src/NativeBridge/dllmain.cpp index 0dafd696..1a7a297d 100644 --- a/src/NativeBridge/dllmain.cpp +++ b/src/NativeBridge/dllmain.cpp @@ -7,6 +7,7 @@ #include "ManagedInterop.h" #define PARAM_SEED "seed" +#define PARAM_MAX_SLOTS "max_slots" #define PARAM_GRAPH "graph" #define PARAM_VERBOSE "verbose" #define PARAM_MLNET_PATH "mlnetPath" @@ -75,15 +76,15 @@ bp::dict pxCall(bp::dict& params) bp::extract mlnetPath(params[PARAM_MLNET_PATH]); bp::extract dotnetClrPath(params[PARAM_DOTNETCLR_PATH]); bp::extract dprepPath(params[PARAM_DPREP_PATH]); - bp::extract pythonPath(params[PARAM_PYTHON_PATH]); - bp::extract verbose(params[PARAM_VERBOSE]); + bp::extract pythonPath(params[PARAM_PYTHON_PATH]); + 
bp::extract verbose(params[PARAM_VERBOSE]); std::int32_t i_verbose = std::int32_t(verbose); std::string s_mlnetPath = std::string(mlnetPath); std::string s_dotnetClrPath = std::string(dotnetClrPath); std::string s_dprepPath = std::string(dprepPath); std::string s_pythonPath = std::string(pythonPath); - std::string s_graph = std::string(graph); - const char *mlnetpath = s_mlnetPath.c_str(); + std::string s_graph = std::string(graph); + const char *mlnetpath = s_mlnetPath.c_str(); const char *coreclrpath = s_dotnetClrPath.c_str(); const char *dpreppath = s_dprepPath.c_str(); @@ -96,7 +97,11 @@ bp::dict pxCall(bp::dict& params) if (params.has_key(PARAM_SEED)) seed = bp::extract(params[PARAM_SEED]); - EnvironmentBlock env(i_verbose, 0, seed, s_pythonPath.c_str()); + int maxSlots = -1; + if (params.has_key(PARAM_MAX_SLOTS)) + maxSlots = bp::extract(params[PARAM_MAX_SLOTS]); + + EnvironmentBlock env(i_verbose, maxSlots, seed, s_pythonPath.c_str()); int retCode; if (params.has_key(PARAM_DATA) && bp::extract(params[PARAM_DATA]).check()) { @@ -112,8 +117,7 @@ bp::dict pxCall(bp::dict& params) res = env.GetData(); if (retCode == -1) - // REVIEW: get the content of IChannel and add it the the error message. - throw std::runtime_error("Returned code is -1. Check the log for error messages."); + throw std::runtime_error(env.GetErrorMessage()); } catch (const std::exception& e) { diff --git a/src/NativeBridge/stdafx.h b/src/NativeBridge/stdafx.h index f5fe57f1..91c2f2fb 100644 --- a/src/NativeBridge/stdafx.h +++ b/src/NativeBridge/stdafx.h @@ -81,7 +81,7 @@ class StopWatch ~StopWatch() { auto endTime = std::chrono::high_resolution_clock::now(); - + std::stringstream buffer; buffer << m_description << ":" << ((endTime - m_startTime).count() / 1000000) << " msecs" << std::endl; diff --git a/src/Platforms/build.csproj b/src/Platforms/build.csproj index 75fa806f..3db67054 100644 --- a/src/Platforms/build.csproj +++ b/src/Platforms/build.csproj @@ -11,19 +11,19 @@ - - - - - - - - - - - + + + + + + + + + + + - + diff --git a/src/python/nimbusml.pyproj b/src/python/nimbusml.pyproj index a0ac2115..df43c116 100644 --- a/src/python/nimbusml.pyproj +++ b/src/python/nimbusml.pyproj @@ -67,9 +67,7 @@ - - Code - + @@ -93,6 +91,9 @@ + + + @@ -102,6 +103,7 @@ + @@ -120,6 +122,7 @@ + @@ -172,11 +175,14 @@ + + + @@ -295,11 +301,15 @@ + + + + @@ -337,6 +347,7 @@ + @@ -384,6 +395,8 @@ + + @@ -421,9 +434,11 @@ + + @@ -442,6 +457,7 @@ + @@ -613,6 +629,7 @@ + @@ -626,9 +643,11 @@ + + @@ -637,11 +656,13 @@ + + @@ -665,9 +686,20 @@ + + + + + + + + + + + diff --git a/src/python/nimbusml/__init__.py b/src/python/nimbusml/__init__.py index 3abbc2ef..55a92107 100644 --- a/src/python/nimbusml/__init__.py +++ b/src/python/nimbusml/__init__.py @@ -2,7 +2,7 @@ Microsoft Machine Learning for Python """ -__version__ = '1.3.1' +__version__ = '1.5.0' # CoreCLR version of MicrosoftML is built on Windows. # But file permissions are not preserved when it's copied to Linux. 
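For orientation, the `pxCall` changes above read the new `PARAM_MAX_SLOTS` ("max_slots") entry from the same flat params dict as the existing `"graph"`, `"verbose"`, and `"seed"` keys, and forward it to `EnvironmentBlock`. Below is a minimal sketch of such a dict from the Python side; the placeholder values are assumptions, and the remaining path keys the bridge expects are omitted.

```python
# Illustrative only: placeholder values; the library builds this dict internally.
bridge_params = {
    "graph": '{"nodes": []}',      # serialized entry-point graph (PARAM_GRAPH)
    "verbose": 1,                  # controls the message-kind mask (PARAM_VERBOSE)
    "seed": 42,                    # random seed (PARAM_SEED)
    "max_slots": 1000,             # new PARAM_MAX_SLOTS: cap slots returned for vector
                                   # columns; <= 0 (the default -1) returns all slots
    "mlnetPath": "/path/to/libs",  # PARAM_MLNET_PATH; other path keys omitted here
}
```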
diff --git a/src/python/nimbusml/_pipeline.py b/src/python/nimbusml/_pipeline.py index 692e1dea..3e0dce27 100644 --- a/src/python/nimbusml/_pipeline.py +++ b/src/python/nimbusml/_pipeline.py @@ -19,6 +19,7 @@ from scipy.sparse import csr_matrix from sklearn.utils.validation import check_X_y, check_array from sklearn.utils.multiclass import unique_labels +from zipfile import ZipFile from .internal.core.base_pipeline_item import BasePipelineItem from .internal.entrypoints.data_customtextloader import \ @@ -38,8 +39,11 @@ from .internal.entrypoints.models_regressionevaluator import \ models_regressionevaluator from .internal.entrypoints.models_summarizer import models_summarizer -from .internal.entrypoints.transforms_datasetscorer import \ - transforms_datasetscorer +from .internal.entrypoints.models_schema import models_schema +from .internal.entrypoints.transforms_datasetscorerex import \ + transforms_datasetscorerex +from .internal.entrypoints.transforms_datasettransformscorer import \ + transforms_datasettransformscorer from .internal.entrypoints.transforms_featurecombiner import \ transforms_featurecombiner from .internal.entrypoints.transforms_featurecontributioncalculationtransformer import \ @@ -56,6 +60,8 @@ transforms_modelcombiner from .internal.entrypoints.transforms_optionalcolumncreator import \ transforms_optionalcolumncreator +from .internal.entrypoints.transforms_permutationfeatureimportance import \ + transforms_permutationfeatureimportance from .internal.entrypoints \ .transforms_predictedlabelcolumnoriginalvalueconverter import \ transforms_predictedlabelcolumnoriginalvalueconverter @@ -67,7 +73,7 @@ from .internal.utils.data_schema import DataSchema from .internal.utils.data_stream import DataStream, ViewDataStream, \ FileDataStream, BinaryDataStream -from .internal.utils.entrypoints import Graph +from .internal.utils.entrypoints import Graph, DataOutputFormat from .internal.utils.schema_helper import _extract_label_column from .internal.utils.utils import trace, unlist @@ -275,7 +281,7 @@ def nodes(self): @property def last_node(self): - if len(self.steps) <= 0: + if not self.steps: raise TypeError("No steps given.") last_step = self.steps[-1] return last_step if not isinstance(last_step, tuple) else \ @@ -561,9 +567,12 @@ def _init_graph_nodes( inputs = OrderedDict([(file_data.replace('$', ''), '')]) # connect transform node inputs/outputs - if feature_columns is None and not isinstance(X, BinaryDataStream): + if feature_columns is None: if schema is None: - schema = DataSchema.read_schema(X) + if isinstance(X, BinaryDataStream): + schema = X.schema + else: + schema = DataSchema.read_schema(X) feature_columns = [c.Name for c in schema] if label_column: # if label_column is a string, remove it from @@ -596,6 +605,14 @@ def _init_graph_nodes( output_data=output_data, output_model=output_model, strategy_iosklearn=strategy_iosklearn) + + for node in enumerate([n for n in transform_nodes + if n.name == 'Models.DatasetTransformer']): + input_name = 'dataset_transformer_model' + str(node[0]) + inputs[input_name] = node[1].inputs['TransformModel'] + node[1].inputs['TransformModel'] = '$' + input_name + node[1].input_variables.add(node[1].inputs['TransformModel']) + graph_nodes['transform_nodes'] = transform_nodes return graph_nodes, feature_columns, inputs, transform_nodes, \ columns_out @@ -644,8 +661,7 @@ def _update_graph_nodes_for_learner( else: raise NotImplementedError( "Strategy '{0}' to handle unspecified inputs is not " - "implemented".format( - strategy_iosklearn)) + 
"implemented".format(strategy_iosklearn)) if label_column is not None or last_node._use_role(Role.Label): if getattr(last_node, 'label_column_name_', None): @@ -668,8 +684,7 @@ def _update_graph_nodes_for_learner( last_node.label_column_name = None label_column = None - if weight_column is not None or last_node._use_role( - Role.Weight): + if weight_column is not None or last_node._use_role(Role.Weight): if getattr(last_node, 'example_weight_column_name', None): weight_column = last_node.example_weight_column_name elif weight_column: @@ -681,8 +696,7 @@ def _update_graph_nodes_for_learner( if (hasattr(last_node, 'row_group_column_name_') and last_node.row_group_column_name_ is not None): group_id_column = last_node.row_group_column_name_ - elif (hasattr(last_node, - 'row_group_column_name') and + elif (hasattr(last_node, 'row_group_column_name') and last_node.row_group_column_name is not None): group_id_column = last_node.row_group_column_name else: @@ -704,10 +718,8 @@ def _update_graph_nodes_for_learner( # todo: ideally all the nodes have the same name for params # so we dont have to distinguish if its learner or - # transformer. We will supply - # input_data, output_data & output_model vars. Its up to - # node to - # use suplied vars + # transformer. We will supply input_data, output_data and + # output_model vars. Its up to node to use suplied vars. learner_node = last_node._get_node( feature_column_name=learner_features, training_data=output_data, @@ -734,6 +746,7 @@ def _fit_graph(self, X, y, verbose, **params): output_binary_data_stream = params.pop( 'output_binary_data_stream', False) params.pop('parallel', None) + do_output_predictor_model = params.pop('output_predictor_model', None) X, y, columns_renamed, feature_columns, label_column, schema, \ weights, weight_column = self._preprocess_X_y(X, y, weights) @@ -748,6 +761,7 @@ def _fit_graph(self, X, y, verbose, **params): input_data = "$input_data" output_data = "$output_data" output_model = "$output_model" + output_predictor_model = "$output_predictor_model" predictor_model = "$predictor_model" graph_nodes, feature_columns, inputs, transform_nodes, \ @@ -762,10 +776,13 @@ def _fit_graph(self, X, y, verbose, **params): self._update_graph_nodes_for_learner( graph_nodes, transform_nodes, - columns_out, label_column, + columns_out, + label_column, weight_column, - output_data, output_model, - predictor_model, y, + output_data, + output_model, + predictor_model, + y, strategy_iosklearn=strategy_iosklearn) # graph_nodes contain graph sections, which is needed for CV. 
@@ -775,19 +792,33 @@ def _fit_graph(self, X, y, verbose, **params): graph_nodes = list(itertools.chain(*graph_nodes.values())) # combine output models - transform_models = [node.outputs["Model"] - for node in graph_nodes if - "Model" in node.outputs] - if learner_node and len( - transform_models) > 0: # no need to combine if there is - # only 1 model + transform_models = [] + for node in graph_nodes: + if node.name == 'Models.DatasetTransformer': + transform_models.append(node.inputs['TransformModel']) + elif "Model" in node.outputs: + transform_models.append(node.outputs["Model"]) + # no need to combine if there is only 1 model + if learner_node and len(transform_models) > 0: combine_model_node = transforms_manyheterogeneousmodelcombiner( transform_models=transform_models, - predictor_model=( - predictor_model if learner_node else None), + predictor_model=predictor_model, model=output_model) combine_model_node._implicit = True graph_nodes.append(combine_model_node) + if do_output_predictor_model: + # get implicit_nodes and build predictor model only + implicit_nodes = graph_sections['implicit_nodes'] + implicit_transform_models = [] + for node in implicit_nodes: + if "Model" in node.outputs: + implicit_transform_models.append(node.outputs["Model"]) + output_predictor_model_node = transforms_manyheterogeneousmodelcombiner( + transform_models=implicit_transform_models, + predictor_model=predictor_model, + model=output_predictor_model) + output_predictor_model_node._implicit = True + graph_nodes.append(output_predictor_model_node) elif len(transform_models) > 1: combine_model_node = transforms_modelcombiner( models=transform_models, @@ -796,12 +827,13 @@ def _fit_graph(self, X, y, verbose, **params): graph_nodes.append(combine_model_node) elif len(graph_nodes) == 0: raise RuntimeError( - "Unable to process the pipeline len(transform_models)={" - "0}.".format( - len(transform_models))) + "Unable to process the pipeline len(transform_models)={0}.". + format(len(transform_models))) # create the graph outputs = OrderedDict([(output_model.replace('$', ''), '')]) + if do_output_predictor_model: + outputs[output_predictor_model.replace('$', '')] = '' # REVIEW: ideally we should remove output completely from the # graph if its not needed # however graph validation logic prevents doing that at the moment, @@ -809,10 +841,18 @@ def _fit_graph(self, X, y, verbose, **params): if learner_node is None: # last node is transformer outputs[output_data.replace( '$', '')] = '' if do_fit_transform else '' + + data_output_format = DataOutputFormat.DF + if do_fit_transform: + if output_binary_data_stream: + data_output_format = DataOutputFormat.IDV + elif params.pop('as_csr', False): + data_output_format = DataOutputFormat.CSR + graph = Graph( inputs, outputs, - do_fit_transform and output_binary_data_stream, + data_output_format, *(graph_nodes)) # Checks that every parameter in params was used. @@ -1131,10 +1171,9 @@ def move_information_about_roles_once_used(): # run the graph # REVIEW: we should have the possibility to keep the model in - # memory - # and not in a file. + # memory and not in a file. 
try: - (out_model, out_data, out_metrics) = graph.run( + (out_model, out_data, out_metrics, out_predictor_model) = graph.run( X=X, y=y, random_state=self.random_state, @@ -1160,6 +1199,8 @@ def move_information_about_roles_once_used(): move_information_about_roles_once_used() self.graph_ = graph self.model = out_model + if out_predictor_model: + self.predictor_model = out_predictor_model self.data = out_data # stop the clock self._run_time = time.time() - start_time @@ -1371,9 +1412,7 @@ def _process_learner( optional_node = transforms_optionalcolumncreator( column=[label], data="$input_data" if num_transforms == 0 else - output_data + - str( - num_transforms), + output_data + str(num_transforms), output_data="$optional_data", model=output_model + str(num_transforms + 1)) optional_node._implicit = True @@ -1381,24 +1420,20 @@ def _process_learner( data="$optional_data", label_column=label, output_data="$label_data", - model=output_model + str( - num_transforms + 2)) + model=output_model + str(num_transforms + 2)) label_node._implicit = True feature_node = transforms_featurecombiner( data="$label_data", features=features, output_data=output_data, - model=output_model + str( - num_transforms + 3)) + model=output_model + str(num_transforms + 3)) feature_node._implicit = True implicit_nodes = [optional_node, label_node, feature_node] elif learner.type in ('classifier', 'ranker'): optional_node = transforms_optionalcolumncreator( column=[label], data="$input_data" if num_transforms == 0 else - output_data + - str( - num_transforms), + output_data + str(num_transforms), output_data="$optional_data", model=output_model + str(num_transforms + 1)) optional_node._implicit = True @@ -1409,25 +1444,20 @@ def _process_learner( text_key_values=False, model=output_model + str(num_transforms + 2)) label_node._implicit = True - feature_node = transforms_featurecombiner( data="$label_data", features=features, output_data=output_data, - model=output_model + str( - num_transforms + 3)) + model=output_model + str(num_transforms + 3)) feature_node._implicit = True implicit_nodes = [optional_node, label_node, feature_node] elif learner.type in {'recommender', 'sequence'}: - raise NotImplementedError( - "Type '{0}' is not implemented yet.".format( - learner.type)) + raise NotImplementedError("Type '{0}' is not implemented yet.". + format(learner.type)) else: feature_node = transforms_featurecombiner( data="$input_data" if num_transforms == 0 else - output_data + - str( - num_transforms), + output_data + str(num_transforms), features=features, output_data=output_data, model=output_model + str(num_transforms + 1)) @@ -1713,20 +1743,20 @@ def get_feature_contributions(self, X, top=10, bottom=10, verbose=0, to report. :param bottom: The number of negative contributions with highest magnitude to report. - :return: dataframe of containing the raw data, predicted label, score, + :return: dataframe containing the raw data, predicted label, score, probabilities, and feature contributions. """ self.verbose = verbose if not self._is_fitted: raise ValueError( - "Model is not fitted. Train or load a model before test().") + "Model is not fitted. 
Train or load a model before.") if len(self.steps) > 0: last_node = self.last_node if last_node.type == 'transform': raise ValueError( - "Pipeline needs a trainer as last step for test()") + "Pipeline needs a trainer as last step.") X, y_temp, columns_renamed, feature_columns, label_column, \ schema, weights, weight_column = self._preprocess_X_y(X) @@ -1742,7 +1772,7 @@ def get_feature_contributions(self, X, top=10, bottom=10, verbose=0, all_nodes = [importtext_node] inputs = dict([('file', ''), ('predictor_model', self.model)]) - score_node = transforms_datasetscorer( + score_node = transforms_datasetscorerex( data="$data", predictor_model="$predictor_model", scored_data="$scoredvectordata") @@ -1759,10 +1789,203 @@ def get_feature_contributions(self, X, top=10, bottom=10, verbose=0, outputs = dict(output_data="") + data_output_format = DataOutputFormat.IDV if as_binary_data_stream \ + else DataOutputFormat.DF, + + graph = Graph( + inputs, + outputs, + data_output_format, + *all_nodes) + + class_name = type(self).__name__ + method_name = inspect.currentframe().f_code.co_name + telemetry_info = ".".join([class_name, method_name]) + + try: + (out_model, out_data, out_metrics, _) = graph.run( + X=X, + random_state=self.random_state, + model=self.model, + verbose=verbose, + telemetry_info=telemetry_info, + **params) + except RuntimeError as e: + raise e + + return out_data + + def get_output_columns(self, verbose=0, **params): + """ + Returns the output list of columns for the fitted model. + :return: list . + """ + self.verbose = verbose + + if not self._is_fitted: + raise ValueError( + "Model is not fitted. Train or load a model before.") + + if len(self.steps) > 0: + last_node = self.last_node + if last_node.type != 'transform': + raise ValueError( + "Pipeline needs a transformer as last step.") + + inputs = dict([('transform_model', self.model)]) + schema_node = models_schema( + transform_model="$transform_model", + schema="$output_data") + all_nodes = [schema_node] + + outputs = dict(output_data="") + + graph = Graph( + inputs, + outputs, + DataOutputFormat.LIST, + *all_nodes) + + try: + (_, out_data, _, _) = graph.run( + X=None, + y=None, + random_state=self.random_state, + model=self.model, + no_input_data=True, + verbose=verbose, + **params) + except RuntimeError as e: + raise e + + return out_data + + @trace + def permutation_feature_importance(self, X, number_of_examples=None, + permutation_count=1, + filter_zero_weight_features=False, + verbose=0, as_binary_data_stream=False, + **params): + """ + Permutation feature importance (PFI) is a technique to determine the + global importance of features in a trained machine learning model. PFI + is a simple yet powerful technique motivated by Breiman in section 10 + of his Random Forests paper (Machine Learning, 2001). The advantage of + the PFI method is that it is model agnostic - it works with any model + that can be evaluated - and it can use any dataset, not just the + training set, to compute feature importance metrics. + + PFI works by taking a labeled dataset, choosing a feature, and + permuting the values for that feature across all the examples, so that + each example now has a random value for the feature and the original + values for all other features. The evaluation metric (e.g. NDCG) is + then calculated for this modified dataset, and the change in the + evaluation metric from the original dataset is computed. The larger the + change in the evaluation metric, the more important the feature is to + the model, i.e. 
the most important features are those that the model is + most sensitive to. PFI works by performing this permutation analysis + across all the features of a model, one after another. + + Note that for increasing metrics (e.g. AUC, accuracy, R-Squared, NDCG), + the most important features will be those with the highest negative + mean change in the metric. Conversely, for decreasing metrics (e.g. + Mean Squared Error, Log loss), the most important features will be + those with the highest positive mean change in the metric. + + PFI is supported for binary classifiers, classifiers, regressors, and + rankers. + + The mean changes and standard errors of the means are evaluated for + the following metrics: + + * Binary Classification: + + * Area under ROC curve (AUC) + * Accuracy + * Positive precision + * Positive recall + * Negative precision + * Negative recall + * F1 score + * Area under Precision-Recall curve (AUPRC) + + * Multiclass classification: + + * Macro accuracy + * Micro accuracy + * Log loss + * Log loss reduction + * Top k accuracy + * Per-class log loss + + * Regression: + + * Mean absolute error (MAE) + * Mean squared error (MSE) + * Root mean squared error (RMSE) + * Loss function + * R-Squared + + * Ranking: + + * Discounted cumulative gains (DCG) @1, @2, and @3 + * Normalized discounted cumulative gains (NDCG) @1, @2, and @3 + + **Reference** + + `Breiman, L. Random Forests. Machine Learning (2001) 45: 5. + `_ + + :param X: {array-like [n_samples, n_features], + :py:class:`nimbusml.FileDataStream` } + :param number_of_examples: Limit the number of examples to evaluate on. + ``'None'`` means all examples in the dataset are used. + :param permutation_count: The number of permutations to perform. + :param filter_zero_weight_features: Pre-filter features with zero weight. PFI + will not be evaluated on these features. + :return: dataframe containing the mean change in evaluation metrics and + standard error of the mean for each feature. Features with the + largest change in a metric are the most important in the model with + respect to that metric. + """ + self.verbose = verbose + + if not self._is_fitted: + raise ValueError( + "Model is not fitted. 
Train or load a model before test().") + + X, _, _, _, _, schema, _, _ = self._preprocess_X_y(X) + + all_nodes = [] + inputs = dict([('data', ''), ('predictor_model', self.model)]) + if isinstance(X, FileDataStream): + importtext_node = data_customtextloader( + input_file="$file", + data="$data", + custom_schema=schema.to_string( + add_sep=True)) + all_nodes = [importtext_node] + inputs = dict([('file', ''), ('predictor_model', self.model)]) + + pfi_node = transforms_permutationfeatureimportance( + data="$data", + predictor_model="$predictor_model", + metrics="$output_data", + permutation_count=permutation_count, + number_of_examples_to_use=number_of_examples, + use_feature_weight_filter=filter_zero_weight_features) + + all_nodes.extend([pfi_node]) + + outputs = dict(output_data="") + + data_output_format = DataOutputFormat.IDV if as_binary_data_stream \ + else DataOutputFormat.DF, + graph = Graph( inputs, outputs, - as_binary_data_stream, + data_output_format, *all_nodes) class_name = type(self).__name__ @@ -1770,7 +1993,7 @@ def get_feature_contributions(self, X, top=10, bottom=10, verbose=0, telemetry_info = ".".join([class_name, method_name]) try: - (out_model, out_data, out_metrics) = graph.run( + (out_model, out_data, out_metrics, _) = graph.run( X=X, random_state=self.random_state, model=self.model, @@ -1780,8 +2003,38 @@ def get_feature_contributions(self, X, top=10, bottom=10, verbose=0, except RuntimeError as e: raise e + out_data = self._fix_pfi_columns(out_data) + return out_data + def _fix_pfi_columns(self, data): + cols = [] + for i in range(len(data.columns)): + if 'StdErr' in data.columns.values[i]: + if data.columns.values[i][:15] == 'PerClassLogLoss' : + cols.append('PerClassLogLoss' + \ + data.columns.values[i][21:] + '.StdErr') + elif data.columns.values[i][:10] == 'Discounted': + pos = int(data.columns.values[i][-1]) + 1 + cols.append('DCG@' + str(pos) + '.StdErr') + elif data.columns.values[i][:10] == 'Normalized': + pos = int(data.columns.values[i][-1]) + 1 + cols.append('NDCG@' + str(pos) + '.StdErr') + else: + cols.append(data.columns.values[i][:-6] + '.StdErr') + else: + if data.columns.values[i][:10] == 'Discounted': + pos = int(data.columns.values[i][26]) + 1 + cols.append('DCG@' + str(pos)) + elif data.columns.values[i][:10] == 'Normalized': + pos = int(data.columns.values[i][36]) + 1 + cols.append('NDCG@' + str(pos)) + else: + cols.append(data.columns.values[i]) + data.columns = cols + + return data + @trace def _predict(self, X, y=None, evaltype='auto', group_id=None, @@ -1816,22 +2069,44 @@ def _predict(self, X, y=None, isinstance(X, DataFrame) and isinstance(y, (str, tuple))): y = y_temp + is_transformer_chain = False + with ZipFile(self.model) as model_zip: + is_transformer_chain = any('TransformerChain' in item + for item in model_zip.namelist()) + all_nodes = [] - inputs = dict([('data', ''), ('predictor_model', self.model)]) - if isinstance(X, FileDataStream): - importtext_node = data_customtextloader( - input_file="$file", + if is_transformer_chain: + inputs = dict([('data', ''), ('transform_model', self.model)]) + if isinstance(X, FileDataStream): + importtext_node = data_customtextloader( + input_file="$file", + data="$data", + custom_schema=schema.to_string( + add_sep=True)) + all_nodes = [importtext_node] + inputs = dict([('file', ''), ('transform_model', self.model)]) + + score_node = transforms_datasettransformscorer( data="$data", - custom_schema=schema.to_string( - add_sep=True)) - all_nodes = [importtext_node] - inputs = dict([('file', ''), 
('predictor_model', self.model)]) - - score_node = transforms_datasetscorer( - data="$data", - predictor_model="$predictor_model", - scored_data="$scoredVectorData") - all_nodes.extend([score_node]) + transform_model="$transform_model", + scored_data="$scoredVectorData") + all_nodes.extend([score_node]) + else: + inputs = dict([('data', ''), ('predictor_model', self.model)]) + if isinstance(X, FileDataStream): + importtext_node = data_customtextloader( + input_file="$file", + data="$data", + custom_schema=schema.to_string( + add_sep=True)) + all_nodes = [importtext_node] + inputs = dict([('file', ''), ('predictor_model', self.model)]) + + score_node = transforms_datasetscorerex( + data="$data", + predictor_model="$predictor_model", + scored_data="$scoredVectorData") + all_nodes.extend([score_node]) if (evaltype in ['binary', 'multiclass']) or \ (hasattr(self, 'steps') @@ -1866,10 +2141,13 @@ def _predict(self, X, y=None, else: outputs = dict(output_data="") + data_output_format = DataOutputFormat.IDV if as_binary_data_stream \ + else DataOutputFormat.DF, + graph = Graph( inputs, outputs, - as_binary_data_stream, + data_output_format, *all_nodes) class_name = type(self).__name__ @@ -1877,7 +2155,7 @@ def _predict(self, X, y=None, telemetry_info = ".".join([class_name, method_name]) try: - (out_model, out_data, out_metrics) = graph.run( + (out_model, out_data, out_metrics, _) = graph.run( X=X, y=y, random_state=self.random_state, @@ -1889,6 +2167,10 @@ def _predict(self, X, y=None, self._run_time = time.time() - start_time raise e + if is_transformer_chain: + out_data['PredictedLabel'] = out_data['PredictedLabel']*1 + + if y is not None: # We need to fix the schema for ranking metrics if evaltype == 'ranking': @@ -1900,7 +2182,7 @@ def _predict(self, X, y=None, return out_data, out_metrics def _extract_classes(self, y): - if ((len(self.steps) > 0) and + if (self.steps and (self.last_node.type in ['classifier', 'anomaly']) and (y is not None) and (not isinstance(y, (str, tuple)))): @@ -1913,7 +2195,10 @@ def _extract_classes(self, y): self._add_classes(unique_classes) def _extract_classes_from_headers(self, headers): - if hasattr(self.last_node, 'classes_'): + # Note: _classes can not be added to the Pipeline unless + # it already exists in the predictor node because the + # dtype is required to set the correct type. 
+ if self.steps and hasattr(self.last_node, 'classes_'): classes = [x.replace('Score.', '') for x in headers] classes = np.array(classes).astype(self.last_node.classes_.dtype) self._add_classes(classes) @@ -1922,7 +2207,9 @@ def _add_classes(self, classes): # Create classes_ attribute similar to scikit # Add both to pipeline and ending classifier self.classes_ = classes - self.last_node.classes_ = classes + + if self.steps: + self.last_node.classes_ = classes @trace def predict(self, X, verbose=0, as_binary_data_stream=False, **params): @@ -1947,7 +2234,7 @@ def predict_proba(self, X, verbose=0, **params): :return: array, shape = [n_samples, n_classes] """ - if hasattr(self, 'steps') and len(self.steps) > 0: + if hasattr(self, 'steps') and self.steps: last_node = self.last_node last_node._check_implements_method('predict_proba') @@ -1987,7 +2274,7 @@ def decision_function(self, X, verbose=0, **params): :return: array, shape=(n_samples,) if n_classes == 2 else ( n_samples, n_classes) """ - if hasattr(self, 'steps') and len(self.steps) > 0: + if hasattr(self, 'steps') and self.steps: last_node = self.last_node last_node._check_implements_method('decision_function') @@ -2147,7 +2434,6 @@ def test( def transform( self, X, - y=None, verbose=0, as_binary_data_stream=False, **params): @@ -2168,18 +2454,7 @@ def transform( "Model is not fitted. Train or load a model before test(" ").") - if y is not None: - if len(self.steps) > 0: - last_node = self.last_node - if last_node.type == 'transform': - raise ValueError( - "Pipeline needs a trainer as last step for test()") - - X, y_temp, columns_renamed, feature_columns, label_column, \ - schema, weights, weight_column = self._preprocess_X_y(X, y) - - if not isinstance(y, (str, tuple)): - y = y_temp + X, _, _, _, _, schema, _, _ = self._preprocess_X_y(X) all_nodes = [] @@ -2200,10 +2475,16 @@ def transform( all_nodes.extend([apply_node]) + data_output_format = DataOutputFormat.DF + if as_binary_data_stream: + data_output_format = DataOutputFormat.IDV + elif params.pop('as_csr', False): + data_output_format = DataOutputFormat.CSR + graph = Graph( inputs, dict(output_data=""), - as_binary_data_stream, + data_output_format, *all_nodes) class_name = type(self).__name__ @@ -2212,7 +2493,7 @@ def transform( max_slots = params.pop('max_slots', -1) try: - (out_model, out_data, out_metrics) = graph.run( + (out_model, out_data, out_metrics, _) = graph.run( X=X, random_state=self.random_state, model=self.model, @@ -2275,7 +2556,7 @@ def summary(self, verbose=0, **params): graph = Graph( inputs, outputs, - False, + DataOutputFormat.DF, *all_nodes) class_name = type(self).__name__ @@ -2283,7 +2564,7 @@ def summary(self, verbose=0, **params): telemetry_info = ".".join([class_name, method_name]) try: - (_, summary_data, _) = graph.run( + (_, summary_data, _, _) = graph.run( X=None, y=None, random_state=self.random_state, @@ -2296,7 +2577,6 @@ def summary(self, verbose=0, **params): self._run_time = time.time() - start_time raise e - self._validate_model_summary(summary_data) self.model_summary = summary_data # stop the clock @@ -2304,46 +2584,6 @@ def summary(self, verbose=0, **params): self._write_csv_time = graph._write_csv_time return self.model_summary - @trace - def _validate_model_summary(self, model_summary): - """ - Validates model summary has correct format - - :param model_summary: model summary dataframes - - """ - if not isinstance(model_summary, (DataFrame)): - raise TypeError( - "Unexpected type {0} for model_summary, type DataFrame " - "is expected 
".format( - type(model_summary))) - - col_names = [ - 'Bias', - 'ClassNames', - 'Coefficients', - 'PredictorName', - 'Summary', - 'VectorName' - ] - - col_name_prefixes = [ - 'Weights', - 'Gains', - 'Support vectors.', - 'VectorData' - ] - - for col in model_summary.columns: - if col in col_names: - pass - elif any([col.startswith(pre) for pre in col_name_prefixes]): - pass - else: - raise TypeError( - "Unsupported '{0}' column is in model_summary".format( - col)) - @trace def save_model(self, dst): """ @@ -2374,7 +2614,7 @@ def load_model(self, src): self.steps = [] def __getstate__(self): - odict = {'export_version': 1} + odict = {'export_version': 2} if hasattr(self, 'steps'): odict['steps'] = self.steps @@ -2386,6 +2626,13 @@ def __getstate__(self): with open(self.model, "rb") as f: odict['modelbytes'] = f.read() + if (hasattr(self, 'predictor_model') and + self.predictor_model is not None and + os.path.isfile(self.predictor_model)): + + with open(self.predictor_model, "rb") as f: + odict['predictor_model_bytes'] = f.read() + return odict def __setstate__(self, state): @@ -2393,11 +2640,18 @@ def __setstate__(self, state): self.model = None self.random_state = None - for k, v in state.items(): - if k not in {'modelbytes', 'export_version'}: - setattr(self, k, v) + if state.get('export_version', 0) == 0: + # Pickled pipelines which were created + # before export_version was added used + # the default implementation which uses + # the instances __dict__. + if 'steps' in state: + self.steps = state['steps'] + + elif state.get('export_version', 0) in {1, 2}: + if 'steps' in state: + self.steps = state['steps'] - if state.get('export_version', 0) == 1: if 'modelbytes' in state: (fd, modelfile) = tempfile.mkstemp() fl = os.fdopen(fd, "wb") @@ -2405,6 +2659,16 @@ def __setstate__(self, state): fl.close() self.model = modelfile + if 'predictor_model_bytes' in state: + (fd, modelfile) = tempfile.mkstemp() + fl = os.fdopen(fd, "wb") + fl.write(state['predictor_model_bytes']) + fl.close() + self.predictor_model = modelfile + + else: + raise ValueError('Pipeline version not supported.') + @trace def score( self, @@ -2525,7 +2789,7 @@ def combine_models(cls, *items, **params): graph = Graph( inputs, outputs, - False, + DataOutputFormat.DF, *nodes) class_name = cls.__name__ @@ -2533,7 +2797,7 @@ def combine_models(cls, *items, **params): telemetry_info = ".".join([class_name, method_name]) try: - (out_model, _, _) = graph.run( + (out_model, _, _, _) = graph.run( X=None, y=None, random_state=None, diff --git a/src/python/nimbusml/base_predictor.py b/src/python/nimbusml/base_predictor.py index f33f746c..538e7b5c 100644 --- a/src/python/nimbusml/base_predictor.py +++ b/src/python/nimbusml/base_predictor.py @@ -88,7 +88,13 @@ def _invoke_inference_method(self, method, X, **params): @trace def get_feature_contributions(self, X, **params): - return self._invoke_inference_method('get_feature_contributions', X, **params) + return self._invoke_inference_method('get_feature_contributions', + X, **params) + + @trace + def permutation_feature_importance(self, X, **params): + return self._invoke_inference_method('permutation_feature_importance', + X, **params) @trace def predict(self, X, **params): diff --git a/src/python/nimbusml/examples/LpScaler.py b/src/python/nimbusml/examples/LpScaler.py new file mode 100644 index 00000000..b77b8539 --- /dev/null +++ b/src/python/nimbusml/examples/LpScaler.py @@ -0,0 +1,47 @@ +############################################################################### +# LpScaler 
+import numpy +from nimbusml import FileDataStream +from nimbusml.datasets import get_dataset +from nimbusml.preprocessing.normalization import LpScaler + +path = get_dataset('infert').as_filepath() +data = FileDataStream.read_csv( + path, + sep=',', + numeric_dtype=numpy.float32, + collapse=True) + +print(data.head()) + +# row_num education age.age age.parity age.induced age.case age.spontaneous age.stratum age.pooled.stratum +# 1.0 0-5yrs 26.0 6.0 1.0 1.0 2.0 1.0 3.0 +# 2.0 0-5yrs 42.0 1.0 1.0 1.0 0.0 2.0 1.0 +# 3.0 0-5yrs 39.0 6.0 2.0 1.0 0.0 3.0 4.0 +# 4.0 0-5yrs 34.0 4.0 2.0 1.0 0.0 4.0 2.0 +# 5.0 6-11yrs 35.0 3.0 1.0 1.0 1.0 5.0 32.0 + +xf = LpScaler(columns={'norm': 'age'}) +features = xf.fit_transform(data) + +print_opts = { + 'index': False, + 'justify': 'left', + 'columns': [ + 'norm.age', + 'norm.parity', + 'norm.induced', + 'norm.case', + 'norm.spontaneous', + 'norm.stratum', + 'norm.pooled.stratum' + ] +} +print('LpScaler\n', features.head().to_string(**print_opts)) + +# norm.age norm.parity norm.induced norm.case norm.spontaneous norm.stratum norm.pooled.stratum +# 0.963624 0.222375 0.037062 0.037062 0.074125 0.037062 0.111187 +# 0.997740 0.023756 0.023756 0.023756 0.000000 0.047511 0.023756 +# 0.978985 0.150613 0.050204 0.025102 0.000000 0.075307 0.100409 +# 0.982725 0.115615 0.057807 0.028904 0.000000 0.115615 0.057807 +# 0.732032 0.062746 0.020915 0.020915 0.020915 0.104576 0.669286 diff --git a/src/python/nimbusml/examples/PermutationFeatureImportance.py b/src/python/nimbusml/examples/PermutationFeatureImportance.py new file mode 100644 index 00000000..44a476ba --- /dev/null +++ b/src/python/nimbusml/examples/PermutationFeatureImportance.py @@ -0,0 +1,173 @@ +############################################################################### +# Permutation Feature Importance (PFI) + +# Permutation feature importance (PFI) is a technique to determine the global +# importance of features in a trained machine learning model. PFI is a simple +# yet powerful technique motivated by Breiman in section 10 of his Random +# Forests paper (Machine Learning, 2001). The advantage of the PFI method is +# that it is model agnostic - it works with any model that can be evaluated - +# and it can use any dataset, not just the training set, to compute feature +# importance metrics. + +# PFI works by taking a labeled dataset, choosing a feature, and permuting the +# values for that feature across all the examples, so that each example now has +# a random value for the feature and the original values for all other +# features. The evaluation metric (e.g. NDCG) is then calculated for this +# modified dataset, and the change in the evaluation metric from the original +# dataset is computed. The larger the change in the evaluation metric, the more +# important the feature is to the model, i.e. the most important features are +# those that the model is most sensitive to. PFI works by performing this +# permutation analysis across allthe features of a model, one after another. + +# PFI is supported for binary classifiers, classifiers, regressors, and +# rankers. 
+ +from nimbusml import Pipeline, FileDataStream +from nimbusml.datasets import get_dataset +from nimbusml.ensemble import LightGbmRanker +from nimbusml.feature_extraction.categorical import OneHotVectorizer +from nimbusml.linear_model import LogisticRegressionBinaryClassifier, \ + FastLinearClassifier, FastLinearRegressor +from nimbusml.preprocessing import ToKey +from numpy.testing import assert_almost_equal + +# data input (as a FileDataStream) +adult_path = get_dataset('uciadult_train').as_filepath() +classification_data = FileDataStream.read_csv(adult_path) +print(classification_data.head()) +# label workclass education ... capital-loss hours-per-week +# 0 0 Private 11th ... 0 40 +# 1 0 Private HS-grad ... 0 50 +# 2 1 Local-gov Assoc-acdm ... 0 40 +# 3 1 Private Some-college ... 0 40 +# 4 0 ? Some-college ... 0 30 + +###################################### +# PFI for Binary Classification models +###################################### +# define the training pipeline with a binary classifier +binary_pipeline = Pipeline([ + OneHotVectorizer(columns=['education']), + LogisticRegressionBinaryClassifier( + feature=['age', 'education'], label='label')]) + +# train the model +binary_model = binary_pipeline.fit(classification_data) + +# get permutation feature importance +binary_pfi = binary_model.permutation_feature_importance(classification_data) + +# Print PFI for each feature, ordered by most important features w.r.t. AUC. +# Since AUC is an increasing metric, the highest negative changes indicate the +# most important features. +print("============== PFI for Binary Classification Model ==============") +print(binary_pfi.sort_values('AreaUnderRocCurve').head()) +# FeatureName AreaUnderRocCurve AreaUnderRocCurve.StdErr ... +# 0 age -0.081604 0.0 ... +# 6 education.Prof-school -0.012964 0.0 ... +# 10 education.Doctorate -0.012863 0.0 ... +# 8 education.Bachelors -0.010593 0.0 ... +# 2 education.HS-grad -0.005918 0.0 ... + + +############################### +# PFI for Classification models +############################### +# define the training pipeline with a classifier +# use 1 thread and no shuffling to force determinism +multiclass_pipeline = Pipeline([ + OneHotVectorizer(columns=['education']), + FastLinearClassifier(feature=['age', 'education'], label='label', + number_of_threads=1, shuffle=False)]) + +# train the model +multiclass_model = multiclass_pipeline.fit(classification_data) + +# get permutation feature importance +multiclass_pfi = multiclass_model.permutation_feature_importance(classification_data) + +# Print PFI for each feature, ordered by most important features w.r.t. Macro +# accuracy. Since Macro accuracy is an increasing metric, the highest negative +# changes indicate the most important features. +print("================== PFI for Classification Model ==================") +print(multiclass_pfi.sort_values('MacroAccuracy').head()) +# FeatureName MacroAccuracy ... MicroAccuracy ... +# 10 education.Doctorate -0.028233 ... -0.020 ... +# 0 age -0.001750 ... 0.002 ... +# 6 education.Prof-school -0.001750 ... 0.002 ... +# 9 education.Masters -0.001299 ... -0.002 ... +# 1 education.11th 0.000000 ... 0.000 ... + +########################### +# PFI for Regression models +########################### +# load input data +infert_path = get_dataset('infert').as_filepath() +regression_data = FileDataStream.read_csv(infert_path) +print(regression_data.head()) +# age case education induced parity ... row_num spontaneous ... +# 0 26 1 0-5yrs 1 6 ... 1 2 ... +# 1 42 1 0-5yrs 1 1 ... 
2 0 ... +# 2 39 1 0-5yrs 2 6 ... 3 0 ... +# 3 34 1 0-5yrs 2 4 ... 4 0 ... +# 4 35 1 6-11yrs 1 3 ... 5 1 ... + +# define the training pipeline with a regressor +# use 1 thread and no shuffling to force determinism +regression_pipeline = Pipeline([ + OneHotVectorizer(columns=['education']), + FastLinearRegressor(feature=['induced', 'education'], label='age', + number_of_threads=1, shuffle=False)]) + +# train the model +regression_model = regression_pipeline.fit(regression_data) + +# get permutation feature importance +regression_pfi = regression_model.permutation_feature_importance(regression_data) + +# Print PFI for each feature, ordered by most important features w.r.t. MAE. +# Since MAE is a decreasing metric, the highest positive changes indicate the +# most important features. +print("==================== PFI for Regression Model ====================") +print(regression_pfi.sort_values('MeanAbsoluteError', ascending=False).head()) +# FeatureName MeanAbsoluteError ... RSquared RSquared.StdErr +#3 education.12+ yrs 0.393451 ... -0.146338 0.0 +#0 induced 0.085804 ... -0.026168 0.0 +#1 education.0-5yrs 0.064460 ... -0.027587 0.0 +#2 education.6-11yrs -0.000047 ... 0.000059 0.0 + +######################## +# PFI for Ranking models +######################## +# load input data +ticket_path = get_dataset('gen_tickettrain').as_filepath() +ranking_data = FileDataStream.read_csv(ticket_path) +print(ranking_data.head()) +# rank group carrier price Class dep_day nbr_stops duration +# 0 2 1 AA 240 3 1 0 12.0 +# 1 1 1 AA 300 3 0 1 15.0 +# 2 1 1 AA 360 3 0 2 18.0 +# 3 0 1 AA 540 2 0 0 12.0 +# 4 1 1 AA 600 2 0 1 15.0 + +# define the training pipeline with a ranker +ranking_pipeline = Pipeline([ + ToKey(columns=['group']), + LightGbmRanker(feature=['Class', 'dep_day', 'duration'], + label='rank', group_id='group')]) + +# train the model +ranking_model = ranking_pipeline.fit(ranking_data) + +# get permutation feature importance +ranking_pfi = ranking_model.permutation_feature_importance(ranking_data) + +# Print PFI for each feature, ordered by most important features w.r.t. DCG@1. +# Since DCG is an increasing metric, the highest negative changes indicate the +# most important features. +print("===================== PFI for Ranking Model =====================") +print(ranking_pfi.sort_values('DCG@1').head()) +# Feature DCG@1 DCG@2 DCG@3 ... NDCG@1 NDCG@2 ... +# 0 Class -4.869096 -7.030914 -5.948893 ... -0.420238 -0.407281 ... +# 2 duration -2.344379 -3.595958 -3.956632 ... -0.232143 -0.231539 ... +# 1 dep_day 0.000000 0.000000 0.000000 ... 0.000000 0.000000 ... 
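###############################################################################
# A minimal follow-on sketch, assuming the `binary_model` and
# `classification_data` objects created in the binary classification section
# above. In the sample outputs above the *.StdErr columns are 0.0 because only
# a single permutation is performed by default; passing a larger
# `permutation_count` should yield non-zero standard errors of the mean change.
# The variable name `binary_pfi_repeated` is illustrative only.
binary_pfi_repeated = binary_model.permutation_feature_importance(
    classification_data, permutation_count=5)
print(binary_pfi_repeated[['FeatureName',
                           'AreaUnderRocCurve',
                           'AreaUnderRocCurve.StdErr']]
      .sort_values('AreaUnderRocCurve').head())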
diff --git a/src/python/nimbusml/examples/PrefixColumnConcatenator.py b/src/python/nimbusml/examples/PrefixColumnConcatenator.py new file mode 100644 index 00000000..b11ddb02 --- /dev/null +++ b/src/python/nimbusml/examples/PrefixColumnConcatenator.py @@ -0,0 +1,25 @@ +############################################################################### +# PrefixColumnConcatenator +import numpy as np +import pandas as pd +from nimbusml.preprocessing.schema import PrefixColumnConcatenator + +data = pd.DataFrame( + data=dict( + PrefixA=[2.5, np.nan, 2.1, 1.0], + PrefixB=[.75, .9, .8, .76], + AnotherColumn=[np.nan, 2.5, 2.6, 2.4])) + +# transform usage +xf = PrefixColumnConcatenator(columns={'combined': 'Prefix'}) + +# fit and transform +features = xf.fit_transform(data) + +# print features +print(features.head()) +# PrefixA PrefixB AnotherColumn combined.PrefixA combined.PrefixB +#0 2.5 0.75 NaN 2.5 0.75 +#1 NaN 0.90 2.5 NaN 0.90 +#2 2.1 0.80 2.6 2.1 0.80 +#3 1.0 0.76 2.4 1.0 0.76 \ No newline at end of file diff --git a/src/python/nimbusml/examples/Schema.py b/src/python/nimbusml/examples/Schema.py new file mode 100644 index 00000000..c0b8d493 --- /dev/null +++ b/src/python/nimbusml/examples/Schema.py @@ -0,0 +1,33 @@ +############################################################################### +# Get schema from a fitted pipeline example. +import numpy as np +import pandas as pd +from nimbusml import Pipeline, FileDataStream +from nimbusml.datasets import get_dataset +from nimbusml.feature_extraction.text import NGramFeaturizer +from nimbusml.feature_extraction.text.extractor import Ngram + +# data input (as a FileDataStream) +path = get_dataset("wiki_detox_train").as_filepath() + +data = FileDataStream.read_csv(path, sep='\t') +print(data.head()) +# Sentiment SentimentText +# 0 1 ==RUDE== Dude, you are rude upload that carl p... +# 1 1 == OK! == IM GOING TO VANDALIZE WILD ONES WIK... +# 2 1 Stop trolling, zapatancas, calling me a liar m... +# 3 1 ==You're cool== You seem like a really cool g... +# 4 1 ::::: Why are you threatening me? I'm not bein... + +pipe = Pipeline([ + NGramFeaturizer( + word_feature_extractor=Ngram(), + columns={ + 'features': ['SentimentText']}) +]) + +pipe.fit(data) +schema = pipe.get_output_columns() + +print(schema[0:5]) +# ['Sentiment', 'SentimentText', 'features.Char.|=|=', 'features.Char.=|=|r', 'features.Char.=|r|u'] diff --git a/src/python/nimbusml/examples/WordTokenizer.py b/src/python/nimbusml/examples/WordTokenizer.py new file mode 100644 index 00000000..028d5d7e --- /dev/null +++ b/src/python/nimbusml/examples/WordTokenizer.py @@ -0,0 +1,32 @@ +############################################################################### +# WordTokenizer + +from nimbusml import Pipeline, FileDataStream +from nimbusml.datasets import get_dataset +from nimbusml.preprocessing.text import WordTokenizer + +# data input (as a FileDataStream) +path = get_dataset("wiki_detox_train").as_filepath() + +data = FileDataStream.read_csv(path, sep='\t') +print(data.head()) +# Sentiment SentimentText +# 0 1 ==RUDE== Dude, you are rude upload that carl p... +# 1 1 == OK! == IM GOING TO VANDALIZE WILD ONES WIK... +# 2 1 Stop trolling, zapatancas, calling me a liar m... +# 3 1 ==You're cool== You seem like a really cool g... +# 4 1 ::::: Why are you threatening me? I'm not bein... 
+ +tokenize = WordTokenizer(char_array_term_separators=[" "]) << {'wt': 'SentimentText'} +pipeline = Pipeline([tokenize]) + +tokenize.fit(data) +y = tokenize.transform(data) + +print(y.drop(labels='SentimentText', axis=1).head()) +# Sentiment wt.000 wt.001 wt.002 wt.003 wt.004 wt.005 ... wt.366 wt.367 wt.368 wt.369 wt.370 wt.371 wt.372 +# 0 1 ==RUDE== Dude, you are rude upload ... None None None None None None None +# 1 1 == OK! == IM GOING TO ... None None None None None None None +# 2 1 Stop trolling, zapatancas, calling me a ... None None None None None None None +# 3 1 ==You're cool== You seem like a ... None None None None None None None +# 4 1 ::::: Why are you threatening me? ... None None None None None None None diff --git a/src/python/nimbusml/examples/examples_from_dataframe/LightLda_df.py b/src/python/nimbusml/examples/examples_from_dataframe/LightLda_df.py index fd4df05b..c4a35a8f 100644 --- a/src/python/nimbusml/examples/examples_from_dataframe/LightLda_df.py +++ b/src/python/nimbusml/examples/examples_from_dataframe/LightLda_df.py @@ -2,9 +2,8 @@ # LightLda: cluster topics import pandas from nimbusml import Pipeline -from nimbusml.feature_extraction.text import LightLda -from nimbusml.feature_extraction.text import NGramFeaturizer -from nimbusml.internal.entrypoints._ngramextractor_ngram import n_gram +from nimbusml.feature_extraction.text import LightLda, NGramFeaturizer +from nimbusml.feature_extraction.text.extractor import Ngram # create the data topics = pandas.DataFrame(data=dict(review=[ @@ -19,7 +18,7 @@ # there are three main topics in our data. set num_topic=3 # and see if LightLDA vectors for topics look similar -pipeline = Pipeline([NGramFeaturizer(word_feature_extractor=n_gram( +pipeline = Pipeline([NGramFeaturizer(word_feature_extractor=Ngram( ), vector_normalizer='None') << 'review', LightLda(num_topic=3)]) y = pipeline.fit_transform(topics) diff --git a/src/python/nimbusml/examples/examples_from_dataframe/LpScaler_df.py b/src/python/nimbusml/examples/examples_from_dataframe/LpScaler_df.py new file mode 100644 index 00000000..d84679ab --- /dev/null +++ b/src/python/nimbusml/examples/examples_from_dataframe/LpScaler_df.py @@ -0,0 +1,48 @@ +############################################################################### +# LpScaler +import numpy as np +import pandas as pd +from nimbusml import Pipeline +from nimbusml.preprocessing.normalization import LpScaler +from nimbusml.preprocessing.schema import ColumnConcatenator + +in_df = pd.DataFrame( + data=dict( + Sepal_Length=[2.5, 1, 2.1, 1.0], + Sepal_Width=[.75, .9, .8, .76], + Petal_Length=[0, 2.5, 2.6, 2.4], + Species=["setosa", "viginica", "setosa", 'versicolor'])) + +in_df.iloc[:, 0:3] = in_df.iloc[:, 0:3].astype(np.float32) + +concat = ColumnConcatenator() << { + 'cat': [ 'Sepal_Length', 'Sepal_Width', 'Petal_Length'] +} + +# Normalize the input values by rescaling them to unit norm (L2, L1 or LInf). +# Performs the following operation on a vector X: Y = (X - M) / D, where M is +# mean and D is either L2 norm, L1 norm or LInf norm. 
+normed = LpScaler() << {'norm': 'cat'} + +pipeline = Pipeline([concat, normed]) +out_df = pipeline.fit_transform(in_df) + +print_opts = { + 'index': False, + 'justify': 'left', + 'columns': [ + 'Sepal_Length', + 'Sepal_Width', + 'Petal_Length', + 'norm.Sepal_Length', + 'norm.Sepal_Width', + 'norm.Petal_Length' + ] +} +print('LpScaler\n', out_df.to_string(**print_opts)) + +# Sepal_Length Sepal_Width Petal_Length norm.Sepal_Length norm.Sepal_Width norm.Petal_Length +# 2.5 0.75 0.0 0.957826 0.287348 0.000000 +# 1.0 0.90 2.5 0.352235 0.317011 0.880587 +# 2.1 0.80 2.6 0.611075 0.232790 0.756569 +# 1.0 0.76 2.4 0.369167 0.280567 0.886001 diff --git a/src/python/nimbusml/examples/examples_from_dataframe/NGramFeaturizer_df.py b/src/python/nimbusml/examples/examples_from_dataframe/NGramFeaturizer_df.py index e87b8168..e6cc14d1 100644 --- a/src/python/nimbusml/examples/examples_from_dataframe/NGramFeaturizer_df.py +++ b/src/python/nimbusml/examples/examples_from_dataframe/NGramFeaturizer_df.py @@ -2,7 +2,7 @@ # Example with TextTransform and LogisticRegressionBinaryClassifier import pandas from nimbusml.feature_extraction.text import NGramFeaturizer -from nimbusml.internal.entrypoints._ngramextractor_ngram import n_gram +from nimbusml.feature_extraction.text.extractor import Ngram from nimbusml.linear_model import LogisticRegressionBinaryClassifier train_reviews = pandas.DataFrame( @@ -77,7 +77,7 @@ y = train_reviews['like'] X = train_reviews.loc[:, train_reviews.columns != 'like'] -ngram = NGramFeaturizer(word_feature_extractor=n_gram()) << 'review' +ngram = NGramFeaturizer(word_feature_extractor=Ngram()) << 'review' X = ngram.fit_transform(X) # view the transformed numerical values and column names diff --git a/src/python/nimbusml/examples/examples_from_dataframe/PrefixColumnConcatenator_df.py b/src/python/nimbusml/examples/examples_from_dataframe/PrefixColumnConcatenator_df.py new file mode 100644 index 00000000..022e014a --- /dev/null +++ b/src/python/nimbusml/examples/examples_from_dataframe/PrefixColumnConcatenator_df.py @@ -0,0 +1,31 @@ +############################################################################### +# PrefixColumnConcatenator +import numpy as np +import pandas as pd +from nimbusml import Pipeline, Role +from nimbusml.datasets import get_dataset +from nimbusml.linear_model import LogisticRegressionClassifier +from nimbusml.preprocessing.schema import PrefixColumnConcatenator +from nimbusml.preprocessing.schema import ColumnDropper +from sklearn.model_selection import train_test_split + +# use 'iris' data set to create test and train data +# Sepal_Length Sepal_Width Petal_Length Petal_Width Label Species Setosa +# 0 5.1 3.5 1.4 0.2 0 setosa 1.0 +# 1 4.9 3.0 1.4 0.2 0 setosa 1.0 +df = get_dataset("iris").as_df() + +X_train, X_test, y_train, y_test = \ + train_test_split(df.loc[:, df.columns != 'Label'], df['Label']) + +concat = PrefixColumnConcatenator() << {'Sepal': 'Sepal_'} +concat1 = PrefixColumnConcatenator() << {'Petal': 'Petal_'} +dropcols = ColumnDropper() << ['Sepal_Length', 'Sepal_Width', 'Petal_Length', + 'Petal_Width', 'Setosa', 'Species'] + +pipeline = Pipeline([concat, concat1, dropcols, LogisticRegressionClassifier()]) +pipeline.fit(X_train, y_train) + +# Evaluate the model +metrics, scores = pipeline.test(X_test, y_test, output_scores=True) +print(metrics) diff --git a/src/python/nimbusml/examples/examples_from_dataframe/WordTokenizer_df.py b/src/python/nimbusml/examples/examples_from_dataframe/WordTokenizer_df.py new file mode 100644 index 00000000..31980567 
--- /dev/null +++ b/src/python/nimbusml/examples/examples_from_dataframe/WordTokenizer_df.py @@ -0,0 +1,33 @@ +############################################################################### +# WordTokenizer + +import pandas +from nimbusml import Pipeline +from nimbusml.preprocessing.text import WordTokenizer + +# create the data +customer_reviews = pandas.DataFrame(data=dict(review=[ + "I really did not like the taste of it", + "It was surprisingly quite good!", + "I will never ever ever go to that place again!!", + "The best ever!! It was amazingly good and super fast", + "I wish I had gone earlier, it was that great", + "somewhat dissapointing. I'd probably wont try again", + "Never visit again... rascals!"])) + +tokenize = WordTokenizer(char_array_term_separators=[" ", "n"]) << 'review' + +pipeline = Pipeline([tokenize]) + +tokenize.fit(customer_reviews) +y = tokenize.transform(customer_reviews) + +print(y) +# review.00 review.01 review.02 review.03 review.04 review.05 review.06 review.07 review.08 review.09 review.10 review.11 +# 0 I really did ot like the taste of it None None None +# 1 It was surprisi gly quite good! None None None None None None +# 2 I will ever ever ever go to that place agai !! None +# 3 The best ever!! It was amazi gly good a d super fast +# 4 I wish I had go e earlier, it was that great None +# 5 somewhat dissapoi ti g. I'd probably wo t try agai None None +# 6 Never visit agai ... rascals! None None None None None None None diff --git a/src/python/nimbusml/internal/core/linear_model/_linearsvmbinaryclassifier.py b/src/python/nimbusml/internal/core/linear_model/_linearsvmbinaryclassifier.py index 0109ba44..34a18740 100644 --- a/src/python/nimbusml/internal/core/linear_model/_linearsvmbinaryclassifier.py +++ b/src/python/nimbusml/internal/core/linear_model/_linearsvmbinaryclassifier.py @@ -69,7 +69,9 @@ class LinearSvmBinaryClassifier( :param caching: Whether trainer should cache input training data. - :param lambda_: Regularizer constant. + :param l2_regularization: L2 regularization weight. It also controls the + learning rate, with the learning rate being inversely proportional to + it. :param perform_projection: Perform projection to unit-ball? Typically used with batch size > 1. @@ -105,7 +107,7 @@ def __init__( self, normalize='Auto', caching='Auto', - lambda_=0.001, + l2_regularization=0.001, perform_projection=False, number_of_iterations=1, initial_weights_diameter=0.0, @@ -119,7 +121,7 @@ def __init__( self.normalize = normalize self.caching = caching - self.lambda_ = lambda_ + self.l2_regularization = l2_regularization self.perform_projection = perform_projection self.number_of_iterations = number_of_iterations self.initial_weights_diameter = initial_weights_diameter @@ -146,7 +148,7 @@ def _get_node(self, **all_args): all_args), normalize_features=self.normalize, caching=self.caching, - lambda_=self.lambda_, + lambda_=self.l2_regularization, perform_projection=self.perform_projection, number_of_iterations=self.number_of_iterations, initial_weights_diameter=self.initial_weights_diameter, diff --git a/src/python/nimbusml/internal/core/preprocessing/_datasettransformer.py b/src/python/nimbusml/internal/core/preprocessing/_datasettransformer.py new file mode 100644 index 00000000..545e6e36 --- /dev/null +++ b/src/python/nimbusml/internal/core/preprocessing/_datasettransformer.py @@ -0,0 +1,49 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. 
+# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +DatasetTransformer +""" + +__all__ = ["DatasetTransformer"] + + +from ...entrypoints.models_datasettransformer import models_datasettransformer +from ...utils.utils import trace +from ..base_pipeline_item import BasePipelineItem, DefaultSignature + + +class DatasetTransformer(BasePipelineItem, DefaultSignature): + """ + **Description** + Applies a TransformModel to a dataset. + + :param transform_model: Transform model. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + transform_model, + **params): + BasePipelineItem.__init__( + self, type='transform', **params) + + self.transform_model = transform_model + + @property + def _entrypoint(self): + return models_datasettransformer + + @trace + def _get_node(self, **all_args): + algo_args = dict( + transform_model=self.transform_model) + + all_args.update(algo_args) + return self._entrypoint(**all_args) diff --git a/src/python/nimbusml/internal/core/preprocessing/normalization/_lpscaler.py b/src/python/nimbusml/internal/core/preprocessing/normalization/_lpscaler.py new file mode 100644 index 00000000..3dce5d56 --- /dev/null +++ b/src/python/nimbusml/internal/core/preprocessing/normalization/_lpscaler.py @@ -0,0 +1,93 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +LpScaler +""" + +__all__ = ["LpScaler"] + + +from ....entrypoints.transforms_lpnormalizer import transforms_lpnormalizer +from ....utils.utils import trace +from ...base_pipeline_item import BasePipelineItem, DefaultSignature + + +class LpScaler(BasePipelineItem, DefaultSignature): + """ + **Description** + Normalize vectors (rows) individually by rescaling them to unit norm (L2, L1 or LInf). Performs the following operation on a vector X: Y = (X - M) / D, where M is mean and D is either L2 norm, L1 norm or LInf norm. + + :param norm: The norm to use to normalize each sample. + + :param sub_mean: Subtract mean from each value before normalizing. + + :param params: Additional arguments sent to compute engine. 
+ + """ + + @trace + def __init__( + self, + norm='L2', + sub_mean=False, + **params): + BasePipelineItem.__init__( + self, type='transform', **params) + + self.norm = norm + self.sub_mean = sub_mean + + @property + def _entrypoint(self): + return transforms_lpnormalizer + + @trace + def _get_node(self, **all_args): + + input_columns = self.input + if input_columns is None and 'input' in all_args: + input_columns = all_args['input'] + if 'input' in all_args: + all_args.pop('input') + + output_columns = self.output + if output_columns is None and 'output' in all_args: + output_columns = all_args['output'] + if 'output' in all_args: + all_args.pop('output') + + # validate input + if input_columns is None: + raise ValueError( + "'None' input passed when it cannot be none.") + + if not isinstance(input_columns, list): + raise ValueError( + "input has to be a list of strings, instead got %s" % + type(input_columns)) + + # validate output + if output_columns is None: + output_columns = input_columns + + if not isinstance(output_columns, list): + raise ValueError( + "output has to be a list of strings, instead got %s" % + type(output_columns)) + + algo_args = dict( + column=[ + dict( + Source=i, + Name=o) for i, + o in zip( + input_columns, + output_columns)] if input_columns else None, + norm=self.norm, + sub_mean=self.sub_mean) + + all_args.update(algo_args) + return self._entrypoint(**all_args) diff --git a/src/python/nimbusml/internal/core/preprocessing/schema/_prefixcolumnconcatenator.py b/src/python/nimbusml/internal/core/preprocessing/schema/_prefixcolumnconcatenator.py new file mode 100644 index 00000000..d202e947 --- /dev/null +++ b/src/python/nimbusml/internal/core/preprocessing/schema/_prefixcolumnconcatenator.py @@ -0,0 +1,100 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +""" +PrefixColumnConcatenator +""" + +__all__ = ["PrefixColumnConcatenator"] + + +from ....entrypoints.transforms_prefixcolumnconcatenator import \ + transforms_prefixcolumnconcatenator +from ....utils.utils import trace +from ...base_pipeline_item import BasePipelineItem, DefaultSignature + + +class PrefixColumnConcatenator(BasePipelineItem, DefaultSignature): + """ + + Combines several columns into a single vector-valued column by prefix + + .. remarks:: + ``PrefixColumnConcatenator`` creates a single vector-valued column from + multiple + columns. It can be performed on data before training a model. The + concatenation + can significantly speed up the processing of data when the number of + columns + is as large as hundreds to thousands. + + :param params: Additional arguments sent to compute engine. + + .. seealso:: + :py:class:`ColumnDropper + `, + :py:class:`ColumnSelector + `. + + .. index:: transform, schema + + Example: + .. 
literalinclude:: /../nimbusml/examples/PrefixColumnConcatenator.py + :language: python + """ + + @trace + def __init__( + self, + **params): + BasePipelineItem.__init__( + self, type='transform', **params) + + @property + def _entrypoint(self): + return transforms_prefixcolumnconcatenator + + @trace + def _get_node(self, **all_args): + + input_columns = self.input + if input_columns is None and 'input' in all_args: + input_columns = all_args['input'] + if 'input' in all_args: + all_args.pop('input') + + output_columns = self.output + if output_columns is None and 'output' in all_args: + output_columns = all_args['output'] + if 'output' in all_args: + all_args.pop('output') + + # validate input + if input_columns is None: + raise ValueError( + "'None' input passed when it cannot be none.") + + if not isinstance(input_columns, list): + raise ValueError( + "input has to be a list of strings, instead got %s" % + type(input_columns)) + + # validate output + if output_columns is None: + raise ValueError( + "'None' output passed when it cannot be none.") + + if not isinstance(output_columns, list): + raise ValueError( + "output has to be a list of strings, instead got %s" % + type(output_columns)) + + algo_args = dict( + column=[ + dict( + Source=i, Name=o) for i, o in zip( + input_columns, output_columns)] if input_columns else None) + + all_args.update(algo_args) + return self._entrypoint(**all_args) diff --git a/src/python/nimbusml/internal/core/preprocessing/text/_wordtokenizer.py b/src/python/nimbusml/internal/core/preprocessing/text/_wordtokenizer.py new file mode 100644 index 00000000..66e06176 --- /dev/null +++ b/src/python/nimbusml/internal/core/preprocessing/text/_wordtokenizer.py @@ -0,0 +1,89 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +WordTokenizer +""" + +__all__ = ["WordTokenizer"] + + +from ....entrypoints.transforms_wordtokenizer import transforms_wordtokenizer +from ....utils.utils import trace +from ...base_pipeline_item import BasePipelineItem, DefaultSignature + + +class WordTokenizer(BasePipelineItem, DefaultSignature): + """ + **Description** + The input to this transform is text, and the output is a vector of text containing the words (tokens) in the original text. The separator is space, but can be specified as any other character (or multiple characters) if needed. + + :param char_array_term_separators: Array of single character term + separator(s). By default uses space character separator. + + :param params: Additional arguments sent to compute engine. 
+ + """ + + @trace + def __init__( + self, + char_array_term_separators=None, + **params): + BasePipelineItem.__init__( + self, type='transform', **params) + + self.char_array_term_separators = char_array_term_separators + + @property + def _entrypoint(self): + return transforms_wordtokenizer + + @trace + def _get_node(self, **all_args): + + input_columns = self.input + if input_columns is None and 'input' in all_args: + input_columns = all_args['input'] + if 'input' in all_args: + all_args.pop('input') + + output_columns = self.output + if output_columns is None and 'output' in all_args: + output_columns = all_args['output'] + if 'output' in all_args: + all_args.pop('output') + + # validate input + if input_columns is None: + raise ValueError( + "'None' input passed when it cannot be none.") + + if not isinstance(input_columns, list): + raise ValueError( + "input has to be a list of strings, instead got %s" % + type(input_columns)) + + # validate output + if output_columns is None: + output_columns = input_columns + + if not isinstance(output_columns, list): + raise ValueError( + "output has to be a list of strings, instead got %s" % + type(output_columns)) + + algo_args = dict( + column=[ + dict( + Source=i, + Name=o) for i, + o in zip( + input_columns, + output_columns)] if input_columns else None, + char_array_term_separators=self.char_array_term_separators) + + all_args.update(algo_args) + return self._entrypoint(**all_args) diff --git a/src/python/nimbusml/internal/entrypoints/models_schema.py b/src/python/nimbusml/internal/entrypoints/models_schema.py new file mode 100644 index 00000000..0b8b0056 --- /dev/null +++ b/src/python/nimbusml/internal/entrypoints/models_schema.py @@ -0,0 +1,47 @@ +""" +Models.Summarizer +""" + + +from ..utils.entrypoints import EntryPoint +from ..utils.utils import try_set, unlist + + +def models_schema( + transform_model, + schema=None, + **params): + """ + **Description** + Retreives input/output column schema for transform model. + + :param transform_model: The transform model. + """ + + entrypoint_name = 'Models.Schema' + inputs = {} + outputs = {} + + if transform_model is not None: + inputs['Model'] = try_set( + obj=transform_model, + none_acceptable=False, + is_of_type=str) + if schema is not None: + outputs['Schema'] = try_set( + obj=schema, + none_acceptable=False, + is_of_type=str) + + input_variables = { + x for x in unlist(inputs.values()) + if isinstance(x, str) and x.startswith("$")} + output_variables = { + x for x in unlist(outputs.values()) + if isinstance(x, str) and x.startswith("$")} + + entrypoint = EntryPoint( + name=entrypoint_name, inputs=inputs, outputs=outputs, + input_variables=input_variables, + output_variables=output_variables) + return entrypoint diff --git a/src/python/nimbusml/internal/entrypoints/transforms_datasetscorerex.py b/src/python/nimbusml/internal/entrypoints/transforms_datasetscorerex.py new file mode 100644 index 00000000..7a5d8c71 --- /dev/null +++ b/src/python/nimbusml/internal/entrypoints/transforms_datasetscorerex.py @@ -0,0 +1,68 @@ +""" +Transforms.DatasetScorerEx +""" + + +from ..utils.entrypoints import EntryPoint +from ..utils.utils import try_set, unlist + + +def transforms_datasetscorerex( + data, + predictor_model, + scored_data=None, + scoring_transform=None, + suffix=None, + **params): + """ + **Description** + Score a dataset with a predictor model + + :param data: The dataset to be scored (inputs). + :param predictor_model: The predictor model to apply to data + (inputs). 
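The entrypoint helpers added in this diff share one pattern: string inputs and outputs that start with `$` are collected as graph variables. A small sketch of building the new `Models.Schema` node defined above, using placeholder variable names:

```python
# Sketch: construct a Models.Schema graph node. '$transform_model' and
# '$schema' are placeholder graph-variable names, not values from the diff.
from nimbusml.internal.entrypoints.models_schema import models_schema

node = models_schema(transform_model='$transform_model', schema='$schema')
# Both strings start with '$', so they are picked up as the node's input
# and output variables when the graph is assembled.
```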
+ :param suffix: Suffix to append to the score columns (inputs). + :param scored_data: The scored dataset (outputs). + :param scoring_transform: The scoring transform (outputs). + """ + + entrypoint_name = 'Transforms.DatasetScorerEx' + inputs = {} + outputs = {} + + if data is not None: + inputs['Data'] = try_set( + obj=data, + none_acceptable=False, + is_of_type=str) + if predictor_model is not None: + inputs['PredictorModel'] = try_set( + obj=predictor_model, + none_acceptable=False, + is_of_type=str) + if suffix is not None: + inputs['Suffix'] = try_set( + obj=suffix, + none_acceptable=True, + is_of_type=str) + if scored_data is not None: + outputs['ScoredData'] = try_set( + obj=scored_data, + none_acceptable=False, + is_of_type=str) + if scoring_transform is not None: + outputs['ScoringTransform'] = try_set( + obj=scoring_transform, none_acceptable=False, is_of_type=str) + + input_variables = { + x for x in unlist(inputs.values()) + if isinstance(x, str) and x.startswith("$")} + output_variables = { + x for x in unlist(outputs.values()) + if isinstance(x, str) and x.startswith("$")} + + entrypoint = EntryPoint( + name=entrypoint_name, inputs=inputs, outputs=outputs, + input_variables=input_variables, + output_variables=output_variables) + return entrypoint diff --git a/src/python/nimbusml/internal/entrypoints/transforms_featureselectorbymutualinformation.py b/src/python/nimbusml/internal/entrypoints/transforms_featureselectorbymutualinformation.py index 0663f8cd..74443348 100644 --- a/src/python/nimbusml/internal/entrypoints/transforms_featureselectorbymutualinformation.py +++ b/src/python/nimbusml/internal/entrypoints/transforms_featureselectorbymutualinformation.py @@ -55,7 +55,7 @@ def transforms_featureselectorbymutualinformation( none_acceptable=False, is_of_type=str) if label_column_name is not None: - inputs['LabelColumn'] = try_set( + inputs['LabelColumnName'] = try_set( obj=label_column_name, none_acceptable=True, is_of_type=str, diff --git a/src/python/nimbusml/internal/entrypoints/transforms_permutationfeatureimportance.py b/src/python/nimbusml/internal/entrypoints/transforms_permutationfeatureimportance.py new file mode 100644 index 00000000..18ff2e51 --- /dev/null +++ b/src/python/nimbusml/internal/entrypoints/transforms_permutationfeatureimportance.py @@ -0,0 +1,81 @@ +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +Transforms.PermutationFeatureImportance +""" + +import numbers + +from ..utils.entrypoints import EntryPoint +from ..utils.utils import try_set, unlist + + +def transforms_permutationfeatureimportance( + data, + predictor_model, + metrics=None, + use_feature_weight_filter=False, + number_of_examples_to_use=None, + permutation_count=1, + **params): + """ + **Description** + Permutation Feature Importance (PFI) + + :param data: Input dataset (inputs). + :param predictor_model: The path to the model file (inputs). + :param use_feature_weight_filter: Use feature weights to pre- + filter features (inputs). + :param number_of_examples_to_use: Limit the number of examples to + evaluate on (inputs). + :param permutation_count: The number of permutations to perform + (inputs). + :param metrics: The PFI metrics (outputs). 
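A condensed sketch of the public API this entrypoint backs, mirroring the `permutation_feature_importance` tests added further down in this diff (the dataset, feature, and label names come from those tests):

```python
# Sketch, condensed from test_permutation_feature_importance.py below:
# fit a binary classifier, then compute permutation feature importance.
from nimbusml import Pipeline, FileDataStream
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.categorical import OneHotVectorizer
from nimbusml.linear_model import LogisticRegressionBinaryClassifier

data = FileDataStream.read_csv(get_dataset('uciadult_train').as_filepath())
model = Pipeline([
    OneHotVectorizer(columns=['education']),
    LogisticRegressionBinaryClassifier(feature=['age', 'education'],
                                       label='label')]).fit(data)
pfi = model.permutation_feature_importance(data)
print(pfi['AreaUnderRocCurve'])
```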
+ """ + + entrypoint_name = 'Transforms.PermutationFeatureImportance' + inputs = {} + outputs = {} + + if data is not None: + inputs['Data'] = try_set( + obj=data, + none_acceptable=False, + is_of_type=str) + if predictor_model is not None: + inputs['PredictorModel'] = try_set( + obj=predictor_model, + none_acceptable=False, + is_of_type=str) + if use_feature_weight_filter is not None: + inputs['UseFeatureWeightFilter'] = try_set( + obj=use_feature_weight_filter, + none_acceptable=True, + is_of_type=bool) + if number_of_examples_to_use is not None: + inputs['NumberOfExamplesToUse'] = try_set( + obj=number_of_examples_to_use, + none_acceptable=True, + is_of_type=numbers.Real) + if permutation_count is not None: + inputs['PermutationCount'] = try_set( + obj=permutation_count, + none_acceptable=True, + is_of_type=numbers.Real) + if metrics is not None: + outputs['Metrics'] = try_set( + obj=metrics, + none_acceptable=False, + is_of_type=str) + + input_variables = { + x for x in unlist(inputs.values()) + if isinstance(x, str) and x.startswith("$")} + output_variables = { + x for x in unlist(outputs.values()) + if isinstance(x, str) and x.startswith("$")} + + entrypoint = EntryPoint( + name=entrypoint_name, inputs=inputs, outputs=outputs, + input_variables=input_variables, + output_variables=output_variables) + return entrypoint diff --git a/src/python/nimbusml/internal/entrypoints/transforms_prefixcolumnconcatenator.py b/src/python/nimbusml/internal/entrypoints/transforms_prefixcolumnconcatenator.py new file mode 100644 index 00000000..cfe672b7 --- /dev/null +++ b/src/python/nimbusml/internal/entrypoints/transforms_prefixcolumnconcatenator.py @@ -0,0 +1,64 @@ +""" +Transforms.PrefixColumnConcatenator +""" + + +from ..utils.entrypoints import EntryPoint +from ..utils.utils import try_set, unlist + + +def transforms_prefixcolumnconcatenator( + column, + data, + output_data=None, + model=None, + **params): + """ + **Description** + Concatenates one or more columns of the same item type by prefix. + + :param column: New column definition(s) (optional form: + name:srcs) (inputs). + :param data: Input dataset (inputs). + :param output_data: Transformed dataset (outputs). + :param model: Transform model (outputs). 
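A minimal, hypothetical sketch of how the public `PrefixColumnConcatenator` wrapper (added later in this diff) that uses this entrypoint might be called; the column names and the prefix-as-source `columns` mapping are illustrative assumptions:

```python
# Hypothetical sketch (assumes the output column name maps to a source
# prefix): fold every column starting with 'Sepal_' into one vector column.
import pandas as pd
from nimbusml import Pipeline
from nimbusml.preprocessing.schema import PrefixColumnConcatenator

df = pd.DataFrame({'Sepal_Length': [1.0, 2.0],
                   'Sepal_Width': [3.0, 4.0],
                   'Label': [0, 1]})
concat = PrefixColumnConcatenator(columns={'Sepal': 'Sepal_'})
print(Pipeline([concat]).fit_transform(df))
```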
+ """ + + entrypoint_name = 'Transforms.PrefixColumnConcatenator' + inputs = {} + outputs = {} + + if column is not None: + inputs['Column'] = try_set( + obj=column, + none_acceptable=False, + is_of_type=list, + is_column=True) + if data is not None: + inputs['Data'] = try_set( + obj=data, + none_acceptable=False, + is_of_type=str) + if output_data is not None: + outputs['OutputData'] = try_set( + obj=output_data, + none_acceptable=False, + is_of_type=str) + if model is not None: + outputs['Model'] = try_set( + obj=model, + none_acceptable=False, + is_of_type=str) + + input_variables = { + x for x in unlist(inputs.values()) + if isinstance(x, str) and x.startswith("$")} + output_variables = { + x for x in unlist(outputs.values()) + if isinstance(x, str) and x.startswith("$")} + + entrypoint = EntryPoint( + name=entrypoint_name, inputs=inputs, outputs=outputs, + input_variables=input_variables, + output_variables=output_variables) + return entrypoint diff --git a/src/python/nimbusml/internal/entrypoints/transforms_variablecolumn.py b/src/python/nimbusml/internal/entrypoints/transforms_variablecolumn.py new file mode 100644 index 00000000..16fca0ad --- /dev/null +++ b/src/python/nimbusml/internal/entrypoints/transforms_variablecolumn.py @@ -0,0 +1,69 @@ +""" +Transforms.VariableColumnTransform +""" + + +from ..utils.entrypoints import EntryPoint +from ..utils.utils import try_set, unlist + + +def transforms_variablecolumn( + data, + output_data=None, + model=None, + features=None, + length_column_name=None, + **params): + """ + **Description** + Combines the specified input columns in to a + single variable length vectorized column. + + :param data: Input dataset (inputs). + :param output_data: Transformed dataset (outputs). + :param model: Transform model (outputs). 
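A brief, hypothetical sketch of constructing the `Transforms.VariableColumnTransform` node described above; the graph-variable names, the `features` list, and the length-column name are placeholders, since the diff does not document their exact semantics:

```python
# Hypothetical sketch: build a Transforms.VariableColumnTransform node with
# placeholder graph variables and column names.
from nimbusml.internal.entrypoints.transforms_variablecolumn import \
    transforms_variablecolumn

node = transforms_variablecolumn(
    data='$data',
    output_data='$output_data',
    features=['token_ids'],            # columns folded into the variable-length vector
    length_column_name='num_tokens')   # placeholder name for the optional length column
```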
+ """ + + entrypoint_name = 'Transforms.VariableColumnTransform' + inputs = {} + outputs = {} + + if data is not None: + inputs['Data'] = try_set( + obj=data, + none_acceptable=False, + is_of_type=str) + if features is not None: + inputs['Features'] = try_set( + obj=features, + none_acceptable=True, + is_of_type=list, + is_column=True) + if length_column_name is not None: + inputs['LengthColumnName'] = try_set( + obj=length_column_name, + none_acceptable=True, + is_of_type=str) + if output_data is not None: + outputs['OutputData'] = try_set( + obj=output_data, + none_acceptable=False, + is_of_type=str) + if model is not None: + outputs['Model'] = try_set( + obj=model, + none_acceptable=False, + is_of_type=str) + + input_variables = { + x for x in unlist(inputs.values()) + if isinstance(x, str) and x.startswith("$")} + output_variables = { + x for x in unlist(outputs.values()) + if isinstance(x, str) and x.startswith("$")} + + entrypoint = EntryPoint( + name=entrypoint_name, inputs=inputs, outputs=outputs, + input_variables=input_variables, + output_variables=output_variables) + return entrypoint diff --git a/src/python/nimbusml/internal/entrypoints/transforms_wordtokenizer.py b/src/python/nimbusml/internal/entrypoints/transforms_wordtokenizer.py new file mode 100644 index 00000000..e7fac07a --- /dev/null +++ b/src/python/nimbusml/internal/entrypoints/transforms_wordtokenizer.py @@ -0,0 +1,76 @@ +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +Transforms.WordTokenizer +""" + + +from ..utils.entrypoints import EntryPoint +from ..utils.utils import try_set, unlist + + +def transforms_wordtokenizer( + data, + output_data=None, + model=None, + column=None, + char_array_term_separators=None, + **params): + """ + **Description** + The input to this transform is text, and the output is a vector of + text containing the words (tokens) in the original text. The + separator is space, but can be specified as any other + character (or multiple characters) if needed. + + :param column: New column definition(s) (inputs). + :param data: Input dataset (inputs). + :param char_array_term_separators: Array of single character term + separator(s). By default uses space character separator. + (inputs). + :param output_data: Transformed dataset (outputs). + :param model: Transform model (outputs). 
+ """ + + entrypoint_name = 'Transforms.WordTokenizer' + inputs = {} + outputs = {} + + if column is not None: + inputs['Column'] = try_set( + obj=column, + none_acceptable=True, + is_of_type=list, + is_column=True) + if data is not None: + inputs['Data'] = try_set( + obj=data, + none_acceptable=False, + is_of_type=str) + if char_array_term_separators is not None: + inputs['CharArrayTermSeparators'] = try_set( + obj=char_array_term_separators, + none_acceptable=True, + is_of_type=list) + if output_data is not None: + outputs['OutputData'] = try_set( + obj=output_data, + none_acceptable=False, + is_of_type=str) + if model is not None: + outputs['Model'] = try_set( + obj=model, + none_acceptable=False, + is_of_type=str) + + input_variables = { + x for x in unlist(inputs.values()) + if isinstance(x, str) and x.startswith("$")} + output_variables = { + x for x in unlist(outputs.values()) + if isinstance(x, str) and x.startswith("$")} + + entrypoint = EntryPoint( + name=entrypoint_name, inputs=inputs, outputs=outputs, + input_variables=input_variables, + output_variables=output_variables) + return entrypoint diff --git a/src/python/nimbusml/internal/utils/data_schema.py b/src/python/nimbusml/internal/utils/data_schema.py index 0fb409e1..e1880dab 100644 --- a/src/python/nimbusml/internal/utils/data_schema.py +++ b/src/python/nimbusml/internal/utils/data_schema.py @@ -655,7 +655,7 @@ def handle_file(filename): graph = Graph(*(graph_nodes), inputs=dict(file=filename), outputs=dict(data='')) st = FileDataStream(filename, schema=None) - (out_model, out_data, out_metrics) = graph.run(verbose=True, + (out_model, out_data, out_metrics, _) = graph.run(verbose=True, X=st) if isinstance(filepath_or_buffer, StringIO): @@ -882,6 +882,21 @@ def clean_name(col): final_schema.sort() return DataSchema(final_schema, **opt) + @staticmethod + def extract_idv_schema_from_file(path): + with open(path, 'r') as f: + lines = f.readlines() + + col_regex = re.compile(r'#@\s*(col=.*)$') + col_specs = [] + + for line in lines: + match = col_regex.match(line) + if match: + col_specs.append(match.group(1)) + + return DataSchema(' '.join(col_specs)) + class COL: """ diff --git a/src/python/nimbusml/internal/utils/data_stream.py b/src/python/nimbusml/internal/utils/data_stream.py index 7d490bc6..ea544307 100644 --- a/src/python/nimbusml/internal/utils/data_stream.py +++ b/src/python/nimbusml/internal/utils/data_stream.py @@ -399,13 +399,17 @@ def __init__(self, parent, columns): class BinaryDataStream(DataStream): """ - Defines a data view. + Data accessor for IDV data format, see here https://github.com/dotnet/machinelearning/blob/master/docs/code/IDataViewImplementation.md """ - def __init__(self, filename): - # REVIEW: would be good to figure out a way to know the schema of the - # binary IDV. - super(BinaryDataStream, self).__init__(DataSchema("")) + def __init__(self, filename=None): + if filename: + schema_file_path = os.path.splitext(filename)[0] + '.schema' + schema = DataSchema.extract_idv_schema_from_file(schema_file_path) + else: + schema = DataSchema("") + + super(BinaryDataStream, self).__init__(schema) self._filename = filename def __repr__(self): @@ -419,7 +423,7 @@ def to_df(self): # Do not move these imports or the module fails # due to circular references. 
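As context for the new `extract_idv_schema_from_file` helper and the `BinaryDataStream` change above: the helper scans a side-car `.schema` file and keeps only lines of the form `#@ col=...`. A hypothetical sketch, with illustrative file contents:

```python
# Hypothetical sketch: read column declarations from an IDV side-car schema
# file. Only lines matching '#@ col=...' are kept; everything else is ignored.
#
# Illustrative contents of 'example.schema':
#   #@ TextLoader{
#   #@ col=row_num:I8:0 col=education:R4:1-3 col=age:I8:4
#   #@ }
from nimbusml.internal.utils.data_schema import DataSchema

schema = DataSchema.extract_idv_schema_from_file('example.schema')
print(schema)  # expected to carry the col=... declarations shown above
```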
from ..entrypoints.transforms_nooperation import transforms_nooperation - from .entrypoints import Graph + from .entrypoints import Graph, DataOutputFormat no_op = transforms_nooperation( data='$data', output_data='$output_data') @@ -427,8 +431,8 @@ def to_df(self): graph = Graph( dict( data=''), dict( - output_data=''), False, *(graph_nodes)) - (out_model, out_data, out_metrics) = graph.run(verbose=True, X=self) + output_data=''), DataOutputFormat.DF, *(graph_nodes)) + (out_model, out_data, out_metrics, _) = graph.run(verbose=True, X=self) return out_data def head(self, n=5, skip=0): @@ -438,7 +442,7 @@ def head(self, n=5, skip=0): transforms_rowtakefilter from ..entrypoints.transforms_rowskipfilter import \ transforms_rowskipfilter - from .entrypoints import Graph + from .entrypoints import Graph, DataOutputFormat if n == 0: raise ValueError("n must be > 0") graph_nodes = [] @@ -456,10 +460,16 @@ def head(self, n=5, skip=0): graph = Graph( dict( data=''), dict( - output_data=''), False, *(graph_nodes)) - (out_model, out_data, out_metrics) = graph.run(verbose=True, X=self) + output_data=''), DataOutputFormat.DF, *(graph_nodes)) + (out_model, out_data, out_metrics, _) = graph.run(verbose=True, X=self) return out_data + def get_dataframe_schema(self): + if not hasattr(self, '_df_schema') or not self._df_schema: + head = self.head(n=1) + self._df_schema = DataSchema.read_schema(head) + return self._df_schema + def clone(self): """ Copy/clone the object. @@ -479,7 +489,7 @@ class DprepDataStream(BinaryDataStream): def __init__(self, dataflow=None, filename=None): if dataflow is None and filename is None: raise ValueError('Both dataflow object and filename are None') - super(DprepDataStream, self).__init__(DataSchema("")) + super(DprepDataStream, self).__init__() if dataflow is not None: (fd, filename) = tempfile.mkstemp(suffix='.dprep') fl = os.fdopen(fd, "wt") diff --git a/src/python/nimbusml/internal/utils/dataframes.py b/src/python/nimbusml/internal/utils/dataframes.py index fe46ac20..17572ad1 100644 --- a/src/python/nimbusml/internal/utils/dataframes.py +++ b/src/python/nimbusml/internal/utils/dataframes.py @@ -6,7 +6,7 @@ import numpy as np import six -from pandas import DataFrame, Series, concat, Categorical +from pandas import DataFrame, Series, concat, Categorical, to_datetime from pandas.api.types import infer_dtype from scipy.sparse import csr_matrix @@ -47,6 +47,13 @@ def resolve_dataframe(dataframe): # Workaround, empty dataframe needs to be sent as an array # to convey type information ret[name_i] = serie.values.reshape((len(serie), 1)) + + elif serie.dtype == np.dtype('datetime64[ns]'): + values = serie.values.astype(np.int64, copy=False) + values = values // 1000000 # convert from nanoseconds to milliseconds + ret[str(i)] = values + types.append(_global_dtype_to_char_dict[np.dtype('datetime64[ns]')]) + elif serie.dtype == np.object or str(serie.dtype) == '= 2: raise BridgeRuntimeError( - "{0}.\n--CODE--\n{1}\n--GRAPH--\n{2}\n--DATA--\n{3}" - "\n--\nconcatenated={4}".format( - str(e), code, str(self), vars, concatenated), + "{0}.\n--GRAPH--\n{1}\n--DATA--\n{2}" + "\n--\nconcatenated={3}".format( + str(e), str(self), vars, concatenated), model=output_modelfilename) else: raise BridgeRuntimeError( @@ -348,12 +319,16 @@ def _get_separator(self): return None return pieces[0].replace("sep=", "").strip() - def idv_bridge(self, X, y, code, random_state=None, verbose=1, **params): + def run(self, X, y=None, max_slots=-1, random_state=None, verbose=1, **params): + if params.get("dryrun") 
is not None: + return 'graph = %s' % (str(self)) + output_modelfilename = None + output_predictor_modelfilename = None output_metricsfilename = None out_metrics = None - # Ideally, idv_bridge shouldn't care if it's running CV + # Ideally, run_graph shouldn't care if it's running CV # or a regular pipeline. That required changing the idv_bridge to be # more flexible (e.g. changing return value, changing input # structure, etc.) In my first attempt, this approach caused @@ -421,15 +396,23 @@ def remove_multi_level_index(c): output_modelfilename = _get_temp_file(suffix='.model.bin') self.outputs['output_model'] = output_modelfilename + # set graph output model to temp file + if 'output_predictor_model' in self.outputs: + output_predictor_modelfilename = _get_temp_file(suffix='.predictor.model.bin') + self.outputs['output_predictor_model'] = output_predictor_modelfilename + # set graph output metrics to temp file if 'output_metrics' in self.outputs: output_metricsfilename = _get_temp_file(suffix='.txt') self.outputs['output_metrics'] = output_metricsfilename - if 'output_data' in self.outputs and \ - self._output_binary_data_stream: - output_idvfilename = _get_temp_file(suffix='.idv') - self.outputs['output_data'] = output_idvfilename + if 'output_data' in self.outputs: + if self._data_output_format == DataOutputFormat.IDV: + output_idvfilename = _get_temp_file(suffix='.idv') + self.outputs['output_data'] = output_idvfilename + + elif self._data_output_format == DataOutputFormat.CSR: + self.outputs['output_data'] = "" # set graph file for debuggings if verbose > 0: @@ -442,9 +425,7 @@ def remove_multi_level_index(c): f.write(self.nimbusml_runnable_graph) call_parameters['verbose'] = try_set(verbose, False, six.integer_types) - call_parameters['graph'] = try_set( - 'graph = {%s} %s' % - (str(self), code), False, str) + call_parameters['graph'] = try_set(str(self), False, str) # Set paths to .NET Core CLR, ML.NET and DataPrep libs set_clr_environment_vars() @@ -455,23 +436,35 @@ def remove_multi_level_index(c): if random_state: call_parameters['seed'] = try_set(random_state, False, six.integer_types) + + if max_slots: + call_parameters['max_slots'] = try_set(max_slots, False, six.integer_types) + ret = self._try_call_bridge( px_call, call_parameters, - code, verbose, concatenated, - output_modelfilename) - - out_data = resolve_output(ret) - # remove label column from data - if out_data is not None and concatenated: - out_columns = list(out_data.columns) - if hasattr(y, 'columns'): - y_column = y.columns[0] - if y_column in out_columns: - out_columns.remove(y_column) - out_data = out_data[out_columns] + output_modelfilename, + output_predictor_modelfilename) + + out_data = None + + if not cv and self._data_output_format == DataOutputFormat.CSR: + out_data = resolve_output_as_csrmatrix(ret) + elif not cv and self._data_output_format == DataOutputFormat.LIST: + out_data = resolve_output_as_list(ret) + else: + out_data = resolve_output_as_dataframe(ret) + # remove label column from data + if out_data is not None and concatenated: + out_columns = list(out_data.columns) + if hasattr(y, 'columns'): + y_column = y.columns[0] + if y_column in out_columns: + out_columns.remove(y_column) + out_data = out_data[out_columns] + if output_metricsfilename: out_metrics = pd.read_csv( output_metricsfilename, @@ -484,18 +477,15 @@ def remove_multi_level_index(c): if cv: return self._process_graph_run_results(out_data) - elif self._output_binary_data_stream: + elif self._data_output_format == DataOutputFormat.IDV: 
output = BinaryDataStream(output_idvfilename) - return (output_modelfilename, output, out_metrics) + return (output_modelfilename, output, out_metrics, output_predictor_modelfilename) else: - return (output_modelfilename, out_data, out_metrics) + return (output_modelfilename, out_data, out_metrics, output_predictor_modelfilename) finally: if cv: self._remove_temp_files() else: - if output_modelfilename: - # os.remove(output_modelfilename) - pass if output_metricsfilename: os.remove(output_metricsfilename) diff --git a/src/python/nimbusml/linear_model/_linearsvmbinaryclassifier.py b/src/python/nimbusml/linear_model/_linearsvmbinaryclassifier.py index 4f35d8c5..783a6ad5 100644 --- a/src/python/nimbusml/linear_model/_linearsvmbinaryclassifier.py +++ b/src/python/nimbusml/linear_model/_linearsvmbinaryclassifier.py @@ -78,7 +78,9 @@ class LinearSvmBinaryClassifier( :param caching: Whether trainer should cache input training data. - :param lambda_: Regularizer constant. + :param l2_regularization: L2 regularization weight. It also controls the + learning rate, with the learning rate being inversely proportional to + it. :param perform_projection: Perform projection to unit-ball? Typically used with batch size > 1. @@ -114,7 +116,7 @@ def __init__( self, normalize='Auto', caching='Auto', - lambda_=0.001, + l2_regularization=0.001, perform_projection=False, number_of_iterations=1, initial_weights_diameter=0.0, @@ -147,7 +149,7 @@ def __init__( self, normalize=normalize, caching=caching, - lambda_=lambda_, + l2_regularization=l2_regularization, perform_projection=perform_projection, number_of_iterations=number_of_iterations, initial_weights_diameter=initial_weights_diameter, diff --git a/src/python/nimbusml/model_selection/_cv.py b/src/python/nimbusml/model_selection/_cv.py index d719e07f..79a5def4 100644 --- a/src/python/nimbusml/model_selection/_cv.py +++ b/src/python/nimbusml/model_selection/_cv.py @@ -17,7 +17,7 @@ transforms_manyheterogeneousmodelcombiner from ..internal.entrypoints.transforms_modelcombiner import \ transforms_modelcombiner -from ..internal.utils.entrypoints import Graph, GraphOutputType +from ..internal.utils.entrypoints import Graph, GraphOutputType, DataOutputFormat # Extension method for extending a list of steps, with chaining @@ -544,7 +544,7 @@ def fit( group_column=group_id) steps.add(cv_node) - graph = Graph(cv_aux_info.inputs, self.outputs, False, *steps) + graph = Graph(cv_aux_info.inputs, self.outputs, DataOutputFormat.DF, *steps) # prepare telemetry info class_name = type(self).__name__ @@ -557,7 +557,6 @@ def fit( X=X, y=y, random_state=pipeline.random_state, - seed=pipeline.random_state, w=weights, verbose=verbose, telemetry_info=telemetry_info, diff --git a/src/python/nimbusml/preprocessing/__init__.py b/src/python/nimbusml/preprocessing/__init__.py index 2af0b4b3..09a735c8 100644 --- a/src/python/nimbusml/preprocessing/__init__.py +++ b/src/python/nimbusml/preprocessing/__init__.py @@ -1,10 +1,12 @@ from ._fromkey import FromKey from ._tokey import ToKey from ._tensorflowscorer import TensorFlowScorer +from ._datasettransformer import DatasetTransformer __all__ = [ 'FromKey', 'ToKey', - 'TensorFlowScorer' + 'TensorFlowScorer', + 'DatasetTransformer' ] diff --git a/src/python/nimbusml/preprocessing/_datasettransformer.py b/src/python/nimbusml/preprocessing/_datasettransformer.py new file mode 100644 index 00000000..f3964a4b --- /dev/null +++ b/src/python/nimbusml/preprocessing/_datasettransformer.py @@ -0,0 +1,54 @@ +# 
-------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +DatasetTransformer +""" + +__all__ = ["DatasetTransformer"] + + +from sklearn.base import TransformerMixin + +from ..base_transform import BaseTransform +from ..internal.core.preprocessing._datasettransformer import \ + DatasetTransformer as core +from ..internal.utils.utils import trace + + +class DatasetTransformer(core, BaseTransform, TransformerMixin): + """ + **Description** + Applies a TransformModel to a dataset. + + :param columns: see `Columns `_. + + :param transform_model: Transform model. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + transform_model, + columns=None, + **params): + + if columns: + params['columns'] = columns + BaseTransform.__init__(self, **params) + core.__init__( + self, + transform_model=transform_model, + **params) + self._columns = columns + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/nimbusml/preprocessing/normalization/__init__.py b/src/python/nimbusml/preprocessing/normalization/__init__.py index 2c05bf41..a312a870 100644 --- a/src/python/nimbusml/preprocessing/normalization/__init__.py +++ b/src/python/nimbusml/preprocessing/normalization/__init__.py @@ -1,6 +1,7 @@ from ._binner import Binner from ._globalcontrastrowscaler import GlobalContrastRowScaler from ._logmeanvariancescaler import LogMeanVarianceScaler +from ._lpscaler import LpScaler from ._meanvariancescaler import MeanVarianceScaler from ._minmaxscaler import MinMaxScaler @@ -8,6 +9,7 @@ 'Binner', 'GlobalContrastRowScaler', 'LogMeanVarianceScaler', + 'LpScaler', 'MeanVarianceScaler', - 'MinMaxScaler', + 'MinMaxScaler' ] diff --git a/src/python/nimbusml/preprocessing/normalization/_lpscaler.py b/src/python/nimbusml/preprocessing/normalization/_lpscaler.py new file mode 100644 index 00000000..e9fcbb34 --- /dev/null +++ b/src/python/nimbusml/preprocessing/normalization/_lpscaler.py @@ -0,0 +1,68 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +LpScaler +""" + +__all__ = ["LpScaler"] + + +from sklearn.base import TransformerMixin + +from ...base_transform import BaseTransform +from ...internal.core.preprocessing.normalization._lpscaler import \ + LpScaler as core +from ...internal.utils.utils import trace + + +class LpScaler(core, BaseTransform, TransformerMixin): + """ + **Description** + Normalize vectors (rows) individually by rescaling them to unit norm (L2, L1 or LInf). Performs the following operation on a vector X: Y = (X - M) / D, where M is mean and D is either L2 norm, L1 norm or LInf norm. + + :param columns: see `Columns `_. + + :param norm: The norm to use to normalize each sample. + + :param sub_mean: Subtract mean from each value before normalizing. + + :param params: Additional arguments sent to compute engine. 
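To make the formula in the `LpScaler` docstring above concrete, a tiny sketch of the per-row operation in plain NumPy (not the transform itself):

```python
# Y = (X - M) / D with sub_mean=False, so M = 0 and D is the row's L2 norm.
import numpy as np

x = np.array([3.0, 4.0])
y = x / np.linalg.norm(x)   # -> array([0.6, 0.8])
```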
+ + """ + + @trace + def __init__( + self, + norm='L2', + sub_mean=False, + columns=None, + **params): + + if columns: + params['columns'] = columns + BaseTransform.__init__(self, **params) + core.__init__( + self, + norm=norm, + sub_mean=sub_mean, + **params) + self._columns = columns + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) + + def _nodes_with_presteps(self): + """ + Inserts preprocessing before this one. + """ + from ..schema import TypeConverter + return [ + TypeConverter( + result_type='R4')._steal_io(self), + self] diff --git a/src/python/nimbusml/preprocessing/schema/__init__.py b/src/python/nimbusml/preprocessing/schema/__init__.py index a8dae9f8..c28d8ee4 100644 --- a/src/python/nimbusml/preprocessing/schema/__init__.py +++ b/src/python/nimbusml/preprocessing/schema/__init__.py @@ -2,6 +2,7 @@ from ._columndropper import ColumnDropper from ._columnduplicator import ColumnDuplicator from ._columnselector import ColumnSelector +from ._prefixcolumnconcatenator import PrefixColumnConcatenator from ._typeconverter import TypeConverter __all__ = [ @@ -9,6 +10,7 @@ 'ColumnDropper', 'ColumnDuplicator', 'ColumnSelector', + 'PrefixColumnConcatenator', 'TypeConverter' ] diff --git a/src/python/nimbusml/preprocessing/schema/_prefixcolumnconcatenator.py b/src/python/nimbusml/preprocessing/schema/_prefixcolumnconcatenator.py new file mode 100644 index 00000000..53eccf1f --- /dev/null +++ b/src/python/nimbusml/preprocessing/schema/_prefixcolumnconcatenator.py @@ -0,0 +1,86 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +""" +PrefixColumnConcatenator +""" + +__all__ = ["PrefixColumnConcatenator"] + + +from sklearn.base import TransformerMixin + +from ...base_transform import BaseTransform +from ...internal.core.preprocessing.schema._prefixcolumnconcatenator import \ + PrefixColumnConcatenator as core +from ...internal.utils.utils import trace + + +class PrefixColumnConcatenator(core, BaseTransform, TransformerMixin): + """ + + Combines several columns into a single vector-valued column by prefix. + + .. remarks:: + ``PrefixColumnConcatenator`` creates a single vector-valued column from + multiple + columns. It can be performed on data before training a model. The + concatenation + can significantly speed up the processing of data when the number of + columns + is as large as hundreds to thousands. + + :param columns: a dictionary of key-value pairs, where key is the output + column name and value is a list of input column names. + + * Only one key-value pair is allowed. + * Input column type: numeric or string. + * Output column type: + `Vector Type `_. + + The << operator can be used to set this value (see + `Column Operator `_) + + For example + * ColumnConcatenator(columns={'features': ['age', 'parity', + 'induced']}) + * ColumnConcatenator() << {'features': ['age', 'parity', + 'induced']}) + + For more details see `Columns `_. + + :param params: Additional arguments sent to compute engine. + + .. seealso:: + :py:class:`ColumnDropper + `, + :py:class:`ColumnSelector + `. + + .. index:: transform, schema + + Example: + .. 
literalinclude:: /../nimbusml/examples/PrefixColumnConcatenator.py + :language: python + """ + + @trace + def __init__( + self, + columns=None, + **params): + + if columns: + params['columns'] = columns + BaseTransform.__init__(self, **params) + core.__init__( + self, + **params) + self._columns = columns + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/nimbusml/preprocessing/text/__init__.py b/src/python/nimbusml/preprocessing/text/__init__.py index c312a30e..f40795e1 100644 --- a/src/python/nimbusml/preprocessing/text/__init__.py +++ b/src/python/nimbusml/preprocessing/text/__init__.py @@ -1,5 +1,7 @@ from ._chartokenizer import CharTokenizer +from ._wordtokenizer import WordTokenizer __all__ = [ - 'CharTokenizer' -] \ No newline at end of file + 'CharTokenizer', + 'WordTokenizer' +] diff --git a/src/python/nimbusml/preprocessing/text/_wordtokenizer.py b/src/python/nimbusml/preprocessing/text/_wordtokenizer.py new file mode 100644 index 00000000..94a1c2ac --- /dev/null +++ b/src/python/nimbusml/preprocessing/text/_wordtokenizer.py @@ -0,0 +1,55 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +WordTokenizer +""" + +__all__ = ["WordTokenizer"] + + +from sklearn.base import TransformerMixin + +from ...base_transform import BaseTransform +from ...internal.core.preprocessing.text._wordtokenizer import \ + WordTokenizer as core +from ...internal.utils.utils import trace + + +class WordTokenizer(core, BaseTransform, TransformerMixin): + """ + **Description** + The input to this transform is text, and the output is a vector of text containing the words (tokens) in the original text. The separator is space, but can be specified as any other character (or multiple characters) if needed. + + :param columns: see `Columns `_. + + :param char_array_term_separators: Array of single character term + separator(s). By default uses space character separator. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + char_array_term_separators=None, + columns=None, + **params): + + if columns: + params['columns'] = columns + BaseTransform.__init__(self, **params) + core.__init__( + self, + char_array_term_separators=char_array_term_separators, + **params) + self._columns = columns + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/nimbusml/tests/data_type/test_datetime.py b/src/python/nimbusml/tests/data_type/test_datetime.py new file mode 100644 index 00000000..fabab5b0 --- /dev/null +++ b/src/python/nimbusml/tests/data_type/test_datetime.py @@ -0,0 +1,140 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------------------------- +import os +import sys +import unittest +import tempfile + +import numpy as np +import pandas as pd +from nimbusml import Pipeline, DprepDataStream +from nimbusml.preprocessing.missing_values import Handler + + +def get_temp_file(suffix=None): + fd, file_name = tempfile.mkstemp(suffix=suffix) + fl = os.fdopen(fd, 'w') + fl.close() + return file_name + + +class TestDateTimeDataType(unittest.TestCase): + def test_negative_values(self): + milliseconds_in_year = 365*24*60*60*1000 + data = [i * milliseconds_in_year for i in [-1, -2, -3, -3.3]] + + df = pd.DataFrame({'c1': data, 'c2': [3,4,5,6]}) + df = df.astype({'c1': np.dtype('datetime64[ms]')}) + + pipeline = Pipeline(steps=[Handler(columns={'c2': 'c2'})]) + result = pipeline.fit_transform(df) + + self.assertTrue(result.loc[:, 'c1'].equals(df.loc[:, 'c1'])) + self.assertEqual(result.loc[:, 'c1'].dtype, np.dtype('datetime64[ns]')) + + self.assertEqual(result.loc[0, 'c1'].year, 1969) + self.assertEqual(result.loc[0, 'c1'].hour, 0) + self.assertEqual(result.loc[0, 'c1'].minute, 0) + self.assertEqual(result.loc[0, 'c1'].second, 0) + + self.assertEqual(result.loc[3, 'c1'].year, 1966) + + def test_timestamp_boundaries(self): + # Here are the current min and max for a Pandas Timestamp + # 1677-09-21 00:12:43.145225 + # 2262-04-11 23:47:16.854775807 + + data = [pd.Timestamp(1677, 9, 22, 1), pd.Timestamp.max] + df = pd.DataFrame({'c1': data, 'c2': [3,4]}) + df = df.astype({'c1': np.dtype('datetime64[ms]')}) + + pipeline = Pipeline(steps=[Handler(columns={'c2': 'c2'})]) + result = pipeline.fit_transform(df) + + self.assertTrue(result.loc[:, 'c1'].equals(df.loc[:, 'c1'])) + self.assertEqual(result.dtypes[0], np.dtype('datetime64[ns]')) + + self.assertEqual(result.loc[0, 'c1'].year, 1677) + self.assertEqual(result.loc[0, 'c1'].month, 9) + self.assertEqual(result.loc[0, 'c1'].day, 22) + + self.assertEqual(result.loc[1, 'c1'].year, 2262) + self.assertEqual(result.loc[1, 'c1'].month, 4) + self.assertEqual(result.loc[1, 'c1'].day, 11) + + def test_datetime_column_parsed_from_string(self): + dates = ["2018-01-02", "2018-02-01"] + df = pd.DataFrame({'c1': dates, 'c2': [3,4]}) + + file_name = get_temp_file('.csv') + df.to_csv(file_name) + df = pd.read_csv(file_name, parse_dates=['c1'], index_col=0) + + self.assertEqual(df.dtypes[0], np.dtype('datetime64[ns]')) + + pipeline = Pipeline(steps=[Handler(columns={'c2': 'c2'})]) + result = pipeline.fit_transform(df) + + self.assertEqual(result.loc[0, 'c1'].year, 2018) + self.assertEqual(result.loc[0, 'c1'].month, 1) + self.assertEqual(result.loc[0, 'c1'].day, 2) + self.assertEqual(result.loc[0, 'c1'].hour, 0) + self.assertEqual(result.loc[0, 'c1'].minute, 0) + self.assertEqual(result.loc[0, 'c1'].second, 0) + + self.assertEqual(result.loc[1, 'c1'].year, 2018) + self.assertEqual(result.loc[1, 'c1'].month, 2) + self.assertEqual(result.loc[1, 'c1'].day, 1) + self.assertEqual(result.loc[1, 'c1'].hour, 0) + self.assertEqual(result.loc[1, 'c1'].minute, 0) + self.assertEqual(result.loc[1, 'c1'].second, 0) + + self.assertEqual(len(result), 2) + self.assertEqual(result.dtypes[0], np.dtype('datetime64[ns]')) + + os.remove(file_name) + + @unittest.skipIf(sys.version_info[:2] == (2, 7), "azureml-dataprep is not installed.") + def test_dprep_datastream(self): + import azureml.dataprep as dprep + + dates = ["2018-01-02 00:00:00", "2018-02-01 10:00:00"] + col2 = ['0', '1'] + label_array = np.repeat([0], 2) + train_df = 
pd.DataFrame({'col1': dates, 'col2': col2, 'label': label_array}) + + pipeline = Pipeline(steps=[ + Handler(columns={'2': 'col2'}, concat=False, impute_by_slot=True, replace_with='Mean') + ]) + + file_name = get_temp_file('.csv') + train_df.to_csv(file_name) + + dataflow = dprep.read_csv(file_name, infer_column_types=True) + dprepDataStream = DprepDataStream(dataflow) + + result = pipeline.fit_transform(dprepDataStream) + + self.assertEqual(result.loc[:, 'col1'].dtype, np.dtype('datetime64[ns]')) + + self.assertEqual(result.loc[0, 'col1'].year, 2018) + self.assertEqual(result.loc[0, 'col1'].month, 1) + self.assertEqual(result.loc[0, 'col1'].day, 2) + self.assertEqual(result.loc[0, 'col1'].hour, 0) + self.assertEqual(result.loc[0, 'col1'].minute, 0) + self.assertEqual(result.loc[0, 'col1'].second, 0) + + self.assertEqual(result.loc[1, 'col1'].year, 2018) + self.assertEqual(result.loc[1, 'col1'].month, 2) + self.assertEqual(result.loc[1, 'col1'].day, 1) + self.assertEqual(result.loc[1, 'col1'].hour, 10) + self.assertEqual(result.loc[1, 'col1'].minute, 0) + self.assertEqual(result.loc[1, 'col1'].second, 0) + + os.remove(file_name) + + +if __name__ == '__main__': + unittest.main() diff --git a/src/python/nimbusml/tests/decomposition/test_pcaanomalydetector.py b/src/python/nimbusml/tests/decomposition/test_pcaanomalydetector.py index f3d81ea2..c5a04806 100644 --- a/src/python/nimbusml/tests/decomposition/test_pcaanomalydetector.py +++ b/src/python/nimbusml/tests/decomposition/test_pcaanomalydetector.py @@ -25,7 +25,7 @@ def test_PcaAnomalyDetector(self): scores = svm.predict(X_test) assert_almost_equal( scores.sum().sum(), - 4.181632, + 4.1786637, decimal=7, err_msg="Sum should be %s" % 4.181632) diff --git a/src/python/nimbusml/tests/dprep/test_dprep.py b/src/python/nimbusml/tests/dprep/test_dprep.py index c8ebbbdb..a2061c51 100644 --- a/src/python/nimbusml/tests/dprep/test_dprep.py +++ b/src/python/nimbusml/tests/dprep/test_dprep.py @@ -28,7 +28,7 @@ def assert_2d_array_equal(actual, desired): continue assert_true(actual[i][y] == desired[i][y]) -@unittest.skipIf(os.name == "posix" or sys.version_info[:2] != (3, 7), "azureml-dataprep is not installed.") +@unittest.skipIf(sys.version_info[:2] == (2, 7), "azureml-dataprep is not installed.") class TestDprep(unittest.TestCase): def test_fit_transform(self): diff --git a/src/python/nimbusml/tests/ensemble/test_ensembleregressor.py b/src/python/nimbusml/tests/ensemble/test_ensembleregressor.py index 5c61d9b2..a3c95495 100644 --- a/src/python/nimbusml/tests/ensemble/test_ensembleregressor.py +++ b/src/python/nimbusml/tests/ensemble/test_ensembleregressor.py @@ -33,7 +33,7 @@ def test_ensembleregressor(self): scores = ensemble.predict(X_test) r2 = r2_score(y_test, scores) - assert_greater(r2, 0.12, "should be greater than %s" % 0.12) + assert_greater(r2, 0.105, "should be greater than %s" % 0.105) assert_less(r2, 0.13, "sum should be less than %s" % 0.13) ensemble_with_options = EnsembleRegressor( @@ -46,8 +46,8 @@ def test_ensembleregressor(self): scores = ensemble_with_options.predict(X_test) r2 = r2_score(y_test, scores) - assert_greater(r2, 0.0279, "R-Squared should be greater than %s" % 0.0279) - assert_less(r2, 0.03, "R-Squared should be less than %s" % 0.03) + assert_greater(r2, 0.07, "R-Squared should be greater than %s" % 0.07) + assert_less(r2, 0.08, "R-Squared should be less than %s" % 0.08) if __name__ == '__main__': diff --git a/src/python/nimbusml/tests/idv/test_idv.py b/src/python/nimbusml/tests/idv/test_idv.py index e86f2226..c92fb092 
100644 --- a/src/python/nimbusml/tests/idv/test_idv.py +++ b/src/python/nimbusml/tests/idv/test_idv.py @@ -9,8 +9,10 @@ import pandas as pd from nimbusml import Pipeline, FileDataStream, BinaryDataStream from nimbusml.datasets import get_dataset -from nimbusml.linear_model import FastLinearRegressor +from nimbusml.feature_extraction.categorical import OneHotVectorizer +from nimbusml.linear_model import FastLinearRegressor, OnlineGradientDescentRegressor from nimbusml.preprocessing.normalization import MinMaxScaler +from nimbusml.preprocessing.schema import ColumnDropper from sklearn.utils.testing import assert_true, assert_array_equal # data input (as a FileDataStream) @@ -105,6 +107,113 @@ def test_test(self): assert_array_equal(scores, scores_df) assert_array_equal(metrics, metrics_df) + def test_fit_predictor_with_idv(self): + train_data = {'c0': ['a', 'b', 'a', 'b'], + 'c1': [1, 2, 3, 4], + 'c2': [2, 3, 4, 5]} + train_df = pd.DataFrame(train_data).astype({'c1': np.float64, + 'c2': np.float64}) + + test_data = {'c0': ['a', 'b', 'b'], + 'c1': [1.5, 2.3, 3.7], + 'c2': [2.2, 4.9, 2.7]} + test_df = pd.DataFrame(test_data).astype({'c1': np.float64, + 'c2': np.float64}) + + # Fit a transform pipeline to the training data + transform_pipeline = Pipeline([OneHotVectorizer() << 'c0']) + transform_pipeline.fit(train_df) + df = transform_pipeline.transform(train_df, as_binary_data_stream=True) + + # Fit a predictor pipeline given a transformed BinaryDataStream + predictor = OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1']) + predictor_pipeline = Pipeline([predictor]) + predictor_pipeline.fit(df) + + # Perform a prediction given the test data using + # the transform and predictor defined previously. + df = transform_pipeline.transform(test_df, as_binary_data_stream=True) + result_1 = predictor_pipeline.predict(df) + + # Create expected result + xf = OneHotVectorizer() << 'c0' + df = xf.fit_transform(train_df) + predictor = OnlineGradientDescentRegressor(label='c2', feature=['c0.a', 'c0.b', 'c1']) + predictor.fit(df) + df = xf.transform(test_df) + expected_result = predictor.predict(df) + + self.assertTrue(result_1.loc[:, 'Score'].equals(expected_result)) + + def test_fit_transform_with_idv(self): + path = get_dataset('infert').as_filepath() + data = FileDataStream.read_csv(path) + + featurization_pipeline = Pipeline([OneHotVectorizer(columns={'education': 'education'})]) + featurization_pipeline.fit(data) + featurized_data = featurization_pipeline.transform(data, as_binary_data_stream=True) + + schema = featurized_data.schema + num_columns = len(schema) + self.assertTrue('case' in schema) + self.assertTrue('row_num' in schema) + + pipeline = Pipeline([ColumnDropper() << ['case', 'row_num']]) + pipeline.fit(featurized_data) + result = pipeline.transform(featurized_data, as_binary_data_stream=True) + + schema = result.schema + self.assertEqual(len(schema), num_columns - 2) + self.assertTrue('case' not in schema) + self.assertTrue('row_num' not in schema) + + def test_schema_with_vectorized_column(self): + path = get_dataset('infert').as_filepath() + data = FileDataStream.read_csv(path) + + featurization_pipeline = Pipeline([OneHotVectorizer(columns={'education': 'education'})]) + featurization_pipeline.fit(data) + featurized_data = featurization_pipeline.transform(data, as_binary_data_stream=True) + + # col=row_num:I8:0 col=education:R4:1-3 col=age:I8:4 col=parity:I8:5 + # col=induced:I8:6 col=case:I8:7 col=spontaneous:I8:8 col=stratum:I8:9 + # col=pooled.stratum:I8:10 quote+ + schema 
= featurized_data.schema + + self.assertEqual(len(schema), 9) + self.assertEqual(schema['age'].Type, 'I8') + self.assertEqual(schema['age'].Name, 'age') + self.assertEqual(schema['age'].IsVector, False) + + self.assertEqual(schema['education'].Type, 'R4') + self.assertEqual(schema['education'].Name, 'education') + self.assertEqual(len(schema['education'].Pos), 3) + self.assertEqual(schema['education'].IsVector, True) + + self.assertTrue('education.0-5yrs' not in schema) + self.assertTrue('education.6-11yrs' not in schema) + self.assertTrue('education.12+yrs' not in schema) + + # col=row_num:I8:0 col=education.0-5yrs:R4:1 col=education.6-11yrs:R4:2 + # col=education.12+yrs:R4:3 col=age:I8:4 col=parity:I8:5 col=induced:I8:6 + # col=case:I8:7 col=spontaneous:I8:8 col=stratum:I8:9 col=pooled.stratum:I8:10 + # quote+ header=+ + schema = featurized_data.get_dataframe_schema() + + self.assertEqual(len(schema), 11) + self.assertEqual(schema['age'].Type, 'I8') + self.assertEqual(schema['age'].Name, 'age') + self.assertEqual(schema['age'].IsVector, False) + + self.assertTrue('education' not in schema) + self.assertTrue('education.0-5yrs' in schema) + self.assertTrue('education.6-11yrs' in schema) + self.assertTrue('education.12+yrs' in schema) + + self.assertEqual(schema['education.0-5yrs'].Type, 'R4') + self.assertEqual(schema['education.0-5yrs'].Name, 'education.0-5yrs') + self.assertEqual(schema['education.0-5yrs'].IsVector, False) + if __name__ == '__main__': unittest.main() diff --git a/src/python/nimbusml/tests/linear_model/test_averagedperceptronbinaryclassifier.py b/src/python/nimbusml/tests/linear_model/test_averagedperceptronbinaryclassifier.py index 96397f70..bcdc6530 100644 --- a/src/python/nimbusml/tests/linear_model/test_averagedperceptronbinaryclassifier.py +++ b/src/python/nimbusml/tests/linear_model/test_averagedperceptronbinaryclassifier.py @@ -37,7 +37,7 @@ def setUpClass(cls): def test_averagedperceptron(self): accuracy = get_accuracy(self, AveragedPerceptronBinaryClassifier()) # Accuracy depends on column Unnamed0 (index). 
- assert_greater(accuracy, 0.98, "accuracy should be %s" % 0.98) + assert_greater(accuracy, 0.93, "accuracy should be greater than %s" % 0.93) def test_averagedperceptron_supported_losses(self): # bug: 'exp' fails on this test diff --git a/src/python/nimbusml/tests/model_selection/test_sweep.py b/src/python/nimbusml/tests/model_selection/test_sweep.py index 5a5f0b32..4faa1993 100644 --- a/src/python/nimbusml/tests/model_selection/test_sweep.py +++ b/src/python/nimbusml/tests/model_selection/test_sweep.py @@ -18,7 +18,7 @@ from nimbusml.feature_extraction.text import NGramFeaturizer from nimbusml.feature_extraction.text import WordEmbedding from nimbusml.feature_extraction.text.extractor import Ngram -from nimbusml.linear_model import FastLinearBinaryClassifier +from nimbusml.linear_model import FastLinearBinaryClassifier, AveragedPerceptronBinaryClassifier from nimbusml.utils import get_X_y from sklearn.model_selection import GridSearchCV from sklearn.utils.testing import assert_raises @@ -68,12 +68,8 @@ def test_hyperparameters_sweep(self): 'learner__number_of_trees': 1} def test_learners_sweep(self): - # grid search over 2 learners, even though pipe defined with - # FastTreesBinaryClassifier - # FastLinearBinaryClassifier learner wins, meaning we grid searched - # over it + # grid search over 2 learners np.random.seed(0) - df = pd.DataFrame(dict(education=['A', 'A', 'A', 'A', 'B', 'A', 'B'], workclass=['X', 'Y', 'X', 'X', 'X', 'Y', 'Y'], y=[1, 0, 1, 1, 0, 1, 0])) @@ -86,17 +82,13 @@ def test_learners_sweep(self): param_grid = dict( learner=[ - FastLinearBinaryClassifier(), - FastTreesBinaryClassifier()], - learner__number_of_threads=[ - 1, - 4]) + AveragedPerceptronBinaryClassifier(), + FastTreesBinaryClassifier()]) grid = GridSearchCV(pipe, param_grid) grid.fit(X, y) assert grid.best_params_[ - 'learner'].__class__.__name__ == 'FastLinearBinaryClassifier' - assert grid.best_params_['learner__number_of_threads'] == 1 + 'learner'].__class__.__name__ == 'AveragedPerceptronBinaryClassifier' @unittest.skipIf( six.PY2, diff --git a/src/python/nimbusml/tests/model_summary/test_model_summary.py b/src/python/nimbusml/tests/model_summary/test_model_summary.py index e21d25e9..650238ae 100644 --- a/src/python/nimbusml/tests/model_summary/test_model_summary.py +++ b/src/python/nimbusml/tests/model_summary/test_model_summary.py @@ -67,25 +67,20 @@ OrdinaryLeastSquaresRegressor(), PoissonRegressionRegressor(), OneVsRestClassifier(FastLinearBinaryClassifier()), - LightGbmClassifier(), GamRegressor(), GamBinaryClassifier(), PcaAnomalyDetector(), FactorizationMachineBinaryClassifier(), KMeansPlusPlus(), - NaiveBayesClassifier() - - # Skipping these tests since they are throwing the following error: - # *** System.NotSupportedException: 'Column has variable length - # vector: CategoricalSplitFeatures. Not supported in python. 
- # Drop column before sending to Python - #FastForestBinaryClassifier(), - #FastForestRegressor(), - #FastTreesBinaryClassifier(), - #FastTreesRegressor(), - #FastTreesTweedieRegressor(), - #LightGbmRegressor(), - #LightGbmBinaryClassifier(), + NaiveBayesClassifier(), + FastForestBinaryClassifier(number_of_trees=2), + FastForestRegressor(number_of_trees=2), + FastTreesBinaryClassifier(number_of_trees=2), + FastTreesRegressor(number_of_trees=2), + FastTreesTweedieRegressor(number_of_trees=2), + LightGbmRegressor(number_of_iterations=2), + LightGbmClassifier(), + LightGbmBinaryClassifier(number_of_iterations=2) ] learners_not_supported = [ diff --git a/src/python/nimbusml/tests/pipeline/test_csr_input.py b/src/python/nimbusml/tests/pipeline/test_csr_input.py new file mode 100644 index 00000000..176a7651 --- /dev/null +++ b/src/python/nimbusml/tests/pipeline/test_csr_input.py @@ -0,0 +1,65 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +import os +import unittest + +import numpy as np +import pandas as pd +from nimbusml import Pipeline, FileDataStream +from nimbusml.datasets import get_dataset +from nimbusml.feature_extraction.categorical import OneHotVectorizer +from nimbusml.linear_model import LogisticRegressionBinaryClassifier +from nimbusml.preprocessing import DatasetTransformer +from nimbusml.preprocessing.schema import PrefixColumnConcatenator +from nimbusml.preprocessing.schema import ColumnDropper +from numpy.testing import assert_equal + +class TestCsrInput(unittest.TestCase): + + def test_predict_proba_on_csr(self): + path = get_dataset('infert').as_filepath() + data = FileDataStream.read_csv(path) + cols = list(data.head(1).columns.values) # ordered data column names. + + # train featurizer + featurization_pipeline = Pipeline([OneHotVectorizer(columns={'education': 'education'})]) + featurization_pipeline.fit(data) + # Note: the relative order of all columns is still the same as in raw data. + #print(featurization_pipeline.get_output_columns()) + + # need to remove extra columns before getting csr_matrix featurized data as it wont have column name information. + csr_featurization_pipeline = Pipeline([DatasetTransformer(featurization_pipeline.model), ColumnDropper() << ['case', 'row_num']]) + sparse_featurized_data = csr_featurization_pipeline.fit_transform(data, as_csr=True) + # Note: the relative order of all columns is still the same. 
+ #print(csr_featurization_pipeline.get_output_columns()) + + # train learner + # Note: order & number of feature columns for learner (parameter 'feature') should be the same as in csr_matrix above + cols.remove('row_num') + cols.remove('case') + feature_cols = cols + #print(feature_cols) + #['education', 'age', 'parity', 'induced', 'spontaneous', 'stratum', 'pooled.stratum'] + training_pipeline = Pipeline([DatasetTransformer(featurization_pipeline.model), LogisticRegressionBinaryClassifier(feature=feature_cols, label='case')]) + training_pipeline.fit(data, output_predictor_model=True) + + # load just a learner model + predictor_pipeline = Pipeline() + predictor_pipeline.load_model(training_pipeline.predictor_model) + # see the order of Feature.* columns that get passed to learner algo + #print(predictor_pipeline.get_output_columns()) + + # use just a learner model on csr_matrix featurized data + predictions = predictor_pipeline.predict_proba(sparse_featurized_data) + assert_equal(len(predictions), 248) + assert_equal(len(predictions[0]), 2) + + # get feature contributions + fcc = predictor_pipeline.get_feature_contributions(sparse_featurized_data) + assert_equal(fcc.shape, (248,30)) + +if __name__ == '__main__': + unittest.main() + diff --git a/src/python/nimbusml/tests/pipeline/test_load_save.py b/src/python/nimbusml/tests/pipeline/test_load_save.py index fc112fe5..19bc26ce 100644 --- a/src/python/nimbusml/tests/pipeline/test_load_save.py +++ b/src/python/nimbusml/tests/pipeline/test_load_save.py @@ -7,10 +7,13 @@ import pickle import unittest +import numpy as np +import pandas as pd + from nimbusml import Pipeline from nimbusml.datasets import get_dataset from nimbusml.feature_extraction.categorical import OneHotVectorizer -from nimbusml.linear_model import FastLinearBinaryClassifier +from nimbusml.linear_model import FastLinearBinaryClassifier, OnlineGradientDescentRegressor from nimbusml.utils import get_X_y from numpy.testing import assert_almost_equal @@ -326,5 +329,43 @@ def test_predictor_loaded_from_zip_has_feature_contributions(self): os.remove(model_filename) + def test_pickled_pipeline_with_predictor_model(self): + train_data = {'c1': [1, 2, 3, 4], 'c2': [2, 3, 4, 5]} + train_df = pd.DataFrame(train_data).astype({'c1': np.float64, + 'c2': np.float64}) + + test_data = {'c1': [1.5, 2.3, 3.7], 'c2': [2.2, 4.9, 2.7]} + test_df = pd.DataFrame(test_data).astype({'c1': np.float64, + 'c2': np.float64}) + + # Create predictor model and use it to predict + pipeline = Pipeline([OnlineGradientDescentRegressor(label='c2')], random_state=0) + pipeline.fit(train_df, output_predictor_model=True) + result_1 = pipeline.predict(test_df) + + self.assertTrue(pipeline.model) + self.assertTrue(pipeline.predictor_model) + self.assertNotEqual(pipeline.model, pipeline.predictor_model) + + pickle_filename = 'nimbusml_model.p' + with open(pickle_filename, 'wb') as f: + pickle.dump(pipeline, f) + + os.remove(pipeline.model) + os.remove(pipeline.predictor_model) + + with open(pickle_filename, "rb") as f: + pipeline_pickle = pickle.load(f) + + os.remove(pickle_filename) + + # Load predictor pipeline and score data + predictor_pipeline = Pipeline() + predictor_pipeline.load_model(pipeline_pickle.predictor_model) + result_2 = predictor_pipeline.predict(test_df) + + self.assertTrue(result_1.equals(result_2)) + + if __name__ == '__main__': unittest.main() diff --git a/src/python/nimbusml/tests/pipeline/test_permutation_feature_importance.py 
b/src/python/nimbusml/tests/pipeline/test_permutation_feature_importance.py new file mode 100644 index 00000000..347b2798 --- /dev/null +++ b/src/python/nimbusml/tests/pipeline/test_permutation_feature_importance.py @@ -0,0 +1,125 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +import os +import unittest + +from nimbusml import FileDataStream +from nimbusml import Pipeline +from nimbusml.datasets import get_dataset +from nimbusml.ensemble import LightGbmRanker +from nimbusml.feature_extraction.categorical import OneHotVectorizer +from nimbusml.linear_model import LogisticRegressionBinaryClassifier, \ + FastLinearClassifier, FastLinearRegressor +from nimbusml.preprocessing import ToKey +from numpy.testing import assert_almost_equal +from pandas.testing import assert_frame_equal + +class TestPermutationFeatureImportance(unittest.TestCase): + + @classmethod + def setUpClass(self): + adult_path = get_dataset('uciadult_train').as_filepath() + self.classification_data = FileDataStream.read_csv(adult_path) + binary_pipeline = Pipeline([ + OneHotVectorizer(columns=['education']), + LogisticRegressionBinaryClassifier( + feature=['age', 'education'], label='label', + number_of_threads=1)]) + self.binary_model = binary_pipeline.fit(self.classification_data) + self.binary_pfi = self.binary_model.permutation_feature_importance(self.classification_data) + classifier_pipeline = Pipeline([ + OneHotVectorizer(columns=['education']), + FastLinearClassifier(feature=['age', 'education'], label='label', + number_of_threads=1, shuffle=False)]) + self.classifier_model = classifier_pipeline.fit(self.classification_data) + self.classifier_pfi = self.classifier_model.permutation_feature_importance(self.classification_data) + + infert_path = get_dataset('infert').as_filepath() + self.regression_data = FileDataStream.read_csv(infert_path) + regressor_pipeline = Pipeline([ + OneHotVectorizer(columns=['education']), + FastLinearRegressor(feature=['induced', 'education'], label='age', + number_of_threads=1, shuffle=False)]) + self.regressor_model = regressor_pipeline.fit(self.regression_data) + self.regressor_pfi = self.regressor_model.permutation_feature_importance(self.regression_data) + + ticket_path = get_dataset('gen_tickettrain').as_filepath() + self.ranking_data = FileDataStream.read_csv(ticket_path) + ranker_pipeline = Pipeline([ + ToKey(columns=['group']), + LightGbmRanker(feature=['Class', 'dep_day', 'duration'], + label='rank', group_id='group', + random_state=0, number_of_threads=1)]) + self.ranker_model = ranker_pipeline.fit(self.ranking_data) + self.ranker_pfi = self.ranker_model.permutation_feature_importance(self.ranking_data) + + def test_binary_classifier(self): + assert_almost_equal(self.binary_pfi['AreaUnderRocCurve'].sum(), -0.140824, 6) + assert_almost_equal(self.binary_pfi['PositivePrecision'].sum(), -0.482143, 6) + assert_almost_equal(self.binary_pfi['PositiveRecall'].sum(), -0.0695652, 6) + assert_almost_equal(self.binary_pfi['NegativePrecision'].sum(), -0.0139899, 6) + assert_almost_equal(self.binary_pfi['NegativeRecall'].sum(), -0.00779221, 6) + assert_almost_equal(self.binary_pfi['F1Score'].sum(), -0.126983, 6) + assert_almost_equal(self.binary_pfi['AreaUnderPrecisionRecallCurve'].sum(), -0.19365, 5) + + def 
test_binary_classifier_from_loaded_model(self): + model_path = "model.zip" + self.binary_model.save_model(model_path) + loaded_model = Pipeline() + loaded_model.load_model(model_path) + pfi_from_loaded = loaded_model.permutation_feature_importance(self.classification_data) + assert_frame_equal(self.binary_pfi, pfi_from_loaded) + os.remove(model_path) + + def test_clasifier(self): + assert_almost_equal(self.classifier_pfi['MacroAccuracy'].sum(), -0.0256352, 6) + assert_almost_equal(self.classifier_pfi['LogLoss'].sum(), 0.158811, 6) + assert_almost_equal(self.classifier_pfi['LogLossReduction'].sum(), -0.29449, 5) + assert_almost_equal(self.classifier_pfi['PerClassLogLoss.0'].sum(), 0.0808459, 6) + assert_almost_equal(self.classifier_pfi['PerClassLogLoss.1'].sum(), 0.419826, 6) + + def test_classifier_from_loaded_model(self): + model_path = "model.zip" + self.classifier_model.save_model(model_path) + loaded_model = Pipeline() + loaded_model.load_model(model_path) + pfi_from_loaded = loaded_model.permutation_feature_importance(self.classification_data) + assert_frame_equal(self.classifier_pfi, pfi_from_loaded) + os.remove(model_path) + + def test_regressor(self): + assert_almost_equal(self.regressor_pfi['MeanAbsoluteError'].sum(), 0.504701, 6) + assert_almost_equal(self.regressor_pfi['MeanSquaredError'].sum(), 5.59277, 5) + assert_almost_equal(self.regressor_pfi['RootMeanSquaredError'].sum(), 0.553048, 6) + assert_almost_equal(self.regressor_pfi['RSquared'].sum(), -0.203612, 6) + + def test_regressor_from_loaded_model(self): + model_path = "model.zip" + self.regressor_model.save_model(model_path) + loaded_model = Pipeline() + loaded_model.load_model(model_path) + pfi_from_loaded = loaded_model.permutation_feature_importance(self.regression_data) + assert_frame_equal(self.regressor_pfi, pfi_from_loaded) + os.remove(model_path) + + def test_ranker(self): + assert_almost_equal(self.ranker_pfi['DCG@1'].sum(), -2.16404, 5) + assert_almost_equal(self.ranker_pfi['DCG@2'].sum(), -3.5294, 4) + assert_almost_equal(self.ranker_pfi['DCG@3'].sum(), -4.9721, 4) + assert_almost_equal(self.ranker_pfi['NDCG@1'].sum(), -0.114286, 6) + assert_almost_equal(self.ranker_pfi['NDCG@2'].sum(), -0.198631, 6) + assert_almost_equal(self.ranker_pfi['NDCG@3'].sum(), -0.236544, 6) + + def test_ranker_from_loaded_model(self): + model_path = "model.zip" + self.ranker_model.save_model(model_path) + loaded_model = Pipeline() + loaded_model.load_model(model_path) + pfi_from_loaded = loaded_model.permutation_feature_importance(self.ranking_data) + assert_frame_equal(self.ranker_pfi, pfi_from_loaded) + os.remove(model_path) + +if __name__ == '__main__': + unittest.main() diff --git a/src/python/nimbusml/tests/pipeline/test_pipeline_combining.py b/src/python/nimbusml/tests/pipeline/test_pipeline_combining.py index f16e43aa..f1fc2ec7 100644 --- a/src/python/nimbusml/tests/pipeline/test_pipeline_combining.py +++ b/src/python/nimbusml/tests/pipeline/test_pipeline_combining.py @@ -11,7 +11,10 @@ from nimbusml.datasets import get_dataset from nimbusml.feature_extraction.categorical import OneHotVectorizer from nimbusml.linear_model import LogisticRegressionBinaryClassifier, OnlineGradientDescentRegressor +from nimbusml.multiclass import OneVsRestClassifier from nimbusml.preprocessing.filter import RangeFilter +from nimbusml.preprocessing import DatasetTransformer +from nimbusml.preprocessing.schema import PrefixColumnConcatenator seed = 0 @@ -406,6 +409,76 @@ def test_combine_with_classifier_trained_with_filedatastream(self): 
self.assertTrue(result_1.equals(result_2)) + def test_combined_models_support_predict_proba(self): + path = get_dataset('infert').as_filepath() + + data = FileDataStream.read_csv(path) + + transform = OneHotVectorizer(columns={'edu': 'education'}) + df = transform.fit_transform(data, as_binary_data_stream=True) + + feature_cols = ['parity', 'edu', 'age', 'induced', 'spontaneous', 'stratum', 'pooled.stratum'] + predictor = LogisticRegressionBinaryClassifier(feature=feature_cols, label='case') + predictor.fit(df) + + data = FileDataStream.read_csv(path) + df = transform.transform(data, as_binary_data_stream=True) + result_1 = predictor.predict_proba(df) + + data = FileDataStream.read_csv(path) + combined_pipeline = Pipeline.combine_models(transform, predictor) + result_2 = combined_pipeline.predict_proba(data) + + self.assertTrue(np.array_equal(result_1, result_2)) + + + def test_combined_models_support_predict_proba_with_more_than_2_classes(self): + path = get_dataset('infert').as_filepath() + data = FileDataStream.read_csv(path) + + featurization_pipeline = Pipeline([OneHotVectorizer(columns={'education': 'education'})]) + featurization_pipeline.fit(data) + featurized_data = featurization_pipeline.transform(data) + + feature_cols = ['education', 'age'] + training_pipeline = Pipeline([DatasetTransformer(featurization_pipeline.model), OneVsRestClassifier(LogisticRegressionBinaryClassifier(), feature=feature_cols, label='induced')]) + training_pipeline.fit(data, output_predictor_model=True) + + concat_pipeline = Pipeline([PrefixColumnConcatenator({'education': 'education.'})]) + concat_pipeline.fit(featurized_data) + + predictor_pipeline = Pipeline() + predictor_pipeline.load_model(training_pipeline.predictor_model) + + concat_and_predictor_pipeline = Pipeline.combine_models(concat_pipeline, predictor_pipeline) + + result = concat_and_predictor_pipeline.predict_proba(featurized_data) + self.assertEqual(result.shape[1], 3) + + + def test_combined_models_support_decision_function(self): + path = get_dataset('infert').as_filepath() + + data = FileDataStream.read_csv(path) + + transform = OneHotVectorizer(columns={'edu': 'education'}) + df = transform.fit_transform(data, as_binary_data_stream=True) + + feature_cols = ['parity', 'edu', 'age', 'induced', 'spontaneous', 'stratum', 'pooled.stratum'] + predictor = LogisticRegressionBinaryClassifier(feature=feature_cols, label='case') + predictor.fit(df) + + data = FileDataStream.read_csv(path) + df = transform.transform(data, as_binary_data_stream=True) + result_1 = predictor.decision_function(df) + + data = FileDataStream.read_csv(path) + combined_pipeline = Pipeline.combine_models(transform, predictor) + result_2 = combined_pipeline.decision_function(data) + + self.assertTrue(np.array_equal(result_1, result_2)) + + if __name__ == '__main__': unittest.main() diff --git a/src/python/nimbusml/tests/pipeline/test_pipeline_get_schema.py b/src/python/nimbusml/tests/pipeline/test_pipeline_get_schema.py new file mode 100644 index 00000000..63bb5310 --- /dev/null +++ b/src/python/nimbusml/tests/pipeline/test_pipeline_get_schema.py @@ -0,0 +1,67 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------------------------- +import unittest + +import numpy as np +import pandas as pd +from nimbusml import Pipeline, FileDataStream +from nimbusml.datasets import get_dataset +from nimbusml.feature_extraction.categorical import OneHotVectorizer +from nimbusml.linear_model import OnlineGradientDescentRegressor +from nimbusml.preprocessing.filter import RangeFilter + +train_data = {'c0': ['a', 'b', 'a', 'b'], + 'c1': [1, 2, 3, 4], + 'c2': [2, 3, 4, 5]} +train_df = pd.DataFrame(train_data).astype({'c1': np.float64, + 'c2': np.float64}) + + +class TestPipelineGetSchema(unittest.TestCase): + + def test_get_schema_returns_correct_value_for_single_valued_columns(self): + df = train_df.drop(['c0'], axis=1) + + pipeline = Pipeline([RangeFilter(min=0.0, max=4.5) << 'c2']) + pipeline.fit(df) + df = pipeline.transform(df) + + schema = pipeline.get_output_columns() + + self.assertTrue('c1' in schema) + self.assertTrue('c2' in schema) + + self.assertEqual(len(schema), 2) + + def test_get_schema_returns_correct_value_for_vector_valued_columns(self): + pipeline = Pipeline([OneHotVectorizer() << 'c0']) + pipeline.fit(train_df) + + schema = pipeline.get_output_columns() + + self.assertTrue('c0.a' in schema) + self.assertTrue('c0.b' in schema) + self.assertTrue('c1' in schema) + self.assertTrue('c2' in schema) + + self.assertEqual(len(schema), 4) + + def test_get_schema_does_not_work_when_predictor_is_part_of_model(self): + df = train_df.drop(['c0'], axis=1) + + pipeline = Pipeline([OnlineGradientDescentRegressor(label='c2')]) + pipeline.fit(df) + + try: + schema = pipeline.get_output_columns() + except Exception as e: + pass + else: + self.fail() + + +if __name__ == '__main__': + unittest.main() + diff --git a/src/python/nimbusml/tests/pipeline/test_pipeline_split_models.py b/src/python/nimbusml/tests/pipeline/test_pipeline_split_models.py new file mode 100644 index 00000000..bc1399bf --- /dev/null +++ b/src/python/nimbusml/tests/pipeline/test_pipeline_split_models.py @@ -0,0 +1,172 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +import os +import unittest + +import numpy as np +import pandas as pd +from nimbusml import Pipeline, FileDataStream +from nimbusml.datasets import get_dataset +from nimbusml.feature_extraction.categorical import OneHotVectorizer +from nimbusml.linear_model import LogisticRegressionBinaryClassifier, OnlineGradientDescentRegressor +from nimbusml.preprocessing.filter import RangeFilter +from nimbusml.preprocessing.schema import ColumnConcatenator, PrefixColumnConcatenator + +seed = 0 + +train_data = {'c0': ['a', 'b', 'a', 'b'], + 'c1': [1, 2, 3, 4], + 'c2': [2, 3, 4, 5]} +train_df = pd.DataFrame(train_data).astype({'c1': np.float64, + 'c2': np.float64}) + +test_data = {'c0': ['a', 'b', 'b'], + 'c1': [1.5, 2.3, 3.7], + 'c2': [2.2, 4.9, 2.7]} +test_df = pd.DataFrame(test_data).astype({'c1': np.float64, + 'c2': np.float64}) + + +class TestPipelineSplitModels(unittest.TestCase): + + def test_notvectorized_output_predictor_model(self): + """ + This test verifies that outputted predictor model from + combined (with featurizers) pipeline runs successfully + on featurized data with no vectors. 
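+        Because RangeFilter only filters rows and leaves the scalar columns unchanged,
+        the predictor model's input schema matches the transformed data directly
+        (contrast with the vectorized tests below, which need a concatenator).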
+ """ + df = train_df.drop(['c0'], axis=1) + + # Create and fit a RangeFilter transform using the training + # data and use it to transform the training data. + transform_pipeline = Pipeline([RangeFilter(min=0.0, max=4.5) << 'c2'], random_state=seed) + transform_pipeline.fit(df) + df1 = transform_pipeline.transform(df) + + # Create and fit a combined model and spit out predictor model + combined_pipeline = Pipeline([RangeFilter(min=0.0, max=4.5) << 'c2', + OnlineGradientDescentRegressor(label='c2')], + random_state=seed) + combined_pipeline.fit(df, output_predictor_model=True) + result_1 = combined_pipeline.predict(df) + + # Load predictor pipeline and score featurized data + predictor_pipeline = Pipeline() + predictor_pipeline.load_model(combined_pipeline.predictor_model) + result_2 = predictor_pipeline.predict(df1) + + self.assertEqual(result_1.loc[0, 'Score'], result_2.loc[0, 'Score']) + self.assertEqual(result_1.loc[1, 'Score'], result_2.loc[1, 'Score']) + + def test_vectorized_output_predictor_model(self): + """ + This test shows that outputted predictor model from + combined (with featurizers) pipeline fails to run + on featurized data with vectors. + """ + + # Create and fit a OneHotVectorizer transform using the + # training data and use it to transform the training data. + transform_pipeline = Pipeline([OneHotVectorizer() << 'c0'], random_state=seed) + transform_pipeline.fit(train_df) + df = transform_pipeline.transform(train_df) + + # Create and fit a combined model and spit out predictor model + combined_pipeline = Pipeline([OneHotVectorizer() << 'c0', + OnlineGradientDescentRegressor(label='c2')], + random_state=seed) + combined_pipeline.fit(train_df, output_predictor_model=True) + result_1 = combined_pipeline.predict(train_df) + + # Load predictor pipeline and score featurized data + predictor_pipeline = Pipeline() + predictor_pipeline.load_model(combined_pipeline.predictor_model) + + try: + # This does not work because the input schema doesnt + # match. Input schema looks for vector 'c0' with slots 'a,b' + # but featurized data has only columns 'c0.a' and 'c0.b' + predictor_pipeline.predict(df) + + except Exception as e: + pass + else: + self.fail() + + def test_vectorized_with_concat_output_predictor_model(self): + """ + This test shows how to prepend ColumnConcatenator transform + to outputted predictor model from combined (with featurizers) pipeline + so it successfully runs on featurized data with vectors. + """ + # Create and fit a OneHotVectorizer transform using the + # training data and use it to transform the training data. + transform_pipeline = Pipeline([OneHotVectorizer() << 'c0'], random_state=seed) + transform_pipeline.fit(train_df) + df = transform_pipeline.transform(train_df) + + # Create, fit and score with combined model. + # Output predictor model separately. 
+ combined_pipeline = Pipeline([OneHotVectorizer() << 'c0', + OnlineGradientDescentRegressor(label='c2')], + random_state=seed) + combined_pipeline.fit(train_df, output_predictor_model=True) + result_1 = combined_pipeline.predict(train_df) + + # train ColumnConcatenator on featurized data + concat_pipeline = Pipeline([ColumnConcatenator(columns={'c0': ['c0.a', 'c0.b']})]) + concat_pipeline.fit(df) + + # Load predictor pipeline + predictor_pipeline = Pipeline() + predictor_pipeline.load_model(combined_pipeline.predictor_model) + + # combine concat and predictor models and score + combined_predictor_pipeline = Pipeline.combine_models(concat_pipeline, + predictor_pipeline) + result_2 = combined_predictor_pipeline.predict(df) + + self.assertEqual(result_1.loc[0, 'Score'], result_2.loc[0, 'Score']) + self.assertEqual(result_1.loc[1, 'Score'], result_2.loc[1, 'Score']) + + def test_vectorized_with_prefixconcat_output_predictor_model(self): + """ + This test shows how to prepend PrefixColumnConcatenator transform + to outputted predictor model from combined (with featurizers) pipeline + so it successfully runs on featurized data with vectors. + """ + # Create and fit a OneHotVectorizer transform using the + # training data and use it to transform the training data. + transform_pipeline = Pipeline([OneHotVectorizer() << 'c0'], random_state=seed) + transform_pipeline.fit(train_df) + df = transform_pipeline.transform(train_df) + + # Create, fit and score with combined model. + # Output predictor model separately. + combined_pipeline = Pipeline([OneHotVectorizer() << 'c0', + OnlineGradientDescentRegressor(label='c2')], + random_state=seed) + combined_pipeline.fit(train_df, output_predictor_model=True) + result_1 = combined_pipeline.predict(train_df) + + # train PrefixColumnConcatenator on featurized data + concat_pipeline = Pipeline([PrefixColumnConcatenator(columns={'c0': 'c0.'})]) + concat_pipeline.fit(df) + + # Load predictor pipeline + predictor_pipeline = Pipeline() + predictor_pipeline.load_model(combined_pipeline.predictor_model) + + # combine concat and predictor models and score + combined_predictor_pipeline = Pipeline.combine_models(concat_pipeline, + predictor_pipeline) + result_2 = combined_predictor_pipeline.predict(df) + + self.assertEqual(result_1.loc[0, 'Score'], result_2.loc[0, 'Score']) + self.assertEqual(result_1.loc[1, 'Score'], result_2.loc[1, 'Score']) + +if __name__ == '__main__': + unittest.main() + diff --git a/src/python/nimbusml/tests/pipeline/test_pipeline_transform_method.py b/src/python/nimbusml/tests/pipeline/test_pipeline_transform_method.py new file mode 100644 index 00000000..e16a1e99 --- /dev/null +++ b/src/python/nimbusml/tests/pipeline/test_pipeline_transform_method.py @@ -0,0 +1,26 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License.
+# -------------------------------------------------------------------------------------------- +import unittest + +import pandas +from nimbusml import Pipeline, FileDataStream +from nimbusml.datasets import get_dataset +from nimbusml.feature_extraction.text import NGramFeaturizer + +path = get_dataset("wiki_detox_train").as_filepath() +data = FileDataStream.read_csv(path, sep='\t') +df = data.to_df().head() +X = df['SentimentText'] + +class TestPipelineTransformMethod(unittest.TestCase): + + def test_transform_only_pipeline_transform_method(self): + p = Pipeline([NGramFeaturizer(char_feature_extractor=None) << 'SentimentText']) + p.fit(X) + xf = p.transform(X) + assert 'SentimentText.==rude==' in xf.columns + +if __name__ == '__main__': + unittest.main() diff --git a/src/python/nimbusml/tests/pipeline/test_predict_proba_decision_function.py b/src/python/nimbusml/tests/pipeline/test_predict_proba_decision_function.py index f6cc1c70..138622b4 100644 --- a/src/python/nimbusml/tests/pipeline/test_predict_proba_decision_function.py +++ b/src/python/nimbusml/tests/pipeline/test_predict_proba_decision_function.py @@ -209,12 +209,12 @@ def test_pass_predict_proba_from_load_model(selfs): class TestDecisionFunction(unittest.TestCase): def test_pass_decision_function_binary(self): assert_almost_equal(decfun_sum(FactorizationMachineBinaryClassifier( - )), -32.618393, decimal=5, err_msg=invalid_decision_function_output) + )), -30.2316, decimal=4, err_msg=invalid_decision_function_output) def test_pass_decision_function_binary_with_pipeline(self): assert_almost_equal( decfun_sum(Pipeline([FactorizationMachineBinaryClassifier( - )])), -32.618393, decimal=5, + )])), -30.2316, decimal=4, err_msg=invalid_decision_function_output) def test_pass_decision_function_multiclass(self): diff --git a/src/python/nimbusml/tests/pipeline/test_uci_adult.py b/src/python/nimbusml/tests/pipeline/test_uci_adult.py index 990f0b72..00ae1728 100644 --- a/src/python/nimbusml/tests/pipeline/test_uci_adult.py +++ b/src/python/nimbusml/tests/pipeline/test_uci_adult.py @@ -6,14 +6,15 @@ import tempfile import unittest +import numpy as np from nimbusml import FileDataStream from nimbusml import Pipeline from nimbusml.datasets import get_dataset from nimbusml.ensemble import FastTreesBinaryClassifier from nimbusml.feature_extraction.categorical import OneHotVectorizer from nimbusml.linear_model import FastLinearBinaryClassifier -from nimbusml.utils import check_accuracy, get_X_y -from sklearn.utils.testing import assert_raises_regex, assert_equal, assert_true +from nimbusml.utils import get_X_y +from sklearn.utils.testing import assert_raises_regex, assert_equal, assert_true, assert_greater train_file = get_dataset("uciadult_train").as_filepath() test_file = get_dataset("uciadult_test").as_filepath() @@ -32,6 +33,15 @@ 'col=sex:TX:7 col=native-country-region:TX:8 header+' label_column = 'label' +def check_accuracy(test_file, label_column, predictions, threshold, sep=','): + (test, label) = get_X_y(test_file, label_column, sep=sep) + accuracy = np.mean(label[label_column].values == + predictions.ix[:, 'PredictedLabel'].values) + assert_greater( + accuracy, + threshold, + "accuracy should be greater than %s" % + threshold) class TestUciAdult(unittest.TestCase): @@ -173,15 +183,5 @@ def test_experiment_loadsavemodel(self): sum2, "model metrics don't match after loading model") - def test_parallel(self): - (train, label) = get_X_y(train_file, label_column, sep=',') - cat = OneHotVectorizer() << categorical_columns - ftree = 
FastTreesBinaryClassifier() - pipeline = Pipeline([cat, ftree]) - - result = pipeline.fit(train, label, parallel=8) - result2 = pipeline.fit(train, label, parallel=1) - assert_true(result == result2) - if __name__ == '__main__': unittest.main() diff --git a/src/python/nimbusml/tests/preprocessing/missing_values/test_data_with_missing.py b/src/python/nimbusml/tests/preprocessing/missing_values/test_data_with_missing.py index fb0bdc79..0dc85f6e 100644 --- a/src/python/nimbusml/tests/preprocessing/missing_values/test_data_with_missing.py +++ b/src/python/nimbusml/tests/preprocessing/missing_values/test_data_with_missing.py @@ -98,7 +98,8 @@ def test_input_conversion_to_float(self): assert_equal(result.loc[2, 'f5'], True) result.loc[2, 'f5'] = False result = ~result - self.assertTrue(result.all(axis=None)) + for val in result.all().tolist(): + self.assertTrue(val) # Check Filter xf = Filter() diff --git a/src/python/nimbusml/tests/preprocessing/normalization/test_lpscaler.py b/src/python/nimbusml/tests/preprocessing/normalization/test_lpscaler.py new file mode 100644 index 00000000..94f7d1bd --- /dev/null +++ b/src/python/nimbusml/tests/preprocessing/normalization/test_lpscaler.py @@ -0,0 +1,70 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- + +import unittest + +import numpy as np +import pandas as pd +from nimbusml import Pipeline +from nimbusml.preprocessing.normalization import LpScaler +from nimbusml.preprocessing.schema import ColumnConcatenator +from sklearn.utils.testing import assert_greater, assert_less + + +class TestLpScaler(unittest.TestCase): + + def test_lpscaler(self): + in_df = pd.DataFrame( + data=dict( + Sepal_Length=[2.5, 1, 2.1, 1.0], + Sepal_Width=[.75, .9, .8, .76], + Petal_Length=[0, 2.5, 2.6, 2.4], + Species=["setosa", "viginica", "setosa", 'versicolor'])) + + in_df.iloc[:, 0:3] = in_df.iloc[:, 0:3].astype(np.float32) + + src_cols = ['Sepal_Length', 'Sepal_Width', 'Petal_Length'] + + pipeline = Pipeline([ + ColumnConcatenator() << {'concat': src_cols}, + LpScaler() << {'norm': 'concat'} + ]) + out_df = pipeline.fit_transform(in_df) + + cols = ['concat.' + s for s in src_cols] + cols.extend(['norm.' + s for s in src_cols]) + sum = out_df[cols].sum().sum() + sum_range = (23.24, 23.25) + assert_greater(sum, sum_range[0], "sum should be greater than %s" % sum_range[0]) + assert_less(sum, sum_range[1], "sum should be less than %s" % sum_range[1]) + + def test_lpscaler_automatically_converts_to_single(self): + in_df = pd.DataFrame( + data=dict( + Sepal_Length=[2.5, 1, 2.1, 1.0], + Sepal_Width=[.75, .9, .8, .76], + Petal_Length=[0, 2.5, 2.6, 2.4], + Species=["setosa", "viginica", "setosa", 'versicolor'])) + + in_df.iloc[:, 0:3] = in_df.iloc[:, 0:3].astype(np.float64) + + src_cols = ['Sepal_Length', 'Sepal_Width', 'Petal_Length'] + + pipeline = Pipeline([ + ColumnConcatenator() << {'concat': src_cols}, + LpScaler() << {'norm': 'concat'} + ]) + out_df = pipeline.fit_transform(in_df) + + cols = ['concat.' + s for s in src_cols] + cols.extend(['norm.' 
+ s for s in src_cols]) + sum = out_df[cols].sum().sum() + sum_range = (23.24, 23.25) + assert_greater(sum, sum_range[0], "sum should be greater than %s" % sum_range[0]) + assert_less(sum, sum_range[1], "sum should be less than %s" % sum_range[1]) + + +if __name__ == '__main__': + unittest.main() diff --git a/src/python/nimbusml/tests/preprocessing/schema/test_prefixcolumnconcatenator.py b/src/python/nimbusml/tests/preprocessing/schema/test_prefixcolumnconcatenator.py new file mode 100644 index 00000000..75471be3 --- /dev/null +++ b/src/python/nimbusml/tests/preprocessing/schema/test_prefixcolumnconcatenator.py @@ -0,0 +1,36 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- + +import unittest + +from nimbusml import FileDataStream +from nimbusml.datasets import get_dataset +from nimbusml.preprocessing.schema import PrefixColumnConcatenator + + +class TestPrefixColumnConcatenator(unittest.TestCase): + + def test_prefix_columns_concatenator(self): + data = get_dataset('iris').as_df() + xf = PrefixColumnConcatenator(columns={'Spl': 'Sepal_', 'Pet': 'Petal_' }) + features = xf.fit_transform(data) + + assert features.shape == (150, 11) + assert set(features.columns) == { + 'Sepal_Length', + 'Sepal_Width', + 'Petal_Length', + 'Petal_Width', + 'Label', + 'Species', + 'Setosa', + 'Spl.Sepal_Length', + 'Spl.Sepal_Width', + 'Pet.Petal_Length', + 'Pet.Petal_Width'} + + +if __name__ == '__main__': + unittest.main() diff --git a/src/python/nimbusml/tests/preprocessing/test_datasettransformer.py b/src/python/nimbusml/tests/preprocessing/test_datasettransformer.py new file mode 100644 index 00000000..197119c6 --- /dev/null +++ b/src/python/nimbusml/tests/preprocessing/test_datasettransformer.py @@ -0,0 +1,184 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------------------------- +import os +import unittest + +import numpy as np +import pandas as pd +from nimbusml import Pipeline, FileDataStream +from nimbusml.datasets import get_dataset +from nimbusml.feature_extraction.categorical import OneHotVectorizer +from nimbusml.linear_model import LogisticRegressionBinaryClassifier, OnlineGradientDescentRegressor +from nimbusml.preprocessing import DatasetTransformer +from nimbusml.preprocessing.filter import RangeFilter +from nimbusml import FileDataStream + +seed = 0 + +train_data = {'c0': ['a', 'b', 'a', 'b'], + 'c1': [1, 2, 3, 4], + 'c2': [2, 3, 4, 5]} +train_df = pd.DataFrame(train_data).astype({'c1': np.float64, + 'c2': np.float64}) + +test_data = {'c0': ['a', 'b', 'b'], + 'c1': [1.5, 2.3, 3.7], + 'c2': [2.2, 4.9, 2.7]} +test_df = pd.DataFrame(test_data).astype({'c1': np.float64, + 'c2': np.float64}) + + +class TestDatasetTransformer(unittest.TestCase): + + def test_same_schema_with_dataframe_input(self): + train_df_updated = train_df.drop(['c0'], axis=1) + test_df_updated = test_df.drop(['c0'], axis=1) + + rf_max = 4.5 + + # Create reference pipeline + std_pipeline = Pipeline([ + RangeFilter(min=0.0, max=rf_max) << 'c2', + OnlineGradientDescentRegressor(label='c2', feature=['c1']) + ], random_state=seed) + + std_pipeline.fit(train_df_updated) + result_1 = std_pipeline.predict(test_df_updated) + + # Create combined pipeline + transform_pipeline = Pipeline([RangeFilter(min=0.0, max=rf_max) << 'c2']) + transform_pipeline.fit(train_df_updated) + + combined_pipeline = Pipeline([ + DatasetTransformer(transform_model=transform_pipeline.model), + OnlineGradientDescentRegressor(label='c2', feature=['c1']) + ], random_state=seed) + combined_pipeline.fit(train_df_updated) + + os.remove(transform_pipeline.model) + + result_2 = combined_pipeline.predict(test_df_updated) + + self.assertTrue(result_1.equals(result_2)) + + + def test_different_schema_with_dataframe_input(self): + # Create reference pipeline + std_pipeline = Pipeline([ + OneHotVectorizer() << 'c0', + OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1']) + ], random_state=seed) + + std_pipeline.fit(train_df) + result_1 = std_pipeline.predict(test_df) + + # Create combined pipeline + transform_pipeline = Pipeline([OneHotVectorizer() << 'c0'], random_state=seed) + transform_pipeline.fit(train_df) + + combined_pipeline = Pipeline([ + DatasetTransformer(transform_model=transform_pipeline.model), + OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1']) + ], random_state=seed) + combined_pipeline.fit(train_df) + + os.remove(transform_pipeline.model) + + result_2 = combined_pipeline.predict(test_df) + + self.assertTrue(result_1.equals(result_2)) + + + def test_different_schema_with_filedatastream_input(self): + train_filename = "train-data.csv" + train_df.to_csv(train_filename, index=False, header=True) + train_data_stream = FileDataStream.read_csv(train_filename, sep=',', header=True) + + test_filename = "test-data.csv" + test_df.to_csv(test_filename, index=False, header=True) + test_data_stream = FileDataStream.read_csv(test_filename, sep=',', header=True) + + # Create reference pipeline + std_pipeline = Pipeline([ + OneHotVectorizer() << 'c0', + OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1']) + ], random_state=seed) + + std_pipeline.fit(train_data_stream) + result_1 = std_pipeline.predict(test_data_stream) + + # Create combined pipeline + transform_pipeline = Pipeline([OneHotVectorizer() 
<< 'c0'], random_state=seed) + transform_pipeline.fit(train_data_stream) + + combined_pipeline = Pipeline([ + DatasetTransformer(transform_model=transform_pipeline.model), + OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1']) + ], random_state=seed) + combined_pipeline.fit(train_data_stream) + + os.remove(transform_pipeline.model) + + result_2 = combined_pipeline.predict(test_data_stream) + + self.assertTrue(result_1.equals(result_2)) + + os.remove(train_filename) + os.remove(test_filename) + + + def test_combining_two_dataset_transformers(self): + rf_max = 4.5 + + # Create reference pipeline + std_pipeline = Pipeline([ + RangeFilter(min=0.0, max=rf_max) << 'c2', + OneHotVectorizer() << 'c0', + OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1']) + ], random_state=seed) + + std_pipeline.fit(train_df) + result_1 = std_pipeline.predict(test_df) + + # Create combined pipeline + transform_pipeline1 = Pipeline([RangeFilter(min=0.0, max=rf_max) << 'c2']) + transform_pipeline1.fit(train_df) + + transform_pipeline2 = Pipeline([OneHotVectorizer() << 'c0'], random_state=seed) + transform_pipeline2.fit(train_df) + + combined_pipeline = Pipeline([ + DatasetTransformer(transform_model=transform_pipeline1.model), + DatasetTransformer(transform_model=transform_pipeline2.model), + OnlineGradientDescentRegressor(label='c2', feature=['c0', 'c1']) + ], random_state=seed) + combined_pipeline.fit(train_df) + + os.remove(transform_pipeline1.model) + os.remove(transform_pipeline2.model) + + result_2 = combined_pipeline.predict(test_df) + + self.assertTrue(result_1.equals(result_2)) + + + def test_get_fit_info(self): + transform_pipeline = Pipeline([RangeFilter(min=0.0, max=4.5) << 'c2']) + transform_pipeline.fit(train_df) + + combined_pipeline = Pipeline([ + DatasetTransformer(transform_model=transform_pipeline.model), + OnlineGradientDescentRegressor(label='c2', feature=['c1']) + ], random_state=seed) + combined_pipeline.fit(train_df) + + info = combined_pipeline.get_fit_info(train_df) + + self.assertTrue(info[0][1]['name'] == 'DatasetTransformer') + + +if __name__ == '__main__': + unittest.main() + diff --git a/src/python/nimbusml/tests/preprocessing/text/test_wordtokenizer.py b/src/python/nimbusml/tests/preprocessing/text/test_wordtokenizer.py new file mode 100644 index 00000000..a8c66016 --- /dev/null +++ b/src/python/nimbusml/tests/preprocessing/text/test_wordtokenizer.py @@ -0,0 +1,33 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------------------------- +import unittest + +import pandas +from nimbusml import Pipeline +from nimbusml.preprocessing.text import WordTokenizer + + +class TestWordTokenizer(unittest.TestCase): + + def test_wordtokenizer(self): + customer_reviews = pandas.DataFrame(data=dict(review=[ + "I really did not like the taste of it", + "It was surprisingly quite good!"])) + + tokenize = WordTokenizer(char_array_term_separators=[" ", "n"]) << 'review' + pipeline = Pipeline([tokenize]) + + tokenize.fit(customer_reviews) + y = tokenize.transform(customer_reviews) + + self.assertEqual(y.shape, (2, 9)) + + self.assertEqual(y.loc[0, 'review.3'], 'ot') + self.assertEqual(y.loc[1, 'review.3'], 'gly') + self.assertEqual(y.loc[1, 'review.6'], None) + + +if __name__ == '__main__': + unittest.main() diff --git a/src/python/nimbusml/tests/scikit/test_uci_adult_scikit.py b/src/python/nimbusml/tests/scikit/test_uci_adult_scikit.py index a08ce3b9..037987db 100644 --- a/src/python/nimbusml/tests/scikit/test_uci_adult_scikit.py +++ b/src/python/nimbusml/tests/scikit/test_uci_adult_scikit.py @@ -16,14 +16,14 @@ from nimbusml.linear_model import FastLinearBinaryClassifier from nimbusml.linear_model import LogisticRegressionClassifier from nimbusml.preprocessing.normalization import MeanVarianceScaler -from nimbusml.utils import check_accuracy_scikit, get_X_y +from nimbusml.utils import get_X_y from sklearn.base import clone from sklearn.datasets import load_iris from sklearn.decomposition import PCA from sklearn.model_selection import GridSearchCV from sklearn.pipeline import Pipeline, FeatureUnion from sklearn.preprocessing import OneHotEncoder -from sklearn.utils.testing import assert_equal +from sklearn.utils.testing import assert_equal, assert_greater try: from pandas.testing import assert_frame_equal @@ -45,6 +45,19 @@ 'native-country-region'] selected_features = ['age', 'education-num'] +def check_accuracy_scikit( + test_file, + label_column, + predictions, + threshold, + sep=','): + (test, label) = get_X_y(test_file, label_column, sep=sep) + accuracy = np.mean(label[label_column].values == predictions.values) + assert_greater( + accuracy, + threshold, + "accuracy should be greater than %s" % + threshold) class TestUciAdultScikit(unittest.TestCase): diff --git a/src/python/nimbusml/tests/test_csr_matrix_output.py b/src/python/nimbusml/tests/test_csr_matrix_output.py new file mode 100644 index 00000000..f4909906 --- /dev/null +++ b/src/python/nimbusml/tests/test_csr_matrix_output.py @@ -0,0 +1,184 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------------------------- + +import unittest + +import numpy as np +import pandas as pd +from nimbusml import Pipeline +from nimbusml.feature_extraction.categorical import OneHotVectorizer +from nimbusml.preprocessing.schema import ColumnConcatenator, ColumnDropper +from scipy.sparse import csr_matrix + + +class TestCsrMatrixOutput(unittest.TestCase): + + def test_column_dropped_output_produces_expected_result(self): + train_data = {'c1': [1, 0, 0, 4], + 'c2': [2, 3, 0, 5], + 'c3': [3, 4, 5, 6]} + train_df = pd.DataFrame(train_data).astype(np.float32) + + xf = ColumnDropper(columns=['c3']) + xf.fit(train_df) + result = xf.transform(train_df, as_csr=True) + + self.assertEqual(result.nnz, 5) + self.assertTrue(type(result) == csr_matrix) + result = pd.DataFrame(result.todense()) + + train_data = {0: [1, 0, 0, 4], + 1: [2, 3, 0, 5]} + expected_result = pd.DataFrame(train_data).astype(np.float32) + + self.assertTrue(result.equals(expected_result)) + + def test_fit_transform_produces_expected_result(self): + train_data = {'c1': [1, 0, 0, 4], + 'c2': [2, 3, 0, 5], + 'c3': [3, 4, 5, 6]} + train_df = pd.DataFrame(train_data).astype(np.float32) + + xf = ColumnDropper(columns=['c3']) + result = xf.fit_transform(train_df, as_csr=True) + + self.assertEqual(result.nnz, 5) + self.assertTrue(type(result) == csr_matrix) + result = pd.DataFrame(result.todense()) + + train_data = {0: [1, 0, 0, 4], + 1: [2, 3, 0, 5]} + expected_result = pd.DataFrame(train_data).astype(np.float32) + + self.assertTrue(result.equals(expected_result)) + + def test_vector_column_combined_with_single_value_columns(self): + train_data = {'c1': [1, 0, 0, 4], + 'c2': [2, 3, 0, 5], + 'c3': [3, 4, 5, 6]} + train_df = pd.DataFrame(train_data).astype(np.float32) + + xf = ColumnConcatenator(columns={'features': ['c1', 'c2', 'c3']}) + xf.fit(train_df) + result = xf.transform(train_df, as_csr=True) + + self.assertEqual(result.nnz, 18) + self.assertTrue(type(result) == csr_matrix) + result = pd.DataFrame(result.todense()) + + train_data = {0: [1, 0, 0, 4], + 1: [2, 3, 0, 5], + 2: [3, 4, 5, 6], + 3: [1, 0, 0, 4], + 4: [2, 3, 0, 5], + 5: [3, 4, 5, 6]} + expected_result = pd.DataFrame(train_data).astype(np.float32) + self.assertTrue(result.equals(expected_result)) + + def test_sparse_vector_column(self): + train_data = {'c0': ['a', 'b', 'a', 'b'], + 'c1': ['c', 'd', 'd', 'c']} + train_df = pd.DataFrame(train_data) + + xf = OneHotVectorizer(columns={'c0':'c0', 'c1':'c1'}) + xf.fit(train_df) + expected_result = xf.transform(train_df) + self.assertTrue(type(expected_result) == pd.DataFrame) + + result = xf.transform(train_df, as_csr=True) + self.assertEqual(result.nnz, 8) + self.assertTrue(type(result) == csr_matrix) + + result = pd.DataFrame(result.todense(), columns=['c0.a', 'c0.b', 'c1.c', 'c1.d']) + + self.assertTrue(result.equals(expected_result)) + + def test_sparse_vector_column_combined_with_single_value_columns(self): + train_data = {'c0': [0, 1, 0, 3], + 'c1': ['a', 'b', 'a', 'b']} + train_df = pd.DataFrame(train_data).astype({'c0': np.float32}) + + xf = OneHotVectorizer(columns={'c1':'c1'}) + xf.fit(train_df) + expected_result = xf.transform(train_df) + self.assertTrue(type(expected_result) == pd.DataFrame) + + result = xf.transform(train_df, as_csr=True) + self.assertEqual(result.nnz, 6) + self.assertTrue(type(result) == csr_matrix) + + result = pd.DataFrame(result.todense(), columns=['c0', 'c1.a', 'c1.b']) + + self.assertTrue(result.equals(expected_result)) + + def 
test_types_convertable_to_r4_get_output_as_r4(self): + train_data = {'c1': [1, 0, 0, 4], + 'c2': [2, 3, 0, 5], + 'c3': [3, 4, 5, 6], + 'c4': [4, 5, 6, 7]} + train_df = pd.DataFrame(train_data).astype({'c1': np.ubyte, + 'c2': np.short, + 'c3': np.float32}) + + xf = ColumnDropper(columns=['c4']) + xf.fit(train_df) + result = xf.transform(train_df, as_csr=True) + + self.assertTrue(type(result) == csr_matrix) + self.assertEqual(result.nnz, 9) + result = pd.DataFrame(result.todense()) + + train_data = {0: [1, 0, 0, 4], + 1: [2, 3, 0, 5], + 2: [3, 4, 5, 6]} + expected_result = pd.DataFrame(train_data).astype(np.float32) + + self.assertTrue(result.equals(expected_result)) + + self.assertEqual(result.dtypes[0], np.float32) + self.assertEqual(result.dtypes[1], np.float32) + self.assertEqual(result.dtypes[2], np.float32) + + def test_types_convertable_to_r8_get_output_as_r8(self): + large_int64 = 372036854775807 + train_data = {'c1': [1, 0, 0, 4], + 'c2': [2, 3, 0, 5], + 'c3': [3, 0, 5, 0], + 'c4': [0, 5, 6, 7], + 'c5': [0, 5, 0, large_int64], + 'c6': [5, 6, 7, 8]} + train_df = pd.DataFrame(train_data).astype({'c1': np.ubyte, + 'c2': np.short, + 'c3': np.float32, + 'c4': np.float64, + 'c5': np.int64}) + + xf = ColumnDropper(columns=['c6']) + xf.fit(train_df) + result = xf.transform(train_df, as_csr=True) + + self.assertTrue(type(result) == csr_matrix) + self.assertEqual(result.nnz, 12) + result = pd.DataFrame(result.todense()) + + train_data = {0: [1, 0, 0, 4], + 1: [2, 3, 0, 5], + 2: [3, 0, 5, 0], + 3: [0, 5, 6, 7], + 4: [0, 5, 0, large_int64]} + expected_result = pd.DataFrame(train_data).astype(np.float64) + + self.assertTrue(result.equals(expected_result)) + + self.assertEqual(result.dtypes[0], np.float64) + self.assertEqual(result.dtypes[1], np.float64) + self.assertEqual(result.dtypes[2], np.float64) + self.assertEqual(result.dtypes[3], np.float64) + + self.assertEqual(result.loc[3, 4], large_int64) + + +if __name__ == '__main__': + unittest.main() diff --git a/src/python/nimbusml/tests/test_entrypoints.py b/src/python/nimbusml/tests/test_entrypoints.py index 257d5bef..c4e53546 100644 --- a/src/python/nimbusml/tests/test_entrypoints.py +++ b/src/python/nimbusml/tests/test_entrypoints.py @@ -11,7 +11,7 @@ from nimbusml.internal.entrypoints.transforms_twoheterogeneousmodelcombiner \ import \ transforms_twoheterogeneousmodelcombiner -from nimbusml.internal.utils.entrypoints import EntryPoint, Graph +from nimbusml.internal.utils.entrypoints import EntryPoint, Graph, DataOutputFormat # from imp import reload @@ -116,7 +116,7 @@ def test_logistic_regression_graph(self): graph = Graph( dict( input_data=""), dict( - output_model=""), False, *all_nodes) + output_model=""), DataOutputFormat.DF, *all_nodes) # print(graph) graph.run(X=None, dryrun=True) diff --git a/src/python/nimbusml/tests/test_errors.py b/src/python/nimbusml/tests/test_errors.py index df14baf2..744ac275 100644 --- a/src/python/nimbusml/tests/test_errors.py +++ b/src/python/nimbusml/tests/test_errors.py @@ -41,7 +41,7 @@ def test_error_wrong_column_name(self): raise Exception( 'boost.python did not replace the exception.\n{0}'.format( e)) - assert "Check the log for error messages" in str(e) + assert "Error: *** System.ArgumentOutOfRangeException: 'Could not find input column" in str(e) @unittest.skip("System.NullReferenceException") def test_char_tokenizer(self): diff --git a/src/python/nimbusml/tests/test_syntax_onehotvectorizer.py b/src/python/nimbusml/tests/test_syntax_onehotvectorizer.py index 556271af..c31879c1 100644 --- 
a/src/python/nimbusml/tests/test_syntax_onehotvectorizer.py +++ b/src/python/nimbusml/tests/test_syntax_onehotvectorizer.py @@ -79,7 +79,7 @@ def test_syntax5_failing(self): vec.fit_transform(X, verbose=2) assert False except RuntimeError as e: - assert "Returned code is -1. Check the log for error messages.." \ + assert "Error: *** System.ArgumentOutOfRangeException: 'Could not find input column" \ in str(e) vec = OneHotVectorizer() << {'edu1': ['education']} res = vec.fit_transform(X) @@ -147,3 +147,6 @@ def test_syntax9_multiple_inputs(self): 'out1': ['education1', 'education2']} output4 = ng4.fit_transform(X) assert output4.shape == (5, 13) + +if __name__ == '__main__': + unittest.main() diff --git a/src/python/nimbusml/tests/test_variable_column.py b/src/python/nimbusml/tests/test_variable_column.py new file mode 100644 index 00000000..6c1fc8bd --- /dev/null +++ b/src/python/nimbusml/tests/test_variable_column.py @@ -0,0 +1,196 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- + +import unittest + +import numpy as np +import pandas as pd +from nimbusml import Pipeline +from nimbusml.internal.entrypoints.transforms_variablecolumn import transforms_variablecolumn +from nimbusml.internal.utils.entrypoints import Graph, DataOutputFormat + + +class TestVariableColumn(unittest.TestCase): + + def to_variable_column(self, input, features=None, length_column_name=None): + node = transforms_variablecolumn(data='$data', + output_data='$output_data', + features=features, + length_column_name=length_column_name) + + graph_nodes = [node] + graph = Graph(dict(data=''), + dict(output_data=''), + DataOutputFormat.DF, + *(graph_nodes)) + + (out_model, out_data, out_metrics, _) = graph.run(verbose=True, X=input) + return out_data + + def test_nonvariable_columns_are_returned_unchanged(self): + train_data = {'c1': [2, 3, 4, 5], + 'c2': [3, 4, 5, 6], + 'c3': [4, 5, 6, 7], + 'c4': [0, 1, 2, 1]} + train_df = pd.DataFrame(train_data).astype({'c1': np.float64, + 'c2': np.float64}) + + result = self.to_variable_column(train_df, ['c1', 'c2']) + + self.assertTrue(result.loc[:, 'c3'].equals(train_df.loc[:, 'c3'])) + self.assertTrue(result.loc[:, 'c4'].equals(train_df.loc[:, 'c4'])) + + def test_variable_columns_of_same_length_do_not_add_nans(self): + train_data = {'c1': [2, 3, 4, 5], + 'c2': [3, 4, 5, 6], + 'c3': [4, 5, 6, 7]} + train_df = pd.DataFrame(train_data).astype({'c1': np.float64, + 'c2': np.float64}) + + result = self.to_variable_column(train_df, ['c1', 'c2']) + + self.assertTrue(result.loc[:, 'c1.0'].equals(train_df.loc[:, 'c1'])) + self.assertTrue(result.loc[:, 'c1.1'].equals(train_df.loc[:, 'c2'])) + + def test_variable_columns_with_different_lengths_return_nans(self): + train_data = {'c1': [2, 3, 4, 5], + 'c2': [3, 4, 5, 6], + 'c3': [4, 5, 6, 7], + 'c4': [0, 1, 2, 1]} + train_df = pd.DataFrame(train_data).astype({'c1': np.float64, + 'c2': np.float64}) + + result = self.to_variable_column(train_df, ['c1', 'c2'], 'c4') + + expectedC1 = pd.Series([np.nan, 3, 4, 5]).astype(np.float64) + expectedC2 = pd.Series([np.nan, np.nan, 5, np.nan]).astype(np.float64) + + self.assertTrue(result.loc[:, 'c1.0'].equals(expectedC1)) + self.assertTrue(result.loc[:, 'c1.1'].equals(expectedC2)) + + def 
test_variable_columns_with_different_lengths_return_nans_when_no_other_columns_are_present(self): + train_data = {'c1': [2, 3, 4, 5], + 'c2': [3, 4, 5, 6], + 'c3': [0, 1, 2, 1]} + train_df = pd.DataFrame(train_data).astype({'c1': np.float64, + 'c2': np.float64}) + + result = self.to_variable_column(train_df, ['c1', 'c2'], 'c3') + + expectedC1 = pd.Series([np.nan, 3, 4, 5]).astype(np.float64) + expectedC2 = pd.Series([np.nan, np.nan, 5, np.nan]).astype(np.float64) + + self.assertEqual(len(result.columns), 2) + self.assertTrue(result.loc[:, 'c1.0'].equals(expectedC1)) + self.assertTrue(result.loc[:, 'c1.1'].equals(expectedC2)) + + def test_variable_columns_are_converted_to_float32(self): + """ + There are no integer nans so values that can be + converted to float32 are converted to support nans. + There is nullable integer type support in pandas but + it is currently marked as experimental and the docs + state that the api may change in the future. See + https://pandas.pydata.org/pandas-docs/stable/user_guide/integer_na.html + """ + types = [np.int8, np.int16, np.uint8, np.uint16, np.float32] + + for type in types: + train_data = {'c1': [2, 3, 4, 5], 'c2': [3, 4, 5, 6]} + train_df = pd.DataFrame(train_data).astype(type); + + result = self.to_variable_column(train_df, ['c1', 'c2']) + + self.assertEqual(str(result.dtypes[0]), 'float32') + self.assertEqual(str(result.dtypes[1]), 'float32') + + def test_variable_columns_are_converted_to_float64(self): + """ + There are no integer nans so values that can be + converted to float64 are converted to support nans. + There is nullable integer type support in pandas but + it is currently marked as experimental and the docs + state that the api may change in the future. See + https://pandas.pydata.org/pandas-docs/stable/user_guide/integer_na.html + """ + types = [np.int32, np.uint32, np.int64, np.uint64, np.float64] + + for type in types: + train_data = {'c1': [2, 3, 4, 5], 'c2': [3, 4, 5, 6]} + train_df = pd.DataFrame(train_data).astype(type); + + result = self.to_variable_column(train_df, ['c1', 'c2']) + + self.assertEqual(str(result.dtypes[0]), 'float64') + self.assertEqual(str(result.dtypes[1]), 'float64') + + def test_column_with_all_vector_lengths_of_zero_returns_one_column_filled_with_nans(self): + train_data = {'c1': [2, 3, 4, 5], + 'c2': [3, 4, 5, 6], + 'c3': [0, 0, 0, 0]} + train_df = pd.DataFrame(train_data).astype({'c1': np.float64, + 'c2': np.float64}) + + result = self.to_variable_column(train_df, ['c1', 'c2'], 'c3') + + expectedC1 = pd.Series([np.nan, np.nan, np.nan, np.nan]).astype(np.float64) + + self.assertEqual(len(result.columns), 1) + self.assertTrue(result.loc[:, 'c1.0'].equals(expectedC1)) + + def test_variable_column_conversion_leaves_nans_untouched_if_they_already_exist_in_the_input(self): + train_data = {'c1': [2, 3, np.nan, 5], + 'c2': [3, np.nan, 5, 6], + 'c3': [2, 2, 2, 1]} + train_df = pd.DataFrame(train_data).astype({'c1': np.float64, + 'c2': np.float64}) + + result = self.to_variable_column(train_df, ['c1', 'c2'], 'c3') + + expectedC1 = pd.Series([2, 3, np.nan, 5]).astype(np.float64) + expectedC2 = pd.Series([3, np.nan, 5, np.nan]).astype(np.float64) + + self.assertEqual(len(result.columns), 2) + self.assertTrue(result.loc[:, 'c1.0'].equals(expectedC1)) + self.assertTrue(result.loc[:, 'c1.1'].equals(expectedC2)) + + def test_column_names_are_zero_padded(self): + numColsToVerify = [1, 2, 10, 11, 100, 101] + + for numCols in numColsToVerify: + inputColNames = ['c' + str(i) for i in range(numCols)] + train_data = {k: 
[2,3,4,5] for k in inputColNames} + train_df = pd.DataFrame(train_data).astype(np.float32); + + result = self.to_variable_column(train_df, inputColNames) + + maxDigits = len(inputColNames[-1]) - 1 + expectedColNames = ['c0.' + str(i).zfill(maxDigits) for i in range(numCols)] + + self.assertTrue(all(result.columns == expectedColNames)) + + def test_variable_column_of_type_string(self): + train_data = {'c1': ['a', 'b', '', 'd'], + 'c2': ['e', 'f', 'g', 'h'], + 'c3': [0, 1, 2, 1]} + train_df = pd.DataFrame(train_data) + + result = self.to_variable_column(train_df, ['c1', 'c2'], 'c3') + + self.assertEqual(result.loc[0, 'c1.0'], None) + self.assertEqual(result.loc[1, 'c1.0'], 'b') + self.assertEqual(result.loc[2, 'c1.0'], '') + self.assertEqual(result.loc[3, 'c1.0'], 'd') + + self.assertNotEqual(result.loc[2, 'c1.0'], None) + + self.assertEqual(result.loc[0, 'c1.1'], None) + self.assertEqual(result.loc[1, 'c1.1'], None) + self.assertEqual(result.loc[2, 'c1.1'], 'g') + self.assertEqual(result.loc[3, 'c1.1'], None) + + +if __name__ == '__main__': + unittest.main() diff --git a/src/python/nimbusml/utils/__init__.py b/src/python/nimbusml/utils/__init__.py index 3243711a..1c85d629 100644 --- a/src/python/nimbusml/utils/__init__.py +++ b/src/python/nimbusml/utils/__init__.py @@ -1,5 +1,4 @@ -from .utils import get_X_y, evaluate_binary_classifier, check_accuracy, \ - check_accuracy_scikit, load_img, ColumnSelector +from .utils import get_X_y, evaluate_binary_classifier, load_img, ColumnSelector try: from inspect import signature @@ -9,8 +8,6 @@ __all__ = [ 'get_X_y', 'evaluate_binary_classifier', - 'check_accuracy', - 'check_accuracy_scikit', 'load_img', 'ColumnSelector', 'signature' diff --git a/src/python/nimbusml/utils/utils.py b/src/python/nimbusml/utils/utils.py index b9b33075..5e2e9fe6 100644 --- a/src/python/nimbusml/utils/utils.py +++ b/src/python/nimbusml/utils/utils.py @@ -12,12 +12,9 @@ import pandas as pd from sklearn.base import BaseEstimator, TransformerMixin from sklearn.metrics import roc_auc_score -from sklearn.utils.testing import assert_greater - - -# select columns from DataFrame insize a pipeline +# select columns from DataFrame inside a pipeline class ColumnSelector(BaseEstimator, TransformerMixin): def __init__(self, columns, ravel=False): self.columns = columns @@ -106,30 +103,4 @@ def evaluate_binary_classifier(target, predicted, probabilities=None): auc_score = None if probabilities is not None: auc_score = roc_auc_score(target, probabilities) - return (accuracy, auc_score) - - -def check_accuracy(test_file, label_column, predictions, threshold, sep=','): - (test, label) = get_X_y(test_file, label_column, sep=sep) - accuracy = np.mean(label[label_column].values == - predictions.ix[:, 'PredictedLabel'].values) - assert_greater( - accuracy, - threshold, - "accuracy should be greater than %s" % - threshold) - - -def check_accuracy_scikit( - test_file, - label_column, - predictions, - threshold, - sep=','): - (test, label) = get_X_y(test_file, label_column, sep=sep) - accuracy = np.mean(label[label_column].values == predictions.values) - assert_greater( - accuracy, - threshold, - "accuracy should be greater than %s" % - threshold) + return (accuracy, auc_score) \ No newline at end of file diff --git a/src/python/setup.py b/src/python/setup.py index 251adae1..fc350275 100644 --- a/src/python/setup.py +++ b/src/python/setup.py @@ -45,7 +45,7 @@ # Versions should comply with PEP440. 
For a discussion on # single-sourcing the version across setup.py and the project code, see # https://packaging.python.org/en/latest/single_source_version.html - version='1.3.1', + version='1.5.0', description='NimbusML', long_description=long_description, @@ -115,7 +115,7 @@ 'nose>=1.3', 'pytest>=4.4.0', 'graphviz', 'imageio', ], - 'dprep': ['azureml-dataprep'], + 'dprep': ['azureml-dataprep>=1.1.12'], 'utils': ['graphviz', 'imageio'], }, diff --git a/src/python/setup.py.in b/src/python/setup.py.in index 3ddce586..e65db7d8 100644 --- a/src/python/setup.py.in +++ b/src/python/setup.py.in @@ -115,7 +115,7 @@ setup( 'nose>=1.3', 'pytest>=4.4.0', 'graphviz', 'imageio', ], - 'dprep': ['azureml-dataprep'], + 'dprep': ['azureml-dataprep>=1.1.12'], 'utils': ['graphviz', 'imageio'], }, diff --git a/src/python/tests/test_estimator_checks.py b/src/python/tests/test_estimator_checks.py index 9cbc09d0..7879896c 100644 --- a/src/python/tests/test_estimator_checks.py +++ b/src/python/tests/test_estimator_checks.py @@ -8,6 +8,7 @@ import json import os +from nimbusml.decomposition import FactorizationMachineBinaryClassifier from nimbusml.ensemble import EnsembleClassifier from nimbusml.ensemble import EnsembleRegressor from nimbusml.ensemble import LightGbmBinaryClassifier @@ -16,6 +17,7 @@ from nimbusml.ensemble import LightGbmRegressor from nimbusml.feature_extraction.text import NGramFeaturizer from nimbusml.internal.entrypoints._ngramextractor_ngram import n_gram +from nimbusml.linear_model import SgdBinaryClassifier from nimbusml.preprocessing import TensorFlowScorer from nimbusml.preprocessing.filter import SkipFilter, TakeFilter from nimbusml.timeseries import (IidSpikeDetector, IidChangePointDetector, @@ -70,7 +72,9 @@ # bug, low tolerance 'FastLinearRegressor': 'check_supervised_y_2d, ' 'check_regressor_data_not_an_array, ' - 'check_regressors_int', + 'check_regressors_int, ' + # todo: investigate + 'check_regressors_train', # bug decision function shape should be 1 # dimensional arrays, tolerance 'FastLinearClassifier': 'check_classifiers_train', @@ -93,6 +97,8 @@ 'check_estimators_dtypes', # tolerance 'LogisticRegressionClassifier': 'check_classifiers_train', + # todo: investigate + 'OnlineGradientDescentRegressor': 'check_regressors_train', # bug decision function shape, prediction bug 'NaiveBayesClassifier': 'check_classifiers_train, check_classifiers_classes', @@ -156,8 +162,7 @@ 'PixelExtractor, Loader, Resizer, \ GlobalContrastRowScaler, PcaTransformer, ' 'ColumnConcatenator, Sentiment, CharTokenizer, LightLda, ' - 'NGramFeaturizer, \ - WordEmbedding', + 'NGramFeaturizer, WordEmbedding, LpScaler, WordTokenizer', 'check_transformer_data_not_an_array, check_pipeline_consistency, ' 'check_fit2d_1feature, check_estimators_fit_returns_self,\ check_fit2d_1sample, ' @@ -189,6 +194,7 @@ INSTANCES = { 'EnsembleClassifier': EnsembleClassifier(num_models=3), 'EnsembleRegressor': EnsembleRegressor(num_models=3), + 'FactorizationMachineBinaryClassifier': FactorizationMachineBinaryClassifier(shuffle=False), 'LightGbmBinaryClassifier': LightGbmBinaryClassifier( minimum_example_count_per_group=1, minimum_example_count_per_leaf=1), 'LightGbmClassifier': LightGbmClassifier( @@ -198,6 +204,7 @@ 'LightGbmRanker': LightGbmRanker( minimum_example_count_per_group=1, minimum_example_count_per_leaf=1), 'NGramFeaturizer': NGramFeaturizer(word_feature_extractor=n_gram()), + 'SgdBinaryClassifier': SgdBinaryClassifier(number_of_threads=1, shuffle=False), 'SkipFilter': SkipFilter(count=5), 'TakeFilter': 
diff --git a/src/python/tools/manifest_diff.json b/src/python/tools/manifest_diff.json
index acff52df..0a66d5ff 100644
--- a/src/python/tools/manifest_diff.json
+++ b/src/python/tools/manifest_diff.json
@@ -241,7 +241,14 @@
     "Module": "linear_model",
     "Type": "Classifier",
     "Predict_Proba" : true,
-    "Decision_Function" : true
+    "Decision_Function" : true,
+    "Inputs": [
+      {
+        "Name": "Lambda",
+        "NewName": "l2_regularization",
+        "Desc": "L2 regularization weight. It also controls the learning rate, with the learning rate being inversely proportional to it."
+      }
+    ]
   },
   {
     "Name": "Trainers.EnsembleClassification",
@@ -317,6 +324,12 @@
       }
     ]
   },
+  {
+    "Name": "Models.DatasetTransformer",
+    "NewName": "DatasetTransformer",
+    "Module": "preprocessing",
+    "Type": "Transform"
+  },
   {
     "Name": "Trainers.FieldAwareFactorizationMachineBinaryClassifier",
     "NewName": "FactorizationMachineBinaryClassifier",
@@ -474,6 +487,12 @@
     "Module": "preprocessing.normalization",
     "Type": "Transform"
   },
+  {
+    "Name": "Transforms.LpNormalizer",
+    "NewName": "LpScaler",
+    "Module": "preprocessing.normalization",
+    "Type": "Transform"
+  },
   {
     "Name": "Transforms.MissingValuesRowDropper",
     "NewName": "Filter",
@@ -714,6 +733,12 @@
     "Module": "preprocessing.text",
     "Type": "Transform"
   },
+  {
+    "Name": "Transforms.WordTokenizer",
+    "NewName": "WordTokenizer",
+    "Module": "preprocessing.text",
+    "Type": "Transform"
+  },
   {
     "Name": "Transforms.LightLda",
     "NewName": "LightLda",
diff --git a/version.txt b/version.txt
index 3a3cd8cc..bc80560f 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-1.3.1
+1.5.0