diff --git a/.vsts-ci.yml b/.vsts-ci.yml index b217ab07..2379d4cd 100644 --- a/.vsts-ci.yml +++ b/.vsts-ci.yml @@ -12,8 +12,6 @@ phases: _configuration: RlsWinPy3.6 Py35: _configuration: RlsWinPy3.5 - Py27: - _configuration: RlsWinPy2.7 buildQueue: name: Hosted VS2017 diff --git a/build.cmd b/build.cmd index 36ba13ef..af4b6d75 100644 --- a/build.cmd +++ b/build.cmd @@ -350,6 +350,7 @@ copy "%BuildOutputDir%%Configuration%\pybridge.pyd" "%__currentScriptDir%src\py if %PythonVersion% == 2.7 ( copy "%BuildOutputDir%%Configuration%\Platform\win-x64\publish\*.dll" "%__currentScriptDir%src\python\nimbusml\internal\libs\" + xcopy /S /E /I "%BuildOutputDir%%Configuration%\Platform\win-x64\publish\Data" "%__currentScriptDir%src\python\nimbusml\internal\libs\Data" :: remove dataprep dlls as its not supported in python 2.7 del "%__currentScriptDir%src\python\nimbusml\internal\libs\Microsoft.DPrep.*" del "%__currentScriptDir%src\python\nimbusml\internal\libs\Microsoft.Data.*" @@ -360,6 +361,7 @@ if %PythonVersion% == 2.7 ( del "%__currentScriptDir%src\python\nimbusml\internal\libs\Microsoft.Workbench.Messaging.SDK.dll" ) else ( for /F "tokens=*" %%A in (build/libs_win.txt) do copy "%BuildOutputDir%%Configuration%\Platform\win-x64\publish\%%A" "%__currentScriptDir%src\python\nimbusml\internal\libs\" + xcopy /S /E /I "%BuildOutputDir%%Configuration%\Platform\win-x64\publish\Data" "%__currentScriptDir%src\python\nimbusml\internal\libs\Data" ) if "%DebugBuild%" == "True" ( @@ -394,6 +396,7 @@ if "%InstallPythonPackages%" == "True" ( call "%PythonExe%" -m pip install --upgrade pyzmq ) else ( call "%PythonExe%" -m pip install --upgrade "azureml-dataprep>=1.1.33" + call "%PythonExe%" -m pip install --upgrade onnxruntime ) call "%PythonExe%" -m pip install --upgrade "%__currentScriptDir%target\%WheelFile%" diff --git a/build.sh b/build.sh index 2e6b7d7f..2038c95d 100755 --- a/build.sh +++ b/build.sh @@ -213,6 +213,7 @@ then cp "${BuildOutputDir}/${__configuration}/Platform/${PublishDir}"/publish/System.Native.a "${__currentScriptDir}/src/python/nimbusml/internal/libs/" cp "${BuildOutputDir}/${__configuration}/Platform/${PublishDir}"/publish/createdump "${__currentScriptDir}/src/python/nimbusml/internal/libs/" || : cp "${BuildOutputDir}/${__configuration}/Platform/${PublishDir}"/publish/sosdocsunix.txt "${__currentScriptDir}/src/python/nimbusml/internal/libs/" + cp -r "${BuildOutputDir}/${__configuration}/Platform/${PublishDir}"/publish/Data "${__currentScriptDir}/src/python/nimbusml/internal/libs/." ext=*.so if [ "$(uname -s)" = "Darwin" ] then @@ -241,6 +242,7 @@ then cat build/${libs_txt} | while read i; do cp "${BuildOutputDir}/${__configuration}/Platform/${PublishDir}"/publish/$i "${__currentScriptDir}/src/python/nimbusml/internal/libs/" done + cp -r "${BuildOutputDir}/${__configuration}/Platform/${PublishDir}"/publish/Data "${__currentScriptDir}/src/python/nimbusml/internal/libs/." 
fi if [[ $__configuration = Dbg* ]] @@ -291,6 +293,7 @@ then fi "${PythonExe}" -m pip install --upgrade "azureml-dataprep>=1.1.33" + "${PythonExe}" -m pip install --upgrade onnxruntime fi "${PythonExe}" -m pip install --upgrade "${Wheel}" "${PythonExe}" -m pip install "scikit-learn==0.19.2" diff --git a/build/libs_linux.txt b/build/libs_linux.txt index 41953f3f..b7298fef 100644 --- a/build/libs_linux.txt +++ b/build/libs_linux.txt @@ -2,6 +2,7 @@ Google.Protobuf.dll Newtonsoft.Json.dll libCpuMathNative.so libFastTreeNative.so +libFeaturizers.so libLdaNative.so libMklImports.so libMklProxyNative.so diff --git a/build/libs_mac.txt b/build/libs_mac.txt index 8b5066ed..497791e8 100644 --- a/build/libs_mac.txt +++ b/build/libs_mac.txt @@ -2,6 +2,7 @@ Google.Protobuf.dll Newtonsoft.Json.dll libCpuMathNative.dylib libFastTreeNative.dylib +libFeaturizers.dylib libLdaNative.dylib libMklImports.dylib libMklProxyNative.dylib diff --git a/build/libs_win.txt b/build/libs_win.txt index 7ef9cca7..2b0baca8 100644 --- a/build/libs_win.txt +++ b/build/libs_win.txt @@ -8,8 +8,10 @@ libiomp5md.dll MklImports.dll MklProxyNative.dll SymSgdNative.dll +Featurizers.dll tensorflow.dll TensorFlow.NET.dll NumSharp.Core.dll System.Drawing.Common.dll Microsoft.ML.* +onnxruntime.dll diff --git a/nuget.config b/nuget.config index cedba361..9265d7b5 100644 --- a/nuget.config +++ b/nuget.config @@ -5,6 +5,5 @@ - diff --git a/src/DotNetBridge/Bridge.cs b/src/DotNetBridge/Bridge.cs index a7954355..89e7e652 100644 --- a/src/DotNetBridge/Bridge.cs +++ b/src/DotNetBridge/Bridge.cs @@ -7,8 +7,10 @@ using System.Runtime.InteropServices; using System.Text; using System.Threading; +using Microsoft.ML; using Microsoft.ML.Data; using Microsoft.ML.EntryPoints; +using Microsoft.ML.Featurizers; using Microsoft.ML.Runtime; using Microsoft.ML.Trainers; using Microsoft.ML.Trainers.Ensemble; @@ -296,11 +298,12 @@ private static unsafe int GenericExec(EnvironmentBlock* penv, sbyte* psz, int cd //env.ComponentCatalog.RegisterAssembly(typeof(AutoInference).Assembly); // ML.PipelineInference env.ComponentCatalog.RegisterAssembly(typeof(DataViewReference).Assembly); env.ComponentCatalog.RegisterAssembly(typeof(ImageLoadingTransformer).Assembly); - //env.ComponentCatalog.RegisterAssembly(typeof(SaveOnnxCommand).Assembly); + env.ComponentCatalog.RegisterAssembly(typeof(OnnxExportExtensions).Assembly); //env.ComponentCatalog.RegisterAssembly(typeof(TimeSeriesProcessingEntryPoints).Assembly); //env.ComponentCatalog.RegisterAssembly(typeof(ParquetLoader).Assembly); env.ComponentCatalog.RegisterAssembly(typeof(SsaChangePointDetector).Assembly); env.ComponentCatalog.RegisterAssembly(typeof(DotNetBridgeEntrypoints).Assembly); + env.ComponentCatalog.RegisterAssembly(typeof(DateTimeTransformer).Assembly); using (var ch = host.Start("Executing")) { diff --git a/src/DotNetBridge/DotNetBridge.csproj b/src/DotNetBridge/DotNetBridge.csproj index 00688f17..31c27043 100644 --- a/src/DotNetBridge/DotNetBridge.csproj +++ b/src/DotNetBridge/DotNetBridge.csproj @@ -32,17 +32,21 @@ all runtime; build; native; contentfiles; analyzers - - - - - - - - - - - + + + + + + + + + + + + + + + diff --git a/src/DotNetBridge/Entrypoints.cs b/src/DotNetBridge/Entrypoints.cs index 535d9d75..6f7f8d0c 100644 --- a/src/DotNetBridge/Entrypoints.cs +++ b/src/DotNetBridge/Entrypoints.cs @@ -178,5 +178,59 @@ public static ScoringTransformOutput Score(IHostEnvironment env, ScoringTransfor }; } + + public sealed class OnnxTransformInput : TransformInputBase + { + 
[Argument(ArgumentType.Required, HelpText = "Path to the onnx model file.", ShortName = "model", SortOrder = 0)] + public string ModelFile; + + [Argument(ArgumentType.Multiple, HelpText = "Name of the input column.", SortOrder = 1)] + public string[] InputColumns; + + [Argument(ArgumentType.Multiple, HelpText = "Name of the output column.", SortOrder = 2)] + public string[] OutputColumns; + + [Argument(ArgumentType.AtMostOnce, HelpText = "GPU device id to run on (e.g. 0,1,..). Null for CPU. Requires CUDA 9.1.", SortOrder = 3)] + public int? GpuDeviceId = null; + + [Argument(ArgumentType.AtMostOnce, HelpText = "If true, resumes execution on CPU upon GPU error. If false, will raise the GPU execption.", SortOrder = 4)] + public bool FallbackToCpu = false; + } + + public sealed class OnnxTransformOutput + { + [TlcModule.Output(Desc = "ONNX transformed dataset", SortOrder = 1)] + public IDataView OutputData; + + [TlcModule.Output(Desc = "Transform model", SortOrder = 2)] + public TransformModel Model; + } + + [TlcModule.EntryPoint(Name = "Models.OnnxTransformer", + Desc = "Applies an ONNX model to a dataset.", + UserName = "Onnx Transformer", + ShortName = "onnx-xf")] + public static OnnxTransformOutput ApplyOnnxModel(IHostEnvironment env, OnnxTransformInput input) + { + var host = EntryPointUtils.CheckArgsAndCreateHost(env, "OnnxTransform", input); + + var inputColumns = input.InputColumns ?? (Array.Empty()); + var outputColumns = input.OutputColumns ?? (Array.Empty()); + + var transformsCatalog = new TransformsCatalog(host); + var onnxScoringEstimator = OnnxCatalog.ApplyOnnxModel(transformsCatalog, + outputColumns, + inputColumns, + input.ModelFile, + input.GpuDeviceId, + input.FallbackToCpu); + + var view = onnxScoringEstimator.Fit(input.Data).Transform(input.Data); + return new OnnxTransformOutput() + { + Model = new TransformModelImpl(host, view, input.Data), + OutputData = view + }; + } } } diff --git a/src/DotNetBridge/ManifestUtils.cs b/src/DotNetBridge/ManifestUtils.cs index 7d1c89a5..b566cf2f 100644 --- a/src/DotNetBridge/ManifestUtils.cs +++ b/src/DotNetBridge/ManifestUtils.cs @@ -11,6 +11,7 @@ using System.Text.RegularExpressions; using Microsoft.ML.Data; using Microsoft.ML.EntryPoints; +using Microsoft.ML.Featurizers; using Microsoft.ML.Model.OnnxConverter; using Microsoft.ML.Runtime; using Microsoft.ML.Trainers; @@ -42,8 +43,10 @@ public static class ManifestUtils typeof(ImageLoadingTransformer), typeof(SymbolicSgdLogisticRegressionBinaryTrainer), typeof(OnnxContext), + typeof(OnnxExportExtensions), typeof(SsaForecastingTransformer), - typeof(VariableColumnTransform) + typeof(VariableColumnTransform), + typeof(DateTimeTransformer) }; private static (IEnumerable epListContents, JObject manifest) BuildManifests() diff --git a/src/DotNetBridge/NativeDataInterop.cs b/src/DotNetBridge/NativeDataInterop.cs index 461beb3c..c95584c0 100644 --- a/src/DotNetBridge/NativeDataInterop.cs +++ b/src/DotNetBridge/NativeDataInterop.cs @@ -213,8 +213,11 @@ private static unsafe void SendViewToNativeAsDataFrame(IChannel ch, EnvironmentB } else { - for (int i = 0; i < nSlots; i++) - AddUniqueName(name + "." + i, ref nameIndices, ref nameUtf8Bytes); + if (nSlots == 1) + AddUniqueName(name, ref nameIndices, ref nameUtf8Bytes); + else + for (int i = 0; i < nSlots; i++) + AddUniqueName(name + "." 
+ i, ref nameIndices, ref nameUtf8Bytes); } } else diff --git a/src/DotNetBridge/NativeDataView.cs b/src/DotNetBridge/NativeDataView.cs index 1a829889..d3795757 100644 --- a/src/DotNetBridge/NativeDataView.cs +++ b/src/DotNetBridge/NativeDataView.cs @@ -416,7 +416,7 @@ public TextColumnReader(int batchSize, long rowsCount, int cref, Column[] column _waiterPublish = new OrderedWaiter(firstCleared: true); _queue = new BlockingCollection(QueueSize); - _thdRead = Utils.RunOnBackgroundThread(ThreadProc); + _thdRead = Utils.RunOnBackgroundThreadAsync(ThreadProc); } public void Release() @@ -1406,4 +1406,4 @@ public override void Dispose() #endregion } } -} \ No newline at end of file +} diff --git a/src/DotNetBridge/RmlEnvironment.cs b/src/DotNetBridge/RmlEnvironment.cs index dc9ff045..e9893426 100644 --- a/src/DotNetBridge/RmlEnvironment.cs +++ b/src/DotNetBridge/RmlEnvironment.cs @@ -52,14 +52,9 @@ protected override IHost RegisterCore(HostEnvironmentBase source } public RmlEnvironment(Bridge.CheckCancelled checkDelegate, int? seed = null, bool verbose = false) - : this(RandomUtils.Create(seed), verbose) + : base(seed, verbose) { CheckCancelled = checkDelegate; - } - - public RmlEnvironment(Random rand, bool verbose = false) - : base(rand, verbose) - { CultureInfo.CurrentUICulture = CultureInfo.InvariantCulture; EnsureDispatcher(); } diff --git a/src/DotNetBridge/transforms/VariableColumnTransform.cs b/src/DotNetBridge/transforms/VariableColumnTransform.cs index ea9ecafb..9ee1ebd7 100644 --- a/src/DotNetBridge/transforms/VariableColumnTransform.cs +++ b/src/DotNetBridge/transforms/VariableColumnTransform.cs @@ -247,6 +247,9 @@ public override bool IsColumnActive(DataViewSchema.Column column) return _active[column.Index]; } + private static readonly FuncInstanceMethodInfo1 _makeVarLengthVectorGetterMethodInfo + = FuncInstanceMethodInfo1.Create(target => target.MakeVarLengthVectorGetter); + private Delegate MakeVarLengthVectorGetter(DataViewRow input) { var srcGetters = new ValueGetter[_bindings.vectorToInputMap.Count]; @@ -304,7 +307,7 @@ public override ValueGetter GetGetter(DataViewSchema.Column colu if (column.Index == _bindings.outputColumn) { VectorDataViewType columnType = column.Type as VectorDataViewType; - Delegate getter = Utils.MarshalInvoke(MakeVarLengthVectorGetter, columnType.ItemType.RawType, _cursor); + Delegate getter = Utils.MarshalInvoke(_makeVarLengthVectorGetterMethodInfo, this, columnType.ItemType.RawType, _cursor); return getter as ValueGetter; } else diff --git a/src/Platforms/build.csproj b/src/Platforms/build.csproj index ef1b03d4..065da7d3 100644 --- a/src/Platforms/build.csproj +++ b/src/Platforms/build.csproj @@ -11,17 +11,21 @@ - - - - - - - - - - - + + + + + + + + + + + + + + + diff --git a/src/python/nimbusml.pyproj b/src/python/nimbusml.pyproj index 18356d5c..2deae4ab 100644 --- a/src/python/nimbusml.pyproj +++ b/src/python/nimbusml.pyproj @@ -92,14 +92,19 @@ + + + + + - + @@ -111,6 +116,7 @@ + @@ -125,6 +131,7 @@ + @@ -163,6 +170,7 @@ + @@ -178,6 +186,7 @@ + @@ -233,6 +242,7 @@ + @@ -309,19 +319,25 @@ + + + + + + @@ -345,6 +361,8 @@ + + @@ -383,6 +401,7 @@ + @@ -400,6 +419,7 @@ + @@ -412,6 +432,7 @@ + @@ -440,6 +461,7 @@ + @@ -457,6 +479,8 @@ + + @@ -634,6 +658,7 @@ + @@ -650,7 +675,9 @@ + + @@ -663,6 +690,8 @@ + + @@ -699,16 +728,22 @@ + + + + + + @@ -717,7 +752,11 @@ + + + + diff --git a/src/python/nimbusml/__init__.py b/src/python/nimbusml/__init__.py index afb13002..79762616 100644 --- a/src/python/nimbusml/__init__.py +++ b/src/python/nimbusml/__init__.py 
@@ -2,7 +2,7 @@ Microsoft Machine Learning for Python """ -__version__ = '1.6.1' +__version__ = '1.7.0' # CoreCLR version of MicrosoftML is built on Windows. # But file permissions are not preserved when it's copied to Linux. diff --git a/src/python/nimbusml/base_predictor.py b/src/python/nimbusml/base_predictor.py index e619b115..6b31e566 100644 --- a/src/python/nimbusml/base_predictor.py +++ b/src/python/nimbusml/base_predictor.py @@ -178,7 +178,6 @@ def summary(self): self.model_summary_ = pipeline.summary() return self.model_summary_ - @trace def _get_implicit_transforms( self, features, @@ -308,6 +307,11 @@ def _get_graph_nodes( if label_column is None: label_column = Role.Label self.label_column_name = label_column + + if y is None \ + and self._use_role(Role.Label) \ + and label_column in learner_features: + learner_features.remove(label_column) else: self.label_column_name = None label_column = None @@ -354,3 +358,20 @@ def _get_graph_nodes( row_group_column_name=group_id_column) graph_nodes['learner_node'] = [learner_node] return graph_nodes, learner_features + + @trace + def export_to_onnx(self, *args, **kwargs): + """ + Export the model to the ONNX format. + + See :py:meth:`nimbusml.Pipeline.export_to_onnx` for accepted arguments. + """ + if not hasattr(self, 'model_') \ + or self.model_ is None \ + or not os.path.isfile(self.model_): + + raise ValueError("Model is not fitted. Train or load a model before " + "export_to_onnx().") + + pipeline = Pipeline([self], model=self.model_) + pipeline.export_to_onnx(*args, **kwargs) diff --git a/src/python/nimbusml/base_transform.py b/src/python/nimbusml/base_transform.py index b227d567..f0c4f861 100644 --- a/src/python/nimbusml/base_transform.py +++ b/src/python/nimbusml/base_transform.py @@ -124,3 +124,20 @@ def transform(self, X, as_binary_data_stream=False, **params): data = pipeline.transform( X, as_binary_data_stream=as_binary_data_stream, **params) return data + + @trace + def export_to_onnx(self, *args, **kwargs): + """ + Export the model to the ONNX format. + + See :py:meth:`nimbusml.Pipeline.export_to_onnx` for accepted arguments. + """ + if not hasattr(self, 'model_') \ + or self.model_ is None \ + or not os.path.isfile(self.model_): + + raise ValueError("Model is not fitted. Train or load a model before " + "export_to_onnx().") + + pipeline = Pipeline([self], model=self.model_) + pipeline.export_to_onnx(*args, **kwargs) diff --git a/src/python/nimbusml/ensemble/lightgbmbinaryclassifier.py b/src/python/nimbusml/ensemble/lightgbmbinaryclassifier.py index c87bbbb0..125f536b 100644 --- a/src/python/nimbusml/ensemble/lightgbmbinaryclassifier.py +++ b/src/python/nimbusml/ensemble/lightgbmbinaryclassifier.py @@ -110,6 +110,9 @@ class LightGbmBinaryClassifier( :param handle_missing_value: Enable special handling of missing value or not. + :param use_zero_as_missing_value: Enable usage of zero (0) as missing + value. + :param minimum_example_count_per_group: Minimum number of instances per categorical group. 
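A minimal usage sketch (not part of the diff) for the new LightGBM flag documented above; it assumes only the existing nimbusml.ensemble API and shows how the keyword added in this change is forwarded to the trainer:

    from nimbusml.ensemble import LightGbmBinaryClassifier

    # use_zero_as_missing_value defaults to False; enabling it tells LightGBM
    # to treat zeros as missing values in addition to the usual NaN handling.
    clf = LightGbmBinaryClassifier(use_zero_as_missing_value=True)
    # clf.fit(train_df, label_df); clf.predict(test_df)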
@@ -165,6 +168,7 @@ def __init__( batch_size=1048576, use_categorical_split=None, handle_missing_value=True, + use_zero_as_missing_value=False, minimum_example_count_per_group=100, maximum_categorical_split_point_count=32, categorical_smoothing=10.0, @@ -219,6 +223,7 @@ def __init__( batch_size=batch_size, use_categorical_split=use_categorical_split, handle_missing_value=handle_missing_value, + use_zero_as_missing_value=use_zero_as_missing_value, minimum_example_count_per_group=minimum_example_count_per_group, maximum_categorical_split_point_count=maximum_categorical_split_point_count, categorical_smoothing=categorical_smoothing, diff --git a/src/python/nimbusml/ensemble/lightgbmclassifier.py b/src/python/nimbusml/ensemble/lightgbmclassifier.py index a6951be2..d3d000e3 100644 --- a/src/python/nimbusml/ensemble/lightgbmclassifier.py +++ b/src/python/nimbusml/ensemble/lightgbmclassifier.py @@ -105,6 +105,9 @@ class LightGbmClassifier(core, BasePredictor, ClassifierMixin): :param handle_missing_value: Enable special handling of missing value or not. + :param use_zero_as_missing_value: Enable usage of zero (0) as missing + value. + :param minimum_example_count_per_group: Minimum number of instances per categorical group. @@ -160,6 +163,7 @@ def __init__( batch_size=1048576, use_categorical_split=None, handle_missing_value=True, + use_zero_as_missing_value=False, minimum_example_count_per_group=100, maximum_categorical_split_point_count=32, categorical_smoothing=10.0, @@ -214,6 +218,7 @@ def __init__( batch_size=batch_size, use_categorical_split=use_categorical_split, handle_missing_value=handle_missing_value, + use_zero_as_missing_value=use_zero_as_missing_value, minimum_example_count_per_group=minimum_example_count_per_group, maximum_categorical_split_point_count=maximum_categorical_split_point_count, categorical_smoothing=categorical_smoothing, diff --git a/src/python/nimbusml/ensemble/lightgbmranker.py b/src/python/nimbusml/ensemble/lightgbmranker.py index fb96f5cd..61bcbd90 100644 --- a/src/python/nimbusml/ensemble/lightgbmranker.py +++ b/src/python/nimbusml/ensemble/lightgbmranker.py @@ -105,6 +105,9 @@ class LightGbmRanker(core, BasePredictor, ClassifierMixin): :param handle_missing_value: Enable special handling of missing value or not. + :param use_zero_as_missing_value: Enable usage of zero (0) as missing + value. + :param minimum_example_count_per_group: Minimum number of instances per categorical group. @@ -159,6 +162,7 @@ def __init__( batch_size=1048576, use_categorical_split=None, handle_missing_value=True, + use_zero_as_missing_value=False, minimum_example_count_per_group=100, maximum_categorical_split_point_count=32, categorical_smoothing=10.0, @@ -212,6 +216,7 @@ def __init__( batch_size=batch_size, use_categorical_split=use_categorical_split, handle_missing_value=handle_missing_value, + use_zero_as_missing_value=use_zero_as_missing_value, minimum_example_count_per_group=minimum_example_count_per_group, maximum_categorical_split_point_count=maximum_categorical_split_point_count, categorical_smoothing=categorical_smoothing, diff --git a/src/python/nimbusml/ensemble/lightgbmregressor.py b/src/python/nimbusml/ensemble/lightgbmregressor.py index 0d0a69ae..89258a7f 100644 --- a/src/python/nimbusml/ensemble/lightgbmregressor.py +++ b/src/python/nimbusml/ensemble/lightgbmregressor.py @@ -98,6 +98,9 @@ class LightGbmRegressor(core, BasePredictor, RegressorMixin): :param handle_missing_value: Enable special handling of missing value or not. 
+ :param use_zero_as_missing_value: Enable usage of zero (0) as missing + value. + :param minimum_example_count_per_group: Minimum number of instances per categorical group. @@ -150,6 +153,7 @@ def __init__( batch_size=1048576, use_categorical_split=None, handle_missing_value=True, + use_zero_as_missing_value=False, minimum_example_count_per_group=100, maximum_categorical_split_point_count=32, categorical_smoothing=10.0, @@ -201,6 +205,7 @@ def __init__( batch_size=batch_size, use_categorical_split=use_categorical_split, handle_missing_value=handle_missing_value, + use_zero_as_missing_value=use_zero_as_missing_value, minimum_example_count_per_group=minimum_example_count_per_group, maximum_categorical_split_point_count=maximum_categorical_split_point_count, categorical_smoothing=categorical_smoothing, diff --git a/src/python/nimbusml/examples/DateTimeSplitter.py b/src/python/nimbusml/examples/DateTimeSplitter.py new file mode 100644 index 00000000..fd8612d3 --- /dev/null +++ b/src/python/nimbusml/examples/DateTimeSplitter.py @@ -0,0 +1,31 @@ +############################################################################### +# DateTimeSplitter +import pandas as pd +from nimbusml import FileDataStream +from nimbusml.datasets import get_dataset +from nimbusml.preprocessing import DateTimeSplitter + +# data input (as a FileDataStream) +path = get_dataset('infert').as_filepath() + +data = FileDataStream.read_csv(path, sep=',') + +# transform usage +xf = DateTimeSplitter(prefix='dt_') << 'age' + +# fit and transform +features = xf.fit_transform(data) + +features = features.drop(['row_num', 'education', 'parity', 'induced', + 'case', 'spontaneous', 'stratum', 'pooled.stratum'], axis=1) + +# print features +pd.set_option('display.max_columns', None) +pd.set_option('display.width', 1000) +print(features.head()) +# age dt_Year dt_Month dt_Day dt_Hour dt_Minute dt_Second dt_AmPm dt_Hour12 dt_DayOfWeek dt_DayOfQuarter dt_DayOfYear dt_WeekOfMonth dt_QuarterOfYear dt_HalfOfYear dt_WeekIso dt_YearIso dt_MonthLabel dt_AmPmLabel dt_DayOfWeekLabel dt_HolidayName dt_IsPaidTimeOff +# 0 26 1970 1 1 0 0 26 0 0 4 1 0 0 1 1 1 1970 January am Thursday None 0 +# 1 42 1970 1 1 0 0 42 0 0 4 1 0 0 1 1 1 1970 January am Thursday None 0 +# 2 39 1970 1 1 0 0 39 0 0 4 1 0 0 1 1 1 1970 January am Thursday None 0 +# 3 34 1970 1 1 0 0 34 0 0 4 1 0 0 1 1 1 1970 January am Thursday None 0 +# 4 35 1970 1 1 0 0 35 0 0 4 1 0 0 1 1 1 1970 January am Thursday None 0 \ No newline at end of file diff --git a/src/python/nimbusml/examples/RobustScaler.py b/src/python/nimbusml/examples/RobustScaler.py new file mode 100644 index 00000000..4c6a6405 --- /dev/null +++ b/src/python/nimbusml/examples/RobustScaler.py @@ -0,0 +1,39 @@ +############################################################################### +# RobustScaler +import numpy +from nimbusml import FileDataStream +from nimbusml.datasets import get_dataset +from nimbusml.preprocessing.normalization import RobustScaler + +# data input (as a FileDataStream) +path = get_dataset('infert').as_filepath() +data = FileDataStream.read_csv(path, sep=',') + +print(data.head()) +# row_num education age parity induced case spontaneous stratum pooled.stratum +# 0 1 0-5yrs 26 6 1 1 2 1 3 +# 1 2 0-5yrs 42 1 1 1 0 2 1 +# 2 3 0-5yrs 39 6 2 1 0 3 4 +# 3 4 0-5yrs 34 4 2 1 0 4 2 +# 4 5 6-11yrs 35 3 1 1 1 5 32 + +# transform usage +xf = RobustScaler( + center=True, scale=True, + columns={'age_norm': 'age', 'par_norm': 'parity'}) + +# fit and transform +features = xf.fit_transform(data) + 
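A short aside (not part of the example file): with center=True and scale=True, RobustScaler as documented subtracts the per-column median and divides by the quantile range (25th-75th percentile by default). The median and IQR values below are inferred from the printed output rather than computed from the dataset, so treat them as an illustrative check only:

    # Hedged check of the documented formula (x - median) / IQR for 'parity':
    median, iqr = 2.0, 2.5          # values consistent with par_norm shown below
    print((6 - median) / iqr)       # 1.6, matching par_norm in row 0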
+print(features.head(n=10)) +# row_num education age parity induced case spontaneous stratum pooled.stratum age_norm par_norm +# 0 1 0-5yrs 26 6 1 1 2 1 3 -0.434783 1.6 +# 1 2 0-5yrs 42 1 1 1 0 2 1 0.956522 -0.4 +# 2 3 0-5yrs 39 6 2 1 0 3 4 0.695652 1.6 +# 3 4 0-5yrs 34 4 2 1 0 4 2 0.260870 0.8 +# 4 5 6-11yrs 35 3 1 1 1 5 32 0.347826 0.4 +# 5 6 6-11yrs 36 4 2 1 1 6 36 0.434783 0.8 +# 6 7 6-11yrs 23 1 0 1 0 7 6 -0.695652 -0.4 +# 7 8 6-11yrs 32 2 0 1 0 8 22 0.086957 0.0 +# 8 9 6-11yrs 21 1 0 1 1 9 5 -0.869565 -0.4 +# 9 10 6-11yrs 28 2 0 1 0 10 19 -0.260870 0.0 diff --git a/src/python/nimbusml/examples/ToKeyImputer.py b/src/python/nimbusml/examples/ToKeyImputer.py new file mode 100644 index 00000000..820127f5 --- /dev/null +++ b/src/python/nimbusml/examples/ToKeyImputer.py @@ -0,0 +1,35 @@ +############################################################################### +# ToKey +import numpy +from nimbusml import FileDataStream +from nimbusml.datasets import get_dataset +from nimbusml.preprocessing import ToKeyImputer + +# data input (as a FileDataStream) +path = get_dataset('airquality').as_filepath() + +data = FileDataStream.read_csv(path, sep=',', numeric_dtype=numpy.float32, + names={0: 'id'}) +print(data.head(6)) +# id Ozone Solar_R Wind Temp Month Day +# 0 1.0 41.0 190.0 7.4 67.0 5.0 1.0 +# 1 2.0 36.0 118.0 8.0 72.0 5.0 2.0 +# 2 3.0 12.0 149.0 12.6 74.0 5.0 3.0 +# 3 4.0 18.0 313.0 11.5 62.0 5.0 4.0 +# 4 5.0 NaN NaN 14.3 56.0 5.0 5.0 +# 5 6.0 28.0 NaN 14.9 66.0 5.0 6.0 + + +# transform usage +xf = ToKeyImputer(columns={'Ozone_1': 'Ozone', 'Solar_R_1': 'Solar_R'}) + +# fit and transform +features = xf.fit_transform(data) +print(features.head(6)) +# id Ozone Solar_R Wind Temp Month Day Ozone_1 Solar_R_1 +# 0 1.0 41.0 190.0 7.4 67.0 5.0 1.0 41.0 190.0 +# 1 2.0 36.0 118.0 8.0 72.0 5.0 2.0 36.0 118.0 +# 2 3.0 12.0 149.0 12.6 74.0 5.0 3.0 12.0 149.0 +# 3 4.0 18.0 313.0 11.5 62.0 5.0 4.0 18.0 313.0 +# 4 5.0 NaN NaN 14.3 56.0 5.0 5.0 23.0 238.0 <== Missing values have been updated +# 5 6.0 28.0 NaN 14.9 66.0 5.0 6.0 28.0 238.0 <== Missing values have been updated diff --git a/src/python/nimbusml/examples/ToString.py b/src/python/nimbusml/examples/ToString.py new file mode 100644 index 00000000..82185d32 --- /dev/null +++ b/src/python/nimbusml/examples/ToString.py @@ -0,0 +1,45 @@ +############################################################################### +# ToKey +import numpy +from nimbusml import FileDataStream +from nimbusml.datasets import get_dataset +from nimbusml.preprocessing import ToString + +# data input (as a FileDataStream) +path = get_dataset('infert').as_filepath() + +data = FileDataStream.read_csv(path, sep=',', numeric_dtype=numpy.float32, + names={0: 'id'}) +print(data.head()) +# id education age parity induced case spontaneous stratum pooled.stratum +# 0 1.0 0-5yrs 26.0 6.0 1.0 1.0 2.0 1.0 3.0 +# 1 2.0 0-5yrs 42.0 1.0 1.0 1.0 0.0 2.0 1.0 +# 2 3.0 0-5yrs 39.0 6.0 2.0 1.0 0.0 3.0 4.0 +# 3 4.0 0-5yrs 34.0 4.0 2.0 1.0 0.0 4.0 2.0 +# 4 5.0 6-11yrs 35.0 3.0 1.0 1.0 1.0 5.0 32.0 + +# transform usage +xf = ToString(columns={'id_1': 'id', 'age_1': 'age'}) + +# fit and transform +features = xf.fit_transform(data) +print(features.head()) +# id education age parity induced case spontaneous stratum pooled.stratum id_1 age_1 +# 0 1.0 0-5yrs 26.0 6.0 1.0 1.0 2.0 1.0 3.0 1.000000 26.000000 +# 1 2.0 0-5yrs 42.0 1.0 1.0 1.0 0.0 2.0 1.0 2.000000 42.000000 +# 2 3.0 0-5yrs 39.0 6.0 2.0 1.0 0.0 3.0 4.0 3.000000 39.000000 +# 3 4.0 0-5yrs 34.0 4.0 2.0 1.0 0.0 4.0 2.0 4.000000 34.000000 +# 4 5.0 
6-11yrs 35.0 3.0 1.0 1.0 1.0 5.0 32.0 5.000000 35.000000 + +print(features.dtypes) +# id float32 +# education object +# age float32 +# parity float32 +# induced float32 +# case float32 +# spontaneous float32 +# stratum float32 +# pooled.stratum float32 +# id_1 object <== string column +# age_1 object <== string column diff --git a/src/python/nimbusml/examples/examples_from_dataframe/DateTimeSplitter_df.py b/src/python/nimbusml/examples/examples_from_dataframe/DateTimeSplitter_df.py new file mode 100644 index 00000000..f049c39a --- /dev/null +++ b/src/python/nimbusml/examples/examples_from_dataframe/DateTimeSplitter_df.py @@ -0,0 +1,33 @@ +############################################################################### +# DateTimeSplitter +import pandas +from nimbusml import Pipeline +from nimbusml.preprocessing import DateTimeSplitter +from nimbusml.preprocessing.schema import ColumnSelector + +df = pandas.DataFrame(data=dict( + tokens1=[1, 2, 3, 157161600], + tokens2=[10, 11, 12, 13] +)) + +cols_to_drop = [ + 'dtHour12', 'dtDayOfWeek', 'dtDayOfQuarter', + 'dtDayOfYear', 'dtWeekOfMonth', 'dtQuarterOfYear', + 'dtHalfOfYear', 'dtWeekIso', 'dtYearIso', 'dtMonthLabel', + 'dtAmPmLabel', 'dtDayOfWeekLabel', 'dtIsPaidTimeOff' +] + +dts = DateTimeSplitter(prefix='dt', country='Canada') << 'tokens1' + +pipeline = Pipeline([dts, ColumnSelector(drop_columns=cols_to_drop)]) +y = pipeline.fit_transform(df) + +# view the three columns +pandas.set_option('display.max_columns', None) +pandas.set_option('display.width', 1000) +print(y) +# tokens1 tokens2 dtYear dtMonth dtDay dtHour dtMinute dtSecond dtAmPm dtHolidayName +# 0 1 10 1970 1 1 0 0 1 0 New Year's Day +# 1 2 11 1970 1 1 0 0 2 0 New Year's Day +# 2 3 12 1970 1 1 0 0 3 0 New Year's Day +# 3 157161600 13 1974 12 25 0 0 0 0 Christmas Day diff --git a/src/python/nimbusml/examples/examples_from_dataframe/OnnxRunner_df.py b/src/python/nimbusml/examples/examples_from_dataframe/OnnxRunner_df.py new file mode 100644 index 00000000..13c9f0ce --- /dev/null +++ b/src/python/nimbusml/examples/examples_from_dataframe/OnnxRunner_df.py @@ -0,0 +1,49 @@ +import os +import tempfile +import numpy as np +import pandas as pd +from nimbusml import Pipeline +from nimbusml.preprocessing import OnnxRunner +from nimbusml.preprocessing.normalization import MinMaxScaler + + +def get_tmp_file(suffix=None): + fd, file_name = tempfile.mkstemp(suffix=suffix) + fl = os.fdopen(fd, 'w') + fl.close() + return file_name + +# Generate the train and test data +np.random.seed(0) +x = np.arange(100, step=0.1) +y = x * 10 + (np.random.standard_normal(len(x)) * 10) +train_data = {'c1': x, 'c2': y} +train_df = pd.DataFrame(train_data).astype({'c1': np.float32, 'c2': np.float32}) + +test_data = {'c1': [2.5, 30.5], 'c2': [1, 1]} +test_df = pd.DataFrame(test_data).astype({'c1': np.float32, 'c2': np.float32}) + +# Fit a MinMaxScaler Pipeline +r1 = Pipeline([MinMaxScaler()]) +r1.fit(train_df) + +# Export the pipeline to ONNX +onnx_path = get_tmp_file('.onnx') +r1.export_to_onnx(onnx_path, 'com.microsoft.ml', onnx_version='Stable') + +# Perform the transform using the standard ML.Net backend +result_standard = r1.transform(test_df) +print(result_standard) +# c1 c2 +# 0 0.025025 0.000998 +# 1 0.305305 0.000998 + +# Perform the transform using the ONNX backend. +# Note, the extra columns and column name differences +# is a known issue with the ML.Net backend. 
+onnxrunner = OnnxRunner(model_file=onnx_path) +result_onnx = onnxrunner.fit_transform(test_df) +print(result_onnx) +# c1 c2 c12.0 c22.0 +# 0 2.5 1.0 0.025025 0.000998 +# 1 30.5 1.0 0.305305 0.000998 diff --git a/src/python/nimbusml/examples/examples_from_dataframe/RobustScaler_df.py b/src/python/nimbusml/examples/examples_from_dataframe/RobustScaler_df.py new file mode 100644 index 00000000..ff0ae793 --- /dev/null +++ b/src/python/nimbusml/examples/examples_from_dataframe/RobustScaler_df.py @@ -0,0 +1,20 @@ +############################################################################### +# RobustScaler +import pandas as pd +from nimbusml import Pipeline +from nimbusml.preprocessing.normalization import RobustScaler + + +df = pd.DataFrame(data=dict(c0=[1, 3, 5, 7, 9])) + +xf = RobustScaler(columns='c0', center=True, scale=True) +pipeline = Pipeline([xf]) +result = pipeline.fit_transform(df) + +print(result) +# c0 +# 0 -1.0 +# 1 -0.5 +# 2 0.0 +# 3 0.5 +# 4 1.0 diff --git a/src/python/nimbusml/examples/examples_from_dataframe/TimeSeriesImputer_df.py b/src/python/nimbusml/examples/examples_from_dataframe/TimeSeriesImputer_df.py new file mode 100644 index 00000000..38ec9073 --- /dev/null +++ b/src/python/nimbusml/examples/examples_from_dataframe/TimeSeriesImputer_df.py @@ -0,0 +1,29 @@ +############################################################################### +# DateTimeSplitter +import pandas +from nimbusml.timeseries import TimeSeriesImputer + +df = pandas.DataFrame(data=dict( + ts=[1, 2, 3, 5], + grain=[1970, 1970, 1970, 1970], + c3=[10, 13, 15, 20], + c4=[19, 12, 16, 19] +)) + +print(df) + +tsi = TimeSeriesImputer(time_series_column='ts', + grain_columns=['grain'], + filter_columns=['c3', 'c4'], + impute_mode='ForwardFill', + filter_mode='Include') +result = tsi.fit_transform(df) + +print(result) +# ts grain c3 c4 IsRowImputed +# 0 0 0 0 0 False +# 1 1 1970 10 19 False +# 2 2 1970 13 12 False +# 3 3 1970 15 16 False +# 4 4 1970 15 16 True <== New row added +# 5 5 1970 20 19 False diff --git a/src/python/nimbusml/examples/examples_from_dataframe/ToKeyImputer_df.py b/src/python/nimbusml/examples/examples_from_dataframe/ToKeyImputer_df.py new file mode 100644 index 00000000..f613e3f4 --- /dev/null +++ b/src/python/nimbusml/examples/examples_from_dataframe/ToKeyImputer_df.py @@ -0,0 +1,34 @@ +############################################################################### +# ToKeyImputer + +import pandas +from nimbusml.preprocessing import ToKeyImputer + +# Create the data +text_df = pandas.DataFrame( + data=dict( + text=[ + "cat", + "dog", + "fish", + "orange", + "cat orange", + "dog", + "fish", + None, + "spider"])) + +tokey = ToKeyImputer() << 'text' +y = tokey.fit_transform(text_df) +print(y) + +# text +# 0 cat +# 1 dog +# 2 fish +# 3 orange +# 4 cat orange +# 5 dog +# 6 fish +# 7 dog <== Missing value has been replaced +# 8 spider diff --git a/src/python/nimbusml/examples/examples_from_dataframe/ToString_df.py b/src/python/nimbusml/examples/examples_from_dataframe/ToString_df.py new file mode 100644 index 00000000..b6c631fd --- /dev/null +++ b/src/python/nimbusml/examples/examples_from_dataframe/ToString_df.py @@ -0,0 +1,43 @@ +############################################################################### +# ToString + +import pandas +from nimbusml.preprocessing import ToString, ToKey +from pandas import Categorical + +# Create the data +categorical_df = pandas.DataFrame(data=dict( + key=Categorical.from_codes([0, 1, 2, 1, 2, 0], categories=['a', 'b', 'c']), + text=['b', 'c', 'a', 
'b', 'a', 'c'])) + +print(categorical_df.dtypes) +# key category +# text object +# dtype: object + +tostring = ToString(columns='key') +y = tostring.fit_transform(categorical_df) +print(y) +# key text +# 0 1 b +# 1 2 c +# 2 3 a +# 3 2 b +# 4 3 a +# 5 1 c + +print(y.dtypes) +# key object <== converted to string +# text object +# dtype: object + +tokey = ToKey(columns='text') +y = tokey.fit_transform(categorical_df) +y2 = tostring.clone().fit_transform(y) +print(y2['text'] == categorical_df['text']) +# 0 True +# 1 True +# 2 True +# 3 True +# 4 True +# 5 True diff --git a/src/python/nimbusml/internal/core/ensemble/lightgbmbinaryclassifier.py b/src/python/nimbusml/internal/core/ensemble/lightgbmbinaryclassifier.py index 2bf8468b..1ae0934d 100644 --- a/src/python/nimbusml/internal/core/ensemble/lightgbmbinaryclassifier.py +++ b/src/python/nimbusml/internal/core/ensemble/lightgbmbinaryclassifier.py @@ -99,6 +99,9 @@ class LightGbmBinaryClassifier( :param handle_missing_value: Enable special handling of missing value or not. + :param use_zero_as_missing_value: Enable usage of zero (0) as missing + value. + :param minimum_example_count_per_group: Minimum number of instances per categorical group. @@ -154,6 +157,7 @@ def __init__( batch_size=1048576, use_categorical_split=None, handle_missing_value=True, + use_zero_as_missing_value=False, minimum_example_count_per_group=100, maximum_categorical_split_point_count=32, categorical_smoothing=10.0, @@ -183,6 +187,7 @@ def __init__( self.batch_size = batch_size self.use_categorical_split = use_categorical_split self.handle_missing_value = handle_missing_value + self.use_zero_as_missing_value = use_zero_as_missing_value self.minimum_example_count_per_group = minimum_example_count_per_group self.maximum_categorical_split_point_count = maximum_categorical_split_point_count self.categorical_smoothing = categorical_smoothing @@ -220,6 +225,7 @@ def _get_node(self, **all_args): batch_size=self.batch_size, use_categorical_split=self.use_categorical_split, handle_missing_value=self.handle_missing_value, + use_zero_as_missing_value=self.use_zero_as_missing_value, minimum_example_count_per_group=self.minimum_example_count_per_group, maximum_categorical_split_point_count=self.maximum_categorical_split_point_count, categorical_smoothing=self.categorical_smoothing, diff --git a/src/python/nimbusml/internal/core/ensemble/lightgbmclassifier.py b/src/python/nimbusml/internal/core/ensemble/lightgbmclassifier.py index 5feace13..7bb5466a 100644 --- a/src/python/nimbusml/internal/core/ensemble/lightgbmclassifier.py +++ b/src/python/nimbusml/internal/core/ensemble/lightgbmclassifier.py @@ -97,6 +97,9 @@ class LightGbmClassifier( :param handle_missing_value: Enable special handling of missing value or not. + :param use_zero_as_missing_value: Enable usage of zero (0) as missing + value. + :param minimum_example_count_per_group: Minimum number of instances per categorical group. 
@@ -152,6 +155,7 @@ def __init__( batch_size=1048576, use_categorical_split=None, handle_missing_value=True, + use_zero_as_missing_value=False, minimum_example_count_per_group=100, maximum_categorical_split_point_count=32, categorical_smoothing=10.0, @@ -181,6 +185,7 @@ def __init__( self.batch_size = batch_size self.use_categorical_split = use_categorical_split self.handle_missing_value = handle_missing_value + self.use_zero_as_missing_value = use_zero_as_missing_value self.minimum_example_count_per_group = minimum_example_count_per_group self.maximum_categorical_split_point_count = maximum_categorical_split_point_count self.categorical_smoothing = categorical_smoothing @@ -218,6 +223,7 @@ def _get_node(self, **all_args): batch_size=self.batch_size, use_categorical_split=self.use_categorical_split, handle_missing_value=self.handle_missing_value, + use_zero_as_missing_value=self.use_zero_as_missing_value, minimum_example_count_per_group=self.minimum_example_count_per_group, maximum_categorical_split_point_count=self.maximum_categorical_split_point_count, categorical_smoothing=self.categorical_smoothing, diff --git a/src/python/nimbusml/internal/core/ensemble/lightgbmranker.py b/src/python/nimbusml/internal/core/ensemble/lightgbmranker.py index 6c06148d..c3394cf4 100644 --- a/src/python/nimbusml/internal/core/ensemble/lightgbmranker.py +++ b/src/python/nimbusml/internal/core/ensemble/lightgbmranker.py @@ -95,6 +95,9 @@ class LightGbmRanker(BasePipelineItem, DefaultSignatureWithRoles): :param handle_missing_value: Enable special handling of missing value or not. + :param use_zero_as_missing_value: Enable usage of zero (0) as missing + value. + :param minimum_example_count_per_group: Minimum number of instances per categorical group. @@ -149,6 +152,7 @@ def __init__( batch_size=1048576, use_categorical_split=None, handle_missing_value=True, + use_zero_as_missing_value=False, minimum_example_count_per_group=100, maximum_categorical_split_point_count=32, categorical_smoothing=10.0, @@ -176,6 +180,7 @@ def __init__( self.batch_size = batch_size self.use_categorical_split = use_categorical_split self.handle_missing_value = handle_missing_value + self.use_zero_as_missing_value = use_zero_as_missing_value self.minimum_example_count_per_group = minimum_example_count_per_group self.maximum_categorical_split_point_count = maximum_categorical_split_point_count self.categorical_smoothing = categorical_smoothing @@ -212,6 +217,7 @@ def _get_node(self, **all_args): batch_size=self.batch_size, use_categorical_split=self.use_categorical_split, handle_missing_value=self.handle_missing_value, + use_zero_as_missing_value=self.use_zero_as_missing_value, minimum_example_count_per_group=self.minimum_example_count_per_group, maximum_categorical_split_point_count=self.maximum_categorical_split_point_count, categorical_smoothing=self.categorical_smoothing, diff --git a/src/python/nimbusml/internal/core/ensemble/lightgbmregressor.py b/src/python/nimbusml/internal/core/ensemble/lightgbmregressor.py index 20fe5e57..b4cb7b5e 100644 --- a/src/python/nimbusml/internal/core/ensemble/lightgbmregressor.py +++ b/src/python/nimbusml/internal/core/ensemble/lightgbmregressor.py @@ -90,6 +90,9 @@ class LightGbmRegressor( :param handle_missing_value: Enable special handling of missing value or not. + :param use_zero_as_missing_value: Enable usage of zero (0) as missing + value. + :param minimum_example_count_per_group: Minimum number of instances per categorical group. 
@@ -142,6 +145,7 @@ def __init__( batch_size=1048576, use_categorical_split=None, handle_missing_value=True, + use_zero_as_missing_value=False, minimum_example_count_per_group=100, maximum_categorical_split_point_count=32, categorical_smoothing=10.0, @@ -168,6 +172,7 @@ def __init__( self.batch_size = batch_size self.use_categorical_split = use_categorical_split self.handle_missing_value = handle_missing_value + self.use_zero_as_missing_value = use_zero_as_missing_value self.minimum_example_count_per_group = minimum_example_count_per_group self.maximum_categorical_split_point_count = maximum_categorical_split_point_count self.categorical_smoothing = categorical_smoothing @@ -202,6 +207,7 @@ def _get_node(self, **all_args): batch_size=self.batch_size, use_categorical_split=self.use_categorical_split, handle_missing_value=self.handle_missing_value, + use_zero_as_missing_value=self.use_zero_as_missing_value, minimum_example_count_per_group=self.minimum_example_count_per_group, maximum_categorical_split_point_count=self.maximum_categorical_split_point_count, categorical_smoothing=self.categorical_smoothing, diff --git a/src/python/nimbusml/internal/core/preprocessing/datetimesplitter.py b/src/python/nimbusml/internal/core/preprocessing/datetimesplitter.py new file mode 100644 index 00000000..a00c3dc6 --- /dev/null +++ b/src/python/nimbusml/internal/core/preprocessing/datetimesplitter.py @@ -0,0 +1,57 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +DateTimeSplitter +""" + +__all__ = ["DateTimeSplitter"] + + +from ...entrypoints.transforms_datetimesplitter import \ + transforms_datetimesplitter +from ...utils.utils import trace +from ..base_pipeline_item import BasePipelineItem, DefaultSignature + + +class DateTimeSplitter(BasePipelineItem, DefaultSignature): + """ + **Description** + Splits a date time value into each individual component + + :param prefix: Output column prefix. + + :param country: Country to get holidays for. Defaults to none if not + passed. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + prefix, + country='None', + **params): + BasePipelineItem.__init__( + self, type='transform', **params) + + self.prefix = prefix + self.country = country + + @property + def _entrypoint(self): + return transforms_datetimesplitter + + @trace + def _get_node(self, **all_args): + algo_args = dict( + source=self.source, + prefix=self.prefix, + country=self.country) + + all_args.update(algo_args) + return self._entrypoint(**all_args) diff --git a/src/python/nimbusml/internal/core/preprocessing/normalization/robustscaler.py b/src/python/nimbusml/internal/core/preprocessing/normalization/robustscaler.py new file mode 100644 index 00000000..08845bae --- /dev/null +++ b/src/python/nimbusml/internal/core/preprocessing/normalization/robustscaler.py @@ -0,0 +1,103 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +RobustScaler +""" + +__all__ = ["RobustScaler"] + + +from ....entrypoints.transforms_robustscaler import transforms_robustscaler +from ....utils.utils import trace +from ...base_pipeline_item import BasePipelineItem, DefaultSignature + + +class RobustScaler(BasePipelineItem, DefaultSignature): + """ + **Description** + Removes the median and scales the data according to the quantile range. + + :param center: If True, center the data before scaling. + + :param scale: If True, scale the data to interquartile range. + + :param quantile_min: Min for the quantile range used to calculate scale. + + :param quantile_max: Max for the quantile range used to calculate scale. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + center=True, + scale=True, + quantile_min=25.0, + quantile_max=75.0, + **params): + BasePipelineItem.__init__( + self, type='transform', **params) + + self.center = center + self.scale = scale + self.quantile_min = quantile_min + self.quantile_max = quantile_max + + @property + def _entrypoint(self): + return transforms_robustscaler + + @trace + def _get_node(self, **all_args): + + input_columns = self.input + if input_columns is None and 'input' in all_args: + input_columns = all_args['input'] + if 'input' in all_args: + all_args.pop('input') + + output_columns = self.output + if output_columns is None and 'output' in all_args: + output_columns = all_args['output'] + if 'output' in all_args: + all_args.pop('output') + + # validate input + if input_columns is None: + raise ValueError( + "'None' input passed when it cannot be none.") + + if not isinstance(input_columns, list): + raise ValueError( + "input has to be a list of strings, instead got %s" % + type(input_columns)) + + # validate output + if output_columns is None: + output_columns = input_columns + + if not isinstance(output_columns, list): + raise ValueError( + "output has to be a list of strings, instead got %s" % + type(output_columns)) + + algo_args = dict( + column=[ + dict( + Source=i, + Name=o) for i, + o in zip( + input_columns, + output_columns)] if input_columns else None, + center=self.center, + scale=self.scale, + quantile_min=self.quantile_min, + quantile_max=self.quantile_max) + + all_args.update(algo_args) + return self._entrypoint(**all_args) diff --git a/src/python/nimbusml/internal/core/preprocessing/onnxrunner.py b/src/python/nimbusml/internal/core/preprocessing/onnxrunner.py new file mode 100644 index 00000000..34ed46ba --- /dev/null +++ b/src/python/nimbusml/internal/core/preprocessing/onnxrunner.py @@ -0,0 +1,71 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +OnnxRunner +""" + +__all__ = ["OnnxRunner"] + + +from ...entrypoints.models_onnxtransformer import models_onnxtransformer +from ...utils.utils import trace +from ..base_pipeline_item import BasePipelineItem, DefaultSignature + + +class OnnxRunner(BasePipelineItem, DefaultSignature): + """ + **Description** + Applies an ONNX model to a dataset. + + :param model_file: Path to the onnx model file. 
+ + :param input_columns: Name of the input column. + + :param output_columns: Name of the output column. + + :param gpu_device_id: GPU device id to run on (e.g. 0,1,..). Null for CPU. + Requires CUDA 9.1. + + :param fallback_to_cpu: If true, resumes execution on CPU upon GPU error. + If false, will raise the GPU execption. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + model_file, + input_columns=None, + output_columns=None, + gpu_device_id=None, + fallback_to_cpu=False, + **params): + BasePipelineItem.__init__( + self, type='transform', **params) + + self.model_file = model_file + self.input_columns = input_columns + self.output_columns = output_columns + self.gpu_device_id = gpu_device_id + self.fallback_to_cpu = fallback_to_cpu + + @property + def _entrypoint(self): + return models_onnxtransformer + + @trace + def _get_node(self, **all_args): + algo_args = dict( + model_file=self.model_file, + input_columns=self.input_columns, + output_columns=self.output_columns, + gpu_device_id=self.gpu_device_id, + fallback_to_cpu=self.fallback_to_cpu) + + all_args.update(algo_args) + return self._entrypoint(**all_args) diff --git a/src/python/nimbusml/internal/core/preprocessing/tokeyimputer.py b/src/python/nimbusml/internal/core/preprocessing/tokeyimputer.py new file mode 100644 index 00000000..e82498a3 --- /dev/null +++ b/src/python/nimbusml/internal/core/preprocessing/tokeyimputer.py @@ -0,0 +1,80 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +ToKeyImputer +""" + +__all__ = ["ToKeyImputer"] + + +from ...entrypoints.transforms_categoryimputer import \ + transforms_categoryimputer +from ...utils.utils import trace +from ..base_pipeline_item import BasePipelineItem, DefaultSignature + + +class ToKeyImputer(BasePipelineItem, DefaultSignature): + """ + **Description** + Fills in missing values in a column based on the most frequent value + + :param params: Additional arguments sent to compute engine. 
+ + """ + + @trace + def __init__( + self, + **params): + BasePipelineItem.__init__( + self, type='transform', **params) + + @property + def _entrypoint(self): + return transforms_categoryimputer + + @trace + def _get_node(self, **all_args): + + input_columns = self.input + if input_columns is None and 'input' in all_args: + input_columns = all_args['input'] + if 'input' in all_args: + all_args.pop('input') + + output_columns = self.output + if output_columns is None and 'output' in all_args: + output_columns = all_args['output'] + if 'output' in all_args: + all_args.pop('output') + + # validate input + if input_columns is None: + raise ValueError( + "'None' input passed when it cannot be none.") + + if not isinstance(input_columns, list): + raise ValueError( + "input has to be a list of strings, instead got %s" % + type(input_columns)) + + # validate output + if output_columns is None: + output_columns = input_columns + + if not isinstance(output_columns, list): + raise ValueError( + "output has to be a list of strings, instead got %s" % + type(output_columns)) + + algo_args = dict( + column=[ + dict( + Source=i, Name=o) for i, o in zip( + input_columns, output_columns)] if input_columns else None) + + all_args.update(algo_args) + return self._entrypoint(**all_args) diff --git a/src/python/nimbusml/internal/core/preprocessing/tostring.py b/src/python/nimbusml/internal/core/preprocessing/tostring.py new file mode 100644 index 00000000..2294c715 --- /dev/null +++ b/src/python/nimbusml/internal/core/preprocessing/tostring.py @@ -0,0 +1,79 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +ToString +""" + +__all__ = ["ToString"] + + +from ...entrypoints.transforms_tostring import transforms_tostring +from ...utils.utils import trace +from ..base_pipeline_item import BasePipelineItem, DefaultSignature + + +class ToString(BasePipelineItem, DefaultSignature): + """ + **Description** + Turns the given column into a column of its string representation + + :param params: Additional arguments sent to compute engine. 
+ + """ + + @trace + def __init__( + self, + **params): + BasePipelineItem.__init__( + self, type='transform', **params) + + @property + def _entrypoint(self): + return transforms_tostring + + @trace + def _get_node(self, **all_args): + + input_columns = self.input + if input_columns is None and 'input' in all_args: + input_columns = all_args['input'] + if 'input' in all_args: + all_args.pop('input') + + output_columns = self.output + if output_columns is None and 'output' in all_args: + output_columns = all_args['output'] + if 'output' in all_args: + all_args.pop('output') + + # validate input + if input_columns is None: + raise ValueError( + "'None' input passed when it cannot be none.") + + if not isinstance(input_columns, list): + raise ValueError( + "input has to be a list of strings, instead got %s" % + type(input_columns)) + + # validate output + if output_columns is None: + output_columns = input_columns + + if not isinstance(output_columns, list): + raise ValueError( + "output has to be a list of strings, instead got %s" % + type(output_columns)) + + algo_args = dict( + column=[ + dict( + Source=i, Name=o) for i, o in zip( + input_columns, output_columns)] if input_columns else None) + + all_args.update(algo_args) + return self._entrypoint(**all_args) diff --git a/src/python/nimbusml/internal/core/timeseries/ssaforecaster.py b/src/python/nimbusml/internal/core/timeseries/ssaforecaster.py index ce9064b5..f1ee5f6b 100644 --- a/src/python/nimbusml/internal/core/timeseries/ssaforecaster.py +++ b/src/python/nimbusml/internal/core/timeseries/ssaforecaster.py @@ -38,7 +38,7 @@ class SsaForecaster(BasePipelineItem, DefaultSignature): :param series_length: The length of series that is kept in buffer for modeling (parameter N). - :param train_size: The length of series from the begining used for + :param train_size: The length of series from the beginning used for training. :param horizon: The number of values to forecast. diff --git a/src/python/nimbusml/internal/core/timeseries/timeseriesimputer.py b/src/python/nimbusml/internal/core/timeseries/timeseriesimputer.py new file mode 100644 index 00000000..7a01c9c1 --- /dev/null +++ b/src/python/nimbusml/internal/core/timeseries/timeseriesimputer.py @@ -0,0 +1,78 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +TimeSeriesImputer +""" + +__all__ = ["TimeSeriesImputer"] + + +from ...entrypoints.transforms_timeseriesimputer import \ + transforms_timeseriesimputer +from ...utils.utils import trace +from ..base_pipeline_item import BasePipelineItem, DefaultSignature + + +class TimeSeriesImputer(BasePipelineItem, DefaultSignature): + """ + **Description** + Fills in missing row and values + + :param time_series_column: Column representing the time. + + :param grain_columns: List of grain columns. + + :param filter_columns: Columns to filter. + + :param filter_mode: Filter mode. Either include or exclude. + + :param impute_mode: Mode for imputing, defaults to ForwardFill if not + provided. + + :param supress_type_errors: Suppress the errors that would occur if a + column and impute mode are incompatible. If true, will skip the column. + If false, will stop and throw an error. + + :param params: Additional arguments sent to compute engine. 
+ + """ + + @trace + def __init__( + self, + time_series_column, + grain_columns, + filter_columns=None, + filter_mode='Exclude', + impute_mode='ForwardFill', + supress_type_errors=False, + **params): + BasePipelineItem.__init__( + self, type='transform', **params) + + self.time_series_column = time_series_column + self.grain_columns = grain_columns + self.filter_columns = filter_columns + self.filter_mode = filter_mode + self.impute_mode = impute_mode + self.supress_type_errors = supress_type_errors + + @property + def _entrypoint(self): + return transforms_timeseriesimputer + + @trace + def _get_node(self, **all_args): + algo_args = dict( + time_series_column=self.time_series_column, + grain_columns=self.grain_columns, + filter_columns=self.filter_columns, + filter_mode=self.filter_mode, + impute_mode=self.impute_mode, + supress_type_errors=self.supress_type_errors) + + all_args.update(algo_args) + return self._entrypoint(**all_args) diff --git a/src/python/nimbusml/internal/entrypoints/_calibratortrainer_fixedplattcalibrator.py b/src/python/nimbusml/internal/entrypoints/_calibratortrainer_fixedplattcalibrator.py index 35e80514..8f1904cb 100644 --- a/src/python/nimbusml/internal/entrypoints/_calibratortrainer_fixedplattcalibrator.py +++ b/src/python/nimbusml/internal/entrypoints/_calibratortrainer_fixedplattcalibrator.py @@ -10,16 +10,16 @@ def fixed_platt_calibrator( - slope=1.0, + slope=-1.0, offset=0.0, **params): """ **Description** None - :param slope: The slope parameter of f(x) = 1 / (1 + exp(-slope * + :param slope: The slope parameter of f(x) = 1 / (1 + exp(slope * x + offset) (settings). - :param offset: The offset parameter of f(x) = 1 / (1 + exp(-slope + :param offset: The offset parameter of f(x) = 1 / (1 + exp(slope * x + offset) (settings). """ diff --git a/src/python/nimbusml/internal/entrypoints/models_fixedplattcalibrator.py b/src/python/nimbusml/internal/entrypoints/models_fixedplattcalibrator.py index f5665675..4876c6ff 100644 --- a/src/python/nimbusml/internal/entrypoints/models_fixedplattcalibrator.py +++ b/src/python/nimbusml/internal/entrypoints/models_fixedplattcalibrator.py @@ -13,7 +13,7 @@ def models_fixedplattcalibrator( data, uncalibrated_predictor_model, predictor_model=None, - slope=1.0, + slope=-1.0, offset=0.0, max_rows=1000000000, **params): @@ -23,12 +23,12 @@ def models_fixedplattcalibrator( model :param slope: The slope parameter of the calibration function 1 / - (1 + exp(-slope * x + offset) (inputs). + (1 + exp(slope * x + offset) (inputs). :param data: Input dataset (inputs). :param uncalibrated_predictor_model: The predictor to calibrate (inputs). :param offset: The offset parameter of the calibration function 1 - / (1 + exp(-slope * x + offset) (inputs). + / (1 + exp(slope * x + offset) (inputs). :param max_rows: The maximum number of examples to train the calibrator on (inputs). :param predictor_model: The trained model (outputs). 
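A small illustrative sketch (not part of the diff) of the calibration function as documented after this sign change, f(x) = 1 / (1 + exp(slope * x + offset)), using the new default slope of -1.0; with that default it reduces to the standard sigmoid, so larger raw scores map to probabilities closer to 1:

    import math

    def fixed_platt(score, slope=-1.0, offset=0.0):
        # f(x) = 1 / (1 + exp(slope * x + offset)); defaults follow the updated entrypoint
        return 1.0 / (1.0 + math.exp(slope * score + offset))

    print(fixed_platt(0.0))   # 0.5
    print(fixed_platt(3.0))   # ~0.95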
diff --git a/src/python/nimbusml/internal/entrypoints/models_multioutputregressionevaluator.py b/src/python/nimbusml/internal/entrypoints/models_multioutputregressionevaluator.py index d7283221..8b56ca4b 100644 --- a/src/python/nimbusml/internal/entrypoints/models_multioutputregressionevaluator.py +++ b/src/python/nimbusml/internal/entrypoints/models_multioutputregressionevaluator.py @@ -15,7 +15,7 @@ def models_multioutputregressionevaluator( per_instance_metrics=None, name_column='Name', loss_function=None, - supress_scores_and_labels=False, + suppress_scores_and_labels=False, label_column=None, weight_column=None, score_column=None, @@ -28,7 +28,7 @@ def models_multioutputregressionevaluator( :param data: The data to be used for evaluation. (inputs). :param name_column: Name column name. (inputs). :param loss_function: Loss function (inputs). - :param supress_scores_and_labels: Supress labels and scores in + :param suppress_scores_and_labels: Suppress labels and scores in per-instance outputs? (inputs). :param label_column: Column to use for labels. (inputs). :param weight_column: Weight column name. (inputs). @@ -60,9 +60,9 @@ def models_multioutputregressionevaluator( obj=loss_function, none_acceptable=True, is_of_type=dict) - if supress_scores_and_labels is not None: - inputs['SupressScoresAndLabels'] = try_set( - obj=supress_scores_and_labels, + if suppress_scores_and_labels is not None: + inputs['SuppressScoresAndLabels'] = try_set( + obj=suppress_scores_and_labels, none_acceptable=True, is_of_type=bool) if label_column is not None: diff --git a/src/python/nimbusml/internal/entrypoints/models_onnxconverter.py b/src/python/nimbusml/internal/entrypoints/models_onnxconverter.py new file mode 100644 index 00000000..3c080eb6 --- /dev/null +++ b/src/python/nimbusml/internal/entrypoints/models_onnxconverter.py @@ -0,0 +1,116 @@ +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +Models.OnnxConverter +""" + + +from ..utils.entrypoints import EntryPoint +from ..utils.utils import try_set, unlist + + +def models_onnxconverter( + onnx, + data_file=None, + json=None, + name=None, + domain=None, + inputs_to_drop=None, + outputs_to_drop=None, + model=None, + onnx_version='Stable', + predictive_model=None, + **params): + """ + **Description** + Converts the model to ONNX format. + + :param data_file: The data file (inputs). + :param onnx: The path to write the output ONNX to. (inputs). + :param json: The path to write the output JSON to. (inputs). + :param name: The 'name' property in the output ONNX. By default + this will be the ONNX extension-less name. (inputs). + :param domain: The 'domain' property in the output ONNX. + (inputs). + :param inputs_to_drop: Array of input column names to drop + (inputs). + :param outputs_to_drop: Array of output column names to drop + (inputs). + :param model: Model that needs to be converted to ONNX format. + (inputs). + :param onnx_version: The targeted ONNX version. It can be either + "Stable" or "Experimental". If "Experimental" is used, + produced model can contain components that is not officially + supported in ONNX standard. (inputs). + :param predictive_model: Predictor model that needs to be + converted to ONNX format. (inputs). 
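# Illustrative sketch, separate from the generated file above: building the
# Models.OnnxConverter graph node by hand. The '$model' graph variable and the
# output path are hypothetical; Pipeline.export_to_onnx() assembles this node
# for you.
from nimbusml.internal.entrypoints.models_onnxconverter import \
    models_onnxconverter

node = models_onnxconverter(
    onnx='model.onnx',       # required: where the ONNX file is written
    model='$model',          # transform-only pipelines pass the transform model
    domain='com.example',    # reverse-DNS namespace stored in the ONNX graph
    onnx_version='Stable')
# 'node' is an EntryPoint describing a 'Models.OnnxConverter' call.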
+ """ + + entrypoint_name = 'Models.OnnxConverter' + inputs = {} + outputs = {} + + if data_file is not None: + inputs['DataFile'] = try_set( + obj=data_file, + none_acceptable=True, + is_of_type=str) + if onnx is not None: + inputs['Onnx'] = try_set( + obj=onnx, + none_acceptable=False, + is_of_type=str) + if json is not None: + inputs['Json'] = try_set( + obj=json, + none_acceptable=True, + is_of_type=str) + if name is not None: + inputs['Name'] = try_set( + obj=name, + none_acceptable=True, + is_of_type=str, + is_column=True) + if domain is not None: + inputs['Domain'] = try_set( + obj=domain, + none_acceptable=True, + is_of_type=str) + if inputs_to_drop is not None: + inputs['InputsToDrop'] = try_set( + obj=inputs_to_drop, + none_acceptable=True, + is_of_type=list) + if outputs_to_drop is not None: + inputs['OutputsToDrop'] = try_set( + obj=outputs_to_drop, + none_acceptable=True, + is_of_type=list) + if model is not None: + inputs['Model'] = try_set( + obj=model, + none_acceptable=True, + is_of_type=str) + if onnx_version is not None: + inputs['OnnxVersion'] = try_set( + obj=onnx_version, + none_acceptable=True, + is_of_type=str, + values=[ + 'Stable', + 'Experimental']) + if predictive_model is not None: + inputs['PredictiveModel'] = try_set( + obj=predictive_model, none_acceptable=True, is_of_type=str) + + input_variables = { + x for x in unlist(inputs.values()) + if isinstance(x, str) and x.startswith("$")} + output_variables = { + x for x in unlist(outputs.values()) + if isinstance(x, str) and x.startswith("$")} + + entrypoint = EntryPoint( + name=entrypoint_name, inputs=inputs, outputs=outputs, + input_variables=input_variables, + output_variables=output_variables) + return entrypoint diff --git a/src/python/nimbusml/internal/entrypoints/models_onnxtransformer.py b/src/python/nimbusml/internal/entrypoints/models_onnxtransformer.py new file mode 100644 index 00000000..173c976a --- /dev/null +++ b/src/python/nimbusml/internal/entrypoints/models_onnxtransformer.py @@ -0,0 +1,96 @@ +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +Models.OnnxTransformer +""" + +import numbers + +from ..utils.entrypoints import EntryPoint +from ..utils.utils import try_set, unlist + + +def models_onnxtransformer( + model_file, + data, + output_data=None, + model=None, + input_columns=None, + output_columns=None, + gpu_device_id=None, + fallback_to_cpu=False, + **params): + """ + **Description** + Applies an ONNX model to a dataset. + + :param model_file: Path to the onnx model file. (inputs). + :param input_columns: Name of the input column. (inputs). + :param data: Input dataset (inputs). + :param output_columns: Name of the output column. (inputs). + :param gpu_device_id: GPU device id to run on (e.g. 0,1,..). Null + for CPU. Requires CUDA 9.1. (inputs). + :param fallback_to_cpu: If true, resumes execution on CPU upon + GPU error. If false, will raise the GPU execption. (inputs). + :param output_data: ONNX transformed dataset (outputs). + :param model: Transform model (outputs). 
+ """ + + entrypoint_name = 'Models.OnnxTransformer' + inputs = {} + outputs = {} + + if model_file is not None: + inputs['ModelFile'] = try_set( + obj=model_file, + none_acceptable=False, + is_of_type=str) + if input_columns is not None: + inputs['InputColumns'] = try_set( + obj=input_columns, + none_acceptable=True, + is_of_type=list, + is_column=True) + if data is not None: + inputs['Data'] = try_set( + obj=data, + none_acceptable=False, + is_of_type=str) + if output_columns is not None: + inputs['OutputColumns'] = try_set( + obj=output_columns, + none_acceptable=True, + is_of_type=list, + is_column=True) + if gpu_device_id is not None: + inputs['GpuDeviceId'] = try_set( + obj=gpu_device_id, + none_acceptable=True, + is_of_type=numbers.Real) + if fallback_to_cpu is not None: + inputs['FallbackToCpu'] = try_set( + obj=fallback_to_cpu, + none_acceptable=True, + is_of_type=bool) + if output_data is not None: + outputs['OutputData'] = try_set( + obj=output_data, + none_acceptable=False, + is_of_type=str) + if model is not None: + outputs['Model'] = try_set( + obj=model, + none_acceptable=False, + is_of_type=str) + + input_variables = { + x for x in unlist(inputs.values()) + if isinstance(x, str) and x.startswith("$")} + output_variables = { + x for x in unlist(outputs.values()) + if isinstance(x, str) and x.startswith("$")} + + entrypoint = EntryPoint( + name=entrypoint_name, inputs=inputs, outputs=outputs, + input_variables=input_variables, + output_variables=output_variables) + return entrypoint diff --git a/src/python/nimbusml/internal/entrypoints/timeseriesprocessingentrypoints_ssaforecasting.py b/src/python/nimbusml/internal/entrypoints/timeseriesprocessingentrypoints_ssaforecasting.py index f02da3a7..1684783c 100644 --- a/src/python/nimbusml/internal/entrypoints/timeseriesprocessingentrypoints_ssaforecasting.py +++ b/src/python/nimbusml/internal/entrypoints/timeseriesprocessingentrypoints_ssaforecasting.py @@ -43,7 +43,7 @@ def timeseriesprocessingentrypoints_ssaforecasting( building the trajectory matrix (parameter L). (inputs). :param series_length: The length of series that is kept in buffer for modeling (parameter N). (inputs). - :param train_size: The length of series from the begining used + :param train_size: The length of series from the beginning used for training. (inputs). :param horizon: The number of values to forecast. (inputs). :param confidence_level: The confidence level in [0, 1) for diff --git a/src/python/nimbusml/internal/entrypoints/trainers_generalizedadditivemodelbinaryclassifier.py b/src/python/nimbusml/internal/entrypoints/trainers_generalizedadditivemodelbinaryclassifier.py index e5b62a23..5c281338 100644 --- a/src/python/nimbusml/internal/entrypoints/trainers_generalizedadditivemodelbinaryclassifier.py +++ b/src/python/nimbusml/internal/entrypoints/trainers_generalizedadditivemodelbinaryclassifier.py @@ -36,7 +36,7 @@ def trainers_generalizedadditivemodelbinaryclassifier( **Description** Trains a gradient boosted stump per feature, on all features simultaneously, to fit target values using least-squares. It - mantains no interactions between features. + maintains no interactions between features. :param number_of_iterations: Total number of iterations over all features (inputs). 
diff --git a/src/python/nimbusml/internal/entrypoints/trainers_generalizedadditivemodelregressor.py b/src/python/nimbusml/internal/entrypoints/trainers_generalizedadditivemodelregressor.py index 1c56a706..2b9334f8 100644 --- a/src/python/nimbusml/internal/entrypoints/trainers_generalizedadditivemodelregressor.py +++ b/src/python/nimbusml/internal/entrypoints/trainers_generalizedadditivemodelregressor.py @@ -36,7 +36,7 @@ def trainers_generalizedadditivemodelregressor( **Description** Trains a gradient boosted stump per feature, on all features simultaneously, to fit target values using least-squares. It - mantains no interactions between features. + maintains no interactions between features. :param number_of_iterations: Total number of iterations over all features (inputs). diff --git a/src/python/nimbusml/internal/entrypoints/trainers_lightgbmbinaryclassifier.py b/src/python/nimbusml/internal/entrypoints/trainers_lightgbmbinaryclassifier.py index 5a54c69f..4ae20be2 100644 --- a/src/python/nimbusml/internal/entrypoints/trainers_lightgbmbinaryclassifier.py +++ b/src/python/nimbusml/internal/entrypoints/trainers_lightgbmbinaryclassifier.py @@ -35,6 +35,7 @@ def trainers_lightgbmbinaryclassifier( batch_size=1048576, use_categorical_split=None, handle_missing_value=True, + use_zero_as_missing_value=False, minimum_example_count_per_group=100, maximum_categorical_split_point_count=32, categorical_smoothing=10.0, @@ -88,6 +89,8 @@ def trainers_lightgbmbinaryclassifier( (inputs). :param handle_missing_value: Enable special handling of missing value or not. (inputs). + :param use_zero_as_missing_value: Enable usage of zero (0) as + missing value. (inputs). :param minimum_example_count_per_group: Minimum number of instances per categorical group. (inputs). :param maximum_categorical_split_point_count: Max number of @@ -243,6 +246,11 @@ def trainers_lightgbmbinaryclassifier( obj=handle_missing_value, none_acceptable=True, is_of_type=bool) + if use_zero_as_missing_value is not None: + inputs['UseZeroAsMissingValue'] = try_set( + obj=use_zero_as_missing_value, + none_acceptable=True, + is_of_type=bool) if minimum_example_count_per_group is not None: inputs['MinimumExampleCountPerGroup'] = try_set( obj=minimum_example_count_per_group, diff --git a/src/python/nimbusml/internal/entrypoints/trainers_lightgbmclassifier.py b/src/python/nimbusml/internal/entrypoints/trainers_lightgbmclassifier.py index 28f13e0a..d78f2b48 100644 --- a/src/python/nimbusml/internal/entrypoints/trainers_lightgbmclassifier.py +++ b/src/python/nimbusml/internal/entrypoints/trainers_lightgbmclassifier.py @@ -35,6 +35,7 @@ def trainers_lightgbmclassifier( batch_size=1048576, use_categorical_split=None, handle_missing_value=True, + use_zero_as_missing_value=False, minimum_example_count_per_group=100, maximum_categorical_split_point_count=32, categorical_smoothing=10.0, @@ -86,6 +87,8 @@ def trainers_lightgbmclassifier( (inputs). :param handle_missing_value: Enable special handling of missing value or not. (inputs). + :param use_zero_as_missing_value: Enable usage of zero (0) as + missing value. (inputs). :param minimum_example_count_per_group: Minimum number of instances per categorical group. (inputs). 
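# Illustrative sketch, separate from the generated file above: how the new
# flag reaches the entry point. The '$data' and '$model' graph variables are
# hypothetical; end users would normally set the corresponding estimator
# parameter instead of building the node directly.
from nimbusml.internal.entrypoints.trainers_lightgbmclassifier import \
    trainers_lightgbmclassifier

node = trainers_lightgbmclassifier(
    training_data='$data',
    predictor_model='$model',
    handle_missing_value=True,
    use_zero_as_missing_value=True)   # treat literal zeros as missing values
# The node's Inputs now carry 'UseZeroAsMissingValue': True.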
:param maximum_categorical_split_point_count: Max number of @@ -240,6 +243,11 @@ def trainers_lightgbmclassifier( obj=handle_missing_value, none_acceptable=True, is_of_type=bool) + if use_zero_as_missing_value is not None: + inputs['UseZeroAsMissingValue'] = try_set( + obj=use_zero_as_missing_value, + none_acceptable=True, + is_of_type=bool) if minimum_example_count_per_group is not None: inputs['MinimumExampleCountPerGroup'] = try_set( obj=minimum_example_count_per_group, diff --git a/src/python/nimbusml/internal/entrypoints/trainers_lightgbmranker.py b/src/python/nimbusml/internal/entrypoints/trainers_lightgbmranker.py index 5a3a44fd..0c2e9e0a 100644 --- a/src/python/nimbusml/internal/entrypoints/trainers_lightgbmranker.py +++ b/src/python/nimbusml/internal/entrypoints/trainers_lightgbmranker.py @@ -34,6 +34,7 @@ def trainers_lightgbmranker( batch_size=1048576, use_categorical_split=None, handle_missing_value=True, + use_zero_as_missing_value=False, minimum_example_count_per_group=100, maximum_categorical_split_point_count=32, categorical_smoothing=10.0, @@ -83,6 +84,8 @@ def trainers_lightgbmranker( (inputs). :param handle_missing_value: Enable special handling of missing value or not. (inputs). + :param use_zero_as_missing_value: Enable usage of zero (0) as + missing value. (inputs). :param minimum_example_count_per_group: Minimum number of instances per categorical group. (inputs). :param maximum_categorical_split_point_count: Max number of @@ -232,6 +235,11 @@ def trainers_lightgbmranker( obj=handle_missing_value, none_acceptable=True, is_of_type=bool) + if use_zero_as_missing_value is not None: + inputs['UseZeroAsMissingValue'] = try_set( + obj=use_zero_as_missing_value, + none_acceptable=True, + is_of_type=bool) if minimum_example_count_per_group is not None: inputs['MinimumExampleCountPerGroup'] = try_set( obj=minimum_example_count_per_group, diff --git a/src/python/nimbusml/internal/entrypoints/trainers_lightgbmregressor.py b/src/python/nimbusml/internal/entrypoints/trainers_lightgbmregressor.py index 32260ebe..9fbf3e69 100644 --- a/src/python/nimbusml/internal/entrypoints/trainers_lightgbmregressor.py +++ b/src/python/nimbusml/internal/entrypoints/trainers_lightgbmregressor.py @@ -32,6 +32,7 @@ def trainers_lightgbmregressor( batch_size=1048576, use_categorical_split=None, handle_missing_value=True, + use_zero_as_missing_value=False, minimum_example_count_per_group=100, maximum_categorical_split_point_count=32, categorical_smoothing=10.0, @@ -78,6 +79,8 @@ def trainers_lightgbmregressor( (inputs). :param handle_missing_value: Enable special handling of missing value or not. (inputs). + :param use_zero_as_missing_value: Enable usage of zero (0) as + missing value. (inputs). :param minimum_example_count_per_group: Minimum number of instances per categorical group. (inputs). 
:param maximum_categorical_split_point_count: Max number of @@ -218,6 +221,11 @@ def trainers_lightgbmregressor( obj=handle_missing_value, none_acceptable=True, is_of_type=bool) + if use_zero_as_missing_value is not None: + inputs['UseZeroAsMissingValue'] = try_set( + obj=use_zero_as_missing_value, + none_acceptable=True, + is_of_type=bool) if minimum_example_count_per_group is not None: inputs['MinimumExampleCountPerGroup'] = try_set( obj=minimum_example_count_per_group, diff --git a/src/python/nimbusml/internal/entrypoints/trainers_localdeepsvmbinaryclassifier.py b/src/python/nimbusml/internal/entrypoints/trainers_localdeepsvmbinaryclassifier.py new file mode 100644 index 00000000..0b2c5984 --- /dev/null +++ b/src/python/nimbusml/internal/entrypoints/trainers_localdeepsvmbinaryclassifier.py @@ -0,0 +1,175 @@ +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +Trainers.LocalDeepSvmBinaryClassifier +""" + +import numbers + +from ..utils.entrypoints import EntryPoint +from ..utils.utils import try_set, unlist + + +def trainers_localdeepsvmbinaryclassifier( + training_data, + predictor_model=None, + feature_column_name='Features', + label_column_name='Label', + example_weight_column_name=None, + normalize_features='Auto', + caching='Auto', + tree_depth=3, + lambda_w=0.1, + lambda_theta=0.01, + lambda_thetaprime=0.01, + sigma=1.0, + number_of_iterations=15000, + use_bias=True, + calibrator=None, + max_calibration_examples=1000000, + cache=True, + **params): + """ + **Description** + LD-SVM learns a binary, non-linear SVM classifier with a kernel that + is specifically designed to reduce prediction time. LD-SVM + learns decision boundaries that are locally linear. + + :param training_data: The data to be used for training (inputs). + :param feature_column_name: Column to use for features (inputs). + :param label_column_name: Column to use for labels (inputs). + :param example_weight_column_name: Column to use for example + weight (inputs). + :param normalize_features: Normalize option for the feature + column (inputs). + :param caching: Whether trainer should cache input training data + (inputs). + :param tree_depth: Depth of Local Deep SVM tree (inputs). + :param lambda_w: Regularizer for classifier parameter W (inputs). + :param lambda_theta: Regularizer for kernel parameter Theta + (inputs). + :param lambda_thetaprime: Regularizer for kernel parameter + Thetaprime (inputs). + :param sigma: Parameter for sigmoid sharpness (inputs). + :param number_of_iterations: Number of iterations (inputs). + :param use_bias: No bias (inputs). + :param calibrator: The calibrator kind to apply to the predictor. + Specify null for no calibration (inputs). + :param max_calibration_examples: The maximum number of examples + to use when training the calibrator (inputs). + :param cache: Whether to cache the data before the first + iteration (inputs). + :param predictor_model: The trained model (outputs). 
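# Illustrative sketch, separate from the generated file above: wiring the
# LD-SVM trainer entry point directly. The '$data' and '$model' graph
# variables are hypothetical; a public estimator wrapper would normally
# supply them.
from nimbusml.internal.entrypoints.trainers_localdeepsvmbinaryclassifier import \
    trainers_localdeepsvmbinaryclassifier

node = trainers_localdeepsvmbinaryclassifier(
    training_data='$data',
    predictor_model='$model',
    tree_depth=3,                 # depth of the locally linear decision tree
    lambda_w=0.1,                 # regularizer for the classifier weights W
    number_of_iterations=15000)
# 'node' is an EntryPoint for 'Trainers.LocalDeepSvmBinaryClassifier'.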
+ """ + + entrypoint_name = 'Trainers.LocalDeepSvmBinaryClassifier' + inputs = {} + outputs = {} + + if training_data is not None: + inputs['TrainingData'] = try_set( + obj=training_data, + none_acceptable=False, + is_of_type=str) + if feature_column_name is not None: + inputs['FeatureColumnName'] = try_set( + obj=feature_column_name, + none_acceptable=True, + is_of_type=str, + is_column=True) + if label_column_name is not None: + inputs['LabelColumnName'] = try_set( + obj=label_column_name, + none_acceptable=True, + is_of_type=str, + is_column=True) + if example_weight_column_name is not None: + inputs['ExampleWeightColumnName'] = try_set( + obj=example_weight_column_name, + none_acceptable=True, + is_of_type=str, + is_column=True) + if normalize_features is not None: + inputs['NormalizeFeatures'] = try_set( + obj=normalize_features, + none_acceptable=True, + is_of_type=str, + values=[ + 'No', + 'Warn', + 'Auto', + 'Yes']) + if caching is not None: + inputs['Caching'] = try_set( + obj=caching, + none_acceptable=True, + is_of_type=str, + values=[ + 'Auto', + 'Memory', + 'None']) + if tree_depth is not None: + inputs['TreeDepth'] = try_set( + obj=tree_depth, + none_acceptable=True, + is_of_type=numbers.Real) + if lambda_w is not None: + inputs['LambdaW'] = try_set( + obj=lambda_w, + none_acceptable=True, + is_of_type=numbers.Real) + if lambda_theta is not None: + inputs['LambdaTheta'] = try_set( + obj=lambda_theta, + none_acceptable=True, + is_of_type=numbers.Real) + if lambda_thetaprime is not None: + inputs['LambdaThetaprime'] = try_set( + obj=lambda_thetaprime, + none_acceptable=True, + is_of_type=numbers.Real) + if sigma is not None: + inputs['Sigma'] = try_set( + obj=sigma, + none_acceptable=True, + is_of_type=numbers.Real) + if number_of_iterations is not None: + inputs['NumberOfIterations'] = try_set( + obj=number_of_iterations, + none_acceptable=True, + is_of_type=numbers.Real) + if use_bias is not None: + inputs['UseBias'] = try_set( + obj=use_bias, + none_acceptable=True, + is_of_type=bool) + if calibrator is not None: + inputs['Calibrator'] = try_set( + obj=calibrator, + none_acceptable=True, + is_of_type=dict) + if max_calibration_examples is not None: + inputs['MaxCalibrationExamples'] = try_set( + obj=max_calibration_examples, + none_acceptable=True, + is_of_type=numbers.Real) + if cache is not None: + inputs['Cache'] = try_set( + obj=cache, + none_acceptable=True, + is_of_type=bool) + if predictor_model is not None: + outputs['PredictorModel'] = try_set( + obj=predictor_model, none_acceptable=False, is_of_type=str) + + input_variables = { + x for x in unlist(inputs.values()) + if isinstance(x, str) and x.startswith("$")} + output_variables = { + x for x in unlist(outputs.values()) + if isinstance(x, str) and x.startswith("$")} + + entrypoint = EntryPoint( + name=entrypoint_name, inputs=inputs, outputs=outputs, + input_variables=input_variables, + output_variables=output_variables) + return entrypoint diff --git a/src/python/nimbusml/internal/entrypoints/trainers_logisticregressionclassifier.py b/src/python/nimbusml/internal/entrypoints/trainers_logisticregressionclassifier.py index 5db498b1..61759e4d 100644 --- a/src/python/nimbusml/internal/entrypoints/trainers_logisticregressionclassifier.py +++ b/src/python/nimbusml/internal/entrypoints/trainers_logisticregressionclassifier.py @@ -33,7 +33,7 @@ def trainers_logisticregressionclassifier( **params): """ **Description** - Maximum entrypy classification is a method in statistics used to + Maximum entropy classification is a 
method in statistics used to predict the probabilities of parallel events. The model predicts the probabilities of parallel events by fitting data to a softmax function. diff --git a/src/python/nimbusml/internal/entrypoints/transforms_categoryimputer.py b/src/python/nimbusml/internal/entrypoints/transforms_categoryimputer.py new file mode 100644 index 00000000..7f72261b --- /dev/null +++ b/src/python/nimbusml/internal/entrypoints/transforms_categoryimputer.py @@ -0,0 +1,65 @@ +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +Transforms.CategoryImputer +""" + + +from ..utils.entrypoints import EntryPoint +from ..utils.utils import try_set, unlist + + +def transforms_categoryimputer( + column, + data, + output_data=None, + model=None, + **params): + """ + **Description** + Fills in missing values in a column based on the most frequent value + + :param column: New column definition (optional form: name:src) + (inputs). + :param data: Input dataset (inputs). + :param output_data: Transformed dataset (outputs). + :param model: Transform model (outputs). + """ + + entrypoint_name = 'Transforms.CategoryImputer' + inputs = {} + outputs = {} + + if column is not None: + inputs['Column'] = try_set( + obj=column, + none_acceptable=False, + is_of_type=list, + is_column=True) + if data is not None: + inputs['Data'] = try_set( + obj=data, + none_acceptable=False, + is_of_type=str) + if output_data is not None: + outputs['OutputData'] = try_set( + obj=output_data, + none_acceptable=False, + is_of_type=str) + if model is not None: + outputs['Model'] = try_set( + obj=model, + none_acceptable=False, + is_of_type=str) + + input_variables = { + x for x in unlist(inputs.values()) + if isinstance(x, str) and x.startswith("$")} + output_variables = { + x for x in unlist(outputs.values()) + if isinstance(x, str) and x.startswith("$")} + + entrypoint = EntryPoint( + name=entrypoint_name, inputs=inputs, outputs=outputs, + input_variables=input_variables, + output_variables=output_variables) + return entrypoint diff --git a/src/python/nimbusml/internal/entrypoints/transforms_datetimesplitter.py b/src/python/nimbusml/internal/entrypoints/transforms_datetimesplitter.py new file mode 100644 index 00000000..ac2524c8 --- /dev/null +++ b/src/python/nimbusml/internal/entrypoints/transforms_datetimesplitter.py @@ -0,0 +1,119 @@ +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +Transforms.DateTimeSplitter +""" + + +from ..utils.entrypoints import EntryPoint +from ..utils.utils import try_set, unlist + + +def transforms_datetimesplitter( + source, + data, + prefix, + output_data=None, + model=None, + country='None', + **params): + """ + **Description** + Splits a date time value into each individual component + + :param source: Input column (inputs). + :param data: Input dataset (inputs). + :param prefix: Output column prefix (inputs). + :param country: Country to get holidays for. Defaults to none if + not passed (inputs). + :param output_data: Transformed dataset (outputs). + :param model: Transform model (outputs). 
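# Illustrative sketch, separate from the generated file above, mirroring the
# DateTimeSplitter unit test added later in this change: the public wrapper
# expands an integer (Unix time) column into per-component columns that share
# the given prefix.
import pandas as pd
from nimbusml.preprocessing import DateTimeSplitter

df = pd.DataFrame(dict(dt=[0, 86400, 157161600]))
xf = DateTimeSplitter(prefix='dt_') << 'dt'
result = xf.fit_transform(df)
print(result['dt_Year'][0])   # 1970, the year of the Unix epoch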
+ """ + + entrypoint_name = 'Transforms.DateTimeSplitter' + inputs = {} + outputs = {} + + if source is not None: + inputs['Source'] = try_set( + obj=source, + none_acceptable=False, + is_of_type=str, + is_column=True) + if data is not None: + inputs['Data'] = try_set( + obj=data, + none_acceptable=False, + is_of_type=str) + if prefix is not None: + inputs['Prefix'] = try_set( + obj=prefix, + none_acceptable=False, + is_of_type=str) + if country is not None: + inputs['Country'] = try_set( + obj=country, + none_acceptable=True, + is_of_type=str, + values=[ + 'None', + 'Argentina', + 'Australia', + 'Austria', + 'Belarus', + 'Belgium', + 'Brazil', + 'Canada', + 'Colombia', + 'Croatia', + 'Czech', + 'Denmark', + 'England', + 'Finland', + 'France', + 'Germany', + 'Hungary', + 'India', + 'Ireland', + 'IsleofMan', + 'Italy', + 'Japan', + 'Mexico', + 'Netherlands', + 'NewZealand', + 'NorthernIreland', + 'Norway', + 'Poland', + 'Portugal', + 'Scotland', + 'Slovenia', + 'SouthAfrica', + 'Spain', + 'Sweden', + 'Switzerland', + 'Ukraine', + 'UnitedKingdom', + 'UnitedStates', + 'Wales']) + if output_data is not None: + outputs['OutputData'] = try_set( + obj=output_data, + none_acceptable=False, + is_of_type=str) + if model is not None: + outputs['Model'] = try_set( + obj=model, + none_acceptable=False, + is_of_type=str) + + input_variables = { + x for x in unlist(inputs.values()) + if isinstance(x, str) and x.startswith("$")} + output_variables = { + x for x in unlist(outputs.values()) + if isinstance(x, str) and x.startswith("$")} + + entrypoint = EntryPoint( + name=entrypoint_name, inputs=inputs, outputs=outputs, + input_variables=input_variables, + output_variables=output_variables) + return entrypoint diff --git a/src/python/nimbusml/internal/entrypoints/transforms_missingvaluehandler.py b/src/python/nimbusml/internal/entrypoints/transforms_missingvaluehandler.py index 1f1a3870..121115b4 100644 --- a/src/python/nimbusml/internal/entrypoints/transforms_missingvaluehandler.py +++ b/src/python/nimbusml/internal/entrypoints/transforms_missingvaluehandler.py @@ -21,7 +21,7 @@ def transforms_missingvaluehandler( **Description** Handle missing values by replacing them with either the default value or the mean/min/max value (for non-text columns only). An - indicator column can optionally be concatenated, if theinput + indicator column can optionally be concatenated, if the input column type is numeric. :param column: New column definition(s) (optional form: diff --git a/src/python/nimbusml/internal/entrypoints/transforms_robustscaler.py b/src/python/nimbusml/internal/entrypoints/transforms_robustscaler.py new file mode 100644 index 00000000..615af180 --- /dev/null +++ b/src/python/nimbusml/internal/entrypoints/transforms_robustscaler.py @@ -0,0 +1,98 @@ +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +Transforms.RobustScaler +""" + +import numbers + +from ..utils.entrypoints import EntryPoint +from ..utils.utils import try_set, unlist + + +def transforms_robustscaler( + column, + data, + output_data=None, + model=None, + center=True, + scale=True, + quantile_min=25.0, + quantile_max=75.0, + **params): + """ + **Description** + Removes the median and scales the data according to the quantile + range. + + :param column: New column definition (optional form: name:src) + (inputs). + :param data: Input dataset (inputs). + :param center: If True, center the data before scaling. (inputs). + :param scale: If True, scale the data to interquartile range. + (inputs). 
+ :param quantile_min: Min for the quantile range used to calculate + scale. (inputs). + :param quantile_max: Max for the quantile range used to calculate + scale. (inputs). + :param output_data: Transformed dataset (outputs). + :param model: Transform model (outputs). + """ + + entrypoint_name = 'Transforms.RobustScaler' + inputs = {} + outputs = {} + + if column is not None: + inputs['Column'] = try_set( + obj=column, + none_acceptable=False, + is_of_type=list, + is_column=True) + if data is not None: + inputs['Data'] = try_set( + obj=data, + none_acceptable=False, + is_of_type=str) + if center is not None: + inputs['Center'] = try_set( + obj=center, + none_acceptable=True, + is_of_type=bool) + if scale is not None: + inputs['Scale'] = try_set( + obj=scale, + none_acceptable=True, + is_of_type=bool) + if quantile_min is not None: + inputs['QuantileMin'] = try_set( + obj=quantile_min, + none_acceptable=True, + is_of_type=numbers.Real) + if quantile_max is not None: + inputs['QuantileMax'] = try_set( + obj=quantile_max, + none_acceptable=True, + is_of_type=numbers.Real) + if output_data is not None: + outputs['OutputData'] = try_set( + obj=output_data, + none_acceptable=False, + is_of_type=str) + if model is not None: + outputs['Model'] = try_set( + obj=model, + none_acceptable=False, + is_of_type=str) + + input_variables = { + x for x in unlist(inputs.values()) + if isinstance(x, str) and x.startswith("$")} + output_variables = { + x for x in unlist(outputs.values()) + if isinstance(x, str) and x.startswith("$")} + + entrypoint = EntryPoint( + name=entrypoint_name, inputs=inputs, outputs=outputs, + input_variables=input_variables, + output_variables=output_variables) + return entrypoint diff --git a/src/python/nimbusml/internal/entrypoints/transforms_timeseriesimputer.py b/src/python/nimbusml/internal/entrypoints/transforms_timeseriesimputer.py new file mode 100644 index 00000000..e58117ad --- /dev/null +++ b/src/python/nimbusml/internal/entrypoints/transforms_timeseriesimputer.py @@ -0,0 +1,114 @@ +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +Transforms.TimeSeriesImputer +""" + + +from ..utils.entrypoints import EntryPoint +from ..utils.utils import try_set, unlist + + +def transforms_timeseriesimputer( + time_series_column, + data, + grain_columns, + output_data=None, + model=None, + filter_columns=None, + filter_mode='Exclude', + impute_mode='ForwardFill', + supress_type_errors=False, + **params): + """ + **Description** + Fills in missing row and values + + :param time_series_column: Column representing the time (inputs). + :param data: Input dataset (inputs). + :param grain_columns: List of grain columns (inputs). + :param filter_columns: Columns to filter (inputs). + :param filter_mode: Filter mode. Either include or exclude + (inputs). + :param impute_mode: Mode for imputing, defaults to ForwardFill if + not provided (inputs). + :param supress_type_errors: Suppress the errors that would occur + if a column and impute mode are incompatible. If true, will + skip the column. If false, will stop and throw an error. + (inputs). + :param output_data: Transformed dataset (outputs). + :param model: Transform model (outputs). 
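# Illustrative sketch, separate from the generated file above: constructing
# the Transforms.TimeSeriesImputer graph node directly. The column names and
# the '$data' / '$output' / '$model' graph variables are hypothetical.
from nimbusml.internal.entrypoints.transforms_timeseriesimputer import \
    transforms_timeseriesimputer

node = transforms_timeseriesimputer(
    time_series_column='ts',
    grain_columns=['grain'],
    data='$data',
    output_data='$output',
    model='$model',
    impute_mode='ForwardFill')
# 'node' is an EntryPoint for 'Transforms.TimeSeriesImputer'.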
+ """ + + entrypoint_name = 'Transforms.TimeSeriesImputer' + inputs = {} + outputs = {} + + if time_series_column is not None: + inputs['TimeSeriesColumn'] = try_set( + obj=time_series_column, + none_acceptable=False, + is_of_type=str, + is_column=True) + if data is not None: + inputs['Data'] = try_set( + obj=data, + none_acceptable=False, + is_of_type=str) + if grain_columns is not None: + inputs['GrainColumns'] = try_set( + obj=grain_columns, + none_acceptable=False, + is_of_type=list, + is_column=True) + if filter_columns is not None: + inputs['FilterColumns'] = try_set( + obj=filter_columns, + none_acceptable=True, + is_of_type=list, + is_column=True) + if filter_mode is not None: + inputs['FilterMode'] = try_set( + obj=filter_mode, + none_acceptable=True, + is_of_type=str, + values=[ + 'NoFilter', + 'Include', + 'Exclude']) + if impute_mode is not None: + inputs['ImputeMode'] = try_set( + obj=impute_mode, + none_acceptable=True, + is_of_type=str, + values=[ + 'ForwardFill', + 'BackFill', + 'Median']) + if supress_type_errors is not None: + inputs['SupressTypeErrors'] = try_set( + obj=supress_type_errors, + none_acceptable=True, + is_of_type=bool) + if output_data is not None: + outputs['OutputData'] = try_set( + obj=output_data, + none_acceptable=False, + is_of_type=str) + if model is not None: + outputs['Model'] = try_set( + obj=model, + none_acceptable=False, + is_of_type=str) + + input_variables = { + x for x in unlist(inputs.values()) + if isinstance(x, str) and x.startswith("$")} + output_variables = { + x for x in unlist(outputs.values()) + if isinstance(x, str) and x.startswith("$")} + + entrypoint = EntryPoint( + name=entrypoint_name, inputs=inputs, outputs=outputs, + input_variables=input_variables, + output_variables=output_variables) + return entrypoint diff --git a/src/python/nimbusml/internal/entrypoints/transforms_tostring.py b/src/python/nimbusml/internal/entrypoints/transforms_tostring.py new file mode 100644 index 00000000..2f6d9782 --- /dev/null +++ b/src/python/nimbusml/internal/entrypoints/transforms_tostring.py @@ -0,0 +1,65 @@ +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +Transforms.ToString +""" + + +from ..utils.entrypoints import EntryPoint +from ..utils.utils import try_set, unlist + + +def transforms_tostring( + column, + data, + output_data=None, + model=None, + **params): + """ + **Description** + Turns the given column into a column of its string representation + + :param column: New column definition (optional form: name:src) + (inputs). + :param data: Input dataset (inputs). + :param output_data: Transformed dataset (outputs). + :param model: Transform model (outputs). 
+ """ + + entrypoint_name = 'Transforms.ToString' + inputs = {} + outputs = {} + + if column is not None: + inputs['Column'] = try_set( + obj=column, + none_acceptable=False, + is_of_type=list, + is_column=True) + if data is not None: + inputs['Data'] = try_set( + obj=data, + none_acceptable=False, + is_of_type=str) + if output_data is not None: + outputs['OutputData'] = try_set( + obj=output_data, + none_acceptable=False, + is_of_type=str) + if model is not None: + outputs['Model'] = try_set( + obj=model, + none_acceptable=False, + is_of_type=str) + + input_variables = { + x for x in unlist(inputs.values()) + if isinstance(x, str) and x.startswith("$")} + output_variables = { + x for x in unlist(outputs.values()) + if isinstance(x, str) and x.startswith("$")} + + entrypoint = EntryPoint( + name=entrypoint_name, inputs=inputs, outputs=outputs, + input_variables=input_variables, + output_variables=output_variables) + return entrypoint diff --git a/src/python/nimbusml/internal/utils/dataframes.py b/src/python/nimbusml/internal/utils/dataframes.py index 17572ad1..8e3665bc 100644 --- a/src/python/nimbusml/internal/utils/dataframes.py +++ b/src/python/nimbusml/internal/utils/dataframes.py @@ -63,7 +63,7 @@ def resolve_dataframe(dataframe): ret[name_i] = serie.values if infered_dtype == 'floating' or \ infered_dtype == 'mixed-integer-float': - s = serie.itemsize + s = serie.dtype.itemsize if s == 8: ret[str(i)] = serie.values.astype( np.float64, copy=False) @@ -77,7 +77,7 @@ def resolve_dataframe(dataframe): [_global_dtype_to_char_dict[ np.dtype(np.float32)]]) elif infered_dtype == 'integer': - s = serie.itemsize + s = serie.dtype.itemsize if s == 8: ret[str(i)] = serie.values.astype( np.int64, copy=False) diff --git a/src/python/nimbusml/pipeline.py b/src/python/nimbusml/pipeline.py index 6eb190e3..ab9ae1ac 100644 --- a/src/python/nimbusml/pipeline.py +++ b/src/python/nimbusml/pipeline.py @@ -39,6 +39,7 @@ from .internal.entrypoints.models_regressionevaluator import \ models_regressionevaluator from .internal.entrypoints.models_summarizer import models_summarizer +from .internal.entrypoints.models_onnxconverter import models_onnxconverter from .internal.entrypoints.models_schema import models_schema from .internal.entrypoints.transforms_datasetscorerex import \ transforms_datasetscorerex @@ -1015,6 +1016,8 @@ def fit(self, X, y=None, verbose=1, **params): :language: python """ + dry_run = params.pop('dry_run', False) + if self._is_fitted: # We restore the initial steps as they were # modified by the previous training. @@ -1058,7 +1061,7 @@ def move_information_about_roles_once_used(): # REVIEW: we should have the possibility to keep the model in # memory and not in a file. 
try: - (out_model, out_data, out_metrics, out_predictor_model) = graph.run( + graph_output = graph.run( X=X, y=y, random_state=self.random_state, @@ -1066,6 +1069,7 @@ def move_information_about_roles_once_used(): verbose=verbose, max_slots=max_slots, telemetry_info=telemetry_info, + dry_run=dry_run, **params) except RuntimeError as e: self._run_time = time.time() - start_time @@ -1081,17 +1085,21 @@ def move_information_about_roles_once_used(): delattr(self, "_cache_predictor") raise e - move_information_about_roles_once_used() - self.graph_ = graph - self.model = out_model - if out_predictor_model: - self.predictor_model = out_predictor_model - self.data = out_data - # stop the clock - self._run_time = time.time() - start_time - self._write_csv_time = graph._write_csv_time - delattr(self, "_cache_predictor") - return self + if dry_run: + return graph_output + else: + out_model, out_data, out_metrics, out_predictor_model = graph_output + move_information_about_roles_once_used() + self.graph_ = graph + self.model = out_model + if out_predictor_model: + self.predictor_model = out_predictor_model + self.data = out_data + # stop the clock + self._run_time = time.time() - start_time + self._write_csv_time = graph._write_csv_time + delattr(self, "_cache_predictor") + return self @trace def fit_transform( @@ -1623,8 +1631,9 @@ def get_feature_contributions(self, X, top=10, bottom=10, verbose=0, outputs = dict(output_data="") - data_output_format = DataOutputFormat.IDV if as_binary_data_stream \ - else DataOutputFormat.DF, + data_output_format = DataOutputFormat.DF + if as_binary_data_stream: + data_output_format = DataOutputFormat.IDV graph = Graph( inputs, @@ -1813,8 +1822,9 @@ def permutation_feature_importance(self, X, number_of_examples=None, outputs = dict(output_data="") - data_output_format = DataOutputFormat.IDV if as_binary_data_stream \ - else DataOutputFormat.DF, + data_output_format = DataOutputFormat.DF + if as_binary_data_stream: + data_output_format = DataOutputFormat.IDV graph = Graph( inputs, @@ -1975,8 +1985,9 @@ def _predict(self, X, y=None, else: outputs = dict(output_data="") - data_output_format = DataOutputFormat.IDV if as_binary_data_stream \ - else DataOutputFormat.DF, + data_output_format = DataOutputFormat.DF + if as_binary_data_stream: + data_output_format = DataOutputFormat.IDV graph = Graph( inputs, @@ -2001,8 +2012,9 @@ def _predict(self, X, y=None, self._run_time = time.time() - start_time raise e - if is_transformer_chain: - out_data['PredictedLabel'] = out_data['PredictedLabel']*1 + if data_output_format == DataOutputFormat.DF and \ + is_transformer_chain and 'PredictedLabel' in out_data.columns: + out_data['PredictedLabel'] = out_data['PredictedLabel']*1 if y is not None: @@ -2519,6 +2531,96 @@ def __setstate__(self, state): else: raise ValueError('Pipeline version not supported.') + @trace + def export_to_onnx(self, + dst, + domain, + dst_json=None, + name=None, + data_file=None, + inputs_to_drop=None, + outputs_to_drop=None, + onnx_version="Stable", + verbose=0): + """ + Export the model to the ONNX format. + + :param str dst: The path to write the output ONNX to. + :param str domain: A reverse-DNS name to indicate the model + namespace or domain, for example, 'org.onnx'. + :param str dst_json: The path to write the output ONNX to + in JSON format. + :param name: The 'graph.name' property in the output ONNX. By default + this will be the ONNX extension-less name. (inputs). + :param data_file: The data file (inputs). 
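# Illustrative sketch, separate from the method defined here: exporting a
# fitted pipeline to ONNX. The output path, domain and choice of transform
# are hypothetical.
import pandas as pd
from nimbusml import Pipeline
from nimbusml.preprocessing.normalization import MeanVarianceScaler

df = pd.DataFrame({'c0': [1.0, 2.0, 3.0, 4.0]})
pipe = Pipeline([MeanVarianceScaler() << 'c0'])
pipe.fit(df)
pipe.export_to_onnx('model.onnx', 'com.example.nimbusml')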
+ :param inputs_to_drop: Array of input column names to drop + (inputs). + :param outputs_to_drop: Array of output column names to drop + (inputs). + :param onnx_version: The targeted ONNX version. It can be either + "Stable" or "Experimental". If "Experimental" is used, + produced model can contain components that is not officially + supported in ONNX standard. (inputs). + """ + if not domain: + raise ValueError("domain argument must be specified and not empty.") + + if not self._is_fitted: + raise ValueError("Model is not fitted. Train or load a model before " + "export_to_onnx().") + + # start the clock! + start_time = time.time() + + onnx_converter_args = { + 'onnx': dst, + 'json': dst_json, + 'domain': domain, + 'name': name, + 'data_file': data_file, + 'inputs_to_drop': inputs_to_drop, + 'outputs_to_drop': outputs_to_drop, + 'onnx_version': onnx_version + } + + if (len(self.steps) > 0) and (self.last_node.type != "transform"): + onnx_converter_args['predictive_model'] = "$model" + else: + onnx_converter_args['model'] = "$model" + + onnx_converter_node = models_onnxconverter(**onnx_converter_args) + + inputs = dict([('model', self.model)]) + outputs = dict() + + graph = Graph( + inputs, + outputs, + False, + onnx_converter_node) + + class_name = type(self).__name__ + method_name = inspect.currentframe().f_code.co_name + telemetry_info = ".".join([class_name, method_name]) + + try: + graph.run( + X=None, + y=None, + random_state=self.random_state, + model=self.model, + verbose=verbose, + is_summary=False, + no_input_data=True, + telemetry_info=telemetry_info) + except RuntimeError as e: + self._run_time = time.time() - start_time + raise e + + # stop the clock + self._run_time = time.time() - start_time + self._write_csv_time = graph._write_csv_time + @trace def score( self, diff --git a/src/python/nimbusml/preprocessing/__init__.py b/src/python/nimbusml/preprocessing/__init__.py index 26b41b8e..202eb15d 100644 --- a/src/python/nimbusml/preprocessing/__init__.py +++ b/src/python/nimbusml/preprocessing/__init__.py @@ -2,10 +2,18 @@ from .tokey import ToKey from .tensorflowscorer import TensorFlowScorer from .datasettransformer import DatasetTransformer +from .onnxrunner import OnnxRunner +from .datetimesplitter import DateTimeSplitter +from .tokeyimputer import ToKeyImputer +from .tostring import ToString __all__ = [ + 'DateTimeSplitter', 'FromKey', 'ToKey', + 'ToKeyImputer', + 'ToString', 'TensorFlowScorer', - 'DatasetTransformer' + 'DatasetTransformer', + 'OnnxRunner' ] diff --git a/src/python/nimbusml/preprocessing/datetimesplitter.py b/src/python/nimbusml/preprocessing/datetimesplitter.py new file mode 100644 index 00000000..c3fceb43 --- /dev/null +++ b/src/python/nimbusml/preprocessing/datetimesplitter.py @@ -0,0 +1,59 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +DateTimeSplitter +""" + +__all__ = ["DateTimeSplitter"] + + +from sklearn.base import TransformerMixin + +from ..base_transform import BaseTransform +from ..internal.core.preprocessing.datetimesplitter import \ + DateTimeSplitter as core +from ..internal.utils.utils import trace + + +class DateTimeSplitter(core, BaseTransform, TransformerMixin): + """ + **Description** + Splits a date time value into each individual component + + :param columns: see `Columns `_. + + :param prefix: Output column prefix. + + :param country: Country to get holidays for. Defaults to none if not + passed. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + prefix, + country='None', + columns=None, + **params): + + if columns: + params['columns'] = columns + BaseTransform.__init__(self, **params) + core.__init__( + self, + prefix=prefix, + country=country, + **params) + self._columns = columns + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/nimbusml/preprocessing/normalization/__init__.py b/src/python/nimbusml/preprocessing/normalization/__init__.py index f7d7647a..3928ac40 100644 --- a/src/python/nimbusml/preprocessing/normalization/__init__.py +++ b/src/python/nimbusml/preprocessing/normalization/__init__.py @@ -4,6 +4,7 @@ from .lpscaler import LpScaler from .meanvariancescaler import MeanVarianceScaler from .minmaxscaler import MinMaxScaler +from .robustscaler import RobustScaler __all__ = [ 'Binner', @@ -11,5 +12,6 @@ 'LogMeanVarianceScaler', 'LpScaler', 'MeanVarianceScaler', - 'MinMaxScaler' + 'MinMaxScaler', + 'RobustScaler' ] diff --git a/src/python/nimbusml/preprocessing/normalization/robustscaler.py b/src/python/nimbusml/preprocessing/normalization/robustscaler.py new file mode 100644 index 00000000..776d5609 --- /dev/null +++ b/src/python/nimbusml/preprocessing/normalization/robustscaler.py @@ -0,0 +1,66 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +RobustScaler +""" + +__all__ = ["RobustScaler"] + + +from sklearn.base import TransformerMixin + +from ...base_transform import BaseTransform +from ...internal.core.preprocessing.normalization.robustscaler import \ + RobustScaler as core +from ...internal.utils.utils import trace + + +class RobustScaler(core, BaseTransform, TransformerMixin): + """ + **Description** + Removes the median and scales the data according to the quantile range. + + :param columns: see `Columns `_. + + :param center: If True, center the data before scaling. + + :param scale: If True, scale the data to interquartile range. + + :param quantile_min: Min for the quantile range used to calculate scale. + + :param quantile_max: Max for the quantile range used to calculate scale. + + :param params: Additional arguments sent to compute engine. 
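# Illustrative sketch, separate from the generated file above, mirroring
# test_robustscaler.py later in this change: values are centered by the
# column median and scaled by its interquartile range.
import pandas as pd
from nimbusml import Pipeline
from nimbusml.preprocessing.normalization import RobustScaler

df = pd.DataFrame(dict(c0=[1, 3, 5, 7, 9]))
pipe = Pipeline([RobustScaler(columns='c0', center=True, scale=True)])
print(pipe.fit_transform(df)['c0'].tolist())
# [-1.0, -0.5, 0.0, 0.5, 1.0], i.e. (x - median 5) / IQR 4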
+ + """ + + @trace + def __init__( + self, + center=True, + scale=True, + quantile_min=25.0, + quantile_max=75.0, + columns=None, + **params): + + if columns: + params['columns'] = columns + BaseTransform.__init__(self, **params) + core.__init__( + self, + center=center, + scale=scale, + quantile_min=quantile_min, + quantile_max=quantile_max, + **params) + self._columns = columns + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/nimbusml/preprocessing/onnxrunner.py b/src/python/nimbusml/preprocessing/onnxrunner.py new file mode 100644 index 00000000..2df2ac75 --- /dev/null +++ b/src/python/nimbusml/preprocessing/onnxrunner.py @@ -0,0 +1,82 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +OnnxRunner +""" + +__all__ = ["OnnxRunner"] + + +from sklearn.base import TransformerMixin + +from ..base_transform import BaseTransform +from ..internal.core.preprocessing.onnxrunner import OnnxRunner as core +from ..internal.utils.utils import trace + + +class OnnxRunner(core, BaseTransform, TransformerMixin): + """ + **Description** + Applies an ONNX model to a dataset. + + :param columns: see `Columns `_. + + :param model_file: Path to the onnx model file. + + :param input_columns: Name of the input column. + + :param output_columns: Name of the output column. + + :param gpu_device_id: GPU device id to run on (e.g. 0,1,..). Null for CPU. + Requires CUDA 9.1. + + :param fallback_to_cpu: If true, resumes execution on CPU upon GPU error. + If false, will raise the GPU execption. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + model_file, + input_columns=None, + output_columns=None, + gpu_device_id=None, + fallback_to_cpu=False, + columns=None, + **params): + + if columns: + params['columns'] = columns + if columns: + input_columns = sum( + list( + columns.values()), + []) if isinstance( + list( + columns.values())[0], + list) else list( + columns.values()) + if columns: + output_columns = list(columns.keys()) + BaseTransform.__init__(self, **params) + core.__init__( + self, + model_file=model_file, + input_columns=input_columns, + output_columns=output_columns, + gpu_device_id=gpu_device_id, + fallback_to_cpu=fallback_to_cpu, + **params) + self._columns = columns + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/nimbusml/preprocessing/tokeyimputer.py b/src/python/nimbusml/preprocessing/tokeyimputer.py new file mode 100644 index 00000000..000d6a2f --- /dev/null +++ b/src/python/nimbusml/preprocessing/tokeyimputer.py @@ -0,0 +1,49 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +ToKeyImputer +""" + +__all__ = ["ToKeyImputer"] + + +from sklearn.base import TransformerMixin + +from ..base_transform import BaseTransform +from ..internal.core.preprocessing.tokeyimputer import ToKeyImputer as core +from ..internal.utils.utils import trace + + +class ToKeyImputer(core, BaseTransform, TransformerMixin): + """ + **Description** + Fills in missing values in a column based on the most frequent value + + :param columns: see `Columns `_. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + columns=None, + **params): + + if columns: + params['columns'] = columns + BaseTransform.__init__(self, **params) + core.__init__( + self, + **params) + self._columns = columns + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/nimbusml/preprocessing/tostring.py b/src/python/nimbusml/preprocessing/tostring.py new file mode 100644 index 00000000..2dd2826c --- /dev/null +++ b/src/python/nimbusml/preprocessing/tostring.py @@ -0,0 +1,49 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +ToString +""" + +__all__ = ["ToString"] + + +from sklearn.base import TransformerMixin + +from ..base_transform import BaseTransform +from ..internal.core.preprocessing.tostring import ToString as core +from ..internal.utils.utils import trace + + +class ToString(core, BaseTransform, TransformerMixin): + """ + **Description** + Turns the given column into a column of its string representation + + :param columns: see `Columns `_. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + columns=None, + **params): + + if columns: + params['columns'] = columns + BaseTransform.__init__(self, **params) + core.__init__( + self, + **params) + self._columns = columns + + def get_params(self, deep=False): + """ + Get the parameters for this operator. 
+ """ + return core.get_params(self) diff --git a/src/python/nimbusml/tests/pipeline/test_pipeline_split_models.py b/src/python/nimbusml/tests/pipeline/test_pipeline_split_models.py index bc1399bf..4a6d9109 100644 --- a/src/python/nimbusml/tests/pipeline/test_pipeline_split_models.py +++ b/src/python/nimbusml/tests/pipeline/test_pipeline_split_models.py @@ -47,7 +47,7 @@ def test_notvectorized_output_predictor_model(self): # Create and fit a combined model and spit out predictor model combined_pipeline = Pipeline([RangeFilter(min=0.0, max=4.5) << 'c2', - OnlineGradientDescentRegressor(label='c2')], + OnlineGradientDescentRegressor(feature=['c1'], label='c2')], random_state=seed) combined_pipeline.fit(df, output_predictor_model=True) result_1 = combined_pipeline.predict(df) diff --git a/src/python/nimbusml/tests/pipeline/test_uci_adult.py b/src/python/nimbusml/tests/pipeline/test_uci_adult.py index 00ae1728..2f6055de 100644 --- a/src/python/nimbusml/tests/pipeline/test_uci_adult.py +++ b/src/python/nimbusml/tests/pipeline/test_uci_adult.py @@ -36,7 +36,7 @@ def check_accuracy(test_file, label_column, predictions, threshold, sep=','): (test, label) = get_X_y(test_file, label_column, sep=sep) accuracy = np.mean(label[label_column].values == - predictions.ix[:, 'PredictedLabel'].values) + predictions['PredictedLabel'].values) assert_greater( accuracy, threshold, diff --git a/src/python/nimbusml/tests/preprocessing/missing_values/test_data_with_missing.py b/src/python/nimbusml/tests/preprocessing/missing_values/test_data_with_missing.py index 0dc85f6e..d21d4311 100644 --- a/src/python/nimbusml/tests/preprocessing/missing_values/test_data_with_missing.py +++ b/src/python/nimbusml/tests/preprocessing/missing_values/test_data_with_missing.py @@ -3,12 +3,14 @@ # Licensed under the MIT License. # -------------------------------------------------------------------------------------------- +import platform import unittest import numpy as np from math import isnan from nimbusml import Pipeline from nimbusml.linear_model import FastLinearRegressor +from nimbusml.preprocessing import ToKeyImputer from nimbusml.preprocessing.missing_values import Filter, Handler, Indicator from pandas import DataFrame from sklearn.utils.testing import assert_equal, assert_true, \ @@ -160,6 +162,20 @@ def test_input_conversion_to_float_retains_other_column_types(self): assert_equal(result.dtypes['f1'], np.object) assert_equal(result.dtypes['f2.f2'], np.float32) + @unittest.skipIf('centos' in platform.linux_distribution()[0].lower(), "centos is not supported") + def test_category_imputation(self): + data={'f0': [4, 4, np.nan, 9], + 'f1': [4, 4, np.nan, np.nan]} + data = DataFrame(data) + + # Check ToKeyImputer + xf = ToKeyImputer(columns={'f0.out': 'f0', 'f1.out': 'f1'}) + result = xf.fit_transform(data) + + assert_equal(result['f0.out'][1], 4) + assert_equal(result['f0.out'][2], 4) + assert_equal(result['f1.out'][1], 4) + assert_equal(result['f1.out'][2], 4) if __name__ == '__main__': unittest.main() diff --git a/src/python/nimbusml/tests/preprocessing/normalization/test_robustscaler.py b/src/python/nimbusml/tests/preprocessing/normalization/test_robustscaler.py new file mode 100644 index 00000000..e4197034 --- /dev/null +++ b/src/python/nimbusml/tests/preprocessing/normalization/test_robustscaler.py @@ -0,0 +1,29 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------------------------- +import platform +import unittest + +import pandas +from nimbusml import Pipeline +from nimbusml.preprocessing.normalization import RobustScaler + + +@unittest.skipIf('centos' in platform.linux_distribution()[0].lower(), "centos is not supported") +class TestRobustScaler(unittest.TestCase): + + def test_with_integer_inputs(self): + df = pandas.DataFrame(data=dict(c0=[1, 3, 5, 7, 9])) + + xf = RobustScaler(columns='c0', center=True, scale=True) + pipeline = Pipeline([xf]) + result = pipeline.fit_transform(df) + + expected_result = pandas.Series([-1.0, -0.5, 0.0, 0.5, 1.0]) + + self.assertTrue(result.loc[:, 'c0'].equals(expected_result)) + + +if __name__ == '__main__': + unittest.main() diff --git a/src/python/nimbusml/tests/preprocessing/test_datetimesplitter.py b/src/python/nimbusml/tests/preprocessing/test_datetimesplitter.py new file mode 100644 index 00000000..f3bb2643 --- /dev/null +++ b/src/python/nimbusml/tests/preprocessing/test_datetimesplitter.py @@ -0,0 +1,45 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- + +import platform +import unittest + +import pandas +from nimbusml import Pipeline +from nimbusml.preprocessing import DateTimeSplitter +from nimbusml.preprocessing.schema import ColumnSelector +from sklearn.utils.testing import assert_equal + + +@unittest.skipIf('centos' in platform.linux_distribution()[0].lower(), "centos is not supported") +class TestDateTimeSplitter(unittest.TestCase): + + def test_check_estimator_DateTimeSplitter(self): + df = pandas.DataFrame(data=dict(dt=[i for i in range(8)])) + dt = DateTimeSplitter(prefix='dt_') << 'dt' + result = dt.fit_transform(df) + assert_equal(result['dt_Year'][0], 1970, "it should have been year of 1970") + + def test_holidays(self): + df = pandas.DataFrame(data=dict( + tokens1=[1, 2, 3, 157161600], + tokens2=[10, 11, 12, 13] + )) + + cols_to_drop = [ + 'dtHour12', 'dtDayOfWeek', 'dtDayOfQuarter', + 'dtDayOfYear', 'dtWeekOfMonth', 'dtQuarterOfYear', + 'dtHalfOfYear', 'dtWeekIso', 'dtYearIso', 'dtMonthLabel', + 'dtAmPmLabel', 'dtDayOfWeekLabel', 'dtIsPaidTimeOff' + ] + + dts = DateTimeSplitter(prefix='dt', country='Canada') << 'tokens1' + pipeline = Pipeline([dts, ColumnSelector(drop_columns=cols_to_drop)]) + y = pipeline.fit_transform(df) + + self.assertEqual(y.loc[3, 'dtHolidayName'], 'Christmas Day') + +if __name__ == '__main__': + unittest.main() diff --git a/src/python/nimbusml/tests/preprocessing/test_tokeyimputer.py b/src/python/nimbusml/tests/preprocessing/test_tokeyimputer.py new file mode 100644 index 00000000..6eb87bdb --- /dev/null +++ b/src/python/nimbusml/tests/preprocessing/test_tokeyimputer.py @@ -0,0 +1,38 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------------------------- + +import platform +import unittest + +import numpy as np +import pandas as pd +from nimbusml.preprocessing import ToKeyImputer + + +@unittest.skipIf('centos' in platform.linux_distribution()[0].lower(), "centos is not supported") +class TestToKeyImputer(unittest.TestCase): + + def test_tokeyimputer(self): + text_df = pd.DataFrame( + data=dict( + text=[ + "cat", + "dog", + "fish", + "orange", + "cat orange", + "dog", + "fish", + None, + "spider"])) + + tokey = ToKeyImputer() << 'text' + y = tokey.fit_transform(text_df) + + self.assertEqual(y.loc[7, 'text'], 'dog') + + +if __name__ == '__main__': + unittest.main() diff --git a/src/python/nimbusml/tests/preprocessing/test_tostring.py b/src/python/nimbusml/tests/preprocessing/test_tostring.py new file mode 100644 index 00000000..89811502 --- /dev/null +++ b/src/python/nimbusml/tests/preprocessing/test_tostring.py @@ -0,0 +1,39 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- + +import platform +import unittest + +import numpy as np +from pandas import DataFrame +from nimbusml.preprocessing import ToString +from sklearn.utils.testing import assert_equal + + +@unittest.skipIf('centos' in platform.linux_distribution()[0].lower(), "centos is not supported") +class TestToString(unittest.TestCase): + + def test_tostring(self): + data={'f0': [4, 4, -1, 9], + 'f1': [5, 5, 3.1, -0.23], + 'f2': [6, 6.7, np.nan, np.nan]} + data = DataFrame(data).astype({'f0': np.int32, + 'f1': np.float32, + 'f2': np.float64}) + + xf = ToString(columns={'f0.out': 'f0', + 'f1.out': 'f1', + 'f2.out': 'f2'}) + result = xf.fit_transform(data) + + assert_equal(result['f0.out'][1], '4') + assert_equal(result['f0.out'][2], '-1') + assert_equal(result['f1.out'][1], '5.000000') + assert_equal(result['f1.out'][2], '3.100000') + assert_equal(result['f2.out'][1], '6.700000') + assert_equal(result['f2.out'][2], 'NaN') + +if __name__ == '__main__': + unittest.main() diff --git a/src/python/nimbusml/tests/test_fit_graph.py b/src/python/nimbusml/tests/test_fit_graph.py new file mode 100644 index 00000000..a0576767 --- /dev/null +++ b/src/python/nimbusml/tests/test_fit_graph.py @@ -0,0 +1,234 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------------------------- + +import json +import unittest +import six + +import numpy as np +import pandas as pd +from nimbusml import Pipeline, Role +from nimbusml.cluster import KMeansPlusPlus +from nimbusml.ensemble import FastTreesRegressor, FastForestRegressor +from nimbusml.linear_model import FastLinearClassifier + + +class TestVariableColumn(unittest.TestCase): + + def verify_regressor_nodes(self, graph, label_name, features, trainer_name): + nodes = graph['nodes'] + + self.assertEqual(nodes[0]["Name"], "Transforms.OptionalColumnCreator") + self.assertEqual(nodes[0]["Inputs"]["Column"], [label_name]) + + self.assertEqual(nodes[1]["Name"], "Transforms.LabelToFloatConverter") + self.assertEqual(nodes[1]["Inputs"]["LabelColumn"], label_name) + + self.assertEqual(nodes[2]["Name"], "Transforms.FeatureCombiner") + self.assertEqual(nodes[2]["Inputs"]["Features"], features) + + self.assertEqual(nodes[3]["Name"], trainer_name) + self.assertEqual(nodes[3]["Inputs"]["FeatureColumnName"], "Features") + self.assertEqual(nodes[3]["Inputs"]["LabelColumnName"], label_name) + + def verify_classifier_nodes(self, graph, label_name, features, trainer_name): + nodes = graph['nodes'] + + self.assertEqual(nodes[0]["Name"], "Transforms.OptionalColumnCreator") + self.assertEqual(nodes[0]["Inputs"]["Column"], [label_name]) + + self.assertEqual(nodes[1]["Name"], "Transforms.LabelColumnKeyBooleanConverter") + self.assertEqual(nodes[1]["Inputs"]["LabelColumn"], label_name) + + self.assertEqual(nodes[2]["Name"], "Transforms.FeatureCombiner") + self.assertEqual(nodes[2]["Inputs"]["Features"], features) + + self.assertEqual(nodes[3]["Name"], trainer_name) + self.assertEqual(nodes[3]["Inputs"]["FeatureColumnName"], "Features") + self.assertEqual(nodes[3]["Inputs"]["LabelColumnName"], label_name) + + def test_label_column_defaults_to_label_when_no_label_column_in_input_data(self): + train_data = {'c1': [2, 3, 4, 5], 'c2': [3, 4, 5, 6], + 'c3': [4, 5, 6, 7], 'c4': [0, 1, 2, 1]} + train_df = pd.DataFrame(train_data) + + predictor = FastForestRegressor() + pipeline = Pipeline([predictor]) + result = json.loads(pipeline.fit(train_df, dry_run=True)) + + self.verify_regressor_nodes(result, "Label", + ["c1", "c2", "c3", "c4"], + "Trainers.FastForestRegressor") + + def test_label_column_defaults_to_label_when_label_column_in_input_data(self): + train_data = {'c1': [2, 3, 4, 5], 'c2': [3, 4, 5, 6], + 'c3': [4, 5, 6, 7], 'Label': [0, 1, 2, 1]} + train_df = pd.DataFrame(train_data) + + predictor = FastTreesRegressor() + pipeline = Pipeline([predictor]) + result = json.loads(pipeline.fit(train_df, dry_run=True)) + + self.verify_regressor_nodes(result, "Label", + ["c1", "c2", "c3"], + "Trainers.FastTreeRegressor") + + def test_label_column_specified_as_argument_without_features(self): + train_data = {'c1': [2, 3, 4, 5], 'c2': [3, 4, 5, 6], + 'd1': [4, 5, 6, 7], 'c4': [0, 1, 2, 1]} + train_df = pd.DataFrame(train_data) + + predictor = FastForestRegressor(label='d1') + pipeline = Pipeline([predictor]) + result = json.loads(pipeline.fit(train_df, dry_run=True)) + + self.verify_regressor_nodes(result, "d1", + ["c1", "c2", "c4"], + "Trainers.FastForestRegressor") + + def test_label_column_specified_as_argument_with_features(self): + train_data = {'c1': [2, 3, 4, 5], 'd1': [3, 4, 5, 6], + 'c3': [4, 5, 6, 7], 'c4': [0, 1, 2, 1]} + train_df = pd.DataFrame(train_data) + + predictor = FastForestRegressor(label='d1', feature=['c1', 'c3', 'c4']) + pipeline = 
Pipeline([predictor]) + result = json.loads(pipeline.fit(train_df, dry_run=True)) + + self.verify_regressor_nodes(result, "d1", + ["c1", "c3", "c4"], + "Trainers.FastForestRegressor") + + def test_label_column_specified_as_role_without_features(self): + train_data = {'c1': [2, 3, 4, 5], 'd1': [3, 4, 5, 6], + 'c3': [4, 5, 6, 7], 'c4': [0, 1, 2, 1]} + train_df = pd.DataFrame(train_data) + + predictor = FastForestRegressor() << {Role.Label: 'd1'} + pipeline = Pipeline([predictor]) + result = json.loads(pipeline.fit(train_df, dry_run=True)) + + self.verify_regressor_nodes(result, "d1", + ["c1", "c3", "c4"], + "Trainers.FastForestRegressor") + + def test_label_column_specified_as_role_with_features(self): + train_data = {'c1': [2, 3, 4, 5], 'd1': [3, 4, 5, 6], + 'c3': [4, 5, 6, 7], 'c4': [0, 1, 2, 1]} + train_df = pd.DataFrame(train_data) + + predictor = FastForestRegressor() << { + Role.Label: 'd1', + Role.Feature: ['c1', 'c4'] + } + pipeline = Pipeline([predictor]) + result = json.loads(pipeline.fit(train_df, dry_run=True)) + + self.verify_regressor_nodes(result, "d1", + ["c1", "c4"], + "Trainers.FastForestRegressor") + + def test_default_label_for_classifier_without_label_column(self): + train_data = {'c1': [2, 3, 4, 5], 'c2': [3, 4, 5, 6], + 'c3': [4, 5, 6, 7], 'c4': [0, 1, 2, 1]} + train_df = pd.DataFrame(train_data) + + predictor = FastLinearClassifier() + pipeline = Pipeline([predictor]) + result = json.loads(pipeline.fit(train_df, dry_run=True)) + + self.verify_classifier_nodes(result, "Label", + ['c1', 'c2', 'c3', 'c4'], + "Trainers.StochasticDualCoordinateAscentClassifier") + + def test_default_label_for_classifier_with_label_column(self): + train_data = {'c1': [2, 3, 4, 5], 'c2': [3, 4, 5, 6], + 'c3': [4, 5, 6, 7], 'Label': [0, 1, 2, 1]} + train_df = pd.DataFrame(train_data) + + predictor = FastLinearClassifier() + pipeline = Pipeline([predictor]) + result = json.loads(pipeline.fit(train_df, dry_run=True)) + + self.verify_classifier_nodes(result, "Label", + ['c1', 'c2', 'c3'], + "Trainers.StochasticDualCoordinateAscentClassifier") + + def test_label_column_for_classifier_specified_as_argument(self): + train_data = {'c1': [2, 3, 4, 5], 'c2': [3, 4, 5, 6], + 'c3': [4, 5, 6, 7], 'd1': [0, 1, 2, 1]} + train_df = pd.DataFrame(train_data) + + predictor = FastLinearClassifier(label='d1') + pipeline = Pipeline([predictor]) + result = json.loads(pipeline.fit(train_df, dry_run=True)) + + self.verify_classifier_nodes(result, "d1", + ['c1', 'c2', 'c3'], + "Trainers.StochasticDualCoordinateAscentClassifier") + + def test_label_column_for_classifier_specified_as_argument_with_features(self): + train_data = {'c1': [2, 3, 4, 5], 'c2': [3, 4, 5, 6], + 'c3': [4, 5, 6, 7], 'd1': [0, 1, 2, 1]} + train_df = pd.DataFrame(train_data) + + predictor = FastLinearClassifier(label='d1', feature=['c1', 'c2']) + pipeline = Pipeline([predictor]) + result = json.loads(pipeline.fit(train_df, dry_run=True)) + + self.verify_classifier_nodes(result, "d1", + ['c1', 'c2'], + "Trainers.StochasticDualCoordinateAscentClassifier") + + def test_label_column_for_classifier_specified_as_role_without_features(self): + train_data = {'d1': [2, 3, 4, 5], 'c2': [3, 4, 5, 6], + 'c3': [4, 5, 6, 7], 'c4': [0, 1, 2, 1]} + train_df = pd.DataFrame(train_data) + + predictor = FastLinearClassifier() << {Role.Label: 'd1'} + pipeline = Pipeline([predictor]) + result = json.loads(pipeline.fit(train_df, dry_run=True)) + + self.verify_classifier_nodes(result, "d1", + ['c2', 'c3', 'c4'], + "Trainers.StochasticDualCoordinateAscentClassifier") + + 
def test_label_column_for_classifier_specified_as_role_with_features(self): + train_data = {'c1': [2, 3, 4, 5], 'c2': [3, 4, 5, 6], + 'c3': [4, 5, 6, 7], 'd1': [0, 1, 2, 1]} + train_df = pd.DataFrame(train_data) + + predictor = FastLinearClassifier() << { + Role.Label: 'd1', + Role.Feature: ['c1', 'c4'] + } + pipeline = Pipeline([predictor]) + result = json.loads(pipeline.fit(train_df, dry_run=True)) + + self.verify_classifier_nodes(result, "d1", + ['c1', 'c4'], + "Trainers.StochasticDualCoordinateAscentClassifier") + + def test_non_label_based_predictor_does_not_have_label_column_automatically_removed(self): + train_data = {'c1': [2, 3, 4, 5], 'c2': [3, 4, 5, 6], + 'c3': [4, 5, 6, 7], 'Label': [0, 1, 2, 1]} + train_df = pd.DataFrame(train_data) + + predictor = KMeansPlusPlus(n_clusters=5) + pipeline = Pipeline([predictor]) + result = json.loads(pipeline.fit(train_df, dry_run=True)) + nodes = result['nodes'] + + self.assertEqual(nodes[0]["Name"], "Transforms.FeatureCombiner") + if six.PY2: + self.assertItemsEqual(nodes[0]["Inputs"]["Features"], ['c1', 'c2', 'c3', 'Label']) + else: + self.assertCountEqual(nodes[0]["Inputs"]["Features"], ['c1', 'c2', 'c3', 'Label']) + self.assertEqual(nodes[1]["Name"], "Trainers.KMeansPlusPlusClusterer") + self.assertEqual(nodes[1]["Inputs"]["FeatureColumnName"], "Features") + + +if __name__ == '__main__': + unittest.main() diff --git a/src/python/nimbusml/tests/test_syntax_learner.py b/src/python/nimbusml/tests/test_syntax_learner.py index 2c649304..edc05372 100644 --- a/src/python/nimbusml/tests/test_syntax_learner.py +++ b/src/python/nimbusml/tests/test_syntax_learner.py @@ -458,7 +458,7 @@ def test_syntax_slots_wo_pipeline(self): if spl[0] == 'age': ages.append(l2) X_xf1.columns = pandas.MultiIndex( - levels=levels, labels=labels, names=names) + levels=levels, codes=labels, names=names) print(X_xf1.head(n=2).T) col_ages = [('age', a) for a in ages] diff --git a/src/python/nimbusml/tests/timeseries/test_timeseriesimputer.py b/src/python/nimbusml/tests/timeseries/test_timeseriesimputer.py new file mode 100644 index 00000000..fd530713 --- /dev/null +++ b/src/python/nimbusml/tests/timeseries/test_timeseriesimputer.py @@ -0,0 +1,43 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------------------------- + +import platform +import unittest + +import numpy as np +import pandas as pd +from nimbusml.timeseries import TimeSeriesImputer + + +@unittest.skipIf('centos' in platform.linux_distribution()[0].lower(), "centos is not supported") +class TestTimeSeriesImputer(unittest.TestCase): + + def test_timeseriesimputer_adds_new_row(self): + from nimbusml.timeseries import TimeSeriesImputer + + df = pd.DataFrame(data=dict( + ts=[1, 2, 3, 5], + grain=[1970, 1970, 1970, 1970], + c3=[10, 13, 15, 20], + c4=[19, 12, 16, 19] + )) + + tsi = TimeSeriesImputer(time_series_column='ts', + grain_columns=['grain'], + filter_columns=['c3', 'c4'], + impute_mode='ForwardFill', + filter_mode='Include') + result = tsi.fit_transform(df) + + self.assertEqual(result.loc[0, 'ts'], 1) + self.assertEqual(result.loc[3, 'ts'], 4) + self.assertEqual(result.loc[3, 'grain'], 1970) + self.assertEqual(result.loc[3, 'c3'], 15) + self.assertEqual(result.loc[3, 'c4'], 16) + self.assertEqual(result.loc[3, 'IsRowImputed'], True) + + +if __name__ == '__main__': + unittest.main() diff --git a/src/python/nimbusml/timeseries/__init__.py b/src/python/nimbusml/timeseries/__init__.py index 64e66add..05dbfa3c 100644 --- a/src/python/nimbusml/timeseries/__init__.py +++ b/src/python/nimbusml/timeseries/__init__.py @@ -3,11 +3,13 @@ from .ssaspikedetector import SsaSpikeDetector from .ssachangepointdetector import SsaChangePointDetector from .ssaforecaster import SsaForecaster +from .timeseriesimputer import TimeSeriesImputer __all__ = [ 'IidSpikeDetector', 'IidChangePointDetector', 'SsaSpikeDetector', 'SsaChangePointDetector', - 'SsaForecaster' + 'SsaForecaster', + 'TimeSeriesImputer' ] diff --git a/src/python/nimbusml/timeseries/ssaforecaster.py b/src/python/nimbusml/timeseries/ssaforecaster.py index 3cbe540f..35516d15 100644 --- a/src/python/nimbusml/timeseries/ssaforecaster.py +++ b/src/python/nimbusml/timeseries/ssaforecaster.py @@ -41,7 +41,7 @@ class SsaForecaster(core, BaseTransform, TransformerMixin): :param series_length: The length of series that is kept in buffer for modeling (parameter N). - :param train_size: The length of series from the begining used for + :param train_size: The length of series from the beginning used for training. :param horizon: The number of values to forecast. diff --git a/src/python/nimbusml/timeseries/timeseriesimputer.py b/src/python/nimbusml/timeseries/timeseriesimputer.py new file mode 100644 index 00000000..bb28c346 --- /dev/null +++ b/src/python/nimbusml/timeseries/timeseriesimputer.py @@ -0,0 +1,77 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +TimeSeriesImputer +""" + +__all__ = ["TimeSeriesImputer"] + + +from sklearn.base import TransformerMixin + +from ..base_transform import BaseTransform +from ..internal.core.timeseries.timeseriesimputer import \ + TimeSeriesImputer as core +from ..internal.utils.utils import trace + + +class TimeSeriesImputer(core, BaseTransform, TransformerMixin): + """ + **Description** + Fills in missing row and values + + :param columns: see `Columns `_. + + :param time_series_column: Column representing the time. + + :param grain_columns: List of grain columns. 
+ + :param filter_columns: Columns to filter. + + :param filter_mode: Filter mode. Either include or exclude. + + :param impute_mode: Mode for imputing, defaults to ForwardFill if not + provided. + + :param supress_type_errors: Suppress the errors that would occur if a + column and impute mode are incompatible. If true, will skip the column. + If false, will stop and throw an error. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + time_series_column, + grain_columns, + filter_columns=None, + filter_mode='Exclude', + impute_mode='ForwardFill', + supress_type_errors=False, + columns=None, + **params): + + if columns: + params['columns'] = columns + BaseTransform.__init__(self, **params) + core.__init__( + self, + time_series_column=time_series_column, + grain_columns=grain_columns, + filter_columns=filter_columns, + filter_mode=filter_mode, + impute_mode=impute_mode, + supress_type_errors=supress_type_errors, + **params) + self._columns = columns + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/setup.py b/src/python/setup.py index e8481345..9ba7ff88 100644 --- a/src/python/setup.py +++ b/src/python/setup.py @@ -45,7 +45,7 @@ # Versions should comply with PEP440. For a discussion on # single-sourcing the version across setup.py and the project code, see # https://packaging.python.org/en/latest/single_source_version.html - version='1.6.1', + version='1.7.0', description='NimbusML', long_description=long_description, @@ -114,6 +114,7 @@ 'tests': [ 'nose>=1.3', 'pytest>=4.4.0', 'graphviz', 'imageio', + 'onnxruntime', ], 'dprep': ['azureml-dataprep>=1.1.33'], 'utils': ['graphviz', 'imageio'], @@ -134,6 +135,7 @@ 'nbconvert>=4.2.0', 'nose>=1.3', 'pytest>=4.4.0', + 'onnxruntime', ], python_requires='>=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <3.8.*', diff --git a/src/python/setup.py.in b/src/python/setup.py.in index 0489bc13..a995460d 100644 --- a/src/python/setup.py.in +++ b/src/python/setup.py.in @@ -114,6 +114,7 @@ setup( 'tests': [ 'nose>=1.3', 'pytest>=4.4.0', 'graphviz', 'imageio', + 'onnxruntime', ], 'dprep': ['azureml-dataprep>=1.1.33'], 'utils': ['graphviz', 'imageio'], @@ -134,6 +135,7 @@ setup( 'nbconvert>=4.2.0', 'nose>=1.3', 'pytest>=4.4.0', + 'onnxruntime' ], python_requires='>=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <3.8.*', diff --git a/src/python/tests/test_estimator_checks.py b/src/python/tests/test_estimator_checks.py index df7c1e87..a9372ad3 100644 --- a/src/python/tests/test_estimator_checks.py +++ b/src/python/tests/test_estimator_checks.py @@ -7,6 +7,7 @@ """ import json import os +import platform import unittest from nimbusml.cluster import KMeansPlusPlus @@ -19,9 +20,10 @@ from nimbusml.ensemble import LightGbmRegressor from nimbusml.feature_extraction.text import NGramFeaturizer from nimbusml.internal.entrypoints._ngramextractor_ngram import n_gram +from nimbusml.preprocessing import TensorFlowScorer, DateTimeSplitter from nimbusml.linear_model import SgdBinaryClassifier -from nimbusml.preprocessing import TensorFlowScorer from nimbusml.preprocessing.filter import SkipFilter, TakeFilter +from nimbusml.preprocessing.normalization import RobustScaler from nimbusml.timeseries import (IidSpikeDetector, IidChangePointDetector, SsaSpikeDetector, SsaChangePointDetector, SsaForecaster) @@ -56,6 +58,15 @@ # I8 should not have NA values 'CountSelector': 'check_estimators_dtypes', + # DateTimeSplitter does not 
work with floating point types. +    'DateTimeSplitter': +        'check_transformer_general, check_pipeline_consistency,' +        'check_estimators_pickle, check_estimators_dtypes,' +        'check_dict_unchanged, check_dtype_object, check_fit_score_takes_y,' +        'check_transformer_data_not_an_array, check_fit1d_1feature,' +        'check_fit2d_1feature, check_fit2d_predict1d, check_estimators_overwrite_params,' +        'check_estimator_sparse_data, check_fit2d_1sample, check_dont_overwrite_parameters,' +        'check_estimators_fit_returns_self', # by design returns smaller number of rows 'SkipFilter': 'check_transformer_general, ' 'check_transformer_data_not_an_array', @@ -157,6 +168,16 @@ 'check_estimators_overwrite_params, \ check_estimator_sparse_data, check_estimators_pickle, ' 'check_estimators_nan_inf', +    # RobustScaler does not support vectorized types +    'RobustScaler': 'check_estimator_sparse_data', +    'ToKeyImputer': +        'check_estimator_sparse_data, check_estimators_dtypes', +    # Most of these skipped tests are failing because the checks +    # require numerical types. ToString returns object types. +    # TypeError: ufunc 'isfinite' not supported for the input types +    'ToString': 'check_estimator_sparse_data, check_pipeline_consistency,' +        'check_transformer_data_not_an_array, check_estimators_pickle,' +        'check_transformer_general', 'OrdinaryLeastSquaresRegressor': 'check_fit2d_1sample' } @@ -196,6 +217,7 @@ 'check_classifiers_train'] INSTANCES = { +    'DateTimeSplitter': DateTimeSplitter(prefix='dt', columns=['F0']), 'EnsembleClassifier': EnsembleClassifier(num_models=3), 'EnsembleRegressor': EnsembleRegressor(num_models=3), 'FactorizationMachineBinaryClassifier': FactorizationMachineBinaryClassifier(shuffle=False), @@ -209,6 +231,7 @@ 'LightGbmRanker': LightGbmRanker( minimum_example_count_per_group=1, minimum_example_count_per_leaf=1), 'NGramFeaturizer': NGramFeaturizer(word_feature_extractor=n_gram()), +    'RobustScaler': RobustScaler(scale=False), 'SgdBinaryClassifier': SgdBinaryClassifier(number_of_threads=1, shuffle=False), 'SkipFilter': SkipFilter(count=5), 'TakeFilter': TakeFilter(count=100000), @@ -256,9 +279,18 @@ 'TreeFeaturizer', # skip SymSgdBinaryClassifier for now, because of crashes. 'SymSgdBinaryClassifier', -    'DatasetTransformer' +    'DatasetTransformer', +    'OnnxRunner', +    'TimeSeriesImputer' ]) +if 'centos' in platform.linux_distribution()[0].lower(): +    skip_epoints |= set([ +        'DateTimeSplitter', +        'RobustScaler', +        'ToKeyImputer', +        'ToString']) + def load_json(file_path): with open(file_path) as f: diff --git a/src/python/tests_extended/data_frame_tool.py b/src/python/tests_extended/data_frame_tool.py new file mode 100644 index 00000000..e1b9317e --- /dev/null +++ b/src/python/tests_extended/data_frame_tool.py @@ -0,0 +1,209 @@ +#------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+#-------------------------------------------------------------------------- + +from datetime import datetime +import numpy as np +import pandas as pd +import onnxruntime as onnxrt + +ort_float_set = set([np.float32, np.float64]) + +pd_float_set = set(['float64']) + +ort_int_set = set([np.int8, np.uint8, np.int16, np.uint16, np.int32, np.uint32, np.int64, np.uint64]) + +pd_int_set = set(['int64']) + +types_dict = { +    'tensor(float16)': np.float16, +    'tensor(float)' : np.float32, +    'tensor(double)' : np.float64, + +    'tensor(int8)' : np.int8, +    'tensor(uint8)' : np.uint8, +    'tensor(int16)' : np.int16, +    'tensor(uint16)' : np.uint16, +    'tensor(int32)' : np.int32, +    'tensor(uint32)' : np.uint32, +    'tensor(int64)' : np.int64, +    'tensor(uint64)' : np.uint64, + +    'tensor(bool)' : np.bool, +    'tensor(string)' : np.object +} + +class DataFrameTool(): +    """ +    This is a utility class used to run a model with pandas.DataFrame input +    """ +    def __init__(self, model_path, sess_options=None): +        """ +        :param model_path: path to the model to be loaded +        :param sess_options: see onnxruntime.SessionOptions +        """ +        self._model_path = model_path +        self._sess_options = sess_options +        self._sess = onnxrt.InferenceSession(self._model_path, self._sess_options) + +    def _reshape_input(self, input_array, expected_shape): +        """ +        :param input_array - numpy array. This one is obtained from the DataFrame and expected to have +        : a rank of 1. +        :expected_shape - shape fetched from the model which may include dynamic elements. +        : expected_shape may at most have one -1, None or zero which will be computed from +        : the size of the input_array. We replace None and zeros with -1 and let np.ndarray.reshape deal with it. +        """ +        # expected_shape rank is one, we will let onnxruntime deal with it +        if len(expected_shape) == 1: +            return input_array + +        inferred_shape = [dim if dim else -1 for dim in expected_shape] +        return input_array.reshape(inferred_shape) + +    def _validate_type(self, input_meta, col_type): +        """ +        : input_meta - meta info obtained from the model for the given input +        : col_type - dtype of the column +        : throws if conditions are not met + +        float16 and bool will always require an exact match. +        We attempt to convert any type to a string if it is required. +        With strings we always want to put this into a flat array, cast to np.object and then reshape as object. +        Any other type, to qualify for casting, must match either integer or floating point types. +        Python datetime, which is denoted in Pandas as datetime64[ns], is cast to int64. +        """ +        expected_type = types_dict[input_meta.type] +        if input_meta.type == 'tensor(string)': +            return +        elif expected_type == col_type: +            return +        elif expected_type == np.int64 and str(col_type) == 'datetime64[ns]': +            return +        elif expected_type == np.uint32 and str(col_type) == 'category': +            return +        elif expected_type in ort_float_set and str(col_type) in pd_float_set: +            return +        elif expected_type in ort_int_set and str(col_type) in pd_int_set: +            return + +        raise TypeError("Input {} requires type {}; unable to cast column type {}".format( +            input_meta.name, expected_type, col_type)) + + +    def _process_input_list(self, df, input_metas, require): +        """ +        Return a dictionary of input_name : a typed and shaped np.array of values for a given input_meta +        The function does the heavy lifting for _get_input_feeds() + +        :param df: See :class:`pandas.DataFrame`. +        :param input_metas: a list of name/type pairs +        :param require: a boolean. If True this helper throws on a missing input. 
+ + """ + feeds = {} + # Process mandadory inputs. Raise an error if anything is not present + for input_meta in input_metas: + # We fully expect all the types are in the above dictionary + assert input_meta.type in types_dict, "Update types_dict for the new type" + if input_meta.name in df.columns: + self._validate_type(input_meta, df[input_meta.name].dtype) + if (df[input_meta.name].dtype) == 'datetime64[ns]': + input_array = np.array([dt.timestamp() for dt in df[input_meta.name]]).astype(np.int64) + elif (str(df[input_meta.name].dtype)) == 'category': + input_array = np.array([key + 1 for key in df[input_meta.name].array.codes]).astype(np.uint32) # in ONNX models trained in ML.NET input coming from "categorical columns" is 1 based indices, whereas Categorical columns save indices that are 0 based, and that need to be retrieved from .array.codes + else: + # With strings we must cast first to np.object then then reshape + # so we do it for everything + input_array = np.array(df[input_meta.name]).astype(types_dict[input_meta.type]) + + feeds[input_meta.name] = self._reshape_input(input_array, input_meta.shape) + + elif require: + raise RuntimeError("This model requires input {} of type {} but it is not found in the DataFrame".format( + input_meta.name, types_dict[input_meta.type])) + return feeds + + + def _get_input_feeds(self, df, sess): + """ + Return a dictionary of input_name : a typed and shaped np.array of values + This function accepts Pandas DataFrame as the first argument and onnxruntime + session with a loaded model. The function interrogates the model for the inputs + and matches the model input names to the DataFrame instance column names. + It requires exact matches for bool and float16 types. It attempts to convert to + string any input type if string is required. + It attempts to convert floating types to each other and does the same for all of the + integer types without requiring an exact match. + + :param df: See :class:`pandas.DataFrame`. The function only considers the first row (0) of each column + and feeds the data to the appropriate model inputs. + + :param sess: See :class:`onnxruntime.InferenceSession`. + + :: + For example: pd.DataFrame([[0], [4],[20]],index=[0], columns=['A', 'B', 'C']) + + """ + if df.empty: + raise RuntimeError('input DataFrame is empty') + + # Process mandadory inputs. Raise an error if anything is not present + feeds = self._process_input_list(df, sess.get_inputs(), True) + # Process optional overridable initializers. If present the initialzier value + # is overriden by the input. If not, the initialzier value embedded in the model takes effect. + initializers = self._process_input_list(df, sess.get_overridable_initializers(), False) + + feeds.update(initializers) + + return feeds + + def execute(self, df, output_names=None, output_types=None, run_options=None): + "Return a list of output values restricted to output names if not empty" + """ + Compute the predictions. + + :param df: See :class:`pandas.DataFrame`. + :output_name - list of column output names and their order to output + :output_types { output_name : dtype } optional dictionary that asks to cast output + to the colum type + + :param run_options: See :class:`onnxruntime.RunOptions`. 
+ :: + sess.run([output_name], {input_name: x}) + Pandas DataFrame + """ + input_feed = self._get_input_feeds(df, self._sess) + if not output_names: + output_names = [output.name for output in self._sess._outputs_meta] + + results = self._sess.run(output_names, input_feed, run_options) + + df = pd.DataFrame() + for i, r in enumerate(results): + # TODO: remove this. These extra columns + # should not be in the output. + if output_names[i].startswith('mlnet.') and \ + output_names[i].endswith('.unusedOutput') and \ + r.shape == (1,1): + continue + + r = np.split(r, r.shape[-1], axis=-1) \ + if (r.shape[-1] > 1 and r.shape[0] > 1) else [r] + + for suffix, col in enumerate(r): + col = col.flatten() + if output_types and output_names[i] in output_types: + dtype = output_types[output_names[i]] + if dtype == np.dtype('datetime64'): + col = col.astype(np.int64) + col = [datetime.utcfromtimestamp(ts) for ts in col] + else: + col = col.astype(dtype) + + col_name = output_names[i] if len(r) == 1 else \ + output_names[i] + '.' + str(suffix) + df[col_name] = col + + return df diff --git a/src/python/tests_extended/test_automl_scenario.py b/src/python/tests_extended/test_automl_scenario.py new file mode 100644 index 00000000..ec659612 --- /dev/null +++ b/src/python/tests_extended/test_automl_scenario.py @@ -0,0 +1,90 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- + +import os +import time +import tempfile +import unittest +import pandas as pd +import six +from nimbusml import Pipeline, FileDataStream +from nimbusml.datasets import get_dataset +from nimbusml.feature_extraction.text import NGramFeaturizer +from nimbusml.linear_model import AveragedPerceptronBinaryClassifier +from nimbusml.multiclass import OneVsRestClassifier +from nimbusml.preprocessing import DatasetTransformer +from data_frame_tool import DataFrameTool as DFT + + +def get_tmp_file(suffix=None): + fd, file_name = tempfile.mkstemp(suffix=suffix) + fl = os.fdopen(fd, 'w') + fl.close() + return file_name + +path = get_dataset("wiki_detox_train").as_filepath() +train_set = FileDataStream.read_csv(path, sep='\t') +path = get_dataset("wiki_detox_test").as_filepath() +test_set = FileDataStream.read_csv(path, sep='\t') + +class TestOnnxRuntime(unittest.TestCase): + """ + Tests the automl use case: + 1. Fit featurization pipeline separately. + 2. Fit learner on top of the featurization pipeline. + 3. Export the learner pipeline to ONNX. + 4. 
Compare results between ML.NET and ORT + """ + + @unittest.skipIf(six.PY2, "Disabled due to bug on Mac Python 2.7 build, more info:") + def test_automl_usecase(self): + # train featurization pipeline + featurization_pipe = Pipeline([NGramFeaturizer(keep_diacritics=True, columns={'Features': ['SentimentText']})]) + featurization_pipe.fit(train_set) + + # train learner pipeline + learner_pipe = Pipeline([DatasetTransformer(featurization_pipe.model), + OneVsRestClassifier(AveragedPerceptronBinaryClassifier(), + feature=['Features'], label='Sentiment') + ]) + learner_pipe.fit(train_set) + + # Export the learner pipeline to ONNX + onnx_path = get_tmp_file('.onnx') + learner_pipe.export_to_onnx(onnx_path, 'com.microsoft.ml', onnx_version='Stable') + + # Perform the transform using the standard ML.Net backend + start = time.time() + result_standard = learner_pipe.predict(test_set) + end = time.time() + print('%ss done transform using standard backend' % round(end - start, 3)) + + # Perform the transform using the ORT backend + df_tool = DFT(onnx_path) + dataset = test_set.to_df() + start = time.time() + result_ort = df_tool.execute(dataset, ['PredictedLabel.output', 'Score.output']) + end = time.time() + print('%ss done transform using ORT backend (excludes df load time)' % round(end - start, 3)) + + # compare the results + for col_tuple in (('PredictedLabel', 'PredictedLabel.output'), + ('Score.0', 'Score.output.0'), + ('Score.1', 'Score.output.1'), + ): + col_expected = result_standard.loc[:, col_tuple[0]] + col_ort = result_ort.loc[:, col_tuple[1]] + + check_kwargs = { + 'check_names': False, + 'check_exact': False, + 'check_dtype': True, + 'check_less_precise': True + } + + pd.testing.assert_series_equal(col_expected, col_ort, **check_kwargs) + +if __name__ == '__main__': + unittest.main() diff --git a/src/python/tests_extended/test_export_to_onnx.py b/src/python/tests_extended/test_export_to_onnx.py new file mode 100644 index 00000000..dfb9448c --- /dev/null +++ b/src/python/tests_extended/test_export_to_onnx.py @@ -0,0 +1,668 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------------------------- +""" +Verify onnx export and transform support +""" +import contextlib +import io +import json +import os +import sys +import tempfile +import numpy as np +import pandas as pd +import pprint + +from nimbusml import Pipeline +from nimbusml.base_predictor import BasePredictor +from nimbusml.cluster import KMeansPlusPlus +from nimbusml.datasets import get_dataset +from nimbusml.datasets.image import get_RevolutionAnalyticslogo, get_Microsoftlogo +from nimbusml.decomposition import PcaTransformer, PcaAnomalyDetector +from nimbusml.ensemble import FastForestBinaryClassifier, FastTreesTweedieRegressor, LightGbmRanker +from nimbusml.feature_extraction.categorical import OneHotVectorizer, OneHotHashVectorizer +from nimbusml.feature_extraction.image import Loader, Resizer, PixelExtractor +from nimbusml.feature_extraction.text import NGramFeaturizer +from nimbusml.feature_extraction.text.extractor import Ngram +from nimbusml.feature_selection import CountSelector, MutualInformationSelector +from nimbusml.linear_model import (AveragedPerceptronBinaryClassifier, + FastLinearBinaryClassifier, + LinearSvmBinaryClassifier) +from nimbusml.multiclass import OneVsRestClassifier +from nimbusml.naive_bayes import NaiveBayesClassifier +from nimbusml.preprocessing import (TensorFlowScorer, FromKey, ToKey, + DateTimeSplitter, OnnxRunner) +from nimbusml.preprocessing.filter import SkipFilter, TakeFilter, RangeFilter +from nimbusml.preprocessing.missing_values import Filter, Handler, Indicator +from nimbusml.preprocessing.normalization import Binner, GlobalContrastRowScaler, LpScaler +from nimbusml.preprocessing.schema import (ColumnConcatenator, TypeConverter, + ColumnDuplicator, ColumnSelector, PrefixColumnConcatenator) +from nimbusml.preprocessing.text import CharTokenizer, WordTokenizer +from nimbusml.timeseries import (IidSpikeDetector, IidChangePointDetector, + SsaSpikeDetector, SsaChangePointDetector, + SsaForecaster) +from data_frame_tool import DataFrameTool as DFT + +SHOW_ONNX_JSON = False +SHOW_TRANSFORMED_RESULTS = True +SHOW_FULL_PANDAS_OUTPUT = False + +if SHOW_FULL_PANDAS_OUTPUT: + pd.set_option('display.max_columns', None) + pd.set_option('display.max_rows', None) + pd.set_option('display.width', 10000) + +script_path = os.path.realpath(__file__) +script_dir = os.path.dirname(script_path) + +# Sepal_Length Sepal_Width Petal_Length Petal_Width Label Species Setosa +# 0 5.1 3.5 1.4 0.2 0 setosa 1.0 +# 1 4.9 3.0 1.4 0.2 0 setosa 1.0 +iris_df = get_dataset("iris").as_df() +iris_df.drop(['Species'], axis=1, inplace=True) + +iris_with_nan_df = iris_df.copy() +iris_with_nan_df.loc[1, 'Petal_Length'] = np.nan + +iris_no_label_df = iris_df.drop(['Label'], axis=1) +iris_binary_df = iris_no_label_df.rename(columns={'Setosa': 'Label'}) +iris_regression_df = iris_no_label_df.drop(['Setosa'], axis=1).rename(columns={'Petal_Width': 'Label'}) + +# Unnamed: 0 education age parity induced case spontaneous stratum pooled.stratum education_str +# 0 1 0.0 26.0 6.0 1.0 1.0 2.0 1.0 3.0 0-5yrs +# 1 2 0.0 42.0 1.0 1.0 1.0 0.0 2.0 1.0 0-5yrs +infert_df = get_dataset("infert").as_df() +infert_df.columns = [i.replace(': ', '') for i in infert_df.columns] +infert_df.rename(columns={'case': 'Label'}, inplace=True) + +infert_onehot_df = (OneHotVectorizer() << 'education_str').fit_transform(infert_df) +infert_onehot_df['Label'] = infert_onehot_df['Label'].astype(np.uint32) + +# rank group carrier price Class dep_day nbr_stops duration 
+# 0 2 1 AA 240 3 1 0 12.0 +# 1 1 1 AA 300 3 0 1 15.0 +file_path = get_dataset("gen_tickettrain").as_filepath() +gen_tt_df = pd.read_csv(file_path) +gen_tt_df['group'] = gen_tt_df['group'].astype(np.uint32) + +# Unnamed: 0 Label Solar_R Wind Temp Month Day +# 0 1 41.0 190.0 7.4 67 5 1 +# 1 2 36.0 118.0 8.0 72 5 2 +airquality_df = get_dataset("airquality").as_df().fillna(0) +airquality_df = airquality_df[airquality_df.Ozone.notnull()] + +# Sentiment SentimentText +# 0 1 ==RUDE== Dude, you are rude upload that carl... +# 1 1 == OK! == IM GOING TO VANDALIZE WILD ONES W... +file_path = get_dataset("wiki_detox_train").as_filepath() +wiki_detox_df = pd.read_csv(file_path, sep='\t') +wiki_detox_df = wiki_detox_df.head(10) + +# Path Label +# 0 C:\repo\src\python... True +# 1 C:\repo\src\python... False +image_paths_df = pd.DataFrame(data=dict( + Path=[get_RevolutionAnalyticslogo(), get_Microsoftlogo()], + Label=[True, False])) + + +SKIP = { + 'DatasetTransformer', + 'LightLda', + 'NGramExtractor', # Crashes + 'OneVsRestClassifier', + 'OnnxRunner', + 'Sentiment', + 'TensorFlowScorer', + 'TimeSeriesImputer', + 'TreeFeaturizer', + 'WordEmbedding', +} + +INSTANCES = { + 'AveragedPerceptronBinaryClassifier': AveragedPerceptronBinaryClassifier( + feature=['education_str.0-5yrs', 'education_str.6-11yrs', 'education_str.12+ yrs']), + 'Binner': Binner(num_bins=3), + 'CharTokenizer': CharTokenizer(columns={'SentimentText_Transform': 'SentimentText'}), + 'ColumnConcatenator': ColumnConcatenator(columns={'Features': [ + 'Sepal_Length', + 'Sepal_Width', + 'Petal_Length', + 'Petal_Width', + 'Setosa']}), + 'ColumnSelector': ColumnSelector(columns=['Sepal_Width', 'Sepal_Length']), + 'ColumnDuplicator': ColumnDuplicator(columns={'dup': 'Sepal_Width'}), + 'CountSelector': CountSelector(count=5, columns=['Sepal_Width']), + 'DateTimeSplitter': DateTimeSplitter(prefix='dt'), + 'FastForestBinaryClassifier': FastForestBinaryClassifier(feature=['Sepal_Width', 'Sepal_Length'], + label='Setosa'), + 'FastLinearBinaryClassifier': FastLinearBinaryClassifier(feature=['Sepal_Width', 'Sepal_Length'], + label='Setosa'), + 'FastTreesTweedieRegressor': FastTreesTweedieRegressor(label='Ozone'), + 'Filter': Filter(columns=[ 'Petal_Length', 'Petal_Width']), + 'FromKey': Pipeline([ + ToKey(columns=['Sepal_Length']), + FromKey(columns=['Sepal_Length']) + ]), + # GlobalContrastRowScaler currently requires a vector input to work + 'GlobalContrastRowScaler': Pipeline([ + ColumnConcatenator() << { + 'concated_columns': [ + 'Petal_Length', + 'Sepal_Width', + 'Sepal_Length']}, + GlobalContrastRowScaler(columns={'normed_columns': 'concated_columns'}) + ]), + 'Handler': Handler(replace_with='Mean', columns={'NewVals': 'Petal_Length'}), + 'IidSpikeDetector': IidSpikeDetector(columns=['Sepal_Length']), + 'IidChangePointDetector': IidChangePointDetector(columns=['Sepal_Length']), + 'Indicator': Indicator(columns={'Has_Nan': 'Petal_Length'}), + 'KMeansPlusPlus': KMeansPlusPlus(n_clusters=3, feature=['Sepal_Width', 'Sepal_Length']), + 'LightGbmRanker': LightGbmRanker(feature=['Class', 'dep_day', 'duration'], + label='rank', + group_id='group'), + 'Loader': Loader(columns={'ImgPath': 'Path'}), + 'LpScaler': Pipeline([ + ColumnConcatenator() << { + 'concated_columns': [ + 'Petal_Length', + 'Sepal_Width', + 'Sepal_Length']}, + LpScaler(columns={'normed_columns': 'concated_columns'}) + ]), + 'MutualInformationSelector': Pipeline([ + ColumnConcatenator(columns={'Features': ['Sepal_Width', 'Sepal_Length', 'Petal_Width']}), + MutualInformationSelector( + 
columns='Features', + label='Label', + slots_in_output=2) # only accept one column + ]), + 'NaiveBayesClassifier': NaiveBayesClassifier(feature=['Sepal_Width', 'Sepal_Length']), + 'NGramFeaturizer': NGramFeaturizer(word_feature_extractor=Ngram(), + char_feature_extractor=Ngram(), + keep_diacritics=True, + columns={ 'features': ['SentimentText']}), + 'OneHotHashVectorizer': OneHotHashVectorizer(columns=['education_str']), + 'OneHotVectorizer': OneHotVectorizer(columns=['education_str']), + 'OneVsRestClassifier(AveragedPerceptronBinaryClassifier)': \ + OneVsRestClassifier(AveragedPerceptronBinaryClassifier(), + use_probabilities=True, + feature=['age', + 'education_str.0-5yrs', + 'education_str.6-11yrs', + 'education_str.12+ yrs'], + label='induced'), + 'OneVsRestClassifier(LinearSvmBinaryClassifier)': \ + OneVsRestClassifier(LinearSvmBinaryClassifier(), + use_probabilities=True, + feature=['age', + 'education_str.0-5yrs', + 'education_str.6-11yrs', + 'education_str.12+ yrs'], + label='induced'), + 'PcaAnomalyDetector': PcaAnomalyDetector(rank=3), + 'PcaTransformer': PcaTransformer(rank=2), + 'PixelExtractor': Pipeline([ + Loader(columns={'ImgPath': 'Path'}), + PixelExtractor(columns={'ImgPixels': 'ImgPath'}), + ]), + 'PrefixColumnConcatenator': PrefixColumnConcatenator(columns={'Features': 'Sepal_'}), + 'Resizer': Pipeline([ + Loader(columns={'ImgPath': 'Path'}), + Resizer(image_width=227, image_height=227, + columns={'ImgResize': 'ImgPath'}) + ]), + 'SkipFilter': SkipFilter(count=5), + 'SsaSpikeDetector': SsaSpikeDetector(columns=['Sepal_Length'], + seasonal_window_size=2), + 'SsaChangePointDetector': SsaChangePointDetector(columns=['Sepal_Length'], + seasonal_window_size=2), + 'SsaForecaster': SsaForecaster(columns=['Sepal_Length'], + window_size=2, + series_length=5, + train_size=5, + horizon=1), + 'RangeFilter': RangeFilter(min=5.0, max=5.1, columns=['Sepal_Length']), + 'TakeFilter': TakeFilter(count=100), + 'TensorFlowScorer': TensorFlowScorer( + model_location=os.path.join( + script_dir, + '..', + 'nimbusml', + 'examples', + 'frozen_saved_model.pb'), + columns={'c': ['a', 'b']}), + 'ToKey': ToKey(columns={'edu_1': 'education_str'}), + 'TypeConverter': TypeConverter(columns=['group'], result_type='R4'), + 'WordTokenizer': WordTokenizer(char_array_term_separators=[" "]) << {'wt': 'SentimentText'} +} + +DATASETS = { + 'AveragedPerceptronBinaryClassifier': infert_onehot_df, + 'Binner': iris_no_label_df, + 'BootstrapSampler': infert_df, + 'CharTokenizer': wiki_detox_df, + 'EnsembleRegressor': iris_regression_df, + 'FactorizationMachineBinaryClassifier': iris_binary_df, + 'FastForestBinaryClassifier': iris_no_label_df, + 'FastForestRegressor': iris_regression_df, + 'FastLinearBinaryClassifier': iris_no_label_df, + 'FastLinearClassifier': iris_binary_df, + 'FastLinearRegressor': iris_regression_df, + 'FastTreesBinaryClassifier': iris_binary_df, + 'FastTreesRegressor': iris_regression_df, + 'FastTreesTweedieRegressor': airquality_df, + 'Filter': iris_no_label_df, + 'GamBinaryClassifier': iris_binary_df, + 'GamRegressor': iris_regression_df, + 'GlobalContrastRowScaler': iris_df.astype(np.float32), + 'Handler': iris_with_nan_df, + 'Indicator': iris_with_nan_df, + 'LightGbmBinaryClassifier': iris_binary_df, + 'LightGbmRanker': gen_tt_df, + 'LinearSvmBinaryClassifier': iris_binary_df, + 'Loader': image_paths_df, + 'LogisticRegressionBinaryClassifier': iris_binary_df, + 'LogisticRegressionClassifier': iris_df, + 'LogMeanVarianceScaler': iris_no_label_df, + 'LpScaler': 
iris_no_label_df.drop(['Setosa'], axis=1).astype(np.float32), + 'MeanVarianceScaler': iris_no_label_df, + 'MinMaxScaler': iris_no_label_df, + 'NGramFeaturizer': wiki_detox_df, + 'OneHotHashVectorizer': infert_df, + 'OneHotVectorizer': infert_df, + 'OnlineGradientDescentRegressor': iris_regression_df, + 'OneVsRestClassifier(AveragedPerceptronBinaryClassifier)': infert_onehot_df, + 'OneVsRestClassifier(LinearSvmBinaryClassifier)': infert_onehot_df, + 'OrdinaryLeastSquaresRegressor': iris_regression_df, + 'PcaAnomalyDetector': iris_no_label_df, + 'PcaTransformer': iris_regression_df, + 'PixelExtractor': image_paths_df, + 'PoissonRegressionRegressor': iris_regression_df, + 'Resizer': image_paths_df, + 'SgdBinaryClassifier': iris_binary_df, + 'SymSgdBinaryClassifier': iris_binary_df, + 'ToKey': infert_df, + 'TypeConverter': gen_tt_df, + 'WordTokenizer': wiki_detox_df +} + +EXPECTED_RESULTS = { + 'AveragedPerceptronBinaryClassifier': {'cols': [('PredictedLabel', 'PredictedLabel', 'PredictedLabel.output')]}, + 'CharTokenizer': {'num_cols': 424, 'cols': 0}, + 'ColumnConcatenator': {'num_cols': 11, 'cols': 0}, + 'ColumnDuplicator': {'num_cols': 7, 'cols': 0}, + 'ColumnSelector': { + 'num_cols': 2, + 'cols': [('Sepal_Width', 'Sepal_Width', 'Sepal_Width.output'), + ('Sepal_Length', 'Sepal_Length', 'Sepal_Length.output')] + }, + #'EnsembleClassifier': {'cols': [('PredictedLabel', 'PredictedLabel')]}, + #'EnsembleRegressor': {'cols': [('Score', 'Score')]}, + 'FastForestBinaryClassifier': {'cols': [('PredictedLabel', 'PredictedLabel', 'PredictedLabel.output')]}, + 'FastForestRegressor': {'cols': [('Score', 'Score', 'Score.output')]}, + 'FastLinearBinaryClassifier': {'cols': [('PredictedLabel', 'PredictedLabel', 'PredictedLabel.output')]}, + 'FastLinearClassifier': {'cols': [('PredictedLabel', 'PredictedLabel', 'PredictedLabel.output')]}, + 'FastLinearRegressor': {'cols': [('Score', 'Score', 'Score.output')]}, + 'FastTreesBinaryClassifier': {'cols': [('PredictedLabel', 'PredictedLabel', 'PredictedLabel.output')]}, + 'FastTreesRegressor': {'cols': [('Score', 'Score', 'Score.output')]}, + 'FastTreesTweedieRegressor': {'cols': [('Score', 'Score', 'Score.output')]}, + 'FromKey': {'num_cols': 6, 'cols': 0}, + 'GlobalContrastRowScaler': {'num_cols': 12, 'cols': 0}, + 'Handler': {'num_cols': 8, 'cols': 0}, + 'Indicator': {'num_cols': 7, 'cols': 0}, + 'KMeansPlusPlus': {'cols': [('PredictedLabel', 'PredictedLabel', 'PredictedLabel.output')]}, + 'LightGbmBinaryClassifier': {'cols': [('PredictedLabel', 'PredictedLabel', 'PredictedLabel.output')]}, + 'LightGbmClassifier': {'cols': [('PredictedLabel', 'PredictedLabel', 'PredictedLabel.output')]}, + 'LightGbmRanker': {'cols': [('Score', 'Score', 'Score.output')]}, + 'LightGbmRegressor': {'cols': [('Score', 'Score', 'Score.output')]}, + 'LinearSvmBinaryClassifier': {'cols': [('PredictedLabel', 'PredictedLabel', 'PredictedLabel.output')]}, + 'LogisticRegressionBinaryClassifier': {'cols': [('PredictedLabel', 'PredictedLabel', 'PredictedLabel.output')]}, + 'LogisticRegressionClassifier': {'cols': [('PredictedLabel', 'PredictedLabel', 'PredictedLabel.output')]}, + 'LpScaler': {'num_cols': 10, 'cols': 0}, + 'MeanVarianceScaler': {'num_cols': 5, 'cols': 0}, + 'MinMaxScaler': {'num_cols': 5, 'cols': 0}, + 'MutualInformationSelector': {'num_cols': 8, 'cols': 0}, + 'NGramFeaturizer': {'num_cols': 273, 'cols': 0}, + 'NaiveBayesClassifier': {'cols': [('PredictedLabel', 'PredictedLabel', 'PredictedLabel.output')]}, + 'OneHotVectorizer': {'num_cols': 12, 'cols': 0}, + 
'OneVsRestClassifier(AveragedPerceptronBinaryClassifier)': \ + {'cols': [('PredictedLabel', 'PredictedLabel', 'PredictedLabel.output')]}, + 'OneVsRestClassifier(LinearSvmBinaryClassifier)': \ + {'cols': [('PredictedLabel', 'PredictedLabel', 'PredictedLabel.output')]}, + 'OnlineGradientDescentRegressor': {'cols': [('Score', 'Score', 'Score.output')]}, + 'OrdinaryLeastSquaresRegressor': {'cols': [('Score', 'Score', 'Score.output')]}, + 'PcaTransformer': {'num_cols': 9, 'cols': 0}, + 'PoissonRegressionRegressor': {'cols': [('Score', 'Score', 'Score.output')]}, + 'PrefixColumnConcatenator': {'num_cols': 8, 'cols': 0}, + 'SgdBinaryClassifier': {'cols': [('PredictedLabel', 'PredictedLabel', 'PredictedLabel.output')]}, + 'SymSgdBinaryClassifier': {'cols': [('PredictedLabel', 'PredictedLabel', 'PredictedLabel.output')]}, + 'ToKey': {'num_cols': 11, 'cols': 0}, + 'TypeConverter': {'num_cols': 8, 'cols': 0}, + 'WordTokenizer': {'num_cols': 73, 'cols': 0} +} + +SUPPORTED_ESTIMATORS = { + 'AveragedPerceptronBinaryClassifier', + 'CharTokenizer', + 'ColumnConcatenator', + 'ColumnDuplicator', + 'ColumnSelector', + 'CountSelector', + 'EnsembleClassifier', + 'EnsembleRegressor', + 'FastForestBinaryClassifier', + 'FastForestRegressor', + 'FastLinearBinaryClassifier', + 'FastLinearClassifier', + 'FastLinearRegressor', + 'FastTreesBinaryClassifier', + 'FastTreesRegressor', + 'FastTreesTweedieRegressor', + 'FromKey', + 'GlobalContrastRowScaler', + 'Handler', + 'Indicator', + 'KMeansPlusPlus', + 'LightGbmBinaryClassifier', + 'LightGbmClassifier', + 'LightGbmRanker', + 'LightGbmRegressor', + 'LinearSvmBinaryClassifier', + 'LogisticRegressionBinaryClassifier', + 'LogisticRegressionClassifier', + 'LpScaler', + 'MeanVarianceScaler', + 'MinMaxScaler', + 'MutualInformationSelector', + 'NaiveBayesClassifier', + 'OneHotVectorizer', + 'OnlineGradientDescentRegressor', + 'OrdinaryLeastSquaresRegressor', + 'PcaTransformer', + 'PoissonRegressionRegressor', + 'SgdBinaryClassifier', + 'SymSgdBinaryClassifier', + 'ToKey', + 'TypeConverter', + 'WordTokenizer' +} + + +class CaptureOutputContext(): + """ + Context which can be used for + capturing stdout and stderr. 
+ """ + def __enter__(self): + self.orig_stdout = sys.stdout + self.orig_stderr = sys.stderr + self.stdout_capturer = io.StringIO() + self.stderr_capturer = io.StringIO() + sys.stdout = self.stdout_capturer + sys.stderr = self.stderr_capturer + return self + + def __exit__(self, *args): + sys.stdout = self.orig_stdout + sys.stderr = self.orig_stderr + self.stdout = self.stdout_capturer.getvalue() + self.stderr = self.stderr_capturer.getvalue() + + if self.stdout: + print(self.stdout) + + if self.stderr: + print(self.stderr) + + # free up some memory + del self.stdout_capturer + del self.stderr_capturer + + +def get_tmp_file(suffix=None): + fd, file_name = tempfile.mkstemp(suffix=suffix) + fl = os.fdopen(fd, 'w') + fl.close() + return file_name + + +def get_file_size(file_path): + file_size = 0 + try: + file_size = os.path.getsize(file_path) + except: + pass + return file_size + + +def load_json(file_path): + with open(file_path) as f: + lines = f.readlines() + lines = [l for l in lines if not l.strip().startswith('#')] + content_without_comments = '\n'.join(lines) + return json.loads(content_without_comments) + + +def print_results(result_expected, result_onnx, result_onnx_ort): + print("\nML.Net Output (Expected Result):") + print(result_expected) + if not isinstance(result_expected, pd.Series): + print('Columns', result_expected.columns) + + print("\nOnnxRunner Result:") + print(result_onnx) + if not isinstance(result_onnx, pd.Series): + print('Columns', result_onnx.columns) + + print("\nORT Result:") + print(result_onnx_ort) + if not isinstance(result_onnx_ort, pd.Series): + print('Columns', result_onnx_ort.columns) + +def validate_results(class_name, result_expected, result_onnx, result_ort): + if not class_name in EXPECTED_RESULTS: + raise RuntimeError("ERROR: ONNX model executed but no results specified for comparison.") + + if 'num_cols' in EXPECTED_RESULTS[class_name]: + num_cols = EXPECTED_RESULTS[class_name]['num_cols'] + + if len(result_expected.columns) != num_cols: + raise RuntimeError("ERROR: The ML.Net output does not contain the expected number of columns.") + + if len(result_onnx.columns) != num_cols: + raise RuntimeError("ERROR: The ONNX output does not contain the expected number of columns.") + + if len(result_ort.columns) != num_cols: + raise RuntimeError("ERROR: The ORT output does not contain the expected number of columns.") + + col_tuples = EXPECTED_RESULTS[class_name]['cols'] + + if isinstance(col_tuples, int): + # If col_pairs is an int then slice the columns + # based on the value and use those pairs for comparison + col_tuples = list(zip(result_expected.columns[col_tuples:], + result_onnx.columns[col_tuples:], + result_ort.columns[col_tuples:])) + + if not col_tuples: + raise RuntimeError("ERROR: no columns specified for comparison of results.") + + for col_tuple in col_tuples: + try: + col_expected = result_expected.loc[:, col_tuple[0]] + col_onnx = result_onnx.loc[:, col_tuple[1]] + col_ort = result_ort.loc[:, col_tuple[2]] + + if isinstance(col_expected.dtype, pd.api.types.CategoricalDtype): + # ONNX does not export categorical columns so convert categorical + # columns received from ML.Net back to the original values before + # the comparison. 
+ col_expected = col_expected.astype(col_expected.dtype.categories.dtype) + + check_kwargs = { + 'check_names': False, + 'check_exact': False, + 'check_dtype': True, + 'check_less_precise': True + } + + pd.testing.assert_series_equal(col_expected, col_onnx, **check_kwargs) + pd.testing.assert_series_equal(col_expected, col_ort, **check_kwargs) + + except Exception as e: + print(e) + raise RuntimeError("ERROR: OnnxRunner result does not match expected result.") + + return True + + +def test_export_to_onnx(estimator, class_name): + """ + Fit and test an estimator and determine + if it supports exporting to the ONNX format. + """ + onnx_path = get_tmp_file('.onnx') + onnx_json_path = get_tmp_file('.onnx.json') + + output = None + exported = False + export_valid = False + + try: + dataset = DATASETS.get(class_name, iris_df) + estimator.fit(dataset) + + with CaptureOutputContext() as output: + estimator.export_to_onnx(onnx_path, + 'com.microsoft.ml', + dst_json=onnx_json_path, + onnx_version='Stable') + except Exception as e: + print(e) + + onnx_file_size = get_file_size(onnx_path) + onnx_json_file_size = get_file_size(onnx_json_path) + + if (output and + (onnx_file_size != 0) and + (onnx_json_file_size != 0) and + (not 'cannot save itself as ONNX' in output.stdout) and + (not 'Warning: We do not know how to save the predictor as ONNX' in output.stdout)): + + exported = True + + print('ONNX model path:', onnx_path) + + if SHOW_ONNX_JSON: + with open(onnx_json_path) as f: + print(json.dumps(json.load(f), indent=4)) + + # Verify that the output of the exported onnx graph + # produces the same results as the standard estimators. + if isinstance(estimator, BasePredictor): + result_expected = estimator.predict(dataset) + else: + result_expected = estimator.transform(dataset) + + if isinstance(result_expected, pd.Series): + result_expected = pd.DataFrame(result_expected) + + try: + onnxrunner = OnnxRunner(model_file=onnx_path) + result_onnx = onnxrunner.fit_transform(dataset) + df_tool = DFT(onnx_path) + result_ort = df_tool.execute(dataset, []) + + if SHOW_TRANSFORMED_RESULTS: + print_results(result_expected, result_onnx, result_ort) + + export_valid = validate_results(class_name, + result_expected, + result_onnx, + result_ort) + except Exception as e: + print(e) + + os.remove(onnx_path) + os.remove(onnx_json_path) + return {'exported': exported, 'export_valid': export_valid} + + +manifest_diff = os.path.join(script_dir, '..', 'tools', 'manifest_diff.json') +entry_points = load_json(manifest_diff)['EntryPoints'] +entry_points.extend([ + {'NewName': 'OneVsRestClassifier(AveragedPerceptronBinaryClassifier)'}, + {'NewName': 'OneVsRestClassifier(LinearSvmBinaryClassifier)'} +]) +entry_points = sorted(entry_points, key=lambda ep: ep['NewName']) + +exportable_estimators = set() +unexportable_estimators = set() +runable_estimators = set() + +for entry_point in entry_points: + class_name = entry_point['NewName'] + +# if not class_name in ['OneVsRestClassifier(LinearSvmBinaryClassifier)']: +# continue + + print('\n===========> %s' % class_name) + + if class_name in SKIP: + print("skipped") + continue + + if class_name in INSTANCES: + estimator = INSTANCES[class_name] + else: + mod = __import__('nimbusml.' 
+ entry_point['Module'], + fromlist=[str(class_name)]) + + the_class = getattr(mod, class_name) + estimator = the_class() + + result = test_export_to_onnx(estimator, class_name) + + if result['exported']: + exportable_estimators.add(class_name) + print('Estimator successfully exported to ONNX.') + + else: + unexportable_estimators.add(class_name) + print('Estimator could NOT be exported to ONNX.') + + if result['export_valid']: + runable_estimators.add(class_name) + print('Exported ONNX model successfully transformed with OnnxRunner.') + +print('\n=====================') +print('SUMMARY') +print('=====================') + +print('\nThe following estimators were skipped: ') +pprint.pprint(sorted(SKIP)) + +print('\nThe following estimators were successfully exported to ONNX:') +pprint.pprint(sorted(exportable_estimators)) + +print('\nThe following estimators could not be exported to ONNX: ') +pprint.pprint(sorted(unexportable_estimators)) + +failed_exports = SUPPORTED_ESTIMATORS.difference(exportable_estimators) +print("\nThe following estimators failed exporting to ONNX:") +pprint.pprint(sorted(failed_exports)) + +failed_e2e_estimators = exportable_estimators.difference(runable_estimators) +print("\nThe following tests exported to ONNX but failed the end to end test:") +pprint.pprint(sorted(failed_e2e_estimators)) + +print('\nThe following estimators successfully completed the end to end test: ') +pprint.pprint(sorted(runable_estimators)) +print() + +if len(failed_exports) + len(failed_e2e_estimators) > 0: + raise RuntimeError("ONNX export checks failed") + diff --git a/src/python/tools/entrypoint_compiler.py b/src/python/tools/entrypoint_compiler.py index ed829533..57b1b8de 100644 --- a/src/python/tools/entrypoint_compiler.py +++ b/src/python/tools/entrypoint_compiler.py @@ -1560,7 +1560,6 @@ def __init__(self, argument, inout): # dict self.default = argument.get('Default', Missing()) self.required = argument.get('Required', Missing()) self.aliases = argument.get('Aliases', Missing()) - self.pass_as = argument.get('PassAs', None) self.name_converted = convert_name(self.name) self.new_name_converted = convert_name( @@ -1615,7 +1614,7 @@ def get_body(self): "is_of_type=numbers.Real" body = template.format( inout=self.inout, - name=self.pass_as or self.name, + name=self.name, name_converted=self.name_converted, none_acceptable=not self.required) if not isinstance(self.range, Missing): @@ -1646,7 +1645,7 @@ def get_body(self): "none_acceptable={none_acceptable}, is_of_type=bool" body = template.format( inout=self.inout, - name=self.pass_as or self.name, + name=self.name, name_converted=self.name_converted, none_acceptable=not self.required) return body + ")" @@ -1693,7 +1692,7 @@ def get_body(self): template += ", is_column=True" body = template.format( inout=self.inout, - name=self.pass_as or self.name, + name=self.name, name_converted=self.name_converted, none_acceptable=not self.required) return body + ")" @@ -1717,7 +1716,7 @@ def get_body(self): "none_acceptable={none_acceptable}, is_of_type=str" body = template.format( inout=self.inout, - name=self.pass_as or self.name, + name=self.name, name_converted=self.name_converted, none_acceptable=not self.required) value_check = ", values={0}".format(str(self.type['Values'])) @@ -1748,7 +1747,7 @@ def get_body(self): "none_acceptable={none_acceptable}, is_of_type=list" body = template.format( inout=self.inout, - name=self.pass_as or self.name, + name=self.name, name_converted=self.name_converted, none_acceptable=not self.required) return body + 
")" @@ -1790,7 +1789,7 @@ def get_body(self): template += ', is_column=True' body = template.format( inout=self.inout, - name=self.pass_as or self.name, + name=self.name, name_converted=self.name_converted, none_acceptable=not self.required) return body + ")" @@ -1818,7 +1817,7 @@ def get_body(self): template += ', is_column=True' body = template.format( inout=self.inout, - name=self.pass_as or self.name, + name=self.name, name_converted=self.name_converted, none_acceptable=not self.required) return body + ")" @@ -1846,7 +1845,7 @@ def get_body(self): "none_acceptable={none_acceptable}, is_of_type=dict" body = template.format( inout=self.inout, - name=self.pass_as or self.name, + name=self.name, name_converted=self.name_converted, none_acceptable=not self.required) return body + ")" @@ -1882,7 +1881,7 @@ def get_body(self): template += ", is_column=True" body = template.format( inout=self.inout, - name=self.pass_as or self.name, + name=self.name, name_converted=self.name_converted, none_acceptable=not self.required) field_check = ", field_names={0}".format( @@ -2041,6 +2040,7 @@ def generate_code(pkg_path, generate_entrypoints, generate_api): script_args = arg_parser.parse_args() pkg_path = os.path.join(my_dir, r'..\nimbusml') + if script_args.check_manual_changes: verbose = False if script_args.folder == 'temp': diff --git a/src/python/tools/manifest.json b/src/python/tools/manifest.json index e54ff2c2..fd7f7950 100644 --- a/src/python/tools/manifest.json +++ b/src/python/tools/manifest.json @@ -1731,14 +1731,14 @@ { "Name": "Slope", "Type": "Float", - "Desc": "The slope parameter of the calibration function 1 / (1 + exp(-slope * x + offset)", + "Desc": "The slope parameter of the calibration function 1 / (1 + exp(slope * x + offset)", "Aliases": [ "slope" ], "Required": false, "SortOrder": 1.0, "IsNullable": false, - "Default": 1.0 + "Default": -1.0 }, { "Name": "Data", @@ -1762,7 +1762,7 @@ { "Name": "Offset", "Type": "Float", - "Desc": "The offset parameter of the calibration function 1 / (1 + exp(-slope * x + offset)", + "Desc": "The offset parameter of the calibration function 1 / (1 + exp(slope * x + offset)", "Aliases": [ "offset" ], @@ -1903,9 +1903,9 @@ } }, { - "Name": "SupressScoresAndLabels", + "Name": "SuppressScoresAndLabels", "Type": "Bool", - "Desc": "Supress labels and scores in per-instance outputs?", + "Desc": "Suppress labels and scores in per-instance outputs?", "Aliases": [ "noScores" ], @@ -2194,6 +2194,203 @@ "ITrainerInput" ] }, + { + "Name": "Models.OnnxConverter", + "Desc": "Converts the model to ONNX format.", + "FriendlyName": "ONNX Converter.", + "ShortName": null, + "Inputs": [ + { + "Name": "DataFile", + "Type": "String", + "Desc": "The data file", + "Aliases": [ + "data" + ], + "Required": false, + "SortOrder": 0.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Onnx", + "Type": "String", + "Desc": "The path to write the output ONNX to.", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "Json", + "Type": "String", + "Desc": "The path to write the output JSON to.", + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Name", + "Type": "String", + "Desc": "The 'name' property in the output ONNX. 
By default this will be the ONNX extension-less name.", + "Required": false, + "SortOrder": 3.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Domain", + "Type": "String", + "Desc": "The 'domain' property in the output ONNX.", + "Required": false, + "SortOrder": 4.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "InputsToDrop", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "Array of input column names to drop", + "Required": false, + "SortOrder": 6.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "OutputsToDrop", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "Array of output column names to drop", + "Required": false, + "SortOrder": 8.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "Model that needs to be converted to ONNX format.", + "Required": false, + "SortOrder": 10.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "OnnxVersion", + "Type": { + "Kind": "Enum", + "Values": [ + "Stable", + "Experimental" + ] + }, + "Desc": "The targeted ONNX version. It can be either \"Stable\" or \"Experimental\". If \"Experimental\" is used, produced model can contain components that is not officially supported in ONNX standard.", + "Required": false, + "SortOrder": 11.0, + "IsNullable": false, + "Default": "Stable" + }, + { + "Name": "PredictiveModel", + "Type": "PredictorModel", + "Desc": "Predictor model that needs to be converted to ONNX format.", + "Required": false, + "SortOrder": 12.0, + "IsNullable": false, + "Default": null + } + ], + "Outputs": [] + }, + { + "Name": "Models.OnnxTransformer", + "Desc": "Applies an ONNX model to a dataset.", + "FriendlyName": "Onnx Transformer", + "ShortName": "onnx-xf", + "Inputs": [ + { + "Name": "ModelFile", + "Type": "String", + "Desc": "Path to the onnx model file.", + "Aliases": [ + "model" + ], + "Required": true, + "SortOrder": 0.0, + "IsNullable": false + }, + { + "Name": "InputColumns", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "Name of the input column.", + "Required": false, + "SortOrder": 1.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "OutputColumns", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "Name of the output column.", + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "GpuDeviceId", + "Type": "Int", + "Desc": "GPU device id to run on (e.g. 0,1,..). Null for CPU. Requires CUDA 9.1.", + "Required": false, + "SortOrder": 3.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "FallbackToCpu", + "Type": "Bool", + "Desc": "If true, resumes execution on CPU upon GPU error. 
If false, will raise the GPU execption.", + "Required": false, + "SortOrder": 4.0, + "IsNullable": false, + "Default": false + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "ONNX transformed dataset" + }, + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + "ITransformInput" + ] + }, { "Name": "Models.OvaModelCombiner", "Desc": "Combines a sequence of PredictorModels into a single model", @@ -3981,7 +4178,7 @@ { "Name": "TrainSize", "Type": "Int", - "Desc": "The length of series from the begining used for training.", + "Desc": "The length of series from the beginning used for training.", "Required": true, "SortOrder": 2.0, "IsNullable": false, @@ -10418,7 +10615,7 @@ }, { "Name": "Trainers.GeneralizedAdditiveModelBinaryClassifier", - "Desc": "Trains a gradient boosted stump per feature, on all features simultaneously, to fit target values using least-squares. It mantains no interactions between features.", + "Desc": "Trains a gradient boosted stump per feature, on all features simultaneously, to fit target values using least-squares. It maintains no interactions between features.", "FriendlyName": "Generalized Additive Model for Binary Classification", "ShortName": "gam", "Inputs": [ @@ -10718,7 +10915,7 @@ }, { "Name": "Trainers.GeneralizedAdditiveModelRegressor", - "Desc": "Trains a gradient boosted stump per feature, on all features simultaneously, to fit target values using least-squares. It mantains no interactions between features.", + "Desc": "Trains a gradient boosted stump per feature, on all features simultaneously, to fit target values using least-squares. It maintains no interactions between features.", "FriendlyName": "Generalized Additive Model for Regression", "ShortName": "gamr", "Inputs": [ @@ -11546,6 +11743,9 @@ "Name": "HandleMissingValue", "Type": "Bool", "Desc": "Enable special handling of missing value or not.", + "Aliases": [ + "hmv" + ], "Required": false, "SortOrder": 150.0, "IsNullable": false, @@ -11558,6 +11758,25 @@ ] } }, + { + "Name": "UseZeroAsMissingValue", + "Type": "Bool", + "Desc": "Enable usage of zero (0) as missing value.", + "Aliases": [ + "uzam" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + true, + false + ] + } + }, { "Name": "MinimumExampleCountPerGroup", "Type": "Int", @@ -12043,6 +12262,9 @@ "Name": "HandleMissingValue", "Type": "Bool", "Desc": "Enable special handling of missing value or not.", + "Aliases": [ + "hmv" + ], "Required": false, "SortOrder": 150.0, "IsNullable": false, @@ -12055,6 +12277,25 @@ ] } }, + { + "Name": "UseZeroAsMissingValue", + "Type": "Bool", + "Desc": "Enable usage of zero (0) as missing value.", + "Aliases": [ + "uzam" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + true, + false + ] + } + }, { "Name": "MinimumExampleCountPerGroup", "Type": "Int", @@ -12540,6 +12781,9 @@ "Name": "HandleMissingValue", "Type": "Bool", "Desc": "Enable special handling of missing value or not.", + "Aliases": [ + "hmv" + ], "Required": false, "SortOrder": 150.0, "IsNullable": false, @@ -12552,6 +12796,25 @@ ] } }, + { + "Name": "UseZeroAsMissingValue", + "Type": "Bool", + "Desc": "Enable usage of zero (0) as missing value.", + "Aliases": [ + "uzam" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false, + 
"SweepRange": { + "RangeType": "Discrete", + "Values": [ + true, + false + ] + } + }, { "Name": "MinimumExampleCountPerGroup", "Type": "Int", @@ -12998,6 +13261,9 @@ "Name": "HandleMissingValue", "Type": "Bool", "Desc": "Enable special handling of missing value or not.", + "Aliases": [ + "hmv" + ], "Required": false, "SortOrder": 150.0, "IsNullable": false, @@ -13010,6 +13276,25 @@ ] } }, + { + "Name": "UseZeroAsMissingValue", + "Type": "Bool", + "Desc": "Enable usage of zero (0) as missing value.", + "Aliases": [ + "uzam" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + true, + false + ] + } + }, { "Name": "MinimumExampleCountPerGroup", "Type": "Int", @@ -13417,10 +13702,10 @@ ] }, { - "Name": "Trainers.LogisticRegressionBinaryClassifier", - "Desc": "Logistic Regression is a method in statistics used to predict the probability of occurrence of an event and can be used as a classification algorithm. The algorithm predicts the probability of occurrence of an event by fitting data to a logistical function.", - "FriendlyName": "Logistic Regression", - "ShortName": "lr", + "Name": "Trainers.LocalDeepSvmBinaryClassifier", + "Desc": "LD-SVM learns a binary, non-linear SVM classifier with a kernel that is specifically designed to reduce prediction time. LD-SVM learns decision boundaries that are locally linear.", + "FriendlyName": "Local Deep SVM (LDSVM)", + "ShortName": "LDSVM", "Inputs": [ { "Name": "TrainingData", @@ -13509,30 +13794,312 @@ "Default": "Auto" }, { - "Name": "ShowTrainingStatistics", - "Type": "Bool", - "Desc": "Show statistics of training examples.", + "Name": "TreeDepth", + "Type": "Int", + "Desc": "Depth of Local Deep SVM tree", "Aliases": [ - "stat", - "ShowTrainingStats" + "depth" ], "Required": false, "SortOrder": 50.0, "IsNullable": false, - "Default": false + "Default": 3, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 1, + 3, + 5, + 7 + ] + } }, { - "Name": "L2Regularization", + "Name": "LambdaW", "Type": "Float", - "Desc": "L2 regularization weight", + "Desc": "Regularizer for classifier parameter W", "Aliases": [ - "l2", - "L2Weight" + "lw" ], "Required": false, "SortOrder": 50.0, "IsNullable": false, - "Default": 1.0, + "Default": 0.1, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 0.1, + 0.01, + 0.001 + ] + } + }, + { + "Name": "LambdaTheta", + "Type": "Float", + "Desc": "Regularizer for kernel parameter Theta", + "Aliases": [ + "lt" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": 0.01, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 0.1, + 0.01, + 0.001 + ] + } + }, + { + "Name": "LambdaThetaprime", + "Type": "Float", + "Desc": "Regularizer for kernel parameter Thetaprime", + "Aliases": [ + "lp" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": 0.01, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 0.1, + 0.01, + 0.001 + ] + } + }, + { + "Name": "Sigma", + "Type": "Float", + "Desc": "Parameter for sigmoid sharpness", + "Aliases": [ + "s" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": 1.0, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + 1.0, + 0.1, + 0.01 + ] + } + }, + { + "Name": "NumberOfIterations", + "Type": "Int", + "Desc": "Number of iterations", + "Aliases": [ + "iter", + "NumIterations" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": 15000, + 
"SweepRange": { + "RangeType": "Discrete", + "Values": [ + 10000, + 15000 + ] + } + }, + { + "Name": "UseBias", + "Type": "Bool", + "Desc": "No bias", + "Aliases": [ + "bias" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": true, + "SweepRange": { + "RangeType": "Discrete", + "Values": [ + false, + true + ] + } + }, + { + "Name": "Calibrator", + "Type": { + "Kind": "Component", + "ComponentKind": "CalibratorTrainer" + }, + "Desc": "The calibrator kind to apply to the predictor. Specify null for no calibration", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": { + "Name": "PlattCalibrator" + } + }, + { + "Name": "MaxCalibrationExamples", + "Type": "Int", + "Desc": "The maximum number of examples to use when training the calibrator", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 1000000 + }, + { + "Name": "Cache", + "Type": "Bool", + "Desc": "Whether to cache the data before the first iteration", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": true + } + ], + "Outputs": [ + { + "Name": "PredictorModel", + "Type": "PredictorModel", + "Desc": "The trained model" + } + ], + "InputKind": [ + "ITrainerInputWithWeight", + "ITrainerInputWithLabel", + "ITrainerInput" + ], + "OutputKind": [ + "IBinaryClassificationOutput", + "ITrainerOutput" + ] + }, + { + "Name": "Trainers.LogisticRegressionBinaryClassifier", + "Desc": "Logistic Regression is a method in statistics used to predict the probability of occurrence of an event and can be used as a classification algorithm. The algorithm predicts the probability of occurrence of an event by fitting data to a logistical function.", + "FriendlyName": "Logistic Regression", + "ShortName": "lr", + "Inputs": [ + { + "Name": "TrainingData", + "Type": "DataView", + "Desc": "The data to be used for training", + "Aliases": [ + "data" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "FeatureColumnName", + "Type": "String", + "Desc": "Column to use for features", + "Aliases": [ + "feat" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": "Features" + }, + { + "Name": "LabelColumnName", + "Type": "String", + "Desc": "Column to use for labels", + "Aliases": [ + "lab" + ], + "Required": false, + "SortOrder": 3.0, + "IsNullable": false, + "Default": "Label" + }, + { + "Name": "ExampleWeightColumnName", + "Type": "String", + "Desc": "Column to use for example weight", + "Aliases": [ + "weight" + ], + "Required": false, + "SortOrder": 4.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "NormalizeFeatures", + "Type": { + "Kind": "Enum", + "Values": [ + "No", + "Warn", + "Auto", + "Yes" + ] + }, + "Desc": "Normalize option for the feature column", + "Aliases": [ + "norm" + ], + "Required": false, + "SortOrder": 5.0, + "IsNullable": false, + "Default": "Auto" + }, + { + "Name": "Caching", + "Type": { + "Kind": "Enum", + "Values": [ + "Auto", + "Memory", + "None" + ] + }, + "Desc": "Whether trainer should cache input training data", + "Aliases": [ + "cache" + ], + "Required": false, + "SortOrder": 6.0, + "IsNullable": false, + "Default": "Auto" + }, + { + "Name": "ShowTrainingStatistics", + "Type": "Bool", + "Desc": "Show statistics of training examples.", + "Aliases": [ + "stat", + "ShowTrainingStats" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": false + }, + { + "Name": "L2Regularization", + "Type": "Float", + "Desc": "L2 
regularization weight", + "Aliases": [ + "l2", + "L2Weight" + ], + "Required": false, + "SortOrder": 50.0, + "IsNullable": false, + "Default": 1.0, "SweepRange": { "RangeType": "Float", "Min": 0.0, @@ -13739,7 +14306,7 @@ }, { "Name": "Trainers.LogisticRegressionClassifier", - "Desc": "Maximum entrypy classification is a method in statistics used to predict the probabilities of parallel events. The model predicts the probabilities of parallel events by fitting data to a softmax function.", + "Desc": "Maximum entropy classification is a method in statistics used to predict the probabilities of parallel events. The model predicts the probabilities of parallel events by fitting data to a softmax function.", "FriendlyName": "Multi-class Logistic Regression", "ShortName": "mlr", "Inputs": [ @@ -17211,6 +17778,82 @@ "ITransformOutput" ] }, + { + "Name": "Transforms.CategoryImputer", + "Desc": "Fills in missing values in a column based on the most frequent value", + "FriendlyName": "CategoryImputer", + "ShortName": "CategoryImputer", + "Inputs": [ + { + "Name": "Column", + "Type": { + "Kind": "Array", + "ItemType": { + "Kind": "Struct", + "Fields": [ + { + "Name": "Name", + "Type": "String", + "Desc": "Name of the new column", + "Aliases": [ + "name" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Source", + "Type": "String", + "Desc": "Name of the source column", + "Aliases": [ + "src" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + } + ] + } + }, + "Desc": "New column definition (optional form: name:src)", + "Aliases": [ + "col" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" + }, + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + }, { "Name": "Transforms.CharacterTokenizer", "Desc": "Character-oriented tokenizer where text is considered a sequence of characters.", @@ -18032,6 +18675,117 @@ } ] }, + { + "Name": "Transforms.DateTimeSplitter", + "Desc": "Splits a date time value into each individual component", + "FriendlyName": "DateTime Transform", + "ShortName": "DateTimeTransform", + "Inputs": [ + { + "Name": "Source", + "Type": "String", + "Desc": "Input column", + "Aliases": [ + "src" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "Prefix", + "Type": "String", + "Desc": "Output column prefix", + "Aliases": [ + "pre" + ], + "Required": true, + "SortOrder": 2.0, + "IsNullable": false + }, + { + "Name": "Country", + "Type": { + "Kind": "Enum", + "Values": [ + "None", + "Argentina", + "Australia", + "Austria", + "Belarus", + "Belgium", + "Brazil", + "Canada", + "Colombia", + "Croatia", + "Czech", + "Denmark", + "England", + "Finland", + "France", + "Germany", + "Hungary", + "India", + "Ireland", + "IsleofMan", + "Italy", + "Japan", + "Mexico", + "Netherlands", + "NewZealand", + "NorthernIreland", + "Norway", + "Poland", + "Portugal", + "Scotland", + "Slovenia", + "SouthAfrica", + "Spain", + "Sweden", + "Switzerland", + "Ukraine", + 
"UnitedKingdom", + "UnitedStates", + "Wales" + ] + }, + "Desc": "Country to get holidays for. Defaults to none if not passed", + "Aliases": [ + "ctry" + ], + "Required": false, + "SortOrder": 4.0, + "IsNullable": false, + "Default": "None" + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" + }, + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + }, { "Name": "Transforms.Dictionarizer", "Desc": "Converts input values (words, numbers, etc.) to index in a dictionary.", @@ -20637,7 +21391,7 @@ }, { "Name": "Transforms.MissingValueHandler", - "Desc": "Handle missing values by replacing them with either the default value or the mean/min/max value (for non-text columns only). An indicator column can optionally be concatenated, if theinput column type is numeric.", + "Desc": "Handle missing values by replacing them with either the default value or the mean/min/max value (for non-text columns only). An indicator column can optionally be concatenated, if the input column type is numeric.", "FriendlyName": "NA Handle Transform", "ShortName": "NAHandle", "Inputs": [ @@ -21814,13 +22568,98 @@ ], "Required": false, "SortOrder": 150.0, - "IsNullable": false, + "IsNullable": false, + "Default": null + } + ] + } + }, + "Desc": "New column definition(s) (optional form: name:src)", + "Aliases": [ + "col" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" + }, + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + }, + { + "Name": "Transforms.RandomNumberGenerator", + "Desc": "Adds a column with a generated number sequence.", + "FriendlyName": "Generate Number Transform", + "ShortName": "Generate", + "Inputs": [ + { + "Name": "Column", + "Type": { + "Kind": "Array", + "ItemType": { + "Kind": "Struct", + "Fields": [ + { + "Name": "Name", + "Type": "String", + "Desc": "Name of the new column", + "Aliases": [ + "name" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "UseCounter", + "Type": "Bool", + "Desc": "Use an auto-incremented integer starting at zero instead of a random number", + "Aliases": [ + "cnt" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, + "Default": null + }, + { + "Name": "Seed", + "Type": "UInt", + "Desc": "The random seed", + "Required": false, + "SortOrder": 150.0, + "IsNullable": true, "Default": null } ] } }, - "Desc": "New column definition(s) (optional form: name:src)", + "Desc": "New column definition(s) (optional form: name:seed)", "Aliases": [ "col" ], @@ -21835,6 +22674,27 @@ "Required": true, "SortOrder": 1.0, "IsNullable": false + }, + { + "Name": "UseCounter", + "Type": "Bool", + "Desc": "Use an auto-incremented integer starting at zero instead of a random number", + "Aliases": [ + "cnt" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": false + }, + { + "Name": "Seed", + "Type": "UInt", + "Desc": "The random seed", + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": 42 } ], "Outputs": [ @@ 
-21857,10 +22717,10 @@ ] }, { - "Name": "Transforms.RandomNumberGenerator", - "Desc": "Adds a column with a generated number sequence.", - "FriendlyName": "Generate Number Transform", - "ShortName": "Generate", + "Name": "Transforms.RobustScaler", + "Desc": "Removes the median and scales the data according to the quantile range.", + "FriendlyName": "RobustScalerTransformer", + "ShortName": "RobScalT", "Inputs": [ { "Name": "Column", @@ -21882,30 +22742,21 @@ "Default": null }, { - "Name": "UseCounter", - "Type": "Bool", - "Desc": "Use an auto-incremented integer starting at zero instead of a random number", + "Name": "Source", + "Type": "String", + "Desc": "Name of the source column", "Aliases": [ - "cnt" + "src" ], "Required": false, "SortOrder": 150.0, - "IsNullable": true, - "Default": null - }, - { - "Name": "Seed", - "Type": "UInt", - "Desc": "The random seed", - "Required": false, - "SortOrder": 150.0, - "IsNullable": true, + "IsNullable": false, "Default": null } ] } }, - "Desc": "New column definition(s) (optional form: name:seed)", + "Desc": "New column definition (optional form: name:src)", "Aliases": [ "col" ], @@ -21922,25 +22773,52 @@ "IsNullable": false }, { - "Name": "UseCounter", + "Name": "Center", "Type": "Bool", - "Desc": "Use an auto-incremented integer starting at zero instead of a random number", + "Desc": "If True, center the data before scaling.", "Aliases": [ - "cnt" + "ctr" ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 2.0, "IsNullable": false, - "Default": false + "Default": true }, { - "Name": "Seed", - "Type": "UInt", - "Desc": "The random seed", + "Name": "Scale", + "Type": "Bool", + "Desc": "If True, scale the data to interquartile range.", + "Aliases": [ + "sc" + ], "Required": false, - "SortOrder": 150.0, + "SortOrder": 3.0, "IsNullable": false, - "Default": 42 + "Default": true + }, + { + "Name": "QuantileMin", + "Type": "Float", + "Desc": "Min for the quantile range used to calculate scale.", + "Aliases": [ + "min" + ], + "Required": false, + "SortOrder": 4.0, + "IsNullable": false, + "Default": 25.0 + }, + { + "Name": "QuantileMax", + "Type": "Float", + "Desc": "Max for the quantile range used to calculate scale.", + "Aliases": [ + "max" + ], + "Required": false, + "SortOrder": 5.0, + "IsNullable": false, + "Default": 75.0 } ], "Outputs": [ @@ -22972,6 +23850,206 @@ "ITransformOutput" ] }, + { + "Name": "Transforms.TimeSeriesImputer", + "Desc": "Fills in missing row and values", + "FriendlyName": "TimeSeriesImputer", + "ShortName": "tsi", + "Inputs": [ + { + "Name": "TimeSeriesColumn", + "Type": "String", + "Desc": "Column representing the time", + "Aliases": [ + "time" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "GrainColumns", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "List of grain columns", + "Aliases": [ + "grains" + ], + "Required": true, + "SortOrder": 2.0, + "IsNullable": false + }, + { + "Name": "FilterColumns", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "Columns to filter", + "Aliases": [ + "filters" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "FilterMode", + "Type": { + "Kind": "Enum", + "Values": [ + "NoFilter", + "Include", + "Exclude" + ] + }, + "Desc": "Filter mode. 
Either include or exclude", + "Aliases": [ + "fmode" + ], + "Required": false, + "SortOrder": 3.0, + "IsNullable": false, + "Default": "Exclude" + }, + { + "Name": "ImputeMode", + "Type": { + "Kind": "Enum", + "Values": [ + "ForwardFill", + "BackFill", + "Median" + ] + }, + "Desc": "Mode for imputing, defaults to ForwardFill if not provided", + "Aliases": [ + "mode" + ], + "Required": false, + "SortOrder": 3.0, + "IsNullable": false, + "Default": "ForwardFill" + }, + { + "Name": "SupressTypeErrors", + "Type": "Bool", + "Desc": "Suppress the errors that would occur if a column and impute mode are incompatible. If true, will skip the column. If false, will stop and throw an error.", + "Aliases": [ + "error" + ], + "Required": false, + "SortOrder": 3.0, + "IsNullable": false, + "Default": false + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" + }, + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + }, + { + "Name": "Transforms.ToString", + "Desc": "Turns the given column into a column of its string representation", + "FriendlyName": "ToString Transform", + "ShortName": "tostr", + "Inputs": [ + { + "Name": "Column", + "Type": { + "Kind": "Array", + "ItemType": { + "Kind": "Struct", + "Fields": [ + { + "Name": "Name", + "Type": "String", + "Desc": "Name of the new column", + "Aliases": [ + "name" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Source", + "Type": "String", + "Desc": "Name of the source column", + "Aliases": [ + "src" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + } + ] + } + }, + "Desc": "New column definition (optional form: name:src)", + "Aliases": [ + "col" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" + }, + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + }, { "Name": "Transforms.TrainTestDatasetSplitter", "Desc": "Split the dataset into train and test sets", @@ -24277,19 +25355,19 @@ { "Name": "Slope", "Type": "Float", - "Desc": "The slope parameter of f(x) = 1 / (1 + exp(-slope * x + offset)", + "Desc": "The slope parameter of f(x) = 1 / (1 + exp(slope * x + offset)", "Aliases": [ "a" ], "Required": false, "SortOrder": 150.0, "IsNullable": false, - "Default": 1.0 + "Default": -1.0 }, { "Name": "Offset", "Type": "Float", - "Desc": "The offset parameter of f(x) = 1 / (1 + exp(-slope * x + offset)", + "Desc": "The offset parameter of f(x) = 1 / (1 + exp(slope * x + offset)", "Aliases": [ "b" ], diff --git a/src/python/tools/manifest_diff.json b/src/python/tools/manifest_diff.json index d56f33de..a70489ee 100644 --- a/src/python/tools/manifest_diff.json +++ b/src/python/tools/manifest_diff.json @@ -293,6 +293,24 @@ "Module": "preprocessing", "Type": "Transform" }, + { + "Name": "Transforms.CategoryImputer", + "NewName": "ToKeyImputer", + "Module": "preprocessing", + "Type": "Transform" + }, + { + "Name": "Transforms.ToString", + "NewName": "ToString", + "Module": "preprocessing", + "Type": "Transform" + }, + { + 
"Name": "Transforms.DateTimeSplitter", + "NewName": "DateTimeSplitter", + "Module": "preprocessing", + "Type": "Transform" + }, { "Name": "Transforms.TensorFlowScorer", "NewName": "TensorFlowScorer", @@ -329,6 +347,12 @@ "Module": "preprocessing", "Type": "Transform" }, + { + "Name": "Models.OnnxTransformer", + "NewName": "OnnxRunner", + "Module": "preprocessing", + "Type": "Transform" + }, { "Name": "Trainers.FieldAwareFactorizationMachineBinaryClassifier", "NewName": "FactorizationMachineBinaryClassifier", @@ -492,6 +516,12 @@ "Module": "preprocessing.normalization", "Type": "Transform" }, + { + "Name": "Transforms.RobustScaler", + "NewName": "RobustScaler", + "Module": "preprocessing.normalization", + "Type": "Transform" + }, { "Name": "Transforms.MissingValuesRowDropper", "NewName": "Filter", @@ -609,6 +639,12 @@ "Module": "timeseries", "Type": "Transform" }, + { + "Name": "Transforms.TimeSeriesImputer", + "NewName": "TimeSeriesImputer", + "Module": "timeseries", + "Type": "Transform" + }, { "Name": "Trainers.PoissonRegressor", "NewName": "PoissonRegressionRegressor", diff --git a/version.txt b/version.txt index 2eda823f..bd8bf882 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -1.6.1 \ No newline at end of file +1.7.0