diff --git a/.vsts-ci.yml b/.vsts-ci.yml
index b217ab07..2379d4cd 100644
--- a/.vsts-ci.yml
+++ b/.vsts-ci.yml
@@ -12,8 +12,6 @@ phases:
_configuration: RlsWinPy3.6
Py35:
_configuration: RlsWinPy3.5
- Py27:
- _configuration: RlsWinPy2.7
buildQueue:
name: Hosted VS2017
diff --git a/build.cmd b/build.cmd
index 36ba13ef..af4b6d75 100644
--- a/build.cmd
+++ b/build.cmd
@@ -350,6 +350,7 @@ copy "%BuildOutputDir%%Configuration%\pybridge.pyd" "%__currentScriptDir%src\py
if %PythonVersion% == 2.7 (
copy "%BuildOutputDir%%Configuration%\Platform\win-x64\publish\*.dll" "%__currentScriptDir%src\python\nimbusml\internal\libs\"
+ xcopy /S /E /I "%BuildOutputDir%%Configuration%\Platform\win-x64\publish\Data" "%__currentScriptDir%src\python\nimbusml\internal\libs\Data"
:: remove dataprep dlls as its not supported in python 2.7
del "%__currentScriptDir%src\python\nimbusml\internal\libs\Microsoft.DPrep.*"
del "%__currentScriptDir%src\python\nimbusml\internal\libs\Microsoft.Data.*"
@@ -360,6 +361,7 @@ if %PythonVersion% == 2.7 (
del "%__currentScriptDir%src\python\nimbusml\internal\libs\Microsoft.Workbench.Messaging.SDK.dll"
) else (
for /F "tokens=*" %%A in (build/libs_win.txt) do copy "%BuildOutputDir%%Configuration%\Platform\win-x64\publish\%%A" "%__currentScriptDir%src\python\nimbusml\internal\libs\"
+ xcopy /S /E /I "%BuildOutputDir%%Configuration%\Platform\win-x64\publish\Data" "%__currentScriptDir%src\python\nimbusml\internal\libs\Data"
)
if "%DebugBuild%" == "True" (
@@ -394,6 +396,7 @@ if "%InstallPythonPackages%" == "True" (
call "%PythonExe%" -m pip install --upgrade pyzmq
) else (
call "%PythonExe%" -m pip install --upgrade "azureml-dataprep>=1.1.33"
+ call "%PythonExe%" -m pip install --upgrade onnxruntime
)
call "%PythonExe%" -m pip install --upgrade "%__currentScriptDir%target\%WheelFile%"
diff --git a/build.sh b/build.sh
index 2e6b7d7f..2038c95d 100755
--- a/build.sh
+++ b/build.sh
@@ -213,6 +213,7 @@ then
cp "${BuildOutputDir}/${__configuration}/Platform/${PublishDir}"/publish/System.Native.a "${__currentScriptDir}/src/python/nimbusml/internal/libs/"
cp "${BuildOutputDir}/${__configuration}/Platform/${PublishDir}"/publish/createdump "${__currentScriptDir}/src/python/nimbusml/internal/libs/" || :
cp "${BuildOutputDir}/${__configuration}/Platform/${PublishDir}"/publish/sosdocsunix.txt "${__currentScriptDir}/src/python/nimbusml/internal/libs/"
+ cp -r "${BuildOutputDir}/${__configuration}/Platform/${PublishDir}"/publish/Data "${__currentScriptDir}/src/python/nimbusml/internal/libs/."
ext=*.so
if [ "$(uname -s)" = "Darwin" ]
then
@@ -241,6 +242,7 @@ then
cat build/${libs_txt} | while read i; do
cp "${BuildOutputDir}/${__configuration}/Platform/${PublishDir}"/publish/$i "${__currentScriptDir}/src/python/nimbusml/internal/libs/"
done
+ cp -r "${BuildOutputDir}/${__configuration}/Platform/${PublishDir}"/publish/Data "${__currentScriptDir}/src/python/nimbusml/internal/libs/."
fi
if [[ $__configuration = Dbg* ]]
@@ -291,6 +293,7 @@ then
fi
"${PythonExe}" -m pip install --upgrade "azureml-dataprep>=1.1.33"
+ "${PythonExe}" -m pip install --upgrade onnxruntime
fi
"${PythonExe}" -m pip install --upgrade "${Wheel}"
"${PythonExe}" -m pip install "scikit-learn==0.19.2"
diff --git a/build/libs_linux.txt b/build/libs_linux.txt
index 41953f3f..b7298fef 100644
--- a/build/libs_linux.txt
+++ b/build/libs_linux.txt
@@ -2,6 +2,7 @@ Google.Protobuf.dll
Newtonsoft.Json.dll
libCpuMathNative.so
libFastTreeNative.so
+libFeaturizers.so
libLdaNative.so
libMklImports.so
libMklProxyNative.so
diff --git a/build/libs_mac.txt b/build/libs_mac.txt
index 8b5066ed..497791e8 100644
--- a/build/libs_mac.txt
+++ b/build/libs_mac.txt
@@ -2,6 +2,7 @@ Google.Protobuf.dll
Newtonsoft.Json.dll
libCpuMathNative.dylib
libFastTreeNative.dylib
+libFeaturizers.dylib
libLdaNative.dylib
libMklImports.dylib
libMklProxyNative.dylib
diff --git a/build/libs_win.txt b/build/libs_win.txt
index 7ef9cca7..2b0baca8 100644
--- a/build/libs_win.txt
+++ b/build/libs_win.txt
@@ -8,8 +8,10 @@ libiomp5md.dll
MklImports.dll
MklProxyNative.dll
SymSgdNative.dll
+Featurizers.dll
tensorflow.dll
TensorFlow.NET.dll
NumSharp.Core.dll
System.Drawing.Common.dll
Microsoft.ML.*
+onnxruntime.dll
diff --git a/nuget.config b/nuget.config
index cedba361..9265d7b5 100644
--- a/nuget.config
+++ b/nuget.config
@@ -5,6 +5,5 @@
-
diff --git a/src/DotNetBridge/Bridge.cs b/src/DotNetBridge/Bridge.cs
index a7954355..89e7e652 100644
--- a/src/DotNetBridge/Bridge.cs
+++ b/src/DotNetBridge/Bridge.cs
@@ -7,8 +7,10 @@
using System.Runtime.InteropServices;
using System.Text;
using System.Threading;
+using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.EntryPoints;
+using Microsoft.ML.Featurizers;
using Microsoft.ML.Runtime;
using Microsoft.ML.Trainers;
using Microsoft.ML.Trainers.Ensemble;
@@ -296,11 +298,12 @@ private static unsafe int GenericExec(EnvironmentBlock* penv, sbyte* psz, int cd
//env.ComponentCatalog.RegisterAssembly(typeof(AutoInference).Assembly); // ML.PipelineInference
env.ComponentCatalog.RegisterAssembly(typeof(DataViewReference).Assembly);
env.ComponentCatalog.RegisterAssembly(typeof(ImageLoadingTransformer).Assembly);
- //env.ComponentCatalog.RegisterAssembly(typeof(SaveOnnxCommand).Assembly);
+ env.ComponentCatalog.RegisterAssembly(typeof(OnnxExportExtensions).Assembly);
//env.ComponentCatalog.RegisterAssembly(typeof(TimeSeriesProcessingEntryPoints).Assembly);
//env.ComponentCatalog.RegisterAssembly(typeof(ParquetLoader).Assembly);
env.ComponentCatalog.RegisterAssembly(typeof(SsaChangePointDetector).Assembly);
env.ComponentCatalog.RegisterAssembly(typeof(DotNetBridgeEntrypoints).Assembly);
+ env.ComponentCatalog.RegisterAssembly(typeof(DateTimeTransformer).Assembly);
using (var ch = host.Start("Executing"))
{
diff --git a/src/DotNetBridge/DotNetBridge.csproj b/src/DotNetBridge/DotNetBridge.csproj
index 00688f17..31c27043 100644
--- a/src/DotNetBridge/DotNetBridge.csproj
+++ b/src/DotNetBridge/DotNetBridge.csproj
@@ -32,17 +32,21 @@
all
runtime; build; native; contentfiles; analyzers
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/DotNetBridge/Entrypoints.cs b/src/DotNetBridge/Entrypoints.cs
index 535d9d75..6f7f8d0c 100644
--- a/src/DotNetBridge/Entrypoints.cs
+++ b/src/DotNetBridge/Entrypoints.cs
@@ -178,5 +178,59 @@ public static ScoringTransformOutput Score(IHostEnvironment env, ScoringTransfor
};
}
+
+ public sealed class OnnxTransformInput : TransformInputBase
+ {
+ [Argument(ArgumentType.Required, HelpText = "Path to the onnx model file.", ShortName = "model", SortOrder = 0)]
+ public string ModelFile;
+
+ [Argument(ArgumentType.Multiple, HelpText = "Name of the input column.", SortOrder = 1)]
+ public string[] InputColumns;
+
+ [Argument(ArgumentType.Multiple, HelpText = "Name of the output column.", SortOrder = 2)]
+ public string[] OutputColumns;
+
+ [Argument(ArgumentType.AtMostOnce, HelpText = "GPU device id to run on (e.g. 0,1,..). Null for CPU. Requires CUDA 9.1.", SortOrder = 3)]
+ public int? GpuDeviceId = null;
+
+ [Argument(ArgumentType.AtMostOnce, HelpText = "If true, resumes execution on CPU upon GPU error. If false, will raise the GPU execption.", SortOrder = 4)]
+ public bool FallbackToCpu = false;
+ }
+
+ public sealed class OnnxTransformOutput
+ {
+ [TlcModule.Output(Desc = "ONNX transformed dataset", SortOrder = 1)]
+ public IDataView OutputData;
+
+ [TlcModule.Output(Desc = "Transform model", SortOrder = 2)]
+ public TransformModel Model;
+ }
+
+ [TlcModule.EntryPoint(Name = "Models.OnnxTransformer",
+ Desc = "Applies an ONNX model to a dataset.",
+ UserName = "Onnx Transformer",
+ ShortName = "onnx-xf")]
+ public static OnnxTransformOutput ApplyOnnxModel(IHostEnvironment env, OnnxTransformInput input)
+ {
+ var host = EntryPointUtils.CheckArgsAndCreateHost(env, "OnnxTransform", input);
+
+ var inputColumns = input.InputColumns ?? (Array.Empty<string>());
+ var outputColumns = input.OutputColumns ?? (Array.Empty<string>());
+
+ var transformsCatalog = new TransformsCatalog(host);
+ var onnxScoringEstimator = OnnxCatalog.ApplyOnnxModel(transformsCatalog,
+ outputColumns,
+ inputColumns,
+ input.ModelFile,
+ input.GpuDeviceId,
+ input.FallbackToCpu);
+
+ var view = onnxScoringEstimator.Fit(input.Data).Transform(input.Data);
+ return new OnnxTransformOutput()
+ {
+ Model = new TransformModelImpl(host, view, input.Data),
+ OutputData = view
+ };
+ }
}
}
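The Models.OnnxTransformer entrypoint added above is surfaced on the Python side as nimbusml.preprocessing.OnnxRunner, introduced later in this change. A minimal sketch of exercising it, assuming a previously exported ONNX model file (the 'model.onnx' path is illustrative); see OnnxRunner_df.py below for an end-to-end example:

# Minimal sketch; 'model.onnx' is an assumed, pre-existing ONNX model file.
import pandas as pd
from nimbusml.preprocessing import OnnxRunner

df = pd.DataFrame(dict(c1=[2.5, 30.5], c2=[1.0, 1.0]), dtype='float32')
runner = OnnxRunner(model_file='model.onnx')  # maps to Models.OnnxTransformer
print(runner.fit_transform(df))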
diff --git a/src/DotNetBridge/ManifestUtils.cs b/src/DotNetBridge/ManifestUtils.cs
index 7d1c89a5..b566cf2f 100644
--- a/src/DotNetBridge/ManifestUtils.cs
+++ b/src/DotNetBridge/ManifestUtils.cs
@@ -11,6 +11,7 @@
using System.Text.RegularExpressions;
using Microsoft.ML.Data;
using Microsoft.ML.EntryPoints;
+using Microsoft.ML.Featurizers;
using Microsoft.ML.Model.OnnxConverter;
using Microsoft.ML.Runtime;
using Microsoft.ML.Trainers;
@@ -42,8 +43,10 @@ public static class ManifestUtils
typeof(ImageLoadingTransformer),
typeof(SymbolicSgdLogisticRegressionBinaryTrainer),
typeof(OnnxContext),
+ typeof(OnnxExportExtensions),
typeof(SsaForecastingTransformer),
- typeof(VariableColumnTransform)
+ typeof(VariableColumnTransform),
+ typeof(DateTimeTransformer)
};
private static (IEnumerable<string> epListContents, JObject manifest) BuildManifests()
diff --git a/src/DotNetBridge/NativeDataInterop.cs b/src/DotNetBridge/NativeDataInterop.cs
index 461beb3c..c95584c0 100644
--- a/src/DotNetBridge/NativeDataInterop.cs
+++ b/src/DotNetBridge/NativeDataInterop.cs
@@ -213,8 +213,11 @@ private static unsafe void SendViewToNativeAsDataFrame(IChannel ch, EnvironmentB
}
else
{
- for (int i = 0; i < nSlots; i++)
- AddUniqueName(name + "." + i, ref nameIndices, ref nameUtf8Bytes);
+ if (nSlots == 1)
+ AddUniqueName(name, ref nameIndices, ref nameUtf8Bytes);
+ else
+ for (int i = 0; i < nSlots; i++)
+ AddUniqueName(name + "." + i, ref nameIndices, ref nameUtf8Bytes);
}
}
else
diff --git a/src/DotNetBridge/NativeDataView.cs b/src/DotNetBridge/NativeDataView.cs
index 1a829889..d3795757 100644
--- a/src/DotNetBridge/NativeDataView.cs
+++ b/src/DotNetBridge/NativeDataView.cs
@@ -416,7 +416,7 @@ public TextColumnReader(int batchSize, long rowsCount, int cref, Column[] column
_waiterPublish = new OrderedWaiter(firstCleared: true);
_queue = new BlockingCollection(QueueSize);
- _thdRead = Utils.RunOnBackgroundThread(ThreadProc);
+ _thdRead = Utils.RunOnBackgroundThreadAsync(ThreadProc);
}
public void Release()
@@ -1406,4 +1406,4 @@ public override void Dispose()
#endregion
}
}
-}
\ No newline at end of file
+}
diff --git a/src/DotNetBridge/RmlEnvironment.cs b/src/DotNetBridge/RmlEnvironment.cs
index dc9ff045..e9893426 100644
--- a/src/DotNetBridge/RmlEnvironment.cs
+++ b/src/DotNetBridge/RmlEnvironment.cs
@@ -52,14 +52,9 @@ protected override IHost RegisterCore(HostEnvironmentBase source
}
public RmlEnvironment(Bridge.CheckCancelled checkDelegate, int? seed = null, bool verbose = false)
- : this(RandomUtils.Create(seed), verbose)
+ : base(seed, verbose)
{
CheckCancelled = checkDelegate;
- }
-
- public RmlEnvironment(Random rand, bool verbose = false)
- : base(rand, verbose)
- {
CultureInfo.CurrentUICulture = CultureInfo.InvariantCulture;
EnsureDispatcher();
}
diff --git a/src/DotNetBridge/transforms/VariableColumnTransform.cs b/src/DotNetBridge/transforms/VariableColumnTransform.cs
index ea9ecafb..9ee1ebd7 100644
--- a/src/DotNetBridge/transforms/VariableColumnTransform.cs
+++ b/src/DotNetBridge/transforms/VariableColumnTransform.cs
@@ -247,6 +247,9 @@ public override bool IsColumnActive(DataViewSchema.Column column)
return _active[column.Index];
}
+ private static readonly FuncInstanceMethodInfo1 _makeVarLengthVectorGetterMethodInfo
+ = FuncInstanceMethodInfo1.Create(target => target.MakeVarLengthVectorGetter);
+
private Delegate MakeVarLengthVectorGetter(DataViewRow input)
{
var srcGetters = new ValueGetter[_bindings.vectorToInputMap.Count];
@@ -304,7 +307,7 @@ public override ValueGetter GetGetter(DataViewSchema.Column colu
if (column.Index == _bindings.outputColumn)
{
VectorDataViewType columnType = column.Type as VectorDataViewType;
- Delegate getter = Utils.MarshalInvoke(MakeVarLengthVectorGetter, columnType.ItemType.RawType, _cursor);
+ Delegate getter = Utils.MarshalInvoke(_makeVarLengthVectorGetterMethodInfo, this, columnType.ItemType.RawType, _cursor);
return getter as ValueGetter;
}
else
diff --git a/src/Platforms/build.csproj b/src/Platforms/build.csproj
index ef1b03d4..065da7d3 100644
--- a/src/Platforms/build.csproj
+++ b/src/Platforms/build.csproj
@@ -11,17 +11,21 @@
-
-
-
-
-
-
-
-
-
-
-
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/src/python/nimbusml.pyproj b/src/python/nimbusml.pyproj
index 18356d5c..2deae4ab 100644
--- a/src/python/nimbusml.pyproj
+++ b/src/python/nimbusml.pyproj
@@ -92,14 +92,19 @@
+
+
+
+
+
-
+
@@ -111,6 +116,7 @@
+
@@ -125,6 +131,7 @@
+
@@ -163,6 +170,7 @@
+
@@ -178,6 +186,7 @@
+
@@ -233,6 +242,7 @@
+
@@ -309,19 +319,25 @@
+
+
+
+
+
+
@@ -345,6 +361,8 @@
+
+
@@ -383,6 +401,7 @@
+
@@ -400,6 +419,7 @@
+
@@ -412,6 +432,7 @@
+
@@ -440,6 +461,7 @@
+
@@ -457,6 +479,8 @@
+
+
@@ -634,6 +658,7 @@
+
@@ -650,7 +675,9 @@
+
+
@@ -663,6 +690,8 @@
+
+
@@ -699,16 +728,22 @@
+
+
+
+
+
+
@@ -717,7 +752,11 @@
+
+
+
+
diff --git a/src/python/nimbusml/__init__.py b/src/python/nimbusml/__init__.py
index afb13002..79762616 100644
--- a/src/python/nimbusml/__init__.py
+++ b/src/python/nimbusml/__init__.py
@@ -2,7 +2,7 @@
Microsoft Machine Learning for Python
"""
-__version__ = '1.6.1'
+__version__ = '1.7.0'
# CoreCLR version of MicrosoftML is built on Windows.
# But file permissions are not preserved when it's copied to Linux.
diff --git a/src/python/nimbusml/base_predictor.py b/src/python/nimbusml/base_predictor.py
index e619b115..6b31e566 100644
--- a/src/python/nimbusml/base_predictor.py
+++ b/src/python/nimbusml/base_predictor.py
@@ -178,7 +178,6 @@ def summary(self):
self.model_summary_ = pipeline.summary()
return self.model_summary_
- @trace
def _get_implicit_transforms(
self,
features,
@@ -308,6 +307,11 @@ def _get_graph_nodes(
if label_column is None:
label_column = Role.Label
self.label_column_name = label_column
+
+ if y is None \
+ and self._use_role(Role.Label) \
+ and label_column in learner_features:
+ learner_features.remove(label_column)
else:
self.label_column_name = None
label_column = None
@@ -354,3 +358,20 @@ def _get_graph_nodes(
row_group_column_name=group_id_column)
graph_nodes['learner_node'] = [learner_node]
return graph_nodes, learner_features
+
+ @trace
+ def export_to_onnx(self, *args, **kwargs):
+ """
+ Export the model to the ONNX format.
+
+ See :py:meth:`nimbusml.Pipeline.export_to_onnx` for accepted arguments.
+ """
+ if not hasattr(self, 'model_') \
+ or self.model_ is None \
+ or not os.path.isfile(self.model_):
+
+ raise ValueError("Model is not fitted. Train or load a model before "
+ "export_to_onnx().")
+
+ pipeline = Pipeline([self], model=self.model_)
+ pipeline.export_to_onnx(*args, **kwargs)
diff --git a/src/python/nimbusml/base_transform.py b/src/python/nimbusml/base_transform.py
index b227d567..f0c4f861 100644
--- a/src/python/nimbusml/base_transform.py
+++ b/src/python/nimbusml/base_transform.py
@@ -124,3 +124,20 @@ def transform(self, X, as_binary_data_stream=False, **params):
data = pipeline.transform(
X, as_binary_data_stream=as_binary_data_stream, **params)
return data
+
+ @trace
+ def export_to_onnx(self, *args, **kwargs):
+ """
+ Export the model to the ONNX format.
+
+ See :py:meth:`nimbusml.Pipeline.export_to_onnx` for accepted arguments.
+ """
+ if not hasattr(self, 'model_') \
+ or self.model_ is None \
+ or not os.path.isfile(self.model_):
+
+ raise ValueError("Model is not fitted. Train or load a model before "
+ "export_to_onnx().")
+
+ pipeline = Pipeline([self], model=self.model_)
+ pipeline.export_to_onnx(*args, **kwargs)
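Both base classes now expose export_to_onnx by wrapping the fitted estimator in a single-step Pipeline. A minimal sketch, assuming the same arguments accepted by Pipeline.export_to_onnx as used in OnnxRunner_df.py later in this change (the output path is illustrative):

import numpy as np
import pandas as pd
from nimbusml.preprocessing.normalization import MinMaxScaler

df = pd.DataFrame(dict(c1=[1.0, 2.0, 3.0], c2=[2.0, 4.0, 6.0])).astype(np.float32)

xf = MinMaxScaler()
xf.fit(df)

# The fitted transform can now be exported directly, without first
# constructing a Pipeline by hand.
xf.export_to_onnx('minmaxscaler.onnx', 'com.microsoft.ml', onnx_version='Stable')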
diff --git a/src/python/nimbusml/ensemble/lightgbmbinaryclassifier.py b/src/python/nimbusml/ensemble/lightgbmbinaryclassifier.py
index c87bbbb0..125f536b 100644
--- a/src/python/nimbusml/ensemble/lightgbmbinaryclassifier.py
+++ b/src/python/nimbusml/ensemble/lightgbmbinaryclassifier.py
@@ -110,6 +110,9 @@ class LightGbmBinaryClassifier(
:param handle_missing_value: Enable special handling of missing value or
not.
+ :param use_zero_as_missing_value: Enable usage of zero (0) as missing
+ value.
+
:param minimum_example_count_per_group: Minimum number of instances per
categorical group.
@@ -165,6 +168,7 @@ def __init__(
batch_size=1048576,
use_categorical_split=None,
handle_missing_value=True,
+ use_zero_as_missing_value=False,
minimum_example_count_per_group=100,
maximum_categorical_split_point_count=32,
categorical_smoothing=10.0,
@@ -219,6 +223,7 @@ def __init__(
batch_size=batch_size,
use_categorical_split=use_categorical_split,
handle_missing_value=handle_missing_value,
+ use_zero_as_missing_value=use_zero_as_missing_value,
minimum_example_count_per_group=minimum_example_count_per_group,
maximum_categorical_split_point_count=maximum_categorical_split_point_count,
categorical_smoothing=categorical_smoothing,
diff --git a/src/python/nimbusml/ensemble/lightgbmclassifier.py b/src/python/nimbusml/ensemble/lightgbmclassifier.py
index a6951be2..d3d000e3 100644
--- a/src/python/nimbusml/ensemble/lightgbmclassifier.py
+++ b/src/python/nimbusml/ensemble/lightgbmclassifier.py
@@ -105,6 +105,9 @@ class LightGbmClassifier(core, BasePredictor, ClassifierMixin):
:param handle_missing_value: Enable special handling of missing value or
not.
+ :param use_zero_as_missing_value: Enable usage of zero (0) as missing
+ value.
+
:param minimum_example_count_per_group: Minimum number of instances per
categorical group.
@@ -160,6 +163,7 @@ def __init__(
batch_size=1048576,
use_categorical_split=None,
handle_missing_value=True,
+ use_zero_as_missing_value=False,
minimum_example_count_per_group=100,
maximum_categorical_split_point_count=32,
categorical_smoothing=10.0,
@@ -214,6 +218,7 @@ def __init__(
batch_size=batch_size,
use_categorical_split=use_categorical_split,
handle_missing_value=handle_missing_value,
+ use_zero_as_missing_value=use_zero_as_missing_value,
minimum_example_count_per_group=minimum_example_count_per_group,
maximum_categorical_split_point_count=maximum_categorical_split_point_count,
categorical_smoothing=categorical_smoothing,
diff --git a/src/python/nimbusml/ensemble/lightgbmranker.py b/src/python/nimbusml/ensemble/lightgbmranker.py
index fb96f5cd..61bcbd90 100644
--- a/src/python/nimbusml/ensemble/lightgbmranker.py
+++ b/src/python/nimbusml/ensemble/lightgbmranker.py
@@ -105,6 +105,9 @@ class LightGbmRanker(core, BasePredictor, ClassifierMixin):
:param handle_missing_value: Enable special handling of missing value or
not.
+ :param use_zero_as_missing_value: Enable usage of zero (0) as missing
+ value.
+
:param minimum_example_count_per_group: Minimum number of instances per
categorical group.
@@ -159,6 +162,7 @@ def __init__(
batch_size=1048576,
use_categorical_split=None,
handle_missing_value=True,
+ use_zero_as_missing_value=False,
minimum_example_count_per_group=100,
maximum_categorical_split_point_count=32,
categorical_smoothing=10.0,
@@ -212,6 +216,7 @@ def __init__(
batch_size=batch_size,
use_categorical_split=use_categorical_split,
handle_missing_value=handle_missing_value,
+ use_zero_as_missing_value=use_zero_as_missing_value,
minimum_example_count_per_group=minimum_example_count_per_group,
maximum_categorical_split_point_count=maximum_categorical_split_point_count,
categorical_smoothing=categorical_smoothing,
diff --git a/src/python/nimbusml/ensemble/lightgbmregressor.py b/src/python/nimbusml/ensemble/lightgbmregressor.py
index 0d0a69ae..89258a7f 100644
--- a/src/python/nimbusml/ensemble/lightgbmregressor.py
+++ b/src/python/nimbusml/ensemble/lightgbmregressor.py
@@ -98,6 +98,9 @@ class LightGbmRegressor(core, BasePredictor, RegressorMixin):
:param handle_missing_value: Enable special handling of missing value or
not.
+ :param use_zero_as_missing_value: Enable usage of zero (0) as missing
+ value.
+
:param minimum_example_count_per_group: Minimum number of instances per
categorical group.
@@ -150,6 +153,7 @@ def __init__(
batch_size=1048576,
use_categorical_split=None,
handle_missing_value=True,
+ use_zero_as_missing_value=False,
minimum_example_count_per_group=100,
maximum_categorical_split_point_count=32,
categorical_smoothing=10.0,
@@ -201,6 +205,7 @@ def __init__(
batch_size=batch_size,
use_categorical_split=use_categorical_split,
handle_missing_value=handle_missing_value,
+ use_zero_as_missing_value=use_zero_as_missing_value,
minimum_example_count_per_group=minimum_example_count_per_group,
maximum_categorical_split_point_count=maximum_categorical_split_point_count,
categorical_smoothing=categorical_smoothing,
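The new use_zero_as_missing_value flag is plumbed through all four LightGBM estimators. A minimal sketch of opting in (the data and the minimum_example_count_per_leaf override are illustrative, chosen only so this tiny dataset trains):

import pandas as pd
from nimbusml.ensemble import LightGbmRegressor

train = pd.DataFrame(dict(
    x=[0.0, 1.0, 2.0, 3.0, 0.0, 5.0, 6.0, 7.0],
    y=[1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]))

# Zeros in the feature columns are treated as missing rather than as real values.
model = LightGbmRegressor(use_zero_as_missing_value=True,
                          minimum_example_count_per_leaf=1)
model.fit(train[['x']], train['y'])
print(model.predict(train[['x']]))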
diff --git a/src/python/nimbusml/examples/DateTimeSplitter.py b/src/python/nimbusml/examples/DateTimeSplitter.py
new file mode 100644
index 00000000..fd8612d3
--- /dev/null
+++ b/src/python/nimbusml/examples/DateTimeSplitter.py
@@ -0,0 +1,31 @@
+###############################################################################
+# DateTimeSplitter
+import pandas as pd
+from nimbusml import FileDataStream
+from nimbusml.datasets import get_dataset
+from nimbusml.preprocessing import DateTimeSplitter
+
+# data input (as a FileDataStream)
+path = get_dataset('infert').as_filepath()
+
+data = FileDataStream.read_csv(path, sep=',')
+
+# transform usage
+xf = DateTimeSplitter(prefix='dt_') << 'age'
+
+# fit and transform
+features = xf.fit_transform(data)
+
+features = features.drop(['row_num', 'education', 'parity', 'induced',
+ 'case', 'spontaneous', 'stratum', 'pooled.stratum'], axis=1)
+
+# print features
+pd.set_option('display.max_columns', None)
+pd.set_option('display.width', 1000)
+print(features.head())
+# age dt_Year dt_Month dt_Day dt_Hour dt_Minute dt_Second dt_AmPm dt_Hour12 dt_DayOfWeek dt_DayOfQuarter dt_DayOfYear dt_WeekOfMonth dt_QuarterOfYear dt_HalfOfYear dt_WeekIso dt_YearIso dt_MonthLabel dt_AmPmLabel dt_DayOfWeekLabel dt_HolidayName dt_IsPaidTimeOff
+# 0 26 1970 1 1 0 0 26 0 0 4 1 0 0 1 1 1 1970 January am Thursday None 0
+# 1 42 1970 1 1 0 0 42 0 0 4 1 0 0 1 1 1 1970 January am Thursday None 0
+# 2 39 1970 1 1 0 0 39 0 0 4 1 0 0 1 1 1 1970 January am Thursday None 0
+# 3 34 1970 1 1 0 0 34 0 0 4 1 0 0 1 1 1 1970 January am Thursday None 0
+# 4 35 1970 1 1 0 0 35 0 0 4 1 0 0 1 1 1 1970 January am Thursday None 0
\ No newline at end of file
diff --git a/src/python/nimbusml/examples/RobustScaler.py b/src/python/nimbusml/examples/RobustScaler.py
new file mode 100644
index 00000000..4c6a6405
--- /dev/null
+++ b/src/python/nimbusml/examples/RobustScaler.py
@@ -0,0 +1,39 @@
+###############################################################################
+# RobustScaler
+import numpy
+from nimbusml import FileDataStream
+from nimbusml.datasets import get_dataset
+from nimbusml.preprocessing.normalization import RobustScaler
+
+# data input (as a FileDataStream)
+path = get_dataset('infert').as_filepath()
+data = FileDataStream.read_csv(path, sep=',')
+
+print(data.head())
+# row_num education age parity induced case spontaneous stratum pooled.stratum
+# 0 1 0-5yrs 26 6 1 1 2 1 3
+# 1 2 0-5yrs 42 1 1 1 0 2 1
+# 2 3 0-5yrs 39 6 2 1 0 3 4
+# 3 4 0-5yrs 34 4 2 1 0 4 2
+# 4 5 6-11yrs 35 3 1 1 1 5 32
+
+# transform usage
+xf = RobustScaler(
+ center=True, scale=True,
+ columns={'age_norm': 'age', 'par_norm': 'parity'})
+
+# fit and transform
+features = xf.fit_transform(data)
+
+print(features.head(n=10))
+# row_num education age parity induced case spontaneous stratum pooled.stratum age_norm par_norm
+# 0 1 0-5yrs 26 6 1 1 2 1 3 -0.434783 1.6
+# 1 2 0-5yrs 42 1 1 1 0 2 1 0.956522 -0.4
+# 2 3 0-5yrs 39 6 2 1 0 3 4 0.695652 1.6
+# 3 4 0-5yrs 34 4 2 1 0 4 2 0.260870 0.8
+# 4 5 6-11yrs 35 3 1 1 1 5 32 0.347826 0.4
+# 5 6 6-11yrs 36 4 2 1 1 6 36 0.434783 0.8
+# 6 7 6-11yrs 23 1 0 1 0 7 6 -0.695652 -0.4
+# 7 8 6-11yrs 32 2 0 1 0 8 22 0.086957 0.0
+# 8 9 6-11yrs 21 1 0 1 1 9 5 -0.869565 -0.4
+# 9 10 6-11yrs 28 2 0 1 0 10 19 -0.260870 0.0
diff --git a/src/python/nimbusml/examples/ToKeyImputer.py b/src/python/nimbusml/examples/ToKeyImputer.py
new file mode 100644
index 00000000..820127f5
--- /dev/null
+++ b/src/python/nimbusml/examples/ToKeyImputer.py
@@ -0,0 +1,35 @@
+###############################################################################
+# ToKeyImputer
+import numpy
+from nimbusml import FileDataStream
+from nimbusml.datasets import get_dataset
+from nimbusml.preprocessing import ToKeyImputer
+
+# data input (as a FileDataStream)
+path = get_dataset('airquality').as_filepath()
+
+data = FileDataStream.read_csv(path, sep=',', numeric_dtype=numpy.float32,
+ names={0: 'id'})
+print(data.head(6))
+# id Ozone Solar_R Wind Temp Month Day
+# 0 1.0 41.0 190.0 7.4 67.0 5.0 1.0
+# 1 2.0 36.0 118.0 8.0 72.0 5.0 2.0
+# 2 3.0 12.0 149.0 12.6 74.0 5.0 3.0
+# 3 4.0 18.0 313.0 11.5 62.0 5.0 4.0
+# 4 5.0 NaN NaN 14.3 56.0 5.0 5.0
+# 5 6.0 28.0 NaN 14.9 66.0 5.0 6.0
+
+
+# transform usage
+xf = ToKeyImputer(columns={'Ozone_1': 'Ozone', 'Solar_R_1': 'Solar_R'})
+
+# fit and transform
+features = xf.fit_transform(data)
+print(features.head(6))
+# id Ozone Solar_R Wind Temp Month Day Ozone_1 Solar_R_1
+# 0 1.0 41.0 190.0 7.4 67.0 5.0 1.0 41.0 190.0
+# 1 2.0 36.0 118.0 8.0 72.0 5.0 2.0 36.0 118.0
+# 2 3.0 12.0 149.0 12.6 74.0 5.0 3.0 12.0 149.0
+# 3 4.0 18.0 313.0 11.5 62.0 5.0 4.0 18.0 313.0
+# 4 5.0 NaN NaN 14.3 56.0 5.0 5.0 23.0 238.0 <== Missing values have been updated
+# 5 6.0 28.0 NaN 14.9 66.0 5.0 6.0 28.0 238.0 <== Missing values have been updated
diff --git a/src/python/nimbusml/examples/ToString.py b/src/python/nimbusml/examples/ToString.py
new file mode 100644
index 00000000..82185d32
--- /dev/null
+++ b/src/python/nimbusml/examples/ToString.py
@@ -0,0 +1,45 @@
+###############################################################################
+# ToString
+import numpy
+from nimbusml import FileDataStream
+from nimbusml.datasets import get_dataset
+from nimbusml.preprocessing import ToString
+
+# data input (as a FileDataStream)
+path = get_dataset('infert').as_filepath()
+
+data = FileDataStream.read_csv(path, sep=',', numeric_dtype=numpy.float32,
+ names={0: 'id'})
+print(data.head())
+# id education age parity induced case spontaneous stratum pooled.stratum
+# 0 1.0 0-5yrs 26.0 6.0 1.0 1.0 2.0 1.0 3.0
+# 1 2.0 0-5yrs 42.0 1.0 1.0 1.0 0.0 2.0 1.0
+# 2 3.0 0-5yrs 39.0 6.0 2.0 1.0 0.0 3.0 4.0
+# 3 4.0 0-5yrs 34.0 4.0 2.0 1.0 0.0 4.0 2.0
+# 4 5.0 6-11yrs 35.0 3.0 1.0 1.0 1.0 5.0 32.0
+
+# transform usage
+xf = ToString(columns={'id_1': 'id', 'age_1': 'age'})
+
+# fit and transform
+features = xf.fit_transform(data)
+print(features.head())
+# id education age parity induced case spontaneous stratum pooled.stratum id_1 age_1
+# 0 1.0 0-5yrs 26.0 6.0 1.0 1.0 2.0 1.0 3.0 1.000000 26.000000
+# 1 2.0 0-5yrs 42.0 1.0 1.0 1.0 0.0 2.0 1.0 2.000000 42.000000
+# 2 3.0 0-5yrs 39.0 6.0 2.0 1.0 0.0 3.0 4.0 3.000000 39.000000
+# 3 4.0 0-5yrs 34.0 4.0 2.0 1.0 0.0 4.0 2.0 4.000000 34.000000
+# 4 5.0 6-11yrs 35.0 3.0 1.0 1.0 1.0 5.0 32.0 5.000000 35.000000
+
+print(features.dtypes)
+# id float32
+# education object
+# age float32
+# parity float32
+# induced float32
+# case float32
+# spontaneous float32
+# stratum float32
+# pooled.stratum float32
+# id_1 object <== string column
+# age_1 object <== string column
diff --git a/src/python/nimbusml/examples/examples_from_dataframe/DateTimeSplitter_df.py b/src/python/nimbusml/examples/examples_from_dataframe/DateTimeSplitter_df.py
new file mode 100644
index 00000000..f049c39a
--- /dev/null
+++ b/src/python/nimbusml/examples/examples_from_dataframe/DateTimeSplitter_df.py
@@ -0,0 +1,33 @@
+###############################################################################
+# DateTimeSplitter
+import pandas
+from nimbusml import Pipeline
+from nimbusml.preprocessing import DateTimeSplitter
+from nimbusml.preprocessing.schema import ColumnSelector
+
+df = pandas.DataFrame(data=dict(
+ tokens1=[1, 2, 3, 157161600],
+ tokens2=[10, 11, 12, 13]
+))
+
+cols_to_drop = [
+ 'dtHour12', 'dtDayOfWeek', 'dtDayOfQuarter',
+ 'dtDayOfYear', 'dtWeekOfMonth', 'dtQuarterOfYear',
+ 'dtHalfOfYear', 'dtWeekIso', 'dtYearIso', 'dtMonthLabel',
+ 'dtAmPmLabel', 'dtDayOfWeekLabel', 'dtIsPaidTimeOff'
+]
+
+dts = DateTimeSplitter(prefix='dt', country='Canada') << 'tokens1'
+
+pipeline = Pipeline([dts, ColumnSelector(drop_columns=cols_to_drop)])
+y = pipeline.fit_transform(df)
+
+# view the three columns
+pandas.set_option('display.max_columns', None)
+pandas.set_option('display.width', 1000)
+print(y)
+# tokens1 tokens2 dtYear dtMonth dtDay dtHour dtMinute dtSecond dtAmPm dtHolidayName
+# 0 1 10 1970 1 1 0 0 1 0 New Year's Day
+# 1 2 11 1970 1 1 0 0 2 0 New Year's Day
+# 2 3 12 1970 1 1 0 0 3 0 New Year's Day
+# 3 157161600 13 1974 12 25 0 0 0 0 Christmas Day
diff --git a/src/python/nimbusml/examples/examples_from_dataframe/OnnxRunner_df.py b/src/python/nimbusml/examples/examples_from_dataframe/OnnxRunner_df.py
new file mode 100644
index 00000000..13c9f0ce
--- /dev/null
+++ b/src/python/nimbusml/examples/examples_from_dataframe/OnnxRunner_df.py
@@ -0,0 +1,49 @@
+import os
+import tempfile
+import numpy as np
+import pandas as pd
+from nimbusml import Pipeline
+from nimbusml.preprocessing import OnnxRunner
+from nimbusml.preprocessing.normalization import MinMaxScaler
+
+
+def get_tmp_file(suffix=None):
+ fd, file_name = tempfile.mkstemp(suffix=suffix)
+ fl = os.fdopen(fd, 'w')
+ fl.close()
+ return file_name
+
+# Generate the train and test data
+np.random.seed(0)
+x = np.arange(100, step=0.1)
+y = x * 10 + (np.random.standard_normal(len(x)) * 10)
+train_data = {'c1': x, 'c2': y}
+train_df = pd.DataFrame(train_data).astype({'c1': np.float32, 'c2': np.float32})
+
+test_data = {'c1': [2.5, 30.5], 'c2': [1, 1]}
+test_df = pd.DataFrame(test_data).astype({'c1': np.float32, 'c2': np.float32})
+
+# Fit a MinMaxScaler Pipeline
+r1 = Pipeline([MinMaxScaler()])
+r1.fit(train_df)
+
+# Export the pipeline to ONNX
+onnx_path = get_tmp_file('.onnx')
+r1.export_to_onnx(onnx_path, 'com.microsoft.ml', onnx_version='Stable')
+
+# Perform the transform using the standard ML.Net backend
+result_standard = r1.transform(test_df)
+print(result_standard)
+# c1 c2
+# 0 0.025025 0.000998
+# 1 0.305305 0.000998
+
+# Perform the transform using the ONNX backend.
+# Note: the extra columns and the column name differences
+# are a known issue with the ML.Net backend.
+onnxrunner = OnnxRunner(model_file=onnx_path)
+result_onnx = onnxrunner.fit_transform(test_df)
+print(result_onnx)
+# c1 c2 c12.0 c22.0
+# 0 2.5 1.0 0.025025 0.000998
+# 1 30.5 1.0 0.305305 0.000998
diff --git a/src/python/nimbusml/examples/examples_from_dataframe/RobustScaler_df.py b/src/python/nimbusml/examples/examples_from_dataframe/RobustScaler_df.py
new file mode 100644
index 00000000..ff0ae793
--- /dev/null
+++ b/src/python/nimbusml/examples/examples_from_dataframe/RobustScaler_df.py
@@ -0,0 +1,20 @@
+###############################################################################
+# RobustScaler
+import pandas as pd
+from nimbusml import Pipeline
+from nimbusml.preprocessing.normalization import RobustScaler
+
+
+df = pd.DataFrame(data=dict(c0=[1, 3, 5, 7, 9]))
+
+xf = RobustScaler(columns='c0', center=True, scale=True)
+pipeline = Pipeline([xf])
+result = pipeline.fit_transform(df)
+
+print(result)
+# c0
+# 0 -1.0
+# 1 -0.5
+# 2 0.0
+# 3 0.5
+# 4 1.0
diff --git a/src/python/nimbusml/examples/examples_from_dataframe/TimeSeriesImputer_df.py b/src/python/nimbusml/examples/examples_from_dataframe/TimeSeriesImputer_df.py
new file mode 100644
index 00000000..38ec9073
--- /dev/null
+++ b/src/python/nimbusml/examples/examples_from_dataframe/TimeSeriesImputer_df.py
@@ -0,0 +1,29 @@
+###############################################################################
+# TimeSeriesImputer
+import pandas
+from nimbusml.timeseries import TimeSeriesImputer
+
+df = pandas.DataFrame(data=dict(
+ ts=[1, 2, 3, 5],
+ grain=[1970, 1970, 1970, 1970],
+ c3=[10, 13, 15, 20],
+ c4=[19, 12, 16, 19]
+))
+
+print(df)
+
+tsi = TimeSeriesImputer(time_series_column='ts',
+ grain_columns=['grain'],
+ filter_columns=['c3', 'c4'],
+ impute_mode='ForwardFill',
+ filter_mode='Include')
+result = tsi.fit_transform(df)
+
+print(result)
+# ts grain c3 c4 IsRowImputed
+# 0 0 0 0 0 False
+# 1 1 1970 10 19 False
+# 2 2 1970 13 12 False
+# 3 3 1970 15 16 False
+# 4 4 1970 15 16 True <== New row added
+# 5 5 1970 20 19 False
diff --git a/src/python/nimbusml/examples/examples_from_dataframe/ToKeyImputer_df.py b/src/python/nimbusml/examples/examples_from_dataframe/ToKeyImputer_df.py
new file mode 100644
index 00000000..f613e3f4
--- /dev/null
+++ b/src/python/nimbusml/examples/examples_from_dataframe/ToKeyImputer_df.py
@@ -0,0 +1,34 @@
+###############################################################################
+# ToKeyImputer
+
+import pandas
+from nimbusml.preprocessing import ToKeyImputer
+
+# Create the data
+text_df = pandas.DataFrame(
+ data=dict(
+ text=[
+ "cat",
+ "dog",
+ "fish",
+ "orange",
+ "cat orange",
+ "dog",
+ "fish",
+ None,
+ "spider"]))
+
+tokey = ToKeyImputer() << 'text'
+y = tokey.fit_transform(text_df)
+print(y)
+
+# text
+# 0 cat
+# 1 dog
+# 2 fish
+# 3 orange
+# 4 cat orange
+# 5 dog
+# 6 fish
+# 7 dog <== Missing value has been replaced
+# 8 spider
diff --git a/src/python/nimbusml/examples/examples_from_dataframe/ToString_df.py b/src/python/nimbusml/examples/examples_from_dataframe/ToString_df.py
new file mode 100644
index 00000000..b6c631fd
--- /dev/null
+++ b/src/python/nimbusml/examples/examples_from_dataframe/ToString_df.py
@@ -0,0 +1,43 @@
+###############################################################################
+# ToString
+
+import pandas
+from nimbusml.preprocessing import ToString, ToKey
+from pandas import Categorical
+
+# Create the data
+categorical_df = pandas.DataFrame(data=dict(
+ key=Categorical.from_codes([0, 1, 2, 1, 2, 0], categories=['a', 'b', 'c']),
+ text=['b', 'c', 'a', 'b', 'a', 'c']))
+
+print(categorical_df.dtypes)
+# key category
+# text object
+# dtype: object
+
+tostring = ToString(columns='key')
+y = tostring.fit_transform(categorical_df)
+print(y)
+# key text
+# 0 1 b
+# 1 2 c
+# 2 3 a
+# 3 2 b
+# 4 3 a
+# 5 1 c
+
+print(y.dtypes)
+# key object <== converted to string
+# text object
+# dtype: object
+
+tokey = ToKey(columns='text')
+y = tokey.fit_transform(categorical_df)
+y2 = tostring.clone().fit_transform(y)
+print(y2['text'] == categorical_df['text'])
+# 0 True
+# 1 True
+# 2 True
+# 3 True
+# 4 True
+# 5 True
diff --git a/src/python/nimbusml/internal/core/ensemble/lightgbmbinaryclassifier.py b/src/python/nimbusml/internal/core/ensemble/lightgbmbinaryclassifier.py
index 2bf8468b..1ae0934d 100644
--- a/src/python/nimbusml/internal/core/ensemble/lightgbmbinaryclassifier.py
+++ b/src/python/nimbusml/internal/core/ensemble/lightgbmbinaryclassifier.py
@@ -99,6 +99,9 @@ class LightGbmBinaryClassifier(
:param handle_missing_value: Enable special handling of missing value or
not.
+ :param use_zero_as_missing_value: Enable usage of zero (0) as missing
+ value.
+
:param minimum_example_count_per_group: Minimum number of instances per
categorical group.
@@ -154,6 +157,7 @@ def __init__(
batch_size=1048576,
use_categorical_split=None,
handle_missing_value=True,
+ use_zero_as_missing_value=False,
minimum_example_count_per_group=100,
maximum_categorical_split_point_count=32,
categorical_smoothing=10.0,
@@ -183,6 +187,7 @@ def __init__(
self.batch_size = batch_size
self.use_categorical_split = use_categorical_split
self.handle_missing_value = handle_missing_value
+ self.use_zero_as_missing_value = use_zero_as_missing_value
self.minimum_example_count_per_group = minimum_example_count_per_group
self.maximum_categorical_split_point_count = maximum_categorical_split_point_count
self.categorical_smoothing = categorical_smoothing
@@ -220,6 +225,7 @@ def _get_node(self, **all_args):
batch_size=self.batch_size,
use_categorical_split=self.use_categorical_split,
handle_missing_value=self.handle_missing_value,
+ use_zero_as_missing_value=self.use_zero_as_missing_value,
minimum_example_count_per_group=self.minimum_example_count_per_group,
maximum_categorical_split_point_count=self.maximum_categorical_split_point_count,
categorical_smoothing=self.categorical_smoothing,
diff --git a/src/python/nimbusml/internal/core/ensemble/lightgbmclassifier.py b/src/python/nimbusml/internal/core/ensemble/lightgbmclassifier.py
index 5feace13..7bb5466a 100644
--- a/src/python/nimbusml/internal/core/ensemble/lightgbmclassifier.py
+++ b/src/python/nimbusml/internal/core/ensemble/lightgbmclassifier.py
@@ -97,6 +97,9 @@ class LightGbmClassifier(
:param handle_missing_value: Enable special handling of missing value or
not.
+ :param use_zero_as_missing_value: Enable usage of zero (0) as missing
+ value.
+
:param minimum_example_count_per_group: Minimum number of instances per
categorical group.
@@ -152,6 +155,7 @@ def __init__(
batch_size=1048576,
use_categorical_split=None,
handle_missing_value=True,
+ use_zero_as_missing_value=False,
minimum_example_count_per_group=100,
maximum_categorical_split_point_count=32,
categorical_smoothing=10.0,
@@ -181,6 +185,7 @@ def __init__(
self.batch_size = batch_size
self.use_categorical_split = use_categorical_split
self.handle_missing_value = handle_missing_value
+ self.use_zero_as_missing_value = use_zero_as_missing_value
self.minimum_example_count_per_group = minimum_example_count_per_group
self.maximum_categorical_split_point_count = maximum_categorical_split_point_count
self.categorical_smoothing = categorical_smoothing
@@ -218,6 +223,7 @@ def _get_node(self, **all_args):
batch_size=self.batch_size,
use_categorical_split=self.use_categorical_split,
handle_missing_value=self.handle_missing_value,
+ use_zero_as_missing_value=self.use_zero_as_missing_value,
minimum_example_count_per_group=self.minimum_example_count_per_group,
maximum_categorical_split_point_count=self.maximum_categorical_split_point_count,
categorical_smoothing=self.categorical_smoothing,
diff --git a/src/python/nimbusml/internal/core/ensemble/lightgbmranker.py b/src/python/nimbusml/internal/core/ensemble/lightgbmranker.py
index 6c06148d..c3394cf4 100644
--- a/src/python/nimbusml/internal/core/ensemble/lightgbmranker.py
+++ b/src/python/nimbusml/internal/core/ensemble/lightgbmranker.py
@@ -95,6 +95,9 @@ class LightGbmRanker(BasePipelineItem, DefaultSignatureWithRoles):
:param handle_missing_value: Enable special handling of missing value or
not.
+ :param use_zero_as_missing_value: Enable usage of zero (0) as missing
+ value.
+
:param minimum_example_count_per_group: Minimum number of instances per
categorical group.
@@ -149,6 +152,7 @@ def __init__(
batch_size=1048576,
use_categorical_split=None,
handle_missing_value=True,
+ use_zero_as_missing_value=False,
minimum_example_count_per_group=100,
maximum_categorical_split_point_count=32,
categorical_smoothing=10.0,
@@ -176,6 +180,7 @@ def __init__(
self.batch_size = batch_size
self.use_categorical_split = use_categorical_split
self.handle_missing_value = handle_missing_value
+ self.use_zero_as_missing_value = use_zero_as_missing_value
self.minimum_example_count_per_group = minimum_example_count_per_group
self.maximum_categorical_split_point_count = maximum_categorical_split_point_count
self.categorical_smoothing = categorical_smoothing
@@ -212,6 +217,7 @@ def _get_node(self, **all_args):
batch_size=self.batch_size,
use_categorical_split=self.use_categorical_split,
handle_missing_value=self.handle_missing_value,
+ use_zero_as_missing_value=self.use_zero_as_missing_value,
minimum_example_count_per_group=self.minimum_example_count_per_group,
maximum_categorical_split_point_count=self.maximum_categorical_split_point_count,
categorical_smoothing=self.categorical_smoothing,
diff --git a/src/python/nimbusml/internal/core/ensemble/lightgbmregressor.py b/src/python/nimbusml/internal/core/ensemble/lightgbmregressor.py
index 20fe5e57..b4cb7b5e 100644
--- a/src/python/nimbusml/internal/core/ensemble/lightgbmregressor.py
+++ b/src/python/nimbusml/internal/core/ensemble/lightgbmregressor.py
@@ -90,6 +90,9 @@ class LightGbmRegressor(
:param handle_missing_value: Enable special handling of missing value or
not.
+ :param use_zero_as_missing_value: Enable usage of zero (0) as missing
+ value.
+
:param minimum_example_count_per_group: Minimum number of instances per
categorical group.
@@ -142,6 +145,7 @@ def __init__(
batch_size=1048576,
use_categorical_split=None,
handle_missing_value=True,
+ use_zero_as_missing_value=False,
minimum_example_count_per_group=100,
maximum_categorical_split_point_count=32,
categorical_smoothing=10.0,
@@ -168,6 +172,7 @@ def __init__(
self.batch_size = batch_size
self.use_categorical_split = use_categorical_split
self.handle_missing_value = handle_missing_value
+ self.use_zero_as_missing_value = use_zero_as_missing_value
self.minimum_example_count_per_group = minimum_example_count_per_group
self.maximum_categorical_split_point_count = maximum_categorical_split_point_count
self.categorical_smoothing = categorical_smoothing
@@ -202,6 +207,7 @@ def _get_node(self, **all_args):
batch_size=self.batch_size,
use_categorical_split=self.use_categorical_split,
handle_missing_value=self.handle_missing_value,
+ use_zero_as_missing_value=self.use_zero_as_missing_value,
minimum_example_count_per_group=self.minimum_example_count_per_group,
maximum_categorical_split_point_count=self.maximum_categorical_split_point_count,
categorical_smoothing=self.categorical_smoothing,
diff --git a/src/python/nimbusml/internal/core/preprocessing/datetimesplitter.py b/src/python/nimbusml/internal/core/preprocessing/datetimesplitter.py
new file mode 100644
index 00000000..a00c3dc6
--- /dev/null
+++ b/src/python/nimbusml/internal/core/preprocessing/datetimesplitter.py
@@ -0,0 +1,57 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+DateTimeSplitter
+"""
+
+__all__ = ["DateTimeSplitter"]
+
+
+from ...entrypoints.transforms_datetimesplitter import \
+ transforms_datetimesplitter
+from ...utils.utils import trace
+from ..base_pipeline_item import BasePipelineItem, DefaultSignature
+
+
+class DateTimeSplitter(BasePipelineItem, DefaultSignature):
+ """
+ **Description**
+ Splits a date time value into each individual component
+
+ :param prefix: Output column prefix.
+
+ :param country: Country to get holidays for. Defaults to none if not
+ passed.
+
+ :param params: Additional arguments sent to compute engine.
+
+ """
+
+ @trace
+ def __init__(
+ self,
+ prefix,
+ country='None',
+ **params):
+ BasePipelineItem.__init__(
+ self, type='transform', **params)
+
+ self.prefix = prefix
+ self.country = country
+
+ @property
+ def _entrypoint(self):
+ return transforms_datetimesplitter
+
+ @trace
+ def _get_node(self, **all_args):
+ algo_args = dict(
+ source=self.source,
+ prefix=self.prefix,
+ country=self.country)
+
+ all_args.update(algo_args)
+ return self._entrypoint(**all_args)
diff --git a/src/python/nimbusml/internal/core/preprocessing/normalization/robustscaler.py b/src/python/nimbusml/internal/core/preprocessing/normalization/robustscaler.py
new file mode 100644
index 00000000..08845bae
--- /dev/null
+++ b/src/python/nimbusml/internal/core/preprocessing/normalization/robustscaler.py
@@ -0,0 +1,103 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+RobustScaler
+"""
+
+__all__ = ["RobustScaler"]
+
+
+from ....entrypoints.transforms_robustscaler import transforms_robustscaler
+from ....utils.utils import trace
+from ...base_pipeline_item import BasePipelineItem, DefaultSignature
+
+
+class RobustScaler(BasePipelineItem, DefaultSignature):
+ """
+ **Description**
+ Removes the median and scales the data according to the quantile range.
+
+ :param center: If True, center the data before scaling.
+
+ :param scale: If True, scale the data to interquartile range.
+
+ :param quantile_min: Min for the quantile range used to calculate scale.
+
+ :param quantile_max: Max for the quantile range used to calculate scale.
+
+ :param params: Additional arguments sent to compute engine.
+
+ """
+
+ @trace
+ def __init__(
+ self,
+ center=True,
+ scale=True,
+ quantile_min=25.0,
+ quantile_max=75.0,
+ **params):
+ BasePipelineItem.__init__(
+ self, type='transform', **params)
+
+ self.center = center
+ self.scale = scale
+ self.quantile_min = quantile_min
+ self.quantile_max = quantile_max
+
+ @property
+ def _entrypoint(self):
+ return transforms_robustscaler
+
+ @trace
+ def _get_node(self, **all_args):
+
+ input_columns = self.input
+ if input_columns is None and 'input' in all_args:
+ input_columns = all_args['input']
+ if 'input' in all_args:
+ all_args.pop('input')
+
+ output_columns = self.output
+ if output_columns is None and 'output' in all_args:
+ output_columns = all_args['output']
+ if 'output' in all_args:
+ all_args.pop('output')
+
+ # validate input
+ if input_columns is None:
+ raise ValueError(
+ "'None' input passed when it cannot be none.")
+
+ if not isinstance(input_columns, list):
+ raise ValueError(
+ "input has to be a list of strings, instead got %s" %
+ type(input_columns))
+
+ # validate output
+ if output_columns is None:
+ output_columns = input_columns
+
+ if not isinstance(output_columns, list):
+ raise ValueError(
+ "output has to be a list of strings, instead got %s" %
+ type(output_columns))
+
+ algo_args = dict(
+ column=[
+ dict(
+ Source=i,
+ Name=o) for i,
+ o in zip(
+ input_columns,
+ output_columns)] if input_columns else None,
+ center=self.center,
+ scale=self.scale,
+ quantile_min=self.quantile_min,
+ quantile_max=self.quantile_max)
+
+ all_args.update(algo_args)
+ return self._entrypoint(**all_args)
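Beyond the defaults shown in the examples earlier in this change, quantile_min and quantile_max control the spread used for scaling. A minimal sketch with an illustrative 10th-90th percentile range:

import pandas as pd
from nimbusml import Pipeline
from nimbusml.preprocessing.normalization import RobustScaler

df = pd.DataFrame(dict(c0=[1.0, 3.0, 5.0, 7.0, 9.0, 100.0]))

# Center on the median and scale by the 10th-90th percentile spread
# instead of the default interquartile range.
xf = RobustScaler(columns='c0', center=True, scale=True,
                  quantile_min=10.0, quantile_max=90.0)
print(Pipeline([xf]).fit_transform(df))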
diff --git a/src/python/nimbusml/internal/core/preprocessing/onnxrunner.py b/src/python/nimbusml/internal/core/preprocessing/onnxrunner.py
new file mode 100644
index 00000000..34ed46ba
--- /dev/null
+++ b/src/python/nimbusml/internal/core/preprocessing/onnxrunner.py
@@ -0,0 +1,71 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+OnnxRunner
+"""
+
+__all__ = ["OnnxRunner"]
+
+
+from ...entrypoints.models_onnxtransformer import models_onnxtransformer
+from ...utils.utils import trace
+from ..base_pipeline_item import BasePipelineItem, DefaultSignature
+
+
+class OnnxRunner(BasePipelineItem, DefaultSignature):
+ """
+ **Description**
+ Applies an ONNX model to a dataset.
+
+ :param model_file: Path to the onnx model file.
+
+ :param input_columns: Name of the input column.
+
+ :param output_columns: Name of the output column.
+
+ :param gpu_device_id: GPU device id to run on (e.g. 0,1,..). Null for CPU.
+ Requires CUDA 9.1.
+
+ :param fallback_to_cpu: If true, resumes execution on CPU upon GPU error.
+ If false, will raise the GPU exception.
+
+ :param params: Additional arguments sent to compute engine.
+
+ """
+
+ @trace
+ def __init__(
+ self,
+ model_file,
+ input_columns=None,
+ output_columns=None,
+ gpu_device_id=None,
+ fallback_to_cpu=False,
+ **params):
+ BasePipelineItem.__init__(
+ self, type='transform', **params)
+
+ self.model_file = model_file
+ self.input_columns = input_columns
+ self.output_columns = output_columns
+ self.gpu_device_id = gpu_device_id
+ self.fallback_to_cpu = fallback_to_cpu
+
+ @property
+ def _entrypoint(self):
+ return models_onnxtransformer
+
+ @trace
+ def _get_node(self, **all_args):
+ algo_args = dict(
+ model_file=self.model_file,
+ input_columns=self.input_columns,
+ output_columns=self.output_columns,
+ gpu_device_id=self.gpu_device_id,
+ fallback_to_cpu=self.fallback_to_cpu)
+
+ all_args.update(algo_args)
+ return self._entrypoint(**all_args)
diff --git a/src/python/nimbusml/internal/core/preprocessing/tokeyimputer.py b/src/python/nimbusml/internal/core/preprocessing/tokeyimputer.py
new file mode 100644
index 00000000..e82498a3
--- /dev/null
+++ b/src/python/nimbusml/internal/core/preprocessing/tokeyimputer.py
@@ -0,0 +1,80 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+ToKeyImputer
+"""
+
+__all__ = ["ToKeyImputer"]
+
+
+from ...entrypoints.transforms_categoryimputer import \
+ transforms_categoryimputer
+from ...utils.utils import trace
+from ..base_pipeline_item import BasePipelineItem, DefaultSignature
+
+
+class ToKeyImputer(BasePipelineItem, DefaultSignature):
+ """
+ **Description**
+ Fills in missing values in a column based on the most frequent value
+
+ :param params: Additional arguments sent to compute engine.
+
+ """
+
+ @trace
+ def __init__(
+ self,
+ **params):
+ BasePipelineItem.__init__(
+ self, type='transform', **params)
+
+ @property
+ def _entrypoint(self):
+ return transforms_categoryimputer
+
+ @trace
+ def _get_node(self, **all_args):
+
+ input_columns = self.input
+ if input_columns is None and 'input' in all_args:
+ input_columns = all_args['input']
+ if 'input' in all_args:
+ all_args.pop('input')
+
+ output_columns = self.output
+ if output_columns is None and 'output' in all_args:
+ output_columns = all_args['output']
+ if 'output' in all_args:
+ all_args.pop('output')
+
+ # validate input
+ if input_columns is None:
+ raise ValueError(
+ "'None' input passed when it cannot be none.")
+
+ if not isinstance(input_columns, list):
+ raise ValueError(
+ "input has to be a list of strings, instead got %s" %
+ type(input_columns))
+
+ # validate output
+ if output_columns is None:
+ output_columns = input_columns
+
+ if not isinstance(output_columns, list):
+ raise ValueError(
+ "output has to be a list of strings, instead got %s" %
+ type(output_columns))
+
+ algo_args = dict(
+ column=[
+ dict(
+ Source=i, Name=o) for i, o in zip(
+ input_columns, output_columns)] if input_columns else None)
+
+ all_args.update(algo_args)
+ return self._entrypoint(**all_args)
diff --git a/src/python/nimbusml/internal/core/preprocessing/tostring.py b/src/python/nimbusml/internal/core/preprocessing/tostring.py
new file mode 100644
index 00000000..2294c715
--- /dev/null
+++ b/src/python/nimbusml/internal/core/preprocessing/tostring.py
@@ -0,0 +1,79 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+ToString
+"""
+
+__all__ = ["ToString"]
+
+
+from ...entrypoints.transforms_tostring import transforms_tostring
+from ...utils.utils import trace
+from ..base_pipeline_item import BasePipelineItem, DefaultSignature
+
+
+class ToString(BasePipelineItem, DefaultSignature):
+ """
+ **Description**
+ Turns the given column into a column of its string representation
+
+ :param params: Additional arguments sent to compute engine.
+
+ """
+
+ @trace
+ def __init__(
+ self,
+ **params):
+ BasePipelineItem.__init__(
+ self, type='transform', **params)
+
+ @property
+ def _entrypoint(self):
+ return transforms_tostring
+
+ @trace
+ def _get_node(self, **all_args):
+
+ input_columns = self.input
+ if input_columns is None and 'input' in all_args:
+ input_columns = all_args['input']
+ if 'input' in all_args:
+ all_args.pop('input')
+
+ output_columns = self.output
+ if output_columns is None and 'output' in all_args:
+ output_columns = all_args['output']
+ if 'output' in all_args:
+ all_args.pop('output')
+
+ # validate input
+ if input_columns is None:
+ raise ValueError(
+ "'None' input passed when it cannot be none.")
+
+ if not isinstance(input_columns, list):
+ raise ValueError(
+ "input has to be a list of strings, instead got %s" %
+ type(input_columns))
+
+ # validate output
+ if output_columns is None:
+ output_columns = input_columns
+
+ if not isinstance(output_columns, list):
+ raise ValueError(
+ "output has to be a list of strings, instead got %s" %
+ type(output_columns))
+
+ algo_args = dict(
+ column=[
+ dict(
+ Source=i, Name=o) for i, o in zip(
+ input_columns, output_columns)] if input_columns else None)
+
+ all_args.update(algo_args)
+ return self._entrypoint(**all_args)
diff --git a/src/python/nimbusml/internal/core/timeseries/ssaforecaster.py b/src/python/nimbusml/internal/core/timeseries/ssaforecaster.py
index ce9064b5..f1ee5f6b 100644
--- a/src/python/nimbusml/internal/core/timeseries/ssaforecaster.py
+++ b/src/python/nimbusml/internal/core/timeseries/ssaforecaster.py
@@ -38,7 +38,7 @@ class SsaForecaster(BasePipelineItem, DefaultSignature):
:param series_length: The length of series that is kept in buffer for
modeling (parameter N).
- :param train_size: The length of series from the begining used for
+ :param train_size: The length of series from the beginning used for
training.
:param horizon: The number of values to forecast.
diff --git a/src/python/nimbusml/internal/core/timeseries/timeseriesimputer.py b/src/python/nimbusml/internal/core/timeseries/timeseriesimputer.py
new file mode 100644
index 00000000..7a01c9c1
--- /dev/null
+++ b/src/python/nimbusml/internal/core/timeseries/timeseriesimputer.py
@@ -0,0 +1,78 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+TimeSeriesImputer
+"""
+
+__all__ = ["TimeSeriesImputer"]
+
+
+from ...entrypoints.transforms_timeseriesimputer import \
+ transforms_timeseriesimputer
+from ...utils.utils import trace
+from ..base_pipeline_item import BasePipelineItem, DefaultSignature
+
+
+class TimeSeriesImputer(BasePipelineItem, DefaultSignature):
+ """
+ **Description**
+ Fills in missing rows and values
+
+ :param time_series_column: Column representing the time.
+
+ :param grain_columns: List of grain columns.
+
+ :param filter_columns: Columns to filter.
+
+ :param filter_mode: Filter mode. Either include or exclude.
+
+ :param impute_mode: Mode for imputing, defaults to ForwardFill if not
+ provided.
+
+ :param supress_type_errors: Suppress the errors that would occur if a
+ column and impute mode are incompatible. If true, will skip the column.
+ If false, will stop and throw an error.
+
+ :param params: Additional arguments sent to compute engine.
+
+ """
+
+ @trace
+ def __init__(
+ self,
+ time_series_column,
+ grain_columns,
+ filter_columns=None,
+ filter_mode='Exclude',
+ impute_mode='ForwardFill',
+ supress_type_errors=False,
+ **params):
+ BasePipelineItem.__init__(
+ self, type='transform', **params)
+
+ self.time_series_column = time_series_column
+ self.grain_columns = grain_columns
+ self.filter_columns = filter_columns
+ self.filter_mode = filter_mode
+ self.impute_mode = impute_mode
+ self.supress_type_errors = supress_type_errors
+
+ @property
+ def _entrypoint(self):
+ return transforms_timeseriesimputer
+
+ @trace
+ def _get_node(self, **all_args):
+ algo_args = dict(
+ time_series_column=self.time_series_column,
+ grain_columns=self.grain_columns,
+ filter_columns=self.filter_columns,
+ filter_mode=self.filter_mode,
+ impute_mode=self.impute_mode,
+ supress_type_errors=self.supress_type_errors)
+
+ all_args.update(algo_args)
+ return self._entrypoint(**all_args)
diff --git a/src/python/nimbusml/internal/entrypoints/_calibratortrainer_fixedplattcalibrator.py b/src/python/nimbusml/internal/entrypoints/_calibratortrainer_fixedplattcalibrator.py
index 35e80514..8f1904cb 100644
--- a/src/python/nimbusml/internal/entrypoints/_calibratortrainer_fixedplattcalibrator.py
+++ b/src/python/nimbusml/internal/entrypoints/_calibratortrainer_fixedplattcalibrator.py
@@ -10,16 +10,16 @@
def fixed_platt_calibrator(
- slope=1.0,
+ slope=-1.0,
offset=0.0,
**params):
"""
**Description**
None
- :param slope: The slope parameter of f(x) = 1 / (1 + exp(-slope *
+ :param slope: The slope parameter of f(x) = 1 / (1 + exp(slope *
x + offset) (settings).
- :param offset: The offset parameter of f(x) = 1 / (1 + exp(-slope
+ :param offset: The offset parameter of f(x) = 1 / (1 + exp(slope
* x + offset) (settings).
"""
diff --git a/src/python/nimbusml/internal/entrypoints/models_fixedplattcalibrator.py b/src/python/nimbusml/internal/entrypoints/models_fixedplattcalibrator.py
index f5665675..4876c6ff 100644
--- a/src/python/nimbusml/internal/entrypoints/models_fixedplattcalibrator.py
+++ b/src/python/nimbusml/internal/entrypoints/models_fixedplattcalibrator.py
@@ -13,7 +13,7 @@ def models_fixedplattcalibrator(
data,
uncalibrated_predictor_model,
predictor_model=None,
- slope=1.0,
+ slope=-1.0,
offset=0.0,
max_rows=1000000000,
**params):
@@ -23,12 +23,12 @@ def models_fixedplattcalibrator(
model
:param slope: The slope parameter of the calibration function 1 /
- (1 + exp(-slope * x + offset) (inputs).
+ (1 + exp(slope * x + offset) (inputs).
:param data: Input dataset (inputs).
:param uncalibrated_predictor_model: The predictor to calibrate
(inputs).
:param offset: The offset parameter of the calibration function 1
- / (1 + exp(-slope * x + offset) (inputs).
+ / (1 + exp(slope * x + offset) (inputs).
:param max_rows: The maximum number of examples to train the
calibrator on (inputs).
:param predictor_model: The trained model (outputs).
diff --git a/src/python/nimbusml/internal/entrypoints/models_multioutputregressionevaluator.py b/src/python/nimbusml/internal/entrypoints/models_multioutputregressionevaluator.py
index d7283221..8b56ca4b 100644
--- a/src/python/nimbusml/internal/entrypoints/models_multioutputregressionevaluator.py
+++ b/src/python/nimbusml/internal/entrypoints/models_multioutputregressionevaluator.py
@@ -15,7 +15,7 @@ def models_multioutputregressionevaluator(
per_instance_metrics=None,
name_column='Name',
loss_function=None,
- supress_scores_and_labels=False,
+ suppress_scores_and_labels=False,
label_column=None,
weight_column=None,
score_column=None,
@@ -28,7 +28,7 @@ def models_multioutputregressionevaluator(
:param data: The data to be used for evaluation. (inputs).
:param name_column: Name column name. (inputs).
:param loss_function: Loss function (inputs).
- :param supress_scores_and_labels: Supress labels and scores in
+ :param suppress_scores_and_labels: Suppress labels and scores in
per-instance outputs? (inputs).
:param label_column: Column to use for labels. (inputs).
:param weight_column: Weight column name. (inputs).
@@ -60,9 +60,9 @@ def models_multioutputregressionevaluator(
obj=loss_function,
none_acceptable=True,
is_of_type=dict)
- if supress_scores_and_labels is not None:
- inputs['SupressScoresAndLabels'] = try_set(
- obj=supress_scores_and_labels,
+ if suppress_scores_and_labels is not None:
+ inputs['SuppressScoresAndLabels'] = try_set(
+ obj=suppress_scores_and_labels,
none_acceptable=True,
is_of_type=bool)
if label_column is not None:
diff --git a/src/python/nimbusml/internal/entrypoints/models_onnxconverter.py b/src/python/nimbusml/internal/entrypoints/models_onnxconverter.py
new file mode 100644
index 00000000..3c080eb6
--- /dev/null
+++ b/src/python/nimbusml/internal/entrypoints/models_onnxconverter.py
@@ -0,0 +1,116 @@
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+Models.OnnxConverter
+"""
+
+
+from ..utils.entrypoints import EntryPoint
+from ..utils.utils import try_set, unlist
+
+
+def models_onnxconverter(
+ onnx,
+ data_file=None,
+ json=None,
+ name=None,
+ domain=None,
+ inputs_to_drop=None,
+ outputs_to_drop=None,
+ model=None,
+ onnx_version='Stable',
+ predictive_model=None,
+ **params):
+ """
+ **Description**
+ Converts the model to ONNX format.
+
+ :param data_file: The data file (inputs).
+ :param onnx: The path to write the output ONNX to. (inputs).
+ :param json: The path to write the output JSON to. (inputs).
+ :param name: The 'name' property in the output ONNX. By default
+ this will be the ONNX extension-less name. (inputs).
+ :param domain: The 'domain' property in the output ONNX.
+ (inputs).
+ :param inputs_to_drop: Array of input column names to drop
+ (inputs).
+ :param outputs_to_drop: Array of output column names to drop
+ (inputs).
+ :param model: Model that needs to be converted to ONNX format.
+ (inputs).
+ :param onnx_version: The targeted ONNX version. It can be either
+ "Stable" or "Experimental". If "Experimental" is used,
+ produced model can contain components that is not officially
+ supported in ONNX standard. (inputs).
+ :param predictive_model: Predictor model that needs to be
+ converted to ONNX format. (inputs).
+ """
+
+ entrypoint_name = 'Models.OnnxConverter'
+ inputs = {}
+ outputs = {}
+
+ if data_file is not None:
+ inputs['DataFile'] = try_set(
+ obj=data_file,
+ none_acceptable=True,
+ is_of_type=str)
+ if onnx is not None:
+ inputs['Onnx'] = try_set(
+ obj=onnx,
+ none_acceptable=False,
+ is_of_type=str)
+ if json is not None:
+ inputs['Json'] = try_set(
+ obj=json,
+ none_acceptable=True,
+ is_of_type=str)
+ if name is not None:
+ inputs['Name'] = try_set(
+ obj=name,
+ none_acceptable=True,
+ is_of_type=str,
+ is_column=True)
+ if domain is not None:
+ inputs['Domain'] = try_set(
+ obj=domain,
+ none_acceptable=True,
+ is_of_type=str)
+ if inputs_to_drop is not None:
+ inputs['InputsToDrop'] = try_set(
+ obj=inputs_to_drop,
+ none_acceptable=True,
+ is_of_type=list)
+ if outputs_to_drop is not None:
+ inputs['OutputsToDrop'] = try_set(
+ obj=outputs_to_drop,
+ none_acceptable=True,
+ is_of_type=list)
+ if model is not None:
+ inputs['Model'] = try_set(
+ obj=model,
+ none_acceptable=True,
+ is_of_type=str)
+ if onnx_version is not None:
+ inputs['OnnxVersion'] = try_set(
+ obj=onnx_version,
+ none_acceptable=True,
+ is_of_type=str,
+ values=[
+ 'Stable',
+ 'Experimental'])
+ if predictive_model is not None:
+ inputs['PredictiveModel'] = try_set(
+ obj=predictive_model, none_acceptable=True, is_of_type=str)
+
+ input_variables = {
+ x for x in unlist(inputs.values())
+ if isinstance(x, str) and x.startswith("$")}
+ output_variables = {
+ x for x in unlist(outputs.values())
+ if isinstance(x, str) and x.startswith("$")}
+
+ entrypoint = EntryPoint(
+ name=entrypoint_name, inputs=inputs, outputs=outputs,
+ input_variables=input_variables,
+ output_variables=output_variables)
+ return entrypoint
diff --git a/src/python/nimbusml/internal/entrypoints/models_onnxtransformer.py b/src/python/nimbusml/internal/entrypoints/models_onnxtransformer.py
new file mode 100644
index 00000000..173c976a
--- /dev/null
+++ b/src/python/nimbusml/internal/entrypoints/models_onnxtransformer.py
@@ -0,0 +1,96 @@
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+Models.OnnxTransformer
+"""
+
+import numbers
+
+from ..utils.entrypoints import EntryPoint
+from ..utils.utils import try_set, unlist
+
+
+def models_onnxtransformer(
+ model_file,
+ data,
+ output_data=None,
+ model=None,
+ input_columns=None,
+ output_columns=None,
+ gpu_device_id=None,
+ fallback_to_cpu=False,
+ **params):
+ """
+ **Description**
+ Applies an ONNX model to a dataset.
+
+ :param model_file: Path to the onnx model file. (inputs).
+ :param input_columns: Name of the input column. (inputs).
+ :param data: Input dataset (inputs).
+ :param output_columns: Name of the output column. (inputs).
+ :param gpu_device_id: GPU device id to run on (e.g. 0,1,..). Null
+ for CPU. Requires CUDA 9.1. (inputs).
+ :param fallback_to_cpu: If true, resumes execution on CPU upon
+        GPU error. If false, will raise the GPU exception. (inputs).
+ :param output_data: ONNX transformed dataset (outputs).
+ :param model: Transform model (outputs).
+ """
+
+ entrypoint_name = 'Models.OnnxTransformer'
+ inputs = {}
+ outputs = {}
+
+ if model_file is not None:
+ inputs['ModelFile'] = try_set(
+ obj=model_file,
+ none_acceptable=False,
+ is_of_type=str)
+ if input_columns is not None:
+ inputs['InputColumns'] = try_set(
+ obj=input_columns,
+ none_acceptable=True,
+ is_of_type=list,
+ is_column=True)
+ if data is not None:
+ inputs['Data'] = try_set(
+ obj=data,
+ none_acceptable=False,
+ is_of_type=str)
+ if output_columns is not None:
+ inputs['OutputColumns'] = try_set(
+ obj=output_columns,
+ none_acceptable=True,
+ is_of_type=list,
+ is_column=True)
+ if gpu_device_id is not None:
+ inputs['GpuDeviceId'] = try_set(
+ obj=gpu_device_id,
+ none_acceptable=True,
+ is_of_type=numbers.Real)
+ if fallback_to_cpu is not None:
+ inputs['FallbackToCpu'] = try_set(
+ obj=fallback_to_cpu,
+ none_acceptable=True,
+ is_of_type=bool)
+ if output_data is not None:
+ outputs['OutputData'] = try_set(
+ obj=output_data,
+ none_acceptable=False,
+ is_of_type=str)
+ if model is not None:
+ outputs['Model'] = try_set(
+ obj=model,
+ none_acceptable=False,
+ is_of_type=str)
+
+ input_variables = {
+ x for x in unlist(inputs.values())
+ if isinstance(x, str) and x.startswith("$")}
+ output_variables = {
+ x for x in unlist(outputs.values())
+ if isinstance(x, str) and x.startswith("$")}
+
+ entrypoint = EntryPoint(
+ name=entrypoint_name, inputs=inputs, outputs=outputs,
+ input_variables=input_variables,
+ output_variables=output_variables)
+ return entrypoint
diff --git a/src/python/nimbusml/internal/entrypoints/timeseriesprocessingentrypoints_ssaforecasting.py b/src/python/nimbusml/internal/entrypoints/timeseriesprocessingentrypoints_ssaforecasting.py
index f02da3a7..1684783c 100644
--- a/src/python/nimbusml/internal/entrypoints/timeseriesprocessingentrypoints_ssaforecasting.py
+++ b/src/python/nimbusml/internal/entrypoints/timeseriesprocessingentrypoints_ssaforecasting.py
@@ -43,7 +43,7 @@ def timeseriesprocessingentrypoints_ssaforecasting(
building the trajectory matrix (parameter L). (inputs).
:param series_length: The length of series that is kept in buffer
for modeling (parameter N). (inputs).
- :param train_size: The length of series from the begining used
+ :param train_size: The length of series from the beginning used
for training. (inputs).
:param horizon: The number of values to forecast. (inputs).
:param confidence_level: The confidence level in [0, 1) for
diff --git a/src/python/nimbusml/internal/entrypoints/trainers_generalizedadditivemodelbinaryclassifier.py b/src/python/nimbusml/internal/entrypoints/trainers_generalizedadditivemodelbinaryclassifier.py
index e5b62a23..5c281338 100644
--- a/src/python/nimbusml/internal/entrypoints/trainers_generalizedadditivemodelbinaryclassifier.py
+++ b/src/python/nimbusml/internal/entrypoints/trainers_generalizedadditivemodelbinaryclassifier.py
@@ -36,7 +36,7 @@ def trainers_generalizedadditivemodelbinaryclassifier(
**Description**
Trains a gradient boosted stump per feature, on all features
simultaneously, to fit target values using least-squares. It
- mantains no interactions between features.
+ maintains no interactions between features.
:param number_of_iterations: Total number of iterations over all
features (inputs).
diff --git a/src/python/nimbusml/internal/entrypoints/trainers_generalizedadditivemodelregressor.py b/src/python/nimbusml/internal/entrypoints/trainers_generalizedadditivemodelregressor.py
index 1c56a706..2b9334f8 100644
--- a/src/python/nimbusml/internal/entrypoints/trainers_generalizedadditivemodelregressor.py
+++ b/src/python/nimbusml/internal/entrypoints/trainers_generalizedadditivemodelregressor.py
@@ -36,7 +36,7 @@ def trainers_generalizedadditivemodelregressor(
**Description**
Trains a gradient boosted stump per feature, on all features
simultaneously, to fit target values using least-squares. It
- mantains no interactions between features.
+ maintains no interactions between features.
:param number_of_iterations: Total number of iterations over all
features (inputs).
diff --git a/src/python/nimbusml/internal/entrypoints/trainers_lightgbmbinaryclassifier.py b/src/python/nimbusml/internal/entrypoints/trainers_lightgbmbinaryclassifier.py
index 5a54c69f..4ae20be2 100644
--- a/src/python/nimbusml/internal/entrypoints/trainers_lightgbmbinaryclassifier.py
+++ b/src/python/nimbusml/internal/entrypoints/trainers_lightgbmbinaryclassifier.py
@@ -35,6 +35,7 @@ def trainers_lightgbmbinaryclassifier(
batch_size=1048576,
use_categorical_split=None,
handle_missing_value=True,
+ use_zero_as_missing_value=False,
minimum_example_count_per_group=100,
maximum_categorical_split_point_count=32,
categorical_smoothing=10.0,
@@ -88,6 +89,8 @@ def trainers_lightgbmbinaryclassifier(
(inputs).
:param handle_missing_value: Enable special handling of missing
value or not. (inputs).
+ :param use_zero_as_missing_value: Enable usage of zero (0) as
+ missing value. (inputs).
:param minimum_example_count_per_group: Minimum number of
instances per categorical group. (inputs).
:param maximum_categorical_split_point_count: Max number of
@@ -243,6 +246,11 @@ def trainers_lightgbmbinaryclassifier(
obj=handle_missing_value,
none_acceptable=True,
is_of_type=bool)
+ if use_zero_as_missing_value is not None:
+ inputs['UseZeroAsMissingValue'] = try_set(
+ obj=use_zero_as_missing_value,
+ none_acceptable=True,
+ is_of_type=bool)
if minimum_example_count_per_group is not None:
inputs['MinimumExampleCountPerGroup'] = try_set(
obj=minimum_example_count_per_group,
diff --git a/src/python/nimbusml/internal/entrypoints/trainers_lightgbmclassifier.py b/src/python/nimbusml/internal/entrypoints/trainers_lightgbmclassifier.py
index 28f13e0a..d78f2b48 100644
--- a/src/python/nimbusml/internal/entrypoints/trainers_lightgbmclassifier.py
+++ b/src/python/nimbusml/internal/entrypoints/trainers_lightgbmclassifier.py
@@ -35,6 +35,7 @@ def trainers_lightgbmclassifier(
batch_size=1048576,
use_categorical_split=None,
handle_missing_value=True,
+ use_zero_as_missing_value=False,
minimum_example_count_per_group=100,
maximum_categorical_split_point_count=32,
categorical_smoothing=10.0,
@@ -86,6 +87,8 @@ def trainers_lightgbmclassifier(
(inputs).
:param handle_missing_value: Enable special handling of missing
value or not. (inputs).
+ :param use_zero_as_missing_value: Enable usage of zero (0) as
+ missing value. (inputs).
:param minimum_example_count_per_group: Minimum number of
instances per categorical group. (inputs).
:param maximum_categorical_split_point_count: Max number of
@@ -240,6 +243,11 @@ def trainers_lightgbmclassifier(
obj=handle_missing_value,
none_acceptable=True,
is_of_type=bool)
+ if use_zero_as_missing_value is not None:
+ inputs['UseZeroAsMissingValue'] = try_set(
+ obj=use_zero_as_missing_value,
+ none_acceptable=True,
+ is_of_type=bool)
if minimum_example_count_per_group is not None:
inputs['MinimumExampleCountPerGroup'] = try_set(
obj=minimum_example_count_per_group,
diff --git a/src/python/nimbusml/internal/entrypoints/trainers_lightgbmranker.py b/src/python/nimbusml/internal/entrypoints/trainers_lightgbmranker.py
index 5a3a44fd..0c2e9e0a 100644
--- a/src/python/nimbusml/internal/entrypoints/trainers_lightgbmranker.py
+++ b/src/python/nimbusml/internal/entrypoints/trainers_lightgbmranker.py
@@ -34,6 +34,7 @@ def trainers_lightgbmranker(
batch_size=1048576,
use_categorical_split=None,
handle_missing_value=True,
+ use_zero_as_missing_value=False,
minimum_example_count_per_group=100,
maximum_categorical_split_point_count=32,
categorical_smoothing=10.0,
@@ -83,6 +84,8 @@ def trainers_lightgbmranker(
(inputs).
:param handle_missing_value: Enable special handling of missing
value or not. (inputs).
+ :param use_zero_as_missing_value: Enable usage of zero (0) as
+ missing value. (inputs).
:param minimum_example_count_per_group: Minimum number of
instances per categorical group. (inputs).
:param maximum_categorical_split_point_count: Max number of
@@ -232,6 +235,11 @@ def trainers_lightgbmranker(
obj=handle_missing_value,
none_acceptable=True,
is_of_type=bool)
+ if use_zero_as_missing_value is not None:
+ inputs['UseZeroAsMissingValue'] = try_set(
+ obj=use_zero_as_missing_value,
+ none_acceptable=True,
+ is_of_type=bool)
if minimum_example_count_per_group is not None:
inputs['MinimumExampleCountPerGroup'] = try_set(
obj=minimum_example_count_per_group,
diff --git a/src/python/nimbusml/internal/entrypoints/trainers_lightgbmregressor.py b/src/python/nimbusml/internal/entrypoints/trainers_lightgbmregressor.py
index 32260ebe..9fbf3e69 100644
--- a/src/python/nimbusml/internal/entrypoints/trainers_lightgbmregressor.py
+++ b/src/python/nimbusml/internal/entrypoints/trainers_lightgbmregressor.py
@@ -32,6 +32,7 @@ def trainers_lightgbmregressor(
batch_size=1048576,
use_categorical_split=None,
handle_missing_value=True,
+ use_zero_as_missing_value=False,
minimum_example_count_per_group=100,
maximum_categorical_split_point_count=32,
categorical_smoothing=10.0,
@@ -78,6 +79,8 @@ def trainers_lightgbmregressor(
(inputs).
:param handle_missing_value: Enable special handling of missing
value or not. (inputs).
+ :param use_zero_as_missing_value: Enable usage of zero (0) as
+ missing value. (inputs).
:param minimum_example_count_per_group: Minimum number of
instances per categorical group. (inputs).
:param maximum_categorical_split_point_count: Max number of
@@ -218,6 +221,11 @@ def trainers_lightgbmregressor(
obj=handle_missing_value,
none_acceptable=True,
is_of_type=bool)
+ if use_zero_as_missing_value is not None:
+ inputs['UseZeroAsMissingValue'] = try_set(
+ obj=use_zero_as_missing_value,
+ none_acceptable=True,
+ is_of_type=bool)
if minimum_example_count_per_group is not None:
inputs['MinimumExampleCountPerGroup'] = try_set(
obj=minimum_example_count_per_group,
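All four LightGBM entrypoints gain the same UseZeroAsMissingValue flag. A minimal sketch of how the generated helper builds a graph node with the new flag; the "$"-prefixed strings are placeholder graph variables, as in the rest of the generated entrypoints:

    from nimbusml.internal.entrypoints.trainers_lightgbmregressor import \
        trainers_lightgbmregressor

    # The boolean passes the try_set(is_of_type=bool) check and is
    # serialized into the node's inputs as 'UseZeroAsMissingValue'.
    node = trainers_lightgbmregressor(
        training_data='$data',
        predictor_model='$model',
        use_zero_as_missing_value=True)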
diff --git a/src/python/nimbusml/internal/entrypoints/trainers_localdeepsvmbinaryclassifier.py b/src/python/nimbusml/internal/entrypoints/trainers_localdeepsvmbinaryclassifier.py
new file mode 100644
index 00000000..0b2c5984
--- /dev/null
+++ b/src/python/nimbusml/internal/entrypoints/trainers_localdeepsvmbinaryclassifier.py
@@ -0,0 +1,175 @@
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+Trainers.LocalDeepSvmBinaryClassifier
+"""
+
+import numbers
+
+from ..utils.entrypoints import EntryPoint
+from ..utils.utils import try_set, unlist
+
+
+def trainers_localdeepsvmbinaryclassifier(
+ training_data,
+ predictor_model=None,
+ feature_column_name='Features',
+ label_column_name='Label',
+ example_weight_column_name=None,
+ normalize_features='Auto',
+ caching='Auto',
+ tree_depth=3,
+ lambda_w=0.1,
+ lambda_theta=0.01,
+ lambda_thetaprime=0.01,
+ sigma=1.0,
+ number_of_iterations=15000,
+ use_bias=True,
+ calibrator=None,
+ max_calibration_examples=1000000,
+ cache=True,
+ **params):
+ """
+ **Description**
+ LD-SVM learns a binary, non-linear SVM classifier with a kernel that
+ is specifically designed to reduce prediction time. LD-SVM
+ learns decision boundaries that are locally linear.
+
+ :param training_data: The data to be used for training (inputs).
+ :param feature_column_name: Column to use for features (inputs).
+ :param label_column_name: Column to use for labels (inputs).
+ :param example_weight_column_name: Column to use for example
+ weight (inputs).
+ :param normalize_features: Normalize option for the feature
+ column (inputs).
+ :param caching: Whether trainer should cache input training data
+ (inputs).
+ :param tree_depth: Depth of Local Deep SVM tree (inputs).
+ :param lambda_w: Regularizer for classifier parameter W (inputs).
+ :param lambda_theta: Regularizer for kernel parameter Theta
+ (inputs).
+ :param lambda_thetaprime: Regularizer for kernel parameter
+ Thetaprime (inputs).
+ :param sigma: Parameter for sigmoid sharpness (inputs).
+ :param number_of_iterations: Number of iterations (inputs).
+    :param use_bias: Whether to use a bias term (inputs).
+ :param calibrator: The calibrator kind to apply to the predictor.
+ Specify null for no calibration (inputs).
+ :param max_calibration_examples: The maximum number of examples
+ to use when training the calibrator (inputs).
+ :param cache: Whether to cache the data before the first
+ iteration (inputs).
+ :param predictor_model: The trained model (outputs).
+ """
+
+ entrypoint_name = 'Trainers.LocalDeepSvmBinaryClassifier'
+ inputs = {}
+ outputs = {}
+
+ if training_data is not None:
+ inputs['TrainingData'] = try_set(
+ obj=training_data,
+ none_acceptable=False,
+ is_of_type=str)
+ if feature_column_name is not None:
+ inputs['FeatureColumnName'] = try_set(
+ obj=feature_column_name,
+ none_acceptable=True,
+ is_of_type=str,
+ is_column=True)
+ if label_column_name is not None:
+ inputs['LabelColumnName'] = try_set(
+ obj=label_column_name,
+ none_acceptable=True,
+ is_of_type=str,
+ is_column=True)
+ if example_weight_column_name is not None:
+ inputs['ExampleWeightColumnName'] = try_set(
+ obj=example_weight_column_name,
+ none_acceptable=True,
+ is_of_type=str,
+ is_column=True)
+ if normalize_features is not None:
+ inputs['NormalizeFeatures'] = try_set(
+ obj=normalize_features,
+ none_acceptable=True,
+ is_of_type=str,
+ values=[
+ 'No',
+ 'Warn',
+ 'Auto',
+ 'Yes'])
+ if caching is not None:
+ inputs['Caching'] = try_set(
+ obj=caching,
+ none_acceptable=True,
+ is_of_type=str,
+ values=[
+ 'Auto',
+ 'Memory',
+ 'None'])
+ if tree_depth is not None:
+ inputs['TreeDepth'] = try_set(
+ obj=tree_depth,
+ none_acceptable=True,
+ is_of_type=numbers.Real)
+ if lambda_w is not None:
+ inputs['LambdaW'] = try_set(
+ obj=lambda_w,
+ none_acceptable=True,
+ is_of_type=numbers.Real)
+ if lambda_theta is not None:
+ inputs['LambdaTheta'] = try_set(
+ obj=lambda_theta,
+ none_acceptable=True,
+ is_of_type=numbers.Real)
+ if lambda_thetaprime is not None:
+ inputs['LambdaThetaprime'] = try_set(
+ obj=lambda_thetaprime,
+ none_acceptable=True,
+ is_of_type=numbers.Real)
+ if sigma is not None:
+ inputs['Sigma'] = try_set(
+ obj=sigma,
+ none_acceptable=True,
+ is_of_type=numbers.Real)
+ if number_of_iterations is not None:
+ inputs['NumberOfIterations'] = try_set(
+ obj=number_of_iterations,
+ none_acceptable=True,
+ is_of_type=numbers.Real)
+ if use_bias is not None:
+ inputs['UseBias'] = try_set(
+ obj=use_bias,
+ none_acceptable=True,
+ is_of_type=bool)
+ if calibrator is not None:
+ inputs['Calibrator'] = try_set(
+ obj=calibrator,
+ none_acceptable=True,
+ is_of_type=dict)
+ if max_calibration_examples is not None:
+ inputs['MaxCalibrationExamples'] = try_set(
+ obj=max_calibration_examples,
+ none_acceptable=True,
+ is_of_type=numbers.Real)
+ if cache is not None:
+ inputs['Cache'] = try_set(
+ obj=cache,
+ none_acceptable=True,
+ is_of_type=bool)
+ if predictor_model is not None:
+ outputs['PredictorModel'] = try_set(
+ obj=predictor_model, none_acceptable=False, is_of_type=str)
+
+ input_variables = {
+ x for x in unlist(inputs.values())
+ if isinstance(x, str) and x.startswith("$")}
+ output_variables = {
+ x for x in unlist(outputs.values())
+ if isinstance(x, str) and x.startswith("$")}
+
+ entrypoint = EntryPoint(
+ name=entrypoint_name, inputs=inputs, outputs=outputs,
+ input_variables=input_variables,
+ output_variables=output_variables)
+ return entrypoint
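The new LD-SVM entrypoint only validates arguments and wires them into an ML.NET graph node; it does not train anything by itself. A minimal sketch of constructing the node, with "$"-prefixed strings as graph variables:

    from nimbusml.internal.entrypoints.trainers_localdeepsvmbinaryclassifier import \
        trainers_localdeepsvmbinaryclassifier

    # Deeper trees give more locally linear regions (and a larger model);
    # the lambda_* arguments regularize the classifier and kernel parameters.
    node = trainers_localdeepsvmbinaryclassifier(
        training_data='$training_data',
        predictor_model='$predictor_model',
        tree_depth=3,
        lambda_w=0.1,
        number_of_iterations=15000)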
diff --git a/src/python/nimbusml/internal/entrypoints/trainers_logisticregressionclassifier.py b/src/python/nimbusml/internal/entrypoints/trainers_logisticregressionclassifier.py
index 5db498b1..61759e4d 100644
--- a/src/python/nimbusml/internal/entrypoints/trainers_logisticregressionclassifier.py
+++ b/src/python/nimbusml/internal/entrypoints/trainers_logisticregressionclassifier.py
@@ -33,7 +33,7 @@ def trainers_logisticregressionclassifier(
**params):
"""
**Description**
- Maximum entrypy classification is a method in statistics used to
+ Maximum entropy classification is a method in statistics used to
predict the probabilities of parallel events. The model
predicts the probabilities of parallel events by fitting data
to a softmax function.
diff --git a/src/python/nimbusml/internal/entrypoints/transforms_categoryimputer.py b/src/python/nimbusml/internal/entrypoints/transforms_categoryimputer.py
new file mode 100644
index 00000000..7f72261b
--- /dev/null
+++ b/src/python/nimbusml/internal/entrypoints/transforms_categoryimputer.py
@@ -0,0 +1,65 @@
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+Transforms.CategoryImputer
+"""
+
+
+from ..utils.entrypoints import EntryPoint
+from ..utils.utils import try_set, unlist
+
+
+def transforms_categoryimputer(
+ column,
+ data,
+ output_data=None,
+ model=None,
+ **params):
+ """
+ **Description**
+ Fills in missing values in a column based on the most frequent value
+
+ :param column: New column definition (optional form: name:src)
+ (inputs).
+ :param data: Input dataset (inputs).
+ :param output_data: Transformed dataset (outputs).
+ :param model: Transform model (outputs).
+ """
+
+ entrypoint_name = 'Transforms.CategoryImputer'
+ inputs = {}
+ outputs = {}
+
+ if column is not None:
+ inputs['Column'] = try_set(
+ obj=column,
+ none_acceptable=False,
+ is_of_type=list,
+ is_column=True)
+ if data is not None:
+ inputs['Data'] = try_set(
+ obj=data,
+ none_acceptable=False,
+ is_of_type=str)
+ if output_data is not None:
+ outputs['OutputData'] = try_set(
+ obj=output_data,
+ none_acceptable=False,
+ is_of_type=str)
+ if model is not None:
+ outputs['Model'] = try_set(
+ obj=model,
+ none_acceptable=False,
+ is_of_type=str)
+
+ input_variables = {
+ x for x in unlist(inputs.values())
+ if isinstance(x, str) and x.startswith("$")}
+ output_variables = {
+ x for x in unlist(outputs.values())
+ if isinstance(x, str) and x.startswith("$")}
+
+ entrypoint = EntryPoint(
+ name=entrypoint_name, inputs=inputs, outputs=outputs,
+ input_variables=input_variables,
+ output_variables=output_variables)
+ return entrypoint
diff --git a/src/python/nimbusml/internal/entrypoints/transforms_datetimesplitter.py b/src/python/nimbusml/internal/entrypoints/transforms_datetimesplitter.py
new file mode 100644
index 00000000..ac2524c8
--- /dev/null
+++ b/src/python/nimbusml/internal/entrypoints/transforms_datetimesplitter.py
@@ -0,0 +1,119 @@
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+Transforms.DateTimeSplitter
+"""
+
+
+from ..utils.entrypoints import EntryPoint
+from ..utils.utils import try_set, unlist
+
+
+def transforms_datetimesplitter(
+ source,
+ data,
+ prefix,
+ output_data=None,
+ model=None,
+ country='None',
+ **params):
+ """
+ **Description**
+ Splits a date time value into each individual component
+
+ :param source: Input column (inputs).
+ :param data: Input dataset (inputs).
+ :param prefix: Output column prefix (inputs).
+ :param country: Country to get holidays for. Defaults to none if
+ not passed (inputs).
+ :param output_data: Transformed dataset (outputs).
+ :param model: Transform model (outputs).
+ """
+
+ entrypoint_name = 'Transforms.DateTimeSplitter'
+ inputs = {}
+ outputs = {}
+
+ if source is not None:
+ inputs['Source'] = try_set(
+ obj=source,
+ none_acceptable=False,
+ is_of_type=str,
+ is_column=True)
+ if data is not None:
+ inputs['Data'] = try_set(
+ obj=data,
+ none_acceptable=False,
+ is_of_type=str)
+ if prefix is not None:
+ inputs['Prefix'] = try_set(
+ obj=prefix,
+ none_acceptable=False,
+ is_of_type=str)
+ if country is not None:
+ inputs['Country'] = try_set(
+ obj=country,
+ none_acceptable=True,
+ is_of_type=str,
+ values=[
+ 'None',
+ 'Argentina',
+ 'Australia',
+ 'Austria',
+ 'Belarus',
+ 'Belgium',
+ 'Brazil',
+ 'Canada',
+ 'Colombia',
+ 'Croatia',
+ 'Czech',
+ 'Denmark',
+ 'England',
+ 'Finland',
+ 'France',
+ 'Germany',
+ 'Hungary',
+ 'India',
+ 'Ireland',
+ 'IsleofMan',
+ 'Italy',
+ 'Japan',
+ 'Mexico',
+ 'Netherlands',
+ 'NewZealand',
+ 'NorthernIreland',
+ 'Norway',
+ 'Poland',
+ 'Portugal',
+ 'Scotland',
+ 'Slovenia',
+ 'SouthAfrica',
+ 'Spain',
+ 'Sweden',
+ 'Switzerland',
+ 'Ukraine',
+ 'UnitedKingdom',
+ 'UnitedStates',
+ 'Wales'])
+ if output_data is not None:
+ outputs['OutputData'] = try_set(
+ obj=output_data,
+ none_acceptable=False,
+ is_of_type=str)
+ if model is not None:
+ outputs['Model'] = try_set(
+ obj=model,
+ none_acceptable=False,
+ is_of_type=str)
+
+ input_variables = {
+ x for x in unlist(inputs.values())
+ if isinstance(x, str) and x.startswith("$")}
+ output_variables = {
+ x for x in unlist(outputs.values())
+ if isinstance(x, str) and x.startswith("$")}
+
+ entrypoint = EntryPoint(
+ name=entrypoint_name, inputs=inputs, outputs=outputs,
+ input_variables=input_variables,
+ output_variables=output_variables)
+ return entrypoint
diff --git a/src/python/nimbusml/internal/entrypoints/transforms_missingvaluehandler.py b/src/python/nimbusml/internal/entrypoints/transforms_missingvaluehandler.py
index 1f1a3870..121115b4 100644
--- a/src/python/nimbusml/internal/entrypoints/transforms_missingvaluehandler.py
+++ b/src/python/nimbusml/internal/entrypoints/transforms_missingvaluehandler.py
@@ -21,7 +21,7 @@ def transforms_missingvaluehandler(
**Description**
Handle missing values by replacing them with either the default value
or the mean/min/max value (for non-text columns only). An
- indicator column can optionally be concatenated, if theinput
+ indicator column can optionally be concatenated, if the input
column type is numeric.
:param column: New column definition(s) (optional form:
diff --git a/src/python/nimbusml/internal/entrypoints/transforms_robustscaler.py b/src/python/nimbusml/internal/entrypoints/transforms_robustscaler.py
new file mode 100644
index 00000000..615af180
--- /dev/null
+++ b/src/python/nimbusml/internal/entrypoints/transforms_robustscaler.py
@@ -0,0 +1,98 @@
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+Transforms.RobustScaler
+"""
+
+import numbers
+
+from ..utils.entrypoints import EntryPoint
+from ..utils.utils import try_set, unlist
+
+
+def transforms_robustscaler(
+ column,
+ data,
+ output_data=None,
+ model=None,
+ center=True,
+ scale=True,
+ quantile_min=25.0,
+ quantile_max=75.0,
+ **params):
+ """
+ **Description**
+ Removes the median and scales the data according to the quantile
+ range.
+
+ :param column: New column definition (optional form: name:src)
+ (inputs).
+ :param data: Input dataset (inputs).
+ :param center: If True, center the data before scaling. (inputs).
+ :param scale: If True, scale the data to interquartile range.
+ (inputs).
+ :param quantile_min: Min for the quantile range used to calculate
+ scale. (inputs).
+ :param quantile_max: Max for the quantile range used to calculate
+ scale. (inputs).
+ :param output_data: Transformed dataset (outputs).
+ :param model: Transform model (outputs).
+ """
+
+ entrypoint_name = 'Transforms.RobustScaler'
+ inputs = {}
+ outputs = {}
+
+ if column is not None:
+ inputs['Column'] = try_set(
+ obj=column,
+ none_acceptable=False,
+ is_of_type=list,
+ is_column=True)
+ if data is not None:
+ inputs['Data'] = try_set(
+ obj=data,
+ none_acceptable=False,
+ is_of_type=str)
+ if center is not None:
+ inputs['Center'] = try_set(
+ obj=center,
+ none_acceptable=True,
+ is_of_type=bool)
+ if scale is not None:
+ inputs['Scale'] = try_set(
+ obj=scale,
+ none_acceptable=True,
+ is_of_type=bool)
+ if quantile_min is not None:
+ inputs['QuantileMin'] = try_set(
+ obj=quantile_min,
+ none_acceptable=True,
+ is_of_type=numbers.Real)
+ if quantile_max is not None:
+ inputs['QuantileMax'] = try_set(
+ obj=quantile_max,
+ none_acceptable=True,
+ is_of_type=numbers.Real)
+ if output_data is not None:
+ outputs['OutputData'] = try_set(
+ obj=output_data,
+ none_acceptable=False,
+ is_of_type=str)
+ if model is not None:
+ outputs['Model'] = try_set(
+ obj=model,
+ none_acceptable=False,
+ is_of_type=str)
+
+ input_variables = {
+ x for x in unlist(inputs.values())
+ if isinstance(x, str) and x.startswith("$")}
+ output_variables = {
+ x for x in unlist(outputs.values())
+ if isinstance(x, str) and x.startswith("$")}
+
+ entrypoint = EntryPoint(
+ name=entrypoint_name, inputs=inputs, outputs=outputs,
+ input_variables=input_variables,
+ output_variables=output_variables)
+ return entrypoint
diff --git a/src/python/nimbusml/internal/entrypoints/transforms_timeseriesimputer.py b/src/python/nimbusml/internal/entrypoints/transforms_timeseriesimputer.py
new file mode 100644
index 00000000..e58117ad
--- /dev/null
+++ b/src/python/nimbusml/internal/entrypoints/transforms_timeseriesimputer.py
@@ -0,0 +1,114 @@
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+Transforms.TimeSeriesImputer
+"""
+
+
+from ..utils.entrypoints import EntryPoint
+from ..utils.utils import try_set, unlist
+
+
+def transforms_timeseriesimputer(
+ time_series_column,
+ data,
+ grain_columns,
+ output_data=None,
+ model=None,
+ filter_columns=None,
+ filter_mode='Exclude',
+ impute_mode='ForwardFill',
+ supress_type_errors=False,
+ **params):
+ """
+ **Description**
+    Fills in missing rows and values
+
+ :param time_series_column: Column representing the time (inputs).
+ :param data: Input dataset (inputs).
+ :param grain_columns: List of grain columns (inputs).
+ :param filter_columns: Columns to filter (inputs).
+ :param filter_mode: Filter mode. Either include or exclude
+ (inputs).
+ :param impute_mode: Mode for imputing, defaults to ForwardFill if
+ not provided (inputs).
+ :param supress_type_errors: Suppress the errors that would occur
+ if a column and impute mode are incompatible. If true, will
+ skip the column. If false, will stop and throw an error.
+ (inputs).
+ :param output_data: Transformed dataset (outputs).
+ :param model: Transform model (outputs).
+ """
+
+ entrypoint_name = 'Transforms.TimeSeriesImputer'
+ inputs = {}
+ outputs = {}
+
+ if time_series_column is not None:
+ inputs['TimeSeriesColumn'] = try_set(
+ obj=time_series_column,
+ none_acceptable=False,
+ is_of_type=str,
+ is_column=True)
+ if data is not None:
+ inputs['Data'] = try_set(
+ obj=data,
+ none_acceptable=False,
+ is_of_type=str)
+ if grain_columns is not None:
+ inputs['GrainColumns'] = try_set(
+ obj=grain_columns,
+ none_acceptable=False,
+ is_of_type=list,
+ is_column=True)
+ if filter_columns is not None:
+ inputs['FilterColumns'] = try_set(
+ obj=filter_columns,
+ none_acceptable=True,
+ is_of_type=list,
+ is_column=True)
+ if filter_mode is not None:
+ inputs['FilterMode'] = try_set(
+ obj=filter_mode,
+ none_acceptable=True,
+ is_of_type=str,
+ values=[
+ 'NoFilter',
+ 'Include',
+ 'Exclude'])
+ if impute_mode is not None:
+ inputs['ImputeMode'] = try_set(
+ obj=impute_mode,
+ none_acceptable=True,
+ is_of_type=str,
+ values=[
+ 'ForwardFill',
+ 'BackFill',
+ 'Median'])
+ if supress_type_errors is not None:
+ inputs['SupressTypeErrors'] = try_set(
+ obj=supress_type_errors,
+ none_acceptable=True,
+ is_of_type=bool)
+ if output_data is not None:
+ outputs['OutputData'] = try_set(
+ obj=output_data,
+ none_acceptable=False,
+ is_of_type=str)
+ if model is not None:
+ outputs['Model'] = try_set(
+ obj=model,
+ none_acceptable=False,
+ is_of_type=str)
+
+ input_variables = {
+ x for x in unlist(inputs.values())
+ if isinstance(x, str) and x.startswith("$")}
+ output_variables = {
+ x for x in unlist(outputs.values())
+ if isinstance(x, str) and x.startswith("$")}
+
+ entrypoint = EntryPoint(
+ name=entrypoint_name, inputs=inputs, outputs=outputs,
+ input_variables=input_variables,
+ output_variables=output_variables)
+ return entrypoint
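A minimal sketch of building a TimeSeriesImputer graph node with this helper; '$data', '$output_data' and '$model' are placeholder graph variables, and impute_mode must be one of the values validated above ('ForwardFill', 'BackFill' or 'Median'):

    from nimbusml.internal.entrypoints.transforms_timeseriesimputer import \
        transforms_timeseriesimputer

    # Rows are keyed by the time column and grouped by the grain columns;
    # missing rows and values are filled according to impute_mode.
    node = transforms_timeseriesimputer(
        time_series_column='ts',
        data='$data',
        grain_columns=['grain'],
        impute_mode='ForwardFill',
        output_data='$output_data',
        model='$model')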
diff --git a/src/python/nimbusml/internal/entrypoints/transforms_tostring.py b/src/python/nimbusml/internal/entrypoints/transforms_tostring.py
new file mode 100644
index 00000000..2f6d9782
--- /dev/null
+++ b/src/python/nimbusml/internal/entrypoints/transforms_tostring.py
@@ -0,0 +1,65 @@
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+Transforms.ToString
+"""
+
+
+from ..utils.entrypoints import EntryPoint
+from ..utils.utils import try_set, unlist
+
+
+def transforms_tostring(
+ column,
+ data,
+ output_data=None,
+ model=None,
+ **params):
+ """
+ **Description**
+ Turns the given column into a column of its string representation
+
+ :param column: New column definition (optional form: name:src)
+ (inputs).
+ :param data: Input dataset (inputs).
+ :param output_data: Transformed dataset (outputs).
+ :param model: Transform model (outputs).
+ """
+
+ entrypoint_name = 'Transforms.ToString'
+ inputs = {}
+ outputs = {}
+
+ if column is not None:
+ inputs['Column'] = try_set(
+ obj=column,
+ none_acceptable=False,
+ is_of_type=list,
+ is_column=True)
+ if data is not None:
+ inputs['Data'] = try_set(
+ obj=data,
+ none_acceptable=False,
+ is_of_type=str)
+ if output_data is not None:
+ outputs['OutputData'] = try_set(
+ obj=output_data,
+ none_acceptable=False,
+ is_of_type=str)
+ if model is not None:
+ outputs['Model'] = try_set(
+ obj=model,
+ none_acceptable=False,
+ is_of_type=str)
+
+ input_variables = {
+ x for x in unlist(inputs.values())
+ if isinstance(x, str) and x.startswith("$")}
+ output_variables = {
+ x for x in unlist(outputs.values())
+ if isinstance(x, str) and x.startswith("$")}
+
+ entrypoint = EntryPoint(
+ name=entrypoint_name, inputs=inputs, outputs=outputs,
+ input_variables=input_variables,
+ output_variables=output_variables)
+ return entrypoint
diff --git a/src/python/nimbusml/internal/utils/dataframes.py b/src/python/nimbusml/internal/utils/dataframes.py
index 17572ad1..8e3665bc 100644
--- a/src/python/nimbusml/internal/utils/dataframes.py
+++ b/src/python/nimbusml/internal/utils/dataframes.py
@@ -63,7 +63,7 @@ def resolve_dataframe(dataframe):
ret[name_i] = serie.values
if infered_dtype == 'floating' or \
infered_dtype == 'mixed-integer-float':
- s = serie.itemsize
+ s = serie.dtype.itemsize
if s == 8:
ret[str(i)] = serie.values.astype(
np.float64, copy=False)
@@ -77,7 +77,7 @@ def resolve_dataframe(dataframe):
[_global_dtype_to_char_dict[
np.dtype(np.float32)]])
elif infered_dtype == 'integer':
- s = serie.itemsize
+ s = serie.dtype.itemsize
if s == 8:
ret[str(i)] = serie.values.astype(
np.int64, copy=False)
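The fix replaces Series.itemsize, which newer pandas versions have deprecated, with the element size taken from the underlying dtype. A quick check of what the resolver now reads (illustrative only):

    import numpy as np
    import pandas as pd

    s64 = pd.Series([1.0, 2.0], dtype=np.float64)
    s32 = pd.Series([1.0, 2.0], dtype=np.float32)

    # dtype.itemsize is the element size in bytes, which resolve_dataframe
    # uses to choose between 8-byte and 4-byte buffers.
    print(s64.dtype.itemsize)  # 8
    print(s32.dtype.itemsize)  # 4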
diff --git a/src/python/nimbusml/pipeline.py b/src/python/nimbusml/pipeline.py
index 6eb190e3..ab9ae1ac 100644
--- a/src/python/nimbusml/pipeline.py
+++ b/src/python/nimbusml/pipeline.py
@@ -39,6 +39,7 @@
from .internal.entrypoints.models_regressionevaluator import \
models_regressionevaluator
from .internal.entrypoints.models_summarizer import models_summarizer
+from .internal.entrypoints.models_onnxconverter import models_onnxconverter
from .internal.entrypoints.models_schema import models_schema
from .internal.entrypoints.transforms_datasetscorerex import \
transforms_datasetscorerex
@@ -1015,6 +1016,8 @@ def fit(self, X, y=None, verbose=1, **params):
:language: python
"""
+ dry_run = params.pop('dry_run', False)
+
if self._is_fitted:
# We restore the initial steps as they were
# modified by the previous training.
@@ -1058,7 +1061,7 @@ def move_information_about_roles_once_used():
# REVIEW: we should have the possibility to keep the model in
# memory and not in a file.
try:
- (out_model, out_data, out_metrics, out_predictor_model) = graph.run(
+ graph_output = graph.run(
X=X,
y=y,
random_state=self.random_state,
@@ -1066,6 +1069,7 @@ def move_information_about_roles_once_used():
verbose=verbose,
max_slots=max_slots,
telemetry_info=telemetry_info,
+ dry_run=dry_run,
**params)
except RuntimeError as e:
self._run_time = time.time() - start_time
@@ -1081,17 +1085,21 @@ def move_information_about_roles_once_used():
delattr(self, "_cache_predictor")
raise e
- move_information_about_roles_once_used()
- self.graph_ = graph
- self.model = out_model
- if out_predictor_model:
- self.predictor_model = out_predictor_model
- self.data = out_data
- # stop the clock
- self._run_time = time.time() - start_time
- self._write_csv_time = graph._write_csv_time
- delattr(self, "_cache_predictor")
- return self
+ if dry_run:
+ return graph_output
+ else:
+ out_model, out_data, out_metrics, out_predictor_model = graph_output
+ move_information_about_roles_once_used()
+ self.graph_ = graph
+ self.model = out_model
+ if out_predictor_model:
+ self.predictor_model = out_predictor_model
+ self.data = out_data
+ # stop the clock
+ self._run_time = time.time() - start_time
+ self._write_csv_time = graph._write_csv_time
+ delattr(self, "_cache_predictor")
+ return self
@trace
def fit_transform(
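With the new dry_run flag, fit() pops 'dry_run' from **params, forwards it to graph.run(), and returns the graph output directly instead of finishing the fit. A hedged usage sketch, assuming the dry-run output is the serialized entrypoint graph produced by graph.run() elsewhere in this change:

    import pandas as pd
    from nimbusml import Pipeline
    from nimbusml.linear_model import OnlineGradientDescentRegressor

    df = pd.DataFrame(dict(x=[1.0, 2.0, 3.0, 4.0], y=[1.1, 2.2, 2.9, 4.1]))

    # dry_run=True short-circuits fit(): the pipeline is not marked as
    # fitted and graph.run()'s output is returned instead of `self`.
    pipe = Pipeline([OnlineGradientDescentRegressor(feature=['x'], label='y')])
    graph_output = pipe.fit(df, dry_run=True)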
@@ -1623,8 +1631,9 @@ def get_feature_contributions(self, X, top=10, bottom=10, verbose=0,
outputs = dict(output_data="")
- data_output_format = DataOutputFormat.IDV if as_binary_data_stream \
- else DataOutputFormat.DF,
+ data_output_format = DataOutputFormat.DF
+ if as_binary_data_stream:
+ data_output_format = DataOutputFormat.IDV
graph = Graph(
inputs,
@@ -1813,8 +1822,9 @@ def permutation_feature_importance(self, X, number_of_examples=None,
outputs = dict(output_data="")
- data_output_format = DataOutputFormat.IDV if as_binary_data_stream \
- else DataOutputFormat.DF,
+ data_output_format = DataOutputFormat.DF
+ if as_binary_data_stream:
+ data_output_format = DataOutputFormat.IDV
graph = Graph(
inputs,
@@ -1975,8 +1985,9 @@ def _predict(self, X, y=None,
else:
outputs = dict(output_data="")
- data_output_format = DataOutputFormat.IDV if as_binary_data_stream \
- else DataOutputFormat.DF,
+ data_output_format = DataOutputFormat.DF
+ if as_binary_data_stream:
+ data_output_format = DataOutputFormat.IDV
graph = Graph(
inputs,
@@ -2001,8 +2012,9 @@ def _predict(self, X, y=None,
self._run_time = time.time() - start_time
raise e
- if is_transformer_chain:
- out_data['PredictedLabel'] = out_data['PredictedLabel']*1
+ if data_output_format == DataOutputFormat.DF and \
+ is_transformer_chain and 'PredictedLabel' in out_data.columns:
+ out_data['PredictedLabel'] = out_data['PredictedLabel']*1
if y is not None:
@@ -2519,6 +2531,96 @@ def __setstate__(self, state):
else:
raise ValueError('Pipeline version not supported.')
+ @trace
+ def export_to_onnx(self,
+ dst,
+ domain,
+ dst_json=None,
+ name=None,
+ data_file=None,
+ inputs_to_drop=None,
+ outputs_to_drop=None,
+ onnx_version="Stable",
+ verbose=0):
+ """
+ Export the model to the ONNX format.
+
+ :param str dst: The path to write the output ONNX to.
+ :param str domain: A reverse-DNS name to indicate the model
+ namespace or domain, for example, 'org.onnx'.
+ :param str dst_json: The path to write the output ONNX to
+ in JSON format.
+        :param name: The 'graph.name' property in the output ONNX. By default
+            this will be the ONNX extension-less name.
+        :param data_file: The data file.
+        :param inputs_to_drop: Array of input column names to drop.
+        :param outputs_to_drop: Array of output column names to drop.
+        :param onnx_version: The targeted ONNX version. It can be either
+            "Stable" or "Experimental". If "Experimental" is used, the
+            produced model can contain components that are not officially
+            supported in the ONNX standard.
+ """
+ if not domain:
+ raise ValueError("domain argument must be specified and not empty.")
+
+ if not self._is_fitted:
+ raise ValueError("Model is not fitted. Train or load a model before "
+ "export_to_onnx().")
+
+ # start the clock!
+ start_time = time.time()
+
+ onnx_converter_args = {
+ 'onnx': dst,
+ 'json': dst_json,
+ 'domain': domain,
+ 'name': name,
+ 'data_file': data_file,
+ 'inputs_to_drop': inputs_to_drop,
+ 'outputs_to_drop': outputs_to_drop,
+ 'onnx_version': onnx_version
+ }
+
+ if (len(self.steps) > 0) and (self.last_node.type != "transform"):
+ onnx_converter_args['predictive_model'] = "$model"
+ else:
+ onnx_converter_args['model'] = "$model"
+
+ onnx_converter_node = models_onnxconverter(**onnx_converter_args)
+
+ inputs = dict([('model', self.model)])
+ outputs = dict()
+
+ graph = Graph(
+ inputs,
+ outputs,
+ False,
+ onnx_converter_node)
+
+ class_name = type(self).__name__
+ method_name = inspect.currentframe().f_code.co_name
+ telemetry_info = ".".join([class_name, method_name])
+
+ try:
+ graph.run(
+ X=None,
+ y=None,
+ random_state=self.random_state,
+ model=self.model,
+ verbose=verbose,
+ is_summary=False,
+ no_input_data=True,
+ telemetry_info=telemetry_info)
+ except RuntimeError as e:
+ self._run_time = time.time() - start_time
+ raise e
+
+ # stop the clock
+ self._run_time = time.time() - start_time
+ self._write_csv_time = graph._write_csv_time
+
@trace
def score(
self,
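A hedged end-to-end sketch of the new export_to_onnx() method on a small fitted pipeline; 'model.onnx' and the 'com.example.nimbusml' domain are illustrative values, and the export requires the ONNX-capable build produced by this change:

    import pandas as pd
    from nimbusml import Pipeline
    from nimbusml.linear_model import OnlineGradientDescentRegressor

    df = pd.DataFrame(dict(x=[1.0, 2.0, 3.0, 4.0], y=[1.3, 1.9, 3.2, 3.8]))

    pipe = Pipeline([OnlineGradientDescentRegressor(feature=['x'], label='y')])
    pipe.fit(df)

    # domain is required and must be non-empty; dst is the .onnx output path.
    pipe.export_to_onnx('model.onnx', 'com.example.nimbusml',
                        onnx_version='Stable')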
diff --git a/src/python/nimbusml/preprocessing/__init__.py b/src/python/nimbusml/preprocessing/__init__.py
index 26b41b8e..202eb15d 100644
--- a/src/python/nimbusml/preprocessing/__init__.py
+++ b/src/python/nimbusml/preprocessing/__init__.py
@@ -2,10 +2,18 @@
from .tokey import ToKey
from .tensorflowscorer import TensorFlowScorer
from .datasettransformer import DatasetTransformer
+from .onnxrunner import OnnxRunner
+from .datetimesplitter import DateTimeSplitter
+from .tokeyimputer import ToKeyImputer
+from .tostring import ToString
__all__ = [
+ 'DateTimeSplitter',
'FromKey',
'ToKey',
+ 'ToKeyImputer',
+ 'ToString',
'TensorFlowScorer',
- 'DatasetTransformer'
+ 'DatasetTransformer',
+ 'OnnxRunner'
]
diff --git a/src/python/nimbusml/preprocessing/datetimesplitter.py b/src/python/nimbusml/preprocessing/datetimesplitter.py
new file mode 100644
index 00000000..c3fceb43
--- /dev/null
+++ b/src/python/nimbusml/preprocessing/datetimesplitter.py
@@ -0,0 +1,59 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+DateTimeSplitter
+"""
+
+__all__ = ["DateTimeSplitter"]
+
+
+from sklearn.base import TransformerMixin
+
+from ..base_transform import BaseTransform
+from ..internal.core.preprocessing.datetimesplitter import \
+ DateTimeSplitter as core
+from ..internal.utils.utils import trace
+
+
+class DateTimeSplitter(core, BaseTransform, TransformerMixin):
+ """
+ **Description**
+ Splits a date time value into each individual component
+
+ :param columns: see `Columns `_.
+
+ :param prefix: Output column prefix.
+
+ :param country: Country to get holidays for. Defaults to none if not
+ passed.
+
+ :param params: Additional arguments sent to compute engine.
+
+ """
+
+ @trace
+ def __init__(
+ self,
+ prefix,
+ country='None',
+ columns=None,
+ **params):
+
+ if columns:
+ params['columns'] = columns
+ BaseTransform.__init__(self, **params)
+ core.__init__(
+ self,
+ prefix=prefix,
+ country=country,
+ **params)
+ self._columns = columns
+
+ def get_params(self, deep=False):
+ """
+ Get the parameters for this operator.
+ """
+ return core.get_params(self)
diff --git a/src/python/nimbusml/preprocessing/normalization/__init__.py b/src/python/nimbusml/preprocessing/normalization/__init__.py
index f7d7647a..3928ac40 100644
--- a/src/python/nimbusml/preprocessing/normalization/__init__.py
+++ b/src/python/nimbusml/preprocessing/normalization/__init__.py
@@ -4,6 +4,7 @@
from .lpscaler import LpScaler
from .meanvariancescaler import MeanVarianceScaler
from .minmaxscaler import MinMaxScaler
+from .robustscaler import RobustScaler
__all__ = [
'Binner',
@@ -11,5 +12,6 @@
'LogMeanVarianceScaler',
'LpScaler',
'MeanVarianceScaler',
- 'MinMaxScaler'
+ 'MinMaxScaler',
+ 'RobustScaler'
]
diff --git a/src/python/nimbusml/preprocessing/normalization/robustscaler.py b/src/python/nimbusml/preprocessing/normalization/robustscaler.py
new file mode 100644
index 00000000..776d5609
--- /dev/null
+++ b/src/python/nimbusml/preprocessing/normalization/robustscaler.py
@@ -0,0 +1,66 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+RobustScaler
+"""
+
+__all__ = ["RobustScaler"]
+
+
+from sklearn.base import TransformerMixin
+
+from ...base_transform import BaseTransform
+from ...internal.core.preprocessing.normalization.robustscaler import \
+ RobustScaler as core
+from ...internal.utils.utils import trace
+
+
+class RobustScaler(core, BaseTransform, TransformerMixin):
+ """
+ **Description**
+ Removes the median and scales the data according to the quantile range.
+
+ :param columns: see `Columns `_.
+
+ :param center: If True, center the data before scaling.
+
+ :param scale: If True, scale the data to interquartile range.
+
+ :param quantile_min: Min for the quantile range used to calculate scale.
+
+ :param quantile_max: Max for the quantile range used to calculate scale.
+
+ :param params: Additional arguments sent to compute engine.
+
+ """
+
+ @trace
+ def __init__(
+ self,
+ center=True,
+ scale=True,
+ quantile_min=25.0,
+ quantile_max=75.0,
+ columns=None,
+ **params):
+
+ if columns:
+ params['columns'] = columns
+ BaseTransform.__init__(self, **params)
+ core.__init__(
+ self,
+ center=center,
+ scale=scale,
+ quantile_min=quantile_min,
+ quantile_max=quantile_max,
+ **params)
+ self._columns = columns
+
+ def get_params(self, deep=False):
+ """
+ Get the parameters for this operator.
+ """
+ return core.get_params(self)
diff --git a/src/python/nimbusml/preprocessing/onnxrunner.py b/src/python/nimbusml/preprocessing/onnxrunner.py
new file mode 100644
index 00000000..2df2ac75
--- /dev/null
+++ b/src/python/nimbusml/preprocessing/onnxrunner.py
@@ -0,0 +1,82 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+OnnxRunner
+"""
+
+__all__ = ["OnnxRunner"]
+
+
+from sklearn.base import TransformerMixin
+
+from ..base_transform import BaseTransform
+from ..internal.core.preprocessing.onnxrunner import OnnxRunner as core
+from ..internal.utils.utils import trace
+
+
+class OnnxRunner(core, BaseTransform, TransformerMixin):
+ """
+ **Description**
+ Applies an ONNX model to a dataset.
+
+ :param columns: see `Columns `_.
+
+ :param model_file: Path to the onnx model file.
+
+ :param input_columns: Name of the input column.
+
+ :param output_columns: Name of the output column.
+
+ :param gpu_device_id: GPU device id to run on (e.g. 0,1,..). Null for CPU.
+ Requires CUDA 9.1.
+
+ :param fallback_to_cpu: If true, resumes execution on CPU upon GPU error.
+        If false, will raise the GPU exception.
+
+ :param params: Additional arguments sent to compute engine.
+
+ """
+
+ @trace
+ def __init__(
+ self,
+ model_file,
+ input_columns=None,
+ output_columns=None,
+ gpu_device_id=None,
+ fallback_to_cpu=False,
+ columns=None,
+ **params):
+
+        if columns:
+            params['columns'] = columns
+            # Derive the runner's input/output column names from the
+            # columns mapping: dict values are the ONNX inputs (flattened
+            # if given as lists) and dict keys are the outputs.
+            column_values = list(columns.values())
+            if isinstance(column_values[0], list):
+                input_columns = sum(column_values, [])
+            else:
+                input_columns = column_values
+            output_columns = list(columns.keys())
+ BaseTransform.__init__(self, **params)
+ core.__init__(
+ self,
+ model_file=model_file,
+ input_columns=input_columns,
+ output_columns=output_columns,
+ gpu_device_id=gpu_device_id,
+ fallback_to_cpu=fallback_to_cpu,
+ **params)
+ self._columns = columns
+
+ def get_params(self, deep=False):
+ """
+ Get the parameters for this operator.
+ """
+ return core.get_params(self)
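A hedged sketch of applying a previously exported model with the new OnnxRunner transform; 'model.onnx' is assumed to exist (for example, written by Pipeline.export_to_onnx), and without a columns mapping the ONNX model's own input and output names are used:

    import pandas as pd
    from nimbusml import Pipeline
    from nimbusml.preprocessing import OnnxRunner

    df = pd.DataFrame(dict(x=[0.5, 1.5, 2.5]))

    # Scores the dataframe through the ONNX model (the build now installs
    # onnxruntime for this).
    onnx_xf = OnnxRunner(model_file='model.onnx')
    result = Pipeline([onnx_xf]).fit_transform(df)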
diff --git a/src/python/nimbusml/preprocessing/tokeyimputer.py b/src/python/nimbusml/preprocessing/tokeyimputer.py
new file mode 100644
index 00000000..000d6a2f
--- /dev/null
+++ b/src/python/nimbusml/preprocessing/tokeyimputer.py
@@ -0,0 +1,49 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+ToKeyImputer
+"""
+
+__all__ = ["ToKeyImputer"]
+
+
+from sklearn.base import TransformerMixin
+
+from ..base_transform import BaseTransform
+from ..internal.core.preprocessing.tokeyimputer import ToKeyImputer as core
+from ..internal.utils.utils import trace
+
+
+class ToKeyImputer(core, BaseTransform, TransformerMixin):
+ """
+ **Description**
+ Fills in missing values in a column based on the most frequent value
+
+ :param columns: see `Columns `_.
+
+ :param params: Additional arguments sent to compute engine.
+
+ """
+
+ @trace
+ def __init__(
+ self,
+ columns=None,
+ **params):
+
+ if columns:
+ params['columns'] = columns
+ BaseTransform.__init__(self, **params)
+ core.__init__(
+ self,
+ **params)
+ self._columns = columns
+
+ def get_params(self, deep=False):
+ """
+ Get the parameters for this operator.
+ """
+ return core.get_params(self)
diff --git a/src/python/nimbusml/preprocessing/tostring.py b/src/python/nimbusml/preprocessing/tostring.py
new file mode 100644
index 00000000..2dd2826c
--- /dev/null
+++ b/src/python/nimbusml/preprocessing/tostring.py
@@ -0,0 +1,49 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+ToString
+"""
+
+__all__ = ["ToString"]
+
+
+from sklearn.base import TransformerMixin
+
+from ..base_transform import BaseTransform
+from ..internal.core.preprocessing.tostring import ToString as core
+from ..internal.utils.utils import trace
+
+
+class ToString(core, BaseTransform, TransformerMixin):
+ """
+ **Description**
+ Turns the given column into a column of its string representation
+
+ :param columns: see `Columns `_.
+
+ :param params: Additional arguments sent to compute engine.
+
+ """
+
+ @trace
+ def __init__(
+ self,
+ columns=None,
+ **params):
+
+ if columns:
+ params['columns'] = columns
+ BaseTransform.__init__(self, **params)
+ core.__init__(
+ self,
+ **params)
+ self._columns = columns
+
+ def get_params(self, deep=False):
+ """
+ Get the parameters for this operator.
+ """
+ return core.get_params(self)
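A hedged sketch of the new ToString transform, assuming the dict form of columns (output name: source column) is accepted here the same way it is for the other new transforms in this change:

    import pandas as pd
    from nimbusml.preprocessing import ToString

    df = pd.DataFrame(dict(c0=[1, 2, 3], c1=[0.5, 1.5, 2.5]))

    # Each output column holds the string representation of the source
    # column's values; the exact formatting is decided by the native engine.
    xf = ToString(columns={'c0.out': 'c0', 'c1.out': 'c1'})
    result = xf.fit_transform(df)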
diff --git a/src/python/nimbusml/tests/pipeline/test_pipeline_split_models.py b/src/python/nimbusml/tests/pipeline/test_pipeline_split_models.py
index bc1399bf..4a6d9109 100644
--- a/src/python/nimbusml/tests/pipeline/test_pipeline_split_models.py
+++ b/src/python/nimbusml/tests/pipeline/test_pipeline_split_models.py
@@ -47,7 +47,7 @@ def test_notvectorized_output_predictor_model(self):
# Create and fit a combined model and spit out predictor model
combined_pipeline = Pipeline([RangeFilter(min=0.0, max=4.5) << 'c2',
- OnlineGradientDescentRegressor(label='c2')],
+ OnlineGradientDescentRegressor(feature=['c1'], label='c2')],
random_state=seed)
combined_pipeline.fit(df, output_predictor_model=True)
result_1 = combined_pipeline.predict(df)
diff --git a/src/python/nimbusml/tests/pipeline/test_uci_adult.py b/src/python/nimbusml/tests/pipeline/test_uci_adult.py
index 00ae1728..2f6055de 100644
--- a/src/python/nimbusml/tests/pipeline/test_uci_adult.py
+++ b/src/python/nimbusml/tests/pipeline/test_uci_adult.py
@@ -36,7 +36,7 @@
def check_accuracy(test_file, label_column, predictions, threshold, sep=','):
(test, label) = get_X_y(test_file, label_column, sep=sep)
accuracy = np.mean(label[label_column].values ==
- predictions.ix[:, 'PredictedLabel'].values)
+ predictions['PredictedLabel'].values)
assert_greater(
accuracy,
threshold,
diff --git a/src/python/nimbusml/tests/preprocessing/missing_values/test_data_with_missing.py b/src/python/nimbusml/tests/preprocessing/missing_values/test_data_with_missing.py
index 0dc85f6e..d21d4311 100644
--- a/src/python/nimbusml/tests/preprocessing/missing_values/test_data_with_missing.py
+++ b/src/python/nimbusml/tests/preprocessing/missing_values/test_data_with_missing.py
@@ -3,12 +3,14 @@
# Licensed under the MIT License.
# --------------------------------------------------------------------------------------------
+import platform
import unittest
import numpy as np
from math import isnan
from nimbusml import Pipeline
from nimbusml.linear_model import FastLinearRegressor
+from nimbusml.preprocessing import ToKeyImputer
from nimbusml.preprocessing.missing_values import Filter, Handler, Indicator
from pandas import DataFrame
from sklearn.utils.testing import assert_equal, assert_true, \
@@ -160,6 +162,20 @@ def test_input_conversion_to_float_retains_other_column_types(self):
assert_equal(result.dtypes['f1'], np.object)
assert_equal(result.dtypes['f2.f2'], np.float32)
+ @unittest.skipIf('centos' in platform.linux_distribution()[0].lower(), "centos is not supported")
+ def test_category_imputation(self):
+ data={'f0': [4, 4, np.nan, 9],
+ 'f1': [4, 4, np.nan, np.nan]}
+ data = DataFrame(data)
+
+ # Check ToKeyImputer
+ xf = ToKeyImputer(columns={'f0.out': 'f0', 'f1.out': 'f1'})
+ result = xf.fit_transform(data)
+
+ assert_equal(result['f0.out'][1], 4)
+ assert_equal(result['f0.out'][2], 4)
+ assert_equal(result['f1.out'][1], 4)
+ assert_equal(result['f1.out'][2], 4)
if __name__ == '__main__':
unittest.main()
diff --git a/src/python/nimbusml/tests/preprocessing/normalization/test_robustscaler.py b/src/python/nimbusml/tests/preprocessing/normalization/test_robustscaler.py
new file mode 100644
index 00000000..e4197034
--- /dev/null
+++ b/src/python/nimbusml/tests/preprocessing/normalization/test_robustscaler.py
@@ -0,0 +1,29 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+import platform
+import unittest
+
+import pandas
+from nimbusml import Pipeline
+from nimbusml.preprocessing.normalization import RobustScaler
+
+
+@unittest.skipIf('centos' in platform.linux_distribution()[0].lower(), "centos is not supported")
+class TestRobustScaler(unittest.TestCase):
+
+ def test_with_integer_inputs(self):
+ df = pandas.DataFrame(data=dict(c0=[1, 3, 5, 7, 9]))
+
+ xf = RobustScaler(columns='c0', center=True, scale=True)
+ pipeline = Pipeline([xf])
+ result = pipeline.fit_transform(df)
+
+ expected_result = pandas.Series([-1.0, -0.5, 0.0, 0.5, 1.0])
+
+ self.assertTrue(result.loc[:, 'c0'].equals(expected_result))
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/src/python/nimbusml/tests/preprocessing/test_datetimesplitter.py b/src/python/nimbusml/tests/preprocessing/test_datetimesplitter.py
new file mode 100644
index 00000000..f3bb2643
--- /dev/null
+++ b/src/python/nimbusml/tests/preprocessing/test_datetimesplitter.py
@@ -0,0 +1,45 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+
+import platform
+import unittest
+
+import pandas
+from nimbusml import Pipeline
+from nimbusml.preprocessing import DateTimeSplitter
+from nimbusml.preprocessing.schema import ColumnSelector
+from sklearn.utils.testing import assert_equal
+
+
+@unittest.skipIf('centos' in platform.linux_distribution()[0].lower(), "centos is not supported")
+class TestDateTimeSplitter(unittest.TestCase):
+
+ def test_check_estimator_DateTimeSplitter(self):
+ df = pandas.DataFrame(data=dict(dt=[i for i in range(8)]))
+ dt = DateTimeSplitter(prefix='dt_') << 'dt'
+ result = dt.fit_transform(df)
+ assert_equal(result['dt_Year'][0], 1970, "year should have been 1970")
+
+ def test_holidays(self):
+ df = pandas.DataFrame(data=dict(
+ tokens1=[1, 2, 3, 157161600],
+ tokens2=[10, 11, 12, 13]
+ ))
+
+ cols_to_drop = [
+ 'dtHour12', 'dtDayOfWeek', 'dtDayOfQuarter',
+ 'dtDayOfYear', 'dtWeekOfMonth', 'dtQuarterOfYear',
+ 'dtHalfOfYear', 'dtWeekIso', 'dtYearIso', 'dtMonthLabel',
+ 'dtAmPmLabel', 'dtDayOfWeekLabel', 'dtIsPaidTimeOff'
+ ]
+
+ dts = DateTimeSplitter(prefix='dt', country='Canada') << 'tokens1'
+ pipeline = Pipeline([dts, ColumnSelector(drop_columns=cols_to_drop)])
+ y = pipeline.fit_transform(df)
+
+ self.assertEqual(y.loc[3, 'dtHolidayName'], 'Christmas Day')
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/src/python/nimbusml/tests/preprocessing/test_tokeyimputer.py b/src/python/nimbusml/tests/preprocessing/test_tokeyimputer.py
new file mode 100644
index 00000000..6eb87bdb
--- /dev/null
+++ b/src/python/nimbusml/tests/preprocessing/test_tokeyimputer.py
@@ -0,0 +1,38 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+
+import platform
+import unittest
+
+import numpy as np
+import pandas as pd
+from nimbusml.preprocessing import ToKeyImputer
+
+
+@unittest.skipIf('centos' in platform.linux_distribution()[0].lower(), "centos is not supported")
+class TestToKeyImputer(unittest.TestCase):
+
+ def test_tokeyimputer(self):
+ text_df = pd.DataFrame(
+ data=dict(
+ text=[
+ "cat",
+ "dog",
+ "fish",
+ "orange",
+ "cat orange",
+ "dog",
+ "fish",
+ None,
+ "spider"]))
+
+ tokey = ToKeyImputer() << 'text'
+ y = tokey.fit_transform(text_df)
+
+ self.assertEqual(y.loc[7, 'text'], 'dog')
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/src/python/nimbusml/tests/preprocessing/test_tostring.py b/src/python/nimbusml/tests/preprocessing/test_tostring.py
new file mode 100644
index 00000000..89811502
--- /dev/null
+++ b/src/python/nimbusml/tests/preprocessing/test_tostring.py
@@ -0,0 +1,39 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+
+import platform
+import unittest
+
+import numpy as np
+from pandas import DataFrame
+from nimbusml.preprocessing import ToString
+from sklearn.utils.testing import assert_equal
+
+
+@unittest.skipIf('centos' in platform.linux_distribution()[0].lower(), "centos is not supported")
+class TestToString(unittest.TestCase):
+
+ def test_tostring(self):
+ data={'f0': [4, 4, -1, 9],
+ 'f1': [5, 5, 3.1, -0.23],
+ 'f2': [6, 6.7, np.nan, np.nan]}
+ data = DataFrame(data).astype({'f0': np.int32,
+ 'f1': np.float32,
+ 'f2': np.float64})
+
+ xf = ToString(columns={'f0.out': 'f0',
+ 'f1.out': 'f1',
+ 'f2.out': 'f2'})
+ result = xf.fit_transform(data)
+
+ assert_equal(result['f0.out'][1], '4')
+ assert_equal(result['f0.out'][2], '-1')
+ assert_equal(result['f1.out'][1], '5.000000')
+ assert_equal(result['f1.out'][2], '3.100000')
+ assert_equal(result['f2.out'][1], '6.700000')
+ assert_equal(result['f2.out'][2], 'NaN')
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/src/python/nimbusml/tests/test_fit_graph.py b/src/python/nimbusml/tests/test_fit_graph.py
new file mode 100644
index 00000000..a0576767
--- /dev/null
+++ b/src/python/nimbusml/tests/test_fit_graph.py
@@ -0,0 +1,234 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+
+import json
+import unittest
+import six
+
+import numpy as np
+import pandas as pd
+from nimbusml import Pipeline, Role
+from nimbusml.cluster import KMeansPlusPlus
+from nimbusml.ensemble import FastTreesRegressor, FastForestRegressor
+from nimbusml.linear_model import FastLinearClassifier
+
+
+class TestVariableColumn(unittest.TestCase):
+
+ def verify_regressor_nodes(self, graph, label_name, features, trainer_name):
+ nodes = graph['nodes']
+
+ self.assertEqual(nodes[0]["Name"], "Transforms.OptionalColumnCreator")
+ self.assertEqual(nodes[0]["Inputs"]["Column"], [label_name])
+
+ self.assertEqual(nodes[1]["Name"], "Transforms.LabelToFloatConverter")
+ self.assertEqual(nodes[1]["Inputs"]["LabelColumn"], label_name)
+
+ self.assertEqual(nodes[2]["Name"], "Transforms.FeatureCombiner")
+ self.assertEqual(nodes[2]["Inputs"]["Features"], features)
+
+ self.assertEqual(nodes[3]["Name"], trainer_name)
+ self.assertEqual(nodes[3]["Inputs"]["FeatureColumnName"], "Features")
+ self.assertEqual(nodes[3]["Inputs"]["LabelColumnName"], label_name)
+
+ def verify_classifier_nodes(self, graph, label_name, features, trainer_name):
+ nodes = graph['nodes']
+
+ self.assertEqual(nodes[0]["Name"], "Transforms.OptionalColumnCreator")
+ self.assertEqual(nodes[0]["Inputs"]["Column"], [label_name])
+
+ self.assertEqual(nodes[1]["Name"], "Transforms.LabelColumnKeyBooleanConverter")
+ self.assertEqual(nodes[1]["Inputs"]["LabelColumn"], label_name)
+
+ self.assertEqual(nodes[2]["Name"], "Transforms.FeatureCombiner")
+ self.assertEqual(nodes[2]["Inputs"]["Features"], features)
+
+ self.assertEqual(nodes[3]["Name"], trainer_name)
+ self.assertEqual(nodes[3]["Inputs"]["FeatureColumnName"], "Features")
+ self.assertEqual(nodes[3]["Inputs"]["LabelColumnName"], label_name)
+
+ def test_label_column_defaults_to_label_when_no_label_column_in_input_data(self):
+ train_data = {'c1': [2, 3, 4, 5], 'c2': [3, 4, 5, 6],
+ 'c3': [4, 5, 6, 7], 'c4': [0, 1, 2, 1]}
+ train_df = pd.DataFrame(train_data)
+
+ predictor = FastForestRegressor()
+ pipeline = Pipeline([predictor])
+ result = json.loads(pipeline.fit(train_df, dry_run=True))
+
+ self.verify_regressor_nodes(result, "Label",
+ ["c1", "c2", "c3", "c4"],
+ "Trainers.FastForestRegressor")
+
+ def test_label_column_defaults_to_label_when_label_column_in_input_data(self):
+ train_data = {'c1': [2, 3, 4, 5], 'c2': [3, 4, 5, 6],
+ 'c3': [4, 5, 6, 7], 'Label': [0, 1, 2, 1]}
+ train_df = pd.DataFrame(train_data)
+
+ predictor = FastTreesRegressor()
+ pipeline = Pipeline([predictor])
+ result = json.loads(pipeline.fit(train_df, dry_run=True))
+
+ self.verify_regressor_nodes(result, "Label",
+ ["c1", "c2", "c3"],
+ "Trainers.FastTreeRegressor")
+
+ def test_label_column_specified_as_argument_without_features(self):
+ train_data = {'c1': [2, 3, 4, 5], 'c2': [3, 4, 5, 6],
+ 'd1': [4, 5, 6, 7], 'c4': [0, 1, 2, 1]}
+ train_df = pd.DataFrame(train_data)
+
+ predictor = FastForestRegressor(label='d1')
+ pipeline = Pipeline([predictor])
+ result = json.loads(pipeline.fit(train_df, dry_run=True))
+
+ self.verify_regressor_nodes(result, "d1",
+ ["c1", "c2", "c4"],
+ "Trainers.FastForestRegressor")
+
+ def test_label_column_specified_as_argument_with_features(self):
+ train_data = {'c1': [2, 3, 4, 5], 'd1': [3, 4, 5, 6],
+ 'c3': [4, 5, 6, 7], 'c4': [0, 1, 2, 1]}
+ train_df = pd.DataFrame(train_data)
+
+ predictor = FastForestRegressor(label='d1', feature=['c1', 'c3', 'c4'])
+ pipeline = Pipeline([predictor])
+ result = json.loads(pipeline.fit(train_df, dry_run=True))
+
+ self.verify_regressor_nodes(result, "d1",
+ ["c1", "c3", "c4"],
+ "Trainers.FastForestRegressor")
+
+ def test_label_column_specified_as_role_without_features(self):
+ train_data = {'c1': [2, 3, 4, 5], 'd1': [3, 4, 5, 6],
+ 'c3': [4, 5, 6, 7], 'c4': [0, 1, 2, 1]}
+ train_df = pd.DataFrame(train_data)
+
+ predictor = FastForestRegressor() << {Role.Label: 'd1'}
+ pipeline = Pipeline([predictor])
+ result = json.loads(pipeline.fit(train_df, dry_run=True))
+
+ self.verify_regressor_nodes(result, "d1",
+ ["c1", "c3", "c4"],
+ "Trainers.FastForestRegressor")
+
+ def test_label_column_specified_as_role_with_features(self):
+ train_data = {'c1': [2, 3, 4, 5], 'd1': [3, 4, 5, 6],
+ 'c3': [4, 5, 6, 7], 'c4': [0, 1, 2, 1]}
+ train_df = pd.DataFrame(train_data)
+
+ predictor = FastForestRegressor() << {
+ Role.Label: 'd1',
+ Role.Feature: ['c1', 'c4']
+ }
+ pipeline = Pipeline([predictor])
+ result = json.loads(pipeline.fit(train_df, dry_run=True))
+
+ self.verify_regressor_nodes(result, "d1",
+ ["c1", "c4"],
+ "Trainers.FastForestRegressor")
+
+ def test_default_label_for_classifier_without_label_column(self):
+ train_data = {'c1': [2, 3, 4, 5], 'c2': [3, 4, 5, 6],
+ 'c3': [4, 5, 6, 7], 'c4': [0, 1, 2, 1]}
+ train_df = pd.DataFrame(train_data)
+
+ predictor = FastLinearClassifier()
+ pipeline = Pipeline([predictor])
+ result = json.loads(pipeline.fit(train_df, dry_run=True))
+
+ self.verify_classifier_nodes(result, "Label",
+ ['c1', 'c2', 'c3', 'c4'],
+ "Trainers.StochasticDualCoordinateAscentClassifier")
+
+ def test_default_label_for_classifier_with_label_column(self):
+ train_data = {'c1': [2, 3, 4, 5], 'c2': [3, 4, 5, 6],
+ 'c3': [4, 5, 6, 7], 'Label': [0, 1, 2, 1]}
+ train_df = pd.DataFrame(train_data)
+
+ predictor = FastLinearClassifier()
+ pipeline = Pipeline([predictor])
+ result = json.loads(pipeline.fit(train_df, dry_run=True))
+
+ self.verify_classifier_nodes(result, "Label",
+ ['c1', 'c2', 'c3'],
+ "Trainers.StochasticDualCoordinateAscentClassifier")
+
+ def test_label_column_for_classifier_specified_as_argument(self):
+ train_data = {'c1': [2, 3, 4, 5], 'c2': [3, 4, 5, 6],
+ 'c3': [4, 5, 6, 7], 'd1': [0, 1, 2, 1]}
+ train_df = pd.DataFrame(train_data)
+
+ predictor = FastLinearClassifier(label='d1')
+ pipeline = Pipeline([predictor])
+ result = json.loads(pipeline.fit(train_df, dry_run=True))
+
+ self.verify_classifier_nodes(result, "d1",
+ ['c1', 'c2', 'c3'],
+ "Trainers.StochasticDualCoordinateAscentClassifier")
+
+ def test_label_column_for_classifier_specified_as_argument_with_features(self):
+ train_data = {'c1': [2, 3, 4, 5], 'c2': [3, 4, 5, 6],
+ 'c3': [4, 5, 6, 7], 'd1': [0, 1, 2, 1]}
+ train_df = pd.DataFrame(train_data)
+
+ predictor = FastLinearClassifier(label='d1', feature=['c1', 'c2'])
+ pipeline = Pipeline([predictor])
+ result = json.loads(pipeline.fit(train_df, dry_run=True))
+
+ self.verify_classifier_nodes(result, "d1",
+ ['c1', 'c2'],
+ "Trainers.StochasticDualCoordinateAscentClassifier")
+
+ def test_label_column_for_classifier_specified_as_role_without_features(self):
+ train_data = {'d1': [2, 3, 4, 5], 'c2': [3, 4, 5, 6],
+ 'c3': [4, 5, 6, 7], 'c4': [0, 1, 2, 1]}
+ train_df = pd.DataFrame(train_data)
+
+ predictor = FastLinearClassifier() << {Role.Label: 'd1'}
+ pipeline = Pipeline([predictor])
+ result = json.loads(pipeline.fit(train_df, dry_run=True))
+
+ self.verify_classifier_nodes(result, "d1",
+ ['c2', 'c3', 'c4'],
+ "Trainers.StochasticDualCoordinateAscentClassifier")
+
+ def test_label_column_for_classifier_specified_as_role_with_features(self):
+ train_data = {'c1': [2, 3, 4, 5], 'c2': [3, 4, 5, 6],
+ 'c3': [4, 5, 6, 7], 'd1': [0, 1, 2, 1]}
+ train_df = pd.DataFrame(train_data)
+
+ predictor = FastLinearClassifier() << {
+ Role.Label: 'd1',
+ Role.Feature: ['c1', 'c4']
+ }
+ pipeline = Pipeline([predictor])
+ result = json.loads(pipeline.fit(train_df, dry_run=True))
+
+ self.verify_classifier_nodes(result, "d1",
+ ['c1', 'c4'],
+ "Trainers.StochasticDualCoordinateAscentClassifier")
+
+ def test_non_label_based_predictor_does_not_have_label_column_automatically_removed(self):
+ train_data = {'c1': [2, 3, 4, 5], 'c2': [3, 4, 5, 6],
+ 'c3': [4, 5, 6, 7], 'Label': [0, 1, 2, 1]}
+ train_df = pd.DataFrame(train_data)
+
+ predictor = KMeansPlusPlus(n_clusters=5)
+ pipeline = Pipeline([predictor])
+ result = json.loads(pipeline.fit(train_df, dry_run=True))
+ nodes = result['nodes']
+
+ self.assertEqual(nodes[0]["Name"], "Transforms.FeatureCombiner")
+ if six.PY2:
+ self.assertItemsEqual(nodes[0]["Inputs"]["Features"], ['c1', 'c2', 'c3', 'Label'])
+ else:
+ self.assertCountEqual(nodes[0]["Inputs"]["Features"], ['c1', 'c2', 'c3', 'Label'])
+ self.assertEqual(nodes[1]["Name"], "Trainers.KMeansPlusPlusClusterer")
+ self.assertEqual(nodes[1]["Inputs"]["FeatureColumnName"], "Features")
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/src/python/nimbusml/tests/test_syntax_learner.py b/src/python/nimbusml/tests/test_syntax_learner.py
index 2c649304..edc05372 100644
--- a/src/python/nimbusml/tests/test_syntax_learner.py
+++ b/src/python/nimbusml/tests/test_syntax_learner.py
@@ -458,7 +458,7 @@ def test_syntax_slots_wo_pipeline(self):
if spl[0] == 'age':
ages.append(l2)
X_xf1.columns = pandas.MultiIndex(
- levels=levels, labels=labels, names=names)
+ levels=levels, codes=labels, names=names)
print(X_xf1.head(n=2).T)
col_ages = [('age', a) for a in ages]
diff --git a/src/python/nimbusml/tests/timeseries/test_timeseriesimputer.py b/src/python/nimbusml/tests/timeseries/test_timeseriesimputer.py
new file mode 100644
index 00000000..fd530713
--- /dev/null
+++ b/src/python/nimbusml/tests/timeseries/test_timeseriesimputer.py
@@ -0,0 +1,43 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+
+import platform
+import unittest
+
+import numpy as np
+import pandas as pd
+from nimbusml.timeseries import TimeSeriesImputer
+
+
+@unittest.skipIf('centos' in platform.linux_distribution()[0].lower(), "centos is not supported")
+class TestTimeSeriesImputer(unittest.TestCase):
+
+ def test_timeseriesimputer_adds_new_row(self):
+ from nimbusml.timeseries import TimeSeriesImputer
+
+ df = pd.DataFrame(data=dict(
+ ts=[1, 2, 3, 5],
+ grain=[1970, 1970, 1970, 1970],
+ c3=[10, 13, 15, 20],
+ c4=[19, 12, 16, 19]
+ ))
+
+ tsi = TimeSeriesImputer(time_series_column='ts',
+ grain_columns=['grain'],
+ filter_columns=['c3', 'c4'],
+ impute_mode='ForwardFill',
+ filter_mode='Include')
+ result = tsi.fit_transform(df)
+
+ self.assertEqual(result.loc[0, 'ts'], 1)
+ self.assertEqual(result.loc[3, 'ts'], 4)
+ self.assertEqual(result.loc[3, 'grain'], 1970)
+ self.assertEqual(result.loc[3, 'c3'], 15)
+ self.assertEqual(result.loc[3, 'c4'], 16)
+ self.assertEqual(result.loc[3, 'IsRowImputed'], True)
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/src/python/nimbusml/timeseries/__init__.py b/src/python/nimbusml/timeseries/__init__.py
index 64e66add..05dbfa3c 100644
--- a/src/python/nimbusml/timeseries/__init__.py
+++ b/src/python/nimbusml/timeseries/__init__.py
@@ -3,11 +3,13 @@
from .ssaspikedetector import SsaSpikeDetector
from .ssachangepointdetector import SsaChangePointDetector
from .ssaforecaster import SsaForecaster
+from .timeseriesimputer import TimeSeriesImputer
__all__ = [
'IidSpikeDetector',
'IidChangePointDetector',
'SsaSpikeDetector',
'SsaChangePointDetector',
- 'SsaForecaster'
+ 'SsaForecaster',
+ 'TimeSeriesImputer'
]
diff --git a/src/python/nimbusml/timeseries/ssaforecaster.py b/src/python/nimbusml/timeseries/ssaforecaster.py
index 3cbe540f..35516d15 100644
--- a/src/python/nimbusml/timeseries/ssaforecaster.py
+++ b/src/python/nimbusml/timeseries/ssaforecaster.py
@@ -41,7 +41,7 @@ class SsaForecaster(core, BaseTransform, TransformerMixin):
:param series_length: The length of series that is kept in buffer for
modeling (parameter N).
- :param train_size: The length of series from the begining used for
+ :param train_size: The length of series from the beginning used for
training.
:param horizon: The number of values to forecast.
diff --git a/src/python/nimbusml/timeseries/timeseriesimputer.py b/src/python/nimbusml/timeseries/timeseriesimputer.py
new file mode 100644
index 00000000..bb28c346
--- /dev/null
+++ b/src/python/nimbusml/timeseries/timeseriesimputer.py
@@ -0,0 +1,77 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+TimeSeriesImputer
+"""
+
+__all__ = ["TimeSeriesImputer"]
+
+
+from sklearn.base import TransformerMixin
+
+from ..base_transform import BaseTransform
+from ..internal.core.timeseries.timeseriesimputer import \
+ TimeSeriesImputer as core
+from ..internal.utils.utils import trace
+
+
+class TimeSeriesImputer(core, BaseTransform, TransformerMixin):
+ """
+ **Description**
+ Fills in missing rows and values
+
+ :param columns: see `Columns `_.
+
+ :param time_series_column: Column representing the time.
+
+ :param grain_columns: List of grain columns.
+
+ :param filter_columns: Columns to filter.
+
+ :param filter_mode: Filter mode. Either include or exclude.
+
+ :param impute_mode: Mode for imputing, defaults to ForwardFill if not
+ provided.
+
+ :param supress_type_errors: Suppress the errors that would occur if a
+ column and impute mode are incompatible. If true, will skip the column.
+ If false, will stop and throw an error.
+
+ :param params: Additional arguments sent to compute engine.
+
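+ A minimal usage sketch (column names follow the unit tests and are illustrative)::
+
+ import pandas as pd
+ from nimbusml.timeseries import TimeSeriesImputer
+
+ df = pd.DataFrame(dict(ts=[1, 2, 3, 5], grain=[1970] * 4, c3=[10, 13, 15, 20]))
+ tsi = TimeSeriesImputer(time_series_column='ts', grain_columns=['grain'],
+ filter_columns=['c3'], filter_mode='Include')
+ result = tsi.fit_transform(df)  # the missing ts=4 row is added via ForwardFill
+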
+ """
+
+ @trace
+ def __init__(
+ self,
+ time_series_column,
+ grain_columns,
+ filter_columns=None,
+ filter_mode='Exclude',
+ impute_mode='ForwardFill',
+ supress_type_errors=False,
+ columns=None,
+ **params):
+
+ if columns:
+ params['columns'] = columns
+ BaseTransform.__init__(self, **params)
+ core.__init__(
+ self,
+ time_series_column=time_series_column,
+ grain_columns=grain_columns,
+ filter_columns=filter_columns,
+ filter_mode=filter_mode,
+ impute_mode=impute_mode,
+ supress_type_errors=supress_type_errors,
+ **params)
+ self._columns = columns
+
+ def get_params(self, deep=False):
+ """
+ Get the parameters for this operator.
+ """
+ return core.get_params(self)
diff --git a/src/python/setup.py b/src/python/setup.py
index e8481345..9ba7ff88 100644
--- a/src/python/setup.py
+++ b/src/python/setup.py
@@ -45,7 +45,7 @@
# Versions should comply with PEP440. For a discussion on
# single-sourcing the version across setup.py and the project code, see
# https://packaging.python.org/en/latest/single_source_version.html
- version='1.6.1',
+ version='1.7.0',
description='NimbusML',
long_description=long_description,
@@ -114,6 +114,7 @@
'tests': [
'nose>=1.3', 'pytest>=4.4.0',
'graphviz', 'imageio',
+ 'onnxruntime',
],
'dprep': ['azureml-dataprep>=1.1.33'],
'utils': ['graphviz', 'imageio'],
@@ -134,6 +135,7 @@
'nbconvert>=4.2.0',
'nose>=1.3',
'pytest>=4.4.0',
+ 'onnxruntime',
],
python_requires='>=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <3.8.*',
diff --git a/src/python/setup.py.in b/src/python/setup.py.in
index 0489bc13..a995460d 100644
--- a/src/python/setup.py.in
+++ b/src/python/setup.py.in
@@ -114,6 +114,7 @@ setup(
'tests': [
'nose>=1.3', 'pytest>=4.4.0',
'graphviz', 'imageio',
+ 'onnxruntime',
],
'dprep': ['azureml-dataprep>=1.1.33'],
'utils': ['graphviz', 'imageio'],
@@ -134,6 +135,7 @@ setup(
'nbconvert>=4.2.0',
'nose>=1.3',
'pytest>=4.4.0',
+ 'onnxruntime'
],
python_requires='>=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, <3.8.*',
diff --git a/src/python/tests/test_estimator_checks.py b/src/python/tests/test_estimator_checks.py
index df7c1e87..a9372ad3 100644
--- a/src/python/tests/test_estimator_checks.py
+++ b/src/python/tests/test_estimator_checks.py
@@ -7,6 +7,7 @@
"""
import json
import os
+import platform
import unittest
from nimbusml.cluster import KMeansPlusPlus
@@ -19,9 +20,10 @@
from nimbusml.ensemble import LightGbmRegressor
from nimbusml.feature_extraction.text import NGramFeaturizer
from nimbusml.internal.entrypoints._ngramextractor_ngram import n_gram
+from nimbusml.preprocessing import TensorFlowScorer, DateTimeSplitter
from nimbusml.linear_model import SgdBinaryClassifier
-from nimbusml.preprocessing import TensorFlowScorer
from nimbusml.preprocessing.filter import SkipFilter, TakeFilter
+from nimbusml.preprocessing.normalization import RobustScaler
from nimbusml.timeseries import (IidSpikeDetector, IidChangePointDetector,
SsaSpikeDetector, SsaChangePointDetector,
SsaForecaster)
@@ -56,6 +58,15 @@
# I8 should not have NA values
'CountSelector':
'check_estimators_dtypes',
+ # DateTimeSplitter does not work with floating point types.
+ 'DateTimeSplitter':
+ 'check_transformer_general, check_pipeline_consistency, '
+ 'check_estimators_pickle, check_estimators_dtypes, '
+ 'check_dict_unchanged, check_dtype_object, check_fit_score_takes_y, '
+ 'check_transformer_data_not_an_array, check_fit1d_1feature,'
+ 'check_fit2d_1feature, check_fit2d_predict1d, check_estimators_overwrite_params,'
+ 'check_estimator_sparse_data, check_fit2d_1sample, check_dont_overwrite_parameters,'
+ 'check_estimators_fit_returns_self',
# by design returns smaller number of rows
'SkipFilter': 'check_transformer_general, '
'check_transformer_data_not_an_array',
@@ -157,6 +168,16 @@
'check_estimators_overwrite_params, \
check_estimator_sparse_data, check_estimators_pickle, '
'check_estimators_nan_inf',
+ # RobustScaler does not support vectorized types
+ 'RobustScaler': 'check_estimator_sparse_data',
+ 'ToKeyImputer':
+ 'check_estimator_sparse_data, check_estimators_dtypes',
+ # Most of these skipped tests are failing because the checks
+ # require numerical types. ToString returns object types.
+ # TypeError: ufunc 'isfinite' not supported for the input types
+ 'ToString': 'check_estimator_sparse_data, check_pipeline_consistency, '
+ 'check_transformer_data_not_an_array, check_estimators_pickle, '
+ 'check_transformer_general',
'OrdinaryLeastSquaresRegressor': 'check_fit2d_1sample'
}
@@ -196,6 +217,7 @@
'check_classifiers_train']
INSTANCES = {
+ 'DateTimeSplitter': DateTimeSplitter(prefix='dt', columns=['F0']),
'EnsembleClassifier': EnsembleClassifier(num_models=3),
'EnsembleRegressor': EnsembleRegressor(num_models=3),
'FactorizationMachineBinaryClassifier': FactorizationMachineBinaryClassifier(shuffle=False),
@@ -209,6 +231,7 @@
'LightGbmRanker': LightGbmRanker(
minimum_example_count_per_group=1, minimum_example_count_per_leaf=1),
'NGramFeaturizer': NGramFeaturizer(word_feature_extractor=n_gram()),
+ 'RobustScaler': RobustScaler(scale=False),
'SgdBinaryClassifier': SgdBinaryClassifier(number_of_threads=1, shuffle=False),
'SkipFilter': SkipFilter(count=5),
'TakeFilter': TakeFilter(count=100000),
@@ -256,9 +279,18 @@
'TreeFeaturizer',
# skip SymSgdBinaryClassifier for now, because of crashes.
'SymSgdBinaryClassifier',
- 'DatasetTransformer'
+ 'DatasetTransformer',
+ 'OnnxRunner',
+ 'TimeSeriesImputer'
])
+if 'centos' in platform.linux_distribution()[0].lower():
+ skip_epoints |= set([
+ 'DateTimeSplitter',
+ 'RobustScaler',
+ 'ToKeyImputer',
+ 'ToString'])
+
def load_json(file_path):
with open(file_path) as f:
diff --git a/src/python/tests_extended/data_frame_tool.py b/src/python/tests_extended/data_frame_tool.py
new file mode 100644
index 00000000..e1b9317e
--- /dev/null
+++ b/src/python/tests_extended/data_frame_tool.py
@@ -0,0 +1,209 @@
+#-------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+#--------------------------------------------------------------------------
+
+from datetime import datetime
+import numpy as np
+import pandas as pd
+import onnxruntime as onnxrt
+
+ort_float_set = set([np.float32, np.float64])
+
+pd_float_set = set(['float64'])
+
+ort_int_set = set([np.int8, np.uint8, np.int16, np.uint16, np.int32, np.uint32, np.int64, np.uint64])
+
+pd_int_set = set(['int64'])
+
+types_dict = {
+ 'tensor(float16)': np.float16,
+ 'tensor(float)' : np.float32,
+ 'tensor(double)' : np.float64,
+
+ 'tensor(int8)' : np.int8,
+ 'tensor(uint8)' : np.uint8,
+ 'tensor(int16)' : np.int16,
+ 'tensor(uint16)' : np.uint16,
+ 'tensor(int32)' : np.int32,
+ 'tensor(uint32)' : np.uint32,
+ 'tensor(int64)' : np.int64,
+ 'tensor(uint64)' : np.uint64,
+
+ 'tensor(bool)' : np.bool,
+ 'tensor(string)' : np.object
+}
+
+class DataFrameTool():
+ """
+ This is a utility class used to run a model with pandas.DataFrame input
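+
+ A minimal usage sketch (the model path and output name are hypothetical)::
+
+ tool = DataFrameTool('model.onnx')
+ scores = tool.execute(df, output_names=['Score.output'])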
+ """
+ def __init__(self, model_path, sess_options=None):
+ """
+ :param model_path: path to the model to be loaded
+ :param sess_options: see onnxruntime.SessionOptions
+ """
+ self._model_path = model_path
+ self._sess_options = sess_options
+ self._sess = onnxrt.InferenceSession(self._model_path, self._sess_options)
+
+ def _reshape_input(self, input_array, expected_shape):
+ """
+ :param input_array: numpy array obtained from the DataFrame; expected to have
+ a rank of 1.
+ :param expected_shape: shape fetched from the model, which may include dynamic elements.
+ expected_shape may have at most one -1, None or zero, which is computed from
+ the size of the input_array. We replace None and zeros with -1 and let np.ndarray.reshape deal with it.
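+ Example: an input_array of 8 values with expected_shape [None, 4] is reshaped to (2, 4).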
+ """
+ # expected_shape has rank one; we let onnxruntime deal with it
+ if len(expected_shape) == 1:
+ return input_array
+
+ inferred_shape = [dim if dim else -1 for dim in expected_shape]
+ return input_array.reshape(inferred_shape)
+
+ def _validate_type(self, input_meta, col_type):
+ """
+ : input_meta - meta info obtained from the model for the given input
+ : col_type - dtype of the column
+ : throws if conditions are not met
+
+ float16 and bool always require an exact match.
+ We attempt to convert any type to a string if the model requires a string.
+ With strings we always put the data into a flat array, cast to np.object and then reshape.
+ Any other type must belong to the same family (integer or floating point) as the model type to qualify for casting.
+ Python datetime values, denoted in pandas as datetime64[ns], are cast to int64.
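+ For example, a pandas 'int64' column is accepted for an input declared as 'tensor(int32)',
+ while a 'float64' column is rejected for 'tensor(bool)' since bool requires an exact match.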
+ """
+ expected_type = types_dict[input_meta.type]
+ if input_meta.type == 'tensor(string)':
+ return
+ elif expected_type == col_type:
+ return
+ elif expected_type == np.int64 and str(col_type) == 'datetime64[ns]':
+ return
+ elif expected_type == np.uint32 and str(col_type) == 'category':
+ return
+ elif expected_type in ort_float_set and str(col_type) in pd_float_set:
+ return
+ elif expected_type in ort_int_set and str(col_type) in pd_int_set:
+ return
+
+ raise TypeError("Input {} requires type {} unable to cast column type {} ".format(
+ input_meta.name, expected_type, col_type))
+
+
+ def _process_input_list(self, df, input_metas, require):
+ """
+ Return a dictionary of input_name : a typed and shaped np.array of values for a given input_meta
+ The function does the heavy lifting for _get_input_feeds()
+
+ :param df: See :class:`pandas.DataFrame`.
+ :param input_metas: a list of name/type pairs
+ :param require: boolean; if True, this helper raises an error on a missing input.
+
+ """
+ feeds = {}
+ # Process mandatory inputs. Raise an error if anything is not present
+ for input_meta in input_metas:
+ # We fully expect all the types are in the above dictionary
+ assert input_meta.type in types_dict, "Update types_dict for the new type"
+ if input_meta.name in df.columns:
+ self._validate_type(input_meta, df[input_meta.name].dtype)
+ if (df[input_meta.name].dtype) == 'datetime64[ns]':
+ input_array = np.array([dt.timestamp() for dt in df[input_meta.name]]).astype(np.int64)
+ elif (str(df[input_meta.name].dtype)) == 'category':
+ # ONNX models trained in ML.NET expect 1-based indices for input coming from
+ # "categorical columns", whereas pandas Categorical columns store 0-based codes
+ # (retrieved via .array.codes), so shift each code by one.
+ input_array = np.array([key + 1 for key in df[input_meta.name].array.codes]).astype(np.uint32)
+ else:
+ # With strings we must first cast to np.object and then reshape,
+ # so we do it for everything
+ input_array = np.array(df[input_meta.name]).astype(types_dict[input_meta.type])
+
+ feeds[input_meta.name] = self._reshape_input(input_array, input_meta.shape)
+
+ elif require:
+ raise RuntimeError("This model requires input {} of type {} but it is not found in the DataFrame".format(
+ input_meta.name, types_dict[input_meta.type]))
+ return feeds
+
+
+ def _get_input_feeds(self, df, sess):
+ """
+ Return a dictionary of input_name : a typed and shaped np.array of values
+ This function accepts a pandas DataFrame as the first argument and an onnxruntime
+ session with a loaded model. It interrogates the model for its inputs and matches
+ the model input names to the DataFrame column names.
+ It requires exact matches for bool and float16 types, and attempts to convert any
+ input to string when the model requires a string.
+ Floating-point types are converted to each other, and likewise for the integer
+ types, without requiring an exact match.
+
+ :param df: See :class:`pandas.DataFrame`. Each matching column is converted
+ and fed to the corresponding model input.
+
+ :param sess: See :class:`onnxruntime.InferenceSession`.
+
+ ::
+ For example: pd.DataFrame([[0, 4, 20]], index=[0], columns=['A', 'B', 'C'])
+
+ """
+ if df.empty:
+ raise RuntimeError('input DataFrame is empty')
+
+ # Process mandatory inputs. Raise an error if anything is not present
+ feeds = self._process_input_list(df, sess.get_inputs(), True)
+ # Process optional overridable initializers. If present, the initializer value
+ # is overridden by the input. If not, the initializer value embedded in the model takes effect.
+ initializers = self._process_input_list(df, sess.get_overridable_initializers(), False)
+
+ feeds.update(initializers)
+
+ return feeds
+
+ def execute(self, df, output_names=None, output_types=None, run_options=None):
+ "Return a list of output values restricted to output names if not empty"
+ """
+ Compute the predictions.
+
+ :param df: See :class:`pandas.DataFrame`.
+ :output_name - list of column output names and their order to output
+ :output_types { output_name : dtype } optional dictionary that asks to cast output
+ to the colum type
+
+ :param run_options: See :class:`onnxruntime.RunOptions`.
+ ::
+ sess.run([output_name], {input_name: x})
+ Pandas DataFrame
+ """
+ input_feed = self._get_input_feeds(df, self._sess)
+ if not output_names:
+ output_names = [output.name for output in self._sess._outputs_meta]
+
+ results = self._sess.run(output_names, input_feed, run_options)
+
+ df = pd.DataFrame()
+ for i, r in enumerate(results):
+ # TODO: remove this. These extra columns
+ # should not be in the output.
+ if output_names[i].startswith('mlnet.') and \
+ output_names[i].endswith('.unusedOutput') and \
+ r.shape == (1,1):
+ continue
+
+ r = np.split(r, r.shape[-1], axis=-1) \
+ if (r.shape[-1] > 1 and r.shape[0] > 1) else [r]
+
+ for suffix, col in enumerate(r):
+ col = col.flatten()
+ if output_types and output_names[i] in output_types:
+ dtype = output_types[output_names[i]]
+ if dtype == np.dtype('datetime64'):
+ col = col.astype(np.int64)
+ col = [datetime.utcfromtimestamp(ts) for ts in col]
+ else:
+ col = col.astype(dtype)
+
+ col_name = output_names[i] if len(r) == 1 else \
+ output_names[i] + '.' + str(suffix)
+ df[col_name] = col
+
+ return df
diff --git a/src/python/tests_extended/test_automl_scenario.py b/src/python/tests_extended/test_automl_scenario.py
new file mode 100644
index 00000000..ec659612
--- /dev/null
+++ b/src/python/tests_extended/test_automl_scenario.py
@@ -0,0 +1,90 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+
+import os
+import time
+import tempfile
+import unittest
+import pandas as pd
+import six
+from nimbusml import Pipeline, FileDataStream
+from nimbusml.datasets import get_dataset
+from nimbusml.feature_extraction.text import NGramFeaturizer
+from nimbusml.linear_model import AveragedPerceptronBinaryClassifier
+from nimbusml.multiclass import OneVsRestClassifier
+from nimbusml.preprocessing import DatasetTransformer
+from data_frame_tool import DataFrameTool as DFT
+
+
+def get_tmp_file(suffix=None):
+ fd, file_name = tempfile.mkstemp(suffix=suffix)
+ fl = os.fdopen(fd, 'w')
+ fl.close()
+ return file_name
+
+path = get_dataset("wiki_detox_train").as_filepath()
+train_set = FileDataStream.read_csv(path, sep='\t')
+path = get_dataset("wiki_detox_test").as_filepath()
+test_set = FileDataStream.read_csv(path, sep='\t')
+
+class TestOnnxRuntime(unittest.TestCase):
+ """
+ Tests automl use case:
+ 1. Fit featurization pipeline separately.
+ 2. Fit learner on top of the featurization pipeline.
+ 3. Export the learner pipeline to ONNX.
+ 4. Compare results between ML.NET and ORT
+ """
+
+ @unittest.skipIf(six.PY2, "Disabled due to bug on Mac Python 2.7 build, more info:")
+ def test_automl_usecase(self):
+ # train featurization pipeline
+ featurization_pipe = Pipeline([NGramFeaturizer(keep_diacritics=True, columns={'Features': ['SentimentText']})])
+ featurization_pipe.fit(train_set)
+
+ # train learner pipeline
+ learner_pipe = Pipeline([DatasetTransformer(featurization_pipe.model),
+ OneVsRestClassifier(AveragedPerceptronBinaryClassifier(),
+ feature=['Features'], label='Sentiment')
+ ])
+ learner_pipe.fit(train_set)
+
+ # Export the learner pipeline to ONNX
+ onnx_path = get_tmp_file('.onnx')
+ learner_pipe.export_to_onnx(onnx_path, 'com.microsoft.ml', onnx_version='Stable')
+
+ # Perform the transform using the standard ML.Net backend
+ start = time.time()
+ result_standard = learner_pipe.predict(test_set)
+ end = time.time()
+ print('%ss done transform using standard backend' % round(end - start, 3))
+
+ # Perform the transform using the ORT backend
+ df_tool = DFT(onnx_path)
+ dataset = test_set.to_df()
+ start = time.time()
+ result_ort = df_tool.execute(dataset, ['PredictedLabel.output', 'Score.output'])
+ end = time.time()
+ print('%ss done transform using ORT backend (excludes df load time)' % round(end - start, 3))
+
+ # compare the results
+ for col_tuple in (('PredictedLabel', 'PredictedLabel.output'),
+ ('Score.0', 'Score.output.0'),
+ ('Score.1', 'Score.output.1'),
+ ):
+ col_expected = result_standard.loc[:, col_tuple[0]]
+ col_ort = result_ort.loc[:, col_tuple[1]]
+
+ check_kwargs = {
+ 'check_names': False,
+ 'check_exact': False,
+ 'check_dtype': True,
+ 'check_less_precise': True
+ }
+
+ pd.testing.assert_series_equal(col_expected, col_ort, **check_kwargs)
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/src/python/tests_extended/test_export_to_onnx.py b/src/python/tests_extended/test_export_to_onnx.py
new file mode 100644
index 00000000..dfb9448c
--- /dev/null
+++ b/src/python/tests_extended/test_export_to_onnx.py
@@ -0,0 +1,668 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+"""
+Verify onnx export and transform support
+"""
+import contextlib
+import io
+import json
+import os
+import sys
+import tempfile
+import numpy as np
+import pandas as pd
+import pprint
+
+from nimbusml import Pipeline
+from nimbusml.base_predictor import BasePredictor
+from nimbusml.cluster import KMeansPlusPlus
+from nimbusml.datasets import get_dataset
+from nimbusml.datasets.image import get_RevolutionAnalyticslogo, get_Microsoftlogo
+from nimbusml.decomposition import PcaTransformer, PcaAnomalyDetector
+from nimbusml.ensemble import FastForestBinaryClassifier, FastTreesTweedieRegressor, LightGbmRanker
+from nimbusml.feature_extraction.categorical import OneHotVectorizer, OneHotHashVectorizer
+from nimbusml.feature_extraction.image import Loader, Resizer, PixelExtractor
+from nimbusml.feature_extraction.text import NGramFeaturizer
+from nimbusml.feature_extraction.text.extractor import Ngram
+from nimbusml.feature_selection import CountSelector, MutualInformationSelector
+from nimbusml.linear_model import (AveragedPerceptronBinaryClassifier,
+ FastLinearBinaryClassifier,
+ LinearSvmBinaryClassifier)
+from nimbusml.multiclass import OneVsRestClassifier
+from nimbusml.naive_bayes import NaiveBayesClassifier
+from nimbusml.preprocessing import (TensorFlowScorer, FromKey, ToKey,
+ DateTimeSplitter, OnnxRunner)
+from nimbusml.preprocessing.filter import SkipFilter, TakeFilter, RangeFilter
+from nimbusml.preprocessing.missing_values import Filter, Handler, Indicator
+from nimbusml.preprocessing.normalization import Binner, GlobalContrastRowScaler, LpScaler
+from nimbusml.preprocessing.schema import (ColumnConcatenator, TypeConverter,
+ ColumnDuplicator, ColumnSelector, PrefixColumnConcatenator)
+from nimbusml.preprocessing.text import CharTokenizer, WordTokenizer
+from nimbusml.timeseries import (IidSpikeDetector, IidChangePointDetector,
+ SsaSpikeDetector, SsaChangePointDetector,
+ SsaForecaster)
+from data_frame_tool import DataFrameTool as DFT
+
+SHOW_ONNX_JSON = False
+SHOW_TRANSFORMED_RESULTS = True
+SHOW_FULL_PANDAS_OUTPUT = False
+
+if SHOW_FULL_PANDAS_OUTPUT:
+ pd.set_option('display.max_columns', None)
+ pd.set_option('display.max_rows', None)
+ pd.set_option('display.width', 10000)
+
+script_path = os.path.realpath(__file__)
+script_dir = os.path.dirname(script_path)
+
+# Sepal_Length Sepal_Width Petal_Length Petal_Width Label Species Setosa
+# 0 5.1 3.5 1.4 0.2 0 setosa 1.0
+# 1 4.9 3.0 1.4 0.2 0 setosa 1.0
+iris_df = get_dataset("iris").as_df()
+iris_df.drop(['Species'], axis=1, inplace=True)
+
+iris_with_nan_df = iris_df.copy()
+iris_with_nan_df.loc[1, 'Petal_Length'] = np.nan
+
+iris_no_label_df = iris_df.drop(['Label'], axis=1)
+iris_binary_df = iris_no_label_df.rename(columns={'Setosa': 'Label'})
+iris_regression_df = iris_no_label_df.drop(['Setosa'], axis=1).rename(columns={'Petal_Width': 'Label'})
+
+# Unnamed: 0 education age parity induced case spontaneous stratum pooled.stratum education_str
+# 0 1 0.0 26.0 6.0 1.0 1.0 2.0 1.0 3.0 0-5yrs
+# 1 2 0.0 42.0 1.0 1.0 1.0 0.0 2.0 1.0 0-5yrs
+infert_df = get_dataset("infert").as_df()
+infert_df.columns = [i.replace(': ', '') for i in infert_df.columns]
+infert_df.rename(columns={'case': 'Label'}, inplace=True)
+
+infert_onehot_df = (OneHotVectorizer() << 'education_str').fit_transform(infert_df)
+infert_onehot_df['Label'] = infert_onehot_df['Label'].astype(np.uint32)
+
+# rank group carrier price Class dep_day nbr_stops duration
+# 0 2 1 AA 240 3 1 0 12.0
+# 1 1 1 AA 300 3 0 1 15.0
+file_path = get_dataset("gen_tickettrain").as_filepath()
+gen_tt_df = pd.read_csv(file_path)
+gen_tt_df['group'] = gen_tt_df['group'].astype(np.uint32)
+
+# Unnamed: 0 Label Solar_R Wind Temp Month Day
+# 0 1 41.0 190.0 7.4 67 5 1
+# 1 2 36.0 118.0 8.0 72 5 2
+airquality_df = get_dataset("airquality").as_df().fillna(0)
+airquality_df = airquality_df[airquality_df.Ozone.notnull()]
+
+# Sentiment SentimentText
+# 0 1 ==RUDE== Dude, you are rude upload that carl...
+# 1 1 == OK! == IM GOING TO VANDALIZE WILD ONES W...
+file_path = get_dataset("wiki_detox_train").as_filepath()
+wiki_detox_df = pd.read_csv(file_path, sep='\t')
+wiki_detox_df = wiki_detox_df.head(10)
+
+# Path Label
+# 0 C:\repo\src\python... True
+# 1 C:\repo\src\python... False
+image_paths_df = pd.DataFrame(data=dict(
+ Path=[get_RevolutionAnalyticslogo(), get_Microsoftlogo()],
+ Label=[True, False]))
+
+
+SKIP = {
+ 'DatasetTransformer',
+ 'LightLda',
+ 'NGramExtractor', # Crashes
+ 'OneVsRestClassifier',
+ 'OnnxRunner',
+ 'Sentiment',
+ 'TensorFlowScorer',
+ 'TimeSeriesImputer',
+ 'TreeFeaturizer',
+ 'WordEmbedding',
+}
+
+INSTANCES = {
+ 'AveragedPerceptronBinaryClassifier': AveragedPerceptronBinaryClassifier(
+ feature=['education_str.0-5yrs', 'education_str.6-11yrs', 'education_str.12+ yrs']),
+ 'Binner': Binner(num_bins=3),
+ 'CharTokenizer': CharTokenizer(columns={'SentimentText_Transform': 'SentimentText'}),
+ 'ColumnConcatenator': ColumnConcatenator(columns={'Features': [
+ 'Sepal_Length',
+ 'Sepal_Width',
+ 'Petal_Length',
+ 'Petal_Width',
+ 'Setosa']}),
+ 'ColumnSelector': ColumnSelector(columns=['Sepal_Width', 'Sepal_Length']),
+ 'ColumnDuplicator': ColumnDuplicator(columns={'dup': 'Sepal_Width'}),
+ 'CountSelector': CountSelector(count=5, columns=['Sepal_Width']),
+ 'DateTimeSplitter': DateTimeSplitter(prefix='dt'),
+ 'FastForestBinaryClassifier': FastForestBinaryClassifier(feature=['Sepal_Width', 'Sepal_Length'],
+ label='Setosa'),
+ 'FastLinearBinaryClassifier': FastLinearBinaryClassifier(feature=['Sepal_Width', 'Sepal_Length'],
+ label='Setosa'),
+ 'FastTreesTweedieRegressor': FastTreesTweedieRegressor(label='Ozone'),
+ 'Filter': Filter(columns=[ 'Petal_Length', 'Petal_Width']),
+ 'FromKey': Pipeline([
+ ToKey(columns=['Sepal_Length']),
+ FromKey(columns=['Sepal_Length'])
+ ]),
+ # GlobalContrastRowScaler currently requires a vector input to work
+ 'GlobalContrastRowScaler': Pipeline([
+ ColumnConcatenator() << {
+ 'concated_columns': [
+ 'Petal_Length',
+ 'Sepal_Width',
+ 'Sepal_Length']},
+ GlobalContrastRowScaler(columns={'normed_columns': 'concated_columns'})
+ ]),
+ 'Handler': Handler(replace_with='Mean', columns={'NewVals': 'Petal_Length'}),
+ 'IidSpikeDetector': IidSpikeDetector(columns=['Sepal_Length']),
+ 'IidChangePointDetector': IidChangePointDetector(columns=['Sepal_Length']),
+ 'Indicator': Indicator(columns={'Has_Nan': 'Petal_Length'}),
+ 'KMeansPlusPlus': KMeansPlusPlus(n_clusters=3, feature=['Sepal_Width', 'Sepal_Length']),
+ 'LightGbmRanker': LightGbmRanker(feature=['Class', 'dep_day', 'duration'],
+ label='rank',
+ group_id='group'),
+ 'Loader': Loader(columns={'ImgPath': 'Path'}),
+ 'LpScaler': Pipeline([
+ ColumnConcatenator() << {
+ 'concated_columns': [
+ 'Petal_Length',
+ 'Sepal_Width',
+ 'Sepal_Length']},
+ LpScaler(columns={'normed_columns': 'concated_columns'})
+ ]),
+ 'MutualInformationSelector': Pipeline([
+ ColumnConcatenator(columns={'Features': ['Sepal_Width', 'Sepal_Length', 'Petal_Width']}),
+ MutualInformationSelector(
+ columns='Features',
+ label='Label',
+ slots_in_output=2) # only accept one column
+ ]),
+ 'NaiveBayesClassifier': NaiveBayesClassifier(feature=['Sepal_Width', 'Sepal_Length']),
+ 'NGramFeaturizer': NGramFeaturizer(word_feature_extractor=Ngram(),
+ char_feature_extractor=Ngram(),
+ keep_diacritics=True,
+ columns={ 'features': ['SentimentText']}),
+ 'OneHotHashVectorizer': OneHotHashVectorizer(columns=['education_str']),
+ 'OneHotVectorizer': OneHotVectorizer(columns=['education_str']),
+ 'OneVsRestClassifier(AveragedPerceptronBinaryClassifier)': \
+ OneVsRestClassifier(AveragedPerceptronBinaryClassifier(),
+ use_probabilities=True,
+ feature=['age',
+ 'education_str.0-5yrs',
+ 'education_str.6-11yrs',
+ 'education_str.12+ yrs'],
+ label='induced'),
+ 'OneVsRestClassifier(LinearSvmBinaryClassifier)': \
+ OneVsRestClassifier(LinearSvmBinaryClassifier(),
+ use_probabilities=True,
+ feature=['age',
+ 'education_str.0-5yrs',
+ 'education_str.6-11yrs',
+ 'education_str.12+ yrs'],
+ label='induced'),
+ 'PcaAnomalyDetector': PcaAnomalyDetector(rank=3),
+ 'PcaTransformer': PcaTransformer(rank=2),
+ 'PixelExtractor': Pipeline([
+ Loader(columns={'ImgPath': 'Path'}),
+ PixelExtractor(columns={'ImgPixels': 'ImgPath'}),
+ ]),
+ 'PrefixColumnConcatenator': PrefixColumnConcatenator(columns={'Features': 'Sepal_'}),
+ 'Resizer': Pipeline([
+ Loader(columns={'ImgPath': 'Path'}),
+ Resizer(image_width=227, image_height=227,
+ columns={'ImgResize': 'ImgPath'})
+ ]),
+ 'SkipFilter': SkipFilter(count=5),
+ 'SsaSpikeDetector': SsaSpikeDetector(columns=['Sepal_Length'],
+ seasonal_window_size=2),
+ 'SsaChangePointDetector': SsaChangePointDetector(columns=['Sepal_Length'],
+ seasonal_window_size=2),
+ 'SsaForecaster': SsaForecaster(columns=['Sepal_Length'],
+ window_size=2,
+ series_length=5,
+ train_size=5,
+ horizon=1),
+ 'RangeFilter': RangeFilter(min=5.0, max=5.1, columns=['Sepal_Length']),
+ 'TakeFilter': TakeFilter(count=100),
+ 'TensorFlowScorer': TensorFlowScorer(
+ model_location=os.path.join(
+ script_dir,
+ '..',
+ 'nimbusml',
+ 'examples',
+ 'frozen_saved_model.pb'),
+ columns={'c': ['a', 'b']}),
+ 'ToKey': ToKey(columns={'edu_1': 'education_str'}),
+ 'TypeConverter': TypeConverter(columns=['group'], result_type='R4'),
+ 'WordTokenizer': WordTokenizer(char_array_term_separators=[" "]) << {'wt': 'SentimentText'}
+}
+
+DATASETS = {
+ 'AveragedPerceptronBinaryClassifier': infert_onehot_df,
+ 'Binner': iris_no_label_df,
+ 'BootstrapSampler': infert_df,
+ 'CharTokenizer': wiki_detox_df,
+ 'EnsembleRegressor': iris_regression_df,
+ 'FactorizationMachineBinaryClassifier': iris_binary_df,
+ 'FastForestBinaryClassifier': iris_no_label_df,
+ 'FastForestRegressor': iris_regression_df,
+ 'FastLinearBinaryClassifier': iris_no_label_df,
+ 'FastLinearClassifier': iris_binary_df,
+ 'FastLinearRegressor': iris_regression_df,
+ 'FastTreesBinaryClassifier': iris_binary_df,
+ 'FastTreesRegressor': iris_regression_df,
+ 'FastTreesTweedieRegressor': airquality_df,
+ 'Filter': iris_no_label_df,
+ 'GamBinaryClassifier': iris_binary_df,
+ 'GamRegressor': iris_regression_df,
+ 'GlobalContrastRowScaler': iris_df.astype(np.float32),
+ 'Handler': iris_with_nan_df,
+ 'Indicator': iris_with_nan_df,
+ 'LightGbmBinaryClassifier': iris_binary_df,
+ 'LightGbmRanker': gen_tt_df,
+ 'LinearSvmBinaryClassifier': iris_binary_df,
+ 'Loader': image_paths_df,
+ 'LogisticRegressionBinaryClassifier': iris_binary_df,
+ 'LogisticRegressionClassifier': iris_df,
+ 'LogMeanVarianceScaler': iris_no_label_df,
+ 'LpScaler': iris_no_label_df.drop(['Setosa'], axis=1).astype(np.float32),
+ 'MeanVarianceScaler': iris_no_label_df,
+ 'MinMaxScaler': iris_no_label_df,
+ 'NGramFeaturizer': wiki_detox_df,
+ 'OneHotHashVectorizer': infert_df,
+ 'OneHotVectorizer': infert_df,
+ 'OnlineGradientDescentRegressor': iris_regression_df,
+ 'OneVsRestClassifier(AveragedPerceptronBinaryClassifier)': infert_onehot_df,
+ 'OneVsRestClassifier(LinearSvmBinaryClassifier)': infert_onehot_df,
+ 'OrdinaryLeastSquaresRegressor': iris_regression_df,
+ 'PcaAnomalyDetector': iris_no_label_df,
+ 'PcaTransformer': iris_regression_df,
+ 'PixelExtractor': image_paths_df,
+ 'PoissonRegressionRegressor': iris_regression_df,
+ 'Resizer': image_paths_df,
+ 'SgdBinaryClassifier': iris_binary_df,
+ 'SymSgdBinaryClassifier': iris_binary_df,
+ 'ToKey': infert_df,
+ 'TypeConverter': gen_tt_df,
+ 'WordTokenizer': wiki_detox_df
+}
+
+EXPECTED_RESULTS = {
+ 'AveragedPerceptronBinaryClassifier': {'cols': [('PredictedLabel', 'PredictedLabel', 'PredictedLabel.output')]},
+ 'CharTokenizer': {'num_cols': 424, 'cols': 0},
+ 'ColumnConcatenator': {'num_cols': 11, 'cols': 0},
+ 'ColumnDuplicator': {'num_cols': 7, 'cols': 0},
+ 'ColumnSelector': {
+ 'num_cols': 2,
+ 'cols': [('Sepal_Width', 'Sepal_Width', 'Sepal_Width.output'),
+ ('Sepal_Length', 'Sepal_Length', 'Sepal_Length.output')]
+ },
+ #'EnsembleClassifier': {'cols': [('PredictedLabel', 'PredictedLabel')]},
+ #'EnsembleRegressor': {'cols': [('Score', 'Score')]},
+ 'FastForestBinaryClassifier': {'cols': [('PredictedLabel', 'PredictedLabel', 'PredictedLabel.output')]},
+ 'FastForestRegressor': {'cols': [('Score', 'Score', 'Score.output')]},
+ 'FastLinearBinaryClassifier': {'cols': [('PredictedLabel', 'PredictedLabel', 'PredictedLabel.output')]},
+ 'FastLinearClassifier': {'cols': [('PredictedLabel', 'PredictedLabel', 'PredictedLabel.output')]},
+ 'FastLinearRegressor': {'cols': [('Score', 'Score', 'Score.output')]},
+ 'FastTreesBinaryClassifier': {'cols': [('PredictedLabel', 'PredictedLabel', 'PredictedLabel.output')]},
+ 'FastTreesRegressor': {'cols': [('Score', 'Score', 'Score.output')]},
+ 'FastTreesTweedieRegressor': {'cols': [('Score', 'Score', 'Score.output')]},
+ 'FromKey': {'num_cols': 6, 'cols': 0},
+ 'GlobalContrastRowScaler': {'num_cols': 12, 'cols': 0},
+ 'Handler': {'num_cols': 8, 'cols': 0},
+ 'Indicator': {'num_cols': 7, 'cols': 0},
+ 'KMeansPlusPlus': {'cols': [('PredictedLabel', 'PredictedLabel', 'PredictedLabel.output')]},
+ 'LightGbmBinaryClassifier': {'cols': [('PredictedLabel', 'PredictedLabel', 'PredictedLabel.output')]},
+ 'LightGbmClassifier': {'cols': [('PredictedLabel', 'PredictedLabel', 'PredictedLabel.output')]},
+ 'LightGbmRanker': {'cols': [('Score', 'Score', 'Score.output')]},
+ 'LightGbmRegressor': {'cols': [('Score', 'Score', 'Score.output')]},
+ 'LinearSvmBinaryClassifier': {'cols': [('PredictedLabel', 'PredictedLabel', 'PredictedLabel.output')]},
+ 'LogisticRegressionBinaryClassifier': {'cols': [('PredictedLabel', 'PredictedLabel', 'PredictedLabel.output')]},
+ 'LogisticRegressionClassifier': {'cols': [('PredictedLabel', 'PredictedLabel', 'PredictedLabel.output')]},
+ 'LpScaler': {'num_cols': 10, 'cols': 0},
+ 'MeanVarianceScaler': {'num_cols': 5, 'cols': 0},
+ 'MinMaxScaler': {'num_cols': 5, 'cols': 0},
+ 'MutualInformationSelector': {'num_cols': 8, 'cols': 0},
+ 'NGramFeaturizer': {'num_cols': 273, 'cols': 0},
+ 'NaiveBayesClassifier': {'cols': [('PredictedLabel', 'PredictedLabel', 'PredictedLabel.output')]},
+ 'OneHotVectorizer': {'num_cols': 12, 'cols': 0},
+ 'OneVsRestClassifier(AveragedPerceptronBinaryClassifier)': \
+ {'cols': [('PredictedLabel', 'PredictedLabel', 'PredictedLabel.output')]},
+ 'OneVsRestClassifier(LinearSvmBinaryClassifier)': \
+ {'cols': [('PredictedLabel', 'PredictedLabel', 'PredictedLabel.output')]},
+ 'OnlineGradientDescentRegressor': {'cols': [('Score', 'Score', 'Score.output')]},
+ 'OrdinaryLeastSquaresRegressor': {'cols': [('Score', 'Score', 'Score.output')]},
+ 'PcaTransformer': {'num_cols': 9, 'cols': 0},
+ 'PoissonRegressionRegressor': {'cols': [('Score', 'Score', 'Score.output')]},
+ 'PrefixColumnConcatenator': {'num_cols': 8, 'cols': 0},
+ 'SgdBinaryClassifier': {'cols': [('PredictedLabel', 'PredictedLabel', 'PredictedLabel.output')]},
+ 'SymSgdBinaryClassifier': {'cols': [('PredictedLabel', 'PredictedLabel', 'PredictedLabel.output')]},
+ 'ToKey': {'num_cols': 11, 'cols': 0},
+ 'TypeConverter': {'num_cols': 8, 'cols': 0},
+ 'WordTokenizer': {'num_cols': 73, 'cols': 0}
+}
+
+SUPPORTED_ESTIMATORS = {
+ 'AveragedPerceptronBinaryClassifier',
+ 'CharTokenizer',
+ 'ColumnConcatenator',
+ 'ColumnDuplicator',
+ 'ColumnSelector',
+ 'CountSelector',
+ 'EnsembleClassifier',
+ 'EnsembleRegressor',
+ 'FastForestBinaryClassifier',
+ 'FastForestRegressor',
+ 'FastLinearBinaryClassifier',
+ 'FastLinearClassifier',
+ 'FastLinearRegressor',
+ 'FastTreesBinaryClassifier',
+ 'FastTreesRegressor',
+ 'FastTreesTweedieRegressor',
+ 'FromKey',
+ 'GlobalContrastRowScaler',
+ 'Handler',
+ 'Indicator',
+ 'KMeansPlusPlus',
+ 'LightGbmBinaryClassifier',
+ 'LightGbmClassifier',
+ 'LightGbmRanker',
+ 'LightGbmRegressor',
+ 'LinearSvmBinaryClassifier',
+ 'LogisticRegressionBinaryClassifier',
+ 'LogisticRegressionClassifier',
+ 'LpScaler',
+ 'MeanVarianceScaler',
+ 'MinMaxScaler',
+ 'MutualInformationSelector',
+ 'NaiveBayesClassifier',
+ 'OneHotVectorizer',
+ 'OnlineGradientDescentRegressor',
+ 'OrdinaryLeastSquaresRegressor',
+ 'PcaTransformer',
+ 'PoissonRegressionRegressor',
+ 'SgdBinaryClassifier',
+ 'SymSgdBinaryClassifier',
+ 'ToKey',
+ 'TypeConverter',
+ 'WordTokenizer'
+}
+
+
+class CaptureOutputContext():
+ """
+ Context which can be used for
+ capturing stdout and stderr.
+ """
+ def __enter__(self):
+ self.orig_stdout = sys.stdout
+ self.orig_stderr = sys.stderr
+ self.stdout_capturer = io.StringIO()
+ self.stderr_capturer = io.StringIO()
+ sys.stdout = self.stdout_capturer
+ sys.stderr = self.stderr_capturer
+ return self
+
+ def __exit__(self, *args):
+ sys.stdout = self.orig_stdout
+ sys.stderr = self.orig_stderr
+ self.stdout = self.stdout_capturer.getvalue()
+ self.stderr = self.stderr_capturer.getvalue()
+
+ if self.stdout:
+ print(self.stdout)
+
+ if self.stderr:
+ print(self.stderr)
+
+ # free up some memory
+ del self.stdout_capturer
+ del self.stderr_capturer
+
+
+def get_tmp_file(suffix=None):
+ fd, file_name = tempfile.mkstemp(suffix=suffix)
+ fl = os.fdopen(fd, 'w')
+ fl.close()
+ return file_name
+
+
+def get_file_size(file_path):
+ file_size = 0
+ try:
+ file_size = os.path.getsize(file_path)
+ except:
+ pass
+ return file_size
+
+
+def load_json(file_path):
+ with open(file_path) as f:
+ lines = f.readlines()
+ lines = [l for l in lines if not l.strip().startswith('#')]
+ content_without_comments = '\n'.join(lines)
+ return json.loads(content_without_comments)
+
+
+def print_results(result_expected, result_onnx, result_onnx_ort):
+ print("\nML.Net Output (Expected Result):")
+ print(result_expected)
+ if not isinstance(result_expected, pd.Series):
+ print('Columns', result_expected.columns)
+
+ print("\nOnnxRunner Result:")
+ print(result_onnx)
+ if not isinstance(result_onnx, pd.Series):
+ print('Columns', result_onnx.columns)
+
+ print("\nORT Result:")
+ print(result_onnx_ort)
+ if not isinstance(result_onnx_ort, pd.Series):
+ print('Columns', result_onnx_ort.columns)
+
+def validate_results(class_name, result_expected, result_onnx, result_ort):
+ if class_name not in EXPECTED_RESULTS:
+ raise RuntimeError("ERROR: ONNX model executed but no results specified for comparison.")
+
+ if 'num_cols' in EXPECTED_RESULTS[class_name]:
+ num_cols = EXPECTED_RESULTS[class_name]['num_cols']
+
+ if len(result_expected.columns) != num_cols:
+ raise RuntimeError("ERROR: The ML.Net output does not contain the expected number of columns.")
+
+ if len(result_onnx.columns) != num_cols:
+ raise RuntimeError("ERROR: The ONNX output does not contain the expected number of columns.")
+
+ if len(result_ort.columns) != num_cols:
+ raise RuntimeError("ERROR: The ORT output does not contain the expected number of columns.")
+
+ col_tuples = EXPECTED_RESULTS[class_name]['cols']
+
+ if isinstance(col_tuples, int):
+ # If col_tuples is an int then slice the columns
+ # based on the value and use those tuples for comparison
+ col_tuples = list(zip(result_expected.columns[col_tuples:],
+ result_onnx.columns[col_tuples:],
+ result_ort.columns[col_tuples:]))
+
+ if not col_tuples:
+ raise RuntimeError("ERROR: no columns specified for comparison of results.")
+
+ for col_tuple in col_tuples:
+ try:
+ col_expected = result_expected.loc[:, col_tuple[0]]
+ col_onnx = result_onnx.loc[:, col_tuple[1]]
+ col_ort = result_ort.loc[:, col_tuple[2]]
+
+ if isinstance(col_expected.dtype, pd.api.types.CategoricalDtype):
+ # ONNX does not export categorical columns so convert categorical
+ # columns received from ML.Net back to the original values before
+ # the comparison.
+ col_expected = col_expected.astype(col_expected.dtype.categories.dtype)
+
+ check_kwargs = {
+ 'check_names': False,
+ 'check_exact': False,
+ 'check_dtype': True,
+ 'check_less_precise': True
+ }
+
+ pd.testing.assert_series_equal(col_expected, col_onnx, **check_kwargs)
+ pd.testing.assert_series_equal(col_expected, col_ort, **check_kwargs)
+
+ except Exception as e:
+ print(e)
+ raise RuntimeError("ERROR: OnnxRunner result does not match expected result.")
+
+ return True
+
+
+def test_export_to_onnx(estimator, class_name):
+ """
+ Fit and test an estimator and determine
+ if it supports exporting to the ONNX format.
+ """
+ onnx_path = get_tmp_file('.onnx')
+ onnx_json_path = get_tmp_file('.onnx.json')
+
+ output = None
+ exported = False
+ export_valid = False
+
+ try:
+ dataset = DATASETS.get(class_name, iris_df)
+ estimator.fit(dataset)
+
+ with CaptureOutputContext() as output:
+ estimator.export_to_onnx(onnx_path,
+ 'com.microsoft.ml',
+ dst_json=onnx_json_path,
+ onnx_version='Stable')
+ except Exception as e:
+ print(e)
+
+ onnx_file_size = get_file_size(onnx_path)
+ onnx_json_file_size = get_file_size(onnx_json_path)
+
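+ # Treat the export as successful only if both output files are non-empty
+ # and ML.Net did not warn that the model cannot be saved as ONNX.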
+ if (output and
+ (onnx_file_size != 0) and
+ (onnx_json_file_size != 0) and
+ ('cannot save itself as ONNX' not in output.stdout) and
+ ('Warning: We do not know how to save the predictor as ONNX' not in output.stdout)):
+
+ exported = True
+
+ print('ONNX model path:', onnx_path)
+
+ if SHOW_ONNX_JSON:
+ with open(onnx_json_path) as f:
+ print(json.dumps(json.load(f), indent=4))
+
+ # Verify that the output of the exported onnx graph
+ # produces the same results as the standard estimators.
+ if isinstance(estimator, BasePredictor):
+ result_expected = estimator.predict(dataset)
+ else:
+ result_expected = estimator.transform(dataset)
+
+ if isinstance(result_expected, pd.Series):
+ result_expected = pd.DataFrame(result_expected)
+
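+ # Run the exported model through both the OnnxRunner transform and the
+ # DFT helper, then validate the two results against the ML.Net output.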
+ try:
+ onnxrunner = OnnxRunner(model_file=onnx_path)
+ result_onnx = onnxrunner.fit_transform(dataset)
+ df_tool = DFT(onnx_path)
+ result_ort = df_tool.execute(dataset, [])
+
+ if SHOW_TRANSFORMED_RESULTS:
+ print_results(result_expected, result_onnx, result_ort)
+
+ export_valid = validate_results(class_name,
+ result_expected,
+ result_onnx,
+ result_ort)
+ except Exception as e:
+ print(e)
+
+ os.remove(onnx_path)
+ os.remove(onnx_json_path)
+ return {'exported': exported, 'export_valid': export_valid}
+
+
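+# Build the list of entry points to test from the manifest diff; the
+# OneVsRestClassifier wrappers are not listed there, so append them explicitly.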
+manifest_diff = os.path.join(script_dir, '..', 'tools', 'manifest_diff.json')
+entry_points = load_json(manifest_diff)['EntryPoints']
+entry_points.extend([
+ {'NewName': 'OneVsRestClassifier(AveragedPerceptronBinaryClassifier)'},
+ {'NewName': 'OneVsRestClassifier(LinearSvmBinaryClassifier)'}
+])
+entry_points = sorted(entry_points, key=lambda ep: ep['NewName'])
+
+exportable_estimators = set()
+unexportable_estimators = set()
+runnable_estimators = set()
+
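+# Fit and export each estimator, recording whether the ONNX export and the
+# end-to-end comparison succeeded.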
+for entry_point in entry_points:
+ class_name = entry_point['NewName']
+
+# if not class_name in ['OneVsRestClassifier(LinearSvmBinaryClassifier)']:
+# continue
+
+ print('\n===========> %s' % class_name)
+
+ if class_name in SKIP:
+ print("skipped")
+ continue
+
+ if class_name in INSTANCES:
+ estimator = INSTANCES[class_name]
+ else:
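+ # Import the estimator class dynamically from its nimbusml module and
+ # instantiate it with default arguments.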
+ mod = __import__('nimbusml.' + entry_point['Module'],
+ fromlist=[str(class_name)])
+
+ the_class = getattr(mod, class_name)
+ estimator = the_class()
+
+ result = test_export_to_onnx(estimator, class_name)
+
+ if result['exported']:
+ exportable_estimators.add(class_name)
+ print('Estimator successfully exported to ONNX.')
+
+ else:
+ unexportable_estimators.add(class_name)
+ print('Estimator could NOT be exported to ONNX.')
+
+ if result['export_valid']:
+ runnable_estimators.add(class_name)
+ print('Exported ONNX model successfully transformed with OnnxRunner.')
+
+print('\n=====================')
+print('SUMMARY')
+print('=====================')
+
+print('\nThe following estimators were skipped: ')
+pprint.pprint(sorted(SKIP))
+
+print('\nThe following estimators were successfully exported to ONNX:')
+pprint.pprint(sorted(exportable_estimators))
+
+print('\nThe following estimators could not be exported to ONNX: ')
+pprint.pprint(sorted(unexportable_estimators))
+
+failed_exports = SUPPORTED_ESTIMATORS.difference(exportable_estimators)
+print("\nThe following estimators failed to export to ONNX:")
+pprint.pprint(sorted(failed_exports))
+
+failed_e2e_estimators = exportable_estimators.difference(runnable_estimators)
+print("\nThe following estimators exported to ONNX but failed the end-to-end test:")
+pprint.pprint(sorted(failed_e2e_estimators))
+
+print('\nThe following estimators successfully completed the end-to-end test: ')
+pprint.pprint(sorted(runnable_estimators))
+print()
+
+if len(failed_exports) + len(failed_e2e_estimators) > 0:
+ raise RuntimeError("ONNX export checks failed")
+
diff --git a/src/python/tools/entrypoint_compiler.py b/src/python/tools/entrypoint_compiler.py
index ed829533..57b1b8de 100644
--- a/src/python/tools/entrypoint_compiler.py
+++ b/src/python/tools/entrypoint_compiler.py
@@ -1560,7 +1560,6 @@ def __init__(self, argument, inout): # dict
self.default = argument.get('Default', Missing())
self.required = argument.get('Required', Missing())
self.aliases = argument.get('Aliases', Missing())
- self.pass_as = argument.get('PassAs', None)
self.name_converted = convert_name(self.name)
self.new_name_converted = convert_name(
@@ -1615,7 +1614,7 @@ def get_body(self):
"is_of_type=numbers.Real"
body = template.format(
inout=self.inout,
- name=self.pass_as or self.name,
+ name=self.name,
name_converted=self.name_converted,
none_acceptable=not self.required)
if not isinstance(self.range, Missing):
@@ -1646,7 +1645,7 @@ def get_body(self):
"none_acceptable={none_acceptable}, is_of_type=bool"
body = template.format(
inout=self.inout,
- name=self.pass_as or self.name,
+ name=self.name,
name_converted=self.name_converted,
none_acceptable=not self.required)
return body + ")"
@@ -1693,7 +1692,7 @@ def get_body(self):
template += ", is_column=True"
body = template.format(
inout=self.inout,
- name=self.pass_as or self.name,
+ name=self.name,
name_converted=self.name_converted,
none_acceptable=not self.required)
return body + ")"
@@ -1717,7 +1716,7 @@ def get_body(self):
"none_acceptable={none_acceptable}, is_of_type=str"
body = template.format(
inout=self.inout,
- name=self.pass_as or self.name,
+ name=self.name,
name_converted=self.name_converted,
none_acceptable=not self.required)
value_check = ", values={0}".format(str(self.type['Values']))
@@ -1748,7 +1747,7 @@ def get_body(self):
"none_acceptable={none_acceptable}, is_of_type=list"
body = template.format(
inout=self.inout,
- name=self.pass_as or self.name,
+ name=self.name,
name_converted=self.name_converted,
none_acceptable=not self.required)
return body + ")"
@@ -1790,7 +1789,7 @@ def get_body(self):
template += ', is_column=True'
body = template.format(
inout=self.inout,
- name=self.pass_as or self.name,
+ name=self.name,
name_converted=self.name_converted,
none_acceptable=not self.required)
return body + ")"
@@ -1818,7 +1817,7 @@ def get_body(self):
template += ', is_column=True'
body = template.format(
inout=self.inout,
- name=self.pass_as or self.name,
+ name=self.name,
name_converted=self.name_converted,
none_acceptable=not self.required)
return body + ")"
@@ -1846,7 +1845,7 @@ def get_body(self):
"none_acceptable={none_acceptable}, is_of_type=dict"
body = template.format(
inout=self.inout,
- name=self.pass_as or self.name,
+ name=self.name,
name_converted=self.name_converted,
none_acceptable=not self.required)
return body + ")"
@@ -1882,7 +1881,7 @@ def get_body(self):
template += ", is_column=True"
body = template.format(
inout=self.inout,
- name=self.pass_as or self.name,
+ name=self.name,
name_converted=self.name_converted,
none_acceptable=not self.required)
field_check = ", field_names={0}".format(
@@ -2041,6 +2040,7 @@ def generate_code(pkg_path, generate_entrypoints, generate_api):
script_args = arg_parser.parse_args()
pkg_path = os.path.join(my_dir, r'..\nimbusml')
+
if script_args.check_manual_changes:
verbose = False
if script_args.folder == 'temp':
diff --git a/src/python/tools/manifest.json b/src/python/tools/manifest.json
index e54ff2c2..fd7f7950 100644
--- a/src/python/tools/manifest.json
+++ b/src/python/tools/manifest.json
@@ -1731,14 +1731,14 @@
{
"Name": "Slope",
"Type": "Float",
- "Desc": "The slope parameter of the calibration function 1 / (1 + exp(-slope * x + offset)",
+ "Desc": "The slope parameter of the calibration function 1 / (1 + exp(slope * x + offset)",
"Aliases": [
"slope"
],
"Required": false,
"SortOrder": 1.0,
"IsNullable": false,
- "Default": 1.0
+ "Default": -1.0
},
{
"Name": "Data",
@@ -1762,7 +1762,7 @@
{
"Name": "Offset",
"Type": "Float",
- "Desc": "The offset parameter of the calibration function 1 / (1 + exp(-slope * x + offset)",
+ "Desc": "The offset parameter of the calibration function 1 / (1 + exp(slope * x + offset)",
"Aliases": [
"offset"
],
@@ -1903,9 +1903,9 @@
}
},
{
- "Name": "SupressScoresAndLabels",
+ "Name": "SuppressScoresAndLabels",
"Type": "Bool",
- "Desc": "Supress labels and scores in per-instance outputs?",
+ "Desc": "Suppress labels and scores in per-instance outputs?",
"Aliases": [
"noScores"
],
@@ -2194,6 +2194,203 @@
"ITrainerInput"
]
},
+ {
+ "Name": "Models.OnnxConverter",
+ "Desc": "Converts the model to ONNX format.",
+ "FriendlyName": "ONNX Converter.",
+ "ShortName": null,
+ "Inputs": [
+ {
+ "Name": "DataFile",
+ "Type": "String",
+ "Desc": "The data file",
+ "Aliases": [
+ "data"
+ ],
+ "Required": false,
+ "SortOrder": 0.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "Onnx",
+ "Type": "String",
+ "Desc": "The path to write the output ONNX to.",
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "Json",
+ "Type": "String",
+ "Desc": "The path to write the output JSON to.",
+ "Required": false,
+ "SortOrder": 2.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "Name",
+ "Type": "String",
+ "Desc": "The 'name' property in the output ONNX. By default this will be the ONNX extension-less name.",
+ "Required": false,
+ "SortOrder": 3.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "Domain",
+ "Type": "String",
+ "Desc": "The 'domain' property in the output ONNX.",
+ "Required": false,
+ "SortOrder": 4.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "InputsToDrop",
+ "Type": {
+ "Kind": "Array",
+ "ItemType": "String"
+ },
+ "Desc": "Array of input column names to drop",
+ "Required": false,
+ "SortOrder": 6.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "OutputsToDrop",
+ "Type": {
+ "Kind": "Array",
+ "ItemType": "String"
+ },
+ "Desc": "Array of output column names to drop",
+ "Required": false,
+ "SortOrder": 8.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "Model",
+ "Type": "TransformModel",
+ "Desc": "Model that needs to be converted to ONNX format.",
+ "Required": false,
+ "SortOrder": 10.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "OnnxVersion",
+ "Type": {
+ "Kind": "Enum",
+ "Values": [
+ "Stable",
+ "Experimental"
+ ]
+ },
+ "Desc": "The targeted ONNX version. It can be either \"Stable\" or \"Experimental\". If \"Experimental\" is used, the produced model can contain components that are not officially supported in the ONNX standard.",
+ "Required": false,
+ "SortOrder": 11.0,
+ "IsNullable": false,
+ "Default": "Stable"
+ },
+ {
+ "Name": "PredictiveModel",
+ "Type": "PredictorModel",
+ "Desc": "Predictor model that needs to be converted to ONNX format.",
+ "Required": false,
+ "SortOrder": 12.0,
+ "IsNullable": false,
+ "Default": null
+ }
+ ],
+ "Outputs": []
+ },
+ {
+ "Name": "Models.OnnxTransformer",
+ "Desc": "Applies an ONNX model to a dataset.",
+ "FriendlyName": "Onnx Transformer",
+ "ShortName": "onnx-xf",
+ "Inputs": [
+ {
+ "Name": "ModelFile",
+ "Type": "String",
+ "Desc": "Path to the onnx model file.",
+ "Aliases": [
+ "model"
+ ],
+ "Required": true,
+ "SortOrder": 0.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "InputColumns",
+ "Type": {
+ "Kind": "Array",
+ "ItemType": "String"
+ },
+ "Desc": "Name of the input column.",
+ "Required": false,
+ "SortOrder": 1.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "Data",
+ "Type": "DataView",
+ "Desc": "Input dataset",
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "OutputColumns",
+ "Type": {
+ "Kind": "Array",
+ "ItemType": "String"
+ },
+ "Desc": "Name of the output column.",
+ "Required": false,
+ "SortOrder": 2.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "GpuDeviceId",
+ "Type": "Int",
+ "Desc": "GPU device id to run on (e.g. 0,1,..). Null for CPU. Requires CUDA 9.1.",
+ "Required": false,
+ "SortOrder": 3.0,
+ "IsNullable": true,
+ "Default": null
+ },
+ {
+ "Name": "FallbackToCpu",
+ "Type": "Bool",
+ "Desc": "If true, resumes execution on CPU upon GPU error. If false, will raise the GPU exception.",
+ "Required": false,
+ "SortOrder": 4.0,
+ "IsNullable": false,
+ "Default": false
+ }
+ ],
+ "Outputs": [
+ {
+ "Name": "OutputData",
+ "Type": "DataView",
+ "Desc": "ONNX transformed dataset"
+ },
+ {
+ "Name": "Model",
+ "Type": "TransformModel",
+ "Desc": "Transform model"
+ }
+ ],
+ "InputKind": [
+ "ITransformInput"
+ ]
+ },
{
"Name": "Models.OvaModelCombiner",
"Desc": "Combines a sequence of PredictorModels into a single model",
@@ -3981,7 +4178,7 @@
{
"Name": "TrainSize",
"Type": "Int",
- "Desc": "The length of series from the begining used for training.",
+ "Desc": "The length of series from the beginning used for training.",
"Required": true,
"SortOrder": 2.0,
"IsNullable": false,
@@ -10418,7 +10615,7 @@
},
{
"Name": "Trainers.GeneralizedAdditiveModelBinaryClassifier",
- "Desc": "Trains a gradient boosted stump per feature, on all features simultaneously, to fit target values using least-squares. It mantains no interactions between features.",
+ "Desc": "Trains a gradient boosted stump per feature, on all features simultaneously, to fit target values using least-squares. It maintains no interactions between features.",
"FriendlyName": "Generalized Additive Model for Binary Classification",
"ShortName": "gam",
"Inputs": [
@@ -10718,7 +10915,7 @@
},
{
"Name": "Trainers.GeneralizedAdditiveModelRegressor",
- "Desc": "Trains a gradient boosted stump per feature, on all features simultaneously, to fit target values using least-squares. It mantains no interactions between features.",
+ "Desc": "Trains a gradient boosted stump per feature, on all features simultaneously, to fit target values using least-squares. It maintains no interactions between features.",
"FriendlyName": "Generalized Additive Model for Regression",
"ShortName": "gamr",
"Inputs": [
@@ -11546,6 +11743,9 @@
"Name": "HandleMissingValue",
"Type": "Bool",
"Desc": "Enable special handling of missing value or not.",
+ "Aliases": [
+ "hmv"
+ ],
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
@@ -11558,6 +11758,25 @@
]
}
},
+ {
+ "Name": "UseZeroAsMissingValue",
+ "Type": "Bool",
+ "Desc": "Enable usage of zero (0) as missing value.",
+ "Aliases": [
+ "uzam"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": false,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ true,
+ false
+ ]
+ }
+ },
{
"Name": "MinimumExampleCountPerGroup",
"Type": "Int",
@@ -12043,6 +12262,9 @@
"Name": "HandleMissingValue",
"Type": "Bool",
"Desc": "Enable special handling of missing value or not.",
+ "Aliases": [
+ "hmv"
+ ],
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
@@ -12055,6 +12277,25 @@
]
}
},
+ {
+ "Name": "UseZeroAsMissingValue",
+ "Type": "Bool",
+ "Desc": "Enable usage of zero (0) as missing value.",
+ "Aliases": [
+ "uzam"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": false,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ true,
+ false
+ ]
+ }
+ },
{
"Name": "MinimumExampleCountPerGroup",
"Type": "Int",
@@ -12540,6 +12781,9 @@
"Name": "HandleMissingValue",
"Type": "Bool",
"Desc": "Enable special handling of missing value or not.",
+ "Aliases": [
+ "hmv"
+ ],
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
@@ -12552,6 +12796,25 @@
]
}
},
+ {
+ "Name": "UseZeroAsMissingValue",
+ "Type": "Bool",
+ "Desc": "Enable usage of zero (0) as missing value.",
+ "Aliases": [
+ "uzam"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": false,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ true,
+ false
+ ]
+ }
+ },
{
"Name": "MinimumExampleCountPerGroup",
"Type": "Int",
@@ -12998,6 +13261,9 @@
"Name": "HandleMissingValue",
"Type": "Bool",
"Desc": "Enable special handling of missing value or not.",
+ "Aliases": [
+ "hmv"
+ ],
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
@@ -13010,6 +13276,25 @@
]
}
},
+ {
+ "Name": "UseZeroAsMissingValue",
+ "Type": "Bool",
+ "Desc": "Enable usage of zero (0) as missing value.",
+ "Aliases": [
+ "uzam"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": false,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ true,
+ false
+ ]
+ }
+ },
{
"Name": "MinimumExampleCountPerGroup",
"Type": "Int",
@@ -13417,10 +13702,10 @@
]
},
{
- "Name": "Trainers.LogisticRegressionBinaryClassifier",
- "Desc": "Logistic Regression is a method in statistics used to predict the probability of occurrence of an event and can be used as a classification algorithm. The algorithm predicts the probability of occurrence of an event by fitting data to a logistical function.",
- "FriendlyName": "Logistic Regression",
- "ShortName": "lr",
+ "Name": "Trainers.LocalDeepSvmBinaryClassifier",
+ "Desc": "LD-SVM learns a binary, non-linear SVM classifier with a kernel that is specifically designed to reduce prediction time. LD-SVM learns decision boundaries that are locally linear.",
+ "FriendlyName": "Local Deep SVM (LDSVM)",
+ "ShortName": "LDSVM",
"Inputs": [
{
"Name": "TrainingData",
@@ -13509,30 +13794,312 @@
"Default": "Auto"
},
{
- "Name": "ShowTrainingStatistics",
- "Type": "Bool",
- "Desc": "Show statistics of training examples.",
+ "Name": "TreeDepth",
+ "Type": "Int",
+ "Desc": "Depth of Local Deep SVM tree",
"Aliases": [
- "stat",
- "ShowTrainingStats"
+ "depth"
],
"Required": false,
"SortOrder": 50.0,
"IsNullable": false,
- "Default": false
+ "Default": 3,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 1,
+ 3,
+ 5,
+ 7
+ ]
+ }
},
{
- "Name": "L2Regularization",
+ "Name": "LambdaW",
"Type": "Float",
- "Desc": "L2 regularization weight",
+ "Desc": "Regularizer for classifier parameter W",
"Aliases": [
- "l2",
- "L2Weight"
+ "lw"
],
"Required": false,
"SortOrder": 50.0,
"IsNullable": false,
- "Default": 1.0,
+ "Default": 0.1,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 0.1,
+ 0.01,
+ 0.001
+ ]
+ }
+ },
+ {
+ "Name": "LambdaTheta",
+ "Type": "Float",
+ "Desc": "Regularizer for kernel parameter Theta",
+ "Aliases": [
+ "lt"
+ ],
+ "Required": false,
+ "SortOrder": 50.0,
+ "IsNullable": false,
+ "Default": 0.01,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 0.1,
+ 0.01,
+ 0.001
+ ]
+ }
+ },
+ {
+ "Name": "LambdaThetaprime",
+ "Type": "Float",
+ "Desc": "Regularizer for kernel parameter Thetaprime",
+ "Aliases": [
+ "lp"
+ ],
+ "Required": false,
+ "SortOrder": 50.0,
+ "IsNullable": false,
+ "Default": 0.01,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 0.1,
+ 0.01,
+ 0.001
+ ]
+ }
+ },
+ {
+ "Name": "Sigma",
+ "Type": "Float",
+ "Desc": "Parameter for sigmoid sharpness",
+ "Aliases": [
+ "s"
+ ],
+ "Required": false,
+ "SortOrder": 50.0,
+ "IsNullable": false,
+ "Default": 1.0,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 1.0,
+ 0.1,
+ 0.01
+ ]
+ }
+ },
+ {
+ "Name": "NumberOfIterations",
+ "Type": "Int",
+ "Desc": "Number of iterations",
+ "Aliases": [
+ "iter",
+ "NumIterations"
+ ],
+ "Required": false,
+ "SortOrder": 50.0,
+ "IsNullable": false,
+ "Default": 15000,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ 10000,
+ 15000
+ ]
+ }
+ },
+ {
+ "Name": "UseBias",
+ "Type": "Bool",
+ "Desc": "No bias",
+ "Aliases": [
+ "bias"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": true,
+ "SweepRange": {
+ "RangeType": "Discrete",
+ "Values": [
+ false,
+ true
+ ]
+ }
+ },
+ {
+ "Name": "Calibrator",
+ "Type": {
+ "Kind": "Component",
+ "ComponentKind": "CalibratorTrainer"
+ },
+ "Desc": "The calibrator kind to apply to the predictor. Specify null for no calibration",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": {
+ "Name": "PlattCalibrator"
+ }
+ },
+ {
+ "Name": "MaxCalibrationExamples",
+ "Type": "Int",
+ "Desc": "The maximum number of examples to use when training the calibrator",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 1000000
+ },
+ {
+ "Name": "Cache",
+ "Type": "Bool",
+ "Desc": "Whether to cache the data before the first iteration",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": true
+ }
+ ],
+ "Outputs": [
+ {
+ "Name": "PredictorModel",
+ "Type": "PredictorModel",
+ "Desc": "The trained model"
+ }
+ ],
+ "InputKind": [
+ "ITrainerInputWithWeight",
+ "ITrainerInputWithLabel",
+ "ITrainerInput"
+ ],
+ "OutputKind": [
+ "IBinaryClassificationOutput",
+ "ITrainerOutput"
+ ]
+ },
+ {
+ "Name": "Trainers.LogisticRegressionBinaryClassifier",
+ "Desc": "Logistic Regression is a method in statistics used to predict the probability of occurrence of an event and can be used as a classification algorithm. The algorithm predicts the probability of occurrence of an event by fitting data to a logistical function.",
+ "FriendlyName": "Logistic Regression",
+ "ShortName": "lr",
+ "Inputs": [
+ {
+ "Name": "TrainingData",
+ "Type": "DataView",
+ "Desc": "The data to be used for training",
+ "Aliases": [
+ "data"
+ ],
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "FeatureColumnName",
+ "Type": "String",
+ "Desc": "Column to use for features",
+ "Aliases": [
+ "feat"
+ ],
+ "Required": false,
+ "SortOrder": 2.0,
+ "IsNullable": false,
+ "Default": "Features"
+ },
+ {
+ "Name": "LabelColumnName",
+ "Type": "String",
+ "Desc": "Column to use for labels",
+ "Aliases": [
+ "lab"
+ ],
+ "Required": false,
+ "SortOrder": 3.0,
+ "IsNullable": false,
+ "Default": "Label"
+ },
+ {
+ "Name": "ExampleWeightColumnName",
+ "Type": "String",
+ "Desc": "Column to use for example weight",
+ "Aliases": [
+ "weight"
+ ],
+ "Required": false,
+ "SortOrder": 4.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "NormalizeFeatures",
+ "Type": {
+ "Kind": "Enum",
+ "Values": [
+ "No",
+ "Warn",
+ "Auto",
+ "Yes"
+ ]
+ },
+ "Desc": "Normalize option for the feature column",
+ "Aliases": [
+ "norm"
+ ],
+ "Required": false,
+ "SortOrder": 5.0,
+ "IsNullable": false,
+ "Default": "Auto"
+ },
+ {
+ "Name": "Caching",
+ "Type": {
+ "Kind": "Enum",
+ "Values": [
+ "Auto",
+ "Memory",
+ "None"
+ ]
+ },
+ "Desc": "Whether trainer should cache input training data",
+ "Aliases": [
+ "cache"
+ ],
+ "Required": false,
+ "SortOrder": 6.0,
+ "IsNullable": false,
+ "Default": "Auto"
+ },
+ {
+ "Name": "ShowTrainingStatistics",
+ "Type": "Bool",
+ "Desc": "Show statistics of training examples.",
+ "Aliases": [
+ "stat",
+ "ShowTrainingStats"
+ ],
+ "Required": false,
+ "SortOrder": 50.0,
+ "IsNullable": false,
+ "Default": false
+ },
+ {
+ "Name": "L2Regularization",
+ "Type": "Float",
+ "Desc": "L2 regularization weight",
+ "Aliases": [
+ "l2",
+ "L2Weight"
+ ],
+ "Required": false,
+ "SortOrder": 50.0,
+ "IsNullable": false,
+ "Default": 1.0,
"SweepRange": {
"RangeType": "Float",
"Min": 0.0,
@@ -13739,7 +14306,7 @@
},
{
"Name": "Trainers.LogisticRegressionClassifier",
- "Desc": "Maximum entrypy classification is a method in statistics used to predict the probabilities of parallel events. The model predicts the probabilities of parallel events by fitting data to a softmax function.",
+ "Desc": "Maximum entropy classification is a method in statistics used to predict the probabilities of parallel events. The model predicts the probabilities of parallel events by fitting data to a softmax function.",
"FriendlyName": "Multi-class Logistic Regression",
"ShortName": "mlr",
"Inputs": [
@@ -17211,6 +17778,82 @@
"ITransformOutput"
]
},
+ {
+ "Name": "Transforms.CategoryImputer",
+ "Desc": "Fills in missing values in a column based on the most frequent value",
+ "FriendlyName": "CategoryImputer",
+ "ShortName": "CategoryImputer",
+ "Inputs": [
+ {
+ "Name": "Column",
+ "Type": {
+ "Kind": "Array",
+ "ItemType": {
+ "Kind": "Struct",
+ "Fields": [
+ {
+ "Name": "Name",
+ "Type": "String",
+ "Desc": "Name of the new column",
+ "Aliases": [
+ "name"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "Source",
+ "Type": "String",
+ "Desc": "Name of the source column",
+ "Aliases": [
+ "src"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ }
+ ]
+ }
+ },
+ "Desc": "New column definition (optional form: name:src)",
+ "Aliases": [
+ "col"
+ ],
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "Data",
+ "Type": "DataView",
+ "Desc": "Input dataset",
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ }
+ ],
+ "Outputs": [
+ {
+ "Name": "OutputData",
+ "Type": "DataView",
+ "Desc": "Transformed dataset"
+ },
+ {
+ "Name": "Model",
+ "Type": "TransformModel",
+ "Desc": "Transform model"
+ }
+ ],
+ "InputKind": [
+ "ITransformInput"
+ ],
+ "OutputKind": [
+ "ITransformOutput"
+ ]
+ },
{
"Name": "Transforms.CharacterTokenizer",
"Desc": "Character-oriented tokenizer where text is considered a sequence of characters.",
@@ -18032,6 +18675,117 @@
}
]
},
+ {
+ "Name": "Transforms.DateTimeSplitter",
+ "Desc": "Splits a date time value into each individual component",
+ "FriendlyName": "DateTime Transform",
+ "ShortName": "DateTimeTransform",
+ "Inputs": [
+ {
+ "Name": "Source",
+ "Type": "String",
+ "Desc": "Input column",
+ "Aliases": [
+ "src"
+ ],
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "Data",
+ "Type": "DataView",
+ "Desc": "Input dataset",
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "Prefix",
+ "Type": "String",
+ "Desc": "Output column prefix",
+ "Aliases": [
+ "pre"
+ ],
+ "Required": true,
+ "SortOrder": 2.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "Country",
+ "Type": {
+ "Kind": "Enum",
+ "Values": [
+ "None",
+ "Argentina",
+ "Australia",
+ "Austria",
+ "Belarus",
+ "Belgium",
+ "Brazil",
+ "Canada",
+ "Colombia",
+ "Croatia",
+ "Czech",
+ "Denmark",
+ "England",
+ "Finland",
+ "France",
+ "Germany",
+ "Hungary",
+ "India",
+ "Ireland",
+ "IsleofMan",
+ "Italy",
+ "Japan",
+ "Mexico",
+ "Netherlands",
+ "NewZealand",
+ "NorthernIreland",
+ "Norway",
+ "Poland",
+ "Portugal",
+ "Scotland",
+ "Slovenia",
+ "SouthAfrica",
+ "Spain",
+ "Sweden",
+ "Switzerland",
+ "Ukraine",
+ "UnitedKingdom",
+ "UnitedStates",
+ "Wales"
+ ]
+ },
+ "Desc": "Country to get holidays for. Defaults to none if not passed",
+ "Aliases": [
+ "ctry"
+ ],
+ "Required": false,
+ "SortOrder": 4.0,
+ "IsNullable": false,
+ "Default": "None"
+ }
+ ],
+ "Outputs": [
+ {
+ "Name": "OutputData",
+ "Type": "DataView",
+ "Desc": "Transformed dataset"
+ },
+ {
+ "Name": "Model",
+ "Type": "TransformModel",
+ "Desc": "Transform model"
+ }
+ ],
+ "InputKind": [
+ "ITransformInput"
+ ],
+ "OutputKind": [
+ "ITransformOutput"
+ ]
+ },
{
"Name": "Transforms.Dictionarizer",
"Desc": "Converts input values (words, numbers, etc.) to index in a dictionary.",
@@ -20637,7 +21391,7 @@
},
{
"Name": "Transforms.MissingValueHandler",
- "Desc": "Handle missing values by replacing them with either the default value or the mean/min/max value (for non-text columns only). An indicator column can optionally be concatenated, if theinput column type is numeric.",
+ "Desc": "Handle missing values by replacing them with either the default value or the mean/min/max value (for non-text columns only). An indicator column can optionally be concatenated, if the input column type is numeric.",
"FriendlyName": "NA Handle Transform",
"ShortName": "NAHandle",
"Inputs": [
@@ -21814,13 +22568,98 @@
],
"Required": false,
"SortOrder": 150.0,
- "IsNullable": false,
+ "IsNullable": false,
+ "Default": null
+ }
+ ]
+ }
+ },
+ "Desc": "New column definition(s) (optional form: name:src)",
+ "Aliases": [
+ "col"
+ ],
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "Data",
+ "Type": "DataView",
+ "Desc": "Input dataset",
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ }
+ ],
+ "Outputs": [
+ {
+ "Name": "OutputData",
+ "Type": "DataView",
+ "Desc": "Transformed dataset"
+ },
+ {
+ "Name": "Model",
+ "Type": "TransformModel",
+ "Desc": "Transform model"
+ }
+ ],
+ "InputKind": [
+ "ITransformInput"
+ ],
+ "OutputKind": [
+ "ITransformOutput"
+ ]
+ },
+ {
+ "Name": "Transforms.RandomNumberGenerator",
+ "Desc": "Adds a column with a generated number sequence.",
+ "FriendlyName": "Generate Number Transform",
+ "ShortName": "Generate",
+ "Inputs": [
+ {
+ "Name": "Column",
+ "Type": {
+ "Kind": "Array",
+ "ItemType": {
+ "Kind": "Struct",
+ "Fields": [
+ {
+ "Name": "Name",
+ "Type": "String",
+ "Desc": "Name of the new column",
+ "Aliases": [
+ "name"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "UseCounter",
+ "Type": "Bool",
+ "Desc": "Use an auto-incremented integer starting at zero instead of a random number",
+ "Aliases": [
+ "cnt"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
+ "Default": null
+ },
+ {
+ "Name": "Seed",
+ "Type": "UInt",
+ "Desc": "The random seed",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": true,
"Default": null
}
]
}
},
- "Desc": "New column definition(s) (optional form: name:src)",
+ "Desc": "New column definition(s) (optional form: name:seed)",
"Aliases": [
"col"
],
@@ -21835,6 +22674,27 @@
"Required": true,
"SortOrder": 1.0,
"IsNullable": false
+ },
+ {
+ "Name": "UseCounter",
+ "Type": "Bool",
+ "Desc": "Use an auto-incremented integer starting at zero instead of a random number",
+ "Aliases": [
+ "cnt"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": false
+ },
+ {
+ "Name": "Seed",
+ "Type": "UInt",
+ "Desc": "The random seed",
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": 42
}
],
"Outputs": [
@@ -21857,10 +22717,10 @@
]
},
{
- "Name": "Transforms.RandomNumberGenerator",
- "Desc": "Adds a column with a generated number sequence.",
- "FriendlyName": "Generate Number Transform",
- "ShortName": "Generate",
+ "Name": "Transforms.RobustScaler",
+ "Desc": "Removes the median and scales the data according to the quantile range.",
+ "FriendlyName": "RobustScalerTransformer",
+ "ShortName": "RobScalT",
"Inputs": [
{
"Name": "Column",
@@ -21882,30 +22742,21 @@
"Default": null
},
{
- "Name": "UseCounter",
- "Type": "Bool",
- "Desc": "Use an auto-incremented integer starting at zero instead of a random number",
+ "Name": "Source",
+ "Type": "String",
+ "Desc": "Name of the source column",
"Aliases": [
- "cnt"
+ "src"
],
"Required": false,
"SortOrder": 150.0,
- "IsNullable": true,
- "Default": null
- },
- {
- "Name": "Seed",
- "Type": "UInt",
- "Desc": "The random seed",
- "Required": false,
- "SortOrder": 150.0,
- "IsNullable": true,
+ "IsNullable": false,
"Default": null
}
]
}
},
- "Desc": "New column definition(s) (optional form: name:seed)",
+ "Desc": "New column definition (optional form: name:src)",
"Aliases": [
"col"
],
@@ -21922,25 +22773,52 @@
"IsNullable": false
},
{
- "Name": "UseCounter",
+ "Name": "Center",
"Type": "Bool",
- "Desc": "Use an auto-incremented integer starting at zero instead of a random number",
+ "Desc": "If True, center the data before scaling.",
"Aliases": [
- "cnt"
+ "ctr"
],
"Required": false,
- "SortOrder": 150.0,
+ "SortOrder": 2.0,
"IsNullable": false,
- "Default": false
+ "Default": true
},
{
- "Name": "Seed",
- "Type": "UInt",
- "Desc": "The random seed",
+ "Name": "Scale",
+ "Type": "Bool",
+ "Desc": "If True, scale the data to interquartile range.",
+ "Aliases": [
+ "sc"
+ ],
"Required": false,
- "SortOrder": 150.0,
+ "SortOrder": 3.0,
"IsNullable": false,
- "Default": 42
+ "Default": true
+ },
+ {
+ "Name": "QuantileMin",
+ "Type": "Float",
+ "Desc": "Min for the quantile range used to calculate scale.",
+ "Aliases": [
+ "min"
+ ],
+ "Required": false,
+ "SortOrder": 4.0,
+ "IsNullable": false,
+ "Default": 25.0
+ },
+ {
+ "Name": "QuantileMax",
+ "Type": "Float",
+ "Desc": "Max for the quantile range used to calculate scale.",
+ "Aliases": [
+ "max"
+ ],
+ "Required": false,
+ "SortOrder": 5.0,
+ "IsNullable": false,
+ "Default": 75.0
}
],
"Outputs": [
@@ -22972,6 +23850,206 @@
"ITransformOutput"
]
},
+ {
+ "Name": "Transforms.TimeSeriesImputer",
+ "Desc": "Fills in missing rows and values",
+ "FriendlyName": "TimeSeriesImputer",
+ "ShortName": "tsi",
+ "Inputs": [
+ {
+ "Name": "TimeSeriesColumn",
+ "Type": "String",
+ "Desc": "Column representing the time",
+ "Aliases": [
+ "time"
+ ],
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "Data",
+ "Type": "DataView",
+ "Desc": "Input dataset",
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "GrainColumns",
+ "Type": {
+ "Kind": "Array",
+ "ItemType": "String"
+ },
+ "Desc": "List of grain columns",
+ "Aliases": [
+ "grains"
+ ],
+ "Required": true,
+ "SortOrder": 2.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "FilterColumns",
+ "Type": {
+ "Kind": "Array",
+ "ItemType": "String"
+ },
+ "Desc": "Columns to filter",
+ "Aliases": [
+ "filters"
+ ],
+ "Required": false,
+ "SortOrder": 2.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "FilterMode",
+ "Type": {
+ "Kind": "Enum",
+ "Values": [
+ "NoFilter",
+ "Include",
+ "Exclude"
+ ]
+ },
+ "Desc": "Filter mode. Either include or exclude",
+ "Aliases": [
+ "fmode"
+ ],
+ "Required": false,
+ "SortOrder": 3.0,
+ "IsNullable": false,
+ "Default": "Exclude"
+ },
+ {
+ "Name": "ImputeMode",
+ "Type": {
+ "Kind": "Enum",
+ "Values": [
+ "ForwardFill",
+ "BackFill",
+ "Median"
+ ]
+ },
+ "Desc": "Mode for imputing, defaults to ForwardFill if not provided",
+ "Aliases": [
+ "mode"
+ ],
+ "Required": false,
+ "SortOrder": 3.0,
+ "IsNullable": false,
+ "Default": "ForwardFill"
+ },
+ {
+ "Name": "SupressTypeErrors",
+ "Type": "Bool",
+ "Desc": "Suppress the errors that would occur if a column and impute mode are incompatible. If true, will skip the column. If false, will stop and throw an error.",
+ "Aliases": [
+ "error"
+ ],
+ "Required": false,
+ "SortOrder": 3.0,
+ "IsNullable": false,
+ "Default": false
+ }
+ ],
+ "Outputs": [
+ {
+ "Name": "OutputData",
+ "Type": "DataView",
+ "Desc": "Transformed dataset"
+ },
+ {
+ "Name": "Model",
+ "Type": "TransformModel",
+ "Desc": "Transform model"
+ }
+ ],
+ "InputKind": [
+ "ITransformInput"
+ ],
+ "OutputKind": [
+ "ITransformOutput"
+ ]
+ },
+ {
+ "Name": "Transforms.ToString",
+ "Desc": "Turns the given column into a column of its string representation",
+ "FriendlyName": "ToString Transform",
+ "ShortName": "tostr",
+ "Inputs": [
+ {
+ "Name": "Column",
+ "Type": {
+ "Kind": "Array",
+ "ItemType": {
+ "Kind": "Struct",
+ "Fields": [
+ {
+ "Name": "Name",
+ "Type": "String",
+ "Desc": "Name of the new column",
+ "Aliases": [
+ "name"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ },
+ {
+ "Name": "Source",
+ "Type": "String",
+ "Desc": "Name of the source column",
+ "Aliases": [
+ "src"
+ ],
+ "Required": false,
+ "SortOrder": 150.0,
+ "IsNullable": false,
+ "Default": null
+ }
+ ]
+ }
+ },
+ "Desc": "New column definition (optional form: name:src)",
+ "Aliases": [
+ "col"
+ ],
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ },
+ {
+ "Name": "Data",
+ "Type": "DataView",
+ "Desc": "Input dataset",
+ "Required": true,
+ "SortOrder": 1.0,
+ "IsNullable": false
+ }
+ ],
+ "Outputs": [
+ {
+ "Name": "OutputData",
+ "Type": "DataView",
+ "Desc": "Transformed dataset"
+ },
+ {
+ "Name": "Model",
+ "Type": "TransformModel",
+ "Desc": "Transform model"
+ }
+ ],
+ "InputKind": [
+ "ITransformInput"
+ ],
+ "OutputKind": [
+ "ITransformOutput"
+ ]
+ },
{
"Name": "Transforms.TrainTestDatasetSplitter",
"Desc": "Split the dataset into train and test sets",
@@ -24277,19 +25355,19 @@
{
"Name": "Slope",
"Type": "Float",
- "Desc": "The slope parameter of f(x) = 1 / (1 + exp(-slope * x + offset)",
+ "Desc": "The slope parameter of f(x) = 1 / (1 + exp(slope * x + offset)",
"Aliases": [
"a"
],
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
- "Default": 1.0
+ "Default": -1.0
},
{
"Name": "Offset",
"Type": "Float",
- "Desc": "The offset parameter of f(x) = 1 / (1 + exp(-slope * x + offset)",
+ "Desc": "The offset parameter of f(x) = 1 / (1 + exp(slope * x + offset)",
"Aliases": [
"b"
],
diff --git a/src/python/tools/manifest_diff.json b/src/python/tools/manifest_diff.json
index d56f33de..a70489ee 100644
--- a/src/python/tools/manifest_diff.json
+++ b/src/python/tools/manifest_diff.json
@@ -293,6 +293,24 @@
"Module": "preprocessing",
"Type": "Transform"
},
+ {
+ "Name": "Transforms.CategoryImputer",
+ "NewName": "ToKeyImputer",
+ "Module": "preprocessing",
+ "Type": "Transform"
+ },
+ {
+ "Name": "Transforms.ToString",
+ "NewName": "ToString",
+ "Module": "preprocessing",
+ "Type": "Transform"
+ },
+ {
+ "Name": "Transforms.DateTimeSplitter",
+ "NewName": "DateTimeSplitter",
+ "Module": "preprocessing",
+ "Type": "Transform"
+ },
{
"Name": "Transforms.TensorFlowScorer",
"NewName": "TensorFlowScorer",
@@ -329,6 +347,12 @@
"Module": "preprocessing",
"Type": "Transform"
},
+ {
+ "Name": "Models.OnnxTransformer",
+ "NewName": "OnnxRunner",
+ "Module": "preprocessing",
+ "Type": "Transform"
+ },
{
"Name": "Trainers.FieldAwareFactorizationMachineBinaryClassifier",
"NewName": "FactorizationMachineBinaryClassifier",
@@ -492,6 +516,12 @@
"Module": "preprocessing.normalization",
"Type": "Transform"
},
+ {
+ "Name": "Transforms.RobustScaler",
+ "NewName": "RobustScaler",
+ "Module": "preprocessing.normalization",
+ "Type": "Transform"
+ },
{
"Name": "Transforms.MissingValuesRowDropper",
"NewName": "Filter",
@@ -609,6 +639,12 @@
"Module": "timeseries",
"Type": "Transform"
},
+ {
+ "Name": "Transforms.TimeSeriesImputer",
+ "NewName": "TimeSeriesImputer",
+ "Module": "timeseries",
+ "Type": "Transform"
+ },
{
"Name": "Trainers.PoissonRegressor",
"NewName": "PoissonRegressionRegressor",
diff --git a/version.txt b/version.txt
index 2eda823f..bd8bf882 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-1.6.1
\ No newline at end of file
+1.7.0