diff --git a/build.cmd b/build.cmd
index 8ed5005d..8ad4a127 100644
--- a/build.cmd
+++ b/build.cmd
@@ -173,6 +173,8 @@ if "%AzureBuild%" == "True" (
     echo ##vso[task.prependpath]%_dotnetRoot%
 )
 
+set LOCAL_NUGET_PACKAGES_DIR=.\local-nuget-packages
+
 :: Build managed code
 echo ""
 echo "#################################"
@@ -311,6 +313,7 @@ copy "%BuildOutputDir%%Configuration%\pybridge.pyd" "%__currentScriptDir%src\py
 if %PythonVersion% == 2.7 (
     copy "%BuildOutputDir%%Configuration%\Platform\win-x64\publish\*.dll" "%__currentScriptDir%src\python\nimbusml\internal\libs\"
+    xcopy /S /E /I "%BuildOutputDir%%Configuration%\Platform\win-x64\publish\Data" "%__currentScriptDir%src\python\nimbusml\internal\libs\Data"
     :: remove dataprep dlls as it's not supported in python 2.7
     del "%__currentScriptDir%src\python\nimbusml\internal\libs\Microsoft.DPrep.*"
     del "%__currentScriptDir%src\python\nimbusml\internal\libs\Microsoft.Data.*"
@@ -321,6 +324,7 @@ if %PythonVersion% == 2.7 (
     del "%__currentScriptDir%src\python\nimbusml\internal\libs\Microsoft.Workbench.Messaging.SDK.dll"
 ) else (
     for /F "tokens=*" %%A in (build/libs_win.txt) do copy "%BuildOutputDir%%Configuration%\Platform\win-x64\publish\%%A" "%__currentScriptDir%src\python\nimbusml\internal\libs\"
+    xcopy /S /E /I "%BuildOutputDir%%Configuration%\Platform\win-x64\publish\Data" "%__currentScriptDir%src\python\nimbusml\internal\libs\Data"
 )
 
 if "%DebugBuild%" == "True" (
diff --git a/build.sh b/build.sh
index 6d5221c9..e2292693 100755
--- a/build.sh
+++ b/build.sh
@@ -175,6 +175,8 @@ then
     echo "Installing dotnet SDK ... "
     curl -sSL https://dot.net/v1/dotnet-install.sh | bash /dev/stdin -Version 2.1.701 -InstallDir ./cli
 
+    export LOCAL_NUGET_PACKAGES_DIR=./local-nuget-packages
+
     # Build managed code
     echo "Building managed code ... "
     _dotnet="${__currentScriptDir}/cli/dotnet"
@@ -213,6 +215,7 @@ then
     cp "${BuildOutputDir}/${__configuration}/Platform/${PublishDir}"/publish/System.Native.a "${__currentScriptDir}/src/python/nimbusml/internal/libs/"
     cp "${BuildOutputDir}/${__configuration}/Platform/${PublishDir}"/publish/createdump "${__currentScriptDir}/src/python/nimbusml/internal/libs/" || :
     cp "${BuildOutputDir}/${__configuration}/Platform/${PublishDir}"/publish/sosdocsunix.txt "${__currentScriptDir}/src/python/nimbusml/internal/libs/"
+    cp -r "${BuildOutputDir}/${__configuration}/Platform/${PublishDir}"/publish/Data "${__currentScriptDir}/src/python/nimbusml/internal/libs/."
    ext=*.so
    if [ "$(uname -s)" = "Darwin" ]
    then
@@ -241,6 +244,7 @@ then
    cat build/${libs_txt} | while read i; do
        cp "${BuildOutputDir}/${__configuration}/Platform/${PublishDir}"/publish/$i "${__currentScriptDir}/src/python/nimbusml/internal/libs/"
    done
+    cp -r "${BuildOutputDir}/${__configuration}/Platform/${PublishDir}"/publish/Data "${__currentScriptDir}/src/python/nimbusml/internal/libs/."
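+    # (assumption: the published Data/ folder carries resource files, e.g. holiday
+    # calendars, that the new featurizer transforms load at run time, so it must
+    # ship next to the copied binaries)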
 fi
 
 if [[ $__configuration = Dbg* ]]
diff --git a/build/libs_linux.txt b/build/libs_linux.txt
index 6ce4cbed..c2c7d848 100644
--- a/build/libs_linux.txt
+++ b/build/libs_linux.txt
@@ -1,6 +1,7 @@
 Newtonsoft.Json.dll
 libCpuMathNative.so
 libFastTreeNative.so
+libFeaturizers.so
 libLdaNative.so
 libMklImports.so
 libMklProxyNative.so
diff --git a/build/libs_mac.txt b/build/libs_mac.txt
index 85544169..1ebc1724 100644
--- a/build/libs_mac.txt
+++ b/build/libs_mac.txt
@@ -9,6 +9,7 @@ lib_lightgbm.dylib
 libtensorflow.dylib
 libonnxruntime.dylib
 libtensorflow_framework.1.dylib
+Featurizers.dll
 System.Drawing.Common.dll
 TensorFlow.NET.dll
 NumSharp.Core.dll
diff --git a/build/libs_win.txt b/build/libs_win.txt
index 7ef9cca7..e815e645 100644
--- a/build/libs_win.txt
+++ b/build/libs_win.txt
@@ -8,6 +8,7 @@ libiomp5md.dll
 MklImports.dll
 MklProxyNative.dll
 SymSgdNative.dll
+Featurizers.dll
 tensorflow.dll
 TensorFlow.NET.dll
 NumSharp.Core.dll
diff --git a/local-nuget-packages/MicrosoftMLFeaturizers.0.1.0.nupkg b/local-nuget-packages/MicrosoftMLFeaturizers.0.1.0.nupkg
new file mode 100644
index 00000000..0a8b2fbd
Binary files /dev/null and b/local-nuget-packages/MicrosoftMLFeaturizers.0.1.0.nupkg differ
diff --git a/local-nuget-packages/microsoft.extensions.ml.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.extensions.ml.0.18.2-preview2-28208-8.nupkg
new file mode 100644
index 00000000..2ceed83a
Binary files /dev/null and b/local-nuget-packages/microsoft.extensions.ml.0.18.2-preview2-28208-8.nupkg differ
diff --git a/local-nuget-packages/microsoft.extensions.ml.symbols.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.extensions.ml.symbols.0.18.2-preview2-28208-8.nupkg
new file mode 100644
index 00000000..845b027f
Binary files /dev/null and b/local-nuget-packages/microsoft.extensions.ml.symbols.0.18.2-preview2-28208-8.nupkg differ
diff --git a/local-nuget-packages/microsoft.ml.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.1.6.2-preview2-28208-8.nupkg
new file mode 100644
index 00000000..a8debf72
Binary files /dev/null and b/local-nuget-packages/microsoft.ml.1.6.2-preview2-28208-8.nupkg differ
diff --git a/local-nuget-packages/microsoft.ml.automl.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.automl.0.18.2-preview2-28208-8.nupkg
new file mode 100644
index 00000000..f858c678
Binary files /dev/null and b/local-nuget-packages/microsoft.ml.automl.0.18.2-preview2-28208-8.nupkg differ
diff --git a/local-nuget-packages/microsoft.ml.automl.symbols.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.automl.symbols.0.18.2-preview2-28208-8.nupkg
new file mode 100644
index 00000000..3cf6ed34
Binary files /dev/null and b/local-nuget-packages/microsoft.ml.automl.symbols.0.18.2-preview2-28208-8.nupkg differ
diff --git a/local-nuget-packages/microsoft.ml.cpumath.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.cpumath.1.6.2-preview2-28208-8.nupkg
new file mode 100644
index 00000000..008df73c
Binary files /dev/null and b/local-nuget-packages/microsoft.ml.cpumath.1.6.2-preview2-28208-8.nupkg differ
diff --git a/local-nuget-packages/microsoft.ml.cpumath.symbols.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.cpumath.symbols.1.6.2-preview2-28208-8.nupkg
new file mode 100644
index 00000000..bdcd6852
Binary files /dev/null and b/local-nuget-packages/microsoft.ml.cpumath.symbols.1.6.2-preview2-28208-8.nupkg differ
diff --git a/local-nuget-packages/microsoft.ml.dataview.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.dataview.1.6.2-preview2-28208-8.nupkg
new file mode 100644
index 00000000..5729bfa7
Binary files /dev/null and b/local-nuget-packages/microsoft.ml.dataview.1.6.2-preview2-28208-8.nupkg differ
diff --git a/local-nuget-packages/microsoft.ml.dataview.symbols.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.dataview.symbols.1.6.2-preview2-28208-8.nupkg
new file mode 100644
index 00000000..beefe429
Binary files /dev/null and b/local-nuget-packages/microsoft.ml.dataview.symbols.1.6.2-preview2-28208-8.nupkg differ
diff --git a/local-nuget-packages/microsoft.ml.dnn.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.dnn.0.18.2-preview2-28208-8.nupkg
new file mode 100644
index 00000000..f728196c
Binary files /dev/null and b/local-nuget-packages/microsoft.ml.dnn.0.18.2-preview2-28208-8.nupkg differ
diff --git a/local-nuget-packages/microsoft.ml.dnn.symbols.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.dnn.symbols.0.18.2-preview2-28208-8.nupkg
new file mode 100644
index 00000000..73ffedf4
Binary files /dev/null and b/local-nuget-packages/microsoft.ml.dnn.symbols.0.18.2-preview2-28208-8.nupkg differ
diff --git a/local-nuget-packages/microsoft.ml.ensemble.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.ensemble.0.18.2-preview2-28208-8.nupkg
new file mode 100644
index 00000000..9cbdef31
Binary files /dev/null and b/local-nuget-packages/microsoft.ml.ensemble.0.18.2-preview2-28208-8.nupkg differ
diff --git a/local-nuget-packages/microsoft.ml.ensemble.symbols.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.ensemble.symbols.0.18.2-preview2-28208-8.nupkg
new file mode 100644
index 00000000..069b69d9
Binary files /dev/null and b/local-nuget-packages/microsoft.ml.ensemble.symbols.0.18.2-preview2-28208-8.nupkg differ
diff --git a/local-nuget-packages/microsoft.ml.entrypoints.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.entrypoints.0.18.2-preview2-28208-8.nupkg
new file mode 100644
index 00000000..8e27e3cc
Binary files /dev/null and b/local-nuget-packages/microsoft.ml.entrypoints.0.18.2-preview2-28208-8.nupkg differ
diff --git a/local-nuget-packages/microsoft.ml.entrypoints.symbols.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.entrypoints.symbols.0.18.2-preview2-28208-8.nupkg
new file mode 100644
index 00000000..f72c9382
Binary files /dev/null and b/local-nuget-packages/microsoft.ml.entrypoints.symbols.0.18.2-preview2-28208-8.nupkg differ
diff --git a/local-nuget-packages/microsoft.ml.experimental.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.experimental.0.18.2-preview2-28208-8.nupkg
new file mode 100644
index 00000000..554d2417
Binary files /dev/null and b/local-nuget-packages/microsoft.ml.experimental.0.18.2-preview2-28208-8.nupkg differ
diff --git a/local-nuget-packages/microsoft.ml.experimental.symbols.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.experimental.symbols.0.18.2-preview2-28208-8.nupkg
new file mode 100644
index 00000000..fc844210
Binary files /dev/null and b/local-nuget-packages/microsoft.ml.experimental.symbols.0.18.2-preview2-28208-8.nupkg differ
diff --git a/local-nuget-packages/microsoft.ml.fasttree.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.fasttree.1.6.2-preview2-28208-8.nupkg
new file mode 100644
index 00000000..820b48b3
Binary files /dev/null and b/local-nuget-packages/microsoft.ml.fasttree.1.6.2-preview2-28208-8.nupkg differ
diff --git a/local-nuget-packages/microsoft.ml.fasttree.symbols.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.fasttree.symbols.1.6.2-preview2-28208-8.nupkg
new file mode 100644
index 00000000..4174ee8e
Binary files /dev/null and b/local-nuget-packages/microsoft.ml.fasttree.symbols.1.6.2-preview2-28208-8.nupkg differ
diff --git a/local-nuget-packages/microsoft.ml.featurizers.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.featurizers.0.18.2-preview2-28208-8.nupkg
new file mode 100644
index 00000000..cb04dfd5
Binary files /dev/null and b/local-nuget-packages/microsoft.ml.featurizers.0.18.2-preview2-28208-8.nupkg differ
diff --git a/local-nuget-packages/microsoft.ml.featurizers.symbols.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.featurizers.symbols.0.18.2-preview2-28208-8.nupkg
new file mode 100644
index 00000000..5be74193
Binary files /dev/null and b/local-nuget-packages/microsoft.ml.featurizers.symbols.0.18.2-preview2-28208-8.nupkg differ
diff --git a/local-nuget-packages/microsoft.ml.imageanalytics.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.imageanalytics.1.6.2-preview2-28208-8.nupkg
new file mode 100644
index 00000000..7c5afeb9
Binary files /dev/null and b/local-nuget-packages/microsoft.ml.imageanalytics.1.6.2-preview2-28208-8.nupkg differ
diff --git a/local-nuget-packages/microsoft.ml.imageanalytics.symbols.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.imageanalytics.symbols.1.6.2-preview2-28208-8.nupkg
new file mode 100644
index 00000000..11d473a0
Binary files /dev/null and b/local-nuget-packages/microsoft.ml.imageanalytics.symbols.1.6.2-preview2-28208-8.nupkg differ
diff --git a/local-nuget-packages/microsoft.ml.lightgbm.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.lightgbm.1.6.2-preview2-28208-8.nupkg
new file mode 100644
index 00000000..381c705c
Binary files /dev/null and b/local-nuget-packages/microsoft.ml.lightgbm.1.6.2-preview2-28208-8.nupkg differ
diff --git a/local-nuget-packages/microsoft.ml.lightgbm.symbols.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.lightgbm.symbols.1.6.2-preview2-28208-8.nupkg
new file mode 100644
index 00000000..cbd0cf9d
Binary files /dev/null and b/local-nuget-packages/microsoft.ml.lightgbm.symbols.1.6.2-preview2-28208-8.nupkg differ
diff --git a/local-nuget-packages/microsoft.ml.mkl.components.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.mkl.components.1.6.2-preview2-28208-8.nupkg
new file mode 100644
index 00000000..7e448a72
Binary files /dev/null and b/local-nuget-packages/microsoft.ml.mkl.components.1.6.2-preview2-28208-8.nupkg differ
diff --git a/local-nuget-packages/microsoft.ml.mkl.components.symbols.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.mkl.components.symbols.1.6.2-preview2-28208-8.nupkg
new file mode 100644
index 00000000..c24c142e
Binary files /dev/null and b/local-nuget-packages/microsoft.ml.mkl.components.symbols.1.6.2-preview2-28208-8.nupkg differ
diff --git a/local-nuget-packages/microsoft.ml.mkl.redist.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.mkl.redist.1.6.2-preview2-28208-8.nupkg
new file mode 100644
index 00000000..42d18904
Binary files /dev/null and b/local-nuget-packages/microsoft.ml.mkl.redist.1.6.2-preview2-28208-8.nupkg differ
diff --git a/local-nuget-packages/microsoft.ml.onnxconverter.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.onnxconverter.0.18.2-preview2-28208-8.nupkg
new file mode 100644
index 00000000..045429c8
Binary files /dev/null and b/local-nuget-packages/microsoft.ml.onnxconverter.0.18.2-preview2-28208-8.nupkg differ
diff --git a/local-nuget-packages/microsoft.ml.onnxconverter.symbols.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.onnxconverter.symbols.0.18.2-preview2-28208-8.nupkg
new file mode 100644
index 00000000..4a1216b1
Binary files /dev/null and b/local-nuget-packages/microsoft.ml.onnxconverter.symbols.0.18.2-preview2-28208-8.nupkg differ
diff --git a/local-nuget-packages/microsoft.ml.onnxtransformer.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.onnxtransformer.1.6.2-preview2-28208-8.nupkg
new file mode 100644
index 00000000..0d97af5c
Binary files /dev/null and b/local-nuget-packages/microsoft.ml.onnxtransformer.1.6.2-preview2-28208-8.nupkg differ
diff --git a/local-nuget-packages/microsoft.ml.onnxtransformer.symbols.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.onnxtransformer.symbols.1.6.2-preview2-28208-8.nupkg
new file mode 100644
index 00000000..e8e99abc
Binary files /dev/null and b/local-nuget-packages/microsoft.ml.onnxtransformer.symbols.1.6.2-preview2-28208-8.nupkg differ
diff --git a/local-nuget-packages/microsoft.ml.parquet.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.parquet.0.18.2-preview2-28208-8.nupkg
new file mode 100644
index 00000000..8f51320e
Binary files /dev/null and b/local-nuget-packages/microsoft.ml.parquet.0.18.2-preview2-28208-8.nupkg differ
diff --git a/local-nuget-packages/microsoft.ml.parquet.symbols.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.parquet.symbols.0.18.2-preview2-28208-8.nupkg
new file mode 100644
index 00000000..765ce5f9
Binary files /dev/null and b/local-nuget-packages/microsoft.ml.parquet.symbols.0.18.2-preview2-28208-8.nupkg differ
diff --git a/local-nuget-packages/microsoft.ml.recommender.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.recommender.0.18.2-preview2-28208-8.nupkg
new file mode 100644
index 00000000..dffcf5c4
Binary files /dev/null and b/local-nuget-packages/microsoft.ml.recommender.0.18.2-preview2-28208-8.nupkg differ
diff --git a/local-nuget-packages/microsoft.ml.recommender.symbols.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.recommender.symbols.0.18.2-preview2-28208-8.nupkg
new file mode 100644
index 00000000..0c802cb0
Binary files /dev/null and b/local-nuget-packages/microsoft.ml.recommender.symbols.0.18.2-preview2-28208-8.nupkg differ
diff --git a/local-nuget-packages/microsoft.ml.sampleutils.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.sampleutils.0.18.2-preview2-28208-8.nupkg
new file mode 100644
index 00000000..88add318
Binary files /dev/null and b/local-nuget-packages/microsoft.ml.sampleutils.0.18.2-preview2-28208-8.nupkg differ
diff --git a/local-nuget-packages/microsoft.ml.sampleutils.symbols.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.sampleutils.symbols.0.18.2-preview2-28208-8.nupkg
new file mode 100644
index 00000000..6348fe79
Binary files /dev/null and b/local-nuget-packages/microsoft.ml.sampleutils.symbols.0.18.2-preview2-28208-8.nupkg differ
diff --git a/local-nuget-packages/microsoft.ml.symbols.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.symbols.1.6.2-preview2-28208-8.nupkg
new file mode 100644
index 00000000..6637e4ff
Binary files /dev/null and b/local-nuget-packages/microsoft.ml.symbols.1.6.2-preview2-28208-8.nupkg differ
diff --git a/local-nuget-packages/microsoft.ml.tensorflow.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.tensorflow.1.6.2-preview2-28208-8.nupkg
new file mode 100644
index 00000000..2b4619e7
Binary files /dev/null and b/local-nuget-packages/microsoft.ml.tensorflow.1.6.2-preview2-28208-8.nupkg differ
diff --git a/local-nuget-packages/microsoft.ml.tensorflow.redist.0.18.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.tensorflow.redist.0.18.2-preview2-28208-8.nupkg
new file mode 100644
index 00000000..2e943616
Binary files /dev/null and b/local-nuget-packages/microsoft.ml.tensorflow.redist.0.18.2-preview2-28208-8.nupkg differ
diff --git a/local-nuget-packages/microsoft.ml.tensorflow.symbols.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.tensorflow.symbols.1.6.2-preview2-28208-8.nupkg
new file mode 100644
index 00000000..88925eb0
Binary files /dev/null and b/local-nuget-packages/microsoft.ml.tensorflow.symbols.1.6.2-preview2-28208-8.nupkg differ
diff --git a/local-nuget-packages/microsoft.ml.timeseries.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.timeseries.1.6.2-preview2-28208-8.nupkg
new file mode 100644
index 00000000..036a2ca2
Binary files /dev/null and b/local-nuget-packages/microsoft.ml.timeseries.1.6.2-preview2-28208-8.nupkg differ
diff --git a/local-nuget-packages/microsoft.ml.timeseries.symbols.1.6.2-preview2-28208-8.nupkg b/local-nuget-packages/microsoft.ml.timeseries.symbols.1.6.2-preview2-28208-8.nupkg
new file mode 100644
index 00000000..fcb211d3
Binary files /dev/null and b/local-nuget-packages/microsoft.ml.timeseries.symbols.1.6.2-preview2-28208-8.nupkg differ
diff --git a/nuget.config b/nuget.config
index cedba361..75ab3744 100644
--- a/nuget.config
+++ b/nuget.config
@@ -5,6 +5,7 @@
-    [one package-source entry removed; XML stripped in extraction]
+    [two package-source entries added; XML stripped in extraction, presumably registering the local-nuget-packages feed]
diff --git a/src/DotNetBridge/Bridge.cs b/src/DotNetBridge/Bridge.cs
index a7954355..00947124 100644
--- a/src/DotNetBridge/Bridge.cs
+++ b/src/DotNetBridge/Bridge.cs
@@ -7,6 +7,8 @@
 using System.Runtime.InteropServices;
 using System.Text;
 using System.Threading;
+using Microsoft.ML;
+using Microsoft.ML.Featurizers;
 using Microsoft.ML.Data;
 using Microsoft.ML.EntryPoints;
 using Microsoft.ML.Runtime;
@@ -300,6 +302,7 @@ private static unsafe int GenericExec(EnvironmentBlock* penv, sbyte* psz, int cd
                 //env.ComponentCatalog.RegisterAssembly(typeof(TimeSeriesProcessingEntryPoints).Assembly);
                 //env.ComponentCatalog.RegisterAssembly(typeof(ParquetLoader).Assembly);
                 env.ComponentCatalog.RegisterAssembly(typeof(SsaChangePointDetector).Assembly);
+                env.ComponentCatalog.RegisterAssembly(typeof(CategoryImputerTransformer).Assembly);
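+                // registering the Featurizers assembly makes its new entry points
+                // (CategoryImputer, DateTimeSplitter, RobustScaler, TimeSeriesImputer,
+                // ToString) discoverable through the component catalog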
                 env.ComponentCatalog.RegisterAssembly(typeof(DotNetBridgeEntrypoints).Assembly);
 
                 using (var ch = host.Start("Executing"))
diff --git a/src/DotNetBridge/DotNetBridge.csproj b/src/DotNetBridge/DotNetBridge.csproj
index 822db6aa..9985bb62 100644
--- a/src/DotNetBridge/DotNetBridge.csproj
+++ b/src/DotNetBridge/DotNetBridge.csproj
@@ -32,17 +32,19 @@
       all
       runtime; build; native; contentfiles; analyzers
-    [11 <PackageReference> items removed; XML stripped in extraction]
+    [13 <PackageReference> items added; XML stripped in extraction, the version bumps match the .nupkg files added above]
diff --git a/src/Platforms/build.csproj b/src/Platforms/build.csproj
index 3db67054..626822c9 100644
--- a/src/Platforms/build.csproj
+++ b/src/Platforms/build.csproj
@@ -11,17 +11,19 @@
-    [11 <PackageReference> items removed; XML stripped in extraction]
+    [13 <PackageReference> items added; XML stripped in extraction]
diff --git a/src/python/nimbusml.pyproj b/src/python/nimbusml.pyproj
index a90735af..dc1a2c39 100644
--- a/src/python/nimbusml.pyproj
+++ b/src/python/nimbusml.pyproj
    [16 hunks add 29 project items; XML stripped in extraction, presumably <Compile>/<Content> entries for the new transform, example, entrypoint, and test files]
diff --git a/src/python/nimbusml/__init__.py b/src/python/nimbusml/__init__.py
index 0b508fcf..0fdadc02 100644
--- a/src/python/nimbusml/__init__.py
+++ b/src/python/nimbusml/__init__.py
@@ -2,7 +2,7 @@
 Microsoft Machine Learning for Python
 """
 
-__version__ = '1.5.0'
+__version__ = '1.5.1'
 
 # CoreCLR version of MicrosoftML is built on Windows.
 # But file permissions are not preserved when it's copied to Linux.
diff --git a/src/python/nimbusml/examples/DateTimeSplitter.py b/src/python/nimbusml/examples/DateTimeSplitter.py
new file mode 100644
index 00000000..fd8612d3
--- /dev/null
+++ b/src/python/nimbusml/examples/DateTimeSplitter.py
@@ -0,0 +1,31 @@
+###############################################################################
+# DateTimeSplitter
+import pandas as pd
+from nimbusml import FileDataStream
+from nimbusml.datasets import get_dataset
+from nimbusml.preprocessing import DateTimeSplitter
+
+# data input (as a FileDataStream)
+path = get_dataset('infert').as_filepath()
+
+data = FileDataStream.read_csv(path, sep=',')
+
+# transform usage
+xf = DateTimeSplitter(prefix='dt_') << 'age'
+
+# fit and transform
+features = xf.fit_transform(data)
+
+features = features.drop(['row_num', 'education', 'parity', 'induced',
+                          'case', 'spontaneous', 'stratum', 'pooled.stratum'], axis=1)
+
+# print features
+pd.set_option('display.max_columns', None)
+pd.set_option('display.width', 1000)
+print(features.head())
+#    age  dt_Year  dt_Month  dt_Day  dt_Hour  dt_Minute  dt_Second  dt_AmPm  dt_Hour12  dt_DayOfWeek  dt_DayOfQuarter  dt_DayOfYear  dt_WeekOfMonth  dt_QuarterOfYear  dt_HalfOfYear  dt_WeekIso  dt_YearIso dt_MonthLabel dt_AmPmLabel dt_DayOfWeekLabel dt_HolidayName  dt_IsPaidTimeOff
+# 0   26     1970        1       1        0          0         26        0          0             4                1             0               0                 1              1           1        1970       January           am          Thursday           None                 0
+# 1   42     1970        1       1        0          0         42        0          0             4                1             0               0                 1              1           1        1970       January           am          Thursday           None                 0
+# 2   39     1970        1       1        0          0         39        0          0             4                1             0               0                 1              1           1        1970       January           am          Thursday           None                 0
+# 3   34     1970        1       1        0          0         34        0          0             4                1             0               0                 1              1           1        1970       January           am          Thursday           None                 0
+# 4   35     1970        1       1        0          0         35        0          0             4                1             0               0                 1              1           1        1970       January           am          Thursday           None                 0
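+# note: DateTimeSplitter interprets integer input as seconds since the Unix
+# epoch (1970-01-01 00:00:00), so e.g. age=26 maps to 1970-01-01 00:00:26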
\ No newline at end of file
diff --git a/src/python/nimbusml/examples/RobustScaler.py b/src/python/nimbusml/examples/RobustScaler.py
new file mode 100644
index 00000000..4c6a6405
--- /dev/null
+++ b/src/python/nimbusml/examples/RobustScaler.py
@@ -0,0 +1,39 @@
+###############################################################################
+# RobustScaler
+import numpy
+from nimbusml import FileDataStream
+from nimbusml.datasets import get_dataset
+from nimbusml.preprocessing.normalization import RobustScaler
+
+# data input (as a FileDataStream)
+path = get_dataset('infert').as_filepath()
+data = FileDataStream.read_csv(path, sep=',')
+
+print(data.head())
+#    row_num education  age  parity  induced  case  spontaneous  stratum  pooled.stratum
+# 0        1    0-5yrs   26       6        1     1            2        1               3
+# 1        2    0-5yrs   42       1        1     1            0        2               1
+# 2        3    0-5yrs   39       6        2     1            0        3               4
+# 3        4    0-5yrs   34       4        2     1            0        4               2
+# 4        5   6-11yrs   35       3        1     1            1        5              32
+
+# transform usage
+xf = RobustScaler(
+    center=True, scale=True,
+    columns={'age_norm': 'age', 'par_norm': 'parity'})
+
+# fit and transform
+features = xf.fit_transform(data)
+
+print(features.head(n=10))
+#    row_num education  age  parity  induced  case  spontaneous  stratum  pooled.stratum  age_norm  par_norm
+# 0        1    0-5yrs   26       6        1     1            2        1               3 -0.434783       1.6
+# 1        2    0-5yrs   42       1        1     1            0        2               1  0.956522      -0.4
+# 2        3    0-5yrs   39       6        2     1            0        3               4  0.695652       1.6
+# 3        4    0-5yrs   34       4        2     1            0        4               2  0.260870       0.8
+# 4        5   6-11yrs   35       3        1     1            1        5              32  0.347826       0.4
+# 5        6   6-11yrs   36       4        2     1            1        6              36  0.434783       0.8
+# 6        7   6-11yrs   23       1        0     1            0        7               6 -0.695652      -0.4
+# 7        8   6-11yrs   32       2        0     1            0        8              22  0.086957       0.0
+# 8        9   6-11yrs   21       1        0     1            1        9               5 -0.869565      -0.4
+# 9       10   6-11yrs   28       2        0     1            0       10              19 -0.260870       0.0
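+# note: each output is (x - median) / IQR per column; consistent with the
+# values above, age has median 31 and IQR 11.5, e.g. (26 - 31) / 11.5 = -0.434783,
+# and parity has median 2 and IQR 2.5, e.g. (6 - 2) / 2.5 = 1.6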
diff --git a/src/python/nimbusml/examples/ToKeyImputer.py b/src/python/nimbusml/examples/ToKeyImputer.py
new file mode 100644
index 00000000..820127f5
--- /dev/null
+++ b/src/python/nimbusml/examples/ToKeyImputer.py
@@ -0,0 +1,35 @@
+###############################################################################
+# ToKeyImputer
+import numpy
+from nimbusml import FileDataStream
+from nimbusml.datasets import get_dataset
+from nimbusml.preprocessing import ToKeyImputer
+
+# data input (as a FileDataStream)
+path = get_dataset('airquality').as_filepath()
+
+data = FileDataStream.read_csv(path, sep=',', numeric_dtype=numpy.float32,
+                               names={0: 'id'})
+print(data.head(6))
+#     id  Ozone  Solar_R  Wind  Temp  Month  Day
+# 0  1.0   41.0    190.0   7.4  67.0    5.0  1.0
+# 1  2.0   36.0    118.0   8.0  72.0    5.0  2.0
+# 2  3.0   12.0    149.0  12.6  74.0    5.0  3.0
+# 3  4.0   18.0    313.0  11.5  62.0    5.0  4.0
+# 4  5.0    NaN      NaN  14.3  56.0    5.0  5.0
+# 5  6.0   28.0      NaN  14.9  66.0    5.0  6.0
+
+
+# transform usage
+xf = ToKeyImputer(columns={'Ozone_1': 'Ozone', 'Solar_R_1': 'Solar_R'})
+
+# fit and transform
+features = xf.fit_transform(data)
+print(features.head(6))
+#     id  Ozone  Solar_R  Wind  Temp  Month  Day  Ozone_1  Solar_R_1
+# 0  1.0   41.0    190.0   7.4  67.0    5.0  1.0     41.0      190.0
+# 1  2.0   36.0    118.0   8.0  72.0    5.0  2.0     36.0      118.0
+# 2  3.0   12.0    149.0  12.6  74.0    5.0  3.0     12.0      149.0
+# 3  4.0   18.0    313.0  11.5  62.0    5.0  4.0     18.0      313.0
+# 4  5.0    NaN      NaN  14.3  56.0    5.0  5.0     23.0      238.0  <== Missing values have been updated
+# 5  6.0   28.0      NaN  14.9  66.0    5.0  6.0     28.0      238.0  <== Missing values have been updated
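+# note: per the transform's description, each NaN is replaced with that
+# column's most frequent value: 23.0 for Ozone and 238.0 for Solar_R here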
diff --git a/src/python/nimbusml/examples/ToString.py b/src/python/nimbusml/examples/ToString.py
new file mode 100644
index 00000000..82185d32
--- /dev/null
+++ b/src/python/nimbusml/examples/ToString.py
@@ -0,0 +1,45 @@
+###############################################################################
+# ToString
+import numpy
+from nimbusml import FileDataStream
+from nimbusml.datasets import get_dataset
+from nimbusml.preprocessing import ToString
+
+# data input (as a FileDataStream)
+path = get_dataset('infert').as_filepath()
+
+data = FileDataStream.read_csv(path, sep=',', numeric_dtype=numpy.float32,
+                               names={0: 'id'})
+print(data.head())
+#     id education   age  parity  induced  case  spontaneous  stratum  pooled.stratum
+# 0  1.0    0-5yrs  26.0     6.0      1.0   1.0          2.0      1.0             3.0
+# 1  2.0    0-5yrs  42.0     1.0      1.0   1.0          0.0      2.0             1.0
+# 2  3.0    0-5yrs  39.0     6.0      2.0   1.0          0.0      3.0             4.0
+# 3  4.0    0-5yrs  34.0     4.0      2.0   1.0          0.0      4.0             2.0
+# 4  5.0   6-11yrs  35.0     3.0      1.0   1.0          1.0      5.0            32.0
+
+# transform usage
+xf = ToString(columns={'id_1': 'id', 'age_1': 'age'})
+
+# fit and transform
+features = xf.fit_transform(data)
+print(features.head())
+#     id education   age  parity  induced  case  spontaneous  stratum  pooled.stratum      id_1      age_1
+# 0  1.0    0-5yrs  26.0     6.0      1.0   1.0          2.0      1.0             3.0  1.000000  26.000000
+# 1  2.0    0-5yrs  42.0     1.0      1.0   1.0          0.0      2.0             1.0  2.000000  42.000000
+# 2  3.0    0-5yrs  39.0     6.0      2.0   1.0          0.0      3.0             4.0  3.000000  39.000000
+# 3  4.0    0-5yrs  34.0     4.0      2.0   1.0          0.0      4.0             2.0  4.000000  34.000000
+# 4  5.0   6-11yrs  35.0     3.0      1.0   1.0          1.0      5.0            32.0  5.000000  35.000000
+
+print(features.dtypes)
+# id                float32
+# education          object
+# age               float32
+# parity            float32
+# induced           float32
+# case              float32
+# spontaneous       float32
+# stratum           float32
+# pooled.stratum    float32
+# id_1               object  <== string column
+# age_1              object  <== string column
diff --git a/src/python/nimbusml/examples/examples_from_dataframe/DateTimeSplitter_df.py b/src/python/nimbusml/examples/examples_from_dataframe/DateTimeSplitter_df.py
new file mode 100644
index 00000000..8e33ab7b
--- /dev/null
+++ b/src/python/nimbusml/examples/examples_from_dataframe/DateTimeSplitter_df.py
@@ -0,0 +1,31 @@
+###############################################################################
+# DateTimeSplitter
+import pandas
+from nimbusml.preprocessing import DateTimeSplitter
+
+df = pandas.DataFrame(data=dict(
+    tokens1=[1, 2, 3, 157161600],
+    tokens2=[10, 11, 12, 13]
+))
+
+cols_to_drop = [
+    'Hour12', 'DayOfWeek', 'DayOfQuarter',
+    'DayOfYear', 'WeekOfMonth', 'QuarterOfYear',
+    'HalfOfYear', 'WeekIso', 'YearIso', 'MonthLabel',
+    'AmPmLabel', 'DayOfWeekLabel', 'IsPaidTimeOff'
+]
+
+cd = DateTimeSplitter(prefix='dt',
+                      country='Canada',
+                      columns_to_drop=cols_to_drop) << 'tokens1'
+y = cd.fit_transform(df)
+
+# view the result
+pandas.set_option('display.max_columns', None)
+pandas.set_option('display.width', 1000)
+print(y)
+#      tokens1  tokens2  dtYear  dtMonth  dtDay  dtHour  dtMinute  dtSecond  dtAmPm   dtHolidayName
+# 0          1       10    1970        1      1       0         0         1       0  New Year's Day
+# 1          2       11    1970        1      1       0         0         2       0  New Year's Day
+# 2          3       12    1970        1      1       0         0         3       0  New Year's Day
+# 3  157161600       13    1974       12     25       0         0         0       0   Christmas Day
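+# note: 157161600 seconds is exactly 1819 days after the 1970-01-01 epoch,
+# i.e. 1974-12-25, which country='Canada' labels as the Christmas Day holiday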
diff --git a/src/python/nimbusml/examples/examples_from_dataframe/RobustScaler_df.py b/src/python/nimbusml/examples/examples_from_dataframe/RobustScaler_df.py
new file mode 100644
index 00000000..ff0ae793
--- /dev/null
+++ b/src/python/nimbusml/examples/examples_from_dataframe/RobustScaler_df.py
@@ -0,0 +1,20 @@
+###############################################################################
+# RobustScaler
+import pandas as pd
+from nimbusml import Pipeline
+from nimbusml.preprocessing.normalization import RobustScaler
+
+
+df = pd.DataFrame(data=dict(c0=[1, 3, 5, 7, 9]))
+
+xf = RobustScaler(columns='c0', center=True, scale=True)
+pipeline = Pipeline([xf])
+result = pipeline.fit_transform(df)
+
+print(result)
+#     c0
+# 0 -1.0
+# 1 -0.5
+# 2  0.0
+# 3  0.5
+# 4  1.0
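+# worked example: for c0=[1, 3, 5, 7, 9] the median is 5 and the 25th/75th
+# percentiles are 3 and 7, so IQR = 4 and e.g. (1 - 5) / 4 = -1.0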
diff --git a/src/python/nimbusml/examples/examples_from_dataframe/TimeSeriesImputer_df.py b/src/python/nimbusml/examples/examples_from_dataframe/TimeSeriesImputer_df.py
new file mode 100644
index 00000000..38ec9073
--- /dev/null
+++ b/src/python/nimbusml/examples/examples_from_dataframe/TimeSeriesImputer_df.py
@@ -0,0 +1,29 @@
+###############################################################################
+# TimeSeriesImputer
+import pandas
+from nimbusml.timeseries import TimeSeriesImputer
+
+df = pandas.DataFrame(data=dict(
+    ts=[1, 2, 3, 5],
+    grain=[1970, 1970, 1970, 1970],
+    c3=[10, 13, 15, 20],
+    c4=[19, 12, 16, 19]
+))
+
+print(df)
+
+tsi = TimeSeriesImputer(time_series_column='ts',
+                        grain_columns=['grain'],
+                        filter_columns=['c3', 'c4'],
+                        impute_mode='ForwardFill',
+                        filter_mode='Include')
+result = tsi.fit_transform(df)
+
+print(result)
+#    ts  grain  c3  c4  IsRowImputed
+# 0   0      0   0   0         False
+# 1   1   1970  10  19         False
+# 2   2   1970  13  12         False
+# 3   3   1970  15  16         False
+# 4   4   1970  15  16          True  <== New row added
+# 5   5   1970  20  19         False
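+# note: ts=4 is missing from the input, so a row is inserted for it and, under
+# impute_mode='ForwardFill', c3/c4 are copied from the last observed row (ts=3)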
diff --git a/src/python/nimbusml/examples/examples_from_dataframe/ToKeyImputer_df.py b/src/python/nimbusml/examples/examples_from_dataframe/ToKeyImputer_df.py
new file mode 100644
index 00000000..f613e3f4
--- /dev/null
+++ b/src/python/nimbusml/examples/examples_from_dataframe/ToKeyImputer_df.py
@@ -0,0 +1,34 @@
+###############################################################################
+# ToKeyImputer
+
+import pandas
+from nimbusml.preprocessing import ToKeyImputer
+
+# Create the data
+text_df = pandas.DataFrame(
+    data=dict(
+        text=[
+            "cat",
+            "dog",
+            "fish",
+            "orange",
+            "cat orange",
+            "dog",
+            "fish",
+            None,
+            "spider"]))
+
+tokey = ToKeyImputer() << 'text'
+y = tokey.fit_transform(text_df)
+print(y)
+
+#          text
+# 0         cat
+# 1         dog
+# 2        fish
+# 3      orange
+# 4  cat orange
+# 5         dog
+# 6        fish
+# 7         dog  <== Missing value has been replaced
+# 8      spider
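+# note: "dog" and "fish" both occur twice, so either is a valid mode; the
+# transform resolved the tie to "dog" (the tie-breaking rule is not documented
+# here, so do not rely on it)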
diff --git a/src/python/nimbusml/examples/examples_from_dataframe/ToString_df.py b/src/python/nimbusml/examples/examples_from_dataframe/ToString_df.py
new file mode 100644
index 00000000..b6c631fd
--- /dev/null
+++ b/src/python/nimbusml/examples/examples_from_dataframe/ToString_df.py
@@ -0,0 +1,43 @@
+###############################################################################
+# ToString
+
+import pandas
+from nimbusml.preprocessing import ToString, ToKey
+from pandas import Categorical
+
+# Create the data
+categorical_df = pandas.DataFrame(data=dict(
+    key=Categorical.from_codes([0, 1, 2, 1, 2, 0], categories=['a', 'b', 'c']),
+    text=['b', 'c', 'a', 'b', 'a', 'c']))
+
+print(categorical_df.dtypes)
+# key     category
+# text      object
+# dtype: object
+
+tostring = ToString(columns='key')
+y = tostring.fit_transform(categorical_df)
+print(y)
+#   key text
+# 0   1    b
+# 1   2    c
+# 2   3    a
+# 3   2    b
+# 4   3    a
+# 5   1    c
+
+print(y.dtypes)
+# key     object  <== converted to string
+# text    object
+# dtype: object
+
+tokey = ToKey(columns='text')
+y = tokey.fit_transform(categorical_df)
+y2 = tostring.clone().fit_transform(y)
+print(y2['text'] == categorical_df['text'])
+# 0    True
+# 1    True
+# 2    True
+# 3    True
+# 4    True
+# 5    True
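+# note: for the categorical 'key' column, ToString appears to render the
+# underlying 1-based key values ('a' -> 1, 'b' -> 2, 'c' -> 3) rather than the
+# labels, while the ToKey -> ToString round trip on 'text' recovers the
+# original strings, as the equality check above shows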
diff --git a/src/python/nimbusml/internal/core/preprocessing/datetimesplitter.py b/src/python/nimbusml/internal/core/preprocessing/datetimesplitter.py
new file mode 100644
index 00000000..db2c39ef
--- /dev/null
+++ b/src/python/nimbusml/internal/core/preprocessing/datetimesplitter.py
@@ -0,0 +1,62 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+DateTimeSplitter
+"""
+
+__all__ = ["DateTimeSplitter"]
+
+
+from ...entrypoints.transforms_datetimesplitter import \
+    transforms_datetimesplitter
+from ...utils.utils import trace
+from ..base_pipeline_item import BasePipelineItem, DefaultSignature
+
+
+class DateTimeSplitter(BasePipelineItem, DefaultSignature):
+    """
+    **Description**
+        Splits a date time value into each individual component
+
+    :param prefix: Output column prefix.
+
+    :param columns_to_drop: Columns to drop after the DateTime Expansion.
+
+    :param country: Country to get holidays for. Defaults to none if not
+        passed.
+
+    :param params: Additional arguments sent to compute engine.
+
+    """
+
+    @trace
+    def __init__(
+            self,
+            prefix,
+            columns_to_drop=None,
+            country='None',
+            **params):
+        BasePipelineItem.__init__(
+            self, type='transform', **params)
+
+        self.prefix = prefix
+        self.columns_to_drop = columns_to_drop
+        self.country = country
+
+    @property
+    def _entrypoint(self):
+        return transforms_datetimesplitter
+
+    @trace
+    def _get_node(self, **all_args):
+        algo_args = dict(
+            source=self.source,
+            prefix=self.prefix,
+            columns_to_drop=self.columns_to_drop,
+            country=self.country)
+
+        all_args.update(algo_args)
+        return self._entrypoint(**all_args)
diff --git a/src/python/nimbusml/internal/core/preprocessing/normalization/robustscaler.py b/src/python/nimbusml/internal/core/preprocessing/normalization/robustscaler.py
new file mode 100644
index 00000000..08845bae
--- /dev/null
+++ b/src/python/nimbusml/internal/core/preprocessing/normalization/robustscaler.py
@@ -0,0 +1,103 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+RobustScaler
+"""
+
+__all__ = ["RobustScaler"]
+
+
+from ....entrypoints.transforms_robustscaler import transforms_robustscaler
+from ....utils.utils import trace
+from ...base_pipeline_item import BasePipelineItem, DefaultSignature
+
+
+class RobustScaler(BasePipelineItem, DefaultSignature):
+    """
+    **Description**
+        Removes the median and scales the data according to the quantile range.
+
+    :param center: If True, center the data before scaling.
+
+    :param scale: If True, scale the data to the interquartile range.
+
+    :param quantile_min: Min for the quantile range used to calculate scale.
+
+    :param quantile_max: Max for the quantile range used to calculate scale.
+
+    :param params: Additional arguments sent to compute engine.
+
+    """
+
+    @trace
+    def __init__(
+            self,
+            center=True,
+            scale=True,
+            quantile_min=25.0,
+            quantile_max=75.0,
+            **params):
+        BasePipelineItem.__init__(
+            self, type='transform', **params)
+
+        self.center = center
+        self.scale = scale
+        self.quantile_min = quantile_min
+        self.quantile_max = quantile_max
+
+    @property
+    def _entrypoint(self):
+        return transforms_robustscaler
+
+    @trace
+    def _get_node(self, **all_args):
+
+        input_columns = self.input
+        if input_columns is None and 'input' in all_args:
+            input_columns = all_args['input']
+        if 'input' in all_args:
+            all_args.pop('input')
+
+        output_columns = self.output
+        if output_columns is None and 'output' in all_args:
+            output_columns = all_args['output']
+        if 'output' in all_args:
+            all_args.pop('output')
+
+        # validate input
+        if input_columns is None:
+            raise ValueError(
+                "'None' input passed when it cannot be none.")
+
+        if not isinstance(input_columns, list):
+            raise ValueError(
+                "input has to be a list of strings, instead got %s" %
+                type(input_columns))
+
+        # validate output
+        if output_columns is None:
+            output_columns = input_columns
+
+        if not isinstance(output_columns, list):
+            raise ValueError(
+                "output has to be a list of strings, instead got %s" %
+                type(output_columns))
+
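+        # illustration (hypothetical column names): columns={'age_norm': 'age'}
+        # arrives here as input_columns=['age'], output_columns=['age_norm'] and
+        # becomes column=[{'Source': 'age', 'Name': 'age_norm'}] below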
+        algo_args = dict(
+            column=[
+                dict(Source=i, Name=o)
+                for i, o in zip(input_columns, output_columns)
+            ] if input_columns else None,
+            center=self.center,
+            scale=self.scale,
+            quantile_min=self.quantile_min,
+            quantile_max=self.quantile_max)
+
+        all_args.update(algo_args)
+        return self._entrypoint(**all_args)
diff --git a/src/python/nimbusml/internal/core/preprocessing/tokeyimputer.py b/src/python/nimbusml/internal/core/preprocessing/tokeyimputer.py
new file mode 100644
index 00000000..e82498a3
--- /dev/null
+++ b/src/python/nimbusml/internal/core/preprocessing/tokeyimputer.py
@@ -0,0 +1,80 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+ToKeyImputer
+"""
+
+__all__ = ["ToKeyImputer"]
+
+
+from ...entrypoints.transforms_categoryimputer import \
+    transforms_categoryimputer
+from ...utils.utils import trace
+from ..base_pipeline_item import BasePipelineItem, DefaultSignature
+
+
+class ToKeyImputer(BasePipelineItem, DefaultSignature):
+    """
+    **Description**
+        Fills in missing values in a column based on the most frequent value
+
+    :param params: Additional arguments sent to compute engine.
+
+    """
+
+    @trace
+    def __init__(
+            self,
+            **params):
+        BasePipelineItem.__init__(
+            self, type='transform', **params)
+
+    @property
+    def _entrypoint(self):
+        return transforms_categoryimputer
+
+    @trace
+    def _get_node(self, **all_args):
+
+        input_columns = self.input
+        if input_columns is None and 'input' in all_args:
+            input_columns = all_args['input']
+        if 'input' in all_args:
+            all_args.pop('input')
+
+        output_columns = self.output
+        if output_columns is None and 'output' in all_args:
+            output_columns = all_args['output']
+        if 'output' in all_args:
+            all_args.pop('output')
+
+        # validate input
+        if input_columns is None:
+            raise ValueError(
+                "'None' input passed when it cannot be none.")
+
+        if not isinstance(input_columns, list):
+            raise ValueError(
+                "input has to be a list of strings, instead got %s" %
+                type(input_columns))
+
+        # validate output
+        if output_columns is None:
+            output_columns = input_columns
+
+        if not isinstance(output_columns, list):
+            raise ValueError(
+                "output has to be a list of strings, instead got %s" %
+                type(output_columns))
+
+        algo_args = dict(
+            column=[
+                dict(Source=i, Name=o)
+                for i, o in zip(input_columns, output_columns)
+            ] if input_columns else None)
+
+        all_args.update(algo_args)
+        return self._entrypoint(**all_args)
diff --git a/src/python/nimbusml/internal/core/preprocessing/tostring.py b/src/python/nimbusml/internal/core/preprocessing/tostring.py
new file mode 100644
index 00000000..2294c715
--- /dev/null
+++ b/src/python/nimbusml/internal/core/preprocessing/tostring.py
@@ -0,0 +1,79 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+ToString
+"""
+
+__all__ = ["ToString"]
+
+
+from ...entrypoints.transforms_tostring import transforms_tostring
+from ...utils.utils import trace
+from ..base_pipeline_item import BasePipelineItem, DefaultSignature
+
+
+class ToString(BasePipelineItem, DefaultSignature):
+    """
+    **Description**
+        Turns the given column into a column of its string representation
+
+    :param params: Additional arguments sent to compute engine.
+
+    """
+
+    @trace
+    def __init__(
+            self,
+            **params):
+        BasePipelineItem.__init__(
+            self, type='transform', **params)
+
+    @property
+    def _entrypoint(self):
+        return transforms_tostring
+
+    @trace
+    def _get_node(self, **all_args):
+
+        input_columns = self.input
+        if input_columns is None and 'input' in all_args:
+            input_columns = all_args['input']
+        if 'input' in all_args:
+            all_args.pop('input')
+
+        output_columns = self.output
+        if output_columns is None and 'output' in all_args:
+            output_columns = all_args['output']
+        if 'output' in all_args:
+            all_args.pop('output')
+
+        # validate input
+        if input_columns is None:
+            raise ValueError(
+                "'None' input passed when it cannot be none.")
+
+        if not isinstance(input_columns, list):
+            raise ValueError(
+                "input has to be a list of strings, instead got %s" %
+                type(input_columns))
+
+        # validate output
+        if output_columns is None:
+            output_columns = input_columns
+
+        if not isinstance(output_columns, list):
+            raise ValueError(
+                "output has to be a list of strings, instead got %s" %
+                type(output_columns))
+
+        algo_args = dict(
+            column=[
+                dict(Source=i, Name=o)
+                for i, o in zip(input_columns, output_columns)
+            ] if input_columns else None)
+
+        all_args.update(algo_args)
+        return self._entrypoint(**all_args)
diff --git a/src/python/nimbusml/internal/core/timeseries/timeseriesimputer.py b/src/python/nimbusml/internal/core/timeseries/timeseriesimputer.py
new file mode 100644
index 00000000..0a492127
--- /dev/null
+++ b/src/python/nimbusml/internal/core/timeseries/timeseriesimputer.py
@@ -0,0 +1,78 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+TimeSeriesImputer
+"""
+
+__all__ = ["TimeSeriesImputer"]
+
+
+from ...entrypoints.transforms_timeseriesimputer import \
+    transforms_timeseriesimputer
+from ...utils.utils import trace
+from ..base_pipeline_item import BasePipelineItem, DefaultSignature
+
+
+class TimeSeriesImputer(BasePipelineItem, DefaultSignature):
+    """
+    **Description**
+        Fills in missing rows and values
+
+    :param time_series_column: Column representing the time.
+
+    :param grain_columns: List of grain columns.
+
+    :param filter_columns: Columns to filter.
+
+    :param filter_mode: Filter mode. Either include or exclude.
+
+    :param impute_mode: Mode for imputing, defaults to ForwardFill if not
+        provided.
+
+    :param supress_type_errors: Suppress the errors that would occur if a column
+        and impute mode are incompatible. If true, will skip the column. If
+        false, will stop and throw an error.
+
+    :param params: Additional arguments sent to compute engine.
+
+    """
+
+    @trace
+    def __init__(
+            self,
+            time_series_column,
+            grain_columns,
+            filter_columns=None,
+            filter_mode='Exclude',
+            impute_mode='ForwardFill',
+            supress_type_errors=False,
+            **params):
+        BasePipelineItem.__init__(
+            self, type='transform', **params)
+
+        self.time_series_column = time_series_column
+        self.grain_columns = grain_columns
+        self.filter_columns = filter_columns
+        self.filter_mode = filter_mode
+        self.impute_mode = impute_mode
+        self.supress_type_errors = supress_type_errors
+
+    @property
+    def _entrypoint(self):
+        return transforms_timeseriesimputer
+
+    @trace
+    def _get_node(self, **all_args):
+        algo_args = dict(
+            time_series_column=self.time_series_column,
+            grain_columns=self.grain_columns,
+            filter_columns=self.filter_columns,
+            filter_mode=self.filter_mode,
+            impute_mode=self.impute_mode,
+            supress_type_errors=self.supress_type_errors)
+
+        all_args.update(algo_args)
+        return self._entrypoint(**all_args)
diff --git a/src/python/nimbusml/internal/entrypoints/transforms_categoryimputer.py b/src/python/nimbusml/internal/entrypoints/transforms_categoryimputer.py
new file mode 100644
index 00000000..7f72261b
--- /dev/null
+++ b/src/python/nimbusml/internal/entrypoints/transforms_categoryimputer.py
@@ -0,0 +1,65 @@
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+Transforms.CategoryImputer
+"""
+
+
+from ..utils.entrypoints import EntryPoint
+from ..utils.utils import try_set, unlist
+
+
+def transforms_categoryimputer(
+        column,
+        data,
+        output_data=None,
+        model=None,
+        **params):
+    """
+    **Description**
+        Fills in missing values in a column based on the most frequent value
+
+    :param column: New column definition (optional form: name:src)
+        (inputs).
+    :param data: Input dataset (inputs).
+    :param output_data: Transformed dataset (outputs).
+    :param model: Transform model (outputs).
+    """
+
+    entrypoint_name = 'Transforms.CategoryImputer'
+    inputs = {}
+    outputs = {}
+
+    if column is not None:
+        inputs['Column'] = try_set(
+            obj=column, none_acceptable=False, is_of_type=list, is_column=True)
+    if data is not None:
+        inputs['Data'] = try_set(
+            obj=data, none_acceptable=False, is_of_type=str)
+    if output_data is not None:
+        outputs['OutputData'] = try_set(
+            obj=output_data, none_acceptable=False, is_of_type=str)
+    if model is not None:
+        outputs['Model'] = try_set(
+            obj=model, none_acceptable=False, is_of_type=str)
+
+    input_variables = {
+        x for x in unlist(inputs.values())
+        if isinstance(x, str) and x.startswith("$")}
+    output_variables = {
+        x for x in unlist(outputs.values())
+        if isinstance(x, str) and x.startswith("$")}
+
+    entrypoint = EntryPoint(
+        name=entrypoint_name, inputs=inputs, outputs=outputs,
+        input_variables=input_variables,
+        output_variables=output_variables)
+    return entrypoint
diff --git a/src/python/nimbusml/internal/entrypoints/transforms_datetimesplitter.py b/src/python/nimbusml/internal/entrypoints/transforms_datetimesplitter.py
new file mode 100644
index 00000000..7afc028a
--- /dev/null
+++ b/src/python/nimbusml/internal/entrypoints/transforms_datetimesplitter.py
@@ -0,0 +1,128 @@
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+Transforms.DateTimeSplitter
+"""
+
+
+from ..utils.entrypoints import EntryPoint
+from ..utils.utils import try_set, unlist
+
+
+def transforms_datetimesplitter(
+        source,
+        data,
+        prefix,
+        output_data=None,
+        model=None,
+        columns_to_drop=None,
+        country='None',
+        **params):
+    """
+    **Description**
+        Splits a date time value into each individual component
+
+    :param source: Input column (inputs).
+    :param data: Input dataset (inputs).
+    :param prefix: Output column prefix (inputs).
+    :param columns_to_drop: Columns to drop after the DateTime
+        Expansion (inputs).
+    :param country: Country to get holidays for. Defaults to none if
+        not passed (inputs).
+    :param output_data: Transformed dataset (outputs).
+    :param model: Transform model (outputs).
+    """
+
+    entrypoint_name = 'Transforms.DateTimeSplitter'
+    inputs = {}
+    outputs = {}
+
+    if source is not None:
+        inputs['Source'] = try_set(
+            obj=source, none_acceptable=False, is_of_type=str, is_column=True)
+    if data is not None:
+        inputs['Data'] = try_set(
+            obj=data, none_acceptable=False, is_of_type=str)
+    if prefix is not None:
+        inputs['Prefix'] = try_set(
+            obj=prefix, none_acceptable=False, is_of_type=str)
+    if columns_to_drop is not None:
+        inputs['ColumnsToDrop'] = try_set(
+            obj=columns_to_drop, none_acceptable=True, is_of_type=list,
+            is_column=True)
+    if country is not None:
+        inputs['Country'] = try_set(
+            obj=country, none_acceptable=True, is_of_type=str,
+            values=[
+                'None', 'Argentina', 'Australia', 'Austria', 'Belarus',
+                'Belgium', 'Brazil', 'Canada', 'Colombia', 'Croatia',
+                'Czech', 'Denmark', 'England', 'Finland', 'France',
+                'Germany', 'Hungary', 'India', 'Ireland', 'IsleofMan',
+                'Italy', 'Japan', 'Mexico', 'Netherlands', 'NewZealand',
+                'NorthernIreland', 'Norway', 'Poland', 'Portugal',
+                'Scotland', 'Slovenia', 'SouthAfrica', 'Spain', 'Sweden',
+                'Switzerland', 'Ukraine', 'UnitedKingdom', 'UnitedStates',
+                'Wales'])
+    if output_data is not None:
+        outputs['OutputData'] = try_set(
+            obj=output_data, none_acceptable=False, is_of_type=str)
+    if model is not None:
+        outputs['Model'] = try_set(
+            obj=model, none_acceptable=False, is_of_type=str)
+
+    input_variables = {
+        x for x in unlist(inputs.values())
+        if isinstance(x, str) and x.startswith("$")}
+    output_variables = {
+        x for x in unlist(outputs.values())
+        if isinstance(x, str) and x.startswith("$")}
+
+    entrypoint = EntryPoint(
+        name=entrypoint_name, inputs=inputs, outputs=outputs,
+        input_variables=input_variables,
+        output_variables=output_variables)
+    return entrypoint
+ """ + + entrypoint_name = 'Transforms.ToString' + inputs = {} + outputs = {} + + if column is not None: + inputs['Column'] = try_set( + obj=column, + none_acceptable=False, + is_of_type=list, + is_column=True) + if data is not None: + inputs['Data'] = try_set( + obj=data, + none_acceptable=False, + is_of_type=str) + if output_data is not None: + outputs['OutputData'] = try_set( + obj=output_data, + none_acceptable=False, + is_of_type=str) + if model is not None: + outputs['Model'] = try_set( + obj=model, + none_acceptable=False, + is_of_type=str) + + input_variables = { + x for x in unlist(inputs.values()) + if isinstance(x, str) and x.startswith("$")} + output_variables = { + x for x in unlist(outputs.values()) + if isinstance(x, str) and x.startswith("$")} + + entrypoint = EntryPoint( + name=entrypoint_name, inputs=inputs, outputs=outputs, + input_variables=input_variables, + output_variables=output_variables) + return entrypoint diff --git a/src/python/nimbusml/preprocessing/__init__.py b/src/python/nimbusml/preprocessing/__init__.py index 26b41b8e..728327be 100644 --- a/src/python/nimbusml/preprocessing/__init__.py +++ b/src/python/nimbusml/preprocessing/__init__.py @@ -2,10 +2,16 @@ from .tokey import ToKey from .tensorflowscorer import TensorFlowScorer from .datasettransformer import DatasetTransformer +from .datetimesplitter import DateTimeSplitter +from .tokeyimputer import ToKeyImputer +from .tostring import ToString __all__ = [ + 'DateTimeSplitter', 'FromKey', 'ToKey', + 'ToKeyImputer', + 'ToString', 'TensorFlowScorer', 'DatasetTransformer' ] diff --git a/src/python/nimbusml/preprocessing/datetimesplitter.py b/src/python/nimbusml/preprocessing/datetimesplitter.py new file mode 100644 index 00000000..fb33337b --- /dev/null +++ b/src/python/nimbusml/preprocessing/datetimesplitter.py @@ -0,0 +1,63 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +DateTimeSplitter +""" + +__all__ = ["DateTimeSplitter"] + + +from sklearn.base import TransformerMixin + +from ..base_transform import BaseTransform +from ..internal.core.preprocessing.datetimesplitter import \ + DateTimeSplitter as core +from ..internal.utils.utils import trace + + +class DateTimeSplitter(core, BaseTransform, TransformerMixin): + """ + **Description** + Splits a date time value into each individual component + + :param columns: see `Columns `_. + + :param prefix: Output column prefix. + + :param columns_to_drop: Columns to drop after the DateTime Expansion. + + :param country: Country to get holidays for. Defaults to none if not + passed. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + prefix, + columns_to_drop=None, + country='None', + columns=None, + **params): + + if columns: + params['columns'] = columns + BaseTransform.__init__(self, **params) + core.__init__( + self, + prefix=prefix, + columns_to_drop=columns_to_drop, + country=country, + **params) + self._columns = columns + + def get_params(self, deep=False): + """ + Get the parameters for this operator. 
+ """ + return core.get_params(self) diff --git a/src/python/nimbusml/preprocessing/normalization/__init__.py b/src/python/nimbusml/preprocessing/normalization/__init__.py index f7d7647a..3928ac40 100644 --- a/src/python/nimbusml/preprocessing/normalization/__init__.py +++ b/src/python/nimbusml/preprocessing/normalization/__init__.py @@ -4,6 +4,7 @@ from .lpscaler import LpScaler from .meanvariancescaler import MeanVarianceScaler from .minmaxscaler import MinMaxScaler +from .robustscaler import RobustScaler __all__ = [ 'Binner', @@ -11,5 +12,6 @@ 'LogMeanVarianceScaler', 'LpScaler', 'MeanVarianceScaler', - 'MinMaxScaler' + 'MinMaxScaler', + 'RobustScaler' ] diff --git a/src/python/nimbusml/preprocessing/normalization/robustscaler.py b/src/python/nimbusml/preprocessing/normalization/robustscaler.py new file mode 100644 index 00000000..776d5609 --- /dev/null +++ b/src/python/nimbusml/preprocessing/normalization/robustscaler.py @@ -0,0 +1,66 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +RobustScaler +""" + +__all__ = ["RobustScaler"] + + +from sklearn.base import TransformerMixin + +from ...base_transform import BaseTransform +from ...internal.core.preprocessing.normalization.robustscaler import \ + RobustScaler as core +from ...internal.utils.utils import trace + + +class RobustScaler(core, BaseTransform, TransformerMixin): + """ + **Description** + Removes the median and scales the data according to the quantile range. + + :param columns: see `Columns `_. + + :param center: If True, center the data before scaling. + + :param scale: If True, scale the data to interquartile range. + + :param quantile_min: Min for the quantile range used to calculate scale. + + :param quantile_max: Max for the quantile range used to calculate scale. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + center=True, + scale=True, + quantile_min=25.0, + quantile_max=75.0, + columns=None, + **params): + + if columns: + params['columns'] = columns + BaseTransform.__init__(self, **params) + core.__init__( + self, + center=center, + scale=scale, + quantile_min=quantile_min, + quantile_max=quantile_max, + **params) + self._columns = columns + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/nimbusml/preprocessing/tokeyimputer.py b/src/python/nimbusml/preprocessing/tokeyimputer.py new file mode 100644 index 00000000..000d6a2f --- /dev/null +++ b/src/python/nimbusml/preprocessing/tokeyimputer.py @@ -0,0 +1,49 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +ToKeyImputer +""" + +__all__ = ["ToKeyImputer"] + + +from sklearn.base import TransformerMixin + +from ..base_transform import BaseTransform +from ..internal.core.preprocessing.tokeyimputer import ToKeyImputer as core +from ..internal.utils.utils import trace + + +class ToKeyImputer(core, BaseTransform, TransformerMixin): + """ + **Description** + Fills in missing values in a column based on the most frequent value + + :param columns: see `Columns `_. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + columns=None, + **params): + + if columns: + params['columns'] = columns + BaseTransform.__init__(self, **params) + core.__init__( + self, + **params) + self._columns = columns + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/nimbusml/preprocessing/tostring.py b/src/python/nimbusml/preprocessing/tostring.py new file mode 100644 index 00000000..2dd2826c --- /dev/null +++ b/src/python/nimbusml/preprocessing/tostring.py @@ -0,0 +1,49 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +ToString +""" + +__all__ = ["ToString"] + + +from sklearn.base import TransformerMixin + +from ..base_transform import BaseTransform +from ..internal.core.preprocessing.tostring import ToString as core +from ..internal.utils.utils import trace + + +class ToString(core, BaseTransform, TransformerMixin): + """ + **Description** + Turns the given column into a column of its string representation + + :param columns: see `Columns `_. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + columns=None, + **params): + + if columns: + params['columns'] = columns + BaseTransform.__init__(self, **params) + core.__init__( + self, + **params) + self._columns = columns + + def get_params(self, deep=False): + """ + Get the parameters for this operator. 
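+ + A minimal sketch (the column mapping is hypothetical): + + :: + + xf = ToString(columns={'f0.out': 'f0'}) + xf.get_params() # returns the constructor parameters as a dict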
+ """ + return core.get_params(self) diff --git a/src/python/nimbusml/tests/preprocessing/missing_values/test_data_with_missing.py b/src/python/nimbusml/tests/preprocessing/missing_values/test_data_with_missing.py index 0dc85f6e..99db58dd 100644 --- a/src/python/nimbusml/tests/preprocessing/missing_values/test_data_with_missing.py +++ b/src/python/nimbusml/tests/preprocessing/missing_values/test_data_with_missing.py @@ -9,6 +9,7 @@ from math import isnan from nimbusml import Pipeline from nimbusml.linear_model import FastLinearRegressor +from nimbusml.preprocessing import ToKeyImputer from nimbusml.preprocessing.missing_values import Filter, Handler, Indicator from pandas import DataFrame from sklearn.utils.testing import assert_equal, assert_true, \ @@ -160,6 +161,19 @@ def test_input_conversion_to_float_retains_other_column_types(self): assert_equal(result.dtypes['f1'], np.object) assert_equal(result.dtypes['f2.f2'], np.float32) + def test_category_imputation(self): + data={'f0': [4, 4, np.nan, 9], + 'f1': [4, 4, np.nan, np.nan]} + data = DataFrame(data) + + # Check ToKeyImputer + xf = ToKeyImputer(columns={'f0.out': 'f0', 'f1.out': 'f1'}) + result = xf.fit_transform(data) + + assert_equal(result['f0.out'][1], 4) + assert_equal(result['f0.out'][2], 4) + assert_equal(result['f1.out'][1], 4) + assert_equal(result['f1.out'][2], 4) if __name__ == '__main__': unittest.main() diff --git a/src/python/nimbusml/tests/preprocessing/normalization/test_robustscaler.py b/src/python/nimbusml/tests/preprocessing/normalization/test_robustscaler.py new file mode 100644 index 00000000..da854164 --- /dev/null +++ b/src/python/nimbusml/tests/preprocessing/normalization/test_robustscaler.py @@ -0,0 +1,27 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +import unittest + +import pandas +from nimbusml import Pipeline +from nimbusml.preprocessing.normalization import RobustScaler + + +class TestRobustScaler(unittest.TestCase): + + def test_with_integer_inputs(self): + df = pandas.DataFrame(data=dict(c0=[1, 3, 5, 7, 9])) + + xf = RobustScaler(columns='c0', center=True, scale=True) + pipeline = Pipeline([xf]) + result = pipeline.fit_transform(df) + + expected_result = pandas.Series([-1.0, -0.5, 0.0, 0.5, 1.0]) + + self.assertTrue(result.loc[:, 'c0'].equals(expected_result)) + + +if __name__ == '__main__': + unittest.main() diff --git a/src/python/nimbusml/tests/preprocessing/test_datetimesplitter.py b/src/python/nimbusml/tests/preprocessing/test_datetimesplitter.py new file mode 100644 index 00000000..0b9c8141 --- /dev/null +++ b/src/python/nimbusml/tests/preprocessing/test_datetimesplitter.py @@ -0,0 +1,42 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------------------------- + +import unittest + +import pandas +from nimbusml.preprocessing import DateTimeSplitter +from sklearn.utils.testing import assert_equal + + +class TestDateTimeSplitter(unittest.TestCase): + + def test_check_estimator_DateTimeSplitter(self): + df = pandas.DataFrame(data=dict(dt=list(range(8)))) + dt = DateTimeSplitter(prefix='dt_') << 'dt' + result = dt.fit_transform(df) + assert_equal(result['dt_Year'][0], 1970, "year should be 1970") + + def test_holidays(self): + df = pandas.DataFrame(data=dict( + tokens1=[1, 2, 3, 157161600], + tokens2=[10, 11, 12, 13] + )) + + cols_to_drop = [ + 'Hour12', 'DayOfWeek', 'DayOfQuarter', + 'DayOfYear', 'WeekOfMonth', 'QuarterOfYear', + 'HalfOfYear', 'WeekIso', 'YearIso', 'MonthLabel', + 'AmPmLabel', 'DayOfWeekLabel', 'IsPaidTimeOff' + ] + + dts = DateTimeSplitter(prefix='dt', + country='Canada', + columns_to_drop=cols_to_drop) << 'tokens1' + y = dts.fit_transform(df) + + self.assertEqual(y.loc[3, 'dtHolidayName'], 'Christmas Day') + +if __name__ == '__main__': + unittest.main() diff --git a/src/python/nimbusml/tests/preprocessing/test_tokeyimputer.py b/src/python/nimbusml/tests/preprocessing/test_tokeyimputer.py new file mode 100644 index 00000000..85c501c7 --- /dev/null +++ b/src/python/nimbusml/tests/preprocessing/test_tokeyimputer.py @@ -0,0 +1,36 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- + +import unittest + +import numpy as np +import pandas as pd +from nimbusml.preprocessing import ToKeyImputer + + +class TestToKeyImputer(unittest.TestCase): + + def test_tokeyimputer(self): + text_df = pd.DataFrame( + data=dict( + text=[ + "cat", + "dog", + "fish", + "orange", + "cat orange", + "dog", + "fish", + None, + "spider"])) + + tokey = ToKeyImputer() << 'text' + y = tokey.fit_transform(text_df) + + self.assertEqual(y.loc[7, 'text'], 'dog') + + +if __name__ == '__main__': + unittest.main() diff --git a/src/python/nimbusml/tests/preprocessing/test_tostring.py b/src/python/nimbusml/tests/preprocessing/test_tostring.py new file mode 100644 index 00000000..edb11d63 --- /dev/null +++ b/src/python/nimbusml/tests/preprocessing/test_tostring.py @@ -0,0 +1,37 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License.
+# -------------------------------------------------------------------------------------------- + +import unittest + +import numpy as np +from pandas import DataFrame +from nimbusml.preprocessing import ToString +from sklearn.utils.testing import assert_equal + + +class TestToString(unittest.TestCase): + + def test_tostring(self): + data = {'f0': [4, 4, -1, 9], + 'f1': [5, 5, 3.1, -0.23], + 'f2': [6, 6.7, np.nan, np.nan]} + data = DataFrame(data).astype({'f0': np.int32, + 'f1': np.float32, + 'f2': np.float64}) + + xf = ToString(columns={'f0.out': 'f0', + 'f1.out': 'f1', + 'f2.out': 'f2'}) + result = xf.fit_transform(data) + + assert_equal(result['f0.out'][1], '4') + assert_equal(result['f0.out'][2], '-1') + assert_equal(result['f1.out'][1], '5.000000') + assert_equal(result['f1.out'][2], '3.100000') + assert_equal(result['f2.out'][1], '6.700000') + assert_equal(result['f2.out'][2], 'NaN') + +if __name__ == '__main__': + unittest.main() diff --git a/src/python/nimbusml/tests/timeseries/test_timeseriesimputer.py b/src/python/nimbusml/tests/timeseries/test_timeseriesimputer.py new file mode 100644 index 00000000..98c9e21b --- /dev/null +++ b/src/python/nimbusml/tests/timeseries/test_timeseriesimputer.py @@ -0,0 +1,41 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- + +import unittest + +import numpy as np +import pandas as pd +from nimbusml.timeseries import TimeSeriesImputer + + +class TestTimeSeriesImputer(unittest.TestCase): + + def test_timeseriesimputer_adds_new_row(self): + df = pd.DataFrame(data=dict( + ts=[1, 2, 3, 5], + grain=[1970, 1970, 1970, 1970], + c3=[10, 13, 15, 20], + c4=[19, 12, 16, 19] + )) + + tsi = TimeSeriesImputer(time_series_column='ts', + grain_columns=['grain'], + filter_columns=['c3', 'c4'], + impute_mode='ForwardFill', + filter_mode='Include') + result = tsi.fit_transform(df) + + self.assertEqual(result.loc[0, 'ts'], 1) + self.assertEqual(result.loc[3, 'ts'], 4) + self.assertEqual(result.loc[3, 'grain'], 1970) + self.assertEqual(result.loc[3, 'c3'], 15) + self.assertEqual(result.loc[3, 'c4'], 16) + self.assertEqual(result.loc[3, 'IsRowImputed'], True) + + +if __name__ == '__main__': + unittest.main() diff --git a/src/python/nimbusml/timeseries/__init__.py b/src/python/nimbusml/timeseries/__init__.py index 64e66add..05dbfa3c 100644 --- a/src/python/nimbusml/timeseries/__init__.py +++ b/src/python/nimbusml/timeseries/__init__.py @@ -3,11 +3,13 @@ from .ssaspikedetector import SsaSpikeDetector from .ssachangepointdetector import SsaChangePointDetector from .ssaforecaster import SsaForecaster +from .timeseriesimputer import TimeSeriesImputer __all__ = [ 'IidSpikeDetector', 'IidChangePointDetector', 'SsaSpikeDetector', 'SsaChangePointDetector', - 'SsaForecaster' + 'SsaForecaster', + 'TimeSeriesImputer' ] diff --git a/src/python/nimbusml/timeseries/timeseriesimputer.py b/src/python/nimbusml/timeseries/timeseriesimputer.py new file mode 100644 index 00000000..150b9959 --- /dev/null +++ b/src/python/nimbusml/timeseries/timeseriesimputer.py @@ -0,0 +1,77 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License.
+# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +TimeSeriesImputer +""" + +__all__ = ["TimeSeriesImputer"] + + +from sklearn.base import TransformerMixin + +from ..base_transform import BaseTransform +from ..internal.core.timeseries.timeseriesimputer import \ + TimeSeriesImputer as core +from ..internal.utils.utils import trace + + +class TimeSeriesImputer(core, BaseTransform, TransformerMixin): + """ + **Description** + Fills in missing rows and values + + :param columns: see `Columns `_. + + :param time_series_column: Column representing the time. + + :param grain_columns: List of grain columns. + + :param filter_columns: Columns to filter. + + :param filter_mode: Filter mode. Either include or exclude. + + :param impute_mode: Mode for imputing, defaults to ForwardFill if not + provided. + + :param supress_type_errors: Suppress the errors that would occur if a column + and impute mode are incompatible. If true, will skip the column. If + false, will stop and throw an error. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + time_series_column, + grain_columns, + filter_columns=None, + filter_mode='Exclude', + impute_mode='ForwardFill', + supress_type_errors=False, + columns=None, + **params): + + if columns: + params['columns'] = columns + BaseTransform.__init__(self, **params) + core.__init__( + self, + time_series_column=time_series_column, + grain_columns=grain_columns, + filter_columns=filter_columns, + filter_mode=filter_mode, + impute_mode=impute_mode, + supress_type_errors=supress_type_errors, + **params) + self._columns = columns + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/setup.py b/src/python/setup.py index fc350275..5fc3fcba 100644 --- a/src/python/setup.py +++ b/src/python/setup.py @@ -45,7 +45,7 @@ # Versions should comply with PEP440. For a discussion on # single-sourcing the version across setup.py and the project code, see # https://packaging.python.org/en/latest/single_source_version.html - version='1.5.0', + version='1.5.1', description='NimbusML', long_description=long_description, diff --git a/src/python/tests/test_estimator_checks.py b/src/python/tests/test_estimator_checks.py index 7dfd5eb8..1a835fbc 100644 --- a/src/python/tests/test_estimator_checks.py +++ b/src/python/tests/test_estimator_checks.py @@ -16,9 +16,10 @@ from nimbusml.ensemble import LightGbmRegressor from nimbusml.feature_extraction.text import NGramFeaturizer from nimbusml.internal.entrypoints._ngramextractor_ngram import n_gram +from nimbusml.preprocessing import TensorFlowScorer, DateTimeSplitter from nimbusml.linear_model import SgdBinaryClassifier -from nimbusml.preprocessing import TensorFlowScorer from nimbusml.preprocessing.filter import SkipFilter, TakeFilter +from nimbusml.preprocessing.normalization import RobustScaler from nimbusml.timeseries import (IidSpikeDetector, IidChangePointDetector, SsaSpikeDetector, SsaChangePointDetector, SsaForecaster) @@ -53,6 +54,12 @@ # I8 should not have NA values 'CountSelector': 'check_estimators_dtypes', + # DateTimeSplitter does not work with floating point types.
+ 'DateTimeSplitter': + 'check_transformer_general, check_pipeline_consistency, ' + 'check_estimators_pickle, check_estimators_dtypes, ' + 'check_dict_unchanged, check_dtype_object, check_fit_score_takes_y, ' + 'check_transformer_data_not_an_array', # by design returns smaller number of rows 'SkipFilter': 'check_transformer_general, ' 'check_transformer_data_not_an_array', @@ -154,6 +161,15 @@ 'check_estimators_overwrite_params, \ check_estimator_sparse_data, check_estimators_pickle, ' 'check_estimators_nan_inf', + # RobustScaler does not support vectorized types + 'RobustScaler': 'check_estimator_sparse_data', + 'ToKeyImputer': 'check_estimator_sparse_data', + # Most of these skipped tests are failing because the checks + # require numerical types. ToString returns object types. + # TypeError: ufunc 'isfinite' not supported for the input types + 'ToString': 'check_estimator_sparse_data, check_pipeline_consistency, ' + 'check_transformer_data_not_an_array, check_estimators_pickle, ' + 'check_transformer_general', } OMITTED_CHECKS_TUPLE = ( @@ -191,6 +207,7 @@ 'check_classifiers_train'] INSTANCES = { + 'DateTimeSplitter': DateTimeSplitter(prefix='dt', columns=['F0']), 'EnsembleClassifier': EnsembleClassifier(num_models=3), 'EnsembleRegressor': EnsembleRegressor(num_models=3), 'LightGbmBinaryClassifier': LightGbmBinaryClassifier( @@ -202,6 +219,7 @@ 'LightGbmRanker': LightGbmRanker( minimum_example_count_per_group=1, minimum_example_count_per_leaf=1), 'NGramFeaturizer': NGramFeaturizer(word_feature_extractor=n_gram()), + 'RobustScaler': RobustScaler(scale=False), 'SgdBinaryClassifier': SgdBinaryClassifier(number_of_threads=1, shuffle=False), 'SkipFilter': SkipFilter(count=5), 'TakeFilter': TakeFilter(count=100000), @@ -266,7 +284,8 @@ def load_json(file_path): 'TreeFeaturizer', # skip SymSgdBinaryClassifier for now, because of crashes.
'SymSgdBinaryClassifier', - 'DatasetTransformer' + 'DatasetTransformer', + 'TimeSeriesImputer' ]) epoints = [] diff --git a/src/python/tools/entrypoint_compiler.py b/src/python/tools/entrypoint_compiler.py index ed829533..57b1b8de 100644 --- a/src/python/tools/entrypoint_compiler.py +++ b/src/python/tools/entrypoint_compiler.py @@ -1560,7 +1560,6 @@ def __init__(self, argument, inout): # dict self.default = argument.get('Default', Missing()) self.required = argument.get('Required', Missing()) self.aliases = argument.get('Aliases', Missing()) - self.pass_as = argument.get('PassAs', None) self.name_converted = convert_name(self.name) self.new_name_converted = convert_name( @@ -1615,7 +1614,7 @@ def get_body(self): "is_of_type=numbers.Real" body = template.format( inout=self.inout, - name=self.pass_as or self.name, + name=self.name, name_converted=self.name_converted, none_acceptable=not self.required) if not isinstance(self.range, Missing): @@ -1646,7 +1645,7 @@ def get_body(self): "none_acceptable={none_acceptable}, is_of_type=bool" body = template.format( inout=self.inout, - name=self.pass_as or self.name, + name=self.name, name_converted=self.name_converted, none_acceptable=not self.required) return body + ")" @@ -1693,7 +1692,7 @@ def get_body(self): template += ", is_column=True" body = template.format( inout=self.inout, - name=self.pass_as or self.name, + name=self.name, name_converted=self.name_converted, none_acceptable=not self.required) return body + ")" @@ -1717,7 +1716,7 @@ def get_body(self): "none_acceptable={none_acceptable}, is_of_type=str" body = template.format( inout=self.inout, - name=self.pass_as or self.name, + name=self.name, name_converted=self.name_converted, none_acceptable=not self.required) value_check = ", values={0}".format(str(self.type['Values'])) @@ -1748,7 +1747,7 @@ def get_body(self): "none_acceptable={none_acceptable}, is_of_type=list" body = template.format( inout=self.inout, - name=self.pass_as or self.name, + name=self.name, name_converted=self.name_converted, none_acceptable=not self.required) return body + ")" @@ -1790,7 +1789,7 @@ def get_body(self): template += ', is_column=True' body = template.format( inout=self.inout, - name=self.pass_as or self.name, + name=self.name, name_converted=self.name_converted, none_acceptable=not self.required) return body + ")" @@ -1818,7 +1817,7 @@ def get_body(self): template += ', is_column=True' body = template.format( inout=self.inout, - name=self.pass_as or self.name, + name=self.name, name_converted=self.name_converted, none_acceptable=not self.required) return body + ")" @@ -1846,7 +1845,7 @@ def get_body(self): "none_acceptable={none_acceptable}, is_of_type=dict" body = template.format( inout=self.inout, - name=self.pass_as or self.name, + name=self.name, name_converted=self.name_converted, none_acceptable=not self.required) return body + ")" @@ -1882,7 +1881,7 @@ def get_body(self): template += ", is_column=True" body = template.format( inout=self.inout, - name=self.pass_as or self.name, + name=self.name, name_converted=self.name_converted, none_acceptable=not self.required) field_check = ", field_names={0}".format( @@ -2041,6 +2040,7 @@ def generate_code(pkg_path, generate_entrypoints, generate_api): script_args = arg_parser.parse_args() pkg_path = os.path.join(my_dir, r'..\nimbusml') + if script_args.check_manual_changes: verbose = False if script_args.folder == 'temp': diff --git a/src/python/tools/manifest.json b/src/python/tools/manifest.json index c8e6d6e5..45eb1a38 100644 --- 
a/src/python/tools/manifest.json +++ b/src/python/tools/manifest.json @@ -17301,6 +17301,82 @@ "ITransformOutput" ] }, + { + "Name": "Transforms.CategoryImputer", + "Desc": "Fills in missing values in a column based on the most frequent value", + "FriendlyName": "CategoryImputer", + "ShortName": "CategoryImputer", + "Inputs": [ + { + "Name": "Column", + "Type": { + "Kind": "Array", + "ItemType": { + "Kind": "Struct", + "Fields": [ + { + "Name": "Name", + "Type": "String", + "Desc": "Name of the new column", + "Aliases": [ + "name" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Source", + "Type": "String", + "Desc": "Name of the source column", + "Aliases": [ + "src" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + } + ] + } + }, + "Desc": "New column definition (optional form: name:src)", + "Aliases": [ + "col" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" + }, + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + }, { "Name": "Transforms.CharacterTokenizer", "Desc": "Character-oriented tokenizer where text is considered a sequence of characters.", @@ -18077,6 +18153,157 @@ } ] }, + { + "Name": "Transforms.DateTimeSplitter", + "Desc": "Splits a date time value into each individual component", + "FriendlyName": "DateTime Transform", + "ShortName": "DateTimeTransform", + "Inputs": [ + { + "Name": "Source", + "Type": "String", + "Desc": "Input column", + "Aliases": [ + "src" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "Prefix", + "Type": "String", + "Desc": "Output column prefix", + "Aliases": [ + "pre" + ], + "Required": true, + "SortOrder": 2.0, + "IsNullable": false + }, + { + "Name": "ColumnsToDrop", + "Type": { + "Kind": "Array", + "ItemType": { + "Kind": "Enum", + "Values": [ + "Year", + "Month", + "Day", + "Hour", + "Minute", + "Second", + "AmPm", + "Hour12", + "DayOfWeek", + "DayOfQuarter", + "DayOfYear", + "WeekOfMonth", + "QuarterOfYear", + "HalfOfYear", + "WeekIso", + "YearIso", + "MonthLabel", + "AmPmLabel", + "DayOfWeekLabel", + "HolidayName", + "IsPaidTimeOff" + ] + } + }, + "Desc": "Columns to drop after the DateTime Expansion", + "Aliases": [ + "drop" + ], + "Required": false, + "SortOrder": 3.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Country", + "Type": { + "Kind": "Enum", + "Values": [ + "None", + "Argentina", + "Australia", + "Austria", + "Belarus", + "Belgium", + "Brazil", + "Canada", + "Colombia", + "Croatia", + "Czech", + "Denmark", + "England", + "Finland", + "France", + "Germany", + "Hungary", + "India", + "Ireland", + "IsleofMan", + "Italy", + "Japan", + "Mexico", + "Netherlands", + "NewZealand", + "NorthernIreland", + "Norway", + "Poland", + "Portugal", + "Scotland", + "Slovenia", + "SouthAfrica", + "Spain", + "Sweden", + "Switzerland", + "Ukraine", + "UnitedKingdom", + "UnitedStates", + "Wales" + ] + }, + "Desc": "Country to get holidays for. 
Defaults to none if not passed", + "Aliases": [ + "ctry" + ], + "Required": false, + "SortOrder": 4.0, + "IsNullable": false, + "Default": "None" + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" + }, + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + }, { "Name": "Transforms.Dictionarizer", "Desc": "Converts input values (words, numbers, etc.) to index in a dictionary.", @@ -21931,6 +22158,130 @@ "ITransformOutput" ] }, + { + "Name": "Transforms.RobustScaler", + "Desc": "Removes the median and scales the data according to the quantile range.", + "FriendlyName": "RobustScalerTransformer", + "ShortName": "RobustScalerTransformer", + "Inputs": [ + { + "Name": "Column", + "Type": { + "Kind": "Array", + "ItemType": { + "Kind": "Struct", + "Fields": [ + { + "Name": "Name", + "Type": "String", + "Desc": "Name of the new column", + "Aliases": [ + "name" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Source", + "Type": "String", + "Desc": "Name of the source column", + "Aliases": [ + "src" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + } + ] + } + }, + "Desc": "New column definition (optional form: name:src)", + "Aliases": [ + "col" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "Center", + "Type": "Bool", + "Desc": "If True, center the data before scaling.", + "Aliases": [ + "ctr" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": true + }, + { + "Name": "Scale", + "Type": "Bool", + "Desc": "If True, scale the data to interquartile range.", + "Aliases": [ + "sc" + ], + "Required": false, + "SortOrder": 3.0, + "IsNullable": false, + "Default": true + }, + { + "Name": "QuantileMin", + "Type": "Float", + "Desc": "Min for the quantile range used to calculate scale.", + "Aliases": [ + "min" + ], + "Required": false, + "SortOrder": 4.0, + "IsNullable": false, + "Default": 25.0 + }, + { + "Name": "QuantileMax", + "Type": "Float", + "Desc": "Max for the quantile range used to calculate scale.", + "Aliases": [ + "max" + ], + "Required": false, + "SortOrder": 5.0, + "IsNullable": false, + "Default": 75.0 + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" + }, + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + }, { "Name": "Transforms.RowRangeFilter", "Desc": "Filters a dataview on a column of type Single, Double or Key (contiguous). Keeps the values that are in the specified min/max range. NaNs are always filtered out. 
If the input is a Key type, the min/max are considered percentages of the number of values.", @@ -22941,6 +23292,207 @@ "ITransformOutput" ] }, + { + "Name": "Transforms.TimeSeriesImputer", + "Desc": "Fills in missing rows and values", + "FriendlyName": "TimeSeriesImputer", + "ShortName": "TimeSeriesImputer", + "Inputs": [ + { + "Name": "TimeSeriesColumn", + "Type": "String", + "Desc": "Column representing the time", + "Aliases": [ + "time" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "GrainColumns", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "List of grain columns", + "Aliases": [ + "grains" + ], + "Required": true, + "SortOrder": 2.0, + "IsNullable": false + }, + { + "Name": "FilterColumns", + "Type": { + "Kind": "Array", + "ItemType": "String" + }, + "Desc": "Columns to filter", + "Aliases": [ + "filters" + ], + "Required": false, + "SortOrder": 2.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "FilterMode", + "Type": { + "Kind": "Enum", + "Values": [ + "NoFilter", + "Include", + "Exclude" + ] + }, + "Desc": "Filter mode. Either include or exclude", + "Aliases": [ + "fmode" + ], + "Required": false, + "SortOrder": 3.0, + "IsNullable": false, + "Default": "Exclude" + }, + { + "Name": "ImputeMode", + "Type": { + "Kind": "Enum", + "Values": [ + "ForwardFill", + "BackFill", + "Median", + "Interpolate" + ] + }, + "Desc": "Mode for imputing, defaults to ForwardFill if not provided", + "Aliases": [ + "mode" + ], + "Required": false, + "SortOrder": 3.0, + "IsNullable": false, + "Default": "ForwardFill" + }, + { + "Name": "SupressTypeErrors", + "Type": "Bool", + "Desc": "Suppress the errors that would occur if a column and impute mode are incompatible. If true, will skip the column.
If false, will stop and throw an error.", + "Aliases": [ + "error" + ], + "Required": false, + "SortOrder": 3.0, + "IsNullable": false, + "Default": false + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" + }, + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + }, + { + "Name": "Transforms.ToString", + "Desc": "Turns the given column into a column of its string representation", + "FriendlyName": "ToString Transform", + "ShortName": "ToStringTransform", + "Inputs": [ + { + "Name": "Column", + "Type": { + "Kind": "Array", + "ItemType": { + "Kind": "Struct", + "Fields": [ + { + "Name": "Name", + "Type": "String", + "Desc": "Name of the new column", + "Aliases": [ + "name" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + }, + { + "Name": "Source", + "Type": "String", + "Desc": "Name of the source column", + "Aliases": [ + "src" + ], + "Required": false, + "SortOrder": 150.0, + "IsNullable": false, + "Default": null + } + ] + } + }, + "Desc": "New column definition (optional form: name:src)", + "Aliases": [ + "col" + ], + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + }, + { + "Name": "Data", + "Type": "DataView", + "Desc": "Input dataset", + "Required": true, + "SortOrder": 1.0, + "IsNullable": false + } + ], + "Outputs": [ + { + "Name": "OutputData", + "Type": "DataView", + "Desc": "Transformed dataset" + }, + { + "Name": "Model", + "Type": "TransformModel", + "Desc": "Transform model" + } + ], + "InputKind": [ + "ITransformInput" + ], + "OutputKind": [ + "ITransformOutput" + ] + }, { "Name": "Transforms.TrainTestDatasetSplitter", "Desc": "Split the dataset into train and test sets", diff --git a/src/python/tools/manifest_diff.json b/src/python/tools/manifest_diff.json index cddfaf25..d5b0b3a3 100644 --- a/src/python/tools/manifest_diff.json +++ b/src/python/tools/manifest_diff.json @@ -294,6 +294,24 @@ "Module": "preprocessing", "Type": "Transform" }, + { + "Name": "Transforms.CategoryImputer", + "NewName": "ToKeyImputer", + "Module": "preprocessing", + "Type": "Transform" + }, + { + "Name": "Transforms.ToString", + "NewName": "ToString", + "Module": "preprocessing", + "Type": "Transform" + }, + { + "Name": "Transforms.DateTimeSplitter", + "NewName": "DateTimeSplitter", + "Module": "preprocessing", + "Type": "Transform" + }, { "Name": "Transforms.TensorFlowScorer", "NewName": "TensorFlowScorer", @@ -493,6 +511,12 @@ "Module": "preprocessing.normalization", "Type": "Transform" }, + { + "Name": "Transforms.RobustScaler", + "NewName": "RobustScaler", + "Module": "preprocessing.normalization", + "Type": "Transform" + }, { "Name": "Transforms.MissingValuesRowDropper", "NewName": "Filter", @@ -610,6 +634,12 @@ "Module": "timeseries", "Type": "Transform" }, + { + "Name": "Transforms.TimeSeriesImputer", + "NewName": "TimeSeriesImputer", + "Module": "timeseries", + "Type": "Transform" + }, { "Name": "Trainers.PoissonRegressor", "NewName": "PoissonRegressionRegressor", diff --git a/version.txt b/version.txt index 3e1ad720..8e03717d 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -1.5.0 \ No newline at end of file +1.5.1 \ No newline at end of file
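Usage sketch for the transforms added in this change (illustrative only: the frame, column names, and pipeline composition below are hypothetical and not part of the diff; the APIs match the signatures introduced above):

    import pandas as pd
    from nimbusml import Pipeline
    from nimbusml.preprocessing import ToString
    from nimbusml.preprocessing.normalization import RobustScaler
    from nimbusml.timeseries import TimeSeriesImputer

    # A short series with a missing time step (ts == 4).
    df = pd.DataFrame(dict(ts=[1, 2, 3, 5],
                           grain=[1970, 1970, 1970, 1970],
                           c0=[10, 13, 15, 20]))

    # Impute the missing row by forward fill, scale c0 by its median and
    # interquartile range, then render the scaled values as strings.
    pipeline = Pipeline([
        TimeSeriesImputer(time_series_column='ts',
                          grain_columns=['grain'],
                          filter_columns=['c0'],
                          filter_mode='Include',
                          impute_mode='ForwardFill'),
        RobustScaler(columns='c0'),
        ToString(columns={'c0.str': 'c0'}),
    ])
    result = pipeline.fit_transform(df)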