diff --git a/README.md b/README.md index 8551dafc..1ec683ab 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # NimbusML -`nimbusml` is a Python module that provides experimental Python bindings for [ML.NET](https://github.com/dotnet/machinelearning). +`nimbusml` is a Python module that provides Python bindings for [ML.NET](https://github.com/dotnet/machinelearning). ML.NET was originally developed in Microsoft Research and is used across many product groups in Microsoft like Windows, Bing, PowerPoint, Excel and others. `nimbusml` was built to enable data science teams that are more familiar with Python to take advantage of ML.NET's functionality and performance. diff --git a/build.cmd b/build.cmd index f58ca80f..e21dc5ec 100644 --- a/build.cmd +++ b/build.cmd @@ -311,6 +311,14 @@ copy "%BuildOutputDir%%Configuration%\pybridge.pyd" "%__currentScriptDir%src\py if %PythonVersion% == 2.7 ( copy "%BuildOutputDir%%Configuration%\Platform\win-x64\publish\*.dll" "%__currentScriptDir%src\python\nimbusml\internal\libs\" + :: remove dataprep dlls as its not supported in python 2.7 + del "%__currentScriptDir%src\python\nimbusml\internal\libs\Microsoft.DPrep.*" + del "%__currentScriptDir%src\python\nimbusml\internal\libs\Microsoft.Data.*" + del "%__currentScriptDir%src\python\nimbusml\internal\libs\Microsoft.ProgramSynthesis.*" + del "%__currentScriptDir%src\python\nimbusml\internal\libs\Microsoft.DataPrep.dll" + del "%__currentScriptDir%src\python\nimbusml\internal\libs\ExcelDataReader.dll" + del "%__currentScriptDir%src\python\nimbusml\internal\libs\Microsoft.WindowsAzure.Storage.dll" + del "%__currentScriptDir%src\python\nimbusml\internal\libs\Microsoft.Workbench.Messaging.SDK.dll" ) else ( for /F "tokens=*" %%A in (build/libs_win.txt) do copy "%BuildOutputDir%%Configuration%\Platform\win-x64\publish\%%A" "%__currentScriptDir%src\python\nimbusml\internal\libs\" ) diff --git a/build.sh b/build.sh index e36a534a..3f47be49 100755 --- a/build.sh +++ b/build.sh @@ -219,6 +219,14 @@ then ext=*.dylib fi cp "${BuildOutputDir}/${__configuration}/Platform/${PublishDir}"/publish/${ext} "${__currentScriptDir}/src/python/nimbusml/internal/libs/" + # remove dataprep dlls as its not supported in python 2.7 + rm -f "${__currentScriptDir}/src/python/nimbusml/internal/libs/Microsoft.DPrep.*" + rm -f "${__currentScriptDir}/src/python/nimbusml/internal/libs/Microsoft.Data.*" + rm -f "${__currentScriptDir}/src/python/nimbusml/internal/libs/Microsoft.ProgramSynthesis.*" + rm -f "${__currentScriptDir}/src/python/nimbusml/internal/libs/Microsoft.DataPrep.dll" + rm -f "${__currentScriptDir}/src/python/nimbusml/internal/libs/ExcelDataReader.dll" + rm -f "${__currentScriptDir}/src/python/nimbusml/internal/libs/Microsoft.WindowsAzure.Storage.dll" + rm -f "${__currentScriptDir}/src/python/nimbusml/internal/libs/Microsoft.Workbench.Messaging.SDK.dll" else libs_txt=libs_linux.txt if [ "$(uname -s)" = "Darwin" ] diff --git a/src/DotNetBridge/Bridge.cs b/src/DotNetBridge/Bridge.cs index 40220cc8..8ae167f5 100644 --- a/src/DotNetBridge/Bridge.cs +++ b/src/DotNetBridge/Bridge.cs @@ -186,12 +186,6 @@ private enum FnId Generic = 2, } -#if !CORECLR - // The hosting code invokes this to get a specific entry point. - [UnmanagedFunctionPointer(CallingConvention.StdCall)] - private delegate IntPtr NativeFnGetter(FnId id); -#endif - #region Callbacks to native // Call back to provide messages to native code. @@ -236,8 +230,9 @@ private struct EnvironmentBlock [FieldOffset(0x18)] public readonly void* modelSink; + //Max slots to return for vector valued columns(<=0 to return all). [FieldOffset(0x20)] - public readonly int maxThreadsAllowed; + public readonly int maxSlots; // Call back to provide cancel flag. [FieldOffset(0x28)] @@ -252,41 +247,14 @@ private struct EnvironmentBlock [UnmanagedFunctionPointer(CallingConvention.StdCall)] private unsafe delegate int NativeGeneric(EnvironmentBlock* penv, sbyte* psz, int cdata, DataSourceBlock** ppdata); -#if !CORECLR - private static NativeFnGetter FnGetter; -#endif private static NativeGeneric FnGeneric; private static TDel MarshalDelegate(void* pv) { Contracts.Assert(typeof(TDel).IsSubclassOf(typeof(Delegate))); Contracts.Assert(pv != null); -#if CORECLR return Marshal.GetDelegateForFunctionPointer((IntPtr)pv); -#else - return (TDel)(object)Marshal.GetDelegateForFunctionPointer((IntPtr)pv, typeof(TDel)); -#endif - } - -#if !CORECLR - /// - /// This is the bootstrapping entry point. It's labeled private but is actually invoked from the native - /// code to poke the address of the FnGetter callback into the address encoded in the string parameter. - /// This odd way of doing things is because the most convenient way to call an initial managed method - /// imposes the signature of Func{string, int}, which doesn't allow us to return a function adress. - /// - private static unsafe int GetFnGetterCallback(string addr) - { - if (FnGetter == null) - Interlocked.CompareExchange(ref FnGetter, (NativeFnGetter)GetFn, null); - long a = long.Parse(addr); - IntPtr* p = null; - IntPtr** pp = &p; - *(long*)pp = a; - *p = Marshal.GetFunctionPointerForDelegate(FnGetter); - return 1; } -#endif /// /// This is the main FnGetter function. Given an FnId value, it returns a native-callable @@ -397,7 +365,7 @@ private static unsafe int GenericExec(EnvironmentBlock* penv, sbyte* psz, int cd // Wrap the data sets. ch.Trace("Wrapping native data sources"); ch.Trace("Executing"); - ExecCore(penv, host, ch, graph, cdata, ppdata); + RunGraphCore(penv, host, graph, cdata, ppdata); } catch (Exception e) { @@ -420,24 +388,6 @@ private static unsafe int GenericExec(EnvironmentBlock* penv, sbyte* psz, int cd return 0; } - private static void CheckModel(IHost host, byte** ppModelBin, long* pllModelBinLen, int i) - { - host.CheckParam( - ppModelBin != null && ppModelBin[i] != null - && pllModelBinLen != null && pllModelBinLen[i] > 0, "pModelBin", "Model is missing"); - } - - private static void ExecCore(EnvironmentBlock* penv, IHost host, IChannel ch, string graph, int cdata, DataSourceBlock** ppdata) - { - Contracts.AssertValue(ch); - ch.AssertValue(host); - ch.AssertNonEmpty(graph); - ch.Assert(cdata >= 0); - ch.Assert(ppdata != null || cdata == 0); - - RunGraphCore(penv, host, graph, cdata, ppdata); - } - /// /// Convert UTF8 bytes with known length to ROM. Negative length unsupported. /// @@ -483,25 +433,7 @@ internal static string BytesToString(sbyte* psz) if (cch == 0) return null; -#if CORECLR - return Encoding.UTF8.GetString((byte*)psz, cch); -#else - if (cch <= 0) - return ""; - - var decoder = Encoding.UTF8.GetDecoder(); - var chars = new char[decoder.GetCharCount((byte*)psz, cch, true)]; - int bytesUsed; - int charsUsed; - bool complete; - fixed (char* pchars = chars) - decoder.Convert((byte*)psz, cch, pchars, chars.Length, true, out bytesUsed, out charsUsed, out complete); - Contracts.Assert(bytesUsed == cch); - Contracts.Assert(charsUsed == chars.Length); - Contracts.Assert(complete); - return new string(chars); -#endif } /// diff --git a/src/DotNetBridge/RmlEnvironment.cs b/src/DotNetBridge/RmlEnvironment.cs index d2e861fe..36f44240 100644 --- a/src/DotNetBridge/RmlEnvironment.cs +++ b/src/DotNetBridge/RmlEnvironment.cs @@ -55,7 +55,6 @@ protected override IHost RegisterCore(HostEnvironmentBase source public RmlEnvironment(Bridge.CheckCancelled checkDelegate, int? seed = null, bool verbose = false) : this(RandomUtils.Create(seed), verbose) { - CheckCancelled = checkDelegate; } diff --git a/src/DotNetBridge/RunGraph.cs b/src/DotNetBridge/RunGraph.cs index 1fae7872..97c39730 100644 --- a/src/DotNetBridge/RunGraph.cs +++ b/src/DotNetBridge/RunGraph.cs @@ -27,30 +27,6 @@ public unsafe static partial class Bridge // std:null specifier in a graph, used to redirect output to std::null const string STDNULL = ""; - private sealed class RunGraphArgs - { -#pragma warning disable 649 // never assigned - [Argument(ArgumentType.AtMostOnce)] - public string graph; - - [Argument(ArgumentType.LastOccurenceWins, HelpText = "Desired degree of parallelism in the data pipeline", ShortName = "conc")] - public int? parallel; - - [Argument(ArgumentType.AtMostOnce, HelpText = "Random seed", ShortName = "seed")] - public int? randomSeed; - - [Argument(ArgumentType.AtMostOnce, ShortName = "lab")] - public string labelColumn; //not used - - [Argument(ArgumentType.Multiple, ShortName = "feat")] - public string[] featureColumn; //not used - - [Argument(ArgumentType.AtMostOnce, HelpText = "Max slots to return for vector valued columns (<=0 to return all)")] - public int maxSlots = -1; - -#pragma warning restore 649 // never assigned - } - private static void SaveIdvToFile(IDataView idv, string path, IHost host) { if (path == STDNULL) @@ -90,19 +66,11 @@ private static void RunGraphCore(EnvironmentBlock* penv, IHostEnvironment env, s { Contracts.AssertValue(env); - var args = new RunGraphArgs(); - string err = null; - if (!CmdParser.ParseArguments(env, graphStr, args, e => err = err ?? e)) - throw env.Except(err); - - int? maxThreadsAllowed = Math.Min(args.parallel > 0 ? args.parallel.Value : penv->maxThreadsAllowed, penv->maxThreadsAllowed); - maxThreadsAllowed = penv->maxThreadsAllowed > 0 ? maxThreadsAllowed : args.parallel; - var host = env.Register("RunGraph", args.randomSeed, null); - + var host = env.Register("RunGraph", penv->seed, null); JObject graph; try { - graph = JObject.Parse(args.graph); + graph = JObject.Parse(graphStr); } catch (JsonReaderException ex) { @@ -221,7 +189,7 @@ private static void RunGraphCore(EnvironmentBlock* penv, IHostEnvironment env, s } else { - var infos = ProcessColumns(ref idv, args.maxSlots, host); + var infos = ProcessColumns(ref idv, penv->maxSlots, host); SendViewToNative(ch, penv, idv, infos); } break; diff --git a/src/NativeBridge/DataViewInterop.cpp b/src/NativeBridge/DataViewInterop.cpp index dd349012..9b537e22 100644 --- a/src/NativeBridge/DataViewInterop.cpp +++ b/src/NativeBridge/DataViewInterop.cpp @@ -7,317 +7,317 @@ DataSourceBlock::DataSourceBlock(bp::dict& data) { - // Assert that this class doesn't have a vtable. - assert(offsetof(DataSourceBlock, ccol) == 0); + // Assert that this class doesn't have a vtable. + assert(offsetof(DataSourceBlock, ccol) == 0); - CxInt64 llTotalNumRows = -1; - assert(data.contains(PYTHON_DATA_KEY_INFO)); - bp::dict varInfo = bp::extract(data[PYTHON_DATA_KEY_INFO]); + CxInt64 llTotalNumRows = -1; + assert(data.contains(PYTHON_DATA_KEY_INFO)); + bp::dict varInfo = bp::extract(data[PYTHON_DATA_KEY_INFO]); - assert(data.contains(PYTHON_DATA_COL_TYPES)); - bp::list colTypes = bp::extract(data[PYTHON_DATA_COL_TYPES]); + assert(data.contains(PYTHON_DATA_COL_TYPES)); + bp::list colTypes = bp::extract(data[PYTHON_DATA_COL_TYPES]); - bp::stl_input_iterator keys(data.keys()), end1; - bp::stl_input_iterator values(data.values()); - CxInt64 dataframeColCount = -1; - for (; keys != end1; keys++) - { - bp::object key = *keys; - char* name = bp::extract(key); - bp::object value = *values++; - if (strcmp(name, PYTHON_DATA_KEY_INFO) == 0 || strcmp(name, PYTHON_DATA_COL_TYPES) == 0) - continue; + bp::stl_input_iterator keys(data.keys()), end1; + bp::stl_input_iterator values(data.values()); + CxInt64 dataframeColCount = -1; + for (; keys != end1; keys++) + { + bp::object key = *keys; + char* name = bp::extract(key); + bp::object value = *values++; + if (strcmp(name, PYTHON_DATA_KEY_INFO) == 0 || strcmp(name, PYTHON_DATA_COL_TYPES) == 0) + continue; - // now it should be a column names - std::string colName = bp::extract(key); - dataframeColCount++; - auto tp = bp::extract(colTypes[dataframeColCount]); - ML_PY_TYPE_MAP_ENUM colType = static_cast(tp[0]); + // now it should be a column names + std::string colName = bp::extract(key); + dataframeColCount++; + auto tp = bp::extract(colTypes[dataframeColCount]); + ML_PY_TYPE_MAP_ENUM colType = static_cast(tp[0]); - BYTE kind; - void *pgetter; - bool isKey = false; - bool isNumeric = false; - bool isText = false; - CxInt64 vecCard = -1; - // Numeric or bool values. - if (bp::extract(value).check()) - { - isNumeric = true; - np::ndarray val = bp::extract(value); - switch (colType) - { - case (ML_PY_BOOL): - kind = BL; - pgetter = (void*)&GetBL; - break; - case (ML_PY_BOOL64): - kind = BL; - pgetter = (void*)&GetBL64; - break; - case (ML_PY_UINT8): - kind = U1; - pgetter = (void*)&GetU1; - break; - case (ML_PY_UINT16): - kind = U2; - pgetter = (void*)&GetU2; - break; - case (ML_PY_UINT32): - kind = U4; - pgetter = (void*)&GetU4; - break; - case (ML_PY_UINT64): - kind = U8; - pgetter = (void*)&GetU8; - break; - case (ML_PY_INT8): - kind = I1; - pgetter = (void*)&GetI1; - break; - case (ML_PY_INT16): - kind = I2; - pgetter = (void*)&GetI2; - break; - case (ML_PY_INT32): - kind = I4; - pgetter = (void*)&GetI4; - break; - case (ML_PY_INT64): - kind = I8; - pgetter = (void*)&GetI8; - break; - case (ML_PY_FLOAT16): - // What to do with numpy.float16 ? - throw std::invalid_argument("numpy.float16 data type is not supported"); - case (ML_PY_FLOAT32): - kind = R4; - pgetter = (void*)&GetR4; - break; - case (ML_PY_FLOAT64): - kind = R8; - pgetter = (void*)&GetR8; - break; - default: - throw std::invalid_argument("column " + colName + " has unsupported type"); - } - const char *data = val.get_data(); - this->_vdata.push_back(data); + BYTE kind; + void *pgetter; + bool isKey = false; + bool isNumeric = false; + bool isText = false; + CxInt64 vecCard = -1; + // Numeric or bool values. + if (bp::extract(value).check()) + { + isNumeric = true; + np::ndarray val = bp::extract(value); + switch (colType) + { + case (ML_PY_BOOL): + kind = BL; + pgetter = (void*)&GetBL; + break; + case (ML_PY_BOOL64): + kind = BL; + pgetter = (void*)&GetBL64; + break; + case (ML_PY_UINT8): + kind = U1; + pgetter = (void*)&GetU1; + break; + case (ML_PY_UINT16): + kind = U2; + pgetter = (void*)&GetU2; + break; + case (ML_PY_UINT32): + kind = U4; + pgetter = (void*)&GetU4; + break; + case (ML_PY_UINT64): + kind = U8; + pgetter = (void*)&GetU8; + break; + case (ML_PY_INT8): + kind = I1; + pgetter = (void*)&GetI1; + break; + case (ML_PY_INT16): + kind = I2; + pgetter = (void*)&GetI2; + break; + case (ML_PY_INT32): + kind = I4; + pgetter = (void*)&GetI4; + break; + case (ML_PY_INT64): + kind = I8; + pgetter = (void*)&GetI8; + break; + case (ML_PY_FLOAT16): + // What to do with numpy.float16 ? + throw std::invalid_argument("numpy.float16 data type is not supported"); + case (ML_PY_FLOAT32): + kind = R4; + pgetter = (void*)&GetR4; + break; + case (ML_PY_FLOAT64): + kind = R8; + pgetter = (void*)&GetR8; + break; + default: + throw std::invalid_argument("column " + colName + " has unsupported type"); + } + const char *data = val.get_data(); + this->_vdata.push_back(data); - assert(this->_mpnum.size() == dataframeColCount); - this->_mpnum.push_back(_vdata.size() - 1); - if (llTotalNumRows == -1) - llTotalNumRows = val.shape(0); - else - assert(llTotalNumRows == val.shape(0)); - } - // Text or key values. - else if (bp::extract(value).check()) - { - bp::list list = bp::extract(value); + assert(this->_mpnum.size() == dataframeColCount); + this->_mpnum.push_back(_vdata.size() - 1); + if (llTotalNumRows == -1) + llTotalNumRows = val.shape(0); + else + assert(llTotalNumRows == val.shape(0)); + } + // Text or key values. + else if (bp::extract(value).check()) + { + bp::list list = bp::extract(value); - // Key values. - switch (colType) - { - case (ML_PY_CAT): - if (varInfo.contains(colName)) - { - isKey = true; - assert(bp::extract(varInfo[colName]).check()); - bp::list keyNames = bp::extract(varInfo[colName]); + // Key values. + switch (colType) + { + case (ML_PY_CAT): + if (varInfo.contains(colName)) + { + isKey = true; + assert(bp::extract(varInfo[colName]).check()); + bp::list keyNames = bp::extract(varInfo[colName]); - kind = U4; - pgetter = (void*)GetKeyInt; + kind = U4; + pgetter = (void*)GetKeyInt; - // TODO: Handle vectors. - this->_vkeyCard.push_back(len(keyNames)); - //this->_vvecCard.push_back(vecCard); - this->_vkeydata.push_back(list); - this->_vkeynames.push_back(keyNames); + // TODO: Handle vectors. + this->_vkeyCard.push_back(len(keyNames)); + //this->_vvecCard.push_back(vecCard); + this->_vkeydata.push_back(list); + this->_vkeynames.push_back(keyNames); - assert(this->_mpkey.size() == dataframeColCount); - this->_mpkey.push_back(_vkeydata.size() - 1); - if (llTotalNumRows == -1) - llTotalNumRows = len(list); - else - assert(llTotalNumRows == len(list)); - } - else - continue; - break; - // Text values. - case (ML_PY_TEXT): - case (ML_PY_UNICODE): - isText = true; - kind = TX; - if (colType == ML_PY_TEXT) - pgetter = (void*)GetTX; - else // colType is "unicode" - // in python 2.7 strings can be passed as unicode bytestring (NOT the same as UTF8 encoded strings) - pgetter = (void*)GetUnicodeTX; + assert(this->_mpkey.size() == dataframeColCount); + this->_mpkey.push_back(_vkeydata.size() - 1); + if (llTotalNumRows == -1) + llTotalNumRows = len(list); + else + assert(llTotalNumRows == len(list)); + } + else + continue; + break; + // Text values. + case (ML_PY_TEXT): + case (ML_PY_UNICODE): + isText = true; + kind = TX; + if (colType == ML_PY_TEXT) + pgetter = (void*)GetTX; + else // colType is "unicode" + // in python 2.7 strings can be passed as unicode bytestring (NOT the same as UTF8 encoded strings) + pgetter = (void*)GetUnicodeTX; - // TODO: Handle vectors. - //this->_vvecCard.push_back(vecCard); - this->_vtextdata.push_back(list); + // TODO: Handle vectors. + //this->_vvecCard.push_back(vecCard); + this->_vtextdata.push_back(list); - assert(this->_mptxt.size() == dataframeColCount); - this->_mptxt.push_back(_vtextdata.size() - 1); - if (llTotalNumRows == -1) - llTotalNumRows = len(list); - else - assert(llTotalNumRows == len(list)); - break; - default: - throw std::invalid_argument("column " + colName + " has unsupported type"); - } - } - // A sparse vector. - else if (bp::extract(value).check()) - { - bp::dict sparse = bp::extract(value); - np::ndarray indices = bp::extract(sparse["indices"]); - _sparseIndices = (int*)indices.get_data(); - np::ndarray indptr = bp::extract(sparse["indptr"]); - _indPtr = (int*)indptr.get_data(); + assert(this->_mptxt.size() == dataframeColCount); + this->_mptxt.push_back(_vtextdata.size() - 1); + if (llTotalNumRows == -1) + llTotalNumRows = len(list); + else + assert(llTotalNumRows == len(list)); + break; + default: + throw std::invalid_argument("column " + colName + " has unsupported type"); + } + } + // A sparse vector. + else if (bp::extract(value).check()) + { + bp::dict sparse = bp::extract(value); + np::ndarray indices = bp::extract(sparse["indices"]); + _sparseIndices = (int*)indices.get_data(); + np::ndarray indptr = bp::extract(sparse["indptr"]); + _indPtr = (int*)indptr.get_data(); - np::ndarray values = bp::extract(sparse["values"]); - _sparseValues = values.get_data(); - switch (colType) - { - case (ML_PY_BOOL): - kind = BL; - pgetter = (void*)&GetBLVector; - break; - case (ML_PY_UINT8): - kind = U1; - pgetter = (void*)&GetU1Vector; - break; - case (ML_PY_UINT16): - kind = U2; - pgetter = (void*)&GetU2Vector; - break; - case (ML_PY_UINT32): - kind = U4; - pgetter = (void*)&GetU4Vector; - break; - case (ML_PY_UINT64): - kind = U8; - pgetter = (void*)&GetU8Vector; - break; - case (ML_PY_INT8): - kind = I1; - pgetter = (void*)&GetI1Vector; - break; - case (ML_PY_INT16): - kind = I2; - pgetter = (void*)&GetI2Vector; - break; - case (ML_PY_INT32): - kind = I4; - pgetter = (void*)&GetI4Vector; - break; - case (ML_PY_INT64): - kind = I8; - pgetter = (void*)&GetI8Vector; - break; - case (ML_PY_FLOAT16): - throw std::invalid_argument("numpy.float16 data type is not supported in sparse data"); - case (ML_PY_FLOAT32): - kind = R4; - pgetter = (void*)&GetR4Vector; - break; - case (ML_PY_FLOAT64): - kind = R8; - pgetter = (void*)&GetR8Vector; - break; - default: - throw std::invalid_argument("column " + colName + " has unsupported type"); - } - vecCard = bp::extract(sparse["colCount"]); - name = (char*)"Data"; + np::ndarray values = bp::extract(sparse["values"]); + _sparseValues = values.get_data(); + switch (colType) + { + case (ML_PY_BOOL): + kind = BL; + pgetter = (void*)&GetBLVector; + break; + case (ML_PY_UINT8): + kind = U1; + pgetter = (void*)&GetU1Vector; + break; + case (ML_PY_UINT16): + kind = U2; + pgetter = (void*)&GetU2Vector; + break; + case (ML_PY_UINT32): + kind = U4; + pgetter = (void*)&GetU4Vector; + break; + case (ML_PY_UINT64): + kind = U8; + pgetter = (void*)&GetU8Vector; + break; + case (ML_PY_INT8): + kind = I1; + pgetter = (void*)&GetI1Vector; + break; + case (ML_PY_INT16): + kind = I2; + pgetter = (void*)&GetI2Vector; + break; + case (ML_PY_INT32): + kind = I4; + pgetter = (void*)&GetI4Vector; + break; + case (ML_PY_INT64): + kind = I8; + pgetter = (void*)&GetI8Vector; + break; + case (ML_PY_FLOAT16): + throw std::invalid_argument("numpy.float16 data type is not supported in sparse data"); + case (ML_PY_FLOAT32): + kind = R4; + pgetter = (void*)&GetR4Vector; + break; + case (ML_PY_FLOAT64): + kind = R8; + pgetter = (void*)&GetR8Vector; + break; + default: + throw std::invalid_argument("column " + colName + " has unsupported type"); + } + vecCard = bp::extract(sparse["colCount"]); + name = (char*)"Data"; - if (llTotalNumRows == -1) - llTotalNumRows = len(indptr) - 1; - else - assert(llTotalNumRows == len(indptr) - 1); - } - else - throw std::invalid_argument("unsupported data type provided"); + if (llTotalNumRows == -1) + llTotalNumRows = len(indptr) - 1; + else + assert(llTotalNumRows == len(indptr) - 1); + } + else + throw std::invalid_argument("unsupported data type provided"); - this->_vgetter.push_back(pgetter); - this->_vname.push_back(name); - this->_vkind.push_back(kind); - _vvecCard.push_back(vecCard); + this->_vgetter.push_back(pgetter); + this->_vname.push_back(name); + this->_vkind.push_back(kind); + _vvecCard.push_back(vecCard); - if (!isNumeric) - { - assert(this->_mpnum.size() == dataframeColCount); - this->_mpnum.push_back(-1); - } - if (!isKey) - { - assert(this->_mpkey.size() == dataframeColCount); - this->_mpkey.push_back(-1); - this->_vkeyCard.push_back(-1); - } - if (!isText) - { - assert(this->_mptxt.size() == dataframeColCount); - this->_mptxt.push_back(-1); - } - } + if (!isNumeric) + { + assert(this->_mpnum.size() == dataframeColCount); + this->_mpnum.push_back(-1); + } + if (!isKey) + { + assert(this->_mpkey.size() == dataframeColCount); + this->_mpkey.push_back(-1); + this->_vkeyCard.push_back(-1); + } + if (!isText) + { + assert(this->_mptxt.size() == dataframeColCount); + this->_mptxt.push_back(-1); + } + } - assert(_vname.size() <= (size_t)(dataframeColCount + 1)); + assert(_vname.size() <= (size_t)(dataframeColCount + 1)); - this->crow = llTotalNumRows; - this->ccol = this->_vname.size(); - this->getLabels = &GetKeyNames; + this->crow = llTotalNumRows; + this->ccol = this->_vname.size(); + this->getLabels = &GetKeyNames; - assert(this->ccol == this->_vkind.size()); - assert(this->ccol == this->_vkeyCard.size()); - assert(this->ccol == this->_vgetter.size()); + assert(this->ccol == this->_vkind.size()); + assert(this->ccol == this->_vkeyCard.size()); + assert(this->ccol == this->_vgetter.size()); - // This is used in Revo, but seems to not be needed here. - this->ids = nullptr; + // This is used in Revo, but seems to not be needed here. + this->ids = nullptr; - if (this->ccol > 0) - { - this->names = &this->_vname[0]; - this->kinds = &this->_vkind[0]; - this->keyCards = &this->_vkeyCard[0]; - this->vecCards = &this->_vvecCard[0]; - this->getters = &this->_vgetter[0]; - } - else - { - this->names = nullptr; - this->kinds = nullptr; - this->keyCards = nullptr; - this->vecCards = nullptr; - this->getters = nullptr; - } + if (this->ccol > 0) + { + this->names = &this->_vname[0]; + this->kinds = &this->_vkind[0]; + this->keyCards = &this->_vkeyCard[0]; + this->vecCards = &this->_vvecCard[0]; + this->getters = &this->_vgetter[0]; + } + else + { + this->names = nullptr; + this->kinds = nullptr; + this->keyCards = nullptr; + this->vecCards = nullptr; + this->getters = nullptr; + } } DataSourceBlock::~DataSourceBlock() { #if _MSC_VER - for (std::vector::iterator it = this->_vtextdata_cache.begin(); it != this->_vtextdata_cache.end(); ++it) { - char* tmp = *it; - if (tmp != NULL) - free(tmp); - } + for (std::vector::iterator it = this->_vtextdata_cache.begin(); it != this->_vtextdata_cache.end(); ++it) { + char* tmp = *it; + if (tmp != NULL) + free(tmp); + } #endif - FillDead(this->ccol); - FillDead(this->crow); + FillDead(this->ccol); + FillDead(this->crow); - FillDead(this->names); - FillDead(this->kinds); - FillDead(this->keyCards); - FillDead(this->vecCards); - FillDead(this->getters); - FillDead(this->getLabels); + FillDead(this->names); + FillDead(this->kinds); + FillDead(this->keyCards); + FillDead(this->vecCards); + FillDead(this->getters); + FillDead(this->getLabels); } diff --git a/src/NativeBridge/DataViewInterop.h b/src/NativeBridge/DataViewInterop.h index 0f3011fa..a49b52ed 100644 --- a/src/NativeBridge/DataViewInterop.h +++ b/src/NativeBridge/DataViewInterop.h @@ -25,482 +25,482 @@ using namespace boost::python; // REVIEW: Need to figure out proper story for multi-threaded execution. class DataSourceBlock { - // Fields that are visible to managed code come first and do not start with an underscore. - // Fields that are only visible to this code start with an underscore. + // Fields that are visible to managed code come first and do not start with an underscore. + // Fields that are only visible to this code start with an underscore. private: - // *** These fields are known by managed code. It is critical that this struct not have a vtable. - // It is also critical that the layout of this prefix NOT vary from release to release or build to build. - - // Number of columns. - CxInt64 ccol; - // Total number of rows. Zero for unknown. - CxInt64 crow; - - // Column ids. - const CxInt64 *ids; - // Column names. - const char **names; - // Column data kinds. - const BYTE *kinds; - // Column key type cardinalities. Zero for unbounded, -1 for non-key-types. - const CxInt64 *keyCards; - // Column vector type cardinalities. Zero for variable size, -1 for non-vector-types. - const CxInt64 *vecCards; - // The call back item getter function pointers. Currently only used for string - // values (nullptr for others). For strings these are GETSTR function pointers. - const void **getters; - - // Call back function for getting labels. - GETLABELS getLabels; + // *** These fields are known by managed code. It is critical that this struct not have a vtable. + // It is also critical that the layout of this prefix NOT vary from release to release or build to build. + + // Number of columns. + CxInt64 ccol; + // Total number of rows. Zero for unknown. + CxInt64 crow; + + // Column ids. + const CxInt64 *ids; + // Column names. + const char **names; + // Column data kinds. + const BYTE *kinds; + // Column key type cardinalities. Zero for unbounded, -1 for non-key-types. + const CxInt64 *keyCards; + // Column vector type cardinalities. Zero for variable size, -1 for non-vector-types. + const CxInt64 *vecCards; + // The call back item getter function pointers. Currently only used for string + // values (nullptr for others). For strings these are GETSTR function pointers. + const void **getters; + + // Call back function for getting labels. + GETLABELS getLabels; private: - // *** Stuff below here is not known by the managed code. - - std::vector _mpnum; - std::vector _mptxt; - std::vector _mpkey; - - // The vectors below here are parallel. - - // Column names. - std::vector _vname; - // Column DataKind values. - std::vector _vkind; - // Column key type cardinalities. Zero for unbounded, -1 for non-key-types. - std::vector _vkeyCard; - // Column vector type cardinalities. Zero for variable size, -1 for non-vector-types. - std::vector _vvecCard; - // Data getters for the columns (null for non-text columns). - std::vector _vgetter; - - std::vector _vdata; - std::vector _vtextdata; - std::vector _vtextdata_cache; - std::vector _vkeydata; - std::vector _vkeynames; - - // Stores the sparse data. - // REVIEW: need better documentatoin here - is this a pointer, or buffer ? If buffer, why this is not a vector ? Where do we store type of values ? What is indptr ? - void* _sparseValues; - int* _sparseIndices; - int* _indPtr; + // *** Stuff below here is not known by the managed code. + + std::vector _mpnum; + std::vector _mptxt; + std::vector _mpkey; + + // The vectors below here are parallel. + + // Column names. + std::vector _vname; + // Column DataKind values. + std::vector _vkind; + // Column key type cardinalities. Zero for unbounded, -1 for non-key-types. + std::vector _vkeyCard; + // Column vector type cardinalities. Zero for variable size, -1 for non-vector-types. + std::vector _vvecCard; + // Data getters for the columns (null for non-text columns). + std::vector _vgetter; + + std::vector _vdata; + std::vector _vtextdata; + std::vector _vtextdata_cache; + std::vector _vkeydata; + std::vector _vkeynames; + + // Stores the sparse data. + // REVIEW: need better documentatoin here - is this a pointer, or buffer ? If buffer, why this is not a vector ? Where do we store type of values ? What is indptr ? + void* _sparseValues; + int* _sparseIndices; + int* _indPtr; public: - DataSourceBlock(bp::dict& data); - ~DataSourceBlock(); + DataSourceBlock(bp::dict& data); + ~DataSourceBlock(); private: - bp::object SelectItemForType(bp::list& container) - { - auto length = len(container); - - for (auto index = 0; index < length; index++) - { - bp::object item = container[index]; - - if (!item.is_none()) - { - return item; - } - } - - return bp::object(); - } - - // Callback methods. These are only needed from managed code via the embedded function pointers above, - // so can be private. - static MANAGED_CALLBACK(void) GetBL(DataSourceBlock *pdata, int col, long index, /*out*/ signed char &dst) - { - CxInt64 numCol = pdata->_mpnum[col]; - assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); - const signed char *charData = reinterpret_cast(pdata->_vdata[numCol]); - dst = charData[index]; - } - static MANAGED_CALLBACK(void) GetBL64(DataSourceBlock *pdata, int col, long index, /*out*/ signed char &dst) - { - CxInt64 numCol = pdata->_mpnum[col]; - assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); - const double *charData = reinterpret_cast(pdata->_vdata[numCol]); - if (boost::math::isnan(charData[index])) - dst = -1; - else - dst = (signed char)charData[index]; - } - static MANAGED_CALLBACK(void) GetU1(DataSourceBlock *pdata, int col, long index, /*out*/ unsigned char &dst) - { - CxInt64 numCol = pdata->_mpnum[col]; - assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); - const unsigned char *charData = reinterpret_cast(pdata->_vdata[numCol]); - dst = charData[index]; - } - static MANAGED_CALLBACK(void) GetU2(DataSourceBlock *pdata, int col, long index, /*out*/ unsigned short &dst) - { - CxInt64 numCol = pdata->_mpnum[col]; - assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); - const unsigned short *shortData = reinterpret_cast(pdata->_vdata[numCol]); - dst = shortData[index]; - } - static MANAGED_CALLBACK(void) GetU4(DataSourceBlock *pdata, int col, long index, /*out*/ unsigned int &dst) - { - CxInt64 numCol = pdata->_mpnum[col]; - assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); - const unsigned int *intData = reinterpret_cast(pdata->_vdata[numCol]); - dst = intData[index]; - } - static MANAGED_CALLBACK(void) GetU8(DataSourceBlock *pdata, int col, long index, /*out*/ CxUInt64 &dst) - { - CxInt64 numCol = pdata->_mpnum[col]; - assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); - const CxUInt64 *longData = reinterpret_cast(pdata->_vdata[numCol]); - dst = longData[index]; - } - static MANAGED_CALLBACK(void) GetI1(DataSourceBlock *pdata, int col, long index, /*out*/ signed char &dst) - { - CxInt64 numCol = pdata->_mpnum[col]; - assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); - const signed char *charData = reinterpret_cast(pdata->_vdata[numCol]); - dst = charData[index]; - } - static MANAGED_CALLBACK(void) GetI2(DataSourceBlock *pdata, int col, long index, /*out*/ short &dst) - { - CxInt64 numCol = pdata->_mpnum[col]; - assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); - const short *shortData = reinterpret_cast(pdata->_vdata[numCol]); - dst = shortData[index]; - } - static MANAGED_CALLBACK(void) GetI4(DataSourceBlock *pdata, int col, long index, /*out*/ int &dst) - { - CxInt64 numCol = pdata->_mpnum[col]; - assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); - const int *intData = reinterpret_cast(pdata->_vdata[numCol]); - dst = intData[index]; - } - static MANAGED_CALLBACK(void) GetI8(DataSourceBlock *pdata, int col, long index, /*out*/ CxInt64 &dst) - { - CxInt64 numCol = pdata->_mpnum[col]; - assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); - const CxInt64 *longData = reinterpret_cast(pdata->_vdata[numCol]); - dst = longData[index]; - } - static MANAGED_CALLBACK(void) GetR4(DataSourceBlock *pdata, int col, long index, /*out*/ float &dst) - { - CxInt64 numCol = pdata->_mpnum[col]; - assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); - const float *floatData = reinterpret_cast(pdata->_vdata[numCol]); - dst = floatData[index]; - } - static MANAGED_CALLBACK(void) GetR8(DataSourceBlock *pdata, int col, long index, /*out*/ double &dst) - { - CxInt64 numCol = pdata->_mpnum[col]; - assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); - const double *doubleData = reinterpret_cast(pdata->_vdata[numCol]); - dst = doubleData[index]; - } - - // Call back from C# to map from data buffer and index to char* and convert to UTF16. - static MANAGED_CALLBACK(void) GetTX(DataSourceBlock *pdata, int col, long index, const/*out*/ char*& pch, /*out*/int32_t &size, /*out*/int32_t &missing) - { - CxInt64 txCol = pdata->_mptxt[col]; - assert(0 <= txCol && txCol < (CxInt64)pdata->_vtextdata.size()); - bp::object s = pdata->_vtextdata[txCol][index]; - - if (bp::extract(s).check()) - { - size = -1; - missing = -1; - pch = bp::extract(s); - if (s.is_none()) - { - size = 0; - pch = 0; - } - else - { + bp::object SelectItemForType(bp::list& container) + { + auto length = len(container); + + for (auto index = 0; index < length; index++) + { + bp::object item = container[index]; + + if (!item.is_none()) + { + return item; + } + } + + return bp::object(); + } + + // Callback methods. These are only needed from managed code via the embedded function pointers above, + // so can be private. + static MANAGED_CALLBACK(void) GetBL(DataSourceBlock *pdata, int col, long index, /*out*/ signed char &dst) + { + CxInt64 numCol = pdata->_mpnum[col]; + assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); + const signed char *charData = reinterpret_cast(pdata->_vdata[numCol]); + dst = charData[index]; + } + static MANAGED_CALLBACK(void) GetBL64(DataSourceBlock *pdata, int col, long index, /*out*/ signed char &dst) + { + CxInt64 numCol = pdata->_mpnum[col]; + assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); + const double *charData = reinterpret_cast(pdata->_vdata[numCol]); + if (boost::math::isnan(charData[index])) + dst = -1; + else + dst = (signed char)charData[index]; + } + static MANAGED_CALLBACK(void) GetU1(DataSourceBlock *pdata, int col, long index, /*out*/ unsigned char &dst) + { + CxInt64 numCol = pdata->_mpnum[col]; + assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); + const unsigned char *charData = reinterpret_cast(pdata->_vdata[numCol]); + dst = charData[index]; + } + static MANAGED_CALLBACK(void) GetU2(DataSourceBlock *pdata, int col, long index, /*out*/ unsigned short &dst) + { + CxInt64 numCol = pdata->_mpnum[col]; + assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); + const unsigned short *shortData = reinterpret_cast(pdata->_vdata[numCol]); + dst = shortData[index]; + } + static MANAGED_CALLBACK(void) GetU4(DataSourceBlock *pdata, int col, long index, /*out*/ unsigned int &dst) + { + CxInt64 numCol = pdata->_mpnum[col]; + assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); + const unsigned int *intData = reinterpret_cast(pdata->_vdata[numCol]); + dst = intData[index]; + } + static MANAGED_CALLBACK(void) GetU8(DataSourceBlock *pdata, int col, long index, /*out*/ CxUInt64 &dst) + { + CxInt64 numCol = pdata->_mpnum[col]; + assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); + const CxUInt64 *longData = reinterpret_cast(pdata->_vdata[numCol]); + dst = longData[index]; + } + static MANAGED_CALLBACK(void) GetI1(DataSourceBlock *pdata, int col, long index, /*out*/ signed char &dst) + { + CxInt64 numCol = pdata->_mpnum[col]; + assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); + const signed char *charData = reinterpret_cast(pdata->_vdata[numCol]); + dst = charData[index]; + } + static MANAGED_CALLBACK(void) GetI2(DataSourceBlock *pdata, int col, long index, /*out*/ short &dst) + { + CxInt64 numCol = pdata->_mpnum[col]; + assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); + const short *shortData = reinterpret_cast(pdata->_vdata[numCol]); + dst = shortData[index]; + } + static MANAGED_CALLBACK(void) GetI4(DataSourceBlock *pdata, int col, long index, /*out*/ int &dst) + { + CxInt64 numCol = pdata->_mpnum[col]; + assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); + const int *intData = reinterpret_cast(pdata->_vdata[numCol]); + dst = intData[index]; + } + static MANAGED_CALLBACK(void) GetI8(DataSourceBlock *pdata, int col, long index, /*out*/ CxInt64 &dst) + { + CxInt64 numCol = pdata->_mpnum[col]; + assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); + const CxInt64 *longData = reinterpret_cast(pdata->_vdata[numCol]); + dst = longData[index]; + } + static MANAGED_CALLBACK(void) GetR4(DataSourceBlock *pdata, int col, long index, /*out*/ float &dst) + { + CxInt64 numCol = pdata->_mpnum[col]; + assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); + const float *floatData = reinterpret_cast(pdata->_vdata[numCol]); + dst = floatData[index]; + } + static MANAGED_CALLBACK(void) GetR8(DataSourceBlock *pdata, int col, long index, /*out*/ double &dst) + { + CxInt64 numCol = pdata->_mpnum[col]; + assert(0 <= numCol && numCol < (CxInt64)pdata->_vdata.size()); + const double *doubleData = reinterpret_cast(pdata->_vdata[numCol]); + dst = doubleData[index]; + } + + // Call back from C# to map from data buffer and index to char* and convert to UTF16. + static MANAGED_CALLBACK(void) GetTX(DataSourceBlock *pdata, int col, long index, const/*out*/ char*& pch, /*out*/int32_t &size, /*out*/int32_t &missing) + { + CxInt64 txCol = pdata->_mptxt[col]; + assert(0 <= txCol && txCol < (CxInt64)pdata->_vtextdata.size()); + bp::object s = pdata->_vtextdata[txCol][index]; + + if (bp::extract(s).check()) + { + size = -1; + missing = -1; + pch = bp::extract(s); + if (s.is_none()) + { + size = 0; + pch = 0; + } + else + { #if _MSC_VER - Utf8ToUtf16le(pch, pch, size); + Utf8ToUtf16le(pch, pch, size); #endif - pdata->_vtextdata_cache.push_back((char*)pch); - } - } - else - { - // Missing values in Python are float.NaN. - assert(bp::extract(s).check()); - missing = 1; - } - } - - // The method below executes in python 2.7 only! + pdata->_vtextdata_cache.push_back((char*)pch); + } + } + else + { + // Missing values in Python are float.NaN. + assert(bp::extract(s).check()); + missing = 1; + } + } + + // The method below executes in python 2.7 only! // Call back from C# to get text data in UTF16 from unicode bytestring static MANAGED_CALLBACK(void) GetUnicodeTX(DataSourceBlock *pdata, int col, long index, const/*out*/ char*& pch, /*out*/int32_t &size, /*out*/int32_t &missing) - { + { CxInt64 txCol = pdata->_mptxt[col]; assert(0 <= txCol && txCol < (CxInt64)pdata->_vtextdata.size()); auto s = pdata->_vtextdata[txCol][index]; - if (bp::extract(str(s).encode("utf_8")).check()) - { - missing = -1; - pch = bp::extract(str(s).encode("utf_8")); + if (bp::extract(str(s).encode("utf_8")).check()) + { + missing = -1; + pch = bp::extract(str(s).encode("utf_8")); #if _MSC_VER Utf8ToUtf16le(pch, pch, size); #endif - pdata->_vtextdata_cache.push_back((char*)pch); - } - else - { - // Missing values in Python are float.NaN. - assert(bp::extract(s).check()); - missing = 1; - } + pdata->_vtextdata_cache.push_back((char*)pch); + } + else + { + // Missing values in Python are float.NaN. + assert(bp::extract(s).check()); + missing = 1; + } } #if _MSC_VER - static void Utf8ToUtf16le(const char* utf8Str, const/*out*/ char*& pch, /*out*/int &size) - { - // Allocate the utf16 string buffer. - size = MultiByteToWideChar(CP_UTF8, 0, utf8Str, -1, NULL, 0); - if (size == 0) - { - pch = 0; - return; - } - - wchar_t* utf16Str = new wchar_t[size]; - - try - { - // Convert the utf8 string. - MultiByteToWideChar(CP_UTF8, 0, utf8Str, -1, utf16Str, size); - } - catch (...) - { - // On exception clean up and re-throw. - if (utf16Str) delete[] utf16Str; - throw; - } - - // size includes a NULL character at the end, discount it - assert(utf16Str[size - 1] == L'\0'); - size -= 1; - pch = (char*)utf16Str; - } + static void Utf8ToUtf16le(const char* utf8Str, const/*out*/ char*& pch, /*out*/int &size) + { + // Allocate the utf16 string buffer. + size = MultiByteToWideChar(CP_UTF8, 0, utf8Str, -1, NULL, 0); + if (size == 0) + { + pch = 0; + return; + } + + wchar_t* utf16Str = new wchar_t[size]; + + try + { + // Convert the utf8 string. + MultiByteToWideChar(CP_UTF8, 0, utf8Str, -1, utf16Str, size); + } + catch (...) + { + // On exception clean up and re-throw. + if (utf16Str) delete[] utf16Str; + throw; + } + + // size includes a NULL character at the end, discount it + assert(utf16Str[size - 1] == L'\0'); + size -= 1; + pch = (char*)utf16Str; + } #endif - static MANAGED_CALLBACK(void) GetKeyInt(DataSourceBlock *pdata, int col, long index, /*out*/ int& dst) - { - CxInt64 keyCol = pdata->_mpkey[col]; - assert(0 <= keyCol && keyCol < (CxInt64)pdata->_vkeydata.size()); - - auto & list = pdata->_vkeydata[keyCol]; - bp::object obj = pdata->SelectItemForType(list); - assert(strcmp(obj.ptr()->ob_type->tp_name, "int") == 0); - dst = bp::extract(list[index]); - } - - // Callback function for getting labels for key-type columns. Returns success. - static MANAGED_CALLBACK(bool) GetKeyNames(DataSourceBlock *pdata, int col, int count, const char **buffer) - { - if (count <= 0 || buffer == nullptr) - { - // Invalid count or buffer, don't zero out buffer returning. - assert(false); - return false; - } - if (pdata == nullptr) - { - // Invalid pdata. - return OnGetLabelsFailure(count, buffer); - } - if (0 > col || (size_t)col >= pdata->_mpkey.size()) - { - // Invalid column id. - return OnGetLabelsFailure(count, buffer); - } - if (pdata->_vkeyCard[col] != count) - { - // Column is not a key type. - return OnGetLabelsFailure(count, buffer); - } - - CxInt64 keyCol = pdata->_mpkey[col]; - bp::list & names = pdata->_vkeynames[keyCol]; - if (len(names) != count) - { - // No labels for this column. This is not a logic error. - return OnGetLabelsFailure(count, buffer); - } - - for (int i = 0; i < count; ++i, ++buffer) - *buffer = bp::extract(names[i]); - return true; - } - - static bool OnGetLabelsFailure(int count, const char **buffer) - { - assert(false); - for (int i = 0; i < count; i++) - buffer[i] = nullptr; - return false; - } - - // Same method has two modes: if "inquire" is true, it returns the number of indices/values needed for the current row. - // If "inquire" is false, it assumes that indices/values are big enough, and fills them in for the current row. - static MANAGED_CALLBACK(void) GetBLVector(DataSourceBlock *pdata, int col, CxInt64 index, int* indices, unsigned char* values, bool inquire, /*out*/ int &size) - { - size = pdata->_indPtr[index + 1] - pdata->_indPtr[index]; - if (inquire) - return; - - const unsigned char *boolData = reinterpret_cast(pdata->_sparseValues); - for (int i = 0; i < size; i++) - { - indices[i] = pdata->_sparseIndices[pdata->_indPtr[index] + i]; - values[i] = boolData[pdata->_indPtr[index] + i]; - } - } - - static MANAGED_CALLBACK(void) GetU1Vector(DataSourceBlock *pdata, int col, CxInt64 index, int* indices, unsigned char* values, bool inquire, /*out*/ int &size) - { - size = pdata->_indPtr[index + 1] - pdata->_indPtr[index]; - if (inquire) - return; - - const unsigned char *int8Data = reinterpret_cast(pdata->_sparseValues); - for (int i = 0; i < size; i++) - { - indices[i] = pdata->_sparseIndices[pdata->_indPtr[index] + i]; - values[i] = int8Data[pdata->_indPtr[index] + i]; - } - } - - static MANAGED_CALLBACK(void) GetU2Vector(DataSourceBlock *pdata, int col, CxInt64 index, int* indices, unsigned short* values, bool inquire, /*out*/ int &size) - { - size = pdata->_indPtr[index + 1] - pdata->_indPtr[index]; - if (inquire) - return; - - const unsigned short *int16Data = reinterpret_cast(pdata->_sparseValues); - for (int i = 0; i < size; i++) - { - indices[i] = pdata->_sparseIndices[pdata->_indPtr[index] + i]; - values[i] = int16Data[pdata->_indPtr[index] + i]; - } - } - - static MANAGED_CALLBACK(void) GetU4Vector(DataSourceBlock *pdata, int col, CxInt64 index, int* indices, unsigned int* values, bool inquire, /*out*/ int &size) - { - size = pdata->_indPtr[index + 1] - pdata->_indPtr[index]; - if (inquire) - return; - - const unsigned int *int32Data = reinterpret_cast(pdata->_sparseValues); - for (int i = 0; i < size; i++) - { - indices[i] = pdata->_sparseIndices[pdata->_indPtr[index] + i]; - values[i] = int32Data[pdata->_indPtr[index] + i]; - } - } - - static MANAGED_CALLBACK(void) GetU8Vector(DataSourceBlock *pdata, int col, CxInt64 index, int* indices, CxUInt64* values, bool inquire, /*out*/ int &size) - { - size = pdata->_indPtr[index + 1] - pdata->_indPtr[index]; - if (inquire) - return; - - const unsigned long *int64Data = reinterpret_cast(pdata->_sparseValues); - for (int i = 0; i < size; i++) - { - indices[i] = pdata->_sparseIndices[pdata->_indPtr[index] + i]; - values[i] = int64Data[pdata->_indPtr[index] + i]; - } - } - - static MANAGED_CALLBACK(void) GetI1Vector(DataSourceBlock *pdata, int col, CxInt64 index, int* indices, signed char* values, bool inquire, /*out*/ int &size) - { - size = pdata->_indPtr[index + 1] - pdata->_indPtr[index]; - if (inquire) - return; - - const signed char *int8Data = reinterpret_cast(pdata->_sparseValues); - for (int i = 0; i < size; i++) - { - indices[i] = pdata->_sparseIndices[pdata->_indPtr[index] + i]; - values[i] = int8Data[pdata->_indPtr[index] + i]; - } - } - - static MANAGED_CALLBACK(void) GetI2Vector(DataSourceBlock *pdata, int col, CxInt64 index, int* indices, short* values, bool inquire, /*out*/ int &size) - { - size = pdata->_indPtr[index + 1] - pdata->_indPtr[index]; - if (inquire) - return; - - const short *int16Data = reinterpret_cast(pdata->_sparseValues); - for (int i = 0; i < size; i++) - { - indices[i] = pdata->_sparseIndices[pdata->_indPtr[index] + i]; - values[i] = int16Data[pdata->_indPtr[index] + i]; - } - } - - static MANAGED_CALLBACK(void) GetI4Vector(DataSourceBlock *pdata, int col, CxInt64 index, int* indices, int* values, bool inquire, /*out*/ int &size) - { - size = pdata->_indPtr[index + 1] - pdata->_indPtr[index]; - if (inquire) - return; - - const int *int32Data = reinterpret_cast(pdata->_sparseValues); - for (int i = 0; i < size; i++) - { - indices[i] = pdata->_sparseIndices[pdata->_indPtr[index] + i]; - values[i] = int32Data[pdata->_indPtr[index] + i]; - } - } - - static MANAGED_CALLBACK(void) GetI8Vector(DataSourceBlock *pdata, int col, CxInt64 index, int* indices, CxInt64* values, bool inquire, /*out*/ int &size) - { - size = pdata->_indPtr[index + 1] - pdata->_indPtr[index]; - if (inquire) - return; - - const CxInt64 *int64Data = reinterpret_cast(pdata->_sparseValues); - for (int i = 0; i < size; i++) - { - indices[i] = pdata->_sparseIndices[pdata->_indPtr[index] + i]; - values[i] = int64Data[pdata->_indPtr[index] + i]; - } - } - - static MANAGED_CALLBACK(void) GetR4Vector(DataSourceBlock *pdata, int col, CxInt64 index, int* indices, float* values, bool inquire, /*out*/ int &size) - { - size = pdata->_indPtr[index + 1] - pdata->_indPtr[index]; - if (inquire) - return; - - const float *floatData = reinterpret_cast(pdata->_sparseValues); - for (int i = 0; i < size; i++) - { - indices[i] = pdata->_sparseIndices[pdata->_indPtr[index] + i]; - values[i] = floatData[pdata->_indPtr[index] + i]; - } - } - - static MANAGED_CALLBACK(void) GetR8Vector(DataSourceBlock *pdata, int col, CxInt64 index, int* indices, double* values, bool inquire, /*out*/ int &size) - { - size = pdata->_indPtr[index + 1] - pdata->_indPtr[index]; - if (inquire) - return; - - const double *doubleData = reinterpret_cast(pdata->_sparseValues); - for (int i = 0; i < size; i++) - { - indices[i] = pdata->_sparseIndices[pdata->_indPtr[index] + i]; - values[i] = doubleData[pdata->_indPtr[index] + i]; - } - } + static MANAGED_CALLBACK(void) GetKeyInt(DataSourceBlock *pdata, int col, long index, /*out*/ int& dst) + { + CxInt64 keyCol = pdata->_mpkey[col]; + assert(0 <= keyCol && keyCol < (CxInt64)pdata->_vkeydata.size()); + + auto & list = pdata->_vkeydata[keyCol]; + bp::object obj = pdata->SelectItemForType(list); + assert(strcmp(obj.ptr()->ob_type->tp_name, "int") == 0); + dst = bp::extract(list[index]); + } + + // Callback function for getting labels for key-type columns. Returns success. + static MANAGED_CALLBACK(bool) GetKeyNames(DataSourceBlock *pdata, int col, int count, const char **buffer) + { + if (count <= 0 || buffer == nullptr) + { + // Invalid count or buffer, don't zero out buffer returning. + assert(false); + return false; + } + if (pdata == nullptr) + { + // Invalid pdata. + return OnGetLabelsFailure(count, buffer); + } + if (0 > col || (size_t)col >= pdata->_mpkey.size()) + { + // Invalid column id. + return OnGetLabelsFailure(count, buffer); + } + if (pdata->_vkeyCard[col] != count) + { + // Column is not a key type. + return OnGetLabelsFailure(count, buffer); + } + + CxInt64 keyCol = pdata->_mpkey[col]; + bp::list & names = pdata->_vkeynames[keyCol]; + if (len(names) != count) + { + // No labels for this column. This is not a logic error. + return OnGetLabelsFailure(count, buffer); + } + + for (int i = 0; i < count; ++i, ++buffer) + *buffer = bp::extract(names[i]); + return true; + } + + static bool OnGetLabelsFailure(int count, const char **buffer) + { + assert(false); + for (int i = 0; i < count; i++) + buffer[i] = nullptr; + return false; + } + + // Same method has two modes: if "inquire" is true, it returns the number of indices/values needed for the current row. + // If "inquire" is false, it assumes that indices/values are big enough, and fills them in for the current row. + static MANAGED_CALLBACK(void) GetBLVector(DataSourceBlock *pdata, int col, CxInt64 index, int* indices, unsigned char* values, bool inquire, /*out*/ int &size) + { + size = pdata->_indPtr[index + 1] - pdata->_indPtr[index]; + if (inquire) + return; + + const unsigned char *boolData = reinterpret_cast(pdata->_sparseValues); + for (int i = 0; i < size; i++) + { + indices[i] = pdata->_sparseIndices[pdata->_indPtr[index] + i]; + values[i] = boolData[pdata->_indPtr[index] + i]; + } + } + + static MANAGED_CALLBACK(void) GetU1Vector(DataSourceBlock *pdata, int col, CxInt64 index, int* indices, unsigned char* values, bool inquire, /*out*/ int &size) + { + size = pdata->_indPtr[index + 1] - pdata->_indPtr[index]; + if (inquire) + return; + + const unsigned char *int8Data = reinterpret_cast(pdata->_sparseValues); + for (int i = 0; i < size; i++) + { + indices[i] = pdata->_sparseIndices[pdata->_indPtr[index] + i]; + values[i] = int8Data[pdata->_indPtr[index] + i]; + } + } + + static MANAGED_CALLBACK(void) GetU2Vector(DataSourceBlock *pdata, int col, CxInt64 index, int* indices, unsigned short* values, bool inquire, /*out*/ int &size) + { + size = pdata->_indPtr[index + 1] - pdata->_indPtr[index]; + if (inquire) + return; + + const unsigned short *int16Data = reinterpret_cast(pdata->_sparseValues); + for (int i = 0; i < size; i++) + { + indices[i] = pdata->_sparseIndices[pdata->_indPtr[index] + i]; + values[i] = int16Data[pdata->_indPtr[index] + i]; + } + } + + static MANAGED_CALLBACK(void) GetU4Vector(DataSourceBlock *pdata, int col, CxInt64 index, int* indices, unsigned int* values, bool inquire, /*out*/ int &size) + { + size = pdata->_indPtr[index + 1] - pdata->_indPtr[index]; + if (inquire) + return; + + const unsigned int *int32Data = reinterpret_cast(pdata->_sparseValues); + for (int i = 0; i < size; i++) + { + indices[i] = pdata->_sparseIndices[pdata->_indPtr[index] + i]; + values[i] = int32Data[pdata->_indPtr[index] + i]; + } + } + + static MANAGED_CALLBACK(void) GetU8Vector(DataSourceBlock *pdata, int col, CxInt64 index, int* indices, CxUInt64* values, bool inquire, /*out*/ int &size) + { + size = pdata->_indPtr[index + 1] - pdata->_indPtr[index]; + if (inquire) + return; + + const unsigned long *int64Data = reinterpret_cast(pdata->_sparseValues); + for (int i = 0; i < size; i++) + { + indices[i] = pdata->_sparseIndices[pdata->_indPtr[index] + i]; + values[i] = int64Data[pdata->_indPtr[index] + i]; + } + } + + static MANAGED_CALLBACK(void) GetI1Vector(DataSourceBlock *pdata, int col, CxInt64 index, int* indices, signed char* values, bool inquire, /*out*/ int &size) + { + size = pdata->_indPtr[index + 1] - pdata->_indPtr[index]; + if (inquire) + return; + + const signed char *int8Data = reinterpret_cast(pdata->_sparseValues); + for (int i = 0; i < size; i++) + { + indices[i] = pdata->_sparseIndices[pdata->_indPtr[index] + i]; + values[i] = int8Data[pdata->_indPtr[index] + i]; + } + } + + static MANAGED_CALLBACK(void) GetI2Vector(DataSourceBlock *pdata, int col, CxInt64 index, int* indices, short* values, bool inquire, /*out*/ int &size) + { + size = pdata->_indPtr[index + 1] - pdata->_indPtr[index]; + if (inquire) + return; + + const short *int16Data = reinterpret_cast(pdata->_sparseValues); + for (int i = 0; i < size; i++) + { + indices[i] = pdata->_sparseIndices[pdata->_indPtr[index] + i]; + values[i] = int16Data[pdata->_indPtr[index] + i]; + } + } + + static MANAGED_CALLBACK(void) GetI4Vector(DataSourceBlock *pdata, int col, CxInt64 index, int* indices, int* values, bool inquire, /*out*/ int &size) + { + size = pdata->_indPtr[index + 1] - pdata->_indPtr[index]; + if (inquire) + return; + + const int *int32Data = reinterpret_cast(pdata->_sparseValues); + for (int i = 0; i < size; i++) + { + indices[i] = pdata->_sparseIndices[pdata->_indPtr[index] + i]; + values[i] = int32Data[pdata->_indPtr[index] + i]; + } + } + + static MANAGED_CALLBACK(void) GetI8Vector(DataSourceBlock *pdata, int col, CxInt64 index, int* indices, CxInt64* values, bool inquire, /*out*/ int &size) + { + size = pdata->_indPtr[index + 1] - pdata->_indPtr[index]; + if (inquire) + return; + + const CxInt64 *int64Data = reinterpret_cast(pdata->_sparseValues); + for (int i = 0; i < size; i++) + { + indices[i] = pdata->_sparseIndices[pdata->_indPtr[index] + i]; + values[i] = int64Data[pdata->_indPtr[index] + i]; + } + } + + static MANAGED_CALLBACK(void) GetR4Vector(DataSourceBlock *pdata, int col, CxInt64 index, int* indices, float* values, bool inquire, /*out*/ int &size) + { + size = pdata->_indPtr[index + 1] - pdata->_indPtr[index]; + if (inquire) + return; + + const float *floatData = reinterpret_cast(pdata->_sparseValues); + for (int i = 0; i < size; i++) + { + indices[i] = pdata->_sparseIndices[pdata->_indPtr[index] + i]; + values[i] = floatData[pdata->_indPtr[index] + i]; + } + } + + static MANAGED_CALLBACK(void) GetR8Vector(DataSourceBlock *pdata, int col, CxInt64 index, int* indices, double* values, bool inquire, /*out*/ int &size) + { + size = pdata->_indPtr[index + 1] - pdata->_indPtr[index]; + if (inquire) + return; + + const double *doubleData = reinterpret_cast(pdata->_sparseValues); + for (int i = 0; i < size; i++) + { + indices[i] = pdata->_sparseIndices[pdata->_indPtr[index] + i]; + values[i] = doubleData[pdata->_indPtr[index] + i]; + } + } }; // A native wrapper around a managed IDataView for receiving data back from managed code. @@ -508,40 +508,40 @@ class DataSourceBlock // This is filled in by managed code and referenced by native code. struct DataViewBlock { - // *** These fields are shared from managed code. It is critical that this struct not have a vtable. - // It is also critical that the layout of this NOT vary from release to release or build to build. - // The managed code assumes that CxInt64 occupies 8 bytes, and each pointer occupies 8 bytes. - - // Number of columns. - CxInt64 ccol; - // Total number of rows. Zero for unknown. - CxInt64 crow; - - // Column names. - const char **names; - // Column data kinds. - const BYTE *kinds; - // Column key type cardinalities. Only contains the values for the columns that have - // key names. - const int *keyCards; + // *** These fields are shared from managed code. It is critical that this struct not have a vtable. + // It is also critical that the layout of this NOT vary from release to release or build to build. + // The managed code assumes that CxInt64 occupies 8 bytes, and each pointer occupies 8 bytes. + + // Number of columns. + CxInt64 ccol; + // Total number of rows. Zero for unknown. + CxInt64 crow; + + // Column names. + const char **names; + // Column data kinds. + const BYTE *kinds; + // Column key type cardinalities. Only contains the values for the columns that have + // key names. + const int *keyCards; }; enum ML_PY_TYPE_MAP_ENUM { - ML_PY_BOOL = '?', - ML_PY_BOOL64 = '!', - ML_PY_UINT8 = 'B', - ML_PY_UINT16 = 'H', - ML_PY_UINT32 = 'I', - ML_PY_UINT64 = 'Q', - ML_PY_INT8 = 'b', - ML_PY_INT16 = 'h', - ML_PY_INT32 = 'i', - ML_PY_INT64 = 'q', - ML_PY_FLOAT16 = 'e', - ML_PY_FLOAT32 = 'f', - ML_PY_FLOAT64 = 'd', - ML_PY_CAT = 'c', - ML_PY_TEXT = 't', - ML_PY_UNICODE = 'u', - ML_PY_UNSUPPORTED = 'x' + ML_PY_BOOL = '?', + ML_PY_BOOL64 = '!', + ML_PY_UINT8 = 'B', + ML_PY_UINT16 = 'H', + ML_PY_UINT32 = 'I', + ML_PY_UINT64 = 'Q', + ML_PY_INT8 = 'b', + ML_PY_INT16 = 'h', + ML_PY_INT32 = 'i', + ML_PY_INT64 = 'q', + ML_PY_FLOAT16 = 'e', + ML_PY_FLOAT32 = 'f', + ML_PY_FLOAT64 = 'd', + ML_PY_CAT = 'c', + ML_PY_TEXT = 't', + ML_PY_UNICODE = 'u', + ML_PY_UNSUPPORTED = 'x' }; diff --git a/src/NativeBridge/ManagedInterop.cpp b/src/NativeBridge/ManagedInterop.cpp index bca89755..a8ed7941 100644 --- a/src/NativeBridge/ManagedInterop.cpp +++ b/src/NativeBridge/ManagedInterop.cpp @@ -7,8 +7,8 @@ #include "ManagedInterop.h" inline void destroyManagerCObject(PyObject* obj) { - auto* b = static_cast(PyCapsule_GetPointer(obj, NULL)); - if (b) { delete b; } + auto* b = static_cast(PyCapsule_GetPointer(obj, NULL)); + if (b) { delete b; } } #define SetDict2(cpptype, nptype); \ @@ -65,263 +65,264 @@ inline void destroyManagerCObject(PyObject* obj) { EnvironmentBlock::~EnvironmentBlock() { - // Everything (except data buffers) that we might have exposed to managed code, - // fill with dead values. - FillDead(this->verbosity); - FillDead(this->seed); - FillDead(this->messageSink); - FillDead(this->modelSink); - FillDead(this->checkCancel); + // Everything (except data buffers) that we might have exposed to managed code, + // fill with dead values. + FillDead(this->verbosity); + FillDead(this->seed); + FillDead(this->maxSlots); + FillDead(this->messageSink); + FillDead(this->modelSink); + FillDead(this->checkCancel); - for (size_t i = 0; i < _vset.size(); i++) - FillDead(_vset[i]); + for (size_t i = 0; i < _vset.size(); i++) + FillDead(_vset[i]); } -EnvironmentBlock::EnvironmentBlock(int verbosity, int maxThreadsAllowed, int seed, const char* pythonPath) +EnvironmentBlock::EnvironmentBlock(int verbosity, int maxSlots, int seed, const char* pythonPath) { - // Assert that this class doesn't have a vtable. - assert(offsetof(EnvironmentBlock, verbosity) == 0); + // Assert that this class doesn't have a vtable. + assert(offsetof(EnvironmentBlock, verbosity) == 0); - this->_errCode = PyErrorCode_NoError; - this->verbosity = verbosity; - this->maxThreadsAllowed = maxThreadsAllowed; - this->seed = seed; - this->pythonPath = pythonPath; - this->_kindMask = (1 << Warning) | (1 << Error); - if (verbosity > 0) - this->_kindMask |= (1 << Info); - if (this->verbosity > 3) - this->_kindMask |= (1 << Trace); - this->dataSink = &DataSink; - this->messageSink = &MessageSink; - this->modelSink = &ModelSink; - this->checkCancel = &CheckCancel; + this->_errCode = PyErrorCode_NoError; + this->verbosity = verbosity; + this->maxSlots = maxSlots; + this->seed = seed; + this->pythonPath = pythonPath; + this->_kindMask = (1 << Warning) | (1 << Error); + if (verbosity > 0) + this->_kindMask |= (1 << Info); + if (this->verbosity > 3) + this->_kindMask |= (1 << Trace); + this->dataSink = &DataSink; + this->messageSink = &MessageSink; + this->modelSink = &ModelSink; + this->checkCancel = &CheckCancel; } STATIC MANAGED_CALLBACK(void) EnvironmentBlock::DataSink(EnvironmentBlock *penv, const DataViewBlock *pdata, void **&setters, void *&keyValueSetter) { - penv->DataSinkCore(pdata); - setters = &penv->_vset[0]; - keyValueSetter = (void *)&SetKeyValue; + penv->DataSinkCore(pdata); + setters = &penv->_vset[0]; + keyValueSetter = (void *)&SetKeyValue; } void EnvironmentBlock::DataSinkCore(const DataViewBlock * pdata) { - assert(pdata != nullptr); + assert(pdata != nullptr); - // Create a data set. - CxInt64 numKeys = 0; - for (int i = 0; i < pdata->ccol; i++) - { - BYTE kind = pdata->kinds[i]; - _columns.push_back(PythonObjectBase::CreateObject(kind, pdata->crow, 1)); + // Create a data set. + CxInt64 numKeys = 0; + for (int i = 0; i < pdata->ccol; i++) + { + BYTE kind = pdata->kinds[i]; + _columns.push_back(PythonObjectBase::CreateObject(kind, pdata->crow, 1)); - switch (kind) - { - case BL: - _vset.push_back((void*)&SetBL); - break; - case I1: - _vset.push_back((void*)&SetI1); - break; - case I2: - _vset.push_back((void*)&SetI2); - break; - case I4: - _vset.push_back((void*)&SetI4); - break; - case I8: - _vset.push_back((void*)&SetI8); - break; - case U1: - _vset.push_back((void*)&SetU1); - break; - case U2: - _vset.push_back((void*)&SetU2); - break; - case U4: - _vset.push_back((void*)&SetU4); - break; - case U8: - _vset.push_back((void*)&SetU8); - break; - case R4: - _vset.push_back((void*)&SetR4); - break; - case R8: - _vset.push_back((void*)&SetR8); - break; - case TX: - _vset.push_back((void*)&SetTX); - break; - case TS: // tbd - case DT: // tbd - case DZ: // tbd - default: - throw std::invalid_argument("data type is not supported " + std::to_string(kind)); - } + switch (kind) + { + case BL: + _vset.push_back((void*)&SetBL); + break; + case I1: + _vset.push_back((void*)&SetI1); + break; + case I2: + _vset.push_back((void*)&SetI2); + break; + case I4: + _vset.push_back((void*)&SetI4); + break; + case I8: + _vset.push_back((void*)&SetI8); + break; + case U1: + _vset.push_back((void*)&SetU1); + break; + case U2: + _vset.push_back((void*)&SetU2); + break; + case U4: + _vset.push_back((void*)&SetU4); + break; + case U8: + _vset.push_back((void*)&SetU8); + break; + case R4: + _vset.push_back((void*)&SetR4); + break; + case R8: + _vset.push_back((void*)&SetR8); + break; + case TX: + _vset.push_back((void*)&SetTX); + break; + case TS: // tbd + case DT: // tbd + case DZ: // tbd + default: + throw std::invalid_argument("data type is not supported " + std::to_string(kind)); + } - if (pdata->keyCards[i] >= 0) - { - _vKeyValues.push_back(new PythonObject(TX, pdata->keyCards[i], 1)); - _columnToKeyMap.push_back(numKeys++); - } - else - _columnToKeyMap.push_back(-1); + if (pdata->keyCards[i] >= 0) + { + _vKeyValues.push_back(new PythonObject(TX, pdata->keyCards[i], 1)); + _columnToKeyMap.push_back(numKeys++); + } + else + _columnToKeyMap.push_back(-1); - _names.push_back(pdata->names[i]); - } + _names.push_back(pdata->names[i]); + } } STATIC MANAGED_CALLBACK(void) EnvironmentBlock::ModelSink(EnvironmentBlock * env, - const unsigned char * pBinaryModel, size_t iModelLen) + const unsigned char * pBinaryModel, size_t iModelLen) { } STATIC MANAGED_CALLBACK(void) EnvironmentBlock::MessageSink(EnvironmentBlock * env, MessageKind kind, - const char * sender, const char * message) + const char * sender, const char * message) { - bool bShowMessage = (env->_kindMask >> kind) & 1; - string sMessage(message); - string sSender(sender); + bool bShowMessage = (env->_kindMask >> kind) & 1; + string sMessage(message); + string sSender(sender); - if (bShowMessage) - { - CX_TraceIn("MessageSink"); - string sMessage = std::string(message); - string sSender = std::string(sender); + if (bShowMessage) + { + CX_TraceIn("MessageSink"); + string sMessage = std::string(message); + string sSender = std::string(sender); - switch (kind) - { - default: - case Info: - sMessage = sMessage + "\n"; - break; - case Warning: - sMessage = "Warning: " + sMessage + "\n"; - break; - case Trace: - sMessage = sSender + ": " + sMessage + "\n"; - break; - case Error: // We will throw the error when ConnectToMlNet returns - sMessage = "Error: " + sMessage; - break; - } + switch (kind) + { + default: + case Info: + sMessage = sMessage + "\n"; + break; + case Warning: + sMessage = "Warning: " + sMessage + "\n"; + break; + case Trace: + sMessage = sSender + ": " + sMessage + "\n"; + break; + case Error: // We will throw the error when ConnectToMlNet returns + sMessage = "Error: " + sMessage; + break; + } - // Redirect message to Python streams - PyObject *sys = PyImport_ImportModule("sys"); - PyObject *pystream = PyObject_GetAttrString(sys, (kind == Error) ? "stderr" : "stdout"); - PyObject_CallMethod(pystream, "write", "s", sMessage.c_str()); - PyObject_CallMethod(pystream, "flush", NULL); - Py_XDECREF(pystream); - Py_XDECREF(sys); + // Redirect message to Python streams + PyObject *sys = PyImport_ImportModule("sys"); + PyObject *pystream = PyObject_GetAttrString(sys, (kind == Error) ? "stderr" : "stdout"); + PyObject_CallMethod(pystream, "write", "s", sMessage.c_str()); + PyObject_CallMethod(pystream, "flush", NULL); + Py_XDECREF(pystream); + Py_XDECREF(sys); - CX_TraceOut("MessageSink"); - } + CX_TraceOut("MessageSink"); + } } STATIC MANAGED_CALLBACK(bool) EnvironmentBlock::CheckCancel() { - return false; + return false; } bp::dict EnvironmentBlock::GetData() { - if (_names.size() == 0) - { - return bp::dict(); - } + if (_names.size() == 0) + { + return bp::dict(); + } - bp::dict dict = bp::dict(); - for (size_t i = 0; i < _names.size(); i++) - { - PythonObjectBase* column = _columns[i]; - PythonObject* keyNames = nullptr; - if (_columnToKeyMap[i] >= 0) - keyNames = _vKeyValues[_columnToKeyMap[i]]; + bp::dict dict = bp::dict(); + for (size_t i = 0; i < _names.size(); i++) + { + PythonObjectBase* column = _columns[i]; + PythonObject* keyNames = nullptr; + if (_columnToKeyMap[i] >= 0) + keyNames = _vKeyValues[_columnToKeyMap[i]]; - signed char kind = column->GetKind(); - switch (kind) { - case -1: - { - PythonObject* col = dynamic_cast*>(column); - auto shrd = col->GetData(); - bp::list list; - for (size_t i = 0; i < shrd->size(); i++) - { - bp::object obj; - signed char value = shrd->at(i); - if (value < 0) - obj = bp::object(NAN); - else if (value == 0) - obj = bp::object(false); - else - obj = bp::object(true); + signed char kind = column->GetKind(); + switch (kind) { + case -1: + { + PythonObject* col = dynamic_cast*>(column); + auto shrd = col->GetData(); + bp::list list; + for (size_t i = 0; i < shrd->size(); i++) + { + bp::object obj; + signed char value = shrd->at(i); + if (value < 0) + obj = bp::object(NAN); + else if (value == 0) + obj = bp::object(false); + else + obj = bp::object(true); - list.append(obj); - } - dict[_names[i]] = list; - } - break; - case BL: - SetDict2(signed char, bool); - break; - case I1: - SetDictAndKeys(signed char, i); - break; - case I2: - SetDictAndKeys(signed short, i); - break; - case I4: - SetDictAndKeys(signed int, i); - break; - case I8: - SetDict1(CxInt64); - break; - case U1: - SetDict1(unsigned char); - break; - case U2: - SetDict1(unsigned short); - break; - case U4: - SetDict1(unsigned int); - break; - case U8: - SetDict1(CxUInt64); - break; - case R4: - SetDict1(float); - break; - case R8: - SetDict1(double); - break; - case TX: - { - PythonObject* col = dynamic_cast*>(column); - auto shrd = col->GetData(); - bp::list list; - for (size_t i = 0; i < shrd->size(); i++) - { - bp::object obj; - const std::string& value = shrd->at(i); - if (!value.empty()) - { - obj = bp::object(value); - } - list.append(obj); - } - dict[_names[i]] = list; - delete column; - } - break; - case TS: - case DT: - case DZ: - default: - throw std::invalid_argument("data type is not supported " + std::to_string(kind)); - } - } - return dict; + list.append(obj); + } + dict[_names[i]] = list; + } + break; + case BL: + SetDict2(signed char, bool); + break; + case I1: + SetDictAndKeys(signed char, i); + break; + case I2: + SetDictAndKeys(signed short, i); + break; + case I4: + SetDictAndKeys(signed int, i); + break; + case I8: + SetDict1(CxInt64); + break; + case U1: + SetDict1(unsigned char); + break; + case U2: + SetDict1(unsigned short); + break; + case U4: + SetDict1(unsigned int); + break; + case U8: + SetDict1(CxUInt64); + break; + case R4: + SetDict1(float); + break; + case R8: + SetDict1(double); + break; + case TX: + { + PythonObject* col = dynamic_cast*>(column); + auto shrd = col->GetData(); + bp::list list; + for (size_t i = 0; i < shrd->size(); i++) + { + bp::object obj; + const std::string& value = shrd->at(i); + if (!value.empty()) + { + obj = bp::object(value); + } + list.append(obj); + } + dict[_names[i]] = list; + delete column; + } + break; + case TS: + case DT: + case DZ: + default: + throw std::invalid_argument("data type is not supported " + std::to_string(kind)); + } + } + return dict; } diff --git a/src/NativeBridge/ManagedInterop.h b/src/NativeBridge/ManagedInterop.h index 5d9582a3..e7b54038 100644 --- a/src/NativeBridge/ManagedInterop.h +++ b/src/NativeBridge/ManagedInterop.h @@ -15,32 +15,32 @@ struct DataViewBlock; // WARNING: These values are defined by the ML.NET code so should not be changed! enum MessageKind { - Trace = 0, - Info = 1, - Warning = 2, - Error = 3 + Trace = 0, + Info = 1, + Warning = 2, + Error = 3 }; // These are only used locally enum PyErrorCode { - PyErrorCode_NoError = 0, - PyErrorCode_Failure = 1 + PyErrorCode_NoError = 0, + PyErrorCode_Failure = 1 }; // REVIEW: the exceptions thrown in the callbacks will not be caught by BxlServer on Linux. // On Linux, CoreCLR will ignore previous stack frames, i.e., those before entering the managed code. typedef MANAGED_CALLBACK_PTR(void, MODELSINK) (EnvironmentBlock * env, - const unsigned char * binaryModel, size_t modelLen); + const unsigned char * binaryModel, size_t modelLen); typedef MANAGED_CALLBACK_PTR(void, MESSAGESINK)(EnvironmentBlock *penv, MessageKind kind, - const char * sender, const char * message); + const char * sender, const char * message); typedef MANAGED_CALLBACK_PTR(void, DATASINK)(EnvironmentBlock *penv, const DataViewBlock *pdata, - // Outputs: - // * setters: item setter function pointers. - // keyValueSetter: setter for key values. - void **& setters, void *& keyValueSetter); + // Outputs: + // * setters: item setter function pointers. + // keyValueSetter: setter for key values. + void **& setters, void *& keyValueSetter); // Callback function for getting cancel flag. typedef MANAGED_CALLBACK_PTR(bool, CHECKCANCEL)(); @@ -52,152 +52,152 @@ typedef MANAGED_CALLBACK_PTR(void, SETSTR)(void *pv, CxInt64 index, const char * // As such, it is critical that this class NOT have a vtable, so virtual functions are illegal! class CLASS_ALIGN EnvironmentBlock { - // Fields that are visible to managed code come first and do not start with an underscore. - // Fields that are only visible to this code start with an underscore. + // Fields that are visible to managed code come first and do not start with an underscore. + // Fields that are only visible to this code start with an underscore. private: - // *** These fields are known by managed code. It is critical that this struct not have a vtable. - // It is also critical that the layout of this prefix NOT vary from release to release or build to build. - // The managed code assumes that each pointer occupies 8 bytes. + // *** These fields are known by managed code. It is critical that this struct not have a vtable. + // It is also critical that the layout of this prefix NOT vary from release to release or build to build. + // The managed code assumes that each pointer occupies 8 bytes. - // Indicates a verbosity level. Zero means default (minimal). Larger generally means more information. - int verbosity; + // Indicates a verbosity level. Zero means default (minimal). Larger generally means more information. + int verbosity; - // The random seed. - int seed; + // The random seed. + int seed; - // The message sink. - MESSAGESINK messageSink; + // The message sink. + MESSAGESINK messageSink; - // The data sink. - DATASINK dataSink; + // The data sink. + DATASINK dataSink; - // The model sink. - MODELSINK modelSink; + // The model sink. + MODELSINK modelSink; - // Indicates max threads allowed. Less than one means default (maximal). - int maxThreadsAllowed; + // Max slots to return for vector valued columns(<=0 to return all). + int maxSlots; - // Check cancellation flag. - CHECKCANCEL checkCancel; + // Check cancellation flag. + CHECKCANCEL checkCancel; - // Path to python executable - const char* pythonPath; + // Path to python executable + const char* pythonPath; public: - EnvironmentBlock(int verbosity = 0, int maxThreadsAllowed = 0, int seed = 42, const char* pythonPath = NULL); - ~EnvironmentBlock(); - PyErrorCode GetErrorCode() { return _errCode; } - std::string GetErrorMessage() { return _errMessage; } - bp::dict GetData(); + EnvironmentBlock(int verbosity = 0, int maxSlots = -1, int seed = 42, const char* pythonPath = NULL); + ~EnvironmentBlock(); + PyErrorCode GetErrorCode() { return _errCode; } + std::string GetErrorMessage() { return _errMessage; } + bp::dict GetData(); private: - static MANAGED_CALLBACK(void) DataSink(EnvironmentBlock *penv, const DataViewBlock *pdata, void **&setters, void *&keyValueSetter); - static MANAGED_CALLBACK(void) MessageSink(EnvironmentBlock *penv, MessageKind kind, const char *sender, const char *message); - static MANAGED_CALLBACK(void) ModelSink(EnvironmentBlock *penv, const unsigned char *pBinaryModel, size_t iModelLen); - static MANAGED_CALLBACK(bool) CheckCancel(); + static MANAGED_CALLBACK(void) DataSink(EnvironmentBlock *penv, const DataViewBlock *pdata, void **&setters, void *&keyValueSetter); + static MANAGED_CALLBACK(void) MessageSink(EnvironmentBlock *penv, MessageKind kind, const char *sender, const char *message); + static MANAGED_CALLBACK(void) ModelSink(EnvironmentBlock *penv, const unsigned char *pBinaryModel, size_t iModelLen); + static MANAGED_CALLBACK(bool) CheckCancel(); private: - void DataSinkCore(const DataViewBlock * pdata); + void DataSinkCore(const DataViewBlock * pdata); private: - // This has a bit set for each kind of message that is desired. - int _kindMask; - // Fields used by the data callbacks. These keep the appropriate memory alive during the data operations. - int _irowBase; - int _crowWant; - std::vector _vset; - PyErrorCode _errCode; - std::string _errMessage; - - std::vector _names; - std::vector _columns; - // Maps between the column index, and the index in _vKeyValues containing the key names, or -1 if - // there are no key names. - std::vector _columnToKeyMap; - - std::vector*> _vKeyValues; - - static MANAGED_CALLBACK(void) SetR4(EnvironmentBlock *env, int col, long index, float value) - { - PythonObject* colObject = dynamic_cast*>(env->_columns[col]); - assert(colObject != nullptr); - colObject->SetAt(index, 0, value); - } - static MANAGED_CALLBACK(void) SetR8(EnvironmentBlock *env, int col, long index, double value) - { - PythonObject* colObject = dynamic_cast*>(env->_columns[col]); - assert(colObject != nullptr); - colObject->SetAt(index, 0, value); - } - static MANAGED_CALLBACK(void) SetBL(EnvironmentBlock *env, int col, long index, signed char value) - { - PythonObject* colObject = dynamic_cast*>(env->_columns[col]); - assert(colObject != nullptr); - colObject->SetAt(index, 0, value); - if (value < 0) - env->_columns[col]->SetKind(-1); - } - static MANAGED_CALLBACK(void) SetI1(EnvironmentBlock *env, int col, long index, signed char value) - { - PythonObject* colObject = dynamic_cast*>(env->_columns[col]); - assert(colObject != nullptr); - colObject->SetAt(index, 0, value); - } - static MANAGED_CALLBACK(void) SetI2(EnvironmentBlock *env, int col, long index, short value) - { - PythonObject* colObject = dynamic_cast*>(env->_columns[col]); - assert(colObject != nullptr); - colObject->SetAt(index, 0, value); - } - static MANAGED_CALLBACK(void) SetI4(EnvironmentBlock *env, int col, long index, int value) - { - PythonObject* colObject = dynamic_cast*>(env->_columns[col]); - assert(colObject != nullptr); - colObject->SetAt(index, 0, value); - } - static MANAGED_CALLBACK(void) SetI8(EnvironmentBlock *env, int col, long index, CxInt64 value) - { - PythonObject* colObject = dynamic_cast*>(env->_columns[col]); - assert(colObject != nullptr); - colObject->SetAt(index, 0, value); - } - static MANAGED_CALLBACK(void) SetU1(EnvironmentBlock *env, int col, long index, unsigned char value) - { - PythonObject* colObject = dynamic_cast*>(env->_columns[col]); - assert(colObject != nullptr); - colObject->SetAt(index, 0, value); - } - static MANAGED_CALLBACK(void) SetU2(EnvironmentBlock *env, int col, long index, unsigned short value) - { - PythonObject* colObject = dynamic_cast*>(env->_columns[col]); - assert(colObject != nullptr); - colObject->SetAt(index, 0, value); - } - static MANAGED_CALLBACK(void) SetU4(EnvironmentBlock *env, int col, long index, unsigned int value) - { - PythonObject* colObject = dynamic_cast*>(env->_columns[col]); - assert(colObject != nullptr); - colObject->SetAt(index, 0, value); - } - static MANAGED_CALLBACK(void) SetU8(EnvironmentBlock *env, int col, long index, CxUInt64 value) - { - PythonObject* colObject = dynamic_cast*>(env->_columns[col]); - assert(colObject != nullptr); - colObject->SetAt(index, 0, value); - } - static MANAGED_CALLBACK(void) SetTX(EnvironmentBlock *env, int col, long index, char* value, long length) - { - PythonObject* colObject = dynamic_cast*>(env->_columns[col]); - assert(colObject != nullptr); - colObject->SetAt(index, 0, std::string(value, length)); - } - static MANAGED_CALLBACK(void) SetKeyValue(EnvironmentBlock *env, int keyColumnIndex, int keyCode, char* value, long length) - { - assert(keyColumnIndex < env->_vKeyValues.size()); - PythonObject* keyNamesObject = env->_vKeyValues[keyColumnIndex]; - keyNamesObject->SetAt(keyCode, 0, std::string(value, length)); - } + // This has a bit set for each kind of message that is desired. + int _kindMask; + // Fields used by the data callbacks. These keep the appropriate memory alive during the data operations. + int _irowBase; + int _crowWant; + std::vector _vset; + PyErrorCode _errCode; + std::string _errMessage; + + std::vector _names; + std::vector _columns; + // Maps between the column index, and the index in _vKeyValues containing the key names, or -1 if + // there are no key names. + std::vector _columnToKeyMap; + + std::vector*> _vKeyValues; + + static MANAGED_CALLBACK(void) SetR4(EnvironmentBlock *env, int col, long index, float value) + { + PythonObject* colObject = dynamic_cast*>(env->_columns[col]); + assert(colObject != nullptr); + colObject->SetAt(index, 0, value); + } + static MANAGED_CALLBACK(void) SetR8(EnvironmentBlock *env, int col, long index, double value) + { + PythonObject* colObject = dynamic_cast*>(env->_columns[col]); + assert(colObject != nullptr); + colObject->SetAt(index, 0, value); + } + static MANAGED_CALLBACK(void) SetBL(EnvironmentBlock *env, int col, long index, signed char value) + { + PythonObject* colObject = dynamic_cast*>(env->_columns[col]); + assert(colObject != nullptr); + colObject->SetAt(index, 0, value); + if (value < 0) + env->_columns[col]->SetKind(-1); + } + static MANAGED_CALLBACK(void) SetI1(EnvironmentBlock *env, int col, long index, signed char value) + { + PythonObject* colObject = dynamic_cast*>(env->_columns[col]); + assert(colObject != nullptr); + colObject->SetAt(index, 0, value); + } + static MANAGED_CALLBACK(void) SetI2(EnvironmentBlock *env, int col, long index, short value) + { + PythonObject* colObject = dynamic_cast*>(env->_columns[col]); + assert(colObject != nullptr); + colObject->SetAt(index, 0, value); + } + static MANAGED_CALLBACK(void) SetI4(EnvironmentBlock *env, int col, long index, int value) + { + PythonObject* colObject = dynamic_cast*>(env->_columns[col]); + assert(colObject != nullptr); + colObject->SetAt(index, 0, value); + } + static MANAGED_CALLBACK(void) SetI8(EnvironmentBlock *env, int col, long index, CxInt64 value) + { + PythonObject* colObject = dynamic_cast*>(env->_columns[col]); + assert(colObject != nullptr); + colObject->SetAt(index, 0, value); + } + static MANAGED_CALLBACK(void) SetU1(EnvironmentBlock *env, int col, long index, unsigned char value) + { + PythonObject* colObject = dynamic_cast*>(env->_columns[col]); + assert(colObject != nullptr); + colObject->SetAt(index, 0, value); + } + static MANAGED_CALLBACK(void) SetU2(EnvironmentBlock *env, int col, long index, unsigned short value) + { + PythonObject* colObject = dynamic_cast*>(env->_columns[col]); + assert(colObject != nullptr); + colObject->SetAt(index, 0, value); + } + static MANAGED_CALLBACK(void) SetU4(EnvironmentBlock *env, int col, long index, unsigned int value) + { + PythonObject* colObject = dynamic_cast*>(env->_columns[col]); + assert(colObject != nullptr); + colObject->SetAt(index, 0, value); + } + static MANAGED_CALLBACK(void) SetU8(EnvironmentBlock *env, int col, long index, CxUInt64 value) + { + PythonObject* colObject = dynamic_cast*>(env->_columns[col]); + assert(colObject != nullptr); + colObject->SetAt(index, 0, value); + } + static MANAGED_CALLBACK(void) SetTX(EnvironmentBlock *env, int col, long index, char* value, long length) + { + PythonObject* colObject = dynamic_cast*>(env->_columns[col]); + assert(colObject != nullptr); + colObject->SetAt(index, 0, std::string(value, length)); + } + static MANAGED_CALLBACK(void) SetKeyValue(EnvironmentBlock *env, int keyColumnIndex, int keyCode, char* value, long length) + { + assert(keyColumnIndex < env->_vKeyValues.size()); + PythonObject* keyNamesObject = env->_vKeyValues[keyColumnIndex]; + keyNamesObject->SetAt(keyCode, 0, std::string(value, length)); + } }; @@ -208,32 +208,32 @@ class CLASS_ALIGN EnvironmentBlock inline void FillDead(int& x) { - assert(sizeof(int) == 4); - x = BAD_QUAD; + assert(sizeof(int) == 4); + x = BAD_QUAD; } inline void FillDead(CxInt64& x) { - assert(sizeof(CxInt64) == 8); - assert(sizeof(int) == 4); - ((int *)&x)[0] = BAD_QUAD; - ((int *)&x)[1] = BAD_QUAD; + assert(sizeof(CxInt64) == 8); + assert(sizeof(int) == 4); + ((int *)&x)[0] = BAD_QUAD; + ((int *)&x)[1] = BAD_QUAD; } template inline void FillDead(T*& x) { - assert(sizeof(T*) == 8); - assert(sizeof(int) == 4); - ((int *)&x)[0] = BAD_QUAD; - ((int *)&x)[1] = BAD_QUAD; + assert(sizeof(T*) == 8); + assert(sizeof(int) == 4); + ((int *)&x)[0] = BAD_QUAD; + ((int *)&x)[1] = BAD_QUAD; } struct MlNetExecutionError : std::exception { - MlNetExecutionError(const char *message) : msg_(message) { } - virtual char const *what() const noexcept { return msg_.c_str(); } + MlNetExecutionError(const char *message) : msg_(message) { } + virtual char const *what() const noexcept { return msg_.c_str(); } private: - std::string msg_; + std::string msg_; }; diff --git a/src/NativeBridge/PythonInterop.cpp b/src/NativeBridge/PythonInterop.cpp index c0a833b7..be2f0d79 100644 --- a/src/NativeBridge/PythonInterop.cpp +++ b/src/NativeBridge/PythonInterop.cpp @@ -6,7 +6,7 @@ PythonObjectBase::PythonObjectBase(const int& kind) { - _kind = kind; + _kind = kind; } PythonObjectBase::~PythonObjectBase() @@ -17,39 +17,39 @@ PythonObjectBase::creation_map* PythonObjectBase::m_pCreationMap = PythonObjectB PythonObjectBase::creation_map* PythonObjectBase::CreateMap() { - PythonObjectBase::creation_map* map = new PythonObjectBase::creation_map(); - - map->insert(creation_map_entry(BL, CreateObject)); - map->insert(creation_map_entry(I1, CreateObject)); - map->insert(creation_map_entry(I2, CreateObject)); - map->insert(creation_map_entry(I4, CreateObject)); - map->insert(creation_map_entry(I8, CreateObject)); - map->insert(creation_map_entry(U1, CreateObject)); - map->insert(creation_map_entry(U2, CreateObject)); - map->insert(creation_map_entry(U4, CreateObject)); - map->insert(creation_map_entry(U8, CreateObject)); - map->insert(creation_map_entry(R4, CreateObject)); - map->insert(creation_map_entry(R8, CreateObject)); - map->insert(creation_map_entry(TX, CreateObject)); - return map; + PythonObjectBase::creation_map* map = new PythonObjectBase::creation_map(); + + map->insert(creation_map_entry(BL, CreateObject)); + map->insert(creation_map_entry(I1, CreateObject)); + map->insert(creation_map_entry(I2, CreateObject)); + map->insert(creation_map_entry(I4, CreateObject)); + map->insert(creation_map_entry(I8, CreateObject)); + map->insert(creation_map_entry(U1, CreateObject)); + map->insert(creation_map_entry(U2, CreateObject)); + map->insert(creation_map_entry(U4, CreateObject)); + map->insert(creation_map_entry(U8, CreateObject)); + map->insert(creation_map_entry(R4, CreateObject)); + map->insert(creation_map_entry(R8, CreateObject)); + map->insert(creation_map_entry(TX, CreateObject)); + return map; } PythonObjectBase* PythonObjectBase::CreateObject(const int& kind, size_t numRows, size_t numCols) { - creation_map::iterator found = m_pCreationMap->find(kind); + creation_map::iterator found = m_pCreationMap->find(kind); - if (found == m_pCreationMap->end()) - { - std::stringstream message; - message << "Columns of kind " << kind << " are not supported."; - throw std::invalid_argument(message.str().c_str()); - } + if (found == m_pCreationMap->end()) + { + std::stringstream message; + message << "Columns of kind " << kind << " are not supported."; + throw std::invalid_argument(message.str().c_str()); + } - return found->second(kind, numRows, numCols); + return found->second(kind, numRows, numCols); } template PythonObjectBase* PythonObjectBase::CreateObject(const int& kind, size_t nRows, size_t nColumns) { - return new PythonObject(kind, nRows, nColumns); + return new PythonObject(kind, nRows, nColumns); } diff --git a/src/NativeBridge/PythonInterop.h b/src/NativeBridge/PythonInterop.h index 9654476a..9fe31ebf 100644 --- a/src/NativeBridge/PythonInterop.h +++ b/src/NativeBridge/PythonInterop.h @@ -8,53 +8,53 @@ // Taken from ML.NET source code. These values should be stable. enum DataKind { - I1 = 1, - U1 = 2, - I2 = 3, - U2 = 4, - I4 = 5, - U4 = 6, - I8 = 7, - U8 = 8, - R4 = 9, - R8 = 10, - TX = 11, - BL = 12, - TS = 13, - DT = 14, - DZ = 15, + I1 = 1, + U1 = 2, + I2 = 3, + U2 = 4, + I4 = 5, + U4 = 6, + I8 = 7, + U8 = 8, + R4 = 9, + R8 = 10, + TX = 11, + BL = 12, + TS = 13, + DT = 14, + DZ = 15, }; class PythonObjectBase { private: - typedef std::map creation_map; - typedef std::pair creation_map_entry; + typedef std::map creation_map; + typedef std::pair creation_map_entry; - static creation_map* m_pCreationMap; - static creation_map* CreateMap(); + static creation_map* m_pCreationMap; + static creation_map* CreateMap(); - template static PythonObjectBase* CreateObject(const int& name, size_t nRows, size_t nColumns); + template static PythonObjectBase* CreateObject(const int& name, size_t nRows, size_t nColumns); protected: - int _kind; + int _kind; public: - PythonObjectBase(const int& kind); - static PythonObjectBase* CreateObject(const int& kind, size_t numRows, size_t numCols); - const int& GetKind() const; - void SetKind(int kind); - virtual ~PythonObjectBase(); + PythonObjectBase(const int& kind); + static PythonObjectBase* CreateObject(const int& kind, size_t numRows, size_t numCols); + const int& GetKind() const; + void SetKind(int kind); + virtual ~PythonObjectBase(); }; inline const int& PythonObjectBase::GetKind() const { - return _kind; + return _kind; } inline void PythonObjectBase::SetKind(int kind) { - _kind = kind; + _kind = kind; } @@ -62,47 +62,47 @@ template class PythonObject : public PythonObjectBase { protected: - std::vector* _pData; + std::vector* _pData; - size_t _numRows; - size_t _numCols; + size_t _numRows; + size_t _numCols; public: - PythonObject(const int& kind, size_t numRows = 1, size_t numCols = 1); - virtual ~PythonObject(); - void SetAt(size_t nRow, size_t nCol, const T& value); - const std::vector* GetData() const; + PythonObject(const int& kind, size_t numRows = 1, size_t numCols = 1); + virtual ~PythonObject(); + void SetAt(size_t nRow, size_t nCol, const T& value); + const std::vector* GetData() const; }; template inline PythonObject::PythonObject(const int& kind, size_t numRows, size_t numCols) - : PythonObjectBase(kind) + : PythonObjectBase(kind) { - _numRows = numRows; - _numCols = numCols; + _numRows = numRows; + _numCols = numCols; - _pData = new std::vector(); - if (_numRows > 0) - _pData->reserve(_numRows*_numCols); + _pData = new std::vector(); + if (_numRows > 0) + _pData->reserve(_numRows*_numCols); } template inline PythonObject::~PythonObject() { - delete _pData; + delete _pData; } template inline void PythonObject::SetAt(size_t nRow, size_t nCol, const T& value) { - size_t index = nRow*_numCols + nCol; - if (_pData->size() <= index) - _pData->resize(index + 1); - _pData->at(index) = value; + size_t index = nRow * _numCols + nCol; + if (_pData->size() <= index) + _pData->resize(index + 1); + _pData->at(index) = value; } template inline const std::vector* PythonObject::GetData() const { - return _pData; + return _pData; } \ No newline at end of file diff --git a/src/NativeBridge/UnixInterface.h b/src/NativeBridge/UnixInterface.h index 0a7c1155..f5e88099 100644 --- a/src/NativeBridge/UnixInterface.h +++ b/src/NativeBridge/UnixInterface.h @@ -151,7 +151,7 @@ class UnixMlNetInterface std::string libsroot(mlnetpath); std::string coreclrdir(coreclrpath); - if (strlen(dpreppath) == 0) + if (strlen(dpreppath) == 0) { dpreppath = mlnetpath; } diff --git a/src/NativeBridge/dllmain.cpp b/src/NativeBridge/dllmain.cpp index 0dafd696..cbad43ba 100644 --- a/src/NativeBridge/dllmain.cpp +++ b/src/NativeBridge/dllmain.cpp @@ -7,6 +7,7 @@ #include "ManagedInterop.h" #define PARAM_SEED "seed" +#define PARAM_MAX_SLOTS "max_slots" #define PARAM_GRAPH "graph" #define PARAM_VERBOSE "verbose" #define PARAM_MLNET_PATH "mlnetPath" @@ -75,15 +76,15 @@ bp::dict pxCall(bp::dict& params) bp::extract mlnetPath(params[PARAM_MLNET_PATH]); bp::extract dotnetClrPath(params[PARAM_DOTNETCLR_PATH]); bp::extract dprepPath(params[PARAM_DPREP_PATH]); - bp::extract pythonPath(params[PARAM_PYTHON_PATH]); - bp::extract verbose(params[PARAM_VERBOSE]); + bp::extract pythonPath(params[PARAM_PYTHON_PATH]); + bp::extract verbose(params[PARAM_VERBOSE]); std::int32_t i_verbose = std::int32_t(verbose); std::string s_mlnetPath = std::string(mlnetPath); std::string s_dotnetClrPath = std::string(dotnetClrPath); std::string s_dprepPath = std::string(dprepPath); std::string s_pythonPath = std::string(pythonPath); - std::string s_graph = std::string(graph); - const char *mlnetpath = s_mlnetPath.c_str(); + std::string s_graph = std::string(graph); + const char *mlnetpath = s_mlnetPath.c_str(); const char *coreclrpath = s_dotnetClrPath.c_str(); const char *dpreppath = s_dprepPath.c_str(); @@ -96,7 +97,11 @@ bp::dict pxCall(bp::dict& params) if (params.has_key(PARAM_SEED)) seed = bp::extract(params[PARAM_SEED]); - EnvironmentBlock env(i_verbose, 0, seed, s_pythonPath.c_str()); + int maxSlots = -1; + if (params.has_key(PARAM_MAX_SLOTS)) + maxSlots = bp::extract(params[PARAM_MAX_SLOTS]); + + EnvironmentBlock env(i_verbose, maxSlots, seed, s_pythonPath.c_str()); int retCode; if (params.has_key(PARAM_DATA) && bp::extract(params[PARAM_DATA]).check()) { diff --git a/src/NativeBridge/stdafx.h b/src/NativeBridge/stdafx.h index f5fe57f1..91c2f2fb 100644 --- a/src/NativeBridge/stdafx.h +++ b/src/NativeBridge/stdafx.h @@ -81,7 +81,7 @@ class StopWatch ~StopWatch() { auto endTime = std::chrono::high_resolution_clock::now(); - + std::stringstream buffer; buffer << m_description << ":" << ((endTime - m_startTime).count() / 1000000) << " msecs" << std::endl; diff --git a/src/python/nimbusml/internal/utils/entrypoints.py b/src/python/nimbusml/internal/utils/entrypoints.py index 1e6037db..09a808ce 100644 --- a/src/python/nimbusml/internal/utils/entrypoints.py +++ b/src/python/nimbusml/internal/utils/entrypoints.py @@ -254,52 +254,10 @@ def nimbusml_runnable_graph(self): '"nodes"', '"Nodes"') - def run( - self, - X, - y=None, - seed=None, - parallel=None, - max_slots=-1, - random_state=None, - verbose=1, - **params): - """ - run graph - """ - code = "" - if parallel is not None: - if isinstance(parallel, six.integer_types): - code += "parallel = {} ".format(parallel) - else: - raise TypeError("parallel is not of 'int' type.") - if seed is not None: - if isinstance(seed, six.integer_types): - code += "seed = {} ".format(seed) - else: - raise TypeError("seed is not of 'int' type.") - if parallel is not None: - if isinstance(parallel, six.integer_types): - code += "parallel = {} ".format(parallel) - else: - raise TypeError("parallel is not of 'int' type.") - if max_slots is not None: - if isinstance(max_slots, six.integer_types): - code += "maxSlots = {} ".format(max_slots) - else: - raise TypeError("max_slots is not of 'int' type.") - - if params.get("dryrun") is not None: - ret = 'graph = {%s} %s' % (str(self), code) - else: - ret = self.idv_bridge(X, y, code, random_state, verbose, **params) - return ret - def _try_call_bridge( self, px_call, call_parameters, - code, verbose, concatenated, output_modelfilename): @@ -324,9 +282,9 @@ def _try_call_bridge( type(od), ','.join(od)) if isinstance(verbose, six.integer_types) and verbose >= 2: raise BridgeRuntimeError( - "{0}.\n--CODE--\n{1}\n--GRAPH--\n{2}\n--DATA--\n{3}" - "\n--\nconcatenated={4}".format( - str(e), code, str(self), vars, concatenated), + "{0}.\n--GRAPH--\n{1}\n--DATA--\n{2}" + "\n--\nconcatenated={3}".format( + str(e), str(self), vars, concatenated), model=output_modelfilename) else: raise BridgeRuntimeError( @@ -348,12 +306,15 @@ def _get_separator(self): return None return pieces[0].replace("sep=", "").strip() - def idv_bridge(self, X, y, code, random_state=None, verbose=1, **params): + def run(self, X, y=None, max_slots=-1, random_state=None, verbose=1, **params): + if params.get("dryrun") is not None: + return 'graph = %s' % (str(self)) + output_modelfilename = None output_metricsfilename = None out_metrics = None - # Ideally, idv_bridge shouldn't care if it's running CV + # Ideally, run_graph shouldn't care if it's running CV # or a regular pipeline. That required changing the idv_bridge to be # more flexible (e.g. changing return value, changing input # structure, etc.) In my first attempt, this approach caused @@ -442,9 +403,7 @@ def remove_multi_level_index(c): f.write(self.nimbusml_runnable_graph) call_parameters['verbose'] = try_set(verbose, False, six.integer_types) - call_parameters['graph'] = try_set( - 'graph = {%s} %s' % - (str(self), code), False, str) + call_parameters['graph'] = try_set(str(self), False, str) # Set paths to .NET Core CLR, ML.NET and DataPrep libs set_clr_environment_vars() @@ -455,10 +414,13 @@ def remove_multi_level_index(c): if random_state: call_parameters['seed'] = try_set(random_state, False, six.integer_types) + + if max_slots: + call_parameters['max_slots'] = try_set(max_slots, False, six.integer_types) + ret = self._try_call_bridge( px_call, call_parameters, - code, verbose, concatenated, output_modelfilename) diff --git a/src/python/nimbusml/model_selection/cv.py b/src/python/nimbusml/model_selection/cv.py index d719e07f..8a3d1a57 100644 --- a/src/python/nimbusml/model_selection/cv.py +++ b/src/python/nimbusml/model_selection/cv.py @@ -557,7 +557,6 @@ def fit( X=X, y=y, random_state=pipeline.random_state, - seed=pipeline.random_state, w=weights, verbose=verbose, telemetry_info=telemetry_info, diff --git a/src/python/nimbusml/tests/decomposition/test_pcaanomalydetector.py b/src/python/nimbusml/tests/decomposition/test_pcaanomalydetector.py index f3d81ea2..c5a04806 100644 --- a/src/python/nimbusml/tests/decomposition/test_pcaanomalydetector.py +++ b/src/python/nimbusml/tests/decomposition/test_pcaanomalydetector.py @@ -25,7 +25,7 @@ def test_PcaAnomalyDetector(self): scores = svm.predict(X_test) assert_almost_equal( scores.sum().sum(), - 4.181632, + 4.1786637, decimal=7, err_msg="Sum should be %s" % 4.181632) diff --git a/src/python/nimbusml/tests/ensemble/test_ensembleregressor.py b/src/python/nimbusml/tests/ensemble/test_ensembleregressor.py index 5c61d9b2..a3c95495 100644 --- a/src/python/nimbusml/tests/ensemble/test_ensembleregressor.py +++ b/src/python/nimbusml/tests/ensemble/test_ensembleregressor.py @@ -33,7 +33,7 @@ def test_ensembleregressor(self): scores = ensemble.predict(X_test) r2 = r2_score(y_test, scores) - assert_greater(r2, 0.12, "should be greater than %s" % 0.12) + assert_greater(r2, 0.105, "should be greater than %s" % 0.105) assert_less(r2, 0.13, "sum should be less than %s" % 0.13) ensemble_with_options = EnsembleRegressor( @@ -46,8 +46,8 @@ def test_ensembleregressor(self): scores = ensemble_with_options.predict(X_test) r2 = r2_score(y_test, scores) - assert_greater(r2, 0.0279, "R-Squared should be greater than %s" % 0.0279) - assert_less(r2, 0.03, "R-Squared should be less than %s" % 0.03) + assert_greater(r2, 0.07, "R-Squared should be greater than %s" % 0.07) + assert_less(r2, 0.08, "R-Squared should be less than %s" % 0.08) if __name__ == '__main__': diff --git a/src/python/nimbusml/tests/linear_model/test_averagedperceptronbinaryclassifier.py b/src/python/nimbusml/tests/linear_model/test_averagedperceptronbinaryclassifier.py index 96397f70..bcdc6530 100644 --- a/src/python/nimbusml/tests/linear_model/test_averagedperceptronbinaryclassifier.py +++ b/src/python/nimbusml/tests/linear_model/test_averagedperceptronbinaryclassifier.py @@ -37,7 +37,7 @@ def setUpClass(cls): def test_averagedperceptron(self): accuracy = get_accuracy(self, AveragedPerceptronBinaryClassifier()) # Accuracy depends on column Unnamed0 (index). - assert_greater(accuracy, 0.98, "accuracy should be %s" % 0.98) + assert_greater(accuracy, 0.93, "accuracy should be greater than %s" % 0.93) def test_averagedperceptron_supported_losses(self): # bug: 'exp' fails on this test diff --git a/src/python/nimbusml/tests/model_selection/test_sweep.py b/src/python/nimbusml/tests/model_selection/test_sweep.py index 5a5f0b32..4faa1993 100644 --- a/src/python/nimbusml/tests/model_selection/test_sweep.py +++ b/src/python/nimbusml/tests/model_selection/test_sweep.py @@ -18,7 +18,7 @@ from nimbusml.feature_extraction.text import NGramFeaturizer from nimbusml.feature_extraction.text import WordEmbedding from nimbusml.feature_extraction.text.extractor import Ngram -from nimbusml.linear_model import FastLinearBinaryClassifier +from nimbusml.linear_model import FastLinearBinaryClassifier, AveragedPerceptronBinaryClassifier from nimbusml.utils import get_X_y from sklearn.model_selection import GridSearchCV from sklearn.utils.testing import assert_raises @@ -68,12 +68,8 @@ def test_hyperparameters_sweep(self): 'learner__number_of_trees': 1} def test_learners_sweep(self): - # grid search over 2 learners, even though pipe defined with - # FastTreesBinaryClassifier - # FastLinearBinaryClassifier learner wins, meaning we grid searched - # over it + # grid search over 2 learners np.random.seed(0) - df = pd.DataFrame(dict(education=['A', 'A', 'A', 'A', 'B', 'A', 'B'], workclass=['X', 'Y', 'X', 'X', 'X', 'Y', 'Y'], y=[1, 0, 1, 1, 0, 1, 0])) @@ -86,17 +82,13 @@ def test_learners_sweep(self): param_grid = dict( learner=[ - FastLinearBinaryClassifier(), - FastTreesBinaryClassifier()], - learner__number_of_threads=[ - 1, - 4]) + AveragedPerceptronBinaryClassifier(), + FastTreesBinaryClassifier()]) grid = GridSearchCV(pipe, param_grid) grid.fit(X, y) assert grid.best_params_[ - 'learner'].__class__.__name__ == 'FastLinearBinaryClassifier' - assert grid.best_params_['learner__number_of_threads'] == 1 + 'learner'].__class__.__name__ == 'AveragedPerceptronBinaryClassifier' @unittest.skipIf( six.PY2, diff --git a/src/python/nimbusml/tests/pipeline/test_predict_proba_decision_function.py b/src/python/nimbusml/tests/pipeline/test_predict_proba_decision_function.py index f6cc1c70..138622b4 100644 --- a/src/python/nimbusml/tests/pipeline/test_predict_proba_decision_function.py +++ b/src/python/nimbusml/tests/pipeline/test_predict_proba_decision_function.py @@ -209,12 +209,12 @@ def test_pass_predict_proba_from_load_model(selfs): class TestDecisionFunction(unittest.TestCase): def test_pass_decision_function_binary(self): assert_almost_equal(decfun_sum(FactorizationMachineBinaryClassifier( - )), -32.618393, decimal=5, err_msg=invalid_decision_function_output) + )), -30.2316, decimal=4, err_msg=invalid_decision_function_output) def test_pass_decision_function_binary_with_pipeline(self): assert_almost_equal( decfun_sum(Pipeline([FactorizationMachineBinaryClassifier( - )])), -32.618393, decimal=5, + )])), -30.2316, decimal=4, err_msg=invalid_decision_function_output) def test_pass_decision_function_multiclass(self): diff --git a/src/python/nimbusml/tests/pipeline/test_uci_adult.py b/src/python/nimbusml/tests/pipeline/test_uci_adult.py index 990f0b72..c5be9d58 100644 --- a/src/python/nimbusml/tests/pipeline/test_uci_adult.py +++ b/src/python/nimbusml/tests/pipeline/test_uci_adult.py @@ -173,15 +173,5 @@ def test_experiment_loadsavemodel(self): sum2, "model metrics don't match after loading model") - def test_parallel(self): - (train, label) = get_X_y(train_file, label_column, sep=',') - cat = OneHotVectorizer() << categorical_columns - ftree = FastTreesBinaryClassifier() - pipeline = Pipeline([cat, ftree]) - - result = pipeline.fit(train, label, parallel=8) - result2 = pipeline.fit(train, label, parallel=1) - assert_true(result == result2) - if __name__ == '__main__': unittest.main() diff --git a/src/python/tests/test_estimator_checks.py b/src/python/tests/test_estimator_checks.py index 9cbc09d0..7fe31334 100644 --- a/src/python/tests/test_estimator_checks.py +++ b/src/python/tests/test_estimator_checks.py @@ -70,7 +70,9 @@ # bug, low tolerance 'FastLinearRegressor': 'check_supervised_y_2d, ' 'check_regressor_data_not_an_array, ' - 'check_regressors_int', + 'check_regressors_int, ' + # todo: investigate + 'check_regressors_train', # bug decision function shape should be 1 # dimensional arrays, tolerance 'FastLinearClassifier': 'check_classifiers_train', @@ -93,6 +95,8 @@ 'check_estimators_dtypes', # tolerance 'LogisticRegressionClassifier': 'check_classifiers_train', + # todo: investigate + 'OnlineGradientDescentRegressor': 'check_regressors_train', # bug decision function shape, prediction bug 'NaiveBayesClassifier': 'check_classifiers_train, check_classifiers_classes', diff --git a/src/python/tests_extended/test_docs_example.py b/src/python/tests_extended/test_docs_example.py index 27470667..3c93d010 100644 --- a/src/python/tests_extended/test_docs_example.py +++ b/src/python/tests_extended/test_docs_example.py @@ -128,9 +128,10 @@ def test_examples(self): "CacheClassesFromAssembly: can't map name " "OLSLinearRegression to Void, already mapped to Void", # TensorFlowScorer.py - "tensorflow/compiler/xla/service/service.cc:150] XLA service", - "tensorflow/compiler/xla/service/service.cc:158] StreamExecutor device", + "tensorflow/compiler/xla/service/service.cc:168] XLA service", + "tensorflow/compiler/xla/service/service.cc:175] StreamExecutor device", "tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency:", + "tensorflow/compiler/jit/mark_for_compilation_pass.cc:1412] (One-time warning): Not using XLA:CPU", # Binner.py "from collections import Mapping, defaultdict", "DeprecationWarning: Using or importing the ABCs",