From 28b8ed3ea17375615b2efa38fde5fcb1697f3dbc Mon Sep 17 00:00:00 2001
From: Gani Nazirov
Date: Mon, 24 Feb 2020 16:33:57 -0800
Subject: [PATCH 1/4] add ORT results

---
 nuget.config                                     |   5 +-
 src/python/tests_extended/data_frame_tool.py     | 195 ++++++++++++++++++
 .../tests_extended/test_export_to_onnx.py        |  17 +-
 3 files changed, 213 insertions(+), 4 deletions(-)
 create mode 100644 src/python/tests_extended/data_frame_tool.py

diff --git a/nuget.config b/nuget.config
index c0efdcaa..999bb5b5 100644
--- a/nuget.config
+++ b/nuget.config
@@ -5,7 +5,8 @@
-
-
+
+
diff --git a/src/python/tests_extended/data_frame_tool.py b/src/python/tests_extended/data_frame_tool.py
new file mode 100644
index 00000000..a6d9c6d4
--- /dev/null
+++ b/src/python/tests_extended/data_frame_tool.py
@@ -0,0 +1,195 @@
+#-------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+#--------------------------------------------------------------------------
+
+from datetime import datetime
+import numpy as np
+import pandas as pd
+import onnxruntime as onnxrt
+
+ort_float_set = set([np.float32, np.float64])
+
+pd_float_set = set(['float64'])
+
+ort_int_set = set([np.int8, np.uint8, np.int16, np.uint16, np.int32, np.uint32, np.int64, np.uint64])
+
+pd_int_set = set(['int64'])
+
+types_dict = {
+    'tensor(float16)': np.float16,
+    'tensor(float)'  : np.float32,
+    'tensor(double)' : np.float64,
+
+    'tensor(int8)'   : np.int8,
+    'tensor(uint8)'  : np.uint8,
+    'tensor(int16)'  : np.int16,
+    'tensor(uint16)' : np.uint16,
+    'tensor(int32)'  : np.int32,
+    'tensor(uint32)' : np.uint32,
+    'tensor(int64)'  : np.int64,
+    'tensor(uint64)' : np.uint64,
+
+    'tensor(bool)'   : np.bool,
+    'tensor(string)' : np.object
+}
+
+class DataFrameTool():
+    """
+    A utility class used to run an ONNX model with pandas.DataFrame input.
+    """
+    def __init__(self, model_path, sess_options=None):
+        """
+        :param model_path: path to the model to be loaded
+        :param sess_options: see onnxruntime.SessionOptions
+        """
+        self._model_path = model_path
+        self._sess_options = sess_options
+        self._sess = onnxrt.InferenceSession(self._model_path, self._sess_options)
+
+    def _reshape_input(self, input_array, expected_shape):
+        """
+        :param input_array: numpy array obtained from the DataFrame, expected to have
+            a rank of 1.
+        :param expected_shape: shape fetched from the model, which may include dynamic elements.
+            expected_shape may have at most one -1, None or zero, which is computed from
+            the size of input_array. We replace None and zeros with -1 and let np.ndarray.reshape deal with it.
+        """
+        # expected_shape has rank one, so we let onnxruntime deal with it
+        if len(expected_shape) == 1:
+            return input_array
+
+        inferred_shape = [dim if dim else -1 for dim in expected_shape]
+        return input_array.reshape(inferred_shape)
+
+    def _validate_type(self, input_meta, col_type):
+        """
+        :param input_meta: meta info obtained from the model for the given input
+        :param col_type: dtype of the column
+        :raises TypeError: if the types are not compatible
+
+        float16 and bool always require an exact match.
+        We attempt to convert any type to a string if a string is required.
+        With strings we always want to put the data into a flat array, cast to np.object and then reshape as object.
+        Any other type, to qualify for casting, must match either the integer or the floating point group.
+        Python datetimes, denoted in pandas as datetime64[ns], are cast to int64.
+        """
+        expected_type = types_dict[input_meta.type]
+        if input_meta.type == 'tensor(string)':
+            return
+        elif expected_type == col_type:
+            return
+        elif expected_type == np.int64 and str(col_type) == 'datetime64[ns]':
+            return
+        elif expected_type == np.uint32 and str(col_type) == 'category':
+            return
+        elif expected_type in ort_float_set and str(col_type) in pd_float_set:
+            return
+        elif expected_type in ort_int_set and str(col_type) in pd_int_set:
+            return
+
+        raise TypeError("Input {} requires type {}; unable to cast column type {}".format(
+            input_meta.name, expected_type, col_type))
+
+
+    def _process_input_list(self, df, input_metas, require):
+        """
+        Return a dictionary of input_name : a typed and shaped np.array of values for the given input_metas.
+        The function does the heavy lifting for _get_input_feeds().
+
+        :param df: See :class:`pandas.DataFrame`.
+        :param input_metas: a list of name/type pairs
+        :param require: a boolean; if True, this helper throws on a missing input
+        """
+        feeds = {}
+        # Process mandatory inputs. Raise an error if anything is not present.
+        for input_meta in input_metas:
+            # We fully expect all the types to be in the above dictionary
+            assert input_meta.type in types_dict, "Update types_dict for the new type"
+            if input_meta.name in df.columns:
+                self._validate_type(input_meta, df[input_meta.name].dtype)
+                if (df[input_meta.name].dtype) == 'datetime64[ns]':
+                    input_array = np.array([dt.timestamp() for dt in df[input_meta.name]]).astype(np.int64)
+                else:
+                    # With strings we must cast first to np.object and then reshape,
+                    # so we do it for everything
+                    input_array = np.array(df[input_meta.name]).astype(types_dict[input_meta.type])
+
+                feeds[input_meta.name] = self._reshape_input(input_array, input_meta.shape)
+
+            elif require:
+                raise RuntimeError("This model requires input {} of type {} but it is not found in the DataFrame".format(
+                    input_meta.name, types_dict[input_meta.type]))
+        return feeds
+
+
+    def _get_input_feeds(self, df, sess):
+        """
+        Return a dictionary of input_name : a typed and shaped np.array of values.
+        This function accepts a pandas DataFrame as the first argument and an onnxruntime
+        session with a loaded model. The function interrogates the model for its inputs
+        and matches the model input names to the DataFrame column names.
+        It requires exact matches for bool and float16 types. It attempts to convert
+        any input type to string if a string is required.
+        It attempts to convert floating point types to each other and does the same for all of the
+        integer types, without requiring an exact match.
+
+        :param df: See :class:`pandas.DataFrame`. The matching columns are fed to the
+            appropriate model inputs.
+
+        :param sess: See :class:`onnxruntime.InferenceSession`.
+
+        ::
+            For example: pd.DataFrame([[0, 4, 20]], index=[0], columns=['A', 'B', 'C'])
+
+        """
+        if df.empty:
+            raise RuntimeError('input DataFrame is empty')
+
+        # Process mandatory inputs. Raise an error if anything is not present.
+        feeds = self._process_input_list(df, sess.get_inputs(), True)
+        # Process optional overridable initializers. If present, the initializer value
+        # is overridden by the input. If not, the initializer value embedded in the model takes effect.
+        initializers = self._process_input_list(df, sess.get_overridable_initializers(), False)
+
+        feeds.update(initializers)
+
+        return feeds
+
+    def execute(self, df, output_names=None, output_types=None, run_options=None):
+        """
+        Compute the predictions and return them as a pandas.DataFrame,
+        restricted to output_names if it is not empty.
+
+        :param df: See :class:`pandas.DataFrame`.
+        :param output_names: list of output column names and the order in which to return them
+        :param output_types: optional dictionary { output_name : dtype } that asks to cast the output
+            to the given column type
+        :param run_options: See :class:`onnxruntime.RunOptions`.
+        ::
+            sess.run([output_name], {input_name: x})
+        :return: a pandas.DataFrame with the requested outputs
+        """
+        input_feed = self._get_input_feeds(df, self._sess)
+        if not output_names:
+            output_names = [output.name for output in self._sess._outputs_meta]
+
+        results = self._sess.run(output_names, input_feed, run_options)
+
+        df = pd.DataFrame()
+        for i in range(len(results)):
+            r = results[i].flatten()
+            if output_types and output_names[i] in output_types:
+                dtype = output_types[output_names[i]]
+                if dtype == np.dtype('datetime64'):
+                    r = r.astype(np.int64)
+                    r = [datetime.utcfromtimestamp(ts) for ts in r]
+                else:
+                    r = r.astype(dtype)
+
+            print(len(r))
+            df[output_names[i]] = r
+
+        return df
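For orientation, the new helper is driven roughly as follows. This is a minimal sketch, not part of the patch: model.onnx and the column names f1 and ts are hypothetical and must match the loaded model's input names; the ts column illustrates the datetime64[ns] to int64 conversion performed by _process_input_list.

    import numpy as np
    import pandas as pd
    from data_frame_tool import DataFrameTool

    df = pd.DataFrame({
        'f1': np.array([0.1, 0.2], dtype=np.float64),        # cast to the model's float input
        'ts': pd.to_datetime(['2020-02-24', '2020-02-26'])   # converted to int64 POSIX timestamps
    })

    tool = DataFrameTool('model.onnx')   # wraps an onnxruntime.InferenceSession
    scores = tool.execute(df)            # pandas.DataFrame with one column per model output
    print(scores)
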
diff --git a/src/python/tests_extended/test_export_to_onnx.py b/src/python/tests_extended/test_export_to_onnx.py
index bbe3e95a..d6343194 100644
--- a/src/python/tests_extended/test_export_to_onnx.py
+++ b/src/python/tests_extended/test_export_to_onnx.py
@@ -44,6 +44,7 @@
     SsaSpikeDetector,
     SsaChangePointDetector,
     SsaForecaster)
+from data_frame_tool import DataFrameTool as DFT
 
 SHOW_ONNX_JSON = False
 SHOW_TRANSFORMED_RESULTS = True
@@ -448,7 +449,7 @@ def load_json(file_path):
     return json.loads(content_without_comments)
 
 
-def print_results(result_expected, result_onnx):
+def print_results(result_expected, result_onnx, result_onnx_ort):
     print("\nML.Net Output (Expected Result):")
     print(result_expected)
     if not isinstance(result_expected, pd.Series):
@@ -459,6 +460,10 @@
     if not isinstance(result_onnx, pd.Series):
         print('Columns', result_onnx.columns)
 
+    print("\nORT Result:")
+    print(result_onnx_ort)
+    if not isinstance(result_onnx_ort, pd.Series):
+        print('Columns', result_onnx_ort.columns)
 
 def validate_results(class_name, result_expected, result_onnx):
     if not class_name in EXPECTED_RESULTS:
@@ -489,6 +494,12 @@
         col_expected = result_expected.loc[:, col_pair[0]]
         col_onnx = result_onnx.loc[:, col_pair[1]]
 
+        if isinstance(col_expected.dtype, pd.api.types.CategoricalDtype):
+            # ONNX does not export categorical columns, so convert categorical
+            # columns received from ML.Net back to the original values before
+            # the comparison.
+            col_expected = col_expected.astype(col_expected.dtype.categories.dtype)
+
         pd.testing.assert_series_equal(col_expected,
                                        col_onnx,
                                        check_names=False,
@@ -559,9 +570,11 @@ def test_export_to_onnx(estimator, class_name):
     try:
         onnxrunner = OnnxRunner(model_file=onnx_path)
         result_onnx = onnxrunner.fit_transform(dataset)
+        df_tool = DFT(onnx_path)
+        result_ort = df_tool.execute(dataset, [])
 
         if SHOW_TRANSFORMED_RESULTS:
-            print_results(result_expected, result_onnx)
+            print_results(result_expected, result_onnx, result_ort)
 
         export_valid = validate_results(class_name,
                                         result_expected,

From e1c5cebce2d83b54b8aac73c57730fa089f9aabc Mon Sep 17 00:00:00 2001
From: Gani Nazirov
Date: Wed, 26 Feb 2020 09:53:33 -0800
Subject: [PATCH 2/4] fixes to dataframe tool and vinod

---
 src/python/tests_extended/data_frame_tool.py |  2 +-
 src/python/tests_extended/vinod.py           | 38 +++++++++++++-------
 src/python/tools/manifest.json               |  6 ++--
 3 files changed, 29 insertions(+), 17 deletions(-)

diff --git a/src/python/tests_extended/data_frame_tool.py b/src/python/tests_extended/data_frame_tool.py
index 578596c7..85efabd9 100644
--- a/src/python/tests_extended/data_frame_tool.py
+++ b/src/python/tests_extended/data_frame_tool.py
@@ -190,7 +190,7 @@ def execute(self, df, output_names=None, output_types=None, run_options=None):
                 continue
 
             r = np.split(r, r.shape[-1], axis=-1) \
-                if r.shape[-1] > 1 else [r]
+                if (r.shape[-1] > 1 and r.shape[0] > 1) else [r]
 
             for suffix, col in enumerate(r):
                 col = col.flatten()
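The added r.shape[0] > 1 guard changes how multi-column ONNX outputs are mapped to DataFrame columns: a genuinely two-dimensional result is still split into one column per output slot, while a single-row result is now kept together as one flattened column instead of being scattered across many one-element columns. A standalone illustration of just that expression (not code taken from the patch):

    import numpy as np

    def split_result(r):
        # same expression as in execute(): split only multi-row, multi-column results
        return np.split(r, r.shape[-1], axis=-1) if (r.shape[-1] > 1 and r.shape[0] > 1) else [r]

    print(len(split_result(np.zeros((3, 2)))))   # 2 -> two DataFrame columns
    print(len(split_result(np.zeros((1, 5)))))   # 1 -> one flattened column of length 5
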
diff --git a/src/python/tests_extended/vinod.py b/src/python/tests_extended/vinod.py
index ef8932d1..da858bad 100644
--- a/src/python/tests_extended/vinod.py
+++ b/src/python/tests_extended/vinod.py
@@ -1,12 +1,15 @@
 import os
+import time
 import tempfile
 
 import nimbusml.linear_model as nml_linear
 from nimbusml.feature_extraction.categorical import OneHotVectorizer
 from nimbusml.preprocessing.missing_values import Handler
 from nimbusml import FileDataStream
 from nimbusml.preprocessing import DatasetTransformer
+from nimbusml.preprocessing.schema import ColumnSelector
 from nimbusml import Pipeline
 from nimbusml.preprocessing import OnnxRunner
+from data_frame_tool import DataFrameTool as DFT
 
 def get_tmp_file(suffix=None):
     fd, file_name = tempfile.mkstemp(suffix=suffix)
@@ -38,26 +41,35 @@ def get_tmp_file(suffix=None):
     print('export done')
 
     # Perform the transform using the standard ML.Net backend
-    result_standard = pipe_training.transform(X_test_dprep)
+    start = time.time()
+    result_standard = pipe_training.predict(X_test_dprep)
+    end = time.time()
     print(result_standard)
-    print('done transform using standard backend')
-    # c1 c2
-    # 0 0.025025 0.000998
-    # 1 0.305305 0.000998
+    print('%ss done transform using standard backend' % round(end - start, 3))
 
     # Perform the transform using the ONNX backend.
     # Note: the extra columns and column name differences
     # are a known issue with the ML.Net backend.
-    onnxrunner = OnnxRunner(model_file=onnx_path)
-    result_onnx = onnxrunner.fit_transform(X_test_dprep)
-    print('done transform using onnx backend')
-    print(result_onnx)
-    # c1 c2 c12.0 c22.0
-    # 0 2.5 1.0 0.025025 0.000998
-    # 1 30.5 1.0 0.305305 0.000998
+    onnxrunner = Pipeline([OnnxRunner(model_file=onnx_path),
+                           ColumnSelector(columns=['Score'])])
+    # Performance issue, commenting out for now
+    #start = time.time()
+    #result_onnx = onnxrunner.fit_transform(X_test_dprep, as_binary_data_stream=True)
+    #end = time.time()
+    #print(result_onnx.head(5))
+    #print('%ss done transform using onnx backend' % round(end - start, 3))
+
+    df_tool = DFT(onnx_path)
+    dataset = X_test_dprep.to_df()
+    start = time.time()
+    result_ort = df_tool.execute(dataset, [])
+    end = time.time()
+    print(result_ort)
+    print('%ss done transform using ORT backend (excludes df load time)' % round(end - start, 3))
+
 except Exception as e:
-    print('tragedy')
+    print('=============== ERROR =================')
     print(e)
 
 print ("done")
\ No newline at end of file
diff --git a/src/python/tools/manifest.json b/src/python/tools/manifest.json
index 8d14276b..eb019968 100644
--- a/src/python/tools/manifest.json
+++ b/src/python/tools/manifest.json
@@ -1903,9 +1903,9 @@
       }
     },
     {
-      "Name": "SupressScoresAndLabels",
+      "Name": "SuppressScoresAndLabels",
       "Type": "Bool",
-      "Desc": "Supress labels and scores in per-instance outputs?",
+      "Desc": "Suppress labels and scores in per-instance outputs?",
       "Aliases": [
         "noScores"
       ],
@@ -23945,7 +23945,7 @@
     {
       "Name": "SupressTypeErrors",
       "Type": "Bool",
-      "Desc": "Supress the errors that would occur if a column and impute mode are imcompatible. If true, will skip the column. If false, will stop and throw an error.",
+      "Desc": "Suppress the errors that would occur if a column and impute mode are incompatible. If true, will skip the column. If false, will stop and throw an error.",
       "Aliases": [
         "error"
       ],

From b5ad89acbb58764e010739c5c5efb5d935a835f7 Mon Sep 17 00:00:00 2001
From: Gani Nazirov
Date: Wed, 26 Feb 2020 09:57:39 -0800
Subject: [PATCH 3/4] typo fixes

---
 src/python/tests_extended/test_export_to_onnx.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/src/python/tests_extended/test_export_to_onnx.py b/src/python/tests_extended/test_export_to_onnx.py
index 03eda22a..750c7c75 100644
--- a/src/python/tests_extended/test_export_to_onnx.py
+++ b/src/python/tests_extended/test_export_to_onnx.py
@@ -45,8 +45,6 @@
     SsaForecaster)
 from data_frame_tool import DataFrameTool as DFT
 
-from data_frame_tool import DataFrameTool as DFT
-
 SHOW_ONNX_JSON = False
 SHOW_TRANSFORMED_RESULTS = True
 SHOW_FULL_PANDAS_OUTPUT = False
@@ -584,9 +582,6 @@ def test_export_to_onnx(estimator, class_name):
         df_tool = DFT(onnx_path)
         result_ort = df_tool.execute(dataset, [])
 
-        df_tool = DFT(onnx_path)
-        result_ort = df_tool.execute(dataset, [])
-
         if SHOW_TRANSFORMED_RESULTS:
             print_results(result_expected, result_onnx, result_ort)
 
@@ -618,7 +613,7 @@
 for entry_point in entry_points:
     class_name = entry_point['NewName']
 
-#    if not class_name in ['Handler']:
+#    if not class_name in ['FastLinearClassifier']:
 #        continue
 
     print('\n===========> %s' % class_name)

From 72a04aa151008744773d1ea55468a031d345966c Mon Sep 17 00:00:00 2001
From: Gani Nazirov
Date: Wed, 26 Feb 2020 12:13:02 -0800
Subject: [PATCH 4/4] rollback

---
 nuget.config | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/nuget.config b/nuget.config
index 999bb5b5..c0efdcaa 100644
--- a/nuget.config
+++ b/nuget.config
@@ -5,8 +5,7 @@
-
-
+
+
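Circling back to the DataFrameTool introduced in the first commit: its timestamp handling is symmetric. _process_input_list turns a datetime64[ns] column into int64 seconds on the way in, and execute() turns int64 results back into datetimes when output_types maps the column to datetime64. In isolation the round trip looks like this (a standalone illustration using the same calls, not code from the patch; note that .timestamp() on a tz-naive pandas Timestamp is interpreted as UTC, which is what makes utcfromtimestamp line up, at second resolution):

    from datetime import datetime
    import numpy as np
    import pandas as pd

    # inbound: what _process_input_list does with a datetime64[ns] column
    stamps = np.array([ts.timestamp()
                       for ts in pd.to_datetime(['2020-02-24', '2020-02-26'])]).astype(np.int64)

    # outbound: what execute() does when output_types maps the column to np.dtype('datetime64')
    restored = [datetime.utcfromtimestamp(s) for s in stamps]
    print(restored)   # [datetime(2020, 2, 24, 0, 0), datetime(2020, 2, 26, 0, 0)]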