From 28b8ed3ea17375615b2efa38fde5fcb1697f3dbc Mon Sep 17 00:00:00 2001
From: Gani Nazirov
Date: Mon, 24 Feb 2020 16:33:57 -0800
Subject: [PATCH 1/4] add ORT results

---
 nuget.config                                     |   5 +-
 src/python/tests_extended/data_frame_tool.py     | 195 ++++++++++++++++++
 .../tests_extended/test_export_to_onnx.py        |  17 +-
 3 files changed, 213 insertions(+), 4 deletions(-)
 create mode 100644 src/python/tests_extended/data_frame_tool.py

diff --git a/nuget.config b/nuget.config
index c0efdcaa..999bb5b5 100644
--- a/nuget.config
+++ b/nuget.config
@@ -5,7 +5,8 @@
-
-
+
+
diff --git a/src/python/tests_extended/data_frame_tool.py b/src/python/tests_extended/data_frame_tool.py
new file mode 100644
index 00000000..a6d9c6d4
--- /dev/null
+++ b/src/python/tests_extended/data_frame_tool.py
@@ -0,0 +1,195 @@
+#-------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+#--------------------------------------------------------------------------
+
+from datetime import datetime
+import numpy as np
+import pandas as pd
+import onnxruntime as onnxrt
+
+ort_float_set = set([np.float32, np.float64])
+
+pd_float_set = set(['float64'])
+
+ort_int_set = set([np.int8, np.uint8, np.int16, np.uint16, np.int32, np.uint32, np.int64, np.uint64])
+
+pd_int_set = set(['int64'])
+
+types_dict = {
+    'tensor(float16)': np.float16,
+    'tensor(float)'  : np.float32,
+    'tensor(double)' : np.float64,
+
+    'tensor(int8)'   : np.int8,
+    'tensor(uint8)'  : np.uint8,
+    'tensor(int16)'  : np.int16,
+    'tensor(uint16)' : np.uint16,
+    'tensor(int32)'  : np.int32,
+    'tensor(uint32)' : np.uint32,
+    'tensor(int64)'  : np.int64,
+    'tensor(uint64)' : np.uint64,
+
+    'tensor(bool)'   : np.bool,
+    'tensor(string)' : np.object
+}
+
+class DataFrameTool():
+    """
+    A utility class used to run an ONNX model with pandas.DataFrame input.
+    """
+    def __init__(self, model_path, sess_options=None):
+        """
+        :param model_path: path to the model to be loaded
+        :param sess_options: see onnxruntime.SessionOptions
+        """
+        self._model_path = model_path
+        self._sess_options = sess_options
+        self._sess = onnxrt.InferenceSession(self._model_path, self._sess_options)
+
+    def _reshape_input(self, input_array, expected_shape):
+        """
+        :param input_array: numpy array obtained from the DataFrame, expected to have
+            a rank of 1.
+        :param expected_shape: shape fetched from the model, which may include dynamic elements.
+            expected_shape may have at most one -1, None or zero, which is computed from
+            the size of input_array. We replace None and zeros with -1 and let np.ndarray.reshape deal with it.
+        """
+        # expected_shape has rank one, so we let onnxruntime deal with it
+        if len(expected_shape) == 1:
+            return input_array
+
+        inferred_shape = [dim if dim else -1 for dim in expected_shape]
+        return input_array.reshape(inferred_shape)
+
+    def _validate_type(self, input_meta, col_type):
+        """
+        :param input_meta: meta info obtained from the model for the given input
+        :param col_type: dtype of the column
+        :raises TypeError: if the types are not compatible
+
+        float16 and bool always require an exact match.
+        We attempt to convert any type to a string if a string is required.
+        With strings we always want to put the data into a flat array, cast to np.object and then reshape as object.
+        Any other type, to qualify for casting, must match either the integer or the floating point group.
+        Python datetimes, denoted in pandas as datetime64[ns], are cast to int64.
+        """
+        expected_type = types_dict[input_meta.type]
+        if input_meta.type == 'tensor(string)':
+            return
+        elif expected_type == col_type:
+            return
+        elif expected_type == np.int64 and str(col_type) == 'datetime64[ns]':
+            return
+        elif expected_type == np.uint32 and str(col_type) == 'category':
+            return
+        elif expected_type in ort_float_set and str(col_type) in pd_float_set:
+            return
+        elif expected_type in ort_int_set and str(col_type) in pd_int_set:
+            return
+
+        raise TypeError("Input {} requires type {}; unable to cast column type {}".format(
+            input_meta.name, expected_type, col_type))
+
+
+    def _process_input_list(self, df, input_metas, require):
+        """
+        Return a dictionary of input_name : a typed and shaped np.array of values for the given input_metas.
+        The function does the heavy lifting for _get_input_feeds().
+
+        :param df: See :class:`pandas.DataFrame`.
+        :param input_metas: a list of name/type pairs
+        :param require: a boolean; if True, this helper throws on a missing input
+        """
+        feeds = {}
+        # Process mandatory inputs. Raise an error if anything is not present.
+        for input_meta in input_metas:
+            # We fully expect all the types to be in the above dictionary
+            assert input_meta.type in types_dict, "Update types_dict for the new type"
+            if input_meta.name in df.columns:
+                self._validate_type(input_meta, df[input_meta.name].dtype)
+                if (df[input_meta.name].dtype) == 'datetime64[ns]':
+                    input_array = np.array([dt.timestamp() for dt in df[input_meta.name]]).astype(np.int64)
+                else:
+                    # With strings we must cast first to np.object and then reshape,
+                    # so we do it for everything
+                    input_array = np.array(df[input_meta.name]).astype(types_dict[input_meta.type])
+
+                feeds[input_meta.name] = self._reshape_input(input_array, input_meta.shape)
+
+            elif require:
+                raise RuntimeError("This model requires input {} of type {} but it is not found in the DataFrame".format(
+                    input_meta.name, types_dict[input_meta.type]))
+        return feeds
+
+
+    def _get_input_feeds(self, df, sess):
+        """
+        Return a dictionary of input_name : a typed and shaped np.array of values.
+        This function accepts a pandas DataFrame as the first argument and an onnxruntime
+        session with a loaded model. The function interrogates the model for its inputs
+        and matches the model input names to the DataFrame column names.
+        It requires exact matches for bool and float16 types. It attempts to convert
+        any input type to string if a string is required.
+        It attempts to convert floating point types to each other and does the same for all of the
+        integer types, without requiring an exact match.
+
+        :param df: See :class:`pandas.DataFrame`. The matching columns are fed to the
+            appropriate model inputs.
+
+        :param sess: See :class:`onnxruntime.InferenceSession`.
+
+        ::
+            For example: pd.DataFrame([[0, 4, 20]], index=[0], columns=['A', 'B', 'C'])
+
+        """
+        if df.empty:
+            raise RuntimeError('input DataFrame is empty')
+
+        # Process mandatory inputs. Raise an error if anything is not present.
+        feeds = self._process_input_list(df, sess.get_inputs(), True)
+        # Process optional overridable initializers. If present, the initializer value
+        # is overridden by the input. If not, the initializer value embedded in the model takes effect.
+        initializers = self._process_input_list(df, sess.get_overridable_initializers(), False)
+
+        feeds.update(initializers)
+
+        return feeds
+
+    def execute(self, df, output_names=None, output_types=None, run_options=None):
+        """
+        Compute the predictions and return them as a pandas.DataFrame,
+        restricted to output_names if it is not empty.
+
+        :param df: See :class:`pandas.DataFrame`.
+        :param output_names: list of output column names and the order in which to return them
+        :param output_types: optional dictionary { output_name : dtype } that asks to cast the output
+            to the given column type
+        :param run_options: See :class:`onnxruntime.RunOptions`.
+        ::
+            sess.run([output_name], {input_name: x})
+        :return: a pandas.DataFrame with the requested outputs
+        """
+        input_feed = self._get_input_feeds(df, self._sess)
+        if not output_names:
+            output_names = [output.name for output in self._sess._outputs_meta]
+
+        results = self._sess.run(output_names, input_feed, run_options)
+
+        df = pd.DataFrame()
+        for i in range(len(results)):
+            r = results[i].flatten()
+            if output_types and output_names[i] in output_types:
+                dtype = output_types[output_names[i]]
+                if dtype == np.dtype('datetime64'):
+                    r = r.astype(np.int64)
+                    r = [datetime.utcfromtimestamp(ts) for ts in r]
+                else:
+                    r = r.astype(dtype)
+
+            print(len(r))
+            df[output_names[i]] = r
+
+        return df
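For orientation, the new helper is driven roughly as follows. This is a minimal sketch, not part of the patch: model.onnx and the column names f1 and ts are hypothetical and must match the loaded model's input names; the ts column illustrates the datetime64[ns] to int64 conversion performed by _process_input_list.

    import numpy as np
    import pandas as pd
    from data_frame_tool import DataFrameTool

    df = pd.DataFrame({
        'f1': np.array([0.1, 0.2], dtype=np.float64),        # cast to the model's float input
        'ts': pd.to_datetime(['2020-02-24', '2020-02-26'])   # converted to int64 POSIX timestamps
    })

    tool = DataFrameTool('model.onnx')   # wraps an onnxruntime.InferenceSession
    scores = tool.execute(df)            # pandas.DataFrame with one column per model output
    print(scores)
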
diff --git a/src/python/tests_extended/test_export_to_onnx.py b/src/python/tests_extended/test_export_to_onnx.py
index bbe3e95a..d6343194 100644
--- a/src/python/tests_extended/test_export_to_onnx.py
+++ b/src/python/tests_extended/test_export_to_onnx.py
@@ -44,6 +44,7 @@
     SsaSpikeDetector,
     SsaChangePointDetector,
     SsaForecaster)
+from data_frame_tool import DataFrameTool as DFT
 
 SHOW_ONNX_JSON = False
 SHOW_TRANSFORMED_RESULTS = True
@@ -448,7 +449,7 @@ def load_json(file_path):
     return json.loads(content_without_comments)
 
 
-def print_results(result_expected, result_onnx):
+def print_results(result_expected, result_onnx, result_onnx_ort):
     print("\nML.Net Output (Expected Result):")
     print(result_expected)
     if not isinstance(result_expected, pd.Series):
@@ -459,6 +460,10 @@
     if not isinstance(result_onnx, pd.Series):
         print('Columns', result_onnx.columns)
 
+    print("\nORT Result:")
+    print(result_onnx_ort)
+    if not isinstance(result_onnx_ort, pd.Series):
+        print('Columns', result_onnx_ort.columns)
 
 def validate_results(class_name, result_expected, result_onnx):
     if not class_name in EXPECTED_RESULTS:
@@ -489,6 +494,12 @@
         col_expected = result_expected.loc[:, col_pair[0]]
         col_onnx = result_onnx.loc[:, col_pair[1]]
 
+        if isinstance(col_expected.dtype, pd.api.types.CategoricalDtype):
+            # ONNX does not export categorical columns, so convert categorical
+            # columns received from ML.Net back to the original values before
+            # the comparison.
+            col_expected = col_expected.astype(col_expected.dtype.categories.dtype)
+
         pd.testing.assert_series_equal(col_expected,
                                        col_onnx,
                                        check_names=False,
@@ -559,9 +570,11 @@ def test_export_to_onnx(estimator, class_name):
     try:
         onnxrunner = OnnxRunner(model_file=onnx_path)
         result_onnx = onnxrunner.fit_transform(dataset)
+        df_tool = DFT(onnx_path)
+        result_ort = df_tool.execute(dataset, [])
 
         if SHOW_TRANSFORMED_RESULTS:
-            print_results(result_expected, result_onnx)
+            print_results(result_expected, result_onnx, result_ort)
 
         export_valid = validate_results(class_name,
                                         result_expected,

From e1c5cebce2d83b54b8aac73c57730fa089f9aabc Mon Sep 17 00:00:00 2001
From: Gani Nazirov
Date: Wed, 26 Feb 2020 09:53:33 -0800
Subject: [PATCH 2/4] fixes to dataframe tool and vinod

---
 src/python/tests_extended/data_frame_tool.py |  2 +-
 src/python/tests_extended/vinod.py           | 38 +++++++++++++-------
 src/python/tools/manifest.json               |  6 ++--
 3 files changed, 29 insertions(+), 17 deletions(-)

diff --git a/src/python/tests_extended/data_frame_tool.py b/src/python/tests_extended/data_frame_tool.py
index 578596c7..85efabd9 100644
--- a/src/python/tests_extended/data_frame_tool.py
+++ b/src/python/tests_extended/data_frame_tool.py
@@ -190,7 +190,7 @@ def execute(self, df, output_names=None, output_types=None, run_options=None):
                 continue
 
             r = np.split(r, r.shape[-1], axis=-1) \
-                if r.shape[-1] > 1 else [r]
+                if (r.shape[-1] > 1 and r.shape[0] > 1) else [r]
 
             for suffix, col in enumerate(r):
                 col = col.flatten()
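The added r.shape[0] > 1 guard changes how multi-column ONNX outputs are mapped to DataFrame columns: a genuinely two-dimensional result is still split into one column per output slot, while a single-row result is now kept together as one flattened column instead of being scattered across many one-element columns. A standalone illustration of just that expression (not code taken from the patch):

    import numpy as np

    def split_result(r):
        # same expression as in execute(): split only multi-row, multi-column results
        return np.split(r, r.shape[-1], axis=-1) if (r.shape[-1] > 1 and r.shape[0] > 1) else [r]

    print(len(split_result(np.zeros((3, 2)))))   # 2 -> two DataFrame columns
    print(len(split_result(np.zeros((1, 5)))))   # 1 -> one flattened column of length 5
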
diff --git a/src/python/tests_extended/vinod.py b/src/python/tests_extended/vinod.py
index ef8932d1..da858bad 100644
--- a/src/python/tests_extended/vinod.py
+++ b/src/python/tests_extended/vinod.py
@@ -1,12 +1,15 @@
 import os
+import time
 import tempfile
 
 import nimbusml.linear_model as nml_linear
 from nimbusml.feature_extraction.categorical import OneHotVectorizer
 from nimbusml.preprocessing.missing_values import Handler
 from nimbusml import FileDataStream
 from nimbusml.preprocessing import DatasetTransformer
+from nimbusml.preprocessing.schema import ColumnSelector
 from nimbusml import Pipeline
 from nimbusml.preprocessing import OnnxRunner
+from data_frame_tool import DataFrameTool as DFT
 
 def get_tmp_file(suffix=None):
     fd, file_name = tempfile.mkstemp(suffix=suffix)
@@ -38,26 +41,35 @@ def get_tmp_file(suffix=None):
     print('export done')
 
     # Perform the transform using the standard ML.Net backend
-    result_standard = pipe_training.transform(X_test_dprep)
+    start = time.time()
+    result_standard = pipe_training.predict(X_test_dprep)
+    end = time.time()
     print(result_standard)
-    print('done transform using standard backend')
-    # c1 c2
-    # 0 0.025025 0.000998
-    # 1 0.305305 0.000998
+    print('%ss done transform using standard backend' % round(end - start, 3))
 
     # Perform the transform using the ONNX backend.
     # Note: the extra columns and column name differences
     # are a known issue with the ML.Net backend.
-    onnxrunner = OnnxRunner(model_file=onnx_path)
-    result_onnx = onnxrunner.fit_transform(X_test_dprep)
-    print('done transform using onnx backend')
-    print(result_onnx)
-    # c1 c2 c12.0 c22.0
-    # 0 2.5 1.0 0.025025 0.000998
-    # 1 30.5 1.0 0.305305 0.000998
+    onnxrunner = Pipeline([OnnxRunner(model_file=onnx_path),
+                           ColumnSelector(columns=['Score'])])
+    # Performance issue, commenting out for now
+    #start = time.time()
+    #result_onnx = onnxrunner.fit_transform(X_test_dprep, as_binary_data_stream=True)
+    #end = time.time()
+    #print(result_onnx.head(5))
+    #print('%ss done transform using onnx backend' % round(end - start, 3))
+
+    df_tool = DFT(onnx_path)
+    dataset = X_test_dprep.to_df()
+    start = time.time()
+    result_ort = df_tool.execute(dataset, [])
+    end = time.time()
+    print(result_ort)
+    print('%ss done transform using ORT backend (excludes df load time)' % round(end - start, 3))
+
 except Exception as e:
-    print('tragedy')
+    print('=============== ERROR =================')
     print(e)
 
 print ("done")
\ No newline at end of file
diff --git a/src/python/tools/manifest.json b/src/python/tools/manifest.json
index 8d14276b..eb019968 100644
--- a/src/python/tools/manifest.json
+++ b/src/python/tools/manifest.json
@@ -1903,9 +1903,9 @@
       }
     },
     {
-      "Name": "SupressScoresAndLabels",
+      "Name": "SuppressScoresAndLabels",
       "Type": "Bool",
-      "Desc": "Supress labels and scores in per-instance outputs?",
+      "Desc": "Suppress labels and scores in per-instance outputs?",
       "Aliases": [
         "noScores"
       ],
@@ -23945,7 +23945,7 @@
     {
       "Name": "SupressTypeErrors",
       "Type": "Bool",
-      "Desc": "Supress the errors that would occur if a column and impute mode are imcompatible. If true, will skip the column. If false, will stop and throw an error.",
+      "Desc": "Suppress the errors that would occur if a column and impute mode are incompatible. If true, will skip the column. If false, will stop and throw an error.",
       "Aliases": [
         "error"
       ],

From b5ad89acbb58764e010739c5c5efb5d935a835f7 Mon Sep 17 00:00:00 2001
From: Gani Nazirov
Date: Wed, 26 Feb 2020 09:57:39 -0800
Subject: [PATCH 3/4] typo fixes

---
 src/python/tests_extended/test_export_to_onnx.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/src/python/tests_extended/test_export_to_onnx.py b/src/python/tests_extended/test_export_to_onnx.py
index 03eda22a..750c7c75 100644
--- a/src/python/tests_extended/test_export_to_onnx.py
+++ b/src/python/tests_extended/test_export_to_onnx.py
@@ -45,8 +45,6 @@
     SsaForecaster)
 from data_frame_tool import DataFrameTool as DFT
 
-from data_frame_tool import DataFrameTool as DFT
-
 SHOW_ONNX_JSON = False
 SHOW_TRANSFORMED_RESULTS = True
 SHOW_FULL_PANDAS_OUTPUT = False
@@ -584,9 +582,6 @@ def test_export_to_onnx(estimator, class_name):
         df_tool = DFT(onnx_path)
         result_ort = df_tool.execute(dataset, [])
 
-        df_tool = DFT(onnx_path)
-        result_ort = df_tool.execute(dataset, [])
-
         if SHOW_TRANSFORMED_RESULTS:
             print_results(result_expected, result_onnx, result_ort)
 
@@ -618,7 +613,7 @@
 for entry_point in entry_points:
     class_name = entry_point['NewName']
 
-#    if not class_name in ['Handler']:
+#    if not class_name in ['FastLinearClassifier']:
 #        continue
 
     print('\n===========> %s' % class_name)

From 72a04aa151008744773d1ea55468a031d345966c Mon Sep 17 00:00:00 2001
From: Gani Nazirov
Date: Wed, 26 Feb 2020 12:13:02 -0800
Subject: [PATCH 4/4] rollback

---
 nuget.config | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/nuget.config b/nuget.config
index 999bb5b5..c0efdcaa 100644
--- a/nuget.config
+++ b/nuget.config
@@ -5,8 +5,7 @@
-
-
+
+
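Circling back to the DataFrameTool introduced in the first commit: its timestamp handling is symmetric. _process_input_list turns a datetime64[ns] column into int64 seconds on the way in, and execute() turns int64 results back into datetimes when output_types maps the column to datetime64. In isolation the round trip looks like this (a standalone illustration using the same calls, not code from the patch; note that .timestamp() on a tz-naive pandas Timestamp is interpreted as UTC, which is what makes utcfromtimestamp line up, at second resolution):

    from datetime import datetime
    import numpy as np
    import pandas as pd

    # inbound: what _process_input_list does with a datetime64[ns] column
    stamps = np.array([ts.timestamp()
                       for ts in pd.to_datetime(['2020-02-24', '2020-02-26'])]).astype(np.int64)

    # outbound: what execute() does when output_types maps the column to np.dtype('datetime64')
    restored = [datetime.utcfromtimestamp(s) for s in stamps]
    print(restored)   # [datetime(2020, 2, 24, 0, 0), datetime(2020, 2, 26, 0, 0)]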