diff --git a/src/python/nimbusml.pyproj b/src/python/nimbusml.pyproj index e27fedd2..971fe420 100644 --- a/src/python/nimbusml.pyproj +++ b/src/python/nimbusml.pyproj @@ -754,6 +754,8 @@ + + diff --git a/src/python/tests_extended/data_frame_tool.py b/src/python/tests_extended/data_frame_tool.py new file mode 100644 index 00000000..104a0bb8 --- /dev/null +++ b/src/python/tests_extended/data_frame_tool.py @@ -0,0 +1,192 @@ +#------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +#-------------------------------------------------------------------------- + +from datetime import datetime +import numpy as np +import pandas as pd +import onnxruntime as onnxrt + +ort_float_set = set([np.float32, np.float64]) + +pd_float_set = set(['float64']) + +ort_int_set = set([np.int8, np.uint8, np.int16, np.uint16, np.int32, np.uint32, np.int64, np.uint64]) + +pd_int_set = set(['int64']) + +types_dict = { + 'tensor(float16)': np.float16, + 'tensor(float)' : np.float32, + 'tensor(double)' : np.float64, + + 'tensor(int8)' : np.int8, + 'tensor(uint8)' : np.uint8, + 'tensor(int16)' : np.int16, + 'tensor(uint16)' : np.uint16, + 'tensor(int32)' : np.int32, + 'tensor(uint32)' : np.uint32, + 'tensor(int64)' : np.int64, + 'tensor(uint64)' : np.uint64, + + 'tensor(bool)' : np.bool, + 'tensor(string)' : np.object +} + +class DataFrameTool(): + """ + This is a utility class used to run a model with pandas.DataFrame input + """ + def __init__(self, model_path, sess_options=None): + """ + :param model_path: path to the model to be loaded + :param sess_options: see onnxruntime.SessionsOptions + """ + self._model_path = model_path + self._sess_options = sess_options + self._sess = onnxrt.InferenceSession(self._model_path, self._sess_options) + + def _reshape_input(self, input_array, expected_shape): + """ + :param - input_array numpy array. This one is obtained from DataFrame and expected to have + : a rank if 1. + :expected_shape - shape fetched from the model which may include dynamic elements. + : expected_shape may at most have one -1, None or zero which will be computed from + : the size of the input_array. We replace None and zeros to -1 and let np.ndarray.reshape deal with it. + """ + # expected_shape rank is one, we will let onnxruntime to deal with it + if len(expected_shape) == 1: + return input_array + + inferred_shape = [dim if dim else -1 for dim in expected_shape] + return input_array.reshape(inferred_shape) + + def _validate_type(self, input_meta, col_type): + """ + : input_meta - meta info obtained from the model for the given input + : col_type - dtype of the column + : throws if conditions are not met + + float16 and bool will always require exact match + We attempt to convert any type to a string if it is required. + With strings we always want to put this into a flat array, cast to np.object and then reshape as object + Any other type to qualify for casting must match either integer or floating point types + Python datetime which is denoted in Pandas as datetime64[ns] are cast to int64 + """ + expected_type = types_dict[input_meta.type] + if input_meta.type == 'tensor(string)': + return + elif expected_type == col_type: + return + elif expected_type == np.int64 and str(col_type) == 'datetime64[ns]': + return + elif expected_type in ort_float_set and str(col_type) in pd_float_set: + return + elif expected_type in ort_int_set and str(col_type) in pd_int_set: + return + + raise TypeError("Input {} requires type {} unable to cast column type {} ".format( + input_meta.name, expected_type, col_type)) + + + def _process_input_list(self, df, input_metas, require): + """ + Return a dictionary of input_name : a typed and shaped np.array of values for a given input_meta + The function does the heavy lifting for _get_input_feeds() + + :param df: See :class:`pandas.DataFrame`. + :param input_metas: a list of name/type pairs + :require is a boolean. If True this helper throws on a missing input. + + """ + feeds = {} + # Process mandadory inputs. Raise an error if anything is not present + for input_meta in input_metas: + # We fully expect all the types are in the above dictionary + assert input_meta.type in types_dict, "Update types_dict for the new type" + if input_meta.name in df.columns: + self._validate_type(input_meta, df[input_meta.name].dtype) + if (df[input_meta.name].dtype) == 'datetime64[ns]': + input_array = np.array([dt.timestamp() for dt in df[input_meta.name]]).astype(np.int64) + else: + # With strings we must cast first to np.object then then reshape + # so we do it for everything + input_array = np.array(df[input_meta.name]).astype(types_dict[input_meta.type]) + + feeds[input_meta.name] = self._reshape_input(input_array, input_meta.shape) + + elif require: + raise RuntimeError("This model requires input {} of type {} but it is not found in the DataFrame".format( + input_meta.name, types_dict[input_meta.type])) + return feeds + + + def _get_input_feeds(self, df, sess): + """ + Return a dictionary of input_name : a typed and shaped np.array of values + This function accepts Pandas DataFrame as the first argument and onnxruntime + session with a loaded model. The function interrogates the model for the inputs + and matches the model input names to the DataFrame instance column names. + It requires exact matches for bool and float16 types. It attempts to convert to + string any input type if string is required. + It attempts to convert floating types to each other and does the same for all of the + integer types without requiring an exact match. + + :param df: See :class:`pandas.DataFrame`. The function only considers the first row (0) of each column + and feeds the data to the appropriate model inputs. + + :param sess: See :class:`onnxruntime.InferenceSession`. + + :: + For example: pd.DataFrame([[0], [4],[20]],index=[0], columns=['A', 'B', 'C']) + + """ + if df.empty: + raise RuntimeError('input DataFrame is empty') + + # Process mandadory inputs. Raise an error if anything is not present + feeds = self._process_input_list(df, sess.get_inputs(), True) + # Process optional overridable initializers. If present the initialzier value + # is overriden by the input. If not, the initialzier value embedded in the model takes effect. + initializers = self._process_input_list(df, sess.get_overridable_initializers(), False) + + feeds.update(initializers) + + return feeds + + def execute(self, df, output_names=None, output_types=None, run_options=None): + "Return a list of output values restricted to output names if not empty" + """ + Compute the predictions. + + :param df: See :class:`pandas.DataFrame`. + :output_name - list of column output names and their order to output + :output_types { output_name : dtype } optional dictionary that asks to cast output + to the colum type + + :param run_options: See :class:`onnxruntime.RunOptions`. + :: + sess.run([output_name], {input_name: x}) + Pandas DataFrame + """ + input_feed = self._get_input_feeds(df, self._sess); + if not output_names: + output_names = [output.name for output in self._sess._outputs_meta] + + results = self._sess.run(output_names, input_feed, run_options) + + df = pd.DataFrame() + for i in range(len(results)): + r = results[i].flatten() + if output_types and output_names[i] in output_types: + dtype = output_types[output_names[i]] + if dtype == np.dtype('datetime64'): + r = r.astype(np.int64) + r = [datetime.utcfromtimestamp(ts) for ts in r] + else: + r = r.astype(dtype) + + df[output_names[i]] = r + + return df diff --git a/src/python/tests_extended/test_export_to_onnx.py b/src/python/tests_extended/test_export_to_onnx.py index bbe3e95a..bbd02b2a 100644 --- a/src/python/tests_extended/test_export_to_onnx.py +++ b/src/python/tests_extended/test_export_to_onnx.py @@ -43,6 +43,7 @@ from nimbusml.timeseries import (IidSpikeDetector, IidChangePointDetector, SsaSpikeDetector, SsaChangePointDetector, SsaForecaster) +from data_frame_tool import DataFrameTool as DFT SHOW_ONNX_JSON = False @@ -559,6 +560,8 @@ def test_export_to_onnx(estimator, class_name): try: onnxrunner = OnnxRunner(model_file=onnx_path) result_onnx = onnxrunner.fit_transform(dataset) + df_tool = DFT(onnx_path) + result_onnx1 = df_tool.execute(dataset, []) if SHOW_TRANSFORMED_RESULTS: print_results(result_expected, result_onnx) @@ -590,8 +593,8 @@ def test_export_to_onnx(estimator, class_name): for entry_point in entry_points: class_name = entry_point['NewName'] -# if not class_name in ['Handler']: -# continue + if not class_name in ['Handler']: + continue print('\n===========> %s' % class_name) diff --git a/src/python/tests_extended/vinod.py b/src/python/tests_extended/vinod.py new file mode 100644 index 00000000..ef8932d1 --- /dev/null +++ b/src/python/tests_extended/vinod.py @@ -0,0 +1,63 @@ +import os +import tempfile +import nimbusml.linear_model as nml_linear +from nimbusml.feature_extraction.categorical import OneHotVectorizer +from nimbusml.preprocessing.missing_values import Handler +from nimbusml import FileDataStream +from nimbusml.preprocessing import DatasetTransformer +from nimbusml import Pipeline +from nimbusml.preprocessing import OnnxRunner + +def get_tmp_file(suffix=None): + fd, file_name = tempfile.mkstemp(suffix=suffix) + fl = os.fdopen(fd, 'w') + fl.close() + return file_name + +X_train_dprep = FileDataStream.read_csv("E:/sources/vinod/NYCTaxiTipPrediction_train.csv") +X_test_dprep = FileDataStream.read_csv("E:/sources/vinod/NYCTaxiTipPrediction_valid.csv") + +try: + pipe_featurization = Pipeline([OneHotVectorizer(columns={'vendor_id': 'vendor_id', 'payment_type': 'payment_type', 'passenger_count': 'passenger_count','rate_code': 'rate_code'}) + ,Handler(columns={'trip_distance': 'trip_distance', 'trip_time_in_secs': 'trip_time_in_secs'}) + ]) + pipe_featurization.fit(X_train_dprep) + + pipe_training = Pipeline([DatasetTransformer(pipe_featurization.model), + nml_linear.FastLinearRegressor(feature=['vendor_id', 'payment_type', 'passenger_count', 'rate_code', 'trip_distance', 'trip_time_in_secs'],label='fare_amount') + ]) + pipe_training.fit(X_train_dprep) + + metrics, scores = pipe_training.test(X_test_dprep) + print(metrics) + print('training done') + + # Export the pipeline to ONNX + onnx_path = get_tmp_file('.onnx') + pipe_training.export_to_onnx(onnx_path, 'com.microsoft.ml', onnx_version='Stable') + print('export done') + + # Perform the transform using the standard ML.Net backend + result_standard = pipe_training.transform(X_test_dprep) + print(result_standard) + print('done transform using standard backend') + # c1 c2 + # 0 0.025025 0.000998 + # 1 0.305305 0.000998 + + # Perform the transform using the ONNX backend. + # Note, the extra columns and column name differences + # is a known issue with the ML.Net backend. + onnxrunner = OnnxRunner(model_file=onnx_path) + result_onnx = onnxrunner.fit_transform(X_test_dprep) + print('done transform using onnx backend') + print(result_onnx) + # c1 c2 c12.0 c22.0 + # 0 2.5 1.0 0.025025 0.000998 + # 1 30.5 1.0 0.305305 0.000998 + +except Exception as e: + print('tragedy') + print(e) + +print ("done") \ No newline at end of file