diff --git a/src/python/nimbusml.pyproj b/src/python/nimbusml.pyproj
index e27fedd2..971fe420 100644
--- a/src/python/nimbusml.pyproj
+++ b/src/python/nimbusml.pyproj
@@ -754,6 +754,8 @@
+
+
diff --git a/src/python/tests_extended/data_frame_tool.py b/src/python/tests_extended/data_frame_tool.py
new file mode 100644
index 00000000..104a0bb8
--- /dev/null
+++ b/src/python/tests_extended/data_frame_tool.py
@@ -0,0 +1,192 @@
+#-------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+#--------------------------------------------------------------------------
+
+from datetime import datetime
+import numpy as np
+import pandas as pd
+import onnxruntime as onnxrt
+
+ort_float_set = set([np.float32, np.float64])
+
+pd_float_set = set(['float64'])
+
+ort_int_set = set([np.int8, np.uint8, np.int16, np.uint16, np.int32, np.uint32, np.int64, np.uint64])
+
+pd_int_set = set(['int64'])
+
+types_dict = {
+ 'tensor(float16)': np.float16,
+ 'tensor(float)' : np.float32,
+ 'tensor(double)' : np.float64,
+
+ 'tensor(int8)' : np.int8,
+ 'tensor(uint8)' : np.uint8,
+ 'tensor(int16)' : np.int16,
+ 'tensor(uint16)' : np.uint16,
+ 'tensor(int32)' : np.int32,
+ 'tensor(uint32)' : np.uint32,
+ 'tensor(int64)' : np.int64,
+ 'tensor(uint64)' : np.uint64,
+
+ 'tensor(bool)' : np.bool,
+ 'tensor(string)' : np.object
+}
+
+class DataFrameTool():
+ """
+ This is a utility class used to run a model with pandas.DataFrame input
+ """
+ def __init__(self, model_path, sess_options=None):
+ """
+ :param model_path: path to the model to be loaded
+ :param sess_options: see onnxruntime.SessionsOptions
+ """
+ self._model_path = model_path
+ self._sess_options = sess_options
+ self._sess = onnxrt.InferenceSession(self._model_path, self._sess_options)
+
+ def _reshape_input(self, input_array, expected_shape):
+ """
+ :param - input_array numpy array. This one is obtained from DataFrame and expected to have
+ : a rank if 1.
+ :expected_shape - shape fetched from the model which may include dynamic elements.
+ : expected_shape may at most have one -1, None or zero which will be computed from
+ : the size of the input_array. We replace None and zeros to -1 and let np.ndarray.reshape deal with it.
+ """
+ # expected_shape rank is one, we will let onnxruntime to deal with it
+ if len(expected_shape) == 1:
+ return input_array
+
+ inferred_shape = [dim if dim else -1 for dim in expected_shape]
+ return input_array.reshape(inferred_shape)
+
+ def _validate_type(self, input_meta, col_type):
+ """
+ : input_meta - meta info obtained from the model for the given input
+ : col_type - dtype of the column
+ : throws if conditions are not met
+
+ float16 and bool will always require exact match
+ We attempt to convert any type to a string if it is required.
+ With strings we always want to put this into a flat array, cast to np.object and then reshape as object
+ Any other type to qualify for casting must match either integer or floating point types
+ Python datetime which is denoted in Pandas as datetime64[ns] are cast to int64
+ """
+ expected_type = types_dict[input_meta.type]
+ if input_meta.type == 'tensor(string)':
+ return
+ elif expected_type == col_type:
+ return
+ elif expected_type == np.int64 and str(col_type) == 'datetime64[ns]':
+ return
+ elif expected_type in ort_float_set and str(col_type) in pd_float_set:
+ return
+ elif expected_type in ort_int_set and str(col_type) in pd_int_set:
+ return
+
+ raise TypeError("Input {} requires type {} unable to cast column type {} ".format(
+ input_meta.name, expected_type, col_type))
+
+
+ def _process_input_list(self, df, input_metas, require):
+ """
+ Return a dictionary of input_name : a typed and shaped np.array of values for a given input_meta
+ The function does the heavy lifting for _get_input_feeds()
+
+ :param df: See :class:`pandas.DataFrame`.
+ :param input_metas: a list of name/type pairs
+ :require is a boolean. If True this helper throws on a missing input.
+
+ """
+ feeds = {}
+ # Process mandadory inputs. Raise an error if anything is not present
+ for input_meta in input_metas:
+ # We fully expect all the types are in the above dictionary
+ assert input_meta.type in types_dict, "Update types_dict for the new type"
+ if input_meta.name in df.columns:
+ self._validate_type(input_meta, df[input_meta.name].dtype)
+ if (df[input_meta.name].dtype) == 'datetime64[ns]':
+ input_array = np.array([dt.timestamp() for dt in df[input_meta.name]]).astype(np.int64)
+ else:
+ # With strings we must cast first to np.object then then reshape
+ # so we do it for everything
+ input_array = np.array(df[input_meta.name]).astype(types_dict[input_meta.type])
+
+ feeds[input_meta.name] = self._reshape_input(input_array, input_meta.shape)
+
+ elif require:
+ raise RuntimeError("This model requires input {} of type {} but it is not found in the DataFrame".format(
+ input_meta.name, types_dict[input_meta.type]))
+ return feeds
+
+
+ def _get_input_feeds(self, df, sess):
+ """
+ Return a dictionary of input_name : a typed and shaped np.array of values
+ This function accepts Pandas DataFrame as the first argument and onnxruntime
+ session with a loaded model. The function interrogates the model for the inputs
+ and matches the model input names to the DataFrame instance column names.
+ It requires exact matches for bool and float16 types. It attempts to convert to
+ string any input type if string is required.
+ It attempts to convert floating types to each other and does the same for all of the
+ integer types without requiring an exact match.
+
+ :param df: See :class:`pandas.DataFrame`. The function only considers the first row (0) of each column
+ and feeds the data to the appropriate model inputs.
+
+ :param sess: See :class:`onnxruntime.InferenceSession`.
+
+ ::
+ For example: pd.DataFrame([[0], [4],[20]],index=[0], columns=['A', 'B', 'C'])
+
+ """
+ if df.empty:
+ raise RuntimeError('input DataFrame is empty')
+
+ # Process mandadory inputs. Raise an error if anything is not present
+ feeds = self._process_input_list(df, sess.get_inputs(), True)
+ # Process optional overridable initializers. If present the initialzier value
+ # is overriden by the input. If not, the initialzier value embedded in the model takes effect.
+ initializers = self._process_input_list(df, sess.get_overridable_initializers(), False)
+
+ feeds.update(initializers)
+
+ return feeds
+
+ def execute(self, df, output_names=None, output_types=None, run_options=None):
+ "Return a list of output values restricted to output names if not empty"
+ """
+ Compute the predictions.
+
+ :param df: See :class:`pandas.DataFrame`.
+ :output_name - list of column output names and their order to output
+ :output_types { output_name : dtype } optional dictionary that asks to cast output
+ to the colum type
+
+ :param run_options: See :class:`onnxruntime.RunOptions`.
+ ::
+ sess.run([output_name], {input_name: x})
+ Pandas DataFrame
+ """
+ input_feed = self._get_input_feeds(df, self._sess);
+ if not output_names:
+ output_names = [output.name for output in self._sess._outputs_meta]
+
+ results = self._sess.run(output_names, input_feed, run_options)
+
+ df = pd.DataFrame()
+ for i in range(len(results)):
+ r = results[i].flatten()
+ if output_types and output_names[i] in output_types:
+ dtype = output_types[output_names[i]]
+ if dtype == np.dtype('datetime64'):
+ r = r.astype(np.int64)
+ r = [datetime.utcfromtimestamp(ts) for ts in r]
+ else:
+ r = r.astype(dtype)
+
+ df[output_names[i]] = r
+
+ return df
diff --git a/src/python/tests_extended/test_export_to_onnx.py b/src/python/tests_extended/test_export_to_onnx.py
index bbe3e95a..bbd02b2a 100644
--- a/src/python/tests_extended/test_export_to_onnx.py
+++ b/src/python/tests_extended/test_export_to_onnx.py
@@ -43,6 +43,7 @@
from nimbusml.timeseries import (IidSpikeDetector, IidChangePointDetector,
SsaSpikeDetector, SsaChangePointDetector,
SsaForecaster)
+from data_frame_tool import DataFrameTool as DFT
SHOW_ONNX_JSON = False
@@ -559,6 +560,8 @@ def test_export_to_onnx(estimator, class_name):
try:
onnxrunner = OnnxRunner(model_file=onnx_path)
result_onnx = onnxrunner.fit_transform(dataset)
+ df_tool = DFT(onnx_path)
+ result_onnx1 = df_tool.execute(dataset, [])
if SHOW_TRANSFORMED_RESULTS:
print_results(result_expected, result_onnx)
@@ -590,8 +593,8 @@ def test_export_to_onnx(estimator, class_name):
for entry_point in entry_points:
class_name = entry_point['NewName']
-# if not class_name in ['Handler']:
-# continue
+ if not class_name in ['Handler']:
+ continue
print('\n===========> %s' % class_name)
diff --git a/src/python/tests_extended/vinod.py b/src/python/tests_extended/vinod.py
new file mode 100644
index 00000000..ef8932d1
--- /dev/null
+++ b/src/python/tests_extended/vinod.py
@@ -0,0 +1,63 @@
+import os
+import tempfile
+import nimbusml.linear_model as nml_linear
+from nimbusml.feature_extraction.categorical import OneHotVectorizer
+from nimbusml.preprocessing.missing_values import Handler
+from nimbusml import FileDataStream
+from nimbusml.preprocessing import DatasetTransformer
+from nimbusml import Pipeline
+from nimbusml.preprocessing import OnnxRunner
+
+def get_tmp_file(suffix=None):
+ fd, file_name = tempfile.mkstemp(suffix=suffix)
+ fl = os.fdopen(fd, 'w')
+ fl.close()
+ return file_name
+
+X_train_dprep = FileDataStream.read_csv("E:/sources/vinod/NYCTaxiTipPrediction_train.csv")
+X_test_dprep = FileDataStream.read_csv("E:/sources/vinod/NYCTaxiTipPrediction_valid.csv")
+
+try:
+ pipe_featurization = Pipeline([OneHotVectorizer(columns={'vendor_id': 'vendor_id', 'payment_type': 'payment_type', 'passenger_count': 'passenger_count','rate_code': 'rate_code'})
+ ,Handler(columns={'trip_distance': 'trip_distance', 'trip_time_in_secs': 'trip_time_in_secs'})
+ ])
+ pipe_featurization.fit(X_train_dprep)
+
+ pipe_training = Pipeline([DatasetTransformer(pipe_featurization.model),
+ nml_linear.FastLinearRegressor(feature=['vendor_id', 'payment_type', 'passenger_count', 'rate_code', 'trip_distance', 'trip_time_in_secs'],label='fare_amount')
+ ])
+ pipe_training.fit(X_train_dprep)
+
+ metrics, scores = pipe_training.test(X_test_dprep)
+ print(metrics)
+ print('training done')
+
+ # Export the pipeline to ONNX
+ onnx_path = get_tmp_file('.onnx')
+ pipe_training.export_to_onnx(onnx_path, 'com.microsoft.ml', onnx_version='Stable')
+ print('export done')
+
+ # Perform the transform using the standard ML.Net backend
+ result_standard = pipe_training.transform(X_test_dprep)
+ print(result_standard)
+ print('done transform using standard backend')
+ # c1 c2
+ # 0 0.025025 0.000998
+ # 1 0.305305 0.000998
+
+ # Perform the transform using the ONNX backend.
+ # Note, the extra columns and column name differences
+ # is a known issue with the ML.Net backend.
+ onnxrunner = OnnxRunner(model_file=onnx_path)
+ result_onnx = onnxrunner.fit_transform(X_test_dprep)
+ print('done transform using onnx backend')
+ print(result_onnx)
+ # c1 c2 c12.0 c22.0
+ # 0 2.5 1.0 0.025025 0.000998
+ # 1 30.5 1.0 0.305305 0.000998
+
+except Exception as e:
+ print('tragedy')
+ print(e)
+
+print ("done")
\ No newline at end of file