From d5d86803ed65ef2763e5fdc0fd5c3dffdd8a5a9e Mon Sep 17 00:00:00 2001 From: "REDMOND\\nakazmi" Date: Wed, 14 Aug 2019 17:29:08 -0700 Subject: [PATCH 1/3] Handle new ML.NET model format for predictions --- src/python/nimbusml/examples/_201.py | 78 ++++++++++++++++++++++++++++ src/python/nimbusml/pipeline.py | 56 +++++++++++++++----- 2 files changed, 120 insertions(+), 14 deletions(-) create mode 100644 src/python/nimbusml/examples/_201.py diff --git a/src/python/nimbusml/examples/_201.py b/src/python/nimbusml/examples/_201.py new file mode 100644 index 00000000..9f558c6a --- /dev/null +++ b/src/python/nimbusml/examples/_201.py @@ -0,0 +1,78 @@ +from nimbusml import Pipeline, FileDataStream +from nimbusml.datasets import get_dataset +from nimbusml.feature_extraction.text import NGramFeaturizer +from nimbusml.feature_extraction.text.extractor import Ngram +from nimbusml.linear_model import AveragedPerceptronBinaryClassifier +from nimbusml.preprocessing.schema import ColumnConcatenator +import numpy as np +import pandas as pd + +mlnet_model_file = "D:\\Data\\UCIAdult\\SampleBinaryClassification\\SampleBinaryClassification.Model\\MLNETModel.zip" +nimbus_fds_model_file = "D:\\Data\\UCIAdult\\SampleBinaryClassification\\SampleBinaryClassification.Model\\NimbusFDSModel.zip" +nimbus_df_model_file = "D:\\Data\\UCIAdult\\SampleBinaryClassification\\SampleBinaryClassification.Model\\NimbusDFModel.zip" +nimbus_scored_data_path = "D:\\Data\\UCIAdult\\scored_nimbus.tsv" + +train_file = get_dataset('wiki_detox_train').as_filepath() +test_file = "D:\\Data\\UCIAdult\\test" + +#train_df = pd.read_csv(train_file) +#train_df.columns = [i.replace(' ', '') for i in train_df.columns] +train_fds = FileDataStream.read_csv(train_file, sep='\t') + +test_df = pd.read_csv(test_file) +test_df.columns = [i.replace(' ', '') for i in test_df.columns] +test_fds = FileDataStream.read_csv(test_file, sep='\t', numeric_dtype=np.float32) + +pipe_fds = Pipeline([NGramFeaturizer(word_feature_extractor=Ngram(), + columns={'features': ['SentimentText']}), + AveragedPerceptronBinaryClassifier(feature=['features'], label='Sentiment')]) + +pipe_fds.fit(train_fds) +##pipe_fds.save_model(nimbus_fds_model_file) +scores = pipe_fds.predict(train_fds) +print(scores.head()) +#with pd.option_context('display.max_rows', None): +# print(scores) + +#print(sum(1*(scores['PredictedLabel'] == 1))) +#print(sum(1*(scores['PredictedLabel'] == 0))) + + +#scores = pipe_fds.predict(test_df) + +#pipe_fds_loaded = Pipeline() +#pipe_fds_loaded.load_model(nimbus_fds_model_file) +##scores = pipe_fds_loaded.predict(test_fds) +#scores = pipe_fds_loaded.predict(test_df) + +#pipe_df = pipe_fds.clone() +#pipe_df.fit(train_df) +##pipe_df.save_model(nimbus_df_model_file) +#scores = pipe_df.predict(test_fds) +#scores = pipe_df.predict(test_df) + +#pipe_df_loaded = Pipeline() +#pipe_df_loaded.load_model(nimbus_df_model_file) +#scores = pipe_df_loaded.predict(test_fds) +#scores = pipe_df_loaded.predict(test_df) + + + + +#p = Pipeline() +#p.load_model(nimbus_fds_model_file) + +#scores = p.predict(test_fds, verbose=100) +##scores['PredictedLabel'] = scores['PredictedLabel']*1 +##scores.to_csv(nimbus_scored_data_path, sep="\t", index=False) +###print("scores from test fds") +#print(scores.head()) +##with pd.option_context('display.max_rows', None, 'display.max_columns', None): +## print(scores) + +#probs = p.predict_proba(test_fds) +#print(probs.head()) + +#scores = p.predict(test_df, verbose=100) +#print("scores from test df") +#print(scores.head()) diff --git a/src/python/nimbusml/pipeline.py b/src/python/nimbusml/pipeline.py index 9d13b5e1..25c2f228 100644 --- a/src/python/nimbusml/pipeline.py +++ b/src/python/nimbusml/pipeline.py @@ -19,6 +19,7 @@ from scipy.sparse import csr_matrix from sklearn.utils.validation import check_X_y, check_array from sklearn.utils.multiclass import unique_labels +from zipfile import ZipFile from .internal.core.base_pipeline_item import BasePipelineItem from .internal.entrypoints.data_customtextloader import \ @@ -40,6 +41,8 @@ from .internal.entrypoints.models_summarizer import models_summarizer from .internal.entrypoints.transforms_datasetscorer import \ transforms_datasetscorer +from .internal.entrypoints.transforms_datasettransformscorer import \ + transforms_datasettransformscorer from .internal.entrypoints.transforms_featurecombiner import \ transforms_featurecombiner from .internal.entrypoints.transforms_featurecontributioncalculationtransformer import \ @@ -1816,22 +1819,43 @@ def _predict(self, X, y=None, isinstance(X, DataFrame) and isinstance(y, (str, tuple))): y = y_temp + model_zip = ZipFile(self.model) + is_transformer_chain = any('TransformerChain' in item + for item in model_zip.namelist()) + all_nodes = [] - inputs = dict([('data', ''), ('predictor_model', self.model)]) - if isinstance(X, FileDataStream): - importtext_node = data_customtextloader( - input_file="$file", + if is_transformer_chain: + inputs = dict([('data', ''), ('transform_model', self.model)]) + if isinstance(X, FileDataStream): + importtext_node = data_customtextloader( + input_file="$file", + data="$data", + custom_schema=schema.to_string( + add_sep=True)) + all_nodes = [importtext_node] + inputs = dict([('file', ''), ('transform_model', self.model)]) + + score_node = transforms_datasettransformscorer( data="$data", - custom_schema=schema.to_string( - add_sep=True)) - all_nodes = [importtext_node] - inputs = dict([('file', ''), ('predictor_model', self.model)]) - - score_node = transforms_datasetscorer( - data="$data", - predictor_model="$predictor_model", - scored_data="$scoredVectorData") - all_nodes.extend([score_node]) + transform_model="$transform_model", + scored_data="$scoredVectorData") + all_nodes.extend([score_node]) + else: + inputs = dict([('data', ''), ('predictor_model', self.model)]) + if isinstance(X, FileDataStream): + importtext_node = data_customtextloader( + input_file="$file", + data="$data", + custom_schema=schema.to_string( + add_sep=True)) + all_nodes = [importtext_node] + inputs = dict([('file', ''), ('predictor_model', self.model)]) + + score_node = transforms_datasetscorer( + data="$data", + predictor_model="$predictor_model", + scored_data="$scoredVectorData") + all_nodes.extend([score_node]) if (evaltype in ['binary', 'multiclass']) or \ (hasattr(self, 'steps') @@ -1889,6 +1913,10 @@ def _predict(self, X, y=None, self._run_time = time.time() - start_time raise e + if is_transformer_chain: + out_data['PredictedLabel'] = out_data['PredictedLabel']*1 + + if y is not None: # We need to fix the schema for ranking metrics if evaltype == 'ranking': From d4a5ac67cc50167da69deafe3450c44ca5c11bce Mon Sep 17 00:00:00 2001 From: "REDMOND\\nakazmi" Date: Wed, 14 Aug 2019 17:33:28 -0700 Subject: [PATCH 2/3] fix --- src/python/nimbusml.pyproj | 4 +- src/python/nimbusml/examples/_201.py | 78 ---------------------------- 2 files changed, 1 insertion(+), 81 deletions(-) delete mode 100644 src/python/nimbusml/examples/_201.py diff --git a/src/python/nimbusml.pyproj b/src/python/nimbusml.pyproj index e2ae20cd..1b3e0ad6 100644 --- a/src/python/nimbusml.pyproj +++ b/src/python/nimbusml.pyproj @@ -67,9 +67,7 @@ - - Code - + diff --git a/src/python/nimbusml/examples/_201.py b/src/python/nimbusml/examples/_201.py deleted file mode 100644 index 9f558c6a..00000000 --- a/src/python/nimbusml/examples/_201.py +++ /dev/null @@ -1,78 +0,0 @@ -from nimbusml import Pipeline, FileDataStream -from nimbusml.datasets import get_dataset -from nimbusml.feature_extraction.text import NGramFeaturizer -from nimbusml.feature_extraction.text.extractor import Ngram -from nimbusml.linear_model import AveragedPerceptronBinaryClassifier -from nimbusml.preprocessing.schema import ColumnConcatenator -import numpy as np -import pandas as pd - -mlnet_model_file = "D:\\Data\\UCIAdult\\SampleBinaryClassification\\SampleBinaryClassification.Model\\MLNETModel.zip" -nimbus_fds_model_file = "D:\\Data\\UCIAdult\\SampleBinaryClassification\\SampleBinaryClassification.Model\\NimbusFDSModel.zip" -nimbus_df_model_file = "D:\\Data\\UCIAdult\\SampleBinaryClassification\\SampleBinaryClassification.Model\\NimbusDFModel.zip" -nimbus_scored_data_path = "D:\\Data\\UCIAdult\\scored_nimbus.tsv" - -train_file = get_dataset('wiki_detox_train').as_filepath() -test_file = "D:\\Data\\UCIAdult\\test" - -#train_df = pd.read_csv(train_file) -#train_df.columns = [i.replace(' ', '') for i in train_df.columns] -train_fds = FileDataStream.read_csv(train_file, sep='\t') - -test_df = pd.read_csv(test_file) -test_df.columns = [i.replace(' ', '') for i in test_df.columns] -test_fds = FileDataStream.read_csv(test_file, sep='\t', numeric_dtype=np.float32) - -pipe_fds = Pipeline([NGramFeaturizer(word_feature_extractor=Ngram(), - columns={'features': ['SentimentText']}), - AveragedPerceptronBinaryClassifier(feature=['features'], label='Sentiment')]) - -pipe_fds.fit(train_fds) -##pipe_fds.save_model(nimbus_fds_model_file) -scores = pipe_fds.predict(train_fds) -print(scores.head()) -#with pd.option_context('display.max_rows', None): -# print(scores) - -#print(sum(1*(scores['PredictedLabel'] == 1))) -#print(sum(1*(scores['PredictedLabel'] == 0))) - - -#scores = pipe_fds.predict(test_df) - -#pipe_fds_loaded = Pipeline() -#pipe_fds_loaded.load_model(nimbus_fds_model_file) -##scores = pipe_fds_loaded.predict(test_fds) -#scores = pipe_fds_loaded.predict(test_df) - -#pipe_df = pipe_fds.clone() -#pipe_df.fit(train_df) -##pipe_df.save_model(nimbus_df_model_file) -#scores = pipe_df.predict(test_fds) -#scores = pipe_df.predict(test_df) - -#pipe_df_loaded = Pipeline() -#pipe_df_loaded.load_model(nimbus_df_model_file) -#scores = pipe_df_loaded.predict(test_fds) -#scores = pipe_df_loaded.predict(test_df) - - - - -#p = Pipeline() -#p.load_model(nimbus_fds_model_file) - -#scores = p.predict(test_fds, verbose=100) -##scores['PredictedLabel'] = scores['PredictedLabel']*1 -##scores.to_csv(nimbus_scored_data_path, sep="\t", index=False) -###print("scores from test fds") -#print(scores.head()) -##with pd.option_context('display.max_rows', None, 'display.max_columns', None): -## print(scores) - -#probs = p.predict_proba(test_fds) -#print(probs.head()) - -#scores = p.predict(test_df, verbose=100) -#print("scores from test df") -#print(scores.head()) From 42b1680ac44a2e783f5eeb9114fad38a9b3069ae Mon Sep 17 00:00:00 2001 From: Gani Nazirov Date: Wed, 21 Aug 2019 16:31:25 -0700 Subject: [PATCH 3/3] use with{} statement with ZipFile --- src/python/nimbusml/pipeline.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/python/nimbusml/pipeline.py b/src/python/nimbusml/pipeline.py index 00154f0f..c1143160 100644 --- a/src/python/nimbusml/pipeline.py +++ b/src/python/nimbusml/pipeline.py @@ -1819,8 +1819,9 @@ def _predict(self, X, y=None, isinstance(X, DataFrame) and isinstance(y, (str, tuple))): y = y_temp - model_zip = ZipFile(self.model) - is_transformer_chain = any('TransformerChain' in item + is_transformer_chain = False + with ZipFile(self.model) as model_zip: + is_transformer_chain = any('TransformerChain' in item for item in model_zip.namelist()) all_nodes = []