diff --git a/src/DotNetBridge/Entrypoints.cs b/src/DotNetBridge/Entrypoints.cs
index 039247fe..9be84e67 100644
--- a/src/DotNetBridge/Entrypoints.cs
+++ b/src/DotNetBridge/Entrypoints.cs
@@ -65,7 +65,7 @@ public sealed class ModelSchemaOutput
             public IDataView Schema;
         }
 
-        [TlcModule.EntryPoint(Name = "Models.Schema", Desc = "Retrieve input and output model schemas")]
+        [TlcModule.EntryPoint(Name = "Models.Schema", Desc = "Retrieve output model schema")]
         public static ModelSchemaOutput GetSchema(IHostEnvironment env, TransformModelInput input)
         {
             Contracts.CheckValue(env, nameof(env));
@@ -87,5 +87,123 @@ public static CommonOutputs.TransformOutput CreateVariableColumn(IHostEnvironmen
             var xf = VariableColumnTransform.Create(env, inputOptions, inputOptions.Data);
             return new CommonOutputs.TransformOutput { Model = new TransformModelImpl(env, xf, inputOptions.Data), OutputData = xf };
         }
+
+        /// <summary>
+        /// Arguments of the <c>Transforms.DatasetScorerEx</c> entry point.
+        /// </summary>
+        public sealed class ScoringTransformInput
+        {
+            [Argument(ArgumentType.Required, HelpText = "The dataset to be scored", SortOrder = 1)]
+            public IDataView Data;
+
+            [Argument(ArgumentType.Required, HelpText = "The predictor model to apply to data", SortOrder = 2)]
+            public PredictorModel PredictorModel;
+
+            [Argument(ArgumentType.AtMostOnce, HelpText = "Suffix to append to the score columns", SortOrder = 3)]
+            public string Suffix;
+        }
+
+        /// <summary>
+        /// Results of the <c>Transforms.DatasetScorerEx</c> entry point.
+        /// </summary>
+        public sealed class ScoringTransformOutput
+        {
+            [TlcModule.Output(Desc = "The scored dataset", SortOrder = 1)]
+            public IDataView ScoredData;
+
+            [TlcModule.Output(Desc = "The scoring transform", SortOrder = 2)]
+            public TransformModel ScoringTransform;
+        }
+
+        /// <summary>
+        /// Checks whether two schemas have the same number of columns and an equal
+        /// column type at every index. Column names are not compared.
+        /// </summary>
+        /// <param name="schema1">First schema; may be null.</param>
+        /// <param name="schema2">Second schema; may be null.</param>
+        /// <returns>
+        /// True when both schemas are null, or when both are non-null, have equal
+        /// counts and equal types position by position; false otherwise.
+        /// </returns>
+        private static bool AreSchemasCompatible(DataViewSchema schema1, DataViewSchema schema2)
+        {
+            if (schema1 == null || schema2 == null)
+                return schema1 == null && schema2 == null;
+            if (schema1.Count != schema2.Count)
+                return false;
+
+            for (int i = 0; i < schema1.Count; i++)
+            {
+                // Compare with Equals rather than !=: the operator would only test
+                // reference identity, so two separately constructed but equal types
+                // (e.g. vector types) would be reported as incompatible.
+                if (!schema1[i].Type.Equals(schema2[i].Type))
+                    return false;
+            }
+
+            return true;
+        }
+
+        /// <summary>
+        /// Scores a dataset with a predictor model and returns both the scored data
+        /// and the scoring pipeline wrapped as a transform model.
+        /// </summary>
+        /// <remarks>
+        /// When the predictor model cannot prepare the input and the input has exactly
+        /// one column (the csr_matrix case), that column is converted to the item type
+        /// of the training feature column and scored with the predictor alone. Any other
+        /// preparation failure is propagated unchanged.
+        /// </remarks>
+        [TlcModule.EntryPoint(Name = "Transforms.DatasetScorerEx", Desc = "Score a dataset with a predictor model")]
+        public static ScoringTransformOutput Score(IHostEnvironment env, ScoringTransformInput input)
+        {
+            Contracts.CheckValue(env, nameof(env));
+            var host = env.Register("ScoreModel");
+            host.CheckValue(input, nameof(input));
+            EntryPointUtils.CheckInputArgs(host, input);
+
+            RoleMappedData data;
+            IPredictor predictor;
+            var inputData = input.Data;
+            try
+            {
+                input.PredictorModel.PrepareData(host, inputData, out data, out predictor);
+            }
+            catch (Exception) when (inputData.Schema.Count == 1)
+            {
+                // This can happen in the csr_matrix case: fall back to the trainer model only.
+                // Only single-column input is eligible for the fallback; for anything else
+                // the filter lets the original exception propagate.
+                var inputColumnName = inputData.Schema[0].Name;
+                var trainingSchema = input.PredictorModel.GetTrainingSchema(host);
+                host.Check(trainingSchema.Feature.HasValue, "The predictor model has no feature column in its training schema");
+                // Get the feature vector item type.
+                var trainingFeatureColumn = trainingSchema.Feature.Value;
+                var requiredType = trainingFeatureColumn.Type.GetItemType().RawType;
+                var featuresColumnName = trainingFeatureColumn.Name;
+                predictor = input.PredictorModel.Predictor;
+                var xf = new TypeConvertingTransformer(host,
+                    new TypeConvertingEstimator.ColumnOptions(featuresColumnName, requiredType, inputColumnName)).Transform(inputData);
+                data = new RoleMappedData(xf, null, featuresColumnName);
+            }
+
+            IDataView scoredPipe;
+            using (var ch = host.Start("Creating scoring pipeline"))
+            {
+                ch.Trace("Creating pipeline");
+                var bindable = ScoreUtils.GetSchemaBindableMapper(host, predictor);
+                ch.AssertValue(bindable);
+
+                var mapper = bindable.Bind(host, data.Schema);
+                var scorer = ScoreUtils.GetScorerComponent(host, mapper, input.Suffix);
+                scoredPipe = scorer.CreateComponent(host, data.Data, mapper, input.PredictorModel.GetTrainingSchema(host));
+            }
+
+            return new ScoringTransformOutput
+            {
+                ScoredData = scoredPipe,
+                ScoringTransform = new TransformModelImpl(host, scoredPipe, inputData)
+            };
+        }
     }
 }
diff --git a/src/python/nimbusml.pyproj
b/src/python/nimbusml.pyproj
index ced35764..a90735af 100644
--- a/src/python/nimbusml.pyproj
+++ b/src/python/nimbusml.pyproj
@@ -395,6 +395,7 @@
+
@@ -685,6 +686,7 @@
+
diff --git a/src/python/nimbusml/__init__.py b/src/python/nimbusml/__init__.py
index 4e18a65b..0b508fcf 100644
--- a/src/python/nimbusml/__init__.py
+++ b/src/python/nimbusml/__init__.py
@@ -2,7 +2,7 @@
 Microsoft Machine Learning for Python
 """
 
-__version__ = '1.4.2'
+__version__ = '1.5.0'
 
 # CoreCLR version of MicrosoftML is built on Windows.
 # But file permissions are not preserved when it's copied to Linux.
diff --git a/src/python/nimbusml/examples/Schema.py b/src/python/nimbusml/examples/Schema.py
index ddec3c0d..c0b8d493 100644
--- a/src/python/nimbusml/examples/Schema.py
+++ b/src/python/nimbusml/examples/Schema.py
@@ -27,7 +27,7 @@
 ])
 
 pipe.fit(data)
-schema = pipe.get_schema()
+schema = pipe.get_output_columns()
 
 print(schema[0:5])
 # ['Sentiment', 'SentimentText', 'features.Char.|=|=', 'features.Char.=|=|r', 'features.Char.=|r|u']
diff --git a/src/python/nimbusml/internal/entrypoints/transforms_datasetscorerex.py b/src/python/nimbusml/internal/entrypoints/transforms_datasetscorerex.py
new file mode 100644
index 00000000..7a5d8c71
--- /dev/null
+++ b/src/python/nimbusml/internal/entrypoints/transforms_datasetscorerex.py
@@ -0,0 +1,65 @@
+"""
+Transforms.DatasetScorerEx
+"""
+
+
+from ..utils.entrypoints import EntryPoint
+from ..utils.utils import try_set, unlist
+
+
+def transforms_datasetscorerex(
+        data,
+        predictor_model,
+        scored_data=None,
+        scoring_transform=None,
+        suffix=None,
+        **params):
+    """
+    **Description**
+        Score a dataset with a predictor model
+
+    Builds the ``Transforms.DatasetScorerEx`` entrypoint node. Arguments
+    that are ``None`` are left out of the node.
+
+    :param data: The dataset to be scored (inputs).
+    :param predictor_model: The predictor model to apply to data
+        (inputs).
+    :param suffix: Suffix to append to the score columns (inputs).
+    :param scored_data: The scored dataset (outputs).
+    :param scoring_transform: The scoring transform (outputs).
+    :param params: Extra keyword arguments; accepted but not used here.
+    :return: the ``EntryPoint`` describing this node.
+    """
+
+    entrypoint_name = 'Transforms.DatasetScorerEx'
+    inputs = {}
+    outputs = {}
+
+    # (destination, entrypoint key, value, none_acceptable flag for try_set),
+    # listed in the order the values are validated.
+    bindings = (
+        (inputs, 'Data', data, False),
+        (inputs, 'PredictorModel', predictor_model, False),
+        (inputs, 'Suffix', suffix, True),
+        (outputs, 'ScoredData', scored_data, False),
+        (outputs, 'ScoringTransform', scoring_transform, False),
+    )
+    for destination, key, value, none_ok in bindings:
+        if value is None:
+            continue
+        destination[key] = try_set(
+            obj=value,
+            none_acceptable=none_ok,
+            is_of_type=str)
+
+    def _variables(mapping):
+        # Keep only the string values that start with "$".
+        return {
+            item for item in unlist(mapping.values())
+            if isinstance(item, str) and item.startswith("$")}
+
+    entrypoint = EntryPoint(
+        name=entrypoint_name, inputs=inputs, outputs=outputs,
+        input_variables=_variables(inputs),
+        output_variables=_variables(outputs))
+    return entrypoint
diff --git a/src/python/nimbusml/pipeline.py b/src/python/nimbusml/pipeline.py
index efa81735..3e0dce27 100644
--- a/src/python/nimbusml/pipeline.py
+++ b/src/python/nimbusml/pipeline.py
@@ -40,8 +40,8 @@
     models_regressionevaluator
 from .internal.entrypoints.models_summarizer import models_summarizer
 from .internal.entrypoints.models_schema import models_schema
-from .internal.entrypoints.transforms_datasetscorer import \
-    transforms_datasetscorer
+from .internal.entrypoints.transforms_datasetscorerex import \
+    transforms_datasetscorerex
 from .internal.entrypoints.transforms_datasettransformscorer import \
     transforms_datasettransformscorer
 from .internal.entrypoints.transforms_featurecombiner import \
@@ -1772,7 +1772,7 @@ def get_feature_contributions(self, X, top=10, bottom=10, verbose=0,
             all_nodes = [importtext_node]
             inputs = dict([('file', ''), ('predictor_model',
self.model)]) - score_node = transforms_datasetscorer( + score_node = transforms_datasetscorerex( data="$data", predictor_model="$predictor_model", scored_data="$scoredvectordata") @@ -1815,7 +1815,7 @@ def get_feature_contributions(self, X, top=10, bottom=10, verbose=0, return out_data - def get_schema(self, verbose=0, **params): + def get_output_columns(self, verbose=0, **params): """ Returns the output list of columns for the fitted model. :return: list . @@ -2102,7 +2102,7 @@ def _predict(self, X, y=None, all_nodes = [importtext_node] inputs = dict([('file', ''), ('predictor_model', self.model)]) - score_node = transforms_datasetscorer( + score_node = transforms_datasetscorerex( data="$data", predictor_model="$predictor_model", scored_data="$scoredVectorData") diff --git a/src/python/nimbusml/tests/pipeline/test_csr_input.py b/src/python/nimbusml/tests/pipeline/test_csr_input.py new file mode 100644 index 00000000..176a7651 --- /dev/null +++ b/src/python/nimbusml/tests/pipeline/test_csr_input.py @@ -0,0 +1,65 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# --------------------------------------------------------------------------------------------
+import os
+import unittest
+
+import numpy as np
+import pandas as pd
+from nimbusml import Pipeline, FileDataStream
+from nimbusml.datasets import get_dataset
+from nimbusml.feature_extraction.categorical import OneHotVectorizer
+from nimbusml.linear_model import LogisticRegressionBinaryClassifier
+from nimbusml.preprocessing import DatasetTransformer
+from nimbusml.preprocessing.schema import PrefixColumnConcatenator
+from nimbusml.preprocessing.schema import ColumnDropper
+from numpy.testing import assert_equal
+
+
+class TestCsrInput(unittest.TestCase):
+    """Checks that a predictor-only pipeline accepts csr_matrix input."""
+
+    def test_predict_proba_on_csr(self):
+        """Score csr_matrix features with a model that holds only the learner."""
+        data = FileDataStream.read_csv(get_dataset('infert').as_filepath())
+        # Raw column names, in file order.
+        feature_cols = list(data.head(1).columns.values)
+
+        # Fit the featurizer; the relative column order stays that of the raw data.
+        featurizer = Pipeline([OneHotVectorizer(columns={'education': 'education'})])
+        featurizer.fit(data)
+
+        # A csr_matrix carries no column names, so the columns that are not
+        # features are dropped before the sparse output is produced.
+        csr_featurizer = Pipeline([
+            DatasetTransformer(featurizer.model),
+            ColumnDropper() << ['case', 'row_num']])
+        sparse_features = csr_featurizer.fit_transform(data, as_csr=True)
+
+        # The learner's 'feature' list has to match the csr_matrix columns in
+        # order and number, i.e. the raw columns without 'row_num' and 'case':
+        # ['education', 'age', 'parity', 'induced', 'spontaneous', 'stratum', 'pooled.stratum']
+        for dropped in ('row_num', 'case'):
+            feature_cols.remove(dropped)
+        trainer = Pipeline([
+            DatasetTransformer(featurizer.model),
+            LogisticRegressionBinaryClassifier(feature=feature_cols, label='case')])
+        trainer.fit(data, output_predictor_model=True)
+
+        # Load only the learner part of the trained model.
+        predictor_only = Pipeline()
+        predictor_only.load_model(trainer.predictor_model)
+
+        # Predict on the sparse features with the learner alone.
+        probabilities = predictor_only.predict_proba(sparse_features)
+        assert_equal(len(probabilities), 248)
+        assert_equal(len(probabilities[0]), 2)
+
+        # Feature contributions on the same sparse input.
+        contributions = predictor_only.get_feature_contributions(sparse_features)
+        assert_equal(contributions.shape, (248, 30))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/src/python/nimbusml/tests/pipeline/test_pipeline_get_schema.py b/src/python/nimbusml/tests/pipeline/test_pipeline_get_schema.py
index 5eccf4d7..63bb5310 100644
--- a/src/python/nimbusml/tests/pipeline/test_pipeline_get_schema.py
+++ b/src/python/nimbusml/tests/pipeline/test_pipeline_get_schema.py
@@ -28,7 +28,7 @@ def test_get_schema_returns_correct_value_for_single_valued_columns(self):
         pipeline.fit(df)
         df = pipeline.transform(df)
 
-        schema = pipeline.get_schema()
+        schema = pipeline.get_output_columns()
 
         self.assertTrue('c1' in schema)
         self.assertTrue('c2' in schema)
@@ -39,7 +39,7 @@ def test_get_schema_returns_correct_value_for_vector_valued_columns(self):
         pipeline = Pipeline([OneHotVectorizer() << 'c0'])
pipeline.fit(train_df) - schema = pipeline.get_schema() + schema = pipeline.get_output_columns() self.assertTrue('c0.a' in schema) self.assertTrue('c0.b' in schema) @@ -55,7 +55,7 @@ def test_get_schema_does_not_work_when_predictor_is_part_of_model(self): pipeline.fit(df) try: - schema = pipeline.get_schema() + schema = pipeline.get_output_columns() except Exception as e: pass else: diff --git a/src/python/setup.py b/src/python/setup.py index 7b983db8..fc350275 100644 --- a/src/python/setup.py +++ b/src/python/setup.py @@ -45,7 +45,7 @@ # Versions should comply with PEP440. For a discussion on # single-sourcing the version across setup.py and the project code, see # https://packaging.python.org/en/latest/single_source_version.html - version='1.4.2', + version='1.5.0', description='NimbusML', long_description=long_description, diff --git a/version.txt b/version.txt index c9929e36..3e1ad720 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -1.4.2 \ No newline at end of file +1.5.0 \ No newline at end of file