diff --git a/src/DotNetBridge/Entrypoints.cs b/src/DotNetBridge/Entrypoints.cs
index 039247fe..9be84e67 100644
--- a/src/DotNetBridge/Entrypoints.cs
+++ b/src/DotNetBridge/Entrypoints.cs
@@ -65,7 +65,7 @@ public sealed class ModelSchemaOutput
public IDataView Schema;
}
- [TlcModule.EntryPoint(Name = "Models.Schema", Desc = "Retrieve input and output model schemas")]
+ [TlcModule.EntryPoint(Name = "Models.Schema", Desc = "Retrieve output model schema")]
public static ModelSchemaOutput GetSchema(IHostEnvironment env, TransformModelInput input)
{
Contracts.CheckValue(env, nameof(env));
@@ -87,5 +87,96 @@ public static CommonOutputs.TransformOutput CreateVariableColumn(IHostEnvironmen
var xf = VariableColumnTransform.Create(env, inputOptions, inputOptions.Data);
return new CommonOutputs.TransformOutput { Model = new TransformModelImpl(env, xf, inputOptions.Data), OutputData = xf };
}
+
+ /// <summary>
+ /// Input arguments for the Transforms.DatasetScorerEx entry point: the dataset
+ /// to score, the trained predictor model to apply, and an optional suffix for
+ /// the generated score columns.
+ /// </summary>
+ public sealed class ScoringTransformInput
+ {
+ [Argument(ArgumentType.Required, HelpText = "The dataset to be scored", SortOrder = 1)]
+ public IDataView Data;
+
+ [Argument(ArgumentType.Required, HelpText = "The predictor model to apply to data", SortOrder = 2)]
+ public PredictorModel PredictorModel;
+
+ // Optional; when null, default score column names are used.
+ [Argument(ArgumentType.AtMostOnce, HelpText = "Suffix to append to the score columns", SortOrder = 3)]
+ public string Suffix;
+ }
+
+ /// <summary>
+ /// Outputs of the Transforms.DatasetScorerEx entry point: the scored dataset
+ /// and the transform model that produced it.
+ /// </summary>
+ public sealed class ScoringTransformOutput
+ {
+ [TlcModule.Output(Desc = "The scored dataset", SortOrder = 1)]
+ public IDataView ScoredData;
+
+ [TlcModule.Output(Desc = "The scoring transform", SortOrder = 2)]
+ public TransformModel ScoringTransform;
+ }
+
+ /// <summary>
+ /// Structural compatibility of two schemas: same column count and, position by
+ /// position, equal column types. A null schema is compatible only with null.
+ /// NOTE(review): DataViewType does not overload operator!=, so the original
+ /// 'schema1[i].Type != schema2[i].Type' compared object references; two
+ /// structurally equal types allocated separately (e.g. vector types) would be
+ /// reported as incompatible. Use Equals for structural comparison.
+ /// NOTE(review): this helper is not referenced anywhere in this change —
+ /// confirm a caller exists or remove it.
+ /// </summary>
+ private static bool AreSchemasCompatible(DataViewSchema schema1, DataViewSchema schema2)
+ {
+ if (schema1 == null || schema2 == null)
+ return schema1 == schema2;
+ if (schema1.Count != schema2.Count)
+ return false;
+
+ for (int i = 0; i < schema1.Count; i++)
+ {
+ if (!schema1[i].Type.Equals(schema2[i].Type))
+ return false;
+ }
+
+ return true;
+ }
+
+ /// <summary>
+ /// Entry point that scores <see cref="ScoringTransformInput.Data"/> with the
+ /// supplied predictor model. Normally the model's own data-preparation pipeline
+ /// is used; if it cannot bind to the input schema (the csr_matrix case, where
+ /// the data is a single unnamed vector column), falls back to the bare trainer
+ /// model after converting the input column to the trained feature item type.
+ /// </summary>
+ [TlcModule.EntryPoint(Name = "Transforms.DatasetScorerEx", Desc = "Score a dataset with a predictor model")]
+ public static ScoringTransformOutput Score(IHostEnvironment env, ScoringTransformInput input)
+ {
+ Contracts.CheckValue(env, nameof(env));
+ var host = env.Register("ScoreModel");
+ host.CheckValue(input, nameof(input));
+ EntryPointUtils.CheckInputArgs(host, input);
+
+ RoleMappedData data;
+ IPredictor predictor;
+ var inputData = input.Data;
+ try
+ {
+ input.PredictorModel.PrepareData(host, inputData, out data, out predictor);
+ }
+ catch (Exception)
+ {
+ // This can happen in the csr_matrix case: the input arrives as a single
+ // vector column without the names the model's pipeline expects, so try to
+ // use only the trainer model.
+ // NOTE(review): the original used host.Assert here, which is a no-op in
+ // release builds; Check enforces the single-column assumption at runtime
+ // instead of silently proceeding on malformed input.
+ host.Check(inputData.Schema.Count == 1, "Expected a single-column dataset when the predictor model cannot bind to the input schema");
+ var inputColumnName = inputData.Schema[0].Name;
+ var trainingSchema = input.PredictorModel.GetTrainingSchema(host);
+ // Get the feature vector item type so the input column can be converted
+ // to what the trained predictor expects.
+ var trainingFeatureColumn = (DataViewSchema.Column)trainingSchema.Feature;
+ var requiredType = trainingFeatureColumn.Type.GetItemType().RawType;
+ var featuresColumnName = trainingFeatureColumn.Name;
+ predictor = input.PredictorModel.Predictor;
+ var xf = new TypeConvertingTransformer(host,
+ new TypeConvertingEstimator.ColumnOptions(featuresColumnName, requiredType, inputColumnName)).Transform(inputData);
+ data = new RoleMappedData(xf, null, featuresColumnName);
+ }
+
+ IDataView scoredPipe;
+ using (var ch = host.Start("Creating scoring pipeline"))
+ {
+ ch.Trace("Creating pipeline");
+ var bindable = ScoreUtils.GetSchemaBindableMapper(host, predictor);
+ ch.AssertValue(bindable);
+
+ var mapper = bindable.Bind(host, data.Schema);
+ var scorer = ScoreUtils.GetScorerComponent(host, mapper, input.Suffix);
+ scoredPipe = scorer.CreateComponent(host, data.Data, mapper, input.PredictorModel.GetTrainingSchema(host));
+ }
+
+ return
+ new ScoringTransformOutput
+ {
+ ScoredData = scoredPipe,
+ ScoringTransform = new TransformModelImpl(host, scoredPipe, inputData)
+ };
+ }
}
}
diff --git a/src/python/nimbusml.pyproj b/src/python/nimbusml.pyproj
index ced35764..a90735af 100644
--- a/src/python/nimbusml.pyproj
+++ b/src/python/nimbusml.pyproj
@@ -395,6 +395,7 @@
+
@@ -685,6 +686,7 @@
+
diff --git a/src/python/nimbusml/__init__.py b/src/python/nimbusml/__init__.py
index 4e18a65b..0b508fcf 100644
--- a/src/python/nimbusml/__init__.py
+++ b/src/python/nimbusml/__init__.py
@@ -2,7 +2,7 @@
Microsoft Machine Learning for Python
"""
-__version__ = '1.4.2'
+__version__ = '1.5.0'
# CoreCLR version of MicrosoftML is built on Windows.
# But file permissions are not preserved when it's copied to Linux.
diff --git a/src/python/nimbusml/examples/Schema.py b/src/python/nimbusml/examples/Schema.py
index ddec3c0d..c0b8d493 100644
--- a/src/python/nimbusml/examples/Schema.py
+++ b/src/python/nimbusml/examples/Schema.py
@@ -27,7 +27,7 @@
])
pipe.fit(data)
-schema = pipe.get_schema()
+schema = pipe.get_output_columns()
print(schema[0:5])
# ['Sentiment', 'SentimentText', 'features.Char.>|=|=', 'features.Char.=|=|r', 'features.Char.=|r|u']
diff --git a/src/python/nimbusml/internal/entrypoints/transforms_datasetscorerex.py b/src/python/nimbusml/internal/entrypoints/transforms_datasetscorerex.py
new file mode 100644
index 00000000..7a5d8c71
--- /dev/null
+++ b/src/python/nimbusml/internal/entrypoints/transforms_datasetscorerex.py
@@ -0,0 +1,68 @@
+"""
+Transforms.DatasetScorerEx
+"""
+
+
+from ..utils.entrypoints import EntryPoint
+from ..utils.utils import try_set, unlist
+
+
+# NOTE: follows the pattern of the auto-generated entry point wrappers in this
+# package; argument values are typically graph variable references of the form
+# "$name" (hence is_of_type=str below).
+def transforms_datasetscorerex(
+ data,
+ predictor_model,
+ scored_data=None,
+ scoring_transform=None,
+ suffix=None,
+ **params):
+ """
+ **Description**
+ Score a dataset with a predictor model
+
+ :param data: The dataset to be scored (inputs).
+ :param predictor_model: The predictor model to apply to data
+ (inputs).
+ :param suffix: Suffix to append to the score columns (inputs).
+ :param scored_data: The scored dataset (outputs).
+ :param scoring_transform: The scoring transform (outputs).
+ """
+
+ entrypoint_name = 'Transforms.DatasetScorerEx'
+ inputs = {}
+ outputs = {}
+
+ # Map each provided argument into the entry point graph-node dictionaries.
+ if data is not None:
+ inputs['Data'] = try_set(
+ obj=data,
+ none_acceptable=False,
+ is_of_type=str)
+ if predictor_model is not None:
+ inputs['PredictorModel'] = try_set(
+ obj=predictor_model,
+ none_acceptable=False,
+ is_of_type=str)
+ if suffix is not None:
+ inputs['Suffix'] = try_set(
+ obj=suffix,
+ none_acceptable=True,
+ is_of_type=str)
+ if scored_data is not None:
+ outputs['ScoredData'] = try_set(
+ obj=scored_data,
+ none_acceptable=False,
+ is_of_type=str)
+ if scoring_transform is not None:
+ outputs['ScoringTransform'] = try_set(
+ obj=scoring_transform, none_acceptable=False, is_of_type=str)
+
+ # Collect "$"-prefixed graph variable references so the runner can wire
+ # this node's inputs/outputs to the rest of the entry point graph.
+ input_variables = {
+ x for x in unlist(inputs.values())
+ if isinstance(x, str) and x.startswith("$")}
+ output_variables = {
+ x for x in unlist(outputs.values())
+ if isinstance(x, str) and x.startswith("$")}
+
+ entrypoint = EntryPoint(
+ name=entrypoint_name, inputs=inputs, outputs=outputs,
+ input_variables=input_variables,
+ output_variables=output_variables)
+ return entrypoint
diff --git a/src/python/nimbusml/pipeline.py b/src/python/nimbusml/pipeline.py
index efa81735..3e0dce27 100644
--- a/src/python/nimbusml/pipeline.py
+++ b/src/python/nimbusml/pipeline.py
@@ -40,8 +40,8 @@
models_regressionevaluator
from .internal.entrypoints.models_summarizer import models_summarizer
from .internal.entrypoints.models_schema import models_schema
-from .internal.entrypoints.transforms_datasetscorer import \
- transforms_datasetscorer
+from .internal.entrypoints.transforms_datasetscorerex import \
+ transforms_datasetscorerex
from .internal.entrypoints.transforms_datasettransformscorer import \
transforms_datasettransformscorer
from .internal.entrypoints.transforms_featurecombiner import \
@@ -1772,7 +1772,7 @@ def get_feature_contributions(self, X, top=10, bottom=10, verbose=0,
all_nodes = [importtext_node]
inputs = dict([('file', ''), ('predictor_model', self.model)])
- score_node = transforms_datasetscorer(
+ score_node = transforms_datasetscorerex(
data="$data",
predictor_model="$predictor_model",
scored_data="$scoredvectordata")
@@ -1815,7 +1815,7 @@ def get_feature_contributions(self, X, top=10, bottom=10, verbose=0,
return out_data
- def get_schema(self, verbose=0, **params):
+ def get_output_columns(self, verbose=0, **params):
"""
Returns the output list of columns for the fitted model.
:return: list .
@@ -2102,7 +2102,7 @@ def _predict(self, X, y=None,
all_nodes = [importtext_node]
inputs = dict([('file', ''), ('predictor_model', self.model)])
- score_node = transforms_datasetscorer(
+ score_node = transforms_datasetscorerex(
data="$data",
predictor_model="$predictor_model",
scored_data="$scoredVectorData")
diff --git a/src/python/nimbusml/tests/pipeline/test_csr_input.py b/src/python/nimbusml/tests/pipeline/test_csr_input.py
new file mode 100644
index 00000000..176a7651
--- /dev/null
+++ b/src/python/nimbusml/tests/pipeline/test_csr_input.py
@@ -0,0 +1,65 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+import os
+import unittest
+
+import numpy as np
+import pandas as pd
+from nimbusml import Pipeline, FileDataStream
+from nimbusml.datasets import get_dataset
+from nimbusml.feature_extraction.categorical import OneHotVectorizer
+from nimbusml.linear_model import LogisticRegressionBinaryClassifier
+from nimbusml.preprocessing import DatasetTransformer
+from nimbusml.preprocessing.schema import PrefixColumnConcatenator
+from nimbusml.preprocessing.schema import ColumnDropper
+from numpy.testing import assert_equal
+
+class TestCsrInput(unittest.TestCase):
+ """End-to-end check that a bare predictor model can score csr_matrix input
+ (featurize to sparse, then predict_proba / get_feature_contributions)."""
+
+ def test_predict_proba_on_csr(self):
+ path = get_dataset('infert').as_filepath()
+ data = FileDataStream.read_csv(path)
+ cols = list(data.head(1).columns.values) # ordered data column names.
+
+ # train featurizer
+ featurization_pipeline = Pipeline([OneHotVectorizer(columns={'education': 'education'})])
+ featurization_pipeline.fit(data)
+ # Note: the relative order of all columns is still the same as in raw data.
+ #print(featurization_pipeline.get_output_columns())
+
+ # need to remove extra columns before getting csr_matrix featurized data as it wont have column name information.
+ csr_featurization_pipeline = Pipeline([DatasetTransformer(featurization_pipeline.model), ColumnDropper() << ['case', 'row_num']])
+ sparse_featurized_data = csr_featurization_pipeline.fit_transform(data, as_csr=True)
+ # Note: the relative order of all columns is still the same.
+ #print(csr_featurization_pipeline.get_output_columns())
+
+ # train learner
+ # Note: order & number of feature columns for learner (parameter 'feature') should be the same as in csr_matrix above
+ cols.remove('row_num')
+ cols.remove('case')
+ feature_cols = cols
+ #print(feature_cols)
+ #['education', 'age', 'parity', 'induced', 'spontaneous', 'stratum', 'pooled.stratum']
+ training_pipeline = Pipeline([DatasetTransformer(featurization_pipeline.model), LogisticRegressionBinaryClassifier(feature=feature_cols, label='case')])
+ training_pipeline.fit(data, output_predictor_model=True)
+
+ # load just a learner model
+ predictor_pipeline = Pipeline()
+ predictor_pipeline.load_model(training_pipeline.predictor_model)
+ # see the order of Feature.* columns that get passed to learner algo
+ #print(predictor_pipeline.get_output_columns())
+
+ # use just a learner model on csr_matrix featurized data
+ # infert has 248 rows; predict_proba yields one (p0, p1) pair per row.
+ predictions = predictor_pipeline.predict_proba(sparse_featurized_data)
+ assert_equal(len(predictions), 248)
+ assert_equal(len(predictions[0]), 2)
+
+ # get feature contributions: 248 rows by 30 columns per the assertion below.
+ fcc = predictor_pipeline.get_feature_contributions(sparse_featurized_data)
+ assert_equal(fcc.shape, (248,30))
+
+if __name__ == '__main__':
+ unittest.main()
+
diff --git a/src/python/nimbusml/tests/pipeline/test_pipeline_get_schema.py b/src/python/nimbusml/tests/pipeline/test_pipeline_get_schema.py
index 5eccf4d7..63bb5310 100644
--- a/src/python/nimbusml/tests/pipeline/test_pipeline_get_schema.py
+++ b/src/python/nimbusml/tests/pipeline/test_pipeline_get_schema.py
@@ -28,7 +28,7 @@ def test_get_schema_returns_correct_value_for_single_valued_columns(self):
pipeline.fit(df)
df = pipeline.transform(df)
- schema = pipeline.get_schema()
+ schema = pipeline.get_output_columns()
self.assertTrue('c1' in schema)
self.assertTrue('c2' in schema)
@@ -39,7 +39,7 @@ def test_get_schema_returns_correct_value_for_vector_valued_columns(self):
pipeline = Pipeline([OneHotVectorizer() << 'c0'])
pipeline.fit(train_df)
- schema = pipeline.get_schema()
+ schema = pipeline.get_output_columns()
self.assertTrue('c0.a' in schema)
self.assertTrue('c0.b' in schema)
@@ -55,7 +55,7 @@ def test_get_schema_does_not_work_when_predictor_is_part_of_model(self):
pipeline.fit(df)
try:
- schema = pipeline.get_schema()
+ schema = pipeline.get_output_columns()
except Exception as e:
pass
else:
diff --git a/src/python/setup.py b/src/python/setup.py
index 7b983db8..fc350275 100644
--- a/src/python/setup.py
+++ b/src/python/setup.py
@@ -45,7 +45,7 @@
# Versions should comply with PEP440. For a discussion on
# single-sourcing the version across setup.py and the project code, see
# https://packaging.python.org/en/latest/single_source_version.html
- version='1.4.2',
+ version='1.5.0',
description='NimbusML',
long_description=long_description,
diff --git a/version.txt b/version.txt
index c9929e36..3e1ad720 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-1.4.2
\ No newline at end of file
+1.5.0
\ No newline at end of file