diff --git a/src/DotNetBridge/Bridge.cs b/src/DotNetBridge/Bridge.cs index 96100247..302a9426 100644 --- a/src/DotNetBridge/Bridge.cs +++ b/src/DotNetBridge/Bridge.cs @@ -314,7 +314,7 @@ private static unsafe int GenericExec(EnvironmentBlock* penv, sbyte* psz, int cd env.ComponentCatalog.RegisterAssembly(typeof(CategoricalCatalog).Assembly); // ML.Transforms env.ComponentCatalog.RegisterAssembly(typeof(FastTreeRegressionTrainer).Assembly); // ML.FastTree - //env.ComponentCatalog.RegisterAssembly(typeof(EnsembleModelParameters).Assembly); // ML.Ensemble + env.ComponentCatalog.RegisterAssembly(typeof(EnsembleModelParameters).Assembly); // ML.Ensemble env.ComponentCatalog.RegisterAssembly(typeof(KMeansModelParameters).Assembly); // ML.KMeansClustering env.ComponentCatalog.RegisterAssembly(typeof(PcaModelParameters).Assembly); // ML.PCA env.ComponentCatalog.RegisterAssembly(typeof(CVSplit).Assembly); // ML.EntryPoints diff --git a/src/python/docs/docstrings/ClassifierBestPerformanceSelector.txt b/src/python/docs/docstrings/ClassifierBestPerformanceSelector.txt new file mode 100644 index 00000000..c7f2449a --- /dev/null +++ b/src/python/docs/docstrings/ClassifierBestPerformanceSelector.txt @@ -0,0 +1,38 @@ + """ + + **Description** + Combines only the models with the best performance. + + + :param metric_name: the metric type to be used to find the weights for + each model. Can be ``"AccuracyMicro"``, ``"AccuracyMacro"``, + ``"LogLoss"``, or ``"LogLossReduction"``. + + + .. seealso:: + :py:class:`EnsembleClassifier + ` + + * Submodel selectors: + :py:class:`ClassifierAllSelector + `, + :py:class:`ClassifierBestDiverseSelector + ` + + * Output combiners: + :py:class:`ClassifierAverage + `, + :py:class:`ClassifierMedian + `, + :py:class:`ClassifierStacking + `, + :py:class:`ClassifierVoting + ` + + + .. index:: models, ensemble, classification + + Example: + .. literalinclude:: /../nimbusml/examples/EnsembleClassifier.py + :language: python + """ \ No newline at end of file diff --git a/src/python/docs/docstrings/ClassifierWeightedAverage.txt b/src/python/docs/docstrings/ClassifierWeightedAverage.txt new file mode 100644 index 00000000..2c3a77b0 --- /dev/null +++ b/src/python/docs/docstrings/ClassifierWeightedAverage.txt @@ -0,0 +1,61 @@ + """ + + **Description** + Computes the weighted average of the outputs of the trained models + + + :param weightage_name: the metric type to be used to find the weights for + each model. Can be ``"AccuracyMicroAvg"`` or ``"AccuracyMacroAvg"``. + + :param normalize: Specifies the type of automatic normalization used: + + * ``"Auto"``: if normalization is needed, it is performed + automatically. This is the default choice. + * ``"No"``: no normalization is performed. + * ``"Yes"``: normalization is performed. + * ``"Warn"``: if normalization is needed, a warning + message is displayed, but normalization is not performed. + + Normalization rescales disparate data ranges to a standard scale. + Feature + scaling ensures the distances between data points are proportional + and + enables various optimization methods such as gradient descent to + converge + much faster. If normalization is performed, a ``MinMax`` normalizer + is + used. It normalizes values in an interval [a, b] where ``-1 <= a <= + 0`` + and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves + sparsity by mapping zero to zero. + + + .. 
seealso::
+        :py:class:`EnsembleClassifier <nimbusml.ensemble.EnsembleClassifier>`
+
+        * Submodel selectors:
+          :py:class:`ClassifierAllSelector <nimbusml.ensemble.sub_model_selector.ClassifierAllSelector>`,
+          :py:class:`ClassifierBestDiverseSelector <nimbusml.ensemble.sub_model_selector.ClassifierBestDiverseSelector>`,
+          :py:class:`ClassifierBestPerformanceSelector <nimbusml.ensemble.sub_model_selector.ClassifierBestPerformanceSelector>`
+
+        * Output combiners:
+          :py:class:`ClassifierAverage <nimbusml.ensemble.output_combiner.ClassifierAverage>`,
+          :py:class:`ClassifierMedian <nimbusml.ensemble.output_combiner.ClassifierMedian>`,
+          :py:class:`ClassifierStacking <nimbusml.ensemble.output_combiner.ClassifierStacking>`,
+          :py:class:`ClassifierVoting <nimbusml.ensemble.output_combiner.ClassifierVoting>`
+
+
+    .. index:: models, ensemble, classification
+
+    Example:
+        .. literalinclude:: /../nimbusml/examples/EnsembleClassifier.py
+               :language: python
+    """
\ No newline at end of file
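To make the weighted-average combiner documented above concrete, here is a minimal sketch under stated assumptions: scikit-learn's ``LogisticRegression`` stands in for the base learner, the data is synthetic, and each model's weight is simply its validation accuracy — the spirit of ``weightage_name='AccuracyMicroAvg'``, not nimbusml's exact computation.

```python
# Sketch of a weighted-average output combiner -- NOT nimbusml's implementation.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=400, random_state=0)
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.3, random_state=0)

models = [LogisticRegression(C=c, max_iter=1000).fit(X_tr, y_tr)
          for c in (0.01, 0.1, 1.0)]

# Weight each model by its validation accuracy, then normalize the weights.
weights = np.array([m.score(X_val, y_val) for m in models])
weights /= weights.sum()

# Weighted average of per-class probabilities, then argmax for the label.
proba = sum(w * m.predict_proba(X_val) for w, m in zip(weights, models))
predictions = proba.argmax(axis=1)
```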
diff --git a/src/python/docs/docstrings/EnsembleClassifier.txt b/src/python/docs/docstrings/EnsembleClassifier.txt
new file mode 100644
index 00000000..3301f266
--- /dev/null
+++ b/src/python/docs/docstrings/EnsembleClassifier.txt
@@ -0,0 +1,144 @@
+    """
+
+    **Description**
+        Train a multiclass ensemble model.
+
+    .. remarks::
+        An ensemble is a set of models, each trained on a sample of the
+        training set. Training an ensemble instead of a single model can
+        boost the accuracy of a given algorithm. The quality of an ensemble
+        depends on two factors: accuracy and diversity. An ensemble is
+        analogous to a team: if every member is competent and the members
+        are diverse, the team performs well. Here a team member is a base
+        learner and the team is the ensemble. In the case of classification
+        ensembles, the base learner is a ``LogisticRegressionClassifier``.
+
+
+    :param sampling_type: Specifies how the training samples are created:
+
+        * ``BootstrapSelector``: takes a bootstrap sample of the training set
+          (sampling with replacement). This is the default method.
+        * ``RandomPartitionSelector``: randomly partitions the training set
+          into subsets.
+        * ``AllSelector``: every model is trained using the whole training set.
+
+        Each of these subset selectors has two options for selecting features:
+
+        * ``AllFeatureSelector``: selects all the features. This is the
+          default method.
+        * ``RandomFeatureSelector``: selects a random subset of the features
+          for each model.
+
+    :param num_models: indicates the number of models to train, i.e. the
+        number of subsets of the training set to sample. The default value is
+        50. If batches are used, then this indicates the number of models per
+        batch.
+
+    :param sub_model_selector_type: Determines the efficient set of models the
+        ``output_combiner`` uses, and removes the least significant models.
+        This is used to improve the accuracy and reduce the model size. This
+        is also called pruning.
+
+        * ``ClassifierAllSelector``: does not perform any pruning and selects
+          all models in the ensemble to combine to create the output. This is
+          the default submodel selector.
+        * ``ClassifierBestDiverseSelector``: combines models whose predictions
+          are as diverse as possible. Currently, only disagreement diversity
+          is supported.
+        * ``ClassifierBestPerformanceSelector``: combines only the models with
+          the best performance according to some metric. The metric can be
+          ``"AccuracyMicro"``, ``"AccuracyMacro"``, ``"LogLoss"``,
+          or ``"LogLossReduction"``.
+
+
+    :param output_combiner: indicates how to combine the predictions of the
+        different models into a single prediction. There are five available
+        output combiners for classification:
+
+        * ``ClassifierAverage``: computes the average of the scores produced
+          by the trained models.
+        * ``ClassifierMedian``: computes the median of the scores produced by
+          the trained models.
+        * ``ClassifierStacking``: computes the output by training a model on
+          a training set where each instance is a vector containing the
+          outputs of the different models on a training instance, and the
+          instance's label.
+        * ``ClassifierVoting``: computes the fraction of positive predictions
+          for each class from all the trained models, and outputs the class
+          with the largest number.
+        * ``ClassifierWeightedAverage``: computes the weighted average of the
+          outputs of the trained models, weighted by the specified metric.
+          The metric can be ``"AccuracyMicroAvg"`` or ``"AccuracyMacroAvg"``.
+
+    :param normalize: Specifies the type of automatic normalization used:
+
+        * ``"Auto"``: if normalization is needed, it is performed
+          automatically. This is the default choice.
+        * ``"No"``: no normalization is performed.
+        * ``"Yes"``: normalization is performed.
+        * ``"Warn"``: if normalization is needed, a warning message is
+          displayed, but normalization is not performed.
+
+        Normalization rescales disparate data ranges to a standard scale.
+        Feature scaling ensures the distances between data points are
+        proportional and enables various optimization methods such as
+        gradient descent to converge much faster. If normalization is
+        performed, a ``MinMax`` normalizer is used. It normalizes values in
+        an interval [a, b] where ``-1 <= a <= 0`` and ``0 <= b <= 1`` and
+        ``b - a = 1``. This normalizer preserves sparsity by mapping zero to
+        zero.
+
+    :param batch_size: train the models iteratively on subsets of the
+        training set of this size. When using this option, it is assumed that
+        the training set is randomized enough so that every batch is a random
+        sample of instances. The default value is -1, indicating using the
+        whole training set. If the value is changed to an integer greater
+        than 0, the number of trained models is the number of batches (the
+        size of the training set divided by the batch size), times
+        ``num_models``.
+
+    .. seealso::
+        * Subset selectors:
+          :py:class:`AllInstanceSelector <nimbusml.ensemble.subset_selector.AllInstanceSelector>`,
+          :py:class:`BootstrapSelector <nimbusml.ensemble.subset_selector.BootstrapSelector>`,
+          :py:class:`RandomPartitionSelector <nimbusml.ensemble.subset_selector.RandomPartitionSelector>`
+
+        * Feature selectors:
+          :py:class:`AllFeatureSelector <nimbusml.ensemble.feature_selector.AllFeatureSelector>`,
+          :py:class:`RandomFeatureSelector <nimbusml.ensemble.feature_selector.RandomFeatureSelector>`
+
+        * Submodel selectors:
+          :py:class:`ClassifierAllSelector <nimbusml.ensemble.sub_model_selector.ClassifierAllSelector>`,
+          :py:class:`ClassifierBestDiverseSelector <nimbusml.ensemble.sub_model_selector.ClassifierBestDiverseSelector>`,
+          :py:class:`ClassifierBestPerformanceSelector <nimbusml.ensemble.sub_model_selector.ClassifierBestPerformanceSelector>`
+
+        * Output combiners:
+          :py:class:`ClassifierAverage <nimbusml.ensemble.output_combiner.ClassifierAverage>`,
+          :py:class:`ClassifierMedian <nimbusml.ensemble.output_combiner.ClassifierMedian>`,
+          :py:class:`ClassifierStacking <nimbusml.ensemble.output_combiner.ClassifierStacking>`,
+          :py:class:`ClassifierVoting <nimbusml.ensemble.output_combiner.ClassifierVoting>`,
+          :py:class:`ClassifierWeightedAverage <nimbusml.ensemble.output_combiner.ClassifierWeightedAverage>`
+
+
+    .. index:: models, ensemble, classification
+
+    Example:
+        .. literalinclude:: /../nimbusml/examples/EnsembleClassifier.py
+               :language: python
+    """
\ No newline at end of file
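The bagging-plus-averaging scheme this docstring describes can be sketched in a few lines. This is a conceptual illustration with scikit-learn's ``LogisticRegression`` as a stand-in base learner and synthetic data — not nimbusml's implementation.

```python
# Sketch of BootstrapSelector sampling + ClassifierAverage combining.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=500, n_features=20, random_state=0)

rng = np.random.RandomState(0)
num_models = 10
models = []
for _ in range(num_models):
    # BootstrapSelector: sample the training set with replacement.
    idx = rng.randint(0, len(X), size=len(X))
    models.append(LogisticRegression(max_iter=1000).fit(X[idx], y[idx]))

# ClassifierAverage: average the per-class probabilities of all models.
avg_proba = np.mean([m.predict_proba(X) for m in models], axis=0)
predictions = avg_proba.argmax(axis=1)
```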
diff --git a/src/python/docs/docstrings/EnsembleRegressor.txt b/src/python/docs/docstrings/EnsembleRegressor.txt
new file mode 100644
index 00000000..e185307c
--- /dev/null
+++ b/src/python/docs/docstrings/EnsembleRegressor.txt
@@ -0,0 +1,134 @@
+    """
+
+    **Description**
+        Train a regression ensemble model.
+
+    .. remarks::
+        An ensemble is a set of models, each trained on a sample of the
+        training set. Training an ensemble instead of a single model can
+        boost the accuracy of a given algorithm. The quality of an ensemble
+        depends on two factors: accuracy and diversity. An ensemble is
+        analogous to a team: if every member is competent and the members
+        are diverse, the team performs well. Here a team member is a base
+        learner and the team is the ensemble. In the case of regression
+        ensembles, the base learner is an ``OnlineGradientDescentRegressor``.
+
+
+    :param sampling_type: Specifies how the training samples are created:
+
+        * ``BootstrapSelector``: takes a bootstrap sample of the training set
+          (sampling with replacement). This is the default method.
+        * ``RandomPartitionSelector``: randomly partitions the training set
+          into subsets.
+        * ``AllSelector``: every model is trained using the whole training set.
+
+        Each of these subset selectors has two options for selecting features:
+
+        * ``AllFeatureSelector``: selects all the features. This is the
+          default method.
+        * ``RandomFeatureSelector``: selects a random subset of the features
+          for each model.
+
+    :param num_models: indicates the number of models to train, i.e. the
+        number of subsets of the training set to sample. The default value is
+        50. If batches are used, then this indicates the number of models per
+        batch.
+
+    :param sub_model_selector_type: Determines the efficient set of models the
+        ``output_combiner`` uses, and removes the least significant models.
+        This is used to improve the accuracy and reduce the model size. This
+        is also called pruning.
+
+        * ``RegressorAllSelector``: does not perform any pruning and selects
+          all models in the ensemble to combine to create the output. This is
+          the default submodel selector.
+        * ``RegressorBestDiverseSelector``: combines models whose predictions
+          are as diverse as possible. Currently, only disagreement diversity
+          is supported.
+        * ``RegressorBestPerformanceSelector``: combines only the models with
+          the best performance according to the specified metric. The metric
+          can be ``"L1"``, ``"L2"``, ``"Rms"``, ``"Loss"``, or ``"RSquared"``.
+
+
+    :param output_combiner: indicates how to combine the predictions of the
+        different models into a single prediction. There are three available
+        output combiners for regression:
+
+        * ``RegressorAverage``: computes the average of the scores produced
+          by the trained models.
+        * ``RegressorMedian``: computes the median of the scores produced by
+          the trained models.
+        * ``RegressorStacking``: computes the output by training a model on
+          a training set where each instance is a vector containing the
+          outputs of the different models on a training instance, and the
+          instance's label.
+
+    :param normalize: Specifies the type of automatic normalization used:
+
+        * ``"Auto"``: if normalization is needed, it is performed
+          automatically. This is the default choice.
+        * ``"No"``: no normalization is performed.
+        * ``"Yes"``: normalization is performed.
+        * ``"Warn"``: if normalization is needed, a warning message is
+          displayed, but normalization is not performed.
+
+        Normalization rescales disparate data ranges to a standard scale.
+        Feature scaling ensures the distances between data points are
+        proportional and enables various optimization methods such as
+        gradient descent to converge much faster. If normalization is
+        performed, a ``MinMax`` normalizer is used. It normalizes values in
+        an interval [a, b] where ``-1 <= a <= 0`` and ``0 <= b <= 1`` and
+        ``b - a = 1``. This normalizer preserves sparsity by mapping zero to
+        zero.
+
+    :param batch_size: train the models iteratively on subsets of the
+        training set of this size. When using this option, it is assumed that
+        the training set is randomized enough so that every batch is a random
+        sample of instances. The default value is -1, indicating using the
+        whole training set. If the value is changed to an integer greater
+        than 0, the number of trained models is the number of batches (the
+        size of the training set divided by the batch size), times
+        ``num_models``.
+
+    .. seealso::
+        * Subset selectors:
+          :py:class:`AllInstanceSelector <nimbusml.ensemble.subset_selector.AllInstanceSelector>`,
+          :py:class:`BootstrapSelector <nimbusml.ensemble.subset_selector.BootstrapSelector>`,
+          :py:class:`RandomPartitionSelector <nimbusml.ensemble.subset_selector.RandomPartitionSelector>`
+
+        * Feature selectors:
+          :py:class:`AllFeatureSelector <nimbusml.ensemble.feature_selector.AllFeatureSelector>`,
+          :py:class:`RandomFeatureSelector <nimbusml.ensemble.feature_selector.RandomFeatureSelector>`
+
+        * Submodel selectors:
+          :py:class:`RegressorAllSelector <nimbusml.ensemble.sub_model_selector.RegressorAllSelector>`,
+          :py:class:`RegressorBestDiverseSelector <nimbusml.ensemble.sub_model_selector.RegressorBestDiverseSelector>`,
+          :py:class:`RegressorBestPerformanceSelector <nimbusml.ensemble.sub_model_selector.RegressorBestPerformanceSelector>`
+
+        * Output combiners:
+          :py:class:`RegressorAverage <nimbusml.ensemble.output_combiner.RegressorAverage>`,
+          :py:class:`RegressorMedian <nimbusml.ensemble.output_combiner.RegressorMedian>`,
+          :py:class:`RegressorStacking <nimbusml.ensemble.output_combiner.RegressorStacking>`
+
+
+    .. index:: models, ensemble, regression
+
+    Example:
+        .. literalinclude:: /../nimbusml/examples/EnsembleRegressor.py
+               :language: python
+    """
\ No newline at end of file
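The stacking combiner described above (``RegressorStacking``) is the least obvious of the three, so here is a conceptual sketch — not nimbusml's code — using scikit-learn models and a 30% holdout, mirroring a ``validation_dataset_proportion`` of 0.3.

```python
# Sketch of stacking: base regressors on one split, a meta-model on their outputs.
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=600, n_features=10, noise=5.0, random_state=0)
# Hold out 30% of the instances to fit the meta-model.
X_base, X_meta, y_base, y_meta = train_test_split(
    X, y, test_size=0.3, random_state=0)

base_models = [SGDRegressor(max_iter=2000, random_state=i).fit(X_base, y_base)
               for i in range(5)]

# Each held-out instance becomes a vector of the base models' outputs.
meta_features = np.column_stack([m.predict(X_meta) for m in base_models])
meta_model = LinearRegression().fit(meta_features, y_meta)

# Final prediction: run the base models, then the meta-model on their outputs.
y_pred = meta_model.predict(
    np.column_stack([m.predict(X) for m in base_models]))
```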
diff --git a/src/python/docs/docstrings/RegressorBestPerformanceSelector.txt b/src/python/docs/docstrings/RegressorBestPerformanceSelector.txt
new file mode 100644
index 00000000..83ba0116
--- /dev/null
+++ b/src/python/docs/docstrings/RegressorBestPerformanceSelector.txt
@@ -0,0 +1,36 @@
+    """
+
+    **Description**
+        Combines only the models with the best performance.
+
+
+    :param metric_name: the metric type to be used to find the weights for
+        each model. Can be ``"L1"``, ``"L2"``, ``"Rms"``, ``"Loss"``, or
+        ``"RSquared"``.
+
+
+    .. seealso::
+        :py:class:`EnsembleRegressor <nimbusml.ensemble.EnsembleRegressor>`
+
+        * Submodel selectors:
+          :py:class:`RegressorAllSelector <nimbusml.ensemble.sub_model_selector.RegressorAllSelector>`,
+          :py:class:`RegressorBestDiverseSelector <nimbusml.ensemble.sub_model_selector.RegressorBestDiverseSelector>`
+
+        * Output combiners:
+          :py:class:`RegressorAverage <nimbusml.ensemble.output_combiner.RegressorAverage>`,
+          :py:class:`RegressorMedian <nimbusml.ensemble.output_combiner.RegressorMedian>`,
+          :py:class:`RegressorStacking <nimbusml.ensemble.output_combiner.RegressorStacking>`
+
+
+    .. index:: models, ensemble, regression
+
+    Example:
+        .. literalinclude:: /../nimbusml/examples/EnsembleRegressor.py
+               :language: python
+    """
\ No newline at end of file
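Best-performance pruning amounts to scoring every trained model on a holdout and keeping only the top fraction. A minimal sketch under stated assumptions (scikit-learn models, synthetic data, RMSE as the ``"Rms"`` metric, ``learners_selection_proportion`` of 0.5) — not nimbusml's implementation:

```python
# Sketch of best-performance submodel selection followed by averaging.
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=600, n_features=10, noise=5.0, random_state=1)
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.3, random_state=1)

models = [SGDRegressor(max_iter=2000, random_state=i).fit(X_tr, y_tr)
          for i in range(8)]

# Rank by validation RMSE (lower is better) and keep the best half.
rmse = [mean_squared_error(y_val, m.predict(X_val)) ** 0.5 for m in models]
keep = int(len(models) * 0.5)
best = [models[i] for i in np.argsort(rmse)[:keep]]

# Combine the surviving models (RegressorAverage-style).
y_pred = np.mean([m.predict(X) for m in best], axis=0)
```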
diff --git a/src/python/nimbusml.pyproj b/src/python/nimbusml.pyproj
index 6ac9feba..e2ae20cd 100644
--- a/src/python/nimbusml.pyproj
+++ b/src/python/nimbusml.pyproj
@@ -46,6 +46,36 @@ + + + + + + + + + + + + + + + + + + + + + + + Code + + + + + + + @@ -68,6 +98,7 @@ + @@ -75,6 +106,8 @@ + + @@ -126,6 +159,7 @@ + @@ -212,6 +246,34 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -287,6 +349,8 @@ + + @@ -411,13 +475,18 @@ + + + + + @@ -561,9 +630,7 @@ - - Code - + @@ -585,6 +652,8 @@ + + @@ -752,6 +821,11 @@ + + + + + @@ -767,6 +841,11 @@ + + + + + @@ -833,6 +912,8 @@ + + @@ -842,6 +923,8 @@ + + @@ -908,6 +991,7 @@ +
diff --git a/src/python/nimbusml/ensemble/__init__.py b/src/python/nimbusml/ensemble/__init__.py
index b4934efa..d246e218 100644
--- a/src/python/nimbusml/ensemble/__init__.py
+++ b/src/python/nimbusml/ensemble/__init__.py
@@ -1,3 +1,5 @@
+from .ensembleclassifier import EnsembleClassifier
+from .ensembleregressor import EnsembleRegressor
 from .fastforestbinaryclassifier import FastForestBinaryClassifier
 from .fastforestregressor import FastForestRegressor
 from .fasttreesbinaryclassifier import FastTreesBinaryClassifier
@@ -11,6 +13,8 @@ from .lightgbmregressor import LightGbmRegressor
 
 __all__ = [
+    'EnsembleClassifier',
+    'EnsembleRegressor',
     'FastForestBinaryClassifier',
     'FastForestRegressor',
     'FastTreesBinaryClassifier',
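A quick usage sketch for the estimators exported above, assuming they follow the usual nimbusml fit/predict convention; the data is synthetic and the parameter values are illustrative only, not recommendations.

```python
# Hypothetical usage of the new API added in this PR.
import numpy as np
from nimbusml.ensemble import EnsembleClassifier
from nimbusml.ensemble.output_combiner import ClassifierVoting
from nimbusml.ensemble.sub_model_selector import \
    ClassifierBestPerformanceSelector

X = np.random.rand(200, 5).astype(np.float32)
y = (X.sum(axis=1) > 2.5).astype(np.uint32)

model = EnsembleClassifier(
    num_models=8,
    sub_model_selector_type=ClassifierBestPerformanceSelector(
        metric_name='AccuracyMicro'),
    output_combiner=ClassifierVoting())
model.fit(X, y)
predictions = model.predict(X)
```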
diff --git a/src/python/nimbusml/ensemble/ensembleclassifier.py b/src/python/nimbusml/ensemble/ensembleclassifier.py
new file mode 100644
index 00000000..cf60f34d
--- /dev/null
+++ b/src/python/nimbusml/ensemble/ensembleclassifier.py
@@ -0,0 +1,247 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+EnsembleClassifier
+"""
+
+__all__ = ["EnsembleClassifier"]
+
+
+from sklearn.base import ClassifierMixin
+
+from ..base_predictor import BasePredictor
+from ..internal.core.ensemble.ensembleclassifier import \
+    EnsembleClassifier as core
+from ..internal.utils.utils import trace
+from .feature_selector import AllFeatureSelector
+from .subset_selector import BootstrapSelector
+
+
+class EnsembleClassifier(core, BasePredictor, ClassifierMixin):
+    """
+
+    **Description**
+        Train a multiclass ensemble model.
+
+    .. remarks::
+        An ensemble is a set of models, each trained on a sample of the
+        training set. Training an ensemble instead of a single model can
+        boost the accuracy of a given algorithm. The quality of an ensemble
+        depends on two factors: accuracy and diversity. An ensemble is
+        analogous to a team: if every member is competent and the members
+        are diverse, the team performs well. Here a team member is a base
+        learner and the team is the ensemble. In the case of classification
+        ensembles, the base learner is a ``LogisticRegressionClassifier``.
+
+
+    :param feature: see `Columns `_.
+
+    :param label: see `Columns `_.
+
+    :param sampling_type: Specifies how the training samples are created:
+
+        * ``BootstrapSelector``: takes a bootstrap sample of the training set
+          (sampling with replacement). This is the default method.
+        * ``RandomPartitionSelector``: randomly partitions the training set
+          into subsets.
+        * ``AllSelector``: every model is trained using the whole training set.
+
+        Each of these subset selectors has two options for selecting features:
+
+        * ``AllFeatureSelector``: selects all the features. This is the
+          default method.
+        * ``RandomFeatureSelector``: selects a random subset of the features
+          for each model.
+
+    :param num_models: indicates the number of models to train, i.e. the
+        number of subsets of the training set to sample. The default value is
+        50. If batches are used, then this indicates the number of models per
+        batch.
+
+    :param sub_model_selector_type: Determines the efficient set of models the
+        ``output_combiner`` uses, and removes the least significant models.
+        This is used to improve the accuracy and reduce the model size. This
+        is also called pruning.
+
+        * ``ClassifierAllSelector``: does not perform any pruning and selects
+          all models in the ensemble to combine to create the output. This is
+          the default submodel selector.
+        * ``ClassifierBestDiverseSelector``: combines models whose predictions
+          are as diverse as possible. Currently, only disagreement diversity
+          is supported.
+        * ``ClassifierBestPerformanceSelector``: combines only the models with
+          the best performance according to some metric. The metric can be
+          ``"AccuracyMicro"``, ``"AccuracyMacro"``, ``"LogLoss"``,
+          or ``"LogLossReduction"``.
+
+
+    :param output_combiner: indicates how to combine the predictions of the
+        different models into a single prediction. There are five available
+        output combiners for classification:
+
+        * ``ClassifierAverage``: computes the average of the scores produced
+          by the trained models.
+        * ``ClassifierMedian``: computes the median of the scores produced by
+          the trained models.
+        * ``ClassifierStacking``: computes the output by training a model on
+          a training set where each instance is a vector containing the
+          outputs of the different models on a training instance, and the
+          instance's label.
+        * ``ClassifierVoting``: computes the fraction of positive predictions
+          for each class from all the trained models, and outputs the class
+          with the largest number.
+        * ``ClassifierWeightedAverage``: computes the weighted average of the
+          outputs of the trained models, weighted by the specified metric.
+          The metric can be ``"AccuracyMicroAvg"`` or ``"AccuracyMacroAvg"``.
+
+    :param normalize: Specifies the type of automatic normalization used:
+
+        * ``"Auto"``: if normalization is needed, it is performed
+          automatically. This is the default choice.
+        * ``"No"``: no normalization is performed.
+        * ``"Yes"``: normalization is performed.
+        * ``"Warn"``: if normalization is needed, a warning message is
+          displayed, but normalization is not performed.
+
+        Normalization rescales disparate data ranges to a standard scale.
+        Feature scaling ensures the distances between data points are
+        proportional and enables various optimization methods such as
+        gradient descent to converge much faster. If normalization is
+        performed, a ``MinMax`` normalizer is used. It normalizes values in
+        an interval [a, b] where ``-1 <= a <= 0`` and ``0 <= b <= 1`` and
+        ``b - a = 1``. This normalizer preserves sparsity by mapping zero to
+        zero.
+
+    :param caching: Whether trainer should cache input training data.
+
+    :param train_parallel: All the base learners will run asynchronously if
+        the value is true.
+
+    :param batch_size: train the models iteratively on subsets of the
+        training set of this size.
When using this option, it is assumed that the + training set is randomized enough so that every batch is a random + sample of instances. The default value is -1, indicating using the + whole training set. If the value is changed to an integer greater than + 0, the number of trained models is the number of batches (the size of + the training set divided by the batch size), times ``num_models``. + + :param show_metrics: True, if metrics for each model need to be evaluated + and shown in comparison table. This is done by using validation set if + available or the training set. + + :param params: Additional arguments sent to compute engine. + + .. seealso:: + * Subset selectors: + :py:class:`AllInstanceSelector + `, + :py:class:`BootstrapSelector + `, + :py:class:`RandomPartitionSelector + ` + + * Feature selectors: + :py:class:`AllFeatureSelector + `, + :py:class:`RandomFeatureSelector + ` + + * Submodel selectors: + :py:class:`ClassifierAllSelector + `, + :py:class:`ClassifierBestDiverseSelector + `, + :py:class:`ClassifierBestPerformanceSelector + ` + + * Output combiners: + :py:class:`ClassifierAverage + `, + :py:class:`ClassifierMedian + `, + :py:class:`ClassifierStacking + `, + :py:class:`ClassifierVoting + `, + :py:class:`ClassifierWeightedAverage + ` + + + .. index:: models, ensemble, classification + + Example: + .. literalinclude:: /../nimbusml/examples/EnsembleClassifier.py + :language: python + """ + + @trace + def __init__( + self, + sampling_type=BootstrapSelector( + feature_selector=AllFeatureSelector()), + num_models=None, + sub_model_selector_type=None, + output_combiner=None, + normalize='Auto', + caching='Auto', + train_parallel=False, + batch_size=-1, + show_metrics=False, + feature=None, + label=None, + **params): + + if 'feature_column_name' in params: + raise NameError( + "'feature_column_name' must be renamed to 'feature'") + if feature: + params['feature_column_name'] = feature + if 'label_column_name' in params: + raise NameError( + "'label_column_name' must be renamed to 'label'") + if label: + params['label_column_name'] = label + BasePredictor.__init__(self, type='classifier', **params) + core.__init__( + self, + sampling_type=sampling_type, + num_models=num_models, + sub_model_selector_type=sub_model_selector_type, + output_combiner=output_combiner, + normalize=normalize, + caching=caching, + train_parallel=train_parallel, + batch_size=batch_size, + show_metrics=show_metrics, + **params) + self.feature = feature + self.label = label + + @trace + def predict_proba(self, X, **params): + ''' + Returns probabilities + ''' + return self._predict_proba(X, **params) + + @trace + def decision_function(self, X, **params): + ''' + Returns score values + ''' + return self._decision_function(X, **params) + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/nimbusml/ensemble/ensembleregressor.py b/src/python/nimbusml/ensemble/ensembleregressor.py new file mode 100644 index 00000000..45fb10f5 --- /dev/null +++ b/src/python/nimbusml/ensemble/ensembleregressor.py @@ -0,0 +1,223 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+EnsembleRegressor
+"""
+
+__all__ = ["EnsembleRegressor"]
+
+
+from sklearn.base import RegressorMixin
+
+from ..base_predictor import BasePredictor
+from ..internal.core.ensemble.ensembleregressor import \
+    EnsembleRegressor as core
+from ..internal.utils.utils import trace
+from .feature_selector import AllFeatureSelector
+from .subset_selector import BootstrapSelector
+
+
+class EnsembleRegressor(core, BasePredictor, RegressorMixin):
+    """
+
+    **Description**
+        Train a regression ensemble model.
+
+    .. remarks::
+        An ensemble is a set of models, each trained on a sample of the
+        training set. Training an ensemble instead of a single model can
+        boost the accuracy of a given algorithm. The quality of an ensemble
+        depends on two factors: accuracy and diversity. An ensemble is
+        analogous to a team: if every member is competent and the members
+        are diverse, the team performs well. Here a team member is a base
+        learner and the team is the ensemble. In the case of regression
+        ensembles, the base learner is an ``OnlineGradientDescentRegressor``.
+
+
+    :param feature: see `Columns `_.
+
+    :param label: see `Columns `_.
+
+    :param sampling_type: Specifies how the training samples are created:
+
+        * ``BootstrapSelector``: takes a bootstrap sample of the training set
+          (sampling with replacement). This is the default method.
+        * ``RandomPartitionSelector``: randomly partitions the training set
+          into subsets.
+        * ``AllSelector``: every model is trained using the whole training set.
+
+        Each of these subset selectors has two options for selecting features:
+
+        * ``AllFeatureSelector``: selects all the features. This is the
+          default method.
+        * ``RandomFeatureSelector``: selects a random subset of the features
+          for each model.
+
+    :param num_models: indicates the number of models to train, i.e. the
+        number of subsets of the training set to sample. The default value is
+        50. If batches are used, then this indicates the number of models per
+        batch.
+
+    :param sub_model_selector_type: Determines the efficient set of models the
+        ``output_combiner`` uses, and removes the least significant models.
+        This is used to improve the accuracy and reduce the model size. This
+        is also called pruning.
+
+        * ``RegressorAllSelector``: does not perform any pruning and selects
+          all models in the ensemble to combine to create the output. This is
+          the default submodel selector.
+        * ``RegressorBestDiverseSelector``: combines models whose predictions
+          are as diverse as possible. Currently, only disagreement diversity
+          is supported.
+        * ``RegressorBestPerformanceSelector``: combines only the models with
+          the best performance according to the specified metric. The metric
+          can be ``"L1"``, ``"L2"``, ``"Rms"``, ``"Loss"``, or ``"RSquared"``.
+
+
+    :param output_combiner: indicates how to combine the predictions of the
+        different models into a single prediction. There are three available
+        output combiners for regression:
+
+        * ``RegressorAverage``: computes the average of the scores produced
+          by the trained models.
+        * ``RegressorMedian``: computes the median of the scores produced by
+          the trained models.
+        * ``RegressorStacking``: computes the output by training a model on
+          a training set where each instance is a vector containing the
+          outputs of the different models on a training instance, and the
+          instance's label.
+ + :param output_combiner: Output combiner. + + :param normalize: Specifies the type of automatic normalization used: + + * ``"Auto"``: if normalization is needed, it is performed + automatically. This is the default choice. + * ``"No"``: no normalization is performed. + * ``"Yes"``: normalization is performed. + * ``"Warn"``: if normalization is needed, a warning + message is displayed, but normalization is not performed. + + Normalization rescales disparate data ranges to a standard scale. + Feature + scaling ensures the distances between data points are proportional + and + enables various optimization methods such as gradient descent to + converge + much faster. If normalization is performed, a ``MinMax`` normalizer + is + used. It normalizes values in an interval [a, b] where ``-1 <= a <= + 0`` + and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves + sparsity by mapping zero to zero. + + :param caching: Whether trainer should cache input training data. + + :param train_parallel: All the base learners will run asynchronously if the + value is true. + + :param batch_size: train the models iteratively on subsets of the training + set of this size. When using this option, it is assumed that the + training set is randomized enough so that every batch is a random + sample of instances. The default value is -1, indicating using the + whole training set. If the value is changed to an integer greater than + 0, the number of trained models is the number of batches (the size of + the training set divided by the batch size), times ``num_models``. + + :param show_metrics: True, if metrics for each model need to be evaluated + and shown in comparison table. This is done by using validation set if + available or the training set. + + :param params: Additional arguments sent to compute engine. + + .. seealso:: + * Subset selectors: + :py:class:`AllInstanceSelector + `, + :py:class:`BootstrapSelector + `, + :py:class:`RandomPartitionSelector + ` + + * Feature selectors: + :py:class:`AllFeatureSelector + `, + :py:class:`RandomFeatureSelector + ` + + * Submodel selectors: + :py:class:`RegressorAllSelector + `, + :py:class:`RegressorBestDiverseSelector + `, + :py:class:`RegressorBestPerformanceSelector + ` + + * Output combiners: + :py:class:`RegressorAverage + `, + :py:class:`RegressorMedian + `, + :py:class:`RegressorStacking + ` + + + .. index:: models, ensemble, classification + + Example: + .. 
literalinclude:: /../nimbusml/examples/EnsembleRegressor.py + :language: python + """ + + @trace + def __init__( + self, + sampling_type=BootstrapSelector( + feature_selector=AllFeatureSelector()), + num_models=None, + sub_model_selector_type=None, + output_combiner=None, + normalize='Auto', + caching='Auto', + train_parallel=False, + batch_size=-1, + show_metrics=False, + feature=None, + label=None, + **params): + + if 'feature_column_name' in params: + raise NameError( + "'feature_column_name' must be renamed to 'feature'") + if feature: + params['feature_column_name'] = feature + if 'label_column_name' in params: + raise NameError( + "'label_column_name' must be renamed to 'label'") + if label: + params['label_column_name'] = label + BasePredictor.__init__(self, type='regressor', **params) + core.__init__( + self, + sampling_type=sampling_type, + num_models=num_models, + sub_model_selector_type=sub_model_selector_type, + output_combiner=output_combiner, + normalize=normalize, + caching=caching, + train_parallel=train_parallel, + batch_size=batch_size, + show_metrics=show_metrics, + **params) + self.feature = feature + self.label = label + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/nimbusml/ensemble/feature_selector/__init__.py b/src/python/nimbusml/ensemble/feature_selector/__init__.py new file mode 100644 index 00000000..b5a235bb --- /dev/null +++ b/src/python/nimbusml/ensemble/feature_selector/__init__.py @@ -0,0 +1,7 @@ +from .allfeatureselector import AllFeatureSelector +from .randomfeatureselector import RandomFeatureSelector + +__all__ = [ + 'AllFeatureSelector', + 'RandomFeatureSelector' +] diff --git a/src/python/nimbusml/ensemble/feature_selector/allfeatureselector.py b/src/python/nimbusml/ensemble/feature_selector/allfeatureselector.py new file mode 100644 index 00000000..239e5d44 --- /dev/null +++ b/src/python/nimbusml/ensemble/feature_selector/allfeatureselector.py @@ -0,0 +1,39 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +AllFeatureSelector +""" + +__all__ = ["AllFeatureSelector"] + + +from ...internal.core.ensemble.feature_selector.allfeatureselector import \ + AllFeatureSelector as core +from ...internal.utils.utils import trace + + +class AllFeatureSelector(core): + """ + **Description** + Selects all features for each trainer in the ensemble + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + **params): + core.__init__( + self, + **params) + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/nimbusml/ensemble/feature_selector/randomfeatureselector.py b/src/python/nimbusml/ensemble/feature_selector/randomfeatureselector.py new file mode 100644 index 00000000..26914c0b --- /dev/null +++ b/src/python/nimbusml/ensemble/feature_selector/randomfeatureselector.py @@ -0,0 +1,44 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+RandomFeatureSelector
+"""
+
+__all__ = ["RandomFeatureSelector"]
+
+
+from ...internal.core.ensemble.feature_selector.randomfeatureselector import \
+    RandomFeatureSelector as core
+from ...internal.utils.utils import trace
+
+
+class RandomFeatureSelector(core):
+    """
+    **Description**
+        Selects a random subset of features for each trainer in the ensemble.
+
+    :param features_selection_proportion: The proportion of features to be
+        selected. The range is 0.0-1.0.
+
+    :param params: Additional arguments sent to compute engine.
+
+    """
+
+    @trace
+    def __init__(
+            self,
+            features_selection_proportion=0.8,
+            **params):
+        core.__init__(
+            self,
+            features_selection_proportion=features_selection_proportion,
+            **params)
+
+    def get_params(self, deep=False):
+        """
+        Get the parameters for this operator.
+        """
+        return core.get_params(self)
diff --git a/src/python/nimbusml/ensemble/output_combiner/__init__.py b/src/python/nimbusml/ensemble/output_combiner/__init__.py
new file mode 100644
index 00000000..c71adc6d
--- /dev/null
+++ b/src/python/nimbusml/ensemble/output_combiner/__init__.py
@@ -0,0 +1,19 @@
+from .classifieraverage import ClassifierAverage
+from .classifiermedian import ClassifierMedian
+from .classifierstacking import ClassifierStacking
+from .classifiervoting import ClassifierVoting
+from .classifierweightedaverage import ClassifierWeightedAverage
+from .regressoraverage import RegressorAverage
+from .regressormedian import RegressorMedian
+from .regressorstacking import RegressorStacking
+
+__all__ = [
+    'ClassifierAverage',
+    'ClassifierMedian',
+    'ClassifierStacking',
+    'ClassifierVoting',
+    'ClassifierWeightedAverage',
+    'RegressorAverage',
+    'RegressorMedian',
+    'RegressorStacking'
+]
diff --git a/src/python/nimbusml/ensemble/output_combiner/classifieraverage.py b/src/python/nimbusml/ensemble/output_combiner/classifieraverage.py
new file mode 100644
index 00000000..36d2d428
--- /dev/null
+++ b/src/python/nimbusml/ensemble/output_combiner/classifieraverage.py
@@ -0,0 +1,44 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+ClassifierAverage
+"""
+
+__all__ = ["ClassifierAverage"]
+
+
+from ...internal.core.ensemble.output_combiner.classifieraverage import \
+    ClassifierAverage as core
+from ...internal.utils.utils import trace
+
+
+class ClassifierAverage(core):
+    """
+    **Description**
+        Computes the average of the outputs of the trained models.
+
+    :param normalize: Whether to normalize the output of base models before
+        combining them.
+
+    :param params: Additional arguments sent to compute engine.
+
+    """
+
+    @trace
+    def __init__(
+            self,
+            normalize=True,
+            **params):
+        core.__init__(
+            self,
+            normalize=normalize,
+            **params)
+
+    def get_params(self, deep=False):
+        """
+        Get the parameters for this operator.
+ """ + return core.get_params(self) diff --git a/src/python/nimbusml/ensemble/output_combiner/classifiermedian.py b/src/python/nimbusml/ensemble/output_combiner/classifiermedian.py new file mode 100644 index 00000000..a4816e77 --- /dev/null +++ b/src/python/nimbusml/ensemble/output_combiner/classifiermedian.py @@ -0,0 +1,44 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +ClassifierMedian +""" + +__all__ = ["ClassifierMedian"] + + +from ...internal.core.ensemble.output_combiner.classifiermedian import \ + ClassifierMedian as core +from ...internal.utils.utils import trace + + +class ClassifierMedian(core): + """ + **Description** + Computes the median of the outputs of the trained models + + :param normalize: Whether to normalize the output of base models before + combining them. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + normalize=True, + **params): + core.__init__( + self, + normalize=normalize, + **params) + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/nimbusml/ensemble/output_combiner/classifierstacking.py b/src/python/nimbusml/ensemble/output_combiner/classifierstacking.py new file mode 100644 index 00000000..cfa413b9 --- /dev/null +++ b/src/python/nimbusml/ensemble/output_combiner/classifierstacking.py @@ -0,0 +1,45 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +ClassifierStacking +""" + +__all__ = ["ClassifierStacking"] + + +from ...internal.core.ensemble.output_combiner.classifierstacking import \ + ClassifierStacking as core +from ...internal.utils.utils import trace + + +class ClassifierStacking(core): + """ + **Description** + Computes the output by training a model on a training set where each instance is a vector containing the outputs of the different models on a training instance, and the instance's label + + :param validation_dataset_proportion: The proportion of instances to be + selected to test the individual base learner. If it is 0, it uses + training set. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + validation_dataset_proportion=0.3, + **params): + core.__init__( + self, + validation_dataset_proportion=validation_dataset_proportion, + **params) + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/nimbusml/ensemble/output_combiner/classifiervoting.py b/src/python/nimbusml/ensemble/output_combiner/classifiervoting.py new file mode 100644 index 00000000..bf0400da --- /dev/null +++ b/src/python/nimbusml/ensemble/output_combiner/classifiervoting.py @@ -0,0 +1,39 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. 
+# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +ClassifierVoting +""" + +__all__ = ["ClassifierVoting"] + + +from ...internal.core.ensemble.output_combiner.classifiervoting import \ + ClassifierVoting as core +from ...internal.utils.utils import trace + + +class ClassifierVoting(core): + """ + **Description** + Computes the fraction of positive predictions for each class from all the trained models, and outputs the class with the largest number + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + **params): + core.__init__( + self, + **params) + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/nimbusml/ensemble/output_combiner/classifierweightedaverage.py b/src/python/nimbusml/ensemble/output_combiner/classifierweightedaverage.py new file mode 100644 index 00000000..d684d9fe --- /dev/null +++ b/src/python/nimbusml/ensemble/output_combiner/classifierweightedaverage.py @@ -0,0 +1,98 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +ClassifierWeightedAverage +""" + +__all__ = ["ClassifierWeightedAverage"] + + +from ...internal.core.ensemble.output_combiner.classifierweightedaverage import \ + ClassifierWeightedAverage as core +from ...internal.utils.utils import trace + + +class ClassifierWeightedAverage(core): + """ + + **Description** + Computes the weighted average of the outputs of the trained models + + + :param weightage_name: the metric type to be used to find the weights for + each model. Can be ``"AccuracyMicroAvg"`` or ``"AccuracyMacroAvg"``. + + :param normalize: Specifies the type of automatic normalization used: + + * ``"Auto"``: if normalization is needed, it is performed + automatically. This is the default choice. + * ``"No"``: no normalization is performed. + * ``"Yes"``: normalization is performed. + * ``"Warn"``: if normalization is needed, a warning + message is displayed, but normalization is not performed. + + Normalization rescales disparate data ranges to a standard scale. + Feature + scaling ensures the distances between data points are proportional + and + enables various optimization methods such as gradient descent to + converge + much faster. If normalization is performed, a ``MinMax`` normalizer + is + used. It normalizes values in an interval [a, b] where ``-1 <= a <= + 0`` + and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves + sparsity by mapping zero to zero. + + :param params: Additional arguments sent to compute engine. + + .. seealso:: + :py:class:`EnsembleClassifier + ` + + * Submodel selectors: + :py:class:`ClassifierAllSelector + `, + :py:class:`ClassifierBestDiverseSelector + `, + :py:class:`ClassifierBestPerformanceSelector + ` + + * Output combiners: + :py:class:`ClassifierAverage + `, + :py:class:`ClassifierMedian + `, + :py:class:`ClassifierStacking + `, + :py:class:`ClassifierVoting + ` + + + .. index:: models, ensemble, classification + + Example: + .. 
literalinclude:: /../nimbusml/examples/EnsembleClassifier.py + :language: python + """ + + @trace + def __init__( + self, + weightage_name='AccuracyMicroAvg', + normalize=True, + **params): + core.__init__( + self, + weightage_name=weightage_name, + normalize=normalize, + **params) + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/nimbusml/ensemble/output_combiner/regressoraverage.py b/src/python/nimbusml/ensemble/output_combiner/regressoraverage.py new file mode 100644 index 00000000..280841f9 --- /dev/null +++ b/src/python/nimbusml/ensemble/output_combiner/regressoraverage.py @@ -0,0 +1,39 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +RegressorAverage +""" + +__all__ = ["RegressorAverage"] + + +from ...internal.core.ensemble.output_combiner.regressoraverage import \ + RegressorAverage as core +from ...internal.utils.utils import trace + + +class RegressorAverage(core): + """ + **Description** + Computes the average of the outputs of the trained models + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + **params): + core.__init__( + self, + **params) + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/nimbusml/ensemble/output_combiner/regressormedian.py b/src/python/nimbusml/ensemble/output_combiner/regressormedian.py new file mode 100644 index 00000000..849ef661 --- /dev/null +++ b/src/python/nimbusml/ensemble/output_combiner/regressormedian.py @@ -0,0 +1,39 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +RegressorMedian +""" + +__all__ = ["RegressorMedian"] + + +from ...internal.core.ensemble.output_combiner.regressormedian import \ + RegressorMedian as core +from ...internal.utils.utils import trace + + +class RegressorMedian(core): + """ + **Description** + Computes the median of the outputs of the trained models + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + **params): + core.__init__( + self, + **params) + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/nimbusml/ensemble/output_combiner/regressorstacking.py b/src/python/nimbusml/ensemble/output_combiner/regressorstacking.py new file mode 100644 index 00000000..1ee61abe --- /dev/null +++ b/src/python/nimbusml/ensemble/output_combiner/regressorstacking.py @@ -0,0 +1,45 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +RegressorStacking +""" + +__all__ = ["RegressorStacking"] + + +from ...internal.core.ensemble.output_combiner.regressorstacking import \ + RegressorStacking as core +from ...internal.utils.utils import trace + + +class RegressorStacking(core): + """ + **Description** + Computes the output by training a model on a training set where each instance is a vector containing the outputs of the different models on a training instance, and the instance's label + + :param validation_dataset_proportion: The proportion of instances to be + selected to test the individual base learner. If it is 0, it uses + training set. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + validation_dataset_proportion=0.3, + **params): + core.__init__( + self, + validation_dataset_proportion=validation_dataset_proportion, + **params) + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/nimbusml/ensemble/sub_model_selector/__init__.py b/src/python/nimbusml/ensemble/sub_model_selector/__init__.py new file mode 100644 index 00000000..ff22d4db --- /dev/null +++ b/src/python/nimbusml/ensemble/sub_model_selector/__init__.py @@ -0,0 +1,15 @@ +from .classifierallselector import ClassifierAllSelector +from .classifierbestdiverseselector import ClassifierBestDiverseSelector +from .classifierbestperformanceselector import ClassifierBestPerformanceSelector +from .regressorallselector import RegressorAllSelector +from .regressorbestdiverseselector import RegressorBestDiverseSelector +from .regressorbestperformanceselector import RegressorBestPerformanceSelector + +__all__ = [ + 'ClassifierAllSelector', + 'ClassifierBestDiverseSelector', + 'ClassifierBestPerformanceSelector', + 'RegressorAllSelector', + 'RegressorBestDiverseSelector', + 'RegressorBestPerformanceSelector' +] \ No newline at end of file diff --git a/src/python/nimbusml/ensemble/sub_model_selector/classifierallselector.py b/src/python/nimbusml/ensemble/sub_model_selector/classifierallselector.py new file mode 100644 index 00000000..7a1ac960 --- /dev/null +++ b/src/python/nimbusml/ensemble/sub_model_selector/classifierallselector.py @@ -0,0 +1,39 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +ClassifierAllSelector +""" + +__all__ = ["ClassifierAllSelector"] + + +from ...internal.core.ensemble.sub_model_selector.classifierallselector import \ + ClassifierAllSelector as core +from ...internal.utils.utils import trace + + +class ClassifierAllSelector(core): + """ + **Description** + Combines all the models to create the output. This is the default submodel selector. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + **params): + core.__init__( + self, + **params) + + def get_params(self, deep=False): + """ + Get the parameters for this operator. 
+ """ + return core.get_params(self) diff --git a/src/python/nimbusml/ensemble/sub_model_selector/classifierbestdiverseselector.py b/src/python/nimbusml/ensemble/sub_model_selector/classifierbestdiverseselector.py new file mode 100644 index 00000000..6d54037d --- /dev/null +++ b/src/python/nimbusml/ensemble/sub_model_selector/classifierbestdiverseselector.py @@ -0,0 +1,56 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +ClassifierBestDiverseSelector +""" + +__all__ = ["ClassifierBestDiverseSelector"] + + +from ...internal.core.ensemble.sub_model_selector.classifierbestdiverseselector import \ + ClassifierBestDiverseSelector as core +from ...internal.utils.utils import trace +from .diversity_measure import ClassifierDisagreement + + +class ClassifierBestDiverseSelector(core): + """ + **Description** + Combines the models whose predictions are as diverse as possible. + + :param diversity_metric_type: The metric type to be used to find the + diversity among base learners. + + :param learners_selection_proportion: The proportion of best base learners + to be selected. The range is 0.0-1.0. + + :param validation_dataset_proportion: The proportion of instances to be + selected to test the individual base learner. If it is 0, it uses + training set. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + diversity_metric_type=ClassifierDisagreement(), + learners_selection_proportion=0.5, + validation_dataset_proportion=0.3, + **params): + core.__init__( + self, + diversity_metric_type=diversity_metric_type, + learners_selection_proportion=learners_selection_proportion, + validation_dataset_proportion=validation_dataset_proportion, + **params) + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/nimbusml/ensemble/sub_model_selector/classifierbestperformanceselector.py b/src/python/nimbusml/ensemble/sub_model_selector/classifierbestperformanceselector.py new file mode 100644 index 00000000..09df140e --- /dev/null +++ b/src/python/nimbusml/ensemble/sub_model_selector/classifierbestperformanceselector.py @@ -0,0 +1,84 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +ClassifierBestPerformanceSelector +""" + +__all__ = ["ClassifierBestPerformanceSelector"] + + +from ...internal.core.ensemble.sub_model_selector.classifierbestperformanceselector import \ + ClassifierBestPerformanceSelector as core +from ...internal.utils.utils import trace + + +class ClassifierBestPerformanceSelector(core): + """ + + **Description** + Combines only the models with the best performance. + + + :param metric_name: the metric type to be used to find the weights for + each model. Can be ``"AccuracyMicro"``, ``"AccuracyMacro"``, + ``"LogLoss"``, or ``"LogLossReduction"``. 
+ + :param learners_selection_proportion: The proportion of best base learners + to be selected. The range is 0.0-1.0. + + :param validation_dataset_proportion: The proportion of instances to be + selected to test the individual base learner. If it is 0, it uses + training set. + + :param params: Additional arguments sent to compute engine. + + .. seealso:: + :py:class:`EnsembleClassifier + ` + + * Submodel selectors: + :py:class:`ClassifierAllSelector + `, + :py:class:`ClassifierBestDiverseSelector + ` + + * Output combiners: + :py:class:`ClassifierAverage + `, + :py:class:`ClassifierMedian + `, + :py:class:`ClassifierStacking + `, + :py:class:`ClassifierVoting + ` + + + .. index:: models, ensemble, classification + + Example: + .. literalinclude:: /../nimbusml/examples/EnsembleClassifier.py + :language: python + """ + + @trace + def __init__( + self, + metric_name='AccuracyMicro', + learners_selection_proportion=0.5, + validation_dataset_proportion=0.3, + **params): + core.__init__( + self, + metric_name=metric_name, + learners_selection_proportion=learners_selection_proportion, + validation_dataset_proportion=validation_dataset_proportion, + **params) + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/nimbusml/ensemble/sub_model_selector/diversity_measure/__init__.py b/src/python/nimbusml/ensemble/sub_model_selector/diversity_measure/__init__.py new file mode 100644 index 00000000..84942268 --- /dev/null +++ b/src/python/nimbusml/ensemble/sub_model_selector/diversity_measure/__init__.py @@ -0,0 +1,7 @@ +from .classifierdisagreement import ClassifierDisagreement +from .regressordisagreement import RegressorDisagreement + +__all__ = [ + 'ClassifierDisagreement', + 'RegressorDisagreement' +] diff --git a/src/python/nimbusml/ensemble/sub_model_selector/diversity_measure/classifierdisagreement.py b/src/python/nimbusml/ensemble/sub_model_selector/diversity_measure/classifierdisagreement.py new file mode 100644 index 00000000..144d8991 --- /dev/null +++ b/src/python/nimbusml/ensemble/sub_model_selector/diversity_measure/classifierdisagreement.py @@ -0,0 +1,39 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +ClassifierDisagreement +""" + +__all__ = ["ClassifierDisagreement"] + + +from ....internal.core.ensemble.sub_model_selector.diversity_measure.classifierdisagreement import \ + ClassifierDisagreement as core +from ....internal.utils.utils import trace + + +class ClassifierDisagreement(core): + """ + **Description** + A measure of disagreement in predictions between a pair of classifiers, averaged over all pairs + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + **params): + core.__init__( + self, + **params) + + def get_params(self, deep=False): + """ + Get the parameters for this operator. 
+ """ + return core.get_params(self) diff --git a/src/python/nimbusml/ensemble/sub_model_selector/diversity_measure/regressordisagreement.py b/src/python/nimbusml/ensemble/sub_model_selector/diversity_measure/regressordisagreement.py new file mode 100644 index 00000000..8f3b617c --- /dev/null +++ b/src/python/nimbusml/ensemble/sub_model_selector/diversity_measure/regressordisagreement.py @@ -0,0 +1,39 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +RegressorDisagreement +""" + +__all__ = ["RegressorDisagreement"] + + +from ....internal.core.ensemble.sub_model_selector.diversity_measure.regressordisagreement import \ + RegressorDisagreement as core +from ....internal.utils.utils import trace + + +class RegressorDisagreement(core): + """ + **Description** + A measure of absolute value of disagreement in predictions between a pair of regressors, averaged over all pairs + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + **params): + core.__init__( + self, + **params) + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/nimbusml/ensemble/sub_model_selector/regressorallselector.py b/src/python/nimbusml/ensemble/sub_model_selector/regressorallselector.py new file mode 100644 index 00000000..7ac448df --- /dev/null +++ b/src/python/nimbusml/ensemble/sub_model_selector/regressorallselector.py @@ -0,0 +1,39 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +RegressorAllSelector +""" + +__all__ = ["RegressorAllSelector"] + + +from ...internal.core.ensemble.sub_model_selector.regressorallselector import \ + RegressorAllSelector as core +from ...internal.utils.utils import trace + + +class RegressorAllSelector(core): + """ + **Description** + Combines all the models to create the output. This is the default submodel selector. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + **params): + core.__init__( + self, + **params) + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/nimbusml/ensemble/sub_model_selector/regressorbestdiverseselector.py b/src/python/nimbusml/ensemble/sub_model_selector/regressorbestdiverseselector.py new file mode 100644 index 00000000..a89bfb68 --- /dev/null +++ b/src/python/nimbusml/ensemble/sub_model_selector/regressorbestdiverseselector.py @@ -0,0 +1,56 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+RegressorBestDiverseSelector
+"""
+
+__all__ = ["RegressorBestDiverseSelector"]
+
+
+from ...internal.core.ensemble.sub_model_selector.regressorbestdiverseselector import \
+    RegressorBestDiverseSelector as core
+from ...internal.utils.utils import trace
+from .diversity_measure import RegressorDisagreement
+
+
+class RegressorBestDiverseSelector(core):
+    """
+    **Description**
+    Combines the models whose predictions are as diverse as possible.
+
+    :param diversity_metric_type: The metric type to be used to find the
+        diversity among base learners.
+
+    :param learners_selection_proportion: The proportion of best base learners
+        to be selected. The range is 0.0-1.0.
+
+    :param validation_dataset_proportion: The proportion of instances to be
+        selected to test the individual base learner. If it is 0, it uses the
+        training set.
+
+    :param params: Additional arguments sent to compute engine.
+
+    """
+
+    @trace
+    def __init__(
+            self,
+            diversity_metric_type=RegressorDisagreement(),
+            learners_selection_proportion=0.5,
+            validation_dataset_proportion=0.3,
+            **params):
+        core.__init__(
+            self,
+            diversity_metric_type=diversity_metric_type,
+            learners_selection_proportion=learners_selection_proportion,
+            validation_dataset_proportion=validation_dataset_proportion,
+            **params)
+
+    def get_params(self, deep=False):
+        """
+        Get the parameters for this operator.
+        """
+        return core.get_params(self)
diff --git a/src/python/nimbusml/ensemble/sub_model_selector/regressorbestperformanceselector.py b/src/python/nimbusml/ensemble/sub_model_selector/regressorbestperformanceselector.py
new file mode 100644
index 00000000..52505e02
--- /dev/null
+++ b/src/python/nimbusml/ensemble/sub_model_selector/regressorbestperformanceselector.py
@@ -0,0 +1,82 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+RegressorBestPerformanceSelector
+"""
+
+__all__ = ["RegressorBestPerformanceSelector"]
+
+
+from ...internal.core.ensemble.sub_model_selector.regressorbestperformanceselector import \
+    RegressorBestPerformanceSelector as core
+from ...internal.utils.utils import trace
+
+
+class RegressorBestPerformanceSelector(core):
+    """
+
+    **Description**
+    Combines only the models with the best performance.
+
+
+    :param metric_name: the metric type to be used to evaluate the performance
+        of each model. Can be ``"L1"``, ``"L2"``, ``"Rms"``, ``"Loss"``, or
+        ``"RSquared"``.
+
+    :param learners_selection_proportion: The proportion of best base learners
+        to be selected. The range is 0.0-1.0.
+
+    :param validation_dataset_proportion: The proportion of instances to be
+        selected to test the individual base learner. If it is 0, it uses the
+        training set.
+
+    :param params: Additional arguments sent to compute engine.
+
+    .. seealso::
+        :py:class:`EnsembleRegressor
+        <nimbusml.ensemble.EnsembleRegressor>`
+
+        * Submodel selectors:
+        :py:class:`RegressorAllSelector
+        <nimbusml.ensemble.sub_model_selector.RegressorAllSelector>`,
+        :py:class:`RegressorBestDiverseSelector
+        <nimbusml.ensemble.sub_model_selector.RegressorBestDiverseSelector>`
+
+        * Output combiners:
+        :py:class:`RegressorAverage
+        <nimbusml.ensemble.output_combiner.RegressorAverage>`,
+        :py:class:`RegressorMedian
+        <nimbusml.ensemble.output_combiner.RegressorMedian>`,
+        :py:class:`RegressorStacking
+        <nimbusml.ensemble.output_combiner.RegressorStacking>`
+
+
+    .. index:: models, ensemble, regression
+
+    Example:
+       .. literalinclude:: /../nimbusml/examples/EnsembleRegressor.py
+              :language: python
+    """
+
+    @trace
+    def __init__(
+            self,
+            metric_name='L1',
+            learners_selection_proportion=0.5,
+            validation_dataset_proportion=0.3,
+            **params):
+        core.__init__(
+            self,
+            metric_name=metric_name,
+            learners_selection_proportion=learners_selection_proportion,
+            validation_dataset_proportion=validation_dataset_proportion,
+            **params)
+
+    def get_params(self, deep=False):
+        """
+        Get the parameters for this operator.
+        """
+        return core.get_params(self)
diff --git a/src/python/nimbusml/ensemble/subset_selector/__init__.py b/src/python/nimbusml/ensemble/subset_selector/__init__.py
new file mode 100644
index 00000000..2005fde3
--- /dev/null
+++ b/src/python/nimbusml/ensemble/subset_selector/__init__.py
@@ -0,0 +1,9 @@
+from .allinstanceselector import AllInstanceSelector
+from .bootstrapselector import BootstrapSelector
+from .randompartitionselector import RandomPartitionSelector
+
+__all__ = [
+    'AllInstanceSelector',
+    'BootstrapSelector',
+    'RandomPartitionSelector'
+]
diff --git a/src/python/nimbusml/ensemble/subset_selector/allinstanceselector.py b/src/python/nimbusml/ensemble/subset_selector/allinstanceselector.py
new file mode 100644
index 00000000..1b161c35
--- /dev/null
+++ b/src/python/nimbusml/ensemble/subset_selector/allinstanceselector.py
@@ -0,0 +1,43 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+AllInstanceSelector
+"""
+
+__all__ = ["AllInstanceSelector"]
+
+
+from ...internal.core.ensemble.subset_selector.allinstanceselector import \
+    AllInstanceSelector as core
+from ...internal.utils.utils import trace
+
+
+class AllInstanceSelector(core):
+    """
+    **Description**
+    Selects all rows for each trainer in the ensemble
+
+    :param feature_selector: The feature selector.
+
+    :param params: Additional arguments sent to compute engine.
+
+    """
+
+    @trace
+    def __init__(
+            self,
+            feature_selector=None,
+            **params):
+        core.__init__(
+            self,
+            feature_selector=feature_selector,
+            **params)
+
+    def get_params(self, deep=False):
+        """
+        Get the parameters for this operator.
+        """
+        return core.get_params(self)
diff --git a/src/python/nimbusml/ensemble/subset_selector/bootstrapselector.py b/src/python/nimbusml/ensemble/subset_selector/bootstrapselector.py
new file mode 100644
index 00000000..54693df7
--- /dev/null
+++ b/src/python/nimbusml/ensemble/subset_selector/bootstrapselector.py
@@ -0,0 +1,43 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +BootstrapSelector +""" + +__all__ = ["BootstrapSelector"] + + +from ...internal.core.ensemble.subset_selector.bootstrapselector import \ + BootstrapSelector as core +from ...internal.utils.utils import trace + + +class BootstrapSelector(core): + """ + **Description** + Selects a bootstrapped sample of the rows for each trainer in the ensemble + + :param feature_selector: The Feature selector. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + feature_selector=None, + **params): + core.__init__( + self, + feature_selector=feature_selector, + **params) + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/nimbusml/ensemble/subset_selector/randompartitionselector.py b/src/python/nimbusml/ensemble/subset_selector/randompartitionselector.py new file mode 100644 index 00000000..52199bf1 --- /dev/null +++ b/src/python/nimbusml/ensemble/subset_selector/randompartitionselector.py @@ -0,0 +1,43 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +RandomPartitionSelector +""" + +__all__ = ["RandomPartitionSelector"] + + +from ...internal.core.ensemble.subset_selector.randompartitionselector import \ + RandomPartitionSelector as core +from ...internal.utils.utils import trace + + +class RandomPartitionSelector(core): + """ + **Description** + Randomly partitions the rows for each trainer in the ensemble + + :param feature_selector: The Feature selector. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + feature_selector=None, + **params): + core.__init__( + self, + feature_selector=feature_selector, + **params) + + def get_params(self, deep=False): + """ + Get the parameters for this operator. + """ + return core.get_params(self) diff --git a/src/python/nimbusml/examples/EnsembleClassifier.py b/src/python/nimbusml/examples/EnsembleClassifier.py new file mode 100644 index 00000000..1ebb0e2a --- /dev/null +++ b/src/python/nimbusml/examples/EnsembleClassifier.py @@ -0,0 +1,85 @@ +############################################################################### +# EnsembleClassifier +from nimbusml import Pipeline, FileDataStream +from nimbusml.datasets import get_dataset +from nimbusml.feature_extraction.categorical import OneHotVectorizer +from nimbusml.ensemble import EnsembleClassifier +from nimbusml.ensemble.feature_selector import RandomFeatureSelector +from nimbusml.ensemble.output_combiner import ClassifierVoting +from nimbusml.ensemble.subset_selector import RandomPartitionSelector +from nimbusml.ensemble.sub_model_selector import ClassifierBestDiverseSelector + +# data input (as a FileDataStream) +path = get_dataset('infert').as_filepath() +data = FileDataStream.read_csv(path) +print(data.head()) +# age case education induced parity ... row_num spontaneous ... +# 0 26 1 0-5yrs 1 6 ... 1 2 ... +# 1 42 1 0-5yrs 1 1 ... 2 0 ... +# 2 39 1 0-5yrs 2 6 ... 3 0 ... +# 3 34 1 0-5yrs 2 4 ... 4 0 ... +# 4 35 1 6-11yrs 1 3 ... 5 1 ... 
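+
+# As a minimal illustrative sketch (the argument values below are assumptions
+# chosen for demonstration, not tuned settings), the metric-based sub-model
+# selector can be constructed directly; it is not used by the pipelines below.
+from nimbusml.ensemble.sub_model_selector import \
+    ClassifierBestPerformanceSelector
+
+best_performance_selector = ClassifierBestPerformanceSelector(
+    metric_name='AccuracyMicro',        # or 'AccuracyMacro', 'LogLoss',
+                                        # 'LogLossReduction'
+    learners_selection_proportion=0.5,  # keep the best half of the learners
+    validation_dataset_proportion=0.3)  # held-out share used for scoring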
+
+
+# define the training pipeline using default sampling and ensembling parameters
+pipeline_with_defaults = Pipeline([
+    OneHotVectorizer(columns={'edu': 'education'}),
+    EnsembleClassifier(feature=['age', 'edu', 'parity'],
+                       label='induced',
+                       num_models=3)
+])
+
+# train, predict, and evaluate
+metrics, predictions = pipeline_with_defaults.fit(data).test(data, output_scores=True)
+
+# print predictions
+print(predictions.head())
+#    PredictedLabel   Score.0   Score.1   Score.2
+# 0               2  0.202721  0.186598  0.628115
+# 1               0  0.716737  0.190289  0.092974
+# 2               2  0.201026  0.185602  0.624761
+# 3               0  0.423328  0.235074  0.365649
+# 4               0  0.577509  0.220827  0.201664
+
+# print evaluation metrics
+print(metrics)
+#    Accuracy(micro-avg)  Accuracy(macro-avg)  Log-loss ... (class 0) ...
+# 0             0.612903             0.417519  0.846467 ...  0.504007 ...
+#    (class 1)  (class 2)
+#     1.244033   1.439364
+
+
+# define the training pipeline with specific sampling and ensembling options
+pipeline_with_options = Pipeline([
+    OneHotVectorizer(columns={'edu': 'education'}),
+    EnsembleClassifier(feature=['age', 'edu', 'parity'],
+                       label='induced',
+                       num_models=3,
+                       sampling_type=RandomPartitionSelector(
+                           feature_selector=RandomFeatureSelector(
+                               features_selection_proportion=0.7)),
+                       sub_model_selector_type=ClassifierBestDiverseSelector(),
+                       output_combiner=ClassifierVoting())
+])
+
+# train, predict, and evaluate
+metrics, predictions = pipeline_with_options.fit(data).test(data, output_scores=True)
+
+# print predictions
+print(predictions.head())
+#    PredictedLabel  Score.0  Score.1  Score.2
+# 0               2      0.0      0.0      1.0
+# 1               0      1.0      0.0      0.0
+# 2               2      0.0      0.0      1.0
+# 3               0      1.0      0.0      0.0
+# 4               0      1.0      0.0      0.0
+
+# print evaluation metrics
+# note that accuracy metrics are lower than with defaults as this is a small
+# dataset that we partition into 3 chunks for each classifier, which decreases
+# model quality.
+print(metrics)
+#    Accuracy(micro-avg)  Accuracy(macro-avg)   Log-loss ... (class 0) ...
+# 0             0.596774              0.38352  13.926926 ...   0.48306 ...
+#    (class 1)  (class 2)
+#     33.52293  29.871374
\ No newline at end of file
diff --git a/src/python/nimbusml/examples/EnsembleRegressor.py b/src/python/nimbusml/examples/EnsembleRegressor.py
new file mode 100644
index 00000000..e5f73c97
--- /dev/null
+++ b/src/python/nimbusml/examples/EnsembleRegressor.py
@@ -0,0 +1,78 @@
+###############################################################################
+# EnsembleRegressor
+from nimbusml import Pipeline, FileDataStream
+from nimbusml.datasets import get_dataset
+from nimbusml.feature_extraction.categorical import OneHotVectorizer
+from nimbusml.ensemble import EnsembleRegressor
+from nimbusml.ensemble.feature_selector import RandomFeatureSelector
+from nimbusml.ensemble.output_combiner import RegressorMedian
+from nimbusml.ensemble.subset_selector import RandomPartitionSelector
+from nimbusml.ensemble.sub_model_selector import RegressorBestDiverseSelector
+
+# data input (as a FileDataStream)
+path = get_dataset('infert').as_filepath()
+data = FileDataStream.read_csv(path)
+print(data.head())
+#    age  case education  induced  parity ... row_num  spontaneous ...
+# 0   26     1    0-5yrs        1       6 ...       1            2 ...
+# 1   42     1    0-5yrs        1       1 ...       2            0 ...
+# 2   39     1    0-5yrs        2       6 ...       3            0 ...
+# 3   34     1    0-5yrs        2       4 ...       4            0 ...
+# 4   35     1   6-11yrs        1       3 ...       5            1 ...
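+
+# As a minimal illustrative sketch (argument values are assumptions chosen for
+# demonstration), the metric-based sub-model selector for regression can be
+# constructed directly; it is not used by the pipelines below.
+from nimbusml.ensemble.sub_model_selector import \
+    RegressorBestPerformanceSelector
+
+best_performance_selector = RegressorBestPerformanceSelector(
+    metric_name='L1',                   # or 'L2', 'Rms', 'Loss', 'RSquared'
+    learners_selection_proportion=0.5,  # keep the best half of the learners
+    validation_dataset_proportion=0.3)  # held-out share used for scoring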
+
+# define the training pipeline using default sampling and ensembling parameters
+pipeline_with_defaults = Pipeline([
+    OneHotVectorizer(columns={'edu': 'education'}),
+    EnsembleRegressor(feature=['induced', 'edu'], label='age', num_models=3)
+])
+
+# train, predict, and evaluate
+metrics, predictions = pipeline_with_defaults.fit(data).test(data, output_scores=True)
+
+# print predictions
+print(predictions.head())
+#        Score
+# 0  26.046741
+# 1  26.046741
+# 2  29.225840
+# 3  29.225840
+# 4  33.849384
+
+# print evaluation metrics
+print(metrics)
+#    L1(avg)    L2(avg)  RMS(avg)  Loss-fn(avg)  R Squared
+# 0  4.69884  33.346123   5.77461     33.346124  -0.214011
+
+
+# define the training pipeline with specific sampling and ensembling options
+pipeline_with_options = Pipeline([
+    OneHotVectorizer(columns={'edu': 'education'}),
+    EnsembleRegressor(feature=['induced', 'edu'],
+                      label='age',
+                      num_models=3,
+                      sampling_type=RandomPartitionSelector(
+                          feature_selector=RandomFeatureSelector(
+                              features_selection_proportion=0.7)),
+                      sub_model_selector_type=RegressorBestDiverseSelector(),
+                      output_combiner=RegressorMedian())
+])
+
+# train, predict, and evaluate
+metrics, predictions = pipeline_with_options.fit(data).test(data, output_scores=True)
+
+# print predictions
+print(predictions.head())
+#        Score
+# 0  37.122200
+# 1  37.122200
+# 2  41.296204
+# 3  41.296204
+# 4  33.591423
+
+# print evaluation metrics
+# note that the converged loss function values are worse than with defaults as
+# this is a small dataset that we partition into 3 chunks for each regressor,
+# which decreases model quality
+print(metrics)
+#     L1(avg)    L2(avg)  RMS(avg)  Loss-fn(avg)  R Squared
+# 0  5.481676  44.924838  6.702599     44.924838   -0.63555
diff --git a/src/python/nimbusml/examples/examples_from_dataframe/EnsembleClassifier_iris_df.py b/src/python/nimbusml/examples/examples_from_dataframe/EnsembleClassifier_iris_df.py
new file mode 100644
index 00000000..4f1e2108
--- /dev/null
+++ b/src/python/nimbusml/examples/examples_from_dataframe/EnsembleClassifier_iris_df.py
@@ -0,0 +1,53 @@
+###############################################################################
+# EnsembleClassifier
+import numpy as np
+import pandas as pd
+from nimbusml.datasets import get_dataset
+from nimbusml.ensemble import EnsembleClassifier
+from nimbusml.ensemble.feature_selector import RandomFeatureSelector
+from nimbusml.ensemble.output_combiner import ClassifierVoting
+from nimbusml.ensemble.subset_selector import RandomPartitionSelector
+from nimbusml.ensemble.sub_model_selector import ClassifierBestDiverseSelector
+from sklearn.model_selection import train_test_split
+
+# use 'iris' data set to create test and train data
+#    Sepal_Length  Sepal_Width  Petal_Length  Petal_Width Label Species  Setosa
+# 0           5.1          3.5           1.4          0.2     0  setosa     1.0
+# 1           4.9          3.0           1.4          0.2     0  setosa     1.0
+np.random.seed(0)
+
+df = get_dataset("iris").as_df()
+df.drop(['Species'], inplace=True, axis=1)
+
+X_train, X_test, y_train, y_test = \
+    train_test_split(df.loc[:, df.columns != 'Label'], df['Label'])
+
+# train a model with default sampling and ensembling parameters and score
+ensemble_with_defaults = EnsembleClassifier(num_models=3).fit(X_train, y_train)
+
+scores = ensemble_with_defaults.predict(X_test)
+scores = pd.to_numeric(scores)
+
+# evaluate the model
+print('Accuracy:', np.mean(y_test == [i for i in scores]))
+# Accuracy: 0.9473684210526315
+
+
+# train a model with specific sampling and ensembling options and score
+ensemble_with_options = EnsembleClassifier(
+    num_models=3,
+    sampling_type=RandomPartitionSelector(
+        feature_selector=RandomFeatureSelector(
+            features_selection_proportion=0.7)),
+    sub_model_selector_type=ClassifierBestDiverseSelector(),
+    output_combiner=ClassifierVoting()).fit(X_train, y_train)
+
+scores = ensemble_with_options.predict(X_test)
+scores = pd.to_numeric(scores)
+
+# evaluate the model
+# note that accuracy is lower than with defaults as this is a small dataset
+# that we partition into 3 chunks for each classifier, which decreases model
+# quality.
+print('Accuracy:', np.mean(y_test == [i for i in scores]))
+# Accuracy: 0.5789473684210527
diff --git a/src/python/nimbusml/examples/examples_from_dataframe/EnsembleRegressor_airquality_df.py b/src/python/nimbusml/examples/examples_from_dataframe/EnsembleRegressor_airquality_df.py
new file mode 100644
index 00000000..b80df686
--- /dev/null
+++ b/src/python/nimbusml/examples/examples_from_dataframe/EnsembleRegressor_airquality_df.py
@@ -0,0 +1,49 @@
+###############################################################################
+# EnsembleRegressor
+import numpy as np
+from nimbusml.datasets import get_dataset
+from nimbusml.ensemble import EnsembleRegressor
+from nimbusml.ensemble.feature_selector import RandomFeatureSelector
+from nimbusml.ensemble.output_combiner import RegressorMedian
+from nimbusml.ensemble.subset_selector import RandomPartitionSelector
+from nimbusml.ensemble.sub_model_selector import RegressorBestDiverseSelector
+from sklearn.metrics import r2_score
+from sklearn.model_selection import train_test_split
+
+# use the built-in data set 'airquality' to create test and train data
+#    Unnamed: 0  Ozone  Solar_R  Wind  Temp  Month  Day
+# 0           1   41.0    190.0   7.4    67      5    1
+# 1           2   36.0    118.0   8.0    72      5    2
+np.random.seed(0)
+
+df = get_dataset("airquality").as_df().fillna(0)
+df = df[df.Ozone.notnull()]
+
+X_train, X_test, y_train, y_test = train_test_split(
+    df.loc[:, df.columns != 'Ozone'], df['Ozone'])
+
+# train a model with default sampling and ensembling parameters and score
+ensemble_with_defaults = EnsembleRegressor(num_models=3).fit(X_train, y_train)
+scores = ensemble_with_defaults.predict(X_test)
+
+# evaluate the model
+print('R-squared fit:', r2_score(y_test, scores))
+# R-squared fit: 0.12144964995862884
+
+
+# train a model with specific sampling and ensembling options and score
+ensemble_with_options = EnsembleRegressor(
+    num_models=3,
+    sampling_type=RandomPartitionSelector(
+        feature_selector=RandomFeatureSelector(
+            features_selection_proportion=0.7)),
+    sub_model_selector_type=RegressorBestDiverseSelector(),
+    output_combiner=RegressorMedian()).fit(X_train, y_train)
+scores = ensemble_with_options.predict(X_test)
+
+# evaluate the model
+# note that this is a worse fit than with defaults as this is a small dataset
+# that we partition into 3 chunks for each regressor, which decreases model
+# quality
+print('R-squared fit:', r2_score(y_test, scores))
+# R-squared fit: 0.027908675807698735
\ No newline at end of file
diff --git a/src/python/nimbusml/internal/core/ensemble/ensembleclassifier.py b/src/python/nimbusml/internal/core/ensemble/ensembleclassifier.py
new file mode 100644
index 00000000..13850fc8
--- /dev/null
+++ b/src/python/nimbusml/internal/core/ensemble/ensembleclassifier.py
@@ -0,0 +1,238 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+EnsembleClassifier
+"""
+
+__all__ = ["EnsembleClassifier"]
+
+
+from ...entrypoints._ensemblemulticlassoutputcombiner_multimedian import \
+    multi_median
+from ...entrypoints._ensemblemulticlasssubmodelselector_allselectormulticlass import \
+    all_selector_multi_class
+from ...entrypoints._ensemblesubsetselector_bootstrapselector import \
+    bootstrap_selector
+from ...entrypoints.trainers_ensembleclassification import \
+    trainers_ensembleclassification
+from ...utils.utils import trace
+from ..base_pipeline_item import BasePipelineItem, DefaultSignatureWithRoles
+
+
+class EnsembleClassifier(
+        BasePipelineItem,
+        DefaultSignatureWithRoles):
+    """
+
+    **Description**
+    Train a multiclass ensemble model
+
+    .. remarks::
+        An Ensemble is a set of models, each trained on a sample of the
+        training set. Training an ensemble instead of a single model can boost
+        the accuracy of a given algorithm.
+
+        The quality of an Ensemble depends on two factors: Accuracy and
+        Diversity. An Ensemble is analogous to teamwork: if every team member
+        is diverse and competent, then the team can perform very well. Here a
+        team member is a base learner and the team is the Ensemble. In the
+        case of classification ensembles, the base learner is a
+        ``LogisticRegressionClassifier``.
+
+
+    :param sampling_type: Specifies how the training samples are created:
+
+        * ``BootstrapSelector``: takes a bootstrap sample of the training set
+          (sampling with replacement). This is the default method.
+        * ``RandomPartitionSelector``: randomly partitions the training set
+          into subsets.
+        * ``AllSelector``: every model is trained using the whole training set.
+
+        Each of these Subset Selectors has two options for selecting features:
+
+        * ``AllFeatureSelector``: selects all the features. This is the default
+          method.
+        * ``RandomFeatureSelector``: selects a random subset of the features
+          for each model.
+
+    :param num_models: indicates the number of models to train, i.e. the number
+        of subsets of the training set to sample. The default value is 50. If
+        batches are used then this indicates the number of models per batch.
+
+    :param sub_model_selector_type: Determines the efficient set of models the
+        ``output_combiner`` uses, and removes the least significant models. This
+        is used to improve the accuracy and reduce the model size. This is also
+        called pruning.
+
+        * ``ClassifierAllSelector``: does not perform any pruning and selects
+          all models in the ensemble to combine to create the output. This is
+          the default submodel selector.
+        * ``ClassifierBestDiverseSelector``: combines models whose predictions
+          are as diverse as possible. Currently, only disagreement diversity is
+          supported.
+        * ``ClassifierBestPerformanceSelector``: combines only the models with
+          the best performance according to some metric. The metric can be
+          ``"AccuracyMicro"``, ``"AccuracyMacro"``, ``"LogLoss"``,
+          or ``"LogLossReduction"``.
+
+
+    :output_combiner: indicates how to combine the predictions of the different
+        models into a single prediction. There are five available output
+        combiners for classification:
+
+        * ``ClassifierAverage``: computes the average of the scores produced by
+          the trained models.
+        * ``ClassifierMedian``: computes the median of the scores produced by
+          the trained models.
+ * ``ClassifierStacking``: computes the output by training a model on a + training set where each instance is a vector containing the outputs + of the different models on a training instance, and the instance's + label. + * ``ClassifierVoting``: computes the fraction of positive predictions + for each class from all the trained models, and outputs the class + with the largest number. + * ``ClassifierWeightedAverage``: computes the weighted average of the + outputs of the trained models, weighted by the specified metric. The + metric can be ``"AccuracyMicroAvg"`` or ``"AccuracyMacroAvg"``. + + :param output_combiner: Output combiner. + + :param normalize: Specifies the type of automatic normalization used: + + * ``"Auto"``: if normalization is needed, it is performed + automatically. This is the default choice. + * ``"No"``: no normalization is performed. + * ``"Yes"``: normalization is performed. + * ``"Warn"``: if normalization is needed, a warning + message is displayed, but normalization is not performed. + + Normalization rescales disparate data ranges to a standard scale. + Feature + scaling ensures the distances between data points are proportional + and + enables various optimization methods such as gradient descent to + converge + much faster. If normalization is performed, a ``MinMax`` normalizer + is + used. It normalizes values in an interval [a, b] where ``-1 <= a <= + 0`` + and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves + sparsity by mapping zero to zero. + + :param caching: Whether trainer should cache input training data. + + :param train_parallel: All the base learners will run asynchronously if the + value is true. + + :param batch_size: train the models iteratively on subsets of the training + set of this size. When using this option, it is assumed that the + training set is randomized enough so that every batch is a random + sample of instances. The default value is -1, indicating using the + whole training set. If the value is changed to an integer greater than + 0, the number of trained models is the number of batches (the size of + the training set divided by the batch size), times ``num_models``. + + :param show_metrics: True, if metrics for each model need to be evaluated + and shown in comparison table. This is done by using validation set if + available or the training set. + + :param params: Additional arguments sent to compute engine. + + .. seealso:: + * Subset selectors: + :py:class:`AllInstanceSelector + `, + :py:class:`BootstrapSelector + `, + :py:class:`RandomPartitionSelector + ` + + * Feature selectors: + :py:class:`AllFeatureSelector + `, + :py:class:`RandomFeatureSelector + ` + + * Submodel selectors: + :py:class:`ClassifierAllSelector + `, + :py:class:`ClassifierBestDiverseSelector + `, + :py:class:`ClassifierBestPerformanceSelector + ` + + * Output combiners: + :py:class:`ClassifierAverage + `, + :py:class:`ClassifierMedian + `, + :py:class:`ClassifierStacking + `, + :py:class:`ClassifierVoting + `, + :py:class:`ClassifierWeightedAverage + ` + + + .. index:: models, ensemble, classification + + Example: + .. 
literalinclude:: /../nimbusml/examples/EnsembleClassifier.py + :language: python + """ + + @trace + def __init__( + self, + sampling_type=bootstrap_selector( + feature_selector={ + 'Name': 'AllFeatureSelector'}), + num_models=None, + sub_model_selector_type=None, + output_combiner=None, + normalize='Auto', + caching='Auto', + train_parallel=False, + batch_size=-1, + show_metrics=False, + **params): + BasePipelineItem.__init__( + self, type='classifier', **params) + + self.sampling_type = sampling_type + self.num_models = num_models + self.sub_model_selector_type = sub_model_selector_type + self.output_combiner = output_combiner + self.normalize = normalize + self.caching = caching + self.train_parallel = train_parallel + self.batch_size = batch_size + self.show_metrics = show_metrics + + @property + def _entrypoint(self): + return trainers_ensembleclassification + + @trace + def _get_node(self, **all_args): + algo_args = dict( + feature_column_name=self._getattr_role( + 'feature_column_name', + all_args), + label_column_name=self._getattr_role( + 'label_column_name', + all_args), + sampling_type=self.sampling_type, + num_models=self.num_models, + sub_model_selector_type=self.sub_model_selector_type, + output_combiner=self.output_combiner, + normalize_features=self.normalize, + caching=self.caching, + train_parallel=self.train_parallel, + batch_size=self.batch_size, + show_metrics=self.show_metrics) + + all_args.update(algo_args) + return self._entrypoint(**all_args) diff --git a/src/python/nimbusml/internal/core/ensemble/ensembleregressor.py b/src/python/nimbusml/internal/core/ensemble/ensembleregressor.py new file mode 100644 index 00000000..408b607d --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/ensembleregressor.py @@ -0,0 +1,226 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +EnsembleRegressor +""" + +__all__ = ["EnsembleRegressor"] + + +from ...entrypoints._ensembleregressionsubmodelselector_allselector import \ + all_selector +from ...entrypoints._ensemblesubsetselector_bootstrapselector import \ + bootstrap_selector +from ...entrypoints.trainers_ensembleregression import \ + trainers_ensembleregression +from ...utils.utils import trace +from ..base_pipeline_item import BasePipelineItem, DefaultSignatureWithRoles + + +class EnsembleRegressor( + BasePipelineItem, + DefaultSignatureWithRoles): + """ + + **Description** + Train a regression ensemble model + + .. remarks:: + An Ensemble is a set of models, each trained on a sample of the + training set. Training an ensemble instead of a single model can boost + the accuracy of a given algorithm. + + The quality of an Ensemble depends on two factors; Accuracy and + Diversity. Ensemble can be analogous to Teamwork. If every team member + is diverse and competent, then the team can perform very well. Here a + team member is a base learner and the team is the Ensemble. In the case + of regression ensembles, the base learner is an + ``OnlineGradientDescentRegressor``. + + + :param sampling_type: Specifies how the training samples are created: + + * ``BootstrapSelector``: takes a bootstrap sample of the training set + (sampling with replacement). This is the default method. 
+        * ``RandomPartitionSelector``: randomly partitions the training set
+          into subsets.
+        * ``AllSelector``: every model is trained using the whole training set.
+
+        Each of these Subset Selectors has two options for selecting features:
+
+        * ``AllFeatureSelector``: selects all the features. This is the default
+          method.
+        * ``RandomFeatureSelector``: selects a random subset of the features
+          for each model.
+
+    :param num_models: indicates the number of models to train, i.e. the number
+        of subsets of the training set to sample. The default value is 50. If
+        batches are used then this indicates the number of models per batch.
+
+    :param sub_model_selector_type: Determines the efficient set of models the
+        ``output_combiner`` uses, and removes the least significant models. This
+        is used to improve the accuracy and reduce the model size. This is also
+        called pruning.
+
+        * ``RegressorAllSelector``: does not perform any pruning and selects
+          all models in the ensemble to combine to create the output. This is
+          the default submodel selector.
+        * ``RegressorBestDiverseSelector``: combines models whose predictions
+          are as diverse as possible. Currently, only disagreement diversity is
+          supported.
+        * ``RegressorBestPerformanceSelector``: combines only the models with
+          the best performance according to the specified metric. The metric
+          can be ``"L1"``, ``"L2"``, ``"Rms"``, ``"Loss"``, or ``"RSquared"``.
+
+
+    :output_combiner: indicates how to combine the predictions of the different
+        models into a single prediction. There are three available output
+        combiners for regression:
+
+        * ``RegressorAverage``: computes the average of the scores produced by
+          the trained models.
+        * ``RegressorMedian``: computes the median of the scores produced by
+          the trained models.
+        * ``RegressorStacking``: computes the output by training a model on a
+          training set where each instance is a vector containing the outputs
+          of the different models on a training instance, and the instance's
+          label.
+
+    :param output_combiner: Output combiner.
+
+    :param normalize: Specifies the type of automatic normalization used:
+
+        * ``"Auto"``: if normalization is needed, it is performed
+          automatically. This is the default choice.
+        * ``"No"``: no normalization is performed.
+        * ``"Yes"``: normalization is performed.
+        * ``"Warn"``: if normalization is needed, a warning
+          message is displayed, but normalization is not performed.
+
+        Normalization rescales disparate data ranges to a standard scale.
+        Feature scaling ensures the distances between data points are
+        proportional and enables various optimization methods such as
+        gradient descent to converge much faster. If normalization is
+        performed, a ``MinMax`` normalizer is used. It normalizes values in
+        an interval [a, b] where ``-1 <= a <= 0`` and ``0 <= b <= 1`` and
+        ``b - a = 1``. This normalizer preserves sparsity by mapping zero
+        to zero.
+
+    :param caching: Whether trainer should cache input training data.
+
+    :param train_parallel: All the base learners will run asynchronously if the
+        value is true.
+
+    :param batch_size: train the models iteratively on subsets of the training
+        set of this size. When using this option, it is assumed that the
+        training set is randomized enough so that every batch is a random
+        sample of instances. The default value is -1, indicating using the
+        whole training set.
+        If the value is changed to an integer greater than 0, the number of
+        trained models is the number of batches (the size of the training set
+        divided by the batch size), times ``num_models``.
+
+    :param show_metrics: True, if metrics for each model need to be evaluated
+        and shown in a comparison table. This is done by using the validation
+        set if available, or the training set.
+
+    :param params: Additional arguments sent to compute engine.
+
+    .. seealso::
+        * Subset selectors:
+        :py:class:`AllInstanceSelector
+        <nimbusml.ensemble.subset_selector.AllInstanceSelector>`,
+        :py:class:`BootstrapSelector
+        <nimbusml.ensemble.subset_selector.BootstrapSelector>`,
+        :py:class:`RandomPartitionSelector
+        <nimbusml.ensemble.subset_selector.RandomPartitionSelector>`
+
+        * Feature selectors:
+        :py:class:`AllFeatureSelector
+        <nimbusml.ensemble.feature_selector.AllFeatureSelector>`,
+        :py:class:`RandomFeatureSelector
+        <nimbusml.ensemble.feature_selector.RandomFeatureSelector>`
+
+        * Submodel selectors:
+        :py:class:`RegressorAllSelector
+        <nimbusml.ensemble.sub_model_selector.RegressorAllSelector>`,
+        :py:class:`RegressorBestDiverseSelector
+        <nimbusml.ensemble.sub_model_selector.RegressorBestDiverseSelector>`,
+        :py:class:`RegressorBestPerformanceSelector
+        <nimbusml.ensemble.sub_model_selector.RegressorBestPerformanceSelector>`
+
+        * Output combiners:
+        :py:class:`RegressorAverage
+        <nimbusml.ensemble.output_combiner.RegressorAverage>`,
+        :py:class:`RegressorMedian
+        <nimbusml.ensemble.output_combiner.RegressorMedian>`,
+        :py:class:`RegressorStacking
+        <nimbusml.ensemble.output_combiner.RegressorStacking>`
+
+
+    .. index:: models, ensemble, regression
+
+    Example:
+       .. literalinclude:: /../nimbusml/examples/EnsembleRegressor.py
+              :language: python
+    """
+
+    @trace
+    def __init__(
+            self,
+            sampling_type=bootstrap_selector(
+                feature_selector={
+                    'Name': 'AllFeatureSelector'}),
+            num_models=None,
+            sub_model_selector_type=None,
+            output_combiner=None,
+            normalize='Auto',
+            caching='Auto',
+            train_parallel=False,
+            batch_size=-1,
+            show_metrics=False,
+            **params):
+        BasePipelineItem.__init__(
+            self, type='regressor', **params)
+
+        self.sampling_type = sampling_type
+        self.num_models = num_models
+        self.sub_model_selector_type = sub_model_selector_type
+        self.output_combiner = output_combiner
+        self.normalize = normalize
+        self.caching = caching
+        self.train_parallel = train_parallel
+        self.batch_size = batch_size
+        self.show_metrics = show_metrics
+
+    @property
+    def _entrypoint(self):
+        return trainers_ensembleregression
+
+    @trace
+    def _get_node(self, **all_args):
+        algo_args = dict(
+            feature_column_name=self._getattr_role(
+                'feature_column_name',
+                all_args),
+            label_column_name=self._getattr_role(
+                'label_column_name',
+                all_args),
+            sampling_type=self.sampling_type,
+            num_models=self.num_models,
+            sub_model_selector_type=self.sub_model_selector_type,
+            output_combiner=self.output_combiner,
+            normalize_features=self.normalize,
+            caching=self.caching,
+            train_parallel=self.train_parallel,
+            batch_size=self.batch_size,
+            show_metrics=self.show_metrics)
+
+        all_args.update(algo_args)
+        return self._entrypoint(**all_args)
diff --git a/src/python/nimbusml/internal/core/ensemble/feature_selector/__init__.py b/src/python/nimbusml/internal/core/ensemble/feature_selector/__init__.py
new file mode 100644
index 00000000..8b137891
--- /dev/null
+++ b/src/python/nimbusml/internal/core/ensemble/feature_selector/__init__.py
@@ -0,0 +1 @@
+
diff --git a/src/python/nimbusml/internal/core/ensemble/feature_selector/allfeatureselector.py b/src/python/nimbusml/internal/core/ensemble/feature_selector/allfeatureselector.py
new file mode 100644
index 00000000..d2fab736
--- /dev/null
+++ b/src/python/nimbusml/internal/core/ensemble/feature_selector/allfeatureselector.py
@@ -0,0 +1,40 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +AllFeatureSelector +""" + +__all__ = ["AllFeatureSelector"] + + +from ....utils.entrypoints import Component +from ....utils.utils import trace + + +class AllFeatureSelector(Component): + """ + **Description** + Selects all features for each trainer in the ensemble + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + **params): + + self.kind = 'EnsembleFeatureSelector' + self.name = 'AllFeatureSelector' + self.settings = {} + + super( + AllFeatureSelector, + self).__init__( + name=self.name, + settings=self.settings, + kind=self.kind) diff --git a/src/python/nimbusml/internal/core/ensemble/feature_selector/randomfeatureselector.py b/src/python/nimbusml/internal/core/ensemble/feature_selector/randomfeatureselector.py new file mode 100644 index 00000000..9933864a --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/feature_selector/randomfeatureselector.py @@ -0,0 +1,52 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +RandomFeatureSelector +""" + +__all__ = ["RandomFeatureSelector"] + +import numbers + +from ....utils.entrypoints import Component +from ....utils.utils import trace, try_set + + +class RandomFeatureSelector(Component): + """ + **Description** + Selects a random subset of features for each trainer in the ensemble + + :param features_selection_proportion: The proportion of features to be + selected. The range is 0.0-1.0. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + features_selection_proportion=0.8, + **params): + + self.features_selection_proportion = features_selection_proportion + self.kind = 'EnsembleFeatureSelector' + self.name = 'RandomFeatureSelector' + self.settings = {} + + if features_selection_proportion is not None: + self.settings['FeaturesSelectionProportion'] = try_set( + obj=features_selection_proportion, + none_acceptable=True, + is_of_type=numbers.Real) + + super( + RandomFeatureSelector, + self).__init__( + name=self.name, + settings=self.settings, + kind=self.kind) diff --git a/src/python/nimbusml/internal/core/ensemble/output_combiner/__init__.py b/src/python/nimbusml/internal/core/ensemble/output_combiner/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/output_combiner/__init__.py @@ -0,0 +1 @@ + diff --git a/src/python/nimbusml/internal/core/ensemble/output_combiner/classifieraverage.py b/src/python/nimbusml/internal/core/ensemble/output_combiner/classifieraverage.py new file mode 100644 index 00000000..2b683f05 --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/output_combiner/classifieraverage.py @@ -0,0 +1,49 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +ClassifierAverage +""" + +__all__ = ["ClassifierAverage"] + + +from ....utils.entrypoints import Component +from ....utils.utils import trace, try_set + + +class ClassifierAverage(Component): + """ + **Description** + Computes the average of the outputs of the trained models + + :param normalize: Whether to normalize the output of base models before + combining them. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + normalize=True, + **params): + + self.normalize = normalize + self.kind = 'EnsembleMulticlassOutputCombiner' + self.name = 'MultiAverage' + self.settings = {} + + if normalize is not None: + self.settings['Normalize'] = try_set( + obj=normalize, none_acceptable=True, is_of_type=bool) + + super( + ClassifierAverage, + self).__init__( + name=self.name, + settings=self.settings, + kind=self.kind) diff --git a/src/python/nimbusml/internal/core/ensemble/output_combiner/classifiermedian.py b/src/python/nimbusml/internal/core/ensemble/output_combiner/classifiermedian.py new file mode 100644 index 00000000..e984a7ff --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/output_combiner/classifiermedian.py @@ -0,0 +1,49 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +ClassifierMedian +""" + +__all__ = ["ClassifierMedian"] + + +from ....utils.entrypoints import Component +from ....utils.utils import trace, try_set + + +class ClassifierMedian(Component): + """ + **Description** + Computes the median of the outputs of the trained models + + :param normalize: Whether to normalize the output of base models before + combining them. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + normalize=True, + **params): + + self.normalize = normalize + self.kind = 'EnsembleMulticlassOutputCombiner' + self.name = 'MultiMedian' + self.settings = {} + + if normalize is not None: + self.settings['Normalize'] = try_set( + obj=normalize, none_acceptable=True, is_of_type=bool) + + super( + ClassifierMedian, + self).__init__( + name=self.name, + settings=self.settings, + kind=self.kind) diff --git a/src/python/nimbusml/internal/core/ensemble/output_combiner/classifierstacking.py b/src/python/nimbusml/internal/core/ensemble/output_combiner/classifierstacking.py new file mode 100644 index 00000000..0f6aef2f --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/output_combiner/classifierstacking.py @@ -0,0 +1,53 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +ClassifierStacking +""" + +__all__ = ["ClassifierStacking"] + +import numbers + +from ....utils.entrypoints import Component +from ....utils.utils import trace, try_set + + +class ClassifierStacking(Component): + """ + **Description** + Computes the output by training a model on a training set where each instance is a vector containing the outputs of the different models on a training instance, and the instance's label + + :param validation_dataset_proportion: The proportion of instances to be + selected to test the individual base learner. If it is 0, it uses + training set. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + validation_dataset_proportion=0.3, + **params): + + self.validation_dataset_proportion = validation_dataset_proportion + self.kind = 'EnsembleMulticlassOutputCombiner' + self.name = 'MultiStacking' + self.settings = {} + + if validation_dataset_proportion is not None: + self.settings['ValidationDatasetProportion'] = try_set( + obj=validation_dataset_proportion, + none_acceptable=True, + is_of_type=numbers.Real) + + super( + ClassifierStacking, + self).__init__( + name=self.name, + settings=self.settings, + kind=self.kind) diff --git a/src/python/nimbusml/internal/core/ensemble/output_combiner/classifiervoting.py b/src/python/nimbusml/internal/core/ensemble/output_combiner/classifiervoting.py new file mode 100644 index 00000000..582a4df0 --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/output_combiner/classifiervoting.py @@ -0,0 +1,40 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +ClassifierVoting +""" + +__all__ = ["ClassifierVoting"] + + +from ....utils.entrypoints import Component +from ....utils.utils import trace + + +class ClassifierVoting(Component): + """ + **Description** + Computes the fraction of positive predictions for each class from all the trained models, and outputs the class with the largest number + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + **params): + + self.kind = 'EnsembleMulticlassOutputCombiner' + self.name = 'MultiVoting' + self.settings = {} + + super( + ClassifierVoting, + self).__init__( + name=self.name, + settings=self.settings, + kind=self.kind) diff --git a/src/python/nimbusml/internal/core/ensemble/output_combiner/classifierweightedaverage.py b/src/python/nimbusml/internal/core/ensemble/output_combiner/classifierweightedaverage.py new file mode 100644 index 00000000..7459a947 --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/output_combiner/classifierweightedaverage.py @@ -0,0 +1,107 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +ClassifierWeightedAverage +""" + +__all__ = ["ClassifierWeightedAverage"] + + +from ....utils.entrypoints import Component +from ....utils.utils import trace, try_set + + +class ClassifierWeightedAverage(Component): + """ + + **Description** + Computes the weighted average of the outputs of the trained models + + + :param weightage_name: the metric type to be used to find the weights for + each model. Can be ``"AccuracyMicroAvg"`` or ``"AccuracyMacroAvg"``. + + :param normalize: Specifies the type of automatic normalization used: + + * ``"Auto"``: if normalization is needed, it is performed + automatically. This is the default choice. + * ``"No"``: no normalization is performed. + * ``"Yes"``: normalization is performed. + * ``"Warn"``: if normalization is needed, a warning + message is displayed, but normalization is not performed. + + Normalization rescales disparate data ranges to a standard scale. + Feature + scaling ensures the distances between data points are proportional + and + enables various optimization methods such as gradient descent to + converge + much faster. If normalization is performed, a ``MinMax`` normalizer + is + used. It normalizes values in an interval [a, b] where ``-1 <= a <= + 0`` + and ``0 <= b <= 1`` and ``b - a = 1``. This normalizer preserves + sparsity by mapping zero to zero. + + :param params: Additional arguments sent to compute engine. + + .. seealso:: + :py:class:`EnsembleClassifier + ` + + * Submodel selectors: + :py:class:`ClassifierAllSelector + `, + :py:class:`ClassifierBestDiverseSelector + `, + :py:class:`ClassifierBestPerformanceSelector + ` + + * Output combiners: + :py:class:`ClassifierAverage + `, + :py:class:`ClassifierMedian + `, + :py:class:`ClassifierStacking + `, + :py:class:`ClassifierVoting + ` + + + .. index:: models, ensemble, classification + + Example: + .. literalinclude:: /../nimbusml/examples/EnsembleClassifier.py + :language: python + """ + + @trace + def __init__( + self, + weightage_name='AccuracyMicroAvg', + normalize=True, + **params): + + self.weightage_name = weightage_name + self.normalize = normalize + self.kind = 'EnsembleMulticlassOutputCombiner' + self.name = 'MultiWeightedAverage' + self.settings = {} + + if weightage_name is not None: + self.settings['WeightageName'] = try_set( + obj=weightage_name, none_acceptable=True, is_of_type=str, values=[ + 'AccuracyMicroAvg', 'AccuracyMacroAvg']) + if normalize is not None: + self.settings['Normalize'] = try_set( + obj=normalize, none_acceptable=True, is_of_type=bool) + + super( + ClassifierWeightedAverage, + self).__init__( + name=self.name, + settings=self.settings, + kind=self.kind) diff --git a/src/python/nimbusml/internal/core/ensemble/output_combiner/regressoraverage.py b/src/python/nimbusml/internal/core/ensemble/output_combiner/regressoraverage.py new file mode 100644 index 00000000..b129d20a --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/output_combiner/regressoraverage.py @@ -0,0 +1,40 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +RegressorAverage +""" + +__all__ = ["RegressorAverage"] + + +from ....utils.entrypoints import Component +from ....utils.utils import trace + + +class RegressorAverage(Component): + """ + **Description** + Computes the average of the outputs of the trained models + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + **params): + + self.kind = 'EnsembleRegressionOutputCombiner' + self.name = 'Average' + self.settings = {} + + super( + RegressorAverage, + self).__init__( + name=self.name, + settings=self.settings, + kind=self.kind) diff --git a/src/python/nimbusml/internal/core/ensemble/output_combiner/regressormedian.py b/src/python/nimbusml/internal/core/ensemble/output_combiner/regressormedian.py new file mode 100644 index 00000000..113a5a60 --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/output_combiner/regressormedian.py @@ -0,0 +1,40 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +RegressorMedian +""" + +__all__ = ["RegressorMedian"] + + +from ....utils.entrypoints import Component +from ....utils.utils import trace + + +class RegressorMedian(Component): + """ + **Description** + Computes the median of the outputs of the trained models + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + **params): + + self.kind = 'EnsembleRegressionOutputCombiner' + self.name = 'Median' + self.settings = {} + + super( + RegressorMedian, + self).__init__( + name=self.name, + settings=self.settings, + kind=self.kind) diff --git a/src/python/nimbusml/internal/core/ensemble/output_combiner/regressorstacking.py b/src/python/nimbusml/internal/core/ensemble/output_combiner/regressorstacking.py new file mode 100644 index 00000000..68bbe05f --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/output_combiner/regressorstacking.py @@ -0,0 +1,53 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +RegressorStacking +""" + +__all__ = ["RegressorStacking"] + +import numbers + +from ....utils.entrypoints import Component +from ....utils.utils import trace, try_set + + +class RegressorStacking(Component): + """ + **Description** + Computes the output by training a model on a training set where each instance is a vector containing the outputs of the different models on a training instance, and the instance's label + + :param validation_dataset_proportion: The proportion of instances to be + selected to test the individual base learner. If it is 0, it uses + training set. + + :param params: Additional arguments sent to compute engine. 
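+
+    A minimal construction sketch (the value shown is an illustrative
+    assumption, not a recommended setting)::
+
+        combiner = RegressorStacking(validation_dataset_proportion=0.3)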
+ + """ + + @trace + def __init__( + self, + validation_dataset_proportion=0.3, + **params): + + self.validation_dataset_proportion = validation_dataset_proportion + self.kind = 'EnsembleRegressionOutputCombiner' + self.name = 'RegressionStacking' + self.settings = {} + + if validation_dataset_proportion is not None: + self.settings['ValidationDatasetProportion'] = try_set( + obj=validation_dataset_proportion, + none_acceptable=True, + is_of_type=numbers.Real) + + super( + RegressorStacking, + self).__init__( + name=self.name, + settings=self.settings, + kind=self.kind) diff --git a/src/python/nimbusml/internal/core/ensemble/sub_model_selector/__init__.py b/src/python/nimbusml/internal/core/ensemble/sub_model_selector/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/sub_model_selector/__init__.py @@ -0,0 +1 @@ + diff --git a/src/python/nimbusml/internal/core/ensemble/sub_model_selector/classifierallselector.py b/src/python/nimbusml/internal/core/ensemble/sub_model_selector/classifierallselector.py new file mode 100644 index 00000000..a5890ded --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/sub_model_selector/classifierallselector.py @@ -0,0 +1,40 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +ClassifierAllSelector +""" + +__all__ = ["ClassifierAllSelector"] + + +from ....utils.entrypoints import Component +from ....utils.utils import trace + + +class ClassifierAllSelector(Component): + """ + **Description** + Combines all the models to create the output. This is the default submodel selector. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + **params): + + self.kind = 'EnsembleMulticlassSubModelSelector' + self.name = 'AllSelectorMultiClass' + self.settings = {} + + super( + ClassifierAllSelector, + self).__init__( + name=self.name, + settings=self.settings, + kind=self.kind) diff --git a/src/python/nimbusml/internal/core/ensemble/sub_model_selector/classifierbestdiverseselector.py b/src/python/nimbusml/internal/core/ensemble/sub_model_selector/classifierbestdiverseselector.py new file mode 100644 index 00000000..a4c3e22c --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/sub_model_selector/classifierbestdiverseselector.py @@ -0,0 +1,73 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +ClassifierBestDiverseSelector +""" + +__all__ = ["ClassifierBestDiverseSelector"] + +import numbers + +from ....entrypoints._ensemblemulticlassdiversitymeasure_multidisagreementdiversitymeasure import \ + multi_disagreement_diversity_measure +from ....utils.entrypoints import Component +from ....utils.utils import trace, try_set + + +class ClassifierBestDiverseSelector(Component): + """ + **Description** + Combines the models whose predictions are as diverse as possible. 
+ + :param diversity_metric_type: The metric type to be used to find the + diversity among base learners. + + :param learners_selection_proportion: The proportion of best base learners + to be selected. The range is 0.0-1.0. + + :param validation_dataset_proportion: The proportion of instances to be + selected to test the individual base learner. If it is 0, it uses + training set. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + diversity_metric_type=None, + learners_selection_proportion=0.5, + validation_dataset_proportion=0.3, + **params): + + self.diversity_metric_type = diversity_metric_type + self.learners_selection_proportion = learners_selection_proportion + self.validation_dataset_proportion = validation_dataset_proportion + self.kind = 'EnsembleMulticlassSubModelSelector' + self.name = 'BestDiverseSelectorMultiClass' + self.settings = {} + + if diversity_metric_type is not None: + self.settings['DiversityMetricType'] = try_set( + obj=diversity_metric_type, none_acceptable=True, is_of_type=dict) + if learners_selection_proportion is not None: + self.settings['LearnersSelectionProportion'] = try_set( + obj=learners_selection_proportion, + none_acceptable=True, + is_of_type=numbers.Real) + if validation_dataset_proportion is not None: + self.settings['ValidationDatasetProportion'] = try_set( + obj=validation_dataset_proportion, + none_acceptable=True, + is_of_type=numbers.Real) + + super( + ClassifierBestDiverseSelector, + self).__init__( + name=self.name, + settings=self.settings, + kind=self.kind) diff --git a/src/python/nimbusml/internal/core/ensemble/sub_model_selector/classifierbestperformanceselector.py b/src/python/nimbusml/internal/core/ensemble/sub_model_selector/classifierbestperformanceselector.py new file mode 100644 index 00000000..baf96b79 --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/sub_model_selector/classifierbestperformanceselector.py @@ -0,0 +1,101 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +ClassifierBestPerformanceSelector +""" + +__all__ = ["ClassifierBestPerformanceSelector"] + +import numbers + +from ....utils.entrypoints import Component +from ....utils.utils import trace, try_set + + +class ClassifierBestPerformanceSelector(Component): + """ + + **Description** + Combines only the models with the best performance. + + + :param metric_name: the metric type to be used to find the weights for + each model. Can be ``"AccuracyMicro"``, ``"AccuracyMacro"``, + ``"LogLoss"``, or ``"LogLossReduction"``. + + :param learners_selection_proportion: The proportion of best base learners + to be selected. The range is 0.0-1.0. + + :param validation_dataset_proportion: The proportion of instances to be + selected to test the individual base learner. If it is 0, it uses + training set. + + :param params: Additional arguments sent to compute engine. + + .. seealso:: + :py:class:`EnsembleClassifier + ` + + * Submodel selectors: + :py:class:`ClassifierAllSelector + `, + :py:class:`ClassifierBestDiverseSelector + ` + + * Output combiners: + :py:class:`ClassifierAverage + `, + :py:class:`ClassifierMedian + `, + :py:class:`ClassifierStacking + `, + :py:class:`ClassifierVoting + ` + + + .. 
index:: models, ensemble, classification + + Example: + .. literalinclude:: /../nimbusml/examples/EnsembleClassifier.py + :language: python + """ + + @trace + def __init__( + self, + metric_name='AccuracyMicro', + learners_selection_proportion=0.5, + validation_dataset_proportion=0.3, + **params): + + self.metric_name = metric_name + self.learners_selection_proportion = learners_selection_proportion + self.validation_dataset_proportion = validation_dataset_proportion + self.kind = 'EnsembleMulticlassSubModelSelector' + self.name = 'BestPerformanceSelectorMultiClass' + self.settings = {} + + if metric_name is not None: + self.settings['MetricName'] = try_set( + obj=metric_name, none_acceptable=True, is_of_type=str, values=[ + 'AccuracyMicro', 'AccuracyMacro', 'LogLoss', 'LogLossReduction']) + if learners_selection_proportion is not None: + self.settings['LearnersSelectionProportion'] = try_set( + obj=learners_selection_proportion, + none_acceptable=True, + is_of_type=numbers.Real) + if validation_dataset_proportion is not None: + self.settings['ValidationDatasetProportion'] = try_set( + obj=validation_dataset_proportion, + none_acceptable=True, + is_of_type=numbers.Real) + + super( + ClassifierBestPerformanceSelector, + self).__init__( + name=self.name, + settings=self.settings, + kind=self.kind) diff --git a/src/python/nimbusml/internal/core/ensemble/sub_model_selector/diversity_measure/__init__.py b/src/python/nimbusml/internal/core/ensemble/sub_model_selector/diversity_measure/__init__.py new file mode 100644 index 00000000..8b137891 --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/sub_model_selector/diversity_measure/__init__.py @@ -0,0 +1 @@ + diff --git a/src/python/nimbusml/internal/core/ensemble/sub_model_selector/diversity_measure/classifierdisagreement.py b/src/python/nimbusml/internal/core/ensemble/sub_model_selector/diversity_measure/classifierdisagreement.py new file mode 100644 index 00000000..d5fa370e --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/sub_model_selector/diversity_measure/classifierdisagreement.py @@ -0,0 +1,40 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +ClassifierDisagreement +""" + +__all__ = ["ClassifierDisagreement"] + + +from .....utils.entrypoints import Component +from .....utils.utils import trace + + +class ClassifierDisagreement(Component): + """ + **Description** + A measure of disagreement in predictions between a pair of classifiers, averaged over all pairs + + :param params: Additional arguments sent to compute engine. 
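
A sketch of how this disagreement measure plugs into the diverse-model selector. The import paths follow the ``Module`` fields declared in ``manifest_diff.json`` further down and should be treated as assumptions::

    from nimbusml.ensemble import EnsembleClassifier
    from nimbusml.ensemble.sub_model_selector import \
        ClassifierBestDiverseSelector
    from nimbusml.ensemble.sub_model_selector.diversity_measure import \
        ClassifierDisagreement

    # Keep the classifiers that disagree with each other the most.
    ensemble = EnsembleClassifier(
        num_models=5,
        sub_model_selector_type=ClassifierBestDiverseSelector(
            diversity_metric_type=ClassifierDisagreement()))
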
+ + """ + + @trace + def __init__( + self, + **params): + + self.kind = 'EnsembleMulticlassDiversityMeasure' + self.name = 'MultiDisagreementDiversityMeasure' + self.settings = {} + + super( + ClassifierDisagreement, + self).__init__( + name=self.name, + settings=self.settings, + kind=self.kind) diff --git a/src/python/nimbusml/internal/core/ensemble/sub_model_selector/diversity_measure/regressordisagreement.py b/src/python/nimbusml/internal/core/ensemble/sub_model_selector/diversity_measure/regressordisagreement.py new file mode 100644 index 00000000..f0f3d95b --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/sub_model_selector/diversity_measure/regressordisagreement.py @@ -0,0 +1,40 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +RegressorDisagreement +""" + +__all__ = ["RegressorDisagreement"] + + +from .....utils.entrypoints import Component +from .....utils.utils import trace + + +class RegressorDisagreement(Component): + """ + **Description** + A measure of absolute value of disagreement in predictions between a pair of regressors, averaged over all pairs + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + **params): + + self.kind = 'EnsembleRegressionDiversityMeasure' + self.name = 'RegressionDisagreementDiversityMeasure' + self.settings = {} + + super( + RegressorDisagreement, + self).__init__( + name=self.name, + settings=self.settings, + kind=self.kind) diff --git a/src/python/nimbusml/internal/core/ensemble/sub_model_selector/regressorallselector.py b/src/python/nimbusml/internal/core/ensemble/sub_model_selector/regressorallselector.py new file mode 100644 index 00000000..0aa9d614 --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/sub_model_selector/regressorallselector.py @@ -0,0 +1,40 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +RegressorAllSelector +""" + +__all__ = ["RegressorAllSelector"] + + +from ....utils.entrypoints import Component +from ....utils.utils import trace + + +class RegressorAllSelector(Component): + """ + **Description** + Combines all the models to create the output. This is the default submodel selector. + + :param params: Additional arguments sent to compute engine. 
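
Passing this selector explicitly is equivalent to the default behavior; a sketch, with the public import path assumed from ``manifest_diff.json``::

    from nimbusml.ensemble import EnsembleRegressor
    from nimbusml.ensemble.sub_model_selector import RegressorAllSelector

    # No pruning: every base learner reaches the output combiner.
    ensemble = EnsembleRegressor(
        num_models=3,
        sub_model_selector_type=RegressorAllSelector())
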
+ + """ + + @trace + def __init__( + self, + **params): + + self.kind = 'EnsembleRegressionSubModelSelector' + self.name = 'AllSelector' + self.settings = {} + + super( + RegressorAllSelector, + self).__init__( + name=self.name, + settings=self.settings, + kind=self.kind) diff --git a/src/python/nimbusml/internal/core/ensemble/sub_model_selector/regressorbestdiverseselector.py b/src/python/nimbusml/internal/core/ensemble/sub_model_selector/regressorbestdiverseselector.py new file mode 100644 index 00000000..4691dd21 --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/sub_model_selector/regressorbestdiverseselector.py @@ -0,0 +1,73 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +RegressorBestDiverseSelector +""" + +__all__ = ["RegressorBestDiverseSelector"] + +import numbers + +from ....entrypoints._ensembleregressiondiversitymeasure_regressiondisagreementdiversitymeasure import \ + regression_disagreement_diversity_measure +from ....utils.entrypoints import Component +from ....utils.utils import trace, try_set + + +class RegressorBestDiverseSelector(Component): + """ + **Description** + Combines the models whose predictions are as diverse as possible. + + :param diversity_metric_type: The metric type to be used to find the + diversity among base learners. + + :param learners_selection_proportion: The proportion of best base learners + to be selected. The range is 0.0-1.0. + + :param validation_dataset_proportion: The proportion of instances to be + selected to test the individual base learner. If it is 0, it uses + training set. + + :param params: Additional arguments sent to compute engine. 
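
A hedged sketch of the regression variant; both proportions shown are the defaults from the signature below::

    from nimbusml.ensemble import EnsembleRegressor
    from nimbusml.ensemble.sub_model_selector import \
        RegressorBestDiverseSelector

    # Score each base learner on a 30% validation split, then keep the
    # most diverse half of them.
    ensemble = EnsembleRegressor(
        num_models=4,
        sub_model_selector_type=RegressorBestDiverseSelector(
            learners_selection_proportion=0.5,
            validation_dataset_proportion=0.3))
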
+
+    """
+
+    @trace
+    def __init__(
+            self,
+            diversity_metric_type=None,
+            learners_selection_proportion=0.5,
+            validation_dataset_proportion=0.3,
+            **params):
+
+        self.diversity_metric_type = diversity_metric_type
+        self.learners_selection_proportion = learners_selection_proportion
+        self.validation_dataset_proportion = validation_dataset_proportion
+        self.kind = 'EnsembleRegressionSubModelSelector'
+        self.name = 'BestDiverseSelectorRegression'
+        self.settings = {}
+
+        if diversity_metric_type is not None:
+            self.settings['DiversityMetricType'] = try_set(
+                obj=diversity_metric_type, none_acceptable=True, is_of_type=dict)
+        if learners_selection_proportion is not None:
+            self.settings['LearnersSelectionProportion'] = try_set(
+                obj=learners_selection_proportion,
+                none_acceptable=True,
+                is_of_type=numbers.Real)
+        if validation_dataset_proportion is not None:
+            self.settings['ValidationDatasetProportion'] = try_set(
+                obj=validation_dataset_proportion,
+                none_acceptable=True,
+                is_of_type=numbers.Real)
+
+        super(
+            RegressorBestDiverseSelector,
+            self).__init__(
+            name=self.name,
+            settings=self.settings,
+            kind=self.kind)
diff --git a/src/python/nimbusml/internal/core/ensemble/sub_model_selector/regressorbestperformanceselector.py b/src/python/nimbusml/internal/core/ensemble/sub_model_selector/regressorbestperformanceselector.py
new file mode 100644
index 00000000..51b07c66
--- /dev/null
+++ b/src/python/nimbusml/internal/core/ensemble/sub_model_selector/regressorbestperformanceselector.py
@@ -0,0 +1,99 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+RegressorBestPerformanceSelector
+"""
+
+__all__ = ["RegressorBestPerformanceSelector"]
+
+import numbers
+
+from ....utils.entrypoints import Component
+from ....utils.utils import trace, try_set
+
+
+class RegressorBestPerformanceSelector(Component):
+    """
+
+    **Description**
+        Combines only the models with the best performance.
+
+
+    :param metric_name: the metric type to be used to select the models with
+        the best performance. Can be ``"L1"``, ``"L2"``, ``"Rms"``,
+        ``"Loss"``, or ``"RSquared"``.
+
+    :param learners_selection_proportion: The proportion of best base learners
+        to be selected. The range is 0.0-1.0.
+
+    :param validation_dataset_proportion: The proportion of instances to be
+        selected to test the individual base learner. If it is 0, it uses the
+        training set.
+
+    :param params: Additional arguments sent to compute engine.
+
+    .. seealso::
+        :py:class:`EnsembleRegressor
+        <nimbusml.ensemble.EnsembleRegressor>`
+
+        * Submodel selectors:
+          :py:class:`RegressorAllSelector
+          <nimbusml.ensemble.sub_model_selector.RegressorAllSelector>`,
+          :py:class:`RegressorBestDiverseSelector
+          <nimbusml.ensemble.sub_model_selector.RegressorBestDiverseSelector>`
+
+        * Output combiners:
+          :py:class:`RegressorAverage
+          <nimbusml.ensemble.output_combiner.RegressorAverage>`,
+          :py:class:`RegressorMedian
+          <nimbusml.ensemble.output_combiner.RegressorMedian>`,
+          :py:class:`RegressorStacking
+          <nimbusml.ensemble.output_combiner.RegressorStacking>`
+
+
+    .. index:: models, ensemble, regression
+
+    Example:
+        .. literalinclude:: /../nimbusml/examples/EnsembleRegressor.py
+            :language: python
+    """
+
+    @trace
+    def __init__(
+            self,
+            metric_name='L1',
+            learners_selection_proportion=0.5,
+            validation_dataset_proportion=0.3,
+            **params):
+
+        self.metric_name = metric_name
+        self.learners_selection_proportion = learners_selection_proportion
+        self.validation_dataset_proportion = validation_dataset_proportion
+        self.kind = 'EnsembleRegressionSubModelSelector'
+        self.name = 'BestPerformanceRegressionSelector'
+        self.settings = {}
+
+        if metric_name is not None:
+            self.settings['MetricName'] = try_set(
+                obj=metric_name, none_acceptable=True, is_of_type=str, values=[
+                    'L1', 'L2', 'Rms', 'Loss', 'RSquared'])
+        if learners_selection_proportion is not None:
+            self.settings['LearnersSelectionProportion'] = try_set(
+                obj=learners_selection_proportion,
+                none_acceptable=True,
+                is_of_type=numbers.Real)
+        if validation_dataset_proportion is not None:
+            self.settings['ValidationDatasetProportion'] = try_set(
+                obj=validation_dataset_proportion,
+                none_acceptable=True,
+                is_of_type=numbers.Real)
+
+        super(
+            RegressorBestPerformanceSelector,
+            self).__init__(
+            name=self.name,
+            settings=self.settings,
+            kind=self.kind)
diff --git a/src/python/nimbusml/internal/core/ensemble/subset_selector/__init__.py b/src/python/nimbusml/internal/core/ensemble/subset_selector/__init__.py
new file mode 100644
index 00000000..8b137891
--- /dev/null
+++ b/src/python/nimbusml/internal/core/ensemble/subset_selector/__init__.py
@@ -0,0 +1 @@
+
diff --git a/src/python/nimbusml/internal/core/ensemble/subset_selector/allinstanceselector.py b/src/python/nimbusml/internal/core/ensemble/subset_selector/allinstanceselector.py
new file mode 100644
index 00000000..75d86728
--- /dev/null
+++ b/src/python/nimbusml/internal/core/ensemble/subset_selector/allinstanceselector.py
@@ -0,0 +1,50 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+# - Generated by tools/entrypoint_compiler.py: do not edit by hand
+"""
+AllInstanceSelector
+"""
+
+__all__ = ["AllInstanceSelector"]
+
+
+from ....entrypoints._ensemblefeatureselector_allfeatureselector import \
+    all_feature_selector
+from ....utils.entrypoints import Component
+from ....utils.utils import trace, try_set
+
+
+class AllInstanceSelector(Component):
+    """
+    **Description**
+        Selects all rows for each trainer in the ensemble
+
+    :param feature_selector: The Feature selector.
+
+    :param params: Additional arguments sent to compute engine.
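
A sketch pairing this selector with a random feature subset, so each model sees all rows but different columns. The keyword name ``features_selection_proportion`` is assumed; verify it against the generated ``RandomFeatureSelector`` signature::

    from nimbusml.ensemble import EnsembleClassifier
    from nimbusml.ensemble.feature_selector import RandomFeatureSelector
    from nimbusml.ensemble.subset_selector import AllInstanceSelector

    # All rows for every model, but only a random 70% of the features.
    ensemble = EnsembleClassifier(
        num_models=3,
        sampling_type=AllInstanceSelector(
            feature_selector=RandomFeatureSelector(
                features_selection_proportion=0.7)))
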
+ + """ + + @trace + def __init__( + self, + feature_selector=None, + **params): + + self.feature_selector = feature_selector + self.kind = 'EnsembleSubsetSelector' + self.name = 'AllInstanceSelector' + self.settings = {} + + if feature_selector is not None: + self.settings['FeatureSelector'] = try_set( + obj=feature_selector, none_acceptable=True, is_of_type=dict) + + super( + AllInstanceSelector, + self).__init__( + name=self.name, + settings=self.settings, + kind=self.kind) diff --git a/src/python/nimbusml/internal/core/ensemble/subset_selector/bootstrapselector.py b/src/python/nimbusml/internal/core/ensemble/subset_selector/bootstrapselector.py new file mode 100644 index 00000000..f394418f --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/subset_selector/bootstrapselector.py @@ -0,0 +1,50 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +BootstrapSelector +""" + +__all__ = ["BootstrapSelector"] + + +from ....entrypoints._ensemblefeatureselector_allfeatureselector import \ + all_feature_selector +from ....utils.entrypoints import Component +from ....utils.utils import trace, try_set + + +class BootstrapSelector(Component): + """ + **Description** + Selects a bootstrapped sample of the rows for each trainer in the ensemble + + :param feature_selector: The Feature selector. + + :param params: Additional arguments sent to compute engine. + + """ + + @trace + def __init__( + self, + feature_selector=None, + **params): + + self.feature_selector = feature_selector + self.kind = 'EnsembleSubsetSelector' + self.name = 'BootstrapSelector' + self.settings = {} + + if feature_selector is not None: + self.settings['FeatureSelector'] = try_set( + obj=feature_selector, none_acceptable=True, is_of_type=dict) + + super( + BootstrapSelector, + self).__init__( + name=self.name, + settings=self.settings, + kind=self.kind) diff --git a/src/python/nimbusml/internal/core/ensemble/subset_selector/randompartitionselector.py b/src/python/nimbusml/internal/core/ensemble/subset_selector/randompartitionselector.py new file mode 100644 index 00000000..a1f1e451 --- /dev/null +++ b/src/python/nimbusml/internal/core/ensemble/subset_selector/randompartitionselector.py @@ -0,0 +1,50 @@ +# -------------------------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------------------------- +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +RandomPartitionSelector +""" + +__all__ = ["RandomPartitionSelector"] + + +from ....entrypoints._ensemblefeatureselector_allfeatureselector import \ + all_feature_selector +from ....utils.entrypoints import Component +from ....utils.utils import trace, try_set + + +class RandomPartitionSelector(Component): + """ + **Description** + Randomly partitions the rows for each trainer in the ensemble + + :param feature_selector: The Feature selector. + + :param params: Additional arguments sent to compute engine. 
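
The three subset selectors differ only in how rows are assigned to base learners; a comparative sketch using the public wrappers added elsewhere in this diff::

    from nimbusml.ensemble import EnsembleRegressor
    from nimbusml.ensemble.subset_selector import (
        BootstrapSelector, RandomPartitionSelector)

    # Bootstrap: each model sees a sample drawn with replacement
    # (this is the trainer's default sampling_type).
    bagged = EnsembleRegressor(num_models=3,
                               sampling_type=BootstrapSelector())

    # Random partition: the rows are split into disjoint random subsets,
    # one per model.
    partitioned = EnsembleRegressor(num_models=3,
                                    sampling_type=RandomPartitionSelector())
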
+ + """ + + @trace + def __init__( + self, + feature_selector=None, + **params): + + self.feature_selector = feature_selector + self.kind = 'EnsembleSubsetSelector' + self.name = 'RandomPartitionSelector' + self.settings = {} + + if feature_selector is not None: + self.settings['FeatureSelector'] = try_set( + obj=feature_selector, none_acceptable=True, is_of_type=dict) + + super( + RandomPartitionSelector, + self).__init__( + name=self.name, + settings=self.settings, + kind=self.kind) diff --git a/src/python/nimbusml/internal/entrypoints/_ensemblemulticlasssubmodelselector_bestdiverseselectormulticlass.py b/src/python/nimbusml/internal/entrypoints/_ensemblemulticlasssubmodelselector_bestdiverseselectormulticlass.py new file mode 100644 index 00000000..213322e5 --- /dev/null +++ b/src/python/nimbusml/internal/entrypoints/_ensemblemulticlasssubmodelselector_bestdiverseselectormulticlass.py @@ -0,0 +1,53 @@ +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +BestDiverseSelectorMultiClass +""" + +import numbers + +from ..utils.entrypoints import Component +from ..utils.utils import try_set +from ._ensemblemulticlassdiversitymeasure_multidisagreementdiversitymeasure import \ + multi_disagreement_diversity_measure + + +def best_diverse_selector_multi_class( + diversity_metric_type=None, + learners_selection_proportion=0.5, + validation_dataset_proportion=0.3, + **params): + """ + **Description** + None + + :param diversity_metric_type: The metric type to be used to find + the diversity among base learners (settings). + :param learners_selection_proportion: The proportion of best base + learners to be selected. The range is 0.0-1.0 (settings). + :param validation_dataset_proportion: The proportion of instances + to be selected to test the individual base learner. If it is + 0, it uses training set (settings). 
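
At this internal entrypoint layer the helper only assembles a ``Component``; a small sketch (assuming ``Component`` exposes the ``kind`` it is constructed with)::

    # Build the selector component consumed by Trainers.EnsembleClassification.
    selector = best_diverse_selector_multi_class(
        learners_selection_proportion=0.5,
        validation_dataset_proportion=0.3)
    assert selector.kind == 'EnsembleMulticlassSubModelSelector'
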
+ """ + + entrypoint_name = 'BestDiverseSelectorMultiClass' + settings = {} + + if diversity_metric_type is not None: + settings['DiversityMetricType'] = try_set( + obj=diversity_metric_type, none_acceptable=True, is_of_type=dict) + if learners_selection_proportion is not None: + settings['LearnersSelectionProportion'] = try_set( + obj=learners_selection_proportion, + none_acceptable=True, + is_of_type=numbers.Real) + if validation_dataset_proportion is not None: + settings['ValidationDatasetProportion'] = try_set( + obj=validation_dataset_proportion, + none_acceptable=True, + is_of_type=numbers.Real) + + component = Component( + name=entrypoint_name, + settings=settings, + kind='EnsembleMulticlassSubModelSelector') + return component diff --git a/src/python/nimbusml/internal/entrypoints/_ensembleregressionsubmodelselector_bestdiverseselectorregression.py b/src/python/nimbusml/internal/entrypoints/_ensembleregressionsubmodelselector_bestdiverseselectorregression.py new file mode 100644 index 00000000..fccf36b2 --- /dev/null +++ b/src/python/nimbusml/internal/entrypoints/_ensembleregressionsubmodelselector_bestdiverseselectorregression.py @@ -0,0 +1,53 @@ +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +BestDiverseSelectorRegression +""" + +import numbers + +from ..utils.entrypoints import Component +from ..utils.utils import try_set +from ._ensembleregressiondiversitymeasure_regressiondisagreementdiversitymeasure import \ + regression_disagreement_diversity_measure + + +def best_diverse_selector_regression( + diversity_metric_type=None, + learners_selection_proportion=0.5, + validation_dataset_proportion=0.3, + **params): + """ + **Description** + None + + :param diversity_metric_type: The metric type to be used to find + the diversity among base learners (settings). + :param learners_selection_proportion: The proportion of best base + learners to be selected. The range is 0.0-1.0 (settings). + :param validation_dataset_proportion: The proportion of instances + to be selected to test the individual base learner. If it is + 0, it uses training set (settings). 
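
The regression twin behaves the same way; at this layer the diversity measure is passed as a raw component dict rather than a wrapper object (the public class gets ``RegressorDisagreement()`` substituted by ``code_fixer.py`` further down)::

    selector = best_diverse_selector_regression(
        diversity_metric_type={
            'Name': 'RegressionDisagreementDiversityMeasure'})
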
+ """ + + entrypoint_name = 'BestDiverseSelectorRegression' + settings = {} + + if diversity_metric_type is not None: + settings['DiversityMetricType'] = try_set( + obj=diversity_metric_type, none_acceptable=True, is_of_type=dict) + if learners_selection_proportion is not None: + settings['LearnersSelectionProportion'] = try_set( + obj=learners_selection_proportion, + none_acceptable=True, + is_of_type=numbers.Real) + if validation_dataset_proportion is not None: + settings['ValidationDatasetProportion'] = try_set( + obj=validation_dataset_proportion, + none_acceptable=True, + is_of_type=numbers.Real) + + component = Component( + name=entrypoint_name, + settings=settings, + kind='EnsembleRegressionSubModelSelector') + return component diff --git a/src/python/nimbusml/internal/entrypoints/_ensemblesubsetselector_allinstanceselector.py b/src/python/nimbusml/internal/entrypoints/_ensemblesubsetselector_allinstanceselector.py new file mode 100644 index 00000000..4f8f83b8 --- /dev/null +++ b/src/python/nimbusml/internal/entrypoints/_ensemblesubsetselector_allinstanceselector.py @@ -0,0 +1,32 @@ +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +AllInstanceSelector +""" + + +from ..utils.entrypoints import Component +from ..utils.utils import try_set + + +def all_instance_selector( + feature_selector=None, + **params): + """ + **Description** + None + + :param feature_selector: The Feature selector (settings). + """ + + entrypoint_name = 'AllInstanceSelector' + settings = {} + + if feature_selector is not None: + settings['FeatureSelector'] = try_set( + obj=feature_selector, none_acceptable=True, is_of_type=dict) + + component = Component( + name=entrypoint_name, + settings=settings, + kind='EnsembleSubsetSelector') + return component diff --git a/src/python/nimbusml/internal/entrypoints/_ensemblesubsetselector_bootstrapselector.py b/src/python/nimbusml/internal/entrypoints/_ensemblesubsetselector_bootstrapselector.py new file mode 100644 index 00000000..c6d7868b --- /dev/null +++ b/src/python/nimbusml/internal/entrypoints/_ensemblesubsetselector_bootstrapselector.py @@ -0,0 +1,32 @@ +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +BootstrapSelector +""" + + +from ..utils.entrypoints import Component +from ..utils.utils import try_set + + +def bootstrap_selector( + feature_selector=None, + **params): + """ + **Description** + None + + :param feature_selector: The Feature selector (settings). + """ + + entrypoint_name = 'BootstrapSelector' + settings = {} + + if feature_selector is not None: + settings['FeatureSelector'] = try_set( + obj=feature_selector, none_acceptable=True, is_of_type=dict) + + component = Component( + name=entrypoint_name, + settings=settings, + kind='EnsembleSubsetSelector') + return component diff --git a/src/python/nimbusml/internal/entrypoints/_ensemblesubsetselector_randompartitionselector.py b/src/python/nimbusml/internal/entrypoints/_ensemblesubsetselector_randompartitionselector.py new file mode 100644 index 00000000..6b36937c --- /dev/null +++ b/src/python/nimbusml/internal/entrypoints/_ensemblesubsetselector_randompartitionselector.py @@ -0,0 +1,32 @@ +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +RandomPartitionSelector +""" + + +from ..utils.entrypoints import Component +from ..utils.utils import try_set + + +def random_partition_selector( + feature_selector=None, + **params): + """ + **Description** + None + + :param feature_selector: The Feature selector (settings). 
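
All three subset-selector helpers share this shape: an optional nested feature-selector component dict and a fixed ``EnsembleSubsetSelector`` kind. For example, using the helper defined just below::

    subset = random_partition_selector(
        feature_selector={'Name': 'AllFeatureSelector'})
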
+ """ + + entrypoint_name = 'RandomPartitionSelector' + settings = {} + + if feature_selector is not None: + settings['FeatureSelector'] = try_set( + obj=feature_selector, none_acceptable=True, is_of_type=dict) + + component = Component( + name=entrypoint_name, + settings=settings, + kind='EnsembleSubsetSelector') + return component diff --git a/src/python/nimbusml/internal/entrypoints/trainers_ensembleclassification.py b/src/python/nimbusml/internal/entrypoints/trainers_ensembleclassification.py new file mode 100644 index 00000000..aa87e677 --- /dev/null +++ b/src/python/nimbusml/internal/entrypoints/trainers_ensembleclassification.py @@ -0,0 +1,150 @@ +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +Trainers.EnsembleClassification +""" + +import numbers + +from ..utils.entrypoints import EntryPoint +from ..utils.utils import try_set, unlist +from ._ensemblemulticlasssubmodelselector_allselectormulticlass import \ + all_selector_multi_class +from ._ensemblesubsetselector_bootstrapselector import bootstrap_selector + + +def trainers_ensembleclassification( + training_data, + predictor_model=None, + sampling_type=bootstrap_selector( + feature_selector={ + 'Name': 'AllFeatureSelector'}), + feature_column_name='Features', + num_models=None, + label_column_name='Label', + sub_model_selector_type=None, + output_combiner=None, + normalize_features='Auto', + caching='Auto', + train_parallel=False, + batch_size=-1, + show_metrics=False, + **params): + """ + **Description** + Train multiclass ensemble. + + :param training_data: The data to be used for training (inputs). + :param sampling_type: Sampling Type (inputs). + :param feature_column_name: Column to use for features (inputs). + :param num_models: Number of models per batch. If not specified, + will default to 50 if there is only one base predictor, or + the number of base predictors otherwise. (inputs). + :param label_column_name: Column to use for labels (inputs). + :param sub_model_selector_type: Algorithm to prune the base + learners for selective Ensemble (inputs). + :param output_combiner: Output combiner (inputs). + :param normalize_features: Normalize option for the feature + column (inputs). + :param caching: Whether trainer should cache input training data + (inputs). + :param train_parallel: All the base learners will run + asynchronously if the value is true (inputs). + :param batch_size: Batch size (inputs). + :param show_metrics: True, if metrics for each model need to be + evaluated and shown in comparison table. This is done by + using validation set if available or the training set + (inputs). + :param predictor_model: The trained model (outputs). 
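
A sketch of wiring this entrypoint into an experiment graph; the ``'$...'`` strings are graph variables (the code below collects any value starting with ``$`` into the input/output variable sets), not file paths::

    node = trainers_ensembleclassification(
        training_data='$training_data',
        predictor_model='$predictor_model',
        num_models=3,
        show_metrics=True)
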
+ """ + + entrypoint_name = 'Trainers.EnsembleClassification' + inputs = {} + outputs = {} + + if training_data is not None: + inputs['TrainingData'] = try_set( + obj=training_data, + none_acceptable=False, + is_of_type=str) + if sampling_type is not None: + inputs['SamplingType'] = try_set( + obj=sampling_type, + none_acceptable=True, + is_of_type=dict) + if feature_column_name is not None: + inputs['FeatureColumnName'] = try_set( + obj=feature_column_name, + none_acceptable=True, + is_of_type=str, + is_column=True) + if num_models is not None: + inputs['NumModels'] = try_set( + obj=num_models, + none_acceptable=True, + is_of_type=numbers.Real) + if label_column_name is not None: + inputs['LabelColumnName'] = try_set( + obj=label_column_name, + none_acceptable=True, + is_of_type=str, + is_column=True) + if sub_model_selector_type is not None: + inputs['SubModelSelectorType'] = try_set( + obj=sub_model_selector_type, + none_acceptable=True, + is_of_type=dict) + if output_combiner is not None: + inputs['OutputCombiner'] = try_set( + obj=output_combiner, + none_acceptable=True, + is_of_type=dict) + if normalize_features is not None: + inputs['NormalizeFeatures'] = try_set( + obj=normalize_features, + none_acceptable=True, + is_of_type=str, + values=[ + 'No', + 'Warn', + 'Auto', + 'Yes']) + if caching is not None: + inputs['Caching'] = try_set( + obj=caching, + none_acceptable=True, + is_of_type=str, + values=[ + 'Auto', + 'Memory', + 'None']) + if train_parallel is not None: + inputs['TrainParallel'] = try_set( + obj=train_parallel, + none_acceptable=True, + is_of_type=bool) + if batch_size is not None: + inputs['BatchSize'] = try_set( + obj=batch_size, + none_acceptable=True, + is_of_type=numbers.Real) + if show_metrics is not None: + inputs['ShowMetrics'] = try_set( + obj=show_metrics, + none_acceptable=True, + is_of_type=bool) + if predictor_model is not None: + outputs['PredictorModel'] = try_set( + obj=predictor_model, none_acceptable=False, is_of_type=str) + + input_variables = { + x for x in unlist(inputs.values()) + if isinstance(x, str) and x.startswith("$")} + output_variables = { + x for x in unlist(outputs.values()) + if isinstance(x, str) and x.startswith("$")} + + entrypoint = EntryPoint( + name=entrypoint_name, inputs=inputs, outputs=outputs, + input_variables=input_variables, + output_variables=output_variables) + return entrypoint diff --git a/src/python/nimbusml/internal/entrypoints/trainers_ensembleregression.py b/src/python/nimbusml/internal/entrypoints/trainers_ensembleregression.py new file mode 100644 index 00000000..e07b2ce8 --- /dev/null +++ b/src/python/nimbusml/internal/entrypoints/trainers_ensembleregression.py @@ -0,0 +1,148 @@ +# - Generated by tools/entrypoint_compiler.py: do not edit by hand +""" +Trainers.EnsembleRegression +""" + +import numbers + +from ..utils.entrypoints import EntryPoint +from ..utils.utils import try_set, unlist +from ._ensemblesubsetselector_bootstrapselector import bootstrap_selector + + +def trainers_ensembleregression( + training_data, + predictor_model=None, + sampling_type=bootstrap_selector( + feature_selector={ + 'Name': 'AllFeatureSelector'}), + feature_column_name='Features', + num_models=None, + label_column_name='Label', + sub_model_selector_type=None, + output_combiner=None, + normalize_features='Auto', + caching='Auto', + train_parallel=False, + batch_size=-1, + show_metrics=False, + **params): + """ + **Description** + Train regression ensemble. + + :param training_data: The data to be used for training (inputs). 
+ :param sampling_type: Sampling Type (inputs). + :param feature_column_name: Column to use for features (inputs). + :param num_models: Number of models per batch. If not specified, + will default to 50 if there is only one base predictor, or + the number of base predictors otherwise. (inputs). + :param label_column_name: Column to use for labels (inputs). + :param sub_model_selector_type: Algorithm to prune the base + learners for selective Ensemble (inputs). + :param output_combiner: Output combiner (inputs). + :param normalize_features: Normalize option for the feature + column (inputs). + :param caching: Whether trainer should cache input training data + (inputs). + :param train_parallel: All the base learners will run + asynchronously if the value is true (inputs). + :param batch_size: Batch size (inputs). + :param show_metrics: True, if metrics for each model need to be + evaluated and shown in comparison table. This is done by + using validation set if available or the training set + (inputs). + :param predictor_model: The trained model (outputs). + """ + + entrypoint_name = 'Trainers.EnsembleRegression' + inputs = {} + outputs = {} + + if training_data is not None: + inputs['TrainingData'] = try_set( + obj=training_data, + none_acceptable=False, + is_of_type=str) + if sampling_type is not None: + inputs['SamplingType'] = try_set( + obj=sampling_type, + none_acceptable=True, + is_of_type=dict) + if feature_column_name is not None: + inputs['FeatureColumnName'] = try_set( + obj=feature_column_name, + none_acceptable=True, + is_of_type=str, + is_column=True) + if num_models is not None: + inputs['NumModels'] = try_set( + obj=num_models, + none_acceptable=True, + is_of_type=numbers.Real) + if label_column_name is not None: + inputs['LabelColumnName'] = try_set( + obj=label_column_name, + none_acceptable=True, + is_of_type=str, + is_column=True) + if sub_model_selector_type is not None: + inputs['SubModelSelectorType'] = try_set( + obj=sub_model_selector_type, + none_acceptable=True, + is_of_type=dict) + if output_combiner is not None: + inputs['OutputCombiner'] = try_set( + obj=output_combiner, + none_acceptable=True, + is_of_type=dict) + if normalize_features is not None: + inputs['NormalizeFeatures'] = try_set( + obj=normalize_features, + none_acceptable=True, + is_of_type=str, + values=[ + 'No', + 'Warn', + 'Auto', + 'Yes']) + if caching is not None: + inputs['Caching'] = try_set( + obj=caching, + none_acceptable=True, + is_of_type=str, + values=[ + 'Auto', + 'Memory', + 'None']) + if train_parallel is not None: + inputs['TrainParallel'] = try_set( + obj=train_parallel, + none_acceptable=True, + is_of_type=bool) + if batch_size is not None: + inputs['BatchSize'] = try_set( + obj=batch_size, + none_acceptable=True, + is_of_type=numbers.Real) + if show_metrics is not None: + inputs['ShowMetrics'] = try_set( + obj=show_metrics, + none_acceptable=True, + is_of_type=bool) + if predictor_model is not None: + outputs['PredictorModel'] = try_set( + obj=predictor_model, none_acceptable=False, is_of_type=str) + + input_variables = { + x for x in unlist(inputs.values()) + if isinstance(x, str) and x.startswith("$")} + output_variables = { + x for x in unlist(outputs.values()) + if isinstance(x, str) and x.startswith("$")} + + entrypoint = EntryPoint( + name=entrypoint_name, inputs=inputs, outputs=outputs, + input_variables=input_variables, + output_variables=output_variables) + return entrypoint diff --git a/src/python/nimbusml/tests/ensemble/test_ensembleclassifier.py 
b/src/python/nimbusml/tests/ensemble/test_ensembleclassifier.py
new file mode 100644
index 00000000..53bbab22
--- /dev/null
+++ b/src/python/nimbusml/tests/ensemble/test_ensembleclassifier.py
@@ -0,0 +1,62 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+
+import platform
+import unittest
+
+import numpy as np
+import pandas as pd
+import six
+from nimbusml.datasets import get_dataset
+from nimbusml.ensemble import EnsembleClassifier
+from nimbusml.ensemble.feature_selector import RandomFeatureSelector
+from nimbusml.ensemble.output_combiner import ClassifierVoting
+from nimbusml.ensemble.subset_selector import RandomPartitionSelector
+from nimbusml.ensemble.sub_model_selector import ClassifierBestDiverseSelector
+from sklearn.model_selection import train_test_split
+from sklearn.utils.testing import assert_greater
+
+
+class TestEnsembleClassifier(unittest.TestCase):
+
+    def test_ensembleclassifier(self):
+        np.random.seed(0)
+        df = get_dataset("iris").as_df()
+        df.drop(['Species'], inplace=True, axis=1)
+
+        X_train, X_test, y_train, y_test = \
+            train_test_split(df.loc[:, df.columns != 'Label'], df['Label'])
+
+        ensemble = EnsembleClassifier(num_models=3).fit(
+            X_train, y_train, verbose=0)
+        scores = ensemble.predict(X_test)
+        scores = pd.to_numeric(scores)
+        # strip the index so the comparison with y_test is positional
+        accuracy = np.mean(y_test == [i for i in scores])
+        assert_greater(
+            accuracy,
+            0.947,
+            "accuracy should be greater than %s" %
+            0.947)
+
+        ensemble_with_options = EnsembleClassifier(
+            num_models=3,
+            sampling_type=RandomPartitionSelector(
+                feature_selector=RandomFeatureSelector(
+                    features_selection_proportion=0.7)),
+            sub_model_selector_type=ClassifierBestDiverseSelector(),
+            output_combiner=ClassifierVoting()).fit(X_train, y_train)
+
+        # score the model trained with the custom options, not the first one
+        scores = ensemble_with_options.predict(X_test)
+        scores = pd.to_numeric(scores)
+        accuracy = np.mean(y_test == [i for i in scores])
+        assert_greater(
+            accuracy,
+            0.578,
+            "accuracy should be greater than %s" %
+            0.578)
+
+
+if __name__ == '__main__':
+    unittest.main()
+
diff --git a/src/python/nimbusml/tests/ensemble/test_ensembleregressor.py b/src/python/nimbusml/tests/ensemble/test_ensembleregressor.py
new file mode 100644
index 00000000..5c61d9b2
--- /dev/null
+++ b/src/python/nimbusml/tests/ensemble/test_ensembleregressor.py
@@ -0,0 +1,55 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------------------------
+import platform
+import unittest
+
+import numpy as np
+from nimbusml.datasets import get_dataset
+from nimbusml.ensemble import EnsembleRegressor
+from nimbusml.ensemble.feature_selector import RandomFeatureSelector
+from nimbusml.ensemble.output_combiner import RegressorMedian
+from nimbusml.ensemble.subset_selector import RandomPartitionSelector
+from nimbusml.ensemble.sub_model_selector import RegressorBestDiverseSelector
+from sklearn.metrics import r2_score
+from sklearn.model_selection import train_test_split
+from sklearn.utils.testing import assert_greater, assert_less
+
+
+class TestEnsembleRegressor(unittest.TestCase):
+
+    def test_ensembleregressor(self):
+        np.random.seed(0)
+
+        df = get_dataset("airquality").as_df().fillna(0)
+        df = df[df.Ozone.notnull()]
+
+        X_train, X_test, y_train, y_test = train_test_split(
+            df.loc[:, df.columns != 'Ozone'], df['Ozone'])
+
+        # Train a model and score
+        ensemble = EnsembleRegressor(num_models=3).fit(X_train, y_train)
+        scores = ensemble.predict(X_test)
+
+        r2 = r2_score(y_test, scores)
+        assert_greater(r2, 0.12, "R-Squared should be greater than %s" % 0.12)
+        assert_less(r2, 0.13, "R-Squared should be less than %s" % 0.13)
+
+        ensemble_with_options = EnsembleRegressor(
+            num_models=3,
+            sampling_type=RandomPartitionSelector(
+                feature_selector=RandomFeatureSelector(
+                    features_selection_proportion=0.7)),
+            sub_model_selector_type=RegressorBestDiverseSelector(),
+            output_combiner=RegressorMedian()).fit(X_train, y_train)
+        scores = ensemble_with_options.predict(X_test)
+
+        r2 = r2_score(y_test, scores)
+        assert_greater(
+            r2, 0.0279, "R-Squared should be greater than %s" % 0.0279)
+        assert_less(r2, 0.03, "R-Squared should be less than %s" % 0.03)
+
+
+if __name__ == '__main__':
+    unittest.main()
+
diff --git a/src/python/nimbusml/tests/ensemble/test_lightgbmregressor.py b/src/python/nimbusml/tests/ensemble/test_lightgbmregressor.py
index e252ed7c..691307d3 100644
--- a/src/python/nimbusml/tests/ensemble/test_lightgbmregressor.py
+++ b/src/python/nimbusml/tests/ensemble/test_lightgbmregressor.py
@@ -13,7 +13,7 @@
 from sklearn.utils.testing import assert_greater, assert_less
 
 
-class TestFastLinearRegressor(unittest.TestCase):
+class TestLightGbmRegressor(unittest.TestCase):
 
     def test_lightgbmregressor(self):
         np.random.seed(0)
diff --git a/src/python/tests/test_estimator_checks.py b/src/python/tests/test_estimator_checks.py
index 6c8ef557..9cbc09d0 100644
--- a/src/python/tests/test_estimator_checks.py
+++ b/src/python/tests/test_estimator_checks.py
@@ -8,6 +8,8 @@
 import json
 import os
 
+from nimbusml.ensemble import EnsembleClassifier
+from nimbusml.ensemble import EnsembleRegressor
 from nimbusml.ensemble import LightGbmBinaryClassifier
 from nimbusml.ensemble import LightGbmClassifier
 from nimbusml.ensemble import LightGbmRanker
@@ -73,6 +75,10 @@
     # dimensional arrays, tolerance
     'FastLinearClassifier': 'check_classifiers_train',
     'FastForestRegressor': 'check_fit_score_takes_y',  # bug
+    'EnsembleClassifier': 'check_supervised_y_2d, '
+                          'check_classifiers_train',
+    'EnsembleRegressor': 'check_supervised_y_2d, '
+                         'check_regressors_train',
     # bug in decision_function
     'FastTreesBinaryClassifier': 'check_decision_proba_consistency',
@@ -181,6 +187,8 @@
     'check_classifiers_train']
 
 INSTANCES = {
+    'EnsembleClassifier': EnsembleClassifier(num_models=3),
+    'EnsembleRegressor': EnsembleRegressor(num_models=3),
     'LightGbmBinaryClassifier': LightGbmBinaryClassifier(
minimum_example_count_per_group=1, minimum_example_count_per_leaf=1), 'LightGbmClassifier': LightGbmClassifier( diff --git a/src/python/tools/code_fixer.py b/src/python/tools/code_fixer.py index 8be5a7c1..21b6d1f4 100644 --- a/src/python/tools/code_fixer.py +++ b/src/python/tools/code_fixer.py @@ -20,6 +20,17 @@ NG_1_correct = """from ...base_transform import BaseTransform from .extractor import Ngram""" +ensemble = """from ..base_predictor import BasePredictor""" +ensemble_correct = """from ..base_predictor import BasePredictor +from .subset_selector import BootstrapSelector +from .feature_selector import AllFeatureSelector""" + +diverse_selector = """from ...internal.utils.utils import trace""" +classifier_diverse_selector = """from ...internal.utils.utils import trace +from .diversity_measure import ClassifierDisagreement""" +regressor_diverse_selector = """from ...internal.utils.utils import trace +from .diversity_measure import RegressorDisagreement""" + FM = \ """import numbers from sklearn.base import ClassifierMixin @@ -91,7 +102,25 @@ 'FactorizationMachineBinaryClassifier': (FM, FM_correct), 'OneHotHashVectorizer': (OHE, OHE_correct), 'CustomStopWordsRemover': (cust_stop, cust_stop_correct), - 'PredefinedStopWordsRemover': (pred_stop, pred_stop_correct) + 'PredefinedStopWordsRemover': (pred_stop, pred_stop_correct), + 'EnsembleClassifier': [(ensemble, ensemble_correct), + ('sampling_type = bootstrap_selector', + 'sampling_type = BootstrapSelector'), + ("feature_selector = {'Name': 'AllFeatureSelector'}", + "feature_selector = AllFeatureSelector()")], + 'EnsembleRegressor': [(ensemble, ensemble_correct), + ('sampling_type = bootstrap_selector', + 'sampling_type = BootstrapSelector'), + ("feature_selector = {'Name': 'AllFeatureSelector'}", + "feature_selector = AllFeatureSelector()")], + 'ClassifierBestDiverseSelector': [(diverse_selector, + classifier_diverse_selector), + ('diversity_metric_type = None', + 'diversity_metric_type = ClassifierDisagreement()')], + 'RegressorBestDiverseSelector': [(diverse_selector, + regressor_diverse_selector), + ('diversity_metric_type = None', + 'diversity_metric_type = RegressorDisagreement()')] } diff --git a/src/python/tools/entrypoint_compiler.py b/src/python/tools/entrypoint_compiler.py index f368f385..b2765691 100644 --- a/src/python/tools/entrypoint_compiler.py +++ b/src/python/tools/entrypoint_compiler.py @@ -302,24 +302,13 @@ def write_api(entrypoint, kind="node", pkg_path=None, overwrite=False): dots = "..." if "." in class_dir: dots = "...." - class_imports = [ + imports = [ arg.get_import( prefix=( "%sentrypoints." % dots)) for arg in visible_args if arg.get_import() is not None] - class_imports = '\n'.join(class_imports) - - dots = "..." - if "." in class_dir: - dots = "...." - core_class_imports = [ - arg.get_import( - prefix=( - "%sentrypoints." 
% - dots)) for arg in visible_args if - arg.get_import() is not None] - core_class_imports = '\n'.join(core_class_imports) + imports = '\n'.join(imports) # write the class to a file py_path = module_to_path(class_dir, pkg_path) @@ -377,7 +366,7 @@ def write_api(entrypoint, kind="node", pkg_path=None, overwrite=False): class_file, class_dir, banner, - core_class_imports, + imports, class_args, core_args_map, entrypoint_args_map, @@ -1503,7 +1492,15 @@ def parse_arg(argument, inout): "BoosterParameterFunction", "ParallelLightGBM", "AutoMlEngine", - "SearchTerminator"]: + "SearchTerminator", + "EnsembleSubsetSelector", + "EnsembleFeatureSelector", + "EnsembleMulticlassSubModelSelector", + "EnsembleMulticlassDiversityMeasure", + "EnsembleMulticlassOutputCombiner", + "EnsembleRegressionSubModelSelector", + "EnsembleRegressionDiversityMeasure", + "EnsembleRegressionOutputCombiner"]: arg_obj = ComponentArg(argument, inout) elif componentKind in ["ClassificationLossFunction", "RegressionLossFunction", diff --git a/src/python/tools/manifest_diff.json b/src/python/tools/manifest_diff.json index 0ff9aedf..acff52df 100644 --- a/src/python/tools/manifest_diff.json +++ b/src/python/tools/manifest_diff.json @@ -243,6 +243,20 @@ "Predict_Proba" : true, "Decision_Function" : true }, + { + "Name": "Trainers.EnsembleClassification", + "NewName": "EnsembleClassifier", + "Module": "ensemble", + "Type": "Classifier", + "Predict_Proba" : true, + "Decision_Function" : true + }, + { + "Name": "Trainers.EnsembleRegression", + "NewName": "EnsembleRegressor", + "Module": "ensemble", + "Type": "Regressor" + }, { "Name": "Transforms.ApproximateBootstrapSampler", "NewName": "BootstrapSampler", @@ -778,6 +792,193 @@ } ] }, + { + "Kind": "EnsembleSubsetSelector", + "Components": [ + { + "Name": "AllInstanceSelector", + "NewName": "AllInstanceSelector", + "Desc": "Selects all rows for each trainer in the ensemble", + "Module": "ensemble.subset_selector", + "Type": "Component" + }, + { + "Name": "BootstrapSelector", + "NewName": "BootstrapSelector", + "Desc": "Selects a bootstrapped sample of the rows for each trainer in the ensemble", + "Module": "ensemble.subset_selector", + "Type": "Component" + }, + { + "Name": "RandomPartitionSelector", + "NewName": "RandomPartitionSelector", + "Desc": "Randomly partitions the rows for each trainer in the ensemble", + "Module": "ensemble.subset_selector", + "Type": "Component" + } + ] + }, + { + "Kind": "EnsembleFeatureSelector", + "Components": [ + { + "Name": "AllFeatureSelector", + "NewName": "AllFeatureSelector", + "Desc": "Selects all features for each trainer in the ensemble", + "Module": "ensemble.feature_selector", + "Type": "Component" + }, + { + "Name": "RandomFeatureSelector", + "NewName": "RandomFeatureSelector", + "Desc": "Selects a random subset of features for each trainer in the ensemble", + "Module": "ensemble.feature_selector", + "Type": "Component" + } + ] + }, + { + "Kind": "EnsembleMulticlassOutputCombiner", + "Components": [ + { + "Name": "MultiAverage", + "NewName": "ClassifierAverage", + "Desc": "Computes the average of the outputs of the trained models", + "Module": "ensemble.output_combiner", + "Type": "Component" + }, + { + "Name": "MultiMedian", + "NewName": "ClassifierMedian", + "Desc": "Computes the median of the outputs of the trained models", + "Module": "ensemble.output_combiner", + "Type": "Component" + }, + { + "Name": "MultiStacking", + "NewName": "ClassifierStacking", + "Desc": "Computes the output by training a model on a training set where each 
instance is a vector containing the outputs of the different models on a training instance, and the instance's label", + "Module": "ensemble.output_combiner", + "Type": "Component" + }, + { + "Name": "MultiVoting", + "NewName": "ClassifierVoting", + "Desc": "Computes the fraction of positive predictions for each class from all the trained models, and outputs the class with the largest number", + "Module": "ensemble.output_combiner", + "Type": "Component" + }, + { + "Name": "MultiWeightedAverage", + "NewName": "ClassifierWeightedAverage", + "Desc": "Computes the weighted average of the outputs of the trained models", + "Module": "ensemble.output_combiner", + "Type": "Component" + } + ] + }, + { + "Kind": "EnsembleRegressionOutputCombiner", + "Components": [ + { + "Name": "Average", + "NewName": "RegressorAverage", + "Desc": "Computes the average of the outputs of the trained models", + "Module": "ensemble.output_combiner", + "Type": "Component" + }, + { + "Name": "Median", + "NewName": "RegressorMedian", + "Desc": "Computes the median of the outputs of the trained models", + "Module": "ensemble.output_combiner", + "Type": "Component" + }, + { + "Name": "RegressionStacking", + "NewName": "RegressorStacking", + "Desc": "Computes the output by training a model on a training set where each instance is a vector containing the outputs of the different models on a training instance, and the instance's label", + "Module": "ensemble.output_combiner", + "Type": "Component" + } + ] + }, + { + "Kind": "EnsembleMulticlassSubModelSelector", + "Components": [ + { + "Name": "AllSelectorMultiClass", + "NewName": "ClassifierAllSelector", + "Desc": "Combines all the models to create the output. This is the default submodel selector.", + "Module": "ensemble.sub_model_selector", + "Type": "Component" + }, + { + "Name": "BestDiverseSelectorMultiClass", + "NewName": "ClassifierBestDiverseSelector", + "Desc": "Combines the models whose predictions are as diverse as possible.", + "Module": "ensemble.sub_model_selector", + "Type": "Component" + }, + { + "Name": "BestPerformanceSelectorMultiClass", + "NewName": "ClassifierBestPerformanceSelector", + "Desc": "Combines only the models with the best performance.", + "Module": "ensemble.sub_model_selector", + "Type": "Component" + } + ] + }, + { + "Kind": "EnsembleRegressionSubModelSelector", + "Components": [ + { + "Name": "AllSelector", + "NewName": "RegressorAllSelector", + "Desc": "Combines all the models to create the output. 
This is the default submodel selector.", + "Module": "ensemble.sub_model_selector", + "Type": "Component" + }, + { + "Name": "BestDiverseSelectorRegression", + "NewName": "RegressorBestDiverseSelector", + "Desc": "Combines the models whose predictions are as diverse as possible.", + "Module": "ensemble.sub_model_selector", + "Type": "Component" + }, + { + "Name": "BestPerformanceRegressionSelector", + "NewName": "RegressorBestPerformanceSelector", + "Desc": "Combines only the models with the best performance.", + "Module": "ensemble.sub_model_selector", + "Type": "Component" + } + ] + }, + { + "Kind": "EnsembleMulticlassDiversityMeasure", + "Components": [ + { + "Name": "MultiDisagreementDiversityMeasure", + "NewName": "ClassifierDisagreement", + "Desc": "A measure of disagreement in predictions between a pair of classifiers, averaged over all pairs", + "Module": "ensemble.sub_model_selector.diversity_measure", + "Type": "Component" + } + ] + }, + { + "Kind": "EnsembleRegressionDiversityMeasure", + "Components": [ + { + "Name": "RegressionDisagreementDiversityMeasure", + "NewName": "RegressorDisagreement", + "Desc": "A measure of absolute value of disagreement in predictions between a pair of regressors, averaged over all pairs", + "Module": "ensemble.sub_model_selector.diversity_measure", + "Type": "Component" + } + ] + }, { "Kind": "NgramExtractor", "Components": [