-
Notifications
You must be signed in to change notification settings - Fork 29.1k
[SPARK-20601][PYTHON][ML] Python API Changes for Constrained Logistic Regression Params #17922
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -246,18 +246,55 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti | |
| "be used in the model. Supported options: auto, binomial, multinomial", | ||
| typeConverter=TypeConverters.toString) | ||
|
|
||
| lowerBoundsOnCoefficients = Param(Params._dummy(), "lowerBoundsOnCoefficients", | ||
| "The lower bounds on coefficients if fitting under bound " | ||
| "constrained optimization. The bound matrix must be " | ||
| "compatible with the shape " | ||
| "(1, number of features) for binomial regression, or " | ||
| "(number of classes, number of features) " | ||
| "for multinomial regression.", | ||
| typeConverter=TypeConverters.toMatrix) | ||
|
|
||
| upperBoundsOnCoefficients = Param(Params._dummy(), "upperBoundsOnCoefficients", | ||
| "The upper bounds on coefficients if fitting under bound " | ||
| "constrained optimization. The bound matrix must be " | ||
| "compatible with the shape " | ||
| "(1, number of features) for binomial regression, or " | ||
| "(number of classes, number of features) " | ||
| "for multinomial regression.", | ||
| typeConverter=TypeConverters.toMatrix) | ||
|
|
||
| lowerBoundsOnIntercepts = Param(Params._dummy(), "lowerBoundsOnIntercepts", | ||
| "The lower bounds on intercepts if fitting under bound " | ||
| "constrained optimization. The bounds vector size must be" | ||
| "equal with 1 for binomial regression, or the number of" | ||
| "classes for multinomial regression.", | ||
| typeConverter=TypeConverters.toVector) | ||
|
|
||
| upperBoundsOnIntercepts = Param(Params._dummy(), "upperBoundsOnIntercepts", | ||
| "The upper bounds on intercepts if fitting under bound " | ||
| "constrained optimization. The bound vector size must be " | ||
| "equal with 1 for binomial regression, or the number of " | ||
| "classes for multinomial regression.", | ||
| typeConverter=TypeConverters.toVector) | ||
|
|
||
| @keyword_only | ||
| def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", | ||
| maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, | ||
| threshold=0.5, thresholds=None, probabilityCol="probability", | ||
| rawPredictionCol="rawPrediction", standardization=True, weightCol=None, | ||
| aggregationDepth=2, family="auto"): | ||
| aggregationDepth=2, family="auto", | ||
| lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None, | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. should fill up the previous line before starting another, here and below |
||
| lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None): | ||
|
|
||
| """ | ||
| __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ | ||
| maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \ | ||
| threshold=0.5, thresholds=None, probabilityCol="probability", \ | ||
| rawPredictionCol="rawPrediction", standardization=True, weightCol=None, \ | ||
| aggregationDepth=2, family="auto") | ||
| aggregationDepth=2, family="auto", \ | ||
| lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None, \ | ||
| lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None): | ||
| If the threshold and thresholds Params are both set, they must be equivalent. | ||
| """ | ||
| super(LogisticRegression, self).__init__() | ||
|
|
@@ -274,13 +311,17 @@ def setParams(self, featuresCol="features", labelCol="label", predictionCol="pre | |
| maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, | ||
| threshold=0.5, thresholds=None, probabilityCol="probability", | ||
| rawPredictionCol="rawPrediction", standardization=True, weightCol=None, | ||
| aggregationDepth=2, family="auto"): | ||
| aggregationDepth=2, family="auto", | ||
| lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None, | ||
| lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None): | ||
| """ | ||
| setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ | ||
| maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \ | ||
| threshold=0.5, thresholds=None, probabilityCol="probability", \ | ||
| rawPredictionCol="rawPrediction", standardization=True, weightCol=None, \ | ||
| aggregationDepth=2, family="auto") | ||
| aggregationDepth=2, family="auto", \ | ||
| lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None, \ | ||
| lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None): | ||
| Sets params for logistic regression. | ||
| If the threshold and thresholds Params are both set, they must be equivalent. | ||
| """ | ||
|
|
@@ -375,6 +416,48 @@ def getFamily(self): | |
| """ | ||
| return self.getOrDefault(self.family) | ||
|
|
||
| @since("2.3.0") | ||
| def setLowerBoundsOnCoefficients(self, value): | ||
| """ | ||
| Sets the value of :py:attr:`lowerBoundsOnCoefficients` | ||
| """ | ||
| return self._set(lowerBoundsOnCoefficients=value) | ||
|
|
||
| @since("2.3.0") | ||
| def getLowerBoundsOnCoefficients(self): | ||
| """ | ||
| Gets the value of :py:attr:`lowerBoundsOnCoefficients` | ||
| """ | ||
| return self.getOrDefault(self.lowerBoundsOnCoefficients) | ||
|
|
||
| @since("2.3.0") | ||
| def setUpperBoundsOnCoefficients(self, value): | ||
| """ | ||
| Sets the value of :py:attr:`upperBoundsOnCoefficients` | ||
| """ | ||
| return self._set(upperBoundsOnCoefficients=value) | ||
|
|
||
| @since("2.3.0") | ||
| def getUpperBoundsOnCoefficients(self): | ||
| """ | ||
| Gets the value of :py:attr:`upperBoundsOnCoefficients` | ||
| """ | ||
| return self.getOrDefault(self.upperBoundsOnCoefficients) | ||
|
|
||
| @since("2.3.0") | ||
| def setLowerBoundsOnIntercepts(self, value): | ||
| """ | ||
| Sets the value of :py:attr:`lowerBoundsOnIntercepts` | ||
| """ | ||
| return self._set(lowerBoundsOnIntercepts=value) | ||
|
|
||
| @since("2.3.0") | ||
| def getLowerBoundsOnIntercepts(self): | ||
| """ | ||
| Gets the value of :py:attr:`lowerBoundsOnIntercepts` | ||
| """ | ||
| return self.getOrDefault(self.lowerBoundsOnIntercepts) | ||
|
|
||
|
|
||
| class LogisticRegressionModel(JavaModel, JavaClassificationModel, JavaMLWritable, JavaMLReadable): | ||
| """ | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -27,7 +27,7 @@ | |
|
|
||
| from py4j.java_gateway import JavaObject | ||
|
|
||
| from pyspark.ml.linalg import DenseVector, Vector | ||
| from pyspark.ml.linalg import DenseVector, Vector, Matrix | ||
| from pyspark.ml.util import Identifiable | ||
|
|
||
|
|
||
|
|
@@ -169,6 +169,15 @@ def toVector(value): | |
| return DenseVector(value) | ||
| raise TypeError("Could not convert %s to vector" % value) | ||
|
|
||
| @staticmethod | ||
| def toMatrix(value): | ||
| """ | ||
| Convert a value to ML Matrix, if possible | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. While I am aware of this, distinction between
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is not a big issue, but you can still refer to the clarification in the MLlib user guide to get the convention in MLlib. |
||
| """ | ||
| if isinstance(value, Matrix): | ||
| return value | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this method really necessary? It's not actually converting anything, just checking the type. If this wasn't here and the user put something other than a Matrix, what error would be raised? |
||
| raise TypeError("Could not convert %s to Matrix" % value) | ||
|
|
||
| @staticmethod | ||
| def toFloat(value): | ||
| """ | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -71,6 +71,34 @@ | |
| ser = PickleSerializer() | ||
|
|
||
|
|
||
| def generate_multinomial_logistic_input( | ||
| weights, x_mean, x_variance, add_intercept, n_points, seed=None): | ||
| """Creates multinomial logistic dataset""" | ||
|
|
||
| if seed: | ||
| np.random.seed(seed) | ||
| n_features = x_mean.shape[0] | ||
|
|
||
| x = np.random.randn(n_points, n_features) | ||
| x = x * np.sqrt(x_variance) + x_mean | ||
|
|
||
| if add_intercept: | ||
| x = np.hstack([x, np.ones((n_points, 1))]) | ||
|
|
||
| # Compute margins | ||
| margins = np.hstack([np.zeros((n_points, 1)), x.dot(weights.T)]) | ||
| # Shift to avoid overflow and compute probs | ||
| probs = np.exp(np.subtract(margins, margins.max(axis=1).reshape(n_points, -1))) | ||
| # Compute cumulative prob | ||
| cum_probs = np.cumsum(probs / probs.sum(axis=1).reshape(n_points, -1), axis=1) | ||
| # Assign class | ||
| classes = np.apply_along_axis( | ||
| lambda x: np.searchsorted(cum_probs[1, ], np.random.random()), | ||
| axis=1, arr=cum_probs) | ||
| return [(float(label), DenseVector(features)) | ||
| for (label, features) in list(zip(classes, x[:, :-int(add_intercept)]))] | ||
|
|
||
|
|
||
| class MLlibTestCase(unittest.TestCase): | ||
| def setUp(self): | ||
| self.sc = SparkContext('local[4]', "MLlib tests") | ||
|
|
@@ -832,6 +860,96 @@ def test_logistic_regression(self): | |
| except OSError: | ||
| pass | ||
|
|
||
| def logistic_regression_check_thresholds(self): | ||
| self.assertIsInstance( | ||
| LogisticRegression(threshold=0.5, thresholds=[0.5, 0.5]), | ||
| LogisticRegressionModel | ||
| ) | ||
|
|
||
| self.assertRaisesRegexp( | ||
| ValueError, | ||
| "Logistic Regression getThreshold found inconsistent.*$", | ||
| LogisticRegression, threshold=0.42, thresholds=[0.5, 0.5] | ||
| ) | ||
|
|
||
| def test_binomial_logistic_regression_bounds(self): | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @zero323 We usually run PySpark MLlib test with loading a dataset from
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Example datasets are not that good for checking constraints, and a generator seems like a better idea than creating a large enough example by hand. I can of course remove it, if this is an issue.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. For PySpark, we should only check that the output is consistent with Scala. The most straightforward way for this test should be loading data directly and running constrained LR on it: This will make the test case simple and time-saving. Thanks.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I agree this is probably overkill for testing this. The functionality is already in Scala and should be tested there, here in python we are just setting the parameters. |
||
| x_mean = np.array([5.843, 3.057, 3.758, 1.199]) | ||
| x_variance = np.array([0.6856, 0.1899, 3.116, 0.581]) | ||
|
|
||
| coefficients = np.array( | ||
| [[-0.57997, 0.912083, -0.371077, -0.819866, 2.688191]] | ||
| ) | ||
|
|
||
| dataset = self.spark.createDataFrame( | ||
| generate_multinomial_logistic_input( | ||
| coefficients, x_mean, x_variance, True, 1000), | ||
| ["label", "features"] | ||
| ) | ||
|
|
||
| lower_bounds_on_coefficients = Matrices.dense(1, 4, [0.0, -1.0, 0.0, -1.0]) | ||
| upper_bounds_on_coefficients = Matrices.dense(1, 4, [0.0, 1.0, 1.0, 0.0]) | ||
| lower_bounds_on_intercepts = Vectors.dense([0.0]) | ||
| upper_bounds_on_intercepts = Vectors.dense([1.0]) | ||
|
|
||
| lr = LogisticRegression( | ||
| standardization=True, fitIntercept=True, | ||
| lowerBoundsOnCoefficients=lower_bounds_on_coefficients, | ||
| upperBoundsOnCoefficients=upper_bounds_on_coefficients, | ||
| lowerBoundsOnIntercepts=lower_bounds_on_intercepts, | ||
| upperBoundsOnIntercepts=upper_bounds_on_intercepts | ||
| ) | ||
|
|
||
| lrm = lr.fit(dataset) | ||
|
|
||
| self.assertIsInstance(lrm, LogisticRegressionModel) | ||
| self.assertTrue(np.all( | ||
| lower_bounds_on_coefficients.toArray() <= lrm.coefficientMatrix.toArray() | ||
| )) | ||
|
|
||
| self.assertTrue(np.all( | ||
| lrm.coefficientMatrix.toArray() <= upper_bounds_on_coefficients.toArray() | ||
| )) | ||
|
|
||
| def test_multinomial_regression_bounds(self): | ||
| x_mean = np.array([5.843, 3.057, 3.758, 1.199]) | ||
| x_variance = np.array([0.6856, 0.1899, 3.116, 0.581]) | ||
|
|
||
| coefficients = np.array([ | ||
| [-0.57997, 0.912083, -0.371077, -0.819866, 2.688191], | ||
| [-0.16624, -0.84355, -0.048509, -0.301789, 4.170682] | ||
| ]) | ||
|
|
||
| dataset = self.spark.createDataFrame( | ||
| generate_multinomial_logistic_input( | ||
| coefficients, x_mean, x_variance, True, 1000), | ||
| ["label", "features"] | ||
| ) | ||
|
|
||
| lower_bounds_on_coefficients = Matrices.dense(3, 4, np.repeat(-10.0, 12)) | ||
| upper_bounds_on_coefficients = Matrices.dense(3, 4, np.repeat(10.0, 12)) | ||
| lower_bounds_on_intercepts = Vectors.dense(np.repeat(-3.0, 3)) | ||
| upper_bounds_on_intercepts = Vectors.dense(np.repeat(3.0, 3)) | ||
|
|
||
| lr = LogisticRegression( | ||
| standardization=True, fitIntercept=True, | ||
| lowerBoundsOnCoefficients=lower_bounds_on_coefficients, | ||
| upperBoundsOnCoefficients=upper_bounds_on_coefficients, | ||
| lowerBoundsOnIntercepts=lower_bounds_on_intercepts, | ||
| upperBoundsOnIntercepts=upper_bounds_on_intercepts | ||
| ) | ||
|
|
||
| lrm = lr.fit(dataset) | ||
|
|
||
| self.assertIsInstance(lrm, LogisticRegressionModel) | ||
|
|
||
| self.assertTrue(np.all( | ||
| lower_bounds_on_coefficients.toArray() <= lrm.coefficientMatrix.toArray() | ||
| )) | ||
|
|
||
| self.assertTrue(np.all( | ||
| lrm.coefficientMatrix.toArray() <= upper_bounds_on_coefficients.toArray() | ||
| )) | ||
|
|
||
| def _compare_params(self, m1, m2, param): | ||
| """ | ||
| Compare 2 ML Params instances for the given param, and assert both have the same param value | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think you can condense this and the above text blocks some more