diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py index 60bdeedd6a144..c1cb3ddccb074 100644 --- a/python/pyspark/ml/classification.py +++ b/python/pyspark/ml/classification.py @@ -246,18 +246,55 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti "be used in the model. Supported options: auto, binomial, multinomial", typeConverter=TypeConverters.toString) + lowerBoundsOnCoefficients = Param(Params._dummy(), "lowerBoundsOnCoefficients", + "The lower bounds on coefficients if fitting under bound " + "constrained optimization. The bound matrix must be " + "compatible with the shape " + "(1, number of features) for binomial regression, or " + "(number of classes, number of features) " + "for multinomial regression.", + typeConverter=TypeConverters.toMatrix) + + upperBoundsOnCoefficients = Param(Params._dummy(), "upperBoundsOnCoefficients", + "The upper bounds on coefficients if fitting under bound " + "constrained optimization. The bound matrix must be " + "compatible with the shape " + "(1, number of features) for binomial regression, or " + "(number of classes, number of features) " + "for multinomial regression.", + typeConverter=TypeConverters.toMatrix) + + lowerBoundsOnIntercepts = Param(Params._dummy(), "lowerBoundsOnIntercepts", + "The lower bounds on intercepts if fitting under bound " + "constrained optimization. The bounds vector size must be " + "equal with 1 for binomial regression, or the number of " + "classes for multinomial regression.", + typeConverter=TypeConverters.toVector) + + upperBoundsOnIntercepts = Param(Params._dummy(), "upperBoundsOnIntercepts", + "The upper bounds on intercepts if fitting under bound " + "constrained optimization. The bound vector size must be "
+ "equal with 1 for binomial regression, or the number of " + "classes for multinomial regression.", + typeConverter=TypeConverters.toVector) + @keyword_only def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, threshold=0.5, thresholds=None, probabilityCol="probability", rawPredictionCol="rawPrediction", standardization=True, weightCol=None, - aggregationDepth=2, family="auto"): + aggregationDepth=2, family="auto", + lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None, + lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None): + """ __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \ threshold=0.5, thresholds=None, probabilityCol="probability", \ rawPredictionCol="rawPrediction", standardization=True, weightCol=None, \ - aggregationDepth=2, family="auto") + aggregationDepth=2, family="auto", \ + lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None, \ + lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None) If the threshold and thresholds Params are both set, they must be equivalent. 
""" super(LogisticRegression, self).__init__() @@ -274,13 +311,17 @@ def setParams(self, featuresCol="features", labelCol="label", predictionCol="pre maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, threshold=0.5, thresholds=None, probabilityCol="probability", rawPredictionCol="rawPrediction", standardization=True, weightCol=None, - aggregationDepth=2, family="auto"): + aggregationDepth=2, family="auto", + lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None, + lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None): """ setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \ maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \ threshold=0.5, thresholds=None, probabilityCol="probability", \ rawPredictionCol="rawPrediction", standardization=True, weightCol=None, \ - aggregationDepth=2, family="auto") + aggregationDepth=2, family="auto", \ + lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None, \ + lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None): Sets params for logistic regression. If the threshold and thresholds Params are both set, they must be equivalent. 
""" @@ -375,6 +416,48 @@ def getFamily(self): """ return self.getOrDefault(self.family) + @since("2.3.0") + def setLowerBoundsOnCoefficients(self, value): + """ + Sets the value of :py:attr:`lowerBoundsOnCoefficients` + """ + return self._set(lowerBoundsOnCoefficients=value) + + @since("2.3.0") + def getLowerBoundsOnCoefficients(self): + """ + Gets the value of :py:attr:`lowerBoundsOnCoefficients` + """ + return self.getOrDefault(self.lowerBoundsOnCoefficients) + + @since("2.3.0") + def setUpperBoundsOnCoefficients(self, value): + """ + Sets the value of :py:attr:`upperBoundsOnCoefficients` + """ + return self._set(upperBoundsOnCoefficients=value) + + @since("2.3.0") + def getUpperBoundsOnCoefficients(self): + """ + Gets the value of :py:attr:`upperBoundsOnCoefficients` + """ + return self.getOrDefault(self.upperBoundsOnCoefficients) + + @since("2.3.0") + def setLowerBoundsOnIntercepts(self, value): + """ + Sets the value of :py:attr:`lowerBoundsOnIntercepts` + """ + return self._set(lowerBoundsOnIntercepts=value) + + @since("2.3.0") + def getLowerBoundsOnIntercepts(self): + """ + Gets the value of :py:attr:`lowerBoundsOnIntercepts` + """ + return self.getOrDefault(self.lowerBoundsOnIntercepts) + class LogisticRegressionModel(JavaModel, JavaClassificationModel, JavaMLWritable, JavaMLReadable): """ diff --git a/python/pyspark/ml/param/__init__.py b/python/pyspark/ml/param/__init__.py index 99d8fa3a5b73e..d1ccd3354df68 100644 --- a/python/pyspark/ml/param/__init__.py +++ b/python/pyspark/ml/param/__init__.py @@ -27,7 +27,7 @@ from py4j.java_gateway import JavaObject -from pyspark.ml.linalg import DenseVector, Vector +from pyspark.ml.linalg import DenseVector, Vector, Matrix from pyspark.ml.util import Identifiable @@ -169,6 +169,15 @@ def toVector(value): return DenseVector(value) raise TypeError("Could not convert %s to vector" % value) + @staticmethod + def toMatrix(value): + """ + Convert a value to ML Matrix, if possible + """ + if isinstance(value, Matrix): + 
return value + raise TypeError("Could not convert %s to Matrix" % value) + @staticmethod def toFloat(value): """ diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index 17a39472e1fe5..40dcd66621357 100755 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -71,6 +71,34 @@ ser = PickleSerializer() +def generate_multinomial_logistic_input( + weights, x_mean, x_variance, add_intercept, n_points, seed=None): + """Creates multinomial logistic dataset""" + + if seed: + np.random.seed(seed) + n_features = x_mean.shape[0] + + x = np.random.randn(n_points, n_features) + x = x * np.sqrt(x_variance) + x_mean + + if add_intercept: + x = np.hstack([x, np.ones((n_points, 1))]) + + # Compute margins + margins = np.hstack([np.zeros((n_points, 1)), x.dot(weights.T)]) + # Shift to avoid overflow and compute probs + probs = np.exp(np.subtract(margins, margins.max(axis=1).reshape(n_points, -1))) + # Compute cumulative prob + cum_probs = np.cumsum(probs / probs.sum(axis=1).reshape(n_points, -1), axis=1) + # Assign class + classes = np.apply_along_axis( + lambda x: np.searchsorted(cum_probs[1, ], np.random.random()), + axis=1, arr=cum_probs) + return [(float(label), DenseVector(features)) + for (label, features) in list(zip(classes, x[:, :-int(add_intercept)]))] + + class MLlibTestCase(unittest.TestCase): def setUp(self): self.sc = SparkContext('local[4]', "MLlib tests") @@ -832,6 +860,96 @@ def test_logistic_regression(self): except OSError: pass + def logistic_regression_check_thresholds(self): + self.assertIsInstance( + LogisticRegression(threshold=0.5, thresholds=[0.5, 0.5]), + LogisticRegressionModel + ) + + self.assertRaisesRegexp( + ValueError, + "Logistic Regression getThreshold found inconsistent.*$", + LogisticRegression, threshold=0.42, thresholds=[0.5, 0.5] + ) + + def test_binomial_logistic_regression_bounds(self): + x_mean = np.array([5.843, 3.057, 3.758, 1.199]) + x_variance = np.array([0.6856, 0.1899, 3.116, 0.581]) + + coefficients = 
np.array( + [[-0.57997, 0.912083, -0.371077, -0.819866, 2.688191]] + ) + + dataset = self.spark.createDataFrame( + generate_multinomial_logistic_input( + coefficients, x_mean, x_variance, True, 1000), + ["label", "features"] + ) + + lower_bounds_on_coefficients = Matrices.dense(1, 4, [0.0, -1.0, 0.0, -1.0]) + upper_bounds_on_coefficients = Matrices.dense(1, 4, [0.0, 1.0, 1.0, 0.0]) + lower_bounds_on_intercepts = Vectors.dense([0.0]) + upper_bounds_on_intercepts = Vectors.dense([1.0]) + + lr = LogisticRegression( + standardization=True, fitIntercept=True, + lowerBoundsOnCoefficients=lower_bounds_on_coefficients, + upperBoundsOnCoefficients=upper_bounds_on_coefficients, + lowerBoundsOnIntercepts=lower_bounds_on_intercepts, + upperBoundsOnIntercepts=upper_bounds_on_intercepts + ) + + lrm = lr.fit(dataset) + + self.assertIsInstance(lrm, LogisticRegressionModel) + self.assertTrue(np.all( + lower_bounds_on_coefficients.toArray() <= lrm.coefficientMatrix.toArray() + )) + + self.assertTrue(np.all( + lrm.coefficientMatrix.toArray() <= upper_bounds_on_coefficients.toArray() + )) + + def test_multinomial_regression_bounds(self): + x_mean = np.array([5.843, 3.057, 3.758, 1.199]) + x_variance = np.array([0.6856, 0.1899, 3.116, 0.581]) + + coefficients = np.array([ + [-0.57997, 0.912083, -0.371077, -0.819866, 2.688191], + [-0.16624, -0.84355, -0.048509, -0.301789, 4.170682] + ]) + + dataset = self.spark.createDataFrame( + generate_multinomial_logistic_input( + coefficients, x_mean, x_variance, True, 1000), + ["label", "features"] + ) + + lower_bounds_on_coefficients = Matrices.dense(3, 4, np.repeat(-10.0, 12)) + upper_bounds_on_coefficients = Matrices.dense(3, 4, np.repeat(10.0, 12)) + lower_bounds_on_intercepts = Vectors.dense(np.repeat(-3.0, 3)) + upper_bounds_on_intercepts = Vectors.dense(np.repeat(3.0, 3)) + + lr = LogisticRegression( + standardization=True, fitIntercept=True, + lowerBoundsOnCoefficients=lower_bounds_on_coefficients, + 
upperBoundsOnCoefficients=upper_bounds_on_coefficients, + lowerBoundsOnIntercepts=lower_bounds_on_intercepts, + upperBoundsOnIntercepts=upper_bounds_on_intercepts + ) + + lrm = lr.fit(dataset) + + self.assertIsInstance(lrm, LogisticRegressionModel) + + self.assertTrue(np.all( + lower_bounds_on_coefficients.toArray() <= lrm.coefficientMatrix.toArray() + )) + + self.assertTrue(np.all( + lrm.coefficientMatrix.toArray() <= upper_bounds_on_coefficients.toArray() + )) + def _compare_params(self, m1, m2, param): """ Compare 2 ML Params instances for the given param, and assert both have the same param value