91 changes: 87 additions & 4 deletions python/pyspark/ml/classification.py
@@ -246,18 +246,55 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti
"be used in the model. Supported options: auto, binomial, multinomial",
typeConverter=TypeConverters.toString)

lowerBoundsOnCoefficients = Param(Params._dummy(), "lowerBoundsOnCoefficients",
"The lower bounds on coefficients if fitting under bound "
"constrained optimization. The bound matrix must be "
"compatible with the shape "
"(1, number of features) for binomial regression, or "
"(number of classes, number of features) "
"for multinomial regression.",
typeConverter=TypeConverters.toMatrix)

upperBoundsOnCoefficients = Param(Params._dummy(), "upperBoundsOnCoefficients",
"The upper bounds on coefficients if fitting under bound "
"constrained optimization. The bound matrix must be "
"compatible with the shape "
"(1, number of features) for binomial regression, or "
"(number of classes, number of features) "
"for multinomial regression.",
typeConverter=TypeConverters.toMatrix)
Member
I think you can condense this and the above text blocks some more


    lowerBoundsOnIntercepts = Param(Params._dummy(), "lowerBoundsOnIntercepts",
                                    "The lower bounds on intercepts if fitting under bound "
                                    "constrained optimization. The bounds vector size must be "
                                    "equal with 1 for binomial regression, or the number of "
                                    "classes for multinomial regression.",
                                    typeConverter=TypeConverters.toVector)

upperBoundsOnIntercepts = Param(Params._dummy(), "upperBoundsOnIntercepts",
"The upper bounds on intercepts if fitting under bound "
"constrained optimization. The bound vector size must be "
"equal with 1 for binomial regression, or the number of "
"classes for multinomial regression.",
typeConverter=TypeConverters.toVector)
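As a concrete illustration of the shapes these Param docstrings describe, a minimal sketch (the feature and class counts below are assumptions for the example, not values from this patch):

from pyspark.ml.linalg import Matrices, Vectors

# Binomial case with 3 features: coefficient bounds are a 1 x 3 matrix,
# intercept bounds are a length-1 vector.
binomial_lower_coef = Matrices.dense(1, 3, [0.0, -1.0, 0.0])
binomial_lower_intercept = Vectors.dense([0.0])

# Multinomial case with 4 classes and 3 features: coefficient bounds are
# a 4 x 3 matrix, intercept bounds are a length-4 vector.
multinomial_upper_coef = Matrices.dense(4, 3, [1.0] * 12)
multinomial_upper_intercept = Vectors.dense([10.0] * 4)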

@keyword_only
def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True,
threshold=0.5, thresholds=None, probabilityCol="probability",
rawPredictionCol="rawPrediction", standardization=True, weightCol=None,
aggregationDepth=2, family="auto"):
aggregationDepth=2, family="auto",
lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None,
Member
should fill up the previous line before starting another, here and below

lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None):

"""
__init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \
threshold=0.5, thresholds=None, probabilityCol="probability", \
rawPredictionCol="rawPrediction", standardization=True, weightCol=None, \
aggregationDepth=2, family="auto")
aggregationDepth=2, family="auto", \
lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None, \
                 lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None)
If the threshold and thresholds Params are both set, they must be equivalent.
"""
super(LogisticRegression, self).__init__()
@@ -274,13 +311,17 @@ def setParams(self, featuresCol="features", labelCol="label", predictionCol="pre
maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True,
threshold=0.5, thresholds=None, probabilityCol="probability",
rawPredictionCol="rawPrediction", standardization=True, weightCol=None,
aggregationDepth=2, family="auto"):
aggregationDepth=2, family="auto",
lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None,
lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None):
"""
setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
maxIter=100, regParam=0.0, elasticNetParam=0.0, tol=1e-6, fitIntercept=True, \
threshold=0.5, thresholds=None, probabilityCol="probability", \
rawPredictionCol="rawPrediction", standardization=True, weightCol=None, \
aggregationDepth=2, family="auto")
aggregationDepth=2, family="auto", \
lowerBoundsOnCoefficients=None, upperBoundsOnCoefficients=None, \
                  lowerBoundsOnIntercepts=None, upperBoundsOnIntercepts=None)
Sets params for logistic regression.
If the threshold and thresholds Params are both set, they must be equivalent.
"""
@@ -375,6 +416,48 @@ def getFamily(self):
"""
return self.getOrDefault(self.family)

@since("2.3.0")
def setLowerBoundsOnCoefficients(self, value):
"""
Sets the value of :py:attr:`lowerBoundsOnCoefficients`
"""
return self._set(lowerBoundsOnCoefficients=value)

@since("2.3.0")
def getLowerBoundsOnCoefficients(self):
"""
Gets the value of :py:attr:`lowerBoundsOnCoefficients`
"""
return self.getOrDefault(self.lowerBoundsOnCoefficients)

@since("2.3.0")
def setUpperBoundsOnCoefficients(self, value):
"""
Sets the value of :py:attr:`upperBoundsOnCoefficients`
"""
return self._set(upperBoundsOnCoefficients=value)

@since("2.3.0")
def getUpperBoundsOnCoefficients(self):
"""
Gets the value of :py:attr:`upperBoundsOnCoefficients`
"""
return self.getOrDefault(self.upperBoundsOnCoefficients)

@since("2.3.0")
def setLowerBoundsOnIntercepts(self, value):
"""
Sets the value of :py:attr:`lowerBoundsOnIntercepts`
"""
return self._set(lowerBoundsOnIntercepts=value)

@since("2.3.0")
def getLowerBoundsOnIntercepts(self):
"""
Gets the value of :py:attr:`lowerBoundsOnIntercepts`
"""
return self.getOrDefault(self.lowerBoundsOnIntercepts)
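These accessors follow the usual Params pattern; a quick sketch with illustrative values (setters return self, so calls chain):

from pyspark.ml.linalg import Matrices

lr = LogisticRegression()
lr.setLowerBoundsOnCoefficients(Matrices.dense(1, 3, [0.0, -1.0, 0.0])) \
  .setUpperBoundsOnCoefficients(Matrices.dense(1, 3, [1.0, 0.0, 1.0]))
assert lr.getLowerBoundsOnCoefficients().numRows == 1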


class LogisticRegressionModel(JavaModel, JavaClassificationModel, JavaMLWritable, JavaMLReadable):
"""
11 changes: 10 additions & 1 deletion python/pyspark/ml/param/__init__.py
@@ -27,7 +27,7 @@

from py4j.java_gateway import JavaObject

from pyspark.ml.linalg import DenseVector, Vector
from pyspark.ml.linalg import DenseVector, Vector, Matrix
from pyspark.ml.util import Identifiable


@@ -169,6 +169,15 @@ def toVector(value):
return DenseVector(value)
raise TypeError("Could not convert %s to vector" % value)

@staticmethod
def toMatrix(value):
"""
Convert a value to ML Matrix, if possible
Contributor
ML -> MLlib; MLlib is the only official name for both the spark.mllib and spark.ml packages.

Member Author
While I am aware of this, the distinction between ml.linalg and mllib.linalg is a common source of confusion for PySpark users. Of course we could be more forgiving and automatically convert objects to the required class.

Contributor
This is not a big issue, but you can still refer to the clarification in the MLlib user guide for the naming convention in MLlib.
"""
if isinstance(value, Matrix):
return value
Member
Is this method really necessary? It's not actually converting anything, just checking the type. If this wasn't here and the user put something other than a Matrix, what error would be raised?

raise TypeError("Could not convert %s to Matrix" % value)
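On the reviewer's question above: as written, the converter passes Matrix instances through unchanged and rejects anything else with a TypeError at set time. A small sketch of that behavior (the list argument is just an illustration):

from pyspark.ml.linalg import Matrices
from pyspark.ml.param import TypeConverters

m = Matrices.dense(1, 2, [0.0, 1.0])
assert TypeConverters.toMatrix(m) is m      # a Matrix passes through unchanged

try:
    TypeConverters.toMatrix([[0.0, 1.0]])   # anything else is rejected, not converted
except TypeError as e:
    print(e)                                # Could not convert [[0.0, 1.0]] to Matrix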

@staticmethod
def toFloat(value):
"""
118 changes: 118 additions & 0 deletions python/pyspark/ml/tests.py
@@ -71,6 +71,34 @@
ser = PickleSerializer()


def generate_multinomial_logistic_input(
weights, x_mean, x_variance, add_intercept, n_points, seed=None):
"""Creates multinomial logistic dataset"""

if seed:
np.random.seed(seed)
n_features = x_mean.shape[0]

x = np.random.randn(n_points, n_features)
x = x * np.sqrt(x_variance) + x_mean

if add_intercept:
x = np.hstack([x, np.ones((n_points, 1))])

# Compute margins
margins = np.hstack([np.zeros((n_points, 1)), x.dot(weights.T)])
# Shift to avoid overflow and compute probs
probs = np.exp(np.subtract(margins, margins.max(axis=1).reshape(n_points, -1)))
# Compute cumulative prob
cum_probs = np.cumsum(probs / probs.sum(axis=1).reshape(n_points, -1), axis=1)
# Assign class
    classes = np.apply_along_axis(
        lambda row: np.searchsorted(row, np.random.random()),
        axis=1, arr=cum_probs)
    # Drop the synthetic intercept column (if present) before emitting feature vectors
    return [(float(label), DenseVector(features))
            for (label, features) in zip(classes, x[:, :-1] if add_intercept else x)]
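A minimal usage sketch for the generator above (the weights and moments are arbitrary example values, not taken from the tests):

import numpy as np

# two non-baseline classes x (2 features + intercept) -> a 3-class problem
weights = np.array([[1.0, -0.5, 0.3],
                    [-0.7, 0.2, 1.1]])
rows = generate_multinomial_logistic_input(
    weights, np.array([0.0, 0.0]), np.array([1.0, 1.0]),
    add_intercept=True, n_points=5, seed=42)
# each element is (label, DenseVector of the 2 original features)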


class MLlibTestCase(unittest.TestCase):
def setUp(self):
self.sc = SparkContext('local[4]', "MLlib tests")
@@ -832,6 +860,96 @@ def test_logistic_regression(self):
except OSError:
pass

    def test_logistic_regression_check_thresholds(self):
        self.assertIsInstance(
            LogisticRegression(threshold=0.5, thresholds=[0.5, 0.5]),
            LogisticRegression
        )

self.assertRaisesRegexp(
ValueError,
"Logistic Regression getThreshold found inconsistent.*$",
LogisticRegression, threshold=0.42, thresholds=[0.5, 0.5]
)

def test_binomial_logistic_regression_bounds(self):
Contributor
@zero323 We usually run PySpark MLlib tests by loading a dataset from data/mllib/ or by manually generating a dummy/hard-coded dataset, rather than rewriting the same test case as Scala. We keep PySpark tests as simple as possible. You can refer to this test case. Thanks.

Member Author
Example datasets are not that good for checking constraints, and a generator seems like a better idea than creating a large enough example by hand. I can of course remove it if this is an issue.

Contributor (@yanboliang, Jun 23, 2017)
For PySpark, we should only check that the output is consistent with Scala. The most straightforward way for this test is to load the data directly and run constrained LR on it:

data_path = "data/mllib/sample_multiclass_classification_data.txt"
df = spark.read.format("libsvm").load(data_path)
......

This will make the test case simple and time-saving. Thanks.

Member
I agree this is probably overkill for testing this. The functionality is already in Scala and should be tested there; here in Python we are just setting the parameters.
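For reference, a hedged sketch of the simpler test shape the reviewers suggest, combining the libsvm load above with the bound parameters added in this patch (the specific bounds and maxIter are illustrative only; Matrices and Vectors come from pyspark.ml.linalg as used elsewhere in these tests):

data_path = "data/mllib/sample_multiclass_classification_data.txt"
df = spark.read.format("libsvm").load(data_path)

# The sample file has 4 features and 3 classes, so coefficient bounds are a
# 3 x 4 matrix and intercept bounds have length 3.
lr = LogisticRegression(
    maxIter=5,
    lowerBoundsOnCoefficients=Matrices.dense(3, 4, [0.0] * 12),
    upperBoundsOnIntercepts=Vectors.dense([1.0, 1.0, 1.0]))
model = lr.fit(df)
assert np.all(model.coefficientMatrix.toArray() >= 0.0)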

x_mean = np.array([5.843, 3.057, 3.758, 1.199])
x_variance = np.array([0.6856, 0.1899, 3.116, 0.581])

coefficients = np.array(
[[-0.57997, 0.912083, -0.371077, -0.819866, 2.688191]]
)

dataset = self.spark.createDataFrame(
generate_multinomial_logistic_input(
coefficients, x_mean, x_variance, True, 1000),
["label", "features"]
)

lower_bounds_on_coefficients = Matrices.dense(1, 4, [0.0, -1.0, 0.0, -1.0])
upper_bounds_on_coefficients = Matrices.dense(1, 4, [0.0, 1.0, 1.0, 0.0])
lower_bounds_on_intercepts = Vectors.dense([0.0])
upper_bounds_on_intercepts = Vectors.dense([1.0])

lr = LogisticRegression(
standardization=True, fitIntercept=True,
lowerBoundsOnCoefficients=lower_bounds_on_coefficients,
upperBoundsOnCoefficients=upper_bounds_on_coefficients,
lowerBoundsOnIntercepts=lower_bounds_on_intercepts,
upperBoundsOnIntercepts=upper_bounds_on_intercepts
)

lrm = lr.fit(dataset)

self.assertIsInstance(lrm, LogisticRegressionModel)
self.assertTrue(np.all(
lower_bounds_on_coefficients.toArray() <= lrm.coefficientMatrix.toArray()
))

self.assertTrue(np.all(
lrm.coefficientMatrix.toArray() <= upper_bounds_on_coefficients.toArray()
))

def test_multinomial_regression_bounds(self):
x_mean = np.array([5.843, 3.057, 3.758, 1.199])
x_variance = np.array([0.6856, 0.1899, 3.116, 0.581])

coefficients = np.array([
[-0.57997, 0.912083, -0.371077, -0.819866, 2.688191],
[-0.16624, -0.84355, -0.048509, -0.301789, 4.170682]
])

dataset = self.spark.createDataFrame(
generate_multinomial_logistic_input(
coefficients, x_mean, x_variance, True, 1000),
["label", "features"]
)

lower_bounds_on_coefficients = Matrices.dense(3, 4, np.repeat(-10.0, 12))
upper_bounds_on_coefficients = Matrices.dense(3, 4, np.repeat(10.0, 12))
lower_bounds_on_intercepts = Vectors.dense(np.repeat(-3.0, 3))
upper_bounds_on_intercepts = Vectors.dense(np.repeat(3.0, 3))

lr = LogisticRegression(
standardization=True, fitIntercept=True,
lowerBoundsOnCoefficients=lower_bounds_on_coefficients,
upperBoundsOnCoefficients=upper_bounds_on_coefficients,
lowerBoundsOnIntercepts=lower_bounds_on_intercepts,
upperBoundsOnIntercepts=upper_bounds_on_intercepts
)

lrm = lr.fit(dataset)

self.assertIsInstance(lrm, LogisticRegressionModel)

self.assertTrue(np.all(
lower_bounds_on_coefficients.toArray() <= lrm.coefficientMatrix.toArray()
))

self.assertTrue(np.all(
lrm.coefficientMatrix.toArray() <= upper_bounds_on_coefficients.toArray()
))

def _compare_params(self, m1, m2, param):
"""
Compare 2 ML Params instances for the given param, and assert both have the same param value