[SPARK-20906][SPARKR] Constrained Logistic Regression for SparkR

wangmiao1981 · Felix Cheung · commit 53543374ce0c · 2017-06-21T20:42:45.000-07:00
## What changes were proposed in this pull request? PR #17715 Added Constrained Logistic Regression for ML. We should add it to SparkR. ## How was this patch tested? Add new unit tests. Author: wangmiao1981 <wm624@hotmail.com> Closes #18128 from wangmiao1981/test.
diff --git a/R/pkg/R/mllib_classification.R b/R/pkg/R/mllib_classification.R
@@ -204,6 +204,20 @@ function(object, path, overwrite = FALSE) {
 #' @param aggregationDepth The depth for treeAggregate (greater than or equal to 2). If the dimensions of features
 #'                         or the number of partitions are large, this param could be adjusted to a larger size.
 #'                         This is an expert parameter. Default value should be good for most cases.
+#' @param lowerBoundsOnCoefficients The lower bounds on coefficients if fitting under bound constrained optimization.
+#'                                  The bound matrix must be compatible with the shape (1, number of features) for binomial
+#'                                  regression, or (number of classes, number of features) for multinomial regression.
+#'                                  It is an R matrix.
+#' @param upperBoundsOnCoefficients The upper bounds on coefficients if fitting under bound constrained optimization.
+#'                                  The bound matrix must be compatible with the shape (1, number of features) for binomial
+#'                                  regression, or (number of classes, number of features) for multinomial regression.
+#'                                  It is an R matrix.
+#' @param lowerBoundsOnIntercepts The lower bounds on intercepts if fitting under bound constrained optimization.
+#'                                The bounds vector size must be equal to 1 for binomial regression, or the number
+#'                                of classes for multinomial regression.
+#' @param upperBoundsOnIntercepts The upper bounds on intercepts if fitting under bound constrained optimization.
+#'                                The bound vector size must be equal to 1 for binomial regression, or the number
+#'                                of classes for multinomial regression.
 #' @param ... additional arguments passed to the method.
 #' @return \code{spark.logit} returns a fitted logistic regression model.
 #' @rdname spark.logit
@@ -241,21 +255,64 @@ function(object, path, overwrite = FALSE) {
 setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula"),
           function(data, formula, regParam = 0.0, elasticNetParam = 0.0, maxIter = 100,
                    tol = 1E-6, family = "auto", standardization = TRUE,
-                   thresholds = 0.5, weightCol = NULL, aggregationDepth = 2) {
+                   thresholds = 0.5, weightCol = NULL, aggregationDepth = 2,
+                   lowerBoundsOnCoefficients = NULL, upperBoundsOnCoefficients = NULL,
+                   lowerBoundsOnIntercepts = NULL, upperBoundsOnIntercepts = NULL) {
             formula <- paste(deparse(formula), collapse = "")
+            row <- 0  # shape of the coefficient bound matrices; stays 0 x 0 when none is given
+            col <- 0
 
             if (!is.null(weightCol) && weightCol == "") {
               weightCol <- NULL
             } else if (!is.null(weightCol)) {
               weightCol <- as.character(weightCol)
             }
 
+            if (!is.null(lowerBoundsOnIntercepts)) {
+                lowerBoundsOnIntercepts <- as.array(lowerBoundsOnIntercepts)
+            }
+
+            if (!is.null(upperBoundsOnIntercepts)) {
+                upperBoundsOnIntercepts <- as.array(upperBoundsOnIntercepts)
+            }
+            # is.matrix() is used because class() of a matrix has length 2 on R >= 4.0.0.
+            if (!is.null(lowerBoundsOnCoefficients)) {
+              if (!is.matrix(lowerBoundsOnCoefficients)) {
+                stop("lowerBoundsOnCoefficients must be a matrix.")
+              }
+              row <- nrow(lowerBoundsOnCoefficients)
+              col <- ncol(lowerBoundsOnCoefficients)
+              lowerBoundsOnCoefficients <- as.array(as.vector(lowerBoundsOnCoefficients))
+            }
+            # Upper bounds get the same type check, plus a shape match against the lower bounds.
+            if (!is.null(upperBoundsOnCoefficients)) {
+              if (!is.matrix(upperBoundsOnCoefficients)) {
+                stop("upperBoundsOnCoefficients must be a matrix.")
+              }
+
+              if (!is.null(lowerBoundsOnCoefficients) && (row != nrow(upperBoundsOnCoefficients)
+                || col != ncol(upperBoundsOnCoefficients))) {
+                stop(paste0("dimension of upperBoundsOnCoefficients ",
+                           "is not the same as lowerBoundsOnCoefficients"))
+              }
+
+              if (is.null(lowerBoundsOnCoefficients)) {
+                row <- nrow(upperBoundsOnCoefficients)
+                col <- ncol(upperBoundsOnCoefficients)
+              }
+
+              upperBoundsOnCoefficients <- as.array(as.vector(upperBoundsOnCoefficients))
+            }
+            # The bounds are passed flattened, with row/col sent alongside as their dimensions.
             jobj <- callJStatic("org.apache.spark.ml.r.LogisticRegressionWrapper", "fit",
                                 data@sdf, formula, as.numeric(regParam),
                                 as.numeric(elasticNetParam), as.integer(maxIter),
                                 as.numeric(tol), as.character(family),
                                 as.logical(standardization), as.array(thresholds),
-                                weightCol, as.integer(aggregationDepth))
+                                weightCol, as.integer(aggregationDepth),
+                                as.integer(row), as.integer(col),
+                                lowerBoundsOnCoefficients, upperBoundsOnCoefficients,
+                                lowerBoundsOnIntercepts, upperBoundsOnIntercepts)
             new("LogisticRegressionModel", jobj = jobj)
           })
 
diff --git a/R/pkg/tests/fulltests/test_mllib_classification.R b/R/pkg/tests/fulltests/test_mllib_classification.R
@@ -223,6 +223,46 @@ test_that("spark.logit", {
   model2 <- spark.logit(df2, label ~ feature, weightCol = "weight")
   prediction2 <- collect(select(predict(model2, df2), "prediction"))
   expect_equal(sort(prediction2$prediction), c("0.0", "0.0", "0.0", "0.0", "0.0"))
+
+  # Test binomial logistic regression against two classes with upperBoundsOnCoefficients
+  # and upperBoundsOnIntercepts
+  u <- matrix(c(1.0, 0.0, 1.0, 0.0), nrow = 1, ncol = 4)
+  model <- spark.logit(training, Species ~ ., upperBoundsOnCoefficients = u,
+                       upperBoundsOnIntercepts = 1.0)
+  summary <- summary(model)
+  coefsR <- c(-11.13331, 1.00000, 0.00000, 1.00000, 0.00000)
+  coefs <- summary$coefficients[, "Estimate"]
+  expect_true(all(abs(coefsR - coefs) < 0.1))
+  # Test upperBoundsOnCoefficients should be matrix
+  expect_error(spark.logit(training, Species ~ ., upperBoundsOnCoefficients = as.array(c(1, 2)),
+                           upperBoundsOnIntercepts = 1.0))
+
+  # Test binomial logistic regression against two classes with lowerBoundsOnCoefficients
+  # and lowerBoundsOnIntercepts
+  l <- matrix(c(0.0, -1.0, 0.0, -1.0), nrow = 1, ncol = 4)
+  model <- spark.logit(training, Species ~ ., lowerBoundsOnCoefficients = l,
+                       lowerBoundsOnIntercepts = 0.0)
+  summary <- summary(model)
+  coefsR <- c(0, 0, -1, 0, 1.902192)
+  coefs <- summary$coefficients[, "Estimate"]
+  expect_true(all(abs(coefsR - coefs) < 0.1))
+  # Test lowerBoundsOnCoefficients should be matrix
+  expect_error(spark.logit(training, Species ~ ., lowerBoundsOnCoefficients = as.array(c(1, 2)),
+                           lowerBoundsOnIntercepts = 0.0))
+
+  # Test multinomial logistic regression with lowerBoundsOnCoefficients
+  # and lowerBoundsOnIntercepts
+  l <- matrix(c(0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0), nrow = 2, ncol = 4)
+  model <- spark.logit(training, Species ~ ., family = "multinomial",
+                       lowerBoundsOnCoefficients = l,
+                       lowerBoundsOnIntercepts = as.array(c(0.0, 0.0)))
+  summary <- summary(model)
+  versicolorCoefsR <- c(42.639465, 7.258104, 14.330814, 16.298243, 11.716429)
+  virginicaCoefsR <- c(0.0002970796, 4.79274, 7.65047, 25.72793, 30.0021)
+  versicolorCoefs <- summary$coefficients[, "versicolor"]
+  virginicaCoefs <- summary$coefficients[, "virginica"]
+  expect_true(all(abs(versicolorCoefsR - versicolorCoefs) < 0.1))
+  expect_true(all(abs(virginicaCoefsR - virginicaCoefs) < 0.1))
 })
 
 test_that("spark.mlp", {
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala
@@ -214,7 +214,7 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas
 
   /**
    * The lower bounds on intercepts if fitting under bound constrained optimization.
-   * The bounds vector size must be equal with 1 for binomial regression, or the number
+   * The bounds vector size must be equal to 1 for binomial regression, or the number
    * of classes for multinomial regression. Otherwise, it throws exception.
    * Default is none.
    *
@@ -230,7 +230,7 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas
 
   /**
    * The upper bounds on intercepts if fitting under bound constrained optimization.
-   * The bound vector size must be equal with 1 for binomial regression, or the number
+   * The bound vector size must be equal to 1 for binomial regression, or the number
    * of classes for multinomial regression. Otherwise, it throws exception.
    * Default is none.
    *
@@ -451,12 +451,12 @@ class LogisticRegression @Since("1.2.0") (
     }
     if (isSet(lowerBoundsOnIntercepts)) {
       require($(lowerBoundsOnIntercepts).size == numCoefficientSets, "The size of " +
-        "lowerBoundsOnIntercepts must be equal with 1 for binomial regression, or the number of " +
+        "lowerBoundsOnIntercepts must be equal to 1 for binomial regression, or the number of " +
         s"classes for multinomial regression, but found: ${getLowerBoundsOnIntercepts.size}.")
     }
     if (isSet(upperBoundsOnIntercepts)) {
       require($(upperBoundsOnIntercepts).size == numCoefficientSets, "The size of " +
-        "upperBoundsOnIntercepts must be equal with 1 for binomial regression, or the number of " +
+        "upperBoundsOnIntercepts must be equal to 1 for binomial regression, or the number of " +
         s"classes for multinomial regression, but found: ${getUpperBoundsOnIntercepts.size}.")
     }
     if (isSet(lowerBoundsOnCoefficients) && isSet(upperBoundsOnCoefficients)) {
diff --git a/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala b/mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala
@@ -25,7 +25,7 @@ import org.json4s.jackson.JsonMethods._
 import org.apache.spark.ml.{Pipeline, PipelineModel}
 import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
 import org.apache.spark.ml.feature.{IndexToString, RFormula}
-import org.apache.spark.ml.linalg.Vector
+import org.apache.spark.ml.linalg.{Matrices, Vector, Vectors}
 import org.apache.spark.ml.r.RWrapperUtils._
 import org.apache.spark.ml.util._
 import org.apache.spark.sql.{DataFrame, Dataset}
@@ -97,7 +97,13 @@ private[r] object LogisticRegressionWrapper
       standardization: Boolean,
       thresholds: Array[Double],
       weightCol: String,
-      aggregationDepth: Int
+      aggregationDepth: Int,
+      numRowsOfBoundsOnCoefficients: Int,
+      numColsOfBoundsOnCoefficients: Int,
+      lowerBoundsOnCoefficients: Array[Double],
+      upperBoundsOnCoefficients: Array[Double],
+      lowerBoundsOnIntercepts: Array[Double],
+      upperBoundsOnIntercepts: Array[Double]
       ): LogisticRegressionWrapper = {
 
     val rFormula = new RFormula()
@@ -133,6 +139,30 @@ private[r] object LogisticRegressionWrapper
 
     if (weightCol != null) lr.setWeightCol(weightCol)
 
+    if (numRowsOfBoundsOnCoefficients != 0 &&
+      numColsOfBoundsOnCoefficients != 0 && lowerBoundsOnCoefficients != null) {
+      val coef = Matrices.dense(numRowsOfBoundsOnCoefficients,
+        numColsOfBoundsOnCoefficients, lowerBoundsOnCoefficients)
+      lr.setLowerBoundsOnCoefficients(coef)
+    }
+
+    if (numRowsOfBoundsOnCoefficients != 0 &&
+      numColsOfBoundsOnCoefficients != 0 && upperBoundsOnCoefficients != null) {
+      val coef = Matrices.dense(numRowsOfBoundsOnCoefficients,
+        numColsOfBoundsOnCoefficients, upperBoundsOnCoefficients)
+      lr.setUpperBoundsOnCoefficients(coef)
+    }
+
+    if (lowerBoundsOnIntercepts != null) {
+      val intercept = Vectors.dense(lowerBoundsOnIntercepts)
+      lr.setLowerBoundsOnIntercepts(intercept)
+    }
+
+    if (upperBoundsOnIntercepts != null) {
+      val intercept = Vectors.dense(upperBoundsOnIntercepts)
+      lr.setUpperBoundsOnIntercepts(intercept)
+    }
+
     val idxToStr = new IndexToString()
       .setInputCol(PREDICTED_LABEL_INDEX_COL)
       .setOutputCol(PREDICTED_LABEL_COL)

Original file line number	Diff line number	Diff line change
`@@ -214,7 +214,7 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas`
`214`	`214`
`215`	`215`	`/**`
`216`	`216`	`* The lower bounds on intercepts if fitting under bound constrained optimization.`
`217`		`- * The bounds vector size must be equal with 1 for binomial regression, or the number`
	`217`	`+ * The bounds vector size must be equal to 1 for binomial regression, or the number`
`218`	`218`	`* of classes for multinomial regression. Otherwise, it throws exception.`
`219`	`219`	`* Default is none.`
`220`	`220`	`*`
`@@ -230,7 +230,7 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas`
`230`	`230`
`231`	`231`	`/**`
`232`	`232`	`* The upper bounds on intercepts if fitting under bound constrained optimization.`
`233`		`- * The bound vector size must be equal with 1 for binomial regression, or the number`
	`233`	`+ * The bound vector size must be equal to 1 for binomial regression, or the number`
`234`	`234`	`* of classes for multinomial regression. Otherwise, it throws exception.`
`235`	`235`	`* Default is none.`
`236`	`236`	`*`
`@@ -451,12 +451,12 @@ class LogisticRegression @Since("1.2.0") (`
`451`	`451`	`}`
`452`	`452`	`if (isSet(lowerBoundsOnIntercepts)) {`
`453`	`453`	`require($(lowerBoundsOnIntercepts).size == numCoefficientSets, "The size of " +`
`454`		`- "lowerBoundsOnIntercepts must be equal with 1 for binomial regression, or the number of " +`
	`454`	`+ "lowerBoundsOnIntercepts must be equal to 1 for binomial regression, or the number of " +`
`455`	`455`	`s"classes for multinomial regression, but found: ${getLowerBoundsOnIntercepts.size}.")`
`456`	`456`	`}`
`457`	`457`	`if (isSet(upperBoundsOnIntercepts)) {`
`458`	`458`	`require($(upperBoundsOnIntercepts).size == numCoefficientSets, "The size of " +`
`459`		`- "upperBoundsOnIntercepts must be equal with 1 for binomial regression, or the number of " +`
	`459`	`+ "upperBoundsOnIntercepts must be equal to 1 for binomial regression, or the number of " +`
`460`	`460`	`s"classes for multinomial regression, but found: ${getUpperBoundsOnIntercepts.size}.")`
`461`	`461`	`}`
`462`	`462`	`if (isSet(lowerBoundsOnCoefficients) && isSet(upperBoundsOnCoefficients)) {`