Skip to content

Commit 5354337

Browse files
wangmiao1981 authored and Felix Cheung committed
[SPARK-20906][SPARKR] Constrained Logistic Regression for SparkR
## What changes were proposed in this pull request? PR #17715 Added Constrained Logistic Regression for ML. We should add it to SparkR. ## How was this patch tested? Add new unit tests. Author: wangmiao1981 <[email protected]> Closes #18128 from wangmiao1981/test.
1 parent 215281d commit 5354337

File tree

4 files changed

+135
-8
lines changed

4 files changed

+135
-8
lines changed

R/pkg/R/mllib_classification.R

Lines changed: 59 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -204,6 +204,20 @@ function(object, path, overwrite = FALSE) {
204204
#' @param aggregationDepth The depth for treeAggregate (greater than or equal to 2). If the dimensions of features
205205
#' or the number of partitions are large, this param could be adjusted to a larger size.
206206
#' This is an expert parameter. Default value should be good for most cases.
207+
#' @param lowerBoundsOnCoefficients The lower bounds on coefficients if fitting under bound constrained optimization.
208+
#' The bound matrix must be compatible with the shape (1, number of features) for binomial
209+
#' regression, or (number of classes, number of features) for multinomial regression.
210+
#' It is an R matrix.
211+
#' @param upperBoundsOnCoefficients The upper bounds on coefficients if fitting under bound constrained optimization.
212+
#' The bound matrix must be compatible with the shape (1, number of features) for binomial
213+
#' regression, or (number of classes, number of features) for multinomial regression.
214+
#' It is an R matrix.
215+
#' @param lowerBoundsOnIntercepts The lower bounds on intercepts if fitting under bound constrained optimization.
216+
#' The bounds vector size must be equal to 1 for binomial regression, or the number
217+
#' of classes for multinomial regression.
218+
#' @param upperBoundsOnIntercepts The upper bounds on intercepts if fitting under bound constrained optimization.
219+
#' The bound vector size must be equal to 1 for binomial regression, or the number
220+
#' of classes for multinomial regression.
207221
#' @param ... additional arguments passed to the method.
208222
#' @return \code{spark.logit} returns a fitted logistic regression model.
209223
#' @rdname spark.logit
@@ -241,21 +255,64 @@ function(object, path, overwrite = FALSE) {
241255
setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula"),
242256
function(data, formula, regParam = 0.0, elasticNetParam = 0.0, maxIter = 100,
243257
tol = 1E-6, family = "auto", standardization = TRUE,
244-
thresholds = 0.5, weightCol = NULL, aggregationDepth = 2) {
258+
thresholds = 0.5, weightCol = NULL, aggregationDepth = 2,
259+
lowerBoundsOnCoefficients = NULL, upperBoundsOnCoefficients = NULL,
260+
lowerBoundsOnIntercepts = NULL, upperBoundsOnIntercepts = NULL) {
245261
formula <- paste(deparse(formula), collapse = "")
262+
row <- 0
263+
col <- 0
246264

247265
if (!is.null(weightCol) && weightCol == "") {
248266
weightCol <- NULL
249267
} else if (!is.null(weightCol)) {
250268
weightCol <- as.character(weightCol)
251269
}
252270

271+
if (!is.null(lowerBoundsOnIntercepts)) {
272+
lowerBoundsOnIntercepts <- as.array(lowerBoundsOnIntercepts)
273+
}
274+
275+
if (!is.null(upperBoundsOnIntercepts)) {
276+
upperBoundsOnIntercepts <- as.array(upperBoundsOnIntercepts)
277+
}
278+
279+
if (!is.null(lowerBoundsOnCoefficients)) {
280+
if (class(lowerBoundsOnCoefficients) != "matrix") {
281+
stop("lowerBoundsOnCoefficients must be a matrix.")
282+
}
283+
row <- nrow(lowerBoundsOnCoefficients)
284+
col <- ncol(lowerBoundsOnCoefficients)
285+
lowerBoundsOnCoefficients <- as.array(as.vector(lowerBoundsOnCoefficients))
286+
}
287+
288+
if (!is.null(upperBoundsOnCoefficients)) {
289+
if (class(upperBoundsOnCoefficients) != "matrix") {
290+
stop("upperBoundsOnCoefficients must be a matrix.")
291+
}
292+
293+
if (!is.null(lowerBoundsOnCoefficients) && (row != nrow(upperBoundsOnCoefficients)
294+
|| col != ncol(upperBoundsOnCoefficients))) {
295+
stop(paste0("dimension of upperBoundsOnCoefficients ",
296+
"is not the same as lowerBoundsOnCoefficients", sep = ""))
297+
}
298+
299+
if (is.null(lowerBoundsOnCoefficients)) {
300+
row <- nrow(upperBoundsOnCoefficients)
301+
col <- ncol(upperBoundsOnCoefficients)
302+
}
303+
304+
upperBoundsOnCoefficients <- as.array(as.vector(upperBoundsOnCoefficients))
305+
}
306+
253307
jobj <- callJStatic("org.apache.spark.ml.r.LogisticRegressionWrapper", "fit",
254308
data@sdf, formula, as.numeric(regParam),
255309
as.numeric(elasticNetParam), as.integer(maxIter),
256310
as.numeric(tol), as.character(family),
257311
as.logical(standardization), as.array(thresholds),
258-
weightCol, as.integer(aggregationDepth))
312+
weightCol, as.integer(aggregationDepth),
313+
as.integer(row), as.integer(col),
314+
lowerBoundsOnCoefficients, upperBoundsOnCoefficients,
315+
lowerBoundsOnIntercepts, upperBoundsOnIntercepts)
259316
new("LogisticRegressionModel", jobj = jobj)
260317
})
261318

R/pkg/tests/fulltests/test_mllib_classification.R

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,46 @@ test_that("spark.logit", {
223223
model2 <- spark.logit(df2, label ~ feature, weightCol = "weight")
224224
prediction2 <- collect(select(predict(model2, df2), "prediction"))
225225
expect_equal(sort(prediction2$prediction), c("0.0", "0.0", "0.0", "0.0", "0.0"))
226+
227+
# Test binomial logistic regression against two classes with upperBoundsOnCoefficients
228+
# and upperBoundsOnIntercepts
229+
u <- matrix(c(1.0, 0.0, 1.0, 0.0), nrow = 1, ncol = 4)
230+
model <- spark.logit(training, Species ~ ., upperBoundsOnCoefficients = u,
231+
upperBoundsOnIntercepts = 1.0)
232+
summary <- summary(model)
233+
coefsR <- c(-11.13331, 1.00000, 0.00000, 1.00000, 0.00000)
234+
coefs <- summary$coefficients[, "Estimate"]
235+
expect_true(all(abs(coefsR - coefs) < 0.1))
236+
# Test upperBoundsOnCoefficients should be matrix
237+
expect_error(spark.logit(training, Species ~ ., upperBoundsOnCoefficients = as.array(c(1, 2)),
238+
upperBoundsOnIntercepts = 1.0))
239+
240+
# Test binomial logistic regression against two classes with lowerBoundsOnCoefficients
241+
# and lowerBoundsOnIntercepts
242+
l <- matrix(c(0.0, -1.0, 0.0, -1.0), nrow = 1, ncol = 4)
243+
model <- spark.logit(training, Species ~ ., lowerBoundsOnCoefficients = l,
244+
lowerBoundsOnIntercepts = 0.0)
245+
summary <- summary(model)
246+
coefsR <- c(0, 0, -1, 0, 1.902192)
247+
coefs <- summary$coefficients[, "Estimate"]
248+
expect_true(all(abs(coefsR - coefs) < 0.1))
249+
# Test lowerBoundsOnCoefficients should be matrix
250+
expect_error(spark.logit(training, Species ~ ., lowerBoundsOnCoefficients = as.array(c(1, 2)),
251+
lowerBoundsOnIntercepts = 0.0))
252+
253+
# Test multinomial logistic regression with lowerBoundsOnCoefficients
254+
# and lowerBoundsOnIntercepts
255+
l <- matrix(c(0.0, -1.0, 0.0, -1.0, 0.0, -1.0, 0.0, -1.0), nrow = 2, ncol = 4)
256+
model <- spark.logit(training, Species ~ ., family = "multinomial",
257+
lowerBoundsOnCoefficients = l,
258+
lowerBoundsOnIntercepts = as.array(c(0.0, 0.0)))
259+
summary <- summary(model)
260+
versicolorCoefsR <- c(42.639465, 7.258104, 14.330814, 16.298243, 11.716429)
261+
virginicaCoefsR <- c(0.0002970796, 4.79274, 7.65047, 25.72793, 30.0021)
262+
versicolorCoefs <- summary$coefficients[, "versicolor"]
263+
virginicaCoefs <- summary$coefficients[, "virginica"]
264+
expect_true(all(abs(versicolorCoefsR - versicolorCoefs) < 0.1))
265+
expect_true(all(abs(virginicaCoefsR - virginicaCoefs) < 0.1))
226266
})
227267

228268
test_that("spark.mlp", {

mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -214,7 +214,7 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas
214214

215215
/**
216216
* The lower bounds on intercepts if fitting under bound constrained optimization.
217-
* The bounds vector size must be equal with 1 for binomial regression, or the number
217+
* The bounds vector size must be equal to 1 for binomial regression, or the number
218218
* of classes for multinomial regression. Otherwise, it throws exception.
219219
* Default is none.
220220
*
@@ -230,7 +230,7 @@ private[classification] trait LogisticRegressionParams extends ProbabilisticClas
230230

231231
/**
232232
* The upper bounds on intercepts if fitting under bound constrained optimization.
233-
* The bound vector size must be equal with 1 for binomial regression, or the number
233+
* The bound vector size must be equal to 1 for binomial regression, or the number
234234
* of classes for multinomial regression. Otherwise, it throws exception.
235235
* Default is none.
236236
*
@@ -451,12 +451,12 @@ class LogisticRegression @Since("1.2.0") (
451451
}
452452
if (isSet(lowerBoundsOnIntercepts)) {
453453
require($(lowerBoundsOnIntercepts).size == numCoefficientSets, "The size of " +
454-
"lowerBoundsOnIntercepts must be equal with 1 for binomial regression, or the number of " +
454+
"lowerBoundsOnIntercepts must be equal to 1 for binomial regression, or the number of " +
455455
s"classes for multinomial regression, but found: ${getLowerBoundsOnIntercepts.size}.")
456456
}
457457
if (isSet(upperBoundsOnIntercepts)) {
458458
require($(upperBoundsOnIntercepts).size == numCoefficientSets, "The size of " +
459-
"upperBoundsOnIntercepts must be equal with 1 for binomial regression, or the number of " +
459+
"upperBoundsOnIntercepts must be equal to 1 for binomial regression, or the number of " +
460460
s"classes for multinomial regression, but found: ${getUpperBoundsOnIntercepts.size}.")
461461
}
462462
if (isSet(lowerBoundsOnCoefficients) && isSet(upperBoundsOnCoefficients)) {

mllib/src/main/scala/org/apache/spark/ml/r/LogisticRegressionWrapper.scala

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ import org.json4s.jackson.JsonMethods._
2525
import org.apache.spark.ml.{Pipeline, PipelineModel}
2626
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
2727
import org.apache.spark.ml.feature.{IndexToString, RFormula}
28-
import org.apache.spark.ml.linalg.Vector
28+
import org.apache.spark.ml.linalg.{Matrices, Vector, Vectors}
2929
import org.apache.spark.ml.r.RWrapperUtils._
3030
import org.apache.spark.ml.util._
3131
import org.apache.spark.sql.{DataFrame, Dataset}
@@ -97,7 +97,13 @@ private[r] object LogisticRegressionWrapper
9797
standardization: Boolean,
9898
thresholds: Array[Double],
9999
weightCol: String,
100-
aggregationDepth: Int
100+
aggregationDepth: Int,
101+
numRowsOfBoundsOnCoefficients: Int,
102+
numColsOfBoundsOnCoefficients: Int,
103+
lowerBoundsOnCoefficients: Array[Double],
104+
upperBoundsOnCoefficients: Array[Double],
105+
lowerBoundsOnIntercepts: Array[Double],
106+
upperBoundsOnIntercepts: Array[Double]
101107
): LogisticRegressionWrapper = {
102108

103109
val rFormula = new RFormula()
@@ -133,6 +139,30 @@ private[r] object LogisticRegressionWrapper
133139

134140
if (weightCol != null) lr.setWeightCol(weightCol)
135141

142+
if (numRowsOfBoundsOnCoefficients != 0 &&
143+
numColsOfBoundsOnCoefficients != 0 && lowerBoundsOnCoefficients != null) {
144+
val coef = Matrices.dense(numRowsOfBoundsOnCoefficients,
145+
numColsOfBoundsOnCoefficients, lowerBoundsOnCoefficients)
146+
lr.setLowerBoundsOnCoefficients(coef)
147+
}
148+
149+
if (numRowsOfBoundsOnCoefficients != 0 &&
150+
numColsOfBoundsOnCoefficients != 0 && upperBoundsOnCoefficients != null) {
151+
val coef = Matrices.dense(numRowsOfBoundsOnCoefficients,
152+
numColsOfBoundsOnCoefficients, upperBoundsOnCoefficients)
153+
lr.setUpperBoundsOnCoefficients(coef)
154+
}
155+
156+
if (lowerBoundsOnIntercepts != null) {
157+
val intercept = Vectors.dense(lowerBoundsOnIntercepts)
158+
lr.setLowerBoundsOnIntercepts(intercept)
159+
}
160+
161+
if (upperBoundsOnIntercepts != null) {
162+
val intercept = Vectors.dense(upperBoundsOnIntercepts)
163+
lr.setUpperBoundsOnIntercepts(intercept)
164+
}
165+
136166
val idxToStr = new IndexToString()
137167
.setInputCol(PREDICTED_LABEL_INDEX_COL)
138168
.setOutputCol(PREDICTED_LABEL_COL)

0 commit comments

Comments
 (0)