Skip to content

Commit dc8ccbc

Browse files
committed
fix unit test
1 parent 6ec068e commit dc8ccbc

File tree

3 files changed

+24
-20
lines changed

3 files changed

+24
-20
lines changed

R/pkg/R/mllib_regression.R

Lines changed: 19 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -65,8 +65,6 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj"))
6565
#' @param maxIter integer giving the maximal number of IRLS iterations.
6666
#' @param weightCol the weight column name. If this is not set or \code{NULL}, we treat all instance
6767
#' weights as 1.0.
68-
#' @param offsetCol the offset column name. If this is not set or empty, we treat all instance offsets
69-
#' as 0.0. The feature specified as offset has a constant coefficient of 1.0.
7068
#' @param regParam regularization parameter for L2 regularization.
7169
#' @param var.power the power in the variance function of the Tweedie distribution which provides
7270
#' the relationship between the variance and mean of the distribution. Only
@@ -78,6 +76,8 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj"))
7876
#' "frequencyDesc", "frequencyAsc", "alphabetDesc", and "alphabetAsc".
7977
#' The default value is "frequencyDesc". When the ordering is set to
8078
#' "alphabetDesc", this drops the same category as R when encoding strings.
79+
#' @param offsetCol the offset column name. If this is not set or empty, we treat all instance offsets
80+
#' as 0.0. The feature specified as offset has a constant coefficient of 1.0.
8181
#' @param ... additional arguments passed to the method.
8282
#' @aliases spark.glm,SparkDataFrame,formula-method
8383
#' @return \code{spark.glm} returns a fitted generalized linear model.
@@ -127,9 +127,10 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj"))
127127
#' @seealso \link{glm}, \link{read.ml}
128128
setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
129129
function(data, formula, family = gaussian, tol = 1e-6, maxIter = 25, weightCol = NULL,
130-
offsetCol = NULL, regParam = 0.0, var.power = 0.0, link.power = 1.0 - var.power,
130+
regParam = 0.0, var.power = 0.0, link.power = 1.0 - var.power,
131131
stringIndexerOrderType = c("frequencyDesc", "frequencyAsc",
132-
"alphabetDesc", "alphabetAsc")) {
132+
"alphabetDesc", "alphabetAsc"),
133+
offsetCol = NULL) {
133134

134135
stringIndexerOrderType <- match.arg(stringIndexerOrderType)
135136
if (is.character(family)) {
@@ -161,18 +162,19 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
161162
weightCol <- as.character(weightCol)
162163
}
163164

164-
if (!is.null(offsetCol) && offsetCol == "") {
165-
offsetCol <- NULL
166-
} else if (!is.null(offsetCol)) {
165+
if (!is.null(offsetCol)) {
167166
offsetCol <- as.character(offsetCol)
167+
if (nchar(offsetCol) == 0) {
168+
offsetCol <- NULL
169+
}
168170
}
169171

170172
# For known families, Gamma is upper-cased
171173
jobj <- callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper",
172174
"fit", formula, data@sdf, tolower(family$family), family$link,
173-
tol, as.integer(maxIter), weightCol, offsetCol, regParam,
175+
tol, as.integer(maxIter), weightCol, regParam,
174176
as.double(var.power), as.double(link.power),
175-
stringIndexerOrderType)
177+
stringIndexerOrderType, offsetCol)
176178
new("GeneralizedLinearRegressionModel", jobj = jobj)
177179
})
178180

@@ -190,8 +192,6 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
190192
#' \code{poisson}, \code{Gamma}, and \code{tweedie}.
191193
#' @param weightCol the weight column name. If this is not set or \code{NULL}, we treat all instance
192194
#' weights as 1.0.
193-
#' @param offsetCol the offset column name. If this is not set or empty, we treat all instance offsets
194-
#' as 0.0. The feature specified as offset has a constant coefficient of 1.0.
195195
#' @param epsilon positive convergence tolerance of iterations.
196196
#' @param maxit integer giving the maximal number of IRLS iterations.
197197
#' @param var.power the index of the power variance function in the Tweedie family.
@@ -202,6 +202,8 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
202202
#' "frequencyDesc", "frequencyAsc", "alphabetDesc", and "alphabetAsc".
203203
#' The default value is "frequencyDesc". When the ordering is set to
204204
#' "alphabetDesc", this drops the same category as R when encoding strings.
205+
#' @param offsetCol the offset column name. If this is not set or empty, we treat all instance offsets
206+
#' as 0.0. The feature specified as offset has a constant coefficient of 1.0.
205207
#' @return \code{glm} returns a fitted generalized linear model.
206208
#' @rdname glm
207209
#' @export
@@ -217,12 +219,14 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
217219
#' @seealso \link{spark.glm}
218220
setMethod("glm", signature(formula = "formula", family = "ANY", data = "SparkDataFrame"),
219221
function(formula, family = gaussian, data, epsilon = 1e-6, maxit = 25, weightCol = NULL,
220-
offsetCol = NULL, var.power = 0.0, link.power = 1.0 - var.power,
222+
var.power = 0.0, link.power = 1.0 - var.power,
221223
stringIndexerOrderType = c("frequencyDesc", "frequencyAsc",
222-
"alphabetDesc", "alphabetAsc")) {
224+
"alphabetDesc", "alphabetAsc"),
225+
offsetCol = NULL) {
223226
spark.glm(data, formula, family, tol = epsilon, maxIter = maxit, weightCol = weightCol,
224-
offsetCol = offsetCol, var.power = var.power, link.power = link.power,
225-
stringIndexerOrderType = stringIndexerOrderType)
227+
var.power = var.power, link.power = link.power,
228+
stringIndexerOrderType = stringIndexerOrderType,
229+
offsetCol = offsetCol)
226230
})
227231

228232
# Returns the summary of a model produced by glm() or spark.glm(), similarly to R's summary().

R/pkg/tests/fulltests/test_mllib_regression.R

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -175,9 +175,9 @@ test_that("spark.glm summary", {
175175

176176
# Test spark.glm works with offset
177177
stats <- summary(spark.glm(training, Sepal_Width ~ Sepal_Length + Species,
178-
family = poisson(), offsetCol = "Pedal_Length"))
179-
rStats <- summary(glm(Sepal.Width ~ Sepal.Length + Species,
180-
data = iris, family = poisson(), offset = Pedal_Length))
178+
family = poisson(), offsetCol = "Petal_Length"))
179+
rStats <- suppressWarnings(summary(glm(Sepal.Width ~ Sepal.Length + Species,
180+
data = iris, family = poisson(), offset = iris$Petal.Length)))
181181
expect_true(all(abs(rStats$coefficients - stats$coefficients) < 1e-3))
182182

183183
# Test summary works on base GLM models

mllib/src/main/scala/org/apache/spark/ml/r/GeneralizedLinearRegressionWrapper.scala

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -74,11 +74,11 @@ private[r] object GeneralizedLinearRegressionWrapper
7474
tol: Double,
7575
maxIter: Int,
7676
weightCol: String,
77-
offsetCol: String,
7877
regParam: Double,
7978
variancePower: Double,
8079
linkPower: Double,
81-
stringIndexerOrderType: String): GeneralizedLinearRegressionWrapper = {
80+
stringIndexerOrderType: String,
81+
offsetCol: String): GeneralizedLinearRegressionWrapper = {
8282
// scalastyle:on
8383
val rFormula = new RFormula().setFormula(formula)
8484
.setStringIndexerOrderType(stringIndexerOrderType)

0 commit comments

Comments
 (0)