
Commit 68a98bc

Merge pull request #3 from lxsmnv/master
Update from master
2 parents: 02c9f3a + 615d9f0

491 files changed: +9961 additions, −3294 deletions


.travis.yml

Lines changed: 0 additions & 1 deletion
@@ -28,7 +28,6 @@ dist: trusty
 # 2. Choose language and target JDKs for parallel builds.
 language: java
 jdk:
-  - oraclejdk7
   - oraclejdk8
 
 # 3. Setup cache directory for SBT and Maven.

R/WINDOWS.md

Lines changed: 1 addition & 1 deletion
@@ -38,6 +38,6 @@ To run the SparkR unit tests on Windows, the following steps are required —ass
 
 ```
 R -e "install.packages('testthat', repos='http://cran.us.r-project.org')"
-.\bin\spark-submit2.cmd --conf spark.hadoop.fs.default.name="file:///" R\pkg\tests\run-all.R
+.\bin\spark-submit2.cmd --conf spark.hadoop.fs.defaultFS="file:///" R\pkg\tests\run-all.R
 ```
 

R/pkg/R/DataFrame.R

Lines changed: 10 additions & 2 deletions
@@ -280,7 +280,7 @@ setMethod("dtypes",
 
 #' Column Names of SparkDataFrame
 #'
-#' Return all column names as a list.
+#' Return a vector of column names.
 #'
 #' @param x a SparkDataFrame.
 #'
@@ -338,7 +338,7 @@ setMethod("colnames",
           })
 
 #' @param value a character vector. Must have the same length as the number
-#'              of columns in the SparkDataFrame.
+#'              of columns to be renamed.
 #' @rdname columns
 #' @aliases colnames<-,SparkDataFrame-method
 #' @name colnames<-
@@ -1804,6 +1804,10 @@ setClassUnion("numericOrcharacter", c("numeric", "character"))
 #' @note [[ since 1.4.0
 setMethod("[[", signature(x = "SparkDataFrame", i = "numericOrcharacter"),
           function(x, i) {
+            if (length(i) > 1) {
+              warning("Subset index has length > 1. Only the first index is used.")
+              i <- i[1]
+            }
             if (is.numeric(i)) {
               cols <- columns(x)
               i <- cols[[i]]
@@ -1817,6 +1821,10 @@ setMethod("[[", signature(x = "SparkDataFrame", i = "numericOrcharacter"),
 #' @note [[<- since 2.1.1
 setMethod("[[<-", signature(x = "SparkDataFrame", i = "numericOrcharacter"),
           function(x, i, value) {
+            if (length(i) > 1) {
+              warning("Subset index has length > 1. Only the first index is used.")
+              i <- i[1]
+            }
             if (is.numeric(i)) {
               cols <- columns(x)
               i <- cols[[i]]
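For context, a minimal sketch of what the new `[[` guard means for callers; the session setup and the toy column names are assumptions for illustration, not part of this commit:

# Minimal sketch, assuming a running SparkR session.
library(SparkR)
sparkR.session()

df <- createDataFrame(data.frame(a = 1:3, b = letters[1:3]))

col1 <- df[["a"]]        # unchanged: selects the Column named "a"
col2 <- df[[c(1, 2)]]    # after this change: warns that only the first
                         # index is used, then behaves like df[[1]]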

R/pkg/R/SQLContext.R

Lines changed: 7 additions & 3 deletions
@@ -332,8 +332,10 @@ setMethod("toDF", signature(x = "RDD"),
 
 #' Create a SparkDataFrame from a JSON file.
 #'
-#' Loads a JSON file (\href{http://jsonlines.org/}{JSON Lines text format or newline-delimited JSON}
-#' ), returning the result as a SparkDataFrame
+#' Loads a JSON file, returning the result as a SparkDataFrame
+#' By default, (\href{http://jsonlines.org/}{JSON Lines text format or newline-delimited JSON}
+#' ) is supported. For JSON (one record per file), set a named property \code{wholeFile} to
+#' \code{TRUE}.
 #' It goes through the entire dataset once to determine the schema.
 #'
 #' @param path Path of file to read. A vector of multiple paths is allowed.
@@ -346,6 +348,7 @@ setMethod("toDF", signature(x = "RDD"),
 #' sparkR.session()
 #' path <- "path/to/file.json"
 #' df <- read.json(path)
+#' df <- read.json(path, wholeFile = TRUE)
 #' df <- jsonFile(path)
 #' }
 #' @name read.json
@@ -778,14 +781,15 @@ dropTempView <- function(viewName) {
 #' @return SparkDataFrame
 #' @rdname read.df
 #' @name read.df
+#' @seealso \link{read.json}
 #' @export
 #' @examples
 #'\dontrun{
 #' sparkR.session()
 #' df1 <- read.df("path/to/file.json", source = "json")
 #' schema <- structType(structField("name", "string"),
 #'                      structField("info", "map<string,double>"))
-#' df2 <- read.df(mapTypeJsonPath, "json", schema)
+#' df2 <- read.df(mapTypeJsonPath, "json", schema, wholeFile = TRUE)
 #' df3 <- loadDF("data/test_table", "parquet", mergeSchema = "true")
 #' }
 #' @name read.df
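A short sketch of the documented option in use; the file paths are placeholders, and the session setup is assumed:

# Minimal sketch, assuming a running SparkR session and local JSON files.
library(SparkR)
sparkR.session()

# Default: each line of the input is one JSON record (JSON Lines).
df_lines <- read.json("path/to/records.json")

# With wholeFile = TRUE: one (possibly multi-line) JSON record per file.
df_whole <- read.json("path/to/record.json", wholeFile = TRUE)

# read.df forwards the same named option when source = "json".
df_generic <- read.df("path/to/record.json", source = "json", wholeFile = TRUE)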

R/pkg/R/generics.R

Lines changed: 1 addition & 1 deletion
@@ -1406,7 +1406,7 @@ setGeneric("spark.randomForest",
 
 #' @rdname spark.survreg
 #' @export
-setGeneric("spark.survreg", function(data, formula) { standardGeneric("spark.survreg") })
+setGeneric("spark.survreg", function(data, formula, ...) { standardGeneric("spark.survreg") })
 
 #' @rdname spark.svmLinear
 #' @export

R/pkg/R/mllib_classification.R

Lines changed: 16 additions & 12 deletions
@@ -75,9 +75,9 @@ setClass("NaiveBayesModel", representation(jobj = "jobj"))
 #' @examples
 #' \dontrun{
 #' sparkR.session()
-#' df <- createDataFrame(iris)
-#' training <- df[df$Species %in% c("versicolor", "virginica"), ]
-#' model <- spark.svmLinear(training, Species ~ ., regParam = 0.5)
+#' t <- as.data.frame(Titanic)
+#' training <- createDataFrame(t)
+#' model <- spark.svmLinear(training, Survived ~ ., regParam = 0.5)
 #' summary <- summary(model)
 #'
 #' # fitted values on training data
@@ -207,6 +207,9 @@ function(object, path, overwrite = FALSE) {
 #' excepting that at most one value may be 0. The class with largest value p/t is predicted, where p
 #' is the original probability of that class and t is the class's threshold.
 #' @param weightCol The weight column name.
+#' @param aggregationDepth The depth for treeAggregate (greater than or equal to 2). If the dimensions of features
+#'                         or the number of partitions are large, this param could be adjusted to a larger size.
+#'                         This is an expert parameter. Default value should be good for most cases.
 #' @param ... additional arguments passed to the method.
 #' @return \code{spark.logit} returns a fitted logistic regression model.
 #' @rdname spark.logit
@@ -217,9 +220,9 @@ function(object, path, overwrite = FALSE) {
 #' \dontrun{
 #' sparkR.session()
 #' # binary logistic regression
-#' df <- createDataFrame(iris)
-#' training <- df[df$Species %in% c("versicolor", "virginica"), ]
-#' model <- spark.logit(training, Species ~ ., regParam = 0.5)
+#' t <- as.data.frame(Titanic)
+#' training <- createDataFrame(t)
+#' model <- spark.logit(training, Survived ~ ., regParam = 0.5)
 #' summary <- summary(model)
 #'
 #' # fitted values on training data
@@ -236,28 +239,29 @@ function(object, path, overwrite = FALSE) {
 #'
 #' # multinomial logistic regression
 #'
-#' df <- createDataFrame(iris)
-#' model <- spark.logit(df, Species ~ ., regParam = 0.5)
+#' model <- spark.logit(training, Class ~ ., regParam = 0.5)
 #' summary <- summary(model)
 #'
 #' }
 #' @note spark.logit since 2.1.0
 setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula"),
           function(data, formula, regParam = 0.0, elasticNetParam = 0.0, maxIter = 100,
                    tol = 1E-6, family = "auto", standardization = TRUE,
-                   thresholds = 0.5, weightCol = NULL) {
+                   thresholds = 0.5, weightCol = NULL, aggregationDepth = 2) {
             formula <- paste(deparse(formula), collapse = "")
 
-            if (is.null(weightCol)) {
-              weightCol <- ""
+            if (!is.null(weightCol) && weightCol == "") {
+              weightCol <- NULL
+            } else if (!is.null(weightCol)) {
+              weightCol <- as.character(weightCol)
             }
 
             jobj <- callJStatic("org.apache.spark.ml.r.LogisticRegressionWrapper", "fit",
                                 data@sdf, formula, as.numeric(regParam),
                                 as.numeric(elasticNetParam), as.integer(maxIter),
                                 as.numeric(tol), as.character(family),
                                 as.logical(standardization), as.array(thresholds),
-                                as.character(weightCol))
+                                weightCol, as.integer(aggregationDepth))
             new("LogisticRegressionModel", jobj = jobj)
           })
 
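A short sketch of the updated spark.logit call, reusing the Titanic example from the documentation above; treating Freq as the weight column is an illustrative assumption, not something this diff prescribes:

# Minimal sketch, assuming a running SparkR session.
library(SparkR)
sparkR.session()

t <- as.data.frame(Titanic)
training <- createDataFrame(t)

# weightCol now defaults to NULL and is forwarded as-is when unset;
# aggregationDepth is the new expert knob for treeAggregate (>= 2).
model <- spark.logit(training, Survived ~ Class + Sex + Age,
                     regParam = 0.5, weightCol = "Freq", aggregationDepth = 2)
summary(model)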

R/pkg/R/mllib_clustering.R

Lines changed: 8 additions & 7 deletions
@@ -72,8 +72,9 @@ setClass("LDAModel", representation(jobj = "jobj"))
 #' @examples
 #' \dontrun{
 #' sparkR.session()
-#' df <- createDataFrame(iris)
-#' model <- spark.bisectingKmeans(df, Sepal_Length ~ Sepal_Width, k = 4)
+#' t <- as.data.frame(Titanic)
+#' df <- createDataFrame(t)
+#' model <- spark.bisectingKmeans(df, Class ~ Survived, k = 4)
 #' summary(model)
 #'
 #' # get fitted result from a bisecting k-means model
@@ -82,7 +83,7 @@ setClass("LDAModel", representation(jobj = "jobj"))
 #'
 #' # fitted values on training data
 #' fitted <- predict(model, df)
-#' head(select(fitted, "Sepal_Length", "prediction"))
+#' head(select(fitted, "Class", "prediction"))
 #'
 #' # save fitted model to input path
 #' path <- "path/to/model"
@@ -338,14 +339,14 @@ setMethod("write.ml", signature(object = "GaussianMixtureModel", path = "charact
 #' @examples
 #' \dontrun{
 #' sparkR.session()
-#' data(iris)
-#' df <- createDataFrame(iris)
-#' model <- spark.kmeans(df, Sepal_Length ~ Sepal_Width, k = 4, initMode = "random")
+#' t <- as.data.frame(Titanic)
+#' df <- createDataFrame(t)
+#' model <- spark.kmeans(df, Class ~ Survived, k = 4, initMode = "random")
 #' summary(model)
 #'
 #' # fitted values on training data
 #' fitted <- predict(model, df)
-#' head(select(fitted, "Sepal_Length", "prediction"))
+#' head(select(fitted, "Class", "prediction"))
 #'
 #' # save fitted model to input path
 #' path <- "path/to/model"

R/pkg/R/mllib_regression.R

Lines changed: 23 additions & 15 deletions
@@ -68,14 +68,14 @@ setClass("IsotonicRegressionModel", representation(jobj = "jobj"))
 #' @examples
 #' \dontrun{
 #' sparkR.session()
-#' data(iris)
-#' df <- createDataFrame(iris)
-#' model <- spark.glm(df, Sepal_Length ~ Sepal_Width, family = "gaussian")
+#' t <- as.data.frame(Titanic)
+#' df <- createDataFrame(t)
+#' model <- spark.glm(df, Freq ~ Sex + Age, family = "gaussian")
 #' summary(model)
 #'
 #' # fitted values on training data
 #' fitted <- predict(model, df)
-#' head(select(fitted, "Sepal_Length", "prediction"))
+#' head(select(fitted, "Freq", "prediction"))
 #'
 #' # save fitted model to input path
 #' path <- "path/to/model"
@@ -102,14 +102,16 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
             }
 
             formula <- paste(deparse(formula), collapse = "")
-            if (is.null(weightCol)) {
-              weightCol <- ""
+            if (!is.null(weightCol) && weightCol == "") {
+              weightCol <- NULL
+            } else if (!is.null(weightCol)) {
+              weightCol <- as.character(weightCol)
             }
 
             # For known families, Gamma is upper-cased
             jobj <- callJStatic("org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper",
                                 "fit", formula, data@sdf, tolower(family$family), family$link,
-                                tol, as.integer(maxIter), as.character(weightCol), regParam)
+                                tol, as.integer(maxIter), weightCol, regParam)
             new("GeneralizedLinearRegressionModel", jobj = jobj)
           })
 
@@ -135,9 +137,9 @@ setMethod("spark.glm", signature(data = "SparkDataFrame", formula = "formula"),
 #' @examples
 #' \dontrun{
 #' sparkR.session()
-#' data(iris)
-#' df <- createDataFrame(iris)
-#' model <- glm(Sepal_Length ~ Sepal_Width, df, family = "gaussian")
+#' t <- as.data.frame(Titanic)
+#' df <- createDataFrame(t)
+#' model <- glm(Freq ~ Sex + Age, df, family = "gaussian")
 #' summary(model)
 #' }
 #' @note glm since 1.5.0
@@ -305,13 +307,15 @@ setMethod("spark.isoreg", signature(data = "SparkDataFrame", formula = "formula"
           function(data, formula, isotonic = TRUE, featureIndex = 0, weightCol = NULL) {
             formula <- paste(deparse(formula), collapse = "")
 
-            if (is.null(weightCol)) {
-              weightCol <- ""
+            if (!is.null(weightCol) && weightCol == "") {
+              weightCol <- NULL
+            } else if (!is.null(weightCol)) {
+              weightCol <- as.character(weightCol)
             }
 
             jobj <- callJStatic("org.apache.spark.ml.r.IsotonicRegressionWrapper", "fit",
                                 data@sdf, formula, as.logical(isotonic), as.integer(featureIndex),
-                                as.character(weightCol))
+                                weightCol)
             new("IsotonicRegressionModel", jobj = jobj)
           })
 
@@ -372,6 +376,10 @@ setMethod("write.ml", signature(object = "IsotonicRegressionModel", path = "char
 #' @param formula a symbolic description of the model to be fitted. Currently only a few formula
 #'                operators are supported, including '~', ':', '+', and '-'.
 #'                Note that operator '.' is not supported currently.
+#' @param aggregationDepth The depth for treeAggregate (greater than or equal to 2). If the dimensions of features
+#'                         or the number of partitions are large, this param could be adjusted to a larger size.
+#'                         This is an expert parameter. Default value should be good for most cases.
+#' @param ... additional arguments passed to the method.
 #' @return \code{spark.survreg} returns a fitted AFT survival regression model.
 #' @rdname spark.survreg
 #' @seealso survival: \url{https://cran.r-project.org/package=survival}
@@ -396,10 +404,10 @@ setMethod("write.ml", signature(object = "IsotonicRegressionModel", path = "char
 #' }
 #' @note spark.survreg since 2.0.0
 setMethod("spark.survreg", signature(data = "SparkDataFrame", formula = "formula"),
-          function(data, formula) {
+          function(data, formula, aggregationDepth = 2) {
            formula <- paste(deparse(formula), collapse = "")
            jobj <- callJStatic("org.apache.spark.ml.r.AFTSurvivalRegressionWrapper",
-                               "fit", formula, data@sdf)
+                               "fit", formula, data@sdf, as.integer(aggregationDepth))
            new("AFTSurvivalRegressionModel", jobj = jobj)
           })
 
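A sketch of spark.survreg with the new aggregationDepth argument; the ovarian data set and formula mirror the existing spark.survreg documentation examples rather than this diff, and loading the survival package is an assumption:

# Minimal sketch, assuming a running SparkR session and the survival package
# (for the ovarian data set) is installed.
library(SparkR)
library(survival)
sparkR.session()

df <- createDataFrame(ovarian)
# aggregationDepth tunes treeAggregate (>= 2); the default is fine for most data.
model <- spark.survreg(df, Surv(futime, fustat) ~ ecog_ps + rx,
                       aggregationDepth = 2)
summary(model)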

R/pkg/R/mllib_tree.R

Lines changed: 10 additions & 8 deletions
@@ -143,14 +143,15 @@ print.summary.treeEnsemble <- function(x) {
 #'
 #' # fit a Gradient Boosted Tree Classification Model
 #' # label must be binary - Only binary classification is supported for GBT.
-#' df <- createDataFrame(iris[iris$Species != "virginica", ])
-#' model <- spark.gbt(df, Species ~ Petal_Length + Petal_Width, "classification")
+#' t <- as.data.frame(Titanic)
+#' df <- createDataFrame(t)
+#' model <- spark.gbt(df, Survived ~ Age + Freq, "classification")
 #'
 #' # numeric label is also supported
-#' iris2 <- iris[iris$Species != "virginica", ]
-#' iris2$NumericSpecies <- ifelse(iris2$Species == "setosa", 0, 1)
-#' df <- createDataFrame(iris2)
-#' model <- spark.gbt(df, NumericSpecies ~ ., type = "classification")
+#' t2 <- as.data.frame(Titanic)
+#' t2$NumericGender <- ifelse(t2$Sex == "Male", 0, 1)
+#' df <- createDataFrame(t2)
+#' model <- spark.gbt(df, NumericGender ~ ., type = "classification")
 #' }
 #' @note spark.gbt since 2.1.0
 setMethod("spark.gbt", signature(data = "SparkDataFrame", formula = "formula"),
@@ -351,8 +352,9 @@ setMethod("write.ml", signature(object = "GBTClassificationModel", path = "chara
 #' summary(savedModel)
 #'
 #' # fit a Random Forest Classification Model
-#' df <- createDataFrame(iris)
-#' model <- spark.randomForest(df, Species ~ Petal_Length + Petal_Width, "classification")
+#' t <- as.data.frame(Titanic)
+#' df <- createDataFrame(t)
+#' model <- spark.randomForest(df, Survived ~ Freq + Age, "classification")
 #' }
 #' @note spark.randomForest since 2.1.0
 setMethod("spark.randomForest", signature(data = "SparkDataFrame", formula = "formula"),

R/pkg/inst/tests/testthat/test_mllib_classification.R

Lines changed: 9 additions & 1 deletion
@@ -211,7 +211,15 @@ test_that("spark.logit", {
   df <- createDataFrame(data)
   model <- spark.logit(df, label ~ feature)
   prediction <- collect(select(predict(model, df), "prediction"))
-  expect_equal(prediction$prediction, c("0.0", "0.0", "1.0", "1.0", "0.0"))
+  expect_equal(sort(prediction$prediction), c("0.0", "0.0", "0.0", "1.0", "1.0"))
+
+  # Test prediction with weightCol
+  weight <- c(2.0, 2.0, 2.0, 1.0, 1.0)
+  data2 <- as.data.frame(cbind(label, feature, weight))
+  df2 <- createDataFrame(data2)
+  model2 <- spark.logit(df2, label ~ feature, weightCol = "weight")
+  prediction2 <- collect(select(predict(model2, df2), "prediction"))
+  expect_equal(sort(prediction2$prediction), c("0.0", "0.0", "0.0", "0.0", "0.0"))
 })
 
 test_that("spark.mlp", {
