From b3062ebcf09b37f826449a6f88f3162cb772eea2 Mon Sep 17 00:00:00 2001 From: Wayne Zhang Date: Sun, 7 May 2017 23:16:30 -0700 Subject: [PATCH 01/10] cherrypick 2fdaeb52bbe2ed1a9127ac72917286e505303c85 --- R/pkg/vignettes/sparkr-vignettes.Rmd | 36 ++++++++++++++-------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd index d16526d306d6..f53f77a24a40 100644 --- a/R/pkg/vignettes/sparkr-vignettes.Rmd +++ b/R/pkg/vignettes/sparkr-vignettes.Rmd @@ -65,7 +65,7 @@ We can view the first few rows of the `SparkDataFrame` by `head` or `showDF` fun head(carsDF) ``` -Common data processing operations such as `filter`, `select` are supported on the `SparkDataFrame`. +Common data processing operations such as `filter` and `select` are supported on the `SparkDataFrame`. ```{r} carsSubDF <- select(carsDF, "model", "mpg", "hp") carsSubDF <- filter(carsSubDF, carsSubDF$hp >= 200) @@ -364,7 +364,7 @@ out <- dapply(carsSubDF, function(x) { x <- cbind(x, x$mpg * 1.61) }, schema) head(collect(out)) ``` -Like `dapply`, apply a function to each partition of a `SparkDataFrame` and collect the result back. The output of function should be a `data.frame`, but no schema is required in this case. Note that `dapplyCollect` can fail if the output of UDF run on all the partition cannot be pulled to the driver and fit in driver memory. +Like `dapply`, `dapplyCollect` can apply a function to each partition of a `SparkDataFrame` and collect the result back. The output of the function should be a `data.frame`, but no schema is required in this case. Note that `dapplyCollect` can fail if the output of the UDF on all partitions cannot be pulled into the driver's memory. ```{r} out <- dapplyCollect( @@ -390,7 +390,7 @@ result <- gapply( head(arrange(result, "max_mpg", decreasing = TRUE)) ``` -Like gapply, `gapplyCollect` applies a function to each partition of a `SparkDataFrame` and collect the result back to R `data.frame`. The output of the function should be a `data.frame` but no schema is required in this case. Note that `gapplyCollect` can fail if the output of UDF run on all the partition cannot be pulled to the driver and fit in driver memory. +Like `gapply`, `gapplyCollect` can apply a function to each partition of a `SparkDataFrame` and collect the result back to R `data.frame`. The output of the function should be a `data.frame` but no schema is required in this case. Note that `gapplyCollect` can fail if the output of the UDF on all partitions cannot be pulled into the driver's memory. ```{r} result <- gapplyCollect( @@ -443,20 +443,20 @@ options(ops) ### SQL Queries -A `SparkDataFrame` can also be registered as a temporary view in Spark SQL and that allows you to run SQL queries over its data. The sql function enables applications to run SQL queries programmatically and returns the result as a `SparkDataFrame`. +A `SparkDataFrame` can also be registered as a temporary view in Spark SQL so that one can run SQL queries over its data. The sql function enables applications to run SQL queries programmatically and returns the result as a `SparkDataFrame`. ```{r} people <- read.df(paste0(sparkR.conf("spark.home"), "/examples/src/main/resources/people.json"), "json") ``` -Register this SparkDataFrame as a temporary view. +Register this `SparkDataFrame` as a temporary view. ```{r} createOrReplaceTempView(people, "people") ``` -SQL statements can be run by using the sql method. +SQL statements can be run using the sql method. 
```{r} teenagers <- sql("SELECT name FROM people WHERE age >= 13 AND age <= 19") head(teenagers) @@ -718,7 +718,7 @@ head(predict(isoregModel, newDF)) `spark.gbt` fits a [gradient-boosted tree](https://en.wikipedia.org/wiki/Gradient_boosting) classification or regression model on a `SparkDataFrame`. Users can call `summary` to get a summary of the fitted model, `predict` to make predictions, and `write.ml`/`read.ml` to save/load fitted models. -Similar to the random forest example above, we use the `longley` dataset to train a gradient-boosted tree and make predictions: +We use the `longley` dataset to train a gradient-boosted tree and make predictions: ```{r, warning=FALSE} df <- createDataFrame(longley) @@ -745,7 +745,7 @@ predictions <- predict(rfModel, df) `spark.gaussianMixture` fits multivariate [Gaussian Mixture Model](https://en.wikipedia.org/wiki/Mixture_model#Multivariate_Gaussian_mixture_model) (GMM) against a `SparkDataFrame`. [Expectation-Maximization](https://en.wikipedia.org/wiki/Expectation%E2%80%93maximization_algorithm) (EM) is used to approximate the maximum likelihood estimator (MLE) of the model. -We use a simulated example to demostrate the usage. +We use a simulated example to demonstrate the usage. ```{r} X1 <- data.frame(V1 = rnorm(4), V2 = rnorm(4)) X2 <- data.frame(V1 = rnorm(6, 3), V2 = rnorm(6, 4)) @@ -776,9 +776,9 @@ head(select(kmeansPredictions, "model", "mpg", "hp", "wt", "prediction"), n = 20 * Topics and documents both exist in a feature space, where feature vectors are vectors of word counts (bag of words). -* Rather than estimating a clustering using a traditional distance, LDA uses a function based on a statistical model of how text documents are generated. +* Rather than clustering using a traditional distance, LDA uses a function based on a statistical model of how text documents are generated. -To use LDA, we need to specify a `features` column in `data` where each entry represents a document. There are two type options for the column: +To use LDA, we need to specify a `features` column in `data` where each entry represents a document. There are two options for the column: * character string: This can be a string of the whole document. It will be parsed automatically. Additional stop words can be added in `customizedStopWords`. @@ -826,7 +826,7 @@ perplexity `spark.als` learns latent factors in [collaborative filtering](https://en.wikipedia.org/wiki/Recommender_system#Collaborative_filtering) via [alternating least squares](http://dl.acm.org/citation.cfm?id=1608614). -There are multiple options that can be configured in `spark.als`, including `rank`, `reg`, `nonnegative`. For a complete list, refer to the help file. +There are multiple options that can be configured in `spark.als`, including `rank`, `reg`, and `nonnegative`. For a complete list, refer to the help file. ```{r} ratings <- list(list(0, 0, 4.0), list(0, 1, 2.0), list(1, 1, 3.0), list(1, 2, 4.0), @@ -875,7 +875,7 @@ testSummary ### Model Persistence -The following example shows how to save/load an ML model by SparkR. +The following example shows how to save/load an ML model in SparkR. ```{r, warning=FALSE} irisDF <- createDataFrame(iris) gaussianGLM <- spark.glm(irisDF, Sepal_Length ~ Sepal_Width + Species, family = "gaussian") @@ -906,19 +906,19 @@ There are three main object classes in SparkR you may be working with. + `sdf` stores a reference to the corresponding Spark Dataset in the Spark JVM backend. + `env` saves the meta-information of the object such as `isCached`. 
-It can be created by data import methods or by transforming an existing `SparkDataFrame`. We can manipulate `SparkDataFrame` by numerous data processing functions and feed that into machine learning algorithms. + It can be created by data import methods or by transforming an existing `SparkDataFrame`. We can manipulate `SparkDataFrame` by numerous data processing functions and feed that into machine learning algorithms. -* `Column`: an S4 class representing column of `SparkDataFrame`. The slot `jc` saves a reference to the corresponding Column object in the Spark JVM backend. +* `Column`: an S4 class representing a column of `SparkDataFrame`. The slot `jc` saves a reference to the corresponding `Column` object in the Spark JVM backend. -It can be obtained from a `SparkDataFrame` by `$` operator, `df$col`. More often, it is used together with other functions, for example, with `select` to select particular columns, with `filter` and constructed conditions to select rows, with aggregation functions to compute aggregate statistics for each group. + It can be obtained from a `SparkDataFrame` by `$` operator, e.g., `df$col`. More often, it is used together with other functions, for example, with `select` to select particular columns, with `filter` and constructed conditions to select rows, with aggregation functions to compute aggregate statistics for each group. -* `GroupedData`: an S4 class representing grouped data created by `groupBy` or by transforming other `GroupedData`. Its `sgd` slot saves a reference to a RelationalGroupedDataset object in the backend. +* `GroupedData`: an S4 class representing grouped data created by `groupBy` or by transforming other `GroupedData`. Its `sgd` slot saves a reference to a `RelationalGroupedDataset` object in the backend. -This is often an intermediate object with group information and followed up by aggregation operations. + This is often an intermediate object with group information and followed up by aggregation operations. ### Architecture -A complete description of architecture can be seen in reference, in particular the paper *SparkR: Scaling R Programs with Spark*. +A complete description of architecture can be seen in the references, in particular the paper *SparkR: Scaling R Programs with Spark*. Under the hood of SparkR is Spark SQL engine. This avoids the overheads of running interpreted R code, and the optimized SQL execution engine in Spark uses structural information about data and computation flow to perform a bunch of optimizations to speed up the computation. 
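A note on the `dapplyCollect`/`gapplyCollect` caveat this patch clarifies: since the collected output must fit in driver memory, a common safeguard is to reduce each group to a small summary before collecting. A minimal sketch, assuming the `carsDF` SparkDataFrame defined earlier in the vignette and an active SparkR session:

```r
# Each group is reduced to a single summary row before collection, so the
# data.frame pulled back to the driver stays small regardless of input size.
result <- gapplyCollect(
  carsDF,
  "cyl",
  function(key, x) {
    # key holds the grouping value(s); x is an R data.frame for one group
    data.frame(cyl = key[[1]], max_hp = max(x$hp), n_models = nrow(x))
  })
head(result)
```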
From 5e9483871ad34fa8d653b69a77f066bd3183bd47 Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Sun, 7 May 2017 23:10:18 -0700 Subject: [PATCH 02/10] cherrypick c24bdaab5a234d18b273544cefc44cc4005bf8fc --- R/pkg/inst/tests/testthat/test_sparkSQL.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index 628512440d6e..07de45b5710e 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -89,6 +89,10 @@ mockLinesComplexType <- complexTypeJsonPath <- tempfile(pattern = "sparkr-test", fileext = ".tmp") writeLines(mockLinesComplexType, complexTypeJsonPath) +if (.Platform$OS.type == "windows") { + Sys.setenv(TZ = "GMT") +} + test_that("calling sparkRSQL.init returns existing SQL context", { sqlContext <- suppressWarnings(sparkRSQL.init(sc)) expect_equal(suppressWarnings(sparkRSQL.init(sc)), sqlContext) From 15c289477fe8313920f3656734fdd0eba1cfb3dc Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Fri, 8 Sep 2017 00:48:23 -0700 Subject: [PATCH 03/10] manual port of 0b0be47e7b742d96810c60b19a9aa920242e5224 --- .../tests/{testthat => fulltests}/jarTest.R | 0 .../packageInAJarTest.R | 0 .../{testthat => fulltests}/test_Serde.R | 0 .../{testthat => fulltests}/test_Windows.R | 0 .../{testthat => fulltests}/test_binaryFile.R | 0 .../test_binary_function.R | 0 .../{testthat => fulltests}/test_broadcast.R | 0 .../{testthat => fulltests}/test_client.R | 0 .../{testthat => fulltests}/test_context.R | 0 .../test_includePackage.R | 0 .../{testthat => fulltests}/test_jvm_api.R | 0 .../{testthat => fulltests}/test_mllib.R | 0 .../test_parallelize_collect.R | 0 .../tests/{testthat => fulltests}/test_rdd.R | 0 .../{testthat => fulltests}/test_shuffle.R | 0 .../{testthat => fulltests}/test_sparkR.R | 0 .../{testthat => fulltests}/test_sparkSQL.R | 0 .../tests/{testthat => fulltests}/test_take.R | 0 .../{testthat => fulltests}/test_textFile.R | 0 .../{testthat => fulltests}/test_utils.R | 0 R/pkg/inst/tests/testthat/test_basic.R | 90 +++++++++++++++++++ R/pkg/tests/run-all.R | 20 ++++- 22 files changed, 109 insertions(+), 1 deletion(-) rename R/pkg/inst/tests/{testthat => fulltests}/jarTest.R (100%) rename R/pkg/inst/tests/{testthat => fulltests}/packageInAJarTest.R (100%) rename R/pkg/inst/tests/{testthat => fulltests}/test_Serde.R (100%) rename R/pkg/inst/tests/{testthat => fulltests}/test_Windows.R (100%) rename R/pkg/inst/tests/{testthat => fulltests}/test_binaryFile.R (100%) rename R/pkg/inst/tests/{testthat => fulltests}/test_binary_function.R (100%) rename R/pkg/inst/tests/{testthat => fulltests}/test_broadcast.R (100%) rename R/pkg/inst/tests/{testthat => fulltests}/test_client.R (100%) rename R/pkg/inst/tests/{testthat => fulltests}/test_context.R (100%) rename R/pkg/inst/tests/{testthat => fulltests}/test_includePackage.R (100%) rename R/pkg/inst/tests/{testthat => fulltests}/test_jvm_api.R (100%) rename R/pkg/inst/tests/{testthat => fulltests}/test_mllib.R (100%) rename R/pkg/inst/tests/{testthat => fulltests}/test_parallelize_collect.R (100%) rename R/pkg/inst/tests/{testthat => fulltests}/test_rdd.R (100%) rename R/pkg/inst/tests/{testthat => fulltests}/test_shuffle.R (100%) rename R/pkg/inst/tests/{testthat => fulltests}/test_sparkR.R (100%) rename R/pkg/inst/tests/{testthat => fulltests}/test_sparkSQL.R (100%) rename R/pkg/inst/tests/{testthat => fulltests}/test_take.R (100%) rename R/pkg/inst/tests/{testthat => fulltests}/test_textFile.R (100%) rename 
R/pkg/inst/tests/{testthat => fulltests}/test_utils.R (100%) create mode 100644 R/pkg/inst/tests/testthat/test_basic.R diff --git a/R/pkg/inst/tests/testthat/jarTest.R b/R/pkg/inst/tests/fulltests/jarTest.R similarity index 100% rename from R/pkg/inst/tests/testthat/jarTest.R rename to R/pkg/inst/tests/fulltests/jarTest.R diff --git a/R/pkg/inst/tests/testthat/packageInAJarTest.R b/R/pkg/inst/tests/fulltests/packageInAJarTest.R similarity index 100% rename from R/pkg/inst/tests/testthat/packageInAJarTest.R rename to R/pkg/inst/tests/fulltests/packageInAJarTest.R diff --git a/R/pkg/inst/tests/testthat/test_Serde.R b/R/pkg/inst/tests/fulltests/test_Serde.R similarity index 100% rename from R/pkg/inst/tests/testthat/test_Serde.R rename to R/pkg/inst/tests/fulltests/test_Serde.R diff --git a/R/pkg/inst/tests/testthat/test_Windows.R b/R/pkg/inst/tests/fulltests/test_Windows.R similarity index 100% rename from R/pkg/inst/tests/testthat/test_Windows.R rename to R/pkg/inst/tests/fulltests/test_Windows.R diff --git a/R/pkg/inst/tests/testthat/test_binaryFile.R b/R/pkg/inst/tests/fulltests/test_binaryFile.R similarity index 100% rename from R/pkg/inst/tests/testthat/test_binaryFile.R rename to R/pkg/inst/tests/fulltests/test_binaryFile.R diff --git a/R/pkg/inst/tests/testthat/test_binary_function.R b/R/pkg/inst/tests/fulltests/test_binary_function.R similarity index 100% rename from R/pkg/inst/tests/testthat/test_binary_function.R rename to R/pkg/inst/tests/fulltests/test_binary_function.R diff --git a/R/pkg/inst/tests/testthat/test_broadcast.R b/R/pkg/inst/tests/fulltests/test_broadcast.R similarity index 100% rename from R/pkg/inst/tests/testthat/test_broadcast.R rename to R/pkg/inst/tests/fulltests/test_broadcast.R diff --git a/R/pkg/inst/tests/testthat/test_client.R b/R/pkg/inst/tests/fulltests/test_client.R similarity index 100% rename from R/pkg/inst/tests/testthat/test_client.R rename to R/pkg/inst/tests/fulltests/test_client.R diff --git a/R/pkg/inst/tests/testthat/test_context.R b/R/pkg/inst/tests/fulltests/test_context.R similarity index 100% rename from R/pkg/inst/tests/testthat/test_context.R rename to R/pkg/inst/tests/fulltests/test_context.R diff --git a/R/pkg/inst/tests/testthat/test_includePackage.R b/R/pkg/inst/tests/fulltests/test_includePackage.R similarity index 100% rename from R/pkg/inst/tests/testthat/test_includePackage.R rename to R/pkg/inst/tests/fulltests/test_includePackage.R diff --git a/R/pkg/inst/tests/testthat/test_jvm_api.R b/R/pkg/inst/tests/fulltests/test_jvm_api.R similarity index 100% rename from R/pkg/inst/tests/testthat/test_jvm_api.R rename to R/pkg/inst/tests/fulltests/test_jvm_api.R diff --git a/R/pkg/inst/tests/testthat/test_mllib.R b/R/pkg/inst/tests/fulltests/test_mllib.R similarity index 100% rename from R/pkg/inst/tests/testthat/test_mllib.R rename to R/pkg/inst/tests/fulltests/test_mllib.R diff --git a/R/pkg/inst/tests/testthat/test_parallelize_collect.R b/R/pkg/inst/tests/fulltests/test_parallelize_collect.R similarity index 100% rename from R/pkg/inst/tests/testthat/test_parallelize_collect.R rename to R/pkg/inst/tests/fulltests/test_parallelize_collect.R diff --git a/R/pkg/inst/tests/testthat/test_rdd.R b/R/pkg/inst/tests/fulltests/test_rdd.R similarity index 100% rename from R/pkg/inst/tests/testthat/test_rdd.R rename to R/pkg/inst/tests/fulltests/test_rdd.R diff --git a/R/pkg/inst/tests/testthat/test_shuffle.R b/R/pkg/inst/tests/fulltests/test_shuffle.R similarity index 100% rename from R/pkg/inst/tests/testthat/test_shuffle.R rename to 
R/pkg/inst/tests/fulltests/test_shuffle.R diff --git a/R/pkg/inst/tests/testthat/test_sparkR.R b/R/pkg/inst/tests/fulltests/test_sparkR.R similarity index 100% rename from R/pkg/inst/tests/testthat/test_sparkR.R rename to R/pkg/inst/tests/fulltests/test_sparkR.R diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/fulltests/test_sparkSQL.R similarity index 100% rename from R/pkg/inst/tests/testthat/test_sparkSQL.R rename to R/pkg/inst/tests/fulltests/test_sparkSQL.R diff --git a/R/pkg/inst/tests/testthat/test_take.R b/R/pkg/inst/tests/fulltests/test_take.R similarity index 100% rename from R/pkg/inst/tests/testthat/test_take.R rename to R/pkg/inst/tests/fulltests/test_take.R diff --git a/R/pkg/inst/tests/testthat/test_textFile.R b/R/pkg/inst/tests/fulltests/test_textFile.R similarity index 100% rename from R/pkg/inst/tests/testthat/test_textFile.R rename to R/pkg/inst/tests/fulltests/test_textFile.R diff --git a/R/pkg/inst/tests/testthat/test_utils.R b/R/pkg/inst/tests/fulltests/test_utils.R similarity index 100% rename from R/pkg/inst/tests/testthat/test_utils.R rename to R/pkg/inst/tests/fulltests/test_utils.R diff --git a/R/pkg/inst/tests/testthat/test_basic.R b/R/pkg/inst/tests/testthat/test_basic.R new file mode 100644 index 000000000000..de47162d5325 --- /dev/null +++ b/R/pkg/inst/tests/testthat/test_basic.R @@ -0,0 +1,90 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +context("basic tests for CRAN") + +test_that("create DataFrame from list or data.frame", { + sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE) + + i <- 4 + df <- createDataFrame(data.frame(dummy = 1:i)) + expect_equal(count(df), i) + + l <- list(list(a = 1, b = 2), list(a = 3, b = 4)) + df <- createDataFrame(l) + expect_equal(columns(df), c("a", "b")) + + a <- 1:3 + b <- c("a", "b", "c") + ldf <- data.frame(a, b) + df <- createDataFrame(ldf) + expect_equal(columns(df), c("a", "b")) + expect_equal(dtypes(df), list(c("a", "int"), c("b", "string"))) + expect_equal(count(df), 3) + ldf2 <- collect(df) + expect_equal(ldf$a, ldf2$a) + + mtcarsdf <- createDataFrame(mtcars) + expect_equivalent(collect(mtcarsdf), mtcars) + + bytes <- as.raw(c(1, 2, 3)) + df <- createDataFrame(list(list(bytes))) + expect_equal(collect(df)[[1]][[1]], bytes) + + sparkR.session.stop() +}) + +test_that("spark.glm and predict", { + sparkR.session(master = sparkRTestMaster, enableHiveSupport = FALSE) + + training <- suppressWarnings(createDataFrame(iris)) + # gaussian family + model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species) + prediction <- predict(model, training) + expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double") + vals <- collect(select(prediction, "prediction")) + rVals <- predict(glm(Sepal.Width ~ Sepal.Length + Species, data = iris), iris) + expect_true(all(abs(rVals - vals) < 1e-6), rVals - vals) + + # Gamma family + x <- runif(100, -1, 1) + y <- rgamma(100, rate = 10 / exp(0.5 + 1.2 * x), shape = 10) + df <- as.DataFrame(as.data.frame(list(x = x, y = y))) + model <- glm(y ~ x, family = Gamma, df) + out <- capture.output(print(summary(model))) + expect_true(any(grepl("Dispersion parameter for gamma family", out))) + + # tweedie family + model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species, + family = "tweedie", var.power = 1.2, link.power = 0.0) + prediction <- predict(model, training) + expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double") + vals <- collect(select(prediction, "prediction")) + + # manual calculation of the R predicted values to avoid dependence on statmod + #' library(statmod) + #' rModel <- glm(Sepal.Width ~ Sepal.Length + Species, data = iris, + #' family = tweedie(var.power = 1.2, link.power = 0.0)) + #' print(coef(rModel)) + + rCoef <- c(0.6455409, 0.1169143, -0.3224752, -0.3282174) + rVals <- exp(as.numeric(model.matrix(Sepal.Width ~ Sepal.Length + Species, + data = iris) %*% rCoef)) + expect_true(all(abs(rVals - vals) < 1e-5), rVals - vals) + + sparkR.session.stop() +}) diff --git a/R/pkg/tests/run-all.R b/R/pkg/tests/run-all.R index 9b75d9556692..f00a61067975 100644 --- a/R/pkg/tests/run-all.R +++ b/R/pkg/tests/run-all.R @@ -21,13 +21,31 @@ library(SparkR) # Turn all warnings into errors options("warn" = 2) -install.spark() +if (.Platform$OS.type == "windows") { + Sys.setenv(TZ = "GMT") +} # Setup global test environment +# Install Spark first to set SPARK_HOME +install.spark() + sparkRDir <- file.path(Sys.getenv("SPARK_HOME"), "R") sparkRFilesBefore <- list.files(path = sparkRDir, all.files = TRUE) sparkRWhitelistSQLDirs <- c("spark-warehouse", "metastore_db") invisible(lapply(sparkRWhitelistSQLDirs, function(x) { unlink(file.path(sparkRDir, x), recursive = TRUE, force = TRUE)})) +sparkRTestMaster <- "local[1]" +if (identical(Sys.getenv("NOT_CRAN"), "true")) { + sparkRTestMaster <- "" +} + test_package("SparkR") + +if (identical(Sys.getenv("NOT_CRAN"), "true")) { + # for 
testthat 1.0.2 and later, change reporter from "summary" to default_reporter()
+  testthat:::run_tests("SparkR",
+                       file.path(sparkRDir, "pkg", "tests", "fulltests"),
+                       NULL,
+                       "summary")
+}

From 79c82c773cbd8b1dbb59f8329c621bdbe9e2c287 Mon Sep 17 00:00:00 2001
From: Felix Cheung
Date: Fri, 8 Sep 2017 00:50:16 -0700
Subject: [PATCH 04/10] port subset of 9f4ff9552470fb97ca38bb56bbf43be49a9a316c

---
 R/pkg/.Rbuildignore | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/R/pkg/.Rbuildignore b/R/pkg/.Rbuildignore
index f12f8c275a98..280ab5875998 100644
--- a/R/pkg/.Rbuildignore
+++ b/R/pkg/.Rbuildignore
@@ -6,3 +6,5 @@
 ^README\.Rmd$
 ^src-native$
 ^html$
+^tests/fulltests/*
+

From a74ba0276705e9ac84f56b57a558b2211cdf947b Mon Sep 17 00:00:00 2001
From: hyukjinkwon
Date: Sun, 18 Jun 2017 11:26:27 -0700
Subject: [PATCH 05/10] [SPARK-21128][R] Remove both "spark-warehouse" and "metastore_db" before listing files in R tests

## What changes were proposed in this pull request?

This PR proposes to list the files in the test _after_ removing both "spark-warehouse" and "metastore_db", so that the next run of the R tests passes. This is sometimes a bit annoying.

## How was this patch tested?

Manually running the R tests multiple times via `./R/run-tests.sh`.

**Before**

Second run:
```
SparkSQL functions: Spark package found in SPARK_HOME: .../spark
...............................................................................................................................................................
...............................................................................................................................................................
...............................................................................................................................................................
...............................................................................................................................................................
...............................................................................................................................................................
....................................................................................................1234.......................
Failed -------------------------------------------------------------------------
1. Failure: No extra files are created in SPARK_HOME by starting session and making calls (test_sparkSQL.R#3384)
length(list1) not equal to length(list2).
1/1 mismatches
[1] 25 - 23 == 2

2. Failure: No extra files are created in SPARK_HOME by starting session and making calls (test_sparkSQL.R#3384)
sort(list1, na.last = TRUE) not equal to sort(list2, na.last = TRUE).
10/25 mismatches
x[16]: "metastore_db"
y[16]: "pkg"
x[17]: "pkg"
y[17]: "R"
x[18]: "R"
y[18]: "README.md"
x[19]: "README.md"
y[19]: "run-tests.sh"
x[20]: "run-tests.sh"
y[20]: "SparkR_2.2.0.tar.gz"
x[21]: "metastore_db"
y[21]: "pkg"
x[22]: "pkg"
y[22]: "R"
x[23]: "R"
y[23]: "README.md"
x[24]: "README.md"
y[24]: "run-tests.sh"
x[25]: "run-tests.sh"
y[25]: "SparkR_2.2.0.tar.gz"

3. Failure: No extra files are created in SPARK_HOME by starting session and making calls (test_sparkSQL.R#3388)
length(list1) not equal to length(list2).
1/1 mismatches
[1] 25 - 23 == 2

4. Failure: No extra files are created in SPARK_HOME by starting session and making calls (test_sparkSQL.R#3388)
sort(list1, na.last = TRUE) not equal to sort(list2, na.last = TRUE).
10/25 mismatches
x[16]: "metastore_db"
y[16]: "pkg"
x[17]: "pkg"
y[17]: "R"
x[18]: "R"
y[18]: "README.md"
x[19]: "README.md"
y[19]: "run-tests.sh"
x[20]: "run-tests.sh"
y[20]: "SparkR_2.2.0.tar.gz"
x[21]: "metastore_db"
y[21]: "pkg"
x[22]: "pkg"
y[22]: "R"
x[23]: "R"
y[23]: "README.md"
x[24]: "README.md"
y[24]: "run-tests.sh"
x[25]: "run-tests.sh"
y[25]: "SparkR_2.2.0.tar.gz"

DONE ===========================================================================
```

**After**

Second run:
```
SparkSQL functions: Spark package found in SPARK_HOME: .../spark
...............................................................................................................................................................
...............................................................................................................................................................
...............................................................................................................................................................
...............................................................................................................................................................
...............................................................................................................................................................
...............................................................................................................................
```

Author: hyukjinkwon

Closes #18335 from HyukjinKwon/SPARK-21128.
---
 R/pkg/tests/run-all.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/pkg/tests/run-all.R b/R/pkg/tests/run-all.R
index f00a61067975..0aefd8006caa 100644
--- a/R/pkg/tests/run-all.R
+++ b/R/pkg/tests/run-all.R
@@ -30,10 +30,10 @@ if (.Platform$OS.type == "windows") {
 install.spark()
 
 sparkRDir <- file.path(Sys.getenv("SPARK_HOME"), "R")
-sparkRFilesBefore <- list.files(path = sparkRDir, all.files = TRUE)
 sparkRWhitelistSQLDirs <- c("spark-warehouse", "metastore_db")
 invisible(lapply(sparkRWhitelistSQLDirs,
                  function(x) { unlink(file.path(sparkRDir, x), recursive = TRUE, force = TRUE)}))
+sparkRFilesBefore <- list.files(path = sparkRDir, all.files = TRUE)
 
 sparkRTestMaster <- "local[1]"
 if (identical(Sys.getenv("NOT_CRAN"), "true")) {

From 129a9798a98ddd86601c876fb83eeb5f5995d814 Mon Sep 17 00:00:00 2001
From: Felix Cheung
Date: Wed, 23 Aug 2017 21:35:17 -0700
Subject: [PATCH 06/10] [SPARK-21805][SPARKR] Disable R vignettes code on Windows

## What changes were proposed in this pull request?

Code in the vignettes requires winutils to run on Windows. When publishing to CRAN or building from source, winutils might not be available, so it's better to disable running the code (the resulting vignettes will not have output from the code, but the text and code are still there).

This fixes
* checking re-building of vignette outputs ... WARNING
and
> %LOCALAPPDATA% not found. Please define the environment variable or restart and enter an installation path in localDir.

## How was this patch tested?

jenkins, appveyor, r-hub

before: https://artifacts.r-hub.io/SparkR_2.2.0.tar.gz-49cecef3bb09db1db130db31604e0293/SparkR.Rcheck/00check.log
after: https://artifacts.r-hub.io/SparkR_2.2.0.tar.gz-86a066c7576f46794930ad114e5cff7c/SparkR.Rcheck/00check.log

Author: Felix Cheung

Closes #19016 from felixcheung/rvigwind.
--- R/pkg/DESCRIPTION | 2 +- R/pkg/R/install.R | 6 +++++- R/pkg/vignettes/sparkr-vignettes.Rmd | 11 +++++++++++ 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 2d461ca68920..899d41032570 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -2,7 +2,7 @@ Package: SparkR Type: Package Version: 2.1.2 Title: R Frontend for Apache Spark -Description: The SparkR package provides an R Frontend for Apache Spark. +Description: Provides an R Frontend for Apache Spark. Authors@R: c(person("Shivaram", "Venkataraman", role = c("aut", "cre"), email = "shivaram@cs.berkeley.edu"), person("Xiangrui", "Meng", role = "aut", diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index 4ca7aa664e02..082ae7d7481b 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -270,7 +270,11 @@ sparkCachePath <- function() { if (.Platform$OS.type == "windows") { winAppPath <- Sys.getenv("LOCALAPPDATA", unset = NA) if (is.na(winAppPath)) { - stop(paste("%LOCALAPPDATA% not found.", + message("%LOCALAPPDATA% not found. Falling back to %USERPROFILE%.") + winAppPath <- Sys.getenv("USERPROFILE", unset = NA) + } + if (is.na(winAppPath)) { + stop(paste("%LOCALAPPDATA% and %USERPROFILE% not found.", "Please define the environment variable", "or restart and enter an installation path in localDir.")) } else { diff --git a/R/pkg/vignettes/sparkr-vignettes.Rmd b/R/pkg/vignettes/sparkr-vignettes.Rmd index f53f77a24a40..0e344ddec49a 100644 --- a/R/pkg/vignettes/sparkr-vignettes.Rmd +++ b/R/pkg/vignettes/sparkr-vignettes.Rmd @@ -27,6 +27,17 @@ vignette: > limitations under the License. --> +```{r setup, include=FALSE} +library(knitr) +opts_hooks$set(eval = function(options) { + # override eval to FALSE only on windows + if (.Platform$OS.type == "windows") { + options$eval = FALSE + } + options +}) +``` + ## Overview SparkR is an R package that provides a light-weight frontend to use Apache Spark from R. With Spark `r packageVersion("SparkR")`, SparkR provides a distributed data frame implementation that supports data processing operations like selection, filtering, aggregation etc. and distributed machine learning using [MLlib](http://spark.apache.org/mllib/). 
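The `opts_hooks` setup chunk added by this patch keys evaluation off the operating system. The same knitr hook pattern can gate on other conditions; here is a hypothetical variant (the `SPARKR_VIGNETTE_EVAL` variable is invented for illustration) that skips evaluation unless explicitly opted in, e.g. on build machines without a working Spark installation:

```r
library(knitr)
# Hypothetical variant of the hook in the patch above: evaluate chunks only
# when an opt-in environment variable is set; otherwise keep the text and
# code in the rendered vignette but produce no output.
opts_hooks$set(eval = function(options) {
  if (!identical(Sys.getenv("SPARKR_VIGNETTE_EVAL"), "true")) {
    options$eval <- FALSE
  }
  options
})
```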
From b53f812d729bb56929876bde9872cf1f2451e63a Mon Sep 17 00:00:00 2001
From: Felix Cheung
Date: Fri, 8 Sep 2017 01:01:02 -0700
Subject: [PATCH 07/10] not cran

---
 R/run-tests.sh | 2 +-
 appveyor.yml   | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/R/run-tests.sh b/R/run-tests.sh
index 5e4dafaf76f3..dab6d2d31d45 100755
--- a/R/run-tests.sh
+++ b/R/run-tests.sh
@@ -23,7 +23,7 @@ FAILED=0
 LOGFILE=$FWDIR/unit-tests.out
 rm -f $LOGFILE
 
-SPARK_TESTING=1 $FWDIR/../bin/spark-submit --driver-java-options "-Dlog4j.configuration=file:$FWDIR/log4j.properties" --conf spark.hadoop.fs.default.name="file:///" $FWDIR/pkg/tests/run-all.R 2>&1 | tee -a $LOGFILE
+SPARK_TESTING=1 NOT_CRAN=true $FWDIR/../bin/spark-submit --driver-java-options "-Dlog4j.configuration=file:$FWDIR/log4j.properties" --conf spark.hadoop.fs.default.name="file:///" $FWDIR/pkg/tests/run-all.R 2>&1 | tee -a $LOGFILE
 FAILED=$((PIPESTATUS[0]||$FAILED))
 
 NUM_TEST_WARNING="$(grep -c -e 'Warnings ----------------' $LOGFILE)"

diff --git a/appveyor.yml b/appveyor.yml
index 5e756835bcb9..cec6e291accf 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -45,6 +45,9 @@ install:
 build_script:
   - cmd: mvn -DskipTests -Phadoop-2.6 -Psparkr -Phive -Phive-thriftserver package
 
+environment:
+  NOT_CRAN: true
+
 test_script:
   - cmd: .\bin\spark-submit2.cmd --conf spark.hadoop.fs.default.name="file:///" R\pkg\tests\run-all.R

From bb6f326580e795fcdea2bc637a3b93ed08256962 Mon Sep 17 00:00:00 2001
From: Felix Cheung
Date: Mon, 12 Jun 2017 22:08:49 -0700
Subject: [PATCH 08/10] [TEST][SPARKR][CORE] Fix broken SparkSubmitSuite

## What changes were proposed in this pull request?

Fix the test file path. This was broken in #18264 and went undetected, since R-only changes don't build core, and the subsequent post-commit build with the change passed (again because it wasn't building core). AppVeyor actually builds everything, but it's not running the Scala suites ...

## How was this patch tested?

jenkins
srowen gatorsmile

Author: Felix Cheung

Closes #18283 from felixcheung/rsubmitsuite.
(cherry picked from commit 278ba7a2c62b2cbb7bcfe79ce10d35ab57bb1950) Signed-off-by: Felix Cheung --- .../scala/org/apache/spark/deploy/SparkSubmitSuite.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala index d9e176a12226..499c38af483a 100644 --- a/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala +++ b/core/src/test/scala/org/apache/spark/deploy/SparkSubmitSuite.scala @@ -428,8 +428,8 @@ class SparkSubmitSuite assume(RUtils.isSparkRInstalled, "SparkR is not installed in this build.") val main = MavenCoordinate("my.great.lib", "mylib", "0.1") val sparkHome = sys.props.getOrElse("spark.test.home", fail("spark.test.home is not set!")) - val rScriptDir = - Seq(sparkHome, "R", "pkg", "inst", "tests", "packageInAJarTest.R").mkString(File.separator) + val rScriptDir = Seq( + sparkHome, "R", "pkg", "tests", "fulltests", "packageInAJarTest.R").mkString(File.separator) assert(new File(rScriptDir).exists) IvyTestUtils.withRepository(main, None, None, withR = true) { repo => val args = Seq( @@ -450,7 +450,7 @@ class SparkSubmitSuite // Check if the SparkR package is installed assume(RUtils.isSparkRInstalled, "SparkR is not installed in this build.") val rScriptDir = - Seq(sparkHome, "R", "pkg", "inst", "tests", "testthat", "jarTest.R").mkString(File.separator) + Seq(sparkHome, "R", "pkg", "tests", "fulltests", "jarTest.R").mkString(File.separator) assert(new File(rScriptDir).exists) // compile a small jar containing a class that will be called from R code. From 7ded0ae555247d588fe80dae5ab4763bed11295a Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Sat, 9 Sep 2017 16:46:58 -0700 Subject: [PATCH 09/10] tests in wrong directory --- R/pkg/{inst => }/tests/fulltests/jarTest.R | 0 R/pkg/{inst => }/tests/fulltests/packageInAJarTest.R | 0 R/pkg/{inst => }/tests/fulltests/test_Serde.R | 0 R/pkg/{inst => }/tests/fulltests/test_Windows.R | 0 R/pkg/{inst => }/tests/fulltests/test_binaryFile.R | 0 R/pkg/{inst => }/tests/fulltests/test_binary_function.R | 0 R/pkg/{inst => }/tests/fulltests/test_broadcast.R | 0 R/pkg/{inst => }/tests/fulltests/test_client.R | 0 R/pkg/{inst => }/tests/fulltests/test_context.R | 0 R/pkg/{inst => }/tests/fulltests/test_includePackage.R | 0 R/pkg/{inst => }/tests/fulltests/test_jvm_api.R | 0 R/pkg/{inst => }/tests/fulltests/test_mllib.R | 0 R/pkg/{inst => }/tests/fulltests/test_parallelize_collect.R | 0 R/pkg/{inst => }/tests/fulltests/test_rdd.R | 0 R/pkg/{inst => }/tests/fulltests/test_shuffle.R | 0 R/pkg/{inst => }/tests/fulltests/test_sparkR.R | 0 R/pkg/{inst => }/tests/fulltests/test_sparkSQL.R | 0 R/pkg/{inst => }/tests/fulltests/test_take.R | 0 R/pkg/{inst => }/tests/fulltests/test_textFile.R | 0 R/pkg/{inst => }/tests/fulltests/test_utils.R | 0 20 files changed, 0 insertions(+), 0 deletions(-) rename R/pkg/{inst => }/tests/fulltests/jarTest.R (100%) rename R/pkg/{inst => }/tests/fulltests/packageInAJarTest.R (100%) rename R/pkg/{inst => }/tests/fulltests/test_Serde.R (100%) rename R/pkg/{inst => }/tests/fulltests/test_Windows.R (100%) rename R/pkg/{inst => }/tests/fulltests/test_binaryFile.R (100%) rename R/pkg/{inst => }/tests/fulltests/test_binary_function.R (100%) rename R/pkg/{inst => }/tests/fulltests/test_broadcast.R (100%) rename R/pkg/{inst => }/tests/fulltests/test_client.R (100%) rename R/pkg/{inst => }/tests/fulltests/test_context.R (100%) rename R/pkg/{inst => 
}/tests/fulltests/test_includePackage.R (100%) rename R/pkg/{inst => }/tests/fulltests/test_jvm_api.R (100%) rename R/pkg/{inst => }/tests/fulltests/test_mllib.R (100%) rename R/pkg/{inst => }/tests/fulltests/test_parallelize_collect.R (100%) rename R/pkg/{inst => }/tests/fulltests/test_rdd.R (100%) rename R/pkg/{inst => }/tests/fulltests/test_shuffle.R (100%) rename R/pkg/{inst => }/tests/fulltests/test_sparkR.R (100%) rename R/pkg/{inst => }/tests/fulltests/test_sparkSQL.R (100%) rename R/pkg/{inst => }/tests/fulltests/test_take.R (100%) rename R/pkg/{inst => }/tests/fulltests/test_textFile.R (100%) rename R/pkg/{inst => }/tests/fulltests/test_utils.R (100%) diff --git a/R/pkg/inst/tests/fulltests/jarTest.R b/R/pkg/tests/fulltests/jarTest.R similarity index 100% rename from R/pkg/inst/tests/fulltests/jarTest.R rename to R/pkg/tests/fulltests/jarTest.R diff --git a/R/pkg/inst/tests/fulltests/packageInAJarTest.R b/R/pkg/tests/fulltests/packageInAJarTest.R similarity index 100% rename from R/pkg/inst/tests/fulltests/packageInAJarTest.R rename to R/pkg/tests/fulltests/packageInAJarTest.R diff --git a/R/pkg/inst/tests/fulltests/test_Serde.R b/R/pkg/tests/fulltests/test_Serde.R similarity index 100% rename from R/pkg/inst/tests/fulltests/test_Serde.R rename to R/pkg/tests/fulltests/test_Serde.R diff --git a/R/pkg/inst/tests/fulltests/test_Windows.R b/R/pkg/tests/fulltests/test_Windows.R similarity index 100% rename from R/pkg/inst/tests/fulltests/test_Windows.R rename to R/pkg/tests/fulltests/test_Windows.R diff --git a/R/pkg/inst/tests/fulltests/test_binaryFile.R b/R/pkg/tests/fulltests/test_binaryFile.R similarity index 100% rename from R/pkg/inst/tests/fulltests/test_binaryFile.R rename to R/pkg/tests/fulltests/test_binaryFile.R diff --git a/R/pkg/inst/tests/fulltests/test_binary_function.R b/R/pkg/tests/fulltests/test_binary_function.R similarity index 100% rename from R/pkg/inst/tests/fulltests/test_binary_function.R rename to R/pkg/tests/fulltests/test_binary_function.R diff --git a/R/pkg/inst/tests/fulltests/test_broadcast.R b/R/pkg/tests/fulltests/test_broadcast.R similarity index 100% rename from R/pkg/inst/tests/fulltests/test_broadcast.R rename to R/pkg/tests/fulltests/test_broadcast.R diff --git a/R/pkg/inst/tests/fulltests/test_client.R b/R/pkg/tests/fulltests/test_client.R similarity index 100% rename from R/pkg/inst/tests/fulltests/test_client.R rename to R/pkg/tests/fulltests/test_client.R diff --git a/R/pkg/inst/tests/fulltests/test_context.R b/R/pkg/tests/fulltests/test_context.R similarity index 100% rename from R/pkg/inst/tests/fulltests/test_context.R rename to R/pkg/tests/fulltests/test_context.R diff --git a/R/pkg/inst/tests/fulltests/test_includePackage.R b/R/pkg/tests/fulltests/test_includePackage.R similarity index 100% rename from R/pkg/inst/tests/fulltests/test_includePackage.R rename to R/pkg/tests/fulltests/test_includePackage.R diff --git a/R/pkg/inst/tests/fulltests/test_jvm_api.R b/R/pkg/tests/fulltests/test_jvm_api.R similarity index 100% rename from R/pkg/inst/tests/fulltests/test_jvm_api.R rename to R/pkg/tests/fulltests/test_jvm_api.R diff --git a/R/pkg/inst/tests/fulltests/test_mllib.R b/R/pkg/tests/fulltests/test_mllib.R similarity index 100% rename from R/pkg/inst/tests/fulltests/test_mllib.R rename to R/pkg/tests/fulltests/test_mllib.R diff --git a/R/pkg/inst/tests/fulltests/test_parallelize_collect.R b/R/pkg/tests/fulltests/test_parallelize_collect.R similarity index 100% rename from R/pkg/inst/tests/fulltests/test_parallelize_collect.R rename to 
R/pkg/tests/fulltests/test_parallelize_collect.R diff --git a/R/pkg/inst/tests/fulltests/test_rdd.R b/R/pkg/tests/fulltests/test_rdd.R similarity index 100% rename from R/pkg/inst/tests/fulltests/test_rdd.R rename to R/pkg/tests/fulltests/test_rdd.R diff --git a/R/pkg/inst/tests/fulltests/test_shuffle.R b/R/pkg/tests/fulltests/test_shuffle.R similarity index 100% rename from R/pkg/inst/tests/fulltests/test_shuffle.R rename to R/pkg/tests/fulltests/test_shuffle.R diff --git a/R/pkg/inst/tests/fulltests/test_sparkR.R b/R/pkg/tests/fulltests/test_sparkR.R similarity index 100% rename from R/pkg/inst/tests/fulltests/test_sparkR.R rename to R/pkg/tests/fulltests/test_sparkR.R diff --git a/R/pkg/inst/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R similarity index 100% rename from R/pkg/inst/tests/fulltests/test_sparkSQL.R rename to R/pkg/tests/fulltests/test_sparkSQL.R diff --git a/R/pkg/inst/tests/fulltests/test_take.R b/R/pkg/tests/fulltests/test_take.R similarity index 100% rename from R/pkg/inst/tests/fulltests/test_take.R rename to R/pkg/tests/fulltests/test_take.R diff --git a/R/pkg/inst/tests/fulltests/test_textFile.R b/R/pkg/tests/fulltests/test_textFile.R similarity index 100% rename from R/pkg/inst/tests/fulltests/test_textFile.R rename to R/pkg/tests/fulltests/test_textFile.R diff --git a/R/pkg/inst/tests/fulltests/test_utils.R b/R/pkg/tests/fulltests/test_utils.R similarity index 100% rename from R/pkg/inst/tests/fulltests/test_utils.R rename to R/pkg/tests/fulltests/test_utils.R From 9d53891843f008feb3fee4407e2dba5012c3d968 Mon Sep 17 00:00:00 2001 From: Felix Cheung Date: Sun, 10 Sep 2017 01:10:51 -0700 Subject: [PATCH 10/10] no tweedie in 2.1 --- R/pkg/inst/tests/testthat/test_basic.R | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/R/pkg/inst/tests/testthat/test_basic.R b/R/pkg/inst/tests/testthat/test_basic.R index de47162d5325..c0928677fbeb 100644 --- a/R/pkg/inst/tests/testthat/test_basic.R +++ b/R/pkg/inst/tests/testthat/test_basic.R @@ -68,23 +68,5 @@ test_that("spark.glm and predict", { out <- capture.output(print(summary(model))) expect_true(any(grepl("Dispersion parameter for gamma family", out))) - # tweedie family - model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species, - family = "tweedie", var.power = 1.2, link.power = 0.0) - prediction <- predict(model, training) - expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction), "double") - vals <- collect(select(prediction, "prediction")) - - # manual calculation of the R predicted values to avoid dependence on statmod - #' library(statmod) - #' rModel <- glm(Sepal.Width ~ Sepal.Length + Species, data = iris, - #' family = tweedie(var.power = 1.2, link.power = 0.0)) - #' print(coef(rModel)) - - rCoef <- c(0.6455409, 0.1169143, -0.3224752, -0.3282174) - rVals <- exp(as.numeric(model.matrix(Sepal.Width ~ Sepal.Length + Species, - data = iris) %*% rCoef)) - expect_true(all(abs(rVals - vals) < 1e-5), rVals - vals) - sparkR.session.stop() })
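PATCH 10 removes the tweedie assertions outright because the tweedie family only became available in `spark.glm` in Spark 2.2. As a sketch of an alternative the series did not take, the same block could instead be guarded on the package version, reusing the removed code:

```r
# Run the tweedie checks only where the family is supported (Spark >= 2.2);
# on a 2.1 build the guard simply skips them.
if (packageVersion("SparkR") >= "2.2.0") {
  model <- spark.glm(training, Sepal_Width ~ Sepal_Length + Species,
                     family = "tweedie", var.power = 1.2, link.power = 0.0)
  prediction <- predict(model, training)
  expect_equal(typeof(take(select(prediction, "prediction"), 1)$prediction),
               "double")
}
```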