Commit 22f6cd8

yanboliang authored and shivaram committed
[SPARK-12310][SPARKR] Add write.json and write.parquet for SparkR
Add ```write.json``` and ```write.parquet``` for SparkR, and deprecate ```saveAsParquetFile```.

Author: Yanbo Liang <[email protected]>

Closes #10281 from yanboliang/spark-12310.
1 parent 2eb5af5 commit 22f6cd8
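
Before the diff, a minimal usage sketch of the API this commit adds, assuming an initialized SparkR 1.x session; the input path is a placeholder:

```r
# Sketch only: assumes a live SparkR session; "path/to/people.json" is a placeholder.
sc <- sparkR.init()
sqlContext <- sparkRSQL.init(sc)
df <- read.json(sqlContext, "path/to/people.json")

# New in this commit: each writer saves the DataFrame under a directory.
write.json(df, "/tmp/sparkr-out-json/")
write.parquet(df, "/tmp/sparkr-out-parquet/")

# Still works, but now warns and delegates to write.parquet.
saveAsParquetFile(df, "/tmp/sparkr-out-parquet2/")
```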

4 files changed: +119 −56 lines changed

R/pkg/NAMESPACE

Lines changed: 3 additions & 1 deletion
@@ -92,7 +92,9 @@ exportMethods("arrange",
               "with",
               "withColumn",
               "withColumnRenamed",
-              "write.df")
+              "write.df",
+              "write.json",
+              "write.parquet")
 
 exportClasses("Column")

R/pkg/R/DataFrame.R

Lines changed: 45 additions & 6 deletions
@@ -596,30 +596,69 @@ setMethod("toJSON",
             RDD(jrdd, serializedMode = "string")
           })
 
-#' saveAsParquetFile
+#' write.json
+#'
+#' Save the contents of a DataFrame as a JSON file (one object per line). Files written out
+#' with this method can be read back in as a DataFrame using read.json().
+#'
+#' @param x A SparkSQL DataFrame
+#' @param path The directory where the file is saved
+#'
+#' @family DataFrame functions
+#' @rdname write.json
+#' @name write.json
+#' @export
+#' @examples
+#'\dontrun{
+#' sc <- sparkR.init()
+#' sqlContext <- sparkRSQL.init(sc)
+#' path <- "path/to/file.json"
+#' df <- read.json(sqlContext, path)
+#' write.json(df, "/tmp/sparkr-tmp/")
+#'}
+setMethod("write.json",
+          signature(x = "DataFrame", path = "character"),
+          function(x, path) {
+            write <- callJMethod(x@sdf, "write")
+            invisible(callJMethod(write, "json", path))
+          })
+
+#' write.parquet
 #'
 #' Save the contents of a DataFrame as a Parquet file, preserving the schema. Files written out
-#' with this method can be read back in as a DataFrame using parquetFile().
+#' with this method can be read back in as a DataFrame using read.parquet().
 #'
 #' @param x A SparkSQL DataFrame
 #' @param path The directory where the file is saved
 #'
 #' @family DataFrame functions
-#' @rdname saveAsParquetFile
-#' @name saveAsParquetFile
+#' @rdname write.parquet
+#' @name write.parquet
 #' @export
 #' @examples
 #'\dontrun{
 #' sc <- sparkR.init()
 #' sqlContext <- sparkRSQL.init(sc)
 #' path <- "path/to/file.json"
 #' df <- read.json(sqlContext, path)
-#' saveAsParquetFile(df, "/tmp/sparkr-tmp/")
+#' write.parquet(df, "/tmp/sparkr-tmp1/")
+#' saveAsParquetFile(df, "/tmp/sparkr-tmp2/")
 #'}
+setMethod("write.parquet",
+          signature(x = "DataFrame", path = "character"),
+          function(x, path) {
+            write <- callJMethod(x@sdf, "write")
+            invisible(callJMethod(write, "parquet", path))
+          })
+
+#' @rdname write.parquet
+#' @name saveAsParquetFile
+#' @export
 setMethod("saveAsParquetFile",
           signature(x = "DataFrame", path = "character"),
           function(x, path) {
-            invisible(callJMethod(x@sdf, "saveAsParquetFile", path))
+            .Deprecated("write.parquet")
+            write.parquet(x, path)
           })
 
 #' Distinct
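
Both writers reach the JVM through the same two-step bridge shown above (fetch the DataFrameWriter with `callJMethod(x@sdf, "write")`, then invoke the format method on it), and the old entry point becomes a thin shim over `.Deprecated`, base R's standard deprecation helper. A sketch of what a caller now sees; the warning text is base R's default wording, shown here as an assumption:

```r
# Assumes df is a SparkR DataFrame in a live session; the path is a placeholder.
saveAsParquetFile(df, "/tmp/people.parquet")
# Expected warning (base R's default .Deprecated format, approximately):
#   'saveAsParquetFile' is deprecated.
#   Use 'write.parquet' instead.
#   See help("Deprecated")
```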

R/pkg/R/generics.R

Lines changed: 12 additions & 4 deletions
@@ -519,10 +519,6 @@ setGeneric("sample_frac",
 #' @export
 setGeneric("sampleBy", function(x, col, fractions, seed) { standardGeneric("sampleBy") })
 
-#' @rdname saveAsParquetFile
-#' @export
-setGeneric("saveAsParquetFile", function(x, path) { standardGeneric("saveAsParquetFile") })
-
 #' @rdname saveAsTable
 #' @export
 setGeneric("saveAsTable", function(df, tableName, source, mode, ...) {
@@ -541,6 +537,18 @@ setGeneric("write.df", function(df, path, ...) { standardGeneric("write.df") })
 #' @export
 setGeneric("saveDF", function(df, path, ...) { standardGeneric("saveDF") })
 
+#' @rdname write.json
+#' @export
+setGeneric("write.json", function(x, path) { standardGeneric("write.json") })
+
+#' @rdname write.parquet
+#' @export
+setGeneric("write.parquet", function(x, path) { standardGeneric("write.parquet") })
+
+#' @rdname write.parquet
+#' @export
+setGeneric("saveAsParquetFile", function(x, path) { standardGeneric("saveAsParquetFile") })
+
 #' @rdname schema
 #' @export
 setGeneric("schema", function(x) { standardGeneric("schema") })

R/pkg/inst/tests/testthat/test_sparkSQL.R

Lines changed: 59 additions & 45 deletions
@@ -371,22 +371,49 @@ test_that("Collect DataFrame with complex types", {
   expect_equal(bob$height, 176.5)
 })
 
-test_that("read.json()/jsonFile() on a local file returns a DataFrame", {
+test_that("read/write json files", {
+  # Test read.df
+  df <- read.df(sqlContext, jsonPath, "json")
+  expect_is(df, "DataFrame")
+  expect_equal(count(df), 3)
+
+  # Test read.df with a user defined schema
+  schema <- structType(structField("name", type = "string"),
+                       structField("age", type = "double"))
+
+  df1 <- read.df(sqlContext, jsonPath, "json", schema)
+  expect_is(df1, "DataFrame")
+  expect_equal(dtypes(df1), list(c("name", "string"), c("age", "double")))
+
+  # Test loadDF
+  df2 <- loadDF(sqlContext, jsonPath, "json", schema)
+  expect_is(df2, "DataFrame")
+  expect_equal(dtypes(df2), list(c("name", "string"), c("age", "double")))
+
+  # Test read.json
   df <- read.json(sqlContext, jsonPath)
   expect_is(df, "DataFrame")
   expect_equal(count(df), 3)
-  # read.json()/jsonFile() works with multiple input paths
+
+  # Test write.df
   jsonPath2 <- tempfile(pattern="jsonPath2", fileext=".json")
   write.df(df, jsonPath2, "json", mode="overwrite")
-  jsonDF1 <- read.json(sqlContext, c(jsonPath, jsonPath2))
+
+  # Test write.json
+  jsonPath3 <- tempfile(pattern="jsonPath3", fileext=".json")
+  write.json(df, jsonPath3)
+
+  # Test read.json()/jsonFile() works with multiple input paths
+  jsonDF1 <- read.json(sqlContext, c(jsonPath2, jsonPath3))
   expect_is(jsonDF1, "DataFrame")
   expect_equal(count(jsonDF1), 6)
   # Suppress warnings because jsonFile is deprecated
-  jsonDF2 <- suppressWarnings(jsonFile(sqlContext, c(jsonPath, jsonPath2)))
+  jsonDF2 <- suppressWarnings(jsonFile(sqlContext, c(jsonPath2, jsonPath3)))
   expect_is(jsonDF2, "DataFrame")
   expect_equal(count(jsonDF2), 6)
 
   unlink(jsonPath2)
+  unlink(jsonPath3)
 })
 
 test_that("jsonRDD() on a RDD with json string", {
@@ -454,6 +481,9 @@ test_that("insertInto() on a registered table", {
   expect_equal(count(sql(sqlContext, "select * from table1")), 2)
   expect_equal(first(sql(sqlContext, "select * from table1 order by age"))$name, "Bob")
   dropTempTable(sqlContext, "table1")
+
+  unlink(jsonPath2)
+  unlink(parquetPath2)
 })
 
 test_that("table() returns a new DataFrame", {
@@ -848,33 +878,6 @@ test_that("column calculation", {
   expect_equal(count(df2), 3)
 })
 
-test_that("read.df() from json file", {
-  df <- read.df(sqlContext, jsonPath, "json")
-  expect_is(df, "DataFrame")
-  expect_equal(count(df), 3)
-
-  # Check if we can apply a user defined schema
-  schema <- structType(structField("name", type = "string"),
-                       structField("age", type = "double"))
-
-  df1 <- read.df(sqlContext, jsonPath, "json", schema)
-  expect_is(df1, "DataFrame")
-  expect_equal(dtypes(df1), list(c("name", "string"), c("age", "double")))
-
-  # Run the same with loadDF
-  df2 <- loadDF(sqlContext, jsonPath, "json", schema)
-  expect_is(df2, "DataFrame")
-  expect_equal(dtypes(df2), list(c("name", "string"), c("age", "double")))
-})
-
-test_that("write.df() as parquet file", {
-  df <- read.df(sqlContext, jsonPath, "json")
-  write.df(df, parquetPath, "parquet", mode="overwrite")
-  df2 <- read.df(sqlContext, parquetPath, "parquet")
-  expect_is(df2, "DataFrame")
-  expect_equal(count(df2), 3)
-})
-
 test_that("test HiveContext", {
   ssc <- callJMethod(sc, "sc")
   hiveCtx <- tryCatch({
@@ -895,6 +898,8 @@ test_that("test HiveContext", {
   df3 <- sql(hiveCtx, "select * from json2")
   expect_is(df3, "DataFrame")
   expect_equal(count(df3), 3)
+
+  unlink(jsonPath2)
 })
 
 test_that("column operators", {
@@ -1333,6 +1338,9 @@ test_that("join() and merge() on a DataFrame", {
   expect_error(merge(df, df3),
                paste("The following column name: name_y occurs more than once in the 'DataFrame'.",
                      "Please use different suffixes for the intersected columns.", sep = ""))
+
+  unlink(jsonPath2)
+  unlink(jsonPath3)
 })
 
 test_that("toJSON() returns an RDD of the correct values", {
@@ -1396,6 +1404,8 @@ test_that("unionAll(), rbind(), except(), and intersect() on a DataFrame", {
 
   # Test base::intersect is working
   expect_equal(length(intersect(1:20, 3:23)), 18)
+
+  unlink(jsonPath2)
 })
 
 test_that("withColumn() and withColumnRenamed()", {
@@ -1440,31 +1450,35 @@ test_that("mutate(), transform(), rename() and names()", {
   detach(airquality)
 })
 
-test_that("write.df() on DataFrame and works with read.parquet", {
-  df <- read.json(sqlContext, jsonPath)
+test_that("read/write Parquet files", {
+  df <- read.df(sqlContext, jsonPath, "json")
+  # Test write.df and read.df
   write.df(df, parquetPath, "parquet", mode="overwrite")
-  parquetDF <- read.parquet(sqlContext, parquetPath)
-  expect_is(parquetDF, "DataFrame")
-  expect_equal(count(df), count(parquetDF))
-})
+  df2 <- read.df(sqlContext, parquetPath, "parquet")
+  expect_is(df2, "DataFrame")
+  expect_equal(count(df2), 3)
 
-test_that("read.parquet()/parquetFile() works with multiple input paths", {
-  df <- read.json(sqlContext, jsonPath)
-  write.df(df, parquetPath, "parquet", mode="overwrite")
+  # Test write.parquet/saveAsParquetFile and read.parquet/parquetFile
   parquetPath2 <- tempfile(pattern = "parquetPath2", fileext = ".parquet")
-  write.df(df, parquetPath2, "parquet", mode="overwrite")
-  parquetDF <- read.parquet(sqlContext, c(parquetPath, parquetPath2))
+  write.parquet(df, parquetPath2)
+  parquetPath3 <- tempfile(pattern = "parquetPath3", fileext = ".parquet")
+  suppressWarnings(saveAsParquetFile(df, parquetPath3))
+  parquetDF <- read.parquet(sqlContext, c(parquetPath2, parquetPath3))
   expect_is(parquetDF, "DataFrame")
   expect_equal(count(parquetDF), count(df) * 2)
-  parquetDF2 <- suppressWarnings(parquetFile(sqlContext, parquetPath, parquetPath2))
+  parquetDF2 <- suppressWarnings(parquetFile(sqlContext, parquetPath2, parquetPath3))
  expect_is(parquetDF2, "DataFrame")
   expect_equal(count(parquetDF2), count(df) * 2)
 
   # Test if varargs works with variables
   saveMode <- "overwrite"
   mergeSchema <- "true"
-  parquetPath3 <- tempfile(pattern = "parquetPath3", fileext = ".parquet")
-  write.df(df, parquetPath2, "parquet", mode = saveMode, mergeSchema = mergeSchema)
+  parquetPath4 <- tempfile(pattern = "parquetPath3", fileext = ".parquet")
+  write.df(df, parquetPath3, "parquet", mode = saveMode, mergeSchema = mergeSchema)
+
+  unlink(parquetPath2)
+  unlink(parquetPath3)
+  unlink(parquetPath4)
 })
 
 test_that("describe() and summarize() on a DataFrame", {
