From e7e471c58948c4d9829ea77d35318320e26fe411 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Sat, 18 Jun 2016 23:59:07 -0700 Subject: [PATCH 1/3] Add `spark_partition_id` in SparkR --- R/pkg/NAMESPACE | 1 + R/pkg/R/functions.R | 21 +++++++++++++++++++++ R/pkg/R/generics.R | 4 ++++ R/pkg/inst/tests/testthat/test_sparkSQL.R | 1 + 4 files changed, 27 insertions(+) diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 82e56ca43729..1bc27ba88fe0 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -258,6 +258,7 @@ exportMethods("%in%", "skewness", "sort_array", "soundex", + "spark_partition_id", "stddev", "stddev_pop", "stddev_samp", diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index a779127b379a..5b34082289a6 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -1179,6 +1179,27 @@ setMethod("soundex", column(jc) }) +#' spark_partition_id +#' +#' Return the column for partition ID of the Spark task. +#' Note that this is indeterministic because it depends on data partitioning and +#' task scheduling. +#' +#' This is equivalent to the SPARK_PARTITION_ID function in SQL. +#' +#' @rdname spark_partition_id +#' @name spark_partition_id +#' @export +#' @examples +#' \dontrun{select(df, spark_partition_id())} +#' @note spark_partition_id since 2.0.0 +setMethod("spark_partition_id", + signature(x = "missing"), + function() { + jc <- callJStatic("org.apache.spark.sql.functions", "spark_partition_id") + column(jc) + }) + #' @rdname sd #' @name stddev setMethod("stddev", diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 6e754afab6c6..8abd18d35e66 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -1126,6 +1126,10 @@ setGeneric("sort_array", function(x, asc = TRUE) { standardGeneric("sort_array") #' @export setGeneric("soundex", function(x) { standardGeneric("soundex") }) +#' @rdname spark_partition_id +#' @export +setGeneric("spark_partition_id", function(x) { standardGeneric("spark_partition_id") }) + #' @rdname sd #' @export setGeneric("stddev", function(x) { standardGeneric("stddev") }) diff --git a/R/pkg/inst/tests/testthat/test_sparkSQL.R b/R/pkg/inst/tests/testthat/test_sparkSQL.R index fcc2ab3ed6a2..53097f34d7a3 100644 --- a/R/pkg/inst/tests/testthat/test_sparkSQL.R +++ b/R/pkg/inst/tests/testthat/test_sparkSQL.R @@ -1058,6 +1058,7 @@ test_that("column functions", { c16 <- is.nan(c) + isnan(c) + isNaN(c) c17 <- cov(c, c1) + cov("c", "c1") + covar_samp(c, c1) + covar_samp("c", "c1") c18 <- covar_pop(c, c1) + covar_pop("c", "c1") + c19 <- spark_partition_id() # Test if base::is.nan() is exposed expect_equal(is.nan(c("a", "b")), c(FALSE, FALSE)) From cbd54b29883a4a8a0bfa9e5e6db97e30e764d277 Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Mon, 20 Jun 2016 11:52:23 -0700 Subject: [PATCH 2/3] Use new title/description convention. --- R/pkg/R/functions.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 5b34082289a6..5d352ace1aef 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -1179,7 +1179,7 @@ setMethod("soundex", column(jc) }) -#' spark_partition_id +#' Return the partition ID as a column #' #' Return the column for partition ID of the Spark task. #' Note that this is indeterministic because it depends on data partitioning and From ddb21023fcc998001fbf2cc152bd97bbb1a09c7b Mon Sep 17 00:00:00 2001 From: Dongjoon Hyun Date: Mon, 20 Jun 2016 12:23:16 -0700 Subject: [PATCH 3/3] Update function description. --- R/pkg/R/functions.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 5d352ace1aef..064edf2468dc 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -1181,8 +1181,8 @@ setMethod("soundex", #' Return the partition ID as a column #' -#' Return the column for partition ID of the Spark task. -#' Note that this is indeterministic because it depends on data partitioning and +#' Return the partition ID of the Spark task as a SparkDataFrame column. +#' Note that this is nondeterministic because it depends on data partitioning and #' task scheduling. #' #' This is equivalent to the SPARK_PARTITION_ID function in SQL.