[SPARKR-153] phase 2: implement aggregateByKey() and foldByKey().

Sun Rui · Sun Rui · commit 0cda231ef9bc · 2015-02-16T16:51:34.000+08:00
diff --git a/pkg/NAMESPACE b/pkg/NAMESPACE
@@ -2,6 +2,7 @@
 exportClasses("RDD")
 exportClasses("Broadcast")
 exportMethods(
+              "aggregateByKey",
               "aggregateRDD",
               "cache",
               "checkpoint",
@@ -19,6 +20,7 @@ exportMethods(
               "flatMap",
               "flatMapValues",
               "fold",
+              "foldByKey",
               "foreach",
               "foreachPartition",
               "fullOuterJoin",
diff --git a/pkg/R/pairRDD.R b/pkg/R/pairRDD.R
@@ -497,6 +497,88 @@ setMethod("combineByKey",
             lapplyPartition(shuffled, mergeAfterShuffle)
           })
 
+#' Aggregate a pair RDD by each key.
+#' 
+#' Aggregate the values of each key in an RDD, using given combine functions
+#' and a neutral "zero value". This function can return a different result type,
+#' U, than the type of the values in this RDD, V. Thus, we need one operation
+#' for merging a V into a U and one operation for merging two U's, The former 
+#' operation is used for merging values within a partition, and the latter is 
+#' used for merging values between partitions. To avoid memory allocation, both 
+#' of these functions are allowed to modify and return their first argument 
+#' instead of creating a new U.
+#' 
+#' @param rdd An RDD.
+#' @param zeroValue A neutral "zero value".
+#' @param seqOp A function to aggregate the values of each key. It may return 
+#'              a different result type from the type of the values.
+#' @param combOp A function to aggregate results of seqOp.
+#' @return An RDD containing the aggregation result.
+#' @rdname aggregateByKey
+#' @seealso foldByKey, combineByKey
+#' @export
+#' @examples
+#'\dontrun{
+#' sc <- sparkR.init()
+#' rdd <- parallelize(sc, list(list(1, 1), list(1, 2), list(2, 3), list(2, 4)))
+#' zeroValue <- list(0, 0)
+#' seqOp <- function(x, y) { list(x[[1]] + y, x[[2]] + 1) }
+#' combOp <- function(x, y) { list(x[[1]] + y[[1]], x[[2]] + y[[2]]) }
+#' aggregateByKey(rdd, zeroValue, seqOp, combOp, 2L) 
+#'   # list(list(1, list(3, 2)), list(2, list(7, 2)))
+#'}
+setGeneric("aggregateByKey",
+           function(rdd, zeroValue, seqOp, combOp, numPartitions) {
+             standardGeneric("aggregateByKey")
+           })
+
+#' @rdname aggregateByKey
+#' @aliases aggregateByKey,RDD,ANY,ANY,ANY,integer-method
+setMethod("aggregateByKey",
+          signature(rdd = "RDD", zeroValue = "ANY", seqOp = "ANY",
+                    combOp = "ANY", numPartitions = "integer"),
+          function(rdd, zeroValue, seqOp, combOp, numPartitions) {
+            createCombiner <- function(v) {
+              do.call(seqOp, list(zeroValue, v))
+            }
+
+            combineByKey(rdd, createCombiner, seqOp, combOp, numPartitions)
+          })
+
+#' Fold a pair RDD by each key.
+#' 
+#' Aggregate the values of each key in an RDD, using an associative function "func"
+#' and a neutral "zero value" which may be added to the result an arbitrary 
+#' number of times, and must not change the result (e.g., 0 for addition, or 
+#' 1 for multiplication.).
+#' 
+#' @param rdd An RDD.
+#' @param zeroValue A neutral "zero value".
+#' @param func An associative function for folding values of each key.
+#' @return An RDD containing the aggregation result.
+#' @rdname foldByKey
+#' @seealso aggregateByKey, combineByKey
+#' @export
+#' @examples
+#'\dontrun{
+#' sc <- sparkR.init()
+#' rdd <- parallelize(sc, list(list(1, 1), list(1, 2), list(2, 3), list(2, 4)))
+#' foldByKey(rdd, 0, "+", 2L) # list(list(1, 3), list(2, 7))
+#'}
+setGeneric("foldByKey",
+           function(rdd, zeroValue, func, numPartitions) {
+             standardGeneric("foldByKey")
+           })
+
+#' @rdname foldByKey
+#' @aliases foldByKey,RDD,ANY,ANY,integer-method
+setMethod("foldByKey",
+          signature(rdd = "RDD", zeroValue = "ANY",
+                    func = "ANY", numPartitions = "integer"),
+          function(rdd, zeroValue, func, numPartitions) {
+            aggregateByKey(rdd, zeroValue, func, func, numPartitions)
+          })
+
 ############ Binary Functions #############
 
 #' Join two RDDs
diff --git a/pkg/inst/tests/test_shuffle.R b/pkg/inst/tests/test_shuffle.R
@@ -70,6 +70,77 @@ test_that("combineByKey for doubles", {
   expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
 })
 
+test_that("aggregateByKey", {
+  # test aggregateByKey for int keys
+  rdd <- parallelize(sc, list(list(1, 1), list(1, 2), list(2, 3), list(2, 4)))
+
+  zeroValue <- list(0, 0)
+  seqOp <- function(x, y) { list(x[[1]] + y, x[[2]] + 1) }
+  combOp <- function(x, y) { list(x[[1]] + y[[1]], x[[2]] + y[[2]]) }
+  aggregatedRDD <- aggregateByKey(rdd, zeroValue, seqOp, combOp, 2L)   
+  
+  actual <- collect(aggregatedRDD)
+  
+  expected <- list(list(1, list(3, 2)), list(2, list(7, 2)))
+  expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
+
+  # test aggregateByKey for string keys
+  rdd <- parallelize(sc, list(list("a", 1), list("a", 2), list("b", 3), list("b", 4)))
+  
+  zeroValue <- list(0, 0)
+  seqOp <- function(x, y) { list(x[[1]] + y, x[[2]] + 1) }
+  combOp <- function(x, y) { list(x[[1]] + y[[1]], x[[2]] + y[[2]]) }
+  aggregatedRDD <- aggregateByKey(rdd, zeroValue, seqOp, combOp, 2L)   
+
+  actual <- collect(aggregatedRDD)
+  
+  expected <- list(list("a", list(3, 2)), list("b", list(7, 2)))
+  expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
+})
+
+test_that("foldByKey", {  
+  # test foldByKey for int keys
+  folded <- foldByKey(intRdd, 0, "+", 2L)
+  
+  actual <- collect(folded)
+  
+  expected <- list(list(2L, 101), list(1L, 199))
+  expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
+
+  # test foldByKey for double keys
+  folded <- foldByKey(doubleRdd, 0, "+", 2L)
+  
+  actual <- collect(folded)
+
+  expected <- list(list(1.5, 199), list(2.5, 101))
+  expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
+
+  # test foldByKey for string keys
+  stringKeyPairs <- list(list("a", -1), list("b", 100), list("b", 1), list("a", 200))
+  
+  stringKeyRDD <- parallelize(sc, stringKeyPairs)
+  folded <- foldByKey(stringKeyRDD, 0, "+", 2L)
+  
+  actual <- collect(folded)
+  
+  expected <- list(list("b", 101), list("a", 199))
+  expect_equal(sortKeyValueList(actual), sortKeyValueList(expected))
+  
+  # test foldByKey for empty pair RDD
+  rdd <- parallelize(sc, list())
+  folded <- foldByKey(rdd, 0, "+", 2L)
+  actual <- collect(folded)
+  expected <- list()
+  expect_equal(actual, expected)
+
+  # test foldByKey for RDD with only 1 pair
+  rdd <- parallelize(sc,  list(list(1, 1)))
+  folded <- foldByKey(rdd, 0, "+", 2L)
+  actual <- collect(folded)
+  expected <- list(list(1, 1))
+  expect_equal(actual, expected)
+})
+
 test_that("partitionBy() partitions data correctly", {
   # Partition by magnitude
   partitionByMagnitude <- function(key) { if (key >= 3) 1 else 0 }
diff --git a/pkg/man/aggregateByKey.Rd b/pkg/man/aggregateByKey.Rd
@@ -0,0 +1,50 @@
+% Generated by roxygen2 (4.0.2): do not edit by hand
+\docType{methods}
+\name{aggregateByKey}
+\alias{aggregateByKey}
+\alias{aggregateByKey,RDD,ANY,ANY,ANY,integer-method}
+\title{Aggregate a pair RDD by each key.}
+\usage{
+aggregateByKey(rdd, zeroValue, seqOp, combOp, numPartitions)
+
+\S4method{aggregateByKey}{RDD,ANY,ANY,ANY,integer}(rdd, zeroValue, seqOp,
+  combOp, numPartitions)
+}
+\arguments{
+\item{rdd}{An RDD.}
+
+\item{zeroValue}{A neutral "zero value".}
+
+\item{seqOp}{A function to aggregate the values of each key. It may return
+a different result type from the type of the values.}
+
+\item{combOp}{A function to aggregate results of seqOp.}
+}
+\value{
+An RDD containing the aggregation result.
+}
+\description{
+Aggregate the values of each key in an RDD, using given combine functions
+and a neutral "zero value". This function can return a different result type,
+U, than the type of the values in this RDD, V. Thus, we need one operation
+for merging a V into a U and one operation for merging two U's, The former
+operation is used for merging values within a partition, and the latter is
+used for merging values between partitions. To avoid memory allocation, both
+of these functions are allowed to modify and return their first argument
+instead of creating a new U.
+}
+\examples{
+\dontrun{
+sc <- sparkR.init()
+rdd <- parallelize(sc, list(list(1, 1), list(1, 2), list(2, 3), list(2, 4)))
+zeroValue <- list(0, 0)
+seqOp <- function(x, y) { list(x[[1]] + y, x[[2]] + 1) }
+combOp <- function(x, y) { list(x[[1]] + y[[1]], x[[2]] + y[[2]]) }
+aggregateByKey(rdd, zeroValue, seqOp, combOp, 2L)
+  # list(list(1, list(3, 2)), list(2, list(7, 2)))
+}
+}
+\seealso{
+foldByKey, combineByKey
+}
+
diff --git a/pkg/man/foldByKey.Rd b/pkg/man/foldByKey.Rd
@@ -0,0 +1,38 @@
+% Generated by roxygen2 (4.0.2): do not edit by hand
+\docType{methods}
+\name{foldByKey}
+\alias{foldByKey}
+\alias{foldByKey,RDD,ANY,ANY,integer-method}
+\title{Fold a pair RDD by each key.}
+\usage{
+foldByKey(rdd, zeroValue, func, numPartitions)
+
+\S4method{foldByKey}{RDD,ANY,ANY,integer}(rdd, zeroValue, func, numPartitions)
+}
+\arguments{
+\item{rdd}{An RDD.}
+
+\item{zeroValue}{A neutral "zero value".}
+
+\item{func}{An associative function for folding values of each key.}
+}
+\value{
+An RDD containing the aggregation result.
+}
+\description{
+Aggregate the values of each key in an RDD, using an associative function "func"
+and a neutral "zero value" which may be added to the result an arbitrary
+number of times, and must not change the result (e.g., 0 for addition, or
+1 for multiplication.).
+}
+\examples{
+\dontrun{
+sc <- sparkR.init()
+rdd <- parallelize(sc, list(list(1, 1), list(1, 2), list(2, 3), list(2, 4)))
+foldByKey(rdd, 0, "+", 2L) # list(list(1, 3), list(2, 7))
+}
+}
+\seealso{
+aggregateByKey, combineByKey
+}
+