add reduceByKeyLocally

lythesia · lythesia · commit b082a35e5d9a · 2015-02-06T16:36:34.000+08:00
diff --git a/pkg/NAMESPACE b/pkg/NAMESPACE
@@ -41,6 +41,7 @@ exportMethods(
               "persist",
               "reduce",
               "reduceByKey",
+              "reduceByKeyLocally",
               "rightOuterJoin",
               "sampleRDD",
               "saveAsTextFile",
diff --git a/pkg/R/RDD.R b/pkg/R/RDD.R
@@ -1382,26 +1382,18 @@ setMethod("groupByKey",
             groupVals <- function(part) {
               vals <- new.env()
               keys <- new.env()
+              pred <- function(item) exists(item$hash, keys)
               # Each item in the partition is list of (K, V)
               lapply(part,
                      function(item) {
-                       hashVal <- as.character(hashCode(item[[1]]))
-                       if (exists(hashVal, vals)) {
-                         acc <- vals[[hashVal]]
-                         acc[[length(acc) + 1]] <- item[[2]]
-                         vals[[hashVal]] <- acc
-                       } else {
-                         vals[[hashVal]] <- list(item[[2]])
-                         keys[[hashVal]] <- item[[1]]
-                       }
+                       item$hash <- as.character(hashCode(item[[1]]))
+                       updateOrCreatePair(item, keys, vals, pred,
+                                          function(vs, v) c(vs, list(v)),
+                                          function(x) list(x))
                      })
               # Every key in the environment contains a list
               # Convert that to list(K, Seq[V])
-              grouped <- lapply(ls(vals),
-                                function(name) {
-                                  list(keys[[name]], vals[[name]])
-                                })
-              grouped
+              convertEnvsToList(keys, vals)
             }
             lapplyPartition(shuffled, groupVals)
           })
@@ -1442,28 +1434,79 @@ setMethod("reduceByKey",
             reduceVals <- function(part) {
               vals <- new.env()
               keys <- new.env()
+              pred <- function(item) exists(item$hash, keys)
               lapply(part,
                      function(item) {
-                       hashVal <- as.character(hashCode(item[[1]]))
-                       if (exists(hashVal, vals)) {
-                         vals[[hashVal]] <- do.call(
-                           combineFunc, list(vals[[hashVal]], item[[2]]))
-                       } else {
-                         vals[[hashVal]] <- item[[2]]
-                         keys[[hashVal]] <- item[[1]]
-                       }
+                       item$hash <- as.character(hashCode(item[[1]]))
+                       updateOrCreatePair(item, keys, vals, pred, combineFunc, function(x) x)
                      })
-              combined <- lapply(ls(vals),
-                                  function(name) {
-                                    list(keys[[name]], vals[[name]])
-                                  })
-              combined
+              convertEnvsToList(keys, vals)
             }
             locallyReduced <- lapplyPartition(rdd, reduceVals)
             shuffled <- partitionBy(locallyReduced, numPartitions)
             lapplyPartition(shuffled, reduceVals)
           })
 
+#' Merge values by key locally
+#'
+#' This function operates on RDDs where every element is of the form list(K, V) or c(K, V),
+#' and merges the values for each key using an associative reduce function, but returns the
+#' results immediately to the master as an R list.
+#'
+#' @param rdd The RDD to reduce by key. Should be an RDD where each element is
+#'             list(K, V) or c(K, V).
+#' @param combineFunc The associative reduce function to use.
+#' @return A list where each element is list(K, V') where V' is the merged
+#'         value
+#' @rdname reduceByKeyLocally
+#' @seealso reduceByKey
+#' @export
+#' @examples
+#'\dontrun{
+#' sc <- sparkR.init()
+#' pairs <- list(list(1, 2), list(1.1, 3), list(1, 4))
+#' rdd <- parallelize(sc, pairs)
+#' reduced <- reduceByKeyLocally(rdd, "+")
+#' reduced[[1]] # Should be a list(1, 6)
+#'}
+setGeneric("reduceByKeyLocally",
+           function(rdd, combineFunc) {
+             standardGeneric("reduceByKeyLocally")
+           })
+
+#' @rdname reduceByKeyLocally
+#' @aliases reduceByKeyLocally,RDD,integer-method
+setMethod("reduceByKeyLocally",
+          signature(rdd = "RDD", combineFunc = "ANY"),
+          function(rdd, combineFunc) {
+            reducePart <- function(part) {
+              vals <- new.env()
+              keys <- new.env()
+              pred <- function(item) exists(item$hash, keys)
+              lapply(part,
+                     function(item) {
+                       item$hash <- as.character(hashCode(item[[1]]))
+                       updateOrCreatePair(item, keys, vals, pred, combineFunc, function(x) x)
+                     })
+              list(list(keys, vals)) # return the hash-keyed envs so the merge step need not re-compute hashes
+            }
+            mergeParts <- function(accum, x) {
+              pred <- function(item) {
+                exists(item$hash, accum[[1]])
+              }
+              lapply(ls(x[[1]]),
+                     function(name) {
+                       item <- list(x[[1]][[name]], x[[2]][[name]])
+                       item$hash <- name
+                       updateOrCreatePair(item, accum[[1]], accum[[2]], pred, combineFunc, function(x) x)
+                     })
+              accum # its keys/vals envs were updated in place by updateOrCreatePair
+            }
+            reduced <- mapPartitions(rdd, reducePart)
+            merged <- reduce(reduced, mergeParts)
+            convertEnvsToList(merged[[1]], merged[[2]])
+          })
+
 #' Combine values by key
 #'
 #' Generic function to combine the elements for each key using a custom set of
@@ -1513,46 +1556,29 @@ setMethod("combineByKey",
             combineLocally <- function(part) {
               combiners <- new.env()
               keys <- new.env()
+              pred <- function(item) exists(item$hash, keys)
               lapply(part,
                      function(item) {
-                       k <- as.character(item[[1]])
-                       if (!exists(k, keys)) {
-                         combiners[[k]] <- do.call(createCombiner,
-                                                   list(item[[2]]))
-                         keys[[k]] <- item[[1]]
-                       } else {
-                         combiners[[k]] <- do.call(mergeValue,
-                                                   list(combiners[[k]],
-                                                        item[[2]]))
-                       }
-                     })
-              lapply(ls(keys), function(k) {
-                      list(keys[[k]], combiners[[k]])
+                       item$hash <- as.character(item[[1]])
+                       updateOrCreatePair(item, keys, combiners, pred, mergeValue, createCombiner)
                      })
+              convertEnvsToList(keys, combiners)
             }
             locallyCombined <- lapplyPartition(rdd, combineLocally)
             shuffled <- partitionBy(locallyCombined, numPartitions)
             mergeAfterShuffle <- function(part) {
               combiners <- new.env()
               keys <- new.env()
+              pred <- function(item) exists(item$hash, keys)
               lapply(part,
                      function(item) {
-                       k <- as.character(item[[1]])
-                       if (!exists(k, combiners)) {
-                         combiners[[k]] <- item[[2]]
-                         keys[[k]] <- item[[1]]
-                       } else {
-                         combiners[[k]] <- do.call(mergeCombiners,
-                                                   list(combiners[[k]],
-                                                        item[[2]]))
-                       }
-                     })
-              lapply(ls(keys), function(k) {
-                      list(keys[[k]], combiners[[k]])
+                       item$hash <- as.character(item[[1]])
+                       updateOrCreatePair(item, keys, combiners, pred, mergeCombiners,
+                                          function(x) x)
                      })
+              convertEnvsToList(keys, combiners)
             }
-            combined <-lapplyPartition(shuffled, mergeAfterShuffle)
-            combined
+            lapplyPartition(shuffled, mergeAfterShuffle)
           })
 
 ############ Binary Functions #############
diff --git a/pkg/R/utils.R b/pkg/R/utils.R
@@ -259,3 +259,32 @@ joinTaggedList <- function(tagged_list, cnull) {
   lists <- genCompactLists(tagged_list, cnull)
   mergeCompactLists(lists[[1]], lists[[2]])
 }
+
+# Utility function to create or update a key-value pair held in environments
+# Used in *ByKey functions
+# params:
+#   item key-value pair (key at index 1, value at index 2, hash in `$hash`)
+#   keys/vals environments of keys/values, both indexed by hash
+#   pred predicate function; TRUE selects the update branch, FALSE the create branch
+#   update_fn update or merge function for an existing pair, similar to `mergeValue` in combineByKey
+#   create_fn create function for a new pair, similar to `createCombiner` in combineByKey
+updateOrCreatePair <- function(item, keys, vals, pred, update_fn, create_fn) {
+  # assumes the hash value is bound to `item$hash`, and key/value are at index 1/2
+  hashVal <- item$hash
+  key <- item[[1]]
+  val <- item[[2]]
+  if (pred(item)) {
+    assign(hashVal, do.call(update_fn, list(get(hashVal, envir=vals), val)), envir=vals)
+  } else {
+    assign(hashVal, do.call(create_fn, list(val)), envir=vals)
+    assign(hashVal, key, envir=keys)
+  }
+}
+
+# Utility function to convert the keys and vals environments into a list of list(key, value) pairs
+convertEnvsToList <- function(keys, vals) {
+  lapply(ls(keys),
+         function(name) {
+           list(keys[[name]], vals[[name]])
+         })
+}
diff --git a/pkg/inst/tests/test_rdd.R b/pkg/inst/tests/test_rdd.R
@@ -229,6 +229,19 @@ test_that("flatMapValues() on pairwise RDDs", {
                     list(2L, 1), list(2L, 2), list(1L, 200), list(1L, 201)))
 })
 
+test_that("reduceByKeyLocally() on PairwiseRDDs", {
+  pairs <- parallelize(sc, list(list(1, 2), list(1.1, 3), list(1, 4)), 2L)
+  actual <- reduceByKeyLocally(pairs, "+")  # keys 1 and 1.1 must stay distinct
+  expect_equal(sortKeyValueList(actual),
+               sortKeyValueList(list(list(1, 6), list(1.1, 3))))
+
+  pairs <- parallelize(sc, list(list("abc", 1.2), list(1.1, 0), list("abc", 1.3),
+                                list("bb", 5)), 4L)
+  actual <- reduceByKeyLocally(pairs, "+")  # mixed character and numeric keys
+  expect_equal(sortKeyValueList(actual),
+               sortKeyValueList(list(list("abc", 2.5), list(1.1, 0), list("bb", 5))))
+})
+
 test_that("distinct() on RDDs", {
   nums.rep2 <- rep(1:10, 2)
   rdd.rep2 <- parallelize(sc, nums.rep2, 2L)
diff --git a/pkg/man/reduceByKeyLocally.Rd b/pkg/man/reduceByKeyLocally.Rd
@@ -0,0 +1,41 @@
+% Generated by roxygen2 (4.1.0): do not edit by hand
+% Please edit documentation in R/RDD.R
+\docType{methods}
+\name{reduceByKeyLocally}
+\alias{reduceByKeyLocally}
+\alias{reduceByKeyLocally,RDD,integer-method}
+\alias{reduceByKeyLocally,RDD-method}
+\title{Merge values by key locally}
+\usage{
+reduceByKeyLocally(rdd, combineFunc)
+
+\S4method{reduceByKeyLocally}{RDD}(rdd, combineFunc)
+}
+\arguments{
+\item{rdd}{The RDD to reduce by key. Should be an RDD where each element is
+list(K, V) or c(K, V).}
+
+\item{combineFunc}{The associative reduce function to use.}
+}
+\value{
+A list where each element is list(K, V') where V' is the merged
+        value
+}
+\description{
+This function operates on RDDs where every element is of the form list(K, V) or c(K, V),
+and merges the values for each key using an associative reduce function, but returns the
+results immediately to the master as an R list.
+}
+\examples{
+\dontrun{
+sc <- sparkR.init()
+pairs <- list(list(1, 2), list(1.1, 3), list(1, 4))
+rdd <- parallelize(sc, pairs)
+reduced <- reduceByKeyLocally(rdd, "+")
+reduced[[1]] # Should be a list(1, 6)
+}
+}
+\seealso{
+reduceByKey
+}
+