@@ -1128,6 +1128,236 @@ setMethod("sortBy",
11281128 values(sortByKey(keyBy(rdd , func ), ascending , numPartitions ))
11291129 })
11301130
1131+ # Helper function to get first N elements from an RDD in the specified order.
1132+ # Param:
1133+ # rdd An RDD.
1134+ # num Number of elements to return.
1135+ # ascending A flag to indicate whether the sorting is ascending or descending.
1136+ # Return:
1137+ # A list of the first N elements from the RDD in the specified order.
1138+ #
1139+ takeOrderedElem <- function (rdd , num , ascending = TRUE ) {
1140+ if (num < = 0L ) {
1141+ return (list ())
1142+ }
1143+
1144+ partitionFunc <- function (part ) {
1145+ if (num < length(part )) {
1146+ # R limitation: order works only on primitive types!
1147+ ord <- order(unlist(part , recursive = FALSE ), decreasing = ! ascending )
1148+ list (part [ord [1 : num ]])
1149+ } else {
1150+ list (part )
1151+ }
1152+ }
1153+
1154+ reduceFunc <- function (elems , part ) {
1155+ newElems <- append(elems , part )
1156+ # R limitation: order works only on primitive types!
1157+ ord <- order(unlist(newElems , recursive = FALSE ), decreasing = ! ascending )
1158+ newElems [ord [1 : num ]]
1159+ }
1160+
1161+ newRdd <- mapPartitions(rdd , partitionFunc )
1162+ reduce(newRdd , reduceFunc )
1163+ }
1164+
1165+ # ' Returns the first N elements from an RDD in ascending order.
1166+ # '
1167+ # ' @param rdd An RDD.
1168+ # ' @param num Number of elements to return.
1169+ # ' @return The first N elements from the RDD in ascending order.
1170+ # ' @rdname takeOrdered
1171+ # ' @export
1172+ # ' @examples
1173+ # '\dontrun{
1174+ # ' sc <- sparkR.init()
1175+ # ' rdd <- parallelize(sc, list(10, 1, 2, 9, 3, 4, 5, 6, 7))
1176+ # ' takeOrdered(rdd, 6L) # list(1, 2, 3, 4, 5, 6)
1177+ # '}
1178+ setGeneric ("takeOrdered ", function(rdd, num) { standardGeneric("takeOrdered") })
1179+
1180+ # ' @rdname takeOrdered
1181+ # ' @aliases takeOrdered,RDD,RDD-method
1182+ setMethod ("takeOrdered ",
1183+ signature(rdd = " RDD" , num = " integer" ),
1184+ function (rdd , num ) {
1185+ takeOrderedElem(rdd , num )
1186+ })
1187+
1188+ # ' Returns the top N elements from an RDD.
1189+ # '
1190+ # ' @param rdd An RDD.
1191+ # ' @param num Number of elements to return.
1192+ # ' @return The top N elements from the RDD.
1193+ # ' @rdname top
1194+ # ' @export
1195+ # ' @examples
1196+ # '\dontrun{
1197+ # ' sc <- sparkR.init()
1198+ # ' rdd <- parallelize(sc, list(10, 1, 2, 9, 3, 4, 5, 6, 7))
1199+ # ' top(rdd, 6L) # list(10, 9, 7, 6, 5, 4)
1200+ # '}
1201+ setGeneric ("top ", function(rdd, num) { standardGeneric("top") })
1202+
1203+ # ' @rdname top
1204+ # ' @aliases top,RDD,RDD-method
1205+ setMethod ("top ",
1206+ signature(rdd = " RDD" , num = " integer" ),
1207+ function (rdd , num ) {
1208+ takeOrderedElem(rdd , num , FALSE )
1209+ })
1210+
1211+ # ' Fold an RDD using a given associative function and a neutral "zero value".
1212+ # '
1213+ # ' Aggregate the elements of each partition, and then the results for all the
1214+ # ' partitions, using a given associative function and a neutral "zero value".
1215+ # '
1216+ # ' @param rdd An RDD.
1217+ # ' @param zeroValue A neutral "zero value".
1218+ # ' @param op An associative function for the folding operation.
1219+ # ' @return The folding result.
1220+ # ' @rdname fold
1221+ # ' @seealso reduce
1222+ # ' @export
1223+ # ' @examples
1224+ # '\dontrun{
1225+ # ' sc <- sparkR.init()
1226+ # ' rdd <- parallelize(sc, list(1, 2, 3, 4, 5))
1227+ # ' fold(rdd, 0, "+") # 15
1228+ # '}
1229+ setGeneric ("fold ", function(rdd, zeroValue, op) { standardGeneric("fold") })
1230+
1231+ # ' @rdname fold
1232+ # ' @aliases fold,RDD,RDD-method
1233+ setMethod ("fold ",
1234+ signature(rdd = " RDD" , zeroValue = " ANY" , op = " ANY" ),
1235+ function (rdd , zeroValue , op ) {
1236+ aggregateRDD(rdd , zeroValue , op , op )
1237+ })
1238+
1239+ # ' Aggregate an RDD using the given combine functions and a neutral "zero value".
1240+ # '
1241+ # ' Aggregate the elements of each partition, and then the results for all the
1242+ # ' partitions, using given combine functions and a neutral "zero value".
1243+ # '
1244+ # ' @param rdd An RDD.
1245+ # ' @param zeroValue A neutral "zero value".
1246+ # ' @param seqOp A function to aggregate the RDD elements. It may return a different
1247+ # ' result type from the type of the RDD elements.
1248+ # ' @param combOp A function to aggregate results of seqOp.
1249+ # ' @return The aggregation result.
1250+ # ' @rdname aggregateRDD
1251+ # ' @seealso reduce
1252+ # ' @export
1253+ # ' @examples
1254+ # '\dontrun{
1255+ # ' sc <- sparkR.init()
1256+ # ' rdd <- parallelize(sc, list(1, 2, 3, 4))
1257+ # ' zeroValue <- list(0, 0)
1258+ # ' seqOp <- function(x, y) { list(x[[1]] + y, x[[2]] + 1) }
1259+ # ' combOp <- function(x, y) { list(x[[1]] + y[[1]], x[[2]] + y[[2]]) }
1260+ # ' aggregateRDD(rdd, zeroValue, seqOp, combOp) # list(10, 4)
1261+ # '}
1262+ setGeneric ("aggregateRDD ", function(rdd, zeroValue, seqOp, combOp) { standardGeneric("aggregateRDD") })
1263+
1264+ # ' @rdname aggregateRDD
1265+ # ' @aliases aggregateRDD,RDD,RDD-method
1266+ setMethod ("aggregateRDD ",
1267+ signature(rdd = " RDD" , zeroValue = " ANY" , seqOp = " ANY" , combOp = " ANY" ),
1268+ function (rdd , zeroValue , seqOp , combOp ) {
1269+ partitionFunc <- function (part ) {
1270+ Reduce(seqOp , part , zeroValue )
1271+ }
1272+
1273+ partitionList <- collect(lapplyPartition(rdd , partitionFunc ),
1274+ flatten = FALSE )
1275+ Reduce(combOp , partitionList , zeroValue )
1276+ })
1277+
1278+ # ########### Shuffle Functions ############
1279+
1280+ # ' Partition an RDD by key
1281+ # '
1282+ # ' This function operates on RDDs where every element is of the form list(K, V) or c(K, V).
1283+ # ' For each element of this RDD, the partitioner is used to compute a hash
1284+ # ' function and the RDD is partitioned using this hash value.
1285+ # '
1286+ # ' @param rdd The RDD to partition. Should be an RDD where each element is
1287+ # ' list(K, V) or c(K, V).
1288+ # ' @param numPartitions Number of partitions to create.
1289+ # ' @param ... Other optional arguments to partitionBy.
1290+ # '
1291+ # ' @param partitionFunc The partition function to use. Uses a default hashCode
1292+ # ' function if not provided
1293+ # ' @return An RDD partitioned using the specified partitioner.
1294+ # ' @rdname partitionBy
1295+ # ' @export
1296+ # ' @examples
1297+ # '\dontrun{
1298+ # ' sc <- sparkR.init()
1299+ # ' pairs <- list(list(1, 2), list(1.1, 3), list(1, 4))
1300+ # ' rdd <- parallelize(sc, pairs)
1301+ # ' parts <- partitionBy(rdd, 2L)
1302+ # ' collectPartition(parts, 0L) # First partition should contain list(1, 2) and list(1, 4)
1303+ # '}
1304+ setGeneric ("partitionBy ",
1305+ function (rdd , numPartitions , ... ) {
1306+ standardGeneric(" partitionBy" )
1307+ })
1308+
1309+ # ' @rdname partitionBy
1310+ # ' @aliases partitionBy,RDD,integer-method
1311+ setMethod ("partitionBy ",
1312+ signature(rdd = " RDD" , numPartitions = " integer" ),
1313+ function (rdd , numPartitions , partitionFunc = hashCode ) {
1314+
1315+ # if (missing(partitionFunc)) {
1316+ # partitionFunc <- hashCode
1317+ # }
1318+
1319+ depsBinArr <- getDependencies(partitionFunc )
1320+
1321+ serializedHashFuncBytes <- serialize(as.character(substitute(partitionFunc )),
1322+ connection = NULL ,
1323+ ascii = TRUE )
1324+
1325+ packageNamesArr <- serialize(.sparkREnv $ .packages ,
1326+ connection = NULL ,
1327+ ascii = TRUE )
1328+ broadcastArr <- lapply(ls(.broadcastNames ), function (name ) {
1329+ get(name , .broadcastNames ) })
1330+ jrdd <- getJRDD(rdd )
1331+
1332+ # We create a PairwiseRRDD that extends RDD[(Array[Byte],
1333+ # Array[Byte])], where the key is the hashed split, the value is
1334+ # the content (key-val pairs).
1335+ pairwiseRRDD <- newJObject(" edu.berkeley.cs.amplab.sparkr.PairwiseRRDD" ,
1336+ callJMethod(jrdd , " rdd" ),
1337+ as.integer(numPartitions ),
1338+ serializedHashFuncBytes ,
1339+ rdd @ env $ serialized ,
1340+ depsBinArr ,
1341+ packageNamesArr ,
1342+ as.character(.sparkREnv $ libname ),
1343+ broadcastArr ,
1344+ callJMethod(jrdd , " classTag" ))
1345+
1346+ # Create a corresponding partitioner.
1347+ rPartitioner <- newJObject(" org.apache.spark.HashPartitioner" ,
1348+ as.integer(numPartitions ))
1349+
1350+ # Call partitionBy on the obtained PairwiseRDD.
1351+ javaPairRDD <- callJMethod(pairwiseRRDD , " asJavaPairRDD" )
1352+ javaPairRDD <- callJMethod(javaPairRDD , " partitionBy" , rPartitioner )
1353+
1354+ # Call .values() on the result to get back the final result, the
1355+ # shuffled acutal content key-val pairs.
1356+ r <- callJMethod(javaPairRDD , " values" )
1357+
1358+ RDD(r , serialized = TRUE )
1359+ })
1360+
11311361# ########### Binary Functions #############
11321362
11331363# ' Return the union RDD of two RDDs.
0 commit comments