-
Notifications
You must be signed in to change notification settings - Fork 382
Enhance pipe to support more features we can do in hadoop streaming #638
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
9f84315
4a9913d
e179ff8
fb6d733
e6ae049
33c55ec
728665d
4508089
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -16,6 +16,7 @@ import org.apache.hadoop.mapred.TextOutputFormat | |
|
|
||
| import it.unimi.dsi.fastutil.objects.{Object2LongOpenHashMap => OLMap} | ||
|
|
||
| import spark.broadcast.Broadcast | ||
| import spark.Partitioner._ | ||
| import spark.partial.BoundedDouble | ||
| import spark.partial.CountEvaluator | ||
|
|
@@ -351,13 +352,93 @@ abstract class RDD[T: ClassManifest]( | |
| /** | ||
| * Return an RDD created by piping elements to a forked external process. | ||
| */ | ||
| def pipe(command: Seq[String]): RDD[String] = new PipedRDD(this, command) | ||
| def pipe(command: String, env: Map[String, String]): RDD[String] = | ||
| new PipedRDD(this, command, env) | ||
|
|
||
| /** | ||
| * Return an RDD created by piping elements to a forked external process. | ||
| */ | ||
| def pipe(command: Seq[String], env: Map[String, String]): RDD[String] = | ||
| new PipedRDD(this, command, env) | ||
| * How each record in RDD is output to the process can be controlled by providing a | ||
| * function transform(T, outputFunction: String => Unit). transform() will be called with | ||
| * the current record in RDD as the 1st parameter, and the function to output the record to | ||
| * the external process (like out.println()) as the 2nd parameter. | ||
| * Here's an example on how to pipe the RDD data of groupBy() in a streaming way, | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe add blank lines inside the comment so that this isn't a wall of text? And maybe enclose parameter names in backticks to make them more apparent?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It might be cleaner to use the Scaladoc
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. will change. |
||
| * instead of constructing a huge String to concat all the records: | ||
| * def transform(record:(String, Seq[String]), f:String=>Unit) = for (e <- record._2){f(e)} | ||
| * pipeContext can be used to transfer additional context data to the external process | ||
| * besides the RDD. pipeContext is a broadcast Seq[String], each line would be piped to | ||
| * external process with "^A" as the delimiter in the end of context data. Delimiter can also | ||
| * be customized by the last parameter delimiter. | ||
| */ | ||
| def pipe[U<: Seq[String]]( | ||
| command: String, | ||
| env: Map[String, String], | ||
| transform: (T,String => Unit) => Any, | ||
| pipeContext: Broadcast[U], | ||
| delimiter: String): RDD[String] = | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Will change. |
||
| new PipedRDD(this, command, env, transform, pipeContext, delimiter) | ||
|
|
||
| /** | ||
| * Return an RDD created by piping elements to a forked external process. | ||
| * How each record in RDD is output to the process can be controlled by providing a | ||
| * function transform(T, outputFunction: String => Unit). transform() will be called with | ||
| * the current record in RDD as the 1st parameter, and the function to output the record to | ||
| * the external process (like out.println()) as the 2nd parameter. | ||
| * Here's an example on how to pipe the RDD data of groupBy() in a streaming way, | ||
| * instead of constructing a huge String to concat all the records: | ||
| * def transform(record:(String, Seq[String]), f:String=>Unit) = for (e <- record._2){f(e)} | ||
| * pipeContext can be used to transfer additional context data to the external process | ||
| * besides the RDD. pipeContext is a broadcast Seq[String], each line would be piped to | ||
| * external process with "^A" as the delimiter in the end of context data. Delimiter can also | ||
| * be customized by the last parameter delimiter. | ||
| */ | ||
| def pipe[U<: Seq[String]]( | ||
| command: String, | ||
| transform: (T,String => Unit) => Any, | ||
| pipeContext: Broadcast[U]): RDD[String] = | ||
| new PipedRDD(this, command, Map[String, String](), transform, pipeContext, "\u0001") | ||
|
|
||
| /** | ||
| * Return an RDD created by piping elements to a forked external process. | ||
| * How each record in RDD is output to the process can be controlled by providing a | ||
| * function transform(T, outputFunction: String => Unit). transform() will be called with | ||
| * the current record in RDD as the 1st parameter, and the function to output the record to | ||
| * the external process (like out.println()) as the 2nd parameter. | ||
| * Here's an example on how to pipe the RDD data of groupBy() in a streaming way, | ||
| * instead of constructing a huge String to concat all the records: | ||
| * def transform(record:(String, Seq[String]), f:String=>Unit) = for (e <- record._2){f(e)} | ||
| * pipeContext can be used to transfer additional context data to the external process | ||
| * besides the RDD. pipeContext is a broadcast Seq[String], each line would be piped to | ||
| * external process with "^A" as the delimiter in the end of context data. Delimiter can also | ||
| * be customized by the last parameter delimiter. | ||
| */ | ||
| def pipe[U<: Seq[String]]( | ||
|
||
| command: String, | ||
| env: Map[String, String], | ||
| transform: (T,String => Unit) => Any, | ||
| pipeContext: Broadcast[U]): RDD[String] = | ||
| new PipedRDD(this, command, env, transform, pipeContext, "\u0001") | ||
|
|
||
| /** | ||
| * Return an RDD created by piping elements to a forked external process. | ||
| * How each record in RDD is output to the process can be controlled by providing a | ||
| * function transform(T, outputFunction: String => Unit). transform() will be called with | ||
| * the current record in RDD as the 1st parameter, and the function to output the record to | ||
| * the external process (like out.println()) as the 2nd parameter. | ||
| * Here's an example on how to pipe the RDD data of groupBy() in a streaming way, | ||
| * instead of constructing a huge String to concat all the records: | ||
| * def transform(record:(String, Seq[String]), f:String=>Unit) = for (e <- record._2){f(e)} | ||
| * pipeContext can be used to transfer additional context data to the external process | ||
| * besides the RDD. pipeContext is a broadcast Seq[String], each line would be piped to | ||
| * external process with "^A" as the delimiter in the end of context data. Delimiter can also | ||
| * be customized by the last parameter delimiter. | ||
| */ | ||
| def pipe[U<: Seq[String]]( | ||
| command: Seq[String], | ||
| env: Map[String, String] = Map(), | ||
| transform: (T,String => Unit) => Any = null, | ||
| pipeContext: Broadcast[U] = null, | ||
| delimiter: String = "\u0001"): RDD[String] = | ||
| new PipedRDD(this, command, env, transform, pipeContext, delimiter) | ||
|
|
||
| /** | ||
| * Return a new RDD by applying a function to each partition of this RDD. | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -9,23 +9,34 @@ import scala.collection.mutable.ArrayBuffer | |
| import scala.io.Source | ||
|
|
||
| import spark.{RDD, SparkEnv, Partition, TaskContext} | ||
| import spark.broadcast.Broadcast | ||
|
|
||
|
|
||
| /** | ||
| * An RDD that pipes the contents of each parent partition through an external command | ||
| * (printing them one per line) and returns the output as a collection of strings. | ||
| */ | ||
| class PipedRDD[T: ClassManifest]( | ||
| class PipedRDD[T: ClassManifest, U <: Seq[String]]( | ||
| prev: RDD[T], | ||
| command: Seq[String], | ||
| envVars: Map[String, String]) | ||
| envVars: Map[String, String], | ||
| transform: (T, String => Unit) => Any, | ||
| pipeContext: Broadcast[U], | ||
| delimiter: String | ||
| ) | ||
| extends RDD[String](prev) { | ||
|
|
||
| def this(prev: RDD[T], command: Seq[String]) = this(prev, command, Map()) | ||
|
|
||
| // Similar to Runtime.exec(), if we are given a single string, split it into words | ||
| // using a standard StringTokenizer (i.e. by spaces) | ||
| def this(prev: RDD[T], command: String) = this(prev, PipedRDD.tokenize(command)) | ||
| def this( | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You can put the default arguments in the default constructor (i.e. the one right after "class PipedRDD").
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ehm. You are right. I'll change this. |
||
| prev: RDD[T], | ||
| command: String, | ||
| envVars: Map[String, String] = Map(), | ||
| transform: (T, String => Unit) => Any = null, | ||
| pipeContext: Broadcast[U] = null, | ||
| delimiter: String = "\u0001") = | ||
| this(prev, PipedRDD.tokenize(command), envVars, transform, pipeContext, delimiter) | ||
|
|
||
|
|
||
| override def getPartitions: Array[Partition] = firstParent[T].partitions | ||
|
|
||
|
|
@@ -52,8 +63,21 @@ class PipedRDD[T: ClassManifest]( | |
| override def run() { | ||
| SparkEnv.set(env) | ||
| val out = new PrintWriter(proc.getOutputStream) | ||
|
|
||
| // input the pipeContext firstly | ||
| if ( pipeContext != null) { | ||
| for (elem <- pipeContext.value) { | ||
| out.println(elem) | ||
| } | ||
| // delimiter\n as the marker of the end of the pipeContext | ||
| out.println(delimiter) | ||
| } | ||
| for (elem <- firstParent[T].iterator(split, context)) { | ||
| out.println(elem) | ||
| if (transform != null) { | ||
| transform(elem, out.println(_)) | ||
|
||
| } else { | ||
| out.println(elem) | ||
| } | ||
| } | ||
| out.close() | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Typo:
trasnformshould betransform.