1717
1818package org .apache .spark .rdd
1919
20+ import java .io .File
21+ import java .io .FilenameFilter
2022import java .io .PrintWriter
2123import java .util .StringTokenizer
2224
@@ -26,7 +28,9 @@ import scala.collection.mutable.ArrayBuffer
2628import scala .io .Source
2729import scala .reflect .ClassTag
2830
31+ import org .apache .commons .io .FileUtils
2932import org .apache .spark .{Partition , SparkEnv , TaskContext }
33+ import org .apache .spark .util .Utils
3034
3135
3236/**
@@ -38,7 +42,8 @@ class PipedRDD[T: ClassTag](
3842 command : Seq [String ],
3943 envVars : Map [String , String ],
4044 printPipeContext : (String => Unit ) => Unit ,
41- printRDDElement : (T , String => Unit ) => Unit )
45+ printRDDElement : (T , String => Unit ) => Unit ,
46+ separateWorkingDir : Boolean )
4247 extends RDD [String ](prev) {
4348
4449 // Similar to Runtime.exec(), if we are given a single string, split it into words
@@ -48,12 +53,24 @@ class PipedRDD[T: ClassTag](
4853 command : String ,
4954 envVars : Map [String , String ] = Map (),
5055 printPipeContext : (String => Unit ) => Unit = null ,
51- printRDDElement : (T , String => Unit ) => Unit = null ) =
52- this (prev, PipedRDD .tokenize(command), envVars, printPipeContext, printRDDElement)
56+ printRDDElement : (T , String => Unit ) => Unit = null ,
57+ separateWorkingDir : Boolean = false ) =
58+ this (prev, PipedRDD .tokenize(command), envVars, printPipeContext, printRDDElement,
59+ separateWorkingDir)
5360
5461
5562 override def getPartitions : Array [Partition ] = firstParent[T ].partitions
5663
/**
 * A FilenameFilter that accepts anything that isn't equal to the name passed in.
 * @param filterName name of file or directory to leave out
 */
class NotEqualsFileNameFilter(filterName: String) extends FilenameFilter {
  // BUG FIX: the original declared the constructor parameter and the accept()
  // parameter with the same name ("name"), so the method parameter shadowed the
  // field and `!name.equals(name)` was always false — the filter rejected every
  // entry instead of rejecting only the one it was built to exclude.
  def accept(dir: File, name: String): Boolean = {
    !name.equals(filterName)
  }
}
73+
5774 override def compute (split : Partition , context : TaskContext ): Iterator [String ] = {
5875 val pb = new ProcessBuilder (command)
5976 // Add the environmental variables to the process.
@@ -67,6 +84,38 @@ class PipedRDD[T: ClassTag](
6784 currentEnvVars.putAll(hadoopSplit.getPipeEnvVars())
6885 }
6986
87+ // When spark.worker.separated.working.directory option is turned on, each
88+ // task will be run in a separate directory. This should resolve the file
89+ // access conflict issue
90+ val taskDirectory = " ./tasks/" + java.util.UUID .randomUUID.toString
91+ var workInTaskDirectory = false
92+ logDebug(" taskDirectory = " + taskDirectory)
93+ if (separateWorkingDir == true ) {
94+ val currentDir = new File (" ." )
95+ logDebug(" currentDir = " + currentDir)
96+ val taskDirFile = new File (taskDirectory)
97+ taskDirFile.mkdirs()
98+
99+ try {
100+ val tasksDirFilter = new NotEqualsFileNameFilter (" tasks" )
101+
102+ // Need to add symlinks to jars, files, and directories. On Yarn we could have
103+ // directories and other files not known to the SparkContext that were added via the
104+ // Hadoop distributed cache. We also don't want to symlink to the /tasks directories we
105+ // are creating here.
106+ for (file <- currentDir.list(tasksDirFilter)) {
107+ val fileWithDir = new File (currentDir, file)
108+ Utils .symlink(new File (fileWithDir.getAbsolutePath()),
109+ new File (taskDirectory + " /" + fileWithDir.getName()))
110+ }
111+ pb.directory(taskDirFile)
112+ workInTaskDirectory = true
113+ } catch {
114+ case e : Exception => logError(" Unable to setup task working directory: " + e.getMessage +
115+ " (" + taskDirectory + " )" )
116+ }
117+ }
118+
70119 val proc = pb.start()
71120 val env = SparkEnv .get
72121
@@ -112,6 +161,13 @@ class PipedRDD[T: ClassTag](
112161 if (exitStatus != 0 ) {
113162 throw new Exception (" Subprocess exited with status " + exitStatus)
114163 }
164+
165+ // cleanup task working directory if used
166+ if (workInTaskDirectory == true ) {
167+ FileUtils .deleteQuietly(new File (taskDirectory))
168+ logDebug(" Removed task working directory " + taskDirectory)
169+ }
170+
115171 false
116172 }
117173 }
0 commit comments