@@ -30,11 +30,11 @@ import scala.reflect.ClassTag
 
 import com.clearspring.analytics.stream.cardinality.HyperLogLog
 import org.apache.hadoop.conf.{Configurable, Configuration}
-import org.apache.hadoop.fs.{FileSystem, Path}
+import org.apache.hadoop.fs.FileSystem
 import org.apache.hadoop.io.SequenceFile.CompressionType
 import org.apache.hadoop.io.compress.CompressionCodec
 import org.apache.hadoop.mapred.{FileOutputCommitter, FileOutputFormat, JobConf, OutputFormat}
-import org.apache.hadoop.mapreduce.{OutputFormat => NewOutputFormat, Job => NewAPIHadoopJob, RecordWriter => NewRecordWriter, JobContext, SparkHadoopMapReduceUtil}
+import org.apache.hadoop.mapreduce.{OutputFormat => NewOutputFormat, Job => NewAPIHadoopJob, RecordWriter => NewRecordWriter, SparkHadoopMapReduceUtil}
 import org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat => NewFileOutputFormat}
 
 // SparkHadoopWriter and SparkHadoopMapReduceUtil are actually source files defined in Spark.
@@ -603,50 +603,9 @@ class PairRDDFunctions[K: ClassTag, V: ClassTag](self: RDD[(K, V)])
     val job = new NewAPIHadoopJob(conf)
     job.setOutputKeyClass(keyClass)
     job.setOutputValueClass(valueClass)
-
-    val wrappedConf = new SerializableWritable(job.getConfiguration)
-    val outpath = new Path(path)
-    NewFileOutputFormat.setOutputPath(job, outpath)
-    val jobFormat = outputFormatClass.newInstance
-    jobFormat.checkOutputSpecs(job)
-    val formatter = new SimpleDateFormat("yyyyMMddHHmm")
-    val jobtrackerID = formatter.format(new Date())
-    val stageId = self.id
-    def writeShard(context: TaskContext, iter: Iterator[(K, V)]): Int = {
-      // Hadoop wants a 32-bit task attempt ID, so if ours is bigger than Int.MaxValue, roll it
-      // around by taking a mod. We expect that no task will be attempted 2 billion times.
-      val attemptNumber = (context.attemptId % Int.MaxValue).toInt
-      /* "reduce task" <split #> <attempt # = spark task #> */
-      val attemptId = newTaskAttemptID(jobtrackerID, stageId, isMap = false, context.partitionId,
-        attemptNumber)
-      val hadoopContext = newTaskAttemptContext(wrappedConf.value, attemptId)
-      val format = outputFormatClass.newInstance
-      format match {
-        case c: Configurable => c.setConf(wrappedConf.value)
-        case _ => ()
-      }
-      val committer = format.getOutputCommitter(hadoopContext)
-      committer.setupTask(hadoopContext)
-      val writer = format.getRecordWriter(hadoopContext).asInstanceOf[NewRecordWriter[K, V]]
-      while (iter.hasNext) {
-        val (k, v) = iter.next()
-        writer.write(k, v)
-      }
-      writer.close(hadoopContext)
-      committer.commitTask(hadoopContext)
-      return 1
-    }
-
-    /* apparently we need a TaskAttemptID to construct an OutputCommitter;
-     * however we're only going to use this local OutputCommitter for
-     * setupJob/commitJob, so we just use a dummy "map" task.
-     */
-    val jobAttemptId = newTaskAttemptID(jobtrackerID, stageId, isMap = true, 0, 0)
-    val jobTaskContext = newTaskAttemptContext(wrappedConf.value, jobAttemptId)
-    val jobCommitter = jobFormat.getOutputCommitter(jobTaskContext)
-    jobCommitter.setupJob(jobTaskContext)
-    self.context.runJob(self, writeShard _)
-    jobCommitter.commitJob(jobTaskContext)
+    job.setOutputFormatClass(outputFormatClass)
+    job.getConfiguration.set("mapred.output.dir", path)
+    saveAsNewAPIHadoopDataset(job.getConfiguration)
   }
 
   /**
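With this change, saveAsNewAPIHadoopFile only sets the output format class and the mapred.output.dir property on the job and hands the resulting Configuration to saveAsNewAPIHadoopDataset (added in the next hunk), so the caller-side API is unchanged. A minimal caller-side sketch, assuming an existing SparkContext named sc; the RDD, output directory, and Writable types below are hypothetical:

import org.apache.hadoop.io.{IntWritable, Text}
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat
import org.apache.spark.SparkContext._   // implicit conversion to PairRDDFunctions

// Hypothetical pair RDD; the Writable key/value types match the output format.
val counts = sc.parallelize(Seq(("a", 1), ("b", 2)))
  .map { case (k, v) => (new Text(k), new IntWritable(v)) }

// Same call as before the refactor; internally it now routes through
// saveAsNewAPIHadoopDataset with mapred.output.dir set to the path below.
counts.saveAsNewAPIHadoopFile(
  "/tmp/counts-out",                              // hypothetical output directory
  classOf[Text],
  classOf[IntWritable],
  classOf[TextOutputFormat[Text, IntWritable]])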
@@ -692,6 +651,59 @@ class PairRDDFunctions[K: ClassTag, V: ClassTag](self: RDD[(K, V)])
     saveAsHadoopDataset(conf)
   }
 
+  /**
+   * Output the RDD to any Hadoop-supported storage system with new Hadoop API, using a Hadoop
+   * Configuration object for that storage system. The Conf should set an OutputFormat and any
+   * output paths required (e.g. a table name to write to) in the same way as it would be
+   * configured for a Hadoop MapReduce job.
+   */
+  def saveAsNewAPIHadoopDataset(conf: Configuration) {
+    val job = new NewAPIHadoopJob(conf)
+    val formatter = new SimpleDateFormat("yyyyMMddHHmm")
+    val jobtrackerID = formatter.format(new Date())
+    val stageId = self.id
+    val wrappedConf = new SerializableWritable(job.getConfiguration)
+    val outfmt = job.getOutputFormatClass
+    val jobFormat = outfmt.newInstance
+
+    if (jobFormat.isInstanceOf[NewFileOutputFormat[_, _]]) {
+      // FileOutputFormat ignores the filesystem parameter
+      jobFormat.checkOutputSpecs(job)
+    }
+
+    def writeShard(context: TaskContext, iter: Iterator[(K, V)]): Int = {
+      // Hadoop wants a 32-bit task attempt ID, so if ours is bigger than Int.MaxValue, roll it
+      // around by taking a mod. We expect that no task will be attempted 2 billion times.
+      val attemptNumber = (context.attemptId % Int.MaxValue).toInt
+      /* "reduce task" <split #> <attempt # = spark task #> */
+      val attemptId = newTaskAttemptID(jobtrackerID, stageId, isMap = false, context.partitionId,
+        attemptNumber)
+      val hadoopContext = newTaskAttemptContext(wrappedConf.value, attemptId)
+      val format = outfmt.newInstance
+      format match {
+        case c: Configurable => c.setConf(wrappedConf.value)
+        case _ => ()
+      }
+      val committer = format.getOutputCommitter(hadoopContext)
+      committer.setupTask(hadoopContext)
+      val writer = format.getRecordWriter(hadoopContext).asInstanceOf[NewRecordWriter[K, V]]
+      while (iter.hasNext) {
+        val (k, v) = iter.next()
+        writer.write(k, v)
+      }
+      writer.close(hadoopContext)
+      committer.commitTask(hadoopContext)
+      return 1
+    }
+
+    val jobAttemptId = newTaskAttemptID(jobtrackerID, stageId, isMap = true, 0, 0)
+    val jobTaskContext = newTaskAttemptContext(wrappedConf.value, jobAttemptId)
+    val jobCommitter = jobFormat.getOutputCommitter(jobTaskContext)
+    jobCommitter.setupJob(jobTaskContext)
+    self.context.runJob(self, writeShard _)
+    jobCommitter.commitJob(jobTaskContext)
+  }
+
   /**
    * Output the RDD to any Hadoop-supported storage system, using a Hadoop JobConf object for
    * that storage system. The JobConf should set an OutputFormat and any output paths required
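The new saveAsNewAPIHadoopDataset is the general entry point for non-file sinks such as the "table name to write to" case mentioned in its Scaladoc: the caller builds a Configuration (or new-API Job) naming the OutputFormat and its destination, and the method handles the per-partition record writers and the commit protocol. A hedged sketch using HBase's TableOutputFormat as one possible sink; the HBase classes come from the HBase client library (not this patch), and the table name, column family, and the existing RDD[(String, Int)] called counts are hypothetical:

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.SparkContext._   // implicit conversion to PairRDDFunctions

// Configure the sink exactly as a MapReduce job would be configured: the
// Configuration names the OutputFormat and the table to write to; no filesystem path.
val hbaseConf = HBaseConfiguration.create()               // reads hbase-site.xml from the classpath
hbaseConf.set(TableOutputFormat.OUTPUT_TABLE, "counts")   // hypothetical table name
val job = new Job(hbaseConf)
job.setOutputKeyClass(classOf[ImmutableBytesWritable])
job.setOutputValueClass(classOf[Put])
job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])

// Assume an existing RDD[(String, Int)] named counts; turn each pair into an HBase Put.
val puts = counts.map { case (word, n) =>
  val put = new Put(Bytes.toBytes(word))
  put.add(Bytes.toBytes("cf"), Bytes.toBytes("count"), Bytes.toBytes(n))  // hypothetical column family
  (new ImmutableBytesWritable(Bytes.toBytes(word)), put)
}
puts.saveAsNewAPIHadoopDataset(job.getConfiguration)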