diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/FileOutputCommitter.java b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/FileOutputCommitter.java index 7f1ea6175ed6e..a1749b0c8a9a0 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/FileOutputCommitter.java +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/java/org/apache/hadoop/mapreduce/lib/output/FileOutputCommitter.java @@ -68,7 +68,7 @@ public class FileOutputCommitter extends PathOutputCommitter { "mapreduce.fileoutputcommitter.marksuccessfuljobs"; public static final String FILEOUTPUTCOMMITTER_ALGORITHM_VERSION = "mapreduce.fileoutputcommitter.algorithm.version"; - public static final int FILEOUTPUTCOMMITTER_ALGORITHM_VERSION_DEFAULT = 2; + public static final int FILEOUTPUTCOMMITTER_ALGORITHM_VERSION_DEFAULT = 1; // Skip cleanup _temporary folders under job's output directory public static final String FILEOUTPUTCOMMITTER_CLEANUP_SKIPPED = "mapreduce.fileoutputcommitter.cleanup.skipped"; @@ -348,6 +348,19 @@ public Path getWorkPath() throws IOException { * @param context the job's context */ public void setupJob(JobContext context) throws IOException { + // warning about v2 which is made to a custom logger so people can + // turn it off if they are happy with the v2 commit protocol. + if (algorithmVersion == 2) { + Logger log = LoggerFactory.getLogger( + "org.apache.hadoop.mapreduce.lib.output." + + "FileOutputCommitter.Algorithm"); + + log.warn("The v2 commit algorithm assumes that the content of generated output files is" + + " consistent across all task attempts" + + " -if this is not true for this job, switch to the v1 commit algorithm." + + " See MAPREDUCE-7282"); + } + if (hasOutputPath()) { Path jobAttemptPath = getJobAttemptPath(context); FileSystem fs = jobAttemptPath.getFileSystem( diff --git a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/resources/mapred-default.xml b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/resources/mapred-default.xml index c40bb0b19c88b..e6f2b311a7c3a 100644 --- a/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/resources/mapred-default.xml +++ b/hadoop-mapreduce-project/hadoop-mapreduce-client/hadoop-mapreduce-client-core/src/main/resources/mapred-default.xml @@ -1562,10 +1562,10 @@ mapreduce.fileoutputcommitter.algorithm.version - 2 - The file output committer algorithm version - valid algorithm version number: 1 or 2 - default to 2, which is the original algorithm + 1 + The file output committer algorithm version. + + There are two algorithm versions in Hadoop, "1" and "2". In algorithm version 1, @@ -1611,6 +1611,35 @@ large jobs by having the tasks commit directly to the final output directory as they were completing and commitJob had very little to do. + + The v2 commit algorithm assumes that the content of generated output files + is consistent across all task attempts - if this is not true for a job, + the v1 commit algorithm should be used. + + This is because task commits are not atomic + + If a first task attempt fails part-way + through its task commit, the output directory can end up + with data from that failed commit, alongside the data + from any subsequent attempts. + + See https://issues.apache.org/jira/browse/MAPREDUCE-7282 + + Although no-longer the default, this algorithm is safe to use if + all task attempts for a single task meet the following requirements + -they generate exactly the same set of files + -the contents of each file are exactly the same in each task attempt + + That is: + 1. If a second attempt commits work, there will be no leftover files from + a first attempt which failed during its task commit. + 2. If a network partition causes the first task attempt to overwrite + some/all of the output of a second attempt, the result will be + exactly the same as if it had not done so. + + To avoid the warning message on job setup, set the log level of the log + org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter.Algorithm + to ERROR.