3 changes: 2 additions & 1 deletion dev/appveyor-install-dependencies.ps1
@@ -95,7 +95,8 @@ $env:MAVEN_OPTS = "-Xmx2g -XX:ReservedCodeCacheSize=512m"
Pop-Location

# ========================== Hadoop bin package
$hadoopVer = "2.6.4"
# This must match the version at https://github.com/steveloughran/winutils/tree/master/hadoop-2.7.1
$hadoopVer = "2.7.1"
$hadoopPath = "$tools\hadoop"
if (!(Test-Path $hadoopPath)) {
New-Item -ItemType Directory -Force -Path $hadoopPath | Out-Null
43 changes: 22 additions & 21 deletions dev/create-release/release-build.sh
@@ -191,9 +191,19 @@ if [[ "$1" == "package" ]]; then
make_binary_release() {
NAME=$1
FLAGS="$MVN_EXTRA_OPTS -B $BASE_RELEASE_PROFILES $2"
# BUILD_PACKAGE can be "withpip", "withr", or both as "withpip,withr"
BUILD_PACKAGE=$3
SCALA_VERSION=$4

PIP_FLAG=""
if [[ $BUILD_PACKAGE == *"withpip"* ]]; then
Member (author) commented:
@vanzin what do you think of this approach? It simplifies the logic below too, avoiding repeating the main build step 3 times.

Contributor commented:

Looks fine. Using wildcards is a little weird but I guess that's the cleanest way in bash.

But shouldn't you initialize PIP_FLAG and R_FLAG to empty before these checks?
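For readers who find the wildcard idiom unfamiliar, here is a minimal self-contained sketch (an editorial illustration, not part of this diff) of the bash substring match and the empty-flag initialization being discussed; the sample BUILD_PACKAGE value is arbitrary.

    #!/usr/bin/env bash
    # Illustrative only: mirrors the pattern used in make_binary_release.
    BUILD_PACKAGE="withpip,withr"   # could also be "withpip", "withr", or empty

    # Initialize both flags to empty so a value without the keyword leaves them blank.
    PIP_FLAG=""
    R_FLAG=""

    # [[ $var == *"substr"* ]] is bash glob (wildcard) matching, not a regex:
    # the surrounding *s match anything before and after the literal substring.
    if [[ $BUILD_PACKAGE == *"withpip"* ]]; then
      PIP_FLAG="--pip"
    fi
    if [[ $BUILD_PACKAGE == *"withr"* ]]; then
      R_FLAG="--r"
    fi

    echo "flags: $PIP_FLAG $R_FLAG"   # prints "flags: --pip --r" for the value above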

Member commented:

One caveat: I'm not sure we have tested building both Python and R in "one build".

This could be a good thing, but if I recall correctly, the R build changes some of the binary files under R/ that get shipped in the "source release" (these are required R object files).

PIP_FLAG="--pip"
fi
R_FLAG=""
if [[ $BUILD_PACKAGE == *"withr"* ]]; then
R_FLAG="--r"
fi

# We increment the Zinc port each time to avoid OOM's and other craziness if multiple builds
# share the same Zinc server.
ZINC_PORT=$((ZINC_PORT + 1))
@@ -217,18 +227,13 @@ if [[ "$1" == "package" ]]; then
# Get maven home set by MVN
MVN_HOME=`$MVN -version 2>&1 | grep 'Maven home' | awk '{print $NF}'`

echo "Creating distribution"
./dev/make-distribution.sh --name $NAME --mvn $MVN_HOME/bin/mvn --tgz \
$PIP_FLAG $R_FLAG $FLAGS \
-DzincPort=$ZINC_PORT 2>&1 > ../binary-release-$NAME.log
cd ..

if [ -z "$BUILD_PACKAGE" ]; then
echo "Creating distribution without PIP/R package"
./dev/make-distribution.sh --name $NAME --mvn $MVN_HOME/bin/mvn --tgz $FLAGS \
-DzincPort=$ZINC_PORT 2>&1 > ../binary-release-$NAME.log
cd ..
elif [[ "$BUILD_PACKAGE" == "withr" ]]; then
echo "Creating distribution with R package"
./dev/make-distribution.sh --name $NAME --mvn $MVN_HOME/bin/mvn --tgz --r $FLAGS \
-DzincPort=$ZINC_PORT 2>&1 > ../binary-release-$NAME.log
cd ..

if [[ -n $R_FLAG ]]; then
echo "Copying and signing R source package"
R_DIST_NAME=SparkR_$SPARK_VERSION.tar.gz
cp spark-$SPARK_VERSION-bin-$NAME/R/$R_DIST_NAME .
@@ -239,12 +244,9 @@ if [[ "$1" == "package" ]]; then
echo $GPG_PASSPHRASE | $GPG --passphrase-fd 0 --print-md \
SHA512 $R_DIST_NAME > \
$R_DIST_NAME.sha512
else
echo "Creating distribution with PIP package"
./dev/make-distribution.sh --name $NAME --mvn $MVN_HOME/bin/mvn --tgz --pip $FLAGS \
-DzincPort=$ZINC_PORT 2>&1 > ../binary-release-$NAME.log
cd ..
fi

if [[ -n $PIP_FLAG ]]; then
echo "Copying and signing python distribution"
PYTHON_DIST_NAME=pyspark-$PYSPARK_VERSION.tar.gz
cp spark-$SPARK_VERSION-bin-$NAME/python/dist/$PYTHON_DIST_NAME .
@@ -277,19 +279,18 @@ if [[ "$1" == "package" ]]; then
declare -A BINARY_PKGS_ARGS
BINARY_PKGS_ARGS["hadoop2.7"]="-Phadoop-2.7 $HIVE_PROFILES"
if ! is_dry_run; then
BINARY_PKGS_ARGS["hadoop2.6"]="-Phadoop-2.6 $HIVE_PROFILES"
BINARY_PKGS_ARGS["without-hadoop"]="-Phadoop-provided"
if [[ $SPARK_VERSION < "3.0." ]]; then
BINARY_PKGS_ARGS["hadoop2.6"]="-Phadoop-2.6 $HIVE_PROFILES"
fi
if [[ $SPARK_VERSION < "2.2." ]]; then
BINARY_PKGS_ARGS["hadoop2.4"]="-Phadoop-2.4 $HIVE_PROFILES"
BINARY_PKGS_ARGS["hadoop2.3"]="-Phadoop-2.3 $HIVE_PROFILES"
fi
fi

declare -A BINARY_PKGS_EXTRA
BINARY_PKGS_EXTRA["hadoop2.7"]="withpip"
if ! is_dry_run; then
BINARY_PKGS_EXTRA["hadoop2.6"]="withr"
fi
BINARY_PKGS_EXTRA["hadoop2.7"]="withpip,withr"

echo "Packages to build: ${!BINARY_PKGS_ARGS[@]}"
for key in ${!BINARY_PKGS_ARGS[@]}; do
198 changes: 0 additions & 198 deletions dev/deps/spark-deps-hadoop-2.6

This file was deleted.

15 changes: 3 additions & 12 deletions dev/run-tests.py
@@ -305,7 +305,6 @@ def get_hadoop_profiles(hadoop_version):
"""

sbt_maven_hadoop_profiles = {
"hadoop2.6": ["-Phadoop-2.6"],
"hadoop2.7": ["-Phadoop-2.7"],
}

@@ -369,15 +368,7 @@ def build_spark_assembly_sbt(hadoop_version, checkstyle=False):
if checkstyle:
run_java_style_checks()

# Note that we skip Unidoc build only if Hadoop 2.6 is explicitly set in this SBT build.
# Due to a different dependency resolution in SBT & Unidoc by an unknown reason, the
# documentation build fails on a specific machine & environment in Jenkins but it was unable
# to reproduce. Please see SPARK-20343. This is a band-aid fix that should be removed in
# the future.
is_hadoop_version_2_6 = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE") == "hadoop2.6"
if not is_hadoop_version_2_6:
# Make sure that Java and Scala API documentation can be generated
build_spark_unidoc_sbt(hadoop_version)
build_spark_unidoc_sbt(hadoop_version)


def build_apache_spark(build_tool, hadoop_version):
@@ -528,14 +519,14 @@ def main():
# if we're on the Amplab Jenkins build servers setup variables
# to reflect the environment settings
build_tool = os.environ.get("AMPLAB_JENKINS_BUILD_TOOL", "sbt")
hadoop_version = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE", "hadoop2.6")
hadoop_version = os.environ.get("AMPLAB_JENKINS_BUILD_PROFILE", "hadoop2.7")
test_env = "amplab_jenkins"
# add path for Python3 in Jenkins if we're calling from a Jenkins machine
os.environ["PATH"] = "/home/anaconda/envs/py3k/bin:" + os.environ.get("PATH")
else:
# else we're running locally and can use local settings
build_tool = "sbt"
hadoop_version = os.environ.get("HADOOP_PROFILE", "hadoop2.6")
hadoop_version = os.environ.get("HADOOP_PROFILE", "hadoop2.7")
test_env = "local"

print("[info] Using build tool", build_tool, "with Hadoop profile", hadoop_version,
1 change: 0 additions & 1 deletion dev/test-dependencies.sh
@@ -32,7 +32,6 @@ export LC_ALL=C
HADOOP2_MODULE_PROFILES="-Phive-thriftserver -Pmesos -Pkafka-0-8 -Pkubernetes -Pyarn -Pflume -Phive"
MVN="build/mvn"
HADOOP_PROFILES=(
hadoop-2.6
hadoop-2.7
hadoop-3.1
)
11 changes: 3 additions & 8 deletions docs/building-spark.md
@@ -49,25 +49,20 @@ To create a Spark distribution like those distributed by the
to be runnable, use `./dev/make-distribution.sh` in the project root directory. It can be configured
with Maven profile settings and so on like the direct Maven build. Example:

./dev/make-distribution.sh --name custom-spark --pip --r --tgz -Psparkr -Phadoop-2.7 -Phive -Phive-thriftserver -Pmesos -Pyarn -Pkubernetes
./dev/make-distribution.sh --name custom-spark --pip --r --tgz -Psparkr -Phive -Phive-thriftserver -Pmesos -Pyarn -Pkubernetes

This will build Spark distribution along with Python pip and R packages. For more information on usage, run `./dev/make-distribution.sh --help`

## Specifying the Hadoop Version and Enabling YARN

You can specify the exact version of Hadoop to compile against through the `hadoop.version` property.
If unset, Spark will build against Hadoop 2.6.X by default.

You can enable the `yarn` profile and optionally set the `yarn.version` property if it is different
from `hadoop.version`.

Examples:
Example:

# Apache Hadoop 2.6.X
./build/mvn -Pyarn -DskipTests clean package

# Apache Hadoop 2.7.X and later
./build/mvn -Pyarn -Phadoop-2.7 -Dhadoop.version=2.7.3 -DskipTests clean package
./build/mvn -Pyarn -Dhadoop.version=2.8.5 -DskipTests clean package
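As a hedged illustration of the earlier sentence about optionally setting yarn.version when it differs from hadoop.version (an editorial example, not part of this change; the version numbers are arbitrary):

    # Hypothetical: compile against Hadoop 2.7.7 while building the YARN module against 2.7.3
    ./build/mvn -Pyarn -Dhadoop.version=2.7.7 -Dyarn.version=2.7.3 -DskipTests clean package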

## Building With Hive and JDBC Support

3 changes: 0 additions & 3 deletions docs/index.md
@@ -30,9 +30,6 @@ Spark runs on Java 8+, Python 2.7+/3.4+ and R 3.1+. For the Scala API, Spark {{s
uses Scala {{site.SCALA_BINARY_VERSION}}. You will need to use a compatible Scala version
({{site.SCALA_BINARY_VERSION}}.x).

Note that support for Java 7, Python 2.6 and old Hadoop versions before 2.6.5 were removed as of Spark 2.2.0.
Support for Scala 2.10 was removed as of 2.3.0.
Member commented:
So we are not going to mention the supported Hadoop version?

Member (author) replied:
Now that we are on to 3.0, I figured we didn't need to keep documenting how versions 2.2 and 2.3 worked. I also felt that the particular Hadoop version was only an issue in the distant past, when we were trying to support the odd world of mutually incompatible 2.x releases before 2.2. Now it's no more of a high-level issue than anything else. Indeed, we might even just build against Hadoop 3.x in the end and de-emphasize dependence on a particular version of Hadoop. But for now I just removed this note.


# Running the Examples and Shell

Spark comes with several sample programs. Scala, Java, Python and R examples are in the
3 changes: 1 addition & 2 deletions docs/running-on-yarn.md
@@ -396,8 +396,7 @@ To use a custom metrics.properties for the application master and executors, upd
and those log files will be aggregated in a rolling fashion.
This will be used with YARN's rolling log aggregation, to enable this feature in YARN side
<code>yarn.nodemanager.log-aggregation.roll-monitoring-interval-seconds</code> should be
configured in yarn-site.xml.
This feature can only be used with Hadoop 2.6.4+. The Spark log4j appender needs be changed to use
configured in yarn-site.xml. The Spark log4j appender needs to be changed to use
FileAppender or another appender that can handle the files being removed while it is running. Based
on the file name configured in the log4j configuration (like spark.log), the user should set the
regex (spark*) to include all the log files that need to be aggregated.
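To make the paragraph above more concrete, a hedged sketch follows (an editorial illustration, not part of this diff); the spark.yarn.rolledLog.includePattern property and the application class/jar names are assumptions drawn from Spark's YARN documentation, not something this change introduces.

    # Cluster side (yarn-site.xml), set by the YARN admin to enable rolling aggregation:
    #   yarn.nodemanager.log-aggregation.roll-monitoring-interval-seconds = 3600
    #
    # Application side: with log4j writing to a file named spark.log (e.g. via a
    # FileAppender, as described above), include the rolled files by pattern:
    spark-submit \
      --master yarn \
      --deploy-mode cluster \
      --conf spark.yarn.rolledLog.includePattern='spark*' \
      --class org.example.MyApp my-app.jar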