diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index e0c92a834b49f..b1ea4dadc355e 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -7,308 +7,8 @@ on: - '!branch-*.*' jobs: - # Build: build Spark and run the tests for specified modules. - build: - name: "Build modules: ${{ matrix.modules }} ${{ matrix.comment }} (JDK ${{ matrix.java }}, ${{ matrix.hadoop }}, ${{ matrix.hive }})" - # Ubuntu 20.04 is the latest LTS. The next LTS is 22.04. - runs-on: ubuntu-20.04 - strategy: - fail-fast: false - matrix: - java: - - 8 - hadoop: - - hadoop3.2 - hive: - - hive2.3 - # TODO(SPARK-32246): We don't test 'streaming-kinesis-asl' for now. - # Kinesis tests depends on external Amazon kinesis service. - # Note that the modules below are from sparktestsupport/modules.py. - modules: - - >- - core, unsafe, kvstore, avro, - network-common, network-shuffle, repl, launcher, - examples, sketch, graphx - - >- - catalyst, hive-thriftserver - - >- - streaming, sql-kafka-0-10, streaming-kafka-0-10, - mllib-local, mllib, - yarn, mesos, kubernetes, hadoop-cloud, spark-ganglia-lgpl - # Here, we split Hive and SQL tests into some of slow ones and the rest of them. - included-tags: [""] - excluded-tags: [""] - comment: [""] - include: - # Hive tests - - modules: hive - java: 8 - hadoop: hadoop3.2 - hive: hive2.3 - included-tags: org.apache.spark.tags.SlowHiveTest - comment: "- slow tests" - - modules: hive - java: 8 - hadoop: hadoop3.2 - hive: hive2.3 - excluded-tags: org.apache.spark.tags.SlowHiveTest - comment: "- other tests" - # SQL tests - - modules: sql - java: 8 - hadoop: hadoop3.2 - hive: hive2.3 - included-tags: org.apache.spark.tags.ExtendedSQLTest - comment: "- slow tests" - - modules: sql - java: 8 - hadoop: hadoop3.2 - hive: hive2.3 - excluded-tags: org.apache.spark.tags.ExtendedSQLTest - comment: "- other tests" - env: - MODULES_TO_TEST: ${{ matrix.modules }} - EXCLUDED_TAGS: ${{ matrix.excluded-tags }} - INCLUDED_TAGS: ${{ matrix.included-tags }} - HADOOP_PROFILE: ${{ matrix.hadoop }} - HIVE_PROFILE: ${{ matrix.hive }} - GITHUB_PREV_SHA: ${{ github.event.before }} - SPARK_LOCAL_IP: localhost - steps: - - name: Checkout Spark repository - uses: actions/checkout@v2 - # In order to fetch changed files - with: - fetch-depth: 0 - repository: apache/spark - ref: master - - name: Sync the current branch with the latest in Apache Spark - if: github.repository != 'apache/spark' - id: sync-branch - run: | - apache_spark_ref=`git rev-parse HEAD` - git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" - echo "::set-output name=APACHE_SPARK_REF::$apache_spark_ref" - # Cache local repositories. Note that GitHub Actions cache has a 2G limit. - - name: Cache Scala, SBT and Maven - uses: actions/cache@v2 - with: - path: | - build/apache-maven-* - build/scala-* - build/*.jar - ~/.sbt - key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} - restore-keys: | - build- - - name: Cache Coursier local repository - uses: actions/cache@v2 - with: - path: ~/.cache/coursier - key: ${{ matrix.java }}-${{ matrix.hadoop }}-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} - restore-keys: | - ${{ matrix.java }}-${{ matrix.hadoop }}-coursier- - - name: Install Java ${{ matrix.java }} - uses: actions/setup-java@v1 - with: - java-version: ${{ matrix.java }} - - name: Install Python 3.8 - uses: actions/setup-python@v2 - # We should install one Python that is higher then 3+ for SQL and Yarn because: - # - SQL component also has Python related tests, for example, IntegratedUDFTestUtils. - # - Yarn has a Python specific test too, for example, YarnClusterSuite. - if: contains(matrix.modules, 'yarn') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) - with: - python-version: 3.8 - architecture: x64 - - name: Install Python packages (Python 3.8) - if: (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) - run: | - python3.8 -m pip install numpy 'pyarrow<5.0.0' pandas scipy xmlrunner - python3.8 -m pip list - # Run the tests. - - name: Run tests - run: | - export APACHE_SPARK_REF=${{ steps.sync-branch.outputs.APACHE_SPARK_REF }} - # Hive and SQL tests become flaky when running in parallel as it's too intensive. - if [[ "$MODULES_TO_TEST" == "hive" ]] || [[ "$MODULES_TO_TEST" == "sql" ]]; then export SERIAL_SBT_TESTS=1; fi - ./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS" - - name: Upload test results to report - if: always() - uses: actions/upload-artifact@v2 - with: - name: test-results-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }} - path: "**/target/test-reports/*.xml" - - name: Upload unit tests log files - if: failure() - uses: actions/upload-artifact@v2 - with: - name: unit-tests-log-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }} - path: "**/target/unit-tests.log" - - pyspark: - name: "Build modules: ${{ matrix.modules }}" - runs-on: ubuntu-20.04 - container: - image: dongjoon/apache-spark-github-action-image:20210602 - strategy: - fail-fast: false - matrix: - modules: - - >- - pyspark-sql, pyspark-mllib, pyspark-resource - - >- - pyspark-core, pyspark-streaming, pyspark-ml - - >- - pyspark-pandas - - >- - pyspark-pandas-slow - env: - MODULES_TO_TEST: ${{ matrix.modules }} - HADOOP_PROFILE: hadoop3.2 - HIVE_PROFILE: hive2.3 - GITHUB_PREV_SHA: ${{ github.event.before }} - SPARK_LOCAL_IP: localhost - steps: - - name: Checkout Spark repository - uses: actions/checkout@v2 - # In order to fetch changed files - with: - fetch-depth: 0 - repository: apache/spark - ref: master - - name: Sync the current branch with the latest in Apache Spark - if: github.repository != 'apache/spark' - id: sync-branch - run: | - apache_spark_ref=`git rev-parse HEAD` - git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" - echo "::set-output name=APACHE_SPARK_REF::$apache_spark_ref" - # Cache local repositories. Note that GitHub Actions cache has a 2G limit. - - name: Cache Scala, SBT and Maven - uses: actions/cache@v2 - with: - path: | - build/apache-maven-* - build/scala-* - build/*.jar - ~/.sbt - key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} - restore-keys: | - build- - - name: Cache Coursier local repository - uses: actions/cache@v2 - with: - path: ~/.cache/coursier - key: pyspark-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} - restore-keys: | - pyspark-coursier- - - name: Install Python 3.6 - uses: actions/setup-python@v2 - with: - python-version: 3.6 - architecture: x64 - # This step takes much less time (~30s) than other Python versions so it is not included - # in the Docker image being used. There is also a technical issue to install Python 3.6 on - # Ubuntu 20.04. See also SPARK-33162. - - name: Install Python packages (Python 3.6) - run: | - python3.6 -m pip install numpy 'pyarrow<4.0.0' pandas scipy xmlrunner 'plotly>=4.8' - python3.6 -m pip list - - name: List Python packages (Python 3.9) - run: | - python3.9 -m pip list - - name: Install Conda for pip packaging test - run: | - curl -s https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh > miniconda.sh - bash miniconda.sh -b -p $HOME/miniconda - # Run the tests. - - name: Run tests - run: | - export APACHE_SPARK_REF=${{ steps.sync-branch.outputs.APACHE_SPARK_REF }} - export PATH=$PATH:$HOME/miniconda/bin - ./dev/run-tests --parallelism 2 --modules "$MODULES_TO_TEST" - - name: Upload test results to report - if: always() - uses: actions/upload-artifact@v2 - with: - name: test-results-${{ matrix.modules }}--8-hadoop3.2-hive2.3 - path: "**/target/test-reports/*.xml" - - name: Upload unit tests log files - if: failure() - uses: actions/upload-artifact@v2 - with: - name: unit-tests-log-${{ matrix.modules }}--8-hadoop3.2-hive2.3 - path: "**/target/unit-tests.log" - - sparkr: - name: "Build modules: sparkr" - runs-on: ubuntu-20.04 - container: - image: dongjoon/apache-spark-github-action-image:20210602 - env: - HADOOP_PROFILE: hadoop3.2 - HIVE_PROFILE: hive2.3 - GITHUB_PREV_SHA: ${{ github.event.before }} - SPARK_LOCAL_IP: localhost - steps: - - name: Checkout Spark repository - uses: actions/checkout@v2 - # In order to fetch changed files - with: - fetch-depth: 0 - repository: apache/spark - ref: master - - name: Sync the current branch with the latest in Apache Spark - if: github.repository != 'apache/spark' - id: sync-branch - run: | - apache_spark_ref=`git rev-parse HEAD` - git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" - echo "::set-output name=APACHE_SPARK_REF::$apache_spark_ref" - # Cache local repositories. Note that GitHub Actions cache has a 2G limit. - - name: Cache Scala, SBT and Maven - uses: actions/cache@v2 - with: - path: | - build/apache-maven-* - build/scala-* - build/*.jar - ~/.sbt - key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} - restore-keys: | - build- - - name: Cache Coursier local repository - uses: actions/cache@v2 - with: - path: ~/.cache/coursier - key: sparkr-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} - restore-keys: | - sparkr-coursier- - - name: Run tests - run: | - # The followings are also used by `r-lib/actions/setup-r` to avoid - # R issues at docker environment - export TZ=UTC - export _R_CHECK_SYSTEM_CLOCK_=FALSE - export APACHE_SPARK_REF=${{ steps.sync-branch.outputs.APACHE_SPARK_REF }} - ./dev/run-tests --parallelism 2 --modules sparkr - - name: Upload test results to report - if: always() - uses: actions/upload-artifact@v2 - with: - name: test-results-sparkr--8-hadoop3.2-hive2.3 - path: "**/target/test-reports/*.xml" - - # Static analysis, and documentation build lint: - name: Linters, licenses, dependencies and documentation generation + name: Documentation generation runs-on: ubuntu-20.04 env: LC_ALL: C.UTF-8 @@ -320,15 +20,6 @@ jobs: uses: actions/checkout@v2 with: fetch-depth: 0 - repository: apache/spark - ref: master - - name: Sync the current branch with the latest in Apache Spark - if: github.repository != 'apache/spark' - id: sync-branch - run: | - git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" # Cache local repositories. Note that GitHub Actions cache has a 2G limit. - name: Cache Scala, SBT and Maven uses: actions/cache@v2 @@ -360,23 +51,6 @@ jobs: with: python-version: 3.6 architecture: x64 - - name: Install Python linter dependencies - run: | - # TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes. - # See also https://github.com/sphinx-doc/sphinx/issues/7551. - # Jinja2 3.0.0+ causes error when building with Sphinx. - # See also https://issues.apache.org/jira/browse/SPARK-35375. - python3.6 -m pip install flake8 pydata_sphinx_theme 'mypy==0.910' numpydoc 'jinja2<3.0.0' 'black==21.5b2' - - name: Install R linter dependencies and SparkR - run: | - apt-get install -y libcurl4-openssl-dev libgit2-dev libssl-dev libxml2-dev - Rscript -e "install.packages(c('devtools'), repos='https://cloud.r-project.org/')" - Rscript -e "devtools::install_github('jimhester/lintr@v2.0.1')" - ./R/install-dev.sh - - name: Instll JavaScript linter dependencies - run: | - apt update - apt-get install -y nodejs npm - name: Install dependencies for documentation generation run: | # pandoc is required to generate PySpark APIs as well in nbsphinx. @@ -389,324 +63,20 @@ jobs: python3.6 -m pip install sphinx_plotly_directive 'pyarrow<5.0.0' pandas 'plotly>=4.8' apt-get update -y apt-get install -y ruby ruby-dev - Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'roxygen2'), repos='https://cloud.r-project.org/')" gem install bundler cd docs bundle install - - name: Scala linter - run: ./dev/lint-scala - - name: Java linter - run: ./dev/lint-java - - name: Python linter - run: ./dev/lint-python - - name: R linter - run: ./dev/lint-r - - name: JS linter - run: ./dev/lint-js - - name: License test - run: ./dev/check-license - - name: Dependencies test - run: ./dev/test-dependencies.sh - name: Run documentation build run: | cd docs - bundle exec jekyll build - - java-11-17: - name: Java ${{ matrix.java }} build with Maven - strategy: - fail-fast: false - matrix: - java: - - 11 - - 17-ea - runs-on: ubuntu-20.04 - steps: - - name: Checkout Spark repository - uses: actions/checkout@v2 - with: - fetch-depth: 0 - repository: apache/spark - ref: master - - name: Sync the current branch with the latest in Apache Spark - if: github.repository != 'apache/spark' - id: sync-branch - run: | - git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" - - name: Cache Scala, SBT and Maven - uses: actions/cache@v2 - with: - path: | - build/apache-maven-* - build/scala-* - build/*.jar - ~/.sbt - key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} - restore-keys: | - build- - - name: Cache Maven local repository - uses: actions/cache@v2 - with: - path: ~/.m2/repository - key: java${{ matrix.java }}-maven-${{ hashFiles('**/pom.xml') }} - restore-keys: | - java${{ matrix.java }}-maven- - - name: Install Java ${{ matrix.java }} - uses: actions/setup-java@v1 - with: - java-version: ${{ matrix.java }} - - name: Build with Maven - run: | - export MAVEN_OPTS="-Xss64m -Xmx2g -XX:ReservedCodeCacheSize=1g -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN" - export MAVEN_CLI_OPTS="--no-transfer-progress" - export JAVA_VERSION=${{ matrix.java }} - # It uses Maven's 'install' intentionally, see https://github.com/apache/spark/pull/26414. - ./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Djava.version=${JAVA_VERSION/-ea} install - rm -rf ~/.m2/repository/org/apache/spark - - scala-213: - name: Scala 2.13 build with SBT - runs-on: ubuntu-20.04 - steps: - - name: Checkout Spark repository - uses: actions/checkout@v2 - with: - fetch-depth: 0 - repository: apache/spark - ref: master - - name: Sync the current branch with the latest in Apache Spark - if: github.repository != 'apache/spark' - id: sync-branch - run: | - git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" - - name: Cache Scala, SBT and Maven - uses: actions/cache@v2 - with: - path: | - build/apache-maven-* - build/scala-* - build/*.jar - ~/.sbt - key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} - restore-keys: | - build- - - name: Cache Coursier local repository - uses: actions/cache@v2 - with: - path: ~/.cache/coursier - key: scala-213-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} - restore-keys: | - scala-213-coursier- - - name: Install Java 8 - uses: actions/setup-java@v1 - with: - java-version: 8 - - name: Build with SBT - run: | - ./dev/change-scala-version.sh 2.13 - ./build/sbt -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Pdocker-integration-tests -Pkubernetes-integration-tests -Pspark-ganglia-lgpl -Pscala-2.13 compile test:compile - - hadoop-2: - name: Hadoop 2 build with SBT - runs-on: ubuntu-20.04 - steps: - - name: Checkout Spark repository - uses: actions/checkout@v2 - with: - fetch-depth: 0 - repository: apache/spark - ref: master - - name: Sync the current branch with the latest in Apache Spark - if: github.repository != 'apache/spark' - id: sync-branch - run: | - git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" - - name: Cache Scala, SBT and Maven - uses: actions/cache@v2 - with: - path: | - build/apache-maven-* - build/scala-* - build/*.jar - ~/.sbt - key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} - restore-keys: | - build- - - name: Cache Coursier local repository - uses: actions/cache@v2 - with: - path: ~/.cache/coursier - key: hadoop-2-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} - restore-keys: | - hadoop-2-coursier- - - name: Install Java 8 - uses: actions/setup-java@v1 - with: - java-version: 8 - - name: Build with SBT - run: | - ./build/sbt -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Phadoop-2.7 compile test:compile - - tpcds-1g: - name: Run TPC-DS queries with SF=1 - runs-on: ubuntu-20.04 - env: - SPARK_LOCAL_IP: localhost - steps: - - name: Checkout Spark repository - uses: actions/checkout@v2 - with: - fetch-depth: 0 - repository: apache/spark - ref: master - - name: Sync the current branch with the latest in Apache Spark - if: github.repository != 'apache/spark' - id: sync-branch - run: | - git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" - - name: Cache Scala, SBT and Maven - uses: actions/cache@v2 - with: - path: | - build/apache-maven-* - build/scala-* - build/*.jar - ~/.sbt - key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} - restore-keys: | - build- - - name: Cache Coursier local repository - uses: actions/cache@v2 - with: - path: ~/.cache/coursier - key: tpcds-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} - restore-keys: | - tpcds-coursier- - - name: Install Java 8 - uses: actions/setup-java@v1 - with: - java-version: 8 - - name: Cache TPC-DS generated data - id: cache-tpcds-sf-1 - uses: actions/cache@v2 - with: - path: ./tpcds-sf-1 - key: tpcds-${{ hashFiles('.github/workflows/build_and_test.yml', 'sql/core/src/test/scala/org/apache/spark/sql/TPCDSSchema.scala') }} - - name: Checkout tpcds-kit repository - if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true' - uses: actions/checkout@v2 - with: - repository: databricks/tpcds-kit - ref: 2a5078a782192ddb6efbcead8de9973d6ab4f069 - path: ./tpcds-kit - - name: Build tpcds-kit - if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true' - run: cd tpcds-kit/tools && make OS=LINUX - - name: Generate TPC-DS (SF=1) table data - if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true' - run: build/sbt "sql/test:runMain org.apache.spark.sql.GenTPCDSData --dsdgenDir `pwd`/tpcds-kit/tools --location `pwd`/tpcds-sf-1 --scaleFactor 1 --numPartitions 1 --overwrite" - - name: Run TPC-DS queries - run: | - SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSQueryTestSuite" - - name: Upload test results to report - if: always() - uses: actions/upload-artifact@v2 - with: - name: test-results-tpcds--8-hadoop3.2-hive2.3 - path: "**/target/test-reports/*.xml" - - name: Upload unit tests log files - if: failure() - uses: actions/upload-artifact@v2 - with: - name: unit-tests-log-tpcds--8-hadoop3.2-hive2.3 - path: "**/target/unit-tests.log" - - docker-integration-tests: - name: Run docker integration tests - runs-on: ubuntu-20.04 - env: - HADOOP_PROFILE: hadoop3.2 - HIVE_PROFILE: hive2.3 - GITHUB_PREV_SHA: ${{ github.event.before }} - SPARK_LOCAL_IP: localhost - ORACLE_DOCKER_IMAGE_NAME: oracle/database:18.4.0-xe - steps: - - name: Checkout Spark repository - uses: actions/checkout@v2 - with: - fetch-depth: 0 - repository: apache/spark - ref: master - - name: Sync the current branch with the latest in Apache Spark - if: github.repository != 'apache/spark' - id: sync-branch - run: | - apache_spark_ref=`git rev-parse HEAD` - git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" - echo "::set-output name=APACHE_SPARK_REF::$apache_spark_ref" - - name: Cache Scala, SBT and Maven - uses: actions/cache@v2 - with: - path: | - build/apache-maven-* - build/scala-* - build/*.jar - ~/.sbt - key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} - restore-keys: | - build- - - name: Cache Coursier local repository - uses: actions/cache@v2 - with: - path: ~/.cache/coursier - key: docker-integration-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} - restore-keys: | - docker-integration-coursier- - - name: Install Java 8 - uses: actions/setup-java@v1 - with: - java-version: 8 - - name: Cache Oracle docker-images repository - id: cache-oracle-docker-images - uses: actions/cache@v2 - with: - path: ./oracle/docker-images - # key should contains the commit hash of the Oracle docker images to be checkout. - key: oracle-docker-images-3f422c4a35b423dfcdbcc57a84f01db6c82eb6c1 - - name: Checkout Oracle docker-images repository - uses: actions/checkout@v2 - with: - fetch-depth: 0 - repository: oracle/docker-images - ref: 3f422c4a35b423dfcdbcc57a84f01db6c82eb6c1 - path: ./oracle/docker-images - - name: Install Oracle Docker image - run: | - cd oracle/docker-images/OracleDatabase/SingleInstance/dockerfiles - ./buildContainerImage.sh -v 18.4.0 -x - - name: Run tests - run: | - export APACHE_SPARK_REF=${{ steps.sync-branch.outputs.APACHE_SPARK_REF }} - ./dev/run-tests --parallelism 2 --modules docker-integration-tests --included-tags org.apache.spark.tags.DockerTest - - name: Upload test results to report - if: always() - uses: actions/upload-artifact@v2 - with: - name: test-results-docker-integration--8-hadoop3.2-hive2.3 - path: "**/target/test-reports/*.xml" - - name: Upload unit tests log files - if: failure() - uses: actions/upload-artifact@v2 - with: - name: unit-tests-log-docker-integration--8-hadoop3.2-hive2.3 - path: "**/target/unit-tests.log" + SKIP_SCALADOC=1 SKIP_RDOC=1 SKIP_SQLDOC=1 bundle exec jekyll build + cd .. + cp -r docs/_site docs.bak + git checkout host-docs + rm -fr docs + mv docs.bak docs + touch docs/.nojekyll + git add -f docs/ + git add -f docs/.* # make sure adding .nojekyll + git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Update documentation" + git push origin host-docs diff --git a/docs/img/SparkComponents.png b/docs/img/SparkComponents.png new file mode 100644 index 0000000000000..bc4f1e02e55b8 Binary files /dev/null and b/docs/img/SparkComponents.png differ diff --git a/python/docs/source/getting_started/index.rst b/python/docs/source/getting_started/index.rst index a216e3bbe3b85..820e024cc214c 100644 --- a/python/docs/source/getting_started/index.rst +++ b/python/docs/source/getting_started/index.rst @@ -29,6 +29,8 @@ There are live notebooks where you can try PySpark out without any other step: * |binder_df|_ * |binder_ps|_ +* binder for streaming possibly +* binder for ML The list below is the contents of this quickstart page: diff --git a/python/docs/source/index.rst b/python/docs/source/index.rst index 7f650b79a1ad5..6d5405d015bc2 100644 --- a/python/docs/source/index.rst +++ b/python/docs/source/index.rst @@ -29,37 +29,48 @@ interactively analyzing your data in a distributed environment. PySpark supports of Spark's features such as Spark SQL, DataFrame, Streaming, MLlib (Machine Learning) and Spark Core. -.. image:: ../../../docs/img/pyspark-components.png +.. .. image:: ../../../docs/img/pyspark-components.png +.. image:: ../../../docs/img/SparkComponents.png :alt: PySpark Components -**Spark SQL and DataFrame** +***************** +High-level APIs +***************** -Spark SQL is a Spark module for structured data processing. It provides -a programming abstraction called DataFrame and can also act as distributed -SQL query engine. - -**pandas API on Spark** - -pandas API on Spark allows you to scale your pandas workload out. -With this package, you can: - -* Be immediately productive with Spark, with no learning curve, if you are already familiar with pandas. -* Have a single codebase that works both with pandas (tests, smaller datasets) and with Spark (distributed datasets). -* Switch to pandas API and PySpark API contexts easily without any overhead. - -**Streaming** +`Streaming `_ Running on top of Spark, the streaming feature in Apache Spark enables powerful interactive and analytical applications across both streaming and historical data, while inheriting Spark's ease of use and fault tolerance characteristics. -**MLlib** +`Machine Learning (MLlib) `_ Built on top of Spark, MLlib is a scalable machine learning library that provides a uniform set of high-level APIs that help users create and tune practical machine learning pipelines. -**Spark Core** +******************* +Data APIs +******************* + +`Spark SQL and DataFrame `_ + +Spark SQL is a Spark module, similar to SQL, that can be used for structured data processing. It provides +a programming abstraction called DataFrame and can also act as distributed +SQL query engine. + +`pandas API on Spark `_ + +The pandas API on Spark allows you to scale your pandas workload out. +With this package, you can: + +* Be immediately productive with Spark, with no learning curve, if you are already familiar with pandas. +* Have a single codebase that works both with pandas (tests, smaller datasets) and with Spark (distributed datasets). +* Switch to pandas API and PySpark API contexts easily without any overhead. + +***************** +Spark Core + Spark SQL Engine +***************** Spark Core is the underlying general execution engine for the Spark platform that all other functionality is built on top of. It provides an RDD (Resilient Distributed Dataset) diff --git a/python/docs/source/user_guide/index.rst b/python/docs/source/user_guide/index.rst index 5cc8bc3d38e0e..e04dbcc0fd35b 100644 --- a/python/docs/source/user_guide/index.rst +++ b/python/docs/source/user_guide/index.rst @@ -16,25 +16,65 @@ under the License. +.. ========== +.. PySpark User Guide +.. ========== + +.. The PySpark specific user guide is as follows: + +.. toctree:: +.. :maxdepth: 2 + +.. python_packaging +.. sql/index +.. pandas_on_spark/index + +============= +PySpark User Guide +============= + +Installation +============ + +- Installing packages: `Python package management `_ + + - `Using the PySpark Native Features `_ + - `Using Conda `_ + - `Using Virtualenv `_ + - `Using PEX `_ + +Data APIs +================ + +- `PySpark DataFrame <../getting_started/quickstart_df.html>`_ +- `Executing SQL in PySpark <../getting_started/quickstart_df.html#Working-with-SQL>`_ +- `The pandas API on PySpark <../getting_started/quickstart_ps.html>`_ + +High-level APIs +=============== + +- `Streaming in PySpark `_ +- `Machine Learning in PySpark `_ + + +The Spark internals +=============== + +- `Spark Core + SQL Engine `_ + ========== -User Guide +Spark User Guide (All languages) ========== +The following guides shared with other languages in the Programming Guides section +at `the Spark documentation `_ as listed below: -There are basic guides shared with other languages in Programming Guides -at `the Spark documentation `_ as below: - -- `RDD Programming Guide `_ - `Spark SQL, DataFrames and Datasets Guide `_ - `Structured Streaming Programming Guide `_ -- `Spark Streaming Programming Guide `_ - `Machine Learning Library (MLlib) Guide `_ +- `pandas API on Spark Guide `_ -PySpark specific user guide is as follows: - -.. toctree:: - :maxdepth: 2 +The following APIs are deprecated and no longer recommended for use: - python_packaging - sql/index - pandas_on_spark/index +- `RDD Programming Guide `_ +- `Spark Streaming Programming Guide `_