diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index e505be7d4d98..2610f50a6244 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -64,24 +64,13 @@ jobs:
         with:
           fetch-depth: 0
           repository: apache/spark
-          ref: ${{ inputs.branch }}
-      - name: Sync the current branch with the latest in Apache Spark
-        if: github.repository != 'apache/spark'
-        run: |
-          echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
-          git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
-          git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
-          git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty
+          ref: branch-3.4
       - name: Check all modules
         id: set-outputs
         run: |
           if [ -z "${{ inputs.jobs }}" ]; then
             pyspark=true; sparkr=true; tpcds=true; docker=true;
             pyspark_modules=`cd dev && python -c "import sparktestsupport.modules as m; print(','.join(m.name for m in m.all_modules if m.name.startswith('pyspark')))"`
-            pyspark=`./dev/is-changed.py -m $pyspark_modules`
-            sparkr=`./dev/is-changed.py -m sparkr`
-            tpcds=`./dev/is-changed.py -m sql`
-            docker=`./dev/is-changed.py -m docker-integration-tests`
             # 'build' and 'maven-build' are always true for now.
             # It does not save significant time and most of PRs trigger the build.
             precondition="
@@ -113,175 +102,10 @@ jobs:
         run: |
           # Convert to lowercase to meet Docker repo name requirement
           REPO_OWNER=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]')
-          IMG_NAME="apache-spark-ci-image:${{ inputs.branch }}-${{ github.run_id }}"
+          IMG_NAME="apache-spark-ci-image:branch-3.4-${{ github.run_id }}"
           IMG_URL="ghcr.io/$REPO_OWNER/$IMG_NAME"
           echo "image_url=$IMG_URL" >> $GITHUB_OUTPUT
-  # Build: build Spark and run the tests for specified modules.
-  build:
-    name: "Build modules: ${{ matrix.modules }} ${{ matrix.comment }}"
-    needs: precondition
-    if: fromJson(needs.precondition.outputs.required).build == 'true'
-    runs-on: ubuntu-latest
-    timeout-minutes: 300
-    strategy:
-      fail-fast: false
-      matrix:
-        java:
-          - ${{ inputs.java }}
-        hadoop:
-          - ${{ inputs.hadoop }}
-        hive:
-          - hive2.3
-        # Note that the modules below are from sparktestsupport/modules.py.
-        modules:
-          - >-
-            core, unsafe, kvstore, avro, utils,
-            network-common, network-shuffle, repl, launcher,
-            examples, sketch, variant
-          - >-
-            api, catalyst, hive-thriftserver
-          - >-
-            mllib-local, mllib, graphx
-          - >-
-            streaming, sql-kafka-0-10, streaming-kafka-0-10, streaming-kinesis-asl,
-            kubernetes, hadoop-cloud, spark-ganglia-lgpl, protobuf
-          - >-
-            yarn, connect
-        # Here, we split Hive and SQL tests into some of slow ones and the rest of them.
-        included-tags: [""]
-        excluded-tags: [""]
-        comment: [""]
-      include:
-        # Hive tests
-        - modules: hive
-          java: ${{ inputs.java }}
-          hadoop: ${{ inputs.hadoop }}
-          hive: hive2.3
-          included-tags: org.apache.spark.tags.SlowHiveTest
-          comment: "- slow tests"
-        - modules: hive
-          java: ${{ inputs.java }}
-          hadoop: ${{ inputs.hadoop }}
-          hive: hive2.3
-          excluded-tags: org.apache.spark.tags.SlowHiveTest
-          comment: "- other tests"
-        # SQL tests
-        - modules: sql
-          java: ${{ inputs.java }}
-          hadoop: ${{ inputs.hadoop }}
-          hive: hive2.3
-          included-tags: org.apache.spark.tags.ExtendedSQLTest
-          comment: "- extended tests"
-        - modules: sql
-          java: ${{ inputs.java }}
-          hadoop: ${{ inputs.hadoop }}
-          hive: hive2.3
-          included-tags: org.apache.spark.tags.SlowSQLTest
-          comment: "- slow tests"
-        - modules: sql
-          java: ${{ inputs.java }}
-          hadoop: ${{ inputs.hadoop }}
-          hive: hive2.3
-          excluded-tags: org.apache.spark.tags.ExtendedSQLTest,org.apache.spark.tags.SlowSQLTest
-          comment: "- other tests"
-    env:
-      MODULES_TO_TEST: ${{ matrix.modules }}
-      EXCLUDED_TAGS: ${{ matrix.excluded-tags }}
-      INCLUDED_TAGS: ${{ matrix.included-tags }}
-      HADOOP_PROFILE: ${{ matrix.hadoop }}
-      HIVE_PROFILE: ${{ matrix.hive }}
-      GITHUB_PREV_SHA: ${{ github.event.before }}
-      SPARK_LOCAL_IP: localhost
-      SKIP_UNIDOC: true
-      SKIP_MIMA: true
-      SKIP_PACKAGING: true
-    steps:
-      - name: Checkout Spark repository
-        uses: actions/checkout@v4
-        # In order to fetch changed files
-        with:
-          fetch-depth: 0
-          repository: apache/spark
-          ref: ${{ inputs.branch }}
-      - name: Sync the current branch with the latest in Apache Spark
-        if: github.repository != 'apache/spark'
-        run: |
-          echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
-          git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
-          git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
-          git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty
-      # Cache local repositories. Note that GitHub Actions cache has a 10G limit.
-      - name: Cache Scala, SBT and Maven
-        uses: actions/cache@v4
-        with:
-          path: |
-            build/apache-maven-*
-            build/scala-*
-            build/*.jar
-            ~/.sbt
-          key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
-          restore-keys: |
-            build-
-      - name: Cache Coursier local repository
-        uses: actions/cache@v4
-        with:
-          path: ~/.cache/coursier
-          key: ${{ matrix.java }}-${{ matrix.hadoop }}-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
-          restore-keys: |
-            ${{ matrix.java }}-${{ matrix.hadoop }}-coursier-
-      - name: Free up disk space
-        run: |
-          if [ -f ./dev/free_disk_space ]; then
-            ./dev/free_disk_space
-          fi
-      - name: Install Java ${{ matrix.java }}
-        uses: actions/setup-java@v4
-        with:
-          distribution: zulu
-          java-version: ${{ matrix.java }}
-      - name: Install Python 3.9
-        uses: actions/setup-python@v5
-        # We should install one Python that is higher than 3+ for SQL and Yarn because:
-        # - SQL component also has Python related tests, for example, IntegratedUDFTestUtils.
-        # - Yarn has a Python specific test too, for example, YarnClusterSuite.
-        if: contains(matrix.modules, 'yarn') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) || contains(matrix.modules, 'connect')
-        with:
-          python-version: '3.9'
-          architecture: x64
-      - name: Install Python packages (Python 3.9)
-        if: (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) || contains(matrix.modules, 'connect')
-        run: |
-          python3.9 -m pip install 'numpy>=1.20.0' pyarrow pandas scipy unittest-xml-reporting 'lxml==4.9.4' 'grpcio==1.62.0' 'grpcio-status==1.62.0' 'protobuf==4.25.1'
-          python3.9 -m pip list
-      # Run the tests.
-      - name: Run tests
-        env: ${{ fromJSON(inputs.envs) }}
-        shell: 'script -q -e -c "bash {0}"'
-        run: |
-          # Fix for TTY related issues when launching the Ammonite REPL in tests.
-          export TERM=vt100
-          # Hive "other tests" test needs larger metaspace size based on experiment.
-          if [[ "$MODULES_TO_TEST" == "hive" ]] && [[ "$EXCLUDED_TAGS" == "org.apache.spark.tags.SlowHiveTest" ]]; then export METASPACE_SIZE=2g; fi
-          # SPARK-46283: should delete the following env replacement after SPARK 3.x EOL
-          if [[ "$MODULES_TO_TEST" == *"streaming-kinesis-asl"* ]] && [[ "${{ inputs.branch }}" =~ ^branch-3 ]]; then
-            MODULES_TO_TEST=${MODULES_TO_TEST//streaming-kinesis-asl, /}
-          fi
-          export SERIAL_SBT_TESTS=1
-          ./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS"
-      - name: Upload test results to report
-        if: always()
-        uses: actions/upload-artifact@v4
-        with:
-          name: test-results-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
-          path: "**/target/test-reports/*.xml"
-      - name: Upload unit tests log files
-        if: ${{ !success() }}
-        uses: actions/upload-artifact@v4
-        with:
-          name: unit-tests-log-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
-          path: "**/target/unit-tests.log"
-
   infra-image:
     name: "Base image build"
     needs: precondition
@@ -305,14 +129,7 @@ jobs:
         with:
           fetch-depth: 0
           repository: apache/spark
-          ref: ${{ inputs.branch }}
-      - name: Sync the current branch with the latest in Apache Spark
-        if: github.repository != 'apache/spark'
-        run: |
-          echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
-          git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
-          git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
-          git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty
+          ref: branch-3.4
       - name: Set up QEMU
         uses: docker/setup-qemu-action@v3
       - name: Set up Docker Buildx
@@ -326,7 +143,7 @@ jobs:
           tags: |
             ${{ needs.precondition.outputs.image_url }}
           # Use the infra image cache to speed up
-          cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-cache:${{ inputs.branch }}
+          cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-cache:branch-3.4
 
   pyspark:
     needs: [precondition, infra-image]
@@ -341,12 +158,16 @@ jobs:
       fail-fast: false
       matrix:
         java:
-          - ${{ inputs.java }}
+          - 8
+        branch:
+          - branch-3.4
         modules:
          - >-
            pyspark-sql, pyspark-resource, pyspark-testing
          - >-
-            pyspark-core, pyspark-errors, pyspark-streaming
+            pyspark-core, pyspark-streaming
+          - >-
+            pyspark-errors
          - >-
            pyspark-mllib, pyspark-ml, pyspark-ml-connect
          - >-
@@ -365,7 +186,7 @@ jobs:
            pyspark-pandas-connect-part3
     env:
       MODULES_TO_TEST: ${{ matrix.modules }}
-      PYTHON_TO_TEST: 'python3.9'
+      PYTHON_TO_TEST: ''
       HADOOP_PROFILE: ${{ inputs.hadoop }}
       HIVE_PROFILE: hive2.3
       GITHUB_PREV_SHA: ${{ github.event.before }}
@@ -382,17 +203,10 @@ jobs:
         with:
           fetch-depth: 0
           repository: apache/spark
-          ref: ${{ inputs.branch }}
+          ref: ${{ matrix.branch }}
       - name: Add GITHUB_WORKSPACE to git trust safe.directory
         run: |
           git config --global --add safe.directory ${GITHUB_WORKSPACE}
-      - name: Sync the current branch with the latest in Apache Spark
-        if: github.repository != 'apache/spark'
-        run: |
-          echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
-          git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
-          git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
-          git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty
       # Cache local repositories. Note that GitHub Actions cache has a 10G limit.
       - name: Cache Scala, SBT and Maven
         uses: actions/cache@v4
@@ -438,20 +252,34 @@ jobs:
           curl -s https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh > miniconda.sh
           bash miniconda.sh -b -p $HOME/miniconda
           rm miniconda.sh
+      - name: Install Python test dependencies for branch-3.4
+        if: matrix.branch == 'branch-3.4'
+        run: |
+          python3.9 -m pip install 'numpy==1.23.5' 'pandas<=1.5.3' 'pyarrow==12.0.1' 'matplotlib==3.8.3' 'torch==2.2.1' 'torchvision==0.17.1' 'scikit-learn==1.1.*'
+      - name: Install Python test dependencies for branch-3.5
+        if: matrix.branch == 'branch-3.5'
+        run: |
+          python3.9 -m pip install 'numpy==1.25.1' 'pandas<=2.0.3' 'pyarrow==12.0.1' 'matplotlib==3.7.2' 'torch==2.0.1' 'torchvision==0.15.2' 'scikit-learn==1.1.*'
       # Run the tests.
       - name: Run tests
         env: ${{ fromJSON(inputs.envs) }}
         shell: 'script -q -e -c "bash {0}"'
         run: |
+          python3.9 -m pip list
+          pypy3 -m pip list
+          export SCALA_PROFILE="scala2.13"
+          unset GITHUB_ACTIONS
           if [[ "$MODULES_TO_TEST" == *"pyspark-errors"* ]]; then
             export PATH=$PATH:$HOME/miniconda/bin
             export SKIP_PACKAGING=false
             echo "Python Packaging Tests Enabled!"
           fi
          if [ ! -z "$PYTHON_TO_TEST" ]; then
+            echo "1"
            ./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST" --python-executables "$PYTHON_TO_TEST"
          else
            # For branch-3.5 and below, it uses the default Python versions.
+            echo "2"
            ./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST"
          fi
       - name: Upload coverage to Codecov
@@ -476,85 +304,6 @@ jobs:
           name: unit-tests-log-${{ matrix.modules }}--${{ matrix.java }}-${{ inputs.hadoop }}-hive2.3-${{ env.PYTHON_TO_TEST }}
           path: "**/target/unit-tests.log"
-  sparkr:
-    needs: [precondition, infra-image]
-    # always run if sparkr == 'true', even infra-image is skip (such as non-master job)
-    if: (!cancelled()) && fromJson(needs.precondition.outputs.required).sparkr == 'true'
-    name: "Build modules: sparkr"
-    runs-on: ubuntu-latest
-    timeout-minutes: 300
-    container:
-      image: ${{ needs.precondition.outputs.image_url }}
-    env:
-      HADOOP_PROFILE: ${{ inputs.hadoop }}
-      HIVE_PROFILE: hive2.3
-      GITHUB_PREV_SHA: ${{ github.event.before }}
-      SPARK_LOCAL_IP: localhost
-      SKIP_UNIDOC: true
-      SKIP_MIMA: true
-      SKIP_PACKAGING: true
-    steps:
-      - name: Checkout Spark repository
-        uses: actions/checkout@v4
-        # In order to fetch changed files
-        with:
-          fetch-depth: 0
-          repository: apache/spark
-          ref: ${{ inputs.branch }}
-      - name: Add GITHUB_WORKSPACE to git trust safe.directory
-        run: |
-          git config --global --add safe.directory ${GITHUB_WORKSPACE}
-      - name: Sync the current branch with the latest in Apache Spark
-        if: github.repository != 'apache/spark'
-        run: |
-          echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
-          git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
-          git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
-          git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty
-      # Cache local repositories. Note that GitHub Actions cache has a 10G limit.
-      - name: Cache Scala, SBT and Maven
-        uses: actions/cache@v4
-        with:
-          path: |
-            build/apache-maven-*
-            build/scala-*
-            build/*.jar
-            ~/.sbt
-          key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
-          restore-keys: |
-            build-
-      - name: Cache Coursier local repository
-        uses: actions/cache@v4
-        with:
-          path: ~/.cache/coursier
-          key: sparkr-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
-          restore-keys: |
-            sparkr-coursier-
-      - name: Free up disk space
-        run: |
-          if [ -f ./dev/free_disk_space_container ]; then
-            ./dev/free_disk_space_container
-          fi
-      - name: Install Java ${{ inputs.java }}
-        uses: actions/setup-java@v4
-        with:
-          distribution: zulu
-          java-version: ${{ inputs.java }}
-      - name: Run tests
-        env: ${{ fromJSON(inputs.envs) }}
-        run: |
-          # The followings are also used by `r-lib/actions/setup-r` to avoid
-          # R issues at docker environment
-          export TZ=UTC
-          export _R_CHECK_SYSTEM_CLOCK_=FALSE
-          ./dev/run-tests --parallelism 1 --modules sparkr
-      - name: Upload test results to report
-        if: always()
-        uses: actions/upload-artifact@v4
-        with:
-          name: test-results-sparkr--${{ inputs.java }}-${{ inputs.hadoop }}-hive2.3
-          path: "**/target/test-reports/*.xml"
-
   buf:
     needs: [precondition]
     if: (!cancelled()) && fromJson(needs.precondition.outputs.required).buf == 'true'
@@ -620,17 +369,10 @@ jobs:
         with:
           fetch-depth: 0
           repository: apache/spark
-          ref: ${{ inputs.branch }}
+          ref: branch-3.4
       - name: Add GITHUB_WORKSPACE to git trust safe.directory
         run: |
           git config --global --add safe.directory ${GITHUB_WORKSPACE}
-      - name: Sync the current branch with the latest in Apache Spark
-        if: github.repository != 'apache/spark'
-        run: |
"APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV - git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty # Cache local repositories. Note that GitHub Actions cache has a 10G limit. - name: Cache Scala, SBT and Maven uses: actions/cache@v4 @@ -666,7 +408,7 @@ jobs: uses: actions/setup-java@v4 with: distribution: zulu - java-version: ${{ inputs.java }} + java-version: 8 - name: License test run: ./dev/check-license - name: Dependencies test @@ -680,31 +422,19 @@ jobs: - name: Spark connect jvm client mima check run: ./dev/connect-jvm-client-mima-check - name: Install Python linter dependencies for branch-3.4 - if: inputs.branch == 'branch-3.4' run: | # SPARK-44554: Copy from https://github.com/apache/spark/blob/a05c27e85829fe742c1828507a1fd180cdc84b54/.github/workflows/build_and_test.yml#L571-L578 # Should delete this section after SPARK 3.4 EOL. - python3.9 -m pip install 'flake8==3.9.0' pydata_sphinx_theme 'mypy==0.920' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' numpydoc 'jinja2<3.0.0' 'black==22.6.0' + python3.9 -m pip list + python3.9 -m pip install 'flake8==3.9.0' pydata-sphinx-theme 'mypy==0.920' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' 'numpy==1.25.1' 'pyarrow==12.0.1' numpydoc 'jinja2<3.0.0' 'black==22.6.0' 'pandas==1.5.3' 'matplotlib==3.7.2' python3.9 -m pip install 'pandas-stubs==1.2.0.53' ipython 'grpcio==1.48.1' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' - name: Install Python linter dependencies for branch-3.5 if: inputs.branch == 'branch-3.5' run: | # SPARK-45212: Copy from https://github.com/apache/spark/blob/555c8def51e5951c7bf5165a332795e9e330ec9d/.github/workflows/build_and_test.yml#L631-L638 # Should delete this section after SPARK 3.5 EOL. 
-          python3.9 -m pip install 'flake8==3.9.0' pydata_sphinx_theme 'mypy==0.982' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' numpydoc 'jinja2<3.0.0' 'black==22.6.0'
+          python3.9 -m pip install 'flake8==3.9.0' pydata-sphinx-theme 'mypy==0.982' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' 'numpy==1.25.1' 'pyarrow==12.0.1' numpydoc 'jinja2<3.0.0' 'black==22.6.0' 'pandas<=2.0.3' 'matplotlib==3.7.2' 'torch==2.0.1' 'torchvision==0.15.2'
           python3.9 -m pip install 'pandas-stubs==1.2.0.53' ipython 'grpcio==1.56.0' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0'
-      - name: Install Python dependencies for python linter and documentation generation
-        if: inputs.branch != 'branch-3.4' && inputs.branch != 'branch-3.5'
-        run: |
-          # Should unpin 'sphinxcontrib-*' after upgrading sphinx>5
-          # See 'ipython_genutils' in SPARK-38517
-          # See 'docutils<0.18.0' in SPARK-39421
-          python3.9 -m pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe 'pyzmq<24.0.0' \
-            ipython ipython_genutils sphinx_plotly_directive 'numpy>=1.20.0' pyarrow pandas 'plotly>=4.8' 'docutils<0.18.0' \
-            'flake8==3.9.0' 'mypy==1.8.0' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' 'black==23.9.1' \
-            'pandas-stubs==1.2.0.53' 'grpcio==1.62.0' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' \
-            'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.1' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5'
-          python3.9 -m pip list
       - name: Python linter
         run: PYTHON_EXECUTABLE=python3.9 ./dev/lint-python
       # Should delete this section after SPARK 3.5 EOL.
@@ -723,7 +453,6 @@ jobs:
         run: if test -f ./dev/connect-check-protos.py; then PATH=$PATH:$HOME/buf/bin PYTHON_EXECUTABLE=python3.9 ./dev/connect-check-protos.py; fi
       # Should delete this section after SPARK 3.5 EOL.
       - name: Install JavaScript linter dependencies for branch-3.4, branch-3.5
-        if: inputs.branch == 'branch-3.4' || inputs.branch == 'branch-3.5'
         run: |
           apt update
           apt-get install -y nodejs npm
@@ -731,7 +460,6 @@ jobs:
         run: ./dev/lint-js
       # Should delete this section after SPARK 3.5 EOL.
       - name: Install R linter dependencies for branch-3.4, branch-3.5
-        if: inputs.branch == 'branch-3.4' || inputs.branch == 'branch-3.5'
         run: |
           apt update
           apt-get install -y libcurl4-openssl-dev libgit2-dev libssl-dev libxml2-dev \
@@ -743,7 +471,6 @@ jobs:
         run: ./R/install-dev.sh
       # Should delete this section after SPARK 3.5 EOL.
       - name: Install dependencies for documentation generation for branch-3.4, branch-3.5
-        if: inputs.branch == 'branch-3.4' || inputs.branch == 'branch-3.5'
         run: |
           # pandoc is required to generate PySpark APIs as well in nbsphinx.
           apt-get update -y
@@ -753,10 +480,15 @@ jobs:
           Rscript -e "devtools::install_version('pkgdown', version='2.0.1', repos='https://cloud.r-project.org')"
           Rscript -e "devtools::install_version('preferably', version='0.4', repos='https://cloud.r-project.org')"
           # Should unpin 'sphinxcontrib-*' after upgrading sphinx>5
-          python3.9 -m pip install 'sphinx==4.5.0' mkdocs 'pydata_sphinx_theme>=0.13' sphinx-copybutton nbsphinx numpydoc jinja2 markupsafe 'pyzmq<24.0.0' 'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.1' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5'
+          python3.9 -m pip install 'sphinx==3.0.4' mkdocs 'docutils==0.17.1' 'pydata-sphinx-theme==0.8.0' 'alabaster==0.7.12' sphinx-copybutton 'nbsphinx==0.9.2' 'numpydoc==1.4.0' 'jinja2==2.11.3' 'markupsafe==2.0.1' 'pyzmq<24.0.0' 'sphinxcontrib-applehelp==1.0.4' 'sphinxcontrib-devhelp==1.0.2' 'sphinxcontrib-htmlhelp==2.0.1' 'sphinxcontrib-qthelp==1.0.3' 'sphinxcontrib-serializinghtml==1.1.5'
           python3.9 -m pip install ipython_genutils # See SPARK-38517
-          python3.9 -m pip install sphinx_plotly_directive 'numpy>=1.20.0' pyarrow pandas 'plotly>=4.8'
-          python3.9 -m pip install 'docutils<0.18.0' # See SPARK-39421
+      - name: Install dependencies for documentation generation for branch-3.4
+        run: |
+          python3.9 -m pip install sphinx_plotly_directive 'numpy>=1.20.0' pyarrow 'pandas==1.5.3' 'plotly>=4.8'
+      - name: Install dependencies for documentation generation for branch-3.5
+        if: inputs.branch == 'branch-3.5'
+        run: |
+          python3.9 -m pip install sphinx_plotly_directive 'numpy>=1.20.0' pyarrow 'pandas==2.0.3' 'plotly>=4.8'
       - name: Install dependencies for documentation generation
         run: |
           gem install bundler -v 2.4.22
@@ -766,6 +498,7 @@ jobs:
         run: ./dev/lint-r
       - name: Run documentation build
         run: |
+          python3.9 -m pip list
           # Build docs first with SKIP_API to ensure they are buildable without requiring any
           # language docs to be built beforehand.
           cd docs; SKIP_API=1 bundle exec jekyll build; cd ..
@@ -788,67 +521,6 @@ jobs:
           path: site.tar.bz2
           retention-days: 1
-  maven-build:
-    needs: precondition
-    if: fromJson(needs.precondition.outputs.required).maven-build == 'true'
-    name: Java ${{ matrix.java }} build with Maven (${{ matrix.os }})
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - java: 17
-            os: ubuntu-latest
-          - java: 21
-            os: ubuntu-latest
-          - java: 21
-            os: macos-14
-    runs-on: ${{ matrix.os }}
-    timeout-minutes: 300
-    steps:
-      - name: Checkout Spark repository
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-          repository: apache/spark
-          ref: ${{ inputs.branch }}
-      - name: Sync the current branch with the latest in Apache Spark
-        if: github.repository != 'apache/spark'
-        run: |
-          git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
-          git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
-          git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty
-      - name: Cache Scala, SBT and Maven
-        uses: actions/cache@v4
-        with:
-          path: |
-            build/apache-maven-*
-            build/scala-*
-            build/*.jar
-            ~/.sbt
-          key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
-          restore-keys: |
-            build-
-      - name: Cache Maven local repository
-        uses: actions/cache@v4
-        with:
-          path: ~/.m2/repository
-          key: java${{ matrix.java }}-maven-${{ hashFiles('**/pom.xml') }}
-          restore-keys: |
-            java${{ matrix.java }}-maven-
-      - name: Install Java ${{ matrix.java }}
-        uses: actions/setup-java@v4
-        with:
-          distribution: zulu
-          java-version: ${{ matrix.java }}
-      - name: Build with Maven
-        run: |
-          export MAVEN_OPTS="-Xss64m -Xmx2g -XX:ReservedCodeCacheSize=1g -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN"
-          export MAVEN_CLI_OPTS="--no-transfer-progress"
-          export JAVA_VERSION=${{ matrix.java }}
-          # It uses Maven's 'install' intentionally, see https://github.com/apache/spark/pull/26414.
-          ./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Djava.version=${JAVA_VERSION/-ea} install
-          rm -rf ~/.m2/repository/org/apache/spark
-
   # Any TPC-DS related updates on this job need to be applied to tpcds-1g-gen job of benchmark.yml as well
   tpcds-1g:
     needs: precondition
@@ -949,164 +621,3 @@ jobs:
         with:
           name: unit-tests-log-tpcds--${{ inputs.java }}-${{ inputs.hadoop }}-hive2.3
           path: "**/target/unit-tests.log"
-
-  docker-integration-tests:
-    needs: precondition
-    if: fromJson(needs.precondition.outputs.required).docker-integration-tests == 'true'
-    name: Run Docker integration tests
-    runs-on: ubuntu-latest
-    timeout-minutes: 300
-    env:
-      HADOOP_PROFILE: ${{ inputs.hadoop }}
-      HIVE_PROFILE: hive2.3
-      GITHUB_PREV_SHA: ${{ github.event.before }}
-      SPARK_LOCAL_IP: localhost
-      ORACLE_DOCKER_IMAGE_NAME: gvenzl/oracle-free:23.3
-      SKIP_UNIDOC: true
-      SKIP_MIMA: true
-      SKIP_PACKAGING: true
-    steps:
-      - name: Checkout Spark repository
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-          repository: apache/spark
-          ref: ${{ inputs.branch }}
-      - name: Sync the current branch with the latest in Apache Spark
-        if: github.repository != 'apache/spark'
-        run: |
-          echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
-          git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
-          git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
-          git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty
-      - name: Cache Scala, SBT and Maven
-        uses: actions/cache@v4
-        with:
-          path: |
-            build/apache-maven-*
-            build/scala-*
-            build/*.jar
-            ~/.sbt
-          key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
-          restore-keys: |
-            build-
-      - name: Cache Coursier local repository
-        uses: actions/cache@v4
-        with:
-          path: ~/.cache/coursier
-          key: docker-integration-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
-          restore-keys: |
-            docker-integration-coursier-
-      - name: Install Java ${{ inputs.java }}
-        uses: actions/setup-java@v4
-        with:
-          distribution: zulu
-          java-version: ${{ inputs.java }}
-      - name: Run tests
-        env: ${{ fromJSON(inputs.envs) }}
-        run: |
-          ./dev/run-tests --parallelism 1 --modules docker-integration-tests --included-tags org.apache.spark.tags.DockerTest
-      - name: Upload test results to report
-        if: always()
-        uses: actions/upload-artifact@v4
-        with:
-          name: test-results-docker-integration--${{ inputs.java }}-${{ inputs.hadoop }}-hive2.3
-          path: "**/target/test-reports/*.xml"
-      - name: Upload unit tests log files
-        if: ${{ !success() }}
-        uses: actions/upload-artifact@v4
-        with:
-          name: unit-tests-log-docker-integration--${{ inputs.java }}-${{ inputs.hadoop }}-hive2.3
-          path: "**/target/unit-tests.log"
-
-  k8s-integration-tests:
-    needs: precondition
-    if: fromJson(needs.precondition.outputs.required).k8s-integration-tests == 'true'
-    name: Run Spark on Kubernetes Integration test
-    runs-on: ubuntu-latest
-    timeout-minutes: 300
-    steps:
-      - name: Checkout Spark repository
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 0
-          repository: apache/spark
-          ref: ${{ inputs.branch }}
-      - name: Sync the current branch with the latest in Apache Spark
-        if: github.repository != 'apache/spark'
-        run: |
-          echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
-          git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
-          git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD
-          git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty
-      - name: Cache Scala, SBT and Maven
-        uses: actions/cache@v4
-        with:
-          path: |
-            build/apache-maven-*
-            build/scala-*
-            build/*.jar
-            ~/.sbt
-          key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
-          restore-keys: |
-            build-
-      - name: Cache Coursier local repository
-        uses: actions/cache@v4
-        with:
-          path: ~/.cache/coursier
-          key: k8s-integration-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
-          restore-keys: |
-            k8s-integration-coursier-
-      - name: Install Java ${{ inputs.java }}
-        uses: actions/setup-java@v4
-        with:
-          distribution: zulu
-          java-version: ${{ inputs.java }}
-      - name: start minikube
-        run: |
-          # See more in "Installation" https://minikube.sigs.k8s.io/docs/start/
-          curl -LO https://storage.googleapis.com/minikube/releases/latest/minikube-linux-amd64
-          sudo install minikube-linux-amd64 /usr/local/bin/minikube
-          rm minikube-linux-amd64
-          # Github Action limit cpu:2, memory: 6947MB, limit to 2U6G for better resource statistic
-          minikube start --cpus 2 --memory 6144
-      - name: Print K8S pods and nodes info
-        run: |
-          kubectl get pods -A
-          kubectl describe node
-      - name: Run Spark on K8S integration test
-        run: |
-          # Prepare PV test
-          PVC_TMP_DIR=$(mktemp -d)
-          export PVC_TESTS_HOST_PATH=$PVC_TMP_DIR
-          export PVC_TESTS_VM_PATH=$PVC_TMP_DIR
-          minikube mount ${PVC_TESTS_HOST_PATH}:${PVC_TESTS_VM_PATH} --gid=0 --uid=185 &
-          kubectl create clusterrolebinding serviceaccounts-cluster-admin --clusterrole=cluster-admin --group=system:serviceaccounts || true
-          kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/v1.8.2/installer/volcano-development.yaml || true
-          eval $(minikube docker-env)
-          build/sbt -Phadoop-3 -Psparkr -Pkubernetes -Pvolcano -Pkubernetes-integration-tests -Dspark.kubernetes.test.volcanoMaxConcurrencyJobNum=1 -Dtest.exclude.tags=local "kubernetes-integration-tests/test"
-      - name: Upload Spark on K8S integration tests log files
-        if: ${{ !success() }}
-        uses: actions/upload-artifact@v4
-        with:
-          name: spark-on-kubernetes-it-log
-          path: "**/target/integration-tests.log"
-
-  ui:
-    needs: [precondition]
-    if: fromJson(needs.precondition.outputs.required).ui == 'true'
-    name: Run Spark UI tests
-    runs-on: ubuntu-latest
-    timeout-minutes: 300
-    steps:
-      - uses: actions/checkout@v4
-      - name: Use Node.js
-        uses: actions/setup-node@v4
-        with:
-          node-version: 20
-          cache: 'npm'
-          cache-dependency-path: ui-test/package-lock.json
-      - run: |
-          cd ui-test
-          npm install --save-dev
-          node --experimental-vm-modules node_modules/.bin/jest