diff --git a/ci/docker/conda-python-spark.dockerfile b/ci/docker/conda-python-spark.dockerfile
index 6b572d68015..d3f0a224582 100644
--- a/ci/docker/conda-python-spark.dockerfile
+++ b/ci/docker/conda-python-spark.dockerfile
@@ -34,11 +34,12 @@ RUN conda install -q \
 # installing specific version of spark
 ARG spark=master
 COPY ci/scripts/install_spark.sh /arrow/ci/scripts/
-RUN /arrow/ci/scripts/install_spark.sh ${spark} /spark /arrow/ci/etc
+RUN /arrow/ci/scripts/install_spark.sh ${spark} /spark
 
 # build cpp with tests
 ENV CC=gcc \
     CXX=g++ \
     ARROW_PYTHON=ON \
     ARROW_HDFS=ON \
-    ARROW_BUILD_TESTS=OFF
+    ARROW_BUILD_TESTS=OFF \
+    SPARK_VERSION=${spark}
diff --git a/ci/scripts/install_spark.sh b/ci/scripts/install_spark.sh
index 43f2f58c022..936313fd809 100755
--- a/ci/scripts/install_spark.sh
+++ b/ci/scripts/install_spark.sh
@@ -19,14 +19,13 @@
 
 set -e
 
-if [ "$#" -ne 3 ]; then
-  echo "Usage: $0 <version> <target-directory> <patches-directory>"
+if [ "$#" -ne 2 ]; then
+  echo "Usage: $0 <version> <target-directory>"
   exit 1
 fi
 
 spark=$1
 target=$2
-patches=${3}
 
-mkdir "${target}"
-wget -q -O - https://github.com/apache/spark/archive/${spark}.tar.gz | tar -xzf - --strip-components=1 -C "${target}"
+git clone https://github.com/apache/spark "${target}"
+git -C "${target}" checkout "${spark}"
 
diff --git a/ci/scripts/integration_spark.sh b/ci/scripts/integration_spark.sh
index e591901c4b7..9828a28a1ec 100755
--- a/ci/scripts/integration_spark.sh
+++ b/ci/scripts/integration_spark.sh
@@ -20,6 +20,12 @@ set -eu
 
 source_dir=${1}
 spark_dir=${2}
+spark_version=${SPARK_VERSION:-master}
+
+if [ "${spark_version:0:2}" == "2." ]; then
+  # https://github.com/apache/spark/blob/master/docs/sql-pyspark-pandas-with-arrow.md#compatibility-setting-for-pyarrow--0150-and-spark-23x-24x
+  export ARROW_PRE_0_15_IPC_FORMAT=1
+fi
 
 pushd ${source_dir}/java
   arrow_version=`mvn org.apache.maven.plugins:maven-help-plugin:2.1.1:evaluate -Dexpression=project.version | sed -n -e '/^\[.*\]/ !{ /^[0-9]/ { p; q } }'`
diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml
index 4bd3acf678f..64ad44f1724 100644
--- a/dev/tasks/tasks.yml
+++ b/dev/tasks/tasks.yml
@@ -1833,12 +1833,22 @@ tasks:
         HDFS: 2.9.2
     run: conda-python-hdfs
 
-  test-conda-python-3.7-spark-master:
+  test-conda-python-3.7-spark-branch-3.0:
     ci: github
     template: docker-tests/github.linux.yml
     params:
       env:
         PYTHON: 3.7
+        SPARK: "branch-3.0"
+    # use the branch-3.0 branch of spark, so prevent reusing any layers
+    run: --no-leaf-cache conda-python-spark
+
+  test-conda-python-3.8-spark-master:
+    ci: github
+    template: docker-tests/github.linux.yml
+    params:
+      env:
+        PYTHON: 3.8
         SPARK: master
     # use the master branch of spark, so prevent reusing any layers
     run: --no-leaf-cache conda-python-spark
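
For reference, a minimal usage sketch of the two rewritten scripts; the branch name, version string, and directory paths are illustrative only (in CI these values are wired through the Dockerfile's `spark` build arg and the `SPARK` env entry in tasks.yml):

    # install_spark.sh now takes exactly two arguments: a git ref and a target directory
    ci/scripts/install_spark.sh branch-3.0 /tmp/spark

    # integration_spark.sh reads SPARK_VERSION (defaulting to master); a 2.x value
    # such as 2.4.5 switches on the ARROW_PRE_0_15_IPC_FORMAT compatibility setting
    SPARK_VERSION=2.4.5 ci/scripts/integration_spark.sh /arrow /spark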