diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt
index 8ba64d19217..f342fb44db4 100644
--- a/dev/release/rat_exclude_files.txt
+++ b/dev/release/rat_exclude_files.txt
@@ -132,6 +132,7 @@ dev/tasks/linux-packages/debian/source/format
 dev/tasks/linux-packages/debian/watch
 dev/tasks/conda-recipes/*
 docs/requirements.txt
+integration/spark/ARROW-6429.patch
 go/arrow/go.sum
 go/arrow/Gopkg.lock
 go/arrow/internal/cpu/*
diff --git a/integration/spark/ARROW-6429.patch b/integration/spark/ARROW-6429.patch
new file mode 100644
index 00000000000..e6140847111
--- /dev/null
+++ b/integration/spark/ARROW-6429.patch
@@ -0,0 +1,31 @@
+diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala
+index 1a6f4ac..42d555b 100644
+--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala
++++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala
+@@ -26,7 +26,7 @@ import org.apache.arrow.flatbuf.MessageHeader
+ import org.apache.arrow.memory.BufferAllocator
+ import org.apache.arrow.vector._
+ import org.apache.arrow.vector.ipc.{ArrowStreamWriter, ReadChannel, WriteChannel}
+-import org.apache.arrow.vector.ipc.message.{ArrowRecordBatch, MessageSerializer}
++import org.apache.arrow.vector.ipc.message.{ArrowRecordBatch, IpcOption, MessageSerializer}
+ 
+ import org.apache.spark.TaskContext
+ import org.apache.spark.api.java.JavaRDD
+@@ -64,7 +64,7 @@ private[sql] class ArrowBatchStreamWriter(
+    * End the Arrow stream, does not close output stream.
+    */
+   def end(): Unit = {
+-    ArrowStreamWriter.writeEndOfStream(writeChannel)
++    ArrowStreamWriter.writeEndOfStream(writeChannel, new IpcOption)
+   }
+ }
+ 
+@@ -252,7 +252,7 @@ private[sql] object ArrowConverters {
+     if (msgMetadata.getMessage.headerType() == MessageHeader.RecordBatch) {
+ 
+       // Buffer backed output large enough to hold the complete serialized message
+-      val bbout = new ByteBufferOutputStream(4 + msgMetadata.getMessageLength + bodyLength)
++      val bbout = new ByteBufferOutputStream(8 + msgMetadata.getMessageLength + bodyLength)
+ 
+       // Write message metadata to ByteBuffer output stream
+       MessageSerializer.writeMessageBuffer(
diff --git a/integration/spark/Dockerfile b/integration/spark/Dockerfile
index d6a29bde6bc..6b032034986 100644
--- a/integration/spark/Dockerfile
+++ b/integration/spark/Dockerfile
@@ -17,7 +17,7 @@
 FROM arrow:python-3.6
 
 # installing java and maven
-ARG MAVEN_VERSION=3.5.4
+ARG MAVEN_VERSION=3.6.2
 ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 \
     MAVEN_HOME=/usr/local/maven \
     M2_HOME=/root/.m2 \
@@ -38,6 +38,11 @@ RUN wget -q -O /tmp/spark.tar.gz https://github.com/apache/spark/archive/$SPARK_
     tar -xzf /tmp/spark.tar.gz -C /spark --strip-components=1 && \
     rm /tmp/spark.tar.gz
 
+# patch spark to build with current Arrow Java
+COPY integration/spark/ARROW-6429.patch /tmp/
+RUN patch -d /spark -p1 -i /tmp/ARROW-6429.patch && \
+    rm /tmp/ARROW-6429.patch
+
 # build cpp with tests
 ENV CC=gcc \
     CXX=g++ \
diff --git a/integration/spark/runtest.sh b/integration/spark/runtest.sh
index 593683b8079..f8d2cbe9cb7 100755
--- a/integration/spark/runtest.sh
+++ b/integration/spark/runtest.sh
@@ -47,8 +47,6 @@ pushd /spark
   build/mvn -B -Dtest=none -DwildcardSuites=$(IFS=,; echo "${SPARK_SCALA_TESTS[*]}") test
 
   # Run pyarrow related Python tests only
-  echo "Testing PySpark:"
-
   SPARK_PYTHON_TESTS=(
     "pyspark.sql.tests.test_arrow"
     "pyspark.sql.tests.test_pandas_udf"
@@ -58,5 +56,5 @@ pushd /spark
     "pyspark.sql.tests.test_pandas_udf_window")
 
   (echo "Testing PySpark:"; IFS=$'\n'; echo "${SPARK_PYTHON_TESTS[*]}")
-  python/run-tests --testnames "$(IFS=,; echo "${SPARK_PYTHON_TESTS[*]}")"
+  python/run-tests --testnames "$(IFS=,; echo "${SPARK_PYTHON_TESTS[*]}")" --python-executables python
 popd