1 change: 1 addition & 0 deletions dev/release/rat_exclude_files.txt
@@ -132,6 +132,7 @@ dev/tasks/linux-packages/debian/source/format
dev/tasks/linux-packages/debian/watch
dev/tasks/conda-recipes/*
docs/requirements.txt
+integration/spark/ARROW-6429.patch
go/arrow/go.sum
go/arrow/Gopkg.lock
go/arrow/internal/cpu/*
31 changes: 31 additions & 0 deletions integration/spark/ARROW-6429.patch
@@ -0,0 +1,31 @@
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala
index 1a6f4ac..42d555b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala
@@ -26,7 +26,7 @@ import org.apache.arrow.flatbuf.MessageHeader
import org.apache.arrow.memory.BufferAllocator
import org.apache.arrow.vector._
import org.apache.arrow.vector.ipc.{ArrowStreamWriter, ReadChannel, WriteChannel}
-import org.apache.arrow.vector.ipc.message.{ArrowRecordBatch, MessageSerializer}
+import org.apache.arrow.vector.ipc.message.{ArrowRecordBatch, IpcOption, MessageSerializer}

import org.apache.spark.TaskContext
import org.apache.spark.api.java.JavaRDD
@@ -64,7 +64,7 @@ private[sql] class ArrowBatchStreamWriter(
* End the Arrow stream, does not close output stream.
*/
def end(): Unit = {
- ArrowStreamWriter.writeEndOfStream(writeChannel)
+ ArrowStreamWriter.writeEndOfStream(writeChannel, new IpcOption)
}
}

@@ -252,7 +252,7 @@ private[sql] object ArrowConverters {
if (msgMetadata.getMessage.headerType() == MessageHeader.RecordBatch) {

// Buffer backed output large enough to hold the complete serialized message
- val bbout = new ByteBufferOutputStream(4 + msgMetadata.getMessageLength + bodyLength)
+ val bbout = new ByteBufferOutputStream(8 + msgMetadata.getMessageLength + bodyLength)

// Write message metadata to ByteBuffer output stream
MessageSerializer.writeMessageBuffer(
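Both functional edits in this patch track the newer Arrow IPC framing: an encapsulated message is prefixed by a 4-byte continuation indicator (0xFFFFFFFF) followed by a 4-byte little-endian metadata length, so the prefix is 8 bytes rather than the legacy 4, and end-of-stream becomes that same prefix with a zero length (the IpcOption argument selects between the legacy and new encodings). Below is a minimal Scala sketch of that framing, for illustration only; the object and method names are hypothetical and are not Arrow APIs.

import java.nio.{ByteBuffer, ByteOrder}

// Hypothetical sketch of the new-format Arrow IPC framing described above;
// none of these names exist in the Arrow Java library.
object IpcFramingSketch {
  private val Continuation = 0xFFFFFFFF // new-format continuation indicator

  // The 8-byte prefix written before the flatbuffer metadata and body,
  // which is why the buffer above is sized 8 + messageLength + bodyLength.
  def messagePrefix(metadataLength: Int): Array[Byte] = {
    val buf = ByteBuffer.allocate(8).order(ByteOrder.LITTLE_ENDIAN)
    buf.putInt(Continuation)
    buf.putInt(metadataLength)
    buf.array()
  }

  // End-of-stream: the same prefix with zero metadata length (the legacy
  // format wrote only a 4-byte zero, hence the IpcOption parameter on
  // writeEndOfStream).
  def endOfStream: Array[Byte] = messagePrefix(0)

  def main(args: Array[String]): Unit = {
    println(messagePrefix(256).length) // 8
    println(endOfStream.map(b => f"${b & 0xff}%02x").mkString(" "))
    // ff ff ff ff 00 00 00 00
  }
}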
7 changes: 6 additions & 1 deletion integration/spark/Dockerfile
@@ -17,7 +17,7 @@
FROM arrow:python-3.6

# installing java and maven
-ARG MAVEN_VERSION=3.5.4
+ARG MAVEN_VERSION=3.6.2
Member Author: This is the minimum Maven version required by Spark; setting it here prevents Spark from downloading Maven during the build phase.

ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 \
MAVEN_HOME=/usr/local/maven \
M2_HOME=/root/.m2 \
@@ -38,6 +38,11 @@ RUN wget -q -O /tmp/spark.tar.gz https://github.com/apache/spark/archive/$SPARK_
tar -xzf /tmp/spark.tar.gz -C /spark --strip-components=1 && \
rm /tmp/spark.tar.gz

+# patch spark to build with current Arrow Java
+COPY integration/spark/ARROW-6429.patch /tmp/
+RUN patch -d /spark -p1 -i /tmp/ARROW-6429.patch && \
+rm /tmp/ARROW-6429.patch

# build cpp with tests
ENV CC=gcc \
CXX=g++ \
4 changes: 1 addition & 3 deletions integration/spark/runtest.sh
@@ -47,8 +47,6 @@ pushd /spark
build/mvn -B -Dtest=none -DwildcardSuites=$(IFS=,; echo "${SPARK_SCALA_TESTS[*]}") test

# Run pyarrow related Python tests only
echo "Testing PySpark:"

SPARK_PYTHON_TESTS=(
"pyspark.sql.tests.test_arrow"
"pyspark.sql.tests.test_pandas_udf"
@@ -58,5 +58,5 @@ pushd /spark
"pyspark.sql.tests.test_pandas_udf_window")

(echo "Testing PySpark:"; IFS=$'\n'; echo "${SPARK_PYTHON_TESTS[*]}")
-python/run-tests --testnames "$(IFS=,; echo "${SPARK_PYTHON_TESTS[*]}")"
+python/run-tests --testnames "$(IFS=,; echo "${SPARK_PYTHON_TESTS[*]}")" --python-executables python
Member Author: Spark detects each installed Python version and runs the tests against each one separately; setting this flag ensures the tests run only once, against the default python.

popd