1 change: 1 addition & 0 deletions dev/release/rat_exclude_files.txt
@@ -132,6 +132,7 @@ dev/tasks/linux-packages/debian/source/format
dev/tasks/linux-packages/debian/watch
dev/tasks/conda-recipes/*
docs/requirements.txt
+integration/spark/ARROW-6429.patch
go/arrow/go.sum
go/arrow/Gopkg.lock
go/arrow/internal/cpu/*
31 changes: 31 additions & 0 deletions integration/spark/ARROW-6429.patch
@@ -0,0 +1,31 @@
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala
index 1a6f4ac..42d555b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala
@@ -26,7 +26,7 @@ import org.apache.arrow.flatbuf.MessageHeader
import org.apache.arrow.memory.BufferAllocator
import org.apache.arrow.vector._
import org.apache.arrow.vector.ipc.{ArrowStreamWriter, ReadChannel, WriteChannel}
-import org.apache.arrow.vector.ipc.message.{ArrowRecordBatch, MessageSerializer}
+import org.apache.arrow.vector.ipc.message.{ArrowRecordBatch, IpcOption, MessageSerializer}

import org.apache.spark.TaskContext
import org.apache.spark.api.java.JavaRDD
@@ -64,7 +64,7 @@ private[sql] class ArrowBatchStreamWriter(
* End the Arrow stream, does not close output stream.
*/
def end(): Unit = {
- ArrowStreamWriter.writeEndOfStream(writeChannel)
+ ArrowStreamWriter.writeEndOfStream(writeChannel, new IpcOption)
}
}

@@ -252,7 +252,7 @@ private[sql] object ArrowConverters {
if (msgMetadata.getMessage.headerType() == MessageHeader.RecordBatch) {

// Buffer backed output large enough to hold the complete serialized message
- val bbout = new ByteBufferOutputStream(4 + msgMetadata.getMessageLength + bodyLength)
+ val bbout = new ByteBufferOutputStream(8 + msgMetadata.getMessageLength + bodyLength)

// Write message metadata to ByteBuffer output stream
MessageSerializer.writeMessageBuffer(
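Both functional edits in this patch track the newer Arrow IPC framing: an encapsulated message is prefixed by a 4-byte continuation indicator (0xFFFFFFFF) followed by a 4-byte little-endian metadata length, so the prefix is 8 bytes rather than the legacy 4, and end-of-stream becomes that same prefix with a zero length (the IpcOption argument selects between the legacy and new encodings). Below is a minimal Scala sketch of that framing, for illustration only; the object and method names are hypothetical and are not Arrow APIs.

import java.nio.{ByteBuffer, ByteOrder}

// Hypothetical sketch of the new-format Arrow IPC framing described above;
// none of these names exist in the Arrow Java library.
object IpcFramingSketch {
  private val Continuation = 0xFFFFFFFF // new-format continuation indicator

  // The 8-byte prefix written before the flatbuffer metadata and body,
  // which is why the buffer above is sized 8 + messageLength + bodyLength.
  def messagePrefix(metadataLength: Int): Array[Byte] = {
    val buf = ByteBuffer.allocate(8).order(ByteOrder.LITTLE_ENDIAN)
    buf.putInt(Continuation)
    buf.putInt(metadataLength)
    buf.array()
  }

  // End-of-stream: the same prefix with zero metadata length (the legacy
  // format wrote only a 4-byte zero, hence the IpcOption parameter on
  // writeEndOfStream).
  def endOfStream: Array[Byte] = messagePrefix(0)

  def main(args: Array[String]): Unit = {
    println(messagePrefix(256).length) // 8
    println(endOfStream.map(b => f"${b & 0xff}%02x").mkString(" "))
    // ff ff ff ff 00 00 00 00
  }
}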
7 changes: 6 additions & 1 deletion integration/spark/Dockerfile
@@ -17,7 +17,7 @@
FROM arrow:python-3.6

# installing java and maven
-ARG MAVEN_VERSION=3.5.4
+ARG MAVEN_VERSION=3.6.2
Member Author: This is the minimum Maven version required by Spark; setting it here prevents Spark from downloading Maven during the build phase.

ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64 \
MAVEN_HOME=/usr/local/maven \
M2_HOME=/root/.m2 \
@@ -38,6 +38,11 @@ RUN wget -q -O /tmp/spark.tar.gz https://github.com/apache/spark/archive/$SPARK_
tar -xzf /tmp/spark.tar.gz -C /spark --strip-components=1 && \
rm /tmp/spark.tar.gz

+# patch spark to build with current Arrow Java
+COPY integration/spark/ARROW-6429.patch /tmp/
+RUN patch -d /spark -p1 -i /tmp/ARROW-6429.patch && \
+rm /tmp/ARROW-6429.patch

# build cpp with tests
ENV CC=gcc \
CXX=g++ \
4 changes: 1 addition & 3 deletions integration/spark/runtest.sh
@@ -47,8 +47,6 @@ pushd /spark
build/mvn -B -Dtest=none -DwildcardSuites=$(IFS=,; echo "${SPARK_SCALA_TESTS[*]}") test

# Run pyarrow related Python tests only
echo "Testing PySpark:"

SPARK_PYTHON_TESTS=(
"pyspark.sql.tests.test_arrow"
"pyspark.sql.tests.test_pandas_udf"
@@ -58,5 +58,5 @@ pushd /spark
"pyspark.sql.tests.test_pandas_udf_window")

(echo "Testing PySpark:"; IFS=$'\n'; echo "${SPARK_PYTHON_TESTS[*]}")
-python/run-tests --testnames "$(IFS=,; echo "${SPARK_PYTHON_TESTS[*]}")"
+python/run-tests --testnames "$(IFS=,; echo "${SPARK_PYTHON_TESTS[*]}")" --python-executables python
Member Author: Spark detects each installed Python version and runs the tests against each one separately; setting this flag ensures the tests run only once, against the default python.

popd