diff --git a/ci/cpp-python-msvc-build.bat b/ci/cpp-python-msvc-build.bat index d3f540b2d8c..ecc68e0c39c 100644 --- a/ci/cpp-python-msvc-build.bat +++ b/ci/cpp-python-msvc-build.bat @@ -133,6 +133,8 @@ popd pushd python +pip install pickle5 + set PYARROW_CXXFLAGS=/WX set PYARROW_CMAKE_GENERATOR=%GENERATOR% set PYARROW_BUNDLE_ARROW_CPP=ON @@ -167,6 +169,6 @@ pip install %WHEEL_PATH% || exit /B python -c "import pyarrow" || exit /B python -c "import pyarrow.parquet" || exit /B -pip install pandas pytest pytest-faulthandler +pip install pandas pickle5 pytest pytest-faulthandler py.test -r sxX --durations=15 --pyargs pyarrow.tests || exit /B diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index 4eeb103d0c3..0743f86f335 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -102,6 +102,9 @@ pushd $ARROW_PYTHON_DIR # Other stuff pip install pip install -q -r requirements.txt +if [ "$PYTHON_VERSION" == "3.6" ]; then + pip install -q pickle5 +fi if [ "$ARROW_TRAVIS_COVERAGE" == "1" ]; then export PYARROW_GENERATE_COVERAGE=1 pip install -q coverage diff --git a/python/pyarrow/compat.py b/python/pyarrow/compat.py index 47aeaa5bfd5..562ae920d23 100644 --- a/python/pyarrow/compat.py +++ b/python/pyarrow/compat.py @@ -107,7 +107,10 @@ def frombytes(o): def unichar(s): return unichr(s) else: - import pickle as builtin_pickle + try: + import pickle5 as builtin_pickle + except ImportError: + import pickle as builtin_pickle unicode_type = str def lzip(*x): @@ -142,10 +145,7 @@ def unichar(s): try: import cloudpickle as pickle except ImportError: - try: - import cPickle as pickle - except ImportError: - import pickle + pickle = builtin_pickle def encode_file_path(path): import os diff --git a/python/pyarrow/io.pxi b/python/pyarrow/io.pxi index ad9bf0e3494..7ab416884e3 100644 --- a/python/pyarrow/io.pxi +++ b/python/pyarrow/io.pxi @@ -19,7 +19,7 @@ # arrow::ipc from libc.stdlib cimport malloc, free -from pyarrow.compat import frombytes, 
tobytes, encode_file_path +from pyarrow.compat import builtin_pickle, frombytes, tobytes, encode_file_path from io import BufferedIOBase, UnsupportedOperation import re @@ -810,8 +810,11 @@ cdef class Buffer: else: return NotImplemented - def __reduce__(self): - return py_buffer, (self.to_pybytes(),) + def __reduce_ex__(self, protocol): + if protocol >= 5: + return py_buffer, (builtin_pickle.PickleBuffer(self),) + else: + return py_buffer, (self.to_pybytes(),) def to_pybytes(self): return cp.PyBytes_FromStringAndSize( diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 7ab54dd42c0..af2708fd03e 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -17,6 +17,7 @@ import collections import datetime +import pickle import pytest import struct import sys @@ -24,7 +25,10 @@ import numpy as np import pandas as pd import pandas.util.testing as tm -import pickle +try: + import pickle5 +except ImportError: + pickle5 = None import pyarrow as pa from pyarrow.pandas_compat import get_logical_type @@ -633,7 +637,7 @@ def test_cast_date64_to_int(): assert result.equals(expected) -@pytest.mark.parametrize( +pickle_test_parametrize = pytest.mark.parametrize( ('data', 'typ'), [ ([True, False, True, True], pa.bool_()), @@ -647,12 +651,38 @@ def test_cast_date64_to_int(): pa.struct([pa.field('a', pa.int64()), pa.field('b', pa.string())])) ] ) + + +@pickle_test_parametrize def test_array_pickle(data, typ): # Allocate here so that we don't have any Arrow data allocated. # This is needed to ensure that allocator tests can be reliable. 
array = pa.array(data, type=typ) - result = pickle.loads(pickle.dumps(array)) - assert array.equals(result) + for proto in range(0, pickle.HIGHEST_PROTOCOL + 1): + result = pickle.loads(pickle.dumps(array, proto)) + assert array.equals(result) + + +@pickle_test_parametrize +def test_array_pickle5(data, typ): + # Test zero-copy pickling with protocol 5 (PEP 574) + picklemod = pickle5 or pickle + if pickle5 is None and picklemod.HIGHEST_PROTOCOL < 5: + pytest.skip("need pickle5 package or Python 3.8+") + + array = pa.array(data, type=typ) + addresses = [buf.address if buf is not None else 0 + for buf in array.buffers()] + + for proto in range(5, picklemod.HIGHEST_PROTOCOL + 1): + buffers = [] + pickled = picklemod.dumps(array, proto, buffer_callback=buffers.append) + result = picklemod.loads(pickled, buffers=buffers) + assert array.equals(result) + + result_addresses = [buf.address if buf is not None else 0 + for buf in result.buffers()] + assert result_addresses == addresses @pytest.mark.parametrize(