From 93c61c7c766ba942cc0b1710efe65ce838ce8e04 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Tue, 18 Jul 2023 18:29:16 +0900 Subject: [PATCH 01/41] GH-36730: [Python] Add support for Cython 3.0.0 --- .github/workflows/dev.yml | 2 +- ci/conda_env_python.txt | 2 +- dev/release/verify-release-candidate.sh | 2 +- docs/source/cpp/compute.rst | 18 +++++++++++++ python/CMakeLists.txt | 29 +++++++++++++-------- python/pyarrow/_flight.pyx | 10 ++++--- python/pyarrow/includes/libarrow_flight.pxd | 15 +++++++---- python/pyarrow/ipc.pxi | 15 ++++++----- python/pyarrow/scalar.pxi | 4 +-- python/pyproject.toml | 2 +- python/requirements-build.txt | 2 +- python/requirements-wheel-build.txt | 2 +- python/setup.py | 6 ++--- 13 files changed, 72 insertions(+), 37 deletions(-) diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index cee3c74762c..e8fe565ace0 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -103,7 +103,7 @@ jobs: shell: bash run: | gem install test-unit - pip install "cython<3" setuptools six pytest jira + pip install cython setuptools six pytest jira - name: Run Release Test env: ARROW_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/ci/conda_env_python.txt b/ci/conda_env_python.txt index 4ae5c3614a1..04f985c94bb 100644 --- a/ci/conda_env_python.txt +++ b/ci/conda_env_python.txt @@ -18,7 +18,7 @@ # don't add pandas here, because it is not a mandatory test dependency boto3 # not a direct dependency of s3fs, but needed for our s3fs fixture cffi -cython<3 +cython cloudpickle fsspec hypothesis diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index ce31b497c1f..8c5de9bda85 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -665,7 +665,7 @@ test_python() { show_header "Build and test Python libraries" # Build and test Python - maybe_setup_virtualenv "cython<3" numpy setuptools_scm setuptools || exit 1 + maybe_setup_virtualenv cython numpy setuptools_scm setuptools || exit 1 maybe_setup_conda --file ci/conda_env_python.txt || exit 1 if [ "${USE_CONDA}" -gt 0 ]; then diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 44f43cbc877..969fc386dbf 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -1629,6 +1629,7 @@ do not detect overflow. They are alsoavailable in an overflow-checking variant, suffixed ``_checked``, which returns an ``Invalid`` :class:`Status` when overflow is detected. +<<<<<<< HEAD +-------------------------+-------+-------------+-------------+--------------------------------+-----------+ | Function name | Arity | Input types | Output type | Options class | Notes | +=========================+=======+=============+=============+================================+===========+ @@ -1646,6 +1647,23 @@ overflow is detected. 
+-------------------------+-------+-------------+-------------+--------------------------------+-----------+ | cumulative_mean | Unary | Numeric | Float64 | :struct:`CumulativeOptions` | \(1) \(2) | +-------------------------+-------+-------------+-------------+--------------------------------+-----------+ +======= ++-------------------------+-------+-------------+-------------+--------------------------------+-------+ +| Function name | Arity | Input types | Output type | Options class | Notes | ++=========================+=======+=============+=============+================================+=======+ +| cumulative_sum | Unary | Numeric | Numeric | :struct:`CumulativeOptions` | \(1) | ++-------------------------+-------+-------------+-------------+--------------------------------+-------+ +| cumulative_sum_checked | Unary | Numeric | Numeric | :struct:`CumulativeOptions` | \(1) | ++-------------------------+-------+-------------+-------------+--------------------------------+-------+ +| cumulative_prod | Unary | Numeric | Numeric | :struct:`CumulativeOptions` | \(1) | ++-------------------------+-------+-------------+-------------+--------------------------------+-------+ +| cumulative_prod_checked | Unary | Numeric | Numeric | :struct:`CumulativeOptions` | \(1) | ++-------------------------+-------+-------------+-------------+--------------------------------+-------+ +| cumulative_max | Unary | Numeric | Numeric | :struct:`CumulativeOptions` | \(1) | ++-------------------------+-------+-------------+-------------+--------------------------------+-------+ +| cumulative_min | Unary | Numeric | Numeric | :struct:`CumulativeOptions` | \(1) | ++-------------------------+-------+-------------+-------------+--------------------------------+-------+ +>>>>>>> 9b9f7a317 (Fix table format) * \(1) CumulativeOptions has two optional parameters. The first parameter :member:`CumulativeOptions::start` is a starting value for the running diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 242ba8448f4..29f8d2da72f 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -168,37 +168,44 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${PYARROW_CXXFLAGS}") if(MSVC) # MSVC version of -Wno-return-type-c-linkage - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4190") + string(APPEND CMAKE_CXX_FLAGS " /wd4190") # Cython generates some bitshift expressions that MSVC does not like in # __Pyx_PyFloat_DivideObjC - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4293") + string(APPEND CMAKE_CXX_FLAGS " /wd4293") # Converting to/from C++ bool is pretty wonky in Cython. The C4800 warning # seem harmless, and probably not worth the effort of working around it - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4800") + string(APPEND CMAKE_CXX_FLAGS " /wd4800") # See https://github.com/cython/cython/issues/2731. Change introduced in # Cython 0.29.1 causes "unsafe use of type 'bool' in operation" - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4804") + string(APPEND CMAKE_CXX_FLAGS " /wd4804") + + # See https://github.com/cython/cython/issues/4445. + # + # Cython 3 emits "(void)__Pyx_PyObject_CallMethod0;" to suppress a + # "unused function" warning but the code emits another "function + # call missing argument list" warning. 
+ string(APPEND CMAKE_CXX_FLAGS " /wd4551") else() # Enable perf and other tools to work properly - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-omit-frame-pointer") + string(APPEND CMAKE_CXX_FLAGS " -fno-omit-frame-pointer") # Suppress Cython warnings - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-variable -Wno-maybe-uninitialized") + string(APPEND CMAKE_CXX_FLAGS " -Wno-unused-variable -Wno-maybe-uninitialized") if(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang") # Cython warnings in clang - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-parentheses-equality") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-constant-logical-operand") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-missing-declarations") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-sometimes-uninitialized") + string(APPEND CMAKE_CXX_FLAGS " -Wno-parentheses-equality") + string(APPEND CMAKE_CXX_FLAGS " -Wno-constant-logical-operand") + string(APPEND CMAKE_CXX_FLAGS " -Wno-missing-declarations") + string(APPEND CMAKE_CXX_FLAGS " -Wno-sometimes-uninitialized") # We have public Cython APIs which return C++ types, which are in an extern # "C" blog (no symbol mangling) and clang doesn't like this - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-return-type-c-linkage") + string(APPEND CMAKE_CXX_FLAGS " -Wno-return-type-c-linkage") endif() endif() diff --git a/python/pyarrow/_flight.pyx b/python/pyarrow/_flight.pyx index 42b221ed72a..79aa24e4ce8 100644 --- a/python/pyarrow/_flight.pyx +++ b/python/pyarrow/_flight.pyx @@ -988,8 +988,10 @@ cdef class _MetadataRecordBatchReader(_Weakrefable, _ReadPandasMixin): cdef shared_ptr[CMetadataRecordBatchReader] reader def __iter__(self): - while True: - yield self.read_chunk() + return self + + def __next__(self): + return self.read_chunk() @property def schema(self): @@ -1699,7 +1701,9 @@ cdef class FlightClient(_Weakrefable): def close(self): """Close the client and disconnect.""" - check_flight_status(self.client.get().Close()) + client = self.client.get() + if client != NULL: + check_flight_status(client.Close()) def __del__(self): # Not ideal, but close() wasn't originally present so diff --git a/python/pyarrow/includes/libarrow_flight.pxd b/python/pyarrow/includes/libarrow_flight.pxd index 4bddd2d080f..c4cf5830c41 100644 --- a/python/pyarrow/includes/libarrow_flight.pxd +++ b/python/pyarrow/includes/libarrow_flight.pxd @@ -118,16 +118,16 @@ cdef extern from "arrow/flight/api.h" namespace "arrow" nogil: c_bool Equals(const CLocation& other) @staticmethod - CResult[CLocation] Parse(c_string& uri_string) + CResult[CLocation] Parse(const c_string& uri_string) @staticmethod - CResult[CLocation] ForGrpcTcp(c_string& host, int port) + CResult[CLocation] ForGrpcTcp(const c_string& host, int port) @staticmethod - CResult[CLocation] ForGrpcTls(c_string& host, int port) + CResult[CLocation] ForGrpcTls(const c_string& host, int port) @staticmethod - CResult[CLocation] ForGrpcUnix(c_string& path) + CResult[CLocation] ForGrpcUnix(const c_string& path) cdef cppclass CFlightEndpoint" arrow::flight::FlightEndpoint": CFlightEndpoint() @@ -172,7 +172,9 @@ cdef extern from "arrow/flight/api.h" namespace "arrow" nogil: CResult[unique_ptr[CFlightInfo]] Next() cdef cppclass CSimpleFlightListing" arrow::flight::SimpleFlightListing": - CSimpleFlightListing(vector[CFlightInfo]&& info) + # This doesn't work with Cython >= 3 + # CSimpleFlightListing(vector[CFlightInfo]&& info) + CSimpleFlightListing(const vector[CFlightInfo]& info) cdef cppclass CFlightPayload" 
arrow::flight::FlightPayload": shared_ptr[CBuffer] descriptor @@ -310,7 +312,10 @@ cdef extern from "arrow/flight/api.h" namespace "arrow" nogil: cdef cppclass CCallHeaders" arrow::flight::CallHeaders": cppclass const_iterator: pair[c_string, c_string] operator*() + # For Cython < 3 const_iterator operator++() + # For Cython >= 3 + const_iterator operator++(int) bint operator==(const_iterator) bint operator!=(const_iterator) const_iterator cbegin() diff --git a/python/pyarrow/ipc.pxi b/python/pyarrow/ipc.pxi index a8398597fe6..53e521fc114 100644 --- a/python/pyarrow/ipc.pxi +++ b/python/pyarrow/ipc.pxi @@ -436,8 +436,10 @@ cdef class MessageReader(_Weakrefable): return result def __iter__(self): - while True: - yield self.read_next_message() + return self + + def __next__(self): + return self.read_next_message() def read_next_message(self): """ @@ -656,11 +658,10 @@ cdef class RecordBatchReader(_Weakrefable): # cdef block is in lib.pxd def __iter__(self): - while True: - try: - yield self.read_next_batch() - except StopIteration: - return + return self + + def __next__(self): + return self.read_next_batch() @property def schema(self): diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index e07949c6755..9a66dc81226 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -819,8 +819,8 @@ cdef class MapScalar(ListScalar): Iterate over this element's values. """ arr = self.values - if array is None: - raise StopIteration + if arr is None: + return for k, v in zip(arr.field(self.type.key_field.name), arr.field(self.type.item_field.name)): yield (k.as_py(), v.as_py()) diff --git a/python/pyproject.toml b/python/pyproject.toml index 7e613045858..fe8c938a9ce 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -17,7 +17,7 @@ [build-system] requires = [ - "cython >= 0.29.31,<3", + "cython >= 0.29.31", "oldest-supported-numpy>=0.14", "setuptools_scm", "setuptools >= 40.1.0", diff --git a/python/requirements-build.txt b/python/requirements-build.txt index 6378d1b94e1..507e9081373 100644 --- a/python/requirements-build.txt +++ b/python/requirements-build.txt @@ -1,4 +1,4 @@ -cython>=0.29.31,<3 +cython>=0.29.31 oldest-supported-numpy>=0.14 setuptools_scm setuptools>=38.6.0 diff --git a/python/requirements-wheel-build.txt b/python/requirements-wheel-build.txt index e4f5243fbc2..6043d2ffb2c 100644 --- a/python/requirements-wheel-build.txt +++ b/python/requirements-wheel-build.txt @@ -1,4 +1,4 @@ -cython>=0.29.31,<3 +cython>=0.29.31 oldest-supported-numpy>=0.14 setuptools_scm setuptools>=58 diff --git a/python/setup.py b/python/setup.py index abd9d03cfb1..0a6fc861eb5 100755 --- a/python/setup.py +++ b/python/setup.py @@ -40,9 +40,9 @@ # Check if we're running 64-bit Python is_64_bit = sys.maxsize > 2**32 -if Cython.__version__ < '0.29.31' or Cython.__version__ >= '3.0': +if Cython.__version__ < '0.29.31': raise Exception( - 'Please update your Cython version. Supported Cython >= 0.29.31, < 3.0') + 'Please update your Cython version. 
Supported Cython >= 0.29.31') setup_dir = os.path.abspath(os.path.dirname(__file__)) @@ -492,7 +492,7 @@ def has_ext_modules(foo): 'pyarrow/_generated_version.py'), 'version_scheme': guess_next_dev_version }, - setup_requires=['setuptools_scm', 'cython >= 0.29.31,<3'] + setup_requires, + setup_requires=['setuptools_scm', 'cython >= 0.29.31'] + setup_requires, install_requires=install_requires, tests_require=['pytest', 'pandas', 'hypothesis'], python_requires='>=3.8', From dd9f5e5949409a4fc11c55531df759742c539439 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Wed, 9 Aug 2023 16:31:34 -0400 Subject: [PATCH 02/41] Fix bad merge conflict resolution --- docs/source/cpp/compute.rst | 74 ++++++++++++++----------------------- 1 file changed, 28 insertions(+), 46 deletions(-) diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 969fc386dbf..66d393a5094 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -49,8 +49,8 @@ Computation inputs are represented as a general :class:`Datum` class, which is a tagged union of several shapes of data such as :class:`Scalar`, :class:`Array` and :class:`ChunkedArray`. Many compute functions support both array (chunked or not) and scalar inputs, however some will mandate -particular input types. For example, while ``array_sort_indices`` requires its -first and only input to be an array, the generalized ``sort_indices`` +particular input types. For example, while ``array_sort_indices`` requires its +first and only input to be an array, the generalized ``sort_indices`` function accepts an array, chunked array, record batch or table. .. _invoking-compute-functions: @@ -572,28 +572,28 @@ representation based on the rounding criterion. | trunc | Unary | Numeric | Float32/Float64/Decimal | | | +-------------------+------------+-------------+-------------------------+----------------------------------+--------+ -* \(1) By default rounding functions change a value to the nearest - integer using HALF_TO_EVEN to resolve ties. Options are available to control - the rounding criterion. All ``round`` functions have the +* \(1) By default rounding functions change a value to the nearest + integer using HALF_TO_EVEN to resolve ties. Options are available to control + the rounding criterion. All ``round`` functions have the ``round_mode`` option to set the rounding mode. * \(2) Round to a number of digits where the ``ndigits`` option of :struct:`RoundOptions` specifies the rounding precision in terms of number of digits. A negative value corresponds to digits in the non-fractional part. For example, -2 corresponds to rounding to the nearest multiple of 100 (zeroing the ones and tens digits). Default value of ``ndigits`` is 0 - which rounds to the nearest integer. For integer inputs a non-negative + which rounds to the nearest integer. For integer inputs a non-negative ``ndigits`` value is ignored and the input is returned unchanged. For integer - inputs, if ``-ndigits`` is larger than the maximum number of digits the + inputs, if ``-ndigits`` is larger than the maximum number of digits the input type can hold, an error is returned. * \(3) Round to a multiple where the ``multiple`` option of :struct:`RoundToMultipleOptions` specifies the rounding scale. The rounding - multiple has to be a positive value and can be casted to input type. - For example, 100 corresponds to rounding to the nearest multiple of 100 - (zeroing the ones and tens digits). 
Default value of ``multiple`` is 1 which + multiple has to be a positive value and can be casted to input type. + For example, 100 corresponds to rounding to the nearest multiple of 100 + (zeroing the ones and tens digits). Default value of ``multiple`` is 1 which rounds to the nearest integer. * \(4) Round the first input to multiple of the second input. The rounding - multiple has to be a positive value and can be casted to the first input type. - For example, 100 corresponds to rounding to the nearest multiple of 100 + multiple has to be a positive value and can be casted to the first input type. + For example, 100 corresponds to rounding to the nearest multiple of 100 (zeroing the ones and tens digits). For ``round`` functions, the following rounding modes are available. @@ -634,8 +634,8 @@ The example values are given for default values of ``ndigits`` and ``multiple``. | | | -3.5 -> -3, -4.5 -> -5 | +-----------------------+--------------------------------------------------------------+---------------------------+ -The following table gives examples of how ``ndigits`` (for the ``round`` -and ``round_binary`` functions) and ``multiple`` (for ``round_to_multiple``) +The following table gives examples of how ``ndigits`` (for the ``round`` +and ``round_binary`` functions) and ``multiple`` (for ``round_to_multiple``) influence the operation performed, respectively. +--------------------+-------------------+---------------------------+ @@ -1621,15 +1621,14 @@ Array-wise ("vector") functions Cumulative Functions ~~~~~~~~~~~~~~~~~~~~ -Cumulative functions are vector functions that perform a running accumulation on -their input using a given binary associative operation with an identidy element -(a monoid) and output an array containing the corresponding intermediate running -values. The input is expected to be of numeric type. By default these functions -do not detect overflow. They are alsoavailable in an overflow-checking variant, -suffixed ``_checked``, which returns an ``Invalid`` :class:`Status` when +Cumulative functions are vector functions that perform a running accumulation on +their input using a given binary associative operation with an identidy element +(a monoid) and output an array containing the corresponding intermediate running +values. The input is expected to be of numeric type. By default these functions +do not detect overflow. They are alsoavailable in an overflow-checking variant, +suffixed ``_checked``, which returns an ``Invalid`` :class:`Status` when overflow is detected. -<<<<<<< HEAD +-------------------------+-------+-------------+-------------+--------------------------------+-----------+ | Function name | Arity | Input types | Output type | Options class | Notes | +=========================+=======+=============+=============+================================+===========+ @@ -1647,28 +1646,11 @@ overflow is detected. 
+-------------------------+-------+-------------+-------------+--------------------------------+-----------+ | cumulative_mean | Unary | Numeric | Float64 | :struct:`CumulativeOptions` | \(1) \(2) | +-------------------------+-------+-------------+-------------+--------------------------------+-----------+ -======= -+-------------------------+-------+-------------+-------------+--------------------------------+-------+ -| Function name | Arity | Input types | Output type | Options class | Notes | -+=========================+=======+=============+=============+================================+=======+ -| cumulative_sum | Unary | Numeric | Numeric | :struct:`CumulativeOptions` | \(1) | -+-------------------------+-------+-------------+-------------+--------------------------------+-------+ -| cumulative_sum_checked | Unary | Numeric | Numeric | :struct:`CumulativeOptions` | \(1) | -+-------------------------+-------+-------------+-------------+--------------------------------+-------+ -| cumulative_prod | Unary | Numeric | Numeric | :struct:`CumulativeOptions` | \(1) | -+-------------------------+-------+-------------+-------------+--------------------------------+-------+ -| cumulative_prod_checked | Unary | Numeric | Numeric | :struct:`CumulativeOptions` | \(1) | -+-------------------------+-------+-------------+-------------+--------------------------------+-------+ -| cumulative_max | Unary | Numeric | Numeric | :struct:`CumulativeOptions` | \(1) | -+-------------------------+-------+-------------+-------------+--------------------------------+-------+ -| cumulative_min | Unary | Numeric | Numeric | :struct:`CumulativeOptions` | \(1) | -+-------------------------+-------+-------------+-------------+--------------------------------+-------+ ->>>>>>> 9b9f7a317 (Fix table format) * \(1) CumulativeOptions has two optional parameters. The first parameter :member:`CumulativeOptions::start` is a starting value for the running - accumulation. It has a default value of 0 for `sum`, 1 for `prod`, min of - input type for `max`, and max of input type for `min`. Specified values of + accumulation. It has a default value of 0 for `sum`, 1 for `prod`, min of + input type for `max`, and max of input type for `min`. Specified values of ``start`` must be castable to the input type. The second parameter :member:`CumulativeOptions::skip_nulls` is a boolean. When set to false (the default), the first encountered null is propagated. When set to @@ -1878,9 +1860,9 @@ replaced, based on the remaining inputs. Pairwise functions ~~~~~~~~~~~~~~~~~~~~ -Pairwise functions are unary vector functions that perform a binary operation on +Pairwise functions are unary vector functions that perform a binary operation on a pair of elements in the input array, typically on adjacent elements. The n-th -output is computed by applying the binary operation to the n-th and (n-p)-th inputs, +output is computed by applying the binary operation to the n-th and (n-p)-th inputs, where p is the period. The default period is 1, in which case the binary operation is applied to adjacent pairs of inputs. The period can also be negative, in which case the n-th output is computed by applying the binary @@ -1894,9 +1876,9 @@ operation to the n-th and (n+abs(p))-th inputs. 
| pairwise_diff_checked | Unary | Numeric/Temporal | Numeric/Temporal | :struct:`PairwiseOptions` | \(1)(3) | +------------------------+-------+----------------------+----------------------+--------------------------------+----------+ -* \(1) Computes the first order difference of an array, It internally calls - the scalar function ``Subtract`` (or the checked variant) to compute - differences, so its behavior and supported types are the same as - ``Subtract``. The period can be specified in :struct:`PairwiseOptions`. +* \(1) Computes the first order difference of an array, It internally calls + the scalar function ``Subtract`` (or the checked variant) to compute + differences, so its behavior and supported types are the same as + ``Subtract``. The period can be specified in :struct:`PairwiseOptions`. * \(2) Wraps around the result when overflow is detected. * \(3) Returns an ``Invalid`` :class:`Status` when overflow is detected. From 2da3f31d00b5394d97cb9e9ad55163927f97f3fc Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Wed, 9 Aug 2023 16:35:38 -0400 Subject: [PATCH 03/41] Revert unnecessary edits of compute.rst --- docs/source/cpp/compute.rst | 56 ++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 66d393a5094..44f43cbc877 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -49,8 +49,8 @@ Computation inputs are represented as a general :class:`Datum` class, which is a tagged union of several shapes of data such as :class:`Scalar`, :class:`Array` and :class:`ChunkedArray`. Many compute functions support both array (chunked or not) and scalar inputs, however some will mandate -particular input types. For example, while ``array_sort_indices`` requires its -first and only input to be an array, the generalized ``sort_indices`` +particular input types. For example, while ``array_sort_indices`` requires its +first and only input to be an array, the generalized ``sort_indices`` function accepts an array, chunked array, record batch or table. .. _invoking-compute-functions: @@ -572,28 +572,28 @@ representation based on the rounding criterion. | trunc | Unary | Numeric | Float32/Float64/Decimal | | | +-------------------+------------+-------------+-------------------------+----------------------------------+--------+ -* \(1) By default rounding functions change a value to the nearest - integer using HALF_TO_EVEN to resolve ties. Options are available to control - the rounding criterion. All ``round`` functions have the +* \(1) By default rounding functions change a value to the nearest + integer using HALF_TO_EVEN to resolve ties. Options are available to control + the rounding criterion. All ``round`` functions have the ``round_mode`` option to set the rounding mode. * \(2) Round to a number of digits where the ``ndigits`` option of :struct:`RoundOptions` specifies the rounding precision in terms of number of digits. A negative value corresponds to digits in the non-fractional part. For example, -2 corresponds to rounding to the nearest multiple of 100 (zeroing the ones and tens digits). Default value of ``ndigits`` is 0 - which rounds to the nearest integer. For integer inputs a non-negative + which rounds to the nearest integer. For integer inputs a non-negative ``ndigits`` value is ignored and the input is returned unchanged. 
For integer - inputs, if ``-ndigits`` is larger than the maximum number of digits the + inputs, if ``-ndigits`` is larger than the maximum number of digits the input type can hold, an error is returned. * \(3) Round to a multiple where the ``multiple`` option of :struct:`RoundToMultipleOptions` specifies the rounding scale. The rounding - multiple has to be a positive value and can be casted to input type. - For example, 100 corresponds to rounding to the nearest multiple of 100 - (zeroing the ones and tens digits). Default value of ``multiple`` is 1 which + multiple has to be a positive value and can be casted to input type. + For example, 100 corresponds to rounding to the nearest multiple of 100 + (zeroing the ones and tens digits). Default value of ``multiple`` is 1 which rounds to the nearest integer. * \(4) Round the first input to multiple of the second input. The rounding - multiple has to be a positive value and can be casted to the first input type. - For example, 100 corresponds to rounding to the nearest multiple of 100 + multiple has to be a positive value and can be casted to the first input type. + For example, 100 corresponds to rounding to the nearest multiple of 100 (zeroing the ones and tens digits). For ``round`` functions, the following rounding modes are available. @@ -634,8 +634,8 @@ The example values are given for default values of ``ndigits`` and ``multiple``. | | | -3.5 -> -3, -4.5 -> -5 | +-----------------------+--------------------------------------------------------------+---------------------------+ -The following table gives examples of how ``ndigits`` (for the ``round`` -and ``round_binary`` functions) and ``multiple`` (for ``round_to_multiple``) +The following table gives examples of how ``ndigits`` (for the ``round`` +and ``round_binary`` functions) and ``multiple`` (for ``round_to_multiple``) influence the operation performed, respectively. +--------------------+-------------------+---------------------------+ @@ -1621,12 +1621,12 @@ Array-wise ("vector") functions Cumulative Functions ~~~~~~~~~~~~~~~~~~~~ -Cumulative functions are vector functions that perform a running accumulation on -their input using a given binary associative operation with an identidy element -(a monoid) and output an array containing the corresponding intermediate running -values. The input is expected to be of numeric type. By default these functions -do not detect overflow. They are alsoavailable in an overflow-checking variant, -suffixed ``_checked``, which returns an ``Invalid`` :class:`Status` when +Cumulative functions are vector functions that perform a running accumulation on +their input using a given binary associative operation with an identidy element +(a monoid) and output an array containing the corresponding intermediate running +values. The input is expected to be of numeric type. By default these functions +do not detect overflow. They are alsoavailable in an overflow-checking variant, +suffixed ``_checked``, which returns an ``Invalid`` :class:`Status` when overflow is detected. +-------------------------+-------+-------------+-------------+--------------------------------+-----------+ @@ -1649,8 +1649,8 @@ overflow is detected. * \(1) CumulativeOptions has two optional parameters. The first parameter :member:`CumulativeOptions::start` is a starting value for the running - accumulation. It has a default value of 0 for `sum`, 1 for `prod`, min of - input type for `max`, and max of input type for `min`. Specified values of + accumulation. 
It has a default value of 0 for `sum`, 1 for `prod`, min of + input type for `max`, and max of input type for `min`. Specified values of ``start`` must be castable to the input type. The second parameter :member:`CumulativeOptions::skip_nulls` is a boolean. When set to false (the default), the first encountered null is propagated. When set to @@ -1860,9 +1860,9 @@ replaced, based on the remaining inputs. Pairwise functions ~~~~~~~~~~~~~~~~~~~~ -Pairwise functions are unary vector functions that perform a binary operation on +Pairwise functions are unary vector functions that perform a binary operation on a pair of elements in the input array, typically on adjacent elements. The n-th -output is computed by applying the binary operation to the n-th and (n-p)-th inputs, +output is computed by applying the binary operation to the n-th and (n-p)-th inputs, where p is the period. The default period is 1, in which case the binary operation is applied to adjacent pairs of inputs. The period can also be negative, in which case the n-th output is computed by applying the binary @@ -1876,9 +1876,9 @@ operation to the n-th and (n+abs(p))-th inputs. | pairwise_diff_checked | Unary | Numeric/Temporal | Numeric/Temporal | :struct:`PairwiseOptions` | \(1)(3) | +------------------------+-------+----------------------+----------------------+--------------------------------+----------+ -* \(1) Computes the first order difference of an array, It internally calls - the scalar function ``Subtract`` (or the checked variant) to compute - differences, so its behavior and supported types are the same as - ``Subtract``. The period can be specified in :struct:`PairwiseOptions`. +* \(1) Computes the first order difference of an array, It internally calls + the scalar function ``Subtract`` (or the checked variant) to compute + differences, so its behavior and supported types are the same as + ``Subtract``. The period can be specified in :struct:`PairwiseOptions`. * \(2) Wraps around the result when overflow is detected. * \(3) Returns an ``Invalid`` :class:`Status` when overflow is detected. From 01025e3f7410a0b20ba5dbb5b48db8c95628f2fe Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Thu, 10 Aug 2023 10:07:48 -0400 Subject: [PATCH 04/41] Try class instead of classmethod in __reduce__ --- python/pyarrow/_dataset_parquet.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx index 79bd270ce54..b8840f9d866 100644 --- a/python/pyarrow/_dataset_parquet.pyx +++ b/python/pyarrow/_dataset_parquet.pyx @@ -786,7 +786,7 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): thrift_string_size_limit=self.thrift_string_size_limit, thrift_container_size_limit=self.thrift_container_size_limit, ) - return ParquetFragmentScanOptions._reconstruct, (kwargs,) + return ParquetFragmentScanOptions, (kwargs,) cdef class ParquetFactoryOptions(_Weakrefable): From cf1ea3ffa1682d9c74021489fe18beabeee75cc1 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Thu, 10 Aug 2023 10:22:33 -0400 Subject: [PATCH 05/41] Revert "Try class instead of classmethod in __reduce__" This reverts commit 074edbdedbb527fe3fea99aa522ebb7014558b7b. 
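Note on the failure mode being reverted: __reduce__ hands its argument
tuple to the returned callable positionally, so returning the class
itself makes unpickling call the constructor with the kwargs dict as a
single positional argument. A minimal pure-Python sketch of that
failure (the Options class is a hypothetical stand-in, not the pyarrow
type):

    import pickle

    class Options:
        def __init__(self, *, level=0):
            self.level = level

        def __reduce__(self):
            # Mirrors the reverted change: unpickling calls
            # Options({'level': ...}), passing the dict as one
            # positional argument rather than as keyword arguments.
            return Options, (dict(level=self.level),)

    try:
        pickle.loads(pickle.dumps(Options(level=5)))
    except TypeError as exc:
        # e.g. "__init__() takes 1 positional argument but 2 were given"
        print(exc)

A reconstructor that unpacks the dict with **kwargs, like the
_reconstruct helper restored below, avoids this.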
--- python/pyarrow/_dataset_parquet.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx index b8840f9d866..79bd270ce54 100644 --- a/python/pyarrow/_dataset_parquet.pyx +++ b/python/pyarrow/_dataset_parquet.pyx @@ -786,7 +786,7 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): thrift_string_size_limit=self.thrift_string_size_limit, thrift_container_size_limit=self.thrift_container_size_limit, ) - return ParquetFragmentScanOptions, (kwargs,) + return ParquetFragmentScanOptions._reconstruct, (kwargs,) cdef class ParquetFactoryOptions(_Weakrefable): From e2d631eb0bd84bb811b99caa38eb297e14d2be65 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Fri, 11 Aug 2023 10:24:19 -0400 Subject: [PATCH 06/41] Fix test_fragments_repr --- python/pyarrow/_dataset.pyx | 2 +- python/pyarrow/tests/test_dataset.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index d29fa125e20..29d74446519 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -1838,7 +1838,7 @@ cdef class FileFragment(Fragment): typ = "" partition_dict = get_partition_keys(self.partition_expression) partition = ", ".join( - [f"{key}={val}" for key, val in partition_dict.items()] + sorted([f"{key}={val}" for key, val in partition_dict.items()]) ) if partition: partition = f" partition=[{partition}]" diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index b8a0c380899..18a35d72090 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -1617,7 +1617,7 @@ def test_fragments_repr(tempdir, dataset): assert ( repr(fragment) == "" + "partition=[group=1, key=xxx]>" ) # single-file parquet dataset (no partition information in repr) From d1e7d8a6de23cb8bba33153e56afe165a5380607 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Fri, 11 Aug 2023 16:11:33 -0400 Subject: [PATCH 07/41] Update substrait test --- python/pyarrow/tests/test_substrait.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/python/pyarrow/tests/test_substrait.py b/python/pyarrow/tests/test_substrait.py index 5dda2cfcf09..bb942dddca1 100644 --- a/python/pyarrow/tests/test_substrait.py +++ b/python/pyarrow/tests/test_substrait.py @@ -243,12 +243,9 @@ def test_named_table_invalid_table_name(): test_table_1 = pa.Table.from_pydict({"x": [1, 2, 3]}) def table_provider(names, _): - if not names: - raise Exception("No names provided") - elif names[0] == "t1": + if names and names[0] == "t1": return test_table_1 - else: - raise Exception("Unrecognized table name") + return None substrait_query = """ { From f2937d88028eeb48053424220e801696671d7a6c Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Fri, 11 Aug 2023 16:40:30 -0400 Subject: [PATCH 08/41] Try noexcept on substrait function --- python/pyarrow/includes/libarrow_substrait.pxd | 2 +- python/pyarrow/tests/test_substrait.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/includes/libarrow_substrait.pxd b/python/pyarrow/includes/libarrow_substrait.pxd index c41f4c05d3a..622be20448f 100644 --- a/python/pyarrow/includes/libarrow_substrait.pxd +++ b/python/pyarrow/includes/libarrow_substrait.pxd @@ -72,6 +72,6 @@ cdef extern from "arrow/engine/substrait/util.h" namespace "arrow::engine" nogil CResult[shared_ptr[CRecordBatchReader]] ExecuteSerializedPlan( const CBuffer& substrait_buffer, const 
ExtensionIdRegistry* registry, CFunctionRegistry* func_registry, const CConversionOptions& conversion_options, - c_bool use_threads) + c_bool use_threads) noexcept CResult[shared_ptr[CBuffer]] SerializeJsonPlan(const c_string& substrait_json) diff --git a/python/pyarrow/tests/test_substrait.py b/python/pyarrow/tests/test_substrait.py index bb942dddca1..5dda2cfcf09 100644 --- a/python/pyarrow/tests/test_substrait.py +++ b/python/pyarrow/tests/test_substrait.py @@ -243,9 +243,12 @@ def test_named_table_invalid_table_name(): test_table_1 = pa.Table.from_pydict({"x": [1, 2, 3]}) def table_provider(names, _): - if names and names[0] == "t1": + if not names: + raise Exception("No names provided") + elif names[0] == "t1": return test_table_1 - return None + else: + raise Exception("Unrecognized table name") substrait_query = """ { From 25ba02808a31d7289d15b1c87789fed4554f172c Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Fri, 11 Aug 2023 17:15:19 -0400 Subject: [PATCH 09/41] Move the noexcept to the correct spot --- python/pyarrow/_substrait.pyx | 2 +- python/pyarrow/includes/libarrow_substrait.pxd | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/_substrait.pyx b/python/pyarrow/_substrait.pyx index 4efad2c4d1b..4d5810e8b25 100644 --- a/python/pyarrow/_substrait.pyx +++ b/python/pyarrow/_substrait.pyx @@ -29,7 +29,7 @@ from pyarrow.includes.libarrow_substrait cimport * cdef CDeclaration _create_named_table_provider( dict named_args, const std_vector[c_string]& names, const CSchema& schema -): +) noexcept: cdef: c_string c_name shared_ptr[CTable] c_in_table diff --git a/python/pyarrow/includes/libarrow_substrait.pxd b/python/pyarrow/includes/libarrow_substrait.pxd index 622be20448f..c41f4c05d3a 100644 --- a/python/pyarrow/includes/libarrow_substrait.pxd +++ b/python/pyarrow/includes/libarrow_substrait.pxd @@ -72,6 +72,6 @@ cdef extern from "arrow/engine/substrait/util.h" namespace "arrow::engine" nogil CResult[shared_ptr[CRecordBatchReader]] ExecuteSerializedPlan( const CBuffer& substrait_buffer, const ExtensionIdRegistry* registry, CFunctionRegistry* func_registry, const CConversionOptions& conversion_options, - c_bool use_threads) noexcept + c_bool use_threads) CResult[shared_ptr[CBuffer]] SerializeJsonPlan(const c_string& substrait_json) From 097f6f34939597e215e3505b8d1818532b57c7bd Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Mon, 14 Aug 2023 18:10:23 -0400 Subject: [PATCH 10/41] Fix cloudpickle test --- python/pyarrow/_dataset_parquet.pyx | 13 ++++++++++++- python/pyarrow/_fs.pyx | 10 +++++++++- python/pyarrow/_gcsfs.pyx | 10 +++++++++- python/pyarrow/_hdfs.pyx | 10 +++++++++- python/pyarrow/_s3fs.pyx | 10 +++++++++- 5 files changed, 48 insertions(+), 5 deletions(-) diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx index 79bd270ce54..7222814f3d0 100644 --- a/python/pyarrow/_dataset_parquet.pyx +++ b/python/pyarrow/_dataset_parquet.pyx @@ -771,6 +771,7 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): other.thrift_container_size_limit) return attrs == other_attrs +<<<<<<< HEAD @staticmethod @binding(True) # Required for Cython < 3 def _reconstruct(kwargs): @@ -778,6 +779,8 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): # reconstructor, hence this wrapper. 
return ParquetFragmentScanOptions(**kwargs) +======= +>>>>>>> 0930ea44c (Fix cloudpickle test) def __reduce__(self): kwargs = dict( use_buffered_stream=self.use_buffered_stream, @@ -786,7 +789,15 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): thrift_string_size_limit=self.thrift_string_size_limit, thrift_container_size_limit=self.thrift_container_size_limit, ) - return ParquetFragmentScanOptions._reconstruct, (kwargs,) + return _reconstruct_parquet_fragment_scan_options, (kwargs,) + + +def _reconstruct_parquet_fragment_scan_options(kwargs): + # __reduce__ doesn't allow passing named arguments directly to the + # reconstructor, hence this wrapper. + # In Cython >= 3.0.0, function binding is turned on by default, so + # a global static method is used (instead of a class method) for pickling. + return ParquetFragmentScanOptions(**kwargs) cdef class ParquetFactoryOptions(_Weakrefable): diff --git a/python/pyarrow/_fs.pyx b/python/pyarrow/_fs.pyx index ef8db31bfc2..40f0132afb0 100644 --- a/python/pyarrow/_fs.pyx +++ b/python/pyarrow/_fs.pyx @@ -1116,10 +1116,18 @@ cdef class LocalFileSystem(FileSystem): def __reduce__(self): cdef CLocalFileSystemOptions opts = self.localfs.options() - return LocalFileSystem._reconstruct, (dict( + return _reconstruct_local_file_system, (dict( use_mmap=opts.use_mmap),) +def _reconstruct_local_file_system(kwargs): + # __reduce__ doesn't allow passing named arguments directly to the + # reconstructor, hence this wrapper. + # In Cython >= 3.0.0, function binding is turned on by default, so + # a global static method is used (instead of a class method) for pickling. + return LocalFileSystem(**kwargs) + + cdef class SubTreeFileSystem(FileSystem): """ Delegates to another implementation after prepending a fixed base path. diff --git a/python/pyarrow/_gcsfs.pyx b/python/pyarrow/_gcsfs.pyx index 5e69413cea9..20781bdd125 100644 --- a/python/pyarrow/_gcsfs.pyx +++ b/python/pyarrow/_gcsfs.pyx @@ -182,7 +182,7 @@ cdef class GcsFileSystem(FileSystem): if opts.project_id.has_value(): project_id = frombytes(opts.project_id.value()) return ( - GcsFileSystem._reconstruct, (dict( + _reconstruct_gcs_file_system, (dict( access_token=frombytes(opts.credentials.access_token()), anonymous=opts.credentials.anonymous(), credential_token_expiration=expiration_dt, @@ -210,3 +210,11 @@ cdef class GcsFileSystem(FileSystem): """ if self.gcsfs.options().project_id.has_value(): return frombytes(self.gcsfs.options().project_id.value()) + + +def _reconstruct_gcs_file_system(kwargs): + # __reduce__ doesn't allow passing named arguments directly to the + # reconstructor, hence this wrapper. + # In Cython >= 3.0.0, function binding is turned on by default, so + # a global static method is used (instead of a class method) for pickling. + return GcsFileSystem(**kwargs) diff --git a/python/pyarrow/_hdfs.pyx b/python/pyarrow/_hdfs.pyx index c426337a12e..52cfe6c4287 100644 --- a/python/pyarrow/_hdfs.pyx +++ b/python/pyarrow/_hdfs.pyx @@ -146,7 +146,7 @@ replication=1)`` def __reduce__(self): cdef CHdfsOptions opts = self.hdfs.options() return ( - HadoopFileSystem._reconstruct, (dict( + _reconstruct_hdfs_file_system, (dict( host=frombytes(opts.connection_config.host), port=opts.connection_config.port, user=frombytes(opts.connection_config.user), @@ -158,3 +158,11 @@ replication=1)`` for k, v in opts.connection_config.extra_conf}, ),) ) + + +def _reconstruct_hdfs_file_system(kwargs): + # __reduce__ doesn't allow passing named arguments directly to the + # reconstructor, hence this wrapper. 
+ # In Cython >= 3.0.0, function binding is turned on by default, so + # a global static method is used (instead of a class method) for pickling. + return HadoopFileSystem(**kwargs) diff --git a/python/pyarrow/_s3fs.pyx b/python/pyarrow/_s3fs.pyx index ab451713699..6f0e99033cc 100644 --- a/python/pyarrow/_s3fs.pyx +++ b/python/pyarrow/_s3fs.pyx @@ -412,7 +412,7 @@ cdef class S3FileSystem(FileSystem): session_token = None return ( - S3FileSystem._reconstruct, (dict( + _reconstruct_s3fs_file_system, (dict( access_key=access_key, secret_key=secret_key, session_token=session_token, @@ -447,3 +447,11 @@ cdef class S3FileSystem(FileSystem): The AWS region this filesystem connects to. """ return frombytes(self.s3fs.region()) + + +def _reconstruct_s3fs_file_system(kwargs): + # __reduce__ doesn't allow passing named arguments directly to the + # reconstructor, hence this wrapper. + # In Cython >= 3.0.0, function binding is turned on by default, so + # a global static method is used (instead of a class method) for pickling. + return S3FileSystem(**kwargs) From 3263eafc49a7e1d3f3b98c0bc870b8566787c2df Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Tue, 15 Aug 2023 10:30:15 -0400 Subject: [PATCH 11/41] Also run test_fs.py with cloudpickle --- python/pyarrow/tests/test_fs.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index 8135f70f690..5c5a0e276af 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -438,6 +438,28 @@ def allow_append_to_file(request, filesystem_config): return filesystem_config['allow_append_to_file'] +@pytest.fixture( + params=[ + pytest.lazy_fixture('builtin_pickle'), + pytest.lazy_fixture('cloudpickle') + ] + ) +def pickle(request): + return request.param + + +@pytest.fixture +def builtin_pickle(): + import pickle + return pickle + + +@pytest.fixture +def cloudpickle(): + cp = pytest.importorskip('cloudpickle') + return cp + + def check_mtime(file_info): assert isinstance(file_info.mtime, datetime) assert isinstance(file_info.mtime_ns, int) From 5241c247adea9da17bf142780a20c5d868cd3be7 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Tue, 15 Aug 2023 10:32:30 -0400 Subject: [PATCH 12/41] Lint --- python/pyarrow/tests/test_fs.py | 10 +++++----- python/some_path | 0 2 files changed, 5 insertions(+), 5 deletions(-) create mode 100644 python/some_path diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index 5c5a0e276af..6820cdeb809 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -439,11 +439,11 @@ def allow_append_to_file(request, filesystem_config): @pytest.fixture( - params=[ - pytest.lazy_fixture('builtin_pickle'), - pytest.lazy_fixture('cloudpickle') - ] - ) + params=[ + pytest.lazy_fixture('builtin_pickle'), + pytest.lazy_fixture('cloudpickle') + ] +) def pickle(request): return request.param diff --git a/python/some_path b/python/some_path new file mode 100644 index 00000000000..e69de29bb2d From 021e133f4c2782840b43c278db9a2add7e912705 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Tue, 15 Aug 2023 10:50:43 -0400 Subject: [PATCH 13/41] Delete accidental file --- python/some_path | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 python/some_path diff --git a/python/some_path b/python/some_path deleted file mode 100644 index e69de29bb2d..00000000000 From ee9f049acab45626de90daaa95a26f892417b514 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Thu, 17 Aug 2023 
16:42:53 -0400 Subject: [PATCH 14/41] Make _reconstruct staticmethods --- python/pyarrow/_dataset_parquet.pyx | 17 ++++++++--------- python/pyarrow/_fs.pyx | 10 +--------- python/pyarrow/_gcsfs.pyx | 10 +--------- python/pyarrow/_hdfs.pyx | 10 +--------- python/pyarrow/_s3fs.pyx | 10 +--------- 5 files changed, 12 insertions(+), 45 deletions(-) diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx index 7222814f3d0..3ad85c197e2 100644 --- a/python/pyarrow/_dataset_parquet.pyx +++ b/python/pyarrow/_dataset_parquet.pyx @@ -771,16 +771,23 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): other.thrift_container_size_limit) return attrs == other_attrs +<<<<<<< HEAD <<<<<<< HEAD @staticmethod @binding(True) # Required for Cython < 3 +======= + @staticmethod +>>>>>>> 12e6eb47d (Make _reconstruct staticmethods) def _reconstruct(kwargs): # __reduce__ doesn't allow passing named arguments directly to the # reconstructor, hence this wrapper. return ParquetFragmentScanOptions(**kwargs) +<<<<<<< HEAD ======= >>>>>>> 0930ea44c (Fix cloudpickle test) +======= +>>>>>>> 12e6eb47d (Make _reconstruct staticmethods) def __reduce__(self): kwargs = dict( use_buffered_stream=self.use_buffered_stream, @@ -789,15 +796,7 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): thrift_string_size_limit=self.thrift_string_size_limit, thrift_container_size_limit=self.thrift_container_size_limit, ) - return _reconstruct_parquet_fragment_scan_options, (kwargs,) - - -def _reconstruct_parquet_fragment_scan_options(kwargs): - # __reduce__ doesn't allow passing named arguments directly to the - # reconstructor, hence this wrapper. - # In Cython >= 3.0.0, function binding is turned on by default, so - # a global static method is used (instead of a class method) for pickling. - return ParquetFragmentScanOptions(**kwargs) + return ParquetFragmentScanOptions._reconstruct, (kwargs,) cdef class ParquetFactoryOptions(_Weakrefable): diff --git a/python/pyarrow/_fs.pyx b/python/pyarrow/_fs.pyx index 40f0132afb0..ef8db31bfc2 100644 --- a/python/pyarrow/_fs.pyx +++ b/python/pyarrow/_fs.pyx @@ -1116,18 +1116,10 @@ cdef class LocalFileSystem(FileSystem): def __reduce__(self): cdef CLocalFileSystemOptions opts = self.localfs.options() - return _reconstruct_local_file_system, (dict( + return LocalFileSystem._reconstruct, (dict( use_mmap=opts.use_mmap),) -def _reconstruct_local_file_system(kwargs): - # __reduce__ doesn't allow passing named arguments directly to the - # reconstructor, hence this wrapper. - # In Cython >= 3.0.0, function binding is turned on by default, so - # a global static method is used (instead of a class method) for pickling. - return LocalFileSystem(**kwargs) - - cdef class SubTreeFileSystem(FileSystem): """ Delegates to another implementation after prepending a fixed base path. 
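The _fs.pyx hunk above swaps the module-level wrapper back to a
_reconstruct staticmethod on the class. This works under Cython 3
because functions there default to binding=True, and the explicit
@binding(True) decorator (marked "Required for Cython < 3" in the
_dataset_parquet.pyx hunk) opts older Cython into the same behavior,
which is apparently what lets pickle resolve the staticmethod by its
qualified name. A pure-Python sketch of the round trip being restored
(FileSystemLike is a hypothetical stand-in for the cdef classes
touched here):

    import pickle

    class FileSystemLike:
        def __init__(self, *, use_mmap=False):
            self.use_mmap = use_mmap

        @staticmethod
        def _reconstruct(kwargs):
            # __reduce__ cannot pass keyword arguments directly, so
            # they travel as a single dict and are unpacked here.
            return FileSystemLike(**kwargs)

        def __reduce__(self):
            return FileSystemLike._reconstruct, (dict(use_mmap=self.use_mmap),)

    fs = pickle.loads(pickle.dumps(FileSystemLike(use_mmap=True)))
    assert fs.use_mmap is True
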
diff --git a/python/pyarrow/_gcsfs.pyx b/python/pyarrow/_gcsfs.pyx index 20781bdd125..5e69413cea9 100644 --- a/python/pyarrow/_gcsfs.pyx +++ b/python/pyarrow/_gcsfs.pyx @@ -182,7 +182,7 @@ cdef class GcsFileSystem(FileSystem): if opts.project_id.has_value(): project_id = frombytes(opts.project_id.value()) return ( - _reconstruct_gcs_file_system, (dict( + GcsFileSystem._reconstruct, (dict( access_token=frombytes(opts.credentials.access_token()), anonymous=opts.credentials.anonymous(), credential_token_expiration=expiration_dt, @@ -210,11 +210,3 @@ cdef class GcsFileSystem(FileSystem): """ if self.gcsfs.options().project_id.has_value(): return frombytes(self.gcsfs.options().project_id.value()) - - -def _reconstruct_gcs_file_system(kwargs): - # __reduce__ doesn't allow passing named arguments directly to the - # reconstructor, hence this wrapper. - # In Cython >= 3.0.0, function binding is turned on by default, so - # a global static method is used (instead of a class method) for pickling. - return GcsFileSystem(**kwargs) diff --git a/python/pyarrow/_hdfs.pyx b/python/pyarrow/_hdfs.pyx index 52cfe6c4287..c426337a12e 100644 --- a/python/pyarrow/_hdfs.pyx +++ b/python/pyarrow/_hdfs.pyx @@ -146,7 +146,7 @@ replication=1)`` def __reduce__(self): cdef CHdfsOptions opts = self.hdfs.options() return ( - _reconstruct_hdfs_file_system, (dict( + HadoopFileSystem._reconstruct, (dict( host=frombytes(opts.connection_config.host), port=opts.connection_config.port, user=frombytes(opts.connection_config.user), @@ -158,11 +158,3 @@ replication=1)`` for k, v in opts.connection_config.extra_conf}, ),) ) - - -def _reconstruct_hdfs_file_system(kwargs): - # __reduce__ doesn't allow passing named arguments directly to the - # reconstructor, hence this wrapper. - # In Cython >= 3.0.0, function binding is turned on by default, so - # a global static method is used (instead of a class method) for pickling. - return HadoopFileSystem(**kwargs) diff --git a/python/pyarrow/_s3fs.pyx b/python/pyarrow/_s3fs.pyx index 6f0e99033cc..ab451713699 100644 --- a/python/pyarrow/_s3fs.pyx +++ b/python/pyarrow/_s3fs.pyx @@ -412,7 +412,7 @@ cdef class S3FileSystem(FileSystem): session_token = None return ( - _reconstruct_s3fs_file_system, (dict( + S3FileSystem._reconstruct, (dict( access_key=access_key, secret_key=secret_key, session_token=session_token, @@ -447,11 +447,3 @@ cdef class S3FileSystem(FileSystem): The AWS region this filesystem connects to. """ return frombytes(self.s3fs.region()) - - -def _reconstruct_s3fs_file_system(kwargs): - # __reduce__ doesn't allow passing named arguments directly to the - # reconstructor, hence this wrapper. - # In Cython >= 3.0.0, function binding is turned on by default, so - # a global static method is used (instead of a class method) for pickling. 
- return S3FileSystem(**kwargs) From 0cd34396db5834c739750a1d899da2ad67a3d461 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Thu, 17 Aug 2023 17:11:49 -0400 Subject: [PATCH 15/41] Add test for MapScalar.__iter__ --- python/pyarrow/tests/test_scalars.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py index 5f6c8c813f1..8a1dcfb057f 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -700,6 +700,10 @@ def test_map(pickle_module): for i, j in zip(s, v): assert i == j + # test iteration with missing values + for _ in pa.scalar(None, type=ty): + pass + assert s.as_py() == v assert s[1] == ( pa.scalar('b', type=pa.string()), From 148d2857bf7bf7925d54bc924ccde3336bd80ebe Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Fri, 18 Aug 2023 10:13:21 -0400 Subject: [PATCH 16/41] Parametrize all pickling tests to use both the pickle and cloudpickle modules (caught 1 bug in DictionaryScalar) --- python/pyarrow/tests/test_csv.py | 3 +++ python/pyarrow/tests/test_dataset.py | 4 ++++ python/pyarrow/tests/test_fs.py | 22 ---------------------- 3 files changed, 7 insertions(+), 22 deletions(-) diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py index afc5380b755..c0e610bee2c 100644 --- a/python/pyarrow/tests/test_csv.py +++ b/python/pyarrow/tests/test_csv.py @@ -1465,6 +1465,9 @@ class TestSerialCSVTableRead(BaseCSVTableRead): def use_threads(self): return False + def test_invalid_row_handler(self, pickle_module): + BaseCSVTableRead.test_invalid_row_handler(self, pickle_module) + class TestThreadedCSVTableRead(BaseCSVTableRead): @property diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 18a35d72090..07f20de36b5 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -1679,8 +1679,12 @@ def test_partitioning_factory(mockfs, pickled, pickle_module): @pytest.mark.parametrize('infer_dictionary', [False, True]) @pytest.mark.parametrize( "pickled", [lambda x, m: x, lambda x, m: m.loads(m.dumps(x))]) +<<<<<<< HEAD def test_partitioning_factory_dictionary(mockfs, infer_dictionary, pickled, pickle_module): +======= +def test_partitioning_factory_dictionary(mockfs, infer_dictionary, pickled, pickle_module): +>>>>>>> b4dac3a4c (Parametrize all pickling tests to use both the pickle and cloudpickle modules (caught 1 bug in DictionaryScalar)) paths_or_selector = fs.FileSelector('subdir', recursive=True) format = ds.ParquetFileFormat() options = ds.FileSystemFactoryOptions('subdir') diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index 6820cdeb809..8135f70f690 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -438,28 +438,6 @@ def allow_append_to_file(request, filesystem_config): return filesystem_config['allow_append_to_file'] -@pytest.fixture( - params=[ - pytest.lazy_fixture('builtin_pickle'), - pytest.lazy_fixture('cloudpickle') - ] -) -def pickle(request): - return request.param - - -@pytest.fixture -def builtin_pickle(): - import pickle - return pickle - - -@pytest.fixture -def cloudpickle(): - cp = pytest.importorskip('cloudpickle') - return cp - - def check_mtime(file_info): assert isinstance(file_info.mtime, datetime) assert isinstance(file_info.mtime_ns, int) From 1d99984b72e344796a9daab8e9d81f7af677502d Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Fri, 18 Aug 2023 10:16:59 -0400 Subject: [PATCH 17/41] Lint 
--- python/pyarrow/tests/test_dataset.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 07f20de36b5..18a35d72090 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -1679,12 +1679,8 @@ def test_partitioning_factory(mockfs, pickled, pickle_module): @pytest.mark.parametrize('infer_dictionary', [False, True]) @pytest.mark.parametrize( "pickled", [lambda x, m: x, lambda x, m: m.loads(m.dumps(x))]) -<<<<<<< HEAD def test_partitioning_factory_dictionary(mockfs, infer_dictionary, pickled, pickle_module): -======= -def test_partitioning_factory_dictionary(mockfs, infer_dictionary, pickled, pickle_module): ->>>>>>> b4dac3a4c (Parametrize all pickling tests to use both the pickle and cloudpickle modules (caught 1 bug in DictionaryScalar)) paths_or_selector = fs.FileSelector('subdir', recursive=True) format = ds.ParquetFileFormat() options = ds.FileSystemFactoryOptions('subdir') From fe24f0bafb45d1140aee9f7cf2c27a027ced979a Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Fri, 18 Aug 2023 10:23:38 -0400 Subject: [PATCH 18/41] Remove unnecessary test function --- python/pyarrow/tests/test_csv.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py index c0e610bee2c..afc5380b755 100644 --- a/python/pyarrow/tests/test_csv.py +++ b/python/pyarrow/tests/test_csv.py @@ -1465,9 +1465,6 @@ class TestSerialCSVTableRead(BaseCSVTableRead): def use_threads(self): return False - def test_invalid_row_handler(self, pickle_module): - BaseCSVTableRead.test_invalid_row_handler(self, pickle_module) - class TestThreadedCSVTableRead(BaseCSVTableRead): @property From ac2d11f5a14f30faafa9a18478c7ba0567c7d6f1 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Mon, 21 Aug 2023 15:11:49 -0400 Subject: [PATCH 19/41] Add Cython<3 dev CI job --- ci/docker/conda-python-cython2.dockerfile | 24 +++++++++++++++++++ dev/tasks/tasks.yml | 8 +++++++ docker-compose.yml | 28 +++++++++++++++++++++++ 3 files changed, 60 insertions(+) create mode 100644 ci/docker/conda-python-cython2.dockerfile diff --git a/ci/docker/conda-python-cython2.dockerfile b/ci/docker/conda-python-cython2.dockerfile new file mode 100644 index 00000000000..cff4bee37f5 --- /dev/null +++ b/ci/docker/conda-python-cython2.dockerfile @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +ARG repo +ARG arch=amd64 +ARG python=3.8 +FROM ${repo}:${arch}-conda-python-${python} + +RUN mamba install -q -y cython<3 && \ + mamba clean --all diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index ed238778635..c982244ae93 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -1286,6 +1286,14 @@ tasks: PYTHON: "3.10" image: conda-python-substrait + test-conda-python-3.11-cython2: + ci: github + template: docker-tests/github.linux.yml + params: + env: + PYTHON: "3.11" + image: conda-python-cython2 + test-debian-11-python-3: ci: azure template: docker-tests/azure.linux.yml diff --git a/docker-compose.yml b/docker-compose.yml index a79b13c0a5f..5fc57f2e12f 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -119,6 +119,7 @@ x-hierarchy: - conda-python: - conda-python-pandas: - conda-python-docs + - conda-python-cython2 - conda-python-dask - conda-python-hdfs - conda-python-java-integration @@ -1349,6 +1350,33 @@ services: /arrow/ci/scripts/java_build.sh /arrow /build /tmp/dist/java && /arrow/ci/scripts/java_cdata_integration.sh /arrow /tmp/dist/java" ] + conda-python-cython2: + # Usage: + # docker-compose build conda + # docker-compose build conda-cpp + # docker-compose build conda-python-cython2 + # docker-compose run --rm conda-python-cython2 + image: ${REPO}:${ARCH}-conda-python-${PYTHON}-cython2 + build: + context: . + dockerfile: ci/docker/conda-python-cython2.dockerfile + cache_from: + - ${REPO}:${ARCH}-conda-python-${PYTHON}-cython2 + args: + repo: ${REPO} + arch: ${ARCH} + python: ${PYTHON} + shm_size: *shm-size + environment: + <<: [*common, *ccache, *sccache] + PYTEST_ARGS: # inherit + volumes: *conda-volumes + command: &python-conda-command + [" + /arrow/ci/scripts/cpp_build.sh /arrow /build && + /arrow/ci/scripts/python_build.sh /arrow /build && + /arrow/ci/scripts/python_test.sh /arrow"] + ################################## R ######################################## ubuntu-r: From 32a4402290d94bd79e77b92042cda150461e3a61 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Mon, 21 Aug 2023 15:14:16 -0400 Subject: [PATCH 20/41] Fix docker_compose.yml --- docker-compose.yml | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 5fc57f2e12f..96196bcda5a 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1371,11 +1371,7 @@ services: <<: [*common, *ccache, *sccache] PYTEST_ARGS: # inherit volumes: *conda-volumes - command: &python-conda-command - [" - /arrow/ci/scripts/cpp_build.sh /arrow /build && - /arrow/ci/scripts/python_build.sh /arrow /build && - /arrow/ci/scripts/python_test.sh /arrow"] + command: *python-conda-command ################################## R ######################################## From 3b10534653386ec1660e0772647deb2aba0f7f68 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Mon, 21 Aug 2023 15:47:23 -0400 Subject: [PATCH 21/41] Update CI config --- ci/docker/conda-python-cython2.dockerfile | 2 +- dev/tasks/tasks.yml | 4 ++-- docker-compose.yml | 3 ++- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/ci/docker/conda-python-cython2.dockerfile b/ci/docker/conda-python-cython2.dockerfile index cff4bee37f5..1a175af36df 100644 --- a/ci/docker/conda-python-cython2.dockerfile +++ b/ci/docker/conda-python-cython2.dockerfile @@ -16,7 +16,7 @@ # under the License. 
ARG repo -ARG arch=amd64 +ARG arch ARG python=3.8 FROM ${repo}:${arch}-conda-python-${python} diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index c982244ae93..29e038a9224 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -1286,12 +1286,12 @@ tasks: PYTHON: "3.10" image: conda-python-substrait - test-conda-python-3.11-cython2: + test-conda-python-3.10-cython2: ci: github template: docker-tests/github.linux.yml params: env: - PYTHON: "3.11" + PYTHON: "3.10" image: conda-python-cython2 test-debian-11-python-3: diff --git a/docker-compose.yml b/docker-compose.yml index 96196bcda5a..8ae06900c57 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1354,6 +1354,7 @@ services: # Usage: # docker-compose build conda # docker-compose build conda-cpp + # docker-compose build conda-python # docker-compose build conda-python-cython2 # docker-compose run --rm conda-python-cython2 image: ${REPO}:${ARCH}-conda-python-${PYTHON}-cython2 @@ -1368,7 +1369,7 @@ services: python: ${PYTHON} shm_size: *shm-size environment: - <<: [*common, *ccache, *sccache] + <<: [*common, *ccache] PYTEST_ARGS: # inherit volumes: *conda-volumes command: *python-conda-command From bdec8090d2dfbb09d044a57a3dd29979b6455220 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Mon, 21 Aug 2023 17:42:01 -0400 Subject: [PATCH 22/41] Fix dockerfile --- ci/docker/conda-python-cython2.dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/docker/conda-python-cython2.dockerfile b/ci/docker/conda-python-cython2.dockerfile index 1a175af36df..ad5ce53c7f4 100644 --- a/ci/docker/conda-python-cython2.dockerfile +++ b/ci/docker/conda-python-cython2.dockerfile @@ -20,5 +20,5 @@ ARG arch ARG python=3.8 FROM ${repo}:${arch}-conda-python-${python} -RUN mamba install -q -y cython<3 && \ +RUN mamba install -q -y cython"<3" && \ mamba clean --all From 2df074f7650664f965a3d6e8c29338f455ecfa77 Mon Sep 17 00:00:00 2001 From: Dane Pitkin <48041712+danepitkin@users.noreply.github.com> Date: Mon, 28 Aug 2023 12:03:15 -0400 Subject: [PATCH 23/41] Apply suggestions from code review Fix rebase --- python/pyarrow/_dataset_parquet.pyx | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx index 3ad85c197e2..79bd270ce54 100644 --- a/python/pyarrow/_dataset_parquet.pyx +++ b/python/pyarrow/_dataset_parquet.pyx @@ -771,23 +771,13 @@ cdef class ParquetFragmentScanOptions(FragmentScanOptions): other.thrift_container_size_limit) return attrs == other_attrs -<<<<<<< HEAD -<<<<<<< HEAD @staticmethod @binding(True) # Required for Cython < 3 -======= - @staticmethod ->>>>>>> 12e6eb47d (Make _reconstruct staticmethods) def _reconstruct(kwargs): # __reduce__ doesn't allow passing named arguments directly to the # reconstructor, hence this wrapper. 
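        # (Illustrative note: Cython 3 compiles functions with binding
        # semantics by default, so this staticmethod can be pickled by
        # reference as-is; under Cython < 3 the @binding(True) directive
        # above is needed for the reconstructor to behave the same way.)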
return ParquetFragmentScanOptions(**kwargs) -<<<<<<< HEAD -======= ->>>>>>> 0930ea44c (Fix cloudpickle test) -======= ->>>>>>> 12e6eb47d (Make _reconstruct staticmethods) def __reduce__(self): kwargs = dict( use_buffered_stream=self.use_buffered_stream, From 91257db44e78e4904208ffb8971842d0e874669c Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Mon, 28 Aug 2023 12:14:42 -0400 Subject: [PATCH 24/41] Handle repr non-determinism in test case --- python/pyarrow/_dataset.pyx | 2 +- python/pyarrow/tests/test_dataset.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 29d74446519..d29fa125e20 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -1838,7 +1838,7 @@ cdef class FileFragment(Fragment): typ = "" partition_dict = get_partition_keys(self.partition_expression) partition = ", ".join( - sorted([f"{key}={val}" for key, val in partition_dict.items()]) + [f"{key}={val}" for key, val in partition_dict.items()] ) if partition: partition = f" partition=[{partition}]" diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 18a35d72090..e1458f1c2e1 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -1615,6 +1615,10 @@ def test_fragments_repr(tempdir, dataset): # partitioned parquet dataset fragment = list(dataset.get_fragments())[0] assert ( + # Ordering of partition items is non-deterministic + repr(fragment) == + "" or repr(fragment) == "" From de35878f850cf3cbfa3683e07681b8d475a1c6d7 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Mon, 28 Aug 2023 17:30:40 -0400 Subject: [PATCH 25/41] Ignore numpydocs warnings --- python/pyarrow/_dataset.pyx | 3 ++- python/pyarrow/includes/libarrow.pxd | 2 +- python/pyarrow/includes/libarrow_fs.pxd | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index d29fa125e20..480a17c6763 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -1077,7 +1077,8 @@ cdef class FileSystemDataset(Dataset): @classmethod def from_paths(cls, paths, schema=None, format=None, - filesystem=None, partitions=None, root_partition=None): + filesystem=None, partitions=None, + root_partition=None): # numpydoc ignore=PR01 """A Dataset created from a list of paths on a particular filesystem. Parameters diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index f4d6541fa72..3aaf981b77d 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1554,7 +1554,7 @@ cdef extern from "arrow/ipc/api.h" namespace "arrow::ipc" nogil: # TODO: use "cpdef enum class" to automatically get a Python wrapper? 
# See # https://github.com/cython/cython/commit/2c7c22f51405299a4e247f78edf52957d30cf71d#diff-61c1365c0f761a8137754bb3a73bfbf7 - ctypedef enum CMetadataVersion" arrow::ipc::MetadataVersion": + ctypedef enum CMetadataVersion" arrow::ipc::MetadataVersion": # numpydoc ignore=PR01 CMetadataVersion_V1" arrow::ipc::MetadataVersion::V1" CMetadataVersion_V2" arrow::ipc::MetadataVersion::V2" CMetadataVersion_V3" arrow::ipc::MetadataVersion::V3" diff --git a/python/pyarrow/includes/libarrow_fs.pxd b/python/pyarrow/includes/libarrow_fs.pxd index 2727fc20119..4e1b1575855 100644 --- a/python/pyarrow/includes/libarrow_fs.pxd +++ b/python/pyarrow/includes/libarrow_fs.pxd @@ -23,7 +23,7 @@ from pyarrow.includes.libarrow_python cimport CTimePoint cdef extern from "arrow/filesystem/api.h" namespace "arrow::fs" nogil: - ctypedef enum CFileType "arrow::fs::FileType": + ctypedef enum CFileType "arrow::fs::FileType": # numpydoc ignore=PR01 CFileType_NotFound "arrow::fs::FileType::NotFound" CFileType_Unknown "arrow::fs::FileType::Unknown" CFileType_File "arrow::fs::FileType::File" @@ -118,7 +118,7 @@ cdef extern from "arrow/filesystem/api.h" namespace "arrow::fs" nogil: c_string base_path() shared_ptr[CFileSystem] base_fs() - ctypedef enum CS3LogLevel "arrow::fs::S3LogLevel": + ctypedef enum CS3LogLevel "arrow::fs::S3LogLevel": # numpydoc ignore=PR01 CS3LogLevel_Off "arrow::fs::S3LogLevel::Off" CS3LogLevel_Fatal "arrow::fs::S3LogLevel::Fatal" CS3LogLevel_Error "arrow::fs::S3LogLevel::Error" From a9d99d4b938e5aec2b8f0872b3edfa9c677e9582 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Tue, 29 Aug 2023 15:20:05 -0400 Subject: [PATCH 26/41] Try fixing numpydoc warning ignores --- python/pyarrow/_dataset.pyx | 4 ++-- python/pyarrow/_fs.pxd | 2 +- python/pyarrow/_s3fs.pyx | 2 +- python/pyarrow/includes/libarrow.pxd | 2 +- python/pyarrow/includes/libarrow_fs.pxd | 4 ++-- python/pyarrow/ipc.pxi | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index 480a17c6763..ef7c56d1cd4 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -1075,8 +1075,8 @@ cdef class FileSystemDataset(Dataset): self.partition_expression ) - @classmethod - def from_paths(cls, paths, schema=None, format=None, + @staticmethod + def from_paths(paths, schema=None, format=None, filesystem=None, partitions=None, root_partition=None): # numpydoc ignore=PR01 """A Dataset created from a list of paths on a particular filesystem. 
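(Background for the `# numpydoc ignore=PR01` markers in this patch: with
Cython 3, `cpdef` enums compile to real Python enum classes, and numpydoc's
validator then reports PR01, "Parameters not documented", for the enum
constructor's `value`/`names` parameters. The warning can be reproduced
outside CI with numpydoc's validate API; the object path below is just an
example:)

    from numpydoc.validate import validate

    # validate() returns a dict whose "errors" entry is a list of
    # (error_code, message) tuples.
    report = validate("pyarrow._fs.FileType")
    print([err for err in report["errors"] if err[0] == "PR01"])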
diff --git a/python/pyarrow/_fs.pxd b/python/pyarrow/_fs.pxd index 4504b78b837..f142a76482e 100644 --- a/python/pyarrow/_fs.pxd +++ b/python/pyarrow/_fs.pxd @@ -23,7 +23,7 @@ from pyarrow.lib import _detect_compression, frombytes, tobytes from pyarrow.lib cimport * -cpdef enum FileType: +cpdef enum FileType: # numpydoc ignore=PR01 NotFound = CFileType_NotFound Unknown = CFileType_Unknown File = CFileType_File diff --git a/python/pyarrow/_s3fs.pyx b/python/pyarrow/_s3fs.pyx index ab451713699..62d311fd3be 100644 --- a/python/pyarrow/_s3fs.pyx +++ b/python/pyarrow/_s3fs.pyx @@ -28,7 +28,7 @@ from pyarrow.includes.libarrow_fs cimport * from pyarrow._fs cimport FileSystem -cpdef enum S3LogLevel: +cpdef enum S3LogLevel: # numpydoc ignore=PR01 Off = CS3LogLevel_Off Fatal = CS3LogLevel_Fatal Error = CS3LogLevel_Error diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 3aaf981b77d..f4d6541fa72 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -1554,7 +1554,7 @@ cdef extern from "arrow/ipc/api.h" namespace "arrow::ipc" nogil: # TODO: use "cpdef enum class" to automatically get a Python wrapper? # See # https://github.com/cython/cython/commit/2c7c22f51405299a4e247f78edf52957d30cf71d#diff-61c1365c0f761a8137754bb3a73bfbf7 - ctypedef enum CMetadataVersion" arrow::ipc::MetadataVersion": # numpydoc ignore=PR01 + ctypedef enum CMetadataVersion" arrow::ipc::MetadataVersion": CMetadataVersion_V1" arrow::ipc::MetadataVersion::V1" CMetadataVersion_V2" arrow::ipc::MetadataVersion::V2" CMetadataVersion_V3" arrow::ipc::MetadataVersion::V3" diff --git a/python/pyarrow/includes/libarrow_fs.pxd b/python/pyarrow/includes/libarrow_fs.pxd index 4e1b1575855..2727fc20119 100644 --- a/python/pyarrow/includes/libarrow_fs.pxd +++ b/python/pyarrow/includes/libarrow_fs.pxd @@ -23,7 +23,7 @@ from pyarrow.includes.libarrow_python cimport CTimePoint cdef extern from "arrow/filesystem/api.h" namespace "arrow::fs" nogil: - ctypedef enum CFileType "arrow::fs::FileType": # numpydoc ignore=PR01 + ctypedef enum CFileType "arrow::fs::FileType": CFileType_NotFound "arrow::fs::FileType::NotFound" CFileType_Unknown "arrow::fs::FileType::Unknown" CFileType_File "arrow::fs::FileType::File" @@ -118,7 +118,7 @@ cdef extern from "arrow/filesystem/api.h" namespace "arrow::fs" nogil: c_string base_path() shared_ptr[CFileSystem] base_fs() - ctypedef enum CS3LogLevel "arrow::fs::S3LogLevel": # numpydoc ignore=PR01 + ctypedef enum CS3LogLevel "arrow::fs::S3LogLevel": CS3LogLevel_Off "arrow::fs::S3LogLevel::Off" CS3LogLevel_Fatal "arrow::fs::S3LogLevel::Fatal" CS3LogLevel_Error "arrow::fs::S3LogLevel::Error" diff --git a/python/pyarrow/ipc.pxi b/python/pyarrow/ipc.pxi index 53e521fc114..bfa233e03df 100644 --- a/python/pyarrow/ipc.pxi +++ b/python/pyarrow/ipc.pxi @@ -19,7 +19,7 @@ from collections import namedtuple import warnings -cpdef enum MetadataVersion: +cpdef enum MetadataVersion: # numpydoc ignore=PR01 V1 = CMetadataVersion_V1 V2 = CMetadataVersion_V2 V3 = CMetadataVersion_V3 From 2d59f167a89fbba69798f190ff8dbfc67e809b7b Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Tue, 29 Aug 2023 18:33:47 -0400 Subject: [PATCH 27/41] Try adding docstrings to cpdef enums --- python/pyarrow/_dataset.pyx | 8 ++++---- python/pyarrow/_fs.pxd | 5 ++++- python/pyarrow/_s3fs.pyx | 5 ++++- python/pyarrow/ipc.pxi | 5 ++++- 4 files changed, 16 insertions(+), 7 deletions(-) diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index ef7c56d1cd4..1311869c311 
100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -1076,10 +1076,10 @@ cdef class FileSystemDataset(Dataset): ) @staticmethod - def from_paths(paths, schema=None, format=None, - filesystem=None, partitions=None, - root_partition=None): # numpydoc ignore=PR01 - """A Dataset created from a list of paths on a particular filesystem. + def from_paths(paths, schema=None, format=None, filesystem=None, + partitions=None, root_partition=None): + """ + A Dataset created from a list of paths on a particular filesystem. Parameters ---------- diff --git a/python/pyarrow/_fs.pxd b/python/pyarrow/_fs.pxd index f142a76482e..6ad2ace9a6d 100644 --- a/python/pyarrow/_fs.pxd +++ b/python/pyarrow/_fs.pxd @@ -23,7 +23,10 @@ from pyarrow.lib import _detect_compression, frombytes, tobytes from pyarrow.lib cimport * -cpdef enum FileType: # numpydoc ignore=PR01 +cpdef enum FileType: + """ + File type + """ NotFound = CFileType_NotFound Unknown = CFileType_Unknown File = CFileType_File diff --git a/python/pyarrow/_s3fs.pyx b/python/pyarrow/_s3fs.pyx index 62d311fd3be..9e06addfec8 100644 --- a/python/pyarrow/_s3fs.pyx +++ b/python/pyarrow/_s3fs.pyx @@ -28,7 +28,10 @@ from pyarrow.includes.libarrow_fs cimport * from pyarrow._fs cimport FileSystem -cpdef enum S3LogLevel: # numpydoc ignore=PR01 +cpdef enum S3LogLevel: + """ + S3 Log level + """ Off = CS3LogLevel_Off Fatal = CS3LogLevel_Fatal Error = CS3LogLevel_Error diff --git a/python/pyarrow/ipc.pxi b/python/pyarrow/ipc.pxi index bfa233e03df..a1f266f4b45 100644 --- a/python/pyarrow/ipc.pxi +++ b/python/pyarrow/ipc.pxi @@ -19,7 +19,10 @@ from collections import namedtuple import warnings -cpdef enum MetadataVersion: # numpydoc ignore=PR01 +cpdef enum MetadataVersion: + """ + IPC Metadata version + """ V1 = CMetadataVersion_V1 V2 = CMetadataVersion_V2 V3 = CMetadataVersion_V3 From e5e3ceaffeb538a8f729a2896789b14117893f95 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Wed, 30 Aug 2023 15:41:15 -0400 Subject: [PATCH 28/41] Ignore EnumType parameter docstrings checks during numpydoc validation --- dev/archery/archery/lang/python.py | 10 ++++++++++ python/pyarrow/_fs.pxd | 3 --- python/pyarrow/_s3fs.pyx | 3 --- python/pyarrow/ipc.pxi | 3 --- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/dev/archery/archery/lang/python.py b/dev/archery/archery/lang/python.py index 8600a0d7c48..21f7ec5aa30 100644 --- a/dev/archery/archery/lang/python.py +++ b/dev/archery/archery/lang/python.py @@ -16,6 +16,7 @@ # under the License. 
from contextlib import contextmanager +from enum import EnumType import inspect import tokenize @@ -112,6 +113,10 @@ def inspect_signature(obj): class NumpyDoc: + IGNORE_VALIDATION_ERRORS_FOR_TYPE = { + # Enum function signatures should never be documented + EnumType: ["PR01"] + } def __init__(self, symbols=None): if not have_numpydoc: @@ -229,6 +234,11 @@ def callback(obj): continue if disallow_rules and errcode in disallow_rules: continue + if any(isinstance(obj, obj_type) and errcode in errcode_list + for obj_type, errcode_list + in NumpyDoc.IGNORE_VALIDATION_ERRORS_FOR_TYPE.items()): + print(obj) + continue errors.append((errcode, errmsg)) if len(errors): diff --git a/python/pyarrow/_fs.pxd b/python/pyarrow/_fs.pxd index 6ad2ace9a6d..4504b78b837 100644 --- a/python/pyarrow/_fs.pxd +++ b/python/pyarrow/_fs.pxd @@ -24,9 +24,6 @@ from pyarrow.lib cimport * cpdef enum FileType: - """ - File type - """ NotFound = CFileType_NotFound Unknown = CFileType_Unknown File = CFileType_File diff --git a/python/pyarrow/_s3fs.pyx b/python/pyarrow/_s3fs.pyx index 9e06addfec8..ab451713699 100644 --- a/python/pyarrow/_s3fs.pyx +++ b/python/pyarrow/_s3fs.pyx @@ -29,9 +29,6 @@ from pyarrow._fs cimport FileSystem cpdef enum S3LogLevel: - """ - S3 Log level - """ Off = CS3LogLevel_Off Fatal = CS3LogLevel_Fatal Error = CS3LogLevel_Error diff --git a/python/pyarrow/ipc.pxi b/python/pyarrow/ipc.pxi index a1f266f4b45..53e521fc114 100644 --- a/python/pyarrow/ipc.pxi +++ b/python/pyarrow/ipc.pxi @@ -20,9 +20,6 @@ import warnings cpdef enum MetadataVersion: - """ - IPC Metadata version - """ V1 = CMetadataVersion_V1 V2 = CMetadataVersion_V2 V3 = CMetadataVersion_V3 From f47f806312fd3c7f6d1f5073c66a0efc620b6005 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Wed, 30 Aug 2023 15:42:39 -0400 Subject: [PATCH 29/41] Remove print statement --- dev/archery/archery/lang/python.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dev/archery/archery/lang/python.py b/dev/archery/archery/lang/python.py index 21f7ec5aa30..922af14e105 100644 --- a/dev/archery/archery/lang/python.py +++ b/dev/archery/archery/lang/python.py @@ -237,7 +237,6 @@ def callback(obj): if any(isinstance(obj, obj_type) and errcode in errcode_list for obj_type, errcode_list in NumpyDoc.IGNORE_VALIDATION_ERRORS_FOR_TYPE.items()): - print(obj) continue errors.append((errcode, errmsg)) From 676b5382178065cdbf7b9ad71400961db6810042 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Wed, 30 Aug 2023 16:08:29 -0400 Subject: [PATCH 30/41] Disable debug builds for cuda and ubuntu 20 on azure --- dev/tasks/tasks.yml | 2 +- docker-compose.yml | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 29e038a9224..85806fe0454 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -1308,7 +1308,7 @@ tasks: params: env: UBUNTU: 20.04 - image: ubuntu-python + image: ubuntu-python-no-debug test-ubuntu-22.04-python-3: ci: github diff --git a/docker-compose.yml b/docker-compose.yml index 8ae06900c57..ec89e8695f6 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -882,6 +882,7 @@ services: ARROW_S3: "OFF" ARROW_SUBSTRAIT: "OFF" ARROW_WITH_OPENTELEMETRY: "OFF" + CMAKE_BUILD_TYPE: "Release" # (GH-37478) TODO default back to debug builds SETUPTOOLS_SCM_PRETEND_VERSION: volumes: *ubuntu-volumes deploy: *cuda-deploy @@ -935,6 +936,30 @@ services: volumes: *ubuntu-volumes command: *python-command + ubuntu-python-no-debug: + # (GH-37478) TODO delete ubuntu-python-no-debug + # Usage: + # 
docker-compose build ubuntu-cpp + # docker-compose build ubuntu-python-no-debug + # docker-compose run --rm ubuntu-python-no-debug + # Parameters: + # ARCH: amd64, arm64v8, ... + # UBUNTU: 20.04 + image: ${REPO}:${ARCH}-ubuntu-${UBUNTU}-python-3 + build: + context: . + dockerfile: ci/docker/linux-apt-python-3.dockerfile + cache_from: + - ${REPO}:${ARCH}-ubuntu-${UBUNTU}-python-3-no-debug + args: + base: ${REPO}:${ARCH}-ubuntu-${UBUNTU}-cpp + shm_size: *shm-size + environment: + <<: [*common, *ccache] + CMAKE_BUILD_TYPE: "Release" + volumes: *ubuntu-volumes + command: *python-command + ubuntu-swift: # Usage: # docker-compose build ubuntu-swift From d89fde11bc450f847ad46e97d7a4e5dd4cf03c14 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Wed, 30 Aug 2023 16:09:14 -0400 Subject: [PATCH 31/41] Use backwards-compatible Enum base class --- dev/archery/archery/lang/python.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dev/archery/archery/lang/python.py b/dev/archery/archery/lang/python.py index 922af14e105..d4c1853d097 100644 --- a/dev/archery/archery/lang/python.py +++ b/dev/archery/archery/lang/python.py @@ -16,7 +16,7 @@ # under the License. from contextlib import contextmanager -from enum import EnumType +from enum import EnumMeta import inspect import tokenize @@ -115,7 +115,7 @@ def inspect_signature(obj): class NumpyDoc: IGNORE_VALIDATION_ERRORS_FOR_TYPE = { # Enum function signatures should never be documented - EnumType: ["PR01"] + EnumMeta: ["PR01"] } def __init__(self, symbols=None): From 59fffdc032d4d631a82f2f7d40977962fa0271f7 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Wed, 30 Aug 2023 16:13:10 -0400 Subject: [PATCH 32/41] Update x-hierarchy in docker-compose --- docker-compose.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/docker-compose.yml b/docker-compose.yml index ec89e8695f6..c30f5c5eb95 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -148,6 +148,7 @@ x-hierarchy: - ubuntu-lint - ubuntu-python: - ubuntu-docs + - ubuntu-python-no-debug - ubuntu-python-sdist-test - ubuntu-r - ubuntu-r-only-r From 7c56dccfb7bd3cea873c091b6f8fa79bebc8b003 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Wed, 30 Aug 2023 17:14:15 -0400 Subject: [PATCH 33/41] Disable gdb tests for non-debug builds --- docker-compose.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docker-compose.yml b/docker-compose.yml index c30f5c5eb95..96aa2ffa9d7 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -883,7 +883,8 @@ services: ARROW_S3: "OFF" ARROW_SUBSTRAIT: "OFF" ARROW_WITH_OPENTELEMETRY: "OFF" - CMAKE_BUILD_TYPE: "Release" # (GH-37478) TODO default back to debug builds + CMAKE_BUILD_TYPE: "Release" # (GH-37478) + PYTEST_ARGS: "-m \"not gdb\"" # (GH-37478) SETUPTOOLS_SCM_PRETEND_VERSION: volumes: *ubuntu-volumes deploy: *cuda-deploy @@ -958,6 +959,7 @@ services: environment: <<: [*common, *ccache] CMAKE_BUILD_TYPE: "Release" + PYTEST_ARGS: "-m \"not gdb\"" volumes: *ubuntu-volumes command: *python-command From 27d1128131fd704dbe35ed8d41c67c49b8904038 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Tue, 12 Sep 2023 12:01:16 -0400 Subject: [PATCH 34/41] Revert "Disable gdb tests for non-debug builds" This reverts commit 139dd2f81bc5a854521a01068f4d6e470cafa86a. 
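(For reference, the `-m "not gdb"` deselection being reverted here relies on
pytest marker expressions: tests carrying the `gdb` marker are deselected
when the C++ libraries are built without debug symbols. A minimal sketch,
with an illustrative test name:)

    import pytest

    @pytest.mark.gdb
    def test_gdb_pretty_printers():
        # Exercises the Arrow gdb plugin; only meaningful for debug builds,
        # hence deselected on Release builds via `pytest -m "not gdb"`.
        ...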
--- docker-compose.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 96aa2ffa9d7..c30f5c5eb95 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -883,8 +883,7 @@ services: ARROW_S3: "OFF" ARROW_SUBSTRAIT: "OFF" ARROW_WITH_OPENTELEMETRY: "OFF" - CMAKE_BUILD_TYPE: "Release" # (GH-37478) - PYTEST_ARGS: "-m \"not gdb\"" # (GH-37478) + CMAKE_BUILD_TYPE: "Release" # (GH-37478) TODO default back to debug builds SETUPTOOLS_SCM_PRETEND_VERSION: volumes: *ubuntu-volumes deploy: *cuda-deploy @@ -959,7 +958,6 @@ services: environment: <<: [*common, *ccache] CMAKE_BUILD_TYPE: "Release" - PYTEST_ARGS: "-m \"not gdb\"" volumes: *ubuntu-volumes command: *python-command From 23e8a3e7b222f364f036fd7b0fda3122d38b5c24 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Tue, 12 Sep 2023 12:01:25 -0400 Subject: [PATCH 35/41] Revert "Update x-hierarchy in docker-compose" This reverts commit 3480a3972170f199ad6d1daf3e6533f17ffac9b3. --- docker-compose.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/docker-compose.yml b/docker-compose.yml index c30f5c5eb95..ec89e8695f6 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -148,7 +148,6 @@ x-hierarchy: - ubuntu-lint - ubuntu-python: - ubuntu-docs - - ubuntu-python-no-debug - ubuntu-python-sdist-test - ubuntu-r - ubuntu-r-only-r From 4ec5188adfc5d4411d5f27ebf115a5f42b21ef63 Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Tue, 12 Sep 2023 12:01:35 -0400 Subject: [PATCH 36/41] Revert "Disable debug builds for cuda and ubuntu 20 on azure" This reverts commit 4f38547b0d72d1f3626147f3f99ae7a7297ee6ac. --- dev/tasks/tasks.yml | 2 +- docker-compose.yml | 25 ------------------------- 2 files changed, 1 insertion(+), 26 deletions(-) diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 85806fe0454..29e038a9224 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -1308,7 +1308,7 @@ tasks: params: env: UBUNTU: 20.04 - image: ubuntu-python-no-debug + image: ubuntu-python test-ubuntu-22.04-python-3: ci: github diff --git a/docker-compose.yml b/docker-compose.yml index ec89e8695f6..8ae06900c57 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -882,7 +882,6 @@ services: ARROW_S3: "OFF" ARROW_SUBSTRAIT: "OFF" ARROW_WITH_OPENTELEMETRY: "OFF" - CMAKE_BUILD_TYPE: "Release" # (GH-37478) TODO default back to debug builds SETUPTOOLS_SCM_PRETEND_VERSION: volumes: *ubuntu-volumes deploy: *cuda-deploy @@ -936,30 +935,6 @@ services: volumes: *ubuntu-volumes command: *python-command - ubuntu-python-no-debug: - # (GH-37478) TODO delete ubuntu-python-no-debug - # Usage: - # docker-compose build ubuntu-cpp - # docker-compose build ubuntu-python-no-debug - # docker-compose run --rm ubuntu-python-no-debug - # Parameters: - # ARCH: amd64, arm64v8, ... - # UBUNTU: 20.04 - image: ${REPO}:${ARCH}-ubuntu-${UBUNTU}-python-3 - build: - context: . 
- dockerfile: ci/docker/linux-apt-python-3.dockerfile - cache_from: - - ${REPO}:${ARCH}-ubuntu-${UBUNTU}-python-3-no-debug - args: - base: ${REPO}:${ARCH}-ubuntu-${UBUNTU}-cpp - shm_size: *shm-size - environment: - <<: [*common, *ccache] - CMAKE_BUILD_TYPE: "Release" - volumes: *ubuntu-volumes - command: *python-command - ubuntu-swift: # Usage: # docker-compose build ubuntu-swift From 90f5321a30c3a9a20e67292a779e979d2a29903b Mon Sep 17 00:00:00 2001 From: Dane Pitkin Date: Fri, 15 Sep 2023 12:47:26 -0400 Subject: [PATCH 37/41] Disable Cython 3 --- .github/workflows/dev.yml | 2 +- ci/conda_env_python.txt | 2 +- dev/release/verify-release-candidate.sh | 2 +- python/pyproject.toml | 2 +- python/requirements-build.txt | 2 +- python/requirements-wheel-build.txt | 2 +- python/setup.py | 6 +++--- 7 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml index e8fe565ace0..cee3c74762c 100644 --- a/.github/workflows/dev.yml +++ b/.github/workflows/dev.yml @@ -103,7 +103,7 @@ jobs: shell: bash run: | gem install test-unit - pip install cython setuptools six pytest jira + pip install "cython<3" setuptools six pytest jira - name: Run Release Test env: ARROW_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/ci/conda_env_python.txt b/ci/conda_env_python.txt index 04f985c94bb..4ae5c3614a1 100644 --- a/ci/conda_env_python.txt +++ b/ci/conda_env_python.txt @@ -18,7 +18,7 @@ # don't add pandas here, because it is not a mandatory test dependency boto3 # not a direct dependency of s3fs, but needed for our s3fs fixture cffi -cython +cython<3 cloudpickle fsspec hypothesis diff --git a/dev/release/verify-release-candidate.sh b/dev/release/verify-release-candidate.sh index 8c5de9bda85..ce31b497c1f 100755 --- a/dev/release/verify-release-candidate.sh +++ b/dev/release/verify-release-candidate.sh @@ -665,7 +665,7 @@ test_python() { show_header "Build and test Python libraries" # Build and test Python - maybe_setup_virtualenv cython numpy setuptools_scm setuptools || exit 1 + maybe_setup_virtualenv "cython<3" numpy setuptools_scm setuptools || exit 1 maybe_setup_conda --file ci/conda_env_python.txt || exit 1 if [ "${USE_CONDA}" -gt 0 ]; then diff --git a/python/pyproject.toml b/python/pyproject.toml index fe8c938a9ce..7e613045858 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -17,7 +17,7 @@ [build-system] requires = [ - "cython >= 0.29.31", + "cython >= 0.29.31,<3", "oldest-supported-numpy>=0.14", "setuptools_scm", "setuptools >= 40.1.0", diff --git a/python/requirements-build.txt b/python/requirements-build.txt index 507e9081373..6378d1b94e1 100644 --- a/python/requirements-build.txt +++ b/python/requirements-build.txt @@ -1,4 +1,4 @@ -cython>=0.29.31 +cython>=0.29.31,<3 oldest-supported-numpy>=0.14 setuptools_scm setuptools>=38.6.0 diff --git a/python/requirements-wheel-build.txt b/python/requirements-wheel-build.txt index 6043d2ffb2c..e4f5243fbc2 100644 --- a/python/requirements-wheel-build.txt +++ b/python/requirements-wheel-build.txt @@ -1,4 +1,4 @@ -cython>=0.29.31 +cython>=0.29.31,<3 oldest-supported-numpy>=0.14 setuptools_scm setuptools>=58 diff --git a/python/setup.py b/python/setup.py index 0a6fc861eb5..abd9d03cfb1 100755 --- a/python/setup.py +++ b/python/setup.py @@ -40,9 +40,9 @@ # Check if we're running 64-bit Python is_64_bit = sys.maxsize > 2**32 -if Cython.__version__ < '0.29.31': +if Cython.__version__ < '0.29.31' or Cython.__version__ >= '3.0': raise Exception( - 'Please update your Cython version. 
Supported Cython >= 0.29.31')
+        'Please update your Cython version. Supported Cython >= 0.29.31, < 3.0')
 
 setup_dir = os.path.abspath(os.path.dirname(__file__))
 
@@ -492,7 +492,7 @@ def has_ext_modules(foo):
                           'pyarrow/_generated_version.py'),
         'version_scheme': guess_next_dev_version
     },
-    setup_requires=['setuptools_scm', 'cython >= 0.29.31'] + setup_requires,
+    setup_requires=['setuptools_scm', 'cython >= 0.29.31,<3'] + setup_requires,
     install_requires=install_requires,
     tests_require=['pytest', 'pandas', 'hypothesis'],
     python_requires='>=3.8',

From 6f32fcf3e02aec40d6bf9cbebbf39b4c54091093 Mon Sep 17 00:00:00 2001
From: Dane Pitkin
Date: Fri, 15 Sep 2023 12:49:24 -0400
Subject: [PATCH 38/41] Clean up quotes in dockerfile

---
 ci/docker/conda-python-cython2.dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/docker/conda-python-cython2.dockerfile b/ci/docker/conda-python-cython2.dockerfile
index ad5ce53c7f4..d67ef677276 100644
--- a/ci/docker/conda-python-cython2.dockerfile
+++ b/ci/docker/conda-python-cython2.dockerfile
@@ -20,5 +20,5 @@ ARG arch
 ARG python=3.8
 FROM ${repo}:${arch}-conda-python-${python}
 
-RUN mamba install -q -y cython"<3" && \
+RUN mamba install -q -y "cython<3" && \
     mamba clean --all

From ec5754cf58b657f21f16ec91a8d932e94120ae64 Mon Sep 17 00:00:00 2001
From: Dane Pitkin
Date: Mon, 18 Sep 2023 13:34:22 -0400
Subject: [PATCH 39/41] Add todo, revert back to classmethod

---
 python/pyarrow/_dataset.pyx   | 4 ++--
 python/pyarrow/_substrait.pyx | 1 +
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx
index 1311869c311..3aafd9ff9c0 100644
--- a/python/pyarrow/_dataset.pyx
+++ b/python/pyarrow/_dataset.pyx
@@ -1075,8 +1075,8 @@ cdef class FileSystemDataset(Dataset):
             self.partition_expression
         )
 
-    @staticmethod
-    def from_paths(paths, schema=None, format=None, filesystem=None,
+    @classmethod
+    def from_paths(cls, paths, schema=None, format=None, filesystem=None,
                    partitions=None, root_partition=None):
         """
         A Dataset created from a list of paths on a particular filesystem.

diff --git a/python/pyarrow/_substrait.pyx b/python/pyarrow/_substrait.pyx
index 4d5810e8b25..067cb5f9168 100644
--- a/python/pyarrow/_substrait.pyx
+++ b/python/pyarrow/_substrait.pyx
@@ -27,6 +27,7 @@ from pyarrow.includes.libarrow cimport *
 from pyarrow.includes.libarrow_substrait cimport *
 
 
+# TODO GH-37235: Fix exception handling
 cdef CDeclaration _create_named_table_provider(
     dict named_args, const std_vector[c_string]& names, const CSchema& schema
 ) noexcept:

From eee7573dda6a3734bec148a5befc2add5a95674b Mon Sep 17 00:00:00 2001
From: Dane Pitkin
Date: Mon, 18 Sep 2023 13:36:50 -0400
Subject: [PATCH 40/41] Revert formatting change

---
 python/pyarrow/_dataset.pyx | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx
index 3aafd9ff9c0..77fdadcef9a 100644
--- a/python/pyarrow/_dataset.pyx
+++ b/python/pyarrow/_dataset.pyx
@@ -1060,7 +1060,7 @@ cdef class FileSystemDataset(Dataset):
             return Partitioning.wrap(c_partitioning)
         except TypeError:
             # e.g. type_name "default"
-            return None
+            return Nonesubs
 
     cdef void init(self, const shared_ptr[CDataset]& sp):
         Dataset.init(self, sp)
@@ -1076,8 +1076,8 @@ cdef class FileSystemDataset(Dataset):
         )
 
     @classmethod
-    def from_paths(cls, paths, schema=None, format=None, filesystem=None,
-                   partitions=None, root_partition=None):
+    def from_paths(cls, paths, schema=None, format=None,
+                   filesystem=None, partitions=None, root_partition=None):
         """
         A Dataset created from a list of paths on a particular filesystem.

From 3c4f581103560cda3cf848c3e1b7ae3c86f1e19f Mon Sep 17 00:00:00 2001
From: Dane Pitkin
Date: Mon, 18 Sep 2023 15:02:40 -0400
Subject: [PATCH 41/41] Revert typo

---
 python/pyarrow/_dataset.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx
index 77fdadcef9a..48ee6769153 100644
--- a/python/pyarrow/_dataset.pyx
+++ b/python/pyarrow/_dataset.pyx
@@ -1060,7 +1060,7 @@ cdef class FileSystemDataset(Dataset):
             return Partitioning.wrap(c_partitioning)
         except TypeError:
             # e.g. type_name "default"
-            return Nonesubs
+            return None
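The `__reduce__`/`_reconstruct` pairing that several patches in this series
adjust can be sketched in pure Python. This models the idea only: the real
classes are Cython cdef classes and, as shown earlier, need `@binding(True)`
on the staticmethod under Cython < 3; the class and kwargs here are
illustrative:

    import pickle

    class Options:
        def __init__(self, **kwargs):
            self.kwargs = kwargs

        @staticmethod
        def _reconstruct(kwargs):
            # __reduce__ cannot pass keyword arguments to the reconstructor,
            # so the kwargs dict is round-tripped through this wrapper.
            return Options(**kwargs)

        def __reduce__(self):
            return Options._reconstruct, (self.kwargs,)

    restored = pickle.loads(pickle.dumps(Options(use_buffered_stream=True)))
    assert restored.kwargs == {"use_buffered_stream": True}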