From 2a6baee3c86075e25a781a7995f41555df86c3e4 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 24 Jul 2025 18:48:54 +0200 Subject: [PATCH 01/32] Add typing to arrow using stubs --- .github/workflows/python.yml | 6 + dev/release/rat_exclude_files.txt | 1 + python/py.typed | 16 + python/pyproject.toml | 21 + python/stubs/LICENSE | 24 + python/stubs/__init__.pyi | 656 ++ python/stubs/__lib_pxi/__init__.pyi | 0 python/stubs/__lib_pxi/array.pyi | 4274 ++++++++++ python/stubs/__lib_pxi/benchmark.pyi | 1 + python/stubs/__lib_pxi/builder.pyi | 89 + python/stubs/__lib_pxi/compat.pyi | 5 + python/stubs/__lib_pxi/config.pyi | 41 + python/stubs/__lib_pxi/device.pyi | 88 + python/stubs/__lib_pxi/error.pyi | 53 + python/stubs/__lib_pxi/io.pyi | 1474 ++++ python/stubs/__lib_pxi/ipc.pyi | 705 ++ python/stubs/__lib_pxi/memory.pyi | 174 + python/stubs/__lib_pxi/pandas_shim.pyi | 51 + python/stubs/__lib_pxi/scalar.pyi | 1017 +++ python/stubs/__lib_pxi/table.pyi | 5609 +++++++++++++ python/stubs/__lib_pxi/tensor.pyi | 688 ++ python/stubs/__lib_pxi/types.pyi | 4413 ++++++++++ python/stubs/_azurefs.pyi | 74 + python/stubs/_compute.pyi | 1721 ++++ python/stubs/_csv.pyi | 641 ++ python/stubs/_cuda.pyi | 556 ++ python/stubs/_dataset.pyi | 2299 ++++++ python/stubs/_dataset_orc.pyi | 6 + python/stubs/_dataset_parquet.pyi | 314 + python/stubs/_dataset_parquet_encryption.pyi | 85 + python/stubs/_feather.pyi | 29 + python/stubs/_flight.pyi | 1380 ++++ python/stubs/_fs.pyi | 1005 +++ python/stubs/_gcsfs.pyi | 83 + python/stubs/_hdfs.pyi | 75 + python/stubs/_json.pyi | 169 + python/stubs/_orc.pyi | 56 + python/stubs/_parquet.pyi | 445 + python/stubs/_parquet_encryption.pyi | 67 + python/stubs/_s3fs.pyi | 74 + python/stubs/_stubs_typing.pyi | 80 + python/stubs/_substrait.pyi | 39 + python/stubs/acero.pyi | 85 + python/stubs/benchmark.pyi | 3 + python/stubs/cffi.pyi | 4 + python/stubs/compute.pyi | 7779 ++++++++++++++++++ python/stubs/csv.pyi | 27 + python/stubs/cuda.pyi | 25 + python/stubs/dataset.pyi | 229 + python/stubs/feather.pyi | 50 + python/stubs/flight.pyi | 95 + python/stubs/fs.pyi | 77 + python/stubs/gandiva.pyi | 65 + python/stubs/interchange/__init__.pyi | 0 python/stubs/interchange/buffer.pyi | 58 + python/stubs/interchange/column.pyi | 252 + python/stubs/interchange/dataframe.pyi | 102 + python/stubs/interchange/from_dataframe.pyi | 244 + python/stubs/ipc.pyi | 123 + python/stubs/json.pyi | 3 + python/stubs/lib.pyi | 106 + python/stubs/orc.pyi | 279 + python/stubs/pandas_compat.pyi | 54 + python/stubs/parquet/__init__.pyi | 1 + python/stubs/parquet/core.pyi | 2061 +++++ python/stubs/parquet/encryption.pyi | 15 + python/stubs/substrait.pyi | 21 + python/stubs/types.pyi | 194 + python/stubs/util.pyi | 27 + 69 files changed, 40583 insertions(+) create mode 100644 python/py.typed create mode 100644 python/stubs/LICENSE create mode 100644 python/stubs/__init__.pyi create mode 100644 python/stubs/__lib_pxi/__init__.pyi create mode 100644 python/stubs/__lib_pxi/array.pyi create mode 100644 python/stubs/__lib_pxi/benchmark.pyi create mode 100644 python/stubs/__lib_pxi/builder.pyi create mode 100644 python/stubs/__lib_pxi/compat.pyi create mode 100644 python/stubs/__lib_pxi/config.pyi create mode 100644 python/stubs/__lib_pxi/device.pyi create mode 100644 python/stubs/__lib_pxi/error.pyi create mode 100644 python/stubs/__lib_pxi/io.pyi create mode 100644 python/stubs/__lib_pxi/ipc.pyi create mode 100644 python/stubs/__lib_pxi/memory.pyi create mode 100644 python/stubs/__lib_pxi/pandas_shim.pyi create mode 100644 
python/stubs/__lib_pxi/scalar.pyi create mode 100644 python/stubs/__lib_pxi/table.pyi create mode 100644 python/stubs/__lib_pxi/tensor.pyi create mode 100644 python/stubs/__lib_pxi/types.pyi create mode 100644 python/stubs/_azurefs.pyi create mode 100644 python/stubs/_compute.pyi create mode 100644 python/stubs/_csv.pyi create mode 100644 python/stubs/_cuda.pyi create mode 100644 python/stubs/_dataset.pyi create mode 100644 python/stubs/_dataset_orc.pyi create mode 100644 python/stubs/_dataset_parquet.pyi create mode 100644 python/stubs/_dataset_parquet_encryption.pyi create mode 100644 python/stubs/_feather.pyi create mode 100644 python/stubs/_flight.pyi create mode 100644 python/stubs/_fs.pyi create mode 100644 python/stubs/_gcsfs.pyi create mode 100644 python/stubs/_hdfs.pyi create mode 100644 python/stubs/_json.pyi create mode 100644 python/stubs/_orc.pyi create mode 100644 python/stubs/_parquet.pyi create mode 100644 python/stubs/_parquet_encryption.pyi create mode 100644 python/stubs/_s3fs.pyi create mode 100644 python/stubs/_stubs_typing.pyi create mode 100644 python/stubs/_substrait.pyi create mode 100644 python/stubs/acero.pyi create mode 100644 python/stubs/benchmark.pyi create mode 100644 python/stubs/cffi.pyi create mode 100644 python/stubs/compute.pyi create mode 100644 python/stubs/csv.pyi create mode 100644 python/stubs/cuda.pyi create mode 100644 python/stubs/dataset.pyi create mode 100644 python/stubs/feather.pyi create mode 100644 python/stubs/flight.pyi create mode 100644 python/stubs/fs.pyi create mode 100644 python/stubs/gandiva.pyi create mode 100644 python/stubs/interchange/__init__.pyi create mode 100644 python/stubs/interchange/buffer.pyi create mode 100644 python/stubs/interchange/column.pyi create mode 100644 python/stubs/interchange/dataframe.pyi create mode 100644 python/stubs/interchange/from_dataframe.pyi create mode 100644 python/stubs/ipc.pyi create mode 100644 python/stubs/json.pyi create mode 100644 python/stubs/lib.pyi create mode 100644 python/stubs/orc.pyi create mode 100644 python/stubs/pandas_compat.pyi create mode 100644 python/stubs/parquet/__init__.pyi create mode 100644 python/stubs/parquet/core.pyi create mode 100644 python/stubs/parquet/encryption.pyi create mode 100644 python/stubs/substrait.pyi create mode 100644 python/stubs/types.pyi create mode 100644 python/stubs/util.pyi diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 15dfa11fc4c..15906a10ac0 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -138,6 +138,12 @@ jobs: continue-on-error: true run: archery docker push ${{ matrix.image }} + - name: Type check with ty + working-directory: python + run: |- + python -m pip install ty + python -m ty check + macos: name: ${{ matrix.architecture }} macOS ${{ matrix.macos-version }} Python 3 runs-on: macos-${{ matrix.macos-version }} diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index 51c01516e7c..c4dc26e7784 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -95,6 +95,7 @@ python/pyarrow/tests/__init__.py python/pyarrow/vendored/* python/pyarrow/src/arrow/python/vendored/* python/requirements*.txt +python/stubs/* pax_global_header MANIFEST.in __init__.pxd diff --git a/python/py.typed b/python/py.typed new file mode 100644 index 00000000000..13a83393a91 --- /dev/null +++ b/python/py.typed @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. diff --git a/python/pyproject.toml b/python/pyproject.toml index 113d0b16f19..5c0580a0510 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -97,3 +97,24 @@ version_file = 'pyarrow/_generated_version.py' version_scheme = 'guess-next-dev' git_describe_command = 'git describe --dirty --tags --long --match "apache-arrow-[0-9]*.*"' fallback_version = '22.0.0a0' + +[tool.ty.rules] +invalid-argument-type = "ignore" +invalid-assignment = "ignore" +invalid-context-manager = "ignore" +invalid-return-type = "ignore" +invalid-type-form = "ignore" +no-matching-overload = "ignore" +non-subscriptable = "ignore" +not-iterable = "ignore" +possibly-unbound-attribute = "ignore" +possibly-unbound-import = "ignore" +too-many-positional-arguments = "ignore" +unknown-argument = "ignore" +unresolved-attribute = "ignore" +unresolved-global = "ignore" +unresolved-import = "ignore" +unresolved-reference = "ignore" +unsupported-operator = "ignore" +missing-argument = "ignore" +call-non-callable = "ignore" diff --git a/python/stubs/LICENSE b/python/stubs/LICENSE new file mode 100644 index 00000000000..6d8e2aff5b7 --- /dev/null +++ b/python/stubs/LICENSE @@ -0,0 +1,24 @@ +BSD 2-Clause License + +Copyright (c) 2024, ZhengYu, Xu + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
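As a quick illustration of what this patch is meant to enable (not part of the patch itself): once the py.typed marker and the stub files below are installed alongside pyarrow, a type checker such as ty should resolve calls like pa.array(...) to the concrete Array subclasses declared in python/stubs/__lib_pxi/array.pyi. The snippet below is a minimal, hypothetical sketch; the variable names are illustrative and the annotations simply restate what the overloads shown later in this patch declare.

import pyarrow as pa

# Inferred from the NullableCollection[int] overload of pa.array in the stubs
ints: pa.Int64Array = pa.array([1, 2, 3])
# Inferred from the NullableCollection[str] overload
strs: pa.StringArray = pa.array(["a", "b"])
# Inferred from the Float32Type overload when an explicit type is passed
floats: pa.FloatArray = pa.array([1.5, 2.5], type=pa.float32())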
diff --git a/python/stubs/__init__.pyi b/python/stubs/__init__.pyi new file mode 100644 index 00000000000..8a0d1e870c5 --- /dev/null +++ b/python/stubs/__init__.pyi @@ -0,0 +1,656 @@ +# ruff: noqa: F401, I001, E402 +__version__: str + +import pyarrow.lib as _lib + +_gc_enabled: bool + +from pyarrow.lib import ( + BuildInfo, + RuntimeInfo, + set_timezone_db_path, + MonthDayNano, + VersionInfo, + cpp_build_info, + cpp_version, + cpp_version_info, + runtime_info, + cpu_count, + set_cpu_count, + enable_signal_handlers, + io_thread_count, + set_io_thread_count, +) + +def show_versions() -> None: ... +def show_info() -> None: ... +def _module_is_available(module: str) -> bool: ... +def _filesystem_is_available(fs: str) -> bool: ... + +from pyarrow.lib import ( + null, + bool_, + int8, + int16, + int32, + int64, + uint8, + uint16, + uint32, + uint64, + time32, + time64, + timestamp, + date32, + date64, + duration, + month_day_nano_interval, + float16, + float32, + float64, + binary, + string, + utf8, + binary_view, + string_view, + large_binary, + large_string, + large_utf8, + decimal32, + decimal64, + decimal128, + decimal256, + list_, + large_list, + list_view, + large_list_view, + map_, + struct, + union, + sparse_union, + dense_union, + dictionary, + run_end_encoded, + json_, + uuid, + fixed_shape_tensor, + bool8, + opaque, + field, + type_for_alias, + DataType, + DictionaryType, + StructType, + ListType, + LargeListType, + FixedSizeListType, + ListViewType, + LargeListViewType, + MapType, + UnionType, + SparseUnionType, + DenseUnionType, + TimestampType, + Time32Type, + Time64Type, + DurationType, + FixedSizeBinaryType, + Decimal32Type, + Decimal64Type, + Decimal128Type, + Decimal256Type, + BaseExtensionType, + ExtensionType, + RunEndEncodedType, + FixedShapeTensorType, + Bool8Type, + UuidType, + JsonType, + OpaqueType, + PyExtensionType, + UnknownExtensionType, + register_extension_type, + unregister_extension_type, + DictionaryMemo, + KeyValueMetadata, + Field, + Schema, + schema, + unify_schemas, + Array, + Tensor, + array, + chunked_array, + record_batch, + nulls, + repeat, + SparseCOOTensor, + SparseCSRMatrix, + SparseCSCMatrix, + SparseCSFTensor, + infer_type, + from_numpy_dtype, + NullArray, + NumericArray, + IntegerArray, + FloatingPointArray, + BooleanArray, + Int8Array, + UInt8Array, + Int16Array, + UInt16Array, + Int32Array, + UInt32Array, + Int64Array, + UInt64Array, + HalfFloatArray, + FloatArray, + DoubleArray, + ListArray, + LargeListArray, + FixedSizeListArray, + ListViewArray, + LargeListViewArray, + MapArray, + UnionArray, + BinaryArray, + StringArray, + LargeBinaryArray, + LargeStringArray, + BinaryViewArray, + StringViewArray, + FixedSizeBinaryArray, + DictionaryArray, + Date32Array, + Date64Array, + TimestampArray, + Time32Array, + Time64Array, + DurationArray, + MonthDayNanoIntervalArray, + Decimal32Array, + Decimal64Array, + Decimal128Array, + Decimal256Array, + StructArray, + ExtensionArray, + RunEndEncodedArray, + FixedShapeTensorArray, + Bool8Array, + UuidArray, + JsonArray, + OpaqueArray, + scalar, + NA, + _NULL as NULL, + Scalar, + NullScalar, + BooleanScalar, + Int8Scalar, + Int16Scalar, + Int32Scalar, + Int64Scalar, + UInt8Scalar, + UInt16Scalar, + UInt32Scalar, + UInt64Scalar, + HalfFloatScalar, + FloatScalar, + DoubleScalar, + Decimal32Scalar, + Decimal64Scalar, + Decimal128Scalar, + Decimal256Scalar, + ListScalar, + LargeListScalar, + FixedSizeListScalar, + ListViewScalar, + LargeListViewScalar, + Date32Scalar, + Date64Scalar, + Time32Scalar, + Time64Scalar, 
+ TimestampScalar, + DurationScalar, + MonthDayNanoIntervalScalar, + BinaryScalar, + LargeBinaryScalar, + BinaryViewScalar, + StringScalar, + LargeStringScalar, + StringViewScalar, + FixedSizeBinaryScalar, + DictionaryScalar, + MapScalar, + StructScalar, + UnionScalar, + RunEndEncodedScalar, + ExtensionScalar, + Bool8Scalar, + UuidScalar, + JsonScalar, + OpaqueScalar, +) + +# Buffers, allocation +from pyarrow.lib import DeviceAllocationType, Device, MemoryManager, default_cpu_memory_manager + +from pyarrow.lib import ( + Buffer, + ResizableBuffer, + foreign_buffer, + py_buffer, + Codec, + compress, + decompress, + allocate_buffer, +) + +from pyarrow.lib import ( + MemoryPool, + LoggingMemoryPool, + ProxyMemoryPool, + total_allocated_bytes, + set_memory_pool, + default_memory_pool, + system_memory_pool, + jemalloc_memory_pool, + mimalloc_memory_pool, + logging_memory_pool, + proxy_memory_pool, + log_memory_allocations, + jemalloc_set_decay_ms, + supported_memory_backends, +) + +# I/O +from pyarrow.lib import ( + NativeFile, + PythonFile, + BufferedInputStream, + BufferedOutputStream, + CacheOptions, + CompressedInputStream, + CompressedOutputStream, + TransformInputStream, + transcoding_input_stream, + FixedSizeBufferWriter, + BufferReader, + BufferOutputStream, + OSFile, + MemoryMappedFile, + memory_map, + create_memory_map, + MockOutputStream, + input_stream, + output_stream, + have_libhdfs, +) + +from pyarrow.lib import ( + ChunkedArray, + RecordBatch, + Table, + table, + concat_arrays, + concat_tables, + TableGroupBy, + RecordBatchReader, +) + +# Exceptions +from pyarrow.lib import ( + ArrowCancelled, + ArrowCapacityError, + ArrowException, + ArrowKeyError, + ArrowIndexError, + ArrowInvalid, + ArrowIOError, + ArrowMemoryError, + ArrowNotImplementedError, + ArrowTypeError, + ArrowSerializationError, +) + +from pyarrow.ipc import serialize_pandas, deserialize_pandas +import pyarrow.ipc as ipc + +import pyarrow.types as types + +# ---------------------------------------------------------------------- +# Deprecations + +from pyarrow.util import _deprecate_api, _deprecate_class + +from pyarrow.ipc import ( + Message, + MessageReader, + MetadataVersion, + RecordBatchFileReader, + RecordBatchFileWriter, + RecordBatchStreamReader, + RecordBatchStreamWriter, +) + +# ---------------------------------------------------------------------- +# Returning absolute path to the pyarrow include directory (if bundled, e.g. in +# wheels) +def get_include() -> str: ... +def _get_pkg_config_executable() -> str: ... +def _has_pkg_config(pkgname: str) -> bool: ... +def _read_pkg_config_variable(pkgname: str, cli_args: list[str]) -> str: ... +def get_libraries() -> list[str]: ... +def create_library_symlinks() -> None: ... +def get_library_dirs() -> list[str]: ... 
+ +__all__ = [ + "__version__", + "_lib", + "_gc_enabled", + "BuildInfo", + "RuntimeInfo", + "set_timezone_db_path", + "MonthDayNano", + "VersionInfo", + "cpp_build_info", + "cpp_version", + "cpp_version_info", + "runtime_info", + "cpu_count", + "set_cpu_count", + "enable_signal_handlers", + "io_thread_count", + "set_io_thread_count", + "show_versions", + "show_info", + "_module_is_available", + "_filesystem_is_available", + "null", + "bool_", + "int8", + "int16", + "int32", + "int64", + "uint8", + "uint16", + "uint32", + "uint64", + "time32", + "time64", + "timestamp", + "date32", + "date64", + "duration", + "month_day_nano_interval", + "float16", + "float32", + "float64", + "binary", + "string", + "utf8", + "binary_view", + "string_view", + "large_binary", + "large_string", + "large_utf8", + "decimal32", + "decimal64", + "decimal128", + "decimal256", + "list_", + "large_list", + "list_view", + "large_list_view", + "map_", + "struct", + "union", + "sparse_union", + "dense_union", + "dictionary", + "run_end_encoded", + "json_", + "uuid", + "fixed_shape_tensor", + "bool8", + "opaque", + "field", + "type_for_alias", + "DataType", + "DictionaryType", + "StructType", + "ListType", + "LargeListType", + "FixedSizeListType", + "ListViewType", + "LargeListViewType", + "MapType", + "UnionType", + "SparseUnionType", + "DenseUnionType", + "TimestampType", + "Time32Type", + "Time64Type", + "DurationType", + "FixedSizeBinaryType", + "Decimal32Type", + "Decimal64Type", + "Decimal128Type", + "Decimal256Type", + "BaseExtensionType", + "ExtensionType", + "RunEndEncodedType", + "FixedShapeTensorType", + "Bool8Type", + "UuidType", + "JsonType", + "OpaqueType", + "PyExtensionType", + "UnknownExtensionType", + "register_extension_type", + "unregister_extension_type", + "DictionaryMemo", + "KeyValueMetadata", + "Field", + "Schema", + "schema", + "unify_schemas", + "Array", + "Tensor", + "array", + "chunked_array", + "record_batch", + "nulls", + "repeat", + "SparseCOOTensor", + "SparseCSRMatrix", + "SparseCSCMatrix", + "SparseCSFTensor", + "infer_type", + "from_numpy_dtype", + "NullArray", + "NumericArray", + "IntegerArray", + "FloatingPointArray", + "BooleanArray", + "Int8Array", + "UInt8Array", + "Int16Array", + "UInt16Array", + "Int32Array", + "UInt32Array", + "Int64Array", + "UInt64Array", + "HalfFloatArray", + "FloatArray", + "DoubleArray", + "ListArray", + "LargeListArray", + "FixedSizeListArray", + "ListViewArray", + "LargeListViewArray", + "MapArray", + "UnionArray", + "BinaryArray", + "StringArray", + "LargeBinaryArray", + "LargeStringArray", + "BinaryViewArray", + "StringViewArray", + "FixedSizeBinaryArray", + "DictionaryArray", + "Date32Array", + "Date64Array", + "TimestampArray", + "Time32Array", + "Time64Array", + "DurationArray", + "MonthDayNanoIntervalArray", + "Decimal32Array", + "Decimal64Array", + "Decimal128Array", + "Decimal256Array", + "StructArray", + "ExtensionArray", + "Bool8Array", + "UuidArray", + "JsonArray", + "OpaqueArray", + "RunEndEncodedArray", + "FixedShapeTensorArray", + "scalar", + "NA", + "NULL", + "Scalar", + "NullScalar", + "BooleanScalar", + "Int8Scalar", + "Int16Scalar", + "Int32Scalar", + "Int64Scalar", + "UInt8Scalar", + "UInt16Scalar", + "UInt32Scalar", + "UInt64Scalar", + "HalfFloatScalar", + "FloatScalar", + "DoubleScalar", + "Decimal32Scalar", + "Decimal64Scalar", + "Decimal128Scalar", + "Decimal256Scalar", + "ListScalar", + "LargeListScalar", + "FixedSizeListScalar", + "ListViewScalar", + "LargeListViewScalar", + "Date32Scalar", + "Date64Scalar", + "Time32Scalar", + 
"Time64Scalar", + "TimestampScalar", + "DurationScalar", + "MonthDayNanoIntervalScalar", + "BinaryScalar", + "LargeBinaryScalar", + "BinaryViewScalar", + "StringScalar", + "LargeStringScalar", + "StringViewScalar", + "FixedSizeBinaryScalar", + "DictionaryScalar", + "MapScalar", + "StructScalar", + "UnionScalar", + "RunEndEncodedScalar", + "ExtensionScalar", + "Bool8Scalar", + "UuidScalar", + "JsonScalar", + "OpaqueScalar", + "DeviceAllocationType", + "Device", + "MemoryManager", + "default_cpu_memory_manager", + "Buffer", + "ResizableBuffer", + "foreign_buffer", + "py_buffer", + "Codec", + "compress", + "decompress", + "allocate_buffer", + "MemoryPool", + "LoggingMemoryPool", + "ProxyMemoryPool", + "total_allocated_bytes", + "set_memory_pool", + "default_memory_pool", + "system_memory_pool", + "jemalloc_memory_pool", + "mimalloc_memory_pool", + "logging_memory_pool", + "proxy_memory_pool", + "log_memory_allocations", + "jemalloc_set_decay_ms", + "supported_memory_backends", + "NativeFile", + "PythonFile", + "BufferedInputStream", + "BufferedOutputStream", + "CacheOptions", + "CompressedInputStream", + "CompressedOutputStream", + "TransformInputStream", + "transcoding_input_stream", + "FixedSizeBufferWriter", + "BufferReader", + "BufferOutputStream", + "OSFile", + "MemoryMappedFile", + "memory_map", + "create_memory_map", + "MockOutputStream", + "input_stream", + "output_stream", + "have_libhdfs", + "ChunkedArray", + "RecordBatch", + "Table", + "table", + "concat_arrays", + "concat_tables", + "TableGroupBy", + "RecordBatchReader", + "ArrowCancelled", + "ArrowCapacityError", + "ArrowException", + "ArrowKeyError", + "ArrowIndexError", + "ArrowInvalid", + "ArrowIOError", + "ArrowMemoryError", + "ArrowNotImplementedError", + "ArrowTypeError", + "ArrowSerializationError", + "serialize_pandas", + "deserialize_pandas", + "ipc", + "types", + "_deprecate_api", + "_deprecate_class", + "Message", + "MessageReader", + "MetadataVersion", + "RecordBatchFileReader", + "RecordBatchFileWriter", + "RecordBatchStreamReader", + "RecordBatchStreamWriter", + "get_include", + "_get_pkg_config_executable", + "_has_pkg_config", + "_read_pkg_config_variable", + "get_libraries", + "create_library_symlinks", + "get_library_dirs", +] diff --git a/python/stubs/__lib_pxi/__init__.pyi b/python/stubs/__lib_pxi/__init__.pyi new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/stubs/__lib_pxi/array.pyi b/python/stubs/__lib_pxi/array.pyi new file mode 100644 index 00000000000..ec1cda30a88 --- /dev/null +++ b/python/stubs/__lib_pxi/array.pyi @@ -0,0 +1,4274 @@ +import datetime as dt +import sys + +from collections.abc import Callable +from decimal import Decimal + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from typing import ( + Any, + Generic, + Iterable, + Iterator, + Literal, + TypeVar, + overload, +) + +import numpy as np +import pandas as pd + +from pandas.core.dtypes.base import ExtensionDtype +from pyarrow._compute import CastOptions +from pyarrow._stubs_typing import ( + ArrayLike, + Indices, + Mask, + Order, + SupportArrowArray, + SupportArrowDeviceArray, +) +from pyarrow.lib import ( + Buffer, + Device, + MemoryManager, + MemoryPool, + MonthDayNano, + Tensor, + _Weakrefable, +) +from typing_extensions import deprecated + +from . 
import scalar, types +from .device import DeviceAllocationType +from .scalar import NullableCollection, Scalar +from .types import ( + DataType, + Field, + MapType, + _AsPyType, + _BasicDataType, + _BasicValueT, + _DataTypeT, + _IndexT, + _RunEndType, + _Size, +) + +@overload +def array( + values: NullableCollection[bool], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> BooleanArray: ... +@overload +def array( + values: NullableCollection[int], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Int64Array: ... +@overload +def array( + values: NullableCollection[float], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> DoubleArray: ... +@overload +def array( + values: NullableCollection[Decimal], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Decimal128Array: ... +@overload +def array( + values: NullableCollection[dict[str, Any]], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> StructArray: ... +@overload +def array( + values: NullableCollection[dt.date], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Date32Array: ... +@overload +def array( + values: NullableCollection[dt.time], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Time64Array[Literal["us"]]: ... +@overload +def array( + values: NullableCollection[dt.timedelta], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> DurationArray[Literal["us"]]: ... +@overload +def array( + values: NullableCollection[MonthDayNano], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> MonthDayNanoIntervalArray: ... +@overload +def array( + values: NullableCollection[str], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> StringArray: ... +@overload +def array( + values: NullableCollection[bytes], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> BinaryArray: ... +@overload +def array( + values: NullableCollection[list[Any]], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> ListArray[Any]: ... 
+@overload +def array( + values: NullableCollection[_ScalarT], + type: None = None, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Array[_ScalarT]: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["null"] | types.NullType, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> NullArray: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["bool", "boolean"] | types.BoolType, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> BooleanArray: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["i1", "int8"] | types.Int8Type, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Int8Array: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["i2", "int16"] | types.Int16Type, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Int16Array: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["i4", "int32"] | types.Int32Type, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Int32Array: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["i8", "int64"] | types.Int64Type, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Int64Array: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["u1", "uint8"] | types.UInt8Type, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> UInt8Array: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["u2", "uint16"] | types.UInt16Type, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> UInt16Array: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["u4", "uint32"] | types.Uint32Type, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> UInt32Array: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["u8", "uint64"] | types.UInt64Type, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> UInt64Array: ... 
+@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["f2", "halffloat", "float16"] | types.Float16Type, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> HalfFloatArray: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["f4", "float", "float32"] | types.Float32Type, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> FloatArray: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["f8", "double", "float64"] | types.Float64Type, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> DoubleArray: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["string", "str", "utf8"] | types.StringType, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> StringArray: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["binary"] | types.BinaryType, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> BinaryArray: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["large_string", "large_str", "large_utf8"] | types.LargeStringType, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> LargeStringArray: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["large_binary"] | types.LargeBinaryType, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> LargeBinaryArray: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["binary_view"] | types.BinaryViewType, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> BinaryViewArray: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["string_view"] | types.StringViewType, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> StringViewArray: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["date32", "date32[day]"] | types.Date32Type, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Date32Array: ... 
+@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["date64", "date64[ms]"] | types.Date64Type, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Date64Array: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["time32[s]"] | types.Time32Type[Literal["s"]], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Time32Array[Literal["s"]]: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["time32[ms]"] | types.Time32Type[Literal["ms"]], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Time32Array[Literal["ms"]]: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["time64[us]"] | types.Time64Type[Literal["us"]], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Time64Array[Literal["us"]]: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["time64[ns]"] | types.Time64Type[Literal["ns"]], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Time64Array[Literal["ns"]]: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["timestamp[s]"] | types.TimestampType[Literal["s"]], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> TimestampArray[Literal["s"]]: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["timestamp[ms]"] | types.TimestampType[Literal["ms"]], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> TimestampArray[Literal["ms"]]: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["timestamp[us]"] | types.TimestampType[Literal["us"]], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> TimestampArray[Literal["us"]]: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["duration[s]"] | types.DurationType[Literal["s"]], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> DurationArray[Literal["s"]]: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["duration[ms]"] | types.DurationType[Literal["ms"]], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> DurationArray[Literal["ms"]]: ... 
+@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["duration[us]"] | types.DurationType[Literal["us"]], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> DurationArray[Literal["us"]]: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["duration[ns]"] | types.DurationType[Literal["ns"]], + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> DurationArray[Literal["ns"]]: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["month_day_nano_interval"] | types.MonthDayNanoIntervalType, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> MonthDayNanoIntervalArray: ... +@overload +def array( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: _DataTypeT, + mask: Mask | None = None, + size: int | None = None, + from_pandas: bool | None = None, + safe: bool = True, + memory_pool: MemoryPool | None = None, +) -> Array[Scalar[_DataTypeT]]: ... +def array(*args, **kwargs): + """ + Create pyarrow.Array instance from a Python object. + + Parameters + ---------- + obj : sequence, iterable, ndarray, pandas.Series, Arrow-compatible array + If both type and size are specified may be a single use iterable. If + not strongly-typed, Arrow type will be inferred for resulting array. + Any Arrow-compatible array that implements the Arrow PyCapsule Protocol + (has an ``__arrow_c_array__`` or ``__arrow_c_device_array__`` method) + can be passed as well. + type : pyarrow.DataType + Explicit type to attempt to coerce to, otherwise will be inferred from + the data. + mask : array[bool], optional + Indicate which values are null (True) or not null (False). + size : int64, optional + Size of the elements. If the input is larger than size bail at this + length. For iterators, if size is larger than the input iterator this + will be treated as a "max size", but will involve an initial allocation + of size followed by a resize to the actual size (so if you know the + exact size specifying it correctly will give you better performance). + from_pandas : bool, default None + Use pandas's semantics for inferring nulls from values in + ndarray-like data. If passed, the mask takes precedence, but + if a value is unmasked (not-null), but still null according to + pandas semantics, then it is null. Defaults to False if not + passed explicitly by user, or True if a pandas object is + passed in. + safe : bool, default True + Check for overflows or other unsafe conversions. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the currently-set default + memory pool. + + Returns + ------- + array : pyarrow.Array or pyarrow.ChunkedArray + A ChunkedArray instead of an Array is returned if: + + - the object data overflowed binary storage. + - the object's ``__arrow_array__`` protocol method returned a chunked + array. + + Notes + ----- + Timezone will be preserved in the returned array for timezone-aware data, + else no timezone will be returned for naive timestamps. + Internally, UTC values are stored for timezone-aware data with the + timezone set in the data type.
+ + Pandas's DateOffsets and dateutil.relativedelta.relativedelta are by + default converted as MonthDayNanoIntervalArray. relativedelta leapdays + are ignored as are all absolute fields on both objects. datetime.timedelta + can also be converted to MonthDayNanoIntervalArray but this requires + passing MonthDayNanoIntervalType explicitly. + + Converting to dictionary array will promote to a wider integer type for + indices if the number of distinct values cannot be represented, even if + the index type was explicitly set. This means that if there are more than + 127 values the returned dictionary array's index type will be at least + pa.int16() even if pa.int8() was passed to the function. Note that an + explicit index type will not be demoted even if it is wider than required. + + Examples + -------- + >>> import pandas as pd + >>> import pyarrow as pa + >>> pa.array(pd.Series([1, 2])) + + [ + 1, + 2 + ] + + >>> pa.array(["a", "b", "a"], type=pa.dictionary(pa.int8(), pa.string())) + + ... + -- dictionary: + [ + "a", + "b" + ] + -- indices: + [ + 0, + 1, + 0 + ] + + >>> import numpy as np + >>> pa.array(pd.Series([1, 2]), mask=np.array([0, 1], dtype=bool)) + + [ + 1, + null + ] + + >>> arr = pa.array(range(1024), type=pa.dictionary(pa.int8(), pa.int64())) + >>> arr.type.index_type + DataType(int16) + """ + +@overload +def asarray(values: NullableCollection[bool]) -> BooleanArray: ... +@overload +def asarray(values: NullableCollection[int]) -> Int64Array: ... +@overload +def asarray(values: NullableCollection[float]) -> DoubleArray: ... +@overload +def asarray(values: NullableCollection[Decimal]) -> Decimal128Array: ... +@overload +def asarray(values: NullableCollection[dict[str, Any]]) -> StructArray: ... +@overload +def asarray(values: NullableCollection[dt.date]) -> Date32Array: ... +@overload +def asarray(values: NullableCollection[dt.time]) -> Time64Array: ... +@overload +def asarray(values: NullableCollection[dt.timedelta]) -> DurationArray: ... +@overload +def asarray(values: NullableCollection[MonthDayNano]) -> MonthDayNanoIntervalArray: ... +@overload +def asarray(values: NullableCollection[list[Any]]) -> ListArray[Any]: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["null"] | types.NullType, +) -> NullArray: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["bool", "boolean"] | types.BoolType, +) -> BooleanArray: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["i1", "int8"] | types.Int8Type, +) -> Int8Array: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["i2", "int16"] | types.Int16Type, +) -> Int16Array: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["i4", "int32"] | types.Int32Type, +) -> Int32Array: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["i8", "int64"] | types.Int64Type, +) -> Int64Array: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["u1", "uint8"] | types.UInt8Type, +) -> UInt8Array: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["u2", "uint16"] | types.UInt16Type, +) -> UInt16Array: ... 
+@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["u4", "uint32"] | types.Uint32Type, +) -> UInt32Array: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["u8", "uint64"] | types.UInt64Type, +) -> UInt64Array: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["f2", "halffloat", "float16"] | types.Float16Type, +) -> HalfFloatArray: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["f4", "float", "float32"] | types.Float32Type, +) -> FloatArray: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["f8", "double", "float64"] | types.Float64Type, +) -> DoubleArray: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["string", "str", "utf8"] | types.StringType, +) -> StringArray: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["binary"] | types.BinaryType, +) -> BinaryArray: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["large_string", "large_str", "large_utf8"] | types.LargeStringType, +) -> LargeStringArray: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["large_binary"] | types.LargeBinaryType, +) -> LargeBinaryArray: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["binary_view"] | types.BinaryViewType, +) -> BinaryViewArray: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["string_view"] | types.StringViewType, +) -> StringViewArray: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["date32", "date32[day]"] | types.Date32Type, +) -> Date32Array: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["date64", "date64[ms]"] | types.Date64Type, +) -> Date64Array: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["time32[s]"] | types.Time32Type[Literal["s"]], +) -> Time32Array[Literal["s"]]: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["time32[ms]"] | types.Time32Type[Literal["ms"]], +) -> Time32Array[Literal["ms"]]: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["time64[us]"] | types.Time64Type[Literal["us"]], +) -> Time64Array[Literal["us"]]: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["time64[ns]"] | types.Time64Type[Literal["ns"]], +) -> Time64Array[Literal["ns"]]: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["timestamp[s]"] | types.TimestampType[Literal["s"]], +) -> TimestampArray[Literal["s"]]: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["timestamp[ms]"] | types.TimestampType[Literal["ms"]], +) -> TimestampArray[Literal["ms"]]: ... 
+@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["timestamp[us]"] | types.TimestampType[Literal["us"]], +) -> TimestampArray[Literal["us"]]: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["timestamp[ns]"] | types.TimestampType[Literal["ns"]], +) -> TimestampArray[Literal["ns"]]: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["duration[s]"] | types.DurationType[Literal["s"]], +) -> DurationArray[Literal["s"]]: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["duration[ms]"] | types.DurationType[Literal["ms"]], +) -> DurationArray[Literal["ms"]]: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["duration[us]"] | types.DurationType[Literal["us"]], +) -> DurationArray[Literal["us"]]: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["duration[ns]"] | types.DurationType[Literal["ns"]], +) -> DurationArray[Literal["ns"]]: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: Literal["month_day_nano_interval"] | types.MonthDayNanoIntervalType, +) -> MonthDayNanoIntervalArray: ... +@overload +def asarray( + values: Iterable[Any] | SupportArrowArray | SupportArrowDeviceArray, + type: _DataTypeT, +) -> Array[Scalar[_DataTypeT]]: ... +def asarray(*args, **kwargs): + """ + Convert to pyarrow.Array, inferring type if not provided. + + Parameters + ---------- + values : array-like + This can be a sequence, numpy.ndarray, pyarrow.Array or + pyarrow.ChunkedArray. If a ChunkedArray is passed, the output will be + a ChunkedArray, otherwise the output will be a Array. + type : string or DataType + Explicitly construct the array with this type. Attempt to cast if + indicated type is different. + + Returns + ------- + arr : Array or ChunkedArray + """ + +@overload +def nulls(size: int, memory_pool: MemoryPool | None = None) -> NullArray: ... +@overload +def nulls( + size: int, type: types.NullType | None, memory_pool: MemoryPool | None = None +) -> NullArray: ... +@overload +def nulls( + size: int, type: types.BoolType, memory_pool: MemoryPool | None = None +) -> BooleanArray: ... +@overload +def nulls(size: int, type: types.Int8Type, memory_pool: MemoryPool | None = None) -> Int8Array: ... +@overload +def nulls( + size: int, type: types.Int16Type, memory_pool: MemoryPool | None = None +) -> Int16Array: ... +@overload +def nulls( + size: int, type: types.Int32Type, memory_pool: MemoryPool | None = None +) -> Int32Array: ... +@overload +def nulls( + size: int, type: types.Int64Type, memory_pool: MemoryPool | None = None +) -> Int64Array: ... +@overload +def nulls( + size: int, type: types.UInt8Type, memory_pool: MemoryPool | None = None +) -> UInt8Array: ... +@overload +def nulls( + size: int, type: types.UInt16Type, memory_pool: MemoryPool | None = None +) -> UInt16Array: ... +@overload +def nulls( + size: int, type: types.Uint32Type, memory_pool: MemoryPool | None = None +) -> UInt32Array: ... +@overload +def nulls( + size: int, type: types.UInt64Type, memory_pool: MemoryPool | None = None +) -> UInt64Array: ... +@overload +def nulls( + size: int, type: types.Float16Type, memory_pool: MemoryPool | None = None +) -> HalfFloatArray: ... 
+@overload +def nulls( + size: int, type: types.Float32Type, memory_pool: MemoryPool | None = None +) -> FloatArray: ... +@overload +def nulls( + size: int, type: types.Float64Type, memory_pool: MemoryPool | None = None +) -> DoubleArray: ... +@overload +def nulls( + size: int, type: types.Decimal32Type, memory_pool: MemoryPool | None = None +) -> Decimal128Array: ... +@overload +def nulls( + size: int, type: types.Decimal64Type, memory_pool: MemoryPool | None = None +) -> Decimal128Array: ... +@overload +def nulls( + size: int, type: types.Decimal128Type, memory_pool: MemoryPool | None = None +) -> Decimal128Array: ... +@overload +def nulls( + size: int, type: types.Decimal256Type, memory_pool: MemoryPool | None = None +) -> Decimal256Array: ... +@overload +def nulls( + size: int, type: types.Date32Type, memory_pool: MemoryPool | None = None +) -> Date32Array: ... +@overload +def nulls( + size: int, type: types.Date64Type, memory_pool: MemoryPool | None = None +) -> Date64Array: ... +@overload +def nulls( + size: int, type: types.Time32Type[types._Time32Unit], memory_pool: MemoryPool | None = None +) -> Time32Array[types._Time32Unit]: ... +@overload +def nulls( + size: int, type: types.Time64Type[types._Time64Unit], memory_pool: MemoryPool | None = None +) -> Time64Array[types._Time64Unit]: ... +@overload +def nulls( + size: int, + type: types.TimestampType[types._Unit, types._Tz], + memory_pool: MemoryPool | None = None, +) -> TimestampArray[types._Unit, types._Tz]: ... +@overload +def nulls( + size: int, type: types.DurationType[types._Unit], memory_pool: MemoryPool | None = None +) -> DurationArray[types._Unit]: ... +@overload +def nulls( + size: int, type: types.MonthDayNanoIntervalType, memory_pool: MemoryPool | None = None +) -> MonthDayNanoIntervalArray: ... +@overload +def nulls( + size: int, + type: types.BinaryType, + memory_pool: MemoryPool | None = None, +) -> BinaryArray: ... +@overload +def nulls( + size: int, + type: types.LargeBinaryType, + memory_pool: MemoryPool | None = None, +) -> LargeBinaryArray: ... +@overload +def nulls( + size: int, + type: types.FixedSizeBinaryType, + memory_pool: MemoryPool | None = None, +) -> FixedSizeBinaryArray: ... +@overload +def nulls( + size: int, + type: types.StringType, + memory_pool: MemoryPool | None = None, +) -> StringArray: ... +@overload +def nulls( + size: int, + type: types.LargeStringType, + memory_pool: MemoryPool | None = None, +) -> LargeStringArray: ... +@overload +def nulls( + size: int, + type: types.BinaryViewType, + memory_pool: MemoryPool | None = None, +) -> BinaryViewArray: ... +@overload +def nulls( + size: int, + type: types.StringViewType, + memory_pool: MemoryPool | None = None, +) -> StringViewArray: ... +@overload +def nulls( + size: int, + type: types.LargeListType[_DataTypeT], + memory_pool: MemoryPool | None = None, +) -> LargeListArray[_DataTypeT]: ... +@overload +def nulls( + size: int, + type: types.ListViewType[_DataTypeT], + memory_pool: MemoryPool | None = None, +) -> ListViewArray[_DataTypeT]: ... +@overload +def nulls( + size: int, + type: types.LargeListViewType[_DataTypeT], + memory_pool: MemoryPool | None = None, +) -> LargeListViewArray[_DataTypeT]: ... +@overload +def nulls( + size: int, + type: types.FixedSizeListType[_DataTypeT, _Size], + memory_pool: MemoryPool | None = None, +) -> FixedSizeListArray[_DataTypeT, _Size]: ... +@overload +def nulls( + size: int, + type: types.ListType[_DataTypeT], + memory_pool: MemoryPool | None = None, +) -> ListArray[scalar.ListScalar[_DataTypeT]]: ... 
+@overload +def nulls( + size: int, + type: types.StructType, + memory_pool: MemoryPool | None = None, +) -> StructArray: ... +@overload +def nulls( + size: int, + type: types.MapType[_MapKeyT, _MapItemT], + memory_pool: MemoryPool | None = None, +) -> MapArray[_MapKeyT, _MapItemT]: ... +@overload +def nulls( + size: int, + type: types.DictionaryType[_IndexT, _BasicValueT], + memory_pool: MemoryPool | None = None, +) -> DictionaryArray[_IndexT, _BasicValueT]: ... +@overload +def nulls( + size: int, + type: types.RunEndEncodedType[_RunEndType, _BasicValueT], + memory_pool: MemoryPool | None = None, +) -> RunEndEncodedArray[_RunEndType, _BasicValueT]: ... +@overload +def nulls( + size: int, + type: types.UnionType, + memory_pool: MemoryPool | None = None, +) -> UnionArray: ... +@overload +def nulls( + size: int, + type: types.FixedShapeTensorType[types._ValueT], + memory_pool: MemoryPool | None = None, +) -> FixedShapeTensorArray[Any]: ... +@overload +def nulls( + size: int, + type: types.Bool8Type, + memory_pool: MemoryPool | None = None, +) -> Bool8Array: ... +@overload +def nulls( + size: int, + type: types.UuidType, + memory_pool: MemoryPool | None = None, +) -> UuidArray[Any]: ... +@overload +def nulls( + size: int, + type: types.JsonType, + memory_pool: MemoryPool | None = None, +) -> JsonArray[Any]: ... +@overload +def nulls( + size: int, + type: types.OpaqueType, + memory_pool: MemoryPool | None = None, +) -> OpaqueArray[Any]: ... +@overload +def nulls( + size: int, + type: types.ExtensionType, + memory_pool: MemoryPool | None = None, +) -> ExtensionArray[Any]: ... +def nulls(*args, **kwargs): + """ + Create a strongly-typed Array instance with all elements null. + + Parameters + ---------- + size : int + Array length. + type : pyarrow.DataType, default None + Explicit type for the array. By default use NullType. + memory_pool : MemoryPool, default None + Arrow MemoryPool to use for allocations. Uses the default memory + pool if not passed. + + Returns + ------- + arr : Array + + Examples + -------- + >>> import pyarrow as pa + >>> pa.nulls(10) + + 10 nulls + + >>> pa.nulls(3, pa.uint32()) + + [ + null, + null, + null + ] + """ + +@overload +def repeat( + value: None | scalar.NullScalar, size: int, memory_pool: MemoryPool | None = None +) -> NullArray: ... +@overload +def repeat( # type: ignore[overload-overlap] + value: bool | scalar.BooleanScalar, size: int, memory_pool: MemoryPool | None = None +) -> BooleanArray: ... +@overload +def repeat( + value: scalar.Int8Scalar, size: int, memory_pool: MemoryPool | None = None +) -> Int8Array: ... +@overload +def repeat( + value: scalar.Int16Scalar, size: int, memory_pool: MemoryPool | None = None +) -> Int16Array: ... +@overload +def repeat( + value: scalar.Int32Scalar, size: int, memory_pool: MemoryPool | None = None +) -> Int32Array: ... +@overload +def repeat( + value: int | scalar.Int64Scalar, size: int, memory_pool: MemoryPool | None = None +) -> Int64Array: ... +@overload +def repeat( + value: scalar.UInt8Scalar, size: int, memory_pool: MemoryPool | None = None +) -> UInt8Array: ... +@overload +def repeat( + value: scalar.UInt16Scalar, size: int, memory_pool: MemoryPool | None = None +) -> UInt16Array: ... +@overload +def repeat( + value: scalar.UInt32Scalar, size: int, memory_pool: MemoryPool | None = None +) -> UInt32Array: ... +@overload +def repeat( + value: scalar.UInt64Scalar, size: int, memory_pool: MemoryPool | None = None +) -> UInt64Array: ... 
+@overload +def repeat( + value: scalar.HalfFloatScalar, size: int, memory_pool: MemoryPool | None = None +) -> HalfFloatArray: ... +@overload +def repeat( + value: scalar.FloatScalar, size: int, memory_pool: MemoryPool | None = None +) -> FloatArray: ... +@overload +def repeat( + value: float | scalar.DoubleScalar, size: int, memory_pool: MemoryPool | None = None +) -> DoubleArray: ... +@overload +def repeat( + value: Decimal | scalar.Decimal32Scalar, size: int, memory_pool: MemoryPool | None = None +) -> Decimal32Array: ... +@overload +def repeat( + value: scalar.Decimal64Scalar, size: int, memory_pool: MemoryPool | None = None +) -> Decimal64Array: ... +@overload +def repeat( + value: scalar.Decimal128Scalar, size: int, memory_pool: MemoryPool | None = None +) -> Decimal128Array: ... +@overload +def repeat( + value: scalar.Decimal256Scalar, size: int, memory_pool: MemoryPool | None = None +) -> Decimal256Array: ... +@overload +def repeat( + value: dt.date | scalar.Date32Scalar, size: int, memory_pool: MemoryPool | None = None +) -> Date32Array: ... +@overload +def repeat( + value: scalar.Date64Scalar, size: int, memory_pool: MemoryPool | None = None +) -> Date64Array: ... +@overload +def repeat( + value: scalar.Time32Scalar[types._Time32Unit], size: int, memory_pool: MemoryPool | None = None +) -> Time32Array[types._Time32Unit]: ... +@overload +def repeat( + value: dt.time | scalar.Time64Scalar[types._Time64Unit], + size: int, + memory_pool: MemoryPool | None = None, +) -> Time64Array[types._Time64Unit]: ... +@overload +def repeat( + value: scalar.TimestampScalar[types._Unit, types._Tz], + size: int, + memory_pool: MemoryPool | None = None, +) -> TimestampArray[types._Unit, types._Tz]: ... +@overload +def repeat( + value: dt.timedelta | scalar.DurationScalar[types._Unit], + size: int, + memory_pool: MemoryPool | None = None, +) -> DurationArray[types._Unit]: ... +@overload +def repeat( # pyright: ignore[reportOverlappingOverload] + value: MonthDayNano | scalar.MonthDayNanoIntervalScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> MonthDayNanoIntervalArray: ... +@overload +def repeat( + value: bytes | scalar.BinaryScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> BinaryArray: ... +@overload +def repeat( + value: scalar.LargeBinaryScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> LargeBinaryArray: ... +@overload +def repeat( + value: scalar.FixedSizeBinaryScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> FixedSizeBinaryArray: ... +@overload +def repeat( + value: str | scalar.StringScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> StringArray: ... +@overload +def repeat( + value: scalar.LargeStringScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> LargeStringArray: ... +@overload +def repeat( + value: scalar.BinaryViewScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> BinaryViewArray: ... +@overload +def repeat( + value: scalar.StringViewScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> StringViewArray: ... +@overload +def repeat( + value: list[Any] | tuple[Any] | scalar.ListScalar[_DataTypeT], + size: int, + memory_pool: MemoryPool | None = None, +) -> ListArray[scalar.ListScalar[_DataTypeT]]: ... +@overload +def repeat( + value: scalar.FixedSizeListScalar[_DataTypeT, _Size], + size: int, + memory_pool: MemoryPool | None = None, +) -> FixedSizeListArray[_DataTypeT, _Size]: ... 
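# A minimal usage sketch for the overloads above that accept plain Python values:
# dates, strings and lists map to Date32Array, StringArray and ListArray respectively.
import datetime as dt
import pyarrow as pa

dates = pa.repeat(dt.date(2020, 1, 1), 2)  # dt.date -> Date32Array
names = pa.repeat("string", 3)             # str -> StringArray
pairs = pa.repeat([1, 2], 2)               # list -> ListArray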
+@overload +def repeat( + value: scalar.LargeListScalar[_DataTypeT], + size: int, + memory_pool: MemoryPool | None = None, +) -> LargeListArray[_DataTypeT]: ... +@overload +def repeat( + value: scalar.ListViewScalar[_DataTypeT], + size: int, + memory_pool: MemoryPool | None = None, +) -> ListViewArray[_DataTypeT]: ... +@overload +def repeat( + value: scalar.LargeListViewScalar[_DataTypeT], + size: int, + memory_pool: MemoryPool | None = None, +) -> LargeListViewArray[_DataTypeT]: ... +@overload +def repeat( + value: dict[str, Any] | scalar.StructScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> StructArray: ... +@overload +def repeat( + value: scalar.MapScalar[_MapKeyT, _MapItemT], + size: int, + memory_pool: MemoryPool | None = None, +) -> MapArray[_MapKeyT, _MapItemT]: ... +@overload +def repeat( + value: scalar.DictionaryScalar[_IndexT, _BasicValueT], + size: int, + memory_pool: MemoryPool | None = None, +) -> DictionaryArray[_IndexT, _BasicValueT]: ... +@overload +def repeat( + value: scalar.RunEndEncodedScalar[_RunEndType, _BasicValueT], + size: int, + memory_pool: MemoryPool | None = None, +) -> RunEndEncodedArray[_RunEndType, _BasicValueT]: ... +@overload +def repeat( + value: scalar.UnionScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> UnionArray: ... +@overload +def repeat( + value: scalar.FixedShapeTensorScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> FixedShapeTensorArray[Any]: ... +@overload +def repeat( + value: scalar.Bool8Scalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> Bool8Array: ... +@overload +def repeat( + value: scalar.UuidScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> UuidArray[Any]: ... +@overload +def repeat( + value: scalar.JsonScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> JsonArray[Any]: ... +@overload +def repeat( + value: scalar.OpaqueScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> OpaqueArray[Any]: ... +@overload +def repeat( + value: scalar.ExtensionScalar, + size: int, + memory_pool: MemoryPool | None = None, +) -> ExtensionArray[Any]: ... +def repeat(*args, **kwargs): + """ + Create an Array instance whose slots are the given scalar. + + Parameters + ---------- + value : Scalar-like object + Either a pyarrow.Scalar or any python object coercible to a Scalar. + size : int + Number of times to repeat the scalar in the output Array. + memory_pool : MemoryPool, default None + Arrow MemoryPool to use for allocations. Uses the default memory + pool if not passed. + + Returns + ------- + arr : Array + + Examples + -------- + >>> import pyarrow as pa + >>> pa.repeat(10, 3) + + [ + 10, + 10, + 10 + ] + + >>> pa.repeat([1, 2], 2) + + [ + [ + 1, + 2 + ], + [ + 1, + 2 + ] + ] + + >>> pa.repeat("string", 3) + + [ + "string", + "string", + "string" + ] + + >>> pa.repeat(pa.scalar({"a": 1, "b": [1, 2]}), 2) + + -- is_valid: all not null + -- child 0 type: int64 + [ + 1, + 1 + ] + -- child 1 type: list + [ + [ + 1, + 2 + ], + [ + 1, + 2 + ] + ] + """ + +def infer_type(values: Iterable[Any], mask: Mask, from_pandas: bool = False) -> DataType: + """ + Attempt to infer Arrow data type that can hold the passed Python + sequence type in an Array object + + Parameters + ---------- + values : array-like + Sequence to infer type from. + mask : ndarray (bool type), optional + Optional exclusion mask where True marks null, False non-null. + from_pandas : bool, default False + Use pandas's NA/null sentinel values for type inference. 
+ + Returns + ------- + type : DataType + """ + +class ArrayStatistics(_Weakrefable): + """ + The class for statistics of an array. + """ + @property + def null_count(self) -> int: + """ + The number of nulls. + """ + @property + def distinct_count(self) -> int: + """ + The number of distinct values. + """ + @property + def min(self) -> Any: + """ + The minimum value. + """ + @property + def is_min_exact(self) -> bool: + """ + Whether the minimum value is an exact value or not. + """ + @property + def max(self) -> Any: + """ + The maximum value. + """ + + @property + def is_max_exact(self) -> bool: + """ + Whether the maximum value is an exact value or not. + """ + +_ConvertAs = TypeVar("_ConvertAs", pd.DataFrame, pd.Series) + +class _PandasConvertible(_Weakrefable, Generic[_ConvertAs]): + def to_pandas( + self, + memory_pool: MemoryPool | None = None, + categories: list | None = None, + strings_to_categorical: bool = False, + zero_copy_only: bool = False, + integer_object_nulls: bool = False, + date_as_object: bool = True, + timestamp_as_object: bool = False, + use_threads: bool = True, + deduplicate_objects: bool = True, + ignore_metadata: bool = False, + safe: bool = True, + split_blocks: bool = False, + self_destruct: bool = False, + maps_as_pydicts: Literal["None", "lossy", "strict"] | None = None, + types_mapper: Callable[[DataType], ExtensionDtype | None] | None = None, + coerce_temporal_nanoseconds: bool = False, + ) -> _ConvertAs: + """ + Convert to a pandas-compatible NumPy array or DataFrame, as appropriate + + Parameters + ---------- + memory_pool : MemoryPool, default None + Arrow MemoryPool to use for allocations. Uses the default memory + pool if not passed. + categories : list, default empty + List of fields that should be returned as pandas.Categorical. Only + applies to table-like data structures. + strings_to_categorical : bool, default False + Encode string (UTF8) and binary types to pandas.Categorical. + zero_copy_only : bool, default False + Raise an ArrowException if this function call would require copying + the underlying data. + integer_object_nulls : bool, default False + Cast integers with nulls to objects + date_as_object : bool, default True + Cast dates to objects. If False, convert to datetime64 dtype with + the equivalent time unit (if supported). Note: in pandas version + < 2.0, only datetime64[ns] conversion is supported. + timestamp_as_object : bool, default False + Cast non-nanosecond timestamps (np.datetime64) to objects. This is + useful in pandas version 1.x if you have timestamps that don't fit + in the normal date range of nanosecond timestamps (1678 CE-2262 CE). + Non-nanosecond timestamps are supported in pandas version 2.0. + If False, all timestamps are converted to datetime64 dtype. + use_threads : bool, default True + Whether to parallelize the conversion using multiple threads. + deduplicate_objects : bool, default True + Do not create multiple copies Python objects when created, to save + on memory use. Conversion will be slower. + ignore_metadata : bool, default False + If True, do not use the 'pandas' metadata to reconstruct the + DataFrame index, if present + safe : bool, default True + For certain data types, a cast is needed in order to store the + data in a pandas DataFrame or Series (e.g. timestamps are always + stored as nanoseconds in pandas). This option controls whether it + is a safe cast or not. 
+ split_blocks : bool, default False + If True, generate one internal "block" for each column when + creating a pandas.DataFrame from a RecordBatch or Table. While this + can temporarily reduce memory note that various pandas operations + can trigger "consolidation" which may balloon memory use. + self_destruct : bool, default False + EXPERIMENTAL: If True, attempt to deallocate the originating Arrow + memory while converting the Arrow object to pandas. If you use the + object after calling to_pandas with this option it will crash your + program. + + Note that you may not see always memory usage improvements. For + example, if multiple columns share an underlying allocation, + memory can't be freed until all columns are converted. + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + This can change the ordering of (key, value) pairs, and will + deduplicate multiple keys, resulting in a possible loss of data. + + If 'lossy', this key deduplication results in a warning printed + when detected. If 'strict', this instead results in an exception + being raised when detected. + types_mapper : function, default None + A function mapping a pyarrow DataType to a pandas ExtensionDtype. + This can be used to override the default pandas type for conversion + of built-in pyarrow types or in absence of pandas_metadata in the + Table schema. The function receives a pyarrow DataType and is + expected to return a pandas ExtensionDtype or ``None`` if the + default conversion should be used for that type. If you have + a dictionary mapping, you can pass ``dict.get`` as function. + coerce_temporal_nanoseconds : bool, default False + Only applicable to pandas version >= 2.0. + A legacy option to coerce date32, date64, duration, and timestamp + time units to nanoseconds when converting to pandas. This is the + default behavior in pandas version 1.x. Set this option to True if + you'd like to use this coercion when using pandas version >= 2.0 + for backwards compatibility (not recommended otherwise). + + Returns + ------- + pandas.Series or pandas.DataFrame depending on type of object + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + + Convert a Table to pandas DataFrame: + + >>> table = pa.table( + ... [ + ... pa.array([2, 4, 5, 100]), + ... pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]), + ... ], + ... names=["n_legs", "animals"], + ... 
) + >>> table.to_pandas() + n_legs animals + 0 2 Flamingo + 1 4 Horse + 2 5 Brittle stars + 3 100 Centipede + >>> isinstance(table.to_pandas(), pd.DataFrame) + True + + Convert a RecordBatch to pandas DataFrame: + + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 4, 5, 100]) + >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) + >>> batch = pa.record_batch([n_legs, animals], names=["n_legs", "animals"]) + >>> batch + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,4,5,100] + animals: ["Flamingo","Horse","Brittle stars","Centipede"] + >>> batch.to_pandas() + n_legs animals + 0 2 Flamingo + 1 4 Horse + 2 5 Brittle stars + 3 100 Centipede + >>> isinstance(batch.to_pandas(), pd.DataFrame) + True + + Convert a Chunked Array to pandas Series: + + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs.to_pandas() + 0 2 + 1 2 + 2 4 + 3 4 + 4 5 + 5 100 + dtype: int64 + >>> isinstance(n_legs.to_pandas(), pd.Series) + True + """ + +_CastAs = TypeVar("_CastAs", bound=DataType) +_Scalar_co = TypeVar("_Scalar_co", bound=Scalar, covariant=True) +_ScalarT = TypeVar("_ScalarT", bound=Scalar) + +class Array(_PandasConvertible[pd.Series], Generic[_Scalar_co]): + """ + The base class for all Arrow arrays. + """ + + def diff(self, other: Self) -> str: + """ + Compare contents of this array against another one. + + Return a string containing the result of diffing this array + (on the left side) against the other array (on the right side). + + Parameters + ---------- + other : Array + The other array to compare this array with. + + Returns + ------- + diff : str + A human-readable printout of the differences. + + Examples + -------- + >>> import pyarrow as pa + >>> left = pa.array(["one", "two", "three"]) + >>> right = pa.array(["two", None, "two-and-a-half", "three"]) + >>> print(left.diff(right)) # doctest: +SKIP + + @@ -0, +0 @@ + -"one" + @@ -2, +1 @@ + +null + +"two-and-a-half" + """ + def cast( + self, + target_type: _CastAs, + safe: bool = True, + options: CastOptions | None = None, + memory_pool: MemoryPool | None = None, + ) -> Array[Scalar[_CastAs]]: + """ + Cast array values to another data type + + See :func:`pyarrow.compute.cast` for usage. + + Parameters + ---------- + target_type : DataType, default None + Type to cast array to. + safe : boolean, default True + Whether to check for conversion errors such as overflow. + options : CastOptions, default None + Additional checks pass by CastOptions + memory_pool : MemoryPool, optional + memory pool to use for allocations during function execution. + + Returns + ------- + cast : Array + """ + def view(self, target_type: _CastAs) -> Array[Scalar[_CastAs]]: + """ + Return zero-copy "view" of array as another data type. + + The data types must have compatible columnar buffer layouts + + Parameters + ---------- + target_type : DataType + Type to construct view as. + + Returns + ------- + view : Array + """ + def sum(self, **kwargs) -> _Scalar_co: + """ + Sum the values in a numerical array. + + See :func:`pyarrow.compute.sum` for full usage. + + Parameters + ---------- + **kwargs : dict, optional + Options to pass to :func:`pyarrow.compute.sum`. + + Returns + ------- + sum : Scalar + A scalar containing the sum value. + """ + @property + def type(self: Array[Scalar[_DataTypeT]]) -> _DataTypeT: ... + def unique(self) -> Self: + """ + Compute distinct elements in array. 
+
+        Returns
+        -------
+        unique : Array
+            An array of the same data type, with deduplicated elements.
+        """
+    def dictionary_encode(self, null_encoding: str = "mask") -> DictionaryArray:
+        """
+        Compute dictionary-encoded representation of array.
+
+        See :func:`pyarrow.compute.dictionary_encode` for full usage.
+
+        Parameters
+        ----------
+        null_encoding : str, default "mask"
+            How to handle null entries.
+
+        Returns
+        -------
+        encoded : DictionaryArray
+            A dictionary-encoded version of this array.
+        """
+    def value_counts(self) -> StructArray:
+        """
+        Compute counts of unique elements in array.
+
+        Returns
+        -------
+        StructArray
+            An array of structs
+        """
+    @overload
+    @staticmethod
+    def from_pandas(
+        obj: pd.Series | np.ndarray | ArrayLike,
+        *,
+        mask: Mask | None = None,
+        type: _DataTypeT,
+        safe: bool = True,
+        memory_pool: MemoryPool | None = None,
+    ) -> Array[Scalar[_DataTypeT]]: ...
+    @overload
+    @staticmethod
+    def from_pandas(
+        obj: pd.Series | np.ndarray | ArrayLike,
+        *,
+        mask: Mask | None = None,
+        safe: bool = True,
+        memory_pool: MemoryPool | None = None,
+    ) -> Array[Scalar]: ...
+    @staticmethod
+    def from_pandas(*args, **kwargs):
+        """
+        Convert pandas.Series to an Arrow Array.
+
+        This method uses Pandas semantics about what values indicate
+        nulls. See pyarrow.array for more general conversion from arrays or
+        sequences to Arrow arrays.
+
+        Parameters
+        ----------
+        obj : ndarray, pandas.Series, array-like
+        mask : array (boolean), optional
+            Indicate which values are null (True) or not null (False).
+        type : pyarrow.DataType
+            Explicit type to attempt to coerce to, otherwise will be inferred
+            from the data.
+        safe : bool, default True
+            Check for overflows or other unsafe conversions.
+        memory_pool : pyarrow.MemoryPool, optional
+            If not passed, will allocate memory from the currently-set default
+            memory pool.
+
+        Notes
+        -----
+        Localized timestamps will currently be returned as UTC (pandas's native
+        representation). Timezone-naive data will be implicitly interpreted as
+        UTC.
+
+        Returns
+        -------
+        array : pyarrow.Array or pyarrow.ChunkedArray
+            ChunkedArray is returned if object data overflows binary buffer.
+        """
+    @staticmethod
+    def from_buffers(
+        type: _DataTypeT,
+        length: int,
+        buffers: list[Buffer],
+        null_count: int = -1,
+        offset: int = 0,
+        children: NullableCollection[Array[Scalar[_DataTypeT]]] | None = None,
+    ) -> Array[Scalar[_DataTypeT]]:
+        """
+        Construct an Array from a sequence of buffers.
+
+        The concrete type returned depends on the datatype.
+
+        Parameters
+        ----------
+        type : DataType
+            The value type of the array.
+        length : int
+            The number of values in the array.
+        buffers : List[Buffer]
+            The buffers backing this array.
+        null_count : int, default -1
+            The number of null entries in the array. Negative value means that
+            the null count is not known.
+        offset : int, default 0
+            The array's logical offset (in values, not in bytes) from the
+            start of each buffer.
+        children : List[Array], default None
+            Nested type children with length matching type.num_fields.
+
+        Returns
+        -------
+        array : Array
+        """
+    @property
+    def null_count(self) -> int: ...
+    @property
+    def nbytes(self) -> int:
+        """
+        Total number of bytes consumed by the elements of the array.
+
+        In other words, the sum of bytes from all buffer
+        ranges referenced.
+
+        Unlike `get_total_buffer_size` this method will account for array
+        offsets.
+
+        If buffers are shared between arrays then the shared
+        portion will be counted multiple times.
+ + The dictionary of dictionary arrays will always be counted in their + entirety even if the array only references a portion of the dictionary. + """ + def get_total_buffer_size(self) -> int: + """ + The sum of bytes in each buffer referenced by the array. + + An array may only reference a portion of a buffer. + This method will overestimate in this case and return the + byte size of the entire buffer. + + If a buffer is referenced multiple times then it will + only be counted once. + """ + def __sizeof__(self) -> int: ... + def __iter__(self) -> Iterator[_Scalar_co]: ... + def to_string( + self, + *, + indent: int = 2, + top_level_indent: int = 0, + window: int = 10, + container_window: int = 2, + skip_new_lines: bool = False, + ) -> str: + """ + Render a "pretty-printed" string representation of the Array. + + Note: for data on a non-CPU device, the full array is copied to CPU + memory. + + Parameters + ---------- + indent : int, default 2 + How much to indent the internal items in the string to + the right, by default ``2``. + top_level_indent : int, default 0 + How much to indent right the entire content of the array, + by default ``0``. + window : int + How many primitive items to preview at the begin and end + of the array when the array is bigger than the window. + The other items will be ellipsed. + container_window : int + How many container items (such as a list in a list array) + to preview at the begin and end of the array when the array + is bigger than the window. + skip_new_lines : bool + If the array should be rendered as a single line of text + or if each element should be on its own line. + """ + format = to_string + def equals(self, other: Self) -> bool: ... + def __len__(self) -> int: ... + def is_null(self, *, nan_is_null: bool = False) -> BooleanArray: + """ + Return BooleanArray indicating the null values. + + Parameters + ---------- + nan_is_null : bool (optional, default False) + Whether floating-point NaN values should also be considered null. + + Returns + ------- + array : boolean Array + """ + def is_nan(self) -> BooleanArray: + """ + Return BooleanArray indicating the NaN values. + + Returns + ------- + array : boolean Array + """ + def is_valid(self) -> BooleanArray: + """ + Return BooleanArray indicating the non-null values. + """ + def fill_null( + self: Array[Scalar[_BasicDataType[_AsPyType]]], fill_value: _AsPyType + ) -> Array[Scalar[_BasicDataType[_AsPyType]]]: + """ + See :func:`pyarrow.compute.fill_null` for usage. + + Parameters + ---------- + fill_value : any + The replacement value for null entries. + + Returns + ------- + result : Array + A new array with nulls replaced by the given value. + """ + @overload + def __getitem__(self, key: int) -> _Scalar_co: ... + @overload + def __getitem__(self, key: slice) -> Self: ... + def __getitem__(self, key): + """ + Slice or return value at given index + + Parameters + ---------- + key : integer or slice + Slices with step not equal to 1 (or None) will produce a copy + rather than a zero-copy view + + Returns + ------- + value : Scalar (index) or Array (slice) + """ + def slice(self, offset: int = 0, length: int | None = None) -> Self: + """ + Compute zero-copy slice of this array. + + Parameters + ---------- + offset : int, default 0 + Offset from start of array to slice. + length : int, default None + Length of slice (default is until end of Array starting from + offset). + + Returns + ------- + sliced : Array + An array with the same datatype, containing the sliced values. 
+ """ + def take(self, indices: Indices) -> Self: + """ + Select values from an array. + + See :func:`pyarrow.compute.take` for full usage. + + Parameters + ---------- + indices : Array or array-like + The indices in the array whose values will be returned. + + Returns + ------- + taken : Array + An array with the same datatype, containing the taken values. + """ + def drop_null(self) -> Self: + """ + Remove missing values from an array. + """ + def filter( + self, + mask: Mask, + *, + null_selection_behavior: Literal["drop", "emit_null"] = "drop", + ) -> Self: + """ + Select values from an array. + + See :func:`pyarrow.compute.filter` for full usage. + + Parameters + ---------- + mask : Array or array-like + The boolean mask to filter the array with. + null_selection_behavior : str, default "drop" + How nulls in the mask should be handled. + + Returns + ------- + filtered : Array + An array of the same type, with only the elements selected by + the boolean mask. + """ + @overload + def index( + self: Array[_ScalarT], + value: _ScalarT, + start: int | None = None, + end: int | None = None, + *, + memory_pool: MemoryPool | None = None, + ) -> scalar.Int64Scalar: ... + @overload + def index( + self: Array[Scalar[_BasicDataType[_AsPyType]]], + value: _AsPyType, + start: int | None = None, + end: int | None = None, + *, + memory_pool: MemoryPool | None = None, + ) -> scalar.Int64Scalar: ... + def index(self, *args, **kwargs): + """ + Find the first index of a value. + + See :func:`pyarrow.compute.index` for full usage. + + Parameters + ---------- + value : Scalar or object + The value to look for in the array. + start : int, optional + The start index where to look for `value`. + end : int, optional + The end index where to look for `value`. + memory_pool : MemoryPool, optional + A memory pool for potential memory allocations. + + Returns + ------- + index : Int64Scalar + The index of the value in the array (-1 if not found). + """ + def sort(self, order: Order = "ascending", **kwargs) -> Self: + """ + Sort the Array + + Parameters + ---------- + order : str, default "ascending" + Which order to sort values in. + Accepted values are "ascending", "descending". + **kwargs : dict, optional + Additional sorting options. + As allowed by :class:`SortOptions` + + Returns + ------- + result : Array + """ + def __array__(self, dtype: np.dtype | None = None, copy: bool | None = None) -> np.ndarray: ... + def to_numpy(self, zero_copy_only: bool = True, writable: bool = False) -> np.ndarray: + """ + Return a NumPy view or copy of this array. + + By default, tries to return a view of this array. This is only + supported for primitive arrays with the same memory layout as NumPy + (i.e. integers, floating point, ..) and without any nulls. + + For the extension arrays, this method simply delegates to the + underlying storage array. + + Parameters + ---------- + zero_copy_only : bool, default True + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls, or for non-primitive types). + writable : bool, default False + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. 
+
+        Returns
+        -------
+        array : numpy.ndarray
+        """
+    def to_pylist(
+        self: Array[Scalar[_BasicDataType[_AsPyType]]],
+        *,
+        maps_as_pydicts: Literal["lossy", "strict"] | None = None,
+    ) -> list[_AsPyType | None]:
+        """
+        Convert to a list of native Python objects.
+
+        Parameters
+        ----------
+        maps_as_pydicts : str, optional, default `None`
+            Valid values are `None`, 'lossy', or 'strict'.
+            The default behavior (`None`), is to convert Arrow Map arrays to
+            Python association lists (list-of-tuples) in the same order as the
+            Arrow Map, as in [(key1, value1), (key2, value2), ...].
+
+            If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts.
+
+            If 'lossy', whenever duplicate keys are detected, a warning will be printed.
+            The last seen value of a duplicate key will be in the Python dictionary.
+            If 'strict', this instead results in an exception being raised when detected.
+
+        Returns
+        -------
+        lst : list
+        """
+    tolist = to_pylist
+    def validate(self, *, full: bool = False) -> None:
+        """
+        Perform validation checks. An exception is raised if validation fails.
+
+        By default only cheap validation checks are run. Pass `full=True`
+        for thorough validation checks (potentially O(n)).
+
+        Parameters
+        ----------
+        full : bool, default False
+            If True, run expensive checks, otherwise cheap checks only.
+
+        Raises
+        ------
+        ArrowInvalid
+        """
+    @property
+    def offset(self) -> int:
+        """
+        A relative position into another array's data.
+
+        The purpose is to enable zero-copy slicing. This value defaults to zero
+        but must be applied on all operations with the physical storage
+        buffers.
+        """
+    def buffers(self) -> list[Buffer | None]:
+        """
+        Return a list of Buffer objects pointing to this array's physical
+        storage.
+
+        To correctly interpret these buffers, you need to also apply the offset
+        multiplied with the size of the stored data type.
+        """
+    def copy_to(self, destination: MemoryManager | Device) -> Self:
+        """
+        Construct a copy of the array with all buffers on destination
+        device.
+
+        This method recursively copies the array's buffers and those of its
+        children onto the destination MemoryManager device and returns the
+        new Array.
+
+        Parameters
+        ----------
+        destination : pyarrow.MemoryManager or pyarrow.Device
+            The destination device to copy the array to.
+
+        Returns
+        -------
+        Array
+        """
+    def _export_to_c(self, out_ptr: int, out_schema_ptr: int = 0) -> None:
+        """
+        Export to a C ArrowArray struct, given its pointer.
+
+        If a C ArrowSchema struct pointer is also given, the array type
+        is exported to it at the same time.
+
+        Parameters
+        ----------
+        out_ptr: int
+            The raw pointer to a C ArrowArray struct.
+        out_schema_ptr: int (optional)
+            The raw pointer to a C ArrowSchema struct.
+
+        Be careful: if you don't pass the ArrowArray struct to a consumer,
+        array memory will leak. This is a low-level function intended for
+        expert users.
+        """
+    @classmethod
+    def _import_from_c(cls, in_ptr: int, type: int | DataType) -> Self:
+        """
+        Import Array from a C ArrowArray struct, given its pointer
+        and the imported array type.
+
+        Parameters
+        ----------
+        in_ptr: int
+            The raw pointer to a C ArrowArray struct.
+        type: DataType or int
+            Either a DataType object, or the raw pointer to a C ArrowSchema
+            struct.
+
+        This is a low-level function intended for expert users.
+        """
+    def __arrow_c_array__(self, requested_schema=None) -> Any:
+        """
+        Get a pair of PyCapsules containing a C ArrowArray representation of the object.
+ + Parameters + ---------- + requested_schema : PyCapsule | None + A PyCapsule containing a C ArrowSchema representation of a requested + schema. PyArrow will attempt to cast the array to this data type. + If None, the array will be returned as-is, with a type matching the + one returned by :meth:`__arrow_c_schema__()`. + + Returns + ------- + Tuple[PyCapsule, PyCapsule] + A pair of PyCapsules containing a C ArrowSchema and ArrowArray, + respectively. + """ + @classmethod + def _import_from_c_capsule(cls, schema_capsule, array_capsule) -> Self: ... + def _export_to_c_device(self, out_ptr: int, out_schema_ptr: int = 0) -> None: + """ + Export to a C ArrowDeviceArray struct, given its pointer. + + If a C ArrowSchema struct pointer is also given, the array type + is exported to it at the same time. + + Parameters + ---------- + out_ptr: int + The raw pointer to a C ArrowDeviceArray struct. + out_schema_ptr: int (optional) + The raw pointer to a C ArrowSchema struct. + + Be careful: if you don't pass the ArrowDeviceArray struct to a consumer, + array memory will leak. This is a low-level function intended for + expert users. + """ + @classmethod + def _import_from_c_device(cls, in_ptr: int, type: DataType | int) -> Self: + """ + Import Array from a C ArrowDeviceArray struct, given its pointer + and the imported array type. + + Parameters + ---------- + in_ptr: int + The raw pointer to a C ArrowDeviceArray struct. + type: DataType or int + Either a DataType object, or the raw pointer to a C ArrowSchema + struct. + + This is a low-level function intended for expert users. + """ + + def __arrow_c_device_array__(self, requested_schema=None, **kwargs) -> Any: + """ + Get a pair of PyCapsules containing a C ArrowDeviceArray representation + of the object. + + Parameters + ---------- + requested_schema : PyCapsule | None + A PyCapsule containing a C ArrowSchema representation of a requested + schema. PyArrow will attempt to cast the array to this data type. + If None, the array will be returned as-is, with a type matching the + one returned by :meth:`__arrow_c_schema__()`. + kwargs + Currently no additional keyword arguments are supported, but + this method will accept any keyword with a value of ``None`` + for compatibility with future keywords. + + Returns + ------- + Tuple[PyCapsule, PyCapsule] + A pair of PyCapsules containing a C ArrowSchema and ArrowDeviceArray, + respectively. + """ + @classmethod + def _import_from_c_device_capsule(cls, schema_capsule, array_capsule) -> Self: ... + def __dlpack__(self, stream: int | None = None) -> Any: + """Export a primitive array as a DLPack capsule. + + Parameters + ---------- + stream : int, optional + A Python integer representing a pointer to a stream. Currently not supported. + Stream is provided by the consumer to the producer to instruct the producer + to ensure that operations can safely be performed on the array. + + Returns + ------- + capsule : PyCapsule + A DLPack capsule for the array, pointing to a DLManagedTensor. + """ + def __dlpack_device__(self) -> tuple[int, int]: + """ + Return the DLPack device tuple this arrays resides on. + + Returns + ------- + tuple : Tuple[int, int] + Tuple with index specifying the type of the device (where + CPU = 1, see cpp/src/arrow/c/dpack_abi.h) and index of the + device which is 0 by default for CPU. + """ + @property + def device_type(self) -> DeviceAllocationType: + """ + The device type where the array resides. 
+ + Returns + ------- + DeviceAllocationType + """ + + @property + def is_cpu(self) -> bool: + """ + Whether the array is CPU-accessible. + """ + @property + def statistics(self) -> ArrayStatistics | None: + """ + Statistics of the array. + """ + +class NullArray(Array[scalar.NullScalar]): ... + +class BooleanArray(Array[scalar.BooleanScalar]): + @property + def false_count(self) -> int: ... + @property + def true_count(self) -> int: ... + +class NumericArray(Array[_ScalarT]): ... +class IntegerArray(NumericArray[_ScalarT]): ... +class FloatingPointArray(NumericArray[_ScalarT]): ... +class Int8Array(IntegerArray[scalar.Int8Scalar]): ... +class UInt8Array(IntegerArray[scalar.UInt8Scalar]): ... +class Int16Array(IntegerArray[scalar.Int16Scalar]): ... +class UInt16Array(IntegerArray[scalar.UInt16Scalar]): ... +class Int32Array(IntegerArray[scalar.Int32Scalar]): ... +class UInt32Array(IntegerArray[scalar.UInt32Scalar]): ... +class Int64Array(IntegerArray[scalar.Int64Scalar]): ... +class UInt64Array(IntegerArray[scalar.UInt64Scalar]): ... +class Date32Array(NumericArray[scalar.Date32Scalar]): ... +class Date64Array(NumericArray[scalar.Date64Scalar]): ... +class TimestampArray(NumericArray[scalar.TimestampScalar[types._Unit, types._Tz]]): ... +class Time32Array(NumericArray[scalar.Time32Scalar[types._Time32Unit]]): ... +class Time64Array(NumericArray[scalar.Time64Scalar[types._Time64Unit]]): ... +class DurationArray(NumericArray[scalar.DurationScalar[types._Unit]]): ... +class MonthDayNanoIntervalArray(Array[scalar.MonthDayNanoIntervalScalar]): ... +class HalfFloatArray(FloatingPointArray[scalar.HalfFloatScalar]): ... +class FloatArray(FloatingPointArray[scalar.FloatScalar]): ... +class DoubleArray(FloatingPointArray[scalar.DoubleScalar]): ... +class FixedSizeBinaryArray(Array[scalar.FixedSizeBinaryScalar]): ... +class Decimal32Array(FixedSizeBinaryArray): ... +class Decimal64Array(FixedSizeBinaryArray): ... +class Decimal128Array(FixedSizeBinaryArray): ... +class Decimal256Array(FixedSizeBinaryArray): ... + +class BaseListArray(Array[_ScalarT]): + def flatten(self, recursive: bool = False) -> Array: ... + def value_parent_indices(self) -> Int64Array: ... + def value_lengths(self) -> Int32Array: ... + +class ListArray(BaseListArray[_ScalarT]): + @overload + @classmethod + def from_arrays( + cls, + offsets: Int32Array | list[int], + values: Array[Scalar[_DataTypeT]], + *, + type: None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> ListArray[scalar.ListScalar[_DataTypeT]]: ... + @overload + @classmethod + def from_arrays( + cls, + offsets: Int32Array | list[int], + values: list[int], + *, + type: None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> ListArray[scalar.ListScalar[types.Int64Type]]: ... + @overload + @classmethod + def from_arrays( + cls, + offsets: Int32Array | list[int], + values: list[float], + *, + type: None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> ListArray[scalar.ListScalar[types.Float64Type]]: ... + @overload + @classmethod + def from_arrays( + cls, + offsets: Int32Array | list[int], + values: list[str], + *, + type: None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> ListArray[scalar.ListScalar[types.StringType]]: ... 
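# A minimal usage sketch for the `ListArray.from_arrays` overloads above: when
# `values` is a plain Python list, its element type picks the ListArray
# parametrization (int -> Int64Type, float -> Float64Type, str -> StringType).
import pyarrow as pa

ints = pa.ListArray.from_arrays([0, 2, 4], [1, 2, 3, 4])      # ListArray[ListScalar[Int64Type]]
words = pa.ListArray.from_arrays([0, 1, 3], ["a", "b", "c"])  # ListArray[ListScalar[StringType]]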
+ @overload + @classmethod + def from_arrays( + cls, + offsets: Int32Array | list[int], + values: list[bytes], + *, + type: None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> ListArray[scalar.ListScalar[types.BinaryType]]: ... + @overload + @classmethod + def from_arrays( + cls, + offsets: Int32Array | list[int], + values: list, + *, + type: None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> ListArray: ... + @overload + @classmethod + def from_arrays( + cls, + offsets: Int32Array | list[int], + values: Array | list, + *, + type: _DataTypeT, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> ListArray[scalar.ListScalar[_DataTypeT]]: ... + @classmethod + def from_arrays(cls, *args, **kwargs): + """ + Construct ListArray from arrays of int32 offsets and values. + + Parameters + ---------- + offsets : Array (int32 type) + values : Array (any type) + type : DataType, optional + If not specified, a default ListType with the values' type is + used. + pool : MemoryPool, optional + mask : Array (boolean type), optional + Indicate which values are null (True) or not null (False). + + Returns + ------- + list_array : ListArray + + Examples + -------- + >>> import pyarrow as pa + >>> values = pa.array([1, 2, 3, 4]) + >>> offsets = pa.array([0, 2, 4]) + >>> pa.ListArray.from_arrays(offsets, values) + + [ + [ + 1, + 2 + ], + [ + 3, + 4 + ] + ] + >>> # nulls in the offsets array become null lists + >>> offsets = pa.array([0, None, 2, 4]) + >>> pa.ListArray.from_arrays(offsets, values) + + [ + [ + 1, + 2 + ], + null, + [ + 3, + 4 + ] + ] + """ + @property + def values(self) -> Array: + """ + Return the underlying array of values which backs the ListArray + ignoring the array's offset. + + If any of the list elements are null, but are backed by a + non-empty sub-list, those elements will be included in the + output. + + Compare with :meth:`flatten`, which returns only the non-null + values taking into consideration the array's offset. + + Returns + ------- + values : Array + + See Also + -------- + ListArray.flatten : ... + + Examples + -------- + + The values include null elements from sub-lists: + + >>> import pyarrow as pa + >>> array = pa.array([[1, 2], None, [3, 4, None, 6]]) + >>> array.values + + [ + 1, + 2, + 3, + 4, + null, + 6 + ] + + If an array is sliced, the slice still uses the same + underlying data as the original array, just with an + offset. Since values ignores the offset, the values are the + same: + + >>> sliced = array.slice(1, 2) + >>> sliced + + [ + null, + [ + 3, + 4, + null, + 6 + ] + ] + >>> sliced.values + + [ + 1, + 2, + 3, + 4, + null, + 6 + ] + + """ + @property + def offsets(self) -> Int32Array: + """ + Return the list offsets as an int32 array. + + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `ListArray.from_arrays` and get back the same + list array if the original one has nulls. + + Returns + ------- + offsets : Int32Array + + Examples + -------- + >>> import pyarrow as pa + >>> array = pa.array([[1, 2], None, [3, 4, 5]]) + >>> array.offsets + + [ + 0, + 2, + 2, + 5 + ] + """ + +class LargeListArray(BaseListArray[scalar.LargeListScalar[_DataTypeT]]): + @overload + @classmethod + def from_arrays( + cls, + offsets: Int64Array, + values: Array[Scalar[_DataTypeT]], + *, + type: None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> LargeListArray[_DataTypeT]: ... 
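# A minimal usage sketch for `LargeListArray.from_arrays`, assuming `pa.array` is
# annotated to return Int64Array and StringArray for these inputs so the typed
# overload above binds the element type.
import pyarrow as pa

offsets = pa.array([0, 2, 4], type=pa.int64())
values = pa.array(["a", "b", "c", "d"])
large = pa.LargeListArray.from_arrays(offsets, values)  # LargeListArray[StringType]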
+ @overload + @classmethod + def from_arrays( + cls, + offsets: Int64Array, + values: Array, + *, + type: _DataTypeT, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> LargeListArray[_DataTypeT]: ... + @classmethod + def from_arrays(cls, *args, **kwargs): + """ + Construct LargeListArray from arrays of int64 offsets and values. + + Parameters + ---------- + offsets : Array (int64 type) + values : Array (any type) + type : DataType, optional + If not specified, a default ListType with the values' type is + used. + pool : MemoryPool, optional + mask : Array (boolean type), optional + Indicate which values are null (True) or not null (False). + + Returns + ------- + list_array : LargeListArray + """ + @property + def values(self) -> Array: + """ + Return the underlying array of values which backs the LargeListArray + ignoring the array's offset. + + If any of the list elements are null, but are backed by a + non-empty sub-list, those elements will be included in the + output. + + Compare with :meth:`flatten`, which returns only the non-null + values taking into consideration the array's offset. + + Returns + ------- + values : Array + + See Also + -------- + LargeListArray.flatten : ... + + Examples + -------- + + The values include null elements from the sub-lists: + + >>> import pyarrow as pa + >>> array = pa.array( + ... [[1, 2], None, [3, 4, None, 6]], + ... type=pa.large_list(pa.int32()), + ... ) + >>> array.values + + [ + 1, + 2, + 3, + 4, + null, + 6 + ] + + If an array is sliced, the slice still uses the same + underlying data as the original array, just with an + offset. Since values ignores the offset, the values are the + same: + + >>> sliced = array.slice(1, 2) + >>> sliced + + [ + null, + [ + 3, + 4, + null, + 6 + ] + ] + >>> sliced.values + + [ + 1, + 2, + 3, + 4, + null, + 6 + ] + """ + @property + def offsets(self) -> Int64Array: + """ + Return the list offsets as an int64 array. + + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `LargeListArray.from_arrays` and get back the + same list array if the original one has nulls. + + Returns + ------- + offsets : Int64Array + """ + +class ListViewArray(BaseListArray[scalar.ListViewScalar[_DataTypeT]]): + @overload + @classmethod + def from_arrays( + cls, + offsets: Int32Array, + values: Array[Scalar[_DataTypeT]], + *, + type: None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> ListViewArray[_DataTypeT]: ... + @overload + @classmethod + def from_arrays( + cls, + offsets: Int32Array, + values: Array, + *, + type: _DataTypeT, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> ListViewArray[_DataTypeT]: ... + @classmethod + def from_arrays(cls, *args, **kwargs): + """ + Construct ListViewArray from arrays of int32 offsets, sizes, and values. + + Parameters + ---------- + offsets : Array (int32 type) + sizes : Array (int32 type) + values : Array (any type) + type : DataType, optional + If not specified, a default ListType with the values' type is + used. + pool : MemoryPool, optional + mask : Array (boolean type), optional + Indicate which values are null (True) or not null (False). 
+ + Returns + ------- + list_view_array : ListViewArray + + Examples + -------- + >>> import pyarrow as pa + >>> values = pa.array([1, 2, 3, 4]) + >>> offsets = pa.array([0, 1, 2]) + >>> sizes = pa.array([2, 2, 2]) + >>> pa.ListViewArray.from_arrays(offsets, sizes, values) + + [ + [ + 1, + 2 + ], + [ + 2, + 3 + ], + [ + 3, + 4 + ] + ] + >>> # use a null mask to represent null values + >>> mask = pa.array([False, True, False]) + >>> pa.ListViewArray.from_arrays(offsets, sizes, values, mask=mask) + + [ + [ + 1, + 2 + ], + null, + [ + 3, + 4 + ] + ] + >>> # null values can be defined in either offsets or sizes arrays + >>> # WARNING: this will result in a copy of the offsets or sizes arrays + >>> offsets = pa.array([0, None, 2]) + >>> pa.ListViewArray.from_arrays(offsets, sizes, values) + + [ + [ + 1, + 2 + ], + null, + [ + 3, + 4 + ] + ] + """ + @property + def values(self) -> Array: + """ + Return the underlying array of values which backs the ListViewArray + ignoring the array's offset and sizes. + + The values array may be out of order and/or contain additional values + that are not found in the logical representation of the array. The only + guarantee is that each non-null value in the ListView Array is contiguous. + + Compare with :meth:`flatten`, which returns only the non-null + values taking into consideration the array's order and offset. + + Returns + ------- + values : Array + + Examples + -------- + The values include null elements from sub-lists: + + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) + >>> array + + [ + [ + 1, + 2 + ], + [], + [ + 2, + null, + 3, + 4 + ] + ] + >>> array.values + + [ + 1, + 2, + null, + 3, + 4 + ] + """ + @property + def offsets(self) -> Int32Array: + """ + Return the list offsets as an int32 array. + + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `ListViewArray.from_arrays` and get back the same + list array if the original one has nulls. + + Returns + ------- + offsets : Int32Array + + Examples + -------- + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) + >>> array.offsets + + [ + 0, + 0, + 1 + ] + """ + @property + def sizes(self) -> Int32Array: + """ + Return the list sizes as an int32 array. + + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `ListViewArray.from_arrays` and get back the same + list array if the original one has nulls. + + Returns + ------- + sizes : Int32Array + + Examples + -------- + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.ListViewArray.from_arrays(offsets, sizes, values) + >>> array.sizes + + [ + 2, + 0, + 4 + ] + """ + +class LargeListViewArray(BaseListArray[scalar.LargeListScalar[_DataTypeT]]): + @overload + @classmethod + def from_arrays( + cls, + offsets: Int64Array, + values: Array[Scalar[_DataTypeT]], + *, + type: None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> LargeListViewArray[_DataTypeT]: ... + @overload + @classmethod + def from_arrays( + cls, + offsets: Int64Array, + values: Array, + *, + type: _DataTypeT, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> LargeListViewArray[_DataTypeT]: ... 
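# A minimal runtime sketch of list-view construction, mirroring the `from_arrays`
# docstrings in this file: view arrays take separate offsets and sizes, so entries
# may overlap and reuse the same values buffer.
import pyarrow as pa

values = pa.array([1, 2, 3, 4])
offsets = pa.array([0, 1, 2])
sizes = pa.array([2, 2, 2])
views = pa.ListViewArray.from_arrays(offsets, sizes, values)  # [[1, 2], [2, 3], [3, 4]]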
+ @classmethod + def from_arrays(cls, *args, **kwargs): + """ + Construct LargeListViewArray from arrays of int64 offsets and values. + + Parameters + ---------- + offsets : Array (int64 type) + sizes : Array (int64 type) + values : Array (any type) + type : DataType, optional + If not specified, a default ListType with the values' type is + used. + pool : MemoryPool, optional + mask : Array (boolean type), optional + Indicate which values are null (True) or not null (False). + + Returns + ------- + list_view_array : LargeListViewArray + + Examples + -------- + >>> import pyarrow as pa + >>> values = pa.array([1, 2, 3, 4]) + >>> offsets = pa.array([0, 1, 2]) + >>> sizes = pa.array([2, 2, 2]) + >>> pa.LargeListViewArray.from_arrays(offsets, sizes, values) + + [ + [ + 1, + 2 + ], + [ + 2, + 3 + ], + [ + 3, + 4 + ] + ] + >>> # use a null mask to represent null values + >>> mask = pa.array([False, True, False]) + >>> pa.LargeListViewArray.from_arrays(offsets, sizes, values, mask=mask) + + [ + [ + 1, + 2 + ], + null, + [ + 3, + 4 + ] + ] + >>> # null values can be defined in either offsets or sizes arrays + >>> # WARNING: this will result in a copy of the offsets or sizes arrays + >>> offsets = pa.array([0, None, 2]) + >>> pa.LargeListViewArray.from_arrays(offsets, sizes, values) + + [ + [ + 1, + 2 + ], + null, + [ + 3, + 4 + ] + ] + """ + @property + def values(self) -> Array: + """ + Return the underlying array of values which backs the LargeListArray + ignoring the array's offset. + + The values array may be out of order and/or contain additional values + that are not found in the logical representation of the array. The only + guarantee is that each non-null value in the ListView Array is contiguous. + + Compare with :meth:`flatten`, which returns only the non-null + values taking into consideration the array's order and offset. + + Returns + ------- + values : Array + + See Also + -------- + LargeListArray.flatten : ... + + Examples + -------- + + The values include null elements from sub-lists: + + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) + >>> array + + [ + [ + 1, + 2 + ], + [], + [ + 2, + null, + 3, + 4 + ] + ] + >>> array.values + + [ + 1, + 2, + null, + 3, + 4 + ] + """ + @property + def offsets(self) -> Int64Array: + """ + Return the list view offsets as an int64 array. + + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `LargeListViewArray.from_arrays` and get back the + same list array if the original one has nulls. + + Returns + ------- + offsets : Int64Array + + Examples + -------- + + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) + >>> array.offsets + + [ + 0, + 0, + 1 + ] + """ + @property + def sizes(self) -> Int64Array: + """ + Return the list view sizes as an int64 array. + + The returned array will not have a validity bitmap, so you cannot + expect to pass it to `LargeListViewArray.from_arrays` and get back the + same list array if the original one has nulls. 
+ + Returns + ------- + sizes : Int64Array + + Examples + -------- + + >>> import pyarrow as pa + >>> values = [1, 2, None, 3, 4] + >>> offsets = [0, 0, 1] + >>> sizes = [2, 0, 4] + >>> array = pa.LargeListViewArray.from_arrays(offsets, sizes, values) + >>> array.sizes + + [ + 2, + 0, + 4 + ] + """ + +class FixedSizeListArray(BaseListArray[scalar.FixedSizeListScalar[_DataTypeT, _Size]]): + @overload + @classmethod + def from_arrays( + cls, + values: Array[Scalar[_DataTypeT]], + *, + type: None = None, + mask: Mask | None = None, + ) -> FixedSizeListArray[_DataTypeT, None]: ... + @overload + @classmethod + def from_arrays( + cls, + values: Array[Scalar[_DataTypeT]], + limit_size: _Size, + *, + type: None = None, + mask: Mask | None = None, + ) -> FixedSizeListArray[_DataTypeT, _Size]: ... + @classmethod + def from_arrays(cls, *args, **kwargs): + """ + Construct FixedSizeListArray from array of values and a list length. + + Parameters + ---------- + values : Array (any type) + list_size : int + The fixed length of the lists. + type : DataType, optional + If not specified, a default ListType with the values' type and + `list_size` length is used. + mask : Array (boolean type), optional + Indicate which values are null (True) or not null (False). + + + Returns + ------- + FixedSizeListArray + + Examples + -------- + + Create from a values array and a list size: + + >>> import pyarrow as pa + >>> values = pa.array([1, 2, 3, 4]) + >>> arr = pa.FixedSizeListArray.from_arrays(values, 2) + >>> arr + + [ + [ + 1, + 2 + ], + [ + 3, + 4 + ] + ] + + Or create from a values array, list size and matching type: + + >>> typ = pa.list_(pa.field("values", pa.int64()), 2) + >>> arr = pa.FixedSizeListArray.from_arrays(values, type=typ) + >>> arr + + [ + [ + 1, + 2 + ], + [ + 3, + 4 + ] + ] + """ + @property + def values(self) -> BaseListArray[scalar.ListScalar[_DataTypeT]]: + """ + Return the underlying array of values which backs the + FixedSizeListArray. + + Note even null elements are included. + + Compare with :meth:`flatten`, which returns only the non-null + sub-list values. + + Returns + ------- + values : Array + + See Also + -------- + FixedSizeListArray.flatten : ... + + Examples + -------- + >>> import pyarrow as pa + >>> array = pa.array([[1, 2], None, [3, None]], type=pa.list_(pa.int32(), 2)) + >>> array.values + + [ + 1, + 2, + null, + null, + 3, + null + ] + + """ + +_MapKeyT = TypeVar("_MapKeyT", bound=_BasicDataType) +_MapItemT = TypeVar("_MapItemT", bound=_BasicDataType) + +class MapArray(ListArray[scalar.MapScalar[_MapKeyT, _MapItemT]]): + @overload + @classmethod + def from_arrays( + cls, + offsets: Int64Array, + keys: Array[Scalar[_MapKeyT]], + items: Array[Scalar[_MapItemT]], + *, + type: None = None, + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> MapArray[_MapKeyT, _MapItemT]: ... + @overload + @classmethod + def from_arrays( # pyright: ignore[reportIncompatibleMethodOverride] + cls, + offsets: Int64Array, + values: Array, + *, + type: MapType[_MapKeyT, _MapItemT], + pool: MemoryPool | None = None, + mask: Mask | None = None, + ) -> MapArray[_MapKeyT, _MapItemT]: ... + @classmethod + def from_arrays(cls, *args, **kwargs): # pyright: ignore[reportIncompatibleMethodOverride] + """ + Construct MapArray from arrays of int32 offsets and key, item arrays. 
+ + Parameters + ---------- + offsets : array-like or sequence (int32 type) + keys : array-like or sequence (any type) + items : array-like or sequence (any type) + type : DataType, optional + If not specified, a default MapArray with the keys' and items' type is used. + pool : MemoryPool + mask : Array (boolean type), optional + Indicate which values are null (True) or not null (False). + + Returns + ------- + map_array : MapArray + + Examples + -------- + First, let's understand the structure of our dataset when viewed in a rectangular data model. + The total of 5 respondents answered the question "How much did you like the movie x?". + The value -1 in the integer array means that the value is missing. The boolean array + represents the null bitmask corresponding to the missing values in the integer array. + + >>> import pyarrow as pa + >>> movies_rectangular = np.ma.masked_array( + ... [[10, -1, -1], [8, 4, 5], [-1, 10, 3], [-1, -1, -1], [-1, -1, -1]], + ... [ + ... [False, True, True], + ... [False, False, False], + ... [True, False, False], + ... [True, True, True], + ... [True, True, True], + ... ], + ... ) + + To represent the same data with the MapArray and from_arrays, the data is + formed like this: + + >>> offsets = [ + ... 0, # -- row 1 start + ... 1, # -- row 2 start + ... 4, # -- row 3 start + ... 6, # -- row 4 start + ... 6, # -- row 5 start + ... 6, # -- row 5 end + ... ] + >>> movies = [ + ... "Dark Knight", # ---------------------------------- row 1 + ... "Dark Knight", + ... "Meet the Parents", + ... "Superman", # -- row 2 + ... "Meet the Parents", + ... "Superman", # ----------------- row 3 + ... ] + >>> likings = [ + ... 10, # -------- row 1 + ... 8, + ... 4, + ... 5, # --- row 2 + ... 10, + ... 3, # ------ row 3 + ... ] + >>> pa.MapArray.from_arrays(offsets, movies, likings).to_pandas() + 0 [(Dark Knight, 10)] + 1 [(Dark Knight, 8), (Meet the Parents, 4), (Sup... + 2 [(Meet the Parents, 10), (Superman, 3)] + 3 [] + 4 [] + dtype: object + + If the data in the empty rows needs to be marked as missing, it's possible + to do so by modifying the offsets argument, so that we specify `None` as + the starting positions of the rows we want marked as missing. The end row + offset still has to refer to the existing value from keys (and values): + + >>> offsets = [ + ... 0, # ----- row 1 start + ... 1, # ----- row 2 start + ... 4, # ----- row 3 start + ... None, # -- row 4 start + ... None, # -- row 5 start + ... 6, # ----- row 5 end + ... ] + >>> pa.MapArray.from_arrays(offsets, movies, likings).to_pandas() + 0 [(Dark Knight, 10)] + 1 [(Dark Knight, 8), (Meet the Parents, 4), (Sup... + 2 [(Meet the Parents, 10), (Superman, 3)] + 3 None + 4 None + dtype: object + """ + @property + def keys(self) -> Array: + """Flattened array of keys across all maps in array""" + @property + def items(self) -> Array: + """Flattened array of items across all maps in array""" + +class UnionArray(Array[scalar.UnionScalar]): + @deprecated("Use fields() instead") + def child(self, pos: int) -> Field: + """ + DEPRECATED, use field() instead. + + Parameters + ---------- + pos : int + The physical index of the union child field (not its type code). + + Returns + ------- + field : pyarrow.Field + The given child field. + """ + def field(self, pos: int) -> Array: + """ + Return the given child field as an individual array. + + For sparse unions, the returned array has its offset, length, + and null count adjusted. + + For dense unions, the returned array is unchanged. 
+
+        Parameters
+        ----------
+        pos : int
+            The physical index of the union child field (not its type code).
+
+        Returns
+        -------
+        field : Array
+            The given child field.
+        """
+    @property
+    def type_codes(self) -> Int8Array:
+        """Get the type codes array."""
+    @property
+    def offsets(self) -> Int32Array:
+        """
+        Get the value offsets array (dense arrays only).
+
+        Does not account for any slice offset.
+        """
+    @staticmethod
+    def from_dense(
+        types: Int8Array,
+        value_offsets: Int32Array,
+        children: NullableCollection[Array],
+        field_names: list[str] | None = None,
+        type_codes: Int8Array | None = None,
+    ) -> UnionArray:
+        """
+        Construct dense UnionArray from arrays of int8 types, int32 offsets and
+        children arrays
+
+        Parameters
+        ----------
+        types : Array (int8 type)
+        value_offsets : Array (int32 type)
+        children : list
+        field_names : list
+        type_codes : list
+
+        Returns
+        -------
+        union_array : UnionArray
+        """
+    @staticmethod
+    def from_sparse(
+        types: Int8Array,
+        children: NullableCollection[Array],
+        field_names: list[str] | None = None,
+        type_codes: Int8Array | None = None,
+    ) -> UnionArray:
+        """
+        Construct sparse UnionArray from arrays of int8 types and children
+        arrays
+
+        Parameters
+        ----------
+        types : Array (int8 type)
+        children : list
+        field_names : list
+        type_codes : list
+
+        Returns
+        -------
+        union_array : UnionArray
+        """
+
+class StringArray(Array[scalar.StringScalar]):
+    @staticmethod
+    def from_buffers(  # type: ignore[override]
+        length: int,
+        value_offsets: Buffer,
+        data: Buffer,
+        null_bitmap: Buffer | None = None,
+        null_count: int | None = -1,
+        offset: int | None = 0,
+    ) -> StringArray:
+        """
+        Construct a StringArray from value_offsets and data buffers.
+        If there are nulls in the data, also a null_bitmap and the matching
+        null_count must be passed.
+
+        Parameters
+        ----------
+        length : int
+        value_offsets : Buffer
+        data : Buffer
+        null_bitmap : Buffer, optional
+        null_count : int, default 0
+        offset : int, default 0
+
+        Returns
+        -------
+        string_array : StringArray
+        """
+
+class LargeStringArray(Array[scalar.LargeStringScalar]):
+    @staticmethod
+    def from_buffers(  # type: ignore[override]
+        length: int,
+        value_offsets: Buffer,
+        data: Buffer,
+        null_bitmap: Buffer | None = None,
+        null_count: int | None = -1,
+        offset: int | None = 0,
+    ) -> LargeStringArray:
+        """
+        Construct a LargeStringArray from value_offsets and data buffers.
+        If there are nulls in the data, also a null_bitmap and the matching
+        null_count must be passed.
+
+        Parameters
+        ----------
+        length : int
+        value_offsets : Buffer
+        data : Buffer
+        null_bitmap : Buffer, optional
+        null_count : int, default 0
+        offset : int, default 0
+
+        Returns
+        -------
+        large_string_array : LargeStringArray
+        """
+
+class StringViewArray(Array[scalar.StringViewScalar]): ...
+
+class BinaryArray(Array[scalar.BinaryScalar]):
+    @property
+    def total_values_length(self) -> int:
+        """
+        The number of bytes from beginning to end of the data buffer addressed
+        by the offsets of this BinaryArray.
+        """
+
+class LargeBinaryArray(Array[scalar.LargeBinaryScalar]):
+    @property
+    def total_values_length(self) -> int:
+        """
+        The number of bytes from beginning to end of the data buffer addressed
+        by the offsets of this LargeBinaryArray.
+        """
+
+class BinaryViewArray(Array[scalar.BinaryViewScalar]): ...
+
+class DictionaryArray(Array[scalar.DictionaryScalar[_IndexT, _BasicValueT]]):
+    def dictionary_encode(self) -> Self: ...
# type: ignore[override] + def dictionary_decode(self) -> Array[Scalar[_BasicValueT]]: + """ + Decodes the DictionaryArray to an Array. + """ + @property + def indices(self) -> Array[Scalar[_IndexT]]: ... + @property + def dictionary(self) -> Array[Scalar[_BasicValueT]]: ... + @staticmethod + def from_buffers( # type: ignore[override] + type: _BasicValueT, + length: int, + buffers: list[Buffer], + dictionary: Array | np.ndarray | pd.Series, + null_count: int = -1, + offset: int = 0, + ) -> DictionaryArray[Any, _BasicValueT]: + """ + Construct a DictionaryArray from buffers. + + Parameters + ---------- + type : pyarrow.DataType + length : int + The number of values in the array. + buffers : List[Buffer] + The buffers backing the indices array. + dictionary : pyarrow.Array, ndarray or pandas.Series + The array of values referenced by the indices. + null_count : int, default -1 + The number of null entries in the indices array. Negative value means that + the null count is not known. + offset : int, default 0 + The array's logical offset (in values, not in bytes) from the + start of each buffer. + + Returns + ------- + dict_array : DictionaryArray + """ + @staticmethod + def from_arrays( + indices: Indices, + dictionary: Array | np.ndarray | pd.Series, + mask: np.ndarray | pd.Series | BooleanArray | None = None, + ordered: bool = False, + from_pandas: bool = False, + safe: bool = True, + memory_pool: MemoryPool | None = None, + ) -> DictionaryArray: + """ + Construct a DictionaryArray from indices and values. + + Parameters + ---------- + indices : pyarrow.Array, numpy.ndarray or pandas.Series, int type + Non-negative integers referencing the dictionary values by zero + based index. + dictionary : pyarrow.Array, ndarray or pandas.Series + The array of values referenced by the indices. + mask : ndarray or pandas.Series, bool type + True values indicate that indices are actually null. + ordered : bool, default False + Set to True if the category values are ordered. + from_pandas : bool, default False + If True, the indices should be treated as though they originated in + a pandas.Categorical (null encoded as -1). + safe : bool, default True + If True, check that the dictionary indices are in range. + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise uses default pool. + + Returns + ------- + dict_array : DictionaryArray + """ + +class StructArray(Array[scalar.StructScalar]): + def field(self, index: int | str) -> Array: + """ + Retrieves the child array belonging to field. + + Parameters + ---------- + index : Union[int, str] + Index / position or name of the field. + + Returns + ------- + result : Array + """ + def flatten(self, memory_pool: MemoryPool | None = None) -> list[Array]: + """ + Return one individual array for each field in the struct. + + Parameters + ---------- + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool. + + Returns + ------- + result : List[Array] + """ + @staticmethod + def from_arrays( + arrays: Iterable[Array], + names: list[str] | None = None, + fields: list[Field] | None = None, + mask=None, + memory_pool: MemoryPool | None = None, + type: types.StructType | None = None, + ) -> StructArray: + """ + Construct StructArray from collection of arrays representing + each field in the struct. + + Either field names, field instances or a struct type must be passed. 
+ + Parameters + ---------- + arrays : sequence of Array + names : List[str] (optional) + Field names for each struct child. + fields : List[Field] (optional) + Field instances for each struct child. + mask : pyarrow.Array[bool] (optional) + Indicate which values are null (True) or not null (False). + memory_pool : MemoryPool (optional) + For memory allocations, if required, otherwise uses default pool. + type : pyarrow.StructType (optional) + Struct type for name and type of each child. + + Returns + ------- + result : StructArray + """ + def sort(self, order: Order = "ascending", by: str | None = None, **kwargs) -> StructArray: + """ + Sort the StructArray + + Parameters + ---------- + order : str, default "ascending" + Which order to sort values in. + Accepted values are "ascending", "descending". + by : str or None, default None + If to sort the array by one of its fields + or by the whole array. + **kwargs : dict, optional + Additional sorting options. + As allowed by :class:`SortOptions` + + Returns + ------- + result : StructArray + """ + +class RunEndEncodedArray(Array[scalar.RunEndEncodedScalar[_RunEndType, _BasicValueT]]): + @overload + @staticmethod + def from_arrays( + run_ends: Int16Array, + values: Array, + type: DataType | None = None, + ) -> RunEndEncodedArray[types.Int16Type, _BasicValueT]: ... + @overload + @staticmethod + def from_arrays( + run_ends: Int32Array, + values: Array, + type: DataType | None = None, + ) -> RunEndEncodedArray[types.Int32Type, _BasicValueT]: ... + @overload + @staticmethod + def from_arrays( + run_ends: Int64Array, + values: Array, + type: DataType | None = None, + ) -> RunEndEncodedArray[types.Int64Type, _BasicValueT]: ... + @staticmethod + def from_arrays(*args, **kwargs): + """ + Construct RunEndEncodedArray from run_ends and values arrays. + + Parameters + ---------- + run_ends : Array (int16, int32, or int64 type) + The run_ends array. + values : Array (any type) + The values array. + type : pyarrow.DataType, optional + The run_end_encoded(run_end_type, value_type) array type. + + Returns + ------- + RunEndEncodedArray + """ + @staticmethod + def from_buffers( # pyright: ignore[reportIncompatibleMethodOverride] + type: DataType, + length: int, + buffers: list[Buffer], + null_count: int = -1, + offset=0, + children: tuple[Array, Array] | None = None, + ) -> RunEndEncodedArray[Any, _BasicValueT]: + """ + Construct a RunEndEncodedArray from all the parameters that make up an + Array. + + RunEndEncodedArrays do not have buffers, only children arrays, but this + implementation is needed to satisfy the Array interface. + + Parameters + ---------- + type : DataType + The run_end_encoded(run_end_type, value_type) type. + length : int + The logical length of the run-end encoded array. Expected to match + the last value of the run_ends array (children[0]) minus the offset. + buffers : List[Buffer] + Empty List or [None]. + null_count : int, default -1 + The number of null entries in the array. Run-end encoded arrays + are specified to not have valid bits and null_count always equals 0. + offset : int, default 0 + The array's logical offset (in values, not in bytes) from the + start of each buffer. + children : List[Array] + Nested type children containing the run_ends and values arrays. + + Returns + ------- + RunEndEncodedArray + """ + @property + def run_ends(self) -> Array[scalar.Scalar[_RunEndType]]: + """ + An array holding the logical indexes of each run-end. + + The physical offset to the array is applied. 
+ """ + @property + def values(self) -> Array[scalar.Scalar[_BasicValueT]]: + """ + An array holding the values of each run. + + The physical offset to the array is applied. + """ + def find_physical_offset(self) -> int: + """ + Find the physical offset of this REE array. + + This is the offset of the run that contains the value of the first + logical element of this array considering its offset. + + This function uses binary-search, so it has a O(log N) cost. + """ + def find_physical_length(self) -> int: + """ + Find the physical length of this REE array. + + The physical length of an REE is the number of physical values (and + run-ends) necessary to represent the logical range of values from offset + to length. + + This function uses binary-search, so it has a O(log N) cost. + """ + +_ArrayT = TypeVar("_ArrayT", bound=Array) + +class ExtensionArray(Array[scalar.ExtensionScalar], Generic[_ArrayT]): + @property + def storage(self) -> Any: ... + @staticmethod + def from_storage(typ: types.BaseExtensionType, storage: _ArrayT) -> ExtensionArray[_ArrayT]: + """ + Construct ExtensionArray from type and storage array. + + Parameters + ---------- + typ : DataType + The extension type for the result array. + storage : Array + The underlying storage for the result array. + + Returns + ------- + ext_array : ExtensionArray + """ + +class JsonArray(ExtensionArray[_ArrayT]): + """ + Concrete class for Arrow arrays of JSON data type. + + This does not guarantee that the JSON data actually + is valid JSON. + + Examples + -------- + Define the extension type for JSON array + + >>> import pyarrow as pa + >>> json_type = pa.json_(pa.large_utf8()) + + Create an extension array + + >>> arr = [None, '{ "id":30, "values":["a", "b"] }'] + >>> storage = pa.array(arr, pa.large_utf8()) + >>> pa.ExtensionArray.from_storage(json_type, storage) + + [ + null, + "{ "id":30, "values":["a", "b"] }" + ] + """ + +class UuidArray(ExtensionArray[_ArrayT]): ... + +class FixedShapeTensorArray(ExtensionArray[_ArrayT]): + """ + Concrete class for fixed shape tensor extension arrays. + + Examples + -------- + Define the extension type for tensor array + + >>> import pyarrow as pa + >>> tensor_type = pa.fixed_shape_tensor(pa.int32(), [2, 2]) + + Create an extension array + + >>> arr = [[1, 2, 3, 4], [10, 20, 30, 40], [100, 200, 300, 400]] + >>> storage = pa.array(arr, pa.list_(pa.int32(), 4)) + >>> pa.ExtensionArray.from_storage(tensor_type, storage) + + [ + [ + 1, + 2, + 3, + 4 + ], + [ + 10, + 20, + 30, + 40 + ], + [ + 100, + 200, + 300, + 400 + ] + ] + """ + + def to_numpy_ndarray(self) -> np.ndarray: + """ + Convert fixed shape tensor extension array to a multi-dimensional numpy.ndarray. + + The resulting ndarray will have (ndim + 1) dimensions. + The size of the first dimension will be the length of the fixed shape tensor array + and the rest of the dimensions will match the permuted shape of the fixed + shape tensor. + + The conversion is zero-copy. + + Returns + ------- + numpy.ndarray + Ndarray representing tensors in the fixed shape tensor array concatenated + along the first dimension. + """ + def to_tensor(self) -> Tensor: + """ + Convert fixed shape tensor extension array to a pyarrow.Tensor. + + The resulting Tensor will have (ndim + 1) dimensions. + The size of the first dimension will be the length of the fixed shape tensor array + and the rest of the dimensions will match the permuted shape of the fixed + shape tensor. + + The conversion is zero-copy. 
+ + Returns + ------- + pyarrow.Tensor + Tensor representing tensors in the fixed shape tensor array concatenated + along the first dimension. + """ + + @classmethod + def from_numpy_ndarray(cls, obj: np.ndarray) -> Self: + """ + Convert numpy tensors (ndarrays) to a fixed shape tensor extension array. + The first dimension of ndarray will become the length of the fixed + shape tensor array. + If input array data is not contiguous a copy will be made. + + Parameters + ---------- + obj : numpy.ndarray + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> arr = np.array([[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]], dtype=np.float32) + >>> pa.FixedShapeTensorArray.from_numpy_ndarray(arr) + + [ + [ + 1, + 2, + 3, + 4, + 5, + 6 + ], + [ + 1, + 2, + 3, + 4, + 5, + 6 + ] + ] + """ + +class OpaqueArray(ExtensionArray[_ArrayT]): + """ + Concrete class for opaque extension arrays. + + Examples + -------- + Define the extension type for an opaque array + + >>> import pyarrow as pa + >>> opaque_type = pa.opaque( + ... pa.binary(), + ... type_name="geometry", + ... vendor_name="postgis", + ... ) + + Create an extension array + + >>> arr = [None, b"data"] + >>> storage = pa.array(arr, pa.binary()) + >>> pa.ExtensionArray.from_storage(opaque_type, storage) + + [ + null, + 64617461 + ] + """ + +class Bool8Array(ExtensionArray): + """ + Concrete class for bool8 extension arrays. + + Examples + -------- + Define the extension type for an bool8 array + + >>> import pyarrow as pa + >>> bool8_type = pa.bool8() + + Create an extension array + + >>> arr = [-1, 0, 1, 2, None] + >>> storage = pa.array(arr, pa.int8()) + >>> pa.ExtensionArray.from_storage(bool8_type, storage) + + [ + -1, + 0, + 1, + 2, + null + ] + """ + + def to_numpy(self, zero_copy_only: bool = ..., writable: bool = ...) -> np.ndarray: + """ + Return a NumPy bool view or copy of this array. + + By default, tries to return a view of this array. This is only + supported for arrays without any nulls. + + Parameters + ---------- + zero_copy_only : bool, default True + If True, an exception will be raised if the conversion to a numpy + array would require copying the underlying data (e.g. in presence + of nulls). + writable : bool, default False + For numpy arrays created with zero copy (view on the Arrow data), + the resulting array is not writable (Arrow data is immutable). + By setting this to True, a copy of the array is made to ensure + it is writable. + + Returns + ------- + array : numpy.ndarray + """ + @classmethod + def from_storage(cls, storage: Int8Array) -> Self: # type: ignore[override] + """ + Construct Bool8Array from Int8Array storage. + + Parameters + ---------- + storage : Int8Array + The underlying storage for the result array. + + Returns + ------- + bool8_array : Bool8Array + """ + @classmethod + def from_numpy(cls, obj: np.ndarray) -> Self: + """ + Convert numpy array to a bool8 extension array without making a copy. + The input array must be 1-dimensional, with either bool_ or int8 dtype. + + Parameters + ---------- + obj : numpy.ndarray + + Returns + ------- + bool8_array : Bool8Array + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> arr = np.array([True, False, True], dtype=np.bool_) + >>> pa.Bool8Array.from_numpy(arr) + + [ + 1, + 0, + 1 + ] + """ + +def concat_arrays(arrays: Iterable[_ArrayT], memory_pool: MemoryPool | None = None) -> _ArrayT: + """ + Concatenate the given arrays. + + The contents of the input arrays are copied into the returned array. 
+ + Raises + ------ + ArrowInvalid + If not all of the arrays have the same type. + + Parameters + ---------- + arrays : iterable of pyarrow.Array + Arrays to concatenate, must be identically typed. + memory_pool : MemoryPool, default None + For memory allocations. If None, the default pool is used. + + Examples + -------- + >>> import pyarrow as pa + >>> arr1 = pa.array([2, 4, 5, 100]) + >>> arr2 = pa.array([2, 4]) + >>> pa.concat_arrays([arr1, arr2]) + + [ + 2, + 4, + 5, + 100, + 2, + 4 + ] + + """ + +def _empty_array(type: _DataTypeT) -> Array[scalar.Scalar[_DataTypeT]]: + """ + Create empty array of the given type. + """ + +__all__ = [ + "array", + "asarray", + "nulls", + "repeat", + "infer_type", + "_PandasConvertible", + "Array", + "NullArray", + "BooleanArray", + "NumericArray", + "IntegerArray", + "FloatingPointArray", + "Int8Array", + "UInt8Array", + "Int16Array", + "UInt16Array", + "Int32Array", + "UInt32Array", + "Int64Array", + "UInt64Array", + "Date32Array", + "Date64Array", + "TimestampArray", + "Time32Array", + "Time64Array", + "DurationArray", + "MonthDayNanoIntervalArray", + "HalfFloatArray", + "FloatArray", + "DoubleArray", + "FixedSizeBinaryArray", + "Decimal32Array", + "Decimal64Array", + "Decimal128Array", + "Decimal256Array", + "BaseListArray", + "ListArray", + "LargeListArray", + "ListViewArray", + "LargeListViewArray", + "FixedSizeListArray", + "MapArray", + "UnionArray", + "StringArray", + "LargeStringArray", + "StringViewArray", + "BinaryArray", + "LargeBinaryArray", + "BinaryViewArray", + "DictionaryArray", + "StructArray", + "RunEndEncodedArray", + "ExtensionArray", + "Bool8Array", + "UuidArray", + "JsonArray", + "OpaqueArray", + "FixedShapeTensorArray", + "concat_arrays", + "_empty_array", +] diff --git a/python/stubs/__lib_pxi/benchmark.pyi b/python/stubs/__lib_pxi/benchmark.pyi new file mode 100644 index 00000000000..66981bf0f51 --- /dev/null +++ b/python/stubs/__lib_pxi/benchmark.pyi @@ -0,0 +1 @@ +def benchmark_PandasObjectIsNull(list) -> None: ... # noqa: N802 diff --git a/python/stubs/__lib_pxi/builder.pyi b/python/stubs/__lib_pxi/builder.pyi new file mode 100644 index 00000000000..4a0e9ca4708 --- /dev/null +++ b/python/stubs/__lib_pxi/builder.pyi @@ -0,0 +1,89 @@ +from typing import Iterable + +from pyarrow.lib import MemoryPool, _Weakrefable + +from .array import StringArray, StringViewArray + +class StringBuilder(_Weakrefable): + """ + Builder class for UTF8 strings. + + This class exposes facilities for incrementally adding string values and + building the null bitmap for a pyarrow.Array (type='string'). + """ + def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... + def append(self, value: str | bytes | None): + """ + Append a single value to the builder. + + The value can either be a string/bytes object or a null value + (np.nan or None). + + Parameters + ---------- + value : string/bytes or np.nan/None + The value to append to the string array builder. + """ + def append_values(self, values: Iterable[str | bytes | None]): + """ + Append all the values from an iterable. + + Parameters + ---------- + values : iterable of string/bytes or np.nan/None values + The values to append to the string array builder. + """ + def finish(self) -> StringArray: + """ + Return result of builder as an Array object; also resets the builder. + + Returns + ------- + array : pyarrow.Array + """ + @property + def null_count(self) -> int: ... + def __len__(self) -> int: ... 
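+
+# A minimal usage sketch for the builder above (illustrative only; it assumes the
+# concrete implementation is importable as ``pyarrow.lib.StringBuilder`` at runtime):
+#
+#   from pyarrow.lib import StringBuilder
+#
+#   builder = StringBuilder()
+#   builder.append("foo")                  # single value
+#   builder.append_values(["bar", None])   # bulk append; None becomes a null
+#   arr = builder.finish()                 # -> pyarrow.StringArray; builder is reset
+#   assert arr.null_count == 1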
+ +class StringViewBuilder(_Weakrefable): + """ + Builder class for UTF8 string views. + + This class exposes facilities for incrementally adding string values and + building the null bitmap for a pyarrow.Array (type='string_view'). + """ + def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... + def append(self, value: str | bytes | None): + """ + Append a single value to the builder. + + The value can either be a string/bytes object or a null value + (np.nan or None). + + Parameters + ---------- + value : string/bytes or np.nan/None + The value to append to the string array builder. + """ + def append_values(self, values: Iterable[str | bytes | None]): + """ + Append all the values from an iterable. + + Parameters + ---------- + values : iterable of string/bytes or np.nan/None values + The values to append to the string array builder. + """ + def finish(self) -> StringViewArray: + """ + Return result of builder as an Array object; also resets the builder. + + Returns + ------- + array : pyarrow.Array + """ + @property + def null_count(self) -> int: ... + def __len__(self) -> int: ... + +__all__ = ["StringBuilder", "StringViewBuilder"] diff --git a/python/stubs/__lib_pxi/compat.pyi b/python/stubs/__lib_pxi/compat.pyi new file mode 100644 index 00000000000..ae667be453e --- /dev/null +++ b/python/stubs/__lib_pxi/compat.pyi @@ -0,0 +1,5 @@ +def encode_file_path(path: str | bytes) -> bytes: ... +def tobytes(o: str | bytes) -> bytes: ... +def frombytes(o: bytes, *, safe: bool = False): ... + +__all__ = ["encode_file_path", "tobytes", "frombytes"] diff --git a/python/stubs/__lib_pxi/config.pyi b/python/stubs/__lib_pxi/config.pyi new file mode 100644 index 00000000000..166e10c9734 --- /dev/null +++ b/python/stubs/__lib_pxi/config.pyi @@ -0,0 +1,41 @@ +from typing import NamedTuple + +class VersionInfo(NamedTuple): + major: int + minor: int + patch: int + +class BuildInfo(NamedTuple): + version: str + version_info: VersionInfo + so_version: str + full_so_version: str + compiler_id: str + compiler_version: str + compiler_flags: str + git_id: str + git_description: str + package_kind: str + build_type: str + +class RuntimeInfo(NamedTuple): + simd_level: str + detected_simd_level: str + +cpp_build_info: BuildInfo +cpp_version: str +cpp_version_info: VersionInfo + +def runtime_info() -> RuntimeInfo: ... +def set_timezone_db_path(path: str) -> None: ... + +__all__ = [ + "VersionInfo", + "BuildInfo", + "RuntimeInfo", + "cpp_build_info", + "cpp_version", + "cpp_version_info", + "runtime_info", + "set_timezone_db_path", +] diff --git a/python/stubs/__lib_pxi/device.pyi b/python/stubs/__lib_pxi/device.pyi new file mode 100644 index 00000000000..d1b9f39eedd --- /dev/null +++ b/python/stubs/__lib_pxi/device.pyi @@ -0,0 +1,88 @@ +import enum + +from pyarrow.lib import _Weakrefable + +class DeviceAllocationType(enum.Flag): + CPU = enum.auto() + CUDA = enum.auto() + CUDA_HOST = enum.auto() + OPENCL = enum.auto() + VULKAN = enum.auto() + METAL = enum.auto() + VPI = enum.auto() + ROCM = enum.auto() + ROCM_HOST = enum.auto() + EXT_DEV = enum.auto() + CUDA_MANAGED = enum.auto() + ONEAPI = enum.auto() + WEBGPU = enum.auto() + HEXAGON = enum.auto() + +class Device(_Weakrefable): + """ + Abstract interface for hardware devices + + This object represents a device with access to some memory spaces. + When handling a Buffer or raw memory address, it allows deciding in which + context the raw memory address should be interpreted + (e.g. CPU-accessible memory, or embedded memory on some particular GPU). 
+ """ + + @property + def type_name(self) -> str: + """ + A shorthand for this device's type. + """ + @property + def device_id(self) -> int: + """ + A device ID to identify this device if there are multiple of this type. + + If there is no "device_id" equivalent (such as for the main CPU device on + non-numa systems) returns -1. + """ + @property + def is_cpu(self) -> bool: + """ + Whether this device is the main CPU device. + + This shorthand method is very useful when deciding whether a memory address + is CPU-accessible. + """ + @property + def device_type(self) -> DeviceAllocationType: + """ + Return the DeviceAllocationType of this device. + """ + +class MemoryManager(_Weakrefable): + """ + An object that provides memory management primitives. + + A MemoryManager is always tied to a particular Device instance. + It can also have additional parameters (such as a MemoryPool to + allocate CPU memory). + + """ + @property + def device(self) -> Device: + """ + The device this MemoryManager is tied to. + """ + @property + def is_cpu(self) -> bool: + """ + Whether this MemoryManager is tied to the main CPU device. + + This shorthand method is very useful when deciding whether a memory + address is CPU-accessible. + """ + +def default_cpu_memory_manager() -> MemoryManager: + """ + Return the default CPU MemoryManager instance. + + The returned singleton instance uses the default MemoryPool. + """ + +__all__ = ["DeviceAllocationType", "Device", "MemoryManager", "default_cpu_memory_manager"] diff --git a/python/stubs/__lib_pxi/error.pyi b/python/stubs/__lib_pxi/error.pyi new file mode 100644 index 00000000000..981ed51e680 --- /dev/null +++ b/python/stubs/__lib_pxi/error.pyi @@ -0,0 +1,53 @@ +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self + +class ArrowException(Exception): ... +class ArrowInvalid(ValueError, ArrowException): ... +class ArrowMemoryError(MemoryError, ArrowException): ... +class ArrowKeyError(KeyError, ArrowException): ... +class ArrowTypeError(TypeError, ArrowException): ... +class ArrowNotImplementedError(NotImplementedError, ArrowException): ... +class ArrowCapacityError(ArrowException): ... +class ArrowIndexError(IndexError, ArrowException): ... +class ArrowSerializationError(ArrowException): ... + +class ArrowCancelled(ArrowException): + signum: int | None + def __init__(self, message: str, signum: int | None = None) -> None: ... + +ArrowIOError = IOError + +class StopToken: ... + +def enable_signal_handlers(enable: bool) -> None: ... + +have_signal_refcycle: bool + +class SignalStopHandler: + def __enter__(self) -> Self: ... + def __exit__(self, exc_type, exc_value, exc_tb) -> None: ... + def __dealloc__(self) -> None: ... + @property + def stop_token(self) -> StopToken: ... 
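+
+# Illustrative sketch of how the hierarchy above composes with the built-in
+# exceptions (each Arrow error also derives from the closest standard one),
+# assuming a regular ``pyarrow`` installation:
+#
+#   import pyarrow as pa
+#
+#   assert issubclass(pa.ArrowInvalid, ValueError)
+#   assert issubclass(pa.ArrowKeyError, KeyError)
+#   assert issubclass(pa.ArrowNotImplementedError, NotImplementedError)
+#
+#   try:
+#       pa.array(["not a number"]).cast(pa.int64())
+#   except pa.ArrowInvalid:   # would also be caught by a plain ``except ValueError``
+#       pass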
+ +__all__ = [ + "ArrowException", + "ArrowInvalid", + "ArrowMemoryError", + "ArrowKeyError", + "ArrowTypeError", + "ArrowNotImplementedError", + "ArrowCapacityError", + "ArrowIndexError", + "ArrowSerializationError", + "ArrowCancelled", + "ArrowIOError", + "StopToken", + "enable_signal_handlers", + "have_signal_refcycle", + "SignalStopHandler", +] diff --git a/python/stubs/__lib_pxi/io.pyi b/python/stubs/__lib_pxi/io.pyi new file mode 100644 index 00000000000..d882fd79d57 --- /dev/null +++ b/python/stubs/__lib_pxi/io.pyi @@ -0,0 +1,1474 @@ +import sys + +from collections.abc import Callable +from io import IOBase + +from _typeshed import StrPath + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias + +from typing import Any, Literal, SupportsIndex, overload + +from pyarrow._stubs_typing import Compression, SupportPyBuffer +from pyarrow.lib import MemoryPool, _Weakrefable + +from .device import Device, DeviceAllocationType, MemoryManager +from .types import KeyValueMetadata + +def have_libhdfs() -> bool: + """ + Return true if HDFS (HadoopFileSystem) library is set up correctly. + """ + +def io_thread_count() -> int: + """ + Return the number of threads to use for I/O operations. + + Many operations, such as scanning a dataset, will implicitly make + use of this pool. The number of threads is set to a fixed value at + startup. It can be modified at runtime by calling + :func:`set_io_thread_count()`. + + See Also + -------- + set_io_thread_count : Modify the size of this pool. + cpu_count : The analogous function for the CPU thread pool. + """ + +def set_io_thread_count(count: int) -> None: + """ + Set the number of threads to use for I/O operations. + + Many operations, such as scanning a dataset, will implicitly make + use of this pool. + + Parameters + ---------- + count : int + The max number of threads that may be used for I/O. + Must be positive. + + See Also + -------- + io_thread_count : Get the size of this pool. + set_cpu_count : The analogous function for the CPU thread pool. + """ + +Mode: TypeAlias = Literal["rb", "wb", "rb+", "ab"] + +class NativeFile(_Weakrefable): + """ + The base class for all Arrow streams. + + Streams are either readable, writable, or both. + They optionally support seeking. + + While this class exposes methods to read or write data from Python, the + primary intent of using a Arrow stream is to pass it to other Arrow + facilities that will make use of it, such as Arrow IPC routines. + + Be aware that there are subtle differences with regular Python files, + e.g. destroying a writable Arrow stream without closing it explicitly + will not flush any pending data. + """ + + _default_chunk_size: int + + def __enter__(self) -> Self: ... + def __exit__(self, *args) -> None: ... + @property + def mode(self) -> Mode: + """ + The file mode. Currently instances of NativeFile may support: + + * rb: binary read + * wb: binary write + * rb+: binary read and write + * ab: binary append + """ + def readable(self) -> bool: ... + def seekable(self) -> bool: ... + def isatty(self) -> bool: ... + def fileno(self) -> int: ... + @property + def closed(self) -> bool: ... + def close(self) -> None: ... 
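+
+    # Rough sketch of the lifecycle exposed by the methods above, using
+    # ``BufferReader`` as a concrete readable, seekable NativeFile:
+    #
+    #   import pyarrow as pa
+    #
+    #   with pa.BufferReader(b"abc") as f:
+    #       assert f.readable() and f.seekable()
+    #       assert not f.closed
+    #   assert f.closed            # __exit__ closes the stream
+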
+ def size(self) -> int: + """ + Return file size + """ + def metadata(self) -> KeyValueMetadata: + """ + Return file metadata + """ + def tell(self) -> int: + """ + Return current stream position + """ + def seek(self, position: int, whence: int = 0) -> int: + """ + Change current file stream position + + Parameters + ---------- + position : int + Byte offset, interpreted relative to value of whence argument + whence : int, default 0 + Point of reference for seek offset + + Notes + ----- + Values of whence: + * 0 -- start of stream (the default); offset should be zero or positive + * 1 -- current stream position; offset may be negative + * 2 -- end of stream; offset is usually negative + + Returns + ------- + int + The new absolute stream position. + """ + def flush(self) -> None: + """ + Flush the stream, if applicable. + + An error is raised if stream is not writable. + """ + def write(self, data: bytes | SupportPyBuffer) -> int: + """ + Write data to the file. + + Parameters + ---------- + data : bytes-like object or exporter of buffer protocol + + Returns + ------- + int + nbytes: number of bytes written + """ + def read(self, nbytes: int | None = None) -> bytes: + """ + Read and return up to n bytes. + + If *nbytes* is None, then the entire remaining file contents are read. + + Parameters + ---------- + nbytes : int, default None + + Returns + ------- + data : bytes + """ + def get_stream(self, file_offset: int, nbytes: int) -> Self: + """ + Return an input stream that reads a file segment independent of the + state of the file. + + Allows reading portions of a random access file as an input stream + without interfering with each other. + + Parameters + ---------- + file_offset : int + nbytes : int + + Returns + ------- + stream : NativeFile + """ + def read_at(self) -> bytes: + """ + Read indicated number of bytes at offset from the file + + Parameters + ---------- + nbytes : int + offset : int + + Returns + ------- + data : bytes + """ + def read1(self) -> bytes: + """Read and return up to n bytes. + + Unlike read(), if *nbytes* is None then a chunk is read, not the + entire file. + + Parameters + ---------- + nbytes : int, default None + The maximum number of bytes to read. + + Returns + ------- + data : bytes + """ + def readall(self) -> bytes: ... + def readinto(self, b: SupportPyBuffer) -> int: + """ + Read into the supplied buffer + + Parameters + ---------- + b : buffer-like object + A writable buffer object (such as a bytearray). + + Returns + ------- + written : int + number of bytes written + """ + + def readline(self, size: int | None = None) -> bytes: + """Read and return a line of bytes from the file. + + If size is specified, read at most size bytes. + + Line terminator is always b"\\n". + + Parameters + ---------- + size : int + maximum number of bytes read + """ + def readlines(self, hint: int | None = None) -> list[bytes]: + """Read lines of the file + + Parameters + ---------- + hint : int + maximum number of bytes read until we stop + """ + def __iter__(self) -> Self: ... + def __next__(self) -> bytes: ... + def read_buffer(self, nbytes: int | None = None) -> Buffer: + """ + Read from buffer. + + Parameters + ---------- + nbytes : int, optional + maximum number of bytes read + """ + def truncate(self) -> None: ... + def writelines(self, lines: list[bytes]): + """ + Write lines to the file. 
+ + Parameters + ---------- + lines : iterable + Iterable of bytes-like objects or exporters of buffer protocol + """ + def download(self, stream_or_path: StrPath | IOBase, buffer_size: int | None = None) -> None: + """ + Read this file completely to a local path or destination stream. + + This method first seeks to the beginning of the file. + + Parameters + ---------- + stream_or_path : str or file-like object + If a string, a local file path to write to; otherwise, + should be a writable stream. + buffer_size : int, optional + The buffer size to use for data transfers. + """ + def upload(self, stream: IOBase, buffer_size: int | None) -> None: + """ + Write from a source stream to this file. + + Parameters + ---------- + stream : file-like object + Source stream to pipe to this file. + buffer_size : int, optional + The buffer size to use for data transfers. + """ + +# ---------------------------------------------------------------------- +# Python file-like objects + +class PythonFile(NativeFile): + """ + A stream backed by a Python file object. + + This class allows using Python file objects with arbitrary Arrow + functions, including functions written in another language than Python. + + As a downside, there is a non-zero redirection cost in translating + Arrow stream calls to Python method calls. Furthermore, Python's + Global Interpreter Lock may limit parallelism in some situations. + + Examples + -------- + >>> import io + >>> import pyarrow as pa + >>> pa.PythonFile(io.BytesIO()) + + + Create a stream for writing: + + >>> buf = io.BytesIO() + >>> f = pa.PythonFile(buf, mode="w") + >>> f.writable() + True + >>> f.write(b"PythonFile") + 10 + >>> buf.getvalue() + b'PythonFile' + >>> f.close() + >>> f + + + Create a stream for reading: + + >>> buf = io.BytesIO(b"PythonFile") + >>> f = pa.PythonFile(buf, mode="r") + >>> f.mode + 'rb' + >>> f.read() + b'PythonFile' + >>> f + + >>> f.close() + >>> f + + """ + def __init__(self, handle: IOBase, mode: Literal["r", "w"] | None = None) -> None: ... + def truncate(self, pos: int | None = None) -> None: + """ + Parameters + ---------- + pos : int, optional + """ + +class MemoryMappedFile(NativeFile): + """ + A stream that represents a memory-mapped file. + + Supports 'r', 'r+', 'w' modes. + + Examples + -------- + Create a new file with memory map: + + >>> import pyarrow as pa + >>> mmap = pa.create_memory_map("example_mmap.dat", 10) + >>> mmap + + >>> mmap.close() + + Open an existing file with memory map: + + >>> with pa.memory_map("example_mmap.dat") as mmap: + ... mmap + + """ + @classmethod + def create(cls, path: str, size: int) -> Self: + """ + Create a MemoryMappedFile + + Parameters + ---------- + path : str + Where to create the file. + size : int + Size of the memory mapped file. + """ + def _open(self, path: str, mode: Literal["r", "rb", "w", "wb", "r+", "r+b", "rb+"] = "r"): ... + def resize(self, new_size: int) -> None: + """ + Resize the map and underlying file. + + Parameters + ---------- + new_size : new size in bytes + """ + +def memory_map( + path: str, mode: Literal["r", "rb", "w", "wb", "r+", "r+b", "rb+"] = "r" +) -> MemoryMappedFile: + """ + Open memory map at file path. Size of the memory map cannot change. + + Parameters + ---------- + path : str + mode : {'r', 'r+', 'w'}, default 'r' + Whether the file is opened for reading ('r'), writing ('w') + or both ('r+'). 
+ + Returns + ------- + mmap : MemoryMappedFile + + Examples + -------- + Reading from a memory map without any memory allocation or copying: + + >>> import pyarrow as pa + >>> with pa.output_stream("example_mmap.txt") as stream: + ... stream.write(b"Constructing a buffer referencing the mapped memory") + 51 + >>> with pa.memory_map("example_mmap.txt") as mmap: + ... mmap.read_at(6, 45) + b'memory' + """ + +create_memory_map = MemoryMappedFile.create + +class OSFile(NativeFile): + """ + A stream backed by a regular file descriptor. + + Examples + -------- + Create a new file to write to: + + >>> import pyarrow as pa + >>> with pa.OSFile("example_osfile.arrow", mode="w") as f: + ... f.writable() + ... f.write(b"OSFile") + ... f.seekable() + True + 6 + False + + Open the file to read: + + >>> with pa.OSFile("example_osfile.arrow", mode="r") as f: + ... f.mode + ... f.read() + 'rb' + b'OSFile' + + Open the file to append: + + >>> with pa.OSFile("example_osfile.arrow", mode="ab") as f: + ... f.mode + ... f.write(b" is super!") + 'ab' + 10 + >>> with pa.OSFile("example_osfile.arrow") as f: + ... f.read() + b'OSFile is super!' + + Inspect created OSFile: + + >>> pa.OSFile("example_osfile.arrow") + + """ + def __init__( + self, + path: str, + mode: Literal["r", "rb", "w", "wb", "a", "ab"], + memory_pool: MemoryPool | None = None, + ) -> None: ... + +class FixedSizeBufferWriter(NativeFile): + """ + A stream writing to a Arrow buffer. + + Examples + -------- + Create a stream to write to ``pyarrow.Buffer``: + + >>> import pyarrow as pa + >>> buf = pa.allocate_buffer(5) + >>> with pa.output_stream(buf) as stream: + ... stream.write(b"abcde") + ... stream + 5 + + + Inspect the buffer: + + >>> buf.to_pybytes() + b'abcde' + >>> buf + + """ + def __init__(self, buffer: Buffer) -> None: ... + def set_memcopy_threads(self, num_threads: int) -> None: ... + def set_memcopy_blocksize(self, blocksize: int) -> None: ... + def set_memcopy_threshold(self, threshold: int) -> None: ... + +# ---------------------------------------------------------------------- +# Arrow buffers + +class Buffer(_Weakrefable): + """ + The base class for all Arrow buffers. + + A buffer represents a contiguous memory area. Many buffers will own + their memory, though not all of them do. + """ + def __len__(self) -> int: ... + def _assert_cpu(self) -> None: ... + @property + def size(self) -> int: + """ + The buffer size in bytes. + """ + @property + def address(self) -> int: + """ + The buffer's address, as an integer. + + The returned address may point to CPU or device memory. + Use `is_cpu()` to disambiguate. + """ + def hex(self) -> bytes: + """ + Compute hexadecimal representation of the buffer. + + Returns + ------- + : bytes + """ + @property + def is_mutable(self) -> bool: + """ + Whether the buffer is mutable. + """ + @property + def is_cpu(self) -> bool: + """ + Whether the buffer is CPU-accessible. + """ + @property + def device(self) -> Device: + """ + The device where the buffer resides. + + Returns + ------- + Device + """ + @property + def memory_manager(self) -> MemoryManager: + """ + The memory manager associated with the buffer. + + Returns + ------- + MemoryManager + """ + @property + def device_type(self) -> DeviceAllocationType: + """ + The device type where the buffer resides. + + Returns + ------- + DeviceAllocationType + """ + @property + def parent(self) -> Buffer | None: ... + @overload + def __getitem__(self, key: slice) -> Self: ... + @overload + def __getitem__(self, key: int) -> int: ... 
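+
+    # Sketch of the indexing behaviour declared by the overloads above: an integer
+    # key yields a single byte value, while a slice yields another Buffer viewing
+    # the same memory (no copy). Assumes a regular ``pyarrow`` installation:
+    #
+    #   import pyarrow as pa
+    #
+    #   buf = pa.py_buffer(b"abcdef")
+    #   assert buf[0] == ord("a")                 # int key -> int
+    #   assert buf[1:4].to_pybytes() == b"bcd"    # slice key -> Buffer (zero-copy view)
+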
+ def slice(self, offset: int = 0, length: int | None = None) -> Self: + """ + Slice this buffer. Memory is not copied. + + You can also use the Python slice notation ``buffer[start:stop]``. + + Parameters + ---------- + offset : int, default 0 + Offset from start of buffer to slice. + length : int, default None + Length of slice (default is until end of Buffer starting from + offset). + + Returns + ------- + sliced : Buffer + A logical view over this buffer. + """ + def equals(self, other: Self) -> bool: + """ + Determine if two buffers contain exactly the same data. + + Parameters + ---------- + other : Buffer + + Returns + ------- + are_equal : bool + True if buffer contents and size are equal + """ + def __reduce_ex__(self, protocol: SupportsIndex) -> str | tuple[Any, ...]: ... + def to_pybytes(self) -> bytes: + """ + Return this buffer as a Python bytes object. Memory is copied. + """ + def __buffer__(self, flags: int, /) -> memoryview: ... + +class ResizableBuffer(Buffer): + """ + A base class for buffers that can be resized. + """ + + def resize(self, new_size: int, shrink_to_fit: bool = False) -> None: + """ + Resize buffer to indicated size. + + Parameters + ---------- + new_size : int + New size of buffer (padding may be added internally). + shrink_to_fit : bool, default False + If this is true, the buffer is shrunk when new_size is less + than the current size. + If this is false, the buffer is never shrunk. + """ + +@overload +def allocate_buffer(size: int, memory_pool: MemoryPool | None = None) -> Buffer: ... +@overload +def allocate_buffer( + size: int, memory_pool: MemoryPool | None, resizable: Literal[False] +) -> Buffer: ... +@overload +def allocate_buffer( + size: int, memory_pool: MemoryPool | None, resizable: Literal[True] +) -> ResizableBuffer: ... +def allocate_buffer(*args, **kwargs): + """ + Allocate a mutable buffer. + + Parameters + ---------- + size : int + Number of bytes to allocate (plus internal padding) + memory_pool : MemoryPool, optional + The pool to allocate memory from. + If not given, the default memory pool is used. + resizable : bool, default False + If true, the returned buffer is resizable. + + Returns + ------- + buffer : Buffer or ResizableBuffer + """ + +# ---------------------------------------------------------------------- +# Arrow Stream +class BufferOutputStream(NativeFile): + """ + An output stream that writes to a resizable buffer. + + The buffer is produced as a result when ``getvalue()`` is called. + + Examples + -------- + Create an output stream, write data to it and finalize it with + ``getvalue()``: + + >>> import pyarrow as pa + >>> f = pa.BufferOutputStream() + >>> f.write(b"pyarrow.Buffer") + 14 + >>> f.closed + False + >>> f.getvalue() + + >>> f.closed + True + """ + def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... + def getvalue(self) -> Buffer: + """ + Finalize output stream and return result as pyarrow.Buffer. + + Returns + ------- + value : Buffer + """ + +class MockOutputStream(NativeFile): ... + +class BufferReader(NativeFile): + """ + Zero-copy reader from objects convertible to Arrow buffer. + + Parameters + ---------- + obj : Python bytes or pyarrow.Buffer + + Examples + -------- + Create an Arrow input stream and inspect it: + + >>> import pyarrow as pa + >>> data = b"reader data" + >>> buf = memoryview(data) + >>> with pa.input_stream(buf) as stream: + ... stream.size() + ... stream.read(6) + ... stream.seek(7) + ... 
stream.read(15) + 11 + b'reader' + 7 + b'data' + """ + def __init__(self, obj) -> None: ... + +class CompressedInputStream(NativeFile): + """ + An input stream wrapper which decompresses data on the fly. + + Parameters + ---------- + stream : string, path, pyarrow.NativeFile, or file-like object + Input stream object to wrap with the compression. + compression : str + The compression type ("bz2", "brotli", "gzip", "lz4" or "zstd"). + + Examples + -------- + Create an output stream which compresses the data: + + >>> import pyarrow as pa + >>> data = b"Compressed stream" + >>> raw = pa.BufferOutputStream() + >>> with pa.CompressedOutputStream(raw, "gzip") as compressed: + ... compressed.write(data) + 17 + + Create an input stream with decompression referencing the + buffer with compressed data: + + >>> cdata = raw.getvalue() + >>> with pa.input_stream(cdata, compression="gzip") as compressed: + ... compressed.read() + b'Compressed stream' + + which actually translates to the use of ``BufferReader``and + ``CompressedInputStream``: + + >>> raw = pa.BufferReader(cdata) + >>> with pa.CompressedInputStream(raw, "gzip") as compressed: + ... compressed.read() + b'Compressed stream' + """ + + def __init__( + self, + stream: StrPath | NativeFile | IOBase, + compression: Literal["bz2", "brotli", "gzip", "lz4", "zstd"], + ) -> None: ... + +class CompressedOutputStream(NativeFile): + """ + An output stream wrapper which compresses data on the fly. + + Parameters + ---------- + stream : string, path, pyarrow.NativeFile, or file-like object + Input stream object to wrap with the compression. + compression : str + The compression type ("bz2", "brotli", "gzip", "lz4" or "zstd"). + + Examples + -------- + Create an output stream which compresses the data: + + >>> import pyarrow as pa + >>> data = b"Compressed stream" + >>> raw = pa.BufferOutputStream() + >>> with pa.CompressedOutputStream(raw, "gzip") as compressed: + ... compressed.write(data) + 17 + """ + def __init__( + self, + stream: StrPath | NativeFile | IOBase, + compression: Literal["bz2", "brotli", "gzip", "lz4", "zstd"], + ) -> None: ... + +class BufferedInputStream(NativeFile): + """ + An input stream that performs buffered reads from + an unbuffered input stream, which can mitigate the overhead + of many small reads in some cases. + + Parameters + ---------- + stream : NativeFile + The input stream to wrap with the buffer + buffer_size : int + Size of the temporary read buffer. + memory_pool : MemoryPool + The memory pool used to allocate the buffer. + """ + def __init__( + self, stream: NativeFile, buffer_size: int, memory_pool: MemoryPool | None = None + ) -> None: ... + def detach(self) -> NativeFile: + """ + Release the raw InputStream. + Further operations on this stream are invalid. + + Returns + ------- + raw : NativeFile + The underlying raw input stream + """ + +class BufferedOutputStream(NativeFile): + """ + An output stream that performs buffered reads from + an unbuffered output stream, which can mitigate the overhead + of many small writes in some cases. + + Parameters + ---------- + stream : NativeFile + The writable output stream to wrap with the buffer + buffer_size : int + Size of the buffer that should be added. + memory_pool : MemoryPool + The memory pool used to allocate the buffer. + """ + def __init__( + self, stream: NativeFile, buffer_size: int, memory_pool: MemoryPool | None = None + ) -> None: ... + def detach(self) -> NativeFile: + """ + Flush any buffered writes and release the raw OutputStream. 
+ Further operations on this stream are invalid. + + Returns + ------- + raw : NativeFile + The underlying raw output stream. + """ + +class TransformInputStream(NativeFile): + """ + Transform an input stream. + + Parameters + ---------- + stream : NativeFile + The stream to transform. + transform_func : callable + The transformation to apply. + """ + def __init__(self, stream: NativeFile, transform_func: Callable[[Buffer], Any]) -> None: ... + +class Transcoder: + def __init__(self, decoder, encoder) -> None: ... + def __call__(self, buf: Buffer): ... + +def transcoding_input_stream( + stream: NativeFile, src_encoding: str, dest_encoding: str +) -> TransformInputStream: + """ + Add a transcoding transformation to the stream. + Incoming data will be decoded according to ``src_encoding`` and + then re-encoded according to ``dest_encoding``. + + Parameters + ---------- + stream : NativeFile + The stream to which the transformation should be applied. + src_encoding : str + The codec to use when reading data. + dest_encoding : str + The codec to use for emitted data. + """ + +def py_buffer(obj: SupportPyBuffer) -> Buffer: + """ + Construct an Arrow buffer from a Python bytes-like or buffer-like object + + Parameters + ---------- + obj : object + the object from which the buffer should be constructed. + """ + +def foreign_buffer(address: int, size: int, base: Any | None = None) -> Buffer: + """ + Construct an Arrow buffer with the given *address* and *size*. + + The buffer will be optionally backed by the Python *base* object, if given. + The *base* object will be kept alive as long as this buffer is alive, + including across language boundaries (for example if the buffer is + referenced by C++ code). + + Parameters + ---------- + address : int + The starting address of the buffer. The address can + refer to both device or host memory but it must be + accessible from device after mapping it with + `get_device_address` method. + size : int + The size of device buffer in bytes. + base : {None, object} + Object that owns the referenced memory. + """ + +def as_buffer(o: Buffer | SupportPyBuffer) -> Buffer: ... + +# --------------------------------------------------------------------- + +class CacheOptions(_Weakrefable): + """ + Cache options for a pre-buffered fragment scan. + + Parameters + ---------- + hole_size_limit : int, default 8KiB + The maximum distance in bytes between two consecutive ranges; beyond + this value, ranges are not combined. + range_size_limit : int, default 32MiB + The maximum size in bytes of a combined range; if combining two + consecutive ranges would produce a range of a size greater than this, + they are not combined + lazy : bool, default True + lazy = false: request all byte ranges when PreBuffer or WillNeed is called. + lazy = True, prefetch_limit = 0: request merged byte ranges only after the reader + needs them. + lazy = True, prefetch_limit = k: prefetch up to k merged byte ranges ahead of the + range that is currently being read. + prefetch_limit : int, default 0 + The maximum number of ranges to be prefetched. This is only used for + lazy cache to asynchronously read some ranges after reading the target + range. + """ + + hole_size_limit: int + range_size_limit: int + lazy: bool + prefetch_limit: int + def __init__( + self, + *, + hole_size_limit: int | None = None, + range_size_limit: int | None = None, + lazy: bool = True, + prefetch_limit: int = 0, + ) -> None: ... 
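+
+    # Rough construction sketch (the numbers are arbitrary placeholders, not
+    # recommendations); such an object is typically passed to a pre-buffered scan,
+    # e.g. via ``pyarrow.dataset.ParquetFragmentScanOptions(cache_options=...)``:
+    #
+    #   import pyarrow as pa
+    #
+    #   opts = pa.CacheOptions(
+    #       hole_size_limit=4 * 1024,           # merge ranges closer than 4 KiB
+    #       range_size_limit=16 * 1024 * 1024,  # but never beyond 16 MiB per request
+    #       lazy=True,
+    #       prefetch_limit=2,                   # read ahead up to two merged ranges
+    #   )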
+ @classmethod + def from_network_metrics( + cls, + time_to_first_byte_millis: int, + transfer_bandwidth_mib_per_sec: int, + ideal_bandwidth_utilization_frac: float = 0.9, + max_ideal_request_size_mib: int = 64, + ) -> Self: + """ + Create suitable CacheOptions based on provided network metrics. + + Typically this will be used with object storage solutions like Amazon S3, + Google Cloud Storage and Azure Blob Storage. + + Parameters + ---------- + time_to_first_byte_millis : int + Seek-time or Time-To-First-Byte (TTFB) in milliseconds, also called call + setup latency of a new read request. The value is a positive integer. + transfer_bandwidth_mib_per_sec : int + Data transfer Bandwidth (BW) in MiB/sec (per connection). The value is a positive + integer. + ideal_bandwidth_utilization_frac : int, default 0.9 + Transfer bandwidth utilization fraction (per connection) to maximize the net + data load. The value is a positive float less than 1. + max_ideal_request_size_mib : int, default 64 + The maximum single data request size (in MiB) to maximize the net data load. + + Returns + ------- + CacheOptions + """ + +class Codec(_Weakrefable): + """ + Compression codec. + + Parameters + ---------- + compression : str + Type of compression codec to initialize, valid values are: 'gzip', + 'bz2', 'brotli', 'lz4' (or 'lz4_frame'), 'lz4_raw', 'zstd' and + 'snappy'. + compression_level : int, None + Optional parameter specifying how aggressively to compress. The + possible ranges and effect of this parameter depend on the specific + codec chosen. Higher values compress more but typically use more + resources (CPU/RAM). Some codecs support negative values. + + gzip + The compression_level maps to the memlevel parameter of + deflateInit2. Higher levels use more RAM but are faster + and should have higher compression ratios. + + bz2 + The compression level maps to the blockSize100k parameter of + the BZ2_bzCompressInit function. Higher levels use more RAM + but are faster and should have higher compression ratios. + + brotli + The compression level maps to the BROTLI_PARAM_QUALITY + parameter. Higher values are slower and should have higher + compression ratios. + + lz4/lz4_frame/lz4_raw + The compression level parameter is not supported and must + be None + + zstd + The compression level maps to the compressionLevel parameter + of ZSTD_initCStream. Negative values are supported. Higher + values are slower and should have higher compression ratios. + + snappy + The compression level parameter is not supported and must + be None + + + Raises + ------ + ValueError + If invalid compression value is passed. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.Codec.is_available("gzip") + True + >>> codec = pa.Codec("gzip") + >>> codec.name + 'gzip' + >>> codec.compression_level + 9 + """ + def __init__(self, compression: Compression, compression_level: int | None = None) -> None: ... + @classmethod + def detect(cls, path: StrPath) -> Self: + """ + Detect and instantiate compression codec based on file extension. + + Parameters + ---------- + path : str, path-like + File-path to detect compression from. + + Raises + ------ + TypeError + If the passed value is not path-like. + ValueError + If the compression can't be detected from the path. + + Returns + ------- + Codec + """ + @staticmethod + def is_available(compression: Compression) -> bool: + """ + Returns whether the compression support has been built and enabled. 
+ + Parameters + ---------- + compression : str + Type of compression codec, + refer to Codec docstring for a list of supported ones. + + Returns + ------- + bool + """ + @staticmethod + def supports_compression_level(compression: Compression) -> int: + """ + Returns true if the compression level parameter is supported + for the given codec. + + Parameters + ---------- + compression : str + Type of compression codec, + refer to Codec docstring for a list of supported ones. + """ + @staticmethod + def default_compression_level(compression: Compression) -> int: + """ + Returns the compression level that Arrow will use for the codec if + None is specified. + + Parameters + ---------- + compression : str + Type of compression codec, + refer to Codec docstring for a list of supported ones. + """ + @staticmethod + def minimum_compression_level(compression: Compression) -> int: + """ + Returns the smallest valid value for the compression level + + Parameters + ---------- + compression : str + Type of compression codec, + refer to Codec docstring for a list of supported ones. + """ + @staticmethod + def maximum_compression_level(compression: Compression) -> int: + """ + Returns the largest valid value for the compression level + + Parameters + ---------- + compression : str + Type of compression codec, + refer to Codec docstring for a list of supported ones. + """ + @property + def name(self) -> Compression: + """Returns the name of the codec""" + @property + def compression_level(self) -> int: + """Returns the compression level parameter of the codec""" + @overload + def compress( + self, + buf: Buffer | bytes | SupportPyBuffer, + *, + memory_pool: MemoryPool | None = None, + ) -> Buffer: ... + @overload + def compress( + self, + buf: Buffer | bytes | SupportPyBuffer, + *, + asbytes: Literal[False], + memory_pool: MemoryPool | None = None, + ) -> Buffer: ... + @overload + def compress( + self, + buf: Buffer | bytes | SupportPyBuffer, + *, + asbytes: Literal[True], + memory_pool: MemoryPool | None = None, + ) -> bytes: ... + def compress(self, *args, **kwargs): + """ + Compress data from buffer-like object. + + Parameters + ---------- + buf : pyarrow.Buffer, bytes, or other object supporting buffer protocol + asbytes : bool, default False + Return result as Python bytes object, otherwise Buffer + memory_pool : MemoryPool, default None + Memory pool to use for buffer allocations, if any + + Returns + ------- + compressed : pyarrow.Buffer or bytes (if asbytes=True) + """ + @overload + def decompress( + self, + buf: Buffer | bytes | SupportPyBuffer, + decompressed_size: int | None = None, + *, + memory_pool: MemoryPool | None = None, + ) -> Buffer: ... + @overload + def decompress( + self, + buf: Buffer | bytes | SupportPyBuffer, + decompressed_size: int | None = None, + *, + asbytes: Literal[False], + memory_pool: MemoryPool | None = None, + ) -> Buffer: ... + @overload + def decompress( + self, + buf: Buffer | bytes | SupportPyBuffer, + decompressed_size: int | None = None, + *, + asbytes: Literal[True], + memory_pool: MemoryPool | None = None, + ) -> bytes: ... + def decompress(self, *args, **kwargs): + """ + Decompress data from buffer-like object. + + Parameters + ---------- + buf : pyarrow.Buffer, bytes, or memoryview-compatible object + decompressed_size : int, default None + Size of the decompressed result + asbytes : boolean, default False + Return result as Python bytes object, otherwise Buffer + memory_pool : MemoryPool, default None + Memory pool to use for buffer allocations, if any. 
+ + Returns + ------- + uncompressed : pyarrow.Buffer or bytes (if asbytes=True) + """ + +@overload +def compress( + buf: Buffer | bytes | SupportPyBuffer, + codec: Compression = "lz4", + *, + memory_pool: MemoryPool | None = None, +) -> Buffer: ... +@overload +def compress( + buf: Buffer | bytes | SupportPyBuffer, + codec: Compression = "lz4", + *, + asbytes: Literal[False], + memory_pool: MemoryPool | None = None, +) -> Buffer: ... +@overload +def compress( + buf: Buffer | bytes | SupportPyBuffer, + codec: Compression = "lz4", + *, + asbytes: Literal[True], + memory_pool: MemoryPool | None = None, +) -> bytes: ... +def compress(*args, **kwargs): + """ + Compress data from buffer-like object. + + Parameters + ---------- + buf : pyarrow.Buffer, bytes, or other object supporting buffer protocol + codec : str, default 'lz4' + Compression codec. + Supported types: {'brotli, 'gzip', 'lz4', 'lz4_raw', 'snappy', 'zstd'} + asbytes : bool, default False + Return result as Python bytes object, otherwise Buffer. + memory_pool : MemoryPool, default None + Memory pool to use for buffer allocations, if any. + + Returns + ------- + compressed : pyarrow.Buffer or bytes (if asbytes=True) + """ + +@overload +def decompress( + buf: Buffer | bytes | SupportPyBuffer, + decompressed_size: int | None = None, + codec: Compression = "lz4", + *, + memory_pool: MemoryPool | None = None, +) -> Buffer: ... +@overload +def decompress( + buf: Buffer | bytes | SupportPyBuffer, + decompressed_size: int | None = None, + codec: Compression = "lz4", + *, + asbytes: Literal[False], + memory_pool: MemoryPool | None = None, +) -> Buffer: ... +@overload +def decompress( + buf: Buffer | bytes | SupportPyBuffer, + decompressed_size: int | None = None, + codec: Compression = "lz4", + *, + asbytes: Literal[True], + memory_pool: MemoryPool | None = None, +) -> bytes: ... +def decompress(*args, **kwargs): + """ + Decompress data from buffer-like object. + + Parameters + ---------- + buf : pyarrow.Buffer, bytes, or memoryview-compatible object + Input object to decompress data from. + decompressed_size : int, default None + Size of the decompressed result + codec : str, default 'lz4' + Compression codec. + Supported types: {'brotli, 'gzip', 'lz4', 'lz4_raw', 'snappy', 'zstd'} + asbytes : bool, default False + Return result as Python bytes object, otherwise Buffer. + memory_pool : MemoryPool, default None + Memory pool to use for buffer allocations, if any. + + Returns + ------- + uncompressed : pyarrow.Buffer or bytes (if asbytes=True) + """ + +def input_stream( + source: StrPath | Buffer | IOBase, + compression: Literal["detect", "bz2", "brotli", "gzip", "lz4", "zstd"] = "detect", + buffer_size: int | None = None, +) -> BufferReader: + """ + Create an Arrow input stream. + + Parameters + ---------- + source : str, Path, buffer, or file-like object + The source to open for reading. + compression : str optional, default 'detect' + The compression algorithm to use for on-the-fly decompression. + If "detect" and source is a file path, then compression will be + chosen based on the file extension. + If None, no compression will be applied. + Otherwise, a well-known algorithm name must be supplied (e.g. "gzip"). + buffer_size : int, default None + If None or 0, no buffering will happen. Otherwise the size of the + temporary read buffer. 
+ + Examples + -------- + Create a readable BufferReader (NativeFile) from a Buffer or a memoryview object: + + >>> import pyarrow as pa + >>> buf = memoryview(b"some data") + >>> with pa.input_stream(buf) as stream: + ... stream.read(4) + b'some' + + Create a readable OSFile (NativeFile) from a string or file path: + + >>> import gzip + >>> with gzip.open("example.gz", "wb") as f: + ... f.write(b"some data") + 9 + >>> with pa.input_stream("example.gz") as stream: + ... stream.read() + b'some data' + + Create a readable PythonFile (NativeFile) from a a Python file object: + + >>> with open("example.txt", mode="w") as f: + ... f.write("some text") + 9 + >>> with pa.input_stream("example.txt") as stream: + ... stream.read(6) + b'some t' + """ + +def output_stream( + source: StrPath | Buffer | IOBase, + compression: Literal["detect", "bz2", "brotli", "gzip", "lz4", "zstd"] = "detect", + buffer_size: int | None = None, +) -> NativeFile: + """ + Create an Arrow output stream. + + Parameters + ---------- + source : str, Path, buffer, file-like object + The source to open for writing. + compression : str optional, default 'detect' + The compression algorithm to use for on-the-fly compression. + If "detect" and source is a file path, then compression will be + chosen based on the file extension. + If None, no compression will be applied. + Otherwise, a well-known algorithm name must be supplied (e.g. "gzip"). + buffer_size : int, default None + If None or 0, no buffering will happen. Otherwise the size of the + temporary write buffer. + + Examples + -------- + Create a writable NativeFile from a pyarrow Buffer: + + >>> import pyarrow as pa + >>> data = b"buffer data" + >>> empty_obj = bytearray(11) + >>> buf = pa.py_buffer(empty_obj) + >>> with pa.output_stream(buf) as stream: + ... stream.write(data) + 11 + >>> with pa.input_stream(buf) as stream: + ... stream.read(6) + b'buffer' + + or from a memoryview object: + + >>> buf = memoryview(empty_obj) + >>> with pa.output_stream(buf) as stream: + ... stream.write(data) + 11 + >>> with pa.input_stream(buf) as stream: + ... stream.read() + b'buffer data' + + Create a writable NativeFile from a string or file path: + + >>> with pa.output_stream("example_second.txt") as stream: + ... stream.write(b"Write some data") + 15 + >>> with pa.input_stream("example_second.txt") as stream: + ... 
stream.read() + b'Write some data' + """ + +__all__ = [ + "have_libhdfs", + "io_thread_count", + "set_io_thread_count", + "NativeFile", + "PythonFile", + "MemoryMappedFile", + "memory_map", + "create_memory_map", + "OSFile", + "FixedSizeBufferWriter", + "Buffer", + "ResizableBuffer", + "allocate_buffer", + "BufferOutputStream", + "MockOutputStream", + "BufferReader", + "CompressedInputStream", + "CompressedOutputStream", + "BufferedInputStream", + "BufferedOutputStream", + "TransformInputStream", + "Transcoder", + "transcoding_input_stream", + "py_buffer", + "foreign_buffer", + "as_buffer", + "CacheOptions", + "Codec", + "compress", + "decompress", + "input_stream", + "output_stream", +] diff --git a/python/stubs/__lib_pxi/ipc.pyi b/python/stubs/__lib_pxi/ipc.pyi new file mode 100644 index 00000000000..3d72892061e --- /dev/null +++ b/python/stubs/__lib_pxi/ipc.pyi @@ -0,0 +1,705 @@ +import enum +import sys + +from io import IOBase + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from typing import Iterable, Iterator, Literal, Mapping, NamedTuple + +import pandas as pd + +from pyarrow._stubs_typing import SupportArrowStream, SupportPyBuffer +from pyarrow.lib import MemoryPool, RecordBatch, Schema, Table, Tensor, _Weakrefable + +from .io import Buffer, Codec, NativeFile +from .types import DictionaryMemo, KeyValueMetadata + +class MetadataVersion(enum.IntEnum): + V1 = enum.auto() + V2 = enum.auto() + V3 = enum.auto() + V4 = enum.auto() + V5 = enum.auto() + +class WriteStats(NamedTuple): + """IPC write statistics + + Parameters + ---------- + num_messages : int + Number of messages. + num_record_batches : int + Number of record batches. + num_dictionary_batches : int + Number of dictionary batches. + num_dictionary_deltas : int + Delta of dictionaries. + num_replaced_dictionaries : int + Number of replaced dictionaries. + """ + + num_messages: int + num_record_batches: int + num_dictionary_batches: int + num_dictionary_deltas: int + num_replaced_dictionaries: int + +class ReadStats(NamedTuple): + """IPC read statistics + + Parameters + ---------- + num_messages : int + Number of messages. + num_record_batches : int + Number of record batches. + num_dictionary_batches : int + Number of dictionary batches. + num_dictionary_deltas : int + Delta of dictionaries. + num_replaced_dictionaries : int + Number of replaced dictionaries. + """ + + num_messages: int + num_record_batches: int + num_dictionary_batches: int + num_dictionary_deltas: int + num_replaced_dictionaries: int + +class IpcReadOptions(_Weakrefable): + """ + Serialization options for reading IPC format. + + Parameters + ---------- + ensure_native_endian : bool, default True + Whether to convert incoming data to platform-native endianness. + use_threads : bool + Whether to use the global CPU thread pool to parallelize any + computational tasks like decompression + included_fields : list + If empty (the default), return all deserialized fields. + If non-empty, the values are the indices of fields to read on + the top-level schema + """ + + ensure_native_endian: bool + use_threads: bool + included_fields: list[int] + def __init__( + self, + *, + ensure_native_endian: bool = True, + use_threads: bool = True, + included_fields: list[int] | None = None, + ) -> None: ... + +class IpcWriteOptions(_Weakrefable): + """ + Serialization options for the IPC format. + + Parameters + ---------- + metadata_version : MetadataVersion, default MetadataVersion.V5 + The metadata version to write. 
V5 is the current and latest, + V4 is the pre-1.0 metadata version (with incompatible Union layout). + allow_64bit : bool, default False + If true, allow field lengths that don't fit in a signed 32-bit int. + use_legacy_format : bool, default False + Whether to use the pre-Arrow 0.15 IPC format. + compression : str, Codec, or None + compression codec to use for record batch buffers. + If None then batch buffers will be uncompressed. + Must be "lz4", "zstd" or None. + To specify a compression_level use `pyarrow.Codec` + use_threads : bool + Whether to use the global CPU thread pool to parallelize any + computational tasks like compression. + emit_dictionary_deltas : bool + Whether to emit dictionary deltas. Default is false for maximum + stream compatibility. + unify_dictionaries : bool + If true then calls to write_table will attempt to unify dictionaries + across all batches in the table. This can help avoid the need for + replacement dictionaries (which the file format does not support) + but requires computing the unified dictionary and then remapping + the indices arrays. + + This parameter is ignored when writing to the IPC stream format as + the IPC stream format can support replacement dictionaries. + """ + + metadata_version: MetadataVersion + allow_64bit: bool + use_legacy_format: bool + compression: Codec | Literal["lz4", "zstd"] | None + use_threads: bool + emit_dictionary_deltas: bool + unify_dictionaries: bool + def __init__( + self, + *, + metadata_version: MetadataVersion = MetadataVersion.V5, + allow_64bit: bool = False, + use_legacy_format: bool = False, + compression: Codec | Literal["lz4", "zstd"] | None = None, + use_threads: bool = True, + emit_dictionary_deltas: bool = False, + unify_dictionaries: bool = False, + ) -> None: ... + +class Message(_Weakrefable): + """ + Container for an Arrow IPC message with metadata and optional body + """ + + @property + def type(self) -> str: ... + @property + def metadata(self) -> Buffer: ... + @property + def metadata_version(self) -> MetadataVersion: ... + @property + def body(self) -> Buffer | None: ... + def equals(self, other: Message) -> bool: ... + def serialize_to( + self, sink: NativeFile, alignment: int = 8, memory_pool: MemoryPool | None = None + ): + """ + Write message to generic OutputStream + + Parameters + ---------- + sink : NativeFile + alignment : int, default 8 + Byte alignment for metadata and body + memory_pool : MemoryPool, default None + Uses default memory pool if not specified + """ + def serialize(self, alignment: int = 8, memory_pool: MemoryPool | None = None) -> Buffer: + """ + Write message as encapsulated IPC message + + Parameters + ---------- + alignment : int, default 8 + Byte alignment for metadata and body + memory_pool : MemoryPool, default None + Uses default memory pool if not specified + + Returns + ------- + serialized : Buffer + """ + +class MessageReader(_Weakrefable): + """ + Interface for reading Message objects from some source (like an + InputStream) + """ + @classmethod + def open_stream(cls, source: bytes | NativeFile | IOBase | SupportPyBuffer) -> Self: + """ + Open stream from source, if you want to use memory map use + MemoryMappedFile as source. + + Parameters + ---------- + source : bytes/buffer-like, pyarrow.NativeFile, or file-like Python object + A readable source, like an InputStream + """ + def __iter__(self) -> Self: ... + def read_next_message(self) -> Message: + """ + Read next Message from the stream. 
+ + Raises + ------ + StopIteration + At end of stream + """ + __next__ = read_next_message + +# ---------------------------------------------------------------------- +# File and stream readers and writers + +class _CRecordBatchWriter(_Weakrefable): + """The base RecordBatchWriter wrapper. + + Provides common implementations of convenience methods. Should not + be instantiated directly by user code. + """ + def write(self, table_or_batch: Table | RecordBatch): + """ + Write RecordBatch or Table to stream. + + Parameters + ---------- + table_or_batch : {RecordBatch, Table} + """ + def write_batch( + self, + batch: RecordBatch, + custom_metadata: Mapping[bytes, bytes] | KeyValueMetadata | None = None, + ): + """ + Write RecordBatch to stream. + + Parameters + ---------- + batch : RecordBatch + custom_metadata : mapping or KeyValueMetadata + Keys and values must be string-like / coercible to bytes + """ + def write_table(self, table: Table, max_chunksize: int | None = None) -> None: + """ + Write Table to stream in (contiguous) RecordBatch objects. + + Parameters + ---------- + table : Table + max_chunksize : int, default None + Maximum number of rows for RecordBatch chunks. Individual chunks may + be smaller depending on the chunk layout of individual columns. + """ + def close(self) -> None: + """ + Close stream and write end-of-stream 0 marker. + """ + def __enter__(self) -> Self: ... + def __exit__(self, exc_type, exc_val, exc_tb): ... + @property + def stats(self) -> WriteStats: + """ + Current IPC write statistics. + """ + +class _RecordBatchStreamWriter(_CRecordBatchWriter): + def __dealloc__(self) -> None: ... + def _open(self, sink, schema: Schema, options: IpcWriteOptions = IpcWriteOptions()): ... + +class _ReadPandasMixin: + def read_pandas(self, **options) -> pd.DataFrame: + """ + Read contents of stream to a pandas.DataFrame. + + Read all record batches as a pyarrow.Table then convert it to a + pandas.DataFrame using Table.to_pandas. + + Parameters + ---------- + **options + Arguments to forward to :meth:`Table.to_pandas`. + + Returns + ------- + df : pandas.DataFrame + """ + +class RecordBatchReader(_Weakrefable): + """Base class for reading stream of record batches. + + Record batch readers function as iterators of record batches that also + provide the schema (without the need to get any batches). + + Warnings + -------- + Do not call this class's constructor directly, use one of the + ``RecordBatchReader.from_*`` functions instead. + + Notes + ----- + To import and export using the Arrow C stream interface, use the + ``_import_from_c`` and ``_export_to_c`` methods. However, keep in mind this + interface is intended for expert users. + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([("x", pa.int64())]) + >>> def iter_record_batches(): + ... for i in range(2): + ... yield pa.RecordBatch.from_arrays([pa.array([1, 2, 3])], schema=schema) + >>> reader = pa.RecordBatchReader.from_batches(schema, iter_record_batches()) + >>> print(reader.schema) + x: int64 + >>> for batch in reader: + ... print(batch) + pyarrow.RecordBatch + x: int64 + ---- + x: [1,2,3] + pyarrow.RecordBatch + x: int64 + ---- + x: [1,2,3] + """ + + def __iter__(self) -> Self: ... + def read_next_batch(self) -> RecordBatch: + """ + Read next RecordBatch from the stream. + + Raises + ------ + StopIteration: + At end of stream. 
+ + Returns + ------- + RecordBatch + """ + __next__ = read_next_batch + @property + def schema(self) -> Schema: + """ + Shared schema of the record batches in the stream. + + Returns + ------- + Schema + """ + def read_next_batch_with_custom_metadata(self) -> RecordBatchWithMetadata: + """ + Read next RecordBatch from the stream along with its custom metadata. + + Raises + ------ + StopIteration: + At end of stream. + + Returns + ------- + batch : RecordBatch + custom_metadata : KeyValueMetadata + """ + def iter_batches_with_custom_metadata( + self, + ) -> Iterator[RecordBatchWithMetadata]: + """ + Iterate over record batches from the stream along with their custom + metadata. + + Yields + ------ + RecordBatchWithMetadata + """ + def read_all(self) -> Table: + """ + Read all record batches as a pyarrow.Table. + + Returns + ------- + Table + """ + read_pandas = _ReadPandasMixin.read_pandas # pyright: ignore[reportUnknownMemberType,reportUnknownVariableType] + def close(self) -> None: + """ + Release any resources associated with the reader. + """ + def __enter__(self) -> Self: ... + def __exit__(self, exc_type, exc_val, exc_tb): ... + def cast(self, target_schema: Schema) -> Self: + """ + Wrap this reader with one that casts each batch lazily as it is pulled. + Currently only a safe cast to target_schema is implemented. + + Parameters + ---------- + target_schema : Schema + Schema to cast to, the names and order of fields must match. + + Returns + ------- + RecordBatchReader + """ + def _export_to_c(self, out_ptr: int) -> None: + """ + Export to a C ArrowArrayStream struct, given its pointer. + + Parameters + ---------- + out_ptr: int + The raw pointer to a C ArrowArrayStream struct. + + Be careful: if you don't pass the ArrowArrayStream struct to a + consumer, array memory will leak. This is a low-level function + intended for expert users. + """ + @classmethod + def _import_from_c(cls, in_ptr: int) -> Self: + """ + Import RecordBatchReader from a C ArrowArrayStream struct, + given its pointer. + + Parameters + ---------- + in_ptr: int + The raw pointer to a C ArrowArrayStream struct. + + This is a low-level function intended for expert users. + """ + def __arrow_c_stream__(self, requested_schema=None): + """ + Export to a C ArrowArrayStream PyCapsule. + + Parameters + ---------- + requested_schema : PyCapsule, default None + The schema to which the stream should be casted, passed as a + PyCapsule containing a C ArrowSchema representation of the + requested schema. + + Returns + ------- + PyCapsule + A capsule containing a C ArrowArrayStream struct. + """ + @classmethod + def _import_from_c_capsule(cls, stream) -> Self: + """ + Import RecordBatchReader from a C ArrowArrayStream PyCapsule. + + Parameters + ---------- + stream: PyCapsule + A capsule containing a C ArrowArrayStream PyCapsule. + + Returns + ------- + RecordBatchReader + """ + @classmethod + def from_stream(cls, data: SupportArrowStream, schema: Schema | None = None) -> Self: + """ + Create RecordBatchReader from a Arrow-compatible stream object. + + This accepts objects implementing the Arrow PyCapsule Protocol for + streams, i.e. objects that have a ``__arrow_c_stream__`` method. + + Parameters + ---------- + data : Arrow-compatible stream object + Any object that implements the Arrow PyCapsule Protocol for + streams. + schema : Schema, default None + The schema to which the stream should be casted, if supported + by the stream object. 
+ + Returns + ------- + RecordBatchReader + """ + @classmethod + def from_batches(cls, schema: Schema, batches: Iterable[RecordBatch]) -> Self: + """ + Create RecordBatchReader from an iterable of batches. + + Parameters + ---------- + schema : Schema + The shared schema of the record batches + batches : Iterable[RecordBatch] + The batches that this reader will return. + + Returns + ------- + reader : RecordBatchReader + """ + +class _RecordBatchStreamReader(RecordBatchReader): + @property + def stats(self) -> ReadStats: + """ + Current IPC read statistics. + """ + +class _RecordBatchFileWriter(_RecordBatchStreamWriter): ... + +class RecordBatchWithMetadata(NamedTuple): + """RecordBatch with its custom metadata + + Parameters + ---------- + batch : RecordBatch + custom_metadata : KeyValueMetadata + """ + + batch: RecordBatch + custom_metadata: KeyValueMetadata + +class _RecordBatchFileReader(_Weakrefable): + @property + def num_record_batches(self) -> int: + """ + The number of record batches in the IPC file. + """ + def get_batch(self, i: int) -> RecordBatch: + """ + Read the record batch with the given index. + + Parameters + ---------- + i : int + The index of the record batch in the IPC file. + + Returns + ------- + batch : RecordBatch + """ + get_record_batch = get_batch + def get_batch_with_custom_metadata(self, i: int) -> RecordBatchWithMetadata: + """ + Read the record batch with the given index along with + its custom metadata + + Parameters + ---------- + i : int + The index of the record batch in the IPC file. + + Returns + ------- + batch : RecordBatch + custom_metadata : KeyValueMetadata + """ + def read_all(self) -> Table: + """ + Read all record batches as a pyarrow.Table + """ + read_pandas = _ReadPandasMixin.read_pandas # pyright: ignore[reportUnknownMemberType,reportUnknownVariableType] + def __enter__(self) -> Self: ... + def __exit__(self, exc_type, exc_val, exc_tb): ... + @property + def schema(self) -> Schema: ... + @property + def stats(self) -> ReadStats: ... + +def get_tensor_size(tensor: Tensor) -> int: + """ + Return total size of serialized Tensor including metadata and padding. + + Parameters + ---------- + tensor : Tensor + The tensor for which we want to known the size. + """ + +def get_record_batch_size(batch: RecordBatch) -> int: + """ + Return total size of serialized RecordBatch including metadata and padding. + + Parameters + ---------- + batch : RecordBatch + The recordbatch for which we want to know the size. + """ + +def write_tensor(tensor: Tensor, dest: NativeFile) -> int: + """ + Write pyarrow.Tensor to pyarrow.NativeFile object its current position. + + Parameters + ---------- + tensor : pyarrow.Tensor + dest : pyarrow.NativeFile + + Returns + ------- + bytes_written : int + Total number of bytes written to the file + """ + +def read_tensor(source: NativeFile) -> Tensor: + """Read pyarrow.Tensor from pyarrow.NativeFile object from current + position. If the file source supports zero copy (e.g. a memory map), then + this operation does not allocate any memory. 
This function not assume that + the stream is aligned + + Parameters + ---------- + source : pyarrow.NativeFile + + Returns + ------- + tensor : Tensor + + """ + +def read_message(source: NativeFile | IOBase | SupportPyBuffer) -> Message: + """ + Read length-prefixed message from file or buffer-like object + + Parameters + ---------- + source : pyarrow.NativeFile, file-like object, or buffer-like object + + Returns + ------- + message : Message + """ + +def read_schema(obj: Buffer | Message, dictionary_memo: DictionaryMemo | None = None) -> Schema: + """ + Read Schema from message or buffer + + Parameters + ---------- + obj : buffer or Message + dictionary_memo : DictionaryMemo, optional + Needed to be able to reconstruct dictionary-encoded fields + with read_record_batch + + Returns + ------- + schema : Schema + """ + +def read_record_batch( + obj: Message | SupportPyBuffer, schema: Schema, dictionary_memo: DictionaryMemo | None = None +) -> RecordBatch: + """ + Read RecordBatch from message, given a known schema. If reading data from a + complete IPC stream, use ipc.open_stream instead + + Parameters + ---------- + obj : Message or Buffer-like + schema : Schema + dictionary_memo : DictionaryMemo, optional + If message contains dictionaries, must pass a populated + DictionaryMemo + + Returns + ------- + batch : RecordBatch + """ + +__all__ = [ + "MetadataVersion", + "WriteStats", + "ReadStats", + "IpcReadOptions", + "IpcWriteOptions", + "Message", + "MessageReader", + "_CRecordBatchWriter", + "_RecordBatchStreamWriter", + "_ReadPandasMixin", + "RecordBatchReader", + "_RecordBatchStreamReader", + "_RecordBatchFileWriter", + "RecordBatchWithMetadata", + "_RecordBatchFileReader", + "get_tensor_size", + "get_record_batch_size", + "write_tensor", + "read_tensor", + "read_message", + "read_schema", + "read_record_batch", +] diff --git a/python/stubs/__lib_pxi/memory.pyi b/python/stubs/__lib_pxi/memory.pyi new file mode 100644 index 00000000000..57a3bb4f1b3 --- /dev/null +++ b/python/stubs/__lib_pxi/memory.pyi @@ -0,0 +1,174 @@ +from pyarrow.lib import _Weakrefable + +class MemoryPool(_Weakrefable): + """ + Base class for memory allocation. + + Besides tracking its number of allocated bytes, a memory pool also + takes care of the required 64-byte alignment for Arrow data. + """ + + def release_unused(self) -> None: + """ + Attempt to return to the OS any memory being held onto by the pool. + + This function should not be called except potentially for + benchmarking or debugging as it could be expensive and detrimental to + performance. + + This is best effort and may not have any effect on some memory pools + or in some situations (e.g. fragmentation). + """ + def bytes_allocated(self) -> int: + """ + Return the number of bytes that are currently allocated from this + memory pool. + """ + def total_bytes_allocated(self) -> int: + """ + Return the total number of bytes that have been allocated from this + memory pool. + """ + def max_memory(self) -> int | None: + """ + Return the peak memory allocation in this memory pool. + This can be an approximate number in multi-threaded applications. + + None is returned if the pool implementation doesn't know how to + compute this number. + """ + def num_allocations(self) -> int: + """ + Return the number of allocations or reallocations that were made + using this memory pool. + """ + def print_stats(self) -> None: + """ + Print statistics about this memory pool. + + The output format is implementation-specific. 
Not all memory pools + implement this method. + """ + @property + def backend_name(self) -> str: + """ + The name of the backend used by this MemoryPool (e.g. "jemalloc"). + """ + +class LoggingMemoryPool(MemoryPool): ... +class ProxyMemoryPool(MemoryPool): ... + +def default_memory_pool() -> MemoryPool: + """ + Return the process-global memory pool. + + Examples + -------- + >>> default_memory_pool() + + """ + +def proxy_memory_pool(parent: MemoryPool) -> ProxyMemoryPool: + """ + Create and return a MemoryPool instance that redirects to the + *parent*, but with separate allocation statistics. + + Parameters + ---------- + parent : MemoryPool + The real memory pool that should be used for allocations. + """ + +def logging_memory_pool(parent: MemoryPool) -> LoggingMemoryPool: + """ + Create and return a MemoryPool instance that redirects to the + *parent*, but also dumps allocation logs on stderr. + + Parameters + ---------- + parent : MemoryPool + The real memory pool that should be used for allocations. + """ + +def system_memory_pool() -> MemoryPool: + """ + Return a memory pool based on the C malloc heap. + """ + +def jemalloc_memory_pool() -> MemoryPool: + """ + Return a memory pool based on the jemalloc heap. + + NotImplementedError is raised if jemalloc support is not enabled. + """ + +def mimalloc_memory_pool() -> MemoryPool: + """ + Return a memory pool based on the mimalloc heap. + + NotImplementedError is raised if mimalloc support is not enabled. + """ + +def set_memory_pool(pool: MemoryPool) -> None: + """ + Set the default memory pool. + + Parameters + ---------- + pool : MemoryPool + The memory pool that should be used by default. + """ + +def log_memory_allocations(enable: bool = True) -> None: + """ + Enable or disable memory allocator logging for debugging purposes + + Parameters + ---------- + enable : bool, default True + Pass False to disable logging + """ + +def total_allocated_bytes() -> int: + """ + Return the currently allocated bytes from the default memory pool. + Other memory pools may not be accounted for. + """ + +def jemalloc_set_decay_ms(decay_ms: int) -> None: + """ + Set arenas.dirty_decay_ms and arenas.muzzy_decay_ms to indicated number of + milliseconds. A value of 0 (the default) results in dirty / muzzy memory + pages being released right away to the OS, while a higher value will result + in a time-based decay. See the jemalloc docs for more information + + It's best to set this at the start of your application. + + Parameters + ---------- + decay_ms : int + Number of milliseconds to set for jemalloc decay conf parameters. 
Note + that this change will only affect future memory arenas + """ + +def supported_memory_backends() -> list[str]: + """ + Return a list of available memory pool backends + """ + +__all__ = [ + "MemoryPool", + "LoggingMemoryPool", + "ProxyMemoryPool", + "default_memory_pool", + "proxy_memory_pool", + "logging_memory_pool", + "system_memory_pool", + "jemalloc_memory_pool", + "mimalloc_memory_pool", + "set_memory_pool", + "log_memory_allocations", + "total_allocated_bytes", + "jemalloc_set_decay_ms", + "supported_memory_backends", +] diff --git a/python/stubs/__lib_pxi/pandas_shim.pyi b/python/stubs/__lib_pxi/pandas_shim.pyi new file mode 100644 index 00000000000..0e80fae4ebf --- /dev/null +++ b/python/stubs/__lib_pxi/pandas_shim.pyi @@ -0,0 +1,51 @@ +from types import ModuleType +from typing import Any, Iterable, TypeGuard + +import pandas as pd + +from numpy import dtype +from pandas.core.dtypes.base import ExtensionDtype + +class _PandasAPIShim: + has_sparse: bool + + def series(self, *args, **kwargs) -> pd.Series: ... + def data_frame(self, *args, **kwargs) -> pd.DataFrame: ... + @property + def have_pandas(self) -> bool: ... + @property + def compat(self) -> ModuleType: ... + @property + def pd(self) -> ModuleType: ... + def infer_dtype(self, obj: Iterable) -> str: ... + def pandas_dtype(self, dtype: str) -> dtype: ... + @property + def loose_version(self) -> Any: ... + @property + def version(self) -> str: ... + def is_v1(self) -> bool: ... + def is_ge_v21(self) -> bool: ... + def is_ge_v23(self) -> bool: ... + def is_ge_v3(self) -> bool: ... + @property + def categorical_type(self) -> type[pd.Categorical]: ... + @property + def datetimetz_type(self) -> type[pd.DatetimeTZDtype]: ... + @property + def extension_dtype(self) -> type[ExtensionDtype]: ... + def is_array_like( + self, obj: Any + ) -> TypeGuard[pd.Series | pd.Index | pd.Categorical | ExtensionDtype]: ... + def is_categorical(self, obj: Any) -> TypeGuard[pd.Categorical]: ... + def is_datetimetz(self, obj: Any) -> TypeGuard[pd.DatetimeTZDtype]: ... + def is_extension_array_dtype(self, obj: Any) -> TypeGuard[ExtensionDtype]: ... + def is_sparse(self, obj: Any) -> bool: ... + def is_data_frame(self, obj: Any) -> TypeGuard[pd.DataFrame]: ... + def is_series(self, obj: Any) -> TypeGuard[pd.Series]: ... + def is_index(self, obj: Any) -> TypeGuard[pd.Index]: ... + def get_values(self, obj: Any) -> bool: ... + def get_rangeindex_attribute(self, level, name): ... + +_pandas_api: _PandasAPIShim + +__all__ = ["_PandasAPIShim", "_pandas_api"] diff --git a/python/stubs/__lib_pxi/scalar.pyi b/python/stubs/__lib_pxi/scalar.pyi new file mode 100644 index 00000000000..81ab5012067 --- /dev/null +++ b/python/stubs/__lib_pxi/scalar.pyi @@ -0,0 +1,1017 @@ +import collections.abc +import datetime as dt +import sys + +from decimal import Decimal + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias +from typing import Any, Generic, Iterator, Literal, Mapping, overload + +import numpy as np + +from pyarrow._compute import CastOptions +from pyarrow.lib import Array, Buffer, MemoryPool, MonthDayNano, Tensor, _Weakrefable +from typing_extensions import Protocol, TypeVar + +from . 
import types +from .types import ( + _AsPyType, + _DataTypeT, + _Time32Unit, + _Time64Unit, + _Tz, + _Unit, +) + +_AsPyTypeK = TypeVar("_AsPyTypeK") +_AsPyTypeV = TypeVar("_AsPyTypeV") +_DataType_co = TypeVar("_DataType_co", bound=types.DataType, covariant=True) + +class Scalar(_Weakrefable, Generic[_DataType_co]): + """ + The base class for scalars. + """ + @property + def type(self) -> _DataType_co: + """ + Data type of the Scalar object. + """ + @property + def is_valid(self) -> bool: + """ + Holds a valid (non-null) value. + """ + @overload + def cast( + self, + target_type: None, + safe: bool = True, + options: CastOptions | None = None, + memory_pool: MemoryPool | None = None, + ) -> Self: ... + @overload + def cast( + self, + target_type: _DataTypeT, + safe: bool = True, + options: CastOptions | None = None, + memory_pool: MemoryPool | None = None, + ) -> Scalar[_DataTypeT]: ... + def cast(self, *args, **kwargs): + """ + Cast scalar value to another data type. + + See :func:`pyarrow.compute.cast` for usage. + + Parameters + ---------- + target_type : DataType, default None + Type to cast scalar to. + safe : boolean, default True + Whether to check for conversion errors such as overflow. + options : CastOptions, default None + Additional checks pass by CastOptions + memory_pool : MemoryPool, optional + memory pool to use for allocations during function execution. + + Returns + ------- + scalar : A Scalar of the given target data type. + """ + def validate(self, *, full: bool = False) -> None: + """ + Perform validation checks. An exception is raised if validation fails. + + By default only cheap validation checks are run. Pass `full=True` + for thorough validation checks (potentially O(n)). + + Parameters + ---------- + full : bool, default False + If True, run expensive checks, otherwise cheap checks only. + + Raises + ------ + ArrowInvalid + """ + def equals(self, other: Scalar) -> bool: ... + def __hash__(self) -> int: ... + @overload + def as_py( + self: Scalar[types._BasicDataType[_AsPyType]], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, + ) -> _AsPyType: ... + @overload + def as_py( + self: Scalar[types.ListType[types._BasicDataType[_AsPyType]]], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, + ) -> list[_AsPyType]: ... + @overload + def as_py( + self: Scalar[ + types.ListType[ + types.DictionaryType[types._IndexT, types._BasicDataType[_AsPyTypeV], Any] + ] + ], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, + ) -> list[dict[int, _AsPyTypeV]]: ... + @overload + def as_py( + self: Scalar[ + types.ListType[types.DictionaryType[Any, types._BasicDataType[_AsPyTypeV], Any]], + ], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, + ) -> list[dict[Any, _AsPyTypeV]]: ... + @overload + def as_py( + self: Scalar[types.ListType[types.DictionaryType[types._IndexT, Any, Any]],], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, + ) -> list[dict[int, Any]]: ... + @overload + def as_py( + self: Scalar[types.StructType], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, + ) -> list[dict[str, Any]]: ... + @overload + def as_py( + self: Scalar[ + types.MapType[types._BasicDataType[_AsPyTypeK], types._BasicDataType[_AsPyTypeV]] + ], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, + ) -> list[tuple[_AsPyTypeK, _AsPyTypeV]]: ... 
+ @overload + def as_py( + self: Scalar[types.MapType[Any, types._BasicDataType[_AsPyTypeV]]], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, + ) -> list[tuple[Any, _AsPyTypeV]]: ... + @overload + def as_py( + self: Scalar[types.MapType[types._BasicDataType[_AsPyTypeK], Any]], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, + ) -> list[tuple[_AsPyTypeK, Any]]: ... + @overload + def as_py( + self: Scalar[Any], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, + ) -> Any: ... + def as_py(self, *args, **kwargs): + """ + Return this value as a Python representation. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + + If 'lossy', whenever duplicate keys are detected, a warning will be printed. + The last seen value of a duplicate key will be in the Python dictionary. + If 'strict', this instead results in an exception being raised when detected. + """ + +_NULL: TypeAlias = None +NA = _NULL + +class NullScalar(Scalar[types.NullType]): ... +class BooleanScalar(Scalar[types.BoolType]): ... +class UInt8Scalar(Scalar[types.UInt8Type]): ... +class Int8Scalar(Scalar[types.Int8Type]): ... +class UInt16Scalar(Scalar[types.UInt16Type]): ... +class Int16Scalar(Scalar[types.Int16Type]): ... +class UInt32Scalar(Scalar[types.Uint32Type]): ... +class Int32Scalar(Scalar[types.Int32Type]): ... +class UInt64Scalar(Scalar[types.UInt64Type]): ... +class Int64Scalar(Scalar[types.Int64Type]): ... +class HalfFloatScalar(Scalar[types.Float16Type]): ... +class FloatScalar(Scalar[types.Float32Type]): ... +class DoubleScalar(Scalar[types.Float64Type]): ... +class Decimal32Scalar(Scalar[types.Decimal32Type[types._Precision, types._Scale]]): ... +class Decimal64Scalar(Scalar[types.Decimal64Type[types._Precision, types._Scale]]): ... +class Decimal128Scalar(Scalar[types.Decimal128Type[types._Precision, types._Scale]]): ... +class Decimal256Scalar(Scalar[types.Decimal256Type[types._Precision, types._Scale]]): ... +class Date32Scalar(Scalar[types.Date32Type]): ... + +class Date64Scalar(Scalar[types.Date64Type]): + @property + def value(self) -> dt.date | None: ... + +class Time32Scalar(Scalar[types.Time32Type[_Time32Unit]]): + @property + def value(self) -> dt.time | None: ... + +class Time64Scalar(Scalar[types.Time64Type[_Time64Unit]]): + @property + def value(self) -> dt.time | None: ... + +class TimestampScalar(Scalar[types.TimestampType[_Unit, _Tz]]): + @property + def value(self) -> int | None: ... + +class DurationScalar(Scalar[types.DurationType[_Unit]]): + @property + def value(self) -> dt.timedelta | None: ... + +class MonthDayNanoIntervalScalar(Scalar[types.MonthDayNanoIntervalType]): + @property + def value(self) -> MonthDayNano | None: ... + +class BinaryScalar(Scalar[types.BinaryType]): + def as_buffer(self) -> Buffer: ... + +class LargeBinaryScalar(Scalar[types.LargeBinaryType]): + def as_buffer(self) -> Buffer: ... + +class FixedSizeBinaryScalar(Scalar[types.FixedSizeBinaryType]): + def as_buffer(self) -> Buffer: ... + +class StringScalar(Scalar[types.StringType]): + def as_buffer(self) -> Buffer: ... + +class LargeStringScalar(Scalar[types.LargeStringType]): + def as_buffer(self) -> Buffer: ... 
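# Illustrative usage sketch (not part of the patch): the runtime behaviour that the
# scalar classes stubbed above are meant to describe for type checkers. Names and
# values here are assumptions for illustration; it presumes a reasonably recent
# pyarrow (the `maps_as_pydicts` keyword only exists in newer releases).
import pyarrow as pa

s = pa.scalar("hello")                  # concrete class at runtime: pyarrow.StringScalar
assert isinstance(s, pa.StringScalar)
assert s.is_valid and s.type == pa.string()
assert s.as_py() == "hello"             # the as_py() overloads narrow this to `str`

buf = pa.scalar(b"raw").as_buffer()     # BinaryScalar exposes its payload as a Buffer
assert buf.to_pybytes() == b"raw"

f = pa.scalar(7).cast(pa.float64())     # cast() re-types the scalar, here to a DoubleScalar
assert f.as_py() == 7.0

m = pa.scalar([("k", 1)], type=pa.map_(pa.string(), pa.int64()))
assert m.as_py() == [("k", 1)]                         # default: association list
assert m.as_py(maps_as_pydicts="strict") == {"k": 1}   # opt-in dict conversion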
+ +class BinaryViewScalar(Scalar[types.BinaryViewType]): + def as_buffer(self) -> Buffer: ... + +class StringViewScalar(Scalar[types.StringViewType]): + def as_buffer(self) -> Buffer: ... + +class ListScalar(Scalar[types.ListType[_DataTypeT]]): + @property + def values(self) -> Array | None: ... + def __len__(self) -> int: ... + def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... + def __iter__(self) -> Iterator[Array]: ... + +class FixedSizeListScalar(Scalar[types.FixedSizeListType[_DataTypeT, types._Size]]): + @property + def values(self) -> Array | None: ... + def __len__(self) -> int: ... + def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... + def __iter__(self) -> Iterator[Array]: ... + +class LargeListScalar(Scalar[types.LargeListType[_DataTypeT]]): + @property + def values(self) -> Array | None: ... + def __len__(self) -> int: ... + def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... + def __iter__(self) -> Iterator[Array]: ... + +class ListViewScalar(Scalar[types.ListViewType[_DataTypeT]]): + @property + def values(self) -> Array | None: ... + def __len__(self) -> int: ... + def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... + def __iter__(self) -> Iterator[Array]: ... + +class LargeListViewScalar(Scalar[types.LargeListViewType[_DataTypeT]]): + @property + def values(self) -> Array | None: ... + def __len__(self) -> int: ... + def __getitem__(self, i: int) -> Scalar[_DataTypeT]: ... + def __iter__(self) -> Iterator[Array]: ... + +class StructScalar(Scalar[types.StructType], collections.abc.Mapping[str, Scalar]): + def __len__(self) -> int: ... + def __iter__(self) -> Iterator[str]: ... + def __getitem__(self, __key: str) -> Scalar[Any]: ... # type: ignore[override] + def _as_py_tuple(self) -> list[tuple[str, Any]]: ... + +class MapScalar(Scalar[types.MapType[types._K, types._ValueT]]): + @property + def values(self) -> Array | None: ... + def __len__(self) -> int: ... + def __getitem__(self, i: int) -> tuple[Scalar[types._K], types._ValueT, Any]: ... + @overload + def __iter__( + self: Scalar[ + types.MapType[types._BasicDataType[_AsPyTypeK], types._BasicDataType[_AsPyTypeV]] + ], + ) -> Iterator[tuple[_AsPyTypeK, _AsPyTypeV]]: ... + @overload + def __iter__( + self: Scalar[types.MapType[Any, types._BasicDataType[_AsPyTypeV]],], + ) -> Iterator[tuple[Any, _AsPyTypeV]]: ... + @overload + def __iter__( + self: Scalar[types.MapType[types._BasicDataType[_AsPyTypeK], Any],], + ) -> Iterator[tuple[_AsPyTypeK, Any]]: ... + +class DictionaryScalar(Scalar[types.DictionaryType[types._IndexT, types._BasicValueT]]): + @property + def index(self) -> Scalar[types._IndexT]: ... + @property + def value(self) -> Scalar[types._BasicValueT]: ... + @property + def dictionary(self) -> Array: ... + +class RunEndEncodedScalar(Scalar[types.RunEndEncodedType[types._RunEndType, types._BasicValueT]]): + @property + def value(self) -> tuple[int, types._BasicValueT] | None: ... + +class UnionScalar(Scalar[types.UnionType]): + @property + def value(self) -> Any | None: ... + @property + def type_code(self) -> str: ... + +class ExtensionScalar(Scalar[types.ExtensionType]): + @property + def value(self) -> Any | None: ... + @staticmethod + def from_storage(typ: types.BaseExtensionType, value) -> ExtensionScalar: + """ + Construct ExtensionScalar from type and storage value. + + Parameters + ---------- + typ : DataType + The extension type for the result scalar. + value : object + The storage value for the result scalar. 
+ + Returns + ------- + ext_scalar : ExtensionScalar + """ + +class Bool8Scalar(Scalar[types.Bool8Type]): ... +class UuidScalar(Scalar[types.UuidType]): ... +class JsonScalar(Scalar[types.JsonType]): ... +class OpaqueScalar(Scalar[types.OpaqueType]): ... + +class FixedShapeTensorScalar(ExtensionScalar): + def to_numpy(self) -> np.ndarray: + """ + Convert fixed shape tensor scalar to a numpy.ndarray. + + The resulting ndarray's shape matches the permuted shape of the + fixed shape tensor scalar. + The conversion is zero-copy. + + Returns + ------- + numpy.ndarray + """ + def to_tensor(self) -> Tensor: + """ + Convert fixed shape tensor extension scalar to a pyarrow.Tensor, using shape + and strides derived from corresponding FixedShapeTensorType. + + The conversion is zero-copy. + + Returns + ------- + pyarrow.Tensor + Tensor represented stored in FixedShapeTensorScalar. + """ + +_V = TypeVar("_V") + +class NullableCollection(Protocol[_V]): # pyright: ignore[reportInvalidTypeVarUse] + def __iter__(self) -> Iterator[_V] | Iterator[_V | None]: ... + def __len__(self) -> int: ... + def __contains__(self, item: Any, /) -> bool: ... + +@overload +def scalar( + value: str, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> StringScalar: ... +@overload +def scalar( + value: bytes, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> BinaryScalar: ... +@overload +def scalar( # pyright: ignore[reportOverlappingOverload] + value: bool, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> BooleanScalar: ... +@overload +def scalar( + value: int, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Int64Scalar: ... +@overload +def scalar( + value: float, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> DoubleScalar: ... +@overload +def scalar( + value: Decimal, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Decimal128Scalar: ... +@overload +def scalar( # pyright: ignore[reportOverlappingOverload] + value: dt.datetime, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> TimestampScalar[Literal["us"]]: ... +@overload +def scalar( + value: dt.date, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Date32Scalar: ... +@overload +def scalar( + value: dt.time, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Time64Scalar[Literal["us"]]: ... +@overload +def scalar( + value: dt.timedelta, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> DurationScalar[Literal["us"]]: ... +@overload +def scalar( # pyright: ignore[reportOverlappingOverload] + value: MonthDayNano, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> MonthDayNanoIntervalScalar: ... +@overload +def scalar( + value: Mapping[str, Any], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> StructScalar: ... +@overload +def scalar( + value: NullableCollection[str], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[types.ListType[types.StringType]]: ... +@overload +def scalar( + value: NullableCollection[bytes], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[types.ListType[types.BinaryType]]: ... 
+@overload +def scalar( + value: NullableCollection[bool], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[types.ListType[types.BoolType]]: ... +@overload +def scalar( + value: NullableCollection[int], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[types.ListType[types.Int64Type]]: ... +@overload +def scalar( + value: NullableCollection[float], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[types.ListType[types.Float64Type]]: ... +@overload +def scalar( + value: NullableCollection[Decimal], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[types.ListType[types.Decimal32Type]]: ... +@overload +def scalar( + value: NullableCollection[dt.datetime], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[types.ListType[types.TimestampType[Literal["us"]]]]: ... +@overload +def scalar( + value: NullableCollection[dt.date], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[types.ListType[types.Date32Type]]: ... +@overload +def scalar( + value: NullableCollection[dt.time], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[types.ListType[types.Time64Type[Literal["us"]]]]: ... +@overload +def scalar( + value: NullableCollection[dt.timedelta], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[types.ListType[types.DurationType[Literal["us"]]]]: ... +@overload +def scalar( + value: NullableCollection[MonthDayNano], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[types.ListType[types.MonthDayNanoIntervalType]]: ... +@overload +def scalar( + value: NullableCollection[Any], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[Any]: ... +@overload +def scalar( + value: Any, + type: types.NullType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> NullScalar: ... +@overload +def scalar( + value: Any, + type: types.BoolType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> BooleanScalar: ... +@overload +def scalar( + value: Any, + type: types.UInt8Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> UInt8Scalar: ... +@overload +def scalar( + value: Any, + type: types.Int8Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Int8Scalar: ... +@overload +def scalar( + value: Any, + type: types.UInt16Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> UInt16Scalar: ... +@overload +def scalar( + value: Any, + type: types.Int16Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Int16Scalar: ... +@overload +def scalar( + value: Any, + type: types.Uint32Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> UInt32Scalar: ... +@overload +def scalar( + value: Any, + type: types.Int32Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Int32Scalar: ... +@overload +def scalar( + value: Any, + type: types.UInt64Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> UInt64Scalar: ... 
+@overload +def scalar( + value: Any, + type: types.Int64Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Int64Scalar: ... +@overload +def scalar( + value: Any, + type: types.Float16Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> HalfFloatScalar: ... +@overload +def scalar( + value: Any, + type: types.Float32Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> FloatScalar: ... +@overload +def scalar( + value: Any, + type: types.Float64Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> DoubleScalar: ... +@overload +def scalar( + value: Any, + type: types.Date32Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Date32Scalar: ... +@overload +def scalar( + value: Any, + type: types.Date64Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Date64Scalar: ... +@overload +def scalar( + value: Any, + type: types.MonthDayNanoIntervalType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> MonthDayNanoIntervalScalar: ... +@overload +def scalar( + value: Any, + type: types.StringType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> StringScalar: ... +@overload +def scalar( + value: Any, + type: types.LargeStringType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> LargeStringScalar: ... +@overload +def scalar( + value: Any, + type: types.StringViewType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> StringViewScalar: ... +@overload +def scalar( + value: Any, + type: types.BinaryType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> BinaryScalar: ... +@overload +def scalar( + value: Any, + type: types.LargeBinaryType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> LargeBinaryScalar: ... +@overload +def scalar( + value: Any, + type: types.BinaryViewType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> BinaryViewScalar: ... +@overload +def scalar( + value: Any, + type: types.TimestampType[types._Unit, types._Tz], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> TimestampScalar[types._Unit, types._Tz]: ... +@overload +def scalar( + value: Any, + type: types.Time32Type[types._Time32Unit], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Time32Scalar[types._Time32Unit]: ... +@overload +def scalar( + value: Any, + type: types.Time64Type[types._Time64Unit], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Time64Scalar[types._Time64Unit]: ... +@overload +def scalar( + value: Any, + type: types.DurationType[types._Unit], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> DurationScalar[types._Unit]: ... +@overload +def scalar( + value: Any, + type: types.Decimal32Type[types._Precision, types._Scale], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Decimal32Scalar[types._Precision, types._Scale]: ... 
+@overload +def scalar( + value: Any, + type: types.Decimal64Type[types._Precision, types._Scale], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Decimal64Scalar[types._Precision, types._Scale]: ... +@overload +def scalar( + value: Any, + type: types.Decimal128Type[types._Precision, types._Scale], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Decimal128Scalar[types._Precision, types._Scale]: ... +@overload +def scalar( + value: Any, + type: types.Decimal256Type[types._Precision, types._Scale], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Decimal256Scalar[types._Precision, types._Scale]: ... +@overload +def scalar( + value: Any, + type: types.ListType[_DataTypeT], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListScalar[_DataTypeT]: ... +@overload +def scalar( + value: Any, + type: types.LargeListType[_DataTypeT], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> LargeListScalar[_DataTypeT]: ... +@overload +def scalar( + value: Any, + type: types.ListViewType[_DataTypeT], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> ListViewScalar[_DataTypeT]: ... +@overload +def scalar( + value: Any, + type: types.LargeListViewType[_DataTypeT], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> LargeListViewScalar[_DataTypeT]: ... +@overload +def scalar( + value: Any, + type: types.FixedSizeListType[_DataTypeT, types._Size], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> FixedSizeListScalar[_DataTypeT, types._Size]: ... +@overload +def scalar( + value: Any, + type: types.DictionaryType[types._IndexT, types._BasicValueT], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> DictionaryScalar[types._IndexT, types._BasicValueT]: ... +@overload +def scalar( + value: Any, + type: types.MapType[types._K, types._ValueT], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> MapScalar[types._K, types._ValueT]: ... +@overload +def scalar( + value: Any, + type: types.StructType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> StructScalar: ... +@overload +def scalar( + value: Any, + type: types.UnionType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> UnionScalar: ... +@overload +def scalar( + value: Any, + type: types.RunEndEncodedType[types._RunEndType, types._BasicValueT], + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> RunEndEncodedScalar[types._RunEndType, types._BasicValueT]: ... +@overload +def scalar( + value: Any, + type: types.Bool8Type, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Bool8Scalar: ... +@overload +def scalar( + value: Any, + type: types.UuidType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> UuidScalar: ... +@overload +def scalar( + value: Any, + type: types.JsonType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> JsonScalar: ... +@overload +def scalar( + value: Any, + type: types.OpaqueType, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> OpaqueScalar: ... 
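# Illustrative usage sketch (not part of the patch): how the explicit `type=` overloads
# of scalar() in this chain are expected to map onto concrete scalar classes at runtime.
# The examples and values are assumptions for illustration, using only public pyarrow API.
import datetime as dt
from decimal import Decimal

import pyarrow as pa

ts = pa.scalar(dt.datetime(2024, 1, 1, tzinfo=dt.timezone.utc),
               type=pa.timestamp("ms", tz="UTC"))
# -> pyarrow.TimestampScalar; the stubs additionally carry the "ms" unit for type checkers.

dec = pa.scalar(Decimal("1.23"), type=pa.decimal128(5, 2))
# -> pyarrow.Decimal128Scalar, parameterised by precision/scale in the stubs.

lst = pa.scalar([1, 2, 3], type=pa.list_(pa.int16()))
# -> pyarrow.ListScalar; element access is typed as Scalar[Int16Type].
assert lst[0].as_py() == 1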
+@overload +def scalar( + value: Any, + type: _DataTypeT, + *, + from_pandas: bool | None = None, + memory_pool: MemoryPool | None = None, +) -> Scalar[_DataTypeT]: ... +def scalar(*args, **kwargs): + """ + Create a pyarrow.Scalar instance from a Python object. + + Parameters + ---------- + value : Any + Python object coercible to arrow's type system. + type : pyarrow.DataType + Explicit type to attempt to coerce to, otherwise will be inferred from + the value. + from_pandas : bool, default None + Use pandas's semantics for inferring nulls from values in + ndarray-like data. Defaults to False if not passed explicitly by user, + or True if a pandas object is passed in. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the currently-set default + memory pool. + + Returns + ------- + scalar : pyarrow.Scalar + + Examples + -------- + >>> import pyarrow as pa + + >>> pa.scalar(42) + + + >>> pa.scalar("string") + + + >>> pa.scalar([1, 2]) + + + >>> pa.scalar([1, 2], type=pa.list_(pa.int16())) + + """ + +__all__ = [ + "Scalar", + "_NULL", + "NA", + "NullScalar", + "BooleanScalar", + "UInt8Scalar", + "Int8Scalar", + "UInt16Scalar", + "Int16Scalar", + "UInt32Scalar", + "Int32Scalar", + "UInt64Scalar", + "Int64Scalar", + "HalfFloatScalar", + "FloatScalar", + "DoubleScalar", + "Decimal32Scalar", + "Decimal64Scalar", + "Decimal128Scalar", + "Decimal256Scalar", + "Date32Scalar", + "Date64Scalar", + "Time32Scalar", + "Time64Scalar", + "TimestampScalar", + "DurationScalar", + "MonthDayNanoIntervalScalar", + "BinaryScalar", + "LargeBinaryScalar", + "FixedSizeBinaryScalar", + "StringScalar", + "LargeStringScalar", + "BinaryViewScalar", + "StringViewScalar", + "ListScalar", + "FixedSizeListScalar", + "LargeListScalar", + "ListViewScalar", + "LargeListViewScalar", + "StructScalar", + "MapScalar", + "DictionaryScalar", + "RunEndEncodedScalar", + "UnionScalar", + "ExtensionScalar", + "FixedShapeTensorScalar", + "Bool8Scalar", + "UuidScalar", + "JsonScalar", + "OpaqueScalar", + "scalar", +] diff --git a/python/stubs/__lib_pxi/table.pyi b/python/stubs/__lib_pxi/table.pyi new file mode 100644 index 00000000000..ad9d0392137 --- /dev/null +++ b/python/stubs/__lib_pxi/table.pyi @@ -0,0 +1,5609 @@ +import datetime as dt +import sys + +from decimal import Decimal + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias +from typing import ( + Any, + Collection, + Generator, + Generic, + Iterable, + Iterator, + Literal, + Mapping, + Sequence, + TypeVar, + overload, +) + +import numpy as np +import pandas as pd + +from numpy.typing import NDArray +from pyarrow._compute import ( + CastOptions, + CountOptions, + FunctionOptions, + ScalarAggregateOptions, + TDigestOptions, + VarianceOptions, +) +from pyarrow._stubs_typing import ( + Indices, + Mask, + NullEncoding, + NullSelectionBehavior, + Order, + SupportArrowArray, + SupportArrowDeviceArray, + SupportArrowStream, +) +from pyarrow.compute import ArrayOrChunkedArray, Expression +from pyarrow.interchange.dataframe import _PyArrowDataFrame +from pyarrow.lib import Device, Field, MemoryManager, MemoryPool, MonthDayNano, Schema + +from . 
import array, scalar, types +from .array import Array, NullableCollection, StructArray, _CastAs, _PandasConvertible +from .device import DeviceAllocationType +from .io import Buffer +from .ipc import RecordBatchReader +from .scalar import Int64Scalar, Scalar +from .tensor import Tensor +from .types import _AsPyType, _BasicDataType, _DataTypeT + +_ScalarT = TypeVar("_ScalarT", bound=Scalar) +_Scalar_co = TypeVar("_Scalar_co", bound=Scalar, covariant=True) + +_Aggregation: TypeAlias = Literal[ + "all", + "any", + "approximate_median", + "count", + "count_all", + "count_distinct", + "distinct", + "first", + "first_last", + "last", + "list", + "max", + "mean", + "min", + "min_max", + "one", + "product", + "stddev", + "sum", + "tdigest", + "variance", +] +_AggregationPrefixed: TypeAlias = Literal[ + "hash_all", + "hash_any", + "hash_approximate_median", + "hash_count", + "hash_count_all", + "hash_count_distinct", + "hash_distinct", + "hash_first", + "hash_first_last", + "hash_last", + "hash_list", + "hash_max", + "hash_mean", + "hash_min", + "hash_min_max", + "hash_one", + "hash_product", + "hash_stddev", + "hash_sum", + "hash_tdigest", + "hash_variance", +] +Aggregation: TypeAlias = _Aggregation | _AggregationPrefixed +AggregateOptions: TypeAlias = ( + ScalarAggregateOptions | CountOptions | TDigestOptions | VarianceOptions | FunctionOptions +) + +UnarySelector: TypeAlias = str +NullarySelector: TypeAlias = tuple[()] +NarySelector: TypeAlias = list[str] | tuple[str, ...] +ColumnSelector: TypeAlias = UnarySelector | NullarySelector | NarySelector + +class ChunkedArray(_PandasConvertible[pd.Series], Generic[_Scalar_co]): + """ + An array-like composed from a (possibly empty) collection of pyarrow.Arrays + + Warnings + -------- + Do not call this class's constructor directly. + + Examples + -------- + To construct a ChunkedArray object use :func:`pyarrow.chunked_array`: + + >>> import pyarrow as pa + >>> pa.chunked_array([], type=pa.int8()) + + [ + ... + ] + + >>> pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + + [ + [ + 2, + 2, + 4 + ], + [ + 4, + 5, + 100 + ] + ] + >>> isinstance(pa.chunked_array([[2, 2, 4], [4, 5, 100]]), pa.ChunkedArray) + True + """ + + @property + def data(self) -> Self: ... + @property + def type(self: ChunkedArray[Scalar[_DataTypeT]]) -> _DataTypeT: + """ + Return data type of a ChunkedArray. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs.type + DataType(int64) + """ + def length(self) -> int: + """ + Return length of a ChunkedArray. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs.length() + 6 + """ + __len__ = length + def to_string( + self, + *, + indent: int = 0, + window: int = 5, + container_window: int = 2, + skip_new_lines: bool = False, + ) -> str: + """ + Render a "pretty-printed" string representation of the ChunkedArray + + Parameters + ---------- + indent : int + How much to indent right the content of the array, + by default ``0``. + window : int + How many items to preview within each chunk at the begin and end + of the chunk when the chunk is bigger than the window. + The other elements will be ellipsed. + container_window : int + How many chunks to preview at the begin and end + of the array when the array is bigger than the window. + The other elements will be ellipsed. + This setting also applies to list columns. 
+ skip_new_lines : bool + If the array should be rendered as a single line of text + or if each element should be on its own line. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs.to_string(skip_new_lines=True) + '[[2,2,4],[4,5,100]]' + """ + format = to_string + def validate(self, *, full: bool = False) -> None: + """ + Perform validation checks. An exception is raised if validation fails. + + By default only cheap validation checks are run. Pass `full=True` + for thorough validation checks (potentially O(n)). + + Parameters + ---------- + full : bool, default False + If True, run expensive checks, otherwise cheap checks only. + + Raises + ------ + ArrowInvalid + """ + @property + def null_count(self) -> int: + """ + Number of null entries + + Returns + ------- + int + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) + >>> n_legs.null_count + 1 + """ + @property + def nbytes(self) -> int: + """ + Total number of bytes consumed by the elements of the chunked array. + + In other words, the sum of bytes from all buffer ranges referenced. + + Unlike `get_total_buffer_size` this method will account for array + offsets. + + If buffers are shared between arrays then the shared + portion will only be counted multiple times. + + The dictionary of dictionary arrays will always be counted in their + entirety even if the array only references a portion of the dictionary. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) + >>> n_legs.nbytes + 49 + """ + def get_total_buffer_size(self) -> int: + """ + The sum of bytes in each buffer referenced by the chunked array. + + An array may only reference a portion of a buffer. + This method will overestimate in this case and return the + byte size of the entire buffer. + + If a buffer is referenced multiple times then it will + only be counted once. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) + >>> n_legs.get_total_buffer_size() + 49 + """ + def __sizeof__(self) -> int: ... + @overload + def __getitem__(self, key: slice) -> Self: ... + @overload + def __getitem__(self, key: int) -> _Scalar_co: ... + def __getitem__(self, key): + """ + Slice or return value at given index + + Parameters + ---------- + key : integer or slice + Slices with step not equal to 1 (or None) will produce a copy + rather than a zero-copy view + + Returns + ------- + value : Scalar (index) or ChunkedArray (slice) + """ + def getitem(self, i: int) -> Scalar: ... + def is_null(self, *, nan_is_null: bool = False) -> ChunkedArray[scalar.BooleanScalar]: + """ + Return boolean array indicating the null values. + + Parameters + ---------- + nan_is_null : bool (optional, default False) + Whether floating-point NaN values should also be considered null. + + Returns + ------- + array : boolean Array or ChunkedArray + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) + >>> n_legs.is_null() + + [ + [ + false, + false, + false, + false, + true, + false + ] + ] + """ + def is_nan(self) -> ChunkedArray[scalar.BooleanScalar]: + """ + Return boolean array indicating the NaN values. 
+ + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> arr = pa.chunked_array([[2, np.nan, 4], [4, None, 100]]) + >>> arr.is_nan() + + [ + [ + false, + true, + false, + false, + null, + false + ] + ] + """ + def is_valid(self) -> ChunkedArray[scalar.BooleanScalar]: + """ + Return boolean array indicating the non-null values. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) + >>> n_legs.is_valid() + + [ + [ + true, + true, + true + ], + [ + true, + false, + true + ] + ] + """ + def fill_null(self, fill_value: Scalar[_DataTypeT]) -> Self: + """ + Replace each null element in values with fill_value. + + See :func:`pyarrow.compute.fill_null` for full usage. + + Parameters + ---------- + fill_value : any + The replacement value for null entries. + + Returns + ------- + result : Array or ChunkedArray + A new array with nulls replaced by the given value. + + Examples + -------- + >>> import pyarrow as pa + >>> fill_value = pa.scalar(5, type=pa.int8()) + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) + >>> n_legs.fill_null(fill_value) + + [ + [ + 2, + 2, + 4, + 4, + 5, + 100 + ] + ] + """ + def equals(self, other: Self) -> bool: + """ + Return whether the contents of two chunked arrays are equal. + + Parameters + ---------- + other : pyarrow.ChunkedArray + Chunked array to compare against. + + Returns + ------- + are_equal : bool + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> animals = pa.chunked_array( + ... (["Flamingo", "Parrot", "Dog"], ["Horse", "Brittle stars", "Centipede"]) + ... ) + >>> n_legs.equals(n_legs) + True + >>> n_legs.equals(animals) + False + """ + def to_numpy(self, zero_copy_only: bool = False) -> np.ndarray: + """ + Return a NumPy copy of this array (experimental). + + Parameters + ---------- + zero_copy_only : bool, default False + Introduced for signature consistence with pyarrow.Array.to_numpy. + This must be False here since NumPy arrays' buffer must be contiguous. + + Returns + ------- + array : numpy.ndarray + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs.to_numpy() + array([ 2, 2, 4, 4, 5, 100]) + """ + def __array__(self, dtype: np.dtype | None = None, copy: bool | None = None) -> np.ndarray: ... + @overload + def cast( + self, + target_type: None = None, + safe: bool | None = None, + options: CastOptions | None = None, + ) -> Self: ... + @overload + def cast( + self, target_type: _CastAs, safe: bool | None = None, options: CastOptions | None = None + ) -> ChunkedArray[Scalar[_CastAs]]: ... + def cast(self, *args, **kwargs): + """ + Cast array values to another data type + + See :func:`pyarrow.compute.cast` for usage. + + Parameters + ---------- + target_type : DataType, None + Type to cast array to. + safe : boolean, default True + Whether to check for conversion errors such as overflow. 
+ options : CastOptions, default None + Additional checks pass by CastOptions + + Returns + ------- + cast : Array or ChunkedArray + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs.type + DataType(int64) + + Change the data type of an array: + + >>> n_legs_seconds = n_legs.cast(pa.duration("s")) + >>> n_legs_seconds.type + DurationType(duration[s]) + """ + def dictionary_encode(self, null_encoding: NullEncoding = "mask") -> Self: + """ + Compute dictionary-encoded representation of array. + + See :func:`pyarrow.compute.dictionary_encode` for full usage. + + Parameters + ---------- + null_encoding : str, default "mask" + How to handle null entries. + + Returns + ------- + encoded : ChunkedArray + A dictionary-encoded version of this array. + + Examples + -------- + >>> import pyarrow as pa + >>> animals = pa.chunked_array( + ... (["Flamingo", "Parrot", "Dog"], ["Horse", "Brittle stars", "Centipede"]) + ... ) + >>> animals.dictionary_encode() + + [ + ... + -- dictionary: + [ + "Flamingo", + "Parrot", + "Dog", + "Horse", + "Brittle stars", + "Centipede" + ] + -- indices: + [ + 0, + 1, + 2 + ], + ... + -- dictionary: + [ + "Flamingo", + "Parrot", + "Dog", + "Horse", + "Brittle stars", + "Centipede" + ] + -- indices: + [ + 3, + 4, + 5 + ] + ] + """ + def flatten(self, memory_pool: MemoryPool | None = None) -> list[ChunkedArray[Any]]: + """ + Flatten this ChunkedArray. If it has a struct type, the column is + flattened into one array per struct field. + + Parameters + ---------- + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool + + Returns + ------- + result : list of ChunkedArray + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> c_arr = pa.chunked_array(n_legs.value_counts()) + >>> c_arr + + [ + -- is_valid: all not null + -- child 0 type: int64 + [ + 2, + 4, + 5, + 100 + ] + -- child 1 type: int64 + [ + 2, + 2, + 1, + 1 + ] + ] + >>> c_arr.flatten() + [ + [ + [ + 2, + 4, + 5, + 100 + ] + ], + [ + [ + 2, + 2, + 1, + 1 + ] + ]] + >>> c_arr.type + StructType(struct) + >>> n_legs.type + DataType(int64) + """ + def combine_chunks(self, memory_pool: MemoryPool | None = None) -> Array[_Scalar_co]: + """ + Flatten this ChunkedArray into a single non-chunked array. + + Parameters + ---------- + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool + + Returns + ------- + result : Array + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs + + [ + [ + 2, + 2, + 4 + ], + [ + 4, + 5, + 100 + ] + ] + >>> n_legs.combine_chunks() + + [ + 2, + 2, + 4, + 4, + 5, + 100 + ] + """ + def unique(self) -> ChunkedArray[_Scalar_co]: + """ + Compute distinct elements in array + + Returns + ------- + pyarrow.Array + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs + + [ + [ + 2, + 2, + 4 + ], + [ + 4, + 5, + 100 + ] + ] + >>> n_legs.unique() + + [ + 2, + 4, + 5, + 100 + ] + """ + def value_counts(self) -> StructArray: + """ + Compute counts of unique elements in array. 
+ + Returns + ------- + An array of structs + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs + + [ + [ + 2, + 2, + 4 + ], + [ + 4, + 5, + 100 + ] + ] + >>> n_legs.value_counts() + + -- is_valid: all not null + -- child 0 type: int64 + [ + 2, + 4, + 5, + 100 + ] + -- child 1 type: int64 + [ + 2, + 2, + 1, + 1 + ] + """ + def slice(self, offset: int = 0, length: int | None = None) -> Self: + """ + Compute zero-copy slice of this ChunkedArray + + Parameters + ---------- + offset : int, default 0 + Offset from start of array to slice + length : int, default None + Length of slice (default is until end of batch starting from + offset) + + Returns + ------- + sliced : ChunkedArray + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs + + [ + [ + 2, + 2, + 4 + ], + [ + 4, + 5, + 100 + ] + ] + >>> n_legs.slice(2, 2) + + [ + [ + 4 + ], + [ + 4 + ] + ] + """ + def filter(self, mask: Mask, null_selection_behavior: NullSelectionBehavior = "drop") -> Self: + """ + Select values from the chunked array. + + See :func:`pyarrow.compute.filter` for full usage. + + Parameters + ---------- + mask : Array or array-like + The boolean mask to filter the chunked array with. + null_selection_behavior : str, default "drop" + How nulls in the mask should be handled. + + Returns + ------- + filtered : Array or ChunkedArray + An array of the same type, with only the elements selected by + the boolean mask. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs + + [ + [ + 2, + 2, + 4 + ], + [ + 4, + 5, + 100 + ] + ] + >>> mask = pa.array([True, False, None, True, False, True]) + >>> n_legs.filter(mask) + + [ + [ + 2 + ], + [ + 4, + 100 + ] + ] + >>> n_legs.filter(mask, null_selection_behavior="emit_null") + + [ + [ + 2, + null + ], + [ + 4, + 100 + ] + ] + """ + @overload + def index( + self: ChunkedArray[Scalar[_BasicDataType[_AsPyType]]], + value: Scalar[_DataTypeT] | _AsPyType, + start: int | None = None, + end: int | None = None, + *, + memory_pool: MemoryPool | None = None, + ) -> Int64Scalar: ... + @overload + def index( + self, + value: Scalar[_DataTypeT], + start: int | None = None, + end: int | None = None, + *, + memory_pool: MemoryPool | None = None, + ) -> Int64Scalar: ... + def index(self, *args, **kwargs): + """ + Find the first index of a value. + + See :func:`pyarrow.compute.index` for full usage. + + Parameters + ---------- + value : Scalar or object + The value to look for in the array. + start : int, optional + The start index where to look for `value`. + end : int, optional + The end index where to look for `value`. + memory_pool : MemoryPool, optional + A memory pool for potential memory allocations. + + Returns + ------- + index : Int64Scalar + The index of the value in the array (-1 if not found). + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs + + [ + [ + 2, + 2, + 4 + ], + [ + 4, + 5, + 100 + ] + ] + >>> n_legs.index(4) + + >>> n_legs.index(4, start=3) + + """ + def take(self, indices: Indices) -> Self: + """ + Select values from the chunked array. + + See :func:`pyarrow.compute.take` for full usage. + + Parameters + ---------- + indices : Array or array-like + The indices in the array whose values will be returned. 
+ + Returns + ------- + taken : Array or ChunkedArray + An array with the same datatype, containing the taken values. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> n_legs + + [ + [ + 2, + 2, + 4 + ], + [ + 4, + 5, + 100 + ] + ] + >>> n_legs.take([1, 4, 5]) + + [ + [ + 2, + 5, + 100 + ] + ] + """ + def drop_null(self) -> Self: + """ + Remove missing values from a chunked array. + See :func:`pyarrow.compute.drop_null` for full description. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, None], [4, 5, 100]]) + >>> n_legs + + [ + [ + 2, + 2, + null + ], + [ + 4, + 5, + 100 + ] + ] + >>> n_legs.drop_null() + + [ + [ + 2, + 2 + ], + [ + 4, + 5, + 100 + ] + ] + """ + def sort(self, order: Order = "ascending", **kwargs) -> Self: + """ + Sort the ChunkedArray + + Parameters + ---------- + order : str, default "ascending" + Which order to sort values in. + Accepted values are "ascending", "descending". + **kwargs : dict, optional + Additional sorting options. + As allowed by :class:`SortOptions` + + Returns + ------- + result : ChunkedArray + """ + def unify_dictionaries(self, memory_pool: MemoryPool | None = None) -> Self: + """ + Unify dictionaries across all chunks. + + This method returns an equivalent chunked array, but where all + chunks share the same dictionary values. Dictionary indices are + transposed accordingly. + + If there are no dictionaries in the chunked array, it is returned + unchanged. + + Parameters + ---------- + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool + + Returns + ------- + result : ChunkedArray + + Examples + -------- + >>> import pyarrow as pa + >>> arr_1 = pa.array(["Flamingo", "Parrot", "Dog"]).dictionary_encode() + >>> arr_2 = pa.array(["Horse", "Brittle stars", "Centipede"]).dictionary_encode() + >>> c_arr = pa.chunked_array([arr_1, arr_2]) + >>> c_arr + + [ + ... + -- dictionary: + [ + "Flamingo", + "Parrot", + "Dog" + ] + -- indices: + [ + 0, + 1, + 2 + ], + ... + -- dictionary: + [ + "Horse", + "Brittle stars", + "Centipede" + ] + -- indices: + [ + 0, + 1, + 2 + ] + ] + >>> c_arr.unify_dictionaries() + + [ + ... + -- dictionary: + [ + "Flamingo", + "Parrot", + "Dog", + "Horse", + "Brittle stars", + "Centipede" + ] + -- indices: + [ + 0, + 1, + 2 + ], + ... + -- dictionary: + [ + "Flamingo", + "Parrot", + "Dog", + "Horse", + "Brittle stars", + "Centipede" + ] + -- indices: + [ + 3, + 4, + 5 + ] + ] + """ + @property + def num_chunks(self) -> int: + """ + Number of underlying chunks. + + Returns + ------- + int + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, None], [4, 5, 100]]) + >>> n_legs.num_chunks + 2 + """ + def chunk(self, i: int) -> ChunkedArray[_Scalar_co]: + """ + Select a chunk by its index. + + Parameters + ---------- + i : int + + Returns + ------- + pyarrow.Array + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, None], [4, 5, 100]]) + >>> n_legs.chunk(1) + + [ + 4, + 5, + 100 + ] + """ + @property + def chunks(self) -> list[Array[_Scalar_co]]: + """ + Convert to a list of single-chunked arrays. 
+ + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, None], [4, 5, 100]]) + >>> n_legs + + [ + [ + 2, + 2, + null + ], + [ + 4, + 5, + 100 + ] + ] + >>> n_legs.chunks + [ + [ + 2, + 2, + null + ], + [ + 4, + 5, + 100 + ]] + """ + @overload + def iterchunks( + self: ChunkedArray[scalar.NullScalar], + ) -> Generator[array.NullArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.BooleanScalar], + ) -> Generator[array.BooleanArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.UInt8Scalar], + ) -> Generator[array.UInt8Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Int8Scalar], + ) -> Generator[array.Int8Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.UInt16Scalar], + ) -> Generator[array.UInt16Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Int16Scalar], + ) -> Generator[array.Int16Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.UInt32Scalar], + ) -> Generator[array.UInt32Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Int32Scalar], + ) -> Generator[array.Int32Array, None, None]: + """ + Convert to an iterator of ChunkArrays. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) + >>> for i in n_legs.iterchunks(): + ... print(i.null_count) + 0 + 1 + + """ + @overload + def iterchunks( + self: ChunkedArray[scalar.UInt64Scalar], + ) -> Generator[array.UInt64Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Int64Scalar], + ) -> Generator[array.Int64Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.HalfFloatScalar], + ) -> Generator[array.HalfFloatArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.FloatScalar], + ) -> Generator[array.FloatArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.DoubleScalar], + ) -> Generator[array.DoubleArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Decimal32Scalar], + ) -> Generator[array.Decimal32Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Decimal64Scalar], + ) -> Generator[array.Decimal64Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Decimal128Scalar], + ) -> Generator[array.Decimal128Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Decimal256Scalar], + ) -> Generator[array.Decimal256Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Date32Scalar], + ) -> Generator[array.Date32Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Date64Scalar], + ) -> Generator[array.Date64Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Time32Scalar[types._Time32Unit]], + ) -> Generator[array.Time32Array[types._Time32Unit], None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Time64Scalar[types._Time64Unit]], + ) -> Generator[array.Time64Array[types._Time64Unit], None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.DurationScalar[types._Unit]], + ) -> Generator[array.DurationArray[types._Unit], None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.MonthDayNanoIntervalScalar], + ) -> Generator[array.MonthDayNanoIntervalArray, None, None]: ... 
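A short sketch of how the `iterchunks()` overloads dispatch on the scalar parameter of `self`, assuming the stubs are installed; the unit literal is expected to survive chunk iteration. The expected types in the comments are assumed checker output, not runtime behaviour.

    import pyarrow as pa

    # Integer seconds coerced to time32[s]; expected static type:
    # ChunkedArray[Time32Scalar[Literal["s"]]]
    times = pa.chunked_array([[1, 2], [3]], type="time32[s]")
    for chunk in times.iterchunks():
        # expected narrowing per the overload above: Time32Array[Literal["s"]]
        print(chunk.type)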
+ @overload + def iterchunks( + self: ChunkedArray[scalar.BinaryScalar], + ) -> Generator[array.BinaryArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.LargeBinaryScalar], + ) -> Generator[array.LargeBinaryArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.FixedSizeBinaryScalar], + ) -> Generator[array.FixedSizeBinaryArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.StringScalar], + ) -> Generator[array.StringArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.LargeStringScalar], + ) -> Generator[array.LargeStringArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.BinaryViewScalar], + ) -> Generator[array.BinaryViewArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.StringViewScalar], + ) -> Generator[array.StringViewArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.ListScalar[_DataTypeT]], + ) -> Generator[array.ListArray[scalar.ListScalar[_DataTypeT]], None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.FixedSizeListScalar[_DataTypeT, types._Size]], + ) -> Generator[array.FixedSizeListArray[_DataTypeT, types._Size], None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.LargeListScalar[_DataTypeT]], + ) -> Generator[array.LargeListArray[_DataTypeT], None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.LargeListViewScalar[_DataTypeT]], + ) -> Generator[array.LargeListViewArray[_DataTypeT], None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.StructScalar], + ) -> Generator[array.StructArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.MapScalar[array._MapKeyT, array._MapItemT]], + ) -> Generator[array.MapArray[array._MapKeyT, array._MapItemT], None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.DictionaryScalar[types._IndexT, types._BasicValueT]], + ) -> Generator[array.DictionaryArray[types._IndexT, types._BasicValueT], None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.RunEndEncodedScalar], + ) -> Generator[array.RunEndEncodedArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.UnionScalar], + ) -> Generator[array.UnionArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.Bool8Scalar], + ) -> Generator[array.Bool8Array, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.UuidScalar], + ) -> Generator[array.UuidArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.JsonScalar], + ) -> Generator[array.JsonArray, None, None]: ... + @overload + def iterchunks( + self: ChunkedArray[scalar.OpaqueScalar], + ) -> Generator[array.OpaqueArray, None, None]: ... + def iterchunks(self): + """ + Convert to an iterator of ChunkArrays. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) + >>> for i in n_legs.iterchunks(): + ... print(i.null_count) + 0 + 1 + + """ + def __iter__(self) -> Iterator[_Scalar_co]: ... + def to_pylist( + self: ChunkedArray[Scalar[_BasicDataType[_AsPyType]]], + *, + maps_as_pydicts: Literal["lossy", "strict"] | None = None, + ) -> list[_AsPyType | None]: + """ + Convert to a list of native Python objects. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. 
+ The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + + If 'lossy', whenever duplicate keys are detected, a warning will be printed. + The last seen value of a duplicate key will be in the Python dictionary. + If 'strict', this instead results in an exception being raised when detected. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, None, 100]]) + >>> n_legs.to_pylist() + [2, 2, 4, 4, None, 100] + """ + def __arrow_c_stream__(self, requested_schema=None) -> Any: + """ + Export to a C ArrowArrayStream PyCapsule. + + Parameters + ---------- + requested_schema : PyCapsule, default None + The schema to which the stream should be casted, passed as a + PyCapsule containing a C ArrowSchema representation of the + requested schema. + + Returns + ------- + PyCapsule + A capsule containing a C ArrowArrayStream struct. + """ + @classmethod + def _import_from_c_capsule(cls, stream) -> Self: + """ + Import ChunkedArray from a C ArrowArrayStream PyCapsule. + + Parameters + ---------- + stream: PyCapsule + A capsule containing a C ArrowArrayStream PyCapsule. + + Returns + ------- + ChunkedArray + """ + @property + def is_cpu(self) -> bool: + """ + Whether all chunks in the ChunkedArray are CPU-accessible. + """ + +@overload +def chunked_array( + values: Iterable[NullableCollection[bool]], + type: None = None, +) -> ChunkedArray[scalar.BooleanScalar]: ... +@overload +def chunked_array( + values: Iterable[NullableCollection[int]], + type: None = None, +) -> ChunkedArray[scalar.Int64Scalar]: ... +@overload +def chunked_array( + values: Iterable[NullableCollection[float]], + type: None = None, +) -> ChunkedArray[scalar.DoubleScalar]: ... +@overload +def chunked_array( + values: Iterable[NullableCollection[Decimal]], + type: None = None, +) -> ChunkedArray[scalar.Decimal128Scalar]: ... +@overload +def chunked_array( + values: Iterable[NullableCollection[dict[str, Any]]], + type: None = None, +) -> ChunkedArray[scalar.StructScalar]: ... +@overload +def chunked_array( + values: Iterable[NullableCollection[dt.datetime]], + type: None = None, +) -> ChunkedArray[scalar.TimestampScalar]: ... +@overload +def chunked_array( + values: Iterable[NullableCollection[dt.date]], + type: None = None, +) -> ChunkedArray[scalar.Date32Scalar]: ... +@overload +def chunked_array( + values: Iterable[NullableCollection[dt.time]], + type: None = None, +) -> ChunkedArray[scalar.Time64Scalar[Literal["us"]]]: ... +@overload +def chunked_array( + values: Iterable[NullableCollection[dt.timedelta]], + type: None = None, +) -> ChunkedArray[scalar.DurationScalar[Literal["us"]]]: ... +@overload +def chunked_array( + values: Iterable[NullableCollection[MonthDayNano]], + type: None = None, +) -> ChunkedArray[scalar.MonthDayNanoIntervalScalar]: ... +@overload +def chunked_array( + values: Iterable[NullableCollection[str]], + type: None = None, +) -> ChunkedArray[scalar.StringScalar]: ... +@overload +def chunked_array( + values: Iterable[NullableCollection[bytes]], + type: None = None, +) -> ChunkedArray[scalar.BinaryScalar]: ... +@overload +def chunked_array( + values: Iterable[NullableCollection[list[Any]]], + type: None = None, +) -> ChunkedArray[scalar.ListScalar[Any]]: ... 
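A minimal sketch of the value-based `chunked_array()` overloads above, together with the `__getitem__` overloads of `ChunkedArray`, assuming these stubs; the comments show the inference a type checker is expected to produce, not runtime types.

    import pyarrow as pa

    ints = pa.chunked_array([[2, 2, 4], [4, 5, 100]])   # ChunkedArray[Int64Scalar]
    names = pa.chunked_array([["Flamingo", "Parrot"]])  # ChunkedArray[StringScalar]

    first = ints[0]   # int index -> Int64Scalar (per the __getitem__ overloads)
    head = ints[:3]   # slice -> ChunkedArray[Int64Scalar] (Self)
    print(first.as_py(), len(head))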
+@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["null"] | types.NullType, +) -> ChunkedArray[scalar.NullScalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["bool", "boolean"] | types.BoolType, +) -> ChunkedArray[scalar.BooleanScalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["i1", "int8"] | types.Int8Type, +) -> ChunkedArray[scalar.Int8Scalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["i2", "int16"] | types.Int16Type, +) -> ChunkedArray[scalar.Int16Scalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["i4", "int32"] | types.Int32Type, +) -> ChunkedArray[scalar.Int32Scalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["i8", "int64"] | types.Int64Type, +) -> ChunkedArray[scalar.Int64Scalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["u1", "uint8"] | types.UInt8Type, +) -> ChunkedArray[scalar.UInt8Scalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["u2", "uint16"] | types.UInt16Type, +) -> ChunkedArray[scalar.UInt16Scalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["u4", "uint32"] | types.Uint32Type, +) -> ChunkedArray[scalar.UInt32Scalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["u8", "uint64"] | types.UInt64Type, +) -> ChunkedArray[scalar.UInt64Scalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["f2", "halffloat", "float16"] | types.Float16Type, +) -> ChunkedArray[scalar.HalfFloatScalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["f4", "float", "float32"] | types.Float32Type, +) -> ChunkedArray[scalar.FloatScalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["f8", "double", "float64"] | types.Float64Type, +) -> ChunkedArray[scalar.DoubleScalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["string", "str", "utf8"] | types.StringType, +) -> ChunkedArray[scalar.StringScalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["binary"] | types.BinaryType, +) -> ChunkedArray[scalar.BinaryScalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["large_string", "large_str", "large_utf8"] | types.LargeStringType, +) -> ChunkedArray[scalar.LargeStringScalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["large_binary"] | types.LargeBinaryType, +) -> ChunkedArray[scalar.LargeBinaryScalar]: ... 
+@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["binary_view"] | types.BinaryViewType, +) -> ChunkedArray[scalar.BinaryViewScalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["string_view"] | types.StringViewType, +) -> ChunkedArray[scalar.StringViewScalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["date32", "date32[day]"] | types.Date32Type, +) -> ChunkedArray[scalar.Date32Scalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["date64", "date64[ms]"] | types.Date64Type, +) -> ChunkedArray[scalar.Date64Scalar]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["time32[s]"] | types.Time32Type[Literal["s"]], +) -> ChunkedArray[scalar.Time32Scalar[Literal["s"]]]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["time32[ms]"] | types.Time32Type[Literal["ms"]], +) -> ChunkedArray[scalar.Time32Scalar[Literal["ms"]]]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["time64[us]"] | types.Time64Type[Literal["us"]], +) -> ChunkedArray[scalar.Time64Scalar[Literal["us"]]]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["time64[ns]"] | types.Time64Type[Literal["ns"]], +) -> ChunkedArray[scalar.Time64Scalar[Literal["ns"]]]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["timestamp[s]"] | types.TimestampType[Literal["s"]], +) -> ChunkedArray[scalar.TimestampScalar[Literal["s"]]]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["timestamp[ms]"] | types.TimestampType[Literal["ms"]], +) -> ChunkedArray[scalar.TimestampScalar[Literal["ms"]]]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["timestamp[us]"] | types.TimestampType[Literal["us"]], +) -> ChunkedArray[scalar.TimestampScalar[Literal["us"]]]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["timestamp[ns]"] | types.TimestampType[Literal["ns"]], +) -> ChunkedArray[scalar.TimestampScalar[Literal["ns"]]]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["duration[s]"] | types.DurationType[Literal["s"]], +) -> ChunkedArray[scalar.DurationScalar[Literal["s"]]]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["duration[ms]"] | types.DurationType[Literal["ms"]], +) -> ChunkedArray[scalar.DurationScalar[Literal["ms"]]]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["duration[us]"] | types.DurationType[Literal["us"]], +) -> ChunkedArray[scalar.DurationScalar[Literal["us"]]]: ... 
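A sketch of the string-alias overloads above, assuming the stubs: the unit embedded in the alias (or in the DataType) is expected to reappear as a `Literal` type parameter on the scalar, e.g. `TimestampScalar[Literal["ms"]]`. The annotations are assumed checker output.

    import pyarrow as pa

    ts = pa.chunked_array([[0, 1_000, 2_000]], type="timestamp[ms]")
    # expected: ChunkedArray[TimestampScalar[Literal["ms"]]]
    dur = pa.chunked_array([[1, 2, 3]], type=pa.duration("s"))
    # expected: ChunkedArray[DurationScalar[Literal["s"]]]
    print(ts.type, dur.type)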
+@overload +def chunked_array( + values: Iterable[Iterable[Any] | SupportArrowStream | SupportArrowArray], + type: Literal["duration[ns]"] | types.DurationType[Literal["ns"]], +) -> ChunkedArray[scalar.DurationScalar[Literal["ns"]]]: ... +@overload +def chunked_array( + values: Iterable[Iterable[Any]] | SupportArrowStream | SupportArrowArray, + type: Literal["month_day_nano_interval"] | types.MonthDayNanoIntervalType, +) -> ChunkedArray[scalar.MonthDayNanoIntervalScalar]: ... +@overload +def chunked_array( + values: Iterable[Array[_ScalarT]], + type: None = None, +) -> ChunkedArray[_ScalarT]: ... +def chunked_array(value, type=None): + """ + Construct chunked array from list of array-like objects + + Parameters + ---------- + arrays : Array, list of Array, or array-like + Must all be the same data type. Can be empty only if type also passed. + Any Arrow-compatible array that implements the Arrow PyCapsule Protocol + (has an ``__arrow_c_array__`` or ``__arrow_c_stream__`` method) can be + passed as well. + type : DataType or string coercible to DataType + + Returns + ------- + ChunkedArray + + Examples + -------- + >>> import pyarrow as pa + >>> pa.chunked_array([], type=pa.int8()) + + [ + ... + ] + + >>> pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + + [ + [ + 2, + 2, + 4 + ], + [ + 4, + 5, + 100 + ] + ] + """ + +_ColumnT = TypeVar("_ColumnT", bound=ArrayOrChunkedArray[Any]) + +class _Tabular(_PandasConvertible[pd.DataFrame], Generic[_ColumnT]): + def __array__(self, dtype: np.dtype | None = None, copy: bool | None = None) -> np.ndarray: ... + def __dataframe__( + self, nan_as_null: bool = False, allow_copy: bool = True + ) -> _PyArrowDataFrame: + """ + Return the dataframe interchange object implementing the interchange protocol. + + Parameters + ---------- + nan_as_null : bool, default False + Whether to tell the DataFrame to overwrite null values in the data + with ``NaN`` (or ``NaT``). + allow_copy : bool, default True + Whether to allow memory copying when exporting. If set to False + it would cause non-zero-copy exports to fail. + + Returns + ------- + DataFrame interchange object + The object which consuming library can use to ingress the dataframe. + + Notes + ----- + Details on the interchange protocol: + https://data-apis.org/dataframe-protocol/latest/index.html + `nan_as_null` currently has no effect; once support for nullable extension + dtypes is added, this value should be propagated to columns. + """ + @overload + def __getitem__(self, key: int | str) -> _ColumnT: ... + @overload + def __getitem__(self, key: slice) -> Self: ... + def __getitem__(self, key): + """ + Slice or return column at given index or column name + + Parameters + ---------- + key : integer, str, or slice + Slices with step not equal to 1 (or None) will produce a copy + rather than a zero-copy view + + Returns + ------- + Array (from RecordBatch) or ChunkedArray (from Table) for column input. + RecordBatch or Table for slice input. + """ + def __len__(self) -> int: ... + def column(self, i: int | str) -> _ColumnT: + """ + Select single column from Table or RecordBatch. + + Parameters + ---------- + i : int or string + The index or name of the column to retrieve. + + Returns + ------- + column : Array (for RecordBatch) or ChunkedArray (for Table) + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... 
} + ... ) + >>> table = pa.Table.from_pandas(df) + + Select a column by numeric index: + + >>> table.column(0) + + [ + [ + 2, + 4, + 5, + 100 + ] + ] + + Select a column by its name: + + >>> table.column("animals") + + [ + [ + "Flamingo", + "Horse", + "Brittle stars", + "Centipede" + ] + ] + """ + @property + def column_names(self) -> list[str]: + """ + Names of the Table or RecordBatch columns. + + Returns + ------- + list of str + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> table = pa.Table.from_arrays( + ... [[2, 4, 5, 100], ["Flamingo", "Horse", "Brittle stars", "Centipede"]], + ... names=["n_legs", "animals"], + ... ) + >>> table.column_names + ['n_legs', 'animals'] + """ + @property + def columns(self) -> list[_ColumnT]: + """ + List of all columns in numerical order. + + Returns + ------- + columns : list of Array (for RecordBatch) or list of ChunkedArray (for Table) + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.columns + [ + [ + [ + null, + 4, + 5, + null + ] + ], + [ + [ + "Flamingo", + "Horse", + null, + "Centipede" + ] + ]] + """ + def drop_null(self) -> Self: + """ + Remove rows that contain missing values from a Table or RecordBatch. + + See :func:`pyarrow.compute.drop_null` for full usage. + + Returns + ------- + Table or RecordBatch + A tabular object with the same schema, with rows containing + no missing values. + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [None, 2022, 2019, 2021], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", None, "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.drop_null() + pyarrow.Table + year: double + n_legs: int64 + animals: string + ---- + year: [[2022,2021]] + n_legs: [[4,100]] + animals: [["Horse","Centipede"]] + """ + def field(self, i: int | str) -> Field: + """ + Select a schema field by its column name or numeric index. + + Parameters + ---------- + i : int or string + The index or name of the field to retrieve. + + Returns + ------- + Field + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.field(0) + pyarrow.Field + >>> table.field(1) + pyarrow.Field + """ + @classmethod + def from_pydict( + cls, + mapping: Mapping[str, ArrayOrChunkedArray[Any] | list | np.ndarray], + schema: Schema | None = None, + metadata: Mapping | None = None, + ) -> Self: + """ + Construct a Table or RecordBatch from Arrow arrays or columns. + + Parameters + ---------- + mapping : dict or Mapping + A mapping of strings to Arrays or Python lists. + schema : Schema, default None + If not passed, will be inferred from the Mapping values. + metadata : dict or Mapping, default None + Optional metadata for the schema (if inferred). 
+ + Returns + ------- + Table or RecordBatch + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 4, 5, 100]) + >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) + >>> pydict = {"n_legs": n_legs, "animals": animals} + + Construct a Table from a dictionary of arrays: + + >>> pa.Table.from_pydict(pydict) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + >>> pa.Table.from_pydict(pydict).schema + n_legs: int64 + animals: string + + Construct a Table from a dictionary of arrays with metadata: + + >>> my_metadata = {"n_legs": "Number of legs per animal"} + >>> pa.Table.from_pydict(pydict, metadata=my_metadata).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + + Construct a Table from a dictionary of arrays with pyarrow schema: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + >>> pa.Table.from_pydict(pydict, schema=my_schema).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + """ + @classmethod + def from_pylist( + cls, + mapping: Sequence[Mapping[str, Any]], + schema: Schema | None = None, + metadata: Mapping | None = None, + ) -> Self: + """ + Construct a Table or RecordBatch from list of rows / dictionaries. + + Parameters + ---------- + mapping : list of dicts of rows + A mapping of strings to row values. + schema : Schema, default None + If not passed, will be inferred from the first row of the + mapping values. + metadata : dict or Mapping, default None + Optional metadata for the schema (if inferred). + + Returns + ------- + Table or RecordBatch + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> pylist = [{"n_legs": 2, "animals": "Flamingo"}, {"n_legs": 4, "animals": "Dog"}] + + Construct a Table from a list of rows: + + >>> pa.Table.from_pylist(pylist) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4]] + animals: [["Flamingo","Dog"]] + + Construct a Table from a list of rows with metadata: + + >>> my_metadata = {"n_legs": "Number of legs per animal"} + >>> pa.Table.from_pylist(pylist, metadata=my_metadata).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + + Construct a Table from a list of rows with pyarrow schema: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + >>> pa.Table.from_pylist(pylist, schema=my_schema).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + """ + def itercolumns(self) -> Generator[_ColumnT, None, None]: + """ + Iterator over all columns in their numerical order. + + Yields + ------ + Array (for RecordBatch) or ChunkedArray (for Table) + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} + ... ) + >>> table = pa.Table.from_pandas(df) + >>> for i in table.itercolumns(): + ... print(i.null_count) + 2 + 1 + """ + @property + def num_columns(self) -> int: ... 
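A brief sketch of the shared `_Tabular` surface as it shows up on `Table` and `RecordBatch`, assuming these stubs: construction via `from_pydict`/`from_pylist` is typed once, `column()` yields a `ChunkedArray` on a `Table` and an `Array` on a `RecordBatch`, and `drop_null()` returns `Self`. Names and expected types in the comments are illustrative assumptions.

    import pyarrow as pa

    table = pa.Table.from_pydict(
        {"n_legs": [2, 4, None, 100],
         "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"]}
    )
    legs = table.column("n_legs")        # ChunkedArray on a Table
    batch = pa.RecordBatch.from_pylist([{"n_legs": 2, "animals": "Flamingo"}])
    col = batch.column("animals")        # Array on a RecordBatch
    clean = table.drop_null()            # Self -> still a Table
    print(legs.null_count, clean.num_rows, col[0].as_py())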
+ @property + def num_rows(self) -> int: ... + @property + def shape(self) -> tuple[int, int]: + """ + Dimensions of the table or record batch: (#rows, #columns). + + Returns + ------- + (int, int) + Number of rows and number of columns. + + Examples + -------- + >>> import pyarrow as pa + >>> table = pa.table( + ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} + ... ) + >>> table.shape + (4, 2) + """ + @property + def schema(self) -> Schema: ... + @property + def nbytes(self) -> int: ... + def sort_by(self, sorting: str | list[tuple[str, Order]], **kwargs) -> Self: + """ + Sort the Table or RecordBatch by one or multiple columns. + + Parameters + ---------- + sorting : str or list[tuple(name, order)] + Name of the column to use to sort (ascending), or + a list of multiple sorting conditions where + each entry is a tuple with column name + and sorting order ("ascending" or "descending") + **kwargs : dict, optional + Additional sorting options. + As allowed by :class:`SortOptions` + + Returns + ------- + Table or RecordBatch + A new tabular object sorted according to the sort keys. + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pandas as pd + >>> import pyarrow as pa + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.sort_by("animal") + pyarrow.Table + year: int64 + n_legs: int64 + animal: string + ---- + year: [[2019,2021,2021,2020,2022,2022]] + n_legs: [[5,100,4,2,4,2]] + animal: [["Brittle stars","Centipede","Dog","Flamingo","Horse","Parrot"]] + """ + def take(self, indices: Indices) -> Self: + """ + Select rows from a Table or RecordBatch. + + See :func:`pyarrow.compute.take` for full usage. + + Parameters + ---------- + indices : Array or array-like + The indices in the tabular object whose rows will be returned. + + Returns + ------- + Table or RecordBatch + A tabular object with the same schema, containing the taken rows. + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2019, 2021], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.take([1, 3]) + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2022,2021]] + n_legs: [[4,100]] + animals: [["Horse","Centipede"]] + """ + def filter( + self, mask: Mask | Expression, null_selection_behavior: NullSelectionBehavior = "drop" + ) -> Self: + """ + Select rows from the table or record batch based on a boolean mask. + + The Table can be filtered based on a mask, which will be passed to + :func:`pyarrow.compute.filter` to perform the filtering, or it can + be filtered through a boolean :class:`.Expression` + + Parameters + ---------- + mask : Array or array-like or .Expression + The boolean mask or the :class:`.Expression` to filter the table with. + null_selection_behavior : str, default "drop" + How nulls in the mask should be handled, does nothing if + an :class:`.Expression` is used. 
+ + Returns + ------- + filtered : Table or RecordBatch + A tabular object of the same schema, with only the rows selected + by applied filtering + + Examples + -------- + Using a Table (works similarly for RecordBatch): + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "year": [2020, 2022, 2019, 2021], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + + Define an expression and select rows: + + >>> import pyarrow.compute as pc + >>> expr = pc.field("year") <= 2020 + >>> table.filter(expr) + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2020,2019]] + n_legs: [[2,5]] + animals: [["Flamingo","Brittle stars"]] + + Define a mask and select rows: + + >>> mask = [True, True, False, None] + >>> table.filter(mask) + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2020,2022]] + n_legs: [[2,4]] + animals: [["Flamingo","Horse"]] + >>> table.filter(mask, null_selection_behavior="emit_null") + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2020,2022,null]] + n_legs: [[2,4,null]] + animals: [["Flamingo","Horse",null]] + """ + def to_pydict( + self, *, maps_as_pydicts: Literal["lossy", "strict"] | None = None + ) -> dict[str, list]: + """ + Convert the Table or RecordBatch to a dict or OrderedDict. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + + If 'lossy', whenever duplicate keys are detected, a warning will be printed. + The last seen value of a duplicate key will be in the Python dictionary. + If 'strict', this instead results in an exception being raised when detected. + + Returns + ------- + dict + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> table = pa.Table.from_arrays([n_legs, animals], names=["n_legs", "animals"]) + >>> table.to_pydict() + {'n_legs': [2, 2, 4, 4, 5, 100], 'animals': ['Flamingo', 'Parrot', ..., 'Centipede']} + """ + def to_pylist( + self, *, maps_as_pydicts: Literal["lossy", "strict"] | None = None + ) -> list[dict[str, Any]]: + """ + Convert the Table or RecordBatch to a list of rows / dictionaries. + + Parameters + ---------- + maps_as_pydicts : str, optional, default `None` + Valid values are `None`, 'lossy', or 'strict'. + The default behavior (`None`), is to convert Arrow Map arrays to + Python association lists (list-of-tuples) in the same order as the + Arrow Map, as in [(key1, value1), (key2, value2), ...]. + + If 'lossy' or 'strict', convert Arrow Map arrays to native Python dicts. + + If 'lossy', whenever duplicate keys are detected, a warning will be printed. + The last seen value of a duplicate key will be in the Python dictionary. + If 'strict', this instead results in an exception being raised when detected. 
+ + Returns + ------- + list + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> data = [[2, 4, 5, 100], ["Flamingo", "Horse", "Brittle stars", "Centipede"]] + >>> table = pa.table(data, names=["n_legs", "animals"]) + >>> table.to_pylist() + [{'n_legs': 2, 'animals': 'Flamingo'}, {'n_legs': 4, 'animals': 'Horse'}, ... + """ + def to_string(self, *, show_metadata: bool = False, preview_cols: int = 0) -> str: + """ + Return human-readable string representation of Table or RecordBatch. + + Parameters + ---------- + show_metadata : bool, default False + Display Field-level and Schema-level KeyValueMetadata. + preview_cols : int, default 0 + Display values of the columns for the first N columns. + + Returns + ------- + str + """ + def remove_column(self, i: int) -> Self: ... + def drop_columns(self, columns: str | list[str]) -> Self: + """ + Drop one or more columns and return a new Table or RecordBatch. + + Parameters + ---------- + columns : str or list[str] + Field name(s) referencing existing column(s). + + Raises + ------ + KeyError + If any of the passed column names do not exist. + + Returns + ------- + Table or RecordBatch + A tabular object without the column(s). + + Examples + -------- + Table (works similarly for RecordBatch) + + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + + Drop one column: + + >>> table.drop_columns("animals") + pyarrow.Table + n_legs: int64 + ---- + n_legs: [[2,4,5,100]] + + Drop one or more columns: + + >>> table.drop_columns(["n_legs", "animals"]) + pyarrow.Table + ... + ---- + """ + def add_column( + self, i: int, field_: str | Field, column: ArrayOrChunkedArray[Any] | list + ) -> Self: ... + def append_column(self, field_: str | Field, column: ArrayOrChunkedArray[Any] | list) -> Self: + """ + Append column at end of columns. + + Parameters + ---------- + field_ : str or Field + If a string is passed then the type is deduced from the column + data. + column : Array or value coercible to array + Column data. + + Returns + ------- + Table or RecordBatch + New table or record batch with the passed column added. + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + + Append column at the end: + + >>> year = [2021, 2022, 2019, 2021] + >>> table.append_column("year", [year]) + pyarrow.Table + n_legs: int64 + animals: string + year: int64 + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + year: [[2021,2022,2019,2021]] + """ + +class RecordBatch(_Tabular[Array]): + """ + Batch of rows of columns of equal length + + Warnings + -------- + Do not call this class's constructor directly, use one of the + ``RecordBatch.from_*`` functions instead. 
+ + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array(["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"]) + >>> names = ["n_legs", "animals"] + + Constructing a RecordBatch from arrays: + + >>> pa.RecordBatch.from_arrays([n_legs, animals], names=names) + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,2,4,4,5,100] + animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] + >>> pa.RecordBatch.from_arrays([n_legs, animals], names=names).to_pandas() + n_legs animals + 0 2 Flamingo + 1 2 Parrot + 2 4 Dog + 3 4 Horse + 4 5 Brittle stars + 5 100 Centipede + + Constructing a RecordBatch from pandas DataFrame: + + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2021, 2022], + ... "month": [3, 5, 7, 9], + ... "day": [1, 5, 9, 13], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> pa.RecordBatch.from_pandas(df) + pyarrow.RecordBatch + year: int64 + month: int64 + day: int64 + n_legs: int64 + animals: string + ---- + year: [2020,2022,2021,2022] + month: [3,5,7,9] + day: [1,5,9,13] + n_legs: [2,4,5,100] + animals: ["Flamingo","Horse","Brittle stars","Centipede"] + >>> pa.RecordBatch.from_pandas(df).to_pandas() + year month day n_legs animals + 0 2020 3 1 2 Flamingo + 1 2022 5 5 4 Horse + 2 2021 7 9 5 Brittle stars + 3 2022 9 13 100 Centipede + + Constructing a RecordBatch from pylist: + + >>> pylist = [{"n_legs": 2, "animals": "Flamingo"}, {"n_legs": 4, "animals": "Dog"}] + >>> pa.RecordBatch.from_pylist(pylist).to_pandas() + n_legs animals + 0 2 Flamingo + 1 4 Dog + + You can also construct a RecordBatch using :func:`pyarrow.record_batch`: + + >>> pa.record_batch([n_legs, animals], names=names).to_pandas() + n_legs animals + 0 2 Flamingo + 1 2 Parrot + 2 4 Dog + 3 4 Horse + 4 5 Brittle stars + 5 100 Centipede + + >>> pa.record_batch(df) + pyarrow.RecordBatch + year: int64 + month: int64 + day: int64 + n_legs: int64 + animals: string + ---- + year: [2020,2022,2021,2022] + month: [3,5,7,9] + day: [1,5,9,13] + n_legs: [2,4,5,100] + animals: ["Flamingo","Horse","Brittle stars","Centipede"] + """ + + def validate(self, *, full: bool = False) -> None: + """ + Perform validation checks. An exception is raised if validation fails. + + By default only cheap validation checks are run. Pass `full=True` + for thorough validation checks (potentially O(n)). + + Parameters + ---------- + full : bool, default False + If True, run expensive checks, otherwise cheap checks only. + + Raises + ------ + ArrowInvalid + """ + def replace_schema_metadata(self, metadata: dict | None = None) -> Self: + """ + Create shallow copy of record batch by replacing schema + key-value metadata with the indicated new metadata (which may be None, + which deletes any existing metadata + + Parameters + ---------- + metadata : dict, default None + + Returns + ------- + shallow_copy : RecordBatch + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + + Constructing a RecordBatch with schema and metadata: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.int64())], metadata={"n_legs": "Number of legs per animal"} + ... 
) + >>> batch = pa.RecordBatch.from_arrays([n_legs], schema=my_schema) + >>> batch.schema + n_legs: int64 + -- schema metadata -- + n_legs: 'Number of legs per animal' + + Shallow copy of a RecordBatch with deleted schema metadata: + + >>> batch.replace_schema_metadata().schema + n_legs: int64 + """ + @property + def num_columns(self) -> int: + """ + Number of columns + + Returns + ------- + int + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) + >>> batch.num_columns + 2 + """ + + @property + def num_rows(self) -> int: + """ + Number of rows + + Due to the definition of a RecordBatch, all columns have the same + number of rows. + + Returns + ------- + int + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) + >>> batch.num_rows + 6 + """ + @property + def schema(self) -> Schema: + """ + Schema of the RecordBatch and its columns + + Returns + ------- + pyarrow.Schema + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) + >>> batch.schema + n_legs: int64 + animals: string + """ + @property + def nbytes(self) -> int: + """ + Total number of bytes consumed by the elements of the record batch. + + In other words, the sum of bytes from all buffer ranges referenced. + + Unlike `get_total_buffer_size` this method will account for array + offsets. + + If buffers are shared between arrays then the shared + portion will only be counted multiple times. + + The dictionary of dictionary arrays will always be counted in their + entirety even if the array only references a portion of the dictionary. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) + >>> batch.nbytes + 116 + """ + def get_total_buffer_size(self) -> int: + """ + The sum of bytes in each buffer referenced by the record batch + + An array may only reference a portion of a buffer. + This method will overestimate in this case and return the + byte size of the entire buffer. + + If a buffer is referenced multiple times then it will + only be counted once. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) + >>> batch.get_total_buffer_size() + 120 + """ + + def __sizeof__(self) -> int: ... + def add_column( + self, i: int, field_: str | Field, column: ArrayOrChunkedArray[Any] | list + ) -> Self: + """ + Add column to RecordBatch at position i. 
+ + A new record batch is returned with the column added, the original record batch + object is left unchanged. + + Parameters + ---------- + i : int + Index to place the column at. + field_ : str or Field + If a string is passed then the type is deduced from the column + data. + column : Array or value coercible to array + Column data. + + Returns + ------- + RecordBatch + New record batch with the passed column added. + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> batch = pa.RecordBatch.from_pandas(df) + + Add column: + + >>> year = [2021, 2022, 2019, 2021] + >>> batch.add_column(0, "year", year) + pyarrow.RecordBatch + year: int64 + n_legs: int64 + animals: string + ---- + year: [2021,2022,2019,2021] + n_legs: [2,4,5,100] + animals: ["Flamingo","Horse","Brittle stars","Centipede"] + + Original record batch is left unchanged: + + >>> batch + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,4,5,100] + animals: ["Flamingo","Horse","Brittle stars","Centipede"] + """ + def remove_column(self, i: int) -> Self: + """ + Create new RecordBatch with the indicated column removed. + + Parameters + ---------- + i : int + Index of column to remove. + + Returns + ------- + Table + New record batch without the column. + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> batch = pa.RecordBatch.from_pandas(df) + >>> batch.remove_column(1) + pyarrow.RecordBatch + n_legs: int64 + ---- + n_legs: [2,4,5,100] + """ + def set_column(self, i: int, field_: str | Field, column: Array | list) -> Self: + """ + Replace column in RecordBatch at position. + + Parameters + ---------- + i : int + Index to place the column at. + field_ : str or Field + If a string is passed then the type is deduced from the column + data. + column : Array or value coercible to array + Column data. + + Returns + ------- + RecordBatch + New record batch with the passed column set. + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> batch = pa.RecordBatch.from_pandas(df) + + Replace a column: + + >>> year = [2021, 2022, 2019, 2021] + >>> batch.set_column(1, "year", year) + pyarrow.RecordBatch + n_legs: int64 + year: int64 + ---- + n_legs: [2,4,5,100] + year: [2021,2022,2019,2021] + """ + @overload + def rename_columns(self, names: list[str]) -> Self: ... + @overload + def rename_columns(self, names: dict[str, str]) -> Self: ... + def rename_columns(self, names): + """ + Create new record batch with columns renamed to provided names. + + Parameters + ---------- + names : list[str] or dict[str, str] + List of new column names or mapping of old column names to new column names. + + If a mapping of old to new column names is passed, then all columns which are + found to match a provided old column name will be renamed to the new column name. + If any column names are not found in the mapping, a KeyError will be raised. + + Raises + ------ + KeyError + If any of the column names passed in the names mapping do not exist. 
+ + Returns + ------- + RecordBatch + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> batch = pa.RecordBatch.from_pandas(df) + >>> new_names = ["n", "name"] + >>> batch.rename_columns(new_names) + pyarrow.RecordBatch + n: int64 + name: string + ---- + n: [2,4,5,100] + name: ["Flamingo","Horse","Brittle stars","Centipede"] + >>> new_names = {"n_legs": "n", "animals": "name"} + >>> batch.rename_columns(new_names) + pyarrow.RecordBatch + n: int64 + name: string + ---- + n: [2,4,5,100] + name: ["Flamingo","Horse","Brittle stars","Centipede"] + """ + def serialize(self, memory_pool: MemoryPool | None = None) -> Buffer: + """ + Write RecordBatch to Buffer as encapsulated IPC message, which does not + include a Schema. + + To reconstruct a RecordBatch from the encapsulated IPC message Buffer + returned by this function, a Schema must be passed separately. See + Examples. + + Parameters + ---------- + memory_pool : MemoryPool, default None + Uses default memory pool if not specified + + Returns + ------- + serialized : Buffer + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) + >>> buf = batch.serialize() + >>> buf + + + Reconstruct RecordBatch from IPC message Buffer and original Schema + + >>> pa.ipc.read_record_batch(buf, batch.schema) + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,2,4,4,5,100] + animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] + """ + def slice(self, offset: int = 0, length: int | None = None) -> Self: + """ + Compute zero-copy slice of this RecordBatch + + Parameters + ---------- + offset : int, default 0 + Offset from start of record batch to slice + length : int, default None + Length of slice (default is until end of batch starting from + offset) + + Returns + ------- + sliced : RecordBatch + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) + >>> batch.to_pandas() + n_legs animals + 0 2 Flamingo + 1 2 Parrot + 2 4 Dog + 3 4 Horse + 4 5 Brittle stars + 5 100 Centipede + >>> batch.slice(offset=3).to_pandas() + n_legs animals + 0 4 Horse + 1 5 Brittle stars + 2 100 Centipede + >>> batch.slice(length=2).to_pandas() + n_legs animals + 0 2 Flamingo + 1 2 Parrot + >>> batch.slice(offset=3, length=1).to_pandas() + n_legs animals + 0 4 Horse + """ + def equals(self, other: Self, check_metadata: bool = False) -> bool: + """ + Check if contents of two record batches are equal. + + Parameters + ---------- + other : pyarrow.RecordBatch + RecordBatch to compare against. + check_metadata : bool, default False + Whether schema metadata equality should be checked as well. + + Returns + ------- + are_equal : bool + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... 
) + >>> batch = pa.RecordBatch.from_arrays([n_legs, animals], names=["n_legs", "animals"]) + >>> batch_0 = pa.record_batch([]) + >>> batch_1 = pa.RecordBatch.from_arrays( + ... [n_legs, animals], + ... names=["n_legs", "animals"], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + >>> batch.equals(batch) + True + >>> batch.equals(batch_0) + False + >>> batch.equals(batch_1) + True + >>> batch.equals(batch_1, check_metadata=True) + False + """ + def select(self, columns: Iterable[str] | Iterable[int] | NDArray[np.str_]) -> Self: + """ + Select columns of the RecordBatch. + + Returns a new RecordBatch with the specified columns, and metadata + preserved. + + Parameters + ---------- + columns : list-like + The column names or integer indices to select. + + Returns + ------- + RecordBatch + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> batch = pa.record_batch([n_legs, animals], names=["n_legs", "animals"]) + + Select columns my indices: + + >>> batch.select([1]) + pyarrow.RecordBatch + animals: string + ---- + animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] + + Select columns by names: + + >>> batch.select(["n_legs"]) + pyarrow.RecordBatch + n_legs: int64 + ---- + n_legs: [2,2,4,4,5,100] + """ + def cast( + self, target_schema: Schema, safe: bool | None = None, options: CastOptions | None = None + ) -> Self: + """ + Cast record batch values to another schema. + + Parameters + ---------- + target_schema : Schema + Schema to cast to, the names and order of fields must match. + safe : bool, default True + Check for overflows or other unsafe conversions. + options : CastOptions, default None + Additional checks pass by CastOptions + + Returns + ------- + RecordBatch + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> batch = pa.RecordBatch.from_pandas(df) + >>> batch.schema + n_legs: int64 + animals: string + -- schema metadata -- + pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, ... + + Define new schema and cast batch values: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.duration("s")), pa.field("animals", pa.string())] + ... ) + >>> batch.cast(target_schema=my_schema) + pyarrow.RecordBatch + n_legs: duration[s] + animals: string + ---- + n_legs: [2,4,5,100] + animals: ["Flamingo","Horse","Brittle stars","Centipede"] + """ + @classmethod + def from_arrays( + cls, + arrays: Collection[Array], + names: list[str] | None = None, + schema: Schema | None = None, + metadata: Mapping | None = None, + ) -> Self: + """ + Construct a RecordBatch from multiple pyarrow.Arrays + + Parameters + ---------- + arrays : list of pyarrow.Array + One for each field in RecordBatch + names : list of str, optional + Names for the batch fields. If not passed, schema must be passed + schema : Schema, default None + Schema for the created batch. If not passed, names must be passed + metadata : dict or Mapping, default None + Optional metadata for the schema (if inferred). + + Returns + ------- + pyarrow.RecordBatch + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... 
["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> names = ["n_legs", "animals"] + + Construct a RecordBatch from pyarrow Arrays using names: + + >>> pa.RecordBatch.from_arrays([n_legs, animals], names=names) + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,2,4,4,5,100] + animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] + >>> pa.RecordBatch.from_arrays([n_legs, animals], names=names).to_pandas() + n_legs animals + 0 2 Flamingo + 1 2 Parrot + 2 4 Dog + 3 4 Horse + 4 5 Brittle stars + 5 100 Centipede + + Construct a RecordBatch from pyarrow Arrays using schema: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + >>> pa.RecordBatch.from_arrays([n_legs, animals], schema=my_schema).to_pandas() + n_legs animals + 0 2 Flamingo + 1 2 Parrot + 2 4 Dog + 3 4 Horse + 4 5 Brittle stars + 5 100 Centipede + >>> pa.RecordBatch.from_arrays([n_legs, animals], schema=my_schema).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + """ + @classmethod + def from_pandas( + cls, + df: pd.DataFrame, + schema: Schema | None = None, + preserve_index: bool | None = None, + nthreads: int | None = None, + columns: list[str] | None = None, + ) -> Self: + """ + Convert pandas.DataFrame to an Arrow RecordBatch + + Parameters + ---------- + df : pandas.DataFrame + schema : pyarrow.Schema, optional + The expected schema of the RecordBatch. This can be used to + indicate the type of columns if we cannot infer it automatically. + If passed, the output will have exactly this schema. Columns + specified in the schema that are not found in the DataFrame columns + or its index will raise an error. Additional columns or index + levels in the DataFrame which are not specified in the schema will + be ignored. + preserve_index : bool, optional + Whether to store the index as an additional column in the resulting + ``RecordBatch``. The default of None will store the index as a + column, except for RangeIndex which is stored as metadata only. Use + ``preserve_index=True`` to force it to be stored as a column. + nthreads : int, default None + If greater than 1, convert columns to Arrow in parallel using + indicated number of threads. By default, this follows + :func:`pyarrow.cpu_count` (may use up to system CPU count threads). + columns : list, optional + List of column to be converted. If None, use all columns. + + Returns + ------- + pyarrow.RecordBatch + + + Examples + -------- + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2021, 2022], + ... "month": [3, 5, 7, 9], + ... "day": [1, 5, 9, 13], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + + Convert pandas DataFrame to RecordBatch: + + >>> import pyarrow as pa + >>> pa.RecordBatch.from_pandas(df) + pyarrow.RecordBatch + year: int64 + month: int64 + day: int64 + n_legs: int64 + animals: string + ---- + year: [2020,2022,2021,2022] + month: [3,5,7,9] + day: [1,5,9,13] + n_legs: [2,4,5,100] + animals: ["Flamingo","Horse","Brittle stars","Centipede"] + + Convert pandas DataFrame to RecordBatch using schema: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... 
) + >>> pa.RecordBatch.from_pandas(df, schema=my_schema) + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,4,5,100] + animals: ["Flamingo","Horse","Brittle stars","Centipede"] + + Convert pandas DataFrame to RecordBatch specifying columns: + + >>> pa.RecordBatch.from_pandas(df, columns=["n_legs"]) + pyarrow.RecordBatch + n_legs: int64 + ---- + n_legs: [2,4,5,100] + """ + @classmethod + def from_struct_array( + cls, struct_array: StructArray | ChunkedArray[scalar.StructScalar] + ) -> Self: + """ + Construct a RecordBatch from a StructArray. + + Each field in the StructArray will become a column in the resulting + ``RecordBatch``. + + Parameters + ---------- + struct_array : StructArray + Array to construct the record batch from. + + Returns + ------- + pyarrow.RecordBatch + + Examples + -------- + >>> import pyarrow as pa + >>> struct = pa.array([{"n_legs": 2, "animals": "Parrot"}, {"year": 2022, "n_legs": 4}]) + >>> pa.RecordBatch.from_struct_array(struct).to_pandas() + animals n_legs year + 0 Parrot 2 NaN + 1 None 4 2022.0 + """ + def to_struct_array(self) -> StructArray: + """ + Convert to a struct array. + """ + def to_tensor( + self, + null_to_nan: bool = False, + row_major: bool = True, + memory_pool: MemoryPool | None = None, + ) -> Tensor: + """ + Convert to a :class:`~pyarrow.Tensor`. + + RecordBatches that can be converted have fields of type signed or unsigned + integer or float, including all bit-widths. + + ``null_to_nan`` is ``False`` by default and this method will raise an error in case + any nulls are present. RecordBatches with nulls can be converted with ``null_to_nan`` + set to ``True``. In this case null values are converted to ``NaN`` and integer type + arrays are promoted to the appropriate float type. + + Parameters + ---------- + null_to_nan : bool, default False + Whether to write null values in the result as ``NaN``. + row_major : bool, default True + Whether resulting Tensor is row-major or column-major + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool + + Examples + -------- + >>> import pyarrow as pa + >>> batch = pa.record_batch( + ... [ + ... pa.array([1, 2, 3, 4, None], type=pa.int32()), + ... pa.array([10, 20, 30, 40, None], type=pa.float32()), + ... ], + ... names=["a", "b"], + ... ) + + >>> batch + pyarrow.RecordBatch + a: int32 + b: float + ---- + a: [1,2,3,4,null] + b: [10,20,30,40,null] + + Convert a RecordBatch to row-major Tensor with null values + written as ``NaN``s + + >>> batch.to_tensor(null_to_nan=True) + + type: double + shape: (5, 2) + strides: (16, 8) + >>> batch.to_tensor(null_to_nan=True).to_numpy() + array([[ 1., 10.], + [ 2., 20.], + [ 3., 30.], + [ 4., 40.], + [nan, nan]]) + + Convert a RecordBatch to column-major Tensor + + >>> batch.to_tensor(null_to_nan=True, row_major=False) + + type: double + shape: (5, 2) + strides: (8, 40) + >>> batch.to_tensor(null_to_nan=True, row_major=False).to_numpy() + array([[ 1., 10.], + [ 2., 20.], + [ 3., 30.], + [ 4., 40.], + [nan, nan]]) + """ + def _export_to_c(self, out_ptr: int, out_schema_ptr: int = 0): + """ + Export to a C ArrowArray struct, given its pointer. + + If a C ArrowSchema struct pointer is also given, the record batch + schema is exported to it at the same time. + + Parameters + ---------- + out_ptr: int + The raw pointer to a C ArrowArray struct. + out_schema_ptr: int (optional) + The raw pointer to a C ArrowSchema struct. 
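+
+        A minimal round-trip sketch, assuming :mod:`pyarrow.cffi` is used to
+        allocate the C structs (illustrative only; the capsule-based
+        ``__arrow_c_array__`` protocol is generally preferred over this
+        private API):
+
+        >>> import pyarrow as pa
+        >>> from pyarrow.cffi import ffi
+        >>> batch = pa.record_batch([pa.array([1, 2, 3])], names=["a"])
+        >>> c_schema = ffi.new("struct ArrowSchema*")
+        >>> c_array = ffi.new("struct ArrowArray*")
+        >>> batch._export_to_c(
+        ...     int(ffi.cast("uintptr_t", c_array)), int(ffi.cast("uintptr_t", c_schema))
+        ... )
+        >>> same = pa.RecordBatch._import_from_c(
+        ...     int(ffi.cast("uintptr_t", c_array)), int(ffi.cast("uintptr_t", c_schema))
+        ... )
+        >>> same.to_pydict()
+        {'a': [1, 2, 3]}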
+ + Be careful: if you don't pass the ArrowArray struct to a consumer, + array memory will leak. This is a low-level function intended for + expert users. + """ + @classmethod + def _import_from_c(cls, in_ptr: int, schema: Schema) -> Self: + """ + Import RecordBatch from a C ArrowArray struct, given its pointer + and the imported schema. + + Parameters + ---------- + in_ptr: int + The raw pointer to a C ArrowArray struct. + type: Schema or int + Either a Schema object, or the raw pointer to a C ArrowSchema + struct. + + This is a low-level function intended for expert users. + """ + def __arrow_c_array__(self, requested_schema=None): + """ + Get a pair of PyCapsules containing a C ArrowArray representation of the object. + + Parameters + ---------- + requested_schema : PyCapsule | None + A PyCapsule containing a C ArrowSchema representation of a requested + schema. PyArrow will attempt to cast the batch to this schema. + If None, the batch will be returned as-is, with a schema matching the + one returned by :meth:`__arrow_c_schema__()`. + + Returns + ------- + Tuple[PyCapsule, PyCapsule] + A pair of PyCapsules containing a C ArrowSchema and ArrowArray, + respectively. + """ + def __arrow_c_stream__(self, requested_schema=None): + """ + Export the batch as an Arrow C stream PyCapsule. + + Parameters + ---------- + requested_schema : PyCapsule, default None + The schema to which the stream should be casted, passed as a + PyCapsule containing a C ArrowSchema representation of the + requested schema. + Currently, this is not supported and will raise a + NotImplementedError if the schema doesn't match the current schema. + + Returns + ------- + PyCapsule + """ + @classmethod + def _import_from_c_capsule(cls, schema_capsule, array_capsule) -> Self: + """ + Import RecordBatch from a pair of PyCapsules containing a C ArrowSchema + and ArrowArray, respectively. + + Parameters + ---------- + schema_capsule : PyCapsule + A PyCapsule containing a C ArrowSchema representation of the schema. + array_capsule : PyCapsule + A PyCapsule containing a C ArrowArray representation of the array. + + Returns + ------- + pyarrow.RecordBatch + """ + def _export_to_c_device(self, out_ptr: int, out_schema_ptr: int = 0) -> None: + """ + Export to a C ArrowDeviceArray struct, given its pointer. + + If a C ArrowSchema struct pointer is also given, the record batch + schema is exported to it at the same time. + + Parameters + ---------- + out_ptr: int + The raw pointer to a C ArrowDeviceArray struct. + out_schema_ptr: int (optional) + The raw pointer to a C ArrowSchema struct. + + Be careful: if you don't pass the ArrowDeviceArray struct to a consumer, + array memory will leak. This is a low-level function intended for + expert users. + """ + @classmethod + def _import_from_c_device(cls, in_ptr: int, schema: Schema) -> Self: + """ + Import RecordBatch from a C ArrowDeviceArray struct, given its pointer + and the imported schema. + + Parameters + ---------- + in_ptr: int + The raw pointer to a C ArrowDeviceArray struct. + type: Schema or int + Either a Schema object, or the raw pointer to a C ArrowSchema + struct. + + This is a low-level function intended for expert users. + """ + def __arrow_c_device_array__(self, requested_schema=None, **kwargs): + """ + Get a pair of PyCapsules containing a C ArrowDeviceArray representation + of the object. + + Parameters + ---------- + requested_schema : PyCapsule | None + A PyCapsule containing a C ArrowSchema representation of a requested + schema. 
PyArrow will attempt to cast the batch to this data type. + If None, the batch will be returned as-is, with a type matching the + one returned by :meth:`__arrow_c_schema__()`. + kwargs + Currently no additional keyword arguments are supported, but + this method will accept any keyword with a value of ``None`` + for compatibility with future keywords. + + Returns + ------- + Tuple[PyCapsule, PyCapsule] + A pair of PyCapsules containing a C ArrowSchema and ArrowDeviceArray, + respectively. + """ + @classmethod + def _import_from_c_device_capsule(cls, schema_capsule, array_capsule) -> Self: + """ + Import RecordBatch from a pair of PyCapsules containing a + C ArrowSchema and ArrowDeviceArray, respectively. + + Parameters + ---------- + schema_capsule : PyCapsule + A PyCapsule containing a C ArrowSchema representation of the schema. + array_capsule : PyCapsule + A PyCapsule containing a C ArrowDeviceArray representation of the array. + + Returns + ------- + pyarrow.RecordBatch + """ + @property + def device_type(self) -> DeviceAllocationType: + """ + The device type where the arrays in the RecordBatch reside. + + Returns + ------- + DeviceAllocationType + """ + @property + def is_cpu(self) -> bool: + """ + Whether the RecordBatch's arrays are CPU-accessible. + """ + def copy_to(self, destination: MemoryManager | Device) -> Self: + """ + Copy the entire RecordBatch to destination device. + + This copies each column of the record batch to create + a new record batch where all underlying buffers for the columns have + been copied to the destination MemoryManager. + + Parameters + ---------- + destination : pyarrow.MemoryManager or pyarrow.Device + The destination device to copy the array to. + + Returns + ------- + RecordBatch + """ + +def table_to_blocks(options, table: Table, categories, extension_columns): ... + +JoinType: TypeAlias = Literal[ + "left semi", + "right semi", + "left anti", + "right anti", + "inner", + "left outer", + "right outer", + "full outer", +] + +class Table(_Tabular[ChunkedArray[Any]]): + """ + A collection of top-level named, equal length Arrow arrays. + + Warnings + -------- + Do not call this class's constructor directly, use one of the ``from_*`` + methods instead. + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 4, 5, 100]) + >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) + >>> names = ["n_legs", "animals"] + + Construct a Table from arrays: + + >>> pa.Table.from_arrays([n_legs, animals], names=names) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + + Construct a Table from a RecordBatch: + + >>> batch = pa.record_batch([n_legs, animals], names=names) + >>> pa.Table.from_batches([batch]) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + + Construct a Table from pandas DataFrame: + + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2019, 2021], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... 
) + >>> pa.Table.from_pandas(df) + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2020,2022,2019,2021]] + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + + Construct a Table from a dictionary of arrays: + + >>> pydict = {"n_legs": n_legs, "animals": animals} + >>> pa.Table.from_pydict(pydict) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + >>> pa.Table.from_pydict(pydict).schema + n_legs: int64 + animals: string + + Construct a Table from a dictionary of arrays with metadata: + + >>> my_metadata = {"n_legs": "Number of legs per animal"} + >>> pa.Table.from_pydict(pydict, metadata=my_metadata).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + + Construct a Table from a list of rows: + + >>> pylist = [{"n_legs": 2, "animals": "Flamingo"}, {"year": 2021, "animals": "Centipede"}] + >>> pa.Table.from_pylist(pylist) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,null]] + animals: [["Flamingo","Centipede"]] + + Construct a Table from a list of rows with pyarrow schema: + + >>> my_schema = pa.schema( + ... [ + ... pa.field("year", pa.int64()), + ... pa.field("n_legs", pa.int64()), + ... pa.field("animals", pa.string()), + ... ], + ... metadata={"year": "Year of entry"}, + ... ) + >>> pa.Table.from_pylist(pylist, schema=my_schema).schema + year: int64 + n_legs: int64 + animals: string + -- schema metadata -- + year: 'Year of entry' + + Construct a Table with :func:`pyarrow.table`: + + >>> pa.table([n_legs, animals], names=names) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + """ + + def validate(self, *, full=False) -> None: + """ + Perform validation checks. An exception is raised if validation fails. + + By default only cheap validation checks are run. Pass `full=True` + for thorough validation checks (potentially O(n)). + + Parameters + ---------- + full : bool, default False + If True, run expensive checks, otherwise cheap checks only. + + Raises + ------ + ArrowInvalid + """ + def slice(self, offset=0, length=None) -> Self: + """ + Compute zero-copy slice of this Table. + + Parameters + ---------- + offset : int, default 0 + Offset from start of table to slice. + length : int, default None + Length of slice (default is until end of table starting from + offset). + + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2019, 2021], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... 
) + >>> table = pa.Table.from_pandas(df) + >>> table.slice(length=3) + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2020,2022,2019]] + n_legs: [[2,4,5]] + animals: [["Flamingo","Horse","Brittle stars"]] + >>> table.slice(offset=2) + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2019,2021]] + n_legs: [[5,100]] + animals: [["Brittle stars","Centipede"]] + >>> table.slice(offset=2, length=1) + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2019]] + n_legs: [[5]] + animals: [["Brittle stars"]] + """ + def select(self, columns: Iterable[str] | Iterable[int] | NDArray[np.str_]) -> Self: + """ + Select columns of the Table. + + Returns a new Table with the specified columns, and metadata + preserved. + + Parameters + ---------- + columns : list-like + The column names or integer indices to select. + + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2019, 2021], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.select([0, 1]) + pyarrow.Table + year: int64 + n_legs: int64 + ---- + year: [[2020,2022,2019,2021]] + n_legs: [[2,4,5,100]] + >>> table.select(["year"]) + pyarrow.Table + year: int64 + ---- + year: [[2020,2022,2019,2021]] + """ + def replace_schema_metadata(self, metadata: dict | None = None) -> Self: + """ + Create shallow copy of table by replacing schema + key-value metadata with the indicated new metadata (which may be None), + which deletes any existing metadata. + + Parameters + ---------- + metadata : dict, default None + + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2019, 2021], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + + Constructing a Table with pyarrow schema and metadata: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + >>> table = pa.table(df, my_schema) + >>> table.schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + pandas: ... + + Create a shallow copy of a Table with deleted schema metadata: + + >>> table.replace_schema_metadata().schema + n_legs: int64 + animals: string + + Create a shallow copy of a Table with new schema metadata: + + >>> metadata = {"animals": "Which animal"} + >>> table.replace_schema_metadata(metadata=metadata).schema + n_legs: int64 + animals: string + -- schema metadata -- + animals: 'Which animal' + """ + def flatten(self, memory_pool: MemoryPool | None = None) -> Self: + """ + Flatten this Table. + + Each column with a struct type is flattened + into one column per struct field. Other columns are left unchanged. 
+ + Parameters + ---------- + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool + + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> struct = pa.array([{"n_legs": 2, "animals": "Parrot"}, {"year": 2022, "n_legs": 4}]) + >>> month = pa.array([4, 6]) + >>> table = pa.Table.from_arrays([struct, month], names=["a", "month"]) + >>> table + pyarrow.Table + a: struct + child 0, animals: string + child 1, n_legs: int64 + child 2, year: int64 + month: int64 + ---- + a: [ + -- is_valid: all not null + -- child 0 type: string + ["Parrot",null] + -- child 1 type: int64 + [2,4] + -- child 2 type: int64 + [null,2022]] + month: [[4,6]] + + Flatten the columns with struct field: + + >>> table.flatten() + pyarrow.Table + a.animals: string + a.n_legs: int64 + a.year: int64 + month: int64 + ---- + a.animals: [["Parrot",null]] + a.n_legs: [[2,4]] + a.year: [[null,2022]] + month: [[4,6]] + """ + def combine_chunks(self, memory_pool: MemoryPool | None = None) -> Self: + """ + Make a new table by combining the chunks this table has. + + All the underlying chunks in the ChunkedArray of each column are + concatenated into zero or one chunk. + + Parameters + ---------- + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool. + + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> animals = pa.chunked_array( + ... [["Flamingo", "Parrot", "Dog"], ["Horse", "Brittle stars", "Centipede"]] + ... ) + >>> names = ["n_legs", "animals"] + >>> table = pa.table([n_legs, animals], names=names) + >>> table + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,2,4],[4,5,100]] + animals: [["Flamingo","Parrot","Dog"],["Horse","Brittle stars","Centipede"]] + >>> table.combine_chunks() + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,2,4,4,5,100]] + animals: [["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"]] + """ + def unify_dictionaries(self, memory_pool: MemoryPool | None = None) -> Self: + """ + Unify dictionaries across all chunks. + + This method returns an equivalent table, but where all chunks of + each column share the same dictionary values. Dictionary indices + are transposed accordingly. + + Columns without dictionaries are returned unchanged. 
+ + Parameters + ---------- + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool + + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> arr_1 = pa.array(["Flamingo", "Parrot", "Dog"]).dictionary_encode() + >>> arr_2 = pa.array(["Horse", "Brittle stars", "Centipede"]).dictionary_encode() + >>> c_arr = pa.chunked_array([arr_1, arr_2]) + >>> table = pa.table([c_arr], names=["animals"]) + >>> table + pyarrow.Table + animals: dictionary + ---- + animals: [ -- dictionary: + ["Flamingo","Parrot","Dog"] -- indices: + [0,1,2], -- dictionary: + ["Horse","Brittle stars","Centipede"] -- indices: + [0,1,2]] + + Unify dictionaries across both chunks: + + >>> table.unify_dictionaries() + pyarrow.Table + animals: dictionary + ---- + animals: [ -- dictionary: + ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] -- indices: + [0,1,2], -- dictionary: + ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] -- indices: + [3,4,5]] + """ + def equals(self, other: Self, check_metadata: bool = False) -> Self: + """ + Check if contents of two tables are equal. + + Parameters + ---------- + other : pyarrow.Table + Table to compare against. + check_metadata : bool, default False + Whether schema metadata equality should be checked as well. + + Returns + ------- + bool + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array( + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"] + ... ) + >>> names = ["n_legs", "animals"] + >>> table = pa.Table.from_arrays([n_legs, animals], names=names) + >>> table_0 = pa.Table.from_arrays([]) + >>> table_1 = pa.Table.from_arrays( + ... [n_legs, animals], names=names, metadata={"n_legs": "Number of legs per animal"} + ... ) + >>> table.equals(table) + True + >>> table.equals(table_0) + False + >>> table.equals(table_1) + True + >>> table.equals(table_1, check_metadata=True) + False + """ + def cast( + self, target_schema: Schema, safe: bool | None = None, options: CastOptions | None = None + ) -> Self: + """ + Cast table values to another schema. + + Parameters + ---------- + target_schema : Schema + Schema to cast to, the names and order of fields must match. + safe : bool, default True + Check for overflows or other unsafe conversions. + options : CastOptions, default None + Additional checks pass by CastOptions + + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.schema + n_legs: int64 + animals: string + -- schema metadata -- + pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, ... + + Define new schema and cast table values: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.duration("s")), pa.field("animals", pa.string())] + ... ) + >>> table.cast(target_schema=my_schema) + pyarrow.Table + n_legs: duration[s] + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + """ + @classmethod + def from_pandas( + cls, + df: pd.DataFrame, + schema: Schema | None = None, + preserve_index: bool | None = None, + nthreads: int | None = None, + columns: list[str] | None = None, + safe: bool = True, + ) -> Self: + """ + Convert pandas.DataFrame to an Arrow Table. 
+ + The column types in the resulting Arrow Table are inferred from the + dtypes of the pandas.Series in the DataFrame. In the case of non-object + Series, the NumPy dtype is translated to its Arrow equivalent. In the + case of `object`, we need to guess the datatype by looking at the + Python objects in this Series. + + Be aware that Series of the `object` dtype don't carry enough + information to always lead to a meaningful Arrow type. In the case that + we cannot infer a type, e.g. because the DataFrame is of length 0 or + the Series only contains None/nan objects, the type is set to + null. This behavior can be avoided by constructing an explicit schema + and passing it to this function. + + Parameters + ---------- + df : pandas.DataFrame + schema : pyarrow.Schema, optional + The expected schema of the Arrow Table. This can be used to + indicate the type of columns if we cannot infer it automatically. + If passed, the output will have exactly this schema. Columns + specified in the schema that are not found in the DataFrame columns + or its index will raise an error. Additional columns or index + levels in the DataFrame which are not specified in the schema will + be ignored. + preserve_index : bool, optional + Whether to store the index as an additional column in the resulting + ``Table``. The default of None will store the index as a column, + except for RangeIndex which is stored as metadata only. Use + ``preserve_index=True`` to force it to be stored as a column. + nthreads : int, default None + If greater than 1, convert columns to Arrow in parallel using + indicated number of threads. By default, this follows + :func:`pyarrow.cpu_count` (may use up to system CPU count threads). + columns : list, optional + List of column to be converted. If None, use all columns. + safe : bool, default True + Check for overflows or other unsafe conversions. + + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> pa.Table.from_pandas(df) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + """ + @classmethod + def from_arrays( + cls, + arrays: Collection[ArrayOrChunkedArray[Any]], + names: list[str] | None = None, + schema: Schema | None = None, + metadata: Mapping | None = None, + ) -> Self: + """ + Construct a Table from Arrow arrays. + + Parameters + ---------- + arrays : list of pyarrow.Array or pyarrow.ChunkedArray + Equal-length arrays that should form the table. + names : list of str, optional + Names for the table columns. If not passed, schema must be passed. + schema : Schema, default None + Schema for the created table. If not passed, names must be passed. + metadata : dict or Mapping, default None + Optional metadata for the schema (if inferred). 
+ + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 4, 5, 100]) + >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) + >>> names = ["n_legs", "animals"] + + Construct a Table from arrays: + + >>> pa.Table.from_arrays([n_legs, animals], names=names) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + + Construct a Table from arrays with metadata: + + >>> my_metadata = {"n_legs": "Number of legs per animal"} + >>> pa.Table.from_arrays([n_legs, animals], names=names, metadata=my_metadata) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + >>> pa.Table.from_arrays([n_legs, animals], names=names, metadata=my_metadata).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + + Construct a Table from arrays with pyarrow schema: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"animals": "Name of the animal species"}, + ... ) + >>> pa.Table.from_arrays([n_legs, animals], schema=my_schema) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + >>> pa.Table.from_arrays([n_legs, animals], schema=my_schema).schema + n_legs: int64 + animals: string + -- schema metadata -- + animals: 'Name of the animal species' + """ + @classmethod + def from_struct_array( + cls, struct_array: StructArray | ChunkedArray[scalar.StructScalar] + ) -> Self: + """ + Construct a Table from a StructArray. + + Each field in the StructArray will become a column in the resulting + ``Table``. + + Parameters + ---------- + struct_array : StructArray or ChunkedArray + Array to construct the table from. + + Returns + ------- + pyarrow.Table + + Examples + -------- + >>> import pyarrow as pa + >>> struct = pa.array([{"n_legs": 2, "animals": "Parrot"}, {"year": 2022, "n_legs": 4}]) + >>> pa.Table.from_struct_array(struct).to_pandas() + animals n_legs year + 0 Parrot 2 NaN + 1 None 4 2022.0 + """ + def to_struct_array( + self, max_chunksize: int | None = None + ) -> ChunkedArray[scalar.StructScalar]: + """ + Convert to a chunked array of struct type. + + Parameters + ---------- + max_chunksize : int, default None + Maximum number of rows for ChunkedArray chunks. Individual chunks + may be smaller depending on the chunk layout of individual columns. + + Returns + ------- + ChunkedArray + """ + @classmethod + def from_batches(cls, batches: Iterable[RecordBatch], schema: Schema | None = None) -> Self: + """ + Construct a Table from a sequence or iterator of Arrow RecordBatches. + + Parameters + ---------- + batches : sequence or iterator of RecordBatch + Sequence of RecordBatch to be converted, all schemas must be equal. + schema : Schema, default None + If not passed, will be inferred from the first RecordBatch. 
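+
+            For example, passing an explicit schema also allows an empty
+            iterable of batches (a minimal sketch):
+
+            >>> import pyarrow as pa
+            >>> my_schema = pa.schema([("n_legs", pa.int64()), ("animals", pa.string())])
+            >>> pa.Table.from_batches([], schema=my_schema).num_rows
+            0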
+ + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 4, 5, 100]) + >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) + >>> names = ["n_legs", "animals"] + >>> batch = pa.record_batch([n_legs, animals], names=names) + >>> batch.to_pandas() + n_legs animals + 0 2 Flamingo + 1 4 Horse + 2 5 Brittle stars + 3 100 Centipede + + Construct a Table from a RecordBatch: + + >>> pa.Table.from_batches([batch]) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + + Construct a Table from a sequence of RecordBatches: + + >>> pa.Table.from_batches([batch, batch]) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100],[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"],["Flamingo","Horse","Brittle stars","Centipede"]] + """ + def to_batches(self, max_chunksize: int | None = None) -> list[RecordBatch]: + """ + Convert Table to a list of RecordBatch objects. + + Note that this method is zero-copy, it merely exposes the same data + under a different API. + + Parameters + ---------- + max_chunksize : int, default None + Maximum number of rows for each RecordBatch chunk. Individual chunks + may be smaller depending on the chunk layout of individual columns. + + Returns + ------- + list[RecordBatch] + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + + Convert a Table to a RecordBatch: + + >>> table.to_batches()[0].to_pandas() + n_legs animals + 0 2 Flamingo + 1 4 Horse + 2 5 Brittle stars + 3 100 Centipede + + Convert a Table to a list of RecordBatches: + + >>> table.to_batches(max_chunksize=2)[0].to_pandas() + n_legs animals + 0 2 Flamingo + 1 4 Horse + >>> table.to_batches(max_chunksize=2)[1].to_pandas() + n_legs animals + 0 5 Brittle stars + 1 100 Centipede + """ + def to_reader(self, max_chunksize: int | None = None) -> RecordBatchReader: + """ + Convert the Table to a RecordBatchReader. + + Note that this method is zero-copy, it merely exposes the same data + under a different API. + + Parameters + ---------- + max_chunksize : int, default None + Maximum number of rows for each RecordBatch chunk. Individual chunks + may be smaller depending on the chunk layout of individual columns. + + Returns + ------- + RecordBatchReader + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + + Convert a Table to a RecordBatchReader: + + >>> table.to_reader() + + + >>> reader = table.to_reader() + >>> reader.schema + n_legs: int64 + animals: string + -- schema metadata -- + pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, ... + >>> reader.read_all() + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + """ + @property + def schema(self) -> Schema: + """ + Schema of the table and its columns. + + Returns + ------- + Schema + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... 
"animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.schema + n_legs: int64 + animals: string + -- schema metadata -- + pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' ... + """ + @property + def num_columns(self) -> int: + """ + Number of columns in this table. + + Returns + ------- + int + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.num_columns + 2 + """ + @property + def num_rows(self) -> int: + """ + Number of rows in this table. + + Due to the definition of a table, all columns have the same number of + rows. + + Returns + ------- + int + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.num_rows + 4 + """ + @property + def nbytes(self) -> int: + """ + Total number of bytes consumed by the elements of the table. + + In other words, the sum of bytes from all buffer ranges referenced. + + Unlike `get_total_buffer_size` this method will account for array + offsets. + + If buffers are shared between arrays then the shared + portion will only be counted multiple times. + + The dictionary of dictionary arrays will always be counted in their + entirety even if the array only references a portion of the dictionary. + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.nbytes + 72 + """ + def get_total_buffer_size(self) -> int: + """ + The sum of bytes in each buffer referenced by the table. + + An array may only reference a portion of a buffer. + This method will overestimate in this case and return the + byte size of the entire buffer. + + If a buffer is referenced multiple times then it will + only be counted once. + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... {"n_legs": [None, 4, 5, None], "animals": ["Flamingo", "Horse", None, "Centipede"]} + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.get_total_buffer_size() + 76 + """ + def __sizeof__(self) -> int: ... + def add_column( + self, i: int, field_: str | Field, column: ArrayOrChunkedArray[Any] | list + ) -> Self: + """ + Add column to Table at position. + + A new table is returned with the column added, the original table + object is left unchanged. + + Parameters + ---------- + i : int + Index to place the column at. + field_ : str or Field + If a string is passed then the type is deduced from the column + data. + column : Array, list of Array, or values coercible to arrays + Column data. + + Returns + ------- + Table + New table with the passed column added. + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... 
) + >>> table = pa.Table.from_pandas(df) + + Add column: + + >>> year = [2021, 2022, 2019, 2021] + >>> table.add_column(0, "year", [year]) + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2021,2022,2019,2021]] + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + + Original table is left unchanged: + + >>> table + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + """ + def remove_column(self, i: int) -> Self: + """ + Create new Table with the indicated column removed. + + Parameters + ---------- + i : int + Index of column to remove. + + Returns + ------- + Table + New table without the column. + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.remove_column(1) + pyarrow.Table + n_legs: int64 + ---- + n_legs: [[2,4,5,100]] + """ + def set_column( + self, i: int, field_: str | Field, column: ArrayOrChunkedArray[Any] | list + ) -> Self: + """ + Replace column in Table at position. + + Parameters + ---------- + i : int + Index to place the column at. + field_ : str or Field + If a string is passed then the type is deduced from the column + data. + column : Array, list of Array, or values coercible to arrays + Column data. + + Returns + ------- + Table + New table with the passed column set. + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + + Replace a column: + + >>> year = [2021, 2022, 2019, 2021] + >>> table.set_column(1, "year", [year]) + pyarrow.Table + n_legs: int64 + year: int64 + ---- + n_legs: [[2,4,5,100]] + year: [[2021,2022,2019,2021]] + """ + @overload + def rename_columns(self, names: list[str]) -> Self: ... + @overload + def rename_columns(self, names: dict[str, str]) -> Self: ... + def rename_columns(self, names): + """ + Create new table with columns renamed to provided names. + + Parameters + ---------- + names : list[str] or dict[str, str] + List of new column names or mapping of old column names to new column names. + + If a mapping of old to new column names is passed, then all columns which are + found to match a provided old column name will be renamed to the new column name. + If any column names are not found in the mapping, a KeyError will be raised. + + Raises + ------ + KeyError + If any of the column names passed in the names mapping do not exist. + + Returns + ------- + Table + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... 
) + >>> table = pa.Table.from_pandas(df) + >>> new_names = ["n", "name"] + >>> table.rename_columns(new_names) + pyarrow.Table + n: int64 + name: string + ---- + n: [[2,4,5,100]] + name: [["Flamingo","Horse","Brittle stars","Centipede"]] + >>> new_names = {"n_legs": "n", "animals": "name"} + >>> table.rename_columns(new_names) + pyarrow.Table + n: int64 + name: string + ---- + n: [[2,4,5,100]] + name: [["Flamingo","Horse","Brittle stars","Centipede"]] + """ + def drop(self, columns: str | list[str]) -> Self: + """ + Drop one or more columns and return a new table. + + Alias of Table.drop_columns, but kept for backwards compatibility. + + Parameters + ---------- + columns : str or list[str] + Field name(s) referencing existing column(s). + + Returns + ------- + Table + New table without the column(s). + """ + def group_by(self, keys: str | list[str], use_threads: bool = True) -> TableGroupBy: + """ + Declare a grouping over the columns of the table. + + Resulting grouping can then be used to perform aggregations + with a subsequent ``aggregate()`` method. + + Parameters + ---------- + keys : str or list[str] + Name of the columns that should be used as the grouping key. + use_threads : bool, default True + Whether to use multithreading or not. When set to True (the + default), no stable ordering of the output is guaranteed. + + Returns + ------- + TableGroupBy + + See Also + -------- + TableGroupBy.aggregate + + Examples + -------- + >>> import pandas as pd + >>> import pyarrow as pa + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> table.group_by("year").aggregate([("n_legs", "sum")]) + pyarrow.Table + year: int64 + n_legs_sum: int64 + ---- + year: [[2020,2022,2021,2019]] + n_legs_sum: [[2,6,104,5]] + """ + def join( + self, + right_table: Self, + keys: str | list[str], + right_keys: str | list[str] | None = None, + join_type: JoinType = "left outer", + left_suffix: str | None = None, + right_suffix: str | None = None, + coalesce_keys: bool = True, + use_threads: bool = True, + ) -> Self: + """ + Perform a join between this table and another one. + + Result of the join will be a new Table, where further + operations can be applied. + + Parameters + ---------- + right_table : Table + The table to join to the current one, acting as the right table + in the join operation. + keys : str or list[str] + The columns from current table that should be used as keys + of the join operation left side. + right_keys : str or list[str], default None + The columns from the right_table that should be used as keys + on the join operation right side. + When ``None`` use the same key names as the left table. + join_type : str, default "left outer" + The kind of join that should be performed, one of + ("left semi", "right semi", "left anti", "right anti", + "inner", "left outer", "right outer", "full outer") + left_suffix : str, default None + Which suffix to add to left column names. This prevents confusion + when the columns in left and right tables have colliding names. + right_suffix : str, default None + Which suffix to add to the right column names. This prevents confusion + when the columns in left and right tables have colliding names. + coalesce_keys : bool, default True + If the duplicated keys should be omitted from one of the sides + in the join result. 
+ use_threads : bool, default True + Whether to use multithreading or not. + + Returns + ------- + Table + + Examples + -------- + >>> import pandas as pd + >>> import pyarrow as pa + >>> df1 = pd.DataFrame({"id": [1, 2, 3], "year": [2020, 2022, 2019]}) + >>> df2 = pd.DataFrame( + ... {"id": [3, 4], "n_legs": [5, 100], "animal": ["Brittle stars", "Centipede"]} + ... ) + >>> t1 = pa.Table.from_pandas(df1) + >>> t2 = pa.Table.from_pandas(df2) + + Left outer join: + + >>> t1.join(t2, "id").combine_chunks().sort_by("year") + pyarrow.Table + id: int64 + year: int64 + n_legs: int64 + animal: string + ---- + id: [[3,1,2]] + year: [[2019,2020,2022]] + n_legs: [[5,null,null]] + animal: [["Brittle stars",null,null]] + + Full outer join: + + >>> t1.join(t2, "id", join_type="full outer").combine_chunks().sort_by("year") + pyarrow.Table + id: int64 + year: int64 + n_legs: int64 + animal: string + ---- + id: [[3,1,2,4]] + year: [[2019,2020,2022,null]] + n_legs: [[5,null,null,100]] + animal: [["Brittle stars",null,null,"Centipede"]] + + Right outer join: + + >>> t1.join(t2, "id", join_type="right outer").combine_chunks().sort_by("year") + pyarrow.Table + year: int64 + id: int64 + n_legs: int64 + animal: string + ---- + year: [[2019,null]] + id: [[3,4]] + n_legs: [[5,100]] + animal: [["Brittle stars","Centipede"]] + + Right anti join + + >>> t1.join(t2, "id", join_type="right anti") + pyarrow.Table + id: int64 + n_legs: int64 + animal: string + ---- + id: [[4]] + n_legs: [[100]] + animal: [["Centipede"]] + """ + def join_asof( + self, + right_table: Self, + on: str, + by: str | list[str], + tolerance: int, + right_on: str | list[str] | None = None, + right_by: str | list[str] | None = None, + ) -> Self: + """ + Perform an asof join between this table and another one. + + This is similar to a left-join except that we match on nearest key rather + than equal keys. Both tables must be sorted by the key. This type of join + is most useful for time series data that are not perfectly aligned. + + Optionally match on equivalent keys with "by" before searching with "on". + + Result of the join will be a new Table, where further + operations can be applied. + + Parameters + ---------- + right_table : Table + The table to join to the current one, acting as the right table + in the join operation. + on : str + The column from current table that should be used as the "on" key + of the join operation left side. + + An inexact match is used on the "on" key, i.e. a row is considered a + match if and only if left_on - tolerance <= right_on <= left_on. + + The input dataset must be sorted by the "on" key. Must be a single + field of a common type. + + Currently, the "on" key must be an integer, date, or timestamp type. + by : str or list[str] + The columns from current table that should be used as the keys + of the join operation left side. The join operation is then done + only for the matches in these columns. + tolerance : int + The tolerance for inexact "on" key matching. A right row is considered + a match with the left row ``right.on - left.on <= tolerance``. The + ``tolerance`` may be: + + - negative, in which case a past-as-of-join occurs; + - or positive, in which case a future-as-of-join occurs; + - or zero, in which case an exact-as-of-join occurs. + + The tolerance is interpreted in the same units as the "on" key. + right_on : str or list[str], default None + The columns from the right_table that should be used as the on key + on the join operation right side. 
+ When ``None`` use the same key name as the left table. + right_by : str or list[str], default None + The columns from the right_table that should be used as keys + on the join operation right side. + When ``None`` use the same key names as the left table. + + Returns + ------- + Table + + Example + -------- + >>> import pyarrow as pa + >>> t1 = pa.table({"id": [1, 3, 2, 3, 3], "year": [2020, 2021, 2022, 2022, 2023]}) + >>> t2 = pa.table( + ... { + ... "id": [3, 4], + ... "year": [2020, 2021], + ... "n_legs": [5, 100], + ... "animal": ["Brittle stars", "Centipede"], + ... } + ... ) + + >>> t1.join_asof(t2, on="year", by="id", tolerance=-2) + pyarrow.Table + id: int64 + year: int64 + n_legs: int64 + animal: string + ---- + id: [[1,3,2,3,3]] + year: [[2020,2021,2022,2022,2023]] + n_legs: [[null,5,null,5,null]] + animal: [[null,"Brittle stars",null,"Brittle stars",null]] + """ + def __arrow_c_stream__(self, requested_schema=None): + """ + Export the table as an Arrow C stream PyCapsule. + + Parameters + ---------- + requested_schema : PyCapsule, default None + The schema to which the stream should be casted, passed as a + PyCapsule containing a C ArrowSchema representation of the + requested schema. + Currently, this is not supported and will raise a + NotImplementedError if the schema doesn't match the current schema. + + Returns + ------- + PyCapsule + """ + @property + def is_cpu(self) -> bool: + """ + Whether all ChunkedArrays are CPU-accessible. + """ + +def record_batch( + data: dict[str, list[Any] | Array[Any]] + | Collection[Array[Any]] + | pd.DataFrame + | SupportArrowArray + | SupportArrowDeviceArray, + names: list[str] | None = None, + schema: Schema | None = None, + metadata: Mapping[Any, Any] | None = None, +) -> RecordBatch: + """ + Create a pyarrow.RecordBatch from another Python data structure or sequence + of arrays. + + Parameters + ---------- + data : dict, list, pandas.DataFrame, Arrow-compatible table + A mapping of strings to Arrays or Python lists, a list of Arrays, + a pandas DataFame, or any tabular object implementing the + Arrow PyCapsule Protocol (has an ``__arrow_c_array__`` or + ``__arrow_c_device_array__`` method). + names : list, default None + Column names if list of arrays passed as data. Mutually exclusive with + 'schema' argument. + schema : Schema, default None + The expected schema of the RecordBatch. If not passed, will be inferred + from the data. Mutually exclusive with 'names' argument. + metadata : dict or Mapping, default None + Optional metadata for the schema (if schema not passed). 
+ + Returns + ------- + RecordBatch + + See Also + -------- + RecordBatch.from_arrays, RecordBatch.from_pandas, table + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 2, 4, 4, 5, 100]) + >>> animals = pa.array(["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"]) + >>> names = ["n_legs", "animals"] + + Construct a RecordBatch from a python dictionary: + + >>> pa.record_batch({"n_legs": n_legs, "animals": animals}) + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,2,4,4,5,100] + animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] + >>> pa.record_batch({"n_legs": n_legs, "animals": animals}).to_pandas() + n_legs animals + 0 2 Flamingo + 1 2 Parrot + 2 4 Dog + 3 4 Horse + 4 5 Brittle stars + 5 100 Centipede + + Creating a RecordBatch from a list of arrays with names: + + >>> pa.record_batch([n_legs, animals], names=names) + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,2,4,4,5,100] + animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] + + Creating a RecordBatch from a list of arrays with names and metadata: + + >>> my_metadata = {"n_legs": "How many legs does an animal have?"} + >>> pa.record_batch([n_legs, animals], names=names, metadata=my_metadata) + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,2,4,4,5,100] + animals: ["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"] + >>> pa.record_batch([n_legs, animals], names=names, metadata=my_metadata).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'How many legs does an animal have?' + + Creating a RecordBatch from a pandas DataFrame: + + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2021, 2022], + ... "month": [3, 5, 7, 9], + ... "day": [1, 5, 9, 13], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> pa.record_batch(df) + pyarrow.RecordBatch + year: int64 + month: int64 + day: int64 + n_legs: int64 + animals: string + ---- + year: [2020,2022,2021,2022] + month: [3,5,7,9] + day: [1,5,9,13] + n_legs: [2,4,5,100] + animals: ["Flamingo","Horse","Brittle stars","Centipede"] + + >>> pa.record_batch(df).to_pandas() + year month day n_legs animals + 0 2020 3 1 2 Flamingo + 1 2022 5 5 4 Horse + 2 2021 7 9 5 Brittle stars + 3 2022 9 13 100 Centipede + + Creating a RecordBatch from a pandas DataFrame with schema: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + >>> pa.record_batch(df, my_schema).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + pandas: ... + >>> pa.record_batch(df, my_schema).to_pandas() + n_legs animals + 0 2 Flamingo + 1 4 Horse + 2 5 Brittle stars + 3 100 Centipede + """ + +@overload +def table( + data: dict[str, list[Any] | Array[Any]], + schema: Schema | None = None, + metadata: Mapping[Any, Any] | None = None, + nthreads: int | None = None, +) -> Table: ... +@overload +def table( + data: Collection[ArrayOrChunkedArray[Any]] + | pd.DataFrame + | SupportArrowArray + | SupportArrowStream + | SupportArrowDeviceArray, + names: list[str] | None = None, + schema: Schema | None = None, + metadata: Mapping[Any, Any] | None = None, + nthreads: int | None = None, +) -> Table: ... 
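As a quick illustration of the two overloads above (an editorial sketch, not part of the stub file; it relies only on the public ``pa.table`` and ``pa.array`` APIs), both call shapes are expected to type-check and produce equivalent tables:

import pyarrow as pa

# First overload: a mapping of column names to arrays or Python lists.
t_from_dict = pa.table({"n_legs": [2, 4], "animals": ["Flamingo", "Horse"]})

# Second overload: a sequence of arrays plus an explicit `names` list.
t_from_arrays = pa.table(
    [pa.array([2, 4]), pa.array(["Flamingo", "Horse"])],
    names=["n_legs", "animals"],
)

# Both forms infer the same schema; a checker should see `Table` for each call.
assert t_from_dict.schema == t_from_arrays.schema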
+def table(*args, **kwargs): + """ + Create a pyarrow.Table from a Python data structure or sequence of arrays. + + Parameters + ---------- + data : dict, list, pandas.DataFrame, Arrow-compatible table + A mapping of strings to Arrays or Python lists, a list of arrays or + chunked arrays, a pandas DataFame, or any tabular object implementing + the Arrow PyCapsule Protocol (has an ``__arrow_c_array__``, + ``__arrow_c_device_array__`` or ``__arrow_c_stream__`` method). + names : list, default None + Column names if list of arrays passed as data. Mutually exclusive with + 'schema' argument. + schema : Schema, default None + The expected schema of the Arrow Table. If not passed, will be inferred + from the data. Mutually exclusive with 'names' argument. + If passed, the output will have exactly this schema (raising an error + when columns are not found in the data and ignoring additional data not + specified in the schema, when data is a dict or DataFrame). + metadata : dict or Mapping, default None + Optional metadata for the schema (if schema not passed). + nthreads : int, default None + For pandas.DataFrame inputs: if greater than 1, convert columns to + Arrow in parallel using indicated number of threads. By default, + this follows :func:`pyarrow.cpu_count` (may use up to system CPU count + threads). + + Returns + ------- + Table + + See Also + -------- + Table.from_arrays, Table.from_pandas, Table.from_pydict + + Examples + -------- + >>> import pyarrow as pa + >>> n_legs = pa.array([2, 4, 5, 100]) + >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) + >>> names = ["n_legs", "animals"] + + Construct a Table from a python dictionary: + + >>> pa.table({"n_legs": n_legs, "animals": animals}) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + + Construct a Table from arrays: + + >>> pa.table([n_legs, animals], names=names) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + + Construct a Table from arrays with metadata: + + >>> my_metadata = {"n_legs": "Number of legs per animal"} + >>> pa.table([n_legs, animals], names=names, metadata=my_metadata).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + + Construct a Table from pandas DataFrame: + + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2019, 2021], + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> pa.table(df) + pyarrow.Table + year: int64 + n_legs: int64 + animals: string + ---- + year: [[2020,2022,2019,2021]] + n_legs: [[2,4,5,100]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + + Construct a Table from pandas DataFrame with pyarrow schema: + + >>> my_schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + >>> pa.table(df, my_schema).schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + pandas: '{"index_columns": [], "column_indexes": [{"name": null, ... + + Construct a Table from chunked arrays: + + >>> n_legs = pa.chunked_array([[2, 2, 4], [4, 5, 100]]) + >>> animals = pa.chunked_array( + ... [["Flamingo", "Parrot", "Dog"], ["Horse", "Brittle stars", "Centipede"]] + ... 
) + >>> table = pa.table([n_legs, animals], names=names) + >>> table + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,2,4],[4,5,100]] + animals: [["Flamingo","Parrot","Dog"],["Horse","Brittle stars","Centipede"]] + """ + +def concat_tables( + tables: Iterable[Table], + memory_pool: MemoryPool | None = None, + promote_options: Literal["none", "default", "permissive"] = "none", + **kwargs: Any, +) -> Table: + """ + Concatenate pyarrow.Table objects. + + If promote_options="none", a zero-copy concatenation will be performed. The schemas + of all the Tables must be the same (except the metadata), otherwise an + exception will be raised. The result Table will share the metadata with the + first table. + + If promote_options="default", any null type arrays will be casted to the type of other + arrays in the column of the same name. If a table is missing a particular + field, null values of the appropriate type will be generated to take the + place of the missing field. The new schema will share the metadata with the + first table. Each field in the new schema will share the metadata with the + first table which has the field defined. Note that type promotions may + involve additional allocations on the given ``memory_pool``. + + If promote_options="permissive", the behavior of default plus types will be promoted + to the common denominator that fits all the fields. + + Parameters + ---------- + tables : iterable of pyarrow.Table objects + Pyarrow tables to concatenate into a single Table. + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool. + promote_options : str, default none + Accepts strings "none", "default" and "permissive". + **kwargs : dict, optional + + Examples + -------- + >>> import pyarrow as pa + >>> t1 = pa.table( + ... [ + ... pa.array([2, 4, 5, 100]), + ... pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]), + ... ], + ... names=["n_legs", "animals"], + ... ) + >>> t2 = pa.table([pa.array([2, 4]), pa.array(["Parrot", "Dog"])], names=["n_legs", "animals"]) + >>> pa.concat_tables([t1, t2]) + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[2,4,5,100],[2,4]] + animals: [["Flamingo","Horse","Brittle stars","Centipede"],["Parrot","Dog"]] + + """ + +class TableGroupBy: + """ + A grouping of columns in a table on which to perform aggregations. + + Parameters + ---------- + table : pyarrow.Table + Input table to execute the aggregation on. + keys : str or list[str] + Name of the grouped columns. + use_threads : bool, default True + Whether to use multithreading or not. When set to True (the default), + no stable ordering of the output is guaranteed. + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.table( + ... [ + ... pa.array(["a", "a", "b", "b", "c"]), + ... pa.array([1, 2, 3, 4, 5]), + ... ], + ... names=["keys", "values"], + ... ) + + Grouping of columns: + + >>> pa.TableGroupBy(t, "keys") + + + Perform aggregations: + + >>> pa.TableGroupBy(t, "keys").aggregate([("values", "sum")]) + pyarrow.Table + keys: string + values_sum: int64 + ---- + keys: [["a","b","c"]] + values_sum: [[3,7,5]] + """ + + keys: str | list[str] + def __init__(self, table: Table, keys: str | list[str], use_threads: bool = True): ... + def aggregate( + self, + aggregations: Iterable[ + tuple[ColumnSelector, Aggregation] + | tuple[ColumnSelector, Aggregation, AggregateOptions | None] + ], + ) -> Table: + """ + Perform an aggregation over the grouped columns of the table. 
+ + Parameters + ---------- + aggregations : list[tuple(str, str)] or \ +list[tuple(str, str, FunctionOptions)] + List of tuples, where each tuple is one aggregation specification + and consists of: aggregation column name followed + by function name and optionally aggregation function option. + Pass empty list to get a single row for each group. + The column name can be a string, an empty list or a list of + column names, for unary, nullary and n-ary aggregation functions + respectively. + + For the list of function names and respective aggregation + function options see :ref:`py-grouped-aggrs`. + + Returns + ------- + Table + Results of the aggregation functions. + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.table([ + ... pa.array(["a", "a", "b", "b", "c"]), + ... pa.array([1, 2, 3, 4, 5]), + ... ], names=["keys", "values"]) + + Sum the column "values" over the grouped column "keys": + + >>> t.group_by("keys").aggregate([("values", "sum")]) + pyarrow.Table + keys: string + values_sum: int64 + ---- + keys: [["a","b","c"]] + values_sum: [[3,7,5]] + + Count the rows over the grouped column "keys": + + >>> t.group_by("keys").aggregate([([], "count_all")]) + pyarrow.Table + keys: string + count_all: int64 + ---- + keys: [["a","b","c"]] + count_all: [[2,2,1]] + + Do multiple aggregations: + + >>> t.group_by("keys").aggregate([ + ... ("values", "sum"), + ... ("keys", "count") + ... ]) + pyarrow.Table + keys: string + values_sum: int64 + keys_count: int64 + ---- + keys: [["a","b","c"]] + values_sum: [[3,7,5]] + keys_count: [[2,2,1]] + + Count the number of non-null values for column "values" + over the grouped column "keys": + + >>> import pyarrow.compute as pc + >>> t.group_by(["keys"]).aggregate([ + ... ("values", "count", pc.CountOptions(mode="only_valid")) + ... ]) + pyarrow.Table + keys: string + values_count: int64 + ---- + keys: [["a","b","c"]] + values_count: [[2,2,1]] + + Get a single row for each group in column "keys": + + >>> t.group_by("keys").aggregate([]) + pyarrow.Table + keys: string + ---- + keys: [["a","b","c"]] + """ + def _table(self) -> Table: ... + @property + def _use_threads(self) -> bool: ... + +def concat_batches( + recordbatches: Iterable[RecordBatch], memory_pool: MemoryPool | None = None +) -> RecordBatch: + """ + Concatenate pyarrow.RecordBatch objects. + + All recordbatches must share the same Schema, + the operation implies a copy of the data to merge + the arrays of the different RecordBatches. + + Parameters + ---------- + recordbatches : iterable of pyarrow.RecordBatch objects + Pyarrow record batches to concatenate into a single RecordBatch. + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool. + + Examples + -------- + >>> import pyarrow as pa + >>> t1 = pa.record_batch( + ... [ + ... pa.array([2, 4, 5, 100]), + ... pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]), + ... ], + ... names=["n_legs", "animals"], + ... ) + >>> t2 = pa.record_batch( + ... [pa.array([2, 4]), pa.array(["Parrot", "Dog"])], names=["n_legs", "animals"] + ... 
) + >>> pa.concat_batches([t1, t2]) + pyarrow.RecordBatch + n_legs: int64 + animals: string + ---- + n_legs: [2,4,5,100,2,4] + animals: ["Flamingo","Horse","Brittle stars","Centipede","Parrot","Dog"] + + """ + +__all__ = [ + "ChunkedArray", + "chunked_array", + "_Tabular", + "RecordBatch", + "table_to_blocks", + "Table", + "record_batch", + "table", + "concat_tables", + "TableGroupBy", + "concat_batches", +] diff --git a/python/stubs/__lib_pxi/tensor.pyi b/python/stubs/__lib_pxi/tensor.pyi new file mode 100644 index 00000000000..d849abd0f1f --- /dev/null +++ b/python/stubs/__lib_pxi/tensor.pyi @@ -0,0 +1,688 @@ +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self + +import numpy as np + +from pyarrow.lib import _Weakrefable +from scipy.sparse import coo_matrix, csr_matrix +from sparse import COO + +class Tensor(_Weakrefable): + """ + A n-dimensional array a.k.a Tensor. + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) + >>> pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + + type: int32 + shape: (2, 3) + strides: (12, 4) + """ + + @classmethod + def from_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: + """ + Create a Tensor from a numpy array. + + Parameters + ---------- + obj : numpy.ndarray + The source numpy array + dim_names : list, optional + Names of each dimension of the Tensor. + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) + >>> pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + + type: int32 + shape: (2, 3) + strides: (12, 4) + """ + def to_numpy(self) -> np.ndarray: + """ + Convert arrow::Tensor to numpy.ndarray with zero copy + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) + >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + >>> tensor.to_numpy() + array([[ 2, 2, 4], + [ 4, 5, 100]], dtype=int32) + """ + def equals(self, other: Tensor) -> bool: + """ + Return true if the tensors contains exactly equal data. + + Parameters + ---------- + other : Tensor + The other tensor to compare for equality. + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) + >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + >>> y = np.array([[2, 2, 4], [4, 5, 10]], np.int32) + >>> tensor2 = pa.Tensor.from_numpy(y, dim_names=["a", "b"]) + >>> tensor.equals(tensor) + True + >>> tensor.equals(tensor2) + False + """ + def dim_name(self, i: int) -> str: + """ + Returns the name of the i-th tensor dimension. + + Parameters + ---------- + i : int + The physical index of the tensor dimension. + + Examples + -------- + >>> import pyarrow as pa + >>> import numpy as np + >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32) + >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"]) + >>> tensor.dim_name(0) + 'dim1' + >>> tensor.dim_name(1) + 'dim2' + """ + @property + def dim_names(self) -> list[str]: + """ + Names of this tensor dimensions. 
+
+        Examples
+        --------
+        >>> import pyarrow as pa
+        >>> import numpy as np
+        >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32)
+        >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"])
+        >>> tensor.dim_names
+        ['dim1', 'dim2']
+        """
+    @property
+    def is_mutable(self) -> bool:
+        """
+        Whether this tensor is mutable.
+
+        Examples
+        --------
+        >>> import pyarrow as pa
+        >>> import numpy as np
+        >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32)
+        >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"])
+        >>> tensor.is_mutable
+        True
+        """
+    @property
+    def is_contiguous(self) -> bool:
+        """
+        Whether this tensor is contiguous in memory.
+
+        Examples
+        --------
+        >>> import pyarrow as pa
+        >>> import numpy as np
+        >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32)
+        >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"])
+        >>> tensor.is_contiguous
+        True
+        """
+    @property
+    def ndim(self) -> int:
+        """
+        The dimension (n) of this tensor.
+
+        Examples
+        --------
+        >>> import pyarrow as pa
+        >>> import numpy as np
+        >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32)
+        >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"])
+        >>> tensor.ndim
+        2
+        """
+    @property
+    def size(self) -> int:
+        """
+        The size of this tensor (the total number of elements).
+
+        Examples
+        --------
+        >>> import pyarrow as pa
+        >>> import numpy as np
+        >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32)
+        >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"])
+        >>> tensor.size
+        6
+        """
+    @property
+    def shape(self) -> tuple[int, ...]:
+        """
+        The shape of this tensor.
+
+        Examples
+        --------
+        >>> import pyarrow as pa
+        >>> import numpy as np
+        >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32)
+        >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"])
+        >>> tensor.shape
+        (2, 3)
+        """
+    @property
+    def strides(self) -> tuple[int, ...]:
+        """
+        Strides of this tensor.
+
+        Examples
+        --------
+        >>> import pyarrow as pa
+        >>> import numpy as np
+        >>> x = np.array([[2, 2, 4], [4, 5, 100]], np.int32)
+        >>> tensor = pa.Tensor.from_numpy(x, dim_names=["dim1", "dim2"])
+        >>> tensor.strides
+        (12, 4)
+        """
+
+class SparseCOOTensor(_Weakrefable):
+    @classmethod
+    def from_dense_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self:
+        """
+        Convert numpy.ndarray to arrow::SparseCOOTensor
+
+        Parameters
+        ----------
+        obj : numpy.ndarray
+            Data used to populate the rows.
+        dim_names : list[str], optional
+            Names of the dimensions.
+
+        Returns
+        -------
+        pyarrow.SparseCOOTensor
+        """
+
+    @classmethod
+    def from_numpy(
+        cls,
+        data: np.ndarray,
+        coords: np.ndarray,
+        shape: tuple[int, ...],
+        dim_names: list[str] | None = None,
+    ) -> Self:
+        """
+        Create arrow::SparseCOOTensor from numpy.ndarrays
+
+        Parameters
+        ----------
+        data : numpy.ndarray
+            Data used to populate the rows.
+        coords : numpy.ndarray
+            Coordinates of the data.
+        shape : tuple
+            Shape of the tensor.
+        dim_names : list, optional
+            Names of the dimensions.
+        """
+    @classmethod
+    def from_scipy(cls, obj: coo_matrix, dim_names: list[str] | None = None) -> Self:
+        """
+        Convert scipy.sparse.coo_matrix to arrow::SparseCOOTensor
+
+        Parameters
+        ----------
+        obj : scipy.sparse.coo_matrix
+            The scipy matrix that should be converted.
+        dim_names : list, optional
+            Names of the dimensions.
+        """
+    @classmethod
+    def from_pydata_sparse(cls, obj: COO, dim_names: list[str] | None = None) -> Self:
+        """
+        Convert pydata/sparse.COO to arrow::SparseCOOTensor.
+ + Parameters + ---------- + obj : pydata.sparse.COO + The sparse multidimensional array that should be converted. + dim_names : list, optional + Names of the dimensions. + """ + @classmethod + def from_tensor(cls, obj: Tensor) -> Self: + """ + Convert arrow::Tensor to arrow::SparseCOOTensor. + + Parameters + ---------- + obj : Tensor + The tensor that should be converted. + """ + def to_numpy(self) -> tuple[np.ndarray, np.ndarray]: + """ + Convert arrow::SparseCOOTensor to numpy.ndarrays with zero copy. + """ + def to_scipy(self) -> coo_matrix: + """ + Convert arrow::SparseCOOTensor to scipy.sparse.coo_matrix. + """ + def to_pydata_sparse(self) -> COO: + """ + Convert arrow::SparseCOOTensor to pydata/sparse.COO. + """ + def to_tensor(self) -> Tensor: + """ + Convert arrow::SparseCOOTensor to arrow::Tensor. + """ + def equals(self, other: Self) -> bool: + """ + Return true if sparse tensors contains exactly equal data. + + Parameters + ---------- + other : SparseCOOTensor + The other tensor to compare for equality. + """ + @property + def is_mutable(self) -> bool: ... + @property + def ndim(self) -> int: ... + @property + def size(self) -> str: ... + @property + def shape(self) -> tuple[int, ...]: ... + def dim_name(self, i: int) -> str: + """ + Returns the name of the i-th tensor dimension. + + Parameters + ---------- + i : int + The physical index of the tensor dimension. + + Returns + ------- + str + """ + @property + def dim_names(self) -> list[str]: ... + @property + def non_zero_length(self) -> int: ... + @property + def has_canonical_format(self) -> bool: ... + +class SparseCSRMatrix(_Weakrefable): + """ + A sparse CSR matrix. + """ + + @classmethod + def from_dense_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: + """ + Convert numpy.ndarray to arrow::SparseCSRMatrix + + Parameters + ---------- + obj : numpy.ndarray + The dense numpy array that should be converted. + dim_names : list, optional + The names of the dimensions. + + Returns + ------- + pyarrow.SparseCSRMatrix + """ + @classmethod + def from_numpy( + cls, + data: np.ndarray, + indptr: np.ndarray, + indices: np.ndarray, + shape: tuple[int, ...], + dim_names: list[str] | None = None, + ) -> Self: + """ + Create arrow::SparseCSRMatrix from numpy.ndarrays. + + Parameters + ---------- + data : numpy.ndarray + Data used to populate the sparse matrix. + indptr : numpy.ndarray + Range of the rows, + The i-th row spans from `indptr[i]` to `indptr[i+1]` in the data. + indices : numpy.ndarray + Column indices of the corresponding non-zero values. + shape : tuple + Shape of the matrix. + dim_names : list, optional + Names of the dimensions. + """ + @classmethod + def from_scipy(cls, obj: csr_matrix, dim_names: list[str] | None = None) -> Self: + """ + Convert scipy.sparse.csr_matrix to arrow::SparseCSRMatrix. + + Parameters + ---------- + obj : scipy.sparse.csr_matrix + The scipy matrix that should be converted. + dim_names : list, optional + Names of the dimensions. + """ + @classmethod + def from_tensor(cls, obj: Tensor) -> Self: + """ + Convert arrow::Tensor to arrow::SparseCSRMatrix. + + Parameters + ---------- + obj : Tensor + The dense tensor that should be converted. + """ + def to_numpy(self) -> tuple[np.ndarray, np.ndarray, np.ndarray]: + """ + Convert arrow::SparseCSRMatrix to numpy.ndarrays with zero copy. + """ + def to_scipy(self) -> csr_matrix: + """ + Convert arrow::SparseCSRMatrix to scipy.sparse.csr_matrix. 
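+
+        Examples
+        --------
+        A minimal round trip through SciPy (illustrative; requires ``scipy``):
+
+        >>> import numpy as np
+        >>> import pyarrow as pa
+        >>> from scipy.sparse import csr_matrix
+        >>> m = csr_matrix(np.eye(2))
+        >>> pa.SparseCSRMatrix.from_scipy(m).to_scipy().toarray()
+        array([[1., 0.],
+               [0., 1.]])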
+ """ + def to_tensor(self) -> Tensor: + """ + Convert arrow::SparseCSRMatrix to arrow::Tensor. + """ + def equals(self, other: Self) -> bool: + """ + Return true if sparse tensors contains exactly equal data. + + Parameters + ---------- + other : SparseCSRMatrix + The other tensor to compare for equality. + """ + @property + def is_mutable(self) -> bool: ... + @property + def ndim(self) -> int: ... + @property + def size(self) -> str: ... + @property + def shape(self) -> tuple[int, ...]: ... + def dim_name(self, i: int) -> str: + """ + Returns the name of the i-th tensor dimension. + + Parameters + ---------- + i : int + The physical index of the tensor dimension. + + Returns + ------- + str + """ + @property + def dim_names(self) -> list[str]: ... + @property + def non_zero_length(self) -> int: ... + +class SparseCSCMatrix(_Weakrefable): + """ + A sparse CSC matrix. + """ + + @classmethod + def from_dense_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: + """ + Convert numpy.ndarray to arrow::SparseCSCMatrix + + Parameters + ---------- + obj : numpy.ndarray + Data used to populate the rows. + dim_names : list[str], optional + Names of the dimensions. + + Returns + ------- + pyarrow.SparseCSCMatrix + """ + @classmethod + def from_numpy( + cls, + data: np.ndarray, + indptr: np.ndarray, + indices: np.ndarray, + shape: tuple[int, ...], + dim_names: list[str] | None = None, + ) -> Self: + """ + Create arrow::SparseCSCMatrix from numpy.ndarrays + + Parameters + ---------- + data : numpy.ndarray + Data used to populate the sparse matrix. + indptr : numpy.ndarray + Range of the rows, + The i-th row spans from `indptr[i]` to `indptr[i+1]` in the data. + indices : numpy.ndarray + Column indices of the corresponding non-zero values. + shape : tuple + Shape of the matrix. + dim_names : list, optional + Names of the dimensions. + """ + @classmethod + def from_scipy(cls, obj: csr_matrix, dim_names: list[str] | None = None) -> Self: + """ + Convert scipy.sparse.csc_matrix to arrow::SparseCSCMatrix + + Parameters + ---------- + obj : scipy.sparse.csc_matrix + The scipy matrix that should be converted. + dim_names : list, optional + Names of the dimensions. + """ + @classmethod + def from_tensor(cls, obj: Tensor) -> Self: + """ + Convert arrow::Tensor to arrow::SparseCSCMatrix + + Parameters + ---------- + obj : Tensor + The dense tensor that should be converted. + """ + def to_numpy(self) -> tuple[np.ndarray, np.ndarray, np.ndarray]: + """ + Convert arrow::SparseCSCMatrix to numpy.ndarrays with zero copy + """ + def to_scipy(self) -> csr_matrix: + """ + Convert arrow::SparseCSCMatrix to scipy.sparse.csc_matrix + """ + def to_tensor(self) -> Tensor: + """ + Convert arrow::SparseCSCMatrix to arrow::Tensor + """ + def equals(self, other: Self) -> bool: + """ + Return true if sparse tensors contains exactly equal data + + Parameters + ---------- + other : SparseCSCMatrix + The other tensor to compare for equality. + """ + @property + def is_mutable(self) -> bool: ... + @property + def ndim(self) -> int: ... + @property + def size(self) -> str: ... + @property + def shape(self) -> tuple[int, ...]: ... + def dim_name(self, i: int) -> str: + """ + Returns the name of the i-th tensor dimension. + + Parameters + ---------- + i : int + The physical index of the tensor dimension. + + Returns + ------- + str + """ + @property + def dim_names(self) -> list[str]: ... + @property + def non_zero_length(self) -> int: ... + +class SparseCSFTensor(_Weakrefable): + """ + A sparse CSF tensor. 
+ + CSF is a generalization of compressed sparse row (CSR) index. + + CSF index recursively compresses each dimension of a tensor into a set + of prefix trees. Each path from a root to leaf forms one tensor + non-zero index. CSF is implemented with two arrays of buffers and one + arrays of integers. + """ + + @classmethod + def from_dense_numpy(cls, obj: np.ndarray, dim_names: list[str] | None = None) -> Self: + """ + Convert numpy.ndarray to arrow::SparseCSFTensor + + Parameters + ---------- + obj : numpy.ndarray + Data used to populate the rows. + dim_names : list[str], optional + Names of the dimensions. + + Returns + ------- + pyarrow.SparseCSFTensor + """ + @classmethod + def from_numpy( + cls, + data: np.ndarray, + indptr: np.ndarray, + indices: np.ndarray, + shape: tuple[int, ...], + dim_names: list[str] | None = None, + ) -> Self: + """ + Create arrow::SparseCSFTensor from numpy.ndarrays + + Parameters + ---------- + data : numpy.ndarray + Data used to populate the sparse tensor. + indptr : numpy.ndarray + The sparsity structure. + Each two consecutive dimensions in a tensor correspond to + a buffer in indices. + A pair of consecutive values at `indptr[dim][i]` + `indptr[dim][i + 1]` signify a range of nodes in + `indices[dim + 1]` who are children of `indices[dim][i]` node. + indices : numpy.ndarray + Stores values of nodes. + Each tensor dimension corresponds to a buffer in indptr. + shape : tuple + Shape of the matrix. + axis_order : list, optional + the sequence in which dimensions were traversed to + produce the prefix tree. + dim_names : list, optional + Names of the dimensions. + """ + @classmethod + def from_tensor(cls, obj: Tensor) -> Self: + """ + Convert arrow::Tensor to arrow::SparseCSFTensor + + Parameters + ---------- + obj : Tensor + The dense tensor that should be converted. + """ + def to_numpy(self) -> tuple[np.ndarray, np.ndarray, np.ndarray]: + """ + Convert arrow::SparseCSFTensor to numpy.ndarrays with zero copy + """ + def to_tensor(self) -> Tensor: + """ + Convert arrow::SparseCSFTensor to arrow::Tensor + """ + def equals(self, other: Self) -> bool: + """ + Return true if sparse tensors contains exactly equal data + + Parameters + ---------- + other : SparseCSFTensor + The other tensor to compare for equality. + """ + @property + def is_mutable(self) -> bool: ... + @property + def ndim(self) -> int: ... + @property + def size(self) -> str: ... + @property + def shape(self) -> tuple[int, ...]: ... + def dim_name(self, i: int) -> str: + """ + Returns the name of the i-th tensor dimension. + + Parameters + ---------- + i : int + The physical index of the tensor dimension. + + Returns + ------- + str + """ + @property + def dim_names(self) -> list[str]: ... + @property + def non_zero_length(self) -> int: ... 
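To make the relationships among the classes stubbed above concrete, here is a small usage sketch (editorial illustration only; it uses just the public pyarrow tensor APIs annotated in this file):

import numpy as np
import pyarrow as pa

# Build a dense Arrow Tensor, then derive sparse representations from it.
dense = pa.Tensor.from_numpy(np.array([[1, 0, 2], [0, 0, 3]], dtype=np.float64))

coo = pa.SparseCOOTensor.from_tensor(dense)  # coordinate format
csr = pa.SparseCSRMatrix.from_tensor(dense)  # compressed sparse row format

# Only the non-zero entries are materialized in the sparse forms,
# and converting back yields an equal dense Tensor.
assert coo.non_zero_length == 3
assert csr.to_tensor().equals(dense)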
+ +__all__ = [ + "Tensor", + "SparseCOOTensor", + "SparseCSRMatrix", + "SparseCSCMatrix", + "SparseCSFTensor", +] diff --git a/python/stubs/__lib_pxi/types.pyi b/python/stubs/__lib_pxi/types.pyi new file mode 100644 index 00000000000..7fe6c36e332 --- /dev/null +++ b/python/stubs/__lib_pxi/types.pyi @@ -0,0 +1,4413 @@ +import datetime as dt +import sys + +from collections.abc import Mapping, Sequence +from decimal import Decimal + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self + +from typing import Any, Generic, Iterable, Iterator, Literal, overload + +import numpy as np +import pandas as pd + +from pyarrow._stubs_typing import SupportArrowSchema +from pyarrow.lib import ( + Array, + ChunkedArray, + ExtensionArray, + MemoryPool, + MonthDayNano, + Table, +) +from typing_extensions import TypeVar, deprecated + +from .io import Buffer +from .scalar import ExtensionScalar + +_AsPyType = TypeVar("_AsPyType") +_DataTypeT = TypeVar("_DataTypeT", bound=DataType) + +class _Weakrefable: ... +class _Metadata(_Weakrefable): ... + +class DataType(_Weakrefable): + """ + Base class of all Arrow data types. + + Each data type is an *instance* of this class. + + Examples + -------- + Instance of int64 type: + + >>> import pyarrow as pa + >>> pa.int64() + DataType(int64) + """ + def field(self, i: int) -> Field: + """ + Parameters + ---------- + i : int + + Returns + ------- + pyarrow.Field + """ + @property + def id(self) -> int: ... + @property + def bit_width(self) -> int: + """ + Bit width for fixed width type. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.int64() + DataType(int64) + >>> pa.int64().bit_width + 64 + """ + @property + def byte_width(self) -> int: + """ + Byte width for fixed width type. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.int64() + DataType(int64) + >>> pa.int64().byte_width + 8 + """ + @property + def num_fields(self) -> int: + """ + The number of child fields. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.int64() + DataType(int64) + >>> pa.int64().num_fields + 0 + >>> pa.list_(pa.string()) + ListType(list) + >>> pa.list_(pa.string()).num_fields + 1 + >>> struct = pa.struct({"x": pa.int32(), "y": pa.string()}) + >>> struct.num_fields + 2 + """ + @property + def num_buffers(self) -> int: + """ + Number of data buffers required to construct Array type + excluding children. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.int64().num_buffers + 2 + >>> pa.string().num_buffers + 3 + """ + def __hash__(self) -> int: ... + def equals(self, other: DataType | str, *, check_metadata: bool = False) -> bool: + """ + Return true if type is equivalent to passed value. + + Parameters + ---------- + other : DataType or string convertible to DataType + check_metadata : bool + Whether nested Field metadata equality should be checked as well. + + Returns + ------- + is_equal : bool + + Examples + -------- + >>> import pyarrow as pa + >>> pa.int64().equals(pa.string()) + False + >>> pa.int64().equals(pa.int64()) + True + """ + def to_pandas_dtype(self) -> np.generic: + """ + Return the equivalent NumPy / Pandas dtype. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.int64().to_pandas_dtype() + + """ + def _export_to_c(self, out_ptr: int) -> None: + """ + Export to a C ArrowSchema struct, given its pointer. + + Be careful: if you don't pass the ArrowSchema struct to a consumer, + its memory will leak. This is a low-level function intended for + expert users. 
+ """ + @classmethod + def _import_from_c(cls, in_ptr: int) -> Self: + """ + Import DataType from a C ArrowSchema struct, given its pointer. + + This is a low-level function intended for expert users. + """ + def __arrow_c_schema__(self) -> Any: + """ + Export to a ArrowSchema PyCapsule + + Unlike _export_to_c, this will not leak memory if the capsule is not used. + """ + @classmethod + def _import_from_c_capsule(cls, schema) -> Self: + """ + Import a DataType from a ArrowSchema PyCapsule + + Parameters + ---------- + schema : PyCapsule + A valid PyCapsule with name 'arrow_schema' containing an + ArrowSchema pointer. + """ + +class _BasicDataType(DataType, Generic[_AsPyType]): ... +class NullType(_BasicDataType[None]): ... +class BoolType(_BasicDataType[bool]): ... +class UInt8Type(_BasicDataType[int]): ... +class Int8Type(_BasicDataType[int]): ... +class UInt16Type(_BasicDataType[int]): ... +class Int16Type(_BasicDataType[int]): ... +class Uint32Type(_BasicDataType[int]): ... +class Int32Type(_BasicDataType[int]): ... +class UInt64Type(_BasicDataType[int]): ... +class Int64Type(_BasicDataType[int]): ... +class Float16Type(_BasicDataType[float]): ... +class Float32Type(_BasicDataType[float]): ... +class Float64Type(_BasicDataType[float]): ... +class Date32Type(_BasicDataType[dt.date]): ... +class Date64Type(_BasicDataType[dt.date]): ... +class MonthDayNanoIntervalType(_BasicDataType[MonthDayNano]): ... +class StringType(_BasicDataType[str]): ... +class LargeStringType(_BasicDataType[str]): ... +class StringViewType(_BasicDataType[str]): ... +class BinaryType(_BasicDataType[bytes]): ... +class LargeBinaryType(_BasicDataType[bytes]): ... +class BinaryViewType(_BasicDataType[bytes]): ... + +_Unit = TypeVar("_Unit", bound=Literal["s", "ms", "us", "ns"], default=Literal["us"]) +_Tz = TypeVar("_Tz", str, None, default=None) + +class TimestampType(_BasicDataType[int], Generic[_Unit, _Tz]): + """ + Concrete class for timestamp data types. + + Examples + -------- + >>> import pyarrow as pa + + Create an instance of timestamp type: + + >>> pa.timestamp("us") + TimestampType(timestamp[us]) + + Create an instance of timestamp type with timezone: + + >>> pa.timestamp("s", tz="UTC") + TimestampType(timestamp[s, tz=UTC]) + """ + @property + def unit(self) -> _Unit: + """ + The timestamp unit ('s', 'ms', 'us' or 'ns'). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.timestamp("us") + >>> t.unit + 'us' + """ + @property + def tz(self) -> _Tz: + """ + The timestamp time zone, if any, or None. + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.timestamp("s", tz="UTC") + >>> t.tz + 'UTC' + """ + +_Time32Unit = TypeVar("_Time32Unit", bound=Literal["s", "ms"]) + +class Time32Type(_BasicDataType[dt.time], Generic[_Time32Unit]): + """ + Concrete class for time32 data types. + + Supported time unit resolutions are 's' [second] + and 'ms' [millisecond]. + + Examples + -------- + Create an instance of time32 type: + + >>> import pyarrow as pa + >>> pa.time32("ms") + Time32Type(time32[ms]) + """ + @property + def unit(self) -> _Time32Unit: + """ + The time unit ('s' or 'ms'). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.time32("ms") + >>> t.unit + 'ms' + """ + +_Time64Unit = TypeVar("_Time64Unit", bound=Literal["us", "ns"]) + +class Time64Type(_BasicDataType[dt.time], Generic[_Time64Unit]): + """ + Concrete class for time64 data types. + + Supported time unit resolutions are 'us' [microsecond] + and 'ns' [nanosecond]. 
+ + Examples + -------- + Create an instance of time64 type: + + >>> import pyarrow as pa + >>> pa.time64("us") + Time64Type(time64[us]) + """ + @property + def unit(self) -> _Time64Unit: + """ + The time unit ('us' or 'ns'). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.time64("us") + >>> t.unit + 'us' + """ + +class DurationType(_BasicDataType[dt.timedelta], Generic[_Unit]): + """ + Concrete class for duration data types. + + Examples + -------- + Create an instance of duration type: + + >>> import pyarrow as pa + >>> pa.duration("s") + DurationType(duration[s]) + """ + @property + def unit(self) -> _Unit: + """ + The duration unit ('s', 'ms', 'us' or 'ns'). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.duration("s") + >>> t.unit + 's' + """ + +class FixedSizeBinaryType(_BasicDataType[Decimal]): + """ + Concrete class for fixed-size binary data types. + + Examples + -------- + Create an instance of fixed-size binary type: + + >>> import pyarrow as pa + >>> pa.binary(3) + FixedSizeBinaryType(fixed_size_binary[3]) + """ + +_Precision = TypeVar("_Precision", default=Any) +_Scale = TypeVar("_Scale", default=Any) + +class Decimal32Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): + """ + Concrete class for decimal32 data types. + + Examples + -------- + Create an instance of decimal32 type: + + >>> import pyarrow as pa + >>> pa.decimal32(5, 2) + Decimal32Type(decimal32(5, 2)) + """ + @property + def precision(self) -> _Precision: + """ + The decimal precision, in number of decimal digits (an integer). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.decimal32(5, 2) + >>> t.precision + 5 + """ + @property + def scale(self) -> _Scale: + """ + The decimal scale (an integer). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.decimal32(5, 2) + >>> t.scale + 2 + """ + +class Decimal64Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): + """ + Concrete class for decimal64 data types. + + Examples + -------- + Create an instance of decimal64 type: + + >>> import pyarrow as pa + >>> pa.decimal64(5, 2) + Decimal64Type(decimal64(5, 2)) + """ + @property + def precision(self) -> _Precision: + """ + The decimal precision, in number of decimal digits (an integer). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.decimal64(5, 2) + >>> t.precision + 5 + """ + @property + def scale(self) -> _Scale: + """ + The decimal scale (an integer). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.decimal64(5, 2) + >>> t.scale + 2 + """ + +class Decimal128Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): + """ + Concrete class for decimal128 data types. + + Examples + -------- + Create an instance of decimal128 type: + + >>> import pyarrow as pa + >>> pa.decimal128(5, 2) + Decimal128Type(decimal128(5, 2)) + """ + @property + def precision(self) -> _Precision: + """ + The decimal precision, in number of decimal digits (an integer). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.decimal128(5, 2) + >>> t.precision + 5 + """ + @property + def scale(self) -> _Scale: + """ + The decimal scale (an integer). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.decimal128(5, 2) + >>> t.scale + 2 + """ + +class Decimal256Type(FixedSizeBinaryType, Generic[_Precision, _Scale]): + """ + Concrete class for decimal256 data types. 
+ + Examples + -------- + Create an instance of decimal256 type: + + >>> import pyarrow as pa + >>> pa.decimal256(76, 38) + Decimal256Type(decimal256(76, 38)) + """ + @property + def precision(self) -> _Precision: + """ + The decimal precision, in number of decimal digits (an integer). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.decimal256(76, 38) + >>> t.precision + 76 + """ + @property + def scale(self) -> _Scale: + """ + The decimal scale (an integer). + + Examples + -------- + >>> import pyarrow as pa + >>> t = pa.decimal256(76, 38) + >>> t.scale + 38 + """ + +class ListType(DataType, Generic[_DataTypeT]): + """ + Concrete class for list data types. + + Examples + -------- + Create an instance of ListType: + + >>> import pyarrow as pa + >>> pa.list_(pa.string()) + ListType(list) + """ + @property + def value_field(self) -> Field[_DataTypeT]: + """ + The field for list values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.list_(pa.string()).value_field + pyarrow.Field + """ + @property + def value_type(self) -> _DataTypeT: + """ + The data type of list values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.list_(pa.string()).value_type + DataType(string) + """ + +class LargeListType(DataType, Generic[_DataTypeT]): + """ + Concrete class for large list data types + (like ListType, but with 64-bit offsets). + + Examples + -------- + Create an instance of LargeListType: + + >>> import pyarrow as pa + >>> pa.large_list(pa.string()) + LargeListType(large_list) + """ + @property + def value_field(self) -> Field[_DataTypeT]: ... + @property + def value_type(self) -> _DataTypeT: + """ + The data type of large list values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.large_list(pa.string()).value_type + DataType(string) + """ + +class ListViewType(DataType, Generic[_DataTypeT]): + """ + Concrete class for list view data types. + + Examples + -------- + Create an instance of ListViewType: + + >>> import pyarrow as pa + >>> pa.list_view(pa.string()) + ListViewType(list_view) + """ + @property + def value_field(self) -> Field[_DataTypeT]: + """ + The field for list view values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.list_view(pa.string()).value_field + pyarrow.Field + """ + @property + def value_type(self) -> _DataTypeT: + """ + The data type of list view values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.list_view(pa.string()).value_type + DataType(string) + """ + +class LargeListViewType(DataType, Generic[_DataTypeT]): + """ + Concrete class for large list view data types + (like ListViewType, but with 64-bit offsets). + + Examples + -------- + Create an instance of LargeListViewType: + + >>> import pyarrow as pa + >>> pa.large_list_view(pa.string()) + LargeListViewType(large_list_view) + """ + @property + def value_field(self) -> Field[_DataTypeT]: + """ + The field for large list view values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.large_list_view(pa.string()).value_field + pyarrow.Field + """ + @property + def value_type(self) -> _DataTypeT: + """ + The data type of large list view values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.large_list_view(pa.string()).value_type + DataType(string) + """ + +class FixedSizeListType(DataType, Generic[_DataTypeT, _Size]): + """ + Concrete class for fixed size list data types. 
+ + Examples + -------- + Create an instance of FixedSizeListType: + + >>> import pyarrow as pa + >>> pa.list_(pa.int32(), 2) + FixedSizeListType(fixed_size_list[2]) + """ + @property + def value_field(self) -> Field[_DataTypeT]: + """ + The field for list values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.list_(pa.int32(), 2).value_field + pyarrow.Field + """ + @property + def value_type(self) -> _DataTypeT: + """ + The data type of large list values. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.list_(pa.int32(), 2).value_type + DataType(int32) + """ + @property + def list_size(self) -> _Size: + """ + The size of the fixed size lists. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.list_(pa.int32(), 2).list_size + 2 + """ + +class DictionaryMemo(_Weakrefable): + """ + Tracking container for dictionary-encoded fields. + """ + +_IndexT = TypeVar( + "_IndexT", + UInt8Type, + Int8Type, + UInt16Type, + Int16Type, + Uint32Type, + Int32Type, + UInt64Type, + Int64Type, +) +_BasicValueT = TypeVar("_BasicValueT", bound=_BasicDataType) +_ValueT = TypeVar("_ValueT", bound=DataType) +_Ordered = TypeVar("_Ordered", Literal[True], Literal[False], default=Literal[False]) + +class DictionaryType(DataType, Generic[_IndexT, _BasicValueT, _Ordered]): + """ + Concrete class for dictionary data types. + + Examples + -------- + Create an instance of dictionary type: + + >>> import pyarrow as pa + >>> pa.dictionary(pa.int64(), pa.utf8()) + DictionaryType(dictionary) + """ + + @property + def ordered(self) -> _Ordered: + """ + Whether the dictionary is ordered, i.e. whether the ordering of values + in the dictionary is important. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.dictionary(pa.int64(), pa.utf8()).ordered + False + """ + @property + def index_type(self) -> _IndexT: + """ + The data type of dictionary indices (a signed integer type). + + Examples + -------- + >>> import pyarrow as pa + >>> pa.dictionary(pa.int16(), pa.utf8()).index_type + DataType(int16) + """ + @property + def value_type(self) -> _BasicValueT: + """ + The dictionary value type. + + The dictionary values are found in an instance of DictionaryArray. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.dictionary(pa.int16(), pa.utf8()).value_type + DataType(string) + """ + +_K = TypeVar("_K", bound=DataType) + +class MapType(DataType, Generic[_K, _ValueT, _Ordered]): + """ + Concrete class for map data types. + + Examples + -------- + Create an instance of MapType: + + >>> import pyarrow as pa + >>> pa.map_(pa.string(), pa.int32()) + MapType(map) + >>> pa.map_(pa.string(), pa.int32(), keys_sorted=True) + MapType(map) + """ + + @property + def key_field(self) -> Field[_K]: + """ + The field for keys in the map entries. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.map_(pa.string(), pa.int32()).key_field + pyarrow.Field + """ + @property + def key_type(self) -> _K: + """ + The data type of keys in the map entries. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.map_(pa.string(), pa.int32()).key_type + DataType(string) + """ + @property + def item_field(self) -> Field[_ValueT]: + """ + The field for items in the map entries. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.map_(pa.string(), pa.int32()).item_field + pyarrow.Field + """ + @property + def item_type(self) -> _ValueT: + """ + The data type of items in the map entries. 
+ + Examples + -------- + >>> import pyarrow as pa + >>> pa.map_(pa.string(), pa.int32()).item_type + DataType(int32) + """ + @property + def keys_sorted(self) -> _Ordered: + """ + Should the entries be sorted according to keys. + + Examples + -------- + >>> import pyarrow as pa + >>> pa.map_(pa.string(), pa.int32(), keys_sorted=True).keys_sorted + True + """ + +_Size = TypeVar("_Size", default=int) + +class StructType(DataType): + """ + Concrete class for struct data types. + + ``StructType`` supports direct indexing using ``[...]`` (implemented via + ``__getitem__``) to access its fields. + It will return the struct field with the given index or name. + + Examples + -------- + >>> import pyarrow as pa + + Accessing fields using direct indexing: + + >>> struct_type = pa.struct({"x": pa.int32(), "y": pa.string()}) + >>> struct_type[0] + pyarrow.Field + >>> struct_type["y"] + pyarrow.Field + + Accessing fields using ``field()``: + + >>> struct_type.field(1) + pyarrow.Field + >>> struct_type.field("x") + pyarrow.Field + + # Creating a schema from the struct type's fields: + >>> pa.schema(list(struct_type)) + x: int32 + y: string + """ + def get_field_index(self, name: str) -> int: + """ + Return index of the unique field with the given name. + + Parameters + ---------- + name : str + The name of the field to look up. + + Returns + ------- + index : int + The index of the field with the given name; -1 if the + name isn't found or there are several fields with the given + name. + + Examples + -------- + >>> import pyarrow as pa + >>> struct_type = pa.struct({"x": pa.int32(), "y": pa.string()}) + + Index of the field with a name 'y': + + >>> struct_type.get_field_index("y") + 1 + + Index of the field that does not exist: + + >>> struct_type.get_field_index("z") + -1 + """ + def field(self, i: int | str) -> Field: + """ + Select a field by its column name or numeric index. + + Parameters + ---------- + i : int or str + + Returns + ------- + pyarrow.Field + + Examples + -------- + + >>> import pyarrow as pa + >>> struct_type = pa.struct({"x": pa.int32(), "y": pa.string()}) + + Select the second field: + + >>> struct_type.field(1) + pyarrow.Field + + Select the field named 'x': + + >>> struct_type.field("x") + pyarrow.Field + """ + def get_all_field_indices(self, name: str) -> list[int]: + """ + Return sorted list of indices for the fields with the given name. + + Parameters + ---------- + name : str + The name of the field to look up. + + Returns + ------- + indices : List[int] + + Examples + -------- + >>> import pyarrow as pa + >>> struct_type = pa.struct({"x": pa.int32(), "y": pa.string()}) + >>> struct_type.get_all_field_indices("x") + [0] + """ + def __len__(self) -> int: ... + def __iter__(self) -> Iterator[Field]: ... + __getitem__ = field # pyright: ignore[reportUnknownVariableType] + @property + def names(self) -> list[str]: + """ + Lists the field names. + + Examples + -------- + >>> import pyarrow as pa + >>> struct_type = pa.struct([("a", pa.int64()), ("b", pa.float64()), ("c", pa.string())]) + >>> struct_type.names + ['a', 'b', 'c'] + """ + @property + def fields(self) -> list[Field]: + """ + Lists all fields within the StructType. + + Examples + -------- + >>> import pyarrow as pa + >>> struct_type = pa.struct([("a", pa.int64()), ("b", pa.float64()), ("c", pa.string())]) + >>> struct_type.fields + [pyarrow.Field, pyarrow.Field, pyarrow.Field] + """ + +class UnionType(DataType): + """ + Base class for union data types. 
+ + Examples + -------- + Create an instance of a dense UnionType using ``pa.union``: + + >>> import pyarrow as pa + >>> ( + ... pa.union( + ... [pa.field("a", pa.binary(10)), pa.field("b", pa.string())], + ... mode=pa.lib.UnionMode_DENSE, + ... ), + ... ) + (DenseUnionType(dense_union),) + + Create an instance of a dense UnionType using ``pa.dense_union``: + + >>> pa.dense_union([pa.field("a", pa.binary(10)), pa.field("b", pa.string())]) + DenseUnionType(dense_union) + + Create an instance of a sparse UnionType using ``pa.union``: + + >>> ( + ... pa.union( + ... [pa.field("a", pa.binary(10)), pa.field("b", pa.string())], + ... mode=pa.lib.UnionMode_SPARSE, + ... ), + ... ) + (SparseUnionType(sparse_union),) + + Create an instance of a sparse UnionType using ``pa.sparse_union``: + + >>> pa.sparse_union([pa.field("a", pa.binary(10)), pa.field("b", pa.string())]) + SparseUnionType(sparse_union) + """ + @property + def mode(self) -> Literal["sparse", "dense"]: + """ + The mode of the union ("dense" or "sparse"). + + Examples + -------- + >>> import pyarrow as pa + >>> union = pa.sparse_union([pa.field("a", pa.binary(10)), pa.field("b", pa.string())]) + >>> union.mode + 'sparse' + """ + @property + def type_codes(self) -> list[int]: + """ + The type code to indicate each data type in this union. + + Examples + -------- + >>> import pyarrow as pa + >>> union = pa.sparse_union([pa.field("a", pa.binary(10)), pa.field("b", pa.string())]) + >>> union.type_codes + [0, 1] + """ + def __len__(self) -> int: ... + def __iter__(self) -> Iterator[Field]: ... + def field(self, i: int) -> Field: + """ + Return a child field by its numeric index. + + Parameters + ---------- + i : int + + Returns + ------- + pyarrow.Field + + Examples + -------- + >>> import pyarrow as pa + >>> union = pa.sparse_union([pa.field("a", pa.binary(10)), pa.field("b", pa.string())]) + >>> union[0] + pyarrow.Field + """ + __getitem__ = field # pyright: ignore[reportUnknownVariableType] + +class SparseUnionType(UnionType): + """ + Concrete class for sparse union types. + + Examples + -------- + Create an instance of a sparse UnionType using ``pa.union``: + + >>> import pyarrow as pa + >>> ( + ... pa.union( + ... [pa.field("a", pa.binary(10)), pa.field("b", pa.string())], + ... mode=pa.lib.UnionMode_SPARSE, + ... ), + ... ) + (SparseUnionType(sparse_union),) + + Create an instance of a sparse UnionType using ``pa.sparse_union``: + + >>> pa.sparse_union([pa.field("a", pa.binary(10)), pa.field("b", pa.string())]) + SparseUnionType(sparse_union) + """ + @property + def mode(self) -> Literal["sparse"]: ... + +class DenseUnionType(UnionType): + """ + Concrete class for dense union types. + + Examples + -------- + Create an instance of a dense UnionType using ``pa.union``: + + >>> import pyarrow as pa + >>> ( + ... pa.union( + ... [pa.field("a", pa.binary(10)), pa.field("b", pa.string())], + ... mode=pa.lib.UnionMode_DENSE, + ... ), + ... ) + (DenseUnionType(dense_union),) + + Create an instance of a dense UnionType using ``pa.dense_union``: + + >>> pa.dense_union([pa.field("a", pa.binary(10)), pa.field("b", pa.string())]) + DenseUnionType(dense_union) + """ + + @property + def mode(self) -> Literal["dense"]: ... + +_RunEndType = TypeVar("_RunEndType", Int16Type, Int32Type, Int64Type) + +class RunEndEncodedType(DataType, Generic[_RunEndType, _BasicValueT]): + """ + Concrete class for run-end encoded types. + """ + @property + def run_end_type(self) -> _RunEndType: ... + @property + def value_type(self) -> _BasicValueT: ... 
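+
+# NOTE (editor's sketch, not part of the upstream stub): RunEndEncodedType is
+# documented above without a usage example, unlike the other parametrized types
+# in this file. A minimal sketch, assuming the ``pa.run_end_encoded()`` factory
+# available in recent pyarrow releases; the output reprs follow the conventions
+# used elsewhere in this file:
+#
+#   >>> import pyarrow as pa
+#   >>> ree = pa.run_end_encoded(pa.int32(), pa.utf8())
+#   >>> ree.run_end_type
+#   DataType(int32)
+#   >>> ree.value_type
+#   DataType(string)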
+ +_StorageT = TypeVar("_StorageT", bound=Array | ChunkedArray) + +class BaseExtensionType(DataType): + """ + Concrete base class for extension types. + """ + def __arrow_ext_class__(self) -> type[ExtensionArray]: + """ + The associated array extension class + """ + def __arrow_ext_scalar_class__(self) -> type[ExtensionScalar]: + """ + The associated scalar class + """ + @property + def extension_name(self) -> str: + """ + The extension type name. + """ + @property + def storage_type(self) -> DataType: + """ + The underlying storage type. + """ + def wrap_array(self, storage: _StorageT) -> _StorageT: ... + +class ExtensionType(BaseExtensionType): + """ + Concrete base class for Python-defined extension types. + + Parameters + ---------- + storage_type : DataType + The underlying storage type for the extension type. + extension_name : str + A unique name distinguishing this extension type. The name will be + used when deserializing IPC data. + + Examples + -------- + Define a RationalType extension type subclassing ExtensionType: + + >>> import pyarrow as pa + >>> class RationalType(pa.ExtensionType): + ... def __init__(self, data_type: pa.DataType): + ... if not pa.types.is_integer(data_type): + ... raise TypeError(f"data_type must be an integer type not {data_type}") + ... super().__init__( + ... pa.struct( + ... [ + ... ("numer", data_type), + ... ("denom", data_type), + ... ], + ... ), + ... # N.B. This name does _not_ reference `data_type` so deserialization + ... # will work for _any_ integer `data_type` after registration + ... "my_package.rational", + ... ) + ... def __arrow_ext_serialize__(self) -> bytes: + ... # No parameters are necessary + ... return b"" + ... @classmethod + ... def __arrow_ext_deserialize__(cls, storage_type, serialized): + ... # return an instance of this subclass + ... return RationalType(storage_type[0].type) + + Register the extension type: + + >>> pa.register_extension_type(RationalType(pa.int64())) + + Create an instance of RationalType extension type: + + >>> rational_type = RationalType(pa.int32()) + + Inspect the extension type: + + >>> rational_type.extension_name + 'my_package.rational' + >>> rational_type.storage_type + StructType(struct) + + Wrap an array as an extension array: + + >>> storage_array = pa.array( + ... [ + ... {"numer": 10, "denom": 17}, + ... {"numer": 20, "denom": 13}, + ... ], + ... type=rational_type.storage_type, + ... ) + >>> rational_array = rational_type.wrap_array(storage_array) + >>> rational_array + + -- is_valid: all not null + -- child 0 type: int32 + [ + 10, + 20 + ] + -- child 1 type: int32 + [ + 17, + 13 + ] + + Or do the same with creating an ExtensionArray: + + >>> rational_array = pa.ExtensionArray.from_storage(rational_type, storage_array) + >>> rational_array + + -- is_valid: all not null + -- child 0 type: int32 + [ + 10, + 20 + ] + -- child 1 type: int32 + [ + 17, + 13 + ] + + Unregister the extension type: + + >>> pa.unregister_extension_type("my_package.rational") + + Note that even though we registered the concrete type + ``RationalType(pa.int64())``, PyArrow will be able to deserialize + ``RationalType(integer_type)`` for any ``integer_type``, as the deserializer + will reference the name ``my_package.rational`` and the ``@classmethod`` + ``__arrow_ext_deserialize__``. + """ + + def __init__(self, storage_type: DataType, extension_name: str) -> None: ... + def __arrow_ext_serialize__(self) -> bytes: + """ + Serialized representation of metadata to reconstruct the type object. 
+ + This method should return a bytes object, and those serialized bytes + are stored in the custom metadata of the Field holding an extension + type in an IPC message. + The bytes are passed to ``__arrow_ext_deserialize`` and should hold + sufficient information to reconstruct the data type instance. + """ + @classmethod + def __arrow_ext_deserialize__(cls, storage_type: DataType, serialized: bytes) -> Self: + """ + Return an extension type instance from the storage type and serialized + metadata. + + This method should return an instance of the ExtensionType subclass + that matches the passed storage type and serialized metadata (the + return value of ``__arrow_ext_serialize__``). + """ + +class FixedShapeTensorType(BaseExtensionType, Generic[_ValueT]): + """ + Concrete class for fixed shape tensor extension type. + + Examples + -------- + Create an instance of fixed shape tensor extension type: + + >>> import pyarrow as pa + >>> pa.fixed_shape_tensor(pa.int32(), [2, 2]) + FixedShapeTensorType(extension) + + Create an instance of fixed shape tensor extension type with + permutation: + + >>> tensor_type = pa.fixed_shape_tensor(pa.int8(), (2, 2, 3), permutation=[0, 2, 1]) + >>> tensor_type.permutation + [0, 2, 1] + """ + @property + def value_type(self) -> _ValueT: + """ + Data type of an individual tensor. + """ + @property + def shape(self) -> list[int]: + """ + Shape of the tensors. + """ + @property + def dim_names(self) -> list[str] | None: + """ + Explicit names of the dimensions. + """ + @property + def permutation(self) -> list[int] | None: + """ + Indices of the dimensions ordering. + """ + +class Bool8Type(BaseExtensionType): + """ + Concrete class for bool8 extension type. + + Bool8 is an alternate representation for boolean + arrays using 8 bits instead of 1 bit per value. The underlying + storage type is int8. + + Examples + -------- + Create an instance of bool8 extension type: + + >>> import pyarrow as pa + >>> pa.bool8() + Bool8Type(extension) + """ + +class UuidType(BaseExtensionType): + """ + Concrete class for UUID extension type. + """ + +class JsonType(BaseExtensionType): + """ + Concrete class for JSON extension type. + + Examples + -------- + Define the extension type for JSON array + + >>> import pyarrow as pa + >>> json_type = pa.json_(pa.large_utf8()) + + Create an extension array + + >>> arr = [None, '{ "id":30, "values":["a", "b"] }'] + >>> storage = pa.array(arr, pa.large_utf8()) + >>> pa.ExtensionArray.from_storage(json_type, storage) + + [ + null, + "{ "id":30, "values":["a", "b"] }" + ] + """ + +class OpaqueType(BaseExtensionType): + """ + Concrete class for opaque extension type. + + Opaque is a placeholder for a type from an external (often non-Arrow) + system that could not be interpreted. + + Examples + -------- + Create an instance of opaque extension type: + + >>> import pyarrow as pa + >>> pa.opaque(pa.int32(), "geometry", "postgis") + OpaqueType(extension) + """ + @property + def type_name(self) -> str: + """ + The name of the type in the external system. + """ + @property + def vendor_name(self) -> str: + """ + The name of the external system. + """ + +@deprecated( + "This class is deprecated and its deserialization is disabled by default. " + ":class:`ExtensionType` is recommended instead." +) +class PyExtensionType(ExtensionType): + """ + Concrete base class for Python-defined extension types based on pickle + for (de)serialization. + + .. warning:: + This class is deprecated and its deserialization is disabled by default. 
+ :class:`ExtensionType` is recommended instead. + + Parameters + ---------- + storage_type : DataType + The storage type for which the extension is built. + """ + def __init__(self, storage_type: DataType) -> None: ... + @classmethod + def set_auto_load(cls, value: bool) -> None: + """ + Enable or disable auto-loading of serialized PyExtensionType instances. + + Parameters + ---------- + value : bool + Whether to enable auto-loading. + """ + +class UnknownExtensionType(PyExtensionType): # type: ignore + """ + A concrete class for Python-defined extension types that refer to + an unknown Python implementation. + + Parameters + ---------- + storage_type : DataType + The storage type for which the extension is built. + serialized : bytes + The serialised output. + """ + def __init__(self, storage_type: DataType, serialized: bytes) -> None: ... + +def register_extension_type(ext_type: PyExtensionType) -> None: # type: ignore + """ + Register a Python extension type. + + Registration is based on the extension name (so different registered types + need unique extension names). Registration needs an extension type + instance, but then works for any instance of the same subclass regardless + of parametrization of the type. + + Parameters + ---------- + ext_type : BaseExtensionType instance + The ExtensionType subclass to register. + + Examples + -------- + Define a RationalType extension type subclassing ExtensionType: + + >>> import pyarrow as pa + >>> class RationalType(pa.ExtensionType): + ... def __init__(self, data_type: pa.DataType): + ... if not pa.types.is_integer(data_type): + ... raise TypeError(f"data_type must be an integer type not {data_type}") + ... super().__init__( + ... pa.struct( + ... [ + ... ("numer", data_type), + ... ("denom", data_type), + ... ], + ... ), + ... # N.B. This name does _not_ reference `data_type` so deserialization + ... # will work for _any_ integer `data_type` after registration + ... "my_package.rational", + ... ) + ... def __arrow_ext_serialize__(self) -> bytes: + ... # No parameters are necessary + ... return b"" + ... @classmethod + ... def __arrow_ext_deserialize__(cls, storage_type, serialized): + ... # return an instance of this subclass + ... return RationalType(storage_type[0].type) + + Register the extension type: + + >>> pa.register_extension_type(RationalType(pa.int64())) + + Unregister the extension type: + + >>> pa.unregister_extension_type("my_package.rational") + """ + +def unregister_extension_type(type_name: str) -> None: + """ + Unregister a Python extension type. + + Parameters + ---------- + type_name : str + The name of the ExtensionType subclass to unregister. + + Examples + -------- + Define a RationalType extension type subclassing ExtensionType: + + >>> import pyarrow as pa + >>> class RationalType(pa.ExtensionType): + ... def __init__(self, data_type: pa.DataType): + ... if not pa.types.is_integer(data_type): + ... raise TypeError(f"data_type must be an integer type not {data_type}") + ... super().__init__( + ... pa.struct( + ... [ + ... ("numer", data_type), + ... ("denom", data_type), + ... ], + ... ), + ... # N.B. This name does _not_ reference `data_type` so deserialization + ... # will work for _any_ integer `data_type` after registration + ... "my_package.rational", + ... ) + ... def __arrow_ext_serialize__(self) -> bytes: + ... # No parameters are necessary + ... return b"" + ... @classmethod + ... def __arrow_ext_deserialize__(cls, storage_type, serialized): + ... # return an instance of this subclass + ... 
return RationalType(storage_type[0].type) + + Register the extension type: + + >>> pa.register_extension_type(RationalType(pa.int64())) + + Unregister the extension type: + + >>> pa.unregister_extension_type("my_package.rational") + """ + +class KeyValueMetadata(_Metadata, Mapping[bytes, bytes]): + """ + KeyValueMetadata + + Parameters + ---------- + __arg0__ : dict + A dict of the key-value metadata + **kwargs : optional + additional key-value metadata + """ + def __init__(self, __arg0__: Mapping[bytes, bytes] | None = None, **kwargs) -> None: ... + def equals(self, other: KeyValueMetadata) -> bool: ... + def __len__(self) -> int: ... + def __contains__(self, __key: object) -> bool: ... + def __getitem__(self, __key: Any) -> Any: ... + def __iter__(self) -> Iterator[bytes]: ... + def get_all(self, key: str) -> list[bytes]: ... + def to_dict(self) -> dict[bytes, bytes]: + """ + Convert KeyValueMetadata to dict. If a key occurs twice, the value for + the first one is returned + """ + +def ensure_metadata( + meta: Mapping[bytes | str, bytes | str] | KeyValueMetadata | None, allow_none: bool = False +) -> KeyValueMetadata | None: ... + +class Field(_Weakrefable, Generic[_DataTypeT]): + """ + A named field, with a data type, nullability, and optional metadata. + + Notes + ----- + Do not use this class's constructor directly; use pyarrow.field + + Examples + -------- + Create an instance of pyarrow.Field: + + >>> import pyarrow as pa + >>> pa.field("key", pa.int32()) + pyarrow.Field + >>> pa.field("key", pa.int32(), nullable=False) + pyarrow.Field + >>> field = pa.field("key", pa.int32(), metadata={"key": "Something important"}) + >>> field + pyarrow.Field + >>> field.metadata + {b'key': b'Something important'} + + Use the field to create a struct type: + + >>> pa.struct([field]) + StructType(struct) + """ + + def equals(self, other: Field, check_metadata: bool = False) -> bool: + """ + Test if this field is equal to the other + + Parameters + ---------- + other : pyarrow.Field + check_metadata : bool, default False + Whether Field metadata equality should be checked as well. + + Returns + ------- + is_equal : bool + + Examples + -------- + >>> import pyarrow as pa + >>> f1 = pa.field("key", pa.int32()) + >>> f2 = pa.field("key", pa.int32(), nullable=False) + >>> f1.equals(f2) + False + >>> f1.equals(f1) + True + """ + def __hash__(self) -> int: ... + @property + def nullable(self) -> bool: + """ + The field nullability. + + Examples + -------- + >>> import pyarrow as pa + >>> f1 = pa.field("key", pa.int32()) + >>> f2 = pa.field("key", pa.int32(), nullable=False) + >>> f1.nullable + True + >>> f2.nullable + False + """ + @property + def name(self) -> str: + """ + The field name. + + Examples + -------- + >>> import pyarrow as pa + >>> field = pa.field("key", pa.int32()) + >>> field.name + 'key' + """ + @property + def metadata(self) -> dict[bytes, bytes] | None: + """ + The field metadata (if any is set). + + Returns + ------- + metadata : dict or None + + Examples + -------- + >>> import pyarrow as pa + >>> field = pa.field("key", pa.int32(), metadata={"key": "Something important"}) + >>> field.metadata + {b'key': b'Something important'} + """ + @property + def type(self) -> _DataTypeT: ... 
+ def with_metadata(self, metadata: dict[bytes | str, bytes | str]) -> Self: + """ + Add metadata as dict of string keys and values to Field + + Parameters + ---------- + metadata : dict + Keys and values must be string-like / coercible to bytes + + Returns + ------- + field : pyarrow.Field + + Examples + -------- + >>> import pyarrow as pa + >>> field = pa.field("key", pa.int32()) + + Create new field by adding metadata to existing one: + + >>> field_new = field.with_metadata({"key": "Something important"}) + >>> field_new + pyarrow.Field + >>> field_new.metadata + {b'key': b'Something important'} + """ + def remove_metadata(self) -> Self: + """ + Create new field without metadata, if any + + Returns + ------- + field : pyarrow.Field + + Examples + -------- + >>> import pyarrow as pa + >>> field = pa.field("key", pa.int32(), metadata={"key": "Something important"}) + >>> field.metadata + {b'key': b'Something important'} + + Create new field by removing the metadata from the existing one: + + >>> field_new = field.remove_metadata() + >>> field_new.metadata + """ + def with_type(self, new_type: _DataTypeT) -> Field[_DataTypeT]: + """ + A copy of this field with the replaced type + + Parameters + ---------- + new_type : pyarrow.DataType + + Returns + ------- + field : pyarrow.Field + + Examples + -------- + >>> import pyarrow as pa + >>> field = pa.field("key", pa.int32()) + >>> field + pyarrow.Field + + Create new field by replacing type of an existing one: + + >>> field_new = field.with_type(pa.int64()) + >>> field_new + pyarrow.Field + """ + def with_name(self, name: str) -> Self: + """ + A copy of this field with the replaced name + + Parameters + ---------- + name : str + + Returns + ------- + field : pyarrow.Field + + Examples + -------- + >>> import pyarrow as pa + >>> field = pa.field("key", pa.int32()) + >>> field + pyarrow.Field + + Create new field by replacing the name of an existing one: + + >>> field_new = field.with_name("lock") + >>> field_new + pyarrow.Field + """ + def with_nullable(self, nullable: bool) -> Field[_DataTypeT]: + """ + A copy of this field with the replaced nullability + + Parameters + ---------- + nullable : bool + + Returns + ------- + field: pyarrow.Field + + Examples + -------- + >>> import pyarrow as pa + >>> field = pa.field("key", pa.int32()) + >>> field + pyarrow.Field + >>> field.nullable + True + + Create new field by replacing the nullability of an existing one: + + >>> field_new = field.with_nullable(False) + >>> field_new + pyarrow.Field + >>> field_new.nullable + False + """ + def flatten(self) -> list[Field]: + """ + Flatten this field. If a struct field, individual child fields + will be returned with their names prefixed by the parent's name. + + Returns + ------- + fields : List[pyarrow.Field] + + Examples + -------- + >>> import pyarrow as pa + >>> f1 = pa.field("bar", pa.float64(), nullable=False) + >>> f2 = pa.field("foo", pa.int32()).with_metadata({"key": "Something important"}) + >>> ff = pa.field("ff", pa.struct([f1, f2]), nullable=False) + + Flatten a struct field: + + >>> ff + pyarrow.Field not null> + >>> ff.flatten() + [pyarrow.Field, pyarrow.Field] + """ + def _export_to_c(self, out_ptr: int) -> None: + """ + Export to a C ArrowSchema struct, given its pointer. + + Be careful: if you don't pass the ArrowSchema struct to a consumer, + its memory will leak. This is a low-level function intended for + expert users. 
+ """ + @classmethod + def _import_from_c(cls, in_ptr: int) -> Self: + """ + Import Field from a C ArrowSchema struct, given its pointer. + + This is a low-level function intended for expert users. + """ + def __arrow_c_schema__(self) -> Any: + """ + Export to a ArrowSchema PyCapsule + + Unlike _export_to_c, this will not leak memory if the capsule is not used. + """ + @classmethod + def _import_from_c_capsule(cls, schema) -> Self: + """ + Import a Field from a ArrowSchema PyCapsule + + Parameters + ---------- + schema : PyCapsule + A valid PyCapsule with name 'arrow_schema' containing an + ArrowSchema pointer. + """ + +class Schema(_Weakrefable): + """ + A named collection of types a.k.a schema. A schema defines the + column names and types in a record batch or table data structure. + They also contain metadata about the columns. For example, schemas + converted from Pandas contain metadata about their original Pandas + types so they can be converted back to the same types. + + Warnings + -------- + Do not call this class's constructor directly. Instead use + :func:`pyarrow.schema` factory function which makes a new Arrow + Schema object. + + Examples + -------- + Create a new Arrow Schema object: + + >>> import pyarrow as pa + >>> pa.schema([("some_int", pa.int32()), ("some_string", pa.string())]) + some_int: int32 + some_string: string + + Create Arrow Schema with metadata: + + >>> pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + """ + + def __len__(self) -> int: ... + def __getitem__(self, key: str) -> Field: ... + _field = __getitem__ # pyright: ignore[reportUnknownVariableType] + def __iter__(self) -> Iterator[Field]: ... + def __hash__(self) -> int: ... + def __sizeof__(self) -> int: ... + @property + def pandas_metadata(self) -> dict: + """ + Return deserialized-from-JSON pandas metadata field (if it exists) + + Examples + -------- + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_legs": [2, 4, 5, 100], + ... "animals": ["Flamingo", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> schema = pa.Table.from_pandas(df).schema + + Select pandas metadata field from Arrow Schema: + + >>> schema.pandas_metadata + {'index_columns': [{'kind': 'range', 'name': None, 'start': 0, 'stop': 4, 'step': 1}], ... + """ + @property + def names(self) -> list[str]: + """ + The schema's field names. + + Returns + ------- + list of str + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + + Get the names of the schema's fields: + + >>> schema.names + ['n_legs', 'animals'] + """ + @property + def types(self) -> list[DataType]: + """ + The schema's field types. + + Returns + ------- + list of DataType + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + + Get the types of the schema's fields: + + >>> schema.types + [DataType(int64), DataType(string)] + """ + @property + def metadata(self) -> dict[bytes, bytes]: + """ + The schema's metadata (if any is set). + + Returns + ------- + metadata: dict or None + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... 
metadata={"n_legs": "Number of legs per animal"}, + ... ) + + Get the metadata of the schema's fields: + + >>> schema.metadata + {b'n_legs': b'Number of legs per animal'} + """ + def empty_table(self) -> Table: + """ + Provide an empty table according to the schema. + + Returns + ------- + table: pyarrow.Table + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + + Create an empty table with schema's fields: + + >>> schema.empty_table() + pyarrow.Table + n_legs: int64 + animals: string + ---- + n_legs: [[]] + animals: [[]] + """ + def equals(self, other: Schema, check_metadata: bool = False) -> bool: + """ + Test if this schema is equal to the other + + Parameters + ---------- + other : pyarrow.Schema + check_metadata : bool, default False + Key/value metadata must be equal too + + Returns + ------- + is_equal : bool + + Examples + -------- + >>> import pyarrow as pa + >>> schema1 = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + >>> schema2 = pa.schema([("some_int", pa.int32()), ("some_string", pa.string())]) + + Test two equal schemas: + + >>> schema1.equals(schema1) + True + + Test two unequal schemas: + + >>> schema1.equals(schema2) + False + """ + @classmethod + def from_pandas(cls, df: pd.DataFrame, preserve_index: bool | None = None) -> Schema: + """ + Returns implied schema from dataframe + + Parameters + ---------- + df : pandas.DataFrame + preserve_index : bool, default True + Whether to store the index as an additional column (or columns, for + MultiIndex) in the resulting `Table`. + The default of None will store the index as a column, except for + RangeIndex which is stored as metadata only. Use + ``preserve_index=True`` to force it to be stored as a column. + + Returns + ------- + pyarrow.Schema + + Examples + -------- + >>> import pandas as pd + >>> import pyarrow as pa + >>> df = pd.DataFrame({"int": [1, 2], "str": ["a", "b"]}) + + Create an Arrow Schema from the schema of a pandas dataframe: + + >>> pa.Schema.from_pandas(df) + int: int64 + str: string + -- schema metadata -- + pandas: '{"index_columns": [{"kind": "range", "name": null, ... + """ + def field(self, i: int | str | bytes) -> Field: + """ + Select a field by its column name or numeric index. + + Parameters + ---------- + i : int or string + + Returns + ------- + pyarrow.Field + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + + Select the second field: + + >>> schema.field(1) + pyarrow.Field + + Select the field of the column named 'n_legs': + + >>> schema.field("n_legs") + pyarrow.Field + """ + @deprecated("Use 'field' instead") + def field_by_name(self, name: str) -> Field: + """ + DEPRECATED + + Parameters + ---------- + name : str + + Returns + ------- + field: pyarrow.Field + """ + def get_field_index(self, name: str) -> int: + """ + Return index of the unique field with the given name. + + Parameters + ---------- + name : str + The name of the field to look up. + + Returns + ------- + index : int + The index of the field with the given name; -1 if the + name isn't found or there are several fields with the given + name. 
+ + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + + Get the index of the field named 'animals': + + >>> schema.get_field_index("animals") + 1 + + Index in case of several fields with the given name: + + >>> schema = pa.schema( + ... [ + ... pa.field("n_legs", pa.int64()), + ... pa.field("animals", pa.string()), + ... pa.field("animals", pa.bool_()), + ... ], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + >>> schema.get_field_index("animals") + -1 + """ + def get_all_field_indices(self, name: str) -> list[int]: + """ + Return sorted list of indices for the fields with the given name. + + Parameters + ---------- + name : str + The name of the field to look up. + + Returns + ------- + indices : List[int] + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema( + ... [ + ... pa.field("n_legs", pa.int64()), + ... pa.field("animals", pa.string()), + ... pa.field("animals", pa.bool_()), + ... ] + ... ) + + Get the indexes of the fields named 'animals': + + >>> schema.get_all_field_indices("animals") + [1, 2] + """ + def append(self, field: Field) -> Schema: + """ + Append a field at the end of the schema. + + In contrast to Python's ``list.append()`` it does return a new + object, leaving the original Schema unmodified. + + Parameters + ---------- + field : Field + + Returns + ------- + schema: Schema + New object with appended field. + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + + Append a field 'extra' at the end of the schema: + + >>> schema_new = schema.append(pa.field("extra", pa.bool_())) + >>> schema_new + n_legs: int64 + animals: string + extra: bool + + Original schema is unmodified: + + >>> schema + n_legs: int64 + animals: string + """ + def insert(self, i: int, field: Field) -> Schema: + """ + Add a field at position i to the schema. + + Parameters + ---------- + i : int + field : Field + + Returns + ------- + schema: Schema + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + + Insert a new field on the second position: + + >>> schema.insert(1, pa.field("extra", pa.bool_())) + n_legs: int64 + extra: bool + animals: string + """ + def remove(self, i: int) -> Schema: + """ + Remove the field at index i from the schema. + + Parameters + ---------- + i : int + + Returns + ------- + schema: Schema + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + + Remove the second field of the schema: + + >>> schema.remove(1) + n_legs: int64 + """ + def set(self, i: int, field: Field) -> Schema: + """ + Replace a field at position i in the schema. 
+ + Parameters + ---------- + i : int + field : Field + + Returns + ------- + schema: Schema + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + + Replace the second field of the schema with a new field 'extra': + + >>> schema.set(1, pa.field("replaced", pa.bool_())) + n_legs: int64 + replaced: bool + """ + @deprecated("Use 'with_metadata' instead") + def add_metadata(self, metadata: dict) -> Schema: + """ + DEPRECATED + + Parameters + ---------- + metadata : dict + Keys and values must be string-like / coercible to bytes + """ + def with_metadata(self, metadata: dict) -> Schema: + """ + Add metadata as dict of string keys and values to Schema + + Parameters + ---------- + metadata : dict + Keys and values must be string-like / coercible to bytes + + Returns + ------- + schema : pyarrow.Schema + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + + Add metadata to existing schema field: + + >>> schema.with_metadata({"n_legs": "Number of legs per animal"}) + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + """ + def serialize(self, memory_pool: MemoryPool | None = None) -> Buffer: + """ + Write Schema to Buffer as encapsulated IPC message + + Parameters + ---------- + memory_pool : MemoryPool, default None + Uses default memory pool if not specified + + Returns + ------- + serialized : Buffer + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema([pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())]) + + Write schema to Buffer: + + >>> schema.serialize() + + """ + def remove_metadata(self) -> Schema: + """ + Create new schema without metadata, if any + + Returns + ------- + schema : pyarrow.Schema + + Examples + -------- + >>> import pyarrow as pa + >>> schema = pa.schema( + ... [pa.field("n_legs", pa.int64()), pa.field("animals", pa.string())], + ... metadata={"n_legs": "Number of legs per animal"}, + ... ) + >>> schema + n_legs: int64 + animals: string + -- schema metadata -- + n_legs: 'Number of legs per animal' + + Create a new schema with removing the metadata from the original: + + >>> schema.remove_metadata() + n_legs: int64 + animals: string + """ + def to_string( + self, + truncate_metadata: bool = True, + show_field_metadata: bool = True, + show_schema_metadata: bool = True, + ) -> str: + """ + Return human-readable representation of Schema + + Parameters + ---------- + truncate_metadata : boolean, default True + Limit metadata key/value display to a single line of ~80 characters + or less + show_field_metadata : boolean, default True + Display Field-level KeyValueMetadata + show_schema_metadata : boolean, default True + Display Schema-level KeyValueMetadata + + Returns + ------- + str : the formatted output + """ + def _export_to_c(self, out_ptr: int) -> None: + """ + Export to a C ArrowSchema struct, given its pointer. + + Be careful: if you don't pass the ArrowSchema struct to a consumer, + its memory will leak. This is a low-level function intended for + expert users. + """ + @classmethod + def _import_from_c(cls, in_ptr: int) -> Schema: + """ + Import Schema from a C ArrowSchema struct, given its pointer. + + This is a low-level function intended for expert users. 
+ """ + def __arrow_c_schema__(self) -> Any: + """ + Export to a ArrowSchema PyCapsule + + Unlike _export_to_c, this will not leak memory if the capsule is not used. + """ + @staticmethod + def _import_from_c_capsule(schema: Any) -> Schema: + """ + Import a Schema from a ArrowSchema PyCapsule + + Parameters + ---------- + schema : PyCapsule + A valid PyCapsule with name 'arrow_schema' containing an + ArrowSchema pointer. + """ + +def unify_schemas( + schemas: list[Schema], *, promote_options: Literal["default", "permissive"] = "default" +) -> Schema: + """ + Unify schemas by merging fields by name. + + The resulting schema will contain the union of fields from all schemas. + Fields with the same name will be merged. Note that two fields with + different types will fail merging by default. + + - The unified field will inherit the metadata from the schema where + that field is first defined. + - The first N fields in the schema will be ordered the same as the + N fields in the first schema. + + The resulting schema will inherit its metadata from the first input + schema. + + Parameters + ---------- + schemas : list of Schema + Schemas to merge into a single one. + promote_options : str, default default + Accepts strings "default" and "permissive". + Default: null and only null can be unified with another type. + Permissive: types are promoted to the greater common denominator. + + Returns + ------- + Schema + + Raises + ------ + ArrowInvalid : + If any input schema contains fields with duplicate names. + If Fields of the same name are not mergeable. + """ + +@overload +def field(name: SupportArrowSchema) -> Field[Any]: ... +@overload +def field( + name: str, type: _DataTypeT, nullable: bool = ..., metadata: dict[Any, Any] | None = None +) -> Field[_DataTypeT]: ... +def field(*args, **kwargs): + """ + Create a pyarrow.Field instance. + + Parameters + ---------- + name : str or bytes + Name of the field. + Alternatively, you can also pass an object that implements the Arrow + PyCapsule Protocol for schemas (has an ``__arrow_c_schema__`` method). + type : pyarrow.DataType or str + Arrow datatype of the field or a string matching one. + nullable : bool, default True + Whether the field's values are nullable. + metadata : dict, default None + Optional field metadata, the keys and values must be coercible to + bytes. + + Returns + ------- + field : pyarrow.Field + + Examples + -------- + Create an instance of pyarrow.Field: + + >>> import pyarrow as pa + >>> pa.field("key", pa.int32()) + pyarrow.Field + >>> pa.field("key", pa.int32(), nullable=False) + pyarrow.Field + + >>> field = pa.field("key", pa.int32(), metadata={"key": "Something important"}) + >>> field + pyarrow.Field + >>> field.metadata + {b'key': b'Something important'} + + Use the field to create a struct type: + + >>> pa.struct([field]) + StructType(struct) + + A str can also be passed for the type parameter: + + >>> pa.field("key", "int32") + pyarrow.Field + """ + +def null() -> NullType: + """ + Create instance of null type. + + Examples + -------- + Create an instance of a null type: + + >>> import pyarrow as pa + >>> pa.null() + DataType(null) + >>> print(pa.null()) + null + + Create a ``Field`` type with a null type and a name: + + >>> pa.field("null_field", pa.null()) + pyarrow.Field + """ + +def bool_() -> BoolType: + """ + Create instance of boolean type. 
+ + Examples + -------- + Create an instance of a boolean type: + + >>> import pyarrow as pa + >>> pa.bool_() + DataType(bool) + >>> print(pa.bool_()) + bool + + Create a ``Field`` type with a boolean type + and a name: + + >>> pa.field("bool_field", pa.bool_()) + pyarrow.Field + """ + +def uint8() -> UInt8Type: + """ + Create instance of unsigned int8 type. + + Examples + -------- + Create an instance of unsigned int8 type: + + >>> import pyarrow as pa + >>> pa.uint8() + DataType(uint8) + >>> print(pa.uint8()) + uint8 + + Create an array with unsigned int8 type: + + >>> pa.array([0, 1, 2], type=pa.uint8()) + + [ + 0, + 1, + 2 + ] + """ + +def int8() -> Int8Type: + """ + Create instance of signed int8 type. + + Examples + -------- + Create an instance of int8 type: + + >>> import pyarrow as pa + >>> pa.int8() + DataType(int8) + >>> print(pa.int8()) + int8 + + Create an array with int8 type: + + >>> pa.array([0, 1, 2], type=pa.int8()) + + [ + 0, + 1, + 2 + ] + """ + +def uint16() -> UInt16Type: + """ + Create instance of unsigned uint16 type. + + Examples + -------- + Create an instance of unsigned int16 type: + + >>> import pyarrow as pa + >>> pa.uint16() + DataType(uint16) + >>> print(pa.uint16()) + uint16 + + Create an array with unsigned int16 type: + + >>> pa.array([0, 1, 2], type=pa.uint16()) + + [ + 0, + 1, + 2 + ] + """ + +def int16() -> Int16Type: + """ + Create instance of signed int16 type. + + Examples + -------- + Create an instance of int16 type: + + >>> import pyarrow as pa + >>> pa.int16() + DataType(int16) + >>> print(pa.int16()) + int16 + + Create an array with int16 type: + + >>> pa.array([0, 1, 2], type=pa.int16()) + + [ + 0, + 1, + 2 + ] + """ + +def uint32() -> Uint32Type: + """ + Create instance of unsigned uint32 type. + + Examples + -------- + Create an instance of unsigned int32 type: + + >>> import pyarrow as pa + >>> pa.uint32() + DataType(uint32) + >>> print(pa.uint32()) + uint32 + + Create an array with unsigned int32 type: + + >>> pa.array([0, 1, 2], type=pa.uint32()) + + [ + 0, + 1, + 2 + ] + """ + +def int32() -> Int32Type: + """ + Create instance of signed int32 type. + + Examples + -------- + Create an instance of int32 type: + + >>> import pyarrow as pa + >>> pa.int32() + DataType(int32) + >>> print(pa.int32()) + int32 + + Create an array with int32 type: + + >>> pa.array([0, 1, 2], type=pa.int32()) + + [ + 0, + 1, + 2 + ] + """ + +def int64() -> Int64Type: + """ + Create instance of signed int64 type. + + Examples + -------- + Create an instance of int64 type: + + >>> import pyarrow as pa + >>> pa.int64() + DataType(int64) + >>> print(pa.int64()) + int64 + + Create an array with int64 type: + + >>> pa.array([0, 1, 2], type=pa.int64()) + + [ + 0, + 1, + 2 + ] + """ + +def uint64() -> UInt64Type: + """ + Create instance of unsigned uint64 type. 
+ + Examples + -------- + Create an instance of unsigned int64 type: + + >>> import pyarrow as pa + >>> pa.uint64() + DataType(uint64) + >>> print(pa.uint64()) + uint64 + + Create an array with unsigned uint64 type: + + >>> pa.array([0, 1, 2], type=pa.uint64()) + + [ + 0, + 1, + 2 + ] + """ + +def tzinfo_to_string(tz: dt.tzinfo) -> str: + """ + Converts a time zone object into a string indicating the name of a time + zone, one of: + * As used in the Olson time zone database (the "tz database" or + "tzdata"), such as "America/New_York" + * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30 + + Parameters + ---------- + tz : datetime.tzinfo + Time zone object + + Returns + ------- + name : str + Time zone name + """ + +def string_to_tzinfo(name: str) -> dt.tzinfo: + """ + Convert a time zone name into a time zone object. + + Supported input strings are: + * As used in the Olson time zone database (the "tz database" or + "tzdata"), such as "America/New_York" + * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30 + + Parameters + ---------- + name: str + Time zone name. + + Returns + ------- + tz : datetime.tzinfo + Time zone object + """ + +@overload +def timestamp(unit: _Unit) -> TimestampType[_Unit, _Tz]: ... +@overload +def timestamp(unit: _Unit, tz: _Tz) -> TimestampType[_Unit, _Tz]: ... +def timestamp(*args, **kwargs): + """ + Create instance of timestamp type with resolution and optional time zone. + + Parameters + ---------- + unit : str + one of 's' [second], 'ms' [millisecond], 'us' [microsecond], or 'ns' + [nanosecond] + tz : str, default None + Time zone name. None indicates time zone naive + + Examples + -------- + Create an instance of timestamp type: + + >>> import pyarrow as pa + >>> pa.timestamp("us") + TimestampType(timestamp[us]) + >>> pa.timestamp("s", tz="America/New_York") + TimestampType(timestamp[s, tz=America/New_York]) + >>> pa.timestamp("s", tz="+07:30") + TimestampType(timestamp[s, tz=+07:30]) + + Use timestamp type when creating a scalar object: + + >>> from datetime import datetime + >>> pa.scalar(datetime(2012, 1, 1), type=pa.timestamp("s", tz="UTC")) + + >>> pa.scalar(datetime(2012, 1, 1), type=pa.timestamp("us")) + + + Returns + ------- + timestamp_type : TimestampType + """ + +def time32(unit: _Time32Unit) -> Time32Type[_Time32Unit]: + """ + Create instance of 32-bit time (time of day) type with unit resolution. + + Parameters + ---------- + unit : str + one of 's' [second], or 'ms' [millisecond] + + Returns + ------- + type : pyarrow.Time32Type + + Examples + -------- + >>> import pyarrow as pa + >>> pa.time32("s") + Time32Type(time32[s]) + >>> pa.time32("ms") + Time32Type(time32[ms]) + """ + +def time64(unit: _Time64Unit) -> Time64Type[_Time64Unit]: + """ + Create instance of 64-bit time (time of day) type with unit resolution. + + Parameters + ---------- + unit : str + One of 'us' [microsecond], or 'ns' [nanosecond]. + + Returns + ------- + type : pyarrow.Time64Type + + Examples + -------- + >>> import pyarrow as pa + >>> pa.time64("us") + Time64Type(time64[us]) + >>> pa.time64("ns") + Time64Type(time64[ns]) + """ + +def duration(unit: _Unit) -> DurationType[_Unit]: + """ + Create instance of a duration type with unit resolution. + + Parameters + ---------- + unit : str + One of 's' [second], 'ms' [millisecond], 'us' [microsecond], or + 'ns' [nanosecond]. 
+ + Returns + ------- + type : pyarrow.DurationType + + Examples + -------- + Create an instance of duration type: + + >>> import pyarrow as pa + >>> pa.duration("us") + DurationType(duration[us]) + >>> pa.duration("s") + DurationType(duration[s]) + + Create an array with duration type: + + >>> pa.array([0, 1, 2], type=pa.duration("s")) + + [ + 0, + 1, + 2 + ] + """ + +def month_day_nano_interval() -> MonthDayNanoIntervalType: + """ + Create instance of an interval type representing months, days and + nanoseconds between two dates. + + Examples + -------- + Create an instance of an month_day_nano_interval type: + + >>> import pyarrow as pa + >>> pa.month_day_nano_interval() + DataType(month_day_nano_interval) + + Create a scalar with month_day_nano_interval type: + + >>> pa.scalar((1, 15, -30), type=pa.month_day_nano_interval()) + + """ + +def date32() -> Date32Type: + """ + Create instance of 32-bit date (days since UNIX epoch 1970-01-01). + + Examples + -------- + Create an instance of 32-bit date type: + + >>> import pyarrow as pa + >>> pa.date32() + DataType(date32[day]) + + Create a scalar with 32-bit date type: + + >>> from datetime import date + >>> pa.scalar(date(2012, 1, 1), type=pa.date32()) + + """ + +def date64() -> Date64Type: + """ + Create instance of 64-bit date (milliseconds since UNIX epoch 1970-01-01). + + Examples + -------- + Create an instance of 64-bit date type: + + >>> import pyarrow as pa + >>> pa.date64() + DataType(date64[ms]) + + Create a scalar with 64-bit date type: + + >>> from datetime import datetime + >>> pa.scalar(datetime(2012, 1, 1), type=pa.date64()) + + """ + +def float16() -> Float16Type: + """ + Create half-precision floating point type. + + Examples + -------- + Create an instance of float16 type: + + >>> import pyarrow as pa + >>> pa.float16() + DataType(halffloat) + >>> print(pa.float16()) + halffloat + + Create an array with float16 type: + + >>> arr = np.array([1.5, np.nan], dtype=np.float16) + >>> a = pa.array(arr, type=pa.float16()) + >>> a + + [ + 15872, + 32256 + ] + + Note that unlike other float types, if you convert this array + to a python list, the types of its elements will be ``np.float16`` + + >>> [type(val) for val in a.to_pylist()] + [, ] + """ + +def float32() -> Float32Type: + """ + Create single-precision floating point type. + + Examples + -------- + Create an instance of float32 type: + + >>> import pyarrow as pa + >>> pa.float32() + DataType(float) + >>> print(pa.float32()) + float + + Create an array with float32 type: + + >>> pa.array([0.0, 1.0, 2.0], type=pa.float32()) + + [ + 0, + 1, + 2 + ] + """ + +def float64() -> Float64Type: + """ + Create double-precision floating point type. + + Examples + -------- + Create an instance of float64 type: + + >>> import pyarrow as pa + >>> pa.float64() + DataType(double) + >>> print(pa.float64()) + double + + Create an array with float64 type: + + >>> pa.array([0.0, 1.0, 2.0], type=pa.float64()) + + [ + 0, + 1, + 2 + ] + """ + +@overload +def decimal32(precision: _Precision) -> Decimal32Type[_Precision, Literal[0]]: ... +@overload +def decimal32(precision: _Precision, scale: _Scale) -> Decimal32Type[_Precision, _Scale]: ... +def decimal32(*args, **kwargs): + """ + Create decimal type with precision and scale and 32-bit width. + + Arrow decimals are fixed-point decimal numbers encoded as a scaled + integer. 
The precision is the number of significant digits that the + decimal type can represent; the scale is the number of digits after + the decimal point (note the scale can be negative). + + As an example, ``decimal32(7, 3)`` can exactly represent the numbers + 1234.567 and -1234.567 (encoded internally as the 32-bit integers + 1234567 and -1234567, respectively), but neither 12345.67 nor 123.4567. + + ``decimal32(5, -3)`` can exactly represent the number 12345000 + (encoded internally as the 32-bit integer 12345), but neither + 123450000 nor 1234500. + + If you need a precision higher than 9 significant digits, consider + using ``decimal64``, ``decimal128``, or ``decimal256``. + + Parameters + ---------- + precision : int + Must be between 1 and 9 + scale : int + + Returns + ------- + decimal_type : Decimal32Type + + Examples + -------- + Create an instance of decimal type: + + >>> import pyarrow as pa + >>> pa.decimal32(5, 2) + Decimal32Type(decimal32(5, 2)) + + Create an array with decimal type: + + >>> import decimal + >>> a = decimal.Decimal("123.45") + >>> pa.array([a], pa.decimal32(5, 2)) + + [ + 123.45 + ] + """ + +@overload +def decimal64(precision: _Precision) -> Decimal64Type[_Precision, Literal[0]]: ... +@overload +def decimal64(precision: _Precision, scale: _Scale) -> Decimal64Type[_Precision, _Scale]: ... +def decimal64(*args, **kwargs): + """ + Create decimal type with precision and scale and 64-bit width. + + Arrow decimals are fixed-point decimal numbers encoded as a scaled + integer. The precision is the number of significant digits that the + decimal type can represent; the scale is the number of digits after + the decimal point (note the scale can be negative). + + As an example, ``decimal64(7, 3)`` can exactly represent the numbers + 1234.567 and -1234.567 (encoded internally as the 64-bit integers + 1234567 and -1234567, respectively), but neither 12345.67 nor 123.4567. + + ``decimal64(5, -3)`` can exactly represent the number 12345000 + (encoded internally as the 64-bit integer 12345), but neither + 123450000 nor 1234500. + + If you need a precision higher than 18 significant digits, consider + using ``decimal128``, or ``decimal256``. + + Parameters + ---------- + precision : int + Must be between 1 and 18 + scale : int + + Returns + ------- + decimal_type : Decimal64Type + + Examples + -------- + Create an instance of decimal type: + + >>> import pyarrow as pa + >>> pa.decimal64(5, 2) + Decimal64Type(decimal64(5, 2)) + + Create an array with decimal type: + + >>> import decimal + >>> a = decimal.Decimal("123.45") + >>> pa.array([a], pa.decimal64(5, 2)) + + [ + 123.45 + ] + """ + +@overload +def decimal128(precision: _Precision) -> Decimal128Type[_Precision, Literal[0]]: ... +@overload +def decimal128(precision: _Precision, scale: _Scale) -> Decimal128Type[_Precision, _Scale]: ... +def decimal128(*args, **kwargs): + """ + Create decimal type with precision and scale and 128-bit width. + + Arrow decimals are fixed-point decimal numbers encoded as a scaled + integer. The precision is the number of significant digits that the + decimal type can represent; the scale is the number of digits after + the decimal point (note the scale can be negative). + + As an example, ``decimal128(7, 3)`` can exactly represent the numbers + 1234.567 and -1234.567 (encoded internally as the 128-bit integers + 1234567 and -1234567, respectively), but neither 12345.67 nor 123.4567. 
+ + ``decimal128(5, -3)`` can exactly represent the number 12345000 + (encoded internally as the 128-bit integer 12345), but neither + 123450000 nor 1234500. + + If you need a precision higher than 38 significant digits, consider + using ``decimal256``. + + Parameters + ---------- + precision : int + Must be between 1 and 38 + scale : int + + Returns + ------- + decimal_type : Decimal128Type + + Examples + -------- + Create an instance of decimal type: + + >>> import pyarrow as pa + >>> pa.decimal128(5, 2) + Decimal128Type(decimal128(5, 2)) + + Create an array with decimal type: + + >>> import decimal + >>> a = decimal.Decimal("123.45") + >>> pa.array([a], pa.decimal128(5, 2)) + + [ + 123.45 + ] + """ + +@overload +def decimal256(precision: _Precision) -> Decimal256Type[_Precision, Literal[0]]: ... +@overload +def decimal256(precision: _Precision, scale: _Scale) -> Decimal256Type[_Precision, _Scale]: ... +def decimal256(*args, **kwargs): + """ + Create decimal type with precision and scale and 256-bit width. + + Arrow decimals are fixed-point decimal numbers encoded as a scaled + integer. The precision is the number of significant digits that the + decimal type can represent; the scale is the number of digits after + the decimal point (note the scale can be negative). + + For most use cases, the maximum precision offered by ``decimal128`` + is sufficient, and it will result in a more compact and more efficient + encoding. ``decimal256`` is useful if you need a precision higher + than 38 significant digits. + + Parameters + ---------- + precision : int + Must be between 1 and 76 + scale : int + + Returns + ------- + decimal_type : Decimal256Type + """ + +def string() -> StringType: + """ + Create UTF8 variable-length string type. + + Examples + -------- + Create an instance of a string type: + + >>> import pyarrow as pa + >>> pa.string() + DataType(string) + + and use the string type to create an array: + + >>> pa.array(["foo", "bar", "baz"], type=pa.string()) + + [ + "foo", + "bar", + "baz" + ] + """ + +utf8 = string +""" +Alias for string(). + +Examples +-------- +Create an instance of a string type: + +>>> import pyarrow as pa +>>> pa.utf8() +DataType(string) + +and use the string type to create an array: + +>>> pa.array(['foo', 'bar', 'baz'], type=pa.utf8()) + +[ + "foo", + "bar", + "baz" +] +""" + +@overload +def binary(length: Literal[-1] = ...) -> BinaryType: ... +@overload +def binary(length: int) -> FixedSizeBinaryType: ... +def binary(length): + """ + Create variable-length or fixed size binary type. + + Parameters + ---------- + length : int, optional, default -1 + If length == -1 then return a variable length binary type. If length is + greater than or equal to 0 then return a fixed size binary type of + width `length`. + + Examples + -------- + Create an instance of a variable-length binary type: + + >>> import pyarrow as pa + >>> pa.binary() + DataType(binary) + + and use the variable-length binary type to create an array: + + >>> pa.array(["foo", "bar", "baz"], type=pa.binary()) + + [ + 666F6F, + 626172, + 62617A + ] + + Create an instance of a fixed-size binary type: + + >>> pa.binary(3) + FixedSizeBinaryType(fixed_size_binary[3]) + + and use the fixed-length binary type to create an array: + + >>> pa.array(["foo", "bar", "baz"], type=pa.binary(3)) + + [ + 666F6F, + 626172, + 62617A + ] + """ + +def large_binary() -> LargeBinaryType: + """ + Create large variable-length binary type. + + This data type may not be supported by all Arrow implementations. 
Unless + you need to represent data larger than 2GB, you should prefer binary(). + + Examples + -------- + Create an instance of large variable-length binary type: + + >>> import pyarrow as pa + >>> pa.large_binary() + DataType(large_binary) + + and use the type to create an array: + + >>> pa.array(["foo", "bar", "baz"], type=pa.large_binary()) + + [ + 666F6F, + 626172, + 62617A + ] + """ + +def large_string() -> LargeStringType: + """ + Create large UTF8 variable-length string type. + + This data type may not be supported by all Arrow implementations. Unless + you need to represent data larger than 2GB, you should prefer string(). + + Examples + -------- + Create an instance of large UTF8 variable-length binary type: + + >>> import pyarrow as pa + >>> pa.large_string() + DataType(large_string) + + and use the type to create an array: + + >>> pa.array(["foo", "bar"] * 50, type=pa.large_string()) + + [ + "foo", + "bar", + ... + "foo", + "bar" + ] + """ + +large_utf8 = large_string +""" +Alias for large_string(). + +Examples +-------- +Create an instance of large UTF8 variable-length binary type: + +>>> import pyarrow as pa +>>> pa.large_utf8() +DataType(large_string) + +and use the type to create an array: + +>>> pa.array(['foo', 'bar'] * 50, type=pa.large_utf8()) + +[ + "foo", + "bar", + ... + "foo", + "bar" +] +""" + +def binary_view() -> BinaryViewType: + """ + Create a variable-length binary view type. + + Examples + -------- + Create an instance of a string type: + + >>> import pyarrow as pa + >>> pa.binary_view() + DataType(binary_view) + """ + +def string_view() -> StringViewType: + """ + Create UTF8 variable-length string view type. + + Examples + -------- + Create an instance of a string type: + + >>> import pyarrow as pa + >>> pa.string_view() + DataType(string_view) + """ + +@overload +def list_( + value_type: _DataTypeT | Field[_DataTypeT], list_size: Literal[-1] = ... +) -> ListType[_DataTypeT]: ... +@overload +def list_( + value_type: _DataTypeT | Field[_DataTypeT], list_size: _Size +) -> FixedSizeListType[_DataTypeT, _Size]: ... +def list_(*args, **kwargs): + """ + Create ListType instance from child data type or field. + + Parameters + ---------- + value_type : DataType or Field + list_size : int, optional, default -1 + If length == -1 then return a variable length list type. If length is + greater than or equal to 0 then return a fixed size list type. + + Returns + ------- + list_type : DataType + + Examples + -------- + Create an instance of ListType: + + >>> import pyarrow as pa + >>> pa.list_(pa.string()) + ListType(list) + >>> pa.list_(pa.int32(), 2) + FixedSizeListType(fixed_size_list[2]) + + Use the ListType to create a scalar: + + >>> pa.scalar(["foo", None], type=pa.list_(pa.string(), 2)) + + + or an array: + + >>> pa.array([[1, 2], [3, 4]], pa.list_(pa.int32(), 2)) + + [ + [ + 1, + 2 + ], + [ + 3, + 4 + ] + ] + """ + +def large_list(value_type: _DataTypeT | Field[_DataTypeT]) -> LargeListType[_DataTypeT]: + """ + Create LargeListType instance from child data type or field. + + This data type may not be supported by all Arrow implementations. + Unless you need to represent data larger than 2**31 elements, you should + prefer list_(). 
+ + Parameters + ---------- + value_type : DataType or Field + + Returns + ------- + list_type : DataType + + Examples + -------- + Create an instance of LargeListType: + + >>> import pyarrow as pa + >>> pa.large_list(pa.int8()) + LargeListType(large_list) + + Use the LargeListType to create an array: + + >>> pa.array([[-1, 3]] * 5, type=pa.large_list(pa.int8())) + + [ + [ + -1, + 3 + ], + [ + -1, + 3 + ], + ... + """ + +def list_view(value_type: _DataTypeT | Field[_DataTypeT]) -> ListViewType[_DataTypeT]: + """ + Create ListViewType instance from child data type or field. + + This data type may not be supported by all Arrow implementations + because it is an alternative to the ListType. + + Parameters + ---------- + value_type : DataType or Field + + Returns + ------- + list_view_type : DataType + + Examples + -------- + Create an instance of ListViewType: + + >>> import pyarrow as pa + >>> pa.list_view(pa.string()) + ListViewType(list_view) + """ + +def large_list_view( + value_type: _DataTypeT | Field[_DataTypeT], +) -> LargeListViewType[_DataTypeT]: + """ + Create LargeListViewType instance from child data type or field. + + This data type may not be supported by all Arrow implementations + because it is an alternative to the ListType. + + Parameters + ---------- + value_type : DataType or Field + + Returns + ------- + list_view_type : DataType + + Examples + -------- + Create an instance of LargeListViewType: + + >>> import pyarrow as pa + >>> pa.large_list_view(pa.int8()) + LargeListViewType(large_list_view) + """ + +@overload +def map_(key_type: _K, item_type: _ValueT) -> MapType[_K, _ValueT, _Ordered]: ... +@overload +def map_( + key_type: _K, item_type: _ValueT, key_sorted: _Ordered +) -> MapType[_K, _ValueT, _Ordered]: ... +def map_(*args, **kwargs): + """ + Create MapType instance from key and item data types or fields. + + Parameters + ---------- + key_type : DataType or Field + item_type : DataType or Field + keys_sorted : bool + + Returns + ------- + map_type : DataType + + Examples + -------- + Create an instance of MapType: + + >>> import pyarrow as pa + >>> pa.map_(pa.string(), pa.int32()) + MapType(map) + >>> pa.map_(pa.string(), pa.int32(), keys_sorted=True) + MapType(map) + + Use MapType to create an array: + + >>> data = [[{"key": "a", "value": 1}, {"key": "b", "value": 2}], [{"key": "c", "value": 3}]] + >>> pa.array(data, type=pa.map_(pa.string(), pa.int32(), keys_sorted=True)) + + [ + keys: + [ + "a", + "b" + ] + values: + [ + 1, + 2 + ], + keys: + [ + "c" + ] + values: + [ + 3 + ] + ] + """ + +@overload +def dictionary( + index_type: _IndexT, value_type: _BasicValueT +) -> DictionaryType[_IndexT, _BasicValueT, _Ordered]: ... +@overload +def dictionary( + index_type: _IndexT, value_type: _BasicValueT, ordered: _Ordered +) -> DictionaryType[_IndexT, _BasicValueT, _Ordered]: ... +def dictionary(*args, **kwargs): + """ + Dictionary (categorical, or simply encoded) type. + + Parameters + ---------- + index_type : DataType + value_type : DataType + ordered : bool + + Returns + ------- + type : DictionaryType + + Examples + -------- + Create an instance of dictionary type: + + >>> import pyarrow as pa + >>> pa.dictionary(pa.int64(), pa.utf8()) + DictionaryType(dictionary) + + Use dictionary type to create an array: + + >>> pa.array(["a", "b", None, "d"], pa.dictionary(pa.int64(), pa.utf8())) + + ... 
+ -- dictionary: + [ + "a", + "b", + "d" + ] + -- indices: + [ + 0, + 1, + null, + 2 + ] + """ + +def struct( + fields: Iterable[Field[Any] | tuple[str, Field[Any]] | tuple[str, DataType]] + | Mapping[str, Field[Any]], +) -> StructType: + """ + Create StructType instance from fields. + + A struct is a nested type parameterized by an ordered sequence of types + (which can all be distinct), called its fields. + + Parameters + ---------- + fields : iterable of Fields or tuples, or mapping of strings to DataTypes + Each field must have a UTF8-encoded name, and these field names are + part of the type metadata. + + Examples + -------- + Create an instance of StructType from an iterable of tuples: + + >>> import pyarrow as pa + >>> fields = [ + ... ("f1", pa.int32()), + ... ("f2", pa.string()), + ... ] + >>> struct_type = pa.struct(fields) + >>> struct_type + StructType(struct) + + Retrieve a field from a StructType: + + >>> struct_type[0] + pyarrow.Field + >>> struct_type["f1"] + pyarrow.Field + + Create an instance of StructType from an iterable of Fields: + + >>> fields = [ + ... pa.field("f1", pa.int32()), + ... pa.field("f2", pa.string(), nullable=False), + ... ] + >>> pa.struct(fields) + StructType(struct) + + Returns + ------- + type : DataType + """ + +def sparse_union( + child_fields: list[Field[Any]], type_codes: list[int] | None = None +) -> SparseUnionType: + """ + Create SparseUnionType from child fields. + + A sparse union is a nested type where each logical value is taken from + a single child. A buffer of 8-bit type ids indicates which child + a given logical value is to be taken from. + + In a sparse union, each child array should have the same length as the + union array, regardless of the actual number of union values that + refer to it. + + Parameters + ---------- + child_fields : sequence of Field values + Each field must have a UTF8-encoded name, and these field names are + part of the type metadata. + type_codes : list of integers, default None + + Returns + ------- + type : SparseUnionType + """ + +def dense_union( + child_fields: list[Field[Any]], type_codes: list[int] | None = None +) -> DenseUnionType: + """ + Create DenseUnionType from child fields. + + A dense union is a nested type where each logical value is taken from + a single child, at a specific offset. A buffer of 8-bit type ids + indicates which child a given logical value is to be taken from, + and a buffer of 32-bit offsets indicates at which physical position + in the given child array the logical value is to be taken from. + + Unlike a sparse union, a dense union allows encoding only the child array + values which are actually referred to by the union array. This is + counterbalanced by the additional footprint of the offsets buffer, and + the additional indirection cost when looking up values. + + Parameters + ---------- + child_fields : sequence of Field values + Each field must have a UTF8-encoded name, and these field names are + part of the type metadata. + type_codes : list of integers, default None + + Returns + ------- + type : DenseUnionType + """ + +@overload +def union( + child_fields: list[Field[Any]], mode: Literal["sparse"], type_codes: list[int] | None = None +) -> SparseUnionType: ... +@overload +def union( + child_fields: list[Field[Any]], mode: Literal["dense"], type_codes: list[int] | None = None +) -> DenseUnionType: ... +def union(*args, **kwargs): + """ + Create UnionType from child fields. + + A union is a nested type where each logical value is taken from a + single child. 
A buffer of 8-bit type ids indicates which child + a given logical value is to be taken from. + + Unions come in two flavors: sparse and dense + (see also `pyarrow.sparse_union` and `pyarrow.dense_union`). + + Parameters + ---------- + child_fields : sequence of Field values + Each field must have a UTF8-encoded name, and these field names are + part of the type metadata. + mode : str + Must be 'sparse' or 'dense' + type_codes : list of integers, default None + + Returns + ------- + type : UnionType + """ + +def run_end_encoded( + run_end_type: _RunEndType, value_type: _BasicValueT +) -> RunEndEncodedType[_RunEndType, _BasicValueT]: + """ + Create RunEndEncodedType from run-end and value types. + + Parameters + ---------- + run_end_type : pyarrow.DataType + The integer type of the run_ends array. Must be 'int16', 'int32', or 'int64'. + value_type : pyarrow.DataType + The type of the values array. + + Returns + ------- + type : RunEndEncodedType + """ + +def json_(storage_type: DataType = ...) -> JsonType: + """ + Create instance of JSON extension type. + + Parameters + ---------- + storage_type : DataType, default pyarrow.string() + The underlying data type. Can be on of the following types: + string, large_string, string_view. + + Returns + ------- + type : JsonType + + Examples + -------- + Create an instance of JSON extension type: + + >>> import pyarrow as pa + >>> pa.json_(pa.utf8()) + JsonType(extension) + + Use the JSON type to create an array: + + >>> pa.array(['{"a": 1}', '{"b": 2}'], type=pa.json_(pa.utf8())) + + [ + "{"a": 1}", + "{"b": 2}" + ] + """ + +def uuid() -> UuidType: + """ + Create UuidType instance. + + Returns + ------- + type : UuidType + """ + +def fixed_shape_tensor( + value_type: _ValueT, + shape: Sequence[int], + dim_names: Sequence[str] | None = None, + permutation: Sequence[int] | None = None, +) -> FixedShapeTensorType[_ValueT]: + """ + Create instance of fixed shape tensor extension type with shape and optional + names of tensor dimensions and indices of the desired logical + ordering of dimensions. + + Parameters + ---------- + value_type : DataType + Data type of individual tensor elements. + shape : tuple or list of integers + The physical shape of the contained tensors. + dim_names : tuple or list of strings, default None + Explicit names to tensor dimensions. + permutation : tuple or list integers, default None + Indices of the desired ordering of the original dimensions. + The indices contain a permutation of the values ``[0, 1, .., N-1]`` where + N is the number of dimensions. The permutation indicates which dimension + of the logical layout corresponds to which dimension of the physical tensor. + For more information on this parameter see + :ref:`fixed_shape_tensor_extension`. 
+ + Examples + -------- + Create an instance of fixed shape tensor extension type: + + >>> import pyarrow as pa + >>> tensor_type = pa.fixed_shape_tensor(pa.int32(), [2, 2]) + >>> tensor_type + FixedShapeTensorType(extension) + + Inspect the data type: + + >>> tensor_type.value_type + DataType(int32) + >>> tensor_type.shape + [2, 2] + + Create a table with fixed shape tensor extension array: + + >>> arr = [[1, 2, 3, 4], [10, 20, 30, 40], [100, 200, 300, 400]] + >>> storage = pa.array(arr, pa.list_(pa.int32(), 4)) + >>> tensor = pa.ExtensionArray.from_storage(tensor_type, storage) + >>> pa.table([tensor], names=["tensor_array"]) + pyarrow.Table + tensor_array: extension + ---- + tensor_array: [[[1,2,3,4],[10,20,30,40],[100,200,300,400]]] + + Create an instance of fixed shape tensor extension type with names + of tensor dimensions: + + >>> tensor_type = pa.fixed_shape_tensor(pa.int8(), (2, 2, 3), dim_names=["C", "H", "W"]) + >>> tensor_type.dim_names + ['C', 'H', 'W'] + + Create an instance of fixed shape tensor extension type with + permutation: + + >>> tensor_type = pa.fixed_shape_tensor(pa.int8(), (2, 2, 3), permutation=[0, 2, 1]) + >>> tensor_type.permutation + [0, 2, 1] + + Returns + ------- + type : FixedShapeTensorType + """ + +def bool8() -> Bool8Type: + """ + Create instance of bool8 extension type. + + Examples + -------- + Create an instance of bool8 extension type: + + >>> import pyarrow as pa + >>> type = pa.bool8() + >>> type + Bool8Type(extension) + + Inspect the data type: + + >>> type.storage_type + DataType(int8) + + Create a table with a bool8 array: + + >>> arr = [-1, 0, 1, 2, None] + >>> storage = pa.array(arr, pa.int8()) + >>> other = pa.ExtensionArray.from_storage(type, storage) + >>> pa.table([other], names=["unknown_col"]) + pyarrow.Table + unknown_col: extension + ---- + unknown_col: [[-1,0,1,2,null]] + + Returns + ------- + type : Bool8Type + """ + +def opaque(storage_type: DataType, type_name: str, vendor_name: str) -> OpaqueType: + """ + Create instance of opaque extension type. + + Parameters + ---------- + storage_type : DataType + The underlying data type. + type_name : str + The name of the type in the external system. + vendor_name : str + The name of the external system. + + Examples + -------- + Create an instance of an opaque extension type: + + >>> import pyarrow as pa + >>> type = pa.opaque(pa.binary(), "other", "jdbc") + >>> type + OpaqueType(extension) + + Inspect the data type: + + >>> type.storage_type + DataType(binary) + >>> type.type_name + 'other' + >>> type.vendor_name + 'jdbc' + + Create a table with an opaque array: + + >>> arr = [None, b"foobar"] + >>> storage = pa.array(arr, pa.binary()) + >>> other = pa.ExtensionArray.from_storage(type, storage) + >>> pa.table([other], names=["unknown_col"]) + pyarrow.Table + unknown_col: extension + ---- + unknown_col: [[null,666F6F626172]] + + Returns + ------- + type : OpaqueType + """ + +@overload +def type_for_alias(name: Literal["null"]) -> NullType: ... +@overload +def type_for_alias(name: Literal["bool", "boolean"]) -> BoolType: ... +@overload +def type_for_alias(name: Literal["i1", "int8"]) -> Int8Type: ... +@overload +def type_for_alias(name: Literal["i2", "int16"]) -> Int16Type: ... +@overload +def type_for_alias(name: Literal["i4", "int32"]) -> Int32Type: ... +@overload +def type_for_alias(name: Literal["i8", "int64"]) -> Int64Type: ... +@overload +def type_for_alias(name: Literal["u1", "uint8"]) -> UInt8Type: ... +@overload +def type_for_alias(name: Literal["u2", "uint16"]) -> UInt16Type: ... 
+@overload +def type_for_alias(name: Literal["u4", "uint32"]) -> Uint32Type: ... +@overload +def type_for_alias(name: Literal["u8", "uint64"]) -> UInt64Type: ... +@overload +def type_for_alias(name: Literal["f2", "halffloat", "float16"]) -> Float16Type: ... +@overload +def type_for_alias(name: Literal["f4", "float", "float32"]) -> Float32Type: ... +@overload +def type_for_alias(name: Literal["f8", "double", "float64"]) -> Float64Type: ... +@overload +def type_for_alias(name: Literal["string", "str", "utf8"]) -> StringType: ... +@overload +def type_for_alias(name: Literal["binary"]) -> BinaryType: ... +@overload +def type_for_alias( + name: Literal["large_string", "large_str", "large_utf8"], +) -> LargeStringType: ... +@overload +def type_for_alias(name: Literal["large_binary"]) -> LargeBinaryType: ... +@overload +def type_for_alias(name: Literal["binary_view"]) -> BinaryViewType: ... +@overload +def type_for_alias(name: Literal["string_view"]) -> StringViewType: ... +@overload +def type_for_alias(name: Literal["date32", "date32[day]"]) -> Date32Type: ... +@overload +def type_for_alias(name: Literal["date64", "date64[ms]"]) -> Date64Type: ... +@overload +def type_for_alias(name: Literal["time32[s]"]) -> Time32Type[Literal["s"]]: ... +@overload +def type_for_alias(name: Literal["time32[ms]"]) -> Time32Type[Literal["ms"]]: ... +@overload +def type_for_alias(name: Literal["time64[us]"]) -> Time64Type[Literal["us"]]: ... +@overload +def type_for_alias(name: Literal["time64[ns]"]) -> Time64Type[Literal["ns"]]: ... +@overload +def type_for_alias(name: Literal["timestamp[s]"]) -> TimestampType[Literal["s"], Any]: ... +@overload +def type_for_alias(name: Literal["timestamp[ms]"]) -> TimestampType[Literal["ms"], Any]: ... +@overload +def type_for_alias(name: Literal["timestamp[us]"]) -> TimestampType[Literal["us"], Any]: ... +@overload +def type_for_alias(name: Literal["timestamp[ns]"]) -> TimestampType[Literal["ns"], Any]: ... +@overload +def type_for_alias(name: Literal["duration[s]"]) -> DurationType[Literal["s"]]: ... +@overload +def type_for_alias(name: Literal["duration[ms]"]) -> DurationType[Literal["ms"]]: ... +@overload +def type_for_alias(name: Literal["duration[us]"]) -> DurationType[Literal["us"]]: ... +@overload +def type_for_alias(name: Literal["duration[ns]"]) -> DurationType[Literal["ns"]]: ... +@overload +def type_for_alias(name: Literal["month_day_nano_interval"]) -> MonthDayNanoIntervalType: ... +def type_for_alias(name): + """ + Return DataType given a string alias if one exists. + + Parameters + ---------- + name : str + The alias of the DataType that should be retrieved. + + Returns + ------- + type : DataType + """ + +@overload +def ensure_type(ty: None, allow_none: Literal[True]) -> None: ... +@overload +def ensure_type(ty: _DataTypeT) -> _DataTypeT: ... +@overload +def ensure_type(ty: Literal["null"]) -> NullType: ... +@overload +def ensure_type(ty: Literal["bool", "boolean"]) -> BoolType: ... +@overload +def ensure_type(ty: Literal["i1", "int8"]) -> Int8Type: ... +@overload +def ensure_type(ty: Literal["i2", "int16"]) -> Int16Type: ... +@overload +def ensure_type(ty: Literal["i4", "int32"]) -> Int32Type: ... +@overload +def ensure_type(ty: Literal["i8", "int64"]) -> Int64Type: ... +@overload +def ensure_type(ty: Literal["u1", "uint8"]) -> UInt8Type: ... +@overload +def ensure_type(ty: Literal["u2", "uint16"]) -> UInt16Type: ... +@overload +def ensure_type(ty: Literal["u4", "uint32"]) -> Uint32Type: ... 
+@overload +def ensure_type(ty: Literal["u8", "uint64"]) -> UInt64Type: ... +@overload +def ensure_type(ty: Literal["f2", "halffloat", "float16"]) -> Float16Type: ... +@overload +def ensure_type(ty: Literal["f4", "float", "float32"]) -> Float32Type: ... +@overload +def ensure_type(ty: Literal["f8", "double", "float64"]) -> Float64Type: ... +@overload +def ensure_type(ty: Literal["string", "str", "utf8"]) -> StringType: ... +@overload +def ensure_type(ty: Literal["binary"]) -> BinaryType: ... +@overload +def ensure_type( + ty: Literal["large_string", "large_str", "large_utf8"], +) -> LargeStringType: ... +@overload +def ensure_type(ty: Literal["large_binary"]) -> LargeBinaryType: ... +@overload +def ensure_type(ty: Literal["binary_view"]) -> BinaryViewType: ... +@overload +def ensure_type(ty: Literal["string_view"]) -> StringViewType: ... +@overload +def ensure_type(ty: Literal["date32", "date32[day]"]) -> Date32Type: ... +@overload +def ensure_type(ty: Literal["date64", "date64[ms]"]) -> Date64Type: ... +@overload +def ensure_type(ty: Literal["time32[s]"]) -> Time32Type[Literal["s"]]: ... +@overload +def ensure_type(ty: Literal["time32[ms]"]) -> Time32Type[Literal["ms"]]: ... +@overload +def ensure_type(ty: Literal["time64[us]"]) -> Time64Type[Literal["us"]]: ... +@overload +def ensure_type(ty: Literal["time64[ns]"]) -> Time64Type[Literal["ns"]]: ... +@overload +def ensure_type(ty: Literal["timestamp[s]"]) -> TimestampType[Literal["s"], Any]: ... +@overload +def ensure_type(ty: Literal["timestamp[ms]"]) -> TimestampType[Literal["ms"], Any]: ... +@overload +def ensure_type(ty: Literal["timestamp[us]"]) -> TimestampType[Literal["us"], Any]: ... +@overload +def ensure_type(ty: Literal["timestamp[ns]"]) -> TimestampType[Literal["ns"], Any]: ... +@overload +def ensure_type(ty: Literal["duration[s]"]) -> DurationType[Literal["s"]]: ... +@overload +def ensure_type(ty: Literal["duration[ms]"]) -> DurationType[Literal["ms"]]: ... +@overload +def ensure_type(ty: Literal["duration[us]"]) -> DurationType[Literal["us"]]: ... +@overload +def ensure_type(ty: Literal["duration[ns]"]) -> DurationType[Literal["ns"]]: ... +@overload +def ensure_type(ty: Literal["month_day_nano_interval"]) -> MonthDayNanoIntervalType: ... +def schema( + fields: Iterable[Field[Any]] | Iterable[tuple[str, DataType]] | Mapping[str, DataType], + metadata: dict[bytes | str, bytes | str] | None = None, +) -> Schema: + """ + Construct pyarrow.Schema from collection of fields. + + Parameters + ---------- + fields : iterable of Fields or tuples, or mapping of strings to DataTypes + Can also pass an object that implements the Arrow PyCapsule Protocol + for schemas (has an ``__arrow_c_schema__`` method). + metadata : dict, default None + Keys and values must be coercible to bytes. + + Examples + -------- + Create a Schema from iterable of tuples: + + >>> import pyarrow as pa + >>> pa.schema( + ... [ + ... ("some_int", pa.int32()), + ... ("some_string", pa.string()), + ... pa.field("some_required_string", pa.string(), nullable=False), + ... ] + ... ) + some_int: int32 + some_string: string + some_required_string: string not null + + Create a Schema from iterable of Fields: + + >>> pa.schema([pa.field("some_int", pa.int32()), pa.field("some_string", pa.string())]) + some_int: int32 + some_string: string + + DataTypes can also be passed as strings. 
The following is equivalent to the + above example: + + >>> pa.schema([pa.field("some_int", "int32"), pa.field("some_string", "string")]) + some_int: int32 + some_string: string + + Or more concisely: + + >>> pa.schema([("some_int", "int32"), ("some_string", "string")]) + some_int: int32 + some_string: string + + Returns + ------- + schema : pyarrow.Schema + """ + +def from_numpy_dtype(dtype: np.dtype[Any]) -> DataType: + """ + Convert NumPy dtype to pyarrow.DataType. + + Parameters + ---------- + dtype : the numpy dtype to convert + + + Examples + -------- + Create a pyarrow DataType from NumPy dtype: + + >>> import pyarrow as pa + >>> import numpy as np + >>> pa.from_numpy_dtype(np.dtype("float16")) + DataType(halffloat) + >>> pa.from_numpy_dtype("U") + DataType(string) + >>> pa.from_numpy_dtype(bool) + DataType(bool) + >>> pa.from_numpy_dtype(np.str_) + DataType(string) + """ + +def is_boolean_value(obj: Any) -> bool: + """ + Check if the object is a boolean. + + Parameters + ---------- + obj : object + The object to check + """ + +def is_integer_value(obj: Any) -> bool: + """ + Check if the object is an integer. + + Parameters + ---------- + obj : object + The object to check + """ + +def is_float_value(obj: Any) -> bool: + """ + Check if the object is a float. + + Parameters + ---------- + obj : object + The object to check + """ + +__all__ = [ + "_Weakrefable", + "_Metadata", + "DataType", + "_BasicDataType", + "NullType", + "BoolType", + "UInt8Type", + "Int8Type", + "UInt16Type", + "Int16Type", + "Uint32Type", + "Int32Type", + "UInt64Type", + "Int64Type", + "Float16Type", + "Float32Type", + "Float64Type", + "Date32Type", + "Date64Type", + "MonthDayNanoIntervalType", + "StringType", + "LargeStringType", + "StringViewType", + "BinaryType", + "LargeBinaryType", + "BinaryViewType", + "TimestampType", + "Time32Type", + "Time64Type", + "DurationType", + "FixedSizeBinaryType", + "Decimal32Type", + "Decimal64Type", + "Decimal128Type", + "Decimal256Type", + "ListType", + "LargeListType", + "ListViewType", + "LargeListViewType", + "FixedSizeListType", + "DictionaryMemo", + "DictionaryType", + "MapType", + "StructType", + "UnionType", + "SparseUnionType", + "DenseUnionType", + "RunEndEncodedType", + "BaseExtensionType", + "ExtensionType", + "FixedShapeTensorType", + "Bool8Type", + "UuidType", + "JsonType", + "OpaqueType", + "PyExtensionType", + "UnknownExtensionType", + "register_extension_type", + "unregister_extension_type", + "KeyValueMetadata", + "ensure_metadata", + "Field", + "Schema", + "unify_schemas", + "field", + "null", + "bool_", + "uint8", + "int8", + "uint16", + "int16", + "uint32", + "int32", + "int64", + "uint64", + "tzinfo_to_string", + "string_to_tzinfo", + "timestamp", + "time32", + "time64", + "duration", + "month_day_nano_interval", + "date32", + "date64", + "float16", + "float32", + "float64", + "decimal32", + "decimal64", + "decimal128", + "decimal256", + "string", + "utf8", + "binary", + "large_binary", + "large_string", + "large_utf8", + "binary_view", + "string_view", + "list_", + "large_list", + "list_view", + "large_list_view", + "map_", + "dictionary", + "struct", + "sparse_union", + "dense_union", + "union", + "run_end_encoded", + "json_", + "uuid", + "fixed_shape_tensor", + "bool8", + "opaque", + "type_for_alias", + "ensure_type", + "schema", + "from_numpy_dtype", + "is_boolean_value", + "is_integer_value", + "is_float_value", +] diff --git a/python/stubs/_azurefs.pyi b/python/stubs/_azurefs.pyi new file mode 100644 index 00000000000..317943ce20f --- /dev/null 
+++ b/python/stubs/_azurefs.pyi
@@ -0,0 +1,74 @@
+from typing import Literal
+
+from ._fs import FileSystem
+
+class AzureFileSystem(FileSystem):
+    """
+    Azure Blob Storage backed FileSystem implementation
+
+    This implementation supports flat namespace and hierarchical namespace (HNS) a.k.a.
+    Data Lake Gen2 storage accounts. HNS will be automatically detected and HNS specific
+    features will be used when they provide a performance advantage. Azurite emulator is
+    also supported. Note: `/` is the only supported delimiter.
+
+    The storage account is considered the root of the filesystem. When enabled, containers
+    will be created or deleted during relevant directory operations. Obviously, this also
+    requires authentication with the additional permissions.
+
+    By default `DefaultAzureCredential `__
+    is used for authentication. This means it will try several types of authentication
+    and go with the first one that works. If any authentication parameters are provided when
+    initialising the FileSystem, they will be used instead of the default credential.
+
+    Parameters
+    ----------
+    account_name : str
+        Azure Blob Storage account name. This is the globally unique identifier for the
+        storage account.
+    account_key : str, default None
+        Account key of the storage account. If sas_token and account_key are None the
+        default credential will be used. The parameters account_key and sas_token are
+        mutually exclusive.
+    blob_storage_authority : str, default None
+        hostname[:port] of the Blob Service. Defaults to `.blob.core.windows.net`. Useful
+        for connecting to a local emulator, like Azurite.
+    dfs_storage_authority : str, default None
+        hostname[:port] of the Data Lake Gen 2 Service. Defaults to
+        `.dfs.core.windows.net`. Useful for connecting to a local emulator, like Azurite.
+    blob_storage_scheme : str, default None
+        Either `http` or `https`. Defaults to `https`. Useful for connecting to a local
+        emulator, like Azurite.
+    dfs_storage_scheme : str, default None
+        Either `http` or `https`. Defaults to `https`. Useful for connecting to a local
+        emulator, like Azurite.
+    sas_token : str, default None
+        SAS token for the storage account, used as an alternative to account_key. If sas_token
+        and account_key are None the default credential will be used. The parameters
+        account_key and sas_token are mutually exclusive.
+
+    Examples
+    --------
+    >>> from pyarrow import fs
+    >>> azure_fs = fs.AzureFileSystem(account_name="myaccount")
+    >>> azurite_fs = fs.AzureFileSystem(
+    ...     account_name="devstoreaccount1",
+    ...     account_key="Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==",
+    ...     blob_storage_authority="127.0.0.1:10000",
+    ...     dfs_storage_authority="127.0.0.1:10000",
+    ...     blob_storage_scheme="http",
+    ...     dfs_storage_scheme="http",
+    ... )
+
+    For usage of the methods see examples for :func:`~pyarrow.fs.LocalFileSystem`.
+    """
+
+    def __init__(
+        self,
+        account_name: str,
+        account_key: str | None = None,
+        blob_storage_authority: str | None = None,
+        dfs_storage_authority: str | None = None,
+        blob_storage_scheme: Literal["http", "https"] = "https",
+        dfs_storage_scheme: Literal["http", "https"] = "https",
+        sas_token: str | None = None,
+    ) -> None: ...
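As a quick illustration of what these annotations buy downstream code, a minimal sketch (the helper name `list_container` is invented for this example; `FileSelector` and `get_file_info` are inherited from the `FileSystem` base class, so only the constructor above needs Azure-specific typing):

    from pyarrow import fs

    def list_container(azure: fs.AzureFileSystem, container: str) -> list[fs.FileInfo]:
        # get_file_info(FileSelector) comes from the FileSystem base class;
        # the stub above only has to annotate the Azure-specific constructor.
        return azure.get_file_info(fs.FileSelector(container, recursive=True))

With the stubs installed, a type checker can flag a misspelled keyword such as `account_keys=` at the `AzureFileSystem(...)` call site instead of leaving it to fail at runtime.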
diff --git a/python/stubs/_compute.pyi b/python/stubs/_compute.pyi new file mode 100644 index 00000000000..3d61ae42787 --- /dev/null +++ b/python/stubs/_compute.pyi @@ -0,0 +1,1721 @@ +from typing import ( + Any, + Callable, + Iterable, + Literal, + Sequence, + TypeAlias, + TypedDict, + overload, +) + +from . import lib + +_Order: TypeAlias = Literal["ascending", "descending"] +_Placement: TypeAlias = Literal["at_start", "at_end"] + +class Kernel(lib._Weakrefable): + """ + A kernel object. + + Kernels handle the execution of a Function for a certain signature. + """ + +class Function(lib._Weakrefable): + """ + A compute function. + + A function implements a certain logical computation over a range of + possible input signatures. Each signature accepts a range of input + types and is implemented by a given Kernel. + + Functions can be of different kinds: + + * "scalar" functions apply an item-wise computation over all items + of their inputs. Each item in the output only depends on the values + of the inputs at the same position. Examples: addition, comparisons, + string predicates... + + * "vector" functions apply a collection-wise computation, such that + each item in the output may depend on the values of several items + in each input. Examples: dictionary encoding, sorting, extracting + unique values... + + * "scalar_aggregate" functions reduce the dimensionality of the inputs by + applying a reduction function. Examples: sum, min_max, mode... + + * "hash_aggregate" functions apply a reduction function to an input + subdivided by grouping criteria. They may not be directly called. + Examples: hash_sum, hash_min_max... + + * "meta" functions dispatch to other functions. + """ + @property + def arity(self) -> int: + """ + The function arity. + + If Ellipsis (i.e. `...`) is returned, the function takes a variable + number of arguments. + """ + @property + def kind( + self, + ) -> Literal["scalar", "vector", "scalar_aggregate", "hash_aggregate", "meta"]: + """ + The function kind. + """ + @property + def name(self) -> str: + """ + The function name. + """ + @property + def num_kernels(self) -> int: + """ + The number of kernels implementing this function. + """ + def call( + self, + args: Iterable, + options: FunctionOptions | None = None, + memory_pool: lib.MemoryPool | None = None, + length: int | None = None, + ) -> Any: + """ + Call the function on the given arguments. + + Parameters + ---------- + args : iterable + The arguments to pass to the function. Accepted types depend + on the specific function. + options : FunctionOptions, optional + Options instance for executing this function. This should have + the right concrete options type. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + length : int, optional + Batch size for execution, for nullary (no argument) functions. If + not passed, will be inferred from passed data. + """ + +class FunctionOptions(lib._Weakrefable): + def serialize(self) -> lib.Buffer: ... + @classmethod + def deserialize(cls, buf: lib.Buffer) -> FunctionOptions: ... + +class FunctionRegistry(lib._Weakrefable): + def get_function(self, name: str) -> Function: + """ + Look up a function by name in the registry. + + Parameters + ---------- + name : str + The name of the function to lookup + """ + + def list_functions(self) -> list[str]: + """ + Return all function names in the registry. + """ + +class HashAggregateFunction(Function): ... +class HashAggregateKernel(Kernel): ... 
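A short, hedged sketch of how these pieces fit together at runtime, using only public `pyarrow.compute` names (`get_function`, `CountOptions`) together with the `Function.call` signature declared above:

    import pyarrow as pa
    import pyarrow.compute as pc

    fn = pc.get_function("count")      # Function looked up in the default registry
    print(fn.name, fn.kind)            # "count", a scalar_aggregate function

    arr = pa.array([1, None, 2])
    # Call through the Function object with a concrete FunctionOptions instance;
    # the higher-level pc.count() wrapper builds the same options under the hood.
    print(fn.call([arr], options=pc.CountOptions(mode="only_null")))   # -> 1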
+class ScalarAggregateFunction(Function): ... +class ScalarAggregateKernel(Kernel): ... +class ScalarFunction(Function): ... +class ScalarKernel(Kernel): ... +class VectorFunction(Function): ... +class VectorKernel(Kernel): ... + +# ==================== _compute.pyx Option classes ==================== +class ArraySortOptions(FunctionOptions): + """ + Options for the `array_sort_indices` function. + + Parameters + ---------- + order : str, default "ascending" + Which order to sort values in. + Accepted values are "ascending", "descending". + null_placement : str, default "at_end" + Where nulls in the input should be sorted. + Accepted values are "at_start", "at_end". + """ + def __init__( + self, + order: _Order = "ascending", + null_placement: _Placement = "at_end", + ) -> None: ... + +class AssumeTimezoneOptions(FunctionOptions): + """ + Options for the `assume_timezone` function. + + Parameters + ---------- + timezone : str + Timezone to assume for the input. + ambiguous : str, default "raise" + How to handle timestamps that are ambiguous in the assumed timezone. + Accepted values are "raise", "earliest", "latest". + nonexistent : str, default "raise" + How to handle timestamps that don't exist in the assumed timezone. + Accepted values are "raise", "earliest", "latest". + """ + + def __init__( + self, + timezone: str, + *, + ambiguous: Literal["raise", "earliest", "latest"] = "raise", + nonexistent: Literal["raise", "earliest", "latest"] = "raise", + ) -> None: ... + +class CastOptions(FunctionOptions): + """ + Options for the `cast` function. + + Parameters + ---------- + target_type : DataType, optional + The PyArrow type to cast to. + allow_int_overflow : bool, default False + Whether integer overflow is allowed when casting. + allow_time_truncate : bool, default False + Whether time precision truncation is allowed when casting. + allow_time_overflow : bool, default False + Whether date/time range overflow is allowed when casting. + allow_decimal_truncate : bool, default False + Whether decimal precision truncation is allowed when casting. + allow_float_truncate : bool, default False + Whether floating-point precision truncation is allowed when casting. + allow_invalid_utf8 : bool, default False + Whether producing invalid utf8 data is allowed when casting. + """ + + allow_int_overflow: bool + allow_time_truncate: bool + allow_time_overflow: bool + allow_decimal_truncate: bool + allow_float_truncate: bool + allow_invalid_utf8: bool + + def __init__( + self, + target_type: lib.DataType | None = None, + *, + allow_int_overflow: bool | None = None, + allow_time_truncate: bool | None = None, + allow_time_overflow: bool | None = None, + allow_decimal_truncate: bool | None = None, + allow_float_truncate: bool | None = None, + allow_invalid_utf8: bool | None = None, + ) -> None: ... + @staticmethod + def safe(target_type: lib.DataType | None = None) -> CastOptions: ... + @staticmethod + def unsafe(target_type: lib.DataType | None = None) -> CastOptions: ... + def is_safe(self) -> bool: ... + +class CountOptions(FunctionOptions): + """ + Options for the `count` function. + + Parameters + ---------- + mode : str, default "only_valid" + Which values to count in the input. + Accepted values are "only_valid", "only_null", "all". + """ + def __init__(self, mode: Literal["only_valid", "only_null", "all"] = "only_valid") -> None: ... + +class CumulativeOptions(FunctionOptions): + """ + Options for `cumulative_*` functions. 
+ + - cumulative_sum + - cumulative_sum_checked + - cumulative_prod + - cumulative_prod_checked + - cumulative_max + - cumulative_min + + Parameters + ---------- + start : Scalar, default None + Starting value for the cumulative operation. If none is given, + a default value depending on the operation and input type is used. + skip_nulls : bool, default False + When false, the first encountered null is propagated. + """ + def __init__(self, start: lib.Scalar | None = None, *, skip_nulls: bool = False) -> None: ... + +class CumulativeSumOptions(FunctionOptions): + """ + Options for `cumulative_sum` function. + + Parameters + ---------- + start : Scalar, default None + Starting value for sum computation + skip_nulls : bool, default False + When false, the first encountered null is propagated. + """ + def __init__(self, start: lib.Scalar | None = None, *, skip_nulls: bool = False) -> None: ... + +class DayOfWeekOptions(FunctionOptions): + """ + Options for the `day_of_week` function. + + Parameters + ---------- + count_from_zero : bool, default True + If True, number days from 0, otherwise from 1. + week_start : int, default 1 + Which day does the week start with (Monday=1, Sunday=7). + How this value is numbered is unaffected by `count_from_zero`. + """ + + def __init__(self, *, count_from_zero: bool = True, week_start: int = 1) -> None: ... + +class DictionaryEncodeOptions(FunctionOptions): + """ + Options for dictionary encoding. + + Parameters + ---------- + null_encoding : str, default "mask" + How to encode nulls in the input. + Accepted values are "mask" (null inputs emit a null in the indices + array), "encode" (null inputs emit a non-null index pointing to + a null value in the dictionary array). + """ + def __init__(self, null_encoding: Literal["mask", "encode"] = "mask") -> None: ... + +class RunEndEncodeOptions(FunctionOptions): + """ + Options for run-end encoding. + + Parameters + ---------- + run_end_type : DataType, default pyarrow.int32() + The data type of the run_ends array. + + Accepted values are pyarrow.{int16(), int32(), int64()}. + """ + # TODO: default is DataType(int32) + def __init__(self, run_end_type: lib.DataType = ...) -> None: ... + +class ElementWiseAggregateOptions(FunctionOptions): + """ + Options for element-wise aggregate functions. + + Parameters + ---------- + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + """ + def __init__(self, *, skip_nulls: bool = True) -> None: ... + +class ExtractRegexOptions(FunctionOptions): + """ + Options for the `extract_regex` function. + + Parameters + ---------- + pattern : str + Regular expression with named capture fields. + """ + def __init__(self, pattern: str) -> None: ... + +class ExtractRegexSpanOptions(FunctionOptions): + """ + Options for the `extract_regex_span` function. + + Parameters + ---------- + pattern : str + Regular expression with named capture fields. + """ + def __init__(self, pattern: str) -> None: ... + +class FilterOptions(FunctionOptions): + """ + Options for selecting with a boolean filter. + + Parameters + ---------- + null_selection_behavior : str, default "drop" + How to handle nulls in the selection filter. + Accepted values are "drop", "emit_null". + """ + + def __init__(self, null_selection_behavior: Literal["drop", "emit_null"] = "drop") -> None: ... + +class IndexOptions(FunctionOptions): + """ + Options for the `index` function. 
+ + Parameters + ---------- + value : Scalar + The value to search for. + """ + def __init__(self, value: lib.Scalar) -> None: ... + +class JoinOptions(FunctionOptions): + """ + Options for the `binary_join_element_wise` function. + + Parameters + ---------- + null_handling : str, default "emit_null" + How to handle null values in the inputs. + Accepted values are "emit_null", "skip", "replace". + null_replacement : str, default "" + Replacement string to emit for null inputs if `null_handling` + is "replace". + """ + @overload + def __init__(self, null_handling: Literal["emit_null", "skip"] = "emit_null") -> None: ... + @overload + def __init__(self, null_handling: Literal["replace"], null_replacement: str = "") -> None: ... + +class ListSliceOptions(FunctionOptions): + """ + Options for list array slicing. + + Parameters + ---------- + start : int + Index to start slicing inner list elements (inclusive). + stop : Optional[int], default None + If given, index to stop slicing at (exclusive). + If not given, slicing will stop at the end. (NotImplemented) + step : int, default 1 + Slice step. + return_fixed_size_list : Optional[bool], default None + Whether to return a FixedSizeListArray. If true _and_ stop is after + a list element's length, nulls will be appended to create the + requested slice size. The default of `None` will return the same + type which was passed in. + """ + def __init__( + self, + start: int, + stop: int | None = None, + step: int = 1, + return_fixed_size_list: bool | None = None, + ) -> None: ... + +class ListFlattenOptions(FunctionOptions): + """ + Options for `list_flatten` function + + Parameters + ---------- + recursive : bool, default False + When True, the list array is flattened recursively until an array + of non-list values is formed. + """ + def __init__(self, recursive: bool = False) -> None: ... + +class MakeStructOptions(FunctionOptions): + """ + Options for the `make_struct` function. + + Parameters + ---------- + field_names : sequence of str + Names of the struct fields to create. + field_nullability : sequence of bool, optional + Nullability information for each struct field. + If omitted, all fields are nullable. + field_metadata : sequence of KeyValueMetadata, optional + Metadata for each struct field. + """ + def __init__( + self, + field_names: Sequence[str] = (), + *, + field_nullability: Sequence[bool] | None = None, + field_metadata: Sequence[lib.KeyValueMetadata] | None = None, + ) -> None: ... + +class MapLookupOptions(FunctionOptions): + """ + Options for the `map_lookup` function. + + Parameters + ---------- + query_key : Scalar or Object can be converted to Scalar + The key to search for. + occurrence : str + The occurrence(s) to return from the Map + Accepted values are "first", "last", or "all". + """ + # TODO: query_key: Scalar or Object can be converted to Scalar + def __init__( + self, query_key: lib.Scalar, occurrence: Literal["first", "last", "all"] + ) -> None: ... + +class MatchSubstringOptions(FunctionOptions): + """ + Options for looking for a substring. + + Parameters + ---------- + pattern : str + Substring pattern to look for inside input values. + ignore_case : bool, default False + Whether to perform a case-insensitive match. + """ + + def __init__(self, pattern: str, *, ignore_case: bool = False) -> None: ... + +class ModeOptions(FunctionOptions): + """ + Options for the `mode` function. + + Parameters + ---------- + n : int, default 1 + Number of distinct most-common values to return. 
+ skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + """ + def __init__(self, n: int = 1, *, skip_nulls: bool = True, min_count: int = 0) -> None: ... + +class NullOptions(FunctionOptions): + """ + Options for the `is_null` function. + + Parameters + ---------- + nan_is_null : bool, default False + Whether floating-point NaN values are considered null. + """ + def __init__(self, *, nan_is_null: bool = False) -> None: ... + +class PadOptions(FunctionOptions): + """ + Options for padding strings. + + Parameters + ---------- + width : int + Desired string length. + padding : str, default " " + What to pad the string with. Should be one byte or codepoint. + lean_left_on_odd_padding : bool, default True + What to do if there is an odd number of padding characters (in case + of centered padding). Defaults to aligning on the left (i.e. adding + the extra padding character on the right). + """ + def __init__( + self, width: int, padding: str = " ", lean_left_on_odd_padding: bool = True + ) -> None: ... + +class PairwiseOptions(FunctionOptions): + """ + Options for `pairwise` functions. + + Parameters + ---------- + period : int, default 1 + Period for applying the period function. + """ + def __init__(self, period: int = 1) -> None: ... + +class PartitionNthOptions(FunctionOptions): + """ + Options for the `partition_nth_indices` function. + + Parameters + ---------- + pivot : int + Index into the equivalent sorted array of the pivot element. + null_placement : str, default "at_end" + Where nulls in the input should be partitioned. + Accepted values are "at_start", "at_end". + """ + def __init__(self, pivot: int, *, null_placement: _Placement = "at_end") -> None: ... + +class WinsorizeOptions(FunctionOptions): + """ + Options for the `winsorize` function. + + Parameters + ---------- + lower_limit : float, between 0 and 1 + The quantile below which all values are replaced with the quantile's value. + upper_limit : float, between 0 and 1 + The quantile above which all values are replaced with the quantile's value. + """ + def __init__(self, lower_limit: float, upper_limit: float) -> None: ... + +class QuantileOptions(FunctionOptions): + """ + Options for the `quantile` function. + + Parameters + ---------- + q : double or sequence of double, default 0.5 + Probability levels of the quantiles to compute. All values must be in + [0, 1]. + interpolation : str, default "linear" + How to break ties between competing data points for a given quantile. + Accepted values are: + + - "linear": compute an interpolation + - "lower": always use the smallest of the two data points + - "higher": always use the largest of the two data points + - "nearest": select the data point that is closest to the quantile + - "midpoint": compute the (unweighted) mean of the two data points + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. 
+ """ + def __init__( + self, + q: float | Sequence[float], + *, + interpolation: Literal["linear", "lower", "higher", "nearest", "midpoint"] = "linear", + skip_nulls: bool = True, + min_count: int = 0, + ) -> None: ... + +class RandomOptions(FunctionOptions): + """ + Options for random generation. + + Parameters + ---------- + initializer : int or str + How to initialize the underlying random generator. + If an integer is given, it is used as a seed. + If "system" is given, the random generator is initialized with + a system-specific source of (hopefully true) randomness. + Other values are invalid. + """ + def __init__(self, *, initializer: int | Literal["system"] = "system") -> None: ... + +class RankOptions(FunctionOptions): + """ + Options for the `rank` function. + + Parameters + ---------- + sort_keys : sequence of (name, order) tuples or str, default "ascending" + Names of field/column keys to sort the input on, + along with the order each field/column is sorted in. + Accepted values for `order` are "ascending", "descending". + The field name can be a string column name or expression. + Alternatively, one can simply pass "ascending" or "descending" as a string + if the input is array-like. + null_placement : str, default "at_end" + Where nulls in input should be sorted. + Accepted values are "at_start", "at_end". + tiebreaker : str, default "first" + Configure how ties between equal values are handled. + Accepted values are: + + - "min": Ties get the smallest possible rank in sorted order. + - "max": Ties get the largest possible rank in sorted order. + - "first": Ranks are assigned in order of when ties appear in the + input. This ensures the ranks are a stable permutation + of the input. + - "dense": The ranks span a dense [1, M] interval where M is the + number of distinct values in the input. + """ + def __init__( + self, + sort_keys: _Order | Sequence[tuple[str, _Order]] = "ascending", + *, + null_placement: _Placement = "at_end", + tiebreaker: Literal["min", "max", "first", "dense"] = "first", + ) -> None: ... + +class RankQuantileOptions(FunctionOptions): + """ + Options for the `rank_quantile` function. + + Parameters + ---------- + sort_keys : sequence of (name, order) tuples or str, default "ascending" + Names of field/column keys to sort the input on, + along with the order each field/column is sorted in. + Accepted values for `order` are "ascending", "descending". + The field name can be a string column name or expression. + Alternatively, one can simply pass "ascending" or "descending" as a string + if the input is array-like. + null_placement : str, default "at_end" + Where nulls in input should be sorted. + Accepted values are "at_start", "at_end". + """ + + def __init__( + self, + sort_keys: _Order | Sequence[tuple[str, _Order]] = "ascending", + *, + null_placement: _Placement = "at_end", + ) -> None: ... + +class PivotWiderOptions(FunctionOptions): + """ + Options for the `pivot_wider` function. + + Parameters + ---------- + key_names : sequence of str + The pivot key names expected in the pivot key column. + For each entry in `key_names`, a column with the same name is emitted + in the struct output. + unexpected_key_behavior : str, default "ignore" + The behavior when pivot keys not in `key_names` are encountered. + Accepted values are "ignore", "raise". + If "ignore", unexpected keys are silently ignored. + If "raise", unexpected keys raise a KeyError. 
+ """ + def __init__( + self, + key_names: Sequence[str], + *, + unexpected_key_behavior: Literal["ignore", "raise"] = "ignore", + ) -> None: ... + +class ReplaceSliceOptions(FunctionOptions): + """ + Options for replacing slices. + + Parameters + ---------- + start : int + Index to start slicing at (inclusive). + stop : int + Index to stop slicing at (exclusive). + replacement : str + What to replace the slice with. + """ + def __init__(self, start: int, stop: int, replacement: str) -> None: ... + +class ReplaceSubstringOptions(FunctionOptions): + """ + Options for replacing matched substrings. + + Parameters + ---------- + pattern : str + Substring pattern to look for inside input values. + replacement : str + What to replace the pattern with. + max_replacements : int or None, default None + The maximum number of strings to replace in each + input value (unlimited if None). + """ + def __init__( + self, pattern: str, replacement: str, *, max_replacements: int | None = None + ) -> None: ... + +_RoundMode: TypeAlias = Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", +] + +class RoundBinaryOptions(FunctionOptions): + """ + Options for rounding numbers when ndigits is provided by a second array + + Parameters + ---------- + round_mode : str, default "half_to_even" + Rounding and tie-breaking mode. + Accepted values are "down", "up", "towards_zero", "towards_infinity", + "half_down", "half_up", "half_towards_zero", "half_towards_infinity", + "half_to_even", "half_to_odd". + """ + def __init__( + self, + round_mode: _RoundMode = "half_to_even", + ) -> None: ... + +class RoundOptions(FunctionOptions): + """ + Options for rounding numbers. + + Parameters + ---------- + ndigits : int, default 0 + Number of fractional digits to round to. + round_mode : str, default "half_to_even" + Rounding and tie-breaking mode. + Accepted values are "down", "up", "towards_zero", "towards_infinity", + "half_down", "half_up", "half_towards_zero", "half_towards_infinity", + "half_to_even", "half_to_odd". + """ + def __init__( + self, + ndigits: int = 0, + round_mode: _RoundMode = "half_to_even", + ) -> None: ... + +_DateTimeUint: TypeAlias = Literal[ + "year", + "quarter", + "month", + "week", + "day", + "hour", + "minute", + "second", + "millisecond", + "microsecond", + "nanosecond", +] + +class RoundTemporalOptions(FunctionOptions): + """ + Options for rounding temporal values. + + Parameters + ---------- + multiple : int, default 1 + Number of units to round to. + unit : str, default "day" + The unit in which `multiple` is expressed. + Accepted values are "year", "quarter", "month", "week", "day", + "hour", "minute", "second", "millisecond", "microsecond", + "nanosecond". + week_starts_monday : bool, default True + If True, weeks start on Monday; if False, on Sunday. + ceil_is_strictly_greater : bool, default False + If True, ceil returns a rounded value that is strictly greater than the + input. For example: ceiling 1970-01-01T00:00:00 to 3 hours would + yield 1970-01-01T03:00:00 if set to True and 1970-01-01T00:00:00 + if set to False. + This applies to the ceil_temporal function only. + calendar_based_origin : bool, default False + By default, the origin is 1970-01-01T00:00:00. By setting this to True, + rounding origin will be beginning of one less precise calendar unit. + E.g.: rounding to hours will use beginning of day as origin. 
+ + By default time is rounded to a multiple of units since + 1970-01-01T00:00:00. By setting calendar_based_origin to true, + time will be rounded to number of units since the last greater + calendar unit. + For example: rounding to multiple of days since the beginning of the + month or to hours since the beginning of the day. + Exceptions: week and quarter are not used as greater units, + therefore days will be rounded to the beginning of the month not + week. Greater unit of week is a year. + Note that ceiling and rounding might change sorting order of an array + near greater unit change. For example rounding YYYY-mm-dd 23:00:00 to + 5 hours will ceil and round to YYYY-mm-dd+1 01:00:00 and floor to + YYYY-mm-dd 20:00:00. On the other hand YYYY-mm-dd+1 00:00:00 will + ceil, round and floor to YYYY-mm-dd+1 00:00:00. This can break the + order of an already ordered array. + """ + def __init__( + self, + multiple: int = 1, + unit: _DateTimeUint = "day", + *, + week_starts_monday: bool = True, + ceil_is_strictly_greater: bool = False, + calendar_based_origin: bool = False, + ) -> None: ... + +class RoundToMultipleOptions(FunctionOptions): + """ + Options for rounding numbers to a multiple. + + Parameters + ---------- + multiple : numeric scalar, default 1.0 + Multiple to round to. Should be a scalar of a type compatible + with the argument to be rounded. + round_mode : str, default "half_to_even" + Rounding and tie-breaking mode. + Accepted values are "down", "up", "towards_zero", "towards_infinity", + "half_down", "half_up", "half_towards_zero", "half_towards_infinity", + "half_to_even", "half_to_odd". + """ + def __init__(self, multiple: float = 1.0, round_mode: _RoundMode = "half_to_even") -> None: ... + +class ScalarAggregateOptions(FunctionOptions): + """ + Options for scalar aggregations. + + Parameters + ---------- + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + """ + def __init__(self, *, skip_nulls: bool = True, min_count: int = 1) -> None: ... + +class SelectKOptions(FunctionOptions): + """ + Options for top/bottom k-selection. + + Parameters + ---------- + k : int + Number of leading values to select in sorted order + (i.e. the largest values if sort order is "descending", + the smallest otherwise). + sort_keys : sequence of (name, order) tuples + Names of field/column keys to sort the input on, + along with the order each field/column is sorted in. + Accepted values for `order` are "ascending", "descending". + The field name can be a string column name or expression. + """ + + def __init__(self, k: int, sort_keys: Sequence[tuple[str, _Order]]) -> None: ... + +class SetLookupOptions(FunctionOptions): + """ + Options for the `is_in` and `index_in` functions. + + Parameters + ---------- + value_set : Array + Set of values to look for in the input. + skip_nulls : bool, default False + If False, nulls in the input are matched in the value_set just + like regular values. + If True, nulls in the input always fail matching. + """ + def __init__(self, value_set: lib.Array, *, skip_nulls: bool = True) -> None: ... + +class SliceOptions(FunctionOptions): + """ + Options for slicing. + + Parameters + ---------- + start : int + Index to start slicing at (inclusive). 
+ stop : int or None, default None + If given, index to stop slicing at (exclusive). + If not given, slicing will stop at the end. + step : int, default 1 + Slice step. + """ + + def __init__(self, start: int, stop: int | None = None, step: int = 1) -> None: ... + +class SortOptions(FunctionOptions): + """ + Options for the `sort_indices` function. + + Parameters + ---------- + sort_keys : sequence of (name, order) tuples + Names of field/column keys to sort the input on, + along with the order each field/column is sorted in. + Accepted values for `order` are "ascending", "descending". + The field name can be a string column name or expression. + null_placement : str, default "at_end" + Where nulls in input should be sorted, only applying to + columns/fields mentioned in `sort_keys`. + Accepted values are "at_start", "at_end". + """ + def __init__( + self, sort_keys: Sequence[tuple[str, _Order]], *, null_placement: _Placement = "at_end" + ) -> None: ... + +class SplitOptions(FunctionOptions): + """ + Options for splitting on whitespace. + + Parameters + ---------- + max_splits : int or None, default None + Maximum number of splits for each input value (unlimited if None). + reverse : bool, default False + Whether to start splitting from the end of each input value. + This only has an effect if `max_splits` is not None. + """ + + def __init__(self, *, max_splits: int | None = None, reverse: bool = False) -> None: ... + +class SplitPatternOptions(FunctionOptions): + """ + Options for splitting on a string pattern. + + Parameters + ---------- + pattern : str + String pattern to split on. + max_splits : int or None, default None + Maximum number of splits for each input value (unlimited if None). + reverse : bool, default False + Whether to start splitting from the end of each input value. + This only has an effect if `max_splits` is not None. + """ + def __init__( + self, pattern: str, *, max_splits: int | None = None, reverse: bool = False + ) -> None: ... + +class StrftimeOptions(FunctionOptions): + """ + Options for the `strftime` function. + + Parameters + ---------- + format : str, default "%Y-%m-%dT%H:%M:%S" + Pattern for formatting input values. + locale : str, default "C" + Locale to use for locale-specific format specifiers. + """ + def __init__(self, format: str = "%Y-%m-%dT%H:%M:%S", locale: str = "C") -> None: ... + +class StrptimeOptions(FunctionOptions): + """ + Options for the `strptime` function. + + Parameters + ---------- + format : str + Pattern for parsing input strings as timestamps, such as "%Y/%m/%d". + Note that the semantics of the format follow the C/C++ strptime, not the Python one. + There are differences in behavior, for example how the "%y" placeholder + handles years with less than four digits. + unit : str + Timestamp unit of the output. + Accepted values are "s", "ms", "us", "ns". + error_is_null : boolean, default False + Return null on parsing errors if true or raise if false. + """ + def __init__( + self, format: str, unit: Literal["s", "ms", "us", "ns"], error_is_null: bool = False + ) -> None: ... + +class StructFieldOptions(FunctionOptions): + """ + Options for the `struct_field` function. + + Parameters + ---------- + indices : List[str], List[bytes], List[int], Expression, bytes, str, or int + List of indices for chained field lookup, for example `[4, 1]` + will look up the second nested field in the fifth outer field. + """ + def __init__( + self, indices: list[str] | list[bytes] | list[int] | Expression | bytes | str | int + ) -> None: ... 
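+
+# Illustrative usage (a minimal sketch; the example arrays and keys below are
+# arbitrary, not taken from the stubs): these FunctionOptions subclasses are
+# passed to compute kernels through the ``options`` keyword, for example:
+#
+# >>> import pyarrow as pa
+# >>> import pyarrow.compute as pc
+# >>> arr = pa.array([3, 1, None, 2])
+# >>> pc.is_in(arr, options=pc.SetLookupOptions(value_set=pa.array([1, 2])))
+# >>> table = pa.table({"k": ["b", "a"], "v": [2, 1]})
+# >>> pc.sort_indices(table, options=pc.SortOptions(sort_keys=[("k", "ascending")]))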
+ +class TakeOptions(FunctionOptions): + """ + Options for the `take` and `array_take` functions. + + Parameters + ---------- + boundscheck : boolean, default True + Whether to check indices are within bounds. If False and an + index is out of bounds, behavior is undefined (the process + may crash). + """ + def __init__(self, boundscheck: bool = True) -> None: ... + +class TDigestOptions(FunctionOptions): + """ + Options for the `tdigest` function. + + Parameters + ---------- + q : double or sequence of double, default 0.5 + Probability levels of the quantiles to approximate. All values must be + in [0, 1]. + delta : int, default 100 + Compression parameter for the T-digest algorithm. + buffer_size : int, default 500 + Buffer size for the T-digest algorithm. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + """ + def __init__( + self, + q: float | Sequence[float] = 0.5, + *, + delta: int = 100, + buffer_size: int = 500, + skip_nulls: bool = True, + min_count: int = 0, + ) -> None: ... + +class TrimOptions(FunctionOptions): + """ + Options for trimming characters from strings. + + Parameters + ---------- + characters : str + Individual characters to be trimmed from the string. + """ + def __init__(self, characters: str) -> None: ... + +class Utf8NormalizeOptions(FunctionOptions): + """ + Options for the `utf8_normalize` function. + + Parameters + ---------- + form : str + Unicode normalization form. + Accepted values are "NFC", "NFKC", "NFD", NFKD". + """ + + def __init__(self, form: Literal["NFC", "NFKC", "NFD", "NFKD"]) -> None: ... + +class VarianceOptions(FunctionOptions): + """ + Options for the `variance` and `stddev` functions. + + Parameters + ---------- + ddof : int, default 0 + Number of degrees of freedom. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + """ + def __init__(self, *, ddof: int = 0, skip_nulls: bool = True, min_count: int = 0) -> None: ... + +class SkewOptions(FunctionOptions): + """ + Options for the `skew` and `kurtosis` functions. + + Parameters + ---------- + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + biased : bool, default True + Whether the calculated value is biased. + If False, the value computed includes a correction factor to reduce bias. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + """ + def __init__( + self, *, skip_nulls: bool = True, biased: bool = True, min_count: int = 0 + ) -> None: ... + +class WeekOptions(FunctionOptions): + """ + Options for the `week` function. + + Parameters + ---------- + week_starts_monday : bool, default True + If True, weeks start on Monday; if False, on Sunday. + count_from_zero : bool, default False + If True, dates at the start of a year that fall into the last week + of the previous year emit 0. + If False, they emit 52 or 53 (the week number of the last week + of the previous year). 
+ first_week_is_fully_in_year : bool, default False + If True, week number 0 is fully in January. + If False, a week that begins on December 29, 30 or 31 is considered + to be week number 0 of the following year. + """ + def __init__( + self, + *, + week_starts_monday: bool = True, + count_from_zero: bool = False, + first_week_is_fully_in_year: bool = False, + ) -> None: ... + +# ==================== _compute.pyx Functions ==================== + +def call_function( + name: str, + args: list, + options: FunctionOptions | None = None, + memory_pool: lib.MemoryPool | None = None, + length: int | None = None, +) -> Any: + """ + Call a named function. + + The function is looked up in the global registry + (as returned by `function_registry()`). + + Parameters + ---------- + name : str + The name of the function to call. + args : list + The arguments to the function. + options : optional + options provided to the function. + memory_pool : MemoryPool, optional + memory pool to use for allocations during function execution. + length : int, optional + Batch size for execution, for nullary (no argument) functions. If not + passed, inferred from data. + """ + +def function_registry() -> FunctionRegistry: ... +def get_function(name: str) -> Function: + """ + Get a function by name. + + The function is looked up in the global registry + (as returned by `function_registry()`). + + Parameters + ---------- + name : str + The name of the function to lookup + """ + +def list_functions() -> list[str]: + """ + Return all function names in the global registry. + """ + +# ==================== _compute.pyx Udf ==================== + +def call_tabular_function( + function_name: str, args: Iterable | None = None, func_registry: FunctionRegistry | None = None +) -> lib.RecordBatchReader: + """ + Get a record batch iterator from a tabular function. + + Parameters + ---------- + function_name : str + Name of the function. + args : iterable + The arguments to pass to the function. Accepted types depend + on the specific function. Currently, only an empty args is supported. + func_registry : FunctionRegistry + Optional function registry to use instead of the default global one. + """ + +class _FunctionDoc(TypedDict): + summary: str + description: str + +def register_scalar_function( + func: Callable, + function_name: str, + function_doc: _FunctionDoc, + in_types: dict[str, lib.DataType], + out_type: lib.DataType, + func_registry: FunctionRegistry | None = None, +) -> None: + """ + Register a user-defined scalar function. + + This API is EXPERIMENTAL. + + A scalar function is a function that executes elementwise + operations on arrays or scalars, i.e. a scalar function must + be computed row-by-row with no state where each output row + is computed only from its corresponding input row. + In other words, all argument arrays have the same length, + and the output array is of the same length as the arguments. + Scalar functions are the only functions allowed in query engine + expressions. + + Parameters + ---------- + func : callable + A callable implementing the user-defined function. + The first argument is the context argument of type + UdfContext. + Then, it must take arguments equal to the number of + in_types defined. It must return an Array or Scalar + matching the out_type. It must return a Scalar if + all arguments are scalar, else it must return an Array. + + To define a varargs function, pass a callable that takes + *args. The last in_type will be the type of all varargs + arguments. 
+ function_name : str + Name of the function. There should only be one function + registered with this name in the function registry. + function_doc : dict + A dictionary object with keys "summary" (str), + and "description" (str). + in_types : Dict[str, DataType] + A dictionary mapping function argument names to + their respective DataType. + The argument names will be used to generate + documentation for the function. The number of + arguments specified here determines the function + arity. + out_type : DataType + Output type of the function. + func_registry : FunctionRegistry + Optional function registry to use instead of the default global one. + + Examples + -------- + >>> import pyarrow as pa + >>> import pyarrow.compute as pc + >>> + >>> func_doc = {} + >>> func_doc["summary"] = "simple udf" + >>> func_doc["description"] = "add a constant to a scalar" + >>> + >>> def add_constant(ctx, array): + ... return pc.add(array, 1, memory_pool=ctx.memory_pool) + >>> + >>> func_name = "py_add_func" + >>> in_types = {"array": pa.int64()} + >>> out_type = pa.int64() + >>> pc.register_scalar_function(add_constant, func_name, func_doc, in_types, out_type) + >>> + >>> func = pc.get_function(func_name) + >>> func.name + 'py_add_func' + >>> answer = pc.call_function(func_name, [pa.array([20])]) + >>> answer + + [ + 21 + ] + """ + +def register_tabular_function( + func: Callable, + function_name: str, + function_doc: _FunctionDoc, + in_types: dict[str, lib.DataType], + out_type: lib.DataType, + func_registry: FunctionRegistry | None = None, +) -> None: + """ + Register a user-defined tabular function. + + This API is EXPERIMENTAL. + + A tabular function is one accepting a context argument of type + UdfContext and returning a generator of struct arrays. + The in_types argument must be empty and the out_type argument + specifies a schema. Each struct array must have field types + corresponding to the schema. + + Parameters + ---------- + func : callable + A callable implementing the user-defined function. + The only argument is the context argument of type + UdfContext. It must return a callable that + returns on each invocation a StructArray matching + the out_type, where an empty array indicates end. + function_name : str + Name of the function. There should only be one function + registered with this name in the function registry. + function_doc : dict + A dictionary object with keys "summary" (str), + and "description" (str). + in_types : Dict[str, DataType] + Must be an empty dictionary (reserved for future use). + out_type : Union[Schema, DataType] + Schema of the function's output, or a corresponding flat struct type. + func_registry : FunctionRegistry + Optional function registry to use instead of the default global one. + """ + +def register_aggregate_function( + func: Callable, + function_name: str, + function_doc: _FunctionDoc, + in_types: dict[str, lib.DataType], + out_type: lib.DataType, + func_registry: FunctionRegistry | None = None, +) -> None: + """ + Register a user-defined non-decomposable aggregate function. + + This API is EXPERIMENTAL. + + A non-decomposable aggregation function is a function that executes + aggregate operations on the whole data that it is aggregating. + In other words, non-decomposable aggregate function cannot be + split into consume/merge/finalize steps. + + This is often used with ordered or segmented aggregation where groups + can be emit before accumulating all of the input data. 
+ + Note that currently the size of any input column cannot exceed 2 GB + for a single segment (all groups combined). + + Parameters + ---------- + func : callable + A callable implementing the user-defined function. + The first argument is the context argument of type + UdfContext. + Then, it must take arguments equal to the number of + in_types defined. It must return a Scalar matching the + out_type. + To define a varargs function, pass a callable that takes + *args. The in_type needs to match in type of inputs when + the function gets called. + function_name : str + Name of the function. This name must be unique, i.e., + there should only be one function registered with + this name in the function registry. + function_doc : dict + A dictionary object with keys "summary" (str), + and "description" (str). + in_types : Dict[str, DataType] + A dictionary mapping function argument names to + their respective DataType. + The argument names will be used to generate + documentation for the function. The number of + arguments specified here determines the function + arity. + out_type : DataType + Output type of the function. + func_registry : FunctionRegistry + Optional function registry to use instead of the default global one. + + Examples + -------- + >>> import numpy as np + >>> import pyarrow as pa + >>> import pyarrow.compute as pc + >>> + >>> func_doc = {} + >>> func_doc["summary"] = "simple median udf" + >>> func_doc["description"] = "compute median" + >>> + >>> def compute_median(ctx, array): + ... return pa.scalar(np.median(array)) + >>> + >>> func_name = "py_compute_median" + >>> in_types = {"array": pa.int64()} + >>> out_type = pa.float64() + >>> pc.register_aggregate_function(compute_median, func_name, func_doc, in_types, out_type) + >>> + >>> func = pc.get_function(func_name) + >>> func.name + 'py_compute_median' + >>> answer = pc.call_function(func_name, [pa.array([20, 40])]) + >>> answer + + >>> table = pa.table([pa.array([1, 1, 2, 2]), pa.array([10, 20, 30, 40])], names=["k", "v"]) + >>> result = table.group_by("k").aggregate([("v", "py_compute_median")]) + >>> result + pyarrow.Table + k: int64 + v_py_compute_median: double + ---- + k: [[1,2]] + v_py_compute_median: [[15,35]] + """ + +def register_vector_function( + func: Callable, + function_name: str, + function_doc: _FunctionDoc, + in_types: dict[str, lib.DataType], + out_type: lib.DataType, + func_registry: FunctionRegistry | None = None, +) -> None: + """ + Register a user-defined vector function. + + This API is EXPERIMENTAL. + + A vector function is a function that executes vector + operations on arrays. Vector function is often used + when compute doesn't fit other more specific types of + functions (e.g., scalar and aggregate). + + Parameters + ---------- + func : callable + A callable implementing the user-defined function. + The first argument is the context argument of type + UdfContext. + Then, it must take arguments equal to the number of + in_types defined. It must return an Array or Scalar + matching the out_type. It must return a Scalar if + all arguments are scalar, else it must return an Array. + + To define a varargs function, pass a callable that takes + *args. The last in_type will be the type of all varargs + arguments. + function_name : str + Name of the function. There should only be one function + registered with this name in the function registry. + function_doc : dict + A dictionary object with keys "summary" (str), + and "description" (str). 
+ in_types : Dict[str, DataType] + A dictionary mapping function argument names to + their respective DataType. + The argument names will be used to generate + documentation for the function. The number of + arguments specified here determines the function + arity. + out_type : DataType + Output type of the function. + func_registry : FunctionRegistry + Optional function registry to use instead of the default global one. + + Examples + -------- + >>> import pyarrow as pa + >>> import pyarrow.compute as pc + >>> + >>> func_doc = {} + >>> func_doc["summary"] = "percent rank" + >>> func_doc["description"] = "compute percent rank" + >>> + >>> def list_flatten_udf(ctx, x): + ... return pc.list_flatten(x) + >>> + >>> func_name = "list_flatten_udf" + >>> in_types = {"array": pa.list_(pa.int64())} + >>> out_type = pa.int64() + >>> pc.register_vector_function(list_flatten_udf, func_name, func_doc, in_types, out_type) + >>> + >>> answer = pc.call_function(func_name, [pa.array([[1, 2], [3, 4]])]) + >>> answer + + [ + 1, + 2, + 3, + 4 + ] + """ + +class UdfContext: + """ + Per-invocation function context/state. + + This object will always be the first argument to a user-defined + function. It should not be used outside of a call to the function. + """ + + @property + def batch_length(self) -> int: + """ + The common length of all input arguments (int). + + In the case that all arguments are scalars, this value + is used to pass the "actual length" of the arguments, + e.g. because the scalar values are encoding a column + with a constant value. + """ + @property + def memory_pool(self) -> lib.MemoryPool: + """ + A memory pool for allocations (:class:`MemoryPool`). + + This is the memory pool supplied by the user when they invoked + the function and it should be used in any calls to arrow that the + UDF makes if that call accepts a memory_pool. + """ + +# ==================== _compute.pyx Expression ==================== +class Expression(lib._Weakrefable): + """ + A logical expression to be evaluated against some input. + + To create an expression: + + - Use the factory function ``pyarrow.compute.scalar()`` to create a + scalar (not necessary when combined, see example below). + - Use the factory function ``pyarrow.compute.field()`` to reference + a field (column in table). + - Compare fields and scalars with ``<``, ``<=``, ``==``, ``>=``, ``>``. + - Combine expressions using python operators ``&`` (logical and), + ``|`` (logical or) and ``~`` (logical not). + Note: python keywords ``and``, ``or`` and ``not`` cannot be used + to combine expressions. + - Create expression predicates using Expression methods such as + ``pyarrow.compute.Expression.isin()``. + + Examples + -------- + + >>> import pyarrow.compute as pc + >>> (pc.field("a") < pc.scalar(3)) | (pc.field("b") > 7) + 7))> + >>> pc.field("a") != 3 + + >>> pc.field("a").isin([1, 2, 3]) + + """ + + @staticmethod + def from_substrait(buffer: bytes | lib.Buffer) -> Expression: + """ + Deserialize an expression from Substrait + + The serialized message must be an ExtendedExpression message that has + only a single expression. The name of the expression and the schema + the expression was bound to will be ignored. Use + pyarrow.substrait.deserialize_expressions if this information is needed + or if the message might contain multiple expressions. 
+ + Parameters + ---------- + message : bytes or Buffer or a protobuf Message + The Substrait message to deserialize + + Returns + ------- + Expression + The deserialized expression + """ + def to_substrait(self, schema: lib.Schema, allow_arrow_extensions: bool = False) -> lib.Buffer: + """ + Serialize the expression using Substrait + + The expression will be serialized as an ExtendedExpression message that has a + single expression named "expression" + + Parameters + ---------- + schema : Schema + The input schema the expression will be bound to + allow_arrow_extensions : bool, default False + If False then only functions that are part of the core Substrait function + definitions will be allowed. Set this to True to allow pyarrow-specific functions + but the result may not be accepted by other compute libraries. + + Returns + ------- + Buffer + A buffer containing the serialized Protobuf plan. + """ + def __invert__(self) -> Expression: ... + def __and__(self, other) -> Expression: ... + def __or__(self, other) -> Expression: ... + def __add__(self, other) -> Expression: ... + def __mul__(self, other) -> Expression: ... + def __sub__(self, other) -> Expression: ... + def __eq__(self, value: object) -> Expression: ... # type: ignore[override] + def __ne__(self, value: object) -> Expression: ... # type: ignore[override] + def __gt__(self, value: object) -> Expression: ... # type: ignore[override] + def __lt__(self, value: object) -> Expression: ... # type: ignore[override] + def __ge__(self, value: object) -> Expression: ... # type: ignore[override] + def __le__(self, value: object) -> Expression: ... # type: ignore[override] + def __truediv__(self, other) -> Expression: ... + def is_valid(self) -> bool: + """ + Check whether the expression is not-null (valid). + + This creates a new expression equivalent to calling the + `is_valid` compute function on this expression. + + Returns + ------- + is_valid : Expression + """ + def is_null(self, nan_is_null: bool = False) -> Expression: + """ + Check whether the expression is null. + + This creates a new expression equivalent to calling the + `is_null` compute function on this expression. + + Parameters + ---------- + nan_is_null : boolean, default False + Whether floating-point NaNs are considered null. + + Returns + ------- + is_null : Expression + """ + def is_nan(self) -> Expression: + """ + Check whether the expression is NaN. + + This creates a new expression equivalent to calling the + `is_nan` compute function on this expression. + + Returns + ------- + is_nan : Expression + """ + def cast( + self, type: lib.DataType, safe: bool = True, options: CastOptions | None = None + ) -> Expression: + """ + Explicitly set or change the expression's data type. + + This creates a new expression equivalent to calling the + `cast` compute function on this expression. + + Parameters + ---------- + type : DataType, default None + Type to cast array to. + safe : boolean, default True + Whether to check for conversion errors such as overflow. + options : CastOptions, default None + Additional checks pass by CastOptions + + Returns + ------- + cast : Expression + """ + def isin(self, values: lib.Array | Iterable) -> Expression: + """ + Check whether the expression is contained in values. + + This creates a new expression equivalent to calling the + `is_in` compute function on this expression. + + Parameters + ---------- + values : Array or iterable + The values to check for. 
+ + Returns + ------- + isin : Expression + A new expression that, when evaluated, checks whether + this expression's value is contained in `values`. + """ + +# ==================== _compute.py ==================== diff --git a/python/stubs/_csv.pyi b/python/stubs/_csv.pyi new file mode 100644 index 00000000000..2f49f8c9a6c --- /dev/null +++ b/python/stubs/_csv.pyi @@ -0,0 +1,641 @@ +from dataclasses import dataclass, field +from typing import IO, Any, Callable, Literal + +from _typeshed import StrPath + +from . import lib + +@dataclass(kw_only=True) +class ReadOptions(lib._Weakrefable): + """ + Options for reading CSV files. + + Parameters + ---------- + use_threads : bool, optional (default True) + Whether to use multiple threads to accelerate reading + block_size : int, optional + How much bytes to process at a time from the input stream. + This will determine multi-threading granularity as well as + the size of individual record batches or table chunks. + Minimum valid value for block size is 1 + skip_rows : int, optional (default 0) + The number of rows to skip before the column names (if any) + and the CSV data. + skip_rows_after_names : int, optional (default 0) + The number of rows to skip after the column names. + This number can be larger than the number of rows in one + block, and empty rows are counted. + The order of application is as follows: + - `skip_rows` is applied (if non-zero); + - column names are read (unless `column_names` is set); + - `skip_rows_after_names` is applied (if non-zero). + column_names : list, optional + The column names of the target table. If empty, fall back on + `autogenerate_column_names`. + autogenerate_column_names : bool, optional (default False) + Whether to autogenerate column names if `column_names` is empty. + If true, column names will be of the form "f0", "f1"... + If false, column names will be read from the first CSV row + after `skip_rows`. + encoding : str, optional (default 'utf8') + The character encoding of the CSV data. Columns that cannot + decode using this encoding can still be read as Binary. 
+ + Examples + -------- + + Defining an example data: + + >>> import io + >>> s = "1,2,3\\nFlamingo,2,2022-03-01\\nHorse,4,2022-03-02\\nBrittle stars,5,2022-03-03\\nCentipede,100,2022-03-04" + >>> print(s) + 1,2,3 + Flamingo,2,2022-03-01 + Horse,4,2022-03-02 + Brittle stars,5,2022-03-03 + Centipede,100,2022-03-04 + + Ignore the first numbered row and substitute it with defined + or autogenerated column names: + + >>> from pyarrow import csv + >>> read_options = csv.ReadOptions(column_names=["animals", "n_legs", "entry"], skip_rows=1) + >>> csv.read_csv(io.BytesIO(s.encode()), read_options=read_options) + pyarrow.Table + animals: string + n_legs: int64 + entry: date32[day] + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + n_legs: [[2,4,5,100]] + entry: [[2022-03-01,2022-03-02,2022-03-03,2022-03-04]] + + >>> read_options = csv.ReadOptions(autogenerate_column_names=True, skip_rows=1) + >>> csv.read_csv(io.BytesIO(s.encode()), read_options=read_options) + pyarrow.Table + f0: string + f1: int64 + f2: date32[day] + ---- + f0: [["Flamingo","Horse","Brittle stars","Centipede"]] + f1: [[2,4,5,100]] + f2: [[2022-03-01,2022-03-02,2022-03-03,2022-03-04]] + + Remove the first 2 rows of the data: + + >>> read_options = csv.ReadOptions(skip_rows_after_names=2) + >>> csv.read_csv(io.BytesIO(s.encode()), read_options=read_options) + pyarrow.Table + 1: string + 2: int64 + 3: date32[day] + ---- + 1: [["Brittle stars","Centipede"]] + 2: [[5,100]] + 3: [[2022-03-03,2022-03-04]] + """ + + use_threads: bool = field(default=True, kw_only=False) + block_size: int | None = None + skip_rows: int = 0 + skip_rows_after_names: int = 0 + column_names: list[str] | None = None + autogenerate_column_names: bool = False + encoding: str = "utf8" + + def validate(self) -> None: ... + +@dataclass(kw_only=True) +class ParseOptions(lib._Weakrefable): + """ + Options for parsing CSV files. + + Parameters + ---------- + delimiter : 1-character string, optional (default ',') + The character delimiting individual cells in the CSV data. + quote_char : 1-character string or False, optional (default '"') + The character used optionally for quoting CSV values + (False if quoting is not allowed). + double_quote : bool, optional (default True) + Whether two quotes in a quoted CSV value denote a single quote + in the data. + escape_char : 1-character string or False, optional (default False) + The character used optionally for escaping special characters + (False if escaping is not allowed). + newlines_in_values : bool, optional (default False) + Whether newline characters are allowed in CSV values. + Setting this to True reduces the performance of multi-threaded + CSV reading. + ignore_empty_lines : bool, optional (default True) + Whether empty lines are ignored in CSV input. + If False, an empty line is interpreted as containing a single empty + value (assuming a one-column CSV file). + invalid_row_handler : callable, optional (default None) + If not None, this object is called for each CSV row that fails + parsing (because of a mismatching number of columns). + It should accept a single InvalidRow argument and return either + "skip" or "error" depending on the desired outcome. + + Examples + -------- + + Defining an example file from bytes object: + + >>> import io + >>> s = ( + ... "animals;n_legs;entry\\n" + ... "Flamingo;2;2022-03-01\\n" + ... "# Comment here:\\n" + ... "Horse;4;2022-03-02\\n" + ... "Brittle stars;5;2022-03-03\\n" + ... "Centipede;100;2022-03-04" + ... 
) + >>> print(s) + animals;n_legs;entry + Flamingo;2;2022-03-01 + # Comment here: + Horse;4;2022-03-02 + Brittle stars;5;2022-03-03 + Centipede;100;2022-03-04 + >>> source = io.BytesIO(s.encode()) + + Read the data from a file skipping rows with comments + and defining the delimiter: + + >>> from pyarrow import csv + >>> def skip_comment(row): + ... if row.text.startswith("# "): + ... return "skip" + ... else: + ... return "error" + >>> parse_options = csv.ParseOptions(delimiter=";", invalid_row_handler=skip_comment) + >>> csv.read_csv(source, parse_options=parse_options) + pyarrow.Table + animals: string + n_legs: int64 + entry: date32[day] + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + n_legs: [[2,4,5,100]] + entry: [[2022-03-01,2022-03-02,2022-03-03,2022-03-04]] + """ + + delimiter: str = field(default=",", kw_only=False) + quote_char: str | Literal[False] = '"' + double_quote: bool = True + escape_char: str | Literal[False] = False + newlines_in_values: bool = False + ignore_empty_lines: bool = True + invalid_row_handler: Callable[[InvalidRow], Literal["skip", "error"]] | None = None + + def validate(self) -> None: ... + +@dataclass(kw_only=True) +class ConvertOptions(lib._Weakrefable): + """ + Options for converting CSV data. + + Parameters + ---------- + check_utf8 : bool, optional (default True) + Whether to check UTF8 validity of string columns. + column_types : pyarrow.Schema or dict, optional + Explicitly map column names to column types. Passing this argument + disables type inference on the defined columns. + null_values : list, optional + A sequence of strings that denote nulls in the data + (defaults are appropriate in most cases). Note that by default, + string columns are not checked for null values. To enable + null checking for those, specify ``strings_can_be_null=True``. + true_values : list, optional + A sequence of strings that denote true booleans in the data + (defaults are appropriate in most cases). + false_values : list, optional + A sequence of strings that denote false booleans in the data + (defaults are appropriate in most cases). + decimal_point : 1-character string, optional (default '.') + The character used as decimal point in floating-point and decimal + data. + strings_can_be_null : bool, optional (default False) + Whether string / binary columns can have null values. + If true, then strings in null_values are considered null for + string columns. + If false, then all strings are valid string values. + quoted_strings_can_be_null : bool, optional (default True) + Whether quoted values can be null. + If true, then strings in "null_values" are also considered null + when they appear quoted in the CSV file. Otherwise, quoted values + are never considered null. + include_columns : list, optional + The names of columns to include in the Table. + If empty, the Table will include all columns from the CSV file. + If not empty, only these columns will be included, in this order. + include_missing_columns : bool, optional (default False) + If false, columns in `include_columns` but not in the CSV file will + error out. + If true, columns in `include_columns` but not in the CSV file will + produce a column of nulls (whose type is selected using + `column_types`, or null by default). + This option is ignored if `include_columns` is empty. + auto_dict_encode : bool, optional (default False) + Whether to try to automatically dict-encode string / binary data. 
+ If true, then when type inference detects a string or binary column, + it it dict-encoded up to `auto_dict_max_cardinality` distinct values + (per chunk), after which it switches to regular encoding. + This setting is ignored for non-inferred columns (those in + `column_types`). + auto_dict_max_cardinality : int, optional + The maximum dictionary cardinality for `auto_dict_encode`. + This value is per chunk. + timestamp_parsers : list, optional + A sequence of strptime()-compatible format strings, tried in order + when attempting to infer or convert timestamp values (the special + value ISO8601() can also be given). By default, a fast built-in + ISO-8601 parser is used. + + Examples + -------- + + Defining an example data: + + >>> import io + >>> s = ( + ... "animals,n_legs,entry,fast\\n" + ... "Flamingo,2,01/03/2022,Yes\\n" + ... "Horse,4,02/03/2022,Yes\\n" + ... "Brittle stars,5,03/03/2022,No\\n" + ... "Centipede,100,04/03/2022,No\\n" + ... ",6,05/03/2022," + ... ) + >>> print(s) + animals,n_legs,entry,fast + Flamingo,2,01/03/2022,Yes + Horse,4,02/03/2022,Yes + Brittle stars,5,03/03/2022,No + Centipede,100,04/03/2022,No + ,6,05/03/2022, + + Change the type of a column: + + >>> import pyarrow as pa + >>> from pyarrow import csv + >>> convert_options = csv.ConvertOptions(column_types={"n_legs": pa.float64()}) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + animals: string + n_legs: double + entry: string + fast: string + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede",""]] + n_legs: [[2,4,5,100,6]] + entry: [["01/03/2022","02/03/2022","03/03/2022","04/03/2022","05/03/2022"]] + fast: [["Yes","Yes","No","No",""]] + + Define a date parsing format to get a timestamp type column + (in case dates are not in ISO format and not converted by default): + + >>> convert_options = csv.ConvertOptions(timestamp_parsers=["%m/%d/%Y", "%m-%d-%Y"]) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + animals: string + n_legs: int64 + entry: timestamp[s] + fast: string + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede",""]] + n_legs: [[2,4,5,100,6]] + entry: [[2022-01-03 00:00:00,2022-02-03 00:00:00,2022-03-03 00:00:00,2022-04-03 00:00:00,2022-05-03 00:00:00]] + fast: [["Yes","Yes","No","No",""]] + + Specify a subset of columns to be read: + + >>> convert_options = csv.ConvertOptions(include_columns=["animals", "n_legs"]) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + animals: string + n_legs: int64 + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede",""]] + n_legs: [[2,4,5,100,6]] + + List additional column to be included as a null typed column: + + >>> convert_options = csv.ConvertOptions( + ... include_columns=["animals", "n_legs", "location"], include_missing_columns=True + ... ) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + animals: string + n_legs: int64 + location: null + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede",""]] + n_legs: [[2,4,5,100,6]] + location: [5 nulls] + + Define columns as dictionary type (by default only the + string/binary columns are dictionary encoded): + + >>> convert_options = csv.ConvertOptions( + ... timestamp_parsers=["%m/%d/%Y", "%m-%d-%Y"], auto_dict_encode=True + ... 
) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + animals: dictionary + n_legs: int64 + entry: timestamp[s] + fast: dictionary + ---- + animals: [ -- dictionary: + ["Flamingo","Horse","Brittle stars","Centipede",""] -- indices: + [0,1,2,3,4]] + n_legs: [[2,4,5,100,6]] + entry: [[2022-01-03 00:00:00,2022-02-03 00:00:00,2022-03-03 00:00:00,2022-04-03 00:00:00,2022-05-03 00:00:00]] + fast: [ -- dictionary: + ["Yes","No",""] -- indices: + [0,0,1,1,2]] + + Set upper limit for the number of categories. If the categories + is more than the limit, the conversion to dictionary will not + happen: + + >>> convert_options = csv.ConvertOptions( + ... include_columns=["animals"], auto_dict_encode=True, auto_dict_max_cardinality=2 + ... ) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + animals: string + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede",""]] + + Set empty strings to missing values: + + >>> convert_options = csv.ConvertOptions( + ... include_columns=["animals", "n_legs"], strings_can_be_null=True + ... ) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + animals: string + n_legs: int64 + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede",null]] + n_legs: [[2,4,5,100,6]] + + Define values to be True and False when converting a column + into a bool type: + + >>> convert_options = csv.ConvertOptions( + ... include_columns=["fast"], false_values=["No"], true_values=["Yes"] + ... ) + >>> csv.read_csv(io.BytesIO(s.encode()), convert_options=convert_options) + pyarrow.Table + fast: bool + ---- + fast: [[true,true,false,false,null]] + """ + + check_utf8: bool = field(default=True, kw_only=False) + column_types: lib.Schema | dict | None = None + null_values: list[str] | None = None + true_values: list[str] | None = None + false_values: list[str] | None = None + decimal_point: str = "." + strings_can_be_null: bool = False + quoted_strings_can_be_null: bool = True + include_columns: list[str] | None = None + include_missing_columns: bool = False + auto_dict_encode: bool = False + auto_dict_max_cardinality: int | None = None + timestamp_parsers: list[str] | None = None + + def validate(self) -> None: ... + +@dataclass(kw_only=True) +class WriteOptions(lib._Weakrefable): + """ + Options for writing CSV files. + + Parameters + ---------- + include_header : bool, optional (default True) + Whether to write an initial header line with column names + batch_size : int, optional (default 1024) + How many rows to process together when converting and writing + CSV data + delimiter : 1-character string, optional (default ",") + The character delimiting individual cells in the CSV data. + quoting_style : str, optional (default "needed") + Whether to quote values, and if so, which quoting style to use. + The following values are accepted: + + - "needed" (default): only enclose values in quotes when needed. + - "all_valid": enclose all valid values in quotes; nulls are not quoted. + - "none": do not enclose any values in quotes; values containing + special characters (such as quotes, cell delimiters or line endings) + will raise an error. + """ + + include_header: bool = field(default=True, kw_only=False) + batch_size: int = 1024 + delimiter: str = "," + quoting_style: Literal["needed", "all_valid", "none"] = "needed" + + def validate(self) -> None: ... 
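+
+# Illustrative usage (a minimal sketch; the table contents and delimiter are
+# arbitrary examples): WriteOptions and ParseOptions combine for a round trip
+# through an in-memory buffer, for example:
+#
+# >>> import io
+# >>> import pyarrow as pa
+# >>> from pyarrow import csv
+# >>> table = pa.table({"animals": ["Flamingo", "Horse"], "n_legs": [2, 4]})
+# >>> sink = io.BytesIO()
+# >>> csv.write_csv(table, sink, write_options=csv.WriteOptions(delimiter=";"))
+# >>> csv.read_csv(io.BytesIO(sink.getvalue()), parse_options=csv.ParseOptions(delimiter=";"))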
+ +@dataclass +class InvalidRow(lib._Weakrefable): + """ + Description of an invalid row in a CSV file. + + Parameters + ---------- + expected_columns : int + The expected number of columns in the row. + actual_columns : int + The actual number of columns in the row. + number : int or None + The physical row number if known, otherwise None. + text : str + The contents of the row. + """ + + expected_columns: int + actual_columns: int + number: int | None + text: str + +class CSVWriter(lib._CRecordBatchWriter): + """ + Writer to create a CSV file. + + Parameters + ---------- + sink : str, path, pyarrow.OutputStream or file-like object + The location where to write the CSV data. + schema : pyarrow.Schema + The schema of the data to be written. + write_options : pyarrow.csv.WriteOptions + Options to configure writing the CSV data. + memory_pool : MemoryPool, optional + Pool for temporary allocations. + """ + + def __init__( + self, + # TODO: OutputStream + sink: StrPath | IO[Any], + schema: lib.Schema, + write_options: WriteOptions | None = None, + *, + memory_pool: lib.MemoryPool | None = None, + ) -> None: ... + +class CSVStreamingReader(lib.RecordBatchReader): ... + +ISO8601: lib._Weakrefable + +def open_csv( + input_file: StrPath | IO[Any], + read_options: ReadOptions | None = None, + parse_options: ParseOptions | None = None, + convert_options: ConvertOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> CSVStreamingReader: + """ + Open a streaming reader of CSV data. + + Reading using this function is always single-threaded. + + Parameters + ---------- + input_file : string, path or file-like object + The location of CSV data. If a string or path, and if it ends + with a recognized compressed file extension (e.g. ".gz" or ".bz2"), + the data is automatically decompressed when reading. + read_options : pyarrow.csv.ReadOptions, optional + Options for the CSV reader (see pyarrow.csv.ReadOptions constructor + for defaults) + parse_options : pyarrow.csv.ParseOptions, optional + Options for the CSV parser + (see pyarrow.csv.ParseOptions constructor for defaults) + convert_options : pyarrow.csv.ConvertOptions, optional + Options for converting CSV data + (see pyarrow.csv.ConvertOptions constructor for defaults) + memory_pool : MemoryPool, optional + Pool to allocate RecordBatch memory from + + Returns + ------- + :class:`pyarrow.csv.CSVStreamingReader` + """ + +def read_csv( + input_file: StrPath | IO[Any], + read_options: ReadOptions | None = None, + parse_options: ParseOptions | None = None, + convert_options: ConvertOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Table: + """ + Read a Table from a stream of CSV data. + + Parameters + ---------- + input_file : string, path or file-like object + The location of CSV data. If a string or path, and if it ends + with a recognized compressed file extension (e.g. ".gz" or ".bz2"), + the data is automatically decompressed when reading. 
+ read_options : pyarrow.csv.ReadOptions, optional + Options for the CSV reader (see pyarrow.csv.ReadOptions constructor + for defaults) + parse_options : pyarrow.csv.ParseOptions, optional + Options for the CSV parser + (see pyarrow.csv.ParseOptions constructor for defaults) + convert_options : pyarrow.csv.ConvertOptions, optional + Options for converting CSV data + (see pyarrow.csv.ConvertOptions constructor for defaults) + memory_pool : MemoryPool, optional + Pool to allocate Table memory from + + Returns + ------- + :class:`pyarrow.Table` + Contents of the CSV file as a in-memory table. + + Examples + -------- + + Defining an example file from bytes object: + + >>> import io + >>> s = ( + ... "animals,n_legs,entry\\n" + ... "Flamingo,2,2022-03-01\\n" + ... "Horse,4,2022-03-02\\n" + ... "Brittle stars,5,2022-03-03\\n" + ... "Centipede,100,2022-03-04" + ... ) + >>> print(s) + animals,n_legs,entry + Flamingo,2,2022-03-01 + Horse,4,2022-03-02 + Brittle stars,5,2022-03-03 + Centipede,100,2022-03-04 + >>> source = io.BytesIO(s.encode()) + + Reading from the file + + >>> from pyarrow import csv + >>> csv.read_csv(source) + pyarrow.Table + animals: string + n_legs: int64 + entry: date32[day] + ---- + animals: [["Flamingo","Horse","Brittle stars","Centipede"]] + n_legs: [[2,4,5,100]] + entry: [[2022-03-01,2022-03-02,2022-03-03,2022-03-04]] + """ + +def write_csv( + data: lib.RecordBatch | lib.Table, + output_file: StrPath | lib.NativeFile | IO[Any], + write_options: WriteOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> None: + """ + Write record batch or table to a CSV file. + + Parameters + ---------- + data : pyarrow.RecordBatch or pyarrow.Table + The data to write. + output_file : string, path, pyarrow.NativeFile, or file-like object + The location where to write the CSV data. + write_options : pyarrow.csv.WriteOptions + Options to configure writing the CSV data. + memory_pool : MemoryPool, optional + Pool for temporary allocations. + + Examples + -------- + + >>> import pyarrow as pa + >>> from pyarrow import csv + + >>> legs = pa.array([2, 4, 5, 100]) + >>> animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"]) + >>> entry_date = pa.array(["01/03/2022", "02/03/2022", "03/03/2022", "04/03/2022"]) + >>> table = pa.table([animals, legs, entry_date], names=["animals", "n_legs", "entry"]) + + >>> csv.write_csv(table, "animals.csv") + + >>> write_options = csv.WriteOptions(include_header=False) + >>> csv.write_csv(table, "animals.csv", write_options=write_options) + + >>> write_options = csv.WriteOptions(delimiter=";") + >>> csv.write_csv(table, "animals.csv", write_options=write_options) + """ diff --git a/python/stubs/_cuda.pyi b/python/stubs/_cuda.pyi new file mode 100644 index 00000000000..ad52b2f380f --- /dev/null +++ b/python/stubs/_cuda.pyi @@ -0,0 +1,556 @@ +from typing import Any + +import cuda # type: ignore[import-not-found] + +from numba.cuda.cudadrv import driver as _numba_driver # type: ignore[import-not-found] + +from . import lib +from ._stubs_typing import ArrayLike + +class Context(lib._Weakrefable): + """ + CUDA driver context. + """ + + def __init__(self, device_number: int = 0, handle: int | None = None) -> None: + """ + Create a CUDA driver context for a particular device. + + If a CUDA context handle is passed, it is wrapped, otherwise + a default CUDA context for the given device is requested. + + Parameters + ---------- + device_number : int (default 0) + Specify the GPU device for which the CUDA driver context is + requested. 
+ handle : int, optional + Specify CUDA handle for a shared context that has been created + by another library. + """ + @staticmethod + def from_numba(context: _numba_driver.Context | None = None) -> Context: + """ + Create a Context instance from a Numba CUDA context. + + Parameters + ---------- + context : {numba.cuda.cudadrv.driver.Context, None} + A Numba CUDA context instance. + If None, the current Numba context is used. + + Returns + ------- + shared_context : pyarrow.cuda.Context + Context instance. + """ + def to_numba(self) -> _numba_driver.Context: + """ + Convert Context to a Numba CUDA context. + + Returns + ------- + context : numba.cuda.cudadrv.driver.Context + Numba CUDA context instance. + """ + @staticmethod + def get_num_devices() -> int: + """Return the number of GPU devices.""" + @property + def device_number(self) -> int: + """Return context device number.""" + @property + def handle(self) -> int: + """Return pointer to context handle.""" + def synchronize(self) -> None: + """Blocks until the device has completed all preceding requested + tasks. + """ + @property + def bytes_allocated(self) -> int: + """Return the number of allocated bytes.""" + def get_device_address(self, address: int) -> int: + """Return the device address that is reachable from kernels running in + the context + + Parameters + ---------- + address : int + Specify memory address value + + Returns + ------- + device_address : int + Device address accessible from device context + + Notes + ----- + The device address is defined as a memory address accessible + by device. While it is often a device memory address but it + can be also a host memory address, for instance, when the + memory is allocated as host memory (using cudaMallocHost or + cudaHostAlloc) or as managed memory (using cudaMallocManaged) + or the host memory is page-locked (using cudaHostRegister). + """ + def new_buffer(self, nbytes: int) -> CudaBuffer: + """Return new device buffer. + + Parameters + ---------- + nbytes : int + Specify the number of bytes to be allocated. + + Returns + ------- + buf : CudaBuffer + Allocated buffer. + """ + @property + def memory_manager(self) -> lib.MemoryManager: + """ + The default memory manager tied to this context's device. + + Returns + ------- + MemoryManager + """ + @property + def device(self) -> lib.Device: + """ + The device instance associated with this context. + + Returns + ------- + Device + """ + def foreign_buffer(self, address: int, size: int, base: Any | None = None) -> CudaBuffer: + """ + Create device buffer from address and size as a view. + + The caller is responsible for allocating and freeing the + memory. When `address==size==0` then a new zero-sized buffer + is returned. + + Parameters + ---------- + address : int + Specify the starting address of the buffer. The address can + refer to both device or host memory but it must be + accessible from device after mapping it with + `get_device_address` method. + size : int + Specify the size of device buffer in bytes. + base : {None, object} + Specify object that owns the referenced memory. + + Returns + ------- + cbuf : CudaBuffer + Device buffer as a view of device reachable memory. + + """ + def open_ipc_buffer(self, ipc_handle: IpcMemHandle) -> CudaBuffer: + """Open existing CUDA IPC memory handle + + Parameters + ---------- + ipc_handle : IpcMemHandle + Specify opaque pointer to CUipcMemHandle (driver API). 
+ + Returns + ------- + buf : CudaBuffer + referencing device buffer + """ + def buffer_from_data( + self, + data: CudaBuffer | HostBuffer | lib.Buffer | ArrayLike, + offset: int = 0, + size: int = -1, + ) -> CudaBuffer: + """Create device buffer and initialize with data. + + Parameters + ---------- + data : {CudaBuffer, HostBuffer, Buffer, array-like} + Specify data to be copied to device buffer. + offset : int + Specify the offset of input buffer for device data + buffering. Default: 0. + size : int + Specify the size of device buffer in bytes. Default: all + (starting from input offset) + + Returns + ------- + cbuf : CudaBuffer + Device buffer with copied data. + """ + def buffer_from_object(self, obj: Any) -> CudaBuffer: + """Create device buffer view of arbitrary object that references + device accessible memory. + + When the object contains a non-contiguous view of device + accessible memory then the returned device buffer will contain + contiguous view of the memory, that is, including the + intermediate data that is otherwise invisible to the input + object. + + Parameters + ---------- + obj : {object, Buffer, HostBuffer, CudaBuffer, ...} + Specify an object that holds (device or host) address that + can be accessed from device. This includes objects with + types defined in pyarrow.cuda as well as arbitrary objects + that implement the CUDA array interface as defined by numba. + + Returns + ------- + cbuf : CudaBuffer + Device buffer as a view of device accessible memory. + + """ + +class IpcMemHandle(lib._Weakrefable): + """A serializable container for a CUDA IPC handle.""" + @staticmethod + def from_buffer(opaque_handle: lib.Buffer) -> IpcMemHandle: + """Create IpcMemHandle from opaque buffer (e.g. from another + process) + + Parameters + ---------- + opaque_handle : + a CUipcMemHandle as a const void* + + Returns + ------- + ipc_handle : IpcMemHandle + """ + def serialize(self, pool: lib.MemoryPool | None = None) -> lib.Buffer: + """Write IpcMemHandle to a Buffer + + Parameters + ---------- + pool : {MemoryPool, None} + Specify a pool to allocate memory from + + Returns + ------- + buf : Buffer + The serialized buffer. + """ + +class CudaBuffer(lib.Buffer): + """An Arrow buffer with data located in a GPU device. + + To create a CudaBuffer instance, use Context.device_buffer(). + + The memory allocated in a CudaBuffer is freed when the buffer object + is deleted. + """ + + @staticmethod + def from_buffer(buf: lib.Buffer) -> CudaBuffer: + """Convert back generic buffer into CudaBuffer + + Parameters + ---------- + buf : Buffer + Specify buffer containing CudaBuffer + + Returns + ------- + dbuf : CudaBuffer + Resulting device buffer. + """ + @staticmethod + def from_numba(mem: _numba_driver.MemoryPointer) -> CudaBuffer: + """Create a CudaBuffer view from numba MemoryPointer instance. + + Parameters + ---------- + mem : numba.cuda.cudadrv.driver.MemoryPointer + + Returns + ------- + cbuf : CudaBuffer + Device buffer as a view of numba MemoryPointer. + """ + def to_numba(self) -> _numba_driver.MemoryPointer: + """Return numba memory pointer of CudaBuffer instance.""" + def copy_to_host( + self, + position: int = 0, + nbytes: int = -1, + buf: lib.Buffer | None = None, + memory_pool: lib.MemoryPool | None = None, + resizable: bool = False, + ) -> lib.Buffer: + """Copy memory from GPU device to CPU host + + Caller is responsible for ensuring that all tasks affecting + the memory are finished. Use + + `.context.synchronize()` + + when needed. 
+ + Parameters + ---------- + position : int + Specify the starting position of the source data in GPU + device buffer. Default: 0. + nbytes : int + Specify the number of bytes to copy. Default: -1 (all from + the position until host buffer is full). + buf : Buffer + Specify a pre-allocated output buffer in host. Default: None + (allocate new output buffer). + memory_pool : MemoryPool + resizable : bool + Specify extra arguments to allocate_buffer. Used only when + buf is None. + + Returns + ------- + buf : Buffer + Output buffer in host. + + """ + def copy_from_host( + self, data: lib.Buffer | ArrayLike, position: int = 0, nbytes: int = -1 + ) -> int: + """Copy data from host to device. + + The device buffer must be pre-allocated. + + Parameters + ---------- + data : {Buffer, array-like} + Specify data in host. It can be array-like that is valid + argument to py_buffer + position : int + Specify the starting position of the copy in device buffer. + Default: 0. + nbytes : int + Specify the number of bytes to copy. Default: -1 (all from + source until device buffer, starting from position, is full) + + Returns + ------- + nbytes : int + Number of bytes copied. + """ + def copy_from_device(self, buf: CudaBuffer, position: int = 0, nbytes: int = -1) -> int: + """Copy data from device to device. + + Parameters + ---------- + buf : CudaBuffer + Specify source device buffer. + position : int + Specify the starting position of the copy in device buffer. + Default: 0. + nbytes : int + Specify the number of bytes to copy. Default: -1 (all from + source until device buffer, starting from position, is full) + + Returns + ------- + nbytes : int + Number of bytes copied. + + """ + def export_for_ipc(self) -> IpcMemHandle: + """ + Expose this device buffer as IPC memory which can be used in other + processes. + + After calling this function, this device memory will not be + freed when the CudaBuffer is destructed. + + Returns + ------- + ipc_handle : IpcMemHandle + The exported IPC handle + + """ + @property + def context(self) -> Context: + """Returns the CUDA driver context of this buffer.""" + def slice(self, offset: int = 0, length: int | None = None) -> CudaBuffer: + """Return slice of device buffer + + Parameters + ---------- + offset : int, default 0 + Specify offset from the start of device buffer to slice + length : int, default None + Specify the length of slice (default is until end of device + buffer starting from offset). If the length is larger than + the data available, the returned slice will have a size of + the available data starting from the offset. + + Returns + ------- + sliced : CudaBuffer + Zero-copy slice of device buffer. + + """ + def to_pybytes(self) -> bytes: + """Return device buffer content as Python bytes.""" + +class HostBuffer(lib.Buffer): + """Device-accessible CPU memory created using cudaHostAlloc. + + To create a HostBuffer instance, use + + cuda.new_host_buffer() + """ + @property + def size(self) -> int: ... + +class BufferReader(lib.NativeFile): + """File interface for zero-copy read from CUDA buffers. + + Note: Read methods return pointers to device memory. This means + you must be careful using this interface with any Arrow code which + may expect to be able to do anything other than pointer arithmetic + on the returned buffers. + """ + def __init__(self, obj: CudaBuffer) -> None: ... + def read_buffer(self, nbytes: int | None = None) -> CudaBuffer: + """Return a slice view of the underlying device buffer. 
+
+        The slice will start at the current reader position and will
+        have the specified size in bytes.
+
+        Parameters
+        ----------
+        nbytes : int, default None
+            Specify the number of bytes to read. Default: None (read all
+            remaining bytes).
+
+        Returns
+        -------
+        cbuf : CudaBuffer
+            New device buffer.
+
+        """
+
+class BufferWriter(lib.NativeFile):
+    """File interface for writing to CUDA buffers.
+
+    By default writes are unbuffered. Use set_buffer_size to enable
+    buffering.
+    """
+    def __init__(self, obj: CudaBuffer) -> None: ...
+    def writeat(self, position: int, data: ArrayLike) -> None:
+        """Write data to buffer starting from position.
+
+        Parameters
+        ----------
+        position : int
+            Specify device buffer position where the data will be
+            written.
+        data : array-like
+            Specify data, the data instance must implement buffer
+            protocol.
+        """
+    @property
+    def buffer_size(self) -> int:
+        """Returns size of host (CPU) buffer, 0 for unbuffered"""
+    @buffer_size.setter
+    def buffer_size(self, buffer_size: int) -> None:
+        """Set CPU buffer size to limit calls to cudaMemcpy
+
+        Parameters
+        ----------
+        buffer_size : int
+            Specify the size of CPU buffer to allocate in bytes.
+        """
+    @property
+    def num_bytes_buffered(self) -> int:
+        """Returns number of bytes buffered on host"""
+
+def new_host_buffer(size: int, device: int = 0) -> HostBuffer:
+    """Return buffer with CUDA-accessible memory on CPU host
+
+    Parameters
+    ----------
+    size : int
+        Specify the number of bytes to be allocated.
+    device : int
+        Specify GPU device number.
+
+    Returns
+    -------
+    dbuf : HostBuffer
+        Allocated host buffer
+    """
+
+def serialize_record_batch(batch: lib.RecordBatch, ctx: Context) -> CudaBuffer:
+    """Write record batch message to GPU device memory
+
+    Parameters
+    ----------
+    batch : RecordBatch
+        Record batch to write
+    ctx : Context
+        CUDA Context to allocate device memory from
+
+    Returns
+    -------
+    dbuf : CudaBuffer
+        Device buffer which contains the record batch message
+    """
+
+def read_message(
+    source: CudaBuffer | cuda.BufferReader, pool: lib.MemoryPool | None = None
+) -> lib.Message:
+    """Read Arrow IPC message located on GPU device
+
+    Parameters
+    ----------
+    source : {CudaBuffer, cuda.BufferReader}
+        Device buffer or reader of device buffer.
+    pool : MemoryPool (optional)
+        Pool to allocate CPU memory for the metadata
+
+    Returns
+    -------
+    message : Message
+        The deserialized message, body still on device
+    """
+
+def read_record_batch(
+    buffer: lib.Buffer,
+    schema: lib.Schema,
+    *,
+    dictionary_memo: lib.DictionaryMemo | None = None,
+    pool: lib.MemoryPool | None = None,
+) -> lib.RecordBatch:
+    """Construct RecordBatch referencing IPC message located on CUDA device.
+
+    While the metadata is copied to host memory for deserialization,
+    the record batch data remains on the device.
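+
+    A minimal sketch of a round trip, assuming ``ctx`` is an existing
+    ``pyarrow.cuda.Context`` and ``batch`` is a host-side ``pyarrow.RecordBatch``::
+
+        dbuf = serialize_record_batch(batch, ctx)        # copy the IPC message to device memory
+        result = read_record_batch(dbuf, batch.schema)   # rebuild the batch; data stays on device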
+ + Parameters + ---------- + buffer : + Device buffer containing the complete IPC message + schema : Schema + The schema for the record batch + dictionary_memo : DictionaryMemo, optional + If message contains dictionaries, must pass a populated + DictionaryMemo + pool : MemoryPool (optional) + Pool to allocate metadata from + + Returns + ------- + batch : RecordBatch + Reconstructed record batch, with device pointers + + """ diff --git a/python/stubs/_dataset.pyi b/python/stubs/_dataset.pyi new file mode 100644 index 00000000000..af864f9154b --- /dev/null +++ b/python/stubs/_dataset.pyi @@ -0,0 +1,2299 @@ +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from typing import ( + IO, + Any, + Callable, + Generic, + Iterator, + Literal, + NamedTuple, + TypeVar, + overload, +) + +from _typeshed import StrPath + +from . import _csv, _json, _parquet, lib +from ._fs import FileSelector, FileSystem, SupportedFileSystem +from ._stubs_typing import Indices, JoinType, Order +from .acero import ExecNodeOptions +from .compute import Expression +from .ipc import IpcWriteOptions, RecordBatchReader + +class Dataset(lib._Weakrefable): + """ + Collection of data fragments and potentially child datasets. + + Arrow Datasets allow you to query against data that has been split across + multiple files. This sharding of data may indicate partitioning, which + can accelerate queries that only touch some partitions (files). + """ + + @property + def partition_expression(self) -> Expression: + """ + An Expression which evaluates to true for all data viewed by this + Dataset. + """ + def replace_schema(self, schema: lib.Schema) -> None: + """ + Return a copy of this Dataset with a different schema. + + The copy will view the same Fragments. If the new schema is not + compatible with the original dataset's schema then an error will + be raised. + + Parameters + ---------- + schema : Schema + The new dataset schema. + """ + def get_fragments(self, filter: Expression | None = None): + """Returns an iterator over the fragments in this dataset. + + Parameters + ---------- + filter : Expression, default None + Return fragments matching the optional filter, either using the + partition_expression or internal information like Parquet's + statistics. + + Returns + ------- + fragments : iterator of Fragment + """ + def scanner( + self, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Scanner: + """ + Build a scan operation against the dataset. + + Data is not loaded immediately. Instead, this produces a Scanner, + which exposes further operations (e.g. loading all data as a + table, counting rows). + + See the :meth:`Scanner.from_dataset` method for further information. + + Parameters + ---------- + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. 
+ + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + scanner : Scanner + + Examples + -------- + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "dataset_scanner.parquet") + + >>> import pyarrow.dataset as ds + >>> dataset = ds.dataset("dataset_scanner.parquet") + + Selecting a subset of the columns: + + >>> dataset.scanner(columns=["year", "n_legs"]).to_table() + pyarrow.Table + year: int64 + n_legs: int64 + ---- + year: [[2020,2022,2021,2022,2019,2021]] + n_legs: [[2,2,4,4,5,100]] + + Projecting selected columns using an expression: + + >>> dataset.scanner( + ... columns={ + ... "n_legs_uint": ds.field("n_legs").cast("uint8"), + ... } + ... 
).to_table() + pyarrow.Table + n_legs_uint: uint8 + ---- + n_legs_uint: [[2,2,4,4,5,100]] + + Filtering rows while scanning: + + >>> dataset.scanner(filter=ds.field("year") > 2020).to_table() + pyarrow.Table + year: int64 + n_legs: int64 + animal: string + ---- + year: [[2022,2021,2022,2021]] + n_legs: [[2,4,4,100]] + animal: [["Parrot","Dog","Horse","Centipede"]] + """ + def to_batches( + self, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Iterator[lib.RecordBatch]: + """ + Read the dataset as materialized record batches. + + Parameters + ---------- + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. 
+ + Returns + ------- + record_batches : iterator of RecordBatch + """ + def to_table( + self, + columns: list[str] | dict[str, Expression] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: + """ + Read the dataset to an Arrow table. + + Note that this method reads all the selected data from the dataset + into memory. + + Parameters + ---------- + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + table : Table + """ + def take( + self, + indices: Indices, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: + """ + Select rows of data by index. 
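+
+        For example, assuming ``dataset`` is an existing :class:`Dataset` with
+        ``year`` and ``n_legs`` columns (as in the scanner examples above)::
+
+            dataset.take([0, 2, 5], columns=["year", "n_legs"])  # returns a pyarrow.Table with three rows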
+ + Parameters + ---------- + indices : Array or array-like + indices of rows to select in the dataset. + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + table : Table + """ + def head( + self, + num_rows: int, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: + """ + Load the first N rows of the dataset. + + Parameters + ---------- + num_rows : int + The number of rows to load. + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. 
+ + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + table : Table + """ + def count_rows( + self, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> int: + """ + Count rows matching the scanner filter. + + Parameters + ---------- + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. 
+ fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + count : int + """ + @property + def schema(self) -> lib.Schema: + """The common schema of the full Dataset""" + def filter(self, expression: Expression) -> Self: + """ + Apply a row filter to the dataset. + + Parameters + ---------- + expression : Expression + The filter that should be applied to the dataset. + + Returns + ------- + Dataset + """ + def sort_by(self, sorting: str | list[tuple[str, Order]], **kwargs) -> InMemoryDataset: + """ + Sort the Dataset by one or multiple columns. + + Parameters + ---------- + sorting : str or list[tuple(name, order)] + Name of the column to use to sort (ascending), or + a list of multiple sorting conditions where + each entry is a tuple with column name + and sorting order ("ascending" or "descending") + **kwargs : dict, optional + Additional sorting options. + As allowed by :class:`SortOptions` + + Returns + ------- + InMemoryDataset + A new dataset sorted according to the sort keys. + """ + def join( + self, + right_dataset: Dataset, + keys: str | list[str], + right_keys: str | list[str] | None = None, + join_type: JoinType = "left outer", + left_suffix: str | None = None, + right_suffix: str | None = None, + coalesce_keys: bool = True, + use_threads: bool = True, + ) -> InMemoryDataset: + """ + Perform a join between this dataset and another one. + + Result of the join will be a new dataset, where further + operations can be applied. + + Parameters + ---------- + right_dataset : dataset + The dataset to join to the current one, acting as the right dataset + in the join operation. + keys : str or list[str] + The columns from current dataset that should be used as keys + of the join operation left side. + right_keys : str or list[str], default None + The columns from the right_dataset that should be used as keys + on the join operation right side. + When ``None`` use the same key names as the left dataset. + join_type : str, default "left outer" + The kind of join that should be performed, one of + ("left semi", "right semi", "left anti", "right anti", + "inner", "left outer", "right outer", "full outer") + left_suffix : str, default None + Which suffix to add to right column names. This prevents confusion + when the columns in left and right datasets have colliding names. + right_suffix : str, default None + Which suffix to add to the left column names. This prevents confusion + when the columns in left and right datasets have colliding names. + coalesce_keys : bool, default True + If the duplicated keys should be omitted from one of the sides + in the join result. + use_threads : bool, default True + Whenever to use multithreading or not. + + Returns + ------- + InMemoryDataset + """ + def join_asof( + self, + right_dataset: Dataset, + on: str, + by: str | list[str], + tolerance: int, + right_on: str | list[str] | None = None, + right_by: str | list[str] | None = None, + ) -> InMemoryDataset: + """ + Perform an asof join between this dataset and another one. 
+ + This is similar to a left-join except that we match on nearest key rather + than equal keys. Both datasets must be sorted by the key. This type of join + is most useful for time series data that are not perfectly aligned. + + Optionally match on equivalent keys with "by" before searching with "on". + + Result of the join will be a new Dataset, where further + operations can be applied. + + Parameters + ---------- + right_dataset : dataset + The dataset to join to the current one, acting as the right dataset + in the join operation. + on : str + The column from current dataset that should be used as the "on" key + of the join operation left side. + + An inexact match is used on the "on" key, i.e. a row is considered a + match if and only if left_on - tolerance <= right_on <= left_on. + + The input table must be sorted by the "on" key. Must be a single + field of a common type. + + Currently, the "on" key must be an integer, date, or timestamp type. + by : str or list[str] + The columns from current dataset that should be used as the keys + of the join operation left side. The join operation is then done + only for the matches in these columns. + tolerance : int + The tolerance for inexact "on" key matching. A right row is considered + a match with the left row `right.on - left.on <= tolerance`. The + `tolerance` may be: + + - negative, in which case a past-as-of-join occurs; + - or positive, in which case a future-as-of-join occurs; + - or zero, in which case an exact-as-of-join occurs. + + The tolerance is interpreted in the same units as the "on" key. + right_on : str or list[str], default None + The columns from the right_dataset that should be used as the on key + on the join operation right side. + When ``None`` use the same key name as the left dataset. + right_by : str or list[str], default None + The columns from the right_dataset that should be used as by keys + on the join operation right side. + When ``None`` use the same key names as the left dataset. + + Returns + ------- + InMemoryDataset + """ + +class InMemoryDataset(Dataset): + """ + A Dataset wrapping in-memory data. + + Parameters + ---------- + source : RecordBatch, Table, list, tuple + The data for this dataset. Can be a RecordBatch, Table, list of + RecordBatch/Table, iterable of RecordBatch, or a RecordBatchReader + If an iterable is provided, the schema must also be provided. + schema : Schema, optional + Only required if passing an iterable as the source + """ + +class UnionDataset(Dataset): + """ + A Dataset wrapping child datasets. + + Children's schemas must agree with the provided schema. + + Parameters + ---------- + schema : Schema + A known schema to conform to. + children : list of Dataset + One or more input children + """ + + @property + def children(self) -> list[Dataset]: ... + +class FileSystemDataset(Dataset): + """ + A Dataset of file fragments. + + A FileSystemDataset is composed of one or more FileFragment. + + Parameters + ---------- + fragments : list[Fragments] + List of fragments to consume. + schema : Schema + The top-level schema of the Dataset. + format : FileFormat + File format of the fragments, currently only ParquetFileFormat, + IpcFileFormat, CsvFileFormat, and JsonFileFormat are supported. + filesystem : FileSystem + FileSystem of the fragments. + root_partition : Expression, optional + The top-level partition of the DataDataset. 
+ """ + + def __init__( + self, + fragments: list[Fragment], + schema: lib.Schema, + format: FileFormat, + filesystem: SupportedFileSystem | None = None, + root_partition: Expression | None = None, + ) -> None: ... + @classmethod + def from_paths( + cls, + paths: list[str], + schema: lib.Schema | None = None, + format: FileFormat | None = None, + filesystem: SupportedFileSystem | None = None, + partitions: list[Expression] | None = None, + root_partition: Expression | None = None, + ) -> FileSystemDataset: + """ + A Dataset created from a list of paths on a particular filesystem. + + Parameters + ---------- + paths : list of str + List of file paths to create the fragments from. + schema : Schema + The top-level schema of the DataDataset. + format : FileFormat + File format to create fragments from, currently only + ParquetFileFormat, IpcFileFormat, CsvFileFormat, and JsonFileFormat are supported. + filesystem : FileSystem + The filesystem which files are from. + partitions : list[Expression], optional + Attach additional partition information for the file paths. + root_partition : Expression, optional + The top-level partition of the DataDataset. + """ + @property + def filesystem(self) -> FileSystem: ... + @property + def partitioning(self) -> Partitioning | None: + """ + The partitioning of the Dataset source, if discovered. + + If the FileSystemDataset is created using the ``dataset()`` factory + function with a partitioning specified, this will return the + finalized Partitioning object from the dataset discovery. In all + other cases, this returns None. + """ + @property + def files(self) -> list[str]: + """List of the files""" + @property + def format(self) -> FileFormat: + """The FileFormat of this source.""" + +class FileWriteOptions(lib._Weakrefable): + @property + def format(self) -> FileFormat: ... + +class FileFormat(lib._Weakrefable): + def inspect( + self, file: StrPath | IO, filesystem: SupportedFileSystem | None = None + ) -> lib.Schema: + """ + Infer the schema of a file. + + Parameters + ---------- + file : file-like object, path-like or str + The file or file path to infer a schema from. + filesystem : Filesystem, optional + If `filesystem` is given, `file` must be a string and specifies + the path of the file to read from the filesystem. + + Returns + ------- + schema : Schema + The schema inferred from the file + """ + def make_fragment( + self, + file: StrPath | IO, + filesystem: SupportedFileSystem | None = None, + partition_expression: Expression | None = None, + *, + file_size: int | None = None, + ) -> Fragment: + """ + Make a FileFragment from a given file. + + Parameters + ---------- + file : file-like object, path-like or str + The file or file path to make a fragment from. + filesystem : Filesystem, optional + If `filesystem` is given, `file` must be a string and specifies + the path of the file to read from the filesystem. + partition_expression : Expression, optional + An expression that is guaranteed true for all rows in the fragment. Allows + fragment to be potentially skipped while scanning with a filter. + file_size : int, optional + The size of the file in bytes. Can improve performance with high-latency filesystems + when file size needs to be known before reading. + + Returns + ------- + fragment : Fragment + The file fragment + """ + def make_write_options(self) -> FileWriteOptions: ... + @property + def default_extname(self) -> str: ... + @property + def default_fragment_scan_options(self) -> FragmentScanOptions: ... 
+ @default_fragment_scan_options.setter + def default_fragment_scan_options(self, options: FragmentScanOptions) -> None: ... + +class Fragment(lib._Weakrefable): + """Fragment of data from a Dataset.""" + @property + def physical_schema(self) -> lib.Schema: + """Return the physical schema of this Fragment. This schema can be + different from the dataset read schema.""" + @property + def partition_expression(self) -> Expression: + """An Expression which evaluates to true for all data viewed by this + Fragment. + """ + def scanner( + self, + schema: lib.Schema | None = None, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Scanner: + """ + Build a scan operation against the fragment. + + Data is not loaded immediately. Instead, this produces a Scanner, + which exposes further operations (e.g. loading all data as a + table, counting rows). + + Parameters + ---------- + schema : Schema + Schema to use for scanning. This is used to unify a Fragment to + its Dataset's schema. If not specified this will use the + Fragment's physical schema which might differ for each Fragment. + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. 
+ cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + scanner : Scanner + """ + def to_batches( + self, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Iterator[lib.RecordBatch]: + """ + Read the fragment as materialized record batches. + + Parameters + ---------- + schema : Schema, optional + Concrete schema to use for scanning. + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. 
+ + Returns + ------- + record_batches : iterator of RecordBatch + """ + def to_table( + self, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: + """ + Convert this Fragment into a Table. + + Use this convenience utility with care. This will serially materialize + the Scan result in memory before creating the Table. + + Parameters + ---------- + schema : Schema, optional + Concrete schema to use for scanning. + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. 
+ + Returns + ------- + table : Table + """ + def take( + self, + indices: Indices, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: + """ + Select rows of data by index. + + Parameters + ---------- + indices : Array or array-like + The indices of row to select in the dataset. + columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + Table + """ + def head( + self, + num_rows: int, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> lib.Table: + """ + Load the first N rows of the fragment. + + Parameters + ---------- + num_rows : int + The number of rows to load. 
+ columns : list of str, default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + Table + """ + def count_rows( + self, + columns: list[str] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> int: + """ + Count rows matching the scanner filter. + + Parameters + ---------- + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. 
Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + + Returns + ------- + count : int + """ + +class FileFragment(Fragment): + """A Fragment representing a data file.""" + + def open(self) -> lib.NativeFile: + """ + Open a NativeFile of the buffer or file viewed by this fragment. + """ + @property + def path(self) -> str: + """ + The path of the data file viewed by this fragment, if it views a + file. If instead it views a buffer, this will be "". + """ + @property + def filesystem(self) -> FileSystem: + """ + The FileSystem containing the data file viewed by this fragment, if + it views a file. If instead it views a buffer, this will be None. + """ + @property + def buffer(self) -> lib.Buffer: + """ + The buffer viewed by this fragment, if it views a buffer. If + instead it views a file, this will be None. + """ + @property + def format(self) -> FileFormat: + """ + The format of the data file viewed by this fragment. + """ + +class FragmentScanOptions(lib._Weakrefable): + """Scan options specific to a particular fragment and scan operation.""" + + @property + def type_name(self) -> str: ... + +class IpcFileWriteOptions(FileWriteOptions): + @property + def write_options(self) -> IpcWriteOptions: ... + @write_options.setter + def write_options(self, write_options: IpcWriteOptions) -> None: ... + +class IpcFileFormat(FileFormat): + def equals(self, other: IpcFileFormat) -> bool: ... + def make_write_options(self, **kwargs) -> IpcFileWriteOptions: ... + @property + def default_extname(self) -> str: ... + +class FeatherFileFormat(IpcFileFormat): ... + +class CsvFileFormat(FileFormat): + """ + FileFormat for CSV files. + + Parameters + ---------- + parse_options : pyarrow.csv.ParseOptions + Options regarding CSV parsing. + default_fragment_scan_options : CsvFragmentScanOptions + Default options for fragments scan. + convert_options : pyarrow.csv.ConvertOptions + Options regarding value conversion. + read_options : pyarrow.csv.ReadOptions + General read options. + """ + def __init__( + self, + parse_options: _csv.ParseOptions | None = None, + default_fragment_scan_options: CsvFragmentScanOptions | None = None, + convert_options: _csv.ConvertOptions | None = None, + read_options: _csv.ReadOptions | None = None, + ) -> None: ... + def make_write_options(self) -> _csv.WriteOptions: ... # type: ignore[override] + @property + def parse_options(self) -> _csv.ParseOptions: ... + @parse_options.setter + def parse_options(self, parse_options: _csv.ParseOptions) -> None: ... + def equals(self, other: CsvFileFormat) -> bool: ... + +class CsvFragmentScanOptions(FragmentScanOptions): + """ + Scan-specific options for CSV fragments. + + Parameters + ---------- + convert_options : pyarrow.csv.ConvertOptions + Options regarding value conversion. 
+ read_options : pyarrow.csv.ReadOptions + General read options. + """ + + convert_options: _csv.ConvertOptions + read_options: _csv.ReadOptions + + def __init__( + self, convert_options: _csv.ConvertOptions, read_options: _csv.ReadOptions + ) -> None: ... + def equals(self, other: CsvFragmentScanOptions) -> bool: ... + +class CsvFileWriteOptions(FileWriteOptions): + write_options: _csv.WriteOptions + +class JsonFileFormat(FileFormat): + """ + FileFormat for JSON files. + + Parameters + ---------- + default_fragment_scan_options : JsonFragmentScanOptions + Default options for fragments scan. + parse_options : pyarrow.json.ParseOptions + Options regarding json parsing. + read_options : pyarrow.json.ReadOptions + General read options. + """ + def __init__( + self, + default_fragment_scan_options: JsonFragmentScanOptions | None = None, + parse_options: _json.ParseOptions | None = None, + read_options: _json.ReadOptions | None = None, + ) -> None: ... + def equals(self, other: JsonFileFormat) -> bool: ... + +class JsonFragmentScanOptions(FragmentScanOptions): + """ + Scan-specific options for JSON fragments. + + Parameters + ---------- + parse_options : pyarrow.json.ParseOptions + Options regarding JSON parsing. + read_options : pyarrow.json.ReadOptions + General read options. + """ + + parse_options: _json.ParseOptions + read_options: _json.ReadOptions + def __init__( + self, parse_options: _json.ParseOptions, read_options: _json.ReadOptions + ) -> None: ... + def equals(self, other: JsonFragmentScanOptions) -> bool: ... + +class Partitioning(lib._Weakrefable): + def parse(self, path: str) -> Expression: + """ + Parse a path into a partition expression. + + Parameters + ---------- + path : str + + Returns + ------- + pyarrow.dataset.Expression + """ + def format(self, expr: Expression) -> tuple[str, str]: + """ + Convert a filter expression into a tuple of (directory, filename) using + the current partitioning scheme + + Parameters + ---------- + expr : pyarrow.dataset.Expression + + Returns + ------- + tuple[str, str] + + Examples + -------- + + Specify the Schema for paths like "/2009/June": + + >>> import pyarrow as pa + >>> import pyarrow.dataset as ds + >>> import pyarrow.compute as pc + >>> part = ds.partitioning(pa.schema([("year", pa.int16()), ("month", pa.string())])) + >>> part.format((pc.field("year") == 1862) & (pc.field("month") == "Jan")) + ('1862/Jan', '') + """ + @property + def schema(self) -> lib.Schema: + """The arrow Schema attached to the partitioning.""" + +class PartitioningFactory(lib._Weakrefable): + @property + def type_name(self) -> str: ... + +class KeyValuePartitioning(Partitioning): + @property + def dictionaries(self) -> list[lib.Array | None]: + """ + The unique values for each partition field, if available. + + Those values are only available if the Partitioning object was + created through dataset discovery from a PartitioningFactory, or + if the dictionaries were manually specified in the constructor. + If no dictionary field is available, this returns an empty list. + """ + +class DirectoryPartitioning(KeyValuePartitioning): + """ + A Partitioning based on a specified Schema. + + The DirectoryPartitioning expects one segment in the file path for each + field in the schema (all fields are required to be present). + For example given schema the path "/2009/11" would + be parsed to ("year"_ == 2009 and "month"_ == 11). + + Parameters + ---------- + schema : Schema + The schema that describes the partitions present in the file path. 
+ dictionaries : dict[str, Array] + If the type of any field of `schema` is a dictionary type, the + corresponding entry of `dictionaries` must be an array containing + every value which may be taken by the corresponding column or an + error will be raised in parsing. + segment_encoding : str, default "uri" + After splitting paths into segments, decode the segments. Valid + values are "uri" (URI-decode segments) and "none" (leave as-is). + + Returns + ------- + DirectoryPartitioning + + Examples + -------- + >>> from pyarrow.dataset import DirectoryPartitioning + >>> partitioning = DirectoryPartitioning( + ... pa.schema([("year", pa.int16()), ("month", pa.int8())]) + ... ) + >>> print(partitioning.parse("/2009/11/")) + ((year == 2009) and (month == 11)) + """ + + @staticmethod + def discover( + field_names: list[str] | None = None, + infer_dictionary: bool = False, + max_partition_dictionary_size: int = 0, + schema: lib.Schema | None = None, + segment_encoding: Literal["uri", "none"] = "uri", + ) -> PartitioningFactory: + """ + Discover a DirectoryPartitioning. + + Parameters + ---------- + field_names : list of str + The names to associate with the values from the subdirectory names. + If schema is given, will be populated from the schema. + infer_dictionary : bool, default False + When inferring a schema for partition fields, yield dictionary + encoded types instead of plain types. This can be more efficient + when materializing virtual columns, and Expressions parsed by the + finished Partitioning will include dictionaries of all unique + inspected values for each field. + max_partition_dictionary_size : int, default 0 + Synonymous with infer_dictionary for backwards compatibility with + 1.0: setting this to -1 or None is equivalent to passing + infer_dictionary=True. + schema : Schema, default None + Use this schema instead of inferring a schema from partition + values. Partition values will be validated against this schema + before accumulation into the Partitioning's dictionary. + segment_encoding : str, default "uri" + After splitting paths into segments, decode the segments. Valid + values are "uri" (URI-decode segments) and "none" (leave as-is). + + Returns + ------- + PartitioningFactory + To be used in the FileSystemFactoryOptions. + """ + def __init__( + self, + schema: lib.Schema, + dictionaries: dict[str, lib.Array] | None = None, + segment_encoding: Literal["uri", "none"] = "uri", + ) -> None: ... + +class HivePartitioning(KeyValuePartitioning): + """ + A Partitioning for "/$key=$value/" nested directories as found in + Apache Hive. + + Multi-level, directory based partitioning scheme originating from + Apache Hive with all data files stored in the leaf directories. Data is + partitioned by static values of a particular column in the schema. + Partition keys are represented in the form $key=$value in directory names. + Field order is ignored, as are missing or unrecognized field names. + + For example, given schema, a possible + path would be "/year=2009/month=11/day=15". + + Parameters + ---------- + schema : Schema + The schema that describes the partitions present in the file path. + dictionaries : dict[str, Array] + If the type of any field of `schema` is a dictionary type, the + corresponding entry of `dictionaries` must be an array containing + every value which may be taken by the corresponding column or an + error will be raised in parsing. 
+ null_fallback : str, default "__HIVE_DEFAULT_PARTITION__" + If any field is None then this fallback will be used as a label + segment_encoding : str, default "uri" + After splitting paths into segments, decode the segments. Valid + values are "uri" (URI-decode segments) and "none" (leave as-is). + + Returns + ------- + HivePartitioning + + Examples + -------- + >>> from pyarrow.dataset import HivePartitioning + >>> partitioning = HivePartitioning(pa.schema([("year", pa.int16()), ("month", pa.int8())])) + >>> print(partitioning.parse("/year=2009/month=11/")) + ((year == 2009) and (month == 11)) + + """ + def __init__( + self, + schema: lib.Schema, + dictionaries: dict[str, lib.Array] | None = None, + null_fallback: str = "__HIVE_DEFAULT_PARTITION__", + segment_encoding: Literal["uri", "none"] = "uri", + ) -> None: ... + @staticmethod + def discover( + infer_dictionary: bool = False, + max_partition_dictionary_size: int = 0, + null_fallback="__HIVE_DEFAULT_PARTITION__", + schema: lib.Schema | None = None, + segment_encoding: Literal["uri", "none"] = "uri", + ) -> PartitioningFactory: + """ + Discover a HivePartitioning. + + Parameters + ---------- + infer_dictionary : bool, default False + When inferring a schema for partition fields, yield dictionary + encoded types instead of plain. This can be more efficient when + materializing virtual columns, and Expressions parsed by the + finished Partitioning will include dictionaries of all unique + inspected values for each field. + max_partition_dictionary_size : int, default 0 + Synonymous with infer_dictionary for backwards compatibility with + 1.0: setting this to -1 or None is equivalent to passing + infer_dictionary=True. + null_fallback : str, default "__HIVE_DEFAULT_PARTITION__" + When inferring a schema for partition fields this value will be + replaced by null. The default is set to __HIVE_DEFAULT_PARTITION__ + for compatibility with Spark + schema : Schema, default None + Use this schema instead of inferring a schema from partition + values. Partition values will be validated against this schema + before accumulation into the Partitioning's dictionary. + segment_encoding : str, default "uri" + After splitting paths into segments, decode the segments. Valid + values are "uri" (URI-decode segments) and "none" (leave as-is). + + Returns + ------- + PartitioningFactory + To be used in the FileSystemFactoryOptions. + """ + +class FilenamePartitioning(KeyValuePartitioning): + """ + A Partitioning based on a specified Schema. + + The FilenamePartitioning expects one segment in the file name for each + field in the schema (all fields are required to be present) separated + by '_'. For example given schema the name + ``"2009_11_"`` would be parsed to ("year" == 2009 and "month" == 11). + + Parameters + ---------- + schema : Schema + The schema that describes the partitions present in the file path. + dictionaries : dict[str, Array] + If the type of any field of `schema` is a dictionary type, the + corresponding entry of `dictionaries` must be an array containing + every value which may be taken by the corresponding column or an + error will be raised in parsing. + segment_encoding : str, default "uri" + After splitting paths into segments, decode the segments. Valid + values are "uri" (URI-decode segments) and "none" (leave as-is). + + Returns + ------- + FilenamePartitioning + + Examples + -------- + >>> from pyarrow.dataset import FilenamePartitioning + >>> partitioning = FilenamePartitioning( + ... 
pa.schema([("year", pa.int16()), ("month", pa.int8())]) + ... ) + >>> print(partitioning.parse("2009_11_data.parquet")) + ((year == 2009) and (month == 11)) + """ + + def __init__( + self, + schema: lib.Schema, + dictionaries: dict[str, lib.Array] | None = None, + segment_encoding: Literal["uri", "none"] = "uri", + ) -> None: ... + @staticmethod + def discover( + field_names: list[str] | None = None, + infer_dictionary: bool = False, + schema: lib.Schema | None = None, + segment_encoding: Literal["uri", "none"] = "uri", + ) -> PartitioningFactory: + """ + Discover a FilenamePartitioning. + + Parameters + ---------- + field_names : list of str + The names to associate with the values from the subdirectory names. + If schema is given, will be populated from the schema. + infer_dictionary : bool, default False + When inferring a schema for partition fields, yield dictionary + encoded types instead of plain types. This can be more efficient + when materializing virtual columns, and Expressions parsed by the + finished Partitioning will include dictionaries of all unique + inspected values for each field. + schema : Schema, default None + Use this schema instead of inferring a schema from partition + values. Partition values will be validated against this schema + before accumulation into the Partitioning's dictionary. + segment_encoding : str, default "uri" + After splitting paths into segments, decode the segments. Valid + values are "uri" (URI-decode segments) and "none" (leave as-is). + + Returns + ------- + PartitioningFactory + To be used in the FileSystemFactoryOptions. + """ + +class DatasetFactory(lib._Weakrefable): + """ + DatasetFactory is used to create a Dataset, inspect the Schema + of the fragments contained in it, and declare a partitioning. + """ + + root_partition: Expression + def finish(self, schema: lib.Schema | None = None) -> Dataset: + """ + Create a Dataset using the inspected schema or an explicit schema + (if given). + + Parameters + ---------- + schema : Schema, default None + The schema to conform the source to. If None, the inspected + schema is used. + + Returns + ------- + Dataset + """ + def inspect(self) -> lib.Schema: + """ + Inspect all data fragments and return a common Schema. + + Returns + ------- + Schema + """ + def inspect_schemas(self) -> list[lib.Schema]: ... + +class FileSystemFactoryOptions(lib._Weakrefable): + """ + Influences the discovery of filesystem paths. + + Parameters + ---------- + partition_base_dir : str, optional + For the purposes of applying the partitioning, paths will be + stripped of the partition_base_dir. Files not matching the + partition_base_dir prefix will be skipped for partitioning discovery. + The ignored files will still be part of the Dataset, but will not + have partition information. + partitioning : Partitioning/PartitioningFactory, optional + Apply the Partitioning to every discovered Fragment. See Partitioning or + PartitioningFactory documentation. + exclude_invalid_files : bool, optional (default True) + If True, invalid files will be excluded (file format specific check). + This will incur IO for each files in a serial and single threaded + fashion. Disabling this feature will skip the IO, but unsupported + files may be present in the Dataset (resulting in an error at scan + time). + selector_ignore_prefixes : list, optional + When discovering from a Selector (and not from an explicit file list), + ignore files and directories matching any of these prefixes. + By default this is ['.', '_']. 
+ """ + + partitioning: Partitioning + partitioning_factory: PartitioningFactory + partition_base_dir: str + exclude_invalid_files: bool + selector_ignore_prefixes: list[str] + + def __init__( + self, + artition_base_dir: str | None = None, + partitioning: Partitioning | PartitioningFactory | None = None, + exclude_invalid_files: bool = True, + selector_ignore_prefixes: list[str] | None = None, + ) -> None: ... + +class FileSystemDatasetFactory(DatasetFactory): + """ + Create a DatasetFactory from a list of paths with schema inspection. + + Parameters + ---------- + filesystem : pyarrow.fs.FileSystem + Filesystem to discover. + paths_or_selector : pyarrow.fs.FileSelector or list of path-likes + Either a Selector object or a list of path-like objects. + format : FileFormat + Currently only ParquetFileFormat and IpcFileFormat are supported. + options : FileSystemFactoryOptions, optional + Various flags influencing the discovery of filesystem paths. + """ + + def __init__( + self, + filesystem: SupportedFileSystem, + paths_or_selector: FileSelector, + format: FileFormat, + options: FileSystemFactoryOptions | None = None, + ) -> None: ... + +class UnionDatasetFactory(DatasetFactory): + """ + Provides a way to inspect/discover a Dataset's expected schema before + materialization. + + Parameters + ---------- + factories : list of DatasetFactory + """ + def __init__(self, factories: list[DatasetFactory]) -> None: ... + +_RecordBatchT = TypeVar("_RecordBatchT", bound=lib.RecordBatch) + +class RecordBatchIterator(lib._Weakrefable, Generic[_RecordBatchT]): + """An iterator over a sequence of record batches.""" + def __iter__(self) -> Self: ... + def __next__(self) -> _RecordBatchT: ... + +class TaggedRecordBatch(NamedTuple): + """ + A combination of a record batch and the fragment it came from. + + Parameters + ---------- + record_batch : RecordBatch + The record batch. + fragment : Fragment + Fragment of the record batch. + """ + + record_batch: lib.RecordBatch + fragment: Fragment + +class TaggedRecordBatchIterator(lib._Weakrefable): + """An iterator over a sequence of record batches with fragments.""" + def __iter__(self) -> Self: ... + def __next__(self) -> TaggedRecordBatch: ... + +class Scanner(lib._Weakrefable): + """A materialized scan operation with context and options bound. + + A scanner is the class that glues the scan tasks, data fragments and data + sources together. + """ + @staticmethod + def from_dataset( + dataset: Dataset, + *, + columns: list[str] | dict[str, Expression] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Scanner: + """ + Create Scanner from Dataset, + + Parameters + ---------- + dataset : Dataset + Dataset to scan. + columns : list[str] or dict[str, Expression], default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. 
+ + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + """ + @staticmethod + def from_fragment( + fragment: Fragment, + *, + schema: lib.Schema | None = None, + columns: list[str] | dict[str, Expression] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Scanner: + """ + Create Scanner from Fragment, + + Parameters + ---------- + fragment : Fragment + fragment to scan. + schema : Schema, optional + The schema of the fragment. + columns : list[str] or dict[str, Expression], default None + The columns to project. This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). 
+ + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + """ + @overload + @staticmethod + def from_batches( + source: Iterator[lib.RecordBatch], + *, + schema: lib.Schema, + columns: list[str] | dict[str, Expression] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Scanner: ... + @overload + @staticmethod + def from_batches( + source: RecordBatchReader, + *, + columns: list[str] | dict[str, Expression] | None = None, + filter: Expression | None = None, + batch_size: int = ..., + batch_readahead: int = 16, + fragment_readahead: int = 4, + fragment_scan_options: FragmentScanOptions | None = None, + use_threads: bool = True, + cache_metadata: bool = True, + memory_pool: lib.MemoryPool | None = None, + ) -> Scanner: ... + @staticmethod + def from_batches(*args, **kwargs): + """ + Create a Scanner from an iterator of batches. + + This creates a scanner which can be used only once. It is + intended to support writing a dataset (which takes a scanner) + from a source which can be read only once (e.g. a + RecordBatchReader or generator). + + Parameters + ---------- + source : Iterator or Arrow-compatible stream object + The iterator of Batches. This can be a pyarrow RecordBatchReader, + any object that implements the Arrow PyCapsule Protocol for + streams, or an actual Python iterator of RecordBatches. + schema : Schema + The schema of the batches (required when passing a Python + iterator). + columns : list[str] or dict[str, Expression], default None + The columns to project. 
This can be a list of column names to + include (order and duplicates will be preserved), or a dictionary + with {new_column_name: expression} values for more advanced + projections. + + The list of columns or expressions may use the special fields + `__batch_index` (the index of the batch within the fragment), + `__fragment_index` (the index of the fragment within the dataset), + `__last_in_fragment` (whether the batch is last in fragment), and + `__filename` (the name of the source file or a description of the + source fragment). + + The columns will be passed down to Datasets and corresponding data + fragments to avoid loading, copying, and deserializing columns + that will not be required further down the compute chain. + By default all of the available columns are projected. Raises + an exception if any of the referenced column names does not exist + in the dataset's Schema. + filter : Expression, default None + Scan will return only the rows matching the filter. + If possible the predicate will be pushed down to exploit the + partition information or internal metadata found in the data + source, e.g. Parquet statistics. Otherwise filters the loaded + RecordBatches before yielding them. + batch_size : int, default 131_072 + The maximum row count for scanned record batches. If scanned + record batches are overflowing memory then this method can be + called to reduce their size. + batch_readahead : int, default 16 + The number of batches to read ahead in a file. This might not work + for all file formats. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_readahead : int, default 4 + The number of files to read ahead. Increasing this number will increase + RAM usage but could also improve IO utilization. + fragment_scan_options : FragmentScanOptions, default None + Options specific to a particular scan and fragment type, which + can change between different scans of the same dataset. + use_threads : bool, default True + If enabled, then maximum parallelism will be used determined by + the number of available CPU cores. + cache_metadata : bool, default True + If enabled, metadata may be cached when scanning to speed up + repeated scans. + memory_pool : MemoryPool, default None + For memory allocations, if required. If not specified, uses the + default pool. + """ + @property + def dataset_schema(self) -> lib.Schema: + """The schema with which batches will be read from fragments.""" + @property + def projected_schema(self) -> lib.Schema: + """ + The materialized schema of the data, accounting for projections. + + This is the schema of any data returned from the scanner. + """ + def to_batches(self) -> Iterator[lib.RecordBatch]: + """ + Consume a Scanner in record batches. + + Returns + ------- + record_batches : iterator of RecordBatch + """ + def scan_batches(self) -> TaggedRecordBatchIterator: + """ + Consume a Scanner in record batches with corresponding fragments. + + Returns + ------- + record_batches : iterator of TaggedRecordBatch + """ + def to_table(self) -> lib.Table: + """ + Convert a Scanner into a Table. + + Use this convenience utility with care. This will serially materialize + the Scan result in memory before creating the Table. + + Returns + ------- + Table + """ + def take(self, indices: Indices) -> lib.Table: + """ + Select rows of data by index. + + Will only consume as many batches of the underlying dataset as + needed. Otherwise, this is equivalent to + ``to_table().take(indices)``. 
+
+        Parameters
+        ----------
+        indices : Array or array-like
+            indices of rows to select in the dataset.
+
+        Returns
+        -------
+        Table
+        """
+    def head(self, num_rows: int) -> lib.Table:
+        """
+        Load the first N rows of the dataset.
+
+        Parameters
+        ----------
+        num_rows : int
+            The number of rows to load.
+
+        Returns
+        -------
+        Table
+        """
+    def count_rows(self) -> int:
+        """
+        Count rows matching the scanner filter.
+
+        Returns
+        -------
+        count : int
+        """
+    def to_reader(self) -> RecordBatchReader:
+        """Consume this scanner as a RecordBatchReader.
+
+        Returns
+        -------
+        RecordBatchReader
+        """
+
+def get_partition_keys(partition_expression: Expression) -> dict[str, Any]:
+    """
+    Extract partition keys (equality constraints between a field and a scalar)
+    from an expression as a dict mapping the field's name to its value.
+
+    NB: All expressions yielded by a HivePartitioning or DirectoryPartitioning
+    will be conjunctions of equality conditions and are accessible through this
+    function. Other subexpressions will be ignored.
+
+    Parameters
+    ----------
+    partition_expression : pyarrow.dataset.Expression
+
+    Returns
+    -------
+    dict
+
+    Examples
+    --------
+
+    For example, an expression of
+    <pyarrow.dataset.Expression ((part == A:string) and (year == 2016:int32))>
+    is converted to {'part': 'A', 'year': 2016}
+    """
+
+class WrittenFile(lib._Weakrefable):
+    """
+    Metadata information about files written as
+    part of a dataset write operation
+
+    Parameters
+    ----------
+    path : str
+        Path to the file.
+    metadata : pyarrow.parquet.FileMetaData, optional
+        For Parquet files, the Parquet file metadata.
+    size : int
+        The size of the file in bytes.
+    """
+    def __init__(self, path: str, metadata: _parquet.FileMetaData | None, size: int) -> None: ...
+
+def _filesystemdataset_write(
+    data: Scanner,
+    base_dir: StrPath,
+    basename_template: str,
+    filesystem: SupportedFileSystem,
+    partitioning: Partitioning,
+    file_options: FileWriteOptions,
+    max_partitions: int,
+    file_visitor: Callable[[str], None],
+    existing_data_behavior: Literal["error", "overwrite_or_ignore", "delete_matching"],
+    max_open_files: int,
+    max_rows_per_file: int,
+    min_rows_per_group: int,
+    max_rows_per_group: int,
+    create_dir: bool,
+): ...
+
+class _ScanNodeOptions(ExecNodeOptions):
+    def _set_options(self, dataset: Dataset, scan_options: dict) -> None: ...
+
+class ScanNodeOptions(_ScanNodeOptions):
+    """
+    A Source node which yields batches from a Dataset scan.
+
+    This is the option class for the "scan" node factory.
+
+    This node is capable of applying pushdown projections or filters
+    to the file readers which reduce the amount of data that needs to
+    be read (if supported by the file format). But note that this does not
+    construct associated filter or project nodes to perform the final
+    filtering or projection. Rather, you may supply the same filter
+    expression or projection to the scan node that you also supply
+    to the filter or project node.
+
+    Yielded batches will be augmented with fragment/batch indices when
+    implicit_ordering=True to enable stable ordering for simple ExecPlans.
+
+    Parameters
+    ----------
+    dataset : pyarrow.dataset.Dataset
+        The table which acts as the data source.
+    **kwargs : dict, optional
+        Scan options. See `Scanner.from_dataset` for possible arguments.
+    require_sequenced_output : bool, default False
+        Batches are yielded sequentially, like single-threaded
+    implicit_ordering : bool, default False
+        Preserve implicit ordering of data.
+    """
+
+    def __init__(
+        self,
+        dataset: Dataset,
+        require_sequenced_output: bool = False,
+        implicit_ordering: bool = False,
+        **kwargs,
+    ) -> None: ...
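To make the scan API above concrete, here is a minimal usage sketch of the kind of caller code these `_dataset` stubs are meant to type-check. It is illustrative only: the `./data` path, the partition fields, and the column names are assumptions, not part of this patch.

import pyarrow as pa
import pyarrow.compute as pc
import pyarrow.dataset as ds

# Assumes a Hive-style layout, e.g. ./data/year=2009/month=11/part-0.parquet
part = ds.partitioning(
    pa.schema([("year", pa.int16()), ("month", pa.int8())]), flavor="hive"
)
dataset = ds.dataset("./data", format="parquet", partitioning=part)

# Build a Scanner with a projection and a pushed-down filter,
# mirroring the Scanner.from_dataset signature declared above.
scanner = dataset.scanner(
    columns=["year", "month"],
    filter=(pc.field("year") == 2009) & (pc.field("month") == 11),
    use_threads=True,
)

n_rows = scanner.count_rows()  # int
preview = scanner.head(5)      # pyarrow.Table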
diff --git a/python/stubs/_dataset_orc.pyi b/python/stubs/_dataset_orc.pyi new file mode 100644 index 00000000000..9c4ac04198f --- /dev/null +++ b/python/stubs/_dataset_orc.pyi @@ -0,0 +1,6 @@ +from ._dataset import FileFormat + +class OrcFileFormat(FileFormat): + def equals(self, other: OrcFileFormat) -> bool: ... + @property + def default_extname(self): ... diff --git a/python/stubs/_dataset_parquet.pyi b/python/stubs/_dataset_parquet.pyi new file mode 100644 index 00000000000..cbcc17235f1 --- /dev/null +++ b/python/stubs/_dataset_parquet.pyi @@ -0,0 +1,314 @@ +from dataclasses import dataclass +from typing import IO, Any, Iterable, TypedDict + +from _typeshed import StrPath + +from ._compute import Expression +from ._dataset import ( + DatasetFactory, + FileFormat, + FileFragment, + FileWriteOptions, + Fragment, + FragmentScanOptions, + Partitioning, + PartitioningFactory, +) +from ._dataset_parquet_encryption import ParquetDecryptionConfig +from ._fs import SupportedFileSystem +from ._parquet import FileDecryptionProperties, FileMetaData +from .lib import CacheOptions, Schema, _Weakrefable + +parquet_encryption_enabled: bool + +class ParquetFileFormat(FileFormat): + """ + FileFormat for Parquet + + Parameters + ---------- + read_options : ParquetReadOptions + Read options for the file. + default_fragment_scan_options : ParquetFragmentScanOptions + Scan Options for the file. + **kwargs : dict + Additional options for read option or scan option + """ + def __init__( + self, + read_options: ParquetReadOptions | None = None, + default_fragment_scan_options: ParquetFragmentScanOptions | None = None, + **kwargs, + ) -> None: ... + @property + def read_options(self) -> ParquetReadOptions: ... + def make_write_options(self) -> ParquetFileWriteOptions: ... # type: ignore[override] + def equals(self, other: ParquetFileFormat) -> bool: ... + @property + def default_extname(self) -> str: ... + def make_fragment( + self, + file: StrPath | IO, + filesystem: SupportedFileSystem | None = None, + partition_expression: Expression | None = None, + row_groups: Iterable[int] | None = None, + *, + file_size: int | None = None, + ) -> Fragment: + """ + Make a FileFragment from a given file. + + Parameters + ---------- + file : file-like object, path-like or str + The file or file path to make a fragment from. + filesystem : Filesystem, optional + If `filesystem` is given, `file` must be a string and specifies + the path of the file to read from the filesystem. + partition_expression : Expression, optional + An expression that is guaranteed true for all rows in the fragment. Allows + fragment to be potentially skipped while scanning with a filter. + row_groups : Iterable, optional + The indices of the row groups to include + file_size : int, optional + The size of the file in bytes. Can improve performance with high-latency filesystems + when file size needs to be known before reading. + + Returns + ------- + fragment : Fragment + The file fragment + """ + +class _NameStats(TypedDict): + min: Any + max: Any + +class RowGroupInfo: + """ + A wrapper class for RowGroup information + + Parameters + ---------- + id : integer + The group ID. + metadata : FileMetaData + The rowgroup metadata. + schema : Schema + Schema of the rows. + """ + + id: int + metadata: FileMetaData + schema: Schema + + def __init__(self, id: int, metadata: FileMetaData, schema: Schema) -> None: ... + @property + def num_rows(self) -> int: ... + @property + def total_byte_size(self) -> int: ... 
+    @property
+    def statistics(self) -> dict[str, _NameStats]: ...
+
+class ParquetFileFragment(FileFragment):
+    """A Fragment representing a parquet file."""
+
+    def ensure_complete_metadata(self) -> None: ...
+    @property
+    def row_groups(self) -> list[RowGroupInfo]: ...
+    @property
+    def metadata(self) -> FileMetaData: ...
+    @property
+    def num_row_groups(self) -> int:
+        """
+        Return the number of row groups viewed by this fragment (not the
+        number of row groups in the origin file).
+        """
+    def split_by_row_group(
+        self, filter: Expression | None = None, schema: Schema | None = None
+    ) -> list[Fragment]:
+        """
+        Split the fragment into multiple fragments.
+
+        Yield a Fragment wrapping each row group in this ParquetFileFragment.
+        Row groups will be excluded whose metadata contradicts the optional
+        filter.
+
+        Parameters
+        ----------
+        filter : Expression, default None
+            Only include the row groups which satisfy this predicate (using
+            the Parquet RowGroup statistics).
+        schema : Schema, default None
+            Schema to use when filtering row groups. Defaults to the
+            Fragment's physical schema
+
+        Returns
+        -------
+        A list of Fragments
+        """
+    def subset(
+        self,
+        filter: Expression | None = None,
+        schema: Schema | None = None,
+        row_group_ids: list[int] | None = None,
+    ) -> ParquetFileFragment:
+        """
+        Create a subset of the fragment (viewing a subset of the row groups).
+
+        Subset can be specified by either a filter predicate (with optional
+        schema) or by a list of row group IDs. Note that when using a filter,
+        the resulting fragment can be empty (viewing no row groups).
+
+        Parameters
+        ----------
+        filter : Expression, default None
+            Only include the row groups which satisfy this predicate (using
+            the Parquet RowGroup statistics).
+        schema : Schema, default None
+            Schema to use when filtering row groups. Defaults to the
+            Fragment's physical schema
+        row_group_ids : list of ints
+            The row group IDs to include in the subset. Can only be specified
+            if `filter` is None.
+
+        Returns
+        -------
+        ParquetFileFragment
+        """
+
+class ParquetReadOptions(_Weakrefable):
+    """
+    Parquet format specific options for reading.
+
+    Parameters
+    ----------
+    dictionary_columns : list of string, default None
+        Names of columns which should be dictionary encoded as
+        they are read
+    coerce_int96_timestamp_unit : str, default None
+        Cast timestamps that are stored in INT96 format to a particular
+        resolution (e.g. 'ms'). Setting to None is equivalent to 'ns'
+        and therefore INT96 timestamps will be inferred as timestamps
+        in nanoseconds
+    """
+    def __init__(
+        self, dictionary_columns: list[str] | None, coerce_int96_timestamp_unit: str | None = None
+    ) -> None: ...
+    @property
+    def coerce_int96_timestamp_unit(self) -> str: ...
+    @coerce_int96_timestamp_unit.setter
+    def coerce_int96_timestamp_unit(self, unit: str) -> None: ...
+    def equals(self, other: ParquetReadOptions) -> bool: ...
+
+class ParquetFileWriteOptions(FileWriteOptions):
+    def update(self, **kwargs) -> None: ...
+    def _set_properties(self) -> None: ...
+    def _set_arrow_properties(self) -> None: ...
+    def _set_encryption_config(self) -> None: ...
+
+@dataclass(kw_only=True)
+class ParquetFragmentScanOptions(FragmentScanOptions):
+    """
+    Scan-specific options for Parquet fragments.
+
+    Parameters
+    ----------
+    use_buffered_stream : bool, default False
+        Read files through buffered input streams rather than loading entire
+        row groups at once. This may be enabled to reduce memory overhead.
+        Disabled by default.
+ buffer_size : int, default 8192 + Size of buffered stream, if enabled. Default is 8KB. + pre_buffer : bool, default True + If enabled, pre-buffer the raw Parquet data instead of issuing one + read per column chunk. This can improve performance on high-latency + filesystems (e.g. S3, GCS) by coalescing and issuing file reads in + parallel using a background I/O thread pool. + Set to False if you want to prioritize minimal memory usage + over maximum speed. + cache_options : pyarrow.CacheOptions, default None + Cache options used when pre_buffer is enabled. The default values should + be good for most use cases. You may want to adjust these for example if + you have exceptionally high latency to the file system. + thrift_string_size_limit : int, default None + If not None, override the maximum total string size allocated + when decoding Thrift structures. The default limit should be + sufficient for most Parquet files. + thrift_container_size_limit : int, default None + If not None, override the maximum total size of containers allocated + when decoding Thrift structures. The default limit should be + sufficient for most Parquet files. + decryption_config : pyarrow.dataset.ParquetDecryptionConfig, default None + If not None, use the provided ParquetDecryptionConfig to decrypt the + Parquet file. + decryption_properties : pyarrow.parquet.FileDecryptionProperties, default None + If not None, use the provided FileDecryptionProperties to decrypt encrypted + Parquet file. + page_checksum_verification : bool, default False + If True, verify the page checksum for each page read from the file. + """ + + use_buffered_stream: bool = False + buffer_size: int = 8192 + pre_buffer: bool = True + cache_options: CacheOptions | None = None + thrift_string_size_limit: int | None = None + thrift_container_size_limit: int | None = None + decryption_config: ParquetDecryptionConfig | None = None + decryption_properties: FileDecryptionProperties | None = None + page_checksum_verification: bool = False + + def equals(self, other: ParquetFragmentScanOptions) -> bool: ... + +@dataclass +class ParquetFactoryOptions(_Weakrefable): + """ + Influences the discovery of parquet dataset. + + Parameters + ---------- + partition_base_dir : str, optional + For the purposes of applying the partitioning, paths will be + stripped of the partition_base_dir. Files not matching the + partition_base_dir prefix will be skipped for partitioning discovery. + The ignored files will still be part of the Dataset, but will not + have partition information. + partitioning : Partitioning, PartitioningFactory, optional + The partitioning scheme applied to fragments, see ``Partitioning``. + validate_column_chunk_paths : bool, default False + Assert that all ColumnChunk paths are consistent. The parquet spec + allows for ColumnChunk data to be stored in multiple files, but + ParquetDatasetFactory supports only a single file with all ColumnChunk + data. If this flag is set construction of a ParquetDatasetFactory will + raise an error if ColumnChunk data is not resident in a single file. + """ + + partition_base_dir: str | None = None + partitioning: Partitioning | PartitioningFactory | None = None + validate_column_chunk_paths: bool = False + +class ParquetDatasetFactory(DatasetFactory): + """ + Create a ParquetDatasetFactory from a Parquet `_metadata` file. + + Parameters + ---------- + metadata_path : str + Path to the `_metadata` parquet metadata-only file generated with + `pyarrow.parquet.write_metadata`. 
+ filesystem : pyarrow.fs.FileSystem + Filesystem to read the metadata_path from, and subsequent parquet + files. + format : ParquetFileFormat + Parquet format options. + options : ParquetFactoryOptions, optional + Various flags influencing the discovery of filesystem paths. + """ + def __init__( + self, + metadata_path: str, + filesystem: SupportedFileSystem, + format: FileFormat, + options: ParquetFactoryOptions | None = None, + ) -> None: ... diff --git a/python/stubs/_dataset_parquet_encryption.pyi b/python/stubs/_dataset_parquet_encryption.pyi new file mode 100644 index 00000000000..7623275b865 --- /dev/null +++ b/python/stubs/_dataset_parquet_encryption.pyi @@ -0,0 +1,85 @@ +from ._dataset_parquet import ParquetFileWriteOptions, ParquetFragmentScanOptions +from ._parquet import FileDecryptionProperties +from ._parquet_encryption import CryptoFactory, EncryptionConfiguration, KmsConnectionConfig +from .lib import _Weakrefable + +class ParquetEncryptionConfig(_Weakrefable): + """ + Core configuration class encapsulating parameters for high-level encryption + within the Parquet framework. + + The ParquetEncryptionConfig class serves as a bridge for passing encryption-related + parameters to the appropriate components within the Parquet library. It maintains references + to objects that define the encryption strategy, Key Management Service (KMS) configuration, + and specific encryption configurations for Parquet data. + + Parameters + ---------- + crypto_factory : pyarrow.parquet.encryption.CryptoFactory + Shared pointer to a `CryptoFactory` object. The `CryptoFactory` is responsible for + creating cryptographic components, such as encryptors and decryptors. + kms_connection_config : pyarrow.parquet.encryption.KmsConnectionConfig + Shared pointer to a `KmsConnectionConfig` object. This object holds the configuration + parameters necessary for connecting to a Key Management Service (KMS). + encryption_config : pyarrow.parquet.encryption.EncryptionConfiguration + Shared pointer to an `EncryptionConfiguration` object. This object defines specific + encryption settings for Parquet data, including the keys assigned to different columns. + + Raises + ------ + ValueError + Raised if `encryption_config` is None. + """ + def __init__( + self, + crypto_factory: CryptoFactory, + kms_connection_config: KmsConnectionConfig, + encryption_config: EncryptionConfiguration, + ) -> None: ... + +class ParquetDecryptionConfig(_Weakrefable): + """ + Core configuration class encapsulating parameters for high-level decryption + within the Parquet framework. + + ParquetDecryptionConfig is designed to pass decryption-related parameters to + the appropriate decryption components within the Parquet library. It holds references to + objects that define the decryption strategy, Key Management Service (KMS) configuration, + and specific decryption configurations for reading encrypted Parquet data. + + Parameters + ---------- + crypto_factory : pyarrow.parquet.encryption.CryptoFactory + Shared pointer to a `CryptoFactory` object, pivotal in creating cryptographic + components for the decryption process. + kms_connection_config : pyarrow.parquet.encryption.KmsConnectionConfig + Shared pointer to a `KmsConnectionConfig` object, containing parameters necessary + for connecting to a Key Management Service (KMS) during decryption. 
+ decryption_config : pyarrow.parquet.encryption.DecryptionConfiguration + Shared pointer to a `DecryptionConfiguration` object, specifying decryption settings + for reading encrypted Parquet data. + + Raises + ------ + ValueError + Raised if `decryption_config` is None. + """ + def __init__( + self, + crypto_factory: CryptoFactory, + kms_connection_config: KmsConnectionConfig, + encryption_config: EncryptionConfiguration, + ) -> None: ... + +def set_encryption_config( + opts: ParquetFileWriteOptions, + config: ParquetEncryptionConfig, +) -> None: ... +def set_decryption_properties( + opts: ParquetFragmentScanOptions, + config: FileDecryptionProperties, +): ... +def set_decryption_config( + opts: ParquetFragmentScanOptions, + config: ParquetDecryptionConfig, +): ... diff --git a/python/stubs/_feather.pyi b/python/stubs/_feather.pyi new file mode 100644 index 00000000000..8bb914ba45d --- /dev/null +++ b/python/stubs/_feather.pyi @@ -0,0 +1,29 @@ +from typing import IO + +from _typeshed import StrPath + +from .lib import Buffer, NativeFile, Table, _Weakrefable + +class FeatherError(Exception): ... + +def write_feather( + table: Table, + dest: StrPath | IO | NativeFile, + compression: str | None = None, + compression_level: int | None = None, + chunksize: int | None = None, + version: int = 2, +): ... + +class FeatherReader(_Weakrefable): + def __init__( + self, + source: StrPath | IO | NativeFile | Buffer, + use_memory_map: bool, + use_threads: bool, + ) -> None: ... + @property + def version(self) -> str: ... + def read(self) -> Table: ... + def read_indices(self, indices: list[int]) -> Table: ... + def read_names(self, names: list[str]) -> Table: ... diff --git a/python/stubs/_flight.pyi b/python/stubs/_flight.pyi new file mode 100644 index 00000000000..4450c42df49 --- /dev/null +++ b/python/stubs/_flight.pyi @@ -0,0 +1,1380 @@ +import asyncio +import enum +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from typing import Generator, Generic, Iterable, Iterator, NamedTuple, TypeVar + +from typing_extensions import deprecated + +from .ipc import _ReadPandasMixin +from .lib import ( + ArrowCancelled, + ArrowException, + ArrowInvalid, + Buffer, + IpcReadOptions, + IpcWriteOptions, + RecordBatch, + RecordBatchReader, + Schema, + Table, + TimestampScalar, + _CRecordBatchWriter, + _Weakrefable, +) + +_T = TypeVar("_T") + +class FlightCallOptions(_Weakrefable): + """RPC-layer options for a Flight call.""" + + def __init__( + self, + timeout: float | None = None, + write_options: IpcWriteOptions | None = None, + headers: list[tuple[str, str]] | None = None, + read_options: IpcReadOptions | None = None, + ) -> None: + """Create call options. + + Parameters + ---------- + timeout : float, None + A timeout for the call, in seconds. None means that the + timeout defaults to an implementation-specific value. + write_options : pyarrow.ipc.IpcWriteOptions, optional + IPC write options. The default options can be controlled + by environment variables (see pyarrow.ipc). + headers : List[Tuple[str, str]], optional + A list of arbitrary headers as key, value tuples + read_options : pyarrow.ipc.IpcReadOptions, optional + Serialization options for reading IPC format. + """ + +class CertKeyPair(NamedTuple): + """A TLS certificate and key for use in Flight.""" + + cert: str + key: str + +class FlightError(Exception): + """ + The base class for Flight-specific errors. 
+ + A server may raise this class or one of its subclasses to provide + a more detailed error to clients. + + Parameters + ---------- + message : str, optional + The error message. + extra_info : bytes, optional + Extra binary error details that were provided by the + server/will be sent to the client. + + Attributes + ---------- + extra_info : bytes + Extra binary error details that were provided by the + server/will be sent to the client. + """ + + extra_info: bytes + +class FlightInternalError(FlightError, ArrowException): + """An error internal to the Flight server occurred.""" + +class FlightTimedOutError(FlightError, ArrowException): + """The Flight RPC call timed out.""" + +class FlightCancelledError(FlightError, ArrowCancelled): + """The operation was cancelled.""" + +class FlightServerError(FlightError, ArrowException): + """A server error occurred.""" + +class FlightUnauthenticatedError(FlightError, ArrowException): + """The client is not authenticated.""" + +class FlightUnauthorizedError(FlightError, ArrowException): + """The client is not authorized to perform the given operation.""" + +class FlightUnavailableError(FlightError, ArrowException): + """The server is not reachable or available.""" + +class FlightWriteSizeExceededError(ArrowInvalid): + """A write operation exceeded the client-configured limit.""" + + limit: int + actual: int + +class Action(_Weakrefable): + """An action executable on a Flight service.""" + + def __init__(self, action_type: bytes | str, buf: Buffer | bytes) -> None: + """Create an action from a type and a buffer. + + Parameters + ---------- + action_type : bytes or str + buf : Buffer or bytes-like object + """ + @property + def type(self) -> str: + """The action type.""" + @property + def body(self) -> Buffer: + """The action body (arguments for the action).""" + def serialize(self) -> bytes: + """Get the wire-format representation of this type. + + Useful when interoperating with non-Flight systems (e.g. REST + services) that may want to return Flight types. + + """ + @classmethod + def deserialize(cls, serialized: bytes) -> Self: + """Parse the wire-format representation of this type. + + Useful when interoperating with non-Flight systems (e.g. REST + services) that may want to return Flight types. + + """ + +class ActionType(NamedTuple): + """A type of action that is executable on a Flight service.""" + + type: str + description: str + + def make_action(self, buf: Buffer | bytes) -> Action: + """Create an Action with this type. + + Parameters + ---------- + buf : obj + An Arrow buffer or Python bytes or bytes-like object. + """ + +class Result(_Weakrefable): + """A result from executing an Action.""" + def __init__(self, buf: Buffer | bytes) -> None: + """Create a new result. + + Parameters + ---------- + buf : Buffer or bytes-like object + """ + @property + def body(self) -> Buffer: + """Get the Buffer containing the result.""" + def serialize(self) -> bytes: + """Get the wire-format representation of this type. + + Useful when interoperating with non-Flight systems (e.g. REST + services) that may want to return Flight types. + + """ + @classmethod + def deserialize(cls, serialized: bytes) -> Self: + """Parse the wire-format representation of this type. + + Useful when interoperating with non-Flight systems (e.g. REST + services) that may want to return Flight types. 
+ + """ + +class BasicAuth(_Weakrefable): + """A container for basic auth.""" + def __init__( + self, username: str | bytes | None = None, password: str | bytes | None = None + ) -> None: + """Create a new basic auth object. + + Parameters + ---------- + username : string + password : string + """ + @property + def username(self) -> bytes: ... + @property + def password(self) -> bytes: ... + def serialize(self) -> str: ... + @staticmethod + def deserialize(serialized: str | bytes) -> BasicAuth: ... + +class DescriptorType(enum.Enum): + """ + The type of a FlightDescriptor. + + Attributes + ---------- + + UNKNOWN + An unknown descriptor type. + + PATH + A Flight stream represented by a path. + + CMD + A Flight stream represented by an application-defined command. + + """ + + UNKNOWN = 0 + PATH = 1 + CMD = 2 + +class FlightMethod(enum.Enum): + """The implemented methods in Flight.""" + + INVALID = 0 + HANDSHAKE = 1 + LIST_FLIGHTS = 2 + GET_FLIGHT_INFO = 3 + GET_SCHEMA = 4 + DO_GET = 5 + DO_PUT = 6 + DO_ACTION = 7 + LIST_ACTIONS = 8 + DO_EXCHANGE = 9 + +class FlightDescriptor(_Weakrefable): + """A description of a data stream available from a Flight service.""" + @staticmethod + def for_path(*path: str | bytes) -> FlightDescriptor: + """Create a FlightDescriptor for a resource path.""" + + @staticmethod + def for_command(command: str | bytes) -> FlightDescriptor: + """Create a FlightDescriptor for an opaque command.""" + @property + def descriptor_type(self) -> DescriptorType: + """Get the type of this descriptor.""" + @property + def path(self) -> list[bytes] | None: + """Get the path for this descriptor.""" + @property + def command(self) -> bytes | None: + """Get the command for this descriptor.""" + def serialize(self) -> bytes: ... + @classmethod + def deserialize(cls, serialized: bytes) -> Self: ... + +class Ticket(_Weakrefable): + """A ticket for requesting a Flight stream.""" + def __init__(self, ticket: str | bytes) -> None: ... + @property + def ticket(self) -> bytes: ... + def serialize(self) -> bytes: ... + @classmethod + def deserialize(cls, serialized: bytes) -> Self: ... + +class Location(_Weakrefable): + """The location of a Flight service.""" + def __init__(self, uri: str | bytes) -> None: ... + @property + def uri(self) -> bytes: ... + def equals(self, other: Location) -> bool: ... + @staticmethod + def for_grpc_tcp(host: str | bytes, port: int) -> Location: + """Create a Location for a TCP-based gRPC service.""" + @staticmethod + def for_grpc_tls(host: str | bytes, port: int) -> Location: + """Create a Location for a TLS-based gRPC service.""" + @staticmethod + def for_grpc_unix(path: str | bytes) -> Location: + """Create a Location for a domain socket-based gRPC service.""" + +class FlightEndpoint(_Weakrefable): + """A Flight stream, along with the ticket and locations to access it.""" + def __init__( + self, + ticket: Ticket | str | bytes, + locations: list[str | Location], + expiration_time: TimestampScalar | None = ..., + app_metadata: bytes | str = ..., + ): + """Create a FlightEndpoint from a ticket and list of locations. + + Parameters + ---------- + ticket : Ticket or bytes + the ticket needed to access this flight + locations : list of string URIs + locations where this flight is available + expiration_time : TimestampScalar, default None + Expiration time of this stream. If present, clients may assume + they can retry DoGet requests. Otherwise, clients should avoid + retrying DoGet requests. 
+ app_metadata : bytes or str, default "" + Application-defined opaque metadata. + + Raises + ------ + ArrowException + If one of the location URIs is not a valid URI. + """ + @property + def ticket(self) -> Ticket: + """Get the ticket in this endpoint.""" + @property + def locations(self) -> list[Location]: + """Get locations where this flight is available.""" + def serialize(self) -> bytes: ... + @property + def expiration_time(self) -> TimestampScalar | None: + """Get the expiration time of this stream. + + If present, clients may assume they can retry DoGet requests. + Otherwise, clients should avoid retrying DoGet requests. + + """ + @property + def app_metadata(self) -> bytes | str: + """Get application-defined opaque metadata.""" + @classmethod + def deserialize(cls, serialized: bytes) -> Self: ... + +class SchemaResult(_Weakrefable): + """The serialized schema returned from a GetSchema request.""" + def __init__(self, schema: Schema) -> None: + """Create a SchemaResult from a schema. + + Parameters + ---------- + schema: Schema + the schema of the data in this flight. + """ + @property + def schema(self) -> Schema: + """The schema of the data in this flight.""" + def serialize(self) -> bytes: ... + @classmethod + def deserialize(cls, serialized: bytes) -> Self: ... + +class FlightInfo(_Weakrefable): + """A description of a Flight stream.""" + def __init__( + self, + schema: Schema, + descriptor: FlightDescriptor, + endpoints: list[FlightEndpoint], + total_records: int = ..., + total_bytes: int = ..., + ordered: bool = ..., + app_metadata: bytes | str = ..., + ) -> None: + """Create a FlightInfo object from a schema, descriptor, and endpoints. + + Parameters + ---------- + schema : Schema + the schema of the data in this flight. + descriptor : FlightDescriptor + the descriptor for this flight. + endpoints : list of FlightEndpoint + a list of endpoints where this flight is available. + total_records : int, default None + the total records in this flight, -1 or None if unknown. + total_bytes : int, default None + the total bytes in this flight, -1 or None if unknown. + ordered : boolean, default False + Whether endpoints are in the same order as the data. + app_metadata : bytes or str, default "" + Application-defined opaque metadata. + """ + @property + def schema(self) -> Schema: + """The schema of the data in this flight.""" + @property + def descriptor(self) -> FlightDescriptor: + """The descriptor of the data in this flight.""" + @property + def endpoints(self) -> list[FlightEndpoint]: + """The endpoints where this flight is available.""" + @property + def total_records(self) -> int: + """The total record count of this flight, or -1 if unknown.""" + @property + def total_bytes(self) -> int: + """The size in bytes of the data in this flight, or -1 if unknown.""" + @property + def ordered(self) -> bool: + """Whether endpoints are in the same order as the data.""" + @property + def app_metadata(self) -> bytes | str: + """ + Application-defined opaque metadata. + + There is no inherent or required relationship between this and the + app_metadata fields in the FlightEndpoints or resulting FlightData + messages. Since this metadata is application-defined, a given + application could define there to be a relationship, but there is + none required by the spec. + + """ + def serialize(self) -> bytes: ... + @classmethod + def deserialize(cls, serialized: bytes) -> Self: ... 
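# A minimal, hypothetical client-side sketch of how the Flight types declared
# above (FlightDescriptor, FlightInfo, FlightEndpoint, Ticket) fit together;
# the gRPC URI and the path are illustrative assumptions, not part of this patch.
import pyarrow.flight as flight

client = flight.connect("grpc://localhost:8815")                   # FlightClient
descriptor = flight.FlightDescriptor.for_path("example.parquet")   # FlightDescriptor
info = client.get_flight_info(descriptor)                          # FlightInfo
for endpoint in info.endpoints:                                    # list[FlightEndpoint]
    reader = client.do_get(endpoint.ticket)                        # FlightStreamReader
    table = reader.read_all()                                      # pyarrow.Table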
+ +class FlightStreamChunk(_Weakrefable): + """A RecordBatch with application metadata on the side.""" + @property + def data(self) -> RecordBatch | None: ... + @property + def app_metadata(self) -> Buffer | None: ... + def __iter__(self): ... + +class _MetadataRecordBatchReader(_Weakrefable, _ReadPandasMixin): + """A reader for Flight streams.""" + + # Needs to be separate class so the "real" class can subclass the + # pure-Python mixin class + + def __iter__(self) -> Self: ... + def __next__(self) -> FlightStreamChunk: ... + @property + def schema(self) -> Schema: + """Get the schema for this reader.""" + def read_all(self) -> Table: + """Read the entire contents of the stream as a Table.""" + def read_chunk(self) -> FlightStreamChunk: + """Read the next FlightStreamChunk along with any metadata. + + Returns + ------- + chunk : FlightStreamChunk + The next FlightStreamChunk in the stream. + + Raises + ------ + StopIteration + when the stream is finished + """ + def to_reader(self) -> RecordBatchReader: + """Convert this reader into a regular RecordBatchReader. + + This may fail if the schema cannot be read from the remote end. + + Returns + ------- + RecordBatchReader + """ + +class MetadataRecordBatchReader(_MetadataRecordBatchReader): + """The base class for readers for Flight streams. + + See Also + -------- + FlightStreamReader + """ + +class FlightStreamReader(MetadataRecordBatchReader): + """A reader that can also be canceled.""" + def cancel(self) -> None: + """Cancel the read operation.""" + def read_all(self) -> Table: + """Read the entire contents of the stream as a Table.""" + +class MetadataRecordBatchWriter(_CRecordBatchWriter): + """A RecordBatchWriter that also allows writing application metadata. + + This class is a context manager; on exit, close() will be called. + """ + + def begin(self, schema: Schema, options: IpcWriteOptions | None = None) -> None: + """Prepare to write data to this stream with the given schema.""" + def write_metadata(self, buf: Buffer) -> None: + """Write Flight metadata by itself.""" + def write_batch(self, batch: RecordBatch) -> None: # type: ignore[override] + """ + Write RecordBatch to stream. + + Parameters + ---------- + batch : RecordBatch + """ + def write_table(self, table: Table, max_chunksize: int | None = None, **kwargs) -> None: + """ + Write Table to stream in (contiguous) RecordBatch objects. + + Parameters + ---------- + table : Table + max_chunksize : int, default None + Maximum number of rows for RecordBatch chunks. Individual chunks may + be smaller depending on the chunk layout of individual columns. + """ + def close(self) -> None: + """ + Close stream and write end-of-stream 0 marker. + """ + def write_with_metadata(self, batch: RecordBatch, buf: Buffer) -> None: + """Write a RecordBatch along with Flight metadata. + + Parameters + ---------- + batch : RecordBatch + The next RecordBatch in the stream. + buf : Buffer + Application-specific metadata for the batch as defined by + Flight. 
+ """ + +class FlightStreamWriter(MetadataRecordBatchWriter): + """A writer that also allows closing the write side of a stream.""" + def done_writing(self) -> None: + """Indicate that the client is done writing, but not done reading.""" + +class FlightMetadataReader(_Weakrefable): + """A reader for Flight metadata messages sent during a DoPut.""" + def read(self) -> Buffer | None: + """Read the next metadata message.""" + +class FlightMetadataWriter(_Weakrefable): + """A sender for Flight metadata messages during a DoPut.""" + def write(self, message: Buffer) -> None: + """Write the next metadata message. + + Parameters + ---------- + message : Buffer + """ + +class AsyncioCall(Generic[_T]): + """State for an async RPC using asyncio.""" + + _future: asyncio.Future[_T] + + def as_awaitable(self) -> asyncio.Future[_T]: ... + def wakeup(self, result_or_exception: BaseException | _T) -> None: ... + +class AsyncioFlightClient: + """ + A FlightClient with an asyncio-based async interface. + + This interface is EXPERIMENTAL. + """ + + def __init__(self, client: FlightClient) -> None: ... + async def get_flight_info( + self, + descriptor: FlightDescriptor, + *, + options: FlightCallOptions | None = None, + ): ... + +class FlightClient(_Weakrefable): + """A client to a Flight service. + + Connect to a Flight service on the given host and port. + + Parameters + ---------- + location : str, tuple or Location + Location to connect to. Either a gRPC URI like `grpc://localhost:port`, + a tuple of (host, port) pair, or a Location instance. + tls_root_certs : bytes or None + PEM-encoded + cert_chain: bytes or None + Client certificate if using mutual TLS + private_key: bytes or None + Client private key for cert_chain is using mutual TLS + override_hostname : str or None + Override the hostname checked by TLS. Insecure, use with caution. + middleware : list optional, default None + A list of ClientMiddlewareFactory instances. + write_size_limit_bytes : int optional, default None + A soft limit on the size of a data payload sent to the + server. Enabled if positive. If enabled, writing a record + batch that (when serialized) exceeds this limit will raise an + exception; the client can retry the write with a smaller + batch. + disable_server_verification : boolean optional, default False + A flag that indicates that, if the client is connecting + with TLS, that it skips server verification. If this is + enabled, all other TLS settings are overridden. + generic_options : list optional, default None + A list of generic (string, int or string) option tuples passed + to the underlying transport. Effect is implementation + dependent. + """ + def __init__( + self, + location: str | tuple[str, int] | Location, + *, + tls_root_certs: str | None = None, + cert_chain: str | None = None, + private_key: str | None = None, + override_hostname: str | None = None, + middleware: list[ClientMiddlewareFactory] | None = None, + write_size_limit_bytes: int | None = None, + disable_server_verification: bool = False, + generic_options: list[tuple[str, int | str]] | None = None, + ): ... + @property + def supports_async(self) -> bool: ... + def as_async(self) -> AsyncioFlightClient: ... + def wait_for_available(self, timeout: int = 5) -> None: + """Block until the server can be contacted. + + Parameters + ---------- + timeout : int, default 5 + The maximum seconds to wait. + """ + @deprecated( + "Use the ``FlightClient`` constructor or ``pyarrow.flight.connect`` function instead." 
+ ) + @classmethod + def connect( + cls, + location: str | tuple[str, int] | Location, + tls_root_certs: str | None = None, + cert_chain: str | None = None, + private_key: str | None = None, + override_hostname: str | None = None, + disable_server_verification: bool = False, + ) -> FlightClient: + """Connect to a Flight server. + + .. deprecated:: 0.15.0 + Use the ``FlightClient`` constructor or ``pyarrow.flight.connect`` function instead. + """ + def authenticate( + self, auth_handler: ClientAuthHandler, options: FlightCallOptions | None = None + ) -> None: + """Authenticate to the server. + + Parameters + ---------- + auth_handler : ClientAuthHandler + The authentication mechanism to use. + options : FlightCallOptions + Options for this call. + """ + def authenticate_basic_token( + self, username: str, password: str, options: FlightCallOptions | None = None + ) -> tuple[str, str]: + """Authenticate to the server with HTTP basic authentication. + + Parameters + ---------- + username : string + Username to authenticate with + password : string + Password to authenticate with + options : FlightCallOptions + Options for this call + + Returns + ------- + tuple : Tuple[str, str] + A tuple representing the FlightCallOptions authorization + header entry of a bearer token. + """ + def list_actions(self, options: FlightCallOptions | None = None) -> list[Action]: + """List the actions available on a service.""" + def do_action( + self, action: Action, options: FlightCallOptions | None = None + ) -> Iterator[Result]: + """ + Execute an action on a service. + + Parameters + ---------- + action : str, tuple, or Action + Can be action type name (no body), type and body, or any Action + object + options : FlightCallOptions + RPC options + + Returns + ------- + results : iterator of Result values + """ + def list_flights( + self, criteria: str | None = None, options: FlightCallOptions | None = None + ) -> Generator[FlightInfo, None, None]: + """List the flights available on a service.""" + def get_flight_info( + self, descriptor: FlightDescriptor, options: FlightCallOptions | None = None + ) -> FlightInfo: + """Request information about an available flight.""" + def get_schema( + self, descriptor: FlightDescriptor, options: FlightCallOptions | None = None + ) -> Schema: + """Request schema for an available flight.""" + def do_get( + self, ticket: Ticket, options: FlightCallOptions | None = None + ) -> FlightStreamReader: + """Request the data for a flight. + + Returns + ------- + reader : FlightStreamReader + """ + def do_put( + self, + descriptor: FlightDescriptor, + schema: Schema, + options: FlightCallOptions | None = None, + ) -> tuple[FlightStreamWriter, FlightStreamReader]: + """Upload data to a flight. + + Returns + ------- + writer : FlightStreamWriter + reader : FlightMetadataReader + """ + def do_exchange( + self, descriptor: FlightDescriptor, options: FlightCallOptions | None = None + ) -> tuple[FlightStreamWriter, FlightStreamReader]: + """Start a bidirectional data exchange with a server. + + Parameters + ---------- + descriptor : FlightDescriptor + A descriptor for the flight. + options : FlightCallOptions + RPC options. + + Returns + ------- + writer : FlightStreamWriter + reader : FlightStreamReader + """ + def close(self) -> None: + """Close the client and disconnect.""" + def __enter__(self) -> Self: ... + def __exit__(self, exc_type, exc_value, traceback) -> None: ... + +class FlightDataStream(_Weakrefable): + """ + Abstract base class for Flight data streams. 
+ + See Also + -------- + RecordBatchStream + GeneratorStream + """ + +class RecordBatchStream(FlightDataStream): + """A Flight data stream backed by RecordBatches. + + The remainder of this DoGet request will be handled in C++, + without having to acquire the GIL. + + """ + def __init__( + self, data_source: RecordBatchReader | Table, options: IpcWriteOptions | None = None + ) -> None: + """Create a RecordBatchStream from a data source. + + Parameters + ---------- + data_source : RecordBatchReader or Table + The data to stream to the client. + options : pyarrow.ipc.IpcWriteOptions, optional + Optional IPC options to control how to write the data. + """ + +class GeneratorStream(FlightDataStream): + """A Flight data stream backed by a Python generator.""" + def __init__( + self, + schema: Schema, + generator: Iterable[FlightDataStream | Table | RecordBatch | RecordBatchReader], + options: IpcWriteOptions | None = None, + ) -> None: + """Create a GeneratorStream from a Python generator. + + Parameters + ---------- + schema : Schema + The schema for the data to be returned. + + generator : iterator or iterable + The generator should yield other FlightDataStream objects, + Tables, RecordBatches, or RecordBatchReaders. + + options : pyarrow.ipc.IpcWriteOptions, optional + """ + +class ServerCallContext(_Weakrefable): + """Per-call state/context.""" + def peer_identity(self) -> bytes: + """Get the identity of the authenticated peer. + + May be the empty string. + """ + def peer(self) -> str: + """Get the address of the peer.""" + # Set safe=True as gRPC on Windows sometimes gives garbage bytes + def is_cancelled(self) -> bool: + """Check if the current RPC call has been canceled by the client.""" + def add_header(self, key: str, value: str) -> None: + """Add a response header.""" + def add_trailer(self, key: str, value: str) -> None: + """Add a response trailer.""" + def get_middleware(self, key: str) -> ServerMiddleware | None: + """ + Get a middleware instance by key. + + Returns None if the middleware was not found. + """ + +class ServerAuthReader(_Weakrefable): + """A reader for messages from the client during an auth handshake.""" + def read(self) -> str: ... + +class ServerAuthSender(_Weakrefable): + """A writer for messages to the client during an auth handshake.""" + def write(self, message: str) -> None: ... + +class ClientAuthReader(_Weakrefable): + """A reader for messages from the server during an auth handshake.""" + def read(self) -> str: ... + +class ClientAuthSender(_Weakrefable): + """A writer for messages to the server during an auth handshake.""" + def write(self, message: str) -> None: ... + +class ServerAuthHandler(_Weakrefable): + """Authentication middleware for a server. + + To implement an authentication mechanism, subclass this class and + override its methods. + + """ + def authenticate(self, outgoing: ServerAuthSender, incoming: ServerAuthReader): + """Conduct the handshake with the client. + + May raise an error if the client cannot authenticate. + + Parameters + ---------- + outgoing : ServerAuthSender + A channel to send messages to the client. + incoming : ServerAuthReader + A channel to read messages from the client. + """ + def is_valid(self, token: str) -> bool: + """Validate a client token, returning their identity. + + May return an empty string (if the auth mechanism does not + name the peer) or raise an exception (if the token is + invalid). + + Parameters + ---------- + token : bytes + The authentication token from the client. 
+ + """ + +class ClientAuthHandler(_Weakrefable): + """Authentication plugin for a client.""" + def authenticate(self, outgoing: ClientAuthSender, incoming: ClientAuthReader): + """Conduct the handshake with the server. + + Parameters + ---------- + outgoing : ClientAuthSender + A channel to send messages to the server. + incoming : ClientAuthReader + A channel to read messages from the server. + """ + def get_token(self) -> str: + """Get the auth token for a call.""" + +class CallInfo(NamedTuple): + """Information about a particular RPC for Flight middleware.""" + + method: FlightMethod + +class ClientMiddlewareFactory(_Weakrefable): + """A factory for new middleware instances. + + All middleware methods will be called from the same thread as the + RPC method implementation. That is, thread-locals set in the + client are accessible from the middleware itself. + + """ + def start_call(self, info: CallInfo) -> ClientMiddleware | None: + """Called at the start of an RPC. + + This must be thread-safe and must not raise exceptions. + + Parameters + ---------- + info : CallInfo + Information about the call. + + Returns + ------- + instance : ClientMiddleware + An instance of ClientMiddleware (the instance to use for + the call), or None if this call is not intercepted. + + """ + +class ClientMiddleware(_Weakrefable): + """Client-side middleware for a call, instantiated per RPC. + + Methods here should be fast and must be infallible: they should + not raise exceptions or stall indefinitely. + + """ + + def sending_headers(self) -> dict[str, list[str] | list[bytes]]: + """A callback before headers are sent. + + Returns + ------- + headers : dict + A dictionary of header values to add to the request, or + None if no headers are to be added. The dictionary should + have string keys and string or list-of-string values. + + Bytes values are allowed, but the underlying transport may + not support them or may restrict them. For gRPC, binary + values are only allowed on headers ending in "-bin". + + Header names must be lowercase ASCII. + + """ + + def received_headers(self, headers: dict[str, list[str] | list[bytes]]): + """A callback when headers are received. + + The default implementation does nothing. + + Parameters + ---------- + headers : dict + A dictionary of headers from the server. Keys are strings + and values are lists of strings (for text headers) or + bytes (for binary headers). + + """ + + def call_completed(self, exception: ArrowException): + """A callback when the call finishes. + + The default implementation does nothing. + + Parameters + ---------- + exception : ArrowException + If the call errored, this is the equivalent + exception. Will be None if the call succeeded. + + """ + +class ServerMiddlewareFactory(_Weakrefable): + """A factory for new middleware instances. + + All middleware methods will be called from the same thread as the + RPC method implementation. That is, thread-locals set in the + middleware are accessible from the method itself. + + """ + + def start_call( + self, info: CallInfo, headers: dict[str, list[str] | list[bytes]] + ) -> ServerMiddleware | None: + """Called at the start of an RPC. + + This must be thread-safe. + + Parameters + ---------- + info : CallInfo + Information about the call. + headers : dict + A dictionary of headers from the client. Keys are strings + and values are lists of strings (for text headers) or + bytes (for binary headers). 
+ + Returns + ------- + instance : ServerMiddleware + An instance of ServerMiddleware (the instance to use for + the call), or None if this call is not intercepted. + + Raises + ------ + exception : pyarrow.ArrowException + If an exception is raised, the call will be rejected with + the given error. + + """ + +class TracingServerMiddlewareFactory(ServerMiddlewareFactory): + """A factory for tracing middleware instances. + + This enables OpenTelemetry support in Arrow (if Arrow was compiled + with OpenTelemetry support enabled). A new span will be started on + each RPC call. The TracingServerMiddleware instance can then be + retrieved within an RPC handler to get the propagated context, + which can be used to start a new span on the Python side. + + Because the Python/C++ OpenTelemetry libraries do not + interoperate, spans on the C++ side are not directly visible to + the Python side and vice versa. + + """ + +class ServerMiddleware(_Weakrefable): + """Server-side middleware for a call, instantiated per RPC. + + Methods here should be fast and must be infallible: they should + not raise exceptions or stall indefinitely. + + """ + + def sending_headers(self) -> dict[str, list[str] | list[bytes]]: + """A callback before headers are sent. + + Returns + ------- + headers : dict + A dictionary of header values to add to the response, or + None if no headers are to be added. The dictionary should + have string keys and string or list-of-string values. + + Bytes values are allowed, but the underlying transport may + not support them or may restrict them. For gRPC, binary + values are only allowed on headers ending in "-bin". + + Header names must be lowercase ASCII. + + """ + def call_completed(self, exception: ArrowException): + """A callback when the call finishes. + + Parameters + ---------- + exception : pyarrow.ArrowException + If the call errored, this is the equivalent + exception. Will be None if the call succeeded. + + """ + +class TracingServerMiddleware(ServerMiddleware): + trace_context: dict + def __init__(self, trace_context: dict) -> None: ... + +class _ServerMiddlewareFactoryWrapper(ServerMiddlewareFactory): + """Wrapper to bundle server middleware into a single C++ one.""" + + def __init__(self, factories: dict[str, ServerMiddlewareFactory]) -> None: ... + def start_call( # type: ignore[override] + self, info: CallInfo, headers: dict[str, list[str] | list[bytes]] + ) -> _ServerMiddlewareFactoryWrapper | None: ... + +class _ServerMiddlewareWrapper(ServerMiddleware): + def __init__(self, middleware: dict[str, ServerMiddleware]) -> None: ... + def send_headers(self) -> dict[str, dict[str, list[str] | list[bytes]]]: ... + def call_completed(self, exception: ArrowException) -> None: ... + +class _FlightServerFinalizer(_Weakrefable): + """ + A finalizer that shuts down the server on destruction. + + See ARROW-16597. If the server is still active at interpreter + exit, the process may segfault. + """ + + def finalize(self) -> None: ... + +class FlightServerBase(_Weakrefable): + """A Flight service definition. + + To start the server, create an instance of this class with an + appropriate location. The server will be running as soon as the + instance is created; it is not required to call :meth:`serve`. + + Override methods to define your Flight service. + + Parameters + ---------- + location : str, tuple or Location optional, default None + Location to serve on. Either a gRPC URI like `grpc://localhost:port`, + a tuple of (host, port) pair, or a Location instance. 
+ If None is passed then the server will be started on localhost with a + system provided random port. + auth_handler : ServerAuthHandler optional, default None + An authentication mechanism to use. May be None. + tls_certificates : list optional, default None + A list of (certificate, key) pairs. + verify_client : boolean optional, default False + If True, then enable mutual TLS: require the client to present + a client certificate, and validate the certificate. + root_certificates : bytes optional, default None + If enabling mutual TLS, this specifies the PEM-encoded root + certificate used to validate client certificates. + middleware : dict optional, default None + A dictionary of :class:`ServerMiddlewareFactory` instances. The + string keys can be used to retrieve the middleware instance within + RPC handlers (see :meth:`ServerCallContext.get_middleware`). + + """ + def __init__( + self, + location: str | tuple[str, int] | Location | None = None, + auth_handler: ServerAuthHandler | None = None, + tls_certificates: list[tuple[str, str]] | None = None, + verify_client: bool = False, + root_certificates: str | None = None, + middleware: dict[str, ServerMiddlewareFactory] | None = None, + ): ... + @property + def port(self) -> int: + """ + Get the port that this server is listening on. + + Returns a non-positive value if the operation is invalid + (e.g. init() was not called or server is listening on a domain + socket). + """ + def list_flights(self, context: ServerCallContext, criteria: str) -> Iterator[FlightInfo]: + """List flights available on this service. + + Applications should override this method to implement their + own behavior. The default method raises a NotImplementedError. + + Parameters + ---------- + context : ServerCallContext + Common contextual information. + criteria : bytes + Filter criteria provided by the client. + + Returns + ------- + iterator of FlightInfo + + """ + def get_flight_info( + self, context: ServerCallContext, descriptor: FlightDescriptor + ) -> FlightInfo: + """Get information about a flight. + + Applications should override this method to implement their + own behavior. The default method raises a NotImplementedError. + + Parameters + ---------- + context : ServerCallContext + Common contextual information. + descriptor : FlightDescriptor + The descriptor for the flight provided by the client. + + Returns + ------- + FlightInfo + + """ + def get_schema(self, context: ServerCallContext, descriptor: FlightDescriptor) -> Schema: + """Get the schema of a flight. + + Applications should override this method to implement their + own behavior. The default method raises a NotImplementedError. + + Parameters + ---------- + context : ServerCallContext + Common contextual information. + descriptor : FlightDescriptor + The descriptor for the flight provided by the client. + + Returns + ------- + Schema + + """ + def do_put( + self, + context: ServerCallContext, + descriptor: FlightDescriptor, + reader: MetadataRecordBatchReader, + writer: FlightMetadataWriter, + ) -> None: + """Write data to a flight. + + Applications should override this method to implement their + own behavior. The default method raises a NotImplementedError. + + Parameters + ---------- + context : ServerCallContext + Common contextual information. + descriptor : FlightDescriptor + The descriptor for the flight provided by the client. + reader : MetadataRecordBatchReader + A reader for data uploaded by the client. + writer : FlightMetadataWriter + A writer to send responses to the client. 
+ + """ + def do_get(self, context: ServerCallContext, ticket: Ticket) -> FlightDataStream: + """Write data to a flight. + + Applications should override this method to implement their + own behavior. The default method raises a NotImplementedError. + + Parameters + ---------- + context : ServerCallContext + Common contextual information. + ticket : Ticket + The ticket for the flight. + + Returns + ------- + FlightDataStream + A stream of data to send back to the client. + + """ + def do_exchange( + self, + context: ServerCallContext, + descriptor: FlightDescriptor, + reader: MetadataRecordBatchReader, + writer: MetadataRecordBatchWriter, + ) -> None: + """Write data to a flight. + + Applications should override this method to implement their + own behavior. The default method raises a NotImplementedError. + + Parameters + ---------- + context : ServerCallContext + Common contextual information. + descriptor : FlightDescriptor + The descriptor for the flight provided by the client. + reader : MetadataRecordBatchReader + A reader for data uploaded by the client. + writer : MetadataRecordBatchWriter + A writer to send responses to the client. + + """ + def list_actions(self, context: ServerCallContext) -> Iterable[Action]: + """List custom actions available on this server. + + Applications should override this method to implement their + own behavior. The default method raises a NotImplementedError. + + Parameters + ---------- + context : ServerCallContext + Common contextual information. + + Returns + ------- + iterator of ActionType or tuple + + """ + def do_action(self, context: ServerCallContext, action: Action) -> Iterable[bytes]: + """Execute a custom action. + + This method should return an iterator, or it should be a + generator. Applications should override this method to + implement their own behavior. The default method raises a + NotImplementedError. + + Parameters + ---------- + context : ServerCallContext + Common contextual information. + action : Action + The action to execute. + + Returns + ------- + iterator of bytes + + """ + def serve(self) -> None: + """Block until the server shuts down. + + This method only returns if shutdown() is called or a signal is + received. + """ + def run(self) -> None: + """Block until the server shuts down. + + .. deprecated:: 0.15.0 + Use the ``FlightServer.serve`` method instead + """ + def shutdown(self) -> None: + """Shut down the server, blocking until current requests finish. + + Do not call this directly from the implementation of a Flight + method, as then the server will block forever waiting for that + request to finish. Instead, call this method from a background + thread. + + This method should only be called once. + """ + def wait(self) -> None: + """Block until server is terminated with shutdown.""" + def __enter__(self) -> Self: ... + def __exit__(self, exc_type, exc_value, traceback): ... + +def connect( + location: str | tuple[str, int] | Location, + *, + tls_root_certs: str | None = None, + cert_chain: str | None = None, + private_key: str | None = None, + override_hostname: str | None = None, + middleware: list[ClientMiddlewareFactory] | None = None, + write_size_limit_bytes: int | None = None, + disable_server_verification: bool = False, + generic_options: list[tuple[str, int | str]] | None = None, +) -> FlightClient: + """ + Connect to a Flight server. + + Parameters + ---------- + location : str, tuple, or Location + Location to connect to. 
Either a URI like "grpc://localhost:port", + a tuple of (host, port), or a Location instance. + tls_root_certs : bytes or None + PEM-encoded. + cert_chain: str or None + If provided, enables TLS mutual authentication. + private_key: str or None + If provided, enables TLS mutual authentication. + override_hostname : str or None + Override the hostname checked by TLS. Insecure, use with caution. + middleware : list or None + A list of ClientMiddlewareFactory instances to apply. + write_size_limit_bytes : int or None + A soft limit on the size of a data payload sent to the + server. Enabled if positive. If enabled, writing a record + batch that (when serialized) exceeds this limit will raise an + exception; the client can retry the write with a smaller + batch. + disable_server_verification : boolean or None + Disable verifying the server when using TLS. + Insecure, use with caution. + generic_options : list or None + A list of generic (string, int or string) options to pass to + the underlying transport. + + Returns + ------- + client : FlightClient + """ diff --git a/python/stubs/_fs.pyi b/python/stubs/_fs.pyi new file mode 100644 index 00000000000..7670ef5230d --- /dev/null +++ b/python/stubs/_fs.pyi @@ -0,0 +1,1005 @@ +import datetime as dt +import enum +import sys + +from abc import ABC, abstractmethod + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias + +from typing import Union, overload + +from fsspec import AbstractFileSystem # type: ignore[import-untyped] + +from .lib import NativeFile, _Weakrefable + +SupportedFileSystem: TypeAlias = Union[AbstractFileSystem, FileSystem] + +class FileType(enum.IntFlag): + NotFound = enum.auto() + Unknown = enum.auto() + File = enum.auto() + Directory = enum.auto() + +class FileInfo(_Weakrefable): + """ + FileSystem entry info. + + Parameters + ---------- + path : str + The full path to the filesystem entry. + type : FileType + The type of the filesystem entry. + mtime : datetime or float, default None + If given, the modification time of the filesystem entry. + If a float is given, it is the number of seconds since the + Unix epoch. + mtime_ns : int, default None + If given, the modification time of the filesystem entry, + in nanoseconds since the Unix epoch. + `mtime` and `mtime_ns` are mutually exclusive. + size : int, default None + If given, the filesystem entry size in bytes. This should only + be given if `type` is `FileType.File`. + + Examples + -------- + Generate a file: + + >>> from pyarrow import fs + >>> local = fs.LocalFileSystem() + >>> path_fs = local_path + "/pyarrow-fs-example.dat" + >>> with local.open_output_stream(path_fs) as stream: + ... 
stream.write(b"data") + 4 + + Get FileInfo object using ``get_file_info()``: + + >>> file_info = local.get_file_info(path_fs) + >>> file_info + + + Inspect FileInfo attributes: + + >>> file_info.type + + + >>> file_info.is_file + True + + >>> file_info.path + '/.../pyarrow-fs-example.dat' + + >>> file_info.base_name + 'pyarrow-fs-example.dat' + + >>> file_info.size + 4 + + >>> file_info.extension + 'dat' + + >>> file_info.mtime # doctest: +SKIP + datetime.datetime(2022, 6, 29, 7, 56, 10, 873922, tzinfo=datetime.timezone.utc) + + >>> file_info.mtime_ns # doctest: +SKIP + 1656489370873922073 + """ + + def __init__( + self, + path: str, + type: FileType = FileType.Unknown, + *, + mtime: dt.datetime | float | None = None, + mtime_ns: int | None = None, + size: int | None = None, + ): ... + @property + def type(self) -> FileType: + """ + Type of the file. + + The returned enum values can be the following: + + - FileType.NotFound: target does not exist + - FileType.Unknown: target exists but its type is unknown (could be a + special file such as a Unix socket or character device, or + Windows NUL / CON / ...) + - FileType.File: target is a regular file + - FileType.Directory: target is a regular directory + + Returns + ------- + type : FileType + """ + @property + def is_file(self) -> bool: ... + @property + def path(self) -> str: + """ + The full file path in the filesystem. + + Examples + -------- + >>> file_info = local.get_file_info(path) + >>> file_info.path + '/.../pyarrow-fs-example.dat' + """ + @property + def base_name(self) -> str: + """ + The file base name. + + Component after the last directory separator. + + Examples + -------- + >>> file_info = local.get_file_info(path) + >>> file_info.base_name + 'pyarrow-fs-example.dat' + """ + @property + def size(self) -> int: + """ + The size in bytes, if available. + + Only regular files are guaranteed to have a size. + + Returns + ------- + size : int or None + """ + @property + def extension(self) -> str: + """ + The file extension. + + Examples + -------- + >>> file_info = local.get_file_info(path) + >>> file_info.extension + 'dat' + """ + @property + def mtime(self) -> dt.datetime | None: + """ + The time of last modification, if available. + + Returns + ------- + mtime : datetime.datetime or None + + Examples + -------- + >>> file_info = local.get_file_info(path) + >>> file_info.mtime # doctest: +SKIP + datetime.datetime(2022, 6, 29, 7, 56, 10, 873922, tzinfo=datetime.timezone.utc) + """ + @property + def mtime_ns(self) -> int | None: + """ + The time of last modification, if available, expressed in nanoseconds + since the Unix epoch. + + Returns + ------- + mtime_ns : int or None + + Examples + -------- + >>> file_info = local.get_file_info(path) + >>> file_info.mtime_ns # doctest: +SKIP + 1656489370873922073 + """ + +class FileSelector(_Weakrefable): + """ + File and directory selector. + + It contains a set of options that describes how to search for files and + directories. + + Parameters + ---------- + base_dir : str + The directory in which to select files. Relative paths also work, use + '.' for the current directory and '..' for the parent. + allow_not_found : bool, default False + The behavior if `base_dir` doesn't exist in the filesystem. + If false, an error is returned. + If true, an empty selection is returned. + recursive : bool, default False + Whether to recurse into subdirectories. 
+ + Examples + -------- + List the contents of a directory and subdirectories: + + >>> selector_1 = fs.FileSelector(local_path, recursive=True) + >>> local.get_file_info(selector_1) # doctest: +SKIP + [, + , + ] + + List only the contents of the base directory: + + >>> selector_2 = fs.FileSelector(local_path) + >>> local.get_file_info(selector_2) # doctest: +SKIP + [, + ] + + Return empty selection if the directory doesn't exist: + + >>> selector_not_found = fs.FileSelector( + ... local_path + "/missing", recursive=True, allow_not_found=True + ... ) + >>> local.get_file_info(selector_not_found) + [] + """ + + base_dir: str + allow_not_found: bool + recursive: bool + def __init__(self, base_dir: str, allow_not_found: bool = False, recursive: bool = False): ... + +class FileSystem(_Weakrefable): + """ + Abstract file system API. + """ + + @classmethod + def from_uri(cls, uri: str) -> tuple[Self, str]: + """ + Create a new FileSystem from URI or Path. + + Recognized URI schemes are "file", "mock", "s3fs", "gs", "gcs", "hdfs" and "viewfs". + In addition, the argument can be a pathlib.Path object, or a string + describing an absolute local path. + + Parameters + ---------- + uri : string + URI-based path, for example: file:///some/local/path. + + Returns + ------- + tuple of (FileSystem, str path) + With (filesystem, path) tuple where path is the abstract path + inside the FileSystem instance. + + Examples + -------- + Create a new FileSystem subclass from a URI: + + >>> uri = "file:///{}/pyarrow-fs-example.dat".format(local_path) + >>> local_new, path_new = fs.FileSystem.from_uri(uri) + >>> local_new + >> path_new + '/.../pyarrow-fs-example.dat' + + Or from a s3 bucket: + + >>> fs.FileSystem.from_uri("s3://usgs-landsat/collection02/") + (, 'usgs-landsat/collection02') + """ + def equals(self, other: FileSystem) -> bool: + """ + Parameters + ---------- + other : pyarrow.fs.FileSystem + + Returns + ------- + bool + """ + @property + def type_name(self) -> str: + """ + The filesystem's type name. + """ + @overload + def get_file_info(self, paths_or_selector: str) -> FileInfo: ... + @overload + def get_file_info(self, paths_or_selector: FileSelector | list[str]) -> list[FileInfo]: ... + def get_file_info(self, paths_or_selector): + """ + Get info for the given files. + + Any symlink is automatically dereferenced, recursively. A non-existing + or unreachable file returns a FileStat object and has a FileType of + value NotFound. An exception indicates a truly exceptional condition + (low-level I/O error, etc.). + + Parameters + ---------- + paths_or_selector : FileSelector, path-like or list of path-likes + Either a selector object, a path-like object or a list of + path-like objects. The selector's base directory will not be + part of the results, even if it exists. If it doesn't exist, + use `allow_not_found`. + + Returns + ------- + FileInfo or list of FileInfo + Single FileInfo object is returned for a single path, otherwise + a list of FileInfo objects is returned. + + Examples + -------- + >>> local + + >>> local.get_file_info("/{}/pyarrow-fs-example.dat".format(local_path)) + + """ + def create_dir(self, path: str, *, recursive: bool = True) -> None: + """ + Create a directory and subdirectories. + + This function succeeds if the directory already exists. + + Parameters + ---------- + path : str + The path of the new directory. + recursive : bool, default True + Create nested directories as well. 
+ """ + def delete_dir(self, path: str) -> None: + """ + Delete a directory and its contents, recursively. + + Parameters + ---------- + path : str + The path of the directory to be deleted. + """ + def delete_dir_contents( + self, path: str, *, accept_root_dir: bool = False, missing_dir_ok: bool = False + ) -> None: + """ + Delete a directory's contents, recursively. + + Like delete_dir, but doesn't delete the directory itself. + + Parameters + ---------- + path : str + The path of the directory to be deleted. + accept_root_dir : boolean, default False + Allow deleting the root directory's contents + (if path is empty or "/") + missing_dir_ok : boolean, default False + If False then an error is raised if path does + not exist + """ + def move(self, src: str, dest: str) -> None: + """ + Move / rename a file or directory. + + If the destination exists: + - if it is a non-empty directory, an error is returned + - otherwise, if it has the same type as the source, it is replaced + - otherwise, behavior is unspecified (implementation-dependent). + + Parameters + ---------- + src : str + The path of the file or the directory to be moved. + dest : str + The destination path where the file or directory is moved to. + + Examples + -------- + Create a new folder with a file: + + >>> local.create_dir("/tmp/other_dir") + >>> local.copy_file(path, "/tmp/move_example.dat") + + Move the file: + + >>> local.move("/tmp/move_example.dat", "/tmp/other_dir/move_example_2.dat") + + Inspect the file info: + + >>> local.get_file_info("/tmp/other_dir/move_example_2.dat") + + >>> local.get_file_info("/tmp/move_example.dat") + + + Delete the folder: + >>> local.delete_dir("/tmp/other_dir") + """ + def copy_file(self, src: str, dest: str) -> None: + """ + Copy a file. + + If the destination exists and is a directory, an error is returned. + Otherwise, it is replaced. + + Parameters + ---------- + src : str + The path of the file to be copied from. + dest : str + The destination path where the file is copied to. + + Examples + -------- + >>> local.copy_file(path, local_path + "/pyarrow-fs-example_copy.dat") + + Inspect the file info: + + >>> local.get_file_info(local_path + "/pyarrow-fs-example_copy.dat") + + >>> local.get_file_info(path) + + """ + def delete_file(self, path: str) -> None: + """ + Delete a file. + + Parameters + ---------- + path : str + The path of the file to be deleted. + """ + def open_input_file(self, path: str) -> NativeFile: + """ + Open an input file for random access reading. + + Parameters + ---------- + path : str + The source to open for reading. + + Returns + ------- + stream : NativeFile + + Examples + -------- + Print the data from the file with `open_input_file()`: + + >>> with local.open_input_file(path) as f: + ... print(f.readall()) + b'data' + """ + def open_input_stream( + self, path: str, compression: str | None = "detect", buffer_size: int | None = None + ) -> NativeFile: + """ + Open an input stream for sequential reading. + + Parameters + ---------- + path : str + The source to open for reading. + compression : str optional, default 'detect' + The compression algorithm to use for on-the-fly decompression. + If "detect" and source is a file path, then compression will be + chosen based on the file extension. + If None, no compression will be applied. Otherwise, a well-known + algorithm name must be supplied (e.g. "gzip"). + buffer_size : int optional, default None + If None or 0, no buffering will happen. Otherwise the size of the + temporary read buffer. 
+ + Returns + ------- + stream : NativeFile + + Examples + -------- + Print the data from the file with `open_input_stream()`: + + >>> with local.open_input_stream(path) as f: + ... print(f.readall()) + b'data' + """ + def open_output_stream( + self, + path: str, + compression: str | None = "detect", + buffer_size: int | None = None, + metadata: dict[str, str] | None = None, + ) -> NativeFile: + """ + Open an output stream for sequential writing. + + If the target already exists, existing data is truncated. + + Parameters + ---------- + path : str + The source to open for writing. + compression : str optional, default 'detect' + The compression algorithm to use for on-the-fly compression. + If "detect" and source is a file path, then compression will be + chosen based on the file extension. + If None, no compression will be applied. Otherwise, a well-known + algorithm name must be supplied (e.g. "gzip"). + buffer_size : int optional, default None + If None or 0, no buffering will happen. Otherwise the size of the + temporary write buffer. + metadata : dict optional, default None + If not None, a mapping of string keys to string values. + Some filesystems support storing metadata along the file + (such as "Content-Type"). + Unsupported metadata keys will be ignored. + + Returns + ------- + stream : NativeFile + + Examples + -------- + >>> local = fs.LocalFileSystem() + >>> with local.open_output_stream(path) as stream: + ... stream.write(b"data") + 4 + """ + def open_append_stream( + self, + path: str, + compression: str | None = "detect", + buffer_size: int | None = None, + metadata: dict[str, str] | None = None, + ): + """ + Open an output stream for appending. + + If the target doesn't exist, a new empty file is created. + + .. note:: + Some filesystem implementations do not support efficient + appending to an existing file, in which case this method will + raise NotImplementedError. + Consider writing to multiple files (using e.g. the dataset layer) + instead. + + Parameters + ---------- + path : str + The source to open for writing. + compression : str optional, default 'detect' + The compression algorithm to use for on-the-fly compression. + If "detect" and source is a file path, then compression will be + chosen based on the file extension. + If None, no compression will be applied. Otherwise, a well-known + algorithm name must be supplied (e.g. "gzip"). + buffer_size : int optional, default None + If None or 0, no buffering will happen. Otherwise the size of the + temporary write buffer. + metadata : dict optional, default None + If not None, a mapping of string keys to string values. + Some filesystems support storing metadata along the file + (such as "Content-Type"). + Unsupported metadata keys will be ignored. + + Returns + ------- + stream : NativeFile + + Examples + -------- + Append new data to a FileSystem subclass with nonempty file: + + >>> with local.open_append_stream(path) as f: + ... f.write(b"+newly added") + 12 + + Print out the content to the file: + + >>> with local.open_input_file(path) as f: + ... print(f.readall()) + b'data+newly added' + """ + def normalize_path(self, path: str) -> str: + """ + Normalize filesystem path. + + Parameters + ---------- + path : str + The path to normalize + + Returns + ------- + normalized_path : str + The normalized path + """ + +class LocalFileSystem(FileSystem): + """ + A FileSystem implementation accessing files on the local machine. 
+ + Details such as symlinks are abstracted away (symlinks are always followed, + except when deleting an entry). + + Parameters + ---------- + use_mmap : bool, default False + Whether open_input_stream and open_input_file should return + a mmap'ed file or a regular file. + + Examples + -------- + Create a FileSystem object with LocalFileSystem constructor: + + >>> from pyarrow import fs + >>> local = fs.LocalFileSystem() + >>> local + + + and write data on to the file: + + >>> with local.open_output_stream("/tmp/local_fs.dat") as stream: + ... stream.write(b"data") + 4 + >>> with local.open_input_stream("/tmp/local_fs.dat") as stream: + ... print(stream.readall()) + b'data' + + Create a FileSystem object inferred from a URI of the saved file: + + >>> local_new, path = fs.LocalFileSystem().from_uri("/tmp/local_fs.dat") + >>> local_new + >> path + '/tmp/local_fs.dat' + + Check if FileSystems `local` and `local_new` are equal: + + >>> local.equals(local_new) + True + + Compare two different FileSystems: + + >>> local2 = fs.LocalFileSystem(use_mmap=True) + >>> local.equals(local2) + False + + Copy a file and print out the data: + + >>> local.copy_file("/tmp/local_fs.dat", "/tmp/local_fs-copy.dat") + >>> with local.open_input_stream("/tmp/local_fs-copy.dat") as stream: + ... print(stream.readall()) + b'data' + + Open an output stream for appending, add text and print the new data: + + >>> with local.open_append_stream("/tmp/local_fs-copy.dat") as f: + ... f.write(b"+newly added") + 12 + + >>> with local.open_input_stream("/tmp/local_fs-copy.dat") as f: + ... print(f.readall()) + b'data+newly added' + + Create a directory, copy a file into it and then delete the whole directory: + + >>> local.create_dir("/tmp/new_folder") + >>> local.copy_file("/tmp/local_fs.dat", "/tmp/new_folder/local_fs.dat") + >>> local.get_file_info("/tmp/new_folder") + + >>> local.delete_dir("/tmp/new_folder") + >>> local.get_file_info("/tmp/new_folder") + + + Create a directory, copy a file into it and then delete + the content of the directory: + + >>> local.create_dir("/tmp/new_folder") + >>> local.copy_file("/tmp/local_fs.dat", "/tmp/new_folder/local_fs.dat") + >>> local.get_file_info("/tmp/new_folder/local_fs.dat") + + >>> local.delete_dir_contents("/tmp/new_folder") + >>> local.get_file_info("/tmp/new_folder") + + >>> local.get_file_info("/tmp/new_folder/local_fs.dat") + + + Create a directory, copy a file into it and then delete + the file from the directory: + + >>> local.create_dir("/tmp/new_folder") + >>> local.copy_file("/tmp/local_fs.dat", "/tmp/new_folder/local_fs.dat") + >>> local.delete_file("/tmp/new_folder/local_fs.dat") + >>> local.get_file_info("/tmp/new_folder/local_fs.dat") + + >>> local.get_file_info("/tmp/new_folder") + + + Move the file: + + >>> local.move("/tmp/local_fs-copy.dat", "/tmp/new_folder/local_fs-copy.dat") + >>> local.get_file_info("/tmp/new_folder/local_fs-copy.dat") + + >>> local.get_file_info("/tmp/local_fs-copy.dat") + + + To finish delete the file left: + >>> local.delete_file("/tmp/local_fs.dat") + """ + + def __init__(self, *, use_mmap: bool = False) -> None: ... + +class SubTreeFileSystem(FileSystem): + """ + Delegates to another implementation after prepending a fixed base path. + + This is useful to expose a logical view of a subtree of a filesystem, + for example a directory in a LocalFileSystem. + + Note, that this makes no security guarantee. For example, symlinks may + allow to "escape" the subtree and access other parts of the underlying + filesystem. 
+ + Parameters + ---------- + base_path : str + The root of the subtree. + base_fs : FileSystem + FileSystem object the operations delegated to. + + Examples + -------- + Create a LocalFileSystem instance: + + >>> from pyarrow import fs + >>> local = fs.LocalFileSystem() + >>> with local.open_output_stream("/tmp/local_fs.dat") as stream: + ... stream.write(b"data") + 4 + + Create a directory and a SubTreeFileSystem instance: + + >>> local.create_dir("/tmp/sub_tree") + >>> subtree = fs.SubTreeFileSystem("/tmp/sub_tree", local) + + Write data into the existing file: + + >>> with subtree.open_append_stream("sub_tree_fs.dat") as f: + ... f.write(b"+newly added") + 12 + + Print out the attributes: + + >>> subtree.base_fs + + >>> subtree.base_path + '/tmp/sub_tree/' + + Get info for the given directory or given file: + + >>> subtree.get_file_info("") + + >>> subtree.get_file_info("sub_tree_fs.dat") + + + Delete the file and directory: + + >>> subtree.delete_file("sub_tree_fs.dat") + >>> local.delete_dir("/tmp/sub_tree") + >>> local.delete_file("/tmp/local_fs.dat") + + For usage of the methods see examples for :func:`~pyarrow.fs.LocalFileSystem`. + """ + def __init__(self, base_path: str, base_fs: FileSystem): ... + @property + def base_path(self) -> str: ... + @property + def base_fs(self) -> FileSystem: ... + +class _MockFileSystem(FileSystem): + def __init__(self, current_time: dt.datetime | None = None) -> None: ... + +class PyFileSystem(FileSystem): + """ + A FileSystem with behavior implemented in Python. + + Parameters + ---------- + handler : FileSystemHandler + The handler object implementing custom filesystem behavior. + + Examples + -------- + Create an fsspec-based filesystem object for GitHub: + + >>> from fsspec.implementations import github + >>> gfs = github.GithubFileSystem("apache", "arrow") # doctest: +SKIP + + Get a PyArrow FileSystem object: + + >>> from pyarrow.fs import PyFileSystem, FSSpecHandler + >>> pa_fs = PyFileSystem(FSSpecHandler(gfs)) # doctest: +SKIP + + Use :func:`~pyarrow.fs.FileSystem` functionality ``get_file_info()``: + + >>> pa_fs.get_file_info("README.md") # doctest: +SKIP + + """ + def __init__(self, handler: FileSystemHandler) -> None: ... + @property + def handler(self) -> FileSystemHandler: + """ + The filesystem's underlying handler. + + Returns + ------- + handler : FileSystemHandler + """ + +class FileSystemHandler(ABC): + """ + An abstract class exposing methods to implement PyFileSystem's behavior. + """ + @abstractmethod + def get_type_name(self) -> str: + """ + Implement PyFileSystem.type_name. + """ + @abstractmethod + def get_file_info(self, paths: str | list[str]) -> FileInfo | list[FileInfo]: + """ + Implement PyFileSystem.get_file_info(paths). + + Parameters + ---------- + paths : list of str + paths for which we want to retrieve the info. + """ + @abstractmethod + def get_file_info_selector(self, selector: FileSelector) -> list[FileInfo]: + """ + Implement PyFileSystem.get_file_info(selector). + + Parameters + ---------- + selector : FileSelector + selector for which we want to retrieve the info. + """ + + @abstractmethod + def create_dir(self, path: str, recursive: bool) -> None: + """ + Implement PyFileSystem.create_dir(...). + + Parameters + ---------- + path : str + path of the directory. + recursive : bool + if the parent directories should be created too. + """ + @abstractmethod + def delete_dir(self, path: str) -> None: + """ + Implement PyFileSystem.delete_dir(...). + + Parameters + ---------- + path : str + path of the directory. 
+ """ + @abstractmethod + def delete_dir_contents(self, path: str, missing_dir_ok: bool = False) -> None: + """ + Implement PyFileSystem.delete_dir_contents(...). + + Parameters + ---------- + path : str + path of the directory. + missing_dir_ok : bool + if False an error should be raised if path does not exist + """ + @abstractmethod + def delete_root_dir_contents(self) -> None: + """ + Implement PyFileSystem.delete_dir_contents("/", accept_root_dir=True). + """ + @abstractmethod + def delete_file(self, path: str) -> None: + """ + Implement PyFileSystem.delete_file(...). + + Parameters + ---------- + path : str + path of the file. + """ + @abstractmethod + def move(self, src: str, dest: str) -> None: + """ + Implement PyFileSystem.move(...). + + Parameters + ---------- + src : str + path of what should be moved. + dest : str + path of where it should be moved to. + """ + + @abstractmethod + def copy_file(self, src: str, dest: str) -> None: + """ + Implement PyFileSystem.copy_file(...). + + Parameters + ---------- + src : str + path of what should be copied. + dest : str + path of where it should be copied to. + """ + @abstractmethod + def open_input_stream(self, path: str) -> NativeFile: + """ + Implement PyFileSystem.open_input_stream(...). + + Parameters + ---------- + path : str + path of what should be opened. + """ + @abstractmethod + def open_input_file(self, path: str) -> NativeFile: + """ + Implement PyFileSystem.open_input_file(...). + + Parameters + ---------- + path : str + path of what should be opened. + """ + @abstractmethod + def open_output_stream(self, path: str, metadata: dict[str, str]) -> NativeFile: + """ + Implement PyFileSystem.open_output_stream(...). + + Parameters + ---------- + path : str + path of what should be opened. + metadata : mapping + Mapping of string keys to string values. + Some filesystems support storing metadata along the file + (such as "Content-Type"). + """ + + @abstractmethod + def open_append_stream(self, path: str, metadata: dict[str, str]) -> NativeFile: + """ + Implement PyFileSystem.open_append_stream(...). + + Parameters + ---------- + path : str + path of what should be opened. + metadata : mapping + Mapping of string keys to string values. + Some filesystems support storing metadata along the file + (such as "Content-Type"). + """ + @abstractmethod + def normalize_path(self, path: str) -> str: + """ + Implement PyFileSystem.normalize_path(...). + + Parameters + ---------- + path : str + path of what should be normalized. + """ diff --git a/python/stubs/_gcsfs.pyi b/python/stubs/_gcsfs.pyi new file mode 100644 index 00000000000..4fc7ea68e48 --- /dev/null +++ b/python/stubs/_gcsfs.pyi @@ -0,0 +1,83 @@ +import datetime as dt + +from ._fs import FileSystem +from .lib import KeyValueMetadata + +class GcsFileSystem(FileSystem): + """ + Google Cloud Storage (GCS) backed FileSystem implementation + + By default uses the process described in https://google.aip.dev/auth/4110 + to resolve credentials. If not running on Google Cloud Platform (GCP), + this generally requires the environment variable + GOOGLE_APPLICATION_CREDENTIALS to point to a JSON file + containing credentials. + + Note: GCS buckets are special and the operations available on them may be + limited or more expensive than expected compared to local file systems. + + Note: When pickling a GcsFileSystem that uses default credentials, resolution + credentials are not stored in the serialized data. 
Therefore, when unpickling + it is assumed that the necessary credentials are in place for the target + process. + + Parameters + ---------- + anonymous : boolean, default False + Whether to connect anonymously. + If true, will not attempt to look up credentials using standard GCP + configuration methods. + access_token : str, default None + GCP access token. If provided, temporary credentials will be fetched by + assuming this role; also, a `credential_token_expiration` must be + specified as well. + target_service_account : str, default None + An optional service account to try to impersonate when accessing GCS. This + requires the specified credential user or service account to have the necessary + permissions. + credential_token_expiration : datetime, default None + Expiration for credential generated with an access token. Must be specified + if `access_token` is specified. + default_bucket_location : str, default 'US' + GCP region to create buckets in. + scheme : str, default 'https' + GCS connection transport scheme. + endpoint_override : str, default None + Override endpoint with a connect string such as "localhost:9000" + default_metadata : mapping or pyarrow.KeyValueMetadata, default None + Default metadata for `open_output_stream`. This will be ignored if + non-empty metadata is passed to `open_output_stream`. + retry_time_limit : timedelta, default None + Set the maximum amount of time the GCS client will attempt to retry + transient errors. Subsecond granularity is ignored. + project_id : str, default None + The GCP project identifier to use for creating buckets. + If not set, the library uses the GOOGLE_CLOUD_PROJECT environment + variable. Most I/O operations do not need a project id, only applications + that create new buckets need a project id. + """ + + def __init__( + self, + *, + anonymous: bool = False, + access_token: str | None = None, + target_service_account: str | None = None, + credential_token_expiration: dt.datetime | None = None, + default_bucket_location: str = "US", + scheme: str = "https", + endpoint_override: str | None = None, + default_metadata: dict | KeyValueMetadata | None = None, + retry_time_limit: dt.timedelta | None = None, + project_id: str | None = None, + ): ... + @property + def default_bucket_location(self) -> str: + """ + The GCP location this filesystem will write to. + """ + @property + def project_id(self) -> str: + """ + The GCP project id this filesystem will use. + """ diff --git a/python/stubs/_hdfs.pyi b/python/stubs/_hdfs.pyi new file mode 100644 index 00000000000..200f669379b --- /dev/null +++ b/python/stubs/_hdfs.pyi @@ -0,0 +1,75 @@ +from _typeshed import StrPath + +from ._fs import FileSystem + +class HadoopFileSystem(FileSystem): + """ + HDFS backed FileSystem implementation + + Parameters + ---------- + host : str + HDFS host to connect to. Set to "default" for fs.defaultFS from + core-site.xml. + port : int, default 8020 + HDFS port to connect to. Set to 0 for default or logical (HA) nodes. + user : str, default None + Username when connecting to HDFS; None implies login user. + replication : int, default 3 + Number of copies each block will have. + buffer_size : int, default 0 + If 0, no buffering will happen otherwise the size of the temporary read + and write buffer. + default_block_size : int, default None + None means the default configuration for HDFS, a typical block size is + 128 MB. + kerb_ticket : string or path, default None + If not None, the path to the Kerberos ticket cache. 
+ extra_conf : dict, default None + Extra key/value pairs for configuration; will override any + hdfs-site.xml properties. + + Examples + -------- + >>> from pyarrow import fs + >>> hdfs = fs.HadoopFileSystem( + ... host, port, user=user, kerb_ticket=ticket_cache_path + ... ) # doctest: +SKIP + + For usage of the methods see examples for :func:`~pyarrow.fs.LocalFileSystem`. + """ + def __init__( + self, + host: str, + port: int = 8020, + *, + user: str | None = None, + replication: int = 3, + buffer_size: int = 0, + default_block_size: int | None = None, + kerb_ticket: StrPath | None = None, + extra_conf: dict | None = None, + ): ... + @staticmethod + def from_uri(uri: str) -> HadoopFileSystem: # type: ignore[override] + """ + Instantiate HadoopFileSystem object from an URI string. + + The following two calls are equivalent + + * ``HadoopFileSystem.from_uri('hdfs://localhost:8020/?user=test\ +&replication=1')`` + * ``HadoopFileSystem('localhost', port=8020, user='test', \ +replication=1)`` + + Parameters + ---------- + uri : str + A string URI describing the connection to HDFS. + In order to change the user, replication, buffer_size or + default_block_size pass the values as query parts. + + Returns + ------- + HadoopFileSystem + """ diff --git a/python/stubs/_json.pyi b/python/stubs/_json.pyi new file mode 100644 index 00000000000..43d2ae83cd8 --- /dev/null +++ b/python/stubs/_json.pyi @@ -0,0 +1,169 @@ +from typing import IO, Any, Literal + +from _typeshed import StrPath + +from .lib import MemoryPool, RecordBatchReader, Schema, Table, _Weakrefable + +class ReadOptions(_Weakrefable): + """ + Options for reading JSON files. + + Parameters + ---------- + use_threads : bool, optional (default True) + Whether to use multiple threads to accelerate reading + block_size : int, optional + How much bytes to process at a time from the input stream. + This will determine multi-threading granularity as well as + the size of individual chunks in the Table. + """ + + use_threads: bool + """ + Whether to use multiple threads to accelerate reading. + """ + block_size: int + """ + How much bytes to process at a time from the input stream. + + This will determine multi-threading granularity as well as the size of + individual chunks in the Table. + """ + def __init__(self, use_threads: bool | None = None, block_size: int | None = None): ... + def equals(self, other: ReadOptions) -> bool: + """ + Parameters + ---------- + other : pyarrow.json.ReadOptions + + Returns + ------- + bool + """ + +class ParseOptions(_Weakrefable): + """ + Options for parsing JSON files. + + Parameters + ---------- + explicit_schema : Schema, optional (default None) + Optional explicit schema (no type inference, ignores other fields). + newlines_in_values : bool, optional (default False) + Whether objects may be printed across multiple lines (for example + pretty printed). If false, input must end with an empty line. + unexpected_field_behavior : str, default "infer" + How JSON fields outside of explicit_schema (if given) are treated. + + Possible behaviors: + + - "ignore": unexpected JSON fields are ignored + - "error": error out on unexpected JSON fields + - "infer": unexpected JSON fields are type-inferred and included in + the output + """ + + explicit_schema: Schema + """ + Optional explicit schema (no type inference, ignores other fields) + """ + newlines_in_values: bool + """ + Whether newline characters are allowed in JSON values. + Setting this to True reduces the performance of multi-threaded + JSON reading. 
+ """ + unexpected_field_behavior: Literal["ignore", "error", "infer"] + """ + How JSON fields outside of explicit_schema (if given) are treated. + + Possible behaviors: + + - "ignore": unexpected JSON fields are ignored + - "error": error out on unexpected JSON fields + - "infer": unexpected JSON fields are type-inferred and included in + the output + + Set to "infer" by default. + """ + def __init__( + self, + explicit_schema: Schema | None = None, + newlines_in_values: bool | None = None, + unexpected_field_behavior: Literal["ignore", "error", "infer"] = "infer", + ): ... + def equals(self, other: ParseOptions) -> bool: + """ + Parameters + ---------- + other : pyarrow.json.ParseOptions + + Returns + ------- + bool + """ + +class JSONStreamingReader(RecordBatchReader): + """An object that reads record batches incrementally from a JSON file. + + Should not be instantiated directly by user code. + """ + +def read_json( + input_file: StrPath | IO[Any], + read_options: ReadOptions | None = None, + parse_options: ParseOptions | None = None, + memory_pool: MemoryPool | None = None, +) -> Table: + """ + Read a Table from a stream of JSON data. + + Parameters + ---------- + input_file : str, path or file-like object + The location of JSON data. Currently only the line-delimited JSON + format is supported. + read_options : pyarrow.json.ReadOptions, optional + Options for the JSON reader (see ReadOptions constructor for defaults). + parse_options : pyarrow.json.ParseOptions, optional + Options for the JSON parser + (see ParseOptions constructor for defaults). + memory_pool : MemoryPool, optional + Pool to allocate Table memory from. + + Returns + ------- + :class:`pyarrow.Table` + Contents of the JSON file as a in-memory table. + """ + +def open_json( + input_file: StrPath | IO[Any], + read_options: ReadOptions | None = None, + parse_options: ParseOptions | None = None, + memory_pool: MemoryPool | None = None, +) -> JSONStreamingReader: + """ + Open a streaming reader of JSON data. + + Reading using this function is always single-threaded. + + Parameters + ---------- + input_file : string, path or file-like object + The location of JSON data. If a string or path, and if it ends + with a recognized compressed file extension (e.g. ".gz" or ".bz2"), + the data is automatically decompressed when reading. + read_options : pyarrow.json.ReadOptions, optional + Options for the JSON reader (see pyarrow.json.ReadOptions constructor + for defaults) + parse_options : pyarrow.json.ParseOptions, optional + Options for the JSON parser + (see pyarrow.json.ParseOptions constructor for defaults) + memory_pool : MemoryPool, optional + Pool to allocate RecordBatch memory from + + Returns + ------- + :class:`pyarrow.json.JSONStreamingReader` + """ diff --git a/python/stubs/_orc.pyi b/python/stubs/_orc.pyi new file mode 100644 index 00000000000..71bf0dde9ba --- /dev/null +++ b/python/stubs/_orc.pyi @@ -0,0 +1,56 @@ +from typing import IO, Literal + +from .lib import ( + Buffer, + KeyValueMetadata, + MemoryPool, + NativeFile, + RecordBatch, + Schema, + Table, + _Weakrefable, +) + +class ORCReader(_Weakrefable): + def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... + def open(self, source: str | NativeFile | Buffer, use_memory_map: bool = True): ... + def metadata(self) -> KeyValueMetadata: ... + def schema(self) -> Schema: ... + def nrows(self) -> int: ... + def nstripes(self) -> int: ... + def file_version(self) -> str: ... + def software_version(self) -> str: ... 
+ def compression(self) -> Literal["UNCOMPRESSED", "ZLIB", "SNAPPY", "LZ4", "ZSTD"]: ... + def compression_size(self) -> int: ... + def row_index_stride(self) -> int: ... + def writer(self) -> str: ... + def writer_version(self) -> str: ... + def nstripe_statistics(self) -> int: ... + def content_length(self) -> int: ... + def stripe_statistics_length(self) -> int: ... + def file_footer_length(self) -> int: ... + def file_postscript_length(self) -> int: ... + def file_length(self) -> int: ... + def serialized_file_tail(self) -> int: ... + def read_stripe(self, n: int, columns: list[str] | None = None) -> RecordBatch: ... + def read(self, columns: list[str] | None = None) -> Table: ... + +class ORCWriter(_Weakrefable): + def open( + self, + where: str | NativeFile | IO, + *, + file_version: str | None = None, + batch_size: int | None = None, + stripe_size: int | None = None, + compression: Literal["UNCOMPRESSED", "ZLIB", "SNAPPY", "LZ4", "ZSTD"] | None = None, + compression_block_size: int | None = None, + compression_strategy: Literal["COMPRESSION", "SPEED"] | None = None, + row_index_stride: int | None = None, + padding_tolerance: float | None = None, + dictionary_key_size_threshold: float | None = None, + bloom_filter_columns: list[int] | None = None, + bloom_filter_fpp: float | None = None, + ) -> None: ... + def write(self, table: Table) -> None: ... + def close(self) -> None: ... diff --git a/python/stubs/_parquet.pyi b/python/stubs/_parquet.pyi new file mode 100644 index 00000000000..a9187df0428 --- /dev/null +++ b/python/stubs/_parquet.pyi @@ -0,0 +1,445 @@ +from typing import IO, Any, Iterable, Iterator, Literal, Sequence, TypeAlias, TypedDict + +from _typeshed import StrPath + +from ._stubs_typing import Order +from .lib import ( + Buffer, + ChunkedArray, + KeyValueMetadata, + MemoryPool, + NativeFile, + RecordBatch, + Schema, + Table, + _Weakrefable, +) + +_PhysicalType: TypeAlias = Literal[ + "BOOLEAN", + "INT32", + "INT64", + "INT96", + "FLOAT", + "DOUBLE", + "BYTE_ARRAY", + "FIXED_LEN_BYTE_ARRAY", + "UNKNOWN", +] +_LogicTypeName: TypeAlias = Literal[ + "UNDEFINED", + "STRING", + "MAP", + "LIST", + "ENUM", + "DECIMAL", + "DATE", + "TIME", + "TIMESTAMP", + "INT", + "FLOAT16", + "JSON", + "BSON", + "UUID", + "NONE", + "UNKNOWN", +] +_ConvertedType: TypeAlias = Literal[ + "NONE", + "UTF8", + "MAP", + "MAP_KEY_VALUE", + "LIST", + "ENUM", + "DECIMAL", + "DATE", + "TIME_MILLIS", + "TIME_MICROS", + "TIMESTAMP_MILLIS", + "TIMESTAMP_MICROS", + "UINT_8", + "UINT_16", + "UINT_32", + "UINT_64", + "INT_8", + "INT_16", + "INT_32", + "INT_64", + "JSON", + "BSON", + "INTERVAL", + "UNKNOWN", +] +_Encoding: TypeAlias = Literal[ + "PLAIN", + "PLAIN_DICTIONARY", + "RLE", + "BIT_PACKED", + "DELTA_BINARY_PACKED", + "DELTA_LENGTH_BYTE_ARRAY", + "DELTA_BYTE_ARRAY", + "RLE_DICTIONARY", + "BYTE_STREAM_SPLIT", + "UNKNOWN", +] +_Compression: TypeAlias = Literal[ + "UNCOMPRESSED", + "SNAPPY", + "GZIP", + "LZO", + "BROTLI", + "LZ4", + "ZSTD", + "UNKNOWN", +] + +class _Statistics(TypedDict): + has_min_max: bool + min: Any | None + max: Any | None + null_count: int | None + distinct_count: int | None + num_values: int + physical_type: _PhysicalType + +class Statistics(_Weakrefable): + def to_dict(self) -> _Statistics: ... + def equals(self, other: Statistics) -> bool: ... + @property + def has_min_max(self) -> bool: ... + @property + def hash_null_count(self) -> bool: ... + @property + def has_distinct_count(self) -> bool: ... + @property + def min_raw(self) -> Any | None: ... 
+ @property + def max_raw(self) -> Any | None: ... + @property + def min(self) -> Any | None: ... + @property + def max(self) -> Any | None: ... + @property + def null_count(self) -> int | None: ... + @property + def distinct_count(self) -> int | None: ... + @property + def num_values(self) -> int: ... + @property + def physical_type(self) -> _PhysicalType: ... + @property + def logical_type(self) -> ParquetLogicalType: ... + @property + def converted_type(self) -> _ConvertedType | None: ... + +class ParquetLogicalType(_Weakrefable): + def to_json(self) -> str: ... + @property + def type(self) -> _LogicTypeName: ... + +class _ColumnChunkMetaData(TypedDict): + file_offset: int + file_path: str | None + physical_type: _PhysicalType + num_values: int + path_in_schema: str + is_stats_set: bool + statistics: Statistics | None + compression: _Compression + encodings: tuple[_Encoding, ...] + has_dictionary_page: bool + dictionary_page_offset: int | None + data_page_offset: int + total_compressed_size: int + total_uncompressed_size: int + +class ColumnChunkMetaData(_Weakrefable): + def to_dict(self) -> _ColumnChunkMetaData: ... + def equals(self, other: ColumnChunkMetaData) -> bool: ... + @property + def file_offset(self) -> int: ... + @property + def file_path(self) -> str | None: ... + @property + def physical_type(self) -> _PhysicalType: ... + @property + def num_values(self) -> int: ... + @property + def path_in_schema(self) -> str: ... + @property + def is_stats_set(self) -> bool: ... + @property + def statistics(self) -> Statistics | None: ... + @property + def compression(self) -> _Compression: ... + @property + def encodings(self) -> tuple[_Encoding, ...]: ... + @property + def has_dictionary_page(self) -> bool: ... + @property + def dictionary_page_offset(self) -> int | None: ... + @property + def data_page_offset(self) -> int: ... + @property + def has_index_page(self) -> bool: ... + @property + def index_page_offset(self) -> int: ... + @property + def total_compressed_size(self) -> int: ... + @property + def total_uncompressed_size(self) -> int: ... + @property + def has_offset_index(self) -> bool: ... + @property + def has_column_index(self) -> bool: ... + @property + def metadata(self) -> dict[bytes, bytes] | None: ... + +class _SortingColumn(TypedDict): + column_index: int + descending: bool + nulls_first: bool + +class SortingColumn: + def __init__( + self, column_index: int, descending: bool = False, nulls_first: bool = False + ) -> None: ... + @classmethod + def from_ordering( + cls, + schema: Schema, + sort_keys: Sequence[tuple[str, Order]], + null_placement: Literal["at_start", "at_end"] = "at_end", + ) -> tuple[SortingColumn, ...]: ... + @staticmethod + def to_ordering( + schema: Schema, sorting_columns: tuple[SortingColumn, ...] + ) -> tuple[Sequence[tuple[str, Order]], Literal["at_start", "at_end"]]: ... + def __hash__(self) -> int: ... + @property + def column_index(self) -> int: ... + @property + def descending(self) -> bool: ... + @property + def nulls_first(self) -> bool: ... + def to_dict(self) -> _SortingColumn: ... + +class _RowGroupMetaData(TypedDict): + num_columns: int + num_rows: int + total_byte_size: int + columns: list[ColumnChunkMetaData] + sorting_columns: list[SortingColumn] + +class RowGroupMetaData(_Weakrefable): + def __init__(self, parent: FileMetaData, index: int) -> None: ... + def equals(self, other: RowGroupMetaData) -> bool: ... + def column(self, i: int) -> ColumnChunkMetaData: ... + def to_dict(self) -> _RowGroupMetaData: ... 
+ @property + def num_columns(self) -> int: ... + @property + def num_rows(self) -> int: ... + @property + def total_byte_size(self) -> int: ... + @property + def sorting_columns(self) -> list[SortingColumn]: ... + +class _FileMetaData(TypedDict): + created_by: str + num_columns: int + num_rows: int + num_row_groups: int + format_version: str + serialized_size: int + +class FileMetaData(_Weakrefable): + def __hash__(self) -> int: ... + def to_dict(self) -> _FileMetaData: ... + def equals(self, other: FileMetaData) -> bool: ... + @property + def schema(self) -> ParquetSchema: ... + @property + def serialized_size(self) -> int: ... + @property + def num_columns(self) -> int: ... + @property + def num_rows(self) -> int: ... + @property + def num_row_groups(self) -> int: ... + @property + def format_version(self) -> str: ... + @property + def created_by(self) -> str: ... + @property + def metadata(self) -> dict[bytes, bytes] | None: ... + def row_group(self, i: int) -> RowGroupMetaData: ... + def set_file_path(self, path: str) -> None: ... + def append_row_groups(self, other: FileMetaData) -> None: ... + def write_metadata_file(self, where: StrPath | Buffer | NativeFile | IO) -> None: ... + +class ParquetSchema(_Weakrefable): + def __init__(self, container: FileMetaData) -> None: ... + def __getitem__(self, i: int) -> ColumnChunkMetaData: ... + def __hash__(self) -> int: ... + def __len__(self) -> int: ... + @property + def names(self) -> list[str]: ... + def to_arrow_schema(self) -> Schema: ... + def equals(self, other: ParquetSchema) -> bool: ... + def column(self, i: int) -> ColumnSchema: ... + +class ColumnSchema(_Weakrefable): + def __init__(self, schema: ParquetSchema, index: int) -> None: ... + def equals(self, other: ColumnSchema) -> bool: ... + @property + def name(self) -> str: ... + @property + def path(self) -> str: ... + @property + def max_definition_level(self) -> int: ... + @property + def max_repetition_level(self) -> int: ... + @property + def physical_type(self) -> _PhysicalType: ... + @property + def logical_type(self) -> ParquetLogicalType: ... + @property + def converted_type(self) -> _ConvertedType | None: ... + @property + def length(self) -> int | None: ... + @property + def precision(self) -> int | None: ... + @property + def scale(self) -> int | None: ... + +class ParquetReader(_Weakrefable): + def __init__(self, memory_pool: MemoryPool | None = None) -> None: ... + def open( + self, + source: StrPath | NativeFile | IO, + *, + use_memory_map: bool = False, + read_dictionary: Iterable[int] | Iterable[str] | None = None, + metadata: FileMetaData | None = None, + buffer_size: int = 0, + pre_buffer: bool = False, + coerce_int96_timestamp_unit: str | None = None, + decryption_properties: FileDecryptionProperties | None = None, + thrift_string_size_limit: int | None = None, + thrift_container_size_limit: int | None = None, + page_checksum_verification: bool = False, + ): ... + @property + def column_paths(self) -> list[str]: ... + @property + def metadata(self) -> FileMetaData: ... + @property + def schema_arrow(self) -> Schema: ... + @property + def num_row_groups(self) -> int: ... + def set_use_threads(self, use_threads: bool) -> None: ... + def set_batch_size(self, batch_size: int) -> None: ... + def iter_batches( + self, + batch_size: int, + row_groups: list[int], + column_indices: list[int] | None = None, + use_threads: bool = True, + ) -> Iterator[RecordBatch]: ... 
+ def read_row_group( + self, i: int, column_indices: list[int] | None = None, use_threads: bool = True + ) -> Table: ... + def read_row_groups( + self, + row_groups: list[int], + column_indices: list[int] | None = None, + use_threads: bool = True, + ) -> Table: ... + def read_all( + self, column_indices: list[int] | None = None, use_threads: bool = True + ) -> Table: ... + def scan_contents(self, column_indices: list[int] | None = None, batch_size: int = 65536): ... + def column_name_idx(self, column_name: str) -> int: ... + def read_column(self, column_index: int) -> ChunkedArray: ... + def close(self) -> None: ... + @property + def closed(self) -> bool: ... + +class ParquetWriter(_Weakrefable): + def __init__( + self, + where: StrPath | NativeFile | IO, + schema: Schema, + use_dictionary: bool | list[str] | None = None, + compression: _Compression | dict[str, _Compression] | None = None, + version: str | None = None, + write_statistics: bool | list[str] | None = None, + memory_pool: MemoryPool | None = None, + use_deprecated_int96_timestamps: bool = False, + coerce_timestamps: Literal["ms", "us"] | None = None, + data_page_size: int | None = None, + allow_truncated_timestamps: bool = False, + compression_level: int | dict[str, int] | None = None, + use_byte_stream_split: bool | list[str] = False, + column_encoding: _Encoding | dict[str, _Encoding] | None = None, + writer_engine_version: str | None = None, + data_page_version: str | None = None, + use_compliant_nested_type: bool = True, + encryption_properties: FileDecryptionProperties | None = None, + write_batch_size: int | None = None, + dictionary_pagesize_limit: int | None = None, + store_schema: bool = True, + write_page_index: bool = False, + write_page_checksum: bool = False, + sorting_columns: tuple[SortingColumn, ...] | None = None, + store_decimal_as_integer: bool = False, + ): ... + def close(self) -> None: ... + def write_table(self, table: Table, row_group_size: int | None = None) -> None: ... + def add_key_value_metadata(self, key_value_metadata: KeyValueMetadata) -> None: ... + @property + def metadata(self) -> FileMetaData: ... + @property + def use_dictionary(self) -> bool | list[str] | None: ... + @property + def use_deprecated_int96_timestamps(self) -> bool: ... + @property + def use_byte_stream_split(self) -> bool | list[str]: ... + @property + def column_encoding(self) -> _Encoding | dict[str, _Encoding] | None: ... + @property + def coerce_timestamps(self) -> Literal["ms", "us"] | None: ... + @property + def allow_truncated_timestamps(self) -> bool: ... + @property + def compression(self) -> _Compression | dict[str, _Compression] | None: ... + @property + def compression_level(self) -> int | dict[str, int] | None: ... + @property + def data_page_version(self) -> str | None: ... + @property + def use_compliant_nested_type(self) -> bool: ... + @property + def version(self) -> str | None: ... + @property + def write_statistics(self) -> bool | list[str] | None: ... + @property + def writer_engine_version(self) -> str: ... + @property + def row_group_size(self) -> int: ... + @property + def data_page_size(self) -> int: ... + @property + def encryption_properties(self) -> FileDecryptionProperties: ... + @property + def write_batch_size(self) -> int: ... + @property + def dictionary_pagesize_limit(self) -> int: ... + @property + def store_schema(self) -> bool: ... + @property + def store_decimal_as_integer(self) -> bool: ... + +class FileEncryptionProperties: ... +class FileDecryptionProperties: ... 
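The metadata classes stubbed in _parquet.pyi are the objects handed back by the public pyarrow.parquet API. A minimal sketch of walking them, assuming a local file named "data.parquet" (a placeholder name):

    import pyarrow.parquet as pq

    md = pq.read_metadata("data.parquet")   # FileMetaData
    rg = md.row_group(0)                    # RowGroupMetaData
    col = rg.column(0)                      # ColumnChunkMetaData
    stats = col.statistics                  # Statistics or None
    if stats is not None and stats.has_min_max:
        print(stats.min, stats.max, stats.null_count)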
diff --git a/python/stubs/_parquet_encryption.pyi b/python/stubs/_parquet_encryption.pyi new file mode 100644 index 00000000000..c707edb844a --- /dev/null +++ b/python/stubs/_parquet_encryption.pyi @@ -0,0 +1,67 @@ +import datetime as dt + +from typing import Callable + +from ._parquet import FileDecryptionProperties, FileEncryptionProperties +from .lib import _Weakrefable + +class EncryptionConfiguration(_Weakrefable): + footer_key: str + column_keys: dict[str, list[str]] + encryption_algorithm: str + plaintext_footer: bool + double_wrapping: bool + cache_lifetime: dt.timedelta + internal_key_material: bool + data_key_length_bits: int + + def __init__( + self, + footer_key: str, + *, + column_keys: dict[str, str | list[str]] | None = None, + encryption_algorithm: str | None = None, + plaintext_footer: bool | None = None, + double_wrapping: bool | None = None, + cache_lifetime: dt.timedelta | None = None, + internal_key_material: bool | None = None, + data_key_length_bits: int | None = None, + ) -> None: ... + +class DecryptionConfiguration(_Weakrefable): + cache_lifetime: dt.timedelta + def __init__(self, *, cache_lifetime: dt.timedelta | None = None): ... + +class KmsConnectionConfig(_Weakrefable): + kms_instance_id: str + kms_instance_url: str + key_access_token: str + custom_kms_conf: dict[str, str] + def __init__( + self, + *, + kms_instance_id: str | None = None, + kms_instance_url: str | None = None, + key_access_token: str | None = None, + custom_kms_conf: dict[str, str] | None = None, + ) -> None: ... + def refresh_key_access_token(self, value: str) -> None: ... + +class KmsClient(_Weakrefable): + def wrap_key(self, key_bytes: bytes, master_key_identifier: str) -> str: ... + def unwrap_key(self, wrapped_key: str, master_key_identifier: str) -> str: ... + +class CryptoFactory(_Weakrefable): + def __init__(self, kms_client_factory: Callable[[KmsConnectionConfig], KmsClient]): ... + def file_encryption_properties( + self, + kms_connection_config: KmsConnectionConfig, + encryption_config: EncryptionConfiguration, + ) -> FileEncryptionProperties: ... + def file_decryption_properties( + self, + kms_connection_config: KmsConnectionConfig, + decryption_config: DecryptionConfiguration | None = None, + ) -> FileDecryptionProperties: ... + def remove_cache_entries_for_token(self, access_token: str) -> None: ... + def remove_cache_entries_for_all_tokens(self) -> None: ... diff --git a/python/stubs/_s3fs.pyi b/python/stubs/_s3fs.pyi new file mode 100644 index 00000000000..fc13c498bd9 --- /dev/null +++ b/python/stubs/_s3fs.pyi @@ -0,0 +1,74 @@ +import enum + +from typing import Literal, NotRequired, Required, TypedDict + +from ._fs import FileSystem +from .lib import KeyValueMetadata + +class _ProxyOptions(TypedDict): + schema: Required[Literal["http", "https"]] + host: Required[str] + port: Required[int] + username: NotRequired[str] + password: NotRequired[str] + +class S3LogLevel(enum.IntEnum): + Off = enum.auto() + Fatal = enum.auto() + Error = enum.auto() + Warn = enum.auto() + Info = enum.auto() + Debug = enum.auto() + Trace = enum.auto() + +Off = S3LogLevel.Off +Fatal = S3LogLevel.Fatal +Error = S3LogLevel.Error +Warn = S3LogLevel.Warn +Info = S3LogLevel.Info +Debug = S3LogLevel.Debug +Trace = S3LogLevel.Trace + +def initialize_s3( + log_level: S3LogLevel = S3LogLevel.Fatal, num_event_loop_threads: int = 1 +) -> None: ... +def ensure_s3_initialized() -> None: ... +def finalize_s3() -> None: ... +def ensure_s3_finalized() -> None: ... +def resolve_s3_region(bucket: str) -> str: ... 
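The module-level S3 helpers declared above are re-exported through pyarrow.fs. A minimal sketch, where "my-bucket" is a placeholder bucket name:

    from pyarrow import fs

    fs.initialize_s3(fs.S3LogLevel.Error)        # optional: tune AWS SDK log verbosity
    region = fs.resolve_s3_region("my-bucket")   # look up the bucket's region
    s3 = fs.S3FileSystem(region=region, anonymous=True)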
+ +class S3RetryStrategy: + max_attempts: int + def __init__(self, max_attempts=3) -> None: ... + +class AwsStandardS3RetryStrategy(S3RetryStrategy): ... +class AwsDefaultS3RetryStrategy(S3RetryStrategy): ... + +class S3FileSystem(FileSystem): + def __init__( + self, + *, + access_key: str | None = None, + secret_key: str | None = None, + session_token: str | None = None, + anonymous: bool = False, + region: str | None = None, + request_timeout: float | None = None, + connect_timeout: float | None = None, + scheme: Literal["http", "https"] = "https", + endpoint_override: str | None = None, + background_writes: bool = True, + default_metadata: dict | KeyValueMetadata | None = None, + role_arn: str | None = None, + session_name: str | None = None, + external_id: str | None = None, + load_frequency: int = 900, + proxy_options: _ProxyOptions | str | None = None, + allow_bucket_creation: bool = False, + allow_bucket_deletion: bool = False, + check_directory_existence_before_creation: bool = False, + retry_strategy: S3RetryStrategy = AwsStandardS3RetryStrategy(max_attempts=3), + force_virtual_addressing: bool = False, + ): ... + @property + def region(self) -> str: ... diff --git a/python/stubs/_stubs_typing.pyi b/python/stubs/_stubs_typing.pyi new file mode 100644 index 00000000000..c259513f1ea --- /dev/null +++ b/python/stubs/_stubs_typing.pyi @@ -0,0 +1,80 @@ +import datetime as dt + +from collections.abc import Sequence +from decimal import Decimal +from typing import Any, Collection, Literal, Protocol, TypeAlias, TypeVar + +import numpy as np + +from numpy.typing import NDArray + +from .compute import BooleanArray, IntegerArray + +ArrayLike: TypeAlias = Any +ScalarLike: TypeAlias = Any +Order: TypeAlias = Literal["ascending", "descending"] +JoinType: TypeAlias = Literal[ + "left semi", + "right semi", + "left anti", + "right anti", + "inner", + "left outer", + "right outer", + "full outer", +] +Compression: TypeAlias = Literal[ + "gzip", "bz2", "brotli", "lz4", "lz4_frame", "lz4_raw", "zstd", "snappy" +] +NullEncoding: TypeAlias = Literal["mask", "encode"] +NullSelectionBehavior: TypeAlias = Literal["drop", "emit_null"] +Mask: TypeAlias = Sequence[bool | None] | NDArray[np.bool_] | BooleanArray +Indices: TypeAlias = Sequence[int] | NDArray[np.integer[Any]] | IntegerArray +PyScalar: TypeAlias = ( + bool | int | float | Decimal | str | bytes | dt.date | dt.datetime | dt.time | dt.timedelta +) + +_T = TypeVar("_T") +SingleOrList: TypeAlias = list[_T] | _T + +class SupportEq(Protocol): + def __eq__(self, other) -> bool: ... + +class SupportLt(Protocol): + def __lt__(self, other) -> bool: ... + +class SupportGt(Protocol): + def __gt__(self, other) -> bool: ... + +class SupportLe(Protocol): + def __le__(self, other) -> bool: ... + +class SupportGe(Protocol): + def __ge__(self, other) -> bool: ... + +FilterTuple: TypeAlias = ( + tuple[str, Literal["=", "==", "!="], SupportEq] + | tuple[str, Literal["<"], SupportLt] + | tuple[str, Literal[">"], SupportGt] + | tuple[str, Literal["<="], SupportLe] + | tuple[str, Literal[">="], SupportGe] + | tuple[str, Literal["in", "not in"], Collection] +) + +class Buffer(Protocol): + def __buffer__(self, flags: int, /) -> memoryview: ... + +class SupportPyBuffer(Protocol): + def __buffer__(self, flags: int, /) -> memoryview: ... + +class SupportArrowStream(Protocol): + def __arrow_c_stream__(self, requested_schema=None) -> Any: ... + +class SupportArrowArray(Protocol): + def __arrow_c_array__(self, requested_schema=None) -> Any: ... 
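FilterTuple above models the tuple-style row filters accepted by the Parquet and dataset readers. A minimal sketch, with a placeholder file and made-up column names:

    import pyarrow.parquet as pq

    table = pq.read_table(
        "data.parquet",
        filters=[("year", ">=", 2020), ("city", "in", {"Berlin", "Paris"})],
    )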
+ +class SupportArrowDeviceArray(Protocol): + def __arrow_c_device_array__(self, requested_schema=None, **kwargs) -> Any: ... + +class SupportArrowSchema(Protocol): + def __arrow_c_schema(self) -> Any: ... diff --git a/python/stubs/_substrait.pyi b/python/stubs/_substrait.pyi new file mode 100644 index 00000000000..ff226e9521b --- /dev/null +++ b/python/stubs/_substrait.pyi @@ -0,0 +1,39 @@ +from typing import Any, Callable + +from ._compute import Expression +from .lib import Buffer, RecordBatchReader, Schema, Table, _Weakrefable + +def run_query( + plan: Buffer | int, + *, + table_provider: Callable[[list[str], Schema], Table] | None = None, + use_threads: bool = True, +) -> RecordBatchReader: ... +def _parse_json_plan(plan: bytes) -> Buffer: ... + +class SubstraitSchema: + schema: Schema + expression: Expression + def __init__(self, schema: Schema, expression: Expression) -> None: ... + def to_pysubstrait(self) -> Any: ... + +def serialize_schema(schema: Schema) -> SubstraitSchema: ... +def deserialize_schema(buf: Buffer | bytes) -> Schema: ... +def serialize_expressions( + exprs: list[Expression], + names: list[str], + schema: Schema, + *, + allow_arrow_extensions: bool = False, +) -> Buffer: ... + +class BoundExpressions(_Weakrefable): + @property + def schema(self) -> Schema: ... + @property + def expressions(self) -> dict[str, Expression]: ... + @classmethod + def from_substrait(cls, message: Buffer | bytes) -> BoundExpressions: ... + +def deserialize_expressions(buf: Buffer | bytes) -> BoundExpressions: ... +def get_supported_functions() -> list[str]: ... diff --git a/python/stubs/acero.pyi b/python/stubs/acero.pyi new file mode 100644 index 00000000000..8a520bdc24a --- /dev/null +++ b/python/stubs/acero.pyi @@ -0,0 +1,85 @@ +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias +from typing import Literal + +from . import lib +from .compute import Expression, FunctionOptions + +_StrOrExpr: TypeAlias = str | Expression + +class Declaration(lib._Weakrefable): + def __init__( + self, + factory_name: str, + options: ExecNodeOptions, + inputs: list[Declaration] | None = None, + ) -> None: ... + @classmethod + def from_sequence(cls, decls: list[Declaration]) -> Self: ... + def to_reader(self, use_threads: bool = True) -> lib.RecordBatchReader: ... + def to_table(self, use_threads: bool = True) -> lib.Table: ... + +class ExecNodeOptions(lib._Weakrefable): ... + +class TableSourceNodeOptions(ExecNodeOptions): + def __init__(self, table: lib.Table) -> None: ... + +class FilterNodeOptions(ExecNodeOptions): + def __init__(self, filter_expression: Expression) -> None: ... + +class ProjectNodeOptions(ExecNodeOptions): + def __init__(self, expressions: list[Expression], names: list[str] | None = None) -> None: ... + +class AggregateNodeOptions(ExecNodeOptions): + def __init__( + self, + aggregates: list[tuple[list[str], str, FunctionOptions, str]], + keys: list[_StrOrExpr] | None = None, + ) -> None: ... + +class OrderByNodeOptions(ExecNodeOptions): + def __init__( + self, + sort_keys: tuple[tuple[str, Literal["ascending", "descending"]], ...] = (), + *, + null_placement: Literal["at_start", "at_end"] = "at_end", + ) -> None: ... 
+ +class HashJoinNodeOptions(ExecNodeOptions): + def __init__( + self, + join_type: Literal[ + "left semi", + "right semi", + "left anti", + "right anti", + "inner", + "left outer", + "right outer", + "full outer", + ], + left_keys: _StrOrExpr | list[_StrOrExpr], + right_keys: _StrOrExpr | list[_StrOrExpr], + left_output: list[_StrOrExpr] | None = None, + right_output: list[_StrOrExpr] | None = None, + output_suffix_for_left: str = "", + output_suffix_for_right: str = "", + ) -> None: ... + +class AsofJoinNodeOptions(ExecNodeOptions): + def __init__( + self, + left_on: _StrOrExpr, + left_by: _StrOrExpr | list[_StrOrExpr], + right_on: _StrOrExpr, + right_by: _StrOrExpr | list[_StrOrExpr], + tolerance: int, + ) -> None: ... diff --git a/python/stubs/benchmark.pyi b/python/stubs/benchmark.pyi new file mode 100644 index 00000000000..048973301dc --- /dev/null +++ b/python/stubs/benchmark.pyi @@ -0,0 +1,3 @@ +from pyarrow.lib import benchmark_PandasObjectIsNull + +__all__ = ["benchmark_PandasObjectIsNull"] diff --git a/python/stubs/cffi.pyi b/python/stubs/cffi.pyi new file mode 100644 index 00000000000..2ae945c5974 --- /dev/null +++ b/python/stubs/cffi.pyi @@ -0,0 +1,4 @@ +import cffi + +c_source: str +ffi: cffi.FFI diff --git a/python/stubs/compute.pyi b/python/stubs/compute.pyi new file mode 100644 index 00000000000..8d8fc35b134 --- /dev/null +++ b/python/stubs/compute.pyi @@ -0,0 +1,7779 @@ +# ruff: noqa: I001 +from typing import Literal, TypeAlias, TypeVar, overload, Any, Iterable, ParamSpec, Sequence +from collections.abc import Callable + +# Option classes +from pyarrow._compute import ArraySortOptions as ArraySortOptions +from pyarrow._compute import AssumeTimezoneOptions as AssumeTimezoneOptions +from pyarrow._compute import CastOptions as CastOptions +from pyarrow._compute import CountOptions as CountOptions +from pyarrow._compute import CumulativeOptions as CumulativeOptions +from pyarrow._compute import CumulativeSumOptions as CumulativeSumOptions +from pyarrow._compute import DayOfWeekOptions as DayOfWeekOptions +from pyarrow._compute import DictionaryEncodeOptions as DictionaryEncodeOptions +from pyarrow._compute import ElementWiseAggregateOptions as ElementWiseAggregateOptions + +# Expressions +from pyarrow._compute import Expression as Expression +from pyarrow._compute import ExtractRegexOptions as ExtractRegexOptions +from pyarrow._compute import ExtractRegexSpanOptions as ExtractRegexSpanOptions +from pyarrow._compute import FilterOptions as FilterOptions +from pyarrow._compute import Function as Function +from pyarrow._compute import FunctionOptions as FunctionOptions +from pyarrow._compute import FunctionRegistry as FunctionRegistry +from pyarrow._compute import HashAggregateFunction as HashAggregateFunction +from pyarrow._compute import HashAggregateKernel as HashAggregateKernel +from pyarrow._compute import IndexOptions as IndexOptions +from pyarrow._compute import JoinOptions as JoinOptions +from pyarrow._compute import Kernel as Kernel +from pyarrow._compute import ListFlattenOptions as ListFlattenOptions +from pyarrow._compute import ListSliceOptions as ListSliceOptions +from pyarrow._compute import MakeStructOptions as MakeStructOptions +from pyarrow._compute import MapLookupOptions as MapLookupOptions +from pyarrow._compute import MatchSubstringOptions as MatchSubstringOptions +from pyarrow._compute import ModeOptions as ModeOptions +from pyarrow._compute import NullOptions as NullOptions +from pyarrow._compute import PadOptions as PadOptions +from pyarrow._compute 
import PairwiseOptions as PairwiseOptions +from pyarrow._compute import PartitionNthOptions as PartitionNthOptions +from pyarrow._compute import PivotWiderOptions as PivotWiderOptions +from pyarrow._compute import QuantileOptions as QuantileOptions +from pyarrow._compute import RandomOptions as RandomOptions +from pyarrow._compute import RankOptions as RankOptions +from pyarrow._compute import RankQuantileOptions as RankQuantileOptions +from pyarrow._compute import ReplaceSliceOptions as ReplaceSliceOptions +from pyarrow._compute import ReplaceSubstringOptions as ReplaceSubstringOptions +from pyarrow._compute import RoundBinaryOptions as RoundBinaryOptions +from pyarrow._compute import RoundOptions as RoundOptions +from pyarrow._compute import RoundTemporalOptions as RoundTemporalOptions +from pyarrow._compute import RoundToMultipleOptions as RoundToMultipleOptions +from pyarrow._compute import RunEndEncodeOptions as RunEndEncodeOptions +from pyarrow._compute import ScalarAggregateFunction as ScalarAggregateFunction +from pyarrow._compute import ScalarAggregateKernel as ScalarAggregateKernel +from pyarrow._compute import ScalarAggregateOptions as ScalarAggregateOptions +from pyarrow._compute import ScalarFunction as ScalarFunction +from pyarrow._compute import ScalarKernel as ScalarKernel +from pyarrow._compute import SelectKOptions as SelectKOptions +from pyarrow._compute import SetLookupOptions as SetLookupOptions +from pyarrow._compute import SkewOptions as SkewOptions +from pyarrow._compute import SliceOptions as SliceOptions +from pyarrow._compute import SortOptions as SortOptions +from pyarrow._compute import SplitOptions as SplitOptions +from pyarrow._compute import SplitPatternOptions as SplitPatternOptions +from pyarrow._compute import StrftimeOptions as StrftimeOptions +from pyarrow._compute import StrptimeOptions as StrptimeOptions +from pyarrow._compute import StructFieldOptions as StructFieldOptions +from pyarrow._compute import TakeOptions as TakeOptions +from pyarrow._compute import TDigestOptions as TDigestOptions +from pyarrow._compute import TrimOptions as TrimOptions +from pyarrow._compute import UdfContext as UdfContext +from pyarrow._compute import Utf8NormalizeOptions as Utf8NormalizeOptions +from pyarrow._compute import VarianceOptions as VarianceOptions +from pyarrow._compute import VectorFunction as VectorFunction +from pyarrow._compute import VectorKernel as VectorKernel +from pyarrow._compute import WeekOptions as WeekOptions +from pyarrow._compute import WinsorizeOptions as WinsorizeOptions + +# Functions +from pyarrow._compute import call_function as call_function + +# Udf +from pyarrow._compute import call_tabular_function as call_tabular_function +from pyarrow._compute import function_registry as function_registry +from pyarrow._compute import get_function as get_function +from pyarrow._compute import list_functions as list_functions +from pyarrow._compute import register_aggregate_function as register_aggregate_function +from pyarrow._compute import register_scalar_function as register_scalar_function +from pyarrow._compute import register_tabular_function as register_tabular_function +from pyarrow._compute import register_vector_function as register_vector_function + +from pyarrow._compute import _Order, _Placement +from pyarrow._stubs_typing import ArrayLike, ScalarLike +from . import lib + +_P = ParamSpec("_P") +_R = TypeVar("_R") + +def field(*name_or_index: str | tuple[str, ...] | int) -> Expression: + """Reference a column of the dataset. 
+
+    Stores only the field's name. Type and other information is known only when
+    the expression is bound to a dataset having an explicit scheme.
+
+    Nested references are allowed by passing multiple names or a tuple of
+    names. For example ``('foo', 'bar')`` references the field named "bar"
+    inside the field named "foo".
+
+    Parameters
+    ----------
+    *name_or_index : string, multiple strings, tuple or int
+        The name or index of the (possibly nested) field the expression
+        references to.
+
+    Returns
+    -------
+    field_expr : Expression
+        Reference to the given field
+
+    Examples
+    --------
+    >>> import pyarrow.compute as pc
+    >>> pc.field("a")
+
+    >>> pc.field(1)
+
+    >>> pc.field(("a", "b"))
+
+    >>> pc.field("a", "b")
+    """
+
+def scalar(value: bool | int | float | str) -> Expression:
+    """Expression representing a scalar value.
+
+    Creates an Expression object representing a scalar value that can be used
+    in compute expressions and predicates.
+
+    Parameters
+    ----------
+    value : bool, int, float or string
+        Python value of the scalar. This function accepts any value that can be
+        converted to a ``pyarrow.Scalar`` using ``pa.scalar()``.
+
+    Notes
+    -----
+    This function differs from ``pyarrow.scalar()`` in the following way:
+
+    * ``pyarrow.scalar()`` creates a ``pyarrow.Scalar`` object that represents
+      a single value in Arrow's memory model.
+    * ``pyarrow.compute.scalar()`` creates an ``Expression`` object representing
+      a scalar value that can be used in compute expressions, predicates, and
+      dataset filtering operations.
+
+    Returns
+    -------
+    scalar_expr : Expression
+        An Expression representing the scalar value
+    """
+
+def _clone_signature(f: Callable[_P, _R]) -> Callable[_P, _R]: ...
+
+# ============= compute functions =============
+_DataTypeT = TypeVar("_DataTypeT", bound=lib.DataType)
+_Scalar_CoT = TypeVar("_Scalar_CoT", bound=lib.Scalar, covariant=True)
+_ScalarT = TypeVar("_ScalarT", bound=lib.Scalar)
+_ArrayT = TypeVar("_ArrayT", bound=lib.Array | lib.ChunkedArray)
+_ScalarOrArrayT = TypeVar("_ScalarOrArrayT", bound=lib.Array | lib.Scalar | lib.ChunkedArray)
+ArrayOrChunkedArray: TypeAlias = lib.Array[_Scalar_CoT] | lib.ChunkedArray[_Scalar_CoT]
+ScalarOrArray: TypeAlias = ArrayOrChunkedArray[_Scalar_CoT] | _Scalar_CoT
+
+SignedIntegerScalar: TypeAlias = (
+    lib.Scalar[lib.Int8Type]
+    | lib.Scalar[lib.Int16Type]
+    | lib.Scalar[lib.Int32Type]
+    | lib.Scalar[lib.Int64Type]
+)
+UnsignedIntegerScalar: TypeAlias = (
+    lib.Scalar[lib.UInt8Type]
+    | lib.Scalar[lib.UInt16Type]
+    | lib.Scalar[lib.UInt32Type]
+    | lib.Scalar[lib.UInt64Type]
+)
+IntegerScalar: TypeAlias = SignedIntegerScalar | UnsignedIntegerScalar
+FloatScalar: TypeAlias = (
+    lib.Scalar[lib.Float16Type] | lib.Scalar[lib.Float32Type] | lib.Scalar[lib.Float64Type]
+)
+DecimalScalar: TypeAlias = (
+    lib.Scalar[lib.Decimal32Type]
+    | lib.Scalar[lib.Decimal64Type]
+    | lib.Scalar[lib.Decimal128Type]
+    | lib.Scalar[lib.Decimal256Type]
+)
+NonFloatNumericScalar: TypeAlias = IntegerScalar | DecimalScalar
+NumericScalar: TypeAlias = IntegerScalar | FloatScalar | DecimalScalar
+BinaryScalar: TypeAlias = (
+    lib.Scalar[lib.BinaryType]
+    | lib.Scalar[lib.LargeBinaryType]
+    | lib.Scalar[lib.FixedSizeBinaryType]
+)
+StringScalar: TypeAlias = lib.Scalar[lib.StringType] | lib.Scalar[lib.LargeStringType]
+StringOrBinaryScalar: TypeAlias = StringScalar | BinaryScalar
+_ListScalar: TypeAlias = lib.ListViewScalar[_DataTypeT] | lib.FixedSizeListScalar[_DataTypeT, Any]
+_LargeListScalar: TypeAlias = lib.LargeListScalar[_DataTypeT] | lib.LargeListViewScalar[_DataTypeT]
+ListScalar: TypeAlias
= ( + lib.ListScalar[_DataTypeT] | _ListScalar[_DataTypeT] | _LargeListScalar[_DataTypeT] +) +TemporalScalar: TypeAlias = ( + lib.Date32Scalar + | lib.Date64Scalar + | lib.Time32Scalar[Any] + | lib.Time64Scalar[Any] + | lib.TimestampScalar[Any] + | lib.DurationScalar[Any] + | lib.MonthDayNanoIntervalScalar +) +NumericOrDurationScalar: TypeAlias = NumericScalar | lib.DurationScalar +NumericOrTemporalScalar: TypeAlias = NumericScalar | TemporalScalar + +_NumericOrTemporalScalarT = TypeVar("_NumericOrTemporalScalarT", bound=NumericOrTemporalScalar) +NumericArray: TypeAlias = ArrayOrChunkedArray[_NumericScalarT] +_NumericArrayT = TypeVar("_NumericArrayT", bound=NumericArray) +_NumericScalarT = TypeVar("_NumericScalarT", bound=NumericScalar) +_NumericOrDurationT = TypeVar("_NumericOrDurationT", bound=NumericOrDurationScalar) +NumericOrDurationArray: TypeAlias = ArrayOrChunkedArray[NumericOrDurationScalar] +_NumericOrDurationArrayT = TypeVar("_NumericOrDurationArrayT", bound=NumericOrDurationArray) +NumericOrTemporalArray: TypeAlias = ArrayOrChunkedArray[_NumericOrTemporalScalarT] +_NumericOrTemporalArrayT = TypeVar("_NumericOrTemporalArrayT", bound=NumericOrTemporalArray) +BooleanArray: TypeAlias = ArrayOrChunkedArray[lib.BooleanScalar] +_BooleanArrayT = TypeVar("_BooleanArrayT", bound=BooleanArray) +IntegerArray: TypeAlias = ArrayOrChunkedArray[IntegerScalar] +_FloatScalarT = TypeVar("_FloatScalarT", bound=FloatScalar) +FloatArray: TypeAlias = ArrayOrChunkedArray[FloatScalar] +_FloatArrayT = TypeVar("_FloatArrayT", bound=FloatArray) +_StringScalarT = TypeVar("_StringScalarT", bound=StringScalar) +StringArray: TypeAlias = ArrayOrChunkedArray[StringScalar] +_StringArrayT = TypeVar("_StringArrayT", bound=StringArray) +_BinaryScalarT = TypeVar("_BinaryScalarT", bound=BinaryScalar) +BinaryArray: TypeAlias = ArrayOrChunkedArray[BinaryScalar] +_BinaryArrayT = TypeVar("_BinaryArrayT", bound=BinaryArray) +_StringOrBinaryScalarT = TypeVar("_StringOrBinaryScalarT", bound=StringOrBinaryScalar) +StringOrBinaryArray: TypeAlias = StringArray | BinaryArray +_StringOrBinaryArrayT = TypeVar("_StringOrBinaryArrayT", bound=StringOrBinaryArray) +_TemporalScalarT = TypeVar("_TemporalScalarT", bound=TemporalScalar) +TemporalArray: TypeAlias = ArrayOrChunkedArray[TemporalScalar] +_TemporalArrayT = TypeVar("_TemporalArrayT", bound=TemporalArray) +_ListArray: TypeAlias = ArrayOrChunkedArray[_ListScalar[_DataTypeT]] +_LargeListArray: TypeAlias = ArrayOrChunkedArray[_LargeListScalar[_DataTypeT]] +ListArray: TypeAlias = ArrayOrChunkedArray[ListScalar[_DataTypeT]] +# =============================== 1. Aggregation =============================== + +# ========================= 1.1 functions ========================= + +def all( + array: lib.BooleanScalar | BooleanArray, + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanScalar: + """ + Test whether all elements in a boolean array evaluate to true. + + Null values are ignored by default. + If the `skip_nulls` option is set to false, then Kleene logic is used. + See "kleene_and" for more details on Kleene logic. + + Parameters + ---------- + array : Array-like + Argument to compute function. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 1 + Minimum number of non-null values in the input. 
If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +any = _clone_signature(all) +""" +Test whether any element in a boolean array evaluates to true. + +Null values are ignored by default. +If the `skip_nulls` option is set to false, then Kleene logic is used. +See "kleene_or" for more details on Kleene logic. + +Parameters +---------- +array : Array-like + Argument to compute function. +skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. +min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. +options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +def approximate_median( + array: NumericScalar | NumericArray, + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleScalar: + """ + Approximate median of a numeric array with T-Digest algorithm. + + Nulls and NaNs are ignored. + A null scalar is returned if there is no valid data point. + + Parameters + ---------- + array : Array-like + Argument to compute function. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def count( + array: lib.Array | lib.ChunkedArray, + /, + mode: Literal["only_valid", "only_null", "all"] = "only_valid", + *, + options: CountOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar: + """ + Count the number of null / non-null values. + + By default, only non-null values are counted. + This can be changed through CountOptions. + + Parameters + ---------- + array : Array-like + Argument to compute function. + mode : str, default "only_valid" + Which values to count in the input. + Accepted values are "only_valid", "only_null", "all". + options : pyarrow.compute.CountOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def count_distinct( + array: lib.Array | lib.ChunkedArray, + /, + mode: Literal["only_valid", "only_null", "all"] = "only_valid", + *, + options: CountOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar: + """ + Count the number of unique values. + + By default, only non-null values are counted. + This can be changed through CountOptions. + + Parameters + ---------- + array : Array-like + Argument to compute function. + mode : str, default "only_valid" + Which values to count in the input. + Accepted values are "only_valid", "only_null", "all". 
+    options : pyarrow.compute.CountOptions, optional
+        Alternative way of passing options.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+def first(
+    array: lib.Array[_ScalarT] | lib.ChunkedArray[_ScalarT],
+    /,
+    *,
+    skip_nulls: bool = True,
+    min_count: int = 1,
+    options: ScalarAggregateOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> _ScalarT:
+    """
+    Compute the first value in each group.
+
+    Null values are ignored by default.
+    If skip_nulls = false, then this will return the first and last values
+    regardless if it is null
+
+    Parameters
+    ----------
+    array : Array-like
+        Argument to compute function.
+    skip_nulls : bool, default True
+        Whether to skip (ignore) nulls in the input.
+        If False, any null in the input forces the output to null.
+    min_count : int, default 1
+        Minimum number of non-null values in the input. If the number
+        of non-null values is below `min_count`, the output is null.
+    options : pyarrow.compute.ScalarAggregateOptions, optional
+        Alternative way of passing options.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+def first_last(
+    array: lib.Array[Any] | lib.ChunkedArray[Any],
+    /,
+    *,
+    skip_nulls: bool = True,
+    min_count: int = 1,
+    options: ScalarAggregateOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> lib.StructScalar:
+    """
+    Compute the first and last values of an array.
+
+    Null values are ignored by default.
+    If skip_nulls = false, then this will return the first and last values
+    regardless if it is null
+
+    Parameters
+    ----------
+    array : Array-like
+        Argument to compute function.
+    skip_nulls : bool, default True
+        Whether to skip (ignore) nulls in the input.
+        If False, any null in the input forces the output to null.
+    min_count : int, default 1
+        Minimum number of non-null values in the input. If the number
+        of non-null values is below `min_count`, the output is null.
+    options : pyarrow.compute.ScalarAggregateOptions, optional
+        Alternative way of passing options.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+def index(
+    data: lib.Array[Any] | lib.ChunkedArray[Any],
+    value,
+    start: int | None = None,
+    end: int | None = None,
+    *,
+    memory_pool: lib.MemoryPool | None = None,
+) -> lib.Int64Scalar:
+    """
+    Find the index of the first occurrence of a given value.
+
+    Parameters
+    ----------
+    data : Array-like
+    value : Scalar-like object
+        The value to search for.
+    start : int, optional
+    end : int, optional
+    memory_pool : MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+
+    Returns
+    -------
+    index : int
+        the index, or -1 if not found
+
+    Examples
+    --------
+    >>> import pyarrow as pa
+    >>> import pyarrow.compute as pc
+    >>> arr = pa.array(["Lorem", "ipsum", "dolor", "sit", "Lorem", "ipsum"])
+    >>> pc.index(arr, "ipsum")
+
+    >>> pc.index(arr, "ipsum", start=2)
+
+    >>> pc.index(arr, "amet")
+
+    """
+
+last = _clone_signature(first)
+"""
+Compute the last value in each group.
+ +Null values are ignored by default. +If skip_nulls = false, then this will return the first and last values +regardless if it is null + +Parameters +---------- +array : Array-like + Argument to compute function. +skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. +min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. +options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +max = _clone_signature(first) +""" +Compute the minimum or maximum values of a numeric array. + +Null values are ignored by default. +This can be changed through ScalarAggregateOptions. + +Parameters +---------- +array : Array-like + Argument to compute function. +skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. +min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. +options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +min = _clone_signature(first) +""" +Compute the minimum or maximum values of a numeric array. + +Null values are ignored by default. +This can be changed through ScalarAggregateOptions. + +Parameters +---------- +array : Array-like + Argument to compute function. +skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. +min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. +options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +min_max = _clone_signature(first_last) +""" +Compute the minimum and maximum values of a numeric array. + +Null values are ignored by default. +This can be changed through ScalarAggregateOptions. + +Parameters +---------- +array : Array-like + Argument to compute function. +skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. +min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. +options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def mean( + array: FloatScalar | FloatArray, + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleScalar: ... 
+@overload +def mean( + array: lib.NumericArray[lib.Decimal128Scalar] + | lib.ChunkedArray[lib.Decimal128Scalar] + | lib.Decimal128Scalar, + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Decimal128Scalar: ... +@overload +def mean( + array: lib.NumericArray[lib.Decimal256Scalar] + | lib.ChunkedArray[lib.Decimal256Scalar] + | lib.Decimal256Scalar, + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Decimal256Scalar: ... +def mean(*args, **kwargs): + """ + Compute the mean of a numeric array. + + Null values are ignored by default. Minimum count of non-null + values can be set and null is returned if too few are present. + This can be changed through ScalarAggregateOptions. + The result is a double for integer and floating point arguments, + and a decimal with the same bit-width/precision/scale for decimal arguments. + For integers and floats, NaN is returned if min_count = 0 and + there are no values. For decimals, null is returned instead. + + Parameters + ---------- + array : Array-like + Argument to compute function. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def mode( + array: NumericScalar | NumericArray, + /, + n: int = 1, + *, + skip_nulls: bool = True, + min_count: int = 0, + options: ModeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StructArray: + """ + Compute the modal (most common) values of a numeric array. + + Compute the n most common values and their respective occurrence counts. + The output has type `struct`, where T is the + input type. + The results are ordered by descending `count` first, and ascending `mode` + when breaking ties. + Nulls are ignored. If there are no non-null values in the array, + an empty array is returned. + + Parameters + ---------- + array : Array-like + Argument to compute function. + n : int, default 1 + Number of distinct most-common values to return. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.ModeOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + + Examples + -------- + >>> import pyarrow as pa + >>> import pyarrow.compute as pc + >>> arr = pa.array([1, 1, 2, 2, 3, 2, 2, 2]) + >>> modes = pc.mode(arr, 2) + >>> modes[0] + + >>> modes[1] + + """ + +def product( + array: _ScalarT | lib.NumericArray[_ScalarT], + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _ScalarT: + """ + Compute the product of values in a numeric array. 
+ + Null values are ignored by default. Minimum count of non-null + values can be set and null is returned if too few are present. + This can be changed through ScalarAggregateOptions. + + Parameters + ---------- + array : Array-like + Argument to compute function. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def quantile( + array: NumericScalar | NumericArray, + /, + q: float = 0.5, + *, + interpolation: Literal["linear", "lower", "higher", "nearest", "midpoint"] = "linear", + skip_nulls: bool = True, + min_count: int = 0, + options: QuantileOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleArray: + """ + Compute an array of quantiles of a numeric array or chunked array. + + By default, 0.5 quantile (median) is returned. + If quantile lies between two data points, an interpolated value is + returned based on selected interpolation method. + Nulls and NaNs are ignored. + An array of nulls is returned if there is no valid data point. + + Parameters + ---------- + array : Array-like + Argument to compute function. + q : double or sequence of double, default 0.5 + Probability levels of the quantiles to compute. All values must be in + [0, 1]. + interpolation : str, default "linear" + How to break ties between competing data points for a given quantile. + Accepted values are: + + - "linear": compute an interpolation + - "lower": always use the smallest of the two data points + - "higher": always use the largest of the two data points + - "nearest": select the data point that is closest to the quantile + - "midpoint": compute the (unweighted) mean of the two data points + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.QuantileOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def stddev( + array: NumericScalar | NumericArray, + /, + *, + ddof: float = 0, + skip_nulls: bool = True, + min_count: int = 0, + options: VarianceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleScalar: + """ + Calculate the standard deviation of a numeric array. + + The number of degrees of freedom can be controlled using VarianceOptions. + By default (`ddof` = 0), the population standard deviation is calculated. + Nulls are ignored. If there are not enough non-null values in the array + to satisfy `ddof`, null is returned. + + Parameters + ---------- + array : Array-like + Argument to compute function. + ddof : int, default 0 + Number of degrees of freedom. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 0 + Minimum number of non-null values in the input. 
If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.VarianceOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def sum( + array: _NumericScalarT | NumericArray[_NumericScalarT], + /, + *, + skip_nulls: bool = True, + min_count: int = 1, + options: ScalarAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericScalarT: + """ + Compute the sum of a numeric array. + + Null values are ignored by default. Minimum count of non-null + values can be set and null is returned if too few are present. + This can be changed through ScalarAggregateOptions. + + Parameters + ---------- + array : Array-like + Argument to compute function. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 1 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.ScalarAggregateOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def tdigest( + array: NumericScalar | NumericArray, + /, + q: float = 0.5, + *, + delta: int = 100, + buffer_size: int = 500, + skip_nulls: bool = True, + min_count: int = 0, + options: TDigestOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleArray: + """ + Approximate quantiles of a numeric array with T-Digest algorithm. + + By default, 0.5 quantile (median) is returned. + Nulls and NaNs are ignored. + An array of nulls is returned if there is no valid data point. + + Parameters + ---------- + array : Array-like + Argument to compute function. + q : double or sequence of double, default 0.5 + Probability levels of the quantiles to approximate. All values must be + in [0, 1]. + delta : int, default 100 + Compression parameter for the T-digest algorithm. + buffer_size : int, default 500 + Buffer size for the T-digest algorithm. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.TDigestOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + + """ + +def variance( + array: NumericScalar | NumericArray, + /, + *, + ddof: int = 0, + skip_nulls: bool = True, + min_count: int = 0, + options: VarianceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleScalar: + """ + Calculate the variance of a numeric array. + + The number of degrees of freedom can be controlled using VarianceOptions. + By default (`ddof` = 0), the population variance is calculated. + Nulls are ignored. If there are not enough non-null values in the array + to satisfy `ddof`, null is returned. + + Parameters + ---------- + array : Array-like + Argument to compute function. + ddof : int, default 0 + Number of degrees of freedom. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. 
+ If False, any null in the input forces the output to null. + min_count : int, default 0 + Minimum number of non-null values in the input. If the number + of non-null values is below `min_count`, the output is null. + options : pyarrow.compute.VarianceOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def top_k_unstable( + values: lib.Array | lib.ChunkedArray | lib.RecordBatch | lib.Table, + k: int, + sort_keys: list | None = None, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Array: + """ + Select the indices of the top-k ordered elements from array- or table-like + data. + + This is a specialization for :func:`select_k_unstable`. Output is not + guaranteed to be stable. + + Parameters + ---------- + values : Array, ChunkedArray, RecordBatch, or Table + Data to sort and get top indices from. + k : int + The number of `k` elements to keep. + sort_keys : List-like + Column key names to order by when input is table-like data. + memory_pool : MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + + Returns + ------- + result : Array + Indices of the top-k ordered elements + + Examples + -------- + >>> import pyarrow as pa + >>> import pyarrow.compute as pc + >>> arr = pa.array(["a", "b", "c", None, "e", "f"]) + >>> pc.top_k_unstable(arr, k=3) + + [ + 5, + 4, + 2 + ] + """ + +def bottom_k_unstable( + values: lib.Array | lib.ChunkedArray | lib.RecordBatch | lib.Table, + k: int, + sort_keys: list | None = None, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Array: + """ + Select the indices of the bottom-k ordered elements from + array- or table-like data. + + This is a specialization for :func:`select_k_unstable`. Output is not + guaranteed to be stable. + + Parameters + ---------- + values : Array, ChunkedArray, RecordBatch, or Table + Data to sort and get bottom indices from. + k : int + The number of `k` elements to keep. + sort_keys : List-like + Column key names to order by when input is table-like data. + memory_pool : MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + + Returns + ------- + result : Array of indices + Indices of the bottom-k ordered elements + + Examples + -------- + >>> import pyarrow as pa + >>> import pyarrow.compute as pc + >>> arr = pa.array(["a", "b", "c", None, "e", "f"]) + >>> pc.bottom_k_unstable(arr, k=3) + + [ + 0, + 1, + 2 + ] + """ + +# ========================= 2. Element-wise (“scalar”) functions ========================= + +# ========================= 2.1 Arithmetic ========================= +@overload +def abs( + x: _NumericOrDurationT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _NumericOrDurationT: ... +@overload +def abs( + x: _NumericOrDurationArrayT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _NumericOrDurationArrayT: ... +@overload +def abs(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... +def abs(*args, **kwargs): + """ + Calculate the absolute value of the argument element-wise. + + Results will wrap around on integer overflow. + Use function "abs_checked" if you want overflow + to return an error. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
+ """ + +abs_checked = _clone_signature(abs) +""" +Calculate the absolute value of the argument element-wise. + +This function returns an error on overflow. For a variant that +doesn't fail on overflow, use function "abs". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def add( + x: _NumericOrTemporalScalarT, + y: _NumericOrTemporalScalarT, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericOrTemporalScalarT: ... +@overload +def add( + x: _NumericOrTemporalArrayT, + y: _NumericOrTemporalArrayT, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericOrTemporalArrayT: ... +@overload +def add( + x: Expression, y: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +@overload +def add( + x: NumericOrTemporalScalar, + y: _NumericOrTemporalArrayT, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericOrTemporalArrayT: ... +@overload +def add( + x: _NumericOrTemporalArrayT, + y: NumericOrTemporalScalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericOrTemporalArrayT: ... +@overload +def add( + x: NumericOrTemporalScalar, y: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +@overload +def add( + x: Expression, y: NumericOrTemporalScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +def add(*args, **kwargs): + """ + Add the arguments element-wise. + + Results will wrap around on integer overflow. + Use function "add_checked" if you want overflow + to return an error. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + y : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +add_checked = _clone_signature(add) +""" +Add the arguments element-wise. + +This function returns an error on overflow. For a variant that +doesn't fail on overflow, use function "add". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + +""" + +@overload +def divide( + dividend: _NumericOrTemporalScalarT, + divisor: _NumericOrTemporalScalarT, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericOrTemporalScalarT: ... +@overload +def divide( + dividend: _NumericOrTemporalArrayT, + divisor: _NumericOrTemporalArrayT, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericOrTemporalArrayT: ... +@overload +def divide( + dividend: Expression, + divisor: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload +def divide( + dividend: NumericOrTemporalScalar, + divisor: _NumericOrTemporalArrayT, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericOrTemporalArrayT: ... +@overload +def divide( + dividend: _NumericOrTemporalArrayT, + divisor: NumericOrTemporalScalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericOrTemporalArrayT: ... +@overload +def divide( + dividend: NumericOrTemporalScalar, + divisor: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... 
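For reference, a minimal usage sketch of the aggregation kernels stubbed above (`sum`, `mean`, `quantile`, `mode`) and the wrapping vs. checked arithmetic variants; the results noted in the comments are illustrative expectations, not output captured from this patch.

import pyarrow as pa
import pyarrow.compute as pc

arr = pa.array([1, 1, 2, 2, 3, 2, 2, 2])
pc.sum(arr)       # Int64Scalar 15 -- the sum keeps the input's integer type
pc.mean(arr)      # DoubleScalar 1.875 -- integers are averaged as double
pc.quantile(arr)  # DoubleArray [2.0] -- the 0.5 quantile (median) by default
pc.mode(arr, 2)   # StructArray of (mode, count) pairs, most frequent first

# Arithmetic kernels broadcast scalars against arrays; the *_checked
# variants raise ArrowInvalid on overflow instead of wrapping around.
pc.add(arr, 10)
try:
    pc.add_checked(pa.array([127], type=pa.int8()), 1)
except pa.lib.ArrowInvalid:
    pass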
+@overload +def divide( + dividend: Expression, + divisor: NumericOrTemporalScalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def divide(*args, **kwargs): + """ + Divide the arguments element-wise. + + Integer division by zero returns an error. However, integer overflow + wraps around, and floating-point division by zero returns an infinite. + Use function "divide_checked" if you want to get an error + in all the aforementioned cases. + + Parameters + ---------- + dividend : Array-like or scalar-like + Argument to compute function. + divisor : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + + """ + +divide_checked = _clone_signature(divide) +""" +Divide the arguments element-wise. + +An error is returned when trying to divide by zero, or when +integer overflow is encountered. + +Parameters +---------- +dividend : Array-like or scalar-like + Argument to compute function. +divisor : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def exp( + exponent: _FloatArrayT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _FloatArrayT: ... +@overload +def exp( + exponent: ArrayOrChunkedArray[NonFloatNumericScalar], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleArray: ... +@overload +def exp( + exponent: _FloatScalarT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _FloatScalarT: ... +@overload +def exp( + exponent: NonFloatNumericScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.DoubleScalar: ... +@overload +def exp(exponent: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... +def exp(*args, **kwargs): + """ + Compute Euler's number raised to the power of specified exponent, element-wise. + + If exponent is null the result will be null. + + Parameters + ---------- + exponent : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +multiply = _clone_signature(add) +""" +Multiply the arguments element-wise. + +Results will wrap around on integer overflow. +Use function "multiply_checked" if you want overflow +to return an error. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +multiply_checked = _clone_signature(add) +""" +Multiply the arguments element-wise. + +This function returns an error on overflow. For a variant that +doesn't fail on overflow, use function "multiply". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def negate( + x: _NumericOrDurationT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _NumericOrDurationT: ... +@overload +def negate( + x: _NumericOrDurationArrayT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _NumericOrDurationArrayT: ... 
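A small sketch of the type promotion encoded in the `exp` overloads above and of scalar broadcasting in `multiply`/`divide`; the float32-preserving behaviour is read off the `_FloatArrayT` overload rather than verified here.

import pyarrow as pa
import pyarrow.compute as pc

pc.exp(pa.array([0, 1, 2]))                      # integer input promotes to float64
pc.exp(pa.array([0.5, 1.5], type=pa.float32()))  # float32 input stays float32
pc.multiply(pa.array([2, 3]), 4)                 # scalar broadcast -> [8, 12]
pc.divide(pa.array([7, 9]), 2)                   # integer division -> [3, 4]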
+@overload +def negate(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... +def negate(*args, **kwargs): + """ + Negate the argument element-wise. + + Results will wrap around on integer overflow. + Use function "negate_checked" if you want overflow + to return an error. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +negate_checked = _clone_signature(negate) +""" +Negate the arguments element-wise. + +This function returns an error on overflow. For a variant that +doesn't fail on overflow, use function "negate". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def power( + base: _NumericScalarT, + exponent: _NumericScalarT, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericScalarT: ... +@overload +def power( + base: _NumericArrayT, + exponent: _NumericArrayT, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericArrayT: ... +@overload +def power( + base: Expression, + exponent: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload +def power( + base: _NumericArrayT, + exponent: NumericScalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericArrayT: ... +@overload +def power( + base: NumericScalar, + exponent: _NumericArrayT, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericArrayT: ... +@overload +def power( + base: NumericScalar, + exponent: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload +def power( + base: Expression, + exponent: NumericScalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def power(*args, **kwargs): + """ + Raise arguments to power element-wise. + + Integer to negative integer power returns an error. However, integer overflow + wraps around. If either base or exponent is null the result will be null. + + Parameters + ---------- + base : Array-like or scalar-like + Argument to compute function. + exponent : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +power_checked = _clone_signature(power) +""" +Raise arguments to power element-wise. + +An error is returned when integer to negative integer power is encountered, +or integer overflow is encountered. + +Parameters +---------- +base : Array-like or scalar-like + Argument to compute function. +exponent : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def sign( + x: NumericOrDurationArray, /, *, memory_pool: lib.MemoryPool | None = None +) -> ( + lib.NumericArray[lib.Int8Scalar] + | lib.NumericArray[lib.FloatScalar] + | lib.NumericArray[lib.DoubleScalar] +): ... +@overload +def sign( + x: NumericOrDurationScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.Int8Scalar | lib.FloatScalar | lib.DoubleScalar: ... +@overload +def sign(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... 
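A brief illustrative sketch of `negate`, `power`, and the checked overflow behaviour described above (expected results shown as comments):

import pyarrow as pa
import pyarrow.compute as pc

arr = pa.array([1, -2, 3])
pc.negate(arr)    # [-1, 2, -3], same integer type as the input
pc.power(arr, 2)  # [1, 4, 9]

# power_checked raises on integer overflow instead of wrapping around.
try:
    pc.power_checked(pa.array([2], type=pa.int8()), 10)  # 2**10 overflows int8
except pa.lib.ArrowInvalid:
    pass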
+def sign(*args, **kwargs): + """ + Get the signedness of the arguments element-wise. + + Output is any of (-1,1) for nonzero inputs and 0 for zero input. + NaN values return NaN. Integral values return signedness as Int8 and + floating-point values return it with the same type as the input values. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + + """ + +@overload +def sqrt(x: NumericArray, /, *, memory_pool: lib.MemoryPool | None = None) -> FloatArray: ... +@overload +def sqrt(x: NumericScalar, /, *, memory_pool: lib.MemoryPool | None = None) -> FloatScalar: ... +@overload +def sqrt(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... +def sqrt(*args, **kwargs): + """ + Takes the square root of arguments element-wise. + + A negative argument returns a NaN. For a variant that returns an + error, use function "sqrt_checked". + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + + """ + +sqrt_checked = _clone_signature(sqrt) +""" +Takes the square root of arguments element-wise. + +A negative argument returns an error. For a variant that returns a +NaN, use function "sqrt". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +subtract = _clone_signature(add) +""" +Subtract the arguments element-wise. + +Results will wrap around on integer overflow. +Use function "subtract_checked" if you want overflow +to return an error. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +subtract_checked = _clone_signature(add) +""" +Subtract the arguments element-wise. + +This function returns an error on overflow. For a variant that +doesn't fail on overflow, use function "subtract". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +# ========================= 2.1 Bit-wise functions ========================= +@overload +def bit_wise_and( + x: _NumericScalarT, y: _NumericScalarT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _NumericScalarT: ... +@overload +def bit_wise_and( + x: _NumericArrayT, + y: _NumericArrayT, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericArrayT: ... +@overload +def bit_wise_and( + x: NumericScalar, y: _NumericArrayT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _NumericArrayT: ... +@overload +def bit_wise_and( + x: _NumericArrayT, y: NumericScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> _NumericArrayT: ... +@overload +def bit_wise_and( + x: Expression, + y: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... 
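An illustrative sketch of `sign`, `sqrt`/`sqrt_checked`, and scalar broadcasting in `subtract`, following the return types declared above:

import pyarrow as pa
import pyarrow.compute as pc

pc.sign(pa.array([-5, 0, 7]))  # integer input -> Int8Array [-1, 0, 1]
pc.sqrt(pa.array([1, 4, 9]))   # promoted to double -> [1.0, 2.0, 3.0]
pc.sqrt(pa.array([-1.0]))      # [nan]; sqrt_checked raises instead
pc.subtract(pa.scalar(10), pa.array([1, 2, 3]))  # broadcast -> [9, 8, 7]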
+@overload +def bit_wise_and( + x: Expression, + y: NumericScalar | ArrayOrChunkedArray[NumericScalar], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload +def bit_wise_and( + x: NumericScalar | ArrayOrChunkedArray[NumericScalar], + y: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def bit_wise_and(*args, **kwargs): + """ + Bit-wise AND the arguments element-wise. + + Null values return null. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + y : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def bit_wise_not( + x: _NumericScalarT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _NumericScalarT: ... +@overload +def bit_wise_not( + x: _NumericArrayT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _NumericArrayT: ... +@overload +def bit_wise_not(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... +def bit_wise_not(*args, **kwargs): + """ + Bit-wise negate the arguments element-wise. + + Null values return null. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +bit_wise_or = _clone_signature(bit_wise_and) +""" +Bit-wise OR the arguments element-wise. + +Null values return null. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +bit_wise_xor = _clone_signature(bit_wise_and) +""" +Bit-wise XOR the arguments element-wise. + +Null values return null. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +shift_left = _clone_signature(bit_wise_and) +""" +Left shift `x` by `y`. + +The shift operates as if on the two's complement representation of the number. +In other words, this is equivalent to multiplying `x` by 2 to the power `y`, +even if overflow occurs. +`x` is returned if `y` (the amount to shift by) is (1) negative or +(2) greater than or equal to the precision of `x`. +Use function "shift_left_checked" if you want an invalid shift amount +to return an error. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +shift_left_checked = _clone_signature(bit_wise_and) +""" +Left shift `x` by `y`. + +The shift operates as if on the two's complement representation of the number. +In other words, this is equivalent to multiplying `x` by 2 to the power `y`, +even if overflow occurs. +An error is raised if `y` (the amount to shift by) is (1) negative or +(2) greater than or equal to the precision of `x`. +See "shift_left" for a variant that doesn't fail for an invalid shift amount. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. 
+y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +shift_right = _clone_signature(bit_wise_and) +""" +Right shift `x` by `y`. + +This is equivalent to dividing `x` by 2 to the power `y`. +`x` is returned if `y` (the amount to shift by) is: (1) negative or +(2) greater than or equal to the precision of `x`. +Use function "shift_right_checked" if you want an invalid shift amount +to return an error. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +shift_right_checked = _clone_signature(bit_wise_and) +""" +Right shift `x` by `y`. + +This is equivalent to dividing `x` by 2 to the power `y`. +An error is raised if `y` (the amount to shift by) is (1) negative or +(2) greater than or equal to the precision of `x`. +See "shift_right" for a variant that doesn't fail for an invalid shift amount + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +# ========================= 2.2 Rounding functions ========================= +@overload +def ceil(x: _FloatScalarT, /, *, memory_pool: lib.MemoryPool | None = None) -> _FloatScalarT: ... +@overload +def ceil(x: _FloatArrayT, /, *, memory_pool: lib.MemoryPool | None = None) -> _FloatArrayT: ... +@overload +def ceil(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... +def ceil(*args, **kwargs): + """ + Round up to the nearest integer. + + Compute the smallest integer value not less in magnitude than `x`. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +floor = _clone_signature(ceil) +""" +Round down to the nearest integer. + +Compute the largest integer value not greater in magnitude than `x`. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def round( + x: _NumericScalarT, + /, + ndigits: int = 0, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericScalarT: ... +@overload +def round( + x: _NumericArrayT, + /, + ndigits: int = 0, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericArrayT: ... 
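A short illustrative sketch of the bit-wise, shift, and rounding kernels stubbed above; expected results are noted in the comments.

import pyarrow as pa
import pyarrow.compute as pc

x = pa.array([0b1100, 0b1010])
pc.bit_wise_and(x, 0b0110)          # [4, 2]
pc.shift_left(pa.array([1, 2]), 3)  # [8, 16]

pc.ceil(pa.array([1.2, -1.2]))      # [2.0, -1.0]
pc.floor(pa.array([1.8]))           # [1.0]
pc.round(pa.array([2.5, 3.5]))      # half-to-even tie-breaking -> [2.0, 4.0]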
+@overload +def round( + x: Expression, + /, + ndigits: int = 0, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def round(*args, **kwargs): + """ + Round to a given precision. + + Options are used to control the number of digits and rounding mode. + Default behavior is to round to the nearest integer and + use half-to-even rule to break ties. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + ndigits : int, default 0 + Number of fractional digits to round to. + round_mode : str, default "half_to_even" + Rounding and tie-breaking mode. + Accepted values are "down", "up", "towards_zero", "towards_infinity", + "half_down", "half_up", "half_towards_zero", "half_towards_infinity", + "half_to_even", "half_to_odd". + options : pyarrow.compute.RoundOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def round_to_multiple( + x: _NumericScalarT, + /, + multiple: int = 0, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundToMultipleOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericScalarT: ... +@overload +def round_to_multiple( + x: _NumericArrayT, + /, + multiple: int = 0, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundToMultipleOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericArrayT: ... +@overload +def round_to_multiple( + x: Expression, + /, + multiple: int = 0, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundToMultipleOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def round_to_multiple(*args, **kwargs): + """ + Round to a given multiple. + + Options are used to control the rounding multiple and rounding mode. + Default behavior is to round to the nearest integer and + use half-to-even rule to break ties. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + multiple : numeric scalar, default 1.0 + Multiple to round to. Should be a scalar of a type compatible + with the argument to be rounded. + round_mode : str, default "half_to_even" + Rounding and tie-breaking mode. + Accepted values are "down", "up", "towards_zero", "towards_infinity", + "half_down", "half_up", "half_towards_zero", "half_towards_infinity", + "half_to_even", "half_to_odd". + options : pyarrow.compute.RoundToMultipleOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
+ """ + +@overload +def round_binary( + x: _NumericScalarT, + s: int | lib.Int8Scalar | lib.Int16Scalar | lib.Int32Scalar | lib.Int64Scalar, + /, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundBinaryOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericScalarT: ... +@overload +def round_binary( + x: _NumericScalarT, + s: Iterable, + /, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundBinaryOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.NumericArray[_NumericScalarT]: ... +@overload +def round_binary( + x: _NumericArrayT, + s: int | lib.Int8Scalar | lib.Int16Scalar | lib.Int32Scalar | lib.Int64Scalar | Iterable, + /, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundBinaryOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericArrayT: ... +@overload +def round_binary( + x: Expression, + s: Iterable, + /, + round_mode: Literal[ + "down", + "up", + "towards_zero", + "towards_infinity", + "half_down", + "half_up", + "half_towards_zero", + "half_towards_infinity", + "half_to_even", + "half_to_odd", + ] = "half_to_even", + *, + options: RoundBinaryOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def round_binary(*args, **kwargs): + """ + Round to the given precision. + + Options are used to control the rounding mode. + Default behavior is to use the half-to-even rule to break ties. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + s : Array-like or scalar-like + Argument to compute function. + round_mode : str, default "half_to_even" + Rounding and tie-breaking mode. + Accepted values are "down", "up", "towards_zero", "towards_infinity", + "half_down", "half_up", "half_towards_zero", "half_towards_infinity", + "half_to_even", "half_to_odd". + options : pyarrow.compute.RoundBinaryOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +trunc = _clone_signature(ceil) +""" +Compute the integral part. + +Compute the nearest integer not greater in magnitude than `x`. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +# ========================= 2.3 Logarithmic functions ========================= +@overload +def ln( + x: FloatScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.FloatScalar | lib.DoubleScalar: ... +@overload +def ln( + x: FloatArray, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... +@overload +def ln(x: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... +def ln(*args, **kwargs): + """ + Compute natural logarithm. + + Non-positive values return -inf or NaN. 
Null values return null. + Use function "ln_checked" if you want non-positive values to raise an error. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +ln_checked = _clone_signature(ln) +""" +Compute natural logarithm. + +Non-positive values raise an error. Null values return null. +Use function "ln" if you want non-positive values to return -inf or NaN. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +log10 = _clone_signature(ln) +""" +Compute base 10 logarithm. + +Non-positive values return -inf or NaN. Null values return null. +Use function "log10_checked" if you want non-positive values +to raise an error. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +log10_checked = _clone_signature(ln) +""" +Compute base 10 logarithm. + +Non-positive values raise an error. Null values return null. +Use function "log10" if you want non-positive values +to return -inf or NaN. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +log1p = _clone_signature(ln) +""" +Compute natural log of (1+x). + +Values <= -1 return -inf or NaN. Null values return null. +This function may be more precise than log(1 + x) for x close to zero. +Use function "log1p_checked" if you want invalid values to raise an error. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +log1p_checked = _clone_signature(ln) +""" +Compute natural log of (1+x). + +Values <= -1 return -inf or NaN. Null values return null. +This function may be more precise than log(1 + x) for x close to zero. +Use function "log1p" if you want invalid values to return -inf or NaN. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +log2 = _clone_signature(ln) +""" +Compute base 2 logarithm. + +Non-positive values return -inf or NaN. Null values return null. +Use function "log2_checked" if you want non-positive values +to raise an error. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +log2_checked = _clone_signature(ln) +""" +Compute base 2 logarithm. + +Non-positive values raise an error. Null values return null. +Use function "log2" if you want non-positive values +to return -inf or NaN. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def logb( + x: FloatScalar, b: FloatScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.FloatScalar | lib.DoubleScalar: ... 
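A small illustrative sketch of the logarithm family above, including the non-checked vs. checked handling of non-positive input:

import math

import pyarrow as pa
import pyarrow.compute as pc

vals = pa.array([1.0, math.e, 100.0])
pc.ln(vals)                  # [0.0, 1.0, 4.605...]
pc.log10(vals)               # [0.0, 0.434..., 2.0]
pc.ln(pa.array([0.0]))       # [-inf]; ln_checked raises for non-positive input
pc.logb(pa.array([8.0]), 2)  # logarithm in base 2 -> [3.0]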
+@overload +def logb( + x: FloatArray, b: FloatArray, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... +@overload +def logb( + x: FloatScalar, + b: FloatArray, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... +@overload +def logb( + x: FloatArray, + b: FloatScalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... +@overload +def logb( + x: Expression | Any, b: Expression | Any, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression | Any: ... +def logb(*args, **kwargs): + """ + Compute base `b` logarithm. + + Values <= 0 return -inf or NaN. Null values return null. + Use function "logb_checked" if you want non-positive values to raise an error. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + b : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +logb_checked = _clone_signature(logb) +""" +Compute base `b` logarithm. + +Values <= 0 return -inf or NaN. Null values return null. +Use function "logb" if you want non-positive values to return -inf or NaN. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +b : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +# ========================= 2.4 Trigonometric functions ========================= +acos = _clone_signature(ln) +""" +Compute the inverse cosine. + +NaN is returned for invalid input values; +to raise an error instead, see "acos_checked". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +acos_checked = _clone_signature(ln) +""" +Compute the inverse cosine. + +Invalid input values raise an error; +to return NaN instead, see "acos". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +asin = _clone_signature(ln) +""" +Compute the inverse sine. + +NaN is returned for invalid input values; +to raise an error instead, see "asin_checked". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +asin_checked = _clone_signature(ln) +""" +Compute the inverse sine. + +Invalid input values raise an error; +to return NaN instead, see "asin". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +atan = _clone_signature(ln) +""" +Compute the inverse tangent of x. + +The return value is in the range [-pi/2, pi/2]; +for a full return range [-pi, pi], see "atan2". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. 
+memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +cos = _clone_signature(ln) +""" +Compute the cosine. + +NaN is returned for invalid input values; +to raise an error instead, see "cos_checked". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +cos_checked = _clone_signature(ln) +""" +Compute the cosine. + +Infinite values raise an error; +to return NaN instead, see "cos". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +sin = _clone_signature(ln) +""" +Compute the sine. + +NaN is returned for invalid input values; +to raise an error instead, see "sin_checked". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +sin_checked = _clone_signature(ln) +""" +Compute the sine. + +Invalid input values raise an error; +to return NaN instead, see "sin". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +tan = _clone_signature(ln) +""" +Compute the tangent. + +NaN is returned for invalid input values; +to raise an error instead, see "tan_checked". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +tan_checked = _clone_signature(ln) +""" +Compute the tangent. + +Infinite values raise an error; +to return NaN instead, see "tan". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def atan2( + y: FloatScalar, x: FloatScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.FloatScalar | lib.DoubleScalar: ... +@overload +def atan2( + y: FloatArray, x: FloatArray, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... +@overload +def atan2( + y: FloatArray, + x: FloatScalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... +@overload +def atan2( + y: FloatScalar, + x: FloatArray, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.NumericArray[lib.FloatScalar] | lib.NumericArray[lib.DoubleScalar]: ... +@overload +def atan2( + y: Expression, x: Any, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +@overload +def atan2( + y: Any, x: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +def atan2(*args, **kwargs): + """ + Compute the inverse tangent of y/x. + + The return value is in the range [-pi, pi]. + + Parameters + ---------- + y : Array-like or scalar-like + Argument to compute function. + x : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
+ """ + +# ========================= 2.5 Comparisons functions ========================= +@overload +def equal( + x: lib.Scalar, y: lib.Scalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar: ... +@overload +def equal( + x: lib.Scalar, + y: lib.Array | lib.ChunkedArray, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanArray: ... +@overload +def equal( + x: lib.Array | lib.ChunkedArray, + y: lib.Scalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanArray: ... +@overload +def equal( + x: lib.Array | lib.ChunkedArray, + y: lib.Array | lib.ChunkedArray, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanArray: ... +@overload +def equal( + x: Expression, + y: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload +def equal( + x: lib.Scalar, + y: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload +def equal( + x: Expression, + y: lib.Scalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def equal(*args, **kwargs): + """ + Compare values for equality (x == y). + + A null on either side emits a null comparison result. + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + y : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +greater = _clone_signature(equal) +""" +Compare values for ordered inequality (x > y). + +A null on either side emits a null comparison result. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +greater_equal = _clone_signature(equal) +""" +Compare values for ordered inequality (x >= y). + +A null on either side emits a null comparison result. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +less = _clone_signature(equal) +""" +Compare values for ordered inequality (x < y). + +A null on either side emits a null comparison result. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +less_equal = _clone_signature(equal) +""" +Compare values for ordered inequality (x <= y). + +A null on either side emits a null comparison result. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +not_equal = _clone_signature(equal) +""" +Compare values for inequality (x != y). + +A null on either side emits a null comparison result. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. 
+memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def max_element_wise( + *args: ScalarOrArray[_Scalar_CoT], + skip_nulls: bool = True, + options: ElementWiseAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _Scalar_CoT: ... +@overload +def max_element_wise( + *args: Expression, + skip_nulls: bool = True, + options: ElementWiseAggregateOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def max_element_wise(*args, **kwargs): + """ + Find the element-wise maximum value. + + Nulls are ignored (by default) or propagated. + NaN is preferred over null, but not over any valid value. + + Parameters + ---------- + *args : Array-like or scalar-like + Argument to compute function. + skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. + options : pyarrow.compute.ElementWiseAggregateOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +min_element_wise = _clone_signature(max_element_wise) +""" +Find the element-wise minimum value. + +Nulls are ignored (by default) or propagated. +NaN is preferred over null, but not over any valid value. + +Parameters +---------- +*args : Array-like or scalar-like + Argument to compute function. +skip_nulls : bool, default True + Whether to skip (ignore) nulls in the input. + If False, any null in the input forces the output to null. +options : pyarrow.compute.ElementWiseAggregateOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +# ========================= 2.6 Logical functions ========================= +@overload +def and_( + x: lib.BooleanScalar, y: lib.BooleanScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar: ... +@overload +def and_( + x: BooleanArray, + y: BooleanArray, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanArray: ... +@overload +def and_( + x: Expression, + y: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload +def and_( + x: lib.BooleanScalar, + y: BooleanArray, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanArray: ... +@overload +def and_( + x: BooleanArray, + y: lib.BooleanScalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanArray: ... +@overload +def and_( + x: lib.BooleanScalar, + y: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload +def and_( + x: Expression, + y: lib.BooleanScalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload +def and_( + x: ScalarOrArray[lib.BooleanScalar], + y: ScalarOrArray[lib.BooleanScalar], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> ScalarOrArray[lib.BooleanScalar]: ... +def and_(*args, **kwargs): + """ + Logical 'and' boolean values. + + When a null is encountered in either input, a null is output. + For a different null behavior, see function "and_kleene". + + Parameters + ---------- + x : Array-like or scalar-like + Argument to compute function. + y : Array-like or scalar-like + Argument to compute function. 
+ memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +and_kleene = _clone_signature(and_) +""" +Logical 'and' boolean values (Kleene logic). + +This function behaves as follows with nulls: + +- true and null = null +- null and true = null +- false and null = false +- null and false = false +- null and null = null + +In other words, in this context a null value really means "unknown", +and an unknown value 'and' false is always false. +For a different null behavior, see function "and". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +and_not = _clone_signature(and_) +""" +Logical 'and not' boolean values. + +When a null is encountered in either input, a null is output. +For a different null behavior, see function "and_not_kleene". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +and_not_kleene = _clone_signature(and_) +""" +Logical 'and not' boolean values (Kleene logic). + +This function behaves as follows with nulls: + +- true and not null = null +- null and not false = null +- false and not null = false +- null and not true = false +- null and not null = null + +In other words, in this context a null value really means "unknown", +and an unknown value 'and not' true is always false, as is false +'and not' an unknown value. +For a different null behavior, see function "and_not". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +or_ = _clone_signature(and_) +""" +Logical 'or' boolean values. + +When a null is encountered in either input, a null is output. +For a different null behavior, see function "or_kleene". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +or_kleene = _clone_signature(and_) +""" +Logical 'or' boolean values (Kleene logic). + +This function behaves as follows with nulls: + +- true or null = true +- null or true = true +- false or null = null +- null or false = null +- null or null = null + +In other words, in this context a null value really means "unknown", +and an unknown value 'or' true is always true. +For a different null behavior, see function "or". + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +xor = _clone_signature(and_) +""" +Logical 'xor' boolean values. + +When a null is encountered in either input, a null is output. + +Parameters +---------- +x : Array-like or scalar-like + Argument to compute function. +y : Array-like or scalar-like + Argument to compute function. 
+memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def invert( + x: lib.BooleanScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar: ... +@overload +def invert( + x: _BooleanArrayT, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _BooleanArrayT: ... +@overload +def invert( + x: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def invert(*args, **kwargs): + """ + Invert boolean values. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +# ========================= 2.10 String predicates ========================= +@overload +def ascii_is_alnum( + strings: StringScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar: ... +@overload +def ascii_is_alnum( + strings: StringArray, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanArray: ... +@overload +def ascii_is_alnum( + strings: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +def ascii_is_alnum(*args, **kwargs): + """ + Classify strings as ASCII alphanumeric. + + For each string in `strings`, emit true iff the string is non-empty + and consists only of alphanumeric ASCII characters. Null strings emit null. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +ascii_is_alpha = _clone_signature(ascii_is_alnum) +""" +Classify strings as ASCII alphabetic. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of alphabetic ASCII characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +ascii_is_decimal = _clone_signature(ascii_is_alnum) +""" +Classify strings as ASCII decimal. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of decimal ASCII characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +ascii_is_lower = _clone_signature(ascii_is_alnum) +""" +Classify strings as ASCII lowercase. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of lowercase ASCII characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +ascii_is_printable = _clone_signature(ascii_is_alnum) +""" +Classify strings as ASCII printable. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of printable ASCII characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
+""" +ascii_is_space = _clone_signature(ascii_is_alnum) +""" +Classify strings as ASCII whitespace. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of whitespace ASCII characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +ascii_is_upper = _clone_signature(ascii_is_alnum) +""" +Classify strings as ASCII uppercase. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of uppercase ASCII characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_is_alnum = _clone_signature(ascii_is_alnum) +""" +Classify strings as alphanumeric. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of alphanumeric Unicode characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_is_alpha = _clone_signature(ascii_is_alnum) +""" +Classify strings as alphabetic. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of alphabetic Unicode characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_is_decimal = _clone_signature(ascii_is_alnum) +""" +Classify strings as decimal. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of decimal Unicode characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_is_digit = _clone_signature(ascii_is_alnum) +""" +Classify strings as digits. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of Unicode digits. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_is_lower = _clone_signature(ascii_is_alnum) +""" +Classify strings as lowercase. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of lowercase Unicode characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_is_numeric = _clone_signature(ascii_is_alnum) +""" +Classify strings as numeric. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of numeric Unicode characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. 
+memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_is_printable = _clone_signature(ascii_is_alnum) +""" +Classify strings as printable. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of printable Unicode characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_is_space = _clone_signature(ascii_is_alnum) +""" +Classify strings as whitespace. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of whitespace Unicode characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_is_upper = _clone_signature(ascii_is_alnum) +""" +Classify strings as uppercase. + +For each string in `strings`, emit true iff the string is non-empty +and consists only of uppercase Unicode characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +ascii_is_title = _clone_signature(ascii_is_alnum) +""" +Classify strings as ASCII titlecase. + +For each string in `strings`, emit true iff the string is title-cased, +i.e. it has at least one cased character, each uppercase character +follows an uncased character, and each lowercase character follows +an uppercase character. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_is_title = _clone_signature(ascii_is_alnum) +""" +Classify strings as titlecase. + +For each string in `strings`, emit true iff the string is title-cased, +i.e. it has at least one cased character, each uppercase character +follows an uncased character, and each lowercase character follows +an uppercase character. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +string_is_ascii = _clone_signature(ascii_is_alnum) +""" +Classify strings as ASCII. + +For each string in `strings`, emit true iff the string consists only +of ASCII characters. Null strings emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +# ========================= 2.11 String transforms ========================= +@overload +def ascii_capitalize( + strings: _StringScalarT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _StringScalarT: ... +@overload +def ascii_capitalize( + strings: _StringArrayT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _StringArrayT: ... +@overload +def ascii_capitalize( + strings: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +def ascii_capitalize(*args, **kwargs): + """ + Capitalize the first character of ASCII input. 
+
+    For each string in `strings`, return a capitalized version.
+
+    This function assumes the input is fully ASCII. If it may contain
+    non-ASCII characters, use "utf8_capitalize" instead.
+
+    Parameters
+    ----------
+    strings : Array-like or scalar-like
+        Argument to compute function.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+ascii_lower = _clone_signature(ascii_capitalize)
+"""
+Transform ASCII input to lowercase.
+
+For each string in `strings`, return a lowercase version.
+
+This function assumes the input is fully ASCII. If it may contain
+non-ASCII characters, use "utf8_lower" instead.
+
+Parameters
+----------
+strings : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+ascii_reverse = _clone_signature(ascii_capitalize)
+"""
+Reverse ASCII input.
+
+For each ASCII string in `strings`, return a reversed version.
+
+This function assumes the input is fully ASCII. If it may contain
+non-ASCII characters, use "utf8_reverse" instead.
+
+Parameters
+----------
+strings : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+ascii_swapcase = _clone_signature(ascii_capitalize)
+"""
+Transform ASCII input by inverting casing.
+
+For each string in `strings`, return a string with opposite casing.
+
+This function assumes the input is fully ASCII. If it may contain
+non-ASCII characters, use "utf8_swapcase" instead.
+
+Parameters
+----------
+strings : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+ascii_title = _clone_signature(ascii_capitalize)
+"""
+Titlecase each word of ASCII input.
+
+For each string in `strings`, return a titlecased version.
+Each word in the output will start with an uppercase character and its
+remaining characters will be lowercase.
+
+This function assumes the input is fully ASCII. If it may contain
+non-ASCII characters, use "utf8_title" instead.
+
+Parameters
+----------
+strings : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+ascii_upper = _clone_signature(ascii_capitalize)
+"""
+Transform ASCII input to uppercase.
+
+For each string in `strings`, return an uppercase version.
+
+This function assumes the input is fully ASCII. If it may contain
+non-ASCII characters, use "utf8_upper" instead.
+
+Parameters
+----------
+strings : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+
+@overload
+def binary_length(
+    strings: lib.BinaryScalar | lib.StringScalar, /, *, memory_pool: lib.MemoryPool | None = None
+) -> lib.Int32Scalar: ...
+@overload
+def binary_length(
+    strings: lib.LargeBinaryScalar | lib.LargeStringScalar,
+    /,
+    *,
+    memory_pool: lib.MemoryPool | None = None,
+) -> lib.Int64Scalar: ...
+@overload
+def binary_length(
+    strings: lib.BinaryArray
+    | lib.StringArray
+    | lib.ChunkedArray[lib.BinaryScalar]
+    | lib.ChunkedArray[lib.StringScalar],
+    /,
+    *,
+    memory_pool: lib.MemoryPool | None = None,
+) -> lib.Int32Array: ...
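A short sketch of how the case transforms and the binary_length overloads above are meant to be used (illustrative inputs; the typed return widths assume the stubs mirror the runtime kernels):

import pyarrow as pa
import pyarrow.compute as pc

s = pa.array(["Voilà", None])

# ascii_upper only maps the ASCII letter range; utf8_upper applies full
# Unicode case mapping.
pc.utf8_upper(s)   # ["VOILÀ", null]

# binary_length is typed as int32 for string/binary inputs and int64 for
# the large_* variants; lengths are measured in bytes.
pc.binary_length(s)                                 # [6, null] ("à" is two bytes)
pc.binary_length(s.cast(pa.large_string())).type    # int64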
+@overload +def binary_length( + strings: lib.LargeBinaryArray + | lib.LargeStringArray + | lib.ChunkedArray[lib.LargeBinaryScalar] + | lib.ChunkedArray[lib.LargeStringScalar], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Array: ... +@overload +def binary_length( + strings: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def binary_length(*args, **kwargs): + """ + Compute string lengths. + + For each string in `strings`, emit its length of bytes. + Null values emit null. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def binary_repeat( + strings: _StringOrBinaryScalarT, + num_repeats: int, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _StringOrBinaryScalarT: ... +@overload +def binary_repeat( + strings: _StringOrBinaryScalarT, + num_repeats: list[int] | list[int | None], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Array[_StringOrBinaryScalarT]: ... +@overload +def binary_repeat( + strings: _StringOrBinaryArrayT, + num_repeats: int | list[int] | list[int | None], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> _StringOrBinaryArrayT: ... +@overload +def binary_repeat( + strings: Expression, + num_repeats: int | list[int] | list[int | None], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def binary_repeat(*args, **kwargs): + """ + Repeat a binary string. + + For each binary string in `strings`, return a replicated version. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + num_repeats : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def binary_replace_slice( + strings: _StringOrBinaryScalarT, + /, + start: int, + stop: int, + replacement: str | bytes, + *, + options: ReplaceSliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringOrBinaryScalarT: ... +@overload +def binary_replace_slice( + strings: _StringOrBinaryArrayT, + /, + start: int, + stop: int, + replacement: str | bytes, + *, + options: ReplaceSliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringOrBinaryArrayT: ... +@overload +def binary_replace_slice( + strings: Expression, + /, + start: int, + stop: int, + replacement: str | bytes, + *, + options: ReplaceSliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def binary_replace_slice(*args, **kwargs): + """ + Replace a slice of a binary string. + + For each string in `strings`, replace a slice of the string defined by `start` + and `stop` indices with the given `replacement`. `start` is inclusive + and `stop` is exclusive, and both are measured in bytes. + Null values emit null. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + start : int + Index to start slicing at (inclusive). + stop : int + Index to stop slicing at (exclusive). + replacement : str + What to replace the slice with. + options : pyarrow.compute.ReplaceSliceOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
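A minimal sketch of binary_repeat and binary_replace_slice (invented inputs):

import pyarrow as pa
import pyarrow.compute as pc

s = pa.array(["hello", None])

pc.binary_repeat(s, 2)   # ["hellohello", null]
pc.binary_replace_slice(s, start=1, stop=3, replacement="XY")
# ["hXYlo", null] -- the slice offsets are measured in bytes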
+ """ + +@overload +def binary_reverse( + strings: _BinaryScalarT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _BinaryScalarT: ... +@overload +def binary_reverse( + strings: _BinaryArrayT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _BinaryArrayT: ... +@overload +def binary_reverse( + strings: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +def binary_reverse(*args, **kwargs): + """ + Reverse binary input. + + For each binary string in `strings`, return a reversed version. + + This function reverses the binary data at a byte-level. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def replace_substring( + strings: _StringScalarT, + /, + pattern: str | bytes, + replacement: str | bytes, + *, + max_replacements: int | None = None, + options: ReplaceSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT: ... +@overload +def replace_substring( + strings: _StringArrayT, + /, + pattern: str | bytes, + replacement: str | bytes, + *, + max_replacements: int | None = None, + options: ReplaceSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringArrayT: ... +@overload +def replace_substring( + strings: Expression, + /, + pattern: str | bytes, + replacement: str | bytes, + *, + max_replacements: int | None = None, + options: ReplaceSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def replace_substring(*args, **kwargs): + """ + Replace matching non-overlapping substrings with replacement. + + For each string in `strings`, replace non-overlapping substrings that match + the given literal `pattern` with the given `replacement`. + If `max_replacements` is given and not equal to -1, it limits the + maximum amount replacements per input, counted from the left. + Null values emit null. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + pattern : str + Substring pattern to look for inside input values. + replacement : str + What to replace the pattern with. + max_replacements : int or None, default None + The maximum number of strings to replace in each + input value (unlimited if None). + options : pyarrow.compute.ReplaceSubstringOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +replace_substring_regex = _clone_signature(replace_substring) +""" +Replace matching non-overlapping substrings with replacement. + +For each string in `strings`, replace non-overlapping substrings that match +the given regular expression `pattern` with the given `replacement`. +If `max_replacements` is given and not equal to -1, it limits the +maximum amount replacements per input, counted from the left. +Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +pattern : str + Substring pattern to look for inside input values. +replacement : str + What to replace the pattern with. +max_replacements : int or None, default None + The maximum number of strings to replace in each + input value (unlimited if None). +options : pyarrow.compute.ReplaceSubstringOptions, optional + Alternative way of passing options. 
+memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def utf8_capitalize( + strings: _StringScalarT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _StringScalarT: ... +@overload +def utf8_capitalize( + strings: _StringArrayT, /, *, memory_pool: lib.MemoryPool | None = None +) -> _StringArrayT: ... +@overload +def utf8_capitalize( + strings: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +def utf8_capitalize(*args, **kwargs): + """ + Capitalize the first character of input. + + For each string in `strings`, return a capitalized version, + with the first character uppercased and the others lowercased. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def utf8_length( + strings: lib.StringScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.Int32Scalar: ... +@overload +def utf8_length( + strings: lib.LargeStringScalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar: ... +@overload +def utf8_length( + strings: lib.StringArray | lib.ChunkedArray[lib.StringScalar], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int32Array: ... +@overload +def utf8_length( + strings: lib.LargeStringArray | lib.ChunkedArray[lib.LargeStringScalar], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Array: ... +@overload +def utf8_length( + strings: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def utf8_length(*args, **kwargs): + """ + Compute UTF8 string lengths. + + For each string in `strings`, emit its length in UTF8 characters. + Null values emit null. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +utf8_lower = _clone_signature(utf8_capitalize) +""" +Transform input to lowercase. + +For each string in `strings`, return a lowercase version. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def utf8_replace_slice( + strings: _StringScalarT, + /, + start: int, + stop: int, + replacement: str | bytes, + *, + options: ReplaceSliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT: ... +@overload +def utf8_replace_slice( + strings: _StringArrayT, + /, + start: int, + stop: int, + replacement: str | bytes, + *, + options: ReplaceSliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringArrayT: ... +@overload +def utf8_replace_slice( + strings: Expression, + /, + start: int, + stop: int, + replacement: str | bytes, + *, + options: ReplaceSliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def utf8_replace_slice(*args, **kwargs): + """ + Replace a slice of a string. + + For each string in `strings`, replace a slice of the string defined by `start` + and `stop` indices with the given `replacement`. `start` is inclusive + and `stop` is exclusive, and both are measured in UTF8 characters. + Null values emit null. 
+
+    Parameters
+    ----------
+    strings : Array-like or scalar-like
+        Argument to compute function.
+    start : int
+        Index to start slicing at (inclusive).
+    stop : int
+        Index to stop slicing at (exclusive).
+    replacement : str
+        What to replace the slice with.
+    options : pyarrow.compute.ReplaceSliceOptions, optional
+        Alternative way of passing options.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+utf8_reverse = _clone_signature(utf8_capitalize)
+"""
+Reverse input.
+
+For each string in `strings`, return a reversed version.
+
+This function operates on Unicode codepoints, not grapheme
+clusters. Hence, it will not correctly reverse grapheme clusters
+composed of multiple codepoints.
+
+Parameters
+----------
+strings : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+utf8_swapcase = _clone_signature(utf8_capitalize)
+"""
+Transform input lowercase characters to uppercase and uppercase characters to lowercase.
+
+For each string in `strings`, return an opposite case version.
+
+Parameters
+----------
+strings : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+utf8_title = _clone_signature(utf8_capitalize)
+"""
+Titlecase each word of input.
+
+For each string in `strings`, return a titlecased version.
+Each word in the output will start with an uppercase character and its
+remaining characters will be lowercase.
+
+Parameters
+----------
+strings : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+utf8_upper = _clone_signature(utf8_capitalize)
+"""
+Transform input to uppercase.
+
+For each string in `strings`, return an uppercase version.
+
+Parameters
+----------
+strings : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+
+# ========================= 2.12 String padding =========================
+@overload
+def ascii_center(
+    strings: _StringScalarT,
+    /,
+    width: int,
+    padding: str = " ",
+    lean_left_on_odd_padding: bool = True,
+    *,
+    options: PadOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> _StringScalarT: ...
+@overload
+def ascii_center(
+    strings: _StringArrayT,
+    /,
+    width: int,
+    padding: str = " ",
+    lean_left_on_odd_padding: bool = True,
+    *,
+    options: PadOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> _StringArrayT: ...
+@overload
+def ascii_center(
+    strings: Expression,
+    /,
+    width: int,
+    padding: str = " ",
+    lean_left_on_odd_padding: bool = True,
+    *,
+    options: PadOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> Expression: ...
+def ascii_center(*args, **kwargs):
+    """
+    Center strings by padding with a given character.
+
+    For each string in `strings`, emit a centered string by padding both sides
+    with the given ASCII character.
+    Null values emit null.
+
+    Parameters
+    ----------
+    strings : Array-like or scalar-like
+        Argument to compute function.
+    width : int
+        Desired string length.
+    padding : str, default " "
+        What to pad the string with. Should be one byte or codepoint.
+ lean_left_on_odd_padding : bool, default True + What to do if there is an odd number of padding characters (in case + of centered padding). Defaults to aligning on the left (i.e. adding + the extra padding character on the right). + options : pyarrow.compute.PadOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +ascii_lpad = _clone_signature(ascii_center) +""" +Right-align strings by padding with a given character. + +For each string in `strings`, emit a right-aligned string by prepending +the given ASCII character. +Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +width : int + Desired string length. +padding : str, default " " + What to pad the string with. Should be one byte or codepoint. +lean_left_on_odd_padding : bool, default True + What to do if there is an odd number of padding characters (in case + of centered padding). Defaults to aligning on the left (i.e. adding + the extra padding character on the right). +options : pyarrow.compute.PadOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +ascii_rpad = _clone_signature(ascii_center) +""" +Left-align strings by padding with a given character. + +For each string in `strings`, emit a left-aligned string by appending +the given ASCII character. +Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +width : int + Desired string length. +padding : str, default " " + What to pad the string with. Should be one byte or codepoint. +lean_left_on_odd_padding : bool, default True + What to do if there is an odd number of padding characters (in case + of centered padding). Defaults to aligning on the left (i.e. adding + the extra padding character on the right). +options : pyarrow.compute.PadOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_center = _clone_signature(ascii_center) +""" +Center strings by padding with a given character. + +For each string in `strings`, emit a centered string by padding both sides +with the given UTF8 codeunit. +Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +width : int + Desired string length. +padding : str, default " " + What to pad the string with. Should be one byte or codepoint. +lean_left_on_odd_padding : bool, default True + What to do if there is an odd number of padding characters (in case + of centered padding). Defaults to aligning on the left (i.e. adding + the extra padding character on the right). +options : pyarrow.compute.PadOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_lpad = _clone_signature(ascii_center) +""" +Right-align strings by padding with a given character. + +For each string in `strings`, emit a right-aligned string by prepending +the given UTF8 codeunit. +Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +width : int + Desired string length. +padding : str, default " " + What to pad the string with. 
Should be one byte or codepoint. +lean_left_on_odd_padding : bool, default True + What to do if there is an odd number of padding characters (in case + of centered padding). Defaults to aligning on the left (i.e. adding + the extra padding character on the right). +options : pyarrow.compute.PadOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_rpad = _clone_signature(ascii_center) +""" +Left-align strings by padding with a given character. + +For each string in `strings`, emit a left-aligned string by appending +the given UTF8 codeunit. +Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +width : int + Desired string length. +padding : str, default " " + What to pad the string with. Should be one byte or codepoint. +lean_left_on_odd_padding : bool, default True + What to do if there is an odd number of padding characters (in case + of centered padding). Defaults to aligning on the left (i.e. adding + the extra padding character on the right). +options : pyarrow.compute.PadOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +# ========================= 2.13 String trimming ========================= +@overload +def ascii_ltrim( + strings: _StringScalarT, + /, + characters: str, + *, + options: TrimOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT: ... +@overload +def ascii_ltrim( + strings: _StringArrayT, + /, + characters: str, + *, + options: TrimOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringArrayT: ... +@overload +def ascii_ltrim( + strings: Expression, + /, + characters: str, + *, + options: TrimOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def ascii_ltrim(*args, **kwargs): + """ + Trim leading characters. + + For each string in `strings`, remove any leading characters + from the `characters` option (as given in TrimOptions). + Null values emit null. + Both the `strings` and the `characters` are interpreted as + ASCII; to trim non-ASCII characters, use `utf8_ltrim`. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + characters : str + Individual characters to be trimmed from the string. + options : pyarrow.compute.TrimOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +ascii_rtrim = _clone_signature(ascii_ltrim) +""" +Trim trailing characters. + +For each string in `strings`, remove any trailing characters +from the `characters` option (as given in TrimOptions). +Null values emit null. +Both the `strings` and the `characters` are interpreted as +ASCII; to trim non-ASCII characters, use `utf8_rtrim`. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +characters : str + Individual characters to be trimmed from the string. +options : pyarrow.compute.TrimOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +ascii_trim = _clone_signature(ascii_ltrim) +""" +Trim leading and trailing characters. 
+ +For each string in `strings`, remove any leading or trailing characters +from the `characters` option (as given in TrimOptions). +Null values emit null. +Both the `strings` and the `characters` are interpreted as +ASCII; to trim non-ASCII characters, use `utf8_trim`. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +characters : str + Individual characters to be trimmed from the string. +options : pyarrow.compute.TrimOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_ltrim = _clone_signature(ascii_ltrim) +""" +Trim leading characters. + +For each string in `strings`, remove any leading characters +from the `characters` option (as given in TrimOptions). +Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +characters : str + Individual characters to be trimmed from the string. +options : pyarrow.compute.TrimOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_rtrim = _clone_signature(ascii_ltrim) +""" +Trim trailing characters. + +For each string in `strings`, remove any trailing characters +from the `characters` option (as given in TrimOptions). +Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +characters : str + Individual characters to be trimmed from the string. +options : pyarrow.compute.TrimOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_trim = _clone_signature(ascii_ltrim) +""" +Trim leading and trailing characters. + +For each string in `strings`, remove any leading or trailing characters +from the `characters` option (as given in TrimOptions). +Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +characters : str + Individual characters to be trimmed from the string. +options : pyarrow.compute.TrimOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def ascii_ltrim_whitespace( + strings: _StringScalarT, + /, + *, + options: TrimOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT: ... +@overload +def ascii_ltrim_whitespace( + strings: _StringArrayT, + /, + *, + options: TrimOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringArrayT: ... +@overload +def ascii_ltrim_whitespace( + strings: Expression, + /, + *, + options: TrimOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def ascii_ltrim_whitespace(*args, **kwargs): + """ + Trim leading ASCII whitespace characters. + + For each string in `strings`, emit a string with leading ASCII whitespace + characters removed. Use `utf8_ltrim_whitespace` to trim leading Unicode + whitespace characters. Null values emit null. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
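As an illustration of the trimming kernels above (a sketch with invented values):

import pyarrow as pa
import pyarrow.compute as pc

s = pa.array(["  xyx  ", None])

pc.utf8_trim_whitespace(s)     # ["xyx", null]
pc.ascii_ltrim_whitespace(s)   # ["xyx  ", null] (only leading whitespace removed)
pc.utf8_trim(pa.array(["xyx"]), characters="x")   # ["y"]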
+ """ + +ascii_rtrim_whitespace = _clone_signature(ascii_ltrim_whitespace) +""" +Trim trailing ASCII whitespace characters. + +For each string in `strings`, emit a string with trailing ASCII whitespace +characters removed. Use `utf8_rtrim_whitespace` to trim trailing Unicode +whitespace characters. Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +ascii_trim_whitespace = _clone_signature(ascii_ltrim_whitespace) +""" +Trim leading and trailing ASCII whitespace characters. + +For each string in `strings`, emit a string with leading and trailing ASCII +whitespace characters removed. Use `utf8_trim_whitespace` to trim Unicode +whitespace characters. Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_ltrim_whitespace = _clone_signature(ascii_ltrim_whitespace) +""" +Trim leading whitespace characters. + +For each string in `strings`, emit a string with leading whitespace +characters removed, where whitespace characters are defined by the Unicode +standard. Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_rtrim_whitespace = _clone_signature(ascii_ltrim_whitespace) +""" +Trim trailing whitespace characters. + +For each string in `strings`, emit a string with trailing whitespace +characters removed, where whitespace characters are defined by the Unicode +standard. Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_trim_whitespace = _clone_signature(ascii_ltrim_whitespace) +""" +Trim leading and trailing whitespace characters. + +For each string in `strings`, emit a string with leading and trailing +whitespace characters removed, where whitespace characters are defined +by the Unicode standard. Null values emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +# ========================= 2.14 String splitting ========================= +@overload +def ascii_split_whitespace( + strings: _StringScalarT, + /, + *, + max_splits: int | None = None, + reverse: bool = False, + options: SplitOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.ListArray[_StringScalarT]: ... +@overload +def ascii_split_whitespace( + strings: lib.Array[lib.Scalar[_DataTypeT]], + /, + *, + max_splits: int | None = None, + reverse: bool = False, + options: SplitOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.ListArray[lib.ListScalar[_DataTypeT]]: ... +@overload +def ascii_split_whitespace( + strings: Expression, + /, + *, + max_splits: int | None = None, + reverse: bool = False, + options: SplitOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... 
+def ascii_split_whitespace(*args, **kwargs): + """ + Split string according to any ASCII whitespace. + + Split each string according any non-zero length sequence of ASCII + whitespace characters. The output for each string input is a list + of strings. + + The maximum number of splits and direction of splitting + (forward, reverse) can optionally be defined in SplitOptions. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + max_splits : int or None, default None + Maximum number of splits for each input value (unlimited if None). + reverse : bool, default False + Whether to start splitting from the end of each input value. + This only has an effect if `max_splits` is not None. + options : pyarrow.compute.SplitOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def split_pattern( + strings: _StringOrBinaryScalarT, + /, + pattern: str, + *, + max_splits: int | None = None, + reverse: bool = False, + options: SplitOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.ListArray[_StringOrBinaryScalarT]: ... +@overload +def split_pattern( + strings: lib.Array[lib.Scalar[_DataTypeT]], + /, + pattern: str, + *, + max_splits: int | None = None, + reverse: bool = False, + options: SplitPatternOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.ListArray[lib.ListScalar[_DataTypeT]]: ... +@overload +def split_pattern( + strings: Expression, + /, + pattern: str, + *, + max_splits: int | None = None, + reverse: bool = False, + options: SplitPatternOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def split_pattern(*args, **kwargs): + """ + Split string according to separator. + + Split each string according to the exact `pattern` defined in + SplitPatternOptions. The output for each string input is a list + of strings. + + The maximum number of splits and direction of splitting + (forward, reverse) can optionally be defined in SplitPatternOptions. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + pattern : str + String pattern to split on. + max_splits : int or None, default None + Maximum number of splits for each input value (unlimited if None). + reverse : bool, default False + Whether to start splitting from the end of each input value. + This only has an effect if `max_splits` is not None. + options : pyarrow.compute.SplitPatternOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +split_pattern_regex = _clone_signature(split_pattern) +""" +Split string according to regex pattern. + +Split each string according to the regex `pattern` defined in +SplitPatternOptions. The output for each string input is a list +of strings. + +The maximum number of splits and direction of splitting +(forward, reverse) can optionally be defined in SplitPatternOptions. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +pattern : str + String pattern to split on. +max_splits : int or None, default None + Maximum number of splits for each input value (unlimited if None). +reverse : bool, default False + Whether to start splitting from the end of each input value. + This only has an effect if `max_splits` is not None. 
+options : pyarrow.compute.SplitPatternOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +utf8_split_whitespace = _clone_signature(ascii_split_whitespace) +""" +Split string according to any Unicode whitespace. + +Split each string according any non-zero length sequence of Unicode +whitespace characters. The output for each string input is a list +of strings. + +The maximum number of splits and direction of splitting +(forward, reverse) can optionally be defined in SplitOptions. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +max_splits : int or None, default None + Maximum number of splits for each input value (unlimited if None). +reverse : bool, default False + Whether to start splitting from the end of each input value. + This only has an effect if `max_splits` is not None. +options : pyarrow.compute.SplitOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +# ========================= 2.15 String component extraction ========================= +@overload +def extract_regex( + strings: StringOrBinaryScalar, + /, + pattern: str, + *, + options: ExtractRegexOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StructScalar: ... +@overload +def extract_regex( + strings: StringOrBinaryArray, + /, + pattern: str, + *, + options: ExtractRegexOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StructArray: ... +@overload +def extract_regex( + strings: Expression, + /, + pattern: str, + *, + options: ExtractRegexOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def extract_regex(*args, **kwargs): + """ + Extract substrings captured by a regex pattern. + + For each string in `strings`, match the regular expression and, if + successful, emit a struct with field names and values coming from the + regular expression's named capture groups. If the input is null or the + regular expression fails matching, a null output value is emitted. + + Regular expression matching is done using the Google RE2 library. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + pattern : str + Regular expression with named capture fields. + options : pyarrow.compute.ExtractRegexOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +# ========================= 2.16 String join ========================= +def binary_join( + strings, separator, /, *, memory_pool: lib.MemoryPool | None = None +) -> StringScalar | StringArray: + """ + Join a list of strings together with a separator. + + Concatenate the strings in `list`. The `separator` is inserted + between each given string. + Any null input and any null `list` element emits a null output. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + separator : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
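A hedged sketch of the splitting, extraction and join kernels documented here (inputs invented; extract_regex relies on RE2 named capture groups):

import pyarrow as pa
import pyarrow.compute as pc

s = pa.array(["a,b,c", None])

pc.split_pattern(s, pattern=",")                       # [["a", "b", "c"], null]
pc.binary_join(pc.split_pattern(s, pattern=","), "-")  # ["a-b-c", null]
pc.extract_regex(pa.array(["alpha-1"]), pattern=r"(?P<key>\w+)-(?P<num>\d+)")
# -> struct values like {"key": "alpha", "num": "1"}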
+ """ + +@overload +def binary_join_element_wise( + *strings: _StringOrBinaryScalarT, + null_handling: Literal["emit_null", "skip", "replace"] = "emit_null", + null_replacement: str = "", + options: JoinOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringOrBinaryScalarT: ... +@overload +def binary_join_element_wise( + *strings: _StringOrBinaryArrayT, + null_handling: Literal["emit_null", "skip", "replace"] = "emit_null", + null_replacement: str = "", + options: JoinOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringOrBinaryArrayT: ... +@overload +def binary_join_element_wise( + *strings: Expression, + null_handling: Literal["emit_null", "skip", "replace"] = "emit_null", + null_replacement: str = "", + options: JoinOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def binary_join_element_wise(*args, **kwargs): + """ + Join string arguments together, with the last argument as separator. + + Concatenate the `strings` except for the last one. The last argument + in `strings` is inserted between each given string. + Any null separator element emits a null output. Null elements either + emit a null (the default), are skipped, or replaced with a given string. + + Parameters + ---------- + *strings : Array-like or scalar-like + Argument to compute function. + null_handling : str, default "emit_null" + How to handle null values in the inputs. + Accepted values are "emit_null", "skip", "replace". + null_replacement : str, default "" + Replacement string to emit for null inputs if `null_handling` + is "replace". + options : pyarrow.compute.JoinOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +# ========================= 2.17 String Slicing ========================= +@overload +def binary_slice( + strings: _BinaryScalarT, + /, + start: int, + stop: int | None = None, + step: int = 1, + *, + options: SliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _BinaryScalarT: ... +@overload +def binary_slice( + strings: _BinaryArrayT, + /, + start: int, + stop: int | None = None, + step: int = 1, + *, + options: SliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _BinaryArrayT: ... +@overload +def binary_slice( + strings: Expression, + /, + start: int, + stop: int | None = None, + step: int = 1, + *, + options: SliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def binary_slice(*args, **kwargs): + """ + Slice binary string. + + For each binary string in `strings`, emit the substring defined by + (`start`, `stop`, `step`) as given by `SliceOptions` where `start` is + inclusive and `stop` is exclusive. All three values are measured in + bytes. + If `step` is negative, the string will be advanced in reversed order. + An error is raised if `step` is zero. + Null inputs emit null. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + start : int + Index to start slicing at (inclusive). + stop : int or None, default None + If given, index to stop slicing at (exclusive). + If not given, slicing will stop at the end. + step : int, default 1 + Slice step. + options : pyarrow.compute.SliceOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. 
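A minimal sketch of element-wise joining and byte-based slicing (invented values):

import pyarrow as pa
import pyarrow.compute as pc

pc.binary_join_element_wise(pa.array(["x", None]), pa.array(["y", "z"]), "-")
# ["x-y", null] -- the last argument is the separator

pc.binary_slice(pa.array([b"abcdef"]), start=1, stop=5, step=2)
# [b"bd"] -- offsets and step are in bytes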
+ """ + +@overload +def utf8_slice_codeunits( + strings: _StringScalarT, + /, + start: int, + stop: int | None = None, + step: int = 1, + *, + options: SliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringScalarT: ... +@overload +def utf8_slice_codeunits( + strings: _StringArrayT, + /, + start: int, + stop: int | None = None, + step: int = 1, + *, + options: SliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _StringArrayT: ... +@overload +def utf8_slice_codeunits( + strings: Expression, + /, + start: int, + stop: int | None = None, + step: int = 1, + *, + options: SliceOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def utf8_slice_codeunits(*args, **kwargs): + """ + Slice string. + + For each string in `strings`, emit the substring defined by + (`start`, `stop`, `step`) as given by `SliceOptions` where `start` is + inclusive and `stop` is exclusive. All three values are measured in + UTF8 codeunits. + If `step` is negative, the string will be advanced in reversed order. + An error is raised if `step` is zero. + Null inputs emit null. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + start : int + Index to start slicing at (inclusive). + stop : int or None, default None + If given, index to stop slicing at (exclusive). + If not given, slicing will stop at the end. + step : int, default 1 + Slice step. + options : pyarrow.compute.SliceOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +# ========================= 2.18 Containment tests ========================= +@overload +def count_substring( + strings: lib.StringScalar | lib.BinaryScalar, + /, + pattern: str, + *, + ignore_case: bool = False, + options: MatchSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int32Scalar: ... +@overload +def count_substring( + strings: lib.LargeStringScalar | lib.LargeBinaryScalar, + /, + pattern: str, + *, + ignore_case: bool = False, + options: MatchSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar: ... +@overload +def count_substring( + strings: lib.StringArray + | lib.BinaryArray + | lib.ChunkedArray[lib.StringScalar] + | lib.ChunkedArray[lib.BinaryScalar], + /, + pattern: str, + *, + ignore_case: bool = False, + options: MatchSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int32Array: ... +@overload +def count_substring( + strings: lib.LargeStringArray + | lib.LargeBinaryArray + | lib.ChunkedArray[lib.LargeStringScalar] + | lib.ChunkedArray[lib.LargeBinaryScalar], + /, + pattern: str, + *, + ignore_case: bool = False, + options: MatchSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Array: ... +@overload +def count_substring( + strings: Expression, + /, + pattern: str, + *, + ignore_case: bool = False, + options: MatchSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def count_substring(*args, **kwargs): + """ + Count occurrences of substring. + + For each string in `strings`, emit the number of occurrences of the given + literal pattern. + Null inputs emit null. The pattern must be given in MatchSubstringOptions. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. 
+ pattern : str + Substring pattern to look for inside input values. + ignore_case : bool, default False + Whether to perform a case-insensitive match. + options : pyarrow.compute.MatchSubstringOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +count_substring_regex = _clone_signature(count_substring) +""" +Count occurrences of substring. + +For each string in `strings`, emit the number of occurrences of the given +regular expression pattern. +Null inputs emit null. The pattern must be given in MatchSubstringOptions. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +pattern : str + Substring pattern to look for inside input values. +ignore_case : bool, default False + Whether to perform a case-insensitive match. +options : pyarrow.compute.MatchSubstringOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def ends_with( + strings: StringScalar | BinaryScalar, + /, + pattern: str, + *, + ignore_case: bool = False, + options: MatchSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanScalar: ... +@overload +def ends_with( + strings: StringArray | BinaryArray, + /, + pattern: str, + *, + ignore_case: bool = False, + options: MatchSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanArray: ... +@overload +def ends_with( + strings: Expression, + /, + pattern: str, + *, + ignore_case: bool = False, + options: MatchSubstringOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def ends_with(*args, **kwargs): + """ + Check if strings end with a literal pattern. + + For each string in `strings`, emit true iff it ends with a given pattern. + The pattern must be given in MatchSubstringOptions. + If ignore_case is set, only simple case folding is performed. + + Null inputs emit null. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + pattern : str + Substring pattern to look for inside input values. + ignore_case : bool, default False + Whether to perform a case-insensitive match. + options : pyarrow.compute.MatchSubstringOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +find_substring = _clone_signature(count_substring) +""" +Find first occurrence of substring. + +For each string in `strings`, emit the index in bytes of the first occurrence +of the given literal pattern, or -1 if not found. +Null inputs emit null. The pattern must be given in MatchSubstringOptions. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +pattern : str + Substring pattern to look for inside input values. +ignore_case : bool, default False + Whether to perform a case-insensitive match. +options : pyarrow.compute.MatchSubstringOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +find_substring_regex = _clone_signature(count_substring) +""" +Find location of first match of regex pattern. 
+ +For each string in `strings`, emit the index in bytes of the first occurrence +of the given literal pattern, or -1 if not found. +Null inputs emit null. The pattern must be given in MatchSubstringOptions. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +pattern : str + Substring pattern to look for inside input values. +ignore_case : bool, default False + Whether to perform a case-insensitive match. +options : pyarrow.compute.MatchSubstringOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def index_in( + values: lib.Scalar, + /, + value_set: lib.Array | lib.ChunkedArray, + *, + skip_nulls: bool = False, + options: SetLookupOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int32Scalar: ... +@overload +def index_in( + values: lib.Array | lib.ChunkedArray, + /, + value_set: lib.Array | lib.ChunkedArray, + *, + skip_nulls: bool = False, + options: SetLookupOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int32Array: ... +@overload +def index_in( + values: Expression, + /, + value_set: lib.Array | lib.ChunkedArray, + *, + skip_nulls: bool = False, + options: SetLookupOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def index_in(*args, **kwargs): + """ + Return index of each element in a set of values. + + For each element in `values`, return its index in a given set of + values, or null if it is not found there. + The set of values to look for must be given in SetLookupOptions. + By default, nulls are matched against the value set, this can be + changed in SetLookupOptions. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + value_set : Array + Set of values to look for in the input. + skip_nulls : bool, default False + If False, nulls in the input are matched in the value_set just + like regular values. + If True, nulls in the input always fail matching. + options : pyarrow.compute.SetLookupOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def is_in( + values: lib.Scalar, + /, + value_set: lib.Array | lib.ChunkedArray, + *, + skip_nulls: bool = False, + options: SetLookupOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanScalar: ... +@overload +def is_in( + values: lib.Array | lib.ChunkedArray, + /, + value_set: lib.Array | lib.ChunkedArray, + *, + skip_nulls: bool = False, + options: SetLookupOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanArray: ... +@overload +def is_in( + values: Expression, + /, + value_set: lib.Array | lib.ChunkedArray, + *, + skip_nulls: bool = False, + options: SetLookupOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def is_in(*args, **kwargs): + """ + Find each element in a set of values. + + For each element in `values`, return true if it is found in a given + set of values, false otherwise. + The set of values to look for must be given in SetLookupOptions. + By default, nulls are matched against the value set, this can be + changed in SetLookupOptions. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. 
+ value_set : Array + Set of values to look for in the input. + skip_nulls : bool, default False + If False, nulls in the input are matched in the value_set just + like regular values. + If True, nulls in the input always fail matching. + options : pyarrow.compute.SetLookupOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +match_like = _clone_signature(ends_with) +""" +Match strings against SQL-style LIKE pattern. + +For each string in `strings`, emit true iff it matches a given pattern +at any position. '%' will match any number of characters, '_' will +match exactly one character, and any other character matches itself. +To match a literal '%', '_', or '\', precede the character with a backslash. +Null inputs emit null. The pattern must be given in MatchSubstringOptions. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +pattern : str + Substring pattern to look for inside input values. +ignore_case : bool, default False + Whether to perform a case-insensitive match. +options : pyarrow.compute.MatchSubstringOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +match_substring = _clone_signature(ends_with) +""" +Match strings against literal pattern. + +For each string in `strings`, emit true iff it contains a given pattern. +Null inputs emit null. +The pattern must be given in MatchSubstringOptions. +If ignore_case is set, only simple case folding is performed. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +pattern : str + Substring pattern to look for inside input values. +ignore_case : bool, default False + Whether to perform a case-insensitive match. +options : pyarrow.compute.MatchSubstringOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +match_substring_regex = _clone_signature(ends_with) +""" +Match strings against regex pattern. + +For each string in `strings`, emit true iff it matches a given pattern +at any position. The pattern must be given in MatchSubstringOptions. +If ignore_case is set, only simple case folding is performed. + +Null inputs emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +pattern : str + Substring pattern to look for inside input values. +ignore_case : bool, default False + Whether to perform a case-insensitive match. +options : pyarrow.compute.MatchSubstringOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +starts_with = _clone_signature(ends_with) +""" +Check if strings start with a literal pattern. + +For each string in `strings`, emit true iff it starts with a given pattern. +The pattern must be given in MatchSubstringOptions. +If ignore_case is set, only simple case folding is performed. + +Null inputs emit null. + +Parameters +---------- +strings : Array-like or scalar-like + Argument to compute function. +pattern : str + Substring pattern to look for inside input values. +ignore_case : bool, default False + Whether to perform a case-insensitive match. 
+options : pyarrow.compute.MatchSubstringOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +# ========================= 2.19 Categorizations ========================= +@overload +def is_finite( + values: NumericScalar | lib.NullScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar: ... +@overload +def is_finite( + values: NumericArray | lib.NullArray, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanArray: ... +@overload +def is_finite( + values: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +def is_finite(*args, **kwargs): + """ + Return true if value is finite. + + For each input value, emit true iff the value is finite + (i.e. neither NaN, inf, nor -inf). + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +is_inf = _clone_signature(is_finite) +""" +Return true if infinity. + +For each input value, emit true iff the value is infinite (inf or -inf). + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +is_nan = _clone_signature(is_finite) +""" +Return true if NaN. + +For each input value, emit true iff the value is NaN. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def is_null( + values: lib.Scalar, + /, + *, + nan_is_null: bool = False, + options: NullOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanScalar: ... +@overload +def is_null( + values: lib.Array | lib.ChunkedArray, + /, + *, + nan_is_null: bool = False, + options: NullOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanArray: ... +@overload +def is_null( + values: Expression, + /, + *, + nan_is_null: bool = False, + options: NullOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def is_null(*args, **kwargs): + """ + Return true if null (and optionally NaN). + + For each input value, emit true iff the value is null. + True may also be emitted for NaN values by setting the `nan_is_null` flag. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + nan_is_null : bool, default False + Whether floating-point NaN values are considered null. + options : pyarrow.compute.NullOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def is_valid( + values: lib.Scalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar: ... +@overload +def is_valid( + values: lib.Array | lib.ChunkedArray, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanArray: ... +@overload +def is_valid( + values: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +def is_valid(*args, **kwargs): + """ + Return true if non-null. + + For each input value, emit true iff the value is valid (i.e. non-null). 
+ + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +true_unless_null = _clone_signature(is_valid) +""" +Return true if non-null, else return null. + +For each input value, emit true iff the value +is valid (non-null), otherwise emit null. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +# ========================= 2.20 Selecting / multiplexing ========================= +def case_when(cond, /, *cases, memory_pool: lib.MemoryPool | None = None): + """ + Choose values based on multiple conditions. + + `cond` must be a struct of Boolean values. `cases` can be a mix + of scalar and array arguments (of any type, but all must be the + same type or castable to a common type), with either exactly one + datum per child of `cond`, or one more `cases` than children of + `cond` (in which case we have an "else" value). + + Each row of the output will be the corresponding value of the + first datum in `cases` for which the corresponding child of `cond` + is true, or otherwise the "else" value (if given), or null. + + Essentially, this implements a switch-case or if-else, if-else... statement. + + Parameters + ---------- + cond : Array-like or scalar-like + Argument to compute function. + *cases : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def choose(indices, /, *values, memory_pool: lib.MemoryPool | None = None): + """ + Choose values from several arrays. + + For each row, the value of the first argument is used as a 0-based index + into the list of `values` arrays (i.e. index 0 selects the first of the + `values` arrays). The output value is the corresponding value of the + selected argument. + + If an index is null, the output will be null. + + Parameters + ---------- + indices : Array-like or scalar-like + Argument to compute function. + *values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def coalesce( + *values: _ScalarOrArrayT, memory_pool: lib.MemoryPool | None = None +) -> _ScalarOrArrayT: + """ + Select the first non-null value. + + Each row of the output will be the value from the first corresponding input + for which the value is not null. If all inputs are null in a row, the output + will be null. + + Parameters + ---------- + *values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +fill_null = coalesce +"""Replace each null element in values with a corresponding +element from fill_value. + +If fill_value is scalar-like, then every null element in values +will be replaced with fill_value. If fill_value is array-like, +then the i-th element in values will be replaced with the i-th +element in fill_value. + +The fill_value's type must be the same as that of values, or it +must be able to be implicitly casted to the array's type. + +This is an alias for :func:`coalesce`. 
+ +Parameters +---------- +values : Array, ChunkedArray, or Scalar-like object + Each null element is replaced with the corresponding value + from fill_value. +fill_value : Array, ChunkedArray, or Scalar-like object + If not same type as values, will attempt to cast. + +Returns +------- +result : depends on inputs + Values with all null elements replaced + +Examples +-------- +>>> import pyarrow as pa +>>> arr = pa.array([1, 2, None, 3], type=pa.int8()) +>>> fill_value = pa.scalar(5, type=pa.int8()) +>>> arr.fill_null(fill_value) + +[ + 1, + 2, + 5, + 3 +] +>>> arr = pa.array([1, 2, None, 4, None]) +>>> arr.fill_null(pa.array([10, 20, 30, 40, 50])) + +[ + 1, + 2, + 30, + 4, + 50 +] +""" + +def if_else( + cond: ArrayLike | ScalarLike, + left: ArrayLike | ScalarLike, + right: ArrayLike | ScalarLike, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> ArrayLike | ScalarLike: + """ + Choose values based on a condition. + + `cond` must be a Boolean scalar/ array. + `left` or `right` must be of the same type scalar/ array. + `null` values in `cond` will be promoted to the output. + + Parameters + ---------- + cond : Array-like or scalar-like + Argument to compute function. + left : Array-like or scalar-like + Argument to compute function. + right : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +# ========================= 2.21 Structural transforms ========================= + +@overload +def list_value_length( + lists: _ListArray[Any], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int32Array: ... +@overload +def list_value_length( + lists: _LargeListArray[Any], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Array: ... +@overload +def list_value_length( + lists: ListArray[Any], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int32Array | lib.Int64Array: ... +@overload +def list_value_length( + lists: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def list_value_length(*args, **kwargs): + """ + Compute list lengths. + + `lists` must have a list-like type. + For each non-null value in `lists`, its length is emitted. + Null values emit a null in the output. + + Parameters + ---------- + lists : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def make_struct( + *args: lib.Scalar, + field_names: list[str] | tuple[str, ...] = (), + field_nullability: bool | None = None, + field_metadata: list[lib.KeyValueMetadata] | None = None, + options: MakeStructOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StructScalar: ... +@overload +def make_struct( + *args: lib.Array | lib.ChunkedArray, + field_names: list[str] | tuple[str, ...] = (), + field_nullability: bool | None = None, + field_metadata: list[lib.KeyValueMetadata] | None = None, + options: MakeStructOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StructArray: ... +@overload +def make_struct( + *args: Expression, + field_names: list[str] | tuple[str, ...] = (), + field_nullability: bool | None = None, + field_metadata: list[lib.KeyValueMetadata] | None = None, + options: MakeStructOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... 
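As a quick illustration of why the selection kernels stubbed above distinguish scalar, array, and Expression inputs, here is a minimal usage sketch. It is not part of the stub file or the patch; it only assumes a working pyarrow installation, and the commented outputs reflect the default kernel behavior.

```python
# Minimal sketch of the selection / multiplexing kernels (if_else, fill_null).
import pyarrow as pa
import pyarrow.compute as pc

cond = pa.array([True, False, None, True])
left = pa.array([1, 2, 3, 4])
right = pa.array([10, 20, 30, 40])

# if_else picks from `left` where cond is true and from `right` where it is
# false; nulls in `cond` propagate to the output.
print(pc.if_else(cond, left, right))  # [1, 20, null, 4]

# fill_null (an alias of coalesce) replaces nulls either with a scalar or
# element-wise from another array.
vals = pa.array([1, None, 3, None])
print(pc.fill_null(vals, pa.scalar(0)))  # [1, 0, 3, 0]

# With the overloads above, array inputs are typed as returning Arrays,
# scalar inputs as returning Scalars, and dataset Expressions stay
# Expressions for lazy evaluation.
```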
+def make_struct(*args, **kwargs): + """ + Wrap Arrays into a StructArray. + + Names of the StructArray's fields are + specified through MakeStructOptions. + + Parameters + ---------- + *args : Array-like or scalar-like + Argument to compute function. + field_names : sequence of str + Names of the struct fields to create. + field_nullability : sequence of bool, optional + Nullability information for each struct field. + If omitted, all fields are nullable. + field_metadata : sequence of KeyValueMetadata, optional + Metadata for each struct field. + options : pyarrow.compute.MakeStructOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +# ========================= 2.22 Conversions ========================= +@overload +def ceil_temporal( + timestamps: _TemporalScalarT, + /, + multiple: int = 1, + unit: Literal[ + "year", + "quarter", + "month", + "week", + "day", + "hour", + "minute", + "second", + "millisecond", + "microsecond", + "nanosecond", + ] = "day", + *, + week_starts_monday: bool = True, + ceil_is_strictly_greater: bool = False, + calendar_based_origin: bool = False, + options: RoundTemporalOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _TemporalScalarT: ... +@overload +def ceil_temporal( + timestamps: _TemporalArrayT, + /, + multiple: int = 1, + unit: Literal[ + "year", + "quarter", + "month", + "week", + "day", + "hour", + "minute", + "second", + "millisecond", + "microsecond", + "nanosecond", + ] = "day", + *, + week_starts_monday: bool = True, + ceil_is_strictly_greater: bool = False, + calendar_based_origin: bool = False, + options: RoundTemporalOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _TemporalArrayT: ... +@overload +def ceil_temporal( + timestamps: Expression, + /, + multiple: int = 1, + unit: Literal[ + "year", + "quarter", + "month", + "week", + "day", + "hour", + "minute", + "second", + "millisecond", + "microsecond", + "nanosecond", + ] = "day", + *, + week_starts_monday: bool = True, + ceil_is_strictly_greater: bool = False, + calendar_based_origin: bool = False, + options: RoundTemporalOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def ceil_temporal(*args, **kwargs): + """ + Round temporal values up to nearest multiple of specified time unit. + + Null values emit null. + An error is returned if the values have a defined timezone but it + cannot be found in the timezone database. + + Parameters + ---------- + timestamps : Array-like or scalar-like + Argument to compute function. + multiple : int, default 1 + Number of units to round to. + unit : str, default "day" + The unit in which `multiple` is expressed. + Accepted values are "year", "quarter", "month", "week", "day", + "hour", "minute", "second", "millisecond", "microsecond", + "nanosecond". + week_starts_monday : bool, default True + If True, weeks start on Monday; if False, on Sunday. + ceil_is_strictly_greater : bool, default False + If True, ceil returns a rounded value that is strictly greater than the + input. For example: ceiling 1970-01-01T00:00:00 to 3 hours would + yield 1970-01-01T03:00:00 if set to True and 1970-01-01T00:00:00 + if set to False. + This applies to the ceil_temporal function only. + calendar_based_origin : bool, default False + By default, the origin is 1970-01-01T00:00:00. 
By setting this to True, + rounding origin will be beginning of one less precise calendar unit. + E.g.: rounding to hours will use beginning of day as origin. + + By default time is rounded to a multiple of units since + 1970-01-01T00:00:00. By setting calendar_based_origin to true, + time will be rounded to number of units since the last greater + calendar unit. + For example: rounding to multiple of days since the beginning of the + month or to hours since the beginning of the day. + Exceptions: week and quarter are not used as greater units, + therefore days will be rounded to the beginning of the month not + week. Greater unit of week is a year. + Note that ceiling and rounding might change sorting order of an array + near greater unit change. For example rounding YYYY-mm-dd 23:00:00 to + 5 hours will ceil and round to YYYY-mm-dd+1 01:00:00 and floor to + YYYY-mm-dd 20:00:00. On the other hand YYYY-mm-dd+1 00:00:00 will + ceil, round and floor to YYYY-mm-dd+1 00:00:00. This can break the + order of an already ordered array. + options : pyarrow.compute.RoundTemporalOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +floor_temporal = _clone_signature(ceil_temporal) +""" +Round temporal values down to nearest multiple of specified time unit. + +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +timestamps : Array-like or scalar-like + Argument to compute function. +multiple : int, default 1 + Number of units to round to. +unit : str, default "day" + The unit in which `multiple` is expressed. + Accepted values are "year", "quarter", "month", "week", "day", + "hour", "minute", "second", "millisecond", "microsecond", + "nanosecond". +week_starts_monday : bool, default True + If True, weeks start on Monday; if False, on Sunday. +ceil_is_strictly_greater : bool, default False + If True, ceil returns a rounded value that is strictly greater than the + input. For example: ceiling 1970-01-01T00:00:00 to 3 hours would + yield 1970-01-01T03:00:00 if set to True and 1970-01-01T00:00:00 + if set to False. + This applies to the ceil_temporal function only. +calendar_based_origin : bool, default False + By default, the origin is 1970-01-01T00:00:00. By setting this to True, + rounding origin will be beginning of one less precise calendar unit. + E.g.: rounding to hours will use beginning of day as origin. + + By default time is rounded to a multiple of units since + 1970-01-01T00:00:00. By setting calendar_based_origin to true, + time will be rounded to number of units since the last greater + calendar unit. + For example: rounding to multiple of days since the beginning of the + month or to hours since the beginning of the day. + Exceptions: week and quarter are not used as greater units, + therefore days will be rounded to the beginning of the month not + week. Greater unit of week is a year. + Note that ceiling and rounding might change sorting order of an array + near greater unit change. For example rounding YYYY-mm-dd 23:00:00 to + 5 hours will ceil and round to YYYY-mm-dd+1 01:00:00 and floor to + YYYY-mm-dd 20:00:00. On the other hand YYYY-mm-dd+1 00:00:00 will + ceil, round and floor to YYYY-mm-dd+1 00:00:00. This can break the + order of an already ordered array. +options : pyarrow.compute.RoundTemporalOptions, optional + Alternative way of passing options. 
+memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +round_temporal = _clone_signature(ceil_temporal) +""" +Round temporal values to the nearest multiple of specified time unit. + +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +timestamps : Array-like or scalar-like + Argument to compute function. +multiple : int, default 1 + Number of units to round to. +unit : str, default "day" + The unit in which `multiple` is expressed. + Accepted values are "year", "quarter", "month", "week", "day", + "hour", "minute", "second", "millisecond", "microsecond", + "nanosecond". +week_starts_monday : bool, default True + If True, weeks start on Monday; if False, on Sunday. +ceil_is_strictly_greater : bool, default False + If True, ceil returns a rounded value that is strictly greater than the + input. For example: ceiling 1970-01-01T00:00:00 to 3 hours would + yield 1970-01-01T03:00:00 if set to True and 1970-01-01T00:00:00 + if set to False. + This applies to the ceil_temporal function only. +calendar_based_origin : bool, default False + By default, the origin is 1970-01-01T00:00:00. By setting this to True, + rounding origin will be beginning of one less precise calendar unit. + E.g.: rounding to hours will use beginning of day as origin. + + By default time is rounded to a multiple of units since + 1970-01-01T00:00:00. By setting calendar_based_origin to true, + time will be rounded to number of units since the last greater + calendar unit. + For example: rounding to multiple of days since the beginning of the + month or to hours since the beginning of the day. + Exceptions: week and quarter are not used as greater units, + therefore days will be rounded to the beginning of the month not + week. Greater unit of week is a year. + Note that ceiling and rounding might change sorting order of an array + near greater unit change. For example rounding YYYY-mm-dd 23:00:00 to + 5 hours will ceil and round to YYYY-mm-dd+1 01:00:00 and floor to + YYYY-mm-dd 20:00:00. On the other hand YYYY-mm-dd+1 00:00:00 will + ceil, round and floor to YYYY-mm-dd+1 00:00:00. This can break the + order of an already ordered array. +options : pyarrow.compute.RoundTemporalOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def cast( + arr: lib.Scalar, + target_type: _DataTypeT, + safe: bool | None = None, + options: CastOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Scalar[_DataTypeT]: ... +@overload +def cast( + arr: lib.Array, + target_type: _DataTypeT, + safe: bool | None = None, + options: CastOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Array[lib.Scalar[_DataTypeT]]: ... +@overload +def cast( + arr: lib.ChunkedArray, + target_type: _DataTypeT, + safe: bool | None = None, + options: CastOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.ChunkedArray[lib.Scalar[_DataTypeT]]: ... +def cast(*args, **kwargs): + """ + Cast array values to another data type. Can also be invoked as an array + instance method. 
+ + Parameters + ---------- + arr : Array-like + target_type : DataType or str + Type to cast to + safe : bool, default True + Check for overflows or other unsafe conversions + options : CastOptions, default None + Additional checks pass by CastOptions + memory_pool : MemoryPool, optional + memory pool to use for allocations during function execution. + + Examples + -------- + >>> from datetime import datetime + >>> import pyarrow as pa + >>> arr = pa.array([datetime(2010, 1, 1), datetime(2015, 1, 1)]) + >>> arr.type + TimestampType(timestamp[us]) + + You can use ``pyarrow.DataType`` objects to specify the target type: + + >>> cast(arr, pa.timestamp("ms")) + + [ + 2010-01-01 00:00:00.000, + 2015-01-01 00:00:00.000 + ] + + >>> cast(arr, pa.timestamp("ms")).type + TimestampType(timestamp[ms]) + + Alternatively, it is also supported to use the string aliases for these + types: + + >>> arr.cast("timestamp[ms]") + + [ + 2010-01-01 00:00:00.000, + 2015-01-01 00:00:00.000 + ] + >>> arr.cast("timestamp[ms]").type + TimestampType(timestamp[ms]) + + Returns + ------- + casted : Array + The cast result as a new Array + """ + +@overload +def strftime( + timestamps: TemporalScalar, + /, + format: str = "%Y-%m-%dT%H:%M:%S", + locale: str = "C", + *, + options: StrftimeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StringScalar: ... +@overload +def strftime( + timestamps: TemporalArray, + /, + format: str = "%Y-%m-%dT%H:%M:%S", + locale: str = "C", + *, + options: StrftimeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.StringArray: ... +@overload +def strftime( + timestamps: Expression, + /, + format: str = "%Y-%m-%dT%H:%M:%S", + locale: str = "C", + *, + options: StrftimeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def strftime(*args, **kwargs): + """ + Format temporal values according to a format string. + + For each input value, emit a formatted string. + The time format string and locale can be set using StrftimeOptions. + The output precision of the "%S" (seconds) format code depends on + the input time precision: it is an integer for timestamps with + second precision, a real number with the required number of fractional + digits for higher precisions. + Null values emit null. + An error is returned if the values have a defined timezone but it + cannot be found in the timezone database, or if the specified locale + does not exist on this system. + + Parameters + ---------- + timestamps : Array-like or scalar-like + Argument to compute function. + format : str, default "%Y-%m-%dT%H:%M:%S" + Pattern for formatting input values. + locale : str, default "C" + Locale to use for locale-specific format specifiers. + options : pyarrow.compute.StrftimeOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def strptime( + strings: StringScalar, + /, + format: str, + unit: Literal["s", "ms", "us", "ns"], + error_is_null: bool = False, + *, + options: StrptimeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.TimestampScalar: ... +@overload +def strptime( + strings: StringArray, + /, + format: str, + unit: Literal["s", "ms", "us", "ns"], + error_is_null: bool = False, + *, + options: StrptimeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.TimestampArray: ... 
+@overload +def strptime( + strings: Expression, + /, + format: str, + unit: Literal["s", "ms", "us", "ns"], + error_is_null: bool = False, + *, + options: StrptimeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def strptime(*args, **kwargs): + """ + Parse timestamps. + + For each string in `strings`, parse it as a timestamp. + The timestamp unit and the expected string pattern must be given + in StrptimeOptions. Null inputs emit null. If a non-null string + fails parsing, an error is returned by default. + + Parameters + ---------- + strings : Array-like or scalar-like + Argument to compute function. + format : str + Pattern for parsing input strings as timestamps, such as "%Y/%m/%d". + Note that the semantics of the format follow the C/C++ strptime, not the Python one. + There are differences in behavior, for example how the "%y" placeholder + handles years with less than four digits. + unit : str + Timestamp unit of the output. + Accepted values are "s", "ms", "us", "ns". + error_is_null : boolean, default False + Return null on parsing errors if true or raise if false. + options : pyarrow.compute.StrptimeOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +# ========================= 2.23 Temporal component extraction ========================= +@overload +def day( + values: TemporalScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.Int64Scalar: ... +@overload +def day( + values: TemporalArray, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.Int64Array: ... +@overload +def day(values: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... +def day(*args, **kwargs): + """ + Extract day number. + + Null values emit null. + An error is returned if the values have a defined timezone but it + cannot be found in the timezone database. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def day_of_week( + values: TemporalScalar, + /, + *, + count_from_zero: bool = True, + week_start: int = 1, + options: DayOfWeekOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar: ... +@overload +def day_of_week( + values: TemporalArray, + /, + *, + count_from_zero: bool = True, + week_start: int = 1, + options: DayOfWeekOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Array: ... +@overload +def day_of_week( + values: Expression, + /, + *, + count_from_zero: bool = True, + week_start: int = 1, + options: DayOfWeekOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def day_of_week(*args, **kwargs): + """ + Extract day of the week number. + + By default, the week starts on Monday represented by 0 and ends on Sunday + represented by 6. + `DayOfWeekOptions.week_start` can be used to set another starting day using + the ISO numbering convention (1=start week on Monday, 7=start week on Sunday). + Day numbers can start at 0 or 1 based on `DayOfWeekOptions.count_from_zero`. + Null values emit null. + An error is returned if the values have a defined timezone but it + cannot be found in the timezone database. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. 
+ count_from_zero : bool, default True + If True, number days from 0, otherwise from 1. + week_start : int, default 1 + Which day does the week start with (Monday=1, Sunday=7). + How this value is numbered is unaffected by `count_from_zero`. + options : pyarrow.compute.DayOfWeekOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +day_of_year = _clone_signature(day) +""" +Extract day of year number. + +January 1st maps to day number 1, February 1st to 32, etc. +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def hour( + values: lib.TimestampScalar[Any] | lib.Time32Scalar[Any] | lib.Time64Scalar[Any], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar: ... +@overload +def hour( + values: lib.TimestampArray[Any] + | lib.Time32Array[Any] + | lib.Time64Array[Any] + | lib.ChunkedArray[lib.TimestampScalar[Any]] + | lib.ChunkedArray[lib.Time32Scalar[Any]] + | lib.ChunkedArray[lib.Time64Scalar[Any]], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Array: ... +@overload +def hour( + values: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def hour(*args, **kwargs): + """ + Extract hour value. + + Null values emit null. + An error is returned if the values have a defined timezone but it + cannot be found in the timezone database. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def is_dst( + values: lib.TimestampScalar[Any], /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.BooleanScalar: ... +@overload +def is_dst( + values: lib.TimestampArray[Any] | lib.ChunkedArray[lib.TimestampScalar[Any]], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanArray: ... +@overload +def is_dst(values: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... +def is_dst(*args, **kwargs): + """ + Extracts if currently observing daylight savings. + + IsDaylightSavings returns true if a timestamp has a daylight saving + offset in the given timezone. + Null values emit null. + An error is returned if the values do not have a defined timezone. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def iso_week( + values: lib.TimestampScalar[Any], /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.Int64Scalar: ... +@overload +def iso_week( + values: lib.TimestampArray[Any] | lib.ChunkedArray[lib.TimestampScalar[Any]], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Array: ... +@overload +def iso_week( + values: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +def iso_week(*args, **kwargs): + """ + Extract ISO week of year number. + + First ISO week has the majority (4 or more) of its days in January. + ISO week starts on Monday. 
The week number starts with 1 and can run + up to 53. + Null values emit null. + An error is returned if the values have a defined timezone but it + cannot be found in the timezone database. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +iso_year = _clone_signature(iso_week) +""" +Extract ISO year number. + +First week of an ISO year has the majority (4 or more) of its days in January. +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def is_leap_year( + values: lib.TimestampScalar[Any] | lib.Date32Scalar | lib.Date64Scalar, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanScalar: ... +@overload +def is_leap_year( + values: lib.TimestampArray + | lib.Date32Array + | lib.Date64Array + | lib.ChunkedArray[lib.TimestampScalar] + | lib.ChunkedArray[lib.Date32Scalar] + | lib.ChunkedArray[lib.Date64Scalar], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.BooleanArray: ... +@overload +def is_leap_year( + values: Expression, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def is_leap_year(*args, **kwargs): + """ + Extract if year is a leap year. + + Null values emit null. + An error is returned if the values have a defined timezone but it + cannot be found in the timezone database. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +microsecond = _clone_signature(iso_week) +""" +Extract microsecond values. + +Microsecond returns number of microseconds since the last full millisecond. +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +millisecond = _clone_signature(iso_week) +""" +Extract millisecond values. + +Millisecond returns number of milliseconds since the last full second. +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +minute = _clone_signature(iso_week) +""" +Extract minute values. + +Null values emit null. +An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +month = _clone_signature(day_of_week) +""" +Extract month number. + +Month is encoded as January=1, December=12. +Null values emit null. 
+An error is returned if the values have a defined timezone but it
+cannot be found in the timezone database.
+
+Parameters
+----------
+values : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+nanosecond = _clone_signature(hour)
+"""
+Extract nanosecond values.
+
+Nanosecond returns number of nanoseconds since the last full microsecond.
+Null values emit null.
+An error is returned if the values have a defined timezone but it
+cannot be found in the timezone database.
+
+Parameters
+----------
+values : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+quarter = _clone_signature(day_of_week)
+"""
+Extract quarter of year number.
+
+First quarter maps to 1 and fourth quarter maps to 4.
+Null values emit null.
+An error is returned if the values have a defined timezone but it
+cannot be found in the timezone database.
+
+Parameters
+----------
+values : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+second = _clone_signature(hour)
+"""
+Extract second values.
+
+Null values emit null.
+An error is returned if the values have a defined timezone but it
+cannot be found in the timezone database.
+
+Parameters
+----------
+values : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+subsecond = _clone_signature(hour)
+"""
+Extract subsecond values.
+
+Subsecond returns the fraction of a second since the last full second.
+Null values emit null.
+An error is returned if the values have a defined timezone but it
+cannot be found in the timezone database.
+
+Parameters
+----------
+values : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+us_week = _clone_signature(iso_week)
+"""
+Extract US week of year number.
+
+First US week has the majority (4 or more) of its days in January.
+US week starts on Monday. The week number starts with 1 and can run
+up to 53.
+Null values emit null.
+An error is returned if the values have a defined timezone but it
+cannot be found in the timezone database.
+
+Parameters
+----------
+values : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+us_year = _clone_signature(iso_week)
+"""
+Extract US epidemiological year number.
+
+First week of US epidemiological year has the majority (4 or more) of
+its days in January. Last week of US epidemiological year has the
+year's last Wednesday in it. US epidemiological week starts on Sunday.
+Null values emit null.
+An error is returned if the values have a defined timezone but it
+cannot be found in the timezone database.
+
+Parameters
+----------
+values : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+year = _clone_signature(iso_week)
+"""
+Extract year number.
+
+Null values emit null.
+An error is returned if the values have a defined timezone but it +cannot be found in the timezone database. + +Parameters +---------- +values : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +@overload +def week( + values: lib.TimestampScalar, + /, + *, + week_starts_monday: bool = True, + count_from_zero: bool = False, + first_week_is_fully_in_year: bool = False, + options: WeekOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar: ... +@overload +def week( + values: lib.TimestampArray | lib.ChunkedArray[lib.TimestampScalar], + /, + *, + week_starts_monday: bool = True, + count_from_zero: bool = False, + first_week_is_fully_in_year: bool = False, + options: WeekOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Array: ... +@overload +def week( + values: Expression, + /, + *, + week_starts_monday: bool = True, + count_from_zero: bool = False, + first_week_is_fully_in_year: bool = False, + options: WeekOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def week(*args, **kwargs): + """ + Extract week of year number. + + First week has the majority (4 or more) of its days in January. + Year can have 52 or 53 weeks. Week numbering can start with 0 or 1 using + DayOfWeekOptions.count_from_zero. + An error is returned if the values have a defined timezone but it + cannot be found in the timezone database. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + week_starts_monday : bool, default True + If True, weeks start on Monday; if False, on Sunday. + count_from_zero : bool, default False + If True, dates at the start of a year that fall into the last week + of the previous year emit 0. + If False, they emit 52 or 53 (the week number of the last week + of the previous year). + first_week_is_fully_in_year : bool, default False + If True, week number 0 is fully in January. + If False, a week that begins on December 29, 30 or 31 is considered + to be week number 0 of the following year. + options : pyarrow.compute.WeekOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def year_month_day( + values: TemporalScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.StructScalar: ... +@overload +def year_month_day( + values: TemporalArray, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.StructArray: ... +@overload +def year_month_day( + values: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +def year_month_day(*args, **kwargs): + """ + Extract (year, month, day) struct. + + Null values emit null. + An error is returned in the values have a defined timezone but it + cannot be found in the timezone database. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +# ========================= 2.24 Temporal difference ========================= +def day_time_interval_between(start, end, /, *, memory_pool: lib.MemoryPool | None = None): + """ + Compute the number of days and milliseconds between two timestamps. + + Returns the number of days and milliseconds from `start` to `end`. 
+    That is, first the difference in days is computed as if both
+    timestamps were truncated to the day, then the difference between the times
+    of the two timestamps is computed as if both times were truncated to the
+    millisecond.
+    Null values return null.
+
+    Parameters
+    ----------
+    start : Array-like or scalar-like
+        Argument to compute function.
+    end : Array-like or scalar-like
+        Argument to compute function.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+def days_between(
+    start, end, /, *, memory_pool: lib.MemoryPool | None = None
+) -> lib.Int64Scalar | lib.Int64Array:
+    """
+    Compute the number of days between two timestamps.
+
+    Returns the number of day boundaries crossed from `start` to `end`.
+    That is, the difference is calculated as if the timestamps were
+    truncated to the day.
+    Null values emit null.
+
+    Parameters
+    ----------
+    start : Array-like or scalar-like
+        Argument to compute function.
+    end : Array-like or scalar-like
+        Argument to compute function.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+hours_between = _clone_signature(days_between)
+"""
+Compute the number of hours between two timestamps.
+
+Returns the number of hour boundaries crossed from `start` to `end`.
+That is, the difference is calculated as if the timestamps were
+truncated to the hour.
+Null values emit null.
+
+Parameters
+----------
+start : Array-like or scalar-like
+    Argument to compute function.
+end : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+microseconds_between = _clone_signature(days_between)
+"""
+Compute the number of microseconds between two timestamps.
+
+Returns the number of microsecond boundaries crossed from `start` to `end`.
+That is, the difference is calculated as if the timestamps were
+truncated to the microsecond.
+Null values emit null.
+
+Parameters
+----------
+start : Array-like or scalar-like
+    Argument to compute function.
+end : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+milliseconds_between = _clone_signature(days_between)
+"""
+Compute the number of millisecond boundaries between two timestamps.
+
+Returns the number of millisecond boundaries crossed from `start` to `end`.
+That is, the difference is calculated as if the timestamps were
+truncated to the millisecond.
+Null values emit null.
+
+Parameters
+----------
+start : Array-like or scalar-like
+    Argument to compute function.
+end : Array-like or scalar-like
+    Argument to compute function.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+minutes_between = _clone_signature(days_between)
+"""
+Compute the number of minute boundaries between two timestamps.
+
+Returns the number of minute boundaries crossed from `start` to `end`.
+That is, the difference is calculated as if the timestamps were
+truncated to the minute.
+Null values emit null. + +Parameters +---------- +start : Array-like or scalar-like + Argument to compute function. +end : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +def month_day_nano_interval_between( + start, end, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.MonthDayNanoIntervalScalar | lib.MonthDayNanoIntervalArray: + """ + Compute the number of months, days and nanoseconds between two timestamps. + + Returns the number of months, days, and nanoseconds from `start` to `end`. + That is, first the difference in months is computed as if both timestamps + were truncated to the months, then the difference between the days + is computed, and finally the difference between the times of the two + timestamps is computed as if both times were truncated to the nanosecond. + Null values return null. + + Parameters + ---------- + start : Array-like or scalar-like + Argument to compute function. + end : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def month_interval_between(start, end, /, *, memory_pool: lib.MemoryPool | None = None): + """ + Compute the number of months between two timestamps. + + Returns the number of month boundaries crossed from `start` to `end`. + That is, the difference is calculated as if the timestamps were + truncated to the month. + Null values emit null. + + Parameters + ---------- + start : Array-like or scalar-like + Argument to compute function. + end : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +nanoseconds_between = _clone_signature(days_between) +""" +Compute the number of nanoseconds between two timestamps. + +Returns the number of nanosecond boundaries crossed from `start` to `end`. +That is, the difference is calculated as if the timestamps were +truncated to the nanosecond. +Null values emit null. + +Parameters +---------- +start : Array-like or scalar-like + Argument to compute function. +end : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +quarters_between = _clone_signature(days_between) +""" +Compute the number of quarters between two timestamps. + +Returns the number of quarter start boundaries crossed from `start` to `end`. +That is, the difference is calculated as if the timestamps were +truncated to the quarter. +Null values emit null. + +Parameters +---------- +start : Array-like or scalar-like + Argument to compute function. +end : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +seconds_between = _clone_signature(days_between) +""" +Compute the number of seconds between two timestamps. + +Returns the number of second boundaries crossed from `start` to `end`. +That is, the difference is calculated as if the timestamps were +truncated to the second. +Null values emit null. + +Parameters +---------- +start : Array-like or scalar-like + Argument to compute function. +end : Array-like or scalar-like + Argument to compute function. 
+memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +def weeks_between( + start, + end, + /, + *, + count_from_zero: bool = True, + week_start: int = 1, + options: DayOfWeekOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Int64Scalar | lib.Int64Array: + """ + Compute the number of weeks between two timestamps. + + Returns the number of week boundaries crossed from `start` to `end`. + That is, the difference is calculated as if the timestamps were + truncated to the week. + Null values emit null. + + Parameters + ---------- + start : Array-like or scalar-like + Argument to compute function. + end : Array-like or scalar-like + Argument to compute function. + count_from_zero : bool, default True + If True, number days from 0, otherwise from 1. + week_start : int, default 1 + Which day does the week start with (Monday=1, Sunday=7). + How this value is numbered is unaffected by `count_from_zero`. + options : pyarrow.compute.DayOfWeekOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +years_between = _clone_signature(days_between) +""" +Compute the number of years between two timestamps. + +Returns the number of year boundaries crossed from `start` to `end`. +That is, the difference is calculated as if the timestamps were +truncated to the year. +Null values emit null. + +Parameters +---------- +start : Array-like or scalar-like + Argument to compute function. +end : Array-like or scalar-like + Argument to compute function. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" + +# ========================= 2.25 Timezone handling ========================= +@overload +def assume_timezone( + timestamps: lib.TimestampScalar, + /, + timezone: str, + *, + ambiguous: Literal["raise", "earliest", "latest"] = "raise", + nonexistent: Literal["raise", "earliest", "latest"] = "raise", + options: AssumeTimezoneOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.TimestampScalar: ... +@overload +def assume_timezone( + timestamps: lib.TimestampArray | lib.ChunkedArray[lib.TimestampScalar], + /, + timezone: str, + *, + ambiguous: Literal["raise", "earliest", "latest"] = "raise", + nonexistent: Literal["raise", "earliest", "latest"] = "raise", + options: AssumeTimezoneOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.TimestampArray: ... +@overload +def assume_timezone( + timestamps: Expression, + /, + timezone: str, + *, + ambiguous: Literal["raise", "earliest", "latest"] = "raise", + nonexistent: Literal["raise", "earliest", "latest"] = "raise", + options: AssumeTimezoneOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def assume_timezone(*args, **kwargs): + """ + Convert naive timestamp to timezone-aware timestamp. + + Input timestamps are assumed to be relative to the timezone given in the + `timezone` option. They are converted to UTC-relative timestamps and + the output type has its timezone set to the value of the `timezone` + option. Null values emit null. + This function is meant to be used when an external system produces + "timezone-naive" timestamps which need to be converted to + "timezone-aware" timestamps. An error is returned if the timestamps + already have a defined timezone. 
+ + Parameters + ---------- + timestamps : Array-like or scalar-like + Argument to compute function. + timezone : str + Timezone to assume for the input. + ambiguous : str, default "raise" + How to handle timestamps that are ambiguous in the assumed timezone. + Accepted values are "raise", "earliest", "latest". + nonexistent : str, default "raise" + How to handle timestamps that don't exist in the assumed timezone. + Accepted values are "raise", "earliest", "latest". + options : pyarrow.compute.AssumeTimezoneOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def local_timestamp( + timestamps: lib.TimestampScalar, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.TimestampScalar: ... +@overload +def local_timestamp( + timestamps: lib.TimestampArray | lib.ChunkedArray[lib.TimestampScalar], + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.TimestampArray: ... +@overload +def local_timestamp( + timestamps: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +def local_timestamp(*args, **kwargs): + """ + Convert timestamp to a timezone-naive local time timestamp. + + LocalTimestamp converts timezone-aware timestamp to local timestamp + of the given timestamp's timezone and removes timezone metadata. + Alternative name for this timestamp is also wall clock time. + If input is in UTC or without timezone, then unchanged input values + without timezone metadata are returned. + Null values emit null. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +# ========================= 2.26 Random number generation ========================= +def random( + n: int, + *, + initializer: Literal["system"] | int = "system", + options: RandomOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.DoubleArray: + """ + Generate numbers in the range [0, 1). + + Generated values are uniformly-distributed, double-precision + in range [0, 1). Algorithm and seed can be changed via RandomOptions. + + Parameters + ---------- + n : int + Number of values to generate, must be greater than or equal to 0 + initializer : int or str + How to initialize the underlying random generator. + If an integer is given, it is used as a seed. + If "system" is given, the random generator is initialized with + a system-specific source of (hopefully true) randomness. + Other values are invalid. + options : pyarrow.compute.RandomOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +# ========================= 3. Array-wise (“vector”) functions ========================= + +# ========================= 3.1 Cumulative Functions ========================= +@overload +def cumulative_sum( + values: _NumericArrayT, + /, + start: lib.Scalar | None = None, + *, + skip_nulls: bool = False, + options: CumulativeSumOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _NumericArrayT: ... +@overload +def cumulative_sum( + values: Expression, + /, + start: lib.Scalar | None = None, + *, + skip_nulls: bool = False, + options: CumulativeSumOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... 
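To make the timezone-handling and random-generation stubs above concrete, a small usage sketch follows. It is illustrative only and not part of the patch; it assumes a recent pyarrow and an available IANA timezone database, and the zone name used is an arbitrary example.

```python
# Minimal sketch of assume_timezone / local_timestamp / random.
from datetime import datetime

import pyarrow as pa
import pyarrow.compute as pc

# A timezone-naive timestamp array (timestamp[us], no tz metadata).
naive = pa.array([datetime(2020, 9, 13, 12, 26, 40)])

# assume_timezone interprets the naive values as local wall-clock time in the
# given zone and returns UTC-relative, timezone-aware timestamps.
aware = pc.assume_timezone(naive, timezone="Europe/Ljubljana")
print(aware.type)  # timestamp[us, tz=Europe/Ljubljana]

# local_timestamp goes the other way: back to naive wall-clock values.
print(pc.local_timestamp(aware).type)  # timestamp[us]

# random(n) draws n uniformly distributed doubles in [0, 1); passing an
# integer initializer gives a reproducible sequence.
print(pc.random(3, initializer=42))
```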
+def cumulative_sum(*args, **kwargs): + """ + Compute the cumulative sum over a numeric input. + + `values` must be numeric. Return an array/chunked array which is the + cumulative sum computed over `values`. Results will wrap around on + integer overflow. Use function "cumulative_sum_checked" if you want + overflow to return an error. The default start is 0. + + Parameters + ---------- + values : Array-like + Argument to compute function. + start : Scalar, default None + Starting value for the cumulative operation. If none is given, + a default value depending on the operation and input type is used. + skip_nulls : bool, default False + When false, the first encountered null is propagated. + options : pyarrow.compute.CumulativeOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +cumulative_sum_checked = _clone_signature(cumulative_sum) +""" +Compute the cumulative sum over a numeric input. + +`values` must be numeric. Return an array/chunked array which is the +cumulative sum computed over `values`. This function returns an error +on overflow. For a variant that doesn't fail on overflow, use +function "cumulative_sum". The default start is 0. + +Parameters +---------- +values : Array-like + Argument to compute function. +start : Scalar, default None + Starting value for the cumulative operation. If none is given, + a default value depending on the operation and input type is used. +skip_nulls : bool, default False + When false, the first encountered null is propagated. +options : pyarrow.compute.CumulativeOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +cumulative_prod = _clone_signature(cumulative_sum) +""" +Compute the cumulative product over a numeric input. + +`values` must be numeric. Return an array/chunked array which is the +cumulative product computed over `values`. Results will wrap around on +integer overflow. Use function "cumulative_prod_checked" if you want +overflow to return an error. The default start is 1. + +Parameters +---------- +values : Array-like + Argument to compute function. +start : Scalar, default None + Starting value for the cumulative operation. If none is given, + a default value depending on the operation and input type is used. +skip_nulls : bool, default False + When false, the first encountered null is propagated. +options : pyarrow.compute.CumulativeOptions, optional + Alternative way of passing options. +memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. +""" +cumulative_prod_checked = _clone_signature(cumulative_sum) +""" +Compute the cumulative product over a numeric input. + +`values` must be numeric. Return an array/chunked array which is the +cumulative product computed over `values`. This function returns an error +on overflow. For a variant that doesn't fail on overflow, use +function "cumulative_prod". The default start is 1. + +Parameters +---------- +values : Array-like + Argument to compute function. +start : Scalar, default None + Starting value for the cumulative operation. If none is given, + a default value depending on the operation and input type is used. +skip_nulls : bool, default False + When false, the first encountered null is propagated. 
+options : pyarrow.compute.CumulativeOptions, optional
+    Alternative way of passing options.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+cumulative_max = _clone_signature(cumulative_sum)
+"""
+Compute the cumulative max over a numeric input.
+
+`values` must be numeric. Return an array/chunked array which is the
+cumulative max computed over `values`. The default start is the minimum
+value of input type (so that any other value will replace the
+start as the new maximum).
+
+Parameters
+----------
+values : Array-like
+    Argument to compute function.
+start : Scalar, default None
+    Starting value for the cumulative operation. If none is given,
+    a default value depending on the operation and input type is used.
+skip_nulls : bool, default False
+    When false, the first encountered null is propagated.
+options : pyarrow.compute.CumulativeOptions, optional
+    Alternative way of passing options.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+cumulative_min = _clone_signature(cumulative_sum)
+"""
+Compute the cumulative min over a numeric input.
+
+`values` must be numeric. Return an array/chunked array which is the
+cumulative min computed over `values`. The default start is the maximum
+value of input type (so that any other value will replace the
+start as the new minimum).
+
+Parameters
+----------
+values : Array-like
+    Argument to compute function.
+start : Scalar, default None
+    Starting value for the cumulative operation. If none is given,
+    a default value depending on the operation and input type is used.
+skip_nulls : bool, default False
+    When false, the first encountered null is propagated.
+options : pyarrow.compute.CumulativeOptions, optional
+    Alternative way of passing options.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+cumulative_mean = _clone_signature(cumulative_sum)
+"""
+Compute the cumulative mean over a numeric input.
+
+`values` must be numeric. Return an array/chunked array which is the
+cumulative mean computed over `values`.
+
+Parameters
+----------
+values : Array-like
+    Argument to compute function.
+start : Scalar, default None
+    Starting value for the cumulative operation. If none is given,
+    a default value depending on the operation and input type is used.
+skip_nulls : bool, default False
+    When false, the first encountered null is propagated.
+options : pyarrow.compute.CumulativeOptions, optional
+    Alternative way of passing options.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
+"""
+# ========================= 3.2 Associative transforms =========================
+
+@overload
+def dictionary_encode(
+    array: _ScalarOrArrayT,
+    /,
+    null_encoding: Literal["mask", "encode"] = "mask",
+    *,
+    options=None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> _ScalarOrArrayT: ...
+@overload
+def dictionary_encode(
+    array: Expression,
+    /,
+    null_encoding: Literal["mask", "encode"] = "mask",
+    *,
+    options=None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> Expression: ...
+@overload
+def unique(array: _ArrayT, /, *, memory_pool: lib.MemoryPool | None = None) -> _ArrayT: ...
+@overload +def unique(array: Expression, /, *, memory_pool: lib.MemoryPool | None = None) -> Expression: ... +@overload +def value_counts( + array: lib.Array | lib.ChunkedArray, /, *, memory_pool: lib.MemoryPool | None = None +) -> lib.StructArray: ... +@overload +def value_counts( + array: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... + +# ========================= 3.3 Selections ========================= +@overload +def array_filter( + array: _ArrayT, + selection_filter: list[bool] | list[bool | None] | BooleanArray, + /, + null_selection_behavior: Literal["drop", "emit_null"] = "drop", + *, + options: FilterOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _ArrayT: ... +@overload +def array_filter( + array: Expression, + selection_filter: list[bool] | list[bool | None] | BooleanArray, + /, + null_selection_behavior: Literal["drop", "emit_null"] = "drop", + *, + options: FilterOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload +def array_take( + array: _ArrayT, + indices: list[int] + | list[int | None] + | lib.Int16Array + | lib.Int32Array + | lib.Int64Array + | lib.ChunkedArray[lib.Int16Scalar] + | lib.ChunkedArray[lib.Int32Scalar] + | lib.ChunkedArray[lib.Int64Scalar], + /, + *, + boundscheck: bool = True, + options: TakeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> _ArrayT: ... +@overload +def array_take( + array: Expression, + indices: list[int] + | list[int | None] + | lib.Int16Array + | lib.Int32Array + | lib.Int64Array + | lib.ChunkedArray[lib.Int16Scalar] + | lib.ChunkedArray[lib.Int32Scalar] + | lib.ChunkedArray[lib.Int64Scalar], + /, + *, + boundscheck: bool = True, + options: TakeOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +@overload +def drop_null(input: _ArrayT, /, *, memory_pool: lib.MemoryPool | None = None) -> _ArrayT: ... +@overload +def drop_null( + input: Expression, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... + +filter = array_filter +take = array_take +""" +Select values (or records) from array- or table-like data given integer +selection indices. + +The result will be of the same type(s) as the input, with elements taken +from the input array (or record batch / table fields) at the given +indices. If an index is null then the corresponding value in the output +will be null. + +Parameters +---------- +data : Array, ChunkedArray, RecordBatch, or Table +indices : Array, ChunkedArray + Must be of integer type +boundscheck : boolean, default True + Whether to boundscheck the indices. If False and there is an out of + bounds index, will likely cause the process to crash. +memory_pool : MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + +Returns +------- +result : depends on inputs + Selected values for the given indices + +Examples +-------- +>>> import pyarrow as pa +>>> arr = pa.array(["a", "b", "c", None, "e", "f"]) +>>> indices = pa.array([0, None, 4, 3]) +>>> arr.take(indices) + +[ + "a", + null, + "e", + null +] +""" + +# ========================= 3.4 Containment tests ========================= +@overload +def indices_nonzero( + values: lib.BooleanArray + | lib.NullArray + | NumericArray + | lib.Decimal128Array + | lib.Decimal256Array, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.UInt64Array: ... 
+@overload
+def indices_nonzero(
+    values: Expression,
+    /,
+    *,
+    memory_pool: lib.MemoryPool | None = None,
+) -> Expression: ...
+def indices_nonzero(*args, **kwargs):
+    """
+    Return the indices of the values in the array that are non-zero.
+
+    For each input value, check if it's zero, false or null. Emit the index
+    of the value in the array if it's none of those.
+
+    Parameters
+    ----------
+    values : Array-like
+        Argument to compute function.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+# ========================= 3.5 Sorts and partitions =========================
+@overload
+def array_sort_indices(
+    array: lib.Array | lib.ChunkedArray,
+    /,
+    order: _Order = "ascending",
+    *,
+    null_placement: _Placement = "at_end",
+    options: ArraySortOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> lib.UInt64Array: ...
+@overload
+def array_sort_indices(
+    array: Expression,
+    /,
+    order: _Order = "ascending",
+    *,
+    null_placement: _Placement = "at_end",
+    options: ArraySortOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> Expression: ...
+def array_sort_indices(*args, **kwargs):
+    """
+    Return the indices that would sort an array.
+
+    This function computes an array of indices that define a stable sort
+    of the input array. By default, null values are considered greater
+    than any other value and are therefore sorted at the end of the array.
+    For floating-point types, NaNs are considered greater than any
+    other non-null value, but smaller than null values.
+
+    The handling of nulls and NaNs can be changed in ArraySortOptions.
+
+    Parameters
+    ----------
+    array : Array-like
+        Argument to compute function.
+    order : str, default "ascending"
+        Which order to sort values in.
+        Accepted values are "ascending", "descending".
+    null_placement : str, default "at_end"
+        Where nulls in the input should be sorted.
+        Accepted values are "at_start", "at_end".
+    options : pyarrow.compute.ArraySortOptions, optional
+        Alternative way of passing options.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+@overload
+def partition_nth_indices(
+    array: lib.Array | lib.ChunkedArray,
+    /,
+    pivot: int,
+    *,
+    null_placement: _Placement = "at_end",
+    options: PartitionNthOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> lib.UInt64Array: ...
+@overload
+def partition_nth_indices(
+    array: Expression,
+    /,
+    pivot: int,
+    *,
+    null_placement: _Placement = "at_end",
+    options: PartitionNthOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> Expression: ...
+def partition_nth_indices(*args, **kwargs):
+    """
+    Return the indices that would partition an array around a pivot.
+
+    This function computes an array of indices that define a non-stable
+    partial sort of the input array.
+
+    The output is such that the `N`'th index points to the `N`'th element
+    of the input in sorted order, and all indices before the `N`'th point
+    to elements in the input less or equal to elements at or after the `N`'th.
+
+    By default, null values are considered greater than any other value
+    and are therefore partitioned towards the end of the array.
+    For floating-point types, NaNs are considered greater than any
+    other non-null value, but smaller than null values.
+
+    The pivot index `N` must be given in PartitionNthOptions.
+ The handling of nulls and NaNs can also be changed in PartitionNthOptions. + + Parameters + ---------- + array : Array-like + Argument to compute function. + pivot : int + Index into the equivalent sorted array of the pivot element. + null_placement : str, default "at_end" + Where nulls in the input should be partitioned. + Accepted values are "at_start", "at_end". + options : pyarrow.compute.PartitionNthOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def rank( + input: lib.Array | lib.ChunkedArray, + /, + sort_keys: _Order = "ascending", + *, + null_placement: _Placement = "at_end", + tiebreaker: Literal["min", "max", "first", "dense"] = "first", + options: RankOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.UInt64Array: + """ + Compute ordinal ranks of an array (1-based). + + This function computes a rank of the input array. + By default, null values are considered greater than any other value and + are therefore sorted at the end of the input. For floating-point types, + NaNs are considered greater than any other non-null value, but smaller + than null values. The default tiebreaker is to assign ranks in order of + when ties appear in the input. + + The handling of nulls, NaNs and tiebreakers can be changed in RankOptions. + + Parameters + ---------- + input : Array-like or scalar-like + Argument to compute function. + sort_keys : sequence of (name, order) tuples or str, default "ascending" + Names of field/column keys to sort the input on, + along with the order each field/column is sorted in. + Accepted values for `order` are "ascending", "descending". + The field name can be a string column name or expression. + Alternatively, one can simply pass "ascending" or "descending" as a string + if the input is array-like. + null_placement : str, default "at_end" + Where nulls in input should be sorted. + Accepted values are "at_start", "at_end". + tiebreaker : str, default "first" + Configure how ties between equal values are handled. + Accepted values are: + + - "min": Ties get the smallest possible rank in sorted order. + - "max": Ties get the largest possible rank in sorted order. + - "first": Ranks are assigned in order of when ties appear in the + input. This ensures the ranks are a stable permutation + of the input. + - "dense": The ranks span a dense [1, M] interval where M is the + number of distinct values in the input. + options : pyarrow.compute.RankOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def select_k_unstable( + input: lib.Array | lib.ChunkedArray, + /, + k: int, + sort_keys: list[tuple[str, _Order]], + *, + options: SelectKOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.UInt64Array: ... +@overload +def select_k_unstable( + input: Expression, + /, + k: int, + sort_keys: list[tuple[str, _Order]], + *, + options: SelectKOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def select_k_unstable(*args, **kwargs): + """ + Select the indices of the first `k` ordered elements from the input. + + This function selects an array of indices of the first `k` ordered elements + from the `input` array, record batch or table specified in the column keys + (`options.sort_keys`). Output is not guaranteed to be stable. 
+ Null values are considered greater than any other value and are + therefore ordered at the end. For floating-point types, NaNs are considered + greater than any other non-null value, but smaller than null values. + + Parameters + ---------- + input : Array-like or scalar-like + Argument to compute function. + k : int + Number of leading values to select in sorted order + (i.e. the largest values if sort order is "descending", + the smallest otherwise). + sort_keys : sequence of (name, order) tuples + Names of field/column keys to sort the input on, + along with the order each field/column is sorted in. + Accepted values for `order` are "ascending", "descending". + The field name can be a string column name or expression. + options : pyarrow.compute.SelectKOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +@overload +def sort_indices( + input: lib.Array | lib.ChunkedArray | lib.RecordBatch | lib.Table, + /, + sort_keys: Sequence[tuple[str, _Order]] = (), + *, + null_placement: _Placement = "at_end", + options: SortOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> lib.UInt64Array: ... +@overload +def sort_indices( + input: Expression, + /, + sort_keys: Sequence[tuple[str, _Order]] = (), + *, + null_placement: _Placement = "at_end", + options: SortOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> Expression: ... +def sort_indices(*args, **kwargs): + """ + Return the indices that would sort an array, record batch or table. + + This function computes an array of indices that define a stable sort + of the input array, record batch or table. By default, null values are + considered greater than any other value and are therefore sorted at the + end of the input. For floating-point types, NaNs are considered greater + than any other non-null value, but smaller than null values. + + The handling of nulls and NaNs can be changed in SortOptions. + + Parameters + ---------- + input : Array-like or scalar-like + Argument to compute function. + sort_keys : sequence of (name, order) tuples + Names of field/column keys to sort the input on, + along with the order each field/column is sorted in. + Accepted values for `order` are "ascending", "descending". + The field name can be a string column name or expression. + null_placement : str, default "at_end" + Where nulls in input should be sorted, only applying to + columns/fields mentioned in `sort_keys`. + Accepted values are "at_start", "at_end". + options : pyarrow.compute.SortOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +# ========================= 3.6 Structural transforms ========================= +@overload +def list_element( + lists: Expression, index: ScalarLike, /, *, memory_pool: lib.MemoryPool | None = None +) -> Expression: ... +@overload +def list_element( + lists: lib.Array[ListScalar[_DataTypeT]], + index: ScalarLike, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.Array[lib.Scalar[_DataTypeT]]: ... +@overload +def list_element( + lists: lib.ChunkedArray[ListScalar[_DataTypeT]], + index: ScalarLike, + /, + *, + memory_pool: lib.MemoryPool | None = None, +) -> lib.ChunkedArray[lib.Scalar[_DataTypeT]]: ... 
+@overload
+def list_element(
+    lists: ListScalar[_DataTypeT],
+    index: ScalarLike,
+    /,
+    *,
+    memory_pool: lib.MemoryPool | None = None,
+) -> lib.Scalar[_DataTypeT]: ...
+def list_element(*args, **kwargs):
+    """
+    Compute elements of nested list values using an index.
+
+    `lists` must have a list-like type.
+    For each value in each list of `lists`, the element at `index`
+    is emitted. Null values emit a null in the output.
+
+    Parameters
+    ----------
+    lists : Array-like or scalar-like
+        Argument to compute function.
+    index : Array-like or scalar-like
+        Argument to compute function.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+@overload
+def list_flatten(
+    lists: Expression,
+    /,
+    recursive: bool = False,
+    *,
+    options: ListFlattenOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> Expression: ...
+@overload
+def list_flatten(
+    lists: ArrayOrChunkedArray[ListScalar[Any]],
+    /,
+    recursive: bool = False,
+    *,
+    options: ListFlattenOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> lib.ListArray[Any]: ...
+def list_flatten(*args, **kwargs):
+    """
+    Flatten list values.
+
+    `lists` must have a list-like type (lists, list-views, and
+    fixed-size lists).
+    Return an array with the top list level flattened unless
+    `recursive` is set to true in ListFlattenOptions. When that
+    is the case, flattening happens recursively until a non-list
+    array is formed.
+
+    Null list values do not emit anything to the output.
+
+    Parameters
+    ----------
+    lists : Array-like
+        Argument to compute function.
+    recursive : bool, default False
+        When True, the list array is flattened recursively until an array
+        of non-list values is formed.
+    options : pyarrow.compute.ListFlattenOptions, optional
+        Alternative way of passing options.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+@overload
+def list_parent_indices(
+    lists: Expression, /, *, memory_pool: lib.MemoryPool | None = None
+) -> Expression: ...
+@overload
+def list_parent_indices(
+    lists: ArrayOrChunkedArray[Any], /, *, memory_pool: lib.MemoryPool | None = None
+) -> lib.Int64Array: ...
+def list_parent_indices(*args, **kwargs):
+    """
+    Compute parent indices of nested list values.
+
+    `lists` must have a list-like or list-view type.
+    For each value in each list of `lists`, the top-level list index
+    is emitted.
+
+    Parameters
+    ----------
+    lists : Array-like or scalar-like
+        Argument to compute function.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+@overload
+def list_slice(
+    lists: Expression,
+    /,
+    start: int,
+    stop: int | None = None,
+    step: int = 1,
+    return_fixed_size_list: bool | None = None,
+    *,
+    options: ListSliceOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> Expression: ...
+@overload
+def list_slice(
+    lists: ArrayOrChunkedArray[Any],
+    /,
+    start: int,
+    stop: int | None = None,
+    step: int = 1,
+    return_fixed_size_list: bool | None = None,
+    *,
+    options: ListSliceOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> lib.ListArray[Any]: ...
+def list_slice(*args, **kwargs):
+    """
+    Compute slice of list-like array.
+
+    `lists` must have a list-like type.
+    For each list element, compute a slice, returning a new list array.
+    A variable or fixed size list array is returned, depending on options.
+ + Parameters + ---------- + lists : Array-like or scalar-like + Argument to compute function. + start : int + Index to start slicing inner list elements (inclusive). + stop : Optional[int], default None + If given, index to stop slicing at (exclusive). + If not given, slicing will stop at the end. (NotImplemented) + step : int, default 1 + Slice step. + return_fixed_size_list : Optional[bool], default None + Whether to return a FixedSizeListArray. If true _and_ stop is after + a list element's length, nulls will be appended to create the + requested slice size. The default of `None` will return the same + type which was passed in. + options : pyarrow.compute.ListSliceOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def map_lookup( + container, + /, + query_key, + occurrence: str, + *, + options: MapLookupOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +): + """ + Find the items corresponding to a given key in a Map. + + For a given query key (passed via MapLookupOptions), extract + either the FIRST, LAST or ALL items from a Map that have + matching keys. + + Parameters + ---------- + container : Array-like or scalar-like + Argument to compute function. + query_key : Scalar or Object can be converted to Scalar + The key to search for. + occurrence : str + The occurrence(s) to return from the Map + Accepted values are "first", "last", or "all". + options : pyarrow.compute.MapLookupOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def struct_field( + values, + /, + indices, + *, + options: StructFieldOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +): + """ + Extract children of a struct or union by index. + + Given a list of indices (passed via StructFieldOptions), extract + the child array or scalar with the given child index, recursively. + + For union inputs, nulls are emitted for union values that reference + a different child than specified. Also, the indices are always + in physical order, not logical type codes - for example, the first + child is always index 0. + + An empty list of indices returns the argument unchanged. + + Parameters + ---------- + values : Array-like or scalar-like + Argument to compute function. + indices : List[str], List[bytes], List[int], Expression, bytes, str, or int + List of indices for chained field lookup, for example `[4, 1]` + will look up the second nested field in the fifth outer field. + options : pyarrow.compute.StructFieldOptions, optional + Alternative way of passing options. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def fill_null_backward(values, /, *, memory_pool: lib.MemoryPool | None = None): + """ + Carry non-null values backward to fill null slots. + + Given an array, propagate next valid observation backward to previous valid + or nothing if all next values are null. + + Parameters + ---------- + values : Array-like + Argument to compute function. + memory_pool : pyarrow.MemoryPool, optional + If not passed, will allocate memory from the default memory pool. + """ + +def fill_null_forward(values, /, *, memory_pool: lib.MemoryPool | None = None): + """ + Carry non-null values forward to fill null slots. 
+
+    Given an array, propagate last valid observation forward to next valid
+    or nothing if all previous values are null.
+
+    Parameters
+    ----------
+    values : Array-like
+        Argument to compute function.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+def replace_with_mask(
+    values,
+    mask: list[bool] | list[bool | None] | BooleanArray,
+    replacements,
+    /,
+    *,
+    memory_pool: lib.MemoryPool | None = None,
+):
+    """
+    Replace items selected with a mask.
+
+    Given an array and a boolean mask (either scalar or of equal length),
+    along with replacement values (either scalar or array),
+    each element of the array for which the corresponding mask element is
+    true will be replaced by the next value from the replacements,
+    or with null if the mask is null.
+    Hence, for replacement arrays, len(replacements) == sum(mask == true).
+
+    Parameters
+    ----------
+    values : Array-like
+        Argument to compute function.
+    mask : Array-like
+        Argument to compute function.
+    replacements : Array-like
+        Argument to compute function.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+# ========================= 3.7 Pairwise functions =========================
+@overload
+def pairwise_diff(
+    input: _NumericOrTemporalArrayT,
+    /,
+    period: int = 1,
+    *,
+    options: PairwiseOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> _NumericOrTemporalArrayT: ...
+@overload
+def pairwise_diff(
+    input: Expression,
+    /,
+    period: int = 1,
+    *,
+    options: PairwiseOptions | None = None,
+    memory_pool: lib.MemoryPool | None = None,
+) -> Expression: ...
+def pairwise_diff(*args, **kwargs):
+    """
+    Compute first order difference of an array.
+
+    Computes the first order difference of an array. It internally calls
+    the scalar function "subtract" to compute differences, so its
+    behavior and supported types are the same as "subtract".
+    The period can be specified in :struct:`PairwiseOptions`.
+
+    Results will wrap around on integer overflow. Use function
+    "pairwise_diff_checked" if you want overflow to return an error.
+
+    Parameters
+    ----------
+    input : Array-like
+        Argument to compute function.
+    period : int, default 1
+        Period for applying the period function.
+    options : pyarrow.compute.PairwiseOptions, optional
+        Alternative way of passing options.
+    memory_pool : pyarrow.MemoryPool, optional
+        If not passed, will allocate memory from the default memory pool.
+    """
+
+pairwise_diff_checked = _clone_signature(pairwise_diff)
+"""
+Compute first order difference of an array.
+
+Computes the first order difference of an array. It internally calls
+the scalar function "subtract_checked" to compute differences, so its
+behavior and supported types are the same as "subtract_checked".
+The period can be specified in :struct:`PairwiseOptions`.
+
+This function returns an error on overflow. For a variant that doesn't
+fail on overflow, use function "pairwise_diff".
+
+Parameters
+----------
+input : Array-like
+    Argument to compute function.
+period : int, default 1
+    Period for applying the period function.
+options : pyarrow.compute.PairwiseOptions, optional
+    Alternative way of passing options.
+memory_pool : pyarrow.MemoryPool, optional
+    If not passed, will allocate memory from the default memory pool.
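+
+Examples
+--------
+A minimal, representative example (the array repr line is omitted and the
+exact output has not been verified against a specific pyarrow version):
+
+>>> import pyarrow as pa
+>>> import pyarrow.compute as pc
+>>> pc.pairwise_diff_checked(pa.array([1, 4, 9, 16]))
+[
+  null,
+  3,
+  5,
+  7
+]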
+""" diff --git a/python/stubs/csv.pyi b/python/stubs/csv.pyi new file mode 100644 index 00000000000..510229d7e72 --- /dev/null +++ b/python/stubs/csv.pyi @@ -0,0 +1,27 @@ +from pyarrow._csv import ( + ISO8601, + ConvertOptions, + CSVStreamingReader, + CSVWriter, + InvalidRow, + ParseOptions, + ReadOptions, + WriteOptions, + open_csv, + read_csv, + write_csv, +) + +__all__ = [ + "ISO8601", + "ConvertOptions", + "CSVStreamingReader", + "CSVWriter", + "InvalidRow", + "ParseOptions", + "ReadOptions", + "WriteOptions", + "open_csv", + "read_csv", + "write_csv", +] diff --git a/python/stubs/cuda.pyi b/python/stubs/cuda.pyi new file mode 100644 index 00000000000..e11baf7d4e7 --- /dev/null +++ b/python/stubs/cuda.pyi @@ -0,0 +1,25 @@ +from pyarrow._cuda import ( + BufferReader, + BufferWriter, + Context, + CudaBuffer, + HostBuffer, + IpcMemHandle, + new_host_buffer, + read_message, + read_record_batch, + serialize_record_batch, +) + +__all__ = [ + "BufferReader", + "BufferWriter", + "Context", + "CudaBuffer", + "HostBuffer", + "IpcMemHandle", + "new_host_buffer", + "read_message", + "read_record_batch", + "serialize_record_batch", +] diff --git a/python/stubs/dataset.pyi b/python/stubs/dataset.pyi new file mode 100644 index 00000000000..98f1a38aa85 --- /dev/null +++ b/python/stubs/dataset.pyi @@ -0,0 +1,229 @@ +from typing import Callable, Iterable, Literal, Sequence, TypeAlias, overload + +from _typeshed import StrPath +from pyarrow._dataset import ( + CsvFileFormat, + CsvFragmentScanOptions, + Dataset, + DatasetFactory, + DirectoryPartitioning, + FeatherFileFormat, + FileFormat, + FileFragment, + FilenamePartitioning, + FileSystemDataset, + FileSystemDatasetFactory, + FileSystemFactoryOptions, + FileWriteOptions, + Fragment, + FragmentScanOptions, + HivePartitioning, + InMemoryDataset, + IpcFileFormat, + IpcFileWriteOptions, + JsonFileFormat, + JsonFragmentScanOptions, + Partitioning, + PartitioningFactory, + Scanner, + TaggedRecordBatch, + UnionDataset, + UnionDatasetFactory, + WrittenFile, + get_partition_keys, +) +from pyarrow._dataset_orc import OrcFileFormat +from pyarrow._dataset_parquet import ( + ParquetDatasetFactory, + ParquetFactoryOptions, + ParquetFileFormat, + ParquetFileFragment, + ParquetFileWriteOptions, + ParquetFragmentScanOptions, + ParquetReadOptions, + RowGroupInfo, +) +from pyarrow._dataset_parquet_encryption import ( + ParquetDecryptionConfig, + ParquetEncryptionConfig, +) +from pyarrow.compute import Expression, field, scalar +from pyarrow.lib import Array, RecordBatch, RecordBatchReader, Schema, Table + +from ._fs import SupportedFileSystem + +_orc_available: bool +_parquet_available: bool + +__all__ = [ + "CsvFileFormat", + "CsvFragmentScanOptions", + "Dataset", + "DatasetFactory", + "DirectoryPartitioning", + "FeatherFileFormat", + "FileFormat", + "FileFragment", + "FilenamePartitioning", + "FileSystemDataset", + "FileSystemDatasetFactory", + "FileSystemFactoryOptions", + "FileWriteOptions", + "Fragment", + "FragmentScanOptions", + "HivePartitioning", + "InMemoryDataset", + "IpcFileFormat", + "IpcFileWriteOptions", + "JsonFileFormat", + "JsonFragmentScanOptions", + "Partitioning", + "PartitioningFactory", + "Scanner", + "TaggedRecordBatch", + "UnionDataset", + "UnionDatasetFactory", + "WrittenFile", + "get_partition_keys", + # Orc + "OrcFileFormat", + # Parquet + "ParquetDatasetFactory", + "ParquetFactoryOptions", + "ParquetFileFormat", + "ParquetFileFragment", + "ParquetFileWriteOptions", + "ParquetFragmentScanOptions", + "ParquetReadOptions", + "RowGroupInfo", + # 
Parquet Encryption + "ParquetDecryptionConfig", + "ParquetEncryptionConfig", + # Compute + "Expression", + "field", + "scalar", + # Dataset + "partitioning", + "parquet_dataset", + "write_dataset", +] + +_DatasetFormat: TypeAlias = Literal["parquet", "ipc", "arrow", "feather", "csv"] + +@overload +def partitioning( + schema: Schema, +) -> Partitioning: ... +@overload +def partitioning( + schema: Schema, + *, + flavor: Literal["filename"], + dictionaries: dict[str, Array] | None = None, +) -> Partitioning: ... +@overload +def partitioning( + schema: Schema, + *, + flavor: Literal["filename"], + dictionaries: Literal["infer"], +) -> PartitioningFactory: ... +@overload +def partitioning( + field_names: list[str], + *, + flavor: Literal["filename"], +) -> PartitioningFactory: ... +@overload +def partitioning( + schema: Schema, + *, + flavor: Literal["hive"], + dictionaries: Literal["infer"], +) -> PartitioningFactory: ... +@overload +def partitioning( + *, + flavor: Literal["hive"], +) -> PartitioningFactory: ... +@overload +def partitioning( + schema: Schema, + *, + flavor: Literal["hive"], + dictionaries: dict[str, Array] | None = None, +) -> Partitioning: ... +def parquet_dataset( + metadata_path: StrPath, + schema: Schema | None = None, + filesystem: SupportedFileSystem | None = None, + format: ParquetFileFormat | None = None, + partitioning: Partitioning | PartitioningFactory | None = None, + partition_base_dir: str | None = None, +) -> FileSystemDataset: ... +@overload +def dataset( + source: StrPath | Sequence[StrPath], + schema: Schema | None = None, + format: FileFormat | _DatasetFormat | None = None, + filesystem: SupportedFileSystem | str | None = None, + partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, + partition_base_dir: str | None = None, + exclude_invalid_files: bool | None = None, + ignore_prefixes: list[str] | None = None, +) -> FileSystemDataset: ... +@overload +def dataset( + source: list[Dataset], + schema: Schema | None = None, + format: FileFormat | _DatasetFormat | None = None, + filesystem: SupportedFileSystem | str | None = None, + partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, + partition_base_dir: str | None = None, + exclude_invalid_files: bool | None = None, + ignore_prefixes: list[str] | None = None, +) -> UnionDataset: ... +@overload +def dataset( + source: Iterable[RecordBatch] | Iterable[Table] | RecordBatchReader, + schema: Schema | None = None, + format: FileFormat | _DatasetFormat | None = None, + filesystem: SupportedFileSystem | str | None = None, + partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, + partition_base_dir: str | None = None, + exclude_invalid_files: bool | None = None, + ignore_prefixes: list[str] | None = None, +) -> InMemoryDataset: ... +@overload +def dataset( + source: RecordBatch | Table, + schema: Schema | None = None, + format: FileFormat | _DatasetFormat | None = None, + filesystem: SupportedFileSystem | str | None = None, + partitioning: Partitioning | PartitioningFactory | str | list[str] | None = None, + partition_base_dir: str | None = None, + exclude_invalid_files: bool | None = None, + ignore_prefixes: list[str] | None = None, +) -> InMemoryDataset: ... 
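+
+# Illustrative usage sketch for the `dataset()` overloads above (not part of the
+# stub API); the paths, column name, and partition key below are hypothetical.
+#
+#   import pyarrow.dataset as ds
+#   dset = ds.dataset("data/", format="parquet", partitioning="hive")
+#   table = dset.to_table(filter=ds.field("year") == 2024)
+#   ds.write_dataset(table, "out/", format="parquet", partitioning=["year"])
+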
+def write_dataset(
+    data: Dataset | Table | RecordBatch | RecordBatchReader | list[Table] | Iterable[RecordBatch],
+    base_dir: StrPath,
+    *,
+    basename_template: str | None = None,
+    format: FileFormat | _DatasetFormat | None = None,
+    partitioning: Partitioning | list[str] | None = None,
+    partitioning_flavor: str | None = None,
+    schema: Schema | None = None,
+    filesystem: SupportedFileSystem | None = None,
+    file_options: FileWriteOptions | None = None,
+    use_threads: bool = True,
+    max_partitions: int = 1024,
+    max_open_files: int = 1024,
+    max_rows_per_file: int = 0,
+    min_rows_per_group: int = 0,
+    max_rows_per_group: int = 1024 * 1024,
+    file_visitor: Callable[[WrittenFile], None] | None = None,
+    existing_data_behavior: Literal["error", "overwrite_or_ignore", "delete_matching"] = "error",
+    create_dir: bool = True,
+): ...
diff --git a/python/stubs/feather.pyi b/python/stubs/feather.pyi
new file mode 100644
index 00000000000..9451ee15763
--- /dev/null
+++ b/python/stubs/feather.pyi
@@ -0,0 +1,50 @@
+from typing import IO, Literal
+
+import pandas as pd
+
+from _typeshed import StrPath
+from pyarrow._feather import FeatherError
+from pyarrow.lib import Table
+
+__all__ = [
+    "FeatherError",
+    "FeatherDataset",
+    "check_chunked_overflow",
+    "write_feather",
+    "read_feather",
+    "read_table",
+]
+
+class FeatherDataset:
+    path_or_paths: str | list[str]
+    validate_schema: bool
+
+    def __init__(self, path_or_paths: str | list[str], validate_schema: bool = True) -> None: ...
+    def read_table(self, columns: list[str] | None = None) -> Table: ...
+    def validate_schemas(self, piece, table: Table) -> None: ...
+    def read_pandas(
+        self, columns: list[str] | None = None, use_threads: bool = True
+    ) -> pd.DataFrame: ...
+
+def check_chunked_overflow(name: str, col) -> None: ...
+def write_feather(
+    df: pd.DataFrame | Table,
+    dest: StrPath | IO,
+    compression: Literal["zstd", "lz4", "uncompressed"] | None = None,
+    compression_level: int | None = None,
+    chunksize: int | None = None,
+    version: Literal[1, 2] = 2,
+) -> None: ...
+def read_feather(
+    source: StrPath | IO,
+    columns: list[str] | None = None,
+    use_threads: bool = True,
+    memory_map: bool = False,
+    **kwargs,
+) -> pd.DataFrame: ...
+def read_table(
+    source: StrPath | IO,
+    columns: list[str] | None = None,
+    memory_map: bool = False,
+    use_threads: bool = True,
+) -> Table: ...
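+
+# Illustrative round-trip sketch for the functions above (not part of the stub
+# API); the file name is hypothetical.
+#
+#   import pandas as pd
+#   from pyarrow import feather
+#   feather.write_feather(pd.DataFrame({"a": [1, 2, 3]}), "data.feather")
+#   table = feather.read_table("data.feather")
+#   df = feather.read_feather("data.feather")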
diff --git a/python/stubs/flight.pyi b/python/stubs/flight.pyi new file mode 100644 index 00000000000..9b806ccf305 --- /dev/null +++ b/python/stubs/flight.pyi @@ -0,0 +1,95 @@ +from pyarrow._flight import ( + Action, + ActionType, + BasicAuth, + CallInfo, + CertKeyPair, + ClientAuthHandler, + ClientMiddleware, + ClientMiddlewareFactory, + DescriptorType, + FlightCallOptions, + FlightCancelledError, + FlightClient, + FlightDataStream, + FlightDescriptor, + FlightEndpoint, + FlightError, + FlightInfo, + FlightInternalError, + FlightMetadataReader, + FlightMetadataWriter, + FlightMethod, + FlightServerBase, + FlightServerError, + FlightStreamChunk, + FlightStreamReader, + FlightStreamWriter, + FlightTimedOutError, + FlightUnauthenticatedError, + FlightUnauthorizedError, + FlightUnavailableError, + FlightWriteSizeExceededError, + GeneratorStream, + Location, + MetadataRecordBatchReader, + MetadataRecordBatchWriter, + RecordBatchStream, + Result, + SchemaResult, + ServerAuthHandler, + ServerCallContext, + ServerMiddleware, + ServerMiddlewareFactory, + Ticket, + TracingServerMiddlewareFactory, + connect, +) + +__all__ = [ + "Action", + "ActionType", + "BasicAuth", + "CallInfo", + "CertKeyPair", + "ClientAuthHandler", + "ClientMiddleware", + "ClientMiddlewareFactory", + "DescriptorType", + "FlightCallOptions", + "FlightCancelledError", + "FlightClient", + "FlightDataStream", + "FlightDescriptor", + "FlightEndpoint", + "FlightError", + "FlightInfo", + "FlightInternalError", + "FlightMetadataReader", + "FlightMetadataWriter", + "FlightMethod", + "FlightServerBase", + "FlightServerError", + "FlightStreamChunk", + "FlightStreamReader", + "FlightStreamWriter", + "FlightTimedOutError", + "FlightUnauthenticatedError", + "FlightUnauthorizedError", + "FlightUnavailableError", + "FlightWriteSizeExceededError", + "GeneratorStream", + "Location", + "MetadataRecordBatchReader", + "MetadataRecordBatchWriter", + "RecordBatchStream", + "Result", + "SchemaResult", + "ServerAuthHandler", + "ServerCallContext", + "ServerMiddleware", + "ServerMiddlewareFactory", + "Ticket", + "TracingServerMiddlewareFactory", + "connect", +] diff --git a/python/stubs/fs.pyi b/python/stubs/fs.pyi new file mode 100644 index 00000000000..6bf75616c13 --- /dev/null +++ b/python/stubs/fs.pyi @@ -0,0 +1,77 @@ +from pyarrow._fs import ( # noqa + FileSelector, + FileType, + FileInfo, + FileSystem, + LocalFileSystem, + SubTreeFileSystem, + _MockFileSystem, + FileSystemHandler, + PyFileSystem, + SupportedFileSystem, +) +from pyarrow._azurefs import AzureFileSystem +from pyarrow._hdfs import HadoopFileSystem +from pyarrow._gcsfs import GcsFileSystem +from pyarrow._s3fs import ( # noqa + AwsDefaultS3RetryStrategy, + AwsStandardS3RetryStrategy, + S3FileSystem, + S3LogLevel, + S3RetryStrategy, + ensure_s3_initialized, + finalize_s3, + ensure_s3_finalized, + initialize_s3, + resolve_s3_region, +) + +FileStats = FileInfo + +def copy_files( + source: str, + destination: str, + source_filesystem: SupportedFileSystem | None = None, + destination_filesystem: SupportedFileSystem | None = None, + *, + chunk_size: int = 1024 * 1024, + use_threads: bool = True, +) -> None: ... + +class FSSpecHandler(FileSystemHandler): # type: ignore[misc] + fs: SupportedFileSystem + def __init__(self, fs: SupportedFileSystem) -> None: ... 
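+
+# Illustrative usage sketch (not part of the stub API); the paths below are
+# hypothetical.
+#
+#   from pyarrow import fs
+#   local = fs.LocalFileSystem()
+#   infos = local.get_file_info(fs.FileSelector("/tmp", recursive=True))
+#   fs.copy_files("/tmp/src.parquet", "/tmp/dst.parquet", source_filesystem=local)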
+ +__all__ = [ + # _fs + "FileSelector", + "FileType", + "FileInfo", + "FileSystem", + "LocalFileSystem", + "SubTreeFileSystem", + "_MockFileSystem", + "FileSystemHandler", + "PyFileSystem", + # _azurefs + "AzureFileSystem", + # _hdfs + "HadoopFileSystem", + # _gcsfs + "GcsFileSystem", + # _s3fs + "AwsDefaultS3RetryStrategy", + "AwsStandardS3RetryStrategy", + "S3FileSystem", + "S3LogLevel", + "S3RetryStrategy", + "ensure_s3_initialized", + "finalize_s3", + "ensure_s3_finalized", + "initialize_s3", + "resolve_s3_region", + # fs + "FileStats", + "copy_files", + "FSSpecHandler", +] diff --git a/python/stubs/gandiva.pyi b/python/stubs/gandiva.pyi new file mode 100644 index 00000000000..a344f885b29 --- /dev/null +++ b/python/stubs/gandiva.pyi @@ -0,0 +1,65 @@ +from typing import Iterable, Literal + +from .lib import Array, DataType, Field, MemoryPool, RecordBatch, Schema, _Weakrefable + +class Node(_Weakrefable): + def return_type(self) -> DataType: ... + +class Expression(_Weakrefable): + def root(self) -> Node: ... + def result(self) -> Field: ... + +class Condition(_Weakrefable): + def root(self) -> Node: ... + def result(self) -> Field: ... + +class SelectionVector(_Weakrefable): + def to_array(self) -> Array: ... + +class Projector(_Weakrefable): + @property + def llvm_ir(self): ... + def evaluate( + self, batch: RecordBatch, selection: SelectionVector | None = None + ) -> list[Array]: ... + +class Filter(_Weakrefable): + @property + def llvm_ir(self): ... + def evaluate( + self, batch: RecordBatch, pool: MemoryPool, dtype: DataType | str = "int32" + ) -> SelectionVector: ... + +class TreeExprBuilder(_Weakrefable): + def make_literal(self, value: float | str | bytes | bool, dtype: DataType) -> Node: ... + def make_expression(self, root_node: Node, return_field: Field) -> Expression: ... + def make_function(self, name: str, children: list[Node], return_type: DataType) -> Node: ... + def make_field(self, field: Field) -> Node: ... + def make_if( + self, condition: Node, this_node: Node, else_node: Node, return_type: DataType + ) -> Node: ... + def make_and(self, children: list[Node]) -> Node: ... + def make_or(self, children: list[Node]) -> Node: ... + def make_in_expression(self, node: Node, values: Iterable, dtype: DataType) -> Node: ... + def make_condition(self, condition: Node) -> Condition: ... + +class Configuration(_Weakrefable): + def __init__(self, optimize: bool = True, dump_ir: bool = False) -> None: ... + +def make_projector( + schema: Schema, + children: list[Expression], + pool: MemoryPool, + selection_mode: Literal["NONE", "UINT16", "UINT32", "UINT64"] = "NONE", + configuration: Configuration | None = None, +) -> Projector: ... +def make_filter( + schema: Schema, condition: Condition, configuration: Configuration | None = None +) -> Filter: ... + +class FunctionSignature(_Weakrefable): + def return_type(self) -> DataType: ... + def param_types(self) -> list[DataType]: ... + def name(self) -> str: ... + +def get_registered_function_signatures() -> list[FunctionSignature]: ... 
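+
+# Illustrative usage sketch (not part of the stub API), assuming a single int32
+# column named "x"; the names, values, and types below are hypothetical.
+#
+#   import pyarrow as pa
+#   import pyarrow.gandiva as gandiva
+#   schema = pa.schema([pa.field("x", pa.int32())])
+#   builder = gandiva.TreeExprBuilder()
+#   x = builder.make_field(schema.field("x"))
+#   one = builder.make_literal(1, pa.int32())
+#   node = builder.make_function("add", [x, one], pa.int32())
+#   expr = builder.make_expression(node, pa.field("x_plus_1", pa.int32()))
+#   projector = gandiva.make_projector(schema, [expr], pa.default_memory_pool())
+#   batch = pa.record_batch([pa.array([1, 2, 3], pa.int32())], names=["x"])
+#   (result,) = projector.evaluate(batch)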
diff --git a/python/stubs/interchange/__init__.pyi b/python/stubs/interchange/__init__.pyi new file mode 100644 index 00000000000..e69de29bb2d diff --git a/python/stubs/interchange/buffer.pyi b/python/stubs/interchange/buffer.pyi new file mode 100644 index 00000000000..46673961a75 --- /dev/null +++ b/python/stubs/interchange/buffer.pyi @@ -0,0 +1,58 @@ +import enum + +from pyarrow.lib import Buffer + +class DlpackDeviceType(enum.IntEnum): + """Integer enum for device type codes matching DLPack.""" + + CPU = 1 + CUDA = 2 + CPU_PINNED = 3 + OPENCL = 4 + VULKAN = 7 + METAL = 8 + VPI = 9 + ROCM = 10 + +class _PyArrowBuffer: + """ + Data in the buffer is guaranteed to be contiguous in memory. + + Note that there is no dtype attribute present, a buffer can be thought of + as simply a block of memory. However, if the column that the buffer is + attached to has a dtype that's supported by DLPack and ``__dlpack__`` is + implemented, then that dtype information will be contained in the return + value from ``__dlpack__``. + + This distinction is useful to support both data exchange via DLPack on a + buffer and (b) dtypes like variable-length strings which do not have a + fixed number of bytes per element. + """ + def __init__(self, x: Buffer, allow_copy: bool = True) -> None: ... + @property + def bufsize(self) -> int: + """ + Buffer size in bytes. + """ + @property + def ptr(self) -> int: + """ + Pointer to start of the buffer as an integer. + """ + def __dlpack__(self): + """ + Produce DLPack capsule (see array API standard). + + Raises: + - TypeError : if the buffer contains unsupported dtypes. + - NotImplementedError : if DLPack support is not implemented + + Useful to have to connect to array libraries. Support optional because + it's not completely trivial to implement for a Python-only library. + """ + def __dlpack_device__(self) -> tuple[DlpackDeviceType, int | None]: + """ + Device type and device ID for where the data in the buffer resides. + Uses device type codes matching DLPack. + Note: must be implemented even if ``__dlpack__`` is not. + """ diff --git a/python/stubs/interchange/column.pyi b/python/stubs/interchange/column.pyi new file mode 100644 index 00000000000..e6662867b6b --- /dev/null +++ b/python/stubs/interchange/column.pyi @@ -0,0 +1,252 @@ +import enum + +from typing import Any, Iterable, TypeAlias, TypedDict + +from pyarrow.lib import Array, ChunkedArray + +from .buffer import _PyArrowBuffer + +class DtypeKind(enum.IntEnum): + """ + Integer enum for data types. + + Attributes + ---------- + INT : int + Matches to signed integer data type. + UINT : int + Matches to unsigned integer data type. + FLOAT : int + Matches to floating point data type. + BOOL : int + Matches to boolean data type. + STRING : int + Matches to string data type (UTF-8 encoded). + DATETIME : int + Matches to datetime data type. + CATEGORICAL : int + Matches to categorical data type. + """ + + INT = 0 + UINT = 1 + FLOAT = 2 + BOOL = 20 + STRING = 21 # UTF-8 + DATETIME = 22 + CATEGORICAL = 23 + +Dtype: TypeAlias = tuple[DtypeKind, int, str, str] + +class ColumnNullType(enum.IntEnum): + """ + Integer enum for null type representation. + + Attributes + ---------- + NON_NULLABLE : int + Non-nullable column. + USE_NAN : int + Use explicit float NaN value. + USE_SENTINEL : int + Sentinel value besides NaN. + USE_BITMASK : int + The bit is set/unset representing a null on a certain position. + USE_BYTEMASK : int + The byte is set/unset representing a null on a certain position. 
+ """ + + NON_NULLABLE = 0 + USE_NAN = 1 + USE_SENTINEL = 2 + USE_BITMASK = 3 + USE_BYTEMASK = 4 + +class ColumnBuffers(TypedDict): + data: tuple[_PyArrowBuffer, Dtype] + validity: tuple[_PyArrowBuffer, Dtype] | None + offsets: tuple[_PyArrowBuffer, Dtype] | None + +class CategoricalDescription(TypedDict): + is_ordered: bool + is_dictionary: bool + categories: _PyArrowColumn | None + +class Endianness(enum.Enum): + LITTLE = "<" + BIG = ">" + NATIVE = "=" + NA = "|" + +class NoBufferPresent(Exception): + """Exception to signal that there is no requested buffer.""" + +class _PyArrowColumn: + """ + A column object, with only the methods and properties required by the + interchange protocol defined. + + A column can contain one or more chunks. Each chunk can contain up to three + buffers - a data buffer, a mask buffer (depending on null representation), + and an offsets buffer (if variable-size binary; e.g., variable-length + strings). + + TBD: Arrow has a separate "null" dtype, and has no separate mask concept. + Instead, it seems to use "children" for both columns with a bit mask, + and for nested dtypes. Unclear whether this is elegant or confusing. + This design requires checking the null representation explicitly. + + The Arrow design requires checking: + 1. the ARROW_FLAG_NULLABLE (for sentinel values) + 2. if a column has two children, combined with one of those children + having a null dtype. + + Making the mask concept explicit seems useful. One null dtype would + not be enough to cover both bit and byte masks, so that would mean + even more checking if we did it the Arrow way. + + TBD: there's also the "chunk" concept here, which is implicit in Arrow as + multiple buffers per array (= column here). Semantically it may make + sense to have both: chunks were meant for example for lazy evaluation + of data which doesn't fit in memory, while multiple buffers per column + could also come from doing a selection operation on a single + contiguous buffer. + + Given these concepts, one would expect chunks to be all of the same + size (say a 10,000 row dataframe could have 10 chunks of 1,000 rows), + while multiple buffers could have data-dependent lengths. Not an issue + in pandas if one column is backed by a single NumPy array, but in + Arrow it seems possible. + Are multiple chunks *and* multiple buffers per column necessary for + the purposes of this interchange protocol, or must producers either + reuse the chunk concept for this or copy the data? + + Note: this Column object can only be produced by ``__dataframe__``, so + doesn't need its own version or ``__column__`` protocol. + """ + def __init__(self, column: Array | ChunkedArray, allow_copy: bool = True) -> None: ... + def size(self) -> int: + """ + Size of the column, in elements. + + Corresponds to DataFrame.num_rows() if column is a single chunk; + equal to size of this current chunk otherwise. + + Is a method rather than a property because it may cause a (potentially + expensive) computation for some dataframe implementations. + """ + @property + def offset(self) -> int: + """ + Offset of first element. + + May be > 0 if using chunks; for example for a column with N chunks of + equal size M (only the last chunk may be shorter), + ``offset = n * M``, ``n = 0 .. N-1``. + """ + @property + def dtype(self) -> tuple[DtypeKind, int, str, str]: + """ + Dtype description as a tuple ``(kind, bit-width, format string, + endianness)``. 
+ + Bit-width : the number of bits as an integer + Format string : data type description format string in Apache Arrow C + Data Interface format. + Endianness : current only native endianness (``=``) is supported + + Notes: + - Kind specifiers are aligned with DLPack where possible (hence the + jump to 20, leave enough room for future extension) + - Masks must be specified as boolean with either bit width 1 (for + bit masks) or 8 (for byte masks). + - Dtype width in bits was preferred over bytes + - Endianness isn't too useful, but included now in case in the + future we need to support non-native endianness + - Went with Apache Arrow format strings over NumPy format strings + because they're more complete from a dataframe perspective + - Format strings are mostly useful for datetime specification, and + for categoricals. + - For categoricals, the format string describes the type of the + categorical in the data buffer. In case of a separate encoding of + the categorical (e.g. an integer to string mapping), this can + be derived from ``self.describe_categorical``. + - Data types not included: complex, Arrow-style null, binary, + decimal, and nested (list, struct, map, union) dtypes. + """ + @property + def describe_categorical(self) -> CategoricalDescription: + """ + If the dtype is categorical, there are two options: + - There are only values in the data buffer. + - There is a separate non-categorical Column encoding categorical + values. + + Raises TypeError if the dtype is not categorical + + Returns the dictionary with description on how to interpret the + data buffer: + - "is_ordered" : bool, whether the ordering of dictionary indices + is semantically meaningful. + - "is_dictionary" : bool, whether a mapping of + categorical values to other objects exists + - "categories" : Column representing the (implicit) mapping of + indices to category values (e.g. an array of + cat1, cat2, ...). None if not a dictionary-style + categorical. + + TBD: are there any other in-memory representations that are needed? + """ + @property + def describe_null(self) -> tuple[ColumnNullType, Any]: + """ + Return the missing value (or "null") representation the column dtype + uses, as a tuple ``(kind, value)``. + + Value : if kind is "sentinel value", the actual value. If kind is a bit + mask or a byte mask, the value (0 or 1) indicating a missing value. + None otherwise. + """ + @property + def null_count(self) -> int: + """ + Number of null elements, if known. + + Note: Arrow uses -1 to indicate "unknown", but None seems cleaner. + """ + @property + def metadata(self) -> dict[str, Any]: + """ + The metadata for the column. See `DataFrame.metadata` for more details. + """ + def num_chunks(self) -> int: + """ + Return the number of chunks the column consists of. + """ + def get_chunks(self, n_chunks: int | None = None) -> Iterable[_PyArrowColumn]: + """ + Return an iterator yielding the chunks. + + See `DataFrame.get_chunks` for details on ``n_chunks``. + """ + def get_buffers(self) -> ColumnBuffers: + """ + Return a dictionary containing the underlying buffers. + + The returned dictionary has the following contents: + + - "data": a two-element tuple whose first element is a buffer + containing the data and whose second element is the data + buffer's associated dtype. + - "validity": a two-element tuple whose first element is a buffer + containing mask values indicating missing data and + whose second element is the mask value buffer's + associated dtype. 
None if the null representation is + not a bit or byte mask. + - "offsets": a two-element tuple whose first element is a buffer + containing the offset values for variable-size binary + data (e.g., variable-length strings) and whose second + element is the offsets buffer's associated dtype. None + if the data buffer does not have an associated offsets + buffer. + """ diff --git a/python/stubs/interchange/dataframe.pyi b/python/stubs/interchange/dataframe.pyi new file mode 100644 index 00000000000..526a58926a9 --- /dev/null +++ b/python/stubs/interchange/dataframe.pyi @@ -0,0 +1,102 @@ +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from typing import Any, Iterable, Sequence + +from pyarrow.interchange.column import _PyArrowColumn +from pyarrow.lib import RecordBatch, Table + +class _PyArrowDataFrame: + """ + A data frame class, with only the methods required by the interchange + protocol defined. + + A "data frame" represents an ordered collection of named columns. + A column's "name" must be a unique string. + Columns may be accessed by name or by position. + + This could be a public data frame class, or an object with the methods and + attributes defined on this DataFrame class could be returned from the + ``__dataframe__`` method of a public data frame class in a library adhering + to the dataframe interchange protocol specification. + """ + + def __init__( + self, df: Table | RecordBatch, nan_as_null: bool = False, allow_copy: bool = True + ) -> None: ... + def __dataframe__( + self, nan_as_null: bool = False, allow_copy: bool = True + ) -> _PyArrowDataFrame: + """ + Construct a new exchange object, potentially changing the parameters. + ``nan_as_null`` is a keyword intended for the consumer to tell the + producer to overwrite null values in the data with ``NaN``. + It is intended for cases where the consumer does not support the bit + mask or byte mask that is the producer's native representation. + ``allow_copy`` is a keyword that defines whether or not the library is + allowed to make a copy of the data. For example, copying data would be + necessary if a library supports strided buffers, given that this + protocol specifies contiguous buffers. + """ + @property + def metadata(self) -> dict[str, Any]: + """ + The metadata for the data frame, as a dictionary with string keys. The + contents of `metadata` may be anything, they are meant for a library + to store information that it needs to, e.g., roundtrip losslessly or + for two implementations to share data that is not (yet) part of the + interchange protocol specification. For avoiding collisions with other + entries, please add name the keys with the name of the library + followed by a period and the desired name, e.g, ``pandas.indexcol``. + """ + def num_columns(self) -> int: + """ + Return the number of columns in the DataFrame. + """ + def num_rows(self) -> int: + """ + Return the number of rows in the DataFrame, if available. + """ + def num_chunks(self) -> int: + """ + Return the number of chunks the DataFrame consists of. + """ + def column_names(self) -> Iterable[str]: + """ + Return an iterator yielding the column names. + """ + def get_column(self, i: int) -> _PyArrowColumn: + """ + Return the column at the indicated position. + """ + def get_column_by_name(self, name: str) -> _PyArrowColumn: + """ + Return the column whose name is the indicated name. 
+ """ + def get_columns(self) -> Iterable[_PyArrowColumn]: + """ + Return an iterator yielding the columns. + """ + def select_columns(self, indices: Sequence[int]) -> Self: + """ + Create a new DataFrame by selecting a subset of columns by index. + """ + def select_columns_by_name(self, names: Sequence[str]) -> Self: + """ + Create a new DataFrame by selecting a subset of columns by name. + """ + def get_chunks(self, n_chunks: int | None = None) -> Iterable[Self]: + """ + Return an iterator yielding the chunks. + + By default (None), yields the chunks that the data is stored as by the + producer. If given, ``n_chunks`` must be a multiple of + ``self.num_chunks()``, meaning the producer must subdivide each chunk + before yielding it. + + Note that the producer must ensure that all columns are chunked the + same way. + """ diff --git a/python/stubs/interchange/from_dataframe.pyi b/python/stubs/interchange/from_dataframe.pyi new file mode 100644 index 00000000000..b04b6268975 --- /dev/null +++ b/python/stubs/interchange/from_dataframe.pyi @@ -0,0 +1,244 @@ +from typing import Any, Protocol, TypeAlias + +from pyarrow.lib import Array, Buffer, DataType, DictionaryArray, RecordBatch, Table + +from .column import ( + ColumnBuffers, + ColumnNullType, + Dtype, + DtypeKind, +) + +class DataFrameObject(Protocol): + def __dataframe__(self, nan_as_null: bool = False, allow_copy: bool = True) -> Any: ... + +ColumnObject: TypeAlias = Any + +def from_dataframe(df: DataFrameObject, allow_copy=True) -> Table: + """ + Build a ``pa.Table`` from any DataFrame supporting the interchange protocol. + + Parameters + ---------- + df : DataFrameObject + Object supporting the interchange protocol, i.e. `__dataframe__` + method. + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). + + Returns + ------- + pa.Table + + Examples + -------- + >>> import pyarrow + >>> from pyarrow.interchange import from_dataframe + + Convert a pandas dataframe to a pyarrow table: + + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "n_attendees": [100, 10, 1], + ... "country": ["Italy", "Spain", "Slovenia"], + ... } + ... ) + >>> df + n_attendees country + 0 100 Italy + 1 10 Spain + 2 1 Slovenia + >>> from_dataframe(df) + pyarrow.Table + n_attendees: int64 + country: large_string + ---- + n_attendees: [[100,10,1]] + country: [["Italy","Spain","Slovenia"]] + """ + +def protocol_df_chunk_to_pyarrow(df: DataFrameObject, allow_copy: bool = True) -> RecordBatch: + """ + Convert interchange protocol chunk to ``pa.RecordBatch``. + + Parameters + ---------- + df : DataFrameObject + Object supporting the interchange protocol, i.e. `__dataframe__` + method. + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). + + Returns + ------- + pa.RecordBatch + """ + +def column_to_array(col: ColumnObject, allow_copy: bool = True) -> Array: + """ + Convert a column holding one of the primitive dtypes to a PyArrow array. + A primitive type is one of: int, uint, float, bool (1 bit). + + Parameters + ---------- + col : ColumnObject + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). 
+ + Returns + ------- + pa.Array + """ + +def bool_column_to_array(col: ColumnObject, allow_copy: bool = True) -> Array: + """ + Convert a column holding boolean dtype to a PyArrow array. + + Parameters + ---------- + col : ColumnObject + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). + + Returns + ------- + pa.Array + """ + +def categorical_column_to_dictionary( + col: ColumnObject, allow_copy: bool = True +) -> DictionaryArray: + """ + Convert a column holding categorical data to a pa.DictionaryArray. + + Parameters + ---------- + col : ColumnObject + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). + + Returns + ------- + pa.DictionaryArray + """ + +def parse_datetime_format_str(format_str: str) -> tuple[str, str]: + """Parse datetime `format_str` to interpret the `data`.""" + +def map_date_type(data_type: tuple[DtypeKind, int, str, str]) -> DataType: + """Map column date type to pyarrow date type.""" + +def buffers_to_array( + buffers: ColumnBuffers, + data_type: tuple[DtypeKind, int, str, str], + length: int, + describe_null: ColumnNullType, + offset: int = 0, + allow_copy: bool = True, +) -> Array: + """ + Build a PyArrow array from the passed buffer. + + Parameters + ---------- + buffer : ColumnBuffers + Dictionary containing tuples of underlying buffers and + their associated dtype. + data_type : Tuple[DtypeKind, int, str, str], + Dtype description of the column as a tuple ``(kind, bit-width, format string, + endianness)``. + length : int + The number of values in the array. + describe_null: ColumnNullType + Null representation the column dtype uses, + as a tuple ``(kind, value)`` + offset : int, default: 0 + Number of elements to offset from the start of the buffer. + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). + + Returns + ------- + pa.Array + + Notes + ----- + The returned array doesn't own the memory. The caller of this function + is responsible for keeping the memory owner object alive as long as + the returned PyArrow array is being used. + """ + +def validity_buffer_from_mask( + validity_buff: Buffer, + validity_dtype: Dtype, + describe_null: ColumnNullType, + length: int, + offset: int = 0, + allow_copy: bool = True, +) -> Buffer: + """ + Build a PyArrow buffer from the passed mask buffer. + + Parameters + ---------- + validity_buff : BufferObject + Tuple of underlying validity buffer and associated dtype. + validity_dtype : Dtype + Dtype description as a tuple ``(kind, bit-width, format string, + endianness)``. + describe_null : ColumnNullType + Null representation the column dtype uses, + as a tuple ``(kind, value)`` + length : int + The number of values in the array. + offset : int, default: 0 + Number of elements to offset from the start of the buffer. + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). + + Returns + ------- + pa.Buffer + """ + +def validity_buffer_nan_sentinel( + data_pa_buffer: Buffer, + data_type: Dtype, + describe_null: ColumnNullType, + length: int, + offset: int = 0, + allow_copy: bool = True, +) -> Buffer: + """ + Build a PyArrow buffer from NaN or sentinel values. 
+ + Parameters + ---------- + data_pa_buffer : pa.Buffer + PyArrow buffer for the column data. + data_type : Dtype + Dtype description as a tuple ``(kind, bit-width, format string, + endianness)``. + describe_null : ColumnNullType + Null representation the column dtype uses, + as a tuple ``(kind, value)`` + length : int + The number of values in the array. + offset : int, default: 0 + Number of elements to offset from the start of the buffer. + allow_copy : bool, default: True + Whether to allow copying the memory to perform the conversion + (if false then zero-copy approach is requested). + + Returns + ------- + pa.Buffer + """ diff --git a/python/stubs/ipc.pyi b/python/stubs/ipc.pyi new file mode 100644 index 00000000000..c7f2af004d4 --- /dev/null +++ b/python/stubs/ipc.pyi @@ -0,0 +1,123 @@ +from io import IOBase + +import pandas as pd +import pyarrow.lib as lib + +from pyarrow.lib import ( + IpcReadOptions, + IpcWriteOptions, + Message, + MessageReader, + MetadataVersion, + ReadStats, + RecordBatchReader, + WriteStats, + _ReadPandasMixin, + get_record_batch_size, + get_tensor_size, + read_message, + read_record_batch, + read_schema, + read_tensor, + write_tensor, +) + +class RecordBatchStreamReader(lib._RecordBatchStreamReader): + def __init__( + self, + source: bytes | lib.Buffer | lib.NativeFile | IOBase, + *, + options: IpcReadOptions | None = None, + memory_pool: lib.MemoryPool | None = None, + ) -> None: ... + +class RecordBatchStreamWriter(lib._RecordBatchStreamWriter): + def __init__( + self, + sink: str | lib.NativeFile | IOBase, + schema: lib.Schema, + *, + use_legacy_format: bool | None = None, + options: IpcWriteOptions | None = None, + ) -> None: ... + +class RecordBatchFileReader(lib._RecordBatchFileReader): + def __init__( + self, + source: bytes | lib.Buffer | lib.NativeFile | IOBase, + footer_offset: int | None = None, + *, + options: IpcReadOptions | None, + memory_pool: lib.MemoryPool | None = None, + ) -> None: ... + +class RecordBatchFileWriter(lib._RecordBatchFileWriter): + def __init__( + self, + sink: str | lib.NativeFile | IOBase, + schema: lib.Schema, + *, + use_legacy_format: bool | None = None, + options: IpcWriteOptions | None = None, + ) -> None: ... + +def new_stream( + sink: str | lib.NativeFile | IOBase, + schema: lib.Schema, + *, + use_legacy_format: bool | None = None, + options: IpcWriteOptions | None = None, +) -> RecordBatchStreamWriter: ... +def open_stream( + source: bytes | lib.Buffer | lib.NativeFile | IOBase, + *, + options: IpcReadOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> RecordBatchStreamReader: ... +def new_file( + sink: str | lib.NativeFile | IOBase, + schema: lib.Schema, + *, + use_legacy_format: bool | None = None, + options: IpcWriteOptions | None = None, +) -> RecordBatchFileWriter: ... +def open_file( + source: bytes | lib.Buffer | lib.NativeFile | IOBase, + footer_offset: int | None = None, + *, + options: IpcReadOptions | None = None, + memory_pool: lib.MemoryPool | None = None, +) -> RecordBatchFileReader: ... +def serialize_pandas( + df: pd.DataFrame, *, nthreads: int | None = None, preserve_index: bool | None = None +) -> lib.Buffer: ... +def deserialize_pandas(buf: lib.Buffer, *, use_threads: bool = True) -> pd.DataFrame: ... 
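+# NOTE (editorial, illustrative only): a minimal usage sketch showing how the
+# typed IPC entry points above are expected to compose; the file name
+# "example.arrow" and the schema are hypothetical and not part of the stubs.
+#
+#   import pyarrow as pa
+#   import pyarrow.ipc as ipc
+#
+#   schema = pa.schema([("id", pa.int64()), ("name", pa.string())])
+#   batch = pa.record_batch([[1, 2], ["a", "b"]], schema=schema)
+#   with ipc.new_file("example.arrow", schema) as writer:  # RecordBatchFileWriter
+#       writer.write_batch(batch)
+#   reader = ipc.open_file("example.arrow")                # RecordBatchFileReader
+#   table = reader.read_all()                              # pa.Table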
+ +__all__ = [ + "IpcReadOptions", + "IpcWriteOptions", + "Message", + "MessageReader", + "MetadataVersion", + "ReadStats", + "RecordBatchReader", + "WriteStats", + "_ReadPandasMixin", + "get_record_batch_size", + "get_tensor_size", + "read_message", + "read_record_batch", + "read_schema", + "read_tensor", + "write_tensor", + "RecordBatchStreamReader", + "RecordBatchStreamWriter", + "RecordBatchFileReader", + "RecordBatchFileWriter", + "new_stream", + "open_stream", + "new_file", + "open_file", + "serialize_pandas", + "deserialize_pandas", +] diff --git a/python/stubs/json.pyi b/python/stubs/json.pyi new file mode 100644 index 00000000000..db1d35e0b8b --- /dev/null +++ b/python/stubs/json.pyi @@ -0,0 +1,3 @@ +from pyarrow._json import ParseOptions, ReadOptions, open_json, read_json + +__all__ = ["ParseOptions", "ReadOptions", "read_json", "open_json"] diff --git a/python/stubs/lib.pyi b/python/stubs/lib.pyi new file mode 100644 index 00000000000..1698b55520b --- /dev/null +++ b/python/stubs/lib.pyi @@ -0,0 +1,106 @@ +# ruff: noqa: F403 +from typing import NamedTuple + +from .__lib_pxi.array import * +from .__lib_pxi.benchmark import * +from .__lib_pxi.builder import * +from .__lib_pxi.compat import * +from .__lib_pxi.config import * +from .__lib_pxi.device import * +from .__lib_pxi.error import * +from .__lib_pxi.io import * +from .__lib_pxi.ipc import * +from .__lib_pxi.memory import * +from .__lib_pxi.pandas_shim import * +from .__lib_pxi.scalar import * +from .__lib_pxi.table import * +from .__lib_pxi.tensor import * +from .__lib_pxi.types import * + +class MonthDayNano(NamedTuple): + days: int + months: int + nanoseconds: int + +def cpu_count() -> int: + """ + Return the number of threads to use in parallel operations. + + The number of threads is determined at startup by inspecting the + ``OMP_NUM_THREADS`` and ``OMP_THREAD_LIMIT`` environment variables. + If neither is present, it will default to the number of hardware threads + on the system. It can be modified at runtime by calling + :func:`set_cpu_count()`. + + See Also + -------- + set_cpu_count : Modify the size of this pool. + io_thread_count : The analogous function for the I/O thread pool. + """ + +def set_cpu_count(count: int) -> None: + """ + Set the number of threads to use in parallel operations. + + Parameters + ---------- + count : int + The number of concurrent threads that should be used. + + See Also + -------- + cpu_count : Get the size of this pool. + set_io_thread_count : The analogous function for the I/O thread pool. + """ + +def is_threading_enabled() -> bool: + """ + Returns True if threading is enabled in libarrow. + + If it isn't enabled, then python shouldn't create any + threads either, because we're probably on a system where + threading doesn't work (e.g. Emscripten). 
+ """ + +Type_NA: int +Type_BOOL: int +Type_UINT8: int +Type_INT8: int +Type_UINT16: int +Type_INT16: int +Type_UINT32: int +Type_INT32: int +Type_UINT64: int +Type_INT64: int +Type_HALF_FLOAT: int +Type_FLOAT: int +Type_DOUBLE: int +Type_DECIMAL128: int +Type_DECIMAL256: int +Type_DATE32: int +Type_DATE64: int +Type_TIMESTAMP: int +Type_TIME32: int +Type_TIME64: int +Type_DURATION: int +Type_INTERVAL_MONTH_DAY_NANO: int +Type_BINARY: int +Type_STRING: int +Type_LARGE_BINARY: int +Type_LARGE_STRING: int +Type_FIXED_SIZE_BINARY: int +Type_BINARY_VIEW: int +Type_STRING_VIEW: int +Type_LIST: int +Type_LARGE_LIST: int +Type_LIST_VIEW: int +Type_LARGE_LIST_VIEW: int +Type_MAP: int +Type_FIXED_SIZE_LIST: int +Type_STRUCT: int +Type_SPARSE_UNION: int +Type_DENSE_UNION: int +Type_DICTIONARY: int +Type_RUN_END_ENCODED: int +UnionMode_SPARSE: int +UnionMode_DENSE: int diff --git a/python/stubs/orc.pyi b/python/stubs/orc.pyi new file mode 100644 index 00000000000..2eba8d40a11 --- /dev/null +++ b/python/stubs/orc.pyi @@ -0,0 +1,279 @@ +import sys + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from typing import IO, Literal + +from _typeshed import StrPath + +from . import _orc +from ._fs import SupportedFileSystem +from .lib import KeyValueMetadata, NativeFile, RecordBatch, Schema, Table + +class ORCFile: + """ + Reader interface for a single ORC file + + Parameters + ---------- + source : str or pyarrow.NativeFile + Readable source. For passing Python file objects or byte buffers, + see pyarrow.io.PythonFileInterface or pyarrow.io.BufferReader. + """ + + reader: _orc.ORCReader + def __init__(self, source: StrPath | NativeFile | IO) -> None: ... + @property + def metadata(self) -> KeyValueMetadata: + """The file metadata, as an arrow KeyValueMetadata""" + @property + def schema(self) -> Schema: + """The file schema, as an arrow schema""" + @property + def nrows(self) -> int: + """The number of rows in the file""" + @property + def nstripes(self) -> int: + """The number of stripes in the file""" + @property + def file_version(self) -> str: + """Format version of the ORC file, must be 0.11 or 0.12""" + @property + def software_version(self) -> str: + """Software instance and version that wrote this file""" + @property + def compression(self) -> Literal["UNCOMPRESSED", "ZLIB", "SNAPPY", "LZ4", "ZSTD"]: + """Compression codec of the file""" + @property + def compression_size(self) -> int: + """Number of bytes to buffer for the compression codec in the file""" + @property + def writer(self) -> str: + """Name of the writer that wrote this file. 
+ If the writer is unknown then its Writer ID + (a number) is returned""" + @property + def writer_version(self) -> str: + """Version of the writer""" + @property + def row_index_stride(self) -> int: + """Number of rows per an entry in the row index or 0 + if there is no row index""" + @property + def nstripe_statistics(self) -> int: + """Number of stripe statistics""" + @property + def content_length(self) -> int: + """Length of the data stripes in the file in bytes""" + @property + def stripe_statistics_length(self) -> int: + """The number of compressed bytes in the file stripe statistics""" + @property + def file_footer_length(self) -> int: + """The number of compressed bytes in the file footer""" + @property + def file_postscript_length(self) -> int: + """The number of bytes in the file postscript""" + @property + def file_length(self) -> int: + """The number of bytes in the file""" + def read_stripe(self, n: int, columns: list[str] | None = None) -> RecordBatch: + """Read a single stripe from the file. + + Parameters + ---------- + n : int + The stripe index + columns : list + If not None, only these columns will be read from the stripe. A + column name may be a prefix of a nested field, e.g. 'a' will select + 'a.b', 'a.c', and 'a.d.e' + + Returns + ------- + pyarrow.RecordBatch + Content of the stripe as a RecordBatch. + """ + def read(self, columns: list[str] | None = None) -> Table: + """Read the whole file. + + Parameters + ---------- + columns : list + If not None, only these columns will be read from the file. A + column name may be a prefix of a nested field, e.g. 'a' will select + 'a.b', 'a.c', and 'a.d.e'. Output always follows the + ordering of the file and not the `columns` list. + + Returns + ------- + pyarrow.Table + Content of the file as a Table. + """ + +class ORCWriter: + """ + Writer interface for a single ORC file + + Parameters + ---------- + where : str or pyarrow.io.NativeFile + Writable target. For passing Python file objects or byte buffers, + see pyarrow.io.PythonFileInterface, pyarrow.io.BufferOutputStream + or pyarrow.io.FixedSizeBufferWriter. + file_version : {"0.11", "0.12"}, default "0.12" + Determine which ORC file version to use. + `Hive 0.11 / ORC v0 `_ + is the older version + while `Hive 0.12 / ORC v1 `_ + is the newer one. + batch_size : int, default 1024 + Number of rows the ORC writer writes at a time. + stripe_size : int, default 64 * 1024 * 1024 + Size of each ORC stripe in bytes. + compression : string, default 'uncompressed' + The compression codec. + Valid values: {'UNCOMPRESSED', 'SNAPPY', 'ZLIB', 'LZ4', 'ZSTD'} + Note that LZ0 is currently not supported. + compression_block_size : int, default 64 * 1024 + Size of each compression block in bytes. + compression_strategy : string, default 'speed' + The compression strategy i.e. speed vs size reduction. + Valid values: {'SPEED', 'COMPRESSION'} + row_index_stride : int, default 10000 + The row index stride i.e. the number of rows per + an entry in the row index. + padding_tolerance : double, default 0.0 + The padding tolerance. + dictionary_key_size_threshold : double, default 0.0 + The dictionary key size threshold. 0 to disable dictionary encoding. + 1 to always enable dictionary encoding. + bloom_filter_columns : None, set-like or list-like, default None + Columns that use the bloom filter. + bloom_filter_fpp : double, default 0.05 + Upper limit of the false-positive rate of the bloom filter. 
+ """ + + writer: _orc.ORCWriter + is_open: bool + def __init__( + self, + where: StrPath | NativeFile | IO, + *, + file_version: str = "0.12", + batch_size: int = 1024, + stripe_size: int = 64 * 1024 * 1024, + compression: Literal["UNCOMPRESSED", "ZLIB", "SNAPPY", "LZ4", "ZSTD"] = "UNCOMPRESSED", + compression_block_size: int = 65536, + compression_strategy: Literal["COMPRESSION", "SPEED"] = "SPEED", + row_index_stride: int = 10000, + padding_tolerance: float = 0.0, + dictionary_key_size_threshold: float = 0.0, + bloom_filter_columns: list[int] | None = None, + bloom_filter_fpp: float = 0.05, + ): ... + def __enter__(self) -> Self: ... + def __exit__(self, *args, **kwargs) -> None: ... + def write(self, table: Table) -> None: + """ + Write the table into an ORC file. The schema of the table must + be equal to the schema used when opening the ORC file. + + Parameters + ---------- + table : pyarrow.Table + The table to be written into the ORC file + """ + def close(self) -> None: + """ + Close the ORC file + """ + +def read_table( + source: StrPath | NativeFile | IO, + columns: list[str] | None = None, + filesystem: SupportedFileSystem | None = None, +) -> Table: + """ + Read a Table from an ORC file. + + Parameters + ---------- + source : str, pyarrow.NativeFile, or file-like object + If a string passed, can be a single file name. For file-like objects, + only read a single file. Use pyarrow.BufferReader to read a file + contained in a bytes or buffer-like object. + columns : list + If not None, only these columns will be read from the file. A column + name may be a prefix of a nested field, e.g. 'a' will select 'a.b', + 'a.c', and 'a.d.e'. Output always follows the ordering of the file and + not the `columns` list. If empty, no columns will be read. Note + that the table will still have the correct num_rows set despite having + no columns. + filesystem : FileSystem, default None + If nothing passed, will be inferred based on path. + Path will try to be found in the local on-disk filesystem otherwise + it will be parsed as an URI to determine the filesystem. + """ + +def write_table( + table: Table, + where: StrPath | NativeFile | IO, + *, + file_version: str = "0.12", + batch_size: int = 1024, + stripe_size: int = 64 * 1024 * 1024, + compression: Literal["UNCOMPRESSED", "ZLIB", "SNAPPY", "LZ4", "ZSTD"] = "UNCOMPRESSED", + compression_block_size: int = 65536, + compression_strategy: Literal["COMPRESSION", "SPEED"] = "SPEED", + row_index_stride: int = 10000, + padding_tolerance: float = 0.0, + dictionary_key_size_threshold: float = 0.0, + bloom_filter_columns: list[int] | None = None, + bloom_filter_fpp: float = 0.05, +) -> None: + """ + Write a table into an ORC file. + + Parameters + ---------- + table : pyarrow.lib.Table + The table to be written into the ORC file + where : str or pyarrow.io.NativeFile + Writable target. For passing Python file objects or byte buffers, + see pyarrow.io.PythonFileInterface, pyarrow.io.BufferOutputStream + or pyarrow.io.FixedSizeBufferWriter. + file_version : {"0.11", "0.12"}, default "0.12" + Determine which ORC file version to use. + `Hive 0.11 / ORC v0 `_ + is the older version + while `Hive 0.12 / ORC v1 `_ + is the newer one. + batch_size : int, default 1024 + Number of rows the ORC writer writes at a time. + stripe_size : int, default 64 * 1024 * 1024 + Size of each ORC stripe in bytes. + compression : string, default 'uncompressed' + The compression codec. 
+ Valid values: {'UNCOMPRESSED', 'SNAPPY', 'ZLIB', 'LZ4', 'ZSTD'} + Note that LZ0 is currently not supported. + compression_block_size : int, default 64 * 1024 + Size of each compression block in bytes. + compression_strategy : string, default 'speed' + The compression strategy i.e. speed vs size reduction. + Valid values: {'SPEED', 'COMPRESSION'} + row_index_stride : int, default 10000 + The row index stride i.e. the number of rows per + an entry in the row index. + padding_tolerance : double, default 0.0 + The padding tolerance. + dictionary_key_size_threshold : double, default 0.0 + The dictionary key size threshold. 0 to disable dictionary encoding. + 1 to always enable dictionary encoding. + bloom_filter_columns : None, set-like or list-like, default None + Columns that use the bloom filter. + bloom_filter_fpp : double, default 0.05 + Upper limit of the false-positive rate of the bloom filter. + """ diff --git a/python/stubs/pandas_compat.pyi b/python/stubs/pandas_compat.pyi new file mode 100644 index 00000000000..efbd05ac2fe --- /dev/null +++ b/python/stubs/pandas_compat.pyi @@ -0,0 +1,54 @@ +from typing import Any, TypedDict, TypeVar + +import numpy as np +import pandas as pd + +from pandas import DatetimeTZDtype + +from .lib import Array, DataType, Schema, Table + +_T = TypeVar("_T") + +def get_logical_type_map() -> dict[int, str]: ... +def get_logical_type(arrow_type: DataType) -> str: ... +def get_numpy_logical_type_map() -> dict[type[np.generic], str]: ... +def get_logical_type_from_numpy(pandas_collection) -> str: ... +def get_extension_dtype_info(column) -> tuple[str, dict[str, Any]]: ... + +class _ColumnMetadata(TypedDict): + name: str + field_name: str + pandas_type: int + numpy_type: str + metadata: dict | None + +def get_column_metadata( + column: pd.Series | pd.Index, name: str, arrow_type: DataType, field_name: str +) -> _ColumnMetadata: ... +def construct_metadata( + columns_to_convert: list[pd.Series], + df: pd.DataFrame, + column_names: list[str], + index_levels: list[pd.Index], + index_descriptors: list[dict], + preserve_index: bool, + types: list[DataType], + column_field_names: list[str] = ..., +) -> dict[bytes, bytes]: ... +def dataframe_to_types( + df: pd.DataFrame, preserve_index: bool | None, columns: list[str] | None = None +) -> tuple[list[str], list[DataType], dict[bytes, bytes]]: ... +def dataframe_to_arrays( + df: pd.DataFrame, + schema: Schema, + preserve_index: bool | None, + nthreads: int = 1, + columns: list[str] | None = None, + safe: bool = True, +) -> tuple[Array, Schema, int]: ... +def get_datetimetz_type(values: _T, dtype, type_) -> tuple[_T, DataType]: ... +def make_datetimetz(unit: str, tz: str) -> DatetimeTZDtype: ... +def table_to_dataframe( + options, table: Table, categories=None, ignore_metadata: bool = False, types_mapper=None +) -> pd.DataFrame: ... +def make_tz_aware(series: pd.Series, tz: str) -> pd.Series: ... 
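+# NOTE (editorial, illustrative only): a rough sketch of how the typed helpers
+# above line up; pyarrow.pandas_compat is internal API, so the call below is
+# shown purely to illustrate the annotated return types.
+#
+#   import pandas as pd
+#   from pyarrow import pandas_compat
+#
+#   df = pd.DataFrame({"a": [1, 2, 3]})
+#   names, types, metadata = pandas_compat.dataframe_to_types(df, preserve_index=False)
+#   # names: list[str], types: list[DataType], metadata: dict[bytes, bytes]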
diff --git a/python/stubs/parquet/__init__.pyi b/python/stubs/parquet/__init__.pyi new file mode 100644 index 00000000000..4ef88705809 --- /dev/null +++ b/python/stubs/parquet/__init__.pyi @@ -0,0 +1 @@ +from .core import * # noqa diff --git a/python/stubs/parquet/core.pyi b/python/stubs/parquet/core.pyi new file mode 100644 index 00000000000..56b2c8447d9 --- /dev/null +++ b/python/stubs/parquet/core.pyi @@ -0,0 +1,2061 @@ +import sys + +from pathlib import Path + +if sys.version_info >= (3, 11): + from typing import Self +else: + from typing_extensions import Self +from typing import IO, Callable, Iterator, Literal, Sequence + +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias + +from pyarrow import _parquet +from pyarrow._compute import Expression +from pyarrow._fs import FileSystem, SupportedFileSystem +from pyarrow._parquet import ( + ColumnChunkMetaData, + ColumnSchema, + FileDecryptionProperties, + FileEncryptionProperties, + FileMetaData, + ParquetLogicalType, + ParquetReader, + ParquetSchema, + RowGroupMetaData, + SortingColumn, + Statistics, +) +from pyarrow._stubs_typing import FilterTuple, SingleOrList +from pyarrow.dataset import ParquetFileFragment, Partitioning +from pyarrow.lib import NativeFile, RecordBatch, Schema, Table +from typing_extensions import deprecated + +__all__ = ( + "ColumnChunkMetaData", + "ColumnSchema", + "FileDecryptionProperties", + "FileEncryptionProperties", + "FileMetaData", + "ParquetDataset", + "ParquetFile", + "ParquetLogicalType", + "ParquetReader", + "ParquetSchema", + "ParquetWriter", + "RowGroupMetaData", + "SortingColumn", + "Statistics", + "read_metadata", + "read_pandas", + "read_schema", + "read_table", + "write_metadata", + "write_table", + "write_to_dataset", + "_filters_to_expression", + "filters_to_expression", +) + +def filters_to_expression(filters: list[FilterTuple | list[FilterTuple]]) -> Expression: + """ + Check if filters are well-formed and convert to an ``Expression``. + + Parameters + ---------- + filters : List[Tuple] or List[List[Tuple]] + + Notes + ----- + See internal ``pyarrow._DNF_filter_doc`` attribute for more details. + + Examples + -------- + + >>> filters_to_expression([("foo", "==", "bar")]) + + + Returns + ------- + pyarrow.compute.Expression + An Expression representing the filters + """ + +@deprecated("use filters_to_expression") +def _filters_to_expression(filters: list[FilterTuple | list[FilterTuple]]) -> Expression: ... + +_Compression: TypeAlias = Literal["gzip", "bz2", "brotli", "lz4", "zstd", "snappy", "none"] + +class ParquetFile: + """ + Reader interface for a single Parquet file. + + Parameters + ---------- + source : str, pathlib.Path, pyarrow.NativeFile, or file-like object + Readable source. For passing bytes or buffer-like file containing a + Parquet file, use pyarrow.BufferReader. + metadata : FileMetaData, default None + Use existing metadata object, rather than reading from file. + common_metadata : FileMetaData, default None + Will be used in reads for pandas schema metadata if not found in the + main file's metadata, no other uses at the moment. + read_dictionary : list + List of column names to read directly as DictionaryArray. + memory_map : bool, default False + If the source is a file path, use a memory map to read file, which can + improve performance in some environments. + buffer_size : int, default 0 + If positive, perform read buffering when deserializing individual + column chunks. Otherwise IO calls are unbuffered. 
+ pre_buffer : bool, default False + Coalesce and issue file reads in parallel to improve performance on + high-latency filesystems (e.g. S3). If True, Arrow will use a + background I/O thread pool. + coerce_int96_timestamp_unit : str, default None + Cast timestamps that are stored in INT96 format to a particular + resolution (e.g. 'ms'). Setting to None is equivalent to 'ns' + and therefore INT96 timestamps will be inferred as timestamps + in nanoseconds. + decryption_properties : FileDecryptionProperties, default None + File decryption properties for Parquet Modular Encryption. + thrift_string_size_limit : int, default None + If not None, override the maximum total string size allocated + when decoding Thrift structures. The default limit should be + sufficient for most Parquet files. + thrift_container_size_limit : int, default None + If not None, override the maximum total size of containers allocated + when decoding Thrift structures. The default limit should be + sufficient for most Parquet files. + filesystem : FileSystem, default None + If nothing passed, will be inferred based on path. + Path will try to be found in the local on-disk filesystem otherwise + it will be parsed as an URI to determine the filesystem. + page_checksum_verification : bool, default False + If True, verify the checksum for each page read from the file. + + Examples + -------- + + Generate an example PyArrow Table and write it to Parquet file: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "example.parquet") + + Create a ``ParquetFile`` object from the Parquet file: + + >>> parquet_file = pq.ParquetFile("example.parquet") + + Read the data: + + >>> parquet_file.read() + pyarrow.Table + n_legs: int64 + animal: string + ---- + n_legs: [[2,2,4,4,5,100]] + animal: [["Flamingo","Parrot","Dog","Horse","Brittle stars","Centipede"]] + + Create a ParquetFile object with "animal" column as DictionaryArray: + + >>> parquet_file = pq.ParquetFile("example.parquet", read_dictionary=["animal"]) + >>> parquet_file.read() + pyarrow.Table + n_legs: int64 + animal: dictionary + ---- + n_legs: [[2,2,4,4,5,100]] + animal: [ -- dictionary: + ["Flamingo","Parrot",...,"Brittle stars","Centipede"] -- indices: + [0,1,2,3,4,5]] + """ + + reader: ParquetReader + common_metadata: FileMetaData + + def __init__( + self, + source: str | Path | NativeFile | IO, + *, + metadata: FileMetaData | None = None, + common_metadata: FileMetaData | None = None, + read_dictionary: list[str] | None = None, + memory_map: bool = False, + buffer_size: int = 0, + pre_buffer: bool = False, + coerce_int96_timestamp_unit: str | None = None, + decryption_properties: FileDecryptionProperties | None = None, + thrift_string_size_limit: int | None = None, + thrift_container_size_limit: int | None = None, + filesystem: SupportedFileSystem | None = None, + page_checksum_verification: bool = False, + ): ... + def __enter__(self) -> Self: ... + def __exit__(self, *args, **kwargs) -> None: ... + @property + def metadata(self) -> FileMetaData: + """ + Return the Parquet metadata. 
+ """ + @property + def schema(self) -> ParquetSchema: + """ + Return the Parquet schema, unconverted to Arrow types + """ + @property + def schema_arrow(self) -> Schema: + """ + Return the inferred Arrow schema, converted from the whole Parquet + file's schema + + Examples + -------- + Generate an example Parquet file: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "example.parquet") + >>> parquet_file = pq.ParquetFile("example.parquet") + + Read the Arrow schema: + + >>> parquet_file.schema_arrow + n_legs: int64 + animal: string + """ + @property + def num_row_groups(self) -> int: + """ + Return the number of row groups of the Parquet file. + + Examples + -------- + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "example.parquet") + >>> parquet_file = pq.ParquetFile("example.parquet") + + >>> parquet_file.num_row_groups + 1 + """ + def close(self, force: bool = False) -> None: ... + @property + def closed(self) -> bool: ... + def read_row_group( + self, + i: int, + columns: list | None = None, + use_threads: bool = True, + use_pandas_metadata: bool = False, + ) -> Table: + """ + Read a single row group from a Parquet file. + + Parameters + ---------- + i : int + Index of the individual row group that we want to read. + columns : list + If not None, only these columns will be read from the row group. A + column name may be a prefix of a nested field, e.g. 'a' will select + 'a.b', 'a.c', and 'a.d.e'. + use_threads : bool, default True + Perform multi-threaded column reads. + use_pandas_metadata : bool, default False + If True and file has custom pandas schema metadata, ensure that + index columns are also loaded. + + Returns + ------- + pyarrow.table.Table + Content of the row group as a table (of columns) + + Examples + -------- + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "example.parquet") + >>> parquet_file = pq.ParquetFile("example.parquet") + + >>> parquet_file.read_row_group(0) + pyarrow.Table + n_legs: int64 + animal: string + ---- + n_legs: [[2,2,4,4,5,100]] + animal: [["Flamingo","Parrot",...,"Brittle stars","Centipede"]] + """ + def read_row_groups( + self, + row_groups: list, + columns: list | None = None, + use_threads: bool = True, + use_pandas_metadata: bool = False, + ) -> Table: + """ + Read a multiple row groups from a Parquet file. + + Parameters + ---------- + row_groups : list + Only these row groups will be read from the file. + columns : list + If not None, only these columns will be read from the row group. A + column name may be a prefix of a nested field, e.g. 'a' will select + 'a.b', 'a.c', and 'a.d.e'. + use_threads : bool, default True + Perform multi-threaded column reads. + use_pandas_metadata : bool, default False + If True and file has custom pandas schema metadata, ensure that + index columns are also loaded. + + Returns + ------- + pyarrow.table.Table + Content of the row groups as a table (of columns). 
+ + Examples + -------- + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "example.parquet") + >>> parquet_file = pq.ParquetFile("example.parquet") + + >>> parquet_file.read_row_groups([0, 0]) + pyarrow.Table + n_legs: int64 + animal: string + ---- + n_legs: [[2,2,4,4,5,...,2,4,4,5,100]] + animal: [["Flamingo","Parrot","Dog",...,"Brittle stars","Centipede"]] + """ + def iter_batches( + self, + batch_size: int = 65536, + row_groups: list | None = None, + columns: list | None = None, + use_threads: bool = True, + use_pandas_metadata: bool = False, + ) -> Iterator[RecordBatch]: + """ + Read streaming batches from a Parquet file. + + Parameters + ---------- + batch_size : int, default 64K + Maximum number of records to yield per batch. Batches may be + smaller if there aren't enough rows in the file. + row_groups : list + Only these row groups will be read from the file. + columns : list + If not None, only these columns will be read from the file. A + column name may be a prefix of a nested field, e.g. 'a' will select + 'a.b', 'a.c', and 'a.d.e'. + use_threads : boolean, default True + Perform multi-threaded column reads. + use_pandas_metadata : boolean, default False + If True and file has custom pandas schema metadata, ensure that + index columns are also loaded. + + Yields + ------ + pyarrow.RecordBatch + Contents of each batch as a record batch + + Examples + -------- + Generate an example Parquet file: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "example.parquet") + >>> parquet_file = pq.ParquetFile("example.parquet") + >>> for i in parquet_file.iter_batches(): + ... print("RecordBatch") + ... print(i.to_pandas()) + RecordBatch + n_legs animal + 0 2 Flamingo + 1 2 Parrot + 2 4 Dog + 3 4 Horse + 4 5 Brittle stars + 5 100 Centipede + """ + def read( + self, + columns: list | None = None, + use_threads: bool = True, + use_pandas_metadata: bool = False, + ) -> Table: + """ + Read a Table from Parquet format. + + Parameters + ---------- + columns : list + If not None, only these columns will be read from the file. A + column name may be a prefix of a nested field, e.g. 'a' will select + 'a.b', 'a.c', and 'a.d.e'. + use_threads : bool, default True + Perform multi-threaded column reads. + use_pandas_metadata : bool, default False + If True and file has custom pandas schema metadata, ensure that + index columns are also loaded. + + Returns + ------- + pyarrow.table.Table + Content of the file as a table (of columns). + + Examples + -------- + Generate an example Parquet file: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... 
) + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "example.parquet") + >>> parquet_file = pq.ParquetFile("example.parquet") + + Read a Table: + + >>> parquet_file.read(columns=["animal"]) + pyarrow.Table + animal: string + ---- + animal: [["Flamingo","Parrot",...,"Brittle stars","Centipede"]] + """ + def scan_contents(self, columns: list | None = None, batch_size: int = 65536) -> int: + """ + Read contents of file for the given columns and batch size. + + Notes + ----- + This function's primary purpose is benchmarking. + The scan is executed on a single thread. + + Parameters + ---------- + columns : list of integers, default None + Select columns to read, if None scan all columns. + batch_size : int, default 64K + Number of rows to read at a time internally. + + Returns + ------- + num_rows : int + Number of rows in file + + Examples + -------- + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "example.parquet") + >>> parquet_file = pq.ParquetFile("example.parquet") + + >>> parquet_file.scan_contents() + 6 + """ + +class ParquetWriter: + """ + Class for incrementally building a Parquet file for Arrow tables. + + Parameters + ---------- + where : path or file-like object + schema : pyarrow.Schema + version : {"1.0", "2.4", "2.6"}, default "2.6" + Determine which Parquet logical types are available for use, whether the + reduced set from the Parquet 1.x.x format or the expanded logical types + added in later format versions. + Files written with version='2.4' or '2.6' may not be readable in all + Parquet implementations, so version='1.0' is likely the choice that + maximizes file compatibility. + UINT32 and some logical types are only available with version '2.4'. + Nanosecond timestamps are only available with version '2.6'. + Other features such as compression algorithms or the new serialized + data page format must be enabled separately (see 'compression' and + 'data_page_version'). + use_dictionary : bool or list, default True + Specify if we should use dictionary encoding in general or only for + some columns. + When encoding the column, if the dictionary size is too large, the + column will fallback to ``PLAIN`` encoding. Specially, ``BOOLEAN`` type + doesn't support dictionary encoding. + compression : str or dict, default 'snappy' + Specify the compression codec, either on a general basis or per-column. + Valid values: {'NONE', 'SNAPPY', 'GZIP', 'BROTLI', 'LZ4', 'ZSTD'}. + write_statistics : bool or list, default True + Specify if we should write statistics in general (default is True) or only + for some columns. + use_deprecated_int96_timestamps : bool, default None + Write timestamps to INT96 Parquet format. Defaults to False unless enabled + by flavor argument. This take priority over the coerce_timestamps option. + coerce_timestamps : str, default None + Cast timestamps to a particular resolution. If omitted, defaults are chosen + depending on `version`. For ``version='1.0'`` and ``version='2.4'``, + nanoseconds are cast to microseconds ('us'), while for + ``version='2.6'`` (the default), they are written natively without loss + of resolution. Seconds are always cast to milliseconds ('ms') by default, + as Parquet does not have any temporal type with seconds resolution. 
+ If the casting results in loss of data, it will raise an exception + unless ``allow_truncated_timestamps=True`` is given. + Valid values: {None, 'ms', 'us'} + allow_truncated_timestamps : bool, default False + Allow loss of data when coercing timestamps to a particular + resolution. E.g. if microsecond or nanosecond data is lost when coercing to + 'ms', do not raise an exception. Passing ``allow_truncated_timestamp=True`` + will NOT result in the truncation exception being ignored unless + ``coerce_timestamps`` is not None. + data_page_size : int, default None + Set a target threshold for the approximate encoded size of data + pages within a column chunk (in bytes). If None, use the default data page + size of 1MByte. + flavor : {'spark'}, default None + Sanitize schema or set other compatibility options to work with + various target systems. + filesystem : FileSystem, default None + If nothing passed, will be inferred from `where` if path-like, else + `where` is already a file-like object so no filesystem is needed. + compression_level : int or dict, default None + Specify the compression level for a codec, either on a general basis or + per-column. If None is passed, arrow selects the compression level for + the compression codec in use. The compression level has a different + meaning for each codec, so you have to read the documentation of the + codec you are using. + An exception is thrown if the compression codec does not allow specifying + a compression level. + use_byte_stream_split : bool or list, default False + Specify if the byte_stream_split encoding should be used in general or + only for some columns. If both dictionary and byte_stream_stream are + enabled, then dictionary is preferred. + The byte_stream_split encoding is valid for integer, floating-point + and fixed-size binary data types (including decimals); it should be + combined with a compression codec so as to achieve size reduction. + column_encoding : string or dict, default None + Specify the encoding scheme on a per column basis. + Can only be used when ``use_dictionary`` is set to False, and + cannot be used in combination with ``use_byte_stream_split``. + Currently supported values: {'PLAIN', 'BYTE_STREAM_SPLIT', + 'DELTA_BINARY_PACKED', 'DELTA_LENGTH_BYTE_ARRAY', 'DELTA_BYTE_ARRAY'}. + Certain encodings are only compatible with certain data types. + Please refer to the encodings section of `Reading and writing Parquet + files `_. + data_page_version : {"1.0", "2.0"}, default "1.0" + The serialized Parquet data page format version to write, defaults to + 1.0. This does not impact the file schema logical types and Arrow to + Parquet type casting behavior; for that use the "version" option. + use_compliant_nested_type : bool, default True + Whether to write compliant Parquet nested type (lists) as defined + `here `_, defaults to ``True``. 
+ For ``use_compliant_nested_type=True``, this will write into a list + with 3-level structure where the middle level, named ``list``, + is a repeated group with a single field named ``element``:: + + group (LIST) { + repeated group list { + element; + } + } + + For ``use_compliant_nested_type=False``, this will also write into a list + with 3-level structure, where the name of the single field of the middle + level ``list`` is taken from the element name for nested columns in Arrow, + which defaults to ``item``:: + + group (LIST) { + repeated group list { + item; + } + } + encryption_properties : FileEncryptionProperties, default None + File encryption properties for Parquet Modular Encryption. + If None, no encryption will be done. + The encryption properties can be created using: + ``CryptoFactory.file_encryption_properties()``. + write_batch_size : int, default None + Number of values to write to a page at a time. If None, use the default of + 1024. ``write_batch_size`` is complementary to ``data_page_size``. If pages + are exceeding the ``data_page_size`` due to large column values, lowering + the batch size can help keep page sizes closer to the intended size. + dictionary_pagesize_limit : int, default None + Specify the dictionary page size limit per row group. If None, use the + default 1MB. + store_schema : bool, default True + By default, the Arrow schema is serialized and stored in the Parquet + file metadata (in the "ARROW:schema" key). When reading the file, + if this key is available, it will be used to more faithfully recreate + the original Arrow data. For example, for tz-aware timestamp columns + it will restore the timezone (Parquet only stores the UTC values without + timezone), or columns with duration type will be restored from the int64 + Parquet column. + write_page_index : bool, default False + Whether to write a page index in general for all columns. + Writing statistics to the page index disables the old method of writing + statistics to each data page header. The page index makes statistics-based + filtering more efficient than the page header, as it gathers all the + statistics for a Parquet file in a single place, avoiding scattered I/O. + Note that the page index is not yet used on the read size by PyArrow. + write_page_checksum : bool, default False + Whether to write page checksums in general for all columns. + Page checksums enable detection of data corruption, which might occur during + transmission or in the storage. + sorting_columns : Sequence of SortingColumn, default None + Specify the sort order of the data being written. The writer does not sort + the data nor does it verify that the data is sorted. The sort order is + written to the row group metadata, which can then be used by readers. + store_decimal_as_integer : bool, default False + Allow decimals with 1 <= precision <= 18 to be stored as integers. + In Parquet, DECIMAL can be stored in any of the following physical types: + - int32: for 1 <= precision <= 9. + - int64: for 10 <= precision <= 18. + - fixed_len_byte_array: precision is limited by the array size. + Length n can store <= floor(log_10(2^(8*n - 1) - 1)) base-10 digits. + - binary: precision is unlimited. The minimum number of bytes to store the + unscaled value is used. + + By default, this is DISABLED and all decimal types annotate fixed_len_byte_array. + When enabled, the writer will use the following physical types to store decimals: + - int32: for 1 <= precision <= 9. + - int64: for 10 <= precision <= 18. 
+ - fixed_len_byte_array: for precision > 18. + + As a consequence, decimal columns stored in integer types are more compact. + writer_engine_version : unused + **options : dict + If options contains a key `metadata_collector` then the + corresponding value is assumed to be a list (or any object with + `.append` method) that will be filled with the file metadata instance + of the written file. + + Examples + -------- + Generate an example PyArrow Table and RecordBatch: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> batch = pa.record_batch( + ... [ + ... [2, 2, 4, 4, 5, 100], + ... ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... ], + ... names=["n_legs", "animal"], + ... ) + + create a ParquetWriter object: + + >>> import pyarrow.parquet as pq + >>> writer = pq.ParquetWriter("example.parquet", table.schema) + + and write the Table into the Parquet file: + + >>> writer.write_table(table) + >>> writer.close() + + >>> pq.read_table("example.parquet").to_pandas() + n_legs animal + 0 2 Flamingo + 1 2 Parrot + 2 4 Dog + 3 4 Horse + 4 5 Brittle stars + 5 100 Centipede + + create a ParquetWriter object for the RecordBatch: + + >>> writer2 = pq.ParquetWriter("example2.parquet", batch.schema) + + and write the RecordBatch into the Parquet file: + + >>> writer2.write_batch(batch) + >>> writer2.close() + + >>> pq.read_table("example2.parquet").to_pandas() + n_legs animal + 0 2 Flamingo + 1 2 Parrot + 2 4 Dog + 3 4 Horse + 4 5 Brittle stars + 5 100 Centipede + """ + + flavor: str + schema_changed: bool + schema: ParquetSchema + where: str | Path | IO + file_handler: NativeFile | None + writer: _parquet.ParquetWriter + is_open: bool + + def __init__( + self, + where: str | Path | IO | NativeFile, + schema: Schema, + filesystem: SupportedFileSystem | None = None, + flavor: str | None = None, + version: Literal["1.0", "2.4", "2.6"] = ..., + use_dictionary: bool = True, + compression: _Compression | dict[str, _Compression] = "snappy", + write_statistics: bool | list = True, + use_deprecated_int96_timestamps: bool | None = None, + compression_level: int | dict | None = None, + use_byte_stream_split: bool | list = False, + column_encoding: str | dict | None = None, + writer_engine_version=None, + data_page_version: Literal["1.0", "2.0"] = ..., + use_compliant_nested_type: bool = True, + encryption_properties: FileEncryptionProperties | None = None, + write_batch_size: int | None = None, + dictionary_pagesize_limit: int | None = None, + store_schema: bool = True, + write_page_index: bool = False, + write_page_checksum: bool = False, + sorting_columns: Sequence[SortingColumn] | None = None, + store_decimal_as_integer: bool = False, + **options, + ) -> None: ... + def __enter__(self) -> Self: ... + def __exit__(self, *args, **kwargs) -> Literal[False]: ... + def write( + self, table_or_batch: RecordBatch | Table, row_group_size: int | None = None + ) -> None: + """ + Write RecordBatch or Table to the Parquet file. + + Parameters + ---------- + table_or_batch : {RecordBatch, Table} + row_group_size : int, default None + Maximum number of rows in each written row group. If None, + the row group size will be the minimum of the input + table or batch length and 1024 * 1024. + """ + def write_batch(self, batch: RecordBatch, row_group_size: int | None = None) -> None: + """ + Write RecordBatch to the Parquet file. 
+ + Parameters + ---------- + batch : RecordBatch + row_group_size : int, default None + Maximum number of rows in written row group. If None, the + row group size will be the minimum of the RecordBatch + size and 1024 * 1024. If set larger than 64Mi then 64Mi + will be used instead. + """ + def write_table(self, table: Table, row_group_size: int | None = None) -> None: + """ + Write Table to the Parquet file. + + Parameters + ---------- + table : Table + row_group_size : int, default None + Maximum number of rows in each written row group. If None, + the row group size will be the minimum of the Table size + and 1024 * 1024. If set larger than 64Mi then 64Mi will + be used instead. + + """ + def close(self) -> None: + """ + Close the connection to the Parquet file. + """ + def add_key_value_metadata(self, key_value_metadata: dict[str, str]) -> None: + """ + Add key-value metadata to the file. + This will overwrite any existing metadata with the same key. + + Parameters + ---------- + key_value_metadata : dict + Keys and values must be string-like / coercible to bytes. + """ + +class ParquetDataset: + """ + Encapsulates details of reading a complete Parquet dataset possibly + consisting of multiple files and partitions in subdirectories. + + Parameters + ---------- + path_or_paths : str or List[str] + A directory name, single file name, or list of file names. + filesystem : FileSystem, default None + If nothing passed, will be inferred based on path. + Path will try to be found in the local on-disk filesystem otherwise + it will be parsed as an URI to determine the filesystem. + schema : pyarrow.parquet.Schema + Optionally provide the Schema for the Dataset, in which case it will + not be inferred from the source. + filters : pyarrow.compute.Expression or List[Tuple] or List[List[Tuple]], default None + Rows which do not match the filter predicate will be removed from scanned + data. Partition keys embedded in a nested directory structure will be + exploited to avoid loading files at all if they contain no matching rows. + Within-file level filtering and different partitioning schemes are supported. + + Predicates are expressed using an ``Expression`` or using + the disjunctive normal form (DNF), like ``[[('x', '=', 0), ...], ...]``. + DNF allows arbitrary boolean logical combinations of single column predicates. + The innermost tuples each describe a single column predicate. The list of inner + predicates is interpreted as a conjunction (AND), forming a more selective and + multiple column predicate. Finally, the most outer list combines these filters + as a disjunction (OR). + + Predicates may also be passed as List[Tuple]. This form is interpreted + as a single conjunction. To express OR in predicates, one must + use the (preferred) List[List[Tuple]] notation. + + Each tuple has format: (``key``, ``op``, ``value``) and compares the + ``key`` with the ``value``. + The supported ``op`` are: ``=`` or ``==``, ``!=``, ``<``, ``>``, ``<=``, + ``>=``, ``in`` and ``not in``. If the ``op`` is ``in`` or ``not in``, the + ``value`` must be a collection such as a ``list``, a ``set`` or a + ``tuple``. + + Examples: + + Using the ``Expression`` API: + + .. code-block:: python + + import pyarrow.compute as pc + pc.field('x') = 0 + pc.field('y').isin(['a', 'b', 'c']) + ~pc.field('y').isin({'a', 'b'}) + + Using the DNF format: + + .. 
code-block:: python + + ("x", "=", 0) + ("y", "in", ["a", "b", "c"]) + ("z", "not in", {"a", "b"}) + + + read_dictionary : list, default None + List of names or column paths (for nested types) to read directly + as DictionaryArray. Only supported for BYTE_ARRAY storage. To read + a flat column as dictionary-encoded pass the column name. For + nested types, you must pass the full column "path", which could be + something like level1.level2.list.item. Refer to the Parquet + file's schema to obtain the paths. + memory_map : bool, default False + If the source is a file path, use a memory map to read file, which can + improve performance in some environments. + buffer_size : int, default 0 + If positive, perform read buffering when deserializing individual + column chunks. Otherwise IO calls are unbuffered. + partitioning : pyarrow.dataset.Partitioning or str or list of str, default "hive" + The partitioning scheme for a partitioned dataset. The default of "hive" + assumes directory names with key=value pairs like "/year=2009/month=11". + In addition, a scheme like "/2009/11" is also supported, in which case + you need to specify the field names or a full schema. See the + ``pyarrow.dataset.partitioning()`` function for more details. + ignore_prefixes : list, optional + Files matching any of these prefixes will be ignored by the + discovery process. + This is matched to the basename of a path. + By default this is ['.', '_']. + Note that discovery happens only if a directory is passed as source. + pre_buffer : bool, default True + Coalesce and issue file reads in parallel to improve performance on + high-latency filesystems (e.g. S3, GCS). If True, Arrow will use a + background I/O thread pool. If using a filesystem layer that itself + performs readahead (e.g. fsspec's S3FS), disable readahead for best + results. Set to False if you want to prioritize minimal memory usage + over maximum speed. + coerce_int96_timestamp_unit : str, default None + Cast timestamps that are stored in INT96 format to a particular resolution + (e.g. 'ms'). Setting to None is equivalent to 'ns' and therefore INT96 + timestamps will be inferred as timestamps in nanoseconds. + decryption_properties : FileDecryptionProperties or None + File-level decryption properties. + The decryption properties can be created using + ``CryptoFactory.file_decryption_properties()``. + thrift_string_size_limit : int, default None + If not None, override the maximum total string size allocated + when decoding Thrift structures. The default limit should be + sufficient for most Parquet files. + thrift_container_size_limit : int, default None + If not None, override the maximum total size of containers allocated + when decoding Thrift structures. The default limit should be + sufficient for most Parquet files. + page_checksum_verification : bool, default False + If True, verify the page checksum for each page read from the file. + + Examples + -------- + Generate an example PyArrow Table and write it to a partitioned dataset: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... 
) + >>> import pyarrow.parquet as pq + >>> pq.write_to_dataset(table, root_path="dataset_v2", partition_cols=["year"]) + + create a ParquetDataset object from the dataset source: + + >>> dataset = pq.ParquetDataset("dataset_v2/") + + and read the data: + + >>> dataset.read().to_pandas() + n_legs animal year + 0 5 Brittle stars 2019 + 1 2 Flamingo 2020 + 2 4 Dog 2021 + 3 100 Centipede 2021 + 4 2 Parrot 2022 + 5 4 Horse 2022 + + create a ParquetDataset object with filter: + + >>> dataset = pq.ParquetDataset("dataset_v2/", filters=[("n_legs", "=", 4)]) + >>> dataset.read().to_pandas() + n_legs animal year + 0 4 Dog 2021 + 1 4 Horse 2022 + """ + def __init__( + self, + path_or_paths: SingleOrList[str] + | SingleOrList[Path] + | SingleOrList[NativeFile] + | SingleOrList[IO], + filesystem: SupportedFileSystem | None = None, + schema: Schema | None = None, + *, + filters: Expression | FilterTuple | list[FilterTuple] | None = None, + read_dictionary: list[str] | None = None, + memory_map: bool = False, + buffer_size: int = 0, + partitioning: str | list[str] | Partitioning | None = "hive", + ignore_prefixes: list[str] | None = None, + pre_buffer: bool = True, + coerce_int96_timestamp_unit: str | None = None, + decryption_properties: FileDecryptionProperties | None = None, + thrift_string_size_limit: int | None = None, + thrift_container_size_limit: int | None = None, + page_checksum_verification: bool = False, + ): ... + def equals(self, other: ParquetDataset) -> bool: ... + @property + def schema(self) -> Schema: + """ + Schema of the Dataset. + + Examples + -------- + Generate an example dataset: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_to_dataset(table, root_path="dataset_v2_schema", partition_cols=["year"]) + >>> dataset = pq.ParquetDataset("dataset_v2_schema/") + + Read the schema: + + >>> dataset.schema + n_legs: int64 + animal: string + year: dictionary + """ + def read( + self, + columns: list[str] | None = None, + use_threads: bool = True, + use_pandas_metadata: bool = False, + ) -> Table: + """ + Read (multiple) Parquet files as a single pyarrow.Table. + + Parameters + ---------- + columns : List[str] + Names of columns to read from the dataset. The partition fields + are not automatically included. + use_threads : bool, default True + Perform multi-threaded column reads. + use_pandas_metadata : bool, default False + If True and file has custom pandas schema metadata, ensure that + index columns are also loaded. + + Returns + ------- + pyarrow.Table + Content of the file as a table (of columns). + + Examples + -------- + Generate an example dataset: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_to_dataset(table, root_path="dataset_v2_read", partition_cols=["year"]) + >>> dataset = pq.ParquetDataset("dataset_v2_read/") + + Read the dataset: + + >>> dataset.read(columns=["n_legs"]) + pyarrow.Table + n_legs: int64 + ---- + n_legs: [[5],[2],[4,100],[2,4]] + """ + def read_pandas(self, **kwargs) -> Table: + """ + Read dataset including pandas metadata, if any. 
Other arguments passed + through to :func:`read`, see docstring for further details. + + Parameters + ---------- + **kwargs : optional + Additional options for :func:`read` + + Examples + -------- + Generate an example parquet file: + + >>> import pyarrow as pa + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> table = pa.Table.from_pandas(df) + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "table_V2.parquet") + >>> dataset = pq.ParquetDataset("table_V2.parquet") + + Read the dataset with pandas metadata: + + >>> dataset.read_pandas(columns=["n_legs"]) + pyarrow.Table + n_legs: int64 + ---- + n_legs: [[2,2,4,4,5,100]] + + >>> dataset.read_pandas(columns=["n_legs"]).schema.pandas_metadata + {'index_columns': [{'kind': 'range', 'name': None, 'start': 0, ...} + """ + @property + def fragments(self) -> list[ParquetFileFragment]: + """ + A list of the Dataset source fragments or pieces with absolute + file paths. + + Examples + -------- + Generate an example dataset: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_to_dataset(table, root_path="dataset_v2_fragments", partition_cols=["year"]) + >>> dataset = pq.ParquetDataset("dataset_v2_fragments/") + + List the fragments: + + >>> dataset.fragments + [<pyarrow.dataset.ParquetFileFragment path=dataset_v2_fragments/... + """ + @property + def files(self) -> list[str]: + """ + A list of absolute Parquet file paths in the Dataset source. + + Examples + -------- + Generate an example dataset: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + >>> import pyarrow.parquet as pq + >>> pq.write_to_dataset(table, root_path="dataset_v2_files", partition_cols=["year"]) + >>> dataset = pq.ParquetDataset("dataset_v2_files/") + + List the files: + + >>> dataset.files + ['dataset_v2_files/year=2019/...-0.parquet', ... + """ + @property + def filesystem(self) -> FileSystem: + """ + The filesystem type of the Dataset source. + """ + @property + def partitioning(self) -> Partitioning: + """ + The partitioning of the Dataset source, if discovered.
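+
+        Examples
+        --------
+        A minimal illustrative sketch (assumes the hive-partitioned
+        ``dataset_v2`` directory written in the class-level examples;
+        the exact repr may differ):
+
+        >>> import pyarrow.parquet as pq  # doctest: +SKIP
+        >>> pq.ParquetDataset("dataset_v2/").partitioning  # doctest: +SKIP
+        <pyarrow.dataset.HivePartitioning ...>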
+ """ + +def read_table( + source: SingleOrList[str] | SingleOrList[Path] | SingleOrList[NativeFile] | SingleOrList[IO], + *, + columns: list | None = None, + use_threads: bool = True, + schema: Schema | None = None, + use_pandas_metadata: bool = False, + read_dictionary: list[str] | None = None, + memory_map: bool = False, + buffer_size: int = 0, + partitioning: str | list[str] | Partitioning | None = "hive", + filesystem: SupportedFileSystem | None = None, + filters: Expression | FilterTuple | list[FilterTuple] | None = None, + ignore_prefixes: list[str] | None = None, + pre_buffer: bool = True, + coerce_int96_timestamp_unit: str | None = None, + decryption_properties: FileDecryptionProperties | None = None, + thrift_string_size_limit: int | None = None, + thrift_container_size_limit: int | None = None, + page_checksum_verification: bool = False, +) -> Table: + """ + Read a Table from Parquet format + + Parameters + ---------- + source : str, pyarrow.NativeFile, or file-like object + If a string passed, can be a single file name or directory name. For + file-like objects, only read a single file. Use pyarrow.BufferReader to + read a file contained in a bytes or buffer-like object. + columns : list + If not None, only these columns will be read from the file. A column + name may be a prefix of a nested field, e.g. 'a' will select 'a.b', + 'a.c', and 'a.d.e'. If empty, no columns will be read. Note + that the table will still have the correct num_rows set despite having + no columns. + use_threads : bool, default True + Perform multi-threaded column reads. + schema : Schema, optional + Optionally provide the Schema for the parquet dataset, in which case it + will not be inferred from the source. + use_pandas_metadata : bool, default False + If True and file has custom pandas schema metadata, ensure that + index columns are also loaded. + read_dictionary : list, default None + List of names or column paths (for nested types) to read directly + as DictionaryArray. Only supported for BYTE_ARRAY storage. To read + a flat column as dictionary-encoded pass the column name. For + nested types, you must pass the full column "path", which could be + something like level1.level2.list.item. Refer to the Parquet + file's schema to obtain the paths. + memory_map : bool, default False + If the source is a file path, use a memory map to read file, which can + improve performance in some environments. + buffer_size : int, default 0 + If positive, perform read buffering when deserializing individual + column chunks. Otherwise IO calls are unbuffered. + partitioning : pyarrow.dataset.Partitioning or str or list of str, default "hive" + The partitioning scheme for a partitioned dataset. The default of "hive" + assumes directory names with key=value pairs like "/year=2009/month=11". + In addition, a scheme like "/2009/11" is also supported, in which case + you need to specify the field names or a full schema. See the + ``pyarrow.dataset.partitioning()`` function for more details. + filesystem : FileSystem, default None + If nothing passed, will be inferred based on path. + Path will try to be found in the local on-disk filesystem otherwise + it will be parsed as an URI to determine the filesystem. + filters : pyarrow.compute.Expression or List[Tuple] or List[List[Tuple]], default None + Rows which do not match the filter predicate will be removed from scanned + data. Partition keys embedded in a nested directory structure will be + exploited to avoid loading files at all if they contain no matching rows. 
+ Within-file level filtering and different partitioning schemes are supported. + + Predicates are expressed using an ``Expression`` or using + the disjunctive normal form (DNF), like ``[[('x', '=', 0), ...], ...]``. + DNF allows arbitrary boolean logical combinations of single column predicates. + The innermost tuples each describe a single column predicate. The list of inner + predicates is interpreted as a conjunction (AND), forming a more selective and + multiple column predicate. Finally, the most outer list combines these filters + as a disjunction (OR). + + Predicates may also be passed as List[Tuple]. This form is interpreted + as a single conjunction. To express OR in predicates, one must + use the (preferred) List[List[Tuple]] notation. + + Each tuple has format: (``key``, ``op``, ``value``) and compares the + ``key`` with the ``value``. + The supported ``op`` are: ``=`` or ``==``, ``!=``, ``<``, ``>``, ``<=``, + ``>=``, ``in`` and ``not in``. If the ``op`` is ``in`` or ``not in``, the + ``value`` must be a collection such as a ``list``, a ``set`` or a + ``tuple``. + + Examples: + + Using the ``Expression`` API: + + .. code-block:: python + + import pyarrow.compute as pc + pc.field('x') = 0 + pc.field('y').isin(['a', 'b', 'c']) + ~pc.field('y').isin({'a', 'b'}) + + Using the DNF format: + + .. code-block:: python + + ("x", "=", 0) + ("y", "in", ["a", "b", "c"]) + ("z", "not in", {"a", "b"}) + + + ignore_prefixes : list, optional + Files matching any of these prefixes will be ignored by the + discovery process. + This is matched to the basename of a path. + By default this is ['.', '_']. + Note that discovery happens only if a directory is passed as source. + pre_buffer : bool, default True + Coalesce and issue file reads in parallel to improve performance on + high-latency filesystems (e.g. S3). If True, Arrow will use a + background I/O thread pool. If using a filesystem layer that itself + performs readahead (e.g. fsspec's S3FS), disable readahead for best + results. + coerce_int96_timestamp_unit : str, default None + Cast timestamps that are stored in INT96 format to a particular + resolution (e.g. 'ms'). Setting to None is equivalent to 'ns' + and therefore INT96 timestamps will be inferred as timestamps + in nanoseconds. + decryption_properties : FileDecryptionProperties or None + File-level decryption properties. + The decryption properties can be created using + ``CryptoFactory.file_decryption_properties()``. + thrift_string_size_limit : int, default None + If not None, override the maximum total string size allocated + when decoding Thrift structures. The default limit should be + sufficient for most Parquet files. + thrift_container_size_limit : int, default None + If not None, override the maximum total size of containers allocated + when decoding Thrift structures. The default limit should be + sufficient for most Parquet files. + page_checksum_verification : bool, default False + If True, verify the checksum for each page read from the file. + + Returns + ------- + pyarrow.Table + Content of the file as a table (of columns) + + + Examples + -------- + + Generate an example PyArrow Table and write it to a partitioned dataset: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... 
) + >>> import pyarrow.parquet as pq + >>> pq.write_to_dataset(table, root_path="dataset_name_2", partition_cols=["year"]) + + Read the data: + + >>> pq.read_table("dataset_name_2").to_pandas() + n_legs animal year + 0 5 Brittle stars 2019 + 1 2 Flamingo 2020 + 2 4 Dog 2021 + 3 100 Centipede 2021 + 4 2 Parrot 2022 + 5 4 Horse 2022 + + + Read only a subset of columns: + + >>> pq.read_table("dataset_name_2", columns=["n_legs", "animal"]) + pyarrow.Table + n_legs: int64 + animal: string + ---- + n_legs: [[5],[2],[4,100],[2,4]] + animal: [["Brittle stars"],["Flamingo"],["Dog","Centipede"],["Parrot","Horse"]] + + Read a subset of columns and read one column as DictionaryArray: + + >>> pq.read_table("dataset_name_2", columns=["n_legs", "animal"], read_dictionary=["animal"]) + pyarrow.Table + n_legs: int64 + animal: dictionary + ---- + n_legs: [[5],[2],[4,100],[2,4]] + animal: [ -- dictionary: + ["Brittle stars"] -- indices: + [0], -- dictionary: + ["Flamingo"] -- indices: + [0], -- dictionary: + ["Dog","Centipede"] -- indices: + [0,1], -- dictionary: + ["Parrot","Horse"] -- indices: + [0,1]] + + Read the table with filter: + + >>> pq.read_table( + ... "dataset_name_2", columns=["n_legs", "animal"], filters=[("n_legs", "<", 4)] + ... ).to_pandas() + n_legs animal + 0 2 Flamingo + 1 2 Parrot + + Read data from a single Parquet file: + + >>> pq.write_table(table, "example.parquet") + >>> pq.read_table("dataset_name_2").to_pandas() + n_legs animal year + 0 5 Brittle stars 2019 + 1 2 Flamingo 2020 + 2 4 Dog 2021 + 3 100 Centipede 2021 + 4 2 Parrot 2022 + 5 4 Horse 2022 + """ + +def read_pandas( + source: str | Path | NativeFile | IO, columns: list | None = None, **kwargs +) -> Table: + """ + + Read a Table from Parquet format, also reading DataFrame + index values if known in the file metadata + + Parameters + ---------- + source : str, pyarrow.NativeFile, or file-like object + If a string passed, can be a single file name or directory name. For + file-like objects, only read a single file. Use pyarrow.BufferReader to + read a file contained in a bytes or buffer-like object. + columns : list + If not None, only these columns will be read from the file. A column + name may be a prefix of a nested field, e.g. 'a' will select 'a.b', + 'a.c', and 'a.d.e'. If empty, no columns will be read. Note + that the table will still have the correct num_rows set despite having + no columns. + use_threads : bool, default True + Perform multi-threaded column reads. + schema : Schema, optional + Optionally provide the Schema for the parquet dataset, in which case it + will not be inferred from the source. + read_dictionary : list, default None + List of names or column paths (for nested types) to read directly + as DictionaryArray. Only supported for BYTE_ARRAY storage. To read + a flat column as dictionary-encoded pass the column name. For + nested types, you must pass the full column "path", which could be + something like level1.level2.list.item. Refer to the Parquet + file's schema to obtain the paths. + memory_map : bool, default False + If the source is a file path, use a memory map to read file, which can + improve performance in some environments. + buffer_size : int, default 0 + If positive, perform read buffering when deserializing individual + column chunks. Otherwise IO calls are unbuffered. + partitioning : pyarrow.dataset.Partitioning or str or list of str, default "hive" + The partitioning scheme for a partitioned dataset. 
The default of "hive" + assumes directory names with key=value pairs like "/year=2009/month=11". + In addition, a scheme like "/2009/11" is also supported, in which case + you need to specify the field names or a full schema. See the + ``pyarrow.dataset.partitioning()`` function for more details. + **kwargs + additional options for :func:`read_table` + filesystem : FileSystem, default None + If nothing passed, will be inferred based on path. + Path will try to be found in the local on-disk filesystem otherwise + it will be parsed as an URI to determine the filesystem. + filters : pyarrow.compute.Expression or List[Tuple] or List[List[Tuple]], default None + Rows which do not match the filter predicate will be removed from scanned + data. Partition keys embedded in a nested directory structure will be + exploited to avoid loading files at all if they contain no matching rows. + Within-file level filtering and different partitioning schemes are supported. + + Predicates are expressed using an ``Expression`` or using + the disjunctive normal form (DNF), like ``[[('x', '=', 0), ...], ...]``. + DNF allows arbitrary boolean logical combinations of single column predicates. + The innermost tuples each describe a single column predicate. The list of inner + predicates is interpreted as a conjunction (AND), forming a more selective and + multiple column predicate. Finally, the most outer list combines these filters + as a disjunction (OR). + + Predicates may also be passed as List[Tuple]. This form is interpreted + as a single conjunction. To express OR in predicates, one must + use the (preferred) List[List[Tuple]] notation. + + Each tuple has format: (``key``, ``op``, ``value``) and compares the + ``key`` with the ``value``. + The supported ``op`` are: ``=`` or ``==``, ``!=``, ``<``, ``>``, ``<=``, + ``>=``, ``in`` and ``not in``. If the ``op`` is ``in`` or ``not in``, the + ``value`` must be a collection such as a ``list``, a ``set`` or a + ``tuple``. + + Examples: + + Using the ``Expression`` API: + + .. code-block:: python + + import pyarrow.compute as pc + pc.field('x') = 0 + pc.field('y').isin(['a', 'b', 'c']) + ~pc.field('y').isin({'a', 'b'}) + + Using the DNF format: + + .. code-block:: python + + ("x", "=", 0) + ("y", "in", ["a", "b", "c"]) + ("z", "not in", {"a", "b"}) + + + ignore_prefixes : list, optional + Files matching any of these prefixes will be ignored by the + discovery process. + This is matched to the basename of a path. + By default this is ['.', '_']. + Note that discovery happens only if a directory is passed as source. + pre_buffer : bool, default True + Coalesce and issue file reads in parallel to improve performance on + high-latency filesystems (e.g. S3). If True, Arrow will use a + background I/O thread pool. If using a filesystem layer that itself + performs readahead (e.g. fsspec's S3FS), disable readahead for best + results. + coerce_int96_timestamp_unit : str, default None + Cast timestamps that are stored in INT96 format to a particular + resolution (e.g. 'ms'). Setting to None is equivalent to 'ns' + and therefore INT96 timestamps will be inferred as timestamps + in nanoseconds. + decryption_properties : FileDecryptionProperties or None + File-level decryption properties. + The decryption properties can be created using + ``CryptoFactory.file_decryption_properties()``. + thrift_string_size_limit : int, default None + If not None, override the maximum total string size allocated + when decoding Thrift structures. 
The default limit should be + sufficient for most Parquet files. + thrift_container_size_limit : int, default None + If not None, override the maximum total size of containers allocated + when decoding Thrift structures. The default limit should be + sufficient for most Parquet files. + page_checksum_verification : bool, default False + If True, verify the checksum for each page read from the file. + + Returns + ------- + pyarrow.Table + Content of the file as a Table of Columns, including DataFrame + indexes as columns + """ + +def write_table( + table: Table, + where: str | Path | NativeFile | IO, + row_group_size: int | None = None, + version: Literal["1.0", "2.4", "2.6"] = "2.6", + use_dictionary: bool = True, + compression: _Compression | dict[str, _Compression] = "snappy", + write_statistics: bool | list = True, + use_deprecated_int96_timestamps: bool | None = None, + coerce_timestamps: str | None = None, + allow_truncated_timestamps: bool = False, + data_page_size: int | None = None, + flavor: str | None = None, + filesystem: SupportedFileSystem | None = None, + compression_level: int | dict | None = None, + use_byte_stream_split: bool = False, + column_encoding: str | dict | None = None, + data_page_version: Literal["1.0", "2.0"] = ..., + use_compliant_nested_type: bool = True, + encryption_properties: FileEncryptionProperties | None = None, + write_batch_size: int | None = None, + dictionary_pagesize_limit: int | None = None, + store_schema: bool = True, + write_page_index: bool = False, + write_page_checksum: bool = False, + sorting_columns: Sequence[SortingColumn] | None = None, + store_decimal_as_integer: bool = False, + **kwargs, +) -> None: + """ + + Write a Table to Parquet format. + + Parameters + ---------- + table : pyarrow.Table + where : string or pyarrow.NativeFile + row_group_size : int + Maximum number of rows in each written row group. If None, the + row group size will be the minimum of the Table size and + 1024 * 1024. + version : {"1.0", "2.4", "2.6"}, default "2.6" + Determine which Parquet logical types are available for use, whether the + reduced set from the Parquet 1.x.x format or the expanded logical types + added in later format versions. + Files written with version='2.4' or '2.6' may not be readable in all + Parquet implementations, so version='1.0' is likely the choice that + maximizes file compatibility. + UINT32 and some logical types are only available with version '2.4'. + Nanosecond timestamps are only available with version '2.6'. + Other features such as compression algorithms or the new serialized + data page format must be enabled separately (see 'compression' and + 'data_page_version'). + use_dictionary : bool or list, default True + Specify if we should use dictionary encoding in general or only for + some columns. + When encoding the column, if the dictionary size is too large, the + column will fallback to ``PLAIN`` encoding. Specially, ``BOOLEAN`` type + doesn't support dictionary encoding. + compression : str or dict, default 'snappy' + Specify the compression codec, either on a general basis or per-column. + Valid values: {'NONE', 'SNAPPY', 'GZIP', 'BROTLI', 'LZ4', 'ZSTD'}. + write_statistics : bool or list, default True + Specify if we should write statistics in general (default is True) or only + for some columns. + use_deprecated_int96_timestamps : bool, default None + Write timestamps to INT96 Parquet format. Defaults to False unless enabled + by flavor argument. This take priority over the coerce_timestamps option. 
+ coerce_timestamps : str, default None + Cast timestamps to a particular resolution. If omitted, defaults are chosen + depending on `version`. For ``version='1.0'`` and ``version='2.4'``, + nanoseconds are cast to microseconds ('us'), while for + ``version='2.6'`` (the default), they are written natively without loss + of resolution. Seconds are always cast to milliseconds ('ms') by default, + as Parquet does not have any temporal type with seconds resolution. + If the casting results in loss of data, it will raise an exception + unless ``allow_truncated_timestamps=True`` is given. + Valid values: {None, 'ms', 'us'} + allow_truncated_timestamps : bool, default False + Allow loss of data when coercing timestamps to a particular + resolution. E.g. if microsecond or nanosecond data is lost when coercing to + 'ms', do not raise an exception. Passing ``allow_truncated_timestamps=True`` + will NOT result in the truncation exception being ignored unless + ``coerce_timestamps`` is not None. + data_page_size : int, default None + Set a target threshold for the approximate encoded size of data + pages within a column chunk (in bytes). If None, use the default data page + size of 1MByte. + flavor : {'spark'}, default None + Sanitize schema or set other compatibility options to work with + various target systems. + filesystem : FileSystem, default None + If nothing passed, will be inferred from `where` if path-like, else + `where` is already a file-like object so no filesystem is needed. + compression_level : int or dict, default None + Specify the compression level for a codec, either on a general basis or + per-column. If None is passed, arrow selects the compression level for + the compression codec in use. The compression level has a different + meaning for each codec, so you have to read the documentation of the + codec you are using. + An exception is thrown if the compression codec does not allow specifying + a compression level. + use_byte_stream_split : bool or list, default False + Specify if the byte_stream_split encoding should be used in general or + only for some columns. If both dictionary and byte_stream_split are + enabled, then dictionary is preferred. + The byte_stream_split encoding is valid for integer, floating-point + and fixed-size binary data types (including decimals); it should be + combined with a compression codec so as to achieve size reduction. + column_encoding : string or dict, default None + Specify the encoding scheme on a per column basis. + Can only be used when ``use_dictionary`` is set to False, and + cannot be used in combination with ``use_byte_stream_split``. + Currently supported values: {'PLAIN', 'BYTE_STREAM_SPLIT', + 'DELTA_BINARY_PACKED', 'DELTA_LENGTH_BYTE_ARRAY', 'DELTA_BYTE_ARRAY'}. + Certain encodings are only compatible with certain data types. + Please refer to the encodings section of `Reading and writing Parquet + files <https://arrow.apache.org/docs/cpp/parquet.html#encodings>`_. + data_page_version : {"1.0", "2.0"}, default "1.0" + The serialized Parquet data page format version to write, defaults to + 1.0. This does not impact the file schema logical types and Arrow to + Parquet type casting behavior; for that use the "version" option. + use_compliant_nested_type : bool, default True + Whether to write compliant Parquet nested type (lists) as defined + `here <https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#nested-types>`_, defaults to ``True``.
+ For ``use_compliant_nested_type=True``, this will write into a list + with 3-level structure where the middle level, named ``list``, + is a repeated group with a single field named ``element``:: + + group (LIST) { + repeated group list { + element; + } + } + + For ``use_compliant_nested_type=False``, this will also write into a list + with 3-level structure, where the name of the single field of the middle + level ``list`` is taken from the element name for nested columns in Arrow, + which defaults to ``item``:: + + group (LIST) { + repeated group list { + item; + } + } + encryption_properties : FileEncryptionProperties, default None + File encryption properties for Parquet Modular Encryption. + If None, no encryption will be done. + The encryption properties can be created using: + ``CryptoFactory.file_encryption_properties()``. + write_batch_size : int, default None + Number of values to write to a page at a time. If None, use the default of + 1024. ``write_batch_size`` is complementary to ``data_page_size``. If pages + are exceeding the ``data_page_size`` due to large column values, lowering + the batch size can help keep page sizes closer to the intended size. + dictionary_pagesize_limit : int, default None + Specify the dictionary page size limit per row group. If None, use the + default 1MB. + store_schema : bool, default True + By default, the Arrow schema is serialized and stored in the Parquet + file metadata (in the "ARROW:schema" key). When reading the file, + if this key is available, it will be used to more faithfully recreate + the original Arrow data. For example, for tz-aware timestamp columns + it will restore the timezone (Parquet only stores the UTC values without + timezone), or columns with duration type will be restored from the int64 + Parquet column. + write_page_index : bool, default False + Whether to write a page index in general for all columns. + Writing statistics to the page index disables the old method of writing + statistics to each data page header. The page index makes statistics-based + filtering more efficient than the page header, as it gathers all the + statistics for a Parquet file in a single place, avoiding scattered I/O. + Note that the page index is not yet used on the read size by PyArrow. + write_page_checksum : bool, default False + Whether to write page checksums in general for all columns. + Page checksums enable detection of data corruption, which might occur during + transmission or in the storage. + sorting_columns : Sequence of SortingColumn, default None + Specify the sort order of the data being written. The writer does not sort + the data nor does it verify that the data is sorted. The sort order is + written to the row group metadata, which can then be used by readers. + store_decimal_as_integer : bool, default False + Allow decimals with 1 <= precision <= 18 to be stored as integers. + In Parquet, DECIMAL can be stored in any of the following physical types: + - int32: for 1 <= precision <= 9. + - int64: for 10 <= precision <= 18. + - fixed_len_byte_array: precision is limited by the array size. + Length n can store <= floor(log_10(2^(8*n - 1) - 1)) base-10 digits. + - binary: precision is unlimited. The minimum number of bytes to store the + unscaled value is used. + + By default, this is DISABLED and all decimal types annotate fixed_len_byte_array. + When enabled, the writer will use the following physical types to store decimals: + - int32: for 1 <= precision <= 9. + - int64: for 10 <= precision <= 18. 
+ - fixed_len_byte_array: for precision > 18. + + As a consequence, decimal columns stored in integer types are more compact. + + **kwargs : optional + Additional options for ParquetWriter + + Examples + -------- + Generate an example PyArrow Table: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + + and write the Table into Parquet file: + + >>> import pyarrow.parquet as pq + >>> pq.write_table(table, "example.parquet") + + Defining row group size for the Parquet file: + + >>> pq.write_table(table, "example.parquet", row_group_size=3) + + Defining row group compression (default is Snappy): + + >>> pq.write_table(table, "example.parquet", compression="none") + + Defining row group compression and encoding per-column: + + >>> pq.write_table( + ... table, + ... "example.parquet", + ... compression={"n_legs": "snappy", "animal": "gzip"}, + ... use_dictionary=["n_legs", "animal"], + ... ) + + Defining column encoding per-column: + + >>> pq.write_table( + ... table, "example.parquet", column_encoding={"animal": "PLAIN"}, use_dictionary=False + ... ) + """ + +def write_to_dataset( + table: Table, + root_path: str | Path, + partition_cols: list[str] | None = None, + filesystem: SupportedFileSystem | None = None, + schema: Schema | None = None, + partitioning: Partitioning | list[str] | None = None, + basename_template: str | None = None, + use_threads: bool | None = None, + file_visitor: Callable[[str], None] | None = None, + existing_data_behavior: Literal["overwrite_or_ignore", "error", "delete_matching"] + | None = None, + **kwargs, +) -> None: + """ + Wrapper around dataset.write_dataset for writing a Table to + Parquet format by partitions. + For each combination of partition columns and values, + a subdirectories are created in the following + manner: + + root_dir/ + group1=value1 + group2=value1 + .parquet + group2=value2 + .parquet + group1=valueN + group2=value1 + .parquet + group2=valueN + .parquet + + Parameters + ---------- + table : pyarrow.Table + root_path : str, pathlib.Path + The root directory of the dataset. + partition_cols : list, + Column names by which to partition the dataset. + Columns are partitioned in the order they are given. + filesystem : FileSystem, default None + If nothing passed, will be inferred based on path. + Path will try to be found in the local on-disk filesystem otherwise + it will be parsed as an URI to determine the filesystem. + schema : Schema, optional + This Schema of the dataset. + partitioning : Partitioning or list[str], optional + The partitioning scheme specified with the + ``pyarrow.dataset.partitioning()`` function or a list of field names. + When providing a list of field names, you can use + ``partitioning_flavor`` to drive which partitioning type should be + used. + basename_template : str, optional + A template string used to generate basenames of written data files. + The token '{i}' will be replaced with an automatically incremented + integer. If not specified, it defaults to "guid-{i}.parquet". + use_threads : bool, default True + Write files in parallel. If enabled, then maximum parallelism will be + used determined by the number of available CPU cores. + file_visitor : function + If set, this function will be called with a WrittenFile instance + for each file created during the call. This object will have both + a path attribute and a metadata attribute. 
+ + The path attribute will be a string containing the path to + the created file. + + The metadata attribute will be the parquet metadata of the file. + This metadata will have the file path attribute set and can be used + to build a _metadata file. The metadata attribute will be None if + the format is not parquet. + + Example visitor which simple collects the filenames created:: + + visited_paths = [] + + def file_visitor(written_file): + visited_paths.append(written_file.path) + + existing_data_behavior : 'overwrite_or_ignore' | 'error' | 'delete_matching' + Controls how the dataset will handle data that already exists in + the destination. The default behaviour is 'overwrite_or_ignore'. + + 'overwrite_or_ignore' will ignore any existing data and will + overwrite files with the same name as an output file. Other + existing files will be ignored. This behavior, in combination + with a unique basename_template for each write, will allow for + an append workflow. + + 'error' will raise an error if any data exists in the destination. + + 'delete_matching' is useful when you are writing a partitioned + dataset. The first time each partition directory is encountered + the entire directory will be deleted. This allows you to overwrite + old partitions completely. + **kwargs : dict, + Used as additional kwargs for :func:`pyarrow.dataset.write_dataset` + function for matching kwargs, and remainder to + :func:`pyarrow.dataset.ParquetFileFormat.make_write_options`. + See the docstring of :func:`write_table` and + :func:`pyarrow.dataset.write_dataset` for the available options. + Using `metadata_collector` in kwargs allows one to collect the + file metadata instances of dataset pieces. The file paths in the + ColumnChunkMetaData will be set relative to `root_path`. + + Examples + -------- + Generate an example PyArrow Table: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "year": [2020, 2022, 2021, 2022, 2019, 2021], + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... "animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + + and write it to a partitioned dataset: + + >>> import pyarrow.parquet as pq + >>> pq.write_to_dataset(table, root_path="dataset_name_3", partition_cols=["year"]) + >>> pq.ParquetDataset("dataset_name_3").files + ['dataset_name_3/year=2019/...-0.parquet', ... + + Write a single Parquet file into the root folder: + + >>> pq.write_to_dataset(table, root_path="dataset_name_4") + >>> pq.ParquetDataset("dataset_name_4/").files + ['dataset_name_4/...-0.parquet'] + """ + +def write_metadata( + schema: Schema, + where: str | NativeFile, + metadata_collector: list[FileMetaData] | None = None, + filesystem: SupportedFileSystem | None = None, + **kwargs, +) -> None: + """ + Write metadata-only Parquet file from schema. This can be used with + `write_to_dataset` to generate `_common_metadata` and `_metadata` sidecar + files. + + Parameters + ---------- + schema : pyarrow.Schema + where : string or pyarrow.NativeFile + metadata_collector : list + where to collect metadata information. + filesystem : FileSystem, default None + If nothing passed, will be inferred from `where` if path-like, else + `where` is already a file-like object so no filesystem is needed. + **kwargs : dict, + Additional kwargs for ParquetWriter class. See docstring for + `ParquetWriter` for more information. + + Examples + -------- + Generate example data: + + >>> import pyarrow as pa + >>> table = pa.table( + ... { + ... "n_legs": [2, 2, 4, 4, 5, 100], + ... 
"animal": ["Flamingo", "Parrot", "Dog", "Horse", "Brittle stars", "Centipede"], + ... } + ... ) + + Write a dataset and collect metadata information. + + >>> metadata_collector = [] + >>> import pyarrow.parquet as pq + >>> pq.write_to_dataset(table, "dataset_metadata", metadata_collector=metadata_collector) + + Write the `_common_metadata` parquet file without row groups statistics. + + >>> pq.write_metadata(table.schema, "dataset_metadata/_common_metadata") + + Write the `_metadata` parquet file with row groups statistics. + + >>> pq.write_metadata( + ... table.schema, "dataset_metadata/_metadata", metadata_collector=metadata_collector + ... ) + """ + +def read_metadata( + where: str | Path | IO | NativeFile, + memory_map: bool = False, + decryption_properties: FileDecryptionProperties | None = None, + filesystem: SupportedFileSystem | None = None, +) -> FileMetaData: + """ + Read FileMetaData from footer of a single Parquet file. + + Parameters + ---------- + where : str (file path) or file-like object + memory_map : bool, default False + Create memory map when the source is a file path. + decryption_properties : FileDecryptionProperties, default None + Decryption properties for reading encrypted Parquet files. + filesystem : FileSystem, default None + If nothing passed, will be inferred based on path. + Path will try to be found in the local on-disk filesystem otherwise + it will be parsed as an URI to determine the filesystem. + + Returns + ------- + metadata : FileMetaData + The metadata of the Parquet file + + Examples + -------- + >>> import pyarrow as pa + >>> import pyarrow.parquet as pq + >>> table = pa.table({"n_legs": [4, 5, 100], "animal": ["Dog", "Brittle stars", "Centipede"]}) + >>> pq.write_table(table, "example.parquet") + + >>> pq.read_metadata("example.parquet") + + created_by: parquet-cpp-arrow version ... + num_columns: 2 + num_rows: 3 + num_row_groups: 1 + format_version: 2.6 + serialized_size: ... + """ + +def read_schema( + where: str | Path | IO | NativeFile, + memory_map: bool = False, + decryption_properties: FileDecryptionProperties | None = None, + filesystem: SupportedFileSystem | None = None, +) -> Schema: + """ + Read effective Arrow schema from Parquet file metadata. + + Parameters + ---------- + where : str (file path) or file-like object + memory_map : bool, default False + Create memory map when the source is a file path. + decryption_properties : FileDecryptionProperties, default None + Decryption properties for reading encrypted Parquet files. + filesystem : FileSystem, default None + If nothing passed, will be inferred based on path. + Path will try to be found in the local on-disk filesystem otherwise + it will be parsed as an URI to determine the filesystem. 
+ + Returns + ------- + schema : pyarrow.Schema + The schema of the Parquet file + + Examples + -------- + >>> import pyarrow as pa + >>> import pyarrow.parquet as pq + >>> table = pa.table({"n_legs": [4, 5, 100], "animal": ["Dog", "Brittle stars", "Centipede"]}) + >>> pq.write_table(table, "example.parquet") + + >>> pq.read_schema("example.parquet") + n_legs: int64 + animal: string + """ diff --git a/python/stubs/parquet/encryption.pyi b/python/stubs/parquet/encryption.pyi new file mode 100644 index 00000000000..5a77dae7ef7 --- /dev/null +++ b/python/stubs/parquet/encryption.pyi @@ -0,0 +1,15 @@ +from pyarrow._parquet_encryption import ( + CryptoFactory, + DecryptionConfiguration, + EncryptionConfiguration, + KmsClient, + KmsConnectionConfig, +) + +__all__ = [ + "CryptoFactory", + "DecryptionConfiguration", + "EncryptionConfiguration", + "KmsClient", + "KmsConnectionConfig", +] diff --git a/python/stubs/substrait.pyi b/python/stubs/substrait.pyi new file mode 100644 index 00000000000..a56a8a5b40f --- /dev/null +++ b/python/stubs/substrait.pyi @@ -0,0 +1,21 @@ +from pyarrow._substrait import ( + BoundExpressions, + SubstraitSchema, + deserialize_expressions, + deserialize_schema, + get_supported_functions, + run_query, + serialize_expressions, + serialize_schema, +) + +__all__ = [ + "BoundExpressions", + "get_supported_functions", + "run_query", + "deserialize_expressions", + "serialize_expressions", + "deserialize_schema", + "serialize_schema", + "SubstraitSchema", +] diff --git a/python/stubs/types.pyi b/python/stubs/types.pyi new file mode 100644 index 00000000000..0cb4f6171d3 --- /dev/null +++ b/python/stubs/types.pyi @@ -0,0 +1,194 @@ +import sys + +from typing import Any + +if sys.version_info >= (3, 13): + from typing import TypeIs +else: + from typing_extensions import TypeIs +if sys.version_info >= (3, 10): + from typing import TypeAlias +else: + from typing_extensions import TypeAlias + +from pyarrow.lib import ( + BinaryType, + BinaryViewType, + BoolType, + DataType, + Date32Type, + Date64Type, + Decimal32Type, + Decimal64Type, + Decimal128Type, + Decimal256Type, + DenseUnionType, + DictionaryType, + DurationType, + FixedSizeBinaryType, + FixedSizeListType, + Float16Type, + Float32Type, + Float64Type, + Int8Type, + Int16Type, + Int32Type, + Int64Type, + LargeBinaryType, + LargeListType, + LargeListViewType, + LargeStringType, + ListType, + ListViewType, + MapType, + MonthDayNanoIntervalType, + NullType, + RunEndEncodedType, + SparseUnionType, + StringType, + StringViewType, + StructType, + Time32Type, + Time64Type, + TimestampType, + UInt8Type, + UInt16Type, + Uint32Type, + UInt64Type, +) + +_SignedInteger: TypeAlias = Int8Type | Int16Type | Int32Type | Int64Type +_UnsignedInteger: TypeAlias = UInt8Type | UInt16Type | Uint32Type | UInt64Type +_Integer: TypeAlias = _SignedInteger | _UnsignedInteger +_Floating: TypeAlias = Float16Type | Float32Type | Float64Type +_Decimal: TypeAlias = ( + Decimal32Type[Any, Any] + | Decimal64Type[Any, Any] + | Decimal128Type[Any, Any] + | Decimal256Type[Any, Any] +) +_Date: TypeAlias = Date32Type | Date64Type +_Time: TypeAlias = Time32Type[Any] | Time64Type[Any] +_Interval: TypeAlias = MonthDayNanoIntervalType +_Temporal: TypeAlias = TimestampType[Any, Any] | DurationType[Any] | _Time | _Date | _Interval +_Union: TypeAlias = SparseUnionType | DenseUnionType +_Nested: TypeAlias = ( + ListType[Any] + | FixedSizeListType[Any, Any] + | LargeListType[Any] + | ListViewType[Any] + | LargeListViewType[Any] + | StructType + | MapType[Any, Any, Any] + | 
_Union +) + +def is_null(t: DataType) -> TypeIs[NullType]: ... +def is_boolean(t: DataType) -> TypeIs[BoolType]: ... +def is_integer(t: DataType) -> TypeIs[_Integer]: ... +def is_signed_integer(t: DataType) -> TypeIs[_SignedInteger]: ... +def is_unsigned_integer(t: DataType) -> TypeIs[_UnsignedInteger]: ... +def is_int8(t: DataType) -> TypeIs[Int8Type]: ... +def is_int16(t: DataType) -> TypeIs[Int16Type]: ... +def is_int32(t: DataType) -> TypeIs[Int32Type]: ... +def is_int64(t: DataType) -> TypeIs[Int64Type]: ... +def is_uint8(t: DataType) -> TypeIs[UInt8Type]: ... +def is_uint16(t: DataType) -> TypeIs[UInt16Type]: ... +def is_uint32(t: DataType) -> TypeIs[Uint32Type]: ... +def is_uint64(t: DataType) -> TypeIs[UInt64Type]: ... +def is_floating(t: DataType) -> TypeIs[_Floating]: ... +def is_float16(t: DataType) -> TypeIs[Float16Type]: ... +def is_float32(t: DataType) -> TypeIs[Float32Type]: ... +def is_float64(t: DataType) -> TypeIs[Float64Type]: ... +def is_list(t: DataType) -> TypeIs[ListType[Any]]: ... +def is_large_list(t: DataType) -> TypeIs[LargeListType[Any]]: ... +def is_fixed_size_list(t: DataType) -> TypeIs[FixedSizeListType[Any, Any]]: ... +def is_list_view(t: DataType) -> TypeIs[ListViewType[Any]]: ... +def is_large_list_view(t: DataType) -> TypeIs[LargeListViewType[Any]]: ... +def is_struct(t: DataType) -> TypeIs[StructType]: ... +def is_union(t: DataType) -> TypeIs[_Union]: ... +def is_nested(t: DataType) -> TypeIs[_Nested]: ... +def is_run_end_encoded(t: DataType) -> TypeIs[RunEndEncodedType[Any, Any]]: ... +def is_temporal(t: DataType) -> TypeIs[_Temporal]: ... +def is_timestamp(t: DataType) -> TypeIs[TimestampType[Any, Any]]: ... +def is_duration(t: DataType) -> TypeIs[DurationType[Any]]: ... +def is_time(t: DataType) -> TypeIs[_Time]: ... +def is_time32(t: DataType) -> TypeIs[Time32Type[Any]]: ... +def is_time64(t: DataType) -> TypeIs[Time64Type[Any]]: ... +def is_binary(t: DataType) -> TypeIs[BinaryType]: ... +def is_large_binary(t: DataType) -> TypeIs[LargeBinaryType]: ... +def is_unicode(t: DataType) -> TypeIs[StringType]: ... +def is_string(t: DataType) -> TypeIs[StringType]: ... +def is_large_unicode(t: DataType) -> TypeIs[LargeStringType]: ... +def is_large_string(t: DataType) -> TypeIs[LargeStringType]: ... +def is_fixed_size_binary(t: DataType) -> TypeIs[FixedSizeBinaryType]: ... +def is_binary_view(t: DataType) -> TypeIs[BinaryViewType]: ... +def is_string_view(t: DataType) -> TypeIs[StringViewType]: ... +def is_date(t: DataType) -> TypeIs[_Date]: ... +def is_date32(t: DataType) -> TypeIs[Date32Type]: ... +def is_date64(t: DataType) -> TypeIs[Date64Type]: ... +def is_map(t: DataType) -> TypeIs[MapType[Any, Any, Any]]: ... +def is_decimal(t: DataType) -> TypeIs[_Decimal]: ... +def is_decimal32(t: DataType) -> TypeIs[Decimal32Type[Any, Any]]: ... +def is_decimal64(t: DataType) -> TypeIs[Decimal64Type[Any, Any]]: ... +def is_decimal128(t: DataType) -> TypeIs[Decimal128Type[Any, Any]]: ... +def is_decimal256(t: DataType) -> TypeIs[Decimal256Type[Any, Any]]: ... +def is_dictionary(t: DataType) -> TypeIs[DictionaryType[Any, Any, Any]]: ... +def is_interval(t: DataType) -> TypeIs[_Interval]: ... +def is_primitive(t: DataType) -> bool: ... 
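+
+# Illustrative sketch only (not part of the stub API): because the predicates
+# above return ``TypeIs[...]``, a static type checker narrows the argument in
+# the guarded branch. Uses only public pyarrow names::
+#
+#     import pyarrow as pa
+#     import pyarrow.types as pa_types
+#
+#     def describe(t: pa.DataType) -> str:
+#         if pa_types.is_timestamp(t):
+#             # ``t`` is narrowed to TimestampType here, so ``.unit`` and
+#             # ``.tz`` type-check.
+#             return f"timestamp[{t.unit}, tz={t.tz}]"
+#         return str(t)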
+ +__all__ = [ + "is_binary", + "is_binary_view", + "is_boolean", + "is_date", + "is_date32", + "is_date64", + "is_decimal", + "is_decimal128", + "is_decimal256", + "is_decimal32", + "is_decimal64", + "is_dictionary", + "is_duration", + "is_fixed_size_binary", + "is_fixed_size_list", + "is_float16", + "is_float32", + "is_float64", + "is_floating", + "is_int16", + "is_int32", + "is_int64", + "is_int8", + "is_integer", + "is_interval", + "is_large_binary", + "is_large_list", + "is_large_list_view", + "is_large_string", + "is_large_unicode", + "is_list", + "is_list_view", + "is_map", + "is_nested", + "is_null", + "is_primitive", + "is_run_end_encoded", + "is_signed_integer", + "is_string", + "is_string_view", + "is_struct", + "is_temporal", + "is_time", + "is_time32", + "is_time64", + "is_timestamp", + "is_uint16", + "is_uint32", + "is_uint64", + "is_uint8", + "is_unicode", + "is_union", + "is_unsigned_integer", +] diff --git a/python/stubs/util.pyi b/python/stubs/util.pyi new file mode 100644 index 00000000000..c2ecf7d6b61 --- /dev/null +++ b/python/stubs/util.pyi @@ -0,0 +1,27 @@ +from collections.abc import Callable +from os import PathLike +from typing import Any, Protocol, Sequence, TypeVar + +_F = TypeVar("_F", bound=Callable) +_N = TypeVar("_N") + +class _DocStringComponents(Protocol): + _docstring_components: list[str] + +def doc( + *docstrings: str | _DocStringComponents | Callable | None, **params: Any +) -> Callable[[_F], _F]: ... +def _is_iterable(obj) -> bool: ... +def _is_path_like(path) -> bool: ... +def _stringify_path(path: str | PathLike) -> str: ... +def product(seq: Sequence[_N]) -> _N: ... +def get_contiguous_span( + shape: tuple[int, ...], strides: tuple[int, ...], itemsize: int +) -> tuple[int, int]: ... +def find_free_port() -> int: ... +def guid() -> str: ... +def _download_urllib(url, out_path) -> None: ... +def _download_requests(url, out_path) -> None: ... +def download_tzdata_on_windows() -> None: ... +def _deprecate_api(old_name, new_name, api, next_version, type=...): ... +def _deprecate_class(old_name, new_class, next_version, instancecheck=True): ... From 70807bbb5547573bf95f760619f80c2ed0785c79 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 24 Jul 2025 19:30:47 +0200 Subject: [PATCH 02/32] GH-7: [Python] Fix invalid-context-manager error (#30) * Add py.typed file to signify that the library is typed See the relevant PEP https://peps.python.org/pep-0561 * Prepare `pyarrow-stubs` for history merging MINOR: [Python] Prepare `pyarrow-stubs` for history merging Co-authored-by: ZhengYu, Xu * Add `ty` configuration and suppress error codes * One line per rule * Add licence header from original repo for all `.pyi` files * Revert "Add licence header from original repo for all `.pyi` files" This reverts commit 1631f3916479ce9e1fd7df1194f61cb420962fd5. * Prepare for licence merging * Exclude `stubs` from `rat` test * Add Apache licence clause to `py.typed` * Reduce list * Add `ty` as a step in the action * Run in the correct directory * Remove `check` from `pip` * Fix `unresolved-reference` error * Revert "Fix `unresolved-reference` error" This reverts commit 7ee3d2f04b2a01a2f1dacccc6d0995f48a3d374d. * Fix invalid-context-manager --------- Co-authored-by: Patrick J. 
Roddy --- python/pyarrow/tests/test_flight.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/tests/test_flight.py b/python/pyarrow/tests/test_flight.py index e9e99d8eb83..42de960ac04 100644 --- a/python/pyarrow/tests/test_flight.py +++ b/python/pyarrow/tests/test_flight.py @@ -49,8 +49,14 @@ ClientMiddleware, ClientMiddlewareFactory, ) except ImportError: + class context_like(object): + def __enter__(self): + return self + def __exit__(self, exc_type, exc_value, traceback): + pass + flight = None - FlightClient, FlightServerBase = object, object + FlightClient, FlightServerBase = context_like, context_like ServerAuthHandler, ClientAuthHandler = object, object ServerMiddleware, ServerMiddlewareFactory = object, object ClientMiddleware, ClientMiddlewareFactory = object, object From e69c7f54656db27035f43927773abd8eb70f4e08 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 24 Jul 2025 19:38:10 +0200 Subject: [PATCH 03/32] GH-8: [Python] Fix invalid-type-form (#31) * Add py.typed file to signify that the library is typed See the relevant PEP https://peps.python.org/pep-0561 * Prepare `pyarrow-stubs` for history merging MINOR: [Python] Prepare `pyarrow-stubs` for history merging Co-authored-by: ZhengYu, Xu * Add `ty` configuration and suppress error codes * One line per rule * Add licence header from original repo for all `.pyi` files * Revert "Add licence header from original repo for all `.pyi` files" This reverts commit 1631f3916479ce9e1fd7df1194f61cb420962fd5. * Prepare for licence merging * Exclude `stubs` from `rat` test * Add Apache licence clause to `py.typed` * Reduce list * Add `ty` as a step in the action * Run in the correct directory * Remove `check` from `pip` * Fix `unresolved-reference` error * Revert "Fix `unresolved-reference` error" This reverts commit 7ee3d2f04b2a01a2f1dacccc6d0995f48a3d374d. * Fix invalid-type-form errors --------- Co-authored-by: Patrick J. Roddy --- python/stubs/__lib_pxi/array.pyi | 4 ++-- python/stubs/__lib_pxi/io.pyi | 3 ++- python/stubs/__lib_pxi/table.pyi | 3 ++- python/stubs/compute.pyi | 3 ++- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/python/stubs/__lib_pxi/array.pyi b/python/stubs/__lib_pxi/array.pyi index ec1cda30a88..17eb4c6d888 100644 --- a/python/stubs/__lib_pxi/array.pyi +++ b/python/stubs/__lib_pxi/array.pyi @@ -1,3 +1,4 @@ +import builtins import datetime as dt import sys @@ -1990,8 +1991,7 @@ class Array(_PandasConvertible[pd.Series], Generic[_Scalar_co]): @overload def __getitem__(self, key: int) -> _Scalar_co: ... @overload - def __getitem__(self, key: slice) -> Self: ... - def __getitem__(self, key): + def __getitem__(self, key: builtins.slice) -> Self: ... """ Slice or return value at given index diff --git a/python/stubs/__lib_pxi/io.pyi b/python/stubs/__lib_pxi/io.pyi index d882fd79d57..37c8aefb06b 100644 --- a/python/stubs/__lib_pxi/io.pyi +++ b/python/stubs/__lib_pxi/io.pyi @@ -1,3 +1,4 @@ +import builtins import sys from collections.abc import Callable @@ -578,7 +579,7 @@ class Buffer(_Weakrefable): @property def parent(self) -> Buffer | None: ... @overload - def __getitem__(self, key: slice) -> Self: ... + def __getitem__(self, key: builtins.slice) -> Self: ... @overload def __getitem__(self, key: int) -> int: ... 
def slice(self, offset: int = 0, length: int | None = None) -> Self: diff --git a/python/stubs/__lib_pxi/table.pyi b/python/stubs/__lib_pxi/table.pyi index ad9d0392137..ad34e9b6dff 100644 --- a/python/stubs/__lib_pxi/table.pyi +++ b/python/stubs/__lib_pxi/table.pyi @@ -1,3 +1,4 @@ +import builtins import datetime as dt import sys @@ -294,7 +295,7 @@ class ChunkedArray(_PandasConvertible[pd.Series], Generic[_Scalar_co]): """ def __sizeof__(self) -> int: ... @overload - def __getitem__(self, key: slice) -> Self: ... + def __getitem__(self, key: builtins.slice) -> Self: ... @overload def __getitem__(self, key: int) -> _Scalar_co: ... def __getitem__(self, key): diff --git a/python/stubs/compute.pyi b/python/stubs/compute.pyi index 8d8fc35b134..f9039731ee6 100644 --- a/python/stubs/compute.pyi +++ b/python/stubs/compute.pyi @@ -93,6 +93,7 @@ from . import lib _P = ParamSpec("_P") _R = TypeVar("_R") +_CallableType = Callable[_P, _R] def field(*name_or_index: str | tuple[str, ...] | int) -> Expression: """Reference a column of the dataset. @@ -156,7 +157,7 @@ def scalar(value: bool | float | str) -> Expression: An Expression representing the scalar value """ -def _clone_signature(f: Callable[_P, _R]) -> Callable[_P, _R]: ... +def _clone_signature(f: _CallableType) -> _CallableType: ... # ============= compute functions ============= _DataTypeT = TypeVar("_DataTypeT", bound=lib.DataType) From 8298c76f2a9c4b8e74407ba2f32d8cdd0f981943 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 24 Jul 2025 19:41:32 +0200 Subject: [PATCH 04/32] GH-9: [Python] Fix non-subscriptable error (#32) * fix: The type parameter of array should be covariant (#253) * release 20.0.0.20250716 (#254) * Add py.typed file to signify that the library is typed See the relevant PEP https://peps.python.org/pep-0561 * Prepare `pyarrow-stubs` for history merging MINOR: [Python] Prepare `pyarrow-stubs` for history merging Co-authored-by: ZhengYu, Xu * Add `ty` configuration and suppress error codes * One line per rule * Add licence header from original repo for all `.pyi` files * Revert "Add licence header from original repo for all `.pyi` files" This reverts commit 1631f3916479ce9e1fd7df1194f61cb420962fd5. * Prepare for licence merging * Exclude `stubs` from `rat` test * Add Apache licence clause to `py.typed` * Reduce list * Add `ty` as a step in the action * Run in the correct directory * Remove `check` from `pip` * Fix `unresolved-reference` error * Revert "Fix `unresolved-reference` error" This reverts commit 7ee3d2f04b2a01a2f1dacccc6d0995f48a3d374d. * Fix non-subscriptable issues --------- Co-authored-by: Patrick J. 
Roddy --- python/pyarrow/pandas_compat.py | 4 ++-- python/pyarrow/tests/test_cuda_numba_interop.py | 12 ++++++------ python/pyarrow/tests/test_gdb.py | 2 +- python/stubs/__lib_pxi/array.pyi | 1 + 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index 5e2ee49437e..bb54c3b22c3 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -755,8 +755,8 @@ def _reconstruct_block(item, columns=None, extension_columns=None, return_block= # create ExtensionBlock arr = item['py_array'] assert len(placement) == 1 - name = columns[placement[0]] - pandas_dtype = extension_columns[name] + name = columns.get(placement[0], None) + pandas_dtype = extension_columns.get(name, None) if not hasattr(pandas_dtype, '__from_arrow__'): raise ValueError("This column does not support to be converted " "to a pandas ExtensionArray") diff --git a/python/pyarrow/tests/test_cuda_numba_interop.py b/python/pyarrow/tests/test_cuda_numba_interop.py index 876f3c7f761..3bd81d755f5 100644 --- a/python/pyarrow/tests/test_cuda_numba_interop.py +++ b/python/pyarrow/tests/test_cuda_numba_interop.py @@ -49,7 +49,7 @@ def teardown_module(module): @pytest.mark.parametrize("c", range(len(context_choice_ids)), ids=context_choice_ids) def test_context(c): - ctx, nb_ctx = context_choices[c] + ctx, nb_ctx = context_choices.get(c, (None, None)) assert ctx.handle == nb_ctx.handle.value assert ctx.handle == ctx.to_numba().handle.value ctx2 = cuda.Context.from_numba(nb_ctx) @@ -83,7 +83,7 @@ def make_random_buffer(size, target='host', dtype='uint8', ctx=None): @pytest.mark.parametrize("dtype", dtypes, ids=dtypes) @pytest.mark.parametrize("size", [0, 1, 8, 1000]) def test_from_object(c, dtype, size): - ctx, nb_ctx = context_choices[c] + ctx, nb_ctx = context_choices.get(c, (None, None)) arr, cbuf = make_random_buffer(size, target='device', dtype=dtype, ctx=ctx) # Creating device buffer from numba DeviceNDArray: @@ -161,7 +161,7 @@ def __cuda_array_interface__(self): ids=context_choice_ids) @pytest.mark.parametrize("dtype", dtypes, ids=dtypes) def test_numba_memalloc(c, dtype): - ctx, nb_ctx = context_choices[c] + ctx, nb_ctx = context_choices.get(c, (None, None)) dtype = np.dtype(dtype) # Allocate memory using numba context # Warning: this will not be reflected in pyarrow context manager @@ -184,7 +184,7 @@ def test_numba_memalloc(c, dtype): ids=context_choice_ids) @pytest.mark.parametrize("dtype", dtypes, ids=dtypes) def test_pyarrow_memalloc(c, dtype): - ctx, nb_ctx = context_choices[c] + ctx, nb_ctx = context_choices.get(c, (None, None)) size = 10 arr, cbuf = make_random_buffer(size, target='device', dtype=dtype, ctx=ctx) @@ -198,7 +198,7 @@ def test_pyarrow_memalloc(c, dtype): ids=context_choice_ids) @pytest.mark.parametrize("dtype", dtypes, ids=dtypes) def test_numba_context(c, dtype): - ctx, nb_ctx = context_choices[c] + ctx, nb_ctx = context_choices.get(c, (None, None)) size = 10 with nb_cuda.gpus[0]: arr, cbuf = make_random_buffer(size, target='device', @@ -217,7 +217,7 @@ def test_numba_context(c, dtype): ids=context_choice_ids) @pytest.mark.parametrize("dtype", dtypes, ids=dtypes) def test_pyarrow_jit(c, dtype): - ctx, nb_ctx = context_choices[c] + ctx, nb_ctx = context_choices.get(c, (None, None)) @nb_cuda.jit def increment_by_one(an_array): diff --git a/python/pyarrow/tests/test_gdb.py b/python/pyarrow/tests/test_gdb.py index 912953ae60d..58aabb7368e 100644 --- a/python/pyarrow/tests/test_gdb.py +++ 
b/python/pyarrow/tests/test_gdb.py @@ -159,7 +159,7 @@ def select_frame(self, func_name): if m is None: pytest.fail(f"Could not select frame for function {func_name}") - frame_num = int(m[1]) + frame_num = int(m.get(1, None)) out = self.run_command(f"frame {frame_num}") assert f"in {func_name}" in out diff --git a/python/stubs/__lib_pxi/array.pyi b/python/stubs/__lib_pxi/array.pyi index 17eb4c6d888..ffdb8a9c075 100644 --- a/python/stubs/__lib_pxi/array.pyi +++ b/python/stubs/__lib_pxi/array.pyi @@ -1992,6 +1992,7 @@ class Array(_PandasConvertible[pd.Series], Generic[_Scalar_co]): def __getitem__(self, key: int) -> _Scalar_co: ... @overload def __getitem__(self, key: builtins.slice) -> Self: ... + def __getitem__(self, key): """ Slice or return value at given index From 42c73d5f4b47253c422f5bef3926ded7721fed2d Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 24 Jul 2025 19:55:29 +0200 Subject: [PATCH 05/32] GH-11: [Python] Fix no-matching-overload error (#34) * Add py.typed file to signify that the library is typed See the relevant PEP https://peps.python.org/pep-0561 * Prepare `pyarrow-stubs` for history merging MINOR: [Python] Prepare `pyarrow-stubs` for history merging Co-authored-by: ZhengYu, Xu * Add `ty` configuration and suppress error codes * One line per rule * Add licence header from original repo for all `.pyi` files * Revert "Add licence header from original repo for all `.pyi` files" This reverts commit 1631f3916479ce9e1fd7df1194f61cb420962fd5. * Prepare for licence merging * Exclude `stubs` from `rat` test * Add Apache licence clause to `py.typed` * Reduce list * Add `ty` as a step in the action * Run in the correct directory * Remove `check` from `pip` * Fix `unresolved-reference` error * Revert "Fix `unresolved-reference` error" This reverts commit 7ee3d2f04b2a01a2f1dacccc6d0995f48a3d374d. * Fix no-matching-overload --------- Co-authored-by: Patrick J. 
Roddy --- python/pyarrow/tests/test_array.py | 3 ++- python/pyarrow/tests/test_compute.py | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 009ab1e849b..9a5044ce394 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -550,7 +550,8 @@ def test_arange(): for case in cases: result = pa.arange(*case) result.validate(full=True) - assert result.equals(pa.array(list(range(*case)), type=pa.int64())) + + assert result.equals(pa.array(list(range(*case)), type=pa.int64())) # type: ignore[no-matching-overload] # Validate memory_pool keyword argument result = pa.arange(-1, 101, memory_pool=pa.default_memory_pool()) diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index ad61dbc48a7..97f694df1fc 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -1737,10 +1737,10 @@ def test_arithmetic_multiply(): @pytest.mark.parametrize("ty", ["round", "round_to_multiple"]) def test_round_to_integer(ty): if ty == "round": - round = pc.round + round_func = pc.round RoundOptions = partial(pc.RoundOptions, ndigits=0) elif ty == "round_to_multiple": - round = pc.round_to_multiple + round_func = pc.round_to_multiple RoundOptions = partial(pc.RoundToMultipleOptions, multiple=1) values = [3.2, 3.5, 3.7, 4.5, -3.2, -3.5, -3.7, None] @@ -1758,7 +1758,7 @@ def test_round_to_integer(ty): } for round_mode, expected in rmode_and_expected.items(): options = RoundOptions(round_mode=round_mode) - result = round(values, options=options) + result = round_func(values, options=options) expected_array = pa.array(expected, type=pa.float64()) assert expected_array.equals(result) From 053fdbdbbde29a33bae3036605e8c1e79d9590cd Mon Sep 17 00:00:00 2001 From: "Patrick J. Roddy" Date: Thu, 24 Jul 2025 18:58:20 +0100 Subject: [PATCH 06/32] GH-12: [Python] Fix `invalid-return-type` error (#25) * Add py.typed file to signify that the library is typed See the relevant PEP https://peps.python.org/pep-0561 * Prepare `pyarrow-stubs` for history merging MINOR: [Python] Prepare `pyarrow-stubs` for history merging Co-authored-by: ZhengYu, Xu * Add `ty` configuration and suppress error codes * One line per rule * Add licence header from original repo for all `.pyi` files * Revert "Add licence header from original repo for all `.pyi` files" This reverts commit 1631f3916479ce9e1fd7df1194f61cb420962fd5. * Prepare for licence merging * Exclude `stubs` from `rat` test * Add Apache licence clause to `py.typed` * Reduce list * Add `ty` as a step in the action * Run in the correct directory * Remove `check` from `pip` * Fix `unresolved-reference` error * Revert "Fix `unresolved-reference` error" This reverts commit 7ee3d2f04b2a01a2f1dacccc6d0995f48a3d374d. 
* Fix `invalid-return-type` error * Fix linting --------- Co-authored-by: Rok Mihevc --- python/pyarrow/interchange/column.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/python/pyarrow/interchange/column.py b/python/pyarrow/interchange/column.py index ddbceabcb00..f80c586ff95 100644 --- a/python/pyarrow/interchange/column.py +++ b/python/pyarrow/interchange/column.py @@ -20,7 +20,6 @@ import enum from typing import ( Any, - Dict, Iterable, Optional, Tuple, @@ -379,7 +378,7 @@ def describe_null(self) -> Tuple[ColumnNullType, Any]: return ColumnNullType.USE_BITMASK, 0 @property - def null_count(self) -> int: + def null_count(self) -> int | None: """ Number of null elements, if known. @@ -390,7 +389,7 @@ def null_count(self) -> int: return n @property - def metadata(self) -> Dict[str, Any]: + def metadata(self) -> None: """ The metadata for the column. See `DataFrame.metadata` for more details. """ @@ -466,7 +465,7 @@ def get_buffers(self) -> ColumnBuffers: def _get_data_buffer( self, - ) -> Tuple[_PyArrowBuffer, Any]: # Any is for self.dtype tuple + ) -> Tuple[_PyArrowBuffer, Any] | None: # Any is for self.dtype tuple """ Return the buffer containing the data and the buffer's associated dtype. @@ -505,7 +504,7 @@ def _get_validity_buffer(self) -> Tuple[_PyArrowBuffer, Any]: "There are no missing values so " "does not have a separate mask") - def _get_offsets_buffer(self) -> Tuple[_PyArrowBuffer, Any]: + def _get_offsets_buffer(self) -> Tuple[_PyArrowBuffer, Any] | None: """ Return the buffer containing the offset values for variable-size binary data (e.g., variable-length strings) and the buffer's associated dtype. From 298aac092d950cec7603cac3faf25e33b39a943f Mon Sep 17 00:00:00 2001 From: "Patrick J. Roddy" Date: Thu, 24 Jul 2025 19:05:54 +0100 Subject: [PATCH 07/32] GH-14: [Python] Fix `not-iterable` typing (#26) * Add py.typed file to signify that the library is typed See the relevant PEP https://peps.python.org/pep-0561 * Prepare `pyarrow-stubs` for history merging MINOR: [Python] Prepare `pyarrow-stubs` for history merging Co-authored-by: ZhengYu, Xu * Add `ty` configuration and suppress error codes * One line per rule * Add licence header from original repo for all `.pyi` files * Revert "Add licence header from original repo for all `.pyi` files" This reverts commit 1631f3916479ce9e1fd7df1194f61cb420962fd5. * Prepare for licence merging * Exclude `stubs` from `rat` test * Add Apache licence clause to `py.typed` * Reduce list * Add `ty` as a step in the action * Run in the correct directory * Remove `check` from `pip` * Fix `unresolved-reference` error * Revert "Fix `unresolved-reference` error" This reverts commit 7ee3d2f04b2a01a2f1dacccc6d0995f48a3d374d. 
* Fix `not-iterable` error --------- Co-authored-by: Rok Mihevc --- python/pyarrow/interchange/column.py | 8 +++++++- python/pyarrow/tests/test_feather.py | 11 ++++++----- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/python/pyarrow/interchange/column.py b/python/pyarrow/interchange/column.py index f80c586ff95..2ecc690b80e 100644 --- a/python/pyarrow/interchange/column.py +++ b/python/pyarrow/interchange/column.py @@ -313,7 +313,13 @@ def _dtype_from_arrowdtype( kind = DtypeKind.CATEGORICAL arr = self._col indices_dtype = arr.indices.type - _, f_string = _PYARROW_KINDS.get(indices_dtype) + mapping = _PYARROW_KINDS.get(indices_dtype) + if mapping is None: + raise ValueError( + f"Dictionary index data type {indices_dtype} " + "not supported by interchange protocol" + ) + _, f_string = mapping return kind, bit_width, f_string, Endianness.NATIVE else: kind, f_string = _PYARROW_KINDS.get(dtype, (None, None)) diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py index 054bf920b26..d2b59fddebb 100644 --- a/python/pyarrow/tests/test_feather.py +++ b/python/pyarrow/tests/test_feather.py @@ -72,11 +72,12 @@ def setup_module(module): def teardown_module(module): - for path in TEST_FILES: - try: - os.remove(path) - except os.error: - pass + if TEST_FILES is not None: + for path in TEST_FILES: + try: + os.remove(path) + except os.error: + pass @pytest.mark.pandas From 778e77bfcca8ed59e5e5c195a5ec4d6984b87038 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 24 Jul 2025 20:08:50 +0200 Subject: [PATCH 08/32] GH-15: [Python] Fix possibly-unbound-attribute (#38) * Add py.typed file to signify that the library is typed See the relevant PEP https://peps.python.org/pep-0561 * Prepare `pyarrow-stubs` for history merging MINOR: [Python] Prepare `pyarrow-stubs` for history merging Co-authored-by: ZhengYu, Xu * Add `ty` configuration and suppress error codes * One line per rule * Add licence header from original repo for all `.pyi` files * Revert "Add licence header from original repo for all `.pyi` files" This reverts commit 1631f3916479ce9e1fd7df1194f61cb420962fd5. * Prepare for licence merging * Exclude `stubs` from `rat` test * Add Apache licence clause to `py.typed` * Reduce list * Add `ty` as a step in the action * Run in the correct directory * Remove `check` from `pip` * Fix `unresolved-reference` error * Revert "Fix `unresolved-reference` error" This reverts commit 7ee3d2f04b2a01a2f1dacccc6d0995f48a3d374d. * Fix possibly-unbound-attribute --------- Co-authored-by: Patrick J. 
Roddy --- python/setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/setup.py b/python/setup.py index 88119e2d2aa..504c78d61bb 100755 --- a/python/setup.py +++ b/python/setup.py @@ -44,7 +44,7 @@ # as here it may be set to the host not target platform is_emscripten = ( sysconfig.get_config_var("SOABI") - and sysconfig.get_config_var("SOABI").find("emscripten") != -1 + and sysconfig.get_config_var("SOABI").find("emscripten") != -1 # type: ignore[possibly-unbound] ) @@ -254,7 +254,7 @@ def _run_cmake(self): if os.path.isfile('CMakeCache.txt'): cachefile = open('CMakeCache.txt', 'r') cachedir = re.search('CMAKE_CACHEFILE_DIR:INTERNAL=(.*)', - cachefile.read()).group(1) + cachefile.read()).group(1) # type: ignore[possibly-unbound-attribute] cachefile.close() if (cachedir != build_temp): build_base = pjoin(saved_cwd, build_cmd.build_base) From 004697b626fe18447fb35fc91bc5f730b32d3083 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 24 Jul 2025 20:11:14 +0200 Subject: [PATCH 09/32] GH-16: [Python] Fix too-many-positional-arguments (#35) * Add py.typed file to signify that the library is typed See the relevant PEP https://peps.python.org/pep-0561 * Prepare `pyarrow-stubs` for history merging MINOR: [Python] Prepare `pyarrow-stubs` for history merging Co-authored-by: ZhengYu, Xu * Add `ty` configuration and suppress error codes * One line per rule * Add licence header from original repo for all `.pyi` files * Revert "Add licence header from original repo for all `.pyi` files" This reverts commit 1631f3916479ce9e1fd7df1194f61cb420962fd5. * Prepare for licence merging * Exclude `stubs` from `rat` test * Add Apache licence clause to `py.typed` * Reduce list * Add `ty` as a step in the action * Run in the correct directory * Remove `check` from `pip` * Fix `unresolved-reference` error * Revert "Fix `unresolved-reference` error" This reverts commit 7ee3d2f04b2a01a2f1dacccc6d0995f48a3d374d. * Fix too-many-positional-arguments --------- Co-authored-by: Patrick J. 
Roddy --- python/pyarrow/tests/test_flight.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/tests/test_flight.py b/python/pyarrow/tests/test_flight.py index 42de960ac04..91a5aa865db 100644 --- a/python/pyarrow/tests/test_flight.py +++ b/python/pyarrow/tests/test_flight.py @@ -56,7 +56,15 @@ def __exit__(self, exc_type, exc_value, traceback): pass flight = None - FlightClient, FlightServerBase = context_like, context_like + class MockContextManager: + def __enter__(self): + return self + def __exit__(self, exc_type, exc_val, exc_tb): + pass + class FlightServerBase(MockContextManager): + pass + class FlightClient(MockContextManager): + pass ServerAuthHandler, ClientAuthHandler = object, object ServerMiddleware, ServerMiddlewareFactory = object, object ClientMiddleware, ClientMiddlewareFactory = object, object From f31bab8d039ae0364fec89ed2f74cb6f5d7ee9b4 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 24 Jul 2025 20:13:44 +0200 Subject: [PATCH 10/32] GH-17: [Python] Fix unknown-argument (#36) * Add py.typed file to signify that the library is typed See the relevant PEP https://peps.python.org/pep-0561 * Prepare `pyarrow-stubs` for history merging MINOR: [Python] Prepare `pyarrow-stubs` for history merging Co-authored-by: ZhengYu, Xu * Add `ty` configuration and suppress error codes * One line per rule * Add licence header from original repo for all `.pyi` files * Revert "Add licence header from original repo for all `.pyi` files" This reverts commit 1631f3916479ce9e1fd7df1194f61cb420962fd5. * Prepare for licence merging * Exclude `stubs` from `rat` test * Add Apache licence clause to `py.typed` * Reduce list * Add `ty` as a step in the action * Run in the correct directory * Remove `check` from `pip` * Fix `unresolved-reference` error * Revert "Fix `unresolved-reference` error" This reverts commit 7ee3d2f04b2a01a2f1dacccc6d0995f48a3d374d. * Fix unknown-argument --------- Co-authored-by: Patrick J. Roddy --- python/pyarrow/tests/test_flight.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/pyarrow/tests/test_flight.py b/python/pyarrow/tests/test_flight.py index 91a5aa865db..a5746b462e2 100644 --- a/python/pyarrow/tests/test_flight.py +++ b/python/pyarrow/tests/test_flight.py @@ -57,6 +57,8 @@ def __exit__(self, exc_type, exc_value, traceback): flight = None class MockContextManager: + def __init__(self, *args, **kwargs): + pass def __enter__(self): return self def __exit__(self, exc_type, exc_val, exc_tb): From 6ab7643154bf7c8294f69412b0d058cb689631ba Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 24 Jul 2025 20:20:31 +0200 Subject: [PATCH 11/32] GH-18: [Python] Fix unresolved-attribute (#37) * Add py.typed file to signify that the library is typed See the relevant PEP https://peps.python.org/pep-0561 * Prepare `pyarrow-stubs` for history merging MINOR: [Python] Prepare `pyarrow-stubs` for history merging Co-authored-by: ZhengYu, Xu * Add `ty` configuration and suppress error codes * One line per rule * Add licence header from original repo for all `.pyi` files * Revert "Add licence header from original repo for all `.pyi` files" This reverts commit 1631f3916479ce9e1fd7df1194f61cb420962fd5. 
* Prepare for licence merging * Exclude `stubs` from `rat` test * Add Apache licence clause to `py.typed` * Reduce list * Add `ty` as a step in the action * Run in the correct directory * Remove `check` from `pip` * Fix `unresolved-reference` error * Revert "Fix `unresolved-reference` error" This reverts commit 7ee3d2f04b2a01a2f1dacccc6d0995f48a3d374d. * Fix unresolved-attribute --------- Co-authored-by: Patrick J. Roddy --- python/pyarrow/cffi.py | 4 +-- python/pyarrow/pandas_compat.py | 8 +++--- python/pyarrow/parquet/core.py | 4 +-- python/pyarrow/tests/parquet/test_pandas.py | 4 +-- .../tests/parquet/test_parquet_file.py | 2 +- python/pyarrow/tests/test_cython.py | 4 +-- python/pyarrow/tests/test_extension_type.py | 4 +-- python/pyarrow/tests/test_flight.py | 25 ++++++++++++----- python/pyarrow/tests/test_json.py | 4 +-- python/pyarrow/tests/test_jvm.py | 12 ++++----- python/pyarrow/tests/test_pandas.py | 27 ++++++++++--------- python/pyarrow/vendored/docscrape.py | 6 ++++- python/scripts/run_emscripten_tests.py | 2 +- python/setup.py | 2 +- python/stubs/__lib_pxi/pandas_shim.pyi | 22 +++++++-------- python/stubs/cffi.pyi | 4 +-- 16 files changed, 76 insertions(+), 58 deletions(-) diff --git a/python/pyarrow/cffi.py b/python/pyarrow/cffi.py index 1da1a916914..3f5e748daf4 100644 --- a/python/pyarrow/cffi.py +++ b/python/pyarrow/cffi.py @@ -17,7 +17,7 @@ from __future__ import absolute_import -import cffi +from cffi import FFI c_source = """ struct ArrowSchema { @@ -77,5 +77,5 @@ """ # TODO use out-of-line mode for faster import and avoid C parsing -ffi = cffi.FFI() +ffi = FFI() ffi.cdef(c_source) diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index bb54c3b22c3..5a5e7b10f28 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -25,7 +25,7 @@ from copy import deepcopy import decimal from itertools import zip_longest -import json +from json import dumps as json_dumps import operator import re import warnings @@ -276,7 +276,7 @@ def construct_metadata(columns_to_convert, df, column_names, index_levels, index_descriptors = index_column_metadata = column_indexes = [] return { - b'pandas': json.dumps({ + b'pandas': json_dumps({ 'index_columns': index_descriptors, 'column_indexes': column_indexes, 'columns': column_metadata + index_column_metadata, @@ -511,7 +511,7 @@ def _get_index_level(df, name): def _level_name(name): # preserve type when default serializable, otherwise str it try: - json.dumps(name) + json_dumps(name) return name except TypeError: return str(name) @@ -826,7 +826,7 @@ def table_to_dataframe( axes = [columns, index] mgr = BlockManager(blocks, axes) if _pandas_api.is_ge_v21(): - df = DataFrame._from_mgr(mgr, mgr.axes) + df = DataFrame._from_mgr(mgr, mgr.axes) # type: ignore[unresolved-attribute] else: df = DataFrame(mgr) return df diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index aaf15c20288..8c1a2ae7822 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -21,7 +21,7 @@ from functools import reduce import inspect -import json +from json import loads as json_loads import os import re import operator @@ -1192,7 +1192,7 @@ def add_key_value_metadata(self, key_value_metadata): def _get_pandas_index_columns(keyvalues): - return (json.loads(keyvalues[b'pandas'].decode('utf8')) + return (json_loads(keyvalues[b'pandas'].decode('utf8')) ['index_columns']) diff --git a/python/pyarrow/tests/parquet/test_pandas.py b/python/pyarrow/tests/parquet/test_pandas.py 
index 703232b7cac..7f647883561 100644 --- a/python/pyarrow/tests/parquet/test_pandas.py +++ b/python/pyarrow/tests/parquet/test_pandas.py @@ -16,7 +16,7 @@ # under the License. import io -import json +from json import loads as json_loads try: import numpy as np @@ -65,7 +65,7 @@ def test_pandas_parquet_custom_metadata(tempdir): metadata = pq.read_metadata(filename).metadata assert b'pandas' in metadata - js = json.loads(metadata[b'pandas'].decode('utf8')) + js = json_loads(metadata[b'pandas'].decode('utf8')) assert js['index_columns'] == [{'kind': 'range', 'name': None, 'start': 0, 'stop': 10000, diff --git a/python/pyarrow/tests/parquet/test_parquet_file.py b/python/pyarrow/tests/parquet/test_parquet_file.py index 24ffe612ef7..aef0954eacd 100644 --- a/python/pyarrow/tests/parquet/test_parquet_file.py +++ b/python/pyarrow/tests/parquet/test_parquet_file.py @@ -408,7 +408,7 @@ def test_parquet_file_hugginface_support(): pytest.skip("fsspec is not installed, skipping Hugging Face test") fake_hf_module = types.ModuleType("huggingface_hub") - fake_hf_module.HfFileSystem = MemoryFileSystem + fake_hf_module.HfFileSystem = MemoryFileSystem # type: ignore[unresolved-attribute] with mock.patch.dict("sys.modules", {"huggingface_hub": fake_hf_module}): uri = "hf://datasets/apache/arrow/test.parquet" table = pa.table({"a": range(10)}) diff --git a/python/pyarrow/tests/test_cython.py b/python/pyarrow/tests/test_cython.py index e0116a4bb76..fdacb16be29 100644 --- a/python/pyarrow/tests/test_cython.py +++ b/python/pyarrow/tests/test_cython.py @@ -191,7 +191,7 @@ def test_visit_strings(tmpdir): strings = ['a', 'b', 'c'] visited = [] - mod._visit_strings(strings, visited.append) + mod._visit_strings(strings, visited.append) # type: ignore[unresolved-attribute] assert visited == strings @@ -200,4 +200,4 @@ def raise_on_b(s): if s == 'b': raise ValueError('wtf') - mod._visit_strings(strings, raise_on_b) + mod._visit_strings(strings, raise_on_b) # type: ignore[unresolved-attribute] diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index ebac37e862b..ea1c0afd7ff 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1353,11 +1353,11 @@ def test_cpp_extension_in_python(tmpdir): sys.path.insert(0, str(tmpdir)) mod = __import__('extensions') - uuid_type = mod._make_uuid_type() + uuid_type = mod._make_uuid_type() # type: ignore[unresolved-attribute] assert uuid_type.extension_name == "example-uuid" assert uuid_type.storage_type == pa.binary(16) - array = mod._make_uuid_array() + array = mod._make_uuid_array() # type: ignore[unresolved-attribute] assert array.type == uuid_type assert array.to_pylist() == [b'abcdefghijklmno0', b'0onmlkjihgfedcba'] assert array[0].as_py() == b'abcdefghijklmno0' diff --git a/python/pyarrow/tests/test_flight.py b/python/pyarrow/tests/test_flight.py index a5746b462e2..a3364ef05b8 100644 --- a/python/pyarrow/tests/test_flight.py +++ b/python/pyarrow/tests/test_flight.py @@ -26,7 +26,8 @@ import threading import time import traceback -import json +from json import dumps as json_dumps +from json import dumps as json_loads from datetime import datetime try: @@ -64,9 +65,19 @@ def __enter__(self): def __exit__(self, exc_type, exc_val, exc_tb): pass class FlightServerBase(MockContextManager): - pass + def serve(self): + pass class FlightClient(MockContextManager): - pass + def get_flight_info(self, **kwargs): + pass + def do_action(self, **kwargs): + pass + def do_get(self, **kwargs): 
+ pass + def do_put(self, **kwargs): + pass + def close(self): + pass ServerAuthHandler, ClientAuthHandler = object, object ServerMiddleware, ServerMiddlewareFactory = object, object ClientMiddleware, ClientMiddlewareFactory = object, object @@ -332,7 +343,7 @@ class InvalidStreamFlightServer(FlightServerBase): def do_get(self, context, ticket): data1 = [pa.array([-10, -5, 0, 5, 10], type=pa.int32())] data2 = [pa.array([-10.0, -5.0, 0.0, 5.0, 10.0], type=pa.float64())] - assert data1.type != data2.type + assert data1[0].type != data2[0].type table1 = pa.Table.from_arrays(data1, names=['a']) table2 = pa.Table.from_arrays(data2, names=['a']) assert table1.schema == self.schema @@ -1759,7 +1770,7 @@ def test_flight_do_put_limit(): with pytest.raises(flight.FlightWriteSizeExceededError, match="exceeded soft limit") as excinfo: writer.write_batch(large_batch) - assert excinfo.value.limit == 4096 + assert excinfo.value.limit == 4096 # type: ignore[unresolved-attribute] smaller_batches = [ large_batch.slice(0, 384), large_batch.slice(384), @@ -2373,7 +2384,7 @@ class ActionNoneFlightServer(EchoFlightServer): def do_action(self, context, action): if action.type == "get_value": - return [json.dumps(self.VALUES).encode('utf-8')] + return [json_dumps(self.VALUES).encode('utf-8')] elif action.type == "append": self.VALUES.append(True) return None @@ -2390,7 +2401,7 @@ def test_none_action_side_effect(): FlightClient(('localhost', server.port)) as client: client.do_action(flight.Action("append", b"")) r = client.do_action(flight.Action("get_value", b"")) - assert json.loads(next(r).body.to_pybytes()) == [True] + assert json_loads(next(r).body.to_pybytes()) == [True] @pytest.mark.slow # Takes a while for gRPC to "realize" writes fail diff --git a/python/pyarrow/tests/test_json.py b/python/pyarrow/tests/test_json.py index c3f9fe333bd..68ac40063c9 100644 --- a/python/pyarrow/tests/test_json.py +++ b/python/pyarrow/tests/test_json.py @@ -20,7 +20,7 @@ from decimal import Decimal import io import itertools -import json +from json import dumps as json_dumps import string import unittest @@ -49,7 +49,7 @@ def make_random_json(num_cols=2, num_rows=10, linesep='\r\n'): lines = [] for row in arr.T: json_obj = OrderedDict([(k, int(v)) for (k, v) in zip(col_names, row)]) - lines.append(json.dumps(json_obj)) + lines.append(json_dumps(json_obj)) data = linesep.join(lines).encode() columns = [pa.array(col, type=pa.int64()) for col in arr] expected = pa.Table.from_arrays(columns, col_names) diff --git a/python/pyarrow/tests/test_jvm.py b/python/pyarrow/tests/test_jvm.py index d2ba780efc7..d71380b8666 100644 --- a/python/pyarrow/tests/test_jvm.py +++ b/python/pyarrow/tests/test_jvm.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. 
-import json +from json import dumps as json_dumps import os import pyarrow as pa import pyarrow.jvm as pa_jvm @@ -175,23 +175,23 @@ def test_jvm_types(root_allocator, pa_type, jvm_spec, nullable): # TODO: This needs to be set for complex types 'children': [] } - jvm_field = _jvm_field(json.dumps(spec)) + jvm_field = _jvm_field(json_dumps(spec)) result = pa_jvm.field(jvm_field) expected_field = pa.field('field_name', pa_type, nullable=nullable) assert result == expected_field - jvm_schema = _jvm_schema(json.dumps(spec)) + jvm_schema = _jvm_schema(json_dumps(spec)) result = pa_jvm.schema(jvm_schema) assert result == pa.schema([expected_field]) # Schema with custom metadata - jvm_schema = _jvm_schema(json.dumps(spec), {'meta': 'data'}) + jvm_schema = _jvm_schema(json_dumps(spec), {'meta': 'data'}) result = pa_jvm.schema(jvm_schema) assert result == pa.schema([expected_field], {'meta': 'data'}) # Schema with custom field metadata spec['metadata'] = [{'key': 'field meta', 'value': 'field data'}] - jvm_schema = _jvm_schema(json.dumps(spec)) + jvm_schema = _jvm_schema(json_dumps(spec)) result = pa_jvm.schema(jvm_schema) expected_field = expected_field.with_metadata( {'field meta': 'field data'}) @@ -379,7 +379,7 @@ def test_jvm_record_batch(root_allocator, pa_type, py_data, jvm_type, # TODO: This needs to be set for complex types 'children': [] } - jvm_field = _jvm_field(json.dumps(spec)) + jvm_field = _jvm_field(json_dumps(spec)) # Create VectorSchemaRoot jvm_fields = jpype.JClass('java.util.ArrayList')() diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index ceea2527da0..f0bc4a31f34 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -17,7 +17,7 @@ import gc import decimal -import json +from json import dumps as json_dumps import multiprocessing as mp import sys import warnings @@ -3264,7 +3264,8 @@ def test_error_sparse(self): df = pd.DataFrame({'a': pd.arrays.SparseArray([1, np.nan, 3])}) except AttributeError: # pandas.arrays module introduced in pandas 0.24 - df = pd.DataFrame({'a': pd.SparseArray([1, np.nan, 3])}) + from pandas import SparseArray + df = pd.DataFrame({'a': SparseArray([1, np.nan, 3])}) with pytest.raises(TypeError, match="Sparse pandas data"): pa.Table.from_pandas(df) @@ -4422,11 +4423,12 @@ def test_convert_to_extension_array(monkeypatch): # monkeypatch pandas Int64Dtype to *not* have the protocol method if Version(pd.__version__) < Version("1.3.0.dev"): + from pandas.core import integer monkeypatch.delattr( - pd.core.arrays.integer._IntegerDtype, "__from_arrow__") + integer._IntegerDtype, "__from_arrow__") else: monkeypatch.delattr( - pd.core.arrays.integer.NumericDtype, "__from_arrow__") + pd.core.arrays.integer.NumericDtype, "__from_arrow__") # type: ignore[unresolved-attribute] # Int64Dtype has no __from_arrow__ -> use normal conversion result = table.to_pandas() assert len(_get_mgr(result).blocks) == 1 @@ -4467,11 +4469,12 @@ def test_conversion_extensiontype_to_extensionarray(monkeypatch): # monkeypatch pandas Int64Dtype to *not* have the protocol method # (remove the version added above and the actual version for recent pandas) if Version(pd.__version__) < Version("1.3.0.dev"): + from pandas.core import integer monkeypatch.delattr( - pd.core.arrays.integer._IntegerDtype, "__from_arrow__") + integer._IntegerDtype, "__from_arrow__") else: monkeypatch.delattr( - pd.core.arrays.integer.NumericDtype, "__from_arrow__") + pd.core.arrays.integer.NumericDtype, "__from_arrow__") # type: 
ignore[unresolved-attribute] result = arr.to_pandas() assert _get_mgr(result).blocks[0].values.dtype == np.dtype("int64") @@ -4650,7 +4653,7 @@ def test_metadata_compat_range_index_pre_0_12(): t1 = pa.Table.from_arrays([a_arrow, rng_index_arrow], names=['a', 'qux']) t1 = t1.replace_schema_metadata({ - b'pandas': json.dumps( + b'pandas': json_dumps( {'index_columns': ['qux'], 'column_indexes': [{'name': None, 'field_name': None, @@ -4679,7 +4682,7 @@ def test_metadata_compat_range_index_pre_0_12(): t2 = pa.Table.from_arrays([a_arrow, rng_index_arrow], names=['qux', gen_name_0]) t2 = t2.replace_schema_metadata({ - b'pandas': json.dumps( + b'pandas': json_dumps( {'index_columns': [gen_name_0], 'column_indexes': [{'name': None, 'field_name': None, @@ -4708,7 +4711,7 @@ def test_metadata_compat_range_index_pre_0_12(): t3 = pa.Table.from_arrays([a_arrow, rng_index_arrow], names=['a', gen_name_0]) t3 = t3.replace_schema_metadata({ - b'pandas': json.dumps( + b'pandas': json_dumps( {'index_columns': [gen_name_0], 'column_indexes': [{'name': None, 'field_name': None, @@ -4737,7 +4740,7 @@ def test_metadata_compat_range_index_pre_0_12(): t4 = pa.Table.from_arrays([a_arrow, rng_index_arrow, b_arrow], names=['a', 'qux', gen_name_1]) t4 = t4.replace_schema_metadata({ - b'pandas': json.dumps( + b'pandas': json_dumps( {'index_columns': ['qux', gen_name_1], 'column_indexes': [{'name': None, 'field_name': None, @@ -4771,7 +4774,7 @@ def test_metadata_compat_range_index_pre_0_12(): t5 = pa.Table.from_arrays([a_arrow, rng_index_arrow, b_arrow], names=['a', gen_name_0, gen_name_1]) t5 = t5.replace_schema_metadata({ - b'pandas': json.dumps( + b'pandas': json_dumps( {'index_columns': [gen_name_0, gen_name_1], 'column_indexes': [{'name': None, 'field_name': None, @@ -4818,7 +4821,7 @@ def test_metadata_compat_missing_field_name(): # metadata generated by fastparquet 0.3.2 with missing field_names table = table.replace_schema_metadata({ - b'pandas': json.dumps({ + b'pandas': json_dumps({ 'column_indexes': [ {'field_name': None, 'metadata': None, diff --git a/python/pyarrow/vendored/docscrape.py b/python/pyarrow/vendored/docscrape.py index 6c4d6e01400..096ef245243 100644 --- a/python/pyarrow/vendored/docscrape.py +++ b/python/pyarrow/vendored/docscrape.py @@ -105,6 +105,10 @@ def is_empty(self): class ParseError(Exception): + def __init__(self, *args, docstring=None, **kwargs): + self.__init__(*args, **kwargs) + self.docstring = docstring + def __str__(self): message = self.args[0] if hasattr(self, 'docstring'): @@ -153,7 +157,7 @@ def __init__(self, docstring, config=None): try: self._parse() except ParseError as e: - e.docstring = orig_docstring + e.docstring = orig_docstring # type: ignore[unresolved-attribute] raise def __getitem__(self, key): diff --git a/python/scripts/run_emscripten_tests.py b/python/scripts/run_emscripten_tests.py index 53d3dd52bd8..9b833525939 100644 --- a/python/scripts/run_emscripten_tests.py +++ b/python/scripts/run_emscripten_tests.py @@ -130,7 +130,7 @@ def launch_server(dist_dir): address = q.get(timeout=50) time.sleep(0.1) # wait to make sure server is started yield address - p.terminate() + p.join() class NodeDriver: diff --git a/python/setup.py b/python/setup.py index 504c78d61bb..4e87ecfbfcc 100755 --- a/python/setup.py +++ b/python/setup.py @@ -48,7 +48,7 @@ ) -if Cython.__version__ < '3': +if Cython.__version__ < '3': # type: ignore[unresolved-attribute] raise Exception( 'Please update your Cython version. 
Supported Cython >= 3') diff --git a/python/stubs/__lib_pxi/pandas_shim.pyi b/python/stubs/__lib_pxi/pandas_shim.pyi index 0e80fae4ebf..29a8485d062 100644 --- a/python/stubs/__lib_pxi/pandas_shim.pyi +++ b/python/stubs/__lib_pxi/pandas_shim.pyi @@ -1,7 +1,7 @@ from types import ModuleType from typing import Any, Iterable, TypeGuard -import pandas as pd +import pandas from numpy import dtype from pandas.core.dtypes.base import ExtensionDtype @@ -9,8 +9,8 @@ from pandas.core.dtypes.base import ExtensionDtype class _PandasAPIShim: has_sparse: bool - def series(self, *args, **kwargs) -> pd.Series: ... - def data_frame(self, *args, **kwargs) -> pd.DataFrame: ... + def series(self, *args, **kwargs) -> pandas.Series: ... + def data_frame(self, *args, **kwargs) -> pandas.DataFrame: ... @property def have_pandas(self) -> bool: ... @property @@ -28,21 +28,21 @@ class _PandasAPIShim: def is_ge_v23(self) -> bool: ... def is_ge_v3(self) -> bool: ... @property - def categorical_type(self) -> type[pd.Categorical]: ... + def categorical_type(self) -> type[pandas.Categorical]: ... @property - def datetimetz_type(self) -> type[pd.DatetimeTZDtype]: ... + def datetimetz_type(self) -> type[pandas.DatetimeTZDtype]: ... @property def extension_dtype(self) -> type[ExtensionDtype]: ... def is_array_like( self, obj: Any - ) -> TypeGuard[pd.Series | pd.Index | pd.Categorical | ExtensionDtype]: ... - def is_categorical(self, obj: Any) -> TypeGuard[pd.Categorical]: ... - def is_datetimetz(self, obj: Any) -> TypeGuard[pd.DatetimeTZDtype]: ... + ) -> TypeGuard[pandas.Series | pandas.Index | pandas.Categorical | ExtensionDtype]: ... + def is_categorical(self, obj: Any) -> TypeGuard[pandas.Categorical]: ... + def is_datetimetz(self, obj: Any) -> TypeGuard[pandas.DatetimeTZDtype]: ... def is_extension_array_dtype(self, obj: Any) -> TypeGuard[ExtensionDtype]: ... def is_sparse(self, obj: Any) -> bool: ... - def is_data_frame(self, obj: Any) -> TypeGuard[pd.DataFrame]: ... - def is_series(self, obj: Any) -> TypeGuard[pd.Series]: ... - def is_index(self, obj: Any) -> TypeGuard[pd.Index]: ... + def is_data_frame(self, obj: Any) -> TypeGuard[pandas.DataFrame]: ... + def is_series(self, obj: Any) -> TypeGuard[pandas.Series]: ... + def is_index(self, obj: Any) -> TypeGuard[pandas.Index]: ... def get_values(self, obj: Any) -> bool: ... def get_rangeindex_attribute(self, level, name): ... diff --git a/python/stubs/cffi.pyi b/python/stubs/cffi.pyi index 2ae945c5974..217b4b2ea44 100644 --- a/python/stubs/cffi.pyi +++ b/python/stubs/cffi.pyi @@ -1,4 +1,4 @@ -import cffi +from cffi import FFI c_source: str -ffi: cffi.FFI +ffi: FFI From 0585bf88056081650b191e2298df2278cc894d2d Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 24 Jul 2025 20:23:45 +0200 Subject: [PATCH 12/32] GH-19: [Python] Fix unresolved-global (#39) * Add py.typed file to signify that the library is typed See the relevant PEP https://peps.python.org/pep-0561 * Prepare `pyarrow-stubs` for history merging MINOR: [Python] Prepare `pyarrow-stubs` for history merging Co-authored-by: ZhengYu, Xu * Add `ty` configuration and suppress error codes * One line per rule * Add licence header from original repo for all `.pyi` files * Revert "Add licence header from original repo for all `.pyi` files" This reverts commit 1631f3916479ce9e1fd7df1194f61cb420962fd5. 
* Prepare for licence merging * Exclude `stubs` from `rat` test * Add Apache licence clause to `py.typed` * Reduce list * Add `ty` as a step in the action * Run in the correct directory * Remove `check` from `pip` * Fix `unresolved-reference` error * Revert "Fix `unresolved-reference` error" This reverts commit 7ee3d2f04b2a01a2f1dacccc6d0995f48a3d374d. * Fix unresolved-global --------- Co-authored-by: Patrick J. Roddy --- python/pyarrow/tests/test_flight.py | 2 +- python/scripts/run_emscripten_tests.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/tests/test_flight.py b/python/pyarrow/tests/test_flight.py index a3364ef05b8..5fe85ef4870 100644 --- a/python/pyarrow/tests/test_flight.py +++ b/python/pyarrow/tests/test_flight.py @@ -1120,7 +1120,7 @@ def test_client_wait_for_available(): server = None def serve(): - global server + nonlocal server time.sleep(0.5) server = FlightServerBase(location) server.serve() diff --git a/python/scripts/run_emscripten_tests.py b/python/scripts/run_emscripten_tests.py index 9b833525939..82846a65366 100644 --- a/python/scripts/run_emscripten_tests.py +++ b/python/scripts/run_emscripten_tests.py @@ -114,7 +114,7 @@ def end_headers(self): def run_server_thread(dist_dir, q): - global _SERVER_ADDRESS + global _SERVER_ADDRESS # type: ignore[unresolved-global] os.chdir(dist_dir) server = http.server.HTTPServer(("", 0), TemplateOverrider) q.put(server.server_address) From 68b9347b821bd4518215e8b7fbea33ec43b12b24 Mon Sep 17 00:00:00 2001 From: "Patrick J. Roddy" Date: Thu, 24 Jul 2025 19:29:00 +0100 Subject: [PATCH 13/32] GH-20: [Python] Fix `unsupported-reference` typing (#24) * Add py.typed file to signify that the library is typed See the relevant PEP https://peps.python.org/pep-0561 * Prepare `pyarrow-stubs` for history merging MINOR: [Python] Prepare `pyarrow-stubs` for history merging Co-authored-by: ZhengYu, Xu * Add `ty` configuration and suppress error codes * One line per rule * Add licence header from original repo for all `.pyi` files * Revert "Add licence header from original repo for all `.pyi` files" This reverts commit 1631f3916479ce9e1fd7df1194f61cb420962fd5. * Prepare for licence merging * Exclude `stubs` from `rat` test * Add Apache licence clause to `py.typed` * Reduce list * Add `ty` as a step in the action * Run in the correct directory * Remove `check` from `pip` * Fix `unresolved-reference` error * Revert "Fix `unresolved-reference` error" This reverts commit 7ee3d2f04b2a01a2f1dacccc6d0995f48a3d374d. * Fix `unresolved-reference` error --------- Co-authored-by: Rok Mihevc --- python/stubs/__lib_pxi/types.pyi | 3 ++- python/stubs/_fs.pyi | 5 +++-- python/stubs/compute.pyi | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/python/stubs/__lib_pxi/types.pyi b/python/stubs/__lib_pxi/types.pyi index 7fe6c36e332..a7b6062b275 100644 --- a/python/stubs/__lib_pxi/types.pyi +++ b/python/stubs/__lib_pxi/types.pyi @@ -29,7 +29,6 @@ from .io import Buffer from .scalar import ExtensionScalar _AsPyType = TypeVar("_AsPyType") -_DataTypeT = TypeVar("_DataTypeT", bound=DataType) class _Weakrefable: ... class _Metadata(_Weakrefable): ... @@ -186,6 +185,8 @@ class DataType(_Weakrefable): ArrowSchema pointer. """ +_DataTypeT = TypeVar("_DataTypeT", bound=DataType) + class _BasicDataType(DataType, Generic[_AsPyType]): ... class NullType(_BasicDataType[None]): ... class BoolType(_BasicDataType[bool]): ... 
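# Editorial note (illustration only, not part of the patch above): the types.pyi
# hunk moves `_DataTypeT` below the `DataType` class because the `ty` checker
# flags module-level names that are referenced before they are defined — the
# `unresolved-reference` error this commit addresses. A minimal, self-contained
# sketch of the same ordering rule, using the hypothetical names `MyType` and
# `_MyTypeT` (not part of pyarrow's stubs):

from typing import TypeVar

class MyType: ...

# Declared after MyType so that `bound=MyType` resolves; placing this
# assignment above the class definition is what triggered the checker error
# in the stubs.
_MyTypeT = TypeVar("_MyTypeT", bound=MyType)

# A quoted forward reference, TypeVar("_MyTypeT", bound="MyType"), is another
# workaround most checkers accept; this patch series reorders the declarations
# instead, as the _fs.pyi and compute.pyi hunks below do for the same reason.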
diff --git a/python/stubs/_fs.pyi b/python/stubs/_fs.pyi index 7670ef5230d..edce54110f7 100644 --- a/python/stubs/_fs.pyi +++ b/python/stubs/_fs.pyi @@ -19,8 +19,6 @@ from fsspec import AbstractFileSystem # type: ignore[import-untyped] from .lib import NativeFile, _Weakrefable -SupportedFileSystem: TypeAlias = Union[AbstractFileSystem, FileSystem] - class FileType(enum.IntFlag): NotFound = enum.auto() Unknown = enum.auto() @@ -618,6 +616,9 @@ class FileSystem(_Weakrefable): The normalized path """ + +SupportedFileSystem: TypeAlias = Union[AbstractFileSystem, FileSystem] + class LocalFileSystem(FileSystem): """ A FileSystem implementation accessing files on the local machine. diff --git a/python/stubs/compute.pyi b/python/stubs/compute.pyi index f9039731ee6..1cf52ff07ca 100644 --- a/python/stubs/compute.pyi +++ b/python/stubs/compute.pyi @@ -217,9 +217,9 @@ NumericOrDurationScalar: TypeAlias = NumericScalar | lib.DurationScalar NumericOrTemporalScalar: TypeAlias = NumericScalar | TemporalScalar _NumericOrTemporalScalarT = TypeVar("_NumericOrTemporalScalarT", bound=NumericOrTemporalScalar) +_NumericScalarT = TypeVar("_NumericScalarT", bound=NumericScalar) NumericArray: TypeAlias = ArrayOrChunkedArray[_NumericScalarT] _NumericArrayT = TypeVar("_NumericArrayT", bound=NumericArray) -_NumericScalarT = TypeVar("_NumericScalarT", bound=NumericScalar) _NumericOrDurationT = TypeVar("_NumericOrDurationT", bound=NumericOrDurationScalar) NumericOrDurationArray: TypeAlias = ArrayOrChunkedArray[NumericOrDurationScalar] _NumericOrDurationArrayT = TypeVar("_NumericOrDurationArrayT", bound=NumericOrDurationArray) From 9811a700e5320917a91709350e40fd1a8e5ab126 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 24 Jul 2025 20:41:21 +0200 Subject: [PATCH 14/32] GH-6: [Python] Fix invalid-argument-type error (#29) * Add py.typed file to signify that the library is typed See the relevant PEP https://peps.python.org/pep-0561 * Prepare `pyarrow-stubs` for history merging MINOR: [Python] Prepare `pyarrow-stubs` for history merging Co-authored-by: ZhengYu, Xu * Add `ty` configuration and suppress error codes * One line per rule * Add licence header from original repo for all `.pyi` files * Revert "Add licence header from original repo for all `.pyi` files" This reverts commit 1631f3916479ce9e1fd7df1194f61cb420962fd5. * Prepare for licence merging * Exclude `stubs` from `rat` test * Add Apache licence clause to `py.typed` * Reduce list * Add `ty` as a step in the action * Run in the correct directory * Remove `check` from `pip` * Fix `unresolved-reference` error * Revert "Fix `unresolved-reference` error" This reverts commit 7ee3d2f04b2a01a2f1dacccc6d0995f48a3d374d. * removing OrderedDict, etc --------- Co-authored-by: Patrick J. 
Roddy --- python/pyarrow/tests/parquet/test_basic.py | 4 ++-- python/pyarrow/tests/test_compute.py | 5 +++-- python/pyarrow/tests/test_pandas.py | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/python/pyarrow/tests/parquet/test_basic.py b/python/pyarrow/tests/parquet/test_basic.py index 67515c5e247..f615e39c21c 100644 --- a/python/pyarrow/tests/parquet/test_basic.py +++ b/python/pyarrow/tests/parquet/test_basic.py @@ -230,11 +230,11 @@ def test_empty_table_no_columns(): def test_write_nested_zero_length_array_chunk_failure(): # Bug report in ARROW-3792 - cols = OrderedDict( + cols = dict( int32=pa.int32(), list_string=pa.list_(pa.string()) ) - data = [[], [OrderedDict(int32=1, list_string=('G',)), ]] + data = [[], [dict(int32=1, list_string=('G',)), ]] # This produces a table with a column like # )> diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 97f694df1fc..dc35fc6619c 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -2561,12 +2561,13 @@ def test_assume_timezone(): f"timezone '{timezone}'"): pc.assume_timezone(ambiguous_array, options=options_ambiguous_raise) - expected = ambiguous.tz_localize(timezone, ambiguous=[True, True, True]) + expected = ambiguous.tz_localize(timezone, ambiguous=np.array([True, True, True])) result = pc.assume_timezone( ambiguous_array, options=options_ambiguous_earliest) result.equals(pa.array(expected)) - expected = ambiguous.tz_localize(timezone, ambiguous=[False, False, False]) + expected = ambiguous.tz_localize( + timezone, ambiguous=np.array([False, False, False])) result = pc.assume_timezone( ambiguous_array, options=options_ambiguous_latest) result.equals(pa.array(expected)) diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index f0bc4a31f34..4af077ea0ef 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -4963,7 +4963,7 @@ def test_does_not_mutate_timedelta_dtype(): assert np.dtype(np.timedelta64) == expected - df = pd.DataFrame({"a": [np.timedelta64()]}) + df = pd.DataFrame({"a": [np.timedelta64("s")]}) t = pa.Table.from_pandas(df) t.to_pandas() From 5b10460d1ee5a14faa74d7381d6c03698af31bb3 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 24 Jul 2025 20:46:43 +0200 Subject: [PATCH 15/32] GH-27: [Python] Fix call-non-callable error (#28) * Add py.typed file to signify that the library is typed See the relevant PEP https://peps.python.org/pep-0561 * Prepare `pyarrow-stubs` for history merging MINOR: [Python] Prepare `pyarrow-stubs` for history merging Co-authored-by: ZhengYu, Xu * Add `ty` configuration and suppress error codes * One line per rule * Add licence header from original repo for all `.pyi` files * Revert "Add licence header from original repo for all `.pyi` files" This reverts commit 1631f3916479ce9e1fd7df1194f61cb420962fd5. * Prepare for licence merging * Exclude `stubs` from `rat` test * Add Apache licence clause to `py.typed` * Reduce list * Add `ty` as a step in the action * Run in the correct directory * Remove `check` from `pip` * Fix `unresolved-reference` error * Revert "Fix `unresolved-reference` error" This reverts commit 7ee3d2f04b2a01a2f1dacccc6d0995f48a3d374d. * to_sparse appears deprecated, ignore inline * Update python/pyarrow/tests/test_feather.py --------- Co-authored-by: Patrick J. 
Roddy --- python/pyarrow/tests/test_feather.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py index d2b59fddebb..9db63572cb7 100644 --- a/python/pyarrow/tests/test_feather.py +++ b/python/pyarrow/tests/test_feather.py @@ -591,7 +591,7 @@ def test_sparse_dataframe(version): # GH #221 data = {'A': [0, 1, 2], 'B': [1, 0, 1]} - df = pd.DataFrame(data).to_sparse(fill_value=1) + df = pd.DataFrame(data).to_sparse(fill_value=1) # type: ignore[call-non-callable] expected = df.to_dense() _check_pandas_roundtrip(df, expected, version=version) From 0f841ef0effc15034271e06e855675c220c07757 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 24 Jul 2025 20:49:00 +0200 Subject: [PATCH 16/32] GH-10: [Python] Fix invalid-assignment error (#33) * Add py.typed file to signify that the library is typed See the relevant PEP https://peps.python.org/pep-0561 * Prepare `pyarrow-stubs` for history merging MINOR: [Python] Prepare `pyarrow-stubs` for history merging Co-authored-by: ZhengYu, Xu * Add `ty` configuration and suppress error codes * One line per rule * Add licence header from original repo for all `.pyi` files * Revert "Add licence header from original repo for all `.pyi` files" This reverts commit 1631f3916479ce9e1fd7df1194f61cb420962fd5. * Prepare for licence merging * Exclude `stubs` from `rat` test * Add Apache licence clause to `py.typed` * Reduce list * Add `ty` as a step in the action * Run in the correct directory * Remove `check` from `pip` * Fix `unresolved-reference` error * Revert "Fix `unresolved-reference` error" This reverts commit 7ee3d2f04b2a01a2f1dacccc6d0995f48a3d374d. * Fix invalid-assignment --------- Co-authored-by: Patrick J. Roddy --- python/pyarrow/pandas_compat.py | 2 +- python/pyarrow/tests/interchange/test_conversion.py | 2 +- .../pyarrow/tests/interchange/test_interchange_spec.py | 3 ++- python/pyarrow/tests/parquet/common.py | 2 +- python/pyarrow/tests/parquet/test_basic.py | 4 ++-- .../tests/parquet/test_compliant_nested_type.py | 2 +- python/pyarrow/tests/parquet/test_data_types.py | 4 ++-- python/pyarrow/tests/parquet/test_dataset.py | 4 ++-- python/pyarrow/tests/parquet/test_datetime.py | 4 ++-- python/pyarrow/tests/parquet/test_metadata.py | 4 ++-- python/pyarrow/tests/parquet/test_pandas.py | 4 ++-- python/pyarrow/tests/parquet/test_parquet_file.py | 2 +- python/pyarrow/tests/parquet/test_parquet_writer.py | 2 +- python/pyarrow/tests/strategies.py | 8 ++++---- python/pyarrow/tests/test_adhoc_memory_leak.py | 2 +- python/pyarrow/tests/test_array.py | 2 +- python/pyarrow/tests/test_cffi.py | 4 ++-- python/pyarrow/tests/test_compute.py | 4 ++-- python/pyarrow/tests/test_convert_builtin.py | 2 +- python/pyarrow/tests/test_dataset.py | 4 ++-- python/pyarrow/tests/test_extension_type.py | 2 +- python/pyarrow/tests/test_feather.py | 2 +- python/pyarrow/tests/test_flight.py | 2 +- python/pyarrow/tests/test_io.py | 2 +- python/pyarrow/tests/test_ipc.py | 2 +- python/pyarrow/tests/test_json.py | 2 +- python/pyarrow/tests/test_pandas.py | 2 +- python/pyarrow/tests/test_scalars.py | 2 +- python/pyarrow/tests/test_schema.py | 2 +- python/pyarrow/tests/test_sparse_tensor.py | 10 ++++------ python/pyarrow/tests/test_table.py | 2 +- python/pyarrow/tests/test_types.py | 5 +++-- python/pyarrow/tests/test_udf.py | 2 +- 33 files changed, 51 insertions(+), 51 deletions(-) diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index 5a5e7b10f28..7b9f5008a10 100644 --- 
a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -33,7 +33,7 @@ try: import numpy as np except ImportError: - np = None + np = None # type: ignore[assignment] import pyarrow as pa from pyarrow.lib import _pandas_api, frombytes, is_threading_enabled # noqa diff --git a/python/pyarrow/tests/interchange/test_conversion.py b/python/pyarrow/tests/interchange/test_conversion.py index 50da6693aff..a584f379738 100644 --- a/python/pyarrow/tests/interchange/test_conversion.py +++ b/python/pyarrow/tests/interchange/test_conversion.py @@ -23,7 +23,7 @@ try: import numpy as np except ImportError: - np = None + pass import pyarrow.interchange as pi from pyarrow.interchange.column import ( diff --git a/python/pyarrow/tests/interchange/test_interchange_spec.py b/python/pyarrow/tests/interchange/test_interchange_spec.py index cea694d1c1e..56a424fd57a 100644 --- a/python/pyarrow/tests/interchange/test_interchange_spec.py +++ b/python/pyarrow/tests/interchange/test_interchange_spec.py @@ -20,10 +20,11 @@ import hypothesis.strategies as st import pytest +np = None try: import numpy as np except ImportError: - np = None + pass import pyarrow as pa import pyarrow.tests.strategies as past diff --git a/python/pyarrow/tests/parquet/common.py b/python/pyarrow/tests/parquet/common.py index 4f5946649b8..7351a4c3e94 100644 --- a/python/pyarrow/tests/parquet/common.py +++ b/python/pyarrow/tests/parquet/common.py @@ -20,7 +20,7 @@ try: import numpy as np except ImportError: - np = None + np = None # type: ignore[assignment] import pyarrow as pa from pyarrow.tests import util diff --git a/python/pyarrow/tests/parquet/test_basic.py b/python/pyarrow/tests/parquet/test_basic.py index f615e39c21c..528f8e51683 100644 --- a/python/pyarrow/tests/parquet/test_basic.py +++ b/python/pyarrow/tests/parquet/test_basic.py @@ -44,12 +44,12 @@ from pyarrow.tests.pandas_examples import dataframe_with_lists from pyarrow.tests.parquet.common import alltypes_sample except ImportError: - pd = tm = None + pd = tm = None # type: ignore[assignment] try: import numpy as np except ImportError: - np = None + np = None # type: ignore[assignment] # Marks all of the tests in this module # Ignore these with pytest ... 
-m 'not parquet' diff --git a/python/pyarrow/tests/parquet/test_compliant_nested_type.py b/python/pyarrow/tests/parquet/test_compliant_nested_type.py index 2345855a332..8a64cd0cab7 100644 --- a/python/pyarrow/tests/parquet/test_compliant_nested_type.py +++ b/python/pyarrow/tests/parquet/test_compliant_nested_type.py @@ -32,7 +32,7 @@ from pyarrow.tests.parquet.common import _roundtrip_pandas_dataframe except ImportError: - pd = tm = None + pd = tm = None # type: ignore[assignment] # Marks all of the tests in this module diff --git a/python/pyarrow/tests/parquet/test_data_types.py b/python/pyarrow/tests/parquet/test_data_types.py index c546bc1532a..66e12d11b21 100644 --- a/python/pyarrow/tests/parquet/test_data_types.py +++ b/python/pyarrow/tests/parquet/test_data_types.py @@ -22,7 +22,7 @@ try: import numpy as np except ImportError: - np = None + np = None # type: ignore[assignment] import pytest import pyarrow as pa @@ -44,7 +44,7 @@ dataframe_with_lists) from pyarrow.tests.parquet.common import alltypes_sample except ImportError: - pd = tm = None + pd = tm = None # type: ignore[assignment] # Marks all of the tests in this module diff --git a/python/pyarrow/tests/parquet/test_dataset.py b/python/pyarrow/tests/parquet/test_dataset.py index b8939443c1d..a162006dc0c 100644 --- a/python/pyarrow/tests/parquet/test_dataset.py +++ b/python/pyarrow/tests/parquet/test_dataset.py @@ -24,7 +24,7 @@ try: import numpy as np except ImportError: - np = None + np = None # type: ignore[assignment] import pytest import unittest.mock as mock @@ -48,7 +48,7 @@ import pandas.testing as tm except ImportError: - pd = tm = None + pd = tm = None # type: ignore[assignment] # Marks all of the tests in this module diff --git a/python/pyarrow/tests/parquet/test_datetime.py b/python/pyarrow/tests/parquet/test_datetime.py index b89fd97cb91..62904937eb5 100644 --- a/python/pyarrow/tests/parquet/test_datetime.py +++ b/python/pyarrow/tests/parquet/test_datetime.py @@ -22,7 +22,7 @@ try: import numpy as np except ImportError: - np = None + np = None # type: ignore[assignment] import pytest import pyarrow as pa @@ -41,7 +41,7 @@ from pyarrow.tests.parquet.common import _roundtrip_pandas_dataframe except ImportError: - pd = tm = None + pd = tm = None # type: ignore[assignment] # Marks all of the tests in this module diff --git a/python/pyarrow/tests/parquet/test_metadata.py b/python/pyarrow/tests/parquet/test_metadata.py index 148bfebaa67..d8fafde185f 100644 --- a/python/pyarrow/tests/parquet/test_metadata.py +++ b/python/pyarrow/tests/parquet/test_metadata.py @@ -23,7 +23,7 @@ try: import numpy as np except ImportError: - np = None + np = None # type: ignore[assignment] import pytest import pyarrow as pa @@ -44,7 +44,7 @@ from pyarrow.tests.parquet.common import alltypes_sample except ImportError: - pd = tm = None + pd = tm = None # type: ignore[assignment] # Marks all of the tests in this module diff --git a/python/pyarrow/tests/parquet/test_pandas.py b/python/pyarrow/tests/parquet/test_pandas.py index 7f647883561..9b9e7c4e48e 100644 --- a/python/pyarrow/tests/parquet/test_pandas.py +++ b/python/pyarrow/tests/parquet/test_pandas.py @@ -21,7 +21,7 @@ try: import numpy as np except ImportError: - np = None + np = None # type: ignore[assignment] import pytest import pyarrow as pa @@ -44,7 +44,7 @@ from pyarrow.tests.parquet.common import (_roundtrip_pandas_dataframe, alltypes_sample) except ImportError: - pd = tm = None + pd = tm = None # type: ignore[assignment] # Marks all of the tests in this module diff --git 
a/python/pyarrow/tests/parquet/test_parquet_file.py b/python/pyarrow/tests/parquet/test_parquet_file.py index aef0954eacd..28f25ac8482 100644 --- a/python/pyarrow/tests/parquet/test_parquet_file.py +++ b/python/pyarrow/tests/parquet/test_parquet_file.py @@ -38,7 +38,7 @@ from pyarrow.tests.parquet.common import alltypes_sample except ImportError: - pd = tm = None + pd = tm = None # type: ignore[assignment] # Marks all of the tests in this module diff --git a/python/pyarrow/tests/parquet/test_parquet_writer.py b/python/pyarrow/tests/parquet/test_parquet_writer.py index d1e9e874ba1..8f163dfc0b5 100644 --- a/python/pyarrow/tests/parquet/test_parquet_writer.py +++ b/python/pyarrow/tests/parquet/test_parquet_writer.py @@ -33,7 +33,7 @@ import pandas.testing as tm except ImportError: - pd = tm = None + pd = tm = None # type: ignore[assignment] # Marks all of the tests in this module diff --git a/python/pyarrow/tests/strategies.py b/python/pyarrow/tests/strategies.py index 450cce74f1d..243815c59f7 100644 --- a/python/pyarrow/tests/strategies.py +++ b/python/pyarrow/tests/strategies.py @@ -24,15 +24,15 @@ try: import hypothesis.extra.numpy as npst except ImportError: - npst = None + npst = None # type: ignore[assignment] try: import hypothesis.extra.pytz as tzst except ImportError: - tzst = None + tzst = None # type: ignore[assignment] try: import zoneinfo except ImportError: - zoneinfo = None + zoneinfo = None # type: ignore[assignment] if sys.platform == 'win32': try: import tzdata # noqa:F401 @@ -41,7 +41,7 @@ try: import numpy as np except ImportError: - np = None + np = None # type: ignore[assignment] import pyarrow as pa diff --git a/python/pyarrow/tests/test_adhoc_memory_leak.py b/python/pyarrow/tests/test_adhoc_memory_leak.py index 76a766984da..9f61bc7ddfe 100644 --- a/python/pyarrow/tests/test_adhoc_memory_leak.py +++ b/python/pyarrow/tests/test_adhoc_memory_leak.py @@ -20,7 +20,7 @@ try: import numpy as np except ImportError: - np = None + pass import pyarrow as pa import pyarrow.tests.util as test_util diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 9a5044ce394..a06e3f76570 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -30,7 +30,7 @@ try: import numpy as np except ImportError: - np = None + np = None # type: ignore[assignment] import pyarrow as pa import pyarrow.tests.strategies as past diff --git a/python/pyarrow/tests/test_cffi.py b/python/pyarrow/tests/test_cffi.py index 84290a6b880..2d0ff8b45f1 100644 --- a/python/pyarrow/tests/test_cffi.py +++ b/python/pyarrow/tests/test_cffi.py @@ -24,7 +24,7 @@ try: from pyarrow.cffi import ffi except ImportError: - ffi = None + ffi = None # type: ignore[assignment] import pytest @@ -32,7 +32,7 @@ import pandas as pd import pandas.testing as tm except ImportError: - pd = tm = None + pd = tm = None # type: ignore[assignment] needs_cffi = pytest.mark.skipif(ffi is None, diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index dc35fc6619c..003fb5db41d 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -31,12 +31,12 @@ try: import numpy as np except ImportError: - np = None + np = None # type: ignore[assignment] try: import pandas as pd except ImportError: - pd = None + pd = None # type: ignore[assignment] import pyarrow as pa import pyarrow.compute as pc diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index 
07286125c4c..468bddf58cb 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -27,7 +27,7 @@ try: import numpy as np except ImportError: - np = None + np = None # type: ignore[assignment] from pyarrow.pandas_compat import _pandas_api # noqa import pyarrow as pa diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index e7365643b84..d1cd3f6b8a1 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -32,7 +32,7 @@ try: import numpy as np except ImportError: - np = None + np = None # type: ignore[assignment] import pytest import pyarrow as pa @@ -49,7 +49,7 @@ try: import pandas as pd except ImportError: - pd = None + pd = None # type: ignore[assignment] try: import pyarrow.dataset as ds diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index ea1c0afd7ff..1a851611b14 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -27,7 +27,7 @@ try: import numpy as np except ImportError: - np = None + np = None # type: ignore[assignment] import pyarrow as pa from pyarrow.vendored.version import Version diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py index 9db63572cb7..8235260f468 100644 --- a/python/pyarrow/tests/test_feather.py +++ b/python/pyarrow/tests/test_feather.py @@ -26,7 +26,7 @@ try: import numpy as np except ImportError: - np = None + np = None # type: ignore[assignment] import pyarrow as pa import pyarrow.tests.strategies as past diff --git a/python/pyarrow/tests/test_flight.py b/python/pyarrow/tests/test_flight.py index 5fe85ef4870..600c6492780 100644 --- a/python/pyarrow/tests/test_flight.py +++ b/python/pyarrow/tests/test_flight.py @@ -33,7 +33,7 @@ try: import numpy as np except ImportError: - np = None + np = None # type: ignore[assignment] import pytest import pyarrow as pa diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py index a6d3546e57c..b1ec7674f87 100644 --- a/python/pyarrow/tests/test_io.py +++ b/python/pyarrow/tests/test_io.py @@ -33,7 +33,7 @@ try: import numpy as np except ImportError: - np = None + np = None # type: ignore[assignment] from pyarrow.util import guid from pyarrow import Codec diff --git a/python/pyarrow/tests/test_ipc.py b/python/pyarrow/tests/test_ipc.py index b3b3367223d..26df224ee49 100644 --- a/python/pyarrow/tests/test_ipc.py +++ b/python/pyarrow/tests/test_ipc.py @@ -28,7 +28,7 @@ try: import numpy as np except ImportError: - np = None + np = None # type: ignore[assignment] import pyarrow as pa from pyarrow.tests.util import changed_environ, invoke_script diff --git a/python/pyarrow/tests/test_json.py b/python/pyarrow/tests/test_json.py index 68ac40063c9..ab0602cd198 100644 --- a/python/pyarrow/tests/test_json.py +++ b/python/pyarrow/tests/test_json.py @@ -27,7 +27,7 @@ try: import numpy as np except ImportError: - np = None + np = None # type: ignore[assignment] import pytest import pyarrow as pa diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 4af077ea0ef..535b95515dc 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -38,7 +38,7 @@ VisibleDeprecationWarning as _np_VisibleDeprecationWarning ) except ImportError: - np = None + np = None # type: ignore[assignment] from pyarrow.pandas_compat import get_logical_type, _pandas_api from pyarrow.tests.util import 
invoke_script, random_ascii, rands diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py index 0f62dd98f82..f48761b1918 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -24,7 +24,7 @@ try: import numpy as np except ImportError: - np = None + np = None # type: ignore[assignment] import pyarrow as pa import pyarrow.compute as pc diff --git a/python/pyarrow/tests/test_schema.py b/python/pyarrow/tests/test_schema.py index a1197ed2d08..6d1ff431819 100644 --- a/python/pyarrow/tests/test_schema.py +++ b/python/pyarrow/tests/test_schema.py @@ -23,7 +23,7 @@ try: import numpy as np except ImportError: - np = None + np = None # type: ignore[assignment] import pyarrow as pa import pyarrow.tests.util as test_util diff --git a/python/pyarrow/tests/test_sparse_tensor.py b/python/pyarrow/tests/test_sparse_tensor.py index eca8090d77a..27974b80f80 100644 --- a/python/pyarrow/tests/test_sparse_tensor.py +++ b/python/pyarrow/tests/test_sparse_tensor.py @@ -26,12 +26,10 @@ import pyarrow as pa try: + import scipy from scipy.sparse import csr_array, coo_array, csr_matrix, coo_matrix except ImportError: - coo_matrix = None - csr_matrix = None - csr_array = None - coo_array = None + scipy = None # type: ignore[assignment] try: import sparse @@ -401,7 +399,7 @@ def test_dense_to_sparse_tensor(dtype_str, arrow_type, sparse_tensor_type): assert np.array_equal(array, result_array) -@pytest.mark.skipif(not coo_matrix, reason="requires scipy") +@pytest.mark.skipif(not scipy, reason="requires scipy") @pytest.mark.parametrize('sparse_object', (coo_array, coo_matrix)) @pytest.mark.parametrize('dtype_str,arrow_type', scipy_type_pairs) def test_sparse_coo_tensor_scipy_roundtrip(dtype_str, arrow_type, @@ -443,7 +441,7 @@ def test_sparse_coo_tensor_scipy_roundtrip(dtype_str, arrow_type, assert out_scipy_matrix.has_canonical_format -@pytest.mark.skipif(not csr_matrix, reason="requires scipy") +@pytest.mark.skipif(not scipy, reason="requires scipy") @pytest.mark.parametrize('sparse_object', (csr_array, csr_matrix)) @pytest.mark.parametrize('dtype_str,arrow_type', scipy_type_pairs) def test_sparse_csr_matrix_scipy_roundtrip(dtype_str, arrow_type, diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index b65fb7d952c..ead5cbaddc5 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -23,7 +23,7 @@ try: import numpy as np except ImportError: - np = None + np = None # type: ignore[assignment] import pytest import pyarrow as pa import pyarrow.compute as pc diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index e628e559b84..338c022a223 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -27,13 +27,13 @@ try: import hypothesis.extra.pytz as tzst except ImportError: - tzst = None + tzst = None # type: ignore[assignment] import weakref try: import numpy as np except ImportError: - np = None + pass import pyarrow as pa import pyarrow.types as types import pyarrow.tests.strategies as past @@ -1322,6 +1322,7 @@ def test_field_modified_copies(): assert f0.equals(f0_) +@pytest.mark.numpy def test_is_integer_value(): assert pa.types.is_integer_value(1) if np is not None: diff --git a/python/pyarrow/tests/test_udf.py b/python/pyarrow/tests/test_udf.py index 93004a30618..dbc30867971 100644 --- a/python/pyarrow/tests/test_udf.py +++ b/python/pyarrow/tests/test_udf.py @@ -21,7 +21,7 @@ try: import numpy as np except 
ImportError: - np = None + np = None # type: ignore[assignment] import pyarrow as pa from pyarrow import compute as pc From b2395974a4c1e31c55fefed98cea0ac56fc71f6d Mon Sep 17 00:00:00 2001 From: "Patrick J. Roddy" Date: Thu, 24 Jul 2025 19:50:27 +0100 Subject: [PATCH 17/32] GH-21: [Python] Fix `unsupported-operator` typing (#22) * Add py.typed file to signify that the library is typed See the relevant PEP https://peps.python.org/pep-0561 * Prepare `pyarrow-stubs` for history merging MINOR: [Python] Prepare `pyarrow-stubs` for history merging Co-authored-by: ZhengYu, Xu * Add `ty` configuration and suppress error codes * One line per rule * Add licence header from original repo for all `.pyi` files * Revert "Add licence header from original repo for all `.pyi` files" This reverts commit 1631f3916479ce9e1fd7df1194f61cb420962fd5. * Prepare for licence merging * Exclude `stubs` from `rat` test * Add Apache licence clause to `py.typed` * Reduce list * Add `ty` as a step in the action * Run in the correct directory * Remove `check` from `pip` * Fix `unsupported-operator` error --------- Co-authored-by: Rok Mihevc --- python/stubs/__lib_pxi/table.pyi | 5 +++-- python/stubs/_fs.pyi | 1 - python/stubs/_stubs_typing.pyi | 26 +++++++++++++------------- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/python/stubs/__lib_pxi/table.pyi b/python/stubs/__lib_pxi/table.pyi index ad34e9b6dff..fbcfb1ef745 100644 --- a/python/stubs/__lib_pxi/table.pyi +++ b/python/stubs/__lib_pxi/table.pyi @@ -23,6 +23,7 @@ from typing import ( Mapping, Sequence, TypeVar, + Union, overload, ) @@ -117,8 +118,8 @@ AggregateOptions: TypeAlias = ( UnarySelector: TypeAlias = str NullarySelector: TypeAlias = tuple[()] -NarySelector: TypeAlias = list[str] | tuple[str, ...] -ColumnSelector: TypeAlias = UnarySelector | NullarySelector | NarySelector +NarySelector: TypeAlias = Union[list[str], tuple[str, ...]] +ColumnSelector: TypeAlias = Union[UnarySelector, NullarySelector, NarySelector] class ChunkedArray(_PandasConvertible[pd.Series], Generic[_Scalar_co]): """ diff --git a/python/stubs/_fs.pyi b/python/stubs/_fs.pyi index edce54110f7..9f6e28dcf0f 100644 --- a/python/stubs/_fs.pyi +++ b/python/stubs/_fs.pyi @@ -616,7 +616,6 @@ class FileSystem(_Weakrefable): The normalized path """ - SupportedFileSystem: TypeAlias = Union[AbstractFileSystem, FileSystem] class LocalFileSystem(FileSystem): diff --git a/python/stubs/_stubs_typing.pyi b/python/stubs/_stubs_typing.pyi index c259513f1ea..40d931d24ed 100644 --- a/python/stubs/_stubs_typing.pyi +++ b/python/stubs/_stubs_typing.pyi @@ -2,7 +2,7 @@ import datetime as dt from collections.abc import Sequence from decimal import Decimal -from typing import Any, Collection, Literal, Protocol, TypeAlias, TypeVar +from typing import Any, Collection, Literal, Protocol, TypeAlias, TypeVar, Union import numpy as np @@ -30,12 +30,12 @@ NullEncoding: TypeAlias = Literal["mask", "encode"] NullSelectionBehavior: TypeAlias = Literal["drop", "emit_null"] Mask: TypeAlias = Sequence[bool | None] | NDArray[np.bool_] | BooleanArray Indices: TypeAlias = Sequence[int] | NDArray[np.integer[Any]] | IntegerArray -PyScalar: TypeAlias = ( - bool | int | float | Decimal | str | bytes | dt.date | dt.datetime | dt.time | dt.timedelta -) +PyScalar: TypeAlias = Union[ + bool, int, float, Decimal, str, bytes, dt.date, dt.datetime, dt.time, dt.timedelta +] _T = TypeVar("_T") -SingleOrList: TypeAlias = list[_T] | _T +SingleOrList: TypeAlias = Union[list[_T], _T] class SupportEq(Protocol): def __eq__(self, 
other) -> bool: ... @@ -52,14 +52,14 @@ class SupportLe(Protocol): class SupportGe(Protocol): def __ge__(self, other) -> bool: ... -FilterTuple: TypeAlias = ( - tuple[str, Literal["=", "==", "!="], SupportEq] - | tuple[str, Literal["<"], SupportLt] - | tuple[str, Literal[">"], SupportGt] - | tuple[str, Literal["<="], SupportLe] - | tuple[str, Literal[">="], SupportGe] - | tuple[str, Literal["in", "not in"], Collection] -) +FilterTuple: TypeAlias = Union[ + tuple[str, Literal["=", "==", "!="], SupportEq], + tuple[str, Literal["<"], SupportLt], + tuple[str, Literal[">"], SupportGt], + tuple[str, Literal["<="], SupportLe], + tuple[str, Literal[">="], SupportGe], + tuple[str, Literal["in", "not in"], Collection] +] class Buffer(Protocol): def __buffer__(self, flags: int, /) -> memoryview: ... From c7963f5c64e18181039aeeb506f6f070ee341ab0 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 24 Jul 2025 21:10:57 +0200 Subject: [PATCH 18/32] Merge branch 'unresolved-import' into add-pyarrow-stubs diff --git c/python/pyarrow/__init__.py i/python/pyarrow/__init__.py index da2fe96647..45aa2b619f 100644 --- c/python/pyarrow/__init__.py +++ i/python/pyarrow/__init__.py @@ -58,8 +58,8 @@ except ImportError: except ImportError: __version__ = None -import pyarrow.lib as _lib -from pyarrow.lib import (BuildInfo, CppBuildInfo, RuntimeInfo, set_timezone_db_path, +import pyarrow.lib as _lib # type: ignore[unresolved_import] +from pyarrow.lib import (BuildInfo, CppBuildInfo, RuntimeInfo, set_timezone_db_path, # type: ignore[unresolved_import] MonthDayNano, VersionInfo, build_info, cpp_build_info, cpp_version, cpp_version_info, runtime_info, cpu_count, set_cpu_count, enable_signal_handlers, @@ -153,7 +153,7 @@ def show_info(): print(f" {codec: <20}: {status: <8}") -from pyarrow.lib import (null, bool_, +from pyarrow.lib import (null, bool_, # type: ignore[unresolved_import] int8, int16, int32, int64, uint8, uint16, uint32, uint64, time32, time64, timestamp, date32, date64, duration, @@ -237,13 +237,13 @@ from pyarrow.lib import (null, bool_, FixedShapeTensorScalar, JsonScalar, OpaqueScalar, UuidScalar) # Buffers, allocation -from pyarrow.lib import (DeviceAllocationType, Device, MemoryManager, +from pyarrow.lib import (DeviceAllocationType, Device, MemoryManager, # type: ignore[unresolved_import] default_cpu_memory_manager) -from pyarrow.lib import (Buffer, ResizableBuffer, foreign_buffer, py_buffer, +from pyarrow.lib import (Buffer, ResizableBuffer, foreign_buffer, py_buffer, # type: ignore[unresolved_import] Codec, compress, decompress, allocate_buffer) -from pyarrow.lib import (MemoryPool, LoggingMemoryPool, ProxyMemoryPool, +from pyarrow.lib import (MemoryPool, LoggingMemoryPool, ProxyMemoryPool, # type: ignore[unresolved_import] total_allocated_bytes, set_memory_pool, default_memory_pool, system_memory_pool, jemalloc_memory_pool, mimalloc_memory_pool, @@ -252,7 +252,7 @@ from pyarrow.lib import (MemoryPool, LoggingMemoryPool, ProxyMemoryPool, supported_memory_backends) # I/O -from pyarrow.lib import (NativeFile, PythonFile, +from pyarrow.lib import (NativeFile, PythonFile, # type: ignore[unresolved_import] BufferedInputStream, BufferedOutputStream, CacheOptions, CompressedInputStream, CompressedOutputStream, TransformInputStream, transcoding_input_stream, @@ -263,12 +263,12 @@ from pyarrow.lib import (NativeFile, PythonFile, input_stream, output_stream, have_libhdfs) -from pyarrow.lib import (ChunkedArray, RecordBatch, Table, table, +from pyarrow.lib import (ChunkedArray, RecordBatch, Table, table, # 
type: ignore[unresolved_import] concat_arrays, concat_tables, TableGroupBy, RecordBatchReader, concat_batches) # Exceptions -from pyarrow.lib import (ArrowCancelled, +from pyarrow.lib import (ArrowCancelled, # type: ignore[unresolved_import] ArrowCapacityError, ArrowException, ArrowKeyError, diff --git c/python/pyarrow/acero.py i/python/pyarrow/acero.py index e475e8db5c..dcead124d3 100644 --- c/python/pyarrow/acero.py +++ i/python/pyarrow/acero.py @@ -22,11 +22,11 @@ # distutils: language = c++ # cython: language_level = 3 -from pyarrow.lib import Table, RecordBatch, array +from pyarrow.lib import Table, RecordBatch, array # type: ignore[unresolved_import] from pyarrow.compute import Expression, field try: - from pyarrow._acero import ( # noqa + from pyarrow._acero import ( # type: ignore[unresolved_import] # noqa Declaration, ExecNodeOptions, TableSourceNodeOptions, @@ -45,7 +45,7 @@ except ImportError as exc: try: import pyarrow.dataset as ds - from pyarrow._dataset import ScanNodeOptions + from pyarrow._dataset import ScanNodeOptions # type: ignore[unresolved_import] except ImportError: class DatasetModuleStub: class Dataset: diff --git c/python/pyarrow/benchmark.py i/python/pyarrow/benchmark.py index 25ee1141f0..c0ea1b0ec8 100644 --- c/python/pyarrow/benchmark.py +++ i/python/pyarrow/benchmark.py @@ -18,4 +18,4 @@ # flake8: noqa -from pyarrow.lib import benchmark_PandasObjectIsNull +from pyarrow.lib import benchmark_PandasObjectIsNull # type: ignore[unresolved_import] diff --git c/python/pyarrow/compute.py i/python/pyarrow/compute.py index fe0afdb0a8..52e2de0e48 100644 --- c/python/pyarrow/compute.py +++ i/python/pyarrow/compute.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -from pyarrow._compute import ( # noqa +from pyarrow._compute import ( # type: ignore[unresolved_import] # noqa Function, FunctionOptions, FunctionRegistry, diff --git c/python/pyarrow/conftest.py i/python/pyarrow/conftest.py index 41beaa1404..d1b1567389 100644 --- c/python/pyarrow/conftest.py +++ i/python/pyarrow/conftest.py @@ -21,7 +21,7 @@ import os import pyarrow as pa from pyarrow import Codec from pyarrow import fs -from pyarrow.lib import is_threading_enabled +from pyarrow.lib import is_threading_enabled # type: ignore[unresolved_import] from pyarrow.tests.util import windows_has_tzdata import sys @@ -120,13 +120,13 @@ except ImportError: pass try: - import fastparquet # noqa + import fastparquet # type: ignore[unresolved_import] # noqa defaults['fastparquet'] = True except ImportError: pass try: - import pyarrow.gandiva # noqa + import pyarrow.gandiva # type: ignore[unresolved_import] # noqa defaults['gandiva'] = True except ImportError: pass diff --git c/python/pyarrow/csv.py i/python/pyarrow/csv.py index 1ae197f9f2..76ab1c5e03 100644 --- c/python/pyarrow/csv.py +++ i/python/pyarrow/csv.py @@ -16,7 +16,7 @@ # under the License. 
-from pyarrow._csv import ( # noqa +from pyarrow._csv import ( # type: ignore[unresolved_import] # noqa ReadOptions, ParseOptions, ConvertOptions, ISO8601, open_csv, read_csv, CSVStreamingReader, write_csv, WriteOptions, CSVWriter, InvalidRow) diff --git c/python/pyarrow/cuda.py i/python/pyarrow/cuda.py index 18c530d4af..834096cfa3 100644 --- c/python/pyarrow/cuda.py +++ i/python/pyarrow/cuda.py @@ -18,7 +18,7 @@ # flake8: noqa -from pyarrow._cuda import (Context, IpcMemHandle, CudaBuffer, +from pyarrow._cuda import (Context, IpcMemHandle, CudaBuffer, # type: ignore[unresolved_import] HostBuffer, BufferReader, BufferWriter, new_host_buffer, serialize_record_batch, read_message, diff --git c/python/pyarrow/dataset.py i/python/pyarrow/dataset.py index ef4f728872..1ab75f8a7f 100644 --- c/python/pyarrow/dataset.py +++ i/python/pyarrow/dataset.py @@ -21,7 +21,7 @@ import pyarrow as pa from pyarrow.util import _is_iterable, _stringify_path, _is_path_like try: - from pyarrow._dataset import ( # noqa + from pyarrow._dataset import ( # type: ignore[unresolved_import] # noqa CsvFileFormat, CsvFragmentScanOptions, JsonFileFormat, @@ -70,7 +70,7 @@ _orc_msg = ( ) try: - from pyarrow._dataset_orc import OrcFileFormat + from pyarrow._dataset_orc import OrcFileFormat # type: ignore[unresolved_import] _orc_available = True except ImportError: pass @@ -82,7 +82,7 @@ _parquet_msg = ( ) try: - from pyarrow._dataset_parquet import ( # noqa + from pyarrow._dataset_parquet import ( # type: ignore[unresolved_import] # noqa ParquetDatasetFactory, ParquetFactoryOptions, ParquetFileFormat, @@ -98,7 +98,7 @@ except ImportError: try: - from pyarrow._dataset_parquet_encryption import ( # noqa + from pyarrow._dataset_parquet_encryption import ( # type: ignore[unresolved_import] # noqa ParquetDecryptionConfig, ParquetEncryptionConfig, ) diff --git c/python/pyarrow/feather.py i/python/pyarrow/feather.py index 241c27706a..28a5c2c547 100644 --- c/python/pyarrow/feather.py +++ i/python/pyarrow/feather.py @@ -20,11 +20,12 @@ from collections.abc import Sequence import os from pyarrow.pandas_compat import _pandas_api # noqa -from pyarrow.lib import (Codec, Table, # noqa +from pyarrow.lib import (Codec, Table, # type: ignore[unresolved_import] # noqa concat_tables, schema) -import pyarrow.lib as ext -from pyarrow import _feather -from pyarrow._feather import FeatherError # noqa: F401 +import pyarrow.lib as ext # type: ignore[unresolved_import] +from pyarrow import _feather # type: ignore[unresolved_import] +from pyarrow._feather import FeatherError \ + # type: ignore[unresolved_import] # noqa: F401 class FeatherDataset: diff --git c/python/pyarrow/flight.py i/python/pyarrow/flight.py index b1836907c6..d6c4602b45 100644 --- c/python/pyarrow/flight.py +++ i/python/pyarrow/flight.py @@ -16,7 +16,7 @@ # under the License. try: - from pyarrow._flight import ( # noqa:F401 + from pyarrow._flight import ( # type: ignore[unresolved_import] # noqa:F401 connect, Action, ActionType, diff --git c/python/pyarrow/fs.py i/python/pyarrow/fs.py index 157dbdf938..c7f1b325c7 100644 --- c/python/pyarrow/fs.py +++ i/python/pyarrow/fs.py @@ -21,7 +21,7 @@ FileSystem abstraction to interact with various local and remote filesystems. 
from pyarrow.util import _is_path_like, _stringify_path -from pyarrow._fs import ( # noqa +from pyarrow._fs import ( # type: ignore[unresolved_import] # noqa FileSelector, FileType, FileInfo, @@ -40,22 +40,22 @@ FileStats = FileInfo _not_imported = [] try: - from pyarrow._azurefs import AzureFileSystem # noqa + from pyarrow._azurefs import AzureFileSystem # type: ignore[unresolved_import] # noqa except ImportError: _not_imported.append("AzureFileSystem") try: - from pyarrow._hdfs import HadoopFileSystem # noqa + from pyarrow._hdfs import HadoopFileSystem # type: ignore[unresolved_import] # noqa except ImportError: _not_imported.append("HadoopFileSystem") try: - from pyarrow._gcsfs import GcsFileSystem # noqa + from pyarrow._gcsfs import GcsFileSystem # type: ignore[unresolved_import] # noqa except ImportError: _not_imported.append("GcsFileSystem") try: - from pyarrow._s3fs import ( # noqa + from pyarrow._s3fs import ( # type: ignore[unresolved_import] # noqa AwsDefaultS3RetryStrategy, AwsStandardS3RetryStrategy, S3FileSystem, S3LogLevel, S3RetryStrategy, ensure_s3_initialized, finalize_s3, ensure_s3_finalized, initialize_s3, resolve_s3_region) diff --git c/python/pyarrow/ipc.py i/python/pyarrow/ipc.py index 4e23667878..39ec944b72 100644 --- c/python/pyarrow/ipc.py +++ i/python/pyarrow/ipc.py @@ -21,14 +21,14 @@ import os import pyarrow as pa -from pyarrow.lib import (IpcReadOptions, IpcWriteOptions, ReadStats, WriteStats, # noqa +from pyarrow.lib import (IpcReadOptions, IpcWriteOptions, ReadStats, WriteStats, # type: ignore[unresolved_import] # noqa Message, MessageReader, RecordBatchReader, _ReadPandasMixin, MetadataVersion, Alignment, read_message, read_record_batch, read_schema, read_tensor, write_tensor, get_record_batch_size, get_tensor_size) -import pyarrow.lib as lib +import pyarrow.lib as lib # type: ignore[unresolved_import] class RecordBatchStreamReader(lib._RecordBatchStreamReader): diff --git c/python/pyarrow/json.py i/python/pyarrow/json.py index 24e6046135..d4988a1b5a 100644 --- c/python/pyarrow/json.py +++ i/python/pyarrow/json.py @@ -16,4 +16,4 @@ # under the License. 
-from pyarrow._json import ReadOptions, ParseOptions, read_json, open_json # noqa +from pyarrow._json import ReadOptions, ParseOptions, read_json, open_json # type: ignore[unresolved_import] # noqa diff --git c/python/pyarrow/orc.py i/python/pyarrow/orc.py index 4e0d66ec66..03c6a48046 100644 --- c/python/pyarrow/orc.py +++ i/python/pyarrow/orc.py @@ -19,8 +19,8 @@ from numbers import Integral import warnings -from pyarrow.lib import Table -import pyarrow._orc as _orc +from pyarrow.lib import Table # type: ignore[unresolved_import] +import pyarrow._orc as _orc # type: ignore[unresolved_import] from pyarrow.fs import _resolve_filesystem_and_path diff --git c/python/pyarrow/pandas_compat.py i/python/pyarrow/pandas_compat.py index 7b9f5008a1..f284d411ab 100644 --- c/python/pyarrow/pandas_compat.py +++ i/python/pyarrow/pandas_compat.py @@ -35,7 +35,7 @@ try: except ImportError: np = None # type: ignore[assignment] import pyarrow as pa -from pyarrow.lib import _pandas_api, frombytes, is_threading_enabled # noqa +from pyarrow.lib import _pandas_api, frombytes, is_threading_enabled # type: ignore[unresolved_import] # noqa _logical_type_map = {} @@ -729,7 +729,7 @@ def _reconstruct_block(item, columns=None, extension_columns=None, return_block= pandas Block """ - import pandas.core.internals as _int + import pandas.core.internals as _int # type: ignore[unresolved_import] block_arr = item.get('block', None) placement = item['placement'] @@ -806,7 +806,8 @@ def table_to_dataframe( result = pa.lib.table_to_blocks(options, table, categories, list(ext_columns_dtypes.keys())) if _pandas_api.is_ge_v3(): - from pandas.api.internals import create_dataframe_from_blocks + from pandas.api.internals import create_dataframe_from_blocks \ + # type: ignore[unresolved_import] blocks = [ _reconstruct_block( @@ -816,7 +817,8 @@ def table_to_dataframe( df = create_dataframe_from_blocks(blocks, index=index, columns=columns) return df else: - from pandas.core.internals import BlockManager + from pandas.core.internals import BlockManager \ + # type: ignore[unresolved_import] from pandas import DataFrame blocks = [ diff --git c/python/pyarrow/parquet/core.py i/python/pyarrow/parquet/core.py index 8c1a2ae782..7b6c57f968 100644 --- c/python/pyarrow/parquet/core.py +++ i/python/pyarrow/parquet/core.py @@ -29,14 +29,14 @@ import operator import pyarrow as pa try: - import pyarrow._parquet as _parquet + import pyarrow._parquet as _parquet # type: ignore[unresolved_import] except ImportError as exc: raise ImportError( "The pyarrow installation is not built with support " f"for the Parquet file format ({str(exc)})" ) from None -from pyarrow._parquet import (ParquetReader, Statistics, # noqa +from pyarrow._parquet import (ParquetReader, Statistics, # type: ignore[unresolved_import] # noqa FileMetaData, RowGroupMetaData, ColumnChunkMetaData, ParquetSchema, ColumnSchema, diff --git c/python/pyarrow/parquet/encryption.py i/python/pyarrow/parquet/encryption.py index df6eed913f..43e3bce04e 100644 --- c/python/pyarrow/parquet/encryption.py +++ i/python/pyarrow/parquet/encryption.py @@ -16,7 +16,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
-from pyarrow._parquet_encryption import (CryptoFactory, # noqa +from pyarrow._parquet_encryption import (CryptoFactory, # type: ignore[unresolved_import] # noqa EncryptionConfiguration, DecryptionConfiguration, KmsConnectionConfig, diff --git c/python/pyarrow/substrait.py i/python/pyarrow/substrait.py index db2c3a96a1..7ddfa790cb 100644 --- c/python/pyarrow/substrait.py +++ i/python/pyarrow/substrait.py @@ -16,7 +16,7 @@ # under the License. try: - from pyarrow._substrait import ( # noqa + from pyarrow._substrait import ( # type: ignore[unresolved_import] # noqa BoundExpressions, get_supported_functions, run_query, diff --git c/python/pyarrow/tests/test_builder.py i/python/pyarrow/tests/test_builder.py index 9187a19b5f..65ca1458d0 100644 --- c/python/pyarrow/tests/test_builder.py +++ i/python/pyarrow/tests/test_builder.py @@ -19,7 +19,8 @@ import math import weakref import pyarrow as pa -from pyarrow.lib import StringBuilder, StringViewBuilder +from pyarrow.lib import StringBuilder, StringViewBuilder \ + # type: ignore[unresolved_import] def test_weakref(): diff --git c/python/pyarrow/tests/test_compute.py i/python/pyarrow/tests/test_compute.py index 003fb5db41..4ab0e63213 100644 --- c/python/pyarrow/tests/test_compute.py +++ i/python/pyarrow/tests/test_compute.py @@ -40,7 +40,7 @@ except ImportError: import pyarrow as pa import pyarrow.compute as pc -from pyarrow.lib import ArrowNotImplementedError +from pyarrow.lib import ArrowNotImplementedError # type: ignore[unresolved_import] try: import pyarrow.substrait as pas diff --git c/python/pyarrow/tests/test_cpp_internals.py i/python/pyarrow/tests/test_cpp_internals.py index 7508d8f0b9..359ef62b1f 100644 --- c/python/pyarrow/tests/test_cpp_internals.py +++ i/python/pyarrow/tests/test_cpp_internals.py @@ -20,7 +20,7 @@ from os.path import join as pjoin import pytest -from pyarrow._pyarrow_cpp_tests import get_cpp_tests +from pyarrow._pyarrow_cpp_tests import get_cpp_tests # type: ignore[unresolved_import] def inject_cpp_tests(ns): diff --git c/python/pyarrow/tests/test_cuda_numba_interop.py i/python/pyarrow/tests/test_cuda_numba_interop.py index 3bd81d755f..f211f0046f 100644 --- c/python/pyarrow/tests/test_cuda_numba_interop.py +++ i/python/pyarrow/tests/test_cuda_numba_interop.py @@ -26,7 +26,8 @@ dtypes = ['uint8', 'int16', 'float32'] cuda = pytest.importorskip("pyarrow.cuda") nb_cuda = pytest.importorskip("numba.cuda") -from numba.cuda.cudadrv.devicearray import DeviceNDArray # noqa: E402 +from numba.cuda.cudadrv.devicearray import DeviceNDArray \ + # type: ignore[unresolved_import] # noqa: E402 context_choices = None diff --git c/python/pyarrow/tests/test_dataset.py i/python/pyarrow/tests/test_dataset.py index d1cd3f6b8a..d9a4d3df20 100644 --- c/python/pyarrow/tests/test_dataset.py +++ i/python/pyarrow/tests/test_dataset.py @@ -41,7 +41,7 @@ import pyarrow.csv import pyarrow.feather import pyarrow.fs as fs import pyarrow.json -from pyarrow.lib import is_threading_enabled +from pyarrow.lib import is_threading_enabled # type: ignore[unresolved_import] from pyarrow.tests.util import (FSProtocolClass, ProxyHandler, _configure_s3_limited_user, _filesystem_uri, change_cwd) diff --git c/python/pyarrow/tests/test_flight.py i/python/pyarrow/tests/test_flight.py index 600c649278..0c0bc7089b 100644 --- c/python/pyarrow/tests/test_flight.py +++ i/python/pyarrow/tests/test_flight.py @@ -37,7 +37,7 @@ except ImportError: import pytest import pyarrow as pa -from pyarrow.lib import IpcReadOptions, tobytes +from pyarrow.lib import IpcReadOptions, tobytes 
# type: ignore[unresolved_import] from pyarrow.util import find_free_port from pyarrow.tests import util diff --git c/python/pyarrow/tests/test_fs.py i/python/pyarrow/tests/test_fs.py index a5a10fa55c..61dcb76b24 100644 --- c/python/pyarrow/tests/test_fs.py +++ i/python/pyarrow/tests/test_fs.py @@ -2168,7 +2168,7 @@ def test_fsspec_filesystem_from_uri(): def test_huggingface_filesystem_from_uri(): pytest.importorskip("fsspec") try: - from huggingface_hub import HfFileSystem + from huggingface_hub import HfFileSystem # type: ignore[unresolved_import] except ImportError: pytest.skip("huggingface_hub not installed") diff --git c/python/pyarrow/tests/test_gandiva.py i/python/pyarrow/tests/test_gandiva.py index 80d119a485..01a6d2151a 100644 --- c/python/pyarrow/tests/test_gandiva.py +++ i/python/pyarrow/tests/test_gandiva.py @@ -23,7 +23,7 @@ import pyarrow as pa @pytest.mark.gandiva def test_tree_exp_builder(): - import pyarrow.gandiva as gandiva + import pyarrow.gandiva as gandiva # type: ignore[unresolved_import] builder = gandiva.TreeExprBuilder() @@ -65,7 +65,7 @@ def test_tree_exp_builder(): @pytest.mark.gandiva def test_table(): - import pyarrow.gandiva as gandiva + import pyarrow.gandiva as gandiva # type: ignore[unresolved_import] table = pa.Table.from_arrays([pa.array([1.0, 2.0]), pa.array([3.0, 4.0])], ['a', 'b']) @@ -92,7 +92,7 @@ def test_table(): @pytest.mark.gandiva def test_filter(): - import pyarrow.gandiva as gandiva + import pyarrow.gandiva as gandiva # type: ignore[unresolved_import] table = pa.Table.from_arrays([pa.array([1.0 * i for i in range(10000)])], ['a']) @@ -116,7 +116,7 @@ def test_filter(): @pytest.mark.gandiva def test_in_expr(): - import pyarrow.gandiva as gandiva + import pyarrow.gandiva as gandiva # type: ignore[unresolved_import] arr = pa.array(["ga", "an", "nd", "di", "iv", "va"]) table = pa.Table.from_arrays([arr], ["a"]) @@ -154,7 +154,7 @@ def test_in_expr(): @pytest.mark.skip(reason="Gandiva C++ did not have *real* binary, " "time and date support.") def test_in_expr_todo(): - import pyarrow.gandiva as gandiva + import pyarrow.gandiva as gandiva # type: ignore[unresolved_import] # TODO: Implement reasonable support for timestamp, time & date. 
# Current exceptions: # pyarrow.lib.ArrowException: ExpressionValidationError: @@ -227,7 +227,7 @@ def test_in_expr_todo(): @pytest.mark.gandiva def test_boolean(): - import pyarrow.gandiva as gandiva + import pyarrow.gandiva as gandiva # type: ignore[unresolved_import] table = pa.Table.from_arrays([ pa.array([1., 31., 46., 3., 57., 44., 22.]), @@ -254,7 +254,7 @@ def test_boolean(): @pytest.mark.gandiva def test_literals(): - import pyarrow.gandiva as gandiva + import pyarrow.gandiva as gandiva # type: ignore[unresolved_import] builder = gandiva.TreeExprBuilder() @@ -294,7 +294,7 @@ def test_literals(): @pytest.mark.gandiva def test_regex(): - import pyarrow.gandiva as gandiva + import pyarrow.gandiva as gandiva # type: ignore[unresolved_import] elements = ["park", "sparkle", "bright spark and fire", "spark"] data = pa.array(elements, type=pa.string()) @@ -318,7 +318,7 @@ def test_regex(): @pytest.mark.gandiva def test_get_registered_function_signatures(): - import pyarrow.gandiva as gandiva + import pyarrow.gandiva as gandiva # type: ignore[unresolved_import] signatures = gandiva.get_registered_function_signatures() assert type(signatures[0].return_type()) is pa.DataType @@ -328,7 +328,7 @@ def test_get_registered_function_signatures(): @pytest.mark.gandiva def test_filter_project(): - import pyarrow.gandiva as gandiva + import pyarrow.gandiva as gandiva # type: ignore[unresolved_import] mpool = pa.default_memory_pool() # Create a table with some sample data array0 = pa.array([10, 12, -20, 5, 21, 29], pa.int32()) @@ -375,7 +375,7 @@ def test_filter_project(): @pytest.mark.gandiva def test_to_string(): - import pyarrow.gandiva as gandiva + import pyarrow.gandiva as gandiva # type: ignore[unresolved_import] builder = gandiva.TreeExprBuilder() assert str(builder.make_literal(2.0, pa.float64()) @@ -395,7 +395,7 @@ def test_to_string(): @pytest.mark.gandiva def test_rejects_none(): - import pyarrow.gandiva as gandiva + import pyarrow.gandiva as gandiva # type: ignore[unresolved_import] builder = gandiva.TreeExprBuilder() diff --git c/python/pyarrow/tests/test_jvm.py i/python/pyarrow/tests/test_jvm.py index d71380b866..b048fcea9e 100644 --- c/python/pyarrow/tests/test_jvm.py +++ i/python/pyarrow/tests/test_jvm.py @@ -76,8 +76,8 @@ def test_jvm_buffer(root_allocator): def test_jvm_buffer_released(root_allocator): - import jpype.imports # noqa - from java.lang import IllegalArgumentException + import jpype.imports # type: ignore[unresolved_import] # noqa + from java.lang import IllegalArgumentException # type: ignore[unresolved_import] jvm_buffer = root_allocator.buffer(8) jvm_buffer.release() diff --git c/python/pyarrow/tests/test_misc.py i/python/pyarrow/tests/test_misc.py index 64f45d8bed..09ac52588e 100644 --- c/python/pyarrow/tests/test_misc.py +++ i/python/pyarrow/tests/test_misc.py @@ -22,7 +22,7 @@ import sys import pytest import pyarrow as pa -from pyarrow.lib import ArrowInvalid +from pyarrow.lib import ArrowInvalid # type: ignore[unresolved_import] def test_get_include(): diff --git c/python/pyarrow/tests/test_sparse_tensor.py i/python/pyarrow/tests/test_sparse_tensor.py index 27974b80f8..e4d141e2a6 100644 --- c/python/pyarrow/tests/test_sparse_tensor.py +++ i/python/pyarrow/tests/test_sparse_tensor.py @@ -32,7 +32,7 @@ except ImportError: scipy = None # type: ignore[assignment] try: - import sparse + import sparse # type: ignore[unresolved_import] except ImportError: sparse = None diff --git c/python/pyarrow/tests/test_substrait.py i/python/pyarrow/tests/test_substrait.py index 
fcd1c8d48c..fae89d3cee 100644 --- c/python/pyarrow/tests/test_substrait.py +++ i/python/pyarrow/tests/test_substrait.py @@ -22,8 +22,9 @@ import pytest import pyarrow as pa import pyarrow.compute as pc -from pyarrow.lib import tobytes -from pyarrow.lib import ArrowInvalid, ArrowNotImplementedError +from pyarrow.lib import tobytes # type: ignore[unresolved_import] +from pyarrow.lib import ArrowInvalid, ArrowNotImplementedError \ + # type: ignore[unresolved_import] try: import pyarrow.substrait as substrait @@ -36,7 +37,7 @@ pytestmark = pytest.mark.substrait def mock_udf_context(batch_length=10): - from pyarrow._compute import _get_udf_context + from pyarrow._compute import _get_udf_context # type: ignore[unresolved_import] return _get_udf_context(pa.default_memory_pool(), batch_length) diff --git c/python/pyarrow/tests/test_udf.py i/python/pyarrow/tests/test_udf.py index dbc3086797..891295a551 100644 --- c/python/pyarrow/tests/test_udf.py +++ i/python/pyarrow/tests/test_udf.py @@ -39,7 +39,7 @@ except ImportError: def mock_udf_context(batch_length=10): - from pyarrow._compute import _get_udf_context + from pyarrow._compute import _get_udf_context # type: ignore[unresolved_import] return _get_udf_context(pa.default_memory_pool(), batch_length) diff --git c/python/pyarrow/types.py i/python/pyarrow/types.py index ab4e5d1b99..ee2b7e1440 100644 --- c/python/pyarrow/types.py +++ i/python/pyarrow/types.py @@ -20,11 +20,11 @@ from enum import IntEnum -from pyarrow.lib import (is_boolean_value, # noqa +from pyarrow.lib import (is_boolean_value, # type: ignore[unresolved_import] # noqa is_integer_value, is_float_value) -import pyarrow.lib as lib +import pyarrow.lib as lib # type: ignore[unresolved_import] from pyarrow.util import doc diff --git c/python/stubs/__init__.pyi i/python/stubs/__init__.pyi index 8a0d1e870c..0a1c49067c 100644 --- c/python/stubs/__init__.pyi +++ i/python/stubs/__init__.pyi @@ -1,11 +1,11 @@ # ruff: noqa: F401, I001, E402 __version__: str -import pyarrow.lib as _lib +import pyarrow.lib as _lib # type: ignore[unresolved_import] _gc_enabled: bool -from pyarrow.lib import ( +from pyarrow.lib import ( # type: ignore[unresolved_import] BuildInfo, RuntimeInfo, set_timezone_db_path, @@ -27,7 +27,7 @@ def show_info() -> None: ... def _module_is_available(module: str) -> bool: ... def _filesystem_is_available(fs: str) -> bool: ... 
-from pyarrow.lib import ( +from pyarrow.lib import ( # type: ignore[unresolved_import] null, bool_, int8, @@ -233,9 +233,9 @@ from pyarrow.lib import ( ) # Buffers, allocation -from pyarrow.lib import DeviceAllocationType, Device, MemoryManager, default_cpu_memory_manager +from pyarrow.lib import DeviceAllocationType, Device, MemoryManager, default_cpu_memory_manager # type: ignore[unresolved_import] -from pyarrow.lib import ( +from pyarrow.lib import ( # type: ignore[unresolved_import] Buffer, ResizableBuffer, foreign_buffer, @@ -246,7 +246,7 @@ from pyarrow.lib import ( allocate_buffer, ) -from pyarrow.lib import ( +from pyarrow.lib import ( # type: ignore[unresolved_import] MemoryPool, LoggingMemoryPool, ProxyMemoryPool, @@ -264,7 +264,7 @@ from pyarrow.lib import ( ) # I/O -from pyarrow.lib import ( +from pyarrow.lib import ( # type: ignore[unresolved_import] NativeFile, PythonFile, BufferedInputStream, @@ -287,7 +287,7 @@ from pyarrow.lib import ( have_libhdfs, ) -from pyarrow.lib import ( +from pyarrow.lib import ( # type: ignore[unresolved_import] ChunkedArray, RecordBatch, Table, @@ -299,7 +299,7 @@ from pyarrow.lib import ( ) # Exceptions -from pyarrow.lib import ( +from pyarrow.lib import ( # type: ignore[unresolved_import] ArrowCancelled, ArrowCapacityError, ArrowException, diff --git c/python/stubs/__lib_pxi/array.pyi i/python/stubs/__lib_pxi/array.pyi index ffdb8a9c07..37b397f6bb 100644 --- c/python/stubs/__lib_pxi/array.pyi +++ i/python/stubs/__lib_pxi/array.pyi @@ -23,8 +23,8 @@ import numpy as np import pandas as pd from pandas.core.dtypes.base import ExtensionDtype -from pyarrow._compute import CastOptions -from pyarrow._stubs_typing import ( +from pyarrow._compute import CastOptions # type: ignore[unresolved_import] +from pyarrow._stubs_typing import ( # type: ignore[unresolved_import] ArrayLike, Indices, Mask, @@ -32,7 +32,7 @@ from pyarrow._stubs_typing import ( SupportArrowArray, SupportArrowDeviceArray, ) -from pyarrow.lib import ( +from pyarrow.lib import ( # type: ignore[unresolved_import] Buffer, Device, MemoryManager, diff --git c/python/stubs/__lib_pxi/builder.pyi i/python/stubs/__lib_pxi/builder.pyi index 4a0e9ca470..655d6436da 100644 --- c/python/stubs/__lib_pxi/builder.pyi +++ i/python/stubs/__lib_pxi/builder.pyi @@ -1,6 +1,6 @@ from typing import Iterable -from pyarrow.lib import MemoryPool, _Weakrefable +from pyarrow.lib import MemoryPool, _Weakrefable # type: ignore[unresolved_import] from .array import StringArray, StringViewArray diff --git c/python/stubs/__lib_pxi/device.pyi i/python/stubs/__lib_pxi/device.pyi index d1b9f39eed..edcabdd796 100644 --- c/python/stubs/__lib_pxi/device.pyi +++ i/python/stubs/__lib_pxi/device.pyi @@ -1,6 +1,6 @@ import enum -from pyarrow.lib import _Weakrefable +from pyarrow.lib import _Weakrefable # type: ignore[unresolved_import] class DeviceAllocationType(enum.Flag): CPU = enum.auto() diff --git c/python/stubs/__lib_pxi/io.pyi i/python/stubs/__lib_pxi/io.pyi index 37c8aefb06..488dbf163a 100644 --- c/python/stubs/__lib_pxi/io.pyi +++ i/python/stubs/__lib_pxi/io.pyi @@ -17,8 +17,8 @@ else: from typing import Any, Literal, SupportsIndex, overload -from pyarrow._stubs_typing import Compression, SupportPyBuffer -from pyarrow.lib import MemoryPool, _Weakrefable +from pyarrow._stubs_typing import Compression, SupportPyBuffer # type: ignore[unresolved_import] +from pyarrow.lib import MemoryPool, _Weakrefable # type: ignore[unresolved_import] from .device import Device, DeviceAllocationType, MemoryManager from .types import 
KeyValueMetadata diff --git c/python/stubs/__lib_pxi/ipc.pyi i/python/stubs/__lib_pxi/ipc.pyi index 3d72892061..13363e4447 100644 --- c/python/stubs/__lib_pxi/ipc.pyi +++ i/python/stubs/__lib_pxi/ipc.pyi @@ -11,8 +11,8 @@ from typing import Iterable, Iterator, Literal, Mapping, NamedTuple import pandas as pd -from pyarrow._stubs_typing import SupportArrowStream, SupportPyBuffer -from pyarrow.lib import MemoryPool, RecordBatch, Schema, Table, Tensor, _Weakrefable +from pyarrow._stubs_typing import SupportArrowStream, SupportPyBuffer # type: ignore[unresolved_import] +from pyarrow.lib import MemoryPool, RecordBatch, Schema, Table, Tensor, _Weakrefable # type: ignore[unresolved_import] from .io import Buffer, Codec, NativeFile from .types import DictionaryMemo, KeyValueMetadata diff --git c/python/stubs/__lib_pxi/memory.pyi i/python/stubs/__lib_pxi/memory.pyi index 57a3bb4f1b..c58bf20dd9 100644 --- c/python/stubs/__lib_pxi/memory.pyi +++ i/python/stubs/__lib_pxi/memory.pyi @@ -1,4 +1,4 @@ -from pyarrow.lib import _Weakrefable +from pyarrow.lib import _Weakrefable # type: ignore[unresolved_import] class MemoryPool(_Weakrefable): """ diff --git c/python/stubs/__lib_pxi/pandas_shim.pyi i/python/stubs/__lib_pxi/pandas_shim.pyi index 29a8485d06..c8cebf765a 100644 --- c/python/stubs/__lib_pxi/pandas_shim.pyi +++ i/python/stubs/__lib_pxi/pandas_shim.pyi @@ -1,5 +1,5 @@ from types import ModuleType -from typing import Any, Iterable, TypeGuard +from typing import Any, Iterable, TypeGuard # type: ignore[unresolved_import] import pandas diff --git c/python/stubs/__lib_pxi/scalar.pyi i/python/stubs/__lib_pxi/scalar.pyi index 81ab501206..cfd4ee6f34 100644 --- c/python/stubs/__lib_pxi/scalar.pyi +++ i/python/stubs/__lib_pxi/scalar.pyi @@ -16,8 +16,8 @@ from typing import Any, Generic, Iterator, Literal, Mapping, overload import numpy as np -from pyarrow._compute import CastOptions -from pyarrow.lib import Array, Buffer, MemoryPool, MonthDayNano, Tensor, _Weakrefable +from pyarrow._compute import CastOptions # type: ignore[unresolved_import] +from pyarrow.lib import Array, Buffer, MemoryPool, MonthDayNano, Tensor, _Weakrefable # type: ignore[unresolved_import] from typing_extensions import Protocol, TypeVar from . import types diff --git c/python/stubs/__lib_pxi/table.pyi i/python/stubs/__lib_pxi/table.pyi index fbcfb1ef74..1ce21b6ed2 100644 --- c/python/stubs/__lib_pxi/table.pyi +++ i/python/stubs/__lib_pxi/table.pyi @@ -31,7 +31,7 @@ import numpy as np import pandas as pd from numpy.typing import NDArray -from pyarrow._compute import ( +from pyarrow._compute import ( # type: ignore[unresolved_import] CastOptions, CountOptions, FunctionOptions, @@ -39,7 +39,7 @@ from pyarrow._compute import ( TDigestOptions, VarianceOptions, ) -from pyarrow._stubs_typing import ( +from pyarrow._stubs_typing import ( # type: ignore[unresolved_import] Indices, Mask, NullEncoding, @@ -49,12 +49,15 @@ from pyarrow._stubs_typing import ( SupportArrowDeviceArray, SupportArrowStream, ) -from pyarrow.compute import ArrayOrChunkedArray, Expression +from pyarrow.compute import ArrayOrChunkedArray, Expression # type: ignore[unresolved_import] from pyarrow.interchange.dataframe import _PyArrowDataFrame -from pyarrow.lib import Device, Field, MemoryManager, MemoryPool, MonthDayNano, Schema +from pyarrow.lib import Device, Field, MemoryManager, MemoryPool, MonthDayNano, Schema # type: ignore[unresolved_import] from . 
import array, scalar, types -from .array import Array, NullableCollection, StructArray, _CastAs, _PandasConvertible +from .array import ( + Array, StructArray, _CastAs, _PandasConvertible, + NullableCollection, # type: ignore[unresolved_import] +) from .device import DeviceAllocationType from .io import Buffer from .ipc import RecordBatchReader diff --git c/python/stubs/__lib_pxi/tensor.pyi i/python/stubs/__lib_pxi/tensor.pyi index d849abd0f1..a28804c6e3 100644 --- c/python/stubs/__lib_pxi/tensor.pyi +++ i/python/stubs/__lib_pxi/tensor.pyi @@ -7,9 +7,9 @@ else: import numpy as np -from pyarrow.lib import _Weakrefable +from pyarrow.lib import _Weakrefable # type: ignore[unresolved_import] from scipy.sparse import coo_matrix, csr_matrix -from sparse import COO +from sparse import COO # type: ignore[unresolved_import] class Tensor(_Weakrefable): """ diff --git c/python/stubs/__lib_pxi/types.pyi i/python/stubs/__lib_pxi/types.pyi index a7b6062b27..d38269ef34 100644 --- c/python/stubs/__lib_pxi/types.pyi +++ i/python/stubs/__lib_pxi/types.pyi @@ -14,8 +14,8 @@ from typing import Any, Generic, Iterable, Iterator, Literal, overload import numpy as np import pandas as pd -from pyarrow._stubs_typing import SupportArrowSchema -from pyarrow.lib import ( +from pyarrow._stubs_typing import SupportArrowSchema # type: ignore[unresolved_import] +from pyarrow.lib import ( # type: ignore[unresolved_import] Array, ChunkedArray, ExtensionArray, @@ -29,6 +29,7 @@ from .io import Buffer from .scalar import ExtensionScalar _AsPyType = TypeVar("_AsPyType") +_DataTypeT = TypeVar("_DataTypeT", bound=DataType) class _Weakrefable: ... class _Metadata(_Weakrefable): ... diff --git c/python/stubs/_compute.pyi i/python/stubs/_compute.pyi index 3d61ae4278..071fceb392 100644 --- c/python/stubs/_compute.pyi +++ i/python/stubs/_compute.pyi @@ -1,12 +1,6 @@ from typing import ( - Any, - Callable, - Iterable, - Literal, - Sequence, - TypeAlias, - TypedDict, - overload, + Any, Callable, Iterable, Literal, Sequence, TypedDict, overload, + TypeAlias, # type: ignore[unresolved_import] ) from . 
import lib diff --git c/python/stubs/_fs.pyi i/python/stubs/_fs.pyi index 9f6e28dcf0..d3b194e3de 100644 --- c/python/stubs/_fs.pyi +++ i/python/stubs/_fs.pyi @@ -19,6 +19,8 @@ from fsspec import AbstractFileSystem # type: ignore[import-untyped] from .lib import NativeFile, _Weakrefable +SupportedFileSystem: TypeAlias = Union[AbstractFileSystem, FileSystem] + class FileType(enum.IntFlag): NotFound = enum.auto() Unknown = enum.auto() diff --git c/python/stubs/_parquet.pyi i/python/stubs/_parquet.pyi index a9187df042..053f2d0826 100644 --- c/python/stubs/_parquet.pyi +++ i/python/stubs/_parquet.pyi @@ -1,4 +1,7 @@ -from typing import IO, Any, Iterable, Iterator, Literal, Sequence, TypeAlias, TypedDict +from typing import ( + IO, Any, Iterable, Iterator, Literal, Sequence, TypedDict, + TypeAlias, # type: ignore[unresolved_import] +) from _typeshed import StrPath diff --git c/python/stubs/_s3fs.pyi i/python/stubs/_s3fs.pyi index fc13c498bd..8e67c80561 100644 --- c/python/stubs/_s3fs.pyi +++ i/python/stubs/_s3fs.pyi @@ -1,6 +1,9 @@ import enum -from typing import Literal, NotRequired, Required, TypedDict +from typing import ( + Literal, TypedDict, + NotRequired, Required, # type: ignore[unresolved_import] +) from ._fs import FileSystem from .lib import KeyValueMetadata diff --git c/python/stubs/_stubs_typing.pyi i/python/stubs/_stubs_typing.pyi index 40d931d24e..73bb9f38a9 100644 --- c/python/stubs/_stubs_typing.pyi +++ i/python/stubs/_stubs_typing.pyi @@ -2,7 +2,10 @@ import datetime as dt from collections.abc import Sequence from decimal import Decimal -from typing import Any, Collection, Literal, Protocol, TypeAlias, TypeVar, Union +from typing import ( + Any, Collection, Literal, Protocol, TypeVar, Union, + TypeAlias # type: ignore[unresolved_import] +) import numpy as np diff --git c/python/stubs/benchmark.pyi i/python/stubs/benchmark.pyi index 048973301d..972fad10a5 100644 --- c/python/stubs/benchmark.pyi +++ i/python/stubs/benchmark.pyi @@ -1,3 +1,3 @@ -from pyarrow.lib import benchmark_PandasObjectIsNull +from pyarrow.lib import benchmark_PandasObjectIsNull # type: ignore[unresolved_import] __all__ = ["benchmark_PandasObjectIsNull"] diff --git c/python/stubs/compute.pyi i/python/stubs/compute.pyi index 1cf52ff07c..775b7fa504 100644 --- c/python/stubs/compute.pyi +++ i/python/stubs/compute.pyi @@ -1,94 +1,100 @@ # ruff: noqa: I001 -from typing import Literal, TypeAlias, TypeVar, overload, Any, Iterable, ParamSpec, Sequence +from typing import Literal, TypeAlias, TypeVar, overload, Any, Iterable, ParamSpec, Sequence # type: ignore[unresolved_import] from collections.abc import Callable # Option classes -from pyarrow._compute import ArraySortOptions as ArraySortOptions -from pyarrow._compute import AssumeTimezoneOptions as AssumeTimezoneOptions -from pyarrow._compute import CastOptions as CastOptions -from pyarrow._compute import CountOptions as CountOptions -from pyarrow._compute import CumulativeOptions as CumulativeOptions -from pyarrow._compute import CumulativeSumOptions as CumulativeSumOptions -from pyarrow._compute import DayOfWeekOptions as DayOfWeekOptions -from pyarrow._compute import DictionaryEncodeOptions as DictionaryEncodeOptions -from pyarrow._compute import ElementWiseAggregateOptions as ElementWiseAggregateOptions +from pyarrow._compute import ( # type: ignore[unresolved_import] + ArraySortOptions as ArraySortOptions, + AssumeTimezoneOptions as AssumeTimezoneOptions, + CastOptions as CastOptions, + CountOptions as CountOptions, + CumulativeOptions as CumulativeOptions, + 
CumulativeSumOptions as CumulativeSumOptions, + DayOfWeekOptions as DayOfWeekOptions, + DictionaryEncodeOptions as DictionaryEncodeOptions, + ElementWiseAggregateOptions as ElementWiseAggregateOptions, +) # Expressions -from pyarrow._compute import Expression as Expression -from pyarrow._compute import ExtractRegexOptions as ExtractRegexOptions -from pyarrow._compute import ExtractRegexSpanOptions as ExtractRegexSpanOptions -from pyarrow._compute import FilterOptions as FilterOptions -from pyarrow._compute import Function as Function -from pyarrow._compute import FunctionOptions as FunctionOptions -from pyarrow._compute import FunctionRegistry as FunctionRegistry -from pyarrow._compute import HashAggregateFunction as HashAggregateFunction -from pyarrow._compute import HashAggregateKernel as HashAggregateKernel -from pyarrow._compute import IndexOptions as IndexOptions -from pyarrow._compute import JoinOptions as JoinOptions -from pyarrow._compute import Kernel as Kernel -from pyarrow._compute import ListFlattenOptions as ListFlattenOptions -from pyarrow._compute import ListSliceOptions as ListSliceOptions -from pyarrow._compute import MakeStructOptions as MakeStructOptions -from pyarrow._compute import MapLookupOptions as MapLookupOptions -from pyarrow._compute import MatchSubstringOptions as MatchSubstringOptions -from pyarrow._compute import ModeOptions as ModeOptions -from pyarrow._compute import NullOptions as NullOptions -from pyarrow._compute import PadOptions as PadOptions -from pyarrow._compute import PairwiseOptions as PairwiseOptions -from pyarrow._compute import PartitionNthOptions as PartitionNthOptions -from pyarrow._compute import PivotWiderOptions as PivotWiderOptions -from pyarrow._compute import QuantileOptions as QuantileOptions -from pyarrow._compute import RandomOptions as RandomOptions -from pyarrow._compute import RankOptions as RankOptions -from pyarrow._compute import RankQuantileOptions as RankQuantileOptions -from pyarrow._compute import ReplaceSliceOptions as ReplaceSliceOptions -from pyarrow._compute import ReplaceSubstringOptions as ReplaceSubstringOptions -from pyarrow._compute import RoundBinaryOptions as RoundBinaryOptions -from pyarrow._compute import RoundOptions as RoundOptions -from pyarrow._compute import RoundTemporalOptions as RoundTemporalOptions -from pyarrow._compute import RoundToMultipleOptions as RoundToMultipleOptions -from pyarrow._compute import RunEndEncodeOptions as RunEndEncodeOptions -from pyarrow._compute import ScalarAggregateFunction as ScalarAggregateFunction -from pyarrow._compute import ScalarAggregateKernel as ScalarAggregateKernel -from pyarrow._compute import ScalarAggregateOptions as ScalarAggregateOptions -from pyarrow._compute import ScalarFunction as ScalarFunction -from pyarrow._compute import ScalarKernel as ScalarKernel -from pyarrow._compute import SelectKOptions as SelectKOptions -from pyarrow._compute import SetLookupOptions as SetLookupOptions -from pyarrow._compute import SkewOptions as SkewOptions -from pyarrow._compute import SliceOptions as SliceOptions -from pyarrow._compute import SortOptions as SortOptions -from pyarrow._compute import SplitOptions as SplitOptions -from pyarrow._compute import SplitPatternOptions as SplitPatternOptions -from pyarrow._compute import StrftimeOptions as StrftimeOptions -from pyarrow._compute import StrptimeOptions as StrptimeOptions -from pyarrow._compute import StructFieldOptions as StructFieldOptions -from pyarrow._compute import TakeOptions as TakeOptions -from pyarrow._compute 
import TDigestOptions as TDigestOptions -from pyarrow._compute import TrimOptions as TrimOptions -from pyarrow._compute import UdfContext as UdfContext -from pyarrow._compute import Utf8NormalizeOptions as Utf8NormalizeOptions -from pyarrow._compute import VarianceOptions as VarianceOptions -from pyarrow._compute import VectorFunction as VectorFunction -from pyarrow._compute import VectorKernel as VectorKernel -from pyarrow._compute import WeekOptions as WeekOptions -from pyarrow._compute import WinsorizeOptions as WinsorizeOptions +from pyarrow._compute import ( # type: ignore[unresolved_import] + Expression as Expression, + ExtractRegexOptions as ExtractRegexOptions, + ExtractRegexSpanOptions as ExtractRegexSpanOptions, + FilterOptions as FilterOptions, + Function as Function, + FunctionOptions as FunctionOptions, + FunctionRegistry as FunctionRegistry, + HashAggregateFunction as HashAggregateFunction, + HashAggregateKernel as HashAggregateKernel, + IndexOptions as IndexOptions, + JoinOptions as JoinOptions, + Kernel as Kernel, + ListFlattenOptions as ListFlattenOptions, + ListSliceOptions as ListSliceOptions, + MakeStructOptions as MakeStructOptions, + MapLookupOptions as MapLookupOptions, + MatchSubstringOptions as MatchSubstringOptions, + ModeOptions as ModeOptions, + NullOptions as NullOptions, + PadOptions as PadOptions, + PairwiseOptions as PairwiseOptions, + PartitionNthOptions as PartitionNthOptions, + PivotWiderOptions as PivotWiderOptions, + QuantileOptions as QuantileOptions, + RandomOptions as RandomOptions, + RankOptions as RankOptions, + RankQuantileOptions as RankQuantileOptions, + ReplaceSliceOptions as ReplaceSliceOptions, + ReplaceSubstringOptions as ReplaceSubstringOptions, + RoundBinaryOptions as RoundBinaryOptions, + RoundOptions as RoundOptions, + RoundTemporalOptions as RoundTemporalOptions, + RoundToMultipleOptions as RoundToMultipleOptions, + RunEndEncodeOptions as RunEndEncodeOptions, + ScalarAggregateFunction as ScalarAggregateFunction, + ScalarAggregateKernel as ScalarAggregateKernel, + ScalarAggregateOptions as ScalarAggregateOptions, + ScalarFunction as ScalarFunction, + ScalarKernel as ScalarKernel, + SelectKOptions as SelectKOptions, + SetLookupOptions as SetLookupOptions, + SkewOptions as SkewOptions, + SliceOptions as SliceOptions, + SortOptions as SortOptions, + SplitOptions as SplitOptions, + SplitPatternOptions as SplitPatternOptions, + StrftimeOptions as StrftimeOptions, + StrptimeOptions as StrptimeOptions, + StructFieldOptions as StructFieldOptions, + TakeOptions as TakeOptions, + TDigestOptions as TDigestOptions, + TrimOptions as TrimOptions, + UdfContext as UdfContext, + Utf8NormalizeOptions as Utf8NormalizeOptions, + VarianceOptions as VarianceOptions, + VectorFunction as VectorFunction, + VectorKernel as VectorKernel, + WeekOptions as WeekOptions, + WinsorizeOptions as WinsorizeOptions, +) # Functions -from pyarrow._compute import call_function as call_function +from pyarrow._compute import call_function as call_function # type: ignore[unresolved_import] # Udf -from pyarrow._compute import call_tabular_function as call_tabular_function -from pyarrow._compute import function_registry as function_registry -from pyarrow._compute import get_function as get_function -from pyarrow._compute import list_functions as list_functions -from pyarrow._compute import register_aggregate_function as register_aggregate_function -from pyarrow._compute import register_scalar_function as register_scalar_function -from pyarrow._compute import 
register_tabular_function as register_tabular_function -from pyarrow._compute import register_vector_function as register_vector_function +from pyarrow._compute import ( # type: ignore[unresolved_import] + call_tabular_function as call_tabular_function, + function_registry as function_registry, + get_function as get_function, + list_functions as list_functions, + register_aggregate_function as register_aggregate_function, + register_scalar_function as register_scalar_function, + register_tabular_function as register_tabular_function, + register_vector_function as register_vector_function, +) -from pyarrow._compute import _Order, _Placement -from pyarrow._stubs_typing import ArrayLike, ScalarLike +from pyarrow._compute import _Order, _Placement # type: ignore[unresolved_import] +from pyarrow._stubs_typing import ArrayLike, ScalarLike # type: ignore[unresolved_import] from . import lib _P = ParamSpec("_P") diff --git c/python/stubs/csv.pyi i/python/stubs/csv.pyi index 510229d7e7..cea5542d1c 100644 --- c/python/stubs/csv.pyi +++ i/python/stubs/csv.pyi @@ -1,4 +1,4 @@ -from pyarrow._csv import ( +from pyarrow._csv import ( # type: ignore[unresolved_import] ISO8601, ConvertOptions, CSVStreamingReader, diff --git c/python/stubs/cuda.pyi i/python/stubs/cuda.pyi index e11baf7d4e..3c69e746f7 100644 --- c/python/stubs/cuda.pyi +++ i/python/stubs/cuda.pyi @@ -1,4 +1,4 @@ -from pyarrow._cuda import ( +from pyarrow._cuda import ( # type: ignore[unresolved_import] BufferReader, BufferWriter, Context, diff --git c/python/stubs/dataset.pyi i/python/stubs/dataset.pyi index 98f1a38aa8..a57e9f2f3f 100644 --- c/python/stubs/dataset.pyi +++ i/python/stubs/dataset.pyi @@ -1,7 +1,7 @@ -from typing import Callable, Iterable, Literal, Sequence, TypeAlias, overload +from typing import Callable, Iterable, Literal, Sequence, TypeAlias, overload # type: ignore[unresolved_import] from _typeshed import StrPath -from pyarrow._dataset import ( +from pyarrow._dataset import ( # type: ignore[unresolved_import] CsvFileFormat, CsvFragmentScanOptions, Dataset, @@ -32,8 +32,8 @@ from pyarrow._dataset import ( WrittenFile, get_partition_keys, ) -from pyarrow._dataset_orc import OrcFileFormat -from pyarrow._dataset_parquet import ( +from pyarrow._dataset_orc import OrcFileFormat # type: ignore[unresolved_import] +from pyarrow._dataset_parquet import ( # type: ignore[unresolved_import] ParquetDatasetFactory, ParquetFactoryOptions, ParquetFileFormat, @@ -43,12 +43,12 @@ from pyarrow._dataset_parquet import ( ParquetReadOptions, RowGroupInfo, ) -from pyarrow._dataset_parquet_encryption import ( +from pyarrow._dataset_parquet_encryption import ( # type: ignore[unresolved_import] ParquetDecryptionConfig, ParquetEncryptionConfig, ) from pyarrow.compute import Expression, field, scalar -from pyarrow.lib import Array, RecordBatch, RecordBatchReader, Schema, Table +from pyarrow.lib import Array, RecordBatch, RecordBatchReader, Schema, Table # type: ignore[unresolved_import] from ._fs import SupportedFileSystem diff --git c/python/stubs/feather.pyi i/python/stubs/feather.pyi index 9451ee1576..63766cd5d6 100644 --- c/python/stubs/feather.pyi +++ i/python/stubs/feather.pyi @@ -3,8 +3,8 @@ from typing import IO, Literal import pandas as pd from _typeshed import StrPath -from pyarrow._feather import FeatherError -from pyarrow.lib import Table +from pyarrow._feather import FeatherError # type: ignore[unresolved_import] +from pyarrow.lib import Table # type: ignore[unresolved_import] __all__ = [ "FeatherError", diff --git 
c/python/stubs/flight.pyi i/python/stubs/flight.pyi index 9b806ccf30..aa06f3ebec 100644 --- c/python/stubs/flight.pyi +++ i/python/stubs/flight.pyi @@ -1,4 +1,4 @@ -from pyarrow._flight import ( +from pyarrow._flight import ( # type: ignore[unresolved_import] Action, ActionType, BasicAuth, diff --git c/python/stubs/fs.pyi i/python/stubs/fs.pyi index 6bf75616c1..07a1d7765e 100644 --- c/python/stubs/fs.pyi +++ i/python/stubs/fs.pyi @@ -1,4 +1,4 @@ -from pyarrow._fs import ( # noqa +from pyarrow._fs import ( # type: ignore[unresolved_import] # noqa FileSelector, FileType, FileInfo, @@ -10,10 +10,10 @@ from pyarrow._fs import ( # noqa PyFileSystem, SupportedFileSystem, ) -from pyarrow._azurefs import AzureFileSystem -from pyarrow._hdfs import HadoopFileSystem -from pyarrow._gcsfs import GcsFileSystem -from pyarrow._s3fs import ( # noqa +from pyarrow._azurefs import AzureFileSystem # type: ignore[unresolved_import] +from pyarrow._hdfs import HadoopFileSystem # type: ignore[unresolved_import] +from pyarrow._gcsfs import GcsFileSystem # type: ignore[unresolved_import] +from pyarrow._s3fs import ( # type: ignore[unresolved_import] # noqa AwsDefaultS3RetryStrategy, AwsStandardS3RetryStrategy, S3FileSystem, diff --git c/python/stubs/interchange/buffer.pyi i/python/stubs/interchange/buffer.pyi index 46673961a7..afef5acf35 100644 --- c/python/stubs/interchange/buffer.pyi +++ i/python/stubs/interchange/buffer.pyi @@ -1,6 +1,6 @@ import enum -from pyarrow.lib import Buffer +from pyarrow.lib import Buffer # type: ignore[unresolved_import] class DlpackDeviceType(enum.IntEnum): """Integer enum for device type codes matching DLPack.""" diff --git c/python/stubs/interchange/column.pyi i/python/stubs/interchange/column.pyi index e6662867b6..7d89c4ae6b 100644 --- c/python/stubs/interchange/column.pyi +++ i/python/stubs/interchange/column.pyi @@ -1,8 +1,8 @@ import enum -from typing import Any, Iterable, TypeAlias, TypedDict +from typing import Any, Iterable, TypeAlias, TypedDict # type: ignore[unresolved_import] -from pyarrow.lib import Array, ChunkedArray +from pyarrow.lib import Array, ChunkedArray # type: ignore[unresolved_import] from .buffer import _PyArrowBuffer diff --git c/python/stubs/interchange/dataframe.pyi i/python/stubs/interchange/dataframe.pyi index 526a58926a..7a17dfeb1e 100644 --- c/python/stubs/interchange/dataframe.pyi +++ i/python/stubs/interchange/dataframe.pyi @@ -7,7 +7,7 @@ else: from typing import Any, Iterable, Sequence from pyarrow.interchange.column import _PyArrowColumn -from pyarrow.lib import RecordBatch, Table +from pyarrow.lib import RecordBatch, Table # type: ignore[unresolved_import] class _PyArrowDataFrame: """ diff --git c/python/stubs/interchange/from_dataframe.pyi i/python/stubs/interchange/from_dataframe.pyi index b04b626897..ad461270f5 100644 --- c/python/stubs/interchange/from_dataframe.pyi +++ i/python/stubs/interchange/from_dataframe.pyi @@ -1,6 +1,6 @@ -from typing import Any, Protocol, TypeAlias +from typing import Any, Protocol, TypeAlias # type: ignore[unresolved_import] -from pyarrow.lib import Array, Buffer, DataType, DictionaryArray, RecordBatch, Table +from pyarrow.lib import Array, Buffer, DataType, DictionaryArray, RecordBatch, Table # type: ignore[unresolved_import] from .column import ( ColumnBuffers, diff --git c/python/stubs/ipc.pyi i/python/stubs/ipc.pyi index c7f2af004d..2a5e8294e4 100644 --- c/python/stubs/ipc.pyi +++ i/python/stubs/ipc.pyi @@ -1,9 +1,9 @@ from io import IOBase import pandas as pd -import pyarrow.lib as lib +import pyarrow.lib as 
lib # type: ignore[unresolved_import] -from pyarrow.lib import ( +from pyarrow.lib import ( # type: ignore[unresolved_import] IpcReadOptions, IpcWriteOptions, Message, diff --git c/python/stubs/json.pyi i/python/stubs/json.pyi index db1d35e0b8..97b94d5dd7 100644 --- c/python/stubs/json.pyi +++ i/python/stubs/json.pyi @@ -1,3 +1,3 @@ -from pyarrow._json import ParseOptions, ReadOptions, open_json, read_json +from pyarrow._json import ParseOptions, ReadOptions, open_json, read_json # type: ignore[unresolved_import] __all__ = ["ParseOptions", "ReadOptions", "read_json", "open_json"] diff --git c/python/stubs/parquet/core.pyi i/python/stubs/parquet/core.pyi index 56b2c8447d..01dce442fe 100644 --- c/python/stubs/parquet/core.pyi +++ i/python/stubs/parquet/core.pyi @@ -13,10 +13,10 @@ if sys.version_info >= (3, 10): else: from typing_extensions import TypeAlias -from pyarrow import _parquet -from pyarrow._compute import Expression -from pyarrow._fs import FileSystem, SupportedFileSystem -from pyarrow._parquet import ( +from pyarrow import _parquet # type: ignore[unresolved_import] +from pyarrow._compute import Expression # type: ignore[unresolved_import] +from pyarrow._fs import FileSystem, SupportedFileSystem # type: ignore[unresolved_import] +from pyarrow._parquet import ( # type: ignore[unresolved_import] ColumnChunkMetaData, ColumnSchema, FileDecryptionProperties, @@ -29,9 +29,9 @@ from pyarrow._parquet import ( SortingColumn, Statistics, ) -from pyarrow._stubs_typing import FilterTuple, SingleOrList +from pyarrow._stubs_typing import FilterTuple, SingleOrList # type: ignore[unresolved_import] from pyarrow.dataset import ParquetFileFragment, Partitioning -from pyarrow.lib import NativeFile, RecordBatch, Schema, Table +from pyarrow.lib import NativeFile, RecordBatch, Schema, Table # type: ignore[unresolved_import] from typing_extensions import deprecated __all__ = ( diff --git c/python/stubs/parquet/encryption.pyi i/python/stubs/parquet/encryption.pyi index 5a77dae7ef..daade78e6d 100644 --- c/python/stubs/parquet/encryption.pyi +++ i/python/stubs/parquet/encryption.pyi @@ -1,4 +1,4 @@ -from pyarrow._parquet_encryption import ( +from pyarrow._parquet_encryption import ( # type: ignore[unresolved_import] CryptoFactory, DecryptionConfiguration, EncryptionConfiguration, diff --git c/python/stubs/substrait.pyi i/python/stubs/substrait.pyi index a56a8a5b40..004439d4c1 100644 --- c/python/stubs/substrait.pyi +++ i/python/stubs/substrait.pyi @@ -1,4 +1,4 @@ -from pyarrow._substrait import ( +from pyarrow._substrait import ( # type: ignore[unresolved_import] BoundExpressions, SubstraitSchema, deserialize_expressions, diff --git c/python/stubs/types.pyi i/python/stubs/types.pyi index 0cb4f6171d..c128770d17 100644 --- c/python/stubs/types.pyi +++ i/python/stubs/types.pyi @@ -11,7 +11,7 @@ if sys.version_info >= (3, 10): else: from typing_extensions import TypeAlias -from pyarrow.lib import ( +from pyarrow.lib import ( # type: ignore[unresolved_import] BinaryType, BinaryViewType, BoolType, --- python/pyarrow/__init__.py | 18 +- python/pyarrow/acero.py | 6 +- python/pyarrow/benchmark.py | 2 +- python/pyarrow/compute.py | 2 +- python/pyarrow/conftest.py | 6 +- python/pyarrow/csv.py | 2 +- python/pyarrow/cuda.py | 2 +- python/pyarrow/dataset.py | 8 +- python/pyarrow/feather.py | 9 +- python/pyarrow/flight.py | 2 +- python/pyarrow/fs.py | 10 +- python/pyarrow/ipc.py | 4 +- python/pyarrow/json.py | 2 +- python/pyarrow/orc.py | 4 +- python/pyarrow/pandas_compat.py | 10 +- python/pyarrow/parquet/core.py | 4 +- 
python/pyarrow/parquet/encryption.py | 2 +- python/pyarrow/substrait.py | 2 +- python/pyarrow/tests/test_builder.py | 3 +- python/pyarrow/tests/test_compute.py | 2 +- python/pyarrow/tests/test_cpp_internals.py | 2 +- .../pyarrow/tests/test_cuda_numba_interop.py | 3 +- python/pyarrow/tests/test_dataset.py | 2 +- python/pyarrow/tests/test_flight.py | 2 +- python/pyarrow/tests/test_fs.py | 2 +- python/pyarrow/tests/test_gandiva.py | 24 +-- python/pyarrow/tests/test_jvm.py | 4 +- python/pyarrow/tests/test_misc.py | 2 +- python/pyarrow/tests/test_sparse_tensor.py | 2 +- python/pyarrow/tests/test_substrait.py | 7 +- python/pyarrow/tests/test_udf.py | 2 +- python/pyarrow/types.py | 4 +- python/stubs/__init__.pyi | 18 +- python/stubs/__lib_pxi/array.pyi | 6 +- python/stubs/__lib_pxi/builder.pyi | 2 +- python/stubs/__lib_pxi/device.pyi | 2 +- python/stubs/__lib_pxi/io.pyi | 4 +- python/stubs/__lib_pxi/ipc.pyi | 4 +- python/stubs/__lib_pxi/memory.pyi | 2 +- python/stubs/__lib_pxi/pandas_shim.pyi | 2 +- python/stubs/__lib_pxi/scalar.pyi | 4 +- python/stubs/__lib_pxi/table.pyi | 13 +- python/stubs/__lib_pxi/tensor.pyi | 4 +- python/stubs/__lib_pxi/types.pyi | 5 +- python/stubs/_compute.pyi | 10 +- python/stubs/_fs.pyi | 2 + python/stubs/_parquet.pyi | 5 +- python/stubs/_s3fs.pyi | 5 +- python/stubs/_stubs_typing.pyi | 5 +- python/stubs/benchmark.pyi | 2 +- python/stubs/compute.pyi | 168 +++++++++--------- python/stubs/csv.pyi | 2 +- python/stubs/cuda.pyi | 2 +- python/stubs/dataset.pyi | 12 +- python/stubs/feather.pyi | 4 +- python/stubs/flight.pyi | 2 +- python/stubs/fs.pyi | 10 +- python/stubs/interchange/buffer.pyi | 2 +- python/stubs/interchange/column.pyi | 4 +- python/stubs/interchange/dataframe.pyi | 2 +- python/stubs/interchange/from_dataframe.pyi | 4 +- python/stubs/ipc.pyi | 4 +- python/stubs/json.pyi | 2 +- python/stubs/parquet/core.pyi | 12 +- python/stubs/parquet/encryption.pyi | 2 +- python/stubs/substrait.pyi | 2 +- python/stubs/types.pyi | 2 +- 67 files changed, 254 insertions(+), 233 deletions(-) diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index da2fe966475..45aa2b619f8 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -58,8 +58,8 @@ def parse_git(root, **kwargs): except ImportError: __version__ = None -import pyarrow.lib as _lib -from pyarrow.lib import (BuildInfo, CppBuildInfo, RuntimeInfo, set_timezone_db_path, +import pyarrow.lib as _lib # type: ignore[unresolved_import] +from pyarrow.lib import (BuildInfo, CppBuildInfo, RuntimeInfo, set_timezone_db_path, # type: ignore[unresolved_import] MonthDayNano, VersionInfo, build_info, cpp_build_info, cpp_version, cpp_version_info, runtime_info, cpu_count, set_cpu_count, enable_signal_handlers, @@ -153,7 +153,7 @@ def print_entry(label, value): print(f" {codec: <20}: {status: <8}") -from pyarrow.lib import (null, bool_, +from pyarrow.lib import (null, bool_, # type: ignore[unresolved_import] int8, int16, int32, int64, uint8, uint16, uint32, uint64, time32, time64, timestamp, date32, date64, duration, @@ -237,13 +237,13 @@ def print_entry(label, value): FixedShapeTensorScalar, JsonScalar, OpaqueScalar, UuidScalar) # Buffers, allocation -from pyarrow.lib import (DeviceAllocationType, Device, MemoryManager, +from pyarrow.lib import (DeviceAllocationType, Device, MemoryManager, # type: ignore[unresolved_import] default_cpu_memory_manager) -from pyarrow.lib import (Buffer, ResizableBuffer, foreign_buffer, py_buffer, +from pyarrow.lib import (Buffer, ResizableBuffer, foreign_buffer, py_buffer, # 
type: ignore[unresolved_import] Codec, compress, decompress, allocate_buffer) -from pyarrow.lib import (MemoryPool, LoggingMemoryPool, ProxyMemoryPool, +from pyarrow.lib import (MemoryPool, LoggingMemoryPool, ProxyMemoryPool, # type: ignore[unresolved_import] total_allocated_bytes, set_memory_pool, default_memory_pool, system_memory_pool, jemalloc_memory_pool, mimalloc_memory_pool, @@ -252,7 +252,7 @@ def print_entry(label, value): supported_memory_backends) # I/O -from pyarrow.lib import (NativeFile, PythonFile, +from pyarrow.lib import (NativeFile, PythonFile, # type: ignore[unresolved_import] BufferedInputStream, BufferedOutputStream, CacheOptions, CompressedInputStream, CompressedOutputStream, TransformInputStream, transcoding_input_stream, @@ -263,12 +263,12 @@ def print_entry(label, value): input_stream, output_stream, have_libhdfs) -from pyarrow.lib import (ChunkedArray, RecordBatch, Table, table, +from pyarrow.lib import (ChunkedArray, RecordBatch, Table, table, # type: ignore[unresolved_import] concat_arrays, concat_tables, TableGroupBy, RecordBatchReader, concat_batches) # Exceptions -from pyarrow.lib import (ArrowCancelled, +from pyarrow.lib import (ArrowCancelled, # type: ignore[unresolved_import] ArrowCapacityError, ArrowException, ArrowKeyError, diff --git a/python/pyarrow/acero.py b/python/pyarrow/acero.py index e475e8db5c2..dcead124d31 100644 --- a/python/pyarrow/acero.py +++ b/python/pyarrow/acero.py @@ -22,11 +22,11 @@ # distutils: language = c++ # cython: language_level = 3 -from pyarrow.lib import Table, RecordBatch, array +from pyarrow.lib import Table, RecordBatch, array # type: ignore[unresolved_import] from pyarrow.compute import Expression, field try: - from pyarrow._acero import ( # noqa + from pyarrow._acero import ( # type: ignore[unresolved_import] # noqa Declaration, ExecNodeOptions, TableSourceNodeOptions, @@ -45,7 +45,7 @@ try: import pyarrow.dataset as ds - from pyarrow._dataset import ScanNodeOptions + from pyarrow._dataset import ScanNodeOptions # type: ignore[unresolved_import] except ImportError: class DatasetModuleStub: class Dataset: diff --git a/python/pyarrow/benchmark.py b/python/pyarrow/benchmark.py index 25ee1141f08..c0ea1b0ec89 100644 --- a/python/pyarrow/benchmark.py +++ b/python/pyarrow/benchmark.py @@ -18,4 +18,4 @@ # flake8: noqa -from pyarrow.lib import benchmark_PandasObjectIsNull +from pyarrow.lib import benchmark_PandasObjectIsNull # type: ignore[unresolved_import] diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index fe0afdb0a87..52e2de0e484 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. 
-from pyarrow._compute import ( # noqa +from pyarrow._compute import ( # type: ignore[unresolved_import] # noqa Function, FunctionOptions, FunctionRegistry, diff --git a/python/pyarrow/conftest.py b/python/pyarrow/conftest.py index 41beaa14041..d1b1567389b 100644 --- a/python/pyarrow/conftest.py +++ b/python/pyarrow/conftest.py @@ -21,7 +21,7 @@ import pyarrow as pa from pyarrow import Codec from pyarrow import fs -from pyarrow.lib import is_threading_enabled +from pyarrow.lib import is_threading_enabled # type: ignore[unresolved_import] from pyarrow.tests.util import windows_has_tzdata import sys @@ -120,13 +120,13 @@ pass try: - import fastparquet # noqa + import fastparquet # type: ignore[unresolved_import] # noqa defaults['fastparquet'] = True except ImportError: pass try: - import pyarrow.gandiva # noqa + import pyarrow.gandiva # type: ignore[unresolved_import] # noqa defaults['gandiva'] = True except ImportError: pass diff --git a/python/pyarrow/csv.py b/python/pyarrow/csv.py index 1ae197f9f20..76ab1c5e03d 100644 --- a/python/pyarrow/csv.py +++ b/python/pyarrow/csv.py @@ -16,7 +16,7 @@ # under the License. -from pyarrow._csv import ( # noqa +from pyarrow._csv import ( # type: ignore[unresolved_import] # noqa ReadOptions, ParseOptions, ConvertOptions, ISO8601, open_csv, read_csv, CSVStreamingReader, write_csv, WriteOptions, CSVWriter, InvalidRow) diff --git a/python/pyarrow/cuda.py b/python/pyarrow/cuda.py index 18c530d4afe..834096cfa30 100644 --- a/python/pyarrow/cuda.py +++ b/python/pyarrow/cuda.py @@ -18,7 +18,7 @@ # flake8: noqa -from pyarrow._cuda import (Context, IpcMemHandle, CudaBuffer, +from pyarrow._cuda import (Context, IpcMemHandle, CudaBuffer, # type: ignore[unresolved_import] HostBuffer, BufferReader, BufferWriter, new_host_buffer, serialize_record_batch, read_message, diff --git a/python/pyarrow/dataset.py b/python/pyarrow/dataset.py index ef4f7288723..1ab75f8a7fb 100644 --- a/python/pyarrow/dataset.py +++ b/python/pyarrow/dataset.py @@ -21,7 +21,7 @@ from pyarrow.util import _is_iterable, _stringify_path, _is_path_like try: - from pyarrow._dataset import ( # noqa + from pyarrow._dataset import ( # type: ignore[unresolved_import] # noqa CsvFileFormat, CsvFragmentScanOptions, JsonFileFormat, @@ -70,7 +70,7 @@ ) try: - from pyarrow._dataset_orc import OrcFileFormat + from pyarrow._dataset_orc import OrcFileFormat # type: ignore[unresolved_import] _orc_available = True except ImportError: pass @@ -82,7 +82,7 @@ ) try: - from pyarrow._dataset_parquet import ( # noqa + from pyarrow._dataset_parquet import ( # type: ignore[unresolved_import] # noqa ParquetDatasetFactory, ParquetFactoryOptions, ParquetFileFormat, @@ -98,7 +98,7 @@ try: - from pyarrow._dataset_parquet_encryption import ( # noqa + from pyarrow._dataset_parquet_encryption import ( # type: ignore[unresolved_import] # noqa ParquetDecryptionConfig, ParquetEncryptionConfig, ) diff --git a/python/pyarrow/feather.py b/python/pyarrow/feather.py index 241c27706a6..28a5c2c5476 100644 --- a/python/pyarrow/feather.py +++ b/python/pyarrow/feather.py @@ -20,11 +20,12 @@ import os from pyarrow.pandas_compat import _pandas_api # noqa -from pyarrow.lib import (Codec, Table, # noqa +from pyarrow.lib import (Codec, Table, # type: ignore[unresolved_import] # noqa concat_tables, schema) -import pyarrow.lib as ext -from pyarrow import _feather -from pyarrow._feather import FeatherError # noqa: F401 +import pyarrow.lib as ext # type: ignore[unresolved_import] +from pyarrow import _feather # type: ignore[unresolved_import] +from 
pyarrow._feather import FeatherError \ + # type: ignore[unresolved_import] # noqa: F401 class FeatherDataset: diff --git a/python/pyarrow/flight.py b/python/pyarrow/flight.py index b1836907c67..d6c4602b45d 100644 --- a/python/pyarrow/flight.py +++ b/python/pyarrow/flight.py @@ -16,7 +16,7 @@ # under the License. try: - from pyarrow._flight import ( # noqa:F401 + from pyarrow._flight import ( # type: ignore[unresolved_import] # noqa:F401 connect, Action, ActionType, diff --git a/python/pyarrow/fs.py b/python/pyarrow/fs.py index 157dbdf9380..c7f1b325c70 100644 --- a/python/pyarrow/fs.py +++ b/python/pyarrow/fs.py @@ -21,7 +21,7 @@ from pyarrow.util import _is_path_like, _stringify_path -from pyarrow._fs import ( # noqa +from pyarrow._fs import ( # type: ignore[unresolved_import] # noqa FileSelector, FileType, FileInfo, @@ -40,22 +40,22 @@ _not_imported = [] try: - from pyarrow._azurefs import AzureFileSystem # noqa + from pyarrow._azurefs import AzureFileSystem # type: ignore[unresolved_import] # noqa except ImportError: _not_imported.append("AzureFileSystem") try: - from pyarrow._hdfs import HadoopFileSystem # noqa + from pyarrow._hdfs import HadoopFileSystem # type: ignore[unresolved_import] # noqa except ImportError: _not_imported.append("HadoopFileSystem") try: - from pyarrow._gcsfs import GcsFileSystem # noqa + from pyarrow._gcsfs import GcsFileSystem # type: ignore[unresolved_import] # noqa except ImportError: _not_imported.append("GcsFileSystem") try: - from pyarrow._s3fs import ( # noqa + from pyarrow._s3fs import ( # type: ignore[unresolved_import] # noqa AwsDefaultS3RetryStrategy, AwsStandardS3RetryStrategy, S3FileSystem, S3LogLevel, S3RetryStrategy, ensure_s3_initialized, finalize_s3, ensure_s3_finalized, initialize_s3, resolve_s3_region) diff --git a/python/pyarrow/ipc.py b/python/pyarrow/ipc.py index 4e236678788..39ec944b728 100644 --- a/python/pyarrow/ipc.py +++ b/python/pyarrow/ipc.py @@ -21,14 +21,14 @@ import pyarrow as pa -from pyarrow.lib import (IpcReadOptions, IpcWriteOptions, ReadStats, WriteStats, # noqa +from pyarrow.lib import (IpcReadOptions, IpcWriteOptions, ReadStats, WriteStats, # type: ignore[unresolved_import] # noqa Message, MessageReader, RecordBatchReader, _ReadPandasMixin, MetadataVersion, Alignment, read_message, read_record_batch, read_schema, read_tensor, write_tensor, get_record_batch_size, get_tensor_size) -import pyarrow.lib as lib +import pyarrow.lib as lib # type: ignore[unresolved_import] class RecordBatchStreamReader(lib._RecordBatchStreamReader): diff --git a/python/pyarrow/json.py b/python/pyarrow/json.py index 24e60461350..d4988a1b5ae 100644 --- a/python/pyarrow/json.py +++ b/python/pyarrow/json.py @@ -16,4 +16,4 @@ # under the License. 
-from pyarrow._json import ReadOptions, ParseOptions, read_json, open_json # noqa +from pyarrow._json import ReadOptions, ParseOptions, read_json, open_json # type: ignore[unresolved_import] # noqa diff --git a/python/pyarrow/orc.py b/python/pyarrow/orc.py index 4e0d66ec665..03c6a48046e 100644 --- a/python/pyarrow/orc.py +++ b/python/pyarrow/orc.py @@ -19,8 +19,8 @@ from numbers import Integral import warnings -from pyarrow.lib import Table -import pyarrow._orc as _orc +from pyarrow.lib import Table # type: ignore[unresolved_import] +import pyarrow._orc as _orc # type: ignore[unresolved_import] from pyarrow.fs import _resolve_filesystem_and_path diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index 7b9f5008a10..f284d411abf 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -35,7 +35,7 @@ except ImportError: np = None # type: ignore[assignment] import pyarrow as pa -from pyarrow.lib import _pandas_api, frombytes, is_threading_enabled # noqa +from pyarrow.lib import _pandas_api, frombytes, is_threading_enabled # type: ignore[unresolved_import] # noqa _logical_type_map = {} @@ -729,7 +729,7 @@ def _reconstruct_block(item, columns=None, extension_columns=None, return_block= pandas Block """ - import pandas.core.internals as _int + import pandas.core.internals as _int # type: ignore[unresolved_import] block_arr = item.get('block', None) placement = item['placement'] @@ -806,7 +806,8 @@ def table_to_dataframe( result = pa.lib.table_to_blocks(options, table, categories, list(ext_columns_dtypes.keys())) if _pandas_api.is_ge_v3(): - from pandas.api.internals import create_dataframe_from_blocks + from pandas.api.internals import create_dataframe_from_blocks \ + # type: ignore[unresolved_import] blocks = [ _reconstruct_block( @@ -816,7 +817,8 @@ def table_to_dataframe( df = create_dataframe_from_blocks(blocks, index=index, columns=columns) return df else: - from pandas.core.internals import BlockManager + from pandas.core.internals import BlockManager \ + # type: ignore[unresolved_import] from pandas import DataFrame blocks = [ diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index 8c1a2ae7822..7b6c57f9683 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -29,14 +29,14 @@ import pyarrow as pa try: - import pyarrow._parquet as _parquet + import pyarrow._parquet as _parquet # type: ignore[unresolved_import] except ImportError as exc: raise ImportError( "The pyarrow installation is not built with support " f"for the Parquet file format ({str(exc)})" ) from None -from pyarrow._parquet import (ParquetReader, Statistics, # noqa +from pyarrow._parquet import (ParquetReader, Statistics, # type: ignore[unresolved_import] # noqa FileMetaData, RowGroupMetaData, ColumnChunkMetaData, ParquetSchema, ColumnSchema, diff --git a/python/pyarrow/parquet/encryption.py b/python/pyarrow/parquet/encryption.py index df6eed913fa..43e3bce04e6 100644 --- a/python/pyarrow/parquet/encryption.py +++ b/python/pyarrow/parquet/encryption.py @@ -16,7 +16,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
-from pyarrow._parquet_encryption import (CryptoFactory, # noqa +from pyarrow._parquet_encryption import (CryptoFactory, # type: ignore[unresolved_import] # noqa EncryptionConfiguration, DecryptionConfiguration, KmsConnectionConfig, diff --git a/python/pyarrow/substrait.py b/python/pyarrow/substrait.py index db2c3a96a19..7ddfa790cb6 100644 --- a/python/pyarrow/substrait.py +++ b/python/pyarrow/substrait.py @@ -16,7 +16,7 @@ # under the License. try: - from pyarrow._substrait import ( # noqa + from pyarrow._substrait import ( # type: ignore[unresolved_import] # noqa BoundExpressions, get_supported_functions, run_query, diff --git a/python/pyarrow/tests/test_builder.py b/python/pyarrow/tests/test_builder.py index 9187a19b5fc..65ca1458d0c 100644 --- a/python/pyarrow/tests/test_builder.py +++ b/python/pyarrow/tests/test_builder.py @@ -19,7 +19,8 @@ import weakref import pyarrow as pa -from pyarrow.lib import StringBuilder, StringViewBuilder +from pyarrow.lib import StringBuilder, StringViewBuilder \ + # type: ignore[unresolved_import] def test_weakref(): diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 003fb5db41d..4ab0e632134 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -40,7 +40,7 @@ import pyarrow as pa import pyarrow.compute as pc -from pyarrow.lib import ArrowNotImplementedError +from pyarrow.lib import ArrowNotImplementedError # type: ignore[unresolved_import] try: import pyarrow.substrait as pas diff --git a/python/pyarrow/tests/test_cpp_internals.py b/python/pyarrow/tests/test_cpp_internals.py index 7508d8f0b98..359ef62b1f8 100644 --- a/python/pyarrow/tests/test_cpp_internals.py +++ b/python/pyarrow/tests/test_cpp_internals.py @@ -20,7 +20,7 @@ import pytest -from pyarrow._pyarrow_cpp_tests import get_cpp_tests +from pyarrow._pyarrow_cpp_tests import get_cpp_tests # type: ignore[unresolved_import] def inject_cpp_tests(ns): diff --git a/python/pyarrow/tests/test_cuda_numba_interop.py b/python/pyarrow/tests/test_cuda_numba_interop.py index 3bd81d755f5..f211f0046f0 100644 --- a/python/pyarrow/tests/test_cuda_numba_interop.py +++ b/python/pyarrow/tests/test_cuda_numba_interop.py @@ -26,7 +26,8 @@ cuda = pytest.importorskip("pyarrow.cuda") nb_cuda = pytest.importorskip("numba.cuda") -from numba.cuda.cudadrv.devicearray import DeviceNDArray # noqa: E402 +from numba.cuda.cudadrv.devicearray import DeviceNDArray \ + # type: ignore[unresolved_import] # noqa: E402 context_choices = None diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index d1cd3f6b8a1..d9a4d3df207 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -41,7 +41,7 @@ import pyarrow.feather import pyarrow.fs as fs import pyarrow.json -from pyarrow.lib import is_threading_enabled +from pyarrow.lib import is_threading_enabled # type: ignore[unresolved_import] from pyarrow.tests.util import (FSProtocolClass, ProxyHandler, _configure_s3_limited_user, _filesystem_uri, change_cwd) diff --git a/python/pyarrow/tests/test_flight.py b/python/pyarrow/tests/test_flight.py index 600c6492780..0c0bc7089b9 100644 --- a/python/pyarrow/tests/test_flight.py +++ b/python/pyarrow/tests/test_flight.py @@ -37,7 +37,7 @@ import pytest import pyarrow as pa -from pyarrow.lib import IpcReadOptions, tobytes +from pyarrow.lib import IpcReadOptions, tobytes # type: ignore[unresolved_import] from pyarrow.util import find_free_port from pyarrow.tests import util diff --git 
a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index a5a10fa55c6..61dcb76b247 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -2168,7 +2168,7 @@ def test_fsspec_filesystem_from_uri(): def test_huggingface_filesystem_from_uri(): pytest.importorskip("fsspec") try: - from huggingface_hub import HfFileSystem + from huggingface_hub import HfFileSystem # type: ignore[unresolved_import] except ImportError: pytest.skip("huggingface_hub not installed") diff --git a/python/pyarrow/tests/test_gandiva.py b/python/pyarrow/tests/test_gandiva.py index 80d119a4853..01a6d2151a0 100644 --- a/python/pyarrow/tests/test_gandiva.py +++ b/python/pyarrow/tests/test_gandiva.py @@ -23,7 +23,7 @@ @pytest.mark.gandiva def test_tree_exp_builder(): - import pyarrow.gandiva as gandiva + import pyarrow.gandiva as gandiva # type: ignore[unresolved_import] builder = gandiva.TreeExprBuilder() @@ -65,7 +65,7 @@ def test_tree_exp_builder(): @pytest.mark.gandiva def test_table(): - import pyarrow.gandiva as gandiva + import pyarrow.gandiva as gandiva # type: ignore[unresolved_import] table = pa.Table.from_arrays([pa.array([1.0, 2.0]), pa.array([3.0, 4.0])], ['a', 'b']) @@ -92,7 +92,7 @@ def test_table(): @pytest.mark.gandiva def test_filter(): - import pyarrow.gandiva as gandiva + import pyarrow.gandiva as gandiva # type: ignore[unresolved_import] table = pa.Table.from_arrays([pa.array([1.0 * i for i in range(10000)])], ['a']) @@ -116,7 +116,7 @@ def test_filter(): @pytest.mark.gandiva def test_in_expr(): - import pyarrow.gandiva as gandiva + import pyarrow.gandiva as gandiva # type: ignore[unresolved_import] arr = pa.array(["ga", "an", "nd", "di", "iv", "va"]) table = pa.Table.from_arrays([arr], ["a"]) @@ -154,7 +154,7 @@ def test_in_expr(): @pytest.mark.skip(reason="Gandiva C++ did not have *real* binary, " "time and date support.") def test_in_expr_todo(): - import pyarrow.gandiva as gandiva + import pyarrow.gandiva as gandiva # type: ignore[unresolved_import] # TODO: Implement reasonable support for timestamp, time & date. 
# Current exceptions: # pyarrow.lib.ArrowException: ExpressionValidationError: @@ -227,7 +227,7 @@ def test_in_expr_todo(): @pytest.mark.gandiva def test_boolean(): - import pyarrow.gandiva as gandiva + import pyarrow.gandiva as gandiva # type: ignore[unresolved_import] table = pa.Table.from_arrays([ pa.array([1., 31., 46., 3., 57., 44., 22.]), @@ -254,7 +254,7 @@ def test_boolean(): @pytest.mark.gandiva def test_literals(): - import pyarrow.gandiva as gandiva + import pyarrow.gandiva as gandiva # type: ignore[unresolved_import] builder = gandiva.TreeExprBuilder() @@ -294,7 +294,7 @@ def test_literals(): @pytest.mark.gandiva def test_regex(): - import pyarrow.gandiva as gandiva + import pyarrow.gandiva as gandiva # type: ignore[unresolved_import] elements = ["park", "sparkle", "bright spark and fire", "spark"] data = pa.array(elements, type=pa.string()) @@ -318,7 +318,7 @@ def test_regex(): @pytest.mark.gandiva def test_get_registered_function_signatures(): - import pyarrow.gandiva as gandiva + import pyarrow.gandiva as gandiva # type: ignore[unresolved_import] signatures = gandiva.get_registered_function_signatures() assert type(signatures[0].return_type()) is pa.DataType @@ -328,7 +328,7 @@ def test_get_registered_function_signatures(): @pytest.mark.gandiva def test_filter_project(): - import pyarrow.gandiva as gandiva + import pyarrow.gandiva as gandiva # type: ignore[unresolved_import] mpool = pa.default_memory_pool() # Create a table with some sample data array0 = pa.array([10, 12, -20, 5, 21, 29], pa.int32()) @@ -375,7 +375,7 @@ def test_filter_project(): @pytest.mark.gandiva def test_to_string(): - import pyarrow.gandiva as gandiva + import pyarrow.gandiva as gandiva # type: ignore[unresolved_import] builder = gandiva.TreeExprBuilder() assert str(builder.make_literal(2.0, pa.float64()) @@ -395,7 +395,7 @@ def test_to_string(): @pytest.mark.gandiva def test_rejects_none(): - import pyarrow.gandiva as gandiva + import pyarrow.gandiva as gandiva # type: ignore[unresolved_import] builder = gandiva.TreeExprBuilder() diff --git a/python/pyarrow/tests/test_jvm.py b/python/pyarrow/tests/test_jvm.py index d71380b8666..b048fcea9ee 100644 --- a/python/pyarrow/tests/test_jvm.py +++ b/python/pyarrow/tests/test_jvm.py @@ -76,8 +76,8 @@ def test_jvm_buffer(root_allocator): def test_jvm_buffer_released(root_allocator): - import jpype.imports # noqa - from java.lang import IllegalArgumentException + import jpype.imports # type: ignore[unresolved_import] # noqa + from java.lang import IllegalArgumentException # type: ignore[unresolved_import] jvm_buffer = root_allocator.buffer(8) jvm_buffer.release() diff --git a/python/pyarrow/tests/test_misc.py b/python/pyarrow/tests/test_misc.py index 64f45d8bed8..09ac52588ed 100644 --- a/python/pyarrow/tests/test_misc.py +++ b/python/pyarrow/tests/test_misc.py @@ -22,7 +22,7 @@ import pytest import pyarrow as pa -from pyarrow.lib import ArrowInvalid +from pyarrow.lib import ArrowInvalid # type: ignore[unresolved_import] def test_get_include(): diff --git a/python/pyarrow/tests/test_sparse_tensor.py b/python/pyarrow/tests/test_sparse_tensor.py index 27974b80f80..e4d141e2a6f 100644 --- a/python/pyarrow/tests/test_sparse_tensor.py +++ b/python/pyarrow/tests/test_sparse_tensor.py @@ -32,7 +32,7 @@ scipy = None # type: ignore[assignment] try: - import sparse + import sparse # type: ignore[unresolved_import] except ImportError: sparse = None diff --git a/python/pyarrow/tests/test_substrait.py b/python/pyarrow/tests/test_substrait.py index fcd1c8d48c5..fae89d3cee5 
100644 --- a/python/pyarrow/tests/test_substrait.py +++ b/python/pyarrow/tests/test_substrait.py @@ -22,8 +22,9 @@ import pyarrow as pa import pyarrow.compute as pc -from pyarrow.lib import tobytes -from pyarrow.lib import ArrowInvalid, ArrowNotImplementedError +from pyarrow.lib import tobytes # type: ignore[unresolved_import] +from pyarrow.lib import ArrowInvalid, ArrowNotImplementedError \ + # type: ignore[unresolved_import] try: import pyarrow.substrait as substrait @@ -36,7 +37,7 @@ def mock_udf_context(batch_length=10): - from pyarrow._compute import _get_udf_context + from pyarrow._compute import _get_udf_context # type: ignore[unresolved_import] return _get_udf_context(pa.default_memory_pool(), batch_length) diff --git a/python/pyarrow/tests/test_udf.py b/python/pyarrow/tests/test_udf.py index dbc30867971..891295a5519 100644 --- a/python/pyarrow/tests/test_udf.py +++ b/python/pyarrow/tests/test_udf.py @@ -39,7 +39,7 @@ def mock_udf_context(batch_length=10): - from pyarrow._compute import _get_udf_context + from pyarrow._compute import _get_udf_context # type: ignore[unresolved_import] return _get_udf_context(pa.default_memory_pool(), batch_length) diff --git a/python/pyarrow/types.py b/python/pyarrow/types.py index ab4e5d1b992..ee2b7e1440f 100644 --- a/python/pyarrow/types.py +++ b/python/pyarrow/types.py @@ -20,11 +20,11 @@ from enum import IntEnum -from pyarrow.lib import (is_boolean_value, # noqa +from pyarrow.lib import (is_boolean_value, # type: ignore[unresolved_import] # noqa is_integer_value, is_float_value) -import pyarrow.lib as lib +import pyarrow.lib as lib # type: ignore[unresolved_import] from pyarrow.util import doc diff --git a/python/stubs/__init__.pyi b/python/stubs/__init__.pyi index 8a0d1e870c5..0a1c49067c3 100644 --- a/python/stubs/__init__.pyi +++ b/python/stubs/__init__.pyi @@ -1,11 +1,11 @@ # ruff: noqa: F401, I001, E402 __version__: str -import pyarrow.lib as _lib +import pyarrow.lib as _lib # type: ignore[unresolved_import] _gc_enabled: bool -from pyarrow.lib import ( +from pyarrow.lib import ( # type: ignore[unresolved_import] BuildInfo, RuntimeInfo, set_timezone_db_path, @@ -27,7 +27,7 @@ def show_info() -> None: ... def _module_is_available(module: str) -> bool: ... def _filesystem_is_available(fs: str) -> bool: ... 
-from pyarrow.lib import ( +from pyarrow.lib import ( # type: ignore[unresolved_import] null, bool_, int8, @@ -233,9 +233,9 @@ from pyarrow.lib import ( ) # Buffers, allocation -from pyarrow.lib import DeviceAllocationType, Device, MemoryManager, default_cpu_memory_manager +from pyarrow.lib import DeviceAllocationType, Device, MemoryManager, default_cpu_memory_manager # type: ignore[unresolved_import] -from pyarrow.lib import ( +from pyarrow.lib import ( # type: ignore[unresolved_import] Buffer, ResizableBuffer, foreign_buffer, @@ -246,7 +246,7 @@ from pyarrow.lib import ( allocate_buffer, ) -from pyarrow.lib import ( +from pyarrow.lib import ( # type: ignore[unresolved_import] MemoryPool, LoggingMemoryPool, ProxyMemoryPool, @@ -264,7 +264,7 @@ from pyarrow.lib import ( ) # I/O -from pyarrow.lib import ( +from pyarrow.lib import ( # type: ignore[unresolved_import] NativeFile, PythonFile, BufferedInputStream, @@ -287,7 +287,7 @@ from pyarrow.lib import ( have_libhdfs, ) -from pyarrow.lib import ( +from pyarrow.lib import ( # type: ignore[unresolved_import] ChunkedArray, RecordBatch, Table, @@ -299,7 +299,7 @@ from pyarrow.lib import ( ) # Exceptions -from pyarrow.lib import ( +from pyarrow.lib import ( # type: ignore[unresolved_import] ArrowCancelled, ArrowCapacityError, ArrowException, diff --git a/python/stubs/__lib_pxi/array.pyi b/python/stubs/__lib_pxi/array.pyi index ffdb8a9c075..37b397f6bb9 100644 --- a/python/stubs/__lib_pxi/array.pyi +++ b/python/stubs/__lib_pxi/array.pyi @@ -23,8 +23,8 @@ import numpy as np import pandas as pd from pandas.core.dtypes.base import ExtensionDtype -from pyarrow._compute import CastOptions -from pyarrow._stubs_typing import ( +from pyarrow._compute import CastOptions # type: ignore[unresolved_import] +from pyarrow._stubs_typing import ( # type: ignore[unresolved_import] ArrayLike, Indices, Mask, @@ -32,7 +32,7 @@ from pyarrow._stubs_typing import ( SupportArrowArray, SupportArrowDeviceArray, ) -from pyarrow.lib import ( +from pyarrow.lib import ( # type: ignore[unresolved_import] Buffer, Device, MemoryManager, diff --git a/python/stubs/__lib_pxi/builder.pyi b/python/stubs/__lib_pxi/builder.pyi index 4a0e9ca4708..655d6436da8 100644 --- a/python/stubs/__lib_pxi/builder.pyi +++ b/python/stubs/__lib_pxi/builder.pyi @@ -1,6 +1,6 @@ from typing import Iterable -from pyarrow.lib import MemoryPool, _Weakrefable +from pyarrow.lib import MemoryPool, _Weakrefable # type: ignore[unresolved_import] from .array import StringArray, StringViewArray diff --git a/python/stubs/__lib_pxi/device.pyi b/python/stubs/__lib_pxi/device.pyi index d1b9f39eedd..edcabdd796a 100644 --- a/python/stubs/__lib_pxi/device.pyi +++ b/python/stubs/__lib_pxi/device.pyi @@ -1,6 +1,6 @@ import enum -from pyarrow.lib import _Weakrefable +from pyarrow.lib import _Weakrefable # type: ignore[unresolved_import] class DeviceAllocationType(enum.Flag): CPU = enum.auto() diff --git a/python/stubs/__lib_pxi/io.pyi b/python/stubs/__lib_pxi/io.pyi index 37c8aefb06b..488dbf163a7 100644 --- a/python/stubs/__lib_pxi/io.pyi +++ b/python/stubs/__lib_pxi/io.pyi @@ -17,8 +17,8 @@ else: from typing import Any, Literal, SupportsIndex, overload -from pyarrow._stubs_typing import Compression, SupportPyBuffer -from pyarrow.lib import MemoryPool, _Weakrefable +from pyarrow._stubs_typing import Compression, SupportPyBuffer # type: ignore[unresolved_import] +from pyarrow.lib import MemoryPool, _Weakrefable # type: ignore[unresolved_import] from .device import Device, DeviceAllocationType, MemoryManager from .types import 
KeyValueMetadata diff --git a/python/stubs/__lib_pxi/ipc.pyi b/python/stubs/__lib_pxi/ipc.pyi index 3d72892061e..13363e4447a 100644 --- a/python/stubs/__lib_pxi/ipc.pyi +++ b/python/stubs/__lib_pxi/ipc.pyi @@ -11,8 +11,8 @@ from typing import Iterable, Iterator, Literal, Mapping, NamedTuple import pandas as pd -from pyarrow._stubs_typing import SupportArrowStream, SupportPyBuffer -from pyarrow.lib import MemoryPool, RecordBatch, Schema, Table, Tensor, _Weakrefable +from pyarrow._stubs_typing import SupportArrowStream, SupportPyBuffer # type: ignore[unresolved_import] +from pyarrow.lib import MemoryPool, RecordBatch, Schema, Table, Tensor, _Weakrefable # type: ignore[unresolved_import] from .io import Buffer, Codec, NativeFile from .types import DictionaryMemo, KeyValueMetadata diff --git a/python/stubs/__lib_pxi/memory.pyi b/python/stubs/__lib_pxi/memory.pyi index 57a3bb4f1b3..c58bf20dd90 100644 --- a/python/stubs/__lib_pxi/memory.pyi +++ b/python/stubs/__lib_pxi/memory.pyi @@ -1,4 +1,4 @@ -from pyarrow.lib import _Weakrefable +from pyarrow.lib import _Weakrefable # type: ignore[unresolved_import] class MemoryPool(_Weakrefable): """ diff --git a/python/stubs/__lib_pxi/pandas_shim.pyi b/python/stubs/__lib_pxi/pandas_shim.pyi index 29a8485d062..c8cebf765ad 100644 --- a/python/stubs/__lib_pxi/pandas_shim.pyi +++ b/python/stubs/__lib_pxi/pandas_shim.pyi @@ -1,5 +1,5 @@ from types import ModuleType -from typing import Any, Iterable, TypeGuard +from typing import Any, Iterable, TypeGuard # type: ignore[unresolved_import] import pandas diff --git a/python/stubs/__lib_pxi/scalar.pyi b/python/stubs/__lib_pxi/scalar.pyi index 81ab5012067..cfd4ee6f34a 100644 --- a/python/stubs/__lib_pxi/scalar.pyi +++ b/python/stubs/__lib_pxi/scalar.pyi @@ -16,8 +16,8 @@ from typing import Any, Generic, Iterator, Literal, Mapping, overload import numpy as np -from pyarrow._compute import CastOptions -from pyarrow.lib import Array, Buffer, MemoryPool, MonthDayNano, Tensor, _Weakrefable +from pyarrow._compute import CastOptions # type: ignore[unresolved_import] +from pyarrow.lib import Array, Buffer, MemoryPool, MonthDayNano, Tensor, _Weakrefable # type: ignore[unresolved_import] from typing_extensions import Protocol, TypeVar from . import types diff --git a/python/stubs/__lib_pxi/table.pyi b/python/stubs/__lib_pxi/table.pyi index fbcfb1ef745..1ce21b6ed27 100644 --- a/python/stubs/__lib_pxi/table.pyi +++ b/python/stubs/__lib_pxi/table.pyi @@ -31,7 +31,7 @@ import numpy as np import pandas as pd from numpy.typing import NDArray -from pyarrow._compute import ( +from pyarrow._compute import ( # type: ignore[unresolved_import] CastOptions, CountOptions, FunctionOptions, @@ -39,7 +39,7 @@ from pyarrow._compute import ( TDigestOptions, VarianceOptions, ) -from pyarrow._stubs_typing import ( +from pyarrow._stubs_typing import ( # type: ignore[unresolved_import] Indices, Mask, NullEncoding, @@ -49,12 +49,15 @@ from pyarrow._stubs_typing import ( SupportArrowDeviceArray, SupportArrowStream, ) -from pyarrow.compute import ArrayOrChunkedArray, Expression +from pyarrow.compute import ArrayOrChunkedArray, Expression # type: ignore[unresolved_import] from pyarrow.interchange.dataframe import _PyArrowDataFrame -from pyarrow.lib import Device, Field, MemoryManager, MemoryPool, MonthDayNano, Schema +from pyarrow.lib import Device, Field, MemoryManager, MemoryPool, MonthDayNano, Schema # type: ignore[unresolved_import] from . 
import array, scalar, types -from .array import Array, NullableCollection, StructArray, _CastAs, _PandasConvertible +from .array import ( + Array, StructArray, _CastAs, _PandasConvertible, + NullableCollection, # type: ignore[unresolved_import] +) from .device import DeviceAllocationType from .io import Buffer from .ipc import RecordBatchReader diff --git a/python/stubs/__lib_pxi/tensor.pyi b/python/stubs/__lib_pxi/tensor.pyi index d849abd0f1f..a28804c6e36 100644 --- a/python/stubs/__lib_pxi/tensor.pyi +++ b/python/stubs/__lib_pxi/tensor.pyi @@ -7,9 +7,9 @@ else: import numpy as np -from pyarrow.lib import _Weakrefable +from pyarrow.lib import _Weakrefable # type: ignore[unresolved_import] from scipy.sparse import coo_matrix, csr_matrix -from sparse import COO +from sparse import COO # type: ignore[unresolved_import] class Tensor(_Weakrefable): """ diff --git a/python/stubs/__lib_pxi/types.pyi b/python/stubs/__lib_pxi/types.pyi index a7b6062b275..d38269ef341 100644 --- a/python/stubs/__lib_pxi/types.pyi +++ b/python/stubs/__lib_pxi/types.pyi @@ -14,8 +14,8 @@ from typing import Any, Generic, Iterable, Iterator, Literal, overload import numpy as np import pandas as pd -from pyarrow._stubs_typing import SupportArrowSchema -from pyarrow.lib import ( +from pyarrow._stubs_typing import SupportArrowSchema # type: ignore[unresolved_import] +from pyarrow.lib import ( # type: ignore[unresolved_import] Array, ChunkedArray, ExtensionArray, @@ -29,6 +29,7 @@ from .io import Buffer from .scalar import ExtensionScalar _AsPyType = TypeVar("_AsPyType") +_DataTypeT = TypeVar("_DataTypeT", bound=DataType) class _Weakrefable: ... class _Metadata(_Weakrefable): ... diff --git a/python/stubs/_compute.pyi b/python/stubs/_compute.pyi index 3d61ae42787..071fceb3928 100644 --- a/python/stubs/_compute.pyi +++ b/python/stubs/_compute.pyi @@ -1,12 +1,6 @@ from typing import ( - Any, - Callable, - Iterable, - Literal, - Sequence, - TypeAlias, - TypedDict, - overload, + Any, Callable, Iterable, Literal, Sequence, TypedDict, overload, + TypeAlias, # type: ignore[unresolved_import] ) from . 
import lib diff --git a/python/stubs/_fs.pyi b/python/stubs/_fs.pyi index 9f6e28dcf0f..d3b194e3ded 100644 --- a/python/stubs/_fs.pyi +++ b/python/stubs/_fs.pyi @@ -19,6 +19,8 @@ from fsspec import AbstractFileSystem # type: ignore[import-untyped] from .lib import NativeFile, _Weakrefable +SupportedFileSystem: TypeAlias = Union[AbstractFileSystem, FileSystem] + class FileType(enum.IntFlag): NotFound = enum.auto() Unknown = enum.auto() diff --git a/python/stubs/_parquet.pyi b/python/stubs/_parquet.pyi index a9187df0428..053f2d08266 100644 --- a/python/stubs/_parquet.pyi +++ b/python/stubs/_parquet.pyi @@ -1,4 +1,7 @@ -from typing import IO, Any, Iterable, Iterator, Literal, Sequence, TypeAlias, TypedDict +from typing import ( + IO, Any, Iterable, Iterator, Literal, Sequence, TypedDict, + TypeAlias, # type: ignore[unresolved_import] +) from _typeshed import StrPath diff --git a/python/stubs/_s3fs.pyi b/python/stubs/_s3fs.pyi index fc13c498bd9..8e67c805619 100644 --- a/python/stubs/_s3fs.pyi +++ b/python/stubs/_s3fs.pyi @@ -1,6 +1,9 @@ import enum -from typing import Literal, NotRequired, Required, TypedDict +from typing import ( + Literal, TypedDict, + NotRequired, Required, # type: ignore[unresolved_import] +) from ._fs import FileSystem from .lib import KeyValueMetadata diff --git a/python/stubs/_stubs_typing.pyi b/python/stubs/_stubs_typing.pyi index 40d931d24ed..73bb9f38a95 100644 --- a/python/stubs/_stubs_typing.pyi +++ b/python/stubs/_stubs_typing.pyi @@ -2,7 +2,10 @@ import datetime as dt from collections.abc import Sequence from decimal import Decimal -from typing import Any, Collection, Literal, Protocol, TypeAlias, TypeVar, Union +from typing import ( + Any, Collection, Literal, Protocol, TypeVar, Union, + TypeAlias # type: ignore[unresolved_import] +) import numpy as np diff --git a/python/stubs/benchmark.pyi b/python/stubs/benchmark.pyi index 048973301dc..972fad10a5f 100644 --- a/python/stubs/benchmark.pyi +++ b/python/stubs/benchmark.pyi @@ -1,3 +1,3 @@ -from pyarrow.lib import benchmark_PandasObjectIsNull +from pyarrow.lib import benchmark_PandasObjectIsNull # type: ignore[unresolved_import] __all__ = ["benchmark_PandasObjectIsNull"] diff --git a/python/stubs/compute.pyi b/python/stubs/compute.pyi index 1cf52ff07ca..775b7fa504e 100644 --- a/python/stubs/compute.pyi +++ b/python/stubs/compute.pyi @@ -1,94 +1,100 @@ # ruff: noqa: I001 -from typing import Literal, TypeAlias, TypeVar, overload, Any, Iterable, ParamSpec, Sequence +from typing import Literal, TypeAlias, TypeVar, overload, Any, Iterable, ParamSpec, Sequence # type: ignore[unresolved_import] from collections.abc import Callable # Option classes -from pyarrow._compute import ArraySortOptions as ArraySortOptions -from pyarrow._compute import AssumeTimezoneOptions as AssumeTimezoneOptions -from pyarrow._compute import CastOptions as CastOptions -from pyarrow._compute import CountOptions as CountOptions -from pyarrow._compute import CumulativeOptions as CumulativeOptions -from pyarrow._compute import CumulativeSumOptions as CumulativeSumOptions -from pyarrow._compute import DayOfWeekOptions as DayOfWeekOptions -from pyarrow._compute import DictionaryEncodeOptions as DictionaryEncodeOptions -from pyarrow._compute import ElementWiseAggregateOptions as ElementWiseAggregateOptions +from pyarrow._compute import ( # type: ignore[unresolved_import] + ArraySortOptions as ArraySortOptions, + AssumeTimezoneOptions as AssumeTimezoneOptions, + CastOptions as CastOptions, + CountOptions as CountOptions, + CumulativeOptions as 
CumulativeOptions, + CumulativeSumOptions as CumulativeSumOptions, + DayOfWeekOptions as DayOfWeekOptions, + DictionaryEncodeOptions as DictionaryEncodeOptions, + ElementWiseAggregateOptions as ElementWiseAggregateOptions, +) # Expressions -from pyarrow._compute import Expression as Expression -from pyarrow._compute import ExtractRegexOptions as ExtractRegexOptions -from pyarrow._compute import ExtractRegexSpanOptions as ExtractRegexSpanOptions -from pyarrow._compute import FilterOptions as FilterOptions -from pyarrow._compute import Function as Function -from pyarrow._compute import FunctionOptions as FunctionOptions -from pyarrow._compute import FunctionRegistry as FunctionRegistry -from pyarrow._compute import HashAggregateFunction as HashAggregateFunction -from pyarrow._compute import HashAggregateKernel as HashAggregateKernel -from pyarrow._compute import IndexOptions as IndexOptions -from pyarrow._compute import JoinOptions as JoinOptions -from pyarrow._compute import Kernel as Kernel -from pyarrow._compute import ListFlattenOptions as ListFlattenOptions -from pyarrow._compute import ListSliceOptions as ListSliceOptions -from pyarrow._compute import MakeStructOptions as MakeStructOptions -from pyarrow._compute import MapLookupOptions as MapLookupOptions -from pyarrow._compute import MatchSubstringOptions as MatchSubstringOptions -from pyarrow._compute import ModeOptions as ModeOptions -from pyarrow._compute import NullOptions as NullOptions -from pyarrow._compute import PadOptions as PadOptions -from pyarrow._compute import PairwiseOptions as PairwiseOptions -from pyarrow._compute import PartitionNthOptions as PartitionNthOptions -from pyarrow._compute import PivotWiderOptions as PivotWiderOptions -from pyarrow._compute import QuantileOptions as QuantileOptions -from pyarrow._compute import RandomOptions as RandomOptions -from pyarrow._compute import RankOptions as RankOptions -from pyarrow._compute import RankQuantileOptions as RankQuantileOptions -from pyarrow._compute import ReplaceSliceOptions as ReplaceSliceOptions -from pyarrow._compute import ReplaceSubstringOptions as ReplaceSubstringOptions -from pyarrow._compute import RoundBinaryOptions as RoundBinaryOptions -from pyarrow._compute import RoundOptions as RoundOptions -from pyarrow._compute import RoundTemporalOptions as RoundTemporalOptions -from pyarrow._compute import RoundToMultipleOptions as RoundToMultipleOptions -from pyarrow._compute import RunEndEncodeOptions as RunEndEncodeOptions -from pyarrow._compute import ScalarAggregateFunction as ScalarAggregateFunction -from pyarrow._compute import ScalarAggregateKernel as ScalarAggregateKernel -from pyarrow._compute import ScalarAggregateOptions as ScalarAggregateOptions -from pyarrow._compute import ScalarFunction as ScalarFunction -from pyarrow._compute import ScalarKernel as ScalarKernel -from pyarrow._compute import SelectKOptions as SelectKOptions -from pyarrow._compute import SetLookupOptions as SetLookupOptions -from pyarrow._compute import SkewOptions as SkewOptions -from pyarrow._compute import SliceOptions as SliceOptions -from pyarrow._compute import SortOptions as SortOptions -from pyarrow._compute import SplitOptions as SplitOptions -from pyarrow._compute import SplitPatternOptions as SplitPatternOptions -from pyarrow._compute import StrftimeOptions as StrftimeOptions -from pyarrow._compute import StrptimeOptions as StrptimeOptions -from pyarrow._compute import StructFieldOptions as StructFieldOptions -from pyarrow._compute import TakeOptions as TakeOptions 
-from pyarrow._compute import TDigestOptions as TDigestOptions -from pyarrow._compute import TrimOptions as TrimOptions -from pyarrow._compute import UdfContext as UdfContext -from pyarrow._compute import Utf8NormalizeOptions as Utf8NormalizeOptions -from pyarrow._compute import VarianceOptions as VarianceOptions -from pyarrow._compute import VectorFunction as VectorFunction -from pyarrow._compute import VectorKernel as VectorKernel -from pyarrow._compute import WeekOptions as WeekOptions -from pyarrow._compute import WinsorizeOptions as WinsorizeOptions +from pyarrow._compute import ( # type: ignore[unresolved_import] + Expression as Expression, + ExtractRegexOptions as ExtractRegexOptions, + ExtractRegexSpanOptions as ExtractRegexSpanOptions, + FilterOptions as FilterOptions, + Function as Function, + FunctionOptions as FunctionOptions, + FunctionRegistry as FunctionRegistry, + HashAggregateFunction as HashAggregateFunction, + HashAggregateKernel as HashAggregateKernel, + IndexOptions as IndexOptions, + JoinOptions as JoinOptions, + Kernel as Kernel, + ListFlattenOptions as ListFlattenOptions, + ListSliceOptions as ListSliceOptions, + MakeStructOptions as MakeStructOptions, + MapLookupOptions as MapLookupOptions, + MatchSubstringOptions as MatchSubstringOptions, + ModeOptions as ModeOptions, + NullOptions as NullOptions, + PadOptions as PadOptions, + PairwiseOptions as PairwiseOptions, + PartitionNthOptions as PartitionNthOptions, + PivotWiderOptions as PivotWiderOptions, + QuantileOptions as QuantileOptions, + RandomOptions as RandomOptions, + RankOptions as RankOptions, + RankQuantileOptions as RankQuantileOptions, + ReplaceSliceOptions as ReplaceSliceOptions, + ReplaceSubstringOptions as ReplaceSubstringOptions, + RoundBinaryOptions as RoundBinaryOptions, + RoundOptions as RoundOptions, + RoundTemporalOptions as RoundTemporalOptions, + RoundToMultipleOptions as RoundToMultipleOptions, + RunEndEncodeOptions as RunEndEncodeOptions, + ScalarAggregateFunction as ScalarAggregateFunction, + ScalarAggregateKernel as ScalarAggregateKernel, + ScalarAggregateOptions as ScalarAggregateOptions, + ScalarFunction as ScalarFunction, + ScalarKernel as ScalarKernel, + SelectKOptions as SelectKOptions, + SetLookupOptions as SetLookupOptions, + SkewOptions as SkewOptions, + SliceOptions as SliceOptions, + SortOptions as SortOptions, + SplitOptions as SplitOptions, + SplitPatternOptions as SplitPatternOptions, + StrftimeOptions as StrftimeOptions, + StrptimeOptions as StrptimeOptions, + StructFieldOptions as StructFieldOptions, + TakeOptions as TakeOptions, + TDigestOptions as TDigestOptions, + TrimOptions as TrimOptions, + UdfContext as UdfContext, + Utf8NormalizeOptions as Utf8NormalizeOptions, + VarianceOptions as VarianceOptions, + VectorFunction as VectorFunction, + VectorKernel as VectorKernel, + WeekOptions as WeekOptions, + WinsorizeOptions as WinsorizeOptions, +) # Functions -from pyarrow._compute import call_function as call_function +from pyarrow._compute import call_function as call_function # type: ignore[unresolved_import] # Udf -from pyarrow._compute import call_tabular_function as call_tabular_function -from pyarrow._compute import function_registry as function_registry -from pyarrow._compute import get_function as get_function -from pyarrow._compute import list_functions as list_functions -from pyarrow._compute import register_aggregate_function as register_aggregate_function -from pyarrow._compute import register_scalar_function as register_scalar_function -from pyarrow._compute import 
register_tabular_function as register_tabular_function -from pyarrow._compute import register_vector_function as register_vector_function - -from pyarrow._compute import _Order, _Placement -from pyarrow._stubs_typing import ArrayLike, ScalarLike +from pyarrow._compute import ( # type: ignore[unresolved_import] + call_tabular_function as call_tabular_function, + function_registry as function_registry, + get_function as get_function, + list_functions as list_functions, + register_aggregate_function as register_aggregate_function, + register_scalar_function as register_scalar_function, + register_tabular_function as register_tabular_function, + register_vector_function as register_vector_function, +) + +from pyarrow._compute import _Order, _Placement # type: ignore[unresolved_import] +from pyarrow._stubs_typing import ArrayLike, ScalarLike # type: ignore[unresolved_import] from . import lib _P = ParamSpec("_P") diff --git a/python/stubs/csv.pyi b/python/stubs/csv.pyi index 510229d7e72..cea5542d1c5 100644 --- a/python/stubs/csv.pyi +++ b/python/stubs/csv.pyi @@ -1,4 +1,4 @@ -from pyarrow._csv import ( +from pyarrow._csv import ( # type: ignore[unresolved_import] ISO8601, ConvertOptions, CSVStreamingReader, diff --git a/python/stubs/cuda.pyi b/python/stubs/cuda.pyi index e11baf7d4e7..3c69e746f7b 100644 --- a/python/stubs/cuda.pyi +++ b/python/stubs/cuda.pyi @@ -1,4 +1,4 @@ -from pyarrow._cuda import ( +from pyarrow._cuda import ( # type: ignore[unresolved_import] BufferReader, BufferWriter, Context, diff --git a/python/stubs/dataset.pyi b/python/stubs/dataset.pyi index 98f1a38aa85..a57e9f2f3f0 100644 --- a/python/stubs/dataset.pyi +++ b/python/stubs/dataset.pyi @@ -1,7 +1,7 @@ -from typing import Callable, Iterable, Literal, Sequence, TypeAlias, overload +from typing import Callable, Iterable, Literal, Sequence, TypeAlias, overload # type: ignore[unresolved_import] from _typeshed import StrPath -from pyarrow._dataset import ( +from pyarrow._dataset import ( # type: ignore[unresolved_import] CsvFileFormat, CsvFragmentScanOptions, Dataset, @@ -32,8 +32,8 @@ from pyarrow._dataset import ( WrittenFile, get_partition_keys, ) -from pyarrow._dataset_orc import OrcFileFormat -from pyarrow._dataset_parquet import ( +from pyarrow._dataset_orc import OrcFileFormat # type: ignore[unresolved_import] +from pyarrow._dataset_parquet import ( # type: ignore[unresolved_import] ParquetDatasetFactory, ParquetFactoryOptions, ParquetFileFormat, @@ -43,12 +43,12 @@ from pyarrow._dataset_parquet import ( ParquetReadOptions, RowGroupInfo, ) -from pyarrow._dataset_parquet_encryption import ( +from pyarrow._dataset_parquet_encryption import ( # type: ignore[unresolved_import] ParquetDecryptionConfig, ParquetEncryptionConfig, ) from pyarrow.compute import Expression, field, scalar -from pyarrow.lib import Array, RecordBatch, RecordBatchReader, Schema, Table +from pyarrow.lib import Array, RecordBatch, RecordBatchReader, Schema, Table # type: ignore[unresolved_import] from ._fs import SupportedFileSystem diff --git a/python/stubs/feather.pyi b/python/stubs/feather.pyi index 9451ee15763..63766cd5d61 100644 --- a/python/stubs/feather.pyi +++ b/python/stubs/feather.pyi @@ -3,8 +3,8 @@ from typing import IO, Literal import pandas as pd from _typeshed import StrPath -from pyarrow._feather import FeatherError -from pyarrow.lib import Table +from pyarrow._feather import FeatherError # type: ignore[unresolved_import] +from pyarrow.lib import Table # type: ignore[unresolved_import] __all__ = [ "FeatherError", diff --git 
a/python/stubs/flight.pyi b/python/stubs/flight.pyi index 9b806ccf305..aa06f3ebec7 100644 --- a/python/stubs/flight.pyi +++ b/python/stubs/flight.pyi @@ -1,4 +1,4 @@ -from pyarrow._flight import ( +from pyarrow._flight import ( # type: ignore[unresolved_import] Action, ActionType, BasicAuth, diff --git a/python/stubs/fs.pyi b/python/stubs/fs.pyi index 6bf75616c13..07a1d7765e6 100644 --- a/python/stubs/fs.pyi +++ b/python/stubs/fs.pyi @@ -1,4 +1,4 @@ -from pyarrow._fs import ( # noqa +from pyarrow._fs import ( # type: ignore[unresolved_import] # noqa FileSelector, FileType, FileInfo, @@ -10,10 +10,10 @@ from pyarrow._fs import ( # noqa PyFileSystem, SupportedFileSystem, ) -from pyarrow._azurefs import AzureFileSystem -from pyarrow._hdfs import HadoopFileSystem -from pyarrow._gcsfs import GcsFileSystem -from pyarrow._s3fs import ( # noqa +from pyarrow._azurefs import AzureFileSystem # type: ignore[unresolved_import] +from pyarrow._hdfs import HadoopFileSystem # type: ignore[unresolved_import] +from pyarrow._gcsfs import GcsFileSystem # type: ignore[unresolved_import] +from pyarrow._s3fs import ( # type: ignore[unresolved_import] # noqa AwsDefaultS3RetryStrategy, AwsStandardS3RetryStrategy, S3FileSystem, diff --git a/python/stubs/interchange/buffer.pyi b/python/stubs/interchange/buffer.pyi index 46673961a75..afef5acf353 100644 --- a/python/stubs/interchange/buffer.pyi +++ b/python/stubs/interchange/buffer.pyi @@ -1,6 +1,6 @@ import enum -from pyarrow.lib import Buffer +from pyarrow.lib import Buffer # type: ignore[unresolved_import] class DlpackDeviceType(enum.IntEnum): """Integer enum for device type codes matching DLPack.""" diff --git a/python/stubs/interchange/column.pyi b/python/stubs/interchange/column.pyi index e6662867b6b..7d89c4ae6b0 100644 --- a/python/stubs/interchange/column.pyi +++ b/python/stubs/interchange/column.pyi @@ -1,8 +1,8 @@ import enum -from typing import Any, Iterable, TypeAlias, TypedDict +from typing import Any, Iterable, TypeAlias, TypedDict # type: ignore[unresolved_import] -from pyarrow.lib import Array, ChunkedArray +from pyarrow.lib import Array, ChunkedArray # type: ignore[unresolved_import] from .buffer import _PyArrowBuffer diff --git a/python/stubs/interchange/dataframe.pyi b/python/stubs/interchange/dataframe.pyi index 526a58926a9..7a17dfeb1eb 100644 --- a/python/stubs/interchange/dataframe.pyi +++ b/python/stubs/interchange/dataframe.pyi @@ -7,7 +7,7 @@ else: from typing import Any, Iterable, Sequence from pyarrow.interchange.column import _PyArrowColumn -from pyarrow.lib import RecordBatch, Table +from pyarrow.lib import RecordBatch, Table # type: ignore[unresolved_import] class _PyArrowDataFrame: """ diff --git a/python/stubs/interchange/from_dataframe.pyi b/python/stubs/interchange/from_dataframe.pyi index b04b6268975..ad461270f56 100644 --- a/python/stubs/interchange/from_dataframe.pyi +++ b/python/stubs/interchange/from_dataframe.pyi @@ -1,6 +1,6 @@ -from typing import Any, Protocol, TypeAlias +from typing import Any, Protocol, TypeAlias # type: ignore[unresolved_import] -from pyarrow.lib import Array, Buffer, DataType, DictionaryArray, RecordBatch, Table +from pyarrow.lib import Array, Buffer, DataType, DictionaryArray, RecordBatch, Table # type: ignore[unresolved_import] from .column import ( ColumnBuffers, diff --git a/python/stubs/ipc.pyi b/python/stubs/ipc.pyi index c7f2af004d4..2a5e8294e46 100644 --- a/python/stubs/ipc.pyi +++ b/python/stubs/ipc.pyi @@ -1,9 +1,9 @@ from io import IOBase import pandas as pd -import pyarrow.lib as lib +import 
pyarrow.lib as lib # type: ignore[unresolved_import] -from pyarrow.lib import ( +from pyarrow.lib import ( # type: ignore[unresolved_import] IpcReadOptions, IpcWriteOptions, Message, diff --git a/python/stubs/json.pyi b/python/stubs/json.pyi index db1d35e0b8b..97b94d5dd77 100644 --- a/python/stubs/json.pyi +++ b/python/stubs/json.pyi @@ -1,3 +1,3 @@ -from pyarrow._json import ParseOptions, ReadOptions, open_json, read_json +from pyarrow._json import ParseOptions, ReadOptions, open_json, read_json # type: ignore[unresolved_import] __all__ = ["ParseOptions", "ReadOptions", "read_json", "open_json"] diff --git a/python/stubs/parquet/core.pyi b/python/stubs/parquet/core.pyi index 56b2c8447d9..01dce442feb 100644 --- a/python/stubs/parquet/core.pyi +++ b/python/stubs/parquet/core.pyi @@ -13,10 +13,10 @@ if sys.version_info >= (3, 10): else: from typing_extensions import TypeAlias -from pyarrow import _parquet -from pyarrow._compute import Expression -from pyarrow._fs import FileSystem, SupportedFileSystem -from pyarrow._parquet import ( +from pyarrow import _parquet # type: ignore[unresolved_import] +from pyarrow._compute import Expression # type: ignore[unresolved_import] +from pyarrow._fs import FileSystem, SupportedFileSystem # type: ignore[unresolved_import] +from pyarrow._parquet import ( # type: ignore[unresolved_import] ColumnChunkMetaData, ColumnSchema, FileDecryptionProperties, @@ -29,9 +29,9 @@ from pyarrow._parquet import ( SortingColumn, Statistics, ) -from pyarrow._stubs_typing import FilterTuple, SingleOrList +from pyarrow._stubs_typing import FilterTuple, SingleOrList # type: ignore[unresolved_import] from pyarrow.dataset import ParquetFileFragment, Partitioning -from pyarrow.lib import NativeFile, RecordBatch, Schema, Table +from pyarrow.lib import NativeFile, RecordBatch, Schema, Table # type: ignore[unresolved_import] from typing_extensions import deprecated __all__ = ( diff --git a/python/stubs/parquet/encryption.pyi b/python/stubs/parquet/encryption.pyi index 5a77dae7ef7..daade78e6dd 100644 --- a/python/stubs/parquet/encryption.pyi +++ b/python/stubs/parquet/encryption.pyi @@ -1,4 +1,4 @@ -from pyarrow._parquet_encryption import ( +from pyarrow._parquet_encryption import ( # type: ignore[unresolved_import] CryptoFactory, DecryptionConfiguration, EncryptionConfiguration, diff --git a/python/stubs/substrait.pyi b/python/stubs/substrait.pyi index a56a8a5b40f..004439d4c19 100644 --- a/python/stubs/substrait.pyi +++ b/python/stubs/substrait.pyi @@ -1,4 +1,4 @@ -from pyarrow._substrait import ( +from pyarrow._substrait import ( # type: ignore[unresolved_import] BoundExpressions, SubstraitSchema, deserialize_expressions, diff --git a/python/stubs/types.pyi b/python/stubs/types.pyi index 0cb4f6171d3..c128770d178 100644 --- a/python/stubs/types.pyi +++ b/python/stubs/types.pyi @@ -11,7 +11,7 @@ if sys.version_info >= (3, 10): else: from typing_extensions import TypeAlias -from pyarrow.lib import ( +from pyarrow.lib import ( # type: ignore[unresolved_import] BinaryType, BinaryViewType, BoolType, From a0d5b743c3c56c435f6b06699fc54249070961df Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 24 Jul 2025 22:21:43 +0200 Subject: [PATCH 19/32] fix unsupported-operator --- python/pyarrow/pandas_compat.py | 3 +- .../interchange/test_interchange_spec.py | 6 +- python/pyarrow/tests/parquet/test_basic.py | 1 - .../tests/parquet/test_parquet_file.py | 2 +- python/pyarrow/tests/test_array.py | 3 +- python/pyarrow/tests/test_cython.py | 4 +- python/pyarrow/tests/test_extension_type.py | 4 +- 
python/pyarrow/tests/test_flight.py | 14 +++- python/pyarrow/tests/test_jvm.py | 5 +- python/pyarrow/tests/test_pandas.py | 6 +- python/pyproject.toml | 4 +- python/setup.py | 8 +- python/stubs/_stubs_typing.pyi | 4 +- python/stubs/compute.pyi | 74 +++++++++---------- 14 files changed, 77 insertions(+), 61 deletions(-) diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index f284d411abf..8dcfb282b31 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -828,7 +828,8 @@ def table_to_dataframe( axes = [columns, index] mgr = BlockManager(blocks, axes) if _pandas_api.is_ge_v21(): - df = DataFrame._from_mgr(mgr, mgr.axes) # type: ignore[unresolved-attribute] + # type: ignore[unresolved-attribute] + df = DataFrame._from_mgr(mgr, mgr.axes) else: df = DataFrame(mgr) return df diff --git a/python/pyarrow/tests/interchange/test_interchange_spec.py b/python/pyarrow/tests/interchange/test_interchange_spec.py index 56a424fd57a..68afc0c633b 100644 --- a/python/pyarrow/tests/interchange/test_interchange_spec.py +++ b/python/pyarrow/tests/interchange/test_interchange_spec.py @@ -18,15 +18,15 @@ import ctypes import hypothesis as h import hypothesis.strategies as st - +import pyarrow as pa +import pyarrow.tests.strategies as past import pytest + np = None try: import numpy as np except ImportError: pass -import pyarrow as pa -import pyarrow.tests.strategies as past all_types = st.deferred( diff --git a/python/pyarrow/tests/parquet/test_basic.py b/python/pyarrow/tests/parquet/test_basic.py index 528f8e51683..18381538211 100644 --- a/python/pyarrow/tests/parquet/test_basic.py +++ b/python/pyarrow/tests/parquet/test_basic.py @@ -16,7 +16,6 @@ # under the License. import os -from collections import OrderedDict import io import warnings from shutil import copytree diff --git a/python/pyarrow/tests/parquet/test_parquet_file.py b/python/pyarrow/tests/parquet/test_parquet_file.py index 28f25ac8482..df5b82ad8d9 100644 --- a/python/pyarrow/tests/parquet/test_parquet_file.py +++ b/python/pyarrow/tests/parquet/test_parquet_file.py @@ -408,7 +408,7 @@ def test_parquet_file_hugginface_support(): pytest.skip("fsspec is not installed, skipping Hugging Face test") fake_hf_module = types.ModuleType("huggingface_hub") - fake_hf_module.HfFileSystem = MemoryFileSystem # type: ignore[unresolved-attribute] + fake_hf_module.HfFileSystem = MemoryFileSystem # type: ignore[unresolved-attribute] with mock.patch.dict("sys.modules", {"huggingface_hub": fake_hf_module}): uri = "hf://datasets/apache/arrow/test.parquet" table = pa.table({"a": range(10)}) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index a06e3f76570..a1377d0c839 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -551,7 +551,8 @@ def test_arange(): result = pa.arange(*case) result.validate(full=True) - assert result.equals(pa.array(list(range(*case)), type=pa.int64())) # type: ignore[no-matching-overload] + # type: ignore[no-matching-overload] + assert result.equals(pa.array(list(range(*case)), type=pa.int64())) # Validate memory_pool keyword argument result = pa.arange(-1, 101, memory_pool=pa.default_memory_pool()) diff --git a/python/pyarrow/tests/test_cython.py b/python/pyarrow/tests/test_cython.py index fdacb16be29..c9c35087839 100644 --- a/python/pyarrow/tests/test_cython.py +++ b/python/pyarrow/tests/test_cython.py @@ -191,7 +191,7 @@ def test_visit_strings(tmpdir): strings = ['a', 'b', 'c'] visited = [] - 
mod._visit_strings(strings, visited.append) # type: ignore[unresolved-attribute] + mod._visit_strings(strings, visited.append) # type: ignore[unresolved-attribute] assert visited == strings @@ -200,4 +200,4 @@ def raise_on_b(s): if s == 'b': raise ValueError('wtf') - mod._visit_strings(strings, raise_on_b) # type: ignore[unresolved-attribute] + mod._visit_strings(strings, raise_on_b) # type: ignore[unresolved-attribute] diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 1a851611b14..a3847c44e4f 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1353,11 +1353,11 @@ def test_cpp_extension_in_python(tmpdir): sys.path.insert(0, str(tmpdir)) mod = __import__('extensions') - uuid_type = mod._make_uuid_type() # type: ignore[unresolved-attribute] + uuid_type = mod._make_uuid_type() # type: ignore[unresolved-attribute] assert uuid_type.extension_name == "example-uuid" assert uuid_type.storage_type == pa.binary(16) - array = mod._make_uuid_array() # type: ignore[unresolved-attribute] + array = mod._make_uuid_array() # type: ignore[unresolved-attribute] assert array.type == uuid_type assert array.to_pylist() == [b'abcdefghijklmno0', b'0onmlkjihgfedcba'] assert array[0].as_py() == b'abcdefghijklmno0' diff --git a/python/pyarrow/tests/test_flight.py b/python/pyarrow/tests/test_flight.py index 0c0bc7089b9..b33ae005331 100644 --- a/python/pyarrow/tests/test_flight.py +++ b/python/pyarrow/tests/test_flight.py @@ -53,29 +53,39 @@ class context_like(object): def __enter__(self): return self + def __exit__(self, exc_type, exc_value, traceback): pass flight = None + class MockContextManager: def __init__(self, *args, **kwargs): - pass + pass + def __enter__(self): return self + def __exit__(self, exc_type, exc_val, exc_tb): pass + class FlightServerBase(MockContextManager): def serve(self): pass + class FlightClient(MockContextManager): def get_flight_info(self, **kwargs): pass + def do_action(self, **kwargs): pass + def do_get(self, **kwargs): pass + def do_put(self, **kwargs): pass + def close(self): pass ServerAuthHandler, ClientAuthHandler = object, object @@ -1770,7 +1780,7 @@ def test_flight_do_put_limit(): with pytest.raises(flight.FlightWriteSizeExceededError, match="exceeded soft limit") as excinfo: writer.write_batch(large_batch) - assert excinfo.value.limit == 4096 # type: ignore[unresolved-attribute] + assert excinfo.value.limit == 4096 # type: ignore[unresolved-attribute] smaller_batches = [ large_batch.slice(0, 384), large_batch.slice(384), diff --git a/python/pyarrow/tests/test_jvm.py b/python/pyarrow/tests/test_jvm.py index b048fcea9ee..876c05d740a 100644 --- a/python/pyarrow/tests/test_jvm.py +++ b/python/pyarrow/tests/test_jvm.py @@ -16,6 +16,7 @@ # under the License. 
from json import dumps as json_dumps +from json import loads as json_loads import os import pyarrow as pa import pyarrow.jvm as pa_jvm @@ -171,7 +172,7 @@ def test_jvm_types(root_allocator, pa_type, jvm_spec, nullable): spec = { 'name': 'field_name', 'nullable': nullable, - 'type': json.loads(jvm_spec), + 'type': json_loads(jvm_spec), # TODO: This needs to be set for complex types 'children': [] } @@ -375,7 +376,7 @@ def test_jvm_record_batch(root_allocator, pa_type, py_data, jvm_type, spec = { 'name': 'field_name', 'nullable': False, - 'type': json.loads(jvm_spec), + 'type': json_loads(jvm_spec), # TODO: This needs to be set for complex types 'children': [] } diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 535b95515dc..1bd5b58025d 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -4428,7 +4428,8 @@ def test_convert_to_extension_array(monkeypatch): integer._IntegerDtype, "__from_arrow__") else: monkeypatch.delattr( - pd.core.arrays.integer.NumericDtype, "__from_arrow__") # type: ignore[unresolved-attribute] + # type: ignore[unresolved-attribute] + pd.core.arrays.integer.NumericDtype, "__from_arrow__") # Int64Dtype has no __from_arrow__ -> use normal conversion result = table.to_pandas() assert len(_get_mgr(result).blocks) == 1 @@ -4474,7 +4475,8 @@ def test_conversion_extensiontype_to_extensionarray(monkeypatch): integer._IntegerDtype, "__from_arrow__") else: monkeypatch.delattr( - pd.core.arrays.integer.NumericDtype, "__from_arrow__") # type: ignore[unresolved-attribute] + # type: ignore[unresolved-attribute] + pd.core.arrays.integer.NumericDtype, "__from_arrow__") result = arr.to_pandas() assert _get_mgr(result).blocks[0].values.dtype == np.dtype("int64") diff --git a/python/pyproject.toml b/python/pyproject.toml index 5c0580a0510..a35a73911c2 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -115,6 +115,6 @@ unresolved-attribute = "ignore" unresolved-global = "ignore" unresolved-import = "ignore" unresolved-reference = "ignore" -unsupported-operator = "ignore" +#unsupported-operator = "ignore" missing-argument = "ignore" -call-non-callable = "ignore" +#call-non-callable = "ignore" diff --git a/python/setup.py b/python/setup.py index 4e87ecfbfcc..d037b82f4ad 100755 --- a/python/setup.py +++ b/python/setup.py @@ -44,11 +44,12 @@ # as here it may be set to the host not target platform is_emscripten = ( sysconfig.get_config_var("SOABI") - and sysconfig.get_config_var("SOABI").find("emscripten") != -1 # type: ignore[possibly-unbound] + # type: ignore[possibly-unbound] + and sysconfig.get_config_var("SOABI").find("emscripten") != -1 ) -if Cython.__version__ < '3': # type: ignore[unresolved-attribute] +if Cython.__version__ < '3': # type: ignore[unresolved-attribute] raise Exception( 'Please update your Cython version. 
Supported Cython >= 3') @@ -254,7 +255,8 @@ def _run_cmake(self): if os.path.isfile('CMakeCache.txt'): cachefile = open('CMakeCache.txt', 'r') cachedir = re.search('CMAKE_CACHEFILE_DIR:INTERNAL=(.*)', - cachefile.read()).group(1) # type: ignore[possibly-unbound-attribute] + # type: ignore[possibly-unbound-attribute] + cachefile.read()).group(1) cachefile.close() if (cachedir != build_temp): build_base = pjoin(saved_cwd, build_cmd.build_base) diff --git a/python/stubs/_stubs_typing.pyi b/python/stubs/_stubs_typing.pyi index 73bb9f38a95..3529290ff17 100644 --- a/python/stubs/_stubs_typing.pyi +++ b/python/stubs/_stubs_typing.pyi @@ -31,8 +31,8 @@ Compression: TypeAlias = Literal[ ] NullEncoding: TypeAlias = Literal["mask", "encode"] NullSelectionBehavior: TypeAlias = Literal["drop", "emit_null"] -Mask: TypeAlias = Sequence[bool | None] | NDArray[np.bool_] | BooleanArray -Indices: TypeAlias = Sequence[int] | NDArray[np.integer[Any]] | IntegerArray +Mask: TypeAlias = Union[Sequence[bool | None], NDArray[np.bool_], BooleanArray] +Indices: TypeAlias = Union[Sequence[int], NDArray[np.integer[Any]], IntegerArray] PyScalar: TypeAlias = Union[ bool, int, float, Decimal, str, bytes, dt.date, dt.datetime, dt.time, dt.timedelta ] diff --git a/python/stubs/compute.pyi b/python/stubs/compute.pyi index 775b7fa504e..4788837eeb9 100644 --- a/python/stubs/compute.pyi +++ b/python/stubs/compute.pyi @@ -171,54 +171,54 @@ _Scalar_CoT = TypeVar("_Scalar_CoT", bound=lib.Scalar, covariant=True) _ScalarT = TypeVar("_ScalarT", bound=lib.Scalar) _ArrayT = TypeVar("_ArrayT", bound=lib.Array | lib.ChunkedArray) _ScalarOrArrayT = TypeVar("_ScalarOrArrayT", bound=lib.Array | lib.Scalar | lib.ChunkedArray) -ArrayOrChunkedArray: TypeAlias = lib.Array[_Scalar_CoT] | lib.ChunkedArray[_Scalar_CoT] +ArrayOrChunkedArray: TypeAlias = Union[lib.Array[_Scalar_CoT], lib.ChunkedArray[_Scalar_CoT]] ScalarOrArray: TypeAlias = ArrayOrChunkedArray[_Scalar_CoT] | _Scalar_CoT -SignedIntegerScalar: TypeAlias = ( - lib.Scalar[lib.Int8Type] - | lib.Scalar[lib.Int16Type] - | lib.Scalar[lib.Int32Type] - | lib.Scalar[lib.Int64Type] -) -UnsignedIntegerScalar: TypeAlias = ( - lib.Scalar[lib.UInt8Type] - | lib.Scalar[lib.UInt16Type] - | lib.Scalar[lib.Uint32Type] - | lib.Scalar[lib.UInt64Type] -) +SignedIntegerScalar: TypeAlias = Union[ + lib.Scalar[lib.Int8Type], + lib.Scalar[lib.Int16Type], + lib.Scalar[lib.Int32Type], + lib.Scalar[lib.Int64Type], +] +UnsignedIntegerScalar: TypeAlias = Union[ + lib.Scalar[lib.UInt8Type], + lib.Scalar[lib.UInt16Type], + lib.Scalar[lib.Uint32Type], + lib.Scalar[lib.UInt64Type], +] IntegerScalar: TypeAlias = SignedIntegerScalar | UnsignedIntegerScalar -FloatScalar: TypeAlias = ( - lib.Scalar[lib.Float16Type] | lib.Scalar[lib.Float32Type] | lib.Scalar[lib.Float64Type] -) -DecimalScalar: TypeAlias = ( - lib.Scalar[lib.Decimal32Type] - | lib.Scalar[lib.Decimal64Type] - | lib.Scalar[lib.Decimal128Type] - | lib.Scalar[lib.Decimal256Type] -) +FloatScalar: TypeAlias = Union[ + lib.Scalar[lib.Float16Type], lib.Scalar[lib.Float32Type], lib.Scalar[lib.Float64Type], +] +DecimalScalar: TypeAlias = Union[ + lib.Scalar[lib.Decimal32Type], + lib.Scalar[lib.Decimal64Type], + lib.Scalar[lib.Decimal128Type], + lib.Scalar[lib.Decimal256Type], +] NonFloatNumericScalar: TypeAlias = IntegerScalar | DecimalScalar NumericScalar: TypeAlias = IntegerScalar | FloatScalar | DecimalScalar -BinaryScalar: TypeAlias = ( - lib.Scalar[lib.BinaryType] - | lib.Scalar[lib.LargeBinaryType] - | lib.Scalar[lib.FixedSizeBinaryType] -) -StringScalar: 
TypeAlias = lib.Scalar[lib.StringType] | lib.Scalar[lib.LargeStringType] +BinaryScalar: TypeAlias = Union[ + lib.Scalar[lib.BinaryType], + lib.Scalar[lib.LargeBinaryType], + lib.Scalar[lib.FixedSizeBinaryType], +] +StringScalar: TypeAlias = Union[lib.Scalar[lib.StringType], lib.Scalar[lib.LargeStringType]] StringOrBinaryScalar: TypeAlias = StringScalar | BinaryScalar _ListScalar: TypeAlias = lib.ListViewScalar[_DataTypeT] | lib.FixedSizeListScalar[_DataTypeT, Any] _LargeListScalar: TypeAlias = lib.LargeListScalar[_DataTypeT] | lib.LargeListViewScalar[_DataTypeT] ListScalar: TypeAlias = ( lib.ListScalar[_DataTypeT] | _ListScalar[_DataTypeT] | _LargeListScalar[_DataTypeT] ) -TemporalScalar: TypeAlias = ( - lib.Date32Scalar - | lib.Date64Scalar - | lib.Time32Scalar[Any] - | lib.Time64Scalar[Any] - | lib.TimestampScalar[Any] - | lib.DurationScalar[Any] - | lib.MonthDayNanoIntervalScalar -) +TemporalScalar: TypeAlias = Union[ + lib.Date32Scalar, + lib.Date64Scalar, + lib.Time32Scalar[Any], + lib.Time64Scalar[Any], + lib.TimestampScalar[Any], + lib.DurationScalar[Any], + lib.MonthDayNanoIntervalScalar +] NumericOrDurationScalar: TypeAlias = NumericScalar | lib.DurationScalar NumericOrTemporalScalar: TypeAlias = NumericScalar | TemporalScalar From eff8ada65b969798e4c359a1159c4221714e0354 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 24 Jul 2025 22:26:39 +0200 Subject: [PATCH 20/32] Fix unresolved-reference --- python/pyproject.toml | 2 +- python/stubs/__lib_pxi/types.pyi | 1 - python/stubs/_fs.pyi | 2 -- python/stubs/compute.pyi | 6 +++--- 4 files changed, 4 insertions(+), 7 deletions(-) diff --git a/python/pyproject.toml b/python/pyproject.toml index a35a73911c2..85f44572ae8 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -114,7 +114,7 @@ unknown-argument = "ignore" unresolved-attribute = "ignore" unresolved-global = "ignore" unresolved-import = "ignore" -unresolved-reference = "ignore" +#unresolved-reference = "ignore" #unsupported-operator = "ignore" missing-argument = "ignore" #call-non-callable = "ignore" diff --git a/python/stubs/__lib_pxi/types.pyi b/python/stubs/__lib_pxi/types.pyi index d38269ef341..f1b8d540e31 100644 --- a/python/stubs/__lib_pxi/types.pyi +++ b/python/stubs/__lib_pxi/types.pyi @@ -29,7 +29,6 @@ from .io import Buffer from .scalar import ExtensionScalar _AsPyType = TypeVar("_AsPyType") -_DataTypeT = TypeVar("_DataTypeT", bound=DataType) class _Weakrefable: ... class _Metadata(_Weakrefable): ... 
diff --git a/python/stubs/_fs.pyi b/python/stubs/_fs.pyi index d3b194e3ded..9f6e28dcf0f 100644 --- a/python/stubs/_fs.pyi +++ b/python/stubs/_fs.pyi @@ -19,8 +19,6 @@ from fsspec import AbstractFileSystem # type: ignore[import-untyped] from .lib import NativeFile, _Weakrefable -SupportedFileSystem: TypeAlias = Union[AbstractFileSystem, FileSystem] - class FileType(enum.IntFlag): NotFound = enum.auto() Unknown = enum.auto() diff --git a/python/stubs/compute.pyi b/python/stubs/compute.pyi index 4788837eeb9..373155d6e4e 100644 --- a/python/stubs/compute.pyi +++ b/python/stubs/compute.pyi @@ -1,5 +1,5 @@ # ruff: noqa: I001 -from typing import Literal, TypeAlias, TypeVar, overload, Any, Iterable, ParamSpec, Sequence # type: ignore[unresolved_import] +from typing import Literal, TypeAlias, TypeVar, overload, Any, Iterable, ParamSpec, Sequence, Union # type: ignore[unresolved_import] from collections.abc import Callable # Option classes @@ -205,8 +205,8 @@ BinaryScalar: TypeAlias = Union[ ] StringScalar: TypeAlias = Union[lib.Scalar[lib.StringType], lib.Scalar[lib.LargeStringType]] StringOrBinaryScalar: TypeAlias = StringScalar | BinaryScalar -_ListScalar: TypeAlias = lib.ListViewScalar[_DataTypeT] | lib.FixedSizeListScalar[_DataTypeT, Any] -_LargeListScalar: TypeAlias = lib.LargeListScalar[_DataTypeT] | lib.LargeListViewScalar[_DataTypeT] +_ListScalar: TypeAlias = Union[lib.ListViewScalar[_DataTypeT], lib.FixedSizeListScalar[_DataTypeT, Any]] +_LargeListScalar: TypeAlias = Union[lib.LargeListScalar[_DataTypeT], lib.LargeListViewScalar[_DataTypeT]] ListScalar: TypeAlias = ( lib.ListScalar[_DataTypeT] | _ListScalar[_DataTypeT] | _LargeListScalar[_DataTypeT] ) From 1b3b39bca766eb433c4287e6b2aa5b509d57213c Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 24 Jul 2025 22:34:38 +0200 Subject: [PATCH 21/32] Fix not-iterable --- python/pyarrow/interchange/from_dataframe.py | 4 ++-- python/pyproject.toml | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/python/pyarrow/interchange/from_dataframe.py b/python/pyarrow/interchange/from_dataframe.py index fcaec41e3dc..47ddbb885ff 100644 --- a/python/pyarrow/interchange/from_dataframe.py +++ b/python/pyarrow/interchange/from_dataframe.py @@ -450,7 +450,7 @@ def buffers_to_array( def validity_buffer_from_mask( validity_buff: BufferObject, validity_dtype: Dtype, - describe_null: ColumnNullType, + describe_null: Tuple[ColumnNullType, Any], length: int, offset: int = 0, allow_copy: bool = True, @@ -529,7 +529,7 @@ def validity_buffer_from_mask( def validity_buffer_nan_sentinel( data_pa_buffer: BufferObject, data_type: Dtype, - describe_null: ColumnNullType, + describe_null: Tuple[ColumnNullType, Any], length: int, offset: int = 0, allow_copy: bool = True, diff --git a/python/pyproject.toml b/python/pyproject.toml index 85f44572ae8..17076cdd6bc 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -105,14 +105,14 @@ invalid-context-manager = "ignore" invalid-return-type = "ignore" invalid-type-form = "ignore" no-matching-overload = "ignore" -non-subscriptable = "ignore" -not-iterable = "ignore" +#non-subscriptable = "ignore" +#not-iterable = "ignore" possibly-unbound-attribute = "ignore" possibly-unbound-import = "ignore" too-many-positional-arguments = "ignore" -unknown-argument = "ignore" +#unknown-argument = "ignore" unresolved-attribute = "ignore" -unresolved-global = "ignore" +#unresolved-global = "ignore" unresolved-import = "ignore" #unresolved-reference = "ignore" #unsupported-operator = "ignore" From 
e2b0a7edefe763a025b85830c0858ef424b2bbdb Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Thu, 24 Jul 2025 23:31:28 +0200 Subject: [PATCH 22/32] Fix no-matching-overload --- python/benchmarks/parquet.py | 2 +- python/pyarrow/__init__.py | 2 +- python/pyarrow/_compute.pyx | 2 +- python/pyarrow/_dataset.pyx | 4 ++-- python/pyarrow/_dataset_parquet.pyx | 2 +- python/pyarrow/_substrait.pyx | 2 +- python/pyarrow/acero.py | 2 +- python/pyarrow/lib.pyx | 2 +- python/pyarrow/pandas_compat.py | 2 +- python/pyarrow/tests/parquet/common.py | 2 +- python/pyarrow/tests/parquet/test_basic.py | 6 +++--- .../tests/parquet/test_compliant_nested_type.py | 4 ++-- python/pyarrow/tests/parquet/test_data_types.py | 6 +++--- python/pyarrow/tests/parquet/test_dataset.py | 6 +++--- python/pyarrow/tests/parquet/test_datetime.py | 6 +++--- python/pyarrow/tests/parquet/test_encryption.py | 3 +-- python/pyarrow/tests/parquet/test_metadata.py | 6 +++--- python/pyarrow/tests/parquet/test_pandas.py | 4 ++-- python/pyarrow/tests/parquet/test_parquet_file.py | 2 +- python/pyarrow/tests/parquet/test_parquet_writer.py | 2 +- python/pyarrow/tests/strategies.py | 10 +++++----- python/pyarrow/tests/test_acero.py | 2 +- python/pyarrow/tests/test_array.py | 6 +++--- python/pyarrow/tests/test_cffi.py | 4 ++-- python/pyarrow/tests/test_compute.py | 6 +++--- python/pyarrow/tests/test_convert_builtin.py | 2 +- python/pyarrow/tests/test_dataset.py | 8 ++++---- python/pyarrow/tests/test_dataset_encryption.py | 3 +-- python/pyarrow/tests/test_extension_type.py | 4 ++-- python/pyarrow/tests/test_feather.py | 2 +- python/pyarrow/tests/test_flight.py | 2 +- python/pyarrow/tests/test_io.py | 2 +- python/pyarrow/tests/test_ipc.py | 2 +- python/pyarrow/tests/test_json.py | 2 +- python/pyarrow/tests/test_pandas.py | 2 +- python/pyarrow/tests/test_scalars.py | 2 +- python/pyarrow/tests/test_schema.py | 2 +- python/pyarrow/tests/test_sparse_tensor.py | 4 ++-- python/pyarrow/tests/test_substrait.py | 2 +- python/pyarrow/tests/test_table.py | 2 +- python/pyarrow/tests/test_types.py | 2 +- python/pyarrow/tests/test_udf.py | 4 ++-- python/pyarrow/tests/util.py | 3 ++- python/pyproject.toml | 8 ++++---- python/stubs/_csv.pyi | 8 ++++---- 45 files changed, 80 insertions(+), 81 deletions(-) diff --git a/python/benchmarks/parquet.py b/python/benchmarks/parquet.py index 44b27ff0f46..7dbd6adcc38 100644 --- a/python/benchmarks/parquet.py +++ b/python/benchmarks/parquet.py @@ -21,7 +21,7 @@ try: import pyarrow.parquet as pq except ImportError: - pq = None + pass from pyarrow.tests.util import rands diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 45aa2b619f8..1170db23424 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -56,7 +56,7 @@ def parse_git(root, **kwargs): __version__ = setuptools_scm.get_version('../', parse=parse_git) except ImportError: - __version__ = None + __version__ = None # type: ignore[invalid-assignment] import pyarrow.lib as _lib # type: ignore[unresolved_import] from pyarrow.lib import (BuildInfo, CppBuildInfo, RuntimeInfo, set_timezone_db_path, # type: ignore[unresolved_import] diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index 59fd775b5ac..ad0b116fdc6 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -36,7 +36,7 @@ import inspect try: import numpy as np except ImportError: - np = None + pass import warnings diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index d279881d15f..abfd011fa21 100644 --- 
a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -42,7 +42,7 @@ from pyarrow._json cimport ReadOptions as JsonReadOptions try: import pyarrow.substrait as pa_substrait except ImportError: - pa_substrait = None + pass _DEFAULT_BATCH_SIZE = 2**17 @@ -89,7 +89,7 @@ def _get_parquet_classes(): try: import pyarrow._dataset_parquet as _dataset_pq except ImportError: - _dataset_pq = None + pass def _get_parquet_symbol(name): diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx index 9405b5d8c54..e17867426dc 100644 --- a/python/pyarrow/_dataset_parquet.pyx +++ b/python/pyarrow/_dataset_parquet.pyx @@ -59,7 +59,7 @@ try: ) parquet_encryption_enabled = True except ImportError: - parquet_encryption_enabled = False + pass cdef Expression _true = Expression._scalar(True) diff --git a/python/pyarrow/_substrait.pyx b/python/pyarrow/_substrait.pyx index d9359c8e77d..b317ba1e639 100644 --- a/python/pyarrow/_substrait.pyx +++ b/python/pyarrow/_substrait.pyx @@ -29,7 +29,7 @@ from pyarrow.includes.libarrow_substrait cimport * try: import substrait as py_substrait except ImportError: - py_substrait = None + pass else: import substrait.proto # no-cython-lint diff --git a/python/pyarrow/acero.py b/python/pyarrow/acero.py index dcead124d31..e56fe0000ea 100644 --- a/python/pyarrow/acero.py +++ b/python/pyarrow/acero.py @@ -53,7 +53,7 @@ class Dataset: class InMemoryDataset: pass - ds = DatasetModuleStub + ds = DatasetModuleStub # type: ignore[invalid-assignment] def _dataset_to_decl(dataset, use_threads=True, implicit_ordering=False): diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx index 5dca6fd8d2e..2da25a570ae 100644 --- a/python/pyarrow/lib.pyx +++ b/python/pyarrow/lib.pyx @@ -24,7 +24,7 @@ import decimal as _pydecimal try: import numpy as np except ImportError: - np = None + pass import os import sys diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index 8dcfb282b31..db81b40d334 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -33,7 +33,7 @@ try: import numpy as np except ImportError: - np = None # type: ignore[assignment] + pass import pyarrow as pa from pyarrow.lib import _pandas_api, frombytes, is_threading_enabled # type: ignore[unresolved_import] # noqa diff --git a/python/pyarrow/tests/parquet/common.py b/python/pyarrow/tests/parquet/common.py index 7351a4c3e94..28e04abf1c5 100644 --- a/python/pyarrow/tests/parquet/common.py +++ b/python/pyarrow/tests/parquet/common.py @@ -20,7 +20,7 @@ try: import numpy as np except ImportError: - np = None # type: ignore[assignment] + pass import pyarrow as pa from pyarrow.tests import util diff --git a/python/pyarrow/tests/parquet/test_basic.py b/python/pyarrow/tests/parquet/test_basic.py index 18381538211..7a0dfcde270 100644 --- a/python/pyarrow/tests/parquet/test_basic.py +++ b/python/pyarrow/tests/parquet/test_basic.py @@ -33,7 +33,7 @@ import pyarrow.parquet as pq from pyarrow.tests.parquet.common import _read_table, _write_table except ImportError: - pq = None + pass try: @@ -43,12 +43,12 @@ from pyarrow.tests.pandas_examples import dataframe_with_lists from pyarrow.tests.parquet.common import alltypes_sample except ImportError: - pd = tm = None # type: ignore[assignment] + pass try: import numpy as np except ImportError: - np = None # type: ignore[assignment] + pass # Marks all of the tests in this module # Ignore these with pytest ... 
-m 'not parquet' diff --git a/python/pyarrow/tests/parquet/test_compliant_nested_type.py b/python/pyarrow/tests/parquet/test_compliant_nested_type.py index 8a64cd0cab7..3e6543a0538 100644 --- a/python/pyarrow/tests/parquet/test_compliant_nested_type.py +++ b/python/pyarrow/tests/parquet/test_compliant_nested_type.py @@ -24,7 +24,7 @@ from pyarrow.tests.parquet.common import (_read_table, _check_roundtrip) except ImportError: - pq = None + pass try: import pandas as pd @@ -32,7 +32,7 @@ from pyarrow.tests.parquet.common import _roundtrip_pandas_dataframe except ImportError: - pd = tm = None # type: ignore[assignment] + pass # Marks all of the tests in this module diff --git a/python/pyarrow/tests/parquet/test_data_types.py b/python/pyarrow/tests/parquet/test_data_types.py index 66e12d11b21..b48627d679d 100644 --- a/python/pyarrow/tests/parquet/test_data_types.py +++ b/python/pyarrow/tests/parquet/test_data_types.py @@ -22,7 +22,7 @@ try: import numpy as np except ImportError: - np = None # type: ignore[assignment] + pass import pytest import pyarrow as pa @@ -33,7 +33,7 @@ import pyarrow.parquet as pq from pyarrow.tests.parquet.common import _read_table, _write_table except ImportError: - pq = None + pass try: @@ -44,7 +44,7 @@ dataframe_with_lists) from pyarrow.tests.parquet.common import alltypes_sample except ImportError: - pd = tm = None # type: ignore[assignment] + pass # Marks all of the tests in this module diff --git a/python/pyarrow/tests/parquet/test_dataset.py b/python/pyarrow/tests/parquet/test_dataset.py index a162006dc0c..1e6897f703d 100644 --- a/python/pyarrow/tests/parquet/test_dataset.py +++ b/python/pyarrow/tests/parquet/test_dataset.py @@ -24,7 +24,7 @@ try: import numpy as np except ImportError: - np = None # type: ignore[assignment] + pass import pytest import unittest.mock as mock @@ -40,7 +40,7 @@ from pyarrow.tests.parquet.common import ( _read_table, _test_dataframe, _write_table) except ImportError: - pq = None + pass try: @@ -48,7 +48,7 @@ import pandas.testing as tm except ImportError: - pd = tm = None # type: ignore[assignment] + pass # Marks all of the tests in this module diff --git a/python/pyarrow/tests/parquet/test_datetime.py b/python/pyarrow/tests/parquet/test_datetime.py index 62904937eb5..7a95debca3f 100644 --- a/python/pyarrow/tests/parquet/test_datetime.py +++ b/python/pyarrow/tests/parquet/test_datetime.py @@ -22,7 +22,7 @@ try: import numpy as np except ImportError: - np = None # type: ignore[assignment] + pass import pytest import pyarrow as pa @@ -32,7 +32,7 @@ import pyarrow.parquet as pq from pyarrow.tests.parquet.common import _read_table, _write_table except ImportError: - pq = None + pass try: @@ -41,7 +41,7 @@ from pyarrow.tests.parquet.common import _roundtrip_pandas_dataframe except ImportError: - pd = tm = None # type: ignore[assignment] + pass # Marks all of the tests in this module diff --git a/python/pyarrow/tests/parquet/test_encryption.py b/python/pyarrow/tests/parquet/test_encryption.py index a11a4935a1c..5815d65c8d8 100644 --- a/python/pyarrow/tests/parquet/test_encryption.py +++ b/python/pyarrow/tests/parquet/test_encryption.py @@ -22,8 +22,7 @@ import pyarrow.parquet as pq import pyarrow.parquet.encryption as pe except ImportError: - pq = None - pe = None + pass else: from pyarrow.tests.parquet.encryption import ( InMemoryKmsClient, verify_file_encrypted) diff --git a/python/pyarrow/tests/parquet/test_metadata.py b/python/pyarrow/tests/parquet/test_metadata.py index d8fafde185f..85823b5cfa2 100644 --- 
a/python/pyarrow/tests/parquet/test_metadata.py +++ b/python/pyarrow/tests/parquet/test_metadata.py @@ -23,7 +23,7 @@ try: import numpy as np except ImportError: - np = None # type: ignore[assignment] + pass import pytest import pyarrow as pa @@ -35,7 +35,7 @@ import pyarrow.parquet as pq from pyarrow.tests.parquet.common import _write_table except ImportError: - pq = None + pass try: @@ -44,7 +44,7 @@ from pyarrow.tests.parquet.common import alltypes_sample except ImportError: - pd = tm = None # type: ignore[assignment] + pass # Marks all of the tests in this module diff --git a/python/pyarrow/tests/parquet/test_pandas.py b/python/pyarrow/tests/parquet/test_pandas.py index 9b9e7c4e48e..34d7c1c750a 100644 --- a/python/pyarrow/tests/parquet/test_pandas.py +++ b/python/pyarrow/tests/parquet/test_pandas.py @@ -21,7 +21,7 @@ try: import numpy as np except ImportError: - np = None # type: ignore[assignment] + pss import pytest import pyarrow as pa @@ -44,7 +44,7 @@ from pyarrow.tests.parquet.common import (_roundtrip_pandas_dataframe, alltypes_sample) except ImportError: - pd = tm = None # type: ignore[assignment] + pass # Marks all of the tests in this module diff --git a/python/pyarrow/tests/parquet/test_parquet_file.py b/python/pyarrow/tests/parquet/test_parquet_file.py index df5b82ad8d9..6864fe08dca 100644 --- a/python/pyarrow/tests/parquet/test_parquet_file.py +++ b/python/pyarrow/tests/parquet/test_parquet_file.py @@ -38,7 +38,7 @@ from pyarrow.tests.parquet.common import alltypes_sample except ImportError: - pd = tm = None # type: ignore[assignment] + pass # Marks all of the tests in this module diff --git a/python/pyarrow/tests/parquet/test_parquet_writer.py b/python/pyarrow/tests/parquet/test_parquet_writer.py index 8f163dfc0b5..d82b6c6da8b 100644 --- a/python/pyarrow/tests/parquet/test_parquet_writer.py +++ b/python/pyarrow/tests/parquet/test_parquet_writer.py @@ -33,7 +33,7 @@ import pandas.testing as tm except ImportError: - pd = tm = None # type: ignore[assignment] + pass # Marks all of the tests in this module diff --git a/python/pyarrow/tests/strategies.py b/python/pyarrow/tests/strategies.py index 243815c59f7..07ebaa771f1 100644 --- a/python/pyarrow/tests/strategies.py +++ b/python/pyarrow/tests/strategies.py @@ -24,24 +24,24 @@ try: import hypothesis.extra.numpy as npst except ImportError: - npst = None # type: ignore[assignment] + pass try: import hypothesis.extra.pytz as tzst except ImportError: - tzst = None # type: ignore[assignment] + pass try: import zoneinfo except ImportError: - zoneinfo = None # type: ignore[assignment] + pass if sys.platform == 'win32': try: import tzdata # noqa:F401 except ImportError: - zoneinfo = None + pass try: import numpy as np except ImportError: - np = None # type: ignore[assignment] + pass import pyarrow as pa diff --git a/python/pyarrow/tests/test_acero.py b/python/pyarrow/tests/test_acero.py index cb97e3849fd..8dba7471b49 100644 --- a/python/pyarrow/tests/test_acero.py +++ b/python/pyarrow/tests/test_acero.py @@ -39,7 +39,7 @@ import pyarrow.dataset as ds from pyarrow.acero import ScanNodeOptions except ImportError: - ds = None + pass pytestmark = pytest.mark.acero diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index a1377d0c839..92db9fc177a 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -30,7 +30,7 @@ try: import numpy as np except ImportError: - np = None # type: ignore[assignment] + pass import pyarrow as pa import pyarrow.tests.strategies as past @@ 
-551,8 +551,8 @@ def test_arange(): result = pa.arange(*case) result.validate(full=True) - # type: ignore[no-matching-overload] - assert result.equals(pa.array(list(range(*case)), type=pa.int64())) + assert result.equals(pa.array(list(range(*case)), type=pa.int64())) \ + # type: ignore[no-matching-overload] # Validate memory_pool keyword argument result = pa.arange(-1, 101, memory_pool=pa.default_memory_pool()) diff --git a/python/pyarrow/tests/test_cffi.py b/python/pyarrow/tests/test_cffi.py index 2d0ff8b45f1..60f3a5621b9 100644 --- a/python/pyarrow/tests/test_cffi.py +++ b/python/pyarrow/tests/test_cffi.py @@ -24,7 +24,7 @@ try: from pyarrow.cffi import ffi except ImportError: - ffi = None # type: ignore[assignment] + pass import pytest @@ -32,7 +32,7 @@ import pandas as pd import pandas.testing as tm except ImportError: - pd = tm = None # type: ignore[assignment] + pass needs_cffi = pytest.mark.skipif(ffi is None, diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 4ab0e632134..3737fc89025 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -31,12 +31,12 @@ try: import numpy as np except ImportError: - np = None # type: ignore[assignment] + pass try: import pandas as pd except ImportError: - pd = None # type: ignore[assignment] + pass import pyarrow as pa import pyarrow.compute as pc @@ -45,7 +45,7 @@ try: import pyarrow.substrait as pas except ImportError: - pas = None + pass exported_functions = [ func for (name, func) in sorted(pc.__dict__.items()) diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index 468bddf58cb..a420af18864 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -27,7 +27,7 @@ try: import numpy as np except ImportError: - np = None # type: ignore[assignment] + pass from pyarrow.pandas_compat import _pandas_api # noqa import pyarrow as pa diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index d9a4d3df207..b22f423cad8 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -32,7 +32,7 @@ try: import numpy as np except ImportError: - np = None # type: ignore[assignment] + pass import pytest import pyarrow as pa @@ -49,17 +49,17 @@ try: import pandas as pd except ImportError: - pd = None # type: ignore[assignment] + pass try: import pyarrow.dataset as ds except ImportError: - ds = None + pass try: import pyarrow.parquet as pq except ImportError: - pq = None + pass # Marks all of the tests in this module # Ignore these with pytest ... 
-m 'not dataset' diff --git a/python/pyarrow/tests/test_dataset_encryption.py b/python/pyarrow/tests/test_dataset_encryption.py index eb79121b1cd..11d2e1f367a 100644 --- a/python/pyarrow/tests/test_dataset_encryption.py +++ b/python/pyarrow/tests/test_dataset_encryption.py @@ -29,8 +29,7 @@ import pyarrow.parquet as pq import pyarrow.dataset as ds except ImportError: - pq = None - ds = None + pass try: from pyarrow.tests.parquet.encryption import InMemoryKmsClient diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index a3847c44e4f..c1e5db238ad 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -27,7 +27,7 @@ try: import numpy as np except ImportError: - np = None # type: ignore[assignment] + pass import pyarrow as pa from pyarrow.vendored.version import Version @@ -1882,7 +1882,7 @@ def test_bool8_from_numpy_conversion(): ValueError, match="Cannot convert 0-D array to bool8 array", ): - pa.Bool8Array.from_numpy(np.bool_()) + pa.Bool8Array.from_numpy(np.bool_()) # type: ignore[no-matching-overload] # must use compatible storage type with pytest.raises( diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py index 8235260f468..6b35822017b 100644 --- a/python/pyarrow/tests/test_feather.py +++ b/python/pyarrow/tests/test_feather.py @@ -26,7 +26,7 @@ try: import numpy as np except ImportError: - np = None # type: ignore[assignment] + pass import pyarrow as pa import pyarrow.tests.strategies as past diff --git a/python/pyarrow/tests/test_flight.py b/python/pyarrow/tests/test_flight.py index b33ae005331..4f4c5f20e5d 100644 --- a/python/pyarrow/tests/test_flight.py +++ b/python/pyarrow/tests/test_flight.py @@ -33,7 +33,7 @@ try: import numpy as np except ImportError: - np = None # type: ignore[assignment] + pass import pytest import pyarrow as pa diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py index b1ec7674f87..43fd0e1ac0e 100644 --- a/python/pyarrow/tests/test_io.py +++ b/python/pyarrow/tests/test_io.py @@ -33,7 +33,7 @@ try: import numpy as np except ImportError: - np = None # type: ignore[assignment] + pass from pyarrow.util import guid from pyarrow import Codec diff --git a/python/pyarrow/tests/test_ipc.py b/python/pyarrow/tests/test_ipc.py index 26df224ee49..ed6e7563ed2 100644 --- a/python/pyarrow/tests/test_ipc.py +++ b/python/pyarrow/tests/test_ipc.py @@ -28,7 +28,7 @@ try: import numpy as np except ImportError: - np = None # type: ignore[assignment] + pass import pyarrow as pa from pyarrow.tests.util import changed_environ, invoke_script diff --git a/python/pyarrow/tests/test_json.py b/python/pyarrow/tests/test_json.py index ab0602cd198..90ce549c6e6 100644 --- a/python/pyarrow/tests/test_json.py +++ b/python/pyarrow/tests/test_json.py @@ -27,7 +27,7 @@ try: import numpy as np except ImportError: - np = None # type: ignore[assignment] + pass import pytest import pyarrow as pa diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 1bd5b58025d..45352ee3614 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -38,7 +38,7 @@ VisibleDeprecationWarning as _np_VisibleDeprecationWarning ) except ImportError: - np = None # type: ignore[assignment] + pass from pyarrow.pandas_compat import get_logical_type, _pandas_api from pyarrow.tests.util import invoke_script, random_ascii, rands diff --git a/python/pyarrow/tests/test_scalars.py 
b/python/pyarrow/tests/test_scalars.py index f48761b1918..cdbe3d00aae 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -24,7 +24,7 @@ try: import numpy as np except ImportError: - np = None # type: ignore[assignment] + pass import pyarrow as pa import pyarrow.compute as pc diff --git a/python/pyarrow/tests/test_schema.py b/python/pyarrow/tests/test_schema.py index 6d1ff431819..dc98f03cded 100644 --- a/python/pyarrow/tests/test_schema.py +++ b/python/pyarrow/tests/test_schema.py @@ -23,7 +23,7 @@ try: import numpy as np except ImportError: - np = None # type: ignore[assignment] + pass import pyarrow as pa import pyarrow.tests.util as test_util diff --git a/python/pyarrow/tests/test_sparse_tensor.py b/python/pyarrow/tests/test_sparse_tensor.py index e4d141e2a6f..6a398f38ac5 100644 --- a/python/pyarrow/tests/test_sparse_tensor.py +++ b/python/pyarrow/tests/test_sparse_tensor.py @@ -29,12 +29,12 @@ import scipy from scipy.sparse import csr_array, coo_array, csr_matrix, coo_matrix except ImportError: - scipy = None # type: ignore[assignment] + pass try: import sparse # type: ignore[unresolved_import] except ImportError: - sparse = None + pass tensor_type_pairs = [ diff --git a/python/pyarrow/tests/test_substrait.py b/python/pyarrow/tests/test_substrait.py index fae89d3cee5..d3f5d848bce 100644 --- a/python/pyarrow/tests/test_substrait.py +++ b/python/pyarrow/tests/test_substrait.py @@ -29,7 +29,7 @@ try: import pyarrow.substrait as substrait except ImportError: - substrait = None + pass # Marks all of the tests in this module # Ignore these with pytest ... -m 'not substrait' diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index ead5cbaddc5..eeb6c8f8539 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -23,7 +23,7 @@ try: import numpy as np except ImportError: - np = None # type: ignore[assignment] + pass import pytest import pyarrow as pa import pyarrow.compute as pc diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index 338c022a223..5e5f4903e29 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -27,7 +27,7 @@ try: import hypothesis.extra.pytz as tzst except ImportError: - tzst = None # type: ignore[assignment] + pass import weakref try: diff --git a/python/pyarrow/tests/test_udf.py b/python/pyarrow/tests/test_udf.py index 891295a5519..aed2fbceaeb 100644 --- a/python/pyarrow/tests/test_udf.py +++ b/python/pyarrow/tests/test_udf.py @@ -21,7 +21,7 @@ try: import numpy as np except ImportError: - np = None # type: ignore[assignment] + pass import pyarrow as pa from pyarrow import compute as pc @@ -35,7 +35,7 @@ try: import pyarrow.dataset as ds except ImportError: - ds = None + pass def mock_udf_context(batch_length=10): diff --git a/python/pyarrow/tests/util.py b/python/pyarrow/tests/util.py index d8b250ffff0..7eefd067807 100644 --- a/python/pyarrow/tests/util.py +++ b/python/pyarrow/tests/util.py @@ -171,7 +171,8 @@ def get_modified_env_with_pythonpath(): existing_pythonpath = env.get('PYTHONPATH', '') module_path = os.path.abspath( - os.path.dirname(os.path.dirname(pa.__file__))) + os.path.dirname(os.path.dirname(pa.__file__))) \ + # type: ignore[no-matching-overload] if existing_pythonpath: new_pythonpath = os.pathsep.join((module_path, existing_pythonpath)) diff --git a/python/pyproject.toml b/python/pyproject.toml index 17076cdd6bc..983d2ed9174 100644 --- a/python/pyproject.toml +++ 
b/python/pyproject.toml @@ -101,10 +101,10 @@ fallback_version = '22.0.0a0' [tool.ty.rules] invalid-argument-type = "ignore" invalid-assignment = "ignore" -invalid-context-manager = "ignore" -invalid-return-type = "ignore" -invalid-type-form = "ignore" -no-matching-overload = "ignore" +#invalid-context-manager = "ignore" +#invalid-return-type = "ignore" +#invalid-type-form = "ignore" +#no-matching-overload = "ignore" #non-subscriptable = "ignore" #not-iterable = "ignore" possibly-unbound-attribute = "ignore" diff --git a/python/stubs/_csv.pyi b/python/stubs/_csv.pyi index 2f49f8c9a6c..1f724594d35 100644 --- a/python/stubs/_csv.pyi +++ b/python/stubs/_csv.pyi @@ -96,7 +96,7 @@ class ReadOptions(lib._Weakrefable): 3: [[2022-03-03,2022-03-04]] """ - use_threads: bool = field(default=True, kw_only=False) + use_threads: bool = field(default=True) block_size: int | None = None skip_rows: int = 0 skip_rows_after_names: int = 0 @@ -182,7 +182,7 @@ class ParseOptions(lib._Weakrefable): entry: [[2022-03-01,2022-03-02,2022-03-03,2022-03-04]] """ - delimiter: str = field(default=",", kw_only=False) + delimiter: str = field(default=",") quote_char: str | Literal[False] = '"' double_quote: bool = True escape_char: str | Literal[False] = False @@ -397,7 +397,7 @@ class ConvertOptions(lib._Weakrefable): fast: [[true,true,false,false,null]] """ - check_utf8: bool = field(default=True, kw_only=False) + check_utf8: bool = field(default=True) column_types: lib.Schema | dict | None = None null_values: list[str] | None = None true_values: list[str] | None = None @@ -438,7 +438,7 @@ class WriteOptions(lib._Weakrefable): will raise an error. """ - include_header: bool = field(default=True, kw_only=False) + include_header: bool = field(default=True) batch_size: int = 1024 delimiter: str = "," quoting_style: Literal["needed", "all_valid", "none"] = "needed" From 494a581199b2f02700477e4432c5c019f55a9e16 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 25 Jul 2025 00:00:51 +0200 Subject: [PATCH 23/32] Fix invalid-assignment --- .../pyarrow/tests/parquet/test_compliant_nested_type.py | 2 -- python/pyarrow/tests/parquet/test_metadata.py | 2 -- python/pyarrow/tests/parquet/test_pandas.py | 4 ++-- python/pyarrow/tests/parquet/test_parquet_file.py | 4 +--- python/pyarrow/tests/parquet/test_parquet_writer.py | 2 +- python/pyarrow/tests/test_compute.py | 4 ---- python/pyarrow/tests/test_flight.py | 9 --------- python/pyproject.toml | 2 +- 8 files changed, 5 insertions(+), 24 deletions(-) diff --git a/python/pyarrow/tests/parquet/test_compliant_nested_type.py b/python/pyarrow/tests/parquet/test_compliant_nested_type.py index 3e6543a0538..d7388be8a1b 100644 --- a/python/pyarrow/tests/parquet/test_compliant_nested_type.py +++ b/python/pyarrow/tests/parquet/test_compliant_nested_type.py @@ -28,8 +28,6 @@ try: import pandas as pd - import pandas.testing as tm - from pyarrow.tests.parquet.common import _roundtrip_pandas_dataframe except ImportError: pass diff --git a/python/pyarrow/tests/parquet/test_metadata.py b/python/pyarrow/tests/parquet/test_metadata.py index 85823b5cfa2..d180fbfb4e5 100644 --- a/python/pyarrow/tests/parquet/test_metadata.py +++ b/python/pyarrow/tests/parquet/test_metadata.py @@ -40,8 +40,6 @@ try: import pandas as pd - import pandas.testing as tm - from pyarrow.tests.parquet.common import alltypes_sample except ImportError: pass diff --git a/python/pyarrow/tests/parquet/test_pandas.py b/python/pyarrow/tests/parquet/test_pandas.py index 34d7c1c750a..f9f4e74dc86 100644 --- 
a/python/pyarrow/tests/parquet/test_pandas.py +++ b/python/pyarrow/tests/parquet/test_pandas.py @@ -21,7 +21,7 @@ try: import numpy as np except ImportError: - pss + pass import pytest import pyarrow as pa @@ -34,7 +34,7 @@ from pyarrow.tests.parquet.common import (_read_table, _test_dataframe, _write_table) except ImportError: - pq = None + pass try: diff --git a/python/pyarrow/tests/parquet/test_parquet_file.py b/python/pyarrow/tests/parquet/test_parquet_file.py index 6864fe08dca..b09c26c7144 100644 --- a/python/pyarrow/tests/parquet/test_parquet_file.py +++ b/python/pyarrow/tests/parquet/test_parquet_file.py @@ -30,12 +30,10 @@ import pyarrow.parquet as pq from pyarrow.tests.parquet.common import _write_table except ImportError: - pq = None + pass try: - import pandas as pd import pandas.testing as tm - from pyarrow.tests.parquet.common import alltypes_sample except ImportError: pass diff --git a/python/pyarrow/tests/parquet/test_parquet_writer.py b/python/pyarrow/tests/parquet/test_parquet_writer.py index d82b6c6da8b..d6f30ea16be 100644 --- a/python/pyarrow/tests/parquet/test_parquet_writer.py +++ b/python/pyarrow/tests/parquet/test_parquet_writer.py @@ -25,7 +25,7 @@ from pyarrow.tests.parquet.common import (_read_table, _test_dataframe, _range_integers) except ImportError: - pq = None + pass try: diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 3737fc89025..4e39383473c 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -42,10 +42,6 @@ import pyarrow.compute as pc from pyarrow.lib import ArrowNotImplementedError # type: ignore[unresolved_import] -try: - import pyarrow.substrait as pas -except ImportError: - pass exported_functions = [ func for (name, func) in sorted(pc.__dict__.items()) diff --git a/python/pyarrow/tests/test_flight.py b/python/pyarrow/tests/test_flight.py index 4f4c5f20e5d..01a7cc058b2 100644 --- a/python/pyarrow/tests/test_flight.py +++ b/python/pyarrow/tests/test_flight.py @@ -50,15 +50,6 @@ ClientMiddleware, ClientMiddlewareFactory, ) except ImportError: - class context_like(object): - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_value, traceback): - pass - - flight = None - class MockContextManager: def __init__(self, *args, **kwargs): pass diff --git a/python/pyproject.toml b/python/pyproject.toml index 983d2ed9174..952b73c93dc 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -100,7 +100,7 @@ fallback_version = '22.0.0a0' [tool.ty.rules] invalid-argument-type = "ignore" -invalid-assignment = "ignore" +#invalid-assignment = "ignore" #invalid-context-manager = "ignore" #invalid-return-type = "ignore" #invalid-type-form = "ignore" From c5251d30c70d7e992641e8dcda5e0ebdbd42a76b Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 25 Jul 2025 00:17:26 +0200 Subject: [PATCH 24/32] Fix too-many-positional-arguments --- python/pyarrow/tests/test_dataset.py | 3 ++- python/pyarrow/tests/test_flight.py | 8 ++++---- python/pyproject.toml | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index b22f423cad8..8dfdbcb8c2a 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -2216,7 +2216,8 @@ def test_dictionary_partitioning_outer_nulls_raises(tempdir): def test_positional_keywords_raises(tempdir): table = pa.table({'a': ['x', 'y', None], 'b': ['x', 'y', 'z']}) with pytest.raises(TypeError): - 
ds.write_dataset(table, tempdir, "basename-{i}.arrow") + ds.write_dataset(table, tempdir, "basename-{i}.arrow") \ + # type: ignore[too-many-positional-arguments] @pytest.mark.parquet diff --git a/python/pyarrow/tests/test_flight.py b/python/pyarrow/tests/test_flight.py index 01a7cc058b2..f14e5215b33 100644 --- a/python/pyarrow/tests/test_flight.py +++ b/python/pyarrow/tests/test_flight.py @@ -65,16 +65,16 @@ def serve(self): pass class FlightClient(MockContextManager): - def get_flight_info(self, **kwargs): + def get_flight_info(self, *args, **kwargs): pass - def do_action(self, **kwargs): + def do_action(self, *args, **kwargs): pass - def do_get(self, **kwargs): + def do_get(self, *args, **kwargs): pass - def do_put(self, **kwargs): + def do_put(self, *args, **kwargs): pass def close(self): diff --git a/python/pyproject.toml b/python/pyproject.toml index 952b73c93dc..7a560ee5081 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -109,7 +109,7 @@ invalid-argument-type = "ignore" #not-iterable = "ignore" possibly-unbound-attribute = "ignore" possibly-unbound-import = "ignore" -too-many-positional-arguments = "ignore" +#too-many-positional-arguments = "ignore" #unknown-argument = "ignore" unresolved-attribute = "ignore" #unresolved-global = "ignore" From 5d0463e1939105b2522b07f37f457e1451c5ad6c Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 25 Jul 2025 00:43:40 +0200 Subject: [PATCH 25/32] Fix invalid-argument-type --- python/pyarrow/interchange/from_dataframe.py | 2 +- python/pyarrow/tests/parquet/test_data_types.py | 2 +- python/pyarrow/tests/parquet/test_pandas.py | 12 ++++++------ python/pyarrow/tests/test_pandas.py | 8 +++++--- python/pyarrow/tests/test_table.py | 2 +- python/pyproject.toml | 2 +- 6 files changed, 15 insertions(+), 13 deletions(-) diff --git a/python/pyarrow/interchange/from_dataframe.py b/python/pyarrow/interchange/from_dataframe.py index 47ddbb885ff..80ddc8fa024 100644 --- a/python/pyarrow/interchange/from_dataframe.py +++ b/python/pyarrow/interchange/from_dataframe.py @@ -346,7 +346,7 @@ def buffers_to_array( buffers: ColumnBuffers, data_type: Tuple[DtypeKind, int, str, str], length: int, - describe_null: ColumnNullType, + describe_null: Tuple[ColumnNullType, Any], offset: int = 0, allow_copy: bool = True, ) -> pa.Array: diff --git a/python/pyarrow/tests/parquet/test_data_types.py b/python/pyarrow/tests/parquet/test_data_types.py index b48627d679d..898071d8f85 100644 --- a/python/pyarrow/tests/parquet/test_data_types.py +++ b/python/pyarrow/tests/parquet/test_data_types.py @@ -390,7 +390,7 @@ def test_parquet_nested_convenience(tempdir): read = pq.read_table( path, columns=['a']) - tm.assert_frame_equal(read.to_pandas(), df[['a']]) + tm.assert_frame_equal(read.to_pandas(), df[['a']]) # type: ignore[invalid-argument-type] read = pq.read_table( path, columns=['a', 'b']) diff --git a/python/pyarrow/tests/parquet/test_pandas.py b/python/pyarrow/tests/parquet/test_pandas.py index f9f4e74dc86..96622c4746b 100644 --- a/python/pyarrow/tests/parquet/test_pandas.py +++ b/python/pyarrow/tests/parquet/test_pandas.py @@ -425,7 +425,7 @@ def test_backwards_compatible_column_metadata_handling(datadir): table = _read_table( path, columns=['a']) result = table.to_pandas() - tm.assert_frame_equal(result, expected[['a']].reset_index(drop=True)) + tm.assert_frame_equal(result, expected[['a']].reset_index(drop=True)) # type: ignore[invalid-argument-type] @pytest.mark.pandas @@ -485,7 +485,7 @@ def test_pandas_categorical_roundtrip(): codes = np.array([2, 0, 0, 2, 0, 
-1, 2], dtype='int32') categories = ['foo', 'bar', 'baz'] df = pd.DataFrame({'x': pd.Categorical.from_codes( - codes, categories=categories)}) + codes, categories=pd.Index(categories))}) buf = pa.BufferOutputStream() pq.write_table(pa.table(df), buf) @@ -530,15 +530,15 @@ def test_write_to_dataset_pandas_preserve_extensiondtypes(tempdir): table, str(tempdir / "case1"), partition_cols=['part'], ) result = pq.read_table(str(tempdir / "case1")).to_pandas() - tm.assert_frame_equal(result[["col"]], df[["col"]]) + tm.assert_frame_equal(result[["col"]], df[["col"]]) # type: ignore[invalid-argument-type] pq.write_to_dataset(table, str(tempdir / "case2")) result = pq.read_table(str(tempdir / "case2")).to_pandas() - tm.assert_frame_equal(result[["col"]], df[["col"]]) + tm.assert_frame_equal(result[["col"]], df[["col"]]) # type: ignore[invalid-argument-type] pq.write_table(table, str(tempdir / "data.parquet")) result = pq.read_table(str(tempdir / "data.parquet")).to_pandas() - tm.assert_frame_equal(result[["col"]], df[["col"]]) + tm.assert_frame_equal(result[["col"]], df[["col"]]) # type: ignore[invalid-argument-type] @pytest.mark.pandas @@ -555,7 +555,7 @@ def test_write_to_dataset_pandas_preserve_index(tempdir): table, str(tempdir / "case1"), partition_cols=['part'], ) result = pq.read_table(str(tempdir / "case1")).to_pandas() - tm.assert_frame_equal(result, df_cat) + tm.assert_frame_equal(result, df_cat) # type: ignore[invalid-argument-type] pq.write_to_dataset(table, str(tempdir / "case2")) result = pq.read_table(str(tempdir / "case2")).to_pandas() diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 45352ee3614..b1d28a61531 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -627,11 +627,11 @@ def test_table_column_subset_metadata(self): expected = df[['a']] if isinstance(df.index, pd.DatetimeIndex): df.index.freq = None - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) # type: ignore[invalid-argument-type] table_subset2 = table_subset.remove_column(1) result = table_subset2.to_pandas() - tm.assert_frame_equal(result, df[['a']].reset_index(drop=True)) + tm.assert_frame_equal(result, df[['a']].reset_index(drop=True)) # type: ignore[invalid-argument-type] def test_to_pandas_column_subset_multiindex(self): # ARROW-10122 @@ -3720,7 +3720,9 @@ def test_table_from_pandas_schema_field_order_metadata(): coerce_cols_to_types["datetime"] = "datetime64[s, UTC]" expected = df[["float", "datetime"]].astype(coerce_cols_to_types) - tm.assert_frame_equal(result, expected) + # TODO: result and expected should have the same type, + # see other ignore[invalid-argument-type] involving assert_frame_equal + tm.assert_frame_equal(result, expected) # type: ignore[invalid-argument-type] # ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index eeb6c8f8539..e7726fd0023 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -418,7 +418,7 @@ def test_to_pandas_empty_table(): table = pa.table(df) result = table.schema.empty_table().to_pandas() assert result.shape == (0, 2) - tm.assert_frame_equal(result, df.iloc[:0]) + tm.assert_frame_equal(result, df.iloc[:0]) # type: ignore[invalid-argument-type] @pytest.mark.pandas diff --git a/python/pyproject.toml b/python/pyproject.toml index 7a560ee5081..9229f42087d 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml 
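The Categorical change above also reads as a standalone sketch (assuming NumPy and pandas are installed): coercing the plain category list to an Index matches the signature the checker resolves for Categorical.from_codes, so the call site needs no ignore comment.

import numpy as np
import pandas as pd

codes = np.array([2, 0, 0, 2, 0, -1, 2], dtype='int32')
categories = ['foo', 'bar', 'baz']
# Passing an Index rather than a list satisfies the resolved signature;
# the -1 code still round-trips as a missing value.
cat = pd.Categorical.from_codes(codes, categories=pd.Index(categories))
df = pd.DataFrame({'x': cat})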
@@ -99,7 +99,7 @@ git_describe_command = 'git describe --dirty --tags --long --match "apache-arrow fallback_version = '22.0.0a0' [tool.ty.rules] -invalid-argument-type = "ignore" +#invalid-argument-type = "ignore" #invalid-assignment = "ignore" #invalid-context-manager = "ignore" #invalid-return-type = "ignore" From eadaf0c317649dd077122df8233e8dc0177236bb Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 25 Jul 2025 00:48:22 +0200 Subject: [PATCH 26/32] Fix missing-argument --- python/pyarrow/__init__.py | 2 +- python/pyarrow/tests/parquet/common.py | 2 +- python/pyarrow/tests/parquet/test_data_types.py | 3 ++- python/pyarrow/tests/parquet/test_pandas.py | 12 ++++++++---- python/pyarrow/tests/parquet/test_parquet_file.py | 2 +- python/pyarrow/tests/test_pandas.py | 6 ++++-- python/pyproject.toml | 2 +- 7 files changed, 18 insertions(+), 11 deletions(-) diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 1170db23424..2b96edee84e 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -52,7 +52,7 @@ def parse_git(root, **kwargs): from setuptools_scm.git import parse kwargs['describe_command'] = \ "git describe --dirty --tags --long --match 'apache-arrow-[0-9]*.*'" - return parse(root, **kwargs) + return parse(root, **kwargs) # type: ignore[missing-argument] __version__ = setuptools_scm.get_version('../', parse=parse_git) except ImportError: diff --git a/python/pyarrow/tests/parquet/common.py b/python/pyarrow/tests/parquet/common.py index 28e04abf1c5..8ce804262d1 100644 --- a/python/pyarrow/tests/parquet/common.py +++ b/python/pyarrow/tests/parquet/common.py @@ -41,7 +41,7 @@ def _write_table(table, path, **kwargs): def _read_table(*args, **kwargs): import pyarrow.parquet as pq - table = pq.read_table(*args, **kwargs) + table = pq.read_table(*args, **kwargs) # type: ignore[missing-argument] table.validate(full=True) return table diff --git a/python/pyarrow/tests/parquet/test_data_types.py b/python/pyarrow/tests/parquet/test_data_types.py index 898071d8f85..9f8f5212382 100644 --- a/python/pyarrow/tests/parquet/test_data_types.py +++ b/python/pyarrow/tests/parquet/test_data_types.py @@ -390,7 +390,8 @@ def test_parquet_nested_convenience(tempdir): read = pq.read_table( path, columns=['a']) - tm.assert_frame_equal(read.to_pandas(), df[['a']]) # type: ignore[invalid-argument-type] + tm.assert_frame_equal(read.to_pandas(), df[['a']]) \ + # type: ignore[invalid-argument-type] read = pq.read_table( path, columns=['a', 'b']) diff --git a/python/pyarrow/tests/parquet/test_pandas.py b/python/pyarrow/tests/parquet/test_pandas.py index 96622c4746b..5f9fdc7896d 100644 --- a/python/pyarrow/tests/parquet/test_pandas.py +++ b/python/pyarrow/tests/parquet/test_pandas.py @@ -425,7 +425,8 @@ def test_backwards_compatible_column_metadata_handling(datadir): table = _read_table( path, columns=['a']) result = table.to_pandas() - tm.assert_frame_equal(result, expected[['a']].reset_index(drop=True)) # type: ignore[invalid-argument-type] + tm.assert_frame_equal(result, expected[['a']].reset_index( + drop=True)) # type: ignore[invalid-argument-type] @pytest.mark.pandas @@ -530,15 +531,18 @@ def test_write_to_dataset_pandas_preserve_extensiondtypes(tempdir): table, str(tempdir / "case1"), partition_cols=['part'], ) result = pq.read_table(str(tempdir / "case1")).to_pandas() - tm.assert_frame_equal(result[["col"]], df[["col"]]) # type: ignore[invalid-argument-type] + tm.assert_frame_equal(result[["col"]], df[["col"]]) \ + # type: ignore[invalid-argument-type] 
pq.write_to_dataset(table, str(tempdir / "case2")) result = pq.read_table(str(tempdir / "case2")).to_pandas() - tm.assert_frame_equal(result[["col"]], df[["col"]]) # type: ignore[invalid-argument-type] + tm.assert_frame_equal(result[["col"]], df[["col"]]) \ + # type: ignore[invalid-argument-type] pq.write_table(table, str(tempdir / "data.parquet")) result = pq.read_table(str(tempdir / "data.parquet")).to_pandas() - tm.assert_frame_equal(result[["col"]], df[["col"]]) # type: ignore[invalid-argument-type] + tm.assert_frame_equal(result[["col"]], df[["col"]]) \ + # type: ignore[invalid-argument-type] @pytest.mark.pandas diff --git a/python/pyarrow/tests/parquet/test_parquet_file.py b/python/pyarrow/tests/parquet/test_parquet_file.py index b09c26c7144..4d4b467e9d3 100644 --- a/python/pyarrow/tests/parquet/test_parquet_file.py +++ b/python/pyarrow/tests/parquet/test_parquet_file.py @@ -325,7 +325,7 @@ def test_parquet_file_with_filesystem(s3_example_fs, use_uri): table = pa.table({"a": range(10)}) pq.write_table(table, s3_path, filesystem=s3_fs) - parquet_file = pq.ParquetFile(*args, **kwargs) + parquet_file = pq.ParquetFile(*args, **kwargs) # type: ignore[missing-argument] assert parquet_file.read() == table assert not parquet_file.closed parquet_file.close() diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index b1d28a61531..287b761a0a8 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -627,11 +627,13 @@ def test_table_column_subset_metadata(self): expected = df[['a']] if isinstance(df.index, pd.DatetimeIndex): df.index.freq = None - tm.assert_frame_equal(result, expected) # type: ignore[invalid-argument-type] + tm.assert_frame_equal(result, expected) \ + # type: ignore[invalid-argument-type] table_subset2 = table_subset.remove_column(1) result = table_subset2.to_pandas() - tm.assert_frame_equal(result, df[['a']].reset_index(drop=True)) # type: ignore[invalid-argument-type] + tm.assert_frame_equal(result, df[['a']].reset_index(drop=True)) \ + # type: ignore[invalid-argument-type] def test_to_pandas_column_subset_multiindex(self): # ARROW-10122 diff --git a/python/pyproject.toml b/python/pyproject.toml index 9229f42087d..1ee5e6930d5 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -116,5 +116,5 @@ unresolved-attribute = "ignore" unresolved-import = "ignore" #unresolved-reference = "ignore" #unsupported-operator = "ignore" -missing-argument = "ignore" +#missing-argument = "ignore" #call-non-callable = "ignore" From f7f7f5e1cf958faeacec70f92d6a80d5d1a9cc7d Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 25 Jul 2025 00:54:38 +0200 Subject: [PATCH 27/32] Fix unresolved-import --- python/pyarrow/tests/test_pandas.py | 8 +++----- python/pyproject.toml | 2 +- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 287b761a0a8..3c3d874395e 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -3266,7 +3266,7 @@ def test_error_sparse(self): df = pd.DataFrame({'a': pd.arrays.SparseArray([1, np.nan, 3])}) except AttributeError: # pandas.arrays module introduced in pandas 0.24 - from pandas import SparseArray + from pandas import SparseArray # type: ignore[unresolved-import] df = pd.DataFrame({'a': SparseArray([1, np.nan, 3])}) with pytest.raises(TypeError, match="Sparse pandas data"): pa.Table.from_pandas(df) @@ -4427,12 +4427,11 @@ def 
test_convert_to_extension_array(monkeypatch): # monkeypatch pandas Int64Dtype to *not* have the protocol method if Version(pd.__version__) < Version("1.3.0.dev"): - from pandas.core import integer + from pandas.core import integer # type: ignore[unresolved-import] monkeypatch.delattr( integer._IntegerDtype, "__from_arrow__") else: monkeypatch.delattr( - # type: ignore[unresolved-attribute] pd.core.arrays.integer.NumericDtype, "__from_arrow__") # Int64Dtype has no __from_arrow__ -> use normal conversion result = table.to_pandas() @@ -4474,12 +4473,11 @@ def test_conversion_extensiontype_to_extensionarray(monkeypatch): # monkeypatch pandas Int64Dtype to *not* have the protocol method # (remove the version added above and the actual version for recent pandas) if Version(pd.__version__) < Version("1.3.0.dev"): - from pandas.core import integer + from pandas.core import integer # type: ignore[unresolved-import] monkeypatch.delattr( integer._IntegerDtype, "__from_arrow__") else: monkeypatch.delattr( - # type: ignore[unresolved-attribute] pd.core.arrays.integer.NumericDtype, "__from_arrow__") result = arr.to_pandas() diff --git a/python/pyproject.toml b/python/pyproject.toml index 1ee5e6930d5..b293ad834ef 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -113,7 +113,7 @@ possibly-unbound-import = "ignore" #unknown-argument = "ignore" unresolved-attribute = "ignore" #unresolved-global = "ignore" -unresolved-import = "ignore" +#unresolved-import = "ignore" #unresolved-reference = "ignore" #unsupported-operator = "ignore" #missing-argument = "ignore" From d769e72d233ff7f24cd6ed9d6abb1bdca5a1a8c3 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 25 Jul 2025 01:28:51 +0200 Subject: [PATCH 28/32] Fix possibly-unbound-import --- python/pyarrow/conftest.py | 10 +++--- python/pyarrow/tests/test_acero.py | 2 +- python/pyarrow/tests/test_dataset.py | 4 +-- python/pyarrow/tests/test_fs.py | 49 ++++++++++++++-------------- python/pyproject.toml | 2 +- python/stubs/parquet/core.pyi | 2 +- 6 files changed, 35 insertions(+), 34 deletions(-) diff --git a/python/pyarrow/conftest.py b/python/pyarrow/conftest.py index d1b1567389b..563c98bfdc8 100644 --- a/python/pyarrow/conftest.py +++ b/python/pyarrow/conftest.py @@ -186,25 +186,25 @@ pass try: - from pyarrow.fs import AzureFileSystem # noqa + from pyarrow.fs import AzureFileSystem # type: ignore[possibly-unbound-import] # noqa defaults['azure'] = True except ImportError: pass try: - from pyarrow.fs import GcsFileSystem # noqa + from pyarrow.fs import GcsFileSystem # type: ignore[possibly-unbound-import] # noqa defaults['gcs'] = True except ImportError: pass try: - from pyarrow.fs import S3FileSystem # noqa + from pyarrow.fs import S3FileSystem # type: ignore[possibly-unbound-import] # noqa defaults['s3'] = True except ImportError: pass try: - from pyarrow.fs import HadoopFileSystem # noqa + from pyarrow.fs import HadoopFileSystem # type: ignore[possibly-unbound-import] # noqa defaults['hdfs'] = True except ImportError: pass @@ -250,7 +250,7 @@ def pytest_ignore_collect(collection_path, config): if 'pyarrow/fs' in str(collection_path): try: - from pyarrow.fs import S3FileSystem # noqa + from pyarrow.fs import S3FileSystem # type: ignore[possibly-unbound-import] # noqa return False except ImportError: return True diff --git a/python/pyarrow/tests/test_acero.py b/python/pyarrow/tests/test_acero.py index 8dba7471b49..ac58792cd50 100644 --- a/python/pyarrow/tests/test_acero.py +++ b/python/pyarrow/tests/test_acero.py @@ -37,7 +37,7 @@ try: import 
pyarrow.dataset as ds - from pyarrow.acero import ScanNodeOptions + from pyarrow.acero import ScanNodeOptions # type: ignore[possibly-unbound-import] except ImportError: pass diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 8dfdbcb8c2a..b797c49a1ba 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -3437,7 +3437,7 @@ def test_orc_scan_options(tempdir, dataset_reader): def test_orc_format_not_supported(): try: - from pyarrow.dataset import OrcFileFormat # noqa + from pyarrow.dataset import OrcFileFormat # type: ignore[possibly-unbound-import] # noqa except ImportError: # ORC is not available, test error message with pytest.raises( @@ -5138,7 +5138,7 @@ def test_write_dataset_s3_put_only(s3_server): # required while writing a dataset in s3 where we have very # limited permissions and thus we can directly write the dataset # without creating a directory. - from pyarrow.fs import S3FileSystem + from pyarrow.fs import S3FileSystem # type: ignore[possibly-unbound-import] # write dataset with s3 filesystem host, port, _, _ = s3_server['connection'] diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index 61dcb76b247..7c891c7919d 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -39,6 +39,31 @@ copy_files) from pyarrow.util import find_free_port +try: + from pyarrow.fs import ( + AwsDefaultS3RetryStrategy, # type: ignore[possibly-unbound-import] + AwsStandardS3RetryStrategy, # type: ignore[possibly-unbound-import] + S3FileSystem, # type: ignore[possibly-unbound-import] + resolve_s3_region, # type: ignore[possibly-unbound-import] + S3RetryStrategy # type: ignore[possibly-unbound-import] + ) +except ImportError: + pass + +try: + from pyarrow.fs import AzureFileSystem # type: ignore[possibly-unbound-import] +except ImportError: + pass + +try: + from pyarrow.fs import GcsFileSystem # type: ignore[possibly-unbound-import] +except ImportError: + pass + +try: + from pyarrow.fs import HadoopFileSystem # type: ignore[possibly-unbound-import] +except ImportError: + pass here = os.path.dirname(os.path.abspath(__file__)) @@ -211,7 +236,6 @@ def subtree_localfs(request, tempdir, localfs): @pytest.fixture def gcsfs(request, gcs_server): request.config.pyarrow.requires('gcs') - from pyarrow.fs import GcsFileSystem host, port = gcs_server['connection'] bucket = 'pyarrow-filesystem/' @@ -241,7 +265,6 @@ def gcsfs(request, gcs_server): @pytest.fixture def s3fs(request, s3_server): request.config.pyarrow.requires('s3') - from pyarrow.fs import S3FileSystem host, port, access_key, secret_key = s3_server['connection'] bucket = 'pyarrow-filesystem/' @@ -301,7 +324,6 @@ def subtree_s3fs(request, s3fs): @pytest.fixture def azurefs(request, azure_server): request.config.pyarrow.requires('azure') - from pyarrow.fs import AzureFileSystem host, port, account_name, account_key = azure_server['connection'] azurite_authority = f"{host}:{port}" @@ -333,8 +355,6 @@ def hdfs(request, hdfs_connection): if not pa.have_libhdfs(): pytest.skip('Cannot locate libhdfs') - from pyarrow.fs import HadoopFileSystem - host, port, user = hdfs_connection fs = HadoopFileSystem(host, port=port, user=user) @@ -515,7 +535,6 @@ def skip_azure(fs, reason): @pytest.mark.s3 def test_s3fs_limited_permissions_create_bucket(s3_server): - from pyarrow.fs import S3FileSystem _configure_s3_limited_user(s3_server, _minio_limited_policy, 'test_fs_limited_user', 'limited123') host, port, _, _ = 
s3_server['connection'] @@ -1147,7 +1166,6 @@ def test_mockfs_mtime_roundtrip(mockfs): @pytest.mark.gcs def test_gcs_options(pickle_module): - from pyarrow.fs import GcsFileSystem dt = datetime.now() fs = GcsFileSystem(access_token='abc', target_service_account='service_account@apache', @@ -1185,10 +1203,6 @@ def test_gcs_options(pickle_module): @pytest.mark.s3 def test_s3_options(pickle_module): - from pyarrow.fs import (AwsDefaultS3RetryStrategy, - AwsStandardS3RetryStrategy, S3FileSystem, - S3RetryStrategy) - fs = S3FileSystem(access_key='access', secret_key='secret', session_token='token', region='us-east-2', scheme='https', endpoint_override='localhost:8999') @@ -1289,8 +1303,6 @@ def test_s3_options(pickle_module): @pytest.mark.s3 def test_s3_proxy_options(monkeypatch, pickle_module): - from pyarrow.fs import S3FileSystem - # The following two are equivalent: proxy_opts_1_dict = {'scheme': 'http', 'host': 'localhost', 'port': 8999} proxy_opts_1_str = 'http://localhost:8999' @@ -1430,8 +1442,6 @@ def test_s3_proxy_options(monkeypatch, pickle_module): @pytest.mark.s3 def test_s3fs_wrong_region(): - from pyarrow.fs import S3FileSystem - # wrong region for bucket # anonymous=True incase CI/etc has invalid credentials fs = S3FileSystem(region='eu-north-1', anonymous=True) @@ -1454,8 +1464,6 @@ def test_s3fs_wrong_region(): @pytest.mark.azure def test_azurefs_options(pickle_module): - from pyarrow.fs import AzureFileSystem - fs1 = AzureFileSystem(account_name='fake-account-name') assert isinstance(fs1, AzureFileSystem) assert pickle_module.loads(pickle_module.dumps(fs1)) == fs1 @@ -1548,7 +1556,6 @@ def test_azurefs_options(pickle_module): @pytest.mark.hdfs def test_hdfs_options(hdfs_connection, pickle_module): - from pyarrow.fs import HadoopFileSystem if not pa.have_libhdfs(): pytest.skip('Cannot locate libhdfs') @@ -1655,8 +1662,6 @@ def test_filesystem_from_path_object(path): @pytest.mark.s3 def test_filesystem_from_uri_s3(s3_server): - from pyarrow.fs import S3FileSystem - host, port, access_key, secret_key = s3_server['connection'] uri = f"s3://{access_key}:{secret_key}@mybucket/foo/bar?scheme=http&" \ @@ -1674,8 +1679,6 @@ def test_filesystem_from_uri_s3(s3_server): @pytest.mark.gcs def test_filesystem_from_uri_gcs(gcs_server): - from pyarrow.fs import GcsFileSystem - host, port = gcs_server['connection'] uri = ("gs://anonymous@" + @@ -1864,7 +1867,6 @@ def test_py_open_append_stream(): def test_s3_real_aws(): # Exercise connection code with an AWS-backed S3 bucket. # This is a minimal integration check for ARROW-9261 and similar issues. 
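A condensed sketch of the shape test_fs.py takes after the hunks above, assuming the existing pytest markers keep S3 tests deselected when the filesystem is unavailable; the test name and body here are illustrative, not part of the patch. The optional names are imported once at module level and the per-test local imports go away.

import pytest

try:
    from pyarrow.fs import S3FileSystem  # type: ignore[possibly-unbound-import]
except ImportError:
    pass

@pytest.mark.s3
def test_s3_options_sketch():
    # S3FileSystem is only referenced inside tests carrying the "s3" marker,
    # which are skipped when the import above failed, so the possibly-unbound
    # name is never reached at runtime.
    fs = S3FileSystem(access_key='access', secret_key='secret')
    assert isinstance(fs, S3FileSystem)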
- from pyarrow.fs import S3FileSystem default_region = (os.environ.get('PYARROW_TEST_S3_REGION') or 'us-east-1') fs = S3FileSystem(anonymous=True) @@ -1920,7 +1922,6 @@ def test_s3_real_aws_region_selection(): @pytest.mark.s3 def test_resolve_s3_region(): - from pyarrow.fs import resolve_s3_region assert resolve_s3_region('voltrondata-labs-datasets') == 'us-east-2' assert resolve_s3_region('mf-nwp-models') == 'eu-west-1' diff --git a/python/pyproject.toml b/python/pyproject.toml index b293ad834ef..7aaf602966e 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -108,7 +108,7 @@ fallback_version = '22.0.0a0' #non-subscriptable = "ignore" #not-iterable = "ignore" possibly-unbound-attribute = "ignore" -possibly-unbound-import = "ignore" +#possibly-unbound-import = "ignore" #too-many-positional-arguments = "ignore" #unknown-argument = "ignore" unresolved-attribute = "ignore" diff --git a/python/stubs/parquet/core.pyi b/python/stubs/parquet/core.pyi index 01dce442feb..5ad47403821 100644 --- a/python/stubs/parquet/core.pyi +++ b/python/stubs/parquet/core.pyi @@ -30,7 +30,7 @@ from pyarrow._parquet import ( # type: ignore[unresolved_import] Statistics, ) from pyarrow._stubs_typing import FilterTuple, SingleOrList # type: ignore[unresolved_import] -from pyarrow.dataset import ParquetFileFragment, Partitioning +from pyarrow.dataset import ParquetFileFragment, Partitioning # type: ignore[possibly-unbound-import] from pyarrow.lib import NativeFile, RecordBatch, Schema, Table # type: ignore[unresolved_import] from typing_extensions import deprecated From 954c49daf0e593298a4093d517489c37d2fc395a Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 25 Jul 2025 03:10:50 +0200 Subject: [PATCH 29/32] Fix possibly-unbound-import --- .../dataset/write_dataset_encrypted.py | 15 +- python/pyarrow/pandas_compat.py | 6 +- python/pyarrow/parquet/core.py | 5 +- .../interchange/test_interchange_spec.py | 1 - python/pyarrow/tests/parquet/test_metadata.py | 4 +- python/pyarrow/tests/test_csv.py | 2 +- python/pyarrow/tests/test_cuda.py | 4 +- .../pyarrow/tests/test_cuda_numba_interop.py | 5 +- python/pyarrow/tests/test_dataset.py | 141 ++++++++++-------- .../pyarrow/tests/test_dataset_encryption.py | 35 +++-- python/pyarrow/tests/test_feather.py | 2 +- python/pyarrow/tests/test_jvm.py | 2 +- python/pyarrow/tests/wsgi_examples.py | 2 +- python/pyproject.toml | 2 +- python/setup.py | 12 +- 15 files changed, 133 insertions(+), 105 deletions(-) diff --git a/python/examples/dataset/write_dataset_encrypted.py b/python/examples/dataset/write_dataset_encrypted.py index 910559939e6..5d8160b02c9 100644 --- a/python/examples/dataset/write_dataset_encrypted.py +++ b/python/examples/dataset/write_dataset_encrypted.py @@ -67,16 +67,19 @@ def kms_factory(kms_connection_configuration): crypto_factory = pe.CryptoFactory(kms_factory) -parquet_encryption_cfg = ds.ParquetEncryptionConfig( +parquet_encryption_cfg = ds.ParquetEncryptionConfig( \ + # type: ignore[possibly-unbound-attribute] crypto_factory, kms_connection_config, encryption_config) -parquet_decryption_cfg = ds.ParquetDecryptionConfig(crypto_factory, - kms_connection_config, - decryption_config) +parquet_decryption_cfg = ds.ParquetDecryptionConfig( \ + # type: ignore[possibly-unbound-attribute] + crypto_factory, kms_connection_config, decryption_config) # set encryption config for parquet fragment scan options -pq_scan_opts = ds.ParquetFragmentScanOptions() +pq_scan_opts = ds.ParquetFragmentScanOptions() \ + # type: ignore[possibly-unbound-attribute] 
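These possibly-unbound-attribute ignores on the ds. call sites can also be avoided wholesale; a sketch in the spirit of the direct imports the tests adopt later in this series, assuming a build with the Parquet dataset extension (the error message below is illustrative).

try:
    from pyarrow.dataset import ParquetFileFormat, ParquetFragmentScanOptions
except ImportError as exc:
    raise SystemExit("pyarrow was built without Parquet dataset support") from exc

# Module-level bindings the checker can resolve, with no per-line ignores.
pq_scan_opts = ParquetFragmentScanOptions()
pformat = ParquetFileFormat(default_fragment_scan_options=pq_scan_opts)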
pq_scan_opts.parquet_decryption_config = parquet_decryption_cfg -pformat = pa.dataset.ParquetFileFormat(default_fragment_scan_options=pq_scan_opts) +pformat = pa.dataset.ParquetFileFormat(default_fragment_scan_options=pq_scan_opts) \ + # type: ignore[possibly-unbound-attribute] if os.path.exists('sample_dataset'): shutil.rmtree('sample_dataset') diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index db81b40d334..131025e60c0 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -755,8 +755,10 @@ def _reconstruct_block(item, columns=None, extension_columns=None, return_block= # create ExtensionBlock arr = item['py_array'] assert len(placement) == 1 - name = columns.get(placement[0], None) - pandas_dtype = extension_columns.get(name, None) + name = columns.get(placement[0], None) \ + # type: ignore[possibly-unbound-attribute] + pandas_dtype = extension_columns.get(name, None) \ + # type: ignore[possibly-unbound-attribute] if not hasattr(pandas_dtype, '__from_arrow__'): raise ValueError("This column does not support to be converted " "to a pandas ExtensionArray") diff --git a/python/pyarrow/parquet/core.py b/python/pyarrow/parquet/core.py index 7b6c57f9683..d38b3fbff92 100644 --- a/python/pyarrow/parquet/core.py +++ b/python/pyarrow/parquet/core.py @@ -1404,7 +1404,8 @@ def __init__(self, path_or_paths, filesystem=None, schema=None, *, filters=None, else: single_file = path_or_paths - parquet_format = ds.ParquetFileFormat(**read_options) + parquet_format = ds.ParquetFileFormat(**read_options) \ + # type: ignore[possibly-unbound-attribute] if single_file is not None: fragment = parquet_format.make_fragment(single_file, filesystem) @@ -2200,7 +2201,7 @@ def file_visitor(written_file): metadata_collector.append(written_file.metadata) # map format arguments - parquet_format = ds.ParquetFileFormat() + parquet_format = ds.ParquetFileFormat() # type: ignore[possibly-unbound-attribute] write_options = parquet_format.make_write_options(**kwargs) # map old filesystems to new one diff --git a/python/pyarrow/tests/interchange/test_interchange_spec.py b/python/pyarrow/tests/interchange/test_interchange_spec.py index 68afc0c633b..14e2aab4bfb 100644 --- a/python/pyarrow/tests/interchange/test_interchange_spec.py +++ b/python/pyarrow/tests/interchange/test_interchange_spec.py @@ -22,7 +22,6 @@ import pyarrow.tests.strategies as past import pytest -np = None try: import numpy as np except ImportError: diff --git a/python/pyarrow/tests/parquet/test_metadata.py b/python/pyarrow/tests/parquet/test_metadata.py index d180fbfb4e5..3386f77bb1a 100644 --- a/python/pyarrow/tests/parquet/test_metadata.py +++ b/python/pyarrow/tests/parquet/test_metadata.py @@ -494,12 +494,12 @@ def test_multi_dataset_metadata(tempdir): # Write merged metadata-only file with open(metapath, "wb") as f: - _meta.write_metadata_file(f) + _meta.write_metadata_file(f) # type: ignore[possibly-unbound-attribute] # Read back the metadata meta = pq.read_metadata(metapath) md = meta.to_dict() - _md = _meta.to_dict() + _md = _meta.to_dict() # type: ignore[possibly-unbound-attribute] for key in _md: if key != 'serialized_size': assert _md[key] == md[key] diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py index 2794d07e87c..170f62a43bd 100644 --- a/python/pyarrow/tests/test_csv.py +++ b/python/pyarrow/tests/test_csv.py @@ -1502,7 +1502,7 @@ def signal_from_thread(): # Interruption should have arrived timely assert last_duration <= 2.0 - e = exc_info.__context__ 
+ e = exc_info.__context__ # type: ignore[possibly-unbound-attribute] assert isinstance(e, pa.ArrowCancelled) assert e.signum == signal.SIGINT diff --git a/python/pyarrow/tests/test_cuda.py b/python/pyarrow/tests/test_cuda.py index e06f479987c..d8298eec773 100644 --- a/python/pyarrow/tests/test_cuda.py +++ b/python/pyarrow/tests/test_cuda.py @@ -42,8 +42,8 @@ not has_ipc_support, reason='CUDA IPC not supported in platform `%s`' % (platform)) -global_context = None # for flake8 -global_context1 = None # for flake8 +global_context = cuda.Context(0) # for flake8 +global_context1 = cuda.Context(0) # for flake8 def setup_module(module): diff --git a/python/pyarrow/tests/test_cuda_numba_interop.py b/python/pyarrow/tests/test_cuda_numba_interop.py index f211f0046f0..cfcf6673755 100644 --- a/python/pyarrow/tests/test_cuda_numba_interop.py +++ b/python/pyarrow/tests/test_cuda_numba_interop.py @@ -30,7 +30,7 @@ # type: ignore[unresolved_import] # noqa: E402 -context_choices = None +context_choices = {} context_choice_ids = ['pyarrow.cuda', 'numba.cuda'] @@ -73,7 +73,8 @@ def make_random_buffer(size, target='host', dtype='uint8', ctx=None): return arr, buf elif target == 'device': arr, buf = make_random_buffer(size, target='host', dtype=dtype) - dbuf = ctx.new_buffer(size * dtype.itemsize) + dbuf = ctx.new_buffer(size * dtype.itemsize) \ + # type: ignore[possibly-unbound-attribute] dbuf.copy_from_host(buf, position=0, nbytes=buf.size) return arr, dbuf raise ValueError('invalid target value') diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index b797c49a1ba..6303b47bd44 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -53,11 +53,24 @@ try: import pyarrow.dataset as ds + from pyarrow.dataset import ( + ParquetFragmentScanOptions, ParquetReadOptions, ParquetFileFragment \ + # type: ignore[possibly-unbound-attribute] + ) +except ImportError: + pass + +try: + from pyarrow.dataset import ( + OrcFileFormat # type: ignore[possibly-unbound-attribute] + ) except ImportError: pass try: import pyarrow.parquet as pq + from pyarrow.parquet import ParquetFileFormat \ + # type: ignore[possibly-unbound-attribute] except ImportError: pass @@ -270,7 +283,7 @@ def multisourcefs(request): @pytest.fixture def dataset(mockfs): - format = ds.ParquetFileFormat() + format = ParquetFileFormat() selector = fs.FileSelector('subdir', recursive=True) options = ds.FileSystemFactoryOptions('subdir') options.partitioning = ds.DirectoryPartitioning( @@ -338,7 +351,7 @@ def test_filesystem_dataset(mockfs): schema = pa.schema([ pa.field('const', pa.int64()) ]) - file_format = ds.ParquetFileFormat() + file_format = ParquetFileFormat() paths = ['subdir/1/xxx/file0.parquet', 'subdir/2/yyy/file1.parquet'] partitions = [ds.field('part') == x for x in range(1, 3)] fragments = [file_format.make_fragment(path, mockfs, part) @@ -356,7 +369,7 @@ def test_filesystem_dataset(mockfs): for dataset in [dataset_from_fragments, dataset_from_paths]: assert isinstance(dataset, ds.FileSystemDataset) - assert isinstance(dataset.format, ds.ParquetFileFormat) + assert isinstance(dataset.format, ParquetFileFormat) assert dataset.partition_expression.equals(root_partition) assert set(dataset.files) == set(paths) @@ -364,14 +377,14 @@ def test_filesystem_dataset(mockfs): for fragment, partition, path in zip(fragments, partitions, paths): assert fragment.partition_expression.equals(partition) assert fragment.path == path - assert isinstance(fragment.format, 
ds.ParquetFileFormat) - assert isinstance(fragment, ds.ParquetFileFragment) + assert isinstance(fragment.format, ParquetFileFormat) + assert isinstance(fragment, ParquetFileFragment) assert fragment.row_groups == [0] assert fragment.num_row_groups == 1 row_group_fragments = list(fragment.split_by_row_group()) assert fragment.num_row_groups == len(row_group_fragments) == 1 - assert isinstance(row_group_fragments[0], ds.ParquetFileFragment) + assert isinstance(row_group_fragments[0], ParquetFileFragment) assert row_group_fragments[0].path == path assert row_group_fragments[0].row_groups == [0] assert row_group_fragments[0].num_row_groups == 1 @@ -490,7 +503,7 @@ def test_dataset(dataset, dataset_reader): def test_dataset_factory_inspect_schema_promotion(promotable_mockfs): mockfs, path1, path2 = promotable_mockfs factory = ds.FileSystemDatasetFactory( - mockfs, [path1, path2], ds.ParquetFileFormat() + mockfs, [path1, path2], ParquetFileFormat() ) with pytest.raises( @@ -534,7 +547,7 @@ def test_dataset_factory_inspect_schema_promotion(promotable_mockfs): def test_dataset_factory_inspect_bad_params(promotable_mockfs): mockfs, path1, path2 = promotable_mockfs factory = ds.FileSystemDatasetFactory( - mockfs, [path1, path2], ds.ParquetFileFormat() + mockfs, [path1, path2], ParquetFileFormat() ) with pytest.raises(ValueError, match='Invalid promote_options: bad_option'): @@ -942,11 +955,11 @@ def test_partition_keys(): @pytest.mark.parquet def test_parquet_read_options(): - opts1 = ds.ParquetReadOptions() - opts2 = ds.ParquetReadOptions(dictionary_columns=['a', 'b']) - opts3 = ds.ParquetReadOptions(coerce_int96_timestamp_unit="ms") - opts4 = ds.ParquetReadOptions(binary_type=pa.binary_view()) - opts5 = ds.ParquetReadOptions(list_type=pa.LargeListType) + opts1 = ParquetReadOptions() + opts2 = ParquetReadOptions(dictionary_columns=['a', 'b']) + opts3 = ParquetReadOptions(coerce_int96_timestamp_unit="ms") + opts4 = ParquetReadOptions(binary_type=pa.binary_view()) + opts5 = ParquetReadOptions(list_type=pa.LargeListType) assert opts1.dictionary_columns == set() @@ -984,37 +997,37 @@ def test_parquet_read_options(): @pytest.mark.parquet def test_parquet_file_format_read_options(): - pff1 = ds.ParquetFileFormat() - pff2 = ds.ParquetFileFormat(dictionary_columns={'a'}) - pff3 = ds.ParquetFileFormat(coerce_int96_timestamp_unit="s") - pff4 = ds.ParquetFileFormat(binary_type=pa.binary_view()) - pff5 = ds.ParquetFileFormat(list_type=pa.LargeListType) - - assert pff1.read_options == ds.ParquetReadOptions() - assert pff2.read_options == ds.ParquetReadOptions(dictionary_columns=['a']) - assert pff3.read_options == ds.ParquetReadOptions( + pff1 = ParquetFileFormat() + pff2 = ParquetFileFormat(dictionary_columns={'a'}) + pff3 = ParquetFileFormat(coerce_int96_timestamp_unit="s") + pff4 = ParquetFileFormat(binary_type=pa.binary_view()) + pff5 = ParquetFileFormat(list_type=pa.LargeListType) + + assert pff1.read_options == ParquetReadOptions() + assert pff2.read_options == ParquetReadOptions(dictionary_columns=['a']) + assert pff3.read_options == ParquetReadOptions( coerce_int96_timestamp_unit="s") - assert pff4.read_options == ds.ParquetReadOptions( + assert pff4.read_options == ParquetReadOptions( binary_type=pa.binary_view()) - assert pff5.read_options == ds.ParquetReadOptions( + assert pff5.read_options == ParquetReadOptions( list_type=pa.LargeListType) @pytest.mark.parquet def test_parquet_scan_options(): - opts1 = ds.ParquetFragmentScanOptions() - opts2 = ds.ParquetFragmentScanOptions(buffer_size=4096) - opts3 
= ds.ParquetFragmentScanOptions( + opts1 = ParquetFragmentScanOptions() + opts2 = ParquetFragmentScanOptions(buffer_size=4096) + opts3 = ParquetFragmentScanOptions( buffer_size=2**13, use_buffered_stream=True) - opts4 = ds.ParquetFragmentScanOptions(buffer_size=2**13, pre_buffer=False) - opts5 = ds.ParquetFragmentScanOptions( + opts4 = ParquetFragmentScanOptions(buffer_size=2**13, pre_buffer=False) + opts5 = ParquetFragmentScanOptions( thrift_string_size_limit=123456, thrift_container_size_limit=987654,) - opts6 = ds.ParquetFragmentScanOptions( + opts6 = ParquetFragmentScanOptions( page_checksum_verification=True) cache_opts = pa.CacheOptions( hole_size_limit=2**10, range_size_limit=8*2**10, lazy=True) - opts7 = ds.ParquetFragmentScanOptions(pre_buffer=True, cache_options=cache_opts) + opts7 = ParquetFragmentScanOptions(pre_buffer=True, cache_options=cache_opts) assert opts1.use_buffered_stream is False assert opts1.buffer_size == 2**13 @@ -1076,16 +1089,16 @@ def test_file_format_pickling(pickle_module): use_threads=False, block_size=14)), ] try: - formats.append(ds.OrcFileFormat()) + formats.append(OrcFileFormat()) except ImportError: pass if pq is not None: formats.extend([ - ds.ParquetFileFormat(), - ds.ParquetFileFormat(dictionary_columns={'a'}), - ds.ParquetFileFormat(use_buffered_stream=True), - ds.ParquetFileFormat( + ParquetFileFormat(), + ParquetFileFormat(dictionary_columns={'a'}), + ParquetFileFormat(use_buffered_stream=True), + ParquetFileFormat( use_buffered_stream=True, buffer_size=4096, thrift_string_size_limit=123, @@ -1114,8 +1127,8 @@ def test_fragment_scan_options_pickling(pickle_module): if pq is not None: options.extend([ - ds.ParquetFragmentScanOptions(buffer_size=4096), - ds.ParquetFragmentScanOptions(pre_buffer=True), + ParquetFragmentScanOptions(buffer_size=4096), + ParquetFragmentScanOptions(pre_buffer=True), ]) for option in options: @@ -1132,8 +1145,8 @@ def test_fragment_scan_options_pickling(pickle_module): @pytest.mark.parametrize('pre_buffer', [False, True]) @pytest.mark.parquet def test_filesystem_factory(mockfs, paths_or_selector, pre_buffer): - format = ds.ParquetFileFormat( - read_options=ds.ParquetReadOptions(dictionary_columns={"str"}), + format = ParquetFileFormat( + read_options=ParquetReadOptions(dictionary_columns={"str"}), pre_buffer=pre_buffer ) @@ -1205,7 +1218,7 @@ def test_filesystem_factory(mockfs, paths_or_selector, pre_buffer): @pytest.mark.parquet def test_make_fragment(multisourcefs): - parquet_format = ds.ParquetFileFormat() + parquet_format = ParquetFileFormat() dataset = ds.dataset('/plain', filesystem=multisourcefs, format=parquet_format) @@ -1216,7 +1229,7 @@ def test_make_fragment(multisourcefs): row_group_fragment = parquet_format.make_fragment(path, multisourcefs, row_groups=[0]) for f in [fragment, row_group_fragment]: - assert isinstance(f, ds.ParquetFileFragment) + assert isinstance(f, ParquetFileFragment) assert f.path == path assert isinstance(f.filesystem, type(multisourcefs)) assert row_group_fragment.row_groups == [0] @@ -1232,7 +1245,7 @@ def test_make_fragment_with_size(s3_example_simple): """ table, path, fs, uri, host, port, access_key, secret_key = s3_example_simple - file_format = ds.ParquetFileFormat() + file_format = ParquetFileFormat() paths = [path] fragments = [file_format.make_fragment(path, fs) @@ -1339,8 +1352,8 @@ def test_make_parquet_fragment_from_buffer(dataset_reader, pickle_module): arrays[1], arrays[2].dictionary_encode() ] - dictionary_format = ds.ParquetFileFormat( - 
read_options=ds.ParquetReadOptions( + dictionary_format = ParquetFileFormat( + read_options=ParquetReadOptions( dictionary_columns=['alpha', 'animal'] ), use_buffered_stream=True, @@ -1348,7 +1361,7 @@ def test_make_parquet_fragment_from_buffer(dataset_reader, pickle_module): ) cases = [ - (arrays, ds.ParquetFileFormat()), + (arrays, ParquetFileFormat()), (dictionary_arrays, dictionary_format) ] for arrays, format_ in cases: @@ -1952,7 +1965,7 @@ def test_fragments_repr(tempdir, dataset): "pickled", [lambda x, m: x, lambda x, m: m.loads(m.dumps(x))]) def test_partitioning_factory(mockfs, pickled, pickle_module): paths_or_selector = fs.FileSelector('subdir', recursive=True) - format = ds.ParquetFileFormat() + format = ParquetFileFormat() options = ds.FileSystemFactoryOptions('subdir') partitioning_factory = ds.DirectoryPartitioning.discover(['group', 'key']) @@ -1987,7 +2000,7 @@ def test_partitioning_factory(mockfs, pickled, pickle_module): def test_partitioning_factory_dictionary(mockfs, infer_dictionary, pickled, pickle_module): paths_or_selector = fs.FileSelector('subdir', recursive=True) - format = ds.ParquetFileFormat() + format = ParquetFileFormat() options = ds.FileSystemFactoryOptions('subdir') partitioning_factory = ds.DirectoryPartitioning.discover( @@ -2595,12 +2608,12 @@ def test_construct_from_invalid_sources_raise(multisourcefs): child1 = ds.FileSystemDatasetFactory( multisourcefs, fs.FileSelector('/plain'), - format=ds.ParquetFileFormat() + format=ParquetFileFormat() ) child2 = ds.FileSystemDatasetFactory( multisourcefs, fs.FileSelector('/schema'), - format=ds.ParquetFileFormat() + format=ParquetFileFormat() ) batch1 = pa.RecordBatch.from_arrays([pa.array(range(10))], names=["a"]) batch2 = pa.RecordBatch.from_arrays([pa.array(range(10))], names=["b"]) @@ -3072,7 +3085,7 @@ def test_file_format_inspect_fsspec(tempdir): assert fsspec_fs.ls(tempdir)[0].endswith("data.parquet") # inspect using dataset file format - format = ds.ParquetFileFormat() + format = ParquetFileFormat() # manually creating a PyFileSystem instead of using fs._ensure_filesystem # which would convert an fsspec local filesystem to a native one filesystem = fs.PyFileSystem(fs.FSSpecHandler(fsspec_fs)) @@ -3159,7 +3172,7 @@ def test_filter_compute_expression(tempdir, dataset_reader): def test_dataset_union(multisourcefs): child = ds.FileSystemDatasetFactory( multisourcefs, fs.FileSelector('/plain'), - format=ds.ParquetFileFormat() + format=ParquetFileFormat() ) factory = ds.UnionDatasetFactory([child]) @@ -3382,7 +3395,7 @@ def test_orc_format(tempdir, dataset_reader): path = str(tempdir / 'test.orc') orc.write_table(table, path) - dataset = ds.dataset(path, format=ds.OrcFileFormat()) + dataset = ds.dataset(path, format=OrcFileFormat()) fragments = list(dataset.get_fragments()) assert isinstance(fragments[0], ds.FileFragment) result = dataset_reader.to_table(dataset) @@ -3456,7 +3469,7 @@ def test_orc_writer_not_implemented_for_dataset(): pa.table({"a": range(10)}), format='orc', base_dir='/tmp' ) - of = ds.OrcFileFormat() + of = OrcFileFormat() with pytest.raises( NotImplementedError, match="Writing datasets not yet implemented for this file format" @@ -4922,7 +4935,7 @@ def test_write_dataset_parquet(tempdir): # using custom options for version in ["1.0", "2.4", "2.6"]: - format = ds.ParquetFileFormat() + format = ParquetFileFormat() opts = format.make_write_options(version=version) assert " should error is dataset was properly encrypted - pformat = pa.dataset.ParquetFileFormat() + pformat = 
ParquetFileFormat() with pytest.raises(IOError, match=r"no decryption"): ds.dataset("sample_dataset", format=pformat, filesystem=mockfs) # set decryption config for parquet fragment scan options - pq_scan_opts = ds.ParquetFragmentScanOptions( + pq_scan_opts = ParquetFragmentScanOptions( decryption_config=parquet_decryption_cfg ) - pformat = pa.dataset.ParquetFileFormat(default_fragment_scan_options=pq_scan_opts) + pformat = ParquetFileFormat(default_fragment_scan_options=pq_scan_opts) dataset = ds.dataset("sample_dataset", format=pformat, filesystem=mockfs) assert table.equals(dataset.to_table()) @@ -144,11 +153,11 @@ def test_dataset_encryption_decryption(): # set decryption properties for parquet fragment scan options decryption_properties = crypto_factory.file_decryption_properties( kms_connection_config, decryption_config) - pq_scan_opts = ds.ParquetFragmentScanOptions( + pq_scan_opts = ParquetFragmentScanOptions( decryption_properties=decryption_properties ) - pformat = pa.dataset.ParquetFileFormat(default_fragment_scan_options=pq_scan_opts) + pformat = ParquetFileFormat(default_fragment_scan_options=pq_scan_opts) dataset = ds.dataset("sample_dataset", format=pformat, filesystem=mockfs) assert table.equals(dataset.to_table()) @@ -163,7 +172,7 @@ def test_write_dataset_parquet_without_encryption(): # Set the encryption configuration using ParquetFileFormat # and make_write_options - pformat = pa.dataset.ParquetFileFormat() + pformat = ParquetFileFormat() with pytest.raises(NotImplementedError): _ = pformat.make_write_options(encryption_config="some value") @@ -201,14 +210,14 @@ def unwrap_key(self, wrapped_key: bytes, _: str) -> bytes: plaintext_footer=False, data_key_length_bits=128, ) - pqe_config = ds.ParquetEncryptionConfig( + pqe_config = ParquetEncryptionConfig( crypto_factory, kms_config, encryption_config ) - pqd_config = ds.ParquetDecryptionConfig( + pqd_config = ParquetDecryptionConfig( crypto_factory, kms_config, pe.DecryptionConfiguration() ) - scan_options = ds.ParquetFragmentScanOptions(decryption_config=pqd_config) - file_format = ds.ParquetFileFormat(default_fragment_scan_options=scan_options) + scan_options = ParquetFragmentScanOptions(decryption_config=pqd_config) + file_format = ParquetFileFormat(default_fragment_scan_options=scan_options) write_options = file_format.make_write_options(encryption_config=pqe_config) file_decryption_properties = crypto_factory.file_decryption_properties(kms_config) diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py index 6b35822017b..c4631903c1a 100644 --- a/python/pyarrow/tests/test_feather.py +++ b/python/pyarrow/tests/test_feather.py @@ -63,7 +63,7 @@ def compression(request): yield request.param -TEST_FILES = None +TEST_FILES = [] def setup_module(module): diff --git a/python/pyarrow/tests/test_jvm.py b/python/pyarrow/tests/test_jvm.py index 876c05d740a..51f259e4bd5 100644 --- a/python/pyarrow/tests/test_jvm.py +++ b/python/pyarrow/tests/test_jvm.py @@ -43,7 +43,7 @@ def root_allocator(): 'POM:version', namespaces={ 'POM': 'http://maven.apache.org/POM/4.0.0' - }).text + }).text # type: ignore[possibly-unbound-attribute] jar_path = os.path.join( arrow_dir, 'java', 'tools', 'target', f'arrow-tools-{version}-jar-with-dependencies.jar') diff --git a/python/pyarrow/tests/wsgi_examples.py b/python/pyarrow/tests/wsgi_examples.py index 440b107abe5..1fafa852dc6 100644 --- a/python/pyarrow/tests/wsgi_examples.py +++ b/python/pyarrow/tests/wsgi_examples.py @@ -28,7 +28,7 @@ def application(env, 
start_response): # See test_fs::test_uwsgi_integration start_response('200 OK', [('Content-Type', 'text/html')]) # flake8: noqa - fs = pyarrow.fs.S3FileSystem() + fs = pyarrow.fs.S3FileSystem() # type: ignore[possibly-unbound-attribute] return [b"Hello World\n"] else: start_response('404 Not Found', [('Content-Type', 'text/html')]) diff --git a/python/pyproject.toml b/python/pyproject.toml index 7aaf602966e..8e7f14d3c46 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -107,7 +107,7 @@ fallback_version = '22.0.0a0' #no-matching-overload = "ignore" #non-subscriptable = "ignore" #not-iterable = "ignore" -possibly-unbound-attribute = "ignore" +#possibly-unbound-attribute = "ignore" #possibly-unbound-import = "ignore" #too-many-positional-arguments = "ignore" #unknown-argument = "ignore" diff --git a/python/setup.py b/python/setup.py index d037b82f4ad..f74824d5e6e 100755 --- a/python/setup.py +++ b/python/setup.py @@ -43,9 +43,9 @@ # We can't use sys.platform in a cross-compiling situation # as here it may be set to the host not target platform is_emscripten = ( - sysconfig.get_config_var("SOABI") - # type: ignore[possibly-unbound] - and sysconfig.get_config_var("SOABI").find("emscripten") != -1 + sysconfig.get_config_var("SOABI") and + sysconfig.get_config_var("SOABI").find("emscripten") != -1 \ + # type: ignore[possibly-unbound-attribute] ) @@ -254,9 +254,9 @@ def _run_cmake(self): # Detect if we built elsewhere if os.path.isfile('CMakeCache.txt'): cachefile = open('CMakeCache.txt', 'r') - cachedir = re.search('CMAKE_CACHEFILE_DIR:INTERNAL=(.*)', - # type: ignore[possibly-unbound-attribute] - cachefile.read()).group(1) + cachedir = re.search( # type: ignore[possibly-unbound-attribute] + 'CMAKE_CACHEFILE_DIR:INTERNAL=(.*)', + cachefile.read()).group(1) cachefile.close() if (cachedir != build_temp): build_base = pjoin(saved_cwd, build_cmd.build_base) From 291dd88b1ca2a6cb6453e4ebb0e8fd8b22d43e58 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Fri, 25 Jul 2025 14:19:22 +0200 Subject: [PATCH 30/32] Fix unresolved-attribute --- python/examples/flight/client.py | 2 +- python/pyarrow/interchange/from_dataframe.py | 11 +- python/pyarrow/pandas_compat.py | 78 +- python/pyarrow/tests/parquet/test_pandas.py | 3 +- python/pyarrow/tests/test_acero.py | 21 +- python/pyarrow/tests/test_array.py | 24 +- python/pyarrow/tests/test_cffi.py | 3 +- python/pyarrow/tests/test_compute.py | 1093 +++++++++--------- python/pyarrow/tests/test_csv.py | 3 +- python/pyarrow/tests/test_cuda.py | 5 +- python/pyarrow/tests/test_dataset.py | 30 +- python/pyarrow/tests/test_exec_plan.py | 9 +- python/pyarrow/tests/test_gdb.py | 8 +- python/pyarrow/tests/test_io.py | 13 +- python/pyarrow/tests/test_ipc.py | 10 +- python/pyarrow/tests/test_pandas.py | 18 +- python/pyarrow/tests/test_scalars.py | 2 +- python/pyarrow/tests/test_schema.py | 4 +- python/pyarrow/tests/test_sparse_tensor.py | 11 +- python/pyarrow/tests/test_strategies.py | 16 +- python/pyarrow/tests/test_substrait.py | 46 +- python/pyarrow/tests/test_table.py | 19 +- python/pyarrow/tests/test_types.py | 52 +- python/pyproject.toml | 2 +- python/scripts/test_leak.py | 2 +- 25 files changed, 777 insertions(+), 708 deletions(-) diff --git a/python/examples/flight/client.py b/python/examples/flight/client.py index 75976674bf2..8abce1ae8c8 100644 --- a/python/examples/flight/client.py +++ b/python/examples/flight/client.py @@ -70,7 +70,7 @@ def do_action(args, client, connection_args={}): print('Running action', args.action_type) for result in 
client.do_action(action): print("Got result", result.body.to_pybytes()) - except pyarrow.lib.ArrowIOError as e: + except pyarrow.lib.ArrowIOError as e: # type: ignore[unresolved-attribute] print("Error calling action:", e) diff --git a/python/pyarrow/interchange/from_dataframe.py b/python/pyarrow/interchange/from_dataframe.py index 80ddc8fa024..106c582c22b 100644 --- a/python/pyarrow/interchange/from_dataframe.py +++ b/python/pyarrow/interchange/from_dataframe.py @@ -32,6 +32,7 @@ import re import pyarrow.compute as pc +from pyarrow.compute import equal, invert, is_nan # type: ignore[unresolved-attribute] from pyarrow.interchange.column import Dtype @@ -513,7 +514,7 @@ def validity_buffer_from_mask( offset=offset) if sentinel_val == 1: - mask_bool = pc.invert(mask_bool) + mask_bool = invert(mask_bool) return mask_bool.buffers()[1] @@ -583,8 +584,8 @@ def validity_buffer_nan_sentinel( [None, data_pa_buffer], offset=offset, ) - mask = pc.is_nan(pyarrow_data) - mask = pc.invert(mask) + mask = is_nan(pyarrow_data) + mask = invert(mask) return mask.buffers()[1] # Check for sentinel values @@ -603,8 +604,8 @@ def validity_buffer_nan_sentinel( length, [None, data_pa_buffer], offset=offset) - sentinel_arr = pc.equal(pyarrow_data, sentinel_val) - mask_bool = pc.invert(sentinel_arr) + sentinel_arr = equal(pyarrow_data, sentinel_val) + mask_bool = invert(sentinel_arr) return mask_bool.buffers()[1] elif null_kind == ColumnNullType.NON_NULLABLE: diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index 131025e60c0..970126da64c 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -34,6 +34,12 @@ import numpy as np except ImportError: pass + +try: + from pyarrow import lib # type: ignore[unresolved-attribute] +except ImportError: + pass + import pyarrow as pa from pyarrow.lib import _pandas_api, frombytes, is_threading_enabled # type: ignore[unresolved_import] # noqa @@ -48,26 +54,26 @@ def get_logical_type_map(): if not _logical_type_map: _logical_type_map.update({ - pa.lib.Type_NA: 'empty', - pa.lib.Type_BOOL: 'bool', - pa.lib.Type_INT8: 'int8', - pa.lib.Type_INT16: 'int16', - pa.lib.Type_INT32: 'int32', - pa.lib.Type_INT64: 'int64', - pa.lib.Type_UINT8: 'uint8', - pa.lib.Type_UINT16: 'uint16', - pa.lib.Type_UINT32: 'uint32', - pa.lib.Type_UINT64: 'uint64', - pa.lib.Type_HALF_FLOAT: 'float16', - pa.lib.Type_FLOAT: 'float32', - pa.lib.Type_DOUBLE: 'float64', - pa.lib.Type_DATE32: 'date', - pa.lib.Type_DATE64: 'date', - pa.lib.Type_TIME32: 'time', - pa.lib.Type_TIME64: 'time', - pa.lib.Type_BINARY: 'bytes', - pa.lib.Type_FIXED_SIZE_BINARY: 'bytes', - pa.lib.Type_STRING: 'unicode', + lib.Type_NA: 'empty', + lib.Type_BOOL: 'bool', + lib.Type_INT8: 'int8', + lib.Type_INT16: 'int16', + lib.Type_INT32: 'int32', + lib.Type_INT64: 'int64', + lib.Type_UINT8: 'uint8', + lib.Type_UINT16: 'uint16', + lib.Type_UINT32: 'uint32', + lib.Type_UINT64: 'uint64', + lib.Type_HALF_FLOAT: 'float16', + lib.Type_FLOAT: 'float32', + lib.Type_DOUBLE: 'float64', + lib.Type_DATE32: 'date', + lib.Type_DATE64: 'date', + lib.Type_TIME32: 'time', + lib.Type_TIME64: 'time', + lib.Type_BINARY: 'bytes', + lib.Type_FIXED_SIZE_BINARY: 'bytes', + lib.Type_STRING: 'unicode', }) return _logical_type_map @@ -78,11 +84,11 @@ def get_logical_type(arrow_type): try: return logical_type_map[arrow_type.id] except KeyError: - if isinstance(arrow_type, pa.lib.DictionaryType): + if isinstance(arrow_type, lib.DictionaryType): return 'categorical' - elif isinstance(arrow_type, pa.lib.ListType): + 
elif isinstance(arrow_type, lib.ListType): return f'list[{get_logical_type(arrow_type.value_type)}]' - elif isinstance(arrow_type, pa.lib.TimestampType): + elif isinstance(arrow_type, lib.TimestampType): return 'datetimetz' if arrow_type.tz is not None else 'datetime' elif pa.types.is_decimal(arrow_type): return 'decimal' @@ -139,7 +145,7 @@ def get_extension_dtype_info(column): } physical_dtype = str(cats.codes.dtype) elif hasattr(dtype, 'tz'): - metadata = {'timezone': pa.lib.tzinfo_to_string(dtype.tz)} + metadata = {'timezone': lib.tzinfo_to_string(dtype.tz)} physical_dtype = 'datetime64[ns]' else: metadata = None @@ -569,7 +575,7 @@ def dataframe_to_types(df, preserve_index, columns=None): type_ = pa.array(empty, from_pandas=True).type else: values, type_ = get_datetimetz_type(values, c.dtype, None) - type_ = pa.lib._ndarray_to_arrow_type(values, type_) + type_ = lib._ndarray_to_arrow_type(values, type_) if type_ is None: type_ = pa.array(c, from_pandas=True).type types.append(type_) @@ -755,10 +761,8 @@ def _reconstruct_block(item, columns=None, extension_columns=None, return_block= # create ExtensionBlock arr = item['py_array'] assert len(placement) == 1 - name = columns.get(placement[0], None) \ - # type: ignore[possibly-unbound-attribute] - pandas_dtype = extension_columns.get(name, None) \ - # type: ignore[possibly-unbound-attribute] + name = columns[placement[0]] # type: ignore[non-subscriptable] + pandas_dtype = extension_columns[name] # type: ignore[non-subscriptable] if not hasattr(pandas_dtype, '__from_arrow__'): raise ValueError("This column does not support to be converted " "to a pandas ExtensionArray") @@ -775,7 +779,7 @@ def _reconstruct_block(item, columns=None, extension_columns=None, return_block= def make_datetimetz(unit, tz): if _pandas_api.is_v1(): unit = 'ns' # ARROW-3789: Coerce date/timestamp types to datetime64[ns] - tz = pa.lib.string_to_tzinfo(tz) + tz = lib.string_to_tzinfo(tz) return _pandas_api.datetimetz_type(unit, tz=tz) @@ -805,8 +809,8 @@ def table_to_dataframe( columns = _deserialize_column_index(table, all_columns, column_indexes) column_names = table.column_names - result = pa.lib.table_to_blocks(options, table, categories, - list(ext_columns_dtypes.keys())) + result = lib.table_to_blocks(options, table, categories, + list(ext_columns_dtypes.keys())) if _pandas_api.is_ge_v3(): from pandas.api.internals import create_dataframe_from_blocks \ # type: ignore[unresolved_import] @@ -830,8 +834,8 @@ def table_to_dataframe( axes = [columns, index] mgr = BlockManager(blocks, axes) if _pandas_api.is_ge_v21(): - # type: ignore[unresolved-attribute] - df = DataFrame._from_mgr(mgr, mgr.axes) + df = DataFrame._from_mgr(mgr, mgr.axes) \ + # type: ignore[unresolved-attribute] else: df = DataFrame(mgr) return df @@ -1166,7 +1170,7 @@ def _reconstruct_columns_from_metadata(columns, column_indexes): level = level.map(encoder) # ARROW-13756: if index is timezone aware DataTimeIndex elif pandas_dtype == "datetimetz": - tz = pa.lib.string_to_tzinfo( + tz = lib.string_to_tzinfo( column_indexes[0]['metadata']['timezone']) level = pd.to_datetime(level, utc=True).tz_convert(tz) if _pandas_api.is_ge_v3(): @@ -1234,7 +1238,7 @@ def _add_any_metadata(table, pandas_metadata): if idx != -1: if col_meta['pandas_type'] == 'datetimetz': col = table[idx] - if not isinstance(col.type, pa.lib.TimestampType): + if not isinstance(col.type, lib.TimestampType): continue metadata = col_meta['metadata'] if not metadata: @@ -1273,7 +1277,7 @@ def make_tz_aware(series, tz): """ Make a 
datetime64 Series timezone-aware for the given tz """ - tz = pa.lib.string_to_tzinfo(tz) + tz = lib.string_to_tzinfo(tz) series = (series.dt.tz_localize('utc') .dt.tz_convert(tz)) return series diff --git a/python/pyarrow/tests/parquet/test_pandas.py b/python/pyarrow/tests/parquet/test_pandas.py index 5f9fdc7896d..edc7a2610eb 100644 --- a/python/pyarrow/tests/parquet/test_pandas.py +++ b/python/pyarrow/tests/parquet/test_pandas.py @@ -260,7 +260,8 @@ def test_pandas_parquet_configuration_options(tempdir): for compression in ['NONE', 'SNAPPY', 'GZIP', 'LZ4', 'ZSTD']: if (compression != 'NONE' and - not pa.lib.Codec.is_available(compression)): + not pa.lib.Codec.is_available(compression)): \ + # type: ignore[unresolved-attribute] continue _write_table(arrow_table, filename, version='2.6', compression=compression) diff --git a/python/pyarrow/tests/test_acero.py b/python/pyarrow/tests/test_acero.py index ac58792cd50..bbec49c5360 100644 --- a/python/pyarrow/tests/test_acero.py +++ b/python/pyarrow/tests/test_acero.py @@ -19,7 +19,8 @@ import pyarrow as pa import pyarrow.compute as pc -from pyarrow.compute import field +from pyarrow.compute import field, multiply, sum, equal, all as pc_all \ + # type: ignore[unresolved-import] try: from pyarrow.acero import ( @@ -121,7 +122,7 @@ def test_filter(table_source): ]) def test_filter_all_rows(source): # GH-46057: filtering all rows should return empty RecordBatch with same schema - result_expr = source.filter(pc.field("number") < 0) + result_expr = source.filter(field("number") < 0) assert result_expr.num_rows == 0 assert type(result_expr) is type(source) @@ -138,7 +139,7 @@ def test_project(table_source): # default name from expression decl = Declaration.from_sequence([ table_source, - Declaration("project", ProjectNodeOptions([pc.multiply(field("a"), 2)])) + Declaration("project", ProjectNodeOptions([multiply(field("a"), 2)])) ]) result = decl.to_table() assert result.schema.names == ["multiply(a, 2)"] @@ -147,7 +148,7 @@ def test_project(table_source): # provide name decl = Declaration.from_sequence([ table_source, - Declaration("project", ProjectNodeOptions([pc.multiply(field("a"), 2)], ["a2"])) + Declaration("project", ProjectNodeOptions([multiply(field("a"), 2)], ["a2"])) ]) result = decl.to_table() assert result.schema.names == ["a2"] @@ -155,12 +156,12 @@ def test_project(table_source): # input validation with pytest.raises(ValueError): - ProjectNodeOptions([pc.multiply(field("a"), 2)], ["a2", "b2"]) + ProjectNodeOptions([multiply(field("a"), 2)], ["a2", "b2"]) # no scalar expression decl = Declaration.from_sequence([ table_source, - Declaration("project", ProjectNodeOptions([pc.sum(field("a"))])) + Declaration("project", ProjectNodeOptions([sum(field("a"))])) ]) with pytest.raises(ValueError, match="cannot Execute non-scalar expression"): _ = decl.to_table() @@ -370,7 +371,7 @@ def test_hash_join_with_residual_filter(): join_opts = HashJoinNodeOptions( "inner", left_keys="key", right_keys="key", - filter_expression=pc.equal(pc.field('a'), 5)) + filter_expression=equal(field('a'), 5)) joined = Declaration( "hashjoin", options=join_opts, inputs=[left_source, right_source]) result = joined.to_table() @@ -382,7 +383,7 @@ def test_hash_join_with_residual_filter(): # test filter expression referencing columns from both side join_opts = HashJoinNodeOptions( "left outer", left_keys="key", right_keys="key", - filter_expression=pc.equal(pc.field("a"), 5) | pc.equal(pc.field("b"), 10) + filter_expression=equal(field("a"), 5) | equal(field("b"), 10) ) 
joined = Declaration( "hashjoin", options=join_opts, inputs=[left_source, right_source]) @@ -487,10 +488,10 @@ def test_scan(tempdir): # projection scan option - scan_opts = ScanNodeOptions(dataset, columns={"a2": pc.multiply(field("a"), 2)}) + scan_opts = ScanNodeOptions(dataset, columns={"a2": multiply(field("a"), 2)}) decl = Declaration("scan", scan_opts) result = decl.to_table() # "a" is included in the result (needed later on for the actual projection) assert result["a"].to_pylist() == [1, 2, 3] # "b" is still included, but without data as it will be removed by the projection - assert pc.all(result["b"].is_null()).as_py() + assert pc_all(result["b"].is_null()).as_py() diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 92db9fc177a..6ab39dd8716 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -31,6 +31,10 @@ import numpy as np except ImportError: pass +try: + from pyarrow import lib # type: ignore[unresolved-import] +except ImportError: + pass import pyarrow as pa import pyarrow.tests.strategies as past @@ -323,7 +327,7 @@ def test_asarray(): np_arr = np.asarray([_ for _ in arr]) assert np_arr.tolist() == [0, 1, 2, 3] assert np_arr.dtype == np.dtype('O') - assert isinstance(np_arr[0], pa.lib.Int64Value) + assert isinstance(np_arr[0], lib.Int64Value) # Calling with the arrow array gives back an array with 'int64' dtype np_arr = np.asarray(arr) @@ -1908,9 +1912,9 @@ def test_cast_from_null(): out_types = [ pa.union([pa.field('a', pa.binary(10)), - pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE), + pa.field('b', pa.string())], mode=lib.UnionMode_DENSE), pa.union([pa.field('a', pa.binary(10)), - pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE), + pa.field('b', pa.string())], mode=lib.UnionMode_SPARSE), ] in_arr = pa.array(in_data, type=pa.null()) for out_type in out_types: @@ -3223,8 +3227,8 @@ def test_struct_array_field(): x2 = a.field('x') y2 = a.field('y') - assert isinstance(x0, pa.lib.Int16Array) - assert isinstance(y1, pa.lib.FloatArray) + assert isinstance(x0, lib.Int16Array) + assert isinstance(y1, lib.FloatArray) assert x0.equals(pa.array([1, 3, 5], type=pa.int16())) assert y0.equals(pa.array([2.5, 4.5, 6.5], type=pa.float32())) assert x0.equals(x1) @@ -3258,8 +3262,8 @@ def test_struct_array_flattened_field(): x2 = a._flattened_field('x') y2 = a._flattened_field('y') - assert isinstance(x0, pa.lib.Int16Array) - assert isinstance(y1, pa.lib.FloatArray) + assert isinstance(x0, lib.Int16Array) + assert isinstance(y1, lib.FloatArray) assert x0.equals(pa.array([1, None, 5], type=pa.int16())) assert y0.equals(pa.array([2.5, None, 6.5], type=pa.float32())) assert x0.equals(x1) @@ -3307,7 +3311,7 @@ def test_empty_cast(): # ARROW-4766: Ensure that supported types conversion don't segfault # on empty arrays of common types pa.array([], type=t1).cast(t2) - except (pa.lib.ArrowNotImplementedError, pa.ArrowInvalid): + except (lib.ArrowNotImplementedError, pa.ArrowInvalid): continue @@ -4103,7 +4107,7 @@ def test_list_view_from_arrays_fails(list_array_type, list_type_factory): mask = pa.array([False, False, True]) # Ambiguous to specify both validity map and offsets or sizes with nulls - with pytest.raises(pa.lib.ArrowInvalid): + with pytest.raises(lib.ArrowInvalid): list_array_type.from_arrays(offsets, sizes, values, mask=mask) offsets = [0, 1, 1] @@ -4111,7 +4115,7 @@ def test_list_view_from_arrays_fails(list_array_type, list_type_factory): array_slice = array[1:] # List offsets and sizes 
must not be slices if a validity map is specified - with pytest.raises(pa.lib.ArrowInvalid): + with pytest.raises(lib.ArrowInvalid): list_array_type.from_arrays( array_slice.offsets, array_slice.sizes, array_slice.values, mask=array_slice.is_null()) diff --git a/python/pyarrow/tests/test_cffi.py b/python/pyarrow/tests/test_cffi.py index 60f3a5621b9..306225dbf69 100644 --- a/python/pyarrow/tests/test_cffi.py +++ b/python/pyarrow/tests/test_cffi.py @@ -676,7 +676,8 @@ def test_roundtrip_reader_capsule(constructor): obj = constructor(schema, batches) bad_schema = pa.schema({'ints': pa.int32()}) - with pytest.raises(pa.lib.ArrowTypeError, match="Field 0 cannot be cast"): + with pytest.raises(pa.lib.ArrowTypeError, match="Field 0 cannot be cast"): \ + # type: ignore[unresolved-attribute] obj.__arrow_c_stream__(bad_schema.__arrow_c_schema__()) # Can work with matching schema diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 4e39383473c..e9afe643994 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -27,6 +27,41 @@ import random import sys import textwrap +from pyarrow import lib # type: ignore[unresolved-import] +from pyarrow.compute import \ + sum as pc_sum, mode, variance, skew, kurtosis, count_substring, \ + count_substring_regex, find_substring, find_substring_regex, match_like, \ + match_substring, match_substring_regex, utf8_trim_whitespace, \ + ascii_trim_whitespace, utf8_trim, utf8_slice_codeunits, binary_slice, \ + split_pattern, utf8_split_whitespace, ascii_split_whitespace, \ + split_pattern_regex, any as pc_any, all as pc_all, filter, min_max, \ + choose, utf8_is_printable, ascii_center, ascii_lpad, ascii_rpad, utf8_center, \ + utf8_lpad, utf8_rpad, binary_replace_slice, utf8_replace_slice, \ + replace_substring, replace_substring_regex, extract_regex, extract_regex_span, \ + binary_join, binary_join_element_wise, not_equal, less, less_equal, greater, \ + greater_equal, equal, round_to_multiple, round_binary, is_null, or_kleene, \ + is_valid, and_, and_kleene, or_, xor, invert, dictionary_decode, \ + dictionary_encode, strptime, strftime, year as pc_year, \ + is_leap_year as pc_is_leap_year, month as pc_month, day as pc_day, \ + day_of_year as pc_day_of_year, iso_year as pc_iso_year, iso_week as pc_iso_week, \ + iso_calendar as pc_iso_calendar, quarter as pc_quarter, hour as pc_hour, \ + minute as pc_minute, second as pc_second, millisecond as pc_millisecond, \ + microsecond as pc_microsecond, nanosecond as pc_nanosecond, \ + subsecond as pc_subsecond, local_timestamp as pc_local_timestamp, \ + is_dst as pc_is_dst, day_of_week as pc_day_of_week, \ + week as pc_week, \ + assume_timezone as pc_assume_timezone, count, ceil_temporal, floor_temporal, \ + round_temporal, partition_nth_indices, select_k_unstable, array_sort_indices, \ + sort_indices, is_in, index_in, quantile, tdigest, cumulative_sum, \ + cumulative_prod, max_element_wise, min_element_wise, cumulative_min, \ + cumulative_max, map_lookup, struct_field, case_when, make_struct, list_element, \ + count_distinct, utf8_normalize, rank, rank_quantile, rank_normal, negate, \ + subtract, divide, multiply, power, sqrt, exp, cos, sin, tan, acos, atan, \ + asin, atan2, sinh, cosh, tanh, asinh, acosh, atanh, abs as pc_abs, sign, \ + bit_wise_not, bit_wise_and, \ + bit_wise_or, bit_wise_xor, is_nan, is_finite, coalesce, hour, round as pc_round, \ + add as pc_add, cast, list_slice, run_end_decode, run_end_encode, pairwise_diff, \ + 
pairwise_diff_checked, pivot_wider, winsorize # type: ignore[unresolved-import] try: import numpy as np @@ -324,36 +359,36 @@ def test_function_attributes(): def test_input_type_conversion(): # Automatic array conversion from Python - arr = pc.add([1, 2], [4, None]) + arr = pc_add([1, 2], [4, None]) assert arr.to_pylist() == [5, None] # Automatic scalar conversion from Python - arr = pc.add([1, 2], 4) + arr = pc_add([1, 2], 4) assert arr.to_pylist() == [5, 6] # Other scalar type - assert pc.equal(["foo", "bar", None], - "foo").to_pylist() == [True, False, None] + assert equal(["foo", "bar", None], + "foo").to_pylist() == [True, False, None] @pytest.mark.parametrize('arrow_type', numerical_arrow_types) def test_sum_array(arrow_type): arr = pa.array([1, 2, 3, 4], type=arrow_type) assert arr.sum().as_py() == 10 - assert pc.sum(arr).as_py() == 10 + assert pc_sum(arr).as_py() == 10 arr = pa.array([1, 2, 3, 4, None], type=arrow_type) assert arr.sum().as_py() == 10 - assert pc.sum(arr).as_py() == 10 + assert pc_sum(arr).as_py() == 10 arr = pa.array([None], type=arrow_type) assert arr.sum().as_py() is None # noqa: E711 - assert pc.sum(arr).as_py() is None # noqa: E711 + assert pc_sum(arr).as_py() is None # noqa: E711 assert arr.sum(min_count=0).as_py() == 0 - assert pc.sum(arr, min_count=0).as_py() == 0 + assert pc_sum(arr, min_count=0).as_py() == 0 arr = pa.array([], type=arrow_type) assert arr.sum().as_py() is None # noqa: E711 assert arr.sum(min_count=0).as_py() == 0 - assert pc.sum(arr, min_count=0).as_py() == 0 + assert pc_sum(arr, min_count=0).as_py() == 0 @pytest.mark.parametrize("arrow_type", [pa.decimal128(3, 2), pa.decimal256(3, 2)]) @@ -402,24 +437,24 @@ def test_sum_decimal_array(arrow_type): @pytest.mark.parametrize('arrow_type', numerical_arrow_types) def test_sum_chunked_array(arrow_type): arr = pa.chunked_array([pa.array([1, 2, 3, 4], type=arrow_type)]) - assert pc.sum(arr).as_py() == 10 + assert pc_sum(arr).as_py() == 10 arr = pa.chunked_array([ pa.array([1, 2], type=arrow_type), pa.array([3, 4], type=arrow_type) ]) - assert pc.sum(arr).as_py() == 10 + assert pc_sum(arr).as_py() == 10 arr = pa.chunked_array([ pa.array([1, 2], type=arrow_type), pa.array([], type=arrow_type), pa.array([3, 4], type=arrow_type) ]) - assert pc.sum(arr).as_py() == 10 + assert pc_sum(arr).as_py() == 10 arr = pa.chunked_array((), type=arrow_type) assert arr.num_chunks == 0 - assert pc.sum(arr).as_py() is None # noqa: E711 - assert pc.sum(arr, min_count=0).as_py() == 0 + assert pc_sum(arr).as_py() is None # noqa: E711 + assert pc_sum(arr, min_count=0).as_py() == 0 @pytest.mark.parametrize('arrow_type', [pa.decimal128(3, 2), pa.decimal256(3, 2)]) @@ -438,77 +473,77 @@ def test_sum_chunked_array_decimal_type(arrow_type): pa.array([Decimal("1.23"), Decimal("4.56")], type=arrow_type) ] ) - assert pc.sum(arr).as_py() == expected_sum - assert pc.sum(arr).type == max_precision_type + assert pc_sum(arr).as_py() == expected_sum + assert pc_sum(arr).type == max_precision_type arr = pa.chunked_array([ pa.array([Decimal("1.23")], type=arrow_type), pa.array([Decimal("4.56")], type=arrow_type) ]) - assert pc.sum(arr).as_py() == expected_sum - assert pc.sum(arr).type == max_precision_type + assert pc_sum(arr).as_py() == expected_sum + assert pc_sum(arr).type == max_precision_type arr = pa.chunked_array([ pa.array([Decimal("1.23")], type=arrow_type), pa.array([], type=arrow_type), pa.array([Decimal("4.56")], type=arrow_type) ]) - assert pc.sum(arr).as_py() == expected_sum - assert pc.sum(arr).type == max_precision_type + 
assert pc_sum(arr).as_py() == expected_sum + assert pc_sum(arr).type == max_precision_type arr = pa.chunked_array((), type=arrow_type) assert arr.num_chunks == 0 - assert pc.sum(arr).as_py() is None # noqa: E711 - assert pc.sum(arr).type == max_precision_type - assert pc.sum(arr, min_count=0).as_py() == zero - assert pc.sum(arr, min_count=0).type == max_precision_type + assert pc_sum(arr).as_py() is None # noqa: E711 + assert pc_sum(arr).type == max_precision_type + assert pc_sum(arr, min_count=0).as_py() == zero + assert pc_sum(arr, min_count=0).type == max_precision_type def test_mode_array(): # ARROW-9917 - arr = pa.array([1, 1, 3, 4, 3, 5], type='int64') - mode = pc.mode(arr) - assert len(mode) == 1 - assert mode[0].as_py() == {"mode": 1, "count": 2} - - mode = pc.mode(arr, n=2) - assert len(mode) == 2 - assert mode[0].as_py() == {"mode": 1, "count": 2} - assert mode[1].as_py() == {"mode": 3, "count": 2} - - arr = pa.array([], type='int64') - assert len(pc.mode(arr)) == 0 - - arr = pa.array([1, 1, 3, 4, 3, None], type='int64') - mode = pc.mode(arr, skip_nulls=False) - assert len(mode) == 0 - mode = pc.mode(arr, min_count=6) - assert len(mode) == 0 - mode = pc.mode(arr, skip_nulls=False, min_count=5) - assert len(mode) == 0 - - arr = pa.array([True, False]) - mode = pc.mode(arr, n=2) - assert len(mode) == 2 - assert mode[0].as_py() == {"mode": False, "count": 1} - assert mode[1].as_py() == {"mode": True, "count": 1} + data = pa.array([1, 1, 3, 4, 3, 5], type='int64') + arr = mode(data) + assert len(arr) == 1 + assert arr[0].as_py() == {"mode": 1, "count": 2} + + arr = mode(data, n=2) + assert len(arr) == 2 + assert arr[0].as_py() == {"mode": 1, "count": 2} + assert arr[1].as_py() == {"mode": 3, "count": 2} + + data = pa.array([], type='int64') + assert len(mode(data)) == 0 + + data = pa.array([1, 1, 3, 4, 3, None], type='int64') + arr = mode(data, skip_nulls=False) + assert len(arr) == 0 + arr = mode(data, min_count=6) + assert len(arr) == 0 + arr = mode(data, skip_nulls=False, min_count=5) + assert len(arr) == 0 + + data = pa.array([True, False]) + arr = mode(data, n=2) + assert len(arr) == 2 + assert arr[0].as_py() == {"mode": False, "count": 1} + assert arr[1].as_py() == {"mode": True, "count": 1} def test_mode_chunked_array(): # ARROW-9917 - arr = pa.chunked_array([pa.array([1, 1, 3, 4, 3, 5], type='int64')]) - mode = pc.mode(arr) - assert len(mode) == 1 - assert mode[0].as_py() == {"mode": 1, "count": 2} + data = pa.chunked_array([pa.array([1, 1, 3, 4, 3, 5], type='int64')]) + arr = mode(data) + assert len(arr) == 1 + assert arr[0].as_py() == {"mode": 1, "count": 2} - mode = pc.mode(arr, n=2) - assert len(mode) == 2 - assert mode[0].as_py() == {"mode": 1, "count": 2} - assert mode[1].as_py() == {"mode": 3, "count": 2} + arr = mode(data, n=2) + assert len(arr) == 2 + assert arr[0].as_py() == {"mode": 1, "count": 2} + assert arr[1].as_py() == {"mode": 3, "count": 2} arr = pa.chunked_array((), type='int64') assert arr.num_chunks == 0 - assert len(pc.mode(arr)) == 0 + assert len(mode(arr)) == 0 def test_empty_chunked_array(): @@ -521,23 +556,23 @@ def test_empty_chunked_array(): def test_variance(): data = [1, 2, 3, 4, 5, 6, 7, 8] - assert pc.variance(data).as_py() == 5.25 - assert pc.variance(data, ddof=0).as_py() == 5.25 - assert pc.variance(data, ddof=1).as_py() == 6.0 + assert variance(data).as_py() == 5.25 + assert variance(data, ddof=0).as_py() == 5.25 + assert variance(data, ddof=1).as_py() == 6.0 def test_skew(): data = [1, 1, None, 2] - assert pc.skew(data).as_py() == 
pytest.approx(0.707106781186548, rel=1e-10) - assert pc.skew(data, skip_nulls=False).as_py() is None - assert pc.skew(data, min_count=4).as_py() is None + assert skew(data).as_py() == pytest.approx(0.707106781186548, rel=1e-10) + assert skew(data, skip_nulls=False).as_py() is None + assert skew(data, min_count=4).as_py() is None def test_kurtosis(): data = [1, 1, None, 2] - assert pc.kurtosis(data).as_py() == pytest.approx(-1.5, rel=1e-10) - assert pc.kurtosis(data, skip_nulls=False).as_py() is None - assert pc.kurtosis(data, min_count=4).as_py() is None + assert kurtosis(data).as_py() == pytest.approx(-1.5, rel=1e-10) + assert kurtosis(data, skip_nulls=False).as_py() is None + assert kurtosis(data, min_count=4).as_py() is None @pytest.mark.parametrize("input, expected", ( @@ -550,8 +585,8 @@ def test_kurtosis(): ([1, 40], {'skew': None, 'kurtosis': None}), )) def test_unbiased_skew_and_kurtosis(input, expected): - arrow_skew = pc.skew(input, skip_nulls=True, biased=False) - arrow_kurtosis = pc.kurtosis(input, skip_nulls=True, biased=False) + arrow_skew = skew(input, skip_nulls=True, biased=False) + arrow_kurtosis = kurtosis(input, skip_nulls=True, biased=False) assert arrow_skew.as_py() == expected['skew'] assert arrow_kurtosis.as_py() == expected['kurtosis'] @@ -561,11 +596,11 @@ def test_count_substring(): (pa.large_string(), pa.int64())]: arr = pa.array(["ab", "cab", "abcab", "ba", "AB", None], type=ty) - result = pc.count_substring(arr, "ab") + result = count_substring(arr, "ab") expected = pa.array([1, 1, 2, 0, 0, None], type=offset) assert expected == result - result = pc.count_substring(arr, "ab", ignore_case=True) + result = count_substring(arr, "ab", ignore_case=True) expected = pa.array([1, 1, 2, 0, 1, None], type=offset) assert expected == result @@ -575,11 +610,11 @@ def test_count_substring_regex(): (pa.large_string(), pa.int64())]: arr = pa.array(["ab", "cab", "baAacaa", "ba", "AB", None], type=ty) - result = pc.count_substring_regex(arr, "a+") + result = count_substring_regex(arr, "a+") expected = pa.array([1, 1, 3, 1, 0, None], type=offset) assert expected.equals(result) - result = pc.count_substring_regex(arr, "a+", ignore_case=True) + result = count_substring_regex(arr, "a+", ignore_case=True) expected = pa.array([1, 1, 2, 1, 1, None], type=offset) assert expected.equals(result) @@ -587,61 +622,61 @@ def test_count_substring_regex(): def test_find_substring(): for ty in [pa.string(), pa.binary(), pa.large_string(), pa.large_binary()]: arr = pa.array(["ab", "cab", "ba", None], type=ty) - result = pc.find_substring(arr, "ab") + result = find_substring(arr, "ab") assert result.to_pylist() == [0, 1, -1, None] - result = pc.find_substring_regex(arr, "a?b") + result = find_substring_regex(arr, "a?b") assert result.to_pylist() == [0, 1, 0, None] arr = pa.array(["ab*", "cAB*", "ba", "aB?"], type=ty) - result = pc.find_substring(arr, "aB*", ignore_case=True) + result = find_substring(arr, "aB*", ignore_case=True) assert result.to_pylist() == [0, 1, -1, -1] - result = pc.find_substring_regex(arr, "a?b", ignore_case=True) + result = find_substring_regex(arr, "a?b", ignore_case=True) assert result.to_pylist() == [0, 1, 0, 0] def test_match_like(): arr = pa.array(["ab", "ba%", "ba", "ca%d", None]) - result = pc.match_like(arr, r"_a\%%") + result = match_like(arr, r"_a\%%") expected = pa.array([False, True, False, True, None]) assert expected.equals(result) arr = pa.array(["aB", "bA%", "ba", "ca%d", None]) - result = pc.match_like(arr, r"_a\%%", ignore_case=True) + result = 
match_like(arr, r"_a\%%", ignore_case=True) expected = pa.array([False, True, False, True, None]) assert expected.equals(result) - result = pc.match_like(arr, r"_a\%%", ignore_case=False) + result = match_like(arr, r"_a\%%", ignore_case=False) expected = pa.array([False, False, False, True, None]) assert expected.equals(result) def test_match_substring(): arr = pa.array(["ab", "abc", "ba", None]) - result = pc.match_substring(arr, "ab") + result = match_substring(arr, "ab") expected = pa.array([True, True, False, None]) assert expected.equals(result) arr = pa.array(["áB", "Ábc", "ba", None]) - result = pc.match_substring(arr, "áb", ignore_case=True) + result = match_substring(arr, "áb", ignore_case=True) expected = pa.array([True, True, False, None]) assert expected.equals(result) - result = pc.match_substring(arr, "áb", ignore_case=False) + result = match_substring(arr, "áb", ignore_case=False) expected = pa.array([False, False, False, None]) assert expected.equals(result) def test_match_substring_regex(): arr = pa.array(["ab", "abc", "ba", "c", None]) - result = pc.match_substring_regex(arr, "^a?b") + result = match_substring_regex(arr, "^a?b") expected = pa.array([True, True, True, False, None]) assert expected.equals(result) arr = pa.array(["aB", "Abc", "BA", "c", None]) - result = pc.match_substring_regex(arr, "^a?b", ignore_case=True) + result = match_substring_regex(arr, "^a?b", ignore_case=True) expected = pa.array([True, True, True, False, None]) assert expected.equals(result) - result = pc.match_substring_regex(arr, "^a?b", ignore_case=False) + result = match_substring_regex(arr, "^a?b", ignore_case=False) expected = pa.array([False, False, False, False, None]) assert expected.equals(result) @@ -649,21 +684,21 @@ def test_match_substring_regex(): def test_trim(): # \u3000 is unicode whitespace arr = pa.array([" foo", None, " \u3000foo bar \t"]) - result = pc.utf8_trim_whitespace(arr) + result = utf8_trim_whitespace(arr) expected = pa.array(["foo", None, "foo bar"]) assert expected.equals(result) arr = pa.array([" foo", None, " \u3000foo bar \t"]) - result = pc.ascii_trim_whitespace(arr) + result = ascii_trim_whitespace(arr) expected = pa.array(["foo", None, "\u3000foo bar"]) assert expected.equals(result) arr = pa.array([" foo", None, " \u3000foo bar \t"]) - result = pc.utf8_trim(arr, characters=' f\u3000') + result = utf8_trim(arr, characters=' f\u3000') expected = pa.array(["oo", None, "oo bar \t"]) assert expected.equals(result) # Positional option - result = pc.utf8_trim(arr, ' f\u3000') + result = utf8_trim(arr, ' f\u3000') expected = pa.array(["oo", None, "oo bar \t"]) assert expected.equals(result) @@ -675,12 +710,12 @@ def test_slice_compatibility(): for step in [-3, -2, -1, 1, 2, 3]: expected = pa.array([k.as_py()[start:stop:step] for k in arr]) - result = pc.utf8_slice_codeunits( + result = utf8_slice_codeunits( arr, start=start, stop=stop, step=step) assert expected.equals(result) # Positional options - assert pc.utf8_slice_codeunits(arr, - start, stop, step) == result + assert utf8_slice_codeunits(arr, + start, stop, step) == result def test_binary_slice_compatibility(): @@ -693,113 +728,113 @@ def test_binary_slice_compatibility(): continue expected = pa.array([k.as_py()[start:stop:step] for k in arr]) - result = pc.binary_slice( + result = binary_slice( arr, start=start, stop=stop, step=step) assert expected.equals(result) # Positional options - assert pc.binary_slice(arr, start, stop, step) == result + assert binary_slice(arr, start, stop, step) == result # Fixed 
size binary input / output for item in data: fsb_scalar = pa.scalar(item, type=pa.binary(len(item))) expected = item[start:stop:step] - actual = pc.binary_slice(fsb_scalar, start, stop, step) + actual = binary_slice(fsb_scalar, start, stop, step) assert actual.type == pa.binary(len(expected)) assert actual.as_py() == expected def test_split_pattern(): arr = pa.array(["-foo---bar--", "---foo---b"]) - result = pc.split_pattern(arr, pattern="---") + result = split_pattern(arr, pattern="---") expected = pa.array([["-foo", "bar--"], ["", "foo", "b"]]) assert expected.equals(result) - result = pc.split_pattern(arr, "---", max_splits=1) + result = split_pattern(arr, "---", max_splits=1) expected = pa.array([["-foo", "bar--"], ["", "foo---b"]]) assert expected.equals(result) - result = pc.split_pattern(arr, "---", max_splits=1, reverse=True) + result = split_pattern(arr, "---", max_splits=1, reverse=True) expected = pa.array([["-foo", "bar--"], ["---foo", "b"]]) assert expected.equals(result) def test_split_whitespace_utf8(): arr = pa.array(["foo bar", " foo \u3000\tb"]) - result = pc.utf8_split_whitespace(arr) + result = utf8_split_whitespace(arr) expected = pa.array([["foo", "bar"], ["", "foo", "b"]]) assert expected.equals(result) - result = pc.utf8_split_whitespace(arr, max_splits=1) + result = utf8_split_whitespace(arr, max_splits=1) expected = pa.array([["foo", "bar"], ["", "foo \u3000\tb"]]) assert expected.equals(result) - result = pc.utf8_split_whitespace(arr, max_splits=1, reverse=True) + result = utf8_split_whitespace(arr, max_splits=1, reverse=True) expected = pa.array([["foo", "bar"], [" foo", "b"]]) assert expected.equals(result) def test_split_whitespace_ascii(): arr = pa.array(["foo bar", " foo \u3000\tb"]) - result = pc.ascii_split_whitespace(arr) + result = ascii_split_whitespace(arr) expected = pa.array([["foo", "bar"], ["", "foo", "\u3000", "b"]]) assert expected.equals(result) - result = pc.ascii_split_whitespace(arr, max_splits=1) + result = ascii_split_whitespace(arr, max_splits=1) expected = pa.array([["foo", "bar"], ["", "foo \u3000\tb"]]) assert expected.equals(result) - result = pc.ascii_split_whitespace(arr, max_splits=1, reverse=True) + result = ascii_split_whitespace(arr, max_splits=1, reverse=True) expected = pa.array([["foo", "bar"], [" foo \u3000", "b"]]) assert expected.equals(result) def test_split_pattern_regex(): arr = pa.array(["-foo---bar--", "---foo---b"]) - result = pc.split_pattern_regex(arr, pattern="-+") + result = split_pattern_regex(arr, pattern="-+") expected = pa.array([["", "foo", "bar", ""], ["", "foo", "b"]]) assert expected.equals(result) - result = pc.split_pattern_regex(arr, "-+", max_splits=1) + result = split_pattern_regex(arr, "-+", max_splits=1) expected = pa.array([["", "foo---bar--"], ["", "foo---b"]]) assert expected.equals(result) with pytest.raises(NotImplementedError, match="Cannot split in reverse with regex"): - result = pc.split_pattern_regex( + result = split_pattern_regex( arr, pattern="---", max_splits=1, reverse=True) def test_min_max(): # An example generated function wrapper with possible options data = [4, 5, 6, None, 1] - s = pc.min_max(data) + s = min_max(data) assert s.as_py() == {'min': 1, 'max': 6} - s = pc.min_max(data, options=pc.ScalarAggregateOptions()) + s = min_max(data, options=pc.ScalarAggregateOptions()) assert s.as_py() == {'min': 1, 'max': 6} - s = pc.min_max(data, options=pc.ScalarAggregateOptions(skip_nulls=True)) + s = min_max(data, options=pc.ScalarAggregateOptions(skip_nulls=True)) assert s.as_py() == 
{'min': 1, 'max': 6} - s = pc.min_max(data, options=pc.ScalarAggregateOptions(skip_nulls=False)) + s = min_max(data, options=pc.ScalarAggregateOptions(skip_nulls=False)) assert s.as_py() == {'min': None, 'max': None} # Options as dict of kwargs - s = pc.min_max(data, options={'skip_nulls': False}) + s = min_max(data, options={'skip_nulls': False}) assert s.as_py() == {'min': None, 'max': None} # Options as named functions arguments - s = pc.min_max(data, skip_nulls=False) + s = min_max(data, skip_nulls=False) assert s.as_py() == {'min': None, 'max': None} # Both options and named arguments with pytest.raises(TypeError): - s = pc.min_max( + s = min_max( data, options=pc.ScalarAggregateOptions(), skip_nulls=False) # Wrong options type options = pc.TakeOptions() with pytest.raises(TypeError): - s = pc.min_max(data, options=options) + s = min_max(data, options=options) # Missing argument with pytest.raises(TypeError, match="min_max takes 1 positional"): - s = pc.min_max() + s = min_max() def test_any(): @@ -808,17 +843,17 @@ def test_any(): options = pc.ScalarAggregateOptions(skip_nulls=False, min_count=0) a = pa.array([], type='bool') - assert pc.any(a).as_py() is None - assert pc.any(a, min_count=0).as_py() is False - assert pc.any(a, options=options).as_py() is False + assert pc_any(a).as_py() is None + assert pc_any(a, min_count=0).as_py() is False + assert pc_any(a, options=options).as_py() is False a = pa.array([False, None, True]) - assert pc.any(a).as_py() is True - assert pc.any(a, options=options).as_py() is True + assert pc_any(a).as_py() is True + assert pc_any(a, options=options).as_py() is True a = pa.array([False, None, False]) - assert pc.any(a).as_py() is False - assert pc.any(a, options=options).as_py() is None + assert pc_any(a).as_py() is False + assert pc_any(a, options=options).as_py() is None def test_all(): @@ -827,39 +862,39 @@ def test_all(): options = pc.ScalarAggregateOptions(skip_nulls=False, min_count=0) a = pa.array([], type='bool') - assert pc.all(a).as_py() is None - assert pc.all(a, min_count=0).as_py() is True - assert pc.all(a, options=options).as_py() is True + assert pc_all(a).as_py() is None + assert pc_all(a, min_count=0).as_py() is True + assert pc_all(a, options=options).as_py() is True a = pa.array([False, True]) - assert pc.all(a).as_py() is False - assert pc.all(a, options=options).as_py() is False + assert pc_all(a).as_py() is False + assert pc_all(a, options=options).as_py() is False a = pa.array([True, None]) - assert pc.all(a).as_py() is True - assert pc.all(a, options=options).as_py() is None + assert pc_all(a).as_py() is True + assert pc_all(a, options=options).as_py() is None a = pa.chunked_array([[True], [True, None]]) - assert pc.all(a).as_py() is True - assert pc.all(a, options=options).as_py() is None + assert pc_all(a).as_py() is True + assert pc_all(a, options=options).as_py() is None a = pa.chunked_array([[True], [False]]) - assert pc.all(a).as_py() is False - assert pc.all(a, options=options).as_py() is False + assert pc_all(a).as_py() is False + assert pc_all(a, options=options).as_py() is False def test_is_valid(): # An example generated function wrapper without options data = [4, 5, None] - assert pc.is_valid(data).to_pylist() == [True, True, False] + assert is_valid(data).to_pylist() == [True, True, False] with pytest.raises(TypeError): - pc.is_valid(data, options=None) + is_valid(data, options=None) def test_generated_docstrings(): # With options - assert pc.min_max.__doc__ == textwrap.dedent("""\ + assert min_max.__doc__ == 
textwrap.dedent("""\ Compute the minimum and maximum values of a numeric array. Null values are ignored by default. @@ -881,7 +916,7 @@ def test_generated_docstrings(): If not passed, will allocate memory from the default memory pool. """) # Without options - assert pc.add.__doc__ == textwrap.dedent("""\ + assert pc_add.__doc__ == textwrap.dedent("""\ Add the arguments element-wise. Results will wrap around on integer overflow. @@ -898,7 +933,7 @@ def test_generated_docstrings(): If not passed, will allocate memory from the default memory pool. """) # Varargs with options - assert pc.min_element_wise.__doc__ == textwrap.dedent("""\ + assert min_element_wise.__doc__ == textwrap.dedent("""\ Find the element-wise minimum value. Nulls are ignored (by default) or propagated. @@ -916,7 +951,7 @@ def test_generated_docstrings(): memory_pool : pyarrow.MemoryPool, optional If not passed, will allocate memory from the default memory pool. """) - assert pc.filter.__doc__ == textwrap.dedent("""\ + assert filter.__doc__ == textwrap.dedent("""\ Filter with a boolean selection filter. The output is populated with values from the input at positions @@ -963,24 +998,24 @@ def test_generated_signatures(): # options and their default values. # Without options - sig = inspect.signature(pc.add) + sig = inspect.signature(pc_add) assert str(sig) == "(x, y, /, *, memory_pool=None)" # With options - sig = inspect.signature(pc.min_max) + sig = inspect.signature(min_max) assert str(sig) == ("(array, /, *, skip_nulls=True, min_count=1, " "options=None, memory_pool=None)") # With positional options - sig = inspect.signature(pc.quantile) + sig = inspect.signature(quantile) assert str(sig) == ("(array, /, q=0.5, *, interpolation='linear', " "skip_nulls=True, min_count=0, " "options=None, memory_pool=None)") # Varargs with options - sig = inspect.signature(pc.binary_join_element_wise) + sig = inspect.signature(binary_join_element_wise) assert str(sig) == ("(*strings, null_handling='emit_null', " "null_replacement='', options=None, " "memory_pool=None)") # Varargs without options - sig = inspect.signature(pc.choose) + sig = inspect.signature(choose) assert str(sig) == "(indices, /, *values, memory_pool=None)" # Nullary with options sig = inspect.signature(pc.random) @@ -997,7 +1032,7 @@ def find_new_unicode_codepoints(): new = set() characters = [chr(c) for c in range(0x80, 0x11000) if not (0xD800 <= c < 0xE000)] - is_printable = pc.utf8_is_printable(pa.array(characters)).to_pylist() + is_printable = utf8_is_printable(pa.array(characters)).to_pylist() for i, c in enumerate(characters): if is_printable[i] != c.isprintable(): new.add(ord(c)) @@ -1117,20 +1152,20 @@ def test_string_py_compat_boolean(function_name, variant): def test_pad(): arr = pa.array([None, 'a', 'abcd']) - assert pc.ascii_center(arr, width=3).tolist() == [None, ' a ', 'abcd'] - assert pc.ascii_lpad(arr, width=3).tolist() == [None, ' a', 'abcd'] - assert pc.ascii_rpad(arr, width=3).tolist() == [None, 'a ', 'abcd'] - assert pc.ascii_center(arr, 3).tolist() == [None, ' a ', 'abcd'] - assert pc.ascii_lpad(arr, 3).tolist() == [None, ' a', 'abcd'] - assert pc.ascii_rpad(arr, 3).tolist() == [None, 'a ', 'abcd'] + assert ascii_center(arr, width=3).tolist() == [None, ' a ', 'abcd'] + assert ascii_lpad(arr, width=3).tolist() == [None, ' a', 'abcd'] + assert ascii_rpad(arr, width=3).tolist() == [None, 'a ', 'abcd'] + assert ascii_center(arr, 3).tolist() == [None, ' a ', 'abcd'] + assert ascii_lpad(arr, 3).tolist() == [None, ' a', 'abcd'] + assert ascii_rpad(arr, 
3).tolist() == [None, 'a ', 'abcd'] arr = pa.array([None, 'á', 'abcd']) - assert pc.utf8_center(arr, width=3).tolist() == [None, ' á ', 'abcd'] - assert pc.utf8_lpad(arr, width=3).tolist() == [None, ' á', 'abcd'] - assert pc.utf8_rpad(arr, width=3).tolist() == [None, 'á ', 'abcd'] - assert pc.utf8_center(arr, 3).tolist() == [None, ' á ', 'abcd'] - assert pc.utf8_lpad(arr, 3).tolist() == [None, ' á', 'abcd'] - assert pc.utf8_rpad(arr, 3).tolist() == [None, 'á ', 'abcd'] + assert utf8_center(arr, width=3).tolist() == [None, ' á ', 'abcd'] + assert utf8_lpad(arr, width=3).tolist() == [None, ' á', 'abcd'] + assert utf8_rpad(arr, width=3).tolist() == [None, 'á ', 'abcd'] + assert utf8_center(arr, 3).tolist() == [None, ' á ', 'abcd'] + assert utf8_lpad(arr, 3).tolist() == [None, ' á', 'abcd'] + assert utf8_rpad(arr, 3).tolist() == [None, 'á ', 'abcd'] def test_utf8_zfill(): @@ -1173,53 +1208,53 @@ def test_replace_slice(): for start in offsets: for stop in offsets: expected = series.str.slice_replace(start, stop, 'XX') - actual = pc.binary_replace_slice( + actual = binary_replace_slice( arr, start=start, stop=stop, replacement='XX') assert actual.tolist() == expected.tolist() # Positional options - assert pc.binary_replace_slice(arr, start, stop, 'XX') == actual + assert binary_replace_slice(arr, start, stop, 'XX') == actual arr = pa.array([None, '', 'π', 'πb', 'πbθ', 'πbθd', 'πbθde']) series = arr.to_pandas().astype(object).replace({np.nan: None}) for start in offsets: for stop in offsets: expected = series.str.slice_replace(start, stop, 'XX') - actual = pc.utf8_replace_slice( + actual = utf8_replace_slice( arr, start=start, stop=stop, replacement='XX') assert actual.tolist() == expected.tolist() def test_replace_plain(): data = pa.array(['foozfoo', 'food', None]) - ar = pc.replace_substring(data, pattern='foo', replacement='bar') + ar = replace_substring(data, pattern='foo', replacement='bar') assert ar.tolist() == ['barzbar', 'bard', None] - ar = pc.replace_substring(data, 'foo', 'bar') + ar = replace_substring(data, 'foo', 'bar') assert ar.tolist() == ['barzbar', 'bard', None] - ar = pc.replace_substring(data, pattern='foo', replacement='bar', - max_replacements=1) + ar = replace_substring(data, pattern='foo', replacement='bar', + max_replacements=1) assert ar.tolist() == ['barzfoo', 'bard', None] - ar = pc.replace_substring(data, 'foo', 'bar', max_replacements=1) + ar = replace_substring(data, 'foo', 'bar', max_replacements=1) assert ar.tolist() == ['barzfoo', 'bard', None] def test_replace_regex(): data = pa.array(['foo', 'mood', None]) expected = ['f00', 'm00d', None] - ar = pc.replace_substring_regex(data, pattern='(.)oo', replacement=r'\100') + ar = replace_substring_regex(data, pattern='(.)oo', replacement=r'\100') assert ar.tolist() == expected - ar = pc.replace_substring_regex(data, '(.)oo', replacement=r'\100') + ar = replace_substring_regex(data, '(.)oo', replacement=r'\100') assert ar.tolist() == expected - ar = pc.replace_substring_regex(data, '(.)oo', r'\100') + ar = replace_substring_regex(data, '(.)oo', r'\100') assert ar.tolist() == expected def test_extract_regex(): ar = pa.array(['a1', 'zb2z']) expected = [{'letter': 'a', 'digit': '1'}, {'letter': 'b', 'digit': '2'}] - struct = pc.extract_regex(ar, pattern=r'(?P[ab])(?P\d)') + struct = extract_regex(ar, pattern=r'(?P[ab])(?P\d)') assert struct.tolist() == expected - struct = pc.extract_regex(ar, r'(?P[ab])(?P\d)') + struct = extract_regex(ar, r'(?P[ab])(?P\d)') assert struct.tolist() == expected @@ -1227,50 +1262,50 @@ def 
test_extract_regex_span(): ar = pa.array(['a1', 'zb234z']) expected = [{'letter': [0, 1], 'digit': [1, 1]}, {'letter': [1, 1], 'digit': [2, 3]}] - struct = pc.extract_regex_span(ar, pattern=r'(?P[ab])(?P\d+)') + struct = extract_regex_span(ar, pattern=r'(?P[ab])(?P\d+)') assert struct.tolist() == expected - struct = pc.extract_regex_span(ar, r'(?P[ab])(?P\d+)') + struct = extract_regex_span(ar, r'(?P[ab])(?P\d+)') assert struct.tolist() == expected def test_binary_join(): ar_list = pa.array([['foo', 'bar'], None, []]) expected = pa.array(['foo-bar', None, '']) - assert pc.binary_join(ar_list, '-').equals(expected) + assert binary_join(ar_list, '-').equals(expected) separator_array = pa.array(['1', '2'], type=pa.binary()) expected = pa.array(['a1b', 'c2d'], type=pa.binary()) ar_list = pa.array([['a', 'b'], ['c', 'd']], type=pa.list_(pa.binary())) - assert pc.binary_join(ar_list, separator_array).equals(expected) + assert binary_join(ar_list, separator_array).equals(expected) def test_binary_join_element_wise(): null = pa.scalar(None, type=pa.string()) arrs = [[None, 'a', 'b'], ['c', None, 'd'], [None, '-', '--']] - assert pc.binary_join_element_wise(*arrs).to_pylist() == \ + assert binary_join_element_wise(*arrs).to_pylist() == \ [None, None, 'b--d'] - assert pc.binary_join_element_wise('a', 'b', '-').as_py() == 'a-b' - assert pc.binary_join_element_wise('a', null, '-').as_py() is None - assert pc.binary_join_element_wise('a', 'b', null).as_py() is None + assert binary_join_element_wise('a', 'b', '-').as_py() == 'a-b' + assert binary_join_element_wise('a', null, '-').as_py() is None + assert binary_join_element_wise('a', 'b', null).as_py() is None skip = pc.JoinOptions(null_handling='skip') - assert pc.binary_join_element_wise(*arrs, options=skip).to_pylist() == \ + assert binary_join_element_wise(*arrs, options=skip).to_pylist() == \ [None, 'a', 'b--d'] - assert pc.binary_join_element_wise( + assert binary_join_element_wise( 'a', 'b', '-', options=skip).as_py() == 'a-b' - assert pc.binary_join_element_wise( + assert binary_join_element_wise( 'a', null, '-', options=skip).as_py() == 'a' - assert pc.binary_join_element_wise( + assert binary_join_element_wise( 'a', 'b', null, options=skip).as_py() is None replace = pc.JoinOptions(null_handling='replace', null_replacement='spam') - assert pc.binary_join_element_wise(*arrs, options=replace).to_pylist() == \ + assert binary_join_element_wise(*arrs, options=replace).to_pylist() == \ [None, 'a-spam', 'b--d'] - assert pc.binary_join_element_wise( + assert binary_join_element_wise( 'a', 'b', '-', options=replace).as_py() == 'a-b' - assert pc.binary_join_element_wise( + assert binary_join_element_wise( 'a', null, '-', options=replace).as_py() == 'a-spam' - assert pc.binary_join_element_wise( + assert binary_join_element_wise( 'a', 'b', null, options=replace).as_py() is None @@ -1598,22 +1633,22 @@ def con(values): arr1 = con([1, 2, 3, 4, None]) arr2 = con([1, 1, 4, None, 4]) - result = pc.equal(arr1, arr2) + result = equal(arr1, arr2) assert result.equals(con([True, False, False, None, None])) - result = pc.not_equal(arr1, arr2) + result = not_equal(arr1, arr2) assert result.equals(con([False, True, True, None, None])) - result = pc.less(arr1, arr2) + result = less(arr1, arr2) assert result.equals(con([False, False, True, None, None])) - result = pc.less_equal(arr1, arr2) + result = less_equal(arr1, arr2) assert result.equals(con([True, False, True, None, None])) - result = pc.greater(arr1, arr2) + result = greater(arr1, arr2) assert 
result.equals(con([False, True, False, None, None])) - result = pc.greater_equal(arr1, arr2) + result = greater_equal(arr1, arr2) assert result.equals(con([True, True, False, None, None])) @@ -1629,28 +1664,28 @@ def con(values): arr = con(['a', 'b', 'c', None]) scalar = pa.scalar('b') - result = pc.equal(arr, scalar) + result = equal(arr, scalar) assert result.equals(con([False, True, False, None])) if typ == "array": nascalar = pa.scalar(None, type="string") - result = pc.equal(arr, nascalar) - isnull = pc.is_null(result) + result = equal(arr, nascalar) + isnull = is_null(result) assert isnull.equals(con([True, True, True, True])) - result = pc.not_equal(arr, scalar) + result = not_equal(arr, scalar) assert result.equals(con([True, False, True, None])) - result = pc.less(arr, scalar) + result = less(arr, scalar) assert result.equals(con([True, False, False, None])) - result = pc.less_equal(arr, scalar) + result = less_equal(arr, scalar) assert result.equals(con([True, True, False, None])) - result = pc.greater(arr, scalar) + result = greater(arr, scalar) assert result.equals(con([False, False, True, None])) - result = pc.greater_equal(arr, scalar) + result = greater_equal(arr, scalar) assert result.equals(con([False, True, True, None])) @@ -1666,27 +1701,27 @@ def con(values): arr = con([1, 2, 3, None]) scalar = pa.scalar(2) - result = pc.equal(arr, scalar) + result = equal(arr, scalar) assert result.equals(con([False, True, False, None])) if typ == "array": nascalar = pa.scalar(None, type="int64") - result = pc.equal(arr, nascalar) + result = equal(arr, nascalar) assert result.to_pylist() == [None, None, None, None] - result = pc.not_equal(arr, scalar) + result = not_equal(arr, scalar) assert result.equals(con([True, False, True, None])) - result = pc.less(arr, scalar) + result = less(arr, scalar) assert result.equals(con([True, False, False, None])) - result = pc.less_equal(arr, scalar) + result = less_equal(arr, scalar) assert result.equals(con([True, True, False, None])) - result = pc.greater(arr, scalar) + result = greater(arr, scalar) assert result.equals(con([False, False, True, None])) - result = pc.greater_equal(arr, scalar) + result = greater_equal(arr, scalar) assert result.equals(con([False, True, True, None])) @@ -1702,14 +1737,14 @@ def test_compare_chunked_array_mixed(): (arr_chunked, arr), (arr_chunked, arr_chunked2), ]: - result = pc.equal(left, right) + result = equal(left, right) assert result.equals(expected) def test_arithmetic_add(): left = pa.array([1, 2, 3, 4, 5]) right = pa.array([0, -1, 1, 2, 3]) - result = pc.add(left, right) + result = pc_add(left, right) expected = pa.array([1, 1, 4, 6, 8]) assert result.equals(expected) @@ -1717,7 +1752,7 @@ def test_arithmetic_add(): def test_arithmetic_subtract(): left = pa.array([1, 2, 3, 4, 5]) right = pa.array([0, -1, 1, 2, 3]) - result = pc.subtract(left, right) + result = subtract(left, right) expected = pa.array([1, 3, 2, 2, 2]) assert result.equals(expected) @@ -1725,7 +1760,7 @@ def test_arithmetic_subtract(): def test_arithmetic_multiply(): left = pa.array([1, 2, 3, 4, 5]) right = pa.array([0, -1, 1, 2, 3]) - result = pc.multiply(left, right) + result = multiply(left, right) expected = pa.array([0, -2, 3, 8, 15]) assert result.equals(expected) @@ -1733,10 +1768,10 @@ def test_arithmetic_multiply(): @pytest.mark.parametrize("ty", ["round", "round_to_multiple"]) def test_round_to_integer(ty): if ty == "round": - round_func = pc.round + round_func = pc_round RoundOptions = partial(pc.RoundOptions, ndigits=0) elif ty 
== "round_to_multiple": - round_func = pc.round_to_multiple + round_func = round_to_multiple RoundOptions = partial(pc.RoundToMultipleOptions, multiple=1) values = [3.2, 3.5, 3.7, 4.5, -3.2, -3.5, -3.7, None] @@ -1771,11 +1806,11 @@ def test_round(): } for ndigits, expected in ndigits_and_expected.items(): options = pc.RoundOptions(ndigits, "half_towards_infinity") - result = pc.round(values, options=options) + result = pc_round(values, options=options) np.testing.assert_allclose(result, pa.array(expected), equal_nan=True) - assert pc.round(values, ndigits, + assert pc_round(values, ndigits, round_mode="half_towards_infinity") == result - assert pc.round(values, ndigits, "half_towards_infinity") == result + assert pc_round(values, ndigits, "half_towards_infinity") == result @pytest.mark.numpy @@ -1791,19 +1826,19 @@ def test_round_to_multiple(): } for multiple, expected in multiple_and_expected.items(): options = pc.RoundToMultipleOptions(multiple, "half_towards_infinity") - result = pc.round_to_multiple(values, options=options) + result = round_to_multiple(values, options=options) np.testing.assert_allclose(result, pa.array(expected), equal_nan=True) - assert pc.round_to_multiple(values, multiple, - "half_towards_infinity") == result + assert round_to_multiple(values, multiple, + "half_towards_infinity") == result for multiple in [0, -2, pa.scalar(-10.4)]: with pytest.raises(pa.ArrowInvalid, match="Rounding multiple must be positive"): - pc.round_to_multiple(values, multiple=multiple) + round_to_multiple(values, multiple=multiple) for multiple in [object, 99999999999999999999999]: with pytest.raises(TypeError, match="is not a valid multiple type"): - pc.round_to_multiple(values, multiple=multiple) + round_to_multiple(values, multiple=multiple) def test_round_binary(): @@ -1811,15 +1846,15 @@ def test_round_binary(): scales = pa.array([-3, -2, -1, 0, 1, 2, 3], pa.int32()) expected = pa.array( [0, 200, 350, 457, 123.5, 234.57, 345.678], pa.float64()) - assert pc.round_binary(values, scales) == expected + assert round_binary(values, scales) == expected expect_zero = pa.scalar(0, pa.float64()) expect_inf = pa.scalar(10, pa.float64()) scale = pa.scalar(-1, pa.int32()) - assert pc.round_binary( + assert round_binary( 5.0, scale, round_mode="half_towards_zero") == expect_zero - assert pc.round_binary( + assert round_binary( 5.0, scale, round_mode="half_towards_infinity") == expect_inf @@ -1828,11 +1863,11 @@ def test_is_null(): result = arr.is_null() expected = pa.array([False, False, False, True]) assert result.equals(expected) - assert result.equals(pc.is_null(arr)) + assert result.equals(is_null(arr)) result = arr.is_valid() expected = pa.array([True, True, True, False]) assert result.equals(expected) - assert result.equals(pc.is_valid(arr)) + assert result.equals(is_valid(arr)) arr = pa.chunked_array([[1, 2], [3, None]]) result = arr.is_null() @@ -1952,27 +1987,27 @@ def test_logical(): a = pa.array([True, False, False, None]) b = pa.array([True, True, False, True]) - assert pc.and_(a, b) == pa.array([True, False, False, None]) - assert pc.and_kleene(a, b) == pa.array([True, False, False, None]) + assert and_(a, b) == pa.array([True, False, False, None]) + assert and_kleene(a, b) == pa.array([True, False, False, None]) - assert pc.or_(a, b) == pa.array([True, True, False, None]) - assert pc.or_kleene(a, b) == pa.array([True, True, False, True]) + assert or_(a, b) == pa.array([True, True, False, None]) + assert or_kleene(a, b) == pa.array([True, True, False, True]) - assert pc.xor(a, b) == 
pa.array([False, True, False, None]) + assert xor(a, b) == pa.array([False, True, False, None]) - assert pc.invert(a) == pa.array([False, True, True, None]) + assert invert(a) == pa.array([False, True, True, None]) def test_dictionary_decode(): array = pa.array(["a", "a", "b", "c", "b"]) dictionary_array = array.dictionary_encode() - dictionary_array_decode = pc.dictionary_decode(dictionary_array) + dictionary_array_decode = dictionary_decode(dictionary_array) assert array != dictionary_array assert array == dictionary_array_decode - assert array == pc.dictionary_decode(array) - assert pc.dictionary_encode(dictionary_array) == dictionary_array + assert array == dictionary_decode(array) + assert dictionary_encode(dictionary_array) == dictionary_array def test_cast(): @@ -2049,7 +2084,7 @@ def test_fsl_to_fsl_cast(value_type): # Different sized FSL cast_type = pa.list_(pa.field("element", value_type), 3) err_msg = 'Size of FixedSizeList is not the same.' - with pytest.raises(pa.lib.ArrowTypeError, match=err_msg): + with pytest.raises(lib.ArrowTypeError, match=err_msg): fsl.cast(cast_type) @@ -2247,28 +2282,28 @@ def test_cast_float_to_decimal_random(float_ty, decimal_traits): def test_strptime(): arr = pa.array(["5/1/2020", None, "12/13/1900"]) - got = pc.strptime(arr, format='%m/%d/%Y', unit='s') + got = strptime(arr, format='%m/%d/%Y', unit='s') expected = pa.array( [datetime.datetime(2020, 5, 1), None, datetime.datetime(1900, 12, 13)], type=pa.timestamp('s')) assert got == expected # Positional format - assert pc.strptime(arr, '%m/%d/%Y', unit='s') == got + assert strptime(arr, '%m/%d/%Y', unit='s') == got expected = pa.array([datetime.datetime(2020, 1, 5), None, None], type=pa.timestamp('s')) - got = pc.strptime(arr, format='%d/%m/%Y', unit='s', error_is_null=True) + got = strptime(arr, format='%d/%m/%Y', unit='s', error_is_null=True) assert got == expected with pytest.raises(pa.ArrowInvalid, match="Failed to parse string: '5/1/2020'"): - pc.strptime(arr, format='%Y-%m-%d', unit='s', error_is_null=False) + strptime(arr, format='%Y-%m-%d', unit='s', error_is_null=False) with pytest.raises(pa.ArrowInvalid, match="Failed to parse string: '5/1/2020'"): - pc.strptime(arr, format='%Y-%m-%d', unit='s') + strptime(arr, format='%Y-%m-%d', unit='s') - got = pc.strptime(arr, format='%Y-%m-%d', unit='s', error_is_null=True) + got = strptime(arr, format='%Y-%m-%d', unit='s', error_is_null=True) assert got == pa.array([None, None, None], type=pa.timestamp('s')) @@ -2290,7 +2325,7 @@ def test_strftime(): tsa = pa.array(ts, type=pa.timestamp(unit, timezone)) for fmt in formats: options = pc.StrftimeOptions(fmt) - result = pc.strftime(tsa, options=options) + result = strftime(tsa, options=options) # cast to the same type as result to ignore string vs large_string expected = pa.array(ts.strftime(fmt)).cast(result.type) assert result.equals(expected) @@ -2299,34 +2334,34 @@ def test_strftime(): # Default format tsa = pa.array(ts, type=pa.timestamp("s", timezone)) - result = pc.strftime(tsa, options=pc.StrftimeOptions()) + result = strftime(tsa, options=pc.StrftimeOptions()) expected = pa.array(ts.strftime(fmt)).cast(result.type) assert result.equals(expected) # Default format plus timezone tsa = pa.array(ts, type=pa.timestamp("s", timezone)) - result = pc.strftime(tsa, options=pc.StrftimeOptions(fmt + "%Z")) + result = strftime(tsa, options=pc.StrftimeOptions(fmt + "%Z")) expected = pa.array(ts.strftime(fmt + "%Z")).cast(result.type) assert result.equals(expected) # Pandas %S is equivalent to %S in arrow 
for unit="s" tsa = pa.array(ts, type=pa.timestamp("s", timezone)) options = pc.StrftimeOptions("%S") - result = pc.strftime(tsa, options=options) + result = strftime(tsa, options=options) expected = pa.array(ts.strftime("%S")).cast(result.type) assert result.equals(expected) # Pandas %S.%f is equivalent to %S in arrow for unit="us" tsa = pa.array(ts, type=pa.timestamp("us", timezone)) options = pc.StrftimeOptions("%S") - result = pc.strftime(tsa, options=options) + result = strftime(tsa, options=options) expected = pa.array(ts.strftime("%S.%f")).cast(result.type) assert result.equals(expected) # Test setting locale tsa = pa.array(ts, type=pa.timestamp("s", timezone)) options = pc.StrftimeOptions(fmt, locale="C") - result = pc.strftime(tsa, options=options) + result = strftime(tsa, options=options) expected = pa.array(ts.strftime(fmt)).cast(result.type) assert result.equals(expected) @@ -2334,19 +2369,19 @@ def test_strftime(): fmt = "%Y-%m-%dT%H:%M:%S" ts = pd.to_datetime(times) tsa = pa.array(ts, type=pa.timestamp("s")) - result = pc.strftime(tsa, options=pc.StrftimeOptions(fmt)) + result = strftime(tsa, options=pc.StrftimeOptions(fmt)) expected = pa.array(ts.strftime(fmt)).cast(result.type) # Positional format - assert pc.strftime(tsa, fmt) == result + assert strftime(tsa, fmt) == result assert result.equals(expected) with pytest.raises(pa.ArrowInvalid, match="Timezone not present, cannot convert to string"): - pc.strftime(tsa, options=pc.StrftimeOptions(fmt + "%Z")) + strftime(tsa, options=pc.StrftimeOptions(fmt + "%Z")) with pytest.raises(pa.ArrowInvalid, match="Timezone not present, cannot convert to string"): - pc.strftime(tsa, options=pc.StrftimeOptions(fmt + "%z")) + strftime(tsa, options=pc.StrftimeOptions(fmt + "%z")) def _check_datetime_components(timestamps, timezone=None): @@ -2394,42 +2429,42 @@ def _check_datetime_components(timestamps, timezone=None): microsecond = ts.dt.microsecond.astype("int64") nanosecond = ts.dt.nanosecond.astype("int64") - assert pc.year(tsa).equals(pa.array(year)) - assert pc.is_leap_year(tsa).equals(pa.array(ts.dt.is_leap_year)) - assert pc.month(tsa).equals(pa.array(month)) - assert pc.day(tsa).equals(pa.array(day)) - assert pc.day_of_week(tsa).equals(pa.array(dayofweek)) - assert pc.day_of_year(tsa).equals(pa.array(dayofyear)) - assert pc.iso_year(tsa).equals(pa.array(iso_year)) - assert pc.iso_week(tsa).equals(pa.array(iso_week)) - assert pc.iso_calendar(tsa).equals(iso_calendar) - assert pc.quarter(tsa).equals(pa.array(quarter)) - assert pc.hour(tsa).equals(pa.array(hour)) - assert pc.minute(tsa).equals(pa.array(minute)) - assert pc.second(tsa).equals(pa.array(second)) - assert pc.millisecond(tsa).equals(pa.array(microsecond // 10 ** 3)) - assert pc.microsecond(tsa).equals(pa.array(microsecond % 10 ** 3)) - assert pc.nanosecond(tsa).equals(pa.array(nanosecond)) - assert pc.subsecond(tsa).equals(pa.array(subseconds)) - assert pc.local_timestamp(tsa).equals(pa.array(ts.dt.tz_localize(None))) + assert pc_year(tsa).equals(pa.array(year)) + assert pc_is_leap_year(tsa).equals(pa.array(ts.dt.is_leap_year)) + assert pc_month(tsa).equals(pa.array(month)) + assert pc_day(tsa).equals(pa.array(day)) + assert pc_day_of_week(tsa).equals(pa.array(dayofweek)) + assert pc_day_of_year(tsa).equals(pa.array(dayofyear)) + assert pc_iso_year(tsa).equals(pa.array(iso_year)) + assert pc_iso_week(tsa).equals(pa.array(iso_week)) + assert pc_iso_calendar(tsa).equals(iso_calendar) + assert pc_quarter(tsa).equals(pa.array(quarter)) + assert 
pc_hour(tsa).equals(pa.array(hour)) + assert pc_minute(tsa).equals(pa.array(minute)) + assert pc_second(tsa).equals(pa.array(second)) + assert pc_millisecond(tsa).equals(pa.array(microsecond // 10 ** 3)) + assert pc_microsecond(tsa).equals(pa.array(microsecond % 10 ** 3)) + assert pc_nanosecond(tsa).equals(pa.array(nanosecond)) + assert pc_subsecond(tsa).equals(pa.array(subseconds)) + assert pc_local_timestamp(tsa).equals(pa.array(ts.dt.tz_localize(None))) if ts.dt.tz: if ts.dt.tz is datetime.timezone.utc: # datetime with utc returns None for dst() - is_dst = [False] * len(ts) + arr_is_dst = [False] * len(ts) else: - is_dst = ts.apply(lambda x: x.dst().seconds > 0) - assert pc.is_dst(tsa).equals(pa.array(is_dst)) + arr_is_dst = ts.apply(lambda x: x.dst().seconds > 0) + assert pc_is_dst(tsa).equals(pa.array(arr_is_dst)) day_of_week_options = pc.DayOfWeekOptions( count_from_zero=False, week_start=1) - assert pc.day_of_week(tsa, options=day_of_week_options).equals( + assert pc_day_of_week(tsa, options=day_of_week_options).equals( pa.array(dayofweek + 1)) week_options = pc.WeekOptions( week_starts_monday=True, count_from_zero=False, first_week_is_fully_in_year=False) - assert pc.week(tsa, options=week_options).equals(pa.array(iso_week)) + assert pc_week(tsa, options=week_options).equals(pa.array(iso_week)) @pytest.mark.pandas @@ -2468,7 +2503,7 @@ def test_iso_calendar_longer_array(unit): # https://github.com/apache/arrow/issues/38655 # ensure correct result for array length > 32 arr = pa.array([datetime.datetime(2022, 1, 2, 9)]*50, pa.timestamp(unit)) - result = pc.iso_calendar(arr) + result = pc_iso_calendar(arr) expected = pa.StructArray.from_arrays( [[2021]*50, [52]*50, [7]*50], names=['iso_year', 'iso_week', 'iso_day_of_week'] @@ -2507,18 +2542,18 @@ def test_assume_timezone(): options = pc.AssumeTimezoneOptions(timezone) ta = pa.array(timestamps, type=ts_type) expected = timestamps.tz_localize(timezone) - result = pc.assume_timezone(ta, options=options) + result = pc_assume_timezone(ta, options=options) assert result.equals(pa.array(expected)) - result = pc.assume_timezone(ta, timezone) # Positional option + result = pc_assume_timezone(ta, timezone) # Positional option assert result.equals(pa.array(expected)) ta_zoned = pa.array(timestamps, type=pa.timestamp("ns", timezone)) with pytest.raises(pa.ArrowInvalid, match="already have a timezone:"): - pc.assume_timezone(ta_zoned, options=options) + pc_assume_timezone(ta_zoned, options=options) invalid_options = pc.AssumeTimezoneOptions("Europe/Brusselsss") with pytest.raises(ValueError, match="not found in timezone database"): - pc.assume_timezone(ta, options=invalid_options) + pc_assume_timezone(ta, options=invalid_options) timezone = "Europe/Brussels" @@ -2531,18 +2566,18 @@ def test_assume_timezone(): with pytest.raises(ValueError, match="Timestamp doesn't exist in " f"timezone '{timezone}'"): - pc.assume_timezone(nonexistent_array, + pc_assume_timezone(nonexistent_array, options=options_nonexistent_raise) expected = pa.array(nonexistent.tz_localize( timezone, nonexistent="shift_forward")) - result = pc.assume_timezone( + result = pc_assume_timezone( nonexistent_array, options=options_nonexistent_latest) expected.equals(result) expected = pa.array(nonexistent.tz_localize( timezone, nonexistent="shift_backward")) - result = pc.assume_timezone( + result = pc_assume_timezone( nonexistent_array, options=options_nonexistent_earliest) expected.equals(result) @@ -2555,16 +2590,16 @@ def test_assume_timezone(): with pytest.raises(ValueError, 
match="Timestamp is ambiguous in " f"timezone '{timezone}'"): - pc.assume_timezone(ambiguous_array, options=options_ambiguous_raise) + pc_assume_timezone(ambiguous_array, options=options_ambiguous_raise) expected = ambiguous.tz_localize(timezone, ambiguous=np.array([True, True, True])) - result = pc.assume_timezone( + result = pc_assume_timezone( ambiguous_array, options=options_ambiguous_earliest) result.equals(pa.array(expected)) expected = ambiguous.tz_localize( timezone, ambiguous=np.array([False, False, False])) - result = pc.assume_timezone( + result = pc_assume_timezone( ambiguous_array, options=options_ambiguous_latest) result.equals(pa.array(expected)) @@ -2593,15 +2628,15 @@ def _check_temporal_rounding(ts, values, unit): frequency = str(value) + unit_shorthand[unit] options = pc.RoundTemporalOptions(value, unit) - result = pc.ceil_temporal(ta, options=options).to_pandas() + result = ceil_temporal(ta, options=options).to_pandas() expected = ts.dt.ceil(frequency) np.testing.assert_array_equal(result, expected) - result = pc.floor_temporal(ta, options=options).to_pandas() + result = floor_temporal(ta, options=options).to_pandas() expected = ts.dt.floor(frequency) np.testing.assert_array_equal(result, expected) - result = pc.round_temporal(ta, options=options).to_pandas() + result = round_temporal(ta, options=options).to_pandas() expected = ts.dt.round(frequency) np.testing.assert_array_equal(result, expected) @@ -2614,29 +2649,29 @@ def _check_temporal_rounding(ts, values, unit): origin = ts.dt.floor(greater_unit[unit]) if ta.type.tz is None: - result = pc.ceil_temporal(ta, options=options).to_pandas() + result = ceil_temporal(ta, options=options).to_pandas() expected = (ts - origin).dt.ceil(frequency) + origin np.testing.assert_array_equal(result, expected) - result = pc.floor_temporal(ta, options=options).to_pandas() + result = floor_temporal(ta, options=options).to_pandas() expected = (ts - origin).dt.floor(frequency) + origin np.testing.assert_array_equal(result, expected) - result = pc.round_temporal(ta, options=options).to_pandas() + result = round_temporal(ta, options=options).to_pandas() expected = (ts - origin).dt.round(frequency) + origin np.testing.assert_array_equal(result, expected) # Check RoundTemporalOptions partial defaults if unit == "day": - result = pc.ceil_temporal(ta, multiple=value).to_pandas() + result = ceil_temporal(ta, multiple=value).to_pandas() expected = ts.dt.ceil(frequency) np.testing.assert_array_equal(result, expected) - result = pc.floor_temporal(ta, multiple=value).to_pandas() + result = floor_temporal(ta, multiple=value).to_pandas() expected = ts.dt.floor(frequency) np.testing.assert_array_equal(result, expected) - result = pc.round_temporal(ta, multiple=value).to_pandas() + result = round_temporal(ta, multiple=value).to_pandas() expected = ts.dt.round(frequency) np.testing.assert_array_equal(result, expected) @@ -2647,7 +2682,7 @@ def _check_temporal_rounding(ts, values, unit): if ta.type.tz is None: options = pc.RoundTemporalOptions( value, unit, ceil_is_strictly_greater=True) - result = pc.ceil_temporal(ta, options=options) + result = ceil_temporal(ta, options=options) expected = ts.dt.ceil(frequency) expected = np.where( @@ -2660,15 +2695,15 @@ def _check_temporal_rounding(ts, values, unit): if unit == "day": frequency = "1D" - result = pc.ceil_temporal(ta).to_pandas() + result = ceil_temporal(ta).to_pandas() expected = ts.dt.ceil(frequency) np.testing.assert_array_equal(result, expected) - result = pc.floor_temporal(ta).to_pandas() + result 
= floor_temporal(ta).to_pandas() expected = ts.dt.floor(frequency) np.testing.assert_array_equal(result, expected) - result = pc.round_temporal(ta).to_pandas() + result = round_temporal(ta).to_pandas() expected = ts.dt.round(frequency) np.testing.assert_array_equal(result, expected) @@ -2706,15 +2741,15 @@ def test_round_temporal(unit): def test_count(): arr = pa.array([1, 2, 3, None, None]) - assert pc.count(arr).as_py() == 3 - assert pc.count(arr, mode='only_valid').as_py() == 3 - assert pc.count(arr, mode='only_null').as_py() == 2 - assert pc.count(arr, mode='all').as_py() == 5 - assert pc.count(arr, 'all').as_py() == 5 + assert count(arr).as_py() == 3 + assert count(arr, mode='only_valid').as_py() == 3 + assert count(arr, mode='only_null').as_py() == 2 + assert count(arr, mode='all').as_py() == 5 + assert count(arr, 'all').as_py() == 5 with pytest.raises(ValueError, match='"something else" is not a valid count mode'): - pc.count(arr, 'something else') + count(arr, 'something else') def test_index(): @@ -2756,15 +2791,15 @@ def test_partition_nth(): data = list(range(100, 140)) random.shuffle(data) pivot = 10 - indices = pc.partition_nth_indices(data, pivot=pivot) + indices = partition_nth_indices(data, pivot=pivot) check_partition_nth(data, indices, pivot, "at_end") # Positional pivot argument - assert pc.partition_nth_indices(data, pivot) == indices + assert partition_nth_indices(data, pivot) == indices with pytest.raises( ValueError, match="'partition_nth_indices' cannot be called without options"): - pc.partition_nth_indices(data) + partition_nth_indices(data) def test_partition_nth_null_placement(): @@ -2773,14 +2808,14 @@ def test_partition_nth_null_placement(): for pivot in (0, 7, 13, 19): for null_placement in ("at_start", "at_end"): - indices = pc.partition_nth_indices(data, pivot=pivot, - null_placement=null_placement) + indices = partition_nth_indices(data, pivot=pivot, + null_placement=null_placement) check_partition_nth(data, indices, pivot, null_placement) def test_select_k_array(): def validate_select_k(select_k_indices, arr, order, stable_sort=False): - sorted_indices = pc.sort_indices(arr, sort_keys=[("dummy", order)]) + sorted_indices = sort_indices(arr, sort_keys=[("dummy", order)]) head_k_indices = sorted_indices.slice(0, len(select_k_indices)) if stable_sort: assert select_k_indices == head_k_indices @@ -2792,7 +2827,7 @@ def validate_select_k(select_k_indices, arr, order, stable_sort=False): arr = pa.array([1, 2, None, 0]) for k in [0, 2, 4]: for order in ["descending", "ascending"]: - result = pc.select_k_unstable( + result = select_k_unstable( arr, k=k, sort_keys=[("dummy", order)]) validate_select_k(result, arr, order) @@ -2802,26 +2837,26 @@ def validate_select_k(select_k_indices, arr, order, stable_sort=False): result = pc.bottom_k_unstable(arr, k=k) validate_select_k(result, arr, "ascending") - result = pc.select_k_unstable( + result = select_k_unstable( arr, options=pc.SelectKOptions( k=2, sort_keys=[("dummy", "descending")]) ) validate_select_k(result, arr, "descending") - result = pc.select_k_unstable( + result = select_k_unstable( arr, options=pc.SelectKOptions(k=2, sort_keys=[("dummy", "ascending")]) ) validate_select_k(result, arr, "ascending") # Position options - assert pc.select_k_unstable(arr, 2, - sort_keys=[("dummy", "ascending")]) == result - assert pc.select_k_unstable(arr, 2, [("dummy", "ascending")]) == result + assert select_k_unstable(arr, 2, + sort_keys=[("dummy", "ascending")]) == result + assert select_k_unstable(arr, 2, [("dummy", 
"ascending")]) == result def test_select_k_table(): def validate_select_k(select_k_indices, tbl, sort_keys, stable_sort=False): - sorted_indices = pc.sort_indices(tbl, sort_keys=sort_keys) + sorted_indices = sort_indices(tbl, sort_keys=sort_keys) head_k_indices = sorted_indices.slice(0, len(select_k_indices)) if stable_sort: assert select_k_indices == head_k_indices @@ -2832,11 +2867,11 @@ def validate_select_k(select_k_indices, tbl, sort_keys, stable_sort=False): table = pa.table({"a": [1, 2, 0], "b": [1, 0, 1]}) for k in [0, 2, 4]: - result = pc.select_k_unstable( + result = select_k_unstable( table, k=k, sort_keys=[("a", "ascending")]) validate_select_k(result, table, sort_keys=[("a", "ascending")]) - result = pc.select_k_unstable( + result = select_k_unstable( table, k=k, sort_keys=[(pc.field("a"), "ascending"), ("b", "ascending")]) validate_select_k( result, table, sort_keys=[("a", "ascending"), ("b", "ascending")]) @@ -2851,65 +2886,65 @@ def validate_select_k(select_k_indices, tbl, sort_keys, stable_sort=False): with pytest.raises( ValueError, match="'select_k_unstable' cannot be called without options"): - pc.select_k_unstable(table) + select_k_unstable(table) with pytest.raises(ValueError, match="select_k_unstable requires a nonnegative `k`"): - pc.select_k_unstable(table, k=-1, sort_keys=[("a", "ascending")]) + select_k_unstable(table, k=-1, sort_keys=[("a", "ascending")]) with pytest.raises(ValueError, match="select_k_unstable requires a " "non-empty `sort_keys`"): - pc.select_k_unstable(table, k=2, sort_keys=[]) + select_k_unstable(table, k=2, sort_keys=[]) with pytest.raises(ValueError, match="not a valid sort order"): - pc.select_k_unstable(table, k=k, sort_keys=[("a", "nonscending")]) + select_k_unstable(table, k=k, sort_keys=[("a", "nonscending")]) with pytest.raises(ValueError, match="Invalid sort key column: No match for.*unknown"): - pc.select_k_unstable(table, k=k, sort_keys=[("unknown", "ascending")]) + select_k_unstable(table, k=k, sort_keys=[("unknown", "ascending")]) def test_array_sort_indices(): arr = pa.array([1, 2, None, 0]) - result = pc.array_sort_indices(arr) + result = array_sort_indices(arr) assert result.to_pylist() == [3, 0, 1, 2] - result = pc.array_sort_indices(arr, order="ascending") + result = array_sort_indices(arr, order="ascending") assert result.to_pylist() == [3, 0, 1, 2] - result = pc.array_sort_indices(arr, order="descending") + result = array_sort_indices(arr, order="descending") assert result.to_pylist() == [1, 0, 3, 2] - result = pc.array_sort_indices(arr, order="descending", - null_placement="at_start") + result = array_sort_indices(arr, order="descending", + null_placement="at_start") assert result.to_pylist() == [2, 1, 0, 3] - result = pc.array_sort_indices(arr, "descending", - null_placement="at_start") + result = array_sort_indices(arr, "descending", + null_placement="at_start") assert result.to_pylist() == [2, 1, 0, 3] with pytest.raises(ValueError, match="not a valid sort order"): - pc.array_sort_indices(arr, order="nonscending") + array_sort_indices(arr, order="nonscending") def test_sort_indices_array(): arr = pa.array([1, 2, None, 0]) - result = pc.sort_indices(arr) + result = sort_indices(arr) assert result.to_pylist() == [3, 0, 1, 2] - result = pc.sort_indices(arr, sort_keys=[("dummy", "ascending")]) + result = sort_indices(arr, sort_keys=[("dummy", "ascending")]) assert result.to_pylist() == [3, 0, 1, 2] - result = pc.sort_indices(arr, sort_keys=[("dummy", "descending")]) + result = sort_indices(arr, sort_keys=[("dummy", 
"descending")]) assert result.to_pylist() == [1, 0, 3, 2] - result = pc.sort_indices(arr, sort_keys=[("dummy", "descending")], - null_placement="at_start") + result = sort_indices(arr, sort_keys=[("dummy", "descending")], + null_placement="at_start") assert result.to_pylist() == [2, 1, 0, 3] # Positional `sort_keys` - result = pc.sort_indices(arr, [("dummy", "descending")], - null_placement="at_start") + result = sort_indices(arr, [("dummy", "descending")], + null_placement="at_start") assert result.to_pylist() == [2, 1, 0, 3] # Using SortOptions - result = pc.sort_indices( + result = sort_indices( arr, options=pc.SortOptions(sort_keys=[("dummy", "descending")]) ) assert result.to_pylist() == [1, 0, 3, 2] - result = pc.sort_indices( + result = sort_indices( arr, options=pc.SortOptions(sort_keys=[("dummy", "descending")], null_placement="at_start") ) @@ -2919,134 +2954,134 @@ def test_sort_indices_array(): def test_sort_indices_table(): table = pa.table({"a": [1, 1, None, 0], "b": [1, 0, 0, 1]}) - result = pc.sort_indices(table, sort_keys=[("a", "ascending")]) + result = sort_indices(table, sort_keys=[("a", "ascending")]) assert result.to_pylist() == [3, 0, 1, 2] - result = pc.sort_indices(table, sort_keys=[(pc.field("a"), "ascending")], - null_placement="at_start") + result = sort_indices(table, sort_keys=[(pc.field("a"), "ascending")], + null_placement="at_start") assert result.to_pylist() == [2, 3, 0, 1] - result = pc.sort_indices( + result = sort_indices( table, sort_keys=[("a", "descending"), ("b", "ascending")] ) assert result.to_pylist() == [1, 0, 3, 2] - result = pc.sort_indices( + result = sort_indices( table, sort_keys=[("a", "descending"), ("b", "ascending")], null_placement="at_start" ) assert result.to_pylist() == [2, 1, 0, 3] # Positional `sort_keys` - result = pc.sort_indices( + result = sort_indices( table, [("a", "descending"), ("b", "ascending")], null_placement="at_start" ) assert result.to_pylist() == [2, 1, 0, 3] with pytest.raises(ValueError, match="Must specify one or more sort keys"): - pc.sort_indices(table) + sort_indices(table) with pytest.raises(ValueError, match="Invalid sort key column: No match for.*unknown"): - pc.sort_indices(table, sort_keys=[("unknown", "ascending")]) + sort_indices(table, sort_keys=[("unknown", "ascending")]) with pytest.raises(ValueError, match="not a valid sort order"): - pc.sort_indices(table, sort_keys=[("a", "nonscending")]) + sort_indices(table, sort_keys=[("a", "nonscending")]) def test_is_in(): arr = pa.array([1, 2, None, 1, 2, 3]) - result = pc.is_in(arr, value_set=pa.array([1, 3, None])) + result = is_in(arr, value_set=pa.array([1, 3, None])) assert result.to_pylist() == [True, False, True, True, False, True] - result = pc.is_in(arr, value_set=pa.array([1, 3, None]), skip_nulls=True) + result = is_in(arr, value_set=pa.array([1, 3, None]), skip_nulls=True) assert result.to_pylist() == [True, False, False, True, False, True] - result = pc.is_in(arr, value_set=pa.array([1, 3])) + result = is_in(arr, value_set=pa.array([1, 3])) assert result.to_pylist() == [True, False, False, True, False, True] - result = pc.is_in(arr, value_set=pa.array([1, 3]), skip_nulls=True) + result = is_in(arr, value_set=pa.array([1, 3]), skip_nulls=True) assert result.to_pylist() == [True, False, False, True, False, True] def test_index_in(): arr = pa.array([1, 2, None, 1, 2, 3]) - result = pc.index_in(arr, value_set=pa.array([1, 3, None])) + result = index_in(arr, value_set=pa.array([1, 3, None])) assert result.to_pylist() == [0, None, 2, 0, None, 1] - 
result = pc.index_in(arr, value_set=pa.array([1, 3, None]), - skip_nulls=True) + result = index_in(arr, value_set=pa.array([1, 3, None]), + skip_nulls=True) assert result.to_pylist() == [0, None, None, 0, None, 1] - result = pc.index_in(arr, value_set=pa.array([1, 3])) + result = index_in(arr, value_set=pa.array([1, 3])) assert result.to_pylist() == [0, None, None, 0, None, 1] - result = pc.index_in(arr, value_set=pa.array([1, 3]), skip_nulls=True) + result = index_in(arr, value_set=pa.array([1, 3]), skip_nulls=True) assert result.to_pylist() == [0, None, None, 0, None, 1] # Positional value_set - result = pc.index_in(arr, pa.array([1, 3]), skip_nulls=True) + result = index_in(arr, pa.array([1, 3]), skip_nulls=True) assert result.to_pylist() == [0, None, None, 0, None, 1] def test_quantile(): arr = pa.array([1, 2, 3, 4]) - result = pc.quantile(arr) + result = quantile(arr) assert result.to_pylist() == [2.5] - result = pc.quantile(arr, interpolation='lower') + result = quantile(arr, interpolation='lower') assert result.to_pylist() == [2] - result = pc.quantile(arr, interpolation='higher') + result = quantile(arr, interpolation='higher') assert result.to_pylist() == [3] - result = pc.quantile(arr, interpolation='nearest') + result = quantile(arr, interpolation='nearest') assert result.to_pylist() == [3] - result = pc.quantile(arr, interpolation='midpoint') + result = quantile(arr, interpolation='midpoint') assert result.to_pylist() == [2.5] - result = pc.quantile(arr, interpolation='linear') + result = quantile(arr, interpolation='linear') assert result.to_pylist() == [2.5] arr = pa.array([1, 2]) - result = pc.quantile(arr, q=[0.25, 0.5, 0.75]) + result = quantile(arr, q=[0.25, 0.5, 0.75]) assert result.to_pylist() == [1.25, 1.5, 1.75] - result = pc.quantile(arr, q=[0.25, 0.5, 0.75], interpolation='lower') + result = quantile(arr, q=[0.25, 0.5, 0.75], interpolation='lower') assert result.to_pylist() == [1, 1, 1] - result = pc.quantile(arr, q=[0.25, 0.5, 0.75], interpolation='higher') + result = quantile(arr, q=[0.25, 0.5, 0.75], interpolation='higher') assert result.to_pylist() == [2, 2, 2] - result = pc.quantile(arr, q=[0.25, 0.5, 0.75], interpolation='midpoint') + result = quantile(arr, q=[0.25, 0.5, 0.75], interpolation='midpoint') assert result.to_pylist() == [1.5, 1.5, 1.5] - result = pc.quantile(arr, q=[0.25, 0.5, 0.75], interpolation='nearest') + result = quantile(arr, q=[0.25, 0.5, 0.75], interpolation='nearest') assert result.to_pylist() == [1, 1, 2] - result = pc.quantile(arr, q=[0.25, 0.5, 0.75], interpolation='linear') + result = quantile(arr, q=[0.25, 0.5, 0.75], interpolation='linear') assert result.to_pylist() == [1.25, 1.5, 1.75] # Positional `q` - result = pc.quantile(arr, [0.25, 0.5, 0.75], interpolation='linear') + result = quantile(arr, [0.25, 0.5, 0.75], interpolation='linear') assert result.to_pylist() == [1.25, 1.5, 1.75] with pytest.raises(ValueError, match="Quantile must be between 0 and 1"): - pc.quantile(arr, q=1.1) + quantile(arr, q=1.1) with pytest.raises(ValueError, match="not a valid quantile interpolation"): - pc.quantile(arr, interpolation='zzz') + quantile(arr, interpolation='zzz') def test_tdigest(): arr = pa.array([1, 2, 3, 4]) - result = pc.tdigest(arr) + result = tdigest(arr) assert result.to_pylist() == [2.5] arr = pa.chunked_array([pa.array([1, 2]), pa.array([3, 4])]) - result = pc.tdigest(arr) + result = tdigest(arr) assert result.to_pylist() == [2.5] arr = pa.array([1, 2, 3, 4]) - result = pc.tdigest(arr, q=[0, 0.5, 1]) + result = tdigest(arr, q=[0, 
0.5, 1]) assert result.to_pylist() == [1, 2.5, 4] arr = pa.chunked_array([pa.array([1, 2]), pa.array([3, 4])]) - result = pc.tdigest(arr, [0, 0.5, 1]) # positional `q` + result = tdigest(arr, [0, 0.5, 1]) # positional `q` assert result.to_pylist() == [1, 2.5, 4] @@ -3062,32 +3097,32 @@ def test_min_max_element_wise(): arr2 = pa.array([3, 1, 2]) arr3 = pa.array([2, 3, None]) - result = pc.max_element_wise(arr1, arr2) + result = max_element_wise(arr1, arr2) assert result == pa.array([3, 2, 3]) - result = pc.min_element_wise(arr1, arr2) + result = min_element_wise(arr1, arr2) assert result == pa.array([1, 1, 2]) - result = pc.max_element_wise(arr1, arr2, arr3) + result = max_element_wise(arr1, arr2, arr3) assert result == pa.array([3, 3, 3]) - result = pc.min_element_wise(arr1, arr2, arr3) + result = min_element_wise(arr1, arr2, arr3) assert result == pa.array([1, 1, 2]) # with specifying the option - result = pc.max_element_wise(arr1, arr3, skip_nulls=True) + result = max_element_wise(arr1, arr3, skip_nulls=True) assert result == pa.array([2, 3, 3]) - result = pc.min_element_wise(arr1, arr3, skip_nulls=True) + result = min_element_wise(arr1, arr3, skip_nulls=True) assert result == pa.array([1, 2, 3]) - result = pc.max_element_wise( + result = max_element_wise( arr1, arr3, options=pc.ElementWiseAggregateOptions()) assert result == pa.array([2, 3, 3]) - result = pc.min_element_wise( + result = min_element_wise( arr1, arr3, options=pc.ElementWiseAggregateOptions()) assert result == pa.array([1, 2, 3]) # not skipping nulls - result = pc.max_element_wise(arr1, arr3, skip_nulls=False) + result = max_element_wise(arr1, arr3, skip_nulls=False) assert result == pa.array([2, 3, None]) - result = pc.min_element_wise(arr1, arr3, skip_nulls=False) + result = min_element_wise(arr1, arr3, skip_nulls=False) assert result == pa.array([1, 2, None]) @@ -3113,9 +3148,9 @@ def test_cumulative_sum(start, skip_nulls): if skip_nulls else pa.chunked_array([[0, None, None, None]]) ] for i, arr in enumerate(arrays): - result = pc.cumulative_sum(arr, start=strt, skip_nulls=skip_nulls) + result = cumulative_sum(arr, start=strt, skip_nulls=skip_nulls) # Add `start` offset to expected array before comparing - expected = pc.add(expected_arrays[i], strt if strt is not None + expected = pc_add(expected_arrays[i], strt if strt is not None else 0) assert result.equals(expected) @@ -3134,16 +3169,16 @@ def test_cumulative_sum(start, skip_nulls): if skip_nulls else np.array([1, np.nan, None, None, None, None]) ] for i, arr in enumerate(arrays): - result = pc.cumulative_sum(arr, start=strt, skip_nulls=skip_nulls) + result = cumulative_sum(arr, start=strt, skip_nulls=skip_nulls) # Add `start` offset to expected array before comparing - expected = pc.add(expected_arrays[i], strt if strt is not None + expected = pc_add(expected_arrays[i], strt if strt is not None else 0) np.testing.assert_array_almost_equal(result.to_numpy( zero_copy_only=False), expected.to_numpy(zero_copy_only=False)) for strt in ['a', pa.scalar('arrow'), 1.1]: with pytest.raises(pa.ArrowInvalid): - pc.cumulative_sum([1, 2, 3], start=strt) + cumulative_sum([1, 2, 3], start=strt) @pytest.mark.numpy @@ -3168,10 +3203,10 @@ def test_cumulative_prod(start, skip_nulls): if skip_nulls else pa.chunked_array([[1, None, None, None]]) ] for i, arr in enumerate(arrays): - result = pc.cumulative_prod(arr, start=strt, skip_nulls=skip_nulls) + result = cumulative_prod(arr, start=strt, skip_nulls=skip_nulls) # Multiply `start` offset to expected array before comparing - expected 
= pc.multiply(expected_arrays[i], strt if strt is not None - else 1) + expected = multiply(expected_arrays[i], strt if strt is not None + else 1) assert result.equals(expected) starts = [None, start, pa.scalar(start, type=pa.float32()), @@ -3189,16 +3224,16 @@ def test_cumulative_prod(start, skip_nulls): if skip_nulls else np.array([1, np.nan, None, None, None, None]) ] for i, arr in enumerate(arrays): - result = pc.cumulative_prod(arr, start=strt, skip_nulls=skip_nulls) + result = cumulative_prod(arr, start=strt, skip_nulls=skip_nulls) # Multiply `start` offset to expected array before comparing - expected = pc.multiply(expected_arrays[i], strt if strt is not None - else 1) + expected = multiply(expected_arrays[i], strt if strt is not None + else 1) np.testing.assert_array_almost_equal(result.to_numpy( zero_copy_only=False), expected.to_numpy(zero_copy_only=False)) for strt in ['a', pa.scalar('arrow'), 1.1]: with pytest.raises(pa.ArrowInvalid): - pc.cumulative_prod([1, 2, 3], start=strt) + cumulative_prod([1, 2, 3], start=strt) @pytest.mark.numpy @@ -3224,9 +3259,9 @@ def test_cumulative_max(start, skip_nulls): pa.chunked_array([[2, 2, None, None, None, None]]) ] for i, arr in enumerate(arrays): - result = pc.cumulative_max(arr, start=strt, skip_nulls=skip_nulls) + result = cumulative_max(arr, start=strt, skip_nulls=skip_nulls) # Max `start` offset with expected array before comparing - expected = pc.max_element_wise( + expected = max_element_wise( expected_arrays[i], strt if strt is not None else int(-1e9), skip_nulls=False) assert result.equals(expected) @@ -3246,9 +3281,9 @@ def test_cumulative_max(start, skip_nulls): if skip_nulls else np.array([2.5, 2.5, None, None, None, None]) ] for i, arr in enumerate(arrays): - result = pc.cumulative_max(arr, start=strt, skip_nulls=skip_nulls) + result = cumulative_max(arr, start=strt, skip_nulls=skip_nulls) # Max `start` offset with expected array before comparing - expected = pc.max_element_wise( + expected = max_element_wise( expected_arrays[i], strt if strt is not None else -1e9, skip_nulls=False) np.testing.assert_array_almost_equal(result.to_numpy( @@ -3256,7 +3291,7 @@ def test_cumulative_max(start, skip_nulls): for strt in ['a', pa.scalar('arrow'), 1.1]: with pytest.raises(pa.ArrowInvalid): - pc.cumulative_max([1, 2, 3], start=strt) + cumulative_max([1, 2, 3], start=strt) @pytest.mark.numpy @@ -3282,9 +3317,9 @@ def test_cumulative_min(start, skip_nulls): pa.chunked_array([[5, 5, None, None, None, None]]) ] for i, arr in enumerate(arrays): - result = pc.cumulative_min(arr, start=strt, skip_nulls=skip_nulls) + result = cumulative_min(arr, start=strt, skip_nulls=skip_nulls) # Min `start` offset with expected array before comparing - expected = pc.min_element_wise( + expected = min_element_wise( expected_arrays[i], strt if strt is not None else int(1e9), skip_nulls=False) assert result.equals(expected) @@ -3304,9 +3339,9 @@ def test_cumulative_min(start, skip_nulls): if skip_nulls else np.array([5.5, 5.5, None, None, None, None]) ] for i, arr in enumerate(arrays): - result = pc.cumulative_min(arr, start=strt, skip_nulls=skip_nulls) + result = cumulative_min(arr, start=strt, skip_nulls=skip_nulls) # Min `start` offset with expected array before comparing - expected = pc.min_element_wise( + expected = min_element_wise( expected_arrays[i], strt if strt is not None else 1e9, skip_nulls=False) np.testing.assert_array_almost_equal(result.to_numpy( @@ -3314,26 +3349,26 @@ def test_cumulative_min(start, skip_nulls): for strt in ['a', 
pa.scalar('arrow'), 1.1]: with pytest.raises(pa.ArrowInvalid): - pc.cumulative_max([1, 2, 3], start=strt) + cumulative_max([1, 2, 3], start=strt) def test_make_struct(): - assert pc.make_struct(1, 'a').as_py() == {'0': 1, '1': 'a'} + assert make_struct(1, 'a').as_py() == {'0': 1, '1': 'a'} - assert pc.make_struct(1, 'a', field_names=['i', 's']).as_py() == { + assert make_struct(1, 'a', field_names=['i', 's']).as_py() == { 'i': 1, 's': 'a'} - assert pc.make_struct([1, 2, 3], - "a b c".split()) == pa.StructArray.from_arrays([ - [1, 2, 3], - "a b c".split()], names='0 1'.split()) + assert make_struct([1, 2, 3], + "a b c".split()) == pa.StructArray.from_arrays([ + [1, 2, 3], + "a b c".split()], names='0 1'.split()) with pytest.raises(ValueError, match="Array arguments must all be the same length"): - pc.make_struct([1, 2, 3, 4], "a b c".split()) + make_struct([1, 2, 3, 4], "a b c".split()) with pytest.raises(ValueError, match="0 arguments but 2 field names"): - pc.make_struct(field_names=['one', 'two']) + make_struct(field_names=['one', 'two']) def test_map_lookup(): @@ -3345,12 +3380,12 @@ def test_map_lookup(): result_all = pa.array([[1], None, None, [5, 7], None], type=pa.list_(pa.int32())) - assert pc.map_lookup(arr, 'one', 'first') == result_first - assert pc.map_lookup(arr, pa.scalar( + assert map_lookup(arr, 'one', 'first') == result_first + assert map_lookup(arr, pa.scalar( 'one', type=pa.utf8()), 'first') == result_first - assert pc.map_lookup(arr, pa.scalar( + assert map_lookup(arr, pa.scalar( 'one', type=pa.utf8()), 'last') == result_last - assert pc.map_lookup(arr, pa.scalar( + assert map_lookup(arr, pa.scalar( 'one', type=pa.utf8()), 'all') == result_all @@ -3360,42 +3395,42 @@ def test_struct_fields_options(): c = pa.StructArray.from_arrays([a, b], ["a", "b"]) arr = pa.StructArray.from_arrays([a, c], ["a", "c"]) - assert pc.struct_field(arr, '.c.b') == b - assert pc.struct_field(arr, b'.c.b') == b - assert pc.struct_field(arr, ['c', 'b']) == b - assert pc.struct_field(arr, [1, 'b']) == b - assert pc.struct_field(arr, (b'c', 'b')) == b - assert pc.struct_field(arr, pc.field(('c', 'b'))) == b + assert struct_field(arr, '.c.b') == b + assert struct_field(arr, b'.c.b') == b + assert struct_field(arr, ['c', 'b']) == b + assert struct_field(arr, [1, 'b']) == b + assert struct_field(arr, (b'c', 'b')) == b + assert struct_field(arr, pc.field(('c', 'b'))) == b - assert pc.struct_field(arr, '.a') == a - assert pc.struct_field(arr, ['a']) == a - assert pc.struct_field(arr, 'a') == a - assert pc.struct_field(arr, pc.field(('a',))) == a + assert struct_field(arr, '.a') == a + assert struct_field(arr, ['a']) == a + assert struct_field(arr, 'a') == a + assert struct_field(arr, pc.field(('a',))) == a - assert pc.struct_field(arr, indices=[1, 1]) == b - assert pc.struct_field(arr, (1, 1)) == b - assert pc.struct_field(arr, [0]) == a - assert pc.struct_field(arr, []) == arr + assert struct_field(arr, indices=[1, 1]) == b + assert struct_field(arr, (1, 1)) == b + assert struct_field(arr, [0]) == a + assert struct_field(arr, []) == arr with pytest.raises(pa.ArrowInvalid, match="No match for FieldRef"): - pc.struct_field(arr, 'foo') + struct_field(arr, 'foo') with pytest.raises(pa.ArrowInvalid, match="No match for FieldRef"): - pc.struct_field(arr, '.c.foo') + struct_field(arr, '.c.foo') # drill into a non-struct array and continue to ask for a field with pytest.raises(pa.ArrowInvalid, match="No match for FieldRef"): - pc.struct_field(arr, '.a.foo') + struct_field(arr, '.a.foo') # TODO: 
https://issues.apache.org/jira/browse/ARROW-14853 - # assert pc.struct_field(arr) == arr + # assert struct_field(arr) == arr def test_case_when(): - assert pc.case_when(pc.make_struct([True, False, None], - [False, True, None]), - [1, 2, 3], - [11, 12, 13]) == pa.array([1, 12, None]) + assert case_when(make_struct([True, False, None], + [False, True, None]), + [1, 2, 3], + [11, 12, 13]) == pa.array([1, 12, None]) def test_list_element(): @@ -3406,12 +3441,12 @@ def test_list_element(): lists = pa.array([l1, l2], list_type) index = 1 - result = pa.compute.list_element(lists, index) + result = list_element(lists, index) expected = pa.array([None, {'a': 0.52, 'b': 3}], element_type) assert result.equals(expected) index = 4 - result = pa.compute.list_element(lists, index) + result = list_element(lists, index) expected = pa.array([{'a': 5.6, 'b': 6}, {'a': .6, 'b': 8}], element_type) assert result.equals(expected) @@ -3419,28 +3454,28 @@ def test_list_element(): def test_count_distinct(): samples = [datetime.datetime(year=y, month=1, day=1) for y in range(1992, 2092)] arr = pa.array(samples, pa.timestamp("ns")) - assert pc.count_distinct(arr) == pa.scalar(len(samples), type=pa.int64()) + assert count_distinct(arr) == pa.scalar(len(samples), type=pa.int64()) def test_count_distinct_options(): arr = pa.array([1, 2, 3, None, None]) - assert pc.count_distinct(arr).as_py() == 3 - assert pc.count_distinct(arr, mode='only_valid').as_py() == 3 - assert pc.count_distinct(arr, mode='only_null').as_py() == 1 - assert pc.count_distinct(arr, mode='all').as_py() == 4 - assert pc.count_distinct(arr, 'all').as_py() == 4 + assert count_distinct(arr).as_py() == 3 + assert count_distinct(arr, mode='only_valid').as_py() == 3 + assert count_distinct(arr, mode='only_null').as_py() == 1 + assert count_distinct(arr, mode='all').as_py() == 4 + assert count_distinct(arr, 'all').as_py() == 4 def test_utf8_normalize(): arr = pa.array(["01²3"]) - assert pc.utf8_normalize(arr, form="NFC") == arr - assert pc.utf8_normalize(arr, form="NFKC") == pa.array(["0123"]) - assert pc.utf8_normalize(arr, "NFD") == arr - assert pc.utf8_normalize(arr, "NFKD") == pa.array(["0123"]) + assert utf8_normalize(arr, form="NFC") == arr + assert utf8_normalize(arr, form="NFKC") == pa.array(["0123"]) + assert utf8_normalize(arr, "NFD") == arr + assert utf8_normalize(arr, "NFKD") == pa.array(["0123"]) with pytest.raises( ValueError, match='"NFZ" is not a valid Unicode normalization form'): - pc.utf8_normalize(arr, form="NFZ") + utf8_normalize(arr, form="NFZ") def test_random(): @@ -3482,7 +3517,7 @@ def test_rank_options_tiebreaker(tiebreaker, expected_values): rank_options = pc.RankOptions(sort_keys="ascending", null_placement="at_end", tiebreaker=tiebreaker) - result = pc.rank(arr, options=rank_options) + result = rank(arr, options=rank_options) expected = pa.array(expected_values, type=pa.uint64()) assert result.equals(expected) @@ -3492,24 +3527,24 @@ def test_rank_options(): expected = pa.array([3, 1, 4, 6, 5, 7, 2], type=pa.uint64()) # Ensure rank can be called without specifying options - result = pc.rank(arr) + result = rank(arr) assert result.equals(expected) # Ensure default RankOptions - result = pc.rank(arr, options=pc.RankOptions()) + result = rank(arr, options=pc.RankOptions()) assert result.equals(expected) # Ensure sort_keys tuple usage - result = pc.rank(arr, options=pc.RankOptions( + result = rank(arr, options=pc.RankOptions( sort_keys=[("b", "ascending")]) ) assert result.equals(expected) - result = pc.rank(arr, 
null_placement="at_start") + result = rank(arr, null_placement="at_start") expected_at_start = pa.array([5, 3, 6, 1, 7, 2, 4], type=pa.uint64()) assert result.equals(expected_at_start) - result = pc.rank(arr, sort_keys="descending") + result = rank(arr, sort_keys="descending") expected_descending = pa.array([3, 4, 1, 6, 2, 7, 5], type=pa.uint64()) assert result.equals(expected_descending) @@ -3525,29 +3560,29 @@ def test_rank_quantile_options(): expected = pa.array([0.7, 0.1, 0.7, 0.3, 0.7], type=pa.float64()) # Ensure rank_quantile can be called without specifying options - result = pc.rank_quantile(arr) + result = rank_quantile(arr) assert result.equals(expected) # Ensure default RankOptions - result = pc.rank_quantile(arr, options=pc.RankQuantileOptions()) + result = rank_quantile(arr, options=pc.RankQuantileOptions()) assert result.equals(expected) # Ensure sort_keys tuple usage - result = pc.rank_quantile(arr, options=pc.RankQuantileOptions( + result = rank_quantile(arr, options=pc.RankQuantileOptions( sort_keys=[("b", "ascending")]) ) assert result.equals(expected) - result = pc.rank_quantile(arr, null_placement="at_start") + result = rank_quantile(arr, null_placement="at_start") expected_at_start = pa.array([0.3, 0.7, 0.3, 0.9, 0.3], type=pa.float64()) assert result.equals(expected_at_start) - result = pc.rank_quantile(arr, sort_keys="descending") + result = rank_quantile(arr, sort_keys="descending") expected_descending = pa.array([0.7, 0.3, 0.7, 0.1, 0.7], type=pa.float64()) assert result.equals(expected_descending) with pytest.raises(ValueError, match="not a valid sort order"): - pc.rank_quantile(arr, sort_keys="XXX") + rank_quantile(arr, sort_keys="XXX") def test_rank_normal_options(): @@ -3556,21 +3591,21 @@ def test_rank_normal_options(): expected = pytest.approx( [0.5244005127080407, -1.2815515655446004, 0.5244005127080407, -0.5244005127080409, 0.5244005127080407]) - result = pc.rank_normal(arr) + result = rank_normal(arr) assert result.to_pylist() == expected - result = pc.rank_normal(arr, null_placement="at_end", sort_keys="ascending") + result = rank_normal(arr, null_placement="at_end", sort_keys="ascending") assert result.to_pylist() == expected - result = pc.rank_normal(arr, options=pc.RankQuantileOptions()) + result = rank_normal(arr, options=pc.RankQuantileOptions()) assert result.to_pylist() == expected expected = pytest.approx( [-0.5244005127080409, 1.2815515655446004, -0.5244005127080409, 0.5244005127080407, -0.5244005127080409]) - result = pc.rank_normal(arr, null_placement="at_start", sort_keys="descending") + result = rank_normal(arr, null_placement="at_start", sort_keys="descending") assert result.to_pylist() == expected - result = pc.rank_normal(arr, - options=pc.RankQuantileOptions(null_placement="at_start", - sort_keys="descending")) + result = rank_normal(arr, + options=pc.RankQuantileOptions(null_placement="at_start", + sort_keys="descending")) assert result.to_pylist() == expected @@ -3598,17 +3633,17 @@ def create_sample_expressions(): # These expressions include at least one function call exprs_with_call = [a == b, a != b, a > b, c & j, c | j, ~c, d.is_valid(), - a + b, a - b, a * b, a / b, pc.negate(a), - pc.add(a, b), pc.subtract(a, b), pc.divide(a, b), - pc.multiply(a, b), pc.power(a, a), pc.sqrt(a), - pc.exp(b), pc.cos(b), pc.sin(b), pc.tan(b), - pc.acos(b), pc.atan(b), pc.asin(b), pc.atan2(b, b), - pc.sinh(a), pc.cosh(a), pc.tanh(a), - pc.asinh(a), pc.acosh(b), pc.atanh(k), - pc.abs(b), pc.sign(a), pc.bit_wise_not(a), - pc.bit_wise_and(a, a), 
pc.bit_wise_or(a, a), - pc.bit_wise_xor(a, a), pc.is_nan(b), pc.is_finite(b), - pc.coalesce(a, b), + a + b, a - b, a * b, a / b, negate(a), + pc_add(a, b), subtract(a, b), divide(a, b), + multiply(a, b), power(a, a), sqrt(a), + exp(b), cos(b), sin(b), tan(b), + acos(b), atan(b), asin(b), atan2(b, b), + sinh(a), cosh(a), tanh(a), + asinh(a), acosh(b), atanh(k), + pc_abs(b), sign(a), bit_wise_not(a), + bit_wise_and(a, a), bit_wise_or(a, a), + bit_wise_xor(a, a), is_nan(b), is_finite(b), + coalesce(a, b), a.cast(pa.int32(), safe=False)] # These expressions test out various reference styles and may include function @@ -3772,29 +3807,29 @@ def test_expression_call_function(): field = pc.field("field") # no options - assert str(pc.hour(field)) == "hour(field)" + assert str(hour(field)) == "hour(field)" # default options - assert str(pc.round(field)) == "round(field)" + assert str(pc_round(field)) == "round(field)" # specified options - assert str(pc.round(field, ndigits=1)) == \ + assert str(pc_round(field, ndigits=1)) == \ "round(field, {ndigits=1, round_mode=HALF_TO_EVEN})" # Will convert non-expression arguments if possible - assert str(pc.add(field, 1)) == "add(field, 1)" - assert str(pc.add(field, pa.scalar(1))) == "add(field, 1)" + assert str(pc_add(field, 1)) == "add(field, 1)" + assert str(pc_add(field, pa.scalar(1))) == "add(field, 1)" # Invalid pc.scalar input gives original error message msg = "only other expressions allowed as arguments" with pytest.raises(TypeError, match=msg): - pc.add(field, object) + pc_add(field, object) def test_cast_table_raises(): table = pa.table({'a': [1, 2]}) - with pytest.raises(pa.lib.ArrowTypeError): - pc.cast(table, pa.int64()) + with pytest.raises(lib.ArrowTypeError): + cast(table, pa.int64()) @pytest.mark.parametrize("start,stop,expected", ( @@ -3821,9 +3856,9 @@ def test_list_slice_output_fixed(start, stop, step, expected, value_type, msg = ("Unable to produce FixedSizeListArray from " "non-FixedSizeListArray without `stop` being set.") with pytest.raises(pa.ArrowInvalid, match=msg): - pc.list_slice(*args) + list_slice(*args) else: - result = pc.list_slice(*args) + result = list_slice(*args) pylist = result.cast(pa.list_(pa.int8(), result.type.list_size)).to_pylist() assert pylist == [e[::step] if e else e for e in expected] @@ -3854,8 +3889,8 @@ def test_list_slice_output_variable(start, stop, step, value_type, list_type): if list_type == "fixed": list_type = pa.list_ # non fixed output type - result = pc.list_slice(arr, start, stop, step, - return_fixed_size_list=False) + result = list_slice(arr, start, stop, step, + return_fixed_size_list=False) assert result.type == list_type(value_type()) pylist = result.cast(pa.list_(pa.int8())).to_pylist() @@ -3872,7 +3907,7 @@ def test_list_slice_output_variable(start, stop, step, value_type, list_type): lambda: pa.large_list(pa.field('col', pa.int8())))) def test_list_slice_field_names_retained(return_fixed_size, type): arr = pa.array([[1]], type()) - out = pc.list_slice(arr, 0, 1, return_fixed_size_list=return_fixed_size) + out = list_slice(arr, 0, 1, return_fixed_size_list=return_fixed_size) assert arr.type.field(0).name == out.type.field(0).name # Verify out type matches in type if return_fixed_size_list==None @@ -3884,27 +3919,27 @@ def test_list_slice_bad_parameters(): arr = pa.array([[1]], pa.list_(pa.int8(), 1)) msg = r"`start`(.*) should be greater than 0 and smaller than `stop`(.*)" with pytest.raises(pa.ArrowInvalid, match=msg): - pc.list_slice(arr, -1, 1) # negative start? 
+ list_slice(arr, -1, 1) # negative start? with pytest.raises(pa.ArrowInvalid, match=msg): - pc.list_slice(arr, 2, 1) # start > stop? + list_slice(arr, 2, 1) # start > stop? # TODO(ARROW-18281): start==stop -> empty lists with pytest.raises(pa.ArrowInvalid, match=msg): - pc.list_slice(arr, 0, 0) # start == stop? + list_slice(arr, 0, 0) # start == stop? # Step not >= 1 msg = "`step` must be >= 1, got: " with pytest.raises(pa.ArrowInvalid, match=msg + "0"): - pc.list_slice(arr, 0, 1, step=0) + list_slice(arr, 0, 1, step=0) with pytest.raises(pa.ArrowInvalid, match=msg + "-1"): - pc.list_slice(arr, 0, 1, step=-1) + list_slice(arr, 0, 1, step=-1) def check_run_end_encode_decode(value_type, run_end_encode_opts=None): values = [1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3] arr = pa.array(values, type=value_type) - encoded = pc.run_end_encode(arr, options=run_end_encode_opts) - decoded = pc.run_end_decode(encoded) + encoded = run_end_encode(arr, options=run_end_encode_opts) + decoded = run_end_decode(encoded) assert decoded.type == arr.type assert decoded.equals(arr) @@ -3941,65 +3976,65 @@ def test_run_end_encode(value_type, option): def test_pairwise_diff(): arr = pa.array([1, 2, 3, None, 4, 5]) expected = pa.array([None, 1, 1, None, None, 1]) - result = pa.compute.pairwise_diff(arr, period=1) + result = pairwise_diff(arr, period=1) assert result.equals(expected) arr = pa.array([1, 2, 3, None, 4, 5]) expected = pa.array([None, None, 2, None, 1, None]) - result = pa.compute.pairwise_diff(arr, period=2) + result = pairwise_diff(arr, period=2) assert result.equals(expected) # negative period arr = pa.array([1, 2, 3, None, 4, 5], type=pa.int8()) expected = pa.array([-1, -1, None, None, -1, None], type=pa.int8()) - result = pa.compute.pairwise_diff(arr, period=-1) + result = pairwise_diff(arr, period=-1) assert result.equals(expected) # wrap around overflow arr = pa.array([1, 2, 3, None, 4, 5], type=pa.uint8()) expected = pa.array([255, 255, None, None, 255, None], type=pa.uint8()) - result = pa.compute.pairwise_diff(arr, period=-1) + result = pairwise_diff(arr, period=-1) assert result.equals(expected) # fail on overflow arr = pa.array([1, 2, 3, None, 4, 5], type=pa.uint8()) with pytest.raises(pa.ArrowInvalid, match="overflow"): - pa.compute.pairwise_diff_checked(arr, period=-1) + pairwise_diff_checked(arr, period=-1) def test_pivot_wider(): key_names = ["width", "height"] - result = pc.pivot_wider(["height", "width", "depth"], [10, None, 11]) + result = pivot_wider(["height", "width", "depth"], [10, None, 11]) assert result.as_py() == {} - result = pc.pivot_wider(["height", "width", "depth"], [10, None, 11], - key_names) + result = pivot_wider(["height", "width", "depth"], [10, None, 11], + key_names) assert result.as_py() == {"width": None, "height": 10} # check key order assert list(result.as_py()) == ["width", "height"] - result = pc.pivot_wider(["height", "width", "depth"], [10, None, 11], - key_names=key_names) + result = pivot_wider(["height", "width", "depth"], [10, None, 11], + key_names=key_names) assert result.as_py() == {"width": None, "height": 10} with pytest.raises(KeyError, match="Unexpected pivot key: depth"): - result = pc.pivot_wider(["height", "width", "depth"], [10, None, 11], - key_names=key_names, - unexpected_key_behavior="raise") + result = pivot_wider(["height", "width", "depth"], [10, None, 11], + key_names=key_names, + unexpected_key_behavior="raise") with pytest.raises(ValueError, match="Encountered more than one non-null value"): - result = 
pc.pivot_wider(["height", "width", "height"], [10, None, 11], - key_names=key_names) + result = pivot_wider(["height", "width", "height"], [10, None, 11], + key_names=key_names) def test_winsorize(): arr = pa.array([10, 4, 9, 8, 5, 3, 7, 2, 1, 6]) - result = pc.winsorize(arr, 0.1, 0.8) + result = winsorize(arr, 0.1, 0.8) assert result.to_pylist() == [8, 4, 8, 8, 5, 3, 7, 2, 2, 6] - result = pc.winsorize( + result = winsorize( arr, options=pc.WinsorizeOptions(lower_limit=0.1, upper_limit=0.8)) assert result.to_pylist() == [8, 4, 8, 8, 5, 3, 7, 2, 2, 6] diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py index 170f62a43bd..71c96835d2c 100644 --- a/python/pyarrow/tests/test_csv.py +++ b/python/pyarrow/tests/test_csv.py @@ -1989,7 +1989,8 @@ def test_write_quoting_style(): buf = io.BytesIO() for write_options, res in [ (WriteOptions(quoting_style='needed'), b'"c1"\n","\n""""\n'), - (WriteOptions(quoting_style='none'), pa.lib.ArrowInvalid), + (WriteOptions(quoting_style='none'), pa.lib.ArrowInvalid), \ + # type: ignore[unresolved-attribute] ]: with CSVWriter(buf, t.schema, write_options=write_options) as writer: try: diff --git a/python/pyarrow/tests/test_cuda.py b/python/pyarrow/tests/test_cuda.py index d8298eec773..1ca5a9529e4 100644 --- a/python/pyarrow/tests/test_cuda.py +++ b/python/pyarrow/tests/test_cuda.py @@ -807,8 +807,9 @@ def test_create_table_with_device_buffers(): def other_process_for_test_IPC(handle_buffer, expected_arr): - other_context = pa.cuda.Context(0) - ipc_handle = pa.cuda.IpcMemHandle.from_buffer(handle_buffer) + other_context = pa.cuda.Context(0) # type: ignore[unresolved-attribute] + ipc_handle = pa.cuda.IpcMemHandle.from_buffer(handle_buffer) \ + # type: ignore[unresolved-attribute] ipc_buf = other_context.open_ipc_buffer(ipc_handle) ipc_buf.context.synchronize() buf = ipc_buf.copy_to_host() diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 6303b47bd44..344201ff4f9 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -41,6 +41,9 @@ import pyarrow.feather import pyarrow.fs as fs import pyarrow.json +from pyarrow import lib # type: ignore[unresolved-attribute] +from pyarrow.compute import (is_in, hour, days_between, sort_indices, unique) \ + # type: ignore[unresolved-attribute] from pyarrow.lib import is_threading_enabled # type: ignore[unresolved_import] from pyarrow.tests.util import (FSProtocolClass, ProxyHandler, _configure_s3_limited_user, _filesystem_uri, @@ -53,24 +56,21 @@ try: import pyarrow.dataset as ds - from pyarrow.dataset import ( - ParquetFragmentScanOptions, ParquetReadOptions, ParquetFileFragment \ - # type: ignore[possibly-unbound-attribute] - ) + from pyarrow.dataset import ParquetFragmentScanOptions, ParquetReadOptions, \ + ParquetFileFragment, ParquetFileFormat # type: ignore[possibly-unbound-attribute] except ImportError: pass try: from pyarrow.dataset import ( - OrcFileFormat # type: ignore[possibly-unbound-attribute] + OrcFileFormat # type: ignore[possibly-unbound-import] ) except ImportError: pass try: - import pyarrow.parquet as pq - from pyarrow.parquet import ParquetFileFormat \ - # type: ignore[possibly-unbound-attribute] + import pyarrow.parquet as pq \ + # type: ignore[unresolved-import] except ImportError: pass @@ -1276,7 +1276,7 @@ def test_make_fragment_with_size(s3_example_simple): fragments_with_size, format=file_format, schema=table.schema, filesystem=fs ) - with pytest.raises(pyarrow.lib.ArrowInvalid, match='Parquet 
file size is 1 bytes'): + with pytest.raises(lib.ArrowInvalid, match='Parquet file size is 1 bytes'): table = dataset_with_size.to_table() # too large sizes -> error @@ -3158,13 +3158,13 @@ def test_filter_compute_expression(tempdir, dataset_reader): _, path = _create_single_file(tempdir, table) dataset = ds.dataset(str(path)) - filter_ = pc.is_in(ds.field('A'), pa.array(["a", "b"])) + filter_ = is_in(ds.field('A'), pa.array(["a", "b"])) assert dataset_reader.to_table(dataset, filter=filter_).num_rows == 3 - filter_ = pc.hour(ds.field('B')) >= 3 + filter_ = hour(ds.field('B')) >= 3 assert dataset_reader.to_table(dataset, filter=filter_).num_rows == 2 - days = pc.days_between(ds.field('B'), ds.field("C")) + days = days_between(ds.field('B'), ds.field("C")) result = dataset_reader.to_table(dataset, columns={"days": days}) assert result["days"].to_pylist() == [0, 1, 2, 3, 4] @@ -3687,7 +3687,7 @@ def test_column_names_encoding(tempdir, dataset_reader): # Reading as string without specifying encoding should produce an error dataset = ds.dataset(path, format='csv', schema=expected_schema) - with pytest.raises(pyarrow.lib.ArrowInvalid, match="invalid UTF8"): + with pytest.raises(lib.ArrowInvalid, match="invalid UTF8"): dataset_reader.to_table(dataset) # Setting the encoding in the read_options should transcode the data @@ -4189,7 +4189,7 @@ def test_write_to_dataset_given_null_just_works(tempdir): def _sort_table(tab, sort_col): import pyarrow.compute as pc - sorted_indices = pc.sort_indices( + sorted_indices = sort_indices( tab, options=pc.SortOptions([(sort_col, 'ascending')])) return pc.take(tab, sorted_indices) @@ -4637,7 +4637,7 @@ def test_write_dataset_max_open_files(tempdir): def _get_compare_pair(data_source, record_batch, file_format, col_id): num_of_files_generated = _get_num_of_files_generated( base_directory=data_source, file_format=file_format) - number_of_partitions = len(pa.compute.unique(record_batch[col_id])) + number_of_partitions = len(unique(record_batch[col_id])) return num_of_files_generated, number_of_partitions # CASE 1: when max_open_files=default & max_open_files >= num_of_partitions diff --git a/python/pyarrow/tests/test_exec_plan.py b/python/pyarrow/tests/test_exec_plan.py index d85a2c21524..177f3baa378 100644 --- a/python/pyarrow/tests/test_exec_plan.py +++ b/python/pyarrow/tests/test_exec_plan.py @@ -220,13 +220,14 @@ def test_table_join_keys_order(): def test_filter_table_errors(): + from pyarrow.compute import divide # type: ignore[unresolved-attribute] t = pa.table({ "a": [1, 2, 3, 4, 5], "b": [10, 20, 30, 40, 50] }) with pytest.raises(pa.ArrowTypeError): - _filter_table(t, pc.divide(pc.field("a"), pc.scalar(2))) + _filter_table(t, divide(pc.field("a"), pc.scalar(2))) with pytest.raises(pa.ArrowInvalid): _filter_table(t, (pc.field("Z") <= pc.scalar(2))) @@ -267,14 +268,16 @@ def test_filter_table_ordering(): def test_complex_filter_table(): + from pyarrow.compute import bit_wise_and, multiply \ + # type: ignore[unresolved-attribute] t = pa.table({ "a": [1, 2, 3, 4, 5, 6, 6], "b": [10, 20, 30, 40, 50, 60, 61] }) result = _filter_table( - t, ((pc.bit_wise_and(pc.field("a"), pc.scalar(1)) == pc.scalar(0)) & - (pc.multiply(pc.field("a"), pc.scalar(10)) == pc.field("b"))) + t, ((bit_wise_and(pc.field("a"), pc.scalar(1)) == pc.scalar(0)) & + (multiply(pc.field("a"), pc.scalar(10)) == pc.field("b"))) ) assert result == pa.table({ diff --git a/python/pyarrow/tests/test_gdb.py b/python/pyarrow/tests/test_gdb.py index 58aabb7368e..4b1641557e7 100644 --- 
a/python/pyarrow/tests/test_gdb.py +++ b/python/pyarrow/tests/test_gdb.py @@ -158,10 +158,10 @@ def select_frame(self, func_name): m = re.search(pat, out) if m is None: pytest.fail(f"Could not select frame for function {func_name}") - - frame_num = int(m.get(1, None)) - out = self.run_command(f"frame {frame_num}") - assert f"in {func_name}" in out + else: + frame_num = int(m[1]) + out = self.run_command(f"frame {frame_num}") + assert f"in {func_name}" in out def join(self): if self.proc is not None: diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py index 43fd0e1ac0e..bea9a929673 100644 --- a/python/pyarrow/tests/test_io.py +++ b/python/pyarrow/tests/test_io.py @@ -35,6 +35,11 @@ except ImportError: pass +try: + from pyarrow import lib # type: ignore[unresolved-attribute] +except ImportError: + pass + from pyarrow.util import guid from pyarrow import Codec import pyarrow as pa @@ -812,7 +817,7 @@ def test_cache_options_pickling(pickle_module): @pytest.mark.numpy @pytest.mark.parametrize("compression", [ pytest.param( - "bz2", marks=pytest.mark.xfail(raises=pa.lib.ArrowNotImplementedError) + "bz2", marks=pytest.mark.xfail(raises=lib.ArrowNotImplementedError) ), "brotli", "gzip", @@ -853,7 +858,7 @@ def test_compress_decompress(compression): @pytest.mark.numpy @pytest.mark.parametrize("compression", [ pytest.param( - "bz2", marks=pytest.mark.xfail(raises=pa.lib.ArrowNotImplementedError) + "bz2", marks=pytest.mark.xfail(raises=lib.ArrowNotImplementedError) ), "brotli", "gzip", @@ -1725,7 +1730,7 @@ def test_output_stream_constructor(tmpdir): ]) def test_compression_detection(path, expected_compression): if not Codec.is_available(expected_compression): - with pytest.raises(pa.lib.ArrowNotImplementedError): + with pytest.raises(lib.ArrowNotImplementedError): Codec.detect(path) else: codec = Codec.detect(path) @@ -1750,7 +1755,7 @@ def test_unknown_compression_raises(): "zstd", pytest.param( "snappy", - marks=pytest.mark.xfail(raises=pa.lib.ArrowNotImplementedError) + marks=pytest.mark.xfail(raises=lib.ArrowNotImplementedError) ) ]) def test_compressed_roundtrip(compression): diff --git a/python/pyarrow/tests/test_ipc.py b/python/pyarrow/tests/test_ipc.py index ed6e7563ed2..77018f93a24 100644 --- a/python/pyarrow/tests/test_ipc.py +++ b/python/pyarrow/tests/test_ipc.py @@ -33,6 +33,10 @@ import pyarrow as pa from pyarrow.tests.util import changed_environ, invoke_script +try: + from pyarrow import lib # type: ignore[unresolved-attribute] +except ImportError: + pass try: from pandas.testing import assert_frame_equal @@ -1234,7 +1238,7 @@ def __arrow_c_stream__(self, requested_schema=None): assert reader.read_all() == expected.cast(good_schema) # If schema doesn't match, raises TypeError - with pytest.raises(pa.lib.ArrowTypeError, match='Field 0 cannot be cast'): + with pytest.raises(lib.ArrowTypeError, match='Field 0 cannot be cast'): pa.RecordBatchReader.from_stream( wrapper, schema=pa.schema([pa.field('a', pa.list_(pa.int32()))]) ) @@ -1271,7 +1275,7 @@ def test_record_batch_reader_cast(): # Check error for impossible cast in call to .cast() reader = pa.RecordBatchReader.from_batches(schema_src, data) - with pytest.raises(pa.lib.ArrowTypeError, match='Field 0 cannot be cast'): + with pytest.raises(lib.ArrowTypeError, match='Field 0 cannot be cast'): reader.cast(pa.schema([pa.field('a', pa.list_(pa.int32()))])) # Cast to same type should always work (also for types without a T->T cast function) @@ -1309,7 +1313,7 @@ def test_record_batch_reader_cast_nulls(): # when the 
batch is pulled reader = pa.RecordBatchReader.from_batches(schema_src, data_with_nulls) casted_reader = reader.cast(schema_dst) - with pytest.raises(pa.lib.ArrowInvalid, match="Can't cast array"): + with pytest.raises(lib.ArrowInvalid, match="Can't cast array"): casted_reader.read_all() diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 3c3d874395e..9f15bc73c5b 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -32,7 +32,8 @@ import numpy as np import numpy.testing as npt try: - _np_VisibleDeprecationWarning = np.VisibleDeprecationWarning + _np_VisibleDeprecationWarning = np.VisibleDeprecationWarning \ + # type: ignore[unresolved-attribute] except AttributeError: from numpy.exceptions import ( VisibleDeprecationWarning as _np_VisibleDeprecationWarning @@ -47,6 +48,7 @@ from pyarrow.vendored.version import Version import pyarrow as pa +from pyarrow import lib # type: ignore[unresolved-attribute] try: from pyarrow import parquet as pq except ImportError: @@ -1939,7 +1941,7 @@ def test_array_of_bytes_to_strings(self): # cannot be converted to utf-8 def test_array_of_bytes_to_strings_bad_data(self): with pytest.raises( - pa.lib.ArrowInvalid, + lib.ArrowInvalid, match="was not a utf8 string"): pa.array(np.array([b'\x80\x81'], dtype=object), pa.string()) @@ -1955,13 +1957,13 @@ def test_numpy_string_array_to_fixed_size_binary(self): expected = pa.array([b'foo', None, b'baz'], type=pa.binary(3)) assert converted.equals(expected) - with pytest.raises(pa.lib.ArrowInvalid, + with pytest.raises(lib.ArrowInvalid, match=r'Got bytestring of length 3 \(expected 4\)'): arr = np.array([b'foo', b'bar', b'baz'], dtype='|S3') pa.array(arr, type=pa.binary(4)) with pytest.raises( - pa.lib.ArrowInvalid, + lib.ArrowInvalid, match=r'Got bytestring of length 12 \(expected 3\)'): arr = np.array([b'foo', b'bar', b'baz'], dtype='|U3') pa.array(arr, type=pa.binary(3)) @@ -4432,7 +4434,8 @@ def test_convert_to_extension_array(monkeypatch): integer._IntegerDtype, "__from_arrow__") else: monkeypatch.delattr( - pd.core.arrays.integer.NumericDtype, "__from_arrow__") + pd.core.arrays.integer.NumericDtype, "__from_arrow__") \ + # type: ignore[unresolved-attribute] # Int64Dtype has no __from_arrow__ -> use normal conversion result = table.to_pandas() assert len(_get_mgr(result).blocks) == 1 @@ -4478,7 +4481,8 @@ def test_conversion_extensiontype_to_extensionarray(monkeypatch): integer._IntegerDtype, "__from_arrow__") else: monkeypatch.delattr( - pd.core.arrays.integer.NumericDtype, "__from_arrow__") + pd.core.arrays.integer.NumericDtype, "__from_arrow__") \ + # type: ignore[unresolved-attribute] result = arr.to_pandas() assert _get_mgr(result).blocks[0].values.dtype == np.dtype("int64") @@ -5122,7 +5126,7 @@ def test_roundtrip_map_array_with_pydicts_duplicate_keys(): # ------------------------ # With maps as pydicts - with pytest.raises(pa.lib.ArrowException): + with pytest.raises(lib.ArrowException): # raises because of duplicate keys maps.to_pandas(maps_as_pydicts="strict") series_pydicts = maps.to_pandas(maps_as_pydicts="lossy") diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py index cdbe3d00aae..407c69263e8 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -201,7 +201,7 @@ def test_timestamp_scalar(): assert b == "" c = repr(pa.scalar(datetime.datetime(2015, 1, 1), type=pa.timestamp('us'))) assert c == "" - d = repr(pc.assume_timezone( + d = 
repr(pc.assume_timezone( # type: ignore[unresolved-attribute] pa.scalar("2000-01-01").cast(pa.timestamp("s")), "America/New_York")) assert d == "" diff --git a/python/pyarrow/tests/test_schema.py b/python/pyarrow/tests/test_schema.py index dc98f03cded..48af7b143ff 100644 --- a/python/pyarrow/tests/test_schema.py +++ b/python/pyarrow/tests/test_schema.py @@ -627,11 +627,11 @@ def test_type_schema_pickling(pickle_module): pa.union([ pa.field('a', pa.int8()), pa.field('b', pa.int16()) - ], pa.lib.UnionMode_SPARSE), + ], pa.lib.UnionMode_SPARSE), # type: ignore[unresolved-attribute] pa.union([ pa.field('a', pa.int8()), pa.field('b', pa.int16()) - ], pa.lib.UnionMode_DENSE), + ], pa.lib.UnionMode_DENSE), # type: ignore[unresolved-attribute] pa.time32('s'), pa.time64('us'), pa.date32(), diff --git a/python/pyarrow/tests/test_sparse_tensor.py b/python/pyarrow/tests/test_sparse_tensor.py index 6a398f38ac5..89823e04943 100644 --- a/python/pyarrow/tests/test_sparse_tensor.py +++ b/python/pyarrow/tests/test_sparse_tensor.py @@ -26,15 +26,14 @@ import pyarrow as pa try: - import scipy from scipy.sparse import csr_array, coo_array, csr_matrix, coo_matrix except ImportError: - pass + pytestmark = pytest.mark.scipy try: import sparse # type: ignore[unresolved_import] except ImportError: - pass + pytestmark = pytest.mark.pydata_sparse tensor_type_pairs = [ @@ -399,7 +398,7 @@ def test_dense_to_sparse_tensor(dtype_str, arrow_type, sparse_tensor_type): assert np.array_equal(array, result_array) -@pytest.mark.skipif(not scipy, reason="requires scipy") +@pytest.mark.scipy @pytest.mark.parametrize('sparse_object', (coo_array, coo_matrix)) @pytest.mark.parametrize('dtype_str,arrow_type', scipy_type_pairs) def test_sparse_coo_tensor_scipy_roundtrip(dtype_str, arrow_type, @@ -441,7 +440,7 @@ def test_sparse_coo_tensor_scipy_roundtrip(dtype_str, arrow_type, assert out_scipy_matrix.has_canonical_format -@pytest.mark.skipif(not scipy, reason="requires scipy") +@pytest.mark.scipy @pytest.mark.parametrize('sparse_object', (csr_array, csr_matrix)) @pytest.mark.parametrize('dtype_str,arrow_type', scipy_type_pairs) def test_sparse_csr_matrix_scipy_roundtrip(dtype_str, arrow_type, @@ -469,7 +468,7 @@ def test_sparse_csr_matrix_scipy_roundtrip(dtype_str, arrow_type, assert np.array_equal(dense_array, sparse_tensor.to_tensor().to_numpy()) -@pytest.mark.skipif(not sparse, reason="requires pydata/sparse") +@pytest.mark.pydata_sparse @pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs) def test_pydata_sparse_sparse_coo_tensor_roundtrip(dtype_str, arrow_type): dtype = np.dtype(dtype_str) diff --git a/python/pyarrow/tests/test_strategies.py b/python/pyarrow/tests/test_strategies.py index babb839b534..0fe9508aef0 100644 --- a/python/pyarrow/tests/test_strategies.py +++ b/python/pyarrow/tests/test_strategies.py @@ -19,29 +19,29 @@ import pytest -import pyarrow as pa +from pyarrow import lib # type: ignore[unresolved-attribute] import pyarrow.tests.strategies as past @h.given(past.all_types) def test_types(ty): - assert isinstance(ty, pa.lib.DataType) + assert isinstance(ty, lib.DataType) @h.given(past.all_fields) def test_fields(field): - assert isinstance(field, pa.lib.Field) + assert isinstance(field, lib.Field) @h.given(past.all_schemas) def test_schemas(schema): - assert isinstance(schema, pa.lib.Schema) + assert isinstance(schema, lib.Schema) @pytest.mark.numpy @h.given(past.all_arrays) def test_arrays(array): - assert isinstance(array, pa.lib.Array) + assert isinstance(array, lib.Array) @pytest.mark.numpy @@ 
-52,15 +52,15 @@ def test_array_nullability(array): @h.given(past.chunked_arrays(past.primitive_types)) def test_chunked_arrays(chunked_array): - assert isinstance(chunked_array, pa.lib.ChunkedArray) + assert isinstance(chunked_array, lib.ChunkedArray) @h.given(past.all_record_batches) def test_record_batches(record_bath): - assert isinstance(record_bath, pa.lib.RecordBatch) + assert isinstance(record_bath, lib.RecordBatch) @pytest.mark.numpy @h.given(past.all_tables) def test_tables(table): - assert isinstance(table, pa.lib.Table) + assert isinstance(table, lib.Table) diff --git a/python/pyarrow/tests/test_substrait.py b/python/pyarrow/tests/test_substrait.py index d3f5d848bce..8ac0951e489 100644 --- a/python/pyarrow/tests/test_substrait.py +++ b/python/pyarrow/tests/test_substrait.py @@ -22,6 +22,8 @@ import pyarrow as pa import pyarrow.compute as pc +from pyarrow.compute import equal # type: ignore[unresolved-attribute] +from pyarrow import _substrait # type: ignore[unresolved-attribute] from pyarrow.lib import tobytes # type: ignore[unresolved_import] from pyarrow.lib import ArrowInvalid, ArrowNotImplementedError \ # type: ignore[unresolved_import] @@ -86,7 +88,7 @@ def test_run_serialized_query(tmpdir, use_threads): query = tobytes(substrait_query.replace( "FILENAME_PLACEHOLDER", pathlib.Path(path).as_uri())) - buf = pa._substrait._parse_json_plan(query) + buf = _substrait._parse_json_plan(query) reader = substrait.run_query(buf, use_threads=use_threads) res_tb = reader.read_all() @@ -117,7 +119,7 @@ def test_invalid_plan(): ] } """ - buf = pa._substrait._parse_json_plan(tobytes(query)) + buf = _substrait._parse_json_plan(tobytes(query)) exec_message = "Plan has no relations" with pytest.raises(ArrowInvalid, match=exec_message): substrait.run_query(buf) @@ -163,7 +165,7 @@ def test_binary_conversion_with_json_options(tmpdir, use_threads): path = _write_dummy_data_to_disk(tmpdir, file_name, table) query = tobytes(substrait_query.replace( "FILENAME_PLACEHOLDER", pathlib.Path(path).as_uri())) - buf = pa._substrait._parse_json_plan(tobytes(query)) + buf = _substrait._parse_json_plan(tobytes(query)) reader = substrait.run_query(buf, use_threads=use_threads) res_tb = reader.read_all() @@ -182,7 +184,7 @@ def has_function(fns, ext_file, fn_name): def test_get_supported_functions(): - supported_functions = pa._substrait.get_supported_functions() + supported_functions = _substrait.get_supported_functions() # It probably doesn't make sense to exhaustively verify this list but # we can check a sample aggregate and a sample non-aggregate entry assert has_function(supported_functions, @@ -233,7 +235,7 @@ def table_provider(names, schema): } """ - buf = pa._substrait._parse_json_plan(tobytes(substrait_query)) + buf = _substrait._parse_json_plan(tobytes(substrait_query)) reader = pa.substrait.run_query( buf, table_provider=table_provider, use_threads=use_threads) res_tb = reader.read_all() @@ -276,7 +278,7 @@ def table_provider(names, _): } """ - buf = pa._substrait._parse_json_plan(tobytes(substrait_query)) + buf = _substrait._parse_json_plan(tobytes(substrait_query)) exec_message = "Invalid NamedTable Source" with pytest.raises(ArrowInvalid, match=exec_message): substrait.run_query(buf, table_provider=table_provider) @@ -318,7 +320,7 @@ def table_provider(names, _): } """ query = tobytes(substrait_query) - buf = pa._substrait._parse_json_plan(tobytes(query)) + buf = _substrait._parse_json_plan(tobytes(query)) exec_message = "names for NamedTable not provided" with pytest.raises(ArrowInvalid, 
match=exec_message): substrait.run_query(buf, table_provider=table_provider) @@ -437,7 +439,7 @@ def table_provider(names, _): } """ - buf = pa._substrait._parse_json_plan(substrait_query) + buf = _substrait._parse_json_plan(substrait_query) reader = pa.substrait.run_query( buf, table_provider=table_provider, use_threads=use_threads) res_tb = reader.read_all() @@ -560,7 +562,7 @@ def table_provider(names, _): } """ - buf = pa._substrait._parse_json_plan(substrait_query) + buf = _substrait._parse_json_plan(substrait_query) with pytest.raises(pa.ArrowKeyError) as excinfo: pa.substrait.run_query(buf, table_provider=table_provider) assert "No function registered" in str(excinfo.value) @@ -599,7 +601,7 @@ def table_provider(names, schema): } """ - buf = pa._substrait._parse_json_plan(tobytes(substrait_query)) + buf = _substrait._parse_json_plan(tobytes(substrait_query)) reader = pa.substrait.run_query( buf, table_provider=table_provider, use_threads=use_threads) res_tb = reader.read_all() @@ -745,7 +747,7 @@ def table_provider(names, _): ], } """ - buf = pa._substrait._parse_json_plan(substrait_query) + buf = _substrait._parse_json_plan(substrait_query) reader = pa.substrait.run_query( buf, table_provider=table_provider, use_threads=False) res_tb = reader.read_all() @@ -914,7 +916,7 @@ def table_provider(names, _): ], } """ - buf = pa._substrait._parse_json_plan(substrait_query) + buf = _substrait._parse_json_plan(substrait_query) reader = pa.substrait.run_query( buf, table_provider=table_provider, use_threads=False) res_tb = reader.read_all() @@ -930,8 +932,8 @@ def table_provider(names, _): @pytest.mark.parametrize("expr", [ - pc.equal(pc.field("x"), 7), - pc.equal(pc.field("x"), pc.field("y")), + equal(pc.field("x"), 7), + equal(pc.field("x"), pc.field("y")), pc.field("x") > 50 ]) def test_serializing_expressions(expr): @@ -986,7 +988,7 @@ def test_arrow_one_way_types(): ) def check_one_way(field): - expr = pc.is_null(pc.field(field.name)) + expr = pc.is_null(pc.field(field.name)) # type: ignore[unresolved-attribute] buf = pa.substrait.serialize_expressions([expr], ["test_expr"], schema) returned = pa.substrait.deserialize_expressions(buf) assert alt_schema == returned.schema @@ -1000,8 +1002,8 @@ def test_invalid_expression_ser_des(): pa.field("x", pa.int32()), pa.field("y", pa.int32()) ]) - expr = pc.equal(pc.field("x"), 7) - bad_expr = pc.equal(pc.field("z"), 7) + expr = equal(pc.field("x"), 7) + bad_expr = equal(pc.field("z"), 7) # Invalid number of names with pytest.raises(ValueError) as excinfo: pa.substrait.serialize_expressions([expr], [], schema) @@ -1020,13 +1022,13 @@ def test_serializing_multiple_expressions(): pa.field("x", pa.int32()), pa.field("y", pa.int32()) ]) - exprs = [pc.equal(pc.field("x"), 7), pc.equal(pc.field("x"), pc.field("y"))] + exprs = [equal(pc.field("x"), 7), equal(pc.field("x"), pc.field("y"))] buf = pa.substrait.serialize_expressions(exprs, ["first", "second"], schema) returned = pa.substrait.deserialize_expressions(buf) assert schema == returned.schema assert len(returned.expressions) == 2 - norm_exprs = [pc.equal(pc.field(0), 7), pc.equal(pc.field(0), pc.field(1))] + norm_exprs = [equal(pc.field(0), 7), equal(pc.field(0), pc.field(1))] assert str(returned.expressions["first"]) == str(norm_exprs[0]) assert str(returned.expressions["second"]) == str(norm_exprs[1]) @@ -1036,8 +1038,8 @@ def test_serializing_with_compute(): pa.field("x", pa.int32()), pa.field("y", pa.int32()) ]) - expr = pc.equal(pc.field("x"), 7) - expr_norm = pc.equal(pc.field(0), 7) + expr 
= equal(pc.field("x"), 7) + expr_norm = equal(pc.field(0), 7) buf = expr.to_substrait(schema) returned = pa.substrait.deserialize_expressions(buf) @@ -1067,7 +1069,7 @@ def test_serializing_udfs(): ]) a = pc.scalar(10) b = pc.scalar(4) - exprs = [pc.shift_left(a, b)] + exprs = [pc.shift_left(a, b)] # type: ignore[unresolved-attribute] with pytest.raises(ArrowNotImplementedError): pa.substrait.serialize_expressions(exprs, ["expr"], schema) diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index e7726fd0023..64624c93f1e 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -26,6 +26,7 @@ pass import pytest import pyarrow as pa +from pyarrow import lib # type: ignore[unresolved-import] import pyarrow.compute as pc from pyarrow.interchange import from_dataframe from pyarrow.vendored.version import Version @@ -49,8 +50,8 @@ def test_chunked_array_basics(): [7, 8, 9] ]) assert isinstance(data.chunks, list) - assert all(isinstance(c, pa.lib.Int64Array) for c in data.chunks) - assert all(isinstance(c, pa.lib.Int64Array) for c in data.iterchunks()) + assert all(isinstance(c, lib.Int64Array) for c in data.chunks) + assert all(isinstance(c, lib.Int64Array) for c in data.iterchunks()) assert len(data.chunks) == 3 assert data.get_total_buffer_size() == sum(c.get_total_buffer_size() for c in data.iterchunks()) @@ -650,7 +651,7 @@ def __arrow_c_stream__(self, requested_schema=None): # If schema doesn't match, raises NotImplementedError with pytest.raises( - pa.lib.ArrowTypeError, match="Field 0 cannot be cast" + lib.ArrowTypeError, match="Field 0 cannot be cast" ): pa.table( wrapper, schema=pa.schema([pa.field('a', pa.list_(pa.int32()))]) @@ -2230,7 +2231,7 @@ def test_invalid_table_construct(): u8 = pa.uint8() arrays = [pa.array(array, type=u8), pa.array(array[1:], type=u8)] - with pytest.raises(pa.lib.ArrowInvalid): + with pytest.raises(lib.ArrowInvalid): pa.Table.from_arrays(arrays, names=["a1", "a2"]) @@ -3299,7 +3300,7 @@ def test_table_join_asof_by_length_mismatch(): }) msg = "inconsistent size of by-key across inputs" - with pytest.raises(pa.lib.ArrowInvalid, match=msg): + with pytest.raises(lib.ArrowInvalid, match=msg): t1.join_asof( t2, on="on", by=["colA", "colB"], tolerance=1, right_on="on", right_by=["colA"], @@ -3321,7 +3322,7 @@ def test_table_join_asof_by_type_mismatch(): }) msg = "Expected by-key type int64 but got double for field colA in input 1" - with pytest.raises(pa.lib.ArrowInvalid, match=msg): + with pytest.raises(lib.ArrowInvalid, match=msg): t1.join_asof( t2, on="on", by=["colA"], tolerance=1, right_on="on", right_by=["colA"], @@ -3343,7 +3344,7 @@ def test_table_join_asof_on_type_mismatch(): }) msg = "Expected on-key type int64 but got double for field on in input 1" - with pytest.raises(pa.lib.ArrowInvalid, match=msg): + with pytest.raises(lib.ArrowInvalid, match=msg): t1.join_asof( t2, on="on", by=["colA"], tolerance=1, right_on="on", right_by=["colA"], @@ -3470,14 +3471,14 @@ def test_invalid_non_join_column(): }) # check as left table - with pytest.raises(pa.lib.ArrowInvalid) as excinfo: + with pytest.raises(lib.ArrowInvalid) as excinfo: t1.join(t2, 'id', join_type='inner') exp_error_msg = "Data type list is not supported " \ + "in join non-key field array_column" assert exp_error_msg in str(excinfo.value) # check as right table - with pytest.raises(pa.lib.ArrowInvalid) as excinfo: + with pytest.raises(lib.ArrowInvalid) as excinfo: t2.join(t1, 'id', join_type='inner') assert exp_error_msg in 
str(excinfo.value) diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index 5e5f4903e29..4077b302f71 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -38,6 +38,8 @@ import pyarrow.types as types import pyarrow.tests.strategies as past +from pyarrow import lib # type: ignore[unresolved-import] + def get_many_types(): # returning them from a function is required because of pa.dictionary @@ -83,14 +85,14 @@ def get_many_types(): pa.field('b', pa.int8(), nullable=False), pa.field('c', pa.string())]), pa.union([pa.field('a', pa.binary(10)), - pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE), + pa.field('b', pa.string())], mode=lib.UnionMode_DENSE), pa.union([pa.field('a', pa.binary(10)), - pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE, + pa.field('b', pa.string())], mode=lib.UnionMode_DENSE, type_codes=[4, 8]), pa.union([pa.field('a', pa.binary(10)), - pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE), + pa.field('b', pa.string())], mode=lib.UnionMode_SPARSE), pa.union([pa.field('a', pa.binary(10), nullable=False), - pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE), + pa.field('b', pa.string())], mode=lib.UnionMode_SPARSE), pa.dictionary(pa.int32(), pa.string()), pa.run_end_encoded(pa.int16(), pa.int32()), pa.run_end_encoded(pa.int32(), pa.string()), @@ -247,7 +249,7 @@ def test_is_nested_or_struct(): def test_is_union(): - for mode in [pa.lib.UnionMode_SPARSE, pa.lib.UnionMode_DENSE]: + for mode in [lib.UnionMode_SPARSE, lib.UnionMode_DENSE]: assert types.is_union(pa.union([pa.field('a', pa.int32()), pa.field('b', pa.int8()), pa.field('c', pa.string())], @@ -353,7 +355,7 @@ def test_is_primitive(): (datetime.timezone(datetime.timedelta(hours=1, minutes=30)), '+01:30') ]) def test_tzinfo_to_string(tz, expected): - assert pa.lib.tzinfo_to_string(tz) == expected + assert lib.tzinfo_to_string(tz) == expected def test_pytz_tzinfo_to_string(): @@ -361,13 +363,13 @@ def test_pytz_tzinfo_to_string(): tz = [pytz.utc, pytz.timezone('Europe/Paris')] expected = ['UTC', 'Europe/Paris'] - assert [pa.lib.tzinfo_to_string(i) for i in tz] == expected + assert [lib.tzinfo_to_string(i) for i in tz] == expected # StaticTzInfo.tzname returns with '-09' so we need to infer the timezone's # name from the tzinfo.zone attribute tz = [pytz.timezone('Etc/GMT-9'), pytz.FixedOffset(180)] expected = ['Etc/GMT-9', '+03:00'] - assert [pa.lib.tzinfo_to_string(i) for i in tz] == expected + assert [lib.tzinfo_to_string(i) for i in tz] == expected @pytest.mark.timezone_data @@ -381,9 +383,9 @@ def test_dateutil_tzinfo_to_string(): import dateutil.tz tz = dateutil.tz.UTC - assert pa.lib.tzinfo_to_string(tz) == 'UTC' + assert lib.tzinfo_to_string(tz) == 'UTC' tz = dateutil.tz.gettz('Europe/Paris') - assert pa.lib.tzinfo_to_string(tz) == 'Europe/Paris' + assert lib.tzinfo_to_string(tz) == 'Europe/Paris' @pytest.mark.timezone_data @@ -395,20 +397,20 @@ def test_zoneinfo_tzinfo_to_string(): pytest.importorskip('tzdata') tz = zoneinfo.ZoneInfo('UTC') - assert pa.lib.tzinfo_to_string(tz) == 'UTC' + assert lib.tzinfo_to_string(tz) == 'UTC' tz = zoneinfo.ZoneInfo('Europe/Paris') - assert pa.lib.tzinfo_to_string(tz) == 'Europe/Paris' + assert lib.tzinfo_to_string(tz) == 'Europe/Paris' def test_tzinfo_to_string_errors(): msg = "Not an instance of datetime.tzinfo" with pytest.raises(TypeError): - pa.lib.tzinfo_to_string("Europe/Budapest") + lib.tzinfo_to_string("Europe/Budapest") tz = 
datetime.timezone(datetime.timedelta(hours=1, seconds=30)) msg = "Offset must represent whole number of minutes" with pytest.raises(ValueError, match=msg): - pa.lib.tzinfo_to_string(tz) + lib.tzinfo_to_string(tz) if tzst: @@ -421,8 +423,8 @@ def test_tzinfo_to_string_errors(): def test_pytz_timezone_roundtrip(tz): if tz is None: pytest.skip('requires timezone not None') - timezone_string = pa.lib.tzinfo_to_string(tz) - timezone_tzinfo = pa.lib.string_to_tzinfo(timezone_string) + timezone_string = lib.tzinfo_to_string(tz) + timezone_tzinfo = lib.string_to_tzinfo(timezone_string) assert timezone_tzinfo == tz @@ -482,14 +484,14 @@ def tzname(self, dt): def utcoffset(self, dt): return None - assert pa.lib.tzinfo_to_string(CorrectTimezone1()) == "-02:30" - assert pa.lib.tzinfo_to_string(CorrectTimezone2()) == "+03:00" + assert lib.tzinfo_to_string(CorrectTimezone1()) == "-02:30" + assert lib.tzinfo_to_string(CorrectTimezone2()) == "+03:00" msg = (r"Object returned by tzinfo.utcoffset\(None\) is not an instance " r"of datetime.timedelta") for wrong in [BuggyTimezone1(), BuggyTimezone2(), BuggyTimezone3()]: with pytest.raises(ValueError, match=msg): - pa.lib.tzinfo_to_string(wrong) + lib.tzinfo_to_string(wrong) def test_string_to_tzinfo(): @@ -499,7 +501,7 @@ def test_string_to_tzinfo(): expected = [pytz.utc, pytz.timezone('Europe/Paris'), pytz.FixedOffset(180), pytz.FixedOffset(90), pytz.FixedOffset(-120)] - result = [pa.lib.string_to_tzinfo(i) for i in string] + result = [lib.string_to_tzinfo(i) for i in string] assert result == expected except ImportError: @@ -511,7 +513,7 @@ def test_string_to_tzinfo(): datetime.timezone( datetime.timedelta(hours=1, minutes=30)), datetime.timezone(-datetime.timedelta(hours=2))] - result = [pa.lib.string_to_tzinfo(i) for i in string] + result = [lib.string_to_tzinfo(i) for i in string] assert result == expected except ImportError: @@ -525,8 +527,8 @@ def test_timezone_string_roundtrip_pytz(): pytz.utc, pytz.timezone('America/New_York')] name = ['+01:30', '-01:30', 'UTC', 'America/New_York'] - assert [pa.lib.tzinfo_to_string(i) for i in tz] == name - assert [pa.lib.string_to_tzinfo(i)for i in name] == tz + assert [lib.tzinfo_to_string(i) for i in tz] == name + assert [lib.string_to_tzinfo(i)for i in name] == tz def test_timestamp(): @@ -797,13 +799,13 @@ def check_fields(ty, fields): sparse_factories = [ partial(pa.union, mode='sparse'), - partial(pa.union, mode=pa.lib.UnionMode_SPARSE), + partial(pa.union, mode=lib.UnionMode_SPARSE), pa.sparse_union, ] dense_factories = [ partial(pa.union, mode='dense'), - partial(pa.union, mode=pa.lib.UnionMode_DENSE), + partial(pa.union, mode=lib.UnionMode_DENSE), pa.dense_union, ] diff --git a/python/pyproject.toml b/python/pyproject.toml index 8e7f14d3c46..9c16ee08892 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -111,7 +111,7 @@ fallback_version = '22.0.0a0' #possibly-unbound-import = "ignore" #too-many-positional-arguments = "ignore" #unknown-argument = "ignore" -unresolved-attribute = "ignore" +#unresolved-attribute = "ignore" #unresolved-global = "ignore" #unresolved-import = "ignore" #unresolved-reference = "ignore" diff --git a/python/scripts/test_leak.py b/python/scripts/test_leak.py index e99c4751680..9ce8cbb2ba7 100644 --- a/python/scripts/test_leak.py +++ b/python/scripts/test_leak.py @@ -71,7 +71,7 @@ def func(): writer.close() buf_reader = pa.BufferReader(sink.getvalue()) - reader = pa.open_file(buf_reader) + reader = pa.ipc.open_file(buf_reader) reader.read_all() assert_does_not_leak(func, 
iterations=50, tolerance=50) From 98f258b5b3fcbe6262445c0c8ede6e693873bd19 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Sat, 26 Jul 2025 17:37:33 +0200 Subject: [PATCH 31/32] Revert test changes --- .../tests/interchange/test_conversion.py | 2 +- .../interchange/test_interchange_spec.py | 8 +- python/pyarrow/tests/parquet/common.py | 4 +- python/pyarrow/tests/parquet/test_basic.py | 11 +- .../parquet/test_compliant_nested_type.py | 6 +- .../pyarrow/tests/parquet/test_data_types.py | 9 +- python/pyarrow/tests/parquet/test_dataset.py | 6 +- python/pyarrow/tests/parquet/test_datetime.py | 6 +- .../pyarrow/tests/parquet/test_encryption.py | 3 +- python/pyarrow/tests/parquet/test_metadata.py | 12 +- python/pyarrow/tests/parquet/test_pandas.py | 29 +- .../tests/parquet/test_parquet_file.py | 10 +- .../tests/parquet/test_parquet_writer.py | 4 +- python/pyarrow/tests/strategies.py | 10 +- python/pyarrow/tests/test_acero.py | 25 +- .../pyarrow/tests/test_adhoc_memory_leak.py | 2 +- python/pyarrow/tests/test_array.py | 30 +- python/pyarrow/tests/test_builder.py | 3 +- python/pyarrow/tests/test_cffi.py | 7 +- python/pyarrow/tests/test_compute.py | 1110 ++++++++--------- python/pyarrow/tests/test_convert_builtin.py | 2 +- python/pyarrow/tests/test_cpp_internals.py | 2 +- python/pyarrow/tests/test_csv.py | 5 +- python/pyarrow/tests/test_cuda.py | 9 +- .../pyarrow/tests/test_cuda_numba_interop.py | 20 +- python/pyarrow/tests/test_cython.py | 4 +- python/pyarrow/tests/test_dataset.py | 174 ++- .../pyarrow/tests/test_dataset_encryption.py | 38 +- python/pyarrow/tests/test_exec_plan.py | 9 +- python/pyarrow/tests/test_extension_type.py | 8 +- python/pyarrow/tests/test_feather.py | 17 +- python/pyarrow/tests/test_flight.py | 48 +- python/pyarrow/tests/test_fs.py | 51 +- python/pyarrow/tests/test_gandiva.py | 24 +- python/pyarrow/tests/test_gdb.py | 8 +- python/pyarrow/tests/test_io.py | 15 +- python/pyarrow/tests/test_ipc.py | 12 +- python/pyarrow/tests/test_json.py | 6 +- python/pyarrow/tests/test_jvm.py | 23 +- python/pyarrow/tests/test_misc.py | 2 +- python/pyarrow/tests/test_pandas.py | 55 +- python/pyarrow/tests/test_scalars.py | 4 +- python/pyarrow/tests/test_schema.py | 6 +- python/pyarrow/tests/test_sparse_tensor.py | 15 +- python/pyarrow/tests/test_strategies.py | 16 +- python/pyarrow/tests/test_substrait.py | 55 +- python/pyarrow/tests/test_table.py | 23 +- python/pyarrow/tests/test_types.py | 57 +- python/pyarrow/tests/test_udf.py | 6 +- python/pyarrow/tests/util.py | 3 +- python/pyarrow/tests/wsgi_examples.py | 2 +- python/pyproject.toml | 3 + 52 files changed, 949 insertions(+), 1070 deletions(-) diff --git a/python/pyarrow/tests/interchange/test_conversion.py b/python/pyarrow/tests/interchange/test_conversion.py index a584f379738..50da6693aff 100644 --- a/python/pyarrow/tests/interchange/test_conversion.py +++ b/python/pyarrow/tests/interchange/test_conversion.py @@ -23,7 +23,7 @@ try: import numpy as np except ImportError: - pass + np = None import pyarrow.interchange as pi from pyarrow.interchange.column import ( diff --git a/python/pyarrow/tests/interchange/test_interchange_spec.py b/python/pyarrow/tests/interchange/test_interchange_spec.py index 14e2aab4bfb..cea694d1c1e 100644 --- a/python/pyarrow/tests/interchange/test_interchange_spec.py +++ b/python/pyarrow/tests/interchange/test_interchange_spec.py @@ -18,14 +18,14 @@ import ctypes import hypothesis as h import hypothesis.strategies as st -import pyarrow as pa -import pyarrow.tests.strategies as past -import pytest +import pytest 
try: import numpy as np except ImportError: - pass + np = None +import pyarrow as pa +import pyarrow.tests.strategies as past all_types = st.deferred( diff --git a/python/pyarrow/tests/parquet/common.py b/python/pyarrow/tests/parquet/common.py index 8ce804262d1..4f5946649b8 100644 --- a/python/pyarrow/tests/parquet/common.py +++ b/python/pyarrow/tests/parquet/common.py @@ -20,7 +20,7 @@ try: import numpy as np except ImportError: - pass + np = None import pyarrow as pa from pyarrow.tests import util @@ -41,7 +41,7 @@ def _write_table(table, path, **kwargs): def _read_table(*args, **kwargs): import pyarrow.parquet as pq - table = pq.read_table(*args, **kwargs) # type: ignore[missing-argument] + table = pq.read_table(*args, **kwargs) table.validate(full=True) return table diff --git a/python/pyarrow/tests/parquet/test_basic.py b/python/pyarrow/tests/parquet/test_basic.py index 7a0dfcde270..67515c5e247 100644 --- a/python/pyarrow/tests/parquet/test_basic.py +++ b/python/pyarrow/tests/parquet/test_basic.py @@ -16,6 +16,7 @@ # under the License. import os +from collections import OrderedDict import io import warnings from shutil import copytree @@ -33,7 +34,7 @@ import pyarrow.parquet as pq from pyarrow.tests.parquet.common import _read_table, _write_table except ImportError: - pass + pq = None try: @@ -43,12 +44,12 @@ from pyarrow.tests.pandas_examples import dataframe_with_lists from pyarrow.tests.parquet.common import alltypes_sample except ImportError: - pass + pd = tm = None try: import numpy as np except ImportError: - pass + np = None # Marks all of the tests in this module # Ignore these with pytest ... -m 'not parquet' @@ -229,11 +230,11 @@ def test_empty_table_no_columns(): def test_write_nested_zero_length_array_chunk_failure(): # Bug report in ARROW-3792 - cols = dict( + cols = OrderedDict( int32=pa.int32(), list_string=pa.list_(pa.string()) ) - data = [[], [dict(int32=1, list_string=('G',)), ]] + data = [[], [OrderedDict(int32=1, list_string=('G',)), ]] # This produces a table with a column like # )> diff --git a/python/pyarrow/tests/parquet/test_compliant_nested_type.py b/python/pyarrow/tests/parquet/test_compliant_nested_type.py index d7388be8a1b..2345855a332 100644 --- a/python/pyarrow/tests/parquet/test_compliant_nested_type.py +++ b/python/pyarrow/tests/parquet/test_compliant_nested_type.py @@ -24,13 +24,15 @@ from pyarrow.tests.parquet.common import (_read_table, _check_roundtrip) except ImportError: - pass + pq = None try: import pandas as pd + import pandas.testing as tm + from pyarrow.tests.parquet.common import _roundtrip_pandas_dataframe except ImportError: - pass + pd = tm = None # Marks all of the tests in this module diff --git a/python/pyarrow/tests/parquet/test_data_types.py b/python/pyarrow/tests/parquet/test_data_types.py index 9f8f5212382..c546bc1532a 100644 --- a/python/pyarrow/tests/parquet/test_data_types.py +++ b/python/pyarrow/tests/parquet/test_data_types.py @@ -22,7 +22,7 @@ try: import numpy as np except ImportError: - pass + np = None import pytest import pyarrow as pa @@ -33,7 +33,7 @@ import pyarrow.parquet as pq from pyarrow.tests.parquet.common import _read_table, _write_table except ImportError: - pass + pq = None try: @@ -44,7 +44,7 @@ dataframe_with_lists) from pyarrow.tests.parquet.common import alltypes_sample except ImportError: - pass + pd = tm = None # Marks all of the tests in this module @@ -390,8 +390,7 @@ def test_parquet_nested_convenience(tempdir): read = pq.read_table( path, columns=['a']) - tm.assert_frame_equal(read.to_pandas(), 
df[['a']]) \ - # type: ignore[invalid-argument-type] + tm.assert_frame_equal(read.to_pandas(), df[['a']]) read = pq.read_table( path, columns=['a', 'b']) diff --git a/python/pyarrow/tests/parquet/test_dataset.py b/python/pyarrow/tests/parquet/test_dataset.py index 1e6897f703d..b8939443c1d 100644 --- a/python/pyarrow/tests/parquet/test_dataset.py +++ b/python/pyarrow/tests/parquet/test_dataset.py @@ -24,7 +24,7 @@ try: import numpy as np except ImportError: - pass + np = None import pytest import unittest.mock as mock @@ -40,7 +40,7 @@ from pyarrow.tests.parquet.common import ( _read_table, _test_dataframe, _write_table) except ImportError: - pass + pq = None try: @@ -48,7 +48,7 @@ import pandas.testing as tm except ImportError: - pass + pd = tm = None # Marks all of the tests in this module diff --git a/python/pyarrow/tests/parquet/test_datetime.py b/python/pyarrow/tests/parquet/test_datetime.py index 7a95debca3f..b89fd97cb91 100644 --- a/python/pyarrow/tests/parquet/test_datetime.py +++ b/python/pyarrow/tests/parquet/test_datetime.py @@ -22,7 +22,7 @@ try: import numpy as np except ImportError: - pass + np = None import pytest import pyarrow as pa @@ -32,7 +32,7 @@ import pyarrow.parquet as pq from pyarrow.tests.parquet.common import _read_table, _write_table except ImportError: - pass + pq = None try: @@ -41,7 +41,7 @@ from pyarrow.tests.parquet.common import _roundtrip_pandas_dataframe except ImportError: - pass + pd = tm = None # Marks all of the tests in this module diff --git a/python/pyarrow/tests/parquet/test_encryption.py b/python/pyarrow/tests/parquet/test_encryption.py index 5815d65c8d8..a11a4935a1c 100644 --- a/python/pyarrow/tests/parquet/test_encryption.py +++ b/python/pyarrow/tests/parquet/test_encryption.py @@ -22,7 +22,8 @@ import pyarrow.parquet as pq import pyarrow.parquet.encryption as pe except ImportError: - pass + pq = None + pe = None else: from pyarrow.tests.parquet.encryption import ( InMemoryKmsClient, verify_file_encrypted) diff --git a/python/pyarrow/tests/parquet/test_metadata.py b/python/pyarrow/tests/parquet/test_metadata.py index 3386f77bb1a..148bfebaa67 100644 --- a/python/pyarrow/tests/parquet/test_metadata.py +++ b/python/pyarrow/tests/parquet/test_metadata.py @@ -23,7 +23,7 @@ try: import numpy as np except ImportError: - pass + np = None import pytest import pyarrow as pa @@ -35,14 +35,16 @@ import pyarrow.parquet as pq from pyarrow.tests.parquet.common import _write_table except ImportError: - pass + pq = None try: import pandas as pd + import pandas.testing as tm + from pyarrow.tests.parquet.common import alltypes_sample except ImportError: - pass + pd = tm = None # Marks all of the tests in this module @@ -494,12 +496,12 @@ def test_multi_dataset_metadata(tempdir): # Write merged metadata-only file with open(metapath, "wb") as f: - _meta.write_metadata_file(f) # type: ignore[possibly-unbound-attribute] + _meta.write_metadata_file(f) # Read back the metadata meta = pq.read_metadata(metapath) md = meta.to_dict() - _md = _meta.to_dict() # type: ignore[possibly-unbound-attribute] + _md = _meta.to_dict() for key in _md: if key != 'serialized_size': assert _md[key] == md[key] diff --git a/python/pyarrow/tests/parquet/test_pandas.py b/python/pyarrow/tests/parquet/test_pandas.py index edc7a2610eb..703232b7cac 100644 --- a/python/pyarrow/tests/parquet/test_pandas.py +++ b/python/pyarrow/tests/parquet/test_pandas.py @@ -16,12 +16,12 @@ # under the License. 
import io -from json import loads as json_loads +import json try: import numpy as np except ImportError: - pass + np = None import pytest import pyarrow as pa @@ -34,7 +34,7 @@ from pyarrow.tests.parquet.common import (_read_table, _test_dataframe, _write_table) except ImportError: - pass + pq = None try: @@ -44,7 +44,7 @@ from pyarrow.tests.parquet.common import (_roundtrip_pandas_dataframe, alltypes_sample) except ImportError: - pass + pd = tm = None # Marks all of the tests in this module @@ -65,7 +65,7 @@ def test_pandas_parquet_custom_metadata(tempdir): metadata = pq.read_metadata(filename).metadata assert b'pandas' in metadata - js = json_loads(metadata[b'pandas'].decode('utf8')) + js = json.loads(metadata[b'pandas'].decode('utf8')) assert js['index_columns'] == [{'kind': 'range', 'name': None, 'start': 0, 'stop': 10000, @@ -260,8 +260,7 @@ def test_pandas_parquet_configuration_options(tempdir): for compression in ['NONE', 'SNAPPY', 'GZIP', 'LZ4', 'ZSTD']: if (compression != 'NONE' and - not pa.lib.Codec.is_available(compression)): \ - # type: ignore[unresolved-attribute] + not pa.lib.Codec.is_available(compression)): continue _write_table(arrow_table, filename, version='2.6', compression=compression) @@ -426,8 +425,7 @@ def test_backwards_compatible_column_metadata_handling(datadir): table = _read_table( path, columns=['a']) result = table.to_pandas() - tm.assert_frame_equal(result, expected[['a']].reset_index( - drop=True)) # type: ignore[invalid-argument-type] + tm.assert_frame_equal(result, expected[['a']].reset_index(drop=True)) @pytest.mark.pandas @@ -487,7 +485,7 @@ def test_pandas_categorical_roundtrip(): codes = np.array([2, 0, 0, 2, 0, -1, 2], dtype='int32') categories = ['foo', 'bar', 'baz'] df = pd.DataFrame({'x': pd.Categorical.from_codes( - codes, categories=pd.Index(categories))}) + codes, categories=categories)}) buf = pa.BufferOutputStream() pq.write_table(pa.table(df), buf) @@ -532,18 +530,15 @@ def test_write_to_dataset_pandas_preserve_extensiondtypes(tempdir): table, str(tempdir / "case1"), partition_cols=['part'], ) result = pq.read_table(str(tempdir / "case1")).to_pandas() - tm.assert_frame_equal(result[["col"]], df[["col"]]) \ - # type: ignore[invalid-argument-type] + tm.assert_frame_equal(result[["col"]], df[["col"]]) pq.write_to_dataset(table, str(tempdir / "case2")) result = pq.read_table(str(tempdir / "case2")).to_pandas() - tm.assert_frame_equal(result[["col"]], df[["col"]]) \ - # type: ignore[invalid-argument-type] + tm.assert_frame_equal(result[["col"]], df[["col"]]) pq.write_table(table, str(tempdir / "data.parquet")) result = pq.read_table(str(tempdir / "data.parquet")).to_pandas() - tm.assert_frame_equal(result[["col"]], df[["col"]]) \ - # type: ignore[invalid-argument-type] + tm.assert_frame_equal(result[["col"]], df[["col"]]) @pytest.mark.pandas @@ -560,7 +555,7 @@ def test_write_to_dataset_pandas_preserve_index(tempdir): table, str(tempdir / "case1"), partition_cols=['part'], ) result = pq.read_table(str(tempdir / "case1")).to_pandas() - tm.assert_frame_equal(result, df_cat) # type: ignore[invalid-argument-type] + tm.assert_frame_equal(result, df_cat) pq.write_to_dataset(table, str(tempdir / "case2")) result = pq.read_table(str(tempdir / "case2")).to_pandas() diff --git a/python/pyarrow/tests/parquet/test_parquet_file.py b/python/pyarrow/tests/parquet/test_parquet_file.py index 4d4b467e9d3..24ffe612ef7 100644 --- a/python/pyarrow/tests/parquet/test_parquet_file.py +++ b/python/pyarrow/tests/parquet/test_parquet_file.py @@ -30,13 +30,15 @@ import 
pyarrow.parquet as pq from pyarrow.tests.parquet.common import _write_table except ImportError: - pass + pq = None try: + import pandas as pd import pandas.testing as tm + from pyarrow.tests.parquet.common import alltypes_sample except ImportError: - pass + pd = tm = None # Marks all of the tests in this module @@ -325,7 +327,7 @@ def test_parquet_file_with_filesystem(s3_example_fs, use_uri): table = pa.table({"a": range(10)}) pq.write_table(table, s3_path, filesystem=s3_fs) - parquet_file = pq.ParquetFile(*args, **kwargs) # type: ignore[missing-argument] + parquet_file = pq.ParquetFile(*args, **kwargs) assert parquet_file.read() == table assert not parquet_file.closed parquet_file.close() @@ -406,7 +408,7 @@ def test_parquet_file_hugginface_support(): pytest.skip("fsspec is not installed, skipping Hugging Face test") fake_hf_module = types.ModuleType("huggingface_hub") - fake_hf_module.HfFileSystem = MemoryFileSystem # type: ignore[unresolved-attribute] + fake_hf_module.HfFileSystem = MemoryFileSystem with mock.patch.dict("sys.modules", {"huggingface_hub": fake_hf_module}): uri = "hf://datasets/apache/arrow/test.parquet" table = pa.table({"a": range(10)}) diff --git a/python/pyarrow/tests/parquet/test_parquet_writer.py b/python/pyarrow/tests/parquet/test_parquet_writer.py index d6f30ea16be..d1e9e874ba1 100644 --- a/python/pyarrow/tests/parquet/test_parquet_writer.py +++ b/python/pyarrow/tests/parquet/test_parquet_writer.py @@ -25,7 +25,7 @@ from pyarrow.tests.parquet.common import (_read_table, _test_dataframe, _range_integers) except ImportError: - pass + pq = None try: @@ -33,7 +33,7 @@ import pandas.testing as tm except ImportError: - pass + pd = tm = None # Marks all of the tests in this module diff --git a/python/pyarrow/tests/strategies.py b/python/pyarrow/tests/strategies.py index 07ebaa771f1..450cce74f1d 100644 --- a/python/pyarrow/tests/strategies.py +++ b/python/pyarrow/tests/strategies.py @@ -24,24 +24,24 @@ try: import hypothesis.extra.numpy as npst except ImportError: - pass + npst = None try: import hypothesis.extra.pytz as tzst except ImportError: - pass + tzst = None try: import zoneinfo except ImportError: - pass + zoneinfo = None if sys.platform == 'win32': try: import tzdata # noqa:F401 except ImportError: - pass + zoneinfo = None try: import numpy as np except ImportError: - pass + np = None import pyarrow as pa diff --git a/python/pyarrow/tests/test_acero.py b/python/pyarrow/tests/test_acero.py index bbec49c5360..cb97e3849fd 100644 --- a/python/pyarrow/tests/test_acero.py +++ b/python/pyarrow/tests/test_acero.py @@ -19,8 +19,7 @@ import pyarrow as pa import pyarrow.compute as pc -from pyarrow.compute import field, multiply, sum, equal, all as pc_all \ - # type: ignore[unresolved-import] +from pyarrow.compute import field try: from pyarrow.acero import ( @@ -38,9 +37,9 @@ try: import pyarrow.dataset as ds - from pyarrow.acero import ScanNodeOptions # type: ignore[possibly-unbound-import] + from pyarrow.acero import ScanNodeOptions except ImportError: - pass + ds = None pytestmark = pytest.mark.acero @@ -122,7 +121,7 @@ def test_filter(table_source): ]) def test_filter_all_rows(source): # GH-46057: filtering all rows should return empty RecordBatch with same schema - result_expr = source.filter(field("number") < 0) + result_expr = source.filter(pc.field("number") < 0) assert result_expr.num_rows == 0 assert type(result_expr) is type(source) @@ -139,7 +138,7 @@ def test_project(table_source): # default name from expression decl = Declaration.from_sequence([ table_source, 
- Declaration("project", ProjectNodeOptions([multiply(field("a"), 2)])) + Declaration("project", ProjectNodeOptions([pc.multiply(field("a"), 2)])) ]) result = decl.to_table() assert result.schema.names == ["multiply(a, 2)"] @@ -148,7 +147,7 @@ def test_project(table_source): # provide name decl = Declaration.from_sequence([ table_source, - Declaration("project", ProjectNodeOptions([multiply(field("a"), 2)], ["a2"])) + Declaration("project", ProjectNodeOptions([pc.multiply(field("a"), 2)], ["a2"])) ]) result = decl.to_table() assert result.schema.names == ["a2"] @@ -156,12 +155,12 @@ def test_project(table_source): # input validation with pytest.raises(ValueError): - ProjectNodeOptions([multiply(field("a"), 2)], ["a2", "b2"]) + ProjectNodeOptions([pc.multiply(field("a"), 2)], ["a2", "b2"]) # no scalar expression decl = Declaration.from_sequence([ table_source, - Declaration("project", ProjectNodeOptions([sum(field("a"))])) + Declaration("project", ProjectNodeOptions([pc.sum(field("a"))])) ]) with pytest.raises(ValueError, match="cannot Execute non-scalar expression"): _ = decl.to_table() @@ -371,7 +370,7 @@ def test_hash_join_with_residual_filter(): join_opts = HashJoinNodeOptions( "inner", left_keys="key", right_keys="key", - filter_expression=equal(field('a'), 5)) + filter_expression=pc.equal(pc.field('a'), 5)) joined = Declaration( "hashjoin", options=join_opts, inputs=[left_source, right_source]) result = joined.to_table() @@ -383,7 +382,7 @@ def test_hash_join_with_residual_filter(): # test filter expression referencing columns from both side join_opts = HashJoinNodeOptions( "left outer", left_keys="key", right_keys="key", - filter_expression=equal(field("a"), 5) | equal(field("b"), 10) + filter_expression=pc.equal(pc.field("a"), 5) | pc.equal(pc.field("b"), 10) ) joined = Declaration( "hashjoin", options=join_opts, inputs=[left_source, right_source]) @@ -488,10 +487,10 @@ def test_scan(tempdir): # projection scan option - scan_opts = ScanNodeOptions(dataset, columns={"a2": multiply(field("a"), 2)}) + scan_opts = ScanNodeOptions(dataset, columns={"a2": pc.multiply(field("a"), 2)}) decl = Declaration("scan", scan_opts) result = decl.to_table() # "a" is included in the result (needed later on for the actual projection) assert result["a"].to_pylist() == [1, 2, 3] # "b" is still included, but without data as it will be removed by the projection - assert pc_all(result["b"].is_null()).as_py() + assert pc.all(result["b"].is_null()).as_py() diff --git a/python/pyarrow/tests/test_adhoc_memory_leak.py b/python/pyarrow/tests/test_adhoc_memory_leak.py index 9f61bc7ddfe..76a766984da 100644 --- a/python/pyarrow/tests/test_adhoc_memory_leak.py +++ b/python/pyarrow/tests/test_adhoc_memory_leak.py @@ -20,7 +20,7 @@ try: import numpy as np except ImportError: - pass + np = None import pyarrow as pa import pyarrow.tests.util as test_util diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 6ab39dd8716..009ab1e849b 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -30,11 +30,7 @@ try: import numpy as np except ImportError: - pass -try: - from pyarrow import lib # type: ignore[unresolved-import] -except ImportError: - pass + np = None import pyarrow as pa import pyarrow.tests.strategies as past @@ -327,7 +323,7 @@ def test_asarray(): np_arr = np.asarray([_ for _ in arr]) assert np_arr.tolist() == [0, 1, 2, 3] assert np_arr.dtype == np.dtype('O') - assert isinstance(np_arr[0], lib.Int64Value) + assert isinstance(np_arr[0], 
pa.lib.Int64Value) # Calling with the arrow array gives back an array with 'int64' dtype np_arr = np.asarray(arr) @@ -554,9 +550,7 @@ def test_arange(): for case in cases: result = pa.arange(*case) result.validate(full=True) - - assert result.equals(pa.array(list(range(*case)), type=pa.int64())) \ - # type: ignore[no-matching-overload] + assert result.equals(pa.array(list(range(*case)), type=pa.int64())) # Validate memory_pool keyword argument result = pa.arange(-1, 101, memory_pool=pa.default_memory_pool()) @@ -1912,9 +1906,9 @@ def test_cast_from_null(): out_types = [ pa.union([pa.field('a', pa.binary(10)), - pa.field('b', pa.string())], mode=lib.UnionMode_DENSE), + pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE), pa.union([pa.field('a', pa.binary(10)), - pa.field('b', pa.string())], mode=lib.UnionMode_SPARSE), + pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE), ] in_arr = pa.array(in_data, type=pa.null()) for out_type in out_types: @@ -3227,8 +3221,8 @@ def test_struct_array_field(): x2 = a.field('x') y2 = a.field('y') - assert isinstance(x0, lib.Int16Array) - assert isinstance(y1, lib.FloatArray) + assert isinstance(x0, pa.lib.Int16Array) + assert isinstance(y1, pa.lib.FloatArray) assert x0.equals(pa.array([1, 3, 5], type=pa.int16())) assert y0.equals(pa.array([2.5, 4.5, 6.5], type=pa.float32())) assert x0.equals(x1) @@ -3262,8 +3256,8 @@ def test_struct_array_flattened_field(): x2 = a._flattened_field('x') y2 = a._flattened_field('y') - assert isinstance(x0, lib.Int16Array) - assert isinstance(y1, lib.FloatArray) + assert isinstance(x0, pa.lib.Int16Array) + assert isinstance(y1, pa.lib.FloatArray) assert x0.equals(pa.array([1, None, 5], type=pa.int16())) assert y0.equals(pa.array([2.5, None, 6.5], type=pa.float32())) assert x0.equals(x1) @@ -3311,7 +3305,7 @@ def test_empty_cast(): # ARROW-4766: Ensure that supported types conversion don't segfault # on empty arrays of common types pa.array([], type=t1).cast(t2) - except (lib.ArrowNotImplementedError, pa.ArrowInvalid): + except (pa.lib.ArrowNotImplementedError, pa.ArrowInvalid): continue @@ -4107,7 +4101,7 @@ def test_list_view_from_arrays_fails(list_array_type, list_type_factory): mask = pa.array([False, False, True]) # Ambiguous to specify both validity map and offsets or sizes with nulls - with pytest.raises(lib.ArrowInvalid): + with pytest.raises(pa.lib.ArrowInvalid): list_array_type.from_arrays(offsets, sizes, values, mask=mask) offsets = [0, 1, 1] @@ -4115,7 +4109,7 @@ def test_list_view_from_arrays_fails(list_array_type, list_type_factory): array_slice = array[1:] # List offsets and sizes must not be slices if a validity map is specified - with pytest.raises(lib.ArrowInvalid): + with pytest.raises(pa.lib.ArrowInvalid): list_array_type.from_arrays( array_slice.offsets, array_slice.sizes, array_slice.values, mask=array_slice.is_null()) diff --git a/python/pyarrow/tests/test_builder.py b/python/pyarrow/tests/test_builder.py index 65ca1458d0c..9187a19b5fc 100644 --- a/python/pyarrow/tests/test_builder.py +++ b/python/pyarrow/tests/test_builder.py @@ -19,8 +19,7 @@ import weakref import pyarrow as pa -from pyarrow.lib import StringBuilder, StringViewBuilder \ - # type: ignore[unresolved_import] +from pyarrow.lib import StringBuilder, StringViewBuilder def test_weakref(): diff --git a/python/pyarrow/tests/test_cffi.py b/python/pyarrow/tests/test_cffi.py index 306225dbf69..84290a6b880 100644 --- a/python/pyarrow/tests/test_cffi.py +++ b/python/pyarrow/tests/test_cffi.py @@ -24,7 +24,7 @@ try: from pyarrow.cffi 
import ffi except ImportError: - pass + ffi = None import pytest @@ -32,7 +32,7 @@ import pandas as pd import pandas.testing as tm except ImportError: - pass + pd = tm = None needs_cffi = pytest.mark.skipif(ffi is None, @@ -676,8 +676,7 @@ def test_roundtrip_reader_capsule(constructor): obj = constructor(schema, batches) bad_schema = pa.schema({'ints': pa.int32()}) - with pytest.raises(pa.lib.ArrowTypeError, match="Field 0 cannot be cast"): \ - # type: ignore[unresolved-attribute] + with pytest.raises(pa.lib.ArrowTypeError, match="Field 0 cannot be cast"): obj.__arrow_c_stream__(bad_schema.__arrow_c_schema__()) # Can work with matching schema diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index e9afe643994..ad61dbc48a7 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -27,56 +27,25 @@ import random import sys import textwrap -from pyarrow import lib # type: ignore[unresolved-import] -from pyarrow.compute import \ - sum as pc_sum, mode, variance, skew, kurtosis, count_substring, \ - count_substring_regex, find_substring, find_substring_regex, match_like, \ - match_substring, match_substring_regex, utf8_trim_whitespace, \ - ascii_trim_whitespace, utf8_trim, utf8_slice_codeunits, binary_slice, \ - split_pattern, utf8_split_whitespace, ascii_split_whitespace, \ - split_pattern_regex, any as pc_any, all as pc_all, filter, min_max, \ - choose, utf8_is_printable, ascii_center, ascii_lpad, ascii_rpad, utf8_center, \ - utf8_lpad, utf8_rpad, binary_replace_slice, utf8_replace_slice, \ - replace_substring, replace_substring_regex, extract_regex, extract_regex_span, \ - binary_join, binary_join_element_wise, not_equal, less, less_equal, greater, \ - greater_equal, equal, round_to_multiple, round_binary, is_null, or_kleene, \ - is_valid, and_, and_kleene, or_, xor, invert, dictionary_decode, \ - dictionary_encode, strptime, strftime, year as pc_year, \ - is_leap_year as pc_is_leap_year, month as pc_month, day as pc_day, \ - day_of_year as pc_day_of_year, iso_year as pc_iso_year, iso_week as pc_iso_week, \ - iso_calendar as pc_iso_calendar, quarter as pc_quarter, hour as pc_hour, \ - minute as pc_minute, second as pc_second, millisecond as pc_millisecond, \ - microsecond as pc_microsecond, nanosecond as pc_nanosecond, \ - subsecond as pc_subsecond, local_timestamp as pc_local_timestamp, \ - is_dst as pc_is_dst, day_of_week as pc_day_of_week, \ - week as pc_week, \ - assume_timezone as pc_assume_timezone, count, ceil_temporal, floor_temporal, \ - round_temporal, partition_nth_indices, select_k_unstable, array_sort_indices, \ - sort_indices, is_in, index_in, quantile, tdigest, cumulative_sum, \ - cumulative_prod, max_element_wise, min_element_wise, cumulative_min, \ - cumulative_max, map_lookup, struct_field, case_when, make_struct, list_element, \ - count_distinct, utf8_normalize, rank, rank_quantile, rank_normal, negate, \ - subtract, divide, multiply, power, sqrt, exp, cos, sin, tan, acos, atan, \ - asin, atan2, sinh, cosh, tanh, asinh, acosh, atanh, abs as pc_abs, sign, \ - bit_wise_not, bit_wise_and, \ - bit_wise_or, bit_wise_xor, is_nan, is_finite, coalesce, hour, round as pc_round, \ - add as pc_add, cast, list_slice, run_end_decode, run_end_encode, pairwise_diff, \ - pairwise_diff_checked, pivot_wider, winsorize # type: ignore[unresolved-import] try: import numpy as np except ImportError: - pass + np = None try: import pandas as pd except ImportError: - pass + pd = None import pyarrow as pa import 
pyarrow.compute as pc -from pyarrow.lib import ArrowNotImplementedError # type: ignore[unresolved_import] +from pyarrow.lib import ArrowNotImplementedError +try: + import pyarrow.substrait as pas +except ImportError: + pas = None exported_functions = [ func for (name, func) in sorted(pc.__dict__.items()) @@ -359,36 +328,36 @@ def test_function_attributes(): def test_input_type_conversion(): # Automatic array conversion from Python - arr = pc_add([1, 2], [4, None]) + arr = pc.add([1, 2], [4, None]) assert arr.to_pylist() == [5, None] # Automatic scalar conversion from Python - arr = pc_add([1, 2], 4) + arr = pc.add([1, 2], 4) assert arr.to_pylist() == [5, 6] # Other scalar type - assert equal(["foo", "bar", None], - "foo").to_pylist() == [True, False, None] + assert pc.equal(["foo", "bar", None], + "foo").to_pylist() == [True, False, None] @pytest.mark.parametrize('arrow_type', numerical_arrow_types) def test_sum_array(arrow_type): arr = pa.array([1, 2, 3, 4], type=arrow_type) assert arr.sum().as_py() == 10 - assert pc_sum(arr).as_py() == 10 + assert pc.sum(arr).as_py() == 10 arr = pa.array([1, 2, 3, 4, None], type=arrow_type) assert arr.sum().as_py() == 10 - assert pc_sum(arr).as_py() == 10 + assert pc.sum(arr).as_py() == 10 arr = pa.array([None], type=arrow_type) assert arr.sum().as_py() is None # noqa: E711 - assert pc_sum(arr).as_py() is None # noqa: E711 + assert pc.sum(arr).as_py() is None # noqa: E711 assert arr.sum(min_count=0).as_py() == 0 - assert pc_sum(arr, min_count=0).as_py() == 0 + assert pc.sum(arr, min_count=0).as_py() == 0 arr = pa.array([], type=arrow_type) assert arr.sum().as_py() is None # noqa: E711 assert arr.sum(min_count=0).as_py() == 0 - assert pc_sum(arr, min_count=0).as_py() == 0 + assert pc.sum(arr, min_count=0).as_py() == 0 @pytest.mark.parametrize("arrow_type", [pa.decimal128(3, 2), pa.decimal256(3, 2)]) @@ -437,24 +406,24 @@ def test_sum_decimal_array(arrow_type): @pytest.mark.parametrize('arrow_type', numerical_arrow_types) def test_sum_chunked_array(arrow_type): arr = pa.chunked_array([pa.array([1, 2, 3, 4], type=arrow_type)]) - assert pc_sum(arr).as_py() == 10 + assert pc.sum(arr).as_py() == 10 arr = pa.chunked_array([ pa.array([1, 2], type=arrow_type), pa.array([3, 4], type=arrow_type) ]) - assert pc_sum(arr).as_py() == 10 + assert pc.sum(arr).as_py() == 10 arr = pa.chunked_array([ pa.array([1, 2], type=arrow_type), pa.array([], type=arrow_type), pa.array([3, 4], type=arrow_type) ]) - assert pc_sum(arr).as_py() == 10 + assert pc.sum(arr).as_py() == 10 arr = pa.chunked_array((), type=arrow_type) assert arr.num_chunks == 0 - assert pc_sum(arr).as_py() is None # noqa: E711 - assert pc_sum(arr, min_count=0).as_py() == 0 + assert pc.sum(arr).as_py() is None # noqa: E711 + assert pc.sum(arr, min_count=0).as_py() == 0 @pytest.mark.parametrize('arrow_type', [pa.decimal128(3, 2), pa.decimal256(3, 2)]) @@ -473,77 +442,77 @@ def test_sum_chunked_array_decimal_type(arrow_type): pa.array([Decimal("1.23"), Decimal("4.56")], type=arrow_type) ] ) - assert pc_sum(arr).as_py() == expected_sum - assert pc_sum(arr).type == max_precision_type + assert pc.sum(arr).as_py() == expected_sum + assert pc.sum(arr).type == max_precision_type arr = pa.chunked_array([ pa.array([Decimal("1.23")], type=arrow_type), pa.array([Decimal("4.56")], type=arrow_type) ]) - assert pc_sum(arr).as_py() == expected_sum - assert pc_sum(arr).type == max_precision_type + assert pc.sum(arr).as_py() == expected_sum + assert pc.sum(arr).type == max_precision_type arr = pa.chunked_array([ 
pa.array([Decimal("1.23")], type=arrow_type), pa.array([], type=arrow_type), pa.array([Decimal("4.56")], type=arrow_type) ]) - assert pc_sum(arr).as_py() == expected_sum - assert pc_sum(arr).type == max_precision_type + assert pc.sum(arr).as_py() == expected_sum + assert pc.sum(arr).type == max_precision_type arr = pa.chunked_array((), type=arrow_type) assert arr.num_chunks == 0 - assert pc_sum(arr).as_py() is None # noqa: E711 - assert pc_sum(arr).type == max_precision_type - assert pc_sum(arr, min_count=0).as_py() == zero - assert pc_sum(arr, min_count=0).type == max_precision_type + assert pc.sum(arr).as_py() is None # noqa: E711 + assert pc.sum(arr).type == max_precision_type + assert pc.sum(arr, min_count=0).as_py() == zero + assert pc.sum(arr, min_count=0).type == max_precision_type def test_mode_array(): # ARROW-9917 - data = pa.array([1, 1, 3, 4, 3, 5], type='int64') - arr = mode(data) - assert len(arr) == 1 - assert arr[0].as_py() == {"mode": 1, "count": 2} - - arr = mode(data, n=2) - assert len(arr) == 2 - assert arr[0].as_py() == {"mode": 1, "count": 2} - assert arr[1].as_py() == {"mode": 3, "count": 2} - - data = pa.array([], type='int64') - assert len(mode(data)) == 0 - - data = pa.array([1, 1, 3, 4, 3, None], type='int64') - arr = mode(data, skip_nulls=False) - assert len(arr) == 0 - arr = mode(data, min_count=6) - assert len(arr) == 0 - arr = mode(data, skip_nulls=False, min_count=5) - assert len(arr) == 0 - - data = pa.array([True, False]) - arr = mode(data, n=2) - assert len(arr) == 2 - assert arr[0].as_py() == {"mode": False, "count": 1} - assert arr[1].as_py() == {"mode": True, "count": 1} + arr = pa.array([1, 1, 3, 4, 3, 5], type='int64') + mode = pc.mode(arr) + assert len(mode) == 1 + assert mode[0].as_py() == {"mode": 1, "count": 2} + + mode = pc.mode(arr, n=2) + assert len(mode) == 2 + assert mode[0].as_py() == {"mode": 1, "count": 2} + assert mode[1].as_py() == {"mode": 3, "count": 2} + + arr = pa.array([], type='int64') + assert len(pc.mode(arr)) == 0 + + arr = pa.array([1, 1, 3, 4, 3, None], type='int64') + mode = pc.mode(arr, skip_nulls=False) + assert len(mode) == 0 + mode = pc.mode(arr, min_count=6) + assert len(mode) == 0 + mode = pc.mode(arr, skip_nulls=False, min_count=5) + assert len(mode) == 0 + + arr = pa.array([True, False]) + mode = pc.mode(arr, n=2) + assert len(mode) == 2 + assert mode[0].as_py() == {"mode": False, "count": 1} + assert mode[1].as_py() == {"mode": True, "count": 1} def test_mode_chunked_array(): # ARROW-9917 - data = pa.chunked_array([pa.array([1, 1, 3, 4, 3, 5], type='int64')]) - arr = mode(data) - assert len(arr) == 1 - assert arr[0].as_py() == {"mode": 1, "count": 2} + arr = pa.chunked_array([pa.array([1, 1, 3, 4, 3, 5], type='int64')]) + mode = pc.mode(arr) + assert len(mode) == 1 + assert mode[0].as_py() == {"mode": 1, "count": 2} - arr = mode(data, n=2) - assert len(arr) == 2 - assert arr[0].as_py() == {"mode": 1, "count": 2} - assert arr[1].as_py() == {"mode": 3, "count": 2} + mode = pc.mode(arr, n=2) + assert len(mode) == 2 + assert mode[0].as_py() == {"mode": 1, "count": 2} + assert mode[1].as_py() == {"mode": 3, "count": 2} arr = pa.chunked_array((), type='int64') assert arr.num_chunks == 0 - assert len(mode(arr)) == 0 + assert len(pc.mode(arr)) == 0 def test_empty_chunked_array(): @@ -556,23 +525,23 @@ def test_empty_chunked_array(): def test_variance(): data = [1, 2, 3, 4, 5, 6, 7, 8] - assert variance(data).as_py() == 5.25 - assert variance(data, ddof=0).as_py() == 5.25 - assert variance(data, ddof=1).as_py() == 6.0 + 
assert pc.variance(data).as_py() == 5.25 + assert pc.variance(data, ddof=0).as_py() == 5.25 + assert pc.variance(data, ddof=1).as_py() == 6.0 def test_skew(): data = [1, 1, None, 2] - assert skew(data).as_py() == pytest.approx(0.707106781186548, rel=1e-10) - assert skew(data, skip_nulls=False).as_py() is None - assert skew(data, min_count=4).as_py() is None + assert pc.skew(data).as_py() == pytest.approx(0.707106781186548, rel=1e-10) + assert pc.skew(data, skip_nulls=False).as_py() is None + assert pc.skew(data, min_count=4).as_py() is None def test_kurtosis(): data = [1, 1, None, 2] - assert kurtosis(data).as_py() == pytest.approx(-1.5, rel=1e-10) - assert kurtosis(data, skip_nulls=False).as_py() is None - assert kurtosis(data, min_count=4).as_py() is None + assert pc.kurtosis(data).as_py() == pytest.approx(-1.5, rel=1e-10) + assert pc.kurtosis(data, skip_nulls=False).as_py() is None + assert pc.kurtosis(data, min_count=4).as_py() is None @pytest.mark.parametrize("input, expected", ( @@ -585,8 +554,8 @@ def test_kurtosis(): ([1, 40], {'skew': None, 'kurtosis': None}), )) def test_unbiased_skew_and_kurtosis(input, expected): - arrow_skew = skew(input, skip_nulls=True, biased=False) - arrow_kurtosis = kurtosis(input, skip_nulls=True, biased=False) + arrow_skew = pc.skew(input, skip_nulls=True, biased=False) + arrow_kurtosis = pc.kurtosis(input, skip_nulls=True, biased=False) assert arrow_skew.as_py() == expected['skew'] assert arrow_kurtosis.as_py() == expected['kurtosis'] @@ -596,11 +565,11 @@ def test_count_substring(): (pa.large_string(), pa.int64())]: arr = pa.array(["ab", "cab", "abcab", "ba", "AB", None], type=ty) - result = count_substring(arr, "ab") + result = pc.count_substring(arr, "ab") expected = pa.array([1, 1, 2, 0, 0, None], type=offset) assert expected == result - result = count_substring(arr, "ab", ignore_case=True) + result = pc.count_substring(arr, "ab", ignore_case=True) expected = pa.array([1, 1, 2, 0, 1, None], type=offset) assert expected == result @@ -610,11 +579,11 @@ def test_count_substring_regex(): (pa.large_string(), pa.int64())]: arr = pa.array(["ab", "cab", "baAacaa", "ba", "AB", None], type=ty) - result = count_substring_regex(arr, "a+") + result = pc.count_substring_regex(arr, "a+") expected = pa.array([1, 1, 3, 1, 0, None], type=offset) assert expected.equals(result) - result = count_substring_regex(arr, "a+", ignore_case=True) + result = pc.count_substring_regex(arr, "a+", ignore_case=True) expected = pa.array([1, 1, 2, 1, 1, None], type=offset) assert expected.equals(result) @@ -622,61 +591,61 @@ def test_count_substring_regex(): def test_find_substring(): for ty in [pa.string(), pa.binary(), pa.large_string(), pa.large_binary()]: arr = pa.array(["ab", "cab", "ba", None], type=ty) - result = find_substring(arr, "ab") + result = pc.find_substring(arr, "ab") assert result.to_pylist() == [0, 1, -1, None] - result = find_substring_regex(arr, "a?b") + result = pc.find_substring_regex(arr, "a?b") assert result.to_pylist() == [0, 1, 0, None] arr = pa.array(["ab*", "cAB*", "ba", "aB?"], type=ty) - result = find_substring(arr, "aB*", ignore_case=True) + result = pc.find_substring(arr, "aB*", ignore_case=True) assert result.to_pylist() == [0, 1, -1, -1] - result = find_substring_regex(arr, "a?b", ignore_case=True) + result = pc.find_substring_regex(arr, "a?b", ignore_case=True) assert result.to_pylist() == [0, 1, 0, 0] def test_match_like(): arr = pa.array(["ab", "ba%", "ba", "ca%d", None]) - result = match_like(arr, r"_a\%%") + result = pc.match_like(arr, 
r"_a\%%") expected = pa.array([False, True, False, True, None]) assert expected.equals(result) arr = pa.array(["aB", "bA%", "ba", "ca%d", None]) - result = match_like(arr, r"_a\%%", ignore_case=True) + result = pc.match_like(arr, r"_a\%%", ignore_case=True) expected = pa.array([False, True, False, True, None]) assert expected.equals(result) - result = match_like(arr, r"_a\%%", ignore_case=False) + result = pc.match_like(arr, r"_a\%%", ignore_case=False) expected = pa.array([False, False, False, True, None]) assert expected.equals(result) def test_match_substring(): arr = pa.array(["ab", "abc", "ba", None]) - result = match_substring(arr, "ab") + result = pc.match_substring(arr, "ab") expected = pa.array([True, True, False, None]) assert expected.equals(result) arr = pa.array(["áB", "Ábc", "ba", None]) - result = match_substring(arr, "áb", ignore_case=True) + result = pc.match_substring(arr, "áb", ignore_case=True) expected = pa.array([True, True, False, None]) assert expected.equals(result) - result = match_substring(arr, "áb", ignore_case=False) + result = pc.match_substring(arr, "áb", ignore_case=False) expected = pa.array([False, False, False, None]) assert expected.equals(result) def test_match_substring_regex(): arr = pa.array(["ab", "abc", "ba", "c", None]) - result = match_substring_regex(arr, "^a?b") + result = pc.match_substring_regex(arr, "^a?b") expected = pa.array([True, True, True, False, None]) assert expected.equals(result) arr = pa.array(["aB", "Abc", "BA", "c", None]) - result = match_substring_regex(arr, "^a?b", ignore_case=True) + result = pc.match_substring_regex(arr, "^a?b", ignore_case=True) expected = pa.array([True, True, True, False, None]) assert expected.equals(result) - result = match_substring_regex(arr, "^a?b", ignore_case=False) + result = pc.match_substring_regex(arr, "^a?b", ignore_case=False) expected = pa.array([False, False, False, False, None]) assert expected.equals(result) @@ -684,21 +653,21 @@ def test_match_substring_regex(): def test_trim(): # \u3000 is unicode whitespace arr = pa.array([" foo", None, " \u3000foo bar \t"]) - result = utf8_trim_whitespace(arr) + result = pc.utf8_trim_whitespace(arr) expected = pa.array(["foo", None, "foo bar"]) assert expected.equals(result) arr = pa.array([" foo", None, " \u3000foo bar \t"]) - result = ascii_trim_whitespace(arr) + result = pc.ascii_trim_whitespace(arr) expected = pa.array(["foo", None, "\u3000foo bar"]) assert expected.equals(result) arr = pa.array([" foo", None, " \u3000foo bar \t"]) - result = utf8_trim(arr, characters=' f\u3000') + result = pc.utf8_trim(arr, characters=' f\u3000') expected = pa.array(["oo", None, "oo bar \t"]) assert expected.equals(result) # Positional option - result = utf8_trim(arr, ' f\u3000') + result = pc.utf8_trim(arr, ' f\u3000') expected = pa.array(["oo", None, "oo bar \t"]) assert expected.equals(result) @@ -710,12 +679,12 @@ def test_slice_compatibility(): for step in [-3, -2, -1, 1, 2, 3]: expected = pa.array([k.as_py()[start:stop:step] for k in arr]) - result = utf8_slice_codeunits( + result = pc.utf8_slice_codeunits( arr, start=start, stop=stop, step=step) assert expected.equals(result) # Positional options - assert utf8_slice_codeunits(arr, - start, stop, step) == result + assert pc.utf8_slice_codeunits(arr, + start, stop, step) == result def test_binary_slice_compatibility(): @@ -728,113 +697,113 @@ def test_binary_slice_compatibility(): continue expected = pa.array([k.as_py()[start:stop:step] for k in arr]) - result = binary_slice( + result = pc.binary_slice( 
arr, start=start, stop=stop, step=step) assert expected.equals(result) # Positional options - assert binary_slice(arr, start, stop, step) == result + assert pc.binary_slice(arr, start, stop, step) == result # Fixed size binary input / output for item in data: fsb_scalar = pa.scalar(item, type=pa.binary(len(item))) expected = item[start:stop:step] - actual = binary_slice(fsb_scalar, start, stop, step) + actual = pc.binary_slice(fsb_scalar, start, stop, step) assert actual.type == pa.binary(len(expected)) assert actual.as_py() == expected def test_split_pattern(): arr = pa.array(["-foo---bar--", "---foo---b"]) - result = split_pattern(arr, pattern="---") + result = pc.split_pattern(arr, pattern="---") expected = pa.array([["-foo", "bar--"], ["", "foo", "b"]]) assert expected.equals(result) - result = split_pattern(arr, "---", max_splits=1) + result = pc.split_pattern(arr, "---", max_splits=1) expected = pa.array([["-foo", "bar--"], ["", "foo---b"]]) assert expected.equals(result) - result = split_pattern(arr, "---", max_splits=1, reverse=True) + result = pc.split_pattern(arr, "---", max_splits=1, reverse=True) expected = pa.array([["-foo", "bar--"], ["---foo", "b"]]) assert expected.equals(result) def test_split_whitespace_utf8(): arr = pa.array(["foo bar", " foo \u3000\tb"]) - result = utf8_split_whitespace(arr) + result = pc.utf8_split_whitespace(arr) expected = pa.array([["foo", "bar"], ["", "foo", "b"]]) assert expected.equals(result) - result = utf8_split_whitespace(arr, max_splits=1) + result = pc.utf8_split_whitespace(arr, max_splits=1) expected = pa.array([["foo", "bar"], ["", "foo \u3000\tb"]]) assert expected.equals(result) - result = utf8_split_whitespace(arr, max_splits=1, reverse=True) + result = pc.utf8_split_whitespace(arr, max_splits=1, reverse=True) expected = pa.array([["foo", "bar"], [" foo", "b"]]) assert expected.equals(result) def test_split_whitespace_ascii(): arr = pa.array(["foo bar", " foo \u3000\tb"]) - result = ascii_split_whitespace(arr) + result = pc.ascii_split_whitespace(arr) expected = pa.array([["foo", "bar"], ["", "foo", "\u3000", "b"]]) assert expected.equals(result) - result = ascii_split_whitespace(arr, max_splits=1) + result = pc.ascii_split_whitespace(arr, max_splits=1) expected = pa.array([["foo", "bar"], ["", "foo \u3000\tb"]]) assert expected.equals(result) - result = ascii_split_whitespace(arr, max_splits=1, reverse=True) + result = pc.ascii_split_whitespace(arr, max_splits=1, reverse=True) expected = pa.array([["foo", "bar"], [" foo \u3000", "b"]]) assert expected.equals(result) def test_split_pattern_regex(): arr = pa.array(["-foo---bar--", "---foo---b"]) - result = split_pattern_regex(arr, pattern="-+") + result = pc.split_pattern_regex(arr, pattern="-+") expected = pa.array([["", "foo", "bar", ""], ["", "foo", "b"]]) assert expected.equals(result) - result = split_pattern_regex(arr, "-+", max_splits=1) + result = pc.split_pattern_regex(arr, "-+", max_splits=1) expected = pa.array([["", "foo---bar--"], ["", "foo---b"]]) assert expected.equals(result) with pytest.raises(NotImplementedError, match="Cannot split in reverse with regex"): - result = split_pattern_regex( + result = pc.split_pattern_regex( arr, pattern="---", max_splits=1, reverse=True) def test_min_max(): # An example generated function wrapper with possible options data = [4, 5, 6, None, 1] - s = min_max(data) + s = pc.min_max(data) assert s.as_py() == {'min': 1, 'max': 6} - s = min_max(data, options=pc.ScalarAggregateOptions()) + s = pc.min_max(data, 
options=pc.ScalarAggregateOptions()) assert s.as_py() == {'min': 1, 'max': 6} - s = min_max(data, options=pc.ScalarAggregateOptions(skip_nulls=True)) + s = pc.min_max(data, options=pc.ScalarAggregateOptions(skip_nulls=True)) assert s.as_py() == {'min': 1, 'max': 6} - s = min_max(data, options=pc.ScalarAggregateOptions(skip_nulls=False)) + s = pc.min_max(data, options=pc.ScalarAggregateOptions(skip_nulls=False)) assert s.as_py() == {'min': None, 'max': None} # Options as dict of kwargs - s = min_max(data, options={'skip_nulls': False}) + s = pc.min_max(data, options={'skip_nulls': False}) assert s.as_py() == {'min': None, 'max': None} # Options as named functions arguments - s = min_max(data, skip_nulls=False) + s = pc.min_max(data, skip_nulls=False) assert s.as_py() == {'min': None, 'max': None} # Both options and named arguments with pytest.raises(TypeError): - s = min_max( + s = pc.min_max( data, options=pc.ScalarAggregateOptions(), skip_nulls=False) # Wrong options type options = pc.TakeOptions() with pytest.raises(TypeError): - s = min_max(data, options=options) + s = pc.min_max(data, options=options) # Missing argument with pytest.raises(TypeError, match="min_max takes 1 positional"): - s = min_max() + s = pc.min_max() def test_any(): @@ -843,17 +812,17 @@ def test_any(): options = pc.ScalarAggregateOptions(skip_nulls=False, min_count=0) a = pa.array([], type='bool') - assert pc_any(a).as_py() is None - assert pc_any(a, min_count=0).as_py() is False - assert pc_any(a, options=options).as_py() is False + assert pc.any(a).as_py() is None + assert pc.any(a, min_count=0).as_py() is False + assert pc.any(a, options=options).as_py() is False a = pa.array([False, None, True]) - assert pc_any(a).as_py() is True - assert pc_any(a, options=options).as_py() is True + assert pc.any(a).as_py() is True + assert pc.any(a, options=options).as_py() is True a = pa.array([False, None, False]) - assert pc_any(a).as_py() is False - assert pc_any(a, options=options).as_py() is None + assert pc.any(a).as_py() is False + assert pc.any(a, options=options).as_py() is None def test_all(): @@ -862,39 +831,39 @@ def test_all(): options = pc.ScalarAggregateOptions(skip_nulls=False, min_count=0) a = pa.array([], type='bool') - assert pc_all(a).as_py() is None - assert pc_all(a, min_count=0).as_py() is True - assert pc_all(a, options=options).as_py() is True + assert pc.all(a).as_py() is None + assert pc.all(a, min_count=0).as_py() is True + assert pc.all(a, options=options).as_py() is True a = pa.array([False, True]) - assert pc_all(a).as_py() is False - assert pc_all(a, options=options).as_py() is False + assert pc.all(a).as_py() is False + assert pc.all(a, options=options).as_py() is False a = pa.array([True, None]) - assert pc_all(a).as_py() is True - assert pc_all(a, options=options).as_py() is None + assert pc.all(a).as_py() is True + assert pc.all(a, options=options).as_py() is None a = pa.chunked_array([[True], [True, None]]) - assert pc_all(a).as_py() is True - assert pc_all(a, options=options).as_py() is None + assert pc.all(a).as_py() is True + assert pc.all(a, options=options).as_py() is None a = pa.chunked_array([[True], [False]]) - assert pc_all(a).as_py() is False - assert pc_all(a, options=options).as_py() is False + assert pc.all(a).as_py() is False + assert pc.all(a, options=options).as_py() is False def test_is_valid(): # An example generated function wrapper without options data = [4, 5, None] - assert is_valid(data).to_pylist() == [True, True, False] + assert pc.is_valid(data).to_pylist() == 
[True, True, False] with pytest.raises(TypeError): - is_valid(data, options=None) + pc.is_valid(data, options=None) def test_generated_docstrings(): # With options - assert min_max.__doc__ == textwrap.dedent("""\ + assert pc.min_max.__doc__ == textwrap.dedent("""\ Compute the minimum and maximum values of a numeric array. Null values are ignored by default. @@ -916,7 +885,7 @@ def test_generated_docstrings(): If not passed, will allocate memory from the default memory pool. """) # Without options - assert pc_add.__doc__ == textwrap.dedent("""\ + assert pc.add.__doc__ == textwrap.dedent("""\ Add the arguments element-wise. Results will wrap around on integer overflow. @@ -933,7 +902,7 @@ def test_generated_docstrings(): If not passed, will allocate memory from the default memory pool. """) # Varargs with options - assert min_element_wise.__doc__ == textwrap.dedent("""\ + assert pc.min_element_wise.__doc__ == textwrap.dedent("""\ Find the element-wise minimum value. Nulls are ignored (by default) or propagated. @@ -951,7 +920,7 @@ def test_generated_docstrings(): memory_pool : pyarrow.MemoryPool, optional If not passed, will allocate memory from the default memory pool. """) - assert filter.__doc__ == textwrap.dedent("""\ + assert pc.filter.__doc__ == textwrap.dedent("""\ Filter with a boolean selection filter. The output is populated with values from the input at positions @@ -998,24 +967,24 @@ def test_generated_signatures(): # options and their default values. # Without options - sig = inspect.signature(pc_add) + sig = inspect.signature(pc.add) assert str(sig) == "(x, y, /, *, memory_pool=None)" # With options - sig = inspect.signature(min_max) + sig = inspect.signature(pc.min_max) assert str(sig) == ("(array, /, *, skip_nulls=True, min_count=1, " "options=None, memory_pool=None)") # With positional options - sig = inspect.signature(quantile) + sig = inspect.signature(pc.quantile) assert str(sig) == ("(array, /, q=0.5, *, interpolation='linear', " "skip_nulls=True, min_count=0, " "options=None, memory_pool=None)") # Varargs with options - sig = inspect.signature(binary_join_element_wise) + sig = inspect.signature(pc.binary_join_element_wise) assert str(sig) == ("(*strings, null_handling='emit_null', " "null_replacement='', options=None, " "memory_pool=None)") # Varargs without options - sig = inspect.signature(choose) + sig = inspect.signature(pc.choose) assert str(sig) == "(indices, /, *values, memory_pool=None)" # Nullary with options sig = inspect.signature(pc.random) @@ -1032,7 +1001,7 @@ def find_new_unicode_codepoints(): new = set() characters = [chr(c) for c in range(0x80, 0x11000) if not (0xD800 <= c < 0xE000)] - is_printable = utf8_is_printable(pa.array(characters)).to_pylist() + is_printable = pc.utf8_is_printable(pa.array(characters)).to_pylist() for i, c in enumerate(characters): if is_printable[i] != c.isprintable(): new.add(ord(c)) @@ -1152,20 +1121,20 @@ def test_string_py_compat_boolean(function_name, variant): def test_pad(): arr = pa.array([None, 'a', 'abcd']) - assert ascii_center(arr, width=3).tolist() == [None, ' a ', 'abcd'] - assert ascii_lpad(arr, width=3).tolist() == [None, ' a', 'abcd'] - assert ascii_rpad(arr, width=3).tolist() == [None, 'a ', 'abcd'] - assert ascii_center(arr, 3).tolist() == [None, ' a ', 'abcd'] - assert ascii_lpad(arr, 3).tolist() == [None, ' a', 'abcd'] - assert ascii_rpad(arr, 3).tolist() == [None, 'a ', 'abcd'] + assert pc.ascii_center(arr, width=3).tolist() == [None, ' a ', 'abcd'] + assert pc.ascii_lpad(arr, width=3).tolist() == [None, ' 
a', 'abcd']
+    assert pc.ascii_rpad(arr, width=3).tolist() == [None, 'a  ', 'abcd']
+    assert pc.ascii_center(arr, 3).tolist() == [None, ' a ', 'abcd']
+    assert pc.ascii_lpad(arr, 3).tolist() == [None, '  a', 'abcd']
+    assert pc.ascii_rpad(arr, 3).tolist() == [None, 'a  ', 'abcd']

     arr = pa.array([None, 'á', 'abcd'])
-    assert utf8_center(arr, width=3).tolist() == [None, ' á ', 'abcd']
-    assert utf8_lpad(arr, width=3).tolist() == [None, '  á', 'abcd']
-    assert utf8_rpad(arr, width=3).tolist() == [None, 'á  ', 'abcd']
-    assert utf8_center(arr, 3).tolist() == [None, ' á ', 'abcd']
-    assert utf8_lpad(arr, 3).tolist() == [None, '  á', 'abcd']
-    assert utf8_rpad(arr, 3).tolist() == [None, 'á  ', 'abcd']
+    assert pc.utf8_center(arr, width=3).tolist() == [None, ' á ', 'abcd']
+    assert pc.utf8_lpad(arr, width=3).tolist() == [None, '  á', 'abcd']
+    assert pc.utf8_rpad(arr, width=3).tolist() == [None, 'á  ', 'abcd']
+    assert pc.utf8_center(arr, 3).tolist() == [None, ' á ', 'abcd']
+    assert pc.utf8_lpad(arr, 3).tolist() == [None, '  á', 'abcd']
+    assert pc.utf8_rpad(arr, 3).tolist() == [None, 'á  ', 'abcd']


 def test_utf8_zfill():
@@ -1208,53 +1177,53 @@ def test_replace_slice():
     for start in offsets:
         for stop in offsets:
             expected = series.str.slice_replace(start, stop, 'XX')
-            actual = binary_replace_slice(
+            actual = pc.binary_replace_slice(
                 arr, start=start, stop=stop, replacement='XX')
             assert actual.tolist() == expected.tolist()
             # Positional options
-            assert binary_replace_slice(arr, start, stop, 'XX') == actual
+            assert pc.binary_replace_slice(arr, start, stop, 'XX') == actual

     arr = pa.array([None, '', 'π', 'πb', 'πbθ', 'πbθd', 'πbθde'])
     series = arr.to_pandas().astype(object).replace({np.nan: None})
     for start in offsets:
         for stop in offsets:
             expected = series.str.slice_replace(start, stop, 'XX')
-            actual = utf8_replace_slice(
+            actual = pc.utf8_replace_slice(
                 arr, start=start, stop=stop, replacement='XX')
             assert actual.tolist() == expected.tolist()


 def test_replace_plain():
     data = pa.array(['foozfoo', 'food', None])
-    ar = replace_substring(data, pattern='foo', replacement='bar')
+    ar = pc.replace_substring(data, pattern='foo', replacement='bar')
     assert ar.tolist() == ['barzbar', 'bard', None]
-    ar = replace_substring(data, 'foo', 'bar')
+    ar = pc.replace_substring(data, 'foo', 'bar')
     assert ar.tolist() == ['barzbar', 'bard', None]

-    ar = replace_substring(data, pattern='foo', replacement='bar',
-                           max_replacements=1)
+    ar = pc.replace_substring(data, pattern='foo', replacement='bar',
+                              max_replacements=1)
     assert ar.tolist() == ['barzfoo', 'bard', None]
-    ar = replace_substring(data, 'foo', 'bar', max_replacements=1)
+    ar = pc.replace_substring(data, 'foo', 'bar', max_replacements=1)
     assert ar.tolist() == ['barzfoo', 'bard', None]


 def test_replace_regex():
     data = pa.array(['foo', 'mood', None])
     expected = ['f00', 'm00d', None]
-    ar = replace_substring_regex(data, pattern='(.)oo', replacement=r'\100')
+    ar = pc.replace_substring_regex(data, pattern='(.)oo', replacement=r'\100')
     assert ar.tolist() == expected
-    ar = replace_substring_regex(data, '(.)oo', replacement=r'\100')
+    ar = pc.replace_substring_regex(data, '(.)oo', replacement=r'\100')
     assert ar.tolist() == expected
-    ar = replace_substring_regex(data, '(.)oo', r'\100')
+    ar = pc.replace_substring_regex(data, '(.)oo', r'\100')
     assert ar.tolist() == expected


 def test_extract_regex():
     ar = pa.array(['a1', 'zb2z'])
     expected = [{'letter': 'a', 'digit': '1'}, {'letter': 'b', 'digit': '2'}]
-    struct = extract_regex(ar, pattern=r'(?P<letter>[ab])(?P<digit>\d)')
+    struct = 
pc.extract_regex(ar, pattern=r'(?P<letter>[ab])(?P<digit>\d)')
     assert struct.tolist() == expected
-    struct = extract_regex(ar, r'(?P<letter>[ab])(?P<digit>\d)')
+    struct = pc.extract_regex(ar, r'(?P<letter>[ab])(?P<digit>\d)')
     assert struct.tolist() == expected
@@ -1262,50 +1231,50 @@ def test_extract_regex_span():
     ar = pa.array(['a1', 'zb234z'])
     expected = [{'letter': [0, 1], 'digit': [1, 1]},
                 {'letter': [1, 1], 'digit': [2, 3]}]
-    struct = extract_regex_span(ar, pattern=r'(?P<letter>[ab])(?P<digit>\d+)')
+    struct = pc.extract_regex_span(ar, pattern=r'(?P<letter>[ab])(?P<digit>\d+)')
     assert struct.tolist() == expected
-    struct = extract_regex_span(ar, r'(?P<letter>[ab])(?P<digit>\d+)')
+    struct = pc.extract_regex_span(ar, r'(?P<letter>[ab])(?P<digit>\d+)')
     assert struct.tolist() == expected


 def test_binary_join():
     ar_list = pa.array([['foo', 'bar'], None, []])
     expected = pa.array(['foo-bar', None, ''])
-    assert binary_join(ar_list, '-').equals(expected)
+    assert pc.binary_join(ar_list, '-').equals(expected)

     separator_array = pa.array(['1', '2'], type=pa.binary())
     expected = pa.array(['a1b', 'c2d'], type=pa.binary())
     ar_list = pa.array([['a', 'b'], ['c', 'd']], type=pa.list_(pa.binary()))
-    assert binary_join(ar_list, separator_array).equals(expected)
+    assert pc.binary_join(ar_list, separator_array).equals(expected)


 def test_binary_join_element_wise():
     null = pa.scalar(None, type=pa.string())
     arrs = [[None, 'a', 'b'], ['c', None, 'd'], [None, '-', '--']]
-    assert binary_join_element_wise(*arrs).to_pylist() == \
+    assert pc.binary_join_element_wise(*arrs).to_pylist() == \
         [None, None, 'b--d']
-    assert binary_join_element_wise('a', 'b', '-').as_py() == 'a-b'
-    assert binary_join_element_wise('a', null, '-').as_py() is None
-    assert binary_join_element_wise('a', 'b', null).as_py() is None
+    assert pc.binary_join_element_wise('a', 'b', '-').as_py() == 'a-b'
+    assert pc.binary_join_element_wise('a', null, '-').as_py() is None
+    assert pc.binary_join_element_wise('a', 'b', null).as_py() is None

     skip = pc.JoinOptions(null_handling='skip')
-    assert binary_join_element_wise(*arrs, options=skip).to_pylist() == \
+    assert pc.binary_join_element_wise(*arrs, options=skip).to_pylist() == \
         [None, 'a', 'b--d']
-    assert binary_join_element_wise(
+    assert pc.binary_join_element_wise(
         'a', 'b', '-', options=skip).as_py() == 'a-b'
-    assert binary_join_element_wise(
+    assert pc.binary_join_element_wise(
         'a', null, '-', options=skip).as_py() == 'a'
-    assert binary_join_element_wise(
+    assert pc.binary_join_element_wise(
         'a', 'b', null, options=skip).as_py() is None

     replace = pc.JoinOptions(null_handling='replace', null_replacement='spam')
-    assert binary_join_element_wise(*arrs, options=replace).to_pylist() == \
+    assert pc.binary_join_element_wise(*arrs, options=replace).to_pylist() == \
         [None, 'a-spam', 'b--d']
-    assert binary_join_element_wise(
+    assert pc.binary_join_element_wise(
         'a', 'b', '-', options=replace).as_py() == 'a-b'
-    assert binary_join_element_wise(
+    assert pc.binary_join_element_wise(
         'a', null, '-', options=replace).as_py() == 'a-spam'
-    assert binary_join_element_wise(
+    assert pc.binary_join_element_wise(
         'a', 'b', null, options=replace).as_py() is None
@@ -1633,22 +1602,22 @@ def con(values):
     arr1 = con([1, 2, 3, 4, None])
     arr2 = con([1, 1, 4, None, 4])

-    result = equal(arr1, arr2)
+    result = pc.equal(arr1, arr2)
     assert result.equals(con([True, False, False, None, None]))

-    result = not_equal(arr1, arr2)
+    result = pc.not_equal(arr1, arr2)
     assert result.equals(con([False, True, True, None, None]))

-    result = less(arr1, arr2)
+    result = pc.less(arr1, arr2)
     assert result.equals(con([False, 
False, True, None, None])) - result = less_equal(arr1, arr2) + result = pc.less_equal(arr1, arr2) assert result.equals(con([True, False, True, None, None])) - result = greater(arr1, arr2) + result = pc.greater(arr1, arr2) assert result.equals(con([False, True, False, None, None])) - result = greater_equal(arr1, arr2) + result = pc.greater_equal(arr1, arr2) assert result.equals(con([True, True, False, None, None])) @@ -1664,28 +1633,28 @@ def con(values): arr = con(['a', 'b', 'c', None]) scalar = pa.scalar('b') - result = equal(arr, scalar) + result = pc.equal(arr, scalar) assert result.equals(con([False, True, False, None])) if typ == "array": nascalar = pa.scalar(None, type="string") - result = equal(arr, nascalar) - isnull = is_null(result) + result = pc.equal(arr, nascalar) + isnull = pc.is_null(result) assert isnull.equals(con([True, True, True, True])) - result = not_equal(arr, scalar) + result = pc.not_equal(arr, scalar) assert result.equals(con([True, False, True, None])) - result = less(arr, scalar) + result = pc.less(arr, scalar) assert result.equals(con([True, False, False, None])) - result = less_equal(arr, scalar) + result = pc.less_equal(arr, scalar) assert result.equals(con([True, True, False, None])) - result = greater(arr, scalar) + result = pc.greater(arr, scalar) assert result.equals(con([False, False, True, None])) - result = greater_equal(arr, scalar) + result = pc.greater_equal(arr, scalar) assert result.equals(con([False, True, True, None])) @@ -1701,27 +1670,27 @@ def con(values): arr = con([1, 2, 3, None]) scalar = pa.scalar(2) - result = equal(arr, scalar) + result = pc.equal(arr, scalar) assert result.equals(con([False, True, False, None])) if typ == "array": nascalar = pa.scalar(None, type="int64") - result = equal(arr, nascalar) + result = pc.equal(arr, nascalar) assert result.to_pylist() == [None, None, None, None] - result = not_equal(arr, scalar) + result = pc.not_equal(arr, scalar) assert result.equals(con([True, False, True, None])) - result = less(arr, scalar) + result = pc.less(arr, scalar) assert result.equals(con([True, False, False, None])) - result = less_equal(arr, scalar) + result = pc.less_equal(arr, scalar) assert result.equals(con([True, True, False, None])) - result = greater(arr, scalar) + result = pc.greater(arr, scalar) assert result.equals(con([False, False, True, None])) - result = greater_equal(arr, scalar) + result = pc.greater_equal(arr, scalar) assert result.equals(con([False, True, True, None])) @@ -1737,14 +1706,14 @@ def test_compare_chunked_array_mixed(): (arr_chunked, arr), (arr_chunked, arr_chunked2), ]: - result = equal(left, right) + result = pc.equal(left, right) assert result.equals(expected) def test_arithmetic_add(): left = pa.array([1, 2, 3, 4, 5]) right = pa.array([0, -1, 1, 2, 3]) - result = pc_add(left, right) + result = pc.add(left, right) expected = pa.array([1, 1, 4, 6, 8]) assert result.equals(expected) @@ -1752,7 +1721,7 @@ def test_arithmetic_add(): def test_arithmetic_subtract(): left = pa.array([1, 2, 3, 4, 5]) right = pa.array([0, -1, 1, 2, 3]) - result = subtract(left, right) + result = pc.subtract(left, right) expected = pa.array([1, 3, 2, 2, 2]) assert result.equals(expected) @@ -1760,7 +1729,7 @@ def test_arithmetic_subtract(): def test_arithmetic_multiply(): left = pa.array([1, 2, 3, 4, 5]) right = pa.array([0, -1, 1, 2, 3]) - result = multiply(left, right) + result = pc.multiply(left, right) expected = pa.array([0, -2, 3, 8, 15]) assert result.equals(expected) @@ -1768,10 +1737,10 @@ def 
test_arithmetic_multiply(): @pytest.mark.parametrize("ty", ["round", "round_to_multiple"]) def test_round_to_integer(ty): if ty == "round": - round_func = pc_round + round = pc.round RoundOptions = partial(pc.RoundOptions, ndigits=0) elif ty == "round_to_multiple": - round_func = round_to_multiple + round = pc.round_to_multiple RoundOptions = partial(pc.RoundToMultipleOptions, multiple=1) values = [3.2, 3.5, 3.7, 4.5, -3.2, -3.5, -3.7, None] @@ -1789,7 +1758,7 @@ def test_round_to_integer(ty): } for round_mode, expected in rmode_and_expected.items(): options = RoundOptions(round_mode=round_mode) - result = round_func(values, options=options) + result = round(values, options=options) expected_array = pa.array(expected, type=pa.float64()) assert expected_array.equals(result) @@ -1806,11 +1775,11 @@ def test_round(): } for ndigits, expected in ndigits_and_expected.items(): options = pc.RoundOptions(ndigits, "half_towards_infinity") - result = pc_round(values, options=options) + result = pc.round(values, options=options) np.testing.assert_allclose(result, pa.array(expected), equal_nan=True) - assert pc_round(values, ndigits, + assert pc.round(values, ndigits, round_mode="half_towards_infinity") == result - assert pc_round(values, ndigits, "half_towards_infinity") == result + assert pc.round(values, ndigits, "half_towards_infinity") == result @pytest.mark.numpy @@ -1826,19 +1795,19 @@ def test_round_to_multiple(): } for multiple, expected in multiple_and_expected.items(): options = pc.RoundToMultipleOptions(multiple, "half_towards_infinity") - result = round_to_multiple(values, options=options) + result = pc.round_to_multiple(values, options=options) np.testing.assert_allclose(result, pa.array(expected), equal_nan=True) - assert round_to_multiple(values, multiple, - "half_towards_infinity") == result + assert pc.round_to_multiple(values, multiple, + "half_towards_infinity") == result for multiple in [0, -2, pa.scalar(-10.4)]: with pytest.raises(pa.ArrowInvalid, match="Rounding multiple must be positive"): - round_to_multiple(values, multiple=multiple) + pc.round_to_multiple(values, multiple=multiple) for multiple in [object, 99999999999999999999999]: with pytest.raises(TypeError, match="is not a valid multiple type"): - round_to_multiple(values, multiple=multiple) + pc.round_to_multiple(values, multiple=multiple) def test_round_binary(): @@ -1846,15 +1815,15 @@ def test_round_binary(): scales = pa.array([-3, -2, -1, 0, 1, 2, 3], pa.int32()) expected = pa.array( [0, 200, 350, 457, 123.5, 234.57, 345.678], pa.float64()) - assert round_binary(values, scales) == expected + assert pc.round_binary(values, scales) == expected expect_zero = pa.scalar(0, pa.float64()) expect_inf = pa.scalar(10, pa.float64()) scale = pa.scalar(-1, pa.int32()) - assert round_binary( + assert pc.round_binary( 5.0, scale, round_mode="half_towards_zero") == expect_zero - assert round_binary( + assert pc.round_binary( 5.0, scale, round_mode="half_towards_infinity") == expect_inf @@ -1863,11 +1832,11 @@ def test_is_null(): result = arr.is_null() expected = pa.array([False, False, False, True]) assert result.equals(expected) - assert result.equals(is_null(arr)) + assert result.equals(pc.is_null(arr)) result = arr.is_valid() expected = pa.array([True, True, True, False]) assert result.equals(expected) - assert result.equals(is_valid(arr)) + assert result.equals(pc.is_valid(arr)) arr = pa.chunked_array([[1, 2], [3, None]]) result = arr.is_null() @@ -1987,27 +1956,27 @@ def test_logical(): a = pa.array([True, False, False, None]) 
b = pa.array([True, True, False, True]) - assert and_(a, b) == pa.array([True, False, False, None]) - assert and_kleene(a, b) == pa.array([True, False, False, None]) + assert pc.and_(a, b) == pa.array([True, False, False, None]) + assert pc.and_kleene(a, b) == pa.array([True, False, False, None]) - assert or_(a, b) == pa.array([True, True, False, None]) - assert or_kleene(a, b) == pa.array([True, True, False, True]) + assert pc.or_(a, b) == pa.array([True, True, False, None]) + assert pc.or_kleene(a, b) == pa.array([True, True, False, True]) - assert xor(a, b) == pa.array([False, True, False, None]) + assert pc.xor(a, b) == pa.array([False, True, False, None]) - assert invert(a) == pa.array([False, True, True, None]) + assert pc.invert(a) == pa.array([False, True, True, None]) def test_dictionary_decode(): array = pa.array(["a", "a", "b", "c", "b"]) dictionary_array = array.dictionary_encode() - dictionary_array_decode = dictionary_decode(dictionary_array) + dictionary_array_decode = pc.dictionary_decode(dictionary_array) assert array != dictionary_array assert array == dictionary_array_decode - assert array == dictionary_decode(array) - assert dictionary_encode(dictionary_array) == dictionary_array + assert array == pc.dictionary_decode(array) + assert pc.dictionary_encode(dictionary_array) == dictionary_array def test_cast(): @@ -2084,7 +2053,7 @@ def test_fsl_to_fsl_cast(value_type): # Different sized FSL cast_type = pa.list_(pa.field("element", value_type), 3) err_msg = 'Size of FixedSizeList is not the same.' - with pytest.raises(lib.ArrowTypeError, match=err_msg): + with pytest.raises(pa.lib.ArrowTypeError, match=err_msg): fsl.cast(cast_type) @@ -2282,28 +2251,28 @@ def test_cast_float_to_decimal_random(float_ty, decimal_traits): def test_strptime(): arr = pa.array(["5/1/2020", None, "12/13/1900"]) - got = strptime(arr, format='%m/%d/%Y', unit='s') + got = pc.strptime(arr, format='%m/%d/%Y', unit='s') expected = pa.array( [datetime.datetime(2020, 5, 1), None, datetime.datetime(1900, 12, 13)], type=pa.timestamp('s')) assert got == expected # Positional format - assert strptime(arr, '%m/%d/%Y', unit='s') == got + assert pc.strptime(arr, '%m/%d/%Y', unit='s') == got expected = pa.array([datetime.datetime(2020, 1, 5), None, None], type=pa.timestamp('s')) - got = strptime(arr, format='%d/%m/%Y', unit='s', error_is_null=True) + got = pc.strptime(arr, format='%d/%m/%Y', unit='s', error_is_null=True) assert got == expected with pytest.raises(pa.ArrowInvalid, match="Failed to parse string: '5/1/2020'"): - strptime(arr, format='%Y-%m-%d', unit='s', error_is_null=False) + pc.strptime(arr, format='%Y-%m-%d', unit='s', error_is_null=False) with pytest.raises(pa.ArrowInvalid, match="Failed to parse string: '5/1/2020'"): - strptime(arr, format='%Y-%m-%d', unit='s') + pc.strptime(arr, format='%Y-%m-%d', unit='s') - got = strptime(arr, format='%Y-%m-%d', unit='s', error_is_null=True) + got = pc.strptime(arr, format='%Y-%m-%d', unit='s', error_is_null=True) assert got == pa.array([None, None, None], type=pa.timestamp('s')) @@ -2325,7 +2294,7 @@ def test_strftime(): tsa = pa.array(ts, type=pa.timestamp(unit, timezone)) for fmt in formats: options = pc.StrftimeOptions(fmt) - result = strftime(tsa, options=options) + result = pc.strftime(tsa, options=options) # cast to the same type as result to ignore string vs large_string expected = pa.array(ts.strftime(fmt)).cast(result.type) assert result.equals(expected) @@ -2334,34 +2303,34 @@ def test_strftime(): # Default format tsa = pa.array(ts, 
type=pa.timestamp("s", timezone)) - result = strftime(tsa, options=pc.StrftimeOptions()) + result = pc.strftime(tsa, options=pc.StrftimeOptions()) expected = pa.array(ts.strftime(fmt)).cast(result.type) assert result.equals(expected) # Default format plus timezone tsa = pa.array(ts, type=pa.timestamp("s", timezone)) - result = strftime(tsa, options=pc.StrftimeOptions(fmt + "%Z")) + result = pc.strftime(tsa, options=pc.StrftimeOptions(fmt + "%Z")) expected = pa.array(ts.strftime(fmt + "%Z")).cast(result.type) assert result.equals(expected) # Pandas %S is equivalent to %S in arrow for unit="s" tsa = pa.array(ts, type=pa.timestamp("s", timezone)) options = pc.StrftimeOptions("%S") - result = strftime(tsa, options=options) + result = pc.strftime(tsa, options=options) expected = pa.array(ts.strftime("%S")).cast(result.type) assert result.equals(expected) # Pandas %S.%f is equivalent to %S in arrow for unit="us" tsa = pa.array(ts, type=pa.timestamp("us", timezone)) options = pc.StrftimeOptions("%S") - result = strftime(tsa, options=options) + result = pc.strftime(tsa, options=options) expected = pa.array(ts.strftime("%S.%f")).cast(result.type) assert result.equals(expected) # Test setting locale tsa = pa.array(ts, type=pa.timestamp("s", timezone)) options = pc.StrftimeOptions(fmt, locale="C") - result = strftime(tsa, options=options) + result = pc.strftime(tsa, options=options) expected = pa.array(ts.strftime(fmt)).cast(result.type) assert result.equals(expected) @@ -2369,19 +2338,19 @@ def test_strftime(): fmt = "%Y-%m-%dT%H:%M:%S" ts = pd.to_datetime(times) tsa = pa.array(ts, type=pa.timestamp("s")) - result = strftime(tsa, options=pc.StrftimeOptions(fmt)) + result = pc.strftime(tsa, options=pc.StrftimeOptions(fmt)) expected = pa.array(ts.strftime(fmt)).cast(result.type) # Positional format - assert strftime(tsa, fmt) == result + assert pc.strftime(tsa, fmt) == result assert result.equals(expected) with pytest.raises(pa.ArrowInvalid, match="Timezone not present, cannot convert to string"): - strftime(tsa, options=pc.StrftimeOptions(fmt + "%Z")) + pc.strftime(tsa, options=pc.StrftimeOptions(fmt + "%Z")) with pytest.raises(pa.ArrowInvalid, match="Timezone not present, cannot convert to string"): - strftime(tsa, options=pc.StrftimeOptions(fmt + "%z")) + pc.strftime(tsa, options=pc.StrftimeOptions(fmt + "%z")) def _check_datetime_components(timestamps, timezone=None): @@ -2429,42 +2398,42 @@ def _check_datetime_components(timestamps, timezone=None): microsecond = ts.dt.microsecond.astype("int64") nanosecond = ts.dt.nanosecond.astype("int64") - assert pc_year(tsa).equals(pa.array(year)) - assert pc_is_leap_year(tsa).equals(pa.array(ts.dt.is_leap_year)) - assert pc_month(tsa).equals(pa.array(month)) - assert pc_day(tsa).equals(pa.array(day)) - assert pc_day_of_week(tsa).equals(pa.array(dayofweek)) - assert pc_day_of_year(tsa).equals(pa.array(dayofyear)) - assert pc_iso_year(tsa).equals(pa.array(iso_year)) - assert pc_iso_week(tsa).equals(pa.array(iso_week)) - assert pc_iso_calendar(tsa).equals(iso_calendar) - assert pc_quarter(tsa).equals(pa.array(quarter)) - assert pc_hour(tsa).equals(pa.array(hour)) - assert pc_minute(tsa).equals(pa.array(minute)) - assert pc_second(tsa).equals(pa.array(second)) - assert pc_millisecond(tsa).equals(pa.array(microsecond // 10 ** 3)) - assert pc_microsecond(tsa).equals(pa.array(microsecond % 10 ** 3)) - assert pc_nanosecond(tsa).equals(pa.array(nanosecond)) - assert pc_subsecond(tsa).equals(pa.array(subseconds)) - assert 
pc_local_timestamp(tsa).equals(pa.array(ts.dt.tz_localize(None))) + assert pc.year(tsa).equals(pa.array(year)) + assert pc.is_leap_year(tsa).equals(pa.array(ts.dt.is_leap_year)) + assert pc.month(tsa).equals(pa.array(month)) + assert pc.day(tsa).equals(pa.array(day)) + assert pc.day_of_week(tsa).equals(pa.array(dayofweek)) + assert pc.day_of_year(tsa).equals(pa.array(dayofyear)) + assert pc.iso_year(tsa).equals(pa.array(iso_year)) + assert pc.iso_week(tsa).equals(pa.array(iso_week)) + assert pc.iso_calendar(tsa).equals(iso_calendar) + assert pc.quarter(tsa).equals(pa.array(quarter)) + assert pc.hour(tsa).equals(pa.array(hour)) + assert pc.minute(tsa).equals(pa.array(minute)) + assert pc.second(tsa).equals(pa.array(second)) + assert pc.millisecond(tsa).equals(pa.array(microsecond // 10 ** 3)) + assert pc.microsecond(tsa).equals(pa.array(microsecond % 10 ** 3)) + assert pc.nanosecond(tsa).equals(pa.array(nanosecond)) + assert pc.subsecond(tsa).equals(pa.array(subseconds)) + assert pc.local_timestamp(tsa).equals(pa.array(ts.dt.tz_localize(None))) if ts.dt.tz: if ts.dt.tz is datetime.timezone.utc: # datetime with utc returns None for dst() - arr_is_dst = [False] * len(ts) + is_dst = [False] * len(ts) else: - arr_is_dst = ts.apply(lambda x: x.dst().seconds > 0) - assert pc_is_dst(tsa).equals(pa.array(arr_is_dst)) + is_dst = ts.apply(lambda x: x.dst().seconds > 0) + assert pc.is_dst(tsa).equals(pa.array(is_dst)) day_of_week_options = pc.DayOfWeekOptions( count_from_zero=False, week_start=1) - assert pc_day_of_week(tsa, options=day_of_week_options).equals( + assert pc.day_of_week(tsa, options=day_of_week_options).equals( pa.array(dayofweek + 1)) week_options = pc.WeekOptions( week_starts_monday=True, count_from_zero=False, first_week_is_fully_in_year=False) - assert pc_week(tsa, options=week_options).equals(pa.array(iso_week)) + assert pc.week(tsa, options=week_options).equals(pa.array(iso_week)) @pytest.mark.pandas @@ -2503,7 +2472,7 @@ def test_iso_calendar_longer_array(unit): # https://github.com/apache/arrow/issues/38655 # ensure correct result for array length > 32 arr = pa.array([datetime.datetime(2022, 1, 2, 9)]*50, pa.timestamp(unit)) - result = pc_iso_calendar(arr) + result = pc.iso_calendar(arr) expected = pa.StructArray.from_arrays( [[2021]*50, [52]*50, [7]*50], names=['iso_year', 'iso_week', 'iso_day_of_week'] @@ -2542,18 +2511,18 @@ def test_assume_timezone(): options = pc.AssumeTimezoneOptions(timezone) ta = pa.array(timestamps, type=ts_type) expected = timestamps.tz_localize(timezone) - result = pc_assume_timezone(ta, options=options) + result = pc.assume_timezone(ta, options=options) assert result.equals(pa.array(expected)) - result = pc_assume_timezone(ta, timezone) # Positional option + result = pc.assume_timezone(ta, timezone) # Positional option assert result.equals(pa.array(expected)) ta_zoned = pa.array(timestamps, type=pa.timestamp("ns", timezone)) with pytest.raises(pa.ArrowInvalid, match="already have a timezone:"): - pc_assume_timezone(ta_zoned, options=options) + pc.assume_timezone(ta_zoned, options=options) invalid_options = pc.AssumeTimezoneOptions("Europe/Brusselsss") with pytest.raises(ValueError, match="not found in timezone database"): - pc_assume_timezone(ta, options=invalid_options) + pc.assume_timezone(ta, options=invalid_options) timezone = "Europe/Brussels" @@ -2566,18 +2535,18 @@ def test_assume_timezone(): with pytest.raises(ValueError, match="Timestamp doesn't exist in " f"timezone '{timezone}'"): - pc_assume_timezone(nonexistent_array, + 
pc.assume_timezone(nonexistent_array, options=options_nonexistent_raise) expected = pa.array(nonexistent.tz_localize( timezone, nonexistent="shift_forward")) - result = pc_assume_timezone( + result = pc.assume_timezone( nonexistent_array, options=options_nonexistent_latest) expected.equals(result) expected = pa.array(nonexistent.tz_localize( timezone, nonexistent="shift_backward")) - result = pc_assume_timezone( + result = pc.assume_timezone( nonexistent_array, options=options_nonexistent_earliest) expected.equals(result) @@ -2590,16 +2559,15 @@ def test_assume_timezone(): with pytest.raises(ValueError, match="Timestamp is ambiguous in " f"timezone '{timezone}'"): - pc_assume_timezone(ambiguous_array, options=options_ambiguous_raise) + pc.assume_timezone(ambiguous_array, options=options_ambiguous_raise) - expected = ambiguous.tz_localize(timezone, ambiguous=np.array([True, True, True])) - result = pc_assume_timezone( + expected = ambiguous.tz_localize(timezone, ambiguous=[True, True, True]) + result = pc.assume_timezone( ambiguous_array, options=options_ambiguous_earliest) result.equals(pa.array(expected)) - expected = ambiguous.tz_localize( - timezone, ambiguous=np.array([False, False, False])) - result = pc_assume_timezone( + expected = ambiguous.tz_localize(timezone, ambiguous=[False, False, False]) + result = pc.assume_timezone( ambiguous_array, options=options_ambiguous_latest) result.equals(pa.array(expected)) @@ -2628,15 +2596,15 @@ def _check_temporal_rounding(ts, values, unit): frequency = str(value) + unit_shorthand[unit] options = pc.RoundTemporalOptions(value, unit) - result = ceil_temporal(ta, options=options).to_pandas() + result = pc.ceil_temporal(ta, options=options).to_pandas() expected = ts.dt.ceil(frequency) np.testing.assert_array_equal(result, expected) - result = floor_temporal(ta, options=options).to_pandas() + result = pc.floor_temporal(ta, options=options).to_pandas() expected = ts.dt.floor(frequency) np.testing.assert_array_equal(result, expected) - result = round_temporal(ta, options=options).to_pandas() + result = pc.round_temporal(ta, options=options).to_pandas() expected = ts.dt.round(frequency) np.testing.assert_array_equal(result, expected) @@ -2649,29 +2617,29 @@ def _check_temporal_rounding(ts, values, unit): origin = ts.dt.floor(greater_unit[unit]) if ta.type.tz is None: - result = ceil_temporal(ta, options=options).to_pandas() + result = pc.ceil_temporal(ta, options=options).to_pandas() expected = (ts - origin).dt.ceil(frequency) + origin np.testing.assert_array_equal(result, expected) - result = floor_temporal(ta, options=options).to_pandas() + result = pc.floor_temporal(ta, options=options).to_pandas() expected = (ts - origin).dt.floor(frequency) + origin np.testing.assert_array_equal(result, expected) - result = round_temporal(ta, options=options).to_pandas() + result = pc.round_temporal(ta, options=options).to_pandas() expected = (ts - origin).dt.round(frequency) + origin np.testing.assert_array_equal(result, expected) # Check RoundTemporalOptions partial defaults if unit == "day": - result = ceil_temporal(ta, multiple=value).to_pandas() + result = pc.ceil_temporal(ta, multiple=value).to_pandas() expected = ts.dt.ceil(frequency) np.testing.assert_array_equal(result, expected) - result = floor_temporal(ta, multiple=value).to_pandas() + result = pc.floor_temporal(ta, multiple=value).to_pandas() expected = ts.dt.floor(frequency) np.testing.assert_array_equal(result, expected) - result = round_temporal(ta, multiple=value).to_pandas() + result = 
pc.round_temporal(ta, multiple=value).to_pandas() expected = ts.dt.round(frequency) np.testing.assert_array_equal(result, expected) @@ -2682,7 +2650,7 @@ def _check_temporal_rounding(ts, values, unit): if ta.type.tz is None: options = pc.RoundTemporalOptions( value, unit, ceil_is_strictly_greater=True) - result = ceil_temporal(ta, options=options) + result = pc.ceil_temporal(ta, options=options) expected = ts.dt.ceil(frequency) expected = np.where( @@ -2695,15 +2663,15 @@ def _check_temporal_rounding(ts, values, unit): if unit == "day": frequency = "1D" - result = ceil_temporal(ta).to_pandas() + result = pc.ceil_temporal(ta).to_pandas() expected = ts.dt.ceil(frequency) np.testing.assert_array_equal(result, expected) - result = floor_temporal(ta).to_pandas() + result = pc.floor_temporal(ta).to_pandas() expected = ts.dt.floor(frequency) np.testing.assert_array_equal(result, expected) - result = round_temporal(ta).to_pandas() + result = pc.round_temporal(ta).to_pandas() expected = ts.dt.round(frequency) np.testing.assert_array_equal(result, expected) @@ -2741,15 +2709,15 @@ def test_round_temporal(unit): def test_count(): arr = pa.array([1, 2, 3, None, None]) - assert count(arr).as_py() == 3 - assert count(arr, mode='only_valid').as_py() == 3 - assert count(arr, mode='only_null').as_py() == 2 - assert count(arr, mode='all').as_py() == 5 - assert count(arr, 'all').as_py() == 5 + assert pc.count(arr).as_py() == 3 + assert pc.count(arr, mode='only_valid').as_py() == 3 + assert pc.count(arr, mode='only_null').as_py() == 2 + assert pc.count(arr, mode='all').as_py() == 5 + assert pc.count(arr, 'all').as_py() == 5 with pytest.raises(ValueError, match='"something else" is not a valid count mode'): - count(arr, 'something else') + pc.count(arr, 'something else') def test_index(): @@ -2791,15 +2759,15 @@ def test_partition_nth(): data = list(range(100, 140)) random.shuffle(data) pivot = 10 - indices = partition_nth_indices(data, pivot=pivot) + indices = pc.partition_nth_indices(data, pivot=pivot) check_partition_nth(data, indices, pivot, "at_end") # Positional pivot argument - assert partition_nth_indices(data, pivot) == indices + assert pc.partition_nth_indices(data, pivot) == indices with pytest.raises( ValueError, match="'partition_nth_indices' cannot be called without options"): - partition_nth_indices(data) + pc.partition_nth_indices(data) def test_partition_nth_null_placement(): @@ -2808,14 +2776,14 @@ def test_partition_nth_null_placement(): for pivot in (0, 7, 13, 19): for null_placement in ("at_start", "at_end"): - indices = partition_nth_indices(data, pivot=pivot, - null_placement=null_placement) + indices = pc.partition_nth_indices(data, pivot=pivot, + null_placement=null_placement) check_partition_nth(data, indices, pivot, null_placement) def test_select_k_array(): def validate_select_k(select_k_indices, arr, order, stable_sort=False): - sorted_indices = sort_indices(arr, sort_keys=[("dummy", order)]) + sorted_indices = pc.sort_indices(arr, sort_keys=[("dummy", order)]) head_k_indices = sorted_indices.slice(0, len(select_k_indices)) if stable_sort: assert select_k_indices == head_k_indices @@ -2827,7 +2795,7 @@ def validate_select_k(select_k_indices, arr, order, stable_sort=False): arr = pa.array([1, 2, None, 0]) for k in [0, 2, 4]: for order in ["descending", "ascending"]: - result = select_k_unstable( + result = pc.select_k_unstable( arr, k=k, sort_keys=[("dummy", order)]) validate_select_k(result, arr, order) @@ -2837,26 +2805,26 @@ def validate_select_k(select_k_indices, arr, order, 
stable_sort=False): result = pc.bottom_k_unstable(arr, k=k) validate_select_k(result, arr, "ascending") - result = select_k_unstable( + result = pc.select_k_unstable( arr, options=pc.SelectKOptions( k=2, sort_keys=[("dummy", "descending")]) ) validate_select_k(result, arr, "descending") - result = select_k_unstable( + result = pc.select_k_unstable( arr, options=pc.SelectKOptions(k=2, sort_keys=[("dummy", "ascending")]) ) validate_select_k(result, arr, "ascending") # Position options - assert select_k_unstable(arr, 2, - sort_keys=[("dummy", "ascending")]) == result - assert select_k_unstable(arr, 2, [("dummy", "ascending")]) == result + assert pc.select_k_unstable(arr, 2, + sort_keys=[("dummy", "ascending")]) == result + assert pc.select_k_unstable(arr, 2, [("dummy", "ascending")]) == result def test_select_k_table(): def validate_select_k(select_k_indices, tbl, sort_keys, stable_sort=False): - sorted_indices = sort_indices(tbl, sort_keys=sort_keys) + sorted_indices = pc.sort_indices(tbl, sort_keys=sort_keys) head_k_indices = sorted_indices.slice(0, len(select_k_indices)) if stable_sort: assert select_k_indices == head_k_indices @@ -2867,11 +2835,11 @@ def validate_select_k(select_k_indices, tbl, sort_keys, stable_sort=False): table = pa.table({"a": [1, 2, 0], "b": [1, 0, 1]}) for k in [0, 2, 4]: - result = select_k_unstable( + result = pc.select_k_unstable( table, k=k, sort_keys=[("a", "ascending")]) validate_select_k(result, table, sort_keys=[("a", "ascending")]) - result = select_k_unstable( + result = pc.select_k_unstable( table, k=k, sort_keys=[(pc.field("a"), "ascending"), ("b", "ascending")]) validate_select_k( result, table, sort_keys=[("a", "ascending"), ("b", "ascending")]) @@ -2886,65 +2854,65 @@ def validate_select_k(select_k_indices, tbl, sort_keys, stable_sort=False): with pytest.raises( ValueError, match="'select_k_unstable' cannot be called without options"): - select_k_unstable(table) + pc.select_k_unstable(table) with pytest.raises(ValueError, match="select_k_unstable requires a nonnegative `k`"): - select_k_unstable(table, k=-1, sort_keys=[("a", "ascending")]) + pc.select_k_unstable(table, k=-1, sort_keys=[("a", "ascending")]) with pytest.raises(ValueError, match="select_k_unstable requires a " "non-empty `sort_keys`"): - select_k_unstable(table, k=2, sort_keys=[]) + pc.select_k_unstable(table, k=2, sort_keys=[]) with pytest.raises(ValueError, match="not a valid sort order"): - select_k_unstable(table, k=k, sort_keys=[("a", "nonscending")]) + pc.select_k_unstable(table, k=k, sort_keys=[("a", "nonscending")]) with pytest.raises(ValueError, match="Invalid sort key column: No match for.*unknown"): - select_k_unstable(table, k=k, sort_keys=[("unknown", "ascending")]) + pc.select_k_unstable(table, k=k, sort_keys=[("unknown", "ascending")]) def test_array_sort_indices(): arr = pa.array([1, 2, None, 0]) - result = array_sort_indices(arr) + result = pc.array_sort_indices(arr) assert result.to_pylist() == [3, 0, 1, 2] - result = array_sort_indices(arr, order="ascending") + result = pc.array_sort_indices(arr, order="ascending") assert result.to_pylist() == [3, 0, 1, 2] - result = array_sort_indices(arr, order="descending") + result = pc.array_sort_indices(arr, order="descending") assert result.to_pylist() == [1, 0, 3, 2] - result = array_sort_indices(arr, order="descending", - null_placement="at_start") + result = pc.array_sort_indices(arr, order="descending", + null_placement="at_start") assert result.to_pylist() == [2, 1, 0, 3] - result = array_sort_indices(arr, "descending", - 
null_placement="at_start") + result = pc.array_sort_indices(arr, "descending", + null_placement="at_start") assert result.to_pylist() == [2, 1, 0, 3] with pytest.raises(ValueError, match="not a valid sort order"): - array_sort_indices(arr, order="nonscending") + pc.array_sort_indices(arr, order="nonscending") def test_sort_indices_array(): arr = pa.array([1, 2, None, 0]) - result = sort_indices(arr) + result = pc.sort_indices(arr) assert result.to_pylist() == [3, 0, 1, 2] - result = sort_indices(arr, sort_keys=[("dummy", "ascending")]) + result = pc.sort_indices(arr, sort_keys=[("dummy", "ascending")]) assert result.to_pylist() == [3, 0, 1, 2] - result = sort_indices(arr, sort_keys=[("dummy", "descending")]) + result = pc.sort_indices(arr, sort_keys=[("dummy", "descending")]) assert result.to_pylist() == [1, 0, 3, 2] - result = sort_indices(arr, sort_keys=[("dummy", "descending")], - null_placement="at_start") + result = pc.sort_indices(arr, sort_keys=[("dummy", "descending")], + null_placement="at_start") assert result.to_pylist() == [2, 1, 0, 3] # Positional `sort_keys` - result = sort_indices(arr, [("dummy", "descending")], - null_placement="at_start") + result = pc.sort_indices(arr, [("dummy", "descending")], + null_placement="at_start") assert result.to_pylist() == [2, 1, 0, 3] # Using SortOptions - result = sort_indices( + result = pc.sort_indices( arr, options=pc.SortOptions(sort_keys=[("dummy", "descending")]) ) assert result.to_pylist() == [1, 0, 3, 2] - result = sort_indices( + result = pc.sort_indices( arr, options=pc.SortOptions(sort_keys=[("dummy", "descending")], null_placement="at_start") ) @@ -2954,134 +2922,134 @@ def test_sort_indices_array(): def test_sort_indices_table(): table = pa.table({"a": [1, 1, None, 0], "b": [1, 0, 0, 1]}) - result = sort_indices(table, sort_keys=[("a", "ascending")]) + result = pc.sort_indices(table, sort_keys=[("a", "ascending")]) assert result.to_pylist() == [3, 0, 1, 2] - result = sort_indices(table, sort_keys=[(pc.field("a"), "ascending")], - null_placement="at_start") + result = pc.sort_indices(table, sort_keys=[(pc.field("a"), "ascending")], + null_placement="at_start") assert result.to_pylist() == [2, 3, 0, 1] - result = sort_indices( + result = pc.sort_indices( table, sort_keys=[("a", "descending"), ("b", "ascending")] ) assert result.to_pylist() == [1, 0, 3, 2] - result = sort_indices( + result = pc.sort_indices( table, sort_keys=[("a", "descending"), ("b", "ascending")], null_placement="at_start" ) assert result.to_pylist() == [2, 1, 0, 3] # Positional `sort_keys` - result = sort_indices( + result = pc.sort_indices( table, [("a", "descending"), ("b", "ascending")], null_placement="at_start" ) assert result.to_pylist() == [2, 1, 0, 3] with pytest.raises(ValueError, match="Must specify one or more sort keys"): - sort_indices(table) + pc.sort_indices(table) with pytest.raises(ValueError, match="Invalid sort key column: No match for.*unknown"): - sort_indices(table, sort_keys=[("unknown", "ascending")]) + pc.sort_indices(table, sort_keys=[("unknown", "ascending")]) with pytest.raises(ValueError, match="not a valid sort order"): - sort_indices(table, sort_keys=[("a", "nonscending")]) + pc.sort_indices(table, sort_keys=[("a", "nonscending")]) def test_is_in(): arr = pa.array([1, 2, None, 1, 2, 3]) - result = is_in(arr, value_set=pa.array([1, 3, None])) + result = pc.is_in(arr, value_set=pa.array([1, 3, None])) assert result.to_pylist() == [True, False, True, True, False, True] - result = is_in(arr, value_set=pa.array([1, 3, None]), 
skip_nulls=True) + result = pc.is_in(arr, value_set=pa.array([1, 3, None]), skip_nulls=True) assert result.to_pylist() == [True, False, False, True, False, True] - result = is_in(arr, value_set=pa.array([1, 3])) + result = pc.is_in(arr, value_set=pa.array([1, 3])) assert result.to_pylist() == [True, False, False, True, False, True] - result = is_in(arr, value_set=pa.array([1, 3]), skip_nulls=True) + result = pc.is_in(arr, value_set=pa.array([1, 3]), skip_nulls=True) assert result.to_pylist() == [True, False, False, True, False, True] def test_index_in(): arr = pa.array([1, 2, None, 1, 2, 3]) - result = index_in(arr, value_set=pa.array([1, 3, None])) + result = pc.index_in(arr, value_set=pa.array([1, 3, None])) assert result.to_pylist() == [0, None, 2, 0, None, 1] - result = index_in(arr, value_set=pa.array([1, 3, None]), - skip_nulls=True) + result = pc.index_in(arr, value_set=pa.array([1, 3, None]), + skip_nulls=True) assert result.to_pylist() == [0, None, None, 0, None, 1] - result = index_in(arr, value_set=pa.array([1, 3])) + result = pc.index_in(arr, value_set=pa.array([1, 3])) assert result.to_pylist() == [0, None, None, 0, None, 1] - result = index_in(arr, value_set=pa.array([1, 3]), skip_nulls=True) + result = pc.index_in(arr, value_set=pa.array([1, 3]), skip_nulls=True) assert result.to_pylist() == [0, None, None, 0, None, 1] # Positional value_set - result = index_in(arr, pa.array([1, 3]), skip_nulls=True) + result = pc.index_in(arr, pa.array([1, 3]), skip_nulls=True) assert result.to_pylist() == [0, None, None, 0, None, 1] def test_quantile(): arr = pa.array([1, 2, 3, 4]) - result = quantile(arr) + result = pc.quantile(arr) assert result.to_pylist() == [2.5] - result = quantile(arr, interpolation='lower') + result = pc.quantile(arr, interpolation='lower') assert result.to_pylist() == [2] - result = quantile(arr, interpolation='higher') + result = pc.quantile(arr, interpolation='higher') assert result.to_pylist() == [3] - result = quantile(arr, interpolation='nearest') + result = pc.quantile(arr, interpolation='nearest') assert result.to_pylist() == [3] - result = quantile(arr, interpolation='midpoint') + result = pc.quantile(arr, interpolation='midpoint') assert result.to_pylist() == [2.5] - result = quantile(arr, interpolation='linear') + result = pc.quantile(arr, interpolation='linear') assert result.to_pylist() == [2.5] arr = pa.array([1, 2]) - result = quantile(arr, q=[0.25, 0.5, 0.75]) + result = pc.quantile(arr, q=[0.25, 0.5, 0.75]) assert result.to_pylist() == [1.25, 1.5, 1.75] - result = quantile(arr, q=[0.25, 0.5, 0.75], interpolation='lower') + result = pc.quantile(arr, q=[0.25, 0.5, 0.75], interpolation='lower') assert result.to_pylist() == [1, 1, 1] - result = quantile(arr, q=[0.25, 0.5, 0.75], interpolation='higher') + result = pc.quantile(arr, q=[0.25, 0.5, 0.75], interpolation='higher') assert result.to_pylist() == [2, 2, 2] - result = quantile(arr, q=[0.25, 0.5, 0.75], interpolation='midpoint') + result = pc.quantile(arr, q=[0.25, 0.5, 0.75], interpolation='midpoint') assert result.to_pylist() == [1.5, 1.5, 1.5] - result = quantile(arr, q=[0.25, 0.5, 0.75], interpolation='nearest') + result = pc.quantile(arr, q=[0.25, 0.5, 0.75], interpolation='nearest') assert result.to_pylist() == [1, 1, 2] - result = quantile(arr, q=[0.25, 0.5, 0.75], interpolation='linear') + result = pc.quantile(arr, q=[0.25, 0.5, 0.75], interpolation='linear') assert result.to_pylist() == [1.25, 1.5, 1.75] # Positional `q` - result = quantile(arr, [0.25, 0.5, 0.75], interpolation='linear') + 
result = pc.quantile(arr, [0.25, 0.5, 0.75], interpolation='linear') assert result.to_pylist() == [1.25, 1.5, 1.75] with pytest.raises(ValueError, match="Quantile must be between 0 and 1"): - quantile(arr, q=1.1) + pc.quantile(arr, q=1.1) with pytest.raises(ValueError, match="not a valid quantile interpolation"): - quantile(arr, interpolation='zzz') + pc.quantile(arr, interpolation='zzz') def test_tdigest(): arr = pa.array([1, 2, 3, 4]) - result = tdigest(arr) + result = pc.tdigest(arr) assert result.to_pylist() == [2.5] arr = pa.chunked_array([pa.array([1, 2]), pa.array([3, 4])]) - result = tdigest(arr) + result = pc.tdigest(arr) assert result.to_pylist() == [2.5] arr = pa.array([1, 2, 3, 4]) - result = tdigest(arr, q=[0, 0.5, 1]) + result = pc.tdigest(arr, q=[0, 0.5, 1]) assert result.to_pylist() == [1, 2.5, 4] arr = pa.chunked_array([pa.array([1, 2]), pa.array([3, 4])]) - result = tdigest(arr, [0, 0.5, 1]) # positional `q` + result = pc.tdigest(arr, [0, 0.5, 1]) # positional `q` assert result.to_pylist() == [1, 2.5, 4] @@ -3097,32 +3065,32 @@ def test_min_max_element_wise(): arr2 = pa.array([3, 1, 2]) arr3 = pa.array([2, 3, None]) - result = max_element_wise(arr1, arr2) + result = pc.max_element_wise(arr1, arr2) assert result == pa.array([3, 2, 3]) - result = min_element_wise(arr1, arr2) + result = pc.min_element_wise(arr1, arr2) assert result == pa.array([1, 1, 2]) - result = max_element_wise(arr1, arr2, arr3) + result = pc.max_element_wise(arr1, arr2, arr3) assert result == pa.array([3, 3, 3]) - result = min_element_wise(arr1, arr2, arr3) + result = pc.min_element_wise(arr1, arr2, arr3) assert result == pa.array([1, 1, 2]) # with specifying the option - result = max_element_wise(arr1, arr3, skip_nulls=True) + result = pc.max_element_wise(arr1, arr3, skip_nulls=True) assert result == pa.array([2, 3, 3]) - result = min_element_wise(arr1, arr3, skip_nulls=True) + result = pc.min_element_wise(arr1, arr3, skip_nulls=True) assert result == pa.array([1, 2, 3]) - result = max_element_wise( + result = pc.max_element_wise( arr1, arr3, options=pc.ElementWiseAggregateOptions()) assert result == pa.array([2, 3, 3]) - result = min_element_wise( + result = pc.min_element_wise( arr1, arr3, options=pc.ElementWiseAggregateOptions()) assert result == pa.array([1, 2, 3]) # not skipping nulls - result = max_element_wise(arr1, arr3, skip_nulls=False) + result = pc.max_element_wise(arr1, arr3, skip_nulls=False) assert result == pa.array([2, 3, None]) - result = min_element_wise(arr1, arr3, skip_nulls=False) + result = pc.min_element_wise(arr1, arr3, skip_nulls=False) assert result == pa.array([1, 2, None]) @@ -3148,9 +3116,9 @@ def test_cumulative_sum(start, skip_nulls): if skip_nulls else pa.chunked_array([[0, None, None, None]]) ] for i, arr in enumerate(arrays): - result = cumulative_sum(arr, start=strt, skip_nulls=skip_nulls) + result = pc.cumulative_sum(arr, start=strt, skip_nulls=skip_nulls) # Add `start` offset to expected array before comparing - expected = pc_add(expected_arrays[i], strt if strt is not None + expected = pc.add(expected_arrays[i], strt if strt is not None else 0) assert result.equals(expected) @@ -3169,16 +3137,16 @@ def test_cumulative_sum(start, skip_nulls): if skip_nulls else np.array([1, np.nan, None, None, None, None]) ] for i, arr in enumerate(arrays): - result = cumulative_sum(arr, start=strt, skip_nulls=skip_nulls) + result = pc.cumulative_sum(arr, start=strt, skip_nulls=skip_nulls) # Add `start` offset to expected array before comparing - expected = 
pc_add(expected_arrays[i], strt if strt is not None + expected = pc.add(expected_arrays[i], strt if strt is not None else 0) np.testing.assert_array_almost_equal(result.to_numpy( zero_copy_only=False), expected.to_numpy(zero_copy_only=False)) for strt in ['a', pa.scalar('arrow'), 1.1]: with pytest.raises(pa.ArrowInvalid): - cumulative_sum([1, 2, 3], start=strt) + pc.cumulative_sum([1, 2, 3], start=strt) @pytest.mark.numpy @@ -3203,10 +3171,10 @@ def test_cumulative_prod(start, skip_nulls): if skip_nulls else pa.chunked_array([[1, None, None, None]]) ] for i, arr in enumerate(arrays): - result = cumulative_prod(arr, start=strt, skip_nulls=skip_nulls) + result = pc.cumulative_prod(arr, start=strt, skip_nulls=skip_nulls) # Multiply `start` offset to expected array before comparing - expected = multiply(expected_arrays[i], strt if strt is not None - else 1) + expected = pc.multiply(expected_arrays[i], strt if strt is not None + else 1) assert result.equals(expected) starts = [None, start, pa.scalar(start, type=pa.float32()), @@ -3224,16 +3192,16 @@ def test_cumulative_prod(start, skip_nulls): if skip_nulls else np.array([1, np.nan, None, None, None, None]) ] for i, arr in enumerate(arrays): - result = cumulative_prod(arr, start=strt, skip_nulls=skip_nulls) + result = pc.cumulative_prod(arr, start=strt, skip_nulls=skip_nulls) # Multiply `start` offset to expected array before comparing - expected = multiply(expected_arrays[i], strt if strt is not None - else 1) + expected = pc.multiply(expected_arrays[i], strt if strt is not None + else 1) np.testing.assert_array_almost_equal(result.to_numpy( zero_copy_only=False), expected.to_numpy(zero_copy_only=False)) for strt in ['a', pa.scalar('arrow'), 1.1]: with pytest.raises(pa.ArrowInvalid): - cumulative_prod([1, 2, 3], start=strt) + pc.cumulative_prod([1, 2, 3], start=strt) @pytest.mark.numpy @@ -3259,9 +3227,9 @@ def test_cumulative_max(start, skip_nulls): pa.chunked_array([[2, 2, None, None, None, None]]) ] for i, arr in enumerate(arrays): - result = cumulative_max(arr, start=strt, skip_nulls=skip_nulls) + result = pc.cumulative_max(arr, start=strt, skip_nulls=skip_nulls) # Max `start` offset with expected array before comparing - expected = max_element_wise( + expected = pc.max_element_wise( expected_arrays[i], strt if strt is not None else int(-1e9), skip_nulls=False) assert result.equals(expected) @@ -3281,9 +3249,9 @@ def test_cumulative_max(start, skip_nulls): if skip_nulls else np.array([2.5, 2.5, None, None, None, None]) ] for i, arr in enumerate(arrays): - result = cumulative_max(arr, start=strt, skip_nulls=skip_nulls) + result = pc.cumulative_max(arr, start=strt, skip_nulls=skip_nulls) # Max `start` offset with expected array before comparing - expected = max_element_wise( + expected = pc.max_element_wise( expected_arrays[i], strt if strt is not None else -1e9, skip_nulls=False) np.testing.assert_array_almost_equal(result.to_numpy( @@ -3291,7 +3259,7 @@ def test_cumulative_max(start, skip_nulls): for strt in ['a', pa.scalar('arrow'), 1.1]: with pytest.raises(pa.ArrowInvalid): - cumulative_max([1, 2, 3], start=strt) + pc.cumulative_max([1, 2, 3], start=strt) @pytest.mark.numpy @@ -3317,9 +3285,9 @@ def test_cumulative_min(start, skip_nulls): pa.chunked_array([[5, 5, None, None, None, None]]) ] for i, arr in enumerate(arrays): - result = cumulative_min(arr, start=strt, skip_nulls=skip_nulls) + result = pc.cumulative_min(arr, start=strt, skip_nulls=skip_nulls) # Min `start` offset with expected array before comparing - expected = 
min_element_wise( + expected = pc.min_element_wise( expected_arrays[i], strt if strt is not None else int(1e9), skip_nulls=False) assert result.equals(expected) @@ -3339,9 +3307,9 @@ def test_cumulative_min(start, skip_nulls): if skip_nulls else np.array([5.5, 5.5, None, None, None, None]) ] for i, arr in enumerate(arrays): - result = cumulative_min(arr, start=strt, skip_nulls=skip_nulls) + result = pc.cumulative_min(arr, start=strt, skip_nulls=skip_nulls) # Min `start` offset with expected array before comparing - expected = min_element_wise( + expected = pc.min_element_wise( expected_arrays[i], strt if strt is not None else 1e9, skip_nulls=False) np.testing.assert_array_almost_equal(result.to_numpy( @@ -3349,26 +3317,26 @@ def test_cumulative_min(start, skip_nulls): for strt in ['a', pa.scalar('arrow'), 1.1]: with pytest.raises(pa.ArrowInvalid): - cumulative_max([1, 2, 3], start=strt) + pc.cumulative_max([1, 2, 3], start=strt) def test_make_struct(): - assert make_struct(1, 'a').as_py() == {'0': 1, '1': 'a'} + assert pc.make_struct(1, 'a').as_py() == {'0': 1, '1': 'a'} - assert make_struct(1, 'a', field_names=['i', 's']).as_py() == { + assert pc.make_struct(1, 'a', field_names=['i', 's']).as_py() == { 'i': 1, 's': 'a'} - assert make_struct([1, 2, 3], - "a b c".split()) == pa.StructArray.from_arrays([ - [1, 2, 3], - "a b c".split()], names='0 1'.split()) + assert pc.make_struct([1, 2, 3], + "a b c".split()) == pa.StructArray.from_arrays([ + [1, 2, 3], + "a b c".split()], names='0 1'.split()) with pytest.raises(ValueError, match="Array arguments must all be the same length"): - make_struct([1, 2, 3, 4], "a b c".split()) + pc.make_struct([1, 2, 3, 4], "a b c".split()) with pytest.raises(ValueError, match="0 arguments but 2 field names"): - make_struct(field_names=['one', 'two']) + pc.make_struct(field_names=['one', 'two']) def test_map_lookup(): @@ -3380,12 +3348,12 @@ def test_map_lookup(): result_all = pa.array([[1], None, None, [5, 7], None], type=pa.list_(pa.int32())) - assert map_lookup(arr, 'one', 'first') == result_first - assert map_lookup(arr, pa.scalar( + assert pc.map_lookup(arr, 'one', 'first') == result_first + assert pc.map_lookup(arr, pa.scalar( 'one', type=pa.utf8()), 'first') == result_first - assert map_lookup(arr, pa.scalar( + assert pc.map_lookup(arr, pa.scalar( 'one', type=pa.utf8()), 'last') == result_last - assert map_lookup(arr, pa.scalar( + assert pc.map_lookup(arr, pa.scalar( 'one', type=pa.utf8()), 'all') == result_all @@ -3395,42 +3363,42 @@ def test_struct_fields_options(): c = pa.StructArray.from_arrays([a, b], ["a", "b"]) arr = pa.StructArray.from_arrays([a, c], ["a", "c"]) - assert struct_field(arr, '.c.b') == b - assert struct_field(arr, b'.c.b') == b - assert struct_field(arr, ['c', 'b']) == b - assert struct_field(arr, [1, 'b']) == b - assert struct_field(arr, (b'c', 'b')) == b - assert struct_field(arr, pc.field(('c', 'b'))) == b + assert pc.struct_field(arr, '.c.b') == b + assert pc.struct_field(arr, b'.c.b') == b + assert pc.struct_field(arr, ['c', 'b']) == b + assert pc.struct_field(arr, [1, 'b']) == b + assert pc.struct_field(arr, (b'c', 'b')) == b + assert pc.struct_field(arr, pc.field(('c', 'b'))) == b - assert struct_field(arr, '.a') == a - assert struct_field(arr, ['a']) == a - assert struct_field(arr, 'a') == a - assert struct_field(arr, pc.field(('a',))) == a + assert pc.struct_field(arr, '.a') == a + assert pc.struct_field(arr, ['a']) == a + assert pc.struct_field(arr, 'a') == a + assert pc.struct_field(arr, pc.field(('a',))) == a - assert 
struct_field(arr, indices=[1, 1]) == b - assert struct_field(arr, (1, 1)) == b - assert struct_field(arr, [0]) == a - assert struct_field(arr, []) == arr + assert pc.struct_field(arr, indices=[1, 1]) == b + assert pc.struct_field(arr, (1, 1)) == b + assert pc.struct_field(arr, [0]) == a + assert pc.struct_field(arr, []) == arr with pytest.raises(pa.ArrowInvalid, match="No match for FieldRef"): - struct_field(arr, 'foo') + pc.struct_field(arr, 'foo') with pytest.raises(pa.ArrowInvalid, match="No match for FieldRef"): - struct_field(arr, '.c.foo') + pc.struct_field(arr, '.c.foo') # drill into a non-struct array and continue to ask for a field with pytest.raises(pa.ArrowInvalid, match="No match for FieldRef"): - struct_field(arr, '.a.foo') + pc.struct_field(arr, '.a.foo') # TODO: https://issues.apache.org/jira/browse/ARROW-14853 - # assert struct_field(arr) == arr + # assert pc.struct_field(arr) == arr def test_case_when(): - assert case_when(make_struct([True, False, None], - [False, True, None]), - [1, 2, 3], - [11, 12, 13]) == pa.array([1, 12, None]) + assert pc.case_when(pc.make_struct([True, False, None], + [False, True, None]), + [1, 2, 3], + [11, 12, 13]) == pa.array([1, 12, None]) def test_list_element(): @@ -3441,12 +3409,12 @@ def test_list_element(): lists = pa.array([l1, l2], list_type) index = 1 - result = list_element(lists, index) + result = pa.compute.list_element(lists, index) expected = pa.array([None, {'a': 0.52, 'b': 3}], element_type) assert result.equals(expected) index = 4 - result = list_element(lists, index) + result = pa.compute.list_element(lists, index) expected = pa.array([{'a': 5.6, 'b': 6}, {'a': .6, 'b': 8}], element_type) assert result.equals(expected) @@ -3454,28 +3422,28 @@ def test_list_element(): def test_count_distinct(): samples = [datetime.datetime(year=y, month=1, day=1) for y in range(1992, 2092)] arr = pa.array(samples, pa.timestamp("ns")) - assert count_distinct(arr) == pa.scalar(len(samples), type=pa.int64()) + assert pc.count_distinct(arr) == pa.scalar(len(samples), type=pa.int64()) def test_count_distinct_options(): arr = pa.array([1, 2, 3, None, None]) - assert count_distinct(arr).as_py() == 3 - assert count_distinct(arr, mode='only_valid').as_py() == 3 - assert count_distinct(arr, mode='only_null').as_py() == 1 - assert count_distinct(arr, mode='all').as_py() == 4 - assert count_distinct(arr, 'all').as_py() == 4 + assert pc.count_distinct(arr).as_py() == 3 + assert pc.count_distinct(arr, mode='only_valid').as_py() == 3 + assert pc.count_distinct(arr, mode='only_null').as_py() == 1 + assert pc.count_distinct(arr, mode='all').as_py() == 4 + assert pc.count_distinct(arr, 'all').as_py() == 4 def test_utf8_normalize(): arr = pa.array(["01²3"]) - assert utf8_normalize(arr, form="NFC") == arr - assert utf8_normalize(arr, form="NFKC") == pa.array(["0123"]) - assert utf8_normalize(arr, "NFD") == arr - assert utf8_normalize(arr, "NFKD") == pa.array(["0123"]) + assert pc.utf8_normalize(arr, form="NFC") == arr + assert pc.utf8_normalize(arr, form="NFKC") == pa.array(["0123"]) + assert pc.utf8_normalize(arr, "NFD") == arr + assert pc.utf8_normalize(arr, "NFKD") == pa.array(["0123"]) with pytest.raises( ValueError, match='"NFZ" is not a valid Unicode normalization form'): - utf8_normalize(arr, form="NFZ") + pc.utf8_normalize(arr, form="NFZ") def test_random(): @@ -3517,7 +3485,7 @@ def test_rank_options_tiebreaker(tiebreaker, expected_values): rank_options = pc.RankOptions(sort_keys="ascending", null_placement="at_end", tiebreaker=tiebreaker) - result = 
rank(arr, options=rank_options) + result = pc.rank(arr, options=rank_options) expected = pa.array(expected_values, type=pa.uint64()) assert result.equals(expected) @@ -3527,24 +3495,24 @@ def test_rank_options(): expected = pa.array([3, 1, 4, 6, 5, 7, 2], type=pa.uint64()) # Ensure rank can be called without specifying options - result = rank(arr) + result = pc.rank(arr) assert result.equals(expected) # Ensure default RankOptions - result = rank(arr, options=pc.RankOptions()) + result = pc.rank(arr, options=pc.RankOptions()) assert result.equals(expected) # Ensure sort_keys tuple usage - result = rank(arr, options=pc.RankOptions( + result = pc.rank(arr, options=pc.RankOptions( sort_keys=[("b", "ascending")]) ) assert result.equals(expected) - result = rank(arr, null_placement="at_start") + result = pc.rank(arr, null_placement="at_start") expected_at_start = pa.array([5, 3, 6, 1, 7, 2, 4], type=pa.uint64()) assert result.equals(expected_at_start) - result = rank(arr, sort_keys="descending") + result = pc.rank(arr, sort_keys="descending") expected_descending = pa.array([3, 4, 1, 6, 2, 7, 5], type=pa.uint64()) assert result.equals(expected_descending) @@ -3560,29 +3528,29 @@ def test_rank_quantile_options(): expected = pa.array([0.7, 0.1, 0.7, 0.3, 0.7], type=pa.float64()) # Ensure rank_quantile can be called without specifying options - result = rank_quantile(arr) + result = pc.rank_quantile(arr) assert result.equals(expected) # Ensure default RankOptions - result = rank_quantile(arr, options=pc.RankQuantileOptions()) + result = pc.rank_quantile(arr, options=pc.RankQuantileOptions()) assert result.equals(expected) # Ensure sort_keys tuple usage - result = rank_quantile(arr, options=pc.RankQuantileOptions( + result = pc.rank_quantile(arr, options=pc.RankQuantileOptions( sort_keys=[("b", "ascending")]) ) assert result.equals(expected) - result = rank_quantile(arr, null_placement="at_start") + result = pc.rank_quantile(arr, null_placement="at_start") expected_at_start = pa.array([0.3, 0.7, 0.3, 0.9, 0.3], type=pa.float64()) assert result.equals(expected_at_start) - result = rank_quantile(arr, sort_keys="descending") + result = pc.rank_quantile(arr, sort_keys="descending") expected_descending = pa.array([0.7, 0.3, 0.7, 0.1, 0.7], type=pa.float64()) assert result.equals(expected_descending) with pytest.raises(ValueError, match="not a valid sort order"): - rank_quantile(arr, sort_keys="XXX") + pc.rank_quantile(arr, sort_keys="XXX") def test_rank_normal_options(): @@ -3591,21 +3559,21 @@ def test_rank_normal_options(): expected = pytest.approx( [0.5244005127080407, -1.2815515655446004, 0.5244005127080407, -0.5244005127080409, 0.5244005127080407]) - result = rank_normal(arr) + result = pc.rank_normal(arr) assert result.to_pylist() == expected - result = rank_normal(arr, null_placement="at_end", sort_keys="ascending") + result = pc.rank_normal(arr, null_placement="at_end", sort_keys="ascending") assert result.to_pylist() == expected - result = rank_normal(arr, options=pc.RankQuantileOptions()) + result = pc.rank_normal(arr, options=pc.RankQuantileOptions()) assert result.to_pylist() == expected expected = pytest.approx( [-0.5244005127080409, 1.2815515655446004, -0.5244005127080409, 0.5244005127080407, -0.5244005127080409]) - result = rank_normal(arr, null_placement="at_start", sort_keys="descending") + result = pc.rank_normal(arr, null_placement="at_start", sort_keys="descending") assert result.to_pylist() == expected - result = rank_normal(arr, - 
options=pc.RankQuantileOptions(null_placement="at_start", - sort_keys="descending")) + result = pc.rank_normal(arr, + options=pc.RankQuantileOptions(null_placement="at_start", + sort_keys="descending")) assert result.to_pylist() == expected @@ -3633,17 +3601,17 @@ def create_sample_expressions(): # These expressions include at least one function call exprs_with_call = [a == b, a != b, a > b, c & j, c | j, ~c, d.is_valid(), - a + b, a - b, a * b, a / b, negate(a), - pc_add(a, b), subtract(a, b), divide(a, b), - multiply(a, b), power(a, a), sqrt(a), - exp(b), cos(b), sin(b), tan(b), - acos(b), atan(b), asin(b), atan2(b, b), - sinh(a), cosh(a), tanh(a), - asinh(a), acosh(b), atanh(k), - pc_abs(b), sign(a), bit_wise_not(a), - bit_wise_and(a, a), bit_wise_or(a, a), - bit_wise_xor(a, a), is_nan(b), is_finite(b), - coalesce(a, b), + a + b, a - b, a * b, a / b, pc.negate(a), + pc.add(a, b), pc.subtract(a, b), pc.divide(a, b), + pc.multiply(a, b), pc.power(a, a), pc.sqrt(a), + pc.exp(b), pc.cos(b), pc.sin(b), pc.tan(b), + pc.acos(b), pc.atan(b), pc.asin(b), pc.atan2(b, b), + pc.sinh(a), pc.cosh(a), pc.tanh(a), + pc.asinh(a), pc.acosh(b), pc.atanh(k), + pc.abs(b), pc.sign(a), pc.bit_wise_not(a), + pc.bit_wise_and(a, a), pc.bit_wise_or(a, a), + pc.bit_wise_xor(a, a), pc.is_nan(b), pc.is_finite(b), + pc.coalesce(a, b), a.cast(pa.int32(), safe=False)] # These expressions test out various reference styles and may include function @@ -3807,29 +3775,29 @@ def test_expression_call_function(): field = pc.field("field") # no options - assert str(hour(field)) == "hour(field)" + assert str(pc.hour(field)) == "hour(field)" # default options - assert str(pc_round(field)) == "round(field)" + assert str(pc.round(field)) == "round(field)" # specified options - assert str(pc_round(field, ndigits=1)) == \ + assert str(pc.round(field, ndigits=1)) == \ "round(field, {ndigits=1, round_mode=HALF_TO_EVEN})" # Will convert non-expression arguments if possible - assert str(pc_add(field, 1)) == "add(field, 1)" - assert str(pc_add(field, pa.scalar(1))) == "add(field, 1)" + assert str(pc.add(field, 1)) == "add(field, 1)" + assert str(pc.add(field, pa.scalar(1))) == "add(field, 1)" # Invalid pc.scalar input gives original error message msg = "only other expressions allowed as arguments" with pytest.raises(TypeError, match=msg): - pc_add(field, object) + pc.add(field, object) def test_cast_table_raises(): table = pa.table({'a': [1, 2]}) - with pytest.raises(lib.ArrowTypeError): - cast(table, pa.int64()) + with pytest.raises(pa.lib.ArrowTypeError): + pc.cast(table, pa.int64()) @pytest.mark.parametrize("start,stop,expected", ( @@ -3856,9 +3824,9 @@ def test_list_slice_output_fixed(start, stop, step, expected, value_type, msg = ("Unable to produce FixedSizeListArray from " "non-FixedSizeListArray without `stop` being set.") with pytest.raises(pa.ArrowInvalid, match=msg): - list_slice(*args) + pc.list_slice(*args) else: - result = list_slice(*args) + result = pc.list_slice(*args) pylist = result.cast(pa.list_(pa.int8(), result.type.list_size)).to_pylist() assert pylist == [e[::step] if e else e for e in expected] @@ -3889,8 +3857,8 @@ def test_list_slice_output_variable(start, stop, step, value_type, list_type): if list_type == "fixed": list_type = pa.list_ # non fixed output type - result = list_slice(arr, start, stop, step, - return_fixed_size_list=False) + result = pc.list_slice(arr, start, stop, step, + return_fixed_size_list=False) assert result.type == list_type(value_type()) pylist = 
result.cast(pa.list_(pa.int8())).to_pylist() @@ -3907,7 +3875,7 @@ def test_list_slice_output_variable(start, stop, step, value_type, list_type): lambda: pa.large_list(pa.field('col', pa.int8())))) def test_list_slice_field_names_retained(return_fixed_size, type): arr = pa.array([[1]], type()) - out = list_slice(arr, 0, 1, return_fixed_size_list=return_fixed_size) + out = pc.list_slice(arr, 0, 1, return_fixed_size_list=return_fixed_size) assert arr.type.field(0).name == out.type.field(0).name # Verify out type matches in type if return_fixed_size_list==None @@ -3919,27 +3887,27 @@ def test_list_slice_bad_parameters(): arr = pa.array([[1]], pa.list_(pa.int8(), 1)) msg = r"`start`(.*) should be greater than 0 and smaller than `stop`(.*)" with pytest.raises(pa.ArrowInvalid, match=msg): - list_slice(arr, -1, 1) # negative start? + pc.list_slice(arr, -1, 1) # negative start? with pytest.raises(pa.ArrowInvalid, match=msg): - list_slice(arr, 2, 1) # start > stop? + pc.list_slice(arr, 2, 1) # start > stop? # TODO(ARROW-18281): start==stop -> empty lists with pytest.raises(pa.ArrowInvalid, match=msg): - list_slice(arr, 0, 0) # start == stop? + pc.list_slice(arr, 0, 0) # start == stop? # Step not >= 1 msg = "`step` must be >= 1, got: " with pytest.raises(pa.ArrowInvalid, match=msg + "0"): - list_slice(arr, 0, 1, step=0) + pc.list_slice(arr, 0, 1, step=0) with pytest.raises(pa.ArrowInvalid, match=msg + "-1"): - list_slice(arr, 0, 1, step=-1) + pc.list_slice(arr, 0, 1, step=-1) def check_run_end_encode_decode(value_type, run_end_encode_opts=None): values = [1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3] arr = pa.array(values, type=value_type) - encoded = run_end_encode(arr, options=run_end_encode_opts) - decoded = run_end_decode(encoded) + encoded = pc.run_end_encode(arr, options=run_end_encode_opts) + decoded = pc.run_end_decode(encoded) assert decoded.type == arr.type assert decoded.equals(arr) @@ -3976,65 +3944,65 @@ def test_run_end_encode(value_type, option): def test_pairwise_diff(): arr = pa.array([1, 2, 3, None, 4, 5]) expected = pa.array([None, 1, 1, None, None, 1]) - result = pairwise_diff(arr, period=1) + result = pa.compute.pairwise_diff(arr, period=1) assert result.equals(expected) arr = pa.array([1, 2, 3, None, 4, 5]) expected = pa.array([None, None, 2, None, 1, None]) - result = pairwise_diff(arr, period=2) + result = pa.compute.pairwise_diff(arr, period=2) assert result.equals(expected) # negative period arr = pa.array([1, 2, 3, None, 4, 5], type=pa.int8()) expected = pa.array([-1, -1, None, None, -1, None], type=pa.int8()) - result = pairwise_diff(arr, period=-1) + result = pa.compute.pairwise_diff(arr, period=-1) assert result.equals(expected) # wrap around overflow arr = pa.array([1, 2, 3, None, 4, 5], type=pa.uint8()) expected = pa.array([255, 255, None, None, 255, None], type=pa.uint8()) - result = pairwise_diff(arr, period=-1) + result = pa.compute.pairwise_diff(arr, period=-1) assert result.equals(expected) # fail on overflow arr = pa.array([1, 2, 3, None, 4, 5], type=pa.uint8()) with pytest.raises(pa.ArrowInvalid, match="overflow"): - pairwise_diff_checked(arr, period=-1) + pa.compute.pairwise_diff_checked(arr, period=-1) def test_pivot_wider(): key_names = ["width", "height"] - result = pivot_wider(["height", "width", "depth"], [10, None, 11]) + result = pc.pivot_wider(["height", "width", "depth"], [10, None, 11]) assert result.as_py() == {} - result = pivot_wider(["height", "width", "depth"], [10, None, 11], - key_names) + result = pc.pivot_wider(["height", "width", 
"depth"], [10, None, 11], + key_names) assert result.as_py() == {"width": None, "height": 10} # check key order assert list(result.as_py()) == ["width", "height"] - result = pivot_wider(["height", "width", "depth"], [10, None, 11], - key_names=key_names) + result = pc.pivot_wider(["height", "width", "depth"], [10, None, 11], + key_names=key_names) assert result.as_py() == {"width": None, "height": 10} with pytest.raises(KeyError, match="Unexpected pivot key: depth"): - result = pivot_wider(["height", "width", "depth"], [10, None, 11], - key_names=key_names, - unexpected_key_behavior="raise") + result = pc.pivot_wider(["height", "width", "depth"], [10, None, 11], + key_names=key_names, + unexpected_key_behavior="raise") with pytest.raises(ValueError, match="Encountered more than one non-null value"): - result = pivot_wider(["height", "width", "height"], [10, None, 11], - key_names=key_names) + result = pc.pivot_wider(["height", "width", "height"], [10, None, 11], + key_names=key_names) def test_winsorize(): arr = pa.array([10, 4, 9, 8, 5, 3, 7, 2, 1, 6]) - result = winsorize(arr, 0.1, 0.8) + result = pc.winsorize(arr, 0.1, 0.8) assert result.to_pylist() == [8, 4, 8, 8, 5, 3, 7, 2, 2, 6] - result = winsorize( + result = pc.winsorize( arr, options=pc.WinsorizeOptions(lower_limit=0.1, upper_limit=0.8)) assert result.to_pylist() == [8, 4, 8, 8, 5, 3, 7, 2, 2, 6] diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index a420af18864..07286125c4c 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -27,7 +27,7 @@ try: import numpy as np except ImportError: - pass + np = None from pyarrow.pandas_compat import _pandas_api # noqa import pyarrow as pa diff --git a/python/pyarrow/tests/test_cpp_internals.py b/python/pyarrow/tests/test_cpp_internals.py index 359ef62b1f8..7508d8f0b98 100644 --- a/python/pyarrow/tests/test_cpp_internals.py +++ b/python/pyarrow/tests/test_cpp_internals.py @@ -20,7 +20,7 @@ import pytest -from pyarrow._pyarrow_cpp_tests import get_cpp_tests # type: ignore[unresolved_import] +from pyarrow._pyarrow_cpp_tests import get_cpp_tests def inject_cpp_tests(ns): diff --git a/python/pyarrow/tests/test_csv.py b/python/pyarrow/tests/test_csv.py index 71c96835d2c..2794d07e87c 100644 --- a/python/pyarrow/tests/test_csv.py +++ b/python/pyarrow/tests/test_csv.py @@ -1502,7 +1502,7 @@ def signal_from_thread(): # Interruption should have arrived timely assert last_duration <= 2.0 - e = exc_info.__context__ # type: ignore[possibly-unbound-attribute] + e = exc_info.__context__ assert isinstance(e, pa.ArrowCancelled) assert e.signum == signal.SIGINT @@ -1989,8 +1989,7 @@ def test_write_quoting_style(): buf = io.BytesIO() for write_options, res in [ (WriteOptions(quoting_style='needed'), b'"c1"\n","\n""""\n'), - (WriteOptions(quoting_style='none'), pa.lib.ArrowInvalid), \ - # type: ignore[unresolved-attribute] + (WriteOptions(quoting_style='none'), pa.lib.ArrowInvalid), ]: with CSVWriter(buf, t.schema, write_options=write_options) as writer: try: diff --git a/python/pyarrow/tests/test_cuda.py b/python/pyarrow/tests/test_cuda.py index 1ca5a9529e4..e06f479987c 100644 --- a/python/pyarrow/tests/test_cuda.py +++ b/python/pyarrow/tests/test_cuda.py @@ -42,8 +42,8 @@ not has_ipc_support, reason='CUDA IPC not supported in platform `%s`' % (platform)) -global_context = cuda.Context(0) # for flake8 -global_context1 = cuda.Context(0) # for flake8 +global_context = None # for flake8 
+global_context1 = None # for flake8 def setup_module(module): @@ -807,9 +807,8 @@ def test_create_table_with_device_buffers(): def other_process_for_test_IPC(handle_buffer, expected_arr): - other_context = pa.cuda.Context(0) # type: ignore[unresolved-attribute] - ipc_handle = pa.cuda.IpcMemHandle.from_buffer(handle_buffer) \ - # type: ignore[unresolved-attribute] + other_context = pa.cuda.Context(0) + ipc_handle = pa.cuda.IpcMemHandle.from_buffer(handle_buffer) ipc_buf = other_context.open_ipc_buffer(ipc_handle) ipc_buf.context.synchronize() buf = ipc_buf.copy_to_host() diff --git a/python/pyarrow/tests/test_cuda_numba_interop.py b/python/pyarrow/tests/test_cuda_numba_interop.py index cfcf6673755..876f3c7f761 100644 --- a/python/pyarrow/tests/test_cuda_numba_interop.py +++ b/python/pyarrow/tests/test_cuda_numba_interop.py @@ -26,11 +26,10 @@ cuda = pytest.importorskip("pyarrow.cuda") nb_cuda = pytest.importorskip("numba.cuda") -from numba.cuda.cudadrv.devicearray import DeviceNDArray \ - # type: ignore[unresolved_import] # noqa: E402 +from numba.cuda.cudadrv.devicearray import DeviceNDArray # noqa: E402 -context_choices = {} +context_choices = None context_choice_ids = ['pyarrow.cuda', 'numba.cuda'] @@ -50,7 +49,7 @@ def teardown_module(module): @pytest.mark.parametrize("c", range(len(context_choice_ids)), ids=context_choice_ids) def test_context(c): - ctx, nb_ctx = context_choices.get(c, (None, None)) + ctx, nb_ctx = context_choices[c] assert ctx.handle == nb_ctx.handle.value assert ctx.handle == ctx.to_numba().handle.value ctx2 = cuda.Context.from_numba(nb_ctx) @@ -73,8 +72,7 @@ def make_random_buffer(size, target='host', dtype='uint8', ctx=None): return arr, buf elif target == 'device': arr, buf = make_random_buffer(size, target='host', dtype=dtype) - dbuf = ctx.new_buffer(size * dtype.itemsize) \ - # type: ignore[possibly-unbound-attribute] + dbuf = ctx.new_buffer(size * dtype.itemsize) dbuf.copy_from_host(buf, position=0, nbytes=buf.size) return arr, dbuf raise ValueError('invalid target value') @@ -85,7 +83,7 @@ def make_random_buffer(size, target='host', dtype='uint8', ctx=None): @pytest.mark.parametrize("dtype", dtypes, ids=dtypes) @pytest.mark.parametrize("size", [0, 1, 8, 1000]) def test_from_object(c, dtype, size): - ctx, nb_ctx = context_choices.get(c, (None, None)) + ctx, nb_ctx = context_choices[c] arr, cbuf = make_random_buffer(size, target='device', dtype=dtype, ctx=ctx) # Creating device buffer from numba DeviceNDArray: @@ -163,7 +161,7 @@ def __cuda_array_interface__(self): ids=context_choice_ids) @pytest.mark.parametrize("dtype", dtypes, ids=dtypes) def test_numba_memalloc(c, dtype): - ctx, nb_ctx = context_choices.get(c, (None, None)) + ctx, nb_ctx = context_choices[c] dtype = np.dtype(dtype) # Allocate memory using numba context # Warning: this will not be reflected in pyarrow context manager @@ -186,7 +184,7 @@ def test_numba_memalloc(c, dtype): ids=context_choice_ids) @pytest.mark.parametrize("dtype", dtypes, ids=dtypes) def test_pyarrow_memalloc(c, dtype): - ctx, nb_ctx = context_choices.get(c, (None, None)) + ctx, nb_ctx = context_choices[c] size = 10 arr, cbuf = make_random_buffer(size, target='device', dtype=dtype, ctx=ctx) @@ -200,7 +198,7 @@ def test_pyarrow_memalloc(c, dtype): ids=context_choice_ids) @pytest.mark.parametrize("dtype", dtypes, ids=dtypes) def test_numba_context(c, dtype): - ctx, nb_ctx = context_choices.get(c, (None, None)) + ctx, nb_ctx = context_choices[c] size = 10 with nb_cuda.gpus[0]: arr, cbuf = make_random_buffer(size, target='device', 
@@ -219,7 +217,7 @@ def test_numba_context(c, dtype): ids=context_choice_ids) @pytest.mark.parametrize("dtype", dtypes, ids=dtypes) def test_pyarrow_jit(c, dtype): - ctx, nb_ctx = context_choices.get(c, (None, None)) + ctx, nb_ctx = context_choices[c] @nb_cuda.jit def increment_by_one(an_array): diff --git a/python/pyarrow/tests/test_cython.py b/python/pyarrow/tests/test_cython.py index c9c35087839..e0116a4bb76 100644 --- a/python/pyarrow/tests/test_cython.py +++ b/python/pyarrow/tests/test_cython.py @@ -191,7 +191,7 @@ def test_visit_strings(tmpdir): strings = ['a', 'b', 'c'] visited = [] - mod._visit_strings(strings, visited.append) # type: ignore[unresolved-attribute] + mod._visit_strings(strings, visited.append) assert visited == strings @@ -200,4 +200,4 @@ def raise_on_b(s): if s == 'b': raise ValueError('wtf') - mod._visit_strings(strings, raise_on_b) # type: ignore[unresolved-attribute] + mod._visit_strings(strings, raise_on_b) diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 344201ff4f9..e7365643b84 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -32,7 +32,7 @@ try: import numpy as np except ImportError: - pass + np = None import pytest import pyarrow as pa @@ -41,10 +41,7 @@ import pyarrow.feather import pyarrow.fs as fs import pyarrow.json -from pyarrow import lib # type: ignore[unresolved-attribute] -from pyarrow.compute import (is_in, hour, days_between, sort_indices, unique) \ - # type: ignore[unresolved-attribute] -from pyarrow.lib import is_threading_enabled # type: ignore[unresolved_import] +from pyarrow.lib import is_threading_enabled from pyarrow.tests.util import (FSProtocolClass, ProxyHandler, _configure_s3_limited_user, _filesystem_uri, change_cwd) @@ -52,27 +49,17 @@ try: import pandas as pd except ImportError: - pass + pd = None try: import pyarrow.dataset as ds - from pyarrow.dataset import ParquetFragmentScanOptions, ParquetReadOptions, \ - ParquetFileFragment, ParquetFileFormat # type: ignore[possibly-unbound-attribute] except ImportError: - pass + ds = None try: - from pyarrow.dataset import ( - OrcFileFormat # type: ignore[possibly-unbound-import] - ) -except ImportError: - pass - -try: - import pyarrow.parquet as pq \ - # type: ignore[unresolved-import] + import pyarrow.parquet as pq except ImportError: - pass + pq = None # Marks all of the tests in this module # Ignore these with pytest ... 
-m 'not dataset' @@ -283,7 +270,7 @@ def multisourcefs(request): @pytest.fixture def dataset(mockfs): - format = ParquetFileFormat() + format = ds.ParquetFileFormat() selector = fs.FileSelector('subdir', recursive=True) options = ds.FileSystemFactoryOptions('subdir') options.partitioning = ds.DirectoryPartitioning( @@ -351,7 +338,7 @@ def test_filesystem_dataset(mockfs): schema = pa.schema([ pa.field('const', pa.int64()) ]) - file_format = ParquetFileFormat() + file_format = ds.ParquetFileFormat() paths = ['subdir/1/xxx/file0.parquet', 'subdir/2/yyy/file1.parquet'] partitions = [ds.field('part') == x for x in range(1, 3)] fragments = [file_format.make_fragment(path, mockfs, part) @@ -369,7 +356,7 @@ def test_filesystem_dataset(mockfs): for dataset in [dataset_from_fragments, dataset_from_paths]: assert isinstance(dataset, ds.FileSystemDataset) - assert isinstance(dataset.format, ParquetFileFormat) + assert isinstance(dataset.format, ds.ParquetFileFormat) assert dataset.partition_expression.equals(root_partition) assert set(dataset.files) == set(paths) @@ -377,14 +364,14 @@ def test_filesystem_dataset(mockfs): for fragment, partition, path in zip(fragments, partitions, paths): assert fragment.partition_expression.equals(partition) assert fragment.path == path - assert isinstance(fragment.format, ParquetFileFormat) - assert isinstance(fragment, ParquetFileFragment) + assert isinstance(fragment.format, ds.ParquetFileFormat) + assert isinstance(fragment, ds.ParquetFileFragment) assert fragment.row_groups == [0] assert fragment.num_row_groups == 1 row_group_fragments = list(fragment.split_by_row_group()) assert fragment.num_row_groups == len(row_group_fragments) == 1 - assert isinstance(row_group_fragments[0], ParquetFileFragment) + assert isinstance(row_group_fragments[0], ds.ParquetFileFragment) assert row_group_fragments[0].path == path assert row_group_fragments[0].row_groups == [0] assert row_group_fragments[0].num_row_groups == 1 @@ -503,7 +490,7 @@ def test_dataset(dataset, dataset_reader): def test_dataset_factory_inspect_schema_promotion(promotable_mockfs): mockfs, path1, path2 = promotable_mockfs factory = ds.FileSystemDatasetFactory( - mockfs, [path1, path2], ParquetFileFormat() + mockfs, [path1, path2], ds.ParquetFileFormat() ) with pytest.raises( @@ -547,7 +534,7 @@ def test_dataset_factory_inspect_schema_promotion(promotable_mockfs): def test_dataset_factory_inspect_bad_params(promotable_mockfs): mockfs, path1, path2 = promotable_mockfs factory = ds.FileSystemDatasetFactory( - mockfs, [path1, path2], ParquetFileFormat() + mockfs, [path1, path2], ds.ParquetFileFormat() ) with pytest.raises(ValueError, match='Invalid promote_options: bad_option'): @@ -955,11 +942,11 @@ def test_partition_keys(): @pytest.mark.parquet def test_parquet_read_options(): - opts1 = ParquetReadOptions() - opts2 = ParquetReadOptions(dictionary_columns=['a', 'b']) - opts3 = ParquetReadOptions(coerce_int96_timestamp_unit="ms") - opts4 = ParquetReadOptions(binary_type=pa.binary_view()) - opts5 = ParquetReadOptions(list_type=pa.LargeListType) + opts1 = ds.ParquetReadOptions() + opts2 = ds.ParquetReadOptions(dictionary_columns=['a', 'b']) + opts3 = ds.ParquetReadOptions(coerce_int96_timestamp_unit="ms") + opts4 = ds.ParquetReadOptions(binary_type=pa.binary_view()) + opts5 = ds.ParquetReadOptions(list_type=pa.LargeListType) assert opts1.dictionary_columns == set() @@ -997,37 +984,37 @@ def test_parquet_read_options(): @pytest.mark.parquet def test_parquet_file_format_read_options(): - pff1 = ParquetFileFormat() - 
pff2 = ParquetFileFormat(dictionary_columns={'a'}) - pff3 = ParquetFileFormat(coerce_int96_timestamp_unit="s") - pff4 = ParquetFileFormat(binary_type=pa.binary_view()) - pff5 = ParquetFileFormat(list_type=pa.LargeListType) - - assert pff1.read_options == ParquetReadOptions() - assert pff2.read_options == ParquetReadOptions(dictionary_columns=['a']) - assert pff3.read_options == ParquetReadOptions( + pff1 = ds.ParquetFileFormat() + pff2 = ds.ParquetFileFormat(dictionary_columns={'a'}) + pff3 = ds.ParquetFileFormat(coerce_int96_timestamp_unit="s") + pff4 = ds.ParquetFileFormat(binary_type=pa.binary_view()) + pff5 = ds.ParquetFileFormat(list_type=pa.LargeListType) + + assert pff1.read_options == ds.ParquetReadOptions() + assert pff2.read_options == ds.ParquetReadOptions(dictionary_columns=['a']) + assert pff3.read_options == ds.ParquetReadOptions( coerce_int96_timestamp_unit="s") - assert pff4.read_options == ParquetReadOptions( + assert pff4.read_options == ds.ParquetReadOptions( binary_type=pa.binary_view()) - assert pff5.read_options == ParquetReadOptions( + assert pff5.read_options == ds.ParquetReadOptions( list_type=pa.LargeListType) @pytest.mark.parquet def test_parquet_scan_options(): - opts1 = ParquetFragmentScanOptions() - opts2 = ParquetFragmentScanOptions(buffer_size=4096) - opts3 = ParquetFragmentScanOptions( + opts1 = ds.ParquetFragmentScanOptions() + opts2 = ds.ParquetFragmentScanOptions(buffer_size=4096) + opts3 = ds.ParquetFragmentScanOptions( buffer_size=2**13, use_buffered_stream=True) - opts4 = ParquetFragmentScanOptions(buffer_size=2**13, pre_buffer=False) - opts5 = ParquetFragmentScanOptions( + opts4 = ds.ParquetFragmentScanOptions(buffer_size=2**13, pre_buffer=False) + opts5 = ds.ParquetFragmentScanOptions( thrift_string_size_limit=123456, thrift_container_size_limit=987654,) - opts6 = ParquetFragmentScanOptions( + opts6 = ds.ParquetFragmentScanOptions( page_checksum_verification=True) cache_opts = pa.CacheOptions( hole_size_limit=2**10, range_size_limit=8*2**10, lazy=True) - opts7 = ParquetFragmentScanOptions(pre_buffer=True, cache_options=cache_opts) + opts7 = ds.ParquetFragmentScanOptions(pre_buffer=True, cache_options=cache_opts) assert opts1.use_buffered_stream is False assert opts1.buffer_size == 2**13 @@ -1089,16 +1076,16 @@ def test_file_format_pickling(pickle_module): use_threads=False, block_size=14)), ] try: - formats.append(OrcFileFormat()) + formats.append(ds.OrcFileFormat()) except ImportError: pass if pq is not None: formats.extend([ - ParquetFileFormat(), - ParquetFileFormat(dictionary_columns={'a'}), - ParquetFileFormat(use_buffered_stream=True), - ParquetFileFormat( + ds.ParquetFileFormat(), + ds.ParquetFileFormat(dictionary_columns={'a'}), + ds.ParquetFileFormat(use_buffered_stream=True), + ds.ParquetFileFormat( use_buffered_stream=True, buffer_size=4096, thrift_string_size_limit=123, @@ -1127,8 +1114,8 @@ def test_fragment_scan_options_pickling(pickle_module): if pq is not None: options.extend([ - ParquetFragmentScanOptions(buffer_size=4096), - ParquetFragmentScanOptions(pre_buffer=True), + ds.ParquetFragmentScanOptions(buffer_size=4096), + ds.ParquetFragmentScanOptions(pre_buffer=True), ]) for option in options: @@ -1145,8 +1132,8 @@ def test_fragment_scan_options_pickling(pickle_module): @pytest.mark.parametrize('pre_buffer', [False, True]) @pytest.mark.parquet def test_filesystem_factory(mockfs, paths_or_selector, pre_buffer): - format = ParquetFileFormat( - read_options=ParquetReadOptions(dictionary_columns={"str"}), + format = 
ds.ParquetFileFormat( + read_options=ds.ParquetReadOptions(dictionary_columns={"str"}), pre_buffer=pre_buffer ) @@ -1218,7 +1205,7 @@ def test_filesystem_factory(mockfs, paths_or_selector, pre_buffer): @pytest.mark.parquet def test_make_fragment(multisourcefs): - parquet_format = ParquetFileFormat() + parquet_format = ds.ParquetFileFormat() dataset = ds.dataset('/plain', filesystem=multisourcefs, format=parquet_format) @@ -1229,7 +1216,7 @@ def test_make_fragment(multisourcefs): row_group_fragment = parquet_format.make_fragment(path, multisourcefs, row_groups=[0]) for f in [fragment, row_group_fragment]: - assert isinstance(f, ParquetFileFragment) + assert isinstance(f, ds.ParquetFileFragment) assert f.path == path assert isinstance(f.filesystem, type(multisourcefs)) assert row_group_fragment.row_groups == [0] @@ -1245,7 +1232,7 @@ def test_make_fragment_with_size(s3_example_simple): """ table, path, fs, uri, host, port, access_key, secret_key = s3_example_simple - file_format = ParquetFileFormat() + file_format = ds.ParquetFileFormat() paths = [path] fragments = [file_format.make_fragment(path, fs) @@ -1276,7 +1263,7 @@ def test_make_fragment_with_size(s3_example_simple): fragments_with_size, format=file_format, schema=table.schema, filesystem=fs ) - with pytest.raises(lib.ArrowInvalid, match='Parquet file size is 1 bytes'): + with pytest.raises(pyarrow.lib.ArrowInvalid, match='Parquet file size is 1 bytes'): table = dataset_with_size.to_table() # too large sizes -> error @@ -1352,8 +1339,8 @@ def test_make_parquet_fragment_from_buffer(dataset_reader, pickle_module): arrays[1], arrays[2].dictionary_encode() ] - dictionary_format = ParquetFileFormat( - read_options=ParquetReadOptions( + dictionary_format = ds.ParquetFileFormat( + read_options=ds.ParquetReadOptions( dictionary_columns=['alpha', 'animal'] ), use_buffered_stream=True, @@ -1361,7 +1348,7 @@ def test_make_parquet_fragment_from_buffer(dataset_reader, pickle_module): ) cases = [ - (arrays, ParquetFileFormat()), + (arrays, ds.ParquetFileFormat()), (dictionary_arrays, dictionary_format) ] for arrays, format_ in cases: @@ -1965,7 +1952,7 @@ def test_fragments_repr(tempdir, dataset): "pickled", [lambda x, m: x, lambda x, m: m.loads(m.dumps(x))]) def test_partitioning_factory(mockfs, pickled, pickle_module): paths_or_selector = fs.FileSelector('subdir', recursive=True) - format = ParquetFileFormat() + format = ds.ParquetFileFormat() options = ds.FileSystemFactoryOptions('subdir') partitioning_factory = ds.DirectoryPartitioning.discover(['group', 'key']) @@ -2000,7 +1987,7 @@ def test_partitioning_factory(mockfs, pickled, pickle_module): def test_partitioning_factory_dictionary(mockfs, infer_dictionary, pickled, pickle_module): paths_or_selector = fs.FileSelector('subdir', recursive=True) - format = ParquetFileFormat() + format = ds.ParquetFileFormat() options = ds.FileSystemFactoryOptions('subdir') partitioning_factory = ds.DirectoryPartitioning.discover( @@ -2229,8 +2216,7 @@ def test_dictionary_partitioning_outer_nulls_raises(tempdir): def test_positional_keywords_raises(tempdir): table = pa.table({'a': ['x', 'y', None], 'b': ['x', 'y', 'z']}) with pytest.raises(TypeError): - ds.write_dataset(table, tempdir, "basename-{i}.arrow") \ - # type: ignore[too-many-positional-arguments] + ds.write_dataset(table, tempdir, "basename-{i}.arrow") @pytest.mark.parquet @@ -2608,12 +2594,12 @@ def test_construct_from_invalid_sources_raise(multisourcefs): child1 = ds.FileSystemDatasetFactory( multisourcefs, fs.FileSelector('/plain'), - 
format=ParquetFileFormat() + format=ds.ParquetFileFormat() ) child2 = ds.FileSystemDatasetFactory( multisourcefs, fs.FileSelector('/schema'), - format=ParquetFileFormat() + format=ds.ParquetFileFormat() ) batch1 = pa.RecordBatch.from_arrays([pa.array(range(10))], names=["a"]) batch2 = pa.RecordBatch.from_arrays([pa.array(range(10))], names=["b"]) @@ -3085,7 +3071,7 @@ def test_file_format_inspect_fsspec(tempdir): assert fsspec_fs.ls(tempdir)[0].endswith("data.parquet") # inspect using dataset file format - format = ParquetFileFormat() + format = ds.ParquetFileFormat() # manually creating a PyFileSystem instead of using fs._ensure_filesystem # which would convert an fsspec local filesystem to a native one filesystem = fs.PyFileSystem(fs.FSSpecHandler(fsspec_fs)) @@ -3158,13 +3144,13 @@ def test_filter_compute_expression(tempdir, dataset_reader): _, path = _create_single_file(tempdir, table) dataset = ds.dataset(str(path)) - filter_ = is_in(ds.field('A'), pa.array(["a", "b"])) + filter_ = pc.is_in(ds.field('A'), pa.array(["a", "b"])) assert dataset_reader.to_table(dataset, filter=filter_).num_rows == 3 - filter_ = hour(ds.field('B')) >= 3 + filter_ = pc.hour(ds.field('B')) >= 3 assert dataset_reader.to_table(dataset, filter=filter_).num_rows == 2 - days = days_between(ds.field('B'), ds.field("C")) + days = pc.days_between(ds.field('B'), ds.field("C")) result = dataset_reader.to_table(dataset, columns={"days": days}) assert result["days"].to_pylist() == [0, 1, 2, 3, 4] @@ -3172,7 +3158,7 @@ def test_filter_compute_expression(tempdir, dataset_reader): def test_dataset_union(multisourcefs): child = ds.FileSystemDatasetFactory( multisourcefs, fs.FileSelector('/plain'), - format=ParquetFileFormat() + format=ds.ParquetFileFormat() ) factory = ds.UnionDatasetFactory([child]) @@ -3395,7 +3381,7 @@ def test_orc_format(tempdir, dataset_reader): path = str(tempdir / 'test.orc') orc.write_table(table, path) - dataset = ds.dataset(path, format=OrcFileFormat()) + dataset = ds.dataset(path, format=ds.OrcFileFormat()) fragments = list(dataset.get_fragments()) assert isinstance(fragments[0], ds.FileFragment) result = dataset_reader.to_table(dataset) @@ -3450,7 +3436,7 @@ def test_orc_scan_options(tempdir, dataset_reader): def test_orc_format_not_supported(): try: - from pyarrow.dataset import OrcFileFormat # type: ignore[possibly-unbound-import] # noqa + from pyarrow.dataset import OrcFileFormat # noqa except ImportError: # ORC is not available, test error message with pytest.raises( @@ -3469,7 +3455,7 @@ def test_orc_writer_not_implemented_for_dataset(): pa.table({"a": range(10)}), format='orc', base_dir='/tmp' ) - of = OrcFileFormat() + of = ds.OrcFileFormat() with pytest.raises( NotImplementedError, match="Writing datasets not yet implemented for this file format" @@ -3687,7 +3673,7 @@ def test_column_names_encoding(tempdir, dataset_reader): # Reading as string without specifying encoding should produce an error dataset = ds.dataset(path, format='csv', schema=expected_schema) - with pytest.raises(lib.ArrowInvalid, match="invalid UTF8"): + with pytest.raises(pyarrow.lib.ArrowInvalid, match="invalid UTF8"): dataset_reader.to_table(dataset) # Setting the encoding in the read_options should transcode the data @@ -4189,7 +4175,7 @@ def test_write_to_dataset_given_null_just_works(tempdir): def _sort_table(tab, sort_col): import pyarrow.compute as pc - sorted_indices = sort_indices( + sorted_indices = pc.sort_indices( tab, options=pc.SortOptions([(sort_col, 'ascending')])) return pc.take(tab, sorted_indices) @@ 
-4637,7 +4623,7 @@ def test_write_dataset_max_open_files(tempdir): def _get_compare_pair(data_source, record_batch, file_format, col_id): num_of_files_generated = _get_num_of_files_generated( base_directory=data_source, file_format=file_format) - number_of_partitions = len(unique(record_batch[col_id])) + number_of_partitions = len(pa.compute.unique(record_batch[col_id])) return num_of_files_generated, number_of_partitions # CASE 1: when max_open_files=default & max_open_files >= num_of_partitions @@ -4935,7 +4921,7 @@ def test_write_dataset_parquet(tempdir): # using custom options for version in ["1.0", "2.4", "2.6"]: - format = ParquetFileFormat() + format = ds.ParquetFileFormat() opts = format.make_write_options(version=version) assert " should error is dataset was properly encrypted - pformat = ParquetFileFormat() + pformat = pa.dataset.ParquetFileFormat() with pytest.raises(IOError, match=r"no decryption"): ds.dataset("sample_dataset", format=pformat, filesystem=mockfs) # set decryption config for parquet fragment scan options - pq_scan_opts = ParquetFragmentScanOptions( + pq_scan_opts = ds.ParquetFragmentScanOptions( decryption_config=parquet_decryption_cfg ) - pformat = ParquetFileFormat(default_fragment_scan_options=pq_scan_opts) + pformat = pa.dataset.ParquetFileFormat(default_fragment_scan_options=pq_scan_opts) dataset = ds.dataset("sample_dataset", format=pformat, filesystem=mockfs) assert table.equals(dataset.to_table()) @@ -153,11 +145,11 @@ def test_dataset_encryption_decryption(): # set decryption properties for parquet fragment scan options decryption_properties = crypto_factory.file_decryption_properties( kms_connection_config, decryption_config) - pq_scan_opts = ParquetFragmentScanOptions( + pq_scan_opts = ds.ParquetFragmentScanOptions( decryption_properties=decryption_properties ) - pformat = ParquetFileFormat(default_fragment_scan_options=pq_scan_opts) + pformat = pa.dataset.ParquetFileFormat(default_fragment_scan_options=pq_scan_opts) dataset = ds.dataset("sample_dataset", format=pformat, filesystem=mockfs) assert table.equals(dataset.to_table()) @@ -172,7 +164,7 @@ def test_write_dataset_parquet_without_encryption(): # Set the encryption configuration using ParquetFileFormat # and make_write_options - pformat = ParquetFileFormat() + pformat = pa.dataset.ParquetFileFormat() with pytest.raises(NotImplementedError): _ = pformat.make_write_options(encryption_config="some value") @@ -210,14 +202,14 @@ def unwrap_key(self, wrapped_key: bytes, _: str) -> bytes: plaintext_footer=False, data_key_length_bits=128, ) - pqe_config = ParquetEncryptionConfig( + pqe_config = ds.ParquetEncryptionConfig( crypto_factory, kms_config, encryption_config ) - pqd_config = ParquetDecryptionConfig( + pqd_config = ds.ParquetDecryptionConfig( crypto_factory, kms_config, pe.DecryptionConfiguration() ) - scan_options = ParquetFragmentScanOptions(decryption_config=pqd_config) - file_format = ParquetFileFormat(default_fragment_scan_options=scan_options) + scan_options = ds.ParquetFragmentScanOptions(decryption_config=pqd_config) + file_format = ds.ParquetFileFormat(default_fragment_scan_options=scan_options) write_options = file_format.make_write_options(encryption_config=pqe_config) file_decryption_properties = crypto_factory.file_decryption_properties(kms_config) diff --git a/python/pyarrow/tests/test_exec_plan.py b/python/pyarrow/tests/test_exec_plan.py index 177f3baa378..d85a2c21524 100644 --- a/python/pyarrow/tests/test_exec_plan.py +++ b/python/pyarrow/tests/test_exec_plan.py @@ -220,14 +220,13 
@@ def test_table_join_keys_order(): def test_filter_table_errors(): - from pyarrow.compute import divide # type: ignore[unresolved-attribute] t = pa.table({ "a": [1, 2, 3, 4, 5], "b": [10, 20, 30, 40, 50] }) with pytest.raises(pa.ArrowTypeError): - _filter_table(t, divide(pc.field("a"), pc.scalar(2))) + _filter_table(t, pc.divide(pc.field("a"), pc.scalar(2))) with pytest.raises(pa.ArrowInvalid): _filter_table(t, (pc.field("Z") <= pc.scalar(2))) @@ -268,16 +267,14 @@ def test_filter_table_ordering(): def test_complex_filter_table(): - from pyarrow.compute import bit_wise_and, multiply \ - # type: ignore[unresolved-attribute] t = pa.table({ "a": [1, 2, 3, 4, 5, 6, 6], "b": [10, 20, 30, 40, 50, 60, 61] }) result = _filter_table( - t, ((bit_wise_and(pc.field("a"), pc.scalar(1)) == pc.scalar(0)) & - (multiply(pc.field("a"), pc.scalar(10)) == pc.field("b"))) + t, ((pc.bit_wise_and(pc.field("a"), pc.scalar(1)) == pc.scalar(0)) & + (pc.multiply(pc.field("a"), pc.scalar(10)) == pc.field("b"))) ) assert result == pa.table({ diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index c1e5db238ad..ebac37e862b 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -27,7 +27,7 @@ try: import numpy as np except ImportError: - pass + np = None import pyarrow as pa from pyarrow.vendored.version import Version @@ -1353,11 +1353,11 @@ def test_cpp_extension_in_python(tmpdir): sys.path.insert(0, str(tmpdir)) mod = __import__('extensions') - uuid_type = mod._make_uuid_type() # type: ignore[unresolved-attribute] + uuid_type = mod._make_uuid_type() assert uuid_type.extension_name == "example-uuid" assert uuid_type.storage_type == pa.binary(16) - array = mod._make_uuid_array() # type: ignore[unresolved-attribute] + array = mod._make_uuid_array() assert array.type == uuid_type assert array.to_pylist() == [b'abcdefghijklmno0', b'0onmlkjihgfedcba'] assert array[0].as_py() == b'abcdefghijklmno0' @@ -1882,7 +1882,7 @@ def test_bool8_from_numpy_conversion(): ValueError, match="Cannot convert 0-D array to bool8 array", ): - pa.Bool8Array.from_numpy(np.bool_()) # type: ignore[no-matching-overload] + pa.Bool8Array.from_numpy(np.bool_()) # must use compatible storage type with pytest.raises( diff --git a/python/pyarrow/tests/test_feather.py b/python/pyarrow/tests/test_feather.py index c4631903c1a..054bf920b26 100644 --- a/python/pyarrow/tests/test_feather.py +++ b/python/pyarrow/tests/test_feather.py @@ -26,7 +26,7 @@ try: import numpy as np except ImportError: - pass + np = None import pyarrow as pa import pyarrow.tests.strategies as past @@ -63,7 +63,7 @@ def compression(request): yield request.param -TEST_FILES = [] +TEST_FILES = None def setup_module(module): @@ -72,12 +72,11 @@ def setup_module(module): def teardown_module(module): - if TEST_FILES is not None: - for path in TEST_FILES: - try: - os.remove(path) - except os.error: - pass + for path in TEST_FILES: + try: + os.remove(path) + except os.error: + pass @pytest.mark.pandas @@ -591,7 +590,7 @@ def test_sparse_dataframe(version): # GH #221 data = {'A': [0, 1, 2], 'B': [1, 0, 1]} - df = pd.DataFrame(data).to_sparse(fill_value=1) # type: ignore[call-non-callable] + df = pd.DataFrame(data).to_sparse(fill_value=1) expected = df.to_dense() _check_pandas_roundtrip(df, expected, version=version) diff --git a/python/pyarrow/tests/test_flight.py b/python/pyarrow/tests/test_flight.py index f14e5215b33..e9e99d8eb83 100644 --- a/python/pyarrow/tests/test_flight.py +++ 
b/python/pyarrow/tests/test_flight.py @@ -26,18 +26,17 @@ import threading import time import traceback -from json import dumps as json_dumps -from json import dumps as json_loads +import json from datetime import datetime try: import numpy as np except ImportError: - pass + np = None import pytest import pyarrow as pa -from pyarrow.lib import IpcReadOptions, tobytes # type: ignore[unresolved_import] +from pyarrow.lib import IpcReadOptions, tobytes from pyarrow.util import find_free_port from pyarrow.tests import util @@ -50,35 +49,8 @@ ClientMiddleware, ClientMiddlewareFactory, ) except ImportError: - class MockContextManager: - def __init__(self, *args, **kwargs): - pass - - def __enter__(self): - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - pass - - class FlightServerBase(MockContextManager): - def serve(self): - pass - - class FlightClient(MockContextManager): - def get_flight_info(self, *args, **kwargs): - pass - - def do_action(self, *args, **kwargs): - pass - - def do_get(self, *args, **kwargs): - pass - - def do_put(self, *args, **kwargs): - pass - - def close(self): - pass + flight = None + FlightClient, FlightServerBase = object, object ServerAuthHandler, ClientAuthHandler = object, object ServerMiddleware, ServerMiddlewareFactory = object, object ClientMiddleware, ClientMiddlewareFactory = object, object @@ -344,7 +316,7 @@ class InvalidStreamFlightServer(FlightServerBase): def do_get(self, context, ticket): data1 = [pa.array([-10, -5, 0, 5, 10], type=pa.int32())] data2 = [pa.array([-10.0, -5.0, 0.0, 5.0, 10.0], type=pa.float64())] - assert data1[0].type != data2[0].type + assert data1.type != data2.type table1 = pa.Table.from_arrays(data1, names=['a']) table2 = pa.Table.from_arrays(data2, names=['a']) assert table1.schema == self.schema @@ -1121,7 +1093,7 @@ def test_client_wait_for_available(): server = None def serve(): - nonlocal server + global server time.sleep(0.5) server = FlightServerBase(location) server.serve() @@ -1771,7 +1743,7 @@ def test_flight_do_put_limit(): with pytest.raises(flight.FlightWriteSizeExceededError, match="exceeded soft limit") as excinfo: writer.write_batch(large_batch) - assert excinfo.value.limit == 4096 # type: ignore[unresolved-attribute] + assert excinfo.value.limit == 4096 smaller_batches = [ large_batch.slice(0, 384), large_batch.slice(384), @@ -2385,7 +2357,7 @@ class ActionNoneFlightServer(EchoFlightServer): def do_action(self, context, action): if action.type == "get_value": - return [json_dumps(self.VALUES).encode('utf-8')] + return [json.dumps(self.VALUES).encode('utf-8')] elif action.type == "append": self.VALUES.append(True) return None @@ -2402,7 +2374,7 @@ def test_none_action_side_effect(): FlightClient(('localhost', server.port)) as client: client.do_action(flight.Action("append", b"")) r = client.do_action(flight.Action("get_value", b"")) - assert json_loads(next(r).body.to_pybytes()) == [True] + assert json.loads(next(r).body.to_pybytes()) == [True] @pytest.mark.slow # Takes a while for gRPC to "realize" writes fail diff --git a/python/pyarrow/tests/test_fs.py b/python/pyarrow/tests/test_fs.py index 7c891c7919d..a5a10fa55c6 100644 --- a/python/pyarrow/tests/test_fs.py +++ b/python/pyarrow/tests/test_fs.py @@ -39,31 +39,6 @@ copy_files) from pyarrow.util import find_free_port -try: - from pyarrow.fs import ( - AwsDefaultS3RetryStrategy, # type: ignore[possibly-unbound-import] - AwsStandardS3RetryStrategy, # type: ignore[possibly-unbound-import] - S3FileSystem, # type: ignore[possibly-unbound-import] - 
resolve_s3_region, # type: ignore[possibly-unbound-import] - S3RetryStrategy # type: ignore[possibly-unbound-import] - ) -except ImportError: - pass - -try: - from pyarrow.fs import AzureFileSystem # type: ignore[possibly-unbound-import] -except ImportError: - pass - -try: - from pyarrow.fs import GcsFileSystem # type: ignore[possibly-unbound-import] -except ImportError: - pass - -try: - from pyarrow.fs import HadoopFileSystem # type: ignore[possibly-unbound-import] -except ImportError: - pass here = os.path.dirname(os.path.abspath(__file__)) @@ -236,6 +211,7 @@ def subtree_localfs(request, tempdir, localfs): @pytest.fixture def gcsfs(request, gcs_server): request.config.pyarrow.requires('gcs') + from pyarrow.fs import GcsFileSystem host, port = gcs_server['connection'] bucket = 'pyarrow-filesystem/' @@ -265,6 +241,7 @@ def gcsfs(request, gcs_server): @pytest.fixture def s3fs(request, s3_server): request.config.pyarrow.requires('s3') + from pyarrow.fs import S3FileSystem host, port, access_key, secret_key = s3_server['connection'] bucket = 'pyarrow-filesystem/' @@ -324,6 +301,7 @@ def subtree_s3fs(request, s3fs): @pytest.fixture def azurefs(request, azure_server): request.config.pyarrow.requires('azure') + from pyarrow.fs import AzureFileSystem host, port, account_name, account_key = azure_server['connection'] azurite_authority = f"{host}:{port}" @@ -355,6 +333,8 @@ def hdfs(request, hdfs_connection): if not pa.have_libhdfs(): pytest.skip('Cannot locate libhdfs') + from pyarrow.fs import HadoopFileSystem + host, port, user = hdfs_connection fs = HadoopFileSystem(host, port=port, user=user) @@ -535,6 +515,7 @@ def skip_azure(fs, reason): @pytest.mark.s3 def test_s3fs_limited_permissions_create_bucket(s3_server): + from pyarrow.fs import S3FileSystem _configure_s3_limited_user(s3_server, _minio_limited_policy, 'test_fs_limited_user', 'limited123') host, port, _, _ = s3_server['connection'] @@ -1166,6 +1147,7 @@ def test_mockfs_mtime_roundtrip(mockfs): @pytest.mark.gcs def test_gcs_options(pickle_module): + from pyarrow.fs import GcsFileSystem dt = datetime.now() fs = GcsFileSystem(access_token='abc', target_service_account='service_account@apache', @@ -1203,6 +1185,10 @@ def test_gcs_options(pickle_module): @pytest.mark.s3 def test_s3_options(pickle_module): + from pyarrow.fs import (AwsDefaultS3RetryStrategy, + AwsStandardS3RetryStrategy, S3FileSystem, + S3RetryStrategy) + fs = S3FileSystem(access_key='access', secret_key='secret', session_token='token', region='us-east-2', scheme='https', endpoint_override='localhost:8999') @@ -1303,6 +1289,8 @@ def test_s3_options(pickle_module): @pytest.mark.s3 def test_s3_proxy_options(monkeypatch, pickle_module): + from pyarrow.fs import S3FileSystem + # The following two are equivalent: proxy_opts_1_dict = {'scheme': 'http', 'host': 'localhost', 'port': 8999} proxy_opts_1_str = 'http://localhost:8999' @@ -1442,6 +1430,8 @@ def test_s3_proxy_options(monkeypatch, pickle_module): @pytest.mark.s3 def test_s3fs_wrong_region(): + from pyarrow.fs import S3FileSystem + # wrong region for bucket # anonymous=True incase CI/etc has invalid credentials fs = S3FileSystem(region='eu-north-1', anonymous=True) @@ -1464,6 +1454,8 @@ def test_s3fs_wrong_region(): @pytest.mark.azure def test_azurefs_options(pickle_module): + from pyarrow.fs import AzureFileSystem + fs1 = AzureFileSystem(account_name='fake-account-name') assert isinstance(fs1, AzureFileSystem) assert pickle_module.loads(pickle_module.dumps(fs1)) == fs1 @@ -1556,6 +1548,7 @@ def 
test_azurefs_options(pickle_module): @pytest.mark.hdfs def test_hdfs_options(hdfs_connection, pickle_module): + from pyarrow.fs import HadoopFileSystem if not pa.have_libhdfs(): pytest.skip('Cannot locate libhdfs') @@ -1662,6 +1655,8 @@ def test_filesystem_from_path_object(path): @pytest.mark.s3 def test_filesystem_from_uri_s3(s3_server): + from pyarrow.fs import S3FileSystem + host, port, access_key, secret_key = s3_server['connection'] uri = f"s3://{access_key}:{secret_key}@mybucket/foo/bar?scheme=http&" \ @@ -1679,6 +1674,8 @@ def test_filesystem_from_uri_s3(s3_server): @pytest.mark.gcs def test_filesystem_from_uri_gcs(gcs_server): + from pyarrow.fs import GcsFileSystem + host, port = gcs_server['connection'] uri = ("gs://anonymous@" + @@ -1867,6 +1864,7 @@ def test_py_open_append_stream(): def test_s3_real_aws(): # Exercise connection code with an AWS-backed S3 bucket. # This is a minimal integration check for ARROW-9261 and similar issues. + from pyarrow.fs import S3FileSystem default_region = (os.environ.get('PYARROW_TEST_S3_REGION') or 'us-east-1') fs = S3FileSystem(anonymous=True) @@ -1922,6 +1920,7 @@ def test_s3_real_aws_region_selection(): @pytest.mark.s3 def test_resolve_s3_region(): + from pyarrow.fs import resolve_s3_region assert resolve_s3_region('voltrondata-labs-datasets') == 'us-east-2' assert resolve_s3_region('mf-nwp-models') == 'eu-west-1' @@ -2169,7 +2168,7 @@ def test_fsspec_filesystem_from_uri(): def test_huggingface_filesystem_from_uri(): pytest.importorskip("fsspec") try: - from huggingface_hub import HfFileSystem # type: ignore[unresolved_import] + from huggingface_hub import HfFileSystem except ImportError: pytest.skip("huggingface_hub not installed") diff --git a/python/pyarrow/tests/test_gandiva.py b/python/pyarrow/tests/test_gandiva.py index 01a6d2151a0..80d119a4853 100644 --- a/python/pyarrow/tests/test_gandiva.py +++ b/python/pyarrow/tests/test_gandiva.py @@ -23,7 +23,7 @@ @pytest.mark.gandiva def test_tree_exp_builder(): - import pyarrow.gandiva as gandiva # type: ignore[unresolved_import] + import pyarrow.gandiva as gandiva builder = gandiva.TreeExprBuilder() @@ -65,7 +65,7 @@ def test_tree_exp_builder(): @pytest.mark.gandiva def test_table(): - import pyarrow.gandiva as gandiva # type: ignore[unresolved_import] + import pyarrow.gandiva as gandiva table = pa.Table.from_arrays([pa.array([1.0, 2.0]), pa.array([3.0, 4.0])], ['a', 'b']) @@ -92,7 +92,7 @@ def test_table(): @pytest.mark.gandiva def test_filter(): - import pyarrow.gandiva as gandiva # type: ignore[unresolved_import] + import pyarrow.gandiva as gandiva table = pa.Table.from_arrays([pa.array([1.0 * i for i in range(10000)])], ['a']) @@ -116,7 +116,7 @@ def test_filter(): @pytest.mark.gandiva def test_in_expr(): - import pyarrow.gandiva as gandiva # type: ignore[unresolved_import] + import pyarrow.gandiva as gandiva arr = pa.array(["ga", "an", "nd", "di", "iv", "va"]) table = pa.Table.from_arrays([arr], ["a"]) @@ -154,7 +154,7 @@ def test_in_expr(): @pytest.mark.skip(reason="Gandiva C++ did not have *real* binary, " "time and date support.") def test_in_expr_todo(): - import pyarrow.gandiva as gandiva # type: ignore[unresolved_import] + import pyarrow.gandiva as gandiva # TODO: Implement reasonable support for timestamp, time & date. 
# Current exceptions: # pyarrow.lib.ArrowException: ExpressionValidationError: @@ -227,7 +227,7 @@ def test_in_expr_todo(): @pytest.mark.gandiva def test_boolean(): - import pyarrow.gandiva as gandiva # type: ignore[unresolved_import] + import pyarrow.gandiva as gandiva table = pa.Table.from_arrays([ pa.array([1., 31., 46., 3., 57., 44., 22.]), @@ -254,7 +254,7 @@ def test_boolean(): @pytest.mark.gandiva def test_literals(): - import pyarrow.gandiva as gandiva # type: ignore[unresolved_import] + import pyarrow.gandiva as gandiva builder = gandiva.TreeExprBuilder() @@ -294,7 +294,7 @@ def test_literals(): @pytest.mark.gandiva def test_regex(): - import pyarrow.gandiva as gandiva # type: ignore[unresolved_import] + import pyarrow.gandiva as gandiva elements = ["park", "sparkle", "bright spark and fire", "spark"] data = pa.array(elements, type=pa.string()) @@ -318,7 +318,7 @@ def test_regex(): @pytest.mark.gandiva def test_get_registered_function_signatures(): - import pyarrow.gandiva as gandiva # type: ignore[unresolved_import] + import pyarrow.gandiva as gandiva signatures = gandiva.get_registered_function_signatures() assert type(signatures[0].return_type()) is pa.DataType @@ -328,7 +328,7 @@ def test_get_registered_function_signatures(): @pytest.mark.gandiva def test_filter_project(): - import pyarrow.gandiva as gandiva # type: ignore[unresolved_import] + import pyarrow.gandiva as gandiva mpool = pa.default_memory_pool() # Create a table with some sample data array0 = pa.array([10, 12, -20, 5, 21, 29], pa.int32()) @@ -375,7 +375,7 @@ def test_filter_project(): @pytest.mark.gandiva def test_to_string(): - import pyarrow.gandiva as gandiva # type: ignore[unresolved_import] + import pyarrow.gandiva as gandiva builder = gandiva.TreeExprBuilder() assert str(builder.make_literal(2.0, pa.float64()) @@ -395,7 +395,7 @@ def test_to_string(): @pytest.mark.gandiva def test_rejects_none(): - import pyarrow.gandiva as gandiva # type: ignore[unresolved_import] + import pyarrow.gandiva as gandiva builder = gandiva.TreeExprBuilder() diff --git a/python/pyarrow/tests/test_gdb.py b/python/pyarrow/tests/test_gdb.py index 4b1641557e7..912953ae60d 100644 --- a/python/pyarrow/tests/test_gdb.py +++ b/python/pyarrow/tests/test_gdb.py @@ -158,10 +158,10 @@ def select_frame(self, func_name): m = re.search(pat, out) if m is None: pytest.fail(f"Could not select frame for function {func_name}") - else: - frame_num = int(m[1]) - out = self.run_command(f"frame {frame_num}") - assert f"in {func_name}" in out + + frame_num = int(m[1]) + out = self.run_command(f"frame {frame_num}") + assert f"in {func_name}" in out def join(self): if self.proc is not None: diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py index bea9a929673..a6d3546e57c 100644 --- a/python/pyarrow/tests/test_io.py +++ b/python/pyarrow/tests/test_io.py @@ -33,12 +33,7 @@ try: import numpy as np except ImportError: - pass - -try: - from pyarrow import lib # type: ignore[unresolved-attribute] -except ImportError: - pass + np = None from pyarrow.util import guid from pyarrow import Codec @@ -817,7 +812,7 @@ def test_cache_options_pickling(pickle_module): @pytest.mark.numpy @pytest.mark.parametrize("compression", [ pytest.param( - "bz2", marks=pytest.mark.xfail(raises=lib.ArrowNotImplementedError) + "bz2", marks=pytest.mark.xfail(raises=pa.lib.ArrowNotImplementedError) ), "brotli", "gzip", @@ -858,7 +853,7 @@ def test_compress_decompress(compression): @pytest.mark.numpy @pytest.mark.parametrize("compression", [ pytest.param( - 
"bz2", marks=pytest.mark.xfail(raises=lib.ArrowNotImplementedError) + "bz2", marks=pytest.mark.xfail(raises=pa.lib.ArrowNotImplementedError) ), "brotli", "gzip", @@ -1730,7 +1725,7 @@ def test_output_stream_constructor(tmpdir): ]) def test_compression_detection(path, expected_compression): if not Codec.is_available(expected_compression): - with pytest.raises(lib.ArrowNotImplementedError): + with pytest.raises(pa.lib.ArrowNotImplementedError): Codec.detect(path) else: codec = Codec.detect(path) @@ -1755,7 +1750,7 @@ def test_unknown_compression_raises(): "zstd", pytest.param( "snappy", - marks=pytest.mark.xfail(raises=lib.ArrowNotImplementedError) + marks=pytest.mark.xfail(raises=pa.lib.ArrowNotImplementedError) ) ]) def test_compressed_roundtrip(compression): diff --git a/python/pyarrow/tests/test_ipc.py b/python/pyarrow/tests/test_ipc.py index 77018f93a24..b3b3367223d 100644 --- a/python/pyarrow/tests/test_ipc.py +++ b/python/pyarrow/tests/test_ipc.py @@ -28,15 +28,11 @@ try: import numpy as np except ImportError: - pass + np = None import pyarrow as pa from pyarrow.tests.util import changed_environ, invoke_script -try: - from pyarrow import lib # type: ignore[unresolved-attribute] -except ImportError: - pass try: from pandas.testing import assert_frame_equal @@ -1238,7 +1234,7 @@ def __arrow_c_stream__(self, requested_schema=None): assert reader.read_all() == expected.cast(good_schema) # If schema doesn't match, raises TypeError - with pytest.raises(lib.ArrowTypeError, match='Field 0 cannot be cast'): + with pytest.raises(pa.lib.ArrowTypeError, match='Field 0 cannot be cast'): pa.RecordBatchReader.from_stream( wrapper, schema=pa.schema([pa.field('a', pa.list_(pa.int32()))]) ) @@ -1275,7 +1271,7 @@ def test_record_batch_reader_cast(): # Check error for impossible cast in call to .cast() reader = pa.RecordBatchReader.from_batches(schema_src, data) - with pytest.raises(lib.ArrowTypeError, match='Field 0 cannot be cast'): + with pytest.raises(pa.lib.ArrowTypeError, match='Field 0 cannot be cast'): reader.cast(pa.schema([pa.field('a', pa.list_(pa.int32()))])) # Cast to same type should always work (also for types without a T->T cast function) @@ -1313,7 +1309,7 @@ def test_record_batch_reader_cast_nulls(): # when the batch is pulled reader = pa.RecordBatchReader.from_batches(schema_src, data_with_nulls) casted_reader = reader.cast(schema_dst) - with pytest.raises(lib.ArrowInvalid, match="Can't cast array"): + with pytest.raises(pa.lib.ArrowInvalid, match="Can't cast array"): casted_reader.read_all() diff --git a/python/pyarrow/tests/test_json.py b/python/pyarrow/tests/test_json.py index 90ce549c6e6..c3f9fe333bd 100644 --- a/python/pyarrow/tests/test_json.py +++ b/python/pyarrow/tests/test_json.py @@ -20,14 +20,14 @@ from decimal import Decimal import io import itertools -from json import dumps as json_dumps +import json import string import unittest try: import numpy as np except ImportError: - pass + np = None import pytest import pyarrow as pa @@ -49,7 +49,7 @@ def make_random_json(num_cols=2, num_rows=10, linesep='\r\n'): lines = [] for row in arr.T: json_obj = OrderedDict([(k, int(v)) for (k, v) in zip(col_names, row)]) - lines.append(json_dumps(json_obj)) + lines.append(json.dumps(json_obj)) data = linesep.join(lines).encode() columns = [pa.array(col, type=pa.int64()) for col in arr] expected = pa.Table.from_arrays(columns, col_names) diff --git a/python/pyarrow/tests/test_jvm.py b/python/pyarrow/tests/test_jvm.py index 51f259e4bd5..d2ba780efc7 100644 --- 
a/python/pyarrow/tests/test_jvm.py +++ b/python/pyarrow/tests/test_jvm.py @@ -15,8 +15,7 @@ # specific language governing permissions and limitations # under the License. -from json import dumps as json_dumps -from json import loads as json_loads +import json import os import pyarrow as pa import pyarrow.jvm as pa_jvm @@ -43,7 +42,7 @@ def root_allocator(): 'POM:version', namespaces={ 'POM': 'http://maven.apache.org/POM/4.0.0' - }).text # type: ignore[possibly-unbound-attribute] + }).text jar_path = os.path.join( arrow_dir, 'java', 'tools', 'target', f'arrow-tools-{version}-jar-with-dependencies.jar') @@ -77,8 +76,8 @@ def test_jvm_buffer(root_allocator): def test_jvm_buffer_released(root_allocator): - import jpype.imports # type: ignore[unresolved_import] # noqa - from java.lang import IllegalArgumentException # type: ignore[unresolved_import] + import jpype.imports # noqa + from java.lang import IllegalArgumentException jvm_buffer = root_allocator.buffer(8) jvm_buffer.release() @@ -172,27 +171,27 @@ def test_jvm_types(root_allocator, pa_type, jvm_spec, nullable): spec = { 'name': 'field_name', 'nullable': nullable, - 'type': json_loads(jvm_spec), + 'type': json.loads(jvm_spec), # TODO: This needs to be set for complex types 'children': [] } - jvm_field = _jvm_field(json_dumps(spec)) + jvm_field = _jvm_field(json.dumps(spec)) result = pa_jvm.field(jvm_field) expected_field = pa.field('field_name', pa_type, nullable=nullable) assert result == expected_field - jvm_schema = _jvm_schema(json_dumps(spec)) + jvm_schema = _jvm_schema(json.dumps(spec)) result = pa_jvm.schema(jvm_schema) assert result == pa.schema([expected_field]) # Schema with custom metadata - jvm_schema = _jvm_schema(json_dumps(spec), {'meta': 'data'}) + jvm_schema = _jvm_schema(json.dumps(spec), {'meta': 'data'}) result = pa_jvm.schema(jvm_schema) assert result == pa.schema([expected_field], {'meta': 'data'}) # Schema with custom field metadata spec['metadata'] = [{'key': 'field meta', 'value': 'field data'}] - jvm_schema = _jvm_schema(json_dumps(spec)) + jvm_schema = _jvm_schema(json.dumps(spec)) result = pa_jvm.schema(jvm_schema) expected_field = expected_field.with_metadata( {'field meta': 'field data'}) @@ -376,11 +375,11 @@ def test_jvm_record_batch(root_allocator, pa_type, py_data, jvm_type, spec = { 'name': 'field_name', 'nullable': False, - 'type': json_loads(jvm_spec), + 'type': json.loads(jvm_spec), # TODO: This needs to be set for complex types 'children': [] } - jvm_field = _jvm_field(json_dumps(spec)) + jvm_field = _jvm_field(json.dumps(spec)) # Create VectorSchemaRoot jvm_fields = jpype.JClass('java.util.ArrayList')() diff --git a/python/pyarrow/tests/test_misc.py b/python/pyarrow/tests/test_misc.py index 09ac52588ed..64f45d8bed8 100644 --- a/python/pyarrow/tests/test_misc.py +++ b/python/pyarrow/tests/test_misc.py @@ -22,7 +22,7 @@ import pytest import pyarrow as pa -from pyarrow.lib import ArrowInvalid # type: ignore[unresolved_import] +from pyarrow.lib import ArrowInvalid def test_get_include(): diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py index 9f15bc73c5b..ceea2527da0 100644 --- a/python/pyarrow/tests/test_pandas.py +++ b/python/pyarrow/tests/test_pandas.py @@ -17,7 +17,7 @@ import gc import decimal -from json import dumps as json_dumps +import json import multiprocessing as mp import sys import warnings @@ -32,14 +32,13 @@ import numpy as np import numpy.testing as npt try: - _np_VisibleDeprecationWarning = np.VisibleDeprecationWarning \ - # type: 
ignore[unresolved-attribute] + _np_VisibleDeprecationWarning = np.VisibleDeprecationWarning except AttributeError: from numpy.exceptions import ( VisibleDeprecationWarning as _np_VisibleDeprecationWarning ) except ImportError: - pass + np = None from pyarrow.pandas_compat import get_logical_type, _pandas_api from pyarrow.tests.util import invoke_script, random_ascii, rands @@ -48,7 +47,6 @@ from pyarrow.vendored.version import Version import pyarrow as pa -from pyarrow import lib # type: ignore[unresolved-attribute] try: from pyarrow import parquet as pq except ImportError: @@ -629,13 +627,11 @@ def test_table_column_subset_metadata(self): expected = df[['a']] if isinstance(df.index, pd.DatetimeIndex): df.index.freq = None - tm.assert_frame_equal(result, expected) \ - # type: ignore[invalid-argument-type] + tm.assert_frame_equal(result, expected) table_subset2 = table_subset.remove_column(1) result = table_subset2.to_pandas() - tm.assert_frame_equal(result, df[['a']].reset_index(drop=True)) \ - # type: ignore[invalid-argument-type] + tm.assert_frame_equal(result, df[['a']].reset_index(drop=True)) def test_to_pandas_column_subset_multiindex(self): # ARROW-10122 @@ -1941,7 +1937,7 @@ def test_array_of_bytes_to_strings(self): # cannot be converted to utf-8 def test_array_of_bytes_to_strings_bad_data(self): with pytest.raises( - lib.ArrowInvalid, + pa.lib.ArrowInvalid, match="was not a utf8 string"): pa.array(np.array([b'\x80\x81'], dtype=object), pa.string()) @@ -1957,13 +1953,13 @@ def test_numpy_string_array_to_fixed_size_binary(self): expected = pa.array([b'foo', None, b'baz'], type=pa.binary(3)) assert converted.equals(expected) - with pytest.raises(lib.ArrowInvalid, + with pytest.raises(pa.lib.ArrowInvalid, match=r'Got bytestring of length 3 \(expected 4\)'): arr = np.array([b'foo', b'bar', b'baz'], dtype='|S3') pa.array(arr, type=pa.binary(4)) with pytest.raises( - lib.ArrowInvalid, + pa.lib.ArrowInvalid, match=r'Got bytestring of length 12 \(expected 3\)'): arr = np.array([b'foo', b'bar', b'baz'], dtype='|U3') pa.array(arr, type=pa.binary(3)) @@ -3268,8 +3264,7 @@ def test_error_sparse(self): df = pd.DataFrame({'a': pd.arrays.SparseArray([1, np.nan, 3])}) except AttributeError: # pandas.arrays module introduced in pandas 0.24 - from pandas import SparseArray # type: ignore[unresolved-import] - df = pd.DataFrame({'a': SparseArray([1, np.nan, 3])}) + df = pd.DataFrame({'a': pd.SparseArray([1, np.nan, 3])}) with pytest.raises(TypeError, match="Sparse pandas data"): pa.Table.from_pandas(df) @@ -3724,9 +3719,7 @@ def test_table_from_pandas_schema_field_order_metadata(): coerce_cols_to_types["datetime"] = "datetime64[s, UTC]" expected = df[["float", "datetime"]].astype(coerce_cols_to_types) - # TODO: result and expected should have the same type, - # see other ignore[invalid-argument-type] involving assert_frame_equal - tm.assert_frame_equal(result, expected) # type: ignore[invalid-argument-type] + tm.assert_frame_equal(result, expected) # ---------------------------------------------------------------------- @@ -4429,13 +4422,11 @@ def test_convert_to_extension_array(monkeypatch): # monkeypatch pandas Int64Dtype to *not* have the protocol method if Version(pd.__version__) < Version("1.3.0.dev"): - from pandas.core import integer # type: ignore[unresolved-import] monkeypatch.delattr( - integer._IntegerDtype, "__from_arrow__") + pd.core.arrays.integer._IntegerDtype, "__from_arrow__") else: monkeypatch.delattr( - pd.core.arrays.integer.NumericDtype, "__from_arrow__") \ - # type: 
ignore[unresolved-attribute] + pd.core.arrays.integer.NumericDtype, "__from_arrow__") # Int64Dtype has no __from_arrow__ -> use normal conversion result = table.to_pandas() assert len(_get_mgr(result).blocks) == 1 @@ -4476,13 +4467,11 @@ def test_conversion_extensiontype_to_extensionarray(monkeypatch): # monkeypatch pandas Int64Dtype to *not* have the protocol method # (remove the version added above and the actual version for recent pandas) if Version(pd.__version__) < Version("1.3.0.dev"): - from pandas.core import integer # type: ignore[unresolved-import] monkeypatch.delattr( - integer._IntegerDtype, "__from_arrow__") + pd.core.arrays.integer._IntegerDtype, "__from_arrow__") else: monkeypatch.delattr( - pd.core.arrays.integer.NumericDtype, "__from_arrow__") \ - # type: ignore[unresolved-attribute] + pd.core.arrays.integer.NumericDtype, "__from_arrow__") result = arr.to_pandas() assert _get_mgr(result).blocks[0].values.dtype == np.dtype("int64") @@ -4661,7 +4650,7 @@ def test_metadata_compat_range_index_pre_0_12(): t1 = pa.Table.from_arrays([a_arrow, rng_index_arrow], names=['a', 'qux']) t1 = t1.replace_schema_metadata({ - b'pandas': json_dumps( + b'pandas': json.dumps( {'index_columns': ['qux'], 'column_indexes': [{'name': None, 'field_name': None, @@ -4690,7 +4679,7 @@ def test_metadata_compat_range_index_pre_0_12(): t2 = pa.Table.from_arrays([a_arrow, rng_index_arrow], names=['qux', gen_name_0]) t2 = t2.replace_schema_metadata({ - b'pandas': json_dumps( + b'pandas': json.dumps( {'index_columns': [gen_name_0], 'column_indexes': [{'name': None, 'field_name': None, @@ -4719,7 +4708,7 @@ def test_metadata_compat_range_index_pre_0_12(): t3 = pa.Table.from_arrays([a_arrow, rng_index_arrow], names=['a', gen_name_0]) t3 = t3.replace_schema_metadata({ - b'pandas': json_dumps( + b'pandas': json.dumps( {'index_columns': [gen_name_0], 'column_indexes': [{'name': None, 'field_name': None, @@ -4748,7 +4737,7 @@ def test_metadata_compat_range_index_pre_0_12(): t4 = pa.Table.from_arrays([a_arrow, rng_index_arrow, b_arrow], names=['a', 'qux', gen_name_1]) t4 = t4.replace_schema_metadata({ - b'pandas': json_dumps( + b'pandas': json.dumps( {'index_columns': ['qux', gen_name_1], 'column_indexes': [{'name': None, 'field_name': None, @@ -4782,7 +4771,7 @@ def test_metadata_compat_range_index_pre_0_12(): t5 = pa.Table.from_arrays([a_arrow, rng_index_arrow, b_arrow], names=['a', gen_name_0, gen_name_1]) t5 = t5.replace_schema_metadata({ - b'pandas': json_dumps( + b'pandas': json.dumps( {'index_columns': [gen_name_0, gen_name_1], 'column_indexes': [{'name': None, 'field_name': None, @@ -4829,7 +4818,7 @@ def test_metadata_compat_missing_field_name(): # metadata generated by fastparquet 0.3.2 with missing field_names table = table.replace_schema_metadata({ - b'pandas': json_dumps({ + b'pandas': json.dumps({ 'column_indexes': [ {'field_name': None, 'metadata': None, @@ -4971,7 +4960,7 @@ def test_does_not_mutate_timedelta_dtype(): assert np.dtype(np.timedelta64) == expected - df = pd.DataFrame({"a": [np.timedelta64("s")]}) + df = pd.DataFrame({"a": [np.timedelta64()]}) t = pa.Table.from_pandas(df) t.to_pandas() @@ -5126,7 +5115,7 @@ def test_roundtrip_map_array_with_pydicts_duplicate_keys(): # ------------------------ # With maps as pydicts - with pytest.raises(lib.ArrowException): + with pytest.raises(pa.lib.ArrowException): # raises because of duplicate keys maps.to_pandas(maps_as_pydicts="strict") series_pydicts = maps.to_pandas(maps_as_pydicts="lossy") diff --git a/python/pyarrow/tests/test_scalars.py 
b/python/pyarrow/tests/test_scalars.py index 407c69263e8..0f62dd98f82 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -24,7 +24,7 @@ try: import numpy as np except ImportError: - pass + np = None import pyarrow as pa import pyarrow.compute as pc @@ -201,7 +201,7 @@ def test_timestamp_scalar(): assert b == "" c = repr(pa.scalar(datetime.datetime(2015, 1, 1), type=pa.timestamp('us'))) assert c == "" - d = repr(pc.assume_timezone( # type: ignore[unresolved-attribute] + d = repr(pc.assume_timezone( pa.scalar("2000-01-01").cast(pa.timestamp("s")), "America/New_York")) assert d == "" diff --git a/python/pyarrow/tests/test_schema.py b/python/pyarrow/tests/test_schema.py index 48af7b143ff..a1197ed2d08 100644 --- a/python/pyarrow/tests/test_schema.py +++ b/python/pyarrow/tests/test_schema.py @@ -23,7 +23,7 @@ try: import numpy as np except ImportError: - pass + np = None import pyarrow as pa import pyarrow.tests.util as test_util @@ -627,11 +627,11 @@ def test_type_schema_pickling(pickle_module): pa.union([ pa.field('a', pa.int8()), pa.field('b', pa.int16()) - ], pa.lib.UnionMode_SPARSE), # type: ignore[unresolved-attribute] + ], pa.lib.UnionMode_SPARSE), pa.union([ pa.field('a', pa.int8()), pa.field('b', pa.int16()) - ], pa.lib.UnionMode_DENSE), # type: ignore[unresolved-attribute] + ], pa.lib.UnionMode_DENSE), pa.time32('s'), pa.time64('us'), pa.date32(), diff --git a/python/pyarrow/tests/test_sparse_tensor.py b/python/pyarrow/tests/test_sparse_tensor.py index 89823e04943..eca8090d77a 100644 --- a/python/pyarrow/tests/test_sparse_tensor.py +++ b/python/pyarrow/tests/test_sparse_tensor.py @@ -28,12 +28,15 @@ try: from scipy.sparse import csr_array, coo_array, csr_matrix, coo_matrix except ImportError: - pytestmark = pytest.mark.scipy + coo_matrix = None + csr_matrix = None + csr_array = None + coo_array = None try: - import sparse # type: ignore[unresolved_import] + import sparse except ImportError: - pytestmark = pytest.mark.pydata_sparse + sparse = None tensor_type_pairs = [ @@ -398,7 +401,7 @@ def test_dense_to_sparse_tensor(dtype_str, arrow_type, sparse_tensor_type): assert np.array_equal(array, result_array) -@pytest.mark.scipy +@pytest.mark.skipif(not coo_matrix, reason="requires scipy") @pytest.mark.parametrize('sparse_object', (coo_array, coo_matrix)) @pytest.mark.parametrize('dtype_str,arrow_type', scipy_type_pairs) def test_sparse_coo_tensor_scipy_roundtrip(dtype_str, arrow_type, @@ -440,7 +443,7 @@ def test_sparse_coo_tensor_scipy_roundtrip(dtype_str, arrow_type, assert out_scipy_matrix.has_canonical_format -@pytest.mark.scipy +@pytest.mark.skipif(not csr_matrix, reason="requires scipy") @pytest.mark.parametrize('sparse_object', (csr_array, csr_matrix)) @pytest.mark.parametrize('dtype_str,arrow_type', scipy_type_pairs) def test_sparse_csr_matrix_scipy_roundtrip(dtype_str, arrow_type, @@ -468,7 +471,7 @@ def test_sparse_csr_matrix_scipy_roundtrip(dtype_str, arrow_type, assert np.array_equal(dense_array, sparse_tensor.to_tensor().to_numpy()) -@pytest.mark.pydata_sparse +@pytest.mark.skipif(not sparse, reason="requires pydata/sparse") @pytest.mark.parametrize('dtype_str,arrow_type', tensor_type_pairs) def test_pydata_sparse_sparse_coo_tensor_roundtrip(dtype_str, arrow_type): dtype = np.dtype(dtype_str) diff --git a/python/pyarrow/tests/test_strategies.py b/python/pyarrow/tests/test_strategies.py index 0fe9508aef0..babb839b534 100644 --- a/python/pyarrow/tests/test_strategies.py +++ b/python/pyarrow/tests/test_strategies.py @@ -19,29 +19,29 @@ 
import pytest -from pyarrow import lib # type: ignore[unresolved-attribute] +import pyarrow as pa import pyarrow.tests.strategies as past @h.given(past.all_types) def test_types(ty): - assert isinstance(ty, lib.DataType) + assert isinstance(ty, pa.lib.DataType) @h.given(past.all_fields) def test_fields(field): - assert isinstance(field, lib.Field) + assert isinstance(field, pa.lib.Field) @h.given(past.all_schemas) def test_schemas(schema): - assert isinstance(schema, lib.Schema) + assert isinstance(schema, pa.lib.Schema) @pytest.mark.numpy @h.given(past.all_arrays) def test_arrays(array): - assert isinstance(array, lib.Array) + assert isinstance(array, pa.lib.Array) @pytest.mark.numpy @@ -52,15 +52,15 @@ def test_array_nullability(array): @h.given(past.chunked_arrays(past.primitive_types)) def test_chunked_arrays(chunked_array): - assert isinstance(chunked_array, lib.ChunkedArray) + assert isinstance(chunked_array, pa.lib.ChunkedArray) @h.given(past.all_record_batches) def test_record_batches(record_bath): - assert isinstance(record_bath, lib.RecordBatch) + assert isinstance(record_bath, pa.lib.RecordBatch) @pytest.mark.numpy @h.given(past.all_tables) def test_tables(table): - assert isinstance(table, lib.Table) + assert isinstance(table, pa.lib.Table) diff --git a/python/pyarrow/tests/test_substrait.py b/python/pyarrow/tests/test_substrait.py index 8ac0951e489..fcd1c8d48c5 100644 --- a/python/pyarrow/tests/test_substrait.py +++ b/python/pyarrow/tests/test_substrait.py @@ -22,16 +22,13 @@ import pyarrow as pa import pyarrow.compute as pc -from pyarrow.compute import equal # type: ignore[unresolved-attribute] -from pyarrow import _substrait # type: ignore[unresolved-attribute] -from pyarrow.lib import tobytes # type: ignore[unresolved_import] -from pyarrow.lib import ArrowInvalid, ArrowNotImplementedError \ - # type: ignore[unresolved_import] +from pyarrow.lib import tobytes +from pyarrow.lib import ArrowInvalid, ArrowNotImplementedError try: import pyarrow.substrait as substrait except ImportError: - pass + substrait = None # Marks all of the tests in this module # Ignore these with pytest ... 
-m 'not substrait' @@ -39,7 +36,7 @@ def mock_udf_context(batch_length=10): - from pyarrow._compute import _get_udf_context # type: ignore[unresolved_import] + from pyarrow._compute import _get_udf_context return _get_udf_context(pa.default_memory_pool(), batch_length) @@ -88,7 +85,7 @@ def test_run_serialized_query(tmpdir, use_threads): query = tobytes(substrait_query.replace( "FILENAME_PLACEHOLDER", pathlib.Path(path).as_uri())) - buf = _substrait._parse_json_plan(query) + buf = pa._substrait._parse_json_plan(query) reader = substrait.run_query(buf, use_threads=use_threads) res_tb = reader.read_all() @@ -119,7 +116,7 @@ def test_invalid_plan(): ] } """ - buf = _substrait._parse_json_plan(tobytes(query)) + buf = pa._substrait._parse_json_plan(tobytes(query)) exec_message = "Plan has no relations" with pytest.raises(ArrowInvalid, match=exec_message): substrait.run_query(buf) @@ -165,7 +162,7 @@ def test_binary_conversion_with_json_options(tmpdir, use_threads): path = _write_dummy_data_to_disk(tmpdir, file_name, table) query = tobytes(substrait_query.replace( "FILENAME_PLACEHOLDER", pathlib.Path(path).as_uri())) - buf = _substrait._parse_json_plan(tobytes(query)) + buf = pa._substrait._parse_json_plan(tobytes(query)) reader = substrait.run_query(buf, use_threads=use_threads) res_tb = reader.read_all() @@ -184,7 +181,7 @@ def has_function(fns, ext_file, fn_name): def test_get_supported_functions(): - supported_functions = _substrait.get_supported_functions() + supported_functions = pa._substrait.get_supported_functions() # It probably doesn't make sense to exhaustively verify this list but # we can check a sample aggregate and a sample non-aggregate entry assert has_function(supported_functions, @@ -235,7 +232,7 @@ def table_provider(names, schema): } """ - buf = _substrait._parse_json_plan(tobytes(substrait_query)) + buf = pa._substrait._parse_json_plan(tobytes(substrait_query)) reader = pa.substrait.run_query( buf, table_provider=table_provider, use_threads=use_threads) res_tb = reader.read_all() @@ -278,7 +275,7 @@ def table_provider(names, _): } """ - buf = _substrait._parse_json_plan(tobytes(substrait_query)) + buf = pa._substrait._parse_json_plan(tobytes(substrait_query)) exec_message = "Invalid NamedTable Source" with pytest.raises(ArrowInvalid, match=exec_message): substrait.run_query(buf, table_provider=table_provider) @@ -320,7 +317,7 @@ def table_provider(names, _): } """ query = tobytes(substrait_query) - buf = _substrait._parse_json_plan(tobytes(query)) + buf = pa._substrait._parse_json_plan(tobytes(query)) exec_message = "names for NamedTable not provided" with pytest.raises(ArrowInvalid, match=exec_message): substrait.run_query(buf, table_provider=table_provider) @@ -439,7 +436,7 @@ def table_provider(names, _): } """ - buf = _substrait._parse_json_plan(substrait_query) + buf = pa._substrait._parse_json_plan(substrait_query) reader = pa.substrait.run_query( buf, table_provider=table_provider, use_threads=use_threads) res_tb = reader.read_all() @@ -562,7 +559,7 @@ def table_provider(names, _): } """ - buf = _substrait._parse_json_plan(substrait_query) + buf = pa._substrait._parse_json_plan(substrait_query) with pytest.raises(pa.ArrowKeyError) as excinfo: pa.substrait.run_query(buf, table_provider=table_provider) assert "No function registered" in str(excinfo.value) @@ -601,7 +598,7 @@ def table_provider(names, schema): } """ - buf = _substrait._parse_json_plan(tobytes(substrait_query)) + buf = pa._substrait._parse_json_plan(tobytes(substrait_query)) reader = 
pa.substrait.run_query( buf, table_provider=table_provider, use_threads=use_threads) res_tb = reader.read_all() @@ -747,7 +744,7 @@ def table_provider(names, _): ], } """ - buf = _substrait._parse_json_plan(substrait_query) + buf = pa._substrait._parse_json_plan(substrait_query) reader = pa.substrait.run_query( buf, table_provider=table_provider, use_threads=False) res_tb = reader.read_all() @@ -916,7 +913,7 @@ def table_provider(names, _): ], } """ - buf = _substrait._parse_json_plan(substrait_query) + buf = pa._substrait._parse_json_plan(substrait_query) reader = pa.substrait.run_query( buf, table_provider=table_provider, use_threads=False) res_tb = reader.read_all() @@ -932,8 +929,8 @@ def table_provider(names, _): @pytest.mark.parametrize("expr", [ - equal(pc.field("x"), 7), - equal(pc.field("x"), pc.field("y")), + pc.equal(pc.field("x"), 7), + pc.equal(pc.field("x"), pc.field("y")), pc.field("x") > 50 ]) def test_serializing_expressions(expr): @@ -988,7 +985,7 @@ def test_arrow_one_way_types(): ) def check_one_way(field): - expr = pc.is_null(pc.field(field.name)) # type: ignore[unresolved-attribute] + expr = pc.is_null(pc.field(field.name)) buf = pa.substrait.serialize_expressions([expr], ["test_expr"], schema) returned = pa.substrait.deserialize_expressions(buf) assert alt_schema == returned.schema @@ -1002,8 +999,8 @@ def test_invalid_expression_ser_des(): pa.field("x", pa.int32()), pa.field("y", pa.int32()) ]) - expr = equal(pc.field("x"), 7) - bad_expr = equal(pc.field("z"), 7) + expr = pc.equal(pc.field("x"), 7) + bad_expr = pc.equal(pc.field("z"), 7) # Invalid number of names with pytest.raises(ValueError) as excinfo: pa.substrait.serialize_expressions([expr], [], schema) @@ -1022,13 +1019,13 @@ def test_serializing_multiple_expressions(): pa.field("x", pa.int32()), pa.field("y", pa.int32()) ]) - exprs = [equal(pc.field("x"), 7), equal(pc.field("x"), pc.field("y"))] + exprs = [pc.equal(pc.field("x"), 7), pc.equal(pc.field("x"), pc.field("y"))] buf = pa.substrait.serialize_expressions(exprs, ["first", "second"], schema) returned = pa.substrait.deserialize_expressions(buf) assert schema == returned.schema assert len(returned.expressions) == 2 - norm_exprs = [equal(pc.field(0), 7), equal(pc.field(0), pc.field(1))] + norm_exprs = [pc.equal(pc.field(0), 7), pc.equal(pc.field(0), pc.field(1))] assert str(returned.expressions["first"]) == str(norm_exprs[0]) assert str(returned.expressions["second"]) == str(norm_exprs[1]) @@ -1038,8 +1035,8 @@ def test_serializing_with_compute(): pa.field("x", pa.int32()), pa.field("y", pa.int32()) ]) - expr = equal(pc.field("x"), 7) - expr_norm = equal(pc.field(0), 7) + expr = pc.equal(pc.field("x"), 7) + expr_norm = pc.equal(pc.field(0), 7) buf = expr.to_substrait(schema) returned = pa.substrait.deserialize_expressions(buf) @@ -1069,7 +1066,7 @@ def test_serializing_udfs(): ]) a = pc.scalar(10) b = pc.scalar(4) - exprs = [pc.shift_left(a, b)] # type: ignore[unresolved-attribute] + exprs = [pc.shift_left(a, b)] with pytest.raises(ArrowNotImplementedError): pa.substrait.serialize_expressions(exprs, ["expr"], schema) diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index 64624c93f1e..b65fb7d952c 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -23,10 +23,9 @@ try: import numpy as np except ImportError: - pass + np = None import pytest import pyarrow as pa -from pyarrow import lib # type: ignore[unresolved-import] import pyarrow.compute as pc from pyarrow.interchange import 
from_dataframe from pyarrow.vendored.version import Version @@ -50,8 +49,8 @@ def test_chunked_array_basics(): [7, 8, 9] ]) assert isinstance(data.chunks, list) - assert all(isinstance(c, lib.Int64Array) for c in data.chunks) - assert all(isinstance(c, lib.Int64Array) for c in data.iterchunks()) + assert all(isinstance(c, pa.lib.Int64Array) for c in data.chunks) + assert all(isinstance(c, pa.lib.Int64Array) for c in data.iterchunks()) assert len(data.chunks) == 3 assert data.get_total_buffer_size() == sum(c.get_total_buffer_size() for c in data.iterchunks()) @@ -419,7 +418,7 @@ def test_to_pandas_empty_table(): table = pa.table(df) result = table.schema.empty_table().to_pandas() assert result.shape == (0, 2) - tm.assert_frame_equal(result, df.iloc[:0]) # type: ignore[invalid-argument-type] + tm.assert_frame_equal(result, df.iloc[:0]) @pytest.mark.pandas @@ -651,7 +650,7 @@ def __arrow_c_stream__(self, requested_schema=None): # If schema doesn't match, raises NotImplementedError with pytest.raises( - lib.ArrowTypeError, match="Field 0 cannot be cast" + pa.lib.ArrowTypeError, match="Field 0 cannot be cast" ): pa.table( wrapper, schema=pa.schema([pa.field('a', pa.list_(pa.int32()))]) @@ -2231,7 +2230,7 @@ def test_invalid_table_construct(): u8 = pa.uint8() arrays = [pa.array(array, type=u8), pa.array(array[1:], type=u8)] - with pytest.raises(lib.ArrowInvalid): + with pytest.raises(pa.lib.ArrowInvalid): pa.Table.from_arrays(arrays, names=["a1", "a2"]) @@ -3300,7 +3299,7 @@ def test_table_join_asof_by_length_mismatch(): }) msg = "inconsistent size of by-key across inputs" - with pytest.raises(lib.ArrowInvalid, match=msg): + with pytest.raises(pa.lib.ArrowInvalid, match=msg): t1.join_asof( t2, on="on", by=["colA", "colB"], tolerance=1, right_on="on", right_by=["colA"], @@ -3322,7 +3321,7 @@ def test_table_join_asof_by_type_mismatch(): }) msg = "Expected by-key type int64 but got double for field colA in input 1" - with pytest.raises(lib.ArrowInvalid, match=msg): + with pytest.raises(pa.lib.ArrowInvalid, match=msg): t1.join_asof( t2, on="on", by=["colA"], tolerance=1, right_on="on", right_by=["colA"], @@ -3344,7 +3343,7 @@ def test_table_join_asof_on_type_mismatch(): }) msg = "Expected on-key type int64 but got double for field on in input 1" - with pytest.raises(lib.ArrowInvalid, match=msg): + with pytest.raises(pa.lib.ArrowInvalid, match=msg): t1.join_asof( t2, on="on", by=["colA"], tolerance=1, right_on="on", right_by=["colA"], @@ -3471,14 +3470,14 @@ def test_invalid_non_join_column(): }) # check as left table - with pytest.raises(lib.ArrowInvalid) as excinfo: + with pytest.raises(pa.lib.ArrowInvalid) as excinfo: t1.join(t2, 'id', join_type='inner') exp_error_msg = "Data type list is not supported " \ + "in join non-key field array_column" assert exp_error_msg in str(excinfo.value) # check as right table - with pytest.raises(lib.ArrowInvalid) as excinfo: + with pytest.raises(pa.lib.ArrowInvalid) as excinfo: t2.join(t1, 'id', join_type='inner') assert exp_error_msg in str(excinfo.value) diff --git a/python/pyarrow/tests/test_types.py b/python/pyarrow/tests/test_types.py index 4077b302f71..e628e559b84 100644 --- a/python/pyarrow/tests/test_types.py +++ b/python/pyarrow/tests/test_types.py @@ -27,19 +27,17 @@ try: import hypothesis.extra.pytz as tzst except ImportError: - pass + tzst = None import weakref try: import numpy as np except ImportError: - pass + np = None import pyarrow as pa import pyarrow.types as types import pyarrow.tests.strategies as past -from pyarrow import lib # type: 
ignore[unresolved-import] - def get_many_types(): # returning them from a function is required because of pa.dictionary @@ -85,14 +83,14 @@ def get_many_types(): pa.field('b', pa.int8(), nullable=False), pa.field('c', pa.string())]), pa.union([pa.field('a', pa.binary(10)), - pa.field('b', pa.string())], mode=lib.UnionMode_DENSE), + pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE), pa.union([pa.field('a', pa.binary(10)), - pa.field('b', pa.string())], mode=lib.UnionMode_DENSE, + pa.field('b', pa.string())], mode=pa.lib.UnionMode_DENSE, type_codes=[4, 8]), pa.union([pa.field('a', pa.binary(10)), - pa.field('b', pa.string())], mode=lib.UnionMode_SPARSE), + pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE), pa.union([pa.field('a', pa.binary(10), nullable=False), - pa.field('b', pa.string())], mode=lib.UnionMode_SPARSE), + pa.field('b', pa.string())], mode=pa.lib.UnionMode_SPARSE), pa.dictionary(pa.int32(), pa.string()), pa.run_end_encoded(pa.int16(), pa.int32()), pa.run_end_encoded(pa.int32(), pa.string()), @@ -249,7 +247,7 @@ def test_is_nested_or_struct(): def test_is_union(): - for mode in [lib.UnionMode_SPARSE, lib.UnionMode_DENSE]: + for mode in [pa.lib.UnionMode_SPARSE, pa.lib.UnionMode_DENSE]: assert types.is_union(pa.union([pa.field('a', pa.int32()), pa.field('b', pa.int8()), pa.field('c', pa.string())], @@ -355,7 +353,7 @@ def test_is_primitive(): (datetime.timezone(datetime.timedelta(hours=1, minutes=30)), '+01:30') ]) def test_tzinfo_to_string(tz, expected): - assert lib.tzinfo_to_string(tz) == expected + assert pa.lib.tzinfo_to_string(tz) == expected def test_pytz_tzinfo_to_string(): @@ -363,13 +361,13 @@ def test_pytz_tzinfo_to_string(): tz = [pytz.utc, pytz.timezone('Europe/Paris')] expected = ['UTC', 'Europe/Paris'] - assert [lib.tzinfo_to_string(i) for i in tz] == expected + assert [pa.lib.tzinfo_to_string(i) for i in tz] == expected # StaticTzInfo.tzname returns with '-09' so we need to infer the timezone's # name from the tzinfo.zone attribute tz = [pytz.timezone('Etc/GMT-9'), pytz.FixedOffset(180)] expected = ['Etc/GMT-9', '+03:00'] - assert [lib.tzinfo_to_string(i) for i in tz] == expected + assert [pa.lib.tzinfo_to_string(i) for i in tz] == expected @pytest.mark.timezone_data @@ -383,9 +381,9 @@ def test_dateutil_tzinfo_to_string(): import dateutil.tz tz = dateutil.tz.UTC - assert lib.tzinfo_to_string(tz) == 'UTC' + assert pa.lib.tzinfo_to_string(tz) == 'UTC' tz = dateutil.tz.gettz('Europe/Paris') - assert lib.tzinfo_to_string(tz) == 'Europe/Paris' + assert pa.lib.tzinfo_to_string(tz) == 'Europe/Paris' @pytest.mark.timezone_data @@ -397,20 +395,20 @@ def test_zoneinfo_tzinfo_to_string(): pytest.importorskip('tzdata') tz = zoneinfo.ZoneInfo('UTC') - assert lib.tzinfo_to_string(tz) == 'UTC' + assert pa.lib.tzinfo_to_string(tz) == 'UTC' tz = zoneinfo.ZoneInfo('Europe/Paris') - assert lib.tzinfo_to_string(tz) == 'Europe/Paris' + assert pa.lib.tzinfo_to_string(tz) == 'Europe/Paris' def test_tzinfo_to_string_errors(): msg = "Not an instance of datetime.tzinfo" with pytest.raises(TypeError): - lib.tzinfo_to_string("Europe/Budapest") + pa.lib.tzinfo_to_string("Europe/Budapest") tz = datetime.timezone(datetime.timedelta(hours=1, seconds=30)) msg = "Offset must represent whole number of minutes" with pytest.raises(ValueError, match=msg): - lib.tzinfo_to_string(tz) + pa.lib.tzinfo_to_string(tz) if tzst: @@ -423,8 +421,8 @@ def test_tzinfo_to_string_errors(): def test_pytz_timezone_roundtrip(tz): if tz is None: pytest.skip('requires timezone not None') - 
timezone_string = lib.tzinfo_to_string(tz) - timezone_tzinfo = lib.string_to_tzinfo(timezone_string) + timezone_string = pa.lib.tzinfo_to_string(tz) + timezone_tzinfo = pa.lib.string_to_tzinfo(timezone_string) assert timezone_tzinfo == tz @@ -484,14 +482,14 @@ def tzname(self, dt): def utcoffset(self, dt): return None - assert lib.tzinfo_to_string(CorrectTimezone1()) == "-02:30" - assert lib.tzinfo_to_string(CorrectTimezone2()) == "+03:00" + assert pa.lib.tzinfo_to_string(CorrectTimezone1()) == "-02:30" + assert pa.lib.tzinfo_to_string(CorrectTimezone2()) == "+03:00" msg = (r"Object returned by tzinfo.utcoffset\(None\) is not an instance " r"of datetime.timedelta") for wrong in [BuggyTimezone1(), BuggyTimezone2(), BuggyTimezone3()]: with pytest.raises(ValueError, match=msg): - lib.tzinfo_to_string(wrong) + pa.lib.tzinfo_to_string(wrong) def test_string_to_tzinfo(): @@ -501,7 +499,7 @@ def test_string_to_tzinfo(): expected = [pytz.utc, pytz.timezone('Europe/Paris'), pytz.FixedOffset(180), pytz.FixedOffset(90), pytz.FixedOffset(-120)] - result = [lib.string_to_tzinfo(i) for i in string] + result = [pa.lib.string_to_tzinfo(i) for i in string] assert result == expected except ImportError: @@ -513,7 +511,7 @@ def test_string_to_tzinfo(): datetime.timezone( datetime.timedelta(hours=1, minutes=30)), datetime.timezone(-datetime.timedelta(hours=2))] - result = [lib.string_to_tzinfo(i) for i in string] + result = [pa.lib.string_to_tzinfo(i) for i in string] assert result == expected except ImportError: @@ -527,8 +525,8 @@ def test_timezone_string_roundtrip_pytz(): pytz.utc, pytz.timezone('America/New_York')] name = ['+01:30', '-01:30', 'UTC', 'America/New_York'] - assert [lib.tzinfo_to_string(i) for i in tz] == name - assert [lib.string_to_tzinfo(i)for i in name] == tz + assert [pa.lib.tzinfo_to_string(i) for i in tz] == name + assert [pa.lib.string_to_tzinfo(i)for i in name] == tz def test_timestamp(): @@ -799,13 +797,13 @@ def check_fields(ty, fields): sparse_factories = [ partial(pa.union, mode='sparse'), - partial(pa.union, mode=lib.UnionMode_SPARSE), + partial(pa.union, mode=pa.lib.UnionMode_SPARSE), pa.sparse_union, ] dense_factories = [ partial(pa.union, mode='dense'), - partial(pa.union, mode=lib.UnionMode_DENSE), + partial(pa.union, mode=pa.lib.UnionMode_DENSE), pa.dense_union, ] @@ -1324,7 +1322,6 @@ def test_field_modified_copies(): assert f0.equals(f0_) -@pytest.mark.numpy def test_is_integer_value(): assert pa.types.is_integer_value(1) if np is not None: diff --git a/python/pyarrow/tests/test_udf.py b/python/pyarrow/tests/test_udf.py index aed2fbceaeb..93004a30618 100644 --- a/python/pyarrow/tests/test_udf.py +++ b/python/pyarrow/tests/test_udf.py @@ -21,7 +21,7 @@ try: import numpy as np except ImportError: - pass + np = None import pyarrow as pa from pyarrow import compute as pc @@ -35,11 +35,11 @@ try: import pyarrow.dataset as ds except ImportError: - pass + ds = None def mock_udf_context(batch_length=10): - from pyarrow._compute import _get_udf_context # type: ignore[unresolved_import] + from pyarrow._compute import _get_udf_context return _get_udf_context(pa.default_memory_pool(), batch_length) diff --git a/python/pyarrow/tests/util.py b/python/pyarrow/tests/util.py index 7eefd067807..d8b250ffff0 100644 --- a/python/pyarrow/tests/util.py +++ b/python/pyarrow/tests/util.py @@ -171,8 +171,7 @@ def get_modified_env_with_pythonpath(): existing_pythonpath = env.get('PYTHONPATH', '') module_path = os.path.abspath( - os.path.dirname(os.path.dirname(pa.__file__))) \ - # type: 
ignore[no-matching-overload] + os.path.dirname(os.path.dirname(pa.__file__))) if existing_pythonpath: new_pythonpath = os.pathsep.join((module_path, existing_pythonpath)) diff --git a/python/pyarrow/tests/wsgi_examples.py b/python/pyarrow/tests/wsgi_examples.py index 1fafa852dc6..440b107abe5 100644 --- a/python/pyarrow/tests/wsgi_examples.py +++ b/python/pyarrow/tests/wsgi_examples.py @@ -28,7 +28,7 @@ def application(env, start_response): # See test_fs::test_uwsgi_integration start_response('200 OK', [('Content-Type', 'text/html')]) # flake8: noqa - fs = pyarrow.fs.S3FileSystem() # type: ignore[possibly-unbound-attribute] + fs = pyarrow.fs.S3FileSystem() return [b"Hello World\n"] else: start_response('404 Not Found', [('Content-Type', 'text/html')]) diff --git a/python/pyproject.toml b/python/pyproject.toml index 9c16ee08892..94a0d9a6b4d 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -118,3 +118,6 @@ fallback_version = '22.0.0a0' #unsupported-operator = "ignore" #missing-argument = "ignore" #call-non-callable = "ignore" + +[tool.ty.src] +exclude = ["pyarrow/tests"] From 89e2e7593c59e6cea5d82ec807e6ca2e3de7a950 Mon Sep 17 00:00:00 2001 From: Rok Mihevc Date: Sat, 26 Jul 2025 18:31:47 +0200 Subject: [PATCH 32/32] Converging --- .github/workflows/python.yml | 3 +-- python/pyarrow/__init__.py | 7 ++++--- python/pyarrow/_compute.pyx | 2 +- python/pyarrow/_dataset.pyx | 4 ++-- python/pyarrow/_dataset_parquet.pyx | 2 +- python/pyarrow/_substrait.pyx | 2 +- python/pyarrow/cffi.py | 2 +- python/pyarrow/conftest.py | 12 ++++++------ python/pyarrow/fs.py | 2 +- python/pyarrow/lib.pyx | 2 +- python/pyarrow/pandas_compat.py | 4 ++-- python/pyarrow/util.py | 2 +- python/pyarrow/vendored/docscrape.py | 2 +- python/pyproject.toml | 22 +--------------------- 14 files changed, 24 insertions(+), 44 deletions(-) diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml index 15906a10ac0..cb342b132d2 100644 --- a/.github/workflows/python.yml +++ b/.github/workflows/python.yml @@ -139,10 +139,9 @@ jobs: run: archery docker push ${{ matrix.image }} - name: Type check with ty - working-directory: python run: |- python -m pip install ty - python -m ty check + pushd python; python -m ty check macos: name: ${{ matrix.architecture }} macOS ${{ matrix.macos-version }} Python 3 diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 2b96edee84e..545c68c72e7 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -37,11 +37,12 @@ import warnings as _warnings try: - from ._generated_version import version as __version__ + from ._generated_version import version as __version__ \ + # type: ignore[unresolved-import] except ImportError: # Package is not installed, parse git tag at runtime try: - import setuptools_scm + import setuptools_scm # type: ignore[unresolved-import] # Code duplicated from setup.py to avoid a dependency on each other def parse_git(root, **kwargs): @@ -49,7 +50,7 @@ def parse_git(root, **kwargs): Parse function for setuptools_scm that ignores tags for non-C++ subprojects, e.g. apache-arrow-js-XXX tags. 
""" - from setuptools_scm.git import parse + from setuptools_scm.git import parse # type: ignore[unresolved-import] kwargs['describe_command'] = \ "git describe --dirty --tags --long --match 'apache-arrow-[0-9]*.*'" return parse(root, **kwargs) # type: ignore[missing-argument] diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index ad0b116fdc6..59fd775b5ac 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -36,7 +36,7 @@ import inspect try: import numpy as np except ImportError: - pass + np = None import warnings diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx index abfd011fa21..d279881d15f 100644 --- a/python/pyarrow/_dataset.pyx +++ b/python/pyarrow/_dataset.pyx @@ -42,7 +42,7 @@ from pyarrow._json cimport ReadOptions as JsonReadOptions try: import pyarrow.substrait as pa_substrait except ImportError: - pass + pa_substrait = None _DEFAULT_BATCH_SIZE = 2**17 @@ -89,7 +89,7 @@ def _get_parquet_classes(): try: import pyarrow._dataset_parquet as _dataset_pq except ImportError: - pass + _dataset_pq = None def _get_parquet_symbol(name): diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx index e17867426dc..9405b5d8c54 100644 --- a/python/pyarrow/_dataset_parquet.pyx +++ b/python/pyarrow/_dataset_parquet.pyx @@ -59,7 +59,7 @@ try: ) parquet_encryption_enabled = True except ImportError: - pass + parquet_encryption_enabled = False cdef Expression _true = Expression._scalar(True) diff --git a/python/pyarrow/_substrait.pyx b/python/pyarrow/_substrait.pyx index b317ba1e639..d9359c8e77d 100644 --- a/python/pyarrow/_substrait.pyx +++ b/python/pyarrow/_substrait.pyx @@ -29,7 +29,7 @@ from pyarrow.includes.libarrow_substrait cimport * try: import substrait as py_substrait except ImportError: - pass + py_substrait = None else: import substrait.proto # no-cython-lint diff --git a/python/pyarrow/cffi.py b/python/pyarrow/cffi.py index 3f5e748daf4..3ac74d6b17b 100644 --- a/python/pyarrow/cffi.py +++ b/python/pyarrow/cffi.py @@ -17,7 +17,7 @@ from __future__ import absolute_import -from cffi import FFI +from cffi import FFI # type: ignore[unresolved-import] c_source = """ struct ArrowSchema { diff --git a/python/pyarrow/conftest.py b/python/pyarrow/conftest.py index 563c98bfdc8..0c5f453fd13 100644 --- a/python/pyarrow/conftest.py +++ b/python/pyarrow/conftest.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. 
-import pytest +import pytest # type: ignore[unresolved-import] import os import pyarrow as pa @@ -114,7 +114,7 @@ defaults['timezone_data'] = os.path.exists("/usr/share/zoneinfo") try: - import cython # noqa + import cython # type: ignore[unresolved-import] # noqa defaults['cython'] = True except ImportError: pass @@ -156,13 +156,13 @@ pass try: - import pandas # noqa + import pandas # type: ignore[unresolved-import] # noqa defaults['pandas'] = True except ImportError: defaults['nopandas'] = True try: - import numpy # noqa + import numpy # type: ignore[unresolved-import] # noqa defaults['numpy'] = True except ImportError: defaults['nonumpy'] = True @@ -336,7 +336,7 @@ def unary_agg_func_fixture(): Register a unary aggregate function (mean) """ from pyarrow import compute as pc - import numpy as np + import numpy as np # type: ignore[unresolved-import] def func(ctx, x): return pa.scalar(np.nanmean(x)) @@ -362,7 +362,7 @@ def varargs_agg_func_fixture(): Register a unary aggregate function """ from pyarrow import compute as pc - import numpy as np + import numpy as np # type: ignore[unresolved-import] def func(ctx, *args): sum = 0.0 diff --git a/python/pyarrow/fs.py b/python/pyarrow/fs.py index c7f1b325c70..11fc0697c9d 100644 --- a/python/pyarrow/fs.py +++ b/python/pyarrow/fs.py @@ -111,7 +111,7 @@ def _ensure_filesystem(filesystem, *, use_mmap=False): else: # handle fsspec-compatible filesystems try: - import fsspec + import fsspec # type: ignore[unresolved-import] except ImportError: pass else: diff --git a/python/pyarrow/lib.pyx b/python/pyarrow/lib.pyx index 2da25a570ae..5dca6fd8d2e 100644 --- a/python/pyarrow/lib.pyx +++ b/python/pyarrow/lib.pyx @@ -24,7 +24,7 @@ import decimal as _pydecimal try: import numpy as np except ImportError: - pass + np = None import os import sys diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index 970126da64c..f4f79edc8bf 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -31,7 +31,7 @@ import warnings try: - import numpy as np + import numpy as np # type: ignore[unresolved-import] except ImportError: pass @@ -825,7 +825,7 @@ def table_to_dataframe( else: from pandas.core.internals import BlockManager \ # type: ignore[unresolved_import] - from pandas import DataFrame + from pandas import DataFrame # type: ignore[unresolved-import] blocks = [ _reconstruct_block(item, column_names, ext_columns_dtypes) diff --git a/python/pyarrow/util.py b/python/pyarrow/util.py index 5878d1f9026..5947da95b7b 100644 --- a/python/pyarrow/util.py +++ b/python/pyarrow/util.py @@ -238,7 +238,7 @@ def _download_urllib(url, out_path): def _download_requests(url, out_path): - import requests + import requests # type: ignore[unresolved-import] with requests.get(url) as response: with open(out_path, 'wb') as f: f.write(response.content) diff --git a/python/pyarrow/vendored/docscrape.py b/python/pyarrow/vendored/docscrape.py index 096ef245243..3fba2524a45 100644 --- a/python/pyarrow/vendored/docscrape.py +++ b/python/pyarrow/vendored/docscrape.py @@ -622,7 +622,7 @@ def __init__(self, cls, doc=None, modulename='', func_doc=FunctionDoc, self._cls = cls if 'sphinx' in sys.modules: - from sphinx.ext.autodoc import ALL + from sphinx.ext.autodoc import ALL # type: ignore[unresolved-import] else: ALL = object() diff --git a/python/pyproject.toml b/python/pyproject.toml index 94a0d9a6b4d..c5cdca80b2e 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -98,26 +98,6 @@ version_scheme = 'guess-next-dev' 
git_describe_command = 'git describe --dirty --tags --long --match "apache-arrow-[0-9]*.*"' fallback_version = '22.0.0a0' -[tool.ty.rules] -#invalid-argument-type = "ignore" -#invalid-assignment = "ignore" -#invalid-context-manager = "ignore" -#invalid-return-type = "ignore" -#invalid-type-form = "ignore" -#no-matching-overload = "ignore" -#non-subscriptable = "ignore" -#not-iterable = "ignore" -#possibly-unbound-attribute = "ignore" -#possibly-unbound-import = "ignore" -#too-many-positional-arguments = "ignore" -#unknown-argument = "ignore" -#unresolved-attribute = "ignore" -#unresolved-global = "ignore" -#unresolved-import = "ignore" -#unresolved-reference = "ignore" -#unsupported-operator = "ignore" -#missing-argument = "ignore" -#call-non-callable = "ignore" - [tool.ty.src] exclude = ["pyarrow/tests"] +include = ["pyarrow"]
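
Note on the recurring pattern in the test diffs above: optional dependencies (numpy, scipy, pydata sparse, pyarrow.substrait, pyarrow.dataset, Flight) are bound to None when their import fails, and the dependent tests are gated with pytest skip marks instead of per-line type-ignore comments; once `[tool.ty.src]` excludes `pyarrow/tests` (and the final patch narrows the checked sources to `include = ["pyarrow"]`), those ignores are no longer needed in test code. The following is a minimal sketch of that pattern, not part of the patch; the test name and values are illustrative only:

    import pytest

    try:
        import numpy as np
    except ImportError:
        np = None


    @pytest.mark.skipif(np is None, reason="requires numpy")
    def test_int64_roundtrip_with_numpy():
        import pyarrow as pa

        # Build an Arrow array from a numpy array and check the values survive.
        arr = pa.array(np.arange(3, dtype="int64"))
        assert arr.to_pylist() == [0, 1, 2]

The same shape is used for module-level objects in the diffs (for example `sparse = None` with `pytest.mark.skipif(not sparse, ...)` in test_sparse_tensor.py), keeping the skip decision at collection time rather than silencing the type checker inside the test body.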