Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
2b6b8c1
GH-33321: Support converting to non-nano datetime64 for pandas >= 2.0
May 17, 2023
2a4f19a
Separate implementation for date32 and date64
May 17, 2023
79f9518
Update types to return non-nano pandas dtypes
May 24, 2023
042f9e9
Typo in types.pxi
May 24, 2023
97e514f
Expose coerce_temporal_nanoseconds in PandasOptions for backwards com…
May 24, 2023
e883534
Maintain pandas dtype conversion for 1.x
May 24, 2023
d0c3492
Typos
May 24, 2023
cbd7d69
Fix duration typo, add is_v1() api to pandas shim
Jun 5, 2023
dbf7e13
Improve coerce_temporal_nanoseconds documentation
Jun 5, 2023
c4d2dd5
Fix typo, update test_pandas.py
Jun 6, 2023
5602d52
Fix most tests, revert ARROW-18088
Jun 6, 2023
d5022c3
Lint
Jun 6, 2023
d43ea6c
Fix test case for pandas 1.0
Jun 6, 2023
db5a6d4
Lint C++
Jun 6, 2023
4ccc9a5
Fix some dataset tests
Jun 7, 2023
8b4c51e
Fix abort
Jun 7, 2023
828390b
Add TZ types for s/ms/us, fix abort by adding date64 to ms conversion
Jun 8, 2023
3651ecb
date32 converts to ms instead of second
Jun 8, 2023
072f190
Lint
Jun 8, 2023
348e9fb
Fix pandas 1.0 test
Jun 8, 2023
3fe24dd
Refactor tests, add tz to numpy test
Jun 8, 2023
778f4cc
Clean up tests, add comments
Jun 8, 2023
4329f43
Update python/pyarrow/src/arrow/python/arrow_to_pandas.cc
danepitkin Jun 8, 2023
1e63e3f
Address comments, add tests for coerce_temporal_nanoseconds arg, fix …
Jun 8, 2023
db1763f
Lint
Jun 8, 2023
f19be39
Fix doctest
Jun 8, 2023
c186430
Re-enable test case test_timestamp_to_pandas_out_of_bounds
Jun 8, 2023
424f9d0
Lint
Jun 8, 2023
80379ac
Templatize DatetimeTZWriter class
Jun 14, 2023
f551b3a
Address comments
Jun 26, 2023
d9caca7
Fix numpy Day unit regression
Jun 27, 2023
690f5b9
Add us, ns sample to existing parquet tests
Jun 27, 2023
713cbac
Lint
Jun 27, 2023
53cdeb7
Lint c++
Jun 27, 2023
c9c9a20
Lint c++ part 2
Jun 27, 2023
8696adf
Maintain backwards compatibility for older numpy versions
Jun 27, 2023
09d1f71
Coerce to ns in to_pandas_dtype for non-extension types
Jun 27, 2023
68a011d
Fail gracefully in _get_pandas_type
Jun 27, 2023
f8bb888
Fix pandas v1 to_pandas_dtype()
Jun 28, 2023
b8102cf
Fix pandas v1 test
Jun 28, 2023
6ffb5e5
Rebase
Jun 30, 2023
8509052
small fixes
jorisvandenbossche Jul 7, 2023
ed02603
GH-36537: [Python] Ensure dataset writer follows default Parquet vers…
jorisvandenbossche Jul 7, 2023
a6487c2
small test clean-up
jorisvandenbossche Jul 7, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion python/pyarrow/_dataset_parquet.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -531,6 +531,7 @@ cdef class ParquetFileWriteOptions(FileWriteOptions):
"use_deprecated_int96_timestamps",
"coerce_timestamps",
"allow_truncated_timestamps",
"use_compliant_nested_type",
}

setters = set()
Expand Down Expand Up @@ -586,7 +587,7 @@ cdef class ParquetFileWriteOptions(FileWriteOptions):
self._properties = dict(
use_dictionary=True,
compression="snappy",
version="1.0",
version="2.6",
write_statistics=None,
data_page_size=None,
compression_level=None,
Expand All @@ -601,6 +602,11 @@ cdef class ParquetFileWriteOptions(FileWriteOptions):
self._set_properties()
self._set_arrow_properties()

def __repr__(self):
    """Return a debug representation listing all write properties as key=value pairs."""
    props = " ".join(f"{k}={v}" for k, v in self._properties.items())
    return f"<pyarrow.dataset.ParquetFileWriteOptions {props}>"


cdef set _PARQUET_READ_OPTIONS = {
'dictionary_columns', 'coerce_int96_timestamp_unit'
Expand Down
31 changes: 23 additions & 8 deletions python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -700,7 +700,8 @@ cdef class _PandasConvertible(_Weakrefable):
bint split_blocks=False,
bint self_destruct=False,
str maps_as_pydicts=None,
types_mapper=None
types_mapper=None,
bint coerce_temporal_nanoseconds=False
):
"""
Convert to a pandas-compatible NumPy array or DataFrame, as appropriate
Expand All @@ -721,12 +722,15 @@ cdef class _PandasConvertible(_Weakrefable):
integer_object_nulls : bool, default False
Cast integers with nulls to objects
date_as_object : bool, default True
Cast dates to objects. If False, convert to datetime64[ns] dtype.
Cast dates to objects. If False, convert to datetime64 dtype with
the equivalent time unit (if supported). Note: in pandas version
< 2.0, only datetime64[ns] conversion is supported.
timestamp_as_object : bool, default False
Cast non-nanosecond timestamps (np.datetime64) to objects. This is
useful if you have timestamps that don't fit in the normal date
range of nanosecond timestamps (1678 CE-2262 CE).
If False, all timestamps are converted to datetime64[ns] dtype.
useful in pandas version 1.x if you have timestamps that don't fit
in the normal date range of nanosecond timestamps (1678 CE-2262 CE).
Non-nanosecond timestamps are supported in pandas version 2.0.
If False, all timestamps are converted to datetime64 dtype.
use_threads : bool, default True
Whether to parallelize the conversion using multiple threads.
deduplicate_objects : bool, default True
Expand Down Expand Up @@ -775,6 +779,13 @@ cdef class _PandasConvertible(_Weakrefable):
expected to return a pandas ExtensionDtype or ``None`` if the
default conversion should be used for that type. If you have
a dictionary mapping, you can pass ``dict.get`` as function.
coerce_temporal_nanoseconds : bool, default False
Only applicable to pandas version >= 2.0.
A legacy option to coerce date32, date64, duration, and timestamp
time units to nanoseconds when converting to pandas. This is the
default behavior in pandas version 1.x. Set this option to True if
you'd like to use this coercion when using pandas version >= 2.0
for backwards compatibility (not recommended otherwise).

Returns
-------
Expand Down Expand Up @@ -850,7 +861,8 @@ cdef class _PandasConvertible(_Weakrefable):
safe=safe,
split_blocks=split_blocks,
self_destruct=self_destruct,
maps_as_pydicts=maps_as_pydicts
maps_as_pydicts=maps_as_pydicts,
coerce_temporal_nanoseconds=coerce_temporal_nanoseconds
)
return self._to_pandas(options, categories=categories,
ignore_metadata=ignore_metadata,
Expand All @@ -870,6 +882,7 @@ cdef PandasOptions _convert_pandas_options(dict options):
result.safe_cast = options['safe']
result.split_blocks = options['split_blocks']
result.self_destruct = options['self_destruct']
result.coerce_temporal_nanoseconds = options['coerce_temporal_nanoseconds']
result.ignore_timezone = os.environ.get('PYARROW_IGNORE_TIMEZONE', False)

maps_as_pydicts = options['maps_as_pydicts']
Expand Down Expand Up @@ -1525,6 +1538,7 @@ cdef class Array(_PandasConvertible):
# so it can't be done if the user requested a zero_copy.
c_options.decode_dictionaries = not zero_copy_only
c_options.zero_copy_only = zero_copy_only
c_options.to_numpy = True

with nogil:
check_status(ConvertArrayToPandas(c_options, self.sp_array,
Expand Down Expand Up @@ -1689,8 +1703,9 @@ cdef _array_like_to_pandas(obj, options, types_mapper):
arr = dtype.__from_arrow__(obj)
return pandas_api.series(arr, name=name, copy=False)

# ARROW-3789(wesm): Convert date/timestamp types to datetime64[ns]
c_options.coerce_temporal_nanoseconds = True
if pandas_api.is_v1():
# ARROW-3789: Coerce date/timestamp types to datetime64[ns]
c_options.coerce_temporal_nanoseconds = True

if isinstance(obj, Array):
with nogil:
Expand Down
1 change: 1 addition & 0 deletions python/pyarrow/includes/libarrow_python.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
c_bool decode_dictionaries
unordered_set[c_string] categorical_columns
unordered_set[c_string] extension_columns
c_bool to_numpy

cdef cppclass CSerializedPyObject" arrow::py::SerializedPyObject":
shared_ptr[CRecordBatch] batch
Expand Down
8 changes: 8 additions & 0 deletions python/pyarrow/pandas-shim.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ cdef class _PandasAPIShim(object):
object _array_like_types, _is_extension_array_dtype
bint has_sparse
bint _pd024
bint _is_v1

def __init__(self):
self._tried_importing_pandas = False
Expand All @@ -58,6 +59,7 @@ cdef class _PandasAPIShim(object):
self._pd = pd
self._version = pd.__version__
self._loose_version = Version(pd.__version__)
self._is_v1 = False

if self._loose_version < Version('1.0.0'):
self._have_pandas = False
Expand All @@ -72,6 +74,8 @@ cdef class _PandasAPIShim(object):
"installed. Therefore, pandas-specific integration is not "
"used.".format(self._version), stacklevel=2)
return
elif self._loose_version < Version('2.0.0'):
self._is_v1 = True

self._compat_module = pdcompat
self._data_frame = pd.DataFrame
Expand Down Expand Up @@ -150,6 +154,10 @@ cdef class _PandasAPIShim(object):
self._check_import()
return self._version

def is_v1(self):
    """Return True when the imported pandas is a 1.x release (< 2.0.0)."""
    # Ensure pandas has been imported and the shim state is populated.
    self._check_import()
    return self._is_v1

@property
def categorical_type(self):
self._check_import()
Expand Down
9 changes: 6 additions & 3 deletions python/pyarrow/pandas_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -714,7 +714,8 @@ def _reconstruct_block(item, columns=None, extension_columns=None):
ordered=item['ordered'])
block = _int.make_block(cat, placement=placement)
elif 'timezone' in item:
dtype = make_datetimetz(item['timezone'])
unit, _ = np.datetime_data(block_arr.dtype)
dtype = make_datetimetz(unit, item['timezone'])
block = _int.make_block(block_arr, placement=placement,
klass=_int.DatetimeTZBlock,
dtype=dtype)
Expand All @@ -738,9 +739,11 @@ def _reconstruct_block(item, columns=None, extension_columns=None):
return block


def make_datetimetz(tz):
def make_datetimetz(unit, tz):
    """Build a pandas tz-aware datetime dtype for the given time unit and zone.

    Parameters
    ----------
    unit : str
        Datetime resolution ('s', 'ms', 'us', 'ns'); forced to 'ns' on
        pandas 1.x, which only supports nanosecond tz-aware dtypes
        (ARROW-3789).
    tz : str
        Timezone name, converted to a tzinfo via pyarrow.
    """
    if _pandas_api.is_v1():
        unit = 'ns'
    tzinfo = pa.lib.string_to_tzinfo(tz)
    return _pandas_api.datetimetz_type(unit, tz=tzinfo)


def table_to_blockmanager(options, table, categories=None,
Expand Down
Loading