Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
2b6b8c1
GH-33321: Support converting to non-nano datetime64 for pandas >= 2.0
May 17, 2023
2a4f19a
Separate implementation for date32 and date64
May 17, 2023
79f9518
Update types to return non-nano pandas dtypes
May 24, 2023
042f9e9
Typo in types.pxi
May 24, 2023
97e514f
Expose coerce_temporal_nanoseconds in PandasOptions for backwards com…
May 24, 2023
e883534
Maintain pandas dtype conversion for 1.x
May 24, 2023
d0c3492
Typos
May 24, 2023
cbd7d69
Fix duration typo, add is_v1() api to pandas shim
Jun 5, 2023
dbf7e13
Improve coerce_temporal_nanoseconds documentation
Jun 5, 2023
c4d2dd5
Fix typo, update test_pandas.py
Jun 6, 2023
5602d52
Fix most tests, revert ARROW-18088
Jun 6, 2023
d5022c3
Lint
Jun 6, 2023
d43ea6c
Fix test case for pandas 1.0
Jun 6, 2023
db5a6d4
Lint C++
Jun 6, 2023
4ccc9a5
Fix some dataset tests
Jun 7, 2023
8b4c51e
Fix abort
Jun 7, 2023
828390b
Add TZ types for s/ms/us, fix abort by adding date64 to ms conversion
Jun 8, 2023
3651ecb
date32 converts to ms instead of second
Jun 8, 2023
072f190
Lint
Jun 8, 2023
348e9fb
Fix pandas 1.0 test
Jun 8, 2023
3fe24dd
Refactor tests, add tz to numpy test
Jun 8, 2023
778f4cc
Clean up tests, add comments
Jun 8, 2023
4329f43
Update python/pyarrow/src/arrow/python/arrow_to_pandas.cc
danepitkin Jun 8, 2023
1e63e3f
Address comments, add tests for coerce_temporal_nanoseconds arg, fix …
Jun 8, 2023
db1763f
Lint
Jun 8, 2023
f19be39
Fix doctest
Jun 8, 2023
c186430
Re-enable test case test_timestamp_to_pandas_out_of_bounds
Jun 8, 2023
424f9d0
Lint
Jun 8, 2023
80379ac
Templatize DatetimeTZWriter class
Jun 14, 2023
f551b3a
Address comments
Jun 26, 2023
d9caca7
Fix numpy Day unit regression
Jun 27, 2023
690f5b9
Add us, ns sample to existing parquet tests
Jun 27, 2023
713cbac
Lint
Jun 27, 2023
53cdeb7
Lint c++
Jun 27, 2023
c9c9a20
Lint c++ part 2
Jun 27, 2023
8696adf
Maintain backwards compatibility for older numpy versions
Jun 27, 2023
09d1f71
Coerce to ns in to_pandas_dtype for non-extension types
Jun 27, 2023
68a011d
Fail gracefully in _get_pandas_type
Jun 27, 2023
f8bb888
Fix pandas v1 to_pandas_dtype()
Jun 28, 2023
b8102cf
Fix pandas v1 test
Jun 28, 2023
6ffb5e5
Rebase
Jun 30, 2023
8509052
small fixes
jorisvandenbossche Jul 7, 2023
ed02603
GH-36537: [Python] Ensure dataset writer follows default Parquet vers…
jorisvandenbossche Jul 7, 2023
a6487c2
small test clean-up
jorisvandenbossche Jul 7, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion python/pyarrow/_dataset_parquet.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -531,6 +531,7 @@ cdef class ParquetFileWriteOptions(FileWriteOptions):
"use_deprecated_int96_timestamps",
"coerce_timestamps",
"allow_truncated_timestamps",
"use_compliant_nested_type",
}

setters = set()
Expand Down Expand Up @@ -586,7 +587,7 @@ cdef class ParquetFileWriteOptions(FileWriteOptions):
self._properties = dict(
use_dictionary=True,
compression="snappy",
version="1.0",
version="2.6",
write_statistics=None,
data_page_size=None,
compression_level=None,
Expand All @@ -601,6 +602,11 @@ cdef class ParquetFileWriteOptions(FileWriteOptions):
self._set_properties()
self._set_arrow_properties()

def __repr__(self):
    """Return a debug representation listing all write properties as key=value pairs."""
    props = " ".join(f"{k}={v}" for k, v in self._properties.items())
    return f"<pyarrow.dataset.ParquetFileWriteOptions {props}>"


cdef set _PARQUET_READ_OPTIONS = {
'dictionary_columns', 'coerce_int96_timestamp_unit'
Expand Down
31 changes: 23 additions & 8 deletions python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -700,7 +700,8 @@ cdef class _PandasConvertible(_Weakrefable):
bint split_blocks=False,
bint self_destruct=False,
str maps_as_pydicts=None,
types_mapper=None
types_mapper=None,
bint coerce_temporal_nanoseconds=False
):
"""
Convert to a pandas-compatible NumPy array or DataFrame, as appropriate
Expand All @@ -721,12 +722,15 @@ cdef class _PandasConvertible(_Weakrefable):
integer_object_nulls : bool, default False
Cast integers with nulls to objects
date_as_object : bool, default True
Cast dates to objects. If False, convert to datetime64[ns] dtype.
Cast dates to objects. If False, convert to datetime64 dtype with
the equivalent time unit (if supported). Note: in pandas version
< 2.0, only datetime64[ns] conversion is supported.
timestamp_as_object : bool, default False
Cast non-nanosecond timestamps (np.datetime64) to objects. This is
useful if you have timestamps that don't fit in the normal date
range of nanosecond timestamps (1678 CE-2262 CE).
If False, all timestamps are converted to datetime64[ns] dtype.
useful in pandas version 1.x if you have timestamps that don't fit
in the normal date range of nanosecond timestamps (1678 CE-2262 CE).
Non-nanosecond timestamps are supported in pandas version 2.0.
If False, all timestamps are converted to datetime64 dtype.
use_threads : bool, default True
Whether to parallelize the conversion using multiple threads.
deduplicate_objects : bool, default True
Expand Down Expand Up @@ -775,6 +779,13 @@ cdef class _PandasConvertible(_Weakrefable):
expected to return a pandas ExtensionDtype or ``None`` if the
default conversion should be used for that type. If you have
a dictionary mapping, you can pass ``dict.get`` as function.
coerce_temporal_nanoseconds : bool, default False
Only applicable to pandas version >= 2.0.
A legacy option to coerce date32, date64, duration, and timestamp
time units to nanoseconds when converting to pandas. This is the
default behavior in pandas version 1.x. Set this option to True if
you'd like to use this coercion when using pandas version >= 2.0
for backwards compatibility (not recommended otherwise).

Returns
-------
Expand Down Expand Up @@ -850,7 +861,8 @@ cdef class _PandasConvertible(_Weakrefable):
safe=safe,
split_blocks=split_blocks,
self_destruct=self_destruct,
maps_as_pydicts=maps_as_pydicts
maps_as_pydicts=maps_as_pydicts,
coerce_temporal_nanoseconds=coerce_temporal_nanoseconds
)
return self._to_pandas(options, categories=categories,
ignore_metadata=ignore_metadata,
Expand All @@ -870,6 +882,7 @@ cdef PandasOptions _convert_pandas_options(dict options):
result.safe_cast = options['safe']
result.split_blocks = options['split_blocks']
result.self_destruct = options['self_destruct']
result.coerce_temporal_nanoseconds = options['coerce_temporal_nanoseconds']
result.ignore_timezone = os.environ.get('PYARROW_IGNORE_TIMEZONE', False)

maps_as_pydicts = options['maps_as_pydicts']
Expand Down Expand Up @@ -1525,6 +1538,7 @@ cdef class Array(_PandasConvertible):
# so it can't be done if the user requested a zero_copy.
c_options.decode_dictionaries = not zero_copy_only
c_options.zero_copy_only = zero_copy_only
c_options.to_numpy = True

with nogil:
check_status(ConvertArrayToPandas(c_options, self.sp_array,
Expand Down Expand Up @@ -1689,8 +1703,9 @@ cdef _array_like_to_pandas(obj, options, types_mapper):
arr = dtype.__from_arrow__(obj)
return pandas_api.series(arr, name=name, copy=False)

# ARROW-3789(wesm): Convert date/timestamp types to datetime64[ns]
c_options.coerce_temporal_nanoseconds = True
if pandas_api.is_v1():
# ARROW-3789: Coerce date/timestamp types to datetime64[ns]
c_options.coerce_temporal_nanoseconds = True

if isinstance(obj, Array):
with nogil:
Expand Down
1 change: 1 addition & 0 deletions python/pyarrow/includes/libarrow_python.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
c_bool decode_dictionaries
unordered_set[c_string] categorical_columns
unordered_set[c_string] extension_columns
c_bool to_numpy

cdef cppclass CSerializedPyObject" arrow::py::SerializedPyObject":
shared_ptr[CRecordBatch] batch
Expand Down
8 changes: 8 additions & 0 deletions python/pyarrow/pandas-shim.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ cdef class _PandasAPIShim(object):
object _array_like_types, _is_extension_array_dtype
bint has_sparse
bint _pd024
bint _is_v1

def __init__(self):
self._tried_importing_pandas = False
Expand All @@ -58,6 +59,7 @@ cdef class _PandasAPIShim(object):
self._pd = pd
self._version = pd.__version__
self._loose_version = Version(pd.__version__)
self._is_v1 = False

if self._loose_version < Version('1.0.0'):
self._have_pandas = False
Expand All @@ -72,6 +74,8 @@ cdef class _PandasAPIShim(object):
"installed. Therefore, pandas-specific integration is not "
"used.".format(self._version), stacklevel=2)
return
elif self._loose_version < Version('2.0.0'):
self._is_v1 = True

self._compat_module = pdcompat
self._data_frame = pd.DataFrame
Expand Down Expand Up @@ -150,6 +154,10 @@ cdef class _PandasAPIShim(object):
self._check_import()
return self._version

def is_v1(self):
    """Return True when the imported pandas is a 1.x release (< 2.0.0)."""
    # Ensure pandas has been imported and the shim state is populated.
    self._check_import()
    return self._is_v1

@property
def categorical_type(self):
self._check_import()
Expand Down
9 changes: 6 additions & 3 deletions python/pyarrow/pandas_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -714,7 +714,8 @@ def _reconstruct_block(item, columns=None, extension_columns=None):
ordered=item['ordered'])
block = _int.make_block(cat, placement=placement)
elif 'timezone' in item:
dtype = make_datetimetz(item['timezone'])
unit, _ = np.datetime_data(block_arr.dtype)
dtype = make_datetimetz(unit, item['timezone'])
block = _int.make_block(block_arr, placement=placement,
klass=_int.DatetimeTZBlock,
dtype=dtype)
Expand All @@ -738,9 +739,11 @@ def _reconstruct_block(item, columns=None, extension_columns=None):
return block


def make_datetimetz(tz):
def make_datetimetz(unit, tz):
    """Build a pandas tz-aware datetime dtype for the given time unit and zone.

    Parameters
    ----------
    unit : str
        Datetime resolution ('s', 'ms', 'us', 'ns'); forced to 'ns' on
        pandas 1.x, which only supports nanosecond tz-aware dtypes
        (ARROW-3789).
    tz : str
        Timezone name, converted to a tzinfo via pyarrow.
    """
    if _pandas_api.is_v1():
        unit = 'ns'
    tzinfo = pa.lib.string_to_tzinfo(tz)
    return _pandas_api.datetimetz_type(unit, tz=tzinfo)


def table_to_blockmanager(options, table, categories=None,
Expand Down
Loading