diff --git a/python/pyarrow/_dataset_parquet.pyx b/python/pyarrow/_dataset_parquet.pyx index ad914c77bf3..bc4786b9cd6 100644 --- a/python/pyarrow/_dataset_parquet.pyx +++ b/python/pyarrow/_dataset_parquet.pyx @@ -531,6 +531,7 @@ cdef class ParquetFileWriteOptions(FileWriteOptions): "use_deprecated_int96_timestamps", "coerce_timestamps", "allow_truncated_timestamps", + "use_compliant_nested_type", } setters = set() @@ -586,7 +587,7 @@ cdef class ParquetFileWriteOptions(FileWriteOptions): self._properties = dict( use_dictionary=True, compression="snappy", - version="1.0", + version="2.6", write_statistics=None, data_page_size=None, compression_level=None, @@ -601,6 +602,11 @@ cdef class ParquetFileWriteOptions(FileWriteOptions): self._set_properties() self._set_arrow_properties() + def __repr__(self): + return "".format( + " ".join([f"{key}={value}" for key, value in self._properties.items()]) + ) + cdef set _PARQUET_READ_OPTIONS = { 'dictionary_columns', 'coerce_int96_timestamp_unit' diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index b704da73606..2f8959cd721 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -700,7 +700,8 @@ cdef class _PandasConvertible(_Weakrefable): bint split_blocks=False, bint self_destruct=False, str maps_as_pydicts=None, - types_mapper=None + types_mapper=None, + bint coerce_temporal_nanoseconds=False ): """ Convert to a pandas-compatible NumPy array or DataFrame, as appropriate @@ -721,12 +722,15 @@ cdef class _PandasConvertible(_Weakrefable): integer_object_nulls : bool, default False Cast integers with nulls to objects date_as_object : bool, default True - Cast dates to objects. If False, convert to datetime64[ns] dtype. + Cast dates to objects. If False, convert to datetime64 dtype with + the equivalent time unit (if supported). Note: in pandas version + < 2.0, only datetime64[ns] conversion is supported. timestamp_as_object : bool, default False Cast non-nanosecond timestamps (np.datetime64) to objects. This is - useful if you have timestamps that don't fit in the normal date - range of nanosecond timestamps (1678 CE-2262 CE). - If False, all timestamps are converted to datetime64[ns] dtype. + useful in pandas version 1.x if you have timestamps that don't fit + in the normal date range of nanosecond timestamps (1678 CE-2262 CE). + Non-nanosecond timestamps are supported in pandas version 2.0. + If False, all timestamps are converted to datetime64 dtype. use_threads : bool, default True Whether to parallelize the conversion using multiple threads. deduplicate_objects : bool, default True @@ -775,6 +779,13 @@ cdef class _PandasConvertible(_Weakrefable): expected to return a pandas ExtensionDtype or ``None`` if the default conversion should be used for that type. If you have a dictionary mapping, you can pass ``dict.get`` as function. + coerce_temporal_nanoseconds : bool, default False + Only applicable to pandas version >= 2.0. + A legacy option to coerce date32, date64, duration, and timestamp + time units to nanoseconds when converting to pandas. This is the + default behavior in pandas version 1.x. Set this option to True if + you'd like to use this coercion when using pandas version >= 2.0 + for backwards compatibility (not recommended otherwise). 
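# A minimal sketch of the flag documented above (not part of the patch), assuming
# pandas >= 2.0 and the unit-preserving conversion defaults introduced by this change:
import pyarrow as pa

arr = pa.array([1, 2, 3], type=pa.timestamp("ms"))
print(arr.to_pandas().dtype)                                   # datetime64[ms] under pandas >= 2.0
print(arr.to_pandas(coerce_temporal_nanoseconds=True).dtype)   # datetime64[ns], the legacy 1.x behavior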
Returns ------- @@ -850,7 +861,8 @@ cdef class _PandasConvertible(_Weakrefable): safe=safe, split_blocks=split_blocks, self_destruct=self_destruct, - maps_as_pydicts=maps_as_pydicts + maps_as_pydicts=maps_as_pydicts, + coerce_temporal_nanoseconds=coerce_temporal_nanoseconds ) return self._to_pandas(options, categories=categories, ignore_metadata=ignore_metadata, @@ -870,6 +882,7 @@ cdef PandasOptions _convert_pandas_options(dict options): result.safe_cast = options['safe'] result.split_blocks = options['split_blocks'] result.self_destruct = options['self_destruct'] + result.coerce_temporal_nanoseconds = options['coerce_temporal_nanoseconds'] result.ignore_timezone = os.environ.get('PYARROW_IGNORE_TIMEZONE', False) maps_as_pydicts = options['maps_as_pydicts'] @@ -1525,6 +1538,7 @@ cdef class Array(_PandasConvertible): # so it can't be done if the user requested a zero_copy. c_options.decode_dictionaries = not zero_copy_only c_options.zero_copy_only = zero_copy_only + c_options.to_numpy = True with nogil: check_status(ConvertArrayToPandas(c_options, self.sp_array, @@ -1689,8 +1703,9 @@ cdef _array_like_to_pandas(obj, options, types_mapper): arr = dtype.__from_arrow__(obj) return pandas_api.series(arr, name=name, copy=False) - # ARROW-3789(wesm): Convert date/timestamp types to datetime64[ns] - c_options.coerce_temporal_nanoseconds = True + if pandas_api.is_v1(): + # ARROW-3789: Coerce date/timestamp types to datetime64[ns] + c_options.coerce_temporal_nanoseconds = True if isinstance(obj, Array): with nogil: diff --git a/python/pyarrow/includes/libarrow_python.pxd b/python/pyarrow/includes/libarrow_python.pxd index 2052600c9f3..f08fcaa40d1 100644 --- a/python/pyarrow/includes/libarrow_python.pxd +++ b/python/pyarrow/includes/libarrow_python.pxd @@ -197,6 +197,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: c_bool decode_dictionaries unordered_set[c_string] categorical_columns unordered_set[c_string] extension_columns + c_bool to_numpy cdef cppclass CSerializedPyObject" arrow::py::SerializedPyObject": shared_ptr[CRecordBatch] batch diff --git a/python/pyarrow/pandas-shim.pxi b/python/pyarrow/pandas-shim.pxi index 7dc5d590a72..a0c0cabf6d3 100644 --- a/python/pyarrow/pandas-shim.pxi +++ b/python/pyarrow/pandas-shim.pxi @@ -37,6 +37,7 @@ cdef class _PandasAPIShim(object): object _array_like_types, _is_extension_array_dtype bint has_sparse bint _pd024 + bint _is_v1 def __init__(self): self._tried_importing_pandas = False @@ -58,6 +59,7 @@ cdef class _PandasAPIShim(object): self._pd = pd self._version = pd.__version__ self._loose_version = Version(pd.__version__) + self._is_v1 = False if self._loose_version < Version('1.0.0'): self._have_pandas = False @@ -72,6 +74,8 @@ cdef class _PandasAPIShim(object): "installed. 
Therefore, pandas-specific integration is not " "used.".format(self._version), stacklevel=2) return + elif self._loose_version < Version('2.0.0'): + self._is_v1 = True self._compat_module = pdcompat self._data_frame = pd.DataFrame @@ -150,6 +154,10 @@ cdef class _PandasAPIShim(object): self._check_import() return self._version + def is_v1(self): + self._check_import() + return self._is_v1 + @property def categorical_type(self): self._check_import() diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index 5369677e87b..12f1cc43129 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -714,7 +714,8 @@ def _reconstruct_block(item, columns=None, extension_columns=None): ordered=item['ordered']) block = _int.make_block(cat, placement=placement) elif 'timezone' in item: - dtype = make_datetimetz(item['timezone']) + unit, _ = np.datetime_data(block_arr.dtype) + dtype = make_datetimetz(unit, item['timezone']) block = _int.make_block(block_arr, placement=placement, klass=_int.DatetimeTZBlock, dtype=dtype) @@ -738,9 +739,11 @@ def _reconstruct_block(item, columns=None, extension_columns=None): return block -def make_datetimetz(tz): +def make_datetimetz(unit, tz): + if _pandas_api.is_v1(): + unit = 'ns' # ARROW-3789: Coerce date/timestamp types to datetime64[ns] tz = pa.lib.string_to_tzinfo(tz) - return _pandas_api.datetimetz_type('ns', tz=tz) + return _pandas_api.datetimetz_type(unit, tz=tz) def table_to_blockmanager(options, table, categories=None, diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc index 2cd6f5c26de..91c7b8a4571 100644 --- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc +++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc @@ -343,6 +343,9 @@ class PandasWriter { DATETIME_MILLI, DATETIME_MICRO, DATETIME_NANO, + DATETIME_SECOND_TZ, + DATETIME_MILLI_TZ, + DATETIME_MICRO_TZ, DATETIME_NANO_TZ, TIMEDELTA_SECOND, TIMEDELTA_MILLI, @@ -1488,7 +1491,7 @@ class BoolWriter : public TypedPandasWriter { // Date / timestamp types template -inline void ConvertDatetimeLikeNanos(const ChunkedArray& data, int64_t* out_values) { +inline void ConvertDatetime(const ChunkedArray& data, int64_t* out_values) { for (int c = 0; c < data.num_chunks(); c++) { const auto& arr = *data.chunk(c); const T* in_values = GetPrimitiveValues(arr); @@ -1570,7 +1573,30 @@ class DatetimeWriter : public TypedPandasWriter { }; using DatetimeSecondWriter = DatetimeWriter; -using DatetimeMilliWriter = DatetimeWriter; + +class DatetimeMilliWriter : public DatetimeWriter { + public: + using DatetimeWriter::DatetimeWriter; + + Status CopyInto(std::shared_ptr data, int64_t rel_placement) override { + Type::type type = data->type()->id(); + int64_t* out_values = this->GetBlockColumnStart(rel_placement); + if (type == Type::DATE32) { + // Convert from days since epoch to datetime64[ms] + ConvertDatetime(*data, out_values); + } else if (type == Type::DATE64) { + ConvertNumericNullable(*data, kPandasTimestampNull, out_values); + } else { + const auto& ts_type = checked_cast(*data->type()); + DCHECK_EQ(TimeUnit::MILLI, ts_type.unit()) + << "Should only call instances of this writer " + << "with arrays of the correct unit"; + ConvertNumericNullable(*data, kPandasTimestampNull, out_values); + } + return Status::OK(); + } +}; + using DatetimeMicroWriter = DatetimeWriter; class DatetimeNanoWriter : public DatetimeWriter { @@ -1592,11 +1618,11 @@ class DatetimeNanoWriter : public DatetimeWriter { if (type == 
Type::DATE32) { // Convert from days since epoch to datetime64[ns] - ConvertDatetimeLikeNanos(*data, out_values); + ConvertDatetime(*data, out_values); } else if (type == Type::DATE64) { // Date64Type is millisecond timestamp stored as int64_t // TODO(wesm): Do we want to make sure to zero out the milliseconds? - ConvertDatetimeLikeNanos(*data, out_values); + ConvertDatetime(*data, out_values); } else if (type == Type::TIMESTAMP) { const auto& ts_type = checked_cast(*data->type()); @@ -1619,16 +1645,17 @@ class DatetimeNanoWriter : public DatetimeWriter { } }; -class DatetimeTZWriter : public DatetimeNanoWriter { +template +class DatetimeTZWriter : public BASE { public: DatetimeTZWriter(const PandasOptions& options, const std::string& timezone, int64_t num_rows) - : DatetimeNanoWriter(options, num_rows, 1), timezone_(timezone) {} + : BASE(options, num_rows, 1), timezone_(timezone) {} protected: Status GetResultBlock(PyObject** out) override { - RETURN_NOT_OK(MakeBlock1D()); - *out = block_arr_.obj(); + RETURN_NOT_OK(this->MakeBlock1D()); + *out = this->block_arr_.obj(); return Status::OK(); } @@ -1645,6 +1672,11 @@ class DatetimeTZWriter : public DatetimeNanoWriter { std::string timezone_; }; +using DatetimeSecondTZWriter = DatetimeTZWriter; +using DatetimeMilliTZWriter = DatetimeTZWriter; +using DatetimeMicroTZWriter = DatetimeTZWriter; +using DatetimeNanoTZWriter = DatetimeTZWriter; + template class TimedeltaWriter : public TypedPandasWriter { public: @@ -1690,11 +1722,11 @@ class TimedeltaNanoWriter : public TimedeltaWriter { if (ts_type.unit() == TimeUnit::NANO) { ConvertNumericNullable(*data, kPandasTimestampNull, out_values); } else if (ts_type.unit() == TimeUnit::MICRO) { - ConvertDatetimeLikeNanos(*data, out_values); + ConvertDatetime(*data, out_values); } else if (ts_type.unit() == TimeUnit::MILLI) { - ConvertDatetimeLikeNanos(*data, out_values); + ConvertDatetime(*data, out_values); } else if (ts_type.unit() == TimeUnit::SECOND) { - ConvertDatetimeLikeNanos(*data, out_values); + ConvertDatetime(*data, out_values); } else { return Status::NotImplemented("Unsupported time unit"); } @@ -1945,6 +1977,12 @@ Status MakeWriter(const PandasOptions& options, PandasWriter::type writer_type, *writer = std::make_shared>(options, num_rows); \ break; +#define TZ_CASE(NAME, TYPE) \ + case PandasWriter::NAME: { \ + const auto& ts_type = checked_cast(type); \ + *writer = std::make_shared(options, ts_type.timezone(), num_rows); \ + } break; + switch (writer_type) { case PandasWriter::CATEGORICAL: { const auto& index_type = *checked_cast(type).index_type(); @@ -1991,10 +2029,10 @@ Status MakeWriter(const PandasOptions& options, PandasWriter::type writer_type, BLOCK_CASE(TIMEDELTA_MILLI, TimedeltaMilliWriter); BLOCK_CASE(TIMEDELTA_MICRO, TimedeltaMicroWriter); BLOCK_CASE(TIMEDELTA_NANO, TimedeltaNanoWriter); - case PandasWriter::DATETIME_NANO_TZ: { - const auto& ts_type = checked_cast(type); - *writer = std::make_shared(options, ts_type.timezone(), num_rows); - } break; + TZ_CASE(DATETIME_SECOND_TZ, DatetimeSecondTZWriter); + TZ_CASE(DATETIME_MILLI_TZ, DatetimeMilliTZWriter); + TZ_CASE(DATETIME_MICRO_TZ, DatetimeMicroTZWriter); + TZ_CASE(DATETIME_NANO_TZ, DatetimeNanoTZWriter); default: return Status::NotImplemented("Unsupported block type"); } @@ -2057,13 +2095,25 @@ static Status GetPandasWriterType(const ChunkedArray& data, const PandasOptions& case Type::INTERVAL_MONTH_DAY_NANO: // fall through *output_type = PandasWriter::OBJECT; break; - case Type::DATE32: // fall through + case Type::DATE32: 
+ if (options.date_as_object) { + *output_type = PandasWriter::OBJECT; + } else if (options.coerce_temporal_nanoseconds) { + *output_type = PandasWriter::DATETIME_NANO; + } else if (options.to_numpy) { + // Numpy supports Day, but Pandas does not + *output_type = PandasWriter::DATETIME_DAY; + } else { + *output_type = PandasWriter::DATETIME_MILLI; + } + break; case Type::DATE64: if (options.date_as_object) { *output_type = PandasWriter::OBJECT; + } else if (options.coerce_temporal_nanoseconds) { + *output_type = PandasWriter::DATETIME_NANO; } else { - *output_type = options.coerce_temporal_nanoseconds ? PandasWriter::DATETIME_NANO - : PandasWriter::DATETIME_DAY; + *output_type = PandasWriter::DATETIME_MILLI; } break; case Type::TIMESTAMP: { @@ -2072,24 +2122,43 @@ static Status GetPandasWriterType(const ChunkedArray& data, const PandasOptions& // Nanoseconds are never out of bounds for pandas, so in that case // we don't convert to object *output_type = PandasWriter::OBJECT; - } else if (!ts_type.timezone().empty()) { - *output_type = PandasWriter::DATETIME_NANO_TZ; } else if (options.coerce_temporal_nanoseconds) { - *output_type = PandasWriter::DATETIME_NANO; + if (!ts_type.timezone().empty()) { + *output_type = PandasWriter::DATETIME_NANO_TZ; + } else { + *output_type = PandasWriter::DATETIME_NANO; + } } else { - switch (ts_type.unit()) { - case TimeUnit::SECOND: - *output_type = PandasWriter::DATETIME_SECOND; - break; - case TimeUnit::MILLI: - *output_type = PandasWriter::DATETIME_MILLI; - break; - case TimeUnit::MICRO: - *output_type = PandasWriter::DATETIME_MICRO; - break; - case TimeUnit::NANO: - *output_type = PandasWriter::DATETIME_NANO; - break; + if (!ts_type.timezone().empty()) { + switch (ts_type.unit()) { + case TimeUnit::SECOND: + *output_type = PandasWriter::DATETIME_SECOND_TZ; + break; + case TimeUnit::MILLI: + *output_type = PandasWriter::DATETIME_MILLI_TZ; + break; + case TimeUnit::MICRO: + *output_type = PandasWriter::DATETIME_MICRO_TZ; + break; + case TimeUnit::NANO: + *output_type = PandasWriter::DATETIME_NANO_TZ; + break; + } + } else { + switch (ts_type.unit()) { + case TimeUnit::SECOND: + *output_type = PandasWriter::DATETIME_SECOND; + break; + case TimeUnit::MILLI: + *output_type = PandasWriter::DATETIME_MILLI; + break; + case TimeUnit::MICRO: + *output_type = PandasWriter::DATETIME_MICRO; + break; + case TimeUnit::NANO: + *output_type = PandasWriter::DATETIME_NANO; + break; + } } } } break; @@ -2243,6 +2312,9 @@ class ConsolidatedBlockCreator : public PandasBlockCreator { int block_placement = 0; std::shared_ptr writer; if (output_type == PandasWriter::CATEGORICAL || + output_type == PandasWriter::DATETIME_SECOND_TZ || + output_type == PandasWriter::DATETIME_MILLI_TZ || + output_type == PandasWriter::DATETIME_MICRO_TZ || output_type == PandasWriter::DATETIME_NANO_TZ || output_type == PandasWriter::EXTENSION) { RETURN_NOT_OK(MakeWriter(options_, output_type, type, num_rows_, @@ -2278,6 +2350,9 @@ class ConsolidatedBlockCreator : public PandasBlockCreator { PandasWriter::type output_type = this->column_types_[i]; switch (output_type) { case PandasWriter::CATEGORICAL: + case PandasWriter::DATETIME_SECOND_TZ: + case PandasWriter::DATETIME_MILLI_TZ: + case PandasWriter::DATETIME_MICRO_TZ: case PandasWriter::DATETIME_NANO_TZ: case PandasWriter::EXTENSION: { auto it = this->singleton_blocks_.find(i); diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.h b/python/pyarrow/src/arrow/python/arrow_to_pandas.h index 1da88961d37..82e0a600513 100644 --- 
a/python/pyarrow/src/arrow/python/arrow_to_pandas.h +++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.h @@ -117,6 +117,10 @@ struct PandasOptions { // Columns that should be passed through to be converted to // ExtensionArray/Block std::unordered_set extension_columns; + + // Used internally to decipher between to_numpy() and to_pandas() when + // the expected output differs + bool to_numpy = false; }; ARROW_PYTHON_EXPORT diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index fd5ba263d24..238fdb86bcc 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -480,6 +480,8 @@ cdef class ChunkedArray(_PandasConvertible): PandasOptions c_options object values + c_options.to_numpy = True + with nogil: check_status( ConvertChunkedArrayToPandas( @@ -2981,8 +2983,9 @@ def table_to_blocks(options, Table table, categories, extension_columns): c_options.extension_columns = {tobytes(col) for col in extension_columns} - # ARROW-3789(wesm); Convert date/timestamp types to datetime64[ns] - c_options.coerce_temporal_nanoseconds = True + if pandas_api.is_v1(): + # ARROW-3789: Coerce date/timestamp types to datetime64[ns] + c_options.coerce_temporal_nanoseconds = True if c_options.self_destruct: # Move the shared_ptr, table is now unsafe to use further diff --git a/python/pyarrow/tests/parquet/common.py b/python/pyarrow/tests/parquet/common.py index f4e609c6ff1..4401d3ca6bb 100644 --- a/python/pyarrow/tests/parquet/common.py +++ b/python/pyarrow/tests/parquet/common.py @@ -150,8 +150,7 @@ def make_sample_file(table_or_df): a_table = pa.Table.from_pandas(table_or_df) buf = io.BytesIO() - _write_table(a_table, buf, compression='SNAPPY', version='2.6', - coerce_timestamps='ms') + _write_table(a_table, buf, compression='SNAPPY', version='2.6') buf.seek(0) return pq.ParquetFile(buf) @@ -173,11 +172,13 @@ def alltypes_sample(size=10000, seed=0, categorical=False): 'float32': np.arange(size, dtype=np.float32), 'float64': np.arange(size, dtype=np.float64), 'bool': np.random.randn(size) > 0, - # TODO(wesm): Test other timestamp resolutions now that arrow supports - # them - 'datetime': np.arange("2016-01-01T00:00:00.001", size, - dtype='datetime64[ms]').astype('datetime64[ns]'), - 'timedelta': np.arange(0, size, dtype="timedelta64[ns]"), + 'datetime_ms': np.arange("2016-01-01T00:00:00.001", size, + dtype='datetime64[ms]'), + 'datetime_us': np.arange("2016-01-01T00:00:00.000001", size, + dtype='datetime64[us]'), + 'datetime_ns': np.arange("2016-01-01T00:00:00.000000001", size, + dtype='datetime64[ns]'), + 'timedelta': np.arange(0, size, dtype="timedelta64[s]"), 'str': pd.Series([str(x) for x in range(size)]), 'empty_str': [''] * size, 'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None], diff --git a/python/pyarrow/tests/parquet/test_data_types.py b/python/pyarrow/tests/parquet/test_data_types.py index 109d82831c5..32fe128bbae 100644 --- a/python/pyarrow/tests/parquet/test_data_types.py +++ b/python/pyarrow/tests/parquet/test_data_types.py @@ -64,7 +64,7 @@ def test_parquet_2_0_roundtrip(tempdir, chunk_size, use_legacy_dataset): assert arrow_table.schema.pandas_metadata is not None _write_table(arrow_table, filename, version='2.6', - coerce_timestamps='ms', chunk_size=chunk_size) + chunk_size=chunk_size) table_read = pq.read_pandas( filename, use_legacy_dataset=use_legacy_dataset) assert table_read.schema.pandas_metadata is not None diff --git a/python/pyarrow/tests/parquet/test_dataset.py b/python/pyarrow/tests/parquet/test_dataset.py index c9a0c63eb11..cd991617c9f 
100644 --- a/python/pyarrow/tests/parquet/test_dataset.py +++ b/python/pyarrow/tests/parquet/test_dataset.py @@ -1262,13 +1262,14 @@ def _test_write_to_dataset_with_partitions(base_path, import pyarrow.parquet as pq # ARROW-1400 - output_df = pd.DataFrame({'group1': list('aaabbbbccc'), - 'group2': list('eefeffgeee'), - 'num': list(range(10)), - 'nan': [np.nan] * 10, - 'date': np.arange('2017-01-01', '2017-01-11', - dtype='datetime64[D]')}) - output_df["date"] = output_df["date"].astype('datetime64[ns]') + output_df = pd.DataFrame({ + 'group1': list('aaabbbbccc'), + 'group2': list('eefeffgeee'), + 'num': list(range(10)), + 'nan': [np.nan] * 10, + 'date': np.arange('2017-01-01', '2017-01-11', dtype='datetime64[D]').astype( + 'datetime64[ns]') + }) cols = output_df.columns.tolist() partition_by = ['group1', 'group2'] output_table = pa.Table.from_pandas(output_df, schema=schema, safe=False, @@ -1313,6 +1314,11 @@ def _test_write_to_dataset_with_partitions(base_path, # Partitioned columns become 'categorical' dtypes for col in partition_by: output_df[col] = output_df[col].astype('category') + + if schema: + expected_date_type = schema.field_by_name('date').type.to_pandas_dtype() + output_df["date"] = output_df["date"].astype(expected_date_type) + tm.assert_frame_equal(output_df, input_df) @@ -1324,12 +1330,13 @@ def _test_write_to_dataset_no_partitions(base_path, import pyarrow.parquet as pq # ARROW-1400 - output_df = pd.DataFrame({'group1': list('aaabbbbccc'), - 'group2': list('eefeffgeee'), - 'num': list(range(10)), - 'date': np.arange('2017-01-01', '2017-01-11', - dtype='datetime64[D]')}) - output_df["date"] = output_df["date"].astype('datetime64[ns]') + output_df = pd.DataFrame({ + 'group1': list('aaabbbbccc'), + 'group2': list('eefeffgeee'), + 'num': list(range(10)), + 'date': np.arange('2017-01-01', '2017-01-11', dtype='datetime64[D]').astype( + 'datetime64[ns]') + }) cols = output_df.columns.tolist() output_table = pa.Table.from_pandas(output_df) @@ -1355,7 +1362,7 @@ def _test_write_to_dataset_no_partitions(base_path, input_df = input_table.to_pandas() input_df = input_df.drop_duplicates() input_df = input_df[cols] - assert output_df.equals(input_df) + tm.assert_frame_equal(output_df, input_df) @pytest.mark.pandas @@ -1458,7 +1465,6 @@ def test_write_to_dataset_with_partitions_and_custom_filenames( 'nan': [np.nan] * 10, 'date': np.arange('2017-01-01', '2017-01-11', dtype='datetime64[D]')}) - output_df["date"] = output_df["date"].astype('datetime64[ns]') partition_by = ['group1', 'group2'] output_table = pa.Table.from_pandas(output_df) path = str(tempdir) diff --git a/python/pyarrow/tests/parquet/test_datetime.py b/python/pyarrow/tests/parquet/test_datetime.py index 1cad82b8398..f97c451df7a 100644 --- a/python/pyarrow/tests/parquet/test_datetime.py +++ b/python/pyarrow/tests/parquet/test_datetime.py @@ -50,9 +50,11 @@ @pytest.mark.pandas @parametrize_legacy_dataset def test_pandas_parquet_datetime_tz(use_legacy_dataset): - s = pd.Series([datetime.datetime(2017, 9, 6)]) + # Pandas v2 defaults to [ns], but Arrow defaults to [us] time units + # so we need to cast the pandas dtype. Pandas v1 will always silently + # coerce to [ns] due to lack of non-[ns] support. 
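# A hedged illustration of the unit mismatch noted above (assumes pandas >= 2.0):
# pandas infers nanoseconds for a Series built from Python datetimes, while Arrow
# infers microseconds for the same values, hence the explicit dtype in the test.
import datetime
import pandas as pd
import pyarrow as pa

print(pd.Series([datetime.datetime(2017, 9, 6)]).dtype)   # datetime64[ns]
print(pa.array([datetime.datetime(2017, 9, 6)]).type)     # timestamp[us]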
+ s = pd.Series([datetime.datetime(2017, 9, 6)], dtype='datetime64[us]') s = s.dt.tz_localize('utc') - s.index = s # Both a column and an index to hit both use cases @@ -64,7 +66,7 @@ def test_pandas_parquet_datetime_tz(use_legacy_dataset): arrow_table = pa.Table.from_pandas(df) - _write_table(arrow_table, f, coerce_timestamps='ms') + _write_table(arrow_table, f) f.seek(0) table_read = pq.read_pandas(f, use_legacy_dataset=use_legacy_dataset) @@ -153,7 +155,7 @@ def test_coerce_timestamps_truncated(tempdir): df_ms = table_ms.to_pandas() arrays_expected = {'datetime64': [dt_ms, dt_ms]} - df_expected = pd.DataFrame(arrays_expected) + df_expected = pd.DataFrame(arrays_expected, dtype='datetime64[ms]') tm.assert_frame_equal(df_expected, df_ms) diff --git a/python/pyarrow/tests/parquet/test_pandas.py b/python/pyarrow/tests/parquet/test_pandas.py index 6bd68e08fc5..0ed305bff19 100644 --- a/python/pyarrow/tests/parquet/test_pandas.py +++ b/python/pyarrow/tests/parquet/test_pandas.py @@ -59,7 +59,7 @@ def test_pandas_parquet_custom_metadata(tempdir): arrow_table = pa.Table.from_pandas(df) assert b'pandas' in arrow_table.schema.metadata - _write_table(arrow_table, filename, version='2.6', coerce_timestamps='ms') + _write_table(arrow_table, filename) metadata = pq.read_metadata(filename).metadata assert b'pandas' in metadata @@ -113,7 +113,7 @@ def test_pandas_parquet_column_multiindex(tempdir, use_legacy_dataset): arrow_table = pa.Table.from_pandas(df) assert arrow_table.schema.pandas_metadata is not None - _write_table(arrow_table, filename, version='2.6', coerce_timestamps='ms') + _write_table(arrow_table, filename) table_read = pq.read_pandas( filename, use_legacy_dataset=use_legacy_dataset) @@ -136,7 +136,7 @@ def test_pandas_parquet_2_0_roundtrip_read_pandas_no_index_written( # While index_columns should be empty, columns needs to be filled still. assert js['columns'] - _write_table(arrow_table, filename, version='2.6', coerce_timestamps='ms') + _write_table(arrow_table, filename) table_read = pq.read_pandas( filename, use_legacy_dataset=use_legacy_dataset) @@ -344,7 +344,12 @@ def test_index_column_name_duplicate(tempdir, use_legacy_dataset): } } path = str(tempdir / 'data.parquet') - dfx = pd.DataFrame(data).set_index('time', drop=False) + + # Pandas v2 defaults to [ns], but Arrow defaults to [us] time units + # so we need to cast the pandas dtype. Pandas v1 will always silently + # coerce to [ns] due to lack of non-[ns] support. 
+ dfx = pd.DataFrame(data, dtype='datetime64[us]').set_index('time', drop=False) + tdfx = pa.Table.from_pandas(dfx) _write_table(tdfx, path) arrow_table = _read_table(path, use_legacy_dataset=use_legacy_dataset) diff --git a/python/pyarrow/tests/parquet/test_parquet_file.py b/python/pyarrow/tests/parquet/test_parquet_file.py index bd468949a84..9f920206a10 100644 --- a/python/pyarrow/tests/parquet/test_parquet_file.py +++ b/python/pyarrow/tests/parquet/test_parquet_file.py @@ -210,7 +210,7 @@ def test_iter_batches_columns_reader(tempdir, batch_size): filename = tempdir / 'pandas_roundtrip.parquet' arrow_table = pa.Table.from_pandas(df) _write_table(arrow_table, filename, version='2.6', - coerce_timestamps='ms', chunk_size=chunk_size) + chunk_size=chunk_size) file_ = pq.ParquetFile(filename) for columns in [df.columns[:10], df.columns[10:]]: @@ -234,7 +234,7 @@ def test_iter_batches_reader(tempdir, chunk_size): assert arrow_table.schema.pandas_metadata is not None _write_table(arrow_table, filename, version='2.6', - coerce_timestamps='ms', chunk_size=chunk_size) + chunk_size=chunk_size) file_ = pq.ParquetFile(filename) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index a9e8f09d1bf..ed29bf5cae6 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -223,8 +223,9 @@ def test_to_numpy_writable(): @pytest.mark.parametrize('unit', ['s', 'ms', 'us', 'ns']) -def test_to_numpy_datetime64(unit): - arr = pa.array([1, 2, 3], pa.timestamp(unit)) +@pytest.mark.parametrize('tz', [None, "UTC"]) +def test_to_numpy_datetime64(unit, tz): + arr = pa.array([1, 2, 3], pa.timestamp(unit, tz=tz)) expected = np.array([1, 2, 3], dtype="datetime64[{}]".format(unit)) np_arr = arr.to_numpy() np.testing.assert_array_equal(np_arr, expected) @@ -2165,12 +2166,15 @@ def test_pandas_null_sentinels_index(): assert result.equals(expected) -def test_array_from_numpy_datetimeD(): +def test_array_roundtrip_from_numpy_datetimeD(): arr = np.array([None, datetime.date(2017, 4, 4)], dtype='datetime64[D]') result = pa.array(arr) expected = pa.array([None, datetime.date(2017, 4, 4)], type=pa.date32()) assert result.equals(expected) + result = result.to_numpy(zero_copy_only=False) + np.testing.assert_array_equal(result, arr) + assert result.dtype == arr.dtype def test_array_from_naive_datetimes(): diff --git a/python/pyarrow/tests/test_dataset.py b/python/pyarrow/tests/test_dataset.py index 144da21cf5e..7e8ce329be1 100644 --- a/python/pyarrow/tests/test_dataset.py +++ b/python/pyarrow/tests/test_dataset.py @@ -4539,7 +4539,9 @@ def test_write_table_partitioned_dict(tempdir): @pytest.mark.parquet def test_write_dataset_parquet(tempdir): table = pa.table([ - pa.array(range(20)), pa.array(np.random.randn(20)), + pa.array(range(20), type="uint32"), + pa.array(np.arange("2012-01-01", 20, dtype="datetime64[D]").astype( + "datetime64[ns]")), pa.array(np.repeat(['a', 'b'], 10)) ], names=["f1", "f2", "part"]) @@ -4551,7 +4553,7 @@ def test_write_dataset_parquet(tempdir): file_paths = list(base_dir.rglob("*")) expected_paths = [base_dir / "part-0.parquet"] assert set(file_paths) == set(expected_paths) - # check Table roundtrip + # check Table roundtrip with default version result = ds.dataset(base_dir, format="parquet").to_table() assert result.equals(table) @@ -4559,12 +4561,24 @@ def test_write_dataset_parquet(tempdir): for version in ["1.0", "2.4", "2.6"]: format = ds.ParquetFileFormat() opts = format.make_write_options(version=version) + assert " 0, - # TODO(wesm): 
Pandas only support ns resolution, Arrow supports s, ms, - # us, ns - 'datetime': np.arange("2016-01-01T00:00:00.001", size, - dtype='datetime64[ms]').astype("datetime64[ns]"), + 'datetime[s]': np.arange("2016-01-01T00:00:00.001", size, + dtype='datetime64[s]'), + 'datetime[ms]': np.arange("2016-01-01T00:00:00.001", size, + dtype='datetime64[ms]'), + 'datetime[us]': np.arange("2016-01-01T00:00:00.001", size, + dtype='datetime64[us]'), + 'datetime[ns]': np.arange("2016-01-01T00:00:00.001", size, + dtype='datetime64[ns]'), + 'timedelta64[s]': np.arange(0, size, dtype='timedelta64[s]'), + 'timedelta64[ms]': np.arange(0, size, dtype='timedelta64[ms]'), + 'timedelta64[us]': np.arange(0, size, dtype='timedelta64[us]'), + 'timedelta64[ns]': np.arange(0, size, dtype='timedelta64[ns]'), 'str': [str(x) for x in range(size)], 'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None], 'empty_str': [''] * size @@ -1017,27 +1025,30 @@ def test_timestamps_notimezone_nulls(self): expected_schema=schema, ) - def test_timestamps_with_timezone(self): + @pytest.mark.parametrize('unit', ['s', 'ms', 'us', 'ns']) + def test_timestamps_with_timezone(self, unit): + if Version(pd.__version__) < Version("2.0.0") and unit != 'ns': + pytest.skip("pandas < 2.0 only supports nanosecond datetime64") df = pd.DataFrame({ 'datetime64': np.array([ '2007-07-13T01:23:34.123', '2006-01-13T12:34:56.432', '2010-08-13T05:46:57.437'], - dtype='datetime64[ms]').astype("datetime64[ns]") + dtype=f'datetime64[{unit}]') }) df['datetime64'] = df['datetime64'].dt.tz_localize('US/Eastern') _check_pandas_roundtrip(df) _check_series_roundtrip(df['datetime64']) - # drop-in a null and ns instead of ms + # drop-in a null df = pd.DataFrame({ 'datetime64': np.array([ '2007-07-13T01:23:34.123456789', None, '2006-01-13T12:34:56.432539784', '2010-08-13T05:46:57.437699912'], - dtype='datetime64[ns]') + dtype=f'datetime64[{unit}]') }) df['datetime64'] = df['datetime64'].dt.tz_localize('US/Eastern') @@ -1054,8 +1065,11 @@ def test_python_datetime(self): assert isinstance(table[0].chunk(0), pa.TimestampArray) result = table.to_pandas() + # Pandas v2 defaults to [ns], but Arrow defaults to [us] time units + # so we need to cast the pandas dtype. Pandas v1 will always silently + # coerce to [ns] due to lack of non-[ns] support. expected_df = pd.DataFrame({ - 'datetime': date_array + 'datetime': pd.Series(date_array, dtype='datetime64[us]') }) tm.assert_frame_equal(expected_df, result) @@ -1108,7 +1122,12 @@ class MyDatetime(datetime): assert isinstance(table[0].chunk(0), pa.TimestampArray) result = table.to_pandas() - expected_df = pd.DataFrame({"datetime": date_array}) + + # Pandas v2 defaults to [ns], but Arrow defaults to [us] time units + # so we need to cast the pandas dtype. Pandas v1 will always silently + # coerce to [ns] due to lack of non-[ns] support. 
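# A small sketch of the date handling exercised by the date tests that follow,
# assuming pandas >= 2.0: date32/date64 now convert to datetime64[ms] (pandas has
# no day unit), unless the legacy nanosecond coercion is requested explicitly.
import datetime
import pyarrow as pa

dates = pa.array([datetime.date(2000, 1, 1), None])    # date32
print(dates.to_pandas(date_as_object=False).dtype)     # datetime64[ms]
print(dates.to_pandas(date_as_object=False,
                      coerce_temporal_nanoseconds=True).dtype)  # datetime64[ns]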
+ expected_df = pd.DataFrame( + {"datetime": pd.Series(date_array, dtype='datetime64[us]')}) # https://github.com/pandas-dev/pandas/issues/21142 expected_df["datetime"] = pd.to_datetime(expected_df["datetime"]) @@ -1169,31 +1188,42 @@ def test_pandas_datetime_to_date64(self, mask): assert arr.equals(expected) - def test_array_types_date_as_object(self): + @pytest.mark.parametrize("coerce_to_ns,expected_dtype", + [(False, 'datetime64[ms]'), + (True, 'datetime64[ns]')]) + def test_array_types_date_as_object(self, coerce_to_ns, expected_dtype): data = [date(2000, 1, 1), None, date(1970, 1, 1), date(2040, 2, 26)] - expected_d = np.array(['2000-01-01', None, '1970-01-01', - '2040-02-26'], dtype='datetime64[D]') + expected_days = np.array(['2000-01-01', None, '1970-01-01', + '2040-02-26'], dtype='datetime64[D]') + + if Version(pd.__version__) < Version("2.0.0"): + # ARROW-3789: Coerce date/timestamp types to datetime64[ns] + expected_dtype = 'datetime64[ns]' - expected_ns = np.array(['2000-01-01', None, '1970-01-01', - '2040-02-26'], dtype='datetime64[ns]') + expected = np.array(['2000-01-01', None, '1970-01-01', + '2040-02-26'], dtype=expected_dtype) objects = [pa.array(data), pa.chunked_array([data])] for obj in objects: - result = obj.to_pandas() - expected_obj = expected_d.astype(object) + result = obj.to_pandas(coerce_temporal_nanoseconds=coerce_to_ns) + expected_obj = expected_days.astype(object) assert result.dtype == expected_obj.dtype npt.assert_array_equal(result, expected_obj) - result = obj.to_pandas(date_as_object=False) - assert result.dtype == expected_ns.dtype - npt.assert_array_equal(result, expected_ns) + result = obj.to_pandas(date_as_object=False, + coerce_temporal_nanoseconds=coerce_to_ns) + assert result.dtype == expected.dtype + npt.assert_array_equal(result, expected) - def test_table_convert_date_as_object(self): + @pytest.mark.parametrize("coerce_to_ns,expected_type", + [(False, 'datetime64[ms]'), + (True, 'datetime64[ns]')]) + def test_table_convert_date_as_object(self, coerce_to_ns, expected_type): df = pd.DataFrame({ 'date': [date(2000, 1, 1), None, @@ -1202,13 +1232,51 @@ def test_table_convert_date_as_object(self): table = pa.Table.from_pandas(df, preserve_index=False) - df_datetime = table.to_pandas(date_as_object=False) + df_datetime = table.to_pandas(date_as_object=False, + coerce_temporal_nanoseconds=coerce_to_ns) df_object = table.to_pandas() - tm.assert_frame_equal(df.astype('datetime64[ns]'), df_datetime, + tm.assert_frame_equal(df.astype(expected_type), df_datetime, check_dtype=True) tm.assert_frame_equal(df, df_object, check_dtype=True) + @pytest.mark.parametrize("arrow_type", + [pa.date32(), pa.date64(), pa.timestamp('s'), + pa.timestamp('ms'), pa.timestamp('us'), + pa.timestamp('ns'), pa.timestamp('s', 'UTC'), + pa.timestamp('ms', 'UTC'), pa.timestamp('us', 'UTC'), + pa.timestamp('ns', 'UTC')]) + def test_array_coerce_temporal_nanoseconds(self, arrow_type): + data = [date(2000, 1, 1), datetime(2001, 1, 1)] + expected = pd.Series(data) + arr = pa.array(data).cast(arrow_type) + result = arr.to_pandas( + coerce_temporal_nanoseconds=True, date_as_object=False) + expected_tz = None + if hasattr(arrow_type, 'tz') and arrow_type.tz is not None: + expected_tz = 'UTC' + expected_type = pa.timestamp('ns', expected_tz).to_pandas_dtype() + tm.assert_series_equal(result, expected.astype(expected_type)) + + @pytest.mark.parametrize("arrow_type", + [pa.date32(), pa.date64(), pa.timestamp('s'), + pa.timestamp('ms'), pa.timestamp('us'), + pa.timestamp('ns'), 
pa.timestamp('s', 'UTC'), + pa.timestamp('ms', 'UTC'), pa.timestamp('us', 'UTC'), + pa.timestamp('ns', 'UTC')]) + def test_table_coerce_temporal_nanoseconds(self, arrow_type): + data = [date(2000, 1, 1), datetime(2001, 1, 1)] + schema = pa.schema([pa.field('date', arrow_type)]) + expected_df = pd.DataFrame({'date': data}) + table = pa.table([pa.array(data)], schema=schema) + result_df = table.to_pandas( + coerce_temporal_nanoseconds=True, date_as_object=False) + expected_tz = None + if hasattr(arrow_type, 'tz') and arrow_type.tz is not None: + expected_tz = 'UTC' + expected_type = pa.timestamp('ns', expected_tz).to_pandas_dtype() + tm.assert_frame_equal(result_df, expected_df.astype(expected_type)) + def test_date_infer(self): df = pd.DataFrame({ 'date': [date(2000, 1, 1), @@ -1266,9 +1334,11 @@ def test_date_objects_typed(self): dtype='datetime64[D]')) ex_values[1] = pd.NaT.value - ex_datetime64ns = ex_values.astype('datetime64[ns]') - expected_pandas = pd.DataFrame({'date32': ex_datetime64ns, - 'date64': ex_datetime64ns}, + # date32 and date64 convert to [ms] in pandas v2, but + # in pandas v1 they are siliently coerced to [ns] + ex_datetime64ms = ex_values.astype('datetime64[ms]') + expected_pandas = pd.DataFrame({'date32': ex_datetime64ms, + 'date64': ex_datetime64ms}, columns=colnames) table_pandas = table.to_pandas(date_as_object=False) tm.assert_frame_equal(table_pandas, expected_pandas) @@ -1428,8 +1498,11 @@ def test_numpy_datetime64_columns(self): dtype='datetime64[s]') _check_array_from_pandas_roundtrip(datetime64_s) - def test_timestamp_to_pandas_ns(self): + def test_timestamp_to_pandas_coerces_to_ns(self): # non-ns timestamp gets cast to ns on conversion to pandas + if Version(pd.__version__) >= Version("2.0.0"): + pytest.skip("pandas >= 2.0 supports non-nanosecond datetime64") + arr = pa.array([1, 2, 3], pa.timestamp('ms')) expected = pd.Series(pd.to_datetime([1, 2, 3], unit='ms')) s = arr.to_pandas() @@ -1440,13 +1513,7 @@ def test_timestamp_to_pandas_ns(self): def test_timestamp_to_pandas_out_of_bounds(self): # ARROW-7758 check for out of bounds timestamps for non-ns timestamps - - if Version(pd.__version__) >= Version("2.1.0.dev"): - # GH-35235: test fail due to __from_pyarrow__ being added to pandas - # https://github.com/pandas-dev/pandas/pull/52201 - # Needs: https://github.com/apache/arrow/issues/33321 - pytest.skip( - "Need support converting to non-nano datetime64 for pandas >= 2.0") + # that end up getting coerced into ns timestamps. 
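# Sketch of the behavior this test targets (hedged; assumes pandas >= 2.0): a
# timestamp that overflows the nanosecond range only errors when nanosecond
# coercion is requested; otherwise the original unit is preserved.
import datetime
import pyarrow as pa

arr = pa.array([datetime.datetime(3000, 1, 1)], type=pa.timestamp('s'))
print(arr.to_pandas().dtype)                       # datetime64[s], no overflow
# arr.to_pandas(coerce_temporal_nanoseconds=True)  # raises ValueError: out of bounds timestamp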
for unit in ['s', 'ms', 'us']: for tz in [None, 'America/New_York']: @@ -1455,26 +1522,27 @@ def test_timestamp_to_pandas_out_of_bounds(self): msg = "would result in out of bounds timestamp" with pytest.raises(ValueError, match=msg): - arr.to_pandas() + arr.to_pandas(coerce_temporal_nanoseconds=True) with pytest.raises(ValueError, match=msg): - table.to_pandas() + table.to_pandas(coerce_temporal_nanoseconds=True) with pytest.raises(ValueError, match=msg): # chunked array - table.column('a').to_pandas() + table.column('a').to_pandas(coerce_temporal_nanoseconds=True) # just ensure those don't give an error, but do not # check actual garbage output - arr.to_pandas(safe=False) - table.to_pandas(safe=False) - table.column('a').to_pandas(safe=False) + arr.to_pandas(safe=False, coerce_temporal_nanoseconds=True) + table.to_pandas(safe=False, coerce_temporal_nanoseconds=True) + table.column('a').to_pandas( + safe=False, coerce_temporal_nanoseconds=True) def test_timestamp_to_pandas_empty_chunked(self): # ARROW-7907 table with chunked array with 0 chunks table = pa.table({'a': pa.chunked_array([], type=pa.timestamp('us'))}) result = table.to_pandas() - expected = pd.DataFrame({'a': pd.Series([], dtype="datetime64[ns]")}) + expected = pd.DataFrame({'a': pd.Series([], dtype="datetime64[us]")}) tm.assert_frame_equal(result, expected) @pytest.mark.parametrize('dtype', [pa.date32(), pa.date64()]) @@ -1516,24 +1584,30 @@ def test_fixed_offset_timezone(self): # TODO remove if https://github.com/apache/arrow/issues/15047 is fixed _check_pandas_roundtrip(df, check_dtype=False) - def test_timedeltas_no_nulls(self): + @pytest.mark.parametrize("unit", ['s', 'ms', 'us', 'ns']) + def test_timedeltas_no_nulls(self, unit): + if Version(pd.__version__) < Version("2.0.0"): + unit = 'ns' df = pd.DataFrame({ 'timedelta64': np.array([0, 3600000000000, 7200000000000], - dtype='timedelta64[ns]') + dtype=f'timedelta64[{unit}]') }) - field = pa.field('timedelta64', pa.duration('ns')) + field = pa.field('timedelta64', pa.duration(unit)) schema = pa.schema([field]) _check_pandas_roundtrip( df, expected_schema=schema, ) - def test_timedeltas_nulls(self): + @pytest.mark.parametrize("unit", ['s', 'ms', 'us', 'ns']) + def test_timedeltas_nulls(self, unit): + if Version(pd.__version__) < Version("2.0.0"): + unit = 'ns' df = pd.DataFrame({ 'timedelta64': np.array([0, None, 7200000000000], - dtype='timedelta64[ns]') + dtype=f'timedelta64[{unit}]') }) - field = pa.field('timedelta64', pa.duration('ns')) + field = pa.field('timedelta64', pa.duration(unit)) schema = pa.schema([field]) _check_pandas_roundtrip( df, @@ -2855,7 +2929,7 @@ def test_strided_data_import(self): cases.append(boolean_objects) cases.append(np.arange("2016-01-01T00:00:00.001", N * K, - dtype='datetime64[ms]').astype("datetime64[ns]") + dtype='datetime64[ms]') .reshape(N, K).copy()) strided_mask = (random_numbers > 0).astype(bool)[:, 0] @@ -3384,7 +3458,7 @@ def test_table_from_pandas_schema_with_custom_metadata(): assert table.schema.metadata.get(b'meta') == b'True' -def test_table_from_pandas_schema_field_order_metadat(): +def test_table_from_pandas_schema_field_order_metadata(): # ARROW-10532 # ensure that a different field order in specified schema doesn't # mangle metadata @@ -3408,7 +3482,12 @@ def test_table_from_pandas_schema_field_order_metadat(): assert metadata_datetime["metadata"] == {'timezone': 'UTC'} result = table.to_pandas() - expected = df[["float", "datetime"]].astype({"float": "float32"}) + coerce_cols_to_types = {"float": "float32"} + if 
Version(pd.__version__) >= Version("2.0.0"): + # Pandas v2 now support non-nanosecond time units + coerce_cols_to_types["datetime"] = "datetime64[s, UTC]" + expected = df[["float", "datetime"]].astype(coerce_cols_to_types) + tm.assert_frame_equal(result, expected) @@ -4179,20 +4258,20 @@ def test_to_pandas_extension_dtypes_mapping(): assert isinstance(result['a'].dtype, pd.PeriodDtype) -def test_array_to_pandas(): +@pytest.mark.parametrize("arr", + [pd.period_range("2012-01-01", periods=3, freq="D").array, + pd.interval_range(1, 4).array]) +def test_array_to_pandas(arr): if Version(pd.__version__) < Version("1.1"): pytest.skip("ExtensionDtype to_pandas method missing") - for arr in [pd.period_range("2012-01-01", periods=3, freq="D").array, - pd.interval_range(1, 4).array]: - result = pa.array(arr).to_pandas() - expected = pd.Series(arr) - tm.assert_series_equal(result, expected) - - # TODO implement proper conversion for chunked array - # result = pa.table({"col": arr})["col"].to_pandas() - # expected = pd.Series(arr, name="col") - # tm.assert_series_equal(result, expected) + result = pa.array(arr).to_pandas() + expected = pd.Series(arr) + tm.assert_series_equal(result, expected) + + result = pa.table({"col": arr})["col"].to_pandas() + expected = pd.Series(arr, name="col") + tm.assert_series_equal(result, expected) def test_roundtrip_empty_table_with_extension_dtype_index(): diff --git a/python/pyarrow/tests/test_schema.py b/python/pyarrow/tests/test_schema.py index 0c4dea673b0..2f2417f590a 100644 --- a/python/pyarrow/tests/test_schema.py +++ b/python/pyarrow/tests/test_schema.py @@ -25,6 +25,12 @@ import pyarrow as pa import pyarrow.tests.util as test_util +from pyarrow.vendored.version import Version + +try: + import pandas as pd +except ImportError: + pass def test_schema_constructor_errors(): @@ -45,7 +51,9 @@ def test_type_integers(): def test_type_to_pandas_dtype(): - M8_ns = np.dtype('datetime64[ns]') + M8 = np.dtype('datetime64[ms]') + if Version(pd.__version__) < Version("2.0.0"): + M8 = np.dtype('datetime64[ns]') cases = [ (pa.null(), np.object_), (pa.bool_(), np.bool_), @@ -60,9 +68,9 @@ def test_type_to_pandas_dtype(): (pa.float16(), np.float16), (pa.float32(), np.float32), (pa.float64(), np.float64), - (pa.date32(), M8_ns), - (pa.date64(), M8_ns), - (pa.timestamp('ms'), M8_ns), + (pa.date32(), M8), + (pa.date64(), M8), + (pa.timestamp('ms'), M8), (pa.binary(), np.object_), (pa.binary(12), np.object_), (pa.string(), np.object_), diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index a3311cbbcf4..fbd4f8a94b6 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -40,10 +40,21 @@ cdef dict _pandas_type_map = { _Type_HALF_FLOAT: np.float16, _Type_FLOAT: np.float32, _Type_DOUBLE: np.float64, - _Type_DATE32: np.dtype('datetime64[ns]'), - _Type_DATE64: np.dtype('datetime64[ns]'), - _Type_TIMESTAMP: np.dtype('datetime64[ns]'), - _Type_DURATION: np.dtype('timedelta64[ns]'), + # Pandas does not support [D]ay, so default to [ms] for date32 + _Type_DATE32: np.dtype('datetime64[ms]'), + _Type_DATE64: np.dtype('datetime64[ms]'), + _Type_TIMESTAMP: { + 's': np.dtype('datetime64[s]'), + 'ms': np.dtype('datetime64[ms]'), + 'us': np.dtype('datetime64[us]'), + 'ns': np.dtype('datetime64[ns]'), + }, + _Type_DURATION: { + 's': np.dtype('timedelta64[s]'), + 'ms': np.dtype('timedelta64[ms]'), + 'us': np.dtype('timedelta64[us]'), + 'ns': np.dtype('timedelta64[ns]'), + }, _Type_BINARY: np.object_, _Type_FIXED_SIZE_BINARY: np.object_, _Type_STRING: np.object_, @@ -115,6 
+126,44 @@ def _is_primitive(Type type): return is_primitive(type) +def _get_pandas_type(arrow_type, coerce_to_ns=False): + cdef Type type_id = arrow_type.id + if type_id not in _pandas_type_map: + return None + if coerce_to_ns: + # ARROW-3789: Coerce date/timestamp types to datetime64[ns] + if type_id == _Type_DURATION: + return np.dtype('timedelta64[ns]') + return np.dtype('datetime64[ns]') + pandas_type = _pandas_type_map[type_id] + if isinstance(pandas_type, dict): + unit = getattr(arrow_type, 'unit', None) + pandas_type = pandas_type.get(unit, None) + return pandas_type + + +def _get_pandas_tz_type(arrow_type, coerce_to_ns=False): + from pyarrow.pandas_compat import make_datetimetz + unit = 'ns' if coerce_to_ns else arrow_type.unit + return make_datetimetz(unit, arrow_type.tz) + + +def _to_pandas_dtype(arrow_type, options=None): + coerce_to_ns = (options and options.get('coerce_temporal_nanoseconds', False)) or ( + _pandas_api.is_v1() and arrow_type.id in + [_Type_DATE32, _Type_DATE64, _Type_TIMESTAMP, _Type_DURATION]) + + if getattr(arrow_type, 'tz', None): + dtype = _get_pandas_tz_type(arrow_type, coerce_to_ns) + else: + dtype = _get_pandas_type(arrow_type, coerce_to_ns) + + if not dtype: + raise NotImplementedError(str(arrow_type)) + + return dtype + + # Workaround for Cython parsing bug # https://github.com/cython/cython/issues/2143 ctypedef CFixedWidthType* _CFixedWidthTypePtr @@ -274,11 +323,7 @@ cdef class DataType(_Weakrefable): >>> pa.int64().to_pandas_dtype() """ - cdef Type type_id = self.type.id() - if type_id in _pandas_type_map: - return _pandas_type_map[type_id] - else: - raise NotImplementedError(str(self)) + return _to_pandas_dtype(self) def _export_to_c(self, out_ptr): """ @@ -1005,24 +1050,6 @@ cdef class TimestampType(DataType): else: return None - def to_pandas_dtype(self): - """ - Return the equivalent NumPy / Pandas dtype. - - Examples - -------- - >>> import pyarrow as pa - >>> t = pa.timestamp('s', tz='UTC') - >>> t.to_pandas_dtype() - datetime64[ns, UTC] - """ - if self.tz is None: - return _pandas_type_map[_Type_TIMESTAMP] - else: - # Return DatetimeTZ - from pyarrow.pandas_compat import make_datetimetz - return make_datetimetz(self.tz) - def __reduce__(self): return timestamp, (self.unit, self.tz)
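As a hedged sketch of the consolidated dtype mapping above (assuming pandas >= 2.0 is installed; under pandas 1.x these all fall back to nanosecond dtypes):

import pyarrow as pa

print(pa.timestamp('s').to_pandas_dtype())              # datetime64[s]
print(pa.duration('ms').to_pandas_dtype())              # timedelta64[ms]
print(pa.date32().to_pandas_dtype())                    # datetime64[ms]; pandas has no day unit
print(pa.timestamp('us', tz='UTC').to_pandas_dtype())   # datetime64[us, UTC] (DatetimeTZDtype)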