diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc
index 47b62a35521..e4d3e793d25 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.cc
+++ b/cpp/src/arrow/python/arrow_to_pandas.cc
@@ -67,6 +67,30 @@ using internal::CheckIndexBounds;
 using internal::GetByteWidth;
 using internal::OptionalParallelFor;
 
+namespace py {
+namespace {
+
+// Fix options for conversion of an inner (child) array.
+PandasOptions MakeInnerOptions(PandasOptions options) {
+  // Make sure conversion of inner dictionary arrays always returns an array,
+  // not a dict {'indices': array, 'dictionary': array, 'ordered': bool}
+  options.decode_dictionaries = true;
+  options.categorical_columns.clear();
+  options.strings_to_categorical = false;
+
+  // In ARROW-7723, we found as a result of ARROW-3789 that second
+  // through microsecond resolution tz-aware timestamps were being promoted to
+  // use the DATETIME_NANO_TZ conversion path, yielding a datetime64[ns] NumPy
+  // array in this function. PyArray_GETITEM returns datetime.datetime for
+  // units second through microsecond but PyLong for nanosecond (because
+  // datetime.datetime does not support nanoseconds).
+  // We force the object conversion to preserve the value of the timezone.
+  // Nanoseconds are returned as integers.
+  options.coerce_temporal_nanoseconds = false;
+
+  return options;
+}
+
 // ----------------------------------------------------------------------
 // PyCapsule code for setting ndarray base to reference C++ object
 
@@ -78,8 +102,6 @@ struct BufferCapsule {
   std::shared_ptr<Buffer> buffer;
 };
 
-namespace {
-
 void ArrayCapsule_Destructor(PyObject* capsule) {
   delete reinterpret_cast<ArrayCapsule*>(PyCapsule_GetPointer(capsule, "arrow::Array"));
 }
@@ -88,13 +110,9 @@ void BufferCapsule_Destructor(PyObject* capsule) {
   delete reinterpret_cast<BufferCapsule*>(PyCapsule_GetPointer(capsule, "arrow::Buffer"));
 }
 
-}  // namespace
-
 // ----------------------------------------------------------------------
 // pandas 0.x DataFrame conversion internals
 
-namespace py {
-
 using internal::arrow_traits;
 using internal::npy_traits;
 
@@ -628,8 +646,8 @@ inline Status ConvertAsPyObjects(const PandasOptions& options, const ChunkedArra
   return Status::OK();
 }
 
-inline Status ConvertStruct(const PandasOptions& options, const ChunkedArray& data,
-                            PyObject** out_values) {
+Status ConvertStruct(PandasOptions options, const ChunkedArray& data,
+                     PyObject** out_values) {
   if (data.num_chunks() == 0) {
     return Status::OK();
   }
@@ -641,29 +659,18 @@ inline Status ConvertStruct(const PandasOptions& options, const ChunkedArray& da
   std::vector<OwnedRef> fields_data(num_fields);
   OwnedRef dict_item;
 
-  // In ARROW-7723, we found as a result of ARROW-3789 that second
-  // through microsecond resolution tz-aware timestamps were being promoted to
-  // use the DATETIME_NANO_TZ conversion path, yielding a datetime64[ns] NumPy
-  // array in this function. PyArray_GETITEM returns datetime.datetime for
-  // units second through microsecond but PyLong for nanosecond (because
-  // datetime.datetime does not support nanoseconds).
-  // We force the object conversion to preserve the value of the timezone.
-  // Nanoseconds are returned as integers inside of structs.
-  PandasOptions modified_options = options;
-  modified_options.coerce_temporal_nanoseconds = false;
+  // See notes in MakeInnerOptions.
+  options = MakeInnerOptions(std::move(options));
+  // Don't blindly convert because timestamps in lists are handled differently.
+  options.timestamp_as_object = true;
 
   for (int c = 0; c < data.num_chunks(); c++) {
     auto arr = checked_cast<const StructArray*>(data.chunk(c).get());
     // Convert the struct arrays first
     for (int32_t i = 0; i < num_fields; i++) {
-      PyObject* numpy_array;
-      std::shared_ptr<Array> field = arr->field(static_cast<int>(i));
-      // See notes above about timestamp conversion. Don't blindly convert because
-      // timestamps in lists are handled differently.
-      modified_options.timestamp_as_object =
-          field->type()->id() == Type::TIMESTAMP ? true : options.timestamp_as_object;
-      RETURN_NOT_OK(ConvertArrayToPandas(modified_options, field, nullptr, &numpy_array));
-      fields_data[i].reset(numpy_array);
+      const auto field = arr->field(static_cast<int>(i));
+      RETURN_NOT_OK(ConvertArrayToPandas(options, field, nullptr, fields_data[i].ref()));
+      DCHECK(PyArray_Check(fields_data[i].obj()));
     }
 
     // Construct a dictionary for each row
@@ -707,7 +714,7 @@ inline Status ConvertStruct(const PandasOptions& options, const ChunkedArray& da
 }
 
 Status DecodeDictionaries(MemoryPool* pool, const std::shared_ptr<DataType>& dense_type,
-                          std::vector<std::shared_ptr<Array>>* arrays) {
+                          ArrayVector* arrays) {
   compute::ExecContext ctx(pool);
   compute::CastOptions options;
   for (size_t i = 0; i < arrays->size(); ++i) {
@@ -717,11 +724,19 @@ Status DecodeDictionaries(MemoryPool* pool, const std::shared_ptr<DataType>& den
   return Status::OK();
 }
 
+Status DecodeDictionaries(MemoryPool* pool, const std::shared_ptr<DataType>& dense_type,
+                          std::shared_ptr<ChunkedArray>* array) {
+  auto chunks = (*array)->chunks();
+  RETURN_NOT_OK(DecodeDictionaries(pool, dense_type, &chunks));
+  *array = std::make_shared<ChunkedArray>(std::move(chunks), dense_type);
+  return Status::OK();
+}
+
 template <typename ListArrayT>
-Status ConvertListsLike(const PandasOptions& options, const ChunkedArray& data,
+Status ConvertListsLike(PandasOptions options, const ChunkedArray& data,
                         PyObject** out_values) {
   // Get column of underlying value arrays
-  std::vector<std::shared_ptr<Array>> value_arrays;
+  ArrayVector value_arrays;
   for (int c = 0; c < data.num_chunks(); c++) {
     const auto& arr = checked_cast<const ListArrayT&>(*data.chunk(c));
     value_arrays.emplace_back(arr.values());
@@ -730,44 +745,29 @@ Status ConvertListsLike(const PandasOptions& options, const ChunkedArray& data,
   const auto& list_type = checked_cast<const BaseListType&>(*data.type());
   auto value_type = list_type.value_type();
 
-  if (value_type->id() == Type::DICTIONARY) {
-    // ARROW-6899: Convert dictionary-encoded children to dense instead of
-    // failing below. A more efficient conversion than this could be done later
-    auto dense_type = checked_cast<const DictionaryType&>(*value_type).value_type();
-    RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &value_arrays));
-    value_type = dense_type;
-  }
-
   auto flat_column = std::make_shared<ChunkedArray>(value_arrays, value_type);
-
   // TODO(ARROW-489): Currently we don't have a Python reference for single columns.
   // Storing a reference to the whole Array would be too expensive.
-
-  // ARROW-3789(wesm): During refactoring I found that unit tests assumed that
-  // timestamp units would be preserved on list conversions in
-  // Table.to_pandas. So we set the option here to not coerce things to
-  // nanoseconds. Bit of a hack but this seemed the simplest thing to satisfy
-  // the existing unit tests
-  PandasOptions modified_options = options;
-  modified_options.coerce_temporal_nanoseconds = false;
+  options = MakeInnerOptions(std::move(options));
 
   OwnedRefNoGIL owned_numpy_array;
-  RETURN_NOT_OK(ConvertChunkedArrayToPandas(modified_options, flat_column, nullptr,
+  RETURN_NOT_OK(ConvertChunkedArrayToPandas(options, flat_column, nullptr,
                                             owned_numpy_array.ref()));
 
   PyObject* numpy_array = owned_numpy_array.obj();
+  DCHECK(PyArray_Check(numpy_array));
 
   int64_t chunk_offset = 0;
   for (int c = 0; c < data.num_chunks(); c++) {
-    auto arr = std::static_pointer_cast<ListArrayT>(data.chunk(c));
+    const auto& arr = checked_cast<const ListArrayT&>(*data.chunk(c));
     const bool has_nulls = data.null_count() > 0;
-    for (int64_t i = 0; i < arr->length(); ++i) {
-      if (has_nulls && arr->IsNull(i)) {
+    for (int64_t i = 0; i < arr.length(); ++i) {
+      if (has_nulls && arr.IsNull(i)) {
         Py_INCREF(Py_None);
         *out_values = Py_None;
       } else {
-        OwnedRef start(PyLong_FromLongLong(arr->value_offset(i) + chunk_offset));
-        OwnedRef end(PyLong_FromLongLong(arr->value_offset(i + 1) + chunk_offset));
+        OwnedRef start(PyLong_FromLongLong(arr.value_offset(i) + chunk_offset));
+        OwnedRef end(PyLong_FromLongLong(arr.value_offset(i + 1) + chunk_offset));
         OwnedRef slice(PySlice_New(start.obj(), end.obj(), nullptr));
 
         if (ARROW_PREDICT_FALSE(slice.obj() == nullptr)) {
@@ -785,7 +785,7 @@ Status ConvertListsLike(const PandasOptions& options, const ChunkedArray& data,
     }
 
     RETURN_IF_PYERROR();
-    chunk_offset += arr->values()->length();
+    chunk_offset += arr.values()->length();
   }
 
   return Status::OK();
@@ -1834,9 +1834,8 @@ class PandasBlockCreator {
  public:
   using WriterMap = std::unordered_map<int, std::shared_ptr<PandasWriter>>;
 
-  explicit PandasBlockCreator(const PandasOptions& options,
-                              std::vector<std::shared_ptr<Field>> fields,
-                              std::vector<std::shared_ptr<ChunkedArray>> arrays)
+  explicit PandasBlockCreator(const PandasOptions& options, FieldVector fields,
+                              ChunkedArrayVector arrays)
       : options_(options), fields_(std::move(fields)), arrays_(std::move(arrays)) {
     num_columns_ = static_cast<int>(arrays_.size());
     if (num_columns_ > 0) {
@@ -1864,8 +1863,8 @@ class PandasBlockCreator {
  protected:
   PandasOptions options_;
 
-  std::vector<std::shared_ptr<Field>> fields_;
-  std::vector<std::shared_ptr<ChunkedArray>> arrays_;
+  FieldVector fields_;
+  ChunkedArrayVector arrays_;
   int num_columns_;
   int64_t num_rows_;
 
@@ -2038,9 +2037,8 @@ class SplitBlockCreator : public PandasBlockCreator {
   std::vector<std::shared_ptr<PandasWriter>> writers_;
 };
 
-Status ConvertCategoricals(const PandasOptions& options,
-                           std::vector<std::shared_ptr<ChunkedArray>>* arrays,
-                           std::vector<std::shared_ptr<Field>>* fields) {
+Status ConvertCategoricals(const PandasOptions& options, ChunkedArrayVector* arrays,
+                           FieldVector* fields) {
   std::vector<int> columns_to_encode;
 
   // For Categorical conversions
@@ -2076,6 +2074,8 @@ Status ConvertCategoricals(const PandasOptions& options,
                              static_cast<int>(columns_to_encode.size()), EncodeColumn);
 }
 
+}  // namespace
+
 Status ConvertArrayToPandas(const PandasOptions& options, std::shared_ptr<Array> arr,
                             PyObject* py_ref, PyObject** out) {
   return ConvertChunkedArrayToPandas(
@@ -2085,6 +2085,13 @@ Status ConvertArrayToPandas(const PandasOptions& options, std::shared_ptr<Array>
 Status ConvertChunkedArrayToPandas(const PandasOptions& options,
                                    std::shared_ptr<ChunkedArray> arr, PyObject* py_ref,
                                    PyObject** out) {
+  if (options.decode_dictionaries && arr->type()->id() == Type::DICTIONARY) {
+    const auto& dense_type =
+        checked_cast<const DictionaryType&>(*arr->type()).value_type();
+    RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &arr));
+    DCHECK_NE(arr->type()->id(), Type::DICTIONARY);
+  }
+
   if (options.strings_to_categorical && is_base_binary_like(arr->type()->id())) {
     if (options.zero_copy_only) {
       return Status::Invalid("Need to dictionary encode a column, but ",
@@ -2105,6 +2112,9 @@ Status ConvertChunkedArrayToPandas(const PandasOptions& options,
 
   PandasWriter::type output_type;
   RETURN_NOT_OK(GetPandasWriterType(*arr, modified_options, &output_type));
+  if (options.decode_dictionaries) {
+    DCHECK_NE(output_type, PandasWriter::CATEGORICAL);
+  }
 
   std::shared_ptr<PandasWriter> writer;
   RETURN_NOT_OK(MakeWriter(modified_options, output_type, *arr->type(), arr->length(),
@@ -2115,8 +2125,8 @@ Status ConvertChunkedArrayToPandas(const PandasOptions& options,
 
 Status ConvertTableToPandas(const PandasOptions& options, std::shared_ptr<Table> table,
                             PyObject** out) {
-  std::vector<std::shared_ptr<ChunkedArray>> arrays = table->columns();
-  std::vector<std::shared_ptr<Field>> fields = table->fields();
+  ChunkedArrayVector arrays = table->columns();
+  FieldVector fields = table->fields();
 
   // ARROW-3789: allow "self-destructing" by releasing references to columns as
   // we convert them to pandas
diff --git a/cpp/src/arrow/python/arrow_to_pandas.h b/cpp/src/arrow/python/arrow_to_pandas.h
index abf4bbdef0d..6570364b8d2 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.h
+++ b/cpp/src/arrow/python/arrow_to_pandas.h
@@ -90,6 +90,9 @@ struct PandasOptions {
   /// conversions
   bool self_destruct = false;
 
+  // Used internally for nested arrays.
+  bool decode_dictionaries = false;
+
   // Columns that should be casted to categorical
   std::unordered_set<std::string> categorical_columns;
 
diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
index 2165627ad08..6eae419f05a 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -2103,6 +2103,19 @@ def test_large_binary_list(self):
             s, pd.Series([["aa", "bb"], None, ["cc"], []]),
             check_names=False)
 
+    def test_list_of_dictionary(self):
+        child = pa.array(["foo", "bar", None, "foo"]).dictionary_encode()
+        arr = pa.ListArray.from_arrays([0, 1, 3, 3, 4], child)
+
+        # Expected a Series of lists
+        expected = pd.Series(arr.to_pylist())
+        tm.assert_series_equal(arr.to_pandas(), expected)
+
+        # Same but with nulls
+        arr = arr.take([0, 1, None, 3])
+        expected[2] = None
+        tm.assert_series_equal(arr.to_pandas(), expected)
+
     @pytest.mark.slow
     @pytest.mark.large_memory
     def test_auto_chunking_on_list_overflow(self):
@@ -2297,6 +2310,24 @@ def test_from_tuples(self):
             df, expected=expected_df, schema=expected_schema,
             expected_schema=expected_schema)
 
+    def test_struct_of_dictionary(self):
+        names = ['ints', 'strs']
+        children = [pa.array([456, 789, 456]).dictionary_encode(),
+                    pa.array(["foo", "foo", None]).dictionary_encode()]
+        arr = pa.StructArray.from_arrays(children, names=names)
+
+        # Expected a Series of {field name: field value} dicts
+        rows_as_tuples = zip(*(child.to_pylist() for child in children))
+        rows_as_dicts = [dict(zip(names, row)) for row in rows_as_tuples]
+
+        expected = pd.Series(rows_as_dicts)
+        tm.assert_series_equal(arr.to_pandas(), expected)
+
+        # Same but with nulls
+        arr = arr.take([0, None, 2])
+        expected[1] = None
+        tm.assert_series_equal(arr.to_pandas(), expected)
+
 
 class TestZeroCopyConversion:
     """
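
A minimal usage sketch of the conversions exercised by the new tests above (not part of the patch; the printed values mirror the expectations in test_list_of_dictionary and test_struct_of_dictionary, assuming a pyarrow build that includes this change):

    import pyarrow as pa

    # List array with a dictionary-encoded child. ConvertChunkedArrayToPandas now
    # decodes the child to dense values (options.decode_dictionaries is set by
    # MakeInnerOptions), so conversion no longer fails on nested dictionaries.
    child = pa.array(["foo", "bar", None, "foo"]).dictionary_encode()
    arr = pa.ListArray.from_arrays([0, 1, 3, 3, 4], child)
    print([list(row) for row in arr.to_pandas()])
    # [['foo'], ['bar', None], [], ['foo']]

    # Struct array with dictionary-encoded children: ConvertStruct converts each
    # decoded child to a NumPy array, then emits one {name: value} dict per row.
    children = [pa.array([456, 789, 456]).dictionary_encode(),
                pa.array(["foo", "foo", None]).dictionary_encode()]
    struct_arr = pa.StructArray.from_arrays(children, names=['ints', 'strs'])
    print(struct_arr.to_pandas().tolist())
    # [{'ints': 456, 'strs': 'foo'}, {'ints': 789, 'strs': 'foo'},
    #  {'ints': 456, 'strs': None}]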