diff --git a/cpp/src/arrow/python/arrow_to_pandas.cc b/cpp/src/arrow/python/arrow_to_pandas.cc
index e4d3e793d25..be27a108dd1 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.cc
+++ b/cpp/src/arrow/python/arrow_to_pandas.cc
@@ -791,6 +791,110 @@ Status ConvertListsLike(PandasOptions options, const ChunkedArray& data,
   return Status::OK();
 }
 
+// Convert an Arrow MapArray column to a pandas object column where each row
+// is a Python list of (key, value) tuples. Null map rows become None; null
+// items become None inside the tuples.
+Status ConvertMap(PandasOptions options, const ChunkedArray& data,
+                  PyObject** out_values) {
+  // Get columns of underlying key/item arrays
+  std::vector<std::shared_ptr<Array>> key_arrays;
+  std::vector<std::shared_ptr<Array>> item_arrays;
+  for (int c = 0; c < data.num_chunks(); ++c) {
+    const auto& map_arr = checked_cast<const MapArray&>(*data.chunk(c));
+    key_arrays.emplace_back(map_arr.keys());
+    item_arrays.emplace_back(map_arr.items());
+  }
+
+  const auto& map_type = checked_cast<const MapType&>(*data.type());
+  auto key_type = map_type.key_type();
+  auto item_type = map_type.item_type();
+
+  // ARROW-6899: Convert dictionary-encoded children to dense instead of
+  // failing below. A more efficient conversion than this could be done later
+  if (key_type->id() == Type::DICTIONARY) {
+    auto dense_type = checked_cast<const DictionaryType&>(*key_type).value_type();
+    RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &key_arrays));
+    key_type = dense_type;
+  }
+  if (item_type->id() == Type::DICTIONARY) {
+    auto dense_type = checked_cast<const DictionaryType&>(*item_type).value_type();
+    RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &item_arrays));
+    item_type = dense_type;
+  }
+
+  // See notes in MakeInnerOptions.
+  options = MakeInnerOptions(std::move(options));
+  // Don't blindly convert because timestamps in lists are handled differently.
+  options.timestamp_as_object = true;
+
+  auto flat_keys = std::make_shared<ChunkedArray>(key_arrays, key_type);
+  auto flat_items = std::make_shared<ChunkedArray>(item_arrays, item_type);
+  OwnedRef list_item;
+  OwnedRef key_value;
+  OwnedRef item_value;
+  OwnedRefNoGIL owned_numpy_keys;
+  RETURN_NOT_OK(
+      ConvertChunkedArrayToPandas(options, flat_keys, nullptr, owned_numpy_keys.ref()));
+  OwnedRefNoGIL owned_numpy_items;
+  RETURN_NOT_OK(
+      ConvertChunkedArrayToPandas(options, flat_items, nullptr, owned_numpy_items.ref()));
+  PyArrayObject* py_keys = reinterpret_cast<PyArrayObject*>(owned_numpy_keys.obj());
+  PyArrayObject* py_items = reinterpret_cast<PyArrayObject*>(owned_numpy_items.obj());
+
+  int64_t chunk_offset = 0;
+  for (int c = 0; c < data.num_chunks(); ++c) {
+    const auto& arr = checked_cast<const MapArray&>(*data.chunk(c));
+    const bool has_nulls = data.null_count() > 0;
+
+    // Make a list of key/item pairs for each row in array
+    for (int64_t i = 0; i < arr.length(); ++i) {
+      if (has_nulls && arr.IsNull(i)) {
+        Py_INCREF(Py_None);
+        *out_values = Py_None;
+      } else {
+        int64_t entry_offset = arr.value_offset(i);
+        int64_t num_maps = arr.value_offset(i + 1) - entry_offset;
+
+        // Build the new list object for the row of maps
+        list_item.reset(PyList_New(num_maps));
+        RETURN_IF_PYERROR();
+
+        // Add each key/item pair in the row
+        for (int64_t j = 0; j < num_maps; ++j) {
+          // Get key value, key is non-nullable for a valid row
+          auto ptr_key = reinterpret_cast<const char*>(
+              PyArray_GETPTR1(py_keys, chunk_offset + entry_offset + j));
+          key_value.reset(PyArray_GETITEM(py_keys, ptr_key));
+          RETURN_IF_PYERROR();
+
+          if (item_arrays[c]->IsNull(entry_offset + j)) {
+            // Translate the Null to a None
+            Py_INCREF(Py_None);
+            item_value.reset(Py_None);
+          } else {
+            // Get valid value from item array
+            auto ptr_item = reinterpret_cast<const char*>(
+                PyArray_GETPTR1(py_items, chunk_offset + entry_offset + j));
+            item_value.reset(PyArray_GETITEM(py_items, ptr_item));
+            RETURN_IF_PYERROR();
+          }
+
+          // Add the key/item pair to the list for the row.
+          // PyList_SET_ITEM steals the reference returned by PyTuple_Pack.
+          PyList_SET_ITEM(list_item.obj(), j,
+                          PyTuple_Pack(2, key_value.obj(), item_value.obj()));
+          RETURN_IF_PYERROR();
+        }
+
+        // Pass ownership to the resulting array
+        *out_values = list_item.detach();
+      }
+      ++out_values;
+    }
+    RETURN_IF_PYERROR();
+
+    chunk_offset += arr.values()->length();
+  }
+
+  return Status::OK();
+}
+
 template <typename InType, typename OutType>
 inline void ConvertNumericNullable(const ChunkedArray& data, InType na_value,
                                    OutType* out_values) {
@@ -1027,6 +1131,8 @@ struct ObjectWriterVisitor {
     return ConvertListsLike(options, data, out_values);
   }
 
+  Status Visit(const MapType& type) { return ConvertMap(options, data, out_values); }
+
   Status Visit(const StructType& type) {
     return ConvertStruct(options, data, out_values);
   }
@@ -1801,7 +1907,8 @@ static Status GetPandasWriterType(const ChunkedArray& data, const PandasOptions&
       }
       break;
     case Type::FIXED_SIZE_LIST:
    case Type::LIST:
-    case Type::LARGE_LIST: {
+    case Type::LARGE_LIST:
+    case Type::MAP: {
       auto list_type = std::static_pointer_cast<BaseListType>(data.type());
       if (!ListTypeSupported(*list_type->value_type())) {
         return Status::NotImplemented("Not implemented type for Arrow list to pandas: ",
diff --git a/python/pyarrow/tests/test_pandas.py b/python/pyarrow/tests/test_pandas.py
index 6eae419f05a..0e01dc08ef6 100644
--- a/python/pyarrow/tests/test_pandas.py
+++ b/python/pyarrow/tests/test_pandas.py
@@ -2132,6 +2132,60 @@ def test_auto_chunking_on_list_overflow(self):
         assert len(column_a.chunk(0)) == 2**24 - 1
         assert len(column_a.chunk(1)) == 1
 
+    def test_map_array_roundtrip(self):
+        data = [[(b'a', 1), (b'b', 2)],
+                [(b'c', 3)],
+                [(b'd', 4), (b'e', 5), (b'f', 6)],
+                [(b'g', 7)]]
+
+        df = pd.DataFrame({"map": data})
+        schema = pa.schema([("map", pa.map_(pa.binary(), pa.int32()))])
+
+        _check_pandas_roundtrip(df, schema=schema)
+
+    def test_map_array_chunked(self):
+        data1 = [[(b'a', 1), (b'b', 2)],
+                 [(b'c', 3)],
+                 [(b'd', 4), (b'e', 5), (b'f', 6)],
+                 [(b'g', 7)]]
+        data2 = [[(k, v * 2) for k, v in row] for row in data1]
+
+        arr1 = pa.array(data1, type=pa.map_(pa.binary(), pa.int32()))
+        arr2 = pa.array(data2, type=pa.map_(pa.binary(), pa.int32()))
+        arr = pa.chunked_array([arr1, arr2])
+
+        expected = pd.Series(data1 + data2)
+        actual = arr.to_pandas()
+        tm.assert_series_equal(actual, expected, check_names=False)
+
+    def test_map_array_with_nulls(self):
+        data = [[(b'a', 1), (b'b', 2)],
+                None,
+                [(b'd', 4), (b'e', 5), (b'f', None)],
+                [(b'g', 7)]]
+
+        # None value in item array causes upcast to float
+        expected = [[(k, float(v) if v is not None else None) for k, v in row]
+                    if row is not None else None for row in data]
+        expected = pd.Series(expected)
+
+        arr = pa.array(data, type=pa.map_(pa.binary(), pa.int32()))
+        actual = arr.to_pandas()
+        tm.assert_series_equal(actual, expected, check_names=False)
+
+    def test_map_array_dictionary_encoded(self):
+        offsets = pa.array([0, 3, 5])
+        items = pa.array(['a', 'b', 'c', 'a', 'd']).dictionary_encode()
+        keys = pa.array(list(range(len(items))))
+        arr = pa.MapArray.from_arrays(offsets, keys, items)
+
+        # Dictionary encoded values converted to dense
+        expected = pd.Series(
+            [[(0, 'a'), (1, 'b'), (2, 'c')], [(3, 'a'), (4, 'd')]])
+
+        actual = arr.to_pandas()
+        tm.assert_series_equal(actual, expected, check_names=False)
+
 
 class TestConvertStructTypes:
     """