109 changes: 108 additions & 1 deletion cpp/src/arrow/python/arrow_to_pandas.cc
@@ -791,6 +791,110 @@ Status ConvertListsLike(PandasOptions options, const ChunkedArray& data,
return Status::OK();
}

Status ConvertMap(PandasOptions options, const ChunkedArray& data,
PyObject** out_values) {
// Get columns of underlying key/item arrays
std::vector<std::shared_ptr<Array>> key_arrays;
std::vector<std::shared_ptr<Array>> item_arrays;
for (int c = 0; c < data.num_chunks(); ++c) {
const auto& map_arr = checked_cast<const MapArray&>(*data.chunk(c));
key_arrays.emplace_back(map_arr.keys());
item_arrays.emplace_back(map_arr.items());
}

const auto& map_type = checked_cast<const MapType&>(*data.type());
auto key_type = map_type.key_type();
auto item_type = map_type.item_type();

// ARROW-6899: Convert dictionary-encoded children to dense instead of
// failing below. A more efficient conversion could be done later.
if (key_type->id() == Type::DICTIONARY) {
auto dense_type = checked_cast<const DictionaryType&>(*key_type).value_type();
RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &key_arrays));
key_type = dense_type;
}
if (item_type->id() == Type::DICTIONARY) {
auto dense_type = checked_cast<const DictionaryType&>(*item_type).value_type();
RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &item_arrays));
item_type = dense_type;
}

// See notes in MakeInnerOptions.
options = MakeInnerOptions(std::move(options));
// Don't blindly convert because timestamps in lists are handled differently.
options.timestamp_as_object = true;

auto flat_keys = std::make_shared<ChunkedArray>(key_arrays, key_type);
auto flat_items = std::make_shared<ChunkedArray>(item_arrays, item_type);
OwnedRef list_item;
OwnedRef key_value;
OwnedRef item_value;
OwnedRefNoGIL owned_numpy_keys;
RETURN_NOT_OK(
ConvertChunkedArrayToPandas(options, flat_keys, nullptr, owned_numpy_keys.ref()));
OwnedRefNoGIL owned_numpy_items;
RETURN_NOT_OK(
ConvertChunkedArrayToPandas(options, flat_items, nullptr, owned_numpy_items.ref()));
PyArrayObject* py_keys = reinterpret_cast<PyArrayObject*>(owned_numpy_keys.obj());
PyArrayObject* py_items = reinterpret_cast<PyArrayObject*>(owned_numpy_items.obj());

int64_t chunk_offset = 0;
for (int c = 0; c < data.num_chunks(); ++c) {
const auto& arr = checked_cast<const MapArray&>(*data.chunk(c));
const bool has_nulls = data.null_count() > 0;

// Make a list of key/item pairs for each row in array
for (int64_t i = 0; i < arr.length(); ++i) {
if (has_nulls && arr.IsNull(i)) {
Py_INCREF(Py_None);
*out_values = Py_None;
} else {
int64_t entry_offset = arr.value_offset(i);
int64_t num_maps = arr.value_offset(i + 1) - entry_offset;

// Build the new list object for the row of maps
list_item.reset(PyList_New(num_maps));
RETURN_IF_PYERROR();

// Add each key/item pair in the row
for (int64_t j = 0; j < num_maps; ++j) {
// Get key value, key is non-nullable for a valid row
auto ptr_key = reinterpret_cast<const char*>(
PyArray_GETPTR1(py_keys, chunk_offset + entry_offset + j));
key_value.reset(PyArray_GETITEM(py_keys, ptr_key));
RETURN_IF_PYERROR();

if (item_arrays[c]->IsNull(entry_offset + j)) {
Member: Why do you need this? I would expect py_items to contain a None entry already...?

Member Author: Good point, let me try that.

Member Author: Actually, this doesn't quite seem to work. The backing array is numpy, so if the dtype is object it will return None, but if numeric it will be nan. I think we want the conversion to have None for all null values, right?

Member: Ah, ok, thank you.
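(A minimal pyarrow-level sketch of the distinction discussed above — the arrays and names here are illustrative, not part of this patch: a numeric column round-trips through a float64 ndarray, so its null becomes nan, while an object-dtype column can hold the None singleton directly.)

import pyarrow as pa

# Numeric Arrow column: pandas backs it with a float64 ndarray, so the
# null slot surfaces as nan rather than None.
nums = pa.array([1, None, 3], type=pa.int32()).to_pandas().to_numpy()
print(nums.dtype, nums)   # float64 [ 1. nan  3.]

# String Arrow column: pandas uses an object-dtype ndarray, which holds
# the None singleton directly.
strs = pa.array(['a', None, 'c']).to_pandas().to_numpy()
print(strs.dtype, strs)   # object ['a' None 'c']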

// Translate the Null to a None
Py_INCREF(Py_None);
Member: Why incref here?

Member Author: Isn't the line below adding another reference to that object?

Member: Hmm, right, it should be ok.
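(For readers less familiar with CPython refcounting, a minimal sketch of the convention at play — OwnedRefSketch below is an illustrative stand-in, not pyarrow's actual OwnedRef: reset() takes ownership of exactly one reference, so a borrowed pointer such as the Py_None singleton must be increfed before being handed over.)

#include <Python.h>

// Illustrative stand-in for an owned-reference holder: reset() assumes the
// caller hands over a reference it owns, and releases the one held before.
struct OwnedRefSketch {
  PyObject* obj = nullptr;
  void reset(PyObject* o) {
    Py_XDECREF(obj);  // drop the previously held reference, if any
    obj = o;          // take ownership of the incoming reference
  }
  ~OwnedRefSketch() { Py_XDECREF(obj); }
};

// Py_None is borrowed, so first create the reference being handed over:
//   Py_INCREF(Py_None);
//   ref.reset(Py_None);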

item_value.reset(Py_None);
} else {
// Get valid value from item array
auto ptr_item = reinterpret_cast<const char*>(
PyArray_GETPTR1(py_items, chunk_offset + entry_offset + j));
item_value.reset(PyArray_GETITEM(py_items, ptr_item));
RETURN_IF_PYERROR();
Member: Is this required? I would expect the above to always be successful?

Member Author: Hmm, I'm not an expert on the Python/NumPy C APIs, so I was following ConvertStruct here. I thought that PyArray_GETITEM might set an error, but it looks like it would return null on failure. Is it possible the reset of the item_value PyObject could cause an error?

Member: Ah, I thought PyArray_GETITEM was a simple macro, but it seems more involved. So it can return an error indeed.
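(A rough sketch of the failure mode settled on above — idx and the surrounding names are illustrative: PyArray_GETITEM constructs a new Python scalar from raw array memory, so it can fail, returning NULL with a Python exception pending, which RETURN_IF_PYERROR then converts to a Status.)

// Illustrative only: how a failed PyArray_GETITEM would surface.
auto ptr = reinterpret_cast<const char*>(PyArray_GETPTR1(py_items, idx));
PyObject* scalar = PyArray_GETITEM(py_items, ptr);  // allocates; may fail
if (scalar == nullptr) {
  // A Python exception is pending; RETURN_IF_PYERROR() converts it into an
  // arrow::Status and returns from the enclosing function.
  RETURN_IF_PYERROR();
}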

}

// Add the key/item pair to the list for the row
PyList_SET_ITEM(list_item.obj(), j,
PyTuple_Pack(2, key_value.obj(), item_value.obj()));
RETURN_IF_PYERROR();
}

// Pass ownership to the resulting array
*out_values = list_item.detach();
}
++out_values;
}
RETURN_IF_PYERROR();

chunk_offset += arr.values()->length();
}

return Status::OK();
}

template <typename InType, typename OutType>
inline void ConvertNumericNullable(const ChunkedArray& data, InType na_value,
OutType* out_values) {
@@ -1027,6 +1131,8 @@ struct ObjectWriterVisitor {
return ConvertListsLike<ArrayType>(options, data, out_values);
}

Status Visit(const MapType& type) { return ConvertMap(options, data, out_values); }

Status Visit(const StructType& type) {
return ConvertStruct(options, data, out_values);
}
@@ -1801,7 +1907,8 @@ static Status GetPandasWriterType(const ChunkedArray& data, const PandasOptions&
} break;
case Type::FIXED_SIZE_LIST:
case Type::LIST:
case Type::LARGE_LIST: {
case Type::LARGE_LIST:
case Type::MAP: {
auto list_type = std::static_pointer_cast<BaseListType>(data.type());
if (!ListTypeSupported(*list_type->value_type())) {
return Status::NotImplemented("Not implemented type for Arrow list to pandas: ",
54 changes: 54 additions & 0 deletions python/pyarrow/tests/test_pandas.py
@@ -2132,6 +2132,60 @@ def test_auto_chunking_on_list_overflow(self):
assert len(column_a.chunk(0)) == 2**24 - 1
assert len(column_a.chunk(1)) == 1

def test_map_array_roundtrip(self):
data = [[(b'a', 1), (b'b', 2)],
[(b'c', 3)],
[(b'd', 4), (b'e', 5), (b'f', 6)],
[(b'g', 7)]]

df = pd.DataFrame({"map": data})
schema = pa.schema([("map", pa.map_(pa.binary(), pa.int32()))])

_check_pandas_roundtrip(df, schema=schema)

def test_map_array_chunked(self):
data1 = [[(b'a', 1), (b'b', 2)],
[(b'c', 3)],
[(b'd', 4), (b'e', 5), (b'f', 6)],
[(b'g', 7)]]
data2 = [[(k, v * 2) for k, v in row] for row in data1]

arr1 = pa.array(data1, type=pa.map_(pa.binary(), pa.int32()))
arr2 = pa.array(data2, type=pa.map_(pa.binary(), pa.int32()))
arr = pa.chunked_array([arr1, arr2])

expected = pd.Series(data1 + data2)
actual = arr.to_pandas()
tm.assert_series_equal(actual, expected, check_names=False)

def test_map_array_with_nulls(self):
data = [[(b'a', 1), (b'b', 2)],
None,
[(b'd', 4), (b'e', 5), (b'f', None)],
[(b'g', 7)]]

# None value in item array causes upcast to float
expected = [[(k, float(v) if v is not None else None) for k, v in row]
if row is not None else None for row in data]
expected = pd.Series(expected)

arr = pa.array(data, type=pa.map_(pa.binary(), pa.int32()))
actual = arr.to_pandas()
tm.assert_series_equal(actual, expected, check_names=False)

def test_map_array_dictionary_encoded(self):
offsets = pa.array([0, 3, 5])
items = pa.array(['a', 'b', 'c', 'a', 'd']).dictionary_encode()
keys = pa.array(list(range(len(items))))
arr = pa.MapArray.from_arrays(offsets, keys, items)

# Dictionary encoded values converted to dense
expected = pd.Series(
[[(0, 'a'), (1, 'b'), (2, 'c')], [(3, 'a'), (4, 'd')]])

actual = arr.to_pandas()
tm.assert_series_equal(actual, expected, check_names=False)


class TestConvertStructTypes:
"""