109 changes: 108 additions & 1 deletion cpp/src/arrow/python/arrow_to_pandas.cc
@@ -791,6 +791,110 @@ Status ConvertListsLike(PandasOptions options, const ChunkedArray& data,
return Status::OK();
}

Status ConvertMap(PandasOptions options, const ChunkedArray& data,
PyObject** out_values) {
// Get columns of underlying key/item arrays
std::vector<std::shared_ptr<Array>> key_arrays;
std::vector<std::shared_ptr<Array>> item_arrays;
for (int c = 0; c < data.num_chunks(); ++c) {
const auto& map_arr = checked_cast<const MapArray&>(*data.chunk(c));
key_arrays.emplace_back(map_arr.keys());
item_arrays.emplace_back(map_arr.items());
}

const auto& map_type = checked_cast<const MapType&>(*data.type());
auto key_type = map_type.key_type();
auto item_type = map_type.item_type();

// ARROW-6899: Convert dictionary-encoded children to dense instead of
// failing below. A more efficient conversion could be done later.
if (key_type->id() == Type::DICTIONARY) {
auto dense_type = checked_cast<const DictionaryType&>(*key_type).value_type();
RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &key_arrays));
key_type = dense_type;
}
if (item_type->id() == Type::DICTIONARY) {
auto dense_type = checked_cast<const DictionaryType&>(*item_type).value_type();
RETURN_NOT_OK(DecodeDictionaries(options.pool, dense_type, &item_arrays));
item_type = dense_type;
}

// See notes in MakeInnerOptions.
options = MakeInnerOptions(std::move(options));
// Don't blindly convert because timestamps in lists are handled differently.
options.timestamp_as_object = true;

auto flat_keys = std::make_shared<ChunkedArray>(key_arrays, key_type);
auto flat_items = std::make_shared<ChunkedArray>(item_arrays, item_type);
OwnedRef list_item;
OwnedRef key_value;
OwnedRef item_value;
OwnedRefNoGIL owned_numpy_keys;
RETURN_NOT_OK(
ConvertChunkedArrayToPandas(options, flat_keys, nullptr, owned_numpy_keys.ref()));
OwnedRefNoGIL owned_numpy_items;
RETURN_NOT_OK(
ConvertChunkedArrayToPandas(options, flat_items, nullptr, owned_numpy_items.ref()));
PyArrayObject* py_keys = reinterpret_cast<PyArrayObject*>(owned_numpy_keys.obj());
PyArrayObject* py_items = reinterpret_cast<PyArrayObject*>(owned_numpy_items.obj());

int64_t chunk_offset = 0;
for (int c = 0; c < data.num_chunks(); ++c) {
const auto& arr = checked_cast<const MapArray&>(*data.chunk(c));
const bool has_nulls = data.null_count() > 0;

// Make a list of key/item pairs for each row in array
for (int64_t i = 0; i < arr.length(); ++i) {
if (has_nulls && arr.IsNull(i)) {
Py_INCREF(Py_None);
*out_values = Py_None;
} else {
int64_t entry_offset = arr.value_offset(i);
int64_t num_maps = arr.value_offset(i + 1) - entry_offset;

// Build the new list object for the row of maps
list_item.reset(PyList_New(num_maps));
RETURN_IF_PYERROR();

// Add each key/item pair in the row
for (int64_t j = 0; j < num_maps; ++j) {
// Get key value, key is non-nullable for a valid row
auto ptr_key = reinterpret_cast<const char*>(
PyArray_GETPTR1(py_keys, chunk_offset + entry_offset + j));
key_value.reset(PyArray_GETITEM(py_keys, ptr_key));
RETURN_IF_PYERROR();

if (item_arrays[c]->IsNull(entry_offset + j)) {
Member: Why do you need this? I would expect py_items to contain a None entry already...?

Member Author: Good point, let me try that.

Member Author: Actually, this doesn't quite seem to work. The backing array is numpy, so if the dtype is object it will return None, but if numeric it will be nan. I think we want the conversion to have None for all null values, right?

Member: Ah, ok, thank you.
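(A minimal pyarrow-level sketch of the distinction discussed above — the arrays and names here are illustrative, not part of this patch: a numeric column round-trips through a float64 ndarray, so its null becomes nan, while an object-dtype column can hold the None singleton directly.)

import pyarrow as pa

# Numeric Arrow column: pandas backs it with a float64 ndarray, so the
# null slot surfaces as nan rather than None.
nums = pa.array([1, None, 3], type=pa.int32()).to_pandas().to_numpy()
print(nums.dtype, nums)   # float64 [ 1. nan  3.]

# String Arrow column: pandas uses an object-dtype ndarray, which holds
# the None singleton directly.
strs = pa.array(['a', None, 'c']).to_pandas().to_numpy()
print(strs.dtype, strs)   # object ['a' None 'c']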

// Translate the Null to a None
Py_INCREF(Py_None);
Member: Why incref here?

Member Author: Isn't the line below adding another reference to that object?

Member: Hmm, right, it should be ok.
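(For readers less familiar with CPython refcounting, a minimal sketch of the convention at play — OwnedRefSketch below is an illustrative stand-in, not pyarrow's actual OwnedRef: reset() takes ownership of exactly one reference, so a borrowed pointer such as the Py_None singleton must be increfed before being handed over.)

#include <Python.h>

// Illustrative stand-in for an owned-reference holder: reset() assumes the
// caller hands over a reference it owns, and releases the one held before.
struct OwnedRefSketch {
  PyObject* obj = nullptr;
  void reset(PyObject* o) {
    Py_XDECREF(obj);  // drop the previously held reference, if any
    obj = o;          // take ownership of the incoming reference
  }
  ~OwnedRefSketch() { Py_XDECREF(obj); }
};

// Py_None is borrowed, so first create the reference being handed over:
//   Py_INCREF(Py_None);
//   ref.reset(Py_None);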

item_value.reset(Py_None);
} else {
// Get valid value from item array
auto ptr_item = reinterpret_cast<const char*>(
PyArray_GETPTR1(py_items, chunk_offset + entry_offset + j));
item_value.reset(PyArray_GETITEM(py_items, ptr_item));
RETURN_IF_PYERROR();
Member: Is this required? I would expect the above to always be successful?

Member Author: Hmm, I'm not an expert on the Python/NumPy C APIs, so I was following ConvertStruct here. I thought that PyArray_GETITEM might set an error, but it looks like it would return null on failure. Is it possible the reset of the item_value PyObject could cause an error?

Member: Ah, I thought PyArray_GETITEM was a simple macro, but it seems more involved. So it can return an error indeed.
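(A rough sketch of the failure mode settled on above — idx and the surrounding names are illustrative: PyArray_GETITEM constructs a new Python scalar from raw array memory, so it can fail, returning NULL with a Python exception pending, which RETURN_IF_PYERROR then converts to a Status.)

// Illustrative only: how a failed PyArray_GETITEM would surface.
auto ptr = reinterpret_cast<const char*>(PyArray_GETPTR1(py_items, idx));
PyObject* scalar = PyArray_GETITEM(py_items, ptr);  // allocates; may fail
if (scalar == nullptr) {
  // A Python exception is pending; RETURN_IF_PYERROR() converts it into an
  // arrow::Status and returns from the enclosing function.
  RETURN_IF_PYERROR();
}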

}

// Add the key/item pair to the list for the row
PyList_SET_ITEM(list_item.obj(), j,
PyTuple_Pack(2, key_value.obj(), item_value.obj()));
RETURN_IF_PYERROR();
}

// Pass ownership to the resulting array
*out_values = list_item.detach();
}
++out_values;
}
RETURN_IF_PYERROR();

chunk_offset += arr.values()->length();
}

return Status::OK();
}

template <typename InType, typename OutType>
inline void ConvertNumericNullable(const ChunkedArray& data, InType na_value,
OutType* out_values) {
@@ -1027,6 +1131,8 @@ struct ObjectWriterVisitor {
return ConvertListsLike<ArrayType>(options, data, out_values);
}

Status Visit(const MapType& type) { return ConvertMap(options, data, out_values); }

Status Visit(const StructType& type) {
return ConvertStruct(options, data, out_values);
}
@@ -1801,7 +1907,8 @@ static Status GetPandasWriterType(const ChunkedArray& data, const PandasOptions&
} break;
case Type::FIXED_SIZE_LIST:
case Type::LIST:
case Type::LARGE_LIST: {
case Type::LARGE_LIST:
case Type::MAP: {
auto list_type = std::static_pointer_cast<BaseListType>(data.type());
if (!ListTypeSupported(*list_type->value_type())) {
return Status::NotImplemented("Not implemented type for Arrow list to pandas: ",
54 changes: 54 additions & 0 deletions python/pyarrow/tests/test_pandas.py
@@ -2132,6 +2132,60 @@ def test_auto_chunking_on_list_overflow(self):
assert len(column_a.chunk(0)) == 2**24 - 1
assert len(column_a.chunk(1)) == 1

def test_map_array_roundtrip(self):
data = [[(b'a', 1), (b'b', 2)],
[(b'c', 3)],
[(b'd', 4), (b'e', 5), (b'f', 6)],
[(b'g', 7)]]

df = pd.DataFrame({"map": data})
schema = pa.schema([("map", pa.map_(pa.binary(), pa.int32()))])

_check_pandas_roundtrip(df, schema=schema)

def test_map_array_chunked(self):
data1 = [[(b'a', 1), (b'b', 2)],
[(b'c', 3)],
[(b'd', 4), (b'e', 5), (b'f', 6)],
[(b'g', 7)]]
data2 = [[(k, v * 2) for k, v in row] for row in data1]

arr1 = pa.array(data1, type=pa.map_(pa.binary(), pa.int32()))
arr2 = pa.array(data2, type=pa.map_(pa.binary(), pa.int32()))
arr = pa.chunked_array([arr1, arr2])

expected = pd.Series(data1 + data2)
actual = arr.to_pandas()
tm.assert_series_equal(actual, expected, check_names=False)

def test_map_array_with_nulls(self):
data = [[(b'a', 1), (b'b', 2)],
None,
[(b'd', 4), (b'e', 5), (b'f', None)],
[(b'g', 7)]]

# None value in item array causes upcast to float
expected = [[(k, float(v) if v is not None else None) for k, v in row]
if row is not None else None for row in data]
expected = pd.Series(expected)

arr = pa.array(data, type=pa.map_(pa.binary(), pa.int32()))
actual = arr.to_pandas()
tm.assert_series_equal(actual, expected, check_names=False)

def test_map_array_dictionary_encoded(self):
offsets = pa.array([0, 3, 5])
items = pa.array(['a', 'b', 'c', 'a', 'd']).dictionary_encode()
keys = pa.array(list(range(len(items))))
arr = pa.MapArray.from_arrays(offsets, keys, items)

# Dictionary encoded values converted to dense
expected = pd.Series(
[[(0, 'a'), (1, 'b'), (2, 'c')], [(3, 'a'), (4, 'd')]])

actual = arr.to_pandas()
tm.assert_series_equal(actual, expected, check_names=False)


class TestConvertStructTypes:
"""