diff --git a/cpp/src/arrow/python/numpy-internal.h b/cpp/src/arrow/python/numpy-internal.h index 6954e35c3e1..e27ae5cac51 100644 --- a/cpp/src/arrow/python/numpy-internal.h +++ b/cpp/src/arrow/python/numpy-internal.h @@ -40,14 +40,12 @@ class Ndarray1DIndexer { Ndarray1DIndexer() : arr_(NULLPTR), data_(NULLPTR) {} - explicit Ndarray1DIndexer(PyArrayObject* arr) : Ndarray1DIndexer() { Init(arr); } - - void Init(PyArrayObject* arr) { + explicit Ndarray1DIndexer(PyArrayObject* arr) : Ndarray1DIndexer() { arr_ = arr; DCHECK_EQ(1, PyArray_NDIM(arr)) << "Only works with 1-dimensional arrays"; Py_INCREF(arr); - data_ = reinterpret_cast(PyArray_DATA(arr)); - stride_ = PyArray_STRIDES(arr)[0] / sizeof(T); + data_ = reinterpret_cast(PyArray_DATA(arr)); + stride_ = PyArray_STRIDES(arr)[0]; } ~Ndarray1DIndexer() { Py_XDECREF(arr_); } @@ -56,14 +54,18 @@ class Ndarray1DIndexer { T* data() const { return data_; } - bool is_strided() const { return stride_ != 1; } + bool is_strided() const { return stride_ != sizeof(T); } - T& operator[](size_type index) { return data_[index * stride_]; } - T& operator[](size_type index) const { return data_[index * stride_]; } + T& operator[](size_type index) { + return *reinterpret_cast(data_ + index * stride_); + } + const T& operator[](size_type index) const { + return *reinterpret_cast(data_ + index * stride_); + } private: PyArrayObject* arr_; - T* data_; + uint8_t* data_; int64_t stride_; }; diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index a944b809141..ef63ccf83df 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -318,8 +318,18 @@ Status NumPyConverter::Convert() { return Status::Invalid("only handle 1-dimensional arrays"); } - DCHECK_NE(dtype_->type_num, NPY_OBJECT) - << "This class does not handle NPY_OBJECT arrays"; + if (dtype_->type_num == NPY_OBJECT) { + // If an object array, convert it like a normal Python sequence + PyConversionOptions py_options; + py_options.type = type_; + py_options.from_pandas = from_pandas_; + std::shared_ptr res; + RETURN_NOT_OK(ConvertPySequence(reinterpret_cast(arr_), + reinterpret_cast(mask_), py_options, + &res)); + out_arrays_ = res->chunks(); + return Status::OK(); + } if (type_ == nullptr) { return Status::Invalid("Must pass data type for non-object arrays"); @@ -790,15 +800,6 @@ Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pa return Status::Invalid("Input object was not a NumPy array"); } - PyArrayObject* arr = reinterpret_cast(ao); - - if (PyArray_DESCR(arr)->type_num == NPY_OBJECT) { - PyConversionOptions py_options; - py_options.type = type; - py_options.from_pandas = from_pandas; - return ConvertPySequence(ao, mo, py_options, out); - } - NumPyConverter converter(pool, ao, mo, type, from_pandas, cast_options); RETURN_NOT_OK(converter.Convert()); const auto& output_arrays = converter.result(); diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 8f2c2eb78b4..e9044866f8c 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -1851,21 +1851,29 @@ def test_from_numpy(self): assert arr.to_pylist() == [{}, {}] def test_from_numpy_nested(self): + # Note: an object field inside a struct dt = np.dtype([('x', np.dtype([('xx', np.int8), ('yy', np.bool_)])), - ('y', np.int16)]) + ('y', np.int16), + ('z', np.object_)]) + # Note: itemsize is not a multiple of sizeof(object) + assert dt.itemsize == 12 ty = pa.struct([pa.field('x', pa.struct([pa.field('xx', pa.int8()), pa.field('yy', pa.bool_())])), - pa.field('y', pa.int16())]) + pa.field('y', pa.int16()), + pa.field('z', pa.string())]) data = np.array([], dtype=dt) arr = pa.array(data, type=ty) assert arr.to_pylist() == [] - data = np.array([((1, True), 2), ((3, False), 4)], dtype=dt) + data = np.array([ + ((1, True), 2, 'foo'), + ((3, False), 4, 'bar')], dtype=dt) arr = pa.array(data, type=ty) - assert arr.to_pylist() == [{'x': {'xx': 1, 'yy': True}, 'y': 2}, - {'x': {'xx': 3, 'yy': False}, 'y': 4}] + assert arr.to_pylist() == [ + {'x': {'xx': 1, 'yy': True}, 'y': 2, 'z': 'foo'}, + {'x': {'xx': 3, 'yy': False}, 'y': 4, 'z': 'bar'}] @pytest.mark.large_memory def test_from_numpy_large(self):