diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx index 456bf6d1da8..a80b3ce8398 100644 --- a/python/pyarrow/array.pyx +++ b/python/pyarrow/array.pyx @@ -288,4 +288,3 @@ cdef class RowBatch: def __getitem__(self, i): return self.arrays[i] - diff --git a/python/pyarrow/includes/pyarrow.pxd b/python/pyarrow/includes/pyarrow.pxd index 1066b8034be..92c814706fd 100644 --- a/python/pyarrow/includes/pyarrow.pxd +++ b/python/pyarrow/includes/pyarrow.pxd @@ -46,6 +46,6 @@ cdef extern from "pyarrow/api.h" namespace "pyarrow" nogil: Status PandasMaskedToArrow(MemoryPool* pool, object ao, object mo, shared_ptr[CArray]* out) - Status ArrowToPandas(const shared_ptr[CColumn]& arr, PyObject** out) + Status ArrowToPandas(const shared_ptr[CColumn]& arr, object py_ref, PyObject** out) MemoryPool* GetMemoryPool() diff --git a/python/pyarrow/table.pyx b/python/pyarrow/table.pyx index 4c4816f0c7e..f02d36f520b 100644 --- a/python/pyarrow/table.pyx +++ b/python/pyarrow/table.pyx @@ -96,7 +96,7 @@ cdef class Column: import pandas as pd - check_status(pyarrow.ArrowToPandas(self.sp_column, &arr)) + check_status(pyarrow.ArrowToPandas(self.sp_column, self, &arr)) return pd.Series(arr, name=self.name) cdef _check_nullptr(self): @@ -205,6 +205,7 @@ cdef class Table: cdef: PyObject* arr shared_ptr[CColumn] col + Column column import pandas as pd @@ -212,7 +213,8 @@ cdef class Table: data = [] for i in range(self.table.num_columns()): col = self.table.column(i) - check_status(pyarrow.ArrowToPandas(col, &arr)) + column = self.column(i) + check_status(pyarrow.ArrowToPandas(col, column, &arr)) names.append(frombytes(col.get().name())) data.append( arr) diff --git a/python/src/pyarrow/adapters/pandas.cc b/python/src/pyarrow/adapters/pandas.cc index 22f1d7575f8..b39fde92034 100644 --- a/python/src/pyarrow/adapters/pandas.cc +++ b/python/src/pyarrow/adapters/pandas.cc @@ -520,8 +520,8 @@ static inline PyObject* make_pystring(const uint8_t* data, int32_t length) { template class ArrowDeserializer { public: - ArrowDeserializer(const std::shared_ptr& col) : - col_(col) {} + ArrowDeserializer(const std::shared_ptr& col, PyObject* py_ref) : + col_(col), py_ref_(py_ref) {} Status Convert(PyObject** out) { const std::shared_ptr data = col_->data(); @@ -548,6 +548,33 @@ class ArrowDeserializer { return Status::OK(); } + Status OutputFromData(int type, void* data) { + // Zero-Copy. We can pass the data pointer directly to NumPy. + Py_INCREF(py_ref_); + OwnedRef py_ref(py_ref); + npy_intp dims[1] = {col_->length()}; + out_ = reinterpret_cast(PyArray_SimpleNewFromData(1, dims, + type, data)); + + if (out_ == NULL) { + // Error occurred, trust that SimpleNew set the error state + return Status::OK(); + } + + if (PyArray_SetBaseObject(out_, py_ref_) == -1) { + // Error occurred, trust that SetBaseObject set the error state + return Status::OK(); + } else { + // PyArray_SetBaseObject steals our reference to py_ref_ + py_ref.release(); + } + + // Arrow data is immutable. + PyArray_CLEARFLAGS(out_, NPY_ARRAY_WRITEABLE); + + return Status::OK(); + } + template inline typename std::enable_if< arrow_traits::is_floating, Status>::type @@ -556,18 +583,20 @@ class ArrowDeserializer { arrow::PrimitiveArray* prim_arr = static_cast( arr.get()); - - RETURN_NOT_OK(AllocateOutput(arrow_traits::npy_type)); + const T* in_values = reinterpret_cast(prim_arr->data()->data()); if (arr->null_count() > 0) { + RETURN_NOT_OK(AllocateOutput(arrow_traits::npy_type)); + T* out_values = reinterpret_cast(PyArray_DATA(out_)); - const T* in_values = reinterpret_cast(prim_arr->data()->data()); for (int64_t i = 0; i < arr->length(); ++i) { out_values[i] = arr->IsNull(i) ? NAN : in_values[i]; } } else { - memcpy(PyArray_DATA(out_), prim_arr->data()->data(), - arr->length() * arr->type()->value_size()); + // Zero-Copy. We can pass the data pointer directly to NumPy. + void* data = const_cast(in_values); + int type = arrow_traits::npy_type; + RETURN_NOT_OK(OutputFromData(type, data)); } return Status::OK(); @@ -594,10 +623,10 @@ class ArrowDeserializer { out_values[i] = prim_arr->IsNull(i) ? NAN : in_values[i]; } } else { - RETURN_NOT_OK(AllocateOutput(arrow_traits::npy_type)); - - memcpy(PyArray_DATA(out_), in_values, - arr->length() * arr->type()->value_size()); + // Zero-Copy. We can pass the data pointer directly to NumPy. + void* data = const_cast(in_values); + int type = arrow_traits::npy_type; + RETURN_NOT_OK(OutputFromData(type, data)); } return Status::OK(); @@ -680,18 +709,20 @@ class ArrowDeserializer { } private: std::shared_ptr col_; + PyObject* py_ref_; PyArrayObject* out_; }; -#define FROM_ARROW_CASE(TYPE) \ - case arrow::Type::TYPE: \ - { \ - ArrowDeserializer converter(col); \ - return converter.Convert(out); \ - } \ +#define FROM_ARROW_CASE(TYPE) \ + case arrow::Type::TYPE: \ + { \ + ArrowDeserializer converter(col, py_ref); \ + return converter.Convert(out); \ + } \ break; -Status ArrowToPandas(const std::shared_ptr& col, PyObject** out) { +Status ArrowToPandas(const std::shared_ptr& col, PyObject* py_ref, + PyObject** out) { switch(col->type()->type) { FROM_ARROW_CASE(BOOL); FROM_ARROW_CASE(INT8); diff --git a/python/src/pyarrow/adapters/pandas.h b/python/src/pyarrow/adapters/pandas.h index 58eb3ca61cd..17922349de6 100644 --- a/python/src/pyarrow/adapters/pandas.h +++ b/python/src/pyarrow/adapters/pandas.h @@ -36,7 +36,8 @@ namespace pyarrow { class Status; -Status ArrowToPandas(const std::shared_ptr& col, PyObject** out); +Status ArrowToPandas(const std::shared_ptr& col, PyObject* py_ref, + PyObject** out); Status PandasMaskedToArrow(arrow::MemoryPool* pool, PyObject* ao, PyObject* mo, std::shared_ptr* out); diff --git a/python/src/pyarrow/common.h b/python/src/pyarrow/common.h index cc9ad9ec5bb..0211e8948f2 100644 --- a/python/src/pyarrow/common.h +++ b/python/src/pyarrow/common.h @@ -53,6 +53,10 @@ class OwnedRef { obj_ = obj; } + void release() { + obj_ = nullptr; + } + PyObject* obj() const{ return obj_; }