From 959385dc5dc7985e988561b842e5788ba8997605 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Tue, 7 Jan 2020 13:13:25 +0100 Subject: [PATCH 1/8] virtual --- cpp/src/arrow/python/python_to_arrow.cc | 169 +++++++++++------------- cpp/src/arrow/python/python_to_arrow.h | 5 + 2 files changed, 80 insertions(+), 94 deletions(-) diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index d2a58067ebf..ca4ea65e39e 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -74,13 +74,16 @@ class SeqConverter { // converting Python objects to Arrow nested types virtual Status Init(ArrayBuilder* builder) = 0; - // Append a single (non-sequence) Python datum to the underlying builder, - // virtual function - virtual Status AppendSingleVirtual(PyObject* obj) = 0; + // // Append a single (non-sequence) Python datum to the underlying builder, + // // virtual function + // virtual Status AppendSingleVirtual(PyObject* obj) = 0; // Append the contents of a Python sequence to the underlying builder, // virtual version virtual Status AppendMultiple(PyObject* seq, int64_t size) = 0; + virtual Status AppendSingle(PyObject* seq) = 0; + virtual Status AppendItem(PyObject* seq) = 0; + virtual Status AppendNull() = 0; // Append the contents of a Python sequence to the underlying builder, // virtual version @@ -100,7 +103,7 @@ class SeqConverter { // always call Finish to deal with the edge case where a size-0 sequence // was converted with a specific output type, like array([], type=t) RETURN_NOT_OK(Close()); - *out = std::make_shared(chunks_, builder_->type()); + *out = std::make_shared(this->chunks_, builder_->type()); return Status::OK(); } @@ -195,7 +198,7 @@ struct Unbox { // We use CRTP to avoid virtual calls to the AppendItem(), AppendNull(), and // IsNull() on the hot path -template +template class TypedConverter : public SeqConverter { public: using BuilderType = typename TypeTraits::BuilderType; @@ -210,42 +213,39 @@ class TypedConverter : public SeqConverter { bool CheckNull(PyObject* obj) const { return NullChecker::Check(obj); } // Append a missing item (default implementation) - Status AppendNull() { return this->typed_builder_->AppendNull(); } + Status AppendNull() override { return this->typed_builder_->AppendNull(); } // This is overridden in several subclasses, but if an Unbox implementation // is defined, it will be used here - Status AppendItem(PyObject* obj) { return Unbox::Append(typed_builder_, obj); } + // Status AppendItem(PyObject* obj) override { + // return Unbox::Append(this->typed_builder_, obj); + // } - Status AppendSingle(PyObject* obj) { - auto self = checked_cast(this); - return CheckNull(obj) ? self->AppendNull() : self->AppendItem(obj); + Status AppendSingle(PyObject* obj) override { + return CheckNull(obj) ? AppendNull() : AppendItem(obj); } - Status AppendSingleVirtual(PyObject* obj) override { return AppendSingle(obj); } - Status AppendMultiple(PyObject* obj, int64_t size) override { /// Ensure we've allocated enough space - RETURN_NOT_OK(this->typed_builder_->Reserve(size)); + RETURN_NOT_OK(typed_builder_->Reserve(size)); // Iterate over the items adding each one - auto self = checked_cast(this); - return internal::VisitSequence(obj, [self](PyObject* item, bool* /* unused */) { - return self->AppendSingle(item); + return internal::VisitSequence(obj, [this](PyObject* item, bool* /* unused */) { + return this->AppendSingle(item); }); } Status AppendMultipleMasked(PyObject* obj, PyObject* mask, int64_t size) override { /// Ensure we've allocated enough space - RETURN_NOT_OK(this->typed_builder_->Reserve(size)); + RETURN_NOT_OK(typed_builder_->Reserve(size)); // Iterate over the items adding each one - auto self = checked_cast(this); return internal::VisitSequenceMasked( - obj, mask, [self](PyObject* item, bool is_masked, bool* /* unused */) { + obj, mask, [this](PyObject* item, bool is_masked, bool* /* unused */) { if (is_masked) { - return self->AppendNull(); + return this->AppendNull(); } else { // This will also apply the null-checking convention in the event // that the value is not masked - return self->AppendSingle(item); + return this->AppendSingle(item); } }); } @@ -258,10 +258,9 @@ class TypedConverter : public SeqConverter { // Sequence converter for null type template -class NullConverter - : public TypedConverter, null_coding> { +class NullConverter : public TypedConverter { public: - Status AppendItem(PyObject* obj) { + Status AppendItem(PyObject* obj) override { return internal::InvalidValue(obj, "converting to null type"); } }; @@ -270,10 +269,9 @@ class NullConverter // Sequence converter for boolean type template -class BoolConverter - : public TypedConverter, null_coding> { +class BoolConverter : public TypedConverter { public: - Status AppendItem(PyObject* obj) { + Status AppendItem(PyObject* obj) override { if (obj == Py_True) { return this->typed_builder_->Append(true); } else if (obj == Py_False) { @@ -288,17 +286,19 @@ class BoolConverter // Sequence converter template for numeric (integer and floating point) types template -class NumericConverter - : public TypedConverter, null_coding> {}; +class NumericConverter : public TypedConverter { + Status AppendItem(PyObject* obj) override { + return Unbox::Append(this->typed_builder_, obj); + } +}; // ---------------------------------------------------------------------- // Sequence converters for temporal types template -class Date32Converter - : public TypedConverter, null_coding> { +class Date32Converter : public TypedConverter { public: - Status AppendItem(PyObject* obj) { + Status AppendItem(PyObject* obj) override { int32_t t; if (PyDate_Check(obj)) { auto pydate = reinterpret_cast(obj); @@ -311,10 +311,9 @@ class Date32Converter }; template -class Date64Converter - : public TypedConverter, null_coding> { +class Date64Converter : public TypedConverter { public: - Status AppendItem(PyObject* obj) { + Status AppendItem(PyObject* obj) override { int64_t t; if (PyDateTime_Check(obj)) { auto pydate = reinterpret_cast(obj); @@ -332,12 +331,11 @@ class Date64Converter }; template -class Time32Converter - : public TypedConverter, null_coding> { +class Time32Converter : public TypedConverter { public: explicit Time32Converter(TimeUnit::type unit) : unit_(unit) {} - Status AppendItem(PyObject* obj) { + Status AppendItem(PyObject* obj) override { // TODO(kszucs): option for strict conversion? int32_t t; if (PyTime_Check(obj)) { @@ -363,12 +361,11 @@ class Time32Converter }; template -class Time64Converter - : public TypedConverter, null_coding> { +class Time64Converter : public TypedConverter { public: explicit Time64Converter(TimeUnit::type unit) : unit_(unit) {} - Status AppendItem(PyObject* obj) { + Status AppendItem(PyObject* obj) override { int64_t t; if (PyTime_Check(obj)) { // datetime.time stores microsecond resolution @@ -442,14 +439,12 @@ struct PyDateTimeTraits { }; template -class TemporalConverter - : public TypedConverter, - null_coding> { +class TemporalConverter : public TypedConverter { public: explicit TemporalConverter(TimeUnit::type unit) : unit_(unit) {} using traits = PyDateTimeTraits; - Status AppendItem(PyObject* obj) { + Status AppendItem(PyObject* obj) override { int64_t t; if (traits::PyTypeCheck(obj)) { auto pydatetime = reinterpret_cast(obj); @@ -554,10 +549,9 @@ inline Status BuilderAppend(FixedSizeBinaryBuilder* builder, PyObject* obj, } // namespace detail template -class BinaryLikeConverter - : public TypedConverter, null_coding> { +class BinaryLikeConverter : public TypedConverter { public: - Status AppendItem(PyObject* obj) { + Status AppendItem(PyObject* obj) override { // Accessing members of the templated base requires using this-> here bool is_full = false; RETURN_NOT_OK(detail::BuilderAppend(this->typed_builder_, obj, &is_full)); @@ -588,9 +582,7 @@ class FixedWidthBytesConverter // For String/UTF8, if strict_conversions enabled, we reject any non-UTF8, // otherwise we allow but return results as BinaryArray template -class StringConverter - : public TypedConverter, - null_coding> { +class StringConverter : public TypedConverter { public: StringConverter() : binary_count_(0) {} @@ -618,7 +610,7 @@ class StringConverter return detail::AppendPyString(this->typed_builder_, string_view_, is_full); } - Status AppendItem(PyObject* obj) { + Status AppendItem(PyObject* obj) override { bool is_full = false; RETURN_NOT_OK(Append(obj, &is_full)); @@ -634,7 +626,7 @@ class StringConverter return Status::OK(); } - virtual Status GetResult(std::shared_ptr* out) { + Status GetResult(std::shared_ptr* out) override { RETURN_NOT_OK(SeqConverter::GetResult(out)); // If we saw any non-unicode, cast results to BinaryArray @@ -677,22 +669,22 @@ class StringConverter } // Base class for ListConverter and FixedSizeListConverter (to have both work with CRTP) -template -class BaseListConverter : public TypedConverter { +template +class BaseListConverter : public TypedConverter { public: using BuilderType = typename TypeTraits::BuilderType; explicit BaseListConverter(bool from_pandas, bool strict_conversions) : from_pandas_(from_pandas), strict_conversions_(strict_conversions) {} - Status Init(ArrayBuilder* builder) { + Status Init(ArrayBuilder* builder) override { this->builder_ = builder; this->typed_builder_ = checked_cast(builder); - value_type_ = checked_cast(*builder->type()).value_type(); + this->value_type_ = checked_cast(*builder->type()).value_type(); RETURN_NOT_OK( GetConverter(value_type_, from_pandas_, strict_conversions_, &value_converter_)); - return value_converter_->Init(this->typed_builder_->value_builder()); + return this->value_converter_->Init(this->typed_builder_->value_builder()); } template @@ -765,7 +757,7 @@ class BaseListConverter : public TypedConverter "array input"); } return internal::VisitSequence(obj, [this](PyObject* item, bool*) { - return value_converter_->AppendSingleVirtual(item); + return value_converter_->AppendSingle(item); }); } default: { @@ -774,7 +766,7 @@ class BaseListConverter : public TypedConverter } } - Status AppendItem(PyObject* obj) { + Status AppendItem(PyObject* obj) override { RETURN_NOT_OK(this->typed_builder_->Append()); if (PyArray_Check(obj)) { return AppendNdarrayItem(obj); @@ -788,7 +780,7 @@ class BaseListConverter : public TypedConverter return value_converter_->AppendMultiple(obj, list_size); } - virtual Status GetResult(std::shared_ptr* out) { + virtual Status GetResult(std::shared_ptr* out) override { // TODO: Improved handling of chunked children if (value_converter_->num_chunks() > 0) { return Status::Invalid("List child type ", @@ -806,31 +798,25 @@ class BaseListConverter : public TypedConverter }; template -class ListConverter - : public BaseListConverter, - null_coding> { +class ListConverter : public BaseListConverter { public: - using BASE = - BaseListConverter, null_coding>; + using BASE = BaseListConverter; using BASE::BASE; }; template -class FixedSizeListConverter - : public BaseListConverter, - null_coding> { +class FixedSizeListConverter: public BaseListConverter { public: - using BASE = BaseListConverter, - null_coding>; + using BASE = BaseListConverter; using BASE::BASE; - Status Init(ArrayBuilder* builder) { + Status Init(ArrayBuilder* builder) override { RETURN_NOT_OK(BASE::Init(builder)); list_size_ = checked_pointer_cast(builder->type())->list_size(); return Status::OK(); } - Status AppendItem(PyObject* obj) { + Status AppendItem(PyObject* obj) override { // the same as BaseListConverter but with additional length checks RETURN_NOT_OK(this->typed_builder_->Append()); if (PyArray_Check(obj)) { @@ -864,16 +850,15 @@ class FixedSizeListConverter // Define a MapConverter as a ListConverter that uses MapBuilder.value_builder // to append struct of key/value pairs template -class MapConverter - : public BaseListConverter, null_coding> { +class MapConverter : public BaseListConverter { public: - using BASE = BaseListConverter, null_coding>; + using BASE = BaseListConverter; explicit MapConverter(bool from_pandas, bool strict_conversions) : BASE(from_pandas, strict_conversions), key_builder_(nullptr) {} - Status AppendSingleVirtual(PyObject* obj) override { - RETURN_NOT_OK(BASE::AppendSingleVirtual(obj)); + Status AppendSingle(PyObject* obj) override { + RETURN_NOT_OK(BASE::AppendSingle(obj)); return VerifyLastStructAppended(); } @@ -910,18 +895,17 @@ class MapConverter // Convert structs template -class StructConverter - : public TypedConverter, null_coding> { +class StructConverter : public TypedConverter { public: explicit StructConverter(bool from_pandas, bool strict_conversions) : from_pandas_(from_pandas), strict_conversions_(strict_conversions) {} - Status Init(ArrayBuilder* builder) { + Status Init(ArrayBuilder* builder) override { this->builder_ = builder; this->typed_builder_ = checked_cast(builder); auto struct_type = checked_pointer_cast(builder->type()); - num_fields_ = this->typed_builder_->num_fields(); + this->num_fields_ = this->typed_builder_->num_fields(); DCHECK_EQ(num_fields_, struct_type->num_children()); field_name_bytes_list_.reset(PyList_New(num_fields_)); @@ -952,7 +936,7 @@ class StructConverter return Status::OK(); } - Status AppendItem(PyObject* obj) { + Status AppendItem(PyObject* obj) override { RETURN_NOT_OK(this->typed_builder_->Append()); // Note heterogenous sequences are not allowed if (ARROW_PREDICT_FALSE(source_kind_ == SourceKind::UNKNOWN)) { @@ -974,12 +958,12 @@ class StructConverter } // Append a missing item - Status AppendNull() { + Status AppendNull() override { RETURN_NOT_OK(this->typed_builder_->AppendNull()); // Need to also insert a missing item on all child builders // (compare with ListConverter) for (int i = 0; i < num_fields_; i++) { - RETURN_NOT_OK(value_converters_[i]->AppendSingleVirtual(Py_None)); + RETURN_NOT_OK(this->value_converters_[i]->AppendSingle(Py_None)); } return Status::OK(); } @@ -1011,7 +995,7 @@ class StructConverter } // If we come here, it means all keys are absent for (int i = 0; i < num_fields_; i++) { - RETURN_NOT_OK(value_converters_[i]->AppendSingleVirtual(Py_None)); + RETURN_NOT_OK(value_converters_[i]->AppendSingle(Py_None)); } return Status::OK(); } @@ -1033,7 +1017,7 @@ class StructConverter RETURN_IF_PYERROR(); } RETURN_NOT_OK( - value_converters_[i]->AppendSingleVirtual(valueobj ? valueobj : Py_None)); + value_converters_[i]->AppendSingle(valueobj ? valueobj : Py_None)); } return Status::OK(); } @@ -1044,7 +1028,7 @@ class StructConverter } for (int i = 0; i < num_fields_; i++) { PyObject* valueobj = PyTuple_GET_ITEM(obj, i); - RETURN_NOT_OK(value_converters_[i]->AppendSingleVirtual(valueobj)); + RETURN_NOT_OK(value_converters_[i]->AppendSingle(valueobj)); } return Status::OK(); } @@ -1065,12 +1049,9 @@ class StructConverter }; template -class DecimalConverter - : public TypedConverter, - null_coding> { +class DecimalConverter : public TypedConverter { public: - using BASE = - TypedConverter, null_coding>; + using BASE = TypedConverter; Status Init(ArrayBuilder* builder) override { RETURN_NOT_OK(BASE::Init(builder)); @@ -1078,7 +1059,7 @@ class DecimalConverter return Status::OK(); } - Status AppendItem(PyObject* obj) { + Status AppendItem(PyObject* obj) override { Decimal128 value; RETURN_NOT_OK(internal::DecimalFromPyObject(obj, *decimal_type_, &value)); return this->typed_builder_->Append(value); diff --git a/cpp/src/arrow/python/python_to_arrow.h b/cpp/src/arrow/python/python_to_arrow.h index f9d97569ef4..3cc2cf9df0c 100644 --- a/cpp/src/arrow/python/python_to_arrow.h +++ b/cpp/src/arrow/python/python_to_arrow.h @@ -73,6 +73,11 @@ Status ConvertPySequence(PyObject* obj, PyObject* mask, const PyConversionOptions& options, std::shared_ptr* out); +// ARROW_PYTHON_EXPORT +// Status ConvertPySequence2(PyObject* obj, PyObject* mask, +// const PyConversionOptions& options, +// std::shared_ptr* out); + ARROW_PYTHON_EXPORT Status ConvertPySequence(PyObject* obj, const PyConversionOptions& options, std::shared_ptr* out); From 762d77af348626818a8ce25ee0585a56f761cfc6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Tue, 7 Jan 2020 15:27:30 +0100 Subject: [PATCH 2/8] Manually inline to AppendMultiple --- cpp/src/arrow/python/python_to_arrow.cc | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index ca4ea65e39e..177aa59ac99 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -287,6 +287,16 @@ class BoolConverter : public TypedConverter { template class NumericConverter : public TypedConverter { + + Status AppendMultiple(PyObject* obj, int64_t size) override { + /// Ensure we've allocated enough space + RETURN_NOT_OK(this->typed_builder_->Reserve(size)); + // Iterate over the items adding each one + return internal::VisitSequence(obj, [this](PyObject* item, bool* /* unused */) { + return NullChecker::Check(item) ? this->typed_builder_->AppendNull() : Unbox::Append(this->typed_builder_, item); + }); + } + Status AppendItem(PyObject* obj) override { return Unbox::Append(this->typed_builder_, obj); } From 691445c1a79f4832989988c96644ae325f6b9ecf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Tue, 7 Jan 2020 16:50:53 +0100 Subject: [PATCH 3/8] Revert "Manually inline to AppendMultiple" This reverts commit 02f4f43d48e433fca0054c0f289d9eb2b9ed79d8. --- cpp/src/arrow/python/python_to_arrow.cc | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 177aa59ac99..ca4ea65e39e 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -287,16 +287,6 @@ class BoolConverter : public TypedConverter { template class NumericConverter : public TypedConverter { - - Status AppendMultiple(PyObject* obj, int64_t size) override { - /// Ensure we've allocated enough space - RETURN_NOT_OK(this->typed_builder_->Reserve(size)); - // Iterate over the items adding each one - return internal::VisitSequence(obj, [this](PyObject* item, bool* /* unused */) { - return NullChecker::Check(item) ? this->typed_builder_->AppendNull() : Unbox::Append(this->typed_builder_, item); - }); - } - Status AppendItem(PyObject* obj) override { return Unbox::Append(this->typed_builder_, obj); } From 1fdaa7e8136989c9d53892f1cf6958f34904fbaf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Tue, 7 Jan 2020 17:51:07 +0100 Subject: [PATCH 4/8] ValueConverters --- cpp/src/arrow/python/python_to_arrow.cc | 458 +++++++++++++----------- 1 file changed, 254 insertions(+), 204 deletions(-) diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index ca4ea65e39e..caaa354ad88 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -53,6 +53,232 @@ using internal::checked_pointer_cast; namespace py { +// ---------------------------------------------------------------------- +// NullCoding + +enum class NullCoding : char { NONE_ONLY, PANDAS_SENTINELS }; + +template +struct NullChecker {}; + +template <> +struct NullChecker { + static inline bool Check(PyObject* obj) { return obj == Py_None; } +}; + +template <> +struct NullChecker { + static inline bool Check(PyObject* obj) { return internal::PandasObjectIsNull(obj); } +}; + +// ---------------------------------------------------------------------- +// ValueConverters +// +// Typed conversion logic for single python objects are encapsulated in +// ValueConverter structs using SFINAE. +// +// The FromPython medthod is responsible to convert the python object to the +// C++ value counterpart which can be directly appended to the ArrayBuilder or +// Scalar can be constructed from. + +template +struct ValueConverter {}; + +template <> +struct ValueConverter { + static inline Result FromPython(PyObject *obj) { + if (obj == Py_True) { + return true; + } else if (obj == Py_False) { + return false; + } else { + return internal::InvalidValue(obj, "tried to convert to boolean"); + } + } +}; + +template +struct ValueConverter> { + using ValueType = typename Type::c_type; + + static inline Result FromPython(PyObject *obj) { + ValueType value; + RETURN_NOT_OK(internal::CIntFromPython(obj, &value)); + return value; + } +}; + +template <> +struct ValueConverter { + using ValueType = typename HalfFloatType::c_type; + + static inline Result FromPython(PyObject *obj) { + ValueType value; + RETURN_NOT_OK(PyFloat_AsHalf(obj, &value)); + return value; + } +}; + +template <> +struct ValueConverter { + + static inline Result FromPython(PyObject *obj) { + float value; + if (internal::PyFloatScalar_Check(obj)) { + value = static_cast(PyFloat_AsDouble(obj)); + RETURN_IF_PYERROR(); + } else if (internal::PyIntScalar_Check(obj)) { + RETURN_NOT_OK(internal::IntegerScalarToFloat32Safe(obj, &value)); + } else { + return internal::InvalidValue(obj, "tried to convert to float32"); + } + return value; + } +}; + +template <> +struct ValueConverter { + + static inline Result FromPython(PyObject *obj) { + double value; + if (PyFloat_Check(obj)) { + value = PyFloat_AS_DOUBLE(obj); + } else if (internal::PyFloatScalar_Check(obj)) { + // Other kinds of float-y things + value = PyFloat_AsDouble(obj); + RETURN_IF_PYERROR(); + } else if (internal::PyIntScalar_Check(obj)) { + RETURN_NOT_OK(internal::IntegerScalarToDoubleSafe(obj, &value)); + } else { + return internal::InvalidValue(obj, "tried to convert to double"); + } + return value; + } +}; + +template <> +struct ValueConverter { + + static inline Result FromPython(PyObject *obj) { + int32_t value; + if (PyDate_Check(obj)) { + auto pydate = reinterpret_cast(obj); + value = static_cast(internal::PyDate_to_days(pydate)); + } else { + RETURN_NOT_OK(internal::CIntFromPython(obj, &value, "Integer too large for date32")); + } + return value; + } +}; + +template<> +struct ValueConverter { + + static inline Result FromPython(PyObject *obj) { + int64_t value; + if (PyDateTime_Check(obj)) { + auto pydate = reinterpret_cast(obj); + value = internal::PyDateTime_to_ms(pydate); + // Truncate any intraday milliseconds + value -= value % 86400000LL; + } else if (PyDate_Check(obj)) { + auto pydate = reinterpret_cast(obj); + value = internal::PyDate_to_ms(pydate); + } else { + RETURN_NOT_OK(internal::CIntFromPython(obj, &value, "Integer too large for date64")); + } + return value; + } +}; + +template<> +struct ValueConverter { + + static inline Result FromPython(PyObject *obj, TimeUnit::type unit) { + int32_t value; + if (PyTime_Check(obj)) { + // datetime.time stores microsecond resolution + switch (unit) { + case TimeUnit::SECOND: + value = static_cast(internal::PyTime_to_s(obj)); + break; + case TimeUnit::MILLI: + value = static_cast(internal::PyTime_to_ms(obj)); + break; + default: + return Status::UnknownError("Invalid time unit"); + } + } else { + RETURN_NOT_OK(internal::CIntFromPython(obj, &value, "Integer too large for int32")); + } + return value; + } +}; + +template<> +struct ValueConverter { + + static inline Result FromPython(PyObject *obj, TimeUnit::type unit) { + int64_t value; + if (PyTime_Check(obj)) { + // datetime.time stores microsecond resolution + switch (unit) { + case TimeUnit::MICRO: + value = internal::PyTime_to_us(obj); + break; + case TimeUnit::NANO: + value = internal::PyTime_to_ns(obj); + break; + default: + return Status::UnknownError("Invalid time unit"); + } + } else { + RETURN_NOT_OK(internal::CIntFromPython(obj, &value, "Integer too large for int64")); + } + return value; + } +}; + +// template +// struct ValueConverter> { +// using ValueType = PyBytesView; + +// static inline Result FromPython(PyObject *obj) { +// ValueType view; +// RETURN_NOT_OK(view.FromString(obj)); +// return view; +// } +// }; + +// template +// struct ValueConverter> { +// using ValueType = PyBytesView; + +// static inline Result FromPython(PyObject *obj, const Type& type) { +// ValueType view; +// RETURN_NOT_OK(view.FromString(obj)); +// const auto expected_length = type.byte_width(); +// if (ARROW_PREDICT_FALSE(view.size != expected_length)) { +// std::stringstream ss; +// ss << "expected to be length " << expected_length << " was " << view.size; +// return internal::InvalidValue(obj, ss.str()); +// } else { +// return view; +// } +// } +// }; + +// template +// struct ValueConverter> { +// using ValueType = PyBytesView; + +// static inline Result FromPython(PyObject *obj, bool* is_utf8) { +// ValueType view; +// RETURN_NOT_OK(view.FromString(obj, is_utf8)); +// return view; +// } +// }; + // ---------------------------------------------------------------------- // Sequence converter base and CRTP "middle" subclasses @@ -117,87 +343,7 @@ class SeqConverter { std::vector> chunks_; }; -enum class NullCoding : char { NONE_ONLY, PANDAS_SENTINELS }; - -template -struct NullChecker {}; - -template <> -struct NullChecker { - static inline bool Check(PyObject* obj) { return obj == Py_None; } -}; - -template <> -struct NullChecker { - static inline bool Check(PyObject* obj) { return internal::PandasObjectIsNull(obj); } -}; - -// ---------------------------------------------------------------------- -// Helper templates to append PyObject* to builder for each target conversion -// type - -template -struct Unbox {}; - -template -struct Unbox> { - using BuilderType = typename TypeTraits::BuilderType; - static inline Status Append(BuilderType* builder, PyObject* obj) { - typename Type::c_type value; - RETURN_NOT_OK(internal::CIntFromPython(obj, &value)); - return builder->Append(value); - } -}; - -template <> -struct Unbox { - static inline Status Append(HalfFloatBuilder* builder, PyObject* obj) { - npy_half val; - RETURN_NOT_OK(PyFloat_AsHalf(obj, &val)); - return builder->Append(val); - } -}; - -template <> -struct Unbox { - static inline Status Append(FloatBuilder* builder, PyObject* obj) { - if (internal::PyFloatScalar_Check(obj)) { - float val = static_cast(PyFloat_AsDouble(obj)); - RETURN_IF_PYERROR(); - return builder->Append(val); - } else if (internal::PyIntScalar_Check(obj)) { - float val = 0; - RETURN_NOT_OK(internal::IntegerScalarToFloat32Safe(obj, &val)); - return builder->Append(val); - } else { - return internal::InvalidValue(obj, "tried to convert to float32"); - } - } -}; -template <> -struct Unbox { - static inline Status Append(DoubleBuilder* builder, PyObject* obj) { - if (PyFloat_Check(obj)) { - double val = PyFloat_AS_DOUBLE(obj); - return builder->Append(val); - } else if (internal::PyFloatScalar_Check(obj)) { - // Other kinds of float-y things - double val = PyFloat_AsDouble(obj); - RETURN_IF_PYERROR(); - return builder->Append(val); - } else if (internal::PyIntScalar_Check(obj)) { - double val = 0; - RETURN_NOT_OK(internal::IntegerScalarToDoubleSafe(obj, &val)); - return builder->Append(val); - } else { - return internal::InvalidValue(obj, "tried to convert to double"); - } - } -}; - -// We use CRTP to avoid virtual calls to the AppendItem(), AppendNull(), and -// IsNull() on the hot path template class TypedConverter : public SeqConverter { public: @@ -266,126 +412,30 @@ class NullConverter : public TypedConverter { }; // ---------------------------------------------------------------------- -// Sequence converter for boolean type - -template -class BoolConverter : public TypedConverter { - public: - Status AppendItem(PyObject* obj) override { - if (obj == Py_True) { - return this->typed_builder_->Append(true); - } else if (obj == Py_False) { - return this->typed_builder_->Append(false); - } else { - return internal::InvalidValue(obj, "tried to convert to boolean"); - } - } -}; - -// ---------------------------------------------------------------------- -// Sequence converter template for numeric (integer and floating point) types +// Sequence converter template for primitive (integer and floating point bool) types template -class NumericConverter : public TypedConverter { +class PrimitiveConverter : public TypedConverter { Status AppendItem(PyObject* obj) override { - return Unbox::Append(this->typed_builder_, obj); + ARROW_ASSIGN_OR_RAISE(auto value, ValueConverter::FromPython(obj)); + return this->typed_builder_->Append(value); } }; // ---------------------------------------------------------------------- // Sequence converters for temporal types -template -class Date32Converter : public TypedConverter { - public: - Status AppendItem(PyObject* obj) override { - int32_t t; - if (PyDate_Check(obj)) { - auto pydate = reinterpret_cast(obj); - t = static_cast(internal::PyDate_to_days(pydate)); - } else { - RETURN_NOT_OK(internal::CIntFromPython(obj, &t, "Integer too large for date32")); - } - return this->typed_builder_->Append(t); - } -}; - -template -class Date64Converter : public TypedConverter { - public: - Status AppendItem(PyObject* obj) override { - int64_t t; - if (PyDateTime_Check(obj)) { - auto pydate = reinterpret_cast(obj); - t = internal::PyDateTime_to_ms(pydate); - // Truncate any intraday milliseconds - t -= t % 86400000LL; - } else if (PyDate_Check(obj)) { - auto pydate = reinterpret_cast(obj); - t = internal::PyDate_to_ms(pydate); - } else { - RETURN_NOT_OK(internal::CIntFromPython(obj, &t, "Integer too large for date64")); - } - return this->typed_builder_->Append(t); - } -}; - -template -class Time32Converter : public TypedConverter { - public: - explicit Time32Converter(TimeUnit::type unit) : unit_(unit) {} - - Status AppendItem(PyObject* obj) override { - // TODO(kszucs): option for strict conversion? - int32_t t; - if (PyTime_Check(obj)) { - // datetime.time stores microsecond resolution - switch (unit_) { - case TimeUnit::SECOND: - t = static_cast(internal::PyTime_to_s(obj)); - break; - case TimeUnit::MILLI: - t = static_cast(internal::PyTime_to_ms(obj)); - break; - default: - return Status::UnknownError("Invalid time unit"); - } - } else { - RETURN_NOT_OK(internal::CIntFromPython(obj, &t, "Integer too large for int32")); - } - return this->typed_builder_->Append(t); - } - - private: - TimeUnit::type unit_; -}; - -template -class Time64Converter : public TypedConverter { +template +class TimeConverter : public TypedConverter { public: - explicit Time64Converter(TimeUnit::type unit) : unit_(unit) {} + explicit TimeConverter(TimeUnit::type unit) : unit_(unit) {} Status AppendItem(PyObject* obj) override { - int64_t t; - if (PyTime_Check(obj)) { - // datetime.time stores microsecond resolution - switch (unit_) { - case TimeUnit::MICRO: - t = internal::PyTime_to_us(obj); - break; - case TimeUnit::NANO: - t = internal::PyTime_to_ns(obj); - break; - default: - return Status::UnknownError("Invalid time unit"); - } - } else { - RETURN_NOT_OK(internal::CIntFromPython(obj, &t, "Integer too large for int64")); - } - return this->typed_builder_->Append(t); + ARROW_ASSIGN_OR_RAISE(auto value, ValueConverter::FromPython(obj, unit_)); + return this->typed_builder_->Append(value); } - private: + protected: TimeUnit::type unit_; }; @@ -1069,9 +1119,9 @@ class DecimalConverter : public TypedConverter decimal_type_; }; -#define NUMERIC_CONVERTER(TYPE_ENUM, TYPE) \ +#define PRIMITIVE(TYPE_ENUM, TYPE) \ case Type::TYPE_ENUM: \ - *out = std::unique_ptr(new NumericConverter); \ + *out = std::unique_ptr(new PrimitiveConverter); \ break; #define SIMPLE_CONVERTER_CASE(TYPE_ENUM, TYPE_CLASS) \ @@ -1085,24 +1135,24 @@ Status GetConverterFlat(const std::shared_ptr& type, bool strict_conve std::unique_ptr* out) { switch (type->id()) { SIMPLE_CONVERTER_CASE(NA, NullConverter); - SIMPLE_CONVERTER_CASE(BOOL, BoolConverter); - NUMERIC_CONVERTER(INT8, Int8Type); - NUMERIC_CONVERTER(INT16, Int16Type); - NUMERIC_CONVERTER(INT32, Int32Type); - NUMERIC_CONVERTER(INT64, Int64Type); - NUMERIC_CONVERTER(UINT8, UInt8Type); - NUMERIC_CONVERTER(UINT16, UInt16Type); - NUMERIC_CONVERTER(UINT32, UInt32Type); - NUMERIC_CONVERTER(UINT64, UInt64Type); - NUMERIC_CONVERTER(HALF_FLOAT, HalfFloatType); - NUMERIC_CONVERTER(FLOAT, FloatType); - NUMERIC_CONVERTER(DOUBLE, DoubleType); + PRIMITIVE(BOOL, BooleanType); + PRIMITIVE(INT8, Int8Type); + PRIMITIVE(INT16, Int16Type); + PRIMITIVE(INT32, Int32Type); + PRIMITIVE(INT64, Int64Type); + PRIMITIVE(UINT8, UInt8Type); + PRIMITIVE(UINT16, UInt16Type); + PRIMITIVE(UINT32, UInt32Type); + PRIMITIVE(UINT64, UInt64Type); + PRIMITIVE(HALF_FLOAT, HalfFloatType); + PRIMITIVE(FLOAT, FloatType); + PRIMITIVE(DOUBLE, DoubleType); + PRIMITIVE(DATE32, Date32Type); + PRIMITIVE(DATE64, Date64Type); SIMPLE_CONVERTER_CASE(DECIMAL, DecimalConverter); SIMPLE_CONVERTER_CASE(BINARY, BytesConverter); SIMPLE_CONVERTER_CASE(LARGE_BINARY, LargeBytesConverter); SIMPLE_CONVERTER_CASE(FIXED_SIZE_BINARY, FixedWidthBytesConverter); - SIMPLE_CONVERTER_CASE(DATE32, Date32Converter); - SIMPLE_CONVERTER_CASE(DATE64, Date64Converter); case Type::STRING: if (strict_conversions) { *out = std::unique_ptr( @@ -1122,12 +1172,12 @@ Status GetConverterFlat(const std::shared_ptr& type, bool strict_conve } break; case Type::TIME32: { - *out = std::unique_ptr(new Time32Converter( + *out = std::unique_ptr(new TimeConverter( checked_cast(*type).unit())); break; } case Type::TIME64: { - *out = std::unique_ptr(new Time64Converter( + *out = std::unique_ptr(new TimeConverter( checked_cast(*type).unit())); break; } From 13c829c0c669cfdbae5354adb359279ace3cf889 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Tue, 7 Jan 2020 18:21:38 +0100 Subject: [PATCH 5/8] Rename append methods --- cpp/src/arrow/python/python_to_arrow.cc | 178 ++++++++++------------ cpp/src/arrow/python/python_to_arrow_2.cc | 118 ++++++++++++++ 2 files changed, 200 insertions(+), 96 deletions(-) create mode 100644 cpp/src/arrow/python/python_to_arrow_2.cc diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index caaa354ad88..a94d0b3a934 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -75,7 +75,7 @@ struct NullChecker { // ValueConverters // // Typed conversion logic for single python objects are encapsulated in -// ValueConverter structs using SFINAE. +// ValueConverter structs using SFINAE for specialization. // // The FromPython medthod is responsible to convert the python object to the // C++ value counterpart which can be directly appended to the ArrayBuilder or @@ -158,7 +158,6 @@ struct ValueConverter { template <> struct ValueConverter { - static inline Result FromPython(PyObject *obj) { int32_t value; if (PyDate_Check(obj)) { @@ -173,7 +172,6 @@ struct ValueConverter { template<> struct ValueConverter { - static inline Result FromPython(PyObject *obj) { int64_t value; if (PyDateTime_Check(obj)) { @@ -193,7 +191,6 @@ struct ValueConverter { template<> struct ValueConverter { - static inline Result FromPython(PyObject *obj, TimeUnit::type unit) { int32_t value; if (PyTime_Check(obj)) { @@ -217,7 +214,6 @@ struct ValueConverter { template<> struct ValueConverter { - static inline Result FromPython(PyObject *obj, TimeUnit::type unit) { int64_t value; if (PyTime_Check(obj)) { @@ -239,45 +235,40 @@ struct ValueConverter { } }; -// template -// struct ValueConverter> { -// using ValueType = PyBytesView; - -// static inline Result FromPython(PyObject *obj) { -// ValueType view; -// RETURN_NOT_OK(view.FromString(obj)); -// return view; -// } -// }; - -// template -// struct ValueConverter> { -// using ValueType = PyBytesView; - -// static inline Result FromPython(PyObject *obj, const Type& type) { -// ValueType view; -// RETURN_NOT_OK(view.FromString(obj)); -// const auto expected_length = type.byte_width(); -// if (ARROW_PREDICT_FALSE(view.size != expected_length)) { -// std::stringstream ss; -// ss << "expected to be length " << expected_length << " was " << view.size; -// return internal::InvalidValue(obj, ss.str()); -// } else { -// return view; -// } -// } -// }; - -// template -// struct ValueConverter> { -// using ValueType = PyBytesView; - -// static inline Result FromPython(PyObject *obj, bool* is_utf8) { -// ValueType view; -// RETURN_NOT_OK(view.FromString(obj, is_utf8)); -// return view; -// } -// }; +template +struct ValueConverter> { + static inline Result FromPython(PyObject *obj) { + PyBytesView view; + RETURN_NOT_OK(view.FromString(obj)); + return view; + } +}; + +template +struct ValueConverter> { + static inline Result FromPython(PyObject *obj, bool* is_utf8) { + PyBytesView view; + RETURN_NOT_OK(view.FromString(obj, is_utf8)); + return view; + } +}; + +template +struct ValueConverter> { + static inline Result FromPython(PyObject *obj, const Type& type) { + PyBytesView view; + RETURN_NOT_OK(view.FromString(obj)); + const auto expected_length = type.byte_width(); + if (ARROW_PREDICT_FALSE(view.size != expected_length)) { + std::stringstream ss; + ss << "expected to be length " << expected_length << " was " << view.size; + return internal::InvalidValue(obj, ss.str()); + } else { + return view; + } + } +}; + // ---------------------------------------------------------------------- // Sequence converter base and CRTP "middle" subclasses @@ -300,20 +291,22 @@ class SeqConverter { // converting Python objects to Arrow nested types virtual Status Init(ArrayBuilder* builder) = 0; - // // Append a single (non-sequence) Python datum to the underlying builder, - // // virtual function - // virtual Status AppendSingleVirtual(PyObject* obj) = 0; + // Append a single null value to the builder + virtual Status AppendNull() = 0; + + // Append a valid value + virtual Status AppendValue(PyObject* seq) = 0; + + // Append a single python object handling Null values + virtual Status Append(PyObject* seq) = 0; // Append the contents of a Python sequence to the underlying builder, // virtual version - virtual Status AppendMultiple(PyObject* seq, int64_t size) = 0; - virtual Status AppendSingle(PyObject* seq) = 0; - virtual Status AppendItem(PyObject* seq) = 0; - virtual Status AppendNull() = 0; + virtual Status Extend(PyObject* seq, int64_t size) = 0; // Append the contents of a Python sequence to the underlying builder, // virtual version - virtual Status AppendMultipleMasked(PyObject* seq, PyObject* mask, int64_t size) = 0; + virtual Status ExtendMasked(PyObject* seq, PyObject* mask, int64_t size) = 0; virtual Status Close() { if (chunks_.size() == 0 || builder_->length() > 0) { @@ -356,31 +349,24 @@ class TypedConverter : public SeqConverter { return Status::OK(); } - bool CheckNull(PyObject* obj) const { return NullChecker::Check(obj); } - // Append a missing item (default implementation) Status AppendNull() override { return this->typed_builder_->AppendNull(); } - // This is overridden in several subclasses, but if an Unbox implementation - // is defined, it will be used here - // Status AppendItem(PyObject* obj) override { - // return Unbox::Append(this->typed_builder_, obj); - // } - - Status AppendSingle(PyObject* obj) override { - return CheckNull(obj) ? AppendNull() : AppendItem(obj); + // Append null if the obj is None or pandas null otherwise the valid value + Status Append(PyObject* obj) override { + return NullChecker::Check(obj) ? AppendNull() : AppendValue(obj); } - Status AppendMultiple(PyObject* obj, int64_t size) override { + Status Extend(PyObject* obj, int64_t size) override { /// Ensure we've allocated enough space RETURN_NOT_OK(typed_builder_->Reserve(size)); // Iterate over the items adding each one return internal::VisitSequence(obj, [this](PyObject* item, bool* /* unused */) { - return this->AppendSingle(item); + return this->Append(item); }); } - Status AppendMultipleMasked(PyObject* obj, PyObject* mask, int64_t size) override { + Status ExtendMasked(PyObject* obj, PyObject* mask, int64_t size) override { /// Ensure we've allocated enough space RETURN_NOT_OK(typed_builder_->Reserve(size)); // Iterate over the items adding each one @@ -391,7 +377,7 @@ class TypedConverter : public SeqConverter { } else { // This will also apply the null-checking convention in the event // that the value is not masked - return this->AppendSingle(item); + return this->AppendValue(item); } }); } @@ -406,7 +392,7 @@ class TypedConverter : public SeqConverter { template class NullConverter : public TypedConverter { public: - Status AppendItem(PyObject* obj) override { + Status AppendValue(PyObject* obj) override { return internal::InvalidValue(obj, "converting to null type"); } }; @@ -416,7 +402,7 @@ class NullConverter : public TypedConverter { template class PrimitiveConverter : public TypedConverter { - Status AppendItem(PyObject* obj) override { + Status AppendValue(PyObject* obj) override { ARROW_ASSIGN_OR_RAISE(auto value, ValueConverter::FromPython(obj)); return this->typed_builder_->Append(value); } @@ -430,7 +416,7 @@ class TimeConverter : public TypedConverter { public: explicit TimeConverter(TimeUnit::type unit) : unit_(unit) {} - Status AppendItem(PyObject* obj) override { + Status AppendValue(PyObject* obj) override { ARROW_ASSIGN_OR_RAISE(auto value, ValueConverter::FromPython(obj, unit_)); return this->typed_builder_->Append(value); } @@ -494,7 +480,7 @@ class TemporalConverter : public TypedConverter { explicit TemporalConverter(TimeUnit::type unit) : unit_(unit) {} using traits = PyDateTimeTraits; - Status AppendItem(PyObject* obj) override { + Status AppendValue(PyObject* obj) override { int64_t t; if (traits::PyTypeCheck(obj)) { auto pydatetime = reinterpret_cast(obj); @@ -601,7 +587,7 @@ inline Status BuilderAppend(FixedSizeBinaryBuilder* builder, PyObject* obj, template class BinaryLikeConverter : public TypedConverter { public: - Status AppendItem(PyObject* obj) override { + Status AppendValue(PyObject* obj) override { // Accessing members of the templated base requires using this-> here bool is_full = false; RETURN_NOT_OK(detail::BuilderAppend(this->typed_builder_, obj, &is_full)); @@ -636,7 +622,7 @@ class StringConverter : public TypedConverter { public: StringConverter() : binary_count_(0) {} - Status Append(PyObject* obj, bool* is_full) { + Status AppendValue(PyObject* obj, bool* is_full) { if (STRICT) { // Force output to be unicode / utf8 and validate that any binary values // are utf8 @@ -660,9 +646,9 @@ class StringConverter : public TypedConverter { return detail::AppendPyString(this->typed_builder_, string_view_, is_full); } - Status AppendItem(PyObject* obj) override { + Status AppendValue(PyObject* obj) override { bool is_full = false; - RETURN_NOT_OK(Append(obj, &is_full)); + RETURN_NOT_OK(AppendValue(obj, &is_full)); // Exceeded capacity of builder if (ARROW_PREDICT_FALSE(is_full)) { @@ -671,7 +657,7 @@ class StringConverter : public TypedConverter { this->chunks_.emplace_back(std::move(chunk)); // Append the item now that the builder has been reset - RETURN_NOT_OK(Append(obj, &is_full)); + RETURN_NOT_OK(AppendValue(obj, &is_full)); } return Status::OK(); } @@ -707,7 +693,7 @@ class StringConverter : public TypedConverter { #define LIST_FAST_CASE(TYPE, NUMPY_TYPE, ArrowType) \ case Type::TYPE: { \ if (PyArray_DESCR(arr)->type_num != NUMPY_TYPE) { \ - return value_converter_->AppendMultiple(obj, value_length); \ + return value_converter_->Extend(obj, value_length); \ } \ return AppendNdarrayTypedItem(arr); \ } @@ -715,7 +701,7 @@ class StringConverter : public TypedConverter { // Use internal::VisitSequence, fast for NPY_OBJECT but slower otherwise #define LIST_SLOW_CASE(TYPE) \ case Type::TYPE: { \ - return value_converter_->AppendMultiple(obj, value_length); \ + return value_converter_->Extend(obj, value_length); \ } // Base class for ListConverter and FixedSizeListConverter (to have both work with CRTP) @@ -807,7 +793,7 @@ class BaseListConverter : public TypedConverter { "array input"); } return internal::VisitSequence(obj, [this](PyObject* item, bool*) { - return value_converter_->AppendSingle(item); + return value_converter_->Append(item); }); } default: { @@ -816,7 +802,7 @@ class BaseListConverter : public TypedConverter { } } - Status AppendItem(PyObject* obj) override { + Status AppendValue(PyObject* obj) override { RETURN_NOT_OK(this->typed_builder_->Append()); if (PyArray_Check(obj)) { return AppendNdarrayItem(obj); @@ -827,7 +813,7 @@ class BaseListConverter : public TypedConverter { " for conversion to list type"); } int64_t list_size = static_cast(PySequence_Size(obj)); - return value_converter_->AppendMultiple(obj, list_size); + return value_converter_->Extend(obj, list_size); } virtual Status GetResult(std::shared_ptr* out) override { @@ -866,7 +852,7 @@ class FixedSizeListConverter: public BaseListConvertertyped_builder_->Append()); if (PyArray_Check(obj)) { @@ -887,7 +873,7 @@ class FixedSizeListConverter: public BaseListConvertervalue_converter_->AppendMultiple(obj, list_size); + return this->value_converter_->Extend(obj, list_size); } protected: @@ -907,18 +893,18 @@ class MapConverter : public BaseListConverter { explicit MapConverter(bool from_pandas, bool strict_conversions) : BASE(from_pandas, strict_conversions), key_builder_(nullptr) {} - Status AppendSingle(PyObject* obj) override { - RETURN_NOT_OK(BASE::AppendSingle(obj)); + Status Append(PyObject* obj) override { + RETURN_NOT_OK(BASE::Append(obj)); return VerifyLastStructAppended(); } - Status AppendMultiple(PyObject* seq, int64_t size) override { - RETURN_NOT_OK(BASE::AppendMultiple(seq, size)); + Status Extend(PyObject* seq, int64_t size) override { + RETURN_NOT_OK(BASE::Extend(seq, size)); return VerifyLastStructAppended(); } - Status AppendMultipleMasked(PyObject* seq, PyObject* mask, int64_t size) override { - RETURN_NOT_OK(BASE::AppendMultipleMasked(seq, mask, size)); + Status ExtendMasked(PyObject* seq, PyObject* mask, int64_t size) override { + RETURN_NOT_OK(BASE::ExtendMasked(seq, mask, size)); return VerifyLastStructAppended(); } @@ -986,7 +972,7 @@ class StructConverter : public TypedConverter { return Status::OK(); } - Status AppendItem(PyObject* obj) override { + Status AppendValue(PyObject* obj) override { RETURN_NOT_OK(this->typed_builder_->Append()); // Note heterogenous sequences are not allowed if (ARROW_PREDICT_FALSE(source_kind_ == SourceKind::UNKNOWN)) { @@ -1013,7 +999,7 @@ class StructConverter : public TypedConverter { // Need to also insert a missing item on all child builders // (compare with ListConverter) for (int i = 0; i < num_fields_; i++) { - RETURN_NOT_OK(this->value_converters_[i]->AppendSingle(Py_None)); + RETURN_NOT_OK(this->value_converters_[i]->Append(Py_None)); } return Status::OK(); } @@ -1045,7 +1031,7 @@ class StructConverter : public TypedConverter { } // If we come here, it means all keys are absent for (int i = 0; i < num_fields_; i++) { - RETURN_NOT_OK(value_converters_[i]->AppendSingle(Py_None)); + RETURN_NOT_OK(value_converters_[i]->Append(Py_None)); } return Status::OK(); } @@ -1067,7 +1053,7 @@ class StructConverter : public TypedConverter { RETURN_IF_PYERROR(); } RETURN_NOT_OK( - value_converters_[i]->AppendSingle(valueobj ? valueobj : Py_None)); + value_converters_[i]->Append(valueobj ? valueobj : Py_None)); } return Status::OK(); } @@ -1078,7 +1064,7 @@ class StructConverter : public TypedConverter { } for (int i = 0; i < num_fields_; i++) { PyObject* valueobj = PyTuple_GET_ITEM(obj, i); - RETURN_NOT_OK(value_converters_[i]->AppendSingle(valueobj)); + RETURN_NOT_OK(value_converters_[i]->Append(valueobj)); } return Status::OK(); } @@ -1109,7 +1095,7 @@ class DecimalConverter : public TypedConvertertyped_builder_->Append(value); @@ -1356,9 +1342,9 @@ Status ConvertPySequence(PyObject* sequence_source, PyObject* mask, // Convert values if (mask != nullptr && mask != Py_None) { - RETURN_NOT_OK(converter->AppendMultipleMasked(seq, mask, size)); + RETURN_NOT_OK(converter->ExtendMasked(seq, mask, size)); } else { - RETURN_NOT_OK(converter->AppendMultiple(seq, size)); + RETURN_NOT_OK(converter->Extend(seq, size)); } // Retrieve result. Conversion may yield one or more array values diff --git a/cpp/src/arrow/python/python_to_arrow_2.cc b/cpp/src/arrow/python/python_to_arrow_2.cc new file mode 100644 index 00000000000..e026b03a36d --- /dev/null +++ b/cpp/src/arrow/python/python_to_arrow_2.cc @@ -0,0 +1,118 @@ + +///////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////// + +class Value { + + template> + static inline Result FromPython(Type* type, PyObject* obj) { + typename Type::c_type value; + RETURN_NOT_OK(internal::CIntFromPython(obj, &value)); + return value; + } + +}; + +// class Converter; + +class Converter { + public: + // explicit Converter() {} + + virtual ~Converter() = default; + + virtual Result> ToArray(PyObject *obj, int64_t size) = 0; +}; + +template +class PrimitiveConverter : public Converter { + public: + using BuilderType = typename TypeTraits::BuilderType; + using CType = typename Type::c_type; + + explicit PrimitiveConverter(std::unique_ptr builder) + : builder_(std::move(builder)) {} + + Result> ToArray(PyObject *obj, int64_t size) override { + // Type* type = checked_cast(builder_->type()); + BuilderType* builder = checked_cast(builder_.get()); + + // Ensure we've allocated enough space + RETURN_NOT_OK(builder_->Reserve(size)); + + // Iterate over the items adding each one + return internal::VisitSequence(obj, [&, builder](PyObject* item, bool* /* unused */) { + if (item == Py_None) { // predict false + return builder->AppendNull(); + } else { + //ARROW_ASSIGN_OR_RAISE(auto value, 1);//Value::FromPython(type, item)); + CType value; + RETURN_NOT_OK(internal::CIntFromPython(item, &value)); + return builder->Append(value); + } + }); + } + + protected: + std::unique_ptr builder_; +}; + +Result> MakeConverter(const std::shared_ptr& type, + MemoryPool* pool) { + std::unique_ptr builder; + switch (type->id()) { + case Type::INT64: + RETURN_NOT_OK(MakeBuilder(pool, type, &builder)); + return std::make_shared>(std::move(builder)); + break; + default: + return Status::NotImplemented("Sequence converter for type ", type->ToString(), + " not implemented"); + } +} + +Status ConvertPySequence2(PyObject* sequence_source, PyObject* mask, + const PyConversionOptions& options, + std::shared_ptr* out) { + PyAcquireGIL lock; + + PyObject* seq; + OwnedRef tmp_seq_nanny; + + std::shared_ptr real_type; + + int64_t size = options.size; + RETURN_NOT_OK(ConvertToSequenceAndInferSize(sequence_source, &seq, &size)); + tmp_seq_nanny.reset(seq); + + // In some cases, type inference may be "loose", like strings. If the user + // passed pa.string(), then we will error if we encounter any non-UTF8 + // value. If not, then we will allow the result to be a BinaryArray + bool strict_conversions = false; + + if (options.type == nullptr) { + RETURN_NOT_OK(InferArrowType(seq, mask, options.from_pandas, &real_type)); + } else { + real_type = options.type; + strict_conversions = true; + } + DCHECK_GE(size, 0); + + // Create ArrayBuilder for type, then pass into the SeqConverter + // instance. The reason this is created here rather than in GetConverter is + // because of nested types (child SeqConverter objects need the child + // builders created by MakeBuilder) + // std::unique_ptr type_builder; + // RETURN_NOT_OK(MakeBuilder(options.pool, real_type, &type_builder)); + // RETURN_NOT_OK(converter->Init(type_builder.get())); + ARROW_ASSIGN_OR_RAISE(auto converter, MakeConverter(real_type, options.pool)); + ARROW_ASSIGN_OR_RAISE(auto array, converter->ToArray(seq, size)); + + *out = std::make_shared( + std::vector>{array}, real_type); + + return Status::OK(); +} From 39e69e297ad222da0848cb634af966034ef610e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Tue, 7 Jan 2020 18:37:13 +0100 Subject: [PATCH 6/8] Remove prototype implementation --- cpp/src/arrow/python/python_to_arrow_2.cc | 118 ---------------------- 1 file changed, 118 deletions(-) delete mode 100644 cpp/src/arrow/python/python_to_arrow_2.cc diff --git a/cpp/src/arrow/python/python_to_arrow_2.cc b/cpp/src/arrow/python/python_to_arrow_2.cc deleted file mode 100644 index e026b03a36d..00000000000 --- a/cpp/src/arrow/python/python_to_arrow_2.cc +++ /dev/null @@ -1,118 +0,0 @@ - -///////////////////////////////////////////////////////////////////////////// -///////////////////////////////////////////////////////////////////////////// -///////////////////////////////////////////////////////////////////////////// -///////////////////////////////////////////////////////////////////////////// -///////////////////////////////////////////////////////////////////////////// - -class Value { - - template> - static inline Result FromPython(Type* type, PyObject* obj) { - typename Type::c_type value; - RETURN_NOT_OK(internal::CIntFromPython(obj, &value)); - return value; - } - -}; - -// class Converter; - -class Converter { - public: - // explicit Converter() {} - - virtual ~Converter() = default; - - virtual Result> ToArray(PyObject *obj, int64_t size) = 0; -}; - -template -class PrimitiveConverter : public Converter { - public: - using BuilderType = typename TypeTraits::BuilderType; - using CType = typename Type::c_type; - - explicit PrimitiveConverter(std::unique_ptr builder) - : builder_(std::move(builder)) {} - - Result> ToArray(PyObject *obj, int64_t size) override { - // Type* type = checked_cast(builder_->type()); - BuilderType* builder = checked_cast(builder_.get()); - - // Ensure we've allocated enough space - RETURN_NOT_OK(builder_->Reserve(size)); - - // Iterate over the items adding each one - return internal::VisitSequence(obj, [&, builder](PyObject* item, bool* /* unused */) { - if (item == Py_None) { // predict false - return builder->AppendNull(); - } else { - //ARROW_ASSIGN_OR_RAISE(auto value, 1);//Value::FromPython(type, item)); - CType value; - RETURN_NOT_OK(internal::CIntFromPython(item, &value)); - return builder->Append(value); - } - }); - } - - protected: - std::unique_ptr builder_; -}; - -Result> MakeConverter(const std::shared_ptr& type, - MemoryPool* pool) { - std::unique_ptr builder; - switch (type->id()) { - case Type::INT64: - RETURN_NOT_OK(MakeBuilder(pool, type, &builder)); - return std::make_shared>(std::move(builder)); - break; - default: - return Status::NotImplemented("Sequence converter for type ", type->ToString(), - " not implemented"); - } -} - -Status ConvertPySequence2(PyObject* sequence_source, PyObject* mask, - const PyConversionOptions& options, - std::shared_ptr* out) { - PyAcquireGIL lock; - - PyObject* seq; - OwnedRef tmp_seq_nanny; - - std::shared_ptr real_type; - - int64_t size = options.size; - RETURN_NOT_OK(ConvertToSequenceAndInferSize(sequence_source, &seq, &size)); - tmp_seq_nanny.reset(seq); - - // In some cases, type inference may be "loose", like strings. If the user - // passed pa.string(), then we will error if we encounter any non-UTF8 - // value. If not, then we will allow the result to be a BinaryArray - bool strict_conversions = false; - - if (options.type == nullptr) { - RETURN_NOT_OK(InferArrowType(seq, mask, options.from_pandas, &real_type)); - } else { - real_type = options.type; - strict_conversions = true; - } - DCHECK_GE(size, 0); - - // Create ArrayBuilder for type, then pass into the SeqConverter - // instance. The reason this is created here rather than in GetConverter is - // because of nested types (child SeqConverter objects need the child - // builders created by MakeBuilder) - // std::unique_ptr type_builder; - // RETURN_NOT_OK(MakeBuilder(options.pool, real_type, &type_builder)); - // RETURN_NOT_OK(converter->Init(type_builder.get())); - ARROW_ASSIGN_OR_RAISE(auto converter, MakeConverter(real_type, options.pool)); - ARROW_ASSIGN_OR_RAISE(auto array, converter->ToArray(seq, size)); - - *out = std::make_shared( - std::vector>{array}, real_type); - - return Status::OK(); -} From b095214c97a1910742ae7584e338d87ec917f459 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 8 Jan 2020 17:03:12 +0100 Subject: [PATCH 7/8] String conversions --- cpp/src/arrow/python/python_to_arrow.cc | 224 +++++++++++------------- 1 file changed, 107 insertions(+), 117 deletions(-) diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index a94d0b3a934..43593bc3798 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -246,22 +246,46 @@ struct ValueConverter> { template struct ValueConverter> { - static inline Result FromPython(PyObject *obj, bool* is_utf8) { + + static inline Result FromPython(PyObject *obj) { + // strict conversion, force output to be unicode / utf8 and validate that + // any binary values are utf8 + bool is_utf8 = false; + PyBytesView view; + + RETURN_NOT_OK(view.FromString(obj, &is_utf8)); + if (!is_utf8) { + return internal::InvalidValue(obj, "was not a utf8 string"); + } + return view; + } + + static inline Result FromPython(PyObject *obj, bool& is_utf8) { PyBytesView view; - RETURN_NOT_OK(view.FromString(obj, is_utf8)); + + // Non-strict conversion; keep track of whether values are unicode or bytes + if (PyUnicode_Check(obj)) { + is_utf8 = true; + RETURN_NOT_OK(view.FromUnicode(obj)); + } else { + // If not unicode or bytes, FromBinary will error + is_utf8 = false; + RETURN_NOT_OK(view.FromBinary(obj)); + } + return view; } + }; template struct ValueConverter> { - static inline Result FromPython(PyObject *obj, const Type& type) { + static inline Result FromPython(PyObject *obj, int32_t byte_width) { PyBytesView view; RETURN_NOT_OK(view.FromString(obj)); - const auto expected_length = type.byte_width(); - if (ARROW_PREDICT_FALSE(view.size != expected_length)) { + if (ARROW_PREDICT_FALSE(view.size != byte_width)) { std::stringstream ss; - ss << "expected to be length " << expected_length << " was " << view.size; + ss << "expected to be length " << byte_width << " was " << view.size; return internal::InvalidValue(obj, ss.str()); } else { return view; @@ -269,7 +293,6 @@ struct ValueConverter> { } }; - // ---------------------------------------------------------------------- // Sequence converter base and CRTP "middle" subclasses @@ -377,7 +400,7 @@ class TypedConverter : public SeqConverter { } else { // This will also apply the null-checking convention in the event // that the value is not masked - return this->AppendValue(item); + return this->Append(item); // perhaps use AppendValue instead? } }); } @@ -535,131 +558,91 @@ class TemporalConverter : public TypedConverter { // ---------------------------------------------------------------------- // Sequence converters for Binary, FixedSizeBinary, String -namespace detail { +template +class BinaryLikeConverter : public TypedConverter { + public: + using BuilderType = typename TypeTraits::BuilderType; -template -inline Status AppendPyString(BuilderType* builder, const PyBytesView& view, - bool* is_full) { - if (view.size > BuilderType::memory_limit()) { - return Status::Invalid("string too large for datatype"); - } - DCHECK_GE(view.size, 0); - // Did we reach the builder size limit? - if (ARROW_PREDICT_FALSE(builder->value_data_length() + view.size > - BuilderType::memory_limit())) { - *is_full = true; + inline Status AutoChunk(Py_ssize_t size) { + // did we reach the builder size limit? + if (ARROW_PREDICT_FALSE(this->typed_builder_->value_data_length() + size > + BuilderType::memory_limit())) { + // builder would be full, so need to add a new chunk + std::shared_ptr chunk; + RETURN_NOT_OK(this->typed_builder_->Finish(&chunk)); + this->chunks_.emplace_back(std::move(chunk)); + } return Status::OK(); } - RETURN_NOT_OK(builder->Append(::arrow::util::string_view(view.bytes, view.size))); - *is_full = false; - return Status::OK(); -} -inline Status BuilderAppend(BinaryBuilder* builder, PyObject* obj, bool* is_full) { - PyBytesView view; - RETURN_NOT_OK(view.FromString(obj)); - return AppendPyString(builder, view, is_full); -} + Status AppendString(const PyBytesView& view) { + // check that the value fits in the datatype + if (view.size > BuilderType::memory_limit()) { + return Status::Invalid("string too large for datatype"); + } + DCHECK_GE(view.size, 0); -inline Status BuilderAppend(LargeBinaryBuilder* builder, PyObject* obj, bool* is_full) { - PyBytesView view; - RETURN_NOT_OK(view.FromString(obj)); - return AppendPyString(builder, view, is_full); -} + // create a new chunk if the value would overflow the builder + RETURN_NOT_OK(AutoChunk(view.size)); -inline Status BuilderAppend(FixedSizeBinaryBuilder* builder, PyObject* obj, - bool* is_full) { - PyBytesView view; - RETURN_NOT_OK(view.FromString(obj)); - const auto expected_length = - checked_cast(*builder->type()).byte_width(); - if (ARROW_PREDICT_FALSE(view.size != expected_length)) { - std::stringstream ss; - ss << "expected to be length " << expected_length << " was " << view.size; - return internal::InvalidValue(obj, ss.str()); - } + // now we can safely append the value to the builder + RETURN_NOT_OK(this->typed_builder_->Append( + ::arrow::util::string_view(view.bytes, view.size))); - return AppendPyString(builder, view, is_full); -} + return Status::OK(); + } -} // namespace detail + protected: + // Create a single instance of PyBytesView here to prevent unnecessary object + // creation/destruction + PyBytesView string_view_; +}; template -class BinaryLikeConverter : public TypedConverter { +class BinaryConverter : public BinaryLikeConverter { public: - Status AppendValue(PyObject* obj) override { - // Accessing members of the templated base requires using this-> here - bool is_full = false; - RETURN_NOT_OK(detail::BuilderAppend(this->typed_builder_, obj, &is_full)); - - // Exceeded capacity of builder - if (ARROW_PREDICT_FALSE(is_full)) { - std::shared_ptr chunk; - RETURN_NOT_OK(this->typed_builder_->Finish(&chunk)); - this->chunks_.emplace_back(std::move(chunk)); - - // Append the item now that the builder has been reset - return detail::BuilderAppend(this->typed_builder_, obj, &is_full); - } - return Status::OK(); + Status AppendValue(PyObject *obj) override { + ARROW_ASSIGN_OR_RAISE(auto view, ValueConverter::FromPython(obj)); + return this->AppendString(view); } }; template -class BytesConverter : public BinaryLikeConverter {}; +class FixedSizeBinaryConverter : public BinaryLikeConverter { + public: + explicit FixedSizeBinaryConverter(int32_t byte_width) : byte_width_(byte_width) {} -template -class LargeBytesConverter : public BinaryLikeConverter {}; + Status AppendValue(PyObject* obj) override { + ARROW_ASSIGN_OR_RAISE( + this->string_view_, ValueConverter::FromPython(obj, byte_width_)); + return this->AppendString(this->string_view_); + } -template -class FixedWidthBytesConverter - : public BinaryLikeConverter {}; + protected: + int32_t byte_width_; +}; // For String/UTF8, if strict_conversions enabled, we reject any non-UTF8, // otherwise we allow but return results as BinaryArray -template -class StringConverter : public TypedConverter { +template +class StringConverter : public BinaryLikeConverter { public: StringConverter() : binary_count_(0) {} - Status AppendValue(PyObject* obj, bool* is_full) { + Status AppendValue(PyObject* obj) override { if (STRICT) { - // Force output to be unicode / utf8 and validate that any binary values - // are utf8 - bool is_utf8 = false; - RETURN_NOT_OK(string_view_.FromString(obj, &is_utf8)); - if (!is_utf8) { - return internal::InvalidValue(obj, "was not a utf8 string"); - } + // raise if the object is not unicode or not an utf-8 encoded bytes + ARROW_ASSIGN_OR_RAISE(this->string_view_, ValueConverter::FromPython(obj)); } else { - // Non-strict conversion; keep track of whether values are unicode or - // bytes; if any bytes are observe, the result will be bytes - if (PyUnicode_Check(obj)) { - RETURN_NOT_OK(string_view_.FromUnicode(obj)); - } else { - // If not unicode or bytes, FromBinary will error - RETURN_NOT_OK(string_view_.FromBinary(obj)); + // keep track of whether values are unicode or bytes; if any bytes are + // observe, the result will be bytes + bool is_utf8; + ARROW_ASSIGN_OR_RAISE(this->string_view_, ValueConverter::FromPython(obj, is_utf8)); + if (!is_utf8) { ++binary_count_; } } - - return detail::AppendPyString(this->typed_builder_, string_view_, is_full); - } - - Status AppendValue(PyObject* obj) override { - bool is_full = false; - RETURN_NOT_OK(AppendValue(obj, &is_full)); - - // Exceeded capacity of builder - if (ARROW_PREDICT_FALSE(is_full)) { - std::shared_ptr chunk; - RETURN_NOT_OK(this->typed_builder_->Finish(&chunk)); - this->chunks_.emplace_back(std::move(chunk)); - - // Append the item now that the builder has been reset - RETURN_NOT_OK(AppendValue(obj, &is_full)); - } - return Status::OK(); + return this->AppendString(this->string_view_); } Status GetResult(std::shared_ptr* out) override { @@ -669,19 +652,14 @@ class StringConverter : public TypedConverter { if (binary_count_) { // We should have bailed out earlier DCHECK(!STRICT); - auto binary_type = - TypeTraits::type_singleton(); + TypeTraits::type_singleton(); return (*out)->View(binary_type, out); } return Status::OK(); } - private: - // Create a single instance of PyBytesView here to prevent unnecessary object - // creation/destruction - PyBytesView string_view_; - + protected: int64_t binary_count_; }; @@ -1136,9 +1114,19 @@ Status GetConverterFlat(const std::shared_ptr& type, bool strict_conve PRIMITIVE(DATE32, Date32Type); PRIMITIVE(DATE64, Date64Type); SIMPLE_CONVERTER_CASE(DECIMAL, DecimalConverter); - SIMPLE_CONVERTER_CASE(BINARY, BytesConverter); - SIMPLE_CONVERTER_CASE(LARGE_BINARY, LargeBytesConverter); - SIMPLE_CONVERTER_CASE(FIXED_SIZE_BINARY, FixedWidthBytesConverter); + case Type::BINARY: + *out = std::unique_ptr( + new BinaryConverter()); + break; + case Type::LARGE_BINARY: + *out = std::unique_ptr( + new BinaryConverter()); + break; + case Type::FIXED_SIZE_BINARY: + *out = std::unique_ptr( + new FixedSizeBinaryConverter( + checked_cast(*type).byte_width())); + break; case Type::STRING: if (strict_conversions) { *out = std::unique_ptr( @@ -1158,12 +1146,14 @@ Status GetConverterFlat(const std::shared_ptr& type, bool strict_conve } break; case Type::TIME32: { - *out = std::unique_ptr(new TimeConverter( + *out = std::unique_ptr( + new TimeConverter( checked_cast(*type).unit())); break; } case Type::TIME64: { - *out = std::unique_ptr(new TimeConverter( + *out = std::unique_ptr( + new TimeConverter( checked_cast(*type).unit())); break; } From 36e1fa013ebfec631fd896eccc679d0b9ec7733d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 8 Jan 2020 19:11:09 +0100 Subject: [PATCH 8/8] Temporal conversions --- cpp/src/arrow/python/python_to_arrow.cc | 210 +++++++++++++----------- 1 file changed, 117 insertions(+), 93 deletions(-) diff --git a/cpp/src/arrow/python/python_to_arrow.cc b/cpp/src/arrow/python/python_to_arrow.cc index 43593bc3798..a8869cd8250 100644 --- a/cpp/src/arrow/python/python_to_arrow.cc +++ b/cpp/src/arrow/python/python_to_arrow.cc @@ -121,7 +121,6 @@ struct ValueConverter { template <> struct ValueConverter { - static inline Result FromPython(PyObject *obj) { float value; if (internal::PyFloatScalar_Check(obj)) { @@ -138,7 +137,6 @@ struct ValueConverter { template <> struct ValueConverter { - static inline Result FromPython(PyObject *obj) { double value; if (PyFloat_Check(obj)) { @@ -235,6 +233,98 @@ struct ValueConverter { } }; +template<> +struct ValueConverter { + static inline Result FromPython(PyObject *obj, TimeUnit::type unit) { + int64_t value; + if (PyDateTime_Check(obj)) { + auto dt = reinterpret_cast(obj); + switch (unit) { + case TimeUnit::SECOND: + value = internal::PyDateTime_to_s(dt); + break; + case TimeUnit::MILLI: + value = internal::PyDateTime_to_ms(dt); + break; + case TimeUnit::MICRO: + value = internal::PyDateTime_to_us(dt); + break; + case TimeUnit::NANO: + value = internal::PyDateTime_to_ns(dt); + break; + default: + return Status::UnknownError("Invalid time unit"); + } + } else { + RETURN_NOT_OK(internal::CIntFromPython(obj, &value)); + } + return value; + }; + + static inline Result FromNumpy(PyObject *obj, TimeUnit::type unit) { + // validate that the numpy scalar has np.datetime64 dtype + std::shared_ptr type; + RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &type)); + if (type->id() != TimestampType::type_id) { + // TODO(kszucs): the message should highlight the received numpy dtype + return Status::Invalid("Expected np.datetime64 but got: ", type->ToString()); + } + // validate that the time units are matching + if (unit != checked_cast(*type).unit()) { + return Status::NotImplemented( + "Cannot convert NumPy np.datetime64 objects with differing unit"); + } + // convert the numpy value + return reinterpret_cast(obj)->obval; + } +}; + +template<> +struct ValueConverter { + static inline Result FromPython(PyObject *obj, TimeUnit::type unit) { + int64_t value; + if (PyDelta_Check(obj)) { + auto dt = reinterpret_cast(obj); + switch (unit) { + case TimeUnit::SECOND: + value = internal::PyDelta_to_s(dt); + break; + case TimeUnit::MILLI: + value = internal::PyDelta_to_ms(dt); + break; + case TimeUnit::MICRO: + value = internal::PyDelta_to_us(dt); + break; + case TimeUnit::NANO: + value = internal::PyDelta_to_ns(dt); + break; + default: + return Status::UnknownError("Invalid time unit"); + } + } else { + RETURN_NOT_OK(internal::CIntFromPython(obj, &value)); + } + return value; + } + + static inline Result FromNumpy(PyObject *obj, TimeUnit::type unit) { + // validate that the numpy scalar has np.timedelta64 dtype + std::shared_ptr type; + RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &type)); + if (type->id() != DurationType::type_id) { + // TODO(kszucs): the message should highlight the received numpy dtype + return Status::Invalid("Expected np.timedelta64 but got: ", type->ToString()); + } + // validate that the time units are matching + if (unit != checked_cast(*type).unit()) { + return Status::NotImplemented( + "Cannot convert NumPy np.timedelta64 objects with differing unit"); + } + // convert the numpy value + return reinterpret_cast(obj)->obval; + } +}; + template struct ValueConverter> { static inline Result FromPython(PyObject *obj) { @@ -272,10 +362,8 @@ struct ValueConverter> { is_utf8 = false; RETURN_NOT_OK(view.FromBinary(obj)); } - return view; } - }; template @@ -439,6 +527,7 @@ class TimeConverter : public TypedConverter { public: explicit TimeConverter(TimeUnit::type unit) : unit_(unit) {} + // TODO(kszucs): support numpy values for date and time converters Status AppendValue(PyObject* obj) override { ARROW_ASSIGN_OR_RAISE(auto value, ValueConverter::FromPython(obj, unit_)); return this->typed_builder_->Append(value); @@ -448,111 +537,44 @@ class TimeConverter : public TypedConverter { TimeUnit::type unit_; }; -template -struct PyDateTimeTraits {}; +// TODO(kszucs): move it to the type_traits +template +struct NumpyType {}; template <> -struct PyDateTimeTraits { - static inline int PyTypeCheck(PyObject* obj) { return PyDateTime_Check(obj); } - using PyDateTimeObject = PyDateTime_DateTime; - - static inline int64_t py_to_s(PyDateTime_DateTime* pydatetime) { - return internal::PyDateTime_to_s(pydatetime); - } - static inline int64_t py_to_ms(PyDateTime_DateTime* pydatetime) { - return internal::PyDateTime_to_ms(pydatetime); - } - static inline int64_t py_to_us(PyDateTime_DateTime* pydatetime) { - return internal::PyDateTime_to_us(pydatetime); +struct NumpyType { + static inline bool isnull(int64_t v) { + return internal::npy_traits::isnull(v); } - static inline int64_t py_to_ns(PyDateTime_DateTime* pydatetime) { - return internal::PyDateTime_to_ns(pydatetime); - } - - using np_traits = internal::npy_traits; - using NpScalarObject = PyDatetimeScalarObject; - static constexpr const char* const np_name = "datetime64"; }; template <> -struct PyDateTimeTraits { - static inline int PyTypeCheck(PyObject* obj) { return PyDelta_Check(obj); } - using PyDateTimeObject = PyDateTime_Delta; - - static inline int64_t py_to_s(PyDateTime_Delta* pytimedelta) { - return internal::PyDelta_to_s(pytimedelta); - } - static inline int64_t py_to_ms(PyDateTime_Delta* pytimedelta) { - return internal::PyDelta_to_ms(pytimedelta); - } - static inline int64_t py_to_us(PyDateTime_Delta* pytimedelta) { - return internal::PyDelta_to_us(pytimedelta); +struct NumpyType { + static inline bool isnull(int64_t v) { + return internal::npy_traits::isnull(v); } - static inline int64_t py_to_ns(PyDateTime_Delta* pytimedelta) { - return internal::PyDelta_to_ns(pytimedelta); - } - - using np_traits = internal::npy_traits; - using NpScalarObject = PyTimedeltaScalarObject; - static constexpr const char* const np_name = "timedelta64"; }; -template -class TemporalConverter : public TypedConverter { +template +class TemporalConverter : public TimeConverter { public: - explicit TemporalConverter(TimeUnit::type unit) : unit_(unit) {} - using traits = PyDateTimeTraits; + using TimeConverter::TimeConverter; Status AppendValue(PyObject* obj) override { - int64_t t; - if (traits::PyTypeCheck(obj)) { - auto pydatetime = reinterpret_cast(obj); - - switch (unit_) { - case TimeUnit::SECOND: - t = traits::py_to_s(pydatetime); - break; - case TimeUnit::MILLI: - t = traits::py_to_ms(pydatetime); - break; - case TimeUnit::MICRO: - t = traits::py_to_us(pydatetime); - break; - case TimeUnit::NANO: - t = traits::py_to_ns(pydatetime); - break; - default: - return Status::UnknownError("Invalid time unit"); - } - } else if (PyArray_CheckAnyScalarExact(obj)) { - // numpy.datetime64 - using npy_traits = typename traits::np_traits; - static constexpr const char* const np_name = traits::np_name; - - std::shared_ptr type; - RETURN_NOT_OK(NumPyDtypeToArrow(PyArray_DescrFromScalar(obj), &type)); - if (type->id() != ArrowType::type_id) { - return Status::Invalid("Expected np.", np_name, " but got: ", type->ToString()); - } - const ArrowType& ttype = checked_cast(*type); - if (unit_ != ttype.unit()) { - return Status::NotImplemented("Cannot convert NumPy ", np_name, - " objects with differing unit"); - } - - t = reinterpret_cast(obj)->obval; - if (npy_traits::isnull(t)) { + int64_t value; + if (PyArray_CheckAnyScalarExact(obj)) { + // convert np.datetime64 / np.timedelta64 depending on Type + ARROW_ASSIGN_OR_RAISE(value, ValueConverter::FromNumpy(obj, this->unit_)); + if (NumpyType::isnull(value)) { // checks numpy NaT sentinel after conversion return this->typed_builder_->AppendNull(); } } else { - RETURN_NOT_OK(internal::CIntFromPython(obj, &t)); + // convert builtin python objects + ARROW_ASSIGN_OR_RAISE(value, ValueConverter::FromPython(obj, this->unit_)); } - return this->typed_builder_->Append(t); + return this->typed_builder_->Append(value); } - - private: - TimeUnit::type unit_; }; // ---------------------------------------------------------------------- @@ -1159,13 +1181,15 @@ Status GetConverterFlat(const std::shared_ptr& type, bool strict_conve } case Type::TIMESTAMP: { *out = - std::unique_ptr(new TemporalConverter( + std::unique_ptr( + new TemporalConverter( checked_cast(*type).unit())); break; } case Type::DURATION: { *out = - std::unique_ptr(new TemporalConverter( + std::unique_ptr( + new TemporalConverter( checked_cast(*type).unit())); break; }