NOTE(review): line breaks, indentation and the <...> template arguments /
include names in context lines were lost in extraction and are reconstructed
here; verify them against upstream before applying. Index hashes are the
original ones and do not reflect the edits to the added lines.

diff --git a/python/pyarrow/src/arrow/python/helpers.cc b/python/pyarrow/src/arrow/python/helpers.cc
index ca89ebe9d8b..23cb3138f88 100644
--- a/python/pyarrow/src/arrow/python/helpers.cc
+++ b/python/pyarrow/src/arrow/python/helpers.cc
@@ -379,6 +379,82 @@ void InitPandasStaticData() {
 }
 #endif
 
+// Support conversion path for UUID objects
+namespace {
+
+// This needs a conditional, because using std::once_flag could introduce
+// a deadlock when the GIL is enabled. See
+// https://github.com/apache/arrow/commit/f69061935e92e36e25bb891177ca8bc4f463b272 for
+// more info.
+#ifdef Py_GIL_DISABLED
+static std::once_flag uuid_static_initialized;
+#else
+static bool uuid_static_initialized = false;
+#endif
+
+// Once initialized, these variables hold borrowed references to UUID static data.
+// We should not use OwnedRef here because Python destructors would be
+// called on a finalized interpreter.
+static PyObject* uuid_UUID = nullptr;
+
+void GetUUIDStaticSymbols() {
+  OwnedRef uuid;
+
+  // Import uuid
+  Status s = ImportModule("uuid", &uuid);
+  if (!s.ok()) {
+    return;
+  }
+
+#ifndef Py_GIL_DISABLED
+  // Since ImportModule can release the GIL, another thread could have
+  // already initialized the static data.
+  if (uuid_static_initialized) {
+    return;
+  }
+#endif
+
+  OwnedRef ref;
+
+  // Retain reference to uuid.UUID
+  if (ImportFromModule(uuid.obj(), "UUID", &ref).ok()) {
+    uuid_UUID = ref.obj();
+  }
+}
+
+}  // namespace
+
+#ifdef Py_GIL_DISABLED
+void InitUUIDStaticData() {
+  std::call_once(uuid_static_initialized, GetUUIDStaticSymbols);
+}
+#else
+void InitUUIDStaticData() {
+  // NOTE: This is called with the GIL held. We needn't (and shouldn't,
+  // to avoid deadlocks) use an additional C++ lock (ARROW-10519).
+  if (uuid_static_initialized) {
+    return;
+  }
+  GetUUIDStaticSymbols();
+  uuid_static_initialized = true;
+}
+#endif
+
+bool IsPyUUID(PyObject* obj) {
+  // uuid_UUID stays null until InitUUIDStaticData() has imported uuid.UUID;
+  // PyObject_IsInstance must not be handed a null class.
+  if (uuid_UUID == nullptr) {
+    return false;
+  }
+  const int is_instance = PyObject_IsInstance(obj, uuid_UUID);
+  if (is_instance < 0) {
+    // -1 reports a Python error; converted to bool it would read as "true".
+    PyErr_Clear();
+    return false;
+  }
+  return is_instance == 1;
+}
+
 bool PandasObjectIsNull(PyObject* obj) {
   if (!MayHaveNaN(obj)) {
     return false;
diff --git a/python/pyarrow/src/arrow/python/helpers.h b/python/pyarrow/src/arrow/python/helpers.h
index e2fd8212ae6..592758149fd 100644
--- a/python/pyarrow/src/arrow/python/helpers.h
+++ b/python/pyarrow/src/arrow/python/helpers.h
@@ -98,6 +98,13 @@ inline bool IsPyBinary(PyObject* obj) {
   return PyBytes_Check(obj) || PyByteArray_Check(obj) || PyMemoryView_Check(obj);
 }
 
+// \brief Import uuid.UUID so that IsPyUUID can recognize UUID instances
+void InitUUIDStaticData();
+
+// \brief Check that obj is a uuid.UUID instance
+// \return false if uuid.UUID has not been imported by InitUUIDStaticData
+bool IsPyUUID(PyObject* obj);
+
 // \brief Convert a Python integer into a C integer
 // \param[in] obj A Python integer
 // \param[out] out A pointer to a C integer to hold the result of the conversion
diff --git a/python/pyarrow/src/arrow/python/inference.cc b/python/pyarrow/src/arrow/python/inference.cc
index 1aa7915ba1e..d21735f729a 100644
--- a/python/pyarrow/src/arrow/python/inference.cc
+++ b/python/pyarrow/src/arrow/python/inference.cc
@@ -344,6 +344,7 @@ class TypeInferrer {
         arrow_scalar_count_(0),
         numpy_dtype_count_(0),
         interval_count_(0),
+        uuid_count_(0),
         max_decimal_metadata_(std::numeric_limits<int32_t>::min(),
                               std::numeric_limits<int32_t>::min()),
         decimal_type_() {
@@ -412,6 +413,8 @@ class TypeInferrer {
       ++decimal_count_;
     } else if (PyObject_IsInstance(obj, interval_types_.obj())) {
       ++interval_count_;
+    } else if (internal::IsPyUUID(obj)) {
+      ++uuid_count_;
     } else {
       return internal::InvalidValue(obj,
                                     "did not recognize Python value type when inferring "
@@ -541,6 +544,9 @@ class TypeInferrer {
       *out = utf8();
     } else if (interval_count_) {
       *out = month_day_nano_interval();
+    } else if (uuid_count_) {
+      // WIP: not binary, how do we set to UUID canonical extension type?
+      *out = extension::uuid();
     } else if (arrow_scalar_count_) {
       *out = scalar_type_;
     } else {
@@ -698,6 +704,7 @@ class TypeInferrer {
   int64_t arrow_scalar_count_;
   int64_t numpy_dtype_count_;
   int64_t interval_count_;
+  int64_t uuid_count_;
   std::unique_ptr<TypeInferrer> list_inferrer_;
   std::map<std::string, TypeInferrer> struct_inferrers_;
   std::shared_ptr<DataType> scalar_type_;
@@ -721,6 +728,9 @@ Result<std::shared_ptr<DataType>> InferArrowType(PyObject* obj, PyObject* mask,
     internal::InitPandasStaticData();
   }
 
+  // Support conversion path for uuid.UUID objects
+  internal::InitUUIDStaticData();
+
   std::shared_ptr<DataType> out_type;
   TypeInferrer inferrer(pandas_null_sentinels);
   RETURN_NOT_OK(inferrer.VisitSequence(obj, mask));
diff --git a/python/pyarrow/src/arrow/python/inference.h b/python/pyarrow/src/arrow/python/inference.h
index 983384db118..29585ed26d4 100644
--- a/python/pyarrow/src/arrow/python/inference.h
+++ b/python/pyarrow/src/arrow/python/inference.h
@@ -24,6 +24,7 @@
 
 #include <memory>
 
+#include "arrow/extension/uuid.h"
 #include "arrow/python/visibility.h"
 #include "arrow/type.h"
 #include "arrow/util/macros.h"
diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc
index e7195e99072..aeccf1071a9 100644
--- a/python/pyarrow/src/arrow/python/python_to_arrow.cc
+++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc
@@ -1242,6 +1242,13 @@ Result<std::shared_ptr<ChunkedArray>> ConvertPySequence(PyObject* obj, PyObject*
     internal::InitPandasStaticData();
   }
 
+  ARROW_ASSIGN_OR_RAISE(auto is_uuid_imported, internal::IsModuleImported("uuid"));
+  if (is_uuid_imported) {
+    // If uuid has been already imported initialize the static uuid objects to
+    // support converting uuid.UUID objects
+    internal::InitUUIDStaticData();
+  }
+
   int64_t size = options.size;
   RETURN_NOT_OK(ConvertToSequenceAndInferSize(obj, &seq, &size));
   tmp_seq_nanny.reset(seq);
diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py
index 4160d648294..d428a206180 100644
--- a/python/pyarrow/tests/test_array.py
+++ b/python/pyarrow/tests/test_array.py
@@ -4202,3 +4202,9 @@ def test_non_cpu_array():
         arr.tolist()
     with pytest.raises(NotImplementedError):
         arr.validate(full=True)
+
+
+def test_array_from_uuid():
+    import uuid
+    arr = pa.array([uuid.uuid4() for _ in range(10)])
+    assert len(arr) == 10