diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py
index d31c93119b7..655836df06e 100644
--- a/python/pyarrow/__init__.py
+++ b/python/pyarrow/__init__.py
@@ -172,7 +172,7 @@ def print_entry(label, value):
                          union, sparse_union, dense_union,
                          dictionary,
                          run_end_encoded,
-                         bool8, fixed_shape_tensor, opaque, uuid,
+                         bool8, fixed_shape_tensor, json_, opaque, uuid,
                          field,
                          type_for_alias,
                          DataType, DictionaryType, StructType,
@@ -183,7 +183,7 @@ def print_entry(label, value):
                          FixedSizeBinaryType, Decimal128Type, Decimal256Type,
                          BaseExtensionType, ExtensionType,
                          RunEndEncodedType, Bool8Type, FixedShapeTensorType,
-                         OpaqueType, UuidType,
+                         JsonType, OpaqueType, UuidType,
                          PyExtensionType, UnknownExtensionType,
                          register_extension_type, unregister_extension_type,
                          DictionaryMemo,
@@ -218,7 +218,7 @@ def print_entry(label, value):
                          MonthDayNanoIntervalArray, Decimal128Array, Decimal256Array,
                          StructArray, ExtensionArray,
                          RunEndEncodedArray, Bool8Array, FixedShapeTensorArray,
-                         OpaqueArray, UuidArray,
+                         JsonArray, OpaqueArray, UuidArray,
                          scalar, NA, _NULL as NULL, Scalar,
                          NullScalar, BooleanScalar,
                          Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar,
@@ -236,7 +236,7 @@ def print_entry(label, value):
                          FixedSizeBinaryScalar, DictionaryScalar, MapScalar,
                          StructScalar, UnionScalar,
                          RunEndEncodedScalar, Bool8Scalar, ExtensionScalar,
-                         FixedShapeTensorScalar, OpaqueScalar, UuidScalar)
+                         FixedShapeTensorScalar, JsonScalar, OpaqueScalar, UuidScalar)
 
 # Buffers, allocation
 from pyarrow.lib import (DeviceAllocationType, Device, MemoryManager,
diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi
index ae9e7fd777e..eaedbf1e385 100644
--- a/python/pyarrow/array.pxi
+++ b/python/pyarrow/array.pxi
@@ -4344,6 +4344,33 @@ cdef class ExtensionArray(Array):
         return result
 
 
+class JsonArray(ExtensionArray):
+    """
+    Concrete class for Arrow arrays of JSON data type.
+
+    This does not guarantee that the JSON data actually
+    is valid JSON.
+
+    Examples
+    --------
+    Define the extension type for JSON array
+
+    >>> import pyarrow as pa
+    >>> json_type = pa.json_(pa.large_utf8())
+
+    Create an extension array
+
+    >>> arr = [None, '{ "id":30, "values":["a", "b"] }']
+    >>> storage = pa.array(arr, pa.large_utf8())
+    >>> pa.ExtensionArray.from_storage(json_type, storage)
+    <pyarrow.lib.JsonArray object at ...>
+    [
+      null,
+      "{ "id":30, "values":["a", "b"] }"
+    ]
+    """
+
+
 class UuidArray(ExtensionArray):
     """
     Concrete class for Arrow arrays of UUID data type.
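
Not part of the patch, just a reviewer-oriented sketch of the Python API exposed by the hunks above (pa.json_ and the new JsonArray class); it assumes a PyArrow build that includes this change.

import pyarrow as pa

# JSON extension type backed by large_utf8 storage, as in the docstring above
json_type = pa.json_(pa.large_utf8())
storage = pa.array([None, '{"id": 30, "values": ["a", "b"]}'], pa.large_utf8())
arr = pa.ExtensionArray.from_storage(json_type, storage)

assert isinstance(arr, pa.JsonArray)             # wrapped into the new JsonArray subclass
assert arr.type.extension_name == "arrow.json"
assert arr[1].as_py() == '{"id": 30, "values": ["a", "b"]}'   # values come back as plain strings
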
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 8e6922a912a..797cefed8c7 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -2867,6 +2867,13 @@ cdef extern from "arrow/extension_type.h" namespace "arrow":
         shared_ptr[CArray] storage()
 
 
+cdef extern from "arrow/extension/json.h" namespace "arrow::extension" nogil:
+    cdef cppclass CJsonType" arrow::extension::JsonExtensionType"(CExtensionType):
+
+        @staticmethod
+        CResult[shared_ptr[CDataType]] Make(shared_ptr[CDataType]& storage_type)
+
+
 cdef extern from "arrow/extension/uuid.h" namespace "arrow::extension" nogil:
     cdef cppclass CUuidType" arrow::extension::UuidType"(CExtensionType):
 
diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd
index 25a7945dc3d..f3d4e1eec08 100644
--- a/python/pyarrow/lib.pxd
+++ b/python/pyarrow/lib.pxd
@@ -226,6 +226,11 @@ cdef class UuidType(BaseExtensionType):
     cdef:
         const CUuidType* uuid_ext_type
 
+cdef class JsonType(BaseExtensionType):
+    cdef:
+        const CJsonType* json_ext_type
+
+
 cdef class PyExtensionType(ExtensionType):
     pass
 
diff --git a/python/pyarrow/public-api.pxi b/python/pyarrow/public-api.pxi
index d3e2ff2e99d..913e25e3082 100644
--- a/python/pyarrow/public-api.pxi
+++ b/python/pyarrow/public-api.pxi
@@ -131,6 +131,8 @@ cdef api object pyarrow_wrap_data_type(
             out = OpaqueType.__new__(OpaqueType)
         elif extension_name == b"arrow.uuid":
             out = UuidType.__new__(UuidType)
+        elif extension_name == b"arrow.json":
+            out = JsonType.__new__(JsonType)
         else:
             out = BaseExtensionType.__new__(BaseExtensionType)
     else:
diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi
index 68f77832c43..2bfdcddf307 100644
--- a/python/pyarrow/scalar.pxi
+++ b/python/pyarrow/scalar.pxi
@@ -1044,6 +1044,12 @@ cdef class ExtensionScalar(Scalar):
         return pyarrow_wrap_scalar(<shared_ptr[CScalar]> sp_scalar)
 
 
+class JsonScalar(ExtensionScalar):
+    """
+    Concrete class for JSON extension scalar.
+    """
+
+
 class UuidScalar(ExtensionScalar):
     """
     Concrete class for Uuid extension scalar.
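
Again not part of the patch: a minimal sketch of how the JsonScalar wiring above behaves, assuming a build with this change. The scalar exposes the raw JSON string, so parsing is left to the caller.

import json

import pyarrow as pa

arr = pa.array(['{"a": 1}', None], type=pa.json_())
s = arr[0]
assert isinstance(s, pa.JsonScalar)          # resolved via JsonType.__arrow_ext_scalar_class__
assert json.loads(s.as_py()) == {"a": 1}     # as_py() returns the stored string; parse it explicitly
assert arr[1].as_py() is None
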
diff --git a/python/pyarrow/tests/parquet/test_data_types.py b/python/pyarrow/tests/parquet/test_data_types.py
index 79dd9694826..1428f802397 100644
--- a/python/pyarrow/tests/parquet/test_data_types.py
+++ b/python/pyarrow/tests/parquet/test_data_types.py
@@ -510,3 +510,14 @@ def test_large_binary_overflow():
                 pa.ArrowInvalid,
                 match="Parquet cannot store strings with size 2GB or more"):
             _write_table(table, writer, use_dictionary=use_dictionary)
+
+
+@pytest.mark.parametrize("storage_type", (
+    pa.string(), pa.large_string()))
+def test_json_extension_type(storage_type):
+    data = ['{"a": 1}', '{"b": 2}', None]
+    arr = pa.array(data, type=pa.json_(storage_type))
+
+    table = pa.table([arr], names=["ext"])
+
+    _simple_table_roundtrip(table)
diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py
index b74eca75bdc..634d9ce2d8d 100644
--- a/python/pyarrow/tests/test_extension_type.py
+++ b/python/pyarrow/tests/test_extension_type.py
@@ -1926,3 +1926,56 @@ def test_bool8_scalar():
     assert pa.scalar(1, type=pa.bool8()).as_py() is True
     assert pa.scalar(2, type=pa.bool8()).as_py() is True
     assert pa.scalar(None, type=pa.bool8()).as_py() is None
+
+
+@pytest.mark.parametrize("storage_type", (
+    pa.string(), pa.large_string(), pa.string_view()))
+def test_json(storage_type, pickle_module):
+    data = ['{"a": 1}', '{"b": 2}', None]
+    json_type = pa.json_(storage_type)
+    storage = pa.array(data, type=storage_type)
+    array = pa.array(data, type=json_type)
+    json_arr_class = json_type.__arrow_ext_class__()
+
+    assert pa.json_() == pa.json_(pa.utf8())
+    assert json_type.extension_name == "arrow.json"
+    assert json_type.storage_type == storage_type
+    assert json_type.__class__ is pa.JsonType
+
+    assert json_type == pa.json_(storage_type)
+    assert json_type != storage_type
+
+    assert isinstance(array, pa.JsonArray)
+
+    assert array.to_pylist() == data
+    assert array[0].as_py() == data[0]
+    assert array[2].as_py() is None
+
+    # Pickle roundtrip
+    result = pickle_module.loads(pickle_module.dumps(json_type))
+    assert result == json_type
+
+    # IPC roundtrip
+    buf = ipc_write_batch(pa.RecordBatch.from_arrays([array], ["ext"]))
+    batch = ipc_read_batch(buf)
+    reconstructed_array = batch.column(0)
+    assert reconstructed_array.type == json_type
+    assert reconstructed_array == array
+    assert isinstance(array, json_arr_class)
+
+    assert json_type.__arrow_ext_scalar_class__() == pa.JsonScalar
+    assert isinstance(array[0], pa.JsonScalar)
+
+    # cast storage -> extension type
+    result = storage.cast(json_type)
+    assert result == array
+
+    # cast extension type -> storage type
+    inner = array.cast(storage_type)
+    assert inner == storage
+
+    for storage_type in (pa.int32(), pa.large_binary(), pa.float32()):
+        with pytest.raises(
+            pa.ArrowInvalid,
+            match=f"Invalid storage type for JsonExtensionType: {storage_type}"):
+            pa.json_(storage_type)
diff --git a/python/pyarrow/tests/test_misc.py b/python/pyarrow/tests/test_misc.py
index 5d3471c7c35..0b2055018f6 100644
--- a/python/pyarrow/tests/test_misc.py
+++ b/python/pyarrow/tests/test_misc.py
@@ -253,6 +253,9 @@ def test_set_timezone_db_path_non_windows():
     pa.Bool8Array,
     pa.Bool8Scalar,
     pa.Bool8Type,
+    pa.JsonArray,
+    pa.JsonScalar,
+    pa.JsonType,
 ])
 def test_extension_type_constructor_errors(klass):
     # ARROW-2638: prevent calling extension class constructors directly
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index 70f12e9796e..c66ac5f28d3 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -1812,6 +1812,43 @@ cdef class ExtensionType(BaseExtensionType):
         return ExtensionScalar
 
 
+cdef class JsonType(BaseExtensionType):
+    """
+    Concrete class for JSON extension type.
+
+    Examples
+    --------
+    Define the extension type for JSON array
+
+    >>> import pyarrow as pa
+    >>> json_type = pa.json_(pa.large_utf8())
+
+    Create an extension array
+
+    >>> arr = [None, '{ "id":30, "values":["a", "b"] }']
+    >>> storage = pa.array(arr, pa.large_utf8())
+    >>> pa.ExtensionArray.from_storage(json_type, storage)
+    <pyarrow.lib.JsonArray object at ...>
+    [
+      null,
+      "{ "id":30, "values":["a", "b"] }"
+    ]
+    """
+
+    cdef void init(self, const shared_ptr[CDataType]& type) except *:
+        BaseExtensionType.init(self, type)
+        self.json_ext_type = <const CJsonType*> type.get()
+
+    def __arrow_ext_class__(self):
+        return JsonArray
+
+    def __reduce__(self):
+        return json_, (self.storage_type,)
+
+    def __arrow_ext_scalar_class__(self):
+        return JsonScalar
+
+
 cdef class UuidType(BaseExtensionType):
     """
     Concrete class for UUID extension type.
@@ -5296,6 +5333,44 @@ def run_end_encoded(run_end_type, value_type):
     return pyarrow_wrap_data_type(ree_type)
 
 
+def json_(DataType storage_type=utf8()):
+    """
+    Create instance of JSON extension type.
+
+    Parameters
+    ----------
+    storage_type : DataType, default pyarrow.string()
+        The underlying data type. Can be one of the following types:
+        string, large_string, string_view.
+
+    Returns
+    -------
+    type : JsonType
+
+    Examples
+    --------
+    Create an instance of JSON extension type:
+
+    >>> import pyarrow as pa
+    >>> pa.json_(pa.utf8())
+    JsonType(extension<arrow.json>)
+
+    Use the JSON type to create an array:
+
+    >>> pa.array(['{"a": 1}', '{"b": 2}'], type=pa.json_(pa.utf8()))
+    <pyarrow.lib.JsonArray object at ...>
+    [
+      "{"a": 1}",
+      "{"b": 2}"
+    ]
+    """
+
+    cdef JsonType out = JsonType.__new__(JsonType)
+    c_json_ext_type = GetResultValue(CJsonType.Make(storage_type.sp_type))
+    out.init(c_json_ext_type)
+    return out
+
+
 def uuid():
     """
     Create UuidType instance.
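
Not part of the patch: a sketch of the Parquet round trip that test_json_extension_type above exercises, written against the public pyarrow.parquet API and assuming a build with this change. The extension type is expected to be restored from the Arrow schema that write_table embeds by default.

import io

import pyarrow as pa
import pyarrow.parquet as pq

arr = pa.array(['{"a": 1}', '{"b": 2}', None], type=pa.json_(pa.string()))
table = pa.table([arr], names=["ext"])

buf = io.BytesIO()
pq.write_table(table, buf)          # stores the Arrow schema, including extension metadata
buf.seek(0)
result = pq.read_table(buf)

assert result.column("ext").to_pylist() == ['{"a": 1}', '{"b": 2}', None]
assert result.schema.field("ext").type == pa.json_(pa.string())   # extension type survives the round trip
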