diff --git a/.github/workflows/parquet.yml b/.github/workflows/parquet.yml
index 126e4aa3a614..0d1a01ca5e23 100644
--- a/.github/workflows/parquet.yml
+++ b/.github/workflows/parquet.yml
@@ -119,7 +119,9 @@ jobs:
         run: cargo check -p parquet --no-default-features --features flate2 --features flate2-rust_backened
       - name: Check compilation --no-default-features --features flate2 --features flate2-zlib-rs
         run: cargo check -p parquet --no-default-features --features flate2 --features flate2-zlib-rs
-
+      - name: Check compilation --no-default-features --features variant_experimental
+        run: cargo check -p parquet --no-default-features --features variant_experimental
+
   # test the parquet crate builds against wasm32 in stable rust
   wasm32-build:
diff --git a/Cargo.toml b/Cargo.toml
index 722a1cd7ea19..bf0efc37d30a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -104,7 +104,7 @@ parquet = { version = "56.1.0", path = "./parquet", default-features = false }
 # These crates have not yet been released and thus do not use the workspace version
 parquet-variant = { version = "0.1.0", path = "./parquet-variant" }
 parquet-variant-json = { version = "0.1.0", path = "./parquet-variant-json" }
-parquet-variant-compute = { version = "0.1.0", path = "./parquet-variant-json" }
+parquet-variant-compute = { version = "0.1.0", path = "./parquet-variant-compute" }
 
 chrono = { version = "0.4.40", default-features = false, features = ["clock"] }
diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml
index bae90a51f0a8..a39275fb254e 100644
--- a/parquet/Cargo.toml
+++ b/parquet/Cargo.toml
@@ -45,6 +45,10 @@ arrow-data = { workspace = true, optional = true }
 arrow-schema = { workspace = true, optional = true }
 arrow-select = { workspace = true, optional = true }
 arrow-ipc = { workspace = true, optional = true }
+parquet-variant = { workspace = true, optional = true }
+parquet-variant-json = { workspace = true, optional = true }
+parquet-variant-compute = { workspace = true, optional = true }
+
 object_store = { version = "0.12.0", default-features = false, optional = true }
 
 bytes = { version = "1.1", default-features = false, features = ["std"] }
@@ -108,7 +112,7 @@ json = ["serde_json", "base64"]
 # Enable internal testing APIs
 test_common = ["arrow/test_utils"]
 # Experimental, unstable functionality primarily used for testing
-experimental = []
+experimental = ["variant_experimental"]
 # Enable async APIs
 async = ["futures", "tokio"]
 # Enable object_store integration
@@ -124,6 +128,8 @@ encryption = ["dep:ring"]
 # Explicitely enabling rust_backend and zlib-rs features for flate2
 flate2-rust_backened = ["flate2/rust_backend"]
 flate2-zlib-rs = ["flate2/zlib-rs"]
+# Enable parquet variant support
+variant_experimental = ["parquet-variant", "parquet-variant-json", "parquet-variant-compute"]
 
 [[example]]
diff --git a/parquet/README.md b/parquet/README.md
index 8fc72bfbc32a..5e087ac6a929 100644
--- a/parquet/README.md
+++ b/parquet/README.md
@@ -64,9 +64,11 @@ The `parquet` crate provides the following features which may be enabled in your
 - `experimental` - Experimental APIs which may change, even between minor releases
 - `simdutf8` (default) - Use the [`simdutf8`] crate for SIMD-accelerated UTF-8 validation
 - `encryption` - support for reading / writing encrypted Parquet files
+- `variant_experimental` - ⚠️ Experimental [Parquet Variant] support, which may change, even between minor releases.
 
 [`arrow`]: https://crates.io/crates/arrow
 [`simdutf8`]: https://crates.io/crates/simdutf8
+[parquet variant]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md
 
 ## Parquet Feature Status
diff --git a/parquet/src/lib.rs b/parquet/src/lib.rs
index 07a673c295bc..1142a1c4a0d0 100644
--- a/parquet/src/lib.rs
+++ b/parquet/src/lib.rs
@@ -86,6 +86,14 @@
 //! [`ParquetRecordBatchStreamBuilder`]: arrow::async_reader::ParquetRecordBatchStreamBuilder
 //! [`ParquetObjectReader`]: arrow::async_reader::ParquetObjectReader
 //!
+//! ## Variant Logical Type (`variant_experimental` feature)
+//!
+//! The [`variant`] module supports reading and writing Parquet files
+//! with the [Variant Binary Encoding] logical type, which can represent
+//! semi-structured data such as JSON efficiently.
+//!
+//! [Variant Binary Encoding]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md
+//!
 //! ## Read/Write Parquet Directly
 //!
 //! Workloads needing finer-grained control, or to avoid a dependence on arrow,
@@ -179,3 +187,6 @@ pub mod record;
 pub mod schema;
 pub mod thrift;
+
+#[cfg(feature = "variant_experimental")]
+pub mod variant;
diff --git a/parquet/src/variant.rs b/parquet/src/variant.rs
new file mode 100644
index 000000000000..a837a877df76
--- /dev/null
+++ b/parquet/src/variant.rs
@@ -0,0 +1,115 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! ⚠️ Experimental support for reading and writing [`Variant`]s to / from Parquet files ⚠️
+//!
+//! This is a 🚧 Work In Progress
+//!
+//! Note: Requires the `variant_experimental` feature of the `parquet` crate to be enabled.
+//!
+//! # Features
+//! * [`Variant`] represents a variant value, which can be an object, a list, or a primitive.
+//! * [`VariantBuilder`] for building `Variant` values.
+//! * [`VariantArray`] for representing a column of Variant values.
+//! * [`compute`] module with functions for manipulating Variants, such as
+//!   [`variant_get`] to extract a value by path, and functions to convert
+//!   between `Variant` and JSON.
+//!
+//! [Variant Logical Type]: Variant
+//! [`VariantArray`]: compute::VariantArray
+//! [`variant_get`]: compute::variant_get
+//!
+//! # Example: Writing a Parquet file with a Variant column
+//! ```rust
+//! # use parquet::variant::compute::{VariantArray, VariantArrayBuilder};
+//! # use parquet::variant::VariantBuilderExt;
+//! # use std::sync::Arc;
+//! # use arrow_array::{ArrayRef, RecordBatch};
+//! # use parquet::arrow::ArrowWriter;
+//! # fn main() -> Result<(), parquet::errors::ParquetError> {
+//! // Use the VariantArrayBuilder to build a VariantArray
+//! let mut builder = VariantArrayBuilder::new(3);
+//! // row 1: {"name": "Alice"}
+//! let mut variant_builder = builder.variant_builder();
+//! variant_builder.new_object().with_field("name", "Alice").finish();
+//! variant_builder.finish();
+//! let array = builder.build();
+//!
+//! // TODO support writing VariantArray directly
+//! // at the moment it panics when trying to downcast to a struct array
+//! // https://github.com/apache/arrow-rs/issues/8296
+//! // let array: ArrayRef = Arc::new(array);
+//! let array: ArrayRef = Arc::new(array.into_inner());
+//!
+//! // create a RecordBatch with the VariantArray
+//! let batch = RecordBatch::try_from_iter(vec![("data", array)])?;
+//!
+//! // write the RecordBatch to a Parquet file
+//! let file = std::fs::File::create("variant.parquet")?;
+//! let mut writer = ArrowWriter::try_new(file, batch.schema(), None)?;
+//! writer.write(&batch)?;
+//! writer.close()?;
+//!
+//! # std::fs::remove_file("variant.parquet")?;
+//! # Ok(())
+//! # }
+//! ```
+//!
+//! # Example: Writing JSON to a Parquet file with a Variant column
+//! ```rust
+//! # use std::sync::Arc;
+//! # use arrow_array::{ArrayRef, RecordBatch, StringArray};
+//! # use parquet::variant::compute::json_to_variant;
+//! # use parquet::variant::compute::VariantArray;
+//! # use parquet::arrow::ArrowWriter;
+//! # fn main() -> Result<(), parquet::errors::ParquetError> {
+//! // Create an array of JSON strings, simulating a column of JSON data
+//! // TODO use StringViewArray when available
+//! let input_array = StringArray::from(vec![
+//!     Some(r#"{"name": "Alice", "age": 30}"#),
+//!     Some(r#"{"name": "Bob", "age": 25, "address": {"city": "New York"}}"#),
+//!     None,
+//!     Some("{}"),
+//! ]);
+//! let input_array: ArrayRef = Arc::new(input_array);
+//!
+//! // Convert the JSON strings to a VariantArray
+//! let array: VariantArray = json_to_variant(&input_array)?;
+//!
+//! // TODO support writing VariantArray directly
+//! // at the moment it panics when trying to downcast to a struct array
+//! // https://github.com/apache/arrow-rs/issues/8296
+//! // let array: ArrayRef = Arc::new(array);
+//! let array: ArrayRef = Arc::new(array.into_inner());
+//!
+//! // create a RecordBatch with the VariantArray
+//! let batch = RecordBatch::try_from_iter(vec![("data", array)])?;
+//!
+//! // write the RecordBatch to a Parquet file
+//! let file = std::fs::File::create("variant-json.parquet")?;
+//! let mut writer = ArrowWriter::try_new(file, batch.schema(), None)?;
+//! writer.write(&batch)?;
+//! writer.close()?;
+//! # std::fs::remove_file("variant-json.parquet")?;
+//! # Ok(())
+//! # }
+//! ```
+//!
+//! # Example: Reading a Parquet file with a Variant column
+//! (TODO: add example)
+pub use parquet_variant::*;
+pub use parquet_variant_compute as compute;
diff --git a/parquet/tests/simple_variant_integration.rs b/parquet/tests/variant_integration.rs
similarity index 100%
rename from parquet/tests/simple_variant_integration.rs
rename to parquet/tests/variant_integration.rs
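
The module documentation above leaves the read-path example as a TODO. Until that lands, the read path can be sketched against the APIs this patch already exposes: read the file back with the existing `ParquetRecordBatchReaderBuilder`, then rewrap the struct-encoded column as a `VariantArray`. This is a minimal sketch only, not part of the patch; it assumes `VariantArray::try_new` accepts the column as read back from Parquet, that `VariantArray` implements the arrow `Array` trait, and that `VariantArray::value` returns a `Variant` — all of which may change while the feature is experimental.

```rust
use arrow_array::Array; // assumption: VariantArray implements the Array trait
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
use parquet::variant::compute::VariantArray;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Read back the file written by the first example above
    let file = std::fs::File::open("variant.parquet")?;
    let reader = ParquetRecordBatchReaderBuilder::try_new(file)?.build()?;

    for batch in reader {
        let batch = batch?;
        // The Variant column comes back as a struct of binary
        // (metadata, value) fields; rewrap it for typed access.
        // Assumption: VariantArray::try_new accepts the column as-is.
        let variants = VariantArray::try_new(batch.column(0))?;
        for row in 0..variants.len() {
            if !variants.is_null(row) {
                // Assumption: value(row) yields a Variant borrowing from the array
                println!("row {row}: {:?}", variants.value(row));
            }
        }
    }
    Ok(())
}
```

Note this mirrors the round-trip the `variant_integration.rs` test (renamed above) is presumably exercising; once https://github.com/apache/arrow-rs/issues/8296 is resolved, the `into_inner()` / rewrap steps in both directions should become unnecessary.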