diff --git a/.github/workflows/parquet-variant.yml b/.github/workflows/parquet-variant.yml index 6fc5c3a8cd00..6ad4e86be422 100644 --- a/.github/workflows/parquet-variant.yml +++ b/.github/workflows/parquet-variant.yml @@ -46,8 +46,10 @@ jobs: submodules: true - name: Setup Rust toolchain uses: ./.github/actions/setup-builder - - name: Test + - name: Test parquet-variant run: cargo test -p parquet-variant + - name: Test parquet-variant-json + run: cargo test -p parquet-variant-json # test compilation linux-features: @@ -63,6 +65,8 @@ jobs: uses: ./.github/actions/setup-builder - name: Check compilation run: cargo check -p parquet-variant + - name: Check compilation + run: cargo check -p parquet-variant-json clippy: name: Clippy @@ -77,3 +81,5 @@ jobs: run: rustup component add clippy - name: Run clippy run: cargo clippy -p parquet-variant --all-targets --all-features -- -D warnings + - name: Run clippy + run: cargo clippy -p parquet-variant-json --all-targets --all-features -- -D warnings diff --git a/Cargo.toml b/Cargo.toml index 1083c9444c38..5f6861518e14 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -40,6 +40,7 @@ members = [ "arrow-string", "parquet", "parquet-variant", + "parquet-variant-json", "parquet_derive", "parquet_derive_test", ] @@ -99,6 +100,10 @@ arrow-select = { version = "55.2.0", path = "./arrow-select" } arrow-string = { version = "55.2.0", path = "./arrow-string" } parquet = { version = "55.2.0", path = "./parquet", default-features = false } +# These crates have not yet been released and thus do not use the workspace version +parquet-variant = { version = "0.1.0", path = "./parquet-variant"} +parquet-variant-json = { version = "0.1.0", path = "./parquet-variant-json" } + chrono = { version = "0.4.40", default-features = false, features = ["clock"] } # release inherited profile keeping debug information and symbols diff --git a/parquet-variant-json/Cargo.toml b/parquet-variant-json/Cargo.toml new file mode 100644 index 000000000000..830a3c060011 
--- /dev/null +++ b/parquet-variant-json/Cargo.toml @@ -0,0 +1,49 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +[package] +name = "parquet-variant-json" +# This package is still in development and thus the version does +# not follow the versions of the rest of the crates in this repo. 
+version = "0.1.0" +license = { workspace = true } +description = "Apache Parquet Variant to/from JSON" +homepage = { workspace = true } +repository = { workspace = true } +authors = { workspace = true } +keywords = ["arrow", "parquet", "variant"] +readme = "README.md" +edition = { workspace = true } +# needs a newer version than workspace due to +# Error: `Option::<T>::unwrap` is not yet stable as a const fn +rust-version = "1.83" + + +[dependencies] +arrow-schema = { workspace = true } +parquet-variant = { path = "../parquet-variant" } +chrono = { workspace = true } +serde_json = "1.0" +base64 = "0.22" + + +[lib] +name = "parquet_variant_json" +bench = false + +[dev-dependencies] + diff --git a/parquet-variant-json/src/from_json.rs b/parquet-variant-json/src/from_json.rs new file mode 100644 index 000000000000..c0910950367f --- /dev/null +++ b/parquet-variant-json/src/from_json.rs @@ -0,0 +1,690 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
Module for parsing JSON strings as Variant + +use arrow_schema::ArrowError; +use parquet_variant::{ListBuilder, ObjectBuilder, Variant, VariantBuilder, VariantBuilderExt}; +use serde_json::{Number, Value}; + +/// Converts a JSON string to Variant using [`VariantBuilder`]. The resulting `value` and `metadata` +/// buffers can be extracted using `builder.finish()`. +/// +/// # Arguments +/// * `json` - The JSON string to parse as Variant. +/// * `builder` - Object of type `VariantBuilder` used to build the variant from the JSON +/// string +/// +/// # Returns +/// +/// * `Ok(())` if successful +/// * `Err` with error details if the conversion fails +/// +/// ```rust +/// # use parquet_variant::VariantBuilder; +/// # use parquet_variant_json::{ +/// # json_to_variant, variant_to_json_string, variant_to_json, variant_to_json_value +/// # }; +/// +/// let mut variant_builder = VariantBuilder::new(); +/// let person_string = "{\"name\":\"Alice\", \"age\":30, ".to_string() +/// + "\"email\":\"alice@example.com\", \"is_active\": true, \"score\": 95.7," +/// + "\"additional_info\": null}"; +/// json_to_variant(&person_string, &mut variant_builder)?; +/// +/// let (metadata, value) = variant_builder.finish(); +/// +/// let variant = parquet_variant::Variant::try_new(&metadata, &value)?; +/// +/// let json_result = variant_to_json_string(&variant)?; +/// let json_value = variant_to_json_value(&variant)?; +/// +/// let mut buffer = Vec::new(); +/// variant_to_json(&mut buffer, &variant)?; +/// let buffer_result = String::from_utf8(buffer)?; +/// assert_eq!(json_result, "{\"additional_info\":null,\"age\":30,".to_string() + +/// "\"email\":\"alice@example.com\",\"is_active\":true,\"name\":\"Alice\",\"score\":95.7}"); +/// assert_eq!(json_result, buffer_result); +/// assert_eq!(json_result, serde_json::to_string(&json_value)?); +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +pub fn json_to_variant(json: &str, builder: &mut VariantBuilder) -> Result<(), ArrowError> { + let json: Value = 
serde_json::from_str(json) + .map_err(|e| ArrowError::InvalidArgumentError(format!("JSON format error: {e}")))?; + + build_json(&json, builder)?; + Ok(()) +} + +fn build_json(json: &Value, builder: &mut VariantBuilder) -> Result<(), ArrowError> { + append_json(json, builder)?; + Ok(()) +} + +fn variant_from_number<'m, 'v>(n: &Number) -> Result, ArrowError> { + if let Some(i) = n.as_i64() { + // Find minimum Integer width to fit + if i as i8 as i64 == i { + Ok((i as i8).into()) + } else if i as i16 as i64 == i { + Ok((i as i16).into()) + } else if i as i32 as i64 == i { + Ok((i as i32).into()) + } else { + Ok(i.into()) + } + } else { + // Todo: Try decimal once we implement custom JSON parsing where we have access to strings + // Try double - currently json_to_variant does not produce decimal + match n.as_f64() { + Some(f) => return Ok(f.into()), + None => Err(ArrowError::InvalidArgumentError(format!( + "Failed to parse {n} as number", + ))), + }? + } +} + +fn append_json<'m, 'v>( + json: &'v Value, + builder: &mut impl VariantBuilderExt<'m, 'v>, +) -> Result<(), ArrowError> { + match json { + Value::Null => builder.append_value(Variant::Null), + Value::Bool(b) => builder.append_value(*b), + Value::Number(n) => { + builder.append_value(variant_from_number(n)?); + } + Value::String(s) => builder.append_value(s.as_str()), + Value::Array(arr) => { + let mut list_builder = builder.new_list(); + for val in arr { + append_json(val, &mut list_builder)?; + } + list_builder.finish(); + } + Value::Object(obj) => { + let mut obj_builder = builder.new_object(); + for (key, value) in obj.iter() { + let mut field_builder = ObjectFieldBuilder { + key, + builder: &mut obj_builder, + }; + append_json(value, &mut field_builder)?; + } + obj_builder.finish()?; + } + }; + Ok(()) +} + +struct ObjectFieldBuilder<'o, 'v, 's> { + key: &'s str, + builder: &'o mut ObjectBuilder<'v>, +} + +impl<'m, 'v> VariantBuilderExt<'m, 'v> for ObjectFieldBuilder<'_, '_, '_> { + fn append_value(&mut self, 
value: impl Into>) { + self.builder.insert(self.key, value); + } + + fn new_list(&mut self) -> ListBuilder { + self.builder.new_list(self.key) + } + + fn new_object(&mut self) -> ObjectBuilder { + self.builder.new_object(self.key) + } +} + +#[cfg(test)] +mod test { + use super::*; + use crate::variant_to_json_string; + use arrow_schema::ArrowError; + use parquet_variant::{ + ShortString, Variant, VariantBuilder, VariantDecimal16, VariantDecimal4, VariantDecimal8, + }; + + struct JsonToVariantTest<'a> { + json: &'a str, + expected: Variant<'a, 'a>, + } + + impl<'a> JsonToVariantTest<'a> { + fn run(self) -> Result<(), ArrowError> { + let mut variant_builder = VariantBuilder::new(); + json_to_variant(self.json, &mut variant_builder)?; + let (metadata, value) = variant_builder.finish(); + let variant = Variant::try_new(&metadata, &value)?; + assert_eq!(variant, self.expected); + Ok(()) + } + } + + #[test] + fn test_json_to_variant_null() -> Result<(), ArrowError> { + JsonToVariantTest { + json: "null", + expected: Variant::Null, + } + .run() + } + + #[test] + fn test_json_to_variant_boolean_true() -> Result<(), ArrowError> { + JsonToVariantTest { + json: "true", + expected: Variant::BooleanTrue, + } + .run() + } + + #[test] + fn test_json_to_variant_boolean_false() -> Result<(), ArrowError> { + JsonToVariantTest { + json: "false", + expected: Variant::BooleanFalse, + } + .run() + } + + #[test] + fn test_json_to_variant_int8_positive() -> Result<(), ArrowError> { + JsonToVariantTest { + json: " 127 ", + expected: Variant::Int8(127), + } + .run() + } + + #[test] + fn test_json_to_variant_int8_negative() -> Result<(), ArrowError> { + JsonToVariantTest { + json: " -128 ", + expected: Variant::Int8(-128), + } + .run() + } + + #[test] + fn test_json_to_variant_int16() -> Result<(), ArrowError> { + JsonToVariantTest { + json: " 27134 ", + expected: Variant::Int16(27134), + } + .run() + } + + #[test] + fn test_json_to_variant_int32() -> Result<(), ArrowError> { + 
JsonToVariantTest { + json: " -32767431 ", + expected: Variant::Int32(-32767431), + } + .run() + } + + #[test] + fn test_json_to_variant_int64() -> Result<(), ArrowError> { + JsonToVariantTest { + json: "92842754201389", + expected: Variant::Int64(92842754201389), + } + .run() + } + + #[ignore] + #[test] + fn test_json_to_variant_decimal4_basic() -> Result<(), ArrowError> { + JsonToVariantTest { + json: "1.23", + expected: Variant::from(VariantDecimal4::try_new(123, 2)?), + } + .run() + } + + #[ignore] + #[test] + fn test_json_to_variant_decimal4_large_positive() -> Result<(), ArrowError> { + JsonToVariantTest { + json: "99999999.9", + expected: Variant::from(VariantDecimal4::try_new(999999999, 1)?), + } + .run() + } + + #[ignore] + #[test] + fn test_json_to_variant_decimal4_large_negative() -> Result<(), ArrowError> { + JsonToVariantTest { + json: "-99999999.9", + expected: Variant::from(VariantDecimal4::try_new(-999999999, 1)?), + } + .run() + } + + #[ignore] + #[test] + fn test_json_to_variant_decimal4_small_positive() -> Result<(), ArrowError> { + JsonToVariantTest { + json: "0.999999999", + expected: Variant::from(VariantDecimal4::try_new(999999999, 9)?), + } + .run() + } + + #[ignore] + #[test] + fn test_json_to_variant_decimal4_tiny_positive() -> Result<(), ArrowError> { + JsonToVariantTest { + json: "0.000000001", + expected: Variant::from(VariantDecimal4::try_new(1, 9)?), + } + .run() + } + + #[ignore] + #[test] + fn test_json_to_variant_decimal4_small_negative() -> Result<(), ArrowError> { + JsonToVariantTest { + json: "-0.999999999", + expected: Variant::from(VariantDecimal4::try_new(-999999999, 9)?), + } + .run() + } + + #[ignore] + #[test] + fn test_json_to_variant_decimal8_positive() -> Result<(), ArrowError> { + JsonToVariantTest { + json: "999999999.0", + expected: Variant::from(VariantDecimal8::try_new(9999999990, 1)?), + } + .run() + } + + #[ignore] + #[test] + fn test_json_to_variant_decimal8_negative() -> Result<(), ArrowError> { + 
JsonToVariantTest { + json: "-999999999.0", + expected: Variant::from(VariantDecimal8::try_new(-9999999990, 1)?), + } + .run() + } + + #[ignore] + #[test] + fn test_json_to_variant_decimal8_high_precision() -> Result<(), ArrowError> { + JsonToVariantTest { + json: "0.999999999999999999", + expected: Variant::from(VariantDecimal8::try_new(999999999999999999, 18)?), + } + .run() + } + + #[ignore] + #[test] + fn test_json_to_variant_decimal8_large_with_scale() -> Result<(), ArrowError> { + JsonToVariantTest { + json: "9999999999999999.99", + expected: Variant::from(VariantDecimal8::try_new(999999999999999999, 2)?), + } + .run() + } + + #[ignore] + #[test] + fn test_json_to_variant_decimal8_large_negative_with_scale() -> Result<(), ArrowError> { + JsonToVariantTest { + json: "-9999999999999999.99", + expected: Variant::from(VariantDecimal8::try_new(-999999999999999999, 2)?), + } + .run() + } + + #[ignore] + #[test] + fn test_json_to_variant_decimal16_large_integer() -> Result<(), ArrowError> { + JsonToVariantTest { + json: "9999999999999999999", // integer larger than i64 + expected: Variant::from(VariantDecimal16::try_new(9999999999999999999, 0)?), + } + .run() + } + + #[ignore] + #[test] + fn test_json_to_variant_decimal16_high_precision() -> Result<(), ArrowError> { + JsonToVariantTest { + json: "0.9999999999999999999", + expected: Variant::from(VariantDecimal16::try_new(9999999999999999999, 19)?), + } + .run() + } + + #[ignore] + #[test] + fn test_json_to_variant_decimal16_max_value() -> Result<(), ArrowError> { + JsonToVariantTest { + json: "79228162514264337593543950335", // 2 ^ 96 - 1 + expected: Variant::from(VariantDecimal16::try_new(79228162514264337593543950335, 0)?), + } + .run() + } + + #[ignore] + #[test] + fn test_json_to_variant_decimal16_max_scale() -> Result<(), ArrowError> { + JsonToVariantTest { + json: "7.9228162514264337593543950335", // using scale higher than this falls into double + // since the max scale is 28. 
+ expected: Variant::from(VariantDecimal16::try_new( + 79228162514264337593543950335, + 28, + )?), + } + .run() + } + + #[test] + fn test_json_to_variant_double_precision() -> Result<(), ArrowError> { + JsonToVariantTest { + json: "0.79228162514264337593543950335", + expected: Variant::Double(0.792_281_625_142_643_4_f64), + } + .run() + } + + #[test] + fn test_json_to_variant_double_scientific_positive() -> Result<(), ArrowError> { + JsonToVariantTest { + json: "15e-1", + expected: Variant::Double(15e-1f64), + } + .run() + } + + #[test] + fn test_json_to_variant_double_scientific_negative() -> Result<(), ArrowError> { + JsonToVariantTest { + json: "-15e-1", + expected: Variant::Double(-15e-1f64), + } + .run() + } + + #[test] + fn test_json_to_variant_short_string() -> Result<(), ArrowError> { + JsonToVariantTest { + json: "\"harsh\"", + expected: Variant::ShortString(ShortString::try_new("harsh")?), + } + .run() + } + + #[test] + fn test_json_to_variant_short_string_max_length() -> Result<(), ArrowError> { + JsonToVariantTest { + json: &format!("\"{}\"", "a".repeat(63)), + expected: Variant::ShortString(ShortString::try_new(&"a".repeat(63))?), + } + .run() + } + + #[test] + fn test_json_to_variant_long_string() -> Result<(), ArrowError> { + JsonToVariantTest { + json: &format!("\"{}\"", "a".repeat(64)), + expected: Variant::String(&"a".repeat(64)), + } + .run() + } + + #[test] + fn test_json_to_variant_very_long_string() -> Result<(), ArrowError> { + JsonToVariantTest { + json: &format!("\"{}\"", "b".repeat(100000)), + expected: Variant::String(&"b".repeat(100000)), + } + .run() + } + + #[test] + fn test_json_to_variant_array_simple() -> Result<(), ArrowError> { + let mut variant_builder = VariantBuilder::new(); + let mut list_builder = variant_builder.new_list(); + list_builder.append_value(Variant::Int8(127)); + list_builder.append_value(Variant::Int16(128)); + list_builder.append_value(Variant::Int32(-32767431)); + list_builder.finish(); + let (metadata, value) 
= variant_builder.finish(); + let variant = Variant::try_new(&metadata, &value)?; + + JsonToVariantTest { + json: "[127, 128, -32767431]", + expected: variant, + } + .run() + } + + #[test] + fn test_json_to_variant_array_with_object() -> Result<(), ArrowError> { + let mut variant_builder = VariantBuilder::new(); + let mut list_builder = variant_builder.new_list(); + let mut object_builder_inner = list_builder.new_object(); + object_builder_inner.insert("age", Variant::Int8(32)); + object_builder_inner.finish().unwrap(); + list_builder.append_value(Variant::Int16(128)); + list_builder.append_value(Variant::BooleanFalse); + list_builder.finish(); + let (metadata, value) = variant_builder.finish(); + let variant = Variant::try_new(&metadata, &value)?; + + JsonToVariantTest { + json: "[{\"age\": 32}, 128, false]", + expected: variant, + } + .run() + } + + #[test] + fn test_json_to_variant_array_large_u16_offset() -> Result<(), ArrowError> { + // u16 offset - 128 i8's + 1 "true" = 257 bytes + let mut variant_builder = VariantBuilder::new(); + let mut list_builder = variant_builder.new_list(); + for _ in 0..128 { + list_builder.append_value(Variant::Int8(1)); + } + list_builder.append_value(Variant::BooleanTrue); + list_builder.finish(); + let (metadata, value) = variant_builder.finish(); + let variant = Variant::try_new(&metadata, &value)?; + + JsonToVariantTest { + json: &format!("[{} true]", "1, ".repeat(128)), + expected: variant, + } + .run() + } + + #[test] + fn test_json_to_variant_array_nested_large() -> Result<(), ArrowError> { + // verify u24, and large_size + let mut variant_builder = VariantBuilder::new(); + let mut list_builder = variant_builder.new_list(); + for _ in 0..256 { + let mut list_builder_inner = list_builder.new_list(); + for _ in 0..255 { + list_builder_inner.append_value(Variant::Null); + } + list_builder_inner.finish(); + } + list_builder.finish(); + let (metadata, value) = variant_builder.finish(); + let variant = Variant::try_new(&metadata, 
&value)?; + let intermediate = format!("[{}]", vec!["null"; 255].join(", ")); + let json = format!("[{}]", vec![intermediate; 256].join(", ")); + JsonToVariantTest { + json: json.as_str(), + expected: variant, + } + .run() + } + + #[test] + fn test_json_to_variant_object_simple() -> Result<(), ArrowError> { + let mut variant_builder = VariantBuilder::new(); + let mut object_builder = variant_builder.new_object(); + object_builder.insert("a", Variant::Int8(3)); + object_builder.insert("b", Variant::Int8(2)); + object_builder.finish().unwrap(); + let (metadata, value) = variant_builder.finish(); + let variant = Variant::try_new(&metadata, &value)?; + JsonToVariantTest { + json: "{\"b\": 2, \"a\": 1, \"a\": 3}", + expected: variant, + } + .run() + } + + #[test] + fn test_json_to_variant_object_complex() -> Result<(), ArrowError> { + let mut variant_builder = VariantBuilder::new(); + let mut object_builder = variant_builder.new_object(); + let mut inner_list_builder = object_builder.new_list("booleans"); + inner_list_builder.append_value(Variant::BooleanTrue); + inner_list_builder.append_value(Variant::BooleanFalse); + inner_list_builder.finish(); + object_builder.insert("null", Variant::Null); + let mut inner_list_builder = object_builder.new_list("numbers"); + inner_list_builder.append_value(Variant::Int8(4)); + inner_list_builder.append_value(Variant::Double(-3e0)); + inner_list_builder.append_value(Variant::Double(1001e-3)); + inner_list_builder.finish(); + object_builder.finish().unwrap(); + let (metadata, value) = variant_builder.finish(); + let variant = Variant::try_new(&metadata, &value)?; + JsonToVariantTest { + json: "{\"numbers\": [4, -3e0, 1001e-3], \"null\": null, \"booleans\": [true, false]}", + expected: variant, + } + .run() + } + + #[test] + fn test_json_to_variant_object_very_large() -> Result<(), ArrowError> { + // 256 elements (keys: 000-255) - each element is an object of 256 elements (240-495) - each + // element a list of numbers from 0-127 + 
let keys: Vec = (0..=255).map(|n| format!("{n:03}")).collect(); + let innermost_list: String = format!( + "[{}]", + (0..=127) + .map(|n| format!("{n}")) + .collect::>() + .join(",") + ); + let inner_keys: Vec = (240..=495).map(|n| format!("{n}")).collect(); + let inner_object = format!( + "{{{}:{}}}", + inner_keys + .iter() + .map(|k| format!("\"{k}\"")) + .collect::>() + .join(format!(":{innermost_list},").as_str()), + innermost_list + ); + let json = format!( + "{{{}:{}}}", + keys.iter() + .map(|k| format!("\"{k}\"")) + .collect::>() + .join(format!(":{inner_object},").as_str()), + inner_object + ); + // Manually verify raw JSON value size + let mut variant_builder = VariantBuilder::new(); + json_to_variant(&json, &mut variant_builder)?; + let (metadata, value) = variant_builder.finish(); + let v = Variant::try_new(&metadata, &value)?; + let output_string = variant_to_json_string(&v)?; + assert_eq!(output_string, json); + // Verify metadata size = 1 + 2 + 2 * 497 + 3 * 496 + assert_eq!(metadata.len(), 2485); + // Verify value size. 
+ // Size of innermost_list: 1 + 1 + 258 + 256 = 516 + // Size of inner object: 1 + 4 + 256 + 257 * 3 + 256 * 516 = 133128 + // Size of json: 1 + 4 + 512 + 1028 + 256 * 133128 = 34082313 + assert_eq!(value.len(), 34082313); + + let mut variant_builder = VariantBuilder::new(); + let mut object_builder = variant_builder.new_object(); + keys.iter().for_each(|key| { + let mut inner_object_builder = object_builder.new_object(key); + inner_keys.iter().for_each(|inner_key| { + let mut list_builder = inner_object_builder.new_list(inner_key); + for i in 0..=127 { + list_builder.append_value(Variant::Int8(i)); + } + list_builder.finish(); + }); + inner_object_builder.finish().unwrap(); + }); + object_builder.finish().unwrap(); + let (metadata, value) = variant_builder.finish(); + let variant = Variant::try_new(&metadata, &value)?; + + JsonToVariantTest { + json: &json, + expected: variant, + } + .run() + } + + #[test] + fn test_json_to_variant_unicode() -> Result<(), ArrowError> { + let json = "{\"爱\":\"अ\",\"a\":1}"; + let mut variant_builder = VariantBuilder::new(); + json_to_variant(json, &mut variant_builder)?; + let (metadata, value) = variant_builder.finish(); + let v = Variant::try_new(&metadata, &value)?; + let output_string = variant_to_json_string(&v)?; + assert_eq!(output_string, "{\"a\":1,\"爱\":\"अ\"}"); + let mut variant_builder = VariantBuilder::new(); + let mut object_builder = variant_builder.new_object(); + object_builder.insert("a", Variant::Int8(1)); + object_builder.insert("爱", Variant::ShortString(ShortString::try_new("अ")?)); + object_builder.finish().unwrap(); + let (metadata, value) = variant_builder.finish(); + let variant = Variant::try_new(&metadata, &value)?; + + assert_eq!( + value, + &[2u8, 2u8, 0u8, 1u8, 0u8, 2u8, 6u8, 12u8, 1u8, 13u8, 0xe0u8, 0xa4u8, 0x85u8] + ); + assert_eq!( + metadata, + &[17u8, 2u8, 0u8, 1u8, 4u8, 97u8, 0xe7u8, 0x88u8, 0xb1u8] + ); + JsonToVariantTest { + json, + expected: variant, + } + .run() + } +} diff --git 
a/parquet-variant-json/src/lib.rs b/parquet-variant-json/src/lib.rs new file mode 100644 index 000000000000..bb774c05c135 --- /dev/null +++ b/parquet-variant-json/src/lib.rs @@ -0,0 +1,38 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Conversion between [JSON] and the [Variant Binary Encoding] from [Apache Parquet]. +//! +//! [JSON]: https://www.json.org/json-en.html +//! [Variant Binary Encoding]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md +//! [Apache Parquet]: https://parquet.apache.org/ +//! +//! * See [`json_to_variant`] for converting a JSON string to a Variant. +//! * See [`variant_to_json`] for converting a Variant to a JSON string. +//! +//! ## 🚧 Work In Progress +//! +//! This crate is under active development and is not yet ready for production use. +//! If you are interested in helping, you can find more information on the GitHub [Variant issue] +//! +//! 
[Variant issue]: https://github.com/apache/arrow-rs/issues/6736 + +mod from_json; +mod to_json; + +pub use from_json::json_to_variant; +pub use to_json::{variant_to_json, variant_to_json_string, variant_to_json_value}; diff --git a/parquet-variant/src/to_json.rs b/parquet-variant-json/src/to_json.rs similarity index 97% rename from parquet-variant/src/to_json.rs rename to parquet-variant-json/src/to_json.rs index b27fca6108d2..55e024a66c4a 100644 --- a/parquet-variant/src/to_json.rs +++ b/parquet-variant-json/src/to_json.rs @@ -21,7 +21,7 @@ use base64::{engine::general_purpose, Engine as _}; use serde_json::Value; use std::io::Write; -use crate::variant::{Variant, VariantList, VariantObject}; +use parquet_variant::{Variant, VariantList, VariantObject}; // Format string constants to avoid duplication and reduce errors const DATE_FORMAT: &str = "%Y-%m-%d"; @@ -61,7 +61,8 @@ fn format_binary_base64(bytes: &[u8]) -> String { /// /// /// ```rust -/// # use parquet_variant::{Variant, variant_to_json}; +/// # use parquet_variant::{Variant}; +/// # use parquet_variant_json::variant_to_json; /// # use arrow_schema::ArrowError; /// let variant = Variant::from("Hello, World!"); /// let mut buffer = Vec::new(); @@ -72,7 +73,8 @@ fn format_binary_base64(bytes: &[u8]) -> String { /// /// # Example: Create a [`Variant::Object`] and convert to JSON /// ```rust -/// # use parquet_variant::{Variant, VariantBuilder, variant_to_json}; +/// # use parquet_variant::{Variant, VariantBuilder}; +/// # use parquet_variant_json::variant_to_json; /// # use arrow_schema::ArrowError; /// let mut builder = VariantBuilder::new(); /// // Create an object builder that will write fields to the object @@ -203,7 +205,8 @@ fn convert_array_to_json(buffer: &mut impl Write, arr: &VariantList) -> Result<( /// # Examples /// /// ```rust -/// # use parquet_variant::{Variant, variant_to_json_string}; +/// # use parquet_variant::{Variant}; +/// # use parquet_variant_json::variant_to_json_string; /// # use 
arrow_schema::ArrowError; /// let variant = Variant::Int32(42); /// let json = variant_to_json_string(&variant)?; @@ -222,7 +225,8 @@ fn convert_array_to_json(buffer: &mut impl Write, arr: &VariantList) -> Result<( /// ``` /// /// ```rust -/// # use parquet_variant::{Variant, VariantBuilder, variant_to_json_string}; +/// # use parquet_variant::{Variant, VariantBuilder}; +/// # use parquet_variant_json::variant_to_json_string; /// # use arrow_schema::ArrowError; /// let mut builder = VariantBuilder::new(); /// // Create an object builder that will write fields to the object @@ -263,7 +267,8 @@ pub fn variant_to_json_string(variant: &Variant) -> Result { /// # Examples /// /// ```rust -/// # use parquet_variant::{Variant, variant_to_json_value}; +/// # use parquet_variant::{Variant}; +/// # use parquet_variant_json::variant_to_json_value; /// # use serde_json::Value; /// # use arrow_schema::ArrowError; /// let variant = Variant::from("hello"); @@ -366,8 +371,8 @@ pub fn variant_to_json_value(variant: &Variant) -> Result { #[cfg(test)] mod tests { use super::*; - use crate::{Variant, VariantDecimal16, VariantDecimal4, VariantDecimal8}; use chrono::{DateTime, NaiveDate, Utc}; + use parquet_variant::{VariantDecimal16, VariantDecimal4, VariantDecimal8}; #[test] fn test_decimal_edge_cases() -> Result<(), ArrowError> { @@ -490,7 +495,7 @@ mod tests { #[test] fn test_short_string_to_json() -> Result<(), ArrowError> { - use crate::variant::ShortString; + use parquet_variant::ShortString; let short_string = ShortString::try_new("short")?; let variant = Variant::ShortString(short_string); let json = variant_to_json_string(&variant)?; @@ -598,7 +603,7 @@ mod tests { #[test] fn test_primitive_json_conversion() { - use crate::variant::ShortString; + use parquet_variant::ShortString; // Null JsonTest { @@ -848,7 +853,7 @@ mod tests { #[test] fn test_simple_object_to_json() -> Result<(), ArrowError> { - use crate::builder::VariantBuilder; + use parquet_variant::VariantBuilder; // 
Create a simple object with various field types let mut builder = VariantBuilder::new(); @@ -884,7 +889,7 @@ mod tests { #[test] fn test_empty_object_to_json() -> Result<(), ArrowError> { - use crate::builder::VariantBuilder; + use parquet_variant::VariantBuilder; let mut builder = VariantBuilder::new(); @@ -906,7 +911,7 @@ mod tests { #[test] fn test_object_with_special_characters_to_json() -> Result<(), ArrowError> { - use crate::builder::VariantBuilder; + use parquet_variant::VariantBuilder; let mut builder = VariantBuilder::new(); @@ -936,7 +941,7 @@ mod tests { #[test] fn test_simple_list_to_json() -> Result<(), ArrowError> { - use crate::builder::VariantBuilder; + use parquet_variant::VariantBuilder; let mut builder = VariantBuilder::new(); @@ -966,7 +971,7 @@ mod tests { #[test] fn test_empty_list_to_json() -> Result<(), ArrowError> { - use crate::builder::VariantBuilder; + use parquet_variant::VariantBuilder; let mut builder = VariantBuilder::new(); @@ -988,7 +993,7 @@ mod tests { #[test] fn test_mixed_type_list_to_json() -> Result<(), ArrowError> { - use crate::builder::VariantBuilder; + use parquet_variant::VariantBuilder; let mut builder = VariantBuilder::new(); @@ -1020,7 +1025,7 @@ mod tests { #[test] fn test_object_field_ordering_in_json() -> Result<(), ArrowError> { - use crate::builder::VariantBuilder; + use parquet_variant::VariantBuilder; let mut builder = VariantBuilder::new(); @@ -1050,7 +1055,7 @@ mod tests { #[test] fn test_list_with_various_primitive_types_to_json() -> Result<(), ArrowError> { - use crate::builder::VariantBuilder; + use parquet_variant::VariantBuilder; let mut builder = VariantBuilder::new(); @@ -1086,7 +1091,7 @@ mod tests { #[test] fn test_object_with_various_primitive_types_to_json() -> Result<(), ArrowError> { - use crate::builder::VariantBuilder; + use parquet_variant::VariantBuilder; let mut builder = VariantBuilder::new(); diff --git a/parquet-variant/Cargo.toml b/parquet-variant/Cargo.toml index 
708b614cf4b7..3edfbb76ed32 100644 --- a/parquet-variant/Cargo.toml +++ b/parquet-variant/Cargo.toml @@ -29,14 +29,12 @@ keywords = ["arrow", "parquet", "variant"] readme = "README.md" edition = { workspace = true } # needs a newer version than workspace due to -# rror: `Option::::unwrap` is not yet stable as a const fn +# Error: `Option::::unwrap` is not yet stable as a const fn rust-version = "1.83" [dependencies] arrow-schema = { workspace = true } chrono = { workspace = true } -serde_json = "1.0" -base64 = "0.22" indexmap = "2.10.0" diff --git a/parquet-variant/examples/variant_from_json_examples.rs b/parquet-variant/examples/variant_from_json_examples.rs deleted file mode 100644 index e8a8a9d24959..000000000000 --- a/parquet-variant/examples/variant_from_json_examples.rs +++ /dev/null @@ -1,50 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! 
Example showing how to convert Variant values to JSON - -use parquet_variant::{ - json_to_variant, variant_to_json, variant_to_json_string, variant_to_json_value, VariantBuilder, -}; - -fn main() -> Result<(), Box<dyn std::error::Error>> { - let person_string = "{\"name\":\"Alice\", \"age\":30, ".to_string() - + "\"email\":\"alice@example.com\", \"is_active\": true, \"score\": 95.7," - + "\"additional_info\": null}"; - - let mut variant_builder = VariantBuilder::new(); - json_to_variant(&person_string, &mut variant_builder)?; - - let (metadata, value) = variant_builder.finish(); - - let variant = parquet_variant::Variant::try_new(&metadata, &value)?; - - let json_result = variant_to_json_string(&variant)?; - let json_value = variant_to_json_value(&variant)?; - let pretty_json = serde_json::to_string_pretty(&json_value)?; - println!("{pretty_json}"); - - let mut buffer = Vec::new(); - variant_to_json(&mut buffer, &variant)?; - let buffer_result = String::from_utf8(buffer)?; - assert_eq!(json_result, "{\"additional_info\":null,\"age\":30,".to_string() + - "\"email\":\"alice@example.com\",\"is_active\":true,\"name\":\"Alice\",\"score\":95.7}"); - assert_eq!(json_result, buffer_result); - assert_eq!(json_result, serde_json::to_string(&json_value)?); - - Ok(()) -} diff --git a/parquet-variant/src/builder.rs b/parquet-variant/src/builder.rs index e224ec0e4d99..542065045c92 100644 --- a/parquet-variant/src/builder.rs +++ b/parquet-variant/src/builder.rs @@ -948,9 +948,11 @@ impl Drop for ObjectBuilder<'_> { fn drop(&mut self) {} } -/// Trait that abstracts functionality from Variant construction implementations, such as -/// [`VariantBuilder`] and [`ListBuilder`], to minimize code duplication. -pub(crate) trait VariantBuilderExt<'m, 'v> { +/// Extends [`VariantBuilder`] to help build nested [`Variant`]s +/// +/// Allows users to append values to a [`VariantBuilder`], [`ListBuilder`] or +/// [`ObjectBuilder`] using the same interface.
+pub trait VariantBuilderExt<'m, 'v> { fn append_value(&mut self, value: impl Into<Variant<'m, 'v>>); fn new_list(&mut self) -> ListBuilder; diff --git a/parquet-variant/src/from_json.rs b/parquet-variant/src/from_json.rs deleted file mode 100644 index c4adbd1377a8..000000000000 --- a/parquet-variant/src/from_json.rs +++ /dev/null @@ -1,151 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Module for parsing JSON strings as Variant - -use crate::{ListBuilder, ObjectBuilder, Variant, VariantBuilder, VariantBuilderExt}; -use arrow_schema::ArrowError; -use serde_json::{Number, Value}; - -/// Converts a JSON string to Variant using [`VariantBuilder`]. The resulting `value` and `metadata` -/// buffers can be extracted using `builder.finish()` -/// -/// # Arguments -/// * `json` - The JSON string to parse as Variant.
-/// * `variant_builder` - Object of type `VariantBuilder` used to build the vatiant from the JSON -/// string -/// -/// # Returns -/// -/// * `Ok(())` if successful -/// * `Err` with error details if the conversion fails -/// -/// ```rust -/// # use parquet_variant::{ -/// json_to_variant, variant_to_json, variant_to_json_string, variant_to_json_value, VariantBuilder -/// }; -/// -/// let mut variant_builder = VariantBuilder::new(); -/// let person_string = "{\"name\":\"Alice\", \"age\":30, ".to_string() -/// + "\"email\":\"alice@example.com\", \"is_active\": true, \"score\": 95.7," -/// + "\"additional_info\": null}"; -/// json_to_variant(&person_string, &mut variant_builder)?; -/// -/// let (metadata, value) = variant_builder.finish(); -/// -/// let variant = parquet_variant::Variant::try_new(&metadata, &value)?; -/// -/// let json_result = variant_to_json_string(&variant)?; -/// let json_value = variant_to_json_value(&variant)?; -/// -/// let mut buffer = Vec::new(); -/// variant_to_json(&mut buffer, &variant)?; -/// let buffer_result = String::from_utf8(buffer)?; -/// assert_eq!(json_result, "{\"additional_info\":null,\"age\":30,".to_string() + -/// "\"email\":\"alice@example.com\",\"is_active\":true,\"name\":\"Alice\",\"score\":95.7}"); -/// assert_eq!(json_result, buffer_result); -/// assert_eq!(json_result, serde_json::to_string(&json_value)?); -/// # Ok::<(), Box>(()) -/// ``` -pub fn json_to_variant(json: &str, builder: &mut VariantBuilder) -> Result<(), ArrowError> { - let json: Value = serde_json::from_str(json) - .map_err(|e| ArrowError::InvalidArgumentError(format!("JSON format error: {e}")))?; - - build_json(&json, builder)?; - Ok(()) -} - -fn build_json(json: &Value, builder: &mut VariantBuilder) -> Result<(), ArrowError> { - append_json(json, builder)?; - Ok(()) -} - -fn variant_from_number<'m, 'v>(n: &Number) -> Result, ArrowError> { - if let Some(i) = n.as_i64() { - // Find minimum Integer width to fit - if i as i8 as i64 == i { - Ok((i as 
i8).into()) - } else if i as i16 as i64 == i { - Ok((i as i16).into()) - } else if i as i32 as i64 == i { - Ok((i as i32).into()) - } else { - Ok(i.into()) - } - } else { - // Todo: Try decimal once we implement custom JSON parsing where we have access to strings - // Try double - currently json_to_variant does not produce decimal - match n.as_f64() { - Some(f) => return Ok(f.into()), - None => Err(ArrowError::InvalidArgumentError(format!( - "Failed to parse {n} as number", - ))), - }? - } -} - -fn append_json<'m, 'v>( - json: &'v Value, - builder: &mut impl VariantBuilderExt<'m, 'v>, -) -> Result<(), ArrowError> { - match json { - Value::Null => builder.append_value(Variant::Null), - Value::Bool(b) => builder.append_value(*b), - Value::Number(n) => { - builder.append_value(variant_from_number(n)?); - } - Value::String(s) => builder.append_value(s.as_str()), - Value::Array(arr) => { - let mut list_builder = builder.new_list(); - for val in arr { - append_json(val, &mut list_builder)?; - } - list_builder.finish(); - } - Value::Object(obj) => { - let mut obj_builder = builder.new_object(); - for (key, value) in obj.iter() { - let mut field_builder = ObjectFieldBuilder { - key, - builder: &mut obj_builder, - }; - append_json(value, &mut field_builder)?; - } - obj_builder.finish()?; - } - }; - Ok(()) -} - -struct ObjectFieldBuilder<'o, 'v, 's> { - key: &'s str, - builder: &'o mut ObjectBuilder<'v>, -} - -impl<'m, 'v> VariantBuilderExt<'m, 'v> for ObjectFieldBuilder<'_, '_, '_> { - fn append_value(&mut self, value: impl Into>) { - self.builder.insert(self.key, value); - } - - fn new_list(&mut self) -> ListBuilder { - self.builder.new_list(self.key) - } - - fn new_object(&mut self) -> ObjectBuilder { - self.builder.new_object(self.key) - } -} diff --git a/parquet-variant/src/lib.rs b/parquet-variant/src/lib.rs index 1dcd70d66ad5..221c4e427ff3 100644 --- a/parquet-variant/src/lib.rs +++ b/parquet-variant/src/lib.rs @@ -29,12 +29,8 @@ mod builder; mod decoder; -mod 
from_json; -mod to_json; mod utils; mod variant; pub use builder::*; -pub use from_json::json_to_variant; -pub use to_json::{variant_to_json, variant_to_json_string, variant_to_json_value}; pub use variant::*; diff --git a/parquet-variant/tests/test_json_to_variant.rs b/parquet-variant/tests/test_json_to_variant.rs deleted file mode 100644 index e4c001d7a382..000000000000 --- a/parquet-variant/tests/test_json_to_variant.rs +++ /dev/null @@ -1,552 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Manually tests if parsing JSON strings to Variants returns the expected results. 
- -use arrow_schema::ArrowError; -use parquet_variant::{ - json_to_variant, variant_to_json_string, ShortString, Variant, VariantBuilder, - VariantDecimal16, VariantDecimal4, VariantDecimal8, -}; - -struct JsonToVariantTest<'a> { - json: &'a str, - expected: Variant<'a, 'a>, -} - -impl JsonToVariantTest<'_> { - fn run(self) -> Result<(), ArrowError> { - let mut variant_builder = VariantBuilder::new(); - json_to_variant(self.json, &mut variant_builder)?; - let (metadata, value) = variant_builder.finish(); - let variant = Variant::try_new(&metadata, &value)?; - assert_eq!(variant, self.expected); - Ok(()) - } -} - -#[test] -fn test_json_to_variant_null() -> Result<(), ArrowError> { - JsonToVariantTest { - json: "null", - expected: Variant::Null, - } - .run() -} - -#[test] -fn test_json_to_variant_boolean_true() -> Result<(), ArrowError> { - JsonToVariantTest { - json: "true", - expected: Variant::BooleanTrue, - } - .run() -} - -#[test] -fn test_json_to_variant_boolean_false() -> Result<(), ArrowError> { - JsonToVariantTest { - json: "false", - expected: Variant::BooleanFalse, - } - .run() -} - -#[test] -fn test_json_to_variant_int8_positive() -> Result<(), ArrowError> { - JsonToVariantTest { - json: " 127 ", - expected: Variant::Int8(127), - } - .run() -} - -#[test] -fn test_json_to_variant_int8_negative() -> Result<(), ArrowError> { - JsonToVariantTest { - json: " -128 ", - expected: Variant::Int8(-128), - } - .run() -} - -#[test] -fn test_json_to_variant_int16() -> Result<(), ArrowError> { - JsonToVariantTest { - json: " 27134 ", - expected: Variant::Int16(27134), - } - .run() -} - -#[test] -fn test_json_to_variant_int32() -> Result<(), ArrowError> { - JsonToVariantTest { - json: " -32767431 ", - expected: Variant::Int32(-32767431), - } - .run() -} - -#[test] -fn test_json_to_variant_int64() -> Result<(), ArrowError> { - JsonToVariantTest { - json: "92842754201389", - expected: Variant::Int64(92842754201389), - } - .run() -} - -#[ignore] -#[test] -fn 
test_json_to_variant_decimal4_basic() -> Result<(), ArrowError> { - JsonToVariantTest { - json: "1.23", - expected: Variant::from(VariantDecimal4::try_new(123, 2)?), - } - .run() -} - -#[ignore] -#[test] -fn test_json_to_variant_decimal4_large_positive() -> Result<(), ArrowError> { - JsonToVariantTest { - json: "99999999.9", - expected: Variant::from(VariantDecimal4::try_new(999999999, 1)?), - } - .run() -} - -#[ignore] -#[test] -fn test_json_to_variant_decimal4_large_negative() -> Result<(), ArrowError> { - JsonToVariantTest { - json: "-99999999.9", - expected: Variant::from(VariantDecimal4::try_new(-999999999, 1)?), - } - .run() -} - -#[ignore] -#[test] -fn test_json_to_variant_decimal4_small_positive() -> Result<(), ArrowError> { - JsonToVariantTest { - json: "0.999999999", - expected: Variant::from(VariantDecimal4::try_new(999999999, 9)?), - } - .run() -} - -#[ignore] -#[test] -fn test_json_to_variant_decimal4_tiny_positive() -> Result<(), ArrowError> { - JsonToVariantTest { - json: "0.000000001", - expected: Variant::from(VariantDecimal4::try_new(1, 9)?), - } - .run() -} - -#[ignore] -#[test] -fn test_json_to_variant_decimal4_small_negative() -> Result<(), ArrowError> { - JsonToVariantTest { - json: "-0.999999999", - expected: Variant::from(VariantDecimal4::try_new(-999999999, 9)?), - } - .run() -} - -#[ignore] -#[test] -fn test_json_to_variant_decimal8_positive() -> Result<(), ArrowError> { - JsonToVariantTest { - json: "999999999.0", - expected: Variant::from(VariantDecimal8::try_new(9999999990, 1)?), - } - .run() -} - -#[ignore] -#[test] -fn test_json_to_variant_decimal8_negative() -> Result<(), ArrowError> { - JsonToVariantTest { - json: "-999999999.0", - expected: Variant::from(VariantDecimal8::try_new(-9999999990, 1)?), - } - .run() -} - -#[ignore] -#[test] -fn test_json_to_variant_decimal8_high_precision() -> Result<(), ArrowError> { - JsonToVariantTest { - json: "0.999999999999999999", - expected: 
Variant::from(VariantDecimal8::try_new(999999999999999999, 18)?), - } - .run() -} - -#[ignore] -#[test] -fn test_json_to_variant_decimal8_large_with_scale() -> Result<(), ArrowError> { - JsonToVariantTest { - json: "9999999999999999.99", - expected: Variant::from(VariantDecimal8::try_new(999999999999999999, 2)?), - } - .run() -} - -#[ignore] -#[test] -fn test_json_to_variant_decimal8_large_negative_with_scale() -> Result<(), ArrowError> { - JsonToVariantTest { - json: "-9999999999999999.99", - expected: Variant::from(VariantDecimal8::try_new(-999999999999999999, 2)?), - } - .run() -} - -#[ignore] -#[test] -fn test_json_to_variant_decimal16_large_integer() -> Result<(), ArrowError> { - JsonToVariantTest { - json: "9999999999999999999", // integer larger than i64 - expected: Variant::from(VariantDecimal16::try_new(9999999999999999999, 0)?), - } - .run() -} - -#[ignore] -#[test] -fn test_json_to_variant_decimal16_high_precision() -> Result<(), ArrowError> { - JsonToVariantTest { - json: "0.9999999999999999999", - expected: Variant::from(VariantDecimal16::try_new(9999999999999999999, 19)?), - } - .run() -} - -#[ignore] -#[test] -fn test_json_to_variant_decimal16_max_value() -> Result<(), ArrowError> { - JsonToVariantTest { - json: "79228162514264337593543950335", // 2 ^ 96 - 1 - expected: Variant::from(VariantDecimal16::try_new(79228162514264337593543950335, 0)?), - } - .run() -} - -#[ignore] -#[test] -fn test_json_to_variant_decimal16_max_scale() -> Result<(), ArrowError> { - JsonToVariantTest { - json: "7.9228162514264337593543950335", // using scale higher than this falls into double - // since the max scale is 28. 
- expected: Variant::from(VariantDecimal16::try_new( - 79228162514264337593543950335, - 28, - )?), - } - .run() -} - -#[test] -fn test_json_to_variant_double_precision() -> Result<(), ArrowError> { - JsonToVariantTest { - json: "0.79228162514264337593543950335", - expected: Variant::Double(0.792_281_625_142_643_4_f64), - } - .run() -} - -#[test] -fn test_json_to_variant_double_scientific_positive() -> Result<(), ArrowError> { - JsonToVariantTest { - json: "15e-1", - expected: Variant::Double(15e-1f64), - } - .run() -} - -#[test] -fn test_json_to_variant_double_scientific_negative() -> Result<(), ArrowError> { - JsonToVariantTest { - json: "-15e-1", - expected: Variant::Double(-15e-1f64), - } - .run() -} - -#[test] -fn test_json_to_variant_short_string() -> Result<(), ArrowError> { - JsonToVariantTest { - json: "\"harsh\"", - expected: Variant::ShortString(ShortString::try_new("harsh")?), - } - .run() -} - -#[test] -fn test_json_to_variant_short_string_max_length() -> Result<(), ArrowError> { - JsonToVariantTest { - json: &format!("\"{}\"", "a".repeat(63)), - expected: Variant::ShortString(ShortString::try_new(&"a".repeat(63))?), - } - .run() -} - -#[test] -fn test_json_to_variant_long_string() -> Result<(), ArrowError> { - JsonToVariantTest { - json: &format!("\"{}\"", "a".repeat(64)), - expected: Variant::String(&"a".repeat(64)), - } - .run() -} - -#[test] -fn test_json_to_variant_very_long_string() -> Result<(), ArrowError> { - JsonToVariantTest { - json: &format!("\"{}\"", "b".repeat(100000)), - expected: Variant::String(&"b".repeat(100000)), - } - .run() -} - -#[test] -fn test_json_to_variant_array_simple() -> Result<(), ArrowError> { - let mut variant_builder = VariantBuilder::new(); - let mut list_builder = variant_builder.new_list(); - list_builder.append_value(Variant::Int8(127)); - list_builder.append_value(Variant::Int16(128)); - list_builder.append_value(Variant::Int32(-32767431)); - list_builder.finish(); - let (metadata, value) = 
variant_builder.finish(); - let variant = Variant::try_new(&metadata, &value)?; - - JsonToVariantTest { - json: "[127, 128, -32767431]", - expected: variant, - } - .run() -} - -#[test] -fn test_json_to_variant_array_with_object() -> Result<(), ArrowError> { - let mut variant_builder = VariantBuilder::new(); - let mut list_builder = variant_builder.new_list(); - let mut object_builder_inner = list_builder.new_object(); - object_builder_inner.insert("age", Variant::Int8(32)); - object_builder_inner.finish().unwrap(); - list_builder.append_value(Variant::Int16(128)); - list_builder.append_value(Variant::BooleanFalse); - list_builder.finish(); - let (metadata, value) = variant_builder.finish(); - let variant = Variant::try_new(&metadata, &value)?; - - JsonToVariantTest { - json: "[{\"age\": 32}, 128, false]", - expected: variant, - } - .run() -} - -#[test] -fn test_json_to_variant_array_large_u16_offset() -> Result<(), ArrowError> { - // u16 offset - 128 i8's + 1 "true" = 257 bytes - let mut variant_builder = VariantBuilder::new(); - let mut list_builder = variant_builder.new_list(); - for _ in 0..128 { - list_builder.append_value(Variant::Int8(1)); - } - list_builder.append_value(Variant::BooleanTrue); - list_builder.finish(); - let (metadata, value) = variant_builder.finish(); - let variant = Variant::try_new(&metadata, &value)?; - - JsonToVariantTest { - json: &format!("[{} true]", "1, ".repeat(128)), - expected: variant, - } - .run() -} - -#[test] -fn test_json_to_variant_array_nested_large() -> Result<(), ArrowError> { - // verify u24, and large_size - let mut variant_builder = VariantBuilder::new(); - let mut list_builder = variant_builder.new_list(); - for _ in 0..256 { - let mut list_builder_inner = list_builder.new_list(); - for _ in 0..255 { - list_builder_inner.append_value(Variant::Null); - } - list_builder_inner.finish(); - } - list_builder.finish(); - let (metadata, value) = variant_builder.finish(); - let variant = Variant::try_new(&metadata, &value)?; - 
let intermediate = format!("[{}]", vec!["null"; 255].join(", ")); - let json = format!("[{}]", vec![intermediate; 256].join(", ")); - JsonToVariantTest { - json: json.as_str(), - expected: variant, - } - .run() -} - -#[test] -fn test_json_to_variant_object_simple() -> Result<(), ArrowError> { - let mut variant_builder = VariantBuilder::new(); - let mut object_builder = variant_builder.new_object(); - object_builder.insert("a", Variant::Int8(3)); - object_builder.insert("b", Variant::Int8(2)); - object_builder.finish().unwrap(); - let (metadata, value) = variant_builder.finish(); - let variant = Variant::try_new(&metadata, &value)?; - JsonToVariantTest { - json: "{\"b\": 2, \"a\": 1, \"a\": 3}", - expected: variant, - } - .run() -} - -#[test] -fn test_json_to_variant_object_complex() -> Result<(), ArrowError> { - let mut variant_builder = VariantBuilder::new(); - let mut object_builder = variant_builder.new_object(); - let mut inner_list_builder = object_builder.new_list("booleans"); - inner_list_builder.append_value(Variant::BooleanTrue); - inner_list_builder.append_value(Variant::BooleanFalse); - inner_list_builder.finish(); - object_builder.insert("null", Variant::Null); - let mut inner_list_builder = object_builder.new_list("numbers"); - inner_list_builder.append_value(Variant::Int8(4)); - inner_list_builder.append_value(Variant::Double(-3e0)); - inner_list_builder.append_value(Variant::Double(1001e-3)); - inner_list_builder.finish(); - object_builder.finish().unwrap(); - let (metadata, value) = variant_builder.finish(); - let variant = Variant::try_new(&metadata, &value)?; - JsonToVariantTest { - json: "{\"numbers\": [4, -3e0, 1001e-3], \"null\": null, \"booleans\": [true, false]}", - expected: variant, - } - .run() -} - -#[test] -fn test_json_to_variant_object_very_large() -> Result<(), ArrowError> { - // 256 elements (keys: 000-255) - each element is an object of 256 elements (240-495) - each - // element a list of numbers from 0-127 - let keys: Vec = 
(0..=255).map(|n| format!("{n:03}")).collect(); - let innermost_list: String = format!( - "[{}]", - (0..=127) - .map(|n| format!("{n}")) - .collect::>() - .join(",") - ); - let inner_keys: Vec = (240..=495).map(|n| format!("{n}")).collect(); - let inner_object = format!( - "{{{}:{}}}", - inner_keys - .iter() - .map(|k| format!("\"{k}\"")) - .collect::>() - .join(format!(":{innermost_list},").as_str()), - innermost_list - ); - let json = format!( - "{{{}:{}}}", - keys.iter() - .map(|k| format!("\"{k}\"")) - .collect::>() - .join(format!(":{inner_object},").as_str()), - inner_object - ); - // Manually verify raw JSON value size - let mut variant_builder = VariantBuilder::new(); - json_to_variant(&json, &mut variant_builder)?; - let (metadata, value) = variant_builder.finish(); - let v = parquet_variant::Variant::try_new(&metadata, &value)?; - let output_string = variant_to_json_string(&v)?; - assert_eq!(output_string, json); - // Verify metadata size = 1 + 2 + 2 * 497 + 3 * 496 - assert_eq!(metadata.len(), 2485); - // Verify value size. 
- // Size of innermost_list: 1 + 1 + 258 + 256 = 516 - // Size of inner object: 1 + 4 + 256 + 257 * 3 + 256 * 516 = 133128 - // Size of json: 1 + 4 + 512 + 1028 + 256 * 133128 = 34082313 - assert_eq!(value.len(), 34082313); - - let mut variant_builder = VariantBuilder::new(); - let mut object_builder = variant_builder.new_object(); - keys.iter().for_each(|key| { - let mut inner_object_builder = object_builder.new_object(key); - inner_keys.iter().for_each(|inner_key| { - let mut list_builder = inner_object_builder.new_list(inner_key); - for i in 0..=127 { - list_builder.append_value(Variant::Int8(i)); - } - list_builder.finish(); - }); - inner_object_builder.finish().unwrap(); - }); - object_builder.finish().unwrap(); - let (metadata, value) = variant_builder.finish(); - let variant = Variant::try_new(&metadata, &value)?; - - JsonToVariantTest { - json: &json, - expected: variant, - } - .run() -} - -#[test] -fn test_json_to_variant_unicode() -> Result<(), ArrowError> { - let json = "{\"爱\":\"अ\",\"a\":1}"; - let mut variant_builder = VariantBuilder::new(); - json_to_variant(json, &mut variant_builder)?; - let (metadata, value) = variant_builder.finish(); - let v = parquet_variant::Variant::try_new(&metadata, &value)?; - let output_string = variant_to_json_string(&v)?; - assert_eq!(output_string, "{\"a\":1,\"爱\":\"अ\"}"); - let mut variant_builder = VariantBuilder::new(); - let mut object_builder = variant_builder.new_object(); - object_builder.insert("a", Variant::Int8(1)); - object_builder.insert("爱", Variant::ShortString(ShortString::try_new("अ")?)); - object_builder.finish().unwrap(); - let (metadata, value) = variant_builder.finish(); - let variant = Variant::try_new(&metadata, &value)?; - - assert_eq!( - value, - &[2u8, 2u8, 0u8, 1u8, 0u8, 2u8, 6u8, 12u8, 1u8, 13u8, 0xe0u8, 0xa4u8, 0x85u8] - ); - assert_eq!( - metadata, - &[0b10001u8, 2u8, 0u8, 1u8, 4u8, 97u8, 0xe7u8, 0x88u8, 0xb1u8] - ); - JsonToVariantTest { - json, - expected: variant, - } - .run() -}