diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index f9026735db1a..58b4060faf05 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -2017,4 +2017,499 @@ mod test { // Row 8: Large Int64 should fail conversion -> NULL assert!(int32_result_2.is_null(8)); // 9223372036854775807 (too large for Int32) } + + #[test] + fn test_struct_extraction_subset_superset_schema_perfectly_shredded() { + // Create variant with diverse null patterns and empty objects + let variant_array = create_comprehensive_shredded_variant(); + + // Request struct with fields "a", "b", "d" (skip existing "c", add missing "d") + let struct_fields = Fields::from(vec![ + Field::new("a", DataType::Int32, true), + Field::new("b", DataType::Int32, true), + Field::new("d", DataType::Int32, true), + ]); + let struct_type = DataType::Struct(struct_fields); + + let options = GetOptions { + path: VariantPath::default(), + as_type: Some(Arc::new(Field::new("result", struct_type, true))), + cast_options: CastOptions::default(), + }; + + let result = variant_get(&variant_array, options).unwrap(); + + // Verify the result is a StructArray with 3 fields and 5 rows + let struct_result = result.as_any().downcast_ref::().unwrap(); + assert_eq!(struct_result.len(), 5); + assert_eq!(struct_result.num_columns(), 3); + + let field_a = struct_result + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + let field_b = struct_result + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + let field_d = struct_result + .column(2) + .as_any() + .downcast_ref::() + .unwrap(); + + // Row 0: Normal values {"a": 1, "b": 2, "c": 3} → {a: 1, b: 2, d: NULL} + assert!(!struct_result.is_null(0)); + assert_eq!(field_a.value(0), 1); + assert_eq!(field_b.value(0), 2); + assert!(field_d.is_null(0)); // Missing field "d" + + // Row 1: Top-level NULL → struct-level NULL + assert!(struct_result.is_null(1)); + + // Row 2: Field "a" missing → {a: NULL, b: 2, d: NULL} + assert!(!struct_result.is_null(2)); + assert!(field_a.is_null(2)); // Missing field "a" + assert_eq!(field_b.value(2), 2); + assert!(field_d.is_null(2)); // Missing field "d" + + // Row 3: Field "b" missing → {a: 1, b: NULL, d: NULL} + assert!(!struct_result.is_null(3)); + assert_eq!(field_a.value(3), 1); + assert!(field_b.is_null(3)); // Missing field "b" + assert!(field_d.is_null(3)); // Missing field "d" + + // Row 4: Empty object {} → {a: NULL, b: NULL, d: NULL} + assert!(!struct_result.is_null(4)); + assert!(field_a.is_null(4)); // Empty object + assert!(field_b.is_null(4)); // Empty object + assert!(field_d.is_null(4)); // Missing field "d" + } + + #[test] + fn test_nested_struct_extraction_perfectly_shredded() { + // Create nested variant with diverse null patterns + let variant_array = create_comprehensive_nested_shredded_variant(); + println!("variant_array: {variant_array:?}"); + + // Request 3-level nested struct type {"outer": {"inner": INT}} + let inner_field = Field::new("inner", DataType::Int32, true); + let inner_type = DataType::Struct(Fields::from(vec![inner_field])); + let outer_field = Field::new("outer", inner_type, true); + let result_type = DataType::Struct(Fields::from(vec![outer_field])); + + let options = GetOptions { + path: VariantPath::default(), + as_type: Some(Arc::new(Field::new("result", result_type, true))), + cast_options: CastOptions::default(), + }; + + let result = variant_get(&variant_array, options).unwrap(); + println!("result: {result:?}"); + + // Verify the result is a StructArray with "outer" field and 4 rows + let outer_struct = result.as_any().downcast_ref::().unwrap(); + assert_eq!(outer_struct.len(), 4); + assert_eq!(outer_struct.num_columns(), 1); + + // Get the "inner" struct column + let inner_struct = outer_struct + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(inner_struct.num_columns(), 1); + + // Get the "leaf" field (Int32 values) + let leaf_field = inner_struct + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + + // Row 0: Normal nested {"outer": {"inner": {"leaf": 42}}} + assert!(!outer_struct.is_null(0)); + assert!(!inner_struct.is_null(0)); + assert_eq!(leaf_field.value(0), 42); + + // Row 1: "inner" field missing → {outer: {inner: NULL}} + assert!(!outer_struct.is_null(1)); + assert!(!inner_struct.is_null(1)); // outer exists, inner exists but leaf is NULL + assert!(leaf_field.is_null(1)); // leaf field is NULL + + // Row 2: "outer" field missing → {outer: NULL} + assert!(!outer_struct.is_null(2)); + assert!(inner_struct.is_null(2)); // outer field is NULL + + // Row 3: Top-level NULL → struct-level NULL + assert!(outer_struct.is_null(3)); + } + + #[test] + fn test_path_based_null_masks_one_step() { + // Create nested variant with diverse null patterns + let variant_array = create_comprehensive_nested_shredded_variant(); + + // Extract "outer" field using path-based variant_get + let path = VariantPath::from("outer"); + let inner_field = Field::new("inner", DataType::Int32, true); + let result_type = DataType::Struct(Fields::from(vec![inner_field])); + + let options = GetOptions { + path, + as_type: Some(Arc::new(Field::new("result", result_type, true))), + cast_options: CastOptions::default(), + }; + + let result = variant_get(&variant_array, options).unwrap(); + + // Verify the result is a StructArray with "inner" field and 4 rows + let outer_result = result.as_any().downcast_ref::().unwrap(); + assert_eq!(outer_result.len(), 4); + assert_eq!(outer_result.num_columns(), 1); + + // Get the "inner" field (Int32 values) + let inner_field = outer_result + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + + // Row 0: Normal nested {"outer": {"inner": 42}} → {"inner": 42} + assert!(!outer_result.is_null(0)); + assert_eq!(inner_field.value(0), 42); + + // Row 1: Inner field null {"outer": {"inner": null}} → {"inner": null} + assert!(!outer_result.is_null(1)); + assert!(inner_field.is_null(1)); + + // Row 2: Outer field null {"outer": null} → null (entire struct is null) + assert!(outer_result.is_null(2)); + + // Row 3: Top-level null → null (entire struct is null) + assert!(outer_result.is_null(3)); + } + + #[test] + fn test_path_based_null_masks_two_steps() { + // Create nested variant with diverse null patterns + let variant_array = create_comprehensive_nested_shredded_variant(); + + // Extract "outer.inner" field using path-based variant_get + let path = VariantPath::from("outer").join("inner"); + + let options = GetOptions { + path, + as_type: Some(Arc::new(Field::new("result", DataType::Int32, true))), + cast_options: CastOptions::default(), + }; + + let result = variant_get(&variant_array, options).unwrap(); + + // Verify the result is an Int32Array with 4 rows + let int_result = result.as_any().downcast_ref::().unwrap(); + assert_eq!(int_result.len(), 4); + + // Row 0: Normal nested {"outer": {"inner": 42}} → 42 + assert!(!int_result.is_null(0)); + assert_eq!(int_result.value(0), 42); + + // Row 1: Inner field null {"outer": {"inner": null}} → null + assert!(int_result.is_null(1)); + + // Row 2: Outer field null {"outer": null} → null (path traversal fails) + assert!(int_result.is_null(2)); + + // Row 3: Top-level null → null (path traversal fails) + assert!(int_result.is_null(3)); + } + + #[test] + fn test_struct_extraction_mixed_and_unshredded() { + // Create a partially shredded variant (x shredded, y not) + let variant_array = create_mixed_and_unshredded_variant(); + + // Request struct with both shredded and unshredded fields + let struct_fields = Fields::from(vec![ + Field::new("x", DataType::Int32, true), + Field::new("y", DataType::Int32, true), + ]); + let struct_type = DataType::Struct(struct_fields); + + let options = GetOptions { + path: VariantPath::default(), + as_type: Some(Arc::new(Field::new("result", struct_type, true))), + cast_options: CastOptions::default(), + }; + + let result = variant_get(&variant_array, options).unwrap(); + + // Verify the mixed shredding works (should succeed with current implementation) + let struct_result = result.as_any().downcast_ref::().unwrap(); + assert_eq!(struct_result.len(), 4); + assert_eq!(struct_result.num_columns(), 2); + + let field_x = struct_result + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + let field_y = struct_result + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + + // Row 0: {"x": 1, "y": 42} - x from shredded, y from value field + assert_eq!(field_x.value(0), 1); + assert_eq!(field_y.value(0), 42); + + // Row 1: {"x": 2} - x from shredded, y missing (perfect shredding) + assert_eq!(field_x.value(1), 2); + assert!(field_y.is_null(1)); + + // Row 2: {"x": 3, "y": null} - x from shredded, y explicitly null in value + assert_eq!(field_x.value(2), 3); + assert!(field_y.is_null(2)); + + // Row 3: top-level null - entire struct row should be null + assert!(struct_result.is_null(3)); + } + + /// Test that demonstrates the actual struct row builder gap + /// This test should fail because it hits unshredded nested structs + #[test] + fn test_struct_row_builder_gap_demonstration() { + // Create completely unshredded JSON variant (no typed_value at all) + let json_strings = vec![ + r#"{"outer": {"inner": 42}}"#, + r#"{"outer": {"inner": 100}}"#, + ]; + let string_array: Arc = Arc::new(StringArray::from(json_strings)); + let variant_array = json_to_variant(&string_array).unwrap(); + + // Request nested struct - this should fail at the row builder level + let inner_fields = Fields::from(vec![Field::new("inner", DataType::Int32, true)]); + let inner_struct_type = DataType::Struct(inner_fields); + let outer_fields = Fields::from(vec![Field::new("outer", inner_struct_type, true)]); + let outer_struct_type = DataType::Struct(outer_fields); + + let options = GetOptions { + path: VariantPath::default(), + as_type: Some(Arc::new(Field::new("result", outer_struct_type, true))), + cast_options: CastOptions::default(), + }; + + let variant_array_ref: Arc = Arc::new(variant_array); + let result = variant_get(&variant_array_ref, options); + + // Should fail with NotYetImplemented when the row builder tries to handle struct type + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error.to_string().contains("not yet implemented")); + } + + /// Create comprehensive shredded variant with diverse null patterns and empty objects + /// Rows: normal values, top-level null, missing field a, missing field b, empty object + fn create_comprehensive_shredded_variant() -> ArrayRef { + let (metadata, _) = { + let mut builder = parquet_variant::VariantBuilder::new(); + let obj = builder.new_object(); + obj.finish(); + builder.finish() + }; + + // Create null buffer for top-level nulls + let nulls = NullBuffer::from(vec![ + true, // row 0: normal values + false, // row 1: top-level null + true, // row 2: missing field a + true, // row 3: missing field b + true, // row 4: empty object + ]); + + let metadata_array = BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 5)); + + // Create shredded fields with different null patterns + // Field "a": present in rows 0,3 (missing in rows 1,2,4) + let a_field_typed_value = Int32Array::from(vec![Some(1), None, None, Some(1), None]); + let a_field_struct = StructArrayBuilder::new() + .with_field("typed_value", Arc::new(a_field_typed_value), true) + .build(); + let a_field_shredded = ShreddedVariantFieldArray::try_new(Arc::new(a_field_struct)) + .expect("should create ShreddedVariantFieldArray for a"); + + // Field "b": present in rows 0,2 (missing in rows 1,3,4) + let b_field_typed_value = Int32Array::from(vec![Some(2), None, Some(2), None, None]); + let b_field_struct = StructArrayBuilder::new() + .with_field("typed_value", Arc::new(b_field_typed_value), true) + .build(); + let b_field_shredded = ShreddedVariantFieldArray::try_new(Arc::new(b_field_struct)) + .expect("should create ShreddedVariantFieldArray for b"); + + // Field "c": present in row 0 only (missing in all other rows) + let c_field_typed_value = Int32Array::from(vec![Some(3), None, None, None, None]); + let c_field_struct = StructArrayBuilder::new() + .with_field("typed_value", Arc::new(c_field_typed_value), true) + .build(); + let c_field_shredded = ShreddedVariantFieldArray::try_new(Arc::new(c_field_struct)) + .expect("should create ShreddedVariantFieldArray for c"); + + // Create main typed_value struct + let typed_value_fields = Fields::from(vec![ + Field::new("a", a_field_shredded.data_type().clone(), true), + Field::new("b", b_field_shredded.data_type().clone(), true), + Field::new("c", c_field_shredded.data_type().clone(), true), + ]); + let typed_value_struct = StructArray::try_new( + typed_value_fields, + vec![ + Arc::new(a_field_shredded), + Arc::new(b_field_shredded), + Arc::new(c_field_shredded), + ], + None, + ) + .unwrap(); + + // Build final VariantArray with top-level nulls + let struct_array = StructArrayBuilder::new() + .with_field("metadata", Arc::new(metadata_array), false) + .with_field("typed_value", Arc::new(typed_value_struct), true) + .with_nulls(nulls) + .build(); + + Arc::new(VariantArray::try_new(Arc::new(struct_array)).expect("should create VariantArray")) + } + + /// Create comprehensive nested shredded variant with diverse null patterns + /// Represents 3-level structure: variant -> outer -> inner (INT value) + /// The shredding schema is: {"metadata": BINARY, "typed_value": {"outer": {"typed_value": {"inner": {"typed_value": INT}}}}} + /// Rows: normal nested value, inner field null, outer field null, top-level null + fn create_comprehensive_nested_shredded_variant() -> ArrayRef { + // Create the inner level: contains typed_value with Int32 values + // Row 0: has value 42, Row 1: inner null, Row 2: outer null, Row 3: top-level null + let inner_typed_value = Int32Array::from(vec![Some(42), None, None, None]); // dummy value for row 2 + let inner = StructArrayBuilder::new() + .with_field("typed_value", Arc::new(inner_typed_value), true) + .build(); + let inner = ShreddedVariantFieldArray::try_new(Arc::new(inner)).unwrap(); + + let outer_typed_value_nulls = NullBuffer::from(vec![ + true, // row 0: inner struct exists with typed_value=42 + false, // row 1: inner field NULL + false, // row 2: outer field NULL + false, // row 3: top-level NULL + ]); + let outer_typed_value = StructArrayBuilder::new() + .with_field("inner", Arc::new(inner), false) + .with_nulls(outer_typed_value_nulls) + .build(); + + let outer = StructArrayBuilder::new() + .with_field("typed_value", Arc::new(outer_typed_value), true) + .build(); + let outer = ShreddedVariantFieldArray::try_new(Arc::new(outer)).unwrap(); + + let typed_value_nulls = NullBuffer::from(vec![ + true, // row 0: inner struct exists with typed_value=42 + true, // row 1: inner field NULL + false, // row 2: outer field NULL + false, // row 3: top-level NULL + ]); + let typed_value = StructArrayBuilder::new() + .with_field("outer", Arc::new(outer), false) + .with_nulls(typed_value_nulls) + .build(); + + // Build final VariantArray with top-level nulls + let (metadata, _) = parquet_variant::VariantBuilder::new().finish(); + let metadata_array = BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 4)); + let nulls = NullBuffer::from(vec![ + true, // row 0: inner struct exists with typed_value=42 + true, // row 1: inner field NULL + true, // row 2: outer field NULL + false, // row 3: top-level NULL + ]); + let struct_array = StructArrayBuilder::new() + .with_field("metadata", Arc::new(metadata_array), false) + .with_field("typed_value", Arc::new(typed_value), true) + .with_nulls(nulls) + .build(); + + Arc::new(VariantArray::try_new(Arc::new(struct_array)).expect("should create VariantArray")) + } + + /// Create variant with mixed shredding (spec-compliant) including null scenarios + /// Field "x" is globally shredded, field "y" is never shredded + fn create_mixed_and_unshredded_variant() -> ArrayRef { + // Create spec-compliant mixed shredding: + // - Field "x" is globally shredded (has typed_value column) + // - Field "y" is never shredded (only appears in value field when present) + + let (metadata, y_field_value) = { + let mut builder = parquet_variant::VariantBuilder::new(); + let mut obj = builder.new_object(); + obj.insert("y", Variant::from(42)); + obj.finish(); + builder.finish() + }; + + let metadata_array = BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 4)); + + // Value field contains objects with unshredded fields only (never contains "x") + // Row 0: {"y": "foo"} - x is shredded out, y remains in value + // Row 1: {} - both x and y are absent (perfect shredding for x, y missing) + // Row 2: {"y": null} - x is shredded out, y explicitly null + // Row 3: top-level null (encoded in VariantArray's null mask, but fields contain valid data) + + let empty_object_value = { + let mut builder = parquet_variant::VariantBuilder::new(); + builder.new_object().finish(); + let (_, value) = builder.finish(); + value + }; + + let y_null_value = { + let mut builder = parquet_variant::VariantBuilder::new(); + builder.new_object().with_field("y", Variant::Null).finish(); + let (_, value) = builder.finish(); + value + }; + + let value_array = BinaryViewArray::from(vec![ + Some(y_field_value.as_slice()), // Row 0: {"y": 42} + Some(empty_object_value.as_slice()), // Row 1: {} + Some(y_null_value.as_slice()), // Row 2: {"y": null} + Some(empty_object_value.as_slice()), // Row 3: top-level null (but value field contains valid data) + ]); + + // Create shredded field "x" (globally shredded - never appears in value field) + // For top-level null row, the field still needs valid content (not null) + let x_field_typed_value = Int32Array::from(vec![Some(1), Some(2), Some(3), Some(0)]); + let x_field_struct = StructArrayBuilder::new() + .with_field("typed_value", Arc::new(x_field_typed_value), true) + .build(); + let x_field_shredded = ShreddedVariantFieldArray::try_new(Arc::new(x_field_struct)) + .expect("should create ShreddedVariantFieldArray for x"); + + // Create main typed_value struct (only contains shredded fields) + let typed_value_struct = StructArrayBuilder::new() + .with_field("x", Arc::new(x_field_shredded), false) + .build(); + + // Build VariantArray with both value and typed_value (PartiallyShredded) + // Top-level null is encoded in the main StructArray's null mask + let variant_nulls = NullBuffer::from(vec![true, true, true, false]); // Row 3 is top-level null + let struct_array = StructArrayBuilder::new() + .with_field("metadata", Arc::new(metadata_array), false) + .with_field("value", Arc::new(value_array), true) + .with_field("typed_value", Arc::new(typed_value_struct), true) + .with_nulls(variant_nulls) + .build(); + + Arc::new(VariantArray::try_new(Arc::new(struct_array)).expect("should create VariantArray")) + } }