Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 73 additions & 8 deletions crates/iceberg/src/arrow/record_batch_transformer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -83,14 +83,10 @@ fn constants_map(
// Handle both None (null) and Some(Literal::Primitive) cases
match &partition_data[pos] {
None => {
// TODO (https://github.com/apache/iceberg-rust/issues/1914): Add support for null datum values.
return Err(Error::new(
ErrorKind::Unexpected,
format!(
"Partition field {} has null value for identity transform",
field.source_id
),
));
// Skip null partition values - they will be resolved as null per Iceberg spec rule #4.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I got it, it's a little tricky. When it's value is null, it will be resolved to null since it's optional.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think there is a corner case where this approach will fail, as spec says identity partition should override values from underlying files, with this fix we will still return values from underlying files rather than null identity partition value.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I suspect the right move is to fix #1914 but I spent about an hour on it last night and decided the blast radius was too big for what I wanted to be a quick fix.

// When a partition value is null, we don't add it to the constants map,
// allowing downstream column resolution to handle it correctly.
continue;
}
Some(Literal::Primitive(value)) => {
// Create a Datum from the primitive type and value
Expand Down Expand Up @@ -1610,4 +1606,73 @@ mod test {
assert_eq!(get_string_value(result.column(4).as_ref(), 0), "");
assert_eq!(get_string_value(result.column(4).as_ref(), 1), "");
}

/// Test handling of null values in identity-partitioned columns.
///
/// Reproduces TestPartitionValues.testNullPartitionValue() from iceberg-java, which
/// writes records where the partition column has null values. Before the fix in #1922,
/// this would error with "Partition field X has null value for identity transform".
#[test]
fn null_identity_partition_value() {
use crate::spec::{Struct, Transform};

let schema = Arc::new(
Schema::builder()
.with_schema_id(0)
.with_fields(vec![
NestedField::optional(1, "id", Type::Primitive(PrimitiveType::Int)).into(),
NestedField::optional(2, "data", Type::Primitive(PrimitiveType::String)).into(),
])
.build()
.unwrap(),
);

let partition_spec = Arc::new(
crate::spec::PartitionSpec::builder(schema.clone())
.with_spec_id(0)
.add_partition_field("data", "data", Transform::Identity)
.unwrap()
.build()
.unwrap(),
);

// Partition has null value for the data column
let partition_data = Struct::from_iter(vec![None]);

let file_schema = Arc::new(ArrowSchema::new(vec![simple_field(
"id",
DataType::Int32,
true,
"1",
)]));

let projected_field_ids = [1, 2];

let mut transformer = RecordBatchTransformerBuilder::new(schema, &projected_field_ids)
.with_partition(partition_spec, partition_data)
.expect("Should handle null partition values")
.build();

let file_batch =
RecordBatch::try_new(file_schema, vec![Arc::new(Int32Array::from(vec![1, 2, 3]))])
.unwrap();

let result = transformer.process_record_batch(file_batch).unwrap();

assert_eq!(result.num_columns(), 2);
assert_eq!(result.num_rows(), 3);

let id_col = result
.column(0)
.as_any()
.downcast_ref::<Int32Array>()
.unwrap();
assert_eq!(id_col.values(), &[1, 2, 3]);

// Partition column with null value should produce nulls
let data_col = result.column(1);
assert!(data_col.is_null(0));
assert!(data_col.is_null(1));
assert!(data_col.is_null(2));
}
}
Loading