diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index adef5397684e..b1178f452548 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -336,6 +336,46 @@ mod variable; /// /// With `[]` represented by an empty byte array, and `null` a null byte array. /// +/// ## Fixed Size List Encoding +/// +/// Fixed Size Lists are encoded by first encoding all child elements to the row format. +/// +/// A non-null list value is then encoded as 0x01 followed by the concatenation of each +/// of the child elements. A null list value is encoded as a null marker. +/// +/// For example given: +/// +/// ```text +/// [1_u8, 2_u8] +/// [3_u8, null] +/// null +/// ``` +/// +/// The elements would be converted to: +/// +/// ```text +/// ┌──┬──┐ ┌──┬──┐ ┌──┬──┐ ┌──┬──┐ +/// 1 │01│01│ 2 │01│02│ 3 │01│03│ null │00│00│ +/// └──┴──┘ └──┴──┘ └──┴──┘ └──┴──┘ +///``` +/// +/// Which would be encoded as +/// +/// ```text +/// ┌──┬──┬──┬──┬──┐ +/// [1_u8, 2_u8] │01│01│01│01│02│ +/// └──┴──┴──┴──┴──┘ +/// └ 1 ┘ └ 2 ┘ +/// ┌──┬──┬──┬──┬──┐ +/// [3_u8, null] │01│01│03│00│00│ +/// └──┴──┴──┴──┴──┘ +/// └ 3 ┘ └null┘ +/// ┌──┐ +/// null │00│ +/// └──┘ +/// +///``` +/// /// # Ordering /// /// ## Float Ordering @@ -702,7 +742,18 @@ impl RowConverter { // SAFETY // We have validated that the rows came from this [`RowConverter`] // and therefore must be valid - unsafe { self.convert_raw(&mut rows, validate_utf8) } + let result = unsafe { self.convert_raw(&mut rows, validate_utf8) }?; + + for (i, row) in rows.iter().enumerate() { + if !row.is_empty() { + return Err(ArrowError::InvalidArgumentError(format!( + "Codecs {codecs:?} did not consume all bytes for row {i}, remaining bytes: {row:?}", + codecs = &self.codecs + ))); + } + } + + Ok(result) } /// Returns an empty [`Rows`] with capacity for `row_capacity` rows with @@ -2039,6 +2090,9 @@ mod tests { builder.values().append_null(); builder.append(true); builder.append(true); + builder.values().append_value(17); // MASKED + 
builder.values().append_null(); // MASKED + builder.append(false); let list = Arc::new(builder.finish()) as ArrayRef; let d = list.data_type().clone(); @@ -2047,11 +2101,12 @@ mod tests { let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap(); assert!(rows.row(0) > rows.row(1)); // [32, 52, 32] > [32, 52, 12] - assert!(rows.row(2) < rows.row(1)); // [32, 42] < [32, 52, 12] - assert!(rows.row(3) < rows.row(2)); // null < [32, 42] - assert!(rows.row(4) < rows.row(2)); // [32, null] < [32, 42] - assert!(rows.row(5) < rows.row(2)); // [] < [32, 42] + assert!(rows.row(2) < rows.row(1)); // [32, 52] < [32, 52, 12] + assert!(rows.row(3) < rows.row(2)); // null < [32, 52] + assert!(rows.row(4) < rows.row(2)); // [32, null] < [32, 52] + assert!(rows.row(5) < rows.row(2)); // [] < [32, 52] assert!(rows.row(3) < rows.row(5)); // null < [] + assert_eq!(rows.row(3), rows.row(6)); // null = null (different masked values) let back = converter.convert_rows(&rows).unwrap(); assert_eq!(back.len(), 1); @@ -2064,11 +2119,12 @@ mod tests { let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap(); assert!(rows.row(0) > rows.row(1)); // [32, 52, 32] > [32, 52, 12] - assert!(rows.row(2) < rows.row(1)); // [32, 42] < [32, 52, 12] - assert!(rows.row(3) > rows.row(2)); // null > [32, 42] - assert!(rows.row(4) > rows.row(2)); // [32, null] > [32, 42] - assert!(rows.row(5) < rows.row(2)); // [] < [32, 42] + assert!(rows.row(2) < rows.row(1)); // [32, 52] < [32, 52, 12] + assert!(rows.row(3) > rows.row(2)); // null > [32, 52] + assert!(rows.row(4) > rows.row(2)); // [32, null] > [32, 52] + assert!(rows.row(5) < rows.row(2)); // [] < [32, 52] assert!(rows.row(3) > rows.row(5)); // null > [] + assert_eq!(rows.row(3), rows.row(6)); // null = null (different masked values) let back = converter.convert_rows(&rows).unwrap(); assert_eq!(back.len(), 1); @@ -2081,11 +2137,12 @@ mod tests { let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap(); assert!(rows.row(0) < 
rows.row(1)); // [32, 52, 32] < [32, 52, 12] - assert!(rows.row(2) > rows.row(1)); // [32, 42] > [32, 52, 12] - assert!(rows.row(3) > rows.row(2)); // null > [32, 42] - assert!(rows.row(4) > rows.row(2)); // [32, null] > [32, 42] - assert!(rows.row(5) > rows.row(2)); // [] > [32, 42] + assert!(rows.row(2) > rows.row(1)); // [32, 52] > [32, 52, 12] + assert!(rows.row(3) > rows.row(2)); // null > [32, 52] + assert!(rows.row(4) > rows.row(2)); // [32, null] > [32, 52] + assert!(rows.row(5) > rows.row(2)); // [] > [32, 52] assert!(rows.row(3) > rows.row(5)); // null > [] + assert_eq!(rows.row(3), rows.row(6)); // null = null (different masked values) let back = converter.convert_rows(&rows).unwrap(); assert_eq!(back.len(), 1); @@ -2098,11 +2155,12 @@ mod tests { let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap(); assert!(rows.row(0) < rows.row(1)); // [32, 52, 32] < [32, 52, 12] - assert!(rows.row(2) > rows.row(1)); // [32, 42] > [32, 52, 12] - assert!(rows.row(3) < rows.row(2)); // null < [32, 42] - assert!(rows.row(4) < rows.row(2)); // [32, null] < [32, 42] - assert!(rows.row(5) > rows.row(2)); // [] > [32, 42] + assert!(rows.row(2) > rows.row(1)); // [32, 52] > [32, 52, 12] + assert!(rows.row(3) < rows.row(2)); // null < [32, 52] + assert!(rows.row(4) < rows.row(2)); // [32, null] < [32, 52] + assert!(rows.row(5) > rows.row(2)); // [] > [32, 52] assert!(rows.row(3) < rows.row(5)); // null < [] + assert_eq!(rows.row(3), rows.row(6)); // null = null (different masked values) let back = converter.convert_rows(&rows).unwrap(); assert_eq!(back.len(), 1); @@ -2321,6 +2379,64 @@ mod tests { assert_eq!(&back[0], &list); } + #[test] + fn test_two_fixed_size_lists() { + let mut first = FixedSizeListBuilder::new(UInt8Builder::new(), 1); + // 0: [100] + first.values().append_value(100); + first.append(true); + // 1: [101] + first.values().append_value(101); + first.append(true); + // 2: [102] + first.values().append_value(102); + first.append(true); + // 3: 
[null] + first.values().append_null(); + first.append(true); + // 4: null + first.values().append_null(); // MASKED + first.append(false); + let first = Arc::new(first.finish()) as ArrayRef; + let first_type = first.data_type().clone(); + + let mut second = FixedSizeListBuilder::new(UInt8Builder::new(), 1); + // 0: [200] + second.values().append_value(200); + second.append(true); + // 1: [201] + second.values().append_value(201); + second.append(true); + // 2: [202] + second.values().append_value(202); + second.append(true); + // 3: [null] + second.values().append_null(); + second.append(true); + // 4: null + second.values().append_null(); // MASKED + second.append(false); + let second = Arc::new(second.finish()) as ArrayRef; + let second_type = second.data_type().clone(); + + let converter = RowConverter::new(vec![ + SortField::new(first_type.clone()), + SortField::new(second_type.clone()), + ]) + .unwrap(); + + let rows = converter + .convert_columns(&[Arc::clone(&first), Arc::clone(&second)]) + .unwrap(); + + let back = converter.convert_rows(&rows).unwrap(); + assert_eq!(back.len(), 2); + back[0].to_data().validate_full().unwrap(); + assert_eq!(&back[0], &first); + back[1].to_data().validate_full().unwrap(); + assert_eq!(&back[1], &second); + } + fn generate_primitive_array(len: usize, valid_percent: f64) -> PrimitiveArray where K: ArrowPrimitiveType, diff --git a/arrow-row/src/list.rs b/arrow-row/src/list.rs index be7c02f3c86d..4884165d8368 100644 --- a/arrow-row/src/list.rs +++ b/arrow-row/src/list.rs @@ -192,7 +192,7 @@ pub fn compute_lengths_fixed_size_list( ) { let value_length = array.value_length().as_usize(); lengths.iter_mut().enumerate().for_each(|(idx, length)| { - *length = match array.is_valid(idx) { + *length += match array.is_valid(idx) { true => { 1 + ((idx * value_length)..(idx + 1) * value_length) .map(|child_idx| rows.row(child_idx).as_ref().len()) @@ -292,6 +292,7 @@ pub unsafe fn decode_fixed_size_list( row_offset = next_offset; } } + *row 
= &row[row_offset..]; // Update row for the next decoder } let children = converter.convert_raw(&mut child_rows, validate_utf8)?;