Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
150 changes: 133 additions & 17 deletions arrow-row/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -336,6 +336,46 @@ mod variable;
///
/// With `[]` represented by an empty byte array, and `null` a null byte array.
///
/// ## Fixed Size List Encoding
///
/// Fixed Size Lists are encoded by first encoding all child elements to the row format.
///
/// A non-null list value is then encoded as 0x01 followed by the concatenation of each
/// of the child elements. A null list value is encoded as a null marker.
///
/// For example given:
///
/// ```text
/// [1_u8, 2_u8]
/// [3_u8, null]
/// null
/// ```
///
/// The elements would be converted to:
///
/// ```text
///   ┌──┬──┐      ┌──┬──┐      ┌──┬──┐         ┌──┬──┐
/// 1 │01│01│    2 │01│02│    3 │01│03│    null │00│00│
///   └──┴──┘      └──┴──┘      └──┴──┘         └──┴──┘
/// ```
///
/// Which would be encoded as:
///
/// ```text
///              ┌──┬──┬──┬──┬──┐
/// [1_u8, 2_u8] │01│01│01│01│02│
///              └──┴──┴──┴──┴──┘
///                  └ 1 ┘ └ 2 ┘
///              ┌──┬──┬──┬──┬──┐
/// [3_u8, null] │01│01│03│00│00│
///              └──┴──┴──┴──┴──┘
///                  └ 3 ┘ └null┘
///              ┌──┐
/// null         │00│
///              └──┘
///
/// ```
///
/// # Ordering
///
/// ## Float Ordering
Expand Down Expand Up @@ -702,7 +742,18 @@ impl RowConverter {
// SAFETY
// We have validated that the rows came from this [`RowConverter`]
// and therefore must be valid
unsafe { self.convert_raw(&mut rows, validate_utf8) }
let result = unsafe { self.convert_raw(&mut rows, validate_utf8) }?;

for (i, row) in rows.iter().enumerate() {
if !row.is_empty() {
return Err(ArrowError::InvalidArgumentError(format!(
"Codecs {codecs:?} did not consume all bytes for row {i}, remaining bytes: {row:?}",
codecs = &self.codecs
)));
}
}

Ok(result)
}

/// Returns an empty [`Rows`] with capacity for `row_capacity` rows with
Expand Down Expand Up @@ -2039,6 +2090,9 @@ mod tests {
builder.values().append_null();
builder.append(true);
builder.append(true);
builder.values().append_value(17); // MASKED
builder.values().append_null(); // MASKED
builder.append(false);

let list = Arc::new(builder.finish()) as ArrayRef;
let d = list.data_type().clone();
Expand All @@ -2047,11 +2101,12 @@ mod tests {

let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap();
assert!(rows.row(0) > rows.row(1)); // [32, 52, 32] > [32, 52, 12]
assert!(rows.row(2) < rows.row(1)); // [32, 42] < [32, 52, 12]
assert!(rows.row(3) < rows.row(2)); // null < [32, 42]
assert!(rows.row(4) < rows.row(2)); // [32, null] < [32, 42]
assert!(rows.row(5) < rows.row(2)); // [] < [32, 42]
assert!(rows.row(2) < rows.row(1)); // [32, 52] < [32, 52, 12]
assert!(rows.row(3) < rows.row(2)); // null < [32, 52]
assert!(rows.row(4) < rows.row(2)); // [32, null] < [32, 52]
assert!(rows.row(5) < rows.row(2)); // [] < [32, 52]
assert!(rows.row(3) < rows.row(5)); // null < []
assert_eq!(rows.row(3), rows.row(6)); // null = null (different masked values)

let back = converter.convert_rows(&rows).unwrap();
assert_eq!(back.len(), 1);
Expand All @@ -2064,11 +2119,12 @@ mod tests {
let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap();

assert!(rows.row(0) > rows.row(1)); // [32, 52, 32] > [32, 52, 12]
assert!(rows.row(2) < rows.row(1)); // [32, 42] < [32, 52, 12]
assert!(rows.row(3) > rows.row(2)); // null > [32, 42]
assert!(rows.row(4) > rows.row(2)); // [32, null] > [32, 42]
assert!(rows.row(5) < rows.row(2)); // [] < [32, 42]
assert!(rows.row(2) < rows.row(1)); // [32, 52] < [32, 52, 12]
assert!(rows.row(3) > rows.row(2)); // null > [32, 52]
assert!(rows.row(4) > rows.row(2)); // [32, null] > [32, 52]
assert!(rows.row(5) < rows.row(2)); // [] < [32, 52]
assert!(rows.row(3) > rows.row(5)); // null > []
assert_eq!(rows.row(3), rows.row(6)); // null = null (different masked values)

let back = converter.convert_rows(&rows).unwrap();
assert_eq!(back.len(), 1);
Expand All @@ -2081,11 +2137,12 @@ mod tests {
let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap();

assert!(rows.row(0) < rows.row(1)); // [32, 52, 32] < [32, 52, 12]
assert!(rows.row(2) > rows.row(1)); // [32, 42] > [32, 52, 12]
assert!(rows.row(3) > rows.row(2)); // null > [32, 42]
assert!(rows.row(4) > rows.row(2)); // [32, null] > [32, 42]
assert!(rows.row(5) > rows.row(2)); // [] > [32, 42]
assert!(rows.row(2) > rows.row(1)); // [32, 52] > [32, 52, 12]
assert!(rows.row(3) > rows.row(2)); // null > [32, 52]
assert!(rows.row(4) > rows.row(2)); // [32, null] > [32, 52]
assert!(rows.row(5) > rows.row(2)); // [] > [32, 52]
assert!(rows.row(3) > rows.row(5)); // null > []
assert_eq!(rows.row(3), rows.row(6)); // null = null (different masked values)

let back = converter.convert_rows(&rows).unwrap();
assert_eq!(back.len(), 1);
Expand All @@ -2098,11 +2155,12 @@ mod tests {
let rows = converter.convert_columns(&[Arc::clone(&list)]).unwrap();

assert!(rows.row(0) < rows.row(1)); // [32, 52, 32] < [32, 52, 12]
assert!(rows.row(2) > rows.row(1)); // [32, 42] > [32, 52, 12]
assert!(rows.row(3) < rows.row(2)); // null < [32, 42]
assert!(rows.row(4) < rows.row(2)); // [32, null] < [32, 42]
assert!(rows.row(5) > rows.row(2)); // [] > [32, 42]
assert!(rows.row(2) > rows.row(1)); // [32, 52] > [32, 52, 12]
assert!(rows.row(3) < rows.row(2)); // null < [32, 52]
assert!(rows.row(4) < rows.row(2)); // [32, null] < [32, 52]
assert!(rows.row(5) > rows.row(2)); // [] > [32, 52]
assert!(rows.row(3) < rows.row(5)); // null < []
assert_eq!(rows.row(3), rows.row(6)); // null = null (different masked values)

let back = converter.convert_rows(&rows).unwrap();
assert_eq!(back.len(), 1);
Expand Down Expand Up @@ -2321,6 +2379,64 @@ mod tests {
assert_eq!(&back[0], &list);
}

#[test]
fn test_two_fixed_size_lists() {
    // Round-trips two fixed-size-list columns through a single `RowConverter`
    // and checks that each decoded column equals its input.

    // Builds a fixed-size-list column (list size 1, `UInt8` child) containing,
    // in order: one valid list `[v]` for every `v` in `present`, then the valid
    // list `[null]`, then a null list.
    let build_column = |present: &[u8]| -> ArrayRef {
        let mut builder = FixedSizeListBuilder::new(UInt8Builder::new(), 1);
        for &value in present {
            builder.values().append_value(value);
            builder.append(true);
        }

        // Valid list whose single element is null: [null]
        builder.values().append_null();
        builder.append(true);

        // Null list: a child slot is still appended, but it is masked by the
        // list-level null.
        builder.values().append_null(); // MASKED
        builder.append(false);

        Arc::new(builder.finish()) as ArrayRef
    };

    // Rows: [100], [101], [102], [null], null
    let first = build_column(&[100, 101, 102]);
    // Rows: [200], [201], [202], [null], null
    let second = build_column(&[200, 201, 202]);

    let fields = vec![
        SortField::new(first.data_type().clone()),
        SortField::new(second.data_type().clone()),
    ];
    let converter = RowConverter::new(fields).unwrap();

    let columns = [Arc::clone(&first), Arc::clone(&second)];
    let rows = converter.convert_columns(&columns).unwrap();

    let back = converter.convert_rows(&rows).unwrap();
    assert_eq!(back.len(), 2);
    for (decoded, expected) in back.iter().zip([&first, &second]) {
        decoded.to_data().validate_full().unwrap();
        assert_eq!(decoded, expected);
    }
}

fn generate_primitive_array<K>(len: usize, valid_percent: f64) -> PrimitiveArray<K>
where
K: ArrowPrimitiveType,
Expand Down
3 changes: 2 additions & 1 deletion arrow-row/src/list.rs
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ pub fn compute_lengths_fixed_size_list(
) {
let value_length = array.value_length().as_usize();
lengths.iter_mut().enumerate().for_each(|(idx, length)| {
*length = match array.is_valid(idx) {
*length += match array.is_valid(idx) {
true => {
1 + ((idx * value_length)..(idx + 1) * value_length)
.map(|child_idx| rows.row(child_idx).as_ref().len())
Expand Down Expand Up @@ -292,6 +292,7 @@ pub unsafe fn decode_fixed_size_list(
row_offset = next_offset;
}
}
*row = &row[row_offset..]; // Update row for the next decoder
}

let children = converter.convert_raw(&mut child_rows, validate_utf8)?;
Expand Down
Loading