From daf8f4ccca4ad230ccb96f7bde83d98b3715d613 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Fri, 25 Jul 2025 00:33:45 +0300 Subject: [PATCH 1/3] perf: only encode actual list values in `RowConverter` Waiting for: - #7994 to be merged first Closes #7993 --- arrow-row/src/lib.rs | 29 +++++++++++++++++++++++++---- arrow-row/src/list.rs | 12 ++++++++---- 2 files changed, 33 insertions(+), 8 deletions(-) diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index 325d2953c858..2ec1d06cf466 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -518,12 +518,33 @@ impl Codec { } Codec::List(converter) => { let values = match array.data_type() { - DataType::List(_) => as_list_array(array).values(), - DataType::LargeList(_) => as_large_list_array(array).values(), - DataType::FixedSizeList(_, _) => as_fixed_size_list_array(array).values(), + DataType::List(_) => { + let list_array = as_list_array(array); + let first_offset = list_array.offsets()[0] as usize; + let last_offset = + list_array.offsets()[list_array.offsets().len() - 1] as usize; + + list_array + .values() + .slice(first_offset, last_offset - first_offset) + } + DataType::LargeList(_) => { + let list_array = as_large_list_array(array); + + let first_offset = list_array.offsets()[0] as usize; + let last_offset = + list_array.offsets()[list_array.offsets().len() - 1] as usize; + + list_array + .values() + .slice(first_offset, last_offset - first_offset) + } + DataType::FixedSizeList(_, _) => { + as_fixed_size_list_array(array).values().clone() + } _ => unreachable!(), }; - let rows = converter.convert_columns(&[values.clone()])?; + let rows = converter.convert_columns(&[values])?; Ok(Encoder::List(rows)) } Codec::RunEndEncoded(converter) => { diff --git a/arrow-row/src/list.rs b/arrow-row/src/list.rs index e9dc38e0fbe3..733b298aaafd 100644 --- a/arrow-row/src/list.rs +++ b/arrow-row/src/list.rs @@ -27,14 +27,16 @@ pub fn compute_lengths( rows: &Rows, array: &GenericListArray, ) { + let shift = array.value_offsets().first().map_or(0, |o| o.as_usize()); + let offsets = array.value_offsets().windows(2); lengths .iter_mut() .zip(offsets) .enumerate() .for_each(|(idx, (length, offsets))| { - let start = offsets[0].as_usize(); - let end = offsets[1].as_usize(); + let start = offsets[0].as_usize() - shift; + let end = offsets[1].as_usize() - shift; let range = array.is_valid(idx).then_some(start..end); *length += encoded_len(rows, range); }); @@ -61,14 +63,16 @@ pub fn encode( opts: SortOptions, array: &GenericListArray, ) { + let shift = array.value_offsets().first().map_or(0, |o| o.as_usize()); + offsets .iter_mut() .skip(1) .zip(array.value_offsets().windows(2)) .enumerate() .for_each(|(idx, (offset, offsets))| { - let start = offsets[0].as_usize(); - let end = offsets[1].as_usize(); + let start = offsets[0].as_usize() - shift; + let end = offsets[1].as_usize() - shift; let range = array.is_valid(idx).then_some(start..end); let out = &mut data[*offset..]; *offset += encode_one(out, rows, range, opts) From 2e034fe69bff445e6d8a11422d902db1cdf4f282 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Fri, 25 Jul 2025 00:39:12 +0300 Subject: [PATCH 2/3] avoid map_or as there must always be at least 1 offset --- arrow-row/src/list.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arrow-row/src/list.rs b/arrow-row/src/list.rs index 733b298aaafd..91c788fc8f41 100644 --- a/arrow-row/src/list.rs +++ b/arrow-row/src/list.rs @@ -27,7 +27,7 @@ pub fn compute_lengths( rows: &Rows, array: &GenericListArray, ) { - let shift = array.value_offsets().first().map_or(0, |o| o.as_usize()); + let shift = array.value_offsets()[0].as_usize(); let offsets = array.value_offsets().windows(2); lengths @@ -63,7 +63,7 @@ pub fn encode( opts: SortOptions, array: &GenericListArray, ) { - let shift = array.value_offsets().first().map_or(0, |o| o.as_usize()); + let shift = array.value_offsets()[0].as_usize(); offsets .iter_mut() From 4270ee78f62a13f8eaadec768dbf1c301b65fc37 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Sun, 27 Jul 2025 13:56:47 +0300 Subject: [PATCH 3/3] added comment --- arrow-row/src/lib.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index 2ec1d06cf466..f05e8144e45d 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -524,6 +524,8 @@ impl Codec { let last_offset = list_array.offsets()[list_array.offsets().len() - 1] as usize; + // values can include more data than referenced in the ListArray, only encode + // the referenced values. list_array .values() .slice(first_offset, last_offset - first_offset) @@ -535,6 +537,8 @@ impl Codec { let last_offset = list_array.offsets()[list_array.offsets().len() - 1] as usize; + // values can include more data than referenced in the LargeListArray, only encode + // the referenced values. list_array .values() .slice(first_offset, last_offset - first_offset)