diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index f60688dc3337..cfb2462e738b 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -3101,13 +3101,16 @@ mod tests { } } + // Convert rows produced from convert_columns(). + // Note: validate_utf8 is set to false since Row is initialized through empty_rows() let back = converter.convert_rows(&rows).unwrap(); for (actual, expected) in back.iter().zip(&arrays) { actual.to_data().validate_full().unwrap(); dictionary_eq(actual, expected) } - // Check that we can convert + // Check that we can convert rows into ByteArray and then parse, convert it back to array + // Note: validate_utf8 is set to true since Row is initialized through RowParser let rows = rows.try_into_binary().expect("reasonable size"); let parser = converter.parser(); let back = converter @@ -3238,4 +3241,64 @@ mod tests { Ok(_) => panic!("Expected NotYetImplemented error for map data type"), } } + + #[test] + fn test_values_buffer_smaller_when_utf8_validation_disabled() { + fn get_values_buffer_len(col: ArrayRef) -> (usize, usize) { + // 1. Convert cols into rows + let converter = RowConverter::new(vec![SortField::new(DataType::Utf8View)]).unwrap(); + + // 2a. Convert rows into colsa (validate_utf8 = false) + let rows = converter.convert_columns(&[col]).unwrap(); + let converted = converter.convert_rows(&rows).unwrap(); + let unchecked_values_len = converted[0].as_string_view().data_buffers()[0].len(); + + // 2b. Convert rows into cols (validate_utf8 = true since Row is initialized through RowParser) + let rows = rows.try_into_binary().expect("reasonable size"); + let parser = converter.parser(); + let converted = converter + .convert_rows(rows.iter().map(|b| parser.parse(b.expect("valid bytes")))) + .unwrap(); + let checked_values_len = converted[0].as_string_view().data_buffers()[0].len(); + (unchecked_values_len, checked_values_len) + } + + // Case1. StringViewArray with inline strings + let col = Arc::new(StringViewArray::from_iter([ + Some("hello"), // short(5) + None, // null + Some("short"), // short(5) + Some("tiny"), // short(4) + ])) as ArrayRef; + + let (unchecked_values_len, checked_values_len) = get_values_buffer_len(col); + // Since there are no long (>12) strings, len of values buffer is 0 + assert_eq!(unchecked_values_len, 0); + // When utf8 validation enabled, values buffer includes inline strings (5+5+4) + assert_eq!(checked_values_len, 14); + + // Case2. StringViewArray with long(>12) strings + let col = Arc::new(StringViewArray::from_iter([ + Some("this is a very long string over 12 bytes"), + Some("another long string to test the buffer"), + ])) as ArrayRef; + + let (unchecked_values_len, checked_values_len) = get_values_buffer_len(col); + // Since there are no inline strings, expected length of values buffer is the same + assert!(unchecked_values_len > 0); + assert_eq!(unchecked_values_len, checked_values_len); + + // Case3. StringViewArray with both short and long strings + let col = Arc::new(StringViewArray::from_iter([ + Some("tiny"), // 4 (short) + Some("thisisexact13"), // 13 (long) + None, + Some("short"), // 5 (short) + ])) as ArrayRef; + + let (unchecked_values_len, checked_values_len) = get_values_buffer_len(col); + // Since there is single long string, len of values buffer is 13 + assert_eq!(unchecked_values_len, 13); + assert!(checked_values_len > unchecked_values_len); + } } diff --git a/arrow-row/src/variable.rs b/arrow-row/src/variable.rs index 4d4bcddc0807..7b19b4017617 100644 --- a/arrow-row/src/variable.rs +++ b/arrow-row/src/variable.rs @@ -20,7 +20,7 @@ use arrow_array::builder::BufferBuilder; use arrow_array::*; use arrow_buffer::bit_util::ceil; use arrow_buffer::MutableBuffer; -use arrow_data::ArrayDataBuilder; +use arrow_data::{ArrayDataBuilder, MAX_INLINE_VIEW_LEN}; use arrow_schema::{DataType, SortOptions}; use builder::make_view; @@ -249,9 +249,10 @@ pub fn decode_binary( fn decode_binary_view_inner( rows: &mut [&[u8]], options: SortOptions, - check_utf8: bool, + validate_utf8: bool, ) -> BinaryViewArray { let len = rows.len(); + let inline_str_max_len = MAX_INLINE_VIEW_LEN as usize; let mut null_count = 0; @@ -261,13 +262,33 @@ fn decode_binary_view_inner( valid }); - let values_capacity: usize = rows.iter().map(|row| decoded_len(row, options)).sum(); + // If we are validating UTF-8, decode all string values (including short strings) + // into the values buffer and validate UTF-8 once. If not validating, + // we save memory by only copying long strings to the values buffer, as short strings + // will be inlined into the view and do not need to be stored redundantly. + let values_capacity = if validate_utf8 { + // Capacity for all long and short strings + rows.iter().map(|row| decoded_len(row, options)).sum() + } else { + // Capacity for all long strings plus room for one short string + rows.iter().fold(0, |acc, row| { + let len = decoded_len(row, options); + if len > inline_str_max_len { + acc + len + } else { + acc + } + }) + inline_str_max_len + }; let mut values = MutableBuffer::new(values_capacity); - let mut views = BufferBuilder::::new(len); + let mut views = BufferBuilder::::new(len); for row in rows { let start_offset = values.len(); let offset = decode_blocks(row, options, |b| values.extend_from_slice(b)); + // Measure string length via change in values buffer. + // Used to check if decoded value should be truncated (short string) when validate_utf8 is false + let decoded_len = values.len() - start_offset; if row[0] == null_sentinel(options) { debug_assert_eq!(offset, 1); debug_assert_eq!(start_offset, values.len()); @@ -282,11 +303,16 @@ fn decode_binary_view_inner( let view = make_view(val, 0, start_offset as u32); views.append(view); + + // truncate inline string in values buffer if validate_utf8 is false + if !validate_utf8 && decoded_len <= inline_str_max_len { + values.truncate(start_offset); + } } *row = &row[offset..]; } - if check_utf8 { + if validate_utf8 { // the values contains all data, no matter if it is short or long // we can validate utf8 in one go. std::str::from_utf8(values.as_slice()).unwrap();