Skip to content

Commit

Permalink
Do not write ColumnIndex for null columns when not writing page statistics (#6011)
Browse files Browse the repository at this point in the history

* disable column_index_builder if no page stats are collected

* add test

* no need to clone descr

---------

Co-authored-by: Andrew Lamb <[email protected]>
  • Loading branch information
etseidl and alamb committed Jul 16, 2024
1 parent b72098f commit 6ab853d
Showing 1 changed file with 31 additions and 1 deletion.
32 changes: 31 additions & 1 deletion parquet/src/column/writer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,12 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
// Used for level information
encodings.insert(Encoding::RLE);

// Disable column_index_builder if not collecting page statistics.
let mut column_index_builder = ColumnIndexBuilder::new();
if statistics_enabled != EnabledStatistics::Page {
column_index_builder.to_invalid()
}

Self {
descr,
props,
Expand Down Expand Up @@ -289,7 +295,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
num_column_nulls: 0,
column_distinct_count: None,
},
column_index_builder: ColumnIndexBuilder::new(),
column_index_builder,
offset_index_builder: OffsetIndexBuilder::new(),
encodings,
data_page_boundary_ascending: true,
Expand Down Expand Up @@ -3020,6 +3026,30 @@ mod tests {
assert!(incremented.is_none())
}

#[test]
fn test_no_column_index_when_stats_disabled() {
    // Regression test for https://github.com/apache/arrow-rs/issues/6010:
    // with statistics disabled, closing a writer for an all-nulls column must
    // still yield an offset index but must not yield a column index.
    let props = Arc::new(
        WriterProperties::builder()
            .set_statistics_enabled(EnabledStatistics::None)
            .build(),
    );
    let descr = Arc::new(get_test_column_descr::<Int32Type>(1, 0));
    let mut typed_writer = get_typed_column_writer::<Int32Type>(get_column_writer(
        descr,
        props,
        get_test_page_writer(),
    ));

    // No values at all, only ten definition levels of 0, i.e. every entry is null.
    let values = Vec::new();
    let null_def_levels = vec![0; 10];
    typed_writer
        .write_batch(&values, Some(&null_def_levels), None)
        .unwrap();
    typed_writer.flush_data_pages().unwrap();

    // Inspect the indexes reported when the column writer is closed.
    let closed = typed_writer.close().unwrap();
    assert!(closed.offset_index.is_some());
    assert!(closed.column_index.is_none());
}

#[test]
fn test_boundary_order() -> Result<()> {
let descr = Arc::new(get_test_column_descr::<Int32Type>(1, 0));
Expand Down

0 comments on commit 6ab853d

Please sign in to comment.