Skip to content

Commit b58d786

Browse files
committed
Apply page index filter.
1 parent 1e774a2 commit b58d786

File tree

4 files changed

+82
-24
lines changed

4 files changed

+82
-24
lines changed

src/query/storages/parquet/src/parquet_source.rs

Lines changed: 75 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@
1515
use std::any::Any;
1616
use std::sync::Arc;
1717

18+
use common_arrow::arrow::bitmap::Bitmap;
19+
use common_arrow::arrow::bitmap::MutableBitmap;
20+
use common_arrow::parquet::indexes::Interval;
1821
use common_base::base::Progress;
1922
use common_base::base::ProgressValues;
2023
use common_catalog::plan::PartInfoPtr;
@@ -48,9 +51,14 @@ struct PrewhereData {
4851
/// The states for [`ParquetSource`]. The states will recycle for each row group of a parquet file.
4952
enum State {
5053
ReadDataPrewhere(Option<PartInfoPtr>),
51-
ReadDataRemain(PartInfoPtr, PrewhereData),
52-
PrewhereFilter(PartInfoPtr, Vec<IndexedChunk>),
53-
Deserialize(PartInfoPtr, Vec<IndexedChunk>, Option<PrewhereData>),
54+
ReadDataRemain(PartInfoPtr, PrewhereData, Option<Bitmap>),
55+
PrewhereFilter(PartInfoPtr, Vec<IndexedChunk>, Option<Bitmap>),
56+
Deserialize(
57+
PartInfoPtr,
58+
Vec<IndexedChunk>,
59+
Option<PrewhereData>,
60+
Option<Bitmap>,
61+
),
5462
Generated(Option<PartInfoPtr>, DataBlock),
5563
Finish,
5664
}
@@ -108,12 +116,18 @@ impl ParquetSource {
108116
&mut self,
109117
part: PartInfoPtr,
110118
raw_chunks: Vec<IndexedChunk>,
119+
row_selection: Option<Bitmap>,
111120
) -> Result<()> {
112121
let rg_part = ParquetRowGroupPart::from_part(&part)?;
113122
// deserialize prewhere data block first
114-
let data_block = self
115-
.prewhere_reader
116-
.deserialize(rg_part, raw_chunks, None)?;
123+
let data_block = if let Some(row_selection) = &row_selection {
124+
self.prewhere_reader
125+
.deserialize(rg_part, raw_chunks, Some(row_selection.clone()))?
126+
} else {
127+
self.prewhere_reader
128+
.deserialize(rg_part, raw_chunks, None)?
129+
};
130+
117131
if let Some(filter) = self.prewhere_filter.as_ref() {
118132
// do filter
119133
let func_ctx = self.ctx.try_get_function_context()?;
@@ -170,10 +184,14 @@ impl ParquetSource {
170184
filtered_block.resort(self.src_schema.as_ref(), self.output_schema.as_ref())?;
171185
self.state = Generated(self.ctx.try_get_part(), block);
172186
} else {
173-
self.state = State::ReadDataRemain(part, PrewhereData {
174-
data_block: filtered_block,
175-
filter,
176-
});
187+
self.state = State::ReadDataRemain(
188+
part,
189+
PrewhereData {
190+
data_block: filtered_block,
191+
filter,
192+
},
193+
row_selection,
194+
);
177195
}
178196
Ok(())
179197
} else {
@@ -188,6 +206,7 @@ impl ParquetSource {
188206
part: PartInfoPtr,
189207
raw_chunks: Vec<IndexedChunk>,
190208
prewhere_data: Option<PrewhereData>,
209+
row_selection: Option<Bitmap>,
191210
) -> Result<()> {
192211
let rg_part = ParquetRowGroupPart::from_part(&part)?;
193212
let output_block = if let Some(PrewhereData {
@@ -207,8 +226,15 @@ impl ParquetSource {
207226
}
208227
Value::Column(bitmap) => {
209228
if !self.read_options.push_down_bitmap() || bitmap.unset_bits() == 0 {
210-
// don't need filter
211-
let block = remain_reader.deserialize(rg_part, raw_chunks, None)?;
229+
let block = if let Some(row_selection) = &row_selection {
230+
remain_reader.deserialize(
231+
rg_part,
232+
raw_chunks,
233+
Some(row_selection.clone()),
234+
)?
235+
} else {
236+
remain_reader.deserialize(rg_part, raw_chunks, None)?
237+
};
212238
DataBlock::filter_with_bitmap(block, &bitmap)?
213239
} else {
214240
remain_reader.deserialize(rg_part, raw_chunks, Some(bitmap))?
@@ -297,9 +323,9 @@ impl Processor for ParquetSource {
297323
match self.state {
298324
State::Finish => Ok(Event::Finished),
299325
State::ReadDataPrewhere(_)
300-
| State::ReadDataRemain(_, _)
301-
| State::PrewhereFilter(_, _)
302-
| State::Deserialize(_, _, _) => Ok(Event::Sync),
326+
| State::ReadDataRemain(_, _, _)
327+
| State::PrewhereFilter(_, _, _)
328+
| State::Deserialize(_, _, _, _) => Ok(Event::Sync),
303329
State::Generated(_, _) => Err(ErrorCode::Internal("It's a bug.")),
304330
}
305331
}
@@ -308,32 +334,59 @@ impl Processor for ParquetSource {
308334
match std::mem::replace(&mut self.state, State::Finish) {
309335
State::ReadDataPrewhere(Some(part)) => {
310336
let rg_part = ParquetRowGroupPart::from_part(&part)?;
337+
let row_selection = rg_part
338+
.row_selection
339+
.as_ref()
340+
.map(|sel| intervals_to_bitmap(sel, rg_part.num_rows));
311341
let chunks = self.prewhere_reader.sync_read_columns(rg_part)?;
312342
if self.prewhere_filter.is_some() {
313-
self.state = State::PrewhereFilter(part, chunks);
343+
self.state = State::PrewhereFilter(part, chunks, row_selection);
314344
} else {
315345
// If there is no prewhere filter, it means there is only the prewhere reader.
316346
assert!(self.remain_reader.is_none());
317347
// So all the needed columns are read.
318-
self.state = State::Deserialize(part, chunks, None)
348+
self.state = State::Deserialize(part, chunks, None, row_selection)
319349
}
320350
Ok(())
321351
}
322-
State::ReadDataRemain(part, prewhere_data) => {
352+
State::ReadDataRemain(part, prewhere_data, row_selection) => {
323353
if let Some(remain_reader) = self.remain_reader.as_ref() {
324354
let rg_part = ParquetRowGroupPart::from_part(&part)?;
325355
let chunks = remain_reader.sync_read_columns(rg_part)?;
326-
self.state = State::Deserialize(part, chunks, Some(prewhere_data));
356+
self.state =
357+
State::Deserialize(part, chunks, Some(prewhere_data), row_selection);
327358
Ok(())
328359
} else {
329360
Err(ErrorCode::Internal("It's a bug. No remain reader"))
330361
}
331362
}
332-
State::PrewhereFilter(part, chunks) => self.do_prewhere_filter(part, chunks),
333-
State::Deserialize(part, chunks, prewhere_data) => {
334-
self.do_deserialize(part, chunks, prewhere_data)
363+
State::PrewhereFilter(part, chunks, row_selection) => {
364+
self.do_prewhere_filter(part, chunks, row_selection)
365+
}
366+
State::Deserialize(part, chunks, prewhere_data, row_selection) => {
367+
self.do_deserialize(part, chunks, prewhere_data, row_selection)
335368
}
336369
_ => Err(ErrorCode::Internal("It's a bug.")),
337370
}
338371
}
339372
}
373+
374+
/// Convert intervals to a bitmap. The `intervals` represents the row selection across `num_rows`.
375+
fn intervals_to_bitmap(interval: &[Interval], num_rows: usize) -> Bitmap {
376+
debug_assert!(
377+
interval.is_empty()
378+
|| interval.last().unwrap().start + interval.last().unwrap().length < num_rows
379+
);
380+
381+
let mut bitmap = MutableBitmap::with_capacity(num_rows);
382+
let mut offset = 0;
383+
384+
for intv in interval {
385+
bitmap.extend_constant(intv.start - offset, false);
386+
bitmap.extend_constant(intv.length, true);
387+
offset = intv.start + intv.length;
388+
}
389+
bitmap.extend_constant(num_rows - offset, false);
390+
391+
bitmap.into()
392+
}

src/query/storages/parquet/src/pruning.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,8 @@ pub fn prune_and_set_partitions(
134134
page_pruners
135135
.as_ref()
136136
.map(|pruners| filter_pages(&mut file, schema, rg, pruners))
137-
.transpose()?
137+
.transpose()
138+
.unwrap_or(None)
138139
} else {
139140
None
140141
};

src/query/storages/parquet/src/table_function/read.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,7 @@ impl ParquetTable {
180180
prewhere_reader.clone(),
181181
prewhere_filter.clone(),
182182
remain_reader.clone(),
183+
self.read_options,
183184
)
184185
},
185186
max_io_requests,

src/query/storages/parquet/src/table_function/table.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,10 @@ impl ParquetTable {
152152
table_info,
153153
arrow_schema,
154154
operator,
155-
read_options: ReadOptions::new(), // Now, `read_options` is hard-coded.
155+
read_options: ReadOptions::new()
156+
.with_prune_row_groups()
157+
.with_prune_pages()
158+
.with_do_prewhere(), // Now, `read_options` is hard-coded.
156159
}))
157160
}
158161
}

0 commit comments

Comments
 (0)