Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions parquet/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,11 @@ name = "async_read_parquet"
required-features = ["arrow", "async"]
path = "./examples/async_read_parquet.rs"

[[example]]
name = "read_with_row_filter"
required-features = ["arrow"]
path = "./examples/read_with_row_filter.rs"

[[example]]
name = "read_with_rowgroup"
required-features = ["arrow", "async"]
Expand Down
49 changes: 49 additions & 0 deletions parquet/examples/read_with_row_filter.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

use arrow_array::Int32Array;
use arrow_cast::pretty::print_batches;
use parquet::arrow::ProjectionMask;
use parquet::arrow::arrow_reader::{ArrowPredicateFn, ParquetRecordBatchReaderBuilder, RowFilter};
use parquet::errors::Result;
use std::fs::File;

// Demonstrates `RowFilter` usage via `with_row_filter`. For background and more
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are we better off removing this and keeping only the doctest to reduce duplication?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I agree -- I think the doc examples are easier to find so I recommend removing this example file

Actually, looking at the existing examples I think many of them are redundant / would be easier to find if we moved them into the documentation:
https://github.com/apache/arrow-rs/tree/main/parquet/examples

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

// context, see <https://arrow.apache.org/blog/2025/12/11/parquet-late-materialization-deep-dive/>
/// Reads `alltypes_plain.parquet` from the parquet test data directory with a
/// [`RowFilter`] that keeps only rows where `id > 4`, then pretty-prints the
/// surviving batches.
///
/// # Errors
///
/// Returns an error if the file cannot be opened, the reader cannot be built,
/// a record batch fails to decode, or the batches cannot be printed.
fn main() -> Result<()> {
    let testdata = arrow::util::test_util::parquet_test_data();
    let path = format!("{testdata}/alltypes_plain.parquet");
    let file = File::open(&path)?;
    let builder = ParquetRecordBatchReaderBuilder::try_new(file)?;
    let schema_desc = builder.metadata().file_metadata().schema_descr_ptr();

    // Create a predicate that evaluates `id > 4`. The `id` column has leaf index 0
    // in the file schema. The projection mask ensures that only the columns the
    // predicate needs are read in order to evaluate the filter.
    let projection_mask = ProjectionMask::leaves(&schema_desc, [0]);
    let predicate = ArrowPredicateFn::new(projection_mask, |batch| {
        // The batch handed to the predicate holds only the projected columns,
        // so `id` is column 0 of this batch.
        let id_col = batch.column(0);
        arrow::compute::kernels::cmp::gt(id_col, &Int32Array::new_scalar(4))
    });

    let row_filter = RowFilter::new(vec![Box::new(predicate)]);
    let reader = builder.with_row_filter(row_filter).build()?;

    // Propagate a decode failure to the caller instead of panicking; the first
    // error short-circuits the collection.
    let filtered_batches = reader.collect::<std::result::Result<Vec<_>, _>>()?;
    print_batches(&filtered_batches)?;

    Ok(())
}
30 changes: 30 additions & 0 deletions parquet/src/arrow/arrow_reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,36 @@ impl<T> ArrowReaderBuilder<T> {
///
/// It is recommended to enable reading the page index if using this functionality, to allow
/// more efficient skipping over data pages. See [`ArrowReaderOptions::with_page_index`].
///
/// For a runnable example, see `parquet/examples/read_with_row_filter.rs`.
/// See <https://arrow.apache.org/blog/2025/12/11/parquet-late-materialization-deep-dive/>
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

/// See the [blog post on late materialization] for a more technical explanation.
///
/// ...
///
/// [blog post on late materialization]: https://arrow.apache.org/blog/2025/12/11/parquet-late-materialization-deep-dive/

Slightly nicer formatting this way

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1

/// for a technical explanation of late materialization.
///
/// # Example
/// ```rust
/// # use std::fs::File;
/// # use arrow_array::Int32Array;
/// # use parquet::arrow::ProjectionMask;
/// # use parquet::arrow::arrow_reader::{ArrowPredicateFn, ParquetRecordBatchReaderBuilder, RowFilter};
/// # fn main() -> Result<(), parquet::errors::ParquetError> {
/// # let testdata = arrow::util::test_util::parquet_test_data();
/// # let path = format!("{testdata}/alltypes_plain.parquet");
/// # let file = File::open(&path)?;
/// let builder = ParquetRecordBatchReaderBuilder::try_new(file)?;
/// let schema_desc = builder.metadata().file_metadata().schema_descr_ptr();
///
/// // Create predicate that evaluates `id > 4`. The `id` column has index 0.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
/// // Create predicate: column id > 4. This col has index 0.
/// // Create predicate that evaluates `id > 4`. The `id` column has index 0.

/// let projection = ProjectionMask::leaves(&schema_desc, [0]);
/// let predicate = ArrowPredicateFn::new(projection, |batch| {
/// let id_col = batch.column(0);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As a minor suggestion, I think it would make a nicer example if you picked a different column from the file other than 0 so that it is clear the batch passed to the predicate only contains the selected projection column

For example, perhaps you could use the int_col (column index 4)

> select * from './parquet-testing/data/alltypes_plain.parquet';
+----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+
| id | bool_col | tinyint_col | smallint_col | int_col | bigint_col | float_col | double_col | date_string_col  | string_col | timestamp_col       |
+----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+
| 4  | true     | 0           | 0            | 0       | 0          | 0.0       | 0.0        | 30332f30312f3039 | 30         | 2009-03-01T00:00:00 |
| 5  | false    | 1           | 1            | 1       | 10         | 1.1       | 10.1       | 30332f30312f3039 | 31         | 2009-03-01T00:01:00 |
| 6  | true     | 0           | 0            | 0       | 0          | 0.0       | 0.0        | 30342f30312f3039 | 30         | 2009-04-01T00:00:00 |
| 7  | false    | 1           | 1            | 1       | 10         | 1.1       | 10.1       | 30342f30312f3039 | 31         | 2009-04-01T00:01:00 |
| 2  | true     | 0           | 0            | 0       | 0          | 0.0       | 0.0        | 30322f30312f3039 | 30         | 2009-02-01T00:00:00 |
| 3  | false    | 1           | 1            | 1       | 10         | 1.1       | 10.1       | 30322f30312f3039 | 31         | 2009-02-01T00:01:00 |
| 0  | true     | 0           | 0            | 0       | 0          | 0.0       | 0.0        | 30312f30312f3039 | 30         | 2009-01-01T00:00:00 |
| 1  | false    | 1           | 1            | 1       | 10         | 1.1       | 10.1       | 30312f30312f3039 | 31         | 2009-01-01T00:01:00 |
+----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+
8 row(s) fetched.
Elapsed 0.039 seconds.

> describe './parquet-testing/data/alltypes_plain.parquet';
+-----------------+---------------+-------------+
| column_name     | data_type     | is_nullable |
+-----------------+---------------+-------------+
| id              | Int32         | YES         |
| bool_col        | Boolean       | YES         |
| tinyint_col     | Int32         | YES         |
| smallint_col    | Int32         | YES         |
| int_col         | Int32         | YES         |
| bigint_col      | Int64         | YES         |
| float_col       | Float32       | YES         |
| double_col      | Float64       | YES         |
| date_string_col | BinaryView    | YES         |
| string_col      | BinaryView    | YES         |
| timestamp_col   | Timestamp(ns) | YES         |
+-----------------+---------------+-------------+
11 row(s) fetched.
Elapsed 0.005 seconds.

/// arrow::compute::kernels::cmp::gt(id_col, &Int32Array::new_scalar(4))
/// });
///
/// let row_filter = RowFilter::new(vec![Box::new(predicate)]);
/// let _reader = builder.with_row_filter(row_filter).build()?;
/// # Ok(())
/// # }
/// ```
pub fn with_row_filter(self, filter: RowFilter) -> Self {
Self {
filter: Some(filter),
Expand Down
Loading