From 9f674407a219936b26f770f86c67e46ce80d957c Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 13 Jun 2024 20:38:56 -0400 Subject: [PATCH] Document when the ParquetRecordBatchReader will re-read metadata --- parquet/src/arrow/arrow_reader/mod.rs | 50 ++++++++++++++++++++++----- parquet/src/arrow/async_reader/mod.rs | 6 ++++ 2 files changed, 48 insertions(+), 8 deletions(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 6b95324bee39..fd9cbf2039ba 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -248,7 +248,9 @@ impl ArrowReaderBuilder { /// is then read from the file, including projection and filter pushdown #[derive(Debug, Clone, Default)] pub struct ArrowReaderOptions { + /// Should the reader strip any user defined metadata from the Arrow schema skip_arrow_metadata: bool, + /// If true, attempt to read `OffsetIndex` and `ColumnIndex` pub(crate) page_index: bool, } @@ -282,23 +284,41 @@ impl ArrowReaderOptions { } } -/// The cheaply clone-able metadata necessary to construct a [`ArrowReaderBuilder`] +/// The metadata necessary to construct a [`ArrowReaderBuilder`] /// -/// This allows loading the metadata for a file once and then using this to construct -/// multiple separate readers, for example, to distribute readers across multiple threads +/// Note this structure is cheaply clone-able as it consists of several arcs. +/// +/// This structure allows +/// +/// 1. Loading metadata for a file once and then using that same metadata to +/// construct multiple separate readers, for example, to distribute readers +/// across multiple threads +/// +/// 2. Using a cached copy of the [`ParquetMetadata`] rather than reading it +/// from the file each time a reader is constructed. 
+/// +/// [`ParquetMetadata`]: crate::file::metadata::ParquetMetaData #[derive(Debug, Clone)] pub struct ArrowReaderMetadata { + /// The Parquet Metadata, if known a priori pub(crate) metadata: Arc, - + /// The Arrow Schema pub(crate) schema: SchemaRef, pub(crate) fields: Option>, } impl ArrowReaderMetadata { - /// Loads [`ArrowReaderMetadata`] from the provided [`ChunkReader`] + /// Loads [`ArrowReaderMetadata`] from the provided [`ChunkReader`], if necessary + /// + /// See [`ParquetRecordBatchReaderBuilder::new_with_metadata`] for an + /// example of how this can be used + /// + /// # Notes /// - /// See [`ParquetRecordBatchReaderBuilder::new_with_metadata`] for how this can be used + /// If `options` has [`ArrowReaderOptions::with_page_index`] true, but + /// `Self::metadata` is missing the page index, this function will attempt + /// to load the page index by making an object store request. pub fn load(reader: &T, options: ArrowReaderOptions) -> Result { let mut metadata = footer::parse_metadata(reader)?; if options.page_index { @@ -320,6 +340,12 @@ impl ArrowReaderMetadata { Self::try_new(Arc::new(metadata), options) } + /// Create a new [`ArrowReaderMetadata`] + /// + /// # Notes + /// + /// This function does not attempt to load the PageIndex if not present in the metadata. + /// See [`Self::load`] for more details. pub fn try_new(metadata: Arc, options: ArrowReaderOptions) -> Result { let kv_metadata = match options.skip_arrow_metadata { true => None, @@ -407,9 +433,17 @@ impl ParquetRecordBatchReaderBuilder { /// Create a [`ParquetRecordBatchReaderBuilder`] from the provided [`ArrowReaderMetadata`] /// - /// This allows loading metadata once and using it to create multiple builders with - /// potentially different settings + /// This interface allows: + /// + /// 1. Loading metadata once and using it to create multiple builders with + /// potentially different settings or run on different threads /// + /// 2. 
Using a cached copy of the metadata rather than re-reading it from the + /// file each time a reader is constructed. + /// + /// See the docs on [`ArrowReaderMetadata`] for more details + /// + /// # Example /// ``` /// # use std::fs::metadata; /// # use std::sync::Arc; diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index 1e298c654975..0a72583b90d0 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -202,6 +202,12 @@ impl ArrowReaderMetadata { /// Returns a new [`ArrowReaderMetadata`] for this builder /// /// See [`ParquetRecordBatchStreamBuilder::new_with_metadata`] for how this can be used + /// + /// # Notes + /// + /// If `options` has [`ArrowReaderOptions::with_page_index`] true, but + /// `Self::metadata` is missing the page index, this function will attempt + /// to load the page index by making an object store request. pub async fn load_async( input: &mut T, options: ArrowReaderOptions,