ARROW-10387: [Rust][Parquet] Avoid call for file size metadata to read footer #8525
```diff
@@ -16,7 +16,6 @@
 // under the License.
 
 use std::{
-    cmp::min,
     io::{Cursor, Read, Seek, SeekFrom},
     rc::Rc,
 };
@@ -29,8 +28,9 @@ use crate::basic::ColumnOrder;
 use crate::errors::{ParquetError, Result};
 use crate::file::{
-    metadata::*, reader::ChunkReader, DEFAULT_FOOTER_READ_SIZE, FOOTER_SIZE,
-    PARQUET_MAGIC,
+    metadata::*,
+    reader::{ChunkMode, ChunkReader, Length},
+    DEFAULT_FOOTER_READ_SIZE, FOOTER_SIZE, PARQUET_MAGIC,
 };
 
 use crate::schema::types::{self, SchemaDescriptor};
@@ -44,30 +44,31 @@ use crate::schema::types::{self, SchemaDescriptor};
 /// The reader first reads DEFAULT_FOOTER_SIZE bytes from the end of the file.
 /// If it is not enough according to the length indicated in the footer, it reads more bytes.
 pub fn parse_metadata<R: ChunkReader>(chunk_reader: &R) -> Result<ParquetMetaData> {
-    // check file is large enough to hold footer
-    let file_size = chunk_reader.len();
-    if file_size < (FOOTER_SIZE as u64) {
+    // read and cache up to DEFAULT_FOOTER_READ_SIZE bytes from the end and process the footer
+    let mut first_end_read = chunk_reader.get_read(
+        ChunkMode::FromEnd(DEFAULT_FOOTER_READ_SIZE as u64),
+        DEFAULT_FOOTER_READ_SIZE,
+    )?;
+    let first_end_len = first_end_read.len() as usize;
+
+    if first_end_len < FOOTER_SIZE {
         return Err(general_err!(
             "Invalid Parquet file. Size is smaller than footer"
         ));
     }
 
-    // read and cache up to DEFAULT_FOOTER_READ_SIZE bytes from the end and process the footer
-    let default_end_len = min(DEFAULT_FOOTER_READ_SIZE, chunk_reader.len() as usize);
-    let mut default_end_reader = chunk_reader
-        .get_read(chunk_reader.len() - default_end_len as u64, default_end_len)?;
-    let mut default_len_end_buf = vec![0; default_end_len];
-    default_end_reader.read_exact(&mut default_len_end_buf)?;
+    let mut first_len_end_buf = vec![0; first_end_len];
```
> **Member:** nit: perhaps a better name
```diff
+    first_end_read.read_exact(&mut first_len_end_buf)?;
 
     // check this is indeed a parquet file
-    if default_len_end_buf[default_end_len - 4..] != PARQUET_MAGIC {
+    if first_len_end_buf[first_end_len - 4..] != PARQUET_MAGIC {
         return Err(general_err!("Invalid Parquet file. Corrupt footer"));
     }
 
     // get the metadata length from the footer
-    let metadata_len = LittleEndian::read_i32(
-        &default_len_end_buf[default_end_len - 8..default_end_len - 4],
-    ) as i64;
+    let metadata_len =
+        LittleEndian::read_i32(&first_len_end_buf[first_end_len - 8..first_end_len - 4])
+            as i64;
     if metadata_len < 0 {
         return Err(general_err!(
             "Invalid Parquet file. Metadata length is less than zero ({})",
```
|
|
```diff
@@ -77,24 +78,31 @@ pub fn parse_metadata<R: ChunkReader>(chunk_reader: &R) -> Result<ParquetMetaDat
     let footer_metadata_len = FOOTER_SIZE + metadata_len as usize;
 
     // build up the reader covering the entire metadata
-    let mut default_end_cursor = Cursor::new(default_len_end_buf);
+    let mut first_end_cursor = Cursor::new(first_len_end_buf);
     let metadata_read: Box<dyn Read>;
-    if footer_metadata_len > file_size as usize {
+    if first_end_len < DEFAULT_FOOTER_READ_SIZE
```
|
> **Contributor:** To check my understanding -- this branch is checking if the total file was smaller than `DEFAULT_FOOTER_READ_SIZE`?
>
> **Author:** Exactly, I should add a comment to make this explicit.
```diff
+        && footer_metadata_len > first_end_len as usize
+    {
         return Err(general_err!(
-            "Invalid Parquet file. Metadata start is less than zero ({})",
-            file_size as i64 - footer_metadata_len as i64
+            "Invalid Parquet file. Metadata size exceeds file size."
         ));
     } else if footer_metadata_len < DEFAULT_FOOTER_READ_SIZE {
```
|
> **Member:** should this be …
```diff
         // the whole metadata is in the bytes we already read
-        default_end_cursor.seek(SeekFrom::End(-(footer_metadata_len as i64)))?;
-        metadata_read = Box::new(default_end_cursor);
+        first_end_cursor.seek(SeekFrom::End(-(footer_metadata_len as i64)))?;
+        metadata_read = Box::new(first_end_cursor);
     } else {
         // the end of file read by default is not long enough, read missing bytes
+        let complementary_end_len = FOOTER_SIZE + metadata_len as usize - first_end_len;
```
|
> **Member:** nit: reuse `footer_metadata_len`
>
> **Author:** good idea
```diff
         let complementary_end_read = chunk_reader.get_read(
-            file_size - footer_metadata_len as u64,
-            FOOTER_SIZE + metadata_len as usize - default_end_len,
+            ChunkMode::FromEnd(footer_metadata_len as u64),
+            complementary_end_len,
         )?;
-        metadata_read = Box::new(complementary_end_read.chain(default_end_cursor));
+        if complementary_end_read.len() < complementary_end_len as u64 {
+            return Err(general_err!(
+                "Invalid Parquet file. Metadata size exceeds file size."
+            ));
+        }
+        metadata_read = Box::new(complementary_end_read.chain(first_end_cursor));
     }
 
     // TODO: row group filtering
```
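The `else` branch above stitches the two reads together with `std::io::Read::chain`: the complementary bytes (located earlier in the file but fetched second) are served first, followed by the cached tail. A small self-contained illustration of that trick:

```rust
use std::io::{Cursor, Read};

fn main() -> std::io::Result<()> {
    // bytes fetched by the complementary read (earlier in the file)
    let complementary_end_read = Cursor::new(vec![1u8, 2, 3]);
    // tail of the file that was already read and cached
    let first_end_cursor = Cursor::new(vec![4u8, 5, 6]);
    // chain yields the complementary bytes first, then the cached tail,
    // reconstructing the contiguous metadata region
    let mut metadata_read = complementary_end_read.chain(first_end_cursor);
    let mut buf = Vec::new();
    metadata_read.read_to_end(&mut buf)?;
    assert_eq!(buf, vec![1, 2, 3, 4, 5, 6]);
    Ok(())
}
```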
|
|
```diff
@@ -207,7 +215,7 @@ mod tests {
         assert!(reader_result.is_err());
         assert_eq!(
             reader_result.err().unwrap(),
-            general_err!("Invalid Parquet file. Metadata start is less than zero (-255)")
+            general_err!("Invalid Parquet file. Metadata size exceeds file size.")
         );
     }
```
```diff
@@ -32,6 +32,22 @@ use crate::basic::Type;
 use crate::column::reader::ColumnReaderImpl;
 
+/// Parquet files must be read from end for the footer then from start for columns
+pub enum ChunkMode {
```
|
> **Contributor:** One suggestion I have is to call this … Alternatively, given its similarity, I wonder if it would make sense to use `std::io::SeekFrom`.
>
> **Contributor:** I think …
>
> **Author:** good idea
```diff
+    FromStart(u64),
+    FromEnd(u64),
+}
+
+impl ChunkMode {
+    /// FromStart offset can always be computed if you know the length
```
|
> **Member:** pls update to make it a more proper doc for the method, e.g., what should the caller pass for `len`?
>
> **Author:** good idea
```diff
+    pub fn from_start(&self, len: u64) -> u64 {
+        match self {
+            ChunkMode::FromStart(start_offset) => *start_offset,
+            ChunkMode::FromEnd(end_offset) => len.saturating_sub(*end_offset),
+        }
+    }
+}
+
 /// Length should return the total number of bytes in the input source.
 /// It's mainly used to read the metadata, which is at the end of the source.
 #[allow(clippy::len_without_is_empty)]
```
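To make the `from_start` contract concrete, here is how the two modes resolve to an absolute offset against a source of known length (a sketch with illustrative values, not code from this PR):

```rust
fn chunk_mode_examples() {
    let source_len = 1024u64; // total length of the source, known to the caller
    assert_eq!(ChunkMode::FromStart(100).from_start(source_len), 100);
    assert_eq!(ChunkMode::FromEnd(8).from_start(source_len), 1016);
    // an end offset larger than the source saturates to 0, i.e. "read everything"
    assert_eq!(ChunkMode::FromEnd(4096).from_start(source_len), 0);
}
```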
|
|
```diff
@@ -43,11 +59,12 @@ pub trait Length {
 /// The ChunkReader trait generates readers of chunks of a source.
 /// For a file system reader, each chunk might contain a clone of File bounded on a given range.
 /// For an object store reader, each read can be mapped to a range request.
-pub trait ChunkReader: Length {
-    type T: Read;
-    /// get a serialy readeable slice of the current reader
-    /// This should fail if the slice exceeds the current bounds
-    fn get_read(&self, start: u64, length: usize) -> Result<Self::T>;
+pub trait ChunkReader {
+    type T: Read + Length;
+    /// Get a serially readable slice of the current reader.
+    /// If one end of the slice exceeds the bounds of the source, the slice will be clamped to the source.
+    /// In that case, the length of the resulting Read will be smaller than the requested length.
+    fn get_read(&self, start: ChunkMode, length: usize) -> Result<Self::T>;
 }
 
 // ----------------------------------------------------------------------
```
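To illustrate the new contract, here is roughly how an in-memory source could satisfy it, delegating the clamping to `SliceableCursor::slice` from the next file; this is a hedged sketch (it assumes `SliceableCursor` also implements `Length`), not code from this PR:

```rust
impl ChunkReader for SliceableCursor {
    type T = SliceableCursor;

    fn get_read(&self, start: ChunkMode, length: usize) -> Result<Self::T> {
        // resolve FromStart/FromEnd into an absolute offset in this source
        let absolute_start = start.from_start(self.len());
        // slice() clamps the length when it runs past the end of the data,
        // matching the clamping behavior promised by the trait docs
        Ok(self.slice(absolute_start, length))
    }
}
```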
```diff
@@ -16,7 +16,7 @@
 // under the License.
 
 use std::cmp;
-use std::io::{self, Error, ErrorKind, Read};
+use std::io::{self, Read};
 use std::rc::Rc;
 
 /// This is object to use if your file is already in memory.
```
|
|
```diff
@@ -42,20 +42,21 @@ impl SliceableCursor {
         }
     }
 
-    /// Create a slice cursor using the same data as a current one.
-    pub fn slice(&self, start: u64, length: usize) -> io::Result<Self> {
-        let new_start = self.start + start;
-        if new_start >= self.inner.len() as u64
-            || new_start as usize + length > self.inner.len()
-        {
-            return Err(Error::new(ErrorKind::InvalidInput, "out of bound"));
-        }
+    /// Create a slice cursor backed by the same data as a current one.
+    /// If the slice length is larger than the remaining bytes in the source, the slice is clamped.
```
|
> **Member:** can we replace …
>
> **Author:** I like the word but you are right 😄
```diff
-        Ok(SliceableCursor {
-            inner: Rc::clone(&self.inner),
-            start: new_start,
-            pos: new_start,
-            length,
-        })
+    /// Panics if start is larger than the vector size.
+    pub fn slice(&self, start: u64, length: usize) -> Self {
+        if start > self.length as u64 {
+            panic!("Slice start larger than cursor");
+        }
+        let absolute_start = self.start + start;
+        let clamped_length = std::cmp::min(length, self.length - start as usize);
+        SliceableCursor {
+            inner: Rc::clone(&self.inner),
+            start: absolute_start,
+            pos: absolute_start,
+            length: clamped_length,
+        }
     }
 
     fn remaining_slice(&self) -> &[u8] {
```
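The clamping behavior, spelled out against the same 256-byte source the tests below build via `get_u8_range` (assuming `SliceableCursor::new` wraps a `Vec<u8>`):

```rust
use std::io::Read;

fn clamped_slice_example() -> std::io::Result<()> {
    // a cursor over the 256 bytes 0..=255, like get_u8_range() in the tests
    let cursor = SliceableCursor::new((0..=255).collect::<Vec<u8>>());
    // ask for 10 bytes starting at offset 250: only 6 remain
    let mut clipped = cursor.slice(250, 10);
    let mut buf = Vec::new();
    clipped.read_to_end(&mut buf)?;
    assert_eq!(buf, vec![250, 251, 252, 253, 254, 255]); // clamped, not an error
    Ok(())
}
```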
|
|
```diff
@@ -107,7 +108,19 @@ mod tests {
 
     #[test]
     fn read_all_slice() {
-        let cursor = get_u8_range().slice(10, 10).expect("error while slicing");
+        let cursor = get_u8_range().slice(10, 10);
         check_read_all(cursor, 10, 19);
     }
 
+    #[test]
+    fn read_all_clipped_slice() {
+        let cursor = get_u8_range().slice(250, 10);
+        check_read_all(cursor, 250, 255);
+    }
+
+    #[test]
+    fn chaining_slices() {
+        let cursor = get_u8_range().slice(200, 50).slice(10, 10);
+        check_read_all(cursor, 210, 219);
+    }
 }
```
> **Reviewer:** It's weird that we are handling the "chunked read" logic in the footer reader; should this be inside the `ChunkReader` implementation? E.g., the reader itself should do lazy loading on the input stream based on how the application decides to seek and read the data.
>
> **Author:** That's a good point. I copied this logic from the C++ implementation, but it's not great in terms of separation of concerns. If I understand your point:
>
> - the footer reader would just call `get_read(ChunkMode::FromEnd(FOOTER_SIZE))`
> - it would be up to the `ChunkReader` implementation to buffer extra bytes if it finds that `FOOTER_SIZE` is too small and it is cheap for it to get more bytes (and for instance the S3 `ChunkReader` will get 16kB instead)
>
> **Reviewer:** Yes, exactly. I feel the footer reader should not be aware of how the input stream is processed, and also the logic can vary depending on the remote storage, so `DEFAULT_FOOTER_READ_SIZE` may not fit for all.