-
Notifications
You must be signed in to change notification settings - Fork 493
feat: Add existing parquet files #960
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 11 commits
6853bab
c574c5f
3f258ad
39ce23d
dd4abb7
65abfc3
4cb5b1e
909d098
e1dd355
8756a71
afbc642
a9c6b94
e01ead8
96aabfe
b38496e
267124d
33cac26
86ea8eb
c0cfb56
175d4ce
0c7caaa
6f0bc0b
d6cf198
2665ebc
d90871a
0b5d78c
43a6c85
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,21 +18,27 @@ | |
| //! This module contains the transaction API. | ||
|
|
||
| use std::cmp::Ordering; | ||
| use std::collections::HashMap; | ||
| use std::collections::{HashMap, HashSet}; | ||
| use std::future::Future; | ||
| use std::mem::discriminant; | ||
| use std::ops::RangeFrom; | ||
| use std::sync::Arc; | ||
|
|
||
| use parquet::arrow::async_reader::AsyncFileReader; | ||
| use parquet::file::metadata::ParquetMetaData; | ||
| use uuid::Uuid; | ||
|
|
||
| use crate::arrow::ArrowFileReader; | ||
| use crate::error::Result; | ||
| use crate::io::OutputFile; | ||
| use crate::io::{FileIO, OutputFile}; | ||
| use crate::spec::{ | ||
| DataFile, DataFileFormat, FormatVersion, ManifestEntry, ManifestFile, ManifestListWriter, | ||
| ManifestWriterBuilder, NullOrder, Operation, Snapshot, SnapshotReference, SnapshotRetention, | ||
| SortDirection, SortField, SortOrder, Struct, StructType, Summary, Transform, MAIN_BRANCH, | ||
| visit_schema, DataContentType, DataFile, DataFileBuilder, DataFileFormat, FormatVersion, | ||
| ManifestEntry, ManifestFile, ManifestListWriter, ManifestWriterBuilder, NullOrder, Operation, | ||
| SchemaRef, Snapshot, SnapshotReference, SnapshotRetention, SortDirection, SortField, SortOrder, | ||
| Struct, StructType, Summary, TableMetadata, Transform, MAIN_BRANCH, | ||
| }; | ||
| use crate::table::Table; | ||
| use crate::writer::file_writer::parquet_writer::{IndexByParquetPathName, MinMaxColAggregator}; | ||
| use crate::TableUpdate::UpgradeFormatVersion; | ||
| use crate::{Catalog, Error, ErrorKind, TableCommit, TableRequirement, TableUpdate}; | ||
|
|
||
|
|
@@ -169,6 +175,172 @@ impl<'a> Transaction<'a> { | |
|
|
||
| catalog.update_table(table_commit).await | ||
| } | ||
|
|
||
| /// Registers Parquet files that already exist in storage with this table. | ||
| /// | ||
| /// Every path in `file_paths` is turned into a `DataFile`, and the resulting files are | ||
| /// handed to a fast-append action that is applied before returning. | ||
| /// | ||
| /// When `check_duplicate_files` is `true`, the call fails with `ErrorKind::DataInvalid` | ||
| /// if `file_paths` itself contains the same path more than once. Only the supplied | ||
| /// list is inspected by this check. | ||
| pub async fn add_parquet_files( | ||
| self, | ||
| file_paths: Vec<String>, | ||
| check_duplicate_files: bool, | ||
|
jonathanc-n marked this conversation as resolved.
Outdated
|
||
| ) -> Result<Transaction<'a>> { | ||
| if check_duplicate_files { | ||
|
jonathanc-n marked this conversation as resolved.
Outdated
|
||
| // A set collapses repeated paths, so it is smaller than the list iff a path repeats. | ||
| let distinct_count = file_paths.iter().collect::<HashSet<_>>().len(); | ||
| if distinct_count < file_paths.len() { | ||
| return Err(Error::new( | ||
| ErrorKind::DataInvalid, | ||
| "Duplicate file paths provided", | ||
| )); | ||
| } | ||
| } | ||
|
|
||
| let metadata = self.table.metadata(); | ||
| let io = self.table.file_io(); | ||
| let data_files = self | ||
| .parquet_files_to_data_files(io, file_paths, metadata) | ||
| .await?; | ||
|
|
||
| // The append action is created with a fresh v4 UUID and an empty second argument. | ||
| let mut append = self.fast_append(Some(Uuid::new_v4()), Vec::new())?; | ||
| append.add_data_files(data_files)?; | ||
|
|
||
| append.apply().await | ||
| } | ||
|
|
||
| /// Builds one `DataFile` per path in `file_paths` from each file's Parquet metadata. | ||
| /// | ||
| /// The partition value is computed once from `table_metadata.default_partition_type` | ||
| /// and reused for every file. The table's current schema is used to map Parquet | ||
| /// columns to fields. | ||
| /// | ||
| /// # Errors | ||
| /// Returns `ErrorKind::DataInvalid` when a path does not exist, when the file size | ||
| /// does not fit in `usize`, when the Parquet metadata cannot be read, or when the | ||
| /// `DataFile` cannot be built. Other errors from `file_io` and from the helper | ||
| /// methods are propagated unchanged. | ||
| async fn parquet_files_to_data_files( | ||
|
jonathanc-n marked this conversation as resolved.
Outdated
|
||
| &self, | ||
| file_io: &FileIO, | ||
| file_paths: Vec<String>, | ||
| table_metadata: &TableMetadata, | ||
| ) -> Result<Vec<DataFile>> { | ||
| // Exactly one data file is produced per input path, so reserve up front. | ||
| let mut data_files: Vec<DataFile> = Vec::with_capacity(file_paths.len()); | ||
| let partition_value = | ||
| self.create_default_partition_value(&table_metadata.default_partition_type)?; | ||
|
|
||
| for file_path in file_paths { | ||
| let input_file = file_io.new_input(&file_path)?; | ||
| if !input_file.exists().await? { | ||
| // Name the offending path: callers pass many files in one call. | ||
| return Err(Error::new( | ||
| ErrorKind::DataInvalid, | ||
| format!("File does not exist: {}", file_path), | ||
| )); | ||
| } | ||
|
jonathanc-n marked this conversation as resolved.
Outdated
|
||
| let file_metadata = input_file.metadata().await?; | ||
| // Checked conversion instead of `as`, which would silently truncate. | ||
| let file_size_in_bytes = usize::try_from(file_metadata.size).map_err(|err| { | ||
| Error::new( | ||
| ErrorKind::DataInvalid, | ||
| format!("File size of {} does not fit in usize: {}", file_path, err), | ||
| ) | ||
| })?; | ||
| let reader = input_file.reader().await?; | ||
|
|
||
| let mut parquet_reader = ArrowFileReader::new(file_metadata, reader); | ||
| let parquet_metadata = parquet_reader.get_metadata().await.map_err(|err| { | ||
| Error::new( | ||
| ErrorKind::DataInvalid, | ||
| format!("Error reading Parquet metadata of {}: {}", file_path, err), | ||
| ) | ||
| })?; | ||
| let builder = self.parquet_to_data_file_builder( | ||
| table_metadata.current_schema().clone(), | ||
| parquet_metadata, | ||
| &partition_value, | ||
| file_size_in_bytes, | ||
| file_path, | ||
| )?; | ||
| // Report a builder failure as an error instead of panicking in library code. | ||
| let data_file = builder.build().map_err(|err| { | ||
| Error::new( | ||
| ErrorKind::DataInvalid, | ||
| format!("Failed to build data file: {}", err), | ||
| ) | ||
| })?; | ||
| data_files.push(data_file); | ||
| } | ||
| Ok(data_files) | ||
| } | ||
|
|
||
| /// Assembles a `DataFileBuilder` for a Parquet file from its `ParquetMetaData`. | ||
| /// | ||
| /// Compressed sizes, value counts, null counts and min/max bounds are accumulated per | ||
| /// field id over every column chunk of every row group. Field ids are looked up by the | ||
| /// column's Parquet path in an `IndexByParquetPathName` built from `schema`; column | ||
| /// chunks whose path is not in that index are skipped. | ||
| /// | ||
| /// `written_size` is stored as the file size in bytes, `partition` is cloned into the | ||
| /// builder, and the split offsets are the row groups' file offsets where present. | ||
| pub fn parquet_to_data_file_builder( | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have some suggestion for this method:
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think i mentioned the problem earlier with reusing the function here. Writer returns raw metadata which is what the original |
||
| &self, | ||
| schema: SchemaRef, | ||
| metadata: Arc<ParquetMetaData>, | ||
| partition: &Struct, | ||
| written_size: usize, | ||
| file_path: String, | ||
| ) -> Result<DataFileBuilder> { | ||
| // Index the schema first; `schema` is moved into the aggregator afterwards. | ||
| let mut path_index = IndexByParquetPathName::new(); | ||
| visit_schema(&schema, &mut path_index)?; | ||
|
|
||
| let mut column_sizes: HashMap<i32, u64> = HashMap::new(); | ||
| let mut value_counts: HashMap<i32, u64> = HashMap::new(); | ||
| let mut null_value_counts: HashMap<i32, u64> = HashMap::new(); | ||
| let mut bounds = MinMaxColAggregator::new(schema); | ||
|
|
||
| let column_chunks = metadata | ||
| .row_groups() | ||
| .iter() | ||
| .flat_map(|row_group| row_group.columns()); | ||
| for chunk in column_chunks { | ||
| let parquet_path = chunk.column_descr().path().string(); | ||
|
|
||
| let field_id = match path_index.get(&parquet_path) { | ||
| Some(&id) => id, | ||
| None => continue, | ||
| }; | ||
|
|
||
| *column_sizes.entry(field_id).or_default() += chunk.compressed_size() as u64; | ||
| *value_counts.entry(field_id).or_default() += chunk.num_values() as u64; | ||
|
|
||
| // Chunks without statistics contribute neither null counts nor bounds. | ||
| let Some(stats) = chunk.statistics() else { | ||
| continue; | ||
| }; | ||
| if let Some(nulls) = stats.null_count_opt() { | ||
| *null_value_counts.entry(field_id).or_default() += nulls; | ||
| } | ||
|
|
||
| bounds.update(field_id, stats.clone())?; | ||
| } | ||
| let (lower_bounds, upper_bounds) = bounds.produce(); | ||
|
|
||
| let mut data_file_builder = DataFileBuilder::default(); | ||
| data_file_builder | ||
| .content(DataContentType::Data) | ||
| .file_path(file_path) | ||
| .file_format(DataFileFormat::Parquet) | ||
| .partition(partition.clone()) | ||
| .record_count(metadata.file_metadata().num_rows() as u64) | ||
| .file_size_in_bytes(written_size as u64) | ||
| .column_sizes(column_sizes) | ||
| .value_counts(value_counts) | ||
| .null_value_counts(null_value_counts) | ||
| .lower_bounds(lower_bounds) | ||
| .upper_bounds(upper_bounds) | ||
| .split_offsets( | ||
| metadata | ||
| .row_groups() | ||
| .iter() | ||
| .filter_map(|row_group| row_group.file_offset()) | ||
| .collect(), | ||
| ); | ||
|
|
||
| Ok(data_file_builder) | ||
| } | ||
|
|
||
| /// Produces a partition `Struct` holding one literal per field of `partition_type`. | ||
| /// | ||
| /// Each literal is whatever `type_to_literal()` yields for the field's primitive type. | ||
| /// Fails with `ErrorKind::Unexpected` on the first field that is not a primitive type. | ||
| fn create_default_partition_value(&self, partition_type: &StructType) -> Result<Struct> { | ||
| let fields = partition_type.fields(); | ||
| let mut literals = Vec::with_capacity(fields.len()); | ||
| for field in fields.iter() { | ||
| let Some(primitive_type) = field.field_type.as_primitive_type() else { | ||
| return Err(Error::new( | ||
| ErrorKind::Unexpected, | ||
| "Partition field should only be a primitive type.", | ||
| )); | ||
| }; | ||
| // TODO(review): the review thread asks for a check on whether the partition exists — confirm scope. | ||
| literals.push(Some(primitive_type.type_to_literal())); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looking at the
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah this makes sense, we can go with first option and add a todo on the check if partition exist. |
||
| } | ||
|
|
||
| Ok(Struct::from_iter(literals)) | ||
| } | ||
| } | ||
|
|
||
| /// FastAppendAction is a transaction action that fast-appends data files to the table. | ||
|
|
@@ -607,6 +779,7 @@ mod tests { | |
| use std::io::BufReader; | ||
|
|
||
| use crate::io::FileIOBuilder; | ||
| use crate::scan::tests::TableTestFixture; | ||
| use crate::spec::{ | ||
| DataContentType, DataFileBuilder, DataFileFormat, FormatVersion, Literal, Struct, | ||
| TableMetadata, | ||
|
|
@@ -847,6 +1020,7 @@ mod tests { | |
| .sequence_number() | ||
| .expect("Inherit sequence number by load manifest") | ||
| ); | ||
|
|
||
| assert_eq!( | ||
| new_snapshot.snapshot_id(), | ||
| manifest.entries()[0].snapshot_id().unwrap() | ||
|
|
@@ -869,4 +1043,80 @@ mod tests { | |
| "Should not allow to do same kinds update in same transaction" | ||
| ); | ||
| } | ||
|
|
||
| /// Adds three Parquet files from the fixture's table location through | ||
| /// `add_parquet_files` and checks the resulting updates, manifest list and manifest. | ||
| #[tokio::test] | ||
| async fn test_add_existing_parquet_files() { | ||
| let mut fixture = TableTestFixture::new(); | ||
| fixture.setup_manifest_files().await; | ||
| let tx = crate::transaction::Transaction::new(&fixture.table); | ||
|
|
||
| // Paths `1.parquet` .. `3.parquet` under the table location. | ||
| let file_paths: Vec<String> = (1..=3) | ||
| .map(|n| format!("{}/{}.parquet", &fixture.table_location, n)) | ||
| .collect(); | ||
|
|
||
| // attempt to add the existing Parquet files with fast append | ||
| let new_tx = tx | ||
| .add_parquet_files(file_paths.clone(), true) | ||
| .await | ||
| .expect("Adding existing Parquet files should succeed"); | ||
|
|
||
| let mut saw_add_snapshot = false; | ||
| let mut saw_set_snapshot_ref = false; | ||
| for update in &new_tx.updates { | ||
| match update { | ||
| TableUpdate::AddSnapshot { .. } => saw_add_snapshot = true, | ||
| TableUpdate::SetSnapshotRef { | ||
| ref_name, | ||
| reference, | ||
| } => { | ||
| saw_set_snapshot_ref = true; | ||
| assert_eq!(ref_name, crate::transaction::MAIN_BRANCH); | ||
| assert!(reference.snapshot_id > 0); | ||
| } | ||
| _ => {} | ||
| } | ||
| } | ||
| assert!(saw_add_snapshot); | ||
| assert!(saw_set_snapshot_ref); | ||
|
|
||
| let TableUpdate::AddSnapshot { | ||
| snapshot: new_snapshot, | ||
| } = &new_tx.updates[0] | ||
| else { | ||
| panic!("Expected the first update to be an AddSnapshot update"); | ||
| }; | ||
|
|
||
| let manifest_list = new_snapshot | ||
| .load_manifest_list(fixture.table.file_io(), fixture.table.metadata()) | ||
| .await | ||
| .expect("Failed to load manifest list"); | ||
|
|
||
| assert_eq!(manifest_list.entries().len(), 2); | ||
|
|
||
| // Load the first manifest referenced by the manifest list | ||
| let manifest = manifest_list.entries()[0] | ||
| .load_manifest(fixture.table.file_io()) | ||
| .await | ||
| .expect("Failed to load manifest"); | ||
|
|
||
| // Three files were passed to add_parquet_files, so the manifest must hold three entries | ||
| assert_eq!(manifest.entries().len(), 3); | ||
|
|
||
| // Every requested path must show up as the file path of some manifest entry | ||
| for path in &file_paths { | ||
| let present = manifest | ||
| .entries() | ||
| .iter() | ||
| .any(|entry| &entry.data_file().file_path == path); | ||
| assert!( | ||
| present, | ||
| "Manifest does not contain expected file path: {}", | ||
| path | ||
| ); | ||
| } | ||
| } | ||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.