chore: Upgrade to datafusion 47.0.0-rc1 and arrow-rs 55.0.0 #1563
Changes from all commits
```diff
@@ -85,6 +85,7 @@ impl DisplayAs for ExpandExec {
                 Ok(())
             }
+            DisplayFormatType::TreeRender => unimplemented!(),
         }
     }
 }
```
```diff
@@ -21,7 +21,9 @@ use crate::parquet::schema_adapter::SparkSchemaAdapterFactory;
 use arrow::datatypes::{Field, SchemaRef};
 use datafusion::config::TableParquetOptions;
 use datafusion::datasource::listing::PartitionedFile;
-use datafusion::datasource::physical_plan::{FileScanConfig, FileSource, ParquetSource};
+use datafusion::datasource::physical_plan::{
+    FileGroup, FileScanConfigBuilder, FileSource, ParquetSource,
+};
 use datafusion::datasource::source::DataSourceExec;
 use datafusion::execution::object_store::ObjectStoreUrl;
 use datafusion::physical_expr::expressions::BinaryExpr;
@@ -80,23 +82,33 @@ pub(crate) fn init_datasource_exec(
             parquet_source = parquet_source.with_predicate(Arc::clone(data_schema), filter);
         }
     }

+    let file_groups = file_groups
+        .iter()
+        .map(|files| FileGroup::new(files.clone()))
+        .collect();
+
     let file_scan_config = match (data_schema, projection_vector, partition_fields) {
-        (Some(data_schema), Some(projection_vector), Some(partition_fields)) => get_file_config(
-            data_schema,
-            partition_schema,
-            file_groups,
-            object_store_url,
-            Arc::new(parquet_source),
-        )
-        .with_projection(Some(projection_vector))
-        .with_table_partition_cols(partition_fields),
-        _ => get_file_config(
+        (Some(data_schema), Some(projection_vector), Some(partition_fields)) => {
+            get_file_config_builder(
+                data_schema,
+                partition_schema,
+                file_groups,
+                object_store_url,
+                Arc::new(parquet_source),
+            )
+            .with_projection(Some(projection_vector))
+            .with_table_partition_cols(partition_fields)
+            .build()
+        }
+        _ => get_file_config_builder(
             required_schema,
             partition_schema,
             file_groups,
             object_store_url,
             Arc::new(parquet_source),
-        ),
+        )
+        .build(),
     };

     Ok(Arc::new(DataSourceExec::new(Arc::new(file_scan_config))))
```

**Member (Author),** on the call to `get_file_config_builder`: this now uses a builder
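For anyone doing a similar upgrade, here is a minimal self-contained sketch of the new construction pattern. It is illustrative only: `build_scan`, the local object store, and the default `ParquetSource` options are assumptions, not code from this PR.

```rust
use std::sync::Arc;

use arrow::datatypes::SchemaRef;
use datafusion::datasource::listing::PartitionedFile;
use datafusion::datasource::physical_plan::{FileGroup, FileScanConfigBuilder, ParquetSource};
use datafusion::datasource::source::DataSourceExec;
use datafusion::execution::object_store::ObjectStoreUrl;

/// Illustrative helper (not from this PR): build a Parquet scan with the
/// DataFusion 47 builder API.
fn build_scan(schema: SchemaRef, files: Vec<PartitionedFile>) -> Arc<DataSourceExec> {
    // FileScanConfig is now produced by a builder rather than constructed and
    // mutated in place, and each group of files is wrapped in FileGroup
    // instead of a bare Vec<PartitionedFile>.
    let config = FileScanConfigBuilder::new(
        ObjectStoreUrl::local_filesystem(),
        schema,
        Arc::new(ParquetSource::default()),
    )
    .with_file_groups(vec![FileGroup::new(files)])
    .build();
    Arc::new(DataSourceExec::new(Arc::new(config)))
}
```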
```diff
@@ -113,13 +125,13 @@ fn get_options(session_timezone: &str) -> (TableParquetOptions, SparkParquetOpti
     (table_parquet_options, spark_parquet_options)
 }

-fn get_file_config(
+fn get_file_config_builder(
     schema: SchemaRef,
     partition_schema: Option<SchemaRef>,
-    file_groups: Vec<Vec<PartitionedFile>>,
+    file_groups: Vec<FileGroup>,
     object_store_url: ObjectStoreUrl,
     file_source: Arc<dyn FileSource>,
-) -> FileScanConfig {
+) -> FileScanConfigBuilder {
     match partition_schema {
         Some(partition_schema) => {
             let partition_fields: Vec<Field> = partition_schema
@@ -129,11 +141,11 @@ fn get_file_config(
                     Field::new(field.name(), field.data_type().clone(), field.is_nullable())
                 })
                 .collect_vec();
-            FileScanConfig::new(object_store_url, Arc::clone(&schema), file_source)
+            FileScanConfigBuilder::new(object_store_url, Arc::clone(&schema), file_source)
                 .with_file_groups(file_groups)
                 .with_table_partition_cols(partition_fields)
         }
-        _ => FileScanConfig::new(object_store_url, Arc::clone(&schema), file_source)
+        _ => FileScanConfigBuilder::new(object_store_url, Arc::clone(&schema), file_source)
             .with_file_groups(file_groups),
     }
 }
```
```diff
@@ -18,7 +18,7 @@
 //! Custom schema adapter that uses Spark-compatible conversions

 use crate::parquet::parquet_support::{spark_parquet_convert, SparkParquetOptions};
-use arrow::array::{new_null_array, Array, RecordBatch, RecordBatchOptions};
+use arrow::array::{new_null_array, RecordBatch, RecordBatchOptions};
 use arrow::datatypes::{Schema, SchemaRef};
 use datafusion::datasource::schema_adapter::{SchemaAdapter, SchemaAdapterFactory, SchemaMapper};
 use datafusion::physical_plan::ColumnarValue;
@@ -50,11 +50,10 @@ impl SchemaAdapterFactory for SparkSchemaAdapterFactory {
     fn create(
         &self,
         required_schema: SchemaRef,
-        table_schema: SchemaRef,
+        _table_schema: SchemaRef,
     ) -> Box<dyn SchemaAdapter> {
         Box::new(SparkSchemaAdapter {
             required_schema,
-            table_schema,
             parquet_options: self.parquet_options.clone(),
         })
     }
@@ -67,12 +66,6 @@ pub struct SparkSchemaAdapter {
     /// The schema for the table, projected to include only the fields being output (projected) by the
     /// associated ParquetExec
     required_schema: SchemaRef,
-    /// The entire table schema for the table we're using this to adapt.
-    ///
-    /// This is used to evaluate any filters pushed down into the scan
-    /// which may refer to columns that are not referred to anywhere
-    /// else in the plan.
-    table_schema: SchemaRef,
     /// Spark cast options
     parquet_options: SparkParquetOptions,
 }
@@ -139,7 +132,6 @@ impl SchemaAdapter for SparkSchemaAdapter {
         Arc::new(SchemaMapping {
             required_schema: Arc::<Schema>::clone(&self.required_schema),
             field_mappings,
-            table_schema: Arc::<Schema>::clone(&self.table_schema),
             parquet_options: self.parquet_options.clone(),
         }),
         projection,
@@ -186,11 +178,6 @@ pub struct SchemaMapping {
     /// They are Options instead of just plain `usize`s because the table could
     /// have fields that don't exist in the file.
     field_mappings: Vec<Option<usize>>,
-    /// The entire table schema, as opposed to the projected_table_schema (which
-    /// only contains the columns that we are projecting out of this query).
-    /// This contains all fields in the table, regardless of if they will be
-    /// projected out or not.
-    table_schema: SchemaRef,
     /// Spark cast options
     parquet_options: SparkParquetOptions,
 }
```
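As a migration note, the `SchemaAdapterFactory` trait still passes both schemas to `create`; implementations that no longer need the table schema can simply ignore the argument, as the factory above now does. A minimal sketch of that trait shape (`NoopFactory` is hypothetical, not code from this PR):

```rust
use arrow::datatypes::SchemaRef;
use datafusion::datasource::schema_adapter::{SchemaAdapter, SchemaAdapterFactory};

/// Hypothetical factory illustrating the `create` signature this file
/// now targets.
#[derive(Debug)]
struct NoopFactory;

impl SchemaAdapterFactory for NoopFactory {
    fn create(
        &self,
        required_schema: SchemaRef,
        _table_schema: SchemaRef, // still required by the trait, unused here
    ) -> Box<dyn SchemaAdapter> {
        // A real implementation returns an adapter built from `required_schema`.
        todo!("build an adapter from {required_schema:?}")
    }
}
```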
```diff
@@ -239,59 +226,6 @@ impl SchemaMapper for SchemaMapping {
         let record_batch = RecordBatch::try_new_with_options(schema, cols, &options)?;
         Ok(record_batch)
     }
-
-    /// Adapts a [`RecordBatch`]'s schema into one that has all the correct output types and only
-    /// contains the fields that exist in both the file schema and table schema.
-    ///
-    /// Unlike `map_batch` this method also preserves the columns that
-    /// may not appear in the final output (`projected_table_schema`) but may
-    /// appear in push down predicates
-    fn map_partial_batch(&self, batch: RecordBatch) -> datafusion::common::Result<RecordBatch> {
-        let batch_cols = batch.columns().to_vec();
-        let schema = batch.schema();
-
-        // for each field in the batch's schema (which is based on a file, not a table)...
-        let (cols, fields) = schema
-            .fields()
-            .iter()
-            .zip(batch_cols.iter())
-            .flat_map(|(field, batch_col)| {
-                self.table_schema
-                    .fields()
-                    .iter()
-                    .enumerate()
-                    .find(|(_, b)| {
-                        if self.parquet_options.case_sensitive {
-                            b.name() == field.name()
-                        } else {
-                            b.name().to_lowercase() == field.name().to_lowercase()
-                        }
-                    })
-                    // but if we do have it,
-                    .map(|(_, table_field)| {
-                        // try to cast it into the correct output type. we don't want to ignore this
-                        // error, though, so it's propagated.
-                        spark_parquet_convert(
-                            ColumnarValue::Array(Arc::clone(batch_col)),
-                            table_field.data_type(),
-                            &self.parquet_options,
-                        )?
-                        .into_array(batch_col.len())
-                        // and if that works, return the field and column.
-                        .map(|new_col| (new_col, table_field.as_ref().clone()))
-                    })
-            })
-            .collect::<Result<Vec<_>, _>>()?
-            .into_iter()
-            .unzip::<_, _, Vec<_>, Vec<_>>();
-
-        // Necessary to handle empty batches
-        let options = RecordBatchOptions::new().with_row_count(Some(batch.num_rows()));
-
-        let schema = Arc::new(Schema::new_with_metadata(fields, schema.metadata().clone()));
-        let record_batch = RecordBatch::try_new_with_options(schema, cols, &options)?;
-        Ok(record_batch)
-    }
 }

 #[cfg(test)]
```

**Member (Author),** on the removal of `map_partial_batch`: @mbutrovich @parthchandra fyi, I am not sure of the impact, but I figured I would first see if any tests fail in CI
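With `map_partial_batch` gone, `map_batch` is the only remaining mapping path, and it adapts just the columns named in `required_schema`. A simplified sketch of that per-column logic, reconstructed from the adapter above (`adapt_column` is an illustrative helper, not code from this PR):

```rust
use std::sync::Arc;

use arrow::array::{new_null_array, ArrayRef, RecordBatch};
use arrow::datatypes::Field;
use datafusion::physical_plan::ColumnarValue;

use crate::parquet::parquet_support::{spark_parquet_convert, SparkParquetOptions};

/// Illustrative helper: produce one output column for `map_batch`.
/// `file_idx` plays the role of one entry in the adapter's `field_mappings`
/// (None when the required field is absent from the file).
fn adapt_column(
    batch: &RecordBatch,
    file_idx: Option<usize>,
    target: &Field,
    opts: &SparkParquetOptions,
) -> datafusion::common::Result<ArrayRef> {
    match file_idx {
        // field missing from the file: fill with nulls of the required type
        None => Ok(new_null_array(target.data_type(), batch.num_rows())),
        // field present: apply the Spark-compatible cast, propagating errors
        Some(i) => spark_parquet_convert(
            ColumnarValue::Array(Arc::clone(batch.column(i))),
            target.data_type(),
            opts,
        )?
        .into_array(batch.num_rows()),
    }
}
```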
```diff
@@ -306,7 +240,7 @@ mod test {
     use datafusion::common::config::TableParquetOptions;
     use datafusion::common::DataFusionError;
     use datafusion::datasource::listing::PartitionedFile;
-    use datafusion::datasource::physical_plan::{FileScanConfig, ParquetSource};
+    use datafusion::datasource::physical_plan::{FileGroup, FileScanConfigBuilder, ParquetSource};
     use datafusion::datasource::source::DataSourceExec;
     use datafusion::execution::object_store::ObjectStoreUrl;
     use datafusion::execution::TaskContext;
@@ -378,11 +312,11 @@ mod test {
             )),
         );

+        let files = FileGroup::new(vec![PartitionedFile::from_path(filename.to_string())?]);
         let file_scan_config =
-            FileScanConfig::new(object_store_url, required_schema, parquet_source)
-                .with_file_groups(vec![vec![PartitionedFile::from_path(
-                    filename.to_string(),
-                )?]]);
+            FileScanConfigBuilder::new(object_store_url, required_schema, parquet_source)
+                .with_file_groups(vec![files])
+                .build();

         let parquet_exec = DataSourceExec::new(Arc::new(file_scan_config));
```
Review comment on the `DisplayFormatType::TreeRender` arm: DataFusion now supports a new `EXPLAIN` output, but this is not exposed in Comet, so we do not need to implement these new methods.
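For anyone hitting the same compile error during the upgrade: every `DisplayAs` implementation must now cover the new `TreeRender` variant. A minimal sketch of the pattern used throughout this PR (`SomeExec` is a placeholder; the real operators such as `ExpandExec` write their actual plan details in the `Default`/`Verbose` arms):

```rust
use std::fmt;

use datafusion::physical_plan::{DisplayAs, DisplayFormatType};

/// Placeholder operator standing in for ExpandExec and the other Comet operators.
struct SomeExec;

impl DisplayAs for SomeExec {
    fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result {
        match t {
            DisplayFormatType::Default | DisplayFormatType::Verbose => {
                write!(f, "SomeExec")
            }
            // New in DataFusion 47. Comet never requests tree-rendered
            // EXPLAIN output, so the arm is deliberately left unimplemented.
            DisplayFormatType::TreeRender => unimplemented!(),
        }
    }
}
```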