Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rename ColumnOptions to ParquetColumnOptions #11512

Merged
merged 4 commits into from
Jul 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions datafusion/common/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1414,7 +1414,7 @@ pub struct TableParquetOptions {
/// Global Parquet options that propagates to all columns.
pub global: ParquetOptions,
/// Column specific options. Default usage is parquet.XX::column.
pub column_specific_options: HashMap<String, ColumnOptions>,
pub column_specific_options: HashMap<String, ParquetColumnOptions>,
/// Additional file-level metadata to include. Inserted into the key_value_metadata
/// for the written [`FileMetaData`](https://docs.rs/parquet/latest/parquet/file/metadata/struct.FileMetaData.html).
///
Expand Down Expand Up @@ -1555,7 +1555,7 @@ config_namespace_with_hashmap! {
/// Options controlling parquet format for individual columns.
///
/// See [`ParquetOptions`] for more details
pub struct ColumnOptions {
pub struct ParquetColumnOptions {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could additionally (later) move the Parquet functionality to Parquet-related modules/files.
Perhaps just for organizational purposes.

/// Sets if bloom filter is enabled for the column path.
pub bloom_filter_enabled: Option<bool>, default = None

Expand Down
10 changes: 5 additions & 5 deletions datafusion/common/src/file_options/parquet_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -379,7 +379,7 @@ mod tests {
};
use std::collections::HashMap;

use crate::config::{ColumnOptions, ParquetOptions};
use crate::config::{ParquetColumnOptions, ParquetOptions};

use super::*;

Expand All @@ -388,8 +388,8 @@ mod tests {
/// Take the column defaults provided in [`ParquetOptions`], and generate a non-default col config.
fn column_options_with_non_defaults(
src_col_defaults: &ParquetOptions,
) -> ColumnOptions {
ColumnOptions {
) -> ParquetColumnOptions {
ParquetColumnOptions {
compression: Some("zstd(22)".into()),
dictionary_enabled: src_col_defaults.dictionary_enabled.map(|v| !v),
statistics_enabled: Some("none".into()),
Expand Down Expand Up @@ -446,10 +446,10 @@ mod tests {
fn extract_column_options(
props: &WriterProperties,
col: ColumnPath,
) -> ColumnOptions {
) -> ParquetColumnOptions {
let bloom_filter_default_props = props.bloom_filter_properties(&col);

ColumnOptions {
ParquetColumnOptions {
bloom_filter_enabled: Some(bloom_filter_default_props.is_some()),
encoding: props.encoding(&col).map(|s| s.to_string()),
dictionary_enabled: Some(props.dictionary_enabled(&col)),
Expand Down
8 changes: 4 additions & 4 deletions datafusion/proto-common/proto/datafusion_common.proto
Original file line number Diff line number Diff line change
Expand Up @@ -433,15 +433,15 @@ message JsonOptions {

message TableParquetOptions {
ParquetOptions global = 1;
repeated ColumnSpecificOptions column_specific_options = 2;
repeated ParquetColumnSpecificOptions column_specific_options = 2;
}

message ColumnSpecificOptions {
message ParquetColumnSpecificOptions {
string column_name = 1;
ColumnOptions options = 2;
ParquetColumnOptions options = 2;
}

message ColumnOptions {
message ParquetColumnOptions {
oneof bloom_filter_enabled_opt {
bool bloom_filter_enabled = 1;
}
Expand Down
10 changes: 6 additions & 4 deletions datafusion/proto-common/src/from_proto/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ use arrow::ipc::{reader::read_record_batch, root_as_message};
use datafusion_common::{
arrow_datafusion_err,
config::{
ColumnOptions, CsvOptions, JsonOptions, ParquetOptions, TableParquetOptions,
CsvOptions, JsonOptions, ParquetColumnOptions, ParquetOptions,
TableParquetOptions,
},
file_options::{csv_writer::CsvWriterOptions, json_writer::JsonWriterOptions},
parsers::CompressionTypeVariant,
Expand Down Expand Up @@ -960,12 +961,12 @@ impl TryFrom<&protobuf::ParquetOptions> for ParquetOptions {
}
}

impl TryFrom<&protobuf::ColumnOptions> for ColumnOptions {
impl TryFrom<&protobuf::ColumnOptions> for ParquetColumnOptions {
type Error = DataFusionError;
fn try_from(
value: &protobuf::ColumnOptions,
) -> datafusion_common::Result<Self, Self::Error> {
Ok(ColumnOptions {
Ok(ParquetColumnOptions {
compression: value.compression_opt.clone().map(|opt| match opt {
protobuf::column_options::CompressionOpt::Compression(v) => Some(v),
}).unwrap_or(None),
Expand Down Expand Up @@ -1013,7 +1014,8 @@ impl TryFrom<&protobuf::TableParquetOptions> for TableParquetOptions {
fn try_from(
value: &protobuf::TableParquetOptions,
) -> datafusion_common::Result<Self, Self::Error> {
let mut column_specific_options: HashMap<String, ColumnOptions> = HashMap::new();
let mut column_specific_options: HashMap<String, ParquetColumnOptions> =
HashMap::new();
for protobuf::ColumnSpecificOptions {
column_name,
options: maybe_options,
Expand Down
9 changes: 6 additions & 3 deletions datafusion/proto-common/src/to_proto/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ use arrow::datatypes::{
use arrow::ipc::writer::{DictionaryTracker, IpcDataGenerator};
use datafusion_common::{
config::{
ColumnOptions, CsvOptions, JsonOptions, ParquetOptions, TableParquetOptions,
CsvOptions, JsonOptions, ParquetColumnOptions, ParquetOptions,
TableParquetOptions,
},
file_options::{csv_writer::CsvWriterOptions, json_writer::JsonWriterOptions},
parsers::CompressionTypeVariant,
Expand Down Expand Up @@ -830,10 +831,12 @@ impl TryFrom<&ParquetOptions> for protobuf::ParquetOptions {
}
}

impl TryFrom<&ColumnOptions> for protobuf::ColumnOptions {
impl TryFrom<&ParquetColumnOptions> for protobuf::ColumnOptions {
type Error = DataFusionError;

fn try_from(value: &ColumnOptions) -> datafusion_common::Result<Self, Self::Error> {
fn try_from(
value: &ParquetColumnOptions,
) -> datafusion_common::Result<Self, Self::Error> {
Ok(protobuf::ColumnOptions {
compression_opt: value
.compression
Expand Down
48 changes: 26 additions & 22 deletions datafusion/proto/src/generated/datafusion_proto_common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -670,46 +670,50 @@ pub struct TableParquetOptions {
#[prost(message, optional, tag = "1")]
pub global: ::core::option::Option<ParquetOptions>,
#[prost(message, repeated, tag = "2")]
pub column_specific_options: ::prost::alloc::vec::Vec<ColumnSpecificOptions>,
pub column_specific_options: ::prost::alloc::vec::Vec<ParquetColumnSpecificOptions>,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct ColumnSpecificOptions {
pub struct ParquetColumnSpecificOptions {
#[prost(string, tag = "1")]
pub column_name: ::prost::alloc::string::String,
#[prost(message, optional, tag = "2")]
pub options: ::core::option::Option<ColumnOptions>,
pub options: ::core::option::Option<ParquetColumnOptions>,
}
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct ColumnOptions {
#[prost(oneof = "column_options::BloomFilterEnabledOpt", tags = "1")]
pub struct ParquetColumnOptions {
#[prost(oneof = "parquet_column_options::BloomFilterEnabledOpt", tags = "1")]
pub bloom_filter_enabled_opt: ::core::option::Option<
column_options::BloomFilterEnabledOpt,
parquet_column_options::BloomFilterEnabledOpt,
>,
#[prost(oneof = "column_options::EncodingOpt", tags = "2")]
pub encoding_opt: ::core::option::Option<column_options::EncodingOpt>,
#[prost(oneof = "column_options::DictionaryEnabledOpt", tags = "3")]
#[prost(oneof = "parquet_column_options::EncodingOpt", tags = "2")]
pub encoding_opt: ::core::option::Option<parquet_column_options::EncodingOpt>,
#[prost(oneof = "parquet_column_options::DictionaryEnabledOpt", tags = "3")]
pub dictionary_enabled_opt: ::core::option::Option<
column_options::DictionaryEnabledOpt,
parquet_column_options::DictionaryEnabledOpt,
>,
#[prost(oneof = "column_options::CompressionOpt", tags = "4")]
pub compression_opt: ::core::option::Option<column_options::CompressionOpt>,
#[prost(oneof = "column_options::StatisticsEnabledOpt", tags = "5")]
#[prost(oneof = "parquet_column_options::CompressionOpt", tags = "4")]
pub compression_opt: ::core::option::Option<parquet_column_options::CompressionOpt>,
#[prost(oneof = "parquet_column_options::StatisticsEnabledOpt", tags = "5")]
pub statistics_enabled_opt: ::core::option::Option<
column_options::StatisticsEnabledOpt,
parquet_column_options::StatisticsEnabledOpt,
>,
#[prost(oneof = "column_options::BloomFilterFppOpt", tags = "6")]
pub bloom_filter_fpp_opt: ::core::option::Option<column_options::BloomFilterFppOpt>,
#[prost(oneof = "column_options::BloomFilterNdvOpt", tags = "7")]
pub bloom_filter_ndv_opt: ::core::option::Option<column_options::BloomFilterNdvOpt>,
#[prost(oneof = "column_options::MaxStatisticsSizeOpt", tags = "8")]
#[prost(oneof = "parquet_column_options::BloomFilterFppOpt", tags = "6")]
pub bloom_filter_fpp_opt: ::core::option::Option<
parquet_column_options::BloomFilterFppOpt,
>,
#[prost(oneof = "parquet_column_options::BloomFilterNdvOpt", tags = "7")]
pub bloom_filter_ndv_opt: ::core::option::Option<
parquet_column_options::BloomFilterNdvOpt,
>,
#[prost(oneof = "parquet_column_options::MaxStatisticsSizeOpt", tags = "8")]
pub max_statistics_size_opt: ::core::option::Option<
column_options::MaxStatisticsSizeOpt,
parquet_column_options::MaxStatisticsSizeOpt,
>,
}
/// Nested message and enum types in `ColumnOptions`.
pub mod column_options {
/// Nested message and enum types in `ParquetColumnOptions`.
pub mod parquet_column_options {
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Oneof)]
pub enum BloomFilterEnabledOpt {
Expand Down
2 changes: 1 addition & 1 deletion datafusion/sqllogictest/test_files/copy.slt
Original file line number Diff line number Diff line change
Expand Up @@ -358,7 +358,7 @@ OPTIONS (
)

# errors for invalid property (not stating `format.metadata`)
statement error DataFusion error: Invalid or Unsupported Configuration: Config value "wrong-metadata" not found on ColumnOptions
statement error DataFusion error: Invalid or Unsupported Configuration: Config value "wrong-metadata" not found on ParquetColumnOptions
COPY source_table
TO 'test_files/scratch/copy/table_with_metadata/'
STORED AS PARQUET
Expand Down