[FEAT] UTF8 to binary coercion flag #2893

Merged: 13 commits, Sep 24, 2024. Changes shown from 7 commits.
7 changes: 7 additions & 0 deletions Cargo.lock

(Generated file; diff not rendered.)

1 change: 1 addition & 0 deletions Cargo.toml
@@ -161,6 +161,7 @@ jaq-std = "1.2.0"
num-derive = "0.3.3"
num-traits = "0.2"
once_cell = "1.19.0"
path_macro = "1.0.0"
pretty_assertions = "1.4.0"
rand = "^0.8"
rayon = "1.10.0"
1 change: 1 addition & 0 deletions daft/daft/__init__.pyi
@@ -870,6 +870,7 @@ def read_parquet_into_pyarrow(
io_config: IOConfig | None = None,
multithreaded_io: bool | None = None,
coerce_int96_timestamp_unit: PyTimeUnit | None = None,
string_encoding: str | None = "utf-8",
file_timeout_ms: int | None = None,
): ...
def read_parquet_into_pyarrow_bulk(
2 changes: 2 additions & 0 deletions daft/table/table.py
@@ -524,6 +524,7 @@ def read_parquet_into_pyarrow(
io_config: IOConfig | None = None,
multithreaded_io: bool | None = None,
coerce_int96_timestamp_unit: TimeUnit = TimeUnit.ns(),
string_encoding: str | None = "utf-8",
file_timeout_ms: int | None = 900_000, # 15 minutes
) -> pa.Table:
fields, metadata, columns, num_rows_read = _read_parquet_into_pyarrow(
@@ -535,6 +536,7 @@ def read_parquet_into_pyarrow(
io_config=io_config,
multithreaded_io=multithreaded_io,
coerce_int96_timestamp_unit=coerce_int96_timestamp_unit._timeunit,
string_encoding=string_encoding,
file_timeout_ms=file_timeout_ms,
)
schema = pa.schema(fields, metadata=metadata)
32 changes: 21 additions & 11 deletions src/arrow2/src/io/parquet/read/schema/convert.rs
@@ -7,24 +7,28 @@ use parquet2::schema::{
Repetition,
};

use crate::datatypes::{DataType, Field, IntervalUnit, TimeUnit};
use crate::io::parquet::read::schema::SchemaInferenceOptions;
use super::StringEncoding;
use crate::{
datatypes::{DataType, Field, IntervalUnit, TimeUnit},
io::parquet::read::schema::SchemaInferenceOptions,
};

/// Converts [`ParquetType`]s to a [`Field`], ignoring parquet fields that do not contain
/// any physical column.
#[allow(dead_code)]
pub fn parquet_to_arrow_schema(fields: &[ParquetType]) -> Vec<Field> {
parquet_to_arrow_schema_with_options(fields, &None)
parquet_to_arrow_schema_with_options(fields, None)
}

/// Like [`parquet_to_arrow_schema`] but with configurable options which affect the behavior of schema inference
pub fn parquet_to_arrow_schema_with_options(
fields: &[ParquetType],
options: &Option<SchemaInferenceOptions>,
options: Option<SchemaInferenceOptions>,
) -> Vec<Field> {
let options = options.unwrap_or_default();
fields
.iter()
.filter_map(|f| to_field(f, options.as_ref().unwrap_or(&Default::default())))
.filter_map(|f| to_field(f, &options))
.collect::<Vec<_>>()
}

@@ -145,9 +149,13 @@ fn from_int64(
fn from_byte_array(
logical_type: &Option<PrimitiveLogicalType>,
converted_type: &Option<PrimitiveConvertedType>,
options: &SchemaInferenceOptions,
) -> DataType {
match (logical_type, converted_type) {
(Some(PrimitiveLogicalType::String), _) => DataType::Utf8,
(Some(PrimitiveLogicalType::String), _) => match options.string_encoding {
StringEncoding::Utf8 => DataType::Utf8,
StringEncoding::Raw => DataType::Binary,
},
(Some(PrimitiveLogicalType::Json), _) => DataType::Binary,
(Some(PrimitiveLogicalType::Bson), _) => DataType::Binary,
(Some(PrimitiveLogicalType::Enum), _) => DataType::Binary,
@@ -219,9 +227,11 @@ fn to_primitive_type_inner(
PhysicalType::Int96 => DataType::Timestamp(options.int96_coerce_to_timeunit, None),
PhysicalType::Float => DataType::Float32,
PhysicalType::Double => DataType::Float64,
PhysicalType::ByteArray => {
from_byte_array(&primitive_type.logical_type, &primitive_type.converted_type)
}
PhysicalType::ByteArray => from_byte_array(
&primitive_type.logical_type,
&primitive_type.converted_type,
options,
),
PhysicalType::FixedLenByteArray(length) => from_fixed_len_byte_array(
length,
primitive_type.logical_type,
@@ -440,7 +450,6 @@ mod tests {
use parquet2::metadata::SchemaDescriptor;

use super::*;

use crate::error::Result;

#[test]
@@ -1123,8 +1132,9 @@ mod tests {
let parquet_schema = SchemaDescriptor::try_from_message(message_type)?;
let fields = parquet_to_arrow_schema_with_options(
parquet_schema.fields(),
&Some(SchemaInferenceOptions {
Some(SchemaInferenceOptions {
int96_coerce_to_timeunit: tu,
..Default::default()
}),
);
assert_eq!(arrow_fields, fields);
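The net effect of the `from_byte_array` change above: a byte-array column whose Parquet logical type is `String` is still inferred as `Utf8` by default, but maps to `Binary` when the caller opts into `StringEncoding::Raw`. Below is a minimal illustrative unit test, not part of this diff, written against the signatures shown in this PR; it would have to live inside `mod tests` in convert.rs because it calls the private `from_byte_array` helper directly.

```rust
use parquet2::schema::types::PrimitiveLogicalType;

// Hypothetical test sketch (not from the PR), exercising only the changed match arm.
#[test]
fn byte_array_string_respects_string_encoding() {
    let logical_type = Some(PrimitiveLogicalType::String);
    let converted_type = None;

    // Default options keep the historical behavior: String -> Utf8.
    assert_eq!(
        from_byte_array(&logical_type, &converted_type, &SchemaInferenceOptions::default()),
        DataType::Utf8
    );

    // With StringEncoding::Raw, the same column surfaces as raw bytes.
    assert_eq!(
        from_byte_array(
            &logical_type,
            &converted_type,
            &SchemaInferenceOptions {
                string_encoding: StringEncoding::Raw,
                ..Default::default()
            }
        ),
        DataType::Binary
    );
}
```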
52 changes: 44 additions & 8 deletions src/arrow2/src/io/parquet/read/schema/mod.rs
@@ -1,21 +1,53 @@
//! APIs to handle Parquet <-> Arrow schemas.

use crate::datatypes::{Schema, TimeUnit};
use crate::error::Result;
use crate::{
datatypes::{Schema, TimeUnit},
error::Result,
};

mod convert;
mod metadata;

pub use convert::parquet_to_arrow_schema_with_options;
pub use metadata::{apply_schema_to_fields, read_schema_from_metadata};
pub use parquet2::metadata::{FileMetaData, KeyValue, SchemaDescriptor};
pub use parquet2::schema::types::ParquetType;

pub(crate) use convert::*;
pub use metadata::{apply_schema_to_fields, read_schema_from_metadata};
pub use parquet2::{
metadata::{FileMetaData, KeyValue, SchemaDescriptor},
schema::types::ParquetType,
};
use serde::{Deserialize, Serialize};

use self::metadata::parse_key_value_metadata;

/// String encoding options.
///
/// Each variant of this enum maps to a different interpretation of the underlying binary data:
/// 1. `StringEncoding::Utf8` assumes the underlying binary data is UTF-8 encoded.
/// 2. `StringEncoding::Raw` makes no assumptions about the encoding of the underlying binary data. This variant will change the logical type of the column to `DataType::Binary` in the final schema.
#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]

pub enum StringEncoding {
Raw,
#[default]
Utf8,
}

impl<T: AsRef<str>> TryFrom<Option<T>> for StringEncoding {
type Error = crate::error::Error;

fn try_from(value: Option<T>) -> Result<Self> {
match value.as_ref().map(AsRef::as_ref) {
Some("utf-8") => Ok(Self::Utf8),
Some(encoding) => Err(crate::error::Error::InvalidArgumentError(format!(
"Unrecognized encoding: {}",
encoding
))),
None => Ok(Self::Raw),

}
}
}

/// Options when inferring schemas from Parquet
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct SchemaInferenceOptions {
/// When inferring schemas from the Parquet INT96 timestamp type, this is the corresponding TimeUnit
/// in the inferred Arrow Timestamp type.
@@ -25,12 +57,16 @@
/// (e.g. TimeUnit::Milliseconds) will result in loss of precision, but support a larger range of dates
/// without overflowing when parsing the data.
pub int96_coerce_to_timeunit: TimeUnit,

/// The string encoding to assume when inferring the schema from Parquet data.
pub string_encoding: StringEncoding,
}

impl Default for SchemaInferenceOptions {
fn default() -> Self {
SchemaInferenceOptions {
int96_coerce_to_timeunit: TimeUnit::Nanosecond,
string_encoding: StringEncoding::default(),
}
}
}
@@ -42,13 +78,13 @@
/// This function errors iff the key `"ARROW:schema"` exists but is not correctly encoded,
/// indicating that the file's arrow metadata was incorrectly written.
pub fn infer_schema(file_metadata: &FileMetaData) -> Result<Schema> {
infer_schema_with_options(file_metadata, &None)
infer_schema_with_options(file_metadata, None)

}

/// Like [`infer_schema`] but with configurable options which affects the behavior of inference
pub fn infer_schema_with_options(
file_metadata: &FileMetaData,
options: &Option<SchemaInferenceOptions>,
options: Option<SchemaInferenceOptions>,
) -> Result<Schema> {
let mut metadata = parse_key_value_metadata(file_metadata.key_value_metadata());
let fields = parquet_to_arrow_schema_with_options(file_metadata.schema().fields(), options);
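The `TryFrom` impl above is the single place where the user-facing `string_encoding` argument is interpreted. A small sketch (not part of the diff; paths assume code inside the arrow2 crate) of the accepted values:

```rust
use crate::io::parquet::read::schema::StringEncoding;

// Hypothetical helper demonstrating the mapping defined by the TryFrom impl above.
fn demo() -> crate::error::Result<()> {
    // "utf-8" keeps the historical behavior: string columns are decoded as UTF-8.
    assert_eq!(StringEncoding::try_from(Some("utf-8"))?, StringEncoding::Utf8);

    // Passing no encoding opts out of UTF-8 coercion; columns surface as binary.
    assert_eq!(StringEncoding::try_from(None::<&str>)?, StringEncoding::Raw);

    // Any other value is rejected with an InvalidArgumentError.
    assert!(StringEncoding::try_from(Some("latin-1")).is_err());
    Ok(())
}
```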
13 changes: 7 additions & 6 deletions src/daft-micropartition/src/micropartition.rs
@@ -596,9 +596,9 @@ impl MicroPartition {
(
_,
_,
FileFormatConfig::Parquet(ParquetSourceConfig {
&FileFormatConfig::Parquet(ParquetSourceConfig {
coerce_int96_timestamp_unit,
field_id_mapping,
ref field_id_mapping,
chunk_size,
..
}),
@@ -646,12 +646,13 @@
if scan_task.sources.len() == 1 { 1 } else { 128 }, // Hardcoded to 128 bulk reads
cfg.multithreaded_io,
&ParquetSchemaInferenceOptions {
coerce_int96_timestamp_unit: *coerce_int96_timestamp_unit,
coerce_int96_timestamp_unit,
..Default::default()
},
Some(schema.clone()),
field_id_mapping.clone(),
parquet_metadata,
*chunk_size,
chunk_size,
)
.context(DaftCoreComputeSnafu)
}
@@ -1162,7 +1163,7 @@ pub(crate) fn read_parquet_into_micropartition<T: AsRef<str>>(
let schemas = metadata
.iter()
.map(|m| {
let schema = infer_schema_with_options(m, &Some((*schema_infer_options).into()))?;
let schema = infer_schema_with_options(m, Some((*schema_infer_options).into()))?;
let daft_schema = daft_core::prelude::Schema::try_from(&schema)?;
DaftResult::Ok(Arc::new(daft_schema))
})
@@ -1186,7 +1187,7 @@
let schemas = metadata
.iter()
.map(|m| {
let schema = infer_schema_with_options(m, &Some((*schema_infer_options).into()))?;
let schema = infer_schema_with_options(m, Some((*schema_infer_options).into()))?;
let daft_schema = daft_core::prelude::Schema::try_from(&schema)?;
DaftResult::Ok(Arc::new(daft_schema))
})
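The pattern change above is an ergonomics cleanup rather than a behavior change: matching on `&FileFormatConfig::Parquet(..)` copies the `Copy` fields (`coerce_int96_timestamp_unit`, `chunk_size`) out by value, so the later `*` dereferences disappear, while `ref field_id_mapping` keeps borrowing the non-`Copy` mapping. A self-contained sketch of the same idiom, using made-up types rather than the actual Daft structs:

```rust
use std::collections::BTreeMap;

struct SourceConfig {
    chunk_size: Option<usize>,                       // Copy
    field_id_mapping: Option<BTreeMap<i32, String>>, // not Copy
}

fn inspect(cfg: &SourceConfig) {
    // The `&` pattern plus a plain binding copies `chunk_size` out by value;
    // `ref` borrows the non-Copy map, so `cfg` itself is never moved.
    let &SourceConfig { chunk_size, ref field_id_mapping } = cfg;
    println!(
        "chunk_size = {:?}, mapped fields = {:?}",
        chunk_size,
        field_id_mapping.as_ref().map(|m| m.len())
    );
}
```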
1 change: 1 addition & 0 deletions src/daft-parquet/Cargo.toml
@@ -26,6 +26,7 @@ tokio-util = {workspace = true}

[dev-dependencies]
bincode = {workspace = true}
path_macro = {workspace = true}

[features]
python = ["dep:pyo3", "common-error/python", "daft-core/python", "daft-io/python", "daft-table/python", "daft-stats/python", "daft-dsl/python", "common-arrow-ffi/python"]
11 changes: 6 additions & 5 deletions src/daft-parquet/src/file.rs
@@ -259,11 +259,12 @@ impl ParquetReaderBuilder {
}

pub fn build(self) -> super::Result<ParquetFileReader> {
let mut arrow_schema =
infer_schema_with_options(&self.metadata, &Some(self.schema_inference_options.into()))
.context(UnableToParseSchemaFromMetadataSnafu::<String> {
path: self.uri.clone(),
})?;
let options = self.schema_inference_options.into();
let mut arrow_schema = infer_schema_with_options(&self.metadata, Some(options)).context(
UnableToParseSchemaFromMetadataSnafu {
path: self.uri.clone(),
},
)?;

if let Some(names_to_keep) = self.selected_columns {
arrow_schema
3 changes: 3 additions & 0 deletions src/daft-parquet/src/lib.rs
@@ -19,6 +19,9 @@ pub use python::register_modules;

#[derive(Debug, Snafu)]
pub enum Error {
#[snafu(display("{source}"))]
Arrow2Error { source: arrow2::error::Error },

#[snafu(display("{source}"))]
DaftIOError { source: daft_io::Error },

16 changes: 10 additions & 6 deletions src/daft-parquet/src/python.rs
@@ -10,7 +10,9 @@ pub mod pylib {
use daft_table::python::PyTable;
use pyo3::{pyfunction, types::PyModule, Bound, PyResult, Python};

use crate::read::{ArrowChunk, ParquetSchemaInferenceOptions};
use crate::read::{
ArrowChunk, ParquetSchemaInferenceOptions, ParquetSchemaInferenceOptionsBuilder,
};
#[allow(clippy::too_many_arguments)]
#[pyfunction]
pub fn read_parquet(
@@ -97,16 +99,19 @@
io_config: Option<IOConfig>,
multithreaded_io: Option<bool>,
coerce_int96_timestamp_unit: Option<PyTimeUnit>,
string_encoding: Option<String>,
file_timeout_ms: Option<i64>,
) -> PyResult<PyArrowParquetType> {
let read_parquet_result = py.allow_threads(|| {
let (schema, all_arrays, num_rows) = py.allow_threads(|| {
let io_client = get_io_client(
multithreaded_io.unwrap_or(true),
io_config.unwrap_or_default().config.into(),
)?;
let schema_infer_options = ParquetSchemaInferenceOptions::new(
coerce_int96_timestamp_unit.map(|tu| tu.timeunit),
);
let schema_infer_options = ParquetSchemaInferenceOptionsBuilder {
coerce_int96_timestamp_unit,
string_encoding,
}
.build()?;

crate::read::read_parquet_into_pyarrow(
uri,
@@ -121,7 +126,6 @@
file_timeout_ms,
)
})?;
let (schema, all_arrays, num_rows) = read_parquet_result;
let pyarrow = py.import_bound(pyo3::intern!(py, "pyarrow"))?;
convert_pyarrow_parquet_read_result_into_py(py, schema, all_arrays, num_rows, &pyarrow)
}