[FEAT] UTF8 to binary coercion flag #2893

Merged: 13 commits, Sep 24, 2024. Changes shown from 7 commits.
7 changes: 7 additions & 0 deletions Cargo.lock

(Generated file; diff not rendered.)

1 change: 1 addition & 0 deletions Cargo.toml
@@ -161,6 +161,7 @@ jaq-std = "1.2.0"
num-derive = "0.3.3"
num-traits = "0.2"
once_cell = "1.19.0"
path_macro = "1.0.0"
pretty_assertions = "1.4.0"
rand = "^0.8"
rayon = "1.10.0"
1 change: 1 addition & 0 deletions daft/daft/__init__.pyi
@@ -870,6 +870,7 @@ def read_parquet_into_pyarrow(
io_config: IOConfig | None = None,
multithreaded_io: bool | None = None,
coerce_int96_timestamp_unit: PyTimeUnit | None = None,
string_encoding: str | None = "utf-8",
file_timeout_ms: int | None = None,
): ...
def read_parquet_into_pyarrow_bulk(
2 changes: 2 additions & 0 deletions daft/table/table.py
@@ -524,6 +524,7 @@ def read_parquet_into_pyarrow(
io_config: IOConfig | None = None,
multithreaded_io: bool | None = None,
coerce_int96_timestamp_unit: TimeUnit = TimeUnit.ns(),
string_encoding: str | None = "utf-8",
file_timeout_ms: int | None = 900_000, # 15 minutes
) -> pa.Table:
fields, metadata, columns, num_rows_read = _read_parquet_into_pyarrow(
@@ -535,6 +536,7 @@ def read_parquet_into_pyarrow(
io_config=io_config,
multithreaded_io=multithreaded_io,
coerce_int96_timestamp_unit=coerce_int96_timestamp_unit._timeunit,
string_encoding=string_encoding,
file_timeout_ms=file_timeout_ms,
)
schema = pa.schema(fields, metadata=metadata)
32 changes: 21 additions & 11 deletions src/arrow2/src/io/parquet/read/schema/convert.rs
@@ -7,24 +7,28 @@ use parquet2::schema::{
Repetition,
};

use crate::datatypes::{DataType, Field, IntervalUnit, TimeUnit};
use crate::io::parquet::read::schema::SchemaInferenceOptions;
use super::StringEncoding;
use crate::{
datatypes::{DataType, Field, IntervalUnit, TimeUnit},
io::parquet::read::schema::SchemaInferenceOptions,
};

/// Converts [`ParquetType`]s to a [`Field`], ignoring parquet fields that do not contain
/// any physical column.
#[allow(dead_code)]
pub fn parquet_to_arrow_schema(fields: &[ParquetType]) -> Vec<Field> {
parquet_to_arrow_schema_with_options(fields, &None)
parquet_to_arrow_schema_with_options(fields, None)
}

/// Like [`parquet_to_arrow_schema`] but with configurable options which affect the behavior of schema inference
pub fn parquet_to_arrow_schema_with_options(
fields: &[ParquetType],
options: &Option<SchemaInferenceOptions>,
options: Option<SchemaInferenceOptions>,
) -> Vec<Field> {
let options = options.unwrap_or_default();
fields
.iter()
.filter_map(|f| to_field(f, options.as_ref().unwrap_or(&Default::default())))
.filter_map(|f| to_field(f, &options))
.collect::<Vec<_>>()
}

@@ -145,9 +149,13 @@ fn from_int64(
fn from_byte_array(
logical_type: &Option<PrimitiveLogicalType>,
converted_type: &Option<PrimitiveConvertedType>,
options: &SchemaInferenceOptions,
) -> DataType {
match (logical_type, converted_type) {
(Some(PrimitiveLogicalType::String), _) => DataType::Utf8,
(Some(PrimitiveLogicalType::String), _) => match options.string_encoding {
StringEncoding::Utf8 => DataType::Utf8,
StringEncoding::Raw => DataType::Binary,
},
(Some(PrimitiveLogicalType::Json), _) => DataType::Binary,
(Some(PrimitiveLogicalType::Bson), _) => DataType::Binary,
(Some(PrimitiveLogicalType::Enum), _) => DataType::Binary,
@@ -219,9 +227,11 @@ fn to_primitive_type_inner(
PhysicalType::Int96 => DataType::Timestamp(options.int96_coerce_to_timeunit, None),
PhysicalType::Float => DataType::Float32,
PhysicalType::Double => DataType::Float64,
PhysicalType::ByteArray => {
from_byte_array(&primitive_type.logical_type, &primitive_type.converted_type)
}
PhysicalType::ByteArray => from_byte_array(
&primitive_type.logical_type,
&primitive_type.converted_type,
options,
),
PhysicalType::FixedLenByteArray(length) => from_fixed_len_byte_array(
length,
primitive_type.logical_type,
@@ -440,7 +450,6 @@ mod tests {
use parquet2::metadata::SchemaDescriptor;

use super::*;

use crate::error::Result;

#[test]
@@ -1123,8 +1132,9 @@ mod tests {
let parquet_schema = SchemaDescriptor::try_from_message(message_type)?;
let fields = parquet_to_arrow_schema_with_options(
parquet_schema.fields(),
&Some(SchemaInferenceOptions {
Some(SchemaInferenceOptions {
int96_coerce_to_timeunit: tu,
..Default::default()
}),
);
assert_eq!(arrow_fields, fields);
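The net effect of the `from_byte_array` change above: a byte-array column whose Parquet logical type is `String` is still inferred as `Utf8` by default, but maps to `Binary` when the caller opts into `StringEncoding::Raw`. Below is a minimal illustrative unit test, not part of this diff, written against the signatures shown in this PR; it would have to live inside `mod tests` in convert.rs because it calls the private `from_byte_array` helper directly.

```rust
use parquet2::schema::types::PrimitiveLogicalType;

// Hypothetical test sketch (not from the PR), exercising only the changed match arm.
#[test]
fn byte_array_string_respects_string_encoding() {
    let logical_type = Some(PrimitiveLogicalType::String);
    let converted_type = None;

    // Default options keep the historical behavior: String -> Utf8.
    assert_eq!(
        from_byte_array(&logical_type, &converted_type, &SchemaInferenceOptions::default()),
        DataType::Utf8
    );

    // With StringEncoding::Raw, the same column surfaces as raw bytes.
    assert_eq!(
        from_byte_array(
            &logical_type,
            &converted_type,
            &SchemaInferenceOptions {
                string_encoding: StringEncoding::Raw,
                ..Default::default()
            }
        ),
        DataType::Binary
    );
}
```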
52 changes: 44 additions & 8 deletions src/arrow2/src/io/parquet/read/schema/mod.rs
@@ -1,21 +1,53 @@
//! APIs to handle Parquet <-> Arrow schemas.

use crate::datatypes::{Schema, TimeUnit};
use crate::error::Result;
use crate::{
datatypes::{Schema, TimeUnit},
error::Result,
};

mod convert;
mod metadata;

pub use convert::parquet_to_arrow_schema_with_options;
pub use metadata::{apply_schema_to_fields, read_schema_from_metadata};
pub use parquet2::metadata::{FileMetaData, KeyValue, SchemaDescriptor};
pub use parquet2::schema::types::ParquetType;

pub(crate) use convert::*;
pub use metadata::{apply_schema_to_fields, read_schema_from_metadata};
pub use parquet2::{
metadata::{FileMetaData, KeyValue, SchemaDescriptor},
schema::types::ParquetType,
};
use serde::{Deserialize, Serialize};

use self::metadata::parse_key_value_metadata;

/// String encoding options.
///
/// Each variant of this enum maps to a different interpretation of the underlying binary data:
/// 1. `StringEncoding::Utf8` assumes the underlying binary data is UTF-8 encoded.
/// 2. `StringEncoding::Raw` makes no assumptions about the encoding of the underlying binary data. This variant will change the logical type of the column to `DataType::Binary` in the final schema.
#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]

pub enum StringEncoding {
Raw,
#[default]
Utf8,
}

impl<T: AsRef<str>> TryFrom<Option<T>> for StringEncoding {
type Error = crate::error::Error;

fn try_from(value: Option<T>) -> Result<Self> {
match value.as_ref().map(AsRef::as_ref) {
Some("utf-8") => Ok(Self::Utf8),
Some(encoding) => Err(crate::error::Error::InvalidArgumentError(format!(
"Unrecognized encoding: {}",
encoding
))),
None => Ok(Self::Raw),

}
}
}

/// Options when inferring schemas from Parquet
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct SchemaInferenceOptions {
/// When inferring schemas from the Parquet INT96 timestamp type, this is the corresponding TimeUnit
/// in the inferred Arrow Timestamp type.
@@ -25,12 +57,16 @@
/// (e.g. TimeUnit::Milliseconds) will result in loss of precision, but support a larger range of dates
/// without overflowing when parsing the data.
pub int96_coerce_to_timeunit: TimeUnit,

/// The string encoding to assume when inferring the schema from Parquet data.
pub string_encoding: StringEncoding,
}

impl Default for SchemaInferenceOptions {
fn default() -> Self {
SchemaInferenceOptions {
int96_coerce_to_timeunit: TimeUnit::Nanosecond,
string_encoding: StringEncoding::default(),
}
}
}
@@ -42,13 +78,13 @@
/// This function errors iff the key `"ARROW:schema"` exists but is not correctly encoded,
/// indicating that the file's arrow metadata was incorrectly written.
pub fn infer_schema(file_metadata: &FileMetaData) -> Result<Schema> {
infer_schema_with_options(file_metadata, &None)
infer_schema_with_options(file_metadata, None)

}

/// Like [`infer_schema`] but with configurable options which affects the behavior of inference
pub fn infer_schema_with_options(
file_metadata: &FileMetaData,
options: &Option<SchemaInferenceOptions>,
options: Option<SchemaInferenceOptions>,
) -> Result<Schema> {
let mut metadata = parse_key_value_metadata(file_metadata.key_value_metadata());
let fields = parquet_to_arrow_schema_with_options(file_metadata.schema().fields(), options);
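The `TryFrom` impl above is the single place where the user-facing `string_encoding` argument is interpreted. A small sketch (not part of the diff; paths assume code inside the arrow2 crate) of the accepted values:

```rust
use crate::io::parquet::read::schema::StringEncoding;

// Hypothetical helper demonstrating the mapping defined by the TryFrom impl above.
fn demo() -> crate::error::Result<()> {
    // "utf-8" keeps the historical behavior: string columns are decoded as UTF-8.
    assert_eq!(StringEncoding::try_from(Some("utf-8"))?, StringEncoding::Utf8);

    // Passing no encoding opts out of UTF-8 coercion; columns surface as binary.
    assert_eq!(StringEncoding::try_from(None::<&str>)?, StringEncoding::Raw);

    // Any other value is rejected with an InvalidArgumentError.
    assert!(StringEncoding::try_from(Some("latin-1")).is_err());
    Ok(())
}
```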
13 changes: 7 additions & 6 deletions src/daft-micropartition/src/micropartition.rs
@@ -596,9 +596,9 @@ impl MicroPartition {
(
_,
_,
FileFormatConfig::Parquet(ParquetSourceConfig {
&FileFormatConfig::Parquet(ParquetSourceConfig {
coerce_int96_timestamp_unit,
field_id_mapping,
ref field_id_mapping,
chunk_size,
..
}),
@@ -646,12 +646,13 @@
if scan_task.sources.len() == 1 { 1 } else { 128 }, // Hardcoded to 128 bulk reads
cfg.multithreaded_io,
&ParquetSchemaInferenceOptions {
coerce_int96_timestamp_unit: *coerce_int96_timestamp_unit,
coerce_int96_timestamp_unit,
..Default::default()
},
Some(schema.clone()),
field_id_mapping.clone(),
parquet_metadata,
*chunk_size,
chunk_size,
)
.context(DaftCoreComputeSnafu)
}
@@ -1162,7 +1163,7 @@ pub(crate) fn read_parquet_into_micropartition<T: AsRef<str>>(
let schemas = metadata
.iter()
.map(|m| {
let schema = infer_schema_with_options(m, &Some((*schema_infer_options).into()))?;
let schema = infer_schema_with_options(m, Some((*schema_infer_options).into()))?;
let daft_schema = daft_core::prelude::Schema::try_from(&schema)?;
DaftResult::Ok(Arc::new(daft_schema))
})
@@ -1186,7 +1187,7 @@
let schemas = metadata
.iter()
.map(|m| {
let schema = infer_schema_with_options(m, &Some((*schema_infer_options).into()))?;
let schema = infer_schema_with_options(m, Some((*schema_infer_options).into()))?;
let daft_schema = daft_core::prelude::Schema::try_from(&schema)?;
DaftResult::Ok(Arc::new(daft_schema))
})
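The pattern change above is an ergonomics cleanup rather than a behavior change: matching on `&FileFormatConfig::Parquet(..)` copies the `Copy` fields (`coerce_int96_timestamp_unit`, `chunk_size`) out by value, so the later `*` dereferences disappear, while `ref field_id_mapping` keeps borrowing the non-`Copy` mapping. A self-contained sketch of the same idiom, using made-up types rather than the actual Daft structs:

```rust
use std::collections::BTreeMap;

struct SourceConfig {
    chunk_size: Option<usize>,                       // Copy
    field_id_mapping: Option<BTreeMap<i32, String>>, // not Copy
}

fn inspect(cfg: &SourceConfig) {
    // The `&` pattern plus a plain binding copies `chunk_size` out by value;
    // `ref` borrows the non-Copy map, so `cfg` itself is never moved.
    let &SourceConfig { chunk_size, ref field_id_mapping } = cfg;
    println!(
        "chunk_size = {:?}, mapped fields = {:?}",
        chunk_size,
        field_id_mapping.as_ref().map(|m| m.len())
    );
}
```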
1 change: 1 addition & 0 deletions src/daft-parquet/Cargo.toml
@@ -26,6 +26,7 @@ tokio-util = {workspace = true}

[dev-dependencies]
bincode = {workspace = true}
path_macro = {workspace = true}

[features]
python = ["dep:pyo3", "common-error/python", "daft-core/python", "daft-io/python", "daft-table/python", "daft-stats/python", "daft-dsl/python", "common-arrow-ffi/python"]
11 changes: 6 additions & 5 deletions src/daft-parquet/src/file.rs
@@ -259,11 +259,12 @@ impl ParquetReaderBuilder {
}

pub fn build(self) -> super::Result<ParquetFileReader> {
let mut arrow_schema =
infer_schema_with_options(&self.metadata, &Some(self.schema_inference_options.into()))
.context(UnableToParseSchemaFromMetadataSnafu::<String> {
path: self.uri.clone(),
})?;
let options = self.schema_inference_options.into();
let mut arrow_schema = infer_schema_with_options(&self.metadata, Some(options)).context(
UnableToParseSchemaFromMetadataSnafu {
path: self.uri.clone(),
},
)?;

if let Some(names_to_keep) = self.selected_columns {
arrow_schema
3 changes: 3 additions & 0 deletions src/daft-parquet/src/lib.rs
@@ -19,6 +19,9 @@ pub use python::register_modules;

#[derive(Debug, Snafu)]
pub enum Error {
#[snafu(display("{source}"))]
Arrow2Error { source: arrow2::error::Error },

#[snafu(display("{source}"))]
DaftIOError { source: daft_io::Error },

16 changes: 10 additions & 6 deletions src/daft-parquet/src/python.rs
@@ -10,7 +10,9 @@ pub mod pylib {
use daft_table::python::PyTable;
use pyo3::{pyfunction, types::PyModule, Bound, PyResult, Python};

use crate::read::{ArrowChunk, ParquetSchemaInferenceOptions};
use crate::read::{
ArrowChunk, ParquetSchemaInferenceOptions, ParquetSchemaInferenceOptionsBuilder,
};
#[allow(clippy::too_many_arguments)]
#[pyfunction]
pub fn read_parquet(
@@ -97,16 +99,19 @@
io_config: Option<IOConfig>,
multithreaded_io: Option<bool>,
coerce_int96_timestamp_unit: Option<PyTimeUnit>,
string_encoding: Option<String>,
file_timeout_ms: Option<i64>,
) -> PyResult<PyArrowParquetType> {
let read_parquet_result = py.allow_threads(|| {
let (schema, all_arrays, num_rows) = py.allow_threads(|| {
let io_client = get_io_client(
multithreaded_io.unwrap_or(true),
io_config.unwrap_or_default().config.into(),
)?;
let schema_infer_options = ParquetSchemaInferenceOptions::new(
coerce_int96_timestamp_unit.map(|tu| tu.timeunit),
);
let schema_infer_options = ParquetSchemaInferenceOptionsBuilder {
coerce_int96_timestamp_unit,
string_encoding,
}
.build()?;

crate::read::read_parquet_into_pyarrow(
uri,
@@ -121,7 +126,6 @@
file_timeout_ms,
)
})?;
let (schema, all_arrays, num_rows) = read_parquet_result;
let pyarrow = py.import_bound(pyo3::intern!(py, "pyarrow"))?;
convert_pyarrow_parquet_read_result_into_py(py, schema, all_arrays, num_rows, &pyarrow)
}