squash, rebase, and point at arrow-rs master

apache · Oct 5, 2023 · a70c095 · a70c095
1 parent 0408c2b
commit a70c095
Show file tree

Hide file tree

Showing 5 changed files with 495 additions and 210 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -48,12 +48,12 @@ rust-version = "1.70"
 version = "31.0.0"
 
 [workspace.dependencies]
-arrow = { version = "47.0.0", features = ["prettyprint"] }
-arrow-array = { version = "47.0.0", default-features = false, features = ["chrono-tz"] }
-arrow-buffer = { version = "47.0.0", default-features = false }
-arrow-flight = { version = "47.0.0", features = ["flight-sql-experimental"] }
-arrow-schema = { version = "47.0.0", default-features = false }
-parquet = { version = "47.0.0", features = ["arrow", "async", "object_store"] }
+arrow = { path = "../arrow-rs/arrow", features = ["prettyprint"] }
+arrow-array = { git = "https://github.com/apache/arrow-rs.git", default-features = false, features = ["chrono-tz"] }
+arrow-buffer = { git = "https://github.com/apache/arrow-rs.git", default-features = false }
+arrow-flight = { git = "https://github.com/apache/arrow-rs.git", features = ["flight-sql-experimental"] }
+arrow-schema = { git = "https://github.com/apache/arrow-rs.git", default-features = false }
+parquet = { git = "https://github.com/apache/arrow-rs.git", features = ["arrow", "async", "object_store"] }
 sqlparser = { version = "0.38.0", features = ["visitor"] }
 chrono = { version = "0.4.31", default-features = false }
 

diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs
@@ -358,12 +358,24 @@ config_namespace! {
         pub bloom_filter_ndv: Option<u64>, default = None
 
         /// Controls whether DataFusion will attempt to speed up writing
-        /// large parquet files by first writing multiple smaller files
-        /// and then stitching them together into a single large file.
-        /// This will result in faster write speeds, but higher memory usage.
-        /// Also currently unsupported are bloom filters and column indexes
-        /// when single_file_parallelism is enabled.
-        pub allow_single_file_parallelism: bool, default = false
+        /// parquet files by serializing them in parallel. Each column
+        /// in each row group in each output file is serialized in parallel,
+        /// leveraging a maximum possible core count of n_files*n_row_groups*n_columns.
+        pub allow_single_file_parallelism: bool, default = true
+
+        /// If allow_single_file_parallelism=true, this setting allows
+        /// applying backpressure to prevent working on too many row groups in
+        /// parallel in case of limited memory or slow I/O speed causing
+        /// OOM errors. Lowering this number limits memory growth at the cost
+        /// of potentially slower write speeds.
+        pub maximum_parallel_row_group_writers: usize, default = 16
+
+        /// If allow_single_file_parallelism=true, this setting allows
+        /// applying backpressure to prevent too many RecordBatches building
+        /// up in memory in case the parallel writers cannot consume them fast
+        /// enough. Lowering this number limits memory growth at the cost
+        /// of potentially lower write speeds.
+        pub maximum_buffered_record_batches_per_stream: usize, default = 200
 
     }
 }