Skip to content

Commit

Permalink
parallel csv scan (#6801)
Browse files Browse the repository at this point in the history
* parallel csv scan

* add max line length

* Update according to review comments

* Update Configuration doc

---------

Co-authored-by: Andrew Lamb <[email protected]>
  • Loading branch information
2010YOUY01 and alamb committed Jul 12, 2023
1 parent 50135e8 commit ad3b8f6
Show file tree
Hide file tree
Showing 43 changed files with 1,424 additions and 573 deletions.
28 changes: 7 additions & 21 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,40 +17,26 @@

[workspace]
exclude = ["datafusion-cli"]
members = [
"datafusion/common",
"datafusion/core",
"datafusion/expr",
"datafusion/execution",
"datafusion/optimizer",
"datafusion/physical-expr",
"datafusion/proto",
"datafusion/proto/gen",
"datafusion/row",
"datafusion/sql",
"datafusion/substrait",
"datafusion-examples",
"test-utils",
"benchmarks",
members = ["datafusion/common", "datafusion/core", "datafusion/expr", "datafusion/execution", "datafusion/optimizer", "datafusion/physical-expr", "datafusion/proto", "datafusion/proto/gen", "datafusion/row", "datafusion/sql", "datafusion/substrait", "datafusion-examples", "test-utils", "benchmarks",
]
resolver = "2"

[workspace.package]
version = "27.0.0"
edition = "2021"
readme = "README.md"
authors = ["Apache Arrow <dev@arrow.apache.org>"]
license = "Apache-2.0"
edition = "2021"
homepage = "https://github.com/apache/arrow-datafusion"
license = "Apache-2.0"
readme = "README.md"
repository = "https://github.com/apache/arrow-datafusion"
rust-version = "1.64"
version = "27.0.0"

[workspace.dependencies]
arrow = { version = "43.0.0", features = ["prettyprint", "dyn_cmp_dict"] }
arrow-flight = { version = "43.0.0", features = ["flight-sql-experimental"] }
arrow-array = { version = "43.0.0", default-features = false, features = ["chrono-tz"] }
arrow-buffer = { version = "43.0.0", default-features = false }
arrow-flight = { version = "43.0.0", features = ["flight-sql-experimental"] }
arrow-schema = { version = "43.0.0", default-features = false }
arrow-array = { version = "43.0.0", default-features = false, features = ["chrono-tz"] }
parquet = { version = "43.0.0", features = ["arrow", "async", "object_store"] }
sqlparser = { version = "0.35", features = ["visitor"] }

Expand Down
88 changes: 44 additions & 44 deletions datafusion-cli/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

11 changes: 7 additions & 4 deletions datafusion/common/src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -323,10 +323,13 @@ config_namespace! {
/// long-running execution, all types of joins may encounter out-of-memory errors.
pub allow_symmetric_joins_without_pruning: bool, default = true

/// When set to true, file groups will be repartitioned to achieve maximum parallelism.
/// Currently supported only for Parquet format in which case
/// multiple row groups from the same file may be read concurrently. If false then each
/// row group is read serially, though different files may be read in parallel.
/// When set to `true`, file groups will be repartitioned to achieve maximum parallelism.
/// Currently Parquet and CSV formats are supported.
///
/// If set to `true`, all files will be repartitioned evenly (i.e., a single large file
/// might be partitioned into smaller chunks) for parallel scanning.
/// If set to `false`, different files will be read in parallel, but repartitioning won't
/// happen within a single file.
pub repartition_file_scans: bool, default = true

/// Should DataFusion repartition data using the partitions keys to execute window
Expand Down
1 change: 1 addition & 0 deletions datafusion/core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ env_logger = "0.10"
half = "2.2.1"
postgres-protocol = "0.6.4"
postgres-types = { version = "0.2.4", features = ["derive", "with-chrono-0_4"] }
regex = "1.5.4"
rstest = "0.18.0"
rust_decimal = { version = "1.27.0", features = ["tokio-pg"] }
sqllogictest = "0.14.0"
Expand Down
Loading

0 comments on commit ad3b8f6

Please sign in to comment.