Skip to content

Commit c0c9c72

Browse files
QP Hou, jorgecarleitao, yjshen, Igosuki
authored
Officially maintained Arrow2 branch (#1556)
Migrate to arrow2

Co-authored-by: Jorge C. Leitao <[email protected]>
Co-authored-by: Yijie Shen <[email protected]>
Co-authored-by: Guillaume Balaine <[email protected]>
Co-authored-by: Guillaume Balaine <[email protected]>
1 parent 2008b1d commit c0c9c72

File tree

176 files changed

+5942
-6944
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

176 files changed

+5942
-6944
lines changed

.github/workflows/rust.yml

+1-2
Original file line numberDiff line numberDiff line change
@@ -318,8 +318,7 @@ jobs:
318318
run: |
319319
cargo miri setup
320320
cargo clean
321-
# Ignore MIRI errors until we can get a clean run
322-
cargo miri test || true
321+
cargo miri test
323322
324323
# Check answers are correct when hash values collide
325324
hash-collisions:

Cargo.toml

+5
Original file line numberDiff line numberDiff line change
@@ -33,3 +33,8 @@ exclude = ["python"]
3333
[profile.release]
3434
lto = true
3535
codegen-units = 1
36+
37+
[patch.crates-io]
38+
arrow2 = { git = "https://github.com/jorgecarleitao/arrow2.git", rev = "ef7937dfe56033c2cc491482c67587b52cd91554" }
39+
#arrow2 = { git = "https://github.com/blaze-init/arrow2.git", branch = "shuffle_ipc" }
40+
#parquet2 = { git = "https://github.com/blaze-init/parquet2.git", branch = "meta_new" }

README.md

-2
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,6 @@ Run a SQL query against data stored in a CSV:
7171

7272
```rust
7373
use datafusion::prelude::*;
74-
use datafusion::arrow::util::pretty::print_batches;
7574
use datafusion::arrow::record_batch::RecordBatch;
7675

7776
#[tokio::main]
@@ -93,7 +92,6 @@ Use the DataFrame API to process data stored in a CSV:
9392

9493
```rust
9594
use datafusion::prelude::*;
96-
use datafusion::arrow::util::pretty::print_batches;
9795
use datafusion::arrow::record_batch::RecordBatch;
9896

9997
#[tokio::main]

ballista-examples/Cargo.toml

+2-2
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,8 @@ rust-version = "1.57"
3131
[dependencies]
3232
datafusion = { path = "../datafusion" }
3333
ballista = { path = "../ballista/rust/client", version = "0.6.0"}
34-
prost = "0.8"
35-
tonic = "0.5"
34+
prost = "0.9"
35+
tonic = "0.6"
3636
tokio = { version = "1.0", features = ["macros", "rt", "rt-multi-thread", "sync"] }
3737
futures = "0.3"
3838
num_cpus = "1.13.0"

ballista-examples/src/bin/ballista-dataframe.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ async fn main() -> Result<()> {
2727
.build()?;
2828
let ctx = BallistaContext::remote("localhost", 50050, &config);
2929

30-
let testdata = datafusion::arrow::util::test_util::parquet_test_data();
30+
let testdata = datafusion::test_util::parquet_test_data();
3131

3232
let filename = &format!("{}/alltypes_plain.parquet", testdata);
3333

ballista-examples/src/bin/ballista-sql.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ async fn main() -> Result<()> {
2727
.build()?;
2828
let ctx = BallistaContext::remote("localhost", 50050, &config);
2929

30-
let testdata = datafusion::arrow::util::test_util::arrow_test_data();
30+
let testdata = datafusion::test_util::arrow_test_data();
3131

3232
// register csv file with the execution context
3333
ctx.register_csv(

ballista/rust/client/README.md

+2-2
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ data set.
9595

9696
```rust,no_run
9797
use ballista::prelude::*;
98-
use datafusion::arrow::util::pretty;
98+
use datafusion::arrow::io::print;
9999
use datafusion::prelude::CsvReadOptions;
100100
101101
#[tokio::main]
@@ -125,7 +125,7 @@ async fn main() -> Result<()> {
125125
126126
// collect the results and print them to stdout
127127
let results = df.collect().await?;
128-
pretty::print_batches(&results)?;
128+
print::print(&results);
129129
Ok(())
130130
}
131131
```

ballista/rust/client/src/columnar_batch.rs

+3-2
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ use ballista_core::error::{ballista_error, Result};
2121

2222
use datafusion::arrow::{
2323
array::ArrayRef,
24+
compute::aggregate::estimated_bytes_size,
2425
datatypes::{DataType, Schema},
2526
record_batch::RecordBatch,
2627
};
@@ -50,7 +51,7 @@ impl ColumnarBatch {
5051
.collect();
5152

5253
Self {
53-
schema: batch.schema(),
54+
schema: batch.schema().clone(),
5455
columns,
5556
}
5657
}
@@ -156,7 +157,7 @@ impl ColumnarValue {
156157

157158
pub fn memory_size(&self) -> usize {
158159
match self {
159-
ColumnarValue::Columnar(array) => array.get_array_memory_size(),
160+
ColumnarValue::Columnar(array) => estimated_bytes_size(array.as_ref()),
160161
_ => 0,
161162
}
162163
}

ballista/rust/core/Cargo.toml

+5-7
Original file line numberDiff line numberDiff line change
@@ -35,23 +35,21 @@ async-trait = "0.1.36"
3535
futures = "0.3"
3636
hashbrown = "0.11"
3737
log = "0.4"
38-
prost = "0.8"
38+
prost = "0.9"
3939
serde = {version = "1", features = ["derive"]}
4040
sqlparser = "0.13"
4141
tokio = "1.0"
42-
tonic = "0.5"
42+
tonic = "0.6"
4343
uuid = { version = "0.8", features = ["v4"] }
4444
chrono = { version = "0.4", default-features = false }
4545

46-
# workaround for https://github.com/apache/arrow-datafusion/issues/1498
47-
# should be able to remove when we update arrow-flight
48-
quote = "=1.0.10"
49-
arrow-flight = { version = "6.4.0" }
46+
arrow-format = { version = "0.3", features = ["flight-data", "flight-service"] }
47+
arrow = { package = "arrow2", version="0.8", features = ["io_ipc", "io_flight"] }
5048

5149
datafusion = { path = "../../../datafusion", version = "6.0.0" }
5250

5351
[dev-dependencies]
5452
tempfile = "3"
5553

5654
[build-dependencies]
57-
tonic-build = { version = "0.5" }
55+
tonic-build = { version = "0.6" }

ballista/rust/core/proto/ballista.proto

+21-3
Original file line numberDiff line numberDiff line change
@@ -1015,6 +1015,7 @@ enum TimeUnit{
10151015
enum IntervalUnit{
10161016
YearMonth = 0;
10171017
DayTime = 1;
1018+
MonthDayNano = 2;
10181019
}
10191020

10201021
message Decimal{
@@ -1028,11 +1029,11 @@ message List{
10281029

10291030
message FixedSizeList{
10301031
Field field_type = 1;
1031-
int32 list_size = 2;
1032+
uint32 list_size = 2;
10321033
}
10331034

10341035
message Dictionary{
1035-
ArrowType key = 1;
1036+
IntegerType key = 1;
10361037
ArrowType value = 2;
10371038
}
10381039

@@ -1135,7 +1136,7 @@ message ArrowType{
11351136
EmptyMessage UTF8 =14 ;
11361137
EmptyMessage LARGE_UTF8 = 32;
11371138
EmptyMessage BINARY =15 ;
1138-
int32 FIXED_SIZE_BINARY =16 ;
1139+
uint32 FIXED_SIZE_BINARY =16 ;
11391140
EmptyMessage LARGE_BINARY = 31;
11401141
EmptyMessage DATE32 =17 ;
11411142
EmptyMessage DATE64 =18 ;
@@ -1154,6 +1155,23 @@ message ArrowType{
11541155
}
11551156
}
11561157

1158+
// Broke out into multiple message types so that type
1159+
// metadata did not need to be in separate message
1160+
//All types that are of the empty message types contain no additional metadata
1161+
// about the type
1162+
message IntegerType{
1163+
oneof integer_type_enum{
1164+
EmptyMessage INT8 = 1;
1165+
EmptyMessage INT16 = 2;
1166+
EmptyMessage INT32 = 3;
1167+
EmptyMessage INT64 = 4;
1168+
EmptyMessage UINT8 = 5;
1169+
EmptyMessage UINT16 = 6;
1170+
EmptyMessage UINT32 = 7;
1171+
EmptyMessage UINT64 = 8;
1172+
}
1173+
}
1174+
11571175

11581176

11591177

ballista/rust/core/src/client.rs

+30-14
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,9 @@
1717

1818
//! Client API for sending requests to executors.
1919
20-
use std::sync::Arc;
20+
use arrow::io::flight::deserialize_schemas;
21+
use arrow::io::ipc::IpcSchema;
22+
use std::sync::{Arc, Mutex};
2123
use std::{collections::HashMap, pin::Pin};
2224
use std::{
2325
convert::{TryFrom, TryInto},
@@ -31,11 +33,10 @@ use crate::serde::scheduler::{
3133
Action, ExecutePartition, ExecutePartitionResult, PartitionId, PartitionStats,
3234
};
3335

34-
use arrow_flight::utils::flight_data_to_arrow_batch;
35-
use arrow_flight::Ticket;
36-
use arrow_flight::{flight_service_client::FlightServiceClient, FlightData};
36+
use arrow_format::flight::data::{FlightData, Ticket};
37+
use arrow_format::flight::service::flight_service_client::FlightServiceClient;
3738
use datafusion::arrow::{
38-
array::{StringArray, StructArray},
39+
array::{StructArray, Utf8Array},
3940
datatypes::{Schema, SchemaRef},
4041
error::{ArrowError, Result as ArrowResult},
4142
record_batch::RecordBatch,
@@ -122,10 +123,12 @@ impl BallistaClient {
122123
{
123124
Some(flight_data) => {
124125
// convert FlightData to a stream
125-
let schema = Arc::new(Schema::try_from(&flight_data)?);
126+
let (schema, ipc_schema) =
127+
deserialize_schemas(flight_data.data_body.as_slice()).unwrap();
128+
let schema = Arc::new(schema);
126129

127130
// all the remaining stream messages should be dictionary and record batches
128-
Ok(Box::pin(FlightDataStream::new(stream, schema)))
131+
Ok(Box::pin(FlightDataStream::new(stream, schema, ipc_schema)))
129132
}
130133
None => Err(ballista_error(
131134
"Did not receive schema batch from flight server",
@@ -135,32 +138,45 @@ impl BallistaClient {
135138
}
136139

137140
struct FlightDataStream {
138-
stream: Streaming<FlightData>,
141+
stream: Mutex<Streaming<FlightData>>,
139142
schema: SchemaRef,
143+
ipc_schema: IpcSchema,
140144
}
141145

142146
impl FlightDataStream {
143-
pub fn new(stream: Streaming<FlightData>, schema: SchemaRef) -> Self {
144-
Self { stream, schema }
147+
pub fn new(
148+
stream: Streaming<FlightData>,
149+
schema: SchemaRef,
150+
ipc_schema: IpcSchema,
151+
) -> Self {
152+
Self {
153+
stream: Mutex::new(stream),
154+
schema,
155+
ipc_schema,
156+
}
145157
}
146158
}
147159

148160
impl Stream for FlightDataStream {
149161
type Item = ArrowResult<RecordBatch>;
150162

151163
fn poll_next(
152-
mut self: std::pin::Pin<&mut Self>,
164+
self: std::pin::Pin<&mut Self>,
153165
cx: &mut Context<'_>,
154166
) -> Poll<Option<Self::Item>> {
155-
self.stream.poll_next_unpin(cx).map(|x| match x {
167+
let mut stream = self.stream.lock().unwrap();
168+
stream.poll_next_unpin(cx).map(|x| match x {
156169
Some(flight_data_chunk_result) => {
157170
let converted_chunk = flight_data_chunk_result
158171
.map_err(|e| ArrowError::from_external_error(Box::new(e)))
159172
.and_then(|flight_data_chunk| {
160-
flight_data_to_arrow_batch(
173+
let hm = HashMap::new();
174+
175+
arrow::io::flight::deserialize_batch(
161176
&flight_data_chunk,
162177
self.schema.clone(),
163-
&[],
178+
&self.ipc_schema,
179+
&hm,
164180
)
165181
});
166182
Some(converted_chunk)

0 commit comments

Comments
 (0)