Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 4 additions & 7 deletions datafusion/core/src/physical_plan/planner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1739,8 +1739,6 @@ mod tests {
col("c1").and(col("c1")),
// u8 AND u8
col("c3").and(col("c3")),
// utf8 = u32
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why remove this case?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

utf8 and u32 now have coerced type (utf8).

col("c1").eq(col("c2")),
// utf8 = bool
col("c1").eq(bool_expr.clone()),
// u32 AND bool
Expand Down Expand Up @@ -1842,7 +1840,7 @@ mod tests {
.build()?;
let execution_plan = plan(&logical_plan).await?;
// verify that the plan correctly adds cast from Int64(1) to Utf8
let expected = "InListExpr { expr: Column { name: \"c1\", index: 0 }, list: [Literal { value: Utf8(\"a\") }, CastExpr { expr: Literal { value: Int64(1) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }], negated: false, set: None }";
let expected = "expr: [(InListExpr { expr: Column { name: \"c1\", index: 0 }, list: [Literal { value: Utf8(\"a\") }, TryCastExpr { expr: Literal { value: Int64(1) }, cast_type: Utf8 }], negated: false, set: None }";
assert!(format!("{:?}", execution_plan).contains(expected));

// expression: "a in (struct::null, 'a')"
Expand All @@ -1857,8 +1855,7 @@ mod tests {
let execution_plan = plan(&logical_plan).await;

let e = execution_plan.unwrap_err().to_string();
assert_contains!(&e, "Unsupported CAST from Struct");
assert_contains!(&e, "to Boolean");
assert_contains!(&e, "Can not find compatible types to compare Boolean with [Struct([Field { name: \"foo\", data_type: Boolean, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: None }]), Utf8]");

Ok(())
}
Expand Down Expand Up @@ -1887,7 +1884,7 @@ mod tests {
.project(vec![col("c1").in_list(list, false)])?
.build()?;
let execution_plan = plan(&logical_plan).await?;
let expected = "expr: [(InListExpr { expr: Column { name: \"c1\", index: 0 }, list: [Literal { value: Utf8(\"a\") }, CastExpr { expr: Literal { value: Int64(1) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(2) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(3) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(4) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(5) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(6) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(7) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(8) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(9) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(10) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(11) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(12) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(13) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(14) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(15) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(16) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(17) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(18) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(19) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(20) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(21) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(22) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(23) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(24) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(25) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(26) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(27) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(28) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(29) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(30) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }], negated: false, set: Some(InSet { set:";
let expected = "expr: [(InListExpr { expr: Column { name: \"c1\", index: 0 }, list: [Literal { value: Utf8(\"a\") }, TryCastExpr { expr: Literal { value: Int64(1) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(2) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(3) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(4) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(5) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(6) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(7) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(8) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(9) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(10) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(11) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(12) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(13) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(14) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(15) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(16) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(17) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(18) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(19) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(20) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(21) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(22) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(23) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(24) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(25) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(26) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(27) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(28) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(29) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(30) }, cast_type: Utf8 }], negated: false, set: None }";
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here is this expression formatted for anyone else who is interested:

(InListExpr {
  expr: Column { name: \"c1\", index: 0 },
  list: [Literal { value: Utf8(\"a\") },
                 TryCastExpr { expr: Literal { value: Int64(1) },cast_type: Utf8 },
                 TryCastExpr { expr: Literal { value: Int64(2) }, cast_type: Utf8 },
                 TryCastExpr { expr: Literal { value: Int64(3) }, cast_type: Utf8 },
                 TryCastExpr { expr: Literal { value: Int64(4) }, cast_type: Utf8 },
                 TryCastExpr { expr: Literal { value: Int64(5) }, cast_type: Utf8 },
                 TryCastExpr { expr: Literal { value: Int64(6) }, cast_type: Utf8 },
                 TryCastExpr { expr: Literal { value: Int64(7) }, cast_type: Utf8 },
                 TryCastExpr { expr: Literal { value: Int64(8) }, cast_type: Utf8 },
                 TryCastExpr { expr: Literal { value: Int64(9) }, cast_type: Utf8 },
                 TryCastExpr { expr: Literal { value: Int64(10) }, cast_type: Utf8 }, 
                 TryCastExpr { expr: Literal { value: Int64(11) }, cast_type: Utf8 }, 
                 TryCastExpr { expr: Literal { value: Int64(12) }, cast_type: Utf8 }, 
                 TryCastExpr { expr: Literal { value: Int64(13) }, cast_type: Utf8 }, 
                 TryCastExpr { expr: Literal { value: Int64(14) }, cast_type: Utf8 }, 
                 TryCastExpr { expr: Literal { value: Int64(15) }, cast_type: Utf8 }, 
                 TryCastExpr { expr: Literal { value: Int64(16) }, cast_type: Utf8 }, 
                 TryCastExpr { expr: Literal { value: Int64(17) }, cast_type: Utf8 }, 
                 TryCastExpr { expr: Literal { value: Int64(18) }, cast_type: Utf8 }, 
                 TryCastExpr { expr: Literal { value: Int64(19) }, cast_type: Utf8 }, 
                 TryCastExpr { expr: Literal { value: Int64(20) }, cast_type: Utf8 }, 
                 TryCastExpr { expr: Literal { value: Int64(21) }, cast_type: Utf8 }, 
                 TryCastExpr { expr: Literal { value: Int64(22) }, cast_type: Utf8 }, 
                 TryCastExpr { expr: Literal { value: Int64(23) }, cast_type: Utf8 }, 
                 TryCastExpr { expr: Literal { value: Int64(24) }, cast_type: Utf8 }, 
                 TryCastExpr { expr: Literal { value: Int64(25) }, cast_type: Utf8 }, 
                 TryCastExpr { expr: Literal { value: Int64(26) }, cast_type: Utf8 }, 
                 TryCastExpr { expr: Literal { value: Int64(27) }, cast_type: Utf8 }, 
                 TryCastExpr { expr: Literal { value: Int64(28) }, cast_type: Utf8 }, 
                 TryCastExpr { expr: Literal { value: Int64(29) }, cast_type: Utf8 }, 
                 TryCastExpr { expr: Literal { value: Int64(30) }, cast_type: Utf8 }],
  negated: false,
  set: None
  }

assert!(format!("{:?}", execution_plan).contains(expected));
Ok(())
}
Expand All @@ -1906,7 +1903,7 @@ mod tests {
.project(vec![col("c1").in_list(list, false)])?
.build()?;
let execution_plan = plan(&logical_plan).await?;
let expected = "expr: [(InListExpr { expr: Column { name: \"c1\", index: 0 }, list: [CastExpr { expr: Literal { value: Int64(NULL) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(1) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(2) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(3) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(4) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(5) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(6) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(7) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(8) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(9) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(10) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(11) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(12) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(13) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(14) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(15) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(16) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(17) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(18) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(19) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(20) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(21) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(22) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(23) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(24) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(25) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(26) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(27) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(28) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(29) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }, CastExpr { expr: Literal { value: Int64(30) }, cast_type: Utf8, cast_options: CastOptions { safe: false } }], negated: false, set: Some(InSet { set: ";
let expected = "expr: [(InListExpr { expr: Column { name: \"c1\", index: 0 }, list: [TryCastExpr { expr: Literal { value: Int64(NULL) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(1) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(2) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(3) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(4) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(5) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(6) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(7) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(8) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(9) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(10) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(11) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(12) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(13) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(14) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(15) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(16) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(17) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(18) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(19) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(20) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(21) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(22) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(23) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(24) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(25) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(26) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(27) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(28) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(29) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(30) }, cast_type: Utf8 }], negated: false, set: None }";
assert!(format!("{:?}", execution_plan).contains(expected));
Ok(())
}
Expand Down
12 changes: 12 additions & 0 deletions datafusion/expr/src/binary_rule.rs
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@ pub fn comparison_eq_coercion(
.or_else(|| temporal_coercion(lhs_type, rhs_type))
.or_else(|| string_coercion(lhs_type, rhs_type))
.or_else(|| null_coercion(lhs_type, rhs_type))
.or_else(|| string_numeric_coercion(lhs_type, rhs_type))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This makes a lot of sense -- thanks. 👍

}

fn comparison_order_coercion(
Expand All @@ -185,6 +186,17 @@ fn comparison_order_coercion(
.or_else(|| null_coercion(lhs_type, rhs_type))
}

fn string_numeric_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have some concerns about the rule between string and number.
I check some situation in the spark:

spark-sql> desc t3;
c1                      int

spark-sql> explain extended select * from t3 where c1 = cast(123.123 as string);
== Parsed Logical Plan ==
'Project [*]
+- 'Filter ('c1 = cast(123.123 as string))
   +- 'UnresolvedRelation [t3], [], false

== Analyzed Logical Plan ==
c1: int
Project [c1#186]
+- Filter (c1#186 = cast(cast(123.123 as string) as int))
   +- SubqueryAlias spark_catalog.default.t3
      +- HiveTableRelation [`default`.`t3`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, Data Cols: [c1#186], Partition Cols: []]

== Optimized Logical Plan ==
Filter (isnotnull(c1#186) AND (c1#186 = 123))
+- HiveTableRelation [`default`.`t3`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, Data Cols: [c1#186], Partition Cols: []]

== Physical Plan ==
*(1) Filter (isnotnull(c1#186) AND (c1#186 = 123))
+- Scan hive default.t3 [c1#186], HiveTableRelation [`default`.`t3`, org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe, Data Cols: [c1#186], Partition Cols: []]

In the previous case, the result of coercion is Int.
I think we need to create an issue to track this.
@viirya @alamb

use arrow::datatypes::DataType::*;
match (lhs_type, rhs_type) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I test in 748b6a65a5fa801595fd80a3c7b2728be3c9cdaa(not this commit)

explain select * from part where p_partkey in (1, 2, '3');
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type     | plan                                                                                                                                                                                                                                                                      |
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan  | Projection: #part.p_partkey, #part.p_name, #part.p_mfgr, #part.p_brand, #part.p_type, #part.p_size, #part.p_container, #part.p_retailprice, #part.p_comment                                                                                                               |
|               |   Filter: #part.p_partkey IN ([Int64(1), Int64(2), Utf8("3")])                                                                                                                                                                                                            |
|               |     TableScan: part projection=Some([p_partkey, p_name, p_mfgr, p_brand, p_type, p_size, p_container, p_retailprice, p_comment]), partial_filters=[#part.p_partkey IN ([Int64(1), Int64(2), Utf8("3")])]                                                                  |
| physical_plan | ProjectionExec: expr=[p_partkey@0 as p_partkey, p_name@1 as p_name, p_mfgr@2 as p_mfgr, p_brand@3 as p_brand, p_type@4 as p_type, p_size@5 as p_size, p_container@6 as p_container, p_retailprice@7 as p_retailprice, p_comment@8 as p_comment]                           |
|               |   CoalesceBatchesExec: target_batch_size=4096                                                                                                                                                                                                                             |
|               |     FilterExec: p_partkey@0 IN ([Literal { value: Int64(1) }, Literal { value: Int64(2) }, CastExpr { expr: Literal { value: Utf8("3") }, cast_type: Int64, cast_options: CastOptions { safe: false } }])                                                                 |
|               |       RepartitionExec: partitioning=RoundRobinBatch(16)                                                                                                                                                                                                                   |
|               |         ParquetExec: limit=None, partitions=[/Users/yangjiang/test-data/tpch-1g-oneFile/part/part-00000-3a3c2777-00d3-4c27-b917-4ff2145123dc-c000.snappy.parquet], projection=[p_partkey, p_name, p_mfgr, p_brand, p_type, p_size, p_container, p_retailprice, p_comment] |
|               |                                                                                                                                                                                                                                                                           |
+---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

int, int,utf8 cast to -> int, int, int,

In my opinion, after apply this patch it will get int, int,utf8cast to ->utf8, utf8, utf8I think when list_values_size is large, we will construct a hashSet in https://github.com/apache/arrow-datafusion/pull/2156, change toint` will get better performance in build hasSet, Am i right? 😄

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

test in this patch

explain select * from part where p_partkey in (1, 2, '3');
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type     | plan                                                                                                                                                                                                                                                                      |
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan  | Projection: #part.p_partkey, #part.p_name, #part.p_mfgr, #part.p_brand, #part.p_type, #part.p_size, #part.p_container, #part.p_retailprice, #part.p_comment                                                                                                               |
|               |   Filter: #part.p_partkey IN ([Int64(1), Int64(2), Utf8("3")])                                                                                                                                                                                                            |
|               |     TableScan: part projection=Some([p_partkey, p_name, p_mfgr, p_brand, p_type, p_size, p_container, p_retailprice, p_comment]), partial_filters=[#part.p_partkey IN ([Int64(1), Int64(2), Utf8("3")])]                                                                  |
| physical_plan | ProjectionExec: expr=[p_partkey@0 as p_partkey, p_name@1 as p_name, p_mfgr@2 as p_mfgr, p_brand@3 as p_brand, p_type@4 as p_type, p_size@5 as p_size, p_container@6 as p_container, p_retailprice@7 as p_retailprice, p_comment@8 as p_comment]                           |
|               |   CoalesceBatchesExec: target_batch_size=4096                                                                                                                                                                                                                             |
|               |     FilterExec: CAST(p_partkey@0 AS Utf8) IN ([TryCastExpr { expr: Literal { value: Int64(1) }, cast_type: Utf8 }, TryCastExpr { expr: Literal { value: Int64(2) }, cast_type: Utf8 }, Literal { value: Utf8("3") }])                                                     |
|               |       RepartitionExec: partitioning=RoundRobinBatch(16)                                                                                                                                                                                                                   |
|               |         ParquetExec: limit=None, partitions=[/Users/yangjiang/test-data/tpch-1g-oneFile/part/part-00000-3a3c2777-00d3-4c27-b917-4ff2145123dc-c000.snappy.parquet], projection=[p_partkey, p_name, p_mfgr, p_brand, p_type, p_size, p_container, p_retailprice, p_comment] |
|               |                                                                                                                                                                                                                                                                           |
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+

Copy link
Member Author

@viirya viirya Jun 27, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess that you are basically talk the same as #2794 (comment), right?

Actually the string_numeric_coercion rule coerces Utf8 and LargeUtf8 to numeric type in its first version. But a few test cases in sql::expr::test_in_list_scalar failed. For example,

test_expression!("'2' IN ('a','b',1)", "false");

Because 'a' and 'b' cannot converted to int, they will be null. So the result of this in_list expression is null, instead of false now. There are also other similar cases.

So I changed the coercion rule to use Utf8 and LargeUtf8 to more fit with existing logic, without changing too much from existing behavior.

I'm fine if we can get a consensus about if numeric type is more correct for such cases. Then I can change them (the test cases) and the coercion rule.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As it is somehow following current behavior, I can address it in the other issue #2799.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I test in 748b6a65a5fa801595fd80a3c7b2728be3c9cdaa(not this commit)

explain select * from part where p_partkey in (1, 2, '3');
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| plan_type     | plan                                                                                                                                                                                                                                                                      |
+---------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| logical_plan  | Projection: #part.p_partkey, #part.p_name, #part.p_mfgr, #part.p_brand, #part.p_type, #part.p_size, #part.p_container, #part.p_retailprice, #part.p_comment                                                                                                               |
|               |   Filter: #part.p_partkey IN ([Int64(1), Int64(2), Utf8("3")])                                                                                                                                                                                                            |
|               |     TableScan: part projection=Some([p_partkey, p_name, p_mfgr, p_brand, p_type, p_size, p_container, p_retailprice, p_comment]), partial_filters=[#part.p_partkey IN ([Int64(1), Int64(2), Utf8("3")])]                                                                  |
| physical_plan | ProjectionExec: expr=[p_partkey@0 as p_partkey, p_name@1 as p_name, p_mfgr@2 as p_mfgr, p_brand@3 as p_brand, p_type@4 as p_type, p_size@5 as p_size, p_container@6 as p_container, p_retailprice@7 as p_retailprice, p_comment@8 as p_comment]                           |
|               |   CoalesceBatchesExec: target_batch_size=4096                                                                                                                                                                                                                             |
|               |     FilterExec: p_partkey@0 IN ([Literal { value: Int64(1) }, Literal { value: Int64(2) }, CastExpr { expr: Literal { value: Utf8("3") }, cast_type: Int64, cast_options: CastOptions { safe: false } }])                                                                 |
|               |       RepartitionExec: partitioning=RoundRobinBatch(16)                                                                                                                                                                                                                   |
|               |         ParquetExec: limit=None, partitions=[/Users/yangjiang/test-data/tpch-1g-oneFile/part/part-00000-3a3c2777-00d3-4c27-b917-4ff2145123dc-c000.snappy.parquet], projection=[p_partkey, p_name, p_mfgr, p_brand, p_type, p_size, p_container, p_retailprice, p_comment] |
|               |                                                                                                                                                                                                                                                                           |
+---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

int, int,utf8 cast to -> int, int, int,

In my opinion, after apply this patch it will get int, int,utf8cast to ->utf8, utf8, utf8I think when list_values_size is large, we will construct a hashSet in https://github.com/apache/arrow-datafusion/pull/2156, change toint` will get better performance in build hasSet, Am i right? 😄

Yes, the performance is greater for comparing integer.

Now the coercion rule is unstable, we should do mush work in that.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm fine if we can get a consensus about if numeric type is more correct for such cases. Then I can change them (the test cases) and the coercion rule.

I think what is important here is to have a consistent set of semantics. I don't have any particular preference related to the automatic coercion of int, int, utf8 as I think there are different tradeoffs

For example coercing int, int, utf8 to utf8, utf8 utf8 would allow a predicate like c1 IN (1, 2 'foo') to run without error (assuming c1 can be coerced to utf8), but would result in a runtime error if we attempted to automatically coerce to int, int int. However, as @Ted-Jiang notes, the performance will be slower for predicates like c1 IN (1, 2, '3')

Best practice would be to explicitly cast all columns to i32 in the query if they were supposed to be compared as i32:

c1 in (1::smallint, 2::smallint, '3'::smallint)

but I realize that may not be practical for all users. 🤔

(Utf8, _) if DataType::is_numeric(rhs_type) => Some(Utf8),
(LargeUtf8, _) if DataType::is_numeric(rhs_type) => Some(LargeUtf8),
(_, Utf8) if DataType::is_numeric(lhs_type) => Some(Utf8),
(_, LargeUtf8) if DataType::is_numeric(lhs_type) => Some(LargeUtf8),
_ => None,
}
}

fn comparison_binary_numeric_coercion(
lhs_type: &DataType,
rhs_type: &DataType,
Expand Down
58 changes: 4 additions & 54 deletions datafusion/physical-expr/src/expressions/in_list.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,12 @@ use arrow::{
record_batch::RecordBatch,
};

use crate::expressions::try_cast;
use crate::{expressions, PhysicalExpr};
use arrow::array::*;
use arrow::buffer::{Buffer, MutableBuffer};
use datafusion_common::ScalarValue;
use datafusion_common::ScalarValue::Decimal128;
use datafusion_common::{DataFusionError, Result};
use datafusion_expr::binary_rule::comparison_eq_coercion;
use datafusion_expr::ColumnarValue;

/// Size at which to use a Set rather than Vec for `IN` / `NOT IN`
Expand Down Expand Up @@ -745,63 +743,13 @@ impl PhysicalExpr for InListExpr {
}
}

type InListCastResult = (Arc<dyn PhysicalExpr>, Vec<Arc<dyn PhysicalExpr>>);

/// Creates a unary expression InList
pub fn in_list(
expr: Arc<dyn PhysicalExpr>,
list: Vec<Arc<dyn PhysicalExpr>>,
negated: &bool,
input_schema: &Schema,
) -> Result<Arc<dyn PhysicalExpr>> {
let (cast_expr, cast_list) = in_list_cast(expr, list, input_schema)?;
Ok(Arc::new(InListExpr::new(cast_expr, cast_list, *negated)))
}

fn in_list_cast(
expr: Arc<dyn PhysicalExpr>,
list: Vec<Arc<dyn PhysicalExpr>>,
input_schema: &Schema,
) -> Result<InListCastResult> {
let expr_type = &expr.data_type(input_schema)?;
let list_types: Vec<DataType> = list
.iter()
.map(|list_expr| list_expr.data_type(input_schema).unwrap())
.collect();
// TODO in the arrow-rs, should support NULL type to Decimal Data type
// TODO support in the arrow-rs, NULL value cast to Decimal Value
// https://github.com/apache/arrow-datafusion/issues/2759
let result_type = get_coerce_type(expr_type, &list_types);
match result_type {
None => Err(DataFusionError::Internal(format!(
"In expr can find the coerced type for {:?} in {:?}",
expr_type, list_types
))),
Some(data_type) => {
// find the coerced type
let cast_expr = try_cast(expr, input_schema, data_type.clone())?;
let cast_list_expr = list
.into_iter()
.map(|list_expr| {
try_cast(list_expr, input_schema, data_type.clone()).unwrap()
})
.collect();
Ok((cast_expr, cast_list_expr))
}
}
}

fn get_coerce_type(expr_type: &DataType, list_type: &[DataType]) -> Option<DataType> {
// get the equal coerced data type
list_type
.iter()
.fold(Some(expr_type.clone()), |left, right_type| {
match left {
None => None,
// TODO refactor a framework to do the data type coercion
Some(left_type) => comparison_eq_coercion(&left_type, right_type),
}
})
Ok(Arc::new(InListExpr::new(expr, list, *negated)))
}

#[cfg(test)]
Expand All @@ -810,12 +758,14 @@ mod tests {

use super::*;
use crate::expressions::{col, lit};
use crate::planner::in_list_cast;
use datafusion_common::Result;

// applies the in_list expr to an input batch and list
macro_rules! in_list {
($BATCH:expr, $LIST:expr, $NEGATED:expr, $EXPECTED:expr, $COL:expr, $SCHEMA:expr) => {{
let expr = in_list($COL, $LIST, $NEGATED, $SCHEMA).unwrap();
let (cast_expr, cast_list_exprs) = in_list_cast($COL, $LIST, $SCHEMA)?;
let expr = in_list(cast_expr, cast_list_exprs, $NEGATED).unwrap();
let result = expr.evaluate(&$BATCH)?.into_array($BATCH.num_rows());
let result = result
.as_any()
Expand Down
Loading