Skip to content
Merged
192 changes: 192 additions & 0 deletions benchmarks/src/hj.rs
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,198 @@ const HASH_QUERIES: &[HashJoinQuery] = &[
build_size: "100K_(20%_dups)",
probe_size: "60M",
},
// RightSemi Join benchmarks with Int32 keys
// Q16: RightSemi, 100% Density, 100% Hit rate
// Probe side: every `lineitem.l_suppkey`, cast to INT, unmodified.
// Filter side: `supplier.s_suppkey`, cast to INT, unmodified; the EXISTS
// subquery keeps a probe row when an equal key is present.
// NOTE(review): the "RightSemi" label assumes the planner turns this EXISTS
// into a right-semi hash join with `supplier` as the build side, and the 100%
// hit rate assumes every l_suppkey exists in supplier — confirm against the
// physical plan and the generated data.
HashJoinQuery {
sql: r###"SELECT l.k
FROM (
SELECT CAST(l_suppkey AS INT) as k FROM lineitem
) l
WHERE EXISTS (
SELECT 1 FROM (SELECT CAST(s_suppkey AS INT) as k FROM supplier) s WHERE s.k = l.k
)"###,
density: 1.0,
prob_hit: 1.0,
build_size: "100K",
probe_size: "60M_RightSemi",
},
// Q17: RightSemi, 100% Density, 10% Hit rate
// Probe side: `lineitem.l_suppkey` is kept as-is when it is divisible by 10;
// every other value is shifted by +1000000 before the cast to INT.
// Filter side: unmodified `supplier.s_suppkey`; EXISTS keeps matching rows.
// NOTE(review): the 10% hit rate assumes the shifted keys lie above every
// s_suppkey and that l_suppkey is spread evenly over its last digit —
// confirm against the generated data.
HashJoinQuery {
sql: r###"SELECT l.k
FROM (
SELECT CAST(CASE WHEN l_suppkey % 10 = 0 THEN l_suppkey ELSE l_suppkey + 1000000 END AS INT) as k
FROM lineitem
) l
WHERE EXISTS (
SELECT 1 FROM (SELECT CAST(s_suppkey AS INT) as k FROM supplier) s WHERE s.k = l.k
)"###,
density: 1.0,
prob_hit: 0.1,
build_size: "100K",
probe_size: "60M_RightSemi",
},
// Q18: RightSemi, 50% Density, 100% Hit rate
// Both sides multiply their key by 2 before the cast to INT, so only even
// key values occur; the probe keys are `l_suppkey * 2` and the filter keys
// are `s_suppkey * 2`. EXISTS keeps probe rows that find an equal key.
// NOTE(review): "50% density" presumably means the filter keys fill every
// second integer of their range, which requires s_suppkey to be contiguous —
// confirm against the generated data.
HashJoinQuery {
sql: r###"SELECT l.k
FROM (
SELECT CAST(l_suppkey * 2 AS INT) as k FROM lineitem
) l
WHERE EXISTS (
SELECT 1 FROM (SELECT CAST(s_suppkey * 2 AS INT) as k FROM supplier) s WHERE s.k = l.k
)"###,
density: 0.5,
prob_hit: 1.0,
build_size: "100K",
probe_size: "60M_RightSemi",
},
// Q19: RightSemi, 50% Density, 10% Hit rate
// Filter keys are `s_suppkey * 2`, i.e. always even. The probe key is chosen
// by the last digit of `l_suppkey`:
//   - `% 10 = 0`      -> `l_suppkey * 2`            (even, can match)
//   - `% 10 < 9`      -> `l_suppkey * 2 + 1`        (odd, cannot equal an even key)
//   - otherwise       -> `l_suppkey * 2 + 1000000`  (even, shifted by 1000000)
// NOTE(review): the 10% hit rate assumes the shifted keys of the last branch
// lie above every `s_suppkey * 2` and that l_suppkey is spread evenly over its
// last digit — confirm against the generated data.
HashJoinQuery {
sql: r###"SELECT l.k
FROM (
SELECT CAST(CASE
WHEN l_suppkey % 10 = 0 THEN l_suppkey * 2
WHEN l_suppkey % 10 < 9 THEN l_suppkey * 2 + 1
ELSE l_suppkey * 2 + 1000000
END AS INT) as k
FROM lineitem
) l
WHERE EXISTS (
SELECT 1 FROM (SELECT CAST(s_suppkey * 2 AS INT) as k FROM supplier) s WHERE s.k = l.k
)"###,
density: 0.5,
prob_hit: 0.1,
build_size: "100K",
probe_size: "60M_RightSemi",
},
// Q20: RightSemi, 10% Density, 100% Hit rate
// Both sides multiply their key by 10 before the cast to INT, so only
// multiples of 10 occur; the probe keys are `l_suppkey * 10` and the filter
// keys are `s_suppkey * 10`. EXISTS keeps probe rows that find an equal key.
// NOTE(review): "10% density" presumably means the filter keys fill every
// tenth integer of their range, which requires s_suppkey to be contiguous —
// confirm against the generated data.
HashJoinQuery {
sql: r###"SELECT l.k
FROM (
SELECT CAST(l_suppkey * 10 AS INT) as k FROM lineitem
) l
WHERE EXISTS (
SELECT 1 FROM (SELECT CAST(s_suppkey * 10 AS INT) as k FROM supplier) s WHERE s.k = l.k
)"###,
density: 0.1,
prob_hit: 1.0,
build_size: "100K",
probe_size: "60M_RightSemi",
},
// Q21: RightSemi, 10% Density, 10% Hit rate
// Filter keys are `s_suppkey * 10`, i.e. always multiples of 10. The probe
// key is chosen by the last digit of `l_suppkey`:
//   - `% 10 = 0`      -> `l_suppkey * 10`            (multiple of 10, can match)
//   - `% 10 < 9`      -> `l_suppkey * 10 + 1`        (never a multiple of 10, cannot match)
//   - otherwise       -> `l_suppkey * 10 + 1000000`  (multiple of 10, shifted by 1000000)
// NOTE(review): the 10% hit rate assumes the shifted keys of the last branch
// lie above every `s_suppkey * 10` and that l_suppkey is spread evenly over
// its last digit — confirm against the generated data.
HashJoinQuery {
sql: r###"SELECT l.k
FROM (
SELECT CAST(CASE
WHEN l_suppkey % 10 = 0 THEN l_suppkey * 10
WHEN l_suppkey % 10 < 9 THEN l_suppkey * 10 + 1
ELSE l_suppkey * 10 + 1000000
END AS INT) as k
FROM lineitem
) l
WHERE EXISTS (
SELECT 1 FROM (SELECT CAST(s_suppkey * 10 AS INT) as k FROM supplier) s WHERE s.k = l.k
)"###,
density: 0.1,
prob_hit: 0.1,
build_size: "100K",
probe_size: "60M_RightSemi",
},
// RightAnti Join benchmarks with Int32 keys
// Q22: RightAnti, 100% Density, 100% Hit rate (no output)
// Probe side: every `lineitem.l_suppkey`, cast to INT, unmodified.
// Filter side: `supplier.s_suppkey`, cast to INT, unmodified; NOT EXISTS
// keeps only probe rows that have no equal key on the filter side.
// NOTE(review): the "RightAnti" label assumes the planner turns this
// NOT EXISTS into a right-anti hash join with `supplier` as the build side,
// and "no output" assumes every l_suppkey exists in supplier — confirm
// against the physical plan and the generated data.
HashJoinQuery {
sql: r###"SELECT l.k
FROM (
SELECT CAST(l_suppkey AS INT) as k FROM lineitem
) l
WHERE NOT EXISTS (
SELECT 1 FROM (SELECT CAST(s_suppkey AS INT) as k FROM supplier) s WHERE s.k = l.k
)"###,
density: 1.0,
prob_hit: 1.0,
build_size: "100K",
probe_size: "60M_RightAnti",
},
// Q23: RightAnti, 100% Density, 10% Hit rate (90% output)
// Probe side: `lineitem.l_suppkey` is kept as-is when it is divisible by 10;
// every other value is shifted by +1000000 before the cast to INT.
// Filter side: unmodified `supplier.s_suppkey`; NOT EXISTS keeps the probe
// rows that find no equal key.
// NOTE(review): the 10% hit / 90% output figures assume the shifted keys lie
// above every s_suppkey and that l_suppkey is spread evenly over its last
// digit — confirm against the generated data.
HashJoinQuery {
sql: r###"SELECT l.k
FROM (
SELECT CAST(CASE WHEN l_suppkey % 10 = 0 THEN l_suppkey ELSE l_suppkey + 1000000 END AS INT) as k
FROM lineitem
) l
WHERE NOT EXISTS (
SELECT 1 FROM (SELECT CAST(s_suppkey AS INT) as k FROM supplier) s WHERE s.k = l.k
)"###,
density: 1.0,
prob_hit: 0.1,
build_size: "100K",
probe_size: "60M_RightAnti",
},
// Q24: RightAnti, 50% Density, 100% Hit rate (no output)
// Both sides multiply their key by 2 before the cast to INT, so only even
// key values occur; the probe keys are `l_suppkey * 2` and the filter keys
// are `s_suppkey * 2`. NOT EXISTS keeps probe rows that find no equal key.
// NOTE(review): "no output" assumes every l_suppkey exists in supplier, and
// "50% density" assumes s_suppkey is contiguous — confirm against the
// generated data.
HashJoinQuery {
sql: r###"SELECT l.k
FROM (
SELECT CAST(l_suppkey * 2 AS INT) as k FROM lineitem
) l
WHERE NOT EXISTS (
SELECT 1 FROM (SELECT CAST(s_suppkey * 2 AS INT) as k FROM supplier) s WHERE s.k = l.k
)"###,
density: 0.5,
prob_hit: 1.0,
build_size: "100K",
probe_size: "60M_RightAnti",
},
// Q25: RightAnti, 50% Density, 10% Hit rate (90% output)
// Filter keys are `s_suppkey * 2`, i.e. always even. The probe key is chosen
// by the last digit of `l_suppkey`:
//   - `% 10 = 0`      -> `l_suppkey * 2`            (even, can match)
//   - `% 10 < 9`      -> `l_suppkey * 2 + 1`        (odd, cannot equal an even key)
//   - otherwise       -> `l_suppkey * 2 + 1000000`  (even, shifted by 1000000)
// NOT EXISTS keeps the probe rows that find no equal key.
// NOTE(review): the 10% hit / 90% output figures assume the shifted keys of
// the last branch lie above every `s_suppkey * 2` and that l_suppkey is spread
// evenly over its last digit — confirm against the generated data.
HashJoinQuery {
sql: r###"SELECT l.k
FROM (
SELECT CAST(CASE
WHEN l_suppkey % 10 = 0 THEN l_suppkey * 2
WHEN l_suppkey % 10 < 9 THEN l_suppkey * 2 + 1
ELSE l_suppkey * 2 + 1000000
END AS INT) as k
FROM lineitem
) l
WHERE NOT EXISTS (
SELECT 1 FROM (SELECT CAST(s_suppkey * 2 AS INT) as k FROM supplier) s WHERE s.k = l.k
)"###,
density: 0.5,
prob_hit: 0.1,
build_size: "100K",
probe_size: "60M_RightAnti",
},
// Q26: RightAnti, 10% Density, 100% Hit rate (no output)
// Both sides multiply their key by 10 before the cast to INT, so only
// multiples of 10 occur; the probe keys are `l_suppkey * 10` and the filter
// keys are `s_suppkey * 10`. NOT EXISTS keeps probe rows that find no equal key.
// NOTE(review): "no output" assumes every l_suppkey exists in supplier, and
// "10% density" assumes s_suppkey is contiguous — confirm against the
// generated data.
HashJoinQuery {
sql: r###"SELECT l.k
FROM (
SELECT CAST(l_suppkey * 10 AS INT) as k FROM lineitem
) l
WHERE NOT EXISTS (
SELECT 1 FROM (SELECT CAST(s_suppkey * 10 AS INT) as k FROM supplier) s WHERE s.k = l.k
)"###,
density: 0.1,
prob_hit: 1.0,
build_size: "100K",
probe_size: "60M_RightAnti",
},
// Q27: RightAnti, 10% Density, 10% Hit rate (90% output)
// Filter keys are `s_suppkey * 10`, i.e. always multiples of 10. The probe
// key is chosen by the last digit of `l_suppkey`:
//   - `% 10 = 0`      -> `l_suppkey * 10`            (multiple of 10, can match)
//   - `% 10 < 9`      -> `l_suppkey * 10 + 1`        (never a multiple of 10, cannot match)
//   - otherwise       -> `l_suppkey * 10 + 1000000`  (multiple of 10, shifted by 1000000)
// NOT EXISTS keeps the probe rows that find no equal key.
// NOTE(review): the 10% hit / 90% output figures assume the shifted keys of
// the last branch lie above every `s_suppkey * 10` and that l_suppkey is
// spread evenly over its last digit — confirm against the generated data.
HashJoinQuery {
sql: r###"SELECT l.k
FROM (
SELECT CAST(CASE
WHEN l_suppkey % 10 = 0 THEN l_suppkey * 10
WHEN l_suppkey % 10 < 9 THEN l_suppkey * 10 + 1
ELSE l_suppkey * 10 + 1000000
END AS INT) as k
FROM lineitem
) l
WHERE NOT EXISTS (
SELECT 1 FROM (SELECT CAST(s_suppkey * 10 AS INT) as k FROM supplier) s WHERE s.k = l.k
)"###,
density: 0.1,
prob_hit: 0.1,
build_size: "100K",
probe_size: "60M_RightAnti",
},
];

impl RunOpt {
Expand Down
5 changes: 5 additions & 0 deletions datafusion/physical-plan/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -108,3 +108,8 @@ required-features = ["test_utils"]
harness = false
name = "aggregate_vectorized"
required-features = ["test_utils"]

# Benchmark target `hash_join_semi_anti`.
[[bench]]
# Opt out of the built-in libtest bench harness; the bench supplies its own entry point.
harness = false
name = "hash_join_semi_anti"
# Only built when the `test_utils` feature is enabled.
required-features = ["test_utils"]
Loading
Loading