-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-27225][SQL] Implement join strategy hints #24164
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
1426294
e77c9f3
f198dfb
d25822d
4a13ffe
407c63f
6bd7f56
c535d36
e533ac2
7342fbd
0912997
c0b217c
4a48286
a9634c4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -66,17 +66,76 @@ object JoinHint { | |
| /** | ||
| * The hint attributes to be applied on a specific node. | ||
| * | ||
| * @param broadcast If set to true, it indicates that the broadcast hash join is the preferred join | ||
| * strategy and the node with this hint is preferred to be the build side. | ||
| * @param strategy The preferred join strategy. | ||
| */ | ||
| case class HintInfo(broadcast: Boolean = false) { | ||
| case class HintInfo(strategy: Option[JoinStrategyHint] = None) { | ||
|
|
||
| /** | ||
| * Combine two [[HintInfo]]s into one [[HintInfo]], in which the new strategy will the strategy | ||
| * in this [[HintInfo]] if defined, otherwise the strategy in the other [[HintInfo]]. | ||
| */ | ||
| def merge(other: HintInfo): HintInfo = { | ||
| HintInfo(strategy = this.strategy.orElse(other.strategy)) | ||
| } | ||
|
|
||
| override def toString: String = { | ||
| val hints = scala.collection.mutable.ArrayBuffer.empty[String] | ||
|
||
| if (broadcast) { | ||
| hints += "broadcast" | ||
| if (strategy.isDefined) { | ||
| hints += s"strategy=${strategy.get}" | ||
| } | ||
|
|
||
| if (hints.isEmpty) "none" else hints.mkString("(", ", ", ")") | ||
| } | ||
| } | ||
|
|
||
| sealed abstract class JoinStrategyHint { | ||
|
|
||
| def displayName: String | ||
| def hintAliases: Set[String] | ||
|
|
||
| override def toString: String = displayName | ||
| } | ||
|
|
||
| /** | ||
| * The enumeration of join strategy hints. | ||
| * | ||
| * The hinted strategy will be used for the join with which it is associated if doable. In case | ||
| * of contradicting strategy hints specified for each side of the join, hints are prioritized as | ||
| * BROADCAST over SHUFFLE_MERGE over SHUFFLE_HASH over SHUFFLE_REPLICATE_NL. | ||
| */ | ||
| object JoinStrategyHint { | ||
|
|
||
| val strategies: Set[JoinStrategyHint] = Set( | ||
| BROADCAST, | ||
| SHUFFLE_MERGE, | ||
| SHUFFLE_HASH, | ||
| SHUFFLE_REPLICATE_NL) | ||
| } | ||
|
|
||
| case object BROADCAST extends JoinStrategyHint { | ||
| override def displayName: String = "broadcast-hash" | ||
| override def hintAliases: Set[String] = Set( | ||
| "BROADCAST", | ||
| "BROADCASTJOIN", | ||
| "MAPJOIN") | ||
| } | ||
|
|
||
| case object SHUFFLE_MERGE extends JoinStrategyHint { | ||
| override def displayName: String = "shuffle-merge" | ||
| override def hintAliases: Set[String] = Set( | ||
| "SHUFFLE_MERGE", | ||
| "MERGE", | ||
| "MERGEJOIN") | ||
| } | ||
|
|
||
| case object SHUFFLE_HASH extends JoinStrategyHint { | ||
| override def displayName: String = "shuffle-hash" | ||
| override def hintAliases: Set[String] = Set( | ||
| "SHUFFLE_HASH") | ||
| } | ||
|
|
||
| case object SHUFFLE_REPLICATE_NL extends JoinStrategyHint { | ||
| override def displayName: String = "shuffle-replicate-nested-loop" | ||
| override def hintAliases: Set[String] = Set( | ||
| "SHUFFLE_REPLICATE_NL") | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This hint for cartesian products is useful for users?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes. In the default logic, broadcast-nl is prioritized over shuffle-replicate-nl (cartesian-product), so this can be used for special cases where shuffle-replicate-nl is favored.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we might need a code comment to explain SHUFFLE_REPLICATE_NL is |
||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This looks suspicious. What is the conflict resolution policy we are following here? A behavior change? Do we need to log the inputs and the resulting hint?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There is no behavior change here. There cannot be, coz we only had one hint before this PR, so a broadcast + broadcast = broadcast. The new behavior is defined in the description of this PR:
The merge function is implemented in HintInfo, which should be responsible for deciding the strategy of merging hints between a node on top and a node on the bottom.