GH-15243: [C++] fix for potential deadlock in the group-by node #33700
```diff
@@ -486,7 +486,7 @@ class GroupByNode : public ExecNode {
     outputs_[0]->InputReceived(this, out_data_.Slice(batch_size * n, batch_size));
   }

-  Status OutputResult() {
+  Status DoOutputResult() {
     // To simplify merging, ensure that the first grouper is nonempty
     for (size_t i = 0; i < local_states_.size(); i++) {
       if (local_states_[i].grouper) {
```
```diff
@@ -500,11 +500,28 @@ class GroupByNode : public ExecNode {
     int64_t num_output_batches = bit_util::CeilDiv(out_data_.length, output_batch_size());
     outputs_[0]->InputFinished(this, static_cast<int>(num_output_batches));
-    RETURN_NOT_OK(plan_->query_context()->StartTaskGroup(output_task_group_id_,
-                                                         num_output_batches));
+    Status st =
+        plan_->query_context()->StartTaskGroup(output_task_group_id_, num_output_batches);
+    if (st.IsCancelled()) {
+      // This means the user has cancelled/aborted the plan. We will not send any batches
+      // and end immediately.
+      finished_.MarkFinished();
+      return Status::OK();
+    } else {
+      return st;
+    }
     return Status::OK();
   }

+  void OutputResult() {
+    // If something goes wrong outputting the result we need to make sure
+    // we still mark finished.
+    Status st = DoOutputResult();
+    if (!st.ok()) {
+      finished_.MarkFinished(st);
+    }
+  }
+
   void InputReceived(ExecNode* input, ExecBatch batch) override {
     EVENT(span_, "InputReceived", {{"batch.length", batch.length}});
     util::tracing::Span span;
```
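The `DoOutputResult`/`OutputResult` split above is the heart of the deadlock fix: the fallible body can fail before any output task ever runs, so the infallible wrapper guarantees `finished_` is completed either way. Here is a minimal sketch of that pattern, with `std::promise<void>` standing in for Arrow's `Future<>`; the names below are illustrative, not Arrow APIs.

```cpp
#include <future>
#include <iostream>
#include <stdexcept>

struct ToyNode {
  std::promise<void> finished;  // the plan blocks on finished.get_future()

  // Fallible body: pretend starting the output task group failed.
  bool DoOutputResult() { return false; /* "not ok" */ }

  // Infallible wrapper: on failure, complete `finished` with the error.
  // On success, the last output task would complete it instead.
  void OutputResult() {
    if (!DoOutputResult()) {
      finished.set_exception(
          std::make_exception_ptr(std::runtime_error("output failed")));
    }
  }
};

int main() {
  ToyNode node;
  auto fut = node.finished.get_future();
  node.OutputResult();
  try {
    fut.get();  // returns or throws, but never hangs
  } catch (const std::exception& e) {
    std::cout << "plan finished with error: " << e.what() << "\n";
  }
}
```

Without the wrapper, a failure in `StartTaskGroup` would leave `finished_` forever pending and anything waiting on the plan would hang, which is the potential deadlock the PR title refers to.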
```diff
@@ -521,7 +538,7 @@ class GroupByNode : public ExecNode {
     if (ErrorIfNotOk(Consume(ExecSpan(batch)))) return;

     if (input_counter_.Increment()) {
-      ErrorIfNotOk(OutputResult());
+      OutputResult();
     }
   }
```

Member: So we don't want to propagate the error back anymore? (I suppose that's all going away anyways?)

Member (Author): Yes. At the moment, the only thing that handles this error is the sink node, and all it will do is trigger an abort, which will end up coming back here and marking this finished anyways. That used to be needed because it was the only way to trigger an abort on a failure. Now, with the async scheduler, any failed task will trigger an abort. Furthermore, since …
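The reply above describes the new error-handling contract: a node no longer needs to bubble its error up through `ErrorIfNotOk` because any failed task aborts the plan. Below is a toy model of that control flow, assuming only the behavior stated in the comment; the `Status` struct and `ToyScheduler` are illustrative stand-ins, not Arrow's AsyncTaskScheduler API.

```cpp
#include <functional>
#include <iostream>
#include <string>
#include <vector>

// Toy status type, just enough to distinguish success from failure.
struct Status {
  bool ok;
  std::string msg;
};

class ToyScheduler {
 public:
  void AddTask(std::function<Status()> task) { tasks_.push_back(std::move(task)); }

  // Run tasks in order; the first failure aborts everything that follows,
  // so an individual node never needs to propagate its error upstream.
  Status RunAll() {
    for (auto& task : tasks_) {
      Status st = task();
      if (!st.ok) return st;  // abort: remaining tasks never run
    }
    return {true, ""};
  }

 private:
  std::vector<std::function<Status()>> tasks_;
};

int main() {
  ToyScheduler scheduler;
  scheduler.AddTask([] { return Status{true, ""}; });
  scheduler.AddTask([] { return Status{false, "task failed"}; });
  scheduler.AddTask([] {
    std::cout << "never runs\n";
    return Status{true, ""};
  });
  Status st = scheduler.RunAll();
  std::cout << (st.ok ? "plan ok" : "aborted: " + st.msg) << "\n";
}
```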
```diff
@@ -542,7 +559,7 @@ class GroupByNode : public ExecNode {
     DCHECK_EQ(input, inputs_[0]);

     if (input_counter_.SetTotal(total_batches)) {
-      ErrorIfNotOk(OutputResult());
+      OutputResult();
     }
   }
```
```diff
@@ -551,7 +568,6 @@ class GroupByNode : public ExecNode {
                        {{"node.label", label()},
                         {"node.detail", ToString()},
                         {"node.kind", kind_name()}});
-
     local_states_.resize(plan_->query_context()->max_concurrency());
     return Status::OK();
   }
```
```diff
@@ -570,7 +586,9 @@ class GroupByNode : public ExecNode {
     EVENT(span_, "StopProducing");
     DCHECK_EQ(output, outputs_[0]);

-    if (input_counter_.Cancel()) finished_.MarkFinished();
+    if (input_counter_.Cancel()) {
+      finished_.MarkFinished();
+    }
     inputs_[0]->StopProducing(this);
   }
```
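All three completion paths (`Increment` in `InputReceived`, `SetTotal` in `InputFinished`, and `Cancel` in `StopProducing`) funnel through `input_counter_`, so exactly one of them marks the node finished. A rough sketch of such a counter, loosely modeled on this usage; the details here are illustrative, not Arrow's actual internal `AtomicCounter`.

```cpp
#include <atomic>
#include <iostream>

class AtomicCounter {
 public:
  // Each of Increment, SetTotal, and Cancel returns true for at most one
  // caller overall; that caller is the one responsible for marking the
  // node finished, so MarkFinished runs exactly once.
  bool Increment() {
    int count = count_.fetch_add(1) + 1;
    int total = total_.load();
    return total != kUnset && count == total && !complete_.exchange(true);
  }
  bool SetTotal(int total) {
    total_.store(total);
    return count_.load() == total && !complete_.exchange(true);
  }
  bool Cancel() { return !complete_.exchange(true); }

 private:
  static constexpr int kUnset = -1;
  std::atomic<int> count_{0};
  std::atomic<int> total_{kUnset};
  std::atomic<bool> complete_{false};
};

int main() {
  AtomicCounter counter;
  counter.SetTotal(2);                          // no batches yet: false
  counter.Increment();                          // 1 of 2: false
  bool last = counter.Increment();              // 2 of 2: true, output now
  std::cout << std::boolalpha << last << "\n";  // true
  std::cout << counter.Cancel() << "\n";        // already complete: false
}
```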
Member (on the `if (st.IsCancelled())` block in `DoOutputResult`): nit: this if-else seems redundant, since OutputResult will MarkFinished even if we don't do this?

Member (Author): This is translating a failed status into a successful status (the test expects a plan to succeed after calling StopProducing).

Member (Author): The else condition propagates the failed status.
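To make the exchange concrete, here is a sketch of the translation under discussion. The tiny `Status` type and the result parameter are stand-ins for `arrow::Status` and the value returned by `QueryContext::StartTaskGroup`; both are assumptions for illustration.

```cpp
#include <iostream>

enum class Code { kOk, kCancelled, kIoError };

struct Status {
  Code code = Code::kOk;
  bool ok() const { return code == Code::kOk; }
  bool IsCancelled() const { return code == Code::kCancelled; }
};

Status DoOutputResult(Status task_group_result) {
  Status st = task_group_result;  // result of starting the output task group
  if (st.IsCancelled()) {
    // The user aborted the plan (e.g. via StopProducing).  That is not a
    // failure of the plan itself, so translate it into success; the test
    // expects a plan to succeed after StopProducing is called.
    return Status{Code::kOk};
  } else {
    // Any real error is propagated unchanged; this branch is why the
    // if/else is not redundant.
    return st;
  }
}

int main() {
  std::cout << std::boolalpha
            << DoOutputResult(Status{Code::kCancelled}).ok() << "\n"  // true
            << DoOutputResult(Status{Code::kIoError}).ok() << "\n";   // false
}
```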