-
Notifications
You must be signed in to change notification settings - Fork 4k
ARROW-10008: [C++][Dataset] Fix filtering/row group statistics of dict columns #8311
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1179,8 +1179,8 @@ struct TreeEvaluator::Impl { | |
| Result<Datum> kernel(const Datum& left, | ||
| const Datum& right, | ||
| ExecContext* ctx)) const { | ||
| ARROW_ASSIGN_OR_RAISE(auto lhs, Evaluate(*expr.left_operand())); | ||
| ARROW_ASSIGN_OR_RAISE(auto rhs, Evaluate(*expr.right_operand())); | ||
| ARROW_ASSIGN_OR_RAISE(Datum lhs, Evaluate(*expr.left_operand())); | ||
| ARROW_ASSIGN_OR_RAISE(Datum rhs, Evaluate(*expr.right_operand())); | ||
|
|
||
| if (lhs.is_scalar()) { | ||
| ARROW_ASSIGN_OR_RAISE( | ||
|
|
@@ -1200,7 +1200,7 @@ struct TreeEvaluator::Impl { | |
| } | ||
|
|
||
| Result<Datum> operator()(const NotExpression& expr) const { | ||
| ARROW_ASSIGN_OR_RAISE(auto to_invert, Evaluate(*expr.operand())); | ||
| ARROW_ASSIGN_OR_RAISE(Datum to_invert, Evaluate(*expr.operand())); | ||
| if (IsNullDatum(to_invert)) { | ||
| return NullDatum(); | ||
| } | ||
|
|
@@ -1214,7 +1214,7 @@ struct TreeEvaluator::Impl { | |
| } | ||
|
|
||
| Result<Datum> operator()(const InExpression& expr) const { | ||
| ARROW_ASSIGN_OR_RAISE(auto operand_values, Evaluate(*expr.operand())); | ||
| ARROW_ASSIGN_OR_RAISE(Datum operand_values, Evaluate(*expr.operand())); | ||
| if (IsNullDatum(operand_values)) { | ||
| return Datum(expr.set()->null_count() != 0); | ||
| } | ||
|
|
@@ -1224,7 +1224,7 @@ struct TreeEvaluator::Impl { | |
| } | ||
|
|
||
| Result<Datum> operator()(const IsValidExpression& expr) const { | ||
| ARROW_ASSIGN_OR_RAISE(auto operand_values, Evaluate(*expr.operand())); | ||
| ARROW_ASSIGN_OR_RAISE(Datum operand_values, Evaluate(*expr.operand())); | ||
| if (IsNullDatum(operand_values)) { | ||
| return Datum(false); | ||
| } | ||
|
|
@@ -1255,14 +1255,42 @@ struct TreeEvaluator::Impl { | |
| } | ||
|
|
||
| Result<Datum> operator()(const ComparisonExpression& expr) const { | ||
| ARROW_ASSIGN_OR_RAISE(auto lhs, Evaluate(*expr.left_operand())); | ||
| ARROW_ASSIGN_OR_RAISE(auto rhs, Evaluate(*expr.right_operand())); | ||
| ARROW_ASSIGN_OR_RAISE(Datum lhs, Evaluate(*expr.left_operand())); | ||
| ARROW_ASSIGN_OR_RAISE(Datum rhs, Evaluate(*expr.right_operand())); | ||
|
|
||
| if (IsNullDatum(lhs) || IsNullDatum(rhs)) { | ||
| return Datum(std::make_shared<BooleanScalar>()); | ||
| } | ||
|
|
||
| DCHECK(lhs.is_array()); | ||
| if (lhs.type()->id() == Type::DICTIONARY && rhs.type()->id() == Type::DICTIONARY) { | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @wesm What do you think about adding kernels to `scalar_compare.cc` which do this inside `compute::`?
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, this sounds fine, can you open a JIRA issue about it? |
||
| if (lhs.is_array() && rhs.is_array()) { | ||
| // decode dictionary arrays | ||
| for (Datum* arg : {&lhs, &rhs}) { | ||
| auto dict = checked_pointer_cast<DictionaryArray>(arg->make_array()); | ||
| ARROW_ASSIGN_OR_RAISE(*arg, compute::Take(dict->dictionary(), dict->indices(), | ||
| compute::TakeOptions::Defaults())); | ||
| } | ||
| } else if (lhs.is_array() || rhs.is_array()) { | ||
| auto dict = checked_pointer_cast<DictionaryArray>( | ||
| (lhs.is_array() ? lhs : rhs).make_array()); | ||
|
|
||
| ARROW_ASSIGN_OR_RAISE(auto scalar, checked_cast<const DictionaryScalar&>( | ||
| *(lhs.is_scalar() ? lhs : rhs).scalar()) | ||
| .GetEncodedValue()); | ||
| if (lhs.is_array()) { | ||
| lhs = dict->dictionary(); | ||
| rhs = std::move(scalar); | ||
| } else { | ||
| lhs = std::move(scalar); | ||
| rhs = dict->dictionary(); | ||
| } | ||
| ARROW_ASSIGN_OR_RAISE( | ||
| Datum out_dict, | ||
| compute::Compare(lhs, rhs, compute::CompareOptions(expr.op()), &ctx_)); | ||
|
|
||
| return compute::Take(out_dict, dict->indices(), compute::TakeOptions::Defaults()); | ||
| } | ||
| } | ||
|
|
||
| return compute::Compare(lhs, rhs, compute::CompareOptions(expr.op()), &ctx_); | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does this change behaviour? For a dictionary with string values, is field->type() string or dictionary?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`StatisticsAsScalars` returns scalars whose types are the correct physical type, so even if the column was `dictionary(string)`, min and max would be just `string` before this cast.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
(I.e., it only changes behavior in cases where the physical type wasn't appropriate.)