-
Notifications
You must be signed in to change notification settings - Fork 1.4k
feat: Add HiveTableHandle::columnHandles #15351
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -147,7 +147,8 @@ class HiveTableHandle : public ConnectorTableHandle { | |
| common::SubfieldFilters subfieldFilters, | ||
| const core::TypedExprPtr& remainingFilter, | ||
| const RowTypePtr& dataColumns = nullptr, | ||
| const std::unordered_map<std::string, std::string>& tableParameters = {}); | ||
| const std::unordered_map<std::string, std::string>& tableParameters = {}, | ||
| std::vector<HiveColumnHandlePtr> columnHandles = {}); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Would you document this new argument?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I added the documentation on the accessor method below |
||
|
|
||
| const std::string& tableName() const { | ||
| return tableName_; | ||
|
|
@@ -157,27 +158,47 @@ class HiveTableHandle : public ConnectorTableHandle { | |
| return tableName(); | ||
| } | ||
|
|
||
| bool isFilterPushdownEnabled() const { | ||
| [[deprecated]] bool isFilterPushdownEnabled() const { | ||
| return filterPushdownEnabled_; | ||
| } | ||
|
|
||
| /// Single field filters that can be applied efficiently during file reading. | ||
| const common::SubfieldFilters& subfieldFilters() const { | ||
| return subfieldFilters_; | ||
| } | ||
|
|
||
| /// Everything else that cannot be converted into subfield filters, but still | ||
| /// require the data source to filter out. This is usually less efficient | ||
| /// than subfield filters but supports arbitrary boolean expression. | ||
| const core::TypedExprPtr& remainingFilter() const { | ||
| return remainingFilter_; | ||
| } | ||
|
|
||
| // Schema of the table. Need this for reading TEXTFILE. | ||
| /// Subset of schema of the table that we store in file (i.e., | ||
| /// non-partitioning columns). This must be in the exact order as columns in | ||
| /// file (except trailing columns), but with the table schema during read | ||
| /// time. | ||
| /// | ||
| /// This is needed for multiple purposes, including reading TEXTFILE and | ||
| /// handling schema evolution. | ||
| const RowTypePtr& dataColumns() const { | ||
| return dataColumns_; | ||
| } | ||
|
|
||
| /// Extra parameters to pass down to file format reader layer. Keys should be | ||
| /// in dwio::common::TableParameter. | ||
| const std::unordered_map<std::string, std::string>& tableParameters() const { | ||
| return tableParameters_; | ||
| } | ||
|
|
||
| /// Full schema including partitioning columns and data columns. If this is | ||
| /// non-empty, it should be a superset of the column handles in data source | ||
| /// assignments parameter (the shared_ptrs should point to the exact same | ||
| /// objects). | ||
| const std::vector<HiveColumnHandlePtr> columnHandles() const { | ||
| return columnHandles_; | ||
| } | ||
|
|
||
| std::string toString() const override; | ||
|
|
||
| folly::dynamic serialize() const override; | ||
|
|
@@ -195,6 +216,7 @@ class HiveTableHandle : public ConnectorTableHandle { | |
| const core::TypedExprPtr remainingFilter_; | ||
| const RowTypePtr dataColumns_; | ||
| const std::unordered_map<std::string, std::string> tableParameters_; | ||
| const std::vector<HiveColumnHandlePtr> columnHandles_; | ||
| }; | ||
|
|
||
| using HiveTableHandlePtr = std::shared_ptr<const HiveTableHandle>; | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6143,5 +6143,31 @@ TEST_F(TableScanTest, parallelUnitLoader) { | |
| ASSERT_GT(stats.count("waitForUnitReadyNanos"), 0); | ||
| } | ||
|
|
||
| TEST_F(TableScanTest, allColumnHandles) { | ||
| auto data = makeVectors(1, 10, ROW({"a", "b"}, BIGINT())); | ||
| auto filePath = TempFilePath::create(); | ||
| writeToFile(filePath->getPath(), data); | ||
| std::vector<HiveColumnHandlePtr> columnHandles = { | ||
| partitionKey("ds", VARCHAR()), | ||
| regularColumn("a", BIGINT()), | ||
| regularColumn("b", BIGINT()), | ||
| }; | ||
| connector::ColumnHandleMap assignments = {{"x", columnHandles[1]}}; | ||
| auto split = exec::test::HiveConnectorSplitBuilder(filePath->getPath()) | ||
| .partitionKey("ds", "2025-10-23") | ||
| .build(); | ||
| auto plan = PlanBuilder() | ||
| .startTableScan() | ||
| .outputType(ROW({"x"}, {BIGINT()})) | ||
| .assignments(assignments) | ||
| .dataColumns(asRowType(data[0]->type())) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why do we need both dataColumns and columnHandles?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. dataColumns are needed for schema evolution purposes as well, because a column handle does not tell us the position of the column in the file. In this test case, though, we can probably skip it, but in production usage we always supply it.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It would be nice to document all pieces of information that go into HiveTableHandle.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I will beef up the comments on the |
||
| .columnHandles(columnHandles) | ||
| .remainingFilter("length(ds) + a % 2 > 0") | ||
| .endTableScan() | ||
| .planNode(); | ||
| AssertQueryBuilder(plan).split(split).assertResults( | ||
| makeRowVector({data[0]->childAt(0)})); | ||
| } | ||
|
|
||
| } // namespace | ||
| } // namespace facebook::velox::exec | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.
Would you add a comment to explain what this method is doing? Should we verify consistency between assignments and columnHandles? E.g. what if column x is reported as partition key in assignment and regular column in columnHandles?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, that's something I had in mind but did not add; let me add a very strict check for now since it's new.