diff --git a/presto-native-execution/presto_cpp/main/PrestoTask.cpp b/presto-native-execution/presto_cpp/main/PrestoTask.cpp index ae84bff7ded87..44fac67de0785 100644 --- a/presto-native-execution/presto_cpp/main/PrestoTask.cpp +++ b/presto-native-execution/presto_cpp/main/PrestoTask.cpp @@ -590,10 +590,20 @@ void PrestoTask::updateExecutionInfoLocked( prestoTaskStats.outputPositions = 0; prestoTaskStats.outputDataSizeInBytes = 0; - prestoTaskStats.queuedDrivers = veloxTaskStats.numQueuedDrivers; - prestoTaskStats.totalDrivers = veloxTaskStats.numTotalDrivers; + // Presto Java reports the number of drivers as the number of splits in the + // Presto UI because splits and drivers map 1-to-1 there. This is not true in + // Prestissimo, where one driver handles many splits. To quickly unblock + // developers who need to see correct split progress in Prestissimo's + // coordinator UI, we report split counts as the total, queued, and completed + // driver counts to indicate the progress of the query. The number of running + // drivers is passed through as is so the UI shows a proper running count. + // + // TODO: We should really extend the API (protocol::TaskStats and Presto + // coordinator UI) to have splits information as a proper fix. 
+ prestoTaskStats.totalDrivers = veloxTaskStats.numTotalSplits; + prestoTaskStats.queuedDrivers = veloxTaskStats.numQueuedSplits; prestoTaskStats.runningDrivers = veloxTaskStats.numRunningDrivers; - prestoTaskStats.completedDrivers = veloxTaskStats.numCompletedDrivers; + prestoTaskStats.completedDrivers = veloxTaskStats.numFinishedSplits; prestoTaskStats.pipelines.resize(veloxTaskStats.pipelineStats.size()); for (int i = 0; i < veloxTaskStats.pipelineStats.size(); ++i) { diff --git a/presto-native-execution/presto_cpp/main/types/PrestoToVeloxConnector.cpp b/presto-native-execution/presto_cpp/main/types/PrestoToVeloxConnector.cpp index faa4f93eaf67a..a9a6f21ed25f5 100644 --- a/presto-native-execution/presto_cpp/main/types/PrestoToVeloxConnector.cpp +++ b/presto-native-execution/presto_cpp/main/types/PrestoToVeloxConnector.cpp @@ -1109,13 +1109,15 @@ HivePrestoToVeloxConnector::toVeloxSplit( for (const auto& [key, value] : hiveSplit->storage.serdeParameters) { serdeParameters[key] = value; } - std::unordered_map infoColumns; - infoColumns.reserve(2); - infoColumns.insert( - {"$file_size", std::to_string(hiveSplit->fileSplit.fileSize)}); - infoColumns.insert( + std::unordered_map infoColumns = { + {"$path", hiveSplit->fileSplit.path}, + {"$file_size", std::to_string(hiveSplit->fileSplit.fileSize)}, {"$file_modified_time", - std::to_string(hiveSplit->fileSplit.fileModifiedTime)}); + std::to_string(hiveSplit->fileSplit.fileModifiedTime)}, + }; + if (hiveSplit->tableBucketNumber) { + infoColumns["$bucket"] = std::to_string(*hiveSplit->tableBucketNumber); + } auto veloxSplit = std::make_unique( catalogId, diff --git a/presto-native-execution/presto_cpp/main/types/PrestoToVeloxQueryPlan.cpp b/presto-native-execution/presto_cpp/main/types/PrestoToVeloxQueryPlan.cpp index 9d32452444625..28ca974c5b6ad 100644 --- a/presto-native-execution/presto_cpp/main/types/PrestoToVeloxQueryPlan.cpp +++ b/presto-native-execution/presto_cpp/main/types/PrestoToVeloxQueryPlan.cpp @@ -606,6 
+606,29 @@ core::PlanNodePtr VeloxQueryPlanConverterBase::toVeloxQueryPlan( left->outputType())); } + // For ScanFilter and ScanFilterProject, the planner sometimes puts the + // remaining filter in a FilterNode after the TableScan. We need to put it + // back into the TableScan so that Velox can leverage it to do stripe-level + // skipping. Otherwise we only get row-level skipping and lose some + // optimization opportunities in case of very low selectivity. + if (auto tableScan = std::dynamic_pointer_cast( + node->source)) { + if (auto* tableLayout = dynamic_cast( + tableScan->table.connectorTableLayout.get())) { + auto remainingFilter = + exprConverter_.toVeloxExpr(tableLayout->remainingPredicate); + if (auto* constant = dynamic_cast( + remainingFilter.get())) { + bool value = constant->value().value(); + // We should get an empty values node instead of a table scan if the + // remaining filter is constantly false. + VELOX_CHECK(value, "Unexpected always-false remaining predicate"); + tableLayout->remainingPredicate = node->predicate; + return toVeloxQueryPlan(tableScan, tableWriteInfo, taskId); + } + } + } + return std::make_shared( node->id, exprConverter_.toVeloxExpr(node->predicate), diff --git a/presto-native-execution/presto_cpp/main/types/tests/PlanConverterTest.cpp b/presto-native-execution/presto_cpp/main/types/tests/PlanConverterTest.cpp index b72814044fb0f..a29db9f97b17c 100644 --- a/presto-native-execution/presto_cpp/main/types/tests/PlanConverterTest.cpp +++ b/presto-native-execution/presto_cpp/main/types/tests/PlanConverterTest.cpp @@ -143,6 +143,10 @@ TEST_F(PlanConverterTest, scanAgg) { ASSERT_EQ( tableHandle->dataColumns()->toString(), "ROW>>,comment:VARCHAR>"); + ASSERT_TRUE(tableHandle->remainingFilter()); + ASSERT_EQ( + tableHandle->remainingFilter()->toString(), + "presto.default.lt(presto.default.rand(),0.0001)"); auto tableParameters = tableHandle->tableParameters(); ASSERT_EQ(tableParameters.size(), 6); diff --git 
a/presto-native-execution/presto_cpp/main/types/tests/PrestoToVeloxSplitTest.cpp b/presto-native-execution/presto_cpp/main/types/tests/PrestoToVeloxSplitTest.cpp index 5e3a8f98c295a..bdb08da2d6eb7 100644 --- a/presto-native-execution/presto_cpp/main/types/tests/PrestoToVeloxSplitTest.cpp +++ b/presto-native-execution/presto_cpp/main/types/tests/PrestoToVeloxSplitTest.cpp @@ -157,6 +157,8 @@ TEST_F(PrestoToVeloxSplitTest, bucketConversion) { ASSERT_EQ(veloxHiveSplit.bucketConversion->tableBucketCount, 4096); ASSERT_EQ(veloxHiveSplit.bucketConversion->partitionBucketCount, 512); ASSERT_EQ(veloxHiveSplit.bucketConversion->bucketColumnHandles.size(), 1); + ASSERT_EQ(veloxHiveSplit.infoColumns.at("$path"), hiveSplit.fileSplit.path); + ASSERT_EQ(veloxHiveSplit.infoColumns.at("$bucket"), "42"); auto& veloxColumn = veloxHiveSplit.bucketConversion->bucketColumnHandles[0]; ASSERT_EQ(veloxColumn->name(), "c0"); ASSERT_EQ(*veloxColumn->dataType(), *BIGINT()); diff --git a/presto-native-execution/presto_cpp/main/types/tests/data/ScanAgg.json b/presto-native-execution/presto_cpp/main/types/tests/data/ScanAgg.json index cdda8bdb383d2..1033e8a2617ff 100644 --- a/presto-native-execution/presto_cpp/main/types/tests/data/ScanAgg.json +++ b/presto-native-execution/presto_cpp/main/types/tests/data/ScanAgg.json @@ -7,166 +7,209 @@ "@type":".ProjectNode", "id":"1", "source":{ - "@type":".TableScanNode", - "id":"0", - "table":{ - "connectorId":"hive", - "connectorHandle":{ - "@type":"hive", - "schemaName":"tpch", - "tableName":"nation" - }, - "transaction":{ - "@type":"hive", - "uuid":"7cc96264-a0fa-45e4-9042-62754ac3a5a0" - }, - "connectorTableLayout":{ - "@type":"hive", - "schemaTableName":{ - "schema":"tpch", - "table":"nation" + "@type" : ".FilterNode", + "id" : "449", + "source":{ + "@type":".TableScanNode", + "id":"0", + "table":{ + "connectorId":"hive", + "connectorHandle":{ + "@type":"hive", + "schemaName":"tpch", + "tableName":"nation" + }, + "transaction":{ + "@type":"hive", + 
"uuid":"7cc96264-a0fa-45e4-9042-62754ac3a5a0" }, - "tablePath":"a/path/to/a/table", - "partitionColumns":[ + "connectorTableLayout":{ + "@type":"hive", + "schemaTableName":{ + "schema":"tpch", + "table":"nation" + }, + "tablePath":"a/path/to/a/table", + "partitionColumns":[ - ], - "dataColumns":[ - { - "name":"nationkey", - "type":"bigint" + ], + "dataColumns":[ + { + "name":"nationkey", + "type":"bigint" + }, + { + "name":"name", + "type":"varchar(25)" + }, + { + "name":"regionkey", + "type":"bigint" + }, + { + "name":"complex_type", + "type":"array>>" + }, + { + "name":"comment", + "type":"varchar(152)" + } + ], + "tableParameters":{ + "presto_version":"testversion", + "presto_query_id":"20200908_214711_00000_7xpqg", + "numFiles":"1", + "numRows":"25", + "rawDataSize":"2734", + "totalSize":"1451" }, - { - "name":"name", - "type":"varchar(25)" + "domainPredicate":{ + "columnDomains":[ + { + "column":"psudo_bool_column", + "domain":{ + "values":{ + "@type":"sortable", + "type":"boolean", + "ranges":[ + { + "low":{ + "type":"boolean", + "bound":"ABOVE" + }, + "high":{ + "type":"boolean", + "valueBlock":"CgAAAEJZVEVfQVJSQVkBAAAAAAE=", + "bound":"BELOW" + } + }, + { + "low":{ + "type":"boolean", + "valueBlock":"CgAAAEJZVEVfQVJSQVkBAAAAAAE=", + "bound":"ABOVE" + }, + "high":{ + "type":"boolean", + "bound":"BELOW" + } + } + ] + }, + "nullAllowed":false + } + } + ] }, - { - "name":"regionkey", - "type":"bigint" + "remainingPredicate":{ + "@type":"constant", + "valueBlock":"CgAAAEJZVEVfQVJSQVkBAAAAAAE=", + "type":"boolean" }, - { - "name":"complex_type", - "type":"array>>" + "predicateColumns":{ + }, - { - "name":"comment", - "type":"varchar(152)" - } - ], - "tableParameters":{ - "presto_version":"testversion", - "presto_query_id":"20200908_214711_00000_7xpqg", - "numFiles":"1", - "numRows":"25", - "rawDataSize":"2734", - "totalSize":"1451" - }, - "domainPredicate":{ - "columnDomains":[ + "partitionColumnPredicate":{ + "columnDomains":[ + + ] + }, + 
"pushdownFilterEnabled":true, + "layoutString":"tpch.nation{}", + "requestedColumns":[ { - "column":"psudo_bool_column", - "domain":{ - "values":{ - "@type":"sortable", - "type":"boolean", - "ranges":[ - { - "low":{ - "type":"boolean", - "bound":"ABOVE" - }, - "high":{ - "type":"boolean", - "valueBlock":"CgAAAEJZVEVfQVJSQVkBAAAAAAE=", - "bound":"BELOW" - } - }, - { - "low":{ - "type":"boolean", - "valueBlock":"CgAAAEJZVEVfQVJSQVkBAAAAAAE=", - "bound":"ABOVE" - }, - "high":{ - "type":"boolean", - "bound":"BELOW" - } - } - ] - }, - "nullAllowed":false - } - } - ] - }, - "remainingPredicate":{ - "@type":"constant", - "valueBlock":"CgAAAEJZVEVfQVJSQVkBAAAAAAE=", - "type":"boolean" - }, - "predicateColumns":{ + "@type":"hive", + "name":"regionkey", + "hiveType":"bigint", + "typeSignature":"bigint", + "hiveColumnIndex":2, + "columnType":"REGULAR", + "requiredSubfields":[ + ] + } + ], + "partialAggregationsPushedDown":false, + "appendRowNumber":false, + "footerStatsUnreliable":false + } + }, + "outputVariables":[ + { + "@type":"variable", + "name":"regionkey", + "type":"bigint" }, - "partitionColumnPredicate":{ - "columnDomains":[ + { + "@type":"variable", + "name":"complex_type", + "type":"array(map(varchar, row(id bigint, description varchar)))" + } + ], + "assignments":{ + "regionkey":{ + "@type":"hive", + "name":"regionkey", + "hiveType":"bigint", + "typeSignature":"bigint", + "hiveColumnIndex":2, + "columnType":"REGULAR", + "requiredSubfields":[ ] }, - "pushdownFilterEnabled":true, - "layoutString":"tpch.nation{}", - "requestedColumns":[ - { - "@type":"hive", - "name":"regionkey", - "hiveType":"bigint", - "typeSignature":"bigint", - "hiveColumnIndex":2, - "columnType":"REGULAR", - "requiredSubfields":[ - - ] - } - ], - "partialAggregationsPushedDown":false, - "appendRowNumber":false, - "footerStatsUnreliable":false + "complex_type":{ + "@type":"hive", + "name":"complex_type", + "hiveType":"array>>", + "typeSignature":"array(map(varchar, row(id bigint, description 
varchar)))", + "hiveColumnIndex":3, + "columnType":"REGULAR", + "requiredSubfields":[ + "complex_type[1][\"foo\"].id", + "complex_type[2][\"bar\"].id" + ] + } } }, - "outputVariables":[ - { - "@type":"variable", - "name":"regionkey", - "type":"bigint" - }, - { - "@type":"variable", - "name":"complex_type", - "type":"array(map(varchar, row(id bigint, description varchar)))" - } - ], - "assignments":{ - "regionkey":{ - "@type":"hive", - "name":"regionkey", - "hiveType":"bigint", - "typeSignature":"bigint", - "hiveColumnIndex":2, - "columnType":"REGULAR", - "requiredSubfields":[ - - ] + "predicate": { + "@type" : "call", + "displayName" : "LESS_THAN", + "functionHandle" : { + "@type" : "$static", + "signature" : { + "name" : "presto.default.$operator$less_than", + "kind" : "SCALAR", + "typeVariableConstraints" : [ ], + "longVariableConstraints" : [ ], + "returnType" : "boolean", + "argumentTypes" : [ "double", "double" ], + "variableArity" : false + } }, - "complex_type":{ - "@type":"hive", - "name":"complex_type", - "hiveType":"array>>", - "typeSignature":"array(map(varchar, row(id bigint, description varchar)))", - "hiveColumnIndex":3, - "columnType":"REGULAR", - "requiredSubfields":[ - "complex_type[1][\"foo\"].id", - "complex_type[2][\"bar\"].id" - ] - } + "returnType" : "boolean", + "arguments" : [ { + "@type" : "call", + "displayName" : "rand", + "functionHandle" : { + "@type" : "$static", + "signature" : { + "name" : "presto.default.rand", + "kind" : "SCALAR", + "typeVariableConstraints" : [ ], + "longVariableConstraints" : [ ], + "returnType" : "double", + "argumentTypes" : [ ], + "variableArity" : false + } + }, + "returnType" : "double", + "arguments" : [ ] + }, { + "@type" : "constant", + "valueBlock" : "CgAAAExPTkdfQVJSQVkBAAAAAC1DHOviNho/", + "type" : "double" + } ] } }, "assignments":{