diff --git a/dbms/src/Debug/MockTiDB.cpp b/dbms/src/Debug/MockTiDB.cpp index 66e6ed36a22..e6d59fc6c20 100644 --- a/dbms/src/Debug/MockTiDB.cpp +++ b/dbms/src/Debug/MockTiDB.cpp @@ -133,7 +133,8 @@ DatabaseID MockTiDB::newDataBase(const String & database_name) return schema_id; } -TableID MockTiDB::newTable(const String & database_name, const String & table_name, const ColumnsDescription & columns, Timestamp tso) +TableID MockTiDB::newTable(const String & database_name, const String & table_name, + const ColumnsDescription & columns, Timestamp tso, const String & handle_pk_name) { std::lock_guard lock(tables_mutex); @@ -153,14 +154,21 @@ TableID MockTiDB::newTable(const String & database_name, const String & table_na table_info.db_name = database_name; table_info.id = table_id_allocator++; table_info.name = table_name; + table_info.pk_is_handle = false; int i = 1; for (auto & column : columns.getAllPhysical()) { table_info.columns.emplace_back(reverseGetColumnInfo(column, i++, Field())); + if (handle_pk_name == column.name) + { + if (!column.type->isInteger() && !column.type->isUnsignedInteger()) + throw Exception("MockTiDB pk column must be integer or unsigned integer type", ErrorCodes::LOGICAL_ERROR); + table_info.columns.back().setPriKeyFlag(); + table_info.pk_is_handle = true; + } } - table_info.pk_is_handle = false; table_info.comment = "Mocked."; table_info.update_timestamp = tso; diff --git a/dbms/src/Debug/MockTiDB.h b/dbms/src/Debug/MockTiDB.h index ed3d30a1929..20afa87144a 100644 --- a/dbms/src/Debug/MockTiDB.h +++ b/dbms/src/Debug/MockTiDB.h @@ -55,7 +55,8 @@ class MockTiDB : public ext::singleton using TablePtr = std::shared_ptr; public: - TableID newTable(const String & database_name, const String & table_name, const ColumnsDescription & columns, Timestamp tso); + TableID newTable(const String & database_name, const String & table_name, + const ColumnsDescription & columns, Timestamp tso, const String & handle_pk_name); DatabaseID newDataBase(const 
String & database_name); diff --git a/dbms/src/Debug/dbgFuncCoprocessor.cpp b/dbms/src/Debug/dbgFuncCoprocessor.cpp index 8b84a373963..3f46106f2cb 100644 --- a/dbms/src/Debug/dbgFuncCoprocessor.cpp +++ b/dbms/src/Debug/dbgFuncCoprocessor.cpp @@ -163,32 +163,35 @@ void compileExpr(const DAGSchema & input, ASTPtr ast, tipb::Expr * expr, std::un { expr->set_sig(tipb::ScalarFuncSig::EQInt); auto * ft = expr->mutable_field_type(); - // TODO: TiDB will infer Int64. - ft->set_tp(TiDB::TypeTiny); + ft->set_tp(TiDB::TypeLongLong); ft->set_flag(TiDB::ColumnFlagUnsigned); } else if (func_name_lowercase == "and") { expr->set_sig(tipb::ScalarFuncSig::LogicalAnd); auto * ft = expr->mutable_field_type(); - // TODO: TiDB will infer Int64. - ft->set_tp(TiDB::TypeTiny); + ft->set_tp(TiDB::TypeLongLong); ft->set_flag(TiDB::ColumnFlagUnsigned); } else if (func_name_lowercase == "or") { expr->set_sig(tipb::ScalarFuncSig::LogicalOr); auto * ft = expr->mutable_field_type(); - // TODO: TiDB will infer Int64. - ft->set_tp(TiDB::TypeTiny); + ft->set_tp(TiDB::TypeLongLong); ft->set_flag(TiDB::ColumnFlagUnsigned); } else if (func_name_lowercase == "greater") { expr->set_sig(tipb::ScalarFuncSig::GTInt); auto * ft = expr->mutable_field_type(); - // TODO: TiDB will infer Int64. 
- ft->set_tp(TiDB::TypeTiny); + ft->set_tp(TiDB::TypeLongLong); + ft->set_flag(TiDB::ColumnFlagUnsigned); + } + else if (func_name_lowercase == "greaterorequals") + { + expr->set_sig(tipb::ScalarFuncSig::GEInt); + auto *ft = expr->mutable_field_type(); + ft->set_tp(TiDB::TypeLongLong); ft->set_flag(TiDB::ColumnFlagUnsigned); } else diff --git a/dbms/src/Debug/dbgFuncMockTiDBTable.cpp b/dbms/src/Debug/dbgFuncMockTiDBTable.cpp index 879c2c7663e..41badcadb75 100644 --- a/dbms/src/Debug/dbgFuncMockTiDBTable.cpp +++ b/dbms/src/Debug/dbgFuncMockTiDBTable.cpp @@ -25,13 +25,17 @@ extern const int LOGICAL_ERROR; void MockTiDBTable::dbgFuncMockTiDBTable(Context & context, const ASTs & args, DBGInvoker::Printer output) { - if (args.size() != 3) - throw Exception("Args not matched, should be: database-name, table-name, schema-string", ErrorCodes::BAD_ARGUMENTS); + if (args.size() != 3 && args.size() != 4) + throw Exception("Args not matched, should be: database-name, table-name, schema-string [, handle_pk_name]", ErrorCodes::BAD_ARGUMENTS); const String & database_name = typeid_cast(*args[0]).name; const String & table_name = typeid_cast(*args[1]).name; auto schema_str = safeGet(typeid_cast(*args[2]).value); + String handle_pk_name = ""; + if (args.size() == 4) + handle_pk_name = safeGet(typeid_cast(*args[3]).value); + ASTPtr columns_ast; ParserColumnDeclarationList schema_parser; Tokens tokens(schema_str.data(), schema_str.data() + schema_str.length()); @@ -43,7 +47,7 @@ void MockTiDBTable::dbgFuncMockTiDBTable(Context & context, const ASTs & args, D = InterpreterCreateQuery::getColumnsDescription(typeid_cast(*columns_ast), context); auto tso = context.getTMTContext().getPDClient()->getTS(); - TableID table_id = MockTiDB::instance().newTable(database_name, table_name, columns, tso); + TableID table_id = MockTiDB::instance().newTable(database_name, table_name, columns, tso, handle_pk_name); std::stringstream ss; ss << "mock table #" << table_id; diff --git 
a/dbms/src/Debug/dbgTools.cpp b/dbms/src/Debug/dbgTools.cpp index 2ac2479b225..77c9e0c50da 100644 --- a/dbms/src/Debug/dbgTools.cpp +++ b/dbms/src/Debug/dbgTools.cpp @@ -237,7 +237,7 @@ Field convertField(const ColumnInfo & column_info, const Field & field) void encodeRow(const TiDB::TableInfo & table_info, const std::vector & fields, std::stringstream & ss) { - if (table_info.columns.size() != fields.size()) + if (table_info.columns.size() != fields.size() + table_info.pk_is_handle) throw Exception("Encoding row has different sizes between columns and values", ErrorCodes::LOGICAL_ERROR); for (size_t i = 0; i < fields.size(); i++) { @@ -261,7 +261,7 @@ void insert(const TiDB::TableInfo & table_info, RegionID region_id, HandleID han fields.emplace_back(field); idx++; } - if (fields.size() != table_info.columns.size()) + if (fields.size() + table_info.pk_is_handle != table_info.columns.size()) throw Exception("Number of insert values and columns do not match.", ErrorCodes::LOGICAL_ERROR); TMTContext & tmt = context.getTMTContext(); diff --git a/dbms/src/Flash/Coprocessor/DAGExpressionAnalyzer.cpp b/dbms/src/Flash/Coprocessor/DAGExpressionAnalyzer.cpp index fe0eb24c081..4dd5e48a4a9 100644 --- a/dbms/src/Flash/Coprocessor/DAGExpressionAnalyzer.cpp +++ b/dbms/src/Flash/Coprocessor/DAGExpressionAnalyzer.cpp @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include #include @@ -214,7 +216,7 @@ void constructTZExpr(tipb::Expr & tz_expr, const tipb::DAGRequest & rqst, bool f } } -bool hasMeaningfulTZInfo(const tipb::DAGRequest &rqst) +bool hasMeaningfulTZInfo(const tipb::DAGRequest & rqst) { if (rqst.has_time_zone_name() && rqst.time_zone_name().length() > 0) return rqst.time_zone_name() != "UTC"; @@ -249,7 +251,7 @@ String DAGExpressionAnalyzer::appendTimeZoneCast( // column with UTC timezone will never be used in during agg), all the column with ts datatype will // convert back to UTC timezone bool DAGExpressionAnalyzer::appendTimeZoneCastsAfterTS( - 
ExpressionActionsChain &chain, std::vector is_ts_column, const tipb::DAGRequest &rqst) + ExpressionActionsChain & chain, std::vector is_ts_column, const tipb::DAGRequest & rqst) { if (!hasMeaningfulTZInfo(rqst)) return false; @@ -391,6 +393,35 @@ String DAGExpressionAnalyzer::appendCastIfNeeded(const tipb::Expr & expr, Expres return expr_name; } +void DAGExpressionAnalyzer::makeExplicitSetForIndex(const tipb::Expr & expr, const TMTStoragePtr & storage) +{ + for (auto & child : expr.children()) + { + makeExplicitSetForIndex(child, storage); + } + if (expr.tp() != tipb::ExprType::ScalarFunc) + { + return; + } + const String & func_name = getFunctionName(expr); + // only support col_name in (value_list) + if (isInOrGlobalInOperator(func_name) && expr.children(0).tp() == tipb::ExprType::ColumnRef && !prepared_sets.count(&expr)) + { + NamesAndTypesList column_list; + for (const auto & col : getCurrentInputColumns()) + { + column_list.emplace_back(col.name, col.type); + } + ExpressionActionsPtr temp_actions = std::make_shared(column_list, settings); + String name = getActions(expr.children(0), temp_actions); + ASTPtr name_ast = std::make_shared(name); + if (storage->mayBenefitFromIndexForIn(name_ast)) + { + makeExplicitSet(expr, temp_actions->getSampleBlock(), true, name); + } + } +} + void DAGExpressionAnalyzer::makeExplicitSet( const tipb::Expr & expr, const Block & sample_block, bool create_ordered_set, const String & left_arg_name) { @@ -400,7 +431,7 @@ void DAGExpressionAnalyzer::makeExplicitSet( } DataTypes set_element_types; // todo support tuple in, i.e. 
(a,b) in ((1,2), (3,4)), currently TiDB convert tuple in into a series of or/and/eq exprs - // which means tuple in is never be pushed to coprocessor, but it is quite in-efficient + // which means tuple in is never pushed to coprocessor, but it is quite inefficient set_element_types.push_back(sample_block.getByName(left_arg_name).type); // todo if this is a single value in, then convert it to equal expr diff --git a/dbms/src/Flash/Coprocessor/DAGExpressionAnalyzer.h b/dbms/src/Flash/Coprocessor/DAGExpressionAnalyzer.h index 1486783d467..1b5b65f0ff0 100644 --- a/dbms/src/Flash/Coprocessor/DAGExpressionAnalyzer.h +++ b/dbms/src/Flash/Coprocessor/DAGExpressionAnalyzer.h @@ -9,6 +9,7 @@ #include #include #include +#include namespace DB { @@ -60,11 +61,12 @@ class DAGExpressionAnalyzer : private boost::noncopyable String getActions(const tipb::Expr & expr, ExpressionActionsPtr & actions); const std::vector & getCurrentInputColumns(); void makeExplicitSet(const tipb::Expr & expr, const Block & sample_block, bool create_ordered_set, const String & left_arg_name); + void makeExplicitSetForIndex(const tipb::Expr & expr, const TMTStoragePtr & storage); String applyFunction(const String & func_name, Names & arg_names, ExpressionActionsPtr & actions); Int32 getImplicitCastCount() { return implicit_cast_count; }; - bool appendTimeZoneCastsAfterTS(ExpressionActionsChain &chain, std::vector is_ts_column, - const tipb::DAGRequest &rqst); + bool appendTimeZoneCastsAfterTS(ExpressionActionsChain & chain, std::vector is_ts_column, const tipb::DAGRequest & rqst); String appendTimeZoneCast(const String & tz_col, const String & ts_col, const String & func_name, ExpressionActionsPtr & actions); + DAGPreparedSets getPreparedSets() { return prepared_sets; } }; } // namespace DB diff --git a/dbms/src/Flash/Coprocessor/DAGQueryInfo.h b/dbms/src/Flash/Coprocessor/DAGQueryInfo.h new file mode 100644 index 00000000000..20274503782 --- /dev/null +++ 
b/dbms/src/Flash/Coprocessor/DAGQueryInfo.h @@ -0,0 +1,23 @@ +#pragma once + +#include + +#include +#include + +namespace DB +{ + +struct DAGQueryInfo +{ + DAGQueryInfo(const DAGQuerySource & dag_, DAGPreparedSets dag_sets_, std::vector & source_columns_) + : dag(dag_), dag_sets(std::move(dag_sets_)) + { + for (auto & c : source_columns_) + source_columns.emplace_back(c.name, c.type); + }; + const DAGQuerySource & dag; + DAGPreparedSets dag_sets; + NamesAndTypesList source_columns; +}; +} // namespace DB diff --git a/dbms/src/Flash/Coprocessor/InterpreterDAG.cpp b/dbms/src/Flash/Coprocessor/InterpreterDAG.cpp index d17f2f3e13c..a0029c04114 100644 --- a/dbms/src/Flash/Coprocessor/InterpreterDAG.cpp +++ b/dbms/src/Flash/Coprocessor/InterpreterDAG.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -208,9 +209,18 @@ void InterpreterDAG::executeTS(const tipb::TableScan & ts, Pipeline & pipeline) if (!checkKeyRanges(dag.getKeyRanges(), table_id, storage->pkIsUInt64())) throw Exception("Cop request only support full range scan for given region", ErrorCodes::COP_BAD_DAG_REQUEST); + if (dag.hasSelection()) + { + for (auto & condition : dag.getSelection().conditions()) + { + analyzer->makeExplicitSetForIndex(condition, storage); + } + } //todo support index in SelectQueryInfo query_info; + // set query to avoid unexpected NPE query_info.query = dag.getAST(); + query_info.dag_query = std::make_unique(dag, analyzer->getPreparedSets(), source_columns); query_info.mvcc_query_info = std::make_unique(); query_info.mvcc_query_info->resolve_locks = true; query_info.mvcc_query_info->read_tso = settings.read_tso; diff --git a/dbms/src/Storages/MergeTree/KeyCondition.cpp b/dbms/src/Storages/MergeTree/KeyCondition.cpp index de7797f6063..09e367c449e 100644 --- a/dbms/src/Storages/MergeTree/KeyCondition.cpp +++ b/dbms/src/Storages/MergeTree/KeyCondition.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -7,6 +8,7 @@ #include 
#include #include +#include #include #include #include @@ -109,7 +111,7 @@ const KeyCondition::AtomMap KeyCondition::atom_map { { "notEquals", - [] (RPNElement & out, const Field & value, const ASTPtr &) + [] (RPNElement & out, const Field & value) { out.function = RPNElement::FUNCTION_NOT_IN_RANGE; out.range = Range(value); @@ -118,7 +120,7 @@ const KeyCondition::AtomMap KeyCondition::atom_map }, { "equals", - [] (RPNElement & out, const Field & value, const ASTPtr &) + [] (RPNElement & out, const Field & value) { out.function = RPNElement::FUNCTION_IN_RANGE; out.range = Range(value); @@ -127,7 +129,7 @@ const KeyCondition::AtomMap KeyCondition::atom_map }, { "less", - [] (RPNElement & out, const Field & value, const ASTPtr &) + [] (RPNElement & out, const Field & value) { out.function = RPNElement::FUNCTION_IN_RANGE; out.range = Range::createRightBounded(value, false); @@ -136,7 +138,7 @@ const KeyCondition::AtomMap KeyCondition::atom_map }, { "greater", - [] (RPNElement & out, const Field & value, const ASTPtr &) + [] (RPNElement & out, const Field & value) { out.function = RPNElement::FUNCTION_IN_RANGE; out.range = Range::createLeftBounded(value, false); @@ -145,7 +147,7 @@ const KeyCondition::AtomMap KeyCondition::atom_map }, { "lessOrEquals", - [] (RPNElement & out, const Field & value, const ASTPtr &) + [] (RPNElement & out, const Field & value) { out.function = RPNElement::FUNCTION_IN_RANGE; out.range = Range::createRightBounded(value, true); @@ -154,7 +156,7 @@ const KeyCondition::AtomMap KeyCondition::atom_map }, { "greaterOrEquals", - [] (RPNElement & out, const Field & value, const ASTPtr &) + [] (RPNElement & out, const Field & value) { out.function = RPNElement::FUNCTION_IN_RANGE; out.range = Range::createLeftBounded(value, true); @@ -163,25 +165,23 @@ const KeyCondition::AtomMap KeyCondition::atom_map }, { "in", - [] (RPNElement & out, const Field &, const ASTPtr & node) + [] (RPNElement & out, const Field &) { out.function = 
RPNElement::FUNCTION_IN_SET; - out.in_function = node; return true; } }, { "notIn", - [] (RPNElement & out, const Field &, const ASTPtr & node) + [] (RPNElement & out, const Field &) { out.function = RPNElement::FUNCTION_NOT_IN_SET; - out.in_function = node; return true; } }, { "like", - [] (RPNElement & out, const Field & value, const ASTPtr &) + [] (RPNElement & out, const Field & value) { if (value.getType() != Field::Types::String) return false; @@ -271,7 +271,7 @@ KeyCondition::KeyCondition( const NamesAndTypesList & all_columns, const SortDescription & sort_descr_, const ExpressionActionsPtr & key_expr_) - : sort_descr(sort_descr_), key_expr(key_expr_), prepared_sets(query_info.sets) + : sort_descr(sort_descr_), key_expr(key_expr_) { for (size_t i = 0; i < sort_descr.size(); ++i) { @@ -280,486 +280,60 @@ KeyCondition::KeyCondition( key_columns[name] = i; } - /** Evaluation of expressions that depend only on constants. - * For the index to be used, if it is written, for example `WHERE Date = toDate(now())`. 
- */ - Block block_with_constants = getBlockWithConstants(query_info.query, context, all_columns); - - /// Trasform WHERE section to Reverse Polish notation - const ASTSelectQuery & select = typeid_cast(*query_info.query); - if (select.where_expression) - { - traverseAST(select.where_expression, context, block_with_constants); - - if (select.prewhere_expression) - { - traverseAST(select.prewhere_expression, context, block_with_constants); - rpn.emplace_back(RPNElement::FUNCTION_AND); - } - } - else if (select.prewhere_expression) - { - traverseAST(select.prewhere_expression, context, block_with_constants); - } - else - { - rpn.emplace_back(RPNElement::FUNCTION_UNKNOWN); - } -} - -bool KeyCondition::addCondition(const String & column, const Range & range) -{ - if (!key_columns.count(column)) - return false; - rpn.emplace_back(RPNElement::FUNCTION_IN_RANGE, key_columns[column], range); - rpn.emplace_back(RPNElement::FUNCTION_AND); - return true; -} - -/** Computes value of constant expression and it data type. - * Returns false, if expression isn't constant. - */ -static bool getConstant(const ASTPtr & expr, Block & block_with_constants, Field & out_value, DataTypePtr & out_type) -{ - String column_name = expr->getColumnName(); - - if (const ASTLiteral * lit = typeid_cast(expr.get())) - { - /// By default block_with_constants has only one column named "_dummy". - /// If block contains only constants it's may not be preprocessed by - // ExpressionAnalyzer, so try to look up in the default column. 
- if (!block_with_constants.has(column_name)) - column_name = "_dummy"; - - /// Simple literal - out_value = lit->value; - out_type = block_with_constants.getByName(column_name).type; - return true; - } - else if (block_with_constants.has(column_name) && block_with_constants.getByName(column_name).column->isColumnConst()) - { - /// An expression which is dependent on constants only - const auto & expr_info = block_with_constants.getByName(column_name); - out_value = (*expr_info.column)[0]; - out_type = expr_info.type; - return true; - } - else - return false; -} - - -static void applyFunction( - const FunctionBasePtr & func, - const DataTypePtr & arg_type, const Field & arg_value, - DataTypePtr & res_type, Field & res_value) -{ - res_type = func->getReturnType(); - - Block block - { - { arg_type->createColumnConst(1, arg_value), arg_type, "x" }, - { nullptr, res_type, "y" } - }; - - func->execute(block, {0}, 1); - - block.safeGetByPosition(1).column->get(0, res_value); -} - - -void KeyCondition::traverseAST(const ASTPtr & node, const Context & context, Block & block_with_constants) -{ - RPNElement element; - - if (ASTFunction * func = typeid_cast(&*node)) - { - if (operatorFromAST(func, element)) - { - auto & args = typeid_cast(*func->arguments).children; - for (size_t i = 0, size = args.size(); i < size; ++i) - { - traverseAST(args[i], context, block_with_constants); - - /** The first part of the condition is for the correct support of `and` and `or` functions of arbitrary arity - * - in this case `n - 1` elements are added (where `n` is the number of arguments). 
- */ - if (i != 0 || element.function == RPNElement::FUNCTION_NOT) - rpn.emplace_back(std::move(element)); - } - - return; - } - } - - if (!atomFromAST(node, context, block_with_constants, element)) + if (query_info.fromAST()) { - element.function = RPNElement::FUNCTION_UNKNOWN; - } - - rpn.emplace_back(std::move(element)); -} - - -bool KeyCondition::canConstantBeWrappedByMonotonicFunctions( - const ASTPtr & node, - size_t & out_key_column_num, - DataTypePtr & out_key_column_type, - Field & out_value, - DataTypePtr & out_type) -{ - String expr_name = node->getColumnName(); - const auto & sample_block = key_expr->getSampleBlock(); - if (!sample_block.has(expr_name)) - return false; + RPNBuilder rpn_builder(key_expr_, key_columns, all_columns); + PreparedSets sets(query_info.sets); - bool found_transformation = false; - for (const ExpressionAction & a : key_expr->getActions()) - { - /** The key functional expression constraint may be inferred from a plain column in the expression. - * For example, if the key contains `toStartOfHour(Timestamp)` and query contains `WHERE Timestamp >= now()`, - * it can be assumed that if `toStartOfHour()` is monotonic on [now(), inf), the `toStartOfHour(Timestamp) >= toStartOfHour(now())` - * condition also holds, so the index may be used to select only parts satisfying this condition. - * - * To check the assumption, we'd need to assert that the inverse function to this transformation is also monotonic, however the - * inversion isn't exported (or even viable for not strictly monotonic functions such as `toStartOfHour()`). - * Instead, we can qualify only functions that do not transform the range (for example rounding), - * which while not strictly monotonic, are monotonic everywhere on the input range. + /** Evaluation of expressions that depend only on constants. + * For the index to be used, if it is written, for example `WHERE Date = toDate(now())`. 
*/ - const auto & action = a.argument_names; - if (a.type == ExpressionAction::Type::APPLY_FUNCTION && action.size() == 1 && a.argument_names[0] == expr_name) - { - if (!a.function->hasInformationAboutMonotonicity()) - return false; - - // Range is irrelevant in this case - IFunction::Monotonicity monotonicity = a.function->getMonotonicityForRange(*out_type, Field(), Field()); - if (!monotonicity.is_always_monotonic) - return false; - - // Apply the next transformation step - DataTypePtr new_type; - applyFunction(a.function, out_type, out_value, new_type, out_value); - if (!new_type) - return false; + Block block_with_constants = getBlockWithConstants(query_info.query, context, all_columns); - out_type.swap(new_type); - expr_name = a.result_name; + /// Transform WHERE section to Reverse Polish notation + const ASTSelectQuery & select = typeid_cast(*query_info.query); + if (select.where_expression) + { + rpn_builder.traverseNodeTree(select.where_expression, context, block_with_constants, sets, rpn); - // Transformation results in a key expression, accept - auto it = key_columns.find(expr_name); - if (key_columns.end() != it) + if (select.prewhere_expression) { - out_key_column_num = it->second; - out_key_column_type = sample_block.getByName(it->first).type; - found_transformation = true; - break; + rpn_builder.traverseNodeTree(select.prewhere_expression, context, block_with_constants, sets, rpn); + rpn.emplace_back(RPNElement::FUNCTION_AND); } } - } - - return found_transformation; -} - -void KeyCondition::getKeyTuplePositionMapping( - const ASTPtr & node, - const Context & context, - std::vector & indexes_mapping, - const size_t tuple_index, - size_t & out_key_column_num) -{ - MergeTreeSetIndex::KeyTuplePositionMapping index_mapping; - index_mapping.tuple_index = tuple_index; - DataTypePtr data_type; - if (isKeyPossiblyWrappedByMonotonicFunctions( - node, context, index_mapping.key_index, - data_type, index_mapping.functions)) - { - 
indexes_mapping.push_back(index_mapping); - if (out_key_column_num < index_mapping.key_index) - { - out_key_column_num = index_mapping.key_index; - } - } -} - - -/// Try to prepare KeyTuplePositionMapping for tuples from IN expression. -bool KeyCondition::isTupleIndexable( - const ASTPtr & node, - const Context & context, - RPNElement & out, - const SetPtr & prepared_set, - size_t & out_key_column_num) -{ - out_key_column_num = 0; - std::vector indexes_mapping; - - size_t num_key_columns = prepared_set->getDataTypes().size(); - - const ASTFunction * node_tuple = typeid_cast(node.get()); - if (node_tuple && node_tuple->name == "tuple") - { - if (num_key_columns != node_tuple->arguments->children.size()) - { - std::stringstream message; - message << "Number of columns in section IN doesn't match. " - << node_tuple->arguments->children.size() << " at left, " << num_key_columns << " at right."; - throw Exception(message.str(), ErrorCodes::NUMBER_OF_COLUMNS_DOESNT_MATCH); - } - - size_t current_tuple_index = 0; - for (const auto & arg : node_tuple->arguments->children) - { - getKeyTuplePositionMapping(arg, context, indexes_mapping, current_tuple_index, out_key_column_num); - ++current_tuple_index; - } + else if (select.prewhere_expression) + rpn_builder.traverseNodeTree(select.prewhere_expression, context, block_with_constants, sets, rpn); + else + rpn.emplace_back(RPNElement::FUNCTION_UNKNOWN); } else { - getKeyTuplePositionMapping(node, context, indexes_mapping, 0, out_key_column_num); - } - - if (indexes_mapping.empty()) - return false; - - out.set_index = std::make_shared( - prepared_set->getSetElements(), std::move(indexes_mapping)); - - return true; -} - - -bool KeyCondition::isKeyPossiblyWrappedByMonotonicFunctions( - const ASTPtr & node, - const Context & context, - size_t & out_key_column_num, - DataTypePtr & out_key_res_column_type, - RPNElement::MonotonicFunctionsChain & out_functions_chain) -{ - std::vector chain_not_tested_for_monotonicity; - DataTypePtr 
key_column_type; - - if (!isKeyPossiblyWrappedByMonotonicFunctionsImpl(node, out_key_column_num, key_column_type, chain_not_tested_for_monotonicity)) - return false; - - for (auto it = chain_not_tested_for_monotonicity.rbegin(); it != chain_not_tested_for_monotonicity.rend(); ++it) - { - auto func_builder = FunctionFactory::instance().tryGet((*it)->name, context); - ColumnsWithTypeAndName arguments{{ nullptr, key_column_type, "" }}; - auto func = func_builder->build(arguments); - - if (!func || !func->hasInformationAboutMonotonicity()) - return false; - - key_column_type = func->getReturnType(); - out_functions_chain.push_back(func); - } - - out_key_res_column_type = key_column_type; - - return true; -} - -bool KeyCondition::isKeyPossiblyWrappedByMonotonicFunctionsImpl( - const ASTPtr & node, - size_t & out_key_column_num, - DataTypePtr & out_key_column_type, - std::vector & out_functions_chain) -{ - /** By itself, the key column can be a functional expression. for example, `intHash32(UserID)`. - * Therefore, use the full name of the expression for search. 
- */ - const auto & sample_block = key_expr->getSampleBlock(); - String name = node->getColumnName(); - - auto it = key_columns.find(name); - if (key_columns.end() != it) - { - out_key_column_num = it->second; - out_key_column_type = sample_block.getByName(it->first).type; - return true; - } - - if (const ASTFunction * func = typeid_cast(node.get())) - { - const auto & args = func->arguments->children; - if (args.size() != 1) - return false; - - out_functions_chain.push_back(func); - - if (!isKeyPossiblyWrappedByMonotonicFunctionsImpl(args[0], out_key_column_num, out_key_column_type, out_functions_chain)) - return false; - - return true; - } - - return false; -} - - -static void castValueToType(const DataTypePtr & desired_type, Field & src_value, const DataTypePtr & src_type, const ASTPtr & node) -{ - if (desired_type->equals(*src_type)) - return; - - try - { - /// NOTE: We don't need accurate info about src_type at this moment - src_value = convertFieldToType(src_value, *desired_type); - } - catch (...) - { - throw Exception("Key expression contains comparison between inconvertible types: " + - desired_type->getName() + " and " + src_type->getName() + - " inside " + queryToString(node), - ErrorCodes::BAD_TYPE_OF_FIELD); - } -} - - -bool KeyCondition::atomFromAST(const ASTPtr & node, const Context & context, Block & block_with_constants, RPNElement & out) -{ - /** Functions < > = != <= >= in `notIn`, where one argument is a constant, and the other is one of columns of key, - * or itself, wrapped in a chain of possibly-monotonic functions, - * or constant expression - number. 
- */ - Field const_value; - DataTypePtr const_type; - if (const ASTFunction * func = typeid_cast(node.get())) - { - const ASTs & args = typeid_cast(*func->arguments).children; - - if (args.size() != 2) - return false; - - DataTypePtr key_expr_type; /// Type of expression containing key column - size_t key_arg_pos; /// Position of argument with key column (non-const argument) - size_t key_column_num; /// Number of a key column (inside sort_descr array) - RPNElement::MonotonicFunctionsChain chain; - bool is_set_const = false; - bool is_constant_transformed = false; - - if (prepared_sets.count(args[1].get()) - && isTupleIndexable(args[0], context, out, prepared_sets[args[1].get()], key_column_num)) + RPNBuilder rpn_builder(key_expr_, key_columns, query_info.dag_query->source_columns); + DAGPreparedSets sets(query_info.dag_query->dag_sets); + const auto & dag = query_info.dag_query->dag; + if (dag.hasSelection()) { - key_arg_pos = 0; - is_set_const = true; - } - else if (getConstant(args[1], block_with_constants, const_value, const_type) - && isKeyPossiblyWrappedByMonotonicFunctions(args[0], context, key_column_num, key_expr_type, chain)) - { - key_arg_pos = 0; - } - else if (getConstant(args[1], block_with_constants, const_value, const_type) - && canConstantBeWrappedByMonotonicFunctions(args[0], key_column_num, key_expr_type, const_value, const_type)) - { - key_arg_pos = 0; - is_constant_transformed = true; - } - else if (getConstant(args[0], block_with_constants, const_value, const_type) - && isKeyPossiblyWrappedByMonotonicFunctions(args[1], context, key_column_num, key_expr_type, chain)) - { - key_arg_pos = 1; - } - else if (getConstant(args[0], block_with_constants, const_value, const_type) - && canConstantBeWrappedByMonotonicFunctions(args[1], key_column_num, key_expr_type, const_value, const_type)) - { - key_arg_pos = 1; - is_constant_transformed = true; - } - else - return false; - - std::string func_name = func->name; - - /// Transformed constant must weaken 
the condition, for example "x > 5" must weaken to "round(x) >= 5" - if (is_constant_transformed) - { - if (func_name == "less") - func_name = "lessOrEquals"; - else if (func_name == "greater") - func_name = "greaterOrEquals"; - } - - /// Replace on to <-sign> - if (key_arg_pos == 1) - { - if (func_name == "less") - func_name = "greater"; - else if (func_name == "greater") - func_name = "less"; - else if (func_name == "greaterOrEquals") - func_name = "lessOrEquals"; - else if (func_name == "lessOrEquals") - func_name = "greaterOrEquals"; - else if (func_name == "in" || func_name == "notIn" || func_name == "like") + Block block_with_constants{{DataTypeUInt8().createColumnConstWithDefaultValue(1), std::make_shared(), "_dummy"}}; + auto & selection = dag.getSelection(); + for (int i = 0; i < selection.conditions_size(); i++) { - /// "const IN data_column" doesn't make sense (unlike "data_column IN const") - return false; + rpn_builder.traverseNodeTree(selection.conditions(i), context, block_with_constants, sets, rpn); + if (i != 0) + rpn.emplace_back(RPNElement::FUNCTION_AND); } } - - out.key_column = key_column_num; - out.monotonic_functions_chain = std::move(chain); - - const auto atom_it = atom_map.find(func_name); - if (atom_it == std::end(atom_map)) - return false; - - bool cast_not_needed = - is_set_const /// Set args are already casted inside Set::createFromAST - || (key_expr_type->isNumber() && const_type->isNumber()); /// Numbers are accurately compared without cast. 
- - if (!cast_not_needed) - castValueToType(key_expr_type, const_value, const_type, node); - - return atom_it->second(out, const_value, node); - } - else if (getConstant(node, block_with_constants, const_value, const_type)) /// For cases where it says, for example, `WHERE 0 AND something` - { - if (const_value.getType() == Field::Types::UInt64 - || const_value.getType() == Field::Types::Int64 - || const_value.getType() == Field::Types::Float64) - { - /// Zero in all types is represented in memory the same way as in UInt64. - out.function = const_value.get() - ? RPNElement::ALWAYS_TRUE - : RPNElement::ALWAYS_FALSE; - - return true; - } + else + rpn.emplace_back(RPNElement::FUNCTION_UNKNOWN); } - - return false; } -bool KeyCondition::operatorFromAST(const ASTFunction * func, RPNElement & out) +bool KeyCondition::addCondition(const String & column, const Range & range) { - /// Functions AND, OR, NOT. - /** Also a special function `indexHint` - works as if instead of calling a function there are just parentheses - * (or, the same thing - calling the function `and` from one argument). 
- */ - const ASTs & args = typeid_cast(*func->arguments).children; - - if (func->name == "not") - { - if (args.size() != 1) - return false; - - out.function = RPNElement::FUNCTION_NOT; - } - else - { - if (func->name == "and" || func->name == "indexHint") - out.function = RPNElement::FUNCTION_AND; - else if (func->name == "or") - out.function = RPNElement::FUNCTION_OR; - else - return false; - } - + if (!key_columns.count(column)) + return false; + rpn.emplace_back(RPNElement::FUNCTION_IN_RANGE, key_columns[column], range); + rpn.emplace_back(RPNElement::FUNCTION_AND); return true; } @@ -1014,17 +588,9 @@ bool KeyCondition::mayBeTrueInRangeImpl(const std::vector & key_ranges, c element.function == RPNElement::FUNCTION_IN_SET || element.function == RPNElement::FUNCTION_NOT_IN_SET) { - auto in_func = typeid_cast(element.in_function.get()); - const ASTs & args = typeid_cast(*in_func->arguments).children; - PreparedSets::const_iterator it = prepared_sets.find(args[1].get()); - if (in_func && it != prepared_sets.end()) - { - rpn_stack.emplace_back(element.set_index->mayBeTrueInRange(key_ranges, data_types)); - if (element.function == RPNElement::FUNCTION_NOT_IN_SET) - rpn_stack.back() = !rpn_stack.back(); - } - else - throw Exception("Set for IN is not created yet", ErrorCodes::LOGICAL_ERROR); + rpn_stack.emplace_back(element.set_index->mayBeTrueInRange(key_ranges, data_types)); + if (element.function == RPNElement::FUNCTION_NOT_IN_SET) + rpn_stack.back() = !rpn_stack.back(); } else if (element.function == RPNElement::FUNCTION_NOT) { @@ -1076,7 +642,7 @@ bool KeyCondition::mayBeTrueAfter( } -String KeyCondition::RPNElement::toString() const +String RPNElement::toString() const { auto print_wrapped_column = [this](std::ostringstream & ss) { diff --git a/dbms/src/Storages/MergeTree/KeyCondition.h b/dbms/src/Storages/MergeTree/KeyCondition.h index c7d55b0a575..6612ec0c7d8 100644 --- a/dbms/src/Storages/MergeTree/KeyCondition.h +++ 
b/dbms/src/Storages/MergeTree/KeyCondition.h @@ -216,6 +216,53 @@ class FieldWithInfinity FieldWithInfinity(const Type type_); }; +/// The expression is stored as Reverse Polish Notation. +struct RPNElement +{ + enum Function + { + /// Atoms of a Boolean expression. + FUNCTION_IN_RANGE, + FUNCTION_NOT_IN_RANGE, + FUNCTION_IN_SET, + FUNCTION_NOT_IN_SET, + FUNCTION_UNKNOWN, /// Can take any value. + /// Operators of the logical expression. + FUNCTION_NOT, + FUNCTION_AND, + FUNCTION_OR, + /// Constants + ALWAYS_FALSE, + ALWAYS_TRUE, + }; + + RPNElement() {} + RPNElement(Function function_) : function(function_) {} + RPNElement(Function function_, size_t key_column_) : function(function_), key_column(key_column_) {} + RPNElement(Function function_, size_t key_column_, const Range & range_) + : function(function_), range(range_), key_column(key_column_) {} + + String toString() const; + + Function function = FUNCTION_UNKNOWN; + + /// For FUNCTION_IN_RANGE and FUNCTION_NOT_IN_RANGE. + Range range; + size_t key_column; + /// For FUNCTION_IN_SET, FUNCTION_NOT_IN_SET + using MergeTreeSetIndexPtr = std::shared_ptr; + MergeTreeSetIndexPtr set_index; + + /** A chain of possibly monotone functions. + * If the key column is wrapped in functions that can be monotonous in some value ranges + * (for example: -toFloat64(toDayOfWeek(date))), then here the functions will be located: toDayOfWeek, toFloat64, negate. + */ + using MonotonicFunctionsChain = std::vector; + mutable MonotonicFunctionsChain monotonic_functions_chain; /// The function execution does not violate the constancy. +}; + +using RPN = std::vector; +using ColumnIndices = std::map; /** Condition on the index. * * Consists of the conditions for the key belonging to all possible ranges or sets, @@ -256,57 +303,10 @@ class KeyCondition String toString() const; - - /// The expression is stored as Reverse Polish Notation. - struct RPNElement - { - enum Function - { - /// Atoms of a Boolean expression. 
- FUNCTION_IN_RANGE, - FUNCTION_NOT_IN_RANGE, - FUNCTION_IN_SET, - FUNCTION_NOT_IN_SET, - FUNCTION_UNKNOWN, /// Can take any value. - /// Operators of the logical expression. - FUNCTION_NOT, - FUNCTION_AND, - FUNCTION_OR, - /// Constants - ALWAYS_FALSE, - ALWAYS_TRUE, - }; - - RPNElement() {} - RPNElement(Function function_) : function(function_) {} - RPNElement(Function function_, size_t key_column_) : function(function_), key_column(key_column_) {} - RPNElement(Function function_, size_t key_column_, const Range & range_) - : function(function_), range(range_), key_column(key_column_) {} - - String toString() const; - - Function function = FUNCTION_UNKNOWN; - - /// For FUNCTION_IN_RANGE and FUNCTION_NOT_IN_RANGE. - Range range; - size_t key_column; - /// For FUNCTION_IN_SET, FUNCTION_NOT_IN_SET - ASTPtr in_function; - using MergeTreeSetIndexPtr = std::shared_ptr; - MergeTreeSetIndexPtr set_index; - - /** A chain of possibly monotone functions. - * If the key column is wrapped in functions that can be monotonous in some value ranges - * (for example: -toFloat64(toDayOfWeek(date))), then here the functions will be located: toDayOfWeek, toFloat64, negate. - */ - using MonotonicFunctionsChain = std::vector; - mutable MonotonicFunctionsChain monotonic_functions_chain; /// The function execution does not violate the constancy. 
- }; - static Block getBlockWithConstants( - const ASTPtr & query, const Context & context, const NamesAndTypesList & all_columns); + const ASTPtr & query, const Context & context, const NamesAndTypesList & all_columns); - using AtomMap = std::unordered_map; + using AtomMap = std::unordered_map; static const AtomMap atom_map; static std::optional applyMonotonicFunctionsChainToRange( @@ -315,8 +315,6 @@ class KeyCondition DataTypePtr current_type); private: - using RPN = std::vector; - using ColumnIndices = std::map; bool mayBeTrueInRange( size_t used_key_size, @@ -327,56 +325,11 @@ class KeyCondition bool mayBeTrueInRangeImpl(const std::vector & key_ranges, const DataTypes & data_types) const; - void traverseAST(const ASTPtr & node, const Context & context, Block & block_with_constants); - bool atomFromAST(const ASTPtr & node, const Context & context, Block & block_with_constants, RPNElement & out); - bool operatorFromAST(const ASTFunction * func, RPNElement & out); - - /** Is node the key column - * or expression in which column of key is wrapped by chain of functions, - * that can be monotomic on certain ranges? - * If these conditions are true, then returns number of column in key, type of resulting expression - * and fills chain of possibly-monotonic functions. 
- */ - bool isKeyPossiblyWrappedByMonotonicFunctions( - const ASTPtr & node, - const Context & context, - size_t & out_key_column_num, - DataTypePtr & out_key_res_column_type, - RPNElement::MonotonicFunctionsChain & out_functions_chain); - - bool isKeyPossiblyWrappedByMonotonicFunctionsImpl( - const ASTPtr & node, - size_t & out_key_column_num, - DataTypePtr & out_key_column_type, - std::vector & out_functions_chain); - - bool canConstantBeWrappedByMonotonicFunctions( - const ASTPtr & node, - size_t & out_key_column_num, - DataTypePtr & out_key_column_type, - Field & out_value, - DataTypePtr & out_type); - - void getKeyTuplePositionMapping( - const ASTPtr & node, - const Context & context, - std::vector & indexes_mapping, - const size_t tuple_index, - size_t & out_key_column_num); - - bool isTupleIndexable( - const ASTPtr & node, - const Context & context, - RPNElement & out, - const SetPtr & prepared_set, - size_t & out_key_column_num); - RPN rpn; SortDescription sort_descr; ColumnIndices key_columns; ExpressionActionsPtr key_expr; - PreparedSets prepared_sets; }; } diff --git a/dbms/src/Storages/MergeTree/RPNBuilder.cpp b/dbms/src/Storages/MergeTree/RPNBuilder.cpp new file mode 100644 index 00000000000..9a2830612b9 --- /dev/null +++ b/dbms/src/Storages/MergeTree/RPNBuilder.cpp @@ -0,0 +1,540 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ +namespace ErrorCodes +{ +extern const int LOGICAL_ERROR; +extern const int BAD_TYPE_OF_FIELD; +extern const int NUMBER_OF_COLUMNS_DOESNT_MATCH; +} // namespace ErrorCodes + +const tipb::Expr & getChild(const tipb::Expr & node, int index) { return node.children(index); } + +const ASTPtr & getChild(const ASTPtr & node, int index) +{ + if (const ASTFunction * func = typeid_cast(node.get())) + { + return func->arguments->children[index]; + } + else + { + return node->children[index]; + } +} + +int getChildCount(const tipb::Expr & node) { return 
node.children_size(); } + +int getChildCount(const ASTPtr & node) +{ + if (const ASTFunction * func = typeid_cast(node.get())) + { + return func->arguments->children.size(); + } + else + { + return node->children.size(); + } +} + +const String getFuncName(const tipb::Expr & node) { return getFunctionName(node); } + +const String getFuncName(const ASTPtr & node) +{ + if (const ASTFunction * func = typeid_cast(node.get())) + { + return func->name; + } + return ""; +} + +const String getColumnName(const tipb::Expr & node, const NamesAndTypesList & source_columns) +{ + if (node.tp() == tipb::ExprType::ColumnRef) + { + auto col_id = getColumnID(node); + if (col_id < 0 || col_id >= (Int64)source_columns.size()) + return ""; + return source_columns.getNames()[col_id]; + } + return ""; +} + +const String getColumnName(const ASTPtr & node, const NamesAndTypesList &) { return node->getColumnName(); } + +bool isFuncNode(const ASTPtr & node) { return typeid_cast(node.get()); } + +bool isFuncNode(const tipb::Expr & node) { return node.tp() == tipb::ExprType::ScalarFunc; } + +/** Computes value of constant expression and its data type. + * Returns false, if expression isn't constant. + */ +bool getConstant(const ASTPtr & expr, Block & block_with_constants, Field & out_value, DataTypePtr & out_type) +{ + String column_name = expr->getColumnName(); + + if (const ASTLiteral * lit = typeid_cast(expr.get())) + { + /// By default block_with_constants has only one column named "_dummy". + /// If block contains only constants it may not be preprocessed by + // ExpressionAnalyzer, so try to look up in the default column.
+ if (!block_with_constants.has(column_name)) + column_name = "_dummy"; + + /// Simple literal + out_value = lit->value; + out_type = block_with_constants.getByName(column_name).type; + return true; + } + else if (block_with_constants.has(column_name) && block_with_constants.getByName(column_name).column->isColumnConst()) + { + /// An expression which is dependent on constants only + const auto & expr_info = block_with_constants.getByName(column_name); + out_value = (*expr_info.column)[0]; + out_type = expr_info.type; + return true; + } + else + return false; +} + +/** Computes value of constant expression and its data type. + * Returns false, if expression isn't constant. + */ +bool getConstant(const tipb::Expr & expr, Block &, Field & out_value, DataTypePtr & out_type) +{ + + if (isLiteralExpr(expr)) + { + out_value = decodeLiteral(expr); + //todo check if need any extra cast + out_type = exprHasValidFieldType(expr) ? getDataTypeByFieldType(expr.field_type()) : applyVisitor(FieldToDataType(), out_value); + return true; + } + + return false; +} + +void castValueToType(const DataTypePtr & desired_type, Field & src_value, const DataTypePtr & src_type, const String & node) +{ + if (desired_type->equals(*src_type)) + return; + + try + { + /// NOTE: We don't need accurate info about src_type at this moment + src_value = convertFieldToType(src_value, *desired_type); + } + catch (...)
+ { + throw Exception("Key expression contains comparison between inconvertible types: " + desired_type->getName() + " and " + + src_type->getName() + " inside " + node, + ErrorCodes::BAD_TYPE_OF_FIELD); + } +} + +void applyFunction( + const FunctionBasePtr & func, const DataTypePtr & arg_type, const Field & arg_value, DataTypePtr & res_type, Field & res_value) +{ + res_type = func->getReturnType(); + + Block block{{arg_type->createColumnConst(1, arg_value), arg_type, "x"}, {nullptr, res_type, "y"}}; + + func->execute(block, {0}, 1); + + block.safeGetByPosition(1).column->get(0, res_value); +} + +bool setContains(const tipb::Expr & expr, DAGPreparedSets & sets) { return sets.count(&expr); } + +bool setContains(const ASTPtr & expr, PreparedSets & sets) { return sets.count(getChild(expr, 1).get()); } + +SetPtr & lookByExpr(const tipb::Expr & expr, DAGPreparedSets & sets) { return sets[&expr]; } + +SetPtr & lookByExpr(const ASTPtr & expr, PreparedSets & sets) { return sets[getChild(expr, 1).get()]; } + +String nodeToString(const tipb::Expr & node) { return node.DebugString(); } + +String nodeToString(const ASTPtr & node) { return queryToString(node); } + +template +bool RPNBuilder::isKeyPossiblyWrappedByMonotonicFunctionsImpl( + const NodeT & node, size_t & out_key_column_num, DataTypePtr & out_key_column_type, std::vector & out_functions_chain) +{ + /** By itself, the key column can be a functional expression. for example, `intHash32(UserID)`. + * Therefore, use the full name of the expression for search. 
+ */ + const auto & sample_block = key_expr->getSampleBlock(); + String name = getColumnName(node, source_columns); + + auto it = key_columns.find(name); + if (key_columns.end() != it) + { + out_key_column_num = it->second; + out_key_column_type = sample_block.getByName(it->first).type; + return true; + } + + if (isFuncNode(node)) + { + if (getChildCount(node) != 1) + return false; + + out_functions_chain.push_back(getFuncName(node)); + + if (!isKeyPossiblyWrappedByMonotonicFunctionsImpl(getChild(node, 0), out_key_column_num, out_key_column_type, out_functions_chain)) + return false; + + return true; + } + + return false; +} + +template +bool RPNBuilder::isKeyPossiblyWrappedByMonotonicFunctions(const NodeT & node, + const Context & context, + size_t & out_key_column_num, + DataTypePtr & out_key_res_column_type, + RPNElement::MonotonicFunctionsChain & out_functions_chain) +{ + std::vector chain_not_tested_for_monotonicity; + DataTypePtr key_column_type; + + if (!isKeyPossiblyWrappedByMonotonicFunctionsImpl(node, out_key_column_num, key_column_type, chain_not_tested_for_monotonicity)) + return false; + + for (auto it = chain_not_tested_for_monotonicity.rbegin(); it != chain_not_tested_for_monotonicity.rend(); ++it) + { + auto func_builder = FunctionFactory::instance().tryGet(*it, context); + ColumnsWithTypeAndName arguments{{nullptr, key_column_type, ""}}; + auto func = func_builder->build(arguments); + + if (!func || !func->hasInformationAboutMonotonicity()) + return false; + + key_column_type = func->getReturnType(); + out_functions_chain.push_back(func); + } + + out_key_res_column_type = key_column_type; + + return true; +} + +template +void RPNBuilder::getKeyTuplePositionMapping(const NodeT & node, + const Context & context, + std::vector & indexes_mapping, + const size_t tuple_index, + size_t & out_key_column_num) +{ + MergeTreeSetIndex::KeyTuplePositionMapping index_mapping; + index_mapping.tuple_index = tuple_index; + DataTypePtr data_type; + if 
(isKeyPossiblyWrappedByMonotonicFunctions(node, context, index_mapping.key_index, data_type, index_mapping.functions)) + { + indexes_mapping.push_back(index_mapping); + if (out_key_column_num < index_mapping.key_index) + { + out_key_column_num = index_mapping.key_index; + } + } +} + +template +bool RPNBuilder::isTupleIndexable( + const NodeT & node, const Context & context, RPNElement & out, const SetPtr & prepared_set, size_t & out_key_column_num) +{ + out_key_column_num = 0; + std::vector indexes_mapping; + + size_t num_key_columns = prepared_set->getDataTypes().size(); + + bool is_func = isFuncNode(node); + if (is_func && getFuncName(node) == "tuple") + { + if (num_key_columns != (size_t)getChildCount(node)) + { + std::stringstream message; + message << "Number of columns in section IN doesn't match. " << getChildCount(node) << " at left, " << num_key_columns + << " at right."; + throw Exception(message.str(), ErrorCodes::NUMBER_OF_COLUMNS_DOESNT_MATCH); + } + + size_t current_tuple_index = 0; + for (int i = 0; i < getChildCount(node); i++) + { + const auto & arg = getChild(node, i); + getKeyTuplePositionMapping(arg, context, indexes_mapping, current_tuple_index, out_key_column_num); + ++current_tuple_index; + } + } + else + { + getKeyTuplePositionMapping(node, context, indexes_mapping, 0, out_key_column_num); + } + + if (indexes_mapping.empty()) + return false; + + out.set_index = std::make_shared(prepared_set->getSetElements(), std::move(indexes_mapping)); + + return true; +} + +template +bool RPNBuilder::canConstantBeWrappedByMonotonicFunctions( + const NodeT & node, size_t & out_key_column_num, DataTypePtr & out_key_column_type, Field & out_value, DataTypePtr & out_type) +{ + String expr_name = getColumnName(node, source_columns); + const auto & sample_block = key_expr->getSampleBlock(); + if (!sample_block.has(expr_name)) + return false; + + bool found_transformation = false; + for (const ExpressionAction & a : key_expr->getActions()) + { + /** The key 
functional expression constraint may be inferred from a plain column in the expression. + * For example, if the key contains `toStartOfHour(Timestamp)` and query contains `WHERE Timestamp >= now()`, + * it can be assumed that if `toStartOfHour()` is monotonic on [now(), inf), the `toStartOfHour(Timestamp) >= toStartOfHour(now())` + * condition also holds, so the index may be used to select only parts satisfying this condition. + * + * To check the assumption, we'd need to assert that the inverse function to this transformation is also monotonic, however the + * inversion isn't exported (or even viable for not strictly monotonic functions such as `toStartOfHour()`). + * Instead, we can qualify only functions that do not transform the range (for example rounding), + * which while not strictly monotonic, are monotonic everywhere on the input range. + */ + const auto & action = a.argument_names; + if (a.type == ExpressionAction::Type::APPLY_FUNCTION && action.size() == 1 && a.argument_names[0] == expr_name) + { + if (!a.function->hasInformationAboutMonotonicity()) + return false; + + // Range is irrelevant in this case + IFunction::Monotonicity monotonicity = a.function->getMonotonicityForRange(*out_type, Field(), Field()); + if (!monotonicity.is_always_monotonic) + return false; + + // Apply the next transformation step + DataTypePtr new_type; + applyFunction(a.function, out_type, out_value, new_type, out_value); + if (!new_type) + return false; + + out_type.swap(new_type); + expr_name = a.result_name; + + // Transformation results in a key expression, accept + auto it = key_columns.find(expr_name); + if (key_columns.end() != it) + { + out_key_column_num = it->second; + out_key_column_type = sample_block.getByName(it->first).type; + found_transformation = true; + break; + } + } + } + + return found_transformation; +} + +template +bool RPNBuilder::operatorFromNodeTree(const NodeT & node, RPNElement & out) +{ + /// Functions AND, OR, NOT. 
+ /** Also a special function `indexHint` - works as if instead of calling a function there are just parentheses + * (or, the same thing - calling the function `and` from one argument). + */ + if (!isFuncNode(node)) + return false; + String name = getFuncName(node); + + if (name == "not") + { + if (getChildCount(node) != 1) + return false; + + out.function = RPNElement::FUNCTION_NOT; + } + else + { + if (name == "and" || name == "indexHint") + out.function = RPNElement::FUNCTION_AND; + else if (name == "or") + out.function = RPNElement::FUNCTION_OR; + else + return false; + } + + return true; +} + +template +bool RPNBuilder::atomFromNodeTree( + const NodeT & node, const Context & context, Block & block_with_constants, PreparedSetsT & sets, RPNElement & out) +{ + /** Functions < > = != <= >= in `notIn`, where one argument is a constant, and the other is one of columns of key, + * or itself, wrapped in a chain of possibly-monotonic functions, + * or constant expression - number. + */ + Field const_value; + DataTypePtr const_type; + if (isFuncNode(node)) + { + if (getChildCount(node) != 2) + return false; + + DataTypePtr key_expr_type; /// Type of expression containing key column + size_t key_arg_pos; /// Position of argument with key column (non-const argument) + size_t key_column_num; /// Number of a key column (inside sort_descr array) + RPNElement::MonotonicFunctionsChain chain; + bool is_set_const = false; + bool is_constant_transformed = false; + const NodeT & child0 = getChild(node, 0); + const NodeT & child1 = getChild(node, 1); + + if (setContains(node, sets) && isTupleIndexable(child0, context, out, lookByExpr(node, sets), key_column_num)) + { + key_arg_pos = 0; + is_set_const = true; + } + else if (getConstant(child1, block_with_constants, const_value, const_type) + && isKeyPossiblyWrappedByMonotonicFunctions(child0, context, key_column_num, key_expr_type, chain)) + { + key_arg_pos = 0; + } + else if (getConstant(child1, block_with_constants, const_value, 
const_type) + && canConstantBeWrappedByMonotonicFunctions(child0, key_column_num, key_expr_type, const_value, const_type)) + { + key_arg_pos = 0; + is_constant_transformed = true; + } + else if (getConstant(child0, block_with_constants, const_value, const_type) + && isKeyPossiblyWrappedByMonotonicFunctions(child1, context, key_column_num, key_expr_type, chain)) + { + key_arg_pos = 1; + } + else if (getConstant(child0, block_with_constants, const_value, const_type) + && canConstantBeWrappedByMonotonicFunctions(child1, key_column_num, key_expr_type, const_value, const_type)) + { + key_arg_pos = 1; + is_constant_transformed = true; + } + else + return false; + + std::string func_name = getFuncName(node); + + // make sure that RPNElement of FUNCTION_IN_SET/FUNCTION_NOT_IN_SET + // has valid set in PreparedSets + if (func_name == "in" || func_name == "notIn") + if (!is_set_const) + return false; + + /// Transformed constant must weaken the condition, for example "x > 5" must weaken to "round(x) >= 5" + if (is_constant_transformed) + { + if (func_name == "less") + func_name = "lessOrEquals"; + else if (func_name == "greater") + func_name = "greaterOrEquals"; + } + + /// Replace on to <-sign> + if (key_arg_pos == 1) + { + if (func_name == "less") + func_name = "greater"; + else if (func_name == "greater") + func_name = "less"; + else if (func_name == "greaterOrEquals") + func_name = "lessOrEquals"; + else if (func_name == "lessOrEquals") + func_name = "greaterOrEquals"; + else if (func_name == "in" || func_name == "notIn" || func_name == "like") + { + /// "const IN data_column" doesn't make sense (unlike "data_column IN const") + return false; + } + } + + out.key_column = key_column_num; + out.monotonic_functions_chain = std::move(chain); + + const auto atom_it = KeyCondition::atom_map.find(func_name); + if (atom_it == std::end(KeyCondition::atom_map)) + return false; + + bool cast_not_needed = is_set_const /// Set args are already casted inside Set::createFromAST + || 
(key_expr_type->isNumber() && const_type->isNumber()); /// Numbers are accurately compared without cast. + + if (!cast_not_needed) + castValueToType(key_expr_type, const_value, const_type, nodeToString(node)); + + return atom_it->second(out, const_value); + } + else if (getConstant( + node, block_with_constants, const_value, const_type)) /// For cases where it says, for example, `WHERE 0 AND something` + { + if (const_value.getType() == Field::Types::UInt64 || const_value.getType() == Field::Types::Int64 + || const_value.getType() == Field::Types::Float64) + { + /// Zero in all types is represented in memory the same way as in UInt64. + out.function = const_value.get() ? RPNElement::ALWAYS_TRUE : RPNElement::ALWAYS_FALSE; + + return true; + } + } + + return false; +} + +template +void RPNBuilder::traverseNodeTree( + const NodeT & node, const Context & context, Block & block_with_constants, PreparedSetsT & sets, RPN & rpn) +{ + RPNElement element; + + if (isFuncNode(node)) + { + if (operatorFromNodeTree(node, element)) + { + for (size_t i = 0, size = getChildCount(node); i < size; ++i) + { + traverseNodeTree(getChild(node, i), context, block_with_constants, sets, rpn); + + /** The first part of the condition is for the correct support of `and` and `or` functions of arbitrary arity + * - in this case `n - 1` elements are added (where `n` is the number of arguments). 
+ */ + if (i != 0 || element.function == RPNElement::FUNCTION_NOT) + rpn.push_back(element); + } + + return; + } + } + + if (!atomFromNodeTree(node, context, block_with_constants, sets, element)) + { + element.function = RPNElement::FUNCTION_UNKNOWN; + } + + rpn.emplace_back(std::move(element)); +} + +template class RPNBuilder; +template class RPNBuilder; + +} // namespace DB diff --git a/dbms/src/Storages/MergeTree/RPNBuilder.h b/dbms/src/Storages/MergeTree/RPNBuilder.h new file mode 100644 index 00000000000..f9eaf263cf5 --- /dev/null +++ b/dbms/src/Storages/MergeTree/RPNBuilder.h @@ -0,0 +1,67 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ +using RPN = std::vector; +using DAGPreparedSets = std::unordered_map; + +void applyFunction( + const FunctionBasePtr & func, const DataTypePtr & arg_type, const Field & arg_value, DataTypePtr & res_type, Field & res_value); + +template +class RPNBuilder +{ +public: + RPNBuilder(const ExpressionActionsPtr & key_expr_, ColumnIndices & key_columns_, const NamesAndTypesList & source_columns_) + : key_expr(key_expr_), key_columns(key_columns_), source_columns(source_columns_) + {} + + bool isKeyPossiblyWrappedByMonotonicFunctionsImpl( + const NodeT & node, size_t & out_key_column_num, DataTypePtr & out_key_column_type, std::vector & out_functions_chain); + + /** Is node the key column + * or expression in which column of key is wrapped by chain of functions, + * that can be monotonic on certain ranges? + * If these conditions are true, then returns number of column in key, type of resulting expression + * and fills chain of possibly-monotonic functions.
+ */ + bool isKeyPossiblyWrappedByMonotonicFunctions(const NodeT & node, + const Context & context, + size_t & out_key_column_num, + DataTypePtr & out_key_res_column_type, + RPNElement::MonotonicFunctionsChain & out_functions_chain); + + void getKeyTuplePositionMapping(const NodeT & node, + const Context & context, + std::vector & indexes_mapping, + const size_t tuple_index, + size_t & out_key_column_num); + /// Try to prepare KeyTuplePositionMapping for tuples from IN expression. + bool isTupleIndexable( + const NodeT & node, const Context & context, RPNElement & out, const SetPtr & prepared_set, size_t & out_key_column_num); + + bool canConstantBeWrappedByMonotonicFunctions( + const NodeT & node, size_t & out_key_column_num, DataTypePtr & out_key_column_type, Field & out_value, DataTypePtr & out_type); + + bool operatorFromNodeTree(const NodeT & node, RPNElement & out); + + bool atomFromNodeTree( + const NodeT & node, const Context & context, Block & block_with_constants, PreparedSetsT & sets, RPNElement & out); + + void traverseNodeTree(const NodeT & node, const Context & context, Block & block_with_constants, PreparedSetsT & sets, RPN & rpn); + +protected: + const ExpressionActionsPtr & key_expr; + ColumnIndices & key_columns; + const NamesAndTypesList & source_columns; +}; +} // namespace DB diff --git a/dbms/src/Storages/SelectQueryInfo.cpp b/dbms/src/Storages/SelectQueryInfo.cpp index 65178a3b6b0..de3b3eb3c59 100644 --- a/dbms/src/Storages/SelectQueryInfo.cpp +++ b/dbms/src/Storages/SelectQueryInfo.cpp @@ -7,11 +7,13 @@ namespace DB SelectQueryInfo::SelectQueryInfo(const SelectQueryInfo & query_info_) : query(query_info_.query), sets(query_info_.sets), - mvcc_query_info(query_info_.mvcc_query_info != nullptr ? std::make_unique(*query_info_.mvcc_query_info) : nullptr) + mvcc_query_info(query_info_.mvcc_query_info != nullptr ? std::make_unique(*query_info_.mvcc_query_info) : nullptr), + dag_query(query_info_.dag_query != nullptr ? 
std::make_unique(*query_info_.dag_query) : nullptr) {} SelectQueryInfo::SelectQueryInfo(SelectQueryInfo && query_info_) - : query(query_info_.query), sets(query_info_.sets), mvcc_query_info(std::move(query_info_.mvcc_query_info)) + : query(query_info_.query), sets(query_info_.sets), mvcc_query_info(std::move(query_info_.mvcc_query_info)), + dag_query(std::move(query_info_.dag_query)) {} } // namespace DB diff --git a/dbms/src/Storages/SelectQueryInfo.h b/dbms/src/Storages/SelectQueryInfo.h index 01b73ac704f..67a3dcce2ba 100644 --- a/dbms/src/Storages/SelectQueryInfo.h +++ b/dbms/src/Storages/SelectQueryInfo.h @@ -2,6 +2,7 @@ #include #include +#include namespace DB { @@ -31,11 +32,15 @@ struct SelectQueryInfo std::unique_ptr mvcc_query_info; + std::unique_ptr dag_query; + SelectQueryInfo() = default; SelectQueryInfo(const SelectQueryInfo & query_info_); SelectQueryInfo(SelectQueryInfo && query_info_); + + bool fromAST() const { return dag_query == nullptr; }; }; } // namespace DB diff --git a/tests/mutable-test/txn_dag/key_condition.test b/tests/mutable-test/txn_dag/key_condition.test new file mode 100644 index 00000000000..3c30fcb413a --- /dev/null +++ b/tests/mutable-test/txn_dag/key_condition.test @@ -0,0 +1,35 @@ +# Preparation. +=> DBGInvoke __enable_schema_sync_service('true') + +=> DBGInvoke __drop_tidb_table(default, test) +=> drop table if exists default.test + +=> DBGInvoke __set_flush_threshold(1000000, 1000000) + +# Data. +=> DBGInvoke __mock_tidb_table(default, test, 'col_1 String, col_2 Int64','col_2') +=> DBGInvoke __refresh_schemas() +=> DBGInvoke __put_region(4, 0, 100, default, test) +=> DBGInvoke __raft_insert_row(default, test, 4, 66, 'test1') +=> DBGInvoke __raft_insert_row(default, test, 4, 77, 'test2') + +# DAG read by not specifying region id, where col_2 = 66.
+=> DBGInvoke dag('select * from default.test where col_2 = 66') +┌─col_1─┬─col_2─┐ +│ test1 │ 66 │ +└───────┴───────┘ + +=> DBGInvoke dag('select * from default.test where col_2 > 66') +┌─col_1─┬─col_2─┐ +│ test2 │ 77 │ +└───────┴───────┘ + +=> DBGInvoke dag('select * from default.test where col_2 >= 66') +┌─col_1─┬─col_2─┐ +│ test1 │ 66 │ +│ test2 │ 77 │ +└───────┴───────┘ + +# Clean up. +=> DBGInvoke __drop_tidb_table(default, test) +=> drop table if exists default.test